Coverage Report

Created: 2025-11-16 06:09

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/xnnpack/src/configs/gemm-config.c
Line
Count
Source
1
// Copyright 2023 Google LLC
2
//
3
// This source code is licensed under the BSD-style license found in the
4
// LICENSE file in the root directory of this source tree.
5
6
#include <assert.h>
7
#include <stddef.h>
8
#include <stdint.h>
9
10
#include "include/xnnpack.h"
11
#include "src/xnnpack/common.h"
12
#include "src/xnnpack/config.h"
13
#include "src/xnnpack/gemm.h"
14
#include "src/xnnpack/hardware-config.h"
15
#include "src/xnnpack/igemm.h"
16
#include "src/xnnpack/init-once.h"
17
#include "src/xnnpack/log.h"
18
#include "src/xnnpack/math.h"
19
#include "src/xnnpack/microfnptr.h"
20
#include "src/xnnpack/microparams-init.h"
21
#include "src/xnnpack/pack.h"
22
#include "src/xnnpack/packw.h"
23
24
#if XNN_ARCH_WASMSIMD
25
  #include <emscripten.h>
26
#endif
27
28
4
#define XNN_MR_TO_INDEX(MR) (MR-1)
29
// UARCH 0 is big core.  1 is medium or little core.
30
#ifndef XNN_UARCH_INDEX
31
0
#define XNN_UARCH_INDEX 0
32
#endif
33
34
static const int default_config = 0;
35
static const int consistent_config = 1;
36
37
static struct xnn_gemm_config bf16_f32_gemm_config = {0};
38
static struct xnn_gemm_config f16_gemm_config = {0};
39
static struct xnn_gemm_config f32_gemm_config[2] = {0};
40
static struct xnn_gemm_config f32_igemm_config = {0};
41
static struct xnn_gemm_config f32_gemm_nr2_config[2] = {0};
42
static struct xnn_gemm_config f32_qc4w_gemm_config = {0};
43
static struct xnn_gemm_config f32_qc8w_gemm_config = {0};
44
static struct xnn_gemm_config pf16_gemm_config = {0};
45
static struct xnn_gemm_config pf32_gemm_config = {0};
46
static struct xnn_gemm_config pqs8_qc8w_gemm_config = {0};
47
static struct xnn_gemm_config qd8_f16_qb4w_gemm_config = {0};
48
static struct xnn_gemm_config qd8_f16_qc4w_gemm_config = {0};
49
static struct xnn_gemm_config qd8_f16_qc8w_gemm_config = {0};
50
static struct xnn_gemm_config qd8_f16_qc8w_igemm_config = {0};
51
static struct xnn_gemm_config qd8_f32_qb4w_gemm_config = {0};
52
static struct xnn_gemm_config qd8_f32_qc4w_gemm_config = {0};
53
static struct xnn_gemm_config qd8_f32_qc2w_gemm_config = {0};
54
static struct xnn_gemm_config qd8_f32_qc8w_gemm_config = {0};
55
static struct xnn_gemm_config qp8_f32_qc4w_gemm_config = {0};
56
static struct xnn_gemm_config qp8_f32_qc8w_gemm_config = {0};
57
static struct xnn_gemm_config qp8_f32_qb4w_gemm_config = {0};
58
static struct xnn_gemm_config qdu8_f32_qc4w_gemm_config = {0};
59
static struct xnn_gemm_config qdu8_f16_qc8w_gemm_config = {0};
60
static struct xnn_gemm_config qdu8_f32_qc8w_igemm_config = {0};
61
static struct xnn_gemm_config qdu8_f32_qc8w_gemm_config = {0};
62
static struct xnn_gemm_config qdu8_f32_qb4w_gemm_config = {0};
63
static struct xnn_gemm_config qdu8_f16_qc4w_gemm_config = {0};
64
static struct xnn_gemm_config qs8_qc4w_gemm_config = {0};
65
static struct xnn_gemm_config qs8_qc8w_gemm_config = {0};
66
static struct xnn_gemm_config qu8_gemm_config = {0};
67
68
XNN_INIT_ONCE_GUARD(bf16_f32_gemm);
69
XNN_INIT_ONCE_GUARD(f16_gemm);
70
XNN_INIT_ONCE_GUARD(f32_igemm);
71
XNN_INIT_ONCE_GUARD(f32_gemm);
72
XNN_INIT_ONCE_GUARD(f32_gemm_nr2);
73
XNN_INIT_ONCE_GUARD(f32_qc4w_gemm);
74
XNN_INIT_ONCE_GUARD(f32_qc8w_gemm);
75
XNN_INIT_ONCE_GUARD(pf16_gemm);
76
XNN_INIT_ONCE_GUARD(pf32_gemm);
77
XNN_INIT_ONCE_GUARD(pqs8_qc8w_gemm);
78
XNN_INIT_ONCE_GUARD(qd8_f16_qb4w_gemm);
79
XNN_INIT_ONCE_GUARD(qd8_f16_qc4w_gemm);
80
XNN_INIT_ONCE_GUARD(qd8_f16_qc8w_gemm);
81
XNN_INIT_ONCE_GUARD(qd8_f16_qc8w_igemm);
82
XNN_INIT_ONCE_GUARD(qd8_f32_qb4w_gemm);
83
XNN_INIT_ONCE_GUARD(qd8_f32_qc4w_gemm);
84
XNN_INIT_ONCE_GUARD(qd8_f32_qc2w_gemm);
85
XNN_INIT_ONCE_GUARD(qd8_f32_qc8w_gemm);
86
XNN_INIT_ONCE_GUARD(qp8_f32_qc4w_gemm);
87
XNN_INIT_ONCE_GUARD(qp8_f32_qc8w_gemm);
88
XNN_INIT_ONCE_GUARD(qp8_f32_qb4w_gemm);
89
XNN_INIT_ONCE_GUARD(qdu8_f32_qc4w_gemm);
90
XNN_INIT_ONCE_GUARD(qdu8_f16_qc8w_gemm);
91
XNN_INIT_ONCE_GUARD(qdu8_f32_qc8w_gemm);
92
XNN_INIT_ONCE_GUARD(qdu8_f32_qc8w_igemm);
93
XNN_INIT_ONCE_GUARD(qdu8_f32_qb4w_gemm);
94
XNN_INIT_ONCE_GUARD(qdu8_f16_qc4w_gemm);
95
XNN_INIT_ONCE_GUARD(qs8_qc4w_gemm);
96
XNN_INIT_ONCE_GUARD(qs8_qc8w_gemm);
97
XNN_INIT_ONCE_GUARD(qu8_gemm);
98
99
// Macros to log the microkernel names if and when they are registered.
100
#define XNN_INIT_GEMM_UKERNEL(ukernel) \
101
  (xnn_gemm_ukernel_fn) ukernel;       \
102
  xnn_log_info("Using gemm microkernel '%s'.", #ukernel);
103
104
#define XNN_INIT_HMP_GEMM_UKERNEL(ukernel)                 \
105
2
  xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn)ukernel); \
106
2
  xnn_log_info("Using gemm microkernel '%s'.", #ukernel);
107
108
#define XNN_INIT_IGEMM_UKERNEL(ukernel) \
109
  (xnn_igemm_ukernel_fn) ukernel;       \
110
  xnn_log_info("Using igemm microkernel '%s'.", #ukernel);
111
112
#define XNN_INIT_HMP_IGEMM_UKERNEL(ukernel)                  \
113
2
  xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn)ukernel); \
114
2
  xnn_log_info("Using igemm microkernel '%s'.", #ukernel);
115
116
#define XNN_INIT_DQGEMM_UKERNEL(ukernel) \
117
  (xnn_dqgemm_ukernel_fn) ukernel;       \
118
  xnn_log_info("Using dqgemm microkernel '%s'.", #ukernel);
119
120
#define XNN_INIT_HMP_DQGEMM_UKERNEL(ukernel)                   \
121
0
  xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn)ukernel); \
122
0
  xnn_log_info("Using dqgemm microkernel '%s'.", #ukernel);
123
124
#define XNN_INIT_DQIGEMM_UKERNEL(ukernel) \
125
  (xnn_dqigemm_ukernel_fn) ukernel;       \
126
  xnn_log_info("Using dqigemm microkernel '%s'.", #ukernel);
127
128
#define XNN_INIT_HMP_DQIGEMM_UKERNEL(ukernel)                    \
129
0
  xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn)ukernel); \
130
0
  xnn_log_info("Using dqigemm microkernel '%s'.", #ukernel);
131
132
#define XNN_INIT_HMP_QP8GEMM_UKERNEL(ukernel)            \
133
  xnn_init_hmp_qp8gemm_ukernel(                          \
134
      (xnn_qp8_f32_qc4w_gemm_minmax_ukernel_fn)ukernel); \
135
  xnn_log_info("Using qp8gemm microkernel '%s'.", #ukernel);
136
137
#define XNN_INIT_HMP_QP8GEMM_BL_UKERNEL(ukernel)         \
138
  xnn_init_hmp_qp8gemm_bl_ukernel(                       \
139
      (xnn_qp8_f32_qb4w_gemm_minmax_ukernel_fn)ukernel); \
140
  xnn_log_info("Using qp8gemm_bl microkernel '%s'.", #ukernel);
141
142
0
static void init_f16_gemm_config(void) {
143
  // Common parameters.
144
0
  f16_gemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_FLOAT16;
145
0
  f16_gemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_FLOAT16;
146
0
  f16_gemm_config.log2_filter_element_bit_size = XNN_LOG2_SIZEOF_FLOAT16 + 3;
147
0
  f16_gemm_config.bias_element_size = sizeof(xnn_float16);
148
149
  // Arch-specific parameters.
150
  #if XNN_ARCH_ARM && XNN_ENABLE_ARM_FP16_VECTOR && XNN_ENABLE_ARM_FP16_SCALAR
151
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
152
    assert(hardware_config != NULL);
153
    (void) hardware_config;  // May be unused.
154
    if (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith) {
155
      f16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
156
      f16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
157
      f16_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
158
      f16_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
159
      f16_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
160
      f16_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f16_gemm_gio_w;
161
      f16_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x16_packw_gemm_goi_ukernel_x8__neon_ld4lane_u8_prfm;
162
      f16_gemm_config.mr = 6;
163
      f16_gemm_config.nr = 8;
164
    }
165
  #elif XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64
166
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
167
    assert(hardware_config != NULL);
168
    (void) hardware_config;  // May be unused.
169
    if (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith) {
170
      #if XNN_ENABLE_ASSEMBLY
171
        switch (hardware_config->uarch[XNN_UARCH_INDEX]) {
172
          case xnn_uarch_cortex_a55:
173
            f16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64);
174
            f16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55);
175
            f16_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64);
176
            f16_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55);
177
            f16_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
178
            f16_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f16_gemm_gio_w;
179
            f16_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8_prfm;
180
            f16_gemm_config.mr = 6;
181
            f16_gemm_config.nr = 16;
182
            break;
183
          case xnn_uarch_cortex_a55r0:
184
          case xnn_uarch_cortex_a75:
185
            f16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64);
186
            f16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55r0);
187
            f16_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64);
188
            f16_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55r0);
189
            f16_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
190
            f16_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f16_gemm_gio_w;
191
            f16_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8_prfm;
192
            f16_gemm_config.mr = 6;
193
            f16_gemm_config.nr = 16;
194
            break;
195
          case xnn_uarch_exynos_m5:
196
            f16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64);
197
            f16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f16_gemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld64);
198
            f16_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64);
199
            f16_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f16_igemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld64);
200
            f16_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
201
            f16_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f16_gemm_gio_w;
202
            f16_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8_prfm;
203
            f16_gemm_config.mr = 4;
204
            f16_gemm_config.nr = 16;
205
            break;
206
          case xnn_uarch_exynos_m4:
207
            f16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64);
208
            f16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld64);
209
            f16_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64);
210
            f16_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld64);
211
            f16_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
212
            f16_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f16_gemm_gio_w;
213
            f16_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8_prfm;
214
            f16_gemm_config.mr = 6;
215
            f16_gemm_config.nr = 16;
216
            break;
217
          default:
218
          case xnn_uarch_cortex_a76:
219
          case xnn_uarch_cortex_a77:
220
          case xnn_uarch_cortex_a78:
221
          case xnn_uarch_cortex_a510:
222
          case xnn_uarch_cortex_a710:
223
          case xnn_uarch_cortex_a715:
224
          case xnn_uarch_cortex_x1:
225
          case xnn_uarch_cortex_x2:
226
          case xnn_uarch_cortex_x3:
227
          case xnn_uarch_cortex_x4:
228
          case xnn_uarch_oryon:
229
          case xnn_uarch_neoverse_n1:
230
          case xnn_uarch_neoverse_n2:
231
          case xnn_uarch_neoverse_v1:
232
          case xnn_uarch_neoverse_v2:
233
            f16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64);
234
            f16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a75);
235
            f16_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64);
236
            f16_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a75);
237
            f16_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
238
            f16_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f16_gemm_gio_w;
239
            f16_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8_prfm;
240
            f16_gemm_config.mr = 6;
241
            f16_gemm_config.nr = 16;
242
            break;
243
        }
244
245
        #if XNN_MAX_UARCH_TYPES > 1
246
        {
247
          /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
248
          const uint32_t mr = f16_gemm_config.mr;
249
          const uint32_t nr = f16_gemm_config.nr;
250
          for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
251
            switch (hardware_config->uarch[i]) {
252
              case xnn_uarch_cortex_a55:
253
                if (mr == 6 && nr == 16) {
254
                  f16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(6)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55);
255
                  f16_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(6)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55);
256
                }
257
                break;
258
              case xnn_uarch_cortex_a55r0:
259
              case xnn_uarch_cortex_a75:
260
                if (mr == 6 && nr == 16) {
261
                  f16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(6)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55r0);
262
                  f16_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(6)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55r0);
263
                }
264
                break;
265
              default:
266
                break;
267
            }
268
          }
269
        }
270
        #endif  // XNN_MAX_UARCH_TYPES > 1
271
      #else  // XNN_ENABLE_ASSEMBLY
272
        f16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
273
        f16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
274
        f16_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
275
        f16_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
276
        f16_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
277
        f16_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f16_gemm_gio_w;
278
        f16_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8_prfm;
279
        f16_gemm_config.mr = 6;
280
        f16_gemm_config.nr = 16;
281
      #endif  // XNN_ENABLE_ASSEMBLY
282
    }
283
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
284
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
285
0
    assert(hardware_config != NULL);
286
0
    (void) hardware_config;  // May be unused.
287
0
    if (hardware_config->arch_flags & xnn_arch_x86_avx2) {
288
0
      f16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f16_f32acc_gemm_minmax_ukernel_1x16__avx2_broadcast);
289
0
      f16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f16_f32acc_gemm_minmax_ukernel_4x16__avx2_broadcast);
290
0
      f16_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f16_f32acc_igemm_minmax_ukernel_1x16__avx2_broadcast);
291
0
      f16_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f16_f32acc_igemm_minmax_ukernel_4x16__avx2_broadcast);
292
0
      f16_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
293
0
      f16_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f16_gemm_gio_w;
294
0
      #if XNN_ENABLE_AVX2
295
0
        f16_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x16_packw_gemm_goi_ukernel_x16__avx2_u16_prfm;
296
      #else
297
        f16_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_f16_gemm_goi_w;
298
      #endif
299
0
      f16_gemm_config.mr = 4;
300
0
      f16_gemm_config.nr = 16;
301
0
    }
302
0
  #endif
303
0
  assert(f16_gemm_config.mr <= XNN_MAX_MR);
304
0
}
305
306
#if XNN_ARCH_WASMSIMD
307
  EM_JS(int, hardware_concurrency, (void), {
308
    var concurrency = 1;
309
    try {
310
      concurrency = self.navigator.hardwareConcurrency;
311
    } catch(e) {
312
      // d8 environment doesn't provide `self`, thus we keep the default
313
    }
314
    return concurrency;
315
  });
316
  // A cpu with more than `kCoreCountThresholdForAdaptiveAvxOptimization` is
317
  // assumed to support AVX instructions.
318
  const int kCoreCountThresholdForAdaptiveAvxOptimization = 4;
319
#endif
320
321
0
static void init_pf16_gemm_config(void) {
322
  // Common parameters.
323
0
  pf16_gemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_FLOAT16;
324
0
  pf16_gemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_FLOAT16;
325
0
  pf16_gemm_config.log2_filter_element_bit_size = XNN_LOG2_SIZEOF_FLOAT16 + 3;
326
0
  pf16_gemm_config.bias_element_size = sizeof(xnn_float16);
327
#if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
328
  const struct xnn_hardware_config* hardware_config =
329
      xnn_init_hardware_config();
330
  assert(hardware_config != NULL);
331
  if (XNN_ENABLE_ARM_SME2 && (hardware_config->arch_flags & xnn_arch_arm_sme2)) {
332
    #if XNN_ENABLE_ARM_SME2
333
      const size_t mr = xnn_pf16_gemm_minmax_ukernel_32x32c2__neonsme2_get_mr();
334
      const size_t nr = xnn_pf16_gemm_minmax_ukernel_32x32c2__neonsme2_get_nr();
335
      pf16_gemm_config.arch = xnn_arch_arm_sme2;
336
      pf16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_pf16_gemm_minmax_ukernel_1x32c2__neonsme2);
337
      pf16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(mr)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_pf16_gemm_minmax_ukernel_32x32c2__neonsme2);
338
      pf16_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
339
      pf16_gemm_config.pack_weights_and_biases = xnn_pack_kai_f16_weights_and_biases;
340
      pf16_gemm_config.packed_stride_weights_and_biases = xnn_packed_stride_kai_f16_weights_and_biases;
341
      pf16_gemm_config.mr = mr;
342
      pf16_gemm_config.mr_packed = mr;
343
      pf16_gemm_config.nr = nr;
344
      pf16_gemm_config.log2_kr = 1;
345
    #endif  // XNN_ENABLE_ARM_SME2
346
  }
347
#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
348
0
}
349
350
0
static void init_bf16_f32_gemm_config(void) {
351
  // Common parameters.
352
0
  bf16_f32_gemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_BFLOAT16;
353
0
  bf16_f32_gemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_BFLOAT16;
354
0
  bf16_f32_gemm_config.log2_filter_element_bit_size = XNN_LOG2_SIZEOF_BFLOAT16 + 3;
355
0
  bf16_f32_gemm_config.bias_element_size = sizeof(float);
356
357
  // Arch-specific parameters.
358
0
#if XNN_ARCH_X86_64
359
0
  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
360
0
  assert(hardware_config != NULL);
361
0
  (void) hardware_config;  // May be unused.
362
0
  if (XNN_ENABLE_AVX512BF16 && (hardware_config->arch_flags & xnn_arch_x86_avx512bf16)) {
363
0
    #if XNN_ENABLE_AVX512BF16 && XNN_ENABLE_ASSEMBLY
364
0
      bf16_f32_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_bf16_f32_gemm_minmax_ukernel_1x32c2__asm_amd64_avx512bf16_broadcast);
365
0
      bf16_f32_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(11)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_bf16_f32_gemm_minmax_ukernel_11x32c2__asm_amd64_avx512bf16_broadcast);
366
0
      bf16_f32_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
367
0
      bf16_f32_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x16_x32_packw_gemm_goi_ukernel_x32c2__scalar;
368
0
      bf16_f32_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_x16_x32_packw_gemm_gio_ukernel_x32c2__scalar;
369
0
      bf16_f32_gemm_config.mr = 11;
370
0
      bf16_f32_gemm_config.nr = 32;
371
0
      bf16_f32_gemm_config.log2_kr = 1;
372
0
    #endif  // XNN_ENABLE_AVX512BF16
373
0
  }
374
0
  assert(bf16_f32_gemm_config.mr <= XNN_MAX_MR);
375
0
#endif  // XNN_ARCH_X86_64
376
0
}
377
378
0
static void init_pf32_gemm_config(void) {
379
  // Common parameters.
380
0
  pf32_gemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_FLOAT;
381
0
  pf32_gemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_FLOAT;
382
0
  pf32_gemm_config.log2_filter_element_bit_size = XNN_LOG2_SIZEOF_FLOAT + 3;
383
0
  pf32_gemm_config.bias_element_size = sizeof(float);
384
385
  // Arch-specific parameters.
386
#if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
387
  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
388
  assert(hardware_config != NULL);
389
  (void) hardware_config;  // May be unused.
390
  if (XNN_ENABLE_ARM_SME2 && (hardware_config->arch_flags & xnn_arch_arm_sme2)) {
391
    #if XNN_ENABLE_ARM_SME2
392
      const size_t mr = xnn_pf32_gemm_minmax_ukernel_32x32__neonsme2_get_mr();
393
      const size_t nr = xnn_pf32_gemm_minmax_ukernel_32x32__neonsme2_get_nr();
394
      pf32_gemm_config.arch = xnn_arch_arm_sme2;
395
      pf32_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_pf32_gemm_minmax_ukernel_1x32__neonsme2);
396
      pf32_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(mr)] =XNN_INIT_HMP_GEMM_UKERNEL(xnn_pf32_gemm_minmax_ukernel_32x32__neonsme2);
397
      pf32_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
398
      pf32_gemm_config.pack_weights_and_biases = xnn_pack_kai_f32_weights_and_biases;
399
      pf32_gemm_config.packed_stride_weights_and_biases = xnn_packed_stride_kai_f32_weights_and_biases;
400
      pf32_gemm_config.mr = mr;
401
      pf32_gemm_config.mr_packed = mr;
402
      pf32_gemm_config.nr = nr;
403
    #endif  // XNN_ENABLE_ARM_SME2
404
  } else if (XNN_ENABLE_ARM_SME &&
405
             (hardware_config->arch_flags & xnn_arch_arm_sme)) {
406
#if XNN_ENABLE_ARM_SME
407
    const size_t mr = xnn_pf32_gemm_minmax_ukernel_32x32__neonsme_get_mr();
408
    const size_t nr = xnn_pf32_gemm_minmax_ukernel_32x32__neonsme_get_nr();
409
    pf32_gemm_config.arch = xnn_arch_arm_sme;
410
    pf32_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_pf32_gemm_minmax_ukernel_1x32__neonsme);
411
    pf32_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(mr)] =XNN_INIT_HMP_GEMM_UKERNEL(xnn_pf32_gemm_minmax_ukernel_32x32__neonsme);
412
    pf32_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
413
    pf32_gemm_config.pack_weights_and_biases = xnn_pack_kai_f32_weights_and_biases;
414
    pf32_gemm_config.packed_stride_weights_and_biases = xnn_packed_stride_kai_f32_weights_and_biases;
415
    pf32_gemm_config.mr = mr;
416
    pf32_gemm_config.mr_packed = mr;
417
    pf32_gemm_config.nr = nr;
418
#endif  // XNN_ENABLE_ARM_SME
419
  } else {
420
    /* No Action */
421
  }
422
  assert(pf32_gemm_config.mr <= XNN_MAX_MR);
423
#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
424
0
}
425
426
0
static void init_pqs8_qc8w_gemm_config(void) {
427
  // Common parameters.
428
0
  pqs8_qc8w_gemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_INT8_T;
429
0
  pqs8_qc8w_gemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_INT8_T;
430
0
  pqs8_qc8w_gemm_config.log2_filter_element_bit_size = XNN_LOG2_SIZEOF_INT8_T + 3;
431
0
  pqs8_qc8w_gemm_config.bias_element_size = sizeof(int32_t);
432
433
  // Arch-specific parameters.
434
#if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
435
  const struct xnn_hardware_config* hardware_config =
436
      xnn_init_hardware_config();
437
  assert(hardware_config != NULL);
438
  (void)hardware_config;  // May be unused.
439
  if (XNN_ENABLE_ARM_SME2 && (hardware_config->arch_flags & xnn_arch_arm_sme2)) {
440
#if XNN_ENABLE_ARM_SME2
441
    const size_t mr =
442
        xnn_pqs8_qc8w_gemm_minmax_ukernel_32x32c4__neonsme2_get_mr();
443
    const size_t nr =
444
        xnn_pqs8_qc8w_gemm_minmax_ukernel_32x32c4__neonsme2_get_nr();
445
    pqs8_qc8w_gemm_config.arch = xnn_arch_arm_sme2;
446
    pqs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(mr)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_pqs8_qc8w_gemm_minmax_ukernel_32x32c4__neonsme2);
447
    pqs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_pqs8_qc8w_gemm_minmax_ukernel_1x32c4__neonsme2);
448
    pqs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(mr)] =
449
        xnn_init_hmp_packed_igemm_ukernel(
450
            (xnn_packed_lhs_igemm_ukernel_fn)
451
                xnn_pqs8_qc8w_igemm_minmax_fp32_ukernel_32x32c4__neonsme2);
452
    pqs8_qc8w_gemm_config.init.qs8_qc8w =
453
        xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
454
    pqs8_qc8w_gemm_config.pack_weights_and_biases =
455
        xnn_pack_kai_qs8_qc8w_weights_and_biases_sme2;
456
    pqs8_qc8w_gemm_config.packed_stride_weights_and_biases =
457
        xnn_packed_stride_kai_qs8_qc8w_weights_and_biases_sme2;
458
    pqs8_qc8w_gemm_config.pack_igemm_goki =
459
        (xnn_pack_conv_goki_w_fn)xnn_pack_kai_qs8_conv_goki_w_sme2;
460
    pqs8_qc8w_gemm_config.pack_igemm_kgo =
461
        (xnn_pack_conv_kgo_w_fn)xnn_pack_qs8_conv_kgo_w;
462
    pqs8_qc8w_gemm_config.pack_deconv_goki =
463
        (xnn_pack_deconv_goki_w_fn)xnn_pack_qs8_deconv_goki_w;
464
    pqs8_qc8w_gemm_config.mr = mr;
465
    pqs8_qc8w_gemm_config.mr_packed = mr;
466
    pqs8_qc8w_gemm_config.nr = nr;
467
    pqs8_qc8w_gemm_config.log2_kr = 2;
468
#endif  // XNN_ENABLE_ARM_SME2
469
  }
470
  assert(pqs8_qc8w_gemm_config.mr <= XNN_MAX_MR);
471
#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
472
0
}
473
474
0
static void init_f32_gemm_config_impl(struct xnn_gemm_config* f32_gemm_config, bool consistent_arithmetic) {
475
  // Common parameters.
476
0
  f32_gemm_config->log2_input_element_size = XNN_LOG2_SIZEOF_FLOAT;
477
0
  f32_gemm_config->log2_filter_element_size = XNN_LOG2_SIZEOF_FLOAT;
478
0
  f32_gemm_config->log2_filter_element_bit_size = XNN_LOG2_SIZEOF_FLOAT + 3;
479
0
  f32_gemm_config->bias_element_size = sizeof(float);
480
481
  // Arch-specific parameters.
482
  #if XNN_ARCH_ARM
483
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
484
    assert(hardware_config != NULL);
485
    (void) hardware_config;  // May be unused.
486
    if (hardware_config->arch_flags & xnn_arch_arm_neon) {
487
      #if XNN_ENABLE_ASSEMBLY
488
        switch (hardware_config->uarch[XNN_UARCH_INDEX]) {
489
          case xnn_uarch_cortex_a5:
490
          case xnn_uarch_cortex_a7:
491
          case xnn_uarch_krait:
492
          case xnn_uarch_kryo:
493
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53_prfm);
494
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a7);
495
            f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
496
            f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
497
            f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
498
            f32_gemm_config->mr = 4;
499
            f32_gemm_config->nr = 8;
500
            break;
501
          case xnn_uarch_cortex_a53:
502
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53_prfm);
503
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a53_prfm);
504
            f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
505
            f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
506
            f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
507
            f32_gemm_config->mr = 4;
508
            f32_gemm_config->nr = 8;
509
            break;
510
          case xnn_uarch_cortex_a55r0:
511
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53);
512
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a53);
513
            f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
514
            f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
515
            f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
516
            f32_gemm_config->mr = 4;
517
            f32_gemm_config->nr = 8;
518
            break;
519
          case xnn_uarch_cortex_a32:
520
          case xnn_uarch_cortex_a35:
521
          case xnn_uarch_cortex_a55:
522
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53);
523
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a55);
524
            f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
525
            f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
526
            f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
527
            f32_gemm_config->mr = 4;
528
            f32_gemm_config->nr = 8;
529
            break;
530
531
          case xnn_uarch_cortex_a57:
532
          case xnn_uarch_cortex_a72:
533
          case xnn_uarch_cortex_a73:
534
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53_prfm);
535
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a75_prfm);
536
            f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
537
            f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
538
            f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
539
            f32_gemm_config->mr = 4;
540
            f32_gemm_config->nr = 8;
541
            break;
542
543
          default:
544
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53);
545
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a75);
546
            f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
547
            f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
548
            f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
549
            f32_gemm_config->mr = 4;
550
            f32_gemm_config->nr = 8;
551
            break;
552
        }
553
        #if XNN_MAX_UARCH_TYPES > 1
554
        {
555
          /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
556
          const uint32_t mr = f32_gemm_config->mr;
557
          const uint32_t nr = f32_gemm_config->nr;
558
          for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
559
            switch (hardware_config->uarch[i]) {
560
              case xnn_uarch_cortex_a53:
561
                if (mr == 4 && nr == 8) {
562
                  f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53_prfm);
563
                  f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a53_prfm);
564
                }
565
                break;
566
              case xnn_uarch_cortex_a55r0:
567
                if (mr == 4 && nr == 8) {
568
                  f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53);
569
                  f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a53);
570
                }
571
                break;
572
              case xnn_uarch_cortex_a55:
573
                if (mr == 4 && nr == 8) {
574
                  f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53);
575
                  f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a55);
576
                }
577
                break;
578
              default:
579
                break;
580
            }
581
          }
582
        }
583
        #endif  // XNN_MAX_UARCH_TYPES > 1
584
      #else  // XNN_ENABLE_ASSEMBLY
585
        f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
586
        f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128);
587
        f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
588
        f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
589
        f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
590
        f32_gemm_config->mr = 4;
591
        f32_gemm_config->nr = 8;
592
      #endif  // XNN_ENABLE_ASSEMBLY
593
    } else {
594
      f32_gemm_config->linear.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_ukernel_1x4__scalar);
595
      f32_gemm_config->linear.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_ukernel_4x4__scalar);
596
      f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x4__scalar);
597
      f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x4__scalar);
598
      f32_gemm_config->relu.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_relu_ukernel_1x4__scalar);
599
      f32_gemm_config->relu.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_relu_ukernel_4x4__scalar);
600
      f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
601
      f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_x32_packw_gemm_gio_ukernel_x4__scalar;
602
      f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x4__scalar_float_u4;
603
      f32_gemm_config->mr = 4;
604
      f32_gemm_config->nr = 4;
605
    }
606
  #elif XNN_ARCH_ARM64
607
    #if XNN_ENABLE_ASSEMBLY && !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
608
        const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
609
        assert(hardware_config);
610
        switch (hardware_config->uarch[XNN_UARCH_INDEX]) {
611
          case xnn_uarch_cortex_a72:
612
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75_prfm);
613
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a75_prfm);
614
            f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
615
            f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
616
            f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
617
            f32_gemm_config->mr = 4;
618
            f32_gemm_config->nr = 8;
619
            break;
620
          case xnn_uarch_cortex_a57:
621
          case xnn_uarch_cortex_a75:
622
          case xnn_uarch_cortex_a76:
623
          case xnn_uarch_exynos_m3:
624
          case xnn_uarch_exynos_m4:
625
          case xnn_uarch_neoverse_n1:
626
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75_prfm);
627
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a75_prfm);
628
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a75_prfm);
629
            f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
630
            f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
631
            f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
632
            f32_gemm_config->mr = 6;
633
            f32_gemm_config->nr = 8;
634
            break;
635
          case xnn_uarch_exynos_m1:
636
          case xnn_uarch_exynos_m2:
637
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma);
638
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma);
639
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma);
640
            f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
641
            f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
642
            f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8s4__neon_ld4lane_u4_prfm;
643
            f32_gemm_config->mr = 6;
644
            f32_gemm_config->nr = 8;
645
            f32_gemm_config->log2_sr = 2;
646
            break;
647
          case xnn_uarch_cortex_a53:
648
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53_prfm);
649
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a53_prfm);
650
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a53_prfm);
651
            f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
652
            f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
653
            f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
654
            f32_gemm_config->mr = 6;
655
            f32_gemm_config->nr = 8;
656
            break;
657
          case xnn_uarch_cortex_a55r0:
658
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53);
659
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a53);
660
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a53);
661
            f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
662
            f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
663
            f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
664
            f32_gemm_config->mr = 6;
665
            f32_gemm_config->nr = 8;
666
            break;
667
          case xnn_uarch_cortex_a35:
668
          case xnn_uarch_cortex_a55:
669
          case xnn_uarch_kryo:
670
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53);
671
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a55);
672
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a55);
673
            f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
674
            f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
675
            f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
676
            f32_gemm_config->mr = 6;
677
            f32_gemm_config->nr = 8;
678
            break;
679
          case xnn_uarch_cortex_a73:
680
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75_prfm);
681
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a73);
682
            f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
683
            f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
684
            f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
685
            f32_gemm_config->mr = 6;
686
            f32_gemm_config->nr = 8;
687
            break;
688
          case xnn_uarch_cortex_a77:
689
          case xnn_uarch_exynos_m5:
690
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75);
691
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a75);
692
            f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
693
            f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_x32_packw_gemm_gio_ukernel_x8__neon_u2;
694
            f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
695
            f32_gemm_config->mr = 4;
696
            f32_gemm_config->nr = 8;
697
            break;
698
          case xnn_uarch_cortex_x3:
699
          case xnn_uarch_neoverse_v2:
700
            // TODO(fbarchard): Implement asm with indexed inputs
701
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128_2);
702
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(8)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_8x8__asm_aarch64_neonfma_ld128_2);
703
            f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
704
            f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_x32_packw_gemm_gio_ukernel_x8__neon_u2;
705
            f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
706
            f32_gemm_config->mr = 8;
707
            f32_gemm_config->nr = 8;
708
            break;
709
          case xnn_uarch_oryon:
710
          case xnn_uarch_cortex_x4:
711
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x16__aarch64_neonfma_lane_ld128);
712
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x16__aarch64_neonfma_lane_ld128);
713
            f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
714
            f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_x32_packw_gemm_gio_ukernel_x16__neon_u2;
715
            f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x16__neon_ld4lane_u4_prfm;
716
            f32_gemm_config->mr = 4;
717
            f32_gemm_config->nr = 16;
718
            break;
719
          case xnn_uarch_cortex_a78:
720
          case xnn_uarch_cortex_a510:
721
          case xnn_uarch_cortex_a710:
722
          case xnn_uarch_cortex_a715:
723
          case xnn_uarch_cortex_x1:
724
          case xnn_uarch_cortex_x2:
725
          case xnn_uarch_neoverse_n2:
726
          case xnn_uarch_neoverse_v1:
727
          default:
728
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128_2);
729
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_ld128_2);
730
            f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(7)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_7x8__asm_aarch64_neonfma_ld128_2);
731
            f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
732
            f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
733
            f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
734
            f32_gemm_config->mr = 7;
735
            f32_gemm_config->nr = 8;
736
            break;
737
        }
738
        #if XNN_MAX_UARCH_TYPES > 1
739
        {
740
          /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
741
          const uint32_t mr = f32_gemm_config->mr;
742
          const uint32_t nr = f32_gemm_config->nr;
743
          const uint32_t log2_sr = f32_gemm_config->log2_sr;
744
          for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
745
            switch (hardware_config->uarch[i]) {
746
              case xnn_uarch_cortex_a53:
747
                if (mr == 6 && nr == 8 && log2_sr == 0) {
748
                  f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53_prfm);
749
                  f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a53_prfm);
750
                  f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(6)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a53_prfm);
751
                } else if (mr == 4 && nr == 8 && log2_sr == 0) {
752
                  f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53_prfm);
753
                  f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a53_prfm);
754
                }
755
                break;
756
              case xnn_uarch_cortex_a55r0:
757
                if (mr == 6 && nr == 8 && log2_sr == 0) {
758
                  f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53);
759
                  f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a53);
760
                  f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(6)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a53);
761
                } else if (mr == 4 && nr == 8 && log2_sr == 0) {
762
                  f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53);
763
                  f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a53);
764
                }
765
                break;
766
              case xnn_uarch_cortex_a55:
767
                if (mr == 6 && nr == 8 && log2_sr == 0) {
768
                  f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53);
769
                  f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a55);
770
                  f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(6)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a55);
771
                } else if (mr == 4 && nr == 8 && log2_sr == 0) {
772
                  f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53);
773
                  f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a55);
774
                }
775
                break;
776
              default:
777
                break;
778
            }
779
          }
780
        }
781
      #endif  // XNN_MAX_UARCH_TYPES > 1
782
    #else  // XNN_ENABLE_ASSEMBLY && !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
783
      #if XNN_ENABLE_ASSEMBLY
784
        f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128_acc4);
785
        f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_ld128);
786
        f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
787
        f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
788
        f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
789
        f32_gemm_config->mr = 6;
790
        f32_gemm_config->nr = 8;
791
      #else  // !XNN_ENABLE_ASSEMBLY
792
        f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld128);
793
        f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_lane_ld128);
794
        f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
795
        f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
796
        f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
797
        f32_gemm_config->mr = 6;
798
        f32_gemm_config->nr = 8;
799
       #endif  // XNN_ENABLE_ASSEMBLY
800
    #endif  // XNN_ENABLE_ASSEMBLY && !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
801
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
802
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
803
0
    assert(hardware_config != NULL);
804
0
    (void) hardware_config;  // May be unused.
805
    // TODO(b/460806642): Don't enable this without a fast packing kernel.
806
    #if false && XNN_ENABLE_AVX512F && XNN_ARCH_X86_64 && !XNN_PLATFORM_WINDOWS && XNN_ENABLE_ASSEMBLY
807
      if (!consistent_arithmetic && hardware_config->arch_flags & xnn_arch_x86_avx512f) {
808
        f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x32c2__asm_amd64_avx512f_broadcast);
809
        f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_5x32c2__asm_amd64_avx512f_broadcast);
810
        f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
811
        f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
812
        f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x32c2__avx512f_u4_prfm;
813
        f32_gemm_config->mr = 5;
814
        f32_gemm_config->nr = 32;
815
        f32_gemm_config->log2_kr = 1;
816
        f32_gemm_config->log2_sr = 0;
817
      } else
818
    #endif
819
0
    #if XNN_ENABLE_AVX512F
820
0
      if (hardware_config->arch_flags & xnn_arch_x86_avx512f) {
821
0
        f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x32__avx512f_broadcast);
822
0
        f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(7)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_7x32__avx512f_broadcast);
823
0
        f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
824
0
        f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_x32_packw_gemm_gio_ukernel_x32__avx512f_u8;
825
0
        f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x32__avx512f_u4_prfm;
826
0
        f32_gemm_config->mr = 7;
827
0
        f32_gemm_config->nr = 32;
828
0
      } else
829
0
    #endif
830
0
    #if XNN_ENABLE_FMA3
831
0
      if (hardware_config->arch_flags & xnn_arch_x86_fma3) {
832
0
        switch (hardware_config->uarch[XNN_UARCH_INDEX]) {
833
0
          #if XNN_ENABLE_FMA3 && XNN_ENABLE_AVX
834
0
            case xnn_uarch_zen:
835
0
            case xnn_uarch_dhyana:
836
0
              f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast);
837
0
              f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast);
838
0
              f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
839
0
              f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
840
0
              f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x16s4__avx_u4;
841
0
              f32_gemm_config->mr = 4;
842
0
              f32_gemm_config->nr = 16;
843
0
              f32_gemm_config->log2_sr = 2;
844
0
              break;
845
0
          #endif
846
0
          default:
847
0
            #if XNN_ARCH_X86_64 && !XNN_PLATFORM_WINDOWS && XNN_ENABLE_ASSEMBLY && XNN_ENABLE_FMA3 && XNN_ENABLE_AVX
848
0
              f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x16__asm_amd64_fma3_broadcast);
849
0
              f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_2x16__asm_amd64_fma3_broadcast);
850
0
              f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_6x16__asm_amd64_fma3_broadcast);
851
0
              f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
852
0
              f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_x32_packw_gemm_gio_ukernel_x16__avx_u8;
853
0
              f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x16__avx_u4;
854
0
              f32_gemm_config->mr = 6;
855
0
              f32_gemm_config->nr = 16;
856
            #elif XNN_ENABLE_FMA3 && XNN_ENABLE_AVX
857
              f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast);
858
              f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_2x16__fma3_broadcast);
859
              f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast);
860
              f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
861
              f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_x32_packw_gemm_gio_ukernel_x16__avx_u8;
862
              f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x16__avx_u4;
863
              f32_gemm_config->mr = 5;
864
              f32_gemm_config->nr = 16;
865
            #endif
866
0
            break;
867
0
        }
868
0
      } else
869
0
    #endif
870
0
    #if XNN_ENABLE_AVX
871
0
      if (hardware_config->arch_flags & xnn_arch_x86_avx) {
872
0
        f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast);
873
0
        f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast);
874
0
        f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
875
0
        f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_x32_packw_gemm_gio_ukernel_x16__avx_u8;
876
0
        f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x16__avx_u4;
877
0
        f32_gemm_config->mr = 5;
878
0
        f32_gemm_config->nr = 16;
879
0
      } else
880
0
    #endif
881
0
    #if XNN_ENABLE_SSE2
882
0
      if (hardware_config->arch_flags & xnn_arch_x86_sse2) {
883
0
        f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__sse_load1);
884
0
        f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__sse_load1);
885
0
        f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
886
0
        f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
887
0
        f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__sse2_u4;
888
0
        f32_gemm_config->mr = 4;
889
0
        f32_gemm_config->nr = 8;
890
0
      }
891
0
    #endif
892
  #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
893
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
894
    assert(hardware_config != NULL);
895
    (void) hardware_config;  // May be unused.
896
    if (hardware_config->is_x86) {
897
      #if XNN_ARCH_WASMRELAXEDSIMD
898
        f32_gemm_config->linear.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat);
899
        f32_gemm_config->linear.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat);
900
        f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat);
901
        f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat);
902
        f32_gemm_config->relu.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat);
903
        f32_gemm_config->relu.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat);
904
      #else
905
        if (hardware_concurrency() > kCoreCountThresholdForAdaptiveAvxOptimization) {
906
          f32_gemm_config->linear.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat);
907
          f32_gemm_config->linear.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_ukernel_4x8__wasmsimd_loadsplat);
908
          f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat);
909
          f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat);
910
          f32_gemm_config->relu.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat);
911
          f32_gemm_config->relu.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_loadsplat);
912
        } else {
913
          f32_gemm_config->linear.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_ukernel_1x8__wasmsimd_splat);
914
          f32_gemm_config->linear.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_ukernel_4x8__wasmsimd_splat);
915
          f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat);
916
          f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat);
917
          f32_gemm_config->relu.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat);
918
          f32_gemm_config->relu.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_splat);
919
        }
920
      #endif
921
922
      f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
923
      f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
924
      f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__wasmsimd_u4;
925
      f32_gemm_config->mr = 4;
926
      f32_gemm_config->nr = 8;
927
    } else {
928
      #if XNN_ARCH_WASMRELAXEDSIMD
929
        f32_gemm_config->linear.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_splat);
930
        f32_gemm_config->linear.gemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_ukernel_6x8__wasmrelaxedsimd_fma_splat);
931
        f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat);
932
        f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_splat);
933
        f32_gemm_config->relu.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat);
934
        f32_gemm_config->relu.gemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_relu_ukernel_6x8__wasmrelaxedsimd_fma_splat);
935
        f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
936
        f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
937
        f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__wasmsimd_u4;
938
        f32_gemm_config->mr = 6;
939
        f32_gemm_config->nr = 8;
940
      #else
941
        f32_gemm_config->linear.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_ukernel_1x8__wasmsimd_splat);
942
        f32_gemm_config->linear.gemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_ukernel_5x8__wasmsimd_splat);
943
        f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat);
944
        f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat);
945
        f32_gemm_config->relu.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat);
946
        f32_gemm_config->relu.gemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_splat);
947
        f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
948
        f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
949
        f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__wasmsimd_u4;
950
        f32_gemm_config->mr = 5;
951
        f32_gemm_config->nr = 8;
952
      #endif
953
    }
954
  #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
955
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
956
    assert(hardware_config != NULL);
957
    (void) hardware_config;  // May be unused.
958
    if (hardware_config->arch_flags & xnn_arch_riscv_vector) {
959
      f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x4v__rvv);
960
      f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(7)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_7x4v__rvv);
961
      f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
962
      f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
963
      f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x4v__rvv_u8;
964
      f32_gemm_config->mr = 7;
965
      // nr is set to vlen * 4 / sizeof(float) = 4 * VLENB * 8 / 32 = VLENB
966
      f32_gemm_config->nr = hardware_config->vlenb;
967
    }
968
  #elif XNN_ARCH_HEXAGON && XNN_ENABLE_HVX
969
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
970
    assert(hardware_config != NULL);
971
    if (hardware_config->arch_flags & xnn_arch_hvx) {
972
      f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x64__hvx_broadcast);
973
      f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_5x64__hvx_broadcast);
974
      f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
975
      f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_x32_packw_gemm_gio_ukernel_x64__hvx_u2;
976
      f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x64__scalar_int_u2;
977
      f32_gemm_config->mr = 5;
978
      f32_gemm_config->nr = 64;
979
    }
980
  #else
981
    f32_gemm_config->linear.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_ukernel_1x4__scalar);
982
    f32_gemm_config->linear.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_ukernel_4x4__scalar);
983
    f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x4__scalar);
984
    f32_gemm_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x4__scalar);
985
    f32_gemm_config->relu.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_relu_ukernel_1x4__scalar);
986
    f32_gemm_config->relu.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_relu_ukernel_4x4__scalar);
987
    f32_gemm_config->init.f32 = xnn_init_f32_minmax_scalar_params;
988
    f32_gemm_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_x32_packw_gemm_gio_ukernel_x4__scalar;
989
    f32_gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x4__scalar_float_u4;
990
    f32_gemm_config->mr = 4;
991
    f32_gemm_config->nr = 4;
992
  #endif
993
0
  assert(f32_gemm_config->mr <= XNN_MAX_MR);
994
0
}
995
996
0
static void init_f32_gemm_config() {
997
0
  init_f32_gemm_config_impl(&f32_gemm_config[default_config], false);
998
0
  init_f32_gemm_config_impl(&f32_gemm_config[consistent_config], true);
999
0
}
1000
1001
0
static void init_f32_igemm_config(void) {
1002
  // Common parameters.
1003
0
  f32_igemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_FLOAT;
1004
0
  f32_igemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_FLOAT;
1005
0
  f32_igemm_config.log2_filter_element_bit_size = XNN_LOG2_SIZEOF_FLOAT + 3;
1006
0
  f32_igemm_config.bias_element_size = sizeof(float);
1007
1008
  // Arch-specific parameters.
1009
  #if XNN_ARCH_ARM
1010
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1011
    assert(hardware_config != NULL);
1012
    (void) hardware_config;  // May be unused.
1013
    if (hardware_config->arch_flags & xnn_arch_arm_neon) {
1014
      #if XNN_ENABLE_ASSEMBLY
1015
        switch (hardware_config->uarch[XNN_UARCH_INDEX]) {
1016
          case xnn_uarch_cortex_a5:
1017
          case xnn_uarch_cortex_a7:
1018
          case xnn_uarch_krait:
1019
          case xnn_uarch_kryo:
1020
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53_prfm);
1021
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a7);
1022
            f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1023
            f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1024
            f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
1025
            f32_igemm_config.mr = 4;
1026
            f32_igemm_config.nr = 8;
1027
            break;
1028
          case xnn_uarch_cortex_a53:
1029
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53_prfm);
1030
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a53_prfm);
1031
            f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1032
            f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1033
            f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
1034
            f32_igemm_config.mr = 4;
1035
            f32_igemm_config.nr = 8;
1036
            break;
1037
          case xnn_uarch_cortex_a55r0:
1038
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53);
1039
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a53);
1040
            f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1041
            f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1042
            f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
1043
            f32_igemm_config.mr = 4;
1044
            f32_igemm_config.nr = 8;
1045
            break;
1046
          case xnn_uarch_cortex_a32:
1047
          case xnn_uarch_cortex_a35:
1048
          case xnn_uarch_cortex_a55:
1049
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53);
1050
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a55);
1051
            f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1052
            f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1053
            f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
1054
            f32_igemm_config.mr = 4;
1055
            f32_igemm_config.nr = 8;
1056
            break;
1057
1058
          case xnn_uarch_cortex_a57:
1059
          case xnn_uarch_cortex_a72:
1060
          case xnn_uarch_cortex_a73:
1061
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53_prfm);
1062
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a75_prfm);
1063
            f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1064
            f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1065
            f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
1066
            f32_igemm_config.mr = 4;
1067
            f32_igemm_config.nr = 8;
1068
            break;
1069
1070
          default:
1071
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53);
1072
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a75);
1073
            f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1074
            f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1075
            f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
1076
            f32_igemm_config.mr = 4;
1077
            f32_igemm_config.nr = 8;
1078
            break;
1079
        }
1080
        #if XNN_MAX_UARCH_TYPES > 1
1081
        {
1082
          /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
1083
          const uint32_t mr = f32_igemm_config.mr;
1084
          const uint32_t nr = f32_igemm_config.nr;
1085
          for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
1086
            switch (hardware_config->uarch[i]) {
1087
              case xnn_uarch_cortex_a53:
1088
                if (mr == 4 && nr == 8) {
1089
                  f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53_prfm);
1090
                  f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a53_prfm);
1091
                }
1092
                break;
1093
              case xnn_uarch_cortex_a55r0:
1094
                if (mr == 4 && nr == 8) {
1095
                  f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53);
1096
                  f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a53);
1097
                }
1098
                break;
1099
              case xnn_uarch_cortex_a55:
1100
                if (mr == 4 && nr == 8) {
1101
                  f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53);
1102
                  f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a55);
1103
                }
1104
                break;
1105
              default:
1106
                break;
1107
            }
1108
          }
1109
        }
1110
        #endif  // XNN_MAX_UARCH_TYPES > 1
1111
      #else  // XNN_ENABLE_ASSEMBLY
1112
        f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
1113
        f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128);
1114
        f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1115
        f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1116
        f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
1117
        f32_igemm_config.mr = 4;
1118
        f32_igemm_config.nr = 8;
1119
      #endif  // XNN_ENABLE_ASSEMBLY
1120
    } else {
1121
      f32_igemm_config.linear.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_ukernel_1x4__scalar);
1122
      f32_igemm_config.linear.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_ukernel_4x4__scalar);
1123
      f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x4__scalar);
1124
      f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x4__scalar);
1125
      f32_igemm_config.relu.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_relu_ukernel_1x4__scalar);
1126
      f32_igemm_config.relu.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_relu_ukernel_4x4__scalar);
1127
      f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1128
      f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_x32_packw_gemm_gio_ukernel_x4__scalar;
1129
      f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x4__scalar_float_u4;
1130
      f32_igemm_config.mr = 4;
1131
      f32_igemm_config.nr = 4;
1132
    }
1133
  #elif XNN_ARCH_ARM64
1134
    #if XNN_ENABLE_ASSEMBLY && !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
1135
        const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1136
        assert(hardware_config);
1137
        switch (hardware_config->uarch[XNN_UARCH_INDEX]) {
1138
          case xnn_uarch_cortex_a72:
1139
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75_prfm);
1140
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a75_prfm);
1141
            f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1142
            f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1143
            f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
1144
            f32_igemm_config.mr = 4;
1145
            f32_igemm_config.nr = 8;
1146
            break;
1147
          case xnn_uarch_cortex_a57:
1148
          case xnn_uarch_cortex_a75:
1149
          case xnn_uarch_cortex_a76:
1150
          case xnn_uarch_exynos_m3:
1151
          case xnn_uarch_exynos_m4:
1152
          case xnn_uarch_neoverse_n1:
1153
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75_prfm);
1154
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a75_prfm);
1155
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a75_prfm);
1156
            f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1157
            f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1158
            f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
1159
            f32_igemm_config.mr = 6;
1160
            f32_igemm_config.nr = 8;
1161
            break;
1162
          case xnn_uarch_exynos_m1:
1163
          case xnn_uarch_exynos_m2:
1164
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma);
1165
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8s4__neonfma);
1166
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma);
1167
            f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1168
            f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1169
            f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8s4__neon_ld4lane_u4_prfm;
1170
            f32_igemm_config.mr = 6;
1171
            f32_igemm_config.nr = 8;
1172
            f32_igemm_config.log2_sr = 2;
1173
            break;
1174
          case xnn_uarch_cortex_a53:
1175
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53_prfm);
1176
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a53_prfm);
1177
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a53_prfm);
1178
            f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1179
            f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1180
            f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
1181
            f32_igemm_config.mr = 6;
1182
            f32_igemm_config.nr = 8;
1183
            break;
1184
          case xnn_uarch_cortex_a55r0:
1185
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53);
1186
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a53);
1187
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a53);
1188
            f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1189
            f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1190
            f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
1191
            f32_igemm_config.mr = 6;
1192
            f32_igemm_config.nr = 8;
1193
            break;
1194
          case xnn_uarch_cortex_a35:
1195
          case xnn_uarch_cortex_a55:
1196
          case xnn_uarch_kryo:
1197
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53);
1198
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a55);
1199
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a55);
1200
            f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1201
            f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1202
            f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
1203
            f32_igemm_config.mr = 6;
1204
            f32_igemm_config.nr = 8;
1205
            break;
1206
          case xnn_uarch_cortex_a73:
1207
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75_prfm);
1208
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a73);
1209
            f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1210
            f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1211
            f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
1212
            f32_igemm_config.mr = 6;
1213
            f32_igemm_config.nr = 8;
1214
            break;
1215
          case xnn_uarch_cortex_a77:
1216
          case xnn_uarch_exynos_m5:
1217
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75);
1218
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a75);
1219
            f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1220
            f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_x32_packw_gemm_gio_ukernel_x8__neon_u2;
1221
            f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
1222
            f32_igemm_config.mr = 4;
1223
            f32_igemm_config.nr = 8;
1224
            break;
1225
          case xnn_uarch_cortex_x3:
1226
          case xnn_uarch_neoverse_v2:
1227
            // TODO(fbarchard): Implement asm with indexed inputs
1228
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64);
1229
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_lane_ld128);
1230
            f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1231
            f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_x32_packw_gemm_gio_ukernel_x8__neon_u2;
1232
            f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
1233
            f32_igemm_config.mr = 6;
1234
            f32_igemm_config.nr = 8;
1235
            break;
1236
          case xnn_uarch_oryon:
1237
          case xnn_uarch_cortex_x4:
1238
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x16__aarch64_neonfma_lane_ld128);
1239
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x16__aarch64_neonfma_lane_ld128);
1240
            f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1241
            f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_x32_packw_gemm_gio_ukernel_x16__neon_u2;
1242
            f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x16__neon_ld4lane_u4_prfm;
1243
            f32_igemm_config.mr = 4;
1244
            f32_igemm_config.nr = 16;
1245
            break;
1246
          case xnn_uarch_cortex_a78:
1247
          case xnn_uarch_cortex_a510:
1248
          case xnn_uarch_cortex_a710:
1249
          case xnn_uarch_cortex_a715:
1250
          case xnn_uarch_cortex_x1:
1251
          case xnn_uarch_cortex_x2:
1252
          case xnn_uarch_neoverse_n2:
1253
          case xnn_uarch_neoverse_v1:
1254
          default:
1255
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64);
1256
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_ld128);
1257
            f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_ld128);
1258
            f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1259
            f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1260
            f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
1261
            f32_igemm_config.mr = 6;
1262
            f32_igemm_config.nr = 8;
1263
            break;
1264
        }
1265
        #if XNN_MAX_UARCH_TYPES > 1
1266
        {
1267
          /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
1268
          const uint32_t mr = f32_igemm_config.mr;
1269
          const uint32_t nr = f32_igemm_config.nr;
1270
          const uint32_t log2_sr = f32_igemm_config.log2_sr;
1271
          for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
1272
            switch (hardware_config->uarch[i]) {
1273
              case xnn_uarch_cortex_a53:
1274
                if (mr == 6 && nr == 8 && log2_sr == 0) {
1275
                  f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53_prfm);
1276
                  f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a53_prfm);
1277
                  f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(6)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a53_prfm);
1278
                } else if (mr == 4 && nr == 8 && log2_sr == 0) {
1279
                  f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53_prfm);
1280
                  f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a53_prfm);
1281
                }
1282
                break;
1283
              case xnn_uarch_cortex_a55r0:
1284
                if (mr == 6 && nr == 8 && log2_sr == 0) {
1285
                  f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53);
1286
                  f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a53);
1287
                  f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(6)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a53);
1288
                } else if (mr == 4 && nr == 8 && log2_sr == 0) {
1289
                  f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53);
1290
                  f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a53);
1291
                }
1292
                break;
1293
              case xnn_uarch_cortex_a55:
1294
                if (mr == 6 && nr == 8 && log2_sr == 0) {
1295
                  f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53);
1296
                  f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a55);
1297
                  f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(6)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a55);
1298
                } else if (mr == 4 && nr == 8 && log2_sr == 0) {
1299
                  f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53);
1300
                  f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a55);
1301
                }
1302
                break;
1303
              default:
1304
                break;
1305
            }
1306
          }
1307
        }
1308
      #endif  // XNN_MAX_UARCH_TYPES > 1
1309
    #else  // XNN_ENABLE_ASSEMBLY && !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
1310
      #if XNN_ENABLE_ASSEMBLY
1311
        f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64);
1312
        f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_ld128);
1313
        f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1314
        f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1315
        f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
1316
        f32_igemm_config.mr = 6;
1317
        f32_igemm_config.nr = 8;
1318
      #else  // !XNN_ENABLE_ASSEMBLY
1319
        f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld128);
1320
        f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_lane_ld128);
1321
        f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1322
        f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1323
        f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
1324
        f32_igemm_config.mr = 6;
1325
        f32_igemm_config.nr = 8;
1326
       #endif  // XNN_ENABLE_ASSEMBLY
1327
    #endif  // XNN_ENABLE_ASSEMBLY && !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
1328
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1329
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1330
0
    assert(hardware_config != NULL);
1331
0
    (void) hardware_config;  // May be unused.
1332
0
    #if XNN_ENABLE_AVX512F
1333
0
      if (hardware_config->arch_flags & xnn_arch_x86_avx512f) {
1334
0
        f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x32__avx512f_broadcast);
1335
0
        f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(7)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_7x32__avx512f_broadcast);
1336
0
        f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1337
0
        f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_x32_packw_gemm_gio_ukernel_x32__avx512f_u8;
1338
0
        f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x32__avx512f_u4_prfm;
1339
0
        f32_igemm_config.mr = 7;
1340
0
        f32_igemm_config.nr = 32;
1341
0
      } else
1342
0
    #endif
1343
0
    #if XNN_ENABLE_FMA3 && XNN_ENABLE_AVX
1344
0
      if (hardware_config->arch_flags & xnn_arch_x86_fma3) {
1345
0
        switch (hardware_config->uarch[XNN_UARCH_INDEX]) {
1346
0
          #if XNN_ENABLE_FMA3 && XNN_ENABLE_AVX
1347
0
            case xnn_uarch_zen:
1348
0
            case xnn_uarch_dhyana:
1349
0
              f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast);
1350
0
              f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast);
1351
0
              f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1352
0
              f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1353
0
              f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x16s4__avx_u4;
1354
0
              f32_igemm_config.mr = 4;
1355
0
              f32_igemm_config.nr = 16;
1356
0
              f32_igemm_config.log2_sr = 2;
1357
0
              break;
1358
0
          #endif
1359
0
          default:
1360
0
            #if XNN_ENABLE_FMA3 && XNN_ENABLE_AVX
1361
0
              f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast);
1362
0
              f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast_prfm);
1363
0
              f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1364
0
              f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_x32_packw_gemm_gio_ukernel_x16__avx_u8;
1365
0
              f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x16__avx_u4;
1366
0
              f32_igemm_config.mr = 5;
1367
0
              f32_igemm_config.nr = 16;
1368
0
            #endif
1369
0
            break;
1370
0
        }
1371
0
      } else
1372
0
    #endif
1373
0
    #if XNN_ENABLE_AVX
1374
0
      if (hardware_config->arch_flags & xnn_arch_x86_avx) {
1375
0
        f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast);
1376
0
        f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast);
1377
0
        f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1378
0
        f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_x32_packw_gemm_gio_ukernel_x16__avx_u8;
1379
0
        f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x16__avx_u4;
1380
0
        f32_igemm_config.mr = 5;
1381
0
        f32_igemm_config.nr = 16;
1382
0
      } else
1383
0
    #endif
1384
0
    #if XNN_ENABLE_SSE2
1385
0
      if (hardware_config->arch_flags & xnn_arch_x86_sse2) {
1386
0
        f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__sse_load1);
1387
0
        f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__sse_load1);
1388
0
        f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1389
0
        f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1390
0
        f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__sse2_u4;
1391
0
        f32_igemm_config.mr = 4;
1392
0
        f32_igemm_config.nr = 8;
1393
0
      }
1394
0
    #endif
1395
  #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1396
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1397
    assert(hardware_config != NULL);
1398
    (void) hardware_config;  // May be unused.
1399
    if (hardware_config->is_x86) {
1400
      #if XNN_ARCH_WASMRELAXEDSIMD
1401
        f32_igemm_config.linear.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat);
1402
        f32_igemm_config.linear.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat);
1403
        f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat);
1404
        f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat);
1405
        f32_igemm_config.relu.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat);
1406
        f32_igemm_config.relu.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat);
1407
      #else
1408
        if (hardware_concurrency() > kCoreCountThresholdForAdaptiveAvxOptimization) {
1409
          f32_igemm_config.linear.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat);
1410
          f32_igemm_config.linear.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_ukernel_4x8__wasmsimd_loadsplat);
1411
          f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat);
1412
          f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat);
1413
          f32_igemm_config.relu.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat);
1414
          f32_igemm_config.relu.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_loadsplat);
1415
        } else {
1416
          f32_igemm_config.linear.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_ukernel_1x8__wasmsimd_splat);
1417
          f32_igemm_config.linear.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_ukernel_4x8__wasmsimd_splat);
1418
          f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat);
1419
          f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat);
1420
          f32_igemm_config.relu.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat);
1421
          f32_igemm_config.relu.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_splat);
1422
        }
1423
      #endif
1424
1425
      f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1426
      f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1427
      f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__wasmsimd_u4;
1428
      f32_igemm_config.mr = 4;
1429
      f32_igemm_config.nr = 8;
1430
    } else {
1431
      #if XNN_ARCH_WASMRELAXEDSIMD
1432
        f32_igemm_config.linear.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_splat);
1433
        f32_igemm_config.linear.igemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_ukernel_6x8__wasmrelaxedsimd_fma_splat);
1434
        f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat);
1435
        f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_splat);
1436
        f32_igemm_config.relu.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat);
1437
        f32_igemm_config.relu.igemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_relu_ukernel_6x8__wasmrelaxedsimd_fma_splat);
1438
        f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1439
        f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1440
        f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__wasmsimd_u4;
1441
        f32_igemm_config.mr = 6;
1442
        f32_igemm_config.nr = 8;
1443
      #else
1444
        f32_igemm_config.linear.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_ukernel_1x8__wasmsimd_splat);
1445
        f32_igemm_config.linear.igemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_ukernel_5x8__wasmsimd_splat);
1446
        f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat);
1447
        f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_splat);
1448
        f32_igemm_config.relu.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat);
1449
        f32_igemm_config.relu.igemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_splat);
1450
        f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1451
        f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1452
        f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__wasmsimd_u4;
1453
        f32_igemm_config.mr = 5;
1454
        f32_igemm_config.nr = 8;
1455
      #endif
1456
    }
1457
  #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
1458
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1459
    assert(hardware_config != NULL);
1460
    (void) hardware_config;  // May be unused.
1461
    if (hardware_config->arch_flags & xnn_arch_riscv_vector) {
1462
      f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(7)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_7x4v__rvv);
1463
      f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x4v__rvv);
1464
      f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1465
      f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1466
      f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x4v__rvv_u8;
1467
      f32_igemm_config.mr = 7;
1468
      // nr is set to vlen * 4 / sizeof(float) = 4 * VLENB * 8 / 32 = VLENB
1469
      f32_igemm_config.nr = hardware_config->vlenb;
1470
    }
1471
  #elif XNN_ARCH_HEXAGON && XNN_ENABLE_HVX
1472
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1473
    assert(hardware_config != NULL);
1474
    if (hardware_config->arch_flags & xnn_arch_hvx) {
1475
      f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x64__hvx_broadcast);
1476
      f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_5x64__hvx_broadcast);
1477
      f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1478
      f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_x32_packw_gemm_gio_ukernel_x64__hvx_u2;
1479
      f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x64__scalar_int_u2;
1480
      f32_igemm_config.mr = 5;
1481
      f32_igemm_config.nr = 64;
1482
    }
1483
  #else
1484
    f32_igemm_config.linear.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_ukernel_1x4__scalar);
1485
    f32_igemm_config.linear.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_ukernel_4x4__scalar);
1486
    f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x4__scalar);
1487
    f32_igemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x4__scalar);
1488
    f32_igemm_config.relu.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_relu_ukernel_1x4__scalar);
1489
    f32_igemm_config.relu.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_relu_ukernel_4x4__scalar);
1490
    f32_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1491
    f32_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_x32_packw_gemm_gio_ukernel_x4__scalar;
1492
    f32_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x4__scalar_float_u4;
1493
    f32_igemm_config.mr = 4;
1494
    f32_igemm_config.nr = 4;
1495
  #endif
1496
0
  assert(f32_igemm_config.mr <= XNN_MAX_MR);
1497
0
}
1498
1499
0
static void init_f32_gemm_nr2_config_impl(struct xnn_gemm_config* f32_gemm_nr2_config, bool consistent_arithmetic) {
1500
  // Common parameters.
1501
0
  f32_gemm_nr2_config->log2_input_element_size = XNN_LOG2_SIZEOF_FLOAT;
1502
0
  f32_gemm_nr2_config->log2_filter_element_size = XNN_LOG2_SIZEOF_FLOAT;
1503
0
  f32_gemm_nr2_config->log2_filter_element_bit_size = XNN_LOG2_SIZEOF_FLOAT + 3;
1504
0
  f32_gemm_nr2_config->bias_element_size = sizeof(float);
1505
1506
  // Arch-specific parameters.
1507
  #if XNN_ARCH_ARM
1508
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1509
    assert(hardware_config != NULL);
1510
    (void) hardware_config;  // May be unused.
1511
    if (hardware_config->arch_flags & xnn_arch_arm_neon) {
1512
      f32_gemm_nr2_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x2__neon_lane_ld64);
1513
      f32_gemm_nr2_config->minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x2__neon_lane_ld64);
1514
      f32_gemm_nr2_config->init.f32 = xnn_init_f32_minmax_scalar_params;
1515
      f32_gemm_nr2_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1516
      f32_gemm_nr2_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x2__neon_ld2lane_u2_prfm;
1517
      f32_gemm_nr2_config->mr = 4;
1518
      f32_gemm_nr2_config->nr = 2;
1519
    } else {
1520
      f32_gemm_nr2_config->linear.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_ukernel_4x2__scalar);
1521
      f32_gemm_nr2_config->linear.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_ukernel_4x2__scalar);
1522
      f32_gemm_nr2_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x2__scalar);
1523
      f32_gemm_nr2_config->minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x2__scalar);
1524
      f32_gemm_nr2_config->init.f32 = xnn_init_f32_minmax_scalar_params;
1525
      f32_gemm_nr2_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_x32_packw_gemm_gio_ukernel_x2__scalar;
1526
      f32_gemm_nr2_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x2__scalar_float_u4;
1527
      f32_gemm_nr2_config->mr = 4;
1528
      f32_gemm_nr2_config->nr = 2;
1529
    }
1530
  #elif XNN_ARCH_ARM64
1531
    #if XNN_ENABLE_ASSEMBLY && !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
1532
      f32_gemm_nr2_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x2__asm_aarch64_neonfma_ld128);
1533
      f32_gemm_nr2_config->minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x2__asm_aarch64_neonfma_cortex_a75_prfm);
1534
      f32_gemm_nr2_config->init.f32 = xnn_init_f32_minmax_scalar_params;
1535
      f32_gemm_nr2_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1536
      f32_gemm_nr2_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x2__neon_ld2lane_u2_prfm;
1537
      f32_gemm_nr2_config->mr = 4;
1538
      f32_gemm_nr2_config->nr = 2;
1539
    #else  // XNN_ENABLE_ASSEMBLY && !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
1540
      #if XNN_ENABLE_ASSEMBLY
1541
        f32_gemm_nr2_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x2__asm_aarch64_neonfma_ld128);
1542
        f32_gemm_nr2_config->minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x2__asm_aarch64_neonfma_cortex_a75_prfm);
1543
        f32_gemm_nr2_config->init.f32 = xnn_init_f32_minmax_scalar_params;
1544
        f32_gemm_nr2_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1545
        f32_gemm_nr2_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x2__neon_ld2lane_u2_prfm;
1546
        f32_gemm_nr2_config->mr = 4;
1547
        f32_gemm_nr2_config->nr = 2;
1548
      #else  // !XNN_ENABLE_ASSEMBLY
1549
        f32_gemm_nr2_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_lane_ld64);
1550
        f32_gemm_nr2_config->minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_lane_ld64);
1551
        f32_gemm_nr2_config->init.f32 = xnn_init_f32_minmax_scalar_params;
1552
        f32_gemm_nr2_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1553
        f32_gemm_nr2_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x2__neon_ld2lane_u2_prfm;
1554
        f32_gemm_nr2_config->mr = 4;
1555
        f32_gemm_nr2_config->nr = 2;
1556
       #endif  // XNN_ENABLE_ASSEMBLY
1557
    #endif  // XNN_ENABLE_ASSEMBLY && !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
1558
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1559
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1560
0
    assert(hardware_config != NULL);
1561
0
    (void) hardware_config;  // May be unused.
1562
0
    #if XNN_ENABLE_AVX512F && XNN_ARCH_X86_64 && !XNN_PLATFORM_WINDOWS && XNN_ENABLE_ASSEMBLY
1563
0
      if (!consistent_arithmetic && hardware_config->arch_flags & xnn_arch_x86_avx512f) {
1564
0
        f32_gemm_nr2_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x16c2__asm_amd64_avx512f_broadcast);
1565
0
        f32_gemm_nr2_config->minmax.gemm[XNN_MR_TO_INDEX(10)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_10x16c2__asm_amd64_avx512f_broadcast);
1566
0
        f32_gemm_nr2_config->init.f32 = xnn_init_f32_minmax_scalar_params;
1567
0
        f32_gemm_nr2_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1568
0
        f32_gemm_nr2_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_f32_gemm_goi_w;
1569
0
        f32_gemm_nr2_config->mr = 10;
1570
0
        f32_gemm_nr2_config->nr = 16;
1571
0
        f32_gemm_nr2_config->log2_kr = 1;
1572
0
        f32_gemm_nr2_config->log2_sr = 0;
1573
0
      } else
1574
0
    #endif
1575
0
    #if XNN_ENABLE_AVX512F
1576
0
      if (hardware_config->arch_flags & xnn_arch_x86_avx512f) {
1577
0
        f32_gemm_nr2_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast);
1578
0
        f32_gemm_nr2_config->minmax.gemm[XNN_MR_TO_INDEX(7)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast);
1579
0
        f32_gemm_nr2_config->minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast);
1580
0
        f32_gemm_nr2_config->minmax.igemm[XNN_MR_TO_INDEX(7)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast);
1581
0
        f32_gemm_nr2_config->init.f32 = xnn_init_f32_minmax_scalar_params;
1582
0
        f32_gemm_nr2_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_x32_packw_gemm_gio_ukernel_x16__avx512f_u8;
1583
0
        f32_gemm_nr2_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x16__avx512f_u4_prfm;
1584
0
        f32_gemm_nr2_config->mr = 7;
1585
0
        f32_gemm_nr2_config->nr = 16;
1586
0
      } else
1587
0
    #endif
1588
0
    #if XNN_ENABLE_FMA3 && XNN_ENABLE_AVX
1589
0
      if (hardware_config->arch_flags & xnn_arch_x86_fma3) {
1590
0
        switch (hardware_config->uarch[XNN_UARCH_INDEX]) {
1591
0
          #if XNN_ENABLE_FMA3 && XNN_ENABLE_AVX
1592
0
            case xnn_uarch_zen:
1593
0
            case xnn_uarch_dhyana:
1594
0
              f32_gemm_nr2_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast);
1595
0
              f32_gemm_nr2_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast);
1596
0
              f32_gemm_nr2_config->minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast);
1597
0
              f32_gemm_nr2_config->minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x8__fma3_broadcast);
1598
0
              f32_gemm_nr2_config->init.f32 = xnn_init_f32_minmax_scalar_params;
1599
0
              f32_gemm_nr2_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1600
0
              f32_gemm_nr2_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__avx_u4;
1601
0
              f32_gemm_nr2_config->mr = 4;
1602
0
              f32_gemm_nr2_config->nr = 8;
1603
0
              break;
1604
0
          #endif
1605
0
            default:
1606
0
            #if XNN_ENABLE_FMA3 && XNN_ENABLE_AVX
1607
0
              f32_gemm_nr2_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast);
1608
0
              f32_gemm_nr2_config->minmax.gemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast);
1609
0
              f32_gemm_nr2_config->minmax.gemm[XNN_MR_TO_INDEX(10)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_10x8__fma3_broadcast);
1610
0
              f32_gemm_nr2_config->minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast);
1611
0
              f32_gemm_nr2_config->minmax.igemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_5x8__fma3_broadcast);
1612
0
              f32_gemm_nr2_config->minmax.igemm[XNN_MR_TO_INDEX(10)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_10x8__fma3_broadcast);
1613
0
              f32_gemm_nr2_config->init.f32 = xnn_init_f32_minmax_scalar_params;
1614
0
              f32_gemm_nr2_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_x32_packw_gemm_gio_ukernel_x8__avx_u8;
1615
0
              f32_gemm_nr2_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__avx_u4;
1616
0
              f32_gemm_nr2_config->mr = 10;
1617
0
              f32_gemm_nr2_config->nr = 8;
1618
0
            #endif
1619
0
              break;
1620
0
        }
1621
0
      } else
1622
0
    #endif
1623
0
    #if XNN_ENABLE_AVX
1624
0
      if (hardware_config->arch_flags & xnn_arch_x86_avx) {
1625
0
        f32_gemm_nr2_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast);
1626
0
        f32_gemm_nr2_config->minmax.gemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast);
1627
0
        f32_gemm_nr2_config->minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast);
1628
0
        f32_gemm_nr2_config->minmax.igemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_5x8__avx_broadcast);
1629
0
        f32_gemm_nr2_config->init.f32 = xnn_init_f32_minmax_scalar_params;
1630
0
        f32_gemm_nr2_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_x32_packw_gemm_gio_ukernel_x8__avx_u8;
1631
0
        f32_gemm_nr2_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__avx_u4;
1632
0
        f32_gemm_nr2_config->mr = 5;
1633
0
        f32_gemm_nr2_config->nr = 8;
1634
0
      } else
1635
0
    #endif
1636
0
    #if XNN_ENABLE_SSE2
1637
0
      if (hardware_config->arch_flags & xnn_arch_x86_sse2) {
1638
0
        f32_gemm_nr2_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x2c4__sse);
1639
0
        f32_gemm_nr2_config->minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x2c4__sse);
1640
0
        f32_gemm_nr2_config->init.f32 = xnn_init_f32_minmax_scalar_params;
1641
0
        f32_gemm_nr2_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1642
0
        f32_gemm_nr2_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x2c4__sse2_u4;
1643
0
        f32_gemm_nr2_config->mr = 4;
1644
0
        f32_gemm_nr2_config->nr = 2;
1645
0
        f32_gemm_nr2_config->log2_kr = 2;
1646
0
      }
1647
0
    #endif
1648
  #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1649
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1650
    assert(hardware_config != NULL);
1651
    (void) hardware_config;  // May be unused.
1652
    if (hardware_config->is_x86) {
1653
      #if XNN_ARCH_WASMRELAXEDSIMD
1654
        f32_gemm_nr2_config->linear.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_ukernel_4x2c4__wasmrelaxedsimd_fma);
1655
        f32_gemm_nr2_config->linear.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_ukernel_4x2c4__wasmrelaxedsimd_fma);
1656
        f32_gemm_nr2_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmrelaxedsimd_fma);
1657
        f32_gemm_nr2_config->minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmrelaxedsimd_fma);
1658
      #else
1659
        f32_gemm_nr2_config->linear.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
1660
        f32_gemm_nr2_config->linear.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
1661
        f32_gemm_nr2_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
1662
        f32_gemm_nr2_config->minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
1663
      #endif
1664
1665
      f32_gemm_nr2_config->init.f32 = xnn_init_f32_minmax_scalar_params;
1666
      f32_gemm_nr2_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1667
      f32_gemm_nr2_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x2c4__wasmsimd_u4;
1668
      f32_gemm_nr2_config->mr = 4;
1669
      f32_gemm_nr2_config->nr = 2;
1670
      f32_gemm_nr2_config->log2_kr = 2;
1671
    } else {
1672
      #if XNN_ARCH_WASMRELAXEDSIMD
1673
        f32_gemm_nr2_config->linear.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_ukernel_4x2c4__wasmrelaxedsimd_fma);
1674
        f32_gemm_nr2_config->linear.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_ukernel_4x2c4__wasmrelaxedsimd_fma);
1675
        f32_gemm_nr2_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmrelaxedsimd_fma);
1676
        f32_gemm_nr2_config->minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmrelaxedsimd_fma);
1677
      #else
1678
        f32_gemm_nr2_config->linear.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
1679
        f32_gemm_nr2_config->linear.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
1680
        f32_gemm_nr2_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
1681
        f32_gemm_nr2_config->minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
1682
      #endif
1683
1684
      f32_gemm_nr2_config->init.f32 = xnn_init_f32_minmax_scalar_params;
1685
      f32_gemm_nr2_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
1686
      f32_gemm_nr2_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x2c4__wasmsimd_u4;
1687
      f32_gemm_nr2_config->mr = 4;
1688
      f32_gemm_nr2_config->nr = 2;
1689
      f32_gemm_nr2_config->log2_kr = 2;
1690
    }
1691
  #elif XNN_ARCH_HEXAGON && XNN_ENABLE_HVX
1692
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1693
    assert(hardware_config != NULL);
1694
    if (hardware_config->arch_flags & xnn_arch_hvx) {
1695
      f32_gemm_nr2_config->minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_1x32__hvx_broadcast);
1696
      f32_gemm_nr2_config->minmax.gemm[XNN_MR_TO_INDEX(8)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_8x32__hvx_broadcast);
1697
      f32_gemm_nr2_config->init.f32 = xnn_init_f32_minmax_scalar_params;
1698
      f32_gemm_nr2_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_x32_packw_gemm_gio_ukernel_x32__hvx_u2;
1699
      f32_gemm_nr2_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x32__scalar_int_u2;
1700
      f32_gemm_nr2_config->mr = 8;
1701
      f32_gemm_nr2_config->nr = 32;
1702
    }
1703
  #else
1704
    f32_gemm_nr2_config->linear.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_ukernel_4x2__scalar);
1705
    f32_gemm_nr2_config->linear.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_ukernel_4x2__scalar);
1706
    f32_gemm_nr2_config->minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_gemm_minmax_ukernel_4x2__scalar);
1707
    f32_gemm_nr2_config->minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_f32_igemm_minmax_ukernel_4x2__scalar);
1708
    f32_gemm_nr2_config->init.f32 = xnn_init_f32_minmax_scalar_params;
1709
    f32_gemm_nr2_config->pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_x32_packw_gemm_gio_ukernel_x2__scalar;
1710
    f32_gemm_nr2_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x2__scalar_float_u4;
1711
    f32_gemm_nr2_config->mr = 4;
1712
    f32_gemm_nr2_config->nr = 2;
1713
  #endif
1714
0
  assert(f32_gemm_nr2_config->mr <= XNN_MAX_MR);
1715
0
}
1716
1717
0
static void init_f32_gemm_nr2_config() {
1718
0
  init_f32_gemm_nr2_config_impl(&f32_gemm_nr2_config[default_config], false);
1719
0
  init_f32_gemm_nr2_config_impl(&f32_gemm_nr2_config[consistent_config], true);
1720
0
}
1721
1722
0
// Initializes the global f32-qc4w GEMM configuration (float activations,
// per-channel 4-bit quantized weights), selecting microkernels per the
// architecture this library was built for and the CPU features detected at
// runtime. On x86 the ISA checks cascade from widest to narrowest: each
// guarded `if (...) { ... } else` deliberately dangles across its #endif so
// that when a feature is compiled out or absent, control falls through to
// the next-lower ISA, ending at the unconditional SSE4.1 fallback block.
static void init_f32_qc4w_gemm_config(void) {
  // Common parameters.
  f32_qc4w_gemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_FLOAT;
  // Pass 1 byte even though it is half byte, we handle the division via filter_is_nibble == true.
  f32_qc4w_gemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_UINT8_T;
  f32_qc4w_gemm_config.log2_filter_element_bit_size = XNN_LOG2_BIT_SIZEOF_INT4;
  f32_qc4w_gemm_config.bias_element_size = sizeof(float);
  f32_qc4w_gemm_config.planes = 1;

  // Arch-specific parameters.
  #if XNN_ARCH_ARM
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
    assert(hardware_config != NULL);
    (void) hardware_config;  // May be unused.
    // NEON if available at runtime, otherwise portable scalar kernels.
    if (hardware_config->arch_flags & xnn_arch_arm_neon) {
      f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc4w_gemm_minmax_ukernel_1x8__neon_lane_ld64);
      f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc4w_gemm_minmax_ukernel_4x8__neon_lane_ld64);
      f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
      f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_f32_qc4w_gemm_goi_w;
      f32_qc4w_gemm_config.mr = 4;
      f32_qc4w_gemm_config.nr = 8;
    } else {
      f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc4w_gemm_minmax_ukernel_1x4__scalar);
      f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc4w_gemm_minmax_ukernel_4x4__scalar);
      f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
      f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_f32_qc4w_gemm_goi_w;
      f32_qc4w_gemm_config.mr = 4;
      f32_qc4w_gemm_config.nr = 4;
    }
  #elif XNN_ARCH_ARM64
    // AArch64 always has NEON+FMA; no runtime dispatch needed here.
    f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc4w_gemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld128);
    f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc4w_gemm_minmax_ukernel_4x8__aarch64_neonfma_lane_ld128);
    f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc4w_gemm_minmax_ukernel_6x8__aarch64_neonfma_lane_ld128);
    f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
    f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_f32_qc4w_gemm_goi_w;
    f32_qc4w_gemm_config.mr = 6;
    f32_qc4w_gemm_config.nr = 8;
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
    assert(hardware_config != NULL);
    (void) hardware_config;  // May be unused.
    // ISA cascade: AVX512-SKX -> AVX2 -> FMA3 -> AVX -> SSE4.1 fallback.
    #if XNN_ENABLE_AVX512SKX
      if (hardware_config->arch_flags & xnn_arch_x86_avx512skx) {
        f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc4w_gemm_minmax_ukernel_1x32__avx512skx_broadcast);
        f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc4w_gemm_minmax_ukernel_7x32__avx512skx_broadcast);
        f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
        f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_f32_qc4w_gemm_goi_w;
        f32_qc4w_gemm_config.mr = 7;
        f32_qc4w_gemm_config.nr = 32;
      } else  // dangling else: falls through to the next enabled ISA check
    #endif
    #if XNN_ENABLE_AVX2
      if (hardware_config->arch_flags & xnn_arch_x86_avx2) {
        f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc4w_gemm_minmax_ukernel_1x16__avx2_broadcast);
        f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc4w_gemm_minmax_ukernel_3x16__avx2_broadcast);
        f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
        f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_f32_qc4w_gemm_goi_w;
        f32_qc4w_gemm_config.mr = 3;
        f32_qc4w_gemm_config.nr = 16;
    } else
    #endif
    #if XNN_ENABLE_FMA3
      if (hardware_config->arch_flags & xnn_arch_x86_fma3) {
        f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc4w_gemm_minmax_ukernel_1x16__fma3_broadcast);
        f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc4w_gemm_minmax_ukernel_3x16__fma3_broadcast);
        f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
        f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_f32_qc4w_gemm_goi_w;
        f32_qc4w_gemm_config.mr = 3;
        f32_qc4w_gemm_config.nr = 16;
    } else
    #endif
    #if XNN_ENABLE_AVX
      if (hardware_config->arch_flags & xnn_arch_x86_avx) {
        f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc4w_gemm_minmax_ukernel_1x16__avx_broadcast);
        f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc4w_gemm_minmax_ukernel_3x16__avx_broadcast);
        f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
        f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_f32_qc4w_gemm_goi_w;
        f32_qc4w_gemm_config.mr = 3;
        f32_qc4w_gemm_config.nr = 16;
    } else
    #endif
    // Unconditional fallback: SSE4.1 is assumed on this build's x86 baseline.
    {
      f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc4w_gemm_minmax_ukernel_1x8__sse41_dup);
      f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc4w_gemm_minmax_ukernel_4x8__sse41_dup);
      f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
      f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_f32_qc4w_gemm_goi_w;
      f32_qc4w_gemm_config.mr = 4;
      f32_qc4w_gemm_config.nr = 8;
    }
  #else
    // Portable scalar kernels for all other architectures.
    f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc4w_gemm_minmax_ukernel_1x4__scalar);
    f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc4w_gemm_minmax_ukernel_4x4__scalar);
    f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
    f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_f32_qc4w_gemm_goi_w;
    f32_qc4w_gemm_config.mr = 4;
    f32_qc4w_gemm_config.nr = 4;
  #endif
  assert(f32_qc4w_gemm_config.mr <= XNN_MAX_MR);
}
1821
1822
0
static void init_f32_qc8w_gemm_config(void) {
1823
  // Common parameters.
1824
0
  f32_qc8w_gemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_FLOAT;
1825
0
  f32_qc8w_gemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_INT8_T;
1826
0
  f32_qc8w_gemm_config.log2_filter_element_bit_size = XNN_LOG2_SIZEOF_INT8_T + 3;
1827
0
  f32_qc8w_gemm_config.bias_element_size = sizeof(float);
1828
1829
  // Arch-specific parameters.
1830
  #if XNN_ARCH_ARM
1831
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1832
    assert(hardware_config != NULL);
1833
    (void) hardware_config;  // May be unused.
1834
    if (hardware_config->arch_flags & xnn_arch_arm_neon) {
1835
      f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_1x8__neon_lane_ld64);
1836
      f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_4x8__neon_lane_ld64);
1837
      f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1838
      f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_qs8w_gemm_gio_w;
1839
      f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x8_packw_gemm_goi_ukernel_x8__scalar_u2;
1840
      f32_qc8w_gemm_config.mr = 4;
1841
      f32_qc8w_gemm_config.nr = 8;
1842
    } else {
1843
      f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_1x4__scalar);
1844
      f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_4x4__scalar);
1845
      f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1846
      f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_qs8w_gemm_gio_w;
1847
      f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x8_packw_gemm_goi_ukernel_x4__scalar_u2;
1848
      f32_qc8w_gemm_config.mr = 4;
1849
      f32_qc8w_gemm_config.nr = 4;
1850
    }
1851
  #elif XNN_ARCH_ARM64
1852
    #if XNN_ENABLE_ASSEMBLY && !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
1853
      const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1854
      assert(hardware_config);
1855
      switch (hardware_config->uarch[XNN_UARCH_INDEX]) {
1856
        // TODO(fbarchard): fill in microkernels.
1857
        case xnn_uarch_cortex_a72:
1858
        case xnn_uarch_cortex_a57:
1859
        case xnn_uarch_cortex_a75:
1860
        case xnn_uarch_cortex_a76:
1861
        case xnn_uarch_exynos_m3:
1862
        case xnn_uarch_exynos_m4:
1863
        case xnn_uarch_exynos_m1:
1864
        case xnn_uarch_exynos_m2:
1865
        case xnn_uarch_cortex_a53:
1866
        case xnn_uarch_cortex_a55r0:
1867
        case xnn_uarch_cortex_a35:
1868
        case xnn_uarch_cortex_a55:
1869
        case xnn_uarch_kryo:
1870
        case xnn_uarch_cortex_a73:
1871
        case xnn_uarch_cortex_a77:
1872
        case xnn_uarch_exynos_m5:
1873
        case xnn_uarch_cortex_a78:
1874
        case xnn_uarch_cortex_x1:
1875
        case xnn_uarch_neoverse_n1:
1876
        case xnn_uarch_neoverse_v1:
1877
        default:
1878
          f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128_acc4);
1879
          f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_ld128);
1880
          f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1881
          f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_qs8w_gemm_gio_w;
1882
          f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x8_packw_gemm_goi_ukernel_x8__scalar_u2;
1883
          f32_qc8w_gemm_config.mr = 6;
1884
          f32_qc8w_gemm_config.nr = 8;
1885
      }
1886
      #if XNN_MAX_UARCH_TYPES > 1
1887
        /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
1888
        const uint32_t mr = f32_qc8w_gemm_config.mr;
1889
        const uint32_t nr = f32_qc8w_gemm_config.nr;
1890
        const uint32_t log2_sr = f32_qc8w_gemm_config.log2_sr;
1891
        // TODO(fbarchard): fill in with microkernels.
1892
        (void) mr;
1893
        (void) nr;
1894
        (void) log2_sr;
1895
        for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
1896
          switch (hardware_config->uarch[i]) {
1897
            case xnn_uarch_cortex_a53:
1898
            case xnn_uarch_cortex_a55r0:
1899
            case xnn_uarch_cortex_a55:
1900
            default:
1901
              break;
1902
          }
1903
        }
1904
      #endif  // XNN_MAX_UARCH_TYPES > 1
1905
    #else  // XNN_ENABLE_ASSEMBLY && !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
1906
      #if XNN_ENABLE_ASSEMBLY
1907
        f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128);
1908
        f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_ld128);
1909
        f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1910
        f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_qs8w_gemm_gio_w;
1911
        f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x8_packw_gemm_goi_ukernel_x8__scalar_u2;
1912
        f32_qc8w_gemm_config.mr = 6;
1913
        f32_qc8w_gemm_config.nr = 8;
1914
      #else  // !XNN_ENABLE_ASSEMBLY
1915
        f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld64);
1916
        f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_6x8__aarch64_neonfma_lane_ld64);
1917
        f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1918
        f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_qs8w_gemm_gio_w;
1919
        f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x8_packw_gemm_goi_ukernel_x8__scalar_u2;
1920
        f32_qc8w_gemm_config.mr = 6;
1921
        f32_qc8w_gemm_config.nr = 8;
1922
      #endif  // XNN_ENABLE_ASSEMBLY
1923
    #endif  // XNN_ENABLE_ASSEMBLY && !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
1924
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1925
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1926
0
    assert(hardware_config != NULL);
1927
0
    (void) hardware_config;  // May be unused.
1928
0
    #if XNN_ENABLE_AVX512SKX
1929
0
      if (hardware_config->arch_flags & xnn_arch_x86_avx512skx) {
1930
0
        f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_1x32__avx512skx_broadcast);
1931
0
        f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_7x32__avx512skx_broadcast);
1932
0
        f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1933
0
        f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_qs8w_gemm_gio_w;
1934
0
        f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x8_packw_gemm_goi_ukernel_x32__scalar_u2;
1935
0
        f32_qc8w_gemm_config.mr = 7;
1936
0
        f32_qc8w_gemm_config.nr = 32;
1937
0
      } else
1938
0
    #endif
1939
0
    #if XNN_ENABLE_AVX2
1940
0
      if (hardware_config->arch_flags & xnn_arch_x86_avx2) {
1941
0
        f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_1x16__avx2_broadcast);
1942
0
        f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_5x16__avx2_broadcast);
1943
0
        f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1944
0
        f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_qs8w_gemm_gio_w;
1945
0
        f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x8_packw_gemm_goi_ukernel_x16__scalar_u2;
1946
0
        f32_qc8w_gemm_config.mr = 5;
1947
0
        f32_qc8w_gemm_config.nr = 16;
1948
0
    } else
1949
0
    #endif
1950
0
    #if XNN_ENABLE_FMA3
1951
0
      if (hardware_config->arch_flags & xnn_arch_x86_fma3) {
1952
0
        f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_1x16__fma3_broadcast);
1953
0
        f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_5x16__fma3_broadcast);
1954
0
        f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1955
0
        f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_qs8w_gemm_gio_w;
1956
0
        f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x8_packw_gemm_goi_ukernel_x16__scalar_u2;
1957
0
        f32_qc8w_gemm_config.mr = 5;
1958
0
        f32_qc8w_gemm_config.nr = 16;
1959
0
    } else
1960
0
    #endif
1961
0
    #if XNN_ENABLE_AVX
1962
0
      if (hardware_config->arch_flags & xnn_arch_x86_avx) {
1963
0
        f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_1x16__avx_broadcast);
1964
0
        f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_5x16__avx_broadcast);
1965
0
        f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1966
0
        f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_qs8w_gemm_gio_w;
1967
0
        f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x8_packw_gemm_goi_ukernel_x16__scalar_u2;
1968
0
        f32_qc8w_gemm_config.mr = 5;
1969
0
        f32_qc8w_gemm_config.nr = 16;
1970
0
    } else
1971
0
    #endif
1972
0
     {
1973
0
      f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_1x8__sse41_dup);
1974
0
      f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_4x8__sse41_dup);
1975
0
      f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1976
0
      f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_qs8w_gemm_gio_w;
1977
0
      f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x8_packw_gemm_goi_ukernel_x8__scalar_u2;
1978
0
      f32_qc8w_gemm_config.mr = 4;
1979
0
      f32_qc8w_gemm_config.nr = 8;
1980
0
    }
1981
  #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1982
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1983
    assert(hardware_config != NULL);
1984
    (void) hardware_config;  // May be unused.
1985
    if (hardware_config->is_x86) {
1986
      #if XNN_ARCH_WASMRELAXEDSIMD
1987
        f32_qc8w_gemm_config.linear.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat);
1988
        f32_qc8w_gemm_config.linear.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat);
1989
        f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat);
1990
        f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat);
1991
        f32_qc8w_gemm_config.relu.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat);
1992
        f32_qc8w_gemm_config.relu.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat);
1993
      #else
1994
        f32_qc8w_gemm_config.linear.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_ukernel_1x8__wasmsimd_splat);
1995
        f32_qc8w_gemm_config.linear.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_ukernel_4x8__wasmsimd_splat);
1996
        f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat);
1997
        f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat);
1998
        f32_qc8w_gemm_config.relu.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_relu_ukernel_1x8__wasmsimd_splat);
1999
        f32_qc8w_gemm_config.relu.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_relu_ukernel_4x8__wasmsimd_splat);
2000
      #endif
2001
2002
      f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
2003
      f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_qs8w_gemm_gio_w;
2004
      f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x8_packw_gemm_goi_ukernel_x8__scalar_u2;
2005
      f32_qc8w_gemm_config.mr = 4;
2006
      f32_qc8w_gemm_config.nr = 8;
2007
    } else {
2008
      #if XNN_ARCH_WASMRELAXEDSIMD
2009
        f32_qc8w_gemm_config.linear.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_ukernel_1x8__wasmrelaxedsimd_fma_splat);
2010
        f32_qc8w_gemm_config.linear.gemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_ukernel_6x8__wasmrelaxedsimd_fma_splat);
2011
        f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat);
2012
        f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_splat);
2013
        f32_qc8w_gemm_config.relu.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat);
2014
        f32_qc8w_gemm_config.relu.gemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_relu_ukernel_6x8__wasmrelaxedsimd_fma_splat);
2015
        f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
2016
        f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_qs8w_gemm_gio_w;
2017
        f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x8_packw_gemm_goi_ukernel_x8__scalar_u2;
2018
        f32_qc8w_gemm_config.mr = 6;
2019
        f32_qc8w_gemm_config.nr = 8;
2020
        f32_qc8w_gemm_config.linear.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_ukernel_1x8__wasmsimd_splat);
2021
        f32_qc8w_gemm_config.linear.gemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_ukernel_5x8__wasmsimd_splat);
2022
        f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat);
2023
        f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat);
2024
        f32_qc8w_gemm_config.relu.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_relu_ukernel_1x8__wasmsimd_splat);
2025
        f32_qc8w_gemm_config.relu.gemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_relu_ukernel_5x8__wasmsimd_splat);
2026
      #else
2027
        f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
2028
        f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_qs8w_gemm_gio_w;
2029
        f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x8_packw_gemm_goi_ukernel_x8__scalar_u2;
2030
        f32_qc8w_gemm_config.mr = 5;
2031
        f32_qc8w_gemm_config.nr = 8;
2032
      #endif
2033
    }
2034
  #else
2035
    f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_1x4__scalar);
2036
    f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_f32_qc8w_gemm_minmax_ukernel_4x4__scalar);
2037
    f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
2038
    f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_qs8w_gemm_gio_w;
2039
    f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x8_packw_gemm_goi_ukernel_x4__scalar_u2;
2040
    f32_qc8w_gemm_config.mr = 4;
2041
    f32_qc8w_gemm_config.nr = 4;
2042
  #endif
2043
0
  assert(f32_qc8w_gemm_config.mr <= XNN_MAX_MR);
2044
0
}
2045
2046
0
// Initializes the GEMM config for dynamically-quantized uint8 activations
// ("qdu8") with f16 output and per-channel int4 ("qc4w") weights.
// Fills in the common element-size/packing parameters first, then picks the
// best microkernel variant available on the current x86/x86-64 CPU, probing
// ISA extensions from most to least capable (AVX256VNNI > AVXVNNI >
// AVX256SKX > AVX2).  On non-x86 targets only the common parameters are set.
static void init_qdu8_f16_qc4w_gemm_config(void) {
  // Common parameters.
  // Activations are uint8; weights are int4 nibbles stored in uint8 bytes,
  // so the byte size and the logical bit width are tracked separately.
  qdu8_f16_qc4w_gemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_UINT8_T;
  qdu8_f16_qc4w_gemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_UINT8_T;
  qdu8_f16_qc4w_gemm_config.log2_filter_element_bit_size = XNN_LOG2_BIT_SIZEOF_INT4;
  qdu8_f16_qc4w_gemm_config.bias_element_size = sizeof(float);
  // Use the same packing function throughout.
  qdu8_f16_qc4w_gemm_config.pack_weights_and_biases =
      (xnn_pack_weights_and_biases_fn)xnn_pack_qs4_weights_and_biases;
  qdu8_f16_qc4w_gemm_config.packed_stride_weights_and_biases =
      (xnn_packed_stride_weights_and_biases_fn)
          xnn_packed_stride_qs4_weights_and_biases;
  qdu8_f16_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w;
  qdu8_f16_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w;

  // Arch-specific parameters.
  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
    assert(hardware_config != NULL);
    (void) hardware_config;  // May be unused.
    // NOTE: each `} else` below deliberately dangles across its `#endif` so
    // that, whatever subset of ISA options is compiled in, the runtime checks
    // chain together into a single if/else-if cascade.
    #if XNN_ENABLE_AVX256VNNI
      if (hardware_config->arch_flags & xnn_arch_x86_avx256vnni) {
        qdu8_f16_qc4w_gemm_config.arch = xnn_arch_x86_avx256vnni;
        qdu8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx256vnni);
        qdu8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(8)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc4w_gemm_minmax_ukernel_8x8c8__avx256vnni);
        qdu8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_scalar_params;
        qdu8_f16_qc4w_gemm_config.mr = 8;
        qdu8_f16_qc4w_gemm_config.nr = 8;
        qdu8_f16_qc4w_gemm_config.log2_kr = 3;  // c8 kernels: 8 (=1<<3) channels per K step.
        qdu8_f16_qc4w_gemm_config.planes = 2;   // Two int4 planes per weight byte.
      } else
    #endif
    #if XNN_ENABLE_AVXVNNI
      if (hardware_config->arch_flags & xnn_arch_x86_avxvnni) {
        qdu8_f16_qc4w_gemm_config.arch = xnn_arch_x86_avxvnni;
        qdu8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avxvnni_prfm);
        qdu8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc4w_gemm_minmax_ukernel_5x8c8__avxvnni_prfm);
        qdu8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_scalar_params;
        qdu8_f16_qc4w_gemm_config.mr = 5;
        qdu8_f16_qc4w_gemm_config.nr = 8;
        qdu8_f16_qc4w_gemm_config.log2_kr = 3;
        qdu8_f16_qc4w_gemm_config.planes = 2;
      } else
    #endif
    #if XNN_ENABLE_AVX256SKX
      if (hardware_config->arch_flags & xnn_arch_x86_avx256skx) {
        qdu8_f16_qc4w_gemm_config.arch = xnn_arch_x86_avx256skx;
        qdu8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx256skx_madd_prfm);
        qdu8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(8)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc4w_gemm_minmax_ukernel_8x8c8__avx256skx_madd_prfm);
        // The madd kernels need unsigned-int4 packing, so the generic
        // weights-and-biases packers chosen above are disabled in favor of
        // the qc4uw GIO/GOI packers.
        qdu8_f16_qc4w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
        qdu8_f16_qc4w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
        qdu8_f16_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4uw_gemm_gio_w;
        qdu8_f16_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4uw_gemm_goi_w;
        qdu8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_scalar_params;
        qdu8_f16_qc4w_gemm_config.mr = 8;
        qdu8_f16_qc4w_gemm_config.nr = 8;
        qdu8_f16_qc4w_gemm_config.log2_kr = 3;
        qdu8_f16_qc4w_gemm_config.planes = 2;
      } else
    #endif
    if (hardware_config->arch_flags & xnn_arch_x86_avx2) {
      qdu8_f16_qc4w_gemm_config.arch = xnn_arch_x86_avx2;
      qdu8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx2_madd_prfm);
      qdu8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc4w_gemm_minmax_ukernel_4x8c8__avx2_madd_prfm);
      // Same unsigned-int4 packing override as the AVX256SKX madd path.
      qdu8_f16_qc4w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
      qdu8_f16_qc4w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
      qdu8_f16_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4uw_gemm_gio_w;
      qdu8_f16_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4uw_gemm_goi_w;
      qdu8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_scalar_params;
      qdu8_f16_qc4w_gemm_config.mr = 4;
      qdu8_f16_qc4w_gemm_config.nr = 8;
      qdu8_f16_qc4w_gemm_config.log2_kr = 3;
      qdu8_f16_qc4w_gemm_config.planes = 2;
    }
  #endif
  assert(qdu8_f16_qc4w_gemm_config.mr <= XNN_MAX_MR);
  assert(qdu8_f16_qc4w_gemm_config.mr <= (XNN_EXTRA_QUANTIZATION_PARAMS + 1));
}
2124
2125
0
// Initializes the GEMM config for dynamically-quantized int8 activations
// ("qd8") with f16 output and per-channel int4 ("qc4w") weights.
// Common size/packing parameters are set unconditionally; the microkernel
// selection then runs only on ARM/ARM64 builds with fp16 support compiled in
// (these kernels produce f16 output).  Selection order on ARM64:
// I8MM > DotProd > plain NEON fp16 mlal_lane.
static void init_qd8_f16_qc4w_gemm_config(void) {
  // Common parameters.
  // Activations are int8; weights are int4 nibbles in uint8 containers.
  qd8_f16_qc4w_gemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_INT8_T;
  qd8_f16_qc4w_gemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_UINT8_T;
  qd8_f16_qc4w_gemm_config.log2_filter_element_bit_size = XNN_LOG2_BIT_SIZEOF_INT4;
  qd8_f16_qc4w_gemm_config.bias_element_size = sizeof(float);
  // Use the same packing function throughout.
  qd8_f16_qc4w_gemm_config.pack_weights_and_biases =
      (xnn_pack_weights_and_biases_fn)xnn_pack_qs4_weights_and_biases;
  qd8_f16_qc4w_gemm_config.packed_stride_weights_and_biases =
      (xnn_packed_stride_weights_and_biases_fn)
          xnn_packed_stride_qs4_weights_and_biases;
  qd8_f16_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w;
  qd8_f16_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w;

  // Arch-specific parameters.
  #if XNN_ENABLE_ARM_FP16_SCALAR && XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
    assert(hardware_config != NULL);
    (void) hardware_config;  // May be unused.
    if (hardware_config->arch_flags & xnn_arch_arm_neon) {
      // DotProd path needs both the dot-product and fp16-arith CPU features
      // at runtime, plus XNN_ENABLE_ARM_DOTPROD at compile time (the inner
      // #if keeps the kernel references out of builds without it).
      if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot) && (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith)) {
        #if XNN_ENABLE_ARM_DOTPROD
          qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x16c4__neondotfp16arith);
          qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc4w_gemm_minmax_ukernel_4x16c4__neondotfp16arith);
          qd8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_scalar_params;
          qd8_f16_qc4w_gemm_config.mr = 4;
          qd8_f16_qc4w_gemm_config.nr = 16;
          qd8_f16_qc4w_gemm_config.log2_kr = 2;  // c4 kernels: 4 (=1<<2) channels per K step.
          qd8_f16_qc4w_gemm_config.planes = 2;   // Two int4 planes per weight byte.
        #endif  // XNN_ENABLE_ARM_DOTPROD
      } else {
        // NOTE(review): this fallback selects neonfp16arith kernels without a
        // runtime xnn_arch_arm_neon_fp16_arith check (unlike the qb4w variant
        // below) — presumably the XNN_ENABLE_ARM_FP16_* build gates make that
        // safe; confirm against the hardware-config contract.
        qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x16__neonfp16arith_mlal_lane);
        qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc4w_gemm_minmax_ukernel_6x16__neonfp16arith_mlal_lane);
        qd8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_scalar_params;
        qd8_f16_qc4w_gemm_config.mr = 6;
        qd8_f16_qc4w_gemm_config.nr = 16;
        qd8_f16_qc4w_gemm_config.planes = 2;
      }
    }
  #elif XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
    assert(hardware_config != NULL);
    (void) hardware_config;  // May be unused.
    if (XNN_ENABLE_ARM_I8MM && (hardware_config->arch_flags & xnn_arch_arm_neon_i8mm)) {
      #if XNN_ENABLE_ARM_I8MM
        qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x16c8__neoni8mm);
        qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc4w_gemm_minmax_ukernel_4x16c8__neoni8mm);
        qd8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_scalar_params;
        qd8_f16_qc4w_gemm_config.mr = 4;
        qd8_f16_qc4w_gemm_config.nr = 16;
        qd8_f16_qc4w_gemm_config.log2_kr = 3;
        qd8_f16_qc4w_gemm_config.planes = 2;
      #endif  // XNN_ENABLE_ARM_I8MM
    } else if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot)) {
      #if XNN_ENABLE_ARM_DOTPROD
        qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x16c4__neondotfp16arith);
        qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc4w_gemm_minmax_ukernel_4x16c4__neondotfp16arith);
        qd8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_scalar_params;
        qd8_f16_qc4w_gemm_config.mr = 4;
        qd8_f16_qc4w_gemm_config.nr = 16;
        qd8_f16_qc4w_gemm_config.log2_kr = 2;
        qd8_f16_qc4w_gemm_config.planes = 2;
      #endif  // XNN_ENABLE_ARM_DOTPROD
    } else {
        qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x16__neonfp16arith_mlal_lane);
        qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc4w_gemm_minmax_ukernel_6x16__neonfp16arith_mlal_lane);
        qd8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_scalar_params;
        qd8_f16_qc4w_gemm_config.mr = 6;
        qd8_f16_qc4w_gemm_config.nr = 16;
        qd8_f16_qc4w_gemm_config.planes = 2;
    }
  #endif
  assert(qd8_f16_qc4w_gemm_config.mr <= XNN_MAX_MR);
  assert(qd8_f16_qc4w_gemm_config.mr <= (XNN_EXTRA_QUANTIZATION_PARAMS + 1));
}
2201
2202
0
// Initializes the GEMM config for dynamically-quantized int8 activations
// ("qd8") with f16 output and blockwise int4 ("qb4w") weights.
// Unlike the per-channel qc4w variant, this uses the qb4 blockwise packers.
// Kernel selection: ARM — DotProd+fp16 else NEON fp16 mlal_lane;
// ARM64 — I8MM > DotProd+fp16 > NEON fp16; x86 — AVX2 only.
static void init_qd8_f16_qb4w_gemm_config(void) {
  // Common parameters.
  qd8_f16_qb4w_gemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_INT8_T;
  qd8_f16_qb4w_gemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_UINT8_T;
  qd8_f16_qb4w_gemm_config.log2_filter_element_bit_size = XNN_LOG2_BIT_SIZEOF_INT4;
  qd8_f16_qb4w_gemm_config.bias_element_size = sizeof(float);
  // Blockwise-quantized (qb4) packers, shared by every arch branch below.
  qd8_f16_qb4w_gemm_config.packed_stride_weights_and_biases = xnn_packed_stride_qb4_weights_and_biases;
  qd8_f16_qb4w_gemm_config.pack_weights_and_biases = xnn_pack_qb4_weights_and_biases;

  // Arch-specific parameters.
  #if XNN_ENABLE_ARM_FP16_SCALAR && XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
    assert(hardware_config != NULL);
    (void) hardware_config;  // May be unused.
    if (hardware_config->arch_flags & xnn_arch_arm_neon) {
      // DotProd kernels also require fp16 arithmetic at runtime; the inner
      // #if hides the kernel symbols when DotProd support is compiled out.
      if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot) && (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith)) {
        #if XNN_ENABLE_ARM_DOTPROD
          qd8_f16_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qb4w_gemm_minmax_ukernel_1x16c4__neondotfp16arith);
          qd8_f16_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qb4w_gemm_minmax_ukernel_4x16c4__neondotfp16arith);
          qd8_f16_qb4w_gemm_config.init.f16_qb4w = xnn_init_f16_qb4w_minmax_scalar_params;
          qd8_f16_qb4w_gemm_config.mr = 4;
          qd8_f16_qb4w_gemm_config.nr = 16;
          qd8_f16_qb4w_gemm_config.log2_kr = 2;  // c4: 4 channels per K step.
          qd8_f16_qb4w_gemm_config.planes = 2;   // Two int4 planes per weight byte.
        #endif  // XNN_ENABLE_ARM_DOTPROD
      } else if (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith) {
        qd8_f16_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qb4w_gemm_minmax_ukernel_1x16__neonfp16arith_mlal_lane);
        qd8_f16_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qb4w_gemm_minmax_ukernel_6x16__neonfp16arith_mlal_lane);
        qd8_f16_qb4w_gemm_config.init.f16_qb4w = xnn_init_f16_qb4w_minmax_scalar_params;
        qd8_f16_qb4w_gemm_config.mr = 6;
        qd8_f16_qb4w_gemm_config.nr = 16;
        qd8_f16_qb4w_gemm_config.planes = 2;
      }
    }
  #elif XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
    assert(hardware_config != NULL);
    (void) hardware_config;  // May be unused.
    if (XNN_ENABLE_ARM_I8MM && (hardware_config->arch_flags & xnn_arch_arm_neon_i8mm)) {
      #if XNN_ENABLE_ARM_I8MM
        qd8_f16_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qb4w_gemm_minmax_ukernel_1x16c8__neoni8mm);
        qd8_f16_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qb4w_gemm_minmax_ukernel_4x16c8__neoni8mm);
        qd8_f16_qb4w_gemm_config.init.f16_qb4w = xnn_init_f16_qb4w_minmax_scalar_params;
        qd8_f16_qb4w_gemm_config.mr = 4;
        qd8_f16_qb4w_gemm_config.nr = 16;
        qd8_f16_qb4w_gemm_config.log2_kr = 3;
        qd8_f16_qb4w_gemm_config.planes = 2;
      #endif  // XNN_ENABLE_ARM_I8MM
    } else if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot) && (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith)) {
      #if XNN_ENABLE_ARM_DOTPROD
        qd8_f16_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qb4w_gemm_minmax_ukernel_1x16c4__neondotfp16arith);
        qd8_f16_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qb4w_gemm_minmax_ukernel_4x16c4__neondotfp16arith);
        qd8_f16_qb4w_gemm_config.init.f16_qb4w = xnn_init_f16_qb4w_minmax_scalar_params;
        qd8_f16_qb4w_gemm_config.mr = 4;
        qd8_f16_qb4w_gemm_config.nr = 16;
        qd8_f16_qb4w_gemm_config.log2_kr = 2;
        qd8_f16_qb4w_gemm_config.planes = 2;
      #endif  // XNN_ENABLE_ARM_DOTPROD
    } else if (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith) {
        qd8_f16_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qb4w_gemm_minmax_ukernel_1x16__neonfp16arith_mlal_lane);
        qd8_f16_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qb4w_gemm_minmax_ukernel_6x16__neonfp16arith_mlal_lane);
        qd8_f16_qb4w_gemm_config.init.f16_qb4w = xnn_init_f16_qb4w_minmax_scalar_params;
        qd8_f16_qb4w_gemm_config.mr = 6;
        qd8_f16_qb4w_gemm_config.nr = 16;
        qd8_f16_qb4w_gemm_config.planes = 2;
    }
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
    assert(hardware_config != NULL);
    (void) hardware_config;  // May be unused.
    #if XNN_ENABLE_AVX2
      if (hardware_config->arch_flags & xnn_arch_x86_avx2) {
        qd8_f16_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qb4w_gemm_minmax_ukernel_1x8c8__avx2);
        qd8_f16_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qb4w_gemm_minmax_ukernel_3x8c8__avx2);
        qd8_f16_qb4w_gemm_config.init.f16_qb4w = xnn_init_f16_qb4w_minmax_scalar_params;
        qd8_f16_qb4w_gemm_config.mr = 3;
        qd8_f16_qb4w_gemm_config.nr = 8;
        qd8_f16_qb4w_gemm_config.log2_kr = 3;
        qd8_f16_qb4w_gemm_config.planes = 2;
      }
    #endif
  #endif
  assert(qd8_f16_qb4w_gemm_config.mr <= XNN_MAX_MR);
  assert(qd8_f16_qb4w_gemm_config.mr <= (XNN_EXTRA_QUANTIZATION_PARAMS + 1));
}
2287
2288
0
// Initializes the GEMM config for dynamically-quantized int8 activations
// ("qd8") with f32 output and per-channel int2 ("qc2w") weights.
// Only an ARM64 DotProd microkernel exists; on every other target just the
// common parameters are populated (no kernels, mr stays 0).
static void init_qd8_f32_qc2w_gemm_config(void) {
  // Common parameters.
  // Weights are 2-bit values in uint8 containers, hence the separate
  // byte-size and bit-size fields.
  qd8_f32_qc2w_gemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_INT8_T;
  qd8_f32_qc2w_gemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_UINT8_T;
  qd8_f32_qc2w_gemm_config.log2_filter_element_bit_size = XNN_LOG2_BIT_SIZEOF_INT2;
  qd8_f32_qc2w_gemm_config.bias_element_size = sizeof(float);
  // Use the same packing function throughout.
  qd8_f32_qc2w_gemm_config.pack_weights_and_biases =
      (xnn_pack_weights_and_biases_fn)xnn_pack_qc2w_weights_and_biases;
  qd8_f32_qc2w_gemm_config.packed_stride_weights_and_biases =
      (xnn_packed_stride_weights_and_biases_fn)
          xnn_packed_stride_qc2w_weights_and_biases;
  qd8_f32_qc2w_gemm_config.pack_gemm_gio =
      (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc2w_gemm_gio_w;  // Ignored
  qd8_f32_qc2w_gemm_config.pack_gemm_goi =
      (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc2w_gemm_goi_w;  // Ignored
#if XNN_ARCH_ARM64
  const struct xnn_hardware_config* hardware_config =
      xnn_init_hardware_config();
  assert(hardware_config != NULL);
  if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot)) {
#if XNN_ENABLE_ARM_DOTPROD
    qd8_f32_qc2w_gemm_config.arch = xnn_arch_arm_neon_dot;
    qd8_f32_qc2w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] =
        xnn_init_hmp_dqgemm_ukernel(
            (xnn_dqgemm_ukernel_fn)
                xnn_qd8_f32_qc2w_gemm_minmax_ukernel_1x16c4__neondot);
    qd8_f32_qc2w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] =
        xnn_init_hmp_dqgemm_ukernel(
            (xnn_dqgemm_ukernel_fn)
                xnn_qd8_f32_qc2w_gemm_minmax_ukernel_4x16c4__neondot);
    qd8_f32_qc2w_gemm_config.init.f32 =
        xnn_init_f32_minmax_scalar_params;
    qd8_f32_qc2w_gemm_config.mr = 4;
    qd8_f32_qc2w_gemm_config.nr = 16;
    qd8_f32_qc2w_gemm_config.log2_kr = 2;  // c4: 4 channels per K step.
    qd8_f32_qc2w_gemm_config.planes = 4;   // Four int2 planes per weight byte.
    assert(qd8_f32_qc2w_gemm_config.mr <= XNN_MAX_MR);
#endif  // XNN_ENABLE_ARM_DOTPROD
  }
#endif // XNN_ARCH_ARM64
}
2330
2331
0
static void init_qd8_f32_qc4w_gemm_config(void) {
2332
  // Common parameters.
2333
0
  qd8_f32_qc4w_gemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_INT8_T;
2334
0
  qd8_f32_qc4w_gemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_UINT8_T;
2335
0
  qd8_f32_qc4w_gemm_config.log2_filter_element_bit_size = XNN_LOG2_BIT_SIZEOF_INT4;
2336
0
  qd8_f32_qc4w_gemm_config.bias_element_size = sizeof(float);
2337
  // Use the same packing function throughout.
2338
0
  qd8_f32_qc4w_gemm_config.pack_weights_and_biases = (xnn_pack_weights_and_biases_fn) xnn_pack_qs4_weights_and_biases;
2339
0
  qd8_f32_qc4w_gemm_config.packed_stride_weights_and_biases = (xnn_packed_stride_weights_and_biases_fn) xnn_packed_stride_qs4_weights_and_biases;
2340
0
  qd8_f32_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w;  // Ignored
2341
0
  qd8_f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w;  // Ignored
2342
2343
  // Arch-specific parameters.
2344
  #if XNN_ARCH_ARM
2345
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
2346
    assert(hardware_config != NULL);
2347
    (void) hardware_config;  // May be unused.
2348
    if (hardware_config->arch_flags & xnn_arch_arm_neon) {
2349
      if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot)) {
2350
        #if XNN_ENABLE_ARM_DOTPROD
2351
          qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c4__neondot);
2352
          qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x16c4__neondot);
2353
          qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
2354
          qd8_f32_qc4w_gemm_config.mr = 4;
2355
          qd8_f32_qc4w_gemm_config.nr = 16;
2356
          qd8_f32_qc4w_gemm_config.log2_kr = 2;
2357
          qd8_f32_qc4w_gemm_config.planes = 2;
2358
        #endif  // XNN_ENABLE_ARM_DOTPROD
2359
      } else {
2360
        switch (hardware_config->uarch[XNN_UARCH_INDEX]) {
2361
          #if XNN_ENABLE_ASSEMBLY
2362
            case xnn_uarch_cortex_a53:
2363
            case xnn_uarch_cortex_a55r0:
2364
            case xnn_uarch_cortex_a55:
2365
              qd8_f32_qc4w_gemm_config.minmax
2366
                  .dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(
2367
                  xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8__asm_aarch32_neonmlal_ld64_2);
2368
              qd8_f32_qc4w_gemm_config.minmax
2369
                  .dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(
2370
                  xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x8__asm_aarch32_neonmlal_ld64_2);
2371
              qd8_f32_qc4w_gemm_config.init.f32_qc4w =
2372
                  xnn_init_f32_qc4w_minmax_scalar_params;
2373
              qd8_f32_qc4w_gemm_config.mr = 4;
2374
              qd8_f32_qc4w_gemm_config.nr = 8;
2375
              qd8_f32_qc4w_gemm_config.planes = 2;
2376
              break;
2377
          #endif  // XNN_ENABLE_ASSEMBLY
2378
          default:
2379
            qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] =
2380
                XNN_INIT_HMP_DQGEMM_UKERNEL(
2381
                    xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16__neon_mlal_lane);
2382
            qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(6)] =
2383
                XNN_INIT_HMP_DQGEMM_UKERNEL(
2384
                    xnn_qd8_f32_qc4w_gemm_minmax_ukernel_6x16__neon_mlal_lane);
2385
            qd8_f32_qc4w_gemm_config.init.f32_qc4w =
2386
                xnn_init_f32_qc4w_minmax_scalar_params;
2387
            qd8_f32_qc4w_gemm_config.mr = 6;
2388
            qd8_f32_qc4w_gemm_config.nr = 16;
2389
            qd8_f32_qc4w_gemm_config.planes = 2;
2390
            break;
2391
        }
2392
      }
2393
    } else {
2394
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4__scalar);
2395
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4__scalar);
2396
      qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
2397
      qd8_f32_qc4w_gemm_config.mr = 4;
2398
      qd8_f32_qc4w_gemm_config.nr = 4;
2399
      qd8_f32_qc4w_gemm_config.planes = 2;
2400
    }
2401
  #elif XNN_ARCH_ARM64
2402
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
2403
    assert(hardware_config != NULL);
2404
    (void) hardware_config;  // May be unused.
2405
    if (XNN_ENABLE_ARM_I8MM && (hardware_config->arch_flags & xnn_arch_arm_neon_i8mm)) {
2406
      #if XNN_ENABLE_ARM_I8MM
2407
        qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__neoni8mm);
2408
        qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x16c8__neoni8mm);
2409
        qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
2410
        qd8_f32_qc4w_gemm_config.mr = 4;
2411
        qd8_f32_qc4w_gemm_config.nr = 16;
2412
        qd8_f32_qc4w_gemm_config.log2_kr = 3;
2413
        qd8_f32_qc4w_gemm_config.planes = 2;
2414
      #endif  // XNN_ENABLE_ARM_I8MM
2415
    } else if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot)) {
2416
      #if XNN_ENABLE_ARM_DOTPROD
2417
        qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c4__neondot);
2418
        qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x16c4__neondot);
2419
        qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
2420
        qd8_f32_qc4w_gemm_config.mr = 4;
2421
        qd8_f32_qc4w_gemm_config.nr = 16;
2422
        qd8_f32_qc4w_gemm_config.log2_kr = 2;
2423
        qd8_f32_qc4w_gemm_config.planes = 2;
2424
      #endif  // XNN_ENABLE_ARM_DOTPROD
2425
    } else {
2426
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16__neon_mlal_lane);
2427
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_6x16__neon_mlal_lane);
2428
      qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
2429
      qd8_f32_qc4w_gemm_config.mr = 6;
2430
      qd8_f32_qc4w_gemm_config.nr = 16;
2431
      qd8_f32_qc4w_gemm_config.planes = 2;
2432
    }
2433
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
2434
    #if XNN_ENABLE_AVX512AMX
2435
0
      const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
2436
0
      assert(hardware_config != NULL);
2437
0
    (void) hardware_config;  // May be unused.
2438
0
      (void) hardware_config;  // May be unused.
2439
0
      if (hardware_config->arch_flags & xnn_arch_x86_avx512amx) {
2440
0
        qd8_f32_qc4w_gemm_config.arch = xnn_arch_x86_avx512amx;
2441
0
        qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x64c4__avx512amx);
2442
0
        qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(16)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_16x64c4__avx512amx);
2443
0
        qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
2444
0
        qd8_f32_qc4w_gemm_config.mr = 16;
2445
0
        qd8_f32_qc4w_gemm_config.nr = 64;
2446
0
        qd8_f32_qc4w_gemm_config.log2_kr = 2;
2447
0
        qd8_f32_qc4w_gemm_config.planes = 2;
2448
0
      } else
2449
0
    #endif  // XNN_ENABLE_AVX512AMX
2450
0
    {
2451
0
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__sse2_ld128);
2452
0
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__sse2_ld128);
2453
0
      qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
2454
0
      qd8_f32_qc4w_gemm_config.mr = 4;
2455
0
      qd8_f32_qc4w_gemm_config.nr = 4;
2456
0
      qd8_f32_qc4w_gemm_config.log2_kr = 3;
2457
0
      qd8_f32_qc4w_gemm_config.planes = 2;
2458
0
    }
2459
  #elif XNN_ARCH_WASMRELAXEDSIMD || XNN_ARCH_WASMSIMD
2460
    qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__wasmsimd_dot16x2_ld64);
2461
    qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__wasmsimd_dot16x2_ld64);
2462
    qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
2463
    qd8_f32_qc4w_gemm_config.mr = 4;
2464
    qd8_f32_qc4w_gemm_config.nr = 4;
2465
    qd8_f32_qc4w_gemm_config.log2_kr = 3;
2466
    qd8_f32_qc4w_gemm_config.planes = 2;
2467
  #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
2468
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
2469
    qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4v__rvv);
2470
    qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4v__rvv);
2471
    qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
2472
    qd8_f32_qc4w_gemm_config.mr = 4;
2473
    qd8_f32_qc4w_gemm_config.nr = 4 * hardware_config->vlenb / sizeof(int32_t);
2474
    qd8_f32_qc4w_gemm_config.planes = 2;
2475
  #else
2476
    qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4__scalar);
2477
    qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4__scalar);
2478
    qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
2479
    qd8_f32_qc4w_gemm_config.mr = 4;
2480
    qd8_f32_qc4w_gemm_config.nr = 4;
2481
    qd8_f32_qc4w_gemm_config.planes = 2;
2482
  #endif
2483
0
  assert(qd8_f32_qc4w_gemm_config.mr <= XNN_MAX_MR);
2484
0
  assert(qd8_f32_qc4w_gemm_config.mr <= (XNN_EXTRA_QUANTIZATION_PARAMS + 1));
2485
0
}
2486
2487
0
// Initializes the GEMM config for dynamically-quantized packed 8-bit
// activations (QP8) with per-channel 4-bit weights (QC4W) and F32 output.
// Microkernels exist only on AArch64 with KleidiAI enabled; on all other
// targets only the common element-size parameters are filled in and no
// kernels are registered (the config stays zero-initialized).
static void init_qp8_f32_qc4w_gemm_config(void) {
  // Common parameters.
  qp8_f32_qc4w_gemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_INT8_T;
  qp8_f32_qc4w_gemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_UINT8_T;
  // Filter elements are 4-bit nibbles even though storage is byte-granular.
  qp8_f32_qc4w_gemm_config.log2_filter_element_bit_size = XNN_LOG2_BIT_SIZEOF_INT4;
  qp8_f32_qc4w_gemm_config.bias_element_size = sizeof(float);

  // Arch-specific parameters.
#if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
  const struct xnn_hardware_config* hardware_config =
      xnn_init_hardware_config();
  assert(hardware_config != NULL);
  // Dispatch in decreasing order of capability: SME2 > I8MM > DOTPROD.
  // Each runtime feature check is paired with a compile-time guard so the
  // kernel symbols are only referenced when they were actually built.
  if (XNN_ENABLE_ARM_SME2 && (hardware_config->arch_flags & xnn_arch_arm_sme2)) {
    #if XNN_ENABLE_ARM_SME2
    // SME2 tile sizes depend on the hardware vector length, so mr/nr are
    // queried from the kernel rather than hard-coded.
    const size_t mr = xnn_qp8_f32_qc4w_gemm_minmax_ukernel_16x64c4__neonsme2_get_mr();
    const size_t nr = xnn_qp8_f32_qc4w_gemm_minmax_ukernel_16x64c4__neonsme2_get_nr();
    qp8_f32_qc4w_gemm_config.minmax.qp8gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_QP8GEMM_UKERNEL(xnn_qp8_f32_qc4w_gemm_minmax_ukernel_1x64c4__neonsme2);
    qp8_f32_qc4w_gemm_config.minmax.qp8gemm[XNN_MR_TO_INDEX(mr)] = XNN_INIT_HMP_QP8GEMM_UKERNEL(xnn_qp8_f32_qc4w_gemm_minmax_ukernel_16x64c4__neonsme2);
    qp8_f32_qc4w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
    qp8_f32_qc4w_gemm_config.pack_weights_and_biases = xnn_pack_kai_qs4_weights_and_biases_sme;
    qp8_f32_qc4w_gemm_config.packed_stride_weights_and_biases = xnn_packed_stride_kai_qs4_weights_and_biases_sme;
    qp8_f32_qc4w_gemm_config.mr = mr;
    qp8_f32_qc4w_gemm_config.nr = nr;
    qp8_f32_qc4w_gemm_config.log2_kr = 2;
    qp8_f32_qc4w_gemm_config.planes = 2;
    qp8_f32_qc4w_gemm_config.mr_packed = mr;
    #endif  // XNN_ENABLE_ARM_SME2
  } else if (XNN_ENABLE_ARM_I8MM && (hardware_config->arch_flags & xnn_arch_arm_neon_i8mm)) {
#if XNN_ENABLE_ARM_I8MM && XNN_ENABLE_ARM_DOTPROD
    // The 1-row kernel in this branch is a dot-product kernel, hence the
    // additional DOTPROD requirement alongside I8MM in the guard.
    qp8_f32_qc4w_gemm_config.minmax.qp8gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_QP8GEMM_UKERNEL(xnn_qp8_f32_qc4w_gemm_minmax_ukernel_1x8c16s2__aarch64_neondot);
    qp8_f32_qc4w_gemm_config.minmax.qp8gemm[XNN_MR_TO_INDEX(8)] = XNN_INIT_HMP_QP8GEMM_UKERNEL(xnn_qp8_f32_qc4w_gemm_minmax_ukernel_8x8c16s2__neoni8mm_mstep2);
    qp8_f32_qc4w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
    qp8_f32_qc4w_gemm_config.pack_weights_and_biases = xnn_pack_kai_qs4_weights_and_biases;
    qp8_f32_qc4w_gemm_config.packed_stride_weights_and_biases = xnn_packed_stride_kai_qs4_weights_and_biases;
    qp8_f32_qc4w_gemm_config.mr = 8;
    qp8_f32_qc4w_gemm_config.nr = 8;
    qp8_f32_qc4w_gemm_config.log2_kr = 4;
    qp8_f32_qc4w_gemm_config.log2_sr = 1;
    qp8_f32_qc4w_gemm_config.planes = 2;
    qp8_f32_qc4w_gemm_config.mr_packed = 4;
#endif  // XNN_ENABLE_ARM_I8MM && XNN_ENABLE_ARM_DOTPROD
  } else if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot)) {
#if XNN_ENABLE_ARM_DOTPROD
    qp8_f32_qc4w_gemm_config.minmax.qp8gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_QP8GEMM_UKERNEL(xnn_qp8_f32_qc4w_gemm_minmax_ukernel_1x4c8s2__aarch64_neondot);
    qp8_f32_qc4w_gemm_config.minmax
        .qp8gemm[XNN_MR_TO_INDEX(16)] = XNN_INIT_HMP_QP8GEMM_UKERNEL(
        xnn_qp8_f32_qc4w_gemm_minmax_ukernel_16x4c8s2__aarch64_neondot_mstep4);
    qp8_f32_qc4w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
    qp8_f32_qc4w_gemm_config.pack_weights_and_biases = xnn_pack_kai_qs4_weights_and_biases;
    qp8_f32_qc4w_gemm_config.packed_stride_weights_and_biases = xnn_packed_stride_kai_qs4_weights_and_biases;
    qp8_f32_qc4w_gemm_config.mr = 16;
    qp8_f32_qc4w_gemm_config.nr = 4;
    qp8_f32_qc4w_gemm_config.log2_kr = 3;
    qp8_f32_qc4w_gemm_config.log2_sr = 1;
    qp8_f32_qc4w_gemm_config.planes = 2;
    qp8_f32_qc4w_gemm_config.mr_packed = 4;
#endif  // XNN_ENABLE_ARM_DOTPROD
  }
  // mr may legitimately be 0 here if no compile-time variant matched.
  assert(qp8_f32_qc4w_gemm_config.mr <= XNN_MAX_MR);
#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
}
2548
2549
0
// Initializes the GEMM config for dynamically-quantized packed 8-bit
// activations (QP8) with per-channel 8-bit weights (QC8W) and F32 output.
// As with the QC4W variant, kernels are only available on AArch64 with
// KleidiAI; other targets get only the common parameters.
static void init_qp8_f32_qc8w_gemm_config(void) {
  // Common parameters.
  qp8_f32_qc8w_gemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_INT8_T;
  qp8_f32_qc8w_gemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_UINT8_T;
  // Bit size = byte size shifted by 3 (8 bits per byte): full 8-bit weights.
  qp8_f32_qc8w_gemm_config.log2_filter_element_bit_size = XNN_LOG2_SIZEOF_UINT8_T + 3;
  qp8_f32_qc8w_gemm_config.bias_element_size = sizeof(float);

  // Arch-specific parameters.
#if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
  const struct xnn_hardware_config* hardware_config =
      xnn_init_hardware_config();
  assert(hardware_config != NULL);
  // Preference order: SME2 > I8MM > DOTPROD; runtime checks are paired with
  // compile-time guards so only built kernels are referenced.
  if (XNN_ENABLE_ARM_SME2 && (hardware_config->arch_flags & xnn_arch_arm_sme2)) {
    #if XNN_ENABLE_ARM_SME2
    // SME2 tile sizes depend on the hardware vector length; query them.
    const size_t mr = xnn_qp8_f32_qc8w_gemm_minmax_ukernel_16x64c4__neonsme2_get_mr();
    const size_t nr = xnn_qp8_f32_qc8w_gemm_minmax_ukernel_16x64c4__neonsme2_get_nr();
    qp8_f32_qc8w_gemm_config.minmax.qp8gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_QP8GEMM_UKERNEL(xnn_qp8_f32_qc8w_gemm_minmax_ukernel_1x64c4__neonsme2);
    qp8_f32_qc8w_gemm_config.minmax.qp8gemm[XNN_MR_TO_INDEX(mr)] = XNN_INIT_HMP_QP8GEMM_UKERNEL(xnn_qp8_f32_qc8w_gemm_minmax_ukernel_16x64c4__neonsme2);
    qp8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
    qp8_f32_qc8w_gemm_config.pack_weights_and_biases = xnn_pack_kai_qs8_weights_and_biases;
    qp8_f32_qc8w_gemm_config.packed_stride_weights_and_biases = xnn_packed_stride_kai_qs8_weights_and_biases;
    qp8_f32_qc8w_gemm_config.mr = mr;
    qp8_f32_qc8w_gemm_config.nr = nr;
    qp8_f32_qc8w_gemm_config.log2_kr = 2;
    qp8_f32_qc8w_gemm_config.mr_packed = mr;
    #endif  // XNN_ENABLE_ARM_SME2
  } else if (XNN_ENABLE_ARM_I8MM && (hardware_config->arch_flags & xnn_arch_arm_neon_i8mm)) {
#if XNN_ENABLE_ARM_I8MM && XNN_ENABLE_ARM_DOTPROD
    // The 1-row kernel here is a dot-product kernel, hence the additional
    // DOTPROD requirement alongside I8MM in the guard.
    qp8_f32_qc8w_gemm_config.minmax.qp8gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_QP8GEMM_UKERNEL(xnn_qp8_f32_qc8w_gemm_minmax_ukernel_1x4c8__aarch64_neondot);
    qp8_f32_qc8w_gemm_config.minmax.qp8gemm[XNN_MR_TO_INDEX(16)] = XNN_INIT_HMP_QP8GEMM_UKERNEL(xnn_qp8_f32_qc8w_gemm_minmax_ukernel_16x4c8__neoni8mm_mstep4);
    qp8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
    qp8_f32_qc8w_gemm_config.pack_weights_and_biases = xnn_pack_kai_qs8_weights_and_biases;
    qp8_f32_qc8w_gemm_config.packed_stride_weights_and_biases = xnn_packed_stride_kai_qs8_weights_and_biases;
    qp8_f32_qc8w_gemm_config.mr = 16;
    qp8_f32_qc8w_gemm_config.nr = 4;
    qp8_f32_qc8w_gemm_config.log2_kr = 3;
    qp8_f32_qc8w_gemm_config.mr_packed = 4;
#endif  // XNN_ENABLE_ARM_I8MM && XNN_ENABLE_ARM_DOTPROD
  } else if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot)) {
#if XNN_ENABLE_ARM_DOTPROD
    qp8_f32_qc8w_gemm_config.minmax.qp8gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_QP8GEMM_UKERNEL(xnn_qp8_f32_qc8w_gemm_minmax_ukernel_1x4c4__aarch64_neondot);
    qp8_f32_qc8w_gemm_config.minmax.qp8gemm[XNN_MR_TO_INDEX(16)] = XNN_INIT_HMP_QP8GEMM_UKERNEL(xnn_qp8_f32_qc8w_gemm_minmax_ukernel_16x4c4__aarch64_neondot_mstep4);
    qp8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
    qp8_f32_qc8w_gemm_config.pack_weights_and_biases = xnn_pack_kai_qs8_weights_and_biases;
    qp8_f32_qc8w_gemm_config.packed_stride_weights_and_biases = xnn_packed_stride_kai_qs8_weights_and_biases;
    qp8_f32_qc8w_gemm_config.mr = 16;
    qp8_f32_qc8w_gemm_config.nr = 4;
    qp8_f32_qc8w_gemm_config.log2_kr = 2;
    qp8_f32_qc8w_gemm_config.mr_packed = 4;
#endif  // XNN_ENABLE_ARM_DOTPROD
  }
  // mr may legitimately be 0 here if no compile-time variant matched.
  assert(qp8_f32_qc8w_gemm_config.mr <= XNN_MAX_MR);
#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
}
2603
2604
0
// Initializes the GEMM config for dynamically-quantized packed 8-bit
// activations (QP8) with blockwise-quantized 4-bit weights (QB4W) and F32
// output. Kernels are only available on AArch64 with KleidiAI; unlike the
// QC4W/QC8W variants there is no SME2 path here, only I8MM and DOTPROD.
static void init_qp8_f32_qb4w_gemm_config(void) {
  // Common parameters.
  qp8_f32_qb4w_gemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_INT8_T;
  qp8_f32_qb4w_gemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_UINT8_T;
  // Filter elements are 4-bit nibbles even though storage is byte-granular.
  qp8_f32_qb4w_gemm_config.log2_filter_element_bit_size = XNN_LOG2_BIT_SIZEOF_INT4;
  qp8_f32_qb4w_gemm_config.bias_element_size = sizeof(float);

  // Arch-specific parameters.
  #if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
    assert(hardware_config != NULL);
    (void) hardware_config;  // May be unused.
    if (XNN_ENABLE_ARM_I8MM && (hardware_config->arch_flags & xnn_arch_arm_neon_i8mm)) {
      // The 1-row kernel in this branch is a dot-product kernel, hence the
      // additional DOTPROD requirement alongside I8MM in the guard.
      #if XNN_ENABLE_ARM_I8MM && XNN_ENABLE_ARM_DOTPROD
        qp8_f32_qb4w_gemm_config.minmax.qp8gemm_bl[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_QP8GEMM_BL_UKERNEL(xnn_qp8_f32_qb4w_gemm_minmax_ukernel_1x4c16s2__aarch64_neondot);
        qp8_f32_qb4w_gemm_config.minmax.qp8gemm_bl[XNN_MR_TO_INDEX(16)] = XNN_INIT_HMP_QP8GEMM_BL_UKERNEL(xnn_qp8_f32_qb4w_gemm_minmax_ukernel_16x4c16s2__neoni8mm_mstep4);
        qp8_f32_qb4w_gemm_config.init.f32_qb4w = xnn_init_f32_qb4w_minmax_scalar_params;
        qp8_f32_qb4w_gemm_config.pack_weights_and_biases = xnn_pack_kai_qb4_weights_and_biases;
        qp8_f32_qb4w_gemm_config.packed_stride_weights_and_biases = xnn_packed_stride_kai_qb4_weights_and_biases;
        qp8_f32_qb4w_gemm_config.mr = 16;
        qp8_f32_qb4w_gemm_config.nr = 4;
        qp8_f32_qb4w_gemm_config.log2_kr = 4;
        qp8_f32_qb4w_gemm_config.log2_sr = 1;
        qp8_f32_qb4w_gemm_config.planes = 2;
        qp8_f32_qb4w_gemm_config.mr_packed = 4;
      #endif  // XNN_ENABLE_ARM_I8MM && XNN_ENABLE_ARM_DOTPROD
    } else if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot)) {
      #if XNN_ENABLE_ARM_DOTPROD
        qp8_f32_qb4w_gemm_config.minmax.qp8gemm_bl[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_QP8GEMM_BL_UKERNEL(xnn_qp8_f32_qb4w_gemm_minmax_ukernel_1x4c8s2__aarch64_neondot);
        qp8_f32_qb4w_gemm_config.minmax.qp8gemm_bl[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_QP8GEMM_BL_UKERNEL(xnn_qp8_f32_qb4w_gemm_minmax_ukernel_4x4c8s2__aarch64_neondot);
        qp8_f32_qb4w_gemm_config.init.f32_qb4w = xnn_init_f32_qb4w_minmax_scalar_params;
        qp8_f32_qb4w_gemm_config.pack_weights_and_biases = xnn_pack_kai_qb4_weights_and_biases;
        qp8_f32_qb4w_gemm_config.packed_stride_weights_and_biases = xnn_packed_stride_kai_qb4_weights_and_biases;
        qp8_f32_qb4w_gemm_config.mr = 4;
        qp8_f32_qb4w_gemm_config.nr = 4;
        qp8_f32_qb4w_gemm_config.log2_kr = 3;
        qp8_f32_qb4w_gemm_config.log2_sr = 1;
        qp8_f32_qb4w_gemm_config.planes = 2;
        qp8_f32_qb4w_gemm_config.mr_packed = 4;
      #endif  // XNN_ENABLE_ARM_DOTPROD
    }
    // mr may legitimately be 0 here if no compile-time variant matched.
    assert(qp8_f32_qb4w_gemm_config.mr <= XNN_MAX_MR);
  #endif  // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
}
2648
2649
0
// Initializes the GEMM config for dynamically-quantized unsigned 8-bit
// activations (QDU8) with blockwise-quantized 4-bit weights (QB4W) and F32
// output. Kernels are provided only for x86/x86-64 (AVX512VNNI-GFNI or
// AVX512VNNI); other targets keep only the common parameters.
static void init_qdu8_f32_qb4w_gemm_config(void) {
  // Common parameters.
  qdu8_f32_qb4w_gemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_INT8_T;
  qdu8_f32_qb4w_gemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_UINT8_T;
  // Filter elements are 4-bit nibbles even though storage is byte-granular.
  qdu8_f32_qb4w_gemm_config.log2_filter_element_bit_size = XNN_LOG2_BIT_SIZEOF_INT4;
  qdu8_f32_qb4w_gemm_config.bias_element_size = sizeof(float);
  qdu8_f32_qb4w_gemm_config.packed_stride_weights_and_biases = xnn_packed_stride_qb4_weights_and_biases;
  qdu8_f32_qb4w_gemm_config.pack_weights_and_biases = xnn_pack_qb4_weights_and_biases;

  // Arch-specific parameters.
  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
    assert(hardware_config != NULL);
    (void) hardware_config;  // May be unused.
    // Preprocessor idiom: the `} else` left dangling before the first #endif
    // attaches to whatever follows it after preprocessing -- either the
    // AVX512VNNI `if` (when enabled) or the empty `{ }` block from the #else
    // arm -- keeping the code syntactically valid for every macro combination.
    #if XNN_ENABLE_AVX512VNNIGFNI
      // AMD Zen4 and Zen5 have gfni but it is slow.
      if ((hardware_config->arch_flags & xnn_arch_x86_avx512vnnigfni) && hardware_config->uarch[XNN_UARCH_INDEX] != xnn_uarch_zen4 && hardware_config->uarch[XNN_UARCH_INDEX] != xnn_uarch_zen5) {
        qdu8_f32_qb4w_gemm_config.arch = xnn_arch_x86_avx512vnnigfni;
        qdu8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x16c8__avx512vnnigfni_prfm);
        qdu8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(14)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qb4w_gemm_minmax_ukernel_14x16c8__avx512vnnigfni_prfm);
        qdu8_f32_qb4w_gemm_config.init.f32_qb4w = xnn_init_f32_qb4w_minmax_scalar_params;
        qdu8_f32_qb4w_gemm_config.mr = 14;
        qdu8_f32_qb4w_gemm_config.nr = 16;
        qdu8_f32_qb4w_gemm_config.log2_kr = 3;
        qdu8_f32_qb4w_gemm_config.planes = 2;
      } else
    #endif  // XNN_ENABLE_AVX512VNNIGFNI
    #if XNN_ENABLE_AVX512VNNI
      if (hardware_config->arch_flags & xnn_arch_x86_avx512vnni) {
        qdu8_f32_qb4w_gemm_config.arch = xnn_arch_x86_avx512vnni;
        qdu8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x16c8__avx512vnni_prfm);
        qdu8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(8)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qb4w_gemm_minmax_ukernel_8x16c8__avx512vnni_prfm);
        qdu8_f32_qb4w_gemm_config.init.f32_qb4w = xnn_init_f32_qb4w_minmax_scalar_params;
        qdu8_f32_qb4w_gemm_config.mr = 8;
        qdu8_f32_qb4w_gemm_config.nr = 16;
        qdu8_f32_qb4w_gemm_config.log2_kr = 3;
        qdu8_f32_qb4w_gemm_config.planes = 2;
      }
    #else
      // Empty statement block: serves as the target of the dangling `else`
      // above when AVX512VNNI kernels are compiled out.
      {
      }
    #endif
    // mr may legitimately be 0 here if no compile-time variant matched.
    assert(qdu8_f32_qb4w_gemm_config.mr <= XNN_MAX_MR);
  #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
}
2694
2695
0
// Initializes the GEMM config for dynamically-quantized signed 8-bit
// activations (QD8) with blockwise-quantized 4-bit weights (QB4W) and F32
// output. Unlike most siblings, this config has kernel variants for ARM,
// ARM64, x86/x86-64, and a scalar fallback for everything else.
static void init_qd8_f32_qb4w_gemm_config(void) {
  // Common parameters.
  qd8_f32_qb4w_gemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_INT8_T;
  qd8_f32_qb4w_gemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_UINT8_T;
  // Filter elements are 4-bit nibbles even though storage is byte-granular.
  qd8_f32_qb4w_gemm_config.log2_filter_element_bit_size = XNN_LOG2_BIT_SIZEOF_INT4;
  qd8_f32_qb4w_gemm_config.bias_element_size = sizeof(float);
  qd8_f32_qb4w_gemm_config.packed_stride_weights_and_biases = xnn_packed_stride_qb4_weights_and_biases;
  qd8_f32_qb4w_gemm_config.pack_weights_and_biases = xnn_pack_qb4_weights_and_biases;
  // Default: no specialized blockwise GOI packing kernel; select ARM64
  // branches below override this.
  qd8_f32_qb4w_gemm_config.pack_gemm_goi_bl = NULL;

  // Arch-specific parameters.
  #if XNN_ARCH_ARM
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
    assert(hardware_config != NULL);
    (void) hardware_config;  // May be unused.
    if (hardware_config->arch_flags & xnn_arch_arm_neon) {
      if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot)) {
        #if XNN_ENABLE_ARM_DOTPROD
          qd8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x16c4__neondot);
          qd8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x16c4__neondot);
          qd8_f32_qb4w_gemm_config.init.f32_qb4w = xnn_init_f32_qb4w_minmax_scalar_params;
          qd8_f32_qb4w_gemm_config.mr = 4;
          qd8_f32_qb4w_gemm_config.nr = 16;
          qd8_f32_qb4w_gemm_config.log2_kr = 2;
          qd8_f32_qb4w_gemm_config.planes = 2;
        #endif  // XNN_ENABLE_ARM_DOTPROD
      } else {
        qd8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x16__neon_mlal_lane);
        qd8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qb4w_gemm_minmax_ukernel_6x16__neon_mlal_lane);
        qd8_f32_qb4w_gemm_config.init.f32_qb4w = xnn_init_f32_qb4w_minmax_scalar_params;
        qd8_f32_qb4w_gemm_config.mr = 6;
        qd8_f32_qb4w_gemm_config.nr = 16;
        qd8_f32_qb4w_gemm_config.planes = 2;
      }
    } else {
      // No NEON: scalar fallback.
      qd8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x4__scalar);
      qd8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x4__scalar);
      qd8_f32_qb4w_gemm_config.init.f32_qb4w = xnn_init_f32_qb4w_minmax_scalar_params;
      qd8_f32_qb4w_gemm_config.mr = 4;
      qd8_f32_qb4w_gemm_config.nr = 4;
      qd8_f32_qb4w_gemm_config.planes = 2;
    }
  #elif XNN_ARCH_ARM64
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
    assert(hardware_config != NULL);
    (void) hardware_config;  // May be unused.
    // Preference order: I8MM > DOTPROD > plain NEON.
    if (XNN_ENABLE_ARM_I8MM && (hardware_config->arch_flags & xnn_arch_arm_neon_i8mm)) {
      #if XNN_ENABLE_ARM_I8MM && XNN_ENABLE_ARM_DOTPROD
        // DOTPROD is additionally required for the dot-product packing kernel
        // registered at the end of this branch.
        qd8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x16c8__neoni8mm);
        qd8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x16c8__neoni8mm);
        qd8_f32_qb4w_gemm_config.init.f32_qb4w = xnn_init_f32_qb4w_minmax_scalar_params;
        qd8_f32_qb4w_gemm_config.mr = 4;
        qd8_f32_qb4w_gemm_config.nr = 16;
        qd8_f32_qb4w_gemm_config.log2_kr = 3;
        qd8_f32_qb4w_gemm_config.planes = 2;
        qd8_f32_qb4w_gemm_config.pack_gemm_goi_bl = (xnn_packw_gemm_goi_bl_ukernel_fn) xnn_qb4_packw_gemm_goi_ukernel_x16c8__aarch64_neondot;
      #endif  // XNN_ENABLE_ARM_I8MM && XNN_ENABLE_ARM_DOTPROD
    } else if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot)) {
      #if XNN_ENABLE_ARM_DOTPROD
        qd8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x16c4__neondot);
        qd8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x16c4__neondot);
        qd8_f32_qb4w_gemm_config.init.f32_qb4w = xnn_init_f32_qb4w_minmax_scalar_params;
        qd8_f32_qb4w_gemm_config.mr = 4;
        qd8_f32_qb4w_gemm_config.nr = 16;
        qd8_f32_qb4w_gemm_config.log2_kr = 2;
        qd8_f32_qb4w_gemm_config.planes = 2;
        qd8_f32_qb4w_gemm_config.pack_gemm_goi_bl = (xnn_packw_gemm_goi_bl_ukernel_fn) xnn_qb4_packw_gemm_goi_ukernel_x16c4__aarch64_neondot;
      #endif  // XNN_ENABLE_ARM_DOTPROD
    } else {
      qd8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x16__neon_mlal_lane);
      qd8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(6)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qb4w_gemm_minmax_ukernel_6x16__neon_mlal_lane);
      qd8_f32_qb4w_gemm_config.init.f32_qb4w = xnn_init_f32_qb4w_minmax_scalar_params;
      qd8_f32_qb4w_gemm_config.mr = 6;
      qd8_f32_qb4w_gemm_config.nr = 16;
      qd8_f32_qb4w_gemm_config.planes = 2;
    }
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
    assert(hardware_config != NULL);
    (void) hardware_config;  // May be unused.
    // Dangling-`else`-across-#endif idiom: each `} else` before an #endif
    // attaches to the next `if` that survives preprocessing.
    #if XNN_ENABLE_AVX2
      if (hardware_config->arch_flags & xnn_arch_x86_avx2) {
        qd8_f32_qb4w_gemm_config.arch = xnn_arch_x86_avx2;
        qd8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x8c8__avx2);
        qd8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qb4w_gemm_minmax_ukernel_3x8c8__avx2);
        qd8_f32_qb4w_gemm_config.init.f32_qb4w = xnn_init_f32_qb4w_minmax_scalar_params;
        qd8_f32_qb4w_gemm_config.mr = 3;
        qd8_f32_qb4w_gemm_config.nr = 8;
        qd8_f32_qb4w_gemm_config.log2_kr = 3;
        qd8_f32_qb4w_gemm_config.planes = 2;
    } else
    #endif
    // NOTE(review): this guard repeats XNN_ENABLE_AVX2, but the branch it
    // protects dispatches on xnn_arch_x86_avx and registers AVX (not AVX2)
    // kernels -- looks like a copy-paste of the wrong guard macro; confirm
    // the intended compile-time gate for the AVX kernels.
    #if XNN_ENABLE_AVX2
      if (hardware_config->arch_flags & xnn_arch_x86_avx) {
        qd8_f32_qb4w_gemm_config.arch = xnn_arch_x86_avx;
        qd8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x4c8__avx_ld128);
        qd8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x4c8__avx_ld128);
        qd8_f32_qb4w_gemm_config.init.f32_qb4w = xnn_init_f32_qb4w_minmax_scalar_params;
        qd8_f32_qb4w_gemm_config.mr = 4;
        qd8_f32_qb4w_gemm_config.nr = 4;
        qd8_f32_qb4w_gemm_config.log2_kr = 3;
        qd8_f32_qb4w_gemm_config.planes = 1;
      } else
    #endif
    if (hardware_config->arch_flags & xnn_arch_x86_sse4_1) {
      qd8_f32_qb4w_gemm_config.arch = xnn_arch_x86_sse4_1;
      qd8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x4c8__sse41_ld128);
      qd8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qb4w_gemm_minmax_ukernel_3x4c8__sse41_ld128);
      qd8_f32_qb4w_gemm_config.init.f32_qb4w = xnn_init_f32_qb4w_minmax_scalar_params;
      qd8_f32_qb4w_gemm_config.mr = 3;
      qd8_f32_qb4w_gemm_config.nr = 4;
      qd8_f32_qb4w_gemm_config.log2_kr = 3;
      qd8_f32_qb4w_gemm_config.planes = 1;
    } else {
      // SSE2 baseline (always available on x86-64).
      qd8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x4c8__sse2_ld128);
      qd8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x4c8__sse2_ld128);
      qd8_f32_qb4w_gemm_config.init.f32_qb4w = xnn_init_f32_qb4w_minmax_scalar_params;
      qd8_f32_qb4w_gemm_config.mr = 4;
      qd8_f32_qb4w_gemm_config.nr = 4;
      qd8_f32_qb4w_gemm_config.log2_kr = 3;
      qd8_f32_qb4w_gemm_config.planes = 1;
    }
  #else
    // Generic scalar fallback for all other architectures.
    qd8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x4__scalar);
    qd8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x4__scalar);
    qd8_f32_qb4w_gemm_config.init.f32_qb4w = xnn_init_f32_qb4w_minmax_scalar_params;
    qd8_f32_qb4w_gemm_config.mr = 4;
    qd8_f32_qb4w_gemm_config.nr = 4;
    qd8_f32_qb4w_gemm_config.planes = 2;
  #endif
  assert(qd8_f32_qb4w_gemm_config.mr <= XNN_MAX_MR);
  // Each of the mr rows needs its own quantization params entry.
  assert(qd8_f32_qb4w_gemm_config.mr <= (XNN_EXTRA_QUANTIZATION_PARAMS + 1));
}
2828
2829
0
static void init_qd8_f16_qc8w_gemm_config(void) {
2830
  // Common parameters.
2831
0
  qd8_f16_qc8w_gemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_INT8_T;
2832
0
  qd8_f16_qc8w_gemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_INT8_T;
2833
0
  qd8_f16_qc8w_gemm_config.log2_filter_element_bit_size = XNN_LOG2_SIZEOF_INT8_T + 3;
2834
0
  qd8_f16_qc8w_gemm_config.bias_element_size = sizeof(float);
2835
  // Use the same packing function throughout.
2836
0
  qd8_f16_qc8w_gemm_config.pack_weights_and_biases = (xnn_pack_weights_and_biases_fn)xnn_pack_qs8_weights_and_biases;
2837
0
  qd8_f16_qc8w_gemm_config.packed_stride_weights_and_biases = (xnn_packed_stride_weights_and_biases_fn) xnn_packed_stride_qs8_weights_and_biases;
2838
0
  qd8_f16_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
2839
0
  qd8_f16_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
2840
2841
  // Arch-specific parameters.
2842
  #if XNN_ENABLE_ARM_FP16_SCALAR && XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM
2843
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
2844
    assert(hardware_config != NULL);
2845
    (void) hardware_config;  // May be unused.
2846
    if (hardware_config->arch_flags & xnn_arch_arm_neon) {
2847
      #if XNN_ENABLE_ASSEMBLY
2848
        if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot) && (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith)) {
2849
          #if XNN_ENABLE_ARM_DOTPROD
2850
            qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c4__neondotfp16arith);
2851
            qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x8c4__neondotfp16arith);
2852
            qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
2853
            qd8_f16_qc8w_gemm_config.mr = 4;
2854
            qd8_f16_qc8w_gemm_config.nr = 8;
2855
            qd8_f16_qc8w_gemm_config.log2_kr = 2;
2856
          #endif  // XNN_ENABLE_ARM_DOTPROD
2857
        } else if (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith) {
2858
          switch (hardware_config->uarch[XNN_UARCH_INDEX]) {
2859
            case xnn_uarch_cortex_a53:
2860
            case xnn_uarch_cortex_a55:
2861
              qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8__asm_aarch32_neonfp16arith_ld64_2);
2862
              qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x8__asm_aarch32_neonfp16arith_ld64_2);
2863
              qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
2864
              qd8_f16_qc8w_gemm_config.mr = 4;
2865
              qd8_f16_qc8w_gemm_config.nr = 8;
2866
              break;
2867
            default:
2868
              qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c2s4__neonfp16arith);
2869
              qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_2x8c2s4__neonfp16arith);
2870
              qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
2871
              qd8_f16_qc8w_gemm_config.mr = 2;
2872
              qd8_f16_qc8w_gemm_config.nr = 8;
2873
              qd8_f16_qc8w_gemm_config.log2_kr = 1;
2874
              qd8_f16_qc8w_gemm_config.log2_sr = 2;
2875
              break;
2876
          }
2877
        }
2878
        #if XNN_MAX_UARCH_TYPES > 1 && XNN_ENABLE_ARM_DOTPROD
2879
        {
2880
          /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
2881
          const uint32_t mr = qd8_f16_qc8w_gemm_config.mr;
2882
          const uint32_t nr = qd8_f16_qc8w_gemm_config.nr;
2883
          const uint32_t log2_kr = qd8_f16_qc8w_gemm_config.log2_kr;
2884
          for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
2885
            switch (hardware_config->uarch[i]) {
2886
              case xnn_uarch_cortex_a55:
2887
                #if XNN_ENABLE_ARM_DOTPROD
2888
                  if (mr == 4 && nr == 8 && log2_kr == 2 && (hardware_config->arch_flags & xnn_arch_arm_neon_dot) && (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith)) {
2889
                    qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c4__neondotfp16arith);
2890
                    qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x8c4__asm_aarch32_neondotfp16arith_cortex_a55);
2891
                  }
2892
                #endif  // XNN_ENABLE_ARM_DOTPROD
2893
                break;
2894
              default:
2895
                break;
2896
            }
2897
          }
2898
        }
2899
        #endif  // XNN_MAX_UARCH_TYPES > 1
2900
      #else  // XNN_ENABLE_ASSEMBLY
2901
        if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot) && (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith)) {
2902
          #if XNN_ENABLE_ARM_DOTPROD
2903
            qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c4__neondotfp16arith);
2904
            qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x8c4__neondotfp16arith);
2905
            qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
2906
            qd8_f16_qc8w_gemm_config.mr = 4;
2907
            qd8_f16_qc8w_gemm_config.nr = 8;
2908
            qd8_f16_qc8w_gemm_config.log2_kr = 2;
2909
          #endif  // XNN_ENABLE_ARM_DOTPROD
2910
        } else if (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith) {
2911
          qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c2s4__neonfp16arith);
2912
          qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_2x8c2s4__neonfp16arith);
2913
          qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
2914
          qd8_f16_qc8w_gemm_config.mr = 2;
2915
          qd8_f16_qc8w_gemm_config.nr = 8;
2916
          qd8_f16_qc8w_gemm_config.log2_kr = 1;
2917
          qd8_f16_qc8w_gemm_config.log2_sr = 2;
2918
        }
2919
      #endif  // XNN_ENABLE_ASSEMBLY
2920
    }
2921
  #elif XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64
2922
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
2923
    assert(hardware_config != NULL);
2924
    (void) hardware_config;  // May be unused.
2925
    #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC || XNN_PLATFORM_WINDOWS
2926
      #if XNN_ENABLE_ASSEMBLY
2927
        if (XNN_ENABLE_ARM_I8MM && (hardware_config->arch_flags & xnn_arch_arm_neon_i8mm)) {
2928
          #if XNN_ENABLE_ARM_I8MM
2929
            qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x16c8__neoni8mm);
2930
            qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x16c8__neoni8mm);
2931
            qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
2932
            qd8_f16_qc8w_gemm_config.mr = 4;
2933
            qd8_f16_qc8w_gemm_config.nr = 16;
2934
            qd8_f16_qc8w_gemm_config.log2_kr = 3;
2935
          #endif  // XNN_ENABLE_ARM_I8MM
2936
        } else if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot) && (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith)) {
2937
          #if XNN_ENABLE_ARM_DOTPROD
2938
            qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x16c4__neondotfp16arith);
2939
            qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x16c4__asm_aarch64_neondotfp16arith_ld128);
2940
            qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
2941
            qd8_f16_qc8w_gemm_config.mr = 4;
2942
            qd8_f16_qc8w_gemm_config.nr = 16;
2943
            qd8_f16_qc8w_gemm_config.log2_kr = 2;
2944
          #endif  // XNN_ENABLE_ARM_DOTPROD
2945
        } else if (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith) {
2946
          qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c2s4__neonfp16arith);
2947
          qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_2x8c2s4__neonfp16arith);
2948
          qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
2949
          qd8_f16_qc8w_gemm_config.mr = 2;
2950
          qd8_f16_qc8w_gemm_config.nr = 8;
2951
          qd8_f16_qc8w_gemm_config.log2_kr = 1;
2952
          qd8_f16_qc8w_gemm_config.log2_sr = 2;
2953
        }
2954
      #else  // !XNN_ENABLE_ASSEMBLY
2955
        if (XNN_ENABLE_ARM_I8MM && (hardware_config->arch_flags & xnn_arch_arm_neon_i8mm)) {
2956
          #if XNN_ENABLE_ARM_I8MM
2957
            qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x16c8__neoni8mm);
2958
            qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x16c8__neoni8mm);
2959
            qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
2960
            qd8_f16_qc8w_gemm_config.mr = 4;
2961
            qd8_f16_qc8w_gemm_config.nr = 16;
2962
            qd8_f16_qc8w_gemm_config.log2_kr = 3;
2963
          #endif  // XNN_ENABLE_ARM_I8MM
2964
        } else if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot) && (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith)) {
2965
          #if XNN_ENABLE_ARM_DOTPROD
2966
            qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x16c4__neondotfp16arith);
2967
            qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x16c4__neondotfp16arith);
2968
            qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
2969
            qd8_f16_qc8w_gemm_config.mr = 4;
2970
            qd8_f16_qc8w_gemm_config.nr = 16;
2971
            qd8_f16_qc8w_gemm_config.log2_kr = 2;
2972
          #endif  // XNN_ENABLE_ARM_DOTPROD
2973
        } else if (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith) {
2974
          qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c2s4__neonfp16arith);
2975
          qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_2x8c2s4__neonfp16arith);
2976
          qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
2977
          qd8_f16_qc8w_gemm_config.mr = 2;
2978
          qd8_f16_qc8w_gemm_config.nr = 8;
2979
          qd8_f16_qc8w_gemm_config.log2_kr = 1;
2980
          qd8_f16_qc8w_gemm_config.log2_sr = 2;
2981
        }
2982
      #endif  // XNN_ENABLE_ASSEMBLY
2983
    #else  // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC && !XNN_PLATFORM_WINDOWS
2984
      #if XNN_ENABLE_ASSEMBLY
2985
        if (XNN_ENABLE_ARM_I8MM && (hardware_config->arch_flags & xnn_arch_arm_neon_i8mm)) {
2986
          #if XNN_ENABLE_ARM_I8MM
2987
            qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x16c8__neoni8mm);
2988
            qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x16c8__neoni8mm);
2989
            qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
2990
            qd8_f16_qc8w_gemm_config.mr = 4;
2991
            qd8_f16_qc8w_gemm_config.nr = 16;
2992
            qd8_f16_qc8w_gemm_config.log2_kr = 3;
2993
          #endif  // XNN_ENABLE_ARM_I8MM
2994
        } else if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot) && (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith)) {
2995
          #if XNN_ENABLE_ARM_DOTPROD
2996
           switch (hardware_config->uarch[XNN_UARCH_INDEX]) {
2997
              case xnn_uarch_cortex_a55:
2998
                qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x16c4__asm_aarch64_neondotfp16arith_cortex_a55);
2999
                break;
3000
              default:
3001
                qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x16c4__asm_aarch64_neondotfp16arith_ld128);
3002
                break;
3003
            }
3004
            qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x16c4__neondotfp16arith);
3005
            qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3006
            qd8_f16_qc8w_gemm_config.mr = 4;
3007
            qd8_f16_qc8w_gemm_config.nr = 16;
3008
            qd8_f16_qc8w_gemm_config.log2_kr = 2;
3009
          #endif  // XNN_ENABLE_ARM_DOTPROD
3010
        } else if (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith) {
3011
          qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c2s4__neonfp16arith);
3012
          qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_2x8c2s4__neonfp16arith);
3013
          qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3014
          qd8_f16_qc8w_gemm_config.mr = 2;
3015
          qd8_f16_qc8w_gemm_config.nr = 8;
3016
          qd8_f16_qc8w_gemm_config.log2_kr = 1;
3017
          qd8_f16_qc8w_gemm_config.log2_sr = 2;
3018
        }
3019
        #if XNN_MAX_UARCH_TYPES > 1
3020
        {
3021
          /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
3022
          const uint32_t mr = qd8_f16_qc8w_gemm_config.mr;
3023
          const uint32_t nr = qd8_f16_qc8w_gemm_config.nr;
3024
          const uint32_t log2_kr = qd8_f16_qc8w_gemm_config.log2_kr;
3025
          // Avoid unused warnings.
3026
          (void) mr;
3027
          (void) nr;
3028
          (void) log2_kr;
3029
          for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
3030
            switch (hardware_config->uarch[i]) {
3031
              case xnn_uarch_cortex_a55:
3032
                #if XNN_ENABLE_ARM_DOTPROD
3033
                  if (mr == 4 && nr == 16 && log2_kr == 2 && (hardware_config->arch_flags & xnn_arch_arm_neon_dot) && (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith)) {
3034
                    qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x16c4__neondotfp16arith);
3035
                    qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x16c4__asm_aarch64_neondotfp16arith_cortex_a55);
3036
                  }
3037
                #endif  // XNN_ENABLE_ARM_DOTPROD
3038
                break;
3039
              default:
3040
                break;
3041
            }
3042
          }
3043
        }
3044
        #endif  // XNN_MAX_UARCH_TYPES > 1
3045
      #else  // !XNN_ENABLE_ASSEMBLY
3046
        if (XNN_ENABLE_ARM_I8MM && (hardware_config->arch_flags & xnn_arch_arm_neon_i8mm)) {
3047
          #if XNN_ENABLE_ARM_I8MM
3048
            qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x16c8__neoni8mm);
3049
            qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x16c8__neoni8mm);
3050
            qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3051
            qd8_f16_qc8w_gemm_config.mr = 4;
3052
            qd8_f16_qc8w_gemm_config.nr = 16;
3053
            qd8_f16_qc8w_gemm_config.log2_kr = 3;
3054
          #endif  // XNN_ENABLE_ARM_I8MM
3055
        } else if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot) && (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith)) {
3056
          #if XNN_ENABLE_ARM_DOTPROD
3057
            qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x16c4__neondotfp16arith);
3058
            qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x16c4__neondotfp16arith);
3059
            qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3060
            qd8_f16_qc8w_gemm_config.mr = 4;
3061
            qd8_f16_qc8w_gemm_config.nr = 16;
3062
            qd8_f16_qc8w_gemm_config.log2_kr = 2;
3063
          #endif  // XNN_ENABLE_ARM_DOTPROD
3064
        } else if (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith) {
3065
          qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c2s4__neonfp16arith);
3066
          qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_2x8c2s4__neonfp16arith);
3067
          qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3068
          qd8_f16_qc8w_gemm_config.mr = 2;
3069
          qd8_f16_qc8w_gemm_config.nr = 8;
3070
          qd8_f16_qc8w_gemm_config.log2_kr = 1;
3071
          qd8_f16_qc8w_gemm_config.log2_sr = 2;
3072
        }
3073
      #endif  // XNN_ENABLE_ASSEMBLY
3074
    #endif  // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC || XNN_PLATFORM_WINDOWS
3075
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
3076
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
3077
0
    assert(hardware_config != NULL);
3078
0
    (void) hardware_config;  // May be unused.
3079
0
    #if XNN_ENABLE_AVX512AMX
3080
0
      if (hardware_config->arch_flags & xnn_arch_x86_avx512amx) {
3081
0
        qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x64c4__avx512amx);
3082
0
        qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(16)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_16x64c4__avx512amx);
3083
0
        qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3084
0
        qd8_f16_qc8w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
3085
0
        qd8_f16_qc8w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
3086
0
        qd8_f16_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
3087
0
        #if XNN_ENABLE_AVX256VNNI
3088
0
        qd8_f16_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_qs8_packw_gemm_goi_ukernel_x64c4__avx256vnni_prfm;
3089
        #else
3090
        qd8_f16_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
3091
        #endif
3092
0
        qd8_f16_qc8w_gemm_config.mr = 16;
3093
0
        qd8_f16_qc8w_gemm_config.nr = 64;
3094
0
        qd8_f16_qc8w_gemm_config.log2_kr = 2;
3095
0
      } else
3096
0
    #endif
3097
0
    #if XNN_ENABLE_AVX256SKX
3098
0
      if (hardware_config->arch_flags & xnn_arch_x86_avx256skx) {
3099
0
        qd8_f16_qc8w_gemm_config.arch = xnn_arch_x86_avx256skx;
3100
0
        qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c8__avx256skx);
3101
0
        qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_5x8c8__avx256skx);
3102
0
        qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3103
0
        qd8_f16_qc8w_gemm_config.mr = 5;
3104
0
        qd8_f16_qc8w_gemm_config.nr = 8;
3105
0
        qd8_f16_qc8w_gemm_config.log2_kr = 3;
3106
0
      } else
3107
0
    #endif
3108
0
    #if XNN_ENABLE_AVX2
3109
0
      if (hardware_config->arch_flags & xnn_arch_x86_avx2) {
3110
0
        qd8_f16_qc8w_gemm_config.arch = xnn_arch_x86_avx2;
3111
0
        qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c8__avx2);
3112
0
        qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_3x8c8__avx2);
3113
0
        qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3114
0
        qd8_f16_qc8w_gemm_config.mr = 3;
3115
0
        qd8_f16_qc8w_gemm_config.nr = 8;
3116
0
        qd8_f16_qc8w_gemm_config.log2_kr = 3;
3117
0
      }
3118
0
    #endif
3119
0
  #endif
3120
0
  assert(qd8_f16_qc8w_gemm_config.mr <= XNN_MAX_MR);
3121
0
  assert(qd8_f16_qc8w_gemm_config.mr <= (XNN_EXTRA_QUANTIZATION_PARAMS + 1));
3122
0
}
3123
3124
0
// Populates `qdu8_f16_qc8w_gemm_config`: first the architecture-independent
// fields (element sizes, default qs8 packing functions), then — on x86/x86-64
// only — the best available micro-kernels in priority order
// (AVX256VNNI first, then AVXVNNI).
// NOTE(review): the standalone numeric lines interleaved below are coverage
// counters from the report rendering; they are not C tokens.
static void init_qdu8_f16_qc8w_gemm_config(void) {
3125
  // Common parameters.
3126
0
  qdu8_f16_qc8w_gemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_INT8_T;
3126
0
  qdu8_f16_qc8w_gemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_INT8_T;
3128
0
  qdu8_f16_qc8w_gemm_config.log2_filter_element_bit_size = XNN_LOG2_SIZEOF_INT8_T + 3;
3129
0
  qdu8_f16_qc8w_gemm_config.bias_element_size = sizeof(float);
3130
  // Use the same packing function throughout.
3131
0
  qdu8_f16_qc8w_gemm_config.pack_weights_and_biases =
3132
0
      (xnn_pack_weights_and_biases_fn)xnn_pack_qs8_weights_and_biases;
3133
0
  qdu8_f16_qc8w_gemm_config.packed_stride_weights_and_biases =
3134
0
      (xnn_packed_stride_weights_and_biases_fn)
3135
0
          xnn_packed_stride_qs8_weights_and_biases;
3136
0
  qdu8_f16_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
3137
0
  qdu8_f16_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
3138
3139
  // Arch-specific parameters.
3140
0
  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
3141
0
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
3142
0
    assert(hardware_config != NULL);
3143
0
    (void) hardware_config;  // May be unused.
3144
0
    // Prefer AVX256VNNI kernels when compiled in and supported at runtime.
    #if XNN_ENABLE_AVX256VNNI
3145
0
      if (hardware_config->arch_flags & xnn_arch_x86_avx256vnni) {
3146
0
        qdu8_f16_qc8w_gemm_config.arch = xnn_arch_x86_avx256vnni;
3147
0
        // NOTE(review): kernels with qd8_* names are registered into this
        // qdu8_* config — presumably shared implementations; confirm intended.
        qdu8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c8__avx256vnni);
3148
0
        qdu8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(8)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_8x8c8__avx256vnni);
3149
0
        qdu8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c8__avx256vnni);
3150
0
        qdu8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(8)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_8x8c8__avx256vnni);
3151
0
        qdu8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3152
0
        qdu8_f16_qc8w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
3153
0
        qdu8_f16_qc8w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
3154
0
        qdu8_f16_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
3155
0
        qdu8_f16_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_qs8_packw_gemm_goi_ukernel_x8c8__avx256vnni_prfm;
3156
0
        qdu8_f16_qc8w_gemm_config.mr = 8;
3157
0
        qdu8_f16_qc8w_gemm_config.nr = 8;
3158
0
        qdu8_f16_qc8w_gemm_config.log2_kr = 3;
3159
0
      } else
3160
0
    #endif
3161
0
    #if XNN_ENABLE_AVXVNNI
3162
0
      if (hardware_config->arch_flags & xnn_arch_x86_avxvnni) {
3163
        // AVX VNNI checked before AVX512SKX as it performs better with VNNI microkernels
3164
0
        qdu8_f16_qc8w_gemm_config.arch = xnn_arch_x86_avxvnni;
3165
0
        qdu8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c8__avxvnni_prfm);
3166
0
        qdu8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc8w_gemm_minmax_ukernel_5x8c8__avxvnni_prfm);
3167
0
        qdu8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c8__avxvnni_prfm);
3168
0
        qdu8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_5x8c8__avxvnni_prfm);
3169
0
        qdu8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3170
0
        qdu8_f16_qc8w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
3171
0
        qdu8_f16_qc8w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
3172
0
        qdu8_f16_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
3173
0
        qdu8_f16_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_qs8_packw_gemm_goi_ukernel_x8c8__avxvnni_prfm;
3174
0
        qdu8_f16_qc8w_gemm_config.mr = 5;
3175
0
        qdu8_f16_qc8w_gemm_config.nr = 8;
3176
0
        qdu8_f16_qc8w_gemm_config.log2_kr = 3;
3177
0
      }
3178
    // When AVXVNNI is compiled out, the empty block below terminates the
    // dangling `} else` left open by the AVX256VNNI branch above.
    #else
3179
    {
3180
    }
3181
    #endif
3182
0
    // Sanity-check that the selected mr fits framework limits.
    assert(qdu8_f16_qc8w_gemm_config.mr <= XNN_MAX_MR);
3183
0
    assert(qdu8_f16_qc8w_gemm_config.mr <= (XNN_EXTRA_QUANTIZATION_PARAMS + 1));
3184
0
  #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
3185
0
}
3186
3187
0
static void init_qd8_f16_qc8w_igemm_config(void) {
3188
  // Common parameters.
3189
0
  qd8_f16_qc8w_igemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_INT8_T;
3190
0
  qd8_f16_qc8w_igemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_INT8_T;
3191
0
  qd8_f16_qc8w_igemm_config.log2_filter_element_bit_size = XNN_LOG2_SIZEOF_INT8_T + 3;
3192
0
  qd8_f16_qc8w_igemm_config.bias_element_size = sizeof(float);
3193
  // Use the same packing function throughout.
3194
0
  qd8_f16_qc8w_igemm_config.pack_weights_and_biases = (xnn_pack_weights_and_biases_fn)xnn_pack_qs8_weights_and_biases;
3195
0
  qd8_f16_qc8w_igemm_config.packed_stride_weights_and_biases = (xnn_packed_stride_weights_and_biases_fn) xnn_packed_stride_qs8_weights_and_biases;
3196
0
  qd8_f16_qc8w_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
3197
0
  qd8_f16_qc8w_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
3198
  #if XNN_ENABLE_ARM_FP16_SCALAR && XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM
3199
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
3200
    assert(hardware_config != NULL);
3201
    (void) hardware_config;  // May be unused.
3202
    if (hardware_config->arch_flags & xnn_arch_arm_neon) {
3203
      #if XNN_ENABLE_ASSEMBLY
3204
        if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot) && (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith)) {
3205
          #if XNN_ENABLE_ARM_DOTPROD
3206
            qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c4__neondotfp16arith);
3207
            qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_4x8c4__neondotfp16arith);
3208
            qd8_f16_qc8w_igemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3209
            qd8_f16_qc8w_igemm_config.mr = 4;
3210
            qd8_f16_qc8w_igemm_config.nr = 8;
3211
            qd8_f16_qc8w_igemm_config.log2_kr = 2;
3212
          #endif  // XNN_ENABLE_ARM_DOTPROD
3213
        } else if (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith) {
3214
          qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c2s4__neonfp16arith);
3215
          qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_2x8c2s4__neonfp16arith);
3216
          qd8_f16_qc8w_igemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3217
          qd8_f16_qc8w_igemm_config.mr = 2;
3218
          qd8_f16_qc8w_igemm_config.nr = 8;
3219
          qd8_f16_qc8w_igemm_config.log2_kr = 1;
3220
          qd8_f16_qc8w_igemm_config.log2_sr = 2;
3221
        }
3222
        #if XNN_MAX_UARCH_TYPES > 1 && XNN_ENABLE_ARM_DOTPROD
3223
        {
3224
          /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
3225
          const uint32_t mr = qd8_f16_qc8w_igemm_config.mr;
3226
          const uint32_t nr = qd8_f16_qc8w_igemm_config.nr;
3227
          const uint32_t log2_kr = qd8_f16_qc8w_igemm_config.log2_kr;
3228
          for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
3229
            switch (hardware_config->uarch[i]) {
3230
              case xnn_uarch_cortex_a55:
3231
                #if XNN_ENABLE_ARM_DOTPROD
3232
                  if (mr == 4 && nr == 8 && log2_kr == 2 && (hardware_config->arch_flags & xnn_arch_arm_neon_dot) && (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith)) {
3233
                    qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c4__neondotfp16arith);
3234
                    qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_4x8c4__asm_aarch32_neondotfp16arith_cortex_a55);
3235
                  }
3236
                #endif  // XNN_ENABLE_ARM_DOTPROD
3237
                break;
3238
              default:
3239
                break;
3240
            }
3241
          }
3242
        }
3243
        #endif  // XNN_MAX_UARCH_TYPES > 1 && XNN_ENABLE_ARM_DOTPROD
3244
      #else  // XNN_ENABLE_ASSEMBLY
3245
        if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot) && (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith)) {
3246
          #if XNN_ENABLE_ARM_DOTPROD
3247
            qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c4__neondotfp16arith);
3248
            qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_4x8c4__neondotfp16arith);
3249
            qd8_f16_qc8w_igemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3250
            qd8_f16_qc8w_igemm_config.mr = 4;
3251
            qd8_f16_qc8w_igemm_config.nr = 8;
3252
            qd8_f16_qc8w_igemm_config.log2_kr = 2;
3253
          #endif  // XNN_ENABLE_ARM_DOTPROD
3254
        } else if (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith) {
3255
          qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c2s4__neonfp16arith);
3256
          qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_2x8c2s4__neonfp16arith);
3257
          qd8_f16_qc8w_igemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3258
          qd8_f16_qc8w_igemm_config.mr = 2;
3259
          qd8_f16_qc8w_igemm_config.nr = 8;
3260
          qd8_f16_qc8w_igemm_config.log2_kr = 1;
3261
          qd8_f16_qc8w_igemm_config.log2_sr = 2;
3262
        }
3263
      #endif  // XNN_ENABLE_ASSEMBLY
3264
    }
3265
  #elif XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64
3266
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
3267
    assert(hardware_config != NULL);
3268
    (void) hardware_config;  // May be unused.
3269
    #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC || XNN_PLATFORM_WINDOWS
3270
      #if XNN_ENABLE_ASSEMBLY
3271
        if (XNN_ENABLE_ARM_I8MM && (hardware_config->arch_flags & xnn_arch_arm_neon_i8mm)) {
3272
          #if XNN_ENABLE_ARM_I8MM
3273
            qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x16c8__neoni8mm);
3274
            qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_4x16c8__neoni8mm);
3275
            qd8_f16_qc8w_igemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3276
            qd8_f16_qc8w_igemm_config.mr = 4;
3277
            qd8_f16_qc8w_igemm_config.nr = 16;
3278
            qd8_f16_qc8w_igemm_config.log2_kr = 3;
3279
          #endif  // XNN_ENABLE_ARM_I8MM
3280
        } else if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot) && (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith)) {
3281
          #if XNN_ENABLE_ARM_DOTPROD
3282
            qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x16c4__neondotfp16arith);
3283
            qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_4x16c4__neondotfp16arith);
3284
            qd8_f16_qc8w_igemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3285
            qd8_f16_qc8w_igemm_config.mr = 4;
3286
            qd8_f16_qc8w_igemm_config.nr = 16;
3287
            qd8_f16_qc8w_igemm_config.log2_kr = 2;
3288
          #endif  // XNN_ENABLE_ARM_DOTPROD
3289
        } else if (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith) {
3290
          qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c2s4__neonfp16arith);
3291
          qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_2x8c2s4__neonfp16arith);
3292
          qd8_f16_qc8w_igemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3293
          qd8_f16_qc8w_igemm_config.mr = 2;
3294
          qd8_f16_qc8w_igemm_config.nr = 8;
3295
          qd8_f16_qc8w_igemm_config.log2_kr = 1;
3296
          qd8_f16_qc8w_igemm_config.log2_sr = 2;
3297
        }
3298
      #else  // !XNN_ENABLE_ASSEMBLY
3299
        if (XNN_ENABLE_ARM_I8MM && (hardware_config->arch_flags & xnn_arch_arm_neon_i8mm)) {
3300
          #if XNN_ENABLE_ARM_I8MM
3301
            qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x16c8__neoni8mm);
3302
            qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_4x16c8__neoni8mm);
3303
            qd8_f16_qc8w_igemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3304
            qd8_f16_qc8w_igemm_config.mr = 4;
3305
            qd8_f16_qc8w_igemm_config.nr = 16;
3306
            qd8_f16_qc8w_igemm_config.log2_kr = 3;
3307
          #endif  // XNN_ENABLE_ARM_I8MM
3308
        } else if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot) && (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith)) {
3309
          #if XNN_ENABLE_ARM_DOTPROD
3310
            qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x16c4__neondotfp16arith);
3311
            qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_4x16c4__neondotfp16arith);
3312
            qd8_f16_qc8w_igemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3313
            qd8_f16_qc8w_igemm_config.mr = 4;
3314
            qd8_f16_qc8w_igemm_config.nr = 16;
3315
            qd8_f16_qc8w_igemm_config.log2_kr = 2;
3316
          #endif  // XNN_ENABLE_ARM_DOTPROD
3317
        } else if (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith) {
3318
          qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c2s4__neonfp16arith);
3319
          qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_2x8c2s4__neonfp16arith);
3320
          qd8_f16_qc8w_igemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3321
          qd8_f16_qc8w_igemm_config.mr = 2;
3322
          qd8_f16_qc8w_igemm_config.nr = 8;
3323
          qd8_f16_qc8w_igemm_config.log2_kr = 1;
3324
          qd8_f16_qc8w_igemm_config.log2_sr = 2;
3325
        }
3326
      #endif  // XNN_ENABLE_ASSEMBLY
3327
    #else  // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC && !XNN_PLATFORM_WINDOWS
3328
      #if XNN_ENABLE_ASSEMBLY
3329
        if (XNN_ENABLE_ARM_I8MM && (hardware_config->arch_flags & xnn_arch_arm_neon_i8mm)) {
3330
          #if XNN_ENABLE_ARM_I8MM
3331
            qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x16c8__neoni8mm);
3332
            qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_4x16c8__neoni8mm);
3333
            qd8_f16_qc8w_igemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3334
            qd8_f16_qc8w_igemm_config.mr = 4;
3335
            qd8_f16_qc8w_igemm_config.nr = 16;
3336
            qd8_f16_qc8w_igemm_config.log2_kr = 3;
3337
          #endif  // XNN_ENABLE_ARM_I8MM
3338
        } else if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot) && (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith)) {
3339
          #if XNN_ENABLE_ARM_DOTPROD
3340
           switch (hardware_config->uarch[XNN_UARCH_INDEX]) {
3341
              case xnn_uarch_cortex_a55:
3342
                qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_4x16c4__asm_aarch64_neondotfp16arith_cortex_a55);
3343
                break;
3344
              default:
3345
                qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_4x16c4__asm_aarch64_neondotfp16arith_ld128);
3346
                break;
3347
            }
3348
            qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x16c4__neondotfp16arith);
3349
            qd8_f16_qc8w_igemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3350
            qd8_f16_qc8w_igemm_config.mr = 4;
3351
            qd8_f16_qc8w_igemm_config.nr = 16;
3352
            qd8_f16_qc8w_igemm_config.log2_kr = 2;
3353
          #endif  // XNN_ENABLE_ARM_DOTPROD
3354
        } else if (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith) {
3355
          qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c2s4__neonfp16arith);
3356
          qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_2x8c2s4__neonfp16arith);
3357
          qd8_f16_qc8w_igemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3358
          qd8_f16_qc8w_igemm_config.mr = 2;
3359
          qd8_f16_qc8w_igemm_config.nr = 8;
3360
          qd8_f16_qc8w_igemm_config.log2_kr = 1;
3361
          qd8_f16_qc8w_igemm_config.log2_sr = 2;
3362
        }
3363
        #if XNN_MAX_UARCH_TYPES > 1
3364
        {
3365
          /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
3366
          const uint32_t mr = qd8_f16_qc8w_igemm_config.mr;
3367
          const uint32_t nr = qd8_f16_qc8w_igemm_config.nr;
3368
          const uint32_t log2_kr = qd8_f16_qc8w_igemm_config.log2_kr;
3369
          // Avoid unused warnings.
3370
          (void) mr;
3371
          (void) nr;
3372
          (void) log2_kr;
3373
          for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
3374
            switch (hardware_config->uarch[i]) {
3375
              case xnn_uarch_cortex_a55:
3376
                #if XNN_ENABLE_ARM_DOTPROD
3377
                  if (mr == 4 && nr == 16 && log2_kr == 2 && (hardware_config->arch_flags & xnn_arch_arm_neon_dot) && (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith)) {
3378
                    qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x16c4__neondotfp16arith);
3379
                    qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_4x16c4__asm_aarch64_neondotfp16arith_cortex_a55);
3380
                  }
3381
                #endif  // XNN_ENABLE_ARM_DOTPROD
3382
                break;
3383
              default:
3384
                break;
3385
            }
3386
          }
3387
        }
3388
        #endif  // XNN_MAX_UARCH_TYPES > 1
3389
      #else  // !XNN_ENABLE_ASSEMBLY
3390
        if (XNN_ENABLE_ARM_I8MM && (hardware_config->arch_flags & xnn_arch_arm_neon_i8mm)) {
3391
          #if XNN_ENABLE_ARM_I8MM
3392
            qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x16c8__neoni8mm);
3393
            qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_4x16c8__neoni8mm);
3394
            qd8_f16_qc8w_igemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3395
            qd8_f16_qc8w_igemm_config.mr = 4;
3396
            qd8_f16_qc8w_igemm_config.nr = 16;
3397
            qd8_f16_qc8w_igemm_config.log2_kr = 3;
3398
          #endif  // XNN_ENABLE_ARM_I8MM
3399
        } else if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot) && (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith)) {
3400
          #if XNN_ENABLE_ARM_DOTPROD
3401
            qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x16c4__neondotfp16arith);
3402
            qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_4x16c4__neondotfp16arith);
3403
            qd8_f16_qc8w_igemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3404
            qd8_f16_qc8w_igemm_config.mr = 4;
3405
            qd8_f16_qc8w_igemm_config.nr = 16;
3406
            qd8_f16_qc8w_igemm_config.log2_kr = 2;
3407
          #endif  // XNN_ENABLE_ARM_DOTPROD
3408
        } else if (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith) {
3409
          qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c2s4__neonfp16arith);
3410
          qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_2x8c2s4__neonfp16arith);
3411
          qd8_f16_qc8w_igemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3412
          qd8_f16_qc8w_igemm_config.mr = 2;
3413
          qd8_f16_qc8w_igemm_config.nr = 8;
3414
          qd8_f16_qc8w_igemm_config.log2_kr = 1;
3415
          qd8_f16_qc8w_igemm_config.log2_sr = 2;
3416
        }
3417
      #endif  // XNN_ENABLE_ASSEMBLY
3418
    #endif  // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC || XNN_PLATFORM_WINDOWS
3419
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
3420
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
3421
0
    assert(hardware_config != NULL);
3422
0
    (void) hardware_config;  // May be unused.
3423
0
    #if XNN_ENABLE_AVX512AMX
3424
0
      if (hardware_config->arch_flags & xnn_arch_x86_avx512amx) {
3425
0
        qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x64c4__avx512amx);
3426
0
        qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(16)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_16x64c4__avx512amx);
3427
0
        qd8_f16_qc8w_igemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3428
0
        qd8_f16_qc8w_igemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
3429
0
        qd8_f16_qc8w_igemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
3430
0
        qd8_f16_qc8w_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
3431
0
        #if XNN_ENABLE_AVX256VNNI
3432
0
        qd8_f16_qc8w_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_qs8_packw_gemm_goi_ukernel_x64c4__avx256vnni_prfm;
3433
        #else
3434
        qd8_f16_qc8w_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
3435
        #endif
3436
0
        qd8_f16_qc8w_igemm_config.mr = 16;
3437
0
        qd8_f16_qc8w_igemm_config.nr = 64;
3438
0
        qd8_f16_qc8w_igemm_config.log2_kr = 2;
3439
0
      } else
3440
0
    #endif
3441
0
    #if XNN_ENABLE_AVX256SKX
3442
0
      if (hardware_config->arch_flags & xnn_arch_x86_avx256skx) {
3443
0
        qd8_f16_qc8w_igemm_config.arch = xnn_arch_x86_avx256skx;
3444
0
        qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c8__avx256skx);
3445
0
        qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_5x8c8__avx256skx);
3446
0
        qd8_f16_qc8w_igemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3447
0
        qd8_f16_qc8w_igemm_config.mr = 5;
3448
0
        qd8_f16_qc8w_igemm_config.nr = 8;
3449
0
        qd8_f16_qc8w_igemm_config.log2_kr = 3;
3450
0
      } else
3451
0
    #endif
3452
0
    #if XNN_ENABLE_AVX2
3453
0
      if (hardware_config->arch_flags & xnn_arch_x86_avx2) {
3454
0
        qd8_f16_qc8w_igemm_config.arch = xnn_arch_x86_avx2;
3455
0
        qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c8__avx2);
3456
0
        qd8_f16_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f16_qc8w_igemm_minmax_ukernel_3x8c8__avx2);
3457
0
        qd8_f16_qc8w_igemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
3458
0
        qd8_f16_qc8w_igemm_config.mr = 3;
3459
0
        qd8_f16_qc8w_igemm_config.nr = 8;
3460
0
        qd8_f16_qc8w_igemm_config.log2_kr = 3;
3461
0
      }
3462
0
    #endif
3463
0
  #endif
3464
0
  assert(qd8_f16_qc8w_igemm_config.mr <= XNN_MAX_MR);
3465
0
  assert(qd8_f16_qc8w_igemm_config.mr <= (XNN_EXTRA_QUANTIZATION_PARAMS + 1));
3466
0
}
3467
3468
0
// Populates the GEMM microkernel configuration for the QDU8 (dynamically
// quantized unsigned 8-bit input) x QC8W (per-channel signed 8-bit weights)
// -> F32 output path. Selects the best available microkernels for the
// detected x86 ISA; on non-x86 builds only the common (ISA-independent)
// fields are filled in.
static void init_qdu8_f32_qc8w_gemm_config(void) {
  // Common parameters.
  qdu8_f32_qc8w_gemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_INT8_T;
  qdu8_f32_qc8w_gemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_INT8_T;
  qdu8_f32_qc8w_gemm_config.log2_filter_element_bit_size = XNN_LOG2_SIZEOF_INT8_T + 3;
  qdu8_f32_qc8w_gemm_config.bias_element_size = sizeof(float);
  // Use the same packing function throughout.
  qdu8_f32_qc8w_gemm_config.pack_weights_and_biases =
      (xnn_pack_weights_and_biases_fn)xnn_pack_qs8_weights_and_biases;
  qdu8_f32_qc8w_gemm_config.packed_stride_weights_and_biases =
      (xnn_packed_stride_weights_and_biases_fn)
          xnn_packed_stride_qs8_weights_and_biases;
  qdu8_f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
  qdu8_f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;

  // Arch-specific parameters.
  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
    assert(hardware_config != NULL);
    (void) hardware_config;  // May be unused.
    // NOTE: each `#if ... } else #endif` arm below chains into the next
    // block's `if`, so the first matching ISA (listed best-first) wins.
    #if XNN_ENABLE_AVX512VNNI && XNN_ARCH_X86_64 && !XNN_PLATFORM_WINDOWS && XNN_ENABLE_ASSEMBLY
      if (hardware_config->arch_flags & xnn_arch_x86_avx512vnni) {
        // Hand-written AMD64 assembly kernels (fastest path).
        qdu8_f32_qc8w_gemm_config.arch = xnn_arch_x86_avx512vnni;
        qdu8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x64c4__asm_amd64_avx512vnni);
        qdu8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x64c4__asm_amd64_avx512vnni);
        qdu8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
        qdu8_f32_qc8w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
        qdu8_f32_qc8w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
        qdu8_f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
        qdu8_f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
        qdu8_f32_qc8w_gemm_config.mr = 5;
        qdu8_f32_qc8w_gemm_config.nr = 64;
        qdu8_f32_qc8w_gemm_config.log2_kr = 2;
      } else
    #endif
    #if XNN_ENABLE_AVX512VNNI
      if (hardware_config->arch_flags & xnn_arch_x86_avx512vnni) {
        // Intrinsics-based AVX512-VNNI kernels (when assembly is unavailable).
        qdu8_f32_qc8w_gemm_config.arch = xnn_arch_x86_avx512vnni;
        qdu8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c8__avx512vnni_prfm);
        qdu8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(10)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x16c8__avx512vnni_prfm);
        qdu8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
        qdu8_f32_qc8w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
        qdu8_f32_qc8w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
        qdu8_f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
        // Prefer a vectorized GOI packing kernel when AVX256-VNNI is built in.
        #if XNN_ENABLE_AVX256VNNI
          qdu8_f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_qs8_packw_gemm_goi_ukernel_x16c8__avx256vnni_prfm;
        #else
          qdu8_f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_qs8_packw_gemm_goi_ukernel_x16c8__scalar;
        #endif
        qdu8_f32_qc8w_gemm_config.mr = 10;
        qdu8_f32_qc8w_gemm_config.nr = 16;
        qdu8_f32_qc8w_gemm_config.log2_kr = 3;
      } else
    #endif
    #if XNN_ENABLE_AVXVNNI
     if (hardware_config->arch_flags & xnn_arch_x86_avxvnni) {
        qdu8_f32_qc8w_gemm_config.arch = xnn_arch_x86_avxvnni;
        qdu8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__avxvnni_prfm);
        qdu8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x8c8__avxvnni_prfm);
        qdu8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
        qdu8_f32_qc8w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
        qdu8_f32_qc8w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
        qdu8_f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
        qdu8_f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_qs8_packw_gemm_goi_ukernel_x8c8__avxvnni_prfm;
        qdu8_f32_qc8w_gemm_config.mr = 5;
        qdu8_f32_qc8w_gemm_config.nr = 8;
        qdu8_f32_qc8w_gemm_config.log2_kr = 3;
      }
    #else
    // No VNNI arm compiled in: this empty block absorbs the dangling `else`
    // left by the preprocessor chain above.
    {
    }
    #endif
    assert(qdu8_f32_qc8w_gemm_config.mr <= XNN_MAX_MR);
    assert(qdu8_f32_qc8w_gemm_config.mr <= (XNN_EXTRA_QUANTIZATION_PARAMS + 1));
  #endif //XNN_ARCH_X86 || XNN_ARCH_X86_64
}
3544
3545
0
// Populates the IGEMM (indirect GEMM, used by convolution) microkernel
// configuration for the QDU8 input x QC8W weights -> F32 output path.
// Mirrors init_qdu8_f32_qc8w_gemm_config() but installs dqigemm kernels.
static void init_qdu8_f32_qc8w_igemm_config(void) {
  // Common parameters.
  qdu8_f32_qc8w_igemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_INT8_T;
  // NOTE(review): the sibling gemm config uses XNN_LOG2_SIZEOF_INT8_T here;
  // both macros evaluate to the same size, but confirm which one is intended.
  qdu8_f32_qc8w_igemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_UINT8_T;
  qdu8_f32_qc8w_igemm_config.log2_filter_element_bit_size = XNN_LOG2_SIZEOF_UINT8_T + 3;
  qdu8_f32_qc8w_igemm_config.bias_element_size = sizeof(float);
  // Use the same packing function throughout.
  qdu8_f32_qc8w_igemm_config.pack_weights_and_biases =
      (xnn_pack_weights_and_biases_fn)xnn_pack_qs8_weights_and_biases;
  qdu8_f32_qc8w_igemm_config.packed_stride_weights_and_biases =
      (xnn_packed_stride_weights_and_biases_fn)
          xnn_packed_stride_qs8_weights_and_biases;
  qdu8_f32_qc8w_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
  qdu8_f32_qc8w_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
  // Arch-specific parameters: only x86/x86-64 has dedicated kernels here.
  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
    assert(hardware_config != NULL);
    (void) hardware_config;  // May be unused.
    // NOTE: each `#if ... } else #endif` arm chains into the next block's
    // `if`, so the first matching ISA (listed best-first) wins.
    #if XNN_ENABLE_AVX512VNNI
      if (hardware_config->arch_flags & xnn_arch_x86_avx512vnni) {
        qdu8_f32_qc8w_igemm_config.arch = xnn_arch_x86_avx512vnni;
        qdu8_f32_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c8__avx512vnni_prfm);
        qdu8_f32_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(10)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_10x16c8__avx512vnni_prfm);
        qdu8_f32_qc8w_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
        qdu8_f32_qc8w_igemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
        qdu8_f32_qc8w_igemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
        qdu8_f32_qc8w_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
        // Prefer a vectorized GOI packing kernel when AVX256-VNNI is built in.
        #if XNN_ENABLE_AVX256VNNI
          qdu8_f32_qc8w_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_qs8_packw_gemm_goi_ukernel_x16c8__avx256vnni_prfm;
        #else
          qdu8_f32_qc8w_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_qs8_packw_gemm_goi_ukernel_x16c8__scalar;
        #endif
        qdu8_f32_qc8w_igemm_config.mr = 10;
        qdu8_f32_qc8w_igemm_config.nr = 16;
        qdu8_f32_qc8w_igemm_config.log2_kr = 3;
      } else
    #endif
    #if XNN_ENABLE_AVXVNNI
     if (hardware_config->arch_flags & xnn_arch_x86_avxvnni) {
        qdu8_f32_qc8w_igemm_config.arch = xnn_arch_x86_avxvnni;
        qdu8_f32_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__avxvnni_prfm);
        qdu8_f32_qc8w_igemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_5x8c8__avxvnni_prfm);
        qdu8_f32_qc8w_igemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
        qdu8_f32_qc8w_igemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
        qdu8_f32_qc8w_igemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
        qdu8_f32_qc8w_igemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
        qdu8_f32_qc8w_igemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_qs8_packw_gemm_goi_ukernel_x8c8__avxvnni_prfm;
        qdu8_f32_qc8w_igemm_config.mr = 5;
        qdu8_f32_qc8w_igemm_config.nr = 8;
        qdu8_f32_qc8w_igemm_config.log2_kr = 3;
      }
    #else
    // No AVX-VNNI arm compiled in: this empty block absorbs the dangling
    // `else` left by the preprocessor chain above.
    {
    }
    #endif
    assert(qdu8_f32_qc8w_igemm_config.mr <= XNN_MAX_MR);
    assert(qdu8_f32_qc8w_igemm_config.mr <= (XNN_EXTRA_QUANTIZATION_PARAMS + 1));
  #endif //XNN_ARCH_X86 || XNN_ARCH_X86_64
}
3604
3605
0
// Populates the GEMM microkernel configuration for the QDU8 input x QC4W
// (per-channel 4-bit weights) -> F32 output path. Candidate ISAs are tried
// best-first (VNNI+GFNI, AVX512-VNNI, AVX-VNNI, AVX512-SKX, AVX256-SKX,
// AVX2, SSSE3); the first one matching hardware_config->arch_flags wins.
static void init_qdu8_f32_qc4w_gemm_config(void) {
  // Common parameters.
  qdu8_f32_qc4w_gemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_INT8_T;
  // Filter elements are nibbles stored two-per-byte; element size is a byte,
  // bit size is 4 (XNN_LOG2_BIT_SIZEOF_INT4).
  qdu8_f32_qc4w_gemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_UINT8_T;
  qdu8_f32_qc4w_gemm_config.log2_filter_element_bit_size = XNN_LOG2_BIT_SIZEOF_INT4;
  qdu8_f32_qc4w_gemm_config.bias_element_size = sizeof(float);
  // Use the same packing function throughout.
  qdu8_f32_qc4w_gemm_config.pack_weights_and_biases = (xnn_pack_weights_and_biases_fn) xnn_pack_qs4_weights_and_biases;
  qdu8_f32_qc4w_gemm_config.packed_stride_weights_and_biases = (xnn_packed_stride_weights_and_biases_fn) xnn_packed_stride_qs4_weights_and_biases;
  qdu8_f32_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w;  // Ignored
  qdu8_f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w;  // Ignored

  // Arch-specific parameters.
  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
    assert(hardware_config != NULL);
    (void) hardware_config;  // May be unused.
    // NOTE: each `#if ... } else #endif` arm chains into the next block's
    // `if`; the final SSSE3 `if` is unconditional code and ends the chain.
    #if XNN_ENABLE_AVX512VNNIGFNI
      // AMD Zen4 and Zen5 have gfni but it is slow.
      if ((hardware_config->arch_flags & xnn_arch_x86_avx512vnnigfni) && hardware_config->uarch[XNN_UARCH_INDEX] != xnn_uarch_zen4 && hardware_config->uarch[XNN_UARCH_INDEX] != xnn_uarch_zen5) {
        qdu8_f32_qc4w_gemm_config.arch = xnn_arch_x86_avx512vnnigfni;
        qdu8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__avx512vnnigfni_prfm);
        qdu8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(14)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x16c8__avx512vnnigfni_prfm);
        qdu8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
        qdu8_f32_qc4w_gemm_config.mr = 14;
        qdu8_f32_qc4w_gemm_config.nr = 16;
        qdu8_f32_qc4w_gemm_config.log2_kr = 3;
        // `planes = 2`: both nibble planes of the 4-bit weights are consumed.
        qdu8_f32_qc4w_gemm_config.planes = 2;
      } else
    #endif // XNN_ENABLE_AVX512VNNIGFNI
    #if XNN_ENABLE_AVX512VNNI
      if (hardware_config->arch_flags & xnn_arch_x86_avx512vnni) {
        qdu8_f32_qc4w_gemm_config.arch = xnn_arch_x86_avx512vnni;
        qdu8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__avx512vnni_prfm);
        qdu8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(8)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c8__avx512vnni_prfm);
        qdu8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
        qdu8_f32_qc4w_gemm_config.mr = 8;
        qdu8_f32_qc4w_gemm_config.nr = 16;
        qdu8_f32_qc4w_gemm_config.log2_kr = 3;
        qdu8_f32_qc4w_gemm_config.planes = 2;
      } else
    #endif
    #if XNN_ENABLE_AVXVNNI
      if (hardware_config->arch_flags & xnn_arch_x86_avxvnni) {
        qdu8_f32_qc4w_gemm_config.arch = xnn_arch_x86_avxvnni;
        qdu8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avxvnni_prfm);
        qdu8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x8c8__avxvnni_prfm);
        qdu8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
        qdu8_f32_qc4w_gemm_config.mr = 5;
        qdu8_f32_qc4w_gemm_config.nr = 8;
        qdu8_f32_qc4w_gemm_config.log2_kr = 3;
        qdu8_f32_qc4w_gemm_config.planes = 2;
      } else
    #endif
    #if XNN_ENABLE_AVX512SKX
      if (hardware_config->arch_flags & xnn_arch_x86_avx512skx) {
        qdu8_f32_qc4w_gemm_config.arch = xnn_arch_x86_avx512skx;
        qdu8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__avx512skx_madd_prfm);
        qdu8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(8)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c8__avx512skx_madd_prfm);
        qdu8_f32_qc4w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
        qdu8_f32_qc4w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
        // The madd kernels consume unsigned 4-bit weights (qc4uw packing).
        qdu8_f32_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4uw_gemm_gio_w;
        qdu8_f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4uw_gemm_goi_w;
        qdu8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
        qdu8_f32_qc4w_gemm_config.mr = 8;
        qdu8_f32_qc4w_gemm_config.nr = 16;
        qdu8_f32_qc4w_gemm_config.log2_kr = 3;
        qdu8_f32_qc4w_gemm_config.planes = 2;
    } else
    #endif
    #if XNN_ENABLE_AVX256SKX
      if (hardware_config->arch_flags & xnn_arch_x86_avx256skx) {
        qdu8_f32_qc4w_gemm_config.arch = xnn_arch_x86_avx256skx;
        qdu8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avx256skx_madd_prfm);
        qdu8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(8)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x8c8__avx256skx_madd_prfm);
        qdu8_f32_qc4w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
        qdu8_f32_qc4w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
        qdu8_f32_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4uw_gemm_gio_w;
        qdu8_f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4uw_gemm_goi_w;
        qdu8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
        qdu8_f32_qc4w_gemm_config.mr = 8;
        qdu8_f32_qc4w_gemm_config.nr = 8;
        qdu8_f32_qc4w_gemm_config.log2_kr = 3;
        qdu8_f32_qc4w_gemm_config.planes = 2;
      } else
    #endif
    #if XNN_ENABLE_AVX2
      if (hardware_config->arch_flags & xnn_arch_x86_avx2) {
        qdu8_f32_qc4w_gemm_config.arch = xnn_arch_x86_avx2;
        qdu8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avx2_madd_prfm);
        qdu8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x8c8__avx2_madd_prfm);
        qdu8_f32_qc4w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
        qdu8_f32_qc4w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
        qdu8_f32_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4uw_gemm_gio_w;
        qdu8_f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4uw_gemm_goi_w;
        qdu8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
        qdu8_f32_qc4w_gemm_config.mr = 4;
        qdu8_f32_qc4w_gemm_config.nr = 8;
        qdu8_f32_qc4w_gemm_config.log2_kr = 3;
        qdu8_f32_qc4w_gemm_config.planes = 2;
      } else
    #endif
    // Baseline fallback; leaves the config zero-initialized if even SSSE3
    // is absent.
    if (hardware_config->arch_flags & xnn_arch_x86_ssse3) {
      qdu8_f32_qc4w_gemm_config.arch = xnn_arch_x86_ssse3;
      qdu8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__ssse3_madd_prfm);
      qdu8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x4c8__ssse3_madd_prfm);
      qdu8_f32_qc4w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
      qdu8_f32_qc4w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
      qdu8_f32_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4uw_gemm_gio_w;
      qdu8_f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4uw_gemm_goi_w;
      qdu8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
      qdu8_f32_qc4w_gemm_config.mr = 5;
      qdu8_f32_qc4w_gemm_config.nr = 4;
      qdu8_f32_qc4w_gemm_config.log2_kr = 3;
      qdu8_f32_qc4w_gemm_config.planes = 2;
    }
    assert(qdu8_f32_qc4w_gemm_config.mr <= XNN_MAX_MR);
    assert(qdu8_f32_qc4w_gemm_config.mr <= (XNN_EXTRA_QUANTIZATION_PARAMS + 1));
  #endif //XNN_ARCH_X86 || XNN_ARCH_X86_64
}
3725
3726
0
static void init_qd8_f32_qc8w_gemm_config(void) {
3727
  // Common parameters.
3728
0
  qd8_f32_qc8w_gemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_INT8_T;
3729
0
  qd8_f32_qc8w_gemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_INT8_T;
3730
0
  qd8_f32_qc8w_gemm_config.log2_filter_element_bit_size = XNN_LOG2_SIZEOF_INT8_T + 3;
3731
0
  qd8_f32_qc8w_gemm_config.bias_element_size = sizeof(float);
3732
  // Use the same packing function throughout.
3733
0
  qd8_f32_qc8w_gemm_config.pack_weights_and_biases =
3734
0
      (xnn_pack_weights_and_biases_fn)xnn_pack_qs8_weights_and_biases;
3735
0
  qd8_f32_qc8w_gemm_config.packed_stride_weights_and_biases =
3736
0
      (xnn_packed_stride_weights_and_biases_fn)
3737
0
          xnn_packed_stride_qs8_weights_and_biases;
3738
0
  qd8_f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
3739
0
  qd8_f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
3740
3741
  // Arch-specific parameters.
3742
  #if XNN_ARCH_ARM
3743
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
3744
    assert(hardware_config != NULL);
3745
    (void) hardware_config;  // May be unused.
3746
    if (hardware_config->arch_flags & xnn_arch_arm_neon) {
3747
      #if XNN_ENABLE_ASSEMBLY
3748
        #if XNN_ENABLE_ARM_DOTPROD
3749
          if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot)) {
3750
            switch (hardware_config->uarch[XNN_UARCH_INDEX]) {
3751
              case xnn_uarch_cortex_a55:
3752
                qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c4__neondot);
3753
                qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c4__asm_aarch32_neondot_cortex_a55);
3754
                qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c4__neondot);
3755
                qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x8c4__asm_aarch32_neondot_cortex_a55);
3756
                qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
3757
                qd8_f32_qc8w_gemm_config.mr = 4;
3758
                qd8_f32_qc8w_gemm_config.nr = 8;
3759
                qd8_f32_qc8w_gemm_config.log2_kr = 2;
3760
                break;
3761
              default:
3762
                qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c4__neondot);
3763
                qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c4__neondot);
3764
                qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c4__neondot);
3765
                qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x8c4__neondot);
3766
                qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
3767
                qd8_f32_qc8w_gemm_config.mr = 4;
3768
                qd8_f32_qc8w_gemm_config.nr = 8;
3769
                qd8_f32_qc8w_gemm_config.log2_kr = 2;
3770
                break;
3771
            }
3772
          } else
3773
        #endif  // XNN_ENABLE_ARM_DOTPROD
3774
        {
3775
          switch (hardware_config->uarch[XNN_UARCH_INDEX]) {
3776
            case xnn_uarch_cortex_a53:
3777
            case xnn_uarch_cortex_a55:
3778
              qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8__asm_aarch32_neonmlal_ld64_2);
3779
              qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8__asm_aarch32_neonmlal_ld64_2);
3780
              qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8__neon_mlal_lane);
3781
              qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x8__neon_mlal_lane);
3782
              qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
3783
              qd8_f32_qc8w_gemm_config.mr = 4;
3784
              qd8_f32_qc8w_gemm_config.nr = 8;
3785
              break;
3786
            default:
3787
              qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c2s4__neon_mlal);
3788
              qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c2s4__neon_mlal);
3789
              qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c2s4__neon_mlal);
3790
              qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x8c2s4__neon_mlal);
3791
              qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
3792
              qd8_f32_qc8w_gemm_config.mr = 2;
3793
              qd8_f32_qc8w_gemm_config.nr = 8;
3794
              qd8_f32_qc8w_gemm_config.log2_kr = 1;
3795
              qd8_f32_qc8w_gemm_config.log2_sr = 2;
3796
              break;
3797
          }
3798
        }
3799
        #if XNN_MAX_UARCH_TYPES > 1 && XNN_ENABLE_ARM_DOTPROD
3800
        {
3801
          /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
3802
          const uint32_t mr = qd8_f32_qc8w_gemm_config.mr;
3803
          const uint32_t nr = qd8_f32_qc8w_gemm_config.nr;
3804
          const uint32_t log2_kr = qd8_f32_qc8w_gemm_config.log2_kr;
3805
          for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
3806
            switch (hardware_config->uarch[i]) {
3807
              case xnn_uarch_cortex_a55:
3808
                #if XNN_ENABLE_ARM_DOTPROD
3809
                  if (mr == 4 && nr == 8 && log2_kr == 2 && (hardware_config->arch_flags & xnn_arch_arm_neon_dot)) {
3810
                    qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c4__neondot);
3811
                    qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c4__asm_aarch32_neondot_cortex_a55);
3812
                    qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c4__neondot);
3813
                    qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x8c4__asm_aarch32_neondot_cortex_a55);
3814
                  }
3815
                #endif  // XNN_ENABLE_ARM_DOTPROD
3816
                break;
3817
              default:
3818
                break;
3819
            }
3820
          }
3821
        }
3822
        #endif  // XNN_MAX_UARCH_TYPES > 1
3823
      #else  // XNN_ENABLE_ASSEMBLY
3824
        if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot)) {
3825
          #if XNN_ENABLE_ARM_DOTPROD
3826
            qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c4__neondot);
3827
            qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c4__neondot);
3828
            qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c4__neondot);
3829
            qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x8c4__neondot);
3830
            qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
3831
            qd8_f32_qc8w_gemm_config.mr = 4;
3832
            qd8_f32_qc8w_gemm_config.nr = 8;
3833
            qd8_f32_qc8w_gemm_config.log2_kr = 2;
3834
          #endif  // XNN_ENABLE_ARM_DOTPROD
3835
        } else {
3836
          qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c2s4__neon_mlal);
3837
          qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c2s4__neon_mlal);
3838
          qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c2s4__neon_mlal);
3839
          qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x8c2s4__neon_mlal);
3840
          qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
3841
          qd8_f32_qc8w_gemm_config.mr = 2;
3842
          qd8_f32_qc8w_gemm_config.nr = 8;
3843
          qd8_f32_qc8w_gemm_config.log2_kr = 1;
3844
          qd8_f32_qc8w_gemm_config.log2_sr = 2;
3845
        }
3846
      #endif  // XNN_ENABLE_ASSEMBLY
3847
    } else {
3848
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x2__scalar);
3849
      qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x2__scalar);
3850
      qd8_f32_qc8w_gemm_config.mr = 1;
3851
      qd8_f32_qc8w_gemm_config.nr = 2;
3852
      qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
3853
    }
3854
  #elif XNN_ARCH_ARM64
3855
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
3856
    assert(hardware_config != NULL);
3857
    (void) hardware_config;  // May be unused.
3858
    #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC || XNN_PLATFORM_WINDOWS
3859
      #if XNN_ENABLE_ASSEMBLY
3860
        if (XNN_ENABLE_ARM_I8MM && (hardware_config->arch_flags & xnn_arch_arm_neon_i8mm)) {
3861
          #if XNN_ENABLE_ARM_I8MM
3862
            qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c8__neoni8mm);
3863
            qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c8__neoni8mm);
3864
            qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c8__neoni8mm);
3865
            qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c8__neoni8mm);
3866
            qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
3867
            qd8_f32_qc8w_gemm_config.mr = 4;
3868
            qd8_f32_qc8w_gemm_config.nr = 16;
3869
            qd8_f32_qc8w_gemm_config.log2_kr = 3;
3870
          #endif  // XNN_ENABLE_ARM_I8MM
3871
        } else if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot)) {
3872
          #if XNN_ENABLE_ARM_DOTPROD
3873
            qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__neondot);
3874
            qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__asm_aarch64_neondot_ld128);
3875
            qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c4__neondot);
3876
            qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c4__asm_aarch64_neondot_ld128);
3877
            qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
3878
            qd8_f32_qc8w_gemm_config.mr = 4;
3879
            qd8_f32_qc8w_gemm_config.nr = 16;
3880
            qd8_f32_qc8w_gemm_config.log2_kr = 2;
3881
          #endif  // XNN_ENABLE_ARM_DOTPROD
3882
        } else {
3883
          switch (hardware_config->uarch[XNN_UARCH_INDEX]) {
3884
            case xnn_uarch_cortex_a53:
3885
            case xnn_uarch_cortex_a55r0:
3886
            case xnn_uarch_cortex_a55:
3887
              qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16__neon_mlal_lane_prfm);
3888
              qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16__neon_mlal_lane_prfm);
3889
              qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16__neon_mlal_lane_prfm);
3890
              qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16__neon_mlal_lane_prfm);
3891
              qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
3892
              qd8_f32_qc8w_gemm_config.mr = 4;
3893
              qd8_f32_qc8w_gemm_config.nr = 16;
3894
              break;
3895
            default:
3896
              qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c2s4__neon_mlal);
3897
              qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c2s4__neon_mlal);
3898
              qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c2s4__neon_mlal);
3899
              qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x8c2s4__neon_mlal);
3900
              qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
3901
              qd8_f32_qc8w_gemm_config.mr = 2;
3902
              qd8_f32_qc8w_gemm_config.nr = 8;
3903
              qd8_f32_qc8w_gemm_config.log2_kr = 1;
3904
              qd8_f32_qc8w_gemm_config.log2_sr = 2;
3905
              break;
3906
          }
3907
        }
3908
      #else  // !XNN_ENABLE_ASSEMBLY
3909
        if (XNN_ENABLE_ARM_I8MM && (hardware_config->arch_flags & xnn_arch_arm_neon_i8mm)) {
3910
          #if XNN_ENABLE_ARM_I8MM
3911
            qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c8__neoni8mm);
3912
            qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c8__neoni8mm);
3913
            qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c8__neoni8mm);
3914
            qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c8__neoni8mm);
3915
            qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
3916
            qd8_f32_qc8w_gemm_config.mr = 4;
3917
            qd8_f32_qc8w_gemm_config.nr = 16;
3918
            qd8_f32_qc8w_gemm_config.log2_kr = 3;
3919
          #endif  // XNN_ENABLE_ARM_I8MM
3920
        } else if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot)) {
3921
          #if XNN_ENABLE_ARM_DOTPROD
3922
            qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__neondot);
3923
            qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__neondot);
3924
            qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c4__neondot);
3925
            qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c4__neondot);
3926
            qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
3927
            qd8_f32_qc8w_gemm_config.mr = 4;
3928
            qd8_f32_qc8w_gemm_config.nr = 16;
3929
            qd8_f32_qc8w_gemm_config.log2_kr = 2;
3930
          #endif  // XNN_ENABLE_ARM_DOTPROD
3931
        } else {
3932
          switch (hardware_config->uarch[XNN_UARCH_INDEX]) {
3933
            case xnn_uarch_cortex_a53:
3934
            case xnn_uarch_cortex_a55r0:
3935
            case xnn_uarch_cortex_a55:
3936
              qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16__neon_mlal_lane_prfm);
3937
              qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16__neon_mlal_lane_prfm);
3938
              qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16__neon_mlal_lane_prfm);
3939
              qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16__neon_mlal_lane_prfm);
3940
              qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
3941
              qd8_f32_qc8w_gemm_config.mr = 4;
3942
              qd8_f32_qc8w_gemm_config.nr = 16;
3943
              break;
3944
            default:
3945
              qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c2s4__neon_mlal);
3946
              qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c2s4__neon_mlal);
3947
              qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c2s4__neon_mlal);
3948
              qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x8c2s4__neon_mlal);
3949
              qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
3950
              qd8_f32_qc8w_gemm_config.mr = 2;
3951
              qd8_f32_qc8w_gemm_config.nr = 8;
3952
              qd8_f32_qc8w_gemm_config.log2_kr = 1;
3953
              qd8_f32_qc8w_gemm_config.log2_sr = 2;
3954
              break;
3955
          }
3956
        }
3957
      #endif  // XNN_ENABLE_ASSEMBLY
3958
    #else  // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC || XNN_PLATFORM_WINDOWS
3959
      #if XNN_ENABLE_ASSEMBLY
3960
        #if XNN_ENABLE_ARM_I8MM
3961
          if (XNN_ENABLE_ARM_I8MM && (hardware_config->arch_flags & xnn_arch_arm_neon_i8mm)) {
3962
            qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c8__neoni8mm);
3963
            qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c8__neoni8mm);
3964
            qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c8__neoni8mm);
3965
            qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c8__neoni8mm);
3966
            qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
3967
            qd8_f32_qc8w_gemm_config.mr = 4;
3968
            qd8_f32_qc8w_gemm_config.nr = 16;
3969
            qd8_f32_qc8w_gemm_config.log2_kr = 3;
3970
          } else
3971
        #endif  // XNN_ENABLE_ARM_I8MM
3972
        #if XNN_ENABLE_ARM_DOTPROD
3973
          if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot)) {
3974
            switch (hardware_config->uarch[XNN_UARCH_INDEX]) {
3975
              case xnn_uarch_cortex_a55:
3976
                qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__asm_aarch64_neondot_cortex_a55);
3977
                qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c4__asm_aarch64_neondot_cortex_a55);
3978
                break;
3979
              default:
3980
                qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__asm_aarch64_neondot_ld128);
3981
                qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c4__asm_aarch64_neondot_ld128);
3982
                break;
3983
            }
3984
            qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__neondot);
3985
            qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c4__neondot);
3986
            qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
3987
            qd8_f32_qc8w_gemm_config.mr = 4;
3988
            qd8_f32_qc8w_gemm_config.nr = 16;
3989
            qd8_f32_qc8w_gemm_config.log2_kr = 2;
3990
          } else
3991
        #endif  // XNN_ENABLE_ARM_DOTPROD
3992
        {
3993
          switch (hardware_config->uarch[XNN_UARCH_INDEX]) {
3994
            case xnn_uarch_cortex_a53:
3995
            case xnn_uarch_cortex_a55r0:
3996
            case xnn_uarch_cortex_a55:
3997
              qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16__neon_mlal_lane_prfm);
3998
              qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16__neon_mlal_lane_prfm);
3999
              qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16__neon_mlal_lane_prfm);
4000
              qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16__neon_mlal_lane_prfm);
4001
              qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
4002
              qd8_f32_qc8w_gemm_config.mr = 4;
4003
              qd8_f32_qc8w_gemm_config.nr = 16;
4004
              break;
4005
            default:
4006
              qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c2s4__neon_mlal);
4007
              qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c2s4__neon_mlal);
4008
              qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c2s4__neon_mlal);
4009
              qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x8c2s4__neon_mlal);
4010
              qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
4011
              qd8_f32_qc8w_gemm_config.mr = 2;
4012
              qd8_f32_qc8w_gemm_config.nr = 8;
4013
              qd8_f32_qc8w_gemm_config.log2_kr = 1;
4014
              qd8_f32_qc8w_gemm_config.log2_sr = 2;
4015
              break;
4016
          }
4017
        }
4018
        #if XNN_MAX_UARCH_TYPES > 1
4019
        {
4020
          /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
4021
          const uint32_t mr = qd8_f32_qc8w_gemm_config.mr;
4022
          const uint32_t nr = qd8_f32_qc8w_gemm_config.nr;
4023
          const uint32_t log2_kr = qd8_f32_qc8w_gemm_config.log2_kr;
4024
          // Avoid unused warnings.
4025
          (void) mr;
4026
          (void) nr;
4027
          (void) log2_kr;
4028
          for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
4029
            switch (hardware_config->uarch[i]) {
4030
              case xnn_uarch_cortex_a55:
4031
                #if XNN_ENABLE_ARM_DOTPROD
4032
                  if (mr == 4 && nr == 16 && log2_kr == 2 && (hardware_config->arch_flags & xnn_arch_arm_neon_dot)) {
4033
                    qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__neondot);
4034
                    qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__asm_aarch64_neondot_cortex_a55);
4035
                    qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c4__neondot);
4036
                    qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c4__asm_aarch64_neondot_cortex_a55);
4037
                  }
4038
                #endif  // XNN_ENABLE_ARM_DOTPROD
4039
                break;
4040
              default:
4041
                break;
4042
            }
4043
          }
4044
        }
4045
        #endif  // XNN_MAX_UARCH_TYPES > 1
4046
      #else  // !XNN_ENABLE_ASSEMBLY
4047
        if (XNN_ENABLE_ARM_I8MM && (hardware_config->arch_flags & xnn_arch_arm_neon_i8mm)) {
4048
          #if XNN_ENABLE_ARM_I8MM
4049
            qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c8__neoni8mm);
4050
            qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c8__neoni8mm);
4051
            qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c8__neoni8mm);
4052
            qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c8__neoni8mm);
4053
            qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
4054
            qd8_f32_qc8w_gemm_config.mr = 4;
4055
            qd8_f32_qc8w_gemm_config.nr = 16;
4056
            qd8_f32_qc8w_gemm_config.log2_kr = 3;
4057
          #endif  // XNN_ENABLE_ARM_I8MM
4058
        } else if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot)) {
4059
          #if XNN_ENABLE_ARM_DOTPROD
4060
            qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__neondot);
4061
            qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__neondot);
4062
            qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c4__neondot);
4063
            qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c4__neondot);
4064
            qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
4065
            qd8_f32_qc8w_gemm_config.mr = 4;
4066
            qd8_f32_qc8w_gemm_config.nr = 16;
4067
            qd8_f32_qc8w_gemm_config.log2_kr = 2;
4068
          #endif  // XNN_ENABLE_ARM_DOTPROD
4069
        } else {
4070
          switch (hardware_config->uarch[XNN_UARCH_INDEX]) {
4071
            case xnn_uarch_cortex_a53:
4072
            case xnn_uarch_cortex_a55r0:
4073
            case xnn_uarch_cortex_a55:
4074
              qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16__neon_mlal_lane_prfm);
4075
              qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16__neon_mlal_lane_prfm);
4076
              qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16__neon_mlal_lane_prfm);
4077
              qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16__neon_mlal_lane_prfm);
4078
              qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
4079
              qd8_f32_qc8w_gemm_config.mr = 4;
4080
              qd8_f32_qc8w_gemm_config.nr = 16;
4081
              break;
4082
            default:
4083
              qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c2s4__neon_mlal);
4084
              qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c2s4__neon_mlal);
4085
              qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c2s4__neon_mlal);
4086
              qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x8c2s4__neon_mlal);
4087
              qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
4088
              qd8_f32_qc8w_gemm_config.mr = 2;
4089
              qd8_f32_qc8w_gemm_config.nr = 8;
4090
              qd8_f32_qc8w_gemm_config.log2_kr = 1;
4091
              qd8_f32_qc8w_gemm_config.log2_sr = 2;
4092
              break;
4093
          }
4094
        }
4095
      #endif  // XNN_ENABLE_ASSEMBLY
4096
    #endif  // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC || XNN_PLATFORM_WINDOWS
4097
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
4098
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
4099
0
    assert(hardware_config != NULL);
4100
0
    (void) hardware_config;  // May be unused.
4101
0
    #if XNN_ENABLE_AVX512AMX
4102
0
      if (hardware_config->arch_flags & xnn_arch_x86_avx512amx) {
4103
0
        qd8_f32_qc8w_gemm_config.arch = xnn_arch_x86_avx512amx;
4104
0
        qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x64c4__avx512amx);
4105
0
        qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(16)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_16x64c4__avx512amx);
4106
0
        qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x64c4__avx512amx);
4107
0
        qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(16)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_16x64c4__avx512amx);
4108
0
        qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
4109
0
        qd8_f32_qc8w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
4110
0
        qd8_f32_qc8w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
4111
0
        qd8_f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
4112
0
        #if XNN_ENABLE_AVX256VNNI
4113
0
        qd8_f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_qs8_packw_gemm_goi_ukernel_x64c4__avx256vnni_prfm;
4114
        #else
4115
        qd8_f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
4116
        #endif
4117
0
        qd8_f32_qc8w_gemm_config.mr = 16;
4118
0
        qd8_f32_qc8w_gemm_config.nr = 64;
4119
0
        qd8_f32_qc8w_gemm_config.log2_kr = 2;
4120
0
      } else
4121
0
    #endif
4122
0
    #if XNN_ENABLE_AVX512SKX
4123
0
      if (hardware_config->arch_flags & xnn_arch_x86_avx512skx) {
4124
0
        qd8_f32_qc8w_gemm_config.arch = xnn_arch_x86_avx512skx;
4125
0
        qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c8__avx512skx_prfm);
4126
0
        qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(8)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x16c8__avx512skx_prfm);
4127
0
        qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c8__avx512skx_prfm);
4128
0
        qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(8)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_8x16c8__avx512skx_prfm);
4129
0
        qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
4130
0
        qd8_f32_qc8w_gemm_config.mr = 8;
4131
0
        qd8_f32_qc8w_gemm_config.nr = 16;
4132
0
        qd8_f32_qc8w_gemm_config.log2_kr = 3;
4133
0
      } else
4134
0
    #endif
4135
0
    #if XNN_ENABLE_AVX256SKX
4136
0
      if (hardware_config->arch_flags & xnn_arch_x86_avx256skx) {
4137
0
        qd8_f32_qc8w_gemm_config.arch = xnn_arch_x86_avx256skx;
4138
0
        qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__avx256skx);
4139
0
        qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(8)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x8c8__avx256skx);
4140
0
        qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__avx256skx);
4141
0
        qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(8)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_8x8c8__avx256skx);
4142
0
        qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
4143
0
        qd8_f32_qc8w_gemm_config.mr = 8;
4144
0
        qd8_f32_qc8w_gemm_config.nr = 8;
4145
0
        qd8_f32_qc8w_gemm_config.log2_kr = 3;
4146
0
      } else
4147
0
    #endif
4148
0
    #if XNN_ENABLE_AVX2
4149
0
      if (hardware_config->arch_flags & xnn_arch_x86_avx2) {
4150
0
        qd8_f32_qc8w_gemm_config.arch = xnn_arch_x86_avx2;
4151
0
        qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__avx2);
4152
0
        qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c8__avx2);
4153
0
        qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__avx2);
4154
0
        qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x8c8__avx2);
4155
0
        qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
4156
0
        qd8_f32_qc8w_gemm_config.mr = 4;
4157
0
        qd8_f32_qc8w_gemm_config.nr = 8;
4158
0
        qd8_f32_qc8w_gemm_config.log2_kr = 3;
4159
0
      } else
4160
0
    #endif
4161
0
    if (hardware_config->arch_flags & xnn_arch_x86_sse4_1) {
4162
0
      qd8_f32_qc8w_gemm_config.arch = xnn_arch_x86_sse4_1;
4163
0
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__sse41_ld64);
4164
0
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c8__sse41_ld64);
4165
0
      qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x4c8__sse41_ld64);
4166
0
      qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x4c8__sse41_ld64);
4167
0
      qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
4168
0
      qd8_f32_qc8w_gemm_config.mr = 4;
4169
0
      qd8_f32_qc8w_gemm_config.nr = 4;
4170
0
      qd8_f32_qc8w_gemm_config.log2_kr = 3;
4171
0
    } else {
4172
0
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__sse2_ld64);
4173
0
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c8__sse2_ld64);
4174
0
      qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x4c8__sse2_ld64);
4175
0
      qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x4c8__sse2_ld64);
4176
0
      qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
4177
0
      qd8_f32_qc8w_gemm_config.mr = 4;
4178
0
      qd8_f32_qc8w_gemm_config.nr = 4;
4179
0
      qd8_f32_qc8w_gemm_config.log2_kr = 3;
4180
0
    }
4181
  #elif XNN_ARCH_WASMRELAXEDSIMD
4182
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
4183
    assert(hardware_config != NULL);
4184
    (void) hardware_config;  // May be unused.
4185
    if (hardware_config->arch_flags & xnn_arch_wasm_sdot) {
4186
      if (hardware_config->is_x86) {
4187
        qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__wasmsdot);
4188
        qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c8__wasmsdot);
4189
        qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__wasmsdot);
4190
        qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x8c8__wasmsdot);
4191
        qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
4192
        qd8_f32_qc8w_gemm_config.mr = 2;
4193
        qd8_f32_qc8w_gemm_config.nr = 8;
4194
        qd8_f32_qc8w_gemm_config.log2_kr = 3;
4195
      } else {
4196
        qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__wasmsdot_u2);
4197
        qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c8__wasmsdot_u2);
4198
        qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__wasmsdot_u2);
4199
        qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x8c8__wasmsdot_u2);
4200
        qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
4201
        qd8_f32_qc8w_gemm_config.mr = 4;
4202
        qd8_f32_qc8w_gemm_config.nr = 8;
4203
        qd8_f32_qc8w_gemm_config.log2_kr = 3;
4204
      }
4205
    } else if (hardware_config->arch_flags & xnn_arch_wasm_usdot) {
4206
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__wasmusdot_u2);
4207
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c8__wasmusdot_u2);
4208
      qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__wasmusdot_u2);
4209
      qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x8c8__wasmusdot_u2);
4210
      qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
4211
      qd8_f32_qc8w_gemm_config.mr = 4;
4212
      qd8_f32_qc8w_gemm_config.nr = 8;
4213
      qd8_f32_qc8w_gemm_config.log2_kr = 3;
4214
    } else {
4215
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
4216
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
4217
      qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
4218
      qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
4219
      qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
4220
      qd8_f32_qc8w_gemm_config.mr = 4;
4221
      qd8_f32_qc8w_gemm_config.nr = 4;
4222
      qd8_f32_qc8w_gemm_config.log2_kr = 1;
4223
      qd8_f32_qc8w_gemm_config.log2_sr = 2;
4224
    }
4225
  #elif XNN_ARCH_WASMSIMD
4226
    qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
4227
    qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
4228
    qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
4229
    qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
4230
    qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
4231
    qd8_f32_qc8w_gemm_config.mr = 4;
4232
    qd8_f32_qc8w_gemm_config.nr = 4;
4233
    qd8_f32_qc8w_gemm_config.log2_kr = 1;
4234
    qd8_f32_qc8w_gemm_config.log2_sr = 2;
4235
  #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
4236
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
4237
    qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4v__rvv);
4238
    qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4v__rvv);
4239
    qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x4v__rvv);
4240
    qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x4v__rvv);
4241
    qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
4242
    qd8_f32_qc8w_gemm_config.mr = 4;
4243
    qd8_f32_qc8w_gemm_config.nr = 4 * hardware_config->vlenb / sizeof(int32_t);
4244
  #else
4245
    qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4__scalar);
4246
    qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4__scalar);
4247
    qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x4__scalar);
4248
    qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_DQIGEMM_UKERNEL(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x4__scalar);
4249
    qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
4250
    qd8_f32_qc8w_gemm_config.mr = 4;
4251
    qd8_f32_qc8w_gemm_config.nr = 4;
4252
  #endif
4253
0
  assert(qd8_f32_qc8w_gemm_config.mr <= XNN_MAX_MR);
4254
0
  assert(qd8_f32_qc8w_gemm_config.mr <= (XNN_EXTRA_QUANTIZATION_PARAMS + 1));
4255
0
}
4256
4257
0
// Populates the global qs8_qc4w_gemm_config (signed 8-bit input x
// per-channel 4-bit weights) with the best GEMM microkernels available:
// compile-time feature macros (XNN_ENABLE_*) gate which candidates are built
// in, and hardware_config->arch_flags selects among them at runtime, most
// capable ISA first.
//
// NOTE(review): the `} else` before each `#endif` deliberately chains into
// the `if` of the next enabled `#if` block (or into the final scalar `{ }`
// fallback), so exactly one branch runs regardless of which feature macros
// are compiled in. Preserve this structure when editing.
static void init_qs8_qc4w_gemm_config(void) {
  // Common parameters (shared by every arch branch below).
  qs8_qc4w_gemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_INT8_T;
  qs8_qc4w_gemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_INT8_T;
  // Filters are 4-bit, so the bit size differs from the element (byte) size.
  qs8_qc4w_gemm_config.log2_filter_element_bit_size = XNN_LOG2_BIT_SIZEOF_INT4;
  qs8_qc4w_gemm_config.bias_element_size = sizeof(int32_t);

  // Arch-specific parameters.
  #if XNN_ARCH_ARM64 && !XNN_PLATFORM_WINDOWS && XNN_ENABLE_ASSEMBLY
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
    assert(hardware_config != NULL);
    (void) hardware_config;  // May be unused.
    // ARM64: hand-written NEON dot-product assembly kernels (1x16 / 5x16).
    if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot)) {
      #if XNN_ENABLE_ARM_DOTPROD
        qs8_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x16c4__asm_aarch64_neondot_ld128_2);
        qs8_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_5x16c4__asm_aarch64_neondot_ld128_2);
        qs8_qc4w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
        qs8_qc4w_gemm_config.mr = 5;
        qs8_qc4w_gemm_config.nr = 16;
        qs8_qc4w_gemm_config.log2_kr = 2;
        // planes = 1 is paired with the "non_planar" packing routine here;
        // planes = 2 branches below use the planar qc4w packers.
        qs8_qc4w_gemm_config.planes = 1;
        qs8_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w_non_planar_aarch64;
      #endif  // XNN_ENABLE_ARM_DOTPROD
    } else
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
    assert(hardware_config != NULL);
    (void) hardware_config;  // May be unused.
    // x86 runtime dispatch, most capable ISA first.
    #if XNN_ENABLE_AVX512VNNIGFNI
      if (hardware_config->arch_flags & xnn_arch_x86_avx512vnnigfni) {
        qs8_qc4w_gemm_config.arch = xnn_arch_x86_avx512vnnigfni;
        qs8_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x16c8__avx512vnnigfni_prfm);
        qs8_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_7x16c8__avx512vnnigfni_prfm);
        qs8_qc4w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
        // VNNI kernels consume unsigned activations, hence the qs8->qu8 packer.
        qs8_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_to_qu8_qc4w_gemm_goi_w;
        qs8_qc4w_gemm_config.planes = 2;
        qs8_qc4w_gemm_config.mr = 7;
        qs8_qc4w_gemm_config.nr = 16;
        qs8_qc4w_gemm_config.log2_kr = 3;
      } else
    #endif

    // AVX512VNNI assembly kernels: amd64 only (non-Windows ABI).
    #if XNN_ENABLE_AVX512VNNI && XNN_ARCH_X86_64 && !XNN_PLATFORM_WINDOWS && XNN_ENABLE_ASSEMBLY
      if (hardware_config->arch_flags & xnn_arch_x86_avx512vnni) {
        qs8_qc4w_gemm_config.arch = xnn_arch_x86_avx512vnni;
        qs8_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x16c8__asm_amd64_avx512vnni);
        qs8_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(8)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_8x16c8__asm_amd64_avx512vnni);
        qs8_qc4w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
        qs8_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_to_qu8_qc4w_gemm_goi_w_non_planar_avx512;
        qs8_qc4w_gemm_config.planes = 1;
        qs8_qc4w_gemm_config.mr = 8;
        qs8_qc4w_gemm_config.nr = 16;
        qs8_qc4w_gemm_config.log2_kr = 3;
      } else
    #endif
    #if XNN_ENABLE_AVX256VNNI
      if (hardware_config->arch_flags & xnn_arch_x86_avx256vnni) {
        qs8_qc4w_gemm_config.arch = xnn_arch_x86_avx256vnni;
        qs8_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x8c8__avx256vnni_prfm);
        qs8_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_7x8c8__avx256vnni_prfm);
        qs8_qc4w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
        qs8_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_to_qu8_qc4w_gemm_goi_w;
        qs8_qc4w_gemm_config.planes = 2;
        qs8_qc4w_gemm_config.mr = 7;
        qs8_qc4w_gemm_config.nr = 8;
        qs8_qc4w_gemm_config.log2_kr = 3;
      } else
    #endif
    #if XNN_ENABLE_AVXVNNI
      // AVXVNNI is faster than AVX512SKX
      if (hardware_config->arch_flags & xnn_arch_x86_avxvnni) {
        qs8_qc4w_gemm_config.arch = xnn_arch_x86_avxvnni;
        qs8_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x8c8__avxvnni_prfm);
        qs8_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_5x8c8__avxvnni_prfm);
        qs8_qc4w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
        qs8_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_to_qu8_qc4w_gemm_goi_w;
        qs8_qc4w_gemm_config.planes = 2;
        qs8_qc4w_gemm_config.mr = 5;
        qs8_qc4w_gemm_config.nr = 8;
        qs8_qc4w_gemm_config.log2_kr = 3;
      } else
    #endif
    #if XNN_ENABLE_AVX512SKX
      if (hardware_config->arch_flags & xnn_arch_x86_avx512skx) {
        qs8_qc4w_gemm_config.arch = xnn_arch_x86_avx512skx;
        qs8_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x16c8__avx512skx_madd_prfm);
        qs8_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_7x16c8__avx512skx_madd_prfm);
        qs8_qc4w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
        // The "madd" kernels use the unsigned-nibble (qc4uw) packing variant.
        qs8_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_to_qu8_qc4uw_gemm_goi_w;
        qs8_qc4w_gemm_config.planes = 2;
        qs8_qc4w_gemm_config.mr = 7;
        qs8_qc4w_gemm_config.nr = 16;
        qs8_qc4w_gemm_config.log2_kr = 3;
      } else
    #endif
    #if XNN_ENABLE_AVX256SKX
      if (hardware_config->arch_flags & xnn_arch_x86_avx256skx) {
        qs8_qc4w_gemm_config.arch = xnn_arch_x86_avx256skx;
        qs8_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x8c8__avx256skx_madd_prfm);
        qs8_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_7x8c8__avx256skx_madd_prfm);
        qs8_qc4w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
        qs8_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_to_qu8_qc4uw_gemm_goi_w;
        qs8_qc4w_gemm_config.planes = 2;
        qs8_qc4w_gemm_config.mr = 7;
        qs8_qc4w_gemm_config.nr = 8;
        qs8_qc4w_gemm_config.log2_kr = 3;
      } else
    #endif
    #if XNN_ENABLE_AVX2
      if (hardware_config->arch_flags & xnn_arch_x86_avx2) {
        qs8_qc4w_gemm_config.arch = xnn_arch_x86_avx2;
        qs8_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x8c8__avx2_madd_prfm);
        qs8_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_7x8c8__avx2_madd_prfm);
        qs8_qc4w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
        qs8_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_to_qu8_qc4uw_gemm_goi_w;
        qs8_qc4w_gemm_config.planes = 2;
        qs8_qc4w_gemm_config.mr = 7;
        qs8_qc4w_gemm_config.nr = 8;
        qs8_qc4w_gemm_config.log2_kr = 3;
      } else
    #endif
    #if XNN_ENABLE_AVX
      if (hardware_config->arch_flags & xnn_arch_x86_avx) {
        qs8_qc4w_gemm_config.arch = xnn_arch_x86_avx;
        qs8_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x4c8__avx_madd_prfm);
        qs8_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_5x4c8__avx_madd_prfm);
        qs8_qc4w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
        qs8_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_to_qu8_qc4uw_gemm_goi_w;
        qs8_qc4w_gemm_config.planes = 2;
        qs8_qc4w_gemm_config.mr = 5;
        qs8_qc4w_gemm_config.nr = 4;
        qs8_qc4w_gemm_config.log2_kr = 3;
      } else
    #endif
    // SSSE3 baseline for x86 (always compiled in on this arch).
    if (hardware_config->arch_flags & xnn_arch_x86_ssse3) {
      qs8_qc4w_gemm_config.arch = xnn_arch_x86_ssse3;
      qs8_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x4c8__ssse3_madd_prfm);
      qs8_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_5x4c8__ssse3_madd_prfm);
      qs8_qc4w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
      qs8_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_to_qu8_qc4uw_gemm_goi_w;
      qs8_qc4w_gemm_config.planes = 2;
      qs8_qc4w_gemm_config.mr = 5;
      qs8_qc4w_gemm_config.nr = 4;
      qs8_qc4w_gemm_config.log2_kr = 3;
    } else
  #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
  // Portable scalar fallback: reached via the dangling `else` of whichever
  // arch chain precedes it, or directly when no arch branch is compiled in.
  {
    qs8_qc4w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
    qs8_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic);
    qs8_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic);
    qs8_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w;
    qs8_qc4w_gemm_config.planes = 2;
    qs8_qc4w_gemm_config.mr = 3;
    qs8_qc4w_gemm_config.nr = 4;
  }
  // Sanity-check the selected tile height against global limits.
  assert(qs8_qc4w_gemm_config.mr <= XNN_MAX_MR);
  assert(qs8_qc4w_gemm_config.mr <= (XNN_EXTRA_QUANTIZATION_PARAMS + 1));
}
4415
4416
1
static void init_qs8_qc8w_gemm_config(void) {
4417
  // Common parameters.
4418
1
  qs8_qc8w_gemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_INT8_T;
4419
1
  qs8_qc8w_gemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_INT8_T;
4420
1
  qs8_qc8w_gemm_config.log2_filter_element_bit_size = XNN_LOG2_SIZEOF_INT8_T + 3;
4421
1
  qs8_qc8w_gemm_config.bias_element_size = sizeof(int32_t);
4422
  // Use the same packing function throughout.
4423
1
  qs8_qc8w_gemm_config.pack_weights_and_biases =
4424
1
      (xnn_pack_weights_and_biases_fn)xnn_pack_qs8_weights_and_biases;
4425
1
  qs8_qc8w_gemm_config.packed_stride_weights_and_biases =
4426
1
      (xnn_packed_stride_weights_and_biases_fn)
4427
1
          xnn_packed_stride_qs8_weights_and_biases;
4428
1
  qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
4429
1
  qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
4430
4431
  // Arch-specific parameters.
4432
  #if XNN_ARCH_ARM
4433
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
4434
    assert(hardware_config != NULL);
4435
    (void) hardware_config;  // May be unused.
4436
    if (hardware_config->arch_flags & xnn_arch_arm_neon) {
4437
      #if XNN_ENABLE_ASSEMBLY
4438
        if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot)) {
4439
          #if XNN_ENABLE_ARM_DOTPROD
4440
            switch (hardware_config->uarch[XNN_UARCH_INDEX]) {
4441
              case xnn_uarch_cortex_a55:
4442
                qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c4__neondot);
4443
                qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c4__asm_aarch32_neondot_cortex_a55);
4444
                qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c4__neondot);
4445
                qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c4__asm_aarch32_neondot_cortex_a55);
4446
                qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
4447
                qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4448
                qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4449
                qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4450
                qs8_qc8w_gemm_config.mr = 4;
4451
                qs8_qc8w_gemm_config.nr = 8;
4452
                qs8_qc8w_gemm_config.log2_kr = 2;
4453
                break;
4454
              default:
4455
                qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c4__neondot);
4456
                qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c4__asm_aarch32_neondot_ld64);
4457
                qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c4__neondot);
4458
                qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c4__asm_aarch32_neondot_ld64);
4459
                qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
4460
                qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4461
                qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4462
                qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4463
                qs8_qc8w_gemm_config.mr = 4;
4464
                qs8_qc8w_gemm_config.nr = 8;
4465
                qs8_qc8w_gemm_config.log2_kr = 2;
4466
                break;
4467
            }
4468
          #endif  // XNN_ENABLE_ARM_DOTPROD
4469
        } else {
4470
          switch (hardware_config->uarch[XNN_UARCH_INDEX]) {
4471
            case xnn_uarch_cortex_a5:
4472
            case xnn_uarch_cortex_a7:
4473
            case xnn_uarch_krait:
4474
            case xnn_uarch_kryo:
4475
              qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm);
4476
              qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm);
4477
              qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm);
4478
              qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm);
4479
              qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params;
4480
              qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4481
              qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4482
              qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4483
              qs8_qc8w_gemm_config.mr = 4;
4484
              qs8_qc8w_gemm_config.nr = 8;
4485
              break;
4486
            case xnn_uarch_cortex_a32:
4487
            case xnn_uarch_cortex_a35:
4488
              qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_cortex_a35);
4489
              qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_cortex_a35);
4490
              qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_cortex_a35);
4491
              qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_cortex_a35);
4492
              qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
4493
              qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4494
              qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4495
              qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4496
              qs8_qc8w_gemm_config.mr = 4;
4497
              qs8_qc8w_gemm_config.nr = 8;
4498
              break;
4499
            case xnn_uarch_cortex_a53:
4500
            case xnn_uarch_cortex_a57:
4501
              qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_cortex_a35_prfm);
4502
              qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_cortex_a53_prfm);
4503
              qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_cortex_a35_prfm);
4504
              qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_cortex_a53_prfm);
4505
              qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
4506
              qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4507
              qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4508
              qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4509
              qs8_qc8w_gemm_config.mr = 4;
4510
              qs8_qc8w_gemm_config.nr = 8;
4511
              break;
4512
            case xnn_uarch_cortex_a55r0:
4513
            case xnn_uarch_cortex_a55:
4514
              qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_cortex_a35);
4515
              qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_cortex_a53);
4516
              qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_cortex_a35);
4517
              qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_cortex_a53);
4518
              qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
4519
              qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4520
              qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4521
              qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4522
              qs8_qc8w_gemm_config.mr = 4;
4523
              qs8_qc8w_gemm_config.nr = 8;
4524
              break;
4525
            case xnn_uarch_cortex_a72:
4526
              qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
4527
              qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
4528
              qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
4529
              qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
4530
              qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
4531
              qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4532
              qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4533
              qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4534
              qs8_qc8w_gemm_config.mr = 2;
4535
              qs8_qc8w_gemm_config.nr = 8;
4536
              qs8_qc8w_gemm_config.log2_kr = 1;
4537
              qs8_qc8w_gemm_config.log2_sr = 2;
4538
              break;
4539
            case xnn_uarch_exynos_m1:
4540
            case xnn_uarch_exynos_m2:
4541
            case xnn_uarch_exynos_m3:
4542
              qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_cortex_a35_prfm);
4543
              qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_ld64_prfm);
4544
              qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_cortex_a35_prfm);
4545
              qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_ld64_prfm);
4546
              qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
4547
              qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4548
              qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4549
              qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4550
              qs8_qc8w_gemm_config.mr = 4;
4551
              qs8_qc8w_gemm_config.nr = 8;
4552
              break;
4553
4554
            default:
4555
              if (hardware_config->arch_flags & xnn_arch_arm_neon_v8) {
4556
                qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_cortex_a35);
4557
                qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_ld64);
4558
                qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_cortex_a35);
4559
                qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_ld64);
4560
                qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
4561
                qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4562
                qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4563
                qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4564
                qs8_qc8w_gemm_config.mr = 4;
4565
                qs8_qc8w_gemm_config.nr = 8;
4566
              } else {
4567
                qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7);
4568
                qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8__asm_aarch32_neon_mlal_lane_ld64);
4569
                qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7);
4570
                qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8__asm_aarch32_neon_mlal_lane_ld64);
4571
                qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params;
4572
                qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4573
                qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4574
                qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4575
                qs8_qc8w_gemm_config.mr = 4;
4576
                qs8_qc8w_gemm_config.nr = 8;
4577
              }
4578
              break;
4579
          }
4580
        }
4581
        #if XNN_MAX_UARCH_TYPES > 1
4582
        {
4583
          /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
4584
          const uint32_t mr = qs8_qc8w_gemm_config.mr;
4585
          const uint32_t nr = qs8_qc8w_gemm_config.nr;
4586
          const uint32_t log2_kr = qs8_qc8w_gemm_config.log2_kr;
4587
          for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
4588
            switch (hardware_config->uarch[i]) {
4589
              case xnn_uarch_cortex_a53:
4590
                if (mr == 4 && nr == 8 && log2_kr == 0) {
4591
                  qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_cortex_a35_prfm);
4592
                  qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_cortex_a53_prfm);
4593
                  qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_cortex_a35_prfm);
4594
                  qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_cortex_a53_prfm);
4595
                }
4596
                break;
4597
              case xnn_uarch_cortex_a55:
4598
                #if XNN_ENABLE_ARM_DOTPROD
4599
                  if (mr == 4 && nr == 8 && log2_kr == 2 && (hardware_config->arch_flags & xnn_arch_arm_neon_dot)) {
4600
                    qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c4__neondot);
4601
                    qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c4__asm_aarch32_neondot_cortex_a55);
4602
                    qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c4__neondot);
4603
                    qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c4__asm_aarch32_neondot_cortex_a55);
4604
                    break;
4605
                  }
4606
                #endif  // XNN_ENABLE_ARM_DOTPROD
4607
                XNN_FALLTHROUGH
4608
              case xnn_uarch_cortex_a55r0:
4609
                if (mr == 4 && nr == 8 && log2_kr == 0) {
4610
                  qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_cortex_a35);
4611
                  qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_cortex_a53);
4612
                  qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8__asm_aarch32_neonv8_mlal_lane_cortex_a35);
4613
                  qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8__asm_aarch32_neonv8_mlal_lane_cortex_a53);
4614
                }
4615
                break;
4616
4617
              default:
4618
                break;
4619
            }
4620
          }
4621
        }
4622
        #endif  // XNN_MAX_UARCH_TYPES > 1
4623
      #else  // XNN_ENABLE_ASSEMBLY
4624
        if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot)) {
4625
          #if XNN_ENABLE_ARM_DOTPROD
4626
            qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c4__neondot);
4627
            qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c4__neondot);
4628
            qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c4__neondot);
4629
            qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c4__neondot);
4630
            qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
4631
            qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4632
            qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4633
            qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4634
            qs8_qc8w_gemm_config.mr = 4;
4635
            qs8_qc8w_gemm_config.nr = 8;
4636
            qs8_qc8w_gemm_config.log2_kr = 2;
4637
          #endif  // XNN_ENABLE_ARM_DOTPROD
4638
        } else if (hardware_config->arch_flags & xnn_arch_arm_neon_v8) {
4639
          qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
4640
          qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
4641
          qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
4642
          qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
4643
          qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
4644
          qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4645
          qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4646
          qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4647
          qs8_qc8w_gemm_config.mr = 2;
4648
          qs8_qc8w_gemm_config.nr = 8;
4649
          qs8_qc8w_gemm_config.log2_kr = 1;
4650
          qs8_qc8w_gemm_config.log2_sr = 2;
4651
        } else {
4652
          qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal);
4653
          qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal);
4654
          qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal);
4655
          qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal);
4656
          qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params;
4657
          qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4658
          qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4659
          qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4660
          qs8_qc8w_gemm_config.mr = 2;
4661
          qs8_qc8w_gemm_config.nr = 8;
4662
          qs8_qc8w_gemm_config.log2_kr = 1;
4663
          qs8_qc8w_gemm_config.log2_sr = 2;
4664
        }
4665
      #endif  // XNN_ENABLE_ASSEMBLY
4666
    } else {
4667
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x2c4__armsimd32);
4668
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x2c4__armsimd32);
4669
      qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x2c4__armsimd32);
4670
      qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x2c4__armsimd32);
4671
      qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_armsimd32_params;
4672
      qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4673
      qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4674
      qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4675
      qs8_qc8w_gemm_config.mr = 2;
4676
      qs8_qc8w_gemm_config.nr = 2;
4677
      qs8_qc8w_gemm_config.log2_kr = 2;
4678
    }
4679
  #elif XNN_ARCH_ARM64
4680
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
4681
    assert(hardware_config != NULL);
4682
    (void) hardware_config;  // May be unused.
4683
    #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC || XNN_PLATFORM_WINDOWS
4684
      #if XNN_ENABLE_ASSEMBLY
4685
        if (XNN_ENABLE_ARM_I8MM && (hardware_config->arch_flags & xnn_arch_arm_neon_i8mm)) {
4686
          #if XNN_ENABLE_ARM_I8MM
4687
            qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__neoni8mm);
4688
            qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c8__neoni8mm);
4689
            qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__neoni8mm);
4690
            qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c8__neoni8mm);
4691
            qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
4692
            qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4693
            qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4694
            qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4695
            qs8_qc8w_gemm_config.mr = 4;
4696
            qs8_qc8w_gemm_config.nr = 16;
4697
            qs8_qc8w_gemm_config.log2_kr = 3;
4698
          #endif  // XNN_ENABLE_ARM_I8MM
4699
        } else if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot)) {
4700
          #if XNN_ENABLE_ARM_DOTPROD
4701
            qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__neondot);
4702
            qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__asm_aarch64_neondot_ld128);
4703
            qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__neondot);
4704
            qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__asm_aarch64_neondot_ld128);
4705
            qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
4706
            qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4707
            qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4708
            qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4709
            qs8_qc8w_gemm_config.mr = 4;
4710
            qs8_qc8w_gemm_config.nr = 16;
4711
            qs8_qc8w_gemm_config.log2_kr = 2;
4712
          #endif  // XNN_ENABLE_ARM_DOTPROD
4713
        } else {
4714
          qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__asm_aarch64_neon_mlal);
4715
          qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c8__asm_aarch64_neon_mlal);
4716
          qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__asm_aarch64_neon_mlal);
4717
          qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c8__asm_aarch64_neon_mlal);
4718
          qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
4719
          qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4720
          qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4721
          qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4722
          qs8_qc8w_gemm_config.mr = 2;
4723
          qs8_qc8w_gemm_config.nr = 8;
4724
          qs8_qc8w_gemm_config.log2_kr = 3;
4725
        }
4726
      #else  // !XNN_ENABLE_ASSEMBLY
4727
        if (XNN_ENABLE_ARM_I8MM && (hardware_config->arch_flags & xnn_arch_arm_neon_i8mm)) {
4728
          #if XNN_ENABLE_ARM_I8MM
4729
            qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__neoni8mm);
4730
            qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c8__neoni8mm);
4731
            qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__neoni8mm);
4732
            qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c8__neoni8mm);
4733
            qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
4734
            qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4735
            qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4736
            qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4737
            qs8_qc8w_gemm_config.mr = 4;
4738
            qs8_qc8w_gemm_config.nr = 16;
4739
            qs8_qc8w_gemm_config.log2_kr = 3;
4740
          #endif  // XNN_ENABLE_ARM_I8MM
4741
        } else if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot)) {
4742
          #if XNN_ENABLE_ARM_DOTPROD
4743
            qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__neondot);
4744
            qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__neondot);
4745
            qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__neondot);
4746
            qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__neondot);
4747
            qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
4748
            qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4749
            qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4750
            qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4751
            qs8_qc8w_gemm_config.mr = 4;
4752
            qs8_qc8w_gemm_config.nr = 16;
4753
            qs8_qc8w_gemm_config.log2_kr = 2;
4754
          #endif  // XNN_ENABLE_ARM_DOTPROD
4755
        } else {
4756
          qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
4757
          qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
4758
          qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
4759
          qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
4760
          qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
4761
          qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4762
          qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4763
          qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4764
          qs8_qc8w_gemm_config.mr = 2;
4765
          qs8_qc8w_gemm_config.nr = 8;
4766
          qs8_qc8w_gemm_config.log2_kr = 1;
4767
          qs8_qc8w_gemm_config.log2_sr = 2;
4768
        }
4769
      #endif  // XNN_ENABLE_ASSEMBLY
4770
    #else  // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
4771
      #if XNN_ENABLE_ASSEMBLY
4772
        if (XNN_ENABLE_ARM_I8MM && (hardware_config->arch_flags & xnn_arch_arm_neon_i8mm)) {
4773
          #if XNN_ENABLE_ARM_I8MM
4774
            qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__neoni8mm);
4775
            qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c8__neoni8mm);
4776
            qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__neoni8mm);
4777
            qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c8__neoni8mm);
4778
            qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
4779
            qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4780
            qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4781
            qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4782
            qs8_qc8w_gemm_config.mr = 4;
4783
            qs8_qc8w_gemm_config.nr = 16;
4784
            qs8_qc8w_gemm_config.log2_kr = 3;
4785
          #endif  // XNN_ENABLE_ARM_I8MM
4786
        } else if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot)) {
4787
          #if XNN_ENABLE_ARM_DOTPROD
4788
            switch (hardware_config->uarch[XNN_UARCH_INDEX]) {
4789
              case xnn_uarch_cortex_a55:
4790
                qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__asm_aarch64_neondot_cortex_a55);
4791
                qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__asm_aarch64_neondot_cortex_a55);
4792
                break;
4793
              default:
4794
                qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__asm_aarch64_neondot_ld128);
4795
                qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__asm_aarch64_neondot_ld128);
4796
                break;
4797
            }
4798
            qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__neondot);
4799
            qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__neondot);
4800
            qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
4801
            qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4802
            qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4803
            qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4804
            qs8_qc8w_gemm_config.mr = 4;
4805
            qs8_qc8w_gemm_config.nr = 16;
4806
            qs8_qc8w_gemm_config.log2_kr = 2;
4807
          #endif  // XNN_ENABLE_ARM_DOTPROD
4808
        } else {
4809
          switch (hardware_config->uarch[XNN_UARCH_INDEX]) {
4810
            case xnn_uarch_cortex_a35:
4811
            case xnn_uarch_kryo:
4812
              qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
4813
              qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16__asm_aarch64_neon_mlal_lane_ld64);
4814
              qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
4815
              qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16__asm_aarch64_neon_mlal_lane_ld64);
4816
              qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
4817
              qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4818
              qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4819
              qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4820
              qs8_qc8w_gemm_config.mr = 4;
4821
              qs8_qc8w_gemm_config.nr = 16;
4822
              break;
4823
4824
            case xnn_uarch_cortex_a53:
4825
            case xnn_uarch_cortex_a55r0:
4826
            case xnn_uarch_cortex_a55:
4827
              qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
4828
              qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a53_prfm);
4829
              qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
4830
              qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a53_prfm);
4831
              qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
4832
              qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4833
              qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4834
              qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4835
              qs8_qc8w_gemm_config.mr = 4;
4836
              qs8_qc8w_gemm_config.nr = 16;
4837
              break;
4838
4839
            case xnn_uarch_cortex_a72:
4840
            case xnn_uarch_cortex_a73:
4841
              qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__asm_aarch64_neon_mlal_prfm);
4842
              qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c8__asm_aarch64_neon_mlal_prfm);
4843
              qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__asm_aarch64_neon_mlal_prfm);
4844
              qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c8__asm_aarch64_neon_mlal_prfm);
4845
              qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
4846
              qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4847
              qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4848
              qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4849
              qs8_qc8w_gemm_config.mr = 2;
4850
              qs8_qc8w_gemm_config.nr = 8;
4851
              qs8_qc8w_gemm_config.log2_kr = 3;
4852
              break;
4853
4854
            default:
4855
              qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__asm_aarch64_neon_mlal);
4856
              qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c8__asm_aarch64_neon_mlal);
4857
              qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__asm_aarch64_neon_mlal);
4858
              qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c8__asm_aarch64_neon_mlal);
4859
              qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
4860
              qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4861
              qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4862
              qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4863
              qs8_qc8w_gemm_config.mr = 2;
4864
              qs8_qc8w_gemm_config.nr = 8;
4865
              qs8_qc8w_gemm_config.log2_kr = 3;
4866
              break;
4867
          }
4868
        }
4869
        #if XNN_MAX_UARCH_TYPES > 1
4870
        {
4871
          /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
4872
          const uint32_t mr = qs8_qc8w_gemm_config.mr;
4873
          const uint32_t nr = qs8_qc8w_gemm_config.nr;
4874
          const uint32_t log2_kr = qs8_qc8w_gemm_config.log2_kr;
4875
          for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
4876
            switch (hardware_config->uarch[i]) {
4877
              case xnn_uarch_cortex_a55:
4878
                #if XNN_ENABLE_ARM_DOTPROD
4879
                  if (mr == 4 && nr == 16 && log2_kr == 2 && (hardware_config->arch_flags & xnn_arch_arm_neon_dot)) {
4880
                    qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__neondot);
4881
                    qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__asm_aarch64_neondot_cortex_a55);
4882
                    qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__neondot);
4883
                    qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__asm_aarch64_neondot_cortex_a55);
4884
                    break;
4885
                  }
4886
                #endif  // XNN_ENABLE_ARM_DOTPROD
4887
                XNN_FALLTHROUGH
4888
              case xnn_uarch_cortex_a53:
4889
              case xnn_uarch_cortex_a55r0:
4890
                if (mr == 2 && nr == 8 && log2_kr == 3) {
4891
                  qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__asm_aarch64_neon_mlal_cortex_a53_prfm);
4892
                  qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(2)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c8__asm_aarch64_neon_mlal_cortex_a53_prfm);
4893
                  qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__asm_aarch64_neon_mlal_cortex_a53_prfm);
4894
                  qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(2)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c8__asm_aarch64_neon_mlal_cortex_a53_prfm);
4895
                }
4896
                break;
4897
4898
              default:
4899
                break;
4900
            }
4901
          }
4902
        }
4903
        #endif  // XNN_MAX_UARCH_TYPES > 1
4904
      #else  // !XNN_ENABLE_ASSEMBLY
4905
        if (XNN_ENABLE_ARM_I8MM && (hardware_config->arch_flags & xnn_arch_arm_neon_i8mm)) {
4906
          #if XNN_ENABLE_ARM_I8MM
4907
            qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__neoni8mm);
4908
            qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c8__neoni8mm);
4909
            qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__neoni8mm);
4910
            qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c8__neoni8mm);
4911
            qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
4912
            qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4913
            qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4914
            qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4915
            qs8_qc8w_gemm_config.mr = 4;
4916
            qs8_qc8w_gemm_config.nr = 16;
4917
            qs8_qc8w_gemm_config.log2_kr = 3;
4918
          #endif  // XNN_ENABLE_ARM_I8MM
4919
        } else if (XNN_ENABLE_ARM_DOTPROD && (hardware_config->arch_flags & xnn_arch_arm_neon_dot)) {
4920
          #if XNN_ENABLE_ARM_DOTPROD
4921
            qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__neondot);
4922
            qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__neondot);
4923
            qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__neondot);
4924
            qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__neondot);
4925
            qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
4926
            qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4927
            qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4928
            qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4929
            qs8_qc8w_gemm_config.mr = 4;
4930
            qs8_qc8w_gemm_config.nr = 16;
4931
            qs8_qc8w_gemm_config.log2_kr = 2;
4932
          #endif  // XNN_ENABLE_ARM_DOTPROD
4933
        } else {
4934
          qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
4935
          qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
4936
          qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
4937
          qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
4938
          qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
4939
          qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4940
          qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4941
          qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4942
          qs8_qc8w_gemm_config.mr = 2;
4943
          qs8_qc8w_gemm_config.nr = 8;
4944
          qs8_qc8w_gemm_config.log2_kr = 1;
4945
          qs8_qc8w_gemm_config.log2_sr = 2;
4946
        }
4947
      #endif  // XNN_ENABLE_ASSEMBLY
4948
    #endif  // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC || XNN_PLATFORM_WINDOWS
4949
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
4950
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
4951
1
    assert(hardware_config != NULL);
4952
1
    (void) hardware_config;  // May be unused.
4953
1
    #if XNN_ENABLE_AVX512AMX
4954
1
      if (hardware_config->arch_flags & xnn_arch_x86_avx512amx) {
4955
0
        qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x64c4__avx512amx);
4956
0
        qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(16)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_16x64c4__avx512amx);
4957
0
        qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x64c4__avx512amx);
4958
0
        qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(16)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_16x64c4__avx512amx);
4959
0
        qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
4960
0
        qs8_qc8w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
4961
0
        qs8_qc8w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
4962
0
        qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
4963
0
        #if XNN_ENABLE_AVX256VNNI
4964
0
        qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_qs8_packw_gemm_goi_ukernel_x64c4__avx256vnni_prfm;
4965
        #else
4966
        qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
4967
        #endif
4968
0
        qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
4969
0
        qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
4970
0
        qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
4971
0
        qs8_qc8w_gemm_config.mr = 16;
4972
0
        qs8_qc8w_gemm_config.nr = 64;
4973
0
        qs8_qc8w_gemm_config.log2_kr = 2;
4974
0
      } else
4975
1
    #endif
4976
1
    #if XNN_ENABLE_AVX512VNNI
4977
1
      if (hardware_config->arch_flags & xnn_arch_x86_avx512vnni) {
4978
0
        qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__avx512vnni_prfm);
4979
0
        qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_7x16c8__avx512vnni_prfm);
4980
0
        qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__avx512vnni_prfm);
4981
0
        qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(7)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_7x16c8__avx512vnni_prfm);
4982
0
        qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
4983
0
        qs8_qc8w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
4984
0
        qs8_qc8w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
4985
0
        qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_to_qu8_gemm_gio_w;
4986
0
        #if XNN_ENABLE_AVX256VNNI
4987
0
          qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avx256vnni;
4988
        #else
4989
          qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__scalar;
4990
        #endif
4991
0
        qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_to_qu8_conv_goki_w;
4992
0
        qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_to_qu8_conv_kgo_w;
4993
0
        qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_to_qu8_deconv_goki_w;
4994
0
        qs8_qc8w_gemm_config.mr = 7;
4995
0
        qs8_qc8w_gemm_config.nr = 16;
4996
0
        qs8_qc8w_gemm_config.log2_kr = 3;
4997
0
      } else
4998
1
    #endif
4999
1
    #if XNN_ENABLE_AVXVNNIINT8 && XNN_ENABLE_AVXVNNI
5000
1
      if ((hardware_config->arch_flags & xnn_arch_x86_avxvnniint8) && (hardware_config->arch_flags & xnn_arch_x86_avxvnni)) {
5001
0
        qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avxvnniint8_prfm);
5002
0
        qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_5x8c8__avxvnniint8_prfm);
5003
0
        qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avxvnniint8_prfm);
5004
0
        qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_5x8c8__avxvnniint8_prfm);
5005
0
        qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
5006
0
        qs8_qc8w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
5007
0
        qs8_qc8w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
5008
0
        qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
5009
0
        qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_qs8_packw_gemm_goi_ukernel_x8c8__avxvnni;
5010
0
        qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
5011
0
        qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
5012
0
        qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
5013
0
        qs8_qc8w_gemm_config.mr = 5;
5014
0
        qs8_qc8w_gemm_config.nr = 8;
5015
0
        qs8_qc8w_gemm_config.log2_kr = 3;
5016
0
      } else
5017
1
    #endif
5018
1
    #if XNN_ENABLE_AVXVNNI
5019
1
      if (hardware_config->arch_flags & xnn_arch_x86_avxvnni) {
5020
0
        qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avxvnni_prfm);
5021
0
        qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_5x8c8__avxvnni_prfm);
5022
0
        qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avxvnni_prfm);
5023
0
        qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(5)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_5x8c8__avxvnni_prfm);
5024
0
        qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
5025
0
        qs8_qc8w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
5026
0
        qs8_qc8w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
5027
0
        qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_to_qu8_gemm_gio_w;
5028
0
        qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__avxvnni;
5029
0
        qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_to_qu8_conv_goki_w;
5030
0
        qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_to_qu8_conv_kgo_w;
5031
0
        qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_to_qu8_deconv_goki_w;
5032
0
        qs8_qc8w_gemm_config.mr = 5;
5033
0
        qs8_qc8w_gemm_config.nr = 8;
5034
0
        qs8_qc8w_gemm_config.log2_kr = 3;
5035
0
      } else
5036
1
    #endif
5037
1
    #if XNN_ENABLE_AVX512SKX
5038
1
      if (hardware_config->arch_flags & xnn_arch_x86_avx512skx) {
5039
0
        qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__avx512skx_prfm);
5040
0
        qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_7x16c8__avx512skx_prfm);
5041
0
        qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__avx512skx_prfm);
5042
0
        qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(7)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_7x16c8__avx512skx_prfm);
5043
0
        qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
5044
0
        qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
5045
0
        qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
5046
0
        qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
5047
0
        qs8_qc8w_gemm_config.mr = 7;
5048
0
        qs8_qc8w_gemm_config.nr = 16;
5049
0
        qs8_qc8w_gemm_config.log2_kr = 3;
5050
0
      } else
5051
1
    #endif
5052
1
    #if XNN_ENABLE_AVX256SKX && XNN_ENABLE_AVX2
5053
1
      if (hardware_config->arch_flags & xnn_arch_x86_avx256skx) {
5054
0
        qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx256skx);
5055
0
        qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c8__avx256skx);
5056
0
        qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avx256skx);
5057
0
        qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c8__avx256skx);
5058
0
        qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
5059
0
        qs8_qc8w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
5060
0
        qs8_qc8w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
5061
0
        qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_qs8_packw_gemm_gio_ukernel_x8c8__scalar;
5062
0
        qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_qs8_packw_gemm_goi_ukernel_x8c8__avx2_madd;
5063
0
        qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
5064
0
        qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
5065
0
        qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
5066
0
        qs8_qc8w_gemm_config.mr = 4;
5067
0
        qs8_qc8w_gemm_config.nr = 8;
5068
0
        qs8_qc8w_gemm_config.log2_kr = 3;
5069
0
      } else
5070
1
    #endif
5071
1
    #if XNN_ENABLE_AVX2
5072
1
      if (hardware_config->arch_flags & xnn_arch_x86_avx2) {
5073
1
        qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx2);
5074
1
        qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x8c8__avx2);
5075
1
        qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avx2);
5076
1
        qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x8c8__avx2);
5077
1
        qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
5078
1
        qs8_qc8w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
5079
1
        qs8_qc8w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
5080
1
        qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_qs8_packw_gemm_gio_ukernel_x8c8__scalar;
5081
1
        qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_qs8_packw_gemm_goi_ukernel_x8c8__avx2_madd;
5082
1
        qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
5083
1
        qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
5084
1
        qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
5085
1
        qs8_qc8w_gemm_config.mr = 3;
5086
1
        qs8_qc8w_gemm_config.nr = 8;
5087
1
        qs8_qc8w_gemm_config.log2_kr = 3;
5088
1
      } else
5089
0
    #endif
5090
0
    #if XNN_ENABLE_AVX
5091
0
      if (hardware_config->arch_flags & xnn_arch_x86_avx) {
5092
0
        qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
5093
0
        qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
5094
0
        qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
5095
0
        qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
5096
0
        qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
5097
0
        qs8_qc8w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
5098
0
        qs8_qc8w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
5099
0
        qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_qs8_packw_gemm_gio_ukernel_x4c8__scalar;
5100
0
        qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_qs8_packw_gemm_goi_ukernel_x4c8__scalar;
5101
0
        qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
5102
0
        qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
5103
0
        qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
5104
0
        qs8_qc8w_gemm_config.mr = 2;
5105
0
        qs8_qc8w_gemm_config.nr = 4;
5106
0
        qs8_qc8w_gemm_config.log2_kr = 3;
5107
0
      } else
5108
0
    #endif
5109
0
    if (hardware_config->arch_flags & xnn_arch_x86_sse4_1) {
5110
0
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
5111
0
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
5112
0
      qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
5113
0
      qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
5114
0
      qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
5115
0
      qs8_qc8w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
5116
0
      qs8_qc8w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
5117
0
      qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_qs8_packw_gemm_gio_ukernel_x4c8__scalar;
5118
0
      qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_qs8_packw_gemm_goi_ukernel_x4c8__scalar;
5119
0
      qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
5120
0
      qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
5121
0
      qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
5122
0
      qs8_qc8w_gemm_config.mr = 3;
5123
0
      qs8_qc8w_gemm_config.nr = 4;
5124
0
      qs8_qc8w_gemm_config.log2_kr = 3;
5125
0
    } else {
5126
0
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
5127
0
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
5128
0
      qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
5129
0
      qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
5130
0
      qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
5131
0
      qs8_qc8w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
5132
0
      qs8_qc8w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
5133
0
      qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_qs8_packw_gemm_gio_ukernel_x4c8__scalar;
5134
0
      qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_qs8_packw_gemm_goi_ukernel_x4c8__scalar;
5135
0
      qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
5136
0
      qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
5137
0
      qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
5138
0
      qs8_qc8w_gemm_config.mr = 3;
5139
0
      qs8_qc8w_gemm_config.nr = 4;
5140
0
      qs8_qc8w_gemm_config.log2_kr = 3;
5141
0
    }
5142
  #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5143
    #if XNN_ARCH_WASMRELAXEDSIMD
5144
      const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
5145
      assert(hardware_config != NULL);
5146
    (void) hardware_config;  // May be unused.
5147
      (void) hardware_config;  // May be unused.
5148
      if (hardware_config->arch_flags & xnn_arch_wasm_sdot) {
5149
        if (hardware_config->is_x86) {
5150
          #if XNN_ENABLE_WASM_REVECTORIZE
5151
            qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmsdot_u2);
5152
            qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmsdot_u2);
5153
            qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__wasmsdot_u2);
5154
            qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__wasmsdot_u2);
5155
            qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
5156
            qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
5157
            qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
5158
            qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
5159
            qs8_qc8w_gemm_config.mr = 4;
5160
            qs8_qc8w_gemm_config.nr = 16;
5161
            qs8_qc8w_gemm_config.log2_kr = 2;
5162
          #else
5163
            qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__wasmsdot);
5164
            qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c8__wasmsdot);
5165
            qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__wasmsdot);
5166
            qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c8__wasmsdot);
5167
            qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
5168
            qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
5169
            qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
5170
            qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
5171
            qs8_qc8w_gemm_config.mr = 2;
5172
            qs8_qc8w_gemm_config.nr = 8;
5173
            qs8_qc8w_gemm_config.log2_kr = 3;
5174
          #endif
5175
        } else {
5176
          qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__wasmsdot_u2);
5177
          qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c8__wasmsdot_u2);
5178
          qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__wasmsdot_u2);
5179
          qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c8__wasmsdot_u2);
5180
          qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
5181
          qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
5182
          qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
5183
          qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
5184
          qs8_qc8w_gemm_config.mr = 4;
5185
          qs8_qc8w_gemm_config.nr = 8;
5186
          qs8_qc8w_gemm_config.log2_kr = 3;
5187
        }
5188
      } else if (hardware_config->arch_flags & xnn_arch_wasm_usdot) {
5189
        #if XNN_ENABLE_WASM_REVECTORIZE
5190
          qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmusdot_u2);
5191
          qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmusdot_u2);
5192
          qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__wasmusdot_u2);
5193
          qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__wasmusdot_u2);
5194
          qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
5195
          qs8_qc8w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
5196
          qs8_qc8w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
5197
          qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_to_qu8_gemm_gio_w;
5198
          qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_to_qu8_gemm_goi_w;
5199
          qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_to_qu8_conv_goki_w;
5200
          qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_to_qu8_conv_kgo_w;
5201
          qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_to_qu8_deconv_goki_w;
5202
          qs8_qc8w_gemm_config.mr = 4;
5203
          qs8_qc8w_gemm_config.nr = 16;
5204
          qs8_qc8w_gemm_config.log2_kr = 2;
5205
        #else
5206
          qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__wasmusdot_u2);
5207
          qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c8__wasmusdot_u2);
5208
          qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__wasmusdot_u2);
5209
          qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c8__wasmusdot_u2);
5210
          qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
5211
          qs8_qc8w_gemm_config.pack_weights_and_biases = NULL;  // Override the default packing function.
5212
          qs8_qc8w_gemm_config.packed_stride_weights_and_biases = NULL;  // Override the default packing function.
5213
          qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_to_qu8_gemm_gio_w;
5214
          qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_to_qu8_gemm_goi_w;
5215
          qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_to_qu8_conv_goki_w;
5216
          qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_to_qu8_conv_kgo_w;
5217
          qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_to_qu8_deconv_goki_w;
5218
          qs8_qc8w_gemm_config.mr = 4;
5219
          qs8_qc8w_gemm_config.nr = 8;
5220
          qs8_qc8w_gemm_config.log2_kr = 3;
5221
        #endif
5222
      } else {
5223
        #if XNN_ENABLE_WASM_REVECTORIZE
5224
          qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c2s2__wasmsimd_dot16x2);
5225
          qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c2s2__wasmsimd_dot16x2);
5226
          qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c2s2__wasmsimd_dot16x2);
5227
          qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c2s2__wasmsimd_dot16x2);
5228
          qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
5229
          qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
5230
          qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
5231
          qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
5232
          qs8_qc8w_gemm_config.mr = 4;
5233
          qs8_qc8w_gemm_config.nr = 16;
5234
          qs8_qc8w_gemm_config.log2_kr = 1;
5235
          qs8_qc8w_gemm_config.log2_sr = 1;
5236
        #else
5237
          qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
5238
          qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
5239
          qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
5240
          qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
5241
          qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
5242
          qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
5243
          qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
5244
          qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
5245
          qs8_qc8w_gemm_config.mr = 4;
5246
          qs8_qc8w_gemm_config.nr = 4;
5247
          qs8_qc8w_gemm_config.log2_kr = 1;
5248
          qs8_qc8w_gemm_config.log2_sr = 2;
5249
        #endif
5250
      }
5251
    #else
5252
      #if XNN_ENABLE_WASM_REVECTORIZE
5253
        qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c2s2__wasmsimd_dot16x2);
5254
        qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c2s2__wasmsimd_dot16x2);
5255
        qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c2s2__wasmsimd_dot16x2);
5256
        qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c2s2__wasmsimd_dot16x2);
5257
        qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
5258
        qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
5259
        qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
5260
        qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
5261
        qs8_qc8w_gemm_config.mr = 4;
5262
        qs8_qc8w_gemm_config.nr = 16;
5263
        qs8_qc8w_gemm_config.log2_kr = 1;
5264
        qs8_qc8w_gemm_config.log2_sr = 1;
5265
      #else
5266
        qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
5267
        qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
5268
        qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
5269
        qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
5270
        qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
5271
        qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
5272
        qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
5273
        qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
5274
        qs8_qc8w_gemm_config.mr = 4;
5275
        qs8_qc8w_gemm_config.nr = 4;
5276
        qs8_qc8w_gemm_config.log2_kr = 1;
5277
        qs8_qc8w_gemm_config.log2_sr = 2;
5278
      #endif
5279
    #endif
5280
  #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
5281
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
5282
    qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4v__rvv);
5283
    qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4v__rvv);
5284
    qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4v__rvv);
5285
    qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4v__rvv);
5286
    qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
5287
    qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
5288
    qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
5289
    qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
5290
    qs8_qc8w_gemm_config.mr = 4;
5291
    qs8_qc8w_gemm_config.nr = 4 * hardware_config->vlenb / sizeof(int32_t);
5292
  #elif XNN_ARCH_HEXAGON && XNN_ENABLE_HVX
5293
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
5294
    assert(hardware_config != NULL);
5295
    if (hardware_config->arch_flags & xnn_arch_hvx) {
5296
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x128c4__hvx);
5297
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x128c4__hvx);
5298
      qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x128c4__hvx);
5299
      qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x128c4__hvx);
5300
      qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
5301
      qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
5302
      qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
5303
      qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
5304
      qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_qs8_packw_gemm_gio_ukernel_x128c4__scalar;
5305
      qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_qs8_packw_gemm_goi_ukernel_x128c4__scalar;
5306
      qs8_qc8w_gemm_config.mr = 2;
5307
      qs8_qc8w_gemm_config.nr = 128;
5308
      qs8_qc8w_gemm_config.log2_kr = 2;
5309
    }
5310
  #else
5311
    qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
5312
    qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
5313
    qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
5314
    qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
5315
    qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
5316
    qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
5317
    qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
5318
    qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
5319
    qs8_qc8w_gemm_config.mr = 3;
5320
    qs8_qc8w_gemm_config.nr = 4;
5321
  #endif
5322
1
  assert(qs8_qc8w_gemm_config.mr <= XNN_MAX_MR);
5323
1
}
5324
5325
0
static void init_qu8_gemm_config(void) {
5326
  // Common parameters.
5327
0
  qu8_gemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_UINT8_T;
5328
0
  qu8_gemm_config.log2_filter_element_size = XNN_LOG2_SIZEOF_UINT8_T;
5329
0
  qu8_gemm_config.log2_filter_element_bit_size = XNN_LOG2_SIZEOF_UINT8_T + 3;
5330
0
  qu8_gemm_config.bias_element_size = sizeof(int32_t);
5331
  // Use the same packing function throughout.
5332
0
  qu8_gemm_config.pack_weights_and_biases =
5333
0
      (xnn_pack_weights_and_biases_fn)xnn_pack_qu8_weights_and_biases;
5334
0
  qu8_gemm_config.packed_stride_weights_and_biases =
5335
0
      (xnn_packed_stride_weights_and_biases_fn)
5336
0
          xnn_packed_stride_qu8_weights_and_biases;
5337
0
  qu8_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qu8_gemm_gio_w;
5338
0
  qu8_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qu8_gemm_goi_w;
5339
5340
  // Arch-specific parameters.
5341
  #if XNN_ARCH_ARM
5342
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
5343
    assert(hardware_config != NULL);
5344
    (void) hardware_config;  // May be unused.
5345
    if (hardware_config->arch_flags & xnn_arch_arm_neon) {
5346
      #if XNN_ENABLE_ASSEMBLY
5347
        switch (hardware_config->uarch[XNN_UARCH_INDEX]) {
5348
          case xnn_uarch_cortex_a5:
5349
          case xnn_uarch_cortex_a7:
5350
          case xnn_uarch_krait:
5351
          case xnn_uarch_kryo:
5352
            qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm);
5353
            qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm);
5354
            qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm);
5355
            qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm);
5356
            qu8_gemm_config.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
5357
            qu8_gemm_config.mr = 4;
5358
            qu8_gemm_config.nr = 8;
5359
            break;
5360
          case xnn_uarch_cortex_a32:
5361
          case xnn_uarch_cortex_a35:
5362
            qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7);
5363
            qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a7);
5364
            qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7);
5365
            qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a7);
5366
            qu8_gemm_config.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
5367
            qu8_gemm_config.mr = 4;
5368
            qu8_gemm_config.nr = 8;
5369
            break;
5370
          case xnn_uarch_cortex_a53:
5371
          case xnn_uarch_cortex_a57:
5372
          case xnn_uarch_cortex_a72:
5373
            qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm);
5374
            qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a53_prfm);
5375
            qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm);
5376
            qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a53_prfm);
5377
            qu8_gemm_config.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
5378
            qu8_gemm_config.mr = 4;
5379
            qu8_gemm_config.nr = 8;
5380
            break;
5381
          case xnn_uarch_cortex_a55r0:
5382
          case xnn_uarch_cortex_a55:
5383
            qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7);
5384
            qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a53);
5385
            qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7);
5386
            qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a53);
5387
            qu8_gemm_config.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
5388
            qu8_gemm_config.mr = 4;
5389
            qu8_gemm_config.nr = 8;
5390
            break;
5391
          case xnn_uarch_exynos_m1:
5392
          case xnn_uarch_exynos_m2:
5393
          case xnn_uarch_exynos_m3:
5394
            qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm);
5395
            qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_ld64_prfm);
5396
            qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm);
5397
            qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_ld64_prfm);
5398
            qu8_gemm_config.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
5399
            qu8_gemm_config.mr = 4;
5400
            qu8_gemm_config.nr = 8;
5401
            break;
5402
          default:
5403
            qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7);
5404
            qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_ld64);
5405
            qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7);
5406
            qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_ld64);
5407
            qu8_gemm_config.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
5408
            qu8_gemm_config.mr = 4;
5409
            qu8_gemm_config.nr = 8;
5410
            break;
5411
        }
5412
5413
        #if XNN_MAX_UARCH_TYPES > 1
5414
        {
5415
          /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
5416
          const uint32_t mr = qu8_gemm_config.mr;
5417
          const uint32_t nr = qu8_gemm_config.nr;
5418
          const uint32_t log2_kr = qu8_gemm_config.log2_kr;
5419
          for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
5420
            switch (hardware_config->uarch[i]) {
5421
              case xnn_uarch_cortex_a53:
5422
                if (mr == 4 && nr == 8 && log2_kr == 0) {
5423
                  qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm);
5424
                  qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a53_prfm);
5425
                  qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7_prfm);
5426
                  qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a53_prfm);
5427
                }
5428
                break;
5429
              case xnn_uarch_cortex_a55r0:
5430
              case xnn_uarch_cortex_a55:
5431
                if (mr == 4 && nr == 8 && log2_kr == 0) {
5432
                  qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7);
5433
                  qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a53);
5434
                  qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__asm_aarch32_neon_mlal_lane_cortex_a7);
5435
                  qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__asm_aarch32_neon_mlal_lane_cortex_a53);
5436
                }
5437
                break;
5438
              default:
5439
                break;
5440
            }
5441
          }
5442
        }
5443
        #endif  // XNN_MAX_UARCH_TYPES > 1
5444
      #else  // XNN_ENABLE_ASSEMBLY
5445
        qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
5446
        qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane);
5447
        qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
5448
        qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane);
5449
        qu8_gemm_config.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
5450
        qu8_gemm_config.mr = 3;
5451
        qu8_gemm_config.nr = 8;
5452
      #endif  // XNN_ENABLE_ASSEMBLY
5453
    } else {
5454
      qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32);
5455
      qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32);
5456
      qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_fp32_ukernel_1x2c4__armsimd32);
5457
      qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_fp32_ukernel_2x2c4__armsimd32);
5458
      qu8_gemm_config.init.qu8 = xnn_init_qu8_conv_minmax_fp32_armsimd32_params;
5459
      qu8_gemm_config.mr = 2;
5460
      qu8_gemm_config.nr = 2;
5461
      qu8_gemm_config.log2_kr = 2;
5462
    }
5463
  #elif XNN_ARCH_ARM64
5464
    #if XNN_ENABLE_ASSEMBLY
5465
      const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
5466
      assert(hardware_config);
5467
      switch (hardware_config->uarch[XNN_UARCH_INDEX]) {
5468
        case xnn_uarch_cortex_a53:
5469
        case xnn_uarch_cortex_a55r0:
5470
        case xnn_uarch_cortex_a55:
5471
        case xnn_uarch_kryo:
5472
          qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu16_ukernel_1x16__neon_mlal_lane);
5473
          qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu16_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a53_prfm);
5474
          qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu16_ukernel_1x16__neon_mlal_lane);
5475
          qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu16_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a53_prfm);
5476
          qu8_gemm_config.init.qu8 = xnn_init_qu8_conv_minmax_rndnu16_scalar_params;
5477
          qu8_gemm_config.mr = 4;
5478
          qu8_gemm_config.nr = 16;
5479
          break;
5480
5481
        case xnn_uarch_cortex_a57:
5482
        case xnn_uarch_cortex_a72:
5483
        case xnn_uarch_cortex_a73:
5484
        case xnn_uarch_cortex_a75:
5485
        case xnn_uarch_cortex_a76:
5486
        case xnn_uarch_exynos_m1:
5487
        case xnn_uarch_exynos_m2:
5488
        case xnn_uarch_exynos_m3:
5489
        case xnn_uarch_exynos_m4:
5490
        case xnn_uarch_neoverse_n1:
5491
          qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
5492
          qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a75_prfm);
5493
          qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
5494
          qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a75_prfm);
5495
          qu8_gemm_config.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
5496
          qu8_gemm_config.mr = 4;
5497
          qu8_gemm_config.nr = 16;
5498
          break;
5499
5500
        default:
5501
          qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
5502
          qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a75);
5503
          qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
5504
          qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a75);
5505
          qu8_gemm_config.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
5506
          qu8_gemm_config.mr = 4;
5507
          qu8_gemm_config.nr = 16;
5508
          break;
5509
      }
5510
      #if XNN_MAX_UARCH_TYPES > 1
5511
      {
5512
        /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
5513
        const uint32_t mr = qu8_gemm_config.mr;
5514
        const uint32_t nr = qu8_gemm_config.nr;
5515
        const uint32_t log2_kr = qu8_gemm_config.log2_kr;
5516
        for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
5517
          switch (hardware_config->uarch[i]) {
5518
            case xnn_uarch_cortex_a53:
5519
            case xnn_uarch_cortex_a55r0:
5520
            case xnn_uarch_cortex_a55:
5521
              if (mr == 4 && nr == 16 && log2_kr == 0) {
5522
                qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a53_prfm);
5523
                qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = XNN_INIT_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__asm_aarch64_neon_mlal_lane_cortex_a53_prfm);
5524
              }
5525
              break;
5526
            default:
5527
              break;
5528
          }
5529
        }
5530
      }
5531
      #endif  // XNN_MAX_UARCH_TYPES > 1
5532
    #else  // !XNN_ENABLE_ASSEMBLY
5533
      qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
5534
      qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
5535
      qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
5536
      qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
5537
      qu8_gemm_config.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
5538
      qu8_gemm_config.mr = 4;
5539
      qu8_gemm_config.nr = 16;
5540
    #endif  // XNN_ENABLE_ASSEMBLY
5541
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
5542
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
5543
0
    assert(hardware_config != NULL);
5544
0
    (void) hardware_config;  // May be unused.
5545
0
    #if XNN_ENABLE_AVX512SKX
5546
0
      if (hardware_config->arch_flags & xnn_arch_x86_avx512skx) {
5547
0
        qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx_prfm);
5548
0
        qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_fp32_ukernel_7x16c8__avx512skx_prfm);
5549
0
        qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx_prfm);
5550
0
        qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(7)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_fp32_ukernel_7x16c8__avx512skx_prfm);
5551
0
        qu8_gemm_config.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_params;
5552
0
        qu8_gemm_config.mr = 7;
5553
0
        qu8_gemm_config.nr = 16;
5554
0
        qu8_gemm_config.log2_kr = 3;
5555
0
      } else
5556
0
    #endif
5557
0
    #if XNN_ENABLE_AVX2
5558
0
      if (hardware_config->arch_flags & xnn_arch_x86_avx2) {
5559
0
        qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
5560
0
        qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
5561
0
        qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
5562
0
        qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
5563
0
        qu8_gemm_config.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_params;
5564
0
        qu8_gemm_config.mr = 3;
5565
0
        qu8_gemm_config.nr = 8;
5566
0
        qu8_gemm_config.log2_kr = 3;
5567
0
      } else
5568
0
    #endif
5569
0
    #if XNN_ENABLE_AVX
5570
0
      if (hardware_config->arch_flags & xnn_arch_x86_avx) {
5571
0
        qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
5572
0
        qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
5573
0
        qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
5574
0
        qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(2)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
5575
0
        qu8_gemm_config.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_params;
5576
0
        qu8_gemm_config.mr = 2;
5577
0
        qu8_gemm_config.nr = 4;
5578
0
        qu8_gemm_config.log2_kr = 3;
5579
0
      } else
5580
0
    #endif
5581
0
    if (hardware_config->arch_flags & xnn_arch_x86_sse4_1) {
5582
0
      qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
5583
0
      qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
5584
0
      qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
5585
0
      qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
5586
0
      qu8_gemm_config.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_params;
5587
0
      qu8_gemm_config.mr = 3;
5588
0
      qu8_gemm_config.nr = 4;
5589
0
      qu8_gemm_config.log2_kr = 3;
5590
0
    } else {
5591
0
      qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
5592
0
      qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
5593
0
      qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
5594
0
      qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
5595
0
      qu8_gemm_config.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_params;
5596
0
      qu8_gemm_config.mr = 3;
5597
0
      qu8_gemm_config.nr = 4;
5598
0
      qu8_gemm_config.log2_kr = 3;
5599
0
    }
5600
  #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5601
    qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
5602
    qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
5603
    qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
5604
    qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
5605
    qu8_gemm_config.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_params;
5606
    qu8_gemm_config.mr = 4;
5607
    qu8_gemm_config.nr = 4;
5608
    qu8_gemm_config.log2_kr = 1;
5609
    qu8_gemm_config.log2_sr = 2;
5610
  #else
5611
    qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
5612
    qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
5613
    qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
5614
    qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(3)] = XNN_INIT_HMP_IGEMM_UKERNEL(xnn_qu8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
5615
    qu8_gemm_config.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_params;
5616
    qu8_gemm_config.mr = 3;
5617
    qu8_gemm_config.nr = 4;
5618
  #endif
5619
0
  assert(qu8_gemm_config.mr <= XNN_MAX_MR);
5620
0
}
5621
5622
0
const struct xnn_gemm_config* xnn_init_f16_gemm_config() {
5623
0
  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
5624
0
  if (hardware_config == NULL || !xnn_is_f16_compatible_config(hardware_config)) {
5625
0
    return NULL;
5626
0
  }
5627
0
  XNN_INIT_ONCE(f16_gemm);
5628
0
  return &f16_gemm_config;
5629
0
}
5630
5631
0
const struct xnn_gemm_config* xnn_init_pf16_gemm_config() {
5632
0
  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
5633
0
  if (hardware_config == NULL) {
5634
0
    return NULL;
5635
0
  }
5636
0
  XNN_INIT_ONCE(pf16_gemm);
5637
0
  return pf16_gemm_config.mr ? &pf16_gemm_config : NULL;
5638
0
}
5639
5640
0
const struct xnn_gemm_config* xnn_init_bf16_f32_gemm_config() {
5641
0
  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
5642
0
  if (hardware_config == NULL || !xnn_is_bf16_compatible_config(hardware_config)) {
5643
0
    return NULL;
5644
0
  }
5645
0
  XNN_INIT_ONCE(bf16_f32_gemm);
5646
0
  return &bf16_f32_gemm_config;
5647
0
}
5648
5649
0
const struct xnn_gemm_config* xnn_init_pf32_gemm_config() {
5650
0
  if (xnn_init_hardware_config() == NULL) {
5651
0
    return NULL;
5652
0
  }
5653
0
  XNN_INIT_ONCE(pf32_gemm);
5654
0
  return pf32_gemm_config.mr ? &pf32_gemm_config : NULL;
5655
0
}
5656
5657
0
const struct xnn_gemm_config* xnn_init_pqs8_qc8w_gemm_config() {
5658
0
  if (xnn_init_hardware_config() == NULL) {
5659
0
    return NULL;
5660
0
  }
5661
0
  XNN_INIT_ONCE(pqs8_qc8w_gemm);
5662
0
  return pqs8_qc8w_gemm_config.mr ? &pqs8_qc8w_gemm_config : NULL;
5663
0
}
5664
5665
0
const struct xnn_gemm_config* xnn_init_f32_gemm_config(uint32_t flags) {
5666
0
  if (xnn_init_hardware_config() == NULL) {
5667
0
    return NULL;
5668
0
  }
5669
0
  XNN_INIT_ONCE(f32_gemm);
5670
0
  if (flags & XNN_FLAG_SLOW_CONSISTENT_ARITHMETIC) {
5671
0
    return &f32_gemm_config[consistent_config];
5672
0
  } else {
5673
0
    return &f32_gemm_config[default_config];
5674
0
  }
5675
0
}
5676
5677
0
const struct xnn_gemm_config* xnn_init_f32_igemm_config() {
5678
0
  if (xnn_init_hardware_config() == NULL) {
5679
0
    return NULL;
5680
0
  }
5681
0
  XNN_INIT_ONCE(f32_igemm);
5682
0
  return &f32_igemm_config;
5683
0
}
5684
5685
0
const struct xnn_gemm_config* xnn_init_f32_gemm_nr2_config(uint32_t flags) {
5686
0
  if (xnn_init_hardware_config() == NULL) {
5687
0
    return NULL;
5688
0
  }
5689
0
  XNN_INIT_ONCE(f32_gemm_nr2);
5690
0
  if (flags & XNN_FLAG_SLOW_CONSISTENT_ARITHMETIC) {
5691
0
    return &f32_gemm_nr2_config[consistent_config];
5692
0
  } else {
5693
0
    return &f32_gemm_nr2_config[default_config];
5694
0
  }
5695
0
}
5696
5697
0
const struct xnn_gemm_config* xnn_init_f32_qc4w_gemm_config() {
5698
0
  if (xnn_init_hardware_config() == NULL) {
5699
0
    return NULL;
5700
0
  }
5701
0
  XNN_INIT_ONCE(f32_qc4w_gemm);
5702
0
  return &f32_qc4w_gemm_config;
5703
0
}
5704
5705
0
const struct xnn_gemm_config* xnn_init_f32_qc8w_gemm_config() {
5706
0
  if (xnn_init_hardware_config() == NULL) {
5707
0
    return NULL;
5708
0
  }
5709
0
  XNN_INIT_ONCE(f32_qc8w_gemm);
5710
0
  return &f32_qc8w_gemm_config;
5711
0
}
5712
5713
0
const struct xnn_gemm_config* xnn_init_qd8_f16_qc8w_gemm_config() {
5714
0
  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
5715
0
  if (hardware_config == NULL || !xnn_is_f16_compatible_config(hardware_config)) {
5716
0
    return NULL;
5717
0
  }
5718
0
  XNN_INIT_ONCE(qd8_f16_qc8w_gemm);
5719
0
  return &qd8_f16_qc8w_gemm_config;
5720
0
}
5721
5722
0
const struct xnn_gemm_config* xnn_init_qd8_f16_qc8w_igemm_config() {
5723
0
  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
5724
0
  if (hardware_config == NULL || !xnn_is_f16_compatible_config(hardware_config)) {
5725
0
    return NULL;
5726
0
  }
5727
0
  XNN_INIT_ONCE(qd8_f16_qc8w_igemm);
5728
0
  return &qd8_f16_qc8w_igemm_config;
5729
0
}
5730
5731
0
const struct xnn_gemm_config* xnn_init_qd8_f16_qc4w_gemm_config() {
5732
0
  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
5733
0
  if (hardware_config == NULL || !xnn_is_f16_compatible_config(hardware_config)) {
5734
0
    return NULL;
5735
0
  }
5736
0
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
5737
  // there are no kernels on x86. qdu8_f16_qc4w kernels are used instead.
5738
0
    return NULL;
5739
0
#endif
5740
5741
0
  XNN_INIT_ONCE(qd8_f16_qc4w_gemm);
5742
0
  return &qd8_f16_qc4w_gemm_config;
5743
0
}
5744
5745
0
const struct xnn_gemm_config* xnn_init_qdu8_f16_qc4w_gemm_config() {
5746
0
  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
5747
0
  if (hardware_config == NULL || !xnn_is_f16_compatible_config(hardware_config)) {
5748
0
    return NULL;
5749
0
  }
5750
0
  XNN_INIT_ONCE(qdu8_f16_qc4w_gemm);
5751
0
  return qdu8_f16_qc4w_gemm_config.arch ? &qdu8_f16_qc4w_gemm_config : NULL;
5752
0
}
5753
5754
0
const struct xnn_gemm_config* xnn_init_qd8_f16_qb4w_gemm_config() {
5755
0
  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
5756
0
  if (hardware_config == NULL || !xnn_is_f16_compatible_config(hardware_config)) {
5757
0
    return NULL;
5758
0
  }
5759
0
  XNN_INIT_ONCE(qd8_f16_qb4w_gemm);
5760
0
  return &qd8_f16_qb4w_gemm_config;
5761
0
}
5762
5763
0
const struct xnn_gemm_config* xnn_init_qd8_f32_qc4w_gemm_config() {
5764
0
  if (xnn_init_hardware_config() == NULL) {
5765
0
    return NULL;
5766
0
  }
5767
0
  XNN_INIT_ONCE(qd8_f32_qc4w_gemm);
5768
0
  return &qd8_f32_qc4w_gemm_config;
5769
0
}
5770
5771
0
const struct xnn_gemm_config* xnn_init_qd8_f32_qc2w_gemm_config() {
5772
0
  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
5773
0
  if (hardware_config == NULL || !xnn_is_qc2w_compatible_config(hardware_config)) {
5774
0
    return NULL;
5775
0
  }
5776
0
  XNN_INIT_ONCE(qd8_f32_qc2w_gemm);
5777
  // Only return the config pointer if it actually provides a kernel.
5778
0
  if (qd8_f32_qc2w_gemm_config.minmax.dqgemm[0].function[0] != NULL) {
5779
0
    return &qd8_f32_qc2w_gemm_config;
5780
0
  }
5781
0
  return NULL;
5782
0
}
5783
5784
0
const struct xnn_gemm_config* xnn_init_qdu8_f32_qc4w_gemm_config() {
5785
0
  if (xnn_init_hardware_config() == NULL) {
5786
0
    return NULL;
5787
0
  }
5788
0
  XNN_INIT_ONCE(qdu8_f32_qc4w_gemm);
5789
0
  return qdu8_f32_qc4w_gemm_config.arch ? &qdu8_f32_qc4w_gemm_config : NULL;
5790
0
}
5791
5792
0
const struct xnn_gemm_config* xnn_init_qd8_f32_qb4w_gemm_config() {
5793
0
  if (xnn_init_hardware_config() == NULL) {
5794
0
    return NULL;
5795
0
  }
5796
0
  XNN_INIT_ONCE(qd8_f32_qb4w_gemm);
5797
0
  return &qd8_f32_qb4w_gemm_config;
5798
0
}
5799
5800
0
const struct xnn_gemm_config* xnn_init_qdu8_f32_qb4w_gemm_config() {
5801
0
  if (xnn_init_hardware_config() == NULL) {
5802
0
    return NULL;
5803
0
  }
5804
0
  XNN_INIT_ONCE(qdu8_f32_qb4w_gemm);
5805
0
  return qdu8_f32_qb4w_gemm_config.arch ? &qdu8_f32_qb4w_gemm_config : NULL;
5806
0
}
5807
5808
0
const struct xnn_gemm_config* xnn_init_qdu8_f16_qc8w_gemm_config() {
5809
0
  if (xnn_init_hardware_config() == NULL) {
5810
0
    return NULL;
5811
0
  }
5812
0
  XNN_INIT_ONCE(qdu8_f16_qc8w_gemm);
5813
0
  return qdu8_f16_qc8w_gemm_config.arch ? &qdu8_f16_qc8w_gemm_config : NULL;
5814
0
}
5815
5816
0
const struct xnn_gemm_config* xnn_init_qdu8_f32_qc8w_igemm_config() {
5817
0
  if (xnn_init_hardware_config() == NULL) {
5818
0
    return NULL;
5819
0
  }
5820
0
  XNN_INIT_ONCE(qdu8_f32_qc8w_igemm);
5821
0
  return qdu8_f32_qc8w_igemm_config.arch ? &qdu8_f32_qc8w_igemm_config : NULL;
5822
0
}
5823
5824
0
const struct xnn_gemm_config* xnn_init_qdu8_f32_qc8w_gemm_config() {
5825
0
  if (xnn_init_hardware_config() == NULL) {
5826
0
    return NULL;
5827
0
  }
5828
0
  XNN_INIT_ONCE(qdu8_f32_qc8w_gemm);
5829
0
  return qdu8_f32_qc8w_gemm_config.arch ? &qdu8_f32_qc8w_gemm_config : NULL;
5830
0
}
5831
5832
0
const struct xnn_gemm_config* xnn_init_qd8_f32_qc8w_gemm_config() {
5833
0
  if (xnn_init_hardware_config() == NULL) {
5834
0
    return NULL;
5835
0
  }
5836
0
  XNN_INIT_ONCE(qd8_f32_qc8w_gemm);
5837
0
  return &qd8_f32_qc8w_gemm_config;
5838
0
}
5839
5840
0
const struct xnn_gemm_config* xnn_init_qp8_f32_qc4w_gemm_config() {
5841
0
  if (xnn_init_hardware_config() == NULL) {
5842
0
    return NULL;
5843
0
  }
5844
0
  XNN_INIT_ONCE(qp8_f32_qc4w_gemm);
5845
  // Only return the config pointer if it actually provides a kernel.
5846
0
  if (qp8_f32_qc4w_gemm_config.minmax.qp8gemm[0].function[0] != NULL) {
5847
0
    return &qp8_f32_qc4w_gemm_config;
5848
0
  }
5849
0
  return NULL;
5850
0
}
5851
5852
0
const struct xnn_gemm_config* xnn_init_qp8_f32_qc8w_gemm_config() {
5853
0
  if (xnn_init_hardware_config() == NULL) {
5854
0
    return NULL;
5855
0
  }
5856
0
  XNN_INIT_ONCE(qp8_f32_qc8w_gemm);
5857
  // Only return the config pointer if it actually provides a kernel.
5858
0
  if (qp8_f32_qc8w_gemm_config.minmax.qp8gemm[0].function[0] != NULL) {
5859
0
    return &qp8_f32_qc8w_gemm_config;
5860
0
  }
5861
0
  return NULL;
5862
0
}
5863
5864
0
const struct xnn_gemm_config* xnn_init_qp8_f32_qb4w_gemm_config() {
5865
0
  const struct xnn_hardware_config* hardware_config =
5866
0
      xnn_init_hardware_config();
5867
0
  if (hardware_config == NULL) {
5868
0
    return NULL;
5869
0
  }
5870
0
XNN_INIT_ONCE(qp8_f32_qb4w_gemm);
5871
  // Only return the config pointer if it actually provides a kernel.
5872
0
  if (qp8_f32_qb4w_gemm_config.minmax.qp8gemm_bl[0].function[0] != NULL) {
5873
0
    return &qp8_f32_qb4w_gemm_config;
5874
0
  }
5875
0
  return NULL;
5876
0
}
5877
5878
0
const struct xnn_gemm_config* xnn_init_qs8_qc4w_gemm_config() {
5879
0
  if (xnn_init_hardware_config() == NULL) {
5880
0
    return NULL;
5881
0
  }
5882
0
  XNN_INIT_ONCE(qs8_qc4w_gemm);
5883
0
  return &qs8_qc4w_gemm_config;
5884
0
}
5885
5886
178
const struct xnn_gemm_config* xnn_init_qs8_qc8w_gemm_config() {
5887
178
  if (xnn_init_hardware_config() == NULL) {
5888
0
    return NULL;
5889
0
  }
5890
178
  XNN_INIT_ONCE(qs8_qc8w_gemm);
5891
178
  return &qs8_qc8w_gemm_config;
5892
178
}
5893
5894
0
const struct xnn_gemm_config* xnn_init_qu8_gemm_config() {
5895
0
  if (xnn_init_hardware_config() == NULL) {
5896
0
    return NULL;
5897
0
  }
5898
0
  XNN_INIT_ONCE(qu8_gemm);
5899
0
  return &qu8_gemm_config;
5900
0
}