Coverage Report

Created: 2023-09-25 06:31

/src/xnnpack/src/operators/convolution-nhwc.c
Line    Count    Source
1
// Copyright (c) Facebook, Inc. and its affiliates.
2
// All rights reserved.
3
//
4
// Copyright 2019 Google LLC
5
//
6
// This source code is licensed under the BSD-style license found in the
7
// LICENSE file in the root directory of this source tree.
8
9
#include <assert.h>
10
#include <math.h>
11
#include <stdbool.h>
12
#include <stddef.h>
13
#include <stdint.h>
14
#include <stdlib.h>
15
#include <string.h>
16
17
#include <fp16/fp16.h>
18
19
#include <xnnpack.h>
20
#include <xnnpack/allocator.h>
21
#include <xnnpack/cache.h>
22
#include <xnnpack/common.h>
23
#include <xnnpack/compute.h>
24
#include <xnnpack/config.h>
25
#include <xnnpack/indirection.h>
26
#include <xnnpack/log.h>
27
#include <xnnpack/math.h>
28
#include <xnnpack/microkernel-utils.h>
29
#include <xnnpack/operator.h>
30
#include <xnnpack/operator-utils.h>
31
#include <xnnpack/operator-type.h>
32
#include <xnnpack/pack.h>
33
#include <xnnpack/params.h>
34
#include <xnnpack/post-operation.h>
35
#include <xnnpack/microparams-init.h>
36
37
#ifndef XNN_ENABLE_GEMM_M_SPECIALIZATION
38
#error "XNN_ENABLE_GEMM_M_SPECIALIZATION is not defined"
39
#endif
40
41
static inline size_t compute_output_dimension_with_tf_same_padding(
42
    size_t input_dimension,
43
    size_t subsampling_dimension)
44
0
{
45
0
  return divide_round_up(input_dimension, subsampling_dimension);
46
0
}
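For reference, a minimal standalone sketch of the TensorFlow SAME output-size arithmetic above. divide_round_up_sketch below is a stand-in with the usual ceiling-division semantics, not the xnnpack/math.h helper:

#include <assert.h>
#include <stddef.h>

// Stand-in for divide_round_up() from xnnpack/math.h: ceiling division.
static size_t divide_round_up_sketch(size_t n, size_t q) {
  return n % q == 0 ? n / q : n / q + 1;
}

int main(void) {
  // With TF SAME padding the output size depends only on input size and stride:
  // output = ceil(input / subsampling).
  assert(divide_round_up_sketch(10, 1) == 10);  // stride 1 keeps the spatial size
  assert(divide_round_up_sketch(10, 2) == 5);
  assert(divide_round_up_sketch(11, 2) == 6);   // odd input rounds up
  return 0;
}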
47
48
static inline const struct xnn_dwconv_config* find_dwconv_ukernel(
49
    size_t kernel_size,
50
    const struct xnn_dwconv_config* ukernel,
51
    size_t num_ukernels)
52
0
{
53
0
  const struct xnn_dwconv_config* best_ukernel = NULL;
54
0
  while (num_ukernels-- != 0) {
55
    // Find the smallest unipass primary_tile that is at least as big as kernel_size.
56
0
    if (ukernel->last_tile == 0 && ukernel->primary_tile >= kernel_size) {
57
0
      if (best_ukernel == NULL || ukernel->primary_tile < best_ukernel->primary_tile) {
58
0
        best_ukernel = ukernel;
59
0
      }
60
0
    } else if (ukernel->last_tile != 0) {
61
      // Use multi-pass if it fits the kernel size nicely, or if kernel_size is large.
62
0
      if (ukernel->primary_tile + ukernel->middle_tile + ukernel->last_tile == kernel_size || kernel_size >= 25) {
63
0
        best_ukernel = ukernel;
64
0
      }
65
0
    }
66
0
    ukernel++;
67
0
  }
68
0
  if (best_ukernel == NULL) {
69
0
    xnn_log_debug("no dwconv ukernel found");
70
0
  } else if (best_ukernel->last_tile == 0) {
71
0
    xnn_log_debug("dwconv unipass ukernel of primary tile %"PRIu8" found", best_ukernel->primary_tile);
72
0
  } else {
73
0
    xnn_log_debug("dwconv multipass ukernel of tiles %"PRIu8", %"PRIu8", %"PRIu8" found",
74
0
                  best_ukernel->primary_tile,
75
0
                  best_ukernel->middle_tile,
76
0
                  best_ukernel->last_tile);
77
0
  }
78
0
  return best_ukernel;
79
0
}
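A condensed sketch of the selection policy implemented above, against a hypothetical config table (the struct and table values are illustrative, not real xnn_dwconv_config entries): unipass kernels (last_tile == 0) are chosen by the smallest primary_tile that still covers kernel_size, while multipass kernels win when their tiles sum exactly to kernel_size or when the kernel has at least 25 taps:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

// Illustrative stand-in for the xnn_dwconv_config fields used above.
struct dwconv_cfg {
  uint8_t primary_tile;
  uint8_t middle_tile;
  uint8_t last_tile;  // 0 marks a unipass kernel
};

static const struct dwconv_cfg* pick(size_t kernel_size, const struct dwconv_cfg* cfg, size_t n) {
  const struct dwconv_cfg* best = NULL;
  for (; n != 0; n--, cfg++) {
    if (cfg->last_tile == 0 && cfg->primary_tile >= kernel_size) {
      // Unipass: the smallest primary tile that still covers the kernel wins.
      if (best == NULL || cfg->primary_tile < best->primary_tile) best = cfg;
    } else if (cfg->last_tile != 0 &&
               (cfg->primary_tile + cfg->middle_tile + cfg->last_tile == kernel_size || kernel_size >= 25)) {
      // Multipass: exact fit, or any large kernel (>= 25 taps).
      best = cfg;
    }
  }
  return best;
}

int main(void) {
  // Hypothetical table: unipass tiles of 9 and 25, plus one multipass config.
  const struct dwconv_cfg table[] = { {9, 0, 0}, {25, 0, 0}, {5, 5, 5} };
  const size_t n = sizeof(table) / sizeof(table[0]);
  assert(pick(3 * 3, table, n)->primary_tile == 9);  // 3x3 -> smallest covering unipass tile
  assert(pick(5 * 5, table, n)->last_tile == 5);     // 5x5 (>= 25 taps) -> multipass, scanned last, wins
  assert(pick(7 * 7, table, n)->last_tile == 5);     // no unipass tile covers 49 taps
  return 0;
}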
80
81
static enum xnn_status create_vmulcaddc_path(
82
    uint32_t groups,
83
    const void* kernel,
84
    const void* bias,
85
    uint32_t log2_filter_element_size,
86
    uint32_t bias_element_size,
87
    xnn_pack_vmulcaddc_w_fn pack_vmulcaddc_w,
88
    const void* packing_params,
89
    int packed_weights_padding_byte,
90
    const void* vmulcaddc_params,
91
    size_t vmulcaddc_params_size,
92
    const struct xnn_vmulcaddc_config* vmulcaddc_config,
93
    enum xnn_operator_type operator_type,
94
    xnn_operator_t convolution_op)
95
0
{
96
0
  assert(vmulcaddc_config != NULL);
97
0
  assert(vmulcaddc_params != NULL);
98
99
0
  enum xnn_status status = xnn_status_out_of_memory;
100
101
0
  const size_t c_stride = round_up_po2(groups, vmulcaddc_config->channel_tile);
102
0
  const size_t packed_weights_size = ((UINT32_C(1) << log2_filter_element_size) + bias_element_size) * c_stride;
103
0
  size_t aligned_total_weights_size = round_up_po2(packed_weights_size, XNN_ALLOCATION_ALIGNMENT);
104
0
  void* weights_ptr = xnn_get_pointer_to_write_weights(
105
0
      convolution_op, aligned_total_weights_size, packed_weights_padding_byte);
106
0
  if (weights_ptr == NULL) {
107
0
    xnn_log_error("failed to reserve or allocated %zu bytes for %s operator vmulcaddc packed weights",
108
0
                  aligned_total_weights_size, xnn_operator_type_to_string(operator_type));
109
0
    goto error;
110
0
  }
111
0
  xnn_log_debug("allocated %zu bytes for packed weights in %s operator",
112
0
                aligned_total_weights_size, xnn_operator_type_to_string(operator_type));
113
114
0
  pack_vmulcaddc_w(groups, vmulcaddc_config->channel_tile, kernel, bias, weights_ptr, packing_params);
115
116
0
  if (use_weights_cache(convolution_op)) {
117
0
    convolution_op->packed_weights.offset = xnn_get_or_insert_weights_cache(
118
0
        convolution_op->weights_cache, weights_ptr, aligned_total_weights_size);
119
0
  }
120
121
0
  memcpy(&convolution_op->params, vmulcaddc_params, vmulcaddc_params_size);
122
123
0
  convolution_op->ukernel.vmulcaddc = (struct xnn_ukernel_vmulcaddc) {
124
0
    .function = vmulcaddc_config->ukernel,
125
0
    .mr = vmulcaddc_config->row_tile,
126
0
  };
127
0
  return xnn_status_success;
128
129
0
error:
130
0
  return status;
131
0
}
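The packed-weights size computed above is one filter element plus one bias element per channel, with channels padded to the microkernel's channel tile and the whole blob padded to the allocation alignment. A numeric sketch, assuming illustrative values (4-byte F32 elements, channel tile of 8, 64-byte XNN_ALLOCATION_ALIGNMENT):

#include <assert.h>
#include <stddef.h>

// Stand-in for round_up_po2() from xnnpack/math.h: round n up to a power-of-2 multiple of q.
static size_t round_up_po2_sketch(size_t n, size_t q) {
  return (n + q - 1) & ~(q - 1);
}

int main(void) {
  // Hypothetical F32 case: 4-byte filter elements (log2 size 2), 4-byte bias,
  // 10 channels, channel_tile 8, 64-byte allocation alignment.
  const size_t c_stride = round_up_po2_sketch(10, 8);      // 16 channels after padding
  const size_t packed   = ((1u << 2) + 4) * c_stride;      // (filter + bias) per channel -> 128 bytes
  const size_t aligned  = round_up_po2_sketch(packed, 64); // padded to the allocation alignment
  assert(c_stride == 16 && packed == 128 && aligned == 128);
  return 0;
}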
132
133
static enum xnn_status create_dwconv_path(
134
    uint32_t kernel_height,
135
    uint32_t kernel_width,
136
    uint32_t groups,
137
    const void* kernel,
138
    const void* bias,
139
    uint32_t flags,
140
    uint32_t log2_input_element_size,
141
    uint32_t log2_filter_element_size,
142
    uint32_t bias_element_size,
143
    xnn_pack_dwconv_hwg_w_fn pack_dwconv_hwg_w,
144
    xnn_pack_dwconv_ghw_w_fn pack_dwconv_ghw_w,
145
    const void* packing_params,
146
    int packed_weights_padding_byte,
147
    size_t extra_weights_bytes,
148
    xnn_init_qs8_qc8w_scale_params_fn init_scale_params,
149
    const float* scale_params,
150
    const void* dwconv_params,
151
    size_t dwconv_params_size,
152
    const struct xnn_dwconv_config* dwconv_ukernel,
153
    bool linear_activation,
154
    enum xnn_operator_type operator_type,
155
    size_t* zero_size,
156
    xnn_operator_t convolution_op)
157
0
{
158
0
  assert(dwconv_ukernel != NULL);
159
0
  enum xnn_status status = xnn_status_out_of_memory;
160
0
  const uint8_t primary_tile = dwconv_ukernel->primary_tile;
161
0
  const bool is_unipass = dwconv_ukernel->last_tile == 0;
162
0
  const size_t kernel_size = kernel_height * kernel_width;
163
0
  if (is_unipass) {
164
0
    assert(primary_tile >= kernel_size);
165
0
    xnn_log_debug("using dwconv unipass of primary_tile %u", primary_tile);
166
0
  } else {
167
0
    assert(kernel_size > primary_tile);
168
0
    xnn_log_debug("using dwconv multipass ukernel of tiles %d, %d, %d",
169
0
                  primary_tile,
170
0
                  dwconv_ukernel->middle_tile,
171
0
                  dwconv_ukernel->last_tile);
172
0
  }
173
174
0
  const size_t c_stride = round_up_po2(groups, dwconv_ukernel->channel_tile);
175
0
  size_t tile_size = 0;
176
0
  size_t packed_weights_size = 0;
177
0
  if (is_unipass) {
178
0
    tile_size = primary_tile;
179
0
    packed_weights_size = ((primary_tile << log2_filter_element_size) + bias_element_size + extra_weights_bytes) * c_stride;
180
0
  } else {
181
0
    tile_size = xnn_dwconv_multipass_tile_size(
182
0
      kernel_size, primary_tile, dwconv_ukernel->middle_tile, dwconv_ukernel->last_tile);
183
0
    packed_weights_size = xnn_dwconv_multipass_weights_size(
184
0
      tile_size, groups, dwconv_ukernel->channel_tile, dwconv_ukernel->channel_subtile,
185
0
      dwconv_ukernel->channel_round, bias_element_size, log2_filter_element_size, extra_weights_bytes);
186
0
  }
187
0
  size_t aligned_total_weights_size = round_up_po2(packed_weights_size, XNN_ALLOCATION_ALIGNMENT);
188
0
  void* weights_ptr = xnn_get_pointer_to_write_weights(
189
0
      convolution_op, aligned_total_weights_size, packed_weights_padding_byte);
190
0
  if (weights_ptr == NULL) {
191
0
    xnn_log_error("failed to reserve or allocated %zu bytes for %s operator dwconv packed weights",
192
0
                  aligned_total_weights_size, xnn_operator_type_to_string(operator_type));
193
0
    goto error;
194
0
  }
195
0
  xnn_log_debug("allocated %zu bytes for packed weights in %s operator",
196
0
                aligned_total_weights_size, xnn_operator_type_to_string(operator_type));
197
198
0
  memcpy(&convolution_op->params, dwconv_params, dwconv_params_size);
199
200
0
  if (flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) {
201
0
    pack_dwconv_hwg_w(
202
0
        primary_tile,
203
0
        dwconv_ukernel->middle_tile,
204
0
        dwconv_ukernel->last_tile,
205
0
        kernel_height, kernel_width,
206
0
        groups,
207
0
        dwconv_ukernel->channel_tile, dwconv_ukernel->channel_subtile, dwconv_ukernel->channel_round,
208
0
        kernel, bias, /*scale=*/NULL, weights_ptr,
209
0
        dwconv_ukernel->channel_tile * extra_weights_bytes,
210
0
        dwconv_ukernel->channel_subtile * extra_weights_bytes,
211
0
        packing_params);
212
0
  } else {
213
0
    pack_dwconv_ghw_w(
214
0
        primary_tile,
215
0
        dwconv_ukernel->middle_tile,
216
0
        dwconv_ukernel->last_tile,
217
0
        kernel_height, kernel_width,
218
0
        groups,
219
0
        dwconv_ukernel->channel_tile, dwconv_ukernel->channel_subtile, dwconv_ukernel->channel_round,
220
0
        kernel, bias, /*scale=*/NULL, weights_ptr,
221
0
        dwconv_ukernel->channel_tile * extra_weights_bytes,
222
0
        dwconv_ukernel->channel_subtile * extra_weights_bytes,
223
0
        packing_params);
224
0
  }
225
226
0
  if (scale_params != NULL) {
227
0
    assert(init_scale_params != NULL);
228
    // TODO(zhin): QC8 DWCONV multipass is not yet implemented; fix this when it is supported.
229
0
    assert(is_unipass);
230
0
    size_t stride = dwconv_ukernel->channel_tile *
231
0
                    ((primary_tile << log2_filter_element_size) + bias_element_size + extra_weights_bytes);
232
233
0
    init_scale_params(
234
0
      /*channels=*/groups,
235
0
      /*channels_tile=*/dwconv_ukernel->channel_tile,
236
0
      /*channels_subtile=*/dwconv_ukernel->channel_tile,
237
0
      /*stride=*/stride,
238
0
      /*substride=*/stride,
239
0
      /*stride_offset=*/0,
240
0
      /*scale=*/scale_params,
241
      /*packed_w=*/
242
0
      (void*) ((uintptr_t) weights_ptr +
243
0
               dwconv_ukernel->channel_tile * ((primary_tile << log2_filter_element_size) + bias_element_size)));
244
0
  }
245
246
0
  if (use_weights_cache(convolution_op)) {
247
0
    convolution_op->packed_weights.offset = xnn_get_or_insert_weights_cache(
248
0
        convolution_op->weights_cache, weights_ptr, aligned_total_weights_size);
249
0
  }
250
251
0
  const union xnn_dwconv_ukernel* ukernels = &dwconv_ukernel->minmax;
252
0
  if (linear_activation && dwconv_ukernel->linear.unipass != NULL) {
253
0
    ukernels = &dwconv_ukernel->linear;
254
0
  }
255
0
  convolution_op->ukernel.dwconv = (struct xnn_ukernel_dwconv) {
256
0
    .primary_tile = primary_tile,
257
0
    .middle_tile = dwconv_ukernel->middle_tile,
258
0
    .last_tile = dwconv_ukernel->last_tile,
259
0
    .tile_size = tile_size,
260
0
  };
261
262
0
  if (is_unipass) {
263
0
    convolution_op->ukernel.dwconv.unipass_fn = ukernels->unipass;
264
0
  } else {
265
0
    convolution_op->ukernel.dwconv.multipass_fn = ukernels->multipass;
266
0
  }
267
268
0
  *zero_size = XNN_EXTRA_BYTES + (c_stride << log2_input_element_size);
269
0
  return xnn_status_success;
270
0
error:
271
0
  return status;
272
0
}
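In the unipass branch above, each channel carries a full primary tile of filter elements plus the bias and any extra per-channel bytes, and the channel count is rounded up to the channel tile. A small sketch with made-up QS8-style numbers (1-byte filters, 4-byte bias, channel tile of 16):

#include <assert.h>
#include <stddef.h>

// Stand-in for round_up_po2() from xnnpack/math.h.
static size_t round_up_po2_sketch(size_t n, size_t q) {
  return (n + q - 1) & ~(q - 1);
}

int main(void) {
  // Hypothetical unipass depthwise case: 3x3 kernel packed into a 9-element primary
  // tile, 1-byte filter elements (log2 size 0), 4-byte bias, no extra weights,
  // 24 channels with a channel tile of 16.
  const size_t c_stride = round_up_po2_sketch(24, 16);        // 32 channels after padding
  const size_t per_channel = (9u << 0) + 4 + 0;               // tile of filters + bias + extras = 13 bytes
  const size_t packed_weights_size = per_channel * c_stride;  // 13 * 32 = 416 bytes before alignment
  assert(packed_weights_size == 416);
  return 0;
}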
273
274
static enum xnn_status create_gemm_or_igemm(
275
    enum xnn_microkernel_type ukernel_type,
276
    uint32_t kernel_size,
277
    uint32_t groups,
278
    size_t group_input_channels,
279
    size_t group_output_channels,
280
    const void* kernel,
281
    const void* bias,
282
    uint32_t flags,
283
    uint32_t log2_input_element_size,
284
    uint32_t log2_filter_element_size,
285
    uint32_t bias_element_size,
286
    xnn_packw_gemm_goi_ukernel_fn pack_gemm_goi_w,
287
    xnn_pack_conv_kgo_w_fn pack_conv_kgo_w,
288
    xnn_pack_conv_goki_w_fn pack_conv_goki_w,
289
    const void* packing_params,
290
    int packed_weights_padding_byte,
291
    size_t extra_weights_bytes,
292
    xnn_init_qs8_qc8w_scale_params_fn init_scale_params,
293
    const float* scale_params,
294
    const void* gemm_params,
295
    size_t gemm_params_size,
296
    const struct xnn_gemm_config* gemm_config,
297
    const struct jit_gemm_params* jit_gemm_params,
298
    bool linear_activation,
299
    bool relu_activation,
300
    enum xnn_operator_type operator_type,
301
    size_t num_post_operations,
302
    void* post_operation_params,
303
    xnn_operator_t convolution_op,
304
    size_t* zero_size)
305
0
{
306
0
  enum xnn_status status = xnn_status_out_of_memory;
307
0
  const uint32_t nr = gemm_config->nr;
308
0
  const uint32_t kr = UINT32_C(1) << gemm_config->log2_kr;
309
0
  const uint32_t sr = UINT32_C(1) << gemm_config->log2_sr;
310
0
  const size_t n_stride = round_up(group_output_channels, nr);
311
0
  const size_t k_stride = round_up_po2(group_input_channels, kr * sr);
312
313
0
  const size_t packed_group_weights_size =
314
0
      ((kernel_size * k_stride << log2_filter_element_size) + bias_element_size + extra_weights_bytes) * n_stride;
315
0
  const size_t aligned_total_weights_size = round_up_po2(packed_group_weights_size * groups, XNN_ALLOCATION_ALIGNMENT);
316
0
  void* weights_ptr = xnn_get_pointer_to_write_weights(
317
0
      convolution_op, aligned_total_weights_size, packed_weights_padding_byte);
318
0
  if (weights_ptr == NULL) {
319
0
    xnn_log_error("failed to reserve or allocated %zu bytes for %s operator gemm packed weights",
320
0
                  aligned_total_weights_size, xnn_operator_type_to_string(operator_type));
321
0
    goto error;
322
0
  }
323
0
  xnn_log_debug("allocated %zu bytes for packed weights in %s operator",
324
0
                aligned_total_weights_size, xnn_operator_type_to_string(operator_type));
325
326
0
  memcpy(&convolution_op->params, gemm_params, gemm_params_size);
327
0
  convolution_op->num_post_operation_params = num_post_operations;
328
0
  convolution_op->post_operation_params = post_operation_params;
329
330
0
  const struct gemm_fused_ukernels* gemm_ukernels = &gemm_config->minmax;
331
0
  const uint32_t mr = gemm_config->mr;
332
0
  if (linear_activation && gemm_config->linear.gemm[mr - 1].function[XNN_UARCH_DEFAULT] != NULL) {
333
0
    gemm_ukernels = &gemm_config->linear;
334
0
  } else if (relu_activation && gemm_config->relu.gemm[mr - 1].function[XNN_UARCH_DEFAULT] != NULL) {
335
0
    gemm_ukernels = &gemm_config->relu;
336
0
  }
337
0
  switch (ukernel_type) {
338
0
    case xnn_microkernel_type_gemm:
339
0
      pack_gemm_goi_w(
340
0
          groups, group_output_channels, group_input_channels,
341
0
          nr, kr, sr,
342
0
          kernel, bias, /*scale=*/NULL, weights_ptr, gemm_config->nr * extra_weights_bytes, packing_params);
343
0
      convolution_op->ukernel.gemm = (struct xnn_ukernel_gemm) {
344
0
        .mr = mr,
345
0
        .nr = nr,
346
0
        .kr = kr,
347
0
        .sr = sr,
348
0
      };
349
350
0
      assert(XNN_MAX_MR >= mr);
351
0
      for (size_t i = 0; i < mr; i++) {
352
0
        convolution_op->ukernel.gemm.gemm_cases[i] = gemm_ukernels->gemm[i];
353
0
      }
354
355
#if XNN_PLATFORM_JIT
356
      xnn_generate_gemms_up_to_max_mr(
357
        mr, gemm_config->generator, jit_gemm_params, group_output_channels, nr,
358
        group_input_channels << log2_input_element_size, convolution_op);
359
#endif  // XNN_PLATFORM_JIT
360
361
0
      break;
362
0
    case xnn_microkernel_type_igemm:
363
0
      if (flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) {
364
0
        pack_conv_kgo_w(
365
0
            groups, group_output_channels, kernel_size,
366
0
            nr, kr, sr,
367
0
            kernel, bias, /*scale=*/NULL, weights_ptr, gemm_config->nr * extra_weights_bytes, packing_params);
368
0
      } else {
369
0
        pack_conv_goki_w(
370
0
            groups, group_output_channels, kernel_size, group_input_channels,
371
0
            nr, kr, sr,
372
0
            kernel, bias, /*scale=*/NULL, weights_ptr, gemm_config->nr * extra_weights_bytes, packing_params);
373
0
      }
374
0
      convolution_op->ukernel.igemm = (struct xnn_ukernel_igemm) {
375
0
        .mr = mr,
376
0
        .nr = nr,
377
0
        .kr = kr,
378
0
        .sr = sr,
379
0
      };
380
381
0
      assert(XNN_MAX_MR >= mr);
382
0
      for (size_t i = 0; i < mr; i++) {
383
0
        convolution_op->ukernel.igemm.igemm_cases[i] = gemm_ukernels->igemm[i];
384
0
      }
385
386
#if XNN_PLATFORM_JIT
387
      xnn_generate_igemms_up_to_max_mr(
388
          mr, gemm_config->generator, jit_gemm_params, group_output_channels, nr,
389
          group_input_channels << log2_input_element_size, kernel_size, convolution_op);
390
#endif  // XNN_PLATFORM_JIT
391
392
0
      break;
393
0
    default:
394
0
      XNN_UNREACHABLE;
395
0
  }
396
397
0
  if (scale_params != NULL) {
398
0
    assert(init_scale_params != NULL);
399
400
0
    void* group_weights =
401
0
        (void*)((uintptr_t)weights_ptr +
402
0
                gemm_config->nr * ((kernel_size * k_stride << log2_filter_element_size) + bias_element_size));
403
0
    const size_t weights_stride =
404
0
        (kernel_size * k_stride << log2_filter_element_size) + bias_element_size + extra_weights_bytes;
405
0
    for (uint32_t group = 0; group < groups; group++) {
406
0
      init_scale_params(
407
0
          group_output_channels, gemm_config->nr, gemm_config->nr,
408
0
          gemm_config->nr * weights_stride, gemm_config->nr * weights_stride, 0,
409
0
          scale_params, group_weights);
410
0
      scale_params += group_output_channels;
411
0
      group_weights = (void*) ((uintptr_t) group_weights + n_stride * weights_stride);
412
0
    }
413
0
  }
414
415
0
  if (use_weights_cache(convolution_op)) {
416
0
    convolution_op->packed_weights.offset = xnn_get_or_insert_weights_cache(
417
0
        convolution_op->weights_cache, weights_ptr, aligned_total_weights_size);
418
0
  }
419
420
0
  *zero_size = XNN_EXTRA_BYTES + (k_stride << log2_input_element_size);
421
0
  return xnn_status_success;
422
423
0
error:
424
0
  return status;
425
0
}
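The GEMM/IGEMM packing above rounds output channels up to nr and input channels up to kr*sr before sizing the per-group blob. A numeric sketch with hypothetical microkernel parameters (nr=8, kr=2, sr=1) and F32 elements; the two helpers stand in for the xnnpack/math.h round_up()/round_up_po2():

#include <assert.h>
#include <stddef.h>

static size_t round_up_sketch(size_t n, size_t q)     { return (n + q - 1) / q * q; }
static size_t round_up_po2_sketch(size_t n, size_t q) { return (n + q - 1) & ~(q - 1); }

int main(void) {
  // Hypothetical F32 IGEMM case: 3x3 kernel, 1 group, 17 input channels,
  // 20 output channels, nr=8, kr=2, sr=1, 4-byte filter and bias elements.
  const size_t kernel_size = 3 * 3;
  const size_t n_stride = round_up_sketch(20, 8);          // 24 output channels after padding
  const size_t k_stride = round_up_po2_sketch(17, 2 * 1);  // 18 input channels after padding
  const size_t packed_group_weights_size =
      ((kernel_size * k_stride << 2) + sizeof(float)) * n_stride;  // filters + bias per output channel
  assert(n_stride == 24 && k_stride == 18);
  assert(packed_group_weights_size == (9 * 18 * 4 + sizeof(float)) * 24);  // 15648 bytes for the single group
  return 0;
}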
426
427
static enum xnn_status create_convolution2d_nhwc(
428
    uint32_t input_padding_top,
429
    uint32_t input_padding_right,
430
    uint32_t input_padding_bottom,
431
    uint32_t input_padding_left,
432
    uint32_t kernel_height,
433
    uint32_t kernel_width,
434
    uint32_t subsampling_height,
435
    uint32_t subsampling_width,
436
    uint32_t dilation_height,
437
    uint32_t dilation_width,
438
    uint32_t groups,
439
    size_t group_input_channels,
440
    size_t group_output_channels,
441
    size_t input_channel_stride,
442
    size_t output_channel_stride,
443
    const void* kernel,
444
    const void* bias,
445
    uint32_t flags,
446
    uint32_t log2_input_element_size,
447
    uint32_t log2_filter_element_size,
448
    uint32_t bias_element_size,
449
    xnn_pack_vmulcaddc_w_fn pack_vmulcaddc_w,
450
    xnn_pack_dwconv_hwg_w_fn pack_dwconv_hwg_w,
451
    xnn_pack_dwconv_ghw_w_fn pack_dwconv_ghw_w,
452
    xnn_packw_gemm_goi_ukernel_fn pack_gemm_goi_w,
453
    xnn_pack_conv_kgo_w_fn pack_conv_kgo_w,
454
    xnn_pack_conv_goki_w_fn pack_conv_goki_w,
455
    const void* packing_params,
456
    int input_padding_byte,
457
    int packed_weights_padding_byte,
458
    size_t extra_weights_bytes,
459
    xnn_init_qs8_qc8w_scale_params_fn init_scale_params,
460
    const float* scale_params,
461
    const void* gemm_params,
462
    size_t gemm_params_size,
463
    const void* dwconv_params,
464
    size_t dwconv_params_size,
465
    const void* vmulcaddc_params,
466
    size_t vmulcaddc_params_size,
467
    const struct xnn_gemm_config* gemm_config,
468
    const struct xnn_dwconv_config* dwconv_ukernel,
469
    const struct xnn_vmulcaddc_config* vmulcaddc_config,
470
    struct jit_gemm_params* jit_gemm_params,
471
    bool linear_activation,
472
    bool relu_activation,
473
    enum xnn_operator_type operator_type,
474
    size_t num_post_operations,
475
    void* post_operation_params,
476
    xnn_code_cache_t code_cache,
477
    xnn_weights_cache_t weights_cache,
478
    xnn_operator_t* convolution_op_out)
479
0
{
480
0
  xnn_operator_t convolution_op = NULL;
481
0
  enum xnn_status status = xnn_status_uninitialized;
482
483
0
  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
484
0
    xnn_log_error(
485
0
      "failed to create %s operator: XNNPACK is not initialized",
486
0
      xnn_operator_type_to_string(operator_type));
487
0
    goto error;
488
0
  }
489
490
0
  status = xnn_status_invalid_parameter;
491
492
0
  if (kernel_width == 0 || kernel_height == 0) {
493
0
    xnn_log_error(
494
0
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero",
495
0
      xnn_operator_type_to_string(operator_type), kernel_width, kernel_height);
496
0
    goto error;
497
0
  }
498
499
0
  if (subsampling_width == 0 || subsampling_height == 0) {
500
0
    xnn_log_error(
501
0
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " subsampling: subsampling dimensions must be non-zero",
502
0
      xnn_operator_type_to_string(operator_type), subsampling_width, subsampling_height);
503
0
    goto error;
504
0
  }
505
506
0
  if (dilation_width == 0 || dilation_height == 0) {
507
0
    xnn_log_error(
508
0
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " dilation: dilation dimensions must be non-zero",
509
0
      xnn_operator_type_to_string(operator_type), dilation_width, dilation_height);
510
0
    goto error;
511
0
  }
512
513
0
  if (groups == 0) {
514
0
    xnn_log_error(
515
0
      "failed to create %s operator with %" PRIu32 " groups: number of groups must be non-zero",
516
0
      xnn_operator_type_to_string(operator_type), groups);
517
0
    goto error;
518
0
  }
519
520
0
  if (group_input_channels == 0) {
521
0
    xnn_log_error(
522
0
      "failed to create %s operator with %zu input channels per group: number of channels must be non-zero",
523
0
      xnn_operator_type_to_string(operator_type), group_input_channels);
524
0
    goto error;
525
0
  }
526
527
0
  if (group_output_channels == 0) {
528
0
    xnn_log_error(
529
0
      "failed to create %s operator with %zu output channels per group: number of channels must be non-zero",
530
0
      xnn_operator_type_to_string(operator_type), group_output_channels);
531
0
    goto error;
532
0
  }
533
534
0
  const size_t input_channels = groups * group_input_channels;
535
0
  if (input_channel_stride < input_channels) {
536
0
    xnn_log_error(
537
0
      "failed to create %s operator with input channel stride of %zu: "
538
0
      "stride must be at least as large as the number of input channels (%" PRIu32 "x%zu)",
539
0
      xnn_operator_type_to_string(operator_type),
540
0
      input_channel_stride, groups, group_input_channels);
541
0
    goto error;
542
0
  }
543
544
0
  const size_t output_channels = groups * group_output_channels;
545
0
  if (output_channel_stride < output_channels) {
546
0
    xnn_log_error(
547
0
      "failed to create %s operator with output channel stride of %zu: "
548
0
      "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
549
0
      xnn_operator_type_to_string(operator_type),
550
0
      output_channel_stride, groups, group_output_channels);
551
0
    goto error;
552
0
  }
553
554
0
  if ((flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) != 0 && group_input_channels != 1) {
555
0
    xnn_log_error(
556
0
      "failed to create depthwise %s operator with %zu input channels per group: "
557
0
      "depthwise convolution must have exactly 1 input channel per group",
558
0
      xnn_operator_type_to_string(operator_type), group_input_channels);
559
0
    goto error;
560
0
  }
561
562
0
  const bool any_padding = (input_padding_left | input_padding_top | input_padding_right | input_padding_bottom) != 0;
563
0
  if ((flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0) {
564
0
    if (any_padding) {
565
0
      xnn_log_error(
566
0
        "failed to create %s operator with %" PRIu32 "+%" PRIu32 "x%" PRIu32 "+%" PRIu32" padding: "
567
0
        "TensorFlow SAME padding can't be combined with explicit padding specification",
568
0
        xnn_operator_type_to_string(operator_type),
569
0
        input_padding_top, input_padding_left, input_padding_bottom, input_padding_right);
570
0
      goto error;
571
0
    }
572
0
  }
573
574
0
  status = xnn_status_out_of_memory;
575
576
0
  convolution_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
577
0
  if (convolution_op == NULL) {
578
0
    xnn_log_error(
579
0
      "failed to allocate %zu bytes for %s operator descriptor",
580
0
      sizeof(struct xnn_operator), xnn_operator_type_to_string(operator_type));
581
0
    goto error;
582
0
  }
583
584
0
  convolution_op->weights_cache = weights_cache;
585
0
  convolution_op->code_cache = code_cache;
586
587
0
  const size_t kernel_size = kernel_height * kernel_width;
588
589
0
  enum xnn_microkernel_type ukernel_type = xnn_microkernel_type_default;
590
0
  const bool unit_subsampling = (subsampling_width | subsampling_height) == 1;
591
0
  if (group_input_channels == 1 && group_output_channels == 1 && kernel_size == 1 && unit_subsampling && !any_padding && vmulcaddc_config != NULL) {
592
0
    ukernel_type = xnn_microkernel_type_vmulcaddc;
593
0
  } else if (group_input_channels == 1 && group_output_channels == 1 && dwconv_ukernel != NULL) {
594
0
    ukernel_type = xnn_microkernel_type_dwconv;
595
0
  } else if (kernel_size == 1 && unit_subsampling && !any_padding) {
596
0
    ukernel_type = xnn_microkernel_type_gemm;
597
0
  } else {
598
0
    ukernel_type = xnn_microkernel_type_igemm;
599
0
  }
600
0
  assert(ukernel_type != xnn_microkernel_type_default);
601
602
0
  if (num_post_operations != 0 && (ukernel_type != xnn_microkernel_type_gemm && ukernel_type != xnn_microkernel_type_igemm)) {
603
0
    xnn_log_error(
604
0
        "convolution with post operations not support for these parameters: "
605
0
        "kernel_size: %zu unit_subsampling: %d padding: %d, ukernel_type: %d",
606
0
        kernel_size, unit_subsampling, any_padding, ukernel_type);
607
0
    goto error;
608
0
  }
609
610
0
  size_t zero_size = 0;
611
0
  switch (ukernel_type) {
612
0
    case xnn_microkernel_type_vmulcaddc:
613
0
    {
614
0
      status = create_vmulcaddc_path(
615
0
          groups, kernel, bias, log2_filter_element_size, bias_element_size,
616
0
          pack_vmulcaddc_w, packing_params, packed_weights_padding_byte,
617
0
          vmulcaddc_params, vmulcaddc_params_size, vmulcaddc_config,
618
0
          operator_type, convolution_op);
619
0
      if (status != xnn_status_success) {
620
0
        goto error;
621
0
      }
622
0
      break;
623
0
    }
624
0
    case xnn_microkernel_type_dwconv:
625
0
    {
626
0
      status = create_dwconv_path(
627
0
          kernel_height, kernel_width,
628
0
          groups, kernel, bias, flags,
629
0
          log2_input_element_size, log2_filter_element_size, bias_element_size,
630
0
          pack_dwconv_hwg_w, pack_dwconv_ghw_w,
631
0
          packing_params, packed_weights_padding_byte, extra_weights_bytes,
632
0
          init_scale_params, scale_params,
633
0
          dwconv_params, dwconv_params_size, dwconv_ukernel,
634
0
          linear_activation, operator_type, &zero_size, convolution_op);
635
0
      if (status != xnn_status_success) {
636
0
        goto error;
637
0
      }
638
0
      break;
639
0
    }
640
0
    case xnn_microkernel_type_gemm:
641
0
    case xnn_microkernel_type_igemm:
642
0
    {
643
0
      status = create_gemm_or_igemm(
644
0
          ukernel_type, kernel_size,
645
0
          groups, group_input_channels, group_output_channels,
646
0
          kernel, bias, flags,
647
0
          log2_input_element_size, log2_filter_element_size, bias_element_size,
648
0
          pack_gemm_goi_w, pack_conv_kgo_w, pack_conv_goki_w, packing_params,
649
0
          packed_weights_padding_byte, extra_weights_bytes,
650
0
          init_scale_params, scale_params,
651
0
          gemm_params, gemm_params_size, gemm_config, jit_gemm_params,
652
0
          linear_activation, relu_activation,
653
0
          operator_type,
654
0
          num_post_operations, post_operation_params,
655
0
          convolution_op,
656
0
          &zero_size);
657
0
      if (status != xnn_status_success) {
658
0
        goto error;
659
0
      }
660
0
      break;
661
0
    }
662
0
    default:
663
0
      XNN_UNREACHABLE;
664
0
  }
665
666
0
  const bool tf_same_padding = (flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0 && kernel_size != 1;
667
0
  if (any_padding || tf_same_padding) {
668
0
    convolution_op->zero_buffer = xnn_allocate_simd_memory(zero_size);
669
0
    if (convolution_op->zero_buffer == NULL) {
670
0
      xnn_log_error(
671
0
        "failed to allocate %zu bytes for %s operator zero padding",
672
0
        zero_size, xnn_operator_type_to_string(operator_type));
673
0
      goto error;
674
0
    }
675
0
    memset(convolution_op->zero_buffer, input_padding_byte, zero_size);
676
0
  }
677
678
0
  convolution_op->padding_top = input_padding_top;
679
0
  convolution_op->padding_right = input_padding_right;
680
0
  convolution_op->padding_bottom = input_padding_bottom;
681
0
  convolution_op->padding_left = input_padding_left;
682
683
0
  convolution_op->kernel_height = kernel_height;
684
0
  convolution_op->kernel_width = kernel_width;
685
0
  convolution_op->stride_height = subsampling_height;
686
0
  convolution_op->stride_width = subsampling_width;
687
0
  convolution_op->dilation_height = dilation_height;
688
0
  convolution_op->dilation_width = dilation_width;
689
0
  convolution_op->groups = groups;
690
0
  convolution_op->group_input_channels = group_input_channels;
691
0
  convolution_op->group_output_channels = group_output_channels;
692
0
  convolution_op->input_pixel_stride = input_channel_stride;
693
0
  convolution_op->output_pixel_stride = output_channel_stride;
694
695
0
  convolution_op->type = operator_type;
696
0
  convolution_op->ukernel.type = ukernel_type;
697
0
  convolution_op->flags = flags & ~XNN_FLAG_TENSORFLOW_SAME_PADDING;
698
0
  if (tf_same_padding) {
699
0
    convolution_op->flags |= XNN_FLAG_TENSORFLOW_SAME_PADDING;
700
0
  }
701
702
0
  convolution_op->state = xnn_run_state_invalid;
703
704
0
  *convolution_op_out = convolution_op;
705
0
  return xnn_status_success;
706
707
0
error:
708
0
  xnn_delete_operator(convolution_op);
709
0
  return status;
710
0
}
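The microkernel dispatch above can be summarized as a pure decision function. The sketch below mirrors that decision tree with boolean stand-ins for the config/ukernel availability checks (names are illustrative, not XNNPACK API):

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

enum ukernel_kind { VMULCADDC, DWCONV, GEMM, IGEMM };

static enum ukernel_kind pick_path(size_t kernel_size, size_t group_input_channels,
                                   size_t group_output_channels, bool unit_subsampling,
                                   bool any_padding, bool have_vmulcaddc, bool have_dwconv) {
  if (group_input_channels == 1 && group_output_channels == 1 && kernel_size == 1 &&
      unit_subsampling && !any_padding && have_vmulcaddc) {
    return VMULCADDC;  // per-channel multiply-add: 1x1 depthwise "scale + bias"
  } else if (group_input_channels == 1 && group_output_channels == 1 && have_dwconv) {
    return DWCONV;     // depthwise convolution path
  } else if (kernel_size == 1 && unit_subsampling && !any_padding) {
    return GEMM;       // 1x1 convolution degenerates to a plain GEMM
  }
  return IGEMM;        // everything else goes through indirect GEMM
}

int main(void) {
  assert(pick_path(1, 1, 1, true, false, true, true) == VMULCADDC);
  assert(pick_path(9, 1, 1, true, false, true, true) == DWCONV);    // 3x3 depthwise
  assert(pick_path(1, 64, 128, true, false, false, false) == GEMM); // 1x1 pointwise
  assert(pick_path(9, 64, 128, true, true, false, false) == IGEMM); // padded 3x3
  return 0;
}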
711
712
enum xnn_status xnn_create_convolution2d_nhwc_qu8(
713
    uint32_t input_padding_top,
714
    uint32_t input_padding_right,
715
    uint32_t input_padding_bottom,
716
    uint32_t input_padding_left,
717
    uint32_t kernel_height,
718
    uint32_t kernel_width,
719
    uint32_t subsampling_height,
720
    uint32_t subsampling_width,
721
    uint32_t dilation_height,
722
    uint32_t dilation_width,
723
    uint32_t groups,
724
    size_t group_input_channels,
725
    size_t group_output_channels,
726
    size_t input_channel_stride,
727
    size_t output_channel_stride,
728
    uint8_t input_zero_point,
729
    float input_scale,
730
    uint8_t kernel_zero_point,
731
    float kernel_scale,
732
    const uint8_t* kernel,
733
    const int32_t* bias,
734
    uint8_t output_zero_point,
735
    float output_scale,
736
    uint8_t output_min,
737
    uint8_t output_max,
738
    uint32_t flags,
739
    xnn_code_cache_t code_cache,
740
    xnn_weights_cache_t weights_cache,
741
    xnn_operator_t* convolution_op_out)
742
0
{
743
0
  if (input_scale <= 0.0f || !isnormal(input_scale)) {
744
0
    xnn_log_error(
745
0
      "failed to create %s operator with %.7g input scale: scale must be finite, normalized, and positive",
746
0
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8), input_scale);
747
0
    return xnn_status_invalid_parameter;
748
0
  }
749
750
0
  if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) {
751
0
    xnn_log_error(
752
0
      "failed to create %s operator with %.7g kernel scale: scale must be finite, normalized, and positive",
753
0
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8), kernel_scale);
754
0
    return xnn_status_invalid_parameter;
755
0
  }
756
757
0
  if (output_scale <= 0.0f || !isnormal(output_scale)) {
758
0
    xnn_log_error(
759
0
      "failed to create %s operator with %.7g output scale: scale must be finite, normalized, and positive",
760
0
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8), output_scale);
761
0
    return xnn_status_invalid_parameter;
762
0
  }
763
764
0
  if (output_min >= output_max) {
765
0
    xnn_log_error(
766
0
      "failed to create %s operator with [%" PRIu8 ", %" PRIu8 "] output range: range min must be below range max",
767
0
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8), output_min, output_max);
768
0
    return xnn_status_invalid_parameter;
769
0
  }
770
771
0
  const float requantization_scale = input_scale * kernel_scale / output_scale;
772
0
  if (requantization_scale >= 256.0f) {
773
0
    xnn_log_error(
774
0
      "failed to create %s operator with %.7g input scale, %.7g kernel scale, and %.7g output scale: "
775
0
      "requantization scale %.7g is greater or equal to 256.0",
776
0
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8),
777
0
      input_scale, kernel_scale, output_scale, requantization_scale);
778
0
    return xnn_status_unsupported_parameter;
779
0
  }
780
781
0
  const struct xnn_qu8_packing_params packing_params = {
782
0
    .input_zero_point = input_zero_point,
783
0
    .kernel_zero_point = kernel_zero_point,
784
0
  };
785
786
0
  const struct xnn_gemm_config* gemm_config = xnn_init_qu8_gemm_config();
787
0
  assert(gemm_config != NULL);
788
789
0
  union xnn_qu8_conv_minmax_params gemm_params;
790
0
  if XNN_LIKELY(gemm_config->init.qu8 != NULL) {
791
0
    gemm_config->init.qu8(&gemm_params,
792
0
      kernel_zero_point, requantization_scale, output_zero_point, output_min, output_max);
793
0
  }
794
795
0
  const struct xnn_dwconv_config* dwconv_config = xnn_init_qu8_dwconv_config();
796
0
  assert(dwconv_config != NULL);
797
798
0
  union xnn_qu8_conv_minmax_params dwconv_params;
799
0
  const struct xnn_dwconv_config* dwconv_ukernel =
800
0
    find_dwconv_ukernel(kernel_height * kernel_width, dwconv_config, XNN_MAX_QU8_DWCONV_UKERNELS);
801
0
  if XNN_LIKELY(dwconv_ukernel != NULL) {
802
0
    dwconv_ukernel->init.qu8(&dwconv_params,
803
0
      kernel_zero_point, requantization_scale, output_zero_point, output_min, output_max);
804
0
  }
805
806
0
  return create_convolution2d_nhwc(
807
0
    input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
808
0
    kernel_height, kernel_width,
809
0
    subsampling_height, subsampling_width,
810
0
    dilation_height, dilation_width,
811
0
    groups, group_input_channels, group_output_channels,
812
0
    input_channel_stride, output_channel_stride,
813
0
    kernel, bias, flags,
814
    /*log2_input_element_size=*/XNN_LOG2_SIZEOF_UINT8_T,
815
    /*log2_filter_element_size=*/XNN_LOG2_SIZEOF_UINT8_T,
816
0
    /*bias_element_size=*/sizeof(int32_t),
817
0
    (xnn_pack_vmulcaddc_w_fn) NULL,
818
0
    (xnn_pack_dwconv_hwg_w_fn) xnn_pack_qu8_dwconv_hwg_w,
819
0
    (xnn_pack_dwconv_ghw_w_fn) xnn_pack_qu8_dwconv_ghw_w,
820
0
    (xnn_packw_gemm_goi_ukernel_fn) gemm_config->pack_gemm_goi,
821
0
    (xnn_pack_conv_kgo_w_fn) xnn_pack_qu8_conv_kgo_w,
822
0
    (xnn_pack_conv_goki_w_fn) xnn_pack_qu8_conv_goki_w,
823
0
    /*packing_params=*/&packing_params,
824
0
    /*input_padding_byte=*/input_zero_point,
825
0
    /*packed_weights_padding_byte=*/kernel_zero_point,
826
0
    /*extra_weights_bytes=*/0,
827
    /*init_scale_params=*/NULL,
828
    /*scale_params=*/NULL,
829
0
    /*gemm_params=*/&gemm_params,
830
0
    /*gemm_params_size=*/sizeof(gemm_params),
831
0
    /*dwconv_params=*/&dwconv_params,
832
0
    /*dwconv_params_size=*/sizeof(dwconv_params),
833
    /*vmulcaddc_params=*/NULL,
834
0
    /*vmulcaddc_params_size=*/0,
835
0
    /*gemm_config=*/gemm_config,
836
0
    /*dwconv_ukernel=*/dwconv_ukernel,
837
    /*vmulcaddc_config=*/NULL,
838
    /*jit_gemm_params=*/NULL,
839
    /*linear_activation=*/false,
840
    /*relu_activation=*/false,
841
0
    /*operator_type=*/xnn_operator_type_convolution_nhwc_qu8,
842
0
    /*num_post_operations=*/0,
843
    /*post_operation_params=*/NULL,
844
0
    /*code_cache=*/code_cache,
845
0
    /*weights_cache=*/weights_cache,
846
0
    convolution_op_out);
847
0
}
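The QU8 path folds the three scales into a single requantization scale and rejects anything at or above 256.0. A tiny sketch with made-up but plausible quantization parameters:

#include <assert.h>
#include <math.h>

int main(void) {
  // Typical parameters: the combined scale is usually well below 1.
  const float input_scale = 0.0235f, kernel_scale = 0.0071f, output_scale = 0.0182f;
  const float requantization_scale = input_scale * kernel_scale / output_scale;
  assert(isnormal(requantization_scale) && requantization_scale < 256.0f);

  // A degenerate output scale pushes the combined scale past the 256.0 limit,
  // which the create function above rejects with xnn_status_unsupported_parameter.
  const float bad = 1.0f * 1.0f / 0.001f;  // 1000.0 >= 256.0
  assert(bad >= 256.0f);
  return 0;
}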
848
849
enum xnn_status xnn_create_convolution2d_nhwc_qs8(
850
    uint32_t input_padding_top,
851
    uint32_t input_padding_right,
852
    uint32_t input_padding_bottom,
853
    uint32_t input_padding_left,
854
    uint32_t kernel_height,
855
    uint32_t kernel_width,
856
    uint32_t subsampling_height,
857
    uint32_t subsampling_width,
858
    uint32_t dilation_height,
859
    uint32_t dilation_width,
860
    uint32_t groups,
861
    size_t group_input_channels,
862
    size_t group_output_channels,
863
    size_t input_channel_stride,
864
    size_t output_channel_stride,
865
    int8_t input_zero_point,
866
    float input_scale,
867
    float kernel_scale,
868
    const int8_t* kernel,
869
    const int32_t* bias,
870
    int8_t output_zero_point,
871
    float output_scale,
872
    int8_t output_min,
873
    int8_t output_max,
874
    uint32_t flags,
875
    xnn_code_cache_t code_cache,
876
    xnn_weights_cache_t weights_cache,
877
    xnn_operator_t* convolution_op_out)
878
180
{
879
180
  if (input_scale <= 0.0f || !isnormal(input_scale)) {
880
112
    xnn_log_error(
881
112
      "failed to create %s operator with %.7g input scale: scale must be finite, normalized, and positive",
882
112
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8), input_scale);
883
112
    return xnn_status_invalid_parameter;
884
112
  }
885
886
68
  if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) {
887
38
    xnn_log_error(
888
38
      "failed to create %s operator with %.7g kernel scale: scale must be finite, normalized, and positive",
889
38
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8), kernel_scale);
890
38
    return xnn_status_invalid_parameter;
891
38
  }
892
893
30
  if (output_scale <= 0.0f || !isnormal(output_scale)) {
894
9
    xnn_log_error(
895
9
      "failed to create %s operator with %.7g output scale: scale must be finite, normalized, and positive",
896
9
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8), output_scale);
897
9
    return xnn_status_invalid_parameter;
898
9
  }
899
900
21
  if (output_min >= output_max) {
901
0
    xnn_log_error(
902
0
      "failed to create %s operator with [%" PRId8 ", %" PRId8 "] output range: range min must be below range max",
903
0
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8), output_min, output_max);
904
0
    return xnn_status_invalid_parameter;
905
0
  }
906
907
21
  const float requantization_scale = input_scale * kernel_scale / output_scale;
908
21
  if (requantization_scale >= 256.0f) {
909
21
    xnn_log_error(
910
21
      "failed to create %s operator with %.7g input scale, %.7g kernel scale, and %.7g output scale: "
911
21
      "requantization scale %.7g is greater or equal to 256.0",
912
21
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8),
913
21
      input_scale, kernel_scale, output_scale, requantization_scale);
914
21
    return xnn_status_unsupported_parameter;
915
21
  }
916
917
0
  const struct xnn_qs8_packing_params packing_params = { .input_zero_point = input_zero_point, };
918
919
0
  const struct xnn_gemm_config* gemm_config = xnn_init_qs8_gemm_config();
920
0
  assert(gemm_config != NULL);
921
922
0
  union xnn_qs8_conv_minmax_params gemm_params;
923
0
  if XNN_LIKELY(gemm_config->init.qs8 != NULL) {
924
0
    gemm_config->init.qs8(&gemm_params,
925
0
      requantization_scale, output_zero_point, output_min, output_max);
926
0
  }
927
928
0
  const struct xnn_dwconv_config* dwconv_config = xnn_init_qs8_dwconv_config();
929
0
  assert(dwconv_config != NULL);
930
931
0
  union xnn_qs8_conv_minmax_params dwconv_params;
932
0
  const struct xnn_dwconv_config* dwconv_ukernel =
933
0
    find_dwconv_ukernel(kernel_height * kernel_width, dwconv_config, XNN_MAX_QS8_DWCONV_UKERNELS);
934
0
  if XNN_LIKELY(dwconv_ukernel != NULL) {
935
0
    dwconv_ukernel->init.qs8(&dwconv_params,
936
0
      requantization_scale, output_zero_point, output_min, output_max);
937
0
  }
938
939
0
  return create_convolution2d_nhwc(
940
0
    input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
941
0
    kernel_height, kernel_width,
942
0
    subsampling_height, subsampling_width,
943
0
    dilation_height, dilation_width,
944
0
    groups, group_input_channels, group_output_channels,
945
0
    input_channel_stride, output_channel_stride,
946
0
    kernel, bias, flags,
947
    /*log2_input_element_size=*/XNN_LOG2_SIZEOF_INT8_T,
948
    /*log2_filter_element_size=*/XNN_LOG2_SIZEOF_INT8_T,
949
0
    /*bias_element_size=*/sizeof(int32_t),
950
0
    (xnn_pack_vmulcaddc_w_fn) NULL,
951
0
    (xnn_pack_dwconv_hwg_w_fn) xnn_pack_qs8_dwconv_hwg_w,
952
0
    (xnn_pack_dwconv_ghw_w_fn) xnn_pack_qs8_dwconv_ghw_w,
953
0
    (xnn_packw_gemm_goi_ukernel_fn) gemm_config->pack_gemm_goi,
954
0
    (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w,
955
0
    (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w,
956
0
    /*packing_params=*/&packing_params,
957
0
    /*input_padding_byte=*/input_zero_point,
958
0
    /*packed_weights_padding_byte=*/0,
959
0
    /*extra_weights_bytes=*/0,
960
    /*init_scale_params=*/NULL,
961
    /*scale_params=*/NULL,
962
0
    /*gemm_params=*/&gemm_params,
963
0
    /*gemm_params_size=*/sizeof(gemm_params),
964
0
    /*dwconv_params=*/&dwconv_params,
965
0
    /*dwconv_params_size=*/sizeof(dwconv_params),
966
    /*vmulcaddc_params=*/NULL,
967
0
    /*vmulcaddc_params_size=*/0,
968
0
    /*gemm_config=*/gemm_config,
969
0
    /*dwconv_ukernel=*/dwconv_ukernel,
970
    /*vmulcaddc_config=*/NULL,
971
    /*jit_gemm_params=*/NULL,
972
    /*linear_activation=*/false,
973
    /*relu_activation=*/false,
974
0
    /*operator_type=*/xnn_operator_type_convolution_nhwc_qs8,
975
0
    /*num_post_operations=*/0,
976
    /*post_operation_params=*/NULL,
977
0
    /*code_cache=*/code_cache,
978
0
    /*weights_cache=*/weights_cache,
979
0
    convolution_op_out);
980
21
}
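A minimal, hypothetical usage sketch of the QS8 entry point above: it creates and immediately destroys a tiny 3x3 depthwise convolution. The shapes, quantization parameters, and zero-filled weights are made up for illustration, and error handling is reduced to asserts:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <xnnpack.h>

int main(void) {
  assert(xnn_initialize(/*allocator=*/NULL) == xnn_status_success);

  // Made-up 3x3 depthwise weights and biases for a 4-channel convolution.
  static const int8_t kernel[3 * 3 * 4] = {0};
  static const int32_t bias[4] = {0};

  xnn_operator_t conv = NULL;
  const enum xnn_status status = xnn_create_convolution2d_nhwc_qs8(
      /*input_padding_top=*/1, /*input_padding_right=*/1,
      /*input_padding_bottom=*/1, /*input_padding_left=*/1,
      /*kernel_height=*/3, /*kernel_width=*/3,
      /*subsampling_height=*/1, /*subsampling_width=*/1,
      /*dilation_height=*/1, /*dilation_width=*/1,
      /*groups=*/4, /*group_input_channels=*/1, /*group_output_channels=*/1,
      /*input_channel_stride=*/4, /*output_channel_stride=*/4,
      /*input_zero_point=*/0, /*input_scale=*/0.5f, /*kernel_scale=*/0.25f,
      kernel, bias,
      /*output_zero_point=*/0, /*output_scale=*/1.0f,
      /*output_min=*/-128, /*output_max=*/127,
      /*flags=*/XNN_FLAG_DEPTHWISE_CONVOLUTION,
      /*code_cache=*/NULL, /*weights_cache=*/NULL, &conv);
  assert(status == xnn_status_success);
  xnn_delete_operator(conv);
  return 0;
}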
981
982
enum xnn_status xnn_create_convolution2d_nhwc_qs8_qc8w(
983
    uint32_t input_padding_top,
984
    uint32_t input_padding_right,
985
    uint32_t input_padding_bottom,
986
    uint32_t input_padding_left,
987
    uint32_t kernel_height,
988
    uint32_t kernel_width,
989
    uint32_t subsampling_height,
990
    uint32_t subsampling_width,
991
    uint32_t dilation_height,
992
    uint32_t dilation_width,
993
    uint32_t groups,
994
    size_t group_input_channels,
995
    size_t group_output_channels,
996
    size_t input_channel_stride,
997
    size_t output_channel_stride,
998
    int8_t input_zero_point,
999
    float input_scale,
1000
    const float* kernel_scale,
1001
    const int8_t* kernel,
1002
    const int32_t* bias,
1003
    int8_t output_zero_point,
1004
    float output_scale,
1005
    int8_t output_min,
1006
    int8_t output_max,
1007
    uint32_t flags,
1008
    xnn_code_cache_t code_cache,
1009
    xnn_weights_cache_t weights_cache,
1010
    xnn_operator_t* convolution_op_out)
1011
0
{
1012
0
  if (input_scale <= 0.0f || !isnormal(input_scale)) {
1013
0
    xnn_log_error(
1014
0
      "failed to create %s operator with %.7g input scale: scale must be finite, normalized, and positive",
1015
0
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qc8), input_scale);
1016
0
    return xnn_status_invalid_parameter;
1017
0
  }
1018
1019
0
  for (size_t output_channel = 0; output_channel < groups * group_output_channels; output_channel++) {
1020
0
    if (kernel_scale[output_channel] <= 0.0f || !isnormal(kernel_scale[output_channel])) {
1021
0
      xnn_log_error(
1022
0
        "failed to create %s operator with %.7g kernel scale in output channel #%zu: "
1023
0
        "scale must be finite, normalized, and positive",
1024
0
        xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qc8), kernel_scale[output_channel],
1025
0
        output_channel);
1026
0
      return xnn_status_invalid_parameter;
1027
0
    }
1028
0
  }
1029
1030
0
  if (output_scale <= 0.0f || !isnormal(output_scale)) {
1031
0
    xnn_log_error(
1032
0
      "failed to create %s operator with %.7g output scale: scale must be finite, normalized, and positive",
1033
0
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qc8), output_scale);
1034
0
    return xnn_status_invalid_parameter;
1035
0
  }
1036
1037
0
  if (output_min >= output_max) {
1038
0
    xnn_log_error(
1039
0
      "failed to create %s operator with [%" PRId8 ", %" PRId8 "] output range: range min must be below range max",
1040
0
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qc8), output_min, output_max);
1041
0
    return xnn_status_invalid_parameter;
1042
0
  }
1043
1044
0
  float* requantization_scale = xnn_allocate_simd_memory(groups * group_output_channels * sizeof(float));
1045
0
  if (requantization_scale == NULL) {
1046
0
    xnn_log_error(
1047
0
      "failed to allocate %zu bytes for %s operator packed weights",
1048
0
      groups * group_output_channels * sizeof(float),
1049
0
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qc8));
1050
0
    return xnn_status_out_of_memory;
1051
0
  }
1052
0
  for (size_t output_channel = 0; output_channel < groups * group_output_channels; output_channel++) {
1053
0
    requantization_scale[output_channel] = input_scale * kernel_scale[output_channel] / output_scale;
1054
0
    if (requantization_scale[output_channel] >= 256.0f) {
1055
0
      xnn_log_error(
1056
0
        "failed to create %s operator with %.7g input scale, %.7g kernel scale, and %.7g output scale in output channel #%zu: "
1057
0
        "requantization scale %.7g is greater or equal to 256.0",
1058
0
        xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qc8),
1059
0
        input_scale, kernel_scale[output_channel], output_scale,
1060
0
        output_channel, requantization_scale[output_channel]);
1061
1062
0
      xnn_release_simd_memory(requantization_scale);
1063
0
      return xnn_status_unsupported_parameter;
1064
0
    }
1065
0
  }
1066
1067
0
  const struct xnn_qs8_packing_params packing_params = { .input_zero_point = input_zero_point, };
1068
1069
0
  const struct xnn_gemm_config* gemm_config = xnn_init_qs8_qc8w_gemm_config();
1070
0
  assert(gemm_config != NULL);
1071
1072
0
  union xnn_qs8_qc8w_conv_minmax_params gemm_params;
1073
0
  if XNN_LIKELY(gemm_config->init.qs8_qc8w != NULL) {
1074
0
    gemm_config->init.qs8_qc8w(&gemm_params,
1075
0
      output_zero_point, output_min, output_max);
1076
0
  }
1077
1078
0
  const struct xnn_dwconv_config* dwconv_config = xnn_init_qs8_qc8w_dwconv_config();
1079
0
  assert(dwconv_config != NULL);
1080
1081
0
  union xnn_qs8_qc8w_conv_minmax_params dwconv_params;
1082
0
  const struct xnn_dwconv_config* dwconv_ukernel =
1083
0
    find_dwconv_ukernel(kernel_height * kernel_width, dwconv_config, XNN_MAX_QC8_DWCONV_UKERNELS);
1084
0
  if XNN_LIKELY(dwconv_ukernel != NULL) {
1085
0
    dwconv_ukernel->init.qs8_qc8w(&dwconv_params,
1086
0
      output_zero_point, output_min, output_max);
1087
0
  }
1088
1089
0
  enum xnn_status status = create_convolution2d_nhwc(
1090
0
    input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
1091
0
    kernel_height, kernel_width,
1092
0
    subsampling_height, subsampling_width,
1093
0
    dilation_height, dilation_width,
1094
0
    groups, group_input_channels, group_output_channels,
1095
0
    input_channel_stride, output_channel_stride,
1096
0
    kernel, bias, flags,
1097
    /*log2_input_element_size=*/XNN_LOG2_SIZEOF_INT8_T,
1098
    /*log2_filter_element_size=*/XNN_LOG2_SIZEOF_INT8_T,
1099
0
    /*bias_element_size=*/sizeof(int32_t),
1100
0
    (xnn_pack_vmulcaddc_w_fn) NULL,
1101
0
    (xnn_pack_dwconv_hwg_w_fn) xnn_pack_qs8_dwconv_hwg_w,
1102
0
    (xnn_pack_dwconv_ghw_w_fn) xnn_pack_qs8_dwconv_ghw_w,
1103
0
    (xnn_packw_gemm_goi_ukernel_fn) gemm_config->pack_gemm_goi,
1104
0
    (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w,
1105
0
    (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w,
1106
0
    /*packing_params=*/&packing_params,
1107
0
    /*input_padding_byte=*/input_zero_point,
1108
0
    /*packed_weights_padding_byte=*/0,
1109
0
    /*extra_weights_bytes=*/sizeof(float),
1110
0
    /*init_scale_params=*/xnn_init_qs8_qc8w_scale_fp32_params,
1111
0
    /*scale_params=*/requantization_scale,
1112
0
    /*gemm_params=*/&gemm_params,
1113
0
    /*gemm_params_size=*/sizeof(gemm_params),
1114
0
    /*dwconv_params=*/&dwconv_params,
1115
0
    /*dwconv_params_size=*/sizeof(dwconv_params),
1116
    /*vmulcaddc_params=*/NULL,
1117
0
    /*vmulcaddc_params_size=*/0,
1118
0
    /*gemm_config=*/gemm_config,
1119
0
    /*dwconv_ukernel=*/dwconv_ukernel,
1120
    /*vmulcaddc_config=*/NULL,
1121
    /*jit_gemm_params=*/NULL,
1122
    /*linear_activation=*/false,
1123
    /*relu_activation=*/false,
1124
0
    /*operator_type=*/xnn_operator_type_convolution_nhwc_qc8,
1125
0
    /*num_post_operations=*/0,
1126
    /*post_operation_params=*/NULL,
1127
0
    /*code_cache=*/code_cache,
1128
0
    /*weights_cache=*/weights_cache,
1129
0
    convolution_op_out);
1130
1131
0
  xnn_release_simd_memory(requantization_scale);
1132
0
  return status;
1133
0
}
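The QC8 path validates a separate requantization scale per output channel; one out-of-range channel rejects the whole operator. A condensed sketch of those per-channel checks (the helper name and test values are made up):

#include <assert.h>
#include <math.h>
#include <stdbool.h>
#include <stddef.h>

static bool scales_are_valid(float input_scale, const float* kernel_scale,
                             float output_scale, size_t output_channels) {
  for (size_t c = 0; c < output_channels; c++) {
    // Each kernel scale must be finite, normal, and positive...
    if (kernel_scale[c] <= 0.0f || !isnormal(kernel_scale[c])) {
      return false;
    }
    // ...and the combined per-channel requantization scale must stay below 256.0.
    if (input_scale * kernel_scale[c] / output_scale >= 256.0f) {
      return false;
    }
  }
  return true;
}

int main(void) {
  const float per_channel[4] = { 0.004f, 0.009f, 0.015f, 0.002f };  // made-up kernel scales
  assert(scales_are_valid(0.02f, per_channel, 0.05f, 4));

  const float bad[2] = { 0.004f, 800.0f };  // one outlier channel poisons the whole operator
  assert(!scales_are_valid(0.02f, bad, 0.05f, 2));
  return 0;
}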
1134
1135
enum xnn_status xnn_create_convolution2d_nhwc_f16(
1136
    uint32_t input_padding_top,
1137
    uint32_t input_padding_right,
1138
    uint32_t input_padding_bottom,
1139
    uint32_t input_padding_left,
1140
    uint32_t kernel_height,
1141
    uint32_t kernel_width,
1142
    uint32_t subsampling_height,
1143
    uint32_t subsampling_width,
1144
    uint32_t dilation_height,
1145
    uint32_t dilation_width,
1146
    uint32_t groups,
1147
    size_t group_input_channels,
1148
    size_t group_output_channels,
1149
    size_t input_channel_stride,
1150
    size_t output_channel_stride,
1151
    const void* kernel,
1152
    const void* bias,
1153
    float output_min,
1154
    float output_max,
1155
    uint32_t flags,
1156
    xnn_code_cache_t code_cache,
1157
    xnn_weights_cache_t weights_cache,
1158
    xnn_operator_t* convolution_op_out)
1159
0
{
1160
0
  if (isnan(output_min)) {
1161
0
    xnn_log_error(
1162
0
      "failed to create %s operator with NaN output lower bound: lower bound must be non-NaN",
1163
0
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f16));
1164
0
    return xnn_status_invalid_parameter;
1165
0
  }
1166
1167
0
  if (isnan(output_max)) {
1168
0
    xnn_log_error(
1169
0
      "failed to create %s operator with NaN output upper bound: upper bound must be non-NaN",
1170
0
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f16));
1171
0
    return xnn_status_invalid_parameter;
1172
0
  }
1173
1174
0
  const uint16_t fp16_output_min = fp16_ieee_from_fp32_value(output_min);
1175
0
  const uint16_t fp16_output_max = fp16_ieee_from_fp32_value(output_max);
1176
0
  const float rounded_output_min = fp16_ieee_to_fp32_value(fp16_output_min);
1177
0
  const float rounded_output_max = fp16_ieee_to_fp32_value(fp16_output_max);
1178
0
  if (rounded_output_min >= rounded_output_max) {
1179
0
    xnn_log_error(
1180
0
      "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be below upper bound",
1181
0
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f16), rounded_output_min, rounded_output_max);
1182
0
    return xnn_status_invalid_parameter;
1183
0
  }
1184
1185
0
  const struct xnn_gemm_config* gemm_config = xnn_init_f16_gemm_config();
1186
0
  if (gemm_config == NULL) {
1187
0
    xnn_log_error("failed to create %s operator: unsupported hardware configuration",
1188
0
                  xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f16));
1189
0
    return xnn_status_unsupported_hardware;
1190
0
  }
1191
1192
0
  union xnn_f16_minmax_params gemm_params;
1193
0
  if XNN_LIKELY(gemm_config->init.f16 != NULL) {
1194
0
    gemm_config->init.f16(&gemm_params, fp16_output_min, fp16_output_max);
1195
0
  }
1196
1197
0
  const struct xnn_dwconv_config* dwconv_config = xnn_init_f16_dwconv_config();
1198
0
  if (dwconv_config == NULL) {
1199
0
    xnn_log_error("failed to create %s operator: unsupported hardware configuration",
1200
0
                  xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f16));
1201
0
    return xnn_status_unsupported_hardware;
1202
0
  }
1203
1204
0
  union xnn_f16_minmax_params dwconv_params;
1205
0
  const struct xnn_dwconv_config* dwconv_ukernel =
1206
0
    find_dwconv_ukernel(kernel_height * kernel_width, dwconv_config, XNN_MAX_F16_DWCONV_UKERNELS);
1207
0
  if XNN_LIKELY(dwconv_ukernel != NULL) {
1208
0
    dwconv_ukernel->init.f16(&dwconv_params, fp16_output_min, fp16_output_max);
1209
0
  }
1210
1211
0
  const struct xnn_vmulcaddc_config* vmulcaddc_config = xnn_init_f16_vmulcaddc_config();
1212
0
  if (vmulcaddc_config == NULL) {
1213
0
    xnn_log_error("failed to create %s operator: unsupported hardware configuration",
1214
0
                  xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f16));
1215
0
    return xnn_status_unsupported_hardware;
1216
0
  }
1217
1218
0
  union xnn_f16_minmax_params vmulcaddc_params;
1219
0
  if XNN_LIKELY(vmulcaddc_config->init.f16 != NULL) {
1220
0
    vmulcaddc_config->init.f16(&vmulcaddc_params, fp16_output_min, fp16_output_max);
1221
0
  }
1222
1223
0
  struct jit_gemm_params jit_gemm_params = {
1224
0
    .f16_minmax = {
1225
0
      .min = fp16_output_min,
1226
0
      .max = fp16_output_max
1227
0
    }
1228
0
  };
1229
1230
0
  xnn_pack_vmulcaddc_w_fn pack_vmulcaddc_w = (xnn_pack_vmulcaddc_w_fn) xnn_pack_f16_vmulcaddc_w;
1231
0
  xnn_pack_dwconv_hwg_w_fn pack_dwconv_hwg_w = (xnn_pack_dwconv_hwg_w_fn) xnn_pack_f16_dwconv_hwg_w;
1232
0
  xnn_pack_dwconv_ghw_w_fn pack_dwconv_ghw_w = (xnn_pack_dwconv_ghw_w_fn) xnn_pack_f16_dwconv_ghw_w;
1233
0
  xnn_packw_gemm_goi_ukernel_fn pack_gemm_goi_w = (xnn_packw_gemm_goi_ukernel_fn) gemm_config->pack_gemm_goi;
1234
0
  xnn_pack_conv_kgo_w_fn pack_conv_kgo_w = (xnn_pack_conv_kgo_w_fn) xnn_pack_f16_conv_kgo_w;
1235
0
  xnn_pack_conv_goki_w_fn pack_conv_goki_w = (xnn_pack_conv_goki_w_fn) xnn_pack_f16_conv_goki_w;
1236
0
  if (flags & XNN_FLAG_FP32_STATIC_WEIGHTS) {
1237
0
    pack_vmulcaddc_w = (xnn_pack_vmulcaddc_w_fn) xnn_pack_f32_to_f16_vmulcaddc_w;
1238
0
    pack_dwconv_hwg_w = (xnn_pack_dwconv_hwg_w_fn) xnn_pack_f32_to_f16_dwconv_hwg_w;
1239
0
    pack_dwconv_ghw_w = (xnn_pack_dwconv_ghw_w_fn) xnn_pack_f32_to_f16_dwconv_ghw_w;
1240
0
    pack_gemm_goi_w = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_f32_to_f16_gemm_goi_w;
1241
0
    pack_conv_kgo_w = (xnn_pack_conv_kgo_w_fn) xnn_pack_f32_to_f16_conv_kgo_w;
1242
0
    pack_conv_goki_w = (xnn_pack_conv_goki_w_fn) xnn_pack_f32_to_f16_conv_goki_w;
1243
0
  }
1244
1245
0
  return create_convolution2d_nhwc(
1246
0
    input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
1247
0
    kernel_height, kernel_width,
1248
0
    subsampling_height, subsampling_width,
1249
0
    dilation_height, dilation_width,
1250
0
    groups, group_input_channels, group_output_channels,
1251
0
    input_channel_stride, output_channel_stride,
1252
0
    kernel, bias, flags,
1253
    /*log2_input_element_size=*/XNN_LOG2_SIZEOF_HALF,
1254
    /*log2_filter_element_size=*/XNN_LOG2_SIZEOF_HALF,
1255
0
    /*bias_element_size=*/sizeof(uint16_t),
1256
0
    pack_vmulcaddc_w,
1257
0
    pack_dwconv_hwg_w,
1258
0
    pack_dwconv_ghw_w,
1259
0
    pack_gemm_goi_w,
1260
0
    pack_conv_kgo_w,
1261
0
    pack_conv_goki_w,
1262
    /*packing_params=*/NULL,
1263
0
    /*input_padding_byte=*/0,
1264
0
    /*packed_weights_padding_byte=*/0,
1265
0
    /*extra_weights_bytes=*/0,
1266
    /*init_scale_params=*/NULL,
1267
    /*scale_params=*/NULL,
1268
0
    /*gemm_params=*/&gemm_params,
1269
0
    /*gemm_params_size=*/sizeof(gemm_params),
1270
0
    /*dwconv_params=*/&dwconv_params,
1271
0
    /*dwconv_params_size=*/sizeof(dwconv_params),
1272
0
    /*vmulcaddc_params=*/&vmulcaddc_params,
1273
0
    /*vmulcaddc_params_size=*/sizeof(vmulcaddc_params),
1274
0
    /*gemm_config=*/gemm_config,
1275
0
    /*dwconv_ukernel=*/dwconv_ukernel,
1276
0
    /*vmulcaddc_config=*/vmulcaddc_config,
1277
0
    /*jit_gemm_params=*/&jit_gemm_params,
1278
    /*linear_activation=*/false,
1279
    /*relu_activation=*/false,
1280
0
    /*operator_type=*/xnn_operator_type_convolution_nhwc_f16,
1281
0
    /*num_post_operations=*/0,
1282
    /*post_operation_params=*/NULL,
1283
0
    /*code_cache=*/code_cache,
1284
0
    /*weights_cache=*/weights_cache,
1285
0
    convolution_op_out);
1286
0
}
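
The f16 path above swaps every packing function for its f32-to-f16 counterpart when XNN_FLAG_FP32_STATIC_WEIGHTS is set, so fp32 kernels and biases are converted to IEEE half precision as they are packed. Below is a minimal sketch of that per-element conversion, assuming the fp16 library already included by this file provides fp16_ieee_from_fp32_value; the helper name and values are illustrative, not the actual packing routine.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <fp16/fp16.h>  // fp16_ieee_from_fp32_value(); the same header this file includes

// Hypothetical helper: element-wise fp32 -> fp16 conversion of a weight array,
// i.e. the kind of conversion the xnn_pack_f32_to_f16_* packers are expected to perform.
static void convert_f32_weights_to_f16(const float* src, uint16_t* dst, size_t count) {
  for (size_t i = 0; i < count; i++) {
    dst[i] = fp16_ieee_from_fp32_value(src[i]);
  }
}

int main(void) {
  const float kernel_f32[4] = {1.0f, -0.5f, 0.25f, 2.0f};
  uint16_t kernel_f16[4];
  convert_f32_weights_to_f16(kernel_f32, kernel_f16, 4);
  for (size_t i = 0; i < 4; i++) {
    printf("%f -> 0x%04x\n", kernel_f32[i], (unsigned) kernel_f16[i]);
  }
  return 0;
}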
1287
1288
enum xnn_status xnn_create_convolution2d_nhwc_f32(
1289
    uint32_t input_padding_top,
1290
    uint32_t input_padding_right,
1291
    uint32_t input_padding_bottom,
1292
    uint32_t input_padding_left,
1293
    uint32_t kernel_height,
1294
    uint32_t kernel_width,
1295
    uint32_t subsampling_height,
1296
    uint32_t subsampling_width,
1297
    uint32_t dilation_height,
1298
    uint32_t dilation_width,
1299
    uint32_t groups,
1300
    size_t group_input_channels,
1301
    size_t group_output_channels,
1302
    size_t input_channel_stride,
1303
    size_t output_channel_stride,
1304
    const float* kernel,
1305
    const float* bias,
1306
    float output_min,
1307
    float output_max,
1308
    uint32_t flags,
1309
    xnn_code_cache_t code_cache,
1310
    xnn_weights_cache_t weights_cache,
1311
    xnn_operator_t* convolution_op_out)
1312
0
{
1313
0
  if (isnan(output_min)) {
1314
0
    xnn_log_error(
1315
0
      "failed to create %s operator with NaN output lower bound: lower bound must be non-NaN",
1316
0
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32));
1317
0
    return xnn_status_invalid_parameter;
1318
0
  }
1319
1320
0
  if (isnan(output_max)) {
1321
0
    xnn_log_error(
1322
0
      "failed to create %s operator with NaN output upper bound: upper bound must be non-NaN",
1323
0
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32));
1324
0
    return xnn_status_invalid_parameter;
1325
0
  }
1326
1327
0
  if (output_min >= output_max) {
1328
0
    xnn_log_error(
1329
0
      "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be below upper bound",
1330
0
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32), output_min, output_max);
1331
0
    return xnn_status_invalid_parameter;
1332
0
  }
1333
1334
0
  const bool linear_activation = (output_max == INFINITY) && (output_min == -output_max);
1335
0
  const bool relu_activation = (output_max == INFINITY) && (output_min == 0.0f);
1336
1337
0
  const struct xnn_gemm_config* gemm_config = xnn_init_f32_gemm_config();
1338
0
  if (gemm_config == NULL) {
1339
0
    xnn_log_error("failed to create %s operator: unsupported hardware configuration",
1340
0
                  xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32));
1341
0
    return xnn_status_unsupported_hardware;
1342
0
  }
1343
1344
0
  const struct xnn_gemm_config* gemm_nr2_config = xnn_init_f32_gemm_nr2_config();
1345
0
  if (gemm_nr2_config == NULL) {
1346
0
    xnn_log_error("failed to create %s operator: unsupported hardware configuration",
1347
0
                  xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32));
1348
0
    return xnn_status_unsupported_hardware;
1349
0
  }
1350
1351
0
  if (gemm_config->nr > group_output_channels) {
1352
    // Default micro-kernel is suboptimal. Try to find a better micro-kernel.
1353
1354
0
    if (gemm_nr2_config->minmax.igemm[gemm_config->mr].function[XNN_UARCH_DEFAULT] != NULL) {
1355
0
      gemm_config = gemm_nr2_config;
1356
0
    }
1357
0
  }
1358
1359
0
  union xnn_f32_minmax_params gemm_params;
1360
0
  if XNN_LIKELY(gemm_config->init.f32 != NULL) {
1361
0
    gemm_config->init.f32(&gemm_params, output_min, output_max);
1362
0
  }
1363
1364
0
  struct jit_gemm_params jit_gemm_params = {
1365
0
    .f32_minmax = {
1366
0
      .min = output_min,
1367
0
      .max = output_max
1368
0
    }
1369
0
  };
1370
1371
0
  const struct xnn_dwconv_config* dwconv_config = xnn_init_f32_dwconv_config();
1372
0
  if (dwconv_config == NULL) {
1373
0
    xnn_log_error("failed to create %s operator: unsupported hardware configuration",
1374
0
                  xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32));
1375
0
    return xnn_status_unsupported_hardware;
1376
0
  }
1377
1378
0
  union xnn_f32_minmax_params dwconv_params;
1379
0
  const struct xnn_dwconv_config* dwconv_ukernel =
1380
0
    find_dwconv_ukernel(kernel_height * kernel_width, dwconv_config, XNN_MAX_F32_DWCONV_UKERNELS);
1381
0
  if XNN_LIKELY(dwconv_ukernel != NULL) {
1382
0
    dwconv_ukernel->init.f32(&dwconv_params, output_min, output_max);
1383
0
  }
1384
1385
0
  const struct xnn_vmulcaddc_config* vmulcaddc_config = xnn_init_f32_vmulcaddc_config();
1386
0
  if (vmulcaddc_config == NULL) {
1387
0
    xnn_log_error("failed to create %s operator: unsupported hardware configuration",
1388
0
                  xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32));
1389
0
    return xnn_status_unsupported_hardware;
1390
0
  }
1391
1392
0
  union xnn_f32_minmax_params vmulcaddc_params;
1393
0
  if XNN_LIKELY(vmulcaddc_config->init.f32 != NULL) {
1394
0
    vmulcaddc_config->init.f32(&vmulcaddc_params, output_min, output_max);
1395
0
  }
1396
1397
0
  return create_convolution2d_nhwc(
1398
0
    input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
1399
0
    kernel_height, kernel_width,
1400
0
    subsampling_height, subsampling_width,
1401
0
    dilation_height, dilation_width,
1402
0
    groups, group_input_channels, group_output_channels,
1403
0
    input_channel_stride, output_channel_stride,
1404
0
    kernel, bias, flags,
1405
    /*log2_input_element_size=*/XNN_LOG2_SIZEOF_FLOAT,
1406
    /*log2_filter_element_size=*/XNN_LOG2_SIZEOF_FLOAT,
1407
0
    /*bias_element_size=*/sizeof(float),
1408
0
    (xnn_pack_vmulcaddc_w_fn) xnn_pack_f32_vmulcaddc_w,
1409
0
    (xnn_pack_dwconv_hwg_w_fn) xnn_pack_f32_dwconv_hwg_w,
1410
0
    (xnn_pack_dwconv_ghw_w_fn) xnn_pack_f32_dwconv_ghw_w,
1411
0
    (xnn_packw_gemm_goi_ukernel_fn) gemm_config->pack_gemm_goi,
1412
0
    (xnn_pack_conv_kgo_w_fn) xnn_pack_f32_conv_kgo_w,
1413
0
    (xnn_pack_conv_goki_w_fn) xnn_pack_f32_conv_goki_w,
1414
    /*packing_params=*/NULL,
1415
0
    /*input_padding_byte=*/0,
1416
0
    /*packed_weights_padding_byte=*/0,
1417
0
    /*extra_weights_bytes=*/0,
1418
    /*init_scale_params=*/NULL,
1419
    /*scale_params=*/NULL,
1420
0
    /*gemm_params=*/&gemm_params,
1421
0
    /*gemm_params_size=*/sizeof(gemm_params),
1422
0
    /*dwconv_params=*/&dwconv_params,
1423
0
    /*dwconv_params_size=*/sizeof(dwconv_params),
1424
0
    /*vmulcaddc_params=*/&vmulcaddc_params,
1425
0
    /*vmulcaddc_params_size=*/sizeof(vmulcaddc_params),
1426
0
    /*gemm_config=*/gemm_config,
1427
0
    /*dwconv_ukernel=*/dwconv_ukernel,
1428
0
    /*vmulcaddc_config=*/vmulcaddc_config,
1429
0
    /*jit_gemm_params=*/&jit_gemm_params,
1430
0
    /*linear_activation=*/linear_activation,
1431
0
    /*relu_activation=*/relu_activation,
1432
0
    /*operator_type=*/xnn_operator_type_convolution_nhwc_f32,
1433
0
    /*num_post_operations=*/0,
1434
    /*post_operation_params=*/NULL,
1435
0
    /*code_cache=*/code_cache,
1436
0
    /*weights_cache=*/weights_cache,
1437
0
    convolution_op_out);
1438
0
}
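
A hedged usage sketch of the creation API defined above: a 3x3, stride-1 convolution with 8 input and 16 output channels, clamped with output_min = 0 and output_max = +infinity, which the code above classifies as relu_activation. All concrete values (and the assumed OHWI kernel layout for groups == 1) are illustrative.

#include <math.h>    // INFINITY
#include <stddef.h>
#include <stdlib.h>
#include <xnnpack.h>

// Hypothetical wrapper for illustration only; xnn_initialize() must have succeeded beforehand.
int create_example(const float* kernel /* assumed [16][3][3][8] OHWI layout */, const float* bias /* [16] */) {
  xnn_operator_t conv = NULL;
  enum xnn_status status = xnn_create_convolution2d_nhwc_f32(
    /*input_padding_top=*/1, /*input_padding_right=*/1,
    /*input_padding_bottom=*/1, /*input_padding_left=*/1,
    /*kernel_height=*/3, /*kernel_width=*/3,
    /*subsampling_height=*/1, /*subsampling_width=*/1,
    /*dilation_height=*/1, /*dilation_width=*/1,
    /*groups=*/1, /*group_input_channels=*/8, /*group_output_channels=*/16,
    /*input_channel_stride=*/8, /*output_channel_stride=*/16,
    kernel, bias,
    /*output_min=*/0.0f, /*output_max=*/INFINITY,  // detected as relu_activation above
    /*flags=*/0,
    /*code_cache=*/NULL, /*weights_cache=*/NULL,
    &conv);
  if (status != xnn_status_success) {
    return -1;
  }
  xnn_delete_operator(conv);
  return 0;
}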
1439
1440
enum xnn_status xnn_create_fused_convolution2d_nhwc_f32(
1441
    uint32_t input_padding_top,
1442
    uint32_t input_padding_right,
1443
    uint32_t input_padding_bottom,
1444
    uint32_t input_padding_left,
1445
    uint32_t kernel_height,
1446
    uint32_t kernel_width,
1447
    uint32_t subsampling_height,
1448
    uint32_t subsampling_width,
1449
    uint32_t dilation_height,
1450
    uint32_t dilation_width,
1451
    uint32_t groups,
1452
    size_t group_input_channels,
1453
    size_t group_output_channels,
1454
    size_t input_channel_stride,
1455
    size_t output_channel_stride,
1456
    const float* kernel,
1457
    const float* bias,
1458
    size_t num_post_operations,
1459
    struct xnn_post_operation* post_operations,
1460
    uint32_t flags,
1461
    xnn_code_cache_t code_cache,
1462
    xnn_weights_cache_t weights_cache,
1463
    xnn_operator_t* convolution_op_out)
1464
0
{
1465
0
  if (code_cache == NULL) {
1466
0
    xnn_log_error(
1467
0
      "failed to create %s operator: convolution with post operations available only if JIT is enabled",
1468
0
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32));
1469
0
    return xnn_status_invalid_parameter;
1470
0
  }
1471
1472
  // Convolution is specified with linear activation; any clamping should be specified as a post operator.
1473
0
  const float output_max = INFINITY;
1474
0
  const float output_min = -INFINITY;
1475
1476
0
  struct jit_gemm_params jit_gemm_params = {
1477
0
    .f32_minmax = {
1478
0
      .min = output_min,
1479
0
      .max = output_max
1480
0
    },
1481
0
    .num_post_operations = num_post_operations,
1482
0
    .post_operations = post_operations,
1483
0
  };
1484
1485
0
  char* post_operation_params = allocate_and_initialize_post_operation_params(num_post_operations, post_operations);
1486
1487
0
  const struct xnn_gemm_config* gemm_config = xnn_init_f32_gemm_config();
1488
0
  if (gemm_config == NULL) {
1489
0
    xnn_log_error("failed to create %s operator: unsupported hardware configuration",
1490
0
                  xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32));
1491
0
    return xnn_status_unsupported_hardware;
1492
0
  }
1493
1494
0
  union xnn_f32_minmax_params gemm_params;
1495
0
  if XNN_LIKELY(gemm_config->init.f32 != NULL) {
1496
0
    gemm_config->init.f32(&gemm_params, output_min, output_max);
1497
0
  }
1498
1499
0
  const struct xnn_dwconv_config* dwconv_config = xnn_init_f32_dwconv_config();
1500
0
  if (dwconv_config == NULL) {
1501
0
    xnn_log_error("failed to create %s operator: unsupported hardware configuration",
1502
0
                  xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32));
1503
0
    return xnn_status_unsupported_hardware;
1504
0
  }
1505
1506
0
  union xnn_f32_minmax_params dwconv_params;
1507
0
  const struct xnn_dwconv_config* dwconv_ukernel =
1508
0
    find_dwconv_ukernel(kernel_height * kernel_width, dwconv_config, XNN_MAX_F32_DWCONV_UKERNELS);
1509
0
  if XNN_LIKELY(dwconv_ukernel != NULL) {
1510
0
    dwconv_ukernel->init.f32(&dwconv_params, output_min, output_max);
1511
0
  }
1512
1513
0
  const struct xnn_vmulcaddc_config* vmulcaddc_config = xnn_init_f32_vmulcaddc_config();
1514
0
  if (vmulcaddc_config == NULL) {
1515
0
    xnn_log_error("failed to create %s operator: unsupported hardware configuration",
1516
0
                  xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32));
1517
0
    return xnn_status_unsupported_hardware;
1518
0
  }
1519
1520
0
  union xnn_f32_minmax_params vmulcaddc_params;
1521
0
  if XNN_LIKELY(vmulcaddc_config->init.f32 != NULL) {
1522
0
    vmulcaddc_config->init.f32(&vmulcaddc_params, output_min, output_max);
1523
0
  }
1524
1525
0
  return create_convolution2d_nhwc(
1526
0
    input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
1527
0
    kernel_height, kernel_width,
1528
0
    subsampling_height, subsampling_width,
1529
0
    dilation_height, dilation_width,
1530
0
    groups, group_input_channels, group_output_channels,
1531
0
    input_channel_stride, output_channel_stride,
1532
0
    kernel, bias, flags,
1533
    /*log2_input_element_size=*/XNN_LOG2_SIZEOF_FLOAT,
1534
    /*log2_filter_element_size=*/XNN_LOG2_SIZEOF_FLOAT,
1535
0
    /*bias_element_size=*/sizeof(float),
1536
0
    (xnn_pack_vmulcaddc_w_fn) xnn_pack_f32_vmulcaddc_w,
1537
0
    (xnn_pack_dwconv_hwg_w_fn) xnn_pack_f32_dwconv_hwg_w,
1538
0
    (xnn_pack_dwconv_ghw_w_fn) xnn_pack_f32_dwconv_ghw_w,
1539
0
    (xnn_packw_gemm_goi_ukernel_fn) gemm_config->pack_gemm_goi,
1540
0
    (xnn_pack_conv_kgo_w_fn) xnn_pack_f32_conv_kgo_w,
1541
0
    (xnn_pack_conv_goki_w_fn) xnn_pack_f32_conv_goki_w,
1542
    /*packing_params=*/NULL,
1543
0
    /*input_padding_byte=*/0,
1544
0
    /*packed_weights_padding_byte=*/0,
1545
0
    /*extra_weights_bytes=*/0,
1546
    /*init_scale_params=*/NULL,
1547
    /*scale_params=*/NULL,
1548
0
    /*gemm_params=*/(void*) &gemm_params,
1549
0
    /*gemm_params_size=*/sizeof(gemm_params),
1550
0
    /*dwconv_params=*/&dwconv_params,
1551
0
    /*dwconv_params_size=*/sizeof(dwconv_params),
1552
0
    /*vmulcaddc_params=*/&vmulcaddc_params,
1553
0
    /*vmulcaddc_params_size=*/sizeof(vmulcaddc_params),
1554
0
    /*gemm_config=*/gemm_config,
1555
0
    /*dwconv_ukernel=*/dwconv_ukernel,
1556
0
    /*vmulcaddc_config=*/vmulcaddc_config,
1557
0
    /*jit_gemm_params=*/&jit_gemm_params,
1558
    /*linear_activation=*/true,
1559
    /*relu_activation=*/false,
1560
0
    /*operator_type=*/xnn_operator_type_convolution_nhwc_f32,
1561
0
    /*num_post_operations=*/num_post_operations,
1562
0
    /*post_operation_params=*/post_operation_params,
1563
0
    /*code_cache=*/code_cache,
1564
0
    /*weights_cache=*/weights_cache,
1565
0
    convolution_op_out);
1566
0
}
1567
1568
static inline bool input_size_changed(xnn_operator_t convolution_op)
1569
0
{
1570
0
  return convolution_op->input_height != convolution_op->last_input_height ||
1571
0
         convolution_op->input_width != convolution_op->last_input_width;
1572
0
}
1573
1574
static enum xnn_status reshape_gemm(
1575
    xnn_operator_t convolution_op,
1576
    uint32_t log2_input_element_size,
1577
    uint32_t log2_filter_element_size,
1578
    uint32_t extra_weights_elements_size,
1579
    uint32_t log2_output_element_size,
1580
    size_t* workspace_size,
1581
    size_t* workspace_alignment,
1582
    size_t num_threads)
1583
0
{
1584
  // Convolution maps directly to GEMM and doesn't use an indirection buffer.
1585
0
  const size_t batch_size = convolution_op->batch_size;
1586
1587
0
  const size_t output_height = convolution_op->output_height;
1588
0
  const size_t output_width = convolution_op->output_width;
1589
0
  const size_t output_size = output_height * output_width;
1590
0
  const size_t batch_output_size = batch_size * output_size;
1591
1592
0
  const size_t groups = convolution_op->groups;
1593
0
  const size_t group_input_channels = convolution_op->group_input_channels;
1594
0
  const size_t w_stride = extra_weights_elements_size +
1595
0
    (round_up_po2(group_input_channels, convolution_op->ukernel.gemm.kr * convolution_op->ukernel.gemm.sr) << log2_filter_element_size);
1596
0
  const size_t group_output_channels = convolution_op->group_output_channels;
1597
1598
0
  uint32_t mr = convolution_op->ukernel.gemm.mr;
1599
0
  const uint32_t nr = convolution_op->ukernel.gemm.nr;
1600
0
  struct xnn_hmp_gemm_ukernel *gemm_cases = convolution_op->ukernel.gemm.gemm_cases;
1601
1602
0
  #if XNN_ENABLE_GEMM_M_SPECIALIZATION
1603
0
    mr = xnn_get_heuristic_mr_gemm(batch_output_size, mr, nr, gemm_cases, convolution_op->code_cache != NULL);
1604
  #else
1605
    if (batch_output_size == 1 && gemm_cases[0].function[XNN_UARCH_DEFAULT] != NULL) {
1606
      mr = 1;
1607
    }
1608
  #endif
1609
1610
  #if XNN_PLATFORM_JIT
1611
    xnn_overwrite_gemm_cases_with_generated_code(convolution_op, gemm_cases, mr);
1612
  #endif  // XNN_PLATFORM_JIT
1613
0
  struct xnn_hmp_gemm_ukernel gemm_ukernel = gemm_cases[mr - 1];
1614
1615
0
  convolution_op->context.gemm = (struct gemm_context) {
1616
0
      .k_scaled = group_input_channels << log2_input_element_size,
1617
0
      .a_stride = convolution_op->input_pixel_stride << log2_input_element_size,
1618
0
      .ga_stride = group_input_channels << log2_input_element_size,
1619
0
      .packed_w = packed_weights(convolution_op),
1620
0
      .w_stride = w_stride,
1621
0
      .gw_stride = w_stride * round_up(group_output_channels, nr),
1622
0
      .cm_stride = convolution_op->output_pixel_stride << log2_output_element_size,
1623
0
      .cn_stride = nr << log2_output_element_size,
1624
0
      .gc_stride = group_output_channels << log2_output_element_size,
1625
0
      .log2_csize = log2_output_element_size,
1626
0
      .ukernel = gemm_ukernel,
1627
0
  };
1628
0
  memcpy(&convolution_op->context.gemm.params, &convolution_op->params, sizeof(convolution_op->context.gemm.params));
1629
0
  if (convolution_op->num_post_operation_params == 0) {
1630
0
    convolution_op->context.gemm.fused_params = &convolution_op->context.gemm.params;
1631
0
  } else {
1632
0
    convolution_op->context.gemm.fused_params = convolution_op->post_operation_params;
1633
0
  }
1634
1635
  #if XNN_TEST_MODE
1636
    const size_t nc = nr;
1637
  #else
1638
0
    size_t nc = group_output_channels;
1639
0
    if (num_threads > 1) {
1640
0
      const size_t num_other_tiles = groups * divide_round_up(batch_output_size, mr);
1641
0
      const size_t target_tiles_per_thread = 5;
1642
0
      const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
1643
0
      if (max_nc < nc) {
1644
0
        nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
1645
0
      }
1646
0
    }
1647
0
  #endif
1648
0
  if (groups == 1) {
1649
    #if XNN_MAX_UARCH_TYPES > 1
1650
      if (xnn_is_hmp_gemm_ukernel(gemm_ukernel)) {
1651
        convolution_op->compute[0].type = xnn_parallelization_type_2d_tile_2d_with_uarch;
1652
        convolution_op->compute[0].task_2d_tile_2d_with_id = (pthreadpool_task_2d_tile_2d_with_id_t) xnn_compute_hmp_gemm;
1653
      } else {
1654
        convolution_op->compute[0].type = xnn_parallelization_type_2d_tile_2d;
1655
        convolution_op->compute[0].task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_gemm;
1656
      }
1657
    #else
1658
0
      convolution_op->compute[0].type = xnn_parallelization_type_2d_tile_2d;
1659
0
      convolution_op->compute[0].task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_gemm;
1660
0
    #endif
1661
0
    convolution_op->compute[0].range[0] = batch_output_size;
1662
0
    convolution_op->compute[0].range[1] = group_output_channels;
1663
0
    convolution_op->compute[0].tile[0] = mr;
1664
0
    convolution_op->compute[0].tile[1] = nc;
1665
0
  } else {
1666
    #if XNN_MAX_UARCH_TYPES > 1
1667
      if (xnn_is_hmp_gemm_ukernel(gemm_ukernel)) {
1668
        convolution_op->compute[0].type = xnn_parallelization_type_3d_tile_2d_with_uarch;
1669
        convolution_op->compute[0].task_3d_tile_2d_with_id = (pthreadpool_task_3d_tile_2d_with_id_t) xnn_compute_hmp_grouped_gemm;
1670
      } else {
1671
        convolution_op->compute[0].type = xnn_parallelization_type_3d_tile_2d;
1672
        convolution_op->compute[0].task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_gemm;
1673
      }
1674
    #else
1675
0
      convolution_op->compute[0].type = xnn_parallelization_type_3d_tile_2d;
1676
0
      convolution_op->compute[0].task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_gemm;
1677
0
    #endif
1678
0
    convolution_op->compute[0].range[0] = groups;
1679
0
    convolution_op->compute[0].range[1] = batch_output_size;
1680
0
    convolution_op->compute[0].range[2] = group_output_channels;
1681
0
    convolution_op->compute[0].tile[0] = mr;
1682
0
    convolution_op->compute[0].tile[1] = nc;
1683
0
  }
1684
0
  convolution_op->state = xnn_run_state_needs_setup;
1685
1686
0
  *workspace_size = 0;
1687
0
  *workspace_alignment = 1;
1688
1689
0
  return xnn_status_success;
1690
0
}
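
When more than one thread is available, the code above shrinks the nc tile so that the total number of (mr x nc) tiles is large enough to spread across threads, targeting roughly five tiles per thread. A standalone sketch of that arithmetic, with divide_round_up() and min() re-implemented locally to mirror their xnnpack/math.h counterparts (shapes are illustrative):

#include <stddef.h>
#include <stdio.h>

// Local mirrors of divide_round_up()/min() from xnnpack/math.h.
static size_t divide_round_up(size_t n, size_t q) { return (n + q - 1) / q; }
static size_t min_size(size_t a, size_t b) { return a < b ? a : b; }

int main(void) {
  // Illustrative case: a 2x2 output (a single mr tile), 256 output channels, 8 threads.
  const size_t groups = 1;
  const size_t batch_output_size = 4;
  const size_t group_output_channels = 256;
  const size_t mr = 4, nr = 8;
  const size_t num_threads = 8;

  size_t nc = group_output_channels;
  if (num_threads > 1) {
    const size_t num_other_tiles = groups * divide_round_up(batch_output_size, mr);  // 1
    const size_t target_tiles_per_thread = 5;
    const size_t max_nc =
      divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);  // 7
    if (max_nc < nc) {
      nc = min_size(nc, divide_round_up(nc, max_nc * nr) * nr);  // 40: channels are the only parallelism here
    }
  }
  printf("nc tile = %zu\n", nc);  // prints 40
  return 0;
}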
1691
1692
static enum xnn_status reshape_igemm(
1693
    xnn_operator_t convolution_op,
1694
    uint32_t log2_input_element_size,
1695
    uint32_t log2_filter_element_size,
1696
    uint32_t extra_weights_elements_size,
1697
    uint32_t log2_output_element_size,
1698
    size_t* workspace_size,
1699
    size_t* workspace_alignment,
1700
    size_t num_threads)
1701
0
{
1702
0
  const size_t batch_size = convolution_op->batch_size;
1703
0
  const size_t input_height = convolution_op->input_height;
1704
0
  const size_t input_width = convolution_op->input_width;
1705
0
  const size_t groups = convolution_op->groups;
1706
0
  const size_t kernel_height = convolution_op->kernel_height;
1707
0
  const size_t kernel_width = convolution_op->kernel_width;
1708
0
  const size_t kernel_size = kernel_height * kernel_width;
1709
0
  const size_t output_height = convolution_op->output_height;
1710
0
  const size_t output_width = convolution_op->output_width;
1711
0
  const size_t output_size = output_height * output_width;
1712
1713
0
  uint32_t mr = convolution_op->ukernel.igemm.mr;
1714
0
  const uint32_t nr = convolution_op->ukernel.igemm.nr;
1715
0
  struct xnn_hmp_igemm_ukernel* igemm_cases = convolution_op->ukernel.igemm.igemm_cases;
1716
1717
0
  #if XNN_ENABLE_GEMM_M_SPECIALIZATION
1718
0
    mr = xnn_get_heuristic_mr_igemm(output_size, mr, nr, igemm_cases, convolution_op->code_cache != NULL);
1719
  #else
1720
    if (output_size == 1 && igemm_cases[0].function[XNN_UARCH_DEFAULT] != NULL) {
1721
      mr = 1;
1722
    }
1723
  #endif
1724
1725
  #if XNN_PLATFORM_JIT
1726
    xnn_overwrite_igemm_cases_with_generated_code(convolution_op, igemm_cases, mr);
1727
  #endif  // XNN_PLATFORM_JIT
1728
0
  struct xnn_hmp_igemm_ukernel igemm_ukernel = igemm_cases[mr - 1];
1729
1730
0
  const size_t tiled_output_size = round_up(output_size, mr);
1731
0
  const size_t indirection_buffer_size = sizeof(void*) * kernel_size * tiled_output_size;
1732
0
  size_t igemm_compute_index;
1733
0
  if (convolution_op->flags & XNN_FLAG_TRANSIENT_INDIRECTION_BUFFER) {
1734
0
    *workspace_size = indirection_buffer_size;
1735
0
    *workspace_alignment = XNN_ALLOCATION_ALIGNMENT;
1736
0
    igemm_compute_index = 1;
1737
1738
0
    convolution_op->context.conv2d_igemm_indirection_init = (struct conv2d_igemm_indirection_init_context) {
1739
0
      .zero_buffer = convolution_op->zero_buffer,
1740
0
      .input_pixel_stride = convolution_op->input_pixel_stride << log2_input_element_size,
1741
0
      .input_height = input_height,
1742
0
      .input_width = input_width,
1743
0
      .output_height = output_height,
1744
0
      .output_width = output_width,
1745
0
      .kernel_height = kernel_height,
1746
0
      .kernel_width = kernel_width,
1747
0
      .stride_height = convolution_op->stride_height,
1748
0
      .stride_width = convolution_op->stride_width,
1749
0
      .dilation_height = convolution_op->dilation_height,
1750
0
      .dilation_width = convolution_op->dilation_width,
1751
0
      .input_padding_top = convolution_op->padding_top,
1752
0
      .input_padding_left = convolution_op->padding_left,
1753
0
    };
1754
1755
0
    convolution_op->compute[0].type = xnn_parallelization_type_1d_tile_1d;
1756
0
    convolution_op->compute[0].context_offset = offsetof(struct xnn_operator, context.conv2d_igemm_indirection_init) - offsetof(struct xnn_operator, context);
1757
0
    convolution_op->compute[0].task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_conv2d_igemm_indirection;
1758
0
    convolution_op->compute[0].range[0] = tiled_output_size;
1759
0
    convolution_op->compute[0].tile[0] = mr;
1760
0
  } else {
1761
0
    *workspace_size = 0;
1762
0
    *workspace_alignment = 1;
1763
0
    igemm_compute_index = 0;
1764
1765
0
    if (input_size_changed(convolution_op)) {
1766
0
      const void** indirection_buffer =
1767
0
        (const void**) xnn_reallocate_memory((void*) convolution_op->indirection_buffer, indirection_buffer_size);
1768
0
      if (indirection_buffer == NULL) {
1769
0
        xnn_log_error(
1770
0
            "failed to allocate %zu bytes for %s operator indirection buffer",
1771
0
            indirection_buffer_size, xnn_operator_type_to_string(convolution_op->type));
1772
0
        return xnn_status_out_of_memory;
1773
0
      }
1774
0
      convolution_op->indirection_buffer = indirection_buffer;
1775
0
      xnn_log_debug("allocated %zu bytes for indirection buffer in %s operator",
1776
0
                    indirection_buffer_size, xnn_operator_type_to_string(convolution_op->type));
1777
1778
      // Set a dummy input first; the actual input offset is calculated in setup, once we have the input pointer.
1779
      // This offset must be aligned properly because inputs and input offsets need to be aligned.
1780
0
      convolution_op->input = (void*) ((uintptr_t) convolution_op->zero_buffer + XNN_ALLOCATION_ALIGNMENT);
1781
0
      convolution_op->last_input = convolution_op->input;
1782
0
      convolution_op->last_input_height = convolution_op->input_height;
1783
0
      convolution_op->last_input_width = convolution_op->input_width;
1784
1785
0
      xnn_indirection_init_conv2d(
1786
0
        /*output_tile_size=*/mr,
1787
0
        /*output_start=*/0,
1788
0
        /*output_end=*/tiled_output_size,
1789
0
        convolution_op->indirection_buffer,
1790
0
        convolution_op->input,
1791
0
        convolution_op->zero_buffer,
1792
0
        convolution_op->input_pixel_stride << log2_input_element_size,
1793
0
        convolution_op->input_height, convolution_op->input_width,
1794
0
        convolution_op->output_height, convolution_op->output_width,
1795
0
        convolution_op->kernel_height, convolution_op->kernel_width,
1796
0
        convolution_op->stride_height, convolution_op->stride_width,
1797
0
        convolution_op->dilation_height, convolution_op->dilation_width,
1798
0
        convolution_op->padding_top, convolution_op->padding_left);
1799
0
    }
1800
0
  }
1801
1802
1803
0
  const size_t group_input_channels = convolution_op->group_input_channels;
1804
0
  const size_t w_stride = extra_weights_elements_size +
1805
0
    (round_up_po2(group_input_channels, convolution_op->ukernel.igemm.kr * convolution_op->ukernel.igemm.sr) * kernel_size << log2_filter_element_size);
1806
0
  const size_t group_output_channels = convolution_op->group_output_channels;
1807
0
  convolution_op->context.igemm = (struct igemm_context) {
1808
0
      .ks = kernel_size,
1809
0
      .ks_scaled = kernel_size * mr * sizeof(void*),
1810
0
      .kc = group_input_channels << log2_input_element_size,
1811
0
      .w_stride = w_stride,
1812
0
      .indirect_a = convolution_op->indirection_buffer,
1813
0
      .zero = convolution_op->zero_buffer,
1814
0
      .packed_w = packed_weights(convolution_op),
1815
0
      .cm_stride = convolution_op->output_pixel_stride << log2_output_element_size,
1816
0
      .cn_stride = nr << log2_output_element_size,
1817
0
      .ga_stride = group_input_channels << log2_input_element_size,
1818
0
      .gw_stride = w_stride * round_up(group_output_channels, nr),
1819
0
      .gc_stride = group_output_channels << log2_output_element_size,
1820
0
      .ba_stride = input_height * input_width * convolution_op->input_pixel_stride << log2_input_element_size,
1821
0
      .bc_stride = output_size * convolution_op->output_pixel_stride << log2_output_element_size,
1822
0
      .log2_csize = log2_output_element_size,
1823
0
      .ukernel = igemm_ukernel,
1824
0
  };
1825
0
  memcpy(&convolution_op->context.igemm.params, &convolution_op->params, sizeof(convolution_op->context.igemm.params));
1826
1827
  #if XNN_TEST_MODE
1828
    const size_t nc = nr;
1829
  #else
1830
0
    size_t nc = group_output_channels;
1831
0
    if (num_threads > 1) {
1832
0
      const size_t num_other_tiles = groups * batch_size * divide_round_up(output_size, mr);
1833
0
      const size_t target_tiles_per_thread = 5;
1834
0
      const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
1835
0
      if (max_nc < nc) {
1836
0
        nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
1837
0
      }
1838
0
    }
1839
0
  #endif
1840
0
  if (groups == 1) {
1841
    #if XNN_MAX_UARCH_TYPES > 1
1842
      if (xnn_is_hmp_igemm_ukernel(igemm_ukernel)) {
1843
        if (batch_size > 1) {
1844
          convolution_op->compute[igemm_compute_index].type = xnn_parallelization_type_3d_tile_2d_with_uarch;
1845
          convolution_op->compute[igemm_compute_index].task_3d_tile_2d_with_id = (pthreadpool_task_3d_tile_2d_with_id_t) xnn_compute_batch_hmp_igemm;
1846
        } else {
1847
          convolution_op->compute[igemm_compute_index].type = xnn_parallelization_type_2d_tile_2d_with_uarch;
1848
          convolution_op->compute[igemm_compute_index].task_2d_tile_2d_with_id = (pthreadpool_task_2d_tile_2d_with_id_t) xnn_compute_hmp_igemm;
1849
        }
1850
      } else {
1851
        if (batch_size > 1) {
1852
          convolution_op->compute[igemm_compute_index].type = xnn_parallelization_type_3d_tile_2d;
1853
          convolution_op->compute[igemm_compute_index].task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_batch_igemm;
1854
        } else {
1855
          convolution_op->compute[igemm_compute_index].type = xnn_parallelization_type_2d_tile_2d;
1856
          convolution_op->compute[igemm_compute_index].task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_igemm;
1857
        }
1858
      }
1859
    #else
1860
0
      if (batch_size > 1) {
1861
0
        convolution_op->compute[igemm_compute_index].type = xnn_parallelization_type_3d_tile_2d;
1862
0
        convolution_op->compute[igemm_compute_index].task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_batch_igemm;
1863
0
      } else {
1864
0
        convolution_op->compute[igemm_compute_index].type = xnn_parallelization_type_2d_tile_2d;
1865
0
        convolution_op->compute[igemm_compute_index].task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_igemm;
1866
0
      }
1867
0
    #endif
1868
0
    if (batch_size > 1) {
1869
0
      convolution_op->compute[igemm_compute_index].range[0] = batch_size;
1870
0
      convolution_op->compute[igemm_compute_index].range[1] = output_size;
1871
0
      convolution_op->compute[igemm_compute_index].range[2] = group_output_channels;
1872
0
    } else {
1873
0
      convolution_op->compute[igemm_compute_index].range[0] = output_size;
1874
0
      convolution_op->compute[igemm_compute_index].range[1] = group_output_channels;
1875
0
    }
1876
0
    convolution_op->compute[igemm_compute_index].tile[0] = mr;
1877
0
    convolution_op->compute[igemm_compute_index].tile[1] = nc;
1878
0
  } else {
1879
    #if XNN_MAX_UARCH_TYPES > 1
1880
      if (xnn_is_hmp_igemm_ukernel(igemm_ukernel)) {
1881
        if (batch_size > 1) {
1882
          convolution_op->compute[igemm_compute_index].type = xnn_parallelization_type_4d_tile_2d_with_uarch;
1883
          convolution_op->compute[igemm_compute_index].task_4d_tile_2d_with_id = (pthreadpool_task_4d_tile_2d_with_id_t) xnn_compute_hmp_grouped_batch_igemm;
1884
        } else {
1885
          convolution_op->compute[igemm_compute_index].type = xnn_parallelization_type_3d_tile_2d_with_uarch;
1886
          convolution_op->compute[igemm_compute_index].task_3d_tile_2d_with_id = (pthreadpool_task_3d_tile_2d_with_id_t) xnn_compute_hmp_grouped_igemm;
1887
        }
1888
      } else {
1889
        if (batch_size > 1) {
1890
          convolution_op->compute[igemm_compute_index].type = xnn_parallelization_type_4d_tile_2d;
1891
          convolution_op->compute[igemm_compute_index].task_4d_tile_2d = (pthreadpool_task_4d_tile_2d_t) xnn_compute_grouped_batch_igemm;
1892
        } else {
1893
          convolution_op->compute[igemm_compute_index].type = xnn_parallelization_type_3d_tile_2d;
1894
          convolution_op->compute[igemm_compute_index].task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_igemm;
1895
        }
1896
      }
1897
    #else
1898
0
      if (batch_size > 1) {
1899
0
        convolution_op->compute[igemm_compute_index].type = xnn_parallelization_type_4d_tile_2d;
1900
0
        convolution_op->compute[igemm_compute_index].task_4d_tile_2d = (pthreadpool_task_4d_tile_2d_t) xnn_compute_grouped_batch_igemm;
1901
0
      } else {
1902
0
        convolution_op->compute[igemm_compute_index].type = xnn_parallelization_type_3d_tile_2d;
1903
0
        convolution_op->compute[igemm_compute_index].task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_igemm;
1904
0
      }
1905
0
    #endif
1906
0
    if (batch_size > 1) {
1907
0
      convolution_op->compute[igemm_compute_index].range[0] = batch_size;
1908
0
      convolution_op->compute[igemm_compute_index].range[1] = groups;
1909
0
      convolution_op->compute[igemm_compute_index].range[2] = output_size;
1910
0
      convolution_op->compute[igemm_compute_index].range[3] = group_output_channels;
1911
0
    } else {
1912
0
      convolution_op->compute[igemm_compute_index].range[0] = groups;
1913
0
      convolution_op->compute[igemm_compute_index].range[1] = output_size;
1914
0
      convolution_op->compute[igemm_compute_index].range[2] = group_output_channels;
1915
0
    }
1916
0
    convolution_op->compute[igemm_compute_index].tile[0] = mr;
1917
0
    convolution_op->compute[igemm_compute_index].tile[1] = nc;
1918
0
  }
1919
0
  convolution_op->state = xnn_run_state_needs_setup;
1920
1921
0
  return xnn_status_success;
1922
0
}
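
The igemm indirection buffer sized above holds one input-pixel pointer per (kernel element, output pixel) pair, with the output size rounded up to a multiple of mr so the last tile can read a full row of pointers. A small, self-contained sketch of that size computation, with round_up() re-implemented locally (illustrative shapes):

#include <stddef.h>
#include <stdio.h>

// Local mirror of round_up() from xnnpack/math.h: round n up to a multiple of q.
static size_t round_up(size_t n, size_t q) { return ((n + q - 1) / q) * q; }

int main(void) {
  const size_t kernel_height = 3, kernel_width = 3;
  const size_t output_height = 56, output_width = 56;
  const size_t mr = 4;

  const size_t kernel_size = kernel_height * kernel_width;       // 9
  const size_t output_size = output_height * output_width;       // 3136
  const size_t tiled_output_size = round_up(output_size, mr);    // 3136, already a multiple of 4
  const size_t indirection_buffer_size = sizeof(void*) * kernel_size * tiled_output_size;

  printf("indirection buffer: %zu bytes\n", indirection_buffer_size);  // 225792 on a 64-bit target
  return 0;
}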
1923
1924
static enum xnn_status reshape_dwconv(
1925
    xnn_operator_t convolution_op,
1926
    uint32_t log2_input_element_size,
1927
    uint32_t log2_accumulator_element_size,
1928
    uint32_t log2_output_element_size,
1929
    size_t* workspace_size,
1930
    size_t* workspace_alignment,
1931
    size_t num_threads)
1932
0
{
1933
0
  const size_t input_height = convolution_op->input_height;
1934
0
  const size_t input_width = convolution_op->input_width;
1935
0
  const size_t kernel_height = convolution_op->kernel_height;
1936
0
  const size_t kernel_width = convolution_op->kernel_width;
1937
0
  const size_t kernel_size = kernel_height * kernel_width;
1938
0
  const size_t output_height = convolution_op->output_height;
1939
0
  const size_t output_width = convolution_op->output_width;
1940
0
  const size_t step_width = convolution_op->dilation_width == 1 ?
1941
0
      min(convolution_op->stride_width, kernel_width) : kernel_width;
1942
0
  const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height;
1943
0
  const struct xnn_ukernel_dwconv dwconv_ukernel = convolution_op->ukernel.dwconv;
1944
0
  const bool is_unipass = dwconv_ukernel.last_tile == 0;
1945
0
  const size_t tile_size = dwconv_ukernel.tile_size;
1946
0
  size_t total_workspace_size = 0;
1947
1948
  // The micro-kernel will read (tile_size - kernel_size) elements past the end of the indirection buffer.
1949
0
  const size_t indirection_buffer_size =
1950
0
    round_up_po2(sizeof(void*) * (tile_size - kernel_size + output_height * step_height), XNN_ALLOCATION_ALIGNMENT);
1951
1952
0
  size_t dwconv_compute_index;
1953
0
  const bool is_transient_indirection_buffer = convolution_op->flags & XNN_FLAG_TRANSIENT_INDIRECTION_BUFFER;
1954
0
  if (is_transient_indirection_buffer) {
1955
0
    total_workspace_size += indirection_buffer_size;
1956
0
    dwconv_compute_index = 1;
1957
1958
0
    convolution_op->context.dwconv_indirection_init = (struct dwconv_indirection_init_context) {
1959
0
      .zero_buffer = convolution_op->zero_buffer,
1960
0
      .input_pixel_stride = convolution_op->input_pixel_stride << log2_input_element_size,
1961
0
      .input_height = input_height,
1962
0
      .input_width = input_width,
1963
0
      .output_height = output_height,
1964
0
      .output_width = output_width,
1965
0
      .kernel_height = kernel_height,
1966
0
      .kernel_width = kernel_width,
1967
0
      .stride_height = convolution_op->stride_height,
1968
0
      .stride_width = convolution_op->stride_width,
1969
0
      .dilation_height = convolution_op->dilation_height,
1970
0
      .dilation_width = convolution_op->dilation_width,
1971
0
      .input_padding_top = convolution_op->padding_top,
1972
0
      .input_padding_left = convolution_op->padding_left,
1973
0
      .step_height = step_height,
1974
0
      .step_width = step_width,
1975
0
      .tile_size = tile_size,
1976
0
    };
1977
1978
0
    convolution_op->compute[0].type = xnn_parallelization_type_1d_tile_1d;
1979
0
    convolution_op->compute[0].context_offset = offsetof(struct xnn_operator, context.dwconv_indirection_init) - offsetof(struct xnn_operator, context);
1980
0
    convolution_op->compute[0].task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_dwconv_indirection;
1981
0
    convolution_op->compute[0].range[0] = output_height;
1982
    #if XNN_TEST_MODE
1983
      convolution_op->compute[0].tile[0] = output_height;
1984
    #else
1985
0
      if (num_threads > 1) {
1986
0
        const size_t target_tiles_per_thread = 5;
1987
0
        convolution_op->compute[0].tile[0] = divide_round_up(output_height, num_threads * target_tiles_per_thread);
1988
0
      } else {
1989
0
        convolution_op->compute[0].tile[0] = output_height;
1990
0
      }
1991
0
    #endif
1992
0
  } else {
1993
0
    dwconv_compute_index = 0;
1994
1995
0
    if (input_size_changed(convolution_op)) {
1996
0
      const void** indirection_buffer =
1997
0
        (const void**) xnn_reallocate_memory(convolution_op->indirection_buffer, indirection_buffer_size);
1998
0
      if (indirection_buffer == NULL) {
1999
0
        xnn_log_error("failed to allocate %zu bytes for %s operator indirection buffer",
2000
0
          indirection_buffer_size, xnn_operator_type_to_string(convolution_op->type));
2001
0
        return xnn_status_out_of_memory;
2002
0
      }
2003
0
      convolution_op->indirection_buffer = indirection_buffer;
2004
0
      xnn_log_debug("allocated %zu bytes for indirection buffer in %s operator",
2005
0
        indirection_buffer_size, xnn_operator_type_to_string(convolution_op->type));
2006
2007
      #if XNN_TEST_MODE
2008
        memset(convolution_op->indirection_buffer, 0, indirection_buffer_size);
2009
      #endif
2010
2011
      // Set a dummy input first; the actual input offset is calculated in setup, once we have the input pointer.
2012
      // This offset must be aligned properly because inputs and input offsets need to be aligned.
2013
0
      convolution_op->input = (void* ) ((uintptr_t) convolution_op->zero_buffer + XNN_ALLOCATION_ALIGNMENT);
2014
0
      convolution_op->last_input = convolution_op->input;
2015
0
      convolution_op->last_input_height = convolution_op->input_height;
2016
0
      convolution_op->last_input_width = convolution_op->input_width;
2017
2018
0
      xnn_indirection_init_dwconv2d(
2019
0
        /*output_y_start=*/0, /*output_y_end=*/convolution_op->output_height,
2020
0
        convolution_op->indirection_buffer,
2021
0
        convolution_op->input,
2022
0
        convolution_op->input_pixel_stride << log2_input_element_size,
2023
0
        convolution_op->zero_buffer,
2024
0
        convolution_op->input_height, convolution_op->input_width,
2025
0
        convolution_op->output_height, convolution_op->output_width,
2026
0
        convolution_op->kernel_height, convolution_op->kernel_width,
2027
0
        convolution_op->stride_height, convolution_op->stride_width,
2028
0
        convolution_op->dilation_height, convolution_op->dilation_width,
2029
0
        convolution_op->padding_top, convolution_op->padding_left,
2030
0
        step_height, step_width, tile_size);
2031
0
    }
2032
0
  }
2033
2034
0
  const size_t groups = convolution_op->groups;
2035
0
  int32_t extra_input_advanced = is_unipass ? 0 : tile_size - convolution_op->ukernel.dwconv.last_tile;
2036
0
  convolution_op->context.dwconv = (struct dwconv_context) {
2037
0
      .kernel_size = kernel_size,
2038
0
      .indirect_input = convolution_op->indirection_buffer,
2039
0
      .indirect_input_width_stride = (kernel_height * step_width - extra_input_advanced) * sizeof(void*),
2040
0
      .indirect_input_height_stride = step_height * sizeof(void*),
2041
0
      .input_batch_stride = (input_height * input_width * convolution_op->input_pixel_stride) << log2_input_element_size,
2042
0
      .packed_weights = packed_weights(convolution_op),
2043
0
      .output_batch_stride = (output_height * output_width * convolution_op->output_pixel_stride) << log2_output_element_size,
2044
0
      .output_height_stride = (output_width * convolution_op->output_pixel_stride) << log2_output_element_size,
2045
0
      .output_height = output_height,
2046
0
      .output_width = output_width,
2047
0
      .groups = groups,
2048
0
      .zero = convolution_op->zero_buffer,
2049
0
      .output_increment = (convolution_op->output_pixel_stride - groups) << log2_output_element_size,
2050
0
  };
2051
0
  memcpy(&convolution_op->context.dwconv.params, &convolution_op->params, sizeof(convolution_op->context.dwconv.params));
2052
2053
0
  const size_t batch_size = convolution_op->batch_size;
2054
0
  convolution_op->compute[dwconv_compute_index].range[0] = batch_size;
2055
0
  convolution_op->compute[dwconv_compute_index].range[1] = output_height;
2056
0
  convolution_op->state = xnn_run_state_needs_setup;
2057
2058
0
  if (is_unipass) {
2059
0
    convolution_op->compute[dwconv_compute_index].type = xnn_parallelization_type_2d;
2060
0
    convolution_op->compute[dwconv_compute_index].task_2d = (pthreadpool_task_2d_t) xnn_compute_dwconv_unipass;
2061
0
    convolution_op->context.dwconv.unipass_ukernel = convolution_op->ukernel.dwconv.unipass_fn;
2062
0
  } else {
2063
0
    const size_t buffer_size =
2064
0
      round_up_po2(
2065
0
        (groups + (XNN_MULTIPASS_EXTRA_BYTES >> log2_input_element_size)) << log2_accumulator_element_size,
2066
0
        XNN_ALLOCATION_ALIGNMENT);
2067
0
    convolution_op->context.dwconv.buffer_size = buffer_size;
2068
0
    if (is_transient_indirection_buffer) {
2069
0
      convolution_op->context.dwconv.multipass_buffer_offset = indirection_buffer_size;
2070
0
    }
2071
0
    const bool use_threads_workspace_size = num_threads < batch_size * output_height;
2072
0
    if (use_threads_workspace_size) {
2073
0
      convolution_op->compute[dwconv_compute_index].type = xnn_parallelization_type_2d_with_thread;
2074
0
      convolution_op->compute[dwconv_compute_index].task_2d_with_thread =
2075
0
        (pthreadpool_task_2d_with_thread_t) xnn_compute_dwconv_multipass_with_thread;
2076
0
      total_workspace_size += num_threads * buffer_size;
2077
0
    } else {
2078
0
      convolution_op->compute[dwconv_compute_index].type = xnn_parallelization_type_2d;
2079
0
      convolution_op->compute[dwconv_compute_index].task_2d =
2080
0
        (pthreadpool_task_2d_t) xnn_compute_dwconv_multipass;
2081
0
      total_workspace_size += batch_size * output_height * buffer_size;
2082
0
    }
2083
2084
0
    convolution_op->context.dwconv.multipass_ukernel = convolution_op->ukernel.dwconv.multipass_fn;
2085
0
  }
2086
2087
0
  *workspace_size = total_workspace_size;
2088
0
  *workspace_alignment = total_workspace_size == 0 ? 1 : XNN_ALLOCATION_ALIGNMENT;
2089
2090
0
  return xnn_status_success;
2091
0
}
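
Tying the dwconv geometry above together: when there is no horizontal dilation, step_width collapses to min(stride, kernel width); step_height is the number of indirection entries contributed per output row; and the buffer carries (tile_size - kernel_size) trailing entries that the micro-kernel may read past the last output row. A hedged sketch of those computations, with min()/round_up_po2() re-implemented locally and XNN_ALLOCATION_ALIGNMENT assumed to be 64 bytes (illustrative shapes):

#include <stddef.h>
#include <stdio.h>

// Local mirrors of min()/round_up_po2() from xnnpack/math.h (q must be a power of 2).
static size_t min_size(size_t a, size_t b) { return a < b ? a : b; }
static size_t round_up_po2(size_t n, size_t q) { return (n + q - 1) & ~(q - 1); }

int main(void) {
  // Illustrative 3x3, stride-2 depthwise convolution producing a 56x56 output.
  const size_t kernel_height = 3, kernel_width = 3;
  const size_t stride_width = 2, dilation_width = 1;
  const size_t output_height = 56, output_width = 56;
  const size_t tile_size = 9;   // assumed unipass ukernel whose primary tile covers the whole 3x3 kernel
  const size_t alignment = 64;  // assumed value of XNN_ALLOCATION_ALIGNMENT

  const size_t kernel_size = kernel_height * kernel_width;
  const size_t step_width = dilation_width == 1 ? min_size(stride_width, kernel_width) : kernel_width;
  const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height;
  const size_t indirection_buffer_size =
    round_up_po2(sizeof(void*) * (tile_size - kernel_size + output_height * step_height), alignment);

  printf("step_width=%zu step_height=%zu indirection=%zu bytes\n",
         step_width, step_height, indirection_buffer_size);  // 2, 339, 151872 on a 64-bit target
  return 0;
}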
2092
2093
static enum xnn_status reshape_vmulcaddc(
2094
  xnn_operator_t convolution_op,
2095
  uint32_t log2_input_element_size,
2096
  uint32_t log2_output_element_size,
2097
  size_t* workspace_size,
2098
  size_t* workspace_alignment,
2099
  size_t num_threads)
2100
0
{
2101
0
  const size_t batch_output_size = convolution_op->batch_size * convolution_op->output_height * convolution_op->output_width;
2102
2103
0
  convolution_op->context.vmulcaddc = (struct vmulcaddc_context) {
2104
0
    .n = convolution_op->groups << log2_input_element_size,
2105
0
    .x_stride = convolution_op->input_pixel_stride << log2_input_element_size,
2106
0
    .w = packed_weights(convolution_op),
2107
0
    .y_stride = convolution_op->output_pixel_stride << log2_output_element_size,
2108
0
    .ukernel = convolution_op->ukernel.vmulcaddc.function,
2109
0
  };
2110
0
  memcpy(&convolution_op->context.vmulcaddc.params, &convolution_op->params,
2111
0
         sizeof(convolution_op->context.vmulcaddc.params));
2112
2113
#if XNN_TEST_MODE
2114
  const size_t mc = convolution_op->ukernel.vmulcaddc.mr;
2115
#else
2116
0
  size_t mc = batch_output_size;
2117
0
  if (num_threads > 1) {
2118
0
    const size_t target_tiles_per_thread = 5;
2119
0
    const size_t max_mc = divide_round_up(batch_output_size, num_threads * target_tiles_per_thread);
2120
0
    if (max_mc < mc) {
2121
0
      const uint32_t mr = convolution_op->ukernel.vmulcaddc.mr;
2122
0
      mc = min(mc, divide_round_up(mc, max_mc * mr) * mr);
2123
0
    }
2124
0
  }
2125
0
#endif
2126
0
  convolution_op->compute[0].type = xnn_parallelization_type_1d_tile_1d;
2127
0
  convolution_op->compute[0].task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_vmulcaddc;
2128
0
  convolution_op->compute[0].range[0] = batch_output_size;
2129
0
  convolution_op->compute[0].tile[0] = mc;
2130
0
  convolution_op->state = xnn_run_state_needs_setup;
2131
2132
0
  *workspace_size = 0;
2133
0
  *workspace_alignment = 1;
2134
2135
0
  return xnn_status_success;
2136
0
}
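
This vmulcaddc path covers the degenerate depthwise case (1x1 kernel, unit stride, no padding) where every output element is a per-channel multiply-add followed by the output clamp. A scalar reference of the assumed semantics follows; the real micro-kernels operate on packed, channel-tiled weights rather than separate scale/bias arrays, so this is only an illustration:

#include <stddef.h>

// Scalar reference of the channelwise multiply-add-clamp (assumed semantics of the vmulcaddc path).
static void vmulcaddc_reference(
    size_t rows, size_t channels,
    const float* x, size_t x_stride,   // input pixels, x_stride elements between rows
    const float* scale, const float* bias,
    float* y, size_t y_stride,         // output pixels, y_stride elements between rows
    float output_min, float output_max) {
  for (size_t r = 0; r < rows; r++) {
    for (size_t c = 0; c < channels; c++) {
      float v = x[r * x_stride + c] * scale[c] + bias[c];
      if (v < output_min) v = output_min;
      if (v > output_max) v = output_max;
      y[r * y_stride + c] = v;
    }
  }
}

int main(void) {
  const float x[2 * 3] = {1, 2, 3, 4, 5, 6};
  const float scale[3] = {2.0f, 0.5f, -1.0f};
  const float bias[3] = {0.0f, 1.0f, 10.0f};
  float y[2 * 3];
  vmulcaddc_reference(2, 3, x, 3, scale, bias, y, 3, 0.0f, 6.0f);
  return 0;
}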
2137
2138
static enum xnn_status reshape_convolution2d_nhwc(
2139
  xnn_operator_t convolution_op,
2140
  enum xnn_operator_type expected_operator_type,
2141
  size_t batch_size,
2142
  size_t input_height,
2143
  size_t input_width,
2144
  uint32_t log2_input_element_size,
2145
  uint32_t log2_filter_element_size,
2146
  uint32_t log2_accumulator_element_size,
2147
  uint32_t extra_weights_elements_size,
2148
  uint32_t log2_output_element_size,
2149
  size_t* workspace_size,
2150
  size_t* workspace_alignment,
2151
  size_t* output_height_out,
2152
  size_t* output_width_out,
2153
  pthreadpool_t threadpool)
2154
0
{
2155
0
  if (convolution_op->type != expected_operator_type) {
2156
0
    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
2157
0
      xnn_operator_type_to_string(expected_operator_type),
2158
0
      xnn_operator_type_to_string(convolution_op->type));
2159
0
    return xnn_status_invalid_parameter;
2160
0
  }
2161
0
  convolution_op->state = xnn_run_state_invalid;
2162
2163
0
  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
2164
0
    xnn_log_error("failed to setup %s operator: XNNPACK is not initialized",
2165
0
      xnn_operator_type_to_string(convolution_op->type));
2166
0
    return xnn_status_uninitialized;
2167
0
  }
2168
2169
0
  if (input_width == 0 || input_height == 0) {
2170
0
    xnn_log_error(
2171
0
      "failed to setup %s operator with %zux%zu input: input dimensions must be non-zero",
2172
0
      xnn_operator_type_to_string(convolution_op->type), input_width, input_height);
2173
0
    return xnn_status_invalid_parameter;
2174
0
  }
2175
2176
0
  if (batch_size == 0) {
2177
0
    convolution_op->state = xnn_run_state_skip;
2178
0
    return xnn_status_success;
2179
0
  }
2180
2181
0
  if (convolution_op->weights_cache != NULL && !xnn_weights_cache_is_finalized(convolution_op->weights_cache)) {
2182
0
    xnn_log_error("failed to setup %s operator: weights cache is not finalized",
2183
0
      xnn_operator_type_to_string(convolution_op->type));
2184
0
    return xnn_status_invalid_state;
2185
0
  }
2186
2187
0
  convolution_op->batch_size = batch_size;
2188
0
  convolution_op->input_height = input_height;
2189
0
  convolution_op->input_width = input_width;
2190
2191
0
  if (convolution_op->flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) {
2192
0
    convolution_op->output_height = compute_output_dimension_with_tf_same_padding(
2193
0
        input_height, convolution_op->stride_height);
2194
0
    convolution_op->output_width = compute_output_dimension_with_tf_same_padding(
2195
0
        input_width, convolution_op->stride_width);
2196
2197
0
    const uint32_t effective_kernel_height = (convolution_op->kernel_height - 1) * convolution_op->dilation_height + 1;
2198
0
    const uint32_t effective_kernel_width = (convolution_op->kernel_width - 1) * convolution_op->dilation_width + 1;
2199
0
    const size_t total_padding_height =
2200
0
      (convolution_op->output_height - 1) * convolution_op->stride_height + effective_kernel_height - input_height;
2201
0
    const size_t total_padding_width =
2202
0
      (convolution_op->output_width - 1) * convolution_op->stride_width + effective_kernel_width - input_width;
2203
0
    convolution_op->padding_top = total_padding_height / 2;
2204
0
    convolution_op->padding_left = total_padding_width / 2;
2205
0
    convolution_op->padding_bottom = total_padding_height - convolution_op->padding_top;
2206
0
    convolution_op->padding_right = total_padding_width - convolution_op->padding_left;
2207
0
  } else {
2208
0
    convolution_op->output_height = xnn_compute_convolution_output_dimension(
2209
0
        convolution_op->padding_top + input_height + convolution_op->padding_bottom,
2210
0
        convolution_op->kernel_height,
2211
0
        convolution_op->dilation_height,
2212
0
        convolution_op->stride_height);
2213
0
    convolution_op->output_width = xnn_compute_convolution_output_dimension(
2214
0
        convolution_op->padding_left + input_width + convolution_op->padding_right,
2215
0
        convolution_op->kernel_width,
2216
0
        convolution_op->dilation_width,
2217
0
        convolution_op->stride_width);
2218
0
  }
2219
2220
0
  if (output_height_out != NULL) {
2221
0
    *output_height_out = convolution_op->output_height;
2222
0
  }
2223
0
  if (output_width_out != NULL) {
2224
0
    *output_width_out = convolution_op->output_width;
2225
0
  }
2226
2227
0
  const size_t num_threads = pthreadpool_get_threads_count(threadpool);
2228
0
  switch (convolution_op->ukernel.type) {
2229
0
    case xnn_microkernel_type_gemm:
2230
0
      return reshape_gemm(
2231
0
          convolution_op,
2232
0
          log2_input_element_size, log2_filter_element_size, extra_weights_elements_size, log2_output_element_size,
2233
0
          workspace_size, workspace_alignment, num_threads);
2234
0
    case xnn_microkernel_type_igemm:
2235
0
      return reshape_igemm(
2236
0
          convolution_op,
2237
0
          log2_input_element_size, log2_filter_element_size, extra_weights_elements_size, log2_output_element_size,
2238
0
          workspace_size, workspace_alignment, num_threads);
2239
0
    case xnn_microkernel_type_dwconv:
2240
0
      return reshape_dwconv(
2241
0
          convolution_op,
2242
0
          log2_input_element_size, log2_accumulator_element_size, log2_output_element_size,
2243
0
          workspace_size, workspace_alignment, num_threads);
2244
0
    case xnn_microkernel_type_vmulcaddc:
2245
0
      return reshape_vmulcaddc(
2246
0
          convolution_op,
2247
0
          log2_input_element_size, log2_output_element_size,
2248
0
          workspace_size, workspace_alignment, num_threads);
2249
0
    default:
2250
0
      XNN_UNREACHABLE;
2251
0
  }
2252
0
}
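
For the XNN_FLAG_TENSORFLOW_SAME_PADDING branch above, the output extent is ceil(input / stride) and the padding is derived from that output extent at reshape time, with the top/left edge receiving the smaller half. A standalone sketch of the arithmetic (illustrative shape; the expression assumes the effective kernel reaches past the input, so the padding is non-negative):

#include <stddef.h>
#include <stdio.h>

static size_t divide_round_up(size_t n, size_t q) { return (n + q - 1) / q; }

int main(void) {
  // Illustrative: a 5-tap kernel, stride 2, dilation 1 along a 224-pixel dimension.
  const size_t input = 224, kernel = 5, stride = 2, dilation = 1;

  const size_t output = divide_round_up(input, stride);                           // 112
  const size_t effective_kernel = (kernel - 1) * dilation + 1;                    // 5
  const size_t total_padding = (output - 1) * stride + effective_kernel - input;  // 3
  const size_t padding_top = total_padding / 2;                                   // 1
  const size_t padding_bottom = total_padding - padding_top;                      // 2

  printf("output=%zu pad_top=%zu pad_bottom=%zu\n", output, padding_top, padding_bottom);
  return 0;
}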
2253
2254
enum xnn_status xnn_reshape_convolution2d_nhwc_qu8(
2255
    xnn_operator_t convolution_op,
2256
    size_t batch_size,
2257
    size_t input_height,
2258
    size_t input_width,
2259
    size_t* workspace_size,
2260
    size_t* workspace_alignment,
2261
    size_t* output_height_out,
2262
    size_t* output_width_out,
2263
    pthreadpool_t threadpool)
2264
0
{
2265
0
  return reshape_convolution2d_nhwc(
2266
0
    convolution_op, xnn_operator_type_convolution_nhwc_qu8,
2267
0
    batch_size, input_height, input_width,
2268
    /*log2_input_element_size=*/XNN_LOG2_SIZEOF_UINT8_T,
2269
    /*log2_filter_element_size=*/XNN_LOG2_SIZEOF_UINT8_T,
2270
    /*log2_accumulator_element_size=*/XNN_LOG2_SIZEOF_INT32_T,
2271
0
    /*extra_weights_elements_size=*/sizeof(int32_t),
2272
    /*log2_output_element_size=*/XNN_LOG2_SIZEOF_UINT8_T,
2273
0
    workspace_size, workspace_alignment,
2274
0
    output_height_out, output_width_out,
2275
0
    threadpool);
2276
0
}
2277
2278
enum xnn_status xnn_reshape_convolution2d_nhwc_qs8(
2279
    xnn_operator_t convolution_op,
2280
    size_t batch_size,
2281
    size_t input_height,
2282
    size_t input_width,
2283
    size_t* workspace_size,
2284
    size_t* workspace_alignment,
2285
    size_t* output_height_out,
2286
    size_t* output_width_out,
2287
    pthreadpool_t threadpool)
2288
0
{
2289
0
  return reshape_convolution2d_nhwc(
2290
0
    convolution_op, xnn_operator_type_convolution_nhwc_qs8,
2291
0
    batch_size, input_height, input_width,
2292
    /*log2_input_element_size=*/XNN_LOG2_SIZEOF_INT8_T,
2293
    /*log2_filter_element_size=*/XNN_LOG2_SIZEOF_INT8_T,
2294
    /*log2_accumulator_element_size=*/XNN_LOG2_SIZEOF_INT32_T,
2295
0
    /*extra_weights_elements_size=*/sizeof(int32_t),
2296
    /*log2_output_element_size=*/XNN_LOG2_SIZEOF_INT8_T,
2297
0
    workspace_size, workspace_alignment,
2298
0
    output_height_out, output_width_out,
2299
0
    threadpool);
2300
0
}
2301
2302
enum xnn_status xnn_reshape_convolution2d_nhwc_qs8_qc8w(
2303
    xnn_operator_t convolution_op,
2304
    size_t batch_size,
2305
    size_t input_height,
2306
    size_t input_width,
2307
    size_t* workspace_size,
2308
    size_t* workspace_alignment,
2309
    size_t* output_height_out,
2310
    size_t* output_width_out,
2311
    pthreadpool_t threadpool)
2312
0
{
2313
0
  return reshape_convolution2d_nhwc(
2314
0
    convolution_op, xnn_operator_type_convolution_nhwc_qc8,
2315
0
    batch_size, input_height, input_width,
2316
    /*log2_input_element_size=*/XNN_LOG2_SIZEOF_INT8_T,
2317
    /*log2_filter_element_size=*/XNN_LOG2_SIZEOF_INT8_T,
2318
    /*log2_accumulator_element_size=*/XNN_LOG2_SIZEOF_INT32_T,
2319
0
    /*extra_weights_elements_size=*/sizeof(int32_t) + sizeof(float),
2320
    /*log2_output_element_size=*/XNN_LOG2_SIZEOF_INT8_T,
2321
0
    workspace_size, workspace_alignment,
2322
0
    output_height_out, output_width_out,
2323
0
    threadpool);
2324
0
}
2325
2326
enum xnn_status xnn_reshape_convolution2d_nhwc_f16(
2327
    xnn_operator_t convolution_op,
2328
    size_t batch_size,
2329
    size_t input_height,
2330
    size_t input_width,
2331
    size_t* workspace_size,
2332
    size_t* workspace_alignment,
2333
    size_t* output_height_out,
2334
    size_t* output_width_out,
2335
    pthreadpool_t threadpool)
2336
0
{
2337
0
  return reshape_convolution2d_nhwc(
2338
0
    convolution_op, xnn_operator_type_convolution_nhwc_f16,
2339
0
    batch_size, input_height, input_width,
2340
    /*log2_input_element_size=*/XNN_LOG2_SIZEOF_HALF,
2341
    /*log2_filter_element_size=*/XNN_LOG2_SIZEOF_HALF,
2342
    /*log2_accumulator_element_size=*/XNN_LOG2_SIZEOF_HALF,
2343
0
    /*extra_weights_elements_size=*/sizeof(uint16_t),
2344
    /*log2_output_element_size=*/XNN_LOG2_SIZEOF_HALF,
2345
0
    workspace_size, workspace_alignment,
2346
0
    output_height_out, output_width_out,
2347
0
    threadpool);
2348
0
}
2349
2350
enum xnn_status xnn_reshape_convolution2d_nhwc_f32(
2351
    xnn_operator_t convolution_op,
2352
    size_t batch_size,
2353
    size_t input_height,
2354
    size_t input_width,
2355
    size_t* workspace_size,
2356
    size_t* workspace_alignment,
2357
    size_t* output_height_out,
2358
    size_t* output_width_out,
2359
    pthreadpool_t threadpool)
2360
0
{
2361
0
  return reshape_convolution2d_nhwc(
2362
0
    convolution_op, xnn_operator_type_convolution_nhwc_f32,
2363
0
    batch_size, input_height, input_width,
2364
    /*log2_input_element_size=*/XNN_LOG2_SIZEOF_FLOAT,
2365
    /*log2_filter_element_size=*/XNN_LOG2_SIZEOF_FLOAT,
2366
    /*log2_accumulator_element_size=*/XNN_LOG2_SIZEOF_FLOAT,
2367
0
    /*extra_weights_elements_size=*/sizeof(float),
2368
    /*log2_output_element_size=*/XNN_LOG2_SIZEOF_FLOAT,
2369
0
    workspace_size, workspace_alignment,
2370
0
    output_height_out, output_width_out,
2371
0
    threadpool);
2372
0
}
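
A hedged usage sketch of the f32 reshape entry point defined above, continuing the earlier create example: reshape is called once per input shape, returns the workspace requirements, and reports the computed output dimensions; a NULL threadpool runs single-threaded. Concrete values are illustrative.

#include <stddef.h>
#include <stdio.h>
#include <xnnpack.h>

// Hypothetical wrapper for illustration only.
int reshape_example(xnn_operator_t conv /* created by xnn_create_convolution2d_nhwc_f32 */) {
  size_t workspace_size = 0;
  size_t workspace_alignment = 0;
  size_t output_height = 0;
  size_t output_width = 0;
  enum xnn_status status = xnn_reshape_convolution2d_nhwc_f32(
    conv,
    /*batch_size=*/1,
    /*input_height=*/224,
    /*input_width=*/224,
    &workspace_size, &workspace_alignment,
    &output_height, &output_width,
    /*threadpool=*/NULL);
  if (status != xnn_status_success) {
    return -1;
  }
  printf("output %zux%zu, workspace %zu bytes (align %zu)\n",
         output_height, output_width, workspace_size, workspace_alignment);
  // The matching setup call (not shown in this section) binds the workspace,
  // input, and output pointers before the operator is run.
  return 0;
}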
2373
2374
static enum xnn_status setup_gemm(xnn_operator_t convolution_op)
2375
0
{
2376
0
  convolution_op->context.gemm.a = convolution_op->input;
2377
0
  convolution_op->context.gemm.c = convolution_op->output;
2378
0
  convolution_op->state = xnn_run_state_ready;
2379
2380
0
  return xnn_status_success;
2381
0
}
2382
2383
static enum xnn_status setup_igemm(
2384
    xnn_operator_t convolution_op,
2385
    void* workspace,
2386
    uint32_t log2_input_element_size)
2387
0
{
2388
0
  if (convolution_op->flags & XNN_FLAG_TRANSIENT_INDIRECTION_BUFFER) {
2389
0
    convolution_op->context.igemm.a_offset = (size_t) 0;
2390
0
    convolution_op->context.igemm.indirect_a = (const void**) workspace;
2391
0
    convolution_op->context.conv2d_igemm_indirection_init.indirection_buffer = (const void**) workspace;
2392
0
    convolution_op->context.conv2d_igemm_indirection_init.input = convolution_op->input;
2393
0
  } else {
2394
0
    convolution_op->context.igemm.a_offset = (size_t) ((uintptr_t) convolution_op->input - (uintptr_t) convolution_op->last_input);
2395
0
  }
2396
0
  convolution_op->context.igemm.c = convolution_op->output;
2397
0
  convolution_op->state = xnn_run_state_ready;
2398
2399
0
  return xnn_status_success;
2400
0
}
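// ---------------------------------------------------------------------------
// Illustrative aside: with a persistent indirection buffer, the indirect pointers
// were built against the input pointer seen at reshape time (last_input). Setup
// therefore does not rebuild the buffer; it records a single byte offset that the
// IGEMM path adds to every indirect pointer. A minimal sketch of the same idea,
// outside XNNPACK:
#include <stddef.h>
#include <stdint.h>

static inline size_t compute_a_offset(const void* new_input, const void* old_input) {
  // Same arithmetic as the a_offset assignment in setup_igemm above.
  return (size_t) ((uintptr_t) new_input - (uintptr_t) old_input);
}

static inline const void* retarget(const void* indirect_ptr, size_t a_offset) {
  // Applying the offset retargets a stale indirect pointer at the new input buffer.
  return (const void*) ((uintptr_t) indirect_ptr + a_offset);
}
// ---------------------------------------------------------------------------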
2401
2402
static enum xnn_status setup_dwconv(
2403
    xnn_operator_t convolution_op,
2404
    void* workspace,
2405
    uint32_t log2_input_element_size)
2406
0
{
2407
  #if XNN_TEST_MODE
2408
    // indirection buffer is only set at this time if it is persistent.
2409
    if (!(convolution_op->flags & XNN_FLAG_TRANSIENT_INDIRECTION_BUFFER)) {
2410
      const size_t kernel_height = convolution_op->kernel_height;
2411
      const size_t kernel_width = convolution_op->kernel_width;
2412
      const size_t kernel_size = kernel_height * kernel_width;
2413
      const size_t output_width = convolution_op->output_width;
2414
      const size_t step_width = convolution_op->dilation_width == 1 ?
2415
          min(convolution_op->stride_width, kernel_width) : kernel_width;
2416
      const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height;
2417
      const struct xnn_ukernel_dwconv dwconv_ukernel = convolution_op->ukernel.dwconv;
2418
      const size_t tile_size = dwconv_ukernel.tile_size;
2419
      const size_t indirection_buffer_size =
2420
        sizeof(void*) * (tile_size - kernel_size + convolution_op->output_height * step_height);
2421
2422
      // TODO(zhin): store step_height and step_width, this is already computed in create.
2423
      for (size_t i = 0; i < indirection_buffer_size / sizeof(void*); i++) {
2424
        // Indirection initialization should have set all indirection pointers, make sure none of them are NULL.
2425
        assert(convolution_op->indirection_buffer[i] != NULL);
2426
      }
2427
    }
2428
  #endif
2429
2430
0
  if (convolution_op->flags & XNN_FLAG_TRANSIENT_INDIRECTION_BUFFER) {
2431
0
    convolution_op->context.dwconv.input_offset = (size_t) 0;
2432
0
    convolution_op->context.dwconv.indirect_input = (const void**) workspace;
2433
0
    convolution_op->context.dwconv_indirection_init.input = convolution_op->input;
2434
0
    convolution_op->context.dwconv_indirection_init.indirection_buffer = (const void**) workspace;
2435
0
  } else {
2436
0
    convolution_op->context.dwconv.input_offset = (size_t) ((uintptr_t) convolution_op->input - (uintptr_t) convolution_op->last_input);
2437
0
  }
2438
2439
0
  if (convolution_op->context.dwconv.buffer_size) {
2440
0
    assert(workspace != NULL);
2441
0
    convolution_op->context.dwconv.multipass_buffer =
2442
0
      (void*) ((uintptr_t) workspace + convolution_op->context.dwconv.multipass_buffer_offset);
2443
0
  }
2444
2445
0
  convolution_op->context.dwconv.output = convolution_op->output;
2446
0
  convolution_op->state = xnn_run_state_ready;
2447
2448
0
  return xnn_status_success;
2449
0
}
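// ---------------------------------------------------------------------------
// Illustrative aside: a worked instance of the indirection-buffer size formula
// checked in the XNN_TEST_MODE block above, under assumed parameters (3x3
// depthwise kernel, stride 1, dilation 1, 4x4 output, unipass tile_size 9,
// 8-byte pointers). The numbers are examples, not taken from this report.
#include <stddef.h>
#include <stdio.h>

int main(void) {
  const size_t kernel_height = 3, kernel_width = 3;
  const size_t kernel_size = kernel_height * kernel_width;               // 9
  const size_t output_height = 4, output_width = 4;
  const size_t stride_width = 1, dilation_width = 1;
  const size_t tile_size = 9;                                            // assumed unipass tile

  const size_t step_width = dilation_width == 1
      ? (stride_width < kernel_width ? stride_width : kernel_width)      // min(stride, kernel) = 1
      : kernel_width;
  const size_t step_height =
      kernel_size + (output_width - 1) * step_width * kernel_height;     // 9 + 3*1*3 = 18
  const size_t indirection_buffer_size =
      sizeof(void*) * (tile_size - kernel_size + output_height * step_height);  // 8*(0 + 4*18) = 576

  printf("step_height = %zu, indirection buffer = %zu bytes\n",
         step_height, indirection_buffer_size);
  return 0;
}
// ---------------------------------------------------------------------------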
2450
2451
static enum xnn_status setup_vmulcaddc(xnn_operator_t convolution_op)
2452
0
{
2453
0
  convolution_op->context.vmulcaddc.x = convolution_op->input;
2454
0
  convolution_op->context.vmulcaddc.y = convolution_op->output;
2455
0
  convolution_op->state = xnn_run_state_ready;
2456
2457
0
  return xnn_status_success;
2458
0
}
2459
2460
static enum xnn_status setup_convolution2d_nhwc(
2461
  xnn_operator_t convolution_op,
2462
  enum xnn_operator_type expected_operator_type,
2463
  void* workspace,
2464
  const void* input,
2465
  void* output,
2466
  uint32_t log2_input_element_size)
2467
0
{
2468
0
  if (convolution_op->type != expected_operator_type) {
2469
0
    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
2470
0
      xnn_operator_type_to_string(expected_operator_type),
2471
0
      xnn_operator_type_to_string(convolution_op->type));
2472
0
    return xnn_status_invalid_parameter;
2473
0
  }
2474
2475
0
  switch (convolution_op->state) {
2476
0
    case xnn_run_state_skip:
2477
0
      return xnn_status_success;
2478
0
    case xnn_run_state_invalid:
2479
0
      xnn_log_error(
2480
0
        "failed to setup %s operator: operator has not been reshaped yet",
2481
0
        xnn_operator_type_to_string(convolution_op->type));
2482
0
      return xnn_status_invalid_state;
2483
0
    case xnn_run_state_needs_setup:
2484
      // Operator has been reshaped, but not set up; continue with setup.
2485
0
    case xnn_run_state_ready:
2486
      // Operator has been reshaped, and we are setting up with different pointers.
2487
0
      break;
2488
0
  }
2489
2490
0
  convolution_op->input = input;
2491
0
  convolution_op->output = output;
2492
2493
0
  switch (convolution_op->ukernel.type) {
2494
0
    case xnn_microkernel_type_gemm:
2495
0
      return setup_gemm(convolution_op);
2496
0
    case xnn_microkernel_type_igemm:
2497
0
      return setup_igemm(convolution_op, workspace, log2_input_element_size);
2498
0
    case xnn_microkernel_type_dwconv:
2499
0
      return setup_dwconv(convolution_op, workspace, log2_input_element_size);
2500
0
    case xnn_microkernel_type_vmulcaddc:
2501
0
      return setup_vmulcaddc(convolution_op);
2502
0
    default:
2503
0
      XNN_UNREACHABLE;
2504
0
  }
2505
0
}
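// ---------------------------------------------------------------------------
// Illustrative aside: the state switch above lets both xnn_run_state_needs_setup
// (reshaped, never bound) and xnn_run_state_ready (already bound once) fall through,
// so a caller can rebind the same reshaped operator to new input/output pointers
// without reshaping again, as long as the shape is unchanged. A hedged sketch of
// that reuse pattern (assumes the operator was already reshaped and needs no workspace):
#include <pthreadpool.h>
#include <xnnpack.h>

static enum xnn_status run_on_two_images(
    xnn_operator_t convolution_op,
    const float* image0, float* result0,
    const float* image1, float* result1,
    pthreadpool_t threadpool)
{
  enum xnn_status status = xnn_setup_convolution2d_nhwc_f32(
      convolution_op, /*workspace=*/NULL, image0, result0);
  if (status != xnn_status_success) return status;
  status = xnn_run_operator(convolution_op, threadpool);
  if (status != xnn_status_success) return status;

  // Second setup: the operator is now in xnn_run_state_ready, so only pointers are rebound.
  status = xnn_setup_convolution2d_nhwc_f32(
      convolution_op, /*workspace=*/NULL, image1, result1);
  if (status != xnn_status_success) return status;
  return xnn_run_operator(convolution_op, threadpool);
}
// ---------------------------------------------------------------------------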
2506
2507
enum xnn_status xnn_setup_convolution2d_nhwc_qu8(
2508
    xnn_operator_t convolution_op,
2509
    void* workspace,
2510
    const uint8_t* input,
2511
    uint8_t* output)
2512
0
{
2513
0
  return setup_convolution2d_nhwc(
2514
0
    convolution_op, xnn_operator_type_convolution_nhwc_qu8,
2515
0
    workspace, input, output,
2516
    /*log2_input_element_size=*/XNN_LOG2_SIZEOF_UINT8_T);
2517
0
}
2518
2519
enum xnn_status xnn_setup_convolution2d_nhwc_qs8(
2520
    xnn_operator_t convolution_op,
2521
    void* workspace,
2522
    const int8_t* input,
2523
    int8_t* output)
2524
0
{
2525
0
  return setup_convolution2d_nhwc(
2526
0
    convolution_op, xnn_operator_type_convolution_nhwc_qs8,
2527
0
    workspace, input, output,
2528
    /*log2_input_element_size=*/XNN_LOG2_SIZEOF_INT8_T);
2529
0
}
2530
2531
enum xnn_status xnn_setup_convolution2d_nhwc_qs8_qc8w(
2532
    xnn_operator_t convolution_op,
2533
    void* workspace,
2534
    const int8_t* input,
2535
    int8_t* output)
2536
0
{
2537
0
  return setup_convolution2d_nhwc(
2538
0
    convolution_op, xnn_operator_type_convolution_nhwc_qc8,
2539
0
    workspace, input, output,
2540
    /*log2_input_element_size=*/XNN_LOG2_SIZEOF_INT8_T);
2541
0
}
2542
2543
enum xnn_status xnn_setup_convolution2d_nhwc_f16(
2544
    xnn_operator_t convolution_op,
2545
    void* workspace,
2546
    const void* input,
2547
    void* output)
2548
0
{
2549
0
  return setup_convolution2d_nhwc(
2550
0
    convolution_op, xnn_operator_type_convolution_nhwc_f16,
2551
0
    workspace, input, output,
2552
    /*log2_input_element_size=*/XNN_LOG2_SIZEOF_HALF);
2553
0
}
2554
2555
enum xnn_status xnn_setup_convolution2d_nhwc_f32(
2556
    xnn_operator_t convolution_op,
2557
    void* workspace,
2558
    const float* input,
2559
    float* output)
2560
0
{
2561
0
  return setup_convolution2d_nhwc(
2562
0
    convolution_op, xnn_operator_type_convolution_nhwc_f32,
2563
0
    workspace, input, output,
2564
    /*log2_input_element_size=*/XNN_LOG2_SIZEOF_FLOAT);
2565
0
}
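// ---------------------------------------------------------------------------
// Illustrative aside: the typical call sequence tying the reshape and setup entry
// points together. A minimal sketch, assuming the operator was created beforehand
// with xnn_create_convolution2d_nhwc_f32 and that XNN_FLAG_TRANSIENT_INDIRECTION_BUFFER
// was not requested (so no workspace is needed); error handling is abbreviated.
#include <stddef.h>
#include <xnnpack.h>

enum xnn_status run_f32_convolution(
    xnn_operator_t convolution_op, const float* input, float* output,
    size_t batch_size, size_t input_height, size_t input_width)
{
  size_t workspace_size = 0, workspace_alignment = 0;
  size_t output_height = 0, output_width = 0;

  enum xnn_status status = xnn_reshape_convolution2d_nhwc_f32(
      convolution_op, batch_size, input_height, input_width,
      &workspace_size, &workspace_alignment,
      &output_height, &output_width, /*threadpool=*/NULL);
  if (status != xnn_status_success) {
    return status;
  }

  status = xnn_setup_convolution2d_nhwc_f32(
      convolution_op, /*workspace=*/NULL, input, output);
  if (status != xnn_status_success) {
    return status;
  }
  return xnn_run_operator(convolution_op, /*threadpool=*/NULL);
}
// ---------------------------------------------------------------------------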