/src/xnnpack/src/operators/convolution-nhwc.c
Line | Count | Source
1 | | // Copyright (c) Facebook, Inc. and its affiliates. |
2 | | // All rights reserved. |
3 | | // |
4 | | // Copyright 2019 Google LLC |
5 | | // |
6 | | // This source code is licensed under the BSD-style license found in the |
7 | | // LICENSE file in the root directory of this source tree. |
8 | | |
9 | | #include <assert.h> |
10 | | #include <math.h> |
11 | | #include <stdbool.h> |
12 | | #include <stddef.h> |
13 | | #include <stdint.h> |
14 | | #include <stdlib.h> |
15 | | #include <string.h> |
16 | | |
17 | | #include <fp16/fp16.h> |
18 | | |
19 | | #include <xnnpack.h> |
20 | | #include <xnnpack/allocator.h> |
21 | | #include <xnnpack/cache.h> |
22 | | #include <xnnpack/common.h> |
23 | | #include <xnnpack/compute.h> |
24 | | #include <xnnpack/config.h> |
25 | | #include <xnnpack/indirection.h> |
26 | | #include <xnnpack/log.h> |
27 | | #include <xnnpack/math.h> |
28 | | #include <xnnpack/microkernel-utils.h> |
29 | | #include <xnnpack/operator.h> |
30 | | #include <xnnpack/operator-utils.h> |
31 | | #include <xnnpack/operator-type.h> |
32 | | #include <xnnpack/pack.h> |
33 | | #include <xnnpack/params.h> |
34 | | #include <xnnpack/post-operation.h> |
35 | | #include <xnnpack/microparams-init.h> |
36 | | |
37 | | #ifndef XNN_ENABLE_GEMM_M_SPECIALIZATION |
38 | | #error "XNN_ENABLE_GEMM_M_SPECIALIZATION is not defined" |
39 | | #endif |
40 | | |
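 | | // With TensorFlow SAME padding, the output extent depends only on the input
 | | // extent and the stride: divide_round_up(10, 3) == 4, i.e. ceil(10 / 3),
 | | // which matches TensorFlow's output-size rule for SAME padding.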
41 | | static inline size_t compute_output_dimension_with_tf_same_padding( |
42 | | size_t input_dimension, |
43 | | size_t subsampling_dimension) |
44 | 0 | { |
45 | 0 | return divide_round_up(input_dimension, subsampling_dimension); |
46 | 0 | } |
47 | | |
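 | | // Selection example: a 3x3 kernel (kernel_size = 9) picks the smallest unipass
 | | // config whose primary_tile covers it (a tile of 9 beats a tile of 25), while
 | | // a 5x5 kernel (kernel_size = 25) becomes eligible for the multipass branch.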
48 | | static inline const struct xnn_dwconv_config* find_dwconv_ukernel( |
49 | | size_t kernel_size, |
50 | | const struct xnn_dwconv_config* ukernel, |
51 | | size_t num_ukernels) |
52 | 0 | { |
53 | 0 | const struct xnn_dwconv_config* best_ukernel = NULL; |
54 | 0 | while (num_ukernels-- != 0) { |
55 | | // Find the smallest unipass primary_tile that is at least as big as kernel_size. |
56 | 0 | if (ukernel->last_tile == 0 && ukernel->primary_tile >= kernel_size) { |
57 | 0 | if (best_ukernel == NULL || ukernel->primary_tile < best_ukernel->primary_tile) { |
58 | 0 | best_ukernel = ukernel; |
59 | 0 | } |
60 | 0 | } else if (ukernel->last_tile != 0) { |
61 | | // Use multi-pass if it fits the kernel size nicely, or if kernel_size is large. |
62 | 0 | if (ukernel->primary_tile + ukernel->middle_tile + ukernel->last_tile == kernel_size || kernel_size >= 25) { |
63 | 0 | best_ukernel = ukernel; |
64 | 0 | } |
65 | 0 | } |
66 | 0 | ukernel++; |
67 | 0 | } |
68 | 0 | if (best_ukernel == NULL) { |
69 | 0 | xnn_log_debug("no dwconv ukernel found"); |
70 | 0 | } else if (best_ukernel->last_tile == 0) { |
71 | 0 | xnn_log_debug("dwconv unipass ukernel of primary tile %"PRIu8" found", best_ukernel->primary_tile); |
72 | 0 | } else { |
73 | 0 | xnn_log_debug("dwconv multipass ukernel of tiles %"PRIu8", %"PRIu8", %"PRIu8" found", |
74 | 0 | best_ukernel->primary_tile, |
75 | 0 | best_ukernel->middle_tile, |
76 | 0 | best_ukernel->last_tile); |
77 | 0 | } |
78 | 0 | return best_ukernel; |
79 | 0 | } |
80 | | |
81 | | static enum xnn_status create_vmulcaddc_path( |
82 | | uint32_t groups, |
83 | | const void* kernel, |
84 | | const void* bias, |
85 | | uint32_t log2_filter_element_size, |
86 | | uint32_t bias_element_size, |
87 | | xnn_pack_vmulcaddc_w_fn pack_vmulcaddc_w, |
88 | | const void* packing_params, |
89 | | int packed_weights_padding_byte, |
90 | | const void* vmulcaddc_params, |
91 | | size_t vmulcaddc_params_size, |
92 | | const struct xnn_vmulcaddc_config* vmulcaddc_config, |
93 | | enum xnn_operator_type operator_type, |
94 | | xnn_operator_t convolution_op) |
95 | 0 | { |
96 | 0 | assert(vmulcaddc_config != NULL); |
97 | 0 | assert(vmulcaddc_params != NULL); |
98 | | |
99 | 0 | enum xnn_status status = xnn_status_out_of_memory; |
100 | |
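 | | // Packing sketch: one multiplier and one addend per channel. For f32 with
 | | // groups = 10 and channel_tile = 4, c_stride = round_up_po2(10, 4) = 12, so
 | | // packed_weights_size = (4 + 4) * 12 = 96 bytes before alignment rounding.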
101 | 0 | const size_t c_stride = round_up_po2(groups, vmulcaddc_config->channel_tile); |
102 | 0 | const size_t packed_weights_size = ((UINT32_C(1) << log2_filter_element_size) + bias_element_size) * c_stride; |
103 | 0 | size_t aligned_total_weights_size = round_up_po2(packed_weights_size, XNN_ALLOCATION_ALIGNMENT); |
104 | 0 | void* weights_ptr = xnn_get_pointer_to_write_weights( |
105 | 0 | convolution_op, aligned_total_weights_size, packed_weights_padding_byte); |
106 | 0 | if (weights_ptr == NULL) { |
107 | 0 | xnn_log_error("failed to reserve or allocated %zu bytes for %s operator vmulcaddc packed weights", |
108 | 0 | aligned_total_weights_size, xnn_operator_type_to_string(operator_type)); |
109 | 0 | goto error; |
110 | 0 | } |
111 | 0 | xnn_log_debug("allocated %zu bytes for packed weights in %s operator", |
112 | 0 | aligned_total_weights_size, xnn_operator_type_to_string(operator_type)); |
113 | |
114 | 0 | pack_vmulcaddc_w(groups, vmulcaddc_config->channel_tile, kernel, bias, weights_ptr, packing_params); |
115 | |
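 | | // With a weights cache attached, identical packed buffers are deduplicated:
 | | // the cache returns the offset of an existing copy (or inserts this one) and
 | | // the operator then refers to the cached buffer by offset.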
116 | 0 | if (use_weights_cache(convolution_op)) { |
117 | 0 | convolution_op->packed_weights.offset = xnn_get_or_insert_weights_cache( |
118 | 0 | convolution_op->weights_cache, weights_ptr, aligned_total_weights_size); |
119 | 0 | } |
120 | |
121 | 0 | memcpy(&convolution_op->params, vmulcaddc_params, vmulcaddc_params_size); |
122 | |
123 | 0 | convolution_op->ukernel.vmulcaddc = (struct xnn_ukernel_vmulcaddc) { |
124 | 0 | .function = vmulcaddc_config->ukernel, |
125 | 0 | .mr = vmulcaddc_config->row_tile, |
126 | 0 | }; |
127 | 0 | return xnn_status_success; |
128 | | |
129 | 0 | error: |
130 | 0 | return status; |
131 | 0 | } |
132 | | |
133 | | static enum xnn_status create_dwconv_path( |
134 | | uint32_t kernel_height, |
135 | | uint32_t kernel_width, |
136 | | uint32_t groups, |
137 | | const void* kernel, |
138 | | const void* bias, |
139 | | uint32_t flags, |
140 | | uint32_t log2_input_element_size, |
141 | | uint32_t log2_filter_element_size, |
142 | | uint32_t bias_element_size, |
143 | | xnn_pack_dwconv_hwg_w_fn pack_dwconv_hwg_w, |
144 | | xnn_pack_dwconv_ghw_w_fn pack_dwconv_ghw_w, |
145 | | const void* packing_params, |
146 | | int packed_weights_padding_byte, |
147 | | size_t extra_weights_bytes, |
148 | | xnn_init_qs8_qc8w_scale_params_fn init_scale_params, |
149 | | const float* scale_params, |
150 | | const void* dwconv_params, |
151 | | size_t dwconv_params_size, |
152 | | const struct xnn_dwconv_config* dwconv_ukernel, |
153 | | bool linear_activation, |
154 | | enum xnn_operator_type operator_type, |
155 | | size_t* zero_size, |
156 | | xnn_operator_t convolution_op) |
157 | 0 | { |
158 | 0 | assert(dwconv_ukernel != NULL); |
159 | 0 | enum xnn_status status = xnn_status_out_of_memory; |
160 | 0 | const uint8_t primary_tile = dwconv_ukernel->primary_tile; |
161 | 0 | const bool is_unipass = dwconv_ukernel->last_tile == 0; |
162 | 0 | const size_t kernel_size = kernel_height * kernel_width; |
163 | 0 | if (is_unipass) { |
164 | 0 | assert(primary_tile >= kernel_size); |
165 | 0 | xnn_log_debug("using dwconv unipass of primary_tile %u", primary_tile); |
166 | 0 | } else { |
167 | 0 | assert(kernel_size > primary_tile); |
168 | 0 | xnn_log_debug("using dwconv multipass ukernel of tiles %d, %d, %d", |
169 | 0 | primary_tile, |
170 | 0 | dwconv_ukernel->middle_tile, |
171 | 0 | dwconv_ukernel->last_tile); |
172 | 0 | } |
173 | | |
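 | | // Unipass packing stores primary_tile taps plus the bias (and any extra
 | | // per-channel bytes) per channel: an f32 tile of 9 packs 9*4 + 4 = 40 bytes
 | | // for each of the c_stride channel lanes.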
174 | 0 | const size_t c_stride = round_up_po2(groups, dwconv_ukernel->channel_tile); |
175 | 0 | size_t tile_size = 0; |
176 | 0 | size_t packed_weights_size = 0; |
177 | 0 | if (is_unipass) { |
178 | 0 | tile_size = primary_tile; |
179 | 0 | packed_weights_size = ((primary_tile << log2_filter_element_size) + bias_element_size + extra_weights_bytes) * c_stride; |
180 | 0 | } else { |
181 | 0 | tile_size = xnn_dwconv_multipass_tile_size( |
182 | 0 | kernel_size, primary_tile, dwconv_ukernel->middle_tile, dwconv_ukernel->last_tile); |
183 | 0 | packed_weights_size = xnn_dwconv_multipass_weights_size( |
184 | 0 | tile_size, groups, dwconv_ukernel->channel_tile, dwconv_ukernel->channel_subtile, |
185 | 0 | dwconv_ukernel->channel_round, bias_element_size, log2_filter_element_size, extra_weights_bytes); |
186 | 0 | } |
187 | 0 | size_t aligned_total_weights_size = round_up_po2(packed_weights_size, XNN_ALLOCATION_ALIGNMENT); |
188 | 0 | void* weights_ptr = xnn_get_pointer_to_write_weights( |
189 | 0 | convolution_op, aligned_total_weights_size, packed_weights_padding_byte); |
190 | 0 | if (weights_ptr == NULL) { |
191 | 0 | xnn_log_error("failed to reserve or allocate %zu bytes for %s operator dwconv packed weights",
192 | 0 | aligned_total_weights_size, xnn_operator_type_to_string(operator_type)); |
193 | 0 | goto error; |
194 | 0 | } |
195 | 0 | xnn_log_debug("allocated %zu bytes for packed weights in %s operator", |
196 | 0 | aligned_total_weights_size, xnn_operator_type_to_string(operator_type)); |
197 | |
198 | 0 | memcpy(&convolution_op->params, dwconv_params, dwconv_params_size); |
199 | |
200 | 0 | if (flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) { |
201 | 0 | pack_dwconv_hwg_w( |
202 | 0 | primary_tile, |
203 | 0 | dwconv_ukernel->middle_tile, |
204 | 0 | dwconv_ukernel->last_tile, |
205 | 0 | kernel_height, kernel_width, |
206 | 0 | groups, |
207 | 0 | dwconv_ukernel->channel_tile, dwconv_ukernel->channel_subtile, dwconv_ukernel->channel_round, |
208 | 0 | kernel, bias, /*scale=*/NULL, weights_ptr, |
209 | 0 | dwconv_ukernel->channel_tile * extra_weights_bytes, |
210 | 0 | dwconv_ukernel->channel_subtile * extra_weights_bytes, |
211 | 0 | packing_params); |
212 | 0 | } else { |
213 | 0 | pack_dwconv_ghw_w( |
214 | 0 | primary_tile, |
215 | 0 | dwconv_ukernel->middle_tile, |
216 | 0 | dwconv_ukernel->last_tile, |
217 | 0 | kernel_height, kernel_width, |
218 | 0 | groups, |
219 | 0 | dwconv_ukernel->channel_tile, dwconv_ukernel->channel_subtile, dwconv_ukernel->channel_round, |
220 | 0 | kernel, bias, /*scale=*/NULL, weights_ptr, |
221 | 0 | dwconv_ukernel->channel_tile * extra_weights_bytes, |
222 | 0 | dwconv_ukernel->channel_subtile * extra_weights_bytes, |
223 | 0 | packing_params); |
224 | 0 | } |
225 | |
226 | 0 | if (scale_params != NULL) { |
227 | 0 | assert(init_scale_params != NULL); |
228 | | // TODO(zhin): QC8 DWCONV multipass is not implemented for now, fix this when it is supported. |
229 | 0 | assert(is_unipass); |
230 | 0 | size_t stride = dwconv_ukernel->channel_tile * |
231 | 0 | ((primary_tile << log2_filter_element_size) + bias_element_size + extra_weights_bytes); |
232 | |
233 | 0 | init_scale_params( |
234 | 0 | /*channels=*/groups, |
235 | 0 | /*channels_tile=*/dwconv_ukernel->channel_tile, |
236 | 0 | /*channels_subtile=*/dwconv_ukernel->channel_tile, |
237 | 0 | /*stride=*/stride, |
238 | 0 | /*substride=*/stride, |
239 | 0 | /*stride_offset=*/0, |
240 | 0 | /*scale=*/scale_params, |
241 | | /*packed_w=*/ |
242 | 0 | (void*) ((uintptr_t) weights_ptr + |
243 | 0 | dwconv_ukernel->channel_tile * ((primary_tile << log2_filter_element_size) + bias_element_size))); |
244 | 0 | } |
245 | | |
246 | 0 | if (use_weights_cache(convolution_op)) { |
247 | 0 | convolution_op->packed_weights.offset = xnn_get_or_insert_weights_cache( |
248 | 0 | convolution_op->weights_cache, weights_ptr, aligned_total_weights_size); |
249 | 0 | } |
250 | |
251 | 0 | const union xnn_dwconv_ukernel* ukernels = &dwconv_ukernel->minmax; |
252 | 0 | if (linear_activation && dwconv_ukernel->linear.unipass != NULL) { |
253 | 0 | ukernels = &dwconv_ukernel->linear; |
254 | 0 | } |
255 | 0 | convolution_op->ukernel.dwconv = (struct xnn_ukernel_dwconv) { |
256 | 0 | .primary_tile = primary_tile, |
257 | 0 | .middle_tile = dwconv_ukernel->middle_tile, |
258 | 0 | .last_tile = dwconv_ukernel->last_tile, |
259 | 0 | .tile_size = tile_size, |
260 | 0 | }; |
261 | |
262 | 0 | if (is_unipass) { |
263 | 0 | convolution_op->ukernel.dwconv.unipass_fn = ukernels->unipass; |
264 | 0 | } else { |
265 | 0 | convolution_op->ukernel.dwconv.multipass_fn = ukernels->multipass; |
266 | 0 | } |
267 | |
268 | 0 | *zero_size = XNN_EXTRA_BYTES + (c_stride << log2_input_element_size); |
269 | 0 | return xnn_status_success; |
270 | 0 | error: |
271 | 0 | return status; |
272 | 0 | } |
273 | | |
274 | | static enum xnn_status create_gemm_or_igemm( |
275 | | enum xnn_microkernel_type ukernel_type, |
276 | | uint32_t kernel_size, |
277 | | uint32_t groups, |
278 | | size_t group_input_channels, |
279 | | size_t group_output_channels, |
280 | | const void* kernel, |
281 | | const void* bias, |
282 | | uint32_t flags, |
283 | | uint32_t log2_input_element_size, |
284 | | uint32_t log2_filter_element_size, |
285 | | uint32_t bias_element_size, |
286 | | xnn_packw_gemm_goi_ukernel_fn pack_gemm_goi_w, |
287 | | xnn_pack_conv_kgo_w_fn pack_conv_kgo_w, |
288 | | xnn_pack_conv_goki_w_fn pack_conv_goki_w, |
289 | | const void* packing_params, |
290 | | int packed_weights_padding_byte, |
291 | | size_t extra_weights_bytes, |
292 | | xnn_init_qs8_qc8w_scale_params_fn init_scale_params, |
293 | | const float* scale_params, |
294 | | const void* gemm_params, |
295 | | size_t gemm_params_size, |
296 | | const struct xnn_gemm_config* gemm_config, |
297 | | const struct jit_gemm_params* jit_gemm_params, |
298 | | bool linear_activation, |
299 | | bool relu_activation, |
300 | | enum xnn_operator_type operator_type, |
301 | | size_t num_post_operations, |
302 | | void* post_operation_params, |
303 | | xnn_operator_t convolution_op, |
304 | | size_t* zero_size) |
305 | 0 | { |
306 | 0 | enum xnn_status status = xnn_status_out_of_memory; |
307 | 0 | const uint32_t nr = gemm_config->nr; |
308 | 0 | const uint32_t kr = UINT32_C(1) << gemm_config->log2_kr; |
309 | 0 | const uint32_t sr = UINT32_C(1) << gemm_config->log2_sr; |
310 | 0 | const size_t n_stride = round_up(group_output_channels, nr); |
311 | 0 | const size_t k_stride = round_up_po2(group_input_channels, kr * sr); |
312 | |
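 | | // Sizing example: with nr = 8, kr = 2, sr = 1, group_output_channels = 12,
 | | // and group_input_channels = 3, n_stride = round_up(12, 8) = 16 and
 | | // k_stride = round_up_po2(3, 2) = 4, so the padded strides, not the raw
 | | // channel counts, determine the packed group size computed below.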
313 | 0 | const size_t packed_group_weights_size = |
314 | 0 | ((kernel_size * k_stride << log2_filter_element_size) + bias_element_size + extra_weights_bytes) * n_stride; |
315 | 0 | const size_t aligned_total_weights_size = round_up_po2(packed_group_weights_size * groups, XNN_ALLOCATION_ALIGNMENT); |
316 | 0 | void* weights_ptr = xnn_get_pointer_to_write_weights( |
317 | 0 | convolution_op, aligned_total_weights_size, packed_weights_padding_byte); |
318 | 0 | if (weights_ptr == NULL) { |
319 | 0 | xnn_log_error("failed to reserve or allocate %zu bytes for %s operator gemm packed weights",
320 | 0 | aligned_total_weights_size, xnn_operator_type_to_string(operator_type)); |
321 | 0 | goto error; |
322 | 0 | } |
323 | 0 | xnn_log_debug("allocated %zu bytes for packed weights in %s operator", |
324 | 0 | aligned_total_weights_size, xnn_operator_type_to_string(operator_type)); |
325 | |
326 | 0 | memcpy(&convolution_op->params, gemm_params, gemm_params_size); |
327 | 0 | convolution_op->num_post_operation_params = num_post_operations; |
328 | 0 | convolution_op->post_operation_params = post_operation_params; |
329 | |
330 | 0 | const struct gemm_fused_ukernels* gemm_ukernels = &gemm_config->minmax; |
331 | 0 | const uint32_t mr = gemm_config->mr; |
332 | 0 | if (linear_activation && gemm_config->linear.gemm[mr - 1].function[XNN_UARCH_DEFAULT] != NULL) { |
333 | 0 | gemm_ukernels = &gemm_config->linear; |
334 | 0 | } else if (relu_activation && gemm_config->relu.gemm[mr - 1].function[XNN_UARCH_DEFAULT] != NULL) { |
335 | 0 | gemm_ukernels = &gemm_config->relu; |
336 | 0 | } |
337 | 0 | switch (ukernel_type) { |
338 | 0 | case xnn_microkernel_type_gemm: |
339 | 0 | pack_gemm_goi_w( |
340 | 0 | groups, group_output_channels, group_input_channels, |
341 | 0 | nr, kr, sr, |
342 | 0 | kernel, bias, /*scale=*/NULL, weights_ptr, gemm_config->nr * extra_weights_bytes, packing_params); |
343 | 0 | convolution_op->ukernel.gemm = (struct xnn_ukernel_gemm) { |
344 | 0 | .mr = mr, |
345 | 0 | .nr = nr, |
346 | 0 | .kr = kr, |
347 | 0 | .sr = sr, |
348 | 0 | }; |
349 | |
350 | 0 | assert(XNN_MAX_MR >= mr); |
351 | 0 | for (size_t i = 0; i < mr; i++) { |
352 | 0 | convolution_op->ukernel.gemm.gemm_cases[i] = gemm_ukernels->gemm[i]; |
353 | 0 | } |
354 | |
355 | | #if XNN_PLATFORM_JIT |
356 | | xnn_generate_gemms_up_to_max_mr( |
357 | | mr, gemm_config->generator, jit_gemm_params, group_output_channels, nr, |
358 | | group_input_channels << log2_input_element_size, convolution_op); |
359 | | #endif // XNN_PLATFORM_JIT |
360 | |
361 | 0 | break; |
362 | 0 | case xnn_microkernel_type_igemm: |
363 | 0 | if (flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) { |
364 | 0 | pack_conv_kgo_w( |
365 | 0 | groups, group_output_channels, kernel_size, |
366 | 0 | nr, kr, sr, |
367 | 0 | kernel, bias, /*scale=*/NULL, weights_ptr, gemm_config->nr * extra_weights_bytes, packing_params); |
368 | 0 | } else { |
369 | 0 | pack_conv_goki_w( |
370 | 0 | groups, group_output_channels, kernel_size, group_input_channels, |
371 | 0 | nr, kr, sr, |
372 | 0 | kernel, bias, /*scale=*/NULL, weights_ptr, gemm_config->nr * extra_weights_bytes, packing_params); |
373 | 0 | } |
374 | 0 | convolution_op->ukernel.igemm = (struct xnn_ukernel_igemm) { |
375 | 0 | .mr = mr, |
376 | 0 | .nr = nr, |
377 | 0 | .kr = kr, |
378 | 0 | .sr = sr, |
379 | 0 | }; |
380 | |
381 | 0 | assert(XNN_MAX_MR >= mr); |
382 | 0 | for (size_t i = 0; i < mr; i++) { |
383 | 0 | convolution_op->ukernel.igemm.igemm_cases[i] = gemm_ukernels->igemm[i]; |
384 | 0 | } |
385 | |
386 | | #if XNN_PLATFORM_JIT |
387 | | xnn_generate_igemms_up_to_max_mr( |
388 | | mr, gemm_config->generator, jit_gemm_params, group_output_channels, nr, |
389 | | group_input_channels << log2_input_element_size, kernel_size, convolution_op); |
390 | | #endif // XNN_PLATFORM_JIT |
391 | |
392 | 0 | break; |
393 | 0 | default: |
394 | 0 | XNN_UNREACHABLE; |
395 | 0 | } |
396 | | |
397 | 0 | if (scale_params != NULL) { |
398 | 0 | assert(init_scale_params != NULL); |
399 | | |
400 | 0 | void* group_weights = |
401 | 0 | (void*)((uintptr_t)weights_ptr + |
402 | 0 | gemm_config->nr * ((kernel_size * k_stride << log2_filter_element_size) + bias_element_size)); |
403 | 0 | const size_t weights_stride = |
404 | 0 | (kernel_size * k_stride << log2_filter_element_size) + bias_element_size + extra_weights_bytes; |
405 | 0 | for (uint32_t group = 0; group < groups; group++) { |
406 | 0 | init_scale_params( |
407 | 0 | group_output_channels, gemm_config->nr, gemm_config->nr, |
408 | 0 | gemm_config->nr * weights_stride, gemm_config->nr * weights_stride, 0, |
409 | 0 | scale_params, group_weights); |
410 | 0 | scale_params += group_output_channels; |
411 | 0 | group_weights = (void*) ((uintptr_t) group_weights + n_stride * weights_stride); |
412 | 0 | } |
413 | 0 | } |
414 | | |
415 | 0 | if (use_weights_cache(convolution_op)) { |
416 | 0 | convolution_op->packed_weights.offset = xnn_get_or_insert_weights_cache( |
417 | 0 | convolution_op->weights_cache, weights_ptr, aligned_total_weights_size); |
418 | 0 | } |
419 | |
420 | 0 | *zero_size = XNN_EXTRA_BYTES + (k_stride << log2_input_element_size); |
421 | 0 | return xnn_status_success; |
422 | | |
423 | 0 | error: |
424 | 0 | return status; |
425 | 0 | } |
426 | | |
427 | | static enum xnn_status create_convolution2d_nhwc( |
428 | | uint32_t input_padding_top, |
429 | | uint32_t input_padding_right, |
430 | | uint32_t input_padding_bottom, |
431 | | uint32_t input_padding_left, |
432 | | uint32_t kernel_height, |
433 | | uint32_t kernel_width, |
434 | | uint32_t subsampling_height, |
435 | | uint32_t subsampling_width, |
436 | | uint32_t dilation_height, |
437 | | uint32_t dilation_width, |
438 | | uint32_t groups, |
439 | | size_t group_input_channels, |
440 | | size_t group_output_channels, |
441 | | size_t input_channel_stride, |
442 | | size_t output_channel_stride, |
443 | | const void* kernel, |
444 | | const void* bias, |
445 | | uint32_t flags, |
446 | | uint32_t log2_input_element_size, |
447 | | uint32_t log2_filter_element_size, |
448 | | uint32_t bias_element_size, |
449 | | xnn_pack_vmulcaddc_w_fn pack_vmulcaddc_w, |
450 | | xnn_pack_dwconv_hwg_w_fn pack_dwconv_hwg_w, |
451 | | xnn_pack_dwconv_ghw_w_fn pack_dwconv_ghw_w, |
452 | | xnn_packw_gemm_goi_ukernel_fn pack_gemm_goi_w, |
453 | | xnn_pack_conv_kgo_w_fn pack_conv_kgo_w, |
454 | | xnn_pack_conv_goki_w_fn pack_conv_goki_w, |
455 | | const void* packing_params, |
456 | | int input_padding_byte, |
457 | | int packed_weights_padding_byte, |
458 | | size_t extra_weights_bytes, |
459 | | xnn_init_qs8_qc8w_scale_params_fn init_scale_params, |
460 | | const float* scale_params, |
461 | | const void* gemm_params, |
462 | | size_t gemm_params_size, |
463 | | const void* dwconv_params, |
464 | | size_t dwconv_params_size, |
465 | | const void* vmulcaddc_params, |
466 | | size_t vmulcaddc_params_size, |
467 | | const struct xnn_gemm_config* gemm_config, |
468 | | const struct xnn_dwconv_config* dwconv_ukernel, |
469 | | const struct xnn_vmulcaddc_config* vmulcaddc_config, |
470 | | struct jit_gemm_params* jit_gemm_params, |
471 | | bool linear_activation, |
472 | | bool relu_activation, |
473 | | enum xnn_operator_type operator_type, |
474 | | size_t num_post_operations, |
475 | | void* post_operation_params, |
476 | | xnn_code_cache_t code_cache, |
477 | | xnn_weights_cache_t weights_cache, |
478 | | xnn_operator_t* convolution_op_out) |
479 | 0 | { |
480 | 0 | xnn_operator_t convolution_op = NULL; |
481 | 0 | enum xnn_status status = xnn_status_uninitialized; |
482 | |
483 | 0 | if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) { |
484 | 0 | xnn_log_error( |
485 | 0 | "failed to create %s operator: XNNPACK is not initialized", |
486 | 0 | xnn_operator_type_to_string(operator_type)); |
487 | 0 | goto error; |
488 | 0 | } |
489 | | |
490 | 0 | status = xnn_status_invalid_parameter; |
491 | |
492 | 0 | if (kernel_width == 0 || kernel_height == 0) { |
493 | 0 | xnn_log_error( |
494 | 0 | "failed to create %s operator with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero", |
495 | 0 | xnn_operator_type_to_string(operator_type), kernel_width, kernel_height); |
496 | 0 | goto error; |
497 | 0 | } |
498 | | |
499 | 0 | if (subsampling_width == 0 || subsampling_height == 0) { |
500 | 0 | xnn_log_error( |
501 | 0 | "failed to create %s operator with %" PRIu32 "x%" PRIu32 " subsampling: subsampling dimensions must be non-zero", |
502 | 0 | xnn_operator_type_to_string(operator_type), subsampling_width, subsampling_height); |
503 | 0 | goto error; |
504 | 0 | } |
505 | | |
506 | 0 | if (dilation_width == 0 || dilation_height == 0) { |
507 | 0 | xnn_log_error( |
508 | 0 | "failed to create %s operator with %" PRIu32 "x%" PRIu32 " dilation: dilation dimensions must be non-zero", |
509 | 0 | xnn_operator_type_to_string(operator_type), dilation_width, dilation_height); |
510 | 0 | goto error; |
511 | 0 | } |
512 | | |
513 | 0 | if (groups == 0) { |
514 | 0 | xnn_log_error( |
515 | 0 | "failed to create %s operator with %" PRIu32 " groups: number of groups must be non-zero", |
516 | 0 | xnn_operator_type_to_string(operator_type), groups); |
517 | 0 | goto error; |
518 | 0 | } |
519 | | |
520 | 0 | if (group_input_channels == 0) { |
521 | 0 | xnn_log_error( |
522 | 0 | "failed to create %s operator with %zu input channels per group: number of channels must be non-zero", |
523 | 0 | xnn_operator_type_to_string(operator_type), group_input_channels); |
524 | 0 | goto error; |
525 | 0 | } |
526 | | |
527 | 0 | if (group_output_channels == 0) { |
528 | 0 | xnn_log_error( |
529 | 0 | "failed to create %s operator with %zu output channels per group: number of channels must be non-zero", |
530 | 0 | xnn_operator_type_to_string(operator_type), group_output_channels); |
531 | 0 | goto error; |
532 | 0 | } |
533 | | |
534 | 0 | const size_t input_channels = groups * group_input_channels; |
535 | 0 | if (input_channel_stride < input_channels) { |
536 | 0 | xnn_log_error( |
537 | 0 | "failed to create %s operator with input channel stride of %zu: " |
538 | 0 | "stride must be at least as large as the number of input channels (%" PRIu32 "x%zu)", |
539 | 0 | xnn_operator_type_to_string(operator_type), |
540 | 0 | input_channel_stride, groups, group_input_channels); |
541 | 0 | goto error; |
542 | 0 | } |
543 | | |
544 | 0 | const size_t output_channels = groups * group_output_channels; |
545 | 0 | if (output_channel_stride < output_channels) { |
546 | 0 | xnn_log_error( |
547 | 0 | "failed to create %s operator with output channel stride of %zu: " |
548 | 0 | "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)", |
549 | 0 | xnn_operator_type_to_string(operator_type), |
550 | 0 | output_channel_stride, groups, group_output_channels); |
551 | 0 | goto error; |
552 | 0 | } |
553 | | |
554 | 0 | if ((flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) != 0 && group_input_channels != 1) { |
555 | 0 | xnn_log_error( |
556 | 0 | "failed to create depthwise %s operator with %zu input channels per group: " |
557 | 0 | "depthwise convolution must have exactly 1 input channel per group", |
558 | 0 | xnn_operator_type_to_string(operator_type), group_input_channels); |
559 | 0 | goto error; |
560 | 0 | } |
561 | | |
562 | 0 | const bool any_padding = (input_padding_left | input_padding_top | input_padding_right | input_padding_bottom) != 0; |
563 | 0 | if ((flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0) { |
564 | 0 | if (any_padding) { |
565 | 0 | xnn_log_error( |
566 | 0 | "failed to create %s operator with %" PRIu32 "+%" PRIu32 "x%" PRIu32 "+%" PRIu32" padding: " |
567 | 0 | "TensorFlow SAME padding can't be combined with explicit padding specification", |
568 | 0 | xnn_operator_type_to_string(operator_type), |
569 | 0 | input_padding_top, input_padding_left, input_padding_bottom, input_padding_right); |
570 | 0 | goto error; |
571 | 0 | } |
572 | 0 | } |
573 | | |
574 | 0 | status = xnn_status_out_of_memory; |
575 | |
576 | 0 | convolution_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator)); |
577 | 0 | if (convolution_op == NULL) { |
578 | 0 | xnn_log_error( |
579 | 0 | "failed to allocate %zu bytes for %s operator descriptor", |
580 | 0 | sizeof(struct xnn_operator), xnn_operator_type_to_string(operator_type)); |
581 | 0 | goto error; |
582 | 0 | } |
583 | | |
584 | 0 | convolution_op->weights_cache = weights_cache; |
585 | 0 | convolution_op->code_cache = code_cache; |
586 | |
587 | 0 | const size_t kernel_size = kernel_height * kernel_width; |
588 | |
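 | | // Dispatch: a 1x1, stride-1, unpadded convolution with one input and one
 | | // output channel per group degenerates to a per-channel multiply-add
 | | // (vmulcaddc); other single-channel groups use dwconv; the remaining 1x1,
 | | // stride-1, unpadded cases use direct GEMM; everything else takes the
 | | // indirection-based IGEMM path.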
589 | 0 | enum xnn_microkernel_type ukernel_type = xnn_microkernel_type_default; |
590 | 0 | const bool unit_subsampling = (subsampling_width | subsampling_height) == 1; |
591 | 0 | if (group_input_channels == 1 && group_output_channels == 1 && kernel_size == 1 && unit_subsampling && !any_padding && vmulcaddc_config != NULL) { |
592 | 0 | ukernel_type = xnn_microkernel_type_vmulcaddc; |
593 | 0 | } else if (group_input_channels == 1 && group_output_channels == 1 && dwconv_ukernel != NULL) { |
594 | 0 | ukernel_type = xnn_microkernel_type_dwconv; |
595 | 0 | } else if (kernel_size == 1 && unit_subsampling && !any_padding) { |
596 | 0 | ukernel_type = xnn_microkernel_type_gemm; |
597 | 0 | } else { |
598 | 0 | ukernel_type = xnn_microkernel_type_igemm; |
599 | 0 | } |
600 | 0 | assert(ukernel_type != xnn_microkernel_type_default); |
601 | | |
602 | 0 | if (num_post_operations != 0 && (ukernel_type != xnn_microkernel_type_gemm && ukernel_type != xnn_microkernel_type_igemm)) { |
603 | 0 | xnn_log_error( |
604 | 0 | "convolution with post operations not support for these parameters: " |
605 | 0 | "kernel_size: %zu unit_subsampling: %d padding: %d, ukernel_type: %d", |
606 | 0 | kernel_size, unit_subsampling, any_padding, ukernel_type); |
607 | 0 | goto error; |
608 | 0 | } |
609 | | |
610 | 0 | size_t zero_size = 0; |
611 | 0 | switch (ukernel_type) { |
612 | 0 | case xnn_microkernel_type_vmulcaddc: |
613 | 0 | { |
614 | 0 | status = create_vmulcaddc_path( |
615 | 0 | groups, kernel, bias, log2_filter_element_size, bias_element_size, |
616 | 0 | pack_vmulcaddc_w, packing_params, packed_weights_padding_byte, |
617 | 0 | vmulcaddc_params, vmulcaddc_params_size, vmulcaddc_config, |
618 | 0 | operator_type, convolution_op); |
619 | 0 | if (status != xnn_status_success) { |
620 | 0 | goto error; |
621 | 0 | } |
622 | 0 | break; |
623 | 0 | } |
624 | 0 | case xnn_microkernel_type_dwconv: |
625 | 0 | { |
626 | 0 | status = create_dwconv_path( |
627 | 0 | kernel_height, kernel_width, |
628 | 0 | groups, kernel, bias, flags, |
629 | 0 | log2_input_element_size, log2_filter_element_size, bias_element_size, |
630 | 0 | pack_dwconv_hwg_w, pack_dwconv_ghw_w, |
631 | 0 | packing_params, packed_weights_padding_byte, extra_weights_bytes, |
632 | 0 | init_scale_params, scale_params, |
633 | 0 | dwconv_params, dwconv_params_size, dwconv_ukernel, |
634 | 0 | linear_activation, operator_type, &zero_size, convolution_op); |
635 | 0 | if (status != xnn_status_success) { |
636 | 0 | goto error; |
637 | 0 | } |
638 | 0 | break; |
639 | 0 | } |
640 | 0 | case xnn_microkernel_type_gemm: |
641 | 0 | case xnn_microkernel_type_igemm: |
642 | 0 | { |
643 | 0 | status = create_gemm_or_igemm( |
644 | 0 | ukernel_type, kernel_size, |
645 | 0 | groups, group_input_channels, group_output_channels, |
646 | 0 | kernel, bias, flags, |
647 | 0 | log2_input_element_size, log2_filter_element_size, bias_element_size, |
648 | 0 | pack_gemm_goi_w, pack_conv_kgo_w, pack_conv_goki_w, packing_params, |
649 | 0 | packed_weights_padding_byte, extra_weights_bytes, |
650 | 0 | init_scale_params, scale_params, |
651 | 0 | gemm_params, gemm_params_size, gemm_config, jit_gemm_params, |
652 | 0 | linear_activation, relu_activation, |
653 | 0 | operator_type, |
654 | 0 | num_post_operations, post_operation_params, |
655 | 0 | convolution_op, |
656 | 0 | &zero_size); |
657 | 0 | if (status != xnn_status_success) { |
658 | 0 | goto error; |
659 | 0 | } |
660 | 0 | break; |
661 | 0 | } |
662 | 0 | default: |
663 | 0 | XNN_UNREACHABLE; |
664 | 0 | } |
665 | | |
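 | | // The zero buffer backs out-of-bounds indirection entries: padded taps point
 | | // into it, and it is filled with input_padding_byte (the input zero point for
 | | // quantized operators) so that padded reads behave as neutral input.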
666 | 0 | const bool tf_same_padding = (flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0 && kernel_size != 1; |
667 | 0 | if (any_padding || tf_same_padding) { |
668 | 0 | convolution_op->zero_buffer = xnn_allocate_simd_memory(zero_size); |
669 | 0 | if (convolution_op->zero_buffer == NULL) { |
670 | 0 | xnn_log_error( |
671 | 0 | "failed to allocate %zu bytes for %s operator zero padding", |
672 | 0 | zero_size, xnn_operator_type_to_string(operator_type)); |
673 | 0 | goto error; |
674 | 0 | } |
675 | 0 | memset(convolution_op->zero_buffer, input_padding_byte, zero_size); |
676 | 0 | } |
677 | | |
678 | 0 | convolution_op->padding_top = input_padding_top; |
679 | 0 | convolution_op->padding_right = input_padding_right; |
680 | 0 | convolution_op->padding_bottom = input_padding_bottom; |
681 | 0 | convolution_op->padding_left = input_padding_left; |
682 | |
683 | 0 | convolution_op->kernel_height = kernel_height; |
684 | 0 | convolution_op->kernel_width = kernel_width; |
685 | 0 | convolution_op->stride_height = subsampling_height; |
686 | 0 | convolution_op->stride_width = subsampling_width; |
687 | 0 | convolution_op->dilation_height = dilation_height; |
688 | 0 | convolution_op->dilation_width = dilation_width; |
689 | 0 | convolution_op->groups = groups; |
690 | 0 | convolution_op->group_input_channels = group_input_channels; |
691 | 0 | convolution_op->group_output_channels = group_output_channels; |
692 | 0 | convolution_op->input_pixel_stride = input_channel_stride; |
693 | 0 | convolution_op->output_pixel_stride = output_channel_stride; |
694 | |
695 | 0 | convolution_op->type = operator_type; |
696 | 0 | convolution_op->ukernel.type = ukernel_type; |
697 | 0 | convolution_op->flags = flags & ~XNN_FLAG_TENSORFLOW_SAME_PADDING; |
698 | 0 | if (tf_same_padding) { |
699 | 0 | convolution_op->flags |= XNN_FLAG_TENSORFLOW_SAME_PADDING; |
700 | 0 | } |
701 | |
702 | 0 | convolution_op->state = xnn_run_state_invalid; |
703 | |
704 | 0 | *convolution_op_out = convolution_op; |
705 | 0 | return xnn_status_success; |
706 | | |
707 | 0 | error: |
708 | 0 | xnn_delete_operator(convolution_op); |
709 | 0 | return status; |
710 | 0 | } |
711 | | |
712 | | enum xnn_status xnn_create_convolution2d_nhwc_qu8( |
713 | | uint32_t input_padding_top, |
714 | | uint32_t input_padding_right, |
715 | | uint32_t input_padding_bottom, |
716 | | uint32_t input_padding_left, |
717 | | uint32_t kernel_height, |
718 | | uint32_t kernel_width, |
719 | | uint32_t subsampling_height, |
720 | | uint32_t subsampling_width, |
721 | | uint32_t dilation_height, |
722 | | uint32_t dilation_width, |
723 | | uint32_t groups, |
724 | | size_t group_input_channels, |
725 | | size_t group_output_channels, |
726 | | size_t input_channel_stride, |
727 | | size_t output_channel_stride, |
728 | | uint8_t input_zero_point, |
729 | | float input_scale, |
730 | | uint8_t kernel_zero_point, |
731 | | float kernel_scale, |
732 | | const uint8_t* kernel, |
733 | | const int32_t* bias, |
734 | | uint8_t output_zero_point, |
735 | | float output_scale, |
736 | | uint8_t output_min, |
737 | | uint8_t output_max, |
738 | | uint32_t flags, |
739 | | xnn_code_cache_t code_cache, |
740 | | xnn_weights_cache_t weights_cache, |
741 | | xnn_operator_t* convolution_op_out) |
742 | 0 | { |
743 | 0 | if (input_scale <= 0.0f || !isnormal(input_scale)) { |
744 | 0 | xnn_log_error( |
745 | 0 | "failed to create %s operator with %.7g input scale: scale must be finite, normalized, and positive", |
746 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8), input_scale); |
747 | 0 | return xnn_status_invalid_parameter; |
748 | 0 | } |
749 | | |
750 | 0 | if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) { |
751 | 0 | xnn_log_error( |
752 | 0 | "failed to create %s operator with %.7g kernel scale: scale must be finite, normalized, and positive", |
753 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8), kernel_scale); |
754 | 0 | return xnn_status_invalid_parameter; |
755 | 0 | } |
756 | | |
757 | 0 | if (output_scale <= 0.0f || !isnormal(output_scale)) { |
758 | 0 | xnn_log_error( |
759 | 0 | "failed to create %s operator with %.7g output scale: scale must be finite, normalized, and positive", |
760 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8), output_scale); |
761 | 0 | return xnn_status_invalid_parameter; |
762 | 0 | } |
763 | | |
764 | 0 | if (output_min >= output_max) { |
765 | 0 | xnn_log_error( |
766 | 0 | "failed to create %s operator with [%" PRIu8 ", %" PRIu8 "] output range: range min must be below range max", |
767 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8), output_min, output_max); |
768 | 0 | return xnn_status_invalid_parameter; |
769 | 0 | } |
770 | | |
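 | | // Example: input_scale = 0.5, kernel_scale = 0.25, output_scale = 0.001 gives
 | | // a requantization scale of 125.0 and is accepted; the same scales with
 | | // output_scale = 0.0004 would give 312.5 and fail the check below.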
771 | 0 | const float requantization_scale = input_scale * kernel_scale / output_scale; |
772 | 0 | if (requantization_scale >= 256.0f) { |
773 | 0 | xnn_log_error( |
774 | 0 | "failed to create %s operator with %.7g input scale, %.7g kernel scale, and %.7g output scale: " |
775 | 0 | "requantization scale %.7g is greater or equal to 256.0", |
776 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8), |
777 | 0 | input_scale, kernel_scale, output_scale, requantization_scale); |
778 | 0 | return xnn_status_unsupported_parameter; |
779 | 0 | } |
780 | | |
781 | 0 | const struct xnn_qu8_packing_params packing_params = { |
782 | 0 | .input_zero_point = input_zero_point, |
783 | 0 | .kernel_zero_point = kernel_zero_point, |
784 | 0 | }; |
785 | |
786 | 0 | const struct xnn_gemm_config* gemm_config = xnn_init_qu8_gemm_config(); |
787 | 0 | assert(gemm_config != NULL); |
788 | | |
789 | 0 | union xnn_qu8_conv_minmax_params gemm_params; |
790 | 0 | if XNN_LIKELY(gemm_config->init.qu8 != NULL) { |
791 | 0 | gemm_config->init.qu8(&gemm_params, |
792 | 0 | kernel_zero_point, requantization_scale, output_zero_point, output_min, output_max); |
793 | 0 | } |
794 | |
795 | 0 | const struct xnn_dwconv_config* dwconv_config = xnn_init_qu8_dwconv_config(); |
796 | 0 | assert(dwconv_config != NULL); |
797 | | |
798 | 0 | union xnn_qu8_conv_minmax_params dwconv_params; |
799 | 0 | const struct xnn_dwconv_config* dwconv_ukernel = |
800 | 0 | find_dwconv_ukernel(kernel_height * kernel_width, dwconv_config, XNN_MAX_QU8_DWCONV_UKERNELS); |
801 | 0 | if XNN_LIKELY(dwconv_ukernel != NULL) { |
802 | 0 | dwconv_ukernel->init.qu8(&dwconv_params, |
803 | 0 | kernel_zero_point, requantization_scale, output_zero_point, output_min, output_max); |
804 | 0 | } |
805 | |
806 | 0 | return create_convolution2d_nhwc( |
807 | 0 | input_padding_top, input_padding_right, input_padding_bottom, input_padding_left, |
808 | 0 | kernel_height, kernel_width, |
809 | 0 | subsampling_height, subsampling_width, |
810 | 0 | dilation_height, dilation_width, |
811 | 0 | groups, group_input_channels, group_output_channels, |
812 | 0 | input_channel_stride, output_channel_stride, |
813 | 0 | kernel, bias, flags, |
814 | | /*log2_input_element_size=*/XNN_LOG2_SIZEOF_UINT8_T, |
815 | | /*log2_filter_element_size=*/XNN_LOG2_SIZEOF_UINT8_T, |
816 | 0 | /*bias_element_size=*/sizeof(int32_t), |
817 | 0 | (xnn_pack_vmulcaddc_w_fn) NULL, |
818 | 0 | (xnn_pack_dwconv_hwg_w_fn) xnn_pack_qu8_dwconv_hwg_w, |
819 | 0 | (xnn_pack_dwconv_ghw_w_fn) xnn_pack_qu8_dwconv_ghw_w, |
820 | 0 | (xnn_packw_gemm_goi_ukernel_fn) gemm_config->pack_gemm_goi, |
821 | 0 | (xnn_pack_conv_kgo_w_fn) xnn_pack_qu8_conv_kgo_w, |
822 | 0 | (xnn_pack_conv_goki_w_fn) xnn_pack_qu8_conv_goki_w, |
823 | 0 | /*packing_params=*/&packing_params, |
824 | 0 | /*input_padding_byte=*/input_zero_point, |
825 | 0 | /*packed_weights_padding_byte=*/kernel_zero_point, |
826 | 0 | /*extra_weights_bytes=*/0, |
827 | | /*init_scale_params=*/NULL, |
828 | | /*scale_params=*/NULL, |
829 | 0 | /*gemm_params=*/&gemm_params, |
830 | 0 | /*gemm_params_size=*/sizeof(gemm_params), |
831 | 0 | /*dwconv_params=*/&dwconv_params, |
832 | 0 | /*dwconv_params_size=*/sizeof(dwconv_params), |
833 | | /*vmulcaddc_params=*/NULL, |
834 | 0 | /*vmulcaddc_params_size=*/0, |
835 | 0 | /*gemm_config=*/gemm_config, |
836 | 0 | /*dwconv_ukernel=*/dwconv_ukernel, |
837 | | /*vmulcaddc_config=*/NULL, |
838 | | /*jit_gemm_params=*/NULL, |
839 | | /*linear_activation=*/false, |
840 | | /*relu_activation=*/false, |
841 | 0 | /*operator_type=*/xnn_operator_type_convolution_nhwc_qu8, |
842 | 0 | /*num_post_operations=*/0, |
843 | | /*post_operation_params=*/NULL, |
844 | 0 | /*code_cache=*/code_cache, |
845 | 0 | /*weights_cache=*/weights_cache, |
846 | 0 | convolution_op_out); |
847 | 0 | } |
848 | | |
849 | | enum xnn_status xnn_create_convolution2d_nhwc_qs8( |
850 | | uint32_t input_padding_top, |
851 | | uint32_t input_padding_right, |
852 | | uint32_t input_padding_bottom, |
853 | | uint32_t input_padding_left, |
854 | | uint32_t kernel_height, |
855 | | uint32_t kernel_width, |
856 | | uint32_t subsampling_height, |
857 | | uint32_t subsampling_width, |
858 | | uint32_t dilation_height, |
859 | | uint32_t dilation_width, |
860 | | uint32_t groups, |
861 | | size_t group_input_channels, |
862 | | size_t group_output_channels, |
863 | | size_t input_channel_stride, |
864 | | size_t output_channel_stride, |
865 | | int8_t input_zero_point, |
866 | | float input_scale, |
867 | | float kernel_scale, |
868 | | const int8_t* kernel, |
869 | | const int32_t* bias, |
870 | | int8_t output_zero_point, |
871 | | float output_scale, |
872 | | int8_t output_min, |
873 | | int8_t output_max, |
874 | | uint32_t flags, |
875 | | xnn_code_cache_t code_cache, |
876 | | xnn_weights_cache_t weights_cache, |
877 | | xnn_operator_t* convolution_op_out) |
878 | 180 | { |
879 | 180 | if (input_scale <= 0.0f || !isnormal(input_scale)) { |
880 | 112 | xnn_log_error( |
881 | 112 | "failed to create %s operator with %.7g input scale: scale must be finite, normalized, and positive", |
882 | 112 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8), input_scale); |
883 | 112 | return xnn_status_invalid_parameter; |
884 | 112 | } |
885 | | |
886 | 68 | if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) { |
887 | 38 | xnn_log_error( |
888 | 38 | "failed to create %s operator with %.7g kernel scale: scale must be finite, normalized, and positive", |
889 | 38 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8), kernel_scale); |
890 | 38 | return xnn_status_invalid_parameter; |
891 | 38 | } |
892 | | |
893 | 30 | if (output_scale <= 0.0f || !isnormal(output_scale)) { |
894 | 9 | xnn_log_error( |
895 | 9 | "failed to create %s operator with %.7g output scale: scale must be finite, normalized, and positive", |
896 | 9 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8), output_scale); |
897 | 9 | return xnn_status_invalid_parameter; |
898 | 9 | } |
899 | | |
900 | 21 | if (output_min >= output_max) { |
901 | 0 | xnn_log_error( |
902 | 0 | "failed to create %s operator with [%" PRId8 ", %" PRId8 "] output range: range min must be below range max", |
903 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8), output_min, output_max); |
904 | 0 | return xnn_status_invalid_parameter; |
905 | 0 | } |
906 | | |
907 | 21 | const float requantization_scale = input_scale * kernel_scale / output_scale; |
908 | 21 | if (requantization_scale >= 256.0f) { |
909 | 21 | xnn_log_error( |
910 | 21 | "failed to create %s operator with %.7g input scale, %.7g kernel scale, and %.7g output scale: " |
911 | 21 | "requantization scale %.7g is greater or equal to 256.0", |
912 | 21 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8), |
913 | 21 | input_scale, kernel_scale, output_scale, requantization_scale); |
914 | 21 | return xnn_status_unsupported_parameter; |
915 | 21 | } |
916 | | |
917 | 0 | const struct xnn_qs8_packing_params packing_params = { .input_zero_point = input_zero_point, }; |
918 | |
919 | 0 | const struct xnn_gemm_config* gemm_config = xnn_init_qs8_gemm_config(); |
920 | 0 | assert(gemm_config != NULL); |
921 | | |
922 | 0 | union xnn_qs8_conv_minmax_params gemm_params; |
923 | 0 | if XNN_LIKELY(gemm_config->init.qs8 != NULL) { |
924 | 0 | gemm_config->init.qs8(&gemm_params, |
925 | 0 | requantization_scale, output_zero_point, output_min, output_max); |
926 | 0 | } |
927 | |
928 | 0 | const struct xnn_dwconv_config* dwconv_config = xnn_init_qs8_dwconv_config(); |
929 | 0 | assert(dwconv_config != NULL); |
930 | | |
931 | 0 | union xnn_qs8_conv_minmax_params dwconv_params; |
932 | 0 | const struct xnn_dwconv_config* dwconv_ukernel = |
933 | 0 | find_dwconv_ukernel(kernel_height * kernel_width, dwconv_config, XNN_MAX_QS8_DWCONV_UKERNELS); |
934 | 0 | if XNN_LIKELY(dwconv_ukernel != NULL) { |
935 | 0 | dwconv_ukernel->init.qs8(&dwconv_params, |
936 | 0 | requantization_scale, output_zero_point, output_min, output_max); |
937 | 0 | } |
938 | |
939 | 0 | return create_convolution2d_nhwc( |
940 | 0 | input_padding_top, input_padding_right, input_padding_bottom, input_padding_left, |
941 | 0 | kernel_height, kernel_width, |
942 | 0 | subsampling_height, subsampling_width, |
943 | 0 | dilation_height, dilation_width, |
944 | 0 | groups, group_input_channels, group_output_channels, |
945 | 0 | input_channel_stride, output_channel_stride, |
946 | 0 | kernel, bias, flags, |
947 | | /*log2_input_element_size=*/XNN_LOG2_SIZEOF_INT8_T, |
948 | | /*log2_filter_element_size=*/XNN_LOG2_SIZEOF_INT8_T, |
949 | 0 | /*bias_element_size=*/sizeof(int32_t), |
950 | 0 | (xnn_pack_vmulcaddc_w_fn) NULL, |
951 | 0 | (xnn_pack_dwconv_hwg_w_fn) xnn_pack_qs8_dwconv_hwg_w, |
952 | 0 | (xnn_pack_dwconv_ghw_w_fn) xnn_pack_qs8_dwconv_ghw_w, |
953 | 0 | (xnn_packw_gemm_goi_ukernel_fn) gemm_config->pack_gemm_goi, |
954 | 0 | (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w, |
955 | 0 | (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w, |
956 | 0 | /*packing_params=*/&packing_params, |
957 | 0 | /*input_padding_byte=*/input_zero_point, |
958 | 0 | /*packed_weights_padding_byte=*/0, |
959 | 0 | /*extra_weights_bytes=*/0, |
960 | | /*init_scale_params=*/NULL, |
961 | | /*scale_params=*/NULL, |
962 | 0 | /*gemm_params=*/&gemm_params, |
963 | 0 | /*gemm_params_size=*/sizeof(gemm_params), |
964 | 0 | /*dwconv_params=*/&dwconv_params, |
965 | 0 | /*dwconv_params_size=*/sizeof(dwconv_params), |
966 | | /*vmulcaddc_params=*/NULL, |
967 | 0 | /*vmulcaddc_params_size=*/0, |
968 | 0 | /*gemm_config=*/gemm_config, |
969 | 0 | /*dwconv_ukernel=*/dwconv_ukernel, |
970 | | /*vmulcaddc_config=*/NULL, |
971 | | /*jit_gemm_params=*/NULL, |
972 | | /*linear_activation=*/false, |
973 | | /*relu_activation=*/false, |
974 | 0 | /*operator_type=*/xnn_operator_type_convolution_nhwc_qs8, |
975 | 0 | /*num_post_operations=*/0, |
976 | | /*post_operation_params=*/NULL, |
977 | 0 | /*code_cache=*/code_cache, |
978 | 0 | /*weights_cache=*/weights_cache, |
979 | 0 | convolution_op_out); |
980 | 21 | } |
981 | | |
982 | | enum xnn_status xnn_create_convolution2d_nhwc_qs8_qc8w( |
983 | | uint32_t input_padding_top, |
984 | | uint32_t input_padding_right, |
985 | | uint32_t input_padding_bottom, |
986 | | uint32_t input_padding_left, |
987 | | uint32_t kernel_height, |
988 | | uint32_t kernel_width, |
989 | | uint32_t subsampling_height, |
990 | | uint32_t subsampling_width, |
991 | | uint32_t dilation_height, |
992 | | uint32_t dilation_width, |
993 | | uint32_t groups, |
994 | | size_t group_input_channels, |
995 | | size_t group_output_channels, |
996 | | size_t input_channel_stride, |
997 | | size_t output_channel_stride, |
998 | | int8_t input_zero_point, |
999 | | float input_scale, |
1000 | | const float* kernel_scale, |
1001 | | const int8_t* kernel, |
1002 | | const int32_t* bias, |
1003 | | int8_t output_zero_point, |
1004 | | float output_scale, |
1005 | | int8_t output_min, |
1006 | | int8_t output_max, |
1007 | | uint32_t flags, |
1008 | | xnn_code_cache_t code_cache, |
1009 | | xnn_weights_cache_t weights_cache, |
1010 | | xnn_operator_t* convolution_op_out) |
1011 | 0 | { |
1012 | 0 | if (input_scale <= 0.0f || !isnormal(input_scale)) { |
1013 | 0 | xnn_log_error( |
1014 | 0 | "failed to create %s operator with %.7g input scale: scale must be finite, normalized, and positive", |
1015 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qc8), input_scale); |
1016 | 0 | return xnn_status_invalid_parameter; |
1017 | 0 | } |
1018 | | |
1019 | 0 | for (size_t output_channel = 0; output_channel < groups * group_output_channels; output_channel++) { |
1020 | 0 | if (kernel_scale[output_channel] <= 0.0f || !isnormal(kernel_scale[output_channel])) { |
1021 | 0 | xnn_log_error( |
1022 | 0 | "failed to create %s operator with %.7g kernel scale in output channel #%zu: " |
1023 | 0 | "scale must be finite, normalized, and positive", |
1024 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qc8), kernel_scale[output_channel], |
1025 | 0 | output_channel); |
1026 | 0 | return xnn_status_invalid_parameter; |
1027 | 0 | } |
1028 | 0 | } |
1029 | | |
1030 | 0 | if (output_scale <= 0.0f || !isnormal(output_scale)) { |
1031 | 0 | xnn_log_error( |
1032 | 0 | "failed to create %s operator with %.7g output scale: scale must be finite, normalized, and positive", |
1033 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qc8), output_scale); |
1034 | 0 | return xnn_status_invalid_parameter; |
1035 | 0 | } |
1036 | | |
1037 | 0 | if (output_min >= output_max) { |
1038 | 0 | xnn_log_error( |
1039 | 0 | "failed to create %s operator with [%" PRId8 ", %" PRId8 "] output range: range min must be below range max", |
1040 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qc8), output_min, output_max); |
1041 | 0 | return xnn_status_invalid_parameter; |
1042 | 0 | } |
1043 | | |
1044 | 0 | float* requantization_scale = xnn_allocate_simd_memory(groups * group_output_channels * sizeof(float)); |
1045 | 0 | if (requantization_scale == NULL) { |
1046 | 0 | xnn_log_error( |
1047 | 0 | "failed to allocate %zu bytes for %s operator packed weights", |
1048 | 0 | groups * group_output_channels * sizeof(float), |
1049 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qc8)); |
1050 | 0 | return xnn_status_out_of_memory; |
1051 | 0 | } |
1052 | 0 | for (size_t output_channel = 0; output_channel < groups * group_output_channels; output_channel++) { |
1053 | 0 | requantization_scale[output_channel] = input_scale * kernel_scale[output_channel] / output_scale; |
1054 | 0 | if (requantization_scale[output_channel] >= 256.0f) { |
1055 | 0 | xnn_log_error( |
1056 | 0 | "failed to create %s operator with %.7g input scale, %.7g kernel scale, and %.7g output scale in output channel #%zu: " |
1057 | 0 | "requantization scale %.7g is greater or equal to 256.0", |
1058 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qc8), |
1059 | 0 | input_scale, kernel_scale[output_channel], output_scale, |
1060 | 0 | output_channel, requantization_scale[output_channel]); |
1061 | |
1062 | 0 | xnn_release_simd_memory(requantization_scale); |
1063 | 0 | return xnn_status_unsupported_parameter; |
1064 | 0 | } |
1065 | 0 | } |
1066 | | |
1067 | 0 | const struct xnn_qs8_packing_params packing_params = { .input_zero_point = input_zero_point, }; |
1068 | |
1069 | 0 | const struct xnn_gemm_config* gemm_config = xnn_init_qs8_qc8w_gemm_config(); |
1070 | 0 | assert(gemm_config != NULL); |
1071 | | |
1072 | 0 | union xnn_qs8_qc8w_conv_minmax_params gemm_params; |
1073 | 0 | if XNN_LIKELY(gemm_config->init.qs8_qc8w != NULL) { |
1074 | 0 | gemm_config->init.qs8_qc8w(&gemm_params, |
1075 | 0 | output_zero_point, output_min, output_max); |
1076 | 0 | } |
1077 | |
1078 | 0 | const struct xnn_dwconv_config* dwconv_config = xnn_init_qs8_qc8w_dwconv_config(); |
1079 | 0 | assert(dwconv_config != NULL); |
1080 | | |
1081 | 0 | union xnn_qs8_qc8w_conv_minmax_params dwconv_params; |
1082 | 0 | const struct xnn_dwconv_config* dwconv_ukernel = |
1083 | 0 | find_dwconv_ukernel(kernel_height * kernel_width, dwconv_config, XNN_MAX_QC8_DWCONV_UKERNELS); |
1084 | 0 | if XNN_LIKELY(dwconv_ukernel != NULL) { |
1085 | 0 | dwconv_ukernel->init.qs8_qc8w(&dwconv_params, |
1086 | 0 | output_zero_point, output_min, output_max); |
1087 | 0 | } |
1088 | |
1089 | 0 | enum xnn_status status = create_convolution2d_nhwc( |
1090 | 0 | input_padding_top, input_padding_right, input_padding_bottom, input_padding_left, |
1091 | 0 | kernel_height, kernel_width, |
1092 | 0 | subsampling_height, subsampling_width, |
1093 | 0 | dilation_height, dilation_width, |
1094 | 0 | groups, group_input_channels, group_output_channels, |
1095 | 0 | input_channel_stride, output_channel_stride, |
1096 | 0 | kernel, bias, flags, |
1097 | | /*log2_input_element_size=*/XNN_LOG2_SIZEOF_INT8_T, |
1098 | | /*log2_filter_element_size=*/XNN_LOG2_SIZEOF_INT8_T, |
1099 | 0 | /*bias_element_size=*/sizeof(int32_t), |
1100 | 0 | (xnn_pack_vmulcaddc_w_fn) NULL, |
1101 | 0 | (xnn_pack_dwconv_hwg_w_fn) xnn_pack_qs8_dwconv_hwg_w, |
1102 | 0 | (xnn_pack_dwconv_ghw_w_fn) xnn_pack_qs8_dwconv_ghw_w, |
1103 | 0 | (xnn_packw_gemm_goi_ukernel_fn) gemm_config->pack_gemm_goi, |
1104 | 0 | (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w, |
1105 | 0 | (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w, |
1106 | 0 | /*packing_params=*/&packing_params, |
1107 | 0 | /*input_padding_byte=*/input_zero_point, |
1108 | 0 | /*packed_weights_padding_byte=*/0, |
1109 | 0 | /*extra_weights_bytes=*/sizeof(float), |
1110 | 0 | /*init_scale_params=*/xnn_init_qs8_qc8w_scale_fp32_params, |
1111 | 0 | /*scale_params=*/requantization_scale, |
1112 | 0 | /*gemm_params=*/&gemm_params, |
1113 | 0 | /*gemm_params_size=*/sizeof(gemm_params), |
1114 | 0 | /*dwconv_params=*/&dwconv_params, |
1115 | 0 | /*dwconv_params_size=*/sizeof(dwconv_params), |
1116 | | /*vmulcaddc_params=*/NULL, |
1117 | 0 | /*vmulcaddc_params_size=*/0, |
1118 | 0 | /*gemm_config=*/gemm_config, |
1119 | 0 | /*dwconv_ukernel=*/dwconv_ukernel, |
1120 | | /*vmulcaddc_config=*/NULL, |
1121 | | /*jit_gemm_params=*/NULL, |
1122 | | /*linear_activation=*/false, |
1123 | | /*relu_activation=*/false, |
1124 | 0 | /*operator_type=*/xnn_operator_type_convolution_nhwc_qc8, |
1125 | 0 | /*num_post_operations=*/0, |
1126 | | /*post_operation_params=*/NULL, |
1127 | 0 | /*code_cache=*/code_cache, |
1128 | 0 | /*weights_cache=*/weights_cache, |
1129 | 0 | convolution_op_out); |
1130 | |
1131 | 0 | xnn_release_simd_memory(requantization_scale); |
1132 | 0 | return status; |
1133 | 0 | } |
1134 | | |
1135 | | enum xnn_status xnn_create_convolution2d_nhwc_f16( |
1136 | | uint32_t input_padding_top, |
1137 | | uint32_t input_padding_right, |
1138 | | uint32_t input_padding_bottom, |
1139 | | uint32_t input_padding_left, |
1140 | | uint32_t kernel_height, |
1141 | | uint32_t kernel_width, |
1142 | | uint32_t subsampling_height, |
1143 | | uint32_t subsampling_width, |
1144 | | uint32_t dilation_height, |
1145 | | uint32_t dilation_width, |
1146 | | uint32_t groups, |
1147 | | size_t group_input_channels, |
1148 | | size_t group_output_channels, |
1149 | | size_t input_channel_stride, |
1150 | | size_t output_channel_stride, |
1151 | | const void* kernel, |
1152 | | const void* bias, |
1153 | | float output_min, |
1154 | | float output_max, |
1155 | | uint32_t flags, |
1156 | | xnn_code_cache_t code_cache, |
1157 | | xnn_weights_cache_t weights_cache, |
1158 | | xnn_operator_t* convolution_op_out) |
1159 | 0 | { |
1160 | 0 | if (isnan(output_min)) { |
1161 | 0 | xnn_log_error( |
1162 | 0 | "failed to create %s operator with NaN output lower bound: lower bound must be non-NaN", |
1163 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f16)); |
1164 | 0 | return xnn_status_invalid_parameter; |
1165 | 0 | } |
1166 | | |
1167 | 0 | if (isnan(output_max)) { |
1168 | 0 | xnn_log_error( |
1169 | 0 | "failed to create %s operator with NaN output upper bound: upper bound must be non-NaN", |
1170 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f16)); |
1171 | 0 | return xnn_status_invalid_parameter; |
1172 | 0 | } |
1173 | | |
1174 | 0 | const uint16_t fp16_output_min = fp16_ieee_from_fp32_value(output_min); |
1175 | 0 | const uint16_t fp16_output_max = fp16_ieee_from_fp32_value(output_max); |
1176 | 0 | const float rounded_output_min = fp16_ieee_to_fp32_value(fp16_output_min); |
1177 | 0 | const float rounded_output_max = fp16_ieee_to_fp32_value(fp16_output_max); |
1178 | 0 | if (rounded_output_min >= rounded_output_max) { |
1179 | 0 | xnn_log_error( |
1180 | 0 | "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be below upper bound", |
1181 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f16), rounded_output_min, rounded_output_max); |
1182 | 0 | return xnn_status_invalid_parameter; |
1183 | 0 | } |
1184 | | |
1185 | 0 | const struct xnn_gemm_config* gemm_config = xnn_init_f16_gemm_config(); |
1186 | 0 | if (gemm_config == NULL) { |
1187 | 0 | xnn_log_error("failed to create %s operator: unsupported hardware configuration", |
1188 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f16)); |
1189 | 0 | return xnn_status_unsupported_hardware; |
1190 | 0 | } |
1191 | | |
1192 | 0 | union xnn_f16_minmax_params gemm_params; |
1193 | 0 | if XNN_LIKELY(gemm_config->init.f16 != NULL) { |
1194 | 0 | gemm_config->init.f16(&gemm_params, fp16_output_min, fp16_output_max); |
1195 | 0 | } |
1196 | |
1197 | 0 | const struct xnn_dwconv_config* dwconv_config = xnn_init_f16_dwconv_config(); |
1198 | 0 | if (dwconv_config == NULL) { |
1199 | 0 | xnn_log_error("failed to create %s operator: unsupported hardware configuration", |
1200 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f16)); |
1201 | 0 | return xnn_status_unsupported_hardware; |
1202 | 0 | } |
1203 | | |
1204 | 0 | union xnn_f16_minmax_params dwconv_params; |
1205 | 0 | const struct xnn_dwconv_config* dwconv_ukernel = |
1206 | 0 | find_dwconv_ukernel(kernel_height * kernel_width, dwconv_config, XNN_MAX_F16_DWCONV_UKERNELS); |
1207 | 0 | if XNN_LIKELY(dwconv_ukernel != NULL) { |
1208 | 0 | dwconv_ukernel->init.f16(&dwconv_params, fp16_output_min, fp16_output_max); |
1209 | 0 | } |
1210 | |
1211 | 0 | const struct xnn_vmulcaddc_config* vmulcaddc_config = xnn_init_f16_vmulcaddc_config(); |
1212 | 0 | if (vmulcaddc_config == NULL) { |
1213 | 0 | xnn_log_error("failed to create %s operator: unsupported hardware configuration", |
1214 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f16)); |
1215 | 0 | return xnn_status_unsupported_hardware; |
1216 | 0 | } |
1217 | | |
1218 | 0 | union xnn_f16_minmax_params vmulcaddc_params; |
1219 | 0 | if XNN_LIKELY(vmulcaddc_config->init.f16 != NULL) { |
1220 | 0 | vmulcaddc_config->init.f16(&vmulcaddc_params, fp16_output_min, fp16_output_max); |
1221 | 0 | } |
1222 | |
1223 | 0 | struct jit_gemm_params jit_gemm_params = { |
1224 | 0 | .f16_minmax = { |
1225 | 0 | .min = fp16_output_min, |
1226 | 0 | .max = fp16_output_max |
1227 | 0 | } |
1228 | 0 | }; |
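 | | // Note: .min/.max above hold the raw IEEE fp16 bit patterns (uint16_t) computed
 | | // earlier, not f32 values; presumably the JIT GEMM generator consumes them in that
 | | // encoding.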
1229 | |
1230 | 0 | xnn_pack_vmulcaddc_w_fn pack_vmulcaddc_w = (xnn_pack_vmulcaddc_w_fn) xnn_pack_f16_vmulcaddc_w; |
1231 | 0 | xnn_pack_dwconv_hwg_w_fn pack_dwconv_hwg_w = (xnn_pack_dwconv_hwg_w_fn) xnn_pack_f16_dwconv_hwg_w; |
1232 | 0 | xnn_pack_dwconv_ghw_w_fn pack_dwconv_ghw_w = (xnn_pack_dwconv_ghw_w_fn) xnn_pack_f16_dwconv_ghw_w; |
1233 | 0 | xnn_packw_gemm_goi_ukernel_fn pack_gemm_goi_w = (xnn_packw_gemm_goi_ukernel_fn) gemm_config->pack_gemm_goi; |
1234 | 0 | xnn_pack_conv_kgo_w_fn pack_conv_kgo_w = (xnn_pack_conv_kgo_w_fn) xnn_pack_f16_conv_kgo_w; |
1235 | 0 | xnn_pack_conv_goki_w_fn pack_conv_goki_w = (xnn_pack_conv_goki_w_fn) xnn_pack_f16_conv_goki_w; |
1236 | 0 | if (flags & XNN_FLAG_FP32_STATIC_WEIGHTS) { |
1237 | 0 | pack_vmulcaddc_w = (xnn_pack_vmulcaddc_w_fn) xnn_pack_f32_to_f16_vmulcaddc_w; |
1238 | 0 | pack_dwconv_hwg_w = (xnn_pack_dwconv_hwg_w_fn) xnn_pack_f32_to_f16_dwconv_hwg_w; |
1239 | 0 | pack_dwconv_ghw_w = (xnn_pack_dwconv_ghw_w_fn) xnn_pack_f32_to_f16_dwconv_ghw_w; |
1240 | 0 | pack_gemm_goi_w = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_f32_to_f16_gemm_goi_w; |
1241 | 0 | pack_conv_kgo_w = (xnn_pack_conv_kgo_w_fn) xnn_pack_f32_to_f16_conv_kgo_w; |
1242 | 0 | pack_conv_goki_w = (xnn_pack_conv_goki_w_fn) xnn_pack_f32_to_f16_conv_goki_w; |
1243 | 0 | } |
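 | | // With XNN_FLAG_FP32_STATIC_WEIGHTS the caller supplies f32 weights/bias and the
 | | // packers above convert them to f16 at pack time. Illustrative caller code (the
 | | // weight array names are hypothetical):
 | | //   const float* kernel_f32 = ...;  // f32 weights for an otherwise-f16 operator
 | | //   const float* bias_f32 = ...;
 | | //   xnn_create_convolution2d_nhwc_f16(..., kernel_f32, bias_f32, output_min,
 | | //       output_max, XNN_FLAG_FP32_STATIC_WEIGHTS, code_cache, weights_cache, &op);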
1244 | |
1245 | 0 | return create_convolution2d_nhwc( |
1246 | 0 | input_padding_top, input_padding_right, input_padding_bottom, input_padding_left, |
1247 | 0 | kernel_height, kernel_width, |
1248 | 0 | subsampling_height, subsampling_width, |
1249 | 0 | dilation_height, dilation_width, |
1250 | 0 | groups, group_input_channels, group_output_channels, |
1251 | 0 | input_channel_stride, output_channel_stride, |
1252 | 0 | kernel, bias, flags, |
1253 | | /*log2_input_element_size=*/XNN_LOG2_SIZEOF_HALF, |
1254 | | /*log2_filter_element_size=*/XNN_LOG2_SIZEOF_HALF, |
1255 | 0 | /*bias_element_size=*/sizeof(uint16_t), |
1256 | 0 | pack_vmulcaddc_w, |
1257 | 0 | pack_dwconv_hwg_w, |
1258 | 0 | pack_dwconv_ghw_w, |
1259 | 0 | pack_gemm_goi_w, |
1260 | 0 | pack_conv_kgo_w, |
1261 | 0 | pack_conv_goki_w, |
1262 | | /*packing_params=*/NULL, |
1263 | 0 | /*input_padding_byte=*/0, |
1264 | 0 | /*packed_weights_padding_byte=*/0, |
1265 | 0 | /*extra_weights_bytes=*/0, |
1266 | | /*init_scale_params=*/NULL, |
1267 | | /*scale_params=*/NULL, |
1268 | 0 | /*gemm_params=*/&gemm_params, |
1269 | 0 | /*gemm_params_size=*/sizeof(gemm_params), |
1270 | 0 | /*dwconv_params=*/&dwconv_params, |
1271 | 0 | /*dwconv_params_size=*/sizeof(dwconv_params), |
1272 | 0 | /*vmulcaddc_params=*/&vmulcaddc_params, |
1273 | 0 | /*vmulcaddc_params_size=*/sizeof(vmulcaddc_params), |
1274 | 0 | /*gemm_config=*/gemm_config, |
1275 | 0 | /*dwconv_ukernel=*/dwconv_ukernel, |
1276 | 0 | /*vmulcaddc_config=*/vmulcaddc_config, |
1277 | 0 | /*jit_gemm_params=*/&jit_gemm_params, |
1278 | | /*linear_activation=*/false, |
1279 | | /*relu_activation=*/false, |
1280 | 0 | /*operator_type=*/xnn_operator_type_convolution_nhwc_f16, |
1281 | 0 | /*num_post_operations=*/0, |
1282 | | /*post_operation_params=*/NULL, |
1283 | 0 | /*code_cache=*/code_cache, |
1284 | 0 | /*weights_cache=*/weights_cache, |
1285 | 0 | convolution_op_out); |
1286 | 0 | } |
1287 | | |
1288 | | enum xnn_status xnn_create_convolution2d_nhwc_f32( |
1289 | | uint32_t input_padding_top, |
1290 | | uint32_t input_padding_right, |
1291 | | uint32_t input_padding_bottom, |
1292 | | uint32_t input_padding_left, |
1293 | | uint32_t kernel_height, |
1294 | | uint32_t kernel_width, |
1295 | | uint32_t subsampling_height, |
1296 | | uint32_t subsampling_width, |
1297 | | uint32_t dilation_height, |
1298 | | uint32_t dilation_width, |
1299 | | uint32_t groups, |
1300 | | size_t group_input_channels, |
1301 | | size_t group_output_channels, |
1302 | | size_t input_channel_stride, |
1303 | | size_t output_channel_stride, |
1304 | | const float* kernel, |
1305 | | const float* bias, |
1306 | | float output_min, |
1307 | | float output_max, |
1308 | | uint32_t flags, |
1309 | | xnn_code_cache_t code_cache, |
1310 | | xnn_weights_cache_t weights_cache, |
1311 | | xnn_operator_t* convolution_op_out) |
1312 | 0 | { |
1313 | 0 | if (isnan(output_min)) { |
1314 | 0 | xnn_log_error( |
1315 | 0 | "failed to create %s operator with NaN output lower bound: lower bound must be non-NaN", |
1316 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32)); |
1317 | 0 | return xnn_status_invalid_parameter; |
1318 | 0 | } |
1319 | | |
1320 | 0 | if (isnan(output_max)) { |
1321 | 0 | xnn_log_error( |
1322 | 0 | "failed to create %s operator with NaN output upper bound: upper bound must be non-NaN", |
1323 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32)); |
1324 | 0 | return xnn_status_invalid_parameter; |
1325 | 0 | } |
1326 | | |
1327 | 0 | if (output_min >= output_max) { |
1328 | 0 | xnn_log_error( |
1329 | 0 | "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be below upper bound", |
1330 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32), output_min, output_max); |
1331 | 0 | return xnn_status_invalid_parameter; |
1332 | 0 | } |
1333 | | |
1334 | 0 | const bool linear_activation = (output_max == INFINITY) && (output_min == -output_max); |
1335 | 0 | const bool relu_activation = (output_max == INFINITY) && (output_min == 0.0f); |
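 | | // Examples: bounds [-inf, +inf] select linear_activation (no clamping) and [0, +inf]
 | | // select relu_activation; anything else, e.g. [0, 6] for ReLU6, stays a generic
 | | // min/max clamp.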
1336 | |
1337 | 0 | const struct xnn_gemm_config* gemm_config = xnn_init_f32_gemm_config(); |
1338 | 0 | if (gemm_config == NULL) { |
1339 | 0 | xnn_log_error("failed to create %s operator: unsupported hardware configuration", |
1340 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32)); |
1341 | 0 | return xnn_status_unsupported_hardware; |
1342 | 0 | } |
1343 | | |
1344 | 0 | const struct xnn_gemm_config* gemm_nr2_config = xnn_init_f32_gemm_nr2_config(); |
1345 | 0 | if (gemm_nr2_config == NULL) { |
1346 | 0 | xnn_log_error("failed to create %s operator: unsupported hardware configuration", |
1347 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32)); |
1348 | 0 | return xnn_status_unsupported_hardware; |
1349 | 0 | } |
1350 | | |
1351 | 0 | if (gemm_config->nr > group_output_channels) { |
1352 | | // The default micro-kernel is suboptimal. Try to find a better micro-kernel.
1353 | |
1354 | 0 | if (gemm_nr2_config->minmax.igemm[gemm_config->mr].function[XNN_UARCH_DEFAULT] != NULL) { |
1355 | 0 | gemm_config = gemm_nr2_config; |
1356 | 0 | } |
1357 | 0 | } |
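 | | // Illustrative case: with nr = 8 and group_output_channels = 2, each N-tile would
 | | // compute 8 output channels but keep only 2; a secondary config with a smaller nr
 | | // (hence "nr2") wastes fewer lanes, so it is preferred when its igemm kernel exists
 | | // for this mr.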
1358 | |
1359 | 0 | union xnn_f32_minmax_params gemm_params; |
1360 | 0 | if XNN_LIKELY(gemm_config->init.f32 != NULL) { |
1361 | 0 | gemm_config->init.f32(&gemm_params, output_min, output_max); |
1362 | 0 | } |
1363 | |
1364 | 0 | struct jit_gemm_params jit_gemm_params = { |
1365 | 0 | .f32_minmax = { |
1366 | 0 | .min = output_min, |
1367 | 0 | .max = output_max |
1368 | 0 | } |
1369 | 0 | }; |
1370 | |
1371 | 0 | const struct xnn_dwconv_config* dwconv_config = xnn_init_f32_dwconv_config(); |
1372 | 0 | if (dwconv_config == NULL) { |
1373 | 0 | xnn_log_error("failed to create %s operator: unsupported hardware configuration", |
1374 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32)); |
1375 | 0 | return xnn_status_unsupported_hardware; |
1376 | 0 | } |
1377 | | |
1378 | 0 | union xnn_f32_minmax_params dwconv_params; |
1379 | 0 | const struct xnn_dwconv_config* dwconv_ukernel = |
1380 | 0 | find_dwconv_ukernel(kernel_height * kernel_width, dwconv_config, XNN_MAX_F32_DWCONV_UKERNELS); |
1381 | 0 | if XNN_LIKELY(dwconv_ukernel != NULL) { |
1382 | 0 | dwconv_ukernel->init.f32(&dwconv_params, output_min, output_max); |
1383 | 0 | } |
1384 | |
1385 | 0 | const struct xnn_vmulcaddc_config* vmulcaddc_config = xnn_init_f32_vmulcaddc_config(); |
1386 | 0 | if (vmulcaddc_config == NULL) { |
1387 | 0 | xnn_log_error("failed to create %s operator: unsupported hardware configuration", |
1388 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32)); |
1389 | 0 | return xnn_status_unsupported_hardware; |
1390 | 0 | } |
1391 | | |
1392 | 0 | union xnn_f32_minmax_params vmulcaddc_params; |
1393 | 0 | if XNN_LIKELY(vmulcaddc_config->init.f32 != NULL) { |
1394 | 0 | vmulcaddc_config->init.f32(&vmulcaddc_params, output_min, output_max); |
1395 | 0 | } |
1396 | |
1397 | 0 | return create_convolution2d_nhwc( |
1398 | 0 | input_padding_top, input_padding_right, input_padding_bottom, input_padding_left, |
1399 | 0 | kernel_height, kernel_width, |
1400 | 0 | subsampling_height, subsampling_width, |
1401 | 0 | dilation_height, dilation_width, |
1402 | 0 | groups, group_input_channels, group_output_channels, |
1403 | 0 | input_channel_stride, output_channel_stride, |
1404 | 0 | kernel, bias, flags, |
1405 | | /*log2_input_element_size=*/XNN_LOG2_SIZEOF_FLOAT, |
1406 | | /*log2_filter_element_size=*/XNN_LOG2_SIZEOF_FLOAT, |
1407 | 0 | /*bias_element_size=*/sizeof(float), |
1408 | 0 | (xnn_pack_vmulcaddc_w_fn) xnn_pack_f32_vmulcaddc_w, |
1409 | 0 | (xnn_pack_dwconv_hwg_w_fn) xnn_pack_f32_dwconv_hwg_w, |
1410 | 0 | (xnn_pack_dwconv_ghw_w_fn) xnn_pack_f32_dwconv_ghw_w, |
1411 | 0 | (xnn_packw_gemm_goi_ukernel_fn) gemm_config->pack_gemm_goi, |
1412 | 0 | (xnn_pack_conv_kgo_w_fn) xnn_pack_f32_conv_kgo_w, |
1413 | 0 | (xnn_pack_conv_goki_w_fn) xnn_pack_f32_conv_goki_w, |
1414 | | /*packing_params=*/NULL, |
1415 | 0 | /*input_padding_byte=*/0, |
1416 | 0 | /*packed_weights_padding_byte=*/0, |
1417 | 0 | /*extra_weights_bytes=*/0, |
1418 | | /*init_scale_params=*/NULL, |
1419 | | /*scale_params=*/NULL, |
1420 | 0 | /*gemm_params=*/&gemm_params, |
1421 | 0 | /*gemm_params_size=*/sizeof(gemm_params), |
1422 | 0 | /*dwconv_params=*/&dwconv_params, |
1423 | 0 | /*dwconv_params_size=*/sizeof(dwconv_params), |
1424 | 0 | /*vmulcaddc_params=*/&vmulcaddc_params, |
1425 | 0 | /*vmulcaddc_params_size=*/sizeof(vmulcaddc_params), |
1426 | 0 | /*gemm_config=*/gemm_config, |
1427 | 0 | /*dwconv_ukernel=*/dwconv_ukernel, |
1428 | 0 | /*vmulcaddc_config=*/vmulcaddc_config, |
1429 | 0 | /*jit_gemm_params=*/&jit_gemm_params, |
1430 | 0 | /*linear_activation=*/linear_activation, |
1431 | 0 | /*relu_activation=*/relu_activation, |
1432 | 0 | /*operator_type=*/xnn_operator_type_convolution_nhwc_f32, |
1433 | 0 | /*num_post_operations=*/0, |
1434 | | /*post_operation_params=*/NULL, |
1435 | 0 | /*code_cache=*/code_cache, |
1436 | 0 | /*weights_cache=*/weights_cache, |
1437 | 0 | convolution_op_out); |
1438 | 0 | } |
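 | | // Illustrative call (assumed caller code, error handling elided): a dense 3x3
 | | // stride-2 f32 convolution with ReLU bounds, which takes the relu_activation path
 | | // above:
 | | //   xnn_operator_t op = NULL;
 | | //   xnn_create_convolution2d_nhwc_f32(
 | | //       /*padding top/right/bottom/left=*/1, 1, 1, 1, /*kernel=*/3, 3,
 | | //       /*subsampling=*/2, 2, /*dilation=*/1, 1, /*groups=*/1,
 | | //       /*group_input_channels=*/32, /*group_output_channels=*/64,
 | | //       /*input_channel_stride=*/32, /*output_channel_stride=*/64, kernel, bias,
 | | //       /*output_min=*/0.0f, /*output_max=*/INFINITY, /*flags=*/0,
 | | //       /*code_cache=*/NULL, /*weights_cache=*/NULL, &op);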
1439 | | |
1440 | | enum xnn_status xnn_create_fused_convolution2d_nhwc_f32( |
1441 | | uint32_t input_padding_top, |
1442 | | uint32_t input_padding_right, |
1443 | | uint32_t input_padding_bottom, |
1444 | | uint32_t input_padding_left, |
1445 | | uint32_t kernel_height, |
1446 | | uint32_t kernel_width, |
1447 | | uint32_t subsampling_height, |
1448 | | uint32_t subsampling_width, |
1449 | | uint32_t dilation_height, |
1450 | | uint32_t dilation_width, |
1451 | | uint32_t groups, |
1452 | | size_t group_input_channels, |
1453 | | size_t group_output_channels, |
1454 | | size_t input_channel_stride, |
1455 | | size_t output_channel_stride, |
1456 | | const float* kernel, |
1457 | | const float* bias, |
1458 | | size_t num_post_operations, |
1459 | | struct xnn_post_operation* post_operations, |
1460 | | uint32_t flags, |
1461 | | xnn_code_cache_t code_cache, |
1462 | | xnn_weights_cache_t weights_cache, |
1463 | | xnn_operator_t* convolution_op_out) |
1464 | 0 | { |
1465 | 0 | if (code_cache == NULL) { |
1466 | 0 | xnn_log_error( |
1467 | 0 | "failed to create %s operator: convolution with post operations available only if JIT is enabled", |
1468 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32)); |
1469 | 0 | return xnn_status_invalid_parameter; |
1470 | 0 | } |
1471 | | |
1472 | | // Convolution is specified with linear activation; any clamping should be specified as a post operator.
1473 | 0 | const float output_max = INFINITY; |
1474 | 0 | const float output_min = -INFINITY; |
1475 | |
1476 | 0 | struct jit_gemm_params jit_gemm_params = { |
1477 | 0 | .f32_minmax = { |
1478 | 0 | .min = output_min, |
1479 | 0 | .max = output_max |
1480 | 0 | }, |
1481 | 0 | .num_post_operations = num_post_operations, |
1482 | 0 | .post_operations = post_operations, |
1483 | 0 | }; |
1484 | |
1485 | 0 | char* post_operation_params = allocate_and_initialize_post_operation_params(num_post_operations, post_operations); |
1486 | |
1487 | 0 | const struct xnn_gemm_config* gemm_config = xnn_init_f32_gemm_config(); |
1488 | 0 | if (gemm_config == NULL) { |
1489 | 0 | xnn_log_error("failed to create %s operator: unsupported hardware configuration", |
1490 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32)); |
1491 | 0 | return xnn_status_unsupported_hardware; |
1492 | 0 | } |
1493 | | |
1494 | 0 | union xnn_f32_minmax_params gemm_params; |
1495 | 0 | if XNN_LIKELY(gemm_config->init.f32 != NULL) { |
1496 | 0 | gemm_config->init.f32(&gemm_params, output_min, output_max); |
1497 | 0 | } |
1498 | |
1499 | 0 | const struct xnn_dwconv_config* dwconv_config = xnn_init_f32_dwconv_config(); |
1500 | 0 | if (dwconv_config == NULL) { |
1501 | 0 | xnn_log_error("failed to create %s operator: unsupported hardware configuration", |
1502 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32)); |
1503 | 0 | return xnn_status_unsupported_hardware; |
1504 | 0 | } |
1505 | | |
1506 | 0 | union xnn_f32_minmax_params dwconv_params; |
1507 | 0 | const struct xnn_dwconv_config* dwconv_ukernel = |
1508 | 0 | find_dwconv_ukernel(kernel_height * kernel_width, dwconv_config, XNN_MAX_F32_DWCONV_UKERNELS); |
1509 | 0 | if XNN_LIKELY(dwconv_ukernel != NULL) { |
1510 | 0 | dwconv_ukernel->init.f32(&dwconv_params, output_min, output_max); |
1511 | 0 | } |
1512 | |
1513 | 0 | const struct xnn_vmulcaddc_config* vmulcaddc_config = xnn_init_f32_vmulcaddc_config(); |
1514 | 0 | if (vmulcaddc_config == NULL) { |
1515 | 0 | xnn_log_error("failed to create %s operator: unsupported hardware configuration", |
1516 | 0 | xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32)); |
1517 | 0 | return xnn_status_unsupported_hardware; |
1518 | 0 | } |
1519 | | |
1520 | 0 | union xnn_f32_minmax_params vmulcaddc_params; |
1521 | 0 | if XNN_LIKELY(vmulcaddc_config->init.f32 != NULL) { |
1522 | 0 | vmulcaddc_config->init.f32(&vmulcaddc_params, output_min, output_max); |
1523 | 0 | } |
1524 | |
1525 | 0 | return create_convolution2d_nhwc( |
1526 | 0 | input_padding_top, input_padding_right, input_padding_bottom, input_padding_left, |
1527 | 0 | kernel_height, kernel_width, |
1528 | 0 | subsampling_height, subsampling_width, |
1529 | 0 | dilation_height, dilation_width, |
1530 | 0 | groups, group_input_channels, group_output_channels, |
1531 | 0 | input_channel_stride, output_channel_stride, |
1532 | 0 | kernel, bias, flags, |
1533 | | /*log2_input_element_size=*/XNN_LOG2_SIZEOF_FLOAT, |
1534 | | /*log2_filter_element_size=*/XNN_LOG2_SIZEOF_FLOAT, |
1535 | 0 | /*bias_element_size=*/sizeof(float), |
1536 | 0 | (xnn_pack_vmulcaddc_w_fn) xnn_pack_f32_vmulcaddc_w, |
1537 | 0 | (xnn_pack_dwconv_hwg_w_fn) xnn_pack_f32_dwconv_hwg_w, |
1538 | 0 | (xnn_pack_dwconv_ghw_w_fn) xnn_pack_f32_dwconv_ghw_w, |
1539 | 0 | (xnn_packw_gemm_goi_ukernel_fn) gemm_config->pack_gemm_goi, |
1540 | 0 | (xnn_pack_conv_kgo_w_fn) xnn_pack_f32_conv_kgo_w, |
1541 | 0 | (xnn_pack_conv_goki_w_fn) xnn_pack_f32_conv_goki_w, |
1542 | | /*packing_params=*/NULL, |
1543 | 0 | /*input_padding_byte=*/0, |
1544 | 0 | /*packed_weights_padding_byte=*/0, |
1545 | 0 | /*extra_weights_bytes=*/0, |
1546 | | /*init_scale_params=*/NULL, |
1547 | | /*scale_params=*/NULL, |
1548 | 0 | /*gemm_params=*/(void*) &gemm_params, |
1549 | 0 | /*gemm_params_size=*/sizeof(gemm_params), |
1550 | 0 | /*dwconv_params=*/&dwconv_params, |
1551 | 0 | /*dwconv_params_size=*/sizeof(dwconv_params), |
1552 | 0 | /*vmulcaddc_params=*/&vmulcaddc_params, |
1553 | 0 | /*vmulcaddc_params_size=*/sizeof(vmulcaddc_params), |
1554 | 0 | /*gemm_config=*/gemm_config, |
1555 | 0 | /*dwconv_ukernel=*/dwconv_ukernel, |
1556 | 0 | /*vmulcaddc_config=*/vmulcaddc_config, |
1557 | 0 | /*jit_gemm_params=*/&jit_gemm_params, |
1558 | | /*linear_activation=*/true, |
1559 | | /*relu_activation=*/false, |
1560 | 0 | /*operator_type=*/xnn_operator_type_convolution_nhwc_f32, |
1561 | 0 | /*num_post_operations=*/num_post_operations, |
1562 | 0 | /*post_operation_params=*/post_operation_params, |
1563 | 0 | /*code_cache=*/code_cache, |
1564 | 0 | /*weights_cache=*/weights_cache, |
1565 | 0 | convolution_op_out); |
1566 | 0 | } |
1567 | | |
1568 | | static inline bool input_size_changed(xnn_operator_t convolution_op) |
1569 | 0 | { |
1570 | 0 | return convolution_op->input_height != convolution_op->last_input_height || |
1571 | 0 | convolution_op->input_width != convolution_op->last_input_width; |
1572 | 0 | } |
1573 | | |
1574 | | static enum xnn_status reshape_gemm( |
1575 | | xnn_operator_t convolution_op, |
1576 | | uint32_t log2_input_element_size, |
1577 | | uint32_t log2_filter_element_size, |
1578 | | uint32_t extra_weights_elements_size, |
1579 | | uint32_t log2_output_element_size, |
1580 | | size_t* workspace_size, |
1581 | | size_t* workspace_alignment, |
1582 | | size_t num_threads) |
1583 | 0 | { |
1584 | | // Convolution maps directly to GEMM and doesn't use an indirection buffer.
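 | | // E.g. a 1x1 convolution with unit stride and no padding is a plain GEMM per group:
 | | // M = batch_size * output_height * output_width, N = group_output_channels,
 | | // K = group_input_channels.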
1585 | 0 | const size_t batch_size = convolution_op->batch_size; |
1586 | |
1587 | 0 | const size_t output_height = convolution_op->output_height; |
1588 | 0 | const size_t output_width = convolution_op->output_width; |
1589 | 0 | const size_t output_size = output_height * output_width; |
1590 | 0 | const size_t batch_output_size = batch_size * output_size; |
1591 | |
1592 | 0 | const size_t groups = convolution_op->groups; |
1593 | 0 | const size_t group_input_channels = convolution_op->group_input_channels; |
1594 | 0 | const size_t w_stride = extra_weights_elements_size + |
1595 | 0 | (round_up_po2(group_input_channels, convolution_op->ukernel.gemm.kr * convolution_op->ukernel.gemm.sr) << log2_filter_element_size); |
1596 | 0 | const size_t group_output_channels = convolution_op->group_output_channels; |
1597 | |
1598 | 0 | uint32_t mr = convolution_op->ukernel.gemm.mr; |
1599 | 0 | const uint32_t nr = convolution_op->ukernel.gemm.nr; |
1600 | 0 | struct xnn_hmp_gemm_ukernel *gemm_cases = convolution_op->ukernel.gemm.gemm_cases; |
1601 | |
1602 | 0 | #if XNN_ENABLE_GEMM_M_SPECIALIZATION |
1603 | 0 | mr = xnn_get_heuristic_mr_gemm(batch_output_size, mr, nr, gemm_cases, convolution_op->code_cache != NULL); |
1604 | | #else |
1605 | | if (batch_output_size == 1 && gemm_cases[0].function[XNN_UARCH_DEFAULT] != NULL) { |
1606 | | mr = 1; |
1607 | | } |
1608 | | #endif |
1609 | |
1610 | | #if XNN_PLATFORM_JIT |
1611 | | xnn_overwrite_gemm_cases_with_generated_code(convolution_op, gemm_cases, mr); |
1612 | | #endif // XNN_PLATFORM_JIT |
1613 | 0 | struct xnn_hmp_gemm_ukernel gemm_ukernel = gemm_cases[mr - 1]; |
1614 | |
1615 | 0 | convolution_op->context.gemm = (struct gemm_context) { |
1616 | 0 | .k_scaled = group_input_channels << log2_input_element_size, |
1617 | 0 | .a_stride = convolution_op->input_pixel_stride << log2_input_element_size, |
1618 | 0 | .ga_stride = group_input_channels << log2_input_element_size, |
1619 | 0 | .packed_w = packed_weights(convolution_op), |
1620 | 0 | .w_stride = w_stride, |
1621 | 0 | .gw_stride = w_stride * round_up(group_output_channels, nr), |
1622 | 0 | .cm_stride = convolution_op->output_pixel_stride << log2_output_element_size, |
1623 | 0 | .cn_stride = nr << log2_output_element_size, |
1624 | 0 | .gc_stride = group_output_channels << log2_output_element_size, |
1625 | 0 | .log2_csize = log2_output_element_size, |
1626 | 0 | .ukernel = gemm_ukernel, |
1627 | 0 | }; |
1628 | 0 | memcpy(&convolution_op->context.gemm.params, &convolution_op->params, sizeof(convolution_op->context.gemm.params)); |
1629 | 0 | if (convolution_op->num_post_operation_params == 0) { |
1630 | 0 | convolution_op->context.gemm.fused_params = &convolution_op->context.gemm.params; |
1631 | 0 | } else { |
1632 | 0 | convolution_op->context.gemm.fused_params = convolution_op->post_operation_params; |
1633 | 0 | } |
1634 | |
1635 | | #if XNN_TEST_MODE |
1636 | | const size_t nc = nr; |
1637 | | #else |
1638 | 0 | size_t nc = group_output_channels; |
1639 | 0 | if (num_threads > 1) { |
1640 | 0 | const size_t num_other_tiles = groups * divide_round_up(batch_output_size, mr); |
1641 | 0 | const size_t target_tiles_per_thread = 5; |
1642 | 0 | const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread); |
1643 | 0 | if (max_nc < nc) { |
1644 | 0 | nc = min(nc, divide_round_up(nc, max_nc * nr) * nr); |
1645 | 0 | } |
1646 | 0 | } |
1647 | 0 | #endif |
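 | | // Illustrative arithmetic for the nc split above: nc = 256, nr = 8, mr = 4,
 | | // groups = 1, batch_output_size = 4 and num_threads = 4 give num_other_tiles = 1 and
 | | // max_nc = divide_round_up(256, 20) = 13 < 256, so nc becomes
 | | // divide_round_up(256, 13 * 8) * 8 = 24, i.e. the N dimension is split until every
 | | // thread can expect roughly target_tiles_per_thread tiles.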
1648 | 0 | if (groups == 1) { |
1649 | | #if XNN_MAX_UARCH_TYPES > 1 |
1650 | | if (xnn_is_hmp_gemm_ukernel(gemm_ukernel)) { |
1651 | | convolution_op->compute[0].type = xnn_parallelization_type_2d_tile_2d_with_uarch; |
1652 | | convolution_op->compute[0].task_2d_tile_2d_with_id = (pthreadpool_task_2d_tile_2d_with_id_t) xnn_compute_hmp_gemm; |
1653 | | } else { |
1654 | | convolution_op->compute[0].type = xnn_parallelization_type_2d_tile_2d; |
1655 | | convolution_op->compute[0].task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_gemm; |
1656 | | } |
1657 | | #else |
1658 | 0 | convolution_op->compute[0].type = xnn_parallelization_type_2d_tile_2d; |
1659 | 0 | convolution_op->compute[0].task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_gemm; |
1660 | 0 | #endif |
1661 | 0 | convolution_op->compute[0].range[0] = batch_output_size; |
1662 | 0 | convolution_op->compute[0].range[1] = group_output_channels; |
1663 | 0 | convolution_op->compute[0].tile[0] = mr; |
1664 | 0 | convolution_op->compute[0].tile[1] = nc; |
1665 | 0 | } else { |
1666 | | #if XNN_MAX_UARCH_TYPES > 1 |
1667 | | if (xnn_is_hmp_gemm_ukernel(gemm_ukernel)) { |
1668 | | convolution_op->compute[0].type = xnn_parallelization_type_3d_tile_2d_with_uarch; |
1669 | | convolution_op->compute[0].task_3d_tile_2d_with_id = (pthreadpool_task_3d_tile_2d_with_id_t) xnn_compute_hmp_grouped_gemm; |
1670 | | } else { |
1671 | | convolution_op->compute[0].type = xnn_parallelization_type_3d_tile_2d; |
1672 | | convolution_op->compute[0].task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_gemm; |
1673 | | } |
1674 | | #else |
1675 | 0 | convolution_op->compute[0].type = xnn_parallelization_type_3d_tile_2d; |
1676 | 0 | convolution_op->compute[0].task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_gemm; |
1677 | 0 | #endif |
1678 | 0 | convolution_op->compute[0].range[0] = groups; |
1679 | 0 | convolution_op->compute[0].range[1] = batch_output_size; |
1680 | 0 | convolution_op->compute[0].range[2] = group_output_channels; |
1681 | 0 | convolution_op->compute[0].tile[0] = mr; |
1682 | 0 | convolution_op->compute[0].tile[1] = nc; |
1683 | 0 | } |
1684 | 0 | convolution_op->state = xnn_run_state_needs_setup; |
1685 | |
1686 | 0 | *workspace_size = 0; |
1687 | 0 | *workspace_alignment = 1; |
1688 | |
1689 | 0 | return xnn_status_success; |
1690 | 0 | } |
1691 | | |
1692 | | static enum xnn_status reshape_igemm( |
1693 | | xnn_operator_t convolution_op, |
1694 | | uint32_t log2_input_element_size, |
1695 | | uint32_t log2_filter_element_size, |
1696 | | uint32_t extra_weights_elements_size, |
1697 | | uint32_t log2_output_element_size, |
1698 | | size_t* workspace_size, |
1699 | | size_t* workspace_alignment, |
1700 | | size_t num_threads) |
1701 | 0 | { |
1702 | 0 | const size_t batch_size = convolution_op->batch_size; |
1703 | 0 | const size_t input_height = convolution_op->input_height; |
1704 | 0 | const size_t input_width = convolution_op->input_width; |
1705 | 0 | const size_t groups = convolution_op->groups; |
1706 | 0 | const size_t kernel_height = convolution_op->kernel_height; |
1707 | 0 | const size_t kernel_width = convolution_op->kernel_width; |
1708 | 0 | const size_t kernel_size = kernel_height * kernel_width; |
1709 | 0 | const size_t output_height = convolution_op->output_height; |
1710 | 0 | const size_t output_width = convolution_op->output_width; |
1711 | 0 | const size_t output_size = output_height * output_width; |
1712 | |
1713 | 0 | uint32_t mr = convolution_op->ukernel.igemm.mr; |
1714 | 0 | const uint32_t nr = convolution_op->ukernel.igemm.nr; |
1715 | 0 | struct xnn_hmp_igemm_ukernel* igemm_cases = convolution_op->ukernel.igemm.igemm_cases; |
1716 | |
1717 | 0 | #if XNN_ENABLE_GEMM_M_SPECIALIZATION |
1718 | 0 | mr = xnn_get_heuristic_mr_igemm(output_size, mr, nr, igemm_cases, convolution_op->code_cache != NULL); |
1719 | | #else |
1720 | | if (output_size == 1 && igemm_cases[0].function[XNN_UARCH_DEFAULT] != NULL) { |
1721 | | mr = 1; |
1722 | | } |
1723 | | #endif |
1724 | |
1725 | | #if XNN_PLATFORM_JIT |
1726 | | xnn_overwrite_igemm_cases_with_generated_code(convolution_op, igemm_cases, mr); |
1727 | | #endif // XNN_PLATFORM_JIT |
1728 | 0 | struct xnn_hmp_igemm_ukernel igemm_ukernel = igemm_cases[mr - 1]; |
1729 | |
1730 | 0 | const size_t tiled_output_size = round_up(output_size, mr); |
1731 | 0 | const size_t indirection_buffer_size = sizeof(void*) * kernel_size * tiled_output_size; |
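 | | // Illustrative size: a 3x3 kernel (kernel_size = 9), a 112x112 output
 | | // (output_size = 12544) and mr = 4 give tiled_output_size = 12544, so with 8-byte
 | | // pointers the indirection buffer is 8 * 9 * 12544 = 903168 bytes (~882 KiB).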
1732 | 0 | size_t igemm_compute_index; |
1733 | 0 | if (convolution_op->flags & XNN_FLAG_TRANSIENT_INDIRECTION_BUFFER) { |
1734 | 0 | *workspace_size = indirection_buffer_size; |
1735 | 0 | *workspace_alignment = XNN_ALLOCATION_ALIGNMENT; |
1736 | 0 | igemm_compute_index = 1; |
1737 | |
1738 | 0 | convolution_op->context.conv2d_igemm_indirection_init = (struct conv2d_igemm_indirection_init_context) { |
1739 | 0 | .zero_buffer = convolution_op->zero_buffer, |
1740 | 0 | .input_pixel_stride = convolution_op->input_pixel_stride << log2_input_element_size, |
1741 | 0 | .input_height = input_height, |
1742 | 0 | .input_width = input_width, |
1743 | 0 | .output_height = output_height, |
1744 | 0 | .output_width = output_width, |
1745 | 0 | .kernel_height = kernel_height, |
1746 | 0 | .kernel_width = kernel_width, |
1747 | 0 | .stride_height = convolution_op->stride_height, |
1748 | 0 | .stride_width = convolution_op->stride_width, |
1749 | 0 | .dilation_height = convolution_op->dilation_height, |
1750 | 0 | .dilation_width = convolution_op->dilation_width, |
1751 | 0 | .input_padding_top = convolution_op->padding_top, |
1752 | 0 | .input_padding_left = convolution_op->padding_left, |
1753 | 0 | }; |
1754 | |
1755 | 0 | convolution_op->compute[0].type = xnn_parallelization_type_1d_tile_1d; |
1756 | 0 | convolution_op->compute[0].context_offset = offsetof(struct xnn_operator, context.conv2d_igemm_indirection_init) - offsetof(struct xnn_operator, context); |
1757 | 0 | convolution_op->compute[0].task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_conv2d_igemm_indirection; |
1758 | 0 | convolution_op->compute[0].range[0] = tiled_output_size; |
1759 | 0 | convolution_op->compute[0].tile[0] = mr; |
1760 | 0 | } else { |
1761 | 0 | *workspace_size = 0; |
1762 | 0 | *workspace_alignment = 1; |
1763 | 0 | igemm_compute_index = 0; |
1764 | |
1765 | 0 | if (input_size_changed(convolution_op)) { |
1766 | 0 | const void** indirection_buffer = |
1767 | 0 | (const void**) xnn_reallocate_memory((void*) convolution_op->indirection_buffer, indirection_buffer_size); |
1768 | 0 | if (indirection_buffer == NULL) { |
1769 | 0 | xnn_log_error( |
1770 | 0 | "failed to allocate %zu bytes for %s operator indirection buffer", |
1771 | 0 | indirection_buffer_size, xnn_operator_type_to_string(convolution_op->type)); |
1772 | 0 | return xnn_status_out_of_memory; |
1773 | 0 | } |
1774 | 0 | convolution_op->indirection_buffer = indirection_buffer; |
1775 | 0 | xnn_log_debug("allocated %zu bytes for indirection buffer in %s operator", |
1776 | 0 | indirection_buffer_size, xnn_operator_type_to_string(convolution_op->type)); |
1777 | | |
1778 | | // Set a dummy input first; the actual input offset is calculated in setup, once we have the input pointer.
1779 | | // This offset must be properly aligned, because inputs and input offsets need to be aligned.
1780 | 0 | convolution_op->input = (void*) ((uintptr_t) convolution_op->zero_buffer + XNN_ALLOCATION_ALIGNMENT); |
1781 | 0 | convolution_op->last_input = convolution_op->input; |
1782 | 0 | convolution_op->last_input_height = convolution_op->input_height; |
1783 | 0 | convolution_op->last_input_width = convolution_op->input_width; |
1784 | |
1785 | 0 | xnn_indirection_init_conv2d( |
1786 | 0 | /*output_tile_size=*/mr, |
1787 | 0 | /*output_start=*/0, |
1788 | 0 | /*output_end=*/tiled_output_size, |
1789 | 0 | convolution_op->indirection_buffer, |
1790 | 0 | convolution_op->input, |
1791 | 0 | convolution_op->zero_buffer, |
1792 | 0 | convolution_op->input_pixel_stride << log2_input_element_size, |
1793 | 0 | convolution_op->input_height, convolution_op->input_width, |
1794 | 0 | convolution_op->output_height, convolution_op->output_width, |
1795 | 0 | convolution_op->kernel_height, convolution_op->kernel_width, |
1796 | 0 | convolution_op->stride_height, convolution_op->stride_width, |
1797 | 0 | convolution_op->dilation_height, convolution_op->dilation_width, |
1798 | 0 | convolution_op->padding_top, convolution_op->padding_left); |
1799 | 0 | } |
1800 | 0 | } |
1801 | | |
1802 | | |
1803 | 0 | const size_t group_input_channels = convolution_op->group_input_channels; |
1804 | 0 | const size_t w_stride = extra_weights_elements_size + |
1805 | 0 | (round_up_po2(group_input_channels, convolution_op->ukernel.igemm.kr * convolution_op->ukernel.igemm.sr) * kernel_size << log2_filter_element_size); |
1806 | 0 | const size_t group_output_channels = convolution_op->group_output_channels; |
1807 | 0 | convolution_op->context.igemm = (struct igemm_context) { |
1808 | 0 | .ks = kernel_size, |
1809 | 0 | .ks_scaled = kernel_size * mr * sizeof(void*), |
1810 | 0 | .kc = group_input_channels << log2_input_element_size, |
1811 | 0 | .w_stride = w_stride, |
1812 | 0 | .indirect_a = convolution_op->indirection_buffer, |
1813 | 0 | .zero = convolution_op->zero_buffer, |
1814 | 0 | .packed_w = packed_weights(convolution_op), |
1815 | 0 | .cm_stride = convolution_op->output_pixel_stride << log2_output_element_size, |
1816 | 0 | .cn_stride = nr << log2_output_element_size, |
1817 | 0 | .ga_stride = group_input_channels << log2_input_element_size, |
1818 | 0 | .gw_stride = w_stride * round_up(group_output_channels, nr), |
1819 | 0 | .gc_stride = group_output_channels << log2_output_element_size, |
1820 | 0 | .ba_stride = input_height * input_width * convolution_op->input_pixel_stride << log2_input_element_size, |
1821 | 0 | .bc_stride = output_size * convolution_op->output_pixel_stride << log2_output_element_size, |
1822 | 0 | .log2_csize = log2_output_element_size, |
1823 | 0 | .ukernel = igemm_ukernel, |
1824 | 0 | }; |
1825 | 0 | memcpy(&convolution_op->context.igemm.params, &convolution_op->params, sizeof(convolution_op->context.igemm.params)); |
1826 | |
1827 | | #if XNN_TEST_MODE |
1828 | | const size_t nc = nr; |
1829 | | #else |
1830 | 0 | size_t nc = group_output_channels; |
1831 | 0 | if (num_threads > 1) { |
1832 | 0 | const size_t num_other_tiles = groups * batch_size * divide_round_up(output_size, mr); |
1833 | 0 | const size_t target_tiles_per_thread = 5; |
1834 | 0 | const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread); |
1835 | 0 | if (max_nc < nc) { |
1836 | 0 | nc = min(nc, divide_round_up(nc, max_nc * nr) * nr); |
1837 | 0 | } |
1838 | 0 | } |
1839 | 0 | #endif |
1840 | 0 | if (groups == 1) { |
1841 | | #if XNN_MAX_UARCH_TYPES > 1 |
1842 | | if (xnn_is_hmp_igemm_ukernel(igemm_ukernel)) { |
1843 | | if (batch_size > 1) { |
1844 | | convolution_op->compute[igemm_compute_index].type = xnn_parallelization_type_3d_tile_2d_with_uarch; |
1845 | | convolution_op->compute[igemm_compute_index].task_3d_tile_2d_with_id = (pthreadpool_task_3d_tile_2d_with_id_t) xnn_compute_batch_hmp_igemm; |
1846 | | } else { |
1847 | | convolution_op->compute[igemm_compute_index].type = xnn_parallelization_type_2d_tile_2d_with_uarch; |
1848 | | convolution_op->compute[igemm_compute_index].task_2d_tile_2d_with_id = (pthreadpool_task_2d_tile_2d_with_id_t) xnn_compute_hmp_igemm; |
1849 | | } |
1850 | | } else { |
1851 | | if (batch_size > 1) { |
1852 | | convolution_op->compute[igemm_compute_index].type = xnn_parallelization_type_3d_tile_2d; |
1853 | | convolution_op->compute[igemm_compute_index].task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_batch_igemm; |
1854 | | } else { |
1855 | | convolution_op->compute[igemm_compute_index].type = xnn_parallelization_type_2d_tile_2d; |
1856 | | convolution_op->compute[igemm_compute_index].task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_igemm; |
1857 | | } |
1858 | | } |
1859 | | #else |
1860 | 0 | if (batch_size > 1) { |
1861 | 0 | convolution_op->compute[igemm_compute_index].type = xnn_parallelization_type_3d_tile_2d; |
1862 | 0 | convolution_op->compute[igemm_compute_index].task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_batch_igemm; |
1863 | 0 | } else { |
1864 | 0 | convolution_op->compute[igemm_compute_index].type = xnn_parallelization_type_2d_tile_2d; |
1865 | 0 | convolution_op->compute[igemm_compute_index].task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_igemm; |
1866 | 0 | } |
1867 | 0 | #endif |
1868 | 0 | if (batch_size > 1) { |
1869 | 0 | convolution_op->compute[igemm_compute_index].range[0] = batch_size; |
1870 | 0 | convolution_op->compute[igemm_compute_index].range[1] = output_size; |
1871 | 0 | convolution_op->compute[igemm_compute_index].range[2] = group_output_channels; |
1872 | 0 | } else { |
1873 | 0 | convolution_op->compute[igemm_compute_index].range[0] = output_size; |
1874 | 0 | convolution_op->compute[igemm_compute_index].range[1] = group_output_channels; |
1875 | 0 | } |
1876 | 0 | convolution_op->compute[igemm_compute_index].tile[0] = mr; |
1877 | 0 | convolution_op->compute[igemm_compute_index].tile[1] = nc; |
1878 | 0 | } else { |
1879 | | #if XNN_MAX_UARCH_TYPES > 1 |
1880 | | if (xnn_is_hmp_igemm_ukernel(igemm_ukernel)) { |
1881 | | if (batch_size > 1) { |
1882 | | convolution_op->compute[igemm_compute_index].type = xnn_parallelization_type_4d_tile_2d_with_uarch; |
1883 | | convolution_op->compute[igemm_compute_index].task_4d_tile_2d_with_id = (pthreadpool_task_4d_tile_2d_with_id_t) xnn_compute_hmp_grouped_batch_igemm; |
1884 | | } else { |
1885 | | convolution_op->compute[igemm_compute_index].type = xnn_parallelization_type_3d_tile_2d_with_uarch; |
1886 | | convolution_op->compute[igemm_compute_index].task_3d_tile_2d_with_id = (pthreadpool_task_3d_tile_2d_with_id_t) xnn_compute_hmp_grouped_igemm; |
1887 | | } |
1888 | | } else { |
1889 | | if (batch_size > 1) { |
1890 | | convolution_op->compute[igemm_compute_index].type = xnn_parallelization_type_4d_tile_2d; |
1891 | | convolution_op->compute[igemm_compute_index].task_4d_tile_2d = (pthreadpool_task_4d_tile_2d_t) xnn_compute_grouped_batch_igemm; |
1892 | | } else { |
1893 | | convolution_op->compute[igemm_compute_index].type = xnn_parallelization_type_3d_tile_2d; |
1894 | | convolution_op->compute[igemm_compute_index].task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_igemm; |
1895 | | } |
1896 | | } |
1897 | | #else |
1898 | 0 | if (batch_size > 1) { |
1899 | 0 | convolution_op->compute[igemm_compute_index].type = xnn_parallelization_type_4d_tile_2d; |
1900 | 0 | convolution_op->compute[igemm_compute_index].task_4d_tile_2d = (pthreadpool_task_4d_tile_2d_t) xnn_compute_grouped_batch_igemm; |
1901 | 0 | } else { |
1902 | 0 | convolution_op->compute[igemm_compute_index].type = xnn_parallelization_type_3d_tile_2d; |
1903 | 0 | convolution_op->compute[igemm_compute_index].task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_igemm; |
1904 | 0 | } |
1905 | 0 | #endif |
1906 | 0 | if (batch_size > 1) { |
1907 | 0 | convolution_op->compute[igemm_compute_index].range[0] = batch_size; |
1908 | 0 | convolution_op->compute[igemm_compute_index].range[1] = groups; |
1909 | 0 | convolution_op->compute[igemm_compute_index].range[2] = output_size; |
1910 | 0 | convolution_op->compute[igemm_compute_index].range[3] = group_output_channels; |
1911 | 0 | } else { |
1912 | 0 | convolution_op->compute[igemm_compute_index].range[0] = groups; |
1913 | 0 | convolution_op->compute[igemm_compute_index].range[1] = output_size; |
1914 | 0 | convolution_op->compute[igemm_compute_index].range[2] = group_output_channels; |
1915 | 0 | } |
1916 | 0 | convolution_op->compute[igemm_compute_index].tile[0] = mr; |
1917 | 0 | convolution_op->compute[igemm_compute_index].tile[1] = nc; |
1918 | 0 | } |
1919 | 0 | convolution_op->state = xnn_run_state_needs_setup; |
1920 | |
1921 | 0 | return xnn_status_success; |
1922 | 0 | } |
1923 | | |
1924 | | static enum xnn_status reshape_dwconv( |
1925 | | xnn_operator_t convolution_op, |
1926 | | uint32_t log2_input_element_size, |
1927 | | uint32_t log2_accumulator_element_size, |
1928 | | uint32_t log2_output_element_size, |
1929 | | size_t* workspace_size, |
1930 | | size_t* workspace_alignment, |
1931 | | size_t num_threads) |
1932 | 0 | { |
1933 | 0 | const size_t input_height = convolution_op->input_height; |
1934 | 0 | const size_t input_width = convolution_op->input_width; |
1935 | 0 | const size_t kernel_height = convolution_op->kernel_height; |
1936 | 0 | const size_t kernel_width = convolution_op->kernel_width; |
1937 | 0 | const size_t kernel_size = kernel_height * kernel_width; |
1938 | 0 | const size_t output_height = convolution_op->output_height; |
1939 | 0 | const size_t output_width = convolution_op->output_width; |
1940 | 0 | const size_t step_width = convolution_op->dilation_width == 1 ? |
1941 | 0 | min(convolution_op->stride_width, kernel_width) : kernel_width; |
1942 | 0 | const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height; |
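 | | // Illustrative arithmetic: a 3x3 kernel, stride 2, dilation 1 and output_width = 112
 | | // give step_width = min(2, 3) = 2 and step_height = 9 + 111 * 2 * 3 = 675
 | | // indirection entries per output row.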
1943 | 0 | const struct xnn_ukernel_dwconv dwconv_ukernel = convolution_op->ukernel.dwconv; |
1944 | 0 | const bool is_unipass = dwconv_ukernel.last_tile == 0; |
1945 | 0 | const size_t tile_size = dwconv_ukernel.tile_size; |
1946 | 0 | size_t total_workspace_size = 0; |
1947 | | |
1948 | | // The micro-kernel will read (tile_size - kernel_size) elements past the end of the indirection buffer.
1949 | 0 | const size_t indirection_buffer_size = |
1950 | 0 | round_up_po2(sizeof(void*) * (tile_size - kernel_size + output_height * step_height), XNN_ALLOCATION_ALIGNMENT); |
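 | | // E.g. (illustrative) if tile_size = 25 and kernel_size = 9, 16 extra trailing
 | | // entries are reserved so the micro-kernel's over-read stays inside the buffer.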
1951 | |
1952 | 0 | size_t dwconv_compute_index; |
1953 | 0 | const bool is_transient_indirection_buffer = convolution_op->flags & XNN_FLAG_TRANSIENT_INDIRECTION_BUFFER; |
1954 | 0 | if (is_transient_indirection_buffer) { |
1955 | 0 | total_workspace_size += indirection_buffer_size; |
1956 | 0 | dwconv_compute_index = 1; |
1957 | |
1958 | 0 | convolution_op->context.dwconv_indirection_init = (struct dwconv_indirection_init_context) { |
1959 | 0 | .zero_buffer = convolution_op->zero_buffer, |
1960 | 0 | .input_pixel_stride = convolution_op->input_pixel_stride << log2_input_element_size, |
1961 | 0 | .input_height = input_height, |
1962 | 0 | .input_width = input_width, |
1963 | 0 | .output_height = output_height, |
1964 | 0 | .output_width = output_width, |
1965 | 0 | .kernel_height = kernel_height, |
1966 | 0 | .kernel_width = kernel_width, |
1967 | 0 | .stride_height = convolution_op->stride_height, |
1968 | 0 | .stride_width = convolution_op->stride_width, |
1969 | 0 | .dilation_height = convolution_op->dilation_height, |
1970 | 0 | .dilation_width = convolution_op->dilation_width, |
1971 | 0 | .input_padding_top = convolution_op->padding_top, |
1972 | 0 | .input_padding_left = convolution_op->padding_left, |
1973 | 0 | .step_height = step_height, |
1974 | 0 | .step_width = step_width, |
1975 | 0 | .tile_size = tile_size, |
1976 | 0 | }; |
1977 | |
1978 | 0 | convolution_op->compute[0].type = xnn_parallelization_type_1d_tile_1d; |
1979 | 0 | convolution_op->compute[0].context_offset = offsetof(struct xnn_operator, context.dwconv_indirection_init) - offsetof(struct xnn_operator, context); |
1980 | 0 | convolution_op->compute[0].task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_dwconv_indirection; |
1981 | 0 | convolution_op->compute[0].range[0] = output_height; |
1982 | | #if XNN_TEST_MODE |
1983 | | convolution_op->compute[0].tile[0] = output_height; |
1984 | | #else |
1985 | 0 | if (num_threads > 1) { |
1986 | 0 | const size_t target_tiles_per_thread = 5; |
1987 | 0 | convolution_op->compute[0].tile[0] = divide_round_up(output_height, num_threads * target_tiles_per_thread); |
1988 | 0 | } else { |
1989 | 0 | convolution_op->compute[0].tile[0] = output_height; |
1990 | 0 | } |
1991 | 0 | #endif |
1992 | 0 | } else { |
1993 | 0 | dwconv_compute_index = 0; |
1994 | |
1995 | 0 | if (input_size_changed(convolution_op)) { |
1996 | 0 | const void** indirection_buffer = |
1997 | 0 | (const void**) xnn_reallocate_memory(convolution_op->indirection_buffer, indirection_buffer_size); |
1998 | 0 | if (indirection_buffer == NULL) { |
1999 | 0 | xnn_log_error("failed to allocate %zu bytes for %s operator indirection buffer", |
2000 | 0 | indirection_buffer_size, xnn_operator_type_to_string(convolution_op->type)); |
2001 | 0 | return xnn_status_out_of_memory; |
2002 | 0 | } |
2003 | 0 | convolution_op->indirection_buffer = indirection_buffer; |
2004 | 0 | xnn_log_debug("allocated %zu bytes for indirection buffer in %s operator", |
2005 | 0 | indirection_buffer_size, xnn_operator_type_to_string(convolution_op->type)); |
2006 | |
2007 | | #if XNN_TEST_MODE |
2008 | | memset(convolution_op->indirection_buffer, 0, indirection_buffer_size); |
2009 | | #endif |
2010 | | |
2011 | | // Set a dummy input first; the actual input offset is calculated in setup, once we have the input pointer.
2012 | | // This offset must be properly aligned, because inputs and input offsets need to be aligned.
2013 | 0 | convolution_op->input = (void* ) ((uintptr_t) convolution_op->zero_buffer + XNN_ALLOCATION_ALIGNMENT); |
2014 | 0 | convolution_op->last_input = convolution_op->input; |
2015 | 0 | convolution_op->last_input_height = convolution_op->input_height; |
2016 | 0 | convolution_op->last_input_width = convolution_op->input_width; |
2017 | |
2018 | 0 | xnn_indirection_init_dwconv2d( |
2019 | 0 | /*output_y_start=*/0, /*output_y_end=*/convolution_op->output_height, |
2020 | 0 | convolution_op->indirection_buffer, |
2021 | 0 | convolution_op->input, |
2022 | 0 | convolution_op->input_pixel_stride << log2_input_element_size, |
2023 | 0 | convolution_op->zero_buffer, |
2024 | 0 | convolution_op->input_height, convolution_op->input_width, |
2025 | 0 | convolution_op->output_height, convolution_op->output_width, |
2026 | 0 | convolution_op->kernel_height, convolution_op->kernel_width, |
2027 | 0 | convolution_op->stride_height, convolution_op->stride_width, |
2028 | 0 | convolution_op->dilation_height, convolution_op->dilation_width, |
2029 | 0 | convolution_op->padding_top, convolution_op->padding_left, |
2030 | 0 | step_height, step_width, tile_size); |
2031 | 0 | } |
2032 | 0 | } |
2033 | | |
2034 | 0 | const size_t groups = convolution_op->groups; |
2035 | 0 | int32_t extra_input_advanced = is_unipass ? 0 : tile_size - convolution_op->ukernel.dwconv.last_tile; |
2036 | 0 | convolution_op->context.dwconv = (struct dwconv_context) { |
2037 | 0 | .kernel_size = kernel_size, |
2038 | 0 | .indirect_input = convolution_op->indirection_buffer, |
2039 | 0 | .indirect_input_width_stride = (kernel_height * step_width - extra_input_advanced) * sizeof(void*), |
2040 | 0 | .indirect_input_height_stride = step_height * sizeof(void*), |
2041 | 0 | .input_batch_stride = (input_height * input_width * convolution_op->input_pixel_stride) << log2_input_element_size, |
2042 | 0 | .packed_weights = packed_weights(convolution_op), |
2043 | 0 | .output_batch_stride = (output_height * output_width * convolution_op->output_pixel_stride) << log2_output_element_size, |
2044 | 0 | .output_height_stride = (output_width * convolution_op->output_pixel_stride) << log2_output_element_size, |
2045 | 0 | .output_height = output_height, |
2046 | 0 | .output_width = output_width, |
2047 | 0 | .groups = groups, |
2048 | 0 | .zero = convolution_op->zero_buffer, |
2049 | 0 | .output_increment = (convolution_op->output_pixel_stride - groups) << log2_output_element_size, |
2050 | 0 | }; |
2051 | 0 | memcpy(&convolution_op->context.dwconv.params, &convolution_op->params, sizeof(convolution_op->context.dwconv.params)); |
2052 | |
2053 | 0 | const size_t batch_size = convolution_op->batch_size; |
2054 | 0 | convolution_op->compute[dwconv_compute_index].range[0] = batch_size; |
2055 | 0 | convolution_op->compute[dwconv_compute_index].range[1] = output_height; |
2056 | 0 | convolution_op->state = xnn_run_state_needs_setup; |
2057 | |
2058 | 0 | if (is_unipass) { |
2059 | 0 | convolution_op->compute[dwconv_compute_index].type = xnn_parallelization_type_2d; |
2060 | 0 | convolution_op->compute[dwconv_compute_index].task_2d = (pthreadpool_task_2d_t) xnn_compute_dwconv_unipass; |
2061 | 0 | convolution_op->context.dwconv.unipass_ukernel = convolution_op->ukernel.dwconv.unipass_fn; |
2062 | 0 | } else { |
2063 | 0 | const size_t buffer_size = |
2064 | 0 | round_up_po2( |
2065 | 0 | (groups + (XNN_MULTIPASS_EXTRA_BYTES >> log2_input_element_size)) << log2_accumulator_element_size, |
2066 | 0 | XNN_ALLOCATION_ALIGNMENT); |
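 | | // Illustrative size, assuming XNN_MULTIPASS_EXTRA_BYTES = 16 for the arithmetic: an
 | | // f32 dwconv (both log2 sizes = 2) with groups = 64 needs (64 + 4) * 4 = 272 bytes,
 | | // rounded up to XNN_ALLOCATION_ALIGNMENT; one such accumulator buffer is needed per
 | | // concurrently processed output row.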
2067 | 0 | convolution_op->context.dwconv.buffer_size = buffer_size; |
2068 | 0 | if (is_transient_indirection_buffer) { |
2069 | 0 | convolution_op->context.dwconv.multipass_buffer_offset = indirection_buffer_size; |
2070 | 0 | } |
2071 | 0 | const bool use_threads_workspace_size = num_threads < batch_size * output_height; |
2072 | 0 | if (use_threads_workspace_size) { |
2073 | 0 | convolution_op->compute[dwconv_compute_index].type = xnn_parallelization_type_2d_with_thread; |
2074 | 0 | convolution_op->compute[dwconv_compute_index].task_2d_with_thread = |
2075 | 0 | (pthreadpool_task_2d_with_thread_t) xnn_compute_dwconv_multipass_with_thread; |
2076 | 0 | total_workspace_size += num_threads * buffer_size; |
2077 | 0 | } else { |
2078 | 0 | convolution_op->compute[dwconv_compute_index].type = xnn_parallelization_type_2d; |
2079 | 0 | convolution_op->compute[dwconv_compute_index].task_2d = |
2080 | 0 | (pthreadpool_task_2d_t) xnn_compute_dwconv_multipass; |
2081 | 0 | total_workspace_size += batch_size * output_height * buffer_size; |
2082 | 0 | } |
2083 | |
2084 | 0 | convolution_op->context.dwconv.multipass_ukernel = convolution_op->ukernel.dwconv.multipass_fn; |
2085 | 0 | } |
2086 | |
2087 | 0 | *workspace_size = total_workspace_size; |
2088 | 0 | *workspace_alignment = total_workspace_size == 0 ? 1 : XNN_ALLOCATION_ALIGNMENT; |
2089 | |
2090 | 0 | return xnn_status_success; |
2091 | 0 | } |
2092 | | |
2093 | | static enum xnn_status reshape_vmulcaddc( |
2094 | | xnn_operator_t convolution_op, |
2095 | | uint32_t log2_input_element_size, |
2096 | | uint32_t log2_output_element_size, |
2097 | | size_t* workspace_size, |
2098 | | size_t* workspace_alignment, |
2099 | | size_t num_threads) |
2100 | 0 | { |
2101 | 0 | const size_t batch_output_size = convolution_op->batch_size * convolution_op->output_height * convolution_op->output_width; |
2102 | |
2103 | 0 | convolution_op->context.vmulcaddc = (struct vmulcaddc_context) { |
2104 | 0 | .n = convolution_op->groups << log2_input_element_size, |
2105 | 0 | .x_stride = convolution_op->input_pixel_stride << log2_input_element_size, |
2106 | 0 | .w = packed_weights(convolution_op), |
2107 | 0 | .y_stride = convolution_op->output_pixel_stride << log2_output_element_size, |
2108 | 0 | .ukernel = convolution_op->ukernel.vmulcaddc.function, |
2109 | 0 | }; |
2110 | 0 | memcpy(&convolution_op->context.vmulcaddc.params, &convolution_op->params, |
2111 | 0 | sizeof(convolution_op->context.vmulcaddc.params)); |
2112 | |
2113 | | #if XNN_TEST_MODE |
2114 | | const size_t mc = convolution_op->ukernel.vmulcaddc.mr; |
2115 | | #else |
2116 | 0 | size_t mc = batch_output_size; |
2117 | 0 | if (num_threads > 1) { |
2118 | 0 | const size_t target_tiles_per_thread = 5; |
2119 | 0 | const size_t max_mc = divide_round_up(batch_output_size, num_threads * target_tiles_per_thread); |
2120 | 0 | if (max_mc < mc) { |
2121 | 0 | const uint32_t mr = convolution_op->ukernel.vmulcaddc.mr; |
2122 | 0 | mc = min(mc, divide_round_up(mc, max_mc * mr) * mr); |
2123 | 0 | } |
2124 | 0 | } |
2125 | 0 | #endif |
2126 | 0 | convolution_op->compute[0].type = xnn_parallelization_type_1d_tile_1d; |
2127 | 0 | convolution_op->compute[0].task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_vmulcaddc; |
2128 | 0 | convolution_op->compute[0].range[0] = batch_output_size; |
2129 | 0 | convolution_op->compute[0].tile[0] = mc; |
2130 | 0 | convolution_op->state = xnn_run_state_needs_setup; |
2131 | |
2132 | 0 | *workspace_size = 0; |
2133 | 0 | *workspace_alignment = 1; |
2134 | |
2135 | 0 | return xnn_status_success; |
2136 | 0 | } |
2137 | | |
2138 | | static enum xnn_status reshape_convolution2d_nhwc( |
2139 | | xnn_operator_t convolution_op, |
2140 | | enum xnn_operator_type expected_operator_type, |
2141 | | size_t batch_size, |
2142 | | size_t input_height, |
2143 | | size_t input_width, |
2144 | | uint32_t log2_input_element_size, |
2145 | | uint32_t log2_filter_element_size, |
2146 | | uint32_t log2_accumulator_element_size, |
2147 | | uint32_t extra_weights_elements_size, |
2148 | | uint32_t log2_output_element_size, |
2149 | | size_t* workspace_size, |
2150 | | size_t* workspace_alignment, |
2151 | | size_t* output_height_out, |
2152 | | size_t* output_width_out, |
2153 | | pthreadpool_t threadpool) |
2154 | 0 | { |
2155 | 0 | if (convolution_op->type != expected_operator_type) { |
2156 | 0 | xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)", |
2157 | 0 | xnn_operator_type_to_string(expected_operator_type), |
2158 | 0 | xnn_operator_type_to_string(convolution_op->type)); |
2159 | 0 | return xnn_status_invalid_parameter; |
2160 | 0 | } |
2161 | 0 | convolution_op->state = xnn_run_state_invalid; |
2162 | |
2163 | 0 | if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) { |
2164 | 0 | xnn_log_error("failed to setup %s operator: XNNPACK is not initialized", |
2165 | 0 | xnn_operator_type_to_string(convolution_op->type)); |
2166 | 0 | return xnn_status_uninitialized; |
2167 | 0 | } |
2168 | | |
2169 | 0 | if (input_width == 0 || input_height == 0) { |
2170 | 0 | xnn_log_error( |
2171 | 0 | "failed to setup %s operator with %zux%zu input: input dimensions must be non-zero", |
2172 | 0 | xnn_operator_type_to_string(convolution_op->type), input_width, input_height); |
2173 | 0 | return xnn_status_invalid_parameter; |
2174 | 0 | } |
2175 | | |
2176 | 0 | if (batch_size == 0) { |
2177 | 0 | convolution_op->state = xnn_run_state_skip; |
2178 | 0 | return xnn_status_success; |
2179 | 0 | } |
2180 | | |
2181 | 0 | if (convolution_op->weights_cache != NULL && !xnn_weights_cache_is_finalized(convolution_op->weights_cache)) { |
2182 | 0 | xnn_log_error("failed to setup %s operator: weights cache is not finalized", |
2183 | 0 | xnn_operator_type_to_string(convolution_op->type)); |
2184 | 0 | return xnn_status_invalid_state; |
2185 | 0 | } |
2186 | | |
2187 | 0 | convolution_op->batch_size = batch_size; |
2188 | 0 | convolution_op->input_height = input_height; |
2189 | 0 | convolution_op->input_width = input_width; |
2190 | |
2191 | 0 | if (convolution_op->flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) { |
2192 | 0 | convolution_op->output_height = compute_output_dimension_with_tf_same_padding( |
2193 | 0 | input_height, convolution_op->stride_height); |
2194 | 0 | convolution_op->output_width = compute_output_dimension_with_tf_same_padding( |
2195 | 0 | input_width, convolution_op->stride_width); |
2196 | |
2197 | 0 | const uint32_t effective_kernel_height = (convolution_op->kernel_height - 1) * convolution_op->dilation_height + 1; |
2198 | 0 | const uint32_t effective_kernel_width = (convolution_op->kernel_width - 1) * convolution_op->dilation_width + 1; |
2199 | 0 | const size_t total_padding_height = |
2200 | 0 | (convolution_op->output_height - 1) * convolution_op->stride_height + effective_kernel_height - input_height; |
2201 | 0 | const size_t total_padding_width = |
2202 | 0 | (convolution_op->output_width - 1) * convolution_op->stride_width + effective_kernel_width - input_width; |
2203 | 0 | convolution_op->padding_top = total_padding_height / 2; |
2204 | 0 | convolution_op->padding_left = total_padding_width / 2; |
2205 | 0 | convolution_op->padding_bottom = total_padding_height - convolution_op->padding_top; |
2206 | 0 | convolution_op->padding_right = total_padding_width - convolution_op->padding_left; |
2207 | 0 | } else { |
2208 | 0 | convolution_op->output_height = xnn_compute_convolution_output_dimension( |
2209 | 0 | convolution_op->padding_top + input_height + convolution_op->padding_bottom, |
2210 | 0 | convolution_op->kernel_height, |
2211 | 0 | convolution_op->dilation_height, |
2212 | 0 | convolution_op->stride_height); |
2213 | 0 | convolution_op->output_width = xnn_compute_convolution_output_dimension( |
2214 | 0 | convolution_op->padding_left + input_width + convolution_op->padding_right, |
2215 | 0 | convolution_op->kernel_width, |
2216 | 0 | convolution_op->dilation_width, |
2217 | 0 | convolution_op->stride_width); |
2218 | 0 | } |
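 | | // Worked example for the TF SAME branch (illustrative): input_height = 112, stride 2,
 | | // a 3x3 kernel and dilation 1 give output_height = ceil(112 / 2) = 56 and
 | | // total_padding_height = 55 * 2 + 3 - 112 = 1, split as padding_top = 0 and
 | | // padding_bottom = 1 (the odd row goes to the bottom, matching TensorFlow).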
2219 | |
2220 | 0 | if (output_height_out != NULL) { |
2221 | 0 | *output_height_out = convolution_op->output_height; |
2222 | 0 | } |
2223 | 0 | if (output_width_out != NULL) { |
2224 | 0 | *output_width_out = convolution_op->output_width; |
2225 | 0 | } |
2226 | |
2227 | 0 | const size_t num_threads = pthreadpool_get_threads_count(threadpool); |
2228 | 0 | switch (convolution_op->ukernel.type) { |
2229 | 0 | case xnn_microkernel_type_gemm: |
2230 | 0 | return reshape_gemm( |
2231 | 0 | convolution_op, |
2232 | 0 | log2_input_element_size, log2_filter_element_size, extra_weights_elements_size, log2_output_element_size, |
2233 | 0 | workspace_size, workspace_alignment, num_threads); |
2234 | 0 | case xnn_microkernel_type_igemm: |
2235 | 0 | return reshape_igemm( |
2236 | 0 | convolution_op, |
2237 | 0 | log2_input_element_size, log2_filter_element_size, extra_weights_elements_size, log2_output_element_size, |
2238 | 0 | workspace_size, workspace_alignment, num_threads); |
2239 | 0 | case xnn_microkernel_type_dwconv: |
2240 | 0 | return reshape_dwconv( |
2241 | 0 | convolution_op, |
2242 | 0 | log2_input_element_size, log2_accumulator_element_size, log2_output_element_size, |
2243 | 0 | workspace_size, workspace_alignment, num_threads); |
2244 | 0 | case xnn_microkernel_type_vmulcaddc: |
2245 | 0 | return reshape_vmulcaddc( |
2246 | 0 | convolution_op, |
2247 | 0 | log2_input_element_size, log2_output_element_size, |
2248 | 0 | workspace_size, workspace_alignment, num_threads); |
2249 | 0 | default: |
2250 | 0 | XNN_UNREACHABLE; |
2251 | 0 | } |
2252 | 0 | } |
2253 | | |
2254 | | enum xnn_status xnn_reshape_convolution2d_nhwc_qu8( |
2255 | | xnn_operator_t convolution_op, |
2256 | | size_t batch_size, |
2257 | | size_t input_height, |
2258 | | size_t input_width, |
2259 | | size_t* workspace_size, |
2260 | | size_t* workspace_alignment, |
2261 | | size_t* output_height_out, |
2262 | | size_t* output_width_out, |
2263 | | pthreadpool_t threadpool) |
2264 | 0 | { |
2265 | 0 | return reshape_convolution2d_nhwc( |
2266 | 0 | convolution_op, xnn_operator_type_convolution_nhwc_qu8, |
2267 | 0 | batch_size, input_height, input_width, |
2268 | | /*log2_input_element_size=*/XNN_LOG2_SIZEOF_UINT8_T, |
2269 | | /*log2_filter_element_size=*/XNN_LOG2_SIZEOF_UINT8_T, |
2270 | | /*log2_accumulator_element_size=*/XNN_LOG2_SIZEOF_INT32_T, |
2271 | 0 | /*extra_weights_elements_size=*/sizeof(int32_t), |
2272 | | /*log2_output_element_size=*/XNN_LOG2_SIZEOF_UINT8_T, |
2273 | 0 | workspace_size, workspace_alignment, |
2274 | 0 | output_height_out, output_width_out, |
2275 | 0 | threadpool); |
2276 | 0 | } |
2277 | | |
2278 | | enum xnn_status xnn_reshape_convolution2d_nhwc_qs8( |
2279 | | xnn_operator_t convolution_op, |
2280 | | size_t batch_size, |
2281 | | size_t input_height, |
2282 | | size_t input_width, |
2283 | | size_t* workspace_size, |
2284 | | size_t* workspace_alignment, |
2285 | | size_t* output_height_out, |
2286 | | size_t* output_width_out, |
2287 | | pthreadpool_t threadpool) |
2288 | 0 | { |
2289 | 0 | return reshape_convolution2d_nhwc( |
2290 | 0 | convolution_op, xnn_operator_type_convolution_nhwc_qs8, |
2291 | 0 | batch_size, input_height, input_width, |
2292 | | /*log2_input_element_size=*/XNN_LOG2_SIZEOF_INT8_T, |
2293 | | /*log2_filter_element_size=*/XNN_LOG2_SIZEOF_INT8_T, |
2294 | | /*log2_accumulator_element_size=*/XNN_LOG2_SIZEOF_INT32_T, |
2295 | 0 | /*extra_weights_elements_size=*/sizeof(int32_t), |
2296 | | /*log2_output_element_size=*/XNN_LOG2_SIZEOF_INT8_T, |
2297 | 0 | workspace_size, workspace_alignment, |
2298 | 0 | output_height_out, output_width_out, |
2299 | 0 | threadpool); |
2300 | 0 | } |
2301 | | |
2302 | | enum xnn_status xnn_reshape_convolution2d_nhwc_qs8_qc8w( |
2303 | | xnn_operator_t convolution_op, |
2304 | | size_t batch_size, |
2305 | | size_t input_height, |
2306 | | size_t input_width, |
2307 | | size_t* workspace_size, |
2308 | | size_t* workspace_alignment, |
2309 | | size_t* output_height_out, |
2310 | | size_t* output_width_out, |
2311 | | pthreadpool_t threadpool) |
2312 | 0 | { |
2313 | 0 | return reshape_convolution2d_nhwc( |
2314 | 0 | convolution_op, xnn_operator_type_convolution_nhwc_qc8, |
2315 | 0 | batch_size, input_height, input_width, |
2316 | | /*log2_input_element_size=*/XNN_LOG2_SIZEOF_INT8_T, |
2317 | | /*log2_filter_element_size=*/XNN_LOG2_SIZEOF_INT8_T, |
2318 | | /*log2_accumulator_element_size=*/XNN_LOG2_SIZEOF_INT32_T, |
2319 | 0 | /*extra_weights_elements_size=*/sizeof(int32_t) + sizeof(float), |
2320 | | /*log2_output_element_size=*/XNN_LOG2_SIZEOF_INT8_T, |
2321 | 0 | workspace_size, workspace_alignment, |
2322 | 0 | output_height_out, output_width_out, |
2323 | 0 | threadpool); |
2324 | 0 | } |
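Note the larger extra_weights_elements_size in the qc8 call: with per-channel quantization, each output channel's packed weights carry a float requantization scale in addition to the int32 bias that the per-tensor qs8/qu8 paths reserve. Summarizing the arithmetic from the calls above:

  // Extra packed-weights bytes reserved per output channel:
  //   qs8 / qu8:  sizeof(int32_t)                  == 4  (bias only)
  //   qs8_qc8w:   sizeof(int32_t) + sizeof(float)  == 8  (bias + scale)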
2325 | | |
2326 | | enum xnn_status xnn_reshape_convolution2d_nhwc_f16( |
2327 | | xnn_operator_t convolution_op, |
2328 | | size_t batch_size, |
2329 | | size_t input_height, |
2330 | | size_t input_width, |
2331 | | size_t* workspace_size, |
2332 | | size_t* workspace_alignment, |
2333 | | size_t* output_height_out, |
2334 | | size_t* output_width_out, |
2335 | | pthreadpool_t threadpool) |
2336 | 0 | { |
2337 | 0 | return reshape_convolution2d_nhwc( |
2338 | 0 | convolution_op, xnn_operator_type_convolution_nhwc_f16, |
2339 | 0 | batch_size, input_height, input_width, |
2340 | | /*log2_input_element_size=*/XNN_LOG2_SIZEOF_HALF, |
2341 | | /*log2_filter_element_size=*/XNN_LOG2_SIZEOF_HALF, |
2342 | | /*log2_accumulator_element_size=*/XNN_LOG2_SIZEOF_HALF, |
2343 | 0 | /*extra_weights_elements_size=*/sizeof(uint16_t), |
2344 | | /*log2_output_element_size=*/XNN_LOG2_SIZEOF_HALF, |
2345 | 0 | workspace_size, workspace_alignment, |
2346 | 0 | output_height_out, output_width_out, |
2347 | 0 | threadpool); |
2348 | 0 | } |
2349 | | |
2350 | | enum xnn_status xnn_reshape_convolution2d_nhwc_f32( |
2351 | | xnn_operator_t convolution_op, |
2352 | | size_t batch_size, |
2353 | | size_t input_height, |
2354 | | size_t input_width, |
2355 | | size_t* workspace_size, |
2356 | | size_t* workspace_alignment, |
2357 | | size_t* output_height_out, |
2358 | | size_t* output_width_out, |
2359 | | pthreadpool_t threadpool) |
2360 | 0 | { |
2361 | 0 | return reshape_convolution2d_nhwc( |
2362 | 0 | convolution_op, xnn_operator_type_convolution_nhwc_f32, |
2363 | 0 | batch_size, input_height, input_width, |
2364 | | /*log2_input_element_size=*/XNN_LOG2_SIZEOF_FLOAT, |
2365 | | /*log2_filter_element_size=*/XNN_LOG2_SIZEOF_FLOAT, |
2366 | | /*log2_accumulator_element_size=*/XNN_LOG2_SIZEOF_FLOAT, |
2367 | 0 | /*extra_weights_elements_size=*/sizeof(float), |
2368 | | /*log2_output_element_size=*/XNN_LOG2_SIZEOF_FLOAT, |
2369 | 0 | workspace_size, workspace_alignment, |
2370 | 0 | output_height_out, output_width_out, |
2371 | 0 | threadpool); |
2372 | 0 | } |
2373 | | |
2374 | | static enum xnn_status setup_gemm(xnn_operator_t convolution_op) |
2375 | 0 | { |
2376 | 0 | convolution_op->context.gemm.a = convolution_op->input; |
2377 | 0 | convolution_op->context.gemm.c = convolution_op->output; |
2378 | 0 | convolution_op->state = xnn_run_state_ready; |
2379 | |
2380 | 0 | return xnn_status_success; |
2381 | 0 | } |
2382 | | |
2383 | | static enum xnn_status setup_igemm( |
2384 | | xnn_operator_t convolution_op, |
2385 | | void* workspace, |
2386 | | uint32_t log2_input_element_size) |
2387 | 0 | { |
2388 | 0 | if (convolution_op->flags & XNN_FLAG_TRANSIENT_INDIRECTION_BUFFER) { |
2389 | 0 | convolution_op->context.igemm.a_offset = (size_t) 0; |
2390 | 0 | convolution_op->context.igemm.indirect_a = (const void**) workspace; |
2391 | 0 | convolution_op->context.conv2d_igemm_indirection_init.indirection_buffer = (const void**) workspace; |
2392 | 0 | convolution_op->context.conv2d_igemm_indirection_init.input = convolution_op->input; |
2393 | 0 | } else { |
2394 | 0 | convolution_op->context.igemm.a_offset = (size_t) ((uintptr_t) convolution_op->input - (uintptr_t) convolution_op->last_input); |
2395 | 0 | } |
2396 | 0 | convolution_op->context.igemm.c = convolution_op->output; |
2397 | 0 | convolution_op->state = xnn_run_state_ready; |
2398 | |
2399 | 0 | return xnn_status_success; |
2400 | 0 | } |
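The a_offset field is what keeps a persistent indirection buffer reusable across setup calls: its entries still point into the input that was current when the buffer was built (last_input), and the kernels rebase each entry at run time. A conceptual sketch, not the actual kernel code:

  // Each indirect row is recovered at kernel time roughly as
  //   row = (const void*) ((uintptr_t) indirect_a[i] + a_offset);
  // so when only the input pointer moves, setup_igemm merely updates
  //   a_offset = (uintptr_t) input - (uintptr_t) last_input;
  // instead of rewriting every entry of the indirection buffer.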
2401 | | |
2402 | | static enum xnn_status setup_dwconv( |
2403 | | xnn_operator_t convolution_op, |
2404 | | void* workspace, |
2405 | | uint32_t log2_input_element_size) |
2406 | 0 | { |
2407 | | #if XNN_TEST_MODE |
2408 | | // The indirection buffer is only set at this time if it is persistent.
2409 | | if (!(convolution_op->flags & XNN_FLAG_TRANSIENT_INDIRECTION_BUFFER)) { |
2410 | | const size_t kernel_height = convolution_op->kernel_height; |
2411 | | const size_t kernel_width = convolution_op->kernel_width; |
2412 | | const size_t kernel_size = kernel_height * kernel_width; |
2413 | | const size_t output_width = convolution_op->output_width; |
2414 | | const size_t step_width = convolution_op->dilation_width == 1 ? |
2415 | | min(convolution_op->stride_width, kernel_width) : kernel_width; |
2416 | | const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height; |
2417 | | const struct xnn_ukernel_dwconv dwconv_ukernel = convolution_op->ukernel.dwconv; |
2418 | | const size_t tile_size = dwconv_ukernel.tile_size; |
2419 | | const size_t indirection_buffer_size = |
2420 | | sizeof(void*) * (tile_size - kernel_size + convolution_op->output_height * step_height); |
2421 | | |
2422 | | // TODO(zhin): store step_height and step_width; these are already computed in create.
2423 | | for (size_t i = 0; i < indirection_buffer_size / sizeof(void*); i++) { |
2424 | | // Indirection initialization should have set all indirection pointers; make sure none of them is NULL.
2425 | | assert(convolution_op->indirection_buffer[i] != NULL); |
2426 | | } |
2427 | | } |
2428 | | #endif |
2429 | |
2430 | 0 | if (convolution_op->flags & XNN_FLAG_TRANSIENT_INDIRECTION_BUFFER) { |
2431 | 0 | convolution_op->context.dwconv.input_offset = (size_t) 0; |
2432 | 0 | convolution_op->context.dwconv.indirect_input = (const void**) workspace; |
2433 | 0 | convolution_op->context.dwconv_indirection_init.input = convolution_op->input; |
2434 | 0 | convolution_op->context.dwconv_indirection_init.indirection_buffer = (const void**) workspace; |
2435 | 0 | } else { |
2436 | 0 | convolution_op->context.dwconv.input_offset = (size_t) ((uintptr_t) convolution_op->input - (uintptr_t) convolution_op->last_input); |
2437 | 0 | } |
2438 | |
2439 | 0 | if (convolution_op->context.dwconv.buffer_size) { |
2440 | 0 | assert(workspace != NULL); |
2441 | 0 | convolution_op->context.dwconv.multipass_buffer = |
2442 | 0 | (void*) ((uintptr_t) workspace + convolution_op->context.dwconv.multipass_buffer_offset); |
2443 | 0 | } |
2444 | | |
2445 | 0 | convolution_op->context.dwconv.output = convolution_op->output; |
2446 | 0 | convolution_op->state = xnn_run_state_ready; |
2447 | |
2448 | 0 | return xnn_status_success; |
2449 | 0 | } |
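Reading the two workspace references together, the layout that reshape_dwconv prepared (it computed both sizes and the offset) looks roughly like this illustrative picture:

  // workspace + 0                        transient indirection pointers
  //   ...
  // workspace + multipass_buffer_offset  multi-pass accumulator buffer
  //                                      (present only when buffer_size != 0)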
2450 | | |
2451 | | static enum xnn_status setup_vmulcaddc(xnn_operator_t convolution_op) |
2452 | 0 | { |
2453 | 0 | convolution_op->context.vmulcaddc.x = convolution_op->input; |
2454 | 0 | convolution_op->context.vmulcaddc.y = convolution_op->output; |
2455 | 0 | convolution_op->state = xnn_run_state_ready; |
2456 | |
2457 | 0 | return xnn_status_success; |
2458 | 0 | } |
2459 | | |
2460 | | static enum xnn_status setup_convolution2d_nhwc( |
2461 | | xnn_operator_t convolution_op, |
2462 | | enum xnn_operator_type expected_operator_type, |
2463 | | void* workspace, |
2464 | | const void* input, |
2465 | | void* output, |
2466 | | uint32_t log2_input_element_size) |
2467 | 0 | { |
2468 | 0 | if (convolution_op->type != expected_operator_type) { |
2469 | 0 | xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)", |
2470 | 0 | xnn_operator_type_to_string(expected_operator_type), |
2471 | 0 | xnn_operator_type_to_string(convolution_op->type)); |
2472 | 0 | return xnn_status_invalid_parameter; |
2473 | 0 | } |
2474 | | |
2475 | 0 | switch (convolution_op->state) { |
2476 | 0 | case xnn_run_state_skip: |
2477 | 0 | return xnn_status_success; |
2478 | 0 | case xnn_run_state_invalid: |
2479 | 0 | xnn_log_error( |
2480 | 0 | "failed to setup %s operator: operator has not been reshaped yet", |
2481 | 0 | xnn_operator_type_to_string(convolution_op->type)); |
2482 | 0 | return xnn_status_invalid_state; |
2483 | 0 | case xnn_run_state_needs_setup: |
2484 | | // Operator has been reshaped but not set up; continue with setup.
2485 | 0 | case xnn_run_state_ready: |
2486 | | // Operator has been reshaped, and we are setting up with different pointers. |
2487 | 0 | break; |
2488 | 0 | } |
2489 | | |
2490 | 0 | convolution_op->input = input; |
2491 | 0 | convolution_op->output = output; |
2492 | |
2493 | 0 | switch (convolution_op->ukernel.type) { |
2494 | 0 | case xnn_microkernel_type_gemm: |
2495 | 0 | return setup_gemm(convolution_op); |
2496 | 0 | case xnn_microkernel_type_igemm: |
2497 | 0 | return setup_igemm(convolution_op, workspace, log2_input_element_size); |
2498 | 0 | case xnn_microkernel_type_dwconv: |
2499 | 0 | return setup_dwconv(convolution_op, workspace, log2_input_element_size); |
2500 | 0 | case xnn_microkernel_type_vmulcaddc: |
2501 | 0 | return setup_vmulcaddc(convolution_op); |
2502 | 0 | default: |
2503 | 0 | XNN_UNREACHABLE; |
2504 | 0 | } |
2505 | 0 | } |
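The state switch above enforces the reshape-before-setup ordering of the operator API: a caller that skips reshape gets a clean error instead of undefined behavior. Illustrative caller-side handling (`op`, `input`, and `output` are hypothetical):

  enum xnn_status status =
      xnn_setup_convolution2d_nhwc_f32(op, /*workspace=*/NULL, input, output);
  if (status == xnn_status_invalid_state) {
    // The operator was never reshaped: reshape first, then retry setup.
  }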
2506 | | |
2507 | | enum xnn_status xnn_setup_convolution2d_nhwc_qu8( |
2508 | | xnn_operator_t convolution_op, |
2509 | | void* workspace, |
2510 | | const uint8_t* input, |
2511 | | uint8_t* output) |
2512 | 0 | { |
2513 | 0 | return setup_convolution2d_nhwc( |
2514 | 0 | convolution_op, xnn_operator_type_convolution_nhwc_qu8, |
2515 | 0 | workspace, input, output, |
2516 | | /*log2_input_element_size=*/XNN_LOG2_SIZEOF_UINT8_T); |
2517 | 0 | } |
2518 | | |
2519 | | enum xnn_status xnn_setup_convolution2d_nhwc_qs8( |
2520 | | xnn_operator_t convolution_op, |
2521 | | void* workspace, |
2522 | | const int8_t* input, |
2523 | | int8_t* output) |
2524 | 0 | { |
2525 | 0 | return setup_convolution2d_nhwc( |
2526 | 0 | convolution_op, xnn_operator_type_convolution_nhwc_qs8, |
2527 | 0 | workspace, input, output, |
2528 | | /*log2_input_element_size=*/XNN_LOG2_SIZEOF_INT8_T); |
2529 | 0 | } |
2530 | | |
2531 | | enum xnn_status xnn_setup_convolution2d_nhwc_qs8_qc8w( |
2532 | | xnn_operator_t convolution_op, |
2533 | | void* workspace, |
2534 | | const int8_t* input, |
2535 | | int8_t* output) |
2536 | 0 | { |
2537 | 0 | return setup_convolution2d_nhwc( |
2538 | 0 | convolution_op, xnn_operator_type_convolution_nhwc_qc8, |
2539 | 0 | workspace, input, output, |
2540 | | /*log2_input_element_size=*/XNN_LOG2_SIZEOF_INT8_T); |
2541 | 0 | } |
2542 | | |
2543 | | enum xnn_status xnn_setup_convolution2d_nhwc_f16( |
2544 | | xnn_operator_t convolution_op, |
2545 | | void* workspace, |
2546 | | const void* input, |
2547 | | void* output) |
2548 | 0 | { |
2549 | 0 | return setup_convolution2d_nhwc( |
2550 | 0 | convolution_op, xnn_operator_type_convolution_nhwc_f16, |
2551 | 0 | workspace, input, output, |
2552 | | /*log2_input_element_size=*/XNN_LOG2_SIZEOF_HALF); |
2553 | 0 | } |
2554 | | |
2555 | | enum xnn_status xnn_setup_convolution2d_nhwc_f32( |
2556 | | xnn_operator_t convolution_op, |
2557 | | void* workspace, |
2558 | | const float* input, |
2559 | | float* output) |
2560 | 0 | { |
2561 | 0 | return setup_convolution2d_nhwc( |
2562 | 0 | convolution_op, xnn_operator_type_convolution_nhwc_f32, |
2563 | 0 | workspace, input, output, |
2564 | | /*log2_input_element_size=*/XNN_LOG2_SIZEOF_FLOAT); |
2565 | 0 | } |
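Putting the two halves together, a hypothetical end-to-end invocation of the f32 entry points defined above; operator creation is elided, and `op` is assumed to be an already-created f32 convolution operator:

  #include <stdlib.h>
  #include <xnnpack.h>

  enum xnn_status run_f32_conv(
      xnn_operator_t op, pthreadpool_t threadpool,
      size_t batch, size_t in_h, size_t in_w,
      const float* input, float* output) {
    size_t workspace_size = 0, workspace_alignment = 0;
    size_t out_h = 0, out_w = 0;
    enum xnn_status status = xnn_reshape_convolution2d_nhwc_f32(
        op, batch, in_h, in_w,
        &workspace_size, &workspace_alignment, &out_h, &out_w, threadpool);
    if (status != xnn_status_success) {
      return status;
    }

    // aligned_alloc requires the size to be a multiple of the alignment;
    // XNNPACK alignments are powers of two, so round up with a mask.
    void* workspace = NULL;
    if (workspace_size != 0) {
      const size_t padded_size =
          (workspace_size + workspace_alignment - 1) & ~(workspace_alignment - 1);
      workspace = aligned_alloc(workspace_alignment, padded_size);
      if (workspace == NULL) {
        return xnn_status_out_of_memory;
      }
    }

    status = xnn_setup_convolution2d_nhwc_f32(op, workspace, input, output);
    if (status == xnn_status_success) {
      status = xnn_run_operator(op, threadpool);
    }
    free(workspace);
    return status;
  }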