Coverage Report

Created: 2023-09-25 06:31

/src/xnnpack/src/amalgam/gen/scalar.c
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2021 Google LLC
2
//
3
// This source code is licensed under the BSD-style license found in the
4
// LICENSE file in the root directory of this source tree.
5
6
#include <assert.h>
7
#include <fxdiv.h>
8
#include <math.h>
9
#include <stddef.h>
10
#include <stdint.h>
11
#include <string.h>
12
13
#include <xnnpack/argmaxpool.h>
14
#include <xnnpack/avgpool.h>
15
#include <xnnpack/common.h>
16
#include <xnnpack/conv.h>
17
#include <xnnpack/dwconv.h>
18
#include <xnnpack/fill.h>
19
#include <xnnpack/gavgpool.h>
20
#include <xnnpack/gemm.h>
21
#include <xnnpack/ibilinear.h>
22
#include <xnnpack/igemm.h>
23
#include <xnnpack/lut.h>
24
#include <xnnpack/math.h>
25
#include <xnnpack/maxpool.h>
26
#include <xnnpack/microparams.h>
27
#include <xnnpack/packw.h>
28
#include <xnnpack/pad.h>
29
#include <xnnpack/pavgpool.h>
30
#include <xnnpack/prelu.h>
31
#include <xnnpack/raddstoreexpminusmax.h>
32
#include <xnnpack/reduce.h>
33
#include <xnnpack/rmax.h>
34
#include <xnnpack/spmm.h>
35
#include <xnnpack/transpose.h>
36
#include <xnnpack/unaligned.h>
37
#include <xnnpack/unpool.h>
38
#include <xnnpack/vadd.h>
39
#include <xnnpack/vbinary.h>
40
#include <xnnpack/vcvt.h>
41
#include <xnnpack/vlrelu.h>
42
#include <xnnpack/vmul.h>
43
#include <xnnpack/vmulcaddc.h>
44
#include <xnnpack/vunary.h>
45
#include <xnnpack/zip.h>
46
47
48
void xnn_f16_f32_vcvt_ukernel__scalar_u1(
49
    size_t batch,
50
    const void* input,
51
    float* output,
52
    const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
53
0
{
54
0
  assert(batch != 0);
55
0
  assert(batch % sizeof(uint16_t) == 0);
56
0
  assert(input != NULL);
57
0
  assert(output != NULL);
58
59
0
  const uint32_t vsign_mask = params->scalar.sign_mask;
60
0
  const uint32_t vexp_offset = params->scalar.exp_offset;
61
0
  const float vexp_scale = params->scalar.exp_scale;
62
0
  const uint32_t vmagic_mask = params->scalar.magic_mask;
63
0
  const float vmagic_bias = params->scalar.magic_bias;
64
0
  const uint32_t vdenorm_cutoff = params->scalar.denorm_cutoff;
65
66
0
  const uint16_t* i = (const uint16_t*) input;
67
0
  uint32_t* o = (uint32_t*) output;
68
0
  do {
69
0
    const uint16_t vh = *i++;
70
71
0
    const uint32_t vw = (uint32_t) vh << 16;
72
0
    const uint32_t vsign = vw & vsign_mask;
73
0
    const uint32_t v2w = vw + vw;
74
0
    const uint32_t vnorm = float_as_uint32(uint32_as_float((v2w >> 4) + vexp_offset) * vexp_scale);
75
0
    const uint32_t vdenorm = float_as_uint32(uint32_as_float((v2w >> 17) | vmagic_mask) - vmagic_bias);
76
0
    const uint32_t vf = vsign | (XNN_UNPREDICTABLE(v2w < vdenorm_cutoff) ? vdenorm : vnorm);
77
78
0
    *o++ = vf;
79
80
0
    batch -= sizeof(uint16_t);
81
0
  } while (batch != 0);
82
0
}
83
84
void xnn_f16_f32_vcvt_ukernel__scalar_u4(
85
    size_t batch,
86
    const void* input,
87
    float* output,
88
    const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
89
0
{
90
0
  assert(batch != 0);
91
0
  assert(batch % sizeof(uint16_t) == 0);
92
0
  assert(input != NULL);
93
0
  assert(output != NULL);
94
95
0
  const uint32_t vsign_mask = params->scalar.sign_mask;
96
0
  const uint32_t vexp_offset = params->scalar.exp_offset;
97
0
  const float vexp_scale = params->scalar.exp_scale;
98
0
  const uint32_t vmagic_mask = params->scalar.magic_mask;
99
0
  const float vmagic_bias = params->scalar.magic_bias;
100
0
  const uint32_t vdenorm_cutoff = params->scalar.denorm_cutoff;
101
102
0
  const uint16_t* i = (const uint16_t*) input;
103
0
  uint32_t* o = (uint32_t*) output;
104
0
  for (; batch >= 4 * sizeof(uint16_t); batch -= 4 * sizeof(uint16_t)) {
105
0
    const uint16_t vh0 = i[0];
106
0
    const uint16_t vh1 = i[1];
107
0
    const uint16_t vh2 = i[2];
108
0
    const uint16_t vh3 = i[3];
109
0
    i += 4;
110
111
0
    const uint32_t vw0 = (uint32_t) vh0 << 16;
112
0
    const uint32_t vw1 = (uint32_t) vh1 << 16;
113
0
    const uint32_t vw2 = (uint32_t) vh2 << 16;
114
0
    const uint32_t vw3 = (uint32_t) vh3 << 16;
115
116
0
    const uint32_t vsign0 = vw0 & vsign_mask;
117
0
    const uint32_t vsign1 = vw1 & vsign_mask;
118
0
    const uint32_t vsign2 = vw2 & vsign_mask;
119
0
    const uint32_t vsign3 = vw3 & vsign_mask;
120
121
0
    const uint32_t v2w0 = vw0 + vw0;
122
0
    const uint32_t v2w1 = vw1 + vw1;
123
0
    const uint32_t v2w2 = vw2 + vw2;
124
0
    const uint32_t v2w3 = vw3 + vw3;
125
126
0
    const uint32_t vnorm0 = float_as_uint32(uint32_as_float((v2w0 >> 4) + vexp_offset) * vexp_scale);
127
0
    const uint32_t vnorm1 = float_as_uint32(uint32_as_float((v2w1 >> 4) + vexp_offset) * vexp_scale);
128
0
    const uint32_t vnorm2 = float_as_uint32(uint32_as_float((v2w2 >> 4) + vexp_offset) * vexp_scale);
129
0
    const uint32_t vnorm3 = float_as_uint32(uint32_as_float((v2w3 >> 4) + vexp_offset) * vexp_scale);
130
131
0
    const uint32_t vdenorm0 = float_as_uint32(uint32_as_float((v2w0 >> 17) | vmagic_mask) - vmagic_bias);
132
0
    const uint32_t vdenorm1 = float_as_uint32(uint32_as_float((v2w1 >> 17) | vmagic_mask) - vmagic_bias);
133
0
    const uint32_t vdenorm2 = float_as_uint32(uint32_as_float((v2w2 >> 17) | vmagic_mask) - vmagic_bias);
134
0
    const uint32_t vdenorm3 = float_as_uint32(uint32_as_float((v2w3 >> 17) | vmagic_mask) - vmagic_bias);
135
136
0
    const uint32_t vf0 = vsign0 | (XNN_UNPREDICTABLE(v2w0 < vdenorm_cutoff) ? vdenorm0 : vnorm0);
137
0
    const uint32_t vf1 = vsign1 | (XNN_UNPREDICTABLE(v2w1 < vdenorm_cutoff) ? vdenorm1 : vnorm1);
138
0
    const uint32_t vf2 = vsign2 | (XNN_UNPREDICTABLE(v2w2 < vdenorm_cutoff) ? vdenorm2 : vnorm2);
139
0
    const uint32_t vf3 = vsign3 | (XNN_UNPREDICTABLE(v2w3 < vdenorm_cutoff) ? vdenorm3 : vnorm3);
140
141
0
    o[0] = vf0;
142
0
    o[1] = vf1;
143
0
    o[2] = vf2;
144
0
    o[3] = vf3;
145
0
    o += 4;
146
0
  }
147
0
  if XNN_UNLIKELY(batch != 0) {
148
0
    do {
149
0
      const uint16_t vh = *i++;
150
151
0
      const uint32_t vw = (uint32_t) vh << 16;
152
0
      const uint32_t vsign = vw & vsign_mask;
153
0
      const uint32_t v2w = vw + vw;
154
0
      const uint32_t vnorm = float_as_uint32(uint32_as_float((v2w >> 4) + vexp_offset) * vexp_scale);
155
0
      const uint32_t vdenorm = float_as_uint32(uint32_as_float((v2w >> 17) | vmagic_mask) - vmagic_bias);
156
0
      const uint32_t vf = vsign | (XNN_UNPREDICTABLE(v2w < vdenorm_cutoff) ? vdenorm : vnorm);
157
158
0
      *o++ = vf;
159
160
0
      batch -= sizeof(uint16_t);
161
0
    } while (batch != 0);
162
0
  }
163
0
}
164
165
// Single-pass argmax pooling over at most 4 pooling elements, one channel at
// a time.
//
// For every output pixel and channel, writes the largest value among the
// pooling rows to `output` and the position of that row (0-based) to `index`.
// Comparisons are strict (`>`), so on ties the earliest row wins.
//
//   output_pixels    - number of output pixels to produce (asserted != 0).
//   pooling_elements - number of valid row pointers per pixel (asserted 1..4).
//   channels         - number of floats read from each row (asserted != 0).
//   input            - array of row pointers; only the first
//                      `pooling_elements` entries are read for each pixel.
//   input_offset     - byte offset added to every row pointer.
//   output, index    - destinations, advanced by one element per channel.
//   input_increment  - bytes added to `input` after each pixel.
//   output_increment - bytes added to `output` after each pixel.
void xnn_f32_argmaxpool_ukernel_4x__scalar_c1(
    size_t output_pixels,
    size_t pooling_elements,
    size_t channels,
    const float** input,
    size_t input_offset,
    float* output,
    uint32_t* index,
    size_t input_increment,
    size_t output_increment)
{
  assert(output_pixels != 0);
  assert(pooling_elements != 0);
  assert(pooling_elements <= 4);
  assert(channels != 0);

  do {
    // Unused slots alias row 0: a duplicate of row 0 can never be strictly
    // greater than the running maximum, so it never changes the result.
    // Entries at or beyond `pooling_elements` are not read from `input`.
    const float* rows[4];
    rows[0] = (const float*) ((uintptr_t) input[0] + input_offset);
    for (size_t r = 1; r < 4; r++) {
      rows[r] = r < pooling_elements
          ? (const float*) ((uintptr_t) input[r] + input_offset)
          : rows[0];
    }

    for (size_t c = 0; c < channels; c++) {
      float vmax = rows[0][c];
      uint32_t vidx = 0;
      for (uint32_t r = 1; r < 4; r++) {
        const float vi = rows[r][c];
        if (vi > vmax) {
          vmax = vi;
          vidx = r;
        }
      }
      *output++ = vmax;
      *index++ = vidx;
    }

    input = (const float**) ((uintptr_t) input + input_increment);
    output = (float*) ((uintptr_t) output + output_increment);
  } while (--output_pixels != 0);
}
232
233
// Multi-pass argmax pooling for more than 9 pooling elements, one channel at
// a time.
//
// Per output pixel:
//   1. the first 9 rows seed `accumulation_buffer` / `index_buffer` with the
//      running maximum and its row position for every channel;
//   2. while more than 8 rows remain, groups of 8 rows update those buffers;
//   3. the last 1..8 rows are merged with the buffers and the final maximum
//      and row position are written to `output` / `index`.
// Comparisons are strict (`>`), so on ties the earliest row wins.
//
//   output_pixels       - number of output pixels (asserted != 0).
//   pooling_elements    - number of valid row pointers per pixel (asserted > 9).
//   channels            - floats read from each row (asserted != 0).
//   input               - array of row pointers; exactly `pooling_elements`
//                         entries are read for each pixel.
//   input_offset        - byte offset added to every row pointer.
//   accumulation_buffer - scratch for `channels` running maxima.
//   index_buffer        - scratch for `channels` running row positions.
//   output, index       - destinations, advanced by one element per channel.
//   input_increment     - bytes added to `input` after the final pass (on top
//                         of the rows consumed by the earlier passes).
//   output_increment    - bytes added to `output` after each pixel.
void xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1(
    size_t output_pixels,
    size_t pooling_elements,
    size_t channels,
    const float** input,
    size_t input_offset,
    float* accumulation_buffer,
    uint32_t* index_buffer,
    float* output,
    uint32_t* index,
    size_t input_increment,
    size_t output_increment)
{
  assert(output_pixels != 0);
  assert(pooling_elements != 0);
  assert(pooling_elements > 9);
  assert(channels != 0);

  do {
    const float* rows[9];

    // First pass: rows 0..8 initialise the scratch buffers.
    for (size_t r = 0; r < 9; r++) {
      rows[r] = (const float*) ((uintptr_t) input[r] + input_offset);
    }
    input += 9;
    for (size_t c = 0; c < channels; c++) {
      float vmax = rows[0][c];
      uint32_t vidx = 0;
      for (uint32_t r = 1; r < 9; r++) {
        const float vi = rows[r][c];
        if (vi > vmax) {
          vmax = vi;
          vidx = r;
        }
      }
      accumulation_buffer[c] = vmax;
      index_buffer[c] = vidx;
    }

    // Intermediate passes: 8 rows at a time while more than 8 remain.
    // vidx0 is the position of the first row of the current group.
    uint32_t vidx0 = 9;
    size_t k = pooling_elements - 9;
    for (; k > 8; k -= 8) {
      for (size_t r = 0; r < 8; r++) {
        rows[r] = (const float*) ((uintptr_t) input[r] + input_offset);
      }
      input += 8;
      for (size_t c = 0; c < channels; c++) {
        float vmax = accumulation_buffer[c];
        uint32_t vidx = index_buffer[c];
        for (uint32_t r = 0; r < 8; r++) {
          const float vi = rows[r][c];
          if (vi > vmax) {
            vmax = vi;
            vidx = vidx0 + r;
          }
        }
        accumulation_buffer[c] = vmax;
        index_buffer[c] = vidx;
      }
      vidx0 += 8;
    }

    // Final pass: k rows remain (1..8 when the entry assertions hold).
    // Only those k entries are read from `input`; unused slots alias the
    // first remaining row, which can never be strictly greater than the
    // running maximum once that row itself has been compared.
    rows[0] = (const float*) ((uintptr_t) input[0] + input_offset);
    for (size_t r = 1; r < 8; r++) {
      rows[r] = r < k
          ? (const float*) ((uintptr_t) input[r] + input_offset)
          : rows[0];
    }
    input = (const float**) ((uintptr_t) input + input_increment);

    for (size_t c = 0; c < channels; c++) {
      float vmax = accumulation_buffer[c];
      uint32_t vidx = index_buffer[c];
      for (uint32_t r = 0; r < 8; r++) {
        const float vi = rows[r][c];
        if (vi > vmax) {
          vmax = vi;
          vidx = vidx0 + r;
        }
      }
      *output++ = vmax;
      *index++ = vidx;
    }

    output = (float*) ((uintptr_t) output + output_increment);
  } while (--output_pixels != 0);
}
524
525
// Single-pass argmax pooling over at most 9 pooling elements, one channel at
// a time.
//
// For every output pixel and channel, writes the largest value among the
// pooling rows to `output` and the position of that row (0-based) to `index`.
// Comparisons are strict (`>`), so on ties the earliest row wins.
//
//   output_pixels    - number of output pixels to produce (asserted != 0).
//   pooling_elements - number of valid row pointers per pixel (asserted 1..9).
//   channels         - number of floats read from each row (asserted != 0).
//   input            - array of row pointers; only the first
//                      `pooling_elements` entries are read for each pixel.
//   input_offset     - byte offset added to every row pointer.
//   output, index    - destinations, advanced by one element per channel.
//   input_increment  - bytes added to `input` after each pixel.
//   output_increment - bytes added to `output` after each pixel.
void xnn_f32_argmaxpool_ukernel_9x__scalar_c1(
    size_t output_pixels,
    size_t pooling_elements,
    size_t channels,
    const float** input,
    size_t input_offset,
    float* output,
    uint32_t* index,
    size_t input_increment,
    size_t output_increment)
{
  assert(output_pixels != 0);
  assert(pooling_elements != 0);
  assert(pooling_elements <= 9);
  assert(channels != 0);

  do {
    // Unused slots alias row 0: a duplicate of row 0 can never be strictly
    // greater than the running maximum, so it never changes the result.
    // Entries at or beyond `pooling_elements` are not read from `input`.
    const float* rows[9];
    rows[0] = (const float*) ((uintptr_t) input[0] + input_offset);
    for (size_t r = 1; r < 9; r++) {
      rows[r] = r < pooling_elements
          ? (const float*) ((uintptr_t) input[r] + input_offset)
          : rows[0];
    }

    for (size_t c = 0; c < channels; c++) {
      float vmax = rows[0][c];
      uint32_t vidx = 0;
      for (uint32_t r = 1; r < 9; r++) {
        const float vi = rows[r][c];
        if (vi > vmax) {
          vmax = vi;
          vidx = r;
        }
      }
      *output++ = vmax;
      *index++ = vidx;
    }

    input = (const float**) ((uintptr_t) input + input_increment);
    output = (float*) ((uintptr_t) output + output_increment);
  } while (--output_pixels != 0);
}
647
648
void xnn_f32_avgpool_minmax_ukernel_9p8x__scalar_c1(
649
    size_t output_pixels,
650
    size_t kernel_elements,
651
    size_t channels,
652
    const float** input,
653
    size_t input_offset,
654
    const float* zero,
655
    float* buffer,
656
    float* output,
657
    size_t input_increment,
658
    size_t output_increment,
659
    const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
660
0
{
661
0
  assert(output_pixels != 0);
662
0
  assert(kernel_elements > 9);
663
0
  assert(channels != 0);
664
665
0
  const float vscale = params->scalar.scale;
666
0
  const float vmin = params->scalar.min;
667
0
  const float vmax = params->scalar.max;
668
669
0
  do {
670
0
    {
671
0
      const float* i0 = *input++;
672
0
      assert(i0 != NULL);
673
0
      if XNN_UNPREDICTABLE(i0 != zero) {
674
0
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
675
0
      }
676
0
      const float* i1 = *input++;
677
0
      assert(i1 != NULL);
678
0
      if XNN_UNPREDICTABLE(i1 != zero) {
679
0
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
680
0
      }
681
0
      const float* i2 = *input++;
682
0
      assert(i2 != NULL);
683
0
      if XNN_UNPREDICTABLE(i2 != zero) {
684
0
        i2 = (const float*) ((uintptr_t) i2 + input_offset);
685
0
      }
686
0
      const float* i3 = *input++;
687
0
      assert(i3 != NULL);
688
0
      if XNN_UNPREDICTABLE(i3 != zero) {
689
0
        i3 = (const float*) ((uintptr_t) i3 + input_offset);
690
0
      }
691
0
      const float* i4 = *input++;
692
0
      assert(i4 != NULL);
693
0
      if XNN_UNPREDICTABLE(i4 != zero) {
694
0
        i4 = (const float*) ((uintptr_t) i4 + input_offset);
695
0
      }
696
0
      const float* i5 = *input++;
697
0
      assert(i5 != NULL);
698
0
      if XNN_UNPREDICTABLE(i5 != zero) {
699
0
        i5 = (const float*) ((uintptr_t) i5 + input_offset);
700
0
      }
701
0
      const float* i6 = *input++;
702
0
      assert(i6 != NULL);
703
0
      if XNN_UNPREDICTABLE(i6 != zero) {
704
0
        i6 = (const float*) ((uintptr_t) i6 + input_offset);
705
0
      }
706
0
      const float* i7 = *input++;
707
0
      assert(i7 != NULL);
708
0
      if XNN_UNPREDICTABLE(i7 != zero) {
709
0
        i7 = (const float*) ((uintptr_t) i7 + input_offset);
710
0
      }
711
0
      const float* i8 = *input++;
712
0
      assert(i8 != NULL);
713
0
      if XNN_UNPREDICTABLE(i8 != zero) {
714
0
        i8 = (const float*) ((uintptr_t) i8 + input_offset);
715
0
      }
716
717
0
      float* b = buffer;
718
0
      size_t c = channels;
719
0
      do {
720
0
        const float vi0 = *i0++;
721
0
        const float vi1 = *i1++;
722
0
        const float vi2 = *i2++;
723
0
        const float vi3 = *i3++;
724
0
        const float vi4 = *i4++;
725
0
        const float vi5 = *i5++;
726
0
        const float vi6 = *i6++;
727
0
        const float vi7 = *i7++;
728
0
        const float vi8 = *i8++;
729
730
0
        const float vsum01 = vi0 + vi1;
731
0
        const float vsum23 = vi2 + vi3;
732
0
        const float vsum45 = vi4 + vi5;
733
0
        const float vsum67 = vi6 + vi7;
734
0
        const float vsum018 = vsum01 + vi8;
735
0
        const float vsum2345 = vsum23 + vsum45;
736
0
        const float vsum01678 = vsum018 + vsum67;
737
0
        const float vsum = vsum2345 + vsum01678;
738
739
0
        *b++ = vsum;
740
0
      } while (--c != 0);
741
0
    }
742
743
0
    size_t k = kernel_elements;
744
0
    for (k -= 9; k > 8; k -= 8) {
745
0
      const float* i0 = *input++;
746
0
      assert(i0 != NULL);
747
0
      if XNN_UNPREDICTABLE(i0 != zero) {
748
0
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
749
0
      }
750
0
      const float* i1 = *input++;
751
0
      assert(i1 != NULL);
752
0
      if XNN_UNPREDICTABLE(i1 != zero) {
753
0
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
754
0
      }
755
0
      const float* i2 = *input++;
756
0
      assert(i2 != NULL);
757
0
      if XNN_UNPREDICTABLE(i2 != zero) {
758
0
        i2 = (const float*) ((uintptr_t) i2 + input_offset);
759
0
      }
760
0
      const float* i3 = *input++;
761
0
      assert(i3 != NULL);
762
0
      if XNN_UNPREDICTABLE(i3 != zero) {
763
0
        i3 = (const float*) ((uintptr_t) i3 + input_offset);
764
0
      }
765
0
      const float* i4 = *input++;
766
0
      assert(i4 != NULL);
767
0
      if XNN_UNPREDICTABLE(i4 != zero) {
768
0
        i4 = (const float*) ((uintptr_t) i4 + input_offset);
769
0
      }
770
0
      const float* i5 = *input++;
771
0
      assert(i5 != NULL);
772
0
      if XNN_UNPREDICTABLE(i5 != zero) {
773
0
        i5 = (const float*) ((uintptr_t) i5 + input_offset);
774
0
      }
775
0
      const float* i6 = *input++;
776
0
      assert(i6 != NULL);
777
0
      if XNN_UNPREDICTABLE(i6 != zero) {
778
0
        i6 = (const float*) ((uintptr_t) i6 + input_offset);
779
0
      }
780
0
      const float* i7 = *input++;
781
0
      assert(i7 != NULL);
782
0
      if XNN_UNPREDICTABLE(i7 != zero) {
783
0
        i7 = (const float*) ((uintptr_t) i7 + input_offset);
784
0
      }
785
786
0
      float* b = buffer;
787
0
      size_t c = channels;
788
0
      do {
789
0
        const float vi0 = *i0++;
790
0
        const float vi1 = *i1++;
791
0
        const float vi2 = *i2++;
792
0
        const float vi3 = *i3++;
793
0
        const float vi4 = *i4++;
794
0
        const float vi5 = *i5++;
795
0
        const float vi6 = *i6++;
796
0
        const float vi7 = *i7++;
797
0
        const float vacc = *b;
798
799
0
        const float vsum01 = vi0 + vi1;
800
0
        const float vsum23 = vi2 + vi3;
801
0
        const float vsum45 = vi4 + vi5;
802
0
        const float vsum67 = vi6 + vi7;
803
0
        const float vsum01a = vsum01 + vacc;
804
0
        const float vsum2345 = vsum23 + vsum45;
805
0
        const float vsum0167a = vsum01a + vsum67;
806
0
        const float vsum = vsum2345 + vsum0167a;
807
808
0
        *b++ = vsum;
809
0
      } while (--c != 0);
810
0
    }
811
812
0
    {
813
0
      const float* i0 = input[0];
814
0
      assert(i0 != NULL);
815
0
      const float* i1 = input[1];
816
0
      const float* i2 = input[2];
817
0
      const float* i3 = input[3];
818
0
      const float* i4 = input[4];
819
0
      const float* i5 = input[5];
820
0
      const float* i6 = input[6];
821
0
      const float* i7 = input[7];
822
0
      input = (const float**) ((uintptr_t) input + input_increment);
823
0
      if (k < 2) {
824
0
        i1 = zero;
825
0
      }
826
0
      assert(i1 != NULL);
827
0
      if (k <= 2) {
828
0
        i2 = zero;
829
0
      }
830
0
      assert(i2 != NULL);
831
0
      if (k < 4) {
832
0
        i3 = zero;
833
0
      }
834
0
      assert(i3 != NULL);
835
0
      if (k <= 4) {
836
0
        i4 = zero;
837
0
      }
838
0
      assert(i4 != NULL);
839
0
      if (k < 6) {
840
0
        i5 = zero;
841
0
      }
842
0
      assert(i5 != NULL);
843
0
      if (k <= 6) {
844
0
        i6 = zero;
845
0
      }
846
0
      assert(i6 != NULL);
847
0
      if (k < 8) {
848
0
        i7 = zero;
849
0
      }
850
0
      assert(i7 != NULL);
851
0
      if XNN_UNPREDICTABLE(i0 != zero) {
852
0
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
853
0
      }
854
0
      if XNN_UNPREDICTABLE(i1 != zero) {
855
0
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
856
0
      }
857
0
      if XNN_UNPREDICTABLE(i2 != zero) {
858
0
        i2 = (const float*) ((uintptr_t) i2 + input_offset);
859
0
      }
860
0
      if XNN_UNPREDICTABLE(i3 != zero) {
861
0
        i3 = (const float*) ((uintptr_t) i3 + input_offset);
862
0
      }
863
0
      if XNN_UNPREDICTABLE(i4 != zero) {
864
0
        i4 = (const float*) ((uintptr_t) i4 + input_offset);
865
0
      }
866
0
      if XNN_UNPREDICTABLE(i5 != zero) {
867
0
        i5 = (const float*) ((uintptr_t) i5 + input_offset);
868
0
      }
869
0
      if XNN_UNPREDICTABLE(i6 != zero) {
870
0
        i6 = (const float*) ((uintptr_t) i6 + input_offset);
871
0
      }
872
0
      if XNN_UNPREDICTABLE(i7 != zero) {
873
0
        i7 = (const float*) ((uintptr_t) i7 + input_offset);
874
0
      }
875
876
0
      size_t c = channels;
877
0
      float* b = buffer;
878
0
      do {
879
0
        const float vi0 = *i0++;
880
0
        const float vi1 = *i1++;
881
0
        const float vi2 = *i2++;
882
0
        const float vi3 = *i3++;
883
0
        const float vi4 = *i4++;
884
0
        const float vi5 = *i5++;
885
0
        const float vi6 = *i6++;
886
0
        const float vi7 = *i7++;
887
0
        const float vacc = *b++;
888
889
0
        const float vsum01 = vi0 + vi1;
890
0
        const float vsum23 = vi2 + vi3;
891
0
        const float vsum45 = vi4 + vi5;
892
0
        const float vsum67 = vi6 + vi7;
893
0
        const float vsum01a = vsum01 + vacc;
894
0
        const float vsum2345 = vsum23 + vsum45;
895
0
        const float vsum0167a = vsum01a + vsum67;
896
0
        const float vsum = vsum2345 + vsum0167a;
897
898
0
        float vout = vsum * vscale;
899
0
        vout = math_max_f32(vout, vmin);
900
0
        vout = math_min_f32(vout, vmax);
901
902
0
        *output++ = vout;
903
0
      } while (--c != 0);
904
0
    }
905
0
    output = (float*) ((uintptr_t) output + output_increment);
906
0
  } while (--output_pixels != 0);
907
0
}
908
909
void xnn_f32_avgpool_minmax_ukernel_9x__scalar_c1(
910
    size_t output_pixels,
911
    size_t kernel_elements,
912
    size_t channels,
913
    const float** input,
914
    size_t input_offset,
915
    const float* zero,
916
    float* output,
917
    size_t input_increment,
918
    size_t output_increment,
919
    const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
920
0
{
921
0
  assert(output_pixels != 0);
922
0
  assert(kernel_elements != 0);
923
0
  assert(kernel_elements <= 9);
924
0
  assert(channels != 0);
925
926
0
  const float vscale = params->scalar.scale;
927
0
  const float vmin = params->scalar.min;
928
0
  const float vmax = params->scalar.max;
929
930
0
  do {
931
0
    const float* i0 = input[0];
932
0
    assert(i0 != NULL);
933
0
    const float* i1 = input[1];
934
0
    const float* i2 = input[2];
935
0
    const float* i3 = input[3];
936
0
    const float* i4 = input[4];
937
0
    const float* i5 = input[5];
938
0
    const float* i6 = input[6];
939
0
    const float* i7 = input[7];
940
0
    const float* i8 = input[8];
941
0
    input = (const float**) ((uintptr_t) input + input_increment);
942
0
    if (kernel_elements < 2) {
943
0
      i1 = zero;
944
0
    }
945
0
    assert(i1 != NULL);
946
0
    if (kernel_elements <= 2) {
947
0
      i2 = zero;
948
0
    }
949
0
    assert(i2 != NULL);
950
0
    if (kernel_elements < 4) {
951
0
      i3 = zero;
952
0
    }
953
0
    assert(i3 != NULL);
954
0
    if (kernel_elements <= 4) {
955
0
      i4 = zero;
956
0
    }
957
0
    assert(i4 != NULL);
958
0
    if (kernel_elements < 6) {
959
0
      i5 = zero;
960
0
    }
961
0
    assert(i5 != NULL);
962
0
    if (kernel_elements <= 6) {
963
0
      i6 = zero;
964
0
    }
965
0
    assert(i6 != NULL);
966
0
    if (kernel_elements < 8) {
967
0
      i7 = zero;
968
0
    }
969
0
    assert(i7 != NULL);
970
0
    if (kernel_elements <= 8) {
971
0
      i8 = zero;
972
0
    }
973
0
    assert(i8 != NULL);
974
0
    if XNN_UNPREDICTABLE(i0 != zero) {
975
0
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
976
0
    }
977
0
    if XNN_UNPREDICTABLE(i1 != zero) {
978
0
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
979
0
    }
980
0
    if XNN_UNPREDICTABLE(i2 != zero) {
981
0
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
982
0
    }
983
0
    if XNN_UNPREDICTABLE(i3 != zero) {
984
0
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
985
0
    }
986
0
    if XNN_UNPREDICTABLE(i4 != zero) {
987
0
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
988
0
    }
989
0
    if XNN_UNPREDICTABLE(i5 != zero) {
990
0
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
991
0
    }
992
0
    if XNN_UNPREDICTABLE(i6 != zero) {
993
0
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
994
0
    }
995
0
    if XNN_UNPREDICTABLE(i7 != zero) {
996
0
      i7 = (const float*) ((uintptr_t) i7 + input_offset);
997
0
    }
998
0
    if XNN_UNPREDICTABLE(i8 != zero) {
999
0
      i8 = (const float*) ((uintptr_t) i8 + input_offset);
1000
0
    }
1001
1002
0
    size_t c = channels;
1003
0
    do {
1004
0
      const float vi0 = *i0++;
1005
0
      const float vi1 = *i1++;
1006
0
      const float vi2 = *i2++;
1007
0
      const float vi3 = *i3++;
1008
0
      const float vi4 = *i4++;
1009
0
      const float vi5 = *i5++;
1010
0
      const float vi6 = *i6++;
1011
0
      const float vi7 = *i7++;
1012
0
      const float vi8 = *i8++;
1013
1014
0
      const float vsum01 = vi0 + vi1;
1015
0
      const float vsum23 = vi2 + vi3;
1016
0
      const float vsum45 = vi4 + vi5;
1017
0
      const float vsum67 = vi6 + vi7;
1018
0
      const float vsum018 = vsum01 + vi8;
1019
0
      const float vsum2345 = vsum23 + vsum45;
1020
0
      const float vsum01678 = vsum018 + vsum67;
1021
0
      const float vsum = vsum2345 + vsum01678;
1022
1023
0
      float vout = vsum * vscale;
1024
0
      vout = math_max_f32(vout, vmin);
1025
0
      vout = math_min_f32(vout, vmax);
1026
1027
0
      *output++ = vout;
1028
0
    } while (--c != 0);
1029
0
    output = (float*) ((uintptr_t) output + output_increment);
1030
0
  } while (--output_pixels != 0);
1031
0
}
1032
1033
void xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1(
1034
    size_t input_height,
1035
    size_t input_width,
1036
    size_t output_y_start,
1037
    size_t output_y_end,
1038
    const float* input,
1039
    const float* zero,
1040
    const float* weights,
1041
    float* output,
1042
    size_t input_padding_top,
1043
    size_t output_channels,
1044
    size_t output_height_stride,
1045
    size_t output_channel_stride,
1046
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
1047
0
{
1048
0
  assert(input_width != 0);
1049
0
  assert(output_y_end > output_y_start);
1050
0
  assert(input_padding_top <= 1);
1051
0
  assert(output_channels != 0);
1052
1053
0
  const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
1054
0
  const size_t input_width_decrement = round_down_po2(input_width, 2) * 3 /* channels */ * sizeof(float);
1055
0
  const size_t output_width = (input_width + 1) / 2;
1056
0
  const size_t output_channel_increment = output_channel_stride * 4 - output_width * sizeof(float);
1057
1058
  // Adjustment for padding processed below
1059
0
  const float* i0 = (const float*) ((uintptr_t) input + input_height_stride * (output_y_start * 2 - input_padding_top));
1060
0
  const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
1061
0
  const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
1062
0
  float* output0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
1063
1064
0
  if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
1065
0
    i0 = zero;
1066
0
  }
1067
1068
0
  const float voutput_max = params->scalar.max;
1069
0
  const float voutput_min = params->scalar.min;
1070
1071
0
  for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 1) {
1072
0
    const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
1073
0
    if XNN_UNPREDICTABLE(input_y2 >= input_height) {
1074
0
      i2 = zero;
1075
0
    }
1076
1077
0
    const float* w = weights;
1078
0
    size_t c = output_channels;
1079
0
    float* o0c0 = output0;
1080
0
    float* o0c1 = (float*) ((uintptr_t) o0c0 + output_channel_stride);
1081
0
    float* o0c2 = (float*) ((uintptr_t) o0c1 + output_channel_stride);
1082
0
    float* o0c3 = (float*) ((uintptr_t) o0c2 + output_channel_stride);
1083
0
    do {
1084
0
      if XNN_UNPREDICTABLE(c < 2) {
1085
0
        o0c1 = o0c0;
1086
0
      }
1087
0
      if XNN_UNPREDICTABLE(c <= 2) {
1088
0
        o0c2 = o0c1;
1089
0
      }
1090
0
      if XNN_UNPREDICTABLE(c < 4) {
1091
0
        o0c3 = o0c2;
1092
0
      }
1093
1094
      // Left edge padding
1095
0
      float vi00c0 = 0.0f;
1096
0
      float vi00c1 = 0.0f;
1097
0
      float vi00c2 = 0.0f;
1098
0
      float vi10c0 = 0.0f;
1099
0
      float vi10c1 = 0.0f;
1100
0
      float vi10c2 = 0.0f;
1101
0
      float vi20c0 = 0.0f;
1102
0
      float vi20c1 = 0.0f;
1103
0
      float vi20c2 = 0.0f;
1104
1105
0
      size_t iw = input_width;
1106
0
      for (; iw >= 2; iw -= 2) {
1107
0
        float voc0 = w[0];
1108
0
        float voc1 = w[1];
1109
0
        float voc2 = w[2];
1110
0
        float voc3 = w[3];
1111
1112
0
        const float vk00c0x0 = w[4];
1113
0
        const float vk00c0x1 = w[5];
1114
0
        const float vk00c0x2 = w[6];
1115
0
        const float vk00c0x3 = w[7];
1116
1117
0
        voc0 += vk00c0x0 * vi00c0;
1118
0
        voc1 += vk00c0x1 * vi00c0;
1119
0
        voc2 += vk00c0x2 * vi00c0;
1120
0
        voc3 += vk00c0x3 * vi00c0;
1121
1122
0
        const float vk10c0x0 = w[8];
1123
0
        const float vk10c0x1 = w[9];
1124
0
        const float vk10c0x2 = w[10];
1125
0
        const float vk10c0x3 = w[11];
1126
1127
0
        voc0 += vk10c0x0 * vi10c0;
1128
0
        voc1 += vk10c0x1 * vi10c0;
1129
0
        voc2 += vk10c0x2 * vi10c0;
1130
0
        voc3 += vk10c0x3 * vi10c0;
1131
1132
0
        const float vk20c0x0 = w[12];
1133
0
        const float vk20c0x1 = w[13];
1134
0
        const float vk20c0x2 = w[14];
1135
0
        const float vk20c0x3 = w[15];
1136
1137
0
        voc0 += vk20c0x0 * vi20c0;
1138
0
        voc1 += vk20c0x1 * vi20c0;
1139
0
        voc2 += vk20c0x2 * vi20c0;
1140
0
        voc3 += vk20c0x3 * vi20c0;
1141
1142
0
        const float vk00c1x0 = w[16];
1143
0
        const float vk00c1x1 = w[17];
1144
0
        const float vk00c1x2 = w[18];
1145
0
        const float vk00c1x3 = w[19];
1146
1147
0
        voc0 += vk00c1x0 * vi00c1;
1148
0
        voc1 += vk00c1x1 * vi00c1;
1149
0
        voc2 += vk00c1x2 * vi00c1;
1150
0
        voc3 += vk00c1x3 * vi00c1;
1151
1152
0
        const float vk10c1x0 = w[20];
1153
0
        const float vk10c1x1 = w[21];
1154
0
        const float vk10c1x2 = w[22];
1155
0
        const float vk10c1x3 = w[23];
1156
1157
0
        voc0 += vk10c1x0 * vi10c1;
1158
0
        voc1 += vk10c1x1 * vi10c1;
1159
0
        voc2 += vk10c1x2 * vi10c1;
1160
0
        voc3 += vk10c1x3 * vi10c1;
1161
1162
0
        const float vk20c1x0 = w[24];
1163
0
        const float vk20c1x1 = w[25];
1164
0
        const float vk20c1x2 = w[26];
1165
0
        const float vk20c1x3 = w[27];
1166
1167
0
        voc0 += vk20c1x0 * vi20c1;
1168
0
        voc1 += vk20c1x1 * vi20c1;
1169
0
        voc2 += vk20c1x2 * vi20c1;
1170
0
        voc3 += vk20c1x3 * vi20c1;
1171
1172
0
        const float vk00c2x0 = w[28];
1173
0
        const float vk00c2x1 = w[29];
1174
0
        const float vk00c2x2 = w[30];
1175
0
        const float vk00c2x3 = w[31];
1176
1177
0
        voc0 += vk00c2x0 * vi00c2;
1178
0
        voc1 += vk00c2x1 * vi00c2;
1179
0
        voc2 += vk00c2x2 * vi00c2;
1180
0
        voc3 += vk00c2x3 * vi00c2;
1181
1182
0
        const float vk10c2x0 = w[32];
1183
0
        const float vk10c2x1 = w[33];
1184
0
        const float vk10c2x2 = w[34];
1185
0
        const float vk10c2x3 = w[35];
1186
1187
0
        voc0 += vk10c2x0 * vi10c2;
1188
0
        voc1 += vk10c2x1 * vi10c2;
1189
0
        voc2 += vk10c2x2 * vi10c2;
1190
0
        voc3 += vk10c2x3 * vi10c2;
1191
1192
0
        const float vk20c2x0 = w[36];
1193
0
        const float vk20c2x1 = w[37];
1194
0
        const float vk20c2x2 = w[38];
1195
0
        const float vk20c2x3 = w[39];
1196
1197
0
        voc0 += vk20c2x0 * vi20c2;
1198
0
        voc1 += vk20c2x1 * vi20c2;
1199
0
        voc2 += vk20c2x2 * vi20c2;
1200
0
        voc3 += vk20c2x3 * vi20c2;
1201
1202
0
        const float vk01c0x0 = w[40];
1203
0
        const float vk01c0x1 = w[41];
1204
0
        const float vk01c0x2 = w[42];
1205
0
        const float vk01c0x3 = w[43];
1206
1207
0
        const float vi01c0 = i0[0];
1208
1209
0
        voc0 += vk01c0x0 * vi01c0;
1210
0
        voc1 += vk01c0x1 * vi01c0;
1211
0
        voc2 += vk01c0x2 * vi01c0;
1212
0
        voc3 += vk01c0x3 * vi01c0;
1213
1214
0
        const float vk11c0x0 = w[44];
1215
0
        const float vk11c0x1 = w[45];
1216
0
        const float vk11c0x2 = w[46];
1217
0
        const float vk11c0x3 = w[47];
1218
1219
0
        const float vi11c0 = i1[0];
1220
1221
0
        voc0 += vk11c0x0 * vi11c0;
1222
0
        voc1 += vk11c0x1 * vi11c0;
1223
0
        voc2 += vk11c0x2 * vi11c0;
1224
0
        voc3 += vk11c0x3 * vi11c0;
1225
1226
0
        const float vk21c0x0 = w[48];
1227
0
        const float vk21c0x1 = w[49];
1228
0
        const float vk21c0x2 = w[50];
1229
0
        const float vk21c0x3 = w[51];
1230
1231
0
        const float vi21c0 = i2[0];
1232
1233
0
        voc0 += vk21c0x0 * vi21c0;
1234
0
        voc1 += vk21c0x1 * vi21c0;
1235
0
        voc2 += vk21c0x2 * vi21c0;
1236
0
        voc3 += vk21c0x3 * vi21c0;
1237
1238
0
        const float vk01c1x0 = w[52];
1239
0
        const float vk01c1x1 = w[53];
1240
0
        const float vk01c1x2 = w[54];
1241
0
        const float vk01c1x3 = w[55];
1242
1243
0
        const float vi01c1 = i0[1];
1244
1245
0
        voc0 += vk01c1x0 * vi01c1;
1246
0
        voc1 += vk01c1x1 * vi01c1;
1247
0
        voc2 += vk01c1x2 * vi01c1;
1248
0
        voc3 += vk01c1x3 * vi01c1;
1249
1250
0
        const float vk11c1x0 = w[56];
1251
0
        const float vk11c1x1 = w[57];
1252
0
        const float vk11c1x2 = w[58];
1253
0
        const float vk11c1x3 = w[59];
1254
1255
0
        const float vi11c1 = i1[1];
1256
1257
0
        voc0 += vk11c1x0 * vi11c1;
1258
0
        voc1 += vk11c1x1 * vi11c1;
1259
0
        voc2 += vk11c1x2 * vi11c1;
1260
0
        voc3 += vk11c1x3 * vi11c1;
1261
1262
0
        const float vk21c1x0 = w[60];
1263
0
        const float vk21c1x1 = w[61];
1264
0
        const float vk21c1x2 = w[62];
1265
0
        const float vk21c1x3 = w[63];
1266
1267
0
        const float vi21c1 = i2[1];
1268
1269
0
        voc0 += vk21c1x0 * vi21c1;
1270
0
        voc1 += vk21c1x1 * vi21c1;
1271
0
        voc2 += vk21c1x2 * vi21c1;
1272
0
        voc3 += vk21c1x3 * vi21c1;
1273
1274
0
        const float vk01c2x0 = w[64];
1275
0
        const float vk01c2x1 = w[65];
1276
0
        const float vk01c2x2 = w[66];
1277
0
        const float vk01c2x3 = w[67];
1278
1279
0
        const float vi01c2 = i0[2];
1280
1281
0
        voc0 += vk01c2x0 * vi01c2;
1282
0
        voc1 += vk01c2x1 * vi01c2;
1283
0
        voc2 += vk01c2x2 * vi01c2;
1284
0
        voc3 += vk01c2x3 * vi01c2;
1285
1286
0
        const float vk11c2x0 = w[68];
1287
0
        const float vk11c2x1 = w[69];
1288
0
        const float vk11c2x2 = w[70];
1289
0
        const float vk11c2x3 = w[71];
1290
1291
0
        const float vi11c2 = i1[2];
1292
1293
0
        voc0 += vk11c2x0 * vi11c2;
1294
0
        voc1 += vk11c2x1 * vi11c2;
1295
0
        voc2 += vk11c2x2 * vi11c2;
1296
0
        voc3 += vk11c2x3 * vi11c2;
1297
1298
0
        const float vk21c2x0 = w[72];
1299
0
        const float vk21c2x1 = w[73];
1300
0
        const float vk21c2x2 = w[74];
1301
0
        const float vk21c2x3 = w[75];
1302
1303
0
        const float vi21c2 = i2[2];
1304
1305
0
        voc0 += vk21c2x0 * vi21c2;
1306
0
        voc1 += vk21c2x1 * vi21c2;
1307
0
        voc2 += vk21c2x2 * vi21c2;
1308
0
        voc3 += vk21c2x3 * vi21c2;
1309
1310
0
        const float vk02c0x0 = w[76];
1311
0
        const float vk02c0x1 = w[77];
1312
0
        const float vk02c0x2 = w[78];
1313
0
        const float vk02c0x3 = w[79];
1314
1315
0
        const float vi02c0 = i0[3];
1316
1317
0
        voc0 += vk02c0x0 * vi02c0;
1318
0
        voc1 += vk02c0x1 * vi02c0;
1319
0
        voc2 += vk02c0x2 * vi02c0;
1320
0
        voc3 += vk02c0x3 * vi02c0;
1321
1322
0
        const float vk12c0x0 = w[80];
1323
0
        const float vk12c0x1 = w[81];
1324
0
        const float vk12c0x2 = w[82];
1325
0
        const float vk12c0x3 = w[83];
1326
1327
0
        const float vi12c0 = i1[3];
1328
1329
0
        voc0 += vk12c0x0 * vi12c0;
1330
0
        voc1 += vk12c0x1 * vi12c0;
1331
0
        voc2 += vk12c0x2 * vi12c0;
1332
0
        voc3 += vk12c0x3 * vi12c0;
1333
1334
0
        const float vk22c0x0 = w[84];
1335
0
        const float vk22c0x1 = w[85];
1336
0
        const float vk22c0x2 = w[86];
1337
0
        const float vk22c0x3 = w[87];
1338
1339
0
        const float vi22c0 = i2[3];
1340
1341
0
        voc0 += vk22c0x0 * vi22c0;
1342
0
        voc1 += vk22c0x1 * vi22c0;
1343
0
        voc2 += vk22c0x2 * vi22c0;
1344
0
        voc3 += vk22c0x3 * vi22c0;
1345
1346
0
        vi00c0 = vi02c0;
1347
0
        vi10c0 = vi12c0;
1348
0
        vi20c0 = vi22c0;
1349
1350
0
        const float vk02c1x0 = w[88];
1351
0
        const float vk02c1x1 = w[89];
1352
0
        const float vk02c1x2 = w[90];
1353
0
        const float vk02c1x3 = w[91];
1354
1355
0
        const float vi02c1 = i0[4];
1356
1357
0
        voc0 += vk02c1x0 * vi02c1;
1358
0
        voc1 += vk02c1x1 * vi02c1;
1359
0
        voc2 += vk02c1x2 * vi02c1;
1360
0
        voc3 += vk02c1x3 * vi02c1;
1361
1362
0
        const float vk12c1x0 = w[92];
1363
0
        const float vk12c1x1 = w[93];
1364
0
        const float vk12c1x2 = w[94];
1365
0
        const float vk12c1x3 = w[95];
1366
1367
0
        const float vi12c1 = i1[4];
1368
1369
0
        voc0 += vk12c1x0 * vi12c1;
1370
0
        voc1 += vk12c1x1 * vi12c1;
1371
0
        voc2 += vk12c1x2 * vi12c1;
1372
0
        voc3 += vk12c1x3 * vi12c1;
1373
1374
0
        const float vk22c1x0 = w[96];
1375
0
        const float vk22c1x1 = w[97];
1376
0
        const float vk22c1x2 = w[98];
1377
0
        const float vk22c1x3 = w[99];
1378
1379
0
        const float vi22c1 = i2[4];
1380
1381
0
        voc0 += vk22c1x0 * vi22c1;
1382
0
        voc1 += vk22c1x1 * vi22c1;
1383
0
        voc2 += vk22c1x2 * vi22c1;
1384
0
        voc3 += vk22c1x3 * vi22c1;
1385
1386
0
        vi00c1 = vi02c1;
1387
0
        vi10c1 = vi12c1;
1388
0
        vi20c1 = vi22c1;
1389
1390
0
        const float vk02c2x0 = w[100];
1391
0
        const float vk02c2x1 = w[101];
1392
0
        const float vk02c2x2 = w[102];
1393
0
        const float vk02c2x3 = w[103];
1394
1395
0
        const float vi02c2 = i0[5];
1396
1397
0
        voc0 += vk02c2x0 * vi02c2;
1398
0
        voc1 += vk02c2x1 * vi02c2;
1399
0
        voc2 += vk02c2x2 * vi02c2;
1400
0
        voc3 += vk02c2x3 * vi02c2;
1401
1402
0
        const float vk12c2x0 = w[104];
1403
0
        const float vk12c2x1 = w[105];
1404
0
        const float vk12c2x2 = w[106];
1405
0
        const float vk12c2x3 = w[107];
1406
1407
0
        const float vi12c2 = i1[5];
1408
1409
0
        voc0 += vk12c2x0 * vi12c2;
1410
0
        voc1 += vk12c2x1 * vi12c2;
1411
0
        voc2 += vk12c2x2 * vi12c2;
1412
0
        voc3 += vk12c2x3 * vi12c2;
1413
1414
0
        const float vk22c2x0 = w[108];
1415
0
        const float vk22c2x1 = w[109];
1416
0
        const float vk22c2x2 = w[110];
1417
0
        const float vk22c2x3 = w[111];
1418
1419
0
        const float vi22c2 = i2[5];
1420
1421
0
        voc0 += vk22c2x0 * vi22c2;
1422
0
        voc1 += vk22c2x1 * vi22c2;
1423
0
        voc2 += vk22c2x2 * vi22c2;
1424
0
        voc3 += vk22c2x3 * vi22c2;
1425
1426
0
        vi00c2 = vi02c2;
1427
0
        vi10c2 = vi12c2;
1428
0
        vi20c2 = vi22c2;
1429
1430
0
        voc0 = math_min_f32(voc0, voutput_max);
1431
0
        voc1 = math_min_f32(voc1, voutput_max);
1432
0
        voc2 = math_min_f32(voc2, voutput_max);
1433
0
        voc3 = math_min_f32(voc3, voutput_max);
1434
1435
0
        voc0 = math_max_f32(voc0, voutput_min);
1436
0
        voc1 = math_max_f32(voc1, voutput_min);
1437
0
        voc2 = math_max_f32(voc2, voutput_min);
1438
0
        voc3 = math_max_f32(voc3, voutput_min);
1439
1440
0
        *o0c0++ = voc0;
1441
0
        *o0c1++ = voc1;
1442
0
        *o0c2++ = voc2;
1443
0
        *o0c3++ = voc3;
1444
1445
0
        i0 += 6;
1446
0
        i1 += 6;
1447
0
        i2 += 6;
1448
0
      }
1449
0
      assert(iw < 2);
1450
0
      if XNN_UNLIKELY(iw != 0) {
1451
0
        float voc0 = w[0];
1452
0
        float voc1 = w[1];
1453
0
        float voc2 = w[2];
1454
0
        float voc3 = w[3];
1455
1456
0
        const float vk00c0x0 = w[4];
1457
0
        const float vk00c0x1 = w[5];
1458
0
        const float vk00c0x2 = w[6];
1459
0
        const float vk00c0x3 = w[7];
1460
1461
0
        voc0 += vk00c0x0 * vi00c0;
1462
0
        voc1 += vk00c0x1 * vi00c0;
1463
0
        voc2 += vk00c0x2 * vi00c0;
1464
0
        voc3 += vk00c0x3 * vi00c0;
1465
1466
0
        const float vk10c0x0 = w[8];
1467
0
        const float vk10c0x1 = w[9];
1468
0
        const float vk10c0x2 = w[10];
1469
0
        const float vk10c0x3 = w[11];
1470
1471
0
        voc0 += vk10c0x0 * vi10c0;
1472
0
        voc1 += vk10c0x1 * vi10c0;
1473
0
        voc2 += vk10c0x2 * vi10c0;
1474
0
        voc3 += vk10c0x3 * vi10c0;
1475
1476
0
        const float vk20c0x0 = w[12];
1477
0
        const float vk20c0x1 = w[13];
1478
0
        const float vk20c0x2 = w[14];
1479
0
        const float vk20c0x3 = w[15];
1480
1481
0
        voc0 += vk20c0x0 * vi20c0;
1482
0
        voc1 += vk20c0x1 * vi20c0;
1483
0
        voc2 += vk20c0x2 * vi20c0;
1484
0
        voc3 += vk20c0x3 * vi20c0;
1485
1486
0
        const float vk00c1x0 = w[16];
1487
0
        const float vk00c1x1 = w[17];
1488
0
        const float vk00c1x2 = w[18];
1489
0
        const float vk00c1x3 = w[19];
1490
1491
0
        voc0 += vk00c1x0 * vi00c1;
1492
0
        voc1 += vk00c1x1 * vi00c1;
1493
0
        voc2 += vk00c1x2 * vi00c1;
1494
0
        voc3 += vk00c1x3 * vi00c1;
1495
1496
0
        const float vk10c1x0 = w[20];
1497
0
        const float vk10c1x1 = w[21];
1498
0
        const float vk10c1x2 = w[22];
1499
0
        const float vk10c1x3 = w[23];
1500
1501
0
        voc0 += vk10c1x0 * vi10c1;
1502
0
        voc1 += vk10c1x1 * vi10c1;
1503
0
        voc2 += vk10c1x2 * vi10c1;
1504
0
        voc3 += vk10c1x3 * vi10c1;
1505
1506
0
        const float vk20c1x0 = w[24];
1507
0
        const float vk20c1x1 = w[25];
1508
0
        const float vk20c1x2 = w[26];
1509
0
        const float vk20c1x3 = w[27];
1510
1511
0
        voc0 += vk20c1x0 * vi20c1;
1512
0
        voc1 += vk20c1x1 * vi20c1;
1513
0
        voc2 += vk20c1x2 * vi20c1;
1514
0
        voc3 += vk20c1x3 * vi20c1;
1515
1516
0
        const float vk00c2x0 = w[28];
1517
0
        const float vk00c2x1 = w[29];
1518
0
        const float vk00c2x2 = w[30];
1519
0
        const float vk00c2x3 = w[31];
1520
1521
0
        voc0 += vk00c2x0 * vi00c2;
1522
0
        voc1 += vk00c2x1 * vi00c2;
1523
0
        voc2 += vk00c2x2 * vi00c2;
1524
0
        voc3 += vk00c2x3 * vi00c2;
1525
1526
0
        const float vk10c2x0 = w[32];
1527
0
        const float vk10c2x1 = w[33];
1528
0
        const float vk10c2x2 = w[34];
1529
0
        const float vk10c2x3 = w[35];
1530
1531
0
        voc0 += vk10c2x0 * vi10c2;
1532
0
        voc1 += vk10c2x1 * vi10c2;
1533
0
        voc2 += vk10c2x2 * vi10c2;
1534
0
        voc3 += vk10c2x3 * vi10c2;
1535
1536
0
        const float vk20c2x0 = w[36];
1537
0
        const float vk20c2x1 = w[37];
1538
0
        const float vk20c2x2 = w[38];
1539
0
        const float vk20c2x3 = w[39];
1540
1541
0
        voc0 += vk20c2x0 * vi20c2;
1542
0
        voc1 += vk20c2x1 * vi20c2;
1543
0
        voc2 += vk20c2x2 * vi20c2;
1544
0
        voc3 += vk20c2x3 * vi20c2;
1545
1546
0
        const float vk01c0x0 = w[40];
1547
0
        const float vk01c0x1 = w[41];
1548
0
        const float vk01c0x2 = w[42];
1549
0
        const float vk01c0x3 = w[43];
1550
1551
0
        const float vi01c0 = i0[0];
1552
1553
0
        voc0 += vk01c0x0 * vi01c0;
1554
0
        voc1 += vk01c0x1 * vi01c0;
1555
0
        voc2 += vk01c0x2 * vi01c0;
1556
0
        voc3 += vk01c0x3 * vi01c0;
1557
1558
0
        const float vk11c0x0 = w[44];
1559
0
        const float vk11c0x1 = w[45];
1560
0
        const float vk11c0x2 = w[46];
1561
0
        const float vk11c0x3 = w[47];
1562
1563
0
        const float vi11c0 = i1[0];
1564
1565
0
        voc0 += vk11c0x0 * vi11c0;
1566
0
        voc1 += vk11c0x1 * vi11c0;
1567
0
        voc2 += vk11c0x2 * vi11c0;
1568
0
        voc3 += vk11c0x3 * vi11c0;
1569
1570
0
        const float vk21c0x0 = w[48];
1571
0
        const float vk21c0x1 = w[49];
1572
0
        const float vk21c0x2 = w[50];
1573
0
        const float vk21c0x3 = w[51];
1574
1575
0
        const float vi21c0 = i2[0];
1576
1577
0
        voc0 += vk21c0x0 * vi21c0;
1578
0
        voc1 += vk21c0x1 * vi21c0;
1579
0
        voc2 += vk21c0x2 * vi21c0;
1580
0
        voc3 += vk21c0x3 * vi21c0;
1581
1582
0
        const float vk01c1x0 = w[52];
1583
0
        const float vk01c1x1 = w[53];
1584
0
        const float vk01c1x2 = w[54];
1585
0
        const float vk01c1x3 = w[55];
1586
1587
0
        const float vi01c1 = i0[1];
1588
1589
0
        voc0 += vk01c1x0 * vi01c1;
1590
0
        voc1 += vk01c1x1 * vi01c1;
1591
0
        voc2 += vk01c1x2 * vi01c1;
1592
0
        voc3 += vk01c1x3 * vi01c1;
1593
1594
0
        const float vk11c1x0 = w[56];
1595
0
        const float vk11c1x1 = w[57];
1596
0
        const float vk11c1x2 = w[58];
1597
0
        const float vk11c1x3 = w[59];
1598
1599
0
        const float vi11c1 = i1[1];
1600
1601
0
        voc0 += vk11c1x0 * vi11c1;
1602
0
        voc1 += vk11c1x1 * vi11c1;
1603
0
        voc2 += vk11c1x2 * vi11c1;
1604
0
        voc3 += vk11c1x3 * vi11c1;
1605
1606
0
        const float vk21c1x0 = w[60];
1607
0
        const float vk21c1x1 = w[61];
1608
0
        const float vk21c1x2 = w[62];
1609
0
        const float vk21c1x3 = w[63];
1610
1611
0
        const float vi21c1 = i2[1];
1612
1613
0
        voc0 += vk21c1x0 * vi21c1;
1614
0
        voc1 += vk21c1x1 * vi21c1;
1615
0
        voc2 += vk21c1x2 * vi21c1;
1616
0
        voc3 += vk21c1x3 * vi21c1;
1617
1618
0
        const float vk01c2x0 = w[64];
1619
0
        const float vk01c2x1 = w[65];
1620
0
        const float vk01c2x2 = w[66];
1621
0
        const float vk01c2x3 = w[67];
1622
1623
0
        const float vi01c2 = i0[2];
1624
1625
0
        voc0 += vk01c2x0 * vi01c2;
1626
0
        voc1 += vk01c2x1 * vi01c2;
1627
0
        voc2 += vk01c2x2 * vi01c2;
1628
0
        voc3 += vk01c2x3 * vi01c2;
1629
1630
0
        const float vk11c2x0 = w[68];
1631
0
        const float vk11c2x1 = w[69];
1632
0
        const float vk11c2x2 = w[70];
1633
0
        const float vk11c2x3 = w[71];
1634
1635
0
        const float vi11c2 = i1[2];
1636
1637
0
        voc0 += vk11c2x0 * vi11c2;
1638
0
        voc1 += vk11c2x1 * vi11c2;
1639
0
        voc2 += vk11c2x2 * vi11c2;
1640
0
        voc3 += vk11c2x3 * vi11c2;
1641
1642
0
        const float vk21c2x0 = w[72];
1643
0
        const float vk21c2x1 = w[73];
1644
0
        const float vk21c2x2 = w[74];
1645
0
        const float vk21c2x3 = w[75];
1646
1647
0
        const float vi21c2 = i2[2];
1648
1649
0
        voc0 += vk21c2x0 * vi21c2;
1650
0
        voc1 += vk21c2x1 * vi21c2;
1651
0
        voc2 += vk21c2x2 * vi21c2;
1652
0
        voc3 += vk21c2x3 * vi21c2;
1653
1654
0
        voc0 = math_min_f32(voc0, voutput_max);
1655
0
        voc1 = math_min_f32(voc1, voutput_max);
1656
0
        voc2 = math_min_f32(voc2, voutput_max);
1657
0
        voc3 = math_min_f32(voc3, voutput_max);
1658
1659
0
        voc0 = math_max_f32(voc0, voutput_min);
1660
0
        voc1 = math_max_f32(voc1, voutput_min);
1661
0
        voc2 = math_max_f32(voc2, voutput_min);
1662
0
        voc3 = math_max_f32(voc3, voutput_min);
1663
1664
0
        *o0c0++ = voc0;
1665
0
        *o0c1++ = voc1;
1666
0
        *o0c2++ = voc2;
1667
0
        *o0c3++ = voc3;
1668
0
      }
1669
      // Move output pointers back to the position of the first pixel in a row,
1670
      // and forward to the next block of output channels.
1671
0
      o0c0 = (float*) ((uintptr_t) o0c0 + output_channel_increment);
1672
0
      o0c1 = (float*) ((uintptr_t) o0c1 + output_channel_increment);
1673
0
      o0c2 = (float*) ((uintptr_t) o0c2 + output_channel_increment);
1674
0
      o0c3 = (float*) ((uintptr_t) o0c3 + output_channel_increment);
1675
      // Revert input pointers to the position of the first pixel in a row
1676
0
      i0 = (const float*) ((uintptr_t) i0 - input_width_decrement);
1677
0
      i1 = (const float*) ((uintptr_t) i1 - input_width_decrement);
1678
0
      i2 = (const float*) ((uintptr_t) i2 - input_width_decrement);
1679
      // Move to the block of weights for the next 4 output channels
1680
0
      w += 112;
1681
0
      c = doz(c, 4);
1682
0
    } while (c != 0);
1683
    // Move output pointers forward to the next row
1684
0
    output0 = (float*) ((uintptr_t) output0 + output_height_stride);
1685
    // Move input pointers forward to the next row
1686
0
    i0 = i2;
1687
0
    i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
1688
0
    i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
1689
0
  }
1690
0
}
1691
1692
void xnn_f32_dwconv_minmax_ukernel_25p1c__scalar_acc2(
1693
    size_t channels,
1694
    size_t output_width,
1695
    const float** input,
1696
    const float* weights,
1697
    float* output,
1698
    intptr_t input_stride,
1699
    size_t output_increment,
1700
    size_t input_offset,
1701
    const float* zero,
1702
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
1703
0
{
1704
0
  assert(channels != 0);
1705
0
  assert(output_width != 0);
1706
1707
0
  const float vmin = params->scalar.min;
1708
0
  const float vmax = params->scalar.max;
1709
0
  do {
1710
0
    const float* i0 = input[0];
1711
0
    assert(i0 != NULL);
1712
0
    if XNN_UNPREDICTABLE(i0 != zero) {
1713
0
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
1714
0
    }
1715
0
    const float* i1 = input[1];
1716
0
    assert(i1 != NULL);
1717
0
    if XNN_UNPREDICTABLE(i1 != zero) {
1718
0
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
1719
0
    }
1720
0
    const float* i2 = input[2];
1721
0
    assert(i2 != NULL);
1722
0
    if XNN_UNPREDICTABLE(i2 != zero) {
1723
0
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
1724
0
    }
1725
0
    const float* i3 = input[3];
1726
0
    assert(i3 != NULL);
1727
0
    if XNN_UNPREDICTABLE(i3 != zero) {
1728
0
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
1729
0
    }
1730
0
    const float* i4 = input[4];
1731
0
    assert(i4 != NULL);
1732
0
    if XNN_UNPREDICTABLE(i4 != zero) {
1733
0
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
1734
0
    }
1735
0
    const float* i5 = input[5];
1736
0
    assert(i5 != NULL);
1737
0
    if XNN_UNPREDICTABLE(i5 != zero) {
1738
0
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
1739
0
    }
1740
0
    const float* i6 = input[6];
1741
0
    assert(i6 != NULL);
1742
0
    if XNN_UNPREDICTABLE(i6 != zero) {
1743
0
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
1744
0
    }
1745
0
    const float* i7 = input[7];
1746
0
    assert(i7 != NULL);
1747
0
    if XNN_UNPREDICTABLE(i7 != zero) {
1748
0
      i7 = (const float*) ((uintptr_t) i7 + input_offset);
1749
0
    }
1750
0
    const float* i8 = input[8];
1751
0
    assert(i8 != NULL);
1752
0
    if XNN_UNPREDICTABLE(i8 != zero) {
1753
0
      i8 = (const float*) ((uintptr_t) i8 + input_offset);
1754
0
    }
1755
0
    const float* i9 = input[9];
1756
0
    assert(i9 != NULL);
1757
0
    if XNN_UNPREDICTABLE(i9 != zero) {
1758
0
      i9 = (const float*) ((uintptr_t) i9 + input_offset);
1759
0
    }
1760
0
    const float* i10 = input[10];
1761
0
    assert(i10 != NULL);
1762
0
    if XNN_UNPREDICTABLE(i10 != zero) {
1763
0
      i10 = (const float*) ((uintptr_t) i10 + input_offset);
1764
0
    }
1765
0
    const float* i11 = input[11];
1766
0
    assert(i11 != NULL);
1767
0
    if XNN_UNPREDICTABLE(i11 != zero) {
1768
0
      i11 = (const float*) ((uintptr_t) i11 + input_offset);
1769
0
    }
1770
0
    const float* i12 = input[12];
1771
0
    assert(i12 != NULL);
1772
0
    if XNN_UNPREDICTABLE(i12 != zero) {
1773
0
      i12 = (const float*) ((uintptr_t) i12 + input_offset);
1774
0
    }
1775
0
    const float* i13 = input[13];
1776
0
    assert(i13 != NULL);
1777
0
    if XNN_UNPREDICTABLE(i13 != zero) {
1778
0
      i13 = (const float*) ((uintptr_t) i13 + input_offset);
1779
0
    }
1780
0
    const float* i14 = input[14];
1781
0
    assert(i14 != NULL);
1782
0
    if XNN_UNPREDICTABLE(i14 != zero) {
1783
0
      i14 = (const float*) ((uintptr_t) i14 + input_offset);
1784
0
    }
1785
0
    const float* i15 = input[15];
1786
0
    assert(i15 != NULL);
1787
0
    if XNN_UNPREDICTABLE(i15 != zero) {
1788
0
      i15 = (const float*) ((uintptr_t) i15 + input_offset);
1789
0
    }
1790
0
    const float* i16 = input[16];
1791
0
    assert(i16 != NULL);
1792
0
    if XNN_UNPREDICTABLE(i16 != zero) {
1793
0
      i16 = (const float*) ((uintptr_t) i16 + input_offset);
1794
0
    }
1795
0
    const float* i17 = input[17];
1796
0
    assert(i17 != NULL);
1797
0
    if XNN_UNPREDICTABLE(i17 != zero) {
1798
0
      i17 = (const float*) ((uintptr_t) i17 + input_offset);
1799
0
    }
1800
0
    const float* i18 = input[18];
1801
0
    assert(i18 != NULL);
1802
0
    if XNN_UNPREDICTABLE(i18 != zero) {
1803
0
      i18 = (const float*) ((uintptr_t) i18 + input_offset);
1804
0
    }
1805
0
    const float* i19 = input[19];
1806
0
    assert(i19 != NULL);
1807
0
    if XNN_UNPREDICTABLE(i19 != zero) {
1808
0
      i19 = (const float*) ((uintptr_t) i19 + input_offset);
1809
0
    }
1810
0
    const float* i20 = input[20];
1811
0
    assert(i20 != NULL);
1812
0
    if XNN_UNPREDICTABLE(i20 != zero) {
1813
0
      i20 = (const float*) ((uintptr_t) i20 + input_offset);
1814
0
    }
1815
0
    const float* i21 = input[21];
1816
0
    assert(i21 != NULL);
1817
0
    if XNN_UNPREDICTABLE(i21 != zero) {
1818
0
      i21 = (const float*) ((uintptr_t) i21 + input_offset);
1819
0
    }
1820
0
    const float* i22 = input[22];
1821
0
    assert(i22 != NULL);
1822
0
    if XNN_UNPREDICTABLE(i22 != zero) {
1823
0
      i22 = (const float*) ((uintptr_t) i22 + input_offset);
1824
0
    }
1825
0
    const float* i23 = input[23];
1826
0
    assert(i23 != NULL);
1827
0
    if XNN_UNPREDICTABLE(i23 != zero) {
1828
0
      i23 = (const float*) ((uintptr_t) i23 + input_offset);
1829
0
    }
1830
0
    const float* i24 = input[24];
1831
0
    assert(i24 != NULL);
1832
0
    if XNN_UNPREDICTABLE(i24 != zero) {
1833
0
      i24 = (const float*) ((uintptr_t) i24 + input_offset);
1834
0
    }
1835
0
    input = (const float**) ((uintptr_t) input + input_stride);
1836
1837
0
    size_t c = channels;
1838
0
    const float* w = weights;
1839
0
    do {
1840
0
      float vacc0p0 = w[0];
1841
1842
0
      const float vi0 = *i0++;
1843
0
      const float vk0 = w[1];
1844
0
      vacc0p0 = math_muladd_f32(vi0, vk0, vacc0p0);
1845
1846
0
      const float vi1 = *i1++;
1847
0
      const float vk1 = w[2];
1848
0
      float vacc0p1 = vi1 * vk1;
1849
1850
0
      const float vi2 = *i2++;
1851
0
      const float vk2 = w[3];
1852
0
      vacc0p0 = math_muladd_f32(vi2, vk2, vacc0p0);
1853
1854
0
      const float vi3 = *i3++;
1855
0
      const float vk3 = w[4];
1856
0
      vacc0p1 = math_muladd_f32(vi3, vk3, vacc0p1);
1857
1858
0
      const float vi4 = *i4++;
1859
0
      const float vk4 = w[5];
1860
0
      vacc0p0 = math_muladd_f32(vi4, vk4, vacc0p0);
1861
1862
0
      const float vi5 = *i5++;
1863
0
      const float vk5 = w[6];
1864
0
      vacc0p1 = math_muladd_f32(vi5, vk5, vacc0p1);
1865
1866
0
      const float vi6 = *i6++;
1867
0
      const float vk6 = w[7];
1868
0
      vacc0p0 = math_muladd_f32(vi6, vk6, vacc0p0);
1869
1870
0
      const float vi7 = *i7++;
1871
0
      const float vk7 = w[8];
1872
0
      vacc0p1 = math_muladd_f32(vi7, vk7, vacc0p1);
1873
1874
0
      const float vi8 = *i8++;
1875
0
      const float vk8 = w[9];
1876
0
      vacc0p0 = math_muladd_f32(vi8, vk8, vacc0p0);
1877
1878
0
      const float vi9 = *i9++;
1879
0
      const float vk9 = w[10];
1880
0
      vacc0p1 = math_muladd_f32(vi9, vk9, vacc0p1);
1881
1882
0
      const float vi10 = *i10++;
1883
0
      const float vk10 = w[11];
1884
0
      vacc0p0 = math_muladd_f32(vi10, vk10, vacc0p0);
1885
1886
0
      const float vi11 = *i11++;
1887
0
      const float vk11 = w[12];
1888
0
      vacc0p1 = math_muladd_f32(vi11, vk11, vacc0p1);
1889
1890
0
      const float vi12 = *i12++;
1891
0
      const float vk12 = w[13];
1892
0
      vacc0p0 = math_muladd_f32(vi12, vk12, vacc0p0);
1893
1894
0
      const float vi13 = *i13++;
1895
0
      const float vk13 = w[14];
1896
0
      vacc0p1 = math_muladd_f32(vi13, vk13, vacc0p1);
1897
1898
0
      const float vi14 = *i14++;
1899
0
      const float vk14 = w[15];
1900
0
      vacc0p0 = math_muladd_f32(vi14, vk14, vacc0p0);
1901
1902
0
      const float vi15 = *i15++;
1903
0
      const float vk15 = w[16];
1904
0
      vacc0p1 = math_muladd_f32(vi15, vk15, vacc0p1);
1905
1906
0
      const float vi16 = *i16++;
1907
0
      const float vk16 = w[17];
1908
0
      vacc0p0 = math_muladd_f32(vi16, vk16, vacc0p0);
1909
1910
0
      const float vi17 = *i17++;
1911
0
      const float vk17 = w[18];
1912
0
      vacc0p1 = math_muladd_f32(vi17, vk17, vacc0p1);
1913
1914
0
      const float vi18 = *i18++;
1915
0
      const float vk18 = w[19];
1916
0
      vacc0p0 = math_muladd_f32(vi18, vk18, vacc0p0);
1917
1918
0
      const float vi19 = *i19++;
1919
0
      const float vk19 = w[20];
1920
0
      vacc0p1 = math_muladd_f32(vi19, vk19, vacc0p1);
1921
1922
0
      const float vi20 = *i20++;
1923
0
      const float vk20 = w[21];
1924
0
      vacc0p0 = math_muladd_f32(vi20, vk20, vacc0p0);
1925
1926
0
      const float vi21 = *i21++;
1927
0
      const float vk21 = w[22];
1928
0
      vacc0p1 = math_muladd_f32(vi21, vk21, vacc0p1);
1929
1930
0
      const float vi22 = *i22++;
1931
0
      const float vk22 = w[23];
1932
0
      vacc0p0 = math_muladd_f32(vi22, vk22, vacc0p0);
1933
1934
0
      const float vi23 = *i23++;
1935
0
      const float vk23 = w[24];
1936
0
      vacc0p1 = math_muladd_f32(vi23, vk23, vacc0p1);
1937
1938
0
      const float vi24 = *i24++;
1939
0
      const float vk24 = w[25];
1940
0
      vacc0p0 = math_muladd_f32(vi24, vk24, vacc0p0);
1941
1942
0
      w += 26;
1943
1944
0
      vacc0p0 += vacc0p1;
1945
1946
0
      float vacc0 = math_max_f32(vacc0p0, vmin);
1947
0
      vacc0 = math_min_f32(vacc0, vmax);
1948
0
      *output++ = vacc0;
1949
0
    } while (--c != 0);
1950
1951
0
    output = (float*) ((uintptr_t) output + output_increment);
1952
0
  } while (--output_width != 0);
1953
0
}
1954
1955
void xnn_f32_dwconv_ukernel_25p1c__scalar_acc2(
1956
    size_t channels,
1957
    size_t output_width,
1958
    const float** input,
1959
    const float* weights,
1960
    float* output,
1961
    intptr_t input_stride,
1962
    size_t output_increment,
1963
    size_t input_offset,
1964
    const float* zero,
1965
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
1966
0
{
1967
0
  assert(channels != 0);
1968
0
  assert(output_width != 0);
1969
1970
0
  do {
1971
0
    const float* i0 = input[0];
1972
0
    assert(i0 != NULL);
1973
0
    if XNN_UNPREDICTABLE(i0 != zero) {
1974
0
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
1975
0
    }
1976
0
    const float* i1 = input[1];
1977
0
    assert(i1 != NULL);
1978
0
    if XNN_UNPREDICTABLE(i1 != zero) {
1979
0
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
1980
0
    }
1981
0
    const float* i2 = input[2];
1982
0
    assert(i2 != NULL);
1983
0
    if XNN_UNPREDICTABLE(i2 != zero) {
1984
0
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
1985
0
    }
1986
0
    const float* i3 = input[3];
1987
0
    assert(i3 != NULL);
1988
0
    if XNN_UNPREDICTABLE(i3 != zero) {
1989
0
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
1990
0
    }
1991
0
    const float* i4 = input[4];
1992
0
    assert(i4 != NULL);
1993
0
    if XNN_UNPREDICTABLE(i4 != zero) {
1994
0
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
1995
0
    }
1996
0
    const float* i5 = input[5];
1997
0
    assert(i5 != NULL);
1998
0
    if XNN_UNPREDICTABLE(i5 != zero) {
1999
0
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
2000
0
    }
2001
0
    const float* i6 = input[6];
2002
0
    assert(i6 != NULL);
2003
0
    if XNN_UNPREDICTABLE(i6 != zero) {
2004
0
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
2005
0
    }
2006
0
    const float* i7 = input[7];
2007
0
    assert(i7 != NULL);
2008
0
    if XNN_UNPREDICTABLE(i7 != zero) {
2009
0
      i7 = (const float*) ((uintptr_t) i7 + input_offset);
2010
0
    }
2011
0
    const float* i8 = input[8];
2012
0
    assert(i8 != NULL);
2013
0
    if XNN_UNPREDICTABLE(i8 != zero) {
2014
0
      i8 = (const float*) ((uintptr_t) i8 + input_offset);
2015
0
    }
2016
0
    const float* i9 = input[9];
2017
0
    assert(i9 != NULL);
2018
0
    if XNN_UNPREDICTABLE(i9 != zero) {
2019
0
      i9 = (const float*) ((uintptr_t) i9 + input_offset);
2020
0
    }
2021
0
    const float* i10 = input[10];
2022
0
    assert(i10 != NULL);
2023
0
    if XNN_UNPREDICTABLE(i10 != zero) {
2024
0
      i10 = (const float*) ((uintptr_t) i10 + input_offset);
2025
0
    }
2026
0
    const float* i11 = input[11];
2027
0
    assert(i11 != NULL);
2028
0
    if XNN_UNPREDICTABLE(i11 != zero) {
2029
0
      i11 = (const float*) ((uintptr_t) i11 + input_offset);
2030
0
    }
2031
0
    const float* i12 = input[12];
2032
0
    assert(i12 != NULL);
2033
0
    if XNN_UNPREDICTABLE(i12 != zero) {
2034
0
      i12 = (const float*) ((uintptr_t) i12 + input_offset);
2035
0
    }
2036
0
    const float* i13 = input[13];
2037
0
    assert(i13 != NULL);
2038
0
    if XNN_UNPREDICTABLE(i13 != zero) {
2039
0
      i13 = (const float*) ((uintptr_t) i13 + input_offset);
2040
0
    }
2041
0
    const float* i14 = input[14];
2042
0
    assert(i14 != NULL);
2043
0
    if XNN_UNPREDICTABLE(i14 != zero) {
2044
0
      i14 = (const float*) ((uintptr_t) i14 + input_offset);
2045
0
    }
2046
0
    const float* i15 = input[15];
2047
0
    assert(i15 != NULL);
2048
0
    if XNN_UNPREDICTABLE(i15 != zero) {
2049
0
      i15 = (const float*) ((uintptr_t) i15 + input_offset);
2050
0
    }
2051
0
    const float* i16 = input[16];
2052
0
    assert(i16 != NULL);
2053
0
    if XNN_UNPREDICTABLE(i16 != zero) {
2054
0
      i16 = (const float*) ((uintptr_t) i16 + input_offset);
2055
0
    }
2056
0
    const float* i17 = input[17];
2057
0
    assert(i17 != NULL);
2058
0
    if XNN_UNPREDICTABLE(i17 != zero) {
2059
0
      i17 = (const float*) ((uintptr_t) i17 + input_offset);
2060
0
    }
2061
0
    const float* i18 = input[18];
2062
0
    assert(i18 != NULL);
2063
0
    if XNN_UNPREDICTABLE(i18 != zero) {
2064
0
      i18 = (const float*) ((uintptr_t) i18 + input_offset);
2065
0
    }
2066
0
    const float* i19 = input[19];
2067
0
    assert(i19 != NULL);
2068
0
    if XNN_UNPREDICTABLE(i19 != zero) {
2069
0
      i19 = (const float*) ((uintptr_t) i19 + input_offset);
2070
0
    }
2071
0
    const float* i20 = input[20];
2072
0
    assert(i20 != NULL);
2073
0
    if XNN_UNPREDICTABLE(i20 != zero) {
2074
0
      i20 = (const float*) ((uintptr_t) i20 + input_offset);
2075
0
    }
2076
0
    const float* i21 = input[21];
2077
0
    assert(i21 != NULL);
2078
0
    if XNN_UNPREDICTABLE(i21 != zero) {
2079
0
      i21 = (const float*) ((uintptr_t) i21 + input_offset);
2080
0
    }
2081
0
    const float* i22 = input[22];
2082
0
    assert(i22 != NULL);
2083
0
    if XNN_UNPREDICTABLE(i22 != zero) {
2084
0
      i22 = (const float*) ((uintptr_t) i22 + input_offset);
2085
0
    }
2086
0
    const float* i23 = input[23];
2087
0
    assert(i23 != NULL);
2088
0
    if XNN_UNPREDICTABLE(i23 != zero) {
2089
0
      i23 = (const float*) ((uintptr_t) i23 + input_offset);
2090
0
    }
2091
0
    const float* i24 = input[24];
2092
0
    assert(i24 != NULL);
2093
0
    if XNN_UNPREDICTABLE(i24 != zero) {
2094
0
      i24 = (const float*) ((uintptr_t) i24 + input_offset);
2095
0
    }
2096
0
    input = (const float**) ((uintptr_t) input + input_stride);
2097
2098
0
    size_t c = channels;
2099
0
    const float* w = weights;
2100
0
    do {
2101
0
      float vacc0p0 = w[0];
2102
2103
0
      const float vi0 = *i0++;
2104
0
      const float vk0 = w[1];
2105
0
      vacc0p0 = math_muladd_f32(vi0, vk0, vacc0p0);
2106
2107
0
      const float vi1 = *i1++;
2108
0
      const float vk1 = w[2];
2109
0
      float vacc0p1 = vi1 * vk1;
2110
2111
0
      const float vi2 = *i2++;
2112
0
      const float vk2 = w[3];
2113
0
      vacc0p0 = math_muladd_f32(vi2, vk2, vacc0p0);
2114
2115
0
      const float vi3 = *i3++;
2116
0
      const float vk3 = w[4];
2117
0
      vacc0p1 = math_muladd_f32(vi3, vk3, vacc0p1);
2118
2119
0
      const float vi4 = *i4++;
2120
0
      const float vk4 = w[5];
2121
0
      vacc0p0 = math_muladd_f32(vi4, vk4, vacc0p0);
2122
2123
0
      const float vi5 = *i5++;
2124
0
      const float vk5 = w[6];
2125
0
      vacc0p1 = math_muladd_f32(vi5, vk5, vacc0p1);
2126
2127
0
      const float vi6 = *i6++;
2128
0
      const float vk6 = w[7];
2129
0
      vacc0p0 = math_muladd_f32(vi6, vk6, vacc0p0);
2130
2131
0
      const float vi7 = *i7++;
2132
0
      const float vk7 = w[8];
2133
0
      vacc0p1 = math_muladd_f32(vi7, vk7, vacc0p1);
2134
2135
0
      const float vi8 = *i8++;
2136
0
      const float vk8 = w[9];
2137
0
      vacc0p0 = math_muladd_f32(vi8, vk8, vacc0p0);
2138
2139
0
      const float vi9 = *i9++;
2140
0
      const float vk9 = w[10];
2141
0
      vacc0p1 = math_muladd_f32(vi9, vk9, vacc0p1);
2142
2143
0
      const float vi10 = *i10++;
2144
0
      const float vk10 = w[11];
2145
0
      vacc0p0 = math_muladd_f32(vi10, vk10, vacc0p0);
2146
2147
0
      const float vi11 = *i11++;
2148
0
      const float vk11 = w[12];
2149
0
      vacc0p1 = math_muladd_f32(vi11, vk11, vacc0p1);
2150
2151
0
      const float vi12 = *i12++;
2152
0
      const float vk12 = w[13];
2153
0
      vacc0p0 = math_muladd_f32(vi12, vk12, vacc0p0);
2154
2155
0
      const float vi13 = *i13++;
2156
0
      const float vk13 = w[14];
2157
0
      vacc0p1 = math_muladd_f32(vi13, vk13, vacc0p1);
2158
2159
0
      const float vi14 = *i14++;
2160
0
      const float vk14 = w[15];
2161
0
      vacc0p0 = math_muladd_f32(vi14, vk14, vacc0p0);
2162
2163
0
      const float vi15 = *i15++;
2164
0
      const float vk15 = w[16];
2165
0
      vacc0p1 = math_muladd_f32(vi15, vk15, vacc0p1);
2166
2167
0
      const float vi16 = *i16++;
2168
0
      const float vk16 = w[17];
2169
0
      vacc0p0 = math_muladd_f32(vi16, vk16, vacc0p0);
2170
2171
0
      const float vi17 = *i17++;
2172
0
      const float vk17 = w[18];
2173
0
      vacc0p1 = math_muladd_f32(vi17, vk17, vacc0p1);
2174
2175
0
      const float vi18 = *i18++;
2176
0
      const float vk18 = w[19];
2177
0
      vacc0p0 = math_muladd_f32(vi18, vk18, vacc0p0);
2178
2179
0
      const float vi19 = *i19++;
2180
0
      const float vk19 = w[20];
2181
0
      vacc0p1 = math_muladd_f32(vi19, vk19, vacc0p1);
2182
2183
0
      const float vi20 = *i20++;
2184
0
      const float vk20 = w[21];
2185
0
      vacc0p0 = math_muladd_f32(vi20, vk20, vacc0p0);
2186
2187
0
      const float vi21 = *i21++;
2188
0
      const float vk21 = w[22];
2189
0
      vacc0p1 = math_muladd_f32(vi21, vk21, vacc0p1);
2190
2191
0
      const float vi22 = *i22++;
2192
0
      const float vk22 = w[23];
2193
0
      vacc0p0 = math_muladd_f32(vi22, vk22, vacc0p0);
2194
2195
0
      const float vi23 = *i23++;
2196
0
      const float vk23 = w[24];
2197
0
      vacc0p1 = math_muladd_f32(vi23, vk23, vacc0p1);
2198
2199
0
      const float vi24 = *i24++;
2200
0
      const float vk24 = w[25];
2201
0
      vacc0p0 = math_muladd_f32(vi24, vk24, vacc0p0);
2202
2203
0
      w += 26;
2204
2205
0
      vacc0p0 += vacc0p1;
2206
2207
0
      *output++ = vacc0p0;
2208
0
    } while (--c != 0);
2209
2210
0
    output = (float*) ((uintptr_t) output + output_increment);
2211
0
  } while (--output_width != 0);
2212
0
}
2213
2214
void xnn_f32_dwconv_minmax_ukernel_2f2m2l4c1s1r__scalar_acc2(
2215
    size_t channels,
2216
    size_t output_width,
2217
    const float** input,
2218
    const float* weights,
2219
    float* output,
2220
    intptr_t input_stride,
2221
    size_t output_increment,
2222
    size_t input_offset,
2223
    const float* zero,
2224
    size_t kernel_size,
2225
    float* buffer,
2226
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
2227
0
{
2228
0
  assert(channels != 0);
2229
0
  assert(output_width != 0);
2230
0
  assert(kernel_size > 2);
2231
2232
0
  const float vmin = params->scalar.min;
2233
0
  const float vmax = params->scalar.max;
2234
0
  do {
2235
0
    const float* w = weights;
2236
2237
    // First pass to process 2 inputs.
2238
0
    {
2239
0
      float* b = buffer;
2240
0
      const float* i0 = input[0];
2241
0
      assert(i0 != NULL);
2242
0
      if XNN_UNPREDICTABLE(i0 != zero) {
2243
0
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
2244
0
      }
2245
0
      const float* i1 = input[1];
2246
0
      assert(i1 != NULL);
2247
0
      if XNN_UNPREDICTABLE(i1 != zero) {
2248
0
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
2249
0
      }
2250
0
      input += 2;
2251
2252
      // Process c channels and write to buffer.
2253
0
      size_t c = round_up_po2(channels, 1);
2254
0
      for (; c >= 4; c -= 4) {
2255
0
        float vacc0p0 = w[0];
2256
0
        float vacc1p0 = w[1];
2257
0
        float vacc2p0 = w[2];
2258
0
        float vacc3p0 = w[3];
2259
2260
2261
0
        const float vi0x0 = i0[0];
2262
0
        const float vi0x1 = i0[1];
2263
0
        const float vi0x2 = i0[2];
2264
0
        const float vi0x3 = i0[3];
2265
0
        i0 += 4;
2266
2267
0
        const float vk0x0 = w[4];
2268
0
        const float vk0x1 = w[5];
2269
0
        const float vk0x2 = w[6];
2270
0
        const float vk0x3 = w[7];
2271
0
        vacc0p0 = math_muladd_f32(vi0x0, vk0x0, vacc0p0);
2272
0
        vacc1p0 = math_muladd_f32(vi0x1, vk0x1, vacc1p0);
2273
0
        vacc2p0 = math_muladd_f32(vi0x2, vk0x2, vacc2p0);
2274
0
        vacc3p0 = math_muladd_f32(vi0x3, vk0x3, vacc3p0);
2275
2276
0
        const float vi1x0 = i1[0];
2277
0
        const float vi1x1 = i1[1];
2278
0
        const float vi1x2 = i1[2];
2279
0
        const float vi1x3 = i1[3];
2280
0
        i1 += 4;
2281
2282
0
        const float vk1x0 = w[8];
2283
0
        const float vk1x1 = w[9];
2284
0
        const float vk1x2 = w[10];
2285
0
        const float vk1x3 = w[11];
2286
0
        float vacc0p1 = vi1x0 * vk1x0;
2287
0
        float vacc1p1 = vi1x1 * vk1x1;
2288
0
        float vacc2p1 = vi1x2 * vk1x2;
2289
0
        float vacc3p1 = vi1x3 * vk1x3;
2290
2291
0
        w += 12;
2292
2293
        // Add up all accumulators to vacc0123p0
2294
0
        vacc0p0 = vacc0p0 + vacc0p1;
2295
0
        vacc1p0 = vacc1p0 + vacc1p1;
2296
0
        vacc2p0 = vacc2p0 + vacc2p1;
2297
0
        vacc3p0 = vacc3p0 + vacc3p1;
2298
2299
0
        b[0] = vacc0p0;
2300
0
        b[1] = vacc1p0;
2301
0
        b[2] = vacc2p0;
2302
0
        b[3] = vacc3p0;
2303
0
        b += 4;
2304
0
      }
2305
2306
2307
0
      for (; c != 0; c --) {
2308
0
        float vacc0p0 = w[0];
2309
2310
0
        const float vi0x0 = i0[0];
2311
0
        i0 += 1;
2312
2313
0
        const float vk0x0 = w[1];
2314
0
        vacc0p0 = math_muladd_f32(vi0x0, vk0x0, vacc0p0);
2315
2316
0
        const float vi1x0 = i1[0];
2317
0
        i1 += 1;
2318
2319
0
        const float vk1x0 = w[2];
2320
0
        float vacc0p1 = vi1x0 * vk1x0;
2321
2322
0
        w += 3;
2323
2324
        // Add up all accumulators to vacc0p0
2325
0
        vacc0p0 = vacc0p0 + vacc0p1;
2326
2327
0
        b[0] = vacc0p0;
2328
0
        b += 1;
2329
0
      }
2330
0
    }
2331
2332
    // Middle pass to process 2 inputs in each iteration.
2333
0
    for (size_t ks = kernel_size - 2; ks > 2; ks -= 2) {
2334
0
      float* b = buffer;
2335
0
      const float* i0 = input[0];
2336
0
      assert(i0 != NULL);
2337
0
      if XNN_UNPREDICTABLE(i0 != zero) {
2338
0
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
2339
0
      }
2340
0
      const float* i1 = input[1];
2341
0
      assert(i1 != NULL);
2342
0
      if XNN_UNPREDICTABLE(i1 != zero) {
2343
0
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
2344
0
      }
2345
0
      input += 2;
2346
2347
0
      size_t c = round_up_po2(channels, 1);
2348
0
      for (; c >= 4; c -= 4) {
2349
0
        float vacc0p0 = b[0];
2350
0
        float vacc1p0 = b[1];
2351
0
        float vacc2p0 = b[2];
2352
0
        float vacc3p0 = b[3];
2353
2354
2355
0
        const float vi0x0 = i0[0];
2356
0
        const float vi0x1 = i0[1];
2357
0
        const float vi0x2 = i0[2];
2358
0
        const float vi0x3 = i0[3];
2359
0
        i0 += 4;
2360
2361
0
        const float vk0x0 = w[0];
2362
0
        const float vk0x1 = w[1];
2363
0
        const float vk0x2 = w[2];
2364
0
        const float vk0x3 = w[3];
2365
0
        vacc0p0 = math_muladd_f32(vi0x0, vk0x0, vacc0p0);
2366
0
        vacc1p0 = math_muladd_f32(vi0x1, vk0x1, vacc1p0);
2367
0
        vacc2p0 = math_muladd_f32(vi0x2, vk0x2, vacc2p0);
2368
0
        vacc3p0 = math_muladd_f32(vi0x3, vk0x3, vacc3p0);
2369
2370
0
        const float vi1x0 = i1[0];
2371
0
        const float vi1x1 = i1[1];
2372
0
        const float vi1x2 = i1[2];
2373
0
        const float vi1x3 = i1[3];
2374
0
        i1 += 4;
2375
2376
0
        const float vk1x0 = w[4];
2377
0
        const float vk1x1 = w[5];
2378
0
        const float vk1x2 = w[6];
2379
0
        const float vk1x3 = w[7];
2380
0
        float vacc0p1 = vi1x0 * vk1x0;
2381
0
        float vacc1p1 = vi1x1 * vk1x1;
2382
0
        float vacc2p1 = vi1x2 * vk1x2;
2383
0
        float vacc3p1 = vi1x3 * vk1x3;
2384
2385
0
        w += 8;
2386
2387
        // Add up all accumulators to vacc0123p0
2388
0
        vacc0p0 = vacc0p0 + vacc0p1;
2389
0
        vacc1p0 = vacc1p0 + vacc1p1;
2390
0
        vacc2p0 = vacc2p0 + vacc2p1;
2391
0
        vacc3p0 = vacc3p0 + vacc3p1;
2392
2393
0
        b[0] = vacc0p0;
2394
0
        b[1] = vacc1p0;
2395
0
        b[2] = vacc2p0;
2396
0
        b[3] = vacc3p0;
2397
0
        b += 4;
2398
0
      }
2399
2400
0
      for (; c != 0; c --) {
2401
0
        float vacc0p0 = b[0];
2402
2403
2404
0
        const float vi0x0 = i0[0];
2405
0
        i0 += 1;
2406
2407
0
        const float vk0x0 = w[0];
2408
0
        vacc0p0 = math_muladd_f32(vi0x0, vk0x0, vacc0p0);
2409
2410
0
        const float vi1x0 = i1[0];
2411
0
        i1 += 1;
2412
2413
0
        const float vk1x0 = w[1];
2414
0
        float vacc0p1 = vi1x0 * vk1x0;
2415
2416
0
        w += 2;
2417
2418
        // Add up all accumulators to vacc0p0
2419
0
        vacc0p0 = vacc0p0 + vacc0p1;
2420
2421
0
        b[0] = vacc0p0;
2422
0
        b += 1;
2423
0
      }
2424
0
    }
2425
2426
    // Last pass to process up to 2 inputs.
2427
0
    {
2428
0
      float* b = buffer;
2429
0
      const float* i0 = input[0];
2430
0
      assert(i0 != NULL);
2431
0
      if XNN_UNPREDICTABLE(i0 != zero) {
2432
0
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
2433
0
      }
2434
0
      const float* i1 = input[1];
2435
0
      assert(i1 != NULL);
2436
0
      if XNN_UNPREDICTABLE(i1 != zero) {
2437
0
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
2438
0
      }
2439
2440
0
      size_t c = channels;
2441
0
      for (; c >= 4; c -= 4) {
2442
0
        float vacc0p0 = b[0];
2443
0
        float vacc1p0 = b[1];
2444
0
        float vacc2p0 = b[2];
2445
0
        float vacc3p0 = b[3];
2446
0
        b += 4;
2447
2448
2449
0
        const float vi0x0 = i0[0];
2450
0
        const float vi0x1 = i0[1];
2451
0
        const float vi0x2 = i0[2];
2452
0
        const float vi0x3 = i0[3];
2453
0
        i0 += 4;
2454
2455
0
        const float vk0x0 = w[0];
2456
0
        const float vk0x1 = w[1];
2457
0
        const float vk0x2 = w[2];
2458
0
        const float vk0x3 = w[3];
2459
0
        vacc0p0 = math_muladd_f32(vi0x0, vk0x0, vacc0p0);
2460
0
        vacc1p0 = math_muladd_f32(vi0x1, vk0x1, vacc1p0);
2461
0
        vacc2p0 = math_muladd_f32(vi0x2, vk0x2, vacc2p0);
2462
0
        vacc3p0 = math_muladd_f32(vi0x3, vk0x3, vacc3p0);
2463
2464
0
        const float vi1x0 = i1[0];
2465
0
        const float vi1x1 = i1[1];
2466
0
        const float vi1x2 = i1[2];
2467
0
        const float vi1x3 = i1[3];
2468
0
        i1 += 4;
2469
2470
0
        const float vk1x0 = w[4];
2471
0
        const float vk1x1 = w[5];
2472
0
        const float vk1x2 = w[6];
2473
0
        const float vk1x3 = w[7];
2474
0
        float vacc0p1 = vi1x0 * vk1x0;
2475
0
        float vacc1p1 = vi1x1 * vk1x1;
2476
0
        float vacc2p1 = vi1x2 * vk1x2;
2477
0
        float vacc3p1 = vi1x3 * vk1x3;
2478
2479
0
        w += 8;
2480
2481
        // Add up all accumulators to vacc0123p0
2482
0
        vacc0p0 = vacc0p0 + vacc0p1;
2483
0
        vacc1p0 = vacc1p0 + vacc1p1;
2484
0
        vacc2p0 = vacc2p0 + vacc2p1;
2485
0
        vacc3p0 = vacc3p0 + vacc3p1;
2486
2487
0
        float vacc0 = math_max_f32(vacc0p0, vmin);
2488
0
        float vacc1 = math_max_f32(vacc1p0, vmin);
2489
0
        float vacc2 = math_max_f32(vacc2p0, vmin);
2490
0
        float vacc3 = math_max_f32(vacc3p0, vmin);
2491
2492
0
        vacc0 = math_min_f32(vacc0, vmax);
2493
0
        vacc1 = math_min_f32(vacc1, vmax);
2494
0
        vacc2 = math_min_f32(vacc2, vmax);
2495
0
        vacc3 = math_min_f32(vacc3, vmax);
2496
2497
0
        output[0] = vacc0;
2498
0
        output[1] = vacc1;
2499
0
        output[2] = vacc2;
2500
0
        output[3] = vacc3;
2501
0
        output += 4;
2502
0
      }
2503
0
      for (; c != 0; c --) {
2504
0
        float vacc0p0 = *b++;
2505
2506
0
        const float vi0 = *i0++;
2507
0
        const float vk0 = w[0];
2508
0
        vacc0p0 = math_muladd_f32(vi0, vk0, vacc0p0);
2509
0
        const float vi1 = *i1++;
2510
0
        const float vk1 = w[1];
2511
0
        float vacc0p1 = vi1 * vk1;
2512
0
        w += 2;
2513
2514
        // Add up all accumulators to vacc0p0
2515
0
        vacc0p0 = vacc0p0 + vacc0p1;
2516
2517
0
        float vacc0 = math_max_f32(vacc0p0, vmin);
2518
0
        vacc0 = math_min_f32(vacc0, vmax);
2519
0
        *output++ = vacc0;
2520
0
      }
2521
2522
0
    }
2523
0
    input = (const float**) ((uintptr_t) input + input_stride);
2524
0
    output = (float*) ((uintptr_t) output + output_increment);
2525
0
  } while (--output_width != 0);
2526
0
}
2527
2528
void xnn_f32_dwconv_minmax_ukernel_3p1c__scalar_acc2(
2529
    size_t channels,
2530
    size_t output_width,
2531
    const float** input,
2532
    const float* weights,
2533
    float* output,
2534
    intptr_t input_stride,
2535
    size_t output_increment,
2536
    size_t input_offset,
2537
    const float* zero,
2538
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
2539
0
{
2540
0
  assert(channels != 0);
2541
0
  assert(output_width != 0);
2542
2543
0
  const float vmin = params->scalar.min;
2544
0
  const float vmax = params->scalar.max;
2545
0
  do {
2546
0
    const float* i0 = input[0];
2547
0
    assert(i0 != NULL);
2548
0
    if XNN_UNPREDICTABLE(i0 != zero) {
2549
0
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
2550
0
    }
2551
0
    const float* i1 = input[1];
2552
0
    assert(i1 != NULL);
2553
0
    if XNN_UNPREDICTABLE(i1 != zero) {
2554
0
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
2555
0
    }
2556
0
    const float* i2 = input[2];
2557
0
    assert(i2 != NULL);
2558
0
    if XNN_UNPREDICTABLE(i2 != zero) {
2559
0
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
2560
0
    }
2561
0
    input = (const float**) ((uintptr_t) input + input_stride);
2562
2563
0
    size_t c = channels;
2564
0
    const float* w = weights;
2565
0
    do {
2566
0
      float vacc0p0 = w[0];
2567
2568
0
      const float vi0 = *i0++;
2569
0
      const float vk0 = w[1];
2570
0
      vacc0p0 = math_muladd_f32(vi0, vk0, vacc0p0);
2571
2572
0
      const float vi1 = *i1++;
2573
0
      const float vk1 = w[2];
2574
0
      float vacc0p1 = vi1 * vk1;
2575
2576
0
      const float vi2 = *i2++;
2577
0
      const float vk2 = w[3];
2578
0
      vacc0p0 = math_muladd_f32(vi2, vk2, vacc0p0);
2579
2580
0
      w += 4;
2581
2582
0
      vacc0p0 += vacc0p1;
2583
2584
0
      float vacc0 = math_max_f32(vacc0p0, vmin);
2585
0
      vacc0 = math_min_f32(vacc0, vmax);
2586
0
      *output++ = vacc0;
2587
0
    } while (--c != 0);
2588
2589
0
    output = (float*) ((uintptr_t) output + output_increment);
2590
0
  } while (--output_width != 0);
2591
0
}
2592
2593
void xnn_f32_dwconv_ukernel_3p1c__scalar_acc2(
2594
    size_t channels,
2595
    size_t output_width,
2596
    const float** input,
2597
    const float* weights,
2598
    float* output,
2599
    intptr_t input_stride,
2600
    size_t output_increment,
2601
    size_t input_offset,
2602
    const float* zero,
2603
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
2604
0
{
2605
0
  assert(channels != 0);
2606
0
  assert(output_width != 0);
2607
2608
0
  do {
2609
0
    const float* i0 = input[0];
2610
0
    assert(i0 != NULL);
2611
0
    if XNN_UNPREDICTABLE(i0 != zero) {
2612
0
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
2613
0
    }
2614
0
    const float* i1 = input[1];
2615
0
    assert(i1 != NULL);
2616
0
    if XNN_UNPREDICTABLE(i1 != zero) {
2617
0
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
2618
0
    }
2619
0
    const float* i2 = input[2];
2620
0
    assert(i2 != NULL);
2621
0
    if XNN_UNPREDICTABLE(i2 != zero) {
2622
0
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
2623
0
    }
2624
0
    input = (const float**) ((uintptr_t) input + input_stride);
2625
2626
0
    size_t c = channels;
2627
0
    const float* w = weights;
2628
0
    do {
2629
0
      float vacc0p0 = w[0];
2630
2631
0
      const float vi0 = *i0++;
2632
0
      const float vk0 = w[1];
2633
0
      vacc0p0 = math_muladd_f32(vi0, vk0, vacc0p0);
2634
2635
0
      const float vi1 = *i1++;
2636
0
      const float vk1 = w[2];
2637
0
      float vacc0p1 = vi1 * vk1;
2638
2639
0
      const float vi2 = *i2++;
2640
0
      const float vk2 = w[3];
2641
0
      vacc0p0 = math_muladd_f32(vi2, vk2, vacc0p0);
2642
2643
0
      w += 4;
2644
2645
0
      vacc0p0 += vacc0p1;
2646
2647
0
      *output++ = vacc0p0;
2648
0
    } while (--c != 0);
2649
2650
0
    output = (float*) ((uintptr_t) output + output_increment);
2651
0
  } while (--output_width != 0);
2652
0
}
2653
2654
void xnn_f32_dwconv_minmax_ukernel_4p1c__scalar_acc2(
2655
    size_t channels,
2656
    size_t output_width,
2657
    const float** input,
2658
    const float* weights,
2659
    float* output,
2660
    intptr_t input_stride,
2661
    size_t output_increment,
2662
    size_t input_offset,
2663
    const float* zero,
2664
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
2665
0
{
2666
0
  assert(channels != 0);
2667
0
  assert(output_width != 0);
2668
2669
0
  const float vmin = params->scalar.min;
2670
0
  const float vmax = params->scalar.max;
2671
0
  do {
2672
0
    const float* i0 = input[0];
2673
0
    assert(i0 != NULL);
2674
0
    if XNN_UNPREDICTABLE(i0 != zero) {
2675
0
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
2676
0
    }
2677
0
    const float* i1 = input[1];
2678
0
    assert(i1 != NULL);
2679
0
    if XNN_UNPREDICTABLE(i1 != zero) {
2680
0
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
2681
0
    }
2682
0
    const float* i2 = input[2];
2683
0
    assert(i2 != NULL);
2684
0
    if XNN_UNPREDICTABLE(i2 != zero) {
2685
0
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
2686
0
    }
2687
0
    const float* i3 = input[3];
2688
0
    assert(i3 != NULL);
2689
0
    if XNN_UNPREDICTABLE(i3 != zero) {
2690
0
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
2691
0
    }
2692
0
    input = (const float**) ((uintptr_t) input + input_stride);
2693
2694
0
    size_t c = channels;
2695
0
    const float* w = weights;
2696
0
    do {
2697
0
      float vacc0p0 = w[0];
2698
2699
0
      const float vi0 = *i0++;
2700
0
      const float vk0 = w[1];
2701
0
      vacc0p0 = math_muladd_f32(vi0, vk0, vacc0p0);
2702
2703
0
      const float vi1 = *i1++;
2704
0
      const float vk1 = w[2];
2705
0
      float vacc0p1 = vi1 * vk1;
2706
2707
0
      const float vi2 = *i2++;
2708
0
      const float vk2 = w[3];
2709
0
      vacc0p0 = math_muladd_f32(vi2, vk2, vacc0p0);
2710
2711
0
      const float vi3 = *i3++;
2712
0
      const float vk3 = w[4];
2713
0
      vacc0p1 = math_muladd_f32(vi3, vk3, vacc0p1);
2714
2715
0
      w += 5;
2716
2717
0
      vacc0p0 += vacc0p1;
2718
2719
0
      float vacc0 = math_max_f32(vacc0p0, vmin);
2720
0
      vacc0 = math_min_f32(vacc0, vmax);
2721
0
      *output++ = vacc0;
2722
0
    } while (--c != 0);
2723
2724
0
    output = (float*) ((uintptr_t) output + output_increment);
2725
0
  } while (--output_width != 0);
2726
0
}
2727
2728
void xnn_f32_dwconv_ukernel_4p1c__scalar_acc2(
2729
    size_t channels,
2730
    size_t output_width,
2731
    const float** input,
2732
    const float* weights,
2733
    float* output,
2734
    intptr_t input_stride,
2735
    size_t output_increment,
2736
    size_t input_offset,
2737
    const float* zero,
2738
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
2739
0
{
2740
0
  assert(channels != 0);
2741
0
  assert(output_width != 0);
2742
2743
0
  do {
2744
0
    const float* i0 = input[0];
2745
0
    assert(i0 != NULL);
2746
0
    if XNN_UNPREDICTABLE(i0 != zero) {
2747
0
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
2748
0
    }
2749
0
    const float* i1 = input[1];
2750
0
    assert(i1 != NULL);
2751
0
    if XNN_UNPREDICTABLE(i1 != zero) {
2752
0
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
2753
0
    }
2754
0
    const float* i2 = input[2];
2755
0
    assert(i2 != NULL);
2756
0
    if XNN_UNPREDICTABLE(i2 != zero) {
2757
0
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
2758
0
    }
2759
0
    const float* i3 = input[3];
2760
0
    assert(i3 != NULL);
2761
0
    if XNN_UNPREDICTABLE(i3 != zero) {
2762
0
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
2763
0
    }
2764
0
    input = (const float**) ((uintptr_t) input + input_stride);
2765
2766
0
    size_t c = channels;
2767
0
    const float* w = weights;
2768
0
    do {
2769
0
      float vacc0p0 = w[0];
2770
2771
0
      const float vi0 = *i0++;
2772
0
      const float vk0 = w[1];
2773
0
      vacc0p0 = math_muladd_f32(vi0, vk0, vacc0p0);
2774
2775
0
      const float vi1 = *i1++;
2776
0
      const float vk1 = w[2];
2777
0
      float vacc0p1 = vi1 * vk1;
2778
2779
0
      const float vi2 = *i2++;
2780
0
      const float vk2 = w[3];
2781
0
      vacc0p0 = math_muladd_f32(vi2, vk2, vacc0p0);
2782
2783
0
      const float vi3 = *i3++;
2784
0
      const float vk3 = w[4];
2785
0
      vacc0p1 = math_muladd_f32(vi3, vk3, vacc0p1);
2786
2787
0
      w += 5;
2788
2789
0
      vacc0p0 += vacc0p1;
2790
2791
0
      *output++ = vacc0p0;
2792
0
    } while (--c != 0);
2793
2794
0
    output = (float*) ((uintptr_t) output + output_increment);
2795
0
  } while (--output_width != 0);
2796
0
}
2797
2798
void xnn_f32_dwconv_minmax_ukernel_9p1c__scalar_acc2(
2799
    size_t channels,
2800
    size_t output_width,
2801
    const float** input,
2802
    const float* weights,
2803
    float* output,
2804
    intptr_t input_stride,
2805
    size_t output_increment,
2806
    size_t input_offset,
2807
    const float* zero,
2808
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
2809
0
{
2810
0
  assert(channels != 0);
2811
0
  assert(output_width != 0);
2812
2813
0
  const float vmin = params->scalar.min;
2814
0
  const float vmax = params->scalar.max;
2815
0
  do {
2816
0
    const float* i0 = input[0];
2817
0
    assert(i0 != NULL);
2818
0
    if XNN_UNPREDICTABLE(i0 != zero) {
2819
0
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
2820
0
    }
2821
0
    const float* i1 = input[1];
2822
0
    assert(i1 != NULL);
2823
0
    if XNN_UNPREDICTABLE(i1 != zero) {
2824
0
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
2825
0
    }
2826
0
    const float* i2 = input[2];
2827
0
    assert(i2 != NULL);
2828
0
    if XNN_UNPREDICTABLE(i2 != zero) {
2829
0
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
2830
0
    }
2831
0
    const float* i3 = input[3];
2832
0
    assert(i3 != NULL);
2833
0
    if XNN_UNPREDICTABLE(i3 != zero) {
2834
0
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
2835
0
    }
2836
0
    const float* i4 = input[4];
2837
0
    assert(i4 != NULL);
2838
0
    if XNN_UNPREDICTABLE(i4 != zero) {
2839
0
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
2840
0
    }
2841
0
    const float* i5 = input[5];
2842
0
    assert(i5 != NULL);
2843
0
    if XNN_UNPREDICTABLE(i5 != zero) {
2844
0
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
2845
0
    }
2846
0
    const float* i6 = input[6];
2847
0
    assert(i6 != NULL);
2848
0
    if XNN_UNPREDICTABLE(i6 != zero) {
2849
0
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
2850
0
    }
2851
0
    const float* i7 = input[7];
2852
0
    assert(i7 != NULL);
2853
0
    if XNN_UNPREDICTABLE(i7 != zero) {
2854
0
      i7 = (const float*) ((uintptr_t) i7 + input_offset);
2855
0
    }
2856
0
    const float* i8 = input[8];
2857
0
    assert(i8 != NULL);
2858
0
    if XNN_UNPREDICTABLE(i8 != zero) {
2859
0
      i8 = (const float*) ((uintptr_t) i8 + input_offset);
2860
0
    }
2861
0
    input = (const float**) ((uintptr_t) input + input_stride);
2862
2863
0
    size_t c = channels;
2864
0
    const float* w = weights;
2865
0
    do {
2866
0
      float vacc0p0 = w[0];
2867
2868
0
      const float vi0 = *i0++;
2869
0
      const float vk0 = w[1];
2870
0
      vacc0p0 = math_muladd_f32(vi0, vk0, vacc0p0);
2871
2872
0
      const float vi1 = *i1++;
2873
0
      const float vk1 = w[2];
2874
0
      float vacc0p1 = vi1 * vk1;
2875
2876
0
      const float vi2 = *i2++;
2877
0
      const float vk2 = w[3];
2878
0
      vacc0p0 = math_muladd_f32(vi2, vk2, vacc0p0);
2879
2880
0
      const float vi3 = *i3++;
2881
0
      const float vk3 = w[4];
2882
0
      vacc0p1 = math_muladd_f32(vi3, vk3, vacc0p1);
2883
2884
0
      const float vi4 = *i4++;
2885
0
      const float vk4 = w[5];
2886
0
      vacc0p0 = math_muladd_f32(vi4, vk4, vacc0p0);
2887
2888
0
      const float vi5 = *i5++;
2889
0
      const float vk5 = w[6];
2890
0
      vacc0p1 = math_muladd_f32(vi5, vk5, vacc0p1);
2891
2892
0
      const float vi6 = *i6++;
2893
0
      const float vk6 = w[7];
2894
0
      vacc0p0 = math_muladd_f32(vi6, vk6, vacc0p0);
2895
2896
0
      const float vi7 = *i7++;
2897
0
      const float vk7 = w[8];
2898
0
      vacc0p1 = math_muladd_f32(vi7, vk7, vacc0p1);
2899
2900
0
      const float vi8 = *i8++;
2901
0
      const float vk8 = w[9];
2902
0
      vacc0p0 = math_muladd_f32(vi8, vk8, vacc0p0);
2903
2904
0
      w += 10;
2905
2906
0
      vacc0p0 += vacc0p1;
2907
2908
0
      float vacc0 = math_max_f32(vacc0p0, vmin);
2909
0
      vacc0 = math_min_f32(vacc0, vmax);
2910
0
      *output++ = vacc0;
2911
0
    } while (--c != 0);
2912
2913
0
    output = (float*) ((uintptr_t) output + output_increment);
2914
0
  } while (--output_width != 0);
2915
0
}
2916
2917
void xnn_f32_dwconv_ukernel_9p1c__scalar_acc2(
2918
    size_t channels,
2919
    size_t output_width,
2920
    const float** input,
2921
    const float* weights,
2922
    float* output,
2923
    intptr_t input_stride,
2924
    size_t output_increment,
2925
    size_t input_offset,
2926
    const float* zero,
2927
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
2928
0
{
2929
0
  assert(channels != 0);
2930
0
  assert(output_width != 0);
2931
2932
0
  do {
2933
0
    const float* i0 = input[0];
2934
0
    assert(i0 != NULL);
2935
0
    if XNN_UNPREDICTABLE(i0 != zero) {
2936
0
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
2937
0
    }
2938
0
    const float* i1 = input[1];
2939
0
    assert(i1 != NULL);
2940
0
    if XNN_UNPREDICTABLE(i1 != zero) {
2941
0
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
2942
0
    }
2943
0
    const float* i2 = input[2];
2944
0
    assert(i2 != NULL);
2945
0
    if XNN_UNPREDICTABLE(i2 != zero) {
2946
0
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
2947
0
    }
2948
0
    const float* i3 = input[3];
2949
0
    assert(i3 != NULL);
2950
0
    if XNN_UNPREDICTABLE(i3 != zero) {
2951
0
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
2952
0
    }
2953
0
    const float* i4 = input[4];
2954
0
    assert(i4 != NULL);
2955
0
    if XNN_UNPREDICTABLE(i4 != zero) {
2956
0
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
2957
0
    }
2958
0
    const float* i5 = input[5];
2959
0
    assert(i5 != NULL);
2960
0
    if XNN_UNPREDICTABLE(i5 != zero) {
2961
0
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
2962
0
    }
2963
0
    const float* i6 = input[6];
2964
0
    assert(i6 != NULL);
2965
0
    if XNN_UNPREDICTABLE(i6 != zero) {
2966
0
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
2967
0
    }
2968
0
    const float* i7 = input[7];
2969
0
    assert(i7 != NULL);
2970
0
    if XNN_UNPREDICTABLE(i7 != zero) {
2971
0
      i7 = (const float*) ((uintptr_t) i7 + input_offset);
2972
0
    }
2973
0
    const float* i8 = input[8];
2974
0
    assert(i8 != NULL);
2975
0
    if XNN_UNPREDICTABLE(i8 != zero) {
2976
0
      i8 = (const float*) ((uintptr_t) i8 + input_offset);
2977
0
    }
2978
0
    input = (const float**) ((uintptr_t) input + input_stride);
2979
2980
0
    size_t c = channels;
2981
0
    const float* w = weights;
2982
0
    do {
2983
0
      float vacc0p0 = w[0];
2984
2985
0
      const float vi0 = *i0++;
2986
0
      const float vk0 = w[1];
2987
0
      vacc0p0 = math_muladd_f32(vi0, vk0, vacc0p0);
2988
2989
0
      const float vi1 = *i1++;
2990
0
      const float vk1 = w[2];
2991
0
      float vacc0p1 = vi1 * vk1;
2992
2993
0
      const float vi2 = *i2++;
2994
0
      const float vk2 = w[3];
2995
0
      vacc0p0 = math_muladd_f32(vi2, vk2, vacc0p0);
2996
2997
0
      const float vi3 = *i3++;
2998
0
      const float vk3 = w[4];
2999
0
      vacc0p1 = math_muladd_f32(vi3, vk3, vacc0p1);
3000
3001
0
      const float vi4 = *i4++;
3002
0
      const float vk4 = w[5];
3003
0
      vacc0p0 = math_muladd_f32(vi4, vk4, vacc0p0);
3004
3005
0
      const float vi5 = *i5++;
3006
0
      const float vk5 = w[6];
3007
0
      vacc0p1 = math_muladd_f32(vi5, vk5, vacc0p1);
3008
3009
0
      const float vi6 = *i6++;
3010
0
      const float vk6 = w[7];
3011
0
      vacc0p0 = math_muladd_f32(vi6, vk6, vacc0p0);
3012
3013
0
      const float vi7 = *i7++;
3014
0
      const float vk7 = w[8];
3015
0
      vacc0p1 = math_muladd_f32(vi7, vk7, vacc0p1);
3016
3017
0
      const float vi8 = *i8++;
3018
0
      const float vk8 = w[9];
3019
0
      vacc0p0 = math_muladd_f32(vi8, vk8, vacc0p0);
3020
3021
0
      w += 10;
3022
3023
0
      vacc0p0 += vacc0p1;
3024
3025
0
      *output++ = vacc0p0;
3026
0
    } while (--c != 0);
3027
3028
0
    output = (float*) ((uintptr_t) output + output_increment);
3029
0
  } while (--output_width != 0);
3030
0
}
3031
3032
void xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2(
3033
    size_t input_height,
3034
    size_t input_width,
3035
    const float* input,
3036
    const float* weights,
3037
    const float* zero,
3038
    float* output,
3039
    uint32_t padding_top,
3040
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
3041
0
{
3042
0
  assert(input_height != 0);
3043
0
  assert(input_width != 0);
3044
0
  assert(input_width % sizeof(float) == 0);
3045
0
  assert(padding_top == 1);
3046
3047
0
  const float vmin = params->scalar.min;
3048
0
  const float vmax = params->scalar.max;
3049
3050
0
  const float vbias = weights[0];
3051
0
  const float vk00 = weights[1];
3052
0
  const float vk01 = weights[2];
3053
0
  const float vk02 = weights[3];
3054
0
  const float vk10 = weights[4];
3055
0
  const float vk11 = weights[5];
3056
0
  const float vk12 = weights[6];
3057
0
  const float vk20 = weights[7];
3058
0
  const float vk21 = weights[8];
3059
0
  const float vk22 = weights[9];
3060
3061
0
  const float* i0 = zero;
3062
0
  const float* i1 = input;
3063
0
  const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
3064
0
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
3065
3066
0
  float* o0 = output;
3067
0
  float* o1 = (float*) ((uintptr_t) o0 + input_width);
3068
3069
0
  size_t output_height = input_height;
3070
0
  do {
3071
0
    if XNN_UNPREDICTABLE(output_height < 2) {
3072
0
      i2 = zero;
3073
0
      o1 = o0;
3074
0
    }
3075
0
    if XNN_UNPREDICTABLE(output_height < 3) {
3076
0
      i3 = zero;
3077
0
    }
3078
3079
0
    float vi0x0 = 0.0f;
3080
0
    float vi1x0 = 0.0f;
3081
0
    float vi2x0 = 0.0f;
3082
0
    float vi3x0 = 0.0f;
3083
3084
0
    float vi0x1 = *i0++;
3085
0
    float vi1x1 = *i1++;
3086
0
    float vi2x1 = *i2++;
3087
0
    float vi3x1 = *i3++;
3088
3089
0
    size_t w = input_width;
3090
0
    for (; w > 1 * sizeof(float); w -= 1 * sizeof(float)) {
3091
0
      const float vi0x2 = *i0++;
3092
0
      const float vi1x2 = *i1++;
3093
0
      const float vi2x2 = *i2++;
3094
0
      const float vi3x2 = *i3++;
3095
3096
0
      float vo0p0 = vbias + vi0x0 * vk00;
3097
0
      float vo1p0 = vbias + vi1x0 * vk00;
3098
0
      float vo0p1 = vi1x0 * vk10;
3099
0
      float vo1p1 = vi2x0 * vk10;
3100
0
      vo0p0 += vi2x0 * vk20;
3101
0
      vo1p0 += vi3x0 * vk20;
3102
3103
0
      vi0x0 = vi0x1;
3104
0
      vi1x0 = vi1x1;
3105
0
      vi2x0 = vi2x1;
3106
0
      vi3x0 = vi3x1;
3107
3108
0
      vo0p1 += vi0x1 * vk01;
3109
0
      vo1p1 += vi1x1 * vk01;
3110
0
      vo0p0 += vi1x1 * vk11;
3111
0
      vo1p0 += vi2x1 * vk11;
3112
0
      vo0p1 += vi2x1 * vk21;
3113
0
      vo1p1 += vi3x1 * vk21;
3114
3115
0
      vi0x1 = vi0x2;
3116
0
      vi1x1 = vi1x2;
3117
0
      vi2x1 = vi2x2;
3118
0
      vi3x1 = vi3x2;
3119
3120
0
      vo0p0 += vi0x2 * vk02;
3121
0
      vo1p0 += vi1x2 * vk02;
3122
0
      vo0p1 += vi1x2 * vk12;
3123
0
      vo1p1 += vi2x2 * vk12;
3124
0
      vo0p0 += vi2x2 * vk22;
3125
0
      vo1p0 += vi3x2 * vk22;
3126
3127
0
      vo0p0 += vo0p1;
3128
0
      vo1p0 += vo1p1;
3129
3130
0
      float vo0 = math_max_f32(vo0p0, vmin);
3131
0
      float vo1 = math_max_f32(vo1p0, vmin);
3132
3133
0
      vo0 = math_min_f32(vo0, vmax);
3134
0
      vo1 = math_min_f32(vo1, vmax);
3135
3136
0
      *o1++ = vo1;
3137
0
      *o0++ = vo0;
3138
0
    }
3139
    // Always process the last pixel separately to account for right edge.
3140
0
    assert(w == 1 * sizeof(float));
3141
0
    {
3142
0
      float vo0p0 = vbias + vi0x0 * vk00;
3143
0
      float vo1p0 = vbias + vi1x0 * vk00;
3144
0
      float vo0p1 = vi1x0 * vk10;
3145
0
      float vo1p1 = vi2x0 * vk10;
3146
0
      vo0p0 += vi2x0 * vk20;
3147
0
      vo1p0 += vi3x0 * vk20;
3148
3149
0
      vo0p1 += vi0x1 * vk01;
3150
0
      vo1p1 += vi1x1 * vk01;
3151
0
      vo0p0 += vi1x1 * vk11;
3152
0
      vo1p0 += vi2x1 * vk11;
3153
0
      vo0p1 += vi2x1 * vk21;
3154
0
      vo1p1 += vi3x1 * vk21;
3155
3156
0
      vo0p0 += vo0p1;
3157
0
      vo1p0 += vo1p1;
3158
3159
0
      float vo0 = math_max_f32(vo0p0, vmin);
3160
0
      float vo1 = math_max_f32(vo1p0, vmin);
3161
3162
0
      vo0 = math_min_f32(vo0, vmax);
3163
0
      vo1 = math_min_f32(vo1, vmax);
3164
3165
0
      *o1++ = vo1;
3166
0
      *o0++ = vo0;
3167
0
    }
3168
3169
0
    i0 = (const float*) ((uintptr_t) i2 - input_width);
3170
0
    i1 = (const float*) ((uintptr_t) i3 - input_width);
3171
0
    i2 = (const float*) ((uintptr_t) i1 + input_width);
3172
0
    i3 = (const float*) ((uintptr_t) i2 + input_width);
3173
3174
0
    o0 = o1;
3175
0
    o1 = (float*) ((uintptr_t) o0 + input_width);
3176
3177
0
    output_height = doz(output_height, 2);
3178
0
  } while (output_height != 0);
3179
0
}
3180
3181
void xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1(
3182
    size_t input_height,
3183
    size_t input_width,
3184
    const float* input,
3185
    const float* weights,
3186
    const float* zero,
3187
    float* output,
3188
    uint32_t padding_top,
3189
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
3190
0
{
3191
0
  assert(input_height != 0);
3192
0
  assert(input_width != 0);
3193
0
  assert(input_width % sizeof(float) == 0);
3194
0
  assert(padding_top == 1);
3195
3196
0
  const float vmin = params->scalar.min;
3197
0
  const float vmax = params->scalar.max;
3198
3199
0
  const float vbias = weights[0];
3200
0
  const float vk00 = weights[1];
3201
0
  const float vk01 = weights[2];
3202
0
  const float vk02 = weights[3];
3203
0
  const float vk10 = weights[4];
3204
0
  const float vk11 = weights[5];
3205
0
  const float vk12 = weights[6];
3206
0
  const float vk20 = weights[7];
3207
0
  const float vk21 = weights[8];
3208
0
  const float vk22 = weights[9];
3209
3210
0
  const float* i0 = zero;
3211
0
  const float* i1 = input;
3212
0
  const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
3213
0
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
3214
0
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
3215
0
  const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
3216
3217
0
  float* o0 = output;
3218
0
  float* o1 = (float*) ((uintptr_t) o0 + input_width);
3219
0
  float* o2 = (float*) ((uintptr_t) o1 + input_width);
3220
0
  float* o3 = (float*) ((uintptr_t) o2 + input_width);
3221
3222
0
  size_t output_height = input_height;
3223
0
  do {
3224
0
    if XNN_UNPREDICTABLE(output_height < 2) {
3225
0
      i2 = zero;
3226
0
      o1 = o0;
3227
0
    }
3228
0
    if XNN_UNPREDICTABLE(output_height < 3) {
3229
0
      i3 = zero;
3230
0
      o2 = o1;
3231
0
    }
3232
0
    if XNN_UNPREDICTABLE(output_height < 4) {
3233
0
      i4 = zero;
3234
0
      o3 = o2;
3235
0
    }
3236
0
    if XNN_UNPREDICTABLE(output_height < 5) {
3237
0
      i5 = zero;
3238
0
    }
3239
3240
0
    float vi0x0 = 0.0f;
3241
0
    float vi1x0 = 0.0f;
3242
0
    float vi2x0 = 0.0f;
3243
0
    float vi3x0 = 0.0f;
3244
0
    float vi4x0 = 0.0f;
3245
0
    float vi5x0 = 0.0f;
3246
3247
0
    float vi0x1 = *i0++;
3248
0
    float vi1x1 = *i1++;
3249
0
    float vi2x1 = *i2++;
3250
0
    float vi3x1 = *i3++;
3251
0
    float vi4x1 = *i4++;
3252
0
    float vi5x1 = *i5++;
3253
3254
0
    size_t w = input_width;
3255
0
    for (; w > 1 * sizeof(float); w -= 1 * sizeof(float)) {
3256
0
      const float vi0x2 = *i0++;
3257
0
      const float vi1x2 = *i1++;
3258
0
      const float vi2x2 = *i2++;
3259
0
      const float vi3x2 = *i3++;
3260
0
      const float vi4x2 = *i4++;
3261
0
      const float vi5x2 = *i5++;
3262
3263
0
      float vo0p0 = vbias + vi0x0 * vk00;
3264
0
      float vo1p0 = vbias + vi1x0 * vk00;
3265
0
      float vo2p0 = vbias + vi2x0 * vk00;
3266
0
      float vo3p0 = vbias + vi3x0 * vk00;
3267
0
      vo0p0 += vi1x0 * vk10;
3268
0
      vo1p0 += vi2x0 * vk10;
3269
0
      vo2p0 += vi3x0 * vk10;
3270
0
      vo3p0 += vi4x0 * vk10;
3271
0
      vo0p0 += vi2x0 * vk20;
3272
0
      vo1p0 += vi3x0 * vk20;
3273
0
      vo2p0 += vi4x0 * vk20;
3274
0
      vo3p0 += vi5x0 * vk20;
3275
3276
0
      vi0x0 = vi0x1;
3277
0
      vi1x0 = vi1x1;
3278
0
      vi2x0 = vi2x1;
3279
0
      vi3x0 = vi3x1;
3280
0
      vi4x0 = vi4x1;
3281
0
      vi5x0 = vi5x1;
3282
3283
0
      vo0p0 += vi0x1 * vk01;
3284
0
      vo1p0 += vi1x1 * vk01;
3285
0
      vo2p0 += vi2x1 * vk01;
3286
0
      vo3p0 += vi3x1 * vk01;
3287
0
      vo0p0 += vi1x1 * vk11;
3288
0
      vo1p0 += vi2x1 * vk11;
3289
0
      vo2p0 += vi3x1 * vk11;
3290
0
      vo3p0 += vi4x1 * vk11;
3291
0
      vo0p0 += vi2x1 * vk21;
3292
0
      vo1p0 += vi3x1 * vk21;
3293
0
      vo2p0 += vi4x1 * vk21;
3294
0
      vo3p0 += vi5x1 * vk21;
3295
3296
0
      vi0x1 = vi0x2;
3297
0
      vi1x1 = vi1x2;
3298
0
      vi2x1 = vi2x2;
3299
0
      vi3x1 = vi3x2;
3300
0
      vi4x1 = vi4x2;
3301
0
      vi5x1 = vi5x2;
3302
3303
0
      vo0p0 += vi0x2 * vk02;
3304
0
      vo1p0 += vi1x2 * vk02;
3305
0
      vo2p0 += vi2x2 * vk02;
3306
0
      vo3p0 += vi3x2 * vk02;
3307
0
      vo0p0 += vi1x2 * vk12;
3308
0
      vo1p0 += vi2x2 * vk12;
3309
0
      vo2p0 += vi3x2 * vk12;
3310
0
      vo3p0 += vi4x2 * vk12;
3311
0
      vo0p0 += vi2x2 * vk22;
3312
0
      vo1p0 += vi3x2 * vk22;
3313
0
      vo2p0 += vi4x2 * vk22;
3314
0
      vo3p0 += vi5x2 * vk22;
3315
3316
3317
0
      float vo0 = math_max_f32(vo0p0, vmin);
3318
0
      float vo1 = math_max_f32(vo1p0, vmin);
3319
0
      float vo2 = math_max_f32(vo2p0, vmin);
3320
0
      float vo3 = math_max_f32(vo3p0, vmin);
3321
3322
0
      vo0 = math_min_f32(vo0, vmax);
3323
0
      vo1 = math_min_f32(vo1, vmax);
3324
0
      vo2 = math_min_f32(vo2, vmax);
3325
0
      vo3 = math_min_f32(vo3, vmax);
3326
3327
0
      *o3++ = vo3;
3328
0
      *o2++ = vo2;
3329
0
      *o1++ = vo1;
3330
0
      *o0++ = vo0;
3331
0
    }
3332
    // Always process the last pixel separately to account for right edge.
3333
0
    assert(w == 1 * sizeof(float));
3334
0
    {
3335
0
      float vo0p0 = vbias + vi0x0 * vk00;
3336
0
      float vo1p0 = vbias + vi1x0 * vk00;
3337
0
      float vo2p0 = vbias + vi2x0 * vk00;
3338
0
      float vo3p0 = vbias + vi3x0 * vk00;
3339
0
      vo0p0 += vi1x0 * vk10;
3340
0
      vo1p0 += vi2x0 * vk10;
3341
0
      vo2p0 += vi3x0 * vk10;
3342
0
      vo3p0 += vi4x0 * vk10;
3343
0
      vo0p0 += vi2x0 * vk20;
3344
0
      vo1p0 += vi3x0 * vk20;
3345
0
      vo2p0 += vi4x0 * vk20;
3346
0
      vo3p0 += vi5x0 * vk20;
3347
3348
0
      vo0p0 += vi0x1 * vk01;
3349
0
      vo1p0 += vi1x1 * vk01;
3350
0
      vo2p0 += vi2x1 * vk01;
3351
0
      vo3p0 += vi3x1 * vk01;
3352
0
      vo0p0 += vi1x1 * vk11;
3353
0
      vo1p0 += vi2x1 * vk11;
3354
0
      vo2p0 += vi3x1 * vk11;
3355
0
      vo3p0 += vi4x1 * vk11;
3356
0
      vo0p0 += vi2x1 * vk21;
3357
0
      vo1p0 += vi3x1 * vk21;
3358
0
      vo2p0 += vi4x1 * vk21;
3359
0
      vo3p0 += vi5x1 * vk21;
3360
3361
3362
0
      float vo0 = math_max_f32(vo0p0, vmin);
3363
0
      float vo1 = math_max_f32(vo1p0, vmin);
3364
0
      float vo2 = math_max_f32(vo2p0, vmin);
3365
0
      float vo3 = math_max_f32(vo3p0, vmin);
3366
3367
0
      vo0 = math_min_f32(vo0, vmax);
3368
0
      vo1 = math_min_f32(vo1, vmax);
3369
0
      vo2 = math_min_f32(vo2, vmax);
3370
0
      vo3 = math_min_f32(vo3, vmax);
3371
3372
0
      *o3++ = vo3;
3373
0
      *o2++ = vo2;
3374
0
      *o1++ = vo1;
3375
0
      *o0++ = vo0;
3376
0
    }
3377
3378
0
    i0 = (const float*) ((uintptr_t) i4 - input_width);
3379
0
    i1 = (const float*) ((uintptr_t) i5 - input_width);
3380
0
    i2 = (const float*) ((uintptr_t) i1 + input_width);
3381
0
    i3 = (const float*) ((uintptr_t) i2 + input_width);
3382
0
    i4 = (const float*) ((uintptr_t) i3 + input_width);
3383
0
    i5 = (const float*) ((uintptr_t) i4 + input_width);
3384
3385
0
    o0 = o3;
3386
0
    o1 = (float*) ((uintptr_t) o0 + input_width);
3387
0
    o2 = (float*) ((uintptr_t) o1 + input_width);
3388
0
    o3 = (float*) ((uintptr_t) o2 + input_width);
3389
3390
0
    output_height = doz(output_height, 4);
3391
0
  } while (output_height != 0);
3392
0
}
3393
3394
void xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1_acc2(
3395
    size_t input_height,
3396
    size_t input_width,
3397
    const float* input,
3398
    const float* weights,
3399
    const float* zero,
3400
    float* output,
3401
    uint32_t padding_top,
3402
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
3403
0
{
3404
0
  assert(input_height != 0);
3405
0
  assert(input_width != 0);
3406
0
  assert(input_width % sizeof(float) == 0);
3407
0
  assert(padding_top >= 0);
3408
0
  assert(padding_top <= 1);
3409
3410
0
  const float vmin = params->scalar.min;
3411
0
  const float vmax = params->scalar.max;
3412
3413
0
  const float vbias = weights[0];
3414
0
  const float vk00 = weights[1];
3415
0
  const float vk01 = weights[2];
3416
0
  const float vk02 = weights[3];
3417
0
  const float vk10 = weights[4];
3418
0
  const float vk11 = weights[5];
3419
0
  const float vk12 = weights[6];
3420
0
  const float vk20 = weights[7];
3421
0
  const float vk21 = weights[8];
3422
0
  const float vk22 = weights[9];
3423
3424
3425
0
  const float* i0 = (const float*) ((uintptr_t) input - ((-padding_top) & input_width));
3426
0
  const float* i1 = (const float*) ((uintptr_t) i0 + input_width);
3427
0
  if XNN_UNPREDICTABLE(padding_top != 0) {
3428
0
    i0 = zero;
3429
0
  }
3430
0
  const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
3431
3432
0
  float* o0 = output;
3433
3434
0
  size_t padded_input_height = input_height + padding_top + 1 /* padding bottom */;
3435
0
  size_t output_height = (padded_input_height - 3 /* kernel size */ + 2 /* subsampling */) / 2;
3436
0
  do {
3437
0
    if XNN_UNPREDICTABLE(padded_input_height < 4) {
3438
0
      i2 = zero;
3439
0
    }
3440
3441
0
    float vi0x0 = 0.0f;
3442
0
    float vi1x0 = 0.0f;
3443
0
    float vi2x0 = 0.0f;
3444
3445
0
    size_t w = input_width;
3446
0
    for (; w >= 2 * sizeof(float); w -= 2 * sizeof(float)) {
3447
0
      const float vi0x1 = i0[0];
3448
0
      const float vi1x1 = i1[0];
3449
0
      const float vi2x1 = i2[0];
3450
3451
0
      float vo0p0 = vbias + vi0x0 * vk00;
3452
0
      float vo0p1 = vi1x0 * vk10;
3453
0
      vo0p0 += vi2x0 * vk20;
3454
3455
0
      const float vi0x2 = i0[1];
3456
0
      i0 += 2;
3457
0
      const float vi1x2 = i1[1];
3458
0
      i1 += 2;
3459
0
      const float vi2x2 = i2[1];
3460
0
      i2 += 2;
3461
3462
0
      vo0p1 += vi0x1 * vk01;
3463
0
      vo0p0 += vi1x1 * vk11;
3464
0
      vo0p1 += vi2x1 * vk21;
3465
3466
0
      vi0x0 = vi0x2;
3467
0
      vi1x0 = vi1x2;
3468
0
      vi2x0 = vi2x2;
3469
3470
0
      vo0p0 += vi0x2 * vk02;
3471
0
      vo0p1 += vi1x2 * vk12;
3472
0
      vo0p0 += vi2x2 * vk22;
3473
3474
0
      vo0p0 += vo0p1;
3475
3476
0
      float vo0 = math_max_f32(vo0p0, vmin);
3477
3478
0
      vo0 = math_min_f32(vo0, vmax);
3479
3480
0
      *o0++ = vo0;
3481
0
    }
3482
    // Potentially process the last pixel.
3483
0
    assert(w <= 1 * sizeof(float));
3484
0
    if (w != 0) {
3485
0
      const float vi0x1 = *i0++;
3486
0
      const float vi1x1 = *i1++;
3487
0
      const float vi2x1 = *i2++;
3488
3489
0
      float vo0p0 = vbias + vi0x0 * vk00;
3490
0
      float vo0p1 = vi1x0 * vk10;
3491
0
      vo0p0 += vi2x0 * vk20;
3492
3493
0
      vo0p1 += vi0x1 * vk01;
3494
0
      vo0p0 += vi1x1 * vk11;
3495
0
      vo0p1 += vi2x1 * vk21;
3496
3497
0
      vo0p0 += vo0p1;
3498
3499
0
      float vo0 = math_max_f32(vo0p0, vmin);
3500
3501
0
      vo0 = math_min_f32(vo0, vmax);
3502
3503
0
      *o0++ = vo0;
3504
0
    }
3505
3506
0
    i0 = (const float*) ((uintptr_t) i1);
3507
0
    i1 = (const float*) ((uintptr_t) i2);
3508
0
    i2 = (const float*) ((uintptr_t) i1 + input_width);
3509
3510
3511
0
    output_height -= 1;
3512
0
    padded_input_height -= 2;
3513
0
  } while (output_height != 0);
3514
0
}
3515
3516
void xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2(
3517
    size_t input_height,
3518
    size_t input_width,
3519
    const float* input,
3520
    const float* weights,
3521
    const float* zero,
3522
    float* output,
3523
    uint32_t padding_top,
3524
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
3525
0
{
3526
0
  assert(input_height != 0);
3527
0
  assert(input_width != 0);
3528
0
  assert(input_width % sizeof(float) == 0);
3529
0
  assert(padding_top >= 0);
3530
0
  assert(padding_top <= 1);
3531
3532
0
  const float vmin = params->scalar.min;
3533
0
  const float vmax = params->scalar.max;
3534
3535
0
  const float vbias = weights[0];
3536
0
  const float vk00 = weights[1];
3537
0
  const float vk01 = weights[2];
3538
0
  const float vk02 = weights[3];
3539
0
  const float vk10 = weights[4];
3540
0
  const float vk11 = weights[5];
3541
0
  const float vk12 = weights[6];
3542
0
  const float vk20 = weights[7];
3543
0
  const float vk21 = weights[8];
3544
0
  const float vk22 = weights[9];
3545
3546
0
  const size_t output_width = round_down_po2((input_width + (2 /* padding */ - 3 /* kernel size */ + 2 /* subsampling */) * sizeof(float)) / 2, sizeof(float));
3547
3548
0
  const float* i0 = (const float*) ((uintptr_t) input - ((-padding_top) & input_width));
3549
0
  const float* i1 = (const float*) ((uintptr_t) i0 + input_width);
3550
0
  if XNN_UNPREDICTABLE(padding_top != 0) {
3551
0
    i0 = zero;
3552
0
  }
3553
0
  const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
3554
0
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
3555
0
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
3556
3557
0
  float* o0 = output;
3558
0
  float* o1 = (float*) ((uintptr_t) o0 + output_width);
3559
3560
0
  size_t padded_input_height = input_height + padding_top + 1 /* padding bottom */;
3561
0
  size_t output_height = (padded_input_height - 3 /* kernel size */ + 2 /* subsampling */) / 2;
3562
0
  do {
3563
0
    if XNN_UNPREDICTABLE(padded_input_height < 4) {
3564
0
      i2 = zero;
3565
0
    }
3566
0
    if XNN_UNPREDICTABLE(padded_input_height < 5) {
3567
0
      i3 = zero;
3568
0
      o1 = o0;
3569
0
    }
3570
0
    if XNN_UNPREDICTABLE(padded_input_height < 6) {
3571
0
      i4 = zero;
3572
0
    }
3573
3574
0
    float vi0x0 = 0.0f;
3575
0
    float vi1x0 = 0.0f;
3576
0
    float vi2x0 = 0.0f;
3577
0
    float vi3x0 = 0.0f;
3578
0
    float vi4x0 = 0.0f;
3579
3580
0
    size_t w = input_width;
3581
0
    for (; w >= 2 * sizeof(float); w -= 2 * sizeof(float)) {
3582
0
      const float vi0x1 = i0[0];
3583
0
      const float vi1x1 = i1[0];
3584
0
      const float vi2x1 = i2[0];
3585
0
      const float vi3x1 = i3[0];
3586
0
      const float vi4x1 = i4[0];
3587
3588
0
      float vo0p0 = vbias + vi0x0 * vk00;
3589
0
      float vo1p0 = vbias + vi2x0 * vk00;
3590
0
      float vo0p1 = vi1x0 * vk10;
3591
0
      float vo1p1 = vi3x0 * vk10;
3592
0
      vo0p0 += vi2x0 * vk20;
3593
0
      vo1p0 += vi4x0 * vk20;
3594
3595
0
      const float vi0x2 = i0[1];
3596
0
      i0 += 2;
3597
0
      const float vi1x2 = i1[1];
3598
0
      i1 += 2;
3599
0
      const float vi2x2 = i2[1];
3600
0
      i2 += 2;
3601
0
      const float vi3x2 = i3[1];
3602
0
      i3 += 2;
3603
0
      const float vi4x2 = i4[1];
3604
0
      i4 += 2;
3605
3606
0
      vo0p1 += vi0x1 * vk01;
3607
0
      vo1p1 += vi2x1 * vk01;
3608
0
      vo0p0 += vi1x1 * vk11;
3609
0
      vo1p0 += vi3x1 * vk11;
3610
0
      vo0p1 += vi2x1 * vk21;
3611
0
      vo1p1 += vi4x1 * vk21;
3612
3613
0
      vi0x0 = vi0x2;
3614
0
      vi1x0 = vi1x2;
3615
0
      vi2x0 = vi2x2;
3616
0
      vi3x0 = vi3x2;
3617
0
      vi4x0 = vi4x2;
3618
3619
0
      vo0p0 += vi0x2 * vk02;
3620
0
      vo1p0 += vi2x2 * vk02;
3621
0
      vo0p1 += vi1x2 * vk12;
3622
0
      vo1p1 += vi3x2 * vk12;
3623
0
      vo0p0 += vi2x2 * vk22;
3624
0
      vo1p0 += vi4x2 * vk22;
3625
3626
0
      vo0p0 += vo0p1;
3627
0
      vo1p0 += vo1p1;
3628
3629
0
      float vo0 = math_max_f32(vo0p0, vmin);
3630
0
      float vo1 = math_max_f32(vo1p0, vmin);
3631
3632
0
      vo0 = math_min_f32(vo0, vmax);
3633
0
      vo1 = math_min_f32(vo1, vmax);
3634
3635
0
      *o1++ = vo1;
3636
0
      *o0++ = vo0;
3637
0
    }
3638
    // Potentially process the last pixel.
3639
0
    assert(w <= 1 * sizeof(float));
3640
0
    if (w != 0) {
3641
0
      const float vi0x1 = *i0++;
3642
0
      const float vi1x1 = *i1++;
3643
0
      const float vi2x1 = *i2++;
3644
0
      const float vi3x1 = *i3++;
3645
0
      const float vi4x1 = *i4++;
3646
3647
0
      float vo0p0 = vbias + vi0x0 * vk00;
3648
0
      float vo1p0 = vbias + vi2x0 * vk00;
3649
0
      float vo0p1 = vi1x0 * vk10;
3650
0
      float vo1p1 = vi3x0 * vk10;
3651
0
      vo0p0 += vi2x0 * vk20;
3652
0
      vo1p0 += vi4x0 * vk20;
3653
3654
0
      vo0p1 += vi0x1 * vk01;
3655
0
      vo1p1 += vi2x1 * vk01;
3656
0
      vo0p0 += vi1x1 * vk11;
3657
0
      vo1p0 += vi3x1 * vk11;
3658
0
      vo0p1 += vi2x1 * vk21;
3659
0
      vo1p1 += vi4x1 * vk21;
3660
3661
0
      vo0p0 += vo0p1;
3662
0
      vo1p0 += vo1p1;
3663
3664
0
      float vo0 = math_max_f32(vo0p0, vmin);
3665
0
      float vo1 = math_max_f32(vo1p0, vmin);
3666
3667
0
      vo0 = math_min_f32(vo0, vmax);
3668
0
      vo1 = math_min_f32(vo1, vmax);
3669
3670
0
      *o1++ = vo1;
3671
0
      *o0++ = vo0;
3672
0
    }
3673
3674
0
    i0 = (const float*) ((uintptr_t) i3);
3675
0
    i1 = (const float*) ((uintptr_t) i4);
3676
0
    i2 = (const float*) ((uintptr_t) i1 + input_width);
3677
0
    i3 = (const float*) ((uintptr_t) i2 + input_width);
3678
0
    i4 = (const float*) ((uintptr_t) i3 + input_width);
3679
3680
0
    o0 = o1;
3681
0
    o1 = (float*) ((uintptr_t) o0 + output_width);
3682
3683
0
    output_height = doz(output_height, 2);
3684
0
    padded_input_height = doz(padded_input_height, 4);
3685
0
  } while (output_height != 0);
3686
0
}
3687
3688
void xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5(
3689
    size_t input_height,
3690
    size_t input_width,
3691
    const float* input,
3692
    const float* weights,
3693
    const float* zero,
3694
    float* output,
3695
    uint32_t padding_top,
3696
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
3697
0
{
3698
0
  assert(input_height != 0);
3699
0
  assert(input_width != 0);
3700
0
  assert(input_width % sizeof(float) == 0);
3701
0
  assert(padding_top == 2);
3702
3703
0
  const float vmin = params->scalar.min;
3704
0
  const float vmax = params->scalar.max;
3705
3706
0
  const float vbias = weights[0];
3707
0
  const float vk00 = weights[1];
3708
0
  const float vk01 = weights[2];
3709
0
  const float vk02 = weights[3];
3710
0
  const float vk03 = weights[4];
3711
0
  const float vk04 = weights[5];
3712
0
  const float vk10 = weights[6];
3713
0
  const float vk11 = weights[7];
3714
0
  const float vk12 = weights[8];
3715
0
  const float vk13 = weights[9];
3716
0
  const float vk14 = weights[10];
3717
0
  const float vk20 = weights[11];
3718
0
  const float vk21 = weights[12];
3719
0
  const float vk22 = weights[13];
3720
0
  const float vk23 = weights[14];
3721
0
  const float vk24 = weights[15];
3722
0
  const float vk30 = weights[16];
3723
0
  const float vk31 = weights[17];
3724
0
  const float vk32 = weights[18];
3725
0
  const float vk33 = weights[19];
3726
0
  const float vk34 = weights[20];
3727
0
  const float vk40 = weights[21];
3728
0
  const float vk41 = weights[22];
3729
0
  const float vk42 = weights[23];
3730
0
  const float vk43 = weights[24];
3731
0
  const float vk44 = weights[25];
3732
3733
0
  const float* i0 = zero;
3734
0
  const float* i1 = zero;
3735
0
  const float* i2 = input;
3736
0
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
3737
0
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
3738
3739
0
  float* o0 = output;
3740
3741
0
  size_t output_height = input_height;
3742
0
  do {
3743
0
    if XNN_UNPREDICTABLE(output_height < 2) {
3744
0
      i3 = zero;
3745
0
    }
3746
0
    if XNN_UNPREDICTABLE(output_height < 3) {
3747
0
      i4 = zero;
3748
0
    }
3749
3750
0
    float vi0x0 = 0.0f;
3751
0
    float vi1x0 = 0.0f;
3752
0
    float vi2x0 = 0.0f;
3753
0
    float vi3x0 = 0.0f;
3754
0
    float vi4x0 = 0.0f;
3755
3756
0
    float vi0x1 = 0.0f;
3757
0
    float vi1x1 = 0.0f;
3758
0
    float vi2x1 = 0.0f;
3759
0
    float vi3x1 = 0.0f;
3760
0
    float vi4x1 = 0.0f;
3761
3762
0
    float vi0x2 = *i0++;
3763
0
    float vi1x2 = *i1++;
3764
0
    float vi2x2 = *i2++;
3765
0
    float vi3x2 = *i3++;
3766
0
    float vi4x2 = *i4++;
3767
3768
0
    size_t w = input_width;
3769
0
    if (w > 1 * sizeof(float)) {
3770
0
      float vi0x3 = *i0++;
3771
0
      float vi1x3 = *i1++;
3772
0
      float vi2x3 = *i2++;
3773
0
      float vi3x3 = *i3++;
3774
0
      float vi4x3 = *i4++;
3775
3776
0
      for (; w > 2 * sizeof(float); w -= 1 * sizeof(float)) {
3777
0
        const float vi0x4 = *i0++;
3778
0
        const float vi1x4 = *i1++;
3779
0
        const float vi2x4 = *i2++;
3780
0
        const float vi3x4 = *i3++;
3781
0
        const float vi4x4 = *i4++;
3782
3783
0
        float vo0p0 = vbias + vi0x0 * vk00;
3784
0
        float vo0p1 = vi1x0 * vk10;
3785
0
        float vo0p2 = vi2x0 * vk20;
3786
0
        float vo0p3 = vi3x0 * vk30;
3787
0
        float vo0p4 = vi4x0 * vk40;
3788
3789
0
        vi0x0 = vi0x1;
3790
0
        vi1x0 = vi1x1;
3791
0
        vi2x0 = vi2x1;
3792
0
        vi3x0 = vi3x1;
3793
0
        vi4x0 = vi4x1;
3794
3795
0
        vo0p0 += vi0x1 * vk01;
3796
0
        vo0p1 += vi1x1 * vk11;
3797
0
        vo0p2 += vi2x1 * vk21;
3798
0
        vo0p3 += vi3x1 * vk31;
3799
0
        vo0p4 += vi4x1 * vk41;
3800
3801
0
        vi0x1 = vi0x2;
3802
0
        vi1x1 = vi1x2;
3803
0
        vi2x1 = vi2x2;
3804
0
        vi3x1 = vi3x2;
3805
0
        vi4x1 = vi4x2;
3806
3807
0
        vo0p0 += vi0x2 * vk02;
3808
0
        vo0p1 += vi1x2 * vk12;
3809
0
        vo0p2 += vi2x2 * vk22;
3810
0
        vo0p3 += vi3x2 * vk32;
3811
0
        vo0p4 += vi4x2 * vk42;
3812
3813
0
        vi0x2 = vi0x3;
3814
0
        vi1x2 = vi1x3;
3815
0
        vi2x2 = vi2x3;
3816
0
        vi3x2 = vi3x3;
3817
0
        vi4x2 = vi4x3;
3818
3819
0
        vo0p0 += vi0x3 * vk03;
3820
0
        vo0p1 += vi1x3 * vk13;
3821
0
        vo0p2 += vi2x3 * vk23;
3822
0
        vo0p3 += vi3x3 * vk33;
3823
0
        vo0p4 += vi4x3 * vk43;
3824
3825
0
        vi0x3 = vi0x4;
3826
0
        vi1x3 = vi1x4;
3827
0
        vi2x3 = vi2x4;
3828
0
        vi3x3 = vi3x4;
3829
0
        vi4x3 = vi4x4;
3830
3831
0
        vo0p0 += vi0x4 * vk04;
3832
0
        vo0p1 += vi1x4 * vk14;
3833
0
        vo0p2 += vi2x4 * vk24;
3834
0
        vo0p3 += vi3x4 * vk34;
3835
0
        vo0p4 += vi4x4 * vk44;
3836
3837
0
        vo0p0 += vo0p1;
3838
0
        vo0p2 += vo0p3;
3839
0
        vo0p0 += vo0p2;
3840
0
        vo0p0 += vo0p4;
3841
3842
0
        float vo0 = math_max_f32(vo0p0, vmin);
3843
3844
0
        vo0 = math_min_f32(vo0, vmax);
3845
3846
0
        *o0++ = vo0;
3847
0
      }
3848
0
      assert(w == 2 * sizeof(float));
3849
0
      {
3850
0
        float vo0p0 = vbias + vi0x0 * vk00;
3851
0
        float vo0p1 = vi1x0 * vk10;
3852
0
        float vo0p2 = vi2x0 * vk20;
3853
0
        float vo0p3 = vi3x0 * vk30;
3854
0
        float vo0p4 = vi4x0 * vk40;
3855
3856
0
        vi0x0 = vi0x1;
3857
0
        vi1x0 = vi1x1;
3858
0
        vi2x0 = vi2x1;
3859
0
        vi3x0 = vi3x1;
3860
0
        vi4x0 = vi4x1;
3861
3862
0
        vo0p0 += vi0x1 * vk01;
3863
0
        vo0p1 += vi1x1 * vk11;
3864
0
        vo0p2 += vi2x1 * vk21;
3865
0
        vo0p3 += vi3x1 * vk31;
3866
0
        vo0p4 += vi4x1 * vk41;
3867
3868
0
        vi0x1 = vi0x2;
3869
0
        vi1x1 = vi1x2;
3870
0
        vi2x1 = vi2x2;
3871
0
        vi3x1 = vi3x2;
3872
0
        vi4x1 = vi4x2;
3873
3874
0
        vo0p0 += vi0x2 * vk02;
3875
0
        vo0p1 += vi1x2 * vk12;
3876
0
        vo0p2 += vi2x2 * vk22;
3877
0
        vo0p3 += vi3x2 * vk32;
3878
0
        vo0p4 += vi4x2 * vk42;
3879
3880
0
        vi0x2 = vi0x3;
3881
0
        vi1x2 = vi1x3;
3882
0
        vi2x2 = vi2x3;
3883
0
        vi3x2 = vi3x3;
3884
0
        vi4x2 = vi4x3;
3885
3886
0
        vo0p0 += vi0x3 * vk03;
3887
0
        vo0p1 += vi1x3 * vk13;
3888
0
        vo0p2 += vi2x3 * vk23;
3889
0
        vo0p3 += vi3x3 * vk33;
3890
0
        vo0p4 += vi4x3 * vk43;
3891
3892
0
        vo0p0 += vo0p1;
3893
0
        vo0p2 += vo0p3;
3894
0
        vo0p0 += vo0p2;
3895
0
        vo0p0 += vo0p4;
3896
3897
0
        float vo0 = math_max_f32(vo0p0, vmin);
3898
3899
0
        vo0 = math_min_f32(vo0, vmax);
3900
3901
0
        *o0++ = vo0;
3902
0
      }
3903
0
      w -= 1 * sizeof(float);
3904
0
    }
3905
0
    assert(w == 1 * sizeof(float));
3906
0
    {
3907
0
      float vo0p0 = vbias + vi0x0 * vk00;
3908
0
      float vo0p1 = vi1x0 * vk10;
3909
0
      float vo0p2 = vi2x0 * vk20;
3910
0
      float vo0p3 = vi3x0 * vk30;
3911
0
      float vo0p4 = vi4x0 * vk40;
3912
3913
0
      vo0p0 += vi0x1 * vk01;
3914
0
      vo0p1 += vi1x1 * vk11;
3915
0
      vo0p2 += vi2x1 * vk21;
3916
0
      vo0p3 += vi3x1 * vk31;
3917
0
      vo0p4 += vi4x1 * vk41;
3918
3919
0
      vo0p0 += vi0x2 * vk02;
3920
0
      vo0p1 += vi1x2 * vk12;
3921
0
      vo0p2 += vi2x2 * vk22;
3922
0
      vo0p3 += vi3x2 * vk32;
3923
0
      vo0p4 += vi4x2 * vk42;
3924
3925
0
      vo0p0 += vo0p1;
3926
0
      vo0p2 += vo0p3;
3927
0
      vo0p0 += vo0p2;
3928
0
      vo0p0 += vo0p4;
3929
3930
0
      float vo0 = math_max_f32(vo0p0, vmin);
3931
3932
0
      vo0 = math_min_f32(vo0, vmax);
3933
3934
0
      *o0++ = vo0;
3935
0
    }
3936
3937
0
    i0 = (const float*) ((uintptr_t) i1 - input_width);
3938
0
    i1 = (const float*) ((uintptr_t) i2 - input_width);
3939
3940
3941
0
  } while (--output_height != 0);
3942
0
}
3943
3944
void xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2(
3945
    size_t input_height,
3946
    size_t input_width,
3947
    const float* input,
3948
    const float* weights,
3949
    const float* zero,
3950
    float* output,
3951
    uint32_t padding_top,
3952
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
3953
0
{
3954
0
  assert(input_height != 0);
3955
0
  assert(input_width != 0);
3956
0
  assert(input_width % sizeof(float) == 0);
3957
0
  assert(padding_top == 2);
3958
3959
0
  const float vmin = params->scalar.min;
3960
0
  const float vmax = params->scalar.max;
3961
3962
0
  const float vbias = weights[0];
3963
0
  const float vk00 = weights[1];
3964
0
  const float vk01 = weights[2];
3965
0
  const float vk02 = weights[3];
3966
0
  const float vk03 = weights[4];
3967
0
  const float vk04 = weights[5];
3968
0
  const float vk10 = weights[6];
3969
0
  const float vk11 = weights[7];
3970
0
  const float vk12 = weights[8];
3971
0
  const float vk13 = weights[9];
3972
0
  const float vk14 = weights[10];
3973
0
  const float vk20 = weights[11];
3974
0
  const float vk21 = weights[12];
3975
0
  const float vk22 = weights[13];
3976
0
  const float vk23 = weights[14];
3977
0
  const float vk24 = weights[15];
3978
0
  const float vk30 = weights[16];
3979
0
  const float vk31 = weights[17];
3980
0
  const float vk32 = weights[18];
3981
0
  const float vk33 = weights[19];
3982
0
  const float vk34 = weights[20];
3983
0
  const float vk40 = weights[21];
3984
0
  const float vk41 = weights[22];
3985
0
  const float vk42 = weights[23];
3986
0
  const float vk43 = weights[24];
3987
0
  const float vk44 = weights[25];
3988
3989
0
  const float* i0 = zero;
3990
0
  const float* i1 = zero;
3991
0
  const float* i2 = input;
3992
0
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
3993
0
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
3994
0
  const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
3995
3996
0
  float* o0 = output;
3997
0
  float* o1 = (float*) ((uintptr_t) o0 + input_width);
3998
3999
0
  size_t output_height = input_height;
4000
0
  do {
4001
0
    if XNN_UNPREDICTABLE(output_height < 2) {
4002
0
      i3 = zero;
4003
0
      o1 = o0;
4004
0
    }
4005
0
    if XNN_UNPREDICTABLE(output_height < 3) {
4006
0
      i4 = zero;
4007
0
    }
4008
0
    if XNN_UNPREDICTABLE(output_height < 4) {
4009
0
      i5 = zero;
4010
0
    }
4011
4012
0
    float vi0x0 = 0.0f;
4013
0
    float vi1x0 = 0.0f;
4014
0
    float vi2x0 = 0.0f;
4015
0
    float vi3x0 = 0.0f;
4016
0
    float vi4x0 = 0.0f;
4017
0
    float vi5x0 = 0.0f;
4018
4019
0
    float vi0x1 = 0.0f;
4020
0
    float vi1x1 = 0.0f;
4021
0
    float vi2x1 = 0.0f;
4022
0
    float vi3x1 = 0.0f;
4023
0
    float vi4x1 = 0.0f;
4024
0
    float vi5x1 = 0.0f;
4025
4026
0
    float vi0x2 = *i0++;
4027
0
    float vi1x2 = *i1++;
4028
0
    float vi2x2 = *i2++;
4029
0
    float vi3x2 = *i3++;
4030
0
    float vi4x2 = *i4++;
4031
0
    float vi5x2 = *i5++;
4032
4033
0
    size_t w = input_width;
4034
0
    if (w > 1 * sizeof(float)) {
4035
0
      float vi0x3 = *i0++;
4036
0
      float vi1x3 = *i1++;
4037
0
      float vi2x3 = *i2++;
4038
0
      float vi3x3 = *i3++;
4039
0
      float vi4x3 = *i4++;
4040
0
      float vi5x3 = *i5++;
4041
4042
0
      for (; w > 2 * sizeof(float); w -= 1 * sizeof(float)) {
4043
0
        const float vi0x4 = *i0++;
4044
0
        const float vi1x4 = *i1++;
4045
0
        const float vi2x4 = *i2++;
4046
0
        const float vi3x4 = *i3++;
4047
0
        const float vi4x4 = *i4++;
4048
0
        const float vi5x4 = *i5++;
4049
4050
0
        float vo0p0 = vbias + vi0x0 * vk00;
4051
0
        float vo1p0 = vbias + vi1x0 * vk00;
4052
0
        float vo0p1 = vi1x0 * vk10;
4053
0
        float vo1p1 = vi2x0 * vk10;
4054
0
        vo0p0 += vi2x0 * vk20;
4055
0
        vo1p0 += vi3x0 * vk20;
4056
0
        vo0p1 += vi3x0 * vk30;
4057
0
        vo1p1 += vi4x0 * vk30;
4058
0
        vo0p0 += vi4x0 * vk40;
4059
0
        vo1p0 += vi5x0 * vk40;
4060
4061
0
        vi0x0 = vi0x1;
4062
0
        vi1x0 = vi1x1;
4063
0
        vi2x0 = vi2x1;
4064
0
        vi3x0 = vi3x1;
4065
0
        vi4x0 = vi4x1;
4066
0
        vi5x0 = vi5x1;
4067
4068
0
        vo0p1 += vi0x1 * vk01;
4069
0
        vo1p1 += vi1x1 * vk01;
4070
0
        vo0p0 += vi1x1 * vk11;
4071
0
        vo1p0 += vi2x1 * vk11;
4072
0
        vo0p1 += vi2x1 * vk21;
4073
0
        vo1p1 += vi3x1 * vk21;
4074
0
        vo0p0 += vi3x1 * vk31;
4075
0
        vo1p0 += vi4x1 * vk31;
4076
0
        vo0p1 += vi4x1 * vk41;
4077
0
        vo1p1 += vi5x1 * vk41;
4078
4079
0
        vi0x1 = vi0x2;
4080
0
        vi1x1 = vi1x2;
4081
0
        vi2x1 = vi2x2;
4082
0
        vi3x1 = vi3x2;
4083
0
        vi4x1 = vi4x2;
4084
0
        vi5x1 = vi5x2;
4085
4086
0
        vo0p0 += vi0x2 * vk02;
4087
0
        vo1p0 += vi1x2 * vk02;
4088
0
        vo0p1 += vi1x2 * vk12;
4089
0
        vo1p1 += vi2x2 * vk12;
4090
0
        vo0p0 += vi2x2 * vk22;
4091
0
        vo1p0 += vi3x2 * vk22;
4092
0
        vo0p1 += vi3x2 * vk32;
4093
0
        vo1p1 += vi4x2 * vk32;
4094
0
        vo0p0 += vi4x2 * vk42;
4095
0
        vo1p0 += vi5x2 * vk42;
4096
4097
0
        vi0x2 = vi0x3;
4098
0
        vi1x2 = vi1x3;
4099
0
        vi2x2 = vi2x3;
4100
0
        vi3x2 = vi3x3;
4101
0
        vi4x2 = vi4x3;
4102
0
        vi5x2 = vi5x3;
4103
4104
0
        vo0p1 += vi0x3 * vk03;
4105
0
        vo1p1 += vi1x3 * vk03;
4106
0
        vo0p0 += vi1x3 * vk13;
4107
0
        vo1p0 += vi2x3 * vk13;
4108
0
        vo0p1 += vi2x3 * vk23;
4109
0
        vo1p1 += vi3x3 * vk23;
4110
0
        vo0p0 += vi3x3 * vk33;
4111
0
        vo1p0 += vi4x3 * vk33;
4112
0
        vo0p1 += vi4x3 * vk43;
4113
0
        vo1p1 += vi5x3 * vk43;
4114
4115
0
        vi0x3 = vi0x4;
4116
0
        vi1x3 = vi1x4;
4117
0
        vi2x3 = vi2x4;
4118
0
        vi3x3 = vi3x4;
4119
0
        vi4x3 = vi4x4;
4120
0
        vi5x3 = vi5x4;
4121
4122
0
        vo0p0 += vi0x4 * vk04;
4123
0
        vo1p0 += vi1x4 * vk04;
4124
0
        vo0p1 += vi1x4 * vk14;
4125
0
        vo1p1 += vi2x4 * vk14;
4126
0
        vo0p0 += vi2x4 * vk24;
4127
0
        vo1p0 += vi3x4 * vk24;
4128
0
        vo0p1 += vi3x4 * vk34;
4129
0
        vo1p1 += vi4x4 * vk34;
4130
0
        vo0p0 += vi4x4 * vk44;
4131
0
        vo1p0 += vi5x4 * vk44;
4132
4133
0
        vo0p0 += vo0p1;
4134
0
        vo1p0 += vo1p1;
4135
4136
0
        float vo0 = math_max_f32(vo0p0, vmin);
4137
0
        float vo1 = math_max_f32(vo1p0, vmin);
4138
4139
0
        vo0 = math_min_f32(vo0, vmax);
4140
0
        vo1 = math_min_f32(vo1, vmax);
4141
4142
0
        *o1++ = vo1;
4143
0
        *o0++ = vo0;
4144
0
      }
4145
0
      assert(w == 2 * sizeof(float));
4146
0
      {
4147
0
        float vo0p0 = vbias + vi0x0 * vk00;
4148
0
        float vo1p0 = vbias + vi1x0 * vk00;
4149
0
        float vo0p1 = vi1x0 * vk10;
4150
0
        float vo1p1 = vi2x0 * vk10;
4151
0
        vo0p0 += vi2x0 * vk20;
4152
0
        vo1p0 += vi3x0 * vk20;
4153
0
        vo0p1 += vi3x0 * vk30;
4154
0
        vo1p1 += vi4x0 * vk30;
4155
0
        vo0p0 += vi4x0 * vk40;
4156
0
        vo1p0 += vi5x0 * vk40;
4157
4158
0
        vi0x0 = vi0x1;
4159
0
        vi1x0 = vi1x1;
4160
0
        vi2x0 = vi2x1;
4161
0
        vi3x0 = vi3x1;
4162
0
        vi4x0 = vi4x1;
4163
0
        vi5x0 = vi5x1;
4164
4165
0
        vo0p1 += vi0x1 * vk01;
4166
0
        vo1p1 += vi1x1 * vk01;
4167
0
        vo0p0 += vi1x1 * vk11;
4168
0
        vo1p0 += vi2x1 * vk11;
4169
0
        vo0p1 += vi2x1 * vk21;
4170
0
        vo1p1 += vi3x1 * vk21;
4171
0
        vo0p0 += vi3x1 * vk31;
4172
0
        vo1p0 += vi4x1 * vk31;
4173
0
        vo0p1 += vi4x1 * vk41;
4174
0
        vo1p1 += vi5x1 * vk41;
4175
4176
0
        vi0x1 = vi0x2;
4177
0
        vi1x1 = vi1x2;
4178
0
        vi2x1 = vi2x2;
4179
0
        vi3x1 = vi3x2;
4180
0
        vi4x1 = vi4x2;
4181
0
        vi5x1 = vi5x2;
4182
4183
0
        vo0p0 += vi0x2 * vk02;
4184
0
        vo1p0 += vi1x2 * vk02;
4185
0
        vo0p1 += vi1x2 * vk12;
4186
0
        vo1p1 += vi2x2 * vk12;
4187
0
        vo0p0 += vi2x2 * vk22;
4188
0
        vo1p0 += vi3x2 * vk22;
4189
0
        vo0p1 += vi3x2 * vk32;
4190
0
        vo1p1 += vi4x2 * vk32;
4191
0
        vo0p0 += vi4x2 * vk42;
4192
0
        vo1p0 += vi5x2 * vk42;
4193
4194
0
        vi0x2 = vi0x3;
4195
0
        vi1x2 = vi1x3;
4196
0
        vi2x2 = vi2x3;
4197
0
        vi3x2 = vi3x3;
4198
0
        vi4x2 = vi4x3;
4199
0
        vi5x2 = vi5x3;
4200
4201
0
        vo0p1 += vi0x3 * vk03;
4202
0
        vo1p1 += vi1x3 * vk03;
4203
0
        vo0p0 += vi1x3 * vk13;
4204
0
        vo1p0 += vi2x3 * vk13;
4205
0
        vo0p1 += vi2x3 * vk23;
4206
0
        vo1p1 += vi3x3 * vk23;
4207
0
        vo0p0 += vi3x3 * vk33;
4208
0
        vo1p0 += vi4x3 * vk33;
4209
0
        vo0p1 += vi4x3 * vk43;
4210
0
        vo1p1 += vi5x3 * vk43;
4211
4212
0
        vo0p0 += vo0p1;
4213
0
        vo1p0 += vo1p1;
4214
4215
0
        float vo0 = math_max_f32(vo0p0, vmin);
4216
0
        float vo1 = math_max_f32(vo1p0, vmin);
4217
4218
0
        vo0 = math_min_f32(vo0, vmax);
4219
0
        vo1 = math_min_f32(vo1, vmax);
4220
4221
0
        *o1++ = vo1;
4222
0
        *o0++ = vo0;
4223
0
      }
4224
0
      w -= 1 * sizeof(float);
4225
0
    }
4226
0
    assert(w == 1 * sizeof(float));
4227
0
    {
4228
0
      float vo0p0 = vbias + vi0x0 * vk00;
4229
0
      float vo1p0 = vbias + vi1x0 * vk00;
4230
0
      float vo0p1 = vi1x0 * vk10;
4231
0
      float vo1p1 = vi2x0 * vk10;
4232
0
      vo0p0 += vi2x0 * vk20;
4233
0
      vo1p0 += vi3x0 * vk20;
4234
0
      vo0p1 += vi3x0 * vk30;
4235
0
      vo1p1 += vi4x0 * vk30;
4236
0
      vo0p0 += vi4x0 * vk40;
4237
0
      vo1p0 += vi5x0 * vk40;
4238
4239
0
      vo0p1 += vi0x1 * vk01;
4240
0
      vo1p1 += vi1x1 * vk01;
4241
0
      vo0p0 += vi1x1 * vk11;
4242
0
      vo1p0 += vi2x1 * vk11;
4243
0
      vo0p1 += vi2x1 * vk21;
4244
0
      vo1p1 += vi3x1 * vk21;
4245
0
      vo0p0 += vi3x1 * vk31;
4246
0
      vo1p0 += vi4x1 * vk31;
4247
0
      vo0p1 += vi4x1 * vk41;
4248
0
      vo1p1 += vi5x1 * vk41;
4249
4250
0
      vo0p0 += vi0x2 * vk02;
4251
0
      vo1p0 += vi1x2 * vk02;
4252
0
      vo0p1 += vi1x2 * vk12;
4253
0
      vo1p1 += vi2x2 * vk12;
4254
0
      vo0p0 += vi2x2 * vk22;
4255
0
      vo1p0 += vi3x2 * vk22;
4256
0
      vo0p1 += vi3x2 * vk32;
4257
0
      vo1p1 += vi4x2 * vk32;
4258
0
      vo0p0 += vi4x2 * vk42;
4259
0
      vo1p0 += vi5x2 * vk42;
4260
4261
0
      vo0p0 += vo0p1;
4262
0
      vo1p0 += vo1p1;
4263
4264
0
      float vo0 = math_max_f32(vo0p0, vmin);
4265
0
      float vo1 = math_max_f32(vo1p0, vmin);
4266
4267
0
      vo0 = math_min_f32(vo0, vmax);
4268
0
      vo1 = math_min_f32(vo1, vmax);
4269
4270
0
      *o1++ = vo1;
4271
0
      *o0++ = vo0;
4272
0
    }
4273
4274
0
    i0 = (const float*) ((uintptr_t) i2 - input_width);
4275
0
    i1 = (const float*) ((uintptr_t) i3 - input_width);
4276
0
    i2 = i3;
4277
0
    i3 = i4;
4278
0
    i4 = i5;
4279
0
    i5 = (const float*) ((uintptr_t) i4 + input_width);
4280
4281
0
    o0 = o1;
4282
0
    o1 = (float*) ((uintptr_t) o0 + input_width);
4283
4284
0
    output_height = doz(output_height, 2);
4285
0
  } while (output_height != 0);
4286
0
}
4287
4288
void xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5(
4289
    size_t input_height,
4290
    size_t input_width,
4291
    const float* input,
4292
    const float* weights,
4293
    const float* zero,
4294
    float* output,
4295
    uint32_t padding_top,
4296
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
4297
0
{
4298
0
  assert(input_height != 0);
4299
0
  assert(input_width != 0);
4300
0
  assert(input_width % sizeof(float) == 0);
4301
0
  assert(padding_top >= 1);
4302
0
  assert(padding_top <= 2);
4303
4304
0
  const float vmax = params->scalar.max;
4305
0
  const float vmin = params->scalar.min;
4306
4307
0
  const float vbias = weights[0];
4308
0
  const float vk00 = weights[1];
4309
0
  const float vk01 = weights[2];
4310
0
  const float vk02 = weights[3];
4311
0
  const float vk03 = weights[4];
4312
0
  const float vk04 = weights[5];
4313
0
  const float vk10 = weights[6];
4314
0
  const float vk11 = weights[7];
4315
0
  const float vk12 = weights[8];
4316
0
  const float vk13 = weights[9];
4317
0
  const float vk14 = weights[10];
4318
0
  const float vk20 = weights[11];
4319
0
  const float vk21 = weights[12];
4320
0
  const float vk22 = weights[13];
4321
0
  const float vk23 = weights[14];
4322
0
  const float vk24 = weights[15];
4323
0
  const float vk30 = weights[16];
4324
0
  const float vk31 = weights[17];
4325
0
  const float vk32 = weights[18];
4326
0
  const float vk33 = weights[19];
4327
0
  const float vk34 = weights[20];
4328
0
  const float vk40 = weights[21];
4329
0
  const float vk41 = weights[22];
4330
0
  const float vk42 = weights[23];
4331
0
  const float vk43 = weights[24];
4332
0
  const float vk44 = weights[25];
4333
4334
0
  const uint32_t padding_top_less_1 = padding_top - 1;
4335
4336
0
  const float* i0 = zero;
4337
0
  const float* i1 = (const float*) ((uintptr_t) input - ((-padding_top_less_1) & input_width));
4338
0
  const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
4339
0
  if XNN_UNPREDICTABLE(padding_top_less_1 != 0) {
4340
0
    i1 = zero;
4341
0
  }
4342
0
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
4343
0
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
4344
4345
4346
0
  float* o0 = output;
4347
4348
0
  size_t padded_input_height = input_height + (padding_top_less_1 + 1) + 2 /* padding bottom */;
4349
0
  size_t output_height = (padded_input_height - 5 /* kernel size */ + 2 /* subsampling */) / 2;
4350
0
  do {
4351
0
    if XNN_UNPREDICTABLE(padded_input_height < 6) {
4352
0
      i3 = zero;
4353
0
    }
4354
0
    if XNN_UNPREDICTABLE(padded_input_height < 7) {
4355
0
      i4 = zero;
4356
0
    }
4357
4358
0
    float vi0x0 = 0.0f;
4359
0
    float vi1x0 = 0.0f;
4360
0
    float vi2x0 = 0.0f;
4361
0
    float vi3x0 = 0.0f;
4362
0
    float vi4x0 = 0.0f;
4363
4364
0
    float vi0x1 = 0.0f;
4365
0
    float vi1x1 = 0.0f;
4366
0
    float vi2x1 = 0.0f;
4367
0
    float vi3x1 = 0.0f;
4368
0
    float vi4x1 = 0.0f;
4369
4370
0
    float vi0x2 = *i0++;
4371
0
    float vi1x2 = *i1++;
4372
0
    float vi2x2 = *i2++;
4373
0
    float vi3x2 = *i3++;
4374
0
    float vi4x2 = *i4++;
4375
4376
0
    size_t w = input_width;
4377
0
    for (; w > 2 * sizeof(float); w -= 2 * sizeof(float)) {
4378
0
      const float vi0x3 = i0[0];
4379
0
      const float vi1x3 = i1[0];
4380
0
      const float vi2x3 = i2[0];
4381
0
      const float vi3x3 = i3[0];
4382
0
      const float vi4x3 = i4[0];
4383
4384
0
      const float vi0x4 = i0[1];
4385
0
      i0 += 2;
4386
0
      const float vi1x4 = i1[1];
4387
0
      i1 += 2;
4388
0
      const float vi2x4 = i2[1];
4389
0
      i2 += 2;
4390
0
      const float vi3x4 = i3[1];
4391
0
      i3 += 2;
4392
0
      const float vi4x4 = i4[1];
4393
0
      i4 += 2;
4394
4395
0
      float vo0p0 = vbias + vi0x0 * vk00;
4396
0
      float vo0p1 = vi1x0 * vk10;
4397
0
      float vo0p2 = vi2x0 * vk20;
4398
0
      float vo0p3 = vi3x0 * vk30;
4399
0
      float vo0p4 = vi4x0 * vk40;
4400
4401
0
      vi0x0 = vi0x2;
4402
0
      vi1x0 = vi1x2;
4403
0
      vi2x0 = vi2x2;
4404
0
      vi3x0 = vi3x2;
4405
0
      vi4x0 = vi4x2;
4406
4407
0
      vo0p0 += vi0x1 * vk01;
4408
0
      vo0p1 += vi1x1 * vk11;
4409
0
      vo0p2 += vi2x1 * vk21;
4410
0
      vo0p3 += vi3x1 * vk31;
4411
0
      vo0p4 += vi4x1 * vk41;
4412
4413
0
      vi0x1 = vi0x3;
4414
0
      vi1x1 = vi1x3;
4415
0
      vi2x1 = vi2x3;
4416
0
      vi3x1 = vi3x3;
4417
0
      vi4x1 = vi4x3;
4418
4419
0
      vo0p0 += vi0x2 * vk02;
4420
0
      vo0p1 += vi1x2 * vk12;
4421
0
      vo0p2 += vi2x2 * vk22;
4422
0
      vo0p3 += vi3x2 * vk32;
4423
0
      vo0p4 += vi4x2 * vk42;
4424
4425
0
      vi0x2 = vi0x4;
4426
0
      vi1x2 = vi1x4;
4427
0
      vi2x2 = vi2x4;
4428
0
      vi3x2 = vi3x4;
4429
0
      vi4x2 = vi4x4;
4430
4431
0
      vo0p0 += vi0x3 * vk03;
4432
0
      vo0p1 += vi1x3 * vk13;
4433
0
      vo0p2 += vi2x3 * vk23;
4434
0
      vo0p3 += vi3x3 * vk33;
4435
0
      vo0p4 += vi4x3 * vk43;
4436
4437
0
      vo0p0 += vi0x4 * vk04;
4438
0
      vo0p1 += vi1x4 * vk14;
4439
0
      vo0p2 += vi2x4 * vk24;
4440
0
      vo0p3 += vi3x4 * vk34;
4441
0
      vo0p4 += vi4x4 * vk44;
4442
4443
0
      vo0p0 += vo0p1;
4444
0
      vo0p2 += vo0p3;
4445
0
      vo0p0 += vo0p2;
4446
0
      vo0p0 += vo0p4;
4447
4448
0
      float vo0 = math_max_f32(vo0p0, vmin);
4449
4450
0
      vo0 = math_min_f32(vo0, vmax);
4451
4452
0
      *o0++ = vo0;
4453
0
    }
4454
0
    if XNN_LIKELY(w == 2 * sizeof(float)) {
4455
0
      const float vi0x3 = *i0++;
4456
0
      const float vi1x3 = *i1++;
4457
0
      const float vi2x3 = *i2++;
4458
0
      const float vi3x3 = *i3++;
4459
0
      const float vi4x3 = *i4++;
4460
4461
0
      float vo0p0 = vbias + vi0x0 * vk00;
4462
0
      float vo0p1 = vi1x0 * vk10;
4463
0
      float vo0p2 = vi2x0 * vk20;
4464
0
      float vo0p3 = vi3x0 * vk30;
4465
0
      float vo0p4 = vi4x0 * vk40;
4466
4467
0
      vo0p0 += vi0x1 * vk01;
4468
0
      vo0p1 += vi1x1 * vk11;
4469
0
      vo0p2 += vi2x1 * vk21;
4470
0
      vo0p3 += vi3x1 * vk31;
4471
0
      vo0p4 += vi4x1 * vk41;
4472
4473
0
      vo0p0 += vi0x2 * vk02;
4474
0
      vo0p1 += vi1x2 * vk12;
4475
0
      vo0p2 += vi2x2 * vk22;
4476
0
      vo0p3 += vi3x2 * vk32;
4477
0
      vo0p4 += vi4x2 * vk42;
4478
4479
0
      vo0p0 += vi0x3 * vk03;
4480
0
      vo0p1 += vi1x3 * vk13;
4481
0
      vo0p2 += vi2x3 * vk23;
4482
0
      vo0p3 += vi3x3 * vk33;
4483
0
      vo0p4 += vi4x3 * vk43;
4484
4485
0
      vo0p0 += vo0p1;
4486
0
      vo0p2 += vo0p3;
4487
0
      vo0p0 += vo0p2;
4488
0
      vo0p0 += vo0p4;
4489
4490
0
      float vo0 = math_max_f32(vo0p0, vmin);
4491
4492
0
      vo0 = math_min_f32(vo0, vmax);
4493
4494
0
      *o0++ = vo0;
4495
0
    } else {
4496
0
      float vo0p0 = vbias + vi0x0 * vk00;
4497
0
      float vo0p1 = vi1x0 * vk10;
4498
0
      float vo0p2 = vi2x0 * vk20;
4499
0
      float vo0p3 = vi3x0 * vk30;
4500
0
      float vo0p4 = vi4x0 * vk40;
4501
4502
0
      vo0p0 += vi0x1 * vk01;
4503
0
      vo0p1 += vi1x1 * vk11;
4504
0
      vo0p2 += vi2x1 * vk21;
4505
0
      vo0p3 += vi3x1 * vk31;
4506
0
      vo0p4 += vi4x1 * vk41;
4507
4508
0
      vo0p0 += vi0x2 * vk02;
4509
0
      vo0p1 += vi1x2 * vk12;
4510
0
      vo0p2 += vi2x2 * vk22;
4511
0
      vo0p3 += vi3x2 * vk32;
4512
0
      vo0p4 += vi4x2 * vk42;
4513
4514
0
      vo0p0 += vo0p1;
4515
0
      vo0p2 += vo0p3;
4516
0
      vo0p0 += vo0p2;
4517
0
      vo0p0 += vo0p4;
4518
4519
0
      float vo0 = math_max_f32(vo0p0, vmin);
4520
4521
0
      vo0 = math_min_f32(vo0, vmax);
4522
4523
0
      *o0++ = vo0;
4524
0
    }
4525
4526
0
    i0 = (const float*) ((uintptr_t) i2 - input_width);
4527
0
    i1 = (const float*) ((uintptr_t) i2);
4528
0
    i2 = (const float*) ((uintptr_t) i3);
4529
0
    i3 = (const float*) ((uintptr_t) i4);
4530
0
    i4 = (const float*) ((uintptr_t) i3 + input_width);
4531
4532
4533
0
    output_height -= 1;
4534
0
    padded_input_height -= 2;
4535
0
  } while (output_height != 0);
4536
0
}
4537
4538
void xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2(
4539
    size_t input_height,
4540
    size_t input_width,
4541
    const float* input,
4542
    const float* weights,
4543
    const float* zero,
4544
    float* output,
4545
    uint32_t padding_top,
4546
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
4547
0
{
4548
0
  assert(input_height != 0);
4549
0
  assert(input_width != 0);
4550
0
  assert(input_width % sizeof(float) == 0);
4551
0
  assert(padding_top >= 1);
4552
0
  assert(padding_top <= 2);
4553
4554
0
  const float vmax = params->scalar.max;
4555
0
  const float vmin = params->scalar.min;
4556
4557
0
  const float vbias = weights[0];
4558
0
  const float vk00 = weights[1];
4559
0
  const float vk01 = weights[2];
4560
0
  const float vk02 = weights[3];
4561
0
  const float vk03 = weights[4];
4562
0
  const float vk04 = weights[5];
4563
0
  const float vk10 = weights[6];
4564
0
  const float vk11 = weights[7];
4565
0
  const float vk12 = weights[8];
4566
0
  const float vk13 = weights[9];
4567
0
  const float vk14 = weights[10];
4568
0
  const float vk20 = weights[11];
4569
0
  const float vk21 = weights[12];
4570
0
  const float vk22 = weights[13];
4571
0
  const float vk23 = weights[14];
4572
0
  const float vk24 = weights[15];
4573
0
  const float vk30 = weights[16];
4574
0
  const float vk31 = weights[17];
4575
0
  const float vk32 = weights[18];
4576
0
  const float vk33 = weights[19];
4577
0
  const float vk34 = weights[20];
4578
0
  const float vk40 = weights[21];
4579
0
  const float vk41 = weights[22];
4580
0
  const float vk42 = weights[23];
4581
0
  const float vk43 = weights[24];
4582
0
  const float vk44 = weights[25];
4583
4584
0
  const uint32_t padding_top_less_1 = padding_top - 1;
4585
4586
0
  const float* i0 = zero;
4587
0
  const float* i1 = (const float*) ((uintptr_t) input - ((-padding_top_less_1) & input_width));
4588
0
  const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
4589
0
  if XNN_UNPREDICTABLE(padding_top_less_1 != 0) {
4590
0
    i1 = zero;
4591
0
  }
4592
0
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
4593
0
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
4594
0
  const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
4595
0
  const float* i6 = (const float*) ((uintptr_t) i5 + input_width);
4596
4597
0
  const size_t output_width = round_down_po2((input_width + (2 /* padding */ - 3 /* kernel size */ + 2 /* subsampling */) * sizeof(float)) / 2, sizeof(float));
4598
4599
0
  float* o0 = output;
4600
0
  float* o1 = (float*) ((uintptr_t) o0 + output_width);
4601
4602
0
  size_t padded_input_height = input_height + (padding_top_less_1 + 1) + 2 /* padding bottom */;
4603
0
  size_t output_height = (padded_input_height - 5 /* kernel size */ + 2 /* subsampling */) / 2;
4604
0
  do {
4605
0
    if XNN_UNPREDICTABLE(padded_input_height < 6) {
4606
0
      i3 = zero;
4607
0
    }
4608
0
    if XNN_UNPREDICTABLE(padded_input_height < 7) {
4609
0
      i4 = zero;
4610
0
      o1 = o0;
4611
0
    }
4612
0
    if XNN_UNPREDICTABLE(padded_input_height < 8) {
4613
0
      i5 = zero;
4614
0
    }
4615
0
    if XNN_UNPREDICTABLE(padded_input_height < 9) {
4616
0
      i6 = zero;
4617
0
    }
4618
4619
0
    float vi0x0 = 0.0f;
4620
0
    float vi1x0 = 0.0f;
4621
0
    float vi2x0 = 0.0f;
4622
0
    float vi3x0 = 0.0f;
4623
0
    float vi4x0 = 0.0f;
4624
0
    float vi5x0 = 0.0f;
4625
0
    float vi6x0 = 0.0f;
4626
4627
0
    float vi0x1 = 0.0f;
4628
0
    float vi1x1 = 0.0f;
4629
0
    float vi2x1 = 0.0f;
4630
0
    float vi3x1 = 0.0f;
4631
0
    float vi4x1 = 0.0f;
4632
0
    float vi5x1 = 0.0f;
4633
0
    float vi6x1 = 0.0f;
4634
4635
0
    float vi0x2 = *i0++;
4636
0
    float vi1x2 = *i1++;
4637
0
    float vi2x2 = *i2++;
4638
0
    float vi3x2 = *i3++;
4639
0
    float vi4x2 = *i4++;
4640
0
    float vi5x2 = *i5++;
4641
0
    float vi6x2 = *i6++;
4642
4643
0
    size_t w = input_width;
4644
0
    for (; w > 2 * sizeof(float); w -= 2 * sizeof(float)) {
4645
0
      const float vi0x3 = i0[0];
4646
0
      const float vi1x3 = i1[0];
4647
0
      const float vi2x3 = i2[0];
4648
0
      const float vi3x3 = i3[0];
4649
0
      const float vi4x3 = i4[0];
4650
0
      const float vi5x3 = i5[0];
4651
0
      const float vi6x3 = i6[0];
4652
4653
0
      const float vi0x4 = i0[1];
4654
0
      i0 += 2;
4655
0
      const float vi1x4 = i1[1];
4656
0
      i1 += 2;
4657
0
      const float vi2x4 = i2[1];
4658
0
      i2 += 2;
4659
0
      const float vi3x4 = i3[1];
4660
0
      i3 += 2;
4661
0
      const float vi4x4 = i4[1];
4662
0
      i4 += 2;
4663
0
      const float vi5x4 = i5[1];
4664
0
      i5 += 2;
4665
0
      const float vi6x4 = i6[1];
4666
0
      i6 += 2;
4667
4668
0
      float vo0p0 = vbias + vi0x0 * vk00;
4669
0
      float vo1p0 = vbias + vi2x0 * vk00;
4670
0
      float vo0p1 = vi1x0 * vk10;
4671
0
      float vo1p1 = vi3x0 * vk10;
4672
0
      vo0p0 += vi2x0 * vk20;
4673
0
      vo1p0 += vi4x0 * vk20;
4674
0
      vo0p1 += vi3x0 * vk30;
4675
0
      vo1p1 += vi5x0 * vk30;
4676
0
      vo0p0 += vi4x0 * vk40;
4677
0
      vo1p0 += vi6x0 * vk40;
4678
4679
0
      vi0x0 = vi0x2;
4680
0
      vi1x0 = vi1x2;
4681
0
      vi2x0 = vi2x2;
4682
0
      vi3x0 = vi3x2;
4683
0
      vi4x0 = vi4x2;
4684
0
      vi5x0 = vi5x2;
4685
0
      vi6x0 = vi6x2;
4686
4687
0
      vo0p1 += vi0x1 * vk01;
4688
0
      vo1p1 += vi2x1 * vk01;
4689
0
      vo0p0 += vi1x1 * vk11;
4690
0
      vo1p0 += vi3x1 * vk11;
4691
0
      vo0p1 += vi2x1 * vk21;
4692
0
      vo1p1 += vi4x1 * vk21;
4693
0
      vo0p0 += vi3x1 * vk31;
4694
0
      vo1p0 += vi5x1 * vk31;
4695
0
      vo0p1 += vi4x1 * vk41;
4696
0
      vo1p1 += vi6x1 * vk41;
4697
4698
0
      vi0x1 = vi0x3;
4699
0
      vi1x1 = vi1x3;
4700
0
      vi2x1 = vi2x3;
4701
0
      vi3x1 = vi3x3;
4702
0
      vi4x1 = vi4x3;
4703
0
      vi5x1 = vi5x3;
4704
0
      vi6x1 = vi6x3;
4705
4706
0
      vo0p0 += vi0x2 * vk02;
4707
0
      vo1p0 += vi2x2 * vk02;
4708
0
      vo0p1 += vi1x2 * vk12;
4709
0
      vo1p1 += vi3x2 * vk12;
4710
0
      vo0p0 += vi2x2 * vk22;
4711
0
      vo1p0 += vi4x2 * vk22;
4712
0
      vo0p1 += vi3x2 * vk32;
4713
0
      vo1p1 += vi5x2 * vk32;
4714
0
      vo0p0 += vi4x2 * vk42;
4715
0
      vo1p0 += vi6x2 * vk42;
4716
4717
0
      vi0x2 = vi0x4;
4718
0
      vi1x2 = vi1x4;
4719
0
      vi2x2 = vi2x4;
4720
0
      vi3x2 = vi3x4;
4721
0
      vi4x2 = vi4x4;
4722
0
      vi5x2 = vi5x4;
4723
0
      vi6x2 = vi6x4;
4724
4725
0
      vo0p1 += vi0x3 * vk03;
4726
0
      vo1p1 += vi2x3 * vk03;
4727
0
      vo0p0 += vi1x3 * vk13;
4728
0
      vo1p0 += vi3x3 * vk13;
4729
0
      vo0p1 += vi2x3 * vk23;
4730
0
      vo1p1 += vi4x3 * vk23;
4731
0
      vo0p0 += vi3x3 * vk33;
4732
0
      vo1p0 += vi5x3 * vk33;
4733
0
      vo0p1 += vi4x3 * vk43;
4734
0
      vo1p1 += vi6x3 * vk43;
4735
4736
0
      vo0p0 += vi0x4 * vk04;
4737
0
      vo1p0 += vi2x4 * vk04;
4738
0
      vo0p1 += vi1x4 * vk14;
4739
0
      vo1p1 += vi3x4 * vk14;
4740
0
      vo0p0 += vi2x4 * vk24;
4741
0
      vo1p0 += vi4x4 * vk24;
4742
0
      vo0p1 += vi3x4 * vk34;
4743
0
      vo1p1 += vi5x4 * vk34;
4744
0
      vo0p0 += vi4x4 * vk44;
4745
0
      vo1p0 += vi6x4 * vk44;
4746
4747
0
      vo0p0 += vo0p1;
4748
0
      vo1p0 += vo1p1;
4749
4750
0
      float vo0 = math_max_f32(vo0p0, vmin);
4751
0
      float vo1 = math_max_f32(vo1p0, vmin);
4752
4753
0
      vo0 = math_min_f32(vo0, vmax);
4754
0
      vo1 = math_min_f32(vo1, vmax);
4755
4756
0
      *o1++ = vo1;
4757
0
      *o0++ = vo0;
4758
0
    }
4759
0
    if XNN_LIKELY(w == 2 * sizeof(float)) {
4760
0
      const float vi0x3 = *i0++;
4761
0
      const float vi1x3 = *i1++;
4762
0
      const float vi2x3 = *i2++;
4763
0
      const float vi3x3 = *i3++;
4764
0
      const float vi4x3 = *i4++;
4765
0
      const float vi5x3 = *i5++;
4766
0
      const float vi6x3 = *i6++;
4767
4768
0
      float vo0p0 = vbias + vi0x0 * vk00;
4769
0
      float vo1p0 = vbias + vi2x0 * vk00;
4770
0
      float vo0p1 = vi1x0 * vk10;
4771
0
      float vo1p1 = vi3x0 * vk10;
4772
0
      vo0p0 += vi2x0 * vk20;
4773
0
      vo1p0 += vi4x0 * vk20;
4774
0
      vo0p1 += vi3x0 * vk30;
4775
0
      vo1p1 += vi5x0 * vk30;
4776
0
      vo0p0 += vi4x0 * vk40;
4777
0
      vo1p0 += vi6x0 * vk40;
4778
4779
0
      vo0p1 += vi0x1 * vk01;
4780
0
      vo1p1 += vi2x1 * vk01;
4781
0
      vo0p0 += vi1x1 * vk11;
4782
0
      vo1p0 += vi3x1 * vk11;
4783
0
      vo0p1 += vi2x1 * vk21;
4784
0
      vo1p1 += vi4x1 * vk21;
4785
0
      vo0p0 += vi3x1 * vk31;
4786
0
      vo1p0 += vi5x1 * vk31;
4787
0
      vo0p1 += vi4x1 * vk41;
4788
0
      vo1p1 += vi6x1 * vk41;
4789
4790
0
      vo0p0 += vi0x2 * vk02;
4791
0
      vo1p0 += vi2x2 * vk02;
4792
0
      vo0p1 += vi1x2 * vk12;
4793
0
      vo1p1 += vi3x2 * vk12;
4794
0
      vo0p0 += vi2x2 * vk22;
4795
0
      vo1p0 += vi4x2 * vk22;
4796
0
      vo0p1 += vi3x2 * vk32;
4797
0
      vo1p1 += vi5x2 * vk32;
4798
0
      vo0p0 += vi4x2 * vk42;
4799
0
      vo1p0 += vi6x2 * vk42;
4800
4801
0
      vo0p1 += vi0x3 * vk03;
4802
0
      vo1p1 += vi2x3 * vk03;
4803
0
      vo0p0 += vi1x3 * vk13;
4804
0
      vo1p0 += vi3x3 * vk13;
4805
0
      vo0p1 += vi2x3 * vk23;
4806
0
      vo1p1 += vi4x3 * vk23;
4807
0
      vo0p0 += vi3x3 * vk33;
4808
0
      vo1p0 += vi5x3 * vk33;
4809
0
      vo0p1 += vi4x3 * vk43;
4810
0
      vo1p1 += vi6x3 * vk43;
4811
4812
0
      vo0p0 += vo0p1;
4813
0
      vo1p0 += vo1p1;
4814
4815
0
      float vo0 = math_max_f32(vo0p0, vmin);
4816
0
      float vo1 = math_max_f32(vo1p0, vmin);
4817
4818
0
      vo0 = math_min_f32(vo0, vmax);
4819
0
      vo1 = math_min_f32(vo1, vmax);
4820
4821
0
      *o1++ = vo1;
4822
0
      *o0++ = vo0;
4823
0
    } else {
4824
0
      float vo0p0 = vbias + vi0x0 * vk00;
4825
0
      float vo1p0 = vbias + vi2x0 * vk00;
4826
0
      float vo0p1 = vi1x0 * vk10;
4827
0
      float vo1p1 = vi3x0 * vk10;
4828
0
      vo0p0 += vi2x0 * vk20;
4829
0
      vo1p0 += vi4x0 * vk20;
4830
0
      vo0p1 += vi3x0 * vk30;
4831
0
      vo1p1 += vi5x0 * vk30;
4832
0
      vo0p0 += vi4x0 * vk40;
4833
0
      vo1p0 += vi6x0 * vk40;
4834
4835
0
      vo0p1 += vi0x1 * vk01;
4836
0
      vo1p1 += vi2x1 * vk01;
4837
0
      vo0p0 += vi1x1 * vk11;
4838
0
      vo1p0 += vi3x1 * vk11;
4839
0
      vo0p1 += vi2x1 * vk21;
4840
0
      vo1p1 += vi4x1 * vk21;
4841
0
      vo0p0 += vi3x1 * vk31;
4842
0
      vo1p0 += vi5x1 * vk31;
4843
0
      vo0p1 += vi4x1 * vk41;
4844
0
      vo1p1 += vi6x1 * vk41;
4845
4846
0
      vo0p0 += vi0x2 * vk02;
4847
0
      vo1p0 += vi2x2 * vk02;
4848
0
      vo0p1 += vi1x2 * vk12;
4849
0
      vo1p1 += vi3x2 * vk12