Coverage Report

Created: 2023-09-25 06:31

/src/xnnpack/src/amalgam/gen/scalar.c
Line
Count
Source
1
// Copyright 2021 Google LLC
2
//
3
// This source code is licensed under the BSD-style license found in the
4
// LICENSE file in the root directory of this source tree.
5
6
#include <assert.h>
7
#include <fxdiv.h>
8
#include <math.h>
9
#include <stddef.h>
10
#include <stdint.h>
11
#include <string.h>
12
13
#include <xnnpack/argmaxpool.h>
14
#include <xnnpack/avgpool.h>
15
#include <xnnpack/common.h>
16
#include <xnnpack/conv.h>
17
#include <xnnpack/dwconv.h>
18
#include <xnnpack/fill.h>
19
#include <xnnpack/gavgpool.h>
20
#include <xnnpack/gemm.h>
21
#include <xnnpack/ibilinear.h>
22
#include <xnnpack/igemm.h>
23
#include <xnnpack/lut.h>
24
#include <xnnpack/math.h>
25
#include <xnnpack/maxpool.h>
26
#include <xnnpack/microparams.h>
27
#include <xnnpack/packw.h>
28
#include <xnnpack/pad.h>
29
#include <xnnpack/pavgpool.h>
30
#include <xnnpack/prelu.h>
31
#include <xnnpack/raddstoreexpminusmax.h>
32
#include <xnnpack/reduce.h>
33
#include <xnnpack/rmax.h>
34
#include <xnnpack/spmm.h>
35
#include <xnnpack/transpose.h>
36
#include <xnnpack/unaligned.h>
37
#include <xnnpack/unpool.h>
38
#include <xnnpack/vadd.h>
39
#include <xnnpack/vbinary.h>
40
#include <xnnpack/vcvt.h>
41
#include <xnnpack/vlrelu.h>
42
#include <xnnpack/vmul.h>
43
#include <xnnpack/vmulcaddc.h>
44
#include <xnnpack/vunary.h>
45
#include <xnnpack/zip.h>
46
47
48
void xnn_f16_f32_vcvt_ukernel__scalar_u1(
49
    size_t batch,
50
    const void* input,
51
    float* output,
52
    const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
53
0
{
54
0
  assert(batch != 0);
55
0
  assert(batch % sizeof(uint16_t) == 0);
56
0
  assert(input != NULL);
57
0
  assert(output != NULL);
58
59
0
  const uint32_t vsign_mask = params->scalar.sign_mask;
60
0
  const uint32_t vexp_offset = params->scalar.exp_offset;
61
0
  const float vexp_scale = params->scalar.exp_scale;
62
0
  const uint32_t vmagic_mask = params->scalar.magic_mask;
63
0
  const float vmagic_bias = params->scalar.magic_bias;
64
0
  const uint32_t vdenorm_cutoff = params->scalar.denorm_cutoff;
65
66
0
  const uint16_t* i = (const uint16_t*) input;
67
0
  uint32_t* o = (uint32_t*) output;
68
0
  do {
69
0
    const uint16_t vh = *i++;
70
71
0
    const uint32_t vw = (uint32_t) vh << 16;
72
0
    const uint32_t vsign = vw & vsign_mask;
73
0
    const uint32_t v2w = vw + vw;
74
0
    const uint32_t vnorm = float_as_uint32(uint32_as_float((v2w >> 4) + vexp_offset) * vexp_scale);
75
0
    const uint32_t vdenorm = float_as_uint32(uint32_as_float((v2w >> 17) | vmagic_mask) - vmagic_bias);
76
0
    const uint32_t vf = vsign | (XNN_UNPREDICTABLE(v2w < vdenorm_cutoff) ? vdenorm : vnorm);
77
78
0
    *o++ = vf;
79
80
0
    batch -= sizeof(uint16_t);
81
0
  } while (batch != 0);
82
0
}
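
Editor's note: the bit manipulation in this kernel is the standard FP16-to-FP32 conversion trick. Below is a hedged, standalone sketch of the same computation; the numeric constants are the values these scalar params are conventionally initialized with (an assumption on our part — the kernel itself only ever reads them from params), and the helper names are hypothetical.

// Hedged standalone sketch of the same half->float bit trick (constants are
// assumed conventional values; the kernel reads them from params->scalar.*).
static inline float uint32_as_float_ref(uint32_t w) {
  float f;
  memcpy(&f, &w, sizeof(f));
  return f;
}

static inline uint32_t float_as_uint32_ref(float f) {
  uint32_t w;
  memcpy(&w, &f, sizeof(w));
  return w;
}

static inline float half_to_float_ref(uint16_t h) {
  const uint32_t vw = (uint32_t) h << 16;
  const uint32_t vsign = vw & UINT32_C(0x80000000);  // sign_mask
  const uint32_t v2w = vw + vw;
  const uint32_t vnorm = float_as_uint32_ref(
      uint32_as_float_ref((v2w >> 4) + UINT32_C(0x70000000)) * 0x1.0p-112f);  // exp_offset, exp_scale
  const uint32_t vdenorm = float_as_uint32_ref(
      uint32_as_float_ref((v2w >> 17) | UINT32_C(0x3F000000)) - 0.5f);        // magic_mask, magic_bias
  return uint32_as_float_ref(
      vsign | (v2w < UINT32_C(0x08000000) ? vdenorm : vnorm));                // denorm_cutoff
}
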
83
84
void xnn_f16_f32_vcvt_ukernel__scalar_u4(
85
    size_t batch,
86
    const void* input,
87
    float* output,
88
    const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
89
0
{
90
0
  assert(batch != 0);
91
0
  assert(batch % sizeof(uint16_t) == 0);
92
0
  assert(input != NULL);
93
0
  assert(output != NULL);
94
95
0
  const uint32_t vsign_mask = params->scalar.sign_mask;
96
0
  const uint32_t vexp_offset = params->scalar.exp_offset;
97
0
  const float vexp_scale = params->scalar.exp_scale;
98
0
  const uint32_t vmagic_mask = params->scalar.magic_mask;
99
0
  const float vmagic_bias = params->scalar.magic_bias;
100
0
  const uint32_t vdenorm_cutoff = params->scalar.denorm_cutoff;
101
102
0
  const uint16_t* i = (const uint16_t*) input;
103
0
  uint32_t* o = (uint32_t*) output;
104
0
  for (; batch >= 4 * sizeof(uint16_t); batch -= 4 * sizeof(uint16_t)) {
105
0
    const uint16_t vh0 = i[0];
106
0
    const uint16_t vh1 = i[1];
107
0
    const uint16_t vh2 = i[2];
108
0
    const uint16_t vh3 = i[3];
109
0
    i += 4;
110
111
0
    const uint32_t vw0 = (uint32_t) vh0 << 16;
112
0
    const uint32_t vw1 = (uint32_t) vh1 << 16;
113
0
    const uint32_t vw2 = (uint32_t) vh2 << 16;
114
0
    const uint32_t vw3 = (uint32_t) vh3 << 16;
115
116
0
    const uint32_t vsign0 = vw0 & vsign_mask;
117
0
    const uint32_t vsign1 = vw1 & vsign_mask;
118
0
    const uint32_t vsign2 = vw2 & vsign_mask;
119
0
    const uint32_t vsign3 = vw3 & vsign_mask;
120
121
0
    const uint32_t v2w0 = vw0 + vw0;
122
0
    const uint32_t v2w1 = vw1 + vw1;
123
0
    const uint32_t v2w2 = vw2 + vw2;
124
0
    const uint32_t v2w3 = vw3 + vw3;
125
126
0
    const uint32_t vnorm0 = float_as_uint32(uint32_as_float((v2w0 >> 4) + vexp_offset) * vexp_scale);
127
0
    const uint32_t vnorm1 = float_as_uint32(uint32_as_float((v2w1 >> 4) + vexp_offset) * vexp_scale);
128
0
    const uint32_t vnorm2 = float_as_uint32(uint32_as_float((v2w2 >> 4) + vexp_offset) * vexp_scale);
129
0
    const uint32_t vnorm3 = float_as_uint32(uint32_as_float((v2w3 >> 4) + vexp_offset) * vexp_scale);
130
131
0
    const uint32_t vdenorm0 = float_as_uint32(uint32_as_float((v2w0 >> 17) | vmagic_mask) - vmagic_bias);
132
0
    const uint32_t vdenorm1 = float_as_uint32(uint32_as_float((v2w1 >> 17) | vmagic_mask) - vmagic_bias);
133
0
    const uint32_t vdenorm2 = float_as_uint32(uint32_as_float((v2w2 >> 17) | vmagic_mask) - vmagic_bias);
134
0
    const uint32_t vdenorm3 = float_as_uint32(uint32_as_float((v2w3 >> 17) | vmagic_mask) - vmagic_bias);
135
136
0
    const uint32_t vf0 = vsign0 | (XNN_UNPREDICTABLE(v2w0 < vdenorm_cutoff) ? vdenorm0 : vnorm0);
137
0
    const uint32_t vf1 = vsign1 | (XNN_UNPREDICTABLE(v2w1 < vdenorm_cutoff) ? vdenorm1 : vnorm1);
138
0
    const uint32_t vf2 = vsign2 | (XNN_UNPREDICTABLE(v2w2 < vdenorm_cutoff) ? vdenorm2 : vnorm2);
139
0
    const uint32_t vf3 = vsign3 | (XNN_UNPREDICTABLE(v2w3 < vdenorm_cutoff) ? vdenorm3 : vnorm3);
140
141
0
    o[0] = vf0;
142
0
    o[1] = vf1;
143
0
    o[2] = vf2;
144
0
    o[3] = vf3;
145
0
    o += 4;
146
0
  }
147
0
  if XNN_UNLIKELY(batch != 0) {
148
0
    do {
149
0
      const uint16_t vh = *i++;
150
151
0
      const uint32_t vw = (uint32_t) vh << 16;
152
0
      const uint32_t vsign = vw & vsign_mask;
153
0
      const uint32_t v2w = vw + vw;
154
0
      const uint32_t vnorm = float_as_uint32(uint32_as_float((v2w >> 4) + vexp_offset) * vexp_scale);
155
0
      const uint32_t vdenorm = float_as_uint32(uint32_as_float((v2w >> 17) | vmagic_mask) - vmagic_bias);
156
0
      const uint32_t vf = vsign | (XNN_UNPREDICTABLE(v2w < vdenorm_cutoff) ? vdenorm : vnorm);
157
158
0
      *o++ = vf;
159
160
0
      batch -= sizeof(uint16_t);
161
0
    } while (batch != 0);
162
0
  }
163
0
}
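
Editor's note: a hypothetical caller-side sketch follows, to make two points explicit: batch is given in bytes of the uint16_t input, and the conversion constants travel through the params union. The direct field assignments are for illustration only (in the library such params are normally filled by an init helper), but the field names match the loads in the kernels above.

// Hypothetical usage sketch (illustration only; assumes the scalar params
// layout read by the kernels above and that direct field init is acceptable).
static void convert_half_buffer(const uint16_t* src, float* dst, size_t count) {
  union xnn_f16_f32_cvt_params cvt_params;
  cvt_params.scalar.sign_mask = UINT32_C(0x80000000);
  cvt_params.scalar.exp_offset = UINT32_C(0x70000000);
  cvt_params.scalar.exp_scale = 0x1.0p-112f;
  cvt_params.scalar.magic_mask = UINT32_C(0x3F000000);
  cvt_params.scalar.magic_bias = 0.5f;
  cvt_params.scalar.denorm_cutoff = UINT32_C(0x08000000);
  // batch is specified in bytes, not in elements.
  xnn_f16_f32_vcvt_ukernel__scalar_u4(count * sizeof(uint16_t), src, dst, &cvt_params);
}
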
164
165
void xnn_f32_argmaxpool_ukernel_4x__scalar_c1(
166
    size_t output_pixels,
167
    size_t pooling_elements,
168
    size_t channels,
169
    const float** input,
170
    size_t input_offset,
171
    float* output,
172
    uint32_t* index,
173
    size_t input_increment,
174
    size_t output_increment)
175
0
{
176
0
  assert(output_pixels != 0);
177
0
  assert(pooling_elements != 0);
178
0
  assert(pooling_elements <= 4);
179
0
  assert(channels != 0);
180
181
0
  do {
182
0
    const float* i0 = input[0];
183
0
    const float* i1 = input[1];
184
0
    const float* i2 = input[2];
185
0
    const float* i3 = input[3];
186
0
    i0 = (const float*) ((uintptr_t) i0 + input_offset);
187
0
    i1 = (const float*) ((uintptr_t) i1 + input_offset);
188
0
    i2 = (const float*) ((uintptr_t) i2 + input_offset);
189
0
    i3 = (const float*) ((uintptr_t) i3 + input_offset);
190
0
    if (pooling_elements < 2) {
191
0
      i1 = i0;
192
0
    }
193
0
    if (pooling_elements <= 2) {
194
0
      i2 = i0;
195
0
    }
196
0
    if (pooling_elements != 4) {
197
0
      i3 = i0;
198
0
    }
199
200
0
    size_t c = channels;
201
0
    do {
202
0
      const float vi0 = *i0++;
203
0
      const float vi1 = *i1++;
204
0
      const float vi2 = *i2++;
205
0
      const float vi3 = *i3++;
206
207
0
      float vmax = vi0;
208
0
      uint32_t vidx = 0;
209
210
0
      if (vi1 > vmax) {
211
0
        vmax = vi1;
212
0
        vidx = 1;
213
0
      }
214
215
0
      if (vi2 > vmax) {
216
0
        vmax = vi2;
217
0
        vidx = 2;
218
0
      }
219
220
0
      if (vi3 > vmax) {
221
0
        vmax = vi3;
222
0
        vidx = 3;
223
0
      }
224
225
0
      *output++ = vmax;
226
0
      *index++ = vidx;
227
0
    } while (--c != 0);
228
0
    input = (const float**) ((uintptr_t) input + input_increment);
229
0
    output = (float*) ((uintptr_t) output + output_increment);
230
0
  } while (--output_pixels != 0);
231
0
}
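
Editor's note: for clarity, here is a plain reference of what this kernel produces per output pixel: for each channel, the maximum over the (up to 4) pooled rows and the window-relative index of the winning row. This is a hedged sketch with a hypothetical helper name, not part of the file; the tie-breaking matches the strict '>' comparisons above, so the earliest maximum wins.

// Hedged reference sketch; relies on the <stddef.h>/<stdint.h> includes above.
static void argmaxpool_ref(size_t pooling_elements, size_t channels,
                           const float* const* in, float* out, uint32_t* index) {
  for (size_t c = 0; c < channels; c++) {
    float vmax = in[0][c];
    uint32_t vidx = 0;
    for (size_t p = 1; p < pooling_elements; p++) {
      if (in[p][c] > vmax) {
        vmax = in[p][c];
        vidx = (uint32_t) p;
      }
    }
    out[c] = vmax;
    index[c] = vidx;
  }
}
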
232
233
void xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1(
234
    size_t output_pixels,
235
    size_t pooling_elements,
236
    size_t channels,
237
    const float** input,
238
    size_t input_offset,
239
    float* accumulation_buffer,
240
    uint32_t* index_buffer,
241
    float* output,
242
    uint32_t* index,
243
    size_t input_increment,
244
    size_t output_increment)
245
0
{
246
0
  assert(output_pixels != 0);
247
0
  assert(pooling_elements != 0);
248
0
  assert(pooling_elements > 9);
249
0
  assert(channels != 0);
250
251
0
  do {
252
0
    {
253
0
      float* ab = accumulation_buffer;
254
0
      uint32_t* ib = index_buffer;
255
256
0
      const float* i0 = *input++;
257
0
      const float* i1 = *input++;
258
0
      const float* i2 = *input++;
259
0
      const float* i3 = *input++;
260
0
      const float* i4 = *input++;
261
0
      const float* i5 = *input++;
262
0
      const float* i6 = *input++;
263
0
      const float* i7 = *input++;
264
0
      const float* i8 = *input++;
265
0
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
266
0
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
267
0
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
268
0
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
269
0
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
270
0
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
271
0
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
272
0
      i7 = (const float*) ((uintptr_t) i7 + input_offset);
273
0
      i8 = (const float*) ((uintptr_t) i8 + input_offset);
274
275
0
      size_t c = channels;
276
0
      do {
277
0
        const float vi0 = *i0++;
278
0
        const float vi1 = *i1++;
279
0
        const float vi2 = *i2++;
280
0
        const float vi3 = *i3++;
281
0
        const float vi4 = *i4++;
282
0
        const float vi5 = *i5++;
283
0
        const float vi6 = *i6++;
284
0
        const float vi7 = *i7++;
285
0
        const float vi8 = *i8++;
286
287
0
        float vmax = vi0;
288
0
        uint32_t vidx = 0;
289
290
0
        if (vi1 > vmax) {
291
0
          vmax = vi1;
292
0
          vidx = 1;
293
0
        }
294
295
0
        if (vi2 > vmax) {
296
0
          vmax = vi2;
297
0
          vidx = 2;
298
0
        }
299
300
0
        if (vi3 > vmax) {
301
0
          vmax = vi3;
302
0
          vidx = 3;
303
0
        }
304
305
0
        if (vi4 > vmax) {
306
0
          vmax = vi4;
307
0
          vidx = 4;
308
0
        }
309
310
0
        if (vi5 > vmax) {
311
0
          vmax = vi5;
312
0
          vidx = 5;
313
0
        }
314
315
0
        if (vi6 > vmax) {
316
0
          vmax = vi6;
317
0
          vidx = 6;
318
0
        }
319
320
0
        if (vi7 > vmax) {
321
0
          vmax = vi7;
322
0
          vidx = 7;
323
0
        }
324
325
0
        if (vi8 > vmax) {
326
0
          vmax = vi8;
327
0
          vidx = 8;
328
0
        }
329
330
0
        *ab++ = vmax;
331
0
        *ib++ = vidx;
332
0
      } while (--c != 0);
333
0
    }
334
0
    uint32_t vidx0 = 9;
335
0
    size_t k = pooling_elements;
336
0
    for (k -= 9; k > 8; k -= 8) {
337
0
      const float* i0 = *input++;
338
0
      const float* i1 = *input++;
339
0
      const float* i2 = *input++;
340
0
      const float* i3 = *input++;
341
0
      const float* i4 = *input++;
342
0
      const float* i5 = *input++;
343
0
      const float* i6 = *input++;
344
0
      const float* i7 = *input++;
345
0
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
346
0
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
347
0
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
348
0
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
349
0
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
350
0
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
351
0
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
352
0
      i7 = (const float*) ((uintptr_t) i7 + input_offset);
353
354
0
      float* ab = accumulation_buffer;
355
0
      uint32_t* ib = index_buffer;
356
357
0
      size_t c = channels;
358
0
      do {
359
0
        const float vi0 = *i0++;
360
0
        const float vi1 = *i1++;
361
0
        const float vi2 = *i2++;
362
0
        const float vi3 = *i3++;
363
0
        const float vi4 = *i4++;
364
0
        const float vi5 = *i5++;
365
0
        const float vi6 = *i6++;
366
0
        const float vi7 = *i7++;
367
368
0
        float vmax = *ab;
369
0
        uint32_t vidx = *ib;
370
371
0
        if (vi0 > vmax) {
372
0
          vmax = vi0;
373
0
          vidx = vidx0;
374
0
        }
375
376
0
        if (vi1 > vmax) {
377
0
          vmax = vi1;
378
0
          vidx = vidx0 + 1;
379
0
        }
380
381
0
        if (vi2 > vmax) {
382
0
          vmax = vi2;
383
0
          vidx = vidx0 + 2;
384
0
        }
385
386
0
        if (vi3 > vmax) {
387
0
          vmax = vi3;
388
0
          vidx = vidx0 + 3;
389
0
        }
390
391
0
        if (vi4 > vmax) {
392
0
          vmax = vi4;
393
0
          vidx = vidx0 + 4;
394
0
        }
395
396
0
        if (vi5 > vmax) {
397
0
          vmax = vi5;
398
0
          vidx = vidx0 + 5;
399
0
        }
400
401
0
        if (vi6 > vmax) {
402
0
          vmax = vi6;
403
0
          vidx = vidx0 + 6;
404
0
        }
405
406
0
        if (vi7 > vmax) {
407
0
          vmax = vi7;
408
0
          vidx = vidx0 + 7;
409
0
        }
410
411
0
        *ab++ = vmax;
412
0
        *ib++ = vidx;
413
0
      } while (--c != 0);
414
0
      vidx0 += 8;
415
0
    }
416
417
0
    float* o = output;
418
0
    uint32_t* i = index;
419
0
    {
420
0
      const float* i0 = input[0];
421
0
      const float* i1 = input[1];
422
0
      const float* i2 = input[2];
423
0
      const float* i3 = input[3];
424
0
      const float* i4 = input[4];
425
0
      const float* i5 = input[5];
426
0
      const float* i6 = input[6];
427
0
      const float* i7 = input[7];
428
0
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
429
0
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
430
0
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
431
0
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
432
0
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
433
0
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
434
0
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
435
0
      i7 = (const float*) ((uintptr_t) i7 + input_offset);
436
0
      input = (const float**) ((uintptr_t) input + input_increment);
437
0
      if (k < 2) {
438
0
        i1 = i0;
439
0
      }
440
0
      if (k <= 2) {
441
0
        i2 = i0;
442
0
      }
443
0
      if (k < 4) {
444
0
        i3 = i0;
445
0
      }
446
0
      if (k <= 4) {
447
0
        i4 = i0;
448
0
      }
449
0
      if (k < 6) {
450
0
        i5 = i0;
451
0
      }
452
0
      if (k <= 6) {
453
0
        i6 = i0;
454
0
      }
455
0
      if (k != 8) {
456
0
        i7 = i0;
457
0
      }
458
459
0
      size_t c = channels;
460
0
      float* ab = accumulation_buffer;
461
0
      uint32_t* ib = index_buffer;
462
0
      do {
463
0
        const float vi0 = *i0++;
464
0
        const float vi1 = *i1++;
465
0
        const float vi2 = *i2++;
466
0
        const float vi3 = *i3++;
467
0
        const float vi4 = *i4++;
468
0
        const float vi5 = *i5++;
469
0
        const float vi6 = *i6++;
470
0
        const float vi7 = *i7++;
471
472
0
        float vmax = *ab++;
473
0
        uint32_t vidx = *ib++;
474
475
0
        if (vi0 > vmax) {
476
0
          vmax = vi0;
477
0
          vidx = vidx0;
478
0
        }
479
480
0
        if (vi1 > vmax) {
481
0
          vmax = vi1;
482
0
          vidx = vidx0 + 1;
483
0
        }
484
485
0
        if (vi2 > vmax) {
486
0
          vmax = vi2;
487
0
          vidx = vidx0 + 2;
488
0
        }
489
490
0
        if (vi3 > vmax) {
491
0
          vmax = vi3;
492
0
          vidx = vidx0 + 3;
493
0
        }
494
495
0
        if (vi4 > vmax) {
496
0
          vmax = vi4;
497
0
          vidx = vidx0 + 4;
498
0
        }
499
500
0
        if (vi5 > vmax) {
501
0
          vmax = vi5;
502
0
          vidx = vidx0 + 5;
503
0
        }
504
505
0
        if (vi6 > vmax) {
506
0
          vmax = vi6;
507
0
          vidx = vidx0 + 6;
508
0
        }
509
510
0
        if (vi7 > vmax) {
511
0
          vmax = vi7;
512
0
          vidx = vidx0 + 7;
513
0
        }
514
515
0
        *o++ = vmax;
516
0
        *i++ = vidx;
517
0
      } while (--c != 0);
518
0
    }
519
520
0
    output = (float*) ((uintptr_t) o + output_increment);
521
0
    index = (uint32_t*) i;
522
0
  } while (--output_pixels != 0);
523
0
}
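
Editor's note: the kernel above keeps at most nine input row pointers live at a time by splitting the window into a 9-tap pass followed by 8-tap passes that update the per-channel (max, index) state held in accumulation_buffer/index_buffer. Below is a hedged sketch of one such 8-tap update pass (hypothetical helper; base is the window-relative index of the block's first tap, i.e. vidx0 above).

// Hedged sketch of one 8-tap update over the scratch buffers.
static void argmax_update8_ref(size_t channels, const float* const* in,
                               uint32_t base, float* acc, uint32_t* idx) {
  for (size_t c = 0; c < channels; c++) {
    for (uint32_t p = 0; p < 8; p++) {
      const float v = in[p][c];
      if (v > acc[c]) {
        acc[c] = v;
        idx[c] = base + p;
      }
    }
  }
}
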
524
525
void xnn_f32_argmaxpool_ukernel_9x__scalar_c1(
526
    size_t output_pixels,
527
    size_t pooling_elements,
528
    size_t channels,
529
    const float** input,
530
    size_t input_offset,
531
    float* output,
532
    uint32_t* index,
533
    size_t input_increment,
534
    size_t output_increment)
535
0
{
536
0
  assert(output_pixels != 0);
537
0
  assert(pooling_elements != 0);
538
0
  assert(pooling_elements <= 9);
539
0
  assert(channels != 0);
540
541
0
  do {
542
0
    const float* i0 = input[0];
543
0
    const float* i1 = input[1];
544
0
    const float* i2 = input[2];
545
0
    const float* i3 = input[3];
546
0
    const float* i4 = input[4];
547
0
    const float* i5 = input[5];
548
0
    const float* i6 = input[6];
549
0
    const float* i7 = input[7];
550
0
    const float* i8 = input[8];
551
0
    i0 = (const float*) ((uintptr_t) i0 + input_offset);
552
0
    i1 = (const float*) ((uintptr_t) i1 + input_offset);
553
0
    i2 = (const float*) ((uintptr_t) i2 + input_offset);
554
0
    i3 = (const float*) ((uintptr_t) i3 + input_offset);
555
0
    i4 = (const float*) ((uintptr_t) i4 + input_offset);
556
0
    i5 = (const float*) ((uintptr_t) i5 + input_offset);
557
0
    i6 = (const float*) ((uintptr_t) i6 + input_offset);
558
0
    i7 = (const float*) ((uintptr_t) i7 + input_offset);
559
0
    i8 = (const float*) ((uintptr_t) i8 + input_offset);
560
0
    if (pooling_elements < 2) {
561
0
      i1 = i0;
562
0
    }
563
0
    if (pooling_elements <= 2) {
564
0
      i2 = i0;
565
0
    }
566
0
    if (pooling_elements < 4) {
567
0
      i3 = i0;
568
0
    }
569
0
    if (pooling_elements <= 4) {
570
0
      i4 = i0;
571
0
    }
572
0
    if (pooling_elements < 6) {
573
0
      i5 = i0;
574
0
    }
575
0
    if (pooling_elements <= 6) {
576
0
      i6 = i0;
577
0
    }
578
0
    if (pooling_elements < 8) {
579
0
      i7 = i0;
580
0
    }
581
0
    if (pooling_elements <= 8) {
582
0
      i8 = i0;
583
0
    }
584
585
0
    size_t c = channels;
586
0
    do {
587
0
      const float vi0 = *i0++;
588
0
      const float vi1 = *i1++;
589
0
      const float vi2 = *i2++;
590
0
      const float vi3 = *i3++;
591
0
      const float vi4 = *i4++;
592
0
      const float vi5 = *i5++;
593
0
      const float vi6 = *i6++;
594
0
      const float vi7 = *i7++;
595
0
      const float vi8 = *i8++;
596
597
0
      float vmax = vi0;
598
0
      uint32_t vidx = 0;
599
600
0
      if (vi1 > vmax) {
601
0
        vmax = vi1;
602
0
        vidx = 1;
603
0
      }
604
605
0
      if (vi2 > vmax) {
606
0
        vmax = vi2;
607
0
        vidx = 2;
608
0
      }
609
610
0
      if (vi3 > vmax) {
611
0
        vmax = vi3;
612
0
        vidx = 3;
613
0
      }
614
615
0
      if (vi4 > vmax) {
616
0
        vmax = vi4;
617
0
        vidx = 4;
618
0
      }
619
620
0
      if (vi5 > vmax) {
621
0
        vmax = vi5;
622
0
        vidx = 5;
623
0
      }
624
625
0
      if (vi6 > vmax) {
626
0
        vmax = vi6;
627
0
        vidx = 6;
628
0
      }
629
630
0
      if (vi7 > vmax) {
631
0
        vmax = vi7;
632
0
        vidx = 7;
633
0
      }
634
635
0
      if (vi8 > vmax) {
636
0
        vmax = vi8;
637
0
        vidx = 8;
638
0
      }
639
640
0
      *output++ = vmax;
641
0
      *index++ = vidx;
642
0
    } while (--c != 0);
643
0
    input = (const float**) ((uintptr_t) input + input_increment);
644
0
    output = (float*) ((uintptr_t) output + output_increment);
645
0
  } while (--output_pixels != 0);
646
0
}
647
648
void xnn_f32_avgpool_minmax_ukernel_9p8x__scalar_c1(
649
    size_t output_pixels,
650
    size_t kernel_elements,
651
    size_t channels,
652
    const float** input,
653
    size_t input_offset,
654
    const float* zero,
655
    float* buffer,
656
    float* output,
657
    size_t input_increment,
658
    size_t output_increment,
659
    const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
660
0
{
661
0
  assert(output_pixels != 0);
662
0
  assert(kernel_elements > 9);
663
0
  assert(channels != 0);
664
665
0
  const float vscale = params->scalar.scale;
666
0
  const float vmin = params->scalar.min;
667
0
  const float vmax = params->scalar.max;
668
669
0
  do {
670
0
    {
671
0
      const float* i0 = *input++;
672
0
      assert(i0 != NULL);
673
0
      if XNN_UNPREDICTABLE(i0 != zero) {
674
0
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
675
0
      }
676
0
      const float* i1 = *input++;
677
0
      assert(i1 != NULL);
678
0
      if XNN_UNPREDICTABLE(i1 != zero) {
679
0
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
680
0
      }
681
0
      const float* i2 = *input++;
682
0
      assert(i2 != NULL);
683
0
      if XNN_UNPREDICTABLE(i2 != zero) {
684
0
        i2 = (const float*) ((uintptr_t) i2 + input_offset);
685
0
      }
686
0
      const float* i3 = *input++;
687
0
      assert(i3 != NULL);
688
0
      if XNN_UNPREDICTABLE(i3 != zero) {
689
0
        i3 = (const float*) ((uintptr_t) i3 + input_offset);
690
0
      }
691
0
      const float* i4 = *input++;
692
0
      assert(i4 != NULL);
693
0
      if XNN_UNPREDICTABLE(i4 != zero) {
694
0
        i4 = (const float*) ((uintptr_t) i4 + input_offset);
695
0
      }
696
0
      const float* i5 = *input++;
697
0
      assert(i5 != NULL);
698
0
      if XNN_UNPREDICTABLE(i5 != zero) {
699
0
        i5 = (const float*) ((uintptr_t) i5 + input_offset);
700
0
      }
701
0
      const float* i6 = *input++;
702
0
      assert(i6 != NULL);
703
0
      if XNN_UNPREDICTABLE(i6 != zero) {
704
0
        i6 = (const float*) ((uintptr_t) i6 + input_offset);
705
0
      }
706
0
      const float* i7 = *input++;
707
0
      assert(i7 != NULL);
708
0
      if XNN_UNPREDICTABLE(i7 != zero) {
709
0
        i7 = (const float*) ((uintptr_t) i7 + input_offset);
710
0
      }
711
0
      const float* i8 = *input++;
712
0
      assert(i8 != NULL);
713
0
      if XNN_UNPREDICTABLE(i8 != zero) {
714
0
        i8 = (const float*) ((uintptr_t) i8 + input_offset);
715
0
      }
716
717
0
      float* b = buffer;
718
0
      size_t c = channels;
719
0
      do {
720
0
        const float vi0 = *i0++;
721
0
        const float vi1 = *i1++;
722
0
        const float vi2 = *i2++;
723
0
        const float vi3 = *i3++;
724
0
        const float vi4 = *i4++;
725
0
        const float vi5 = *i5++;
726
0
        const float vi6 = *i6++;
727
0
        const float vi7 = *i7++;
728
0
        const float vi8 = *i8++;
729
730
0
        const float vsum01 = vi0 + vi1;
731
0
        const float vsum23 = vi2 + vi3;
732
0
        const float vsum45 = vi4 + vi5;
733
0
        const float vsum67 = vi6 + vi7;
734
0
        const float vsum018 = vsum01 + vi8;
735
0
        const float vsum2345 = vsum23 + vsum45;
736
0
        const float vsum01678 = vsum018 + vsum67;
737
0
        const float vsum = vsum2345 + vsum01678;
738
739
0
        *b++ = vsum;
740
0
      } while (--c != 0);
741
0
    }
742
743
0
    size_t k = kernel_elements;
744
0
    for (k -= 9; k > 8; k -= 8) {
745
0
      const float* i0 = *input++;
746
0
      assert(i0 != NULL);
747
0
      if XNN_UNPREDICTABLE(i0 != zero) {
748
0
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
749
0
      }
750
0
      const float* i1 = *input++;
751
0
      assert(i1 != NULL);
752
0
      if XNN_UNPREDICTABLE(i1 != zero) {
753
0
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
754
0
      }
755
0
      const float* i2 = *input++;
756
0
      assert(i2 != NULL);
757
0
      if XNN_UNPREDICTABLE(i2 != zero) {
758
0
        i2 = (const float*) ((uintptr_t) i2 + input_offset);
759
0
      }
760
0
      const float* i3 = *input++;
761
0
      assert(i3 != NULL);
762
0
      if XNN_UNPREDICTABLE(i3 != zero) {
763
0
        i3 = (const float*) ((uintptr_t) i3 + input_offset);
764
0
      }
765
0
      const float* i4 = *input++;
766
0
      assert(i4 != NULL);
767
0
      if XNN_UNPREDICTABLE(i4 != zero) {
768
0
        i4 = (const float*) ((uintptr_t) i4 + input_offset);
769
0
      }
770
0
      const float* i5 = *input++;
771
0
      assert(i5 != NULL);
772
0
      if XNN_UNPREDICTABLE(i5 != zero) {
773
0
        i5 = (const float*) ((uintptr_t) i5 + input_offset);
774
0
      }
775
0
      const float* i6 = *input++;
776
0
      assert(i6 != NULL);
777
0
      if XNN_UNPREDICTABLE(i6 != zero) {
778
0
        i6 = (const float*) ((uintptr_t) i6 + input_offset);
779
0
      }
780
0
      const float* i7 = *input++;
781
0
      assert(i7 != NULL);
782
0
      if XNN_UNPREDICTABLE(i7 != zero) {
783
0
        i7 = (const float*) ((uintptr_t) i7 + input_offset);
784
0
      }
785
786
0
      float* b = buffer;
787
0
      size_t c = channels;
788
0
      do {
789
0
        const float vi0 = *i0++;
790
0
        const float vi1 = *i1++;
791
0
        const float vi2 = *i2++;
792
0
        const float vi3 = *i3++;
793
0
        const float vi4 = *i4++;
794
0
        const float vi5 = *i5++;
795
0
        const float vi6 = *i6++;
796
0
        const float vi7 = *i7++;
797
0
        const float vacc = *b;
798
799
0
        const float vsum01 = vi0 + vi1;
800
0
        const float vsum23 = vi2 + vi3;
801
0
        const float vsum45 = vi4 + vi5;
802
0
        const float vsum67 = vi6 + vi7;
803
0
        const float vsum01a = vsum01 + vacc;
804
0
        const float vsum2345 = vsum23 + vsum45;
805
0
        const float vsum0167a = vsum01a + vsum67;
806
0
        const float vsum = vsum2345 + vsum0167a;
807
808
0
        *b++ = vsum;
809
0
      } while (--c != 0);
810
0
    }
811
812
0
    {
813
0
      const float* i0 = input[0];
814
0
      assert(i0 != NULL);
815
0
      const float* i1 = input[1];
816
0
      const float* i2 = input[2];
817
0
      const float* i3 = input[3];
818
0
      const float* i4 = input[4];
819
0
      const float* i5 = input[5];
820
0
      const float* i6 = input[6];
821
0
      const float* i7 = input[7];
822
0
      input = (const float**) ((uintptr_t) input + input_increment);
823
0
      if (k < 2) {
824
0
        i1 = zero;
825
0
      }
826
0
      assert(i1 != NULL);
827
0
      if (k <= 2) {
828
0
        i2 = zero;
829
0
      }
830
0
      assert(i2 != NULL);
831
0
      if (k < 4) {
832
0
        i3 = zero;
833
0
      }
834
0
      assert(i3 != NULL);
835
0
      if (k <= 4) {
836
0
        i4 = zero;
837
0
      }
838
0
      assert(i4 != NULL);
839
0
      if (k < 6) {
840
0
        i5 = zero;
841
0
      }
842
0
      assert(i5 != NULL);
843
0
      if (k <= 6) {
844
0
        i6 = zero;
845
0
      }
846
0
      assert(i6 != NULL);
847
0
      if (k < 8) {
848
0
        i7 = zero;
849
0
      }
850
0
      assert(i7 != NULL);
851
0
      if XNN_UNPREDICTABLE(i0 != zero) {
852
0
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
853
0
      }
854
0
      if XNN_UNPREDICTABLE(i1 != zero) {
855
0
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
856
0
      }
857
0
      if XNN_UNPREDICTABLE(i2 != zero) {
858
0
        i2 = (const float*) ((uintptr_t) i2 + input_offset);
859
0
      }
860
0
      if XNN_UNPREDICTABLE(i3 != zero) {
861
0
        i3 = (const float*) ((uintptr_t) i3 + input_offset);
862
0
      }
863
0
      if XNN_UNPREDICTABLE(i4 != zero) {
864
0
        i4 = (const float*) ((uintptr_t) i4 + input_offset);
865
0
      }
866
0
      if XNN_UNPREDICTABLE(i5 != zero) {
867
0
        i5 = (const float*) ((uintptr_t) i5 + input_offset);
868
0
      }
869
0
      if XNN_UNPREDICTABLE(i6 != zero) {
870
0
        i6 = (const float*) ((uintptr_t) i6 + input_offset);
871
0
      }
872
0
      if XNN_UNPREDICTABLE(i7 != zero) {
873
0
        i7 = (const float*) ((uintptr_t) i7 + input_offset);
874
0
      }
875
876
0
      size_t c = channels;
877
0
      float* b = buffer;
878
0
      do {
879
0
        const float vi0 = *i0++;
880
0
        const float vi1 = *i1++;
881
0
        const float vi2 = *i2++;
882
0
        const float vi3 = *i3++;
883
0
        const float vi4 = *i4++;
884
0
        const float vi5 = *i5++;
885
0
        const float vi6 = *i6++;
886
0
        const float vi7 = *i7++;
887
0
        const float vacc = *b++;
888
889
0
        const float vsum01 = vi0 + vi1;
890
0
        const float vsum23 = vi2 + vi3;
891
0
        const float vsum45 = vi4 + vi5;
892
0
        const float vsum67 = vi6 + vi7;
893
0
        const float vsum01a = vsum01 + vacc;
894
0
        const float vsum2345 = vsum23 + vsum45;
895
0
        const float vsum0167a = vsum01a + vsum67;
896
0
        const float vsum = vsum2345 + vsum0167a;
897
898
0
        float vout = vsum * vscale;
899
0
        vout = math_max_f32(vout, vmin);
900
0
        vout = math_min_f32(vout, vmax);
901
902
0
        *output++ = vout;
903
0
      } while (--c != 0);
904
0
    }
905
0
    output = (float*) ((uintptr_t) output + output_increment);
906
0
  } while (--output_pixels != 0);
907
0
}
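
Editor's note: logically this kernel computes, per channel, the sum over the pooling window (padding taps point at the caller-provided zero row, so they contribute 0.0f), scales it by params->scalar.scale (typically 1/kernel_elements), and clamps to [min, max]; the 9-then-8 blocking through the buffer scratch area only serves to bound the number of live input pointers. A hedged single-pass reference follows (hypothetical helper, not part of the file).

// Hedged single-pass reference of the per-channel computation above.
static void avgpool_minmax_ref(size_t kernel_elements, size_t channels,
                               const float* const* in,
                               float vscale, float vmin, float vmax,
                               float* out) {
  for (size_t c = 0; c < channels; c++) {
    float vsum = 0.0f;
    for (size_t k = 0; k < kernel_elements; k++) {
      vsum += in[k][c];  // padding rows are expected to alias the zero buffer
    }
    float vout = vsum * vscale;
    vout = vout < vmin ? vmin : vout;  // math_max_f32(vout, vmin)
    vout = vout > vmax ? vmax : vout;  // math_min_f32(vout, vmax)
    out[c] = vout;
  }
}
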
908
909
void xnn_f32_avgpool_minmax_ukernel_9x__scalar_c1(
910
    size_t output_pixels,
911
    size_t kernel_elements,
912
    size_t channels,
913
    const float** input,
914
    size_t input_offset,
915
    const float* zero,
916
    float* output,
917
    size_t input_increment,
918
    size_t output_increment,
919
    const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
920
0
{
921
0
  assert(output_pixels != 0);
922
0
  assert(kernel_elements != 0);
923
0
  assert(kernel_elements <= 9);
924
0
  assert(channels != 0);
925
926
0
  const float vscale = params->scalar.scale;
927
0
  const float vmin = params->scalar.min;
928
0
  const float vmax = params->scalar.max;
929
930
0
  do {
931
0
    const float* i0 = input[0];
932
0
    assert(i0 != NULL);
933
0
    const float* i1 = input[1];
934
0
    const float* i2 = input[2];
935
0
    const float* i3 = input[3];
936
0
    const float* i4 = input[4];
937
0
    const float* i5 = input[5];
938
0
    const float* i6 = input[6];
939
0
    const float* i7 = input[7];
940
0
    const float* i8 = input[8];
941
0
    input = (const float**) ((uintptr_t) input + input_increment);
942
0
    if (kernel_elements < 2) {
943
0
      i1 = zero;
944
0
    }
945
0
    assert(i1 != NULL);
946
0
    if (kernel_elements <= 2) {
947
0
      i2 = zero;
948
0
    }
949
0
    assert(i2 != NULL);
950
0
    if (kernel_elements < 4) {
951
0
      i3 = zero;
952
0
    }
953
0
    assert(i3 != NULL);
954
0
    if (kernel_elements <= 4) {
955
0
      i4 = zero;
956
0
    }
957
0
    assert(i4 != NULL);
958
0
    if (kernel_elements < 6) {
959
0
      i5 = zero;
960
0
    }
961
0
    assert(i5 != NULL);
962
0
    if (kernel_elements <= 6) {
963
0
      i6 = zero;
964
0
    }
965
0
    assert(i6 != NULL);
966
0
    if (kernel_elements < 8) {
967
0
      i7 = zero;
968
0
    }
969
0
    assert(i7 != NULL);
970
0
    if (kernel_elements <= 8) {
971
0
      i8 = zero;
972
0
    }
973
0
    assert(i8 != NULL);
974
0
    if XNN_UNPREDICTABLE(i0 != zero) {
975
0
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
976
0
    }
977
0
    if XNN_UNPREDICTABLE(i1 != zero) {
978
0
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
979
0
    }
980
0
    if XNN_UNPREDICTABLE(i2 != zero) {
981
0
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
982
0
    }
983
0
    if XNN_UNPREDICTABLE(i3 != zero) {
984
0
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
985
0
    }
986
0
    if XNN_UNPREDICTABLE(i4 != zero) {
987
0
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
988
0
    }
989
0
    if XNN_UNPREDICTABLE(i5 != zero) {
990
0
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
991
0
    }
992
0
    if XNN_UNPREDICTABLE(i6 != zero) {
993
0
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
994
0
    }
995
0
    if XNN_UNPREDICTABLE(i7 != zero) {
996
0
      i7 = (const float*) ((uintptr_t) i7 + input_offset);
997
0
    }
998
0
    if XNN_UNPREDICTABLE(i8 != zero) {
999
0
      i8 = (const float*) ((uintptr_t) i8 + input_offset);
1000
0
    }
1001
1002
0
    size_t c = channels;
1003
0
    do {
1004
0
      const float vi0 = *i0++;
1005
0
      const float vi1 = *i1++;
1006
0
      const float vi2 = *i2++;
1007
0
      const float vi3 = *i3++;
1008
0
      const float vi4 = *i4++;
1009
0
      const float vi5 = *i5++;
1010
0
      const float vi6 = *i6++;
1011
0
      const float vi7 = *i7++;
1012
0
      const float vi8 = *i8++;
1013
1014
0
      const float vsum01 = vi0 + vi1;
1015
0
      const float vsum23 = vi2 + vi3;
1016
0
      const float vsum45 = vi4 + vi5;
1017
0
      const float vsum67 = vi6 + vi7;
1018
0
      const float vsum018 = vsum01 + vi8;
1019
0
      const float vsum2345 = vsum23 + vsum45;
1020
0
      const float vsum01678 = vsum018 + vsum67;
1021
0
      const float vsum = vsum2345 + vsum01678;
1022
1023
0
      float vout = vsum * vscale;
1024
0
      vout = math_max_f32(vout, vmin);
1025
0
      vout = math_min_f32(vout, vmax);
1026
1027
0
      *output++ = vout;
1028
0
    } while (--c != 0);
1029
0
    output = (float*) ((uintptr_t) output + output_increment);
1030
0
  } while (--output_pixels != 0);
1031
0
}
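
Editor's note: a hypothetical caller-side helper, to make explicit that the averaging divisor is not computed inside the kernel: the caller bakes it into params->scalar.scale (field names taken from the loads above). In the library such params are normally filled by an init function, so treat this as an illustration under that assumption.

// Hypothetical setup helper (illustration only).
static void init_avgpool_scaleminmax_params_ref(
    union xnn_f32_scaleminmax_params* params, size_t kernel_elements) {
  params->scalar.scale = 1.0f / (float) kernel_elements;
  params->scalar.min = -INFINITY;  // effectively disable clamping
  params->scalar.max = +INFINITY;
}
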
1032
1033
void xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1(
1034
    size_t input_height,
1035
    size_t input_width,
1036
    size_t output_y_start,
1037
    size_t output_y_end,
1038
    const float* input,
1039
    const float* zero,
1040
    const float* weights,
1041
    float* output,
1042
    size_t input_padding_top,
1043
    size_t output_channels,
1044
    size_t output_height_stride,
1045
    size_t output_channel_stride,
1046
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
1047
0
{
1048
0
  assert(input_width != 0);
1049
0
  assert(output_y_end > output_y_start);
1050
0
  assert(input_padding_top <= 1);
1051
0
  assert(output_channels != 0);
1052
1053
0
  const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
1054
0
  const size_t input_width_decrement = round_down_po2(input_width, 2) * 3 /* channels */ * sizeof(float);
1055
0
  const size_t output_width = (input_width + 1) / 2;
1056
0
  const size_t output_channel_increment = output_channel_stride * 4 - output_width * sizeof(float);
1057
1058
  // Adjustment for padding processed below
1059
0
  const float* i0 = (const float*) ((uintptr_t) input + input_height_stride * (output_y_start * 2 - input_padding_top));
1060
0
  const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
1061
0
  const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
1062
0
  float* output0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
1063
1064
0
  if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
1065
0
    i0 = zero;
1066
0
  }
1067
1068
0
  const float voutput_max = params->scalar.max;
1069
0
  const float voutput_min = params->scalar.min;
1070
1071
0
  for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 1) {
1072
0
    const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
1073
0
    if XNN_UNPREDICTABLE(input_y2 >= input_height) {
1074
0
      i2 = zero;
1075
0
    }
1076
1077
0
    const float* w = weights;
1078
0
    size_t c = output_channels;
1079
0
    float* o0c0 = output0;
1080
0
    float* o0c1 = (float*) ((uintptr_t) o0c0 + output_channel_stride);
1081
0
    float* o0c2 = (float*) ((uintptr_t) o0c1 + output_channel_stride);
1082
0
    float* o0c3 = (float*) ((uintptr_t) o0c2 + output_channel_stride);
1083
0
    do {
1084
0
      if XNN_UNPREDICTABLE(c < 2) {
1085
0
        o0c1 = o0c0;
1086
0
      }
1087
0
      if XNN_UNPREDICTABLE(c <= 2) {
1088
0
        o0c2 = o0c1;
1089
0
      }
1090
0
      if XNN_UNPREDICTABLE(c < 4) {
1091
0
        o0c3 = o0c2;
1092
0
      }
1093
1094
      // Left edge padding
1095
0
      float vi00c0 = 0.0f;
1096
0
      float vi00c1 = 0.0f;
1097
0
      float vi00c2 = 0.0f;
1098
0
      float vi10c0 = 0.0f;
1099
0
      float vi10c1 = 0.0f;
1100
0
      float vi10c2 = 0.0f;
1101
0
      float vi20c0 = 0.0f;
1102
0
      float vi20c1 = 0.0f;
1103
0
      float vi20c2 = 0.0f;
1104
1105
0
      size_t iw = input_width;
1106
0
      for (; iw >= 2; iw -= 2) {
1107
0
        float voc0 = w[0];
1108
0
        float voc1 = w[1];
1109
0
        float voc2 = w[2];
1110
0
        float voc3 = w[3];
1111
1112
0
        const float vk00c0x0 = w[4];
1113
0
        const float vk00c0x1 = w[5];
1114
0
        const float vk00c0x2 = w[6];
1115
0
        const float vk00c0x3 = w[7];
1116
1117
0
        voc0 += vk00c0x0 * vi00c0;
1118
0
        voc1 += vk00c0x1 * vi00c0;
1119
0
        voc2 += vk00c0x2 * vi00c0;
1120
0
        voc3 += vk00c0x3 * vi00c0;
1121
1122
0
        const float vk10c0x0 = w[8];
1123
0
        const float vk10c0x1 = w[9];
1124
0
        const float vk10c0x2 = w[10];
1125
0
        const float vk10c0x3 = w[11];
1126
1127
0
        voc0 += vk10c0x0 * vi10c0;
1128
0
        voc1 += vk10c0x1 * vi10c0;
1129
0
        voc2 += vk10c0x2 * vi10c0;
1130
0
        voc3 += vk10c0x3 * vi10c0;
1131
1132
0
        const float vk20c0x0 = w[12];
1133
0
        const float vk20c0x1 = w[13];
1134
0
        const float vk20c0x2 = w[14];
1135
0
        const float vk20c0x3 = w[15];
1136
1137
0
        voc0 += vk20c0x0 * vi20c0;
1138
0
        voc1 += vk20c0x1 * vi20c0;
1139
0
        voc2 += vk20c0x2 * vi20c0;
1140
0
        voc3 += vk20c0x3 * vi20c0;
1141
1142
0
        const float vk00c1x0 = w[16];
1143
0
        const float vk00c1x1 = w[17];
1144
0
        const float vk00c1x2 = w[18];
1145
0
        const float vk00c1x3 = w[19];
1146
1147
0
        voc0 += vk00c1x0 * vi00c1;
1148
0
        voc1 += vk00c1x1 * vi00c1;
1149
0
        voc2 += vk00c1x2 * vi00c1;
1150
0
        voc3 += vk00c1x3 * vi00c1;
1151
1152
0
        const float vk10c1x0 = w[20];
1153
0
        const float vk10c1x1 = w[21];
1154
0
        const float vk10c1x2 = w[22];
1155
0
        const float vk10c1x3 = w[23];
1156
1157
0
        voc0 += vk10c1x0 * vi10c1;
1158
0
        voc1 += vk10c1x1 * vi10c1;
1159
0
        voc2 += vk10c1x2 * vi10c1;
1160
0
        voc3 += vk10c1x3 * vi10c1;
1161
1162
0
        const float vk20c1x0 = w[24];
1163
0
        const float vk20c1x1 = w[25];
1164
0
        const float vk20c1x2 = w[26];
1165
0
        const float vk20c1x3 = w[27];
1166
1167
0
        voc0 += vk20c1x0 * vi20c1;
1168
0
        voc1 += vk20c1x1 * vi20c1;
1169
0
        voc2 += vk20c1x2 * vi20c1;
1170
0
        voc3 += vk20c1x3 * vi20c1;
1171
1172
0
        const float vk00c2x0 = w[28];
1173
0
        const float vk00c2x1 = w[29];
1174
0
        const float vk00c2x2 = w[30];
1175
0
        const float vk00c2x3 = w[31];
1176
1177
0
        voc0 += vk00c2x0 * vi00c2;
1178
0
        voc1 += vk00c2x1 * vi00c2;
1179
0
        voc2 += vk00c2x2 * vi00c2;
1180
0
        voc3 += vk00c2x3 * vi00c2;
1181
1182
0
        const float vk10c2x0 = w[32];
1183
0
        const float vk10c2x1 = w[33];
1184
0
        const float vk10c2x2 = w[34];
1185
0
        const float vk10c2x3 = w[35];
1186
1187
0
        voc0 += vk10c2x0 * vi10c2;
1188
0
        voc1 += vk10c2x1 * vi10c2;
1189
0
        voc2 += vk10c2x2 * vi10c2;
1190
0
        voc3 += vk10c2x3 * vi10c2;
1191
1192
0
        const float vk20c2x0 = w[36];
1193
0
        const float vk20c2x1 = w[37];
1194
0
        const float vk20c2x2 = w[38];
1195
0
        const float vk20c2x3 = w[39];
1196
1197
0
        voc0 += vk20c2x0 * vi20c2;
1198
0
        voc1 += vk20c2x1 * vi20c2;
1199
0
        voc2 += vk20c2x2 * vi20c2;
1200
0
        voc3 += vk20c2x3 * vi20c2;
1201
1202
0
        const float vk01c0x0 = w[40];
1203
0
        const float vk01c0x1 = w[41];
1204
0
        const float vk01c0x2 = w[42];
1205
0
        const float vk01c0x3 = w[43];
1206
1207
0
        const float vi01c0 = i0[0];
1208
1209
0
        voc0 += vk01c0x0 * vi01c0;
1210
0
        voc1 += vk01c0x1 * vi01c0;
1211
0
        voc2 += vk01c0x2 * vi01c0;
1212
0
        voc3 += vk01c0x3 * vi01c0;
1213
1214
0
        const float vk11c0x0 = w[44];
1215
0
        const float vk11c0x1 = w[45];
1216
0
        const float vk11c0x2 = w[46];
1217
0
        const float vk11c0x3 = w[47];
1218
1219
0
        const float vi11c0 = i1[0];
1220
1221
0
        voc0 += vk11c0x0 * vi11c0;
1222
0
        voc1 += vk11c0x1 * vi11c0;
1223
0
        voc2 += vk11c0x2 * vi11c0;
1224
0
        voc3 += vk11c0x3 * vi11c0;
1225
1226
0
        const float vk21c0x0 = w[48];
1227
0
        const float vk21c0x1 = w[49];
1228
0
        const float vk21c0x2 = w[50];
1229
0
        const float vk21c0x3 = w[51];
1230
1231
0
        const float vi21c0 = i2[0];
1232
1233
0
        voc0 += vk21c0x0 * vi21c0;
1234
0
        voc1 += vk21c0x1 * vi21c0;
1235
0
        voc2 += vk21c0x2 * vi21c0;
1236
0
        voc3 += vk21c0x3 * vi21c0;
1237
1238
0
        const float vk01c1x0 = w[52];
1239
0
        const float vk01c1x1 = w[53];
1240
0
        const float vk01c1x2 = w[54];
1241
0
        const float vk01c1x3 = w[55];
1242
1243
0
        const float vi01c1 = i0[1];
1244
1245
0
        voc0 += vk01c1x0 * vi01c1;
1246
0
        voc1 += vk01c1x1 * vi01c1;
1247
0
        voc2 += vk01c1x2 * vi01c1;
1248
0
        voc3 += vk01c1x3 * vi01c1;
1249
1250
0
        const float vk11c1x0 = w[56];
1251
0
        const float vk11c1x1 = w[57];
1252
0
        const float vk11c1x2 = w[58];
1253
0
        const float vk11c1x3 = w[59];
1254
1255
0
        const float vi11c1 = i1[1];
1256
1257
0
        voc0 += vk11c1x0 * vi11c1;
1258
0
        voc1 += vk11c1x1 * vi11c1;
1259
0
        voc2 += vk11c1x2 * vi11c1;
1260
0
        voc3 += vk11c1x3 * vi11c1;
1261
1262
0
        const float vk21c1x0 = w[60];
1263
0
        const float vk21c1x1 = w[61];
1264
0
        const float vk21c1x2 = w[62];
1265
0
        const float vk21c1x3 = w[63];
1266
1267
0
        const float vi21c1 = i2[1];
1268
1269
0
        voc0 += vk21c1x0 * vi21c1;
1270
0
        voc1 += vk21c1x1 * vi21c1;
1271
0
        voc2 += vk21c1x2 * vi21c1;
1272
0
        voc3 += vk21c1x3 * vi21c1;
1273
1274
0
        const float vk01c2x0 = w[64];
1275
0
        const float vk01c2x1 = w[65];
1276
0
        const float vk01c2x2 = w[66];
1277
0
        const float vk01c2x3 = w[67];
1278
1279
0
        const float vi01c2 = i0[2];
1280
1281
0
        voc0 += vk01c2x0 * vi01c2;
1282
0
        voc1 += vk01c2x1 * vi01c2;
1283
0
        voc2 += vk01c2x2 * vi01c2;
1284
0
        voc3 += vk01c2x3 * vi01c2;
1285
1286
0
        const float vk11c2x0 = w[68];
1287
0
        const float vk11c2x1 = w[69];
1288
0
        const float vk11c2x2 = w[70];
1289
0
        const float vk11c2x3 = w[71];
1290
1291
0
        const float vi11c2 = i1[2];
1292
1293
0
        voc0 += vk11c2x0 * vi11c2;
1294
0
        voc1 += vk11c2x1 * vi11c2;
1295
0
        voc2 += vk11c2x2 * vi11c2;
1296
0
        voc3 += vk11c2x3 * vi11c2;
1297
1298
0
        const float vk21c2x0 = w[72];
1299
0
        const float vk21c2x1 = w[73];
1300
0
        const float vk21c2x2 = w[74];
1301
0
        const float vk21c2x3 = w[75];
1302
1303
0
        const float vi21c2 = i2[2];
1304
1305
0
        voc0 += vk21c2x0 * vi21c2;
1306
0
        voc1 += vk21c2x1 * vi21c2;
1307
0
        voc2 += vk21c2x2 * vi21c2;
1308
0
        voc3 += vk21c2x3 * vi21c2;
1309
1310
0
        const float vk02c0x0 = w[76];
1311
0
        const float vk02c0x1 = w[77];
1312
0
        const float vk02c0x2 = w[78];
1313
0
        const float vk02c0x3 = w[79];
1314
1315
0
        const float vi02c0 = i0[3];
1316
1317
0
        voc0 += vk02c0x0 * vi02c0;
1318
0
        voc1 += vk02c0x1 * vi02c0;
1319
0
        voc2 += vk02c0x2 * vi02c0;
1320
0
        voc3 += vk02c0x3 * vi02c0;
1321
1322
0
        const float vk12c0x0 = w[80];
1323
0
        const float vk12c0x1 = w[81];
1324
0
        const float vk12c0x2 = w[82];
1325
0
        const float vk12c0x3 = w[83];
1326
1327
0
        const float vi12c0 = i1[3];
1328
1329
0
        voc0 += vk12c0x0 * vi12c0;
1330
0
        voc1 += vk12c0x1 * vi12c0;
1331
0
        voc2 += vk12c0x2 * vi12c0;
1332
0
        voc3 += vk12c0x3 * vi12c0;
1333
1334
0
        const float vk22c0x0 = w[84];
1335
0
        const float vk22c0x1 = w[85];
1336
0
        const float vk22c0x2 = w[86];
1337
0
        const float vk22c0x3 = w[87];
1338
1339
0
        const float vi22c0 = i2[3];
1340
1341
0
        voc0 += vk22c0x0 * vi22c0;
1342
0
        voc1 += vk22c0x1 * vi22c0;
1343
0
        voc2 += vk22c0x2 * vi22c0;
1344
0
        voc3 += vk22c0x3 * vi22c0;
1345
1346
0
        vi00c0 = vi02c0;
1347
0
        vi10c0 = vi12c0;
1348
0
        vi20c0 = vi22c0;
1349
1350
0
        const float vk02c1x0 = w[88];
1351
0
        const float vk02c1x1 = w[89];
1352
0
        const float vk02c1x2 = w[90];
1353
0
        const float vk02c1x3 = w[91];
1354
1355
0
        const float vi02c1 = i0[4];
1356
1357
0
        voc0 += vk02c1x0 * vi02c1;
1358
0
        voc1 += vk02c1x1 * vi02c1;
1359
0
        voc2 += vk02c1x2 * vi02c1;
1360
0
        voc3 += vk02c1x3 * vi02c1;
1361
1362
0
        const float vk12c1x0 = w[92];
1363
0
        const float vk12c1x1 = w[93];
1364
0
        const float vk12c1x2 = w[94];
1365
0
        const float vk12c1x3 = w[95];
1366
1367
0
        const float vi12c1 = i1[4];
1368
1369
0
        voc0 += vk12c1x0 * vi12c1;
1370
0
        voc1 += vk12c1x1 * vi12c1;
1371
0
        voc2 += vk12c1x2 * vi12c1;
1372
0
        voc3 += vk12c1x3 * vi12c1;
1373
1374
0
        const float vk22c1x0 = w[96];
1375
0
        const float vk22c1x1 = w[97];
1376
0
        const float vk22c1x2 = w[98];
1377
0
        const float vk22c1x3 = w[99];
1378
1379
0
        const float vi22c1 = i2[4];
1380
1381
0
        voc0 += vk22c1x0 * vi22c1;
1382
0
        voc1 += vk22c1x1 * vi22c1;
1383
0
        voc2 += vk22c1x2 * vi22c1;
1384
0
        voc3 += vk22c1x3 * vi22c1;
1385
1386
0
        vi00c1 = vi02c1;
1387
0
        vi10c1 = vi12c1;
1388
0
        vi20c1 = vi22c1;
1389
1390
0
        const float vk02c2x0 = w[100];
1391
0
        const float vk02c2x1 = w[101];
1392
0
        const float vk02c2x2 = w[102];
1393
0
        const float vk02c2x3 = w[103];
1394
1395
0
        const float vi02c2 = i0[5];
1396
1397
0
        voc0 += vk02c2x0 * vi02c2;
1398
0
        voc1 += vk02c2x1 * vi02c2;
1399
0
        voc2 += vk02c2x2 * vi02c2;
1400
0
        voc3 += vk02c2x3 * vi02c2;
1401
1402
0
        const float vk12c2x0 = w[104];
1403
0
        const float vk12c2x1 = w[105];
1404
0
        const float vk12c2x2 = w[106];
1405
0
        const float vk12c2x3 = w[107];
1406
1407
0
        const float vi12c2 = i1[5];
1408
1409
0
        voc0 += vk12c2x0 * vi12c2;
1410
0
        voc1 += vk12c2x1 * vi12c2;
1411
0
        voc2 += vk12c2x2 * vi12c2;
1412
0
        voc3 += vk12c2x3 * vi12c2;
1413
1414
0
        const float vk22c2x0 = w[108];
1415
0
        const float vk22c2x1 = w[109];
1416
0
        const float vk22c2x2 = w[110];
1417
0
        const float vk22c2x3 = w[111];
1418
1419
0
        const float vi22c2 = i2[5];
1420
1421
0
        voc0 += vk22c2x0 * vi22c2;
1422
0
        voc1 += vk22c2x1 * vi22c2;
1423
0
        voc2 += vk22c2x2 * vi22c2;
1424
0
        voc3 += vk22c2x3 * vi22c2;
1425
1426
0
        vi00c2 = vi02c2;
1427
0
        vi10c2 = vi12c2;
1428
0
        vi20c2 = vi22c2;
1429
1430
0
        voc0 = math_min_f32(voc0, voutput_max);
1431
0
        voc1 = math_min_f32(voc1, voutput_max);
1432
0
        voc2 = math_min_f32(voc2, voutput_max);
1433
0
        voc3 = math_min_f32(voc3, voutput_max);
1434
1435
0
        voc0 = math_max_f32(voc0, voutput_min);
1436
0
        voc1 = math_max_f32(voc1, voutput_min);
1437
0
        voc2 = math_max_f32(voc2, voutput_min);
1438
0
        voc3 = math_max_f32(voc3, voutput_min);
1439
1440
0
        *o0c0++ = voc0;
1441
0
        *o0c1++ = voc1;
1442
0
        *o0c2++ = voc2;
1443
0
        *o0c3++ = voc3;
1444
1445
0
        i0 += 6;
1446
0
        i1 += 6;
1447
0
        i2 += 6;
1448
0
      }
1449
0
      assert(iw < 2);
1450
0
      if XNN_UNLIKELY(iw != 0) {
1451
0
        float voc0 = w[0];
1452
0
        float voc1 = w[1];
1453
0
        float voc2 = w[2];
1454
0
        float voc3 = w[3];
1455
1456
0
        const float vk00c0x0 = w[4];
1457
0
        const float vk00c0x1 = w[5];
1458
0
        const float vk00c0x2 = w[6];
1459
0
        const float vk00c0x3 = w[7];
1460
1461
0
        voc0 += vk00c0x0 * vi00c0;
1462
0
        voc1 += vk00c0x1 * vi00c0;
1463
0
        voc2 += vk00c0x2 * vi00c0;
1464
0
        voc3 += vk00c0x3 * vi00c0;
1465
1466
0
        const float vk10c0x0 = w[8];
1467
0
        const float vk10c0x1 = w[9];
1468
0
        const float vk10c0x2 = w[10];
1469
0
        const float vk10c0x3 = w[11];
1470
1471
0
        voc0 += vk10c0x0 * vi10c0;
1472
0
        voc1 += vk10c0x1 * vi10c0;
1473
0
        voc2 += vk10c0x2 * vi10c0;
1474
0
        voc3 += vk10c0x3 * vi10c0;
1475
1476
0
        const float vk20c0x0 = w[12];
1477
0
        const float vk20c0x1 = w[13];
1478
0
        const float vk20c0x2 = w[14];
1479
0
        const float vk20c0x3 = w[15];
1480
1481
0
        voc0 += vk20c0x0 * vi20c0;
1482
0
        voc1 += vk20c0x1 * vi20c0;
1483
0
        voc2 += vk20c0x2 * vi20c0;
1484
0
        voc3 += vk20c0x3 * vi20c0;
1485
1486
0
        const float vk00c1x0 = w[16];
1487
0
        const float vk00c1x1 = w[17];
1488
0
        const float vk00c1x2 = w[18];
1489
0
        const float vk00c1x3 = w[19];
1490
1491
0
        voc0 += vk00c1x0 * vi00c1;
1492
0
        voc1 += vk00c1x1 * vi00c1;
1493
0
        voc2 += vk00c1x2 * vi00c1;
1494
0
        voc3 += vk00c1x3 * vi00c1;
1495
1496
0
        const float vk10c1x0 = w[20];
1497
0
        const float vk10c1x1 = w[21];
1498
0
        const float vk10c1x2 = w[22];
1499
0
        const float vk10c1x3 = w[23];
1500
1501
0
        voc0 += vk10c1x0 * vi10c1;
1502
0
        voc1 += vk10c1x1 * vi10c1;
1503
0
        voc2 += vk10c1x2 * vi10c1;
1504
0
        voc3 += vk10c1x3 * vi10c1;
1505
1506
0
        const float vk20c1x0 = w[24];
1507
0
        const float vk20c1x1 = w[25];
1508
0
        const float vk20c1x2 = w[26];
1509
0
        const float vk20c1x3 = w[27];
1510
1511
0
        voc0 += vk20c1x0 * vi20c1;
1512
0
        voc1 += vk20c1x1 * vi20c1;
1513
0
        voc2 += vk20c1x2 * vi20c1;
1514
0
        voc3 += vk20c1x3 * vi20c1;
1515
1516
0
        const float vk00c2x0 = w[28];
1517
0
        const float vk00c2x1 = w[29];
1518
0
        const float vk00c2x2 = w[30];
1519
0
        const float vk00c2x3 = w[31];
1520
1521
0
        voc0 += vk00c2x0 * vi00c2;
1522
0
        voc1 += vk00c2x1 * vi00c2;
1523
0
        voc2 += vk00c2x2 * vi00c2;
1524
0
        voc3 += vk00c2x3 * vi00c2;
1525
1526
0
        const float vk10c2x0 = w[32];
1527
0
        const float vk10c2x1 = w[33];
1528
0
        const float vk10c2x2 = w[34];
1529
0
        const float vk10c2x3 = w[35];
1530
1531
0
        voc0 += vk10c2x0 * vi10c2;
1532
0
        voc1 += vk10c2x1 * vi10c2;
1533
0
        voc2 += vk10c2x2 * vi10c2;
1534
0
        voc3 += vk10c2x3 * vi10c2;
1535
1536
0
        const float vk20c2x0 = w[36];
1537
0
        const float vk20c2x1 = w[37];
1538
0
        const float vk20c2x2 = w[38];
1539
0
        const float vk20c2x3 = w[39];
1540
1541
0
        voc0 += vk20c2x0 * vi20c2;
1542
0
        voc1 += vk20c2x1 * vi20c2;
1543
0
        voc2 += vk20c2x2 * vi20c2;
1544
0
        voc3 += vk20c2x3 * vi20c2;
1545
1546
0
        const float vk01c0x0 = w[40];
1547
0
        const float vk01c0x1 = w[41];
1548
0
        const float vk01c0x2 = w[42];
1549
0
        const float vk01c0x3 = w[43];
1550
1551
0
        const float vi01c0 = i0[0];
1552
1553
0
        voc0 += vk01c0x0 * vi01c0;
1554
0
        voc1 += vk01c0x1 * vi01c0;
1555
0
        voc2 += vk01c0x2 * vi01c0;
1556
0
        voc3 += vk01c0x3 * vi01c0;
1557
1558
0
        const float vk11c0x0 = w[44];
1559
0
        const float vk11c0x1 = w[45];
1560
0
        const float vk11c0x2 = w[46];
1561
0
        const float vk11c0x3 = w[47];
1562
1563
0
        const float vi11c0 = i1[0];
1564
1565
0
        voc0 += vk11c0x0 * vi11c0;
1566
0
        voc1 += vk11c0x1 * vi11c0;
1567
0
        voc2 += vk11c0x2 * vi11c0;
1568
0
        voc3 += vk11c0x3 * vi11c0;
1569
1570
0
        const float vk21c0x0 = w[48];
1571
0
        const float vk21c0x1 = w[49];
1572
0
        const float vk21c0x2 = w[50];
1573
0
        const float vk21c0x3 = w[51];
1574
1575
0
        const float vi21c0 = i2[0];
1576
1577
0
        voc0 += vk21c0x0 * vi21c0;
1578
0
        voc1 += vk21c0x1 * vi21c0;
1579
0
        voc2 += vk21c0x2 * vi21c0;
1580
0
        voc3 += vk21c0x3 * vi21c0;
1581
1582
0
        const float vk01c1x0 = w[52];
1583
0
        const float vk01c1x1 = w[53];
1584
0
        const float vk01c1x2 = w[54];
1585
0
        const float vk01c1x3 = w[55];
1586
1587
0
        const float vi01c1 = i0[1];
1588
1589
0
        voc0 += vk01c1x0 * vi01c1;
1590
0
        voc1 += vk01c1x1 * vi01c1;
1591
0
        voc2 += vk01c1x2 * vi01c1;
1592
0
        voc3 += vk01c1x3 * vi01c1;
1593
1594
0
        const float vk11c1x0 = w[56];
1595
0
        const float vk11c1x1 = w[57];
1596
0
        const float vk11c1x2 = w[58];
1597
0
        const float vk11c1x3 = w[59];
1598
1599
0
        const float vi11c1 = i1[1];
1600
1601
0
        voc0 += vk11c1x0 * vi11c1;
1602
0
        voc1 += vk11c1x1 * vi11c1;
1603
0
        voc2 += vk11c1x2 * vi11c1;
1604
0
        voc3 += vk11c1x3 * vi11c1;
1605
1606
0
        const float vk21c1x0 = w[60];
1607
0
        const float vk21c1x1 = w[61];
1608
0
        const float vk21c1x2 = w[62];
1609
0
        const float vk21c1x3 = w[63];
1610
1611
0
        const float vi21c1 = i2[1];
1612
1613
0
        voc0 += vk21c1x0 * vi21c1;
1614
0
        voc1 += vk21c1x1 * vi21c1;
1615
0
        voc2 += vk21c1x2 * vi21c1;
1616
0
        voc3 += vk21c1x3 * vi21c1;
1617
1618
0
        const float vk01c2x0 = w[64];
1619
0
        const float vk01c2x1 = w[65];
1620
0
        const float vk01c2x2 = w[66];
1621
0
        const float vk01c2x3 = w[67];
1622
1623
0
        const float vi01c2 = i0[2];
1624
1625
0
        voc0 += vk01c2x0 * vi01c2;
1626
0
        voc1 += vk01c2x1 * vi01c2;
1627
0
        voc2 += vk01c2x2 * vi01c2;
1628
0
        voc3 += vk01c2x3 * vi01c2;
1629
1630
0
        const float vk11c2x0 = w[68];
1631
0
        const float vk11c2x1 = w[69];
1632
0
        const float vk11c2x2 = w[70];
1633
0
        const float vk11c2x3 = w[71];
1634
1635
0
        const float vi11c2 = i1[2];
1636
1637
0
        voc0 += vk11c2x0 * vi11c2;
1638
0
        voc1 += vk11c2x1 * vi11c2;
1639
0
        voc2 += vk11c2x2 * vi11c2;
1640
0
        voc3 += vk11c2x3 * vi11c2;
1641
1642
0
        const float vk21c2x0 = w[72];
1643
0
        const float vk21c2x1 = w[73];
1644
0
        const float vk21c2x2 = w[74];
1645
0
        const float vk21c2x3 = w[75];
1646
1647
0
        const float vi21c2 = i2[2];
1648
1649
0
        voc0 += vk21c2x0 * vi21c2;
1650
0
        voc1 += vk21c2x1 * vi21c2;
1651
0
        voc2 += vk21c2x2 * vi21c2;
1652
0
        voc3 += vk21c2x3 * vi21c2;
1653
1654
0
        voc0 = math_min_f32(voc0, voutput_max);
1655
0
        voc1 = math_min_f32(voc1, voutput_max);
1656
0
        voc2 = math_min_f32(voc2, voutput_max);
1657
0
        voc3 = math_min_f32(voc3, voutput_max);
1658
1659
0
        voc0 = math_max_f32(voc0, voutput_min);
1660
0
        voc1 = math_max_f32(voc1, voutput_min);
1661
0
        voc2 = math_max_f32(voc2, voutput_min);
1662
0
        voc3 = math_max_f32(voc3, voutput_min);
1663
1664
0
        *o0c0++ = voc0;
1665
0
        *o0c1++ = voc1;
1666
0
        *o0c2++ = voc2;
1667
0
        *o0c3++ = voc3;
1668
0
      }
1669
      // Move output pointers back to the position of the first pixel in a row,
1670
      // and forward to the next block of output channels.
1671
0
      o0c0 = (float*) ((uintptr_t) o0c0 + output_channel_increment);
1672
0
      o0c1 = (float*) ((uintptr_t) o0c1 + output_channel_increment);
1673
0
      o0c2 = (float*) ((uintptr_t) o0c2 + output_channel_increment);
1674
0
      o0c3 = (float*) ((uintptr_t) o0c3 + output_channel_increment);
1675
      // Revert input pointers to the position of the first pixel in a row
1676
0
      i0 = (const float*) ((uintptr_t) i0 - input_width_decrement);
1677
0
      i1 = (const float*) ((uintptr_t) i1 - input_width_decrement);
1678
0
      i2 = (const float*) ((uintptr_t) i2 - input_width_decrement);
1679
      // Move to the block of weights for the next 4 output channels
1680
0
      w += 112;
1681
0
      c = doz(c, 4);
1682
0
    } while (c != 0);
1683
    // Move output pointers forward to the next row
1684
0
    output0 = (float*) ((uintptr_t) output0 + output_height_stride);
1685
    // Move input pointers forward to the next row
1686
0
    i0 = i2;
1687
0
    i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
1688
0
    i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
1689
0
  }
1690
0
}
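// The tail above folds each input sample into four output-channel
// accumulators (voc0..voc3), clamps to [voutput_min, voutput_max], and stores
// one value per per-channel output pointer. Below is a minimal standalone
// sketch of that accumulate-clamp-store step; the flat `w` layout (groups of
// 4 weights per tap) and the function name are illustrative assumptions, not
// the packed layout the kernel above consumes.
#include <stddef.h>

static float clamp_f32(float v, float vmin, float vmax) {
  v = v < vmin ? vmin : v;       // lower clamp, like math_max_f32(v, vmin)
  return v > vmax ? vmax : v;    // upper clamp, like math_min_f32(v, vmax)
}

// Accumulates `taps` input samples into 4 output-channel accumulators seeded
// from out[0..3], then clamps and writes the results back.
void accumulate_4_output_channels(
    const float* vi, const float* w, size_t taps,
    float vmin, float vmax, float out[4]) {
  float voc0 = out[0], voc1 = out[1], voc2 = out[2], voc3 = out[3];
  for (size_t t = 0; t < taps; t++) {
    const float v = vi[t];
    voc0 += w[4 * t + 0] * v;
    voc1 += w[4 * t + 1] * v;
    voc2 += w[4 * t + 2] * v;
    voc3 += w[4 * t + 3] * v;
  }
  out[0] = clamp_f32(voc0, vmin, vmax);
  out[1] = clamp_f32(voc1, vmin, vmax);
  out[2] = clamp_f32(voc2, vmin, vmax);
  out[3] = clamp_f32(voc3, vmin, vmax);
}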
1691
1692
void xnn_f32_dwconv_minmax_ukernel_25p1c__scalar_acc2(
1693
    size_t channels,
1694
    size_t output_width,
1695
    const float** input,
1696
    const float* weights,
1697
    float* output,
1698
    intptr_t input_stride,
1699
    size_t output_increment,
1700
    size_t input_offset,
1701
    const float* zero,
1702
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
1703
0
{
1704
0
  assert(channels != 0);
1705
0
  assert(output_width != 0);
1706
1707
0
  const float vmin = params->scalar.min;
1708
0
  const float vmax = params->scalar.max;
1709
0
  do {
1710
0
    const float* i0 = input[0];
1711
0
    assert(i0 != NULL);
1712
0
    if XNN_UNPREDICTABLE(i0 != zero) {
1713
0
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
1714
0
    }
1715
0
    const float* i1 = input[1];
1716
0
    assert(i1 != NULL);
1717
0
    if XNN_UNPREDICTABLE(i1 != zero) {
1718
0
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
1719
0
    }
1720
0
    const float* i2 = input[2];
1721
0
    assert(i2 != NULL);
1722
0
    if XNN_UNPREDICTABLE(i2 != zero) {
1723
0
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
1724
0
    }
1725
0
    const float* i3 = input[3];
1726
0
    assert(i3 != NULL);
1727
0
    if XNN_UNPREDICTABLE(i3 != zero) {
1728
0
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
1729
0
    }
1730
0
    const float* i4 = input[4];
1731
0
    assert(i4 != NULL);
1732
0
    if XNN_UNPREDICTABLE(i4 != zero) {
1733
0
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
1734
0
    }
1735
0
    const float* i5 = input[5];
1736
0
    assert(i5 != NULL);
1737
0
    if XNN_UNPREDICTABLE(i5 != zero) {
1738
0
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
1739
0
    }
1740
0
    const float* i6 = input[6];
1741
0
    assert(i6 != NULL);
1742
0
    if XNN_UNPREDICTABLE(i6 != zero) {
1743
0
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
1744
0
    }
1745
0
    const float* i7 = input[7];
1746
0
    assert(i7 != NULL);
1747
0
    if XNN_UNPREDICTABLE(i7 != zero) {
1748
0
      i7 = (const float*) ((uintptr_t) i7 + input_offset);
1749
0
    }
1750
0
    const float* i8 = input[8];
1751
0
    assert(i8 != NULL);
1752
0
    if XNN_UNPREDICTABLE(i8 != zero) {
1753
0
      i8 = (const float*) ((uintptr_t) i8 + input_offset);
1754
0
    }
1755
0
    const float* i9 = input[9];
1756
0
    assert(i9 != NULL);
1757
0
    if XNN_UNPREDICTABLE(i9 != zero) {
1758
0
      i9 = (const float*) ((uintptr_t) i9 + input_offset);
1759
0
    }
1760
0
    const float* i10 = input[10];
1761
0
    assert(i10 != NULL);
1762
0
    if XNN_UNPREDICTABLE(i10 != zero) {
1763
0
      i10 = (const float*) ((uintptr_t) i10 + input_offset);
1764
0
    }
1765
0
    const float* i11 = input[11];
1766
0
    assert(i11 != NULL);
1767
0
    if XNN_UNPREDICTABLE(i11 != zero) {
1768
0
      i11 = (const float*) ((uintptr_t) i11 + input_offset);
1769
0
    }
1770
0
    const float* i12 = input[12];
1771
0
    assert(i12 != NULL);
1772
0
    if XNN_UNPREDICTABLE(i12 != zero) {
1773
0
      i12 = (const float*) ((uintptr_t) i12 + input_offset);
1774
0
    }
1775
0
    const float* i13 = input[13];
1776
0
    assert(i13 != NULL);
1777
0
    if XNN_UNPREDICTABLE(i13 != zero) {
1778
0
      i13 = (const float*) ((uintptr_t) i13 + input_offset);
1779
0
    }
1780
0
    const float* i14 = input[14];
1781
0
    assert(i14 != NULL);
1782
0
    if XNN_UNPREDICTABLE(i14 != zero) {
1783
0
      i14 = (const float*) ((uintptr_t) i14 + input_offset);
1784
0
    }
1785
0
    const float* i15 = input[15];
1786
0
    assert(i15 != NULL);
1787
0
    if XNN_UNPREDICTABLE(i15 != zero) {
1788
0
      i15 = (const float*) ((uintptr_t) i15 + input_offset);
1789
0
    }
1790
0
    const float* i16 = input[16];
1791
0
    assert(i16 != NULL);
1792
0
    if XNN_UNPREDICTABLE(i16 != zero) {
1793
0
      i16 = (const float*) ((uintptr_t) i16 + input_offset);
1794
0
    }
1795
0
    const float* i17 = input[17];
1796
0
    assert(i17 != NULL);
1797
0
    if XNN_UNPREDICTABLE(i17 != zero) {
1798
0
      i17 = (const float*) ((uintptr_t) i17 + input_offset);
1799
0
    }
1800
0
    const float* i18 = input[18];
1801
0
    assert(i18 != NULL);
1802
0
    if XNN_UNPREDICTABLE(i18 != zero) {
1803
0
      i18 = (const float*) ((uintptr_t) i18 + input_offset);
1804
0
    }
1805
0
    const float* i19 = input[19];
1806
0
    assert(i19 != NULL);
1807
0
    if XNN_UNPREDICTABLE(i19 != zero) {
1808
0
      i19 = (const float*) ((uintptr_t) i19 + input_offset);
1809
0
    }
1810
0
    const float* i20 = input[20];
1811
0
    assert(i20 != NULL);
1812
0
    if XNN_UNPREDICTABLE(i20 != zero) {
1813
0
      i20 = (const float*) ((uintptr_t) i20 + input_offset);
1814
0
    }
1815
0
    const float* i21 = input[21];
1816
0
    assert(i21 != NULL);
1817
0
    if XNN_UNPREDICTABLE(i21 != zero) {
1818
0
      i21 = (const float*) ((uintptr_t) i21 + input_offset);
1819
0
    }
1820
0
    const float* i22 = input[22];
1821
0
    assert(i22 != NULL);
1822
0
    if XNN_UNPREDICTABLE(i22 != zero) {
1823
0
      i22 = (const float*) ((uintptr_t) i22 + input_offset);
1824
0
    }
1825
0
    const float* i23 = input[23];
1826
0
    assert(i23 != NULL);
1827
0
    if XNN_UNPREDICTABLE(i23 != zero) {
1828
0
      i23 = (const float*) ((uintptr_t) i23 + input_offset);
1829
0
    }
1830
0
    const float* i24 = input[24];
1831
0
    assert(i24 != NULL);
1832
0
    if XNN_UNPREDICTABLE(i24 != zero) {
1833
0
      i24 = (const float*) ((uintptr_t) i24 + input_offset);
1834
0
    }
1835
0
    input = (const float**) ((uintptr_t) input + input_stride);
1836
1837
0
    size_t c = channels;
1838
0
    const float* w = weights;
1839
0
    do {
1840
0
      float vacc0p0 = w[0];
1841
1842
0
      const float vi0 = *i0++;
1843
0
      const float vk0 = w[1];
1844
0
      vacc0p0 = math_muladd_f32(vi0, vk0, vacc0p0);
1845
1846
0
      const float vi1 = *i1++;
1847
0
      const float vk1 = w[2];
1848
0
      float vacc0p1 = vi1 * vk1;
1849
1850
0
      const float vi2 = *i2++;
1851
0
      const float vk2 = w[3];
1852
0
      vacc0p0 = math_muladd_f32(vi2, vk2, vacc0p0);
1853
1854
0
      const float vi3 = *i3++;
1855
0
      const float vk3 = w[4];
1856
0
      vacc0p1 = math_muladd_f32(vi3, vk3, vacc0p1);
1857
1858
0
      const float vi4 = *i4++;
1859
0
      const float vk4 = w[5];
1860
0
      vacc0p0 = math_muladd_f32(vi4, vk4, vacc0p0);
1861
1862
0
      const float vi5 = *i5++;
1863
0
      const float vk5 = w[6];
1864
0
      vacc0p1 = math_muladd_f32(vi5, vk5, vacc0p1);
1865
1866
0
      const float vi6 = *i6++;
1867
0
      const float vk6 = w[7];
1868
0
      vacc0p0 = math_muladd_f32(vi6, vk6, vacc0p0);
1869
1870
0
      const float vi7 = *i7++;
1871
0
      const float vk7 = w[8];
1872
0
      vacc0p1 = math_muladd_f32(vi7, vk7, vacc0p1);
1873
1874
0
      const float vi8 = *i8++;
1875
0
      const float vk8 = w[9];
1876
0
      vacc0p0 = math_muladd_f32(vi8, vk8, vacc0p0);
1877
1878
0
      const float vi9 = *i9++;
1879
0
      const float vk9 = w[10];
1880
0
      vacc0p1 = math_muladd_f32(vi9, vk9, vacc0p1);
1881
1882
0
      const float vi10 = *i10++;
1883
0
      const float vk10 = w[11];
1884
0
      vacc0p0 = math_muladd_f32(vi10, vk10, vacc0p0);
1885
1886
0
      const float vi11 = *i11++;
1887
0
      const float vk11 = w[12];
1888
0
      vacc0p1 = math_muladd_f32(vi11, vk11, vacc0p1);
1889
1890
0
      const float vi12 = *i12++;
1891
0
      const float vk12 = w[13];
1892
0
      vacc0p0 = math_muladd_f32(vi12, vk12, vacc0p0);
1893
1894
0
      const float vi13 = *i13++;
1895
0
      const float vk13 = w[14];
1896
0
      vacc0p1 = math_muladd_f32(vi13, vk13, vacc0p1);
1897
1898
0
      const float vi14 = *i14++;
1899
0
      const float vk14 = w[15];
1900
0
      vacc0p0 = math_muladd_f32(vi14, vk14, vacc0p0);
1901
1902
0
      const float vi15 = *i15++;
1903
0
      const float vk15 = w[16];
1904
0
      vacc0p1 = math_muladd_f32(vi15, vk15, vacc0p1);
1905
1906
0
      const float vi16 = *i16++;
1907
0
      const float vk16 = w[17];
1908
0
      vacc0p0 = math_muladd_f32(vi16, vk16, vacc0p0);
1909
1910
0
      const float vi17 = *i17++;
1911
0
      const float vk17 = w[18];
1912
0
      vacc0p1 = math_muladd_f32(vi17, vk17, vacc0p1);
1913
1914
0
      const float vi18 = *i18++;
1915
0
      const float vk18 = w[19];
1916
0
      vacc0p0 = math_muladd_f32(vi18, vk18, vacc0p0);
1917
1918
0
      const float vi19 = *i19++;
1919
0
      const float vk19 = w[20];
1920
0
      vacc0p1 = math_muladd_f32(vi19, vk19, vacc0p1);
1921
1922
0
      const float vi20 = *i20++;
1923
0
      const float vk20 = w[21];
1924
0
      vacc0p0 = math_muladd_f32(vi20, vk20, vacc0p0);
1925
1926
0
      const float vi21 = *i21++;
1927
0
      const float vk21 = w[22];
1928
0
      vacc0p1 = math_muladd_f32(vi21, vk21, vacc0p1);
1929
1930
0
      const float vi22 = *i22++;
1931
0
      const float vk22 = w[23];
1932
0
      vacc0p0 = math_muladd_f32(vi22, vk22, vacc0p0);
1933
1934
0
      const float vi23 = *i23++;
1935
0
      const float vk23 = w[24];
1936
0
      vacc0p1 = math_muladd_f32(vi23, vk23, vacc0p1);
1937
1938
0
      const float vi24 = *i24++;
1939
0
      const float vk24 = w[25];
1940
0
      vacc0p0 = math_muladd_f32(vi24, vk24, vacc0p0);
1941
1942
0
      w += 26;
1943
1944
0
      vacc0p0 += vacc0p1;
1945
1946
0
      float vacc0 = math_max_f32(vacc0p0, vmin);
1947
0
      vacc0 = math_min_f32(vacc0, vmax);
1948
0
      *output++ = vacc0;
1949
0
    } while (--c != 0);
1950
1951
0
    output = (float*) ((uintptr_t) output + output_increment);
1952
0
  } while (--output_width != 0);
1953
0
}
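// Per output pixel, the 25p1c micro-kernel above computes, for every channel,
// a bias plus 25 taps split across two accumulators (the "acc2" suffix)
// before a single add and a min/max clamp. The standalone reference below
// mirrors that arithmetic; the inputs[k][c] indexing and the function name
// are illustrative assumptions, while the per-channel weight layout
// [bias, k0..k24] follows the `w += 26` stride used above.
#include <stddef.h>

// inputs[k] points at `channels` floats holding tap k (k = 0..24).
// weights holds 26 floats per channel: bias followed by the 25 tap weights.
void dwconv_25tap_acc2_reference(
    size_t channels, const float* const inputs[25],
    const float* weights, float* output, float vmin, float vmax) {
  const float* w = weights;
  for (size_t c = 0; c < channels; c++) {
    float acc0 = w[0];   // bias seeds the first accumulator
    float acc1 = 0.0f;   // second accumulator starts from tap 1
    for (size_t k = 0; k < 25; k++) {
      const float product = inputs[k][c] * w[1 + k];
      if (k % 2 == 0) {
        acc0 += product;   // even taps accumulate into acc0
      } else {
        acc1 += product;   // odd taps accumulate into acc1
      }
    }
    float acc = acc0 + acc1;
    acc = acc < vmin ? vmin : acc;   // like math_max_f32(acc, vmin)
    acc = acc > vmax ? vmax : acc;   // like math_min_f32(acc, vmax)
    *output++ = acc;
    w += 26;   // advance to the next channel's bias + taps
  }
}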
1954
1955
void xnn_f32_dwconv_ukernel_25p1c__scalar_acc2(
1956
    size_t channels,
1957
    size_t output_width,
1958
    const float** input,
1959
    const float* weights,
1960
    float* output,
1961
    intptr_t input_stride,
1962
    size_t output_increment,
1963
    size_t input_offset,
1964
    const float* zero,
1965
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
1966
0
{
1967
0
  assert(channels != 0);
1968
0
  assert(output_width != 0);
1969
1970
0
  do {
1971
0
    const float* i0 = input[0];
1972
0
    assert(i0 != NULL);
1973
0
    if XNN_UNPREDICTABLE(i0 != zero) {
1974
0
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
1975
0
    }
1976
0
    const float* i1 = input[1];
1977
0
    assert(i1 != NULL);
1978
0
    if XNN_UNPREDICTABLE(i1 != zero) {
1979
0
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
1980
0
    }
1981
0
    const float* i2 = input[2];
1982
0
    assert(i2 != NULL);
1983
0
    if XNN_UNPREDICTABLE(i2 != zero) {
1984
0
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
1985
0
    }
1986
0
    const float* i3 = input[3];
1987
0
    assert(i3 != NULL);
1988
0
    if XNN_UNPREDICTABLE(i3 != zero) {
1989
0
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
1990
0
    }
1991
0
    const float* i4 = input[4];
1992
0
    assert(i4 != NULL);
1993
0
    if XNN_UNPREDICTABLE(i4 != zero) {
1994
0
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
1995
0
    }
1996
0
    const float* i5 = input[5];
1997
0
    assert(i5 != NULL);
1998
0
    if XNN_UNPREDICTABLE(i5 != zero) {
1999
0
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
2000
0
    }
2001
0
    const float* i6 = input[6];
2002
0
    assert(i6 != NULL);
2003
0
    if XNN_UNPREDICTABLE(i6 != zero) {
2004
0
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
2005
0
    }
2006
0
    const float* i7 = input[7];
2007
0
    assert(i7 != NULL);
2008
0
    if XNN_UNPREDICTABLE(i7 != zero) {
2009
0
      i7 = (const float*) ((uintptr_t) i7 + input_offset);
2010
0
    }
2011
0
    const float* i8 = input[8];
2012
0
    assert(i8 != NULL);
2013
0
    if XNN_UNPREDICTABLE(i8 != zero) {
2014
0
      i8 = (const float*) ((uintptr_t) i8 + input_offset);
2015
0
    }
2016
0
    const float* i9 = input[9];
2017
0
    assert(i9 != NULL);
2018
0
    if XNN_UNPREDICTABLE(i9 != zero) {
2019
0
      i9 = (const float*) ((uintptr_t) i9 + input_offset);
2020
0
    }
2021
0
    const float* i10 = input[10];
2022
0
    assert(i10 != NULL);
2023
0
    if XNN_UNPREDICTABLE(i10 != zero) {
2024
0
      i10 = (const float*) ((uintptr_t) i10 + input_offset);
2025
0
    }
2026
0
    const float* i11 = input[11];
2027
0
    assert(i11 != NULL);
2028
0
    if XNN_UNPREDICTABLE(i11 != zero) {
2029
0
      i11 = (const float*) ((uintptr_t) i11 + input_offset);
2030
0
    }
2031
0
    const float* i12 = input[12];
2032
0
    assert(i12 != NULL);
2033
0
    if XNN_UNPREDICTABLE(i12 != zero) {
2034
0
      i12 = (const float*) ((uintptr_t) i12 + input_offset);
2035
0
    }
2036
0
    const float* i13 = input[13];
2037
0
    assert(i13 != NULL);
2038
0
    if XNN_UNPREDICTABLE(i13 != zero) {
2039
0
      i13 = (const float*) ((uintptr_t) i13 + input_offset);
2040
0
    }
2041
0
    const float* i14 = input[14];
2042
0
    assert(i14 != NULL);
2043
0
    if XNN_UNPREDICTABLE(i14 != zero) {
2044
0
      i14 = (const float*) ((uintptr_t) i14 + input_offset);
2045
0
    }
2046
0
    const float* i15 = input[15];
2047
0
    assert(i15 != NULL);
2048
0
    if XNN_UNPREDICTABLE(i15 != zero) {
2049
0
      i15 = (const float*) ((uintptr_t) i15 + input_offset);
2050
0
    }
2051
0
    const float* i16 = input[16];
2052
0
    assert(i16 != NULL);
2053
0
    if XNN_UNPREDICTABLE(i16 != zero) {
2054
0
      i16 = (const float*) ((uintptr_t) i16 + input_offset);
2055
0
    }
2056
0
    const float* i17 = input[17];
2057
0
    assert(i17 != NULL);
2058
0
    if XNN_UNPREDICTABLE(i17 != zero) {
2059
0
      i17 = (const float*) ((uintptr_t) i17 + input_offset);
2060
0
    }
2061
0
    const float* i18 = input[18];
2062
0
    assert(i18 != NULL);
2063
0
    if XNN_UNPREDICTABLE(i18 != zero) {
2064
0
      i18 = (const float*) ((uintptr_t) i18 + input_offset);
2065
0
    }
2066
0
    const float* i19 = input[19];
2067
0
    assert(i19 != NULL);
2068
0
    if XNN_UNPREDICTABLE(i19 != zero) {
2069
0
      i19 = (const float*) ((uintptr_t) i19 + input_offset);
2070
0
    }
2071
0
    const float* i20 = input[20];
2072
0
    assert(i20 != NULL);
2073
0
    if XNN_UNPREDICTABLE(i20 != zero) {
2074
0
      i20 = (const float*) ((uintptr_t) i20 + input_offset);
2075
0
    }
2076
0
    const float* i21 = input[21];
2077
0
    assert(i21 != NULL);
2078
0
    if XNN_UNPREDICTABLE(i21 != zero) {
2079
0
      i21 = (const float*) ((uintptr_t) i21 + input_offset);
2080
0
    }
2081
0
    const float* i22 = input[22];
2082
0
    assert(i22 != NULL);
2083
0
    if XNN_UNPREDICTABLE(i22 != zero) {
2084
0
      i22 = (const float*) ((uintptr_t) i22 + input_offset);
2085
0
    }
2086
0
    const float* i23 = input[23];
2087
0
    assert(i23 != NULL);
2088
0
    if XNN_UNPREDICTABLE(i23 != zero) {
2089
0
      i23 = (const float*) ((uintptr_t) i23 + input_offset);
2090
0
    }
2091
0
    const float* i24 = input[24];
2092
0
    assert(i24 != NULL);
2093
0
    if XNN_UNPREDICTABLE(i24 != zero) {
2094
0
      i24 = (const float*) ((uintptr_t) i24 + input_offset);
2095
0
    }
2096
0
    input = (const float**) ((uintptr_t) input + input_stride);
2097
2098
0
    size_t c = channels;
2099
0
    const float* w = weights;
2100
0
    do {
2101
0
      float vacc0p0 = w[0];
2102
2103
0
      const float vi0 = *i0++;
2104
0
      const float vk0 = w[1];
2105
0
      vacc0p0 = math_muladd_f32(vi0, vk0, vacc0p0);
2106
2107
0
      const float vi1 = *i1++;
2108
0
      const float vk1 = w[2];
2109
0
      float vacc0p1 = vi1 * vk1;
2110
2111
0
      const float vi2 = *i2++;
2112
0
      const float vk2 = w[3];
2113
0
      vacc0p0 = math_muladd_f32(vi2, vk2, vacc0p0);
2114
2115
0
      const float vi3 = *i3++;
2116
0
      const float vk3 = w[4];
2117
0
      vacc0p1 = math_muladd_f32(vi3, vk3, vacc0p1);
2118
2119
0
      const float vi4 = *i4++;
2120
0
      const float vk4 = w[5];
2121
0
      vacc0p0 = math_muladd_f32(vi4, vk4, vacc0p0);
2122
2123
0
      const float vi5 = *i5++;
2124
0
      const float vk5 = w[6];
2125
0
      vacc0p1 = math_muladd_f32(vi5, vk5, vacc0p1);
2126
2127
0
      const float vi6 = *i6++;
2128
0
      const float vk6 = w[7];
2129
0
      vacc0p0 = math_muladd_f32(vi6, vk6, vacc0p0);
2130
2131
0
      const float vi7 = *i7++;
2132
0
      const float vk7 = w[8];
2133
0
      vacc0p1 = math_muladd_f32(vi7, vk7, vacc0p1);
2134
2135
0
      const float vi8 = *i8++;
2136
0
      const float vk8 = w[9];
2137
0
      vacc0p0 = math_muladd_f32(vi8, vk8, vacc0p0);
2138
2139
0
      const float vi9 = *i9++;
2140
0
      const float vk9 = w[10];
2141
0
      vacc0p1 = math_muladd_f32(vi9, vk9, vacc0p1);
2142
2143
0
      const float vi10 = *i10++;
2144
0
      const float vk10 = w[11];
2145
0
      vacc0p0 = math_muladd_f32(vi10, vk10, vacc0p0);
2146
2147
0
      const float vi11 = *i11++;
2148
0
      const float vk11 = w[12];
2149
0
      vacc0p1 = math_muladd_f32(vi11, vk11, vacc0p1);
2150
2151
0
      const float vi12 = *i12++;
2152
0
      const float vk12 = w[13];
2153
0
      vacc0p0 = math_muladd_f32(vi12, vk12, vacc0p0);
2154
2155
0
      const float vi13 = *i13++;
2156
0
      const float vk13 = w[14];
2157
0
      vacc0p1 = math_muladd_f32(vi13, vk13, vacc0p1);
2158
2159
0
      const float vi14 = *i14++;
2160
0
      const float vk14 = w[15];
2161
0
      vacc0p0 = math_muladd_f32(vi14, vk14, vacc0p0);
2162
2163
0
      const float vi15 = *i15++;
2164
0
      const float vk15 = w[16];
2165
0
      vacc0p1 = math_muladd_f32(vi15, vk15, vacc0p1);
2166
2167
0
      const float vi16 = *i16++;
2168
0
      const float vk16 = w[17];
2169
0
      vacc0p0 = math_muladd_f32(vi16, vk16, vacc0p0);
2170
2171
0
      const float vi17 = *i17++;
2172
0
      const float vk17 = w[18];
2173
0
      vacc0p1 = math_muladd_f32(vi17, vk17, vacc0p1);
2174
2175
0
      const float vi18 = *i18++;
2176
0
      const float vk18 = w[19];
2177
0
      vacc0p0 = math_muladd_f32(vi18, vk18, vacc0p0);
2178
2179
0
      const float vi19 = *i19++;
2180
0
      const float vk19 = w[20];
2181
0
      vacc0p1 = math_muladd_f32(vi19, vk19, vacc0p1);
2182
2183
0
      const float vi20 = *i20++;
2184
0
      const float vk20 = w[21];
2185
0
      vacc0p0 = math_muladd_f32(vi20, vk20, vacc0p0);
2186
2187
0
      const float vi21 = *i21++;
2188
0
      const float vk21 = w[22];
2189
0
      vacc0p1 = math_muladd_f32(vi21, vk21, vacc0p1);
2190
2191
0
      const float vi22 = *i22++;
2192
0
      const float vk22 = w[23];
2193
0
      vacc0p0 = math_muladd_f32(vi22, vk22, vacc0p0);
2194
2195
0
      const float vi23 = *i23++;
2196
0
      const float vk23 = w[24];
2197
0
      vacc0p1 = math_muladd_f32(vi23, vk23, vacc0p1);
2198
2199
0
      const float vi24 = *i24++;
2200
0
      const float vk24 = w[25];
2201
0
      vacc0p0 = math_muladd_f32(vi24, vk24, vacc0p0);
2202
2203
0
      w += 26;
2204
2205
0
      vacc0p0 += vacc0p1;
2206
2207
0
      *output++ = vacc0p0;
2208
0
    } while (--c != 0);
2209
2210
0
    output = (float*) ((uintptr_t) output + output_increment);
2211
0
  } while (--output_width != 0);
2212
0
}
2213
2214
void xnn_f32_dwconv_minmax_ukernel_2f2m2l4c1s1r__scalar_acc2(
2215
    size_t channels,
2216
    size_t output_width,
2217
    const float** input,
2218
    const float* weights,
2219
    float* output,
2220
    intptr_t input_stride,
2221
    size_t output_increment,
2222
    size_t input_offset,
2223
    const float* zero,
2224
    size_t kernel_size,
2225
    float* buffer,
2226
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
2227
0
{
2228
0
  assert(channels != 0);
2229
0
  assert(output_width != 0);
2230
0
  assert(kernel_size > 2);
2231
2232
0
  const float vmin = params->scalar.min;
2233
0
  const float vmax = params->scalar.max;
2234
0
  do {
2235
0
    const float* w = weights;
2236
2237
    // First pass to process 2 inputs.
2238
0
    {
2239
0
      float* b = buffer;
2240
0
      const float* i0 = input[0];
2241
0
      assert(i0 != NULL);
2242
0
      if XNN_UNPREDICTABLE(i0 != zero) {
2243
0
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
2244
0
      }
2245
0
      const float* i1 = input[1];
2246
0
      assert(i1 != NULL);
2247
0
      if XNN_UNPREDICTABLE(i1 != zero) {
2248
0
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
2249
0
      }
2250
0
      input += 2;
2251
2252
      // Process c channels and write to buffer.
2253
0
      size_t c = round_up_po2(channels, 1);
2254
0
      for (; c >= 4; c -= 4) {
2255
0
        float vacc0p0 = w[0];
2256
0
        float vacc1p0 = w[1];
2257
0
        float vacc2p0 = w[2];
2258
0
        float vacc3p0 = w[3];
2259
2260
2261
0
        const float vi0x0 = i0[0];
2262
0
        const float vi0x1 = i0[1];
2263
0
        const float vi0x2 = i0[2];
2264
0
        const float vi0x3 = i0[3];
2265
0
        i0 += 4;
2266
2267
0
        const float vk0x0 = w[4];
2268
0
        const float vk0x1 = w[5];
2269
0
        const float vk0x2 = w[6];
2270
0
        const float vk0x3 = w[7];
2271
0
        vacc0p0 = math_muladd_f32(vi0x0, vk0x0, vacc0p0);
2272
0
        vacc1p0 = math_muladd_f32(vi0x1, vk0x1, vacc1p0);
2273
0
        vacc2p0 = math_muladd_f32(vi0x2, vk0x2, vacc2p0);
2274
0
        vacc3p0 = math_muladd_f32(vi0x3, vk0x3, vacc3p0);
2275
2276
0
        const float vi1x0 = i1[0];
2277
0
        const float vi1x1 = i1[1];
2278
0
        const float vi1x2 = i1[2];
2279
0
        const float vi1x3 = i1[3];
2280
0
        i1 += 4;
2281
2282
0
        const float vk1x0 = w[8];
2283
0
        const float vk1x1 = w[9];
2284
0
        const float vk1x2 = w[10];
2285
0
        const float vk1x3 = w[11];
2286
0
        float vacc0p1 = vi1x0 * vk1x0;
2287
0
        float vacc1p1 = vi1x1 * vk1x1;
2288
0
        float vacc2p1 = vi1x2 * vk1x2;
2289
0
        float vacc3p1 = vi1x3 * vk1x3;
2290
2291
0
        w += 12;
2292
2293
        // Add up all accumulators to vacc0p0..vacc3p0
2294
0
        vacc0p0 = vacc0p0 + vacc0p1;
2295
0
        vacc1p0 = vacc1p0 + vacc1p1;
2296
0
        vacc2p0 = vacc2p0 + vacc2p1;
2297
0
        vacc3p0 = vacc3p0 + vacc3p1;
2298
2299
0
        b[0] = vacc0p0;
2300
0
        b[1] = vacc1p0;
2301
0
        b[2] = vacc2p0;
2302
0
        b[3] = vacc3p0;
2303
0
        b += 4;
2304
0
      }
2305
2306
2307
0
      for (; c != 0; c --) {
2308
0
        float vacc0p0 = w[0];
2309
2310
0
        const float vi0x0 = i0[0];
2311
0
        i0 += 1;
2312
2313
0
        const float vk0x0 = w[1];
2314
0
        vacc0p0 = math_muladd_f32(vi0x0, vk0x0, vacc0p0);
2315
2316
0
        const float vi1x0 = i1[0];
2317
0
        i1 += 1;
2318
2319
0
        const float vk1x0 = w[2];
2320
0
        float vacc0p1 = vi1x0 * vk1x0;
2321
2322
0
        w += 3;
2323
2324
        // Add up all accumulators to vacc0p0
2325
0
        vacc0p0 = vacc0p0 + vacc0p1;
2326
2327
0
        b[0] = vacc0p0;
2328
0
        b += 1;
2329
0
      }
2330
0
    }
2331
2332
    // Middle pass to process 2 inputs in each iteration.
2333
0
    for (size_t ks = kernel_size - 2; ks > 2; ks -= 2) {
2334
0
      float* b = buffer;
2335
0
      const float* i0 = input[0];
2336
0
      assert(i0 != NULL);
2337
0
      if XNN_UNPREDICTABLE(i0 != zero) {
2338
0
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
2339
0
      }
2340
0
      const float* i1 = input[1];
2341
0
      assert(i1 != NULL);
2342
0
      if XNN_UNPREDICTABLE(i1 != zero) {
2343
0
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
2344
0
      }
2345
0
      input += 2;
2346
2347
0
      size_t c = round_up_po2(channels, 1);
2348
0
      for (; c >= 4; c -= 4) {
2349
0
        float vacc0p0 = b[0];
2350
0
        float vacc1p0 = b[1];
2351
0
        float vacc2p0 = b[2];
2352
0
        float vacc3p0 = b[3];
2353
2354
2355
0
        const float vi0x0 = i0[0];
2356
0
        const float vi0x1 = i0[1];
2357
0
        const float vi0x2 = i0[2];
2358
0
        const float vi0x3 = i0[3];
2359
0
        i0 += 4;
2360
2361
0
        const float vk0x0 = w[0];
2362
0
        const float vk0x1 = w[1];
2363
0
        const float vk0x2 = w[2];
2364
0
        const float vk0x3 = w[3];
2365
0
        vacc0p0 = math_muladd_f32(vi0x0, vk0x0, vacc0p0);
2366
0
        vacc1p0 = math_muladd_f32(vi0x1, vk0x1, vacc1p0);
2367
0
        vacc2p0 = math_muladd_f32(vi0x2, vk0x2, vacc2p0);
2368
0
        vacc3p0 = math_muladd_f32(vi0x3, vk0x3, vacc3p0);
2369
2370
0
        const float vi1x0 = i1[0];
2371
0
        const float vi1x1 = i1[1];
2372
0
        const float vi1x2 = i1[2];
2373
0
        const float vi1x3 = i1[3];
2374
0
        i1 += 4;
2375
2376
0
        const float vk1x0 = w[4];
2377
0
        const float vk1x1 = w[5];
2378
0
        const float vk1x2 = w[6];
2379
0
        const float vk1x3 = w[7];
2380
0
        float vacc0p1 = vi1x0 * vk1x0;
2381
0
        float vacc1p1 = vi1x1 * vk1x1;
2382
0
        float vacc2p1 = vi1x2 * vk1x2;
2383
0
        float vacc3p1 = vi1x3 * vk1x3;
2384
2385
0
        w += 8;
2386
2387
        // Add up all accumulators to vacc0p0..vacc3p0
2388
0
        vacc0p0 = vacc0p0 + vacc0p1;
2389
0
        vacc1p0 = vacc1p0 + vacc1p1;
2390
0
        vacc2p0 = vacc2p0 + vacc2p1;
2391
0
        vacc3p0 = vacc3p0 + vacc3p1;
2392
2393
0
        b[0] = vacc0p0;
2394
0
        b[1] = vacc1p0;
2395
0
        b[2] = vacc2p0;
2396
0
        b[3] = vacc3p0;
2397
0
        b += 4;
2398
0
      }
2399
2400
0
      for (; c != 0; c --) {
2401
0
        float vacc0p0 = b[0];
2402
2403
2404
0
        const float vi0x0 = i0[0];
2405
0
        i0 += 1;
2406
2407
0
        const float vk0x0 = w[0];
2408
0
        vacc0p0 = math_muladd_f32(vi0x0, vk0x0, vacc0p0);
2409
2410
0
        const float vi1x0 = i1[0];
2411
0
        i1 += 1;
2412
2413
0
        const float vk1x0 = w[1];
2414
0
        float vacc0p1 = vi1x0 * vk1x0;
2415
2416
0
        w += 2;
2417
2418
        // Add up all accumulators to vacc0p0
2419
0
        vacc0p0 = vacc0p0 + vacc0p1;
2420
2421
0
        b[0] = vacc0p0;
2422
0
        b += 1;
2423
0
      }
2424
0
    }
2425
2426
    // Last pass to process up to 2 inputs.
2427
0
    {
2428
0
      float* b = buffer;
2429
0
      const float* i0 = input[0];
2430
0
      assert(i0 != NULL);
2431
0
      if XNN_UNPREDICTABLE(i0 != zero) {
2432
0
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
2433
0
      }
2434
0
      const float* i1 = input[1];
2435
0
      assert(i1 != NULL);
2436
0
      if XNN_UNPREDICTABLE(i1 != zero) {
2437
0
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
2438
0
      }
2439
2440
0
      size_t c = channels;
2441
0
      for (; c >= 4; c -= 4) {
2442
0
        float vacc0p0 = b[0];
2443
0
        float vacc1p0 = b[1];
2444
0
        float vacc2p0 = b[2];
2445
0
        float vacc3p0 = b[3];
2446
0
        b += 4;
2447
2448
2449
0
        const float vi0x0 = i0[0];
2450
0
        const float vi0x1 = i0[1];
2451
0
        const float vi0x2 = i0[2];
2452
0
        const float vi0x3 = i0[3];
2453
0
        i0 += 4;
2454
2455
0
        const float vk0x0 = w[0];
2456
0
        const float vk0x1 = w[1];
2457
0
        const float vk0x2 = w[2];
2458
0
        const float vk0x3 = w[3];
2459
0
        vacc0p0 = math_muladd_f32(vi0x0, vk0x0, vacc0p0);
2460
0
        vacc1p0 = math_muladd_f32(vi0x1, vk0x1, vacc1p0);
2461
0
        vacc2p0 = math_muladd_f32(vi0x2, vk0x2, vacc2p0);
2462
0
        vacc3p0 = math_muladd_f32(vi0x3, vk0x3, vacc3p0);
2463
2464
0
        const float vi1x0 = i1[0];
2465
0
        const float vi1x1 = i1[1];
2466
0
        const float vi1x2 = i1[2];
2467
0
        const float vi1x3 = i1[3];
2468
0
        i1 += 4;
2469
2470
0
        const float vk1x0 = w[4];
2471
0
        const float vk1x1 = w[5];
2472
0
        const float vk1x2 = w[6];
2473
0
        const float vk1x3 = w[7];
2474
0
        float vacc0p1 = vi1x0 * vk1x0;
2475
0
        float vacc1p1 = vi1x1 * vk1x1;
2476
0
        float vacc2p1 = vi1x2 * vk1x2;
2477
0
        float vacc3p1 = vi1x3 * vk1x3;
2478
2479
0
        w += 8;
2480
2481
        // Add up all accumulators to vacc0p0..vacc3p0
2482
0
        vacc0p0 = vacc0p0 + vacc0p1;
2483
0
        vacc1p0 = vacc1p0 + vacc1p1;
2484
0
        vacc2p0 = vacc2p0 + vacc2p1;
2485
0
        vacc3p0 = vacc3p0 + vacc3p1;
2486
2487
0
        float vacc0 = math_max_f32(vacc0p0, vmin);
2488
0
        float vacc1 = math_max_f32(vacc1p0, vmin);
2489
0
        float vacc2 = math_max_f32(vacc2p0, vmin);
2490
0
        float vacc3 = math_max_f32(vacc3p0, vmin);
2491
2492
0
        vacc0 = math_min_f32(vacc0, vmax);
2493
0
        vacc1 = math_min_f32(vacc1, vmax);
2494
0
        vacc2 = math_min_f32(vacc2, vmax);
2495
0
        vacc3 = math_min_f32(vacc3, vmax);
2496
2497
0
        output[0] = vacc0;
2498
0
        output[1] = vacc1;
2499
0
        output[2] = vacc2;
2500
0
        output[3] = vacc3;
2501
0
        output += 4;
2502
0
      }
2503
0
      for (; c != 0; c --) {
2504
0
        float vacc0p0 = *b++;
2505
2506
0
        const float vi0 = *i0++;
2507
0
        const float vk0 = w[0];
2508
0
        vacc0p0 = math_muladd_f32(vi0, vk0, vacc0p0);
2509
0
        const float vi1 = *i1++;
2510
0
        const float vk1 = w[1];
2511
0
        float vacc0p1 = vi1 * vk1;
2512
0
        w += 2;
2513
2514
        // Add up all accumulators to vacc0p0
2515
0
        vacc0p0 = vacc0p0 + vacc0p1;
2516
2517
0
        float vacc0 = math_max_f32(vacc0p0, vmin);
2518
0
        vacc0 = math_min_f32(vacc0, vmax);
2519
0
        *output++ = vacc0;
2520
0
      }
2521
2522
0
    }
2523
0
    input = (const float**) ((uintptr_t) input + input_stride);
2524
0
    output = (float*) ((uintptr_t) output + output_increment);
2525
0
  } while (--output_width != 0);
2526
0
}
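// The 2f2m2l kernel above handles kernel_size > 2 by staging per-channel
// partial sums in `buffer`: the first pass seeds it with the bias plus two
// taps, each middle pass folds in two more taps, and the last pass adds the
// final taps, clamps, and writes the output row. The sketch below is a
// simplified reference for that multi-pass idea; the flat weight layout
// (all biases first, then one weight block per tap) and the function name are
// illustrative assumptions, not the packed layout consumed above.
#include <stddef.h>

// inputs[k] points at `channels` floats holding tap k (k = 0..kernel_size-1).
// buffer must hold at least `channels` floats of scratch space.
void dwconv_multipass_reference(
    size_t channels, size_t kernel_size,
    const float* const* inputs, const float* weights,
    float* buffer, float* output, float vmin, float vmax) {
  // First pass: seed the scratch buffer with the per-channel bias.
  for (size_t c = 0; c < channels; c++) {
    buffer[c] = weights[c];
  }
  // Middle passes: fold one tap at a time into the buffered partial sums.
  const float* w = weights + channels;
  for (size_t k = 0; k < kernel_size; k++) {
    for (size_t c = 0; c < channels; c++) {
      buffer[c] += inputs[k][c] * w[c];
    }
    w += channels;
  }
  // Last pass: clamp the accumulated sums and store them.
  for (size_t c = 0; c < channels; c++) {
    float acc = buffer[c];
    acc = acc < vmin ? vmin : acc;
    acc = acc > vmax ? vmax : acc;
    output[c] = acc;
  }
}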
2527
2528
void xnn_f32_dwconv_minmax_ukernel_3p1c__scalar_acc2(
2529
    size_t channels,
2530
    size_t output_width,
2531
    const float** input,
2532
    const float* weights,
2533
    float* output,
2534
    intptr_t input_stride,
2535
    size_t output_increment,
2536
    size_t input_offset,
2537
    const float* zero,
2538
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
2539
0
{
2540
0
  assert(channels != 0);
2541
0
  assert(output_width != 0);
2542
2543
0
  const float vmin = params->scalar.min;
2544
0
  const float vmax = params->scalar.max;
2545
0
  do {
2546
0
    const float* i0 = input[0];
2547
0
    assert(i0 != NULL);
2548
0
    if XNN_UNPREDICTABLE(i0 != zero) {
2549
0
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
2550
0
    }
2551
0
    const float* i1 = input[1];
2552
0
    assert(i1 != NULL);
2553
0
    if XNN_UNPREDICTABLE(i1 != zero) {
2554
0
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
2555
0
    }
2556
0
    const float* i2 = input[2];
2557
0
    assert(i2 != NULL);
2558
0
    if XNN_UNPREDICTABLE(i2 != zero) {
2559
0
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
2560
0
    }
2561
0
    input = (const float**) ((uintptr_t) input + input_stride);
2562
2563
0
    size_t c = channels;
2564
0
    const float* w = weights;
2565
0
    do {
2566
0
      float vacc0p0 = w[0];
2567
2568
0
      const float vi0 = *i0++;
2569
0
      const float vk0 = w[1];
2570
0
      vacc0p0 = math_muladd_f32(vi0, vk0, vacc0p0);
2571
2572
0
      const float vi1 = *i1++;
2573
0
      const float vk1 = w[2];
2574
0
      float vacc0p1 = vi1 * vk1;
2575
2576
0
      const float vi2 = *i2++;
2577
0
      const float vk2 = w[3];
2578
0
      vacc0p0 = math_muladd_f32(vi2, vk2, vacc0p0);
2579
2580
0
      w += 4;
2581
2582
0
      vacc0p0 += vacc0p1;
2583
2584
0
      float vacc0 = math_max_f32(vacc0p0, vmin);
2585
0
      vacc0 = math_min_f32(vacc0, vmax);
2586
0
      *output++ = vacc0;
2587
0
    } while (--c != 0);
2588
2589
0
    output = (float*) ((uintptr_t) output + output_increment);
2590
0
  } while (--output_width != 0);
2591
0
}
2592
2593
void xnn_f32_dwconv_ukernel_3p1c__scalar_acc2(
2594
    size_t channels,
2595
    size_t output_width,
2596
    const float** input,
2597
    const float* weights,
2598
    float* output,
2599
    intptr_t input_stride,
2600
    size_t output_increment,
2601
    size_t input_offset,
2602
    const float* zero,
2603
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
2604
0
{
2605
0
  assert(channels != 0);
2606
0
  assert(output_width != 0);
2607
2608
0
  do {
2609
0
    const float* i0 = input[0];
2610
0
    assert(i0 != NULL);
2611
0
    if XNN_UNPREDICTABLE(i0 != zero) {
2612
0
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
2613
0
    }
2614
0
    const float* i1 = input[1];
2615
0
    assert(i1 != NULL);
2616
0
    if XNN_UNPREDICTABLE(i1 != zero) {
2617
0
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
2618
0
    }
2619
0
    const float* i2 = input[2];
2620
0
    assert(i2 != NULL);
2621
0
    if XNN_UNPREDICTABLE(i2 != zero) {
2622
0
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
2623
0
    }
2624
0
    input = (const float**) ((uintptr_t) input + input_stride);
2625
2626
0
    size_t c = channels;
2627
0
    const float* w = weights;
2628
0
    do {
2629
0
      float vacc0p0 = w[0];
2630
2631
0
      const float vi0 = *i0++;
2632
0
      const float vk0 = w[1];
2633
0
      vacc0p0 = math_muladd_f32(vi0, vk0, vacc0p0);
2634
2635
0
      const float vi1 = *i1++;
2636
0
      const float vk1 = w[2];
2637
0
      float vacc0p1 = vi1 * vk1;
2638
2639
0
      const float vi2 = *i2++;
2640
0
      const float vk2 = w[3];
2641
0
      vacc0p0 = math_muladd_f32(vi2, vk2, vacc0p0);
2642
2643
0
      w += 4;
2644
2645
0
      vacc0p0 += vacc0p1;
2646
2647
0
      *output++ = vacc0p0;
2648
0
    } while (--c != 0);
2649
2650
0
    output = (float*) ((uintptr_t) output + output_increment);
2651
0
  } while (--output_width != 0);
2652
0
}
2653
2654
void xnn_f32_dwconv_minmax_ukernel_4p1c__scalar_acc2(
2655
    size_t channels,
2656
    size_t output_width,
2657
    const float** input,
2658
    const float* weights,
2659
    float* output,
2660
    intptr_t input_stride,
2661
    size_t output_increment,
2662
    size_t input_offset,
2663
    const float* zero,
2664
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
2665
0
{
2666
0
  assert(channels != 0);
2667
0
  assert(output_width != 0);
2668
2669
0
  const float vmin = params->scalar.min;
2670
0
  const float vmax = params->scalar.max;
2671
0
  do {
2672
0
    const float* i0 = input[0];
2673
0
    assert(i0 != NULL);
2674
0
    if XNN_UNPREDICTABLE(i0 != zero) {
2675
0
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
2676
0
    }
2677
0
    const float* i1 = input[1];
2678
0
    assert(i1 != NULL);
2679
0
    if XNN_UNPREDICTABLE(i1 != zero) {
2680
0
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
2681
0
    }
2682
0
    const float* i2 = input[2];
2683
0
    assert(i2 != NULL);
2684
0
    if XNN_UNPREDICTABLE(i2 != zero) {
2685
0
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
2686
0
    }
2687
0
    const float* i3 = input[3];
2688
0
    assert(i3 != NULL);
2689
0
    if XNN_UNPREDICTABLE(i3 != zero) {
2690
0
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
2691
0
    }
2692
0
    input = (const float**) ((uintptr_t) input + input_stride);
2693
2694
0
    size_t c = channels;
2695
0
    const float* w = weights;
2696
0
    do {
2697
0
      float vacc0p0 = w[0];
2698
2699
0
      const float vi0 = *i0++;
2700
0
      const float vk0 = w[1];
2701
0
      vacc0p0 = math_muladd_f32(vi0, vk0, vacc0p0);
2702
2703
0
      const float vi1 = *i1++;
2704
0
      const float vk1 = w[2];
2705
0
      float vacc0p1 = vi1 * vk1;
2706
2707
0
      const float vi2 = *i2++;
2708
0
      const float vk2 = w[3];
2709
0
      vacc0p0 = math_muladd_f32(vi2, vk2, vacc0p0);
2710
2711
0
      const float vi3 = *i3++;
2712
0
      const float vk3 = w[4];
2713
0
      vacc0p1 = math_muladd_f32(vi3, vk3, vacc0p1);
2714
2715
0
      w += 5;
2716
2717
0
      vacc0p0 += vacc0p1;
2718
2719
0
      float vacc0 = math_max_f32(vacc0p0, vmin);
2720
0
      vacc0 = math_min_f32(vacc0, vmax);
2721
0
      *output++ = vacc0;
2722
0
    } while (--c != 0);
2723
2724
0
    output = (float*) ((uintptr_t) output + output_increment);
2725
0
  } while (--output_width != 0);
2726
0
}
2727
2728
void xnn_f32_dwconv_ukernel_4p1c__scalar_acc2(
2729
    size_t channels,
2730
    size_t output_width,
2731
    const float** input,
2732
    const float* weights,
2733
    float* output,
2734
    intptr_t input_stride,
2735
    size_t output_increment,
2736
    size_t input_offset,
2737
    const float* zero,
2738
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
2739
0
{
2740
0
  assert(channels != 0);
2741
0
  assert(output_width != 0);
2742
2743
0
  do {
2744
0
    const float* i0 = input[0];
2745
0
    assert(i0 != NULL);
2746
0
    if XNN_UNPREDICTABLE(i0 != zero) {
2747
0
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
2748
0
    }
2749
0
    const float* i1 = input[1];
2750
0
    assert(i1 != NULL);
2751
0
    if XNN_UNPREDICTABLE(i1 != zero) {
2752
0
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
2753
0
    }
2754
0
    const float* i2 = input[2];
2755
0
    assert(i2 != NULL);
2756
0
    if XNN_UNPREDICTABLE(i2 != zero) {
2757
0
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
2758
0
    }
2759
0
    const float* i3 = input[3];
2760
0
    assert(i3 != NULL);
2761
0
    if XNN_UNPREDICTABLE(i3 != zero) {
2762
0
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
2763
0
    }
2764
0
    input = (const float**) ((uintptr_t) input + input_stride);
2765
2766
0
    size_t c = channels;
2767
0
    const float* w = weights;
2768
0
    do {
2769
0
      float vacc0p0 = w[0];
2770
2771
0
      const float vi0 = *i0++;
2772
0
      const float vk0 = w[1];
2773
0
      vacc0p0 = math_muladd_f32(vi0, vk0, vacc0p0);
2774
2775
0
      const float vi1 = *i1++;
2776
0
      const float vk1 = w[2];
2777
0
      float vacc0p1 = vi1 * vk1;
2778
2779
0
      const float vi2 = *i2++;
2780
0
      const float vk2 = w[3];
2781
0
      vacc0p0 = math_muladd_f32(vi2, vk2, vacc0p0);
2782
2783
0
      const float vi3 = *i3++;
2784
0
      const float vk3 = w[4];
2785
0
      vacc0p1 = math_muladd_f32(vi3, vk3, vacc0p1);
2786
2787
0
      w += 5;
2788
2789
0
      vacc0p0 += vacc0p1;
2790
2791
0
      *output++ = vacc0p0;
2792
0
    } while (--c != 0);
2793
2794
0
    output = (float*) ((uintptr_t) output + output_increment);
2795
0
  } while (--output_width != 0);
2796
0
}
2797
2798
void xnn_f32_dwconv_minmax_ukernel_9p1c__scalar_acc2(
2799
    size_t channels,
2800
    size_t output_width,
2801
    const float** input,
2802
    const float* weights,
2803
    float* output,
2804
    intptr_t input_stride,
2805
    size_t output_increment,
2806
    size_t input_offset,
2807
    const float* zero,
2808
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
2809
0
{
2810
0
  assert(channels != 0);
2811
0
  assert(output_width != 0);
2812
2813
0
  const float vmin = params->scalar.min;
2814
0
  const float vmax = params->scalar.max;
2815
0
  do {
2816
0
    const float* i0 = input[0];
2817
0
    assert(i0 != NULL);
2818
0
    if XNN_UNPREDICTABLE(i0 != zero) {
2819
0
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
2820
0
    }
2821
0
    const float* i1 = input[1];
2822
0
    assert(i1 != NULL);
2823
0
    if XNN_UNPREDICTABLE(i1 != zero) {
2824
0
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
2825
0
    }
2826
0
    const float* i2 = input[2];
2827
0
    assert(i2 != NULL);
2828
0
    if XNN_UNPREDICTABLE(i2 != zero) {
2829
0
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
2830
0
    }
2831
0
    const float* i3 = input[3];
2832
0
    assert(i3 != NULL);
2833
0
    if XNN_UNPREDICTABLE(i3 != zero) {
2834
0
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
2835
0
    }
2836
0
    const float* i4 = input[4];
2837
0
    assert(i4 != NULL);
2838
0
    if XNN_UNPREDICTABLE(i4 != zero) {
2839
0
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
2840
0
    }
2841
0
    const float* i5 = input[5];
2842
0
    assert(i5 != NULL);
2843
0
    if XNN_UNPREDICTABLE(i5 != zero) {
2844
0
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
2845
0
    }
2846
0
    const float* i6 = input[6];
2847
0
    assert(i6 != NULL);
2848
0
    if XNN_UNPREDICTABLE(i6 != zero) {
2849
0
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
2850
0
    }
2851
0
    const float* i7 = input[7];
2852
0
    assert(i7 != NULL);
2853
0
    if XNN_UNPREDICTABLE(i7 != zero) {
2854
0
      i7 = (const float*) ((uintptr_t) i7 + input_offset);
2855
0
    }
2856
0
    const float* i8 = input[8];
2857
0
    assert(i8 != NULL);
2858
0
    if XNN_UNPREDICTABLE(i8 != zero) {
2859
0
      i8 = (const float*) ((uintptr_t) i8 + input_offset);
2860
0
    }
2861
0
    input = (const float**) ((uintptr_t) input + input_stride);
2862
2863
0
    size_t c = channels;
2864
0
    const float* w = weights;
2865
0
    do {
2866
0
      float vacc0p0 = w[0];
2867
2868
0
      const float vi0 = *i0++;
2869
0
      const float vk0 = w[1];
2870
0
      vacc0p0 = math_muladd_f32(vi0, vk0, vacc0p0);
2871
2872
0
      const float vi1 = *i1++;
2873
0
      const float vk1 = w[2];
2874
0
      float vacc0p1 = vi1 * vk1;
2875
2876
0
      const float vi2 = *i2++;
2877
0
      const float vk2 = w[3];
2878
0
      vacc0p0 = math_muladd_f32(vi2, vk2, vacc0p0);
2879
2880
0
      const float vi3 = *i3++;
2881
0
      const float vk3 = w[4];
2882
0
      vacc0p1 = math_muladd_f32(vi3, vk3, vacc0p1);
2883
2884
0
      const float vi4 = *i4++;
2885
0
      const float vk4 = w[5];
2886
0
      vacc0p0 = math_muladd_f32(vi4, vk4, vacc0p0);
2887
2888
0
      const float vi5 = *i5++;
2889
0
      const float vk5 = w[6];
2890
0
      vacc0p1 = math_muladd_f32(vi5, vk5, vacc0p1);
2891
2892
0
      const float vi6 = *i6++;
2893
0
      const float vk6 = w[7];
2894
0
      vacc0p0 = math_muladd_f32(vi6, vk6, vacc0p0);
2895
2896
0
      const float vi7 = *i7++;
2897
0
      const float vk7 = w[8];
2898
0
      vacc0p1 = math_muladd_f32(vi7, vk7, vacc0p1);
2899
2900
0
      const float vi8 = *i8++;
2901
0
      const float vk8 = w[9];
2902
0
      vacc0p0 = math_muladd_f32(vi8, vk8, vacc0p0);
2903
2904
0
      w += 10;
2905
2906
0
      vacc0p0 += vacc0p1;
2907
2908
0
      float vacc0 = math_max_f32(vacc0p0, vmin);
2909
0
      vacc0 = math_min_f32(vacc0, vmax);
2910
0
      *output++ = vacc0;
2911
0
    } while (--c != 0);
2912
2913
0
    output = (float*) ((uintptr_t) output + output_increment);
2914
0
  } while (--output_width != 0);
2915
0
}
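// The minmax 9p1c kernel above reads its input through an indirection
// buffer: for every output pixel it expects 9 row pointers (one per 3x3 tap),
// adds input_offset to each pointer that is not the shared `zero` row, and
// then advances the pointer array by input_stride bytes for the next pixel.
// The helper below shows how such a 9-pointer set could be filled for one
// output pixel of a stride-1, pad-1 3x3 depthwise convolution; layout and
// names are illustrative assumptions rather than the setup code XNNPACK uses.
#include <stddef.h>

// input: H x W x C floats, row-major; zero: at least C zero-initialized floats.
// taps[0..8] receive the per-tap row pointers for output pixel (oy, ox).
void fill_3x3_indirection(
    const float* input, const float* zero,
    size_t H, size_t W, size_t C,
    size_t oy, size_t ox, const float* taps[9]) {
  size_t k = 0;
  for (size_t ky = 0; ky < 3; ky++) {
    for (size_t kx = 0; kx < 3; kx++) {
      // Input coordinate feeding this tap (pad-1 shifts both axes by -1).
      const ptrdiff_t iy = (ptrdiff_t) (oy + ky) - 1;
      const ptrdiff_t ix = (ptrdiff_t) (ox + kx) - 1;
      if (iy >= 0 && iy < (ptrdiff_t) H && ix >= 0 && ix < (ptrdiff_t) W) {
        taps[k] = input + ((size_t) iy * W + (size_t) ix) * C;
      } else {
        taps[k] = zero;  // padded tap: the kernel leaves `zero` untranslated
      }
      k++;
    }
  }
}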
2916
2917
void xnn_f32_dwconv_ukernel_9p1c__scalar_acc2(
2918
    size_t channels,
2919
    size_t output_width,
2920
    const float** input,
2921
    const float* weights,
2922
    float* output,
2923
    intptr_t input_stride,
2924
    size_t output_increment,
2925
    size_t input_offset,
2926
    const float* zero,
2927
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
2928
0
{
2929
0
  assert(channels != 0);
2930
0
  assert(output_width != 0);
2931
2932
0
  do {
2933
0
    const float* i0 = input[0];
2934
0
    assert(i0 != NULL);
2935
0
    if XNN_UNPREDICTABLE(i0 != zero) {
2936
0
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
2937
0
    }
2938
0
    const float* i1 = input[1];
2939
0
    assert(i1 != NULL);
2940
0
    if XNN_UNPREDICTABLE(i1 != zero) {
2941
0
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
2942
0
    }
2943
0
    const float* i2 = input[2];
2944
0
    assert(i2 != NULL);
2945
0
    if XNN_UNPREDICTABLE(i2 != zero) {
2946
0
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
2947
0
    }
2948
0
    const float* i3 = input[3];
2949
0
    assert(i3 != NULL);
2950
0
    if XNN_UNPREDICTABLE(i3 != zero) {
2951
0
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
2952
0
    }
2953
0
    const float* i4 = input[4];
2954
0
    assert(i4 != NULL);
2955
0
    if XNN_UNPREDICTABLE(i4 != zero) {
2956
0
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
2957
0
    }
2958
0
    const float* i5 = input[5];
2959
0
    assert(i5 != NULL);
2960
0
    if XNN_UNPREDICTABLE(i5 != zero) {
2961
0
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
2962
0
    }
2963
0
    const float* i6 = input[6];
2964
0
    assert(i6 != NULL);
2965
0
    if XNN_UNPREDICTABLE(i6 != zero) {
2966
0
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
2967
0
    }
2968
0
    const float* i7 = input[7];
2969
0
    assert(i7 != NULL);
2970
0
    if XNN_UNPREDICTABLE(i7 != zero) {
2971
0
      i7 = (const float*) ((uintptr_t) i7 + input_offset);
2972
0
    }
2973
0
    const float* i8 = input[8];
2974
0
    assert(i8 != NULL);
2975
0
    if XNN_UNPREDICTABLE(i8 != zero) {
2976
0
      i8 = (const float*) ((uintptr_t) i8 + input_offset);
2977
0
    }
2978
0
    input = (const float**) ((uintptr_t) input + input_stride);
2979
2980
0
    size_t c = channels;
2981
0
    const float* w = weights;
2982
0
    do {
2983
0
      float vacc0p0 = w[0];
2984
2985
0
      const float vi0 = *i0++;
2986
0
      const float vk0 = w[1];
2987
0
      vacc0p0 = math_muladd_f32(vi0, vk0, vacc0p0);
2988
2989
0
      const float vi1 = *i1++;
2990
0
      const float vk1 = w[2];
2991
0
      float vacc0p1 = vi1 * vk1;
2992
2993
0
      const float vi2 = *i2++;
2994
0
      const float vk2 = w[3];
2995
0
      vacc0p0 = math_muladd_f32(vi2, vk2, vacc0p0);
2996
2997
0
      const float vi3 = *i3++;
2998
0
      const float vk3 = w[4];
2999
0
      vacc0p1 = math_muladd_f32(vi3, vk3, vacc0p1);
3000
3001
0
      const float vi4 = *i4++;
3002
0
      const float vk4 = w[5];
3003
0
      vacc0p0 = math_muladd_f32(vi4, vk4, vacc0p0);
3004
3005
0
      const float vi5 = *i5++;
3006
0
      const float vk5 = w[6];
3007
0
      vacc0p1 = math_muladd_f32(vi5, vk5, vacc0p1);
3008
3009
0
      const float vi6 = *i6++;
3010
0
      const float vk6 = w[7];
3011
0
      vacc0p0 = math_muladd_f32(vi6, vk6, vacc0p0);
3012
3013
0
      const float vi7 = *i7++;
3014
0
      const float vk7 = w[8];
3015
0
      vacc0p1 = math_muladd_f32(vi7, vk7, vacc0p1);
3016
3017
0
      const float vi8 = *i8++;
3018
0
      const float vk8 = w[9];
3019
0
      vacc0p0 = math_muladd_f32(vi8, vk8, vacc0p0);
3020
3021
0
      w += 10;
3022
3023
0
      vacc0p0 += vacc0p1;
3024
3025
0
      *output++ = vacc0p0;
3026
0
    } while (--c != 0);
3027
3028
0
    output = (float*) ((uintptr_t) output + output_increment);
3029
0
  } while (--output_width != 0);
3030
0
}
3031
3032
void xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2(
3033
    size_t input_height,
3034
    size_t input_width,
3035
    const float* input,
3036
    const float* weights,
3037
    const float* zero,
3038
    float* output,
3039
    uint32_t padding_top,
3040
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
3041
0
{
3042
0
  assert(input_height != 0);
3043
0
  assert(input_width != 0);
3044
0
  assert(input_width % sizeof(float) == 0);
3045
0
  assert(padding_top == 1);
3046
3047
0
  const float vmin = params->scalar.min;
3048
0
  const float vmax = params->scalar.max;
3049
3050
0
  const float vbias = weights[0];
3051
0
  const float vk00 = weights[1];
3052
0
  const float vk01 = weights[2];
3053
0
  const float vk02 = weights[3];
3054
0
  const float vk10 = weights[4];
3055
0
  const float vk11 = weights[5];
3056
0
  const float vk12 = weights[6];
3057
0
  const float vk20 = weights[7];
3058
0
  const float vk21 = weights[8];
3059
0
  const float vk22 = weights[9];
3060
3061
0
  const float* i0 = zero;
3062
0
  const float* i1 = input;
3063
0
  const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
3064
0
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
3065
3066
0
  float* o0 = output;
3067
0
  float* o1 = (float*) ((uintptr_t) o0 + input_width);
3068
3069
0
  size_t output_height = input_height;
3070
0
  do {
3071
0
    if XNN_UNPREDICTABLE(output_height < 2) {
3072
0
      i2 = zero;
3073
0
      o1 = o0;
3074
0
    }
3075
0
    if XNN_UNPREDICTABLE(output_height < 3) {
3076
0
      i3 = zero;
3077
0
    }
3078
3079
0
    float vi0x0 = 0.0f;
3080
0
    float vi1x0 = 0.0f;
3081
0
    float vi2x0 = 0.0f;
3082
0
    float vi3x0 = 0.0f;
3083
3084
0
    float vi0x1 = *i0++;
3085
0
    float vi1x1 = *i1++;
3086
0
    float vi2x1 = *i2++;
3087
0
    float vi3x1 = *i3++;
3088
3089
0
    size_t w = input_width;
3090
0
    for (; w > 1 * sizeof(float); w -= 1 * sizeof(float)) {
3091
0
      const float vi0x2 = *i0++;
3092
0
      const float vi1x2 = *i1++;
3093
0
      const float vi2x2 = *i2++;
3094
0
      const float vi3x2 = *i3++;
3095
3096
0
      float vo0p0 = vbias + vi0x0 * vk00;
3097
0
      float vo1p0 = vbias + vi1x0 * vk00;
3098
0
      float vo0p1 = vi1x0 * vk10;
3099
0
      float vo1p1 = vi2x0 * vk10;
3100
0
      vo0p0 += vi2x0 * vk20;
3101
0
      vo1p0 += vi3x0 * vk20;
3102
3103
0
      vi0x0 = vi0x1;
3104
0
      vi1x0 = vi1x1;
3105
0
      vi2x0 = vi2x1;
3106
0
      vi3x0 = vi3x1;
3107
3108
0
      vo0p1 += vi0x1 * vk01;
3109
0
      vo1p1 += vi1x1 * vk01;
3110
0
      vo0p0 += vi1x1 * vk11;
3111
0
      vo1p0 += vi2x1 * vk11;
3112
0
      vo0p1 += vi2x1 * vk21;
3113
0
      vo1p1 += vi3x1 * vk21;
3114
3115
0
      vi0x1 = vi0x2;
3116
0
      vi1x1 = vi1x2;
3117
0
      vi2x1 = vi2x2;
3118
0
      vi3x1 = vi3x2;
3119
3120
0
      vo0p0 += vi0x2 * vk02;
3121
0
      vo1p0 += vi1x2 * vk02;
3122
0
      vo0p1 += vi1x2 * vk12;
3123
0
      vo1p1 += vi2x2 * vk12;
3124
0
      vo0p0 += vi2x2 * vk22;
3125
0
      vo1p0 += vi3x2 * vk22;
3126
3127
0
      vo0p0 += vo0p1;
3128
0
      vo1p0 += vo1p1;
3129
3130
0
      float vo0 = math_max_f32(vo0p0, vmin);
3131
0
      float vo1 = math_max_f32(vo1p0, vmin);
3132
3133
0
      vo0 = math_min_f32(vo0, vmax);
3134
0
      vo1 = math_min_f32(vo1, vmax);
3135
3136
0
      *o1++ = vo1;
3137
0
      *o0++ = vo0;
3138
0
    }
3139
    // Always process the last pixel separately to account for right edge.
3140
0
    assert(w == 1 * sizeof(float));
3141
0
    {
3142
0
      float vo0p0 = vbias + vi0x0 * vk00;
3143
0
      float vo1p0 = vbias + vi1x0 * vk00;
3144
0
      float vo0p1 = vi1x0 * vk10;
3145
0
      float vo1p1 = vi2x0 * vk10;
3146
0
      vo0p0 += vi2x0 * vk20;
3147
0
      vo1p0 += vi3x0 * vk20;
3148
3149
0
      vo0p1 += vi0x1 * vk01;
3150
0
      vo1p1 += vi1x1 * vk01;
3151
0
      vo0p0 += vi1x1 * vk11;
3152
0
      vo1p0 += vi2x1 * vk11;
3153
0
      vo0p1 += vi2x1 * vk21;
3154
0
      vo1p1 += vi3x1 * vk21;
3155
3156
0
      vo0p0 += vo0p1;
3157
0
      vo1p0 += vo1p1;
3158
3159
0
      float vo0 = math_max_f32(vo0p0, vmin);
3160
0
      float vo1 = math_max_f32(vo1p0, vmin);
3161
3162
0
      vo0 = math_min_f32(vo0, vmax);
3163
0
      vo1 = math_min_f32(vo1, vmax);
3164
3165
0
      *o1++ = vo1;
3166
0
      *o0++ = vo0;
3167
0
    }
3168
3169
0
    i0 = (const float*) ((uintptr_t) i2 - input_width);
3170
0
    i1 = (const float*) ((uintptr_t) i3 - input_width);
3171
0
    i2 = (const float*) ((uintptr_t) i1 + input_width);
3172
0
    i3 = (const float*) ((uintptr_t) i2 + input_width);
3173
3174
0
    o0 = o1;
3175
0
    o1 = (float*) ((uintptr_t) o0 + input_width);
3176
3177
0
    output_height = doz(output_height, 2);
3178
0
  } while (output_height != 0);
3179
0
}
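// The CHW kernels above keep a three-column window (x0, x1, x2) live per
// input row so every element is loaded exactly once: the initial 0.0f
// provides the left padding, and the separate trailing block, which drops the
// x2 term, provides the right padding. Below is a single-row sketch of that
// rotation; reducing the 3x3 kernel to one 3-tap row is a simplification of
// the 2- and 4-row variants above, and the names are illustrative.
#include <stddef.h>

// 1-D "same"-padded convolution of one row with taps (k0, k1, k2),
// written with the x0/x1/x2 rotation used by the kernels above.
void conv1d_3tap_same_reference(
    const float* row, size_t width, float bias,
    float k0, float k1, float k2, float* out) {
  float x0 = 0.0f;            // left padding column
  float x1 = row[0];          // first real column (width must be >= 1)
  for (size_t i = 1; i < width; i++) {
    const float x2 = row[i];  // load the next column once
    out[i - 1] = bias + x0 * k0 + x1 * k1 + x2 * k2;
    x0 = x1;                  // rotate the window left by one column
    x1 = x2;
  }
  // Right edge: the x2 column falls into the padding, so its term is dropped.
  out[width - 1] = bias + x0 * k0 + x1 * k1;
}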
3180
3181
void xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1(
3182
    size_t input_height,
3183
    size_t input_width,
3184
    const float* input,
3185
    const float* weights,
3186
    const float* zero,
3187
    float* output,
3188
    uint32_t padding_top,
3189
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
3190
0
{
3191
0
  assert(input_height != 0);
3192
0
  assert(input_width != 0);
3193
0
  assert(input_width % sizeof(float) == 0);
3194
0
  assert(padding_top == 1);
3195
3196
0
  const float vmin = params->scalar.min;
3197
0
  const float vmax = params->scalar.max;
3198
3199
0
  const float vbias = weights[0];
3200
0
  const float vk00 = weights[1];
3201
0
  const float vk01 = weights[2];
3202
0
  const float vk02 = weights[3];
3203
0
  const float vk10 = weights[4];
3204
0
  const float vk11 = weights[5];
3205
0
  const float vk12 = weights[6];
3206
0
  const float vk20 = weights[7];
3207
0
  const float vk21 = weights[8];
3208
0
  const float vk22 = weights[9];
3209
3210
0
  const float* i0 = zero;
3211
0
  const float* i1 = input;
3212
0
  const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
3213
0
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
3214
0
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
3215
0
  const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
3216
3217
0
  float* o0 = output;
3218
0
  float* o1 = (float*) ((uintptr_t) o0 + input_width);
3219
0
  float* o2 = (float*) ((uintptr_t) o1 + input_width);
3220
0
  float* o3 = (float*) ((uintptr_t) o2 + input_width);
3221
3222
0
  size_t output_height = input_height;
3223
0
  do {
3224
0
    if XNN_UNPREDICTABLE(output_height < 2) {
3225
0
      i2 = zero;
3226
0
      o1 = o0;
3227
0
    }
3228
0
    if XNN_UNPREDICTABLE(output_height < 3) {
3229
0
      i3 = zero;
3230
0
      o2 = o1;
3231
0
    }
3232
0
    if XNN_UNPREDICTABLE(output_height < 4) {
3233
0
      i4 = zero;
3234
0
      o3 = o2;
3235
0
    }
3236
0
    if XNN_UNPREDICTABLE(output_height < 5) {
3237
0
      i5 = zero;
3238
0
    }
3239
3240
0
    float vi0x0 = 0.0f;
3241
0
    float vi1x0 = 0.0f;
3242
0
    float vi2x0 = 0.0f;
3243
0
    float vi3x0 = 0.0f;
3244
0
    float vi4x0 = 0.0f;
3245
0
    float vi5x0 = 0.0f;
3246
3247
0
    float vi0x1 = *i0++;
3248
0
    float vi1x1 = *i1++;
3249
0
    float vi2x1 = *i2++;
3250
0
    float vi3x1 = *i3++;
3251
0
    float vi4x1 = *i4++;
3252
0
    float vi5x1 = *i5++;
3253
3254
0
    size_t w = input_width;
3255
0
    for (; w > 1 * sizeof(float); w -= 1 * sizeof(float)) {
3256
0
      const float vi0x2 = *i0++;
3257
0
      const float vi1x2 = *i1++;
3258
0
      const float vi2x2 = *i2++;
3259
0
      const float vi3x2 = *i3++;
3260
0
      const float vi4x2 = *i4++;
3261
0
      const float vi5x2 = *i5++;
3262
3263
0
      float vo0p0 = vbias + vi0x0 * vk00;
3264
0
      float vo1p0 = vbias + vi1x0 * vk00;
3265
0
      float vo2p0 = vbias + vi2x0 * vk00;
3266
0
      float vo3p0 = vbias + vi3x0 * vk00;
3267
0
      vo0p0 += vi1x0 * vk10;
3268
0
      vo1p0 += vi2x0 * vk10;
3269
0
      vo2p0 += vi3x0 * vk10;
3270
0
      vo3p0 += vi4x0 * vk10;
3271
0
      vo0p0 += vi2x0 * vk20;
3272
0
      vo1p0 += vi3x0 * vk20;
3273
0
      vo2p0 += vi4x0 * vk20;
3274
0
      vo3p0 += vi5x0 * vk20;
3275
3276
0
      vi0x0 = vi0x1;
3277
0
      vi1x0 = vi1x1;
3278
0
      vi2x0 = vi2x1;
3279
0
      vi3x0 = vi3x1;
3280
0
      vi4x0 = vi4x1;
3281
0
      vi5x0 = vi5x1;
3282
3283
0
      vo0p0 += vi0x1 * vk01;
3284
0
      vo1p0 += vi1x1 * vk01;
3285
0
      vo2p0 += vi2x1 * vk01;
3286
0
      vo3p0 += vi3x1 * vk01;
3287
0
      vo0p0 += vi1x1 * vk11;
3288
0
      vo1p0 += vi2x1 * vk11;
3289
0
      vo2p0 += vi3x1 * vk11;
3290
0
      vo3p0 += vi4x1 * vk11;
3291
0
      vo0p0 += vi2x1 * vk21;
3292
0
      vo1p0 += vi3x1 * vk21;
3293
0
      vo2p0 += vi4x1 * vk21;
3294
0
      vo3p0 += vi5x1 * vk21;
3295
3296
0
      vi0x1 = vi0x2;
3297
0
      vi1x1 = vi1x2;
3298
0
      vi2x1 = vi2x2;
3299
0
      vi3x1 = vi3x2;
3300
0
      vi4x1 = vi4x2;
3301
0
      vi5x1 = vi5x2;
3302
3303
0
      vo0p0 += vi0x2 * vk02;
3304
0
      vo1p0 += vi1x2 * vk02;
3305
0
      vo2p0 += vi2x2 * vk02;
3306
0
      vo3p0 += vi3x2 * vk02;
3307
0
      vo0p0 += vi1x2 * vk12;
3308
0
      vo1p0 += vi2x2 * vk12;
3309
0
      vo2p0 += vi3x2 * vk12;
3310
0
      vo3p0 += vi4x2 * vk12;
3311
0
      vo0p0 += vi2x2 * vk22;
3312
0
      vo1p0 += vi3x2 * vk22;
3313
0
      vo2p0 += vi4x2 * vk22;
3314
0
      vo3p0 += vi5x2 * vk22;
3315
3316
3317
0
      float vo0 = math_max_f32(vo0p0, vmin);
3318
0
      float vo1 = math_max_f32(vo1p0, vmin);
3319
0
      float vo2 = math_max_f32(vo2p0, vmin);
3320
0
      float vo3 = math_max_f32(vo3p0, vmin);
3321
3322
0
      vo0 = math_min_f32(vo0, vmax);
3323
0
      vo1 = math_min_f32(vo1, vmax);
3324
0
      vo2 = math_min_f32(vo2, vmax);
3325
0
      vo3 = math_min_f32(vo3, vmax);
3326
3327
0
      *o3++ = vo3;
3328
0
      *o2++ = vo2;
3329
0
      *o1++ = vo1;
3330
0
      *o0++ = vo0;
3331
0
    }
3332
    // Always process the last pixel separately to account for right edge.
3333
0
    assert(w == 1 * sizeof(float));
3334
0
    {
3335
0
      float vo0p0 = vbias + vi0x0 * vk00;
3336
0
      float vo1p0 = vbias + vi1x0 * vk00;
3337
0
      float vo2p0 = vbias + vi2x0 * vk00;
3338
0
      float vo3p0 = vbias + vi3x0 * vk00;
3339
0
      vo0p0 += vi1x0 * vk10;
3340
0
      vo1p0 += vi2x0 * vk10;
3341
0
      vo2p0 += vi3x0 * vk10;
3342
0
      vo3p0 += vi4x0 * vk10;
3343
0
      vo0p0 += vi2x0 * vk20;
3344
0
      vo1p0 += vi3x0 * vk20;
3345
0
      vo2p0 += vi4x0 * vk20;
3346
0
      vo3p0 += vi5x0 * vk20;
3347
3348
0
      vo0p0 += vi0x1 * vk01;
3349
0
      vo1p0 += vi1x1 * vk01;
3350
0
      vo2p0 += vi2x1 * vk01;
3351
0
      vo3p0 += vi3x1 * vk01;
3352
0
      vo0p0 += vi1x1 * vk11;
3353
0
      vo1p0 += vi2x1 * vk11;
3354
0
      vo2p0 += vi3x1 * vk11;
3355
0
      vo3p0 += vi4x1 * vk11;
3356
0
      vo0p0 += vi2x1 * vk21;
3357
0
      vo1p0 += vi3x1 * vk21;
3358
0
      vo2p0 += vi4x1 * vk21;
3359
0
      vo3p0 += vi5x1 * vk21;
3360
3361
3362
0
      float vo0 = math_max_f32(vo0p0, vmin);
3363
0
      float vo1 = math_max_f32(vo1p0, vmin);
3364
0
      float vo2 = math_max_f32(vo2p0, vmin);
3365
0
      float vo3 = math_max_f32(vo3p0, vmin);
3366
3367
0
      vo0 = math_min_f32(vo0, vmax);
3368
0
      vo1 = math_min_f32(vo1, vmax);
3369
0
      vo2 = math_min_f32(vo2, vmax);
3370
0
      vo3 = math_min_f32(vo3, vmax);
3371
3372
0
      *o3++ = vo3;
3373
0
      *o2++ = vo2;
3374
0
      *o1++ = vo1;
3375
0
      *o0++ = vo0;
3376
0
    }
3377
3378
0
    i0 = (const float*) ((uintptr_t) i4 - input_width);
3379
0
    i1 = (const float*) ((uintptr_t) i5 - input_width);
3380
0
    i2 = (const float*) ((uintptr_t) i1 + input_width);
3381
0
    i3 = (const float*) ((uintptr_t) i2 + input_width);
3382
0
    i4 = (const float*) ((uintptr_t) i3 + input_width);
3383
0
    i5 = (const float*) ((uintptr_t) i4 + input_width);
3384
3385
0
    o0 = o3;
3386
0
    o1 = (float*) ((uintptr_t) o0 + input_width);
3387
0
    o2 = (float*) ((uintptr_t) o1 + input_width);
3388
0
    o3 = (float*) ((uintptr_t) o2 + input_width);
3389
3390
0
    output_height = doz(output_height, 4);
3391
0
  } while (output_height != 0);
3392
0
}
3393
3394
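Editor's note: the 3x3 kernel above keeps three columns per input row (vi*x0, vi*x1, vi*x2) in locals and rotates them after every output pixel, so each input element is loaded from memory only once. The following is a minimal 1-D sketch of that rotation; the helper name and the inline right-edge handling are illustrative and not part of scalar.c, which peels the last column into a separate block instead.

#include <stddef.h>

/*
 * Editor's sketch (not part of scalar.c): a 1-D, 3-tap version of the column
 * rotation used by the 3x3 kernels above. Three columns live in locals and
 * shift left by one after each output, so every input element is read once.
 * The right edge is zero-padded inline here for brevity.
 */
static void dwconv1d_3tap_sketch(size_t n, const float x[], const float k[3], float y[]) {
  float x0 = 0.0f;                                    /* implicit left padding column */
  float x1 = (n != 0) ? x[0] : 0.0f;
  for (size_t i = 0; i < n; i++) {
    const float x2 = (i + 1 < n) ? x[i + 1] : 0.0f;   /* implicit right padding */
    y[i] = x0 * k[0] + x1 * k[1] + x2 * k[2];
    x0 = x1;                                          /* rotate the window by one column */
    x1 = x2;
  }
}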
void xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1_acc2(
3395
    size_t input_height,
3396
    size_t input_width,
3397
    const float* input,
3398
    const float* weights,
3399
    const float* zero,
3400
    float* output,
3401
    uint32_t padding_top,
3402
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
3403
0
{
3404
0
  assert(input_height != 0);
3405
0
  assert(input_width != 0);
3406
0
  assert(input_width % sizeof(float) == 0);
3407
0
  assert(padding_top >= 0);
3408
0
  assert(padding_top <= 1);
3409
3410
0
  const float vmin = params->scalar.min;
3411
0
  const float vmax = params->scalar.max;
3412
3413
0
  const float vbias = weights[0];
3414
0
  const float vk00 = weights[1];
3415
0
  const float vk01 = weights[2];
3416
0
  const float vk02 = weights[3];
3417
0
  const float vk10 = weights[4];
3418
0
  const float vk11 = weights[5];
3419
0
  const float vk12 = weights[6];
3420
0
  const float vk20 = weights[7];
3421
0
  const float vk21 = weights[8];
3422
0
  const float vk22 = weights[9];
3423
3424
3425
0
  const float* i0 = (const float*) ((uintptr_t) input - ((-padding_top) & input_width));
3426
0
  const float* i1 = (const float*) ((uintptr_t) i0 + input_width);
3427
0
  if XNN_UNPREDICTABLE(padding_top != 0) {
3428
0
    i0 = zero;
3429
0
  }
3430
0
  const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
3431
3432
0
  float* o0 = output;
3433
3434
0
  size_t padded_input_height = input_height + padding_top + 1 /* padding bottom */;
3435
0
  size_t output_height = (padded_input_height - 3 /* kernel size */ + 2 /* subsampling */) / 2;
3436
0
  do {
3437
0
    if XNN_UNPREDICTABLE(padded_input_height < 4) {
3438
0
      i2 = zero;
3439
0
    }
3440
3441
0
    float vi0x0 = 0.0f;
3442
0
    float vi1x0 = 0.0f;
3443
0
    float vi2x0 = 0.0f;
3444
3445
0
    size_t w = input_width;
3446
0
    for (; w >= 2 * sizeof(float); w -= 2 * sizeof(float)) {
3447
0
      const float vi0x1 = i0[0];
3448
0
      const float vi1x1 = i1[0];
3449
0
      const float vi2x1 = i2[0];
3450
3451
0
      float vo0p0 = vbias + vi0x0 * vk00;
3452
0
      float vo0p1 = vi1x0 * vk10;
3453
0
      vo0p0 += vi2x0 * vk20;
3454
3455
0
      const float vi0x2 = i0[1];
3456
0
      i0 += 2;
3457
0
      const float vi1x2 = i1[1];
3458
0
      i1 += 2;
3459
0
      const float vi2x2 = i2[1];
3460
0
      i2 += 2;
3461
3462
0
      vo0p1 += vi0x1 * vk01;
3463
0
      vo0p0 += vi1x1 * vk11;
3464
0
      vo0p1 += vi2x1 * vk21;
3465
3466
0
      vi0x0 = vi0x2;
3467
0
      vi1x0 = vi1x2;
3468
0
      vi2x0 = vi2x2;
3469
3470
0
      vo0p0 += vi0x2 * vk02;
3471
0
      vo0p1 += vi1x2 * vk12;
3472
0
      vo0p0 += vi2x2 * vk22;
3473
3474
0
      vo0p0 += vo0p1;
3475
3476
0
      float vo0 = math_max_f32(vo0p0, vmin);
3477
3478
0
      vo0 = math_min_f32(vo0, vmax);
3479
3480
0
      *o0++ = vo0;
3481
0
    }
3482
    // Potentially process the last pixel.
3483
0
    assert(w <= 1 * sizeof(float));
3484
0
    if (w != 0) {
3485
0
      const float vi0x1 = *i0++;
3486
0
      const float vi1x1 = *i1++;
3487
0
      const float vi2x1 = *i2++;
3488
3489
0
      float vo0p0 = vbias + vi0x0 * vk00;
3490
0
      float vo0p1 = vi1x0 * vk10;
3491
0
      vo0p0 += vi2x0 * vk20;
3492
3493
0
      vo0p1 += vi0x1 * vk01;
3494
0
      vo0p0 += vi1x1 * vk11;
3495
0
      vo0p1 += vi2x1 * vk21;
3496
3497
0
      vo0p0 += vo0p1;
3498
3499
0
      float vo0 = math_max_f32(vo0p0, vmin);
3500
3501
0
      vo0 = math_min_f32(vo0, vmax);
3502
3503
0
      *o0++ = vo0;
3504
0
    }
3505
3506
0
    i0 = (const float*) ((uintptr_t) i1);
3507
0
    i1 = (const float*) ((uintptr_t) i2);
3508
0
    i2 = (const float*) ((uintptr_t) i1 + input_width);
3509
3510
3511
0
    output_height -= 1;
3512
0
    padded_input_height -= 2;
3513
0
  } while (output_height != 0);
3514
0
}
3515
3516
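Editor's note: the s2p1 kernel above derives its row count from the padded input height with the usual (padded - kernel) / stride + 1 convolution formula, folded into a single division. A small restatement of that arithmetic follows; the function name is chosen here for illustration only.

#include <stddef.h>

/*
 * Editor's sketch (not part of scalar.c): the output-height arithmetic used by
 * the 3x3s2p1 kernels above. (padded - 3 + 2) / 2 equals the textbook
 * (padded - kernel) / stride + 1 whenever padded >= 3; e.g. a 5-row input with
 * one padding row on each side gives (7 - 3 + 2) / 2 = 3 output rows.
 */
static size_t s2p1_output_height_sketch(size_t input_height, size_t padding_top) {
  const size_t padded_input_height = input_height + padding_top + 1 /* padding bottom */;
  return (padded_input_height - 3 /* kernel size */ + 2 /* subsampling */) / 2;
}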
void xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2(
3517
    size_t input_height,
3518
    size_t input_width,
3519
    const float* input,
3520
    const float* weights,
3521
    const float* zero,
3522
    float* output,
3523
    uint32_t padding_top,
3524
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
3525
0
{
3526
0
  assert(input_height != 0);
3527
0
  assert(input_width != 0);
3528
0
  assert(input_width % sizeof(float) == 0);
3529
0
  assert(padding_top >= 0);
3530
0
  assert(padding_top <= 1);
3531
3532
0
  const float vmin = params->scalar.min;
3533
0
  const float vmax = params->scalar.max;
3534
3535
0
  const float vbias = weights[0];
3536
0
  const float vk00 = weights[1];
3537
0
  const float vk01 = weights[2];
3538
0
  const float vk02 = weights[3];
3539
0
  const float vk10 = weights[4];
3540
0
  const float vk11 = weights[5];
3541
0
  const float vk12 = weights[6];
3542
0
  const float vk20 = weights[7];
3543
0
  const float vk21 = weights[8];
3544
0
  const float vk22 = weights[9];
3545
3546
0
  const size_t output_width = round_down_po2((input_width + (2 /* padding */ - 3 /* kernel size */ + 2 /* subsampling */) * sizeof(float)) / 2, sizeof(float));
3547
3548
0
  const float* i0 = (const float*) ((uintptr_t) input - ((-padding_top) & input_width));
3549
0
  const float* i1 = (const float*) ((uintptr_t) i0 + input_width);
3550
0
  if XNN_UNPREDICTABLE(padding_top != 0) {
3551
0
    i0 = zero;
3552
0
  }
3553
0
  const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
3554
0
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
3555
0
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
3556
3557
0
  float* o0 = output;
3558
0
  float* o1 = (float*) ((uintptr_t) o0 + output_width);
3559
3560
0
  size_t padded_input_height = input_height + padding_top + 1 /* padding bottom */;
3561
0
  size_t output_height = (padded_input_height - 3 /* kernel size */ + 2 /* subsampling */) / 2;
3562
0
  do {
3563
0
    if XNN_UNPREDICTABLE(padded_input_height < 4) {
3564
0
      i2 = zero;
3565
0
    }
3566
0
    if XNN_UNPREDICTABLE(padded_input_height < 5) {
3567
0
      i3 = zero;
3568
0
      o1 = o0;
3569
0
    }
3570
0
    if XNN_UNPREDICTABLE(padded_input_height < 6) {
3571
0
      i4 = zero;
3572
0
    }
3573
3574
0
    float vi0x0 = 0.0f;
3575
0
    float vi1x0 = 0.0f;
3576
0
    float vi2x0 = 0.0f;
3577
0
    float vi3x0 = 0.0f;
3578
0
    float vi4x0 = 0.0f;
3579
3580
0
    size_t w = input_width;
3581
0
    for (; w >= 2 * sizeof(float); w -= 2 * sizeof(float)) {
3582
0
      const float vi0x1 = i0[0];
3583
0
      const float vi1x1 = i1[0];
3584
0
      const float vi2x1 = i2[0];
3585
0
      const float vi3x1 = i3[0];
3586
0
      const float vi4x1 = i4[0];
3587
3588
0
      float vo0p0 = vbias + vi0x0 * vk00;
3589
0
      float vo1p0 = vbias + vi2x0 * vk00;
3590
0
      float vo0p1 = vi1x0 * vk10;
3591
0
      float vo1p1 = vi3x0 * vk10;
3592
0
      vo0p0 += vi2x0 * vk20;
3593
0
      vo1p0 += vi4x0 * vk20;
3594
3595
0
      const float vi0x2 = i0[1];
3596
0
      i0 += 2;
3597
0
      const float vi1x2 = i1[1];
3598
0
      i1 += 2;
3599
0
      const float vi2x2 = i2[1];
3600
0
      i2 += 2;
3601
0
      const float vi3x2 = i3[1];
3602
0
      i3 += 2;
3603
0
      const float vi4x2 = i4[1];
3604
0
      i4 += 2;
3605
3606
0
      vo0p1 += vi0x1 * vk01;
3607
0
      vo1p1 += vi2x1 * vk01;
3608
0
      vo0p0 += vi1x1 * vk11;
3609
0
      vo1p0 += vi3x1 * vk11;
3610
0
      vo0p1 += vi2x1 * vk21;
3611
0
      vo1p1 += vi4x1 * vk21;
3612
3613
0
      vi0x0 = vi0x2;
3614
0
      vi1x0 = vi1x2;
3615
0
      vi2x0 = vi2x2;
3616
0
      vi3x0 = vi3x2;
3617
0
      vi4x0 = vi4x2;
3618
3619
0
      vo0p0 += vi0x2 * vk02;
3620
0
      vo1p0 += vi2x2 * vk02;
3621
0
      vo0p1 += vi1x2 * vk12;
3622
0
      vo1p1 += vi3x2 * vk12;
3623
0
      vo0p0 += vi2x2 * vk22;
3624
0
      vo1p0 += vi4x2 * vk22;
3625
3626
0
      vo0p0 += vo0p1;
3627
0
      vo1p0 += vo1p1;
3628
3629
0
      float vo0 = math_max_f32(vo0p0, vmin);
3630
0
      float vo1 = math_max_f32(vo1p0, vmin);
3631
3632
0
      vo0 = math_min_f32(vo0, vmax);
3633
0
      vo1 = math_min_f32(vo1, vmax);
3634
3635
0
      *o1++ = vo1;
3636
0
      *o0++ = vo0;
3637
0
    }
3638
    // Potentially process the last pixel.
3639
0
    assert(w <= 1 * sizeof(float));
3640
0
    if (w != 0) {
3641
0
      const float vi0x1 = *i0++;
3642
0
      const float vi1x1 = *i1++;
3643
0
      const float vi2x1 = *i2++;
3644
0
      const float vi3x1 = *i3++;
3645
0
      const float vi4x1 = *i4++;
3646
3647
0
      float vo0p0 = vbias + vi0x0 * vk00;
3648
0
      float vo1p0 = vbias + vi2x0 * vk00;
3649
0
      float vo0p1 = vi1x0 * vk10;
3650
0
      float vo1p1 = vi3x0 * vk10;
3651
0
      vo0p0 += vi2x0 * vk20;
3652
0
      vo1p0 += vi4x0 * vk20;
3653
3654
0
      vo0p1 += vi0x1 * vk01;
3655
0
      vo1p1 += vi2x1 * vk01;
3656
0
      vo0p0 += vi1x1 * vk11;
3657
0
      vo1p0 += vi3x1 * vk11;
3658
0
      vo0p1 += vi2x1 * vk21;
3659
0
      vo1p1 += vi4x1 * vk21;
3660
3661
0
      vo0p0 += vo0p1;
3662
0
      vo1p0 += vo1p1;
3663
3664
0
      float vo0 = math_max_f32(vo0p0, vmin);
3665
0
      float vo1 = math_max_f32(vo1p0, vmin);
3666
3667
0
      vo0 = math_min_f32(vo0, vmax);
3668
0
      vo1 = math_min_f32(vo1, vmax);
3669
3670
0
      *o1++ = vo1;
3671
0
      *o0++ = vo0;
3672
0
    }
3673
3674
0
    i0 = (const float*) ((uintptr_t) i3);
3675
0
    i1 = (const float*) ((uintptr_t) i4);
3676
0
    i2 = (const float*) ((uintptr_t) i1 + input_width);
3677
0
    i3 = (const float*) ((uintptr_t) i2 + input_width);
3678
0
    i4 = (const float*) ((uintptr_t) i3 + input_width);
3679
3680
0
    o0 = o1;
3681
0
    o1 = (float*) ((uintptr_t) o0 + output_width);
3682
3683
0
    output_height = doz(output_height, 2);
3684
0
    padded_input_height = doz(padded_input_height, 4);
3685
0
  } while (output_height != 0);
3686
0
}
3687
3688
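Editor's note: the 2x1 variant above computes two output rows per pass, leapfrogs the input pointers past the rows it consumed, and decrements output_height with doz(). The sketch below assumes doz is the usual saturating "difference or zero"; the name with the _sketch suffix is illustrative, not a quote of xnnpack/math.h.

#include <stddef.h>

/*
 * Editor's sketch (not part of scalar.c): saturating subtraction as assumed for
 * the doz() calls above, so a final short pass cannot wrap the unsigned counter.
 */
static inline size_t doz_sketch(size_t a, size_t b) {
  return (a >= b) ? a - b : 0;
}

With a tile of two rows and, say, five output rows, the passes see 5, 3, and 1 remaining rows; the last subtraction saturates to 0 and the do/while loop above terminates.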
void xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5(
3689
    size_t input_height,
3690
    size_t input_width,
3691
    const float* input,
3692
    const float* weights,
3693
    const float* zero,
3694
    float* output,
3695
    uint32_t padding_top,
3696
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
3697
0
{
3698
0
  assert(input_height != 0);
3699
0
  assert(input_width != 0);
3700
0
  assert(input_width % sizeof(float) == 0);
3701
0
  assert(padding_top == 2);
3702
3703
0
  const float vmin = params->scalar.min;
3704
0
  const float vmax = params->scalar.max;
3705
3706
0
  const float vbias = weights[0];
3707
0
  const float vk00 = weights[1];
3708
0
  const float vk01 = weights[2];
3709
0
  const float vk02 = weights[3];
3710
0
  const float vk03 = weights[4];
3711
0
  const float vk04 = weights[5];
3712
0
  const float vk10 = weights[6];
3713
0
  const float vk11 = weights[7];
3714
0
  const float vk12 = weights[8];
3715
0
  const float vk13 = weights[9];
3716
0
  const float vk14 = weights[10];
3717
0
  const float vk20 = weights[11];
3718
0
  const float vk21 = weights[12];
3719
0
  const float vk22 = weights[13];
3720
0
  const float vk23 = weights[14];
3721
0
  const float vk24 = weights[15];
3722
0
  const float vk30 = weights[16];
3723
0
  const float vk31 = weights[17];
3724
0
  const float vk32 = weights[18];
3725
0
  const float vk33 = weights[19];
3726
0
  const float vk34 = weights[20];
3727
0
  const float vk40 = weights[21];
3728
0
  const float vk41 = weights[22];
3729
0
  const float vk42 = weights[23];
3730
0
  const float vk43 = weights[24];
3731
0
  const float vk44 = weights[25];
3732
3733
0
  const float* i0 = zero;
3734
0
  const float* i1 = zero;
3735
0
  const float* i2 = input;
3736
0
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
3737
0
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
3738
3739
0
  float* o0 = output;
3740
3741
0
  size_t output_height = input_height;
3742
0
  do {
3743
0
    if XNN_UNPREDICTABLE(output_height < 2) {
3744
0
      i3 = zero;
3745
0
    }
3746
0
    if XNN_UNPREDICTABLE(output_height < 3) {
3747
0
      i4 = zero;
3748
0
    }
3749
3750
0
    float vi0x0 = 0.0f;
3751
0
    float vi1x0 = 0.0f;
3752
0
    float vi2x0 = 0.0f;
3753
0
    float vi3x0 = 0.0f;
3754
0
    float vi4x0 = 0.0f;
3755
3756
0
    float vi0x1 = 0.0f;
3757
0
    float vi1x1 = 0.0f;
3758
0
    float vi2x1 = 0.0f;
3759
0
    float vi3x1 = 0.0f;
3760
0
    float vi4x1 = 0.0f;
3761
3762
0
    float vi0x2 = *i0++;
3763
0
    float vi1x2 = *i1++;
3764
0
    float vi2x2 = *i2++;
3765
0
    float vi3x2 = *i3++;
3766
0
    float vi4x2 = *i4++;
3767
3768
0
    size_t w = input_width;
3769
0
    if (w > 1 * sizeof(float)) {
3770
0
      float vi0x3 = *i0++;
3771
0
      float vi1x3 = *i1++;
3772
0
      float vi2x3 = *i2++;
3773
0
      float vi3x3 = *i3++;
3774
0
      float vi4x3 = *i4++;
3775
3776
0
      for (; w > 2 * sizeof(float); w -= 1 * sizeof(float)) {
3777
0
        const float vi0x4 = *i0++;
3778
0
        const float vi1x4 = *i1++;
3779
0
        const float vi2x4 = *i2++;
3780
0
        const float vi3x4 = *i3++;
3781
0
        const float vi4x4 = *i4++;
3782
3783
0
        float vo0p0 = vbias + vi0x0 * vk00;
3784
0
        float vo0p1 = vi1x0 * vk10;
3785
0
        float vo0p2 = vi2x0 * vk20;
3786
0
        float vo0p3 = vi3x0 * vk30;
3787
0
        float vo0p4 = vi4x0 * vk40;
3788
3789
0
        vi0x0 = vi0x1;
3790
0
        vi1x0 = vi1x1;
3791
0
        vi2x0 = vi2x1;
3792
0
        vi3x0 = vi3x1;
3793
0
        vi4x0 = vi4x1;
3794
3795
0
        vo0p0 += vi0x1 * vk01;
3796
0
        vo0p1 += vi1x1 * vk11;
3797
0
        vo0p2 += vi2x1 * vk21;
3798
0
        vo0p3 += vi3x1 * vk31;
3799
0
        vo0p4 += vi4x1 * vk41;
3800
3801
0
        vi0x1 = vi0x2;
3802
0
        vi1x1 = vi1x2;
3803
0
        vi2x1 = vi2x2;
3804
0
        vi3x1 = vi3x2;
3805
0
        vi4x1 = vi4x2;
3806
3807
0
        vo0p0 += vi0x2 * vk02;
3808
0
        vo0p1 += vi1x2 * vk12;
3809
0
        vo0p2 += vi2x2 * vk22;
3810
0
        vo0p3 += vi3x2 * vk32;
3811
0
        vo0p4 += vi4x2 * vk42;
3812
3813
0
        vi0x2 = vi0x3;
3814
0
        vi1x2 = vi1x3;
3815
0
        vi2x2 = vi2x3;
3816
0
        vi3x2 = vi3x3;
3817
0
        vi4x2 = vi4x3;
3818
3819
0
        vo0p0 += vi0x3 * vk03;
3820
0
        vo0p1 += vi1x3 * vk13;
3821
0
        vo0p2 += vi2x3 * vk23;
3822
0
        vo0p3 += vi3x3 * vk33;
3823
0
        vo0p4 += vi4x3 * vk43;
3824
3825
0
        vi0x3 = vi0x4;
3826
0
        vi1x3 = vi1x4;
3827
0
        vi2x3 = vi2x4;
3828
0
        vi3x3 = vi3x4;
3829
0
        vi4x3 = vi4x4;
3830
3831
0
        vo0p0 += vi0x4 * vk04;
3832
0
        vo0p1 += vi1x4 * vk14;
3833
0
        vo0p2 += vi2x4 * vk24;
3834
0
        vo0p3 += vi3x4 * vk34;
3835
0
        vo0p4 += vi4x4 * vk44;
3836
3837
0
        vo0p0 += vo0p1;
3838
0
        vo0p2 += vo0p3;
3839
0
        vo0p0 += vo0p2;
3840
0
        vo0p0 += vo0p4;
3841
3842
0
        float vo0 = math_max_f32(vo0p0, vmin);
3843
3844
0
        vo0 = math_min_f32(vo0, vmax);
3845
3846
0
        *o0++ = vo0;
3847
0
      }
3848
0
      assert(w == 2 * sizeof(float));
3849
0
      {
3850
0
        float vo0p0 = vbias + vi0x0 * vk00;
3851
0
        float vo0p1 = vi1x0 * vk10;
3852
0
        float vo0p2 = vi2x0 * vk20;
3853
0
        float vo0p3 = vi3x0 * vk30;
3854
0
        float vo0p4 = vi4x0 * vk40;
3855
3856
0
        vi0x0 = vi0x1;
3857
0
        vi1x0 = vi1x1;
3858
0
        vi2x0 = vi2x1;
3859
0
        vi3x0 = vi3x1;
3860
0
        vi4x0 = vi4x1;
3861
3862
0
        vo0p0 += vi0x1 * vk01;
3863
0
        vo0p1 += vi1x1 * vk11;
3864
0
        vo0p2 += vi2x1 * vk21;
3865
0
        vo0p3 += vi3x1 * vk31;
3866
0
        vo0p4 += vi4x1 * vk41;
3867
3868
0
        vi0x1 = vi0x2;
3869
0
        vi1x1 = vi1x2;
3870
0
        vi2x1 = vi2x2;
3871
0
        vi3x1 = vi3x2;
3872
0
        vi4x1 = vi4x2;
3873
3874
0
        vo0p0 += vi0x2 * vk02;
3875
0
        vo0p1 += vi1x2 * vk12;
3876
0
        vo0p2 += vi2x2 * vk22;
3877
0
        vo0p3 += vi3x2 * vk32;
3878
0
        vo0p4 += vi4x2 * vk42;
3879
3880
0
        vi0x2 = vi0x3;
3881
0
        vi1x2 = vi1x3;
3882
0
        vi2x2 = vi2x3;
3883
0
        vi3x2 = vi3x3;
3884
0
        vi4x2 = vi4x3;
3885
3886
0
        vo0p0 += vi0x3 * vk03;
3887
0
        vo0p1 += vi1x3 * vk13;
3888
0
        vo0p2 += vi2x3 * vk23;
3889
0
        vo0p3 += vi3x3 * vk33;
3890
0
        vo0p4 += vi4x3 * vk43;
3891
3892
0
        vo0p0 += vo0p1;
3893
0
        vo0p2 += vo0p3;
3894
0
        vo0p0 += vo0p2;
3895
0
        vo0p0 += vo0p4;
3896
3897
0
        float vo0 = math_max_f32(vo0p0, vmin);
3898
3899
0
        vo0 = math_min_f32(vo0, vmax);
3900
3901
0
        *o0++ = vo0;
3902
0
      }
3903
0
      w -= 1 * sizeof(float);
3904
0
    }
3905
0
    assert(w == 1 * sizeof(float));
3906
0
    {
3907
0
      float vo0p0 = vbias + vi0x0 * vk00;
3908
0
      float vo0p1 = vi1x0 * vk10;
3909
0
      float vo0p2 = vi2x0 * vk20;
3910
0
      float vo0p3 = vi3x0 * vk30;
3911
0
      float vo0p4 = vi4x0 * vk40;
3912
3913
0
      vo0p0 += vi0x1 * vk01;
3914
0
      vo0p1 += vi1x1 * vk11;
3915
0
      vo0p2 += vi2x1 * vk21;
3916
0
      vo0p3 += vi3x1 * vk31;
3917
0
      vo0p4 += vi4x1 * vk41;
3918
3919
0
      vo0p0 += vi0x2 * vk02;
3920
0
      vo0p1 += vi1x2 * vk12;
3921
0
      vo0p2 += vi2x2 * vk22;
3922
0
      vo0p3 += vi3x2 * vk32;
3923
0
      vo0p4 += vi4x2 * vk42;
3924
3925
0
      vo0p0 += vo0p1;
3926
0
      vo0p2 += vo0p3;
3927
0
      vo0p0 += vo0p2;
3928
0
      vo0p0 += vo0p4;
3929
3930
0
      float vo0 = math_max_f32(vo0p0, vmin);
3931
3932
0
      vo0 = math_min_f32(vo0, vmax);
3933
3934
0
      *o0++ = vo0;
3935
0
    }
3936
3937
0
    i0 = (const float*) ((uintptr_t) i1 - input_width);
3938
0
    i1 = (const float*) ((uintptr_t) i2 - input_width);
3939
3940
3941
0
  } while (--output_height != 0);
3942
0
}
3943
3944
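Editor's note: the *_acc5 kernel above splits the 25-tap sum into five accumulators, one per kernel row, and folds them with a short pairwise tree (p0 += p1; p2 += p3; p0 += p2; p0 += p4) so the floating-point adds do not form one long dependency chain. A reduced sketch of the same pattern follows; the helper is illustrative and omits the bias term.

#include <stddef.h>

/*
 * Editor's sketch (not part of scalar.c): the multi-accumulator pattern behind
 * the *_acc5 kernel above. Five independent partial sums, one per kernel row,
 * are combined at the end in the same pairwise order as the kernel.
 */
static float dot25_acc5_sketch(const float x[25], const float k[25]) {
  float p0 = 0.0f, p1 = 0.0f, p2 = 0.0f, p3 = 0.0f, p4 = 0.0f;
  for (size_t col = 0; col < 5; col++) {
    p0 += x[0 * 5 + col] * k[0 * 5 + col];
    p1 += x[1 * 5 + col] * k[1 * 5 + col];
    p2 += x[2 * 5 + col] * k[2 * 5 + col];
    p3 += x[3 * 5 + col] * k[3 * 5 + col];
    p4 += x[4 * 5 + col] * k[4 * 5 + col];
  }
  p0 += p1;   /* same pairwise reduction order as the kernel above */
  p2 += p3;
  p0 += p2;
  p0 += p4;
  return p0;
}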
void xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2(
3945
    size_t input_height,
3946
    size_t input_width,
3947
    const float* input,
3948
    const float* weights,
3949
    const float* zero,
3950
    float* output,
3951
    uint32_t padding_top,
3952
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
3953
0
{
3954
0
  assert(input_height != 0);
3955
0
  assert(input_width != 0);
3956
0
  assert(input_width % sizeof(float) == 0);
3957
0
  assert(padding_top == 2);
3958
3959
0
  const float vmin = params->scalar.min;
3960
0
  const float vmax = params->scalar.max;
3961
3962
0
  const float vbias = weights[0];
3963
0
  const float vk00 = weights[1];
3964
0
  const float vk01 = weights[2];
3965
0
  const float vk02 = weights[3];
3966
0
  const float vk03 = weights[4];
3967
0
  const float vk04 = weights[5];
3968
0
  const float vk10 = weights[6];
3969
0
  const float vk11 = weights[7];
3970
0
  const float vk12 = weights[8];
3971
0
  const float vk13 = weights[9];
3972
0
  const float vk14 = weights[10];
3973
0
  const float vk20 = weights[11];
3974
0
  const float vk21 = weights[12];
3975
0
  const float vk22 = weights[13];
3976
0
  const float vk23 = weights[14];
3977
0
  const float vk24 = weights[15];
3978
0
  const float vk30 = weights[16];
3979
0
  const float vk31 = weights[17];
3980
0
  const float vk32 = weights[18];
3981
0
  const float vk33 = weights[19];
3982
0
  const float vk34 = weights[20];
3983
0
  const float vk40 = weights[21];
3984
0
  const float vk41 = weights[22];
3985
0
  const float vk42 = weights[23];
3986
0
  const float vk43 = weights[24];
3987
0
  const float vk44 = weights[25];
3988
3989
0
  const float* i0 = zero;
3990
0
  const float* i1 = zero;
3991
0
  const float* i2 = input;
3992
0
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
3993
0
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
3994
0
  const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
3995
3996
0
  float* o0 = output;
3997
0
  float* o1 = (float*) ((uintptr_t) o0 + input_width);
3998
3999
0
  size_t output_height = input_height;
4000
0
  do {
4001
0
    if XNN_UNPREDICTABLE(output_height < 2) {
4002
0
      i3 = zero;
4003
0
      o1 = o0;
4004
0
    }
4005
0
    if XNN_UNPREDICTABLE(output_height < 3) {
4006
0
      i4 = zero;
4007
0
    }
4008
0
    if XNN_UNPREDICTABLE(output_height < 4) {
4009
0
      i5 = zero;
4010
0
    }
4011
4012
0
    float vi0x0 = 0.0f;
4013
0
    float vi1x0 = 0.0f;
4014
0
    float vi2x0 = 0.0f;
4015
0
    float vi3x0 = 0.0f;
4016
0
    float vi4x0 = 0.0f;
4017
0
    float vi5x0 = 0.0f;
4018
4019
0
    float vi0x1 = 0.0f;
4020
0
    float vi1x1 = 0.0f;
4021
0
    float vi2x1 = 0.0f;
4022
0
    float vi3x1 = 0.0f;
4023
0
    float vi4x1 = 0.0f;
4024
0
    float vi5x1 = 0.0f;
4025
4026
0
    float vi0x2 = *i0++;
4027
0
    float vi1x2 = *i1++;
4028
0
    float vi2x2 = *i2++;
4029
0
    float vi3x2 = *i3++;
4030
0
    float vi4x2 = *i4++;
4031
0
    float vi5x2 = *i5++;
4032
4033
0
    size_t w = input_width;
4034
0
    if (w > 1 * sizeof(float)) {
4035
0
      float vi0x3 = *i0++;
4036
0
      float vi1x3 = *i1++;
4037
0
      float vi2x3 = *i2++;
4038
0
      float vi3x3 = *i3++;
4039
0
      float vi4x3 = *i4++;
4040
0
      float vi5x3 = *i5++;
4041
4042
0
      for (; w > 2 * sizeof(float); w -= 1 * sizeof(float)) {
4043
0
        const float vi0x4 = *i0++;
4044
0
        const float vi1x4 = *i1++;
4045
0
        const float vi2x4 = *i2++;
4046
0
        const float vi3x4 = *i3++;
4047
0
        const float vi4x4 = *i4++;
4048
0
        const float vi5x4 = *i5++;
4049
4050
0
        float vo0p0 = vbias + vi0x0 * vk00;
4051
0
        float vo1p0 = vbias + vi1x0 * vk00;
4052
0
        float vo0p1 = vi1x0 * vk10;
4053
0
        float vo1p1 = vi2x0 * vk10;
4054
0
        vo0p0 += vi2x0 * vk20;
4055
0
        vo1p0 += vi3x0 * vk20;
4056
0
        vo0p1 += vi3x0 * vk30;
4057
0
        vo1p1 += vi4x0 * vk30;
4058
0
        vo0p0 += vi4x0 * vk40;
4059
0
        vo1p0 += vi5x0 * vk40;
4060
4061
0
        vi0x0 = vi0x1;
4062
0
        vi1x0 = vi1x1;
4063
0
        vi2x0 = vi2x1;
4064
0
        vi3x0 = vi3x1;
4065
0
        vi4x0 = vi4x1;
4066
0
        vi5x0 = vi5x1;
4067
4068
0
        vo0p1 += vi0x1 * vk01;
4069
0
        vo1p1 += vi1x1 * vk01;
4070
0
        vo0p0 += vi1x1 * vk11;
4071
0
        vo1p0 += vi2x1 * vk11;
4072
0
        vo0p1 += vi2x1 * vk21;
4073
0
        vo1p1 += vi3x1 * vk21;
4074
0
        vo0p0 += vi3x1 * vk31;
4075
0
        vo1p0 += vi4x1 * vk31;
4076
0
        vo0p1 += vi4x1 * vk41;
4077
0
        vo1p1 += vi5x1 * vk41;
4078
4079
0
        vi0x1 = vi0x2;
4080
0
        vi1x1 = vi1x2;
4081
0
        vi2x1 = vi2x2;
4082
0
        vi3x1 = vi3x2;
4083
0
        vi4x1 = vi4x2;
4084
0
        vi5x1 = vi5x2;
4085
4086
0
        vo0p0 += vi0x2 * vk02;
4087
0
        vo1p0 += vi1x2 * vk02;
4088
0
        vo0p1 += vi1x2 * vk12;
4089
0
        vo1p1 += vi2x2 * vk12;
4090
0
        vo0p0 += vi2x2 * vk22;
4091
0
        vo1p0 += vi3x2 * vk22;
4092
0
        vo0p1 += vi3x2 * vk32;
4093
0
        vo1p1 += vi4x2 * vk32;
4094
0
        vo0p0 += vi4x2 * vk42;
4095
0
        vo1p0 += vi5x2 * vk42;
4096
4097
0
        vi0x2 = vi0x3;
4098
0
        vi1x2 = vi1x3;
4099
0
        vi2x2 = vi2x3;
4100
0
        vi3x2 = vi3x3;
4101
0
        vi4x2 = vi4x3;
4102
0
        vi5x2 = vi5x3;
4103
4104
0
        vo0p1 += vi0x3 * vk03;
4105
0
        vo1p1 += vi1x3 * vk03;
4106
0
        vo0p0 += vi1x3 * vk13;
4107
0
        vo1p0 += vi2x3 * vk13;
4108
0
        vo0p1 += vi2x3 * vk23;
4109
0
        vo1p1 += vi3x3 * vk23;
4110
0
        vo0p0 += vi3x3 * vk33;
4111
0
        vo1p0 += vi4x3 * vk33;
4112
0
        vo0p1 += vi4x3 * vk43;
4113
0
        vo1p1 += vi5x3 * vk43;
4114
4115
0
        vi0x3 = vi0x4;
4116
0
        vi1x3 = vi1x4;
4117
0
        vi2x3 = vi2x4;
4118
0
        vi3x3 = vi3x4;
4119
0
        vi4x3 = vi4x4;
4120
0
        vi5x3 = vi5x4;
4121
4122
0
        vo0p0 += vi0x4 * vk04;
4123
0
        vo1p0 += vi1x4 * vk04;
4124
0
        vo0p1 += vi1x4 * vk14;
4125
0
        vo1p1 += vi2x4 * vk14;
4126
0
        vo0p0 += vi2x4 * vk24;
4127
0
        vo1p0 += vi3x4 * vk24;
4128
0
        vo0p1 += vi3x4 * vk34;
4129
0
        vo1p1 += vi4x4 * vk34;
4130
0
        vo0p0 += vi4x4 * vk44;
4131
0
        vo1p0 += vi5x4 * vk44;
4132
4133
0
        vo0p0 += vo0p1;
4134
0
        vo1p0 += vo1p1;
4135
4136
0
        float vo0 = math_max_f32(vo0p0, vmin);
4137
0
        float vo1 = math_max_f32(vo1p0, vmin);
4138
4139
0
        vo0 = math_min_f32(vo0, vmax);
4140
0
        vo1 = math_min_f32(vo1, vmax);
4141
4142
0
        *o1++ = vo1;
4143
0
        *o0++ = vo0;
4144
0
      }
4145
0
      assert(w == 2 * sizeof(float));
4146
0
      {
4147
0
        float vo0p0 = vbias + vi0x0 * vk00;
4148
0
        float vo1p0 = vbias + vi1x0 * vk00;
4149
0
        float vo0p1 = vi1x0 * vk10;
4150
0
        float vo1p1 = vi2x0 * vk10;
4151
0
        vo0p0 += vi2x0 * vk20;
4152
0
        vo1p0 += vi3x0 * vk20;
4153
0
        vo0p1 += vi3x0 * vk30;
4154
0
        vo1p1 += vi4x0 * vk30;
4155
0
        vo0p0 += vi4x0 * vk40;
4156
0
        vo1p0 += vi5x0 * vk40;
4157
4158
0
        vi0x0 = vi0x1;
4159
0
        vi1x0 = vi1x1;
4160
0
        vi2x0 = vi2x1;
4161
0
        vi3x0 = vi3x1;
4162
0
        vi4x0 = vi4x1;
4163
0
        vi5x0 = vi5x1;
4164
4165
0
        vo0p1 += vi0x1 * vk01;
4166
0
        vo1p1 += vi1x1 * vk01;
4167
0
        vo0p0 += vi1x1 * vk11;
4168
0
        vo1p0 += vi2x1 * vk11;
4169
0
        vo0p1 += vi2x1 * vk21;
4170
0
        vo1p1 += vi3x1 * vk21;
4171
0
        vo0p0 += vi3x1 * vk31;
4172
0
        vo1p0 += vi4x1 * vk31;
4173
0
        vo0p1 += vi4x1 * vk41;
4174
0
        vo1p1 += vi5x1 * vk41;
4175
4176
0
        vi0x1 = vi0x2;
4177
0
        vi1x1 = vi1x2;
4178
0
        vi2x1 = vi2x2;
4179
0
        vi3x1 = vi3x2;
4180
0
        vi4x1 = vi4x2;
4181
0
        vi5x1 = vi5x2;
4182
4183
0
        vo0p0 += vi0x2 * vk02;
4184
0
        vo1p0 += vi1x2 * vk02;
4185
0
        vo0p1 += vi1x2 * vk12;
4186
0
        vo1p1 += vi2x2 * vk12;
4187
0
        vo0p0 += vi2x2 * vk22;
4188
0
        vo1p0 += vi3x2 * vk22;
4189
0
        vo0p1 += vi3x2 * vk32;
4190
0
        vo1p1 += vi4x2 * vk32;
4191
0
        vo0p0 += vi4x2 * vk42;
4192
0
        vo1p0 += vi5x2 * vk42;
4193
4194
0
        vi0x2 = vi0x3;
4195
0
        vi1x2 = vi1x3;
4196
0
        vi2x2 = vi2x3;
4197
0
        vi3x2 = vi3x3;
4198
0
        vi4x2 = vi4x3;
4199
0
        vi5x2 = vi5x3;
4200
4201
0
        vo0p1 += vi0x3 * vk03;
4202
0
        vo1p1 += vi1x3 * vk03;
4203
0
        vo0p0 += vi1x3 * vk13;
4204
0
        vo1p0 += vi2x3 * vk13;
4205
0
        vo0p1 += vi2x3 * vk23;
4206
0
        vo1p1 += vi3x3 * vk23;
4207
0
        vo0p0 += vi3x3 * vk33;
4208
0
        vo1p0 += vi4x3 * vk33;
4209
0
        vo0p1 += vi4x3 * vk43;
4210
0
        vo1p1 += vi5x3 * vk43;
4211
4212
0
        vo0p0 += vo0p1;
4213
0
        vo1p0 += vo1p1;
4214
4215
0
        float vo0 = math_max_f32(vo0p0, vmin);
4216
0
        float vo1 = math_max_f32(vo1p0, vmin);
4217
4218
0
        vo0 = math_min_f32(vo0, vmax);
4219
0
        vo1 = math_min_f32(vo1, vmax);
4220
4221
0
        *o1++ = vo1;
4222
0
        *o0++ = vo0;
4223
0
      }
4224
0
      w -= 1 * sizeof(float);
4225
0
    }
4226
0
    assert(w == 1 * sizeof(float));
4227
0
    {
4228
0
      float vo0p0 = vbias + vi0x0 * vk00;
4229
0
      float vo1p0 = vbias + vi1x0 * vk00;
4230
0
      float vo0p1 = vi1x0 * vk10;
4231
0
      float vo1p1 = vi2x0 * vk10;
4232
0
      vo0p0 += vi2x0 * vk20;
4233
0
      vo1p0 += vi3x0 * vk20;
4234
0
      vo0p1 += vi3x0 * vk30;
4235
0
      vo1p1 += vi4x0 * vk30;
4236
0
      vo0p0 += vi4x0 * vk40;
4237
0
      vo1p0 += vi5x0 * vk40;
4238
4239
0
      vo0p1 += vi0x1 * vk01;
4240
0
      vo1p1 += vi1x1 * vk01;
4241
0
      vo0p0 += vi1x1 * vk11;
4242
0
      vo1p0 += vi2x1 * vk11;
4243
0
      vo0p1 += vi2x1 * vk21;
4244
0
      vo1p1 += vi3x1 * vk21;
4245
0
      vo0p0 += vi3x1 * vk31;
4246
0
      vo1p0 += vi4x1 * vk31;
4247
0
      vo0p1 += vi4x1 * vk41;
4248
0
      vo1p1 += vi5x1 * vk41;
4249
4250
0
      vo0p0 += vi0x2 * vk02;
4251
0
      vo1p0 += vi1x2 * vk02;
4252
0
      vo0p1 += vi1x2 * vk12;
4253
0
      vo1p1 += vi2x2 * vk12;
4254
0
      vo0p0 += vi2x2 * vk22;
4255
0
      vo1p0 += vi3x2 * vk22;
4256
0
      vo0p1 += vi3x2 * vk32;
4257
0
      vo1p1 += vi4x2 * vk32;
4258
0
      vo0p0 += vi4x2 * vk42;
4259
0
      vo1p0 += vi5x2 * vk42;
4260
4261
0
      vo0p0 += vo0p1;
4262
0
      vo1p0 += vo1p1;
4263
4264
0
      float vo0 = math_max_f32(vo0p0, vmin);
4265
0
      float vo1 = math_max_f32(vo1p0, vmin);
4266
4267
0
      vo0 = math_min_f32(vo0, vmax);
4268
0
      vo1 = math_min_f32(vo1, vmax);
4269
4270
0
      *o1++ = vo1;
4271
0
      *o0++ = vo0;
4272
0
    }
4273
4274
0
    i0 = (const float*) ((uintptr_t) i2 - input_width);
4275
0
    i1 = (const float*) ((uintptr_t) i3 - input_width);
4276
0
    i2 = i3;
4277
0
    i3 = i4;
4278
0
    i4 = i5;
4279
0
    i5 = (const float*) ((uintptr_t) i4 + input_width);
4280
4281
0
    o0 = o1;
4282
0
    o1 = (float*) ((uintptr_t) o0 + input_width);
4283
4284
0
    output_height = doz(output_height, 2);
4285
0
  } while (output_height != 0);
4286
0
}
4287
4288
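Editor's note: in the 2x1 kernel directly above, the two output rows overlap on most of their input: output row 0 reads input rows i0..i4 and output row 1 reads i1..i5, so four of the six rows feed both results from the same locals. A compact sketch of that sharing for one column of a 5-tap column filter follows; the names are illustrative.

#include <stddef.h>

/*
 * Editor's sketch (not part of scalar.c): the row sharing behind the stride-1
 * 2x1 kernels above. Two vertically adjacent outputs of a 5-row column filter
 * overlap on four input rows, so one pass over six rows yields both results.
 */
static void column_filter_2x_sketch(const float v[6], const float k[5],
                                    float* out0, float* out1) {
  *out0 = v[0] * k[0] + v[1] * k[1] + v[2] * k[2] + v[3] * k[3] + v[4] * k[4];
  *out1 = v[1] * k[0] + v[2] * k[1] + v[3] * k[2] + v[4] * k[3] + v[5] * k[4];
}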
void xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5(
4289
    size_t input_height,
4290
    size_t input_width,
4291
    const float* input,
4292
    const float* weights,
4293
    const float* zero,
4294
    float* output,
4295
    uint32_t padding_top,
4296
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
4297
0
{
4298
0
  assert(input_height != 0);
4299
0
  assert(input_width != 0);
4300
0
  assert(input_width % sizeof(float) == 0);
4301
0
  assert(padding_top >= 1);
4302
0
  assert(padding_top <= 2);
4303
4304
0
  const float vmax = params->scalar.max;
4305
0
  const float vmin = params->scalar.min;
4306
4307
0
  const float vbias = weights[0];
4308
0
  const float vk00 = weights[1];
4309
0
  const float vk01 = weights[2];
4310
0
  const float vk02 = weights[3];
4311
0
  const float vk03 = weights[4];
4312
0
  const float vk04 = weights[5];
4313
0
  const float vk10 = weights[6];
4314
0
  const float vk11 = weights[7];
4315
0
  const float vk12 = weights[8];
4316
0
  const float vk13 = weights[9];
4317
0
  const float vk14 = weights[10];
4318
0
  const float vk20 = weights[11];
4319
0
  const float vk21 = weights[12];
4320
0
  const float vk22 = weights[13];
4321
0
  const float vk23 = weights[14];
4322
0
  const float vk24 = weights[15];
4323
0
  const float vk30 = weights[16];
4324
0
  const float vk31 = weights[17];
4325
0
  const float vk32 = weights[18];
4326
0
  const float vk33 = weights[19];
4327
0
  const float vk34 = weights[20];
4328
0
  const float vk40 = weights[21];
4329
0
  const float vk41 = weights[22];
4330
0
  const float vk42 = weights[23];
4331
0
  const float vk43 = weights[24];
4332
0
  const float vk44 = weights[25];
4333
4334
0
  const uint32_t padding_top_less_1 = padding_top - 1;
4335
4336
0
  const float* i0 = zero;
4337
0
  const float* i1 = (const float*) ((uintptr_t) input - ((-padding_top_less_1) & input_width));
4338
0
  const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
4339
0
  if XNN_UNPREDICTABLE(padding_top_less_1 != 0) {
4340
0
    i1 = zero;
4341
0
  }
4342
0
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
4343
0
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
4344
4345
4346
0
  float* o0 = output;
4347
4348
0
  size_t padded_input_height = input_height + (padding_top_less_1 + 1) + 2 /* padding bottom */;
4349
0
  size_t output_height = (padded_input_height - 5 /* kernel size */ + 2 /* subsampling */) / 2;
4350
0
  do {
4351
0
    if XNN_UNPREDICTABLE(padded_input_height < 6) {
4352
0
      i3 = zero;
4353
0
    }
4354
0
    if XNN_UNPREDICTABLE(padded_input_height < 7) {
4355
0
      i4 = zero;
4356
0
    }
4357
4358
0
    float vi0x0 = 0.0f;
4359
0
    float vi1x0 = 0.0f;
4360
0
    float vi2x0 = 0.0f;
4361
0
    float vi3x0 = 0.0f;
4362
0
    float vi4x0 = 0.0f;
4363
4364
0
    float vi0x1 = 0.0f;
4365
0
    float vi1x1 = 0.0f;
4366
0
    float vi2x1 = 0.0f;
4367
0
    float vi3x1 = 0.0f;
4368
0
    float vi4x1 = 0.0f;
4369
4370
0
    float vi0x2 = *i0++;
4371
0
    float vi1x2 = *i1++;
4372
0
    float vi2x2 = *i2++;
4373
0
    float vi3x2 = *i3++;
4374
0
    float vi4x2 = *i4++;
4375
4376
0
    size_t w = input_width;
4377
0
    for (; w > 2 * sizeof(float); w -= 2 * sizeof(float)) {
4378
0
      const float vi0x3 = i0[0];
4379
0
      const float vi1x3 = i1[0];
4380
0
      const float vi2x3 = i2[0];
4381
0
      const float vi3x3 = i3[0];
4382
0
      const float vi4x3 = i4[0];
4383
4384
0
      const float vi0x4 = i0[1];
4385
0
      i0 += 2;
4386
0
      const float vi1x4 = i1[1];
4387
0
      i1 += 2;
4388
0
      const float vi2x4 = i2[1];
4389
0
      i2 += 2;
4390
0
      const float vi3x4 = i3[1];
4391
0
      i3 += 2;
4392
0
      const float vi4x4 = i4[1];
4393
0
      i4 += 2;
4394
4395
0
      float vo0p0 = vbias + vi0x0 * vk00;
4396
0
      float vo0p1 = vi1x0 * vk10;
4397
0
      float vo0p2 = vi2x0 * vk20;
4398
0
      float vo0p3 = vi3x0 * vk30;
4399
0
      float vo0p4 = vi4x0 * vk40;
4400
4401
0
      vi0x0 = vi0x2;
4402
0
      vi1x0 = vi1x2;
4403
0
      vi2x0 = vi2x2;
4404
0
      vi3x0 = vi3x2;
4405
0
      vi4x0 = vi4x2;
4406
4407
0
      vo0p0 += vi0x1 * vk01;
4408
0
      vo0p1 += vi1x1 * vk11;
4409
0
      vo0p2 += vi2x1 * vk21;
4410
0
      vo0p3 += vi3x1 * vk31;
4411
0
      vo0p4 += vi4x1 * vk41;
4412
4413
0
      vi0x1 = vi0x3;
4414
0
      vi1x1 = vi1x3;
4415
0
      vi2x1 = vi2x3;
4416
0
      vi3x1 = vi3x3;
4417
0
      vi4x1 = vi4x3;
4418
4419
0
      vo0p0 += vi0x2 * vk02;
4420
0
      vo0p1 += vi1x2 * vk12;
4421
0
      vo0p2 += vi2x2 * vk22;
4422
0
      vo0p3 += vi3x2 * vk32;
4423
0
      vo0p4 += vi4x2 * vk42;
4424
4425
0
      vi0x2 = vi0x4;
4426
0
      vi1x2 = vi1x4;
4427
0
      vi2x2 = vi2x4;
4428
0
      vi3x2 = vi3x4;
4429
0
      vi4x2 = vi4x4;
4430
4431
0
      vo0p0 += vi0x3 * vk03;
4432
0
      vo0p1 += vi1x3 * vk13;
4433
0
      vo0p2 += vi2x3 * vk23;
4434
0
      vo0p3 += vi3x3 * vk33;
4435
0
      vo0p4 += vi4x3 * vk43;
4436
4437
0
      vo0p0 += vi0x4 * vk04;
4438
0
      vo0p1 += vi1x4 * vk14;
4439
0
      vo0p2 += vi2x4 * vk24;
4440
0
      vo0p3 += vi3x4 * vk34;
4441
0
      vo0p4 += vi4x4 * vk44;
4442
4443
0
      vo0p0 += vo0p1;
4444
0
      vo0p2 += vo0p3;
4445
0
      vo0p0 += vo0p2;
4446
0
      vo0p0 += vo0p4;
4447
4448
0
      float vo0 = math_max_f32(vo0p0, vmin);
4449
4450
0
      vo0 = math_min_f32(vo0, vmax);
4451
4452
0
      *o0++ = vo0;
4453
0
    }
4454
0
    if XNN_LIKELY(w == 2 * sizeof(float)) {
4455
0
      const float vi0x3 = *i0++;
4456
0
      const float vi1x3 = *i1++;
4457
0
      const float vi2x3 = *i2++;
4458
0
      const float vi3x3 = *i3++;
4459
0
      const float vi4x3 = *i4++;
4460
4461
0
      float vo0p0 = vbias + vi0x0 * vk00;
4462
0
      float vo0p1 = vi1x0 * vk10;
4463
0
      float vo0p2 = vi2x0 * vk20;
4464
0
      float vo0p3 = vi3x0 * vk30;
4465
0
      float vo0p4 = vi4x0 * vk40;
4466
4467
0
      vo0p0 += vi0x1 * vk01;
4468
0
      vo0p1 += vi1x1 * vk11;
4469
0
      vo0p2 += vi2x1 * vk21;
4470
0
      vo0p3 += vi3x1 * vk31;
4471
0
      vo0p4 += vi4x1 * vk41;
4472
4473
0
      vo0p0 += vi0x2 * vk02;
4474
0
      vo0p1 += vi1x2 * vk12;
4475
0
      vo0p2 += vi2x2 * vk22;
4476
0
      vo0p3 += vi3x2 * vk32;
4477
0
      vo0p4 += vi4x2 * vk42;
4478
4479
0
      vo0p0 += vi0x3 * vk03;
4480
0
      vo0p1 += vi1x3 * vk13;
4481
0
      vo0p2 += vi2x3 * vk23;
4482
0
      vo0p3 += vi3x3 * vk33;
4483
0
      vo0p4 += vi4x3 * vk43;
4484
4485
0
      vo0p0 += vo0p1;
4486
0
      vo0p2 += vo0p3;
4487
0
      vo0p0 += vo0p2;
4488
0
      vo0p0 += vo0p4;
4489
4490
0
      float vo0 = math_max_f32(vo0p0, vmin);
4491
4492
0
      vo0 = math_min_f32(vo0, vmax);
4493
4494
0
      *o0++ = vo0;
4495
0
    } else {
4496
0
      float vo0p0 = vbias + vi0x0 * vk00;
4497
0
      float vo0p1 = vi1x0 * vk10;
4498
0
      float vo0p2 = vi2x0 * vk20;
4499
0
      float vo0p3 = vi3x0 * vk30;
4500
0
      float vo0p4 = vi4x0 * vk40;
4501
4502
0
      vo0p0 += vi0x1 * vk01;
4503
0
      vo0p1 += vi1x1 * vk11;
4504
0
      vo0p2 += vi2x1 * vk21;
4505
0
      vo0p3 += vi3x1 * vk31;
4506
0
      vo0p4 += vi4x1 * vk41;
4507
4508
0
      vo0p0 += vi0x2 * vk02;
4509
0
      vo0p1 += vi1x2 * vk12;
4510
0
      vo0p2 += vi2x2 * vk22;
4511
0
      vo0p3 += vi3x2 * vk32;
4512
0
      vo0p4 += vi4x2 * vk42;
4513
4514
0
      vo0p0 += vo0p1;
4515
0
      vo0p2 += vo0p3;
4516
0
      vo0p0 += vo0p2;
4517
0
      vo0p0 += vo0p4;
4518
4519
0
      float vo0 = math_max_f32(vo0p0, vmin);
4520
4521
0
      vo0 = math_min_f32(vo0, vmax);
4522
4523
0
      *o0++ = vo0;
4524
0
    }
4525
4526
0
    i0 = (const float*) ((uintptr_t) i2 - input_width);
4527
0
    i1 = (const float*) ((uintptr_t) i2);
4528
0
    i2 = (const float*) ((uintptr_t) i3);
4529
0
    i3 = (const float*) ((uintptr_t) i4);
4530
0
    i4 = (const float*) ((uintptr_t) i3 + input_width);
4531
4532
4533
0
    output_height -= 1;
4534
0
    padded_input_height -= 2;
4535
0
  } while (output_height != 0);
4536
0
}
4537
4538
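Editor's note: in the stride-2 kernel above, each loop iteration loads two fresh columns and shifts the five-column window by two (x0 takes x2, x1 takes x3, x2 takes x4). Below is a 1-D sketch of that window handling under the same padding-of-2 assumption; the edge handling is folded into the loop for brevity, whereas the real kernel peels the last one or two columns into separate blocks. The helper is illustrative, not part of scalar.c.

#include <stddef.h>

/*
 * Editor's sketch (not part of scalar.c): a 1-D, 5-tap, stride-2 version of the
 * window handling in the 5x5s2p2 kernel above. Each iteration consumes two new
 * input columns and shifts the window by the stride; two columns of zero
 * padding sit on each side.
 */
static void dwconv1d_5tap_s2_sketch(size_t n, const float x[], const float k[5], float y[]) {
  float x0 = 0.0f;                                    /* two columns of left padding */
  float x1 = 0.0f;
  float x2 = (n != 0) ? x[0] : 0.0f;
  size_t i = 1;
  for (size_t out = 0; out < (n + 1) / 2; out++) {
    const float x3 = (i < n) ? x[i] : 0.0f;           /* right padding at the edge */
    const float x4 = (i + 1 < n) ? x[i + 1] : 0.0f;
    i += 2;
    y[out] = x0 * k[0] + x1 * k[1] + x2 * k[2] + x3 * k[3] + x4 * k[4];
    x0 = x2;                                          /* shift the window by the stride (2) */
    x1 = x3;
    x2 = x4;
  }
}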
void xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2(
4539
    size_t input_height,
4540
    size_t input_width,
4541
    const float* input,
4542
    const float* weights,
4543
    const float* zero,
4544
    float* output,
4545
    uint32_t padding_top,
4546
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
4547
0
{
4548
0
  assert(input_height != 0);
4549
0
  assert(input_width != 0);
4550
0
  assert(input_width % sizeof(float) == 0);
4551
0
  assert(padding_top >= 1);
4552
0
  assert(padding_top <= 2);
4553
4554
0
  const float vmax = params->scalar.max;
4555
0
  const float vmin = params->scalar.min;
4556
4557
0
  const float vbias = weights[0];
4558
0
  const float vk00 = weights[1];
4559
0
  const float vk01 = weights[2];
4560
0
  const float vk02 = weights[3];
4561
0
  const float vk03 = weights[4];
4562
0
  const float vk04 = weights[5];
4563
0
  const float vk10 = weights[6];
4564
0
  const float vk11 = weights[7];
4565
0
  const float vk12 = weights[8];
4566
0
  const float vk13 = weights[9];
4567
0
  const float vk14 = weights[10];
4568
0
  const float vk20 = weights[11];
4569
0
  const float vk21 = weights[12];
4570
0
  const float vk22 = weights[13];
4571
0
  const float vk23 = weights[14];
4572
0
  const float vk24 = weights[15];
4573
0
  const float vk30 = weights[16];
4574
0
  const float vk31 = weights[17];
4575
0
  const float vk32 = weights[18];
4576
0
  const float vk33 = weights[19];
4577
0
  const float vk34 = weights[20];
4578
0
  const float vk40 = weights[21];
4579
0
  const float vk41 = weights[22];
4580
0
  const float vk42 = weights[23];
4581
0
  const float vk43 = weights[24];
4582
0
  const float vk44 = weights[25];
4583
4584
0
  const uint32_t padding_top_less_1 = padding_top - 1;
4585
4586
0
  const float* i0 = zero;
4587
0
  const float* i1 = (const float*) ((uintptr_t) input - ((-padding_top_less_1) & input_width));
4588
0
  const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
4589
0
  if XNN_UNPREDICTABLE(padding_top_less_1 != 0) {
4590
0
    i1 = zero;
4591
0
  }
4592
0
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
4593
0
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
4594
0
  const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
4595
0
  const float* i6 = (const float*) ((uintptr_t) i5 + input_width);
4596
4597
0
  const size_t output_width = round_down_po2((input_width + (2 /* padding */ - 3 /* kernel size */ + 2 /* subsampling */) * sizeof(float)) / 2, sizeof(float));
4598
4599
0
  float* o0 = output;
4600
0
  float* o1 = (float*) ((uintptr_t) o0 + output_width);
4601
4602
0
  size_t padded_input_height = input_height + (padding_top_less_1 + 1) + 2 /* padding bottom */;
4603
0
  size_t output_height = (padded_input_height - 5 /* kernel size */ + 2 /* subsampling */) / 2;
4604
0
  do {
4605
0
    if XNN_UNPREDICTABLE(padded_input_height < 6) {
4606
0
      i3 = zero;
4607
0
    }
4608
0
    if XNN_UNPREDICTABLE(padded_input_height < 7) {
4609
0
      i4 = zero;
4610
0
      o1 = o0;
4611
0
    }
4612
0
    if XNN_UNPREDICTABLE(padded_input_height < 8) {
4613
0
      i5 = zero;
4614
0
    }
4615
0
    if XNN_UNPREDICTABLE(padded_input_height < 9) {
4616
0
      i6 = zero;
4617
0
    }
4618
4619
0
    float vi0x0 = 0.0f;
4620
0
    float vi1x0 = 0.0f;
4621
0
    float vi2x0 = 0.0f;
4622
0
    float vi3x0 = 0.0f;
4623
0
    float vi4x0 = 0.0f;
4624
0
    float vi5x0 = 0.0f;
4625
0
    float vi6x0 = 0.0f;
4626
4627
0
    float vi0x1 = 0.0f;
4628
0
    float vi1x1 = 0.0f;
4629
0
    float vi2x1 = 0.0f;
4630
0
    float vi3x1 = 0.0f;
4631
0
    float vi4x1 = 0.0f;
4632
0
    float vi5x1 = 0.0f;
4633
0
    float vi6x1 = 0.0f;
4634
4635
0
    float vi0x2 = *i0++;
4636
0
    float vi1x2 = *i1++;
4637
0
    float vi2x2 = *i2++;
4638
0
    float vi3x2 = *i3++;
4639
0
    float vi4x2 = *i4++;
4640
0
    float vi5x2 = *i5++;
4641
0
    float vi6x2 = *i6++;
4642
4643
0
    size_t w = input_width;
4644
0
    for (; w > 2 * sizeof(float); w -= 2 * sizeof(float)) {
4645
0
      const float vi0x3 = i0[0];
4646
0
      const float vi1x3 = i1[0];
4647
0
      const float vi2x3 = i2[0];
4648
0
      const float vi3x3 = i3[0];
4649
0
      const float vi4x3 = i4[0];
4650
0
      const float vi5x3 = i5[0];
4651
0
      const float vi6x3 = i6[0];
4652
4653
0
      const float vi0x4 = i0[1];
4654
0
      i0 += 2;
4655
0
      const float vi1x4 = i1[1];
4656
0
      i1 += 2;
4657
0
      const float vi2x4 = i2[1];
4658
0
      i2 += 2;
4659
0
      const float vi3x4 = i3[1];
4660
0
      i3 += 2;
4661
0
      const float vi4x4 = i4[1];
4662
0
      i4 += 2;
4663
0
      const float vi5x4 = i5[1];
4664
0
      i5 += 2;
4665
0
      const float vi6x4 = i6[1];
4666
0
      i6 += 2;
4667
4668
0
      float vo0p0 = vbias + vi0x0 * vk00;
4669
0
      float vo1p0 = vbias + vi2x0 * vk00;
4670
0
      float vo0p1 = vi1x0 * vk10;
4671
0
      float vo1p1 = vi3x0 * vk10;
4672
0
      vo0p0 += vi2x0 * vk20;
4673
0
      vo1p0 += vi4x0 * vk20;
4674
0
      vo0p1 += vi3x0 * vk30;
4675
0
      vo1p1 += vi5x0 * vk30;
4676
0
      vo0p0 += vi4x0 * vk40;
4677
0
      vo1p0 += vi6x0 * vk40;
4678
4679
0
      vi0x0 = vi0x2;
4680
0
      vi1x0 = vi1x2;
4681
0
      vi2x0 = vi2x2;
4682
0
      vi3x0 = vi3x2;
4683
0
      vi4x0 = vi4x2;
4684
0
      vi5x0 = vi5x2;
4685
0
      vi6x0 = vi6x2;
4686
4687
0
      vo0p1 += vi0x1 * vk01;
4688
0
      vo1p1 += vi2x1 * vk01;
4689
0
      vo0p0 += vi1x1 * vk11;
4690
0
      vo1p0 += vi3x1 * vk11;
4691
0
      vo0p1 += vi2x1 * vk21;
4692
0
      vo1p1 += vi4x1 * vk21;
4693
0
      vo0p0 += vi3x1 * vk31;
4694
0
      vo1p0 += vi5x1 * vk31;
4695
0
      vo0p1 += vi4x1 * vk41;
4696
0
      vo1p1 += vi6x1 * vk41;
4697
4698
0
      vi0x1 = vi0x3;
4699
0
      vi1x1 = vi1x3;
4700
0
      vi2x1 = vi2x3;
4701
0
      vi3x1 = vi3x3;
4702
0
      vi4x1 = vi4x3;
4703
0
      vi5x1 = vi5x3;
4704
0
      vi6x1 = vi6x3;
4705
4706
0
      vo0p0 += vi0x2 * vk02;
4707
0
      vo1p0 += vi2x2 * vk02;
4708
0
      vo0p1 += vi1x2 * vk12;
4709
0
      vo1p1 += vi3x2 * vk12;
4710
0
      vo0p0 += vi2x2 * vk22;
4711
0
      vo1p0 += vi4x2 * vk22;
4712
0
      vo0p1 += vi3x2 * vk32;
4713
0
      vo1p1 += vi5x2 * vk32;
4714
0
      vo0p0 += vi4x2 * vk42;
4715
0
      vo1p0 += vi6x2 * vk42;
4716
4717
0
      vi0x2 = vi0x4;
4718
0
      vi1x2 = vi1x4;
4719
0
      vi2x2 = vi2x4;
4720
0
      vi3x2 = vi3x4;
4721
0
      vi4x2 = vi4x4;
4722
0
      vi5x2 = vi5x4;
4723
0
      vi6x2 = vi6x4;
4724
4725
0
      vo0p1 += vi0x3 * vk03;
4726
0
      vo1p1 += vi2x3 * vk03;
4727
0
      vo0p0 += vi1x3 * vk13;
4728
0
      vo1p0 += vi3x3 * vk13;
4729
0
      vo0p1 += vi2x3 * vk23;
4730
0
      vo1p1 += vi4x3 * vk23;
4731
0
      vo0p0 += vi3x3 * vk33;
4732
0
      vo1p0 += vi5x3 * vk33;
4733
0
      vo0p1 += vi4x3 * vk43;
4734
0
      vo1p1 += vi6x3 * vk43;
4735
4736
0
      vo0p0 += vi0x4 * vk04;
4737
0
      vo1p0 += vi2x4 * vk04;
4738
0
      vo0p1 += vi1x4 * vk14;
4739
0
      vo1p1 += vi3x4 * vk14;
4740
0
      vo0p0 += vi2x4 * vk24;
4741
0
      vo1p0 += vi4x4 * vk24;
4742
0
      vo0p1 += vi3x4 * vk34;
4743
0
      vo1p1 += vi5x4 * vk34;
4744
0
      vo0p0 += vi4x4 * vk44;
4745
0
      vo1p0 += vi6x4 * vk44;
4746
4747
0
      vo0p0 += vo0p1;
4748
0
      vo1p0 += vo1p1;
4749
4750
0
      float vo0 = math_max_f32(vo0p0, vmin);
4751
0
      float vo1 = math_max_f32(vo1p0, vmin);
4752
4753
0
      vo0 = math_min_f32(vo0, vmax);
4754
0
      vo1 = math_min_f32(vo1, vmax);
4755
4756
0
      *o1++ = vo1;
4757
0
      *o0++ = vo0;
4758
0
    }
4759
0
    if XNN_LIKELY(w == 2 * sizeof(float)) {
4760
0
      const float vi0x3 = *i0++;
4761
0
      const float vi1x3 = *i1++;
4762
0
      const float vi2x3 = *i2++;
4763
0
      const float vi3x3 = *i3++;
4764
0
      const float vi4x3 = *i4++;
4765
0
      const float vi5x3 = *i5++;
4766
0
      const float vi6x3 = *i6++;
4767
4768
0
      float vo0p0 = vbias + vi0x0 * vk00;
4769
0
      float vo1p0 = vbias + vi2x0 * vk00;
4770
0
      float vo0p1 = vi1x0 * vk10;
4771
0
      float vo1p1 = vi3x0 * vk10;
4772
0
      vo0p0 += vi2x0 * vk20;
4773
0
      vo1p0 += vi4x0 * vk20;
4774
0
      vo0p1 += vi3x0 * vk30;
4775
0
      vo1p1 += vi5x0 * vk30;
4776
0
      vo0p0 += vi4x0 * vk40;
4777
0
      vo1p0 += vi6x0 * vk40;
4778
4779
0
      vo0p1 += vi0x1 * vk01;
4780
0
      vo1p1 += vi2x1 * vk01;
4781
0
      vo0p0 += vi1x1 * vk11;
4782
0
      vo1p0 += vi3x1 * vk11;
4783
0
      vo0p1 += vi2x1 * vk21;
4784
0
      vo1p1 += vi4x1 * vk21;
4785
0
      vo0p0 += vi3x1 * vk31;
4786
0
      vo1p0 += vi5x1 * vk31;
4787
0
      vo0p1 += vi4x1 * vk41;
4788
0
      vo1p1 += vi6x1 * vk41;
4789
4790
0
      vo0p0 += vi0x2 * vk02;
4791
0
      vo1p0 += vi2x2 * vk02;
4792
0
      vo0p1 += vi1x2 * vk12;
4793
0
      vo1p1 += vi3x2 * vk12;
4794
0
      vo0p0 += vi2x2 * vk22;
4795
0
      vo1p0 += vi4x2 * vk22;
4796
0
      vo0p1 += vi3x2 * vk32;
4797
0
      vo1p1 += vi5x2 * vk32;
4798
0
      vo0p0 += vi4x2 * vk42;
4799
0
      vo1p0 += vi6x2 * vk42;
4800
4801
0
      vo0p1 += vi0x3 * vk03;
4802
0
      vo1p1 += vi2x3 * vk03;
4803
0
      vo0p0 += vi1x3 * vk13;
4804
0
      vo1p0 += vi3x3 * vk13;
4805
0
      vo0p1 += vi2x3 * vk23;
4806
0
      vo1p1 += vi4x3 * vk23;
4807
0
      vo0p0 += vi3x3 * vk33;
4808
0
      vo1p0 += vi5x3 * vk33;
4809
0
      vo0p1 += vi4x3 * vk43;
4810
0
      vo1p1 += vi6x3 * vk43;
4811
4812
0
      vo0p0 += vo0p1;
4813
0
      vo1p0 += vo1p1;
4814
4815
0
      float vo0 = math_max_f32(vo0p0, vmin);
4816
0
      float vo1 = math_max_f32(vo1p0, vmin);
4817
4818
0
      vo0 = math_min_f32(vo0, vmax);
4819
0
      vo1 = math_min_f32(vo1, vmax);
4820
4821
0
      *o1++ = vo1;
4822
0
      *o0++ = vo0;
4823
0
    } else {
4824
0
      float vo0p0 = vbias + vi0x0 * vk00;
4825
0
      float vo1p0 = vbias + vi2x0 * vk00;
4826
0
      float vo0p1 = vi1x0 * vk10;
4827
0
      float vo1p1 = vi3x0 * vk10;
4828
0
      vo0p0 += vi2x0 * vk20;
4829
0
      vo1p0 += vi4x0 * vk20;
4830
0
      vo0p1 += vi3x0 * vk30;
4831
0
      vo1p1 += vi5x0 * vk30;
4832
0
      vo0p0 += vi4x0 * vk40;
4833
0
      vo1p0 += vi6x0 * vk40;
4834
4835
0
      vo0p1 += vi0x1 * vk01;
4836
0
      vo1p1 += vi2x1 * vk01;
4837
0
      vo0p0 += vi1x1 * vk11;
4838
0
      vo1p0 += vi3x1 * vk11;
4839
0
      vo0p1 += vi2x1 * vk21;
4840
0
      vo1p1 += vi4x1 * vk21;
4841
0
      vo0p0 += vi3x1 * vk31;
4842
0
      vo1p0 += vi5x1 * vk31;
4843
0
      vo0p1 += vi4x1 * vk41;
4844
0
      vo1p1 += vi6x1 * vk41;
4845
4846
0
      vo0p0 += vi0x2 * vk02;
4847
0
      vo1p0 += vi2x2 * vk02;
4848
0
      vo0p1 += vi1x2 * vk12;
4849
0
      vo1p1 += vi3x2 * vk12;
4850
0
      vo0p0 += vi2x2 * vk22;
4851
0
      vo1p0 += vi4x2 * vk22;
4852
0
      vo0p1 += vi3x2 * vk32;
4853
0
      vo1p1 += vi5x2 * vk32;
4854
0
      vo0p0 += vi4x2 * vk42;
4855
0
      vo1p0 += vi6x2 * vk42;
4856
4857
0
      vo0p0 += vo0p1;
4858
0
      vo1p0 += vo1p1;
4859
4860
0
      float vo0 = math_max_f32(vo0p0, vmin);
4861
0
      float vo1 = math_max_f32(vo1p0, vmin);
4862
4863
0
      vo0 = math_min_f32(vo0, vmax);
4864
0
      vo1 = math_min_f32(vo1, vmax);
4865
4866
0
      *o1++ = vo1;
4867
0
      *o0++ = vo0;
4868
0
    }
4869
4870
0
    i0 = (const float*) ((uintptr_t) i3);
4871
0
    i1 = (const float*) ((uintptr_t) i4);
4872
0
    i2 = (const float*) ((uintptr_t) i5);
4873
0
    i3 = (const float*) ((uintptr_t) i6);
4874
0
    i4 = (const float*) ((uintptr_t) i3 + input_width);
4875
0
    i5 = (const float*) ((uintptr_t) i4 + input_width);
4876
0
    i6 = (const float*) ((uintptr_t) i5 + input_width);
4877
4878
0
    o0 = o1;
4879
0
    o1 = (float*) ((uintptr_t) o0 + output_width);
4880
4881
0
    output_height = doz(output_height, 2);
4882
0
    padded_input_height = doz(padded_input_height, 4);
4883
0
  } while (output_height != 0);
4884
0
}
4885
4886
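Editor's note: the stride-2 2x1 kernels above compute output_width in bytes with round_down_po2 so that o1 starts exactly one output row after o0. The sketch below states the assumption that round_down_po2(n, q) rounds n down to a multiple of the power of two q, and shows what the expression works out to; both helpers are illustrative names, not quotes of xnnpack/math.h.

#include <stddef.h>

/*
 * Editor's sketch (not part of scalar.c): round_down_po2 as assumed above, and
 * the output_width expression used by the 2x1 stride-2 kernels. For an input
 * row of N floats it yields ceil(N / 2) output elements, expressed as bytes.
 */
static inline size_t round_down_po2_sketch(size_t n, size_t q) {
  return n & ~(q - 1);            /* q must be a power of two */
}

static size_t s2_output_width_bytes_sketch(size_t input_width /* bytes */) {
  return round_down_po2_sketch((input_width + 1 * sizeof(float)) / 2, sizeof(float));
}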
void xnn_f32_f16_vcvt_ukernel__scalar_bitcast_u4(
4887
    size_t batch,
4888
    const float* input,
4889
    void* output,
4890
    const union xnn_f32_f16_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
4891
0
{
4892
0
  assert(batch != 0);
4893
0
  assert(batch % sizeof(float) == 0);
4894
0
  assert(input != NULL);
4895
0
  assert(output != NULL);
4896
4897
0
  const uint32_t vnonsign_mask = params->scalar_bitcast.nonsign_mask;
4898
0
  const uint32_t vexp_bias = params->scalar_bitcast.exp_bias;
4899
0
  const float vscale_to_inf = params->scalar_bitcast.scale_to_inf;
4900
0
  const uint32_t vexpw_max = params->scalar_bitcast.expw_max;
4901
0
  const float vscale_to_zero = params->scalar_bitcast.scale_to_zero;
4902
0
  const uint32_t vbias_min = params->scalar_bitcast.bias_min;
4903
0
  const uint16_t vexph_mask = params->scalar_bitcast.exph_mask;
4904
0
  const uint16_t vmanth_mask = params->scalar_bitcast.manth_mask;
4905
0
  const uint16_t vnanh = params->scalar_bitcast.nanh;
4906
4907
0
  const uint32_t* i = (const uint32_t*) input;
4908
0
  uint16_t* o = (uint16_t*) output;
4909
0
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
4910
0
    const uint32_t vw0 = i[0];
4911
0
    const uint32_t vw1 = i[1];
4912
0
    const uint32_t vw2 = i[2];
4913
0
    const uint32_t vw3 = i[3];
4914
0
    i += 4;
4915
4916
0
    const uint32_t vnonsignw0 = vw0 & vnonsign_mask;
4917
0
    const uint32_t vnonsignw1 = vw1 & vnonsign_mask;
4918
0
    const uint32_t vnonsignw2 = vw2 & vnonsign_mask;
4919
0
    const uint32_t vnonsignw3 = vw3 & vnonsign_mask;
4920
4921
0
    float vf0 = uint32_as_float(vnonsignw0);
4922
0
    float vf1 = uint32_as_float(vnonsignw1);
4923
0
    float vf2 = uint32_as_float(vnonsignw2);
4924
0
    float vf3 = uint32_as_float(vnonsignw3);
4925
0
    const uint32_t vsignw0 = vw0 ^ vnonsignw0;
4926
0
    const uint32_t vsignw1 = vw1 ^ vnonsignw1;
4927
0
    const uint32_t vsignw2 = vw2 ^ vnonsignw2;
4928
0
    const uint32_t vsignw3 = vw3 ^ vnonsignw3;
4929
0
    uint32_t vbias0 = vnonsignw0 + vexp_bias;
4930
0
    uint32_t vbias1 = vnonsignw1 + vexp_bias;
4931
0
    uint32_t vbias2 = vnonsignw2 + vexp_bias;
4932
0
    uint32_t vbias3 = vnonsignw3 + vexp_bias;
4933
4934
0
    vf0 *= vscale_to_inf;
4935
0
    vf1 *= vscale_to_inf;
4936
0
    vf2 *= vscale_to_inf;
4937
0
    vf3 *= vscale_to_inf;
4938
0
    vbias0 &= vexpw_max;
4939
0
    vbias1 &= vexpw_max;
4940
0
    vbias2 &= vexpw_max;
4941
0
    vbias3 &= vexpw_max;
4942
4943
0
    vf0 *= vscale_to_zero;
4944
0
    vf1 *= vscale_to_zero;
4945
0
    vf2 *= vscale_to_zero;
4946
0
    vf3 *= vscale_to_zero;
4947
0
    vbias0 = math_max_u32(vbias0, vbias_min);
4948
0
    vbias1 = math_max_u32(vbias1, vbias_min);
4949
0
    vbias2 = math_max_u32(vbias2, vbias_min);
4950
0
    vbias3 = math_max_u32(vbias3, vbias_min);
4951
4952
0
    vf0 += uint32_as_float(vbias0);
4953
0
    vf1 += uint32_as_float(vbias1);
4954
0
    vf2 += uint32_as_float(vbias2);
4955
0
    vf3 += uint32_as_float(vbias3);
4956
4957
0
    const uint32_t vbits0 = float_as_uint32(vf0);
4958
0
    const uint32_t vbits1 = float_as_uint32(vf1);
4959
0
    const uint32_t vbits2 = float_as_uint32(vf2);
4960
0
    const uint32_t vbits3 = float_as_uint32(vf3);
4961
4962
0
    const uint16_t vexph0 = (uint16_t) (vbits0 >> 13) & vexph_mask;
4963
0
    const uint16_t vexph1 = (uint16_t) (vbits1 >> 13) & vexph_mask;
4964
0
    const uint16_t vexph2 = (uint16_t) (vbits2 >> 13) & vexph_mask;
4965
0
    const uint16_t vexph3 = (uint16_t) (vbits3 >> 13) & vexph_mask;
4966
0
    const uint16_t vmanth0 = (uint16_t) vbits0 & vmanth_mask;
4967
0
    const uint16_t vmanth1 = (uint16_t) vbits1 & vmanth_mask;
4968
0
    const uint16_t vmanth2 = (uint16_t) vbits2 & vmanth_mask;
4969
0
    const uint16_t vmanth3 = (uint16_t) vbits3 & vmanth_mask;
4970
0
    const uint16_t vsignh0 = (uint16_t) (vsignw0 >> 16);
4971
0
    const uint16_t vsignh1 = (uint16_t) (vsignw1 >> 16);
4972
0
    const uint16_t vsignh2 = (uint16_t) (vsignw2 >> 16);
4973
0
    const uint16_t vsignh3 = (uint16_t) (vsignw3 >> 16);
4974
4975
0
    uint16_t vh0 = vexph0 + vmanth0;
4976
0
    uint16_t vh1 = vexph1 + vmanth1;
4977
0
    uint16_t vh2 = vexph2 + vmanth2;
4978
0
    uint16_t vh3 = vexph3 + vmanth3;
4979
0
    if XNN_UNPREDICTABLE(vnonsignw0 > vexpw_max) {
4980
0
      vh0 = vnanh;
4981
0
    }
4982
0
    if XNN_UNPREDICTABLE(vnonsignw1 > vexpw_max) {
4983
0
      vh1 = vnanh;
4984
0
    }
4985
0
    if XNN_UNPREDICTABLE(vnonsignw2 > vexpw_max) {
4986
0
      vh2 = vnanh;
4987
0
    }
4988
0
    if XNN_UNPREDICTABLE(vnonsignw3 > vexpw_max) {
4989
0
      vh3 = vnanh;
4990
0
    }
4991
0
    vh0 |= vsignh0;
4992
0
    vh1 |= vsignh1;
4993
0
    vh2 |= vsignh2;
4994
0
    vh3 |= vsignh3;
4995
4996
0
    o[0] = vh0;
4997
0
    o[1] = vh1;
4998
0
    o[2] = vh2;
4999
0
    o[3] = vh3;
5000
0
    o += 4;
5001
0
  }
5002
0
  if XNN_UNLIKELY(batch != 0) {
5003
0
    do {
5004
0
      const uint32_t vw = *i++;
5005
5006
0
      const uint32_t vnonsignw = vw & vnonsign_mask;
5007
5008
0
      float vf = uint32_as_float(vnonsignw);
5009
0
      const uint32_t vsignw = vw ^ vnonsignw;
5010
0
      uint32_t vbias = vnonsignw + vexp_bias;
5011
5012
0
      vf *= vscale_to_inf;
5013
0
      vbias &= vexpw_max;
5014
5015
0
      vf *= vscale_to_zero;
5016
0
      vbias = math_max_u32(vbias, vbias_min);
5017
5018
0
      vf += uint32_as_float(vbias);
5019
5020
0
      const uint32_t vbits = float_as_uint32(vf);
5021
5022
0
      const uint16_t vexph = (uint16_t) (vbits >> 13) & vexph_mask;
5023
0
      const uint16_t vmanth = (uint16_t) vbits & vmanth_mask;
5024
0
      const uint16_t vsignh = (uint16_t) (vsignw >> 16);
5025
5026
0
      uint16_t vh = vexph + vmanth;
5027
0
      if XNN_UNPREDICTABLE(vnonsignw > vexpw_max) {
5028
0
        vh = vnanh;
5029
0
      }
5030
0
      vh |= vsignh;
5031
5032
0
      *o++ = vh;
5033
5034
0
      batch -= sizeof(float);
5035
0
    } while (batch != 0);
5036
0
  }
5037
0
}
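The f32->f16 kernels in this stretch round by scaling the absolute value so that the hardware's round-to-nearest-even happens exactly at the fp16 mantissa boundary, re-biasing the exponent with an integer add, and then reassembling sign, exponent and mantissa fields; magnitudes outside the half range fall into the vnanh path. Below is a minimal standalone sketch of the same trick, assuming the usual constants for this derivation rather than the values loaded from params, and relying only on headers already included at the top of this file.

static inline uint32_t as_bits(float f) {
  uint32_t w;
  memcpy(&w, &f, sizeof w);
  return w;
}

static inline float as_float(uint32_t w) {
  float f;
  memcpy(&f, &w, sizeof f);
  return f;
}

// Round-to-nearest-even float -> IEEE half via the same scale-and-rebias trick.
static inline uint16_t f32_to_f16_sketch(float x) {
  const float scale_to_inf  = 0x1.0p+112f;  // pushes the rounding point up to the fp16 mantissa (assumed constant)
  const float scale_to_zero = 0x1.0p-110f;  // brings the magnitude back down (assumed constant)
  float base = (fabsf(x) * scale_to_inf) * scale_to_zero;

  const uint32_t w = as_bits(x);
  const uint32_t shl1_w = w + w;                    // drops the sign bit
  const uint32_t sign = w & UINT32_C(0x80000000);
  uint32_t bias = shl1_w & UINT32_C(0xFF000000);    // exponent field, shifted left by one
  if (bias < UINT32_C(0x71000000)) {
    bias = UINT32_C(0x71000000);                    // clamp so subnormal halves still round correctly
  }

  base = as_float((bias >> 1) + UINT32_C(0x07800000)) + base;
  const uint32_t bits = as_bits(base);
  const uint32_t exph = (bits >> 13) & UINT32_C(0x00007C00);
  const uint32_t manth = bits & UINT32_C(0x00000FFF);
  const uint32_t nonsign = exph + manth;
  // NaN inputs (shl1_w above the all-ones exponent field) collapse to a canonical half NaN.
  return (uint16_t) ((sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT32_C(0x7E00) : nonsign));
}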
5038
5039
void xnn_f32_f16_vcvt_ukernel__scalar_fabsf_u2(
5040
    size_t batch,
5041
    const float* input,
5042
    void* output,
5043
    const union xnn_f32_f16_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
5044
0
{
5045
0
  assert(batch != 0);
5046
0
  assert(batch % sizeof(float) == 0);
5047
0
  assert(input != NULL);
5048
0
  assert(output != NULL);
5049
5050
0
  const float vscale_to_inf = params->scalar_fabsf.scale_to_inf;
5051
0
  const uint32_t vexp_bias = params->scalar_fabsf.exp_bias;
5052
0
  const float vscale_to_zero = params->scalar_fabsf.scale_to_zero;
5053
0
  const uint32_t vexpw_max = params->scalar_fabsf.expw_max;
5054
0
  const uint32_t vbias_min = params->scalar_fabsf.bias_min;
5055
0
  const uint16_t vexph_mask = params->scalar_fabsf.exph_mask;
5056
0
  const uint16_t vmanth_mask = params->scalar_fabsf.manth_mask;
5057
0
  const uint16_t vnanh = params->scalar_fabsf.nanh;
5058
5059
0
  uint16_t* o = (uint16_t*) output;
5060
0
  for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) {
5061
0
    const float vx0 = input[0];
5062
0
    const float vx1 = input[1];
5063
0
    input += 2;
5064
5065
0
    const float vabsx0 = fabsf(vx0);
5066
0
    const float vabsx1 = fabsf(vx1);
5067
0
    uint32_t vsignw0 = float_as_uint32(vx0);
5068
0
    uint32_t vsignw1 = float_as_uint32(vx1);
5069
5070
0
    const uint32_t vnonsignw0 = float_as_uint32(vabsx0);
5071
0
    const uint32_t vnonsignw1 = float_as_uint32(vabsx1);
5072
0
    float vf0 = vabsx0 * vscale_to_inf;
5073
0
    float vf1 = vabsx1 * vscale_to_inf;
5074
5075
0
    uint32_t vbias0 = vnonsignw0 + vexp_bias;
5076
0
    uint32_t vbias1 = vnonsignw1 + vexp_bias;
5077
0
    vsignw0 ^= vnonsignw0;
5078
0
    vsignw1 ^= vnonsignw1;
5079
5080
0
    vf0 *= vscale_to_zero;
5081
0
    vf1 *= vscale_to_zero;
5082
0
    vbias0 &= vexpw_max;
5083
0
    vbias1 &= vexpw_max;
5084
5085
0
    vbias0 = math_max_u32(vbias0, vbias_min);
5086
0
    vbias1 = math_max_u32(vbias1, vbias_min);
5087
5088
0
    vf0 += uint32_as_float(vbias0);
5089
0
    vf1 += uint32_as_float(vbias1);
5090
5091
0
    const uint32_t vbits0 = float_as_uint32(vf0);
5092
0
    const uint32_t vbits1 = float_as_uint32(vf1);
5093
5094
0
    const uint16_t vexph0 = (uint16_t) (vbits0 >> 13) & vexph_mask;
5095
0
    const uint16_t vexph1 = (uint16_t) (vbits1 >> 13) & vexph_mask;
5096
0
    const uint16_t vmanth0 = (uint16_t) vbits0 & vmanth_mask;
5097
0
    const uint16_t vmanth1 = (uint16_t) vbits1 & vmanth_mask;
5098
0
    const uint16_t vsignh0 = (uint16_t) (vsignw0 >> 16);
5099
0
    const uint16_t vsignh1 = (uint16_t) (vsignw1 >> 16);
5100
5101
0
    uint16_t vh0 = vexph0 + vmanth0;
5102
0
    uint16_t vh1 = vexph1 + vmanth1;
5103
0
    if XNN_UNPREDICTABLE(vnonsignw0 > vexpw_max) {
5104
0
      vh0 = vnanh;
5105
0
    }
5106
0
    if XNN_UNPREDICTABLE(vnonsignw1 > vexpw_max) {
5107
0
      vh1 = vnanh;
5108
0
    }
5109
0
    vh0 |= vsignh0;
5110
0
    vh1 |= vsignh1;
5111
5112
0
    o[0] = vh0;
5113
0
    o[1] = vh1;
5114
0
    o += 2;
5115
0
  }
5116
0
  if XNN_UNLIKELY(batch != 0) {
5117
0
    const float vx = *input;
5118
5119
0
    const float vabsx = fabsf(vx);
5120
0
    uint32_t vsignw = float_as_uint32(vx);
5121
5122
0
    const uint32_t vnonsignw = float_as_uint32(vabsx);
5123
0
    float vf = vabsx * vscale_to_inf;
5124
5125
0
    uint32_t vbias = vnonsignw + vexp_bias;
5126
0
    vsignw ^= vnonsignw;
5127
5128
0
    vf *= vscale_to_zero;
5129
0
    vbias &= vexpw_max;
5130
5131
0
    vbias = math_max_u32(vbias, vbias_min);
5132
5133
0
    vf += uint32_as_float(vbias);
5134
5135
0
    const uint32_t vbits = float_as_uint32(vf);
5136
5137
0
    const uint16_t vexph = (uint16_t) (vbits >> 13) & vexph_mask;
5138
0
    const uint16_t vmanth = (uint16_t) vbits & vmanth_mask;
5139
0
    const uint16_t vsignh = (uint16_t) (vsignw >> 16);
5140
5141
0
    uint16_t vh = vexph + vmanth;
5142
0
    if XNN_UNPREDICTABLE(vnonsignw > vexpw_max) {
5143
0
      vh = vnanh;
5144
0
    }
5145
0
    vh |= vsignh;
5146
5147
0
    *o = vh;
5148
0
  }
5149
0
}
5150
5151
void xnn_f32_gavgpool_cw_ukernel__scalar_x1(
5152
    size_t elements,
5153
    size_t channels,
5154
    const float* input,
5155
    float* output,
5156
    const union xnn_f32_gavgpool_params params[restrict XNN_MIN_ELEMENTS(1)])
5157
0
{
5158
0
  assert(elements != 0);
5159
0
  assert(elements % sizeof(float) == 0);
5160
0
  assert(channels != 0);
5161
5162
0
  const float* i0 = input;
5163
5164
0
  const float vmultiplier = params->scalar.multiplier;
5165
0
  const float voutput_max = params->scalar.output_max;
5166
0
  const float voutput_min = params->scalar.output_min;
5167
5168
0
  while (channels != 0) {
5169
0
    float vsum0 = 0.f;
5170
0
    float vsum1 = 0.f;
5171
0
    float vsum2 = 0.f;
5172
0
    float vsum3 = 0.f;
5173
0
    size_t n = elements;
5174
0
    while (n >= 4 * sizeof(float)) {
5175
0
      vsum0 += i0[0];
5176
0
      vsum1 += i0[1];
5177
0
      vsum2 += i0[2];
5178
0
      vsum3 += i0[3];
5179
5180
0
      i0 += 4;
5181
0
      n -= 4 * sizeof(float);
5182
0
    }
5183
5184
0
    while (n != 0) {
5185
0
      vsum0 += *i0++;
5186
0
      n -= sizeof(float);
5187
0
    }
5188
5189
0
    float vout = ((vsum0 + vsum1) + (vsum2 + vsum3)) * vmultiplier;
5190
5191
0
    vout = math_min_f32(vout, voutput_max);
5192
0
    vout = math_max_f32(vout, voutput_min);
5193
5194
0
    *output++ = vout;
5195
0
    channels -= 1;
5196
0
  }
5197
0
}
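xnn_f32_gavgpool_cw_ukernel__scalar_x1 averages each channel with four partial sums, scales the total by the precomputed multiplier, and clamps the result. A plain reference loop that computes the same thing is sketched below; here elements counts floats rather than bytes, and the multiplier is assumed to be 1/elements, which is how the caller normally sets it up.

// Reference: channel-wise (CW layout) global average pooling with clamping.
static void gavgpool_cw_ref(size_t elements, size_t channels,
                            const float* input, float* output,
                            float output_min, float output_max) {
  const float multiplier = 1.0f / (float) elements;  // assumed; the kernel receives it precomputed
  for (size_t c = 0; c < channels; c++) {
    float sum = 0.0f;
    for (size_t e = 0; e < elements; e++) {
      sum += input[c * elements + e];
    }
    float out = sum * multiplier;
    out = out > output_max ? output_max : out;
    out = out < output_min ? output_min : out;
    output[c] = out;
  }
}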
5198
5199
void xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1(
5200
    size_t rows,
5201
    size_t channels,
5202
    const float* input,
5203
    size_t input_stride,
5204
    const float* zero,
5205
    float* buffer,
5206
    float* output,
5207
    const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
5208
0
{
5209
0
  assert(rows > 7);
5210
0
  assert(channels != 0);
5211
5212
0
  const float* i0 = input;
5213
0
  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
5214
0
  const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
5215
0
  const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
5216
0
  const float* i4 = (const float*) ((uintptr_t) i3 + input_stride);
5217
0
  const float* i5 = (const float*) ((uintptr_t) i4 + input_stride);
5218
0
  const float* i6 = (const float*) ((uintptr_t) i5 + input_stride);
5219
0
  const size_t input_increment = 7 * input_stride - channels * sizeof(float);
5220
5221
0
  float* b = buffer;
5222
0
  size_t c = channels;
5223
0
  do {
5224
0
    const float vi0 = *i0++;
5225
0
    const float vi1 = *i1++;
5226
0
    const float vi2 = *i2++;
5227
0
    const float vi3 = *i3++;
5228
0
    const float vi4 = *i4++;
5229
0
    const float vi5 = *i5++;
5230
0
    const float vi6 = *i6++;
5231
5232
0
    const float vsum01 = vi0 + vi1;
5233
0
    const float vsum23 = vi2 + vi3;
5234
0
    const float vsum45 = vi4 + vi5;
5235
5236
0
    const float vsum016 = vsum01 + vi6;
5237
0
    const float vsum2345 = vsum23 + vsum45;
5238
5239
0
    const float vsum = vsum016 + vsum2345;
5240
5241
0
    *b++ = vsum;
5242
0
  } while (--c != 0);
5243
0
  for (rows -= 7; rows > 7; rows -= 7) {
5244
0
    b = buffer;
5245
5246
0
    i0 = (const float*) ((uintptr_t) i0 + input_increment);
5247
0
    i1 = (const float*) ((uintptr_t) i1 + input_increment);
5248
0
    i2 = (const float*) ((uintptr_t) i2 + input_increment);
5249
0
    i3 = (const float*) ((uintptr_t) i3 + input_increment);
5250
0
    i4 = (const float*) ((uintptr_t) i4 + input_increment);
5251
0
    i5 = (const float*) ((uintptr_t) i5 + input_increment);
5252
0
    i6 = (const float*) ((uintptr_t) i6 + input_increment);
5253
5254
0
    size_t c = channels;
5255
0
    do {
5256
0
      const float vi0 = *i0++;
5257
0
      const float vi1 = *i1++;
5258
0
      const float vi2 = *i2++;
5259
0
      const float vi3 = *i3++;
5260
0
      const float vi4 = *i4++;
5261
0
      const float vi5 = *i5++;
5262
0
      const float vi6 = *i6++;
5263
0
      const float vacc = *b;
5264
5265
0
      const float vsum01 = vi0 + vi1;
5266
0
      const float vsum23 = vi2 + vi3;
5267
0
      const float vsum45 = vi4 + vi5;
5268
0
      const float vsum6a = vi6 + vacc;
5269
5270
0
      const float vsum0123 = vsum01 + vsum23;
5271
0
      const float vsum456a = vsum45 + vsum6a;
5272
5273
0
      const float vsum = vsum0123 + vsum456a;
5274
5275
0
      *b++ = vsum;
5276
0
    } while (--c != 0);
5277
0
  }
5278
5279
0
  i0 = (const float*) ((uintptr_t) i0 + input_increment);
5280
0
  i1 = (const float*) ((uintptr_t) i1 + input_increment);
5281
0
  if (rows < 2) {
5282
0
    i1 = zero;
5283
0
  }
5284
0
  i2 = (const float*) ((uintptr_t) i2 + input_increment);
5285
0
  if (rows <= 2) {
5286
0
    i2 = zero;
5287
0
  }
5288
0
  i3 = (const float*) ((uintptr_t) i3 + input_increment);
5289
0
  if (rows < 4) {
5290
0
    i3 = zero;
5291
0
  }
5292
0
  i4 = (const float*) ((uintptr_t) i4 + input_increment);
5293
0
  if (rows <= 4) {
5294
0
    i4 = zero;
5295
0
  }
5296
0
  i5 = (const float*) ((uintptr_t) i5 + input_increment);
5297
0
  if (rows < 6) {
5298
0
    i5 = zero;
5299
0
  }
5300
0
  i6 = (const float*) ((uintptr_t) i6 + input_increment);
5301
0
  if (rows <= 6) {
5302
0
    i6 = zero;
5303
0
  }
5304
0
  const float vscale = params->scalar.scale;
5305
0
  const float vmin = params->scalar.min;
5306
0
  const float vmax = params->scalar.max;
5307
5308
0
  b = buffer;
5309
0
  do {
5310
0
    const float vi0 = *i0++;
5311
0
    const float vi1 = *i1++;
5312
0
    const float vi2 = *i2++;
5313
0
    const float vi3 = *i3++;
5314
0
    const float vi4 = *i4++;
5315
0
    const float vi5 = *i5++;
5316
0
    const float vi6 = *i6++;
5317
0
    const float vacc = *b++;
5318
5319
0
    const float vsum01 = vi0 + vi1;
5320
0
    const float vsum23 = vi2 + vi3;
5321
0
    const float vsum45 = vi4 + vi5;
5322
0
    const float vsum6a = vi6 + vacc;
5323
5324
0
    const float vsum0123 = vsum01 + vsum23;
5325
0
    const float vsum456a = vsum45 + vsum6a;
5326
5327
0
    const float vsum = vsum0123 + vsum456a;
5328
5329
0
    float vout = vsum * vscale;
5330
0
    vout = math_max_f32(vout, vmin);
5331
0
    vout = math_min_f32(vout, vmax);
5332
5333
0
    *output++ = vout;
5334
0
  } while (--channels != 0);
5335
0
}
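The 7p7x multipass kernel handles rows > 7: the first pass seeds buffer with the per-channel sum of rows 0..6, each middle pass folds seven more rows into the buffer, and the final pass adds the remaining 1..7 rows (substituting the shared zero row for the missing ones) before scaling and clamping. The sketch below shows the same row-chunking idea over a dense rows x channels array; the names and the short-final-chunk handling are chosen for illustration, not taken from this file.

// Reference: average `rows` values per channel in chunks of up to 7 rows,
// mirroring the buffer-carried partial sums of the 7p7x kernel.
static void gavgpool_multipass_ref(size_t rows, size_t channels,
                                   const float* input,  /* rows x channels, row-major */
                                   float* buffer,       /* scratch, `channels` floats */
                                   float* output,
                                   float scale, float out_min, float out_max) {
  for (size_t c = 0; c < channels; c++) buffer[c] = 0.0f;
  for (size_t r = 0; r < rows; r += 7) {
    const size_t chunk = rows - r < 7 ? rows - r : 7;  // short final chunk instead of a zero row
    for (size_t c = 0; c < channels; c++) {
      float acc = buffer[c];
      for (size_t j = 0; j < chunk; j++) {
        acc += input[(r + j) * channels + c];
      }
      buffer[c] = acc;
    }
  }
  for (size_t c = 0; c < channels; c++) {
    float out = buffer[c] * scale;                     // scale is normally 1/rows
    out = out < out_min ? out_min : out;
    out = out > out_max ? out_max : out;
    output[c] = out;
  }
}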
5336
5337
void xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1(
5338
    size_t rows,
5339
    size_t channels,
5340
    const float* input,
5341
    size_t input_stride,
5342
    const float* zero,
5343
    float* output,
5344
    const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
5345
0
{
5346
0
  assert(rows != 0);
5347
0
  assert(rows <= 7);
5348
0
  assert(channels != 0);
5349
5350
0
  const float* i0 = input;
5351
0
  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
5352
0
  if (rows < 2) {
5353
0
    i1 = zero;
5354
0
  }
5355
0
  const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
5356
0
  if (rows <= 2) {
5357
0
    i2 = zero;
5358
0
  }
5359
0
  const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
5360
0
  if (rows < 4) {
5361
0
    i3 = zero;
5362
0
  }
5363
0
  const float* i4 = (const float*) ((uintptr_t) i3 + input_stride);
5364
0
  if (rows <= 4) {
5365
0
    i4 = zero;
5366
0
  }
5367
0
  const float* i5 = (const float*) ((uintptr_t) i4 + input_stride);
5368
0
  if (rows < 6) {
5369
0
    i5 = zero;
5370
0
  }
5371
0
  const float* i6 = (const float*) ((uintptr_t) i5 + input_stride);
5372
0
  if (rows <= 6) {
5373
0
    i6 = zero;
5374
0
  }
5375
5376
0
  const float vscale = params->scalar.scale;
5377
0
  const float vmin = params->scalar.min;
5378
0
  const float vmax = params->scalar.max;
5379
0
  do {
5380
0
    const float vi0 = *i0++;
5381
0
    const float vi1 = *i1++;
5382
0
    const float vi2 = *i2++;
5383
0
    const float vi3 = *i3++;
5384
0
    const float vi4 = *i4++;
5385
0
    const float vi5 = *i5++;
5386
0
    const float vi6 = *i6++;
5387
5388
0
    const float vsum01 = vi0 + vi1;
5389
0
    const float vsum23 = vi2 + vi3;
5390
0
    const float vsum45 = vi4 + vi5;
5391
5392
0
    const float vsum016 = vsum01 + vi6;
5393
0
    const float vsum2345 = vsum23 + vsum45;
5394
5395
0
    const float vsum = vsum016 + vsum2345;
5396
5397
0
    float vout = vsum * vscale;
5398
0
    vout = math_max_f32(vout, vmin);
5399
0
    vout = math_min_f32(vout, vmax);
5400
5401
0
    *output++ = vout;
5402
0
  } while (--channels != 0);
5403
0
}
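The single-pass 7x variant copes with 1..7 rows by redirecting the pointers of the missing rows to the shared zero buffer, so the fixed seven-way sum in the loop body stays valid and the padded terms contribute nothing. The same pointer-clamping idea in isolation, with the stride expressed in floats for brevity:

// Point the pointers of missing rows at a shared zero buffer so a fixed
// 7-way sum stays valid for any 1 <= rows <= 7.
static void setup_row_pointers(const float* input, size_t input_stride /* in floats */,
                               size_t rows, const float* zero, const float* i[7]) {
  for (size_t r = 0; r < 7; r++) {
    i[r] = (r < rows) ? input + r * input_stride : zero;
  }
}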
5404
5405
void xnn_f32_gemm_minmax_ukernel_1x4__scalar(
5406
    size_t mr,
5407
    size_t nc,
5408
    size_t kc,
5409
    const float* restrict a,
5410
    size_t a_stride,
5411
    const float* restrict w,
5412
    float* restrict c,
5413
    size_t cm_stride,
5414
    size_t cn_stride,
5415
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
5416
0
{
5417
0
  assert(mr != 0);
5418
0
  assert(mr <= 1);
5419
0
  assert(nc != 0);
5420
0
  assert(kc != 0);
5421
0
  assert(kc % sizeof(float) == 0);
5422
0
  assert(a != NULL);
5423
0
  assert(w != NULL);
5424
0
  assert(c != NULL);
5425
5426
0
  const float* a0 = a;
5427
0
  float* c0 = c;
5428
5429
0
  const float vmin = params->scalar.min;
5430
0
  const float vmax = params->scalar.max;
5431
0
  do {
5432
0
    float vacc00 = w[0];
5433
0
    float vacc01 = w[1];
5434
0
    float vacc02 = w[2];
5435
0
    float vacc03 = w[3];
5436
0
    w += 4;
5437
5438
0
    size_t k = kc;
5439
0
    do {
5440
0
      const float va0 = *a0++;
5441
5442
0
      const float vb0 = w[0];
5443
0
      const float vb1 = w[1];
5444
0
      const float vb2 = w[2];
5445
0
      const float vb3 = w[3];
5446
0
      w += 4;
5447
5448
0
      vacc00 = math_muladd_f32(va0, vb0, vacc00);
5449
0
      vacc01 = math_muladd_f32(va0, vb1, vacc01);
5450
0
      vacc02 = math_muladd_f32(va0, vb2, vacc02);
5451
0
      vacc03 = math_muladd_f32(va0, vb3, vacc03);
5452
5453
0
      k -= sizeof(float);
5454
0
    } while (k != 0);
5455
5456
0
    vacc00 = math_max_f32(vacc00, vmin);
5457
0
    vacc01 = math_max_f32(vacc01, vmin);
5458
0
    vacc02 = math_max_f32(vacc02, vmin);
5459
0
    vacc03 = math_max_f32(vacc03, vmin);
5460
5461
0
    vacc00 = math_min_f32(vacc00, vmax);
5462
0
    vacc01 = math_min_f32(vacc01, vmax);
5463
0
    vacc02 = math_min_f32(vacc02, vmax);
5464
0
    vacc03 = math_min_f32(vacc03, vmax);
5465
5466
0
    if XNN_LIKELY(nc >= 4) {
5467
0
      c0[0] = vacc00;
5468
0
      c0[1] = vacc01;
5469
0
      c0[2] = vacc02;
5470
0
      c0[3] = vacc03;
5471
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
5472
5473
0
      a0 = (const void*) ((uintptr_t) a0 - kc);
5474
5475
0
      nc -= 4;
5476
0
    } else {
5477
0
      if (nc & 2) {
5478
0
        c0[0] = vacc00;
5479
0
        c0[1] = vacc01;
5480
0
        vacc00 = vacc02;
5481
0
        c0 += 2;
5482
0
      }
5483
0
      if (nc & 1) {
5484
0
        c0[0] = vacc00;
5485
0
      }
5486
5487
0
      nc = 0;
5488
0
    }
5489
0
  } while (nc != 0);
5490
0
}
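In the 1x4 minmax microkernel, w is the packed weight stream: four biases first, then one 4-wide row of B for every k; the kernel produces four clamped outputs per column tile and rewinds a0 by kc bytes so the same activation row feeds the next tile. The reference below spells out one such tile over that packed layout; kc counts floats here, and the clamp bounds stand in for the values the kernel reads from params.

// Reference for one n-tile of the 1x4 minmax microkernel:
// c[0..3] = clamp(bias[0..3] + sum_k a[k] * b[k][0..3], vmin, vmax)
// with w packed as { bias[4], b[0][4], b[1][4], ..., b[kc-1][4] }.
static void gemm_1x4_ref(size_t kc, const float* a, const float* w,
                         float* c, float vmin, float vmax) {
  float acc[4] = { w[0], w[1], w[2], w[3] };  // packed bias comes first
  w += 4;
  for (size_t k = 0; k < kc; k++) {
    for (size_t n = 0; n < 4; n++) {
      acc[n] += a[k] * w[n];                  // one packed 4-wide row of B per k
    }
    w += 4;
  }
  for (size_t n = 0; n < 4; n++) {
    float v = acc[n] < vmin ? vmin : acc[n];
    c[n] = v > vmax ? vmax : v;
  }
}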
5491
5492
void xnn_f32_gemm_relu_ukernel_1x4__scalar(
5493
    size_t mr,
5494
    size_t nc,
5495
    size_t kc,
5496
    const float* restrict a,
5497
    size_t a_stride,
5498
    const float* restrict w,
5499
    float* restrict c,
5500
    size_t cm_stride,
5501
    size_t cn_stride,
5502
    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
5503
0
{
5504
0
  assert(mr != 0);
5505
0
  assert(mr <= 1);
5506
0
  assert(nc != 0);
5507
0
  assert(kc != 0);
5508
0
  assert(kc % sizeof(float) == 0);
5509
0
  assert(a != NULL);
5510
0
  assert(w != NULL);
5511
0
  assert(c != NULL);
5512
5513
0
  const float* a0 = a;
5514
0
  float* c0 = c;
5515
5516
0
  do {
5517
0
    float vacc00 = w[0];
5518
0
    float vacc01 = w[1];
5519
0
    float vacc02 = w[2];
5520
0
    float vacc03 = w[3];
5521
0
    w += 4;
5522
5523
0
    size_t k = kc;
5524
0
    do {
5525
0
      const float va0 = *a0++;
5526
5527
0
      const float vb0 = w[0];
5528
0
      const float vb1 = w[1];
5529
0
      const float vb2 = w[2];
5530
0
      const float vb3 = w[3];
5531
0
      w += 4;
5532
5533
0
      vacc00 = math_muladd_f32(va0, vb0, vacc00);
5534
0
      vacc01 = math_muladd_f32(va0, vb1, vacc01);
5535
0
      vacc02 = math_muladd_f32(va0, vb2, vacc02);
5536
0
      vacc03 = math_muladd_f32(va0, vb3, vacc03);
5537
5538
0
      k -= sizeof(float);
5539
0
    } while (k != 0);
5540
5541
0
    vacc00 = math_max_f32(vacc00, 0.0f);
5542
0
    vacc01 = math_max_f32(vacc01, 0.0f);
5543
0
    vacc02 = math_max_f32(vacc02, 0.0f);
5544
0
    vacc03 = math_max_f32(vacc03, 0.0f);
5545
5546
0
    if XNN_LIKELY(nc >= 4) {
5547
0
      c0[0] = vacc00;
5548
0
      c0[1] = vacc01;
5549
0
      c0[2] = vacc02;
5550
0
      c0[3] = vacc03;
5551
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
5552
5553
0
      a0 = (const void*) ((uintptr_t) a0 - kc);
5554
5555
0
      nc -= 4;
5556
0
    } else {
5557
0
      if (nc & 2) {
5558
0
        c0[0] = vacc00;
5559
0
        c0[1] = vacc01;
5560
0
        vacc00 = vacc02;
5561
0
        c0 += 2;
5562
0
      }
5563
0
      if (nc & 1) {
5564
0
        c0[0] = vacc00;
5565
0
      }
5566
5567
0
      nc = 0;
5568
0
    }
5569
0
  } while (nc != 0);
5570
0
}
5571
5572
void xnn_f32_gemm_ukernel_1x4__scalar(
5573
    size_t mr,
5574
    size_t nc,
5575
    size_t kc,
5576
    const float* restrict a,
5577
    size_t a_stride,
5578
    const float* restrict w,
5579
    float* restrict c,
5580
    size_t cm_stride,
5581
    size_t cn_stride,
5582
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
5583
0
{
5584
0
  assert(mr != 0);
5585
0
  assert(mr <= 1);
5586
0
  assert(nc != 0);
5587
0
  assert(kc != 0);
5588
0
  assert(kc % sizeof(float) == 0);
5589
0
  assert(a != NULL);
5590
0
  assert(w != NULL);
5591
0
  assert(c != NULL);
5592
5593
0
  const float* a0 = a;
5594
0
  float* c0 = c;
5595
5596
0
  do {
5597
0
    float vacc00 = w[0];
5598
0
    float vacc01 = w[1];
5599
0
    float vacc02 = w[2];
5600
0
    float vacc03 = w[3];
5601
0
    w += 4;
5602
5603
0
    size_t k = kc;
5604
0
    do {
5605
0
      const float va0 = *a0++;
5606
5607
0
      const float vb0 = w[0];
5608
0
      const float vb1 = w[1];
5609
0
      const float vb2 = w[2];
5610
0
      const float vb3 = w[3];
5611
0
      w += 4;
5612
5613
0
      vacc00 = math_muladd_f32(va0, vb0, vacc00);
5614
0
      vacc01 = math_muladd_f32(va0, vb1, vacc01);
5615
0
      vacc02 = math_muladd_f32(va0, vb2, vacc02);
5616
0
      vacc03 = math_muladd_f32(va0, vb3, vacc03);
5617
5618
0
      k -= sizeof(float);
5619
0
    } while (k != 0);
5620
5621
5622
0
    if XNN_LIKELY(nc >= 4) {
5623
0
      c0[0] = vacc00;
5624
0
      c0[1] = vacc01;
5625
0
      c0[2] = vacc02;
5626
0
      c0[3] = vacc03;
5627
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
5628
5629
0
      a0 = (const void*) ((uintptr_t) a0 - kc);
5630
5631
0
      nc -= 4;
5632
0
    } else {
5633
0
      if (nc & 2) {
5634
0
        c0[0] = vacc00;
5635
0
        c0[1] = vacc01;
5636
0
        vacc00 = vacc02;
5637
0
        c0 += 2;
5638
0
      }
5639
0
      if (nc & 1) {
5640
0
        c0[0] = vacc00;
5641
0
      }
5642
5643
0
      nc = 0;
5644
0
    }
5645
0
  } while (nc != 0);
5646
0
}
5647
5648
void xnn_f32_gemm_minmax_ukernel_2x4__scalar(
5649
    size_t mr,
5650
    size_t nc,
5651
    size_t kc,
5652
    const float* restrict a,
5653
    size_t a_stride,
5654
    const float* restrict w,
5655
    float* restrict c,
5656
    size_t cm_stride,
5657
    size_t cn_stride,
5658
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
5659
0
{
5660
0
  assert(mr != 0);
5661
0
  assert(mr <= 2);
5662
0
  assert(nc != 0);
5663
0
  assert(kc != 0);
5664
0
  assert(kc % sizeof(float) == 0);
5665
0
  assert(a != NULL);
5666
0
  assert(w != NULL);
5667
0
  assert(c != NULL);
5668
5669
0
  const float* a0 = a;
5670
0
  float* c0 = c;
5671
0
  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
5672
0
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
5673
0
  if XNN_UNPREDICTABLE(mr != 2) {
5674
0
    a1 = a0;
5675
0
    c1 = c0;
5676
0
  }
5677
5678
0
  const float vmin = params->scalar.min;
5679
0
  const float vmax = params->scalar.max;
5680
0
  do {
5681
0
    float vacc00 = w[0];
5682
0
    float vacc01 = w[1];
5683
0
    float vacc02 = w[2];
5684
0
    float vacc03 = w[3];
5685
0
    w += 4;
5686
0
    float vacc10 = vacc00;
5687
0
    float vacc11 = vacc01;
5688
0
    float vacc12 = vacc02;
5689
0
    float vacc13 = vacc03;
5690
5691
0
    size_t k = kc;
5692
0
    do {
5693
0
      const float va0 = *a0++;
5694
0
      const float va1 = *a1++;
5695
5696
0
      const float vb0 = w[0];
5697
0
      const float vb1 = w[1];
5698
0
      const float vb2 = w[2];
5699
0
      const float vb3 = w[3];
5700
0
      w += 4;
5701
5702
0
      vacc00 = math_muladd_f32(va0, vb0, vacc00);
5703
0
      vacc01 = math_muladd_f32(va0, vb1, vacc01);
5704
0
      vacc02 = math_muladd_f32(va0, vb2, vacc02);
5705
0
      vacc03 = math_muladd_f32(va0, vb3, vacc03);
5706
0
      vacc10 = math_muladd_f32(va1, vb0, vacc10);
5707
0
      vacc11 = math_muladd_f32(va1, vb1, vacc11);
5708
0
      vacc12 = math_muladd_f32(va1, vb2, vacc12);
5709
0
      vacc13 = math_muladd_f32(va1, vb3, vacc13);
5710
5711
0
      k -= sizeof(float);
5712
0
    } while (k != 0);
5713
5714
0
    vacc00 = math_max_f32(vacc00, vmin);
5715
0
    vacc01 = math_max_f32(vacc01, vmin);
5716
0
    vacc02 = math_max_f32(vacc02, vmin);
5717
0
    vacc03 = math_max_f32(vacc03, vmin);
5718
0
    vacc10 = math_max_f32(vacc10, vmin);
5719
0
    vacc11 = math_max_f32(vacc11, vmin);
5720
0
    vacc12 = math_max_f32(vacc12, vmin);
5721
0
    vacc13 = math_max_f32(vacc13, vmin);
5722
5723
0
    vacc00 = math_min_f32(vacc00, vmax);
5724
0
    vacc01 = math_min_f32(vacc01, vmax);
5725
0
    vacc02 = math_min_f32(vacc02, vmax);
5726
0
    vacc03 = math_min_f32(vacc03, vmax);
5727
0
    vacc10 = math_min_f32(vacc10, vmax);
5728
0
    vacc11 = math_min_f32(vacc11, vmax);
5729
0
    vacc12 = math_min_f32(vacc12, vmax);
5730
0
    vacc13 = math_min_f32(vacc13, vmax);
5731
5732
0
    if XNN_LIKELY(nc >= 4) {
5733
0
      c1[0] = vacc10;
5734
0
      c1[1] = vacc11;
5735
0
      c1[2] = vacc12;
5736
0
      c1[3] = vacc13;
5737
0
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
5738
0
      c0[0] = vacc00;
5739
0
      c0[1] = vacc01;
5740
0
      c0[2] = vacc02;
5741
0
      c0[3] = vacc03;
5742
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
5743
5744
0
      a1 = (const void*) ((uintptr_t) a1 - kc);
5745
0
      a0 = (const void*) ((uintptr_t) a0 - kc);
5746
5747
0
      nc -= 4;
5748
0
    } else {
5749
0
      if (nc & 2) {
5750
0
        c1[0] = vacc10;
5751
0
        c1[1] = vacc11;
5752
0
        vacc10 = vacc12;
5753
0
        c1 += 2;
5754
0
        c0[0] = vacc00;
5755
0
        c0[1] = vacc01;
5756
0
        vacc00 = vacc02;
5757
0
        c0 += 2;
5758
0
      }
5759
0
      if (nc & 1) {
5760
0
        c1[0] = vacc10;
5761
0
        c0[0] = vacc00;
5762
0
      }
5763
5764
0
      nc = 0;
5765
0
    }
5766
0
  } while (nc != 0);
5767
0
}
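The store tail shared by these GEMM kernels writes a full 4-wide tile when nc >= 4 and otherwise peels the remainder with nc & 2 and nc & 1, sliding the surviving accumulator down so the final odd store always comes from slot 0. The same per-row pattern in isolation; the helper name is illustrative only.

// Store `nc` (< 4) leading elements of a 4-wide accumulator row,
// using the same shift-down trick as the microkernel tails.
static void store_row_tail(float* c, float acc0, float acc1, float acc2, size_t nc) {
  if (nc & 2) {
    c[0] = acc0;
    c[1] = acc1;
    acc0 = acc2;   // slide the third lane down for a possible odd store
    c += 2;
  }
  if (nc & 1) {
    c[0] = acc0;
  }
}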
5768
5769
void xnn_f32_gemm_relu_ukernel_2x4__scalar(
5770
    size_t mr,
5771
    size_t nc,
5772
    size_t kc,
5773
    const float* restrict a,
5774
    size_t a_stride,
5775
    const float* restrict w,
5776
    float* restrict c,
5777
    size_t cm_stride,
5778
    size_t cn_stride,
5779
    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
5780
0
{
5781
0
  assert(mr != 0);
5782
0
  assert(mr <= 2);
5783
0
  assert(nc != 0);
5784
0
  assert(kc != 0);
5785
0
  assert(kc % sizeof(float) == 0);
5786
0
  assert(a != NULL);
5787
0
  assert(w != NULL);
5788
0
  assert(c != NULL);
5789
5790
0
  const float* a0 = a;
5791
0
  float* c0 = c;
5792
0
  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
5793
0
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
5794
0
  if XNN_UNPREDICTABLE(mr != 2) {
5795
0
    a1 = a0;
5796
0
    c1 = c0;
5797
0
  }
5798
5799
0
  do {
5800
0
    float vacc00 = w[0];
5801
0
    float vacc01 = w[1];
5802
0
    float vacc02 = w[2];
5803
0
    float vacc03 = w[3];
5804
0
    w += 4;
5805
0
    float vacc10 = vacc00;
5806
0
    float vacc11 = vacc01;
5807
0
    float vacc12 = vacc02;
5808
0
    float vacc13 = vacc03;
5809
5810
0
    size_t k = kc;
5811
0
    do {
5812
0
      const float va0 = *a0++;
5813
0
      const float va1 = *a1++;
5814
5815
0
      const float vb0 = w[0];
5816
0
      const float vb1 = w[1];
5817
0
      const float vb2 = w[2];
5818
0
      const float vb3 = w[3];
5819
0
      w += 4;
5820
5821
0
      vacc00 = math_muladd_f32(va0, vb0, vacc00);
5822
0
      vacc01 = math_muladd_f32(va0, vb1, vacc01);
5823
0
      vacc02 = math_muladd_f32(va0, vb2, vacc02);
5824
0
      vacc03 = math_muladd_f32(va0, vb3, vacc03);
5825
0
      vacc10 = math_muladd_f32(va1, vb0, vacc10);
5826
0
      vacc11 = math_muladd_f32(va1, vb1, vacc11);
5827
0
      vacc12 = math_muladd_f32(va1, vb2, vacc12);
5828
0
      vacc13 = math_muladd_f32(va1, vb3, vacc13);
5829
5830
0
      k -= sizeof(float);
5831
0
    } while (k != 0);
5832
5833
0
    vacc00 = math_max_f32(vacc00, 0.0f);
5834
0
    vacc01 = math_max_f32(vacc01, 0.0f);
5835
0
    vacc02 = math_max_f32(vacc02, 0.0f);
5836
0
    vacc03 = math_max_f32(vacc03, 0.0f);
5837
0
    vacc10 = math_max_f32(vacc10, 0.0f);
5838
0
    vacc11 = math_max_f32(vacc11, 0.0f);
5839
0
    vacc12 = math_max_f32(vacc12, 0.0f);
5840
0
    vacc13 = math_max_f32(vacc13, 0.0f);
5841
5842
0
    if XNN_LIKELY(nc >= 4) {
5843
0
      c1[0] = vacc10;
5844
0
      c1[1] = vacc11;
5845
0
      c1[2] = vacc12;
5846
0
      c1[3] = vacc13;
5847
0
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
5848
0
      c0[0] = vacc00;
5849
0
      c0[1] = vacc01;
5850
0
      c0[2] = vacc02;
5851
0
      c0[3] = vacc03;
5852
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
5853
5854
0
      a1 = (const void*) ((uintptr_t) a1 - kc);
5855
0
      a0 = (const void*) ((uintptr_t) a0 - kc);
5856
5857
0
      nc -= 4;
5858
0
    } else {
5859
0
      if (nc & 2) {
5860
0
        c1[0] = vacc10;
5861
0
        c1[1] = vacc11;
5862
0
        vacc10 = vacc12;
5863
0
        c1 += 2;
5864
0
        c0[0] = vacc00;
5865
0
        c0[1] = vacc01;
5866
0
        vacc00 = vacc02;
5867
0
        c0 += 2;
5868
0
      }
5869
0
      if (nc & 1) {
5870
0
        c1[0] = vacc10;
5871
0
        c0[0] = vacc00;
5872
0
      }
5873
5874
0
      nc = 0;
5875
0
    }
5876
0
  } while (nc != 0);
5877
0
}
5878
5879
void xnn_f32_gemm_ukernel_2x4__scalar(
5880
    size_t mr,
5881
    size_t nc,
5882
    size_t kc,
5883
    const float* restrict a,
5884
    size_t a_stride,
5885
    const float* restrict w,
5886
    float* restrict c,
5887
    size_t cm_stride,
5888
    size_t cn_stride,
5889
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
5890
0
{
5891
0
  assert(mr != 0);
5892
0
  assert(mr <= 2);
5893
0
  assert(nc != 0);
5894
0
  assert(kc != 0);
5895
0
  assert(kc % sizeof(float) == 0);
5896
0
  assert(a != NULL);
5897
0
  assert(w != NULL);
5898
0
  assert(c != NULL);
5899
5900
0
  const float* a0 = a;
5901
0
  float* c0 = c;
5902
0
  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
5903
0
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
5904
0
  if XNN_UNPREDICTABLE(mr != 2) {
5905
0
    a1 = a0;
5906
0
    c1 = c0;
5907
0
  }
5908
5909
0
  do {
5910
0
    float vacc00 = w[0];
5911
0
    float vacc01 = w[1];
5912
0
    float vacc02 = w[2];
5913
0
    float vacc03 = w[3];
5914
0
    w += 4;
5915
0
    float vacc10 = vacc00;
5916
0
    float vacc11 = vacc01;
5917
0
    float vacc12 = vacc02;
5918
0
    float vacc13 = vacc03;
5919
5920
0
    size_t k = kc;
5921
0
    do {
5922
0
      const float va0 = *a0++;
5923
0
      const float va1 = *a1++;
5924
5925
0
      const float vb0 = w[0];
5926
0
      const float vb1 = w[1];
5927
0
      const float vb2 = w[2];
5928
0
      const float vb3 = w[3];
5929
0
      w += 4;
5930
5931
0
      vacc00 = math_muladd_f32(va0, vb0, vacc00);
5932
0
      vacc01 = math_muladd_f32(va0, vb1, vacc01);
5933
0
      vacc02 = math_muladd_f32(va0, vb2, vacc02);
5934
0
      vacc03 = math_muladd_f32(va0, vb3, vacc03);
5935
0
      vacc10 = math_muladd_f32(va1, vb0, vacc10);
5936
0
      vacc11 = math_muladd_f32(va1, vb1, vacc11);
5937
0
      vacc12 = math_muladd_f32(va1, vb2, vacc12);
5938
0
      vacc13 = math_muladd_f32(va1, vb3, vacc13);
5939
5940
0
      k -= sizeof(float);
5941
0
    } while (k != 0);
5942
5943
5944
0
    if XNN_LIKELY(nc >= 4) {
5945
0
      c1[0] = vacc10;
5946
0
      c1[1] = vacc11;
5947
0
      c1[2] = vacc12;
5948
0
      c1[3] = vacc13;
5949
0
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
5950
0
      c0[0] = vacc00;
5951
0
      c0[1] = vacc01;
5952
0
      c0[2] = vacc02;
5953
0
      c0[3] = vacc03;
5954
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
5955
5956
0
      a1 = (const void*) ((uintptr_t) a1 - kc);
5957
0
      a0 = (const void*) ((uintptr_t) a0 - kc);
5958
5959
0
      nc -= 4;
5960
0
    } else {
5961
0
      if (nc & 2) {
5962
0
        c1[0] = vacc10;
5963
0
        c1[1] = vacc11;
5964
0
        vacc10 = vacc12;
5965
0
        c1 += 2;
5966
0
        c0[0] = vacc00;
5967
0
        c0[1] = vacc01;
5968
0
        vacc00 = vacc02;
5969
0
        c0 += 2;
5970
0
      }
5971
0
      if (nc & 1) {
5972
0
        c1[0] = vacc10;
5973
0
        c0[0] = vacc00;
5974
0
      }
5975
5976
0
      nc = 0;
5977
0
    }
5978
0
  } while (nc != 0);
5979
0
}
5980
5981
void xnn_f32_gemm_minmax_ukernel_4x2__scalar(
5982
    size_t mr,
5983
    size_t nc,
5984
    size_t kc,
5985
    const float* restrict a,
5986
    size_t a_stride,
5987
    const float* restrict w,
5988
    float* restrict c,
5989
    size_t cm_stride,
5990
    size_t cn_stride,
5991
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
5992
0
{
5993
0
  assert(mr != 0);
5994
0
  assert(mr <= 4);
5995
0
  assert(nc != 0);
5996
0
  assert(kc != 0);
5997
0
  assert(kc % sizeof(float) == 0);
5998
0
  assert(a != NULL);
5999
0
  assert(w != NULL);
6000
0
  assert(c != NULL);
6001
6002
0
  const float* a0 = a;
6003
0
  float* c0 = c;
6004
0
  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
6005
0
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
6006
0
  if XNN_UNPREDICTABLE(mr < 2) {
6007
0
    a1 = a0;
6008
0
    c1 = c0;
6009
0
  }
6010
0
  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
6011
0
  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
6012
0
  if XNN_UNPREDICTABLE(mr <= 2) {
6013
0
    a2 = a1;
6014
0
    c2 = c1;
6015
0
  }
6016
0
  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
6017
0
  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
6018
0
  if XNN_UNPREDICTABLE(mr != 4) {
6019
0
    a3 = a2;
6020
0
    c3 = c2;
6021
0
  }
6022
6023
0
  const float vmin = params->scalar.min;
6024
0
  const float vmax = params->scalar.max;
6025
0
  do {
6026
0
    float vacc00 = w[0];
6027
0
    float vacc01 = w[1];
6028
0
    w += 2;
6029
0
    float vacc10 = vacc00;
6030
0
    float vacc11 = vacc01;
6031
0
    float vacc20 = vacc00;
6032
0
    float vacc21 = vacc01;
6033
0
    float vacc30 = vacc00;
6034
0
    float vacc31 = vacc01;
6035
6036
0
    size_t k = kc;
6037
0
    do {
6038
0
      const float va0 = *a0++;
6039
0
      const float va1 = *a1++;
6040
0
      const float va2 = *a2++;
6041
0
      const float va3 = *a3++;
6042
6043
0
      const float vb0 = w[0];
6044
0
      const float vb1 = w[1];
6045
0
      w += 2;
6046
6047
0
      vacc00 = math_muladd_f32(va0, vb0, vacc00);
6048
0
      vacc01 = math_muladd_f32(va0, vb1, vacc01);
6049
0
      vacc10 = math_muladd_f32(va1, vb0, vacc10);
6050
0
      vacc11 = math_muladd_f32(va1, vb1, vacc11);
6051
0
      vacc20 = math_muladd_f32(va2, vb0, vacc20);
6052
0
      vacc21 = math_muladd_f32(va2, vb1, vacc21);
6053
0
      vacc30 = math_muladd_f32(va3, vb0, vacc30);
6054
0
      vacc31 = math_muladd_f32(va3, vb1, vacc31);
6055
6056
0
      k -= sizeof(float);
6057
0
    } while (k != 0);
6058
6059
0
    vacc00 = math_max_f32(vacc00, vmin);
6060
0
    vacc01 = math_max_f32(vacc01, vmin);
6061
0
    vacc10 = math_max_f32(vacc10, vmin);
6062
0
    vacc11 = math_max_f32(vacc11, vmin);
6063
0
    vacc20 = math_max_f32(vacc20, vmin);
6064
0
    vacc21 = math_max_f32(vacc21, vmin);
6065
0
    vacc30 = math_max_f32(vacc30, vmin);
6066
0
    vacc31 = math_max_f32(vacc31, vmin);
6067
6068
0
    vacc00 = math_min_f32(vacc00, vmax);
6069
0
    vacc01 = math_min_f32(vacc01, vmax);
6070
0
    vacc10 = math_min_f32(vacc10, vmax);
6071
0
    vacc11 = math_min_f32(vacc11, vmax);
6072
0
    vacc20 = math_min_f32(vacc20, vmax);
6073
0
    vacc21 = math_min_f32(vacc21, vmax);
6074
0
    vacc30 = math_min_f32(vacc30, vmax);
6075
0
    vacc31 = math_min_f32(vacc31, vmax);
6076
6077
0
    if XNN_LIKELY(nc >= 2) {
6078
0
      c3[0] = vacc30;
6079
0
      c3[1] = vacc31;
6080
0
      c3 = (float*) ((uintptr_t) c3 + cn_stride);
6081
0
      c2[0] = vacc20;
6082
0
      c2[1] = vacc21;
6083
0
      c2 = (float*) ((uintptr_t) c2 + cn_stride);
6084
0
      c1[0] = vacc10;
6085
0
      c1[1] = vacc11;
6086
0
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
6087
0
      c0[0] = vacc00;
6088
0
      c0[1] = vacc01;
6089
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
6090
6091
0
      a3 = (const void*) ((uintptr_t) a3 - kc);
6092
0
      a2 = (const void*) ((uintptr_t) a2 - kc);
6093
0
      a1 = (const void*) ((uintptr_t) a1 - kc);
6094
0
      a0 = (const void*) ((uintptr_t) a0 - kc);
6095
6096
0
      nc -= 2;
6097
0
    } else {
6098
0
      if (nc & 1) {
6099
0
        c3[0] = vacc30;
6100
0
        c2[0] = vacc20;
6101
0
        c1[0] = vacc10;
6102
0
        c0[0] = vacc00;
6103
0
      }
6104
6105
0
      nc = 0;
6106
0
    }
6107
0
  } while (nc != 0);
6108
0
}
6109
6110
void xnn_f32_gemm_ukernel_4x2__scalar(
6111
    size_t mr,
6112
    size_t nc,
6113
    size_t kc,
6114
    const float* restrict a,
6115
    size_t a_stride,
6116
    const float* restrict w,
6117
    float* restrict c,
6118
    size_t cm_stride,
6119
    size_t cn_stride,
6120
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
6121
0
{
6122
0
  assert(mr != 0);
6123
0
  assert(mr <= 4);
6124
0
  assert(nc != 0);
6125
0
  assert(kc != 0);
6126
0
  assert(kc % sizeof(float) == 0);
6127
0
  assert(a != NULL);
6128
0
  assert(w != NULL);
6129
0
  assert(c != NULL);
6130
6131
0
  const float* a0 = a;
6132
0
  float* c0 = c;
6133
0
  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
6134
0
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
6135
0
  if XNN_UNPREDICTABLE(mr < 2) {
6136
0
    a1 = a0;
6137
0
    c1 = c0;
6138
0
  }
6139
0
  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
6140
0
  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
6141
0
  if XNN_UNPREDICTABLE(mr <= 2) {
6142
0
    a2 = a1;
6143
0
    c2 = c1;
6144
0
  }
6145
0
  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
6146
0
  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
6147
0
  if XNN_UNPREDICTABLE(mr != 4) {
6148
0
    a3 = a2;
6149
0
    c3 = c2;
6150
0
  }
6151
6152
0
  do {
6153
0
    float vacc00 = w[0];
6154
0
    float vacc01 = w[1];
6155
0
    w += 2;
6156
0
    float vacc10 = vacc00;
6157
0
    float vacc11 = vacc01;
6158
0
    float vacc20 = vacc00;
6159
0
    float vacc21 = vacc01;
6160
0
    float vacc30 = vacc00;
6161
0
    float vacc31 = vacc01;
6162
6163
0
    size_t k = kc;
6164
0
    do {
6165
0
      const float va0 = *a0++;
6166
0
      const float va1 = *a1++;
6167
0
      const float va2 = *a2++;
6168
0
      const float va3 = *a3++;
6169
6170
0
      const float vb0 = w[0];
6171
0
      const float vb1 = w[1];
6172
0
      w += 2;
6173
6174
0
      vacc00 = math_muladd_f32(va0, vb0, vacc00);
6175
0
      vacc01 = math_muladd_f32(va0, vb1, vacc01);
6176
0
      vacc10 = math_muladd_f32(va1, vb0, vacc10);
6177
0
      vacc11 = math_muladd_f32(va1, vb1, vacc11);
6178
0
      vacc20 = math_muladd_f32(va2, vb0, vacc20);
6179
0
      vacc21 = math_muladd_f32(va2, vb1, vacc21);
6180
0
      vacc30 = math_muladd_f32(va3, vb0, vacc30);
6181
0
      vacc31 = math_muladd_f32(va3, vb1, vacc31);
6182
6183
0
      k -= sizeof(float);
6184
0
    } while (k != 0);
6185
6186
6187
0
    if XNN_LIKELY(nc >= 2) {
6188
0
      c3[0] = vacc30;
6189
0
      c3[1] = vacc31;
6190
0
      c3 = (float*) ((uintptr_t) c3 + cn_stride);
6191
0
      c2[0] = vacc20;
6192
0
      c2[1] = vacc21;
6193
0
      c2 = (float*) ((uintptr_t) c2 + cn_stride);
6194
0
      c1[0] = vacc10;
6195
0
      c1[1] = vacc11;
6196
0
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
6197
0
      c0[0] = vacc00;
6198
0
      c0[1] = vacc01;
6199
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
6200
6201
0
      a3 = (const void*) ((uintptr_t) a3 - kc);
6202
0
      a2 = (const void*) ((uintptr_t) a2 - kc);
6203
0
      a1 = (const void*) ((uintptr_t) a1 - kc);
6204
0
      a0 = (const void*) ((uintptr_t) a0 - kc);
6205
6206
0
      nc -= 2;
6207
0
    } else {
6208
0
      if (nc & 1) {
6209
0
        c3[0] = vacc30;
6210
0
        c2[0] = vacc20;
6211
0
        c1[0] = vacc10;
6212
0
        c0[0] = vacc00;
6213
0
      }
6214
6215
0
      nc = 0;
6216
0
    }
6217
0
  } while (nc != 0);
6218
0
}
6219
6220
void xnn_f32_gemm_minmax_ukernel_4x4__scalar(
6221
    size_t mr,
6222
    size_t nc,
6223
    size_t kc,
6224
    const float* restrict a,
6225
    size_t a_stride,
6226
    const float* restrict w,
6227
    float* restrict c,
6228
    size_t cm_stride,
6229
    size_t cn_stride,
6230
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
6231
0
{
6232
0
  assert(mr != 0);
6233
0
  assert(mr <= 4);
6234
0
  assert(nc != 0);
6235
0
  assert(kc != 0);
6236
0
  assert(kc % sizeof(float) == 0);
6237
0
  assert(a != NULL);
6238
0
  assert(w != NULL);
6239
0
  assert(c != NULL);
6240
6241
0
  const float* a0 = a;
6242
0
  float* c0 = c;
6243
0
  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
6244
0
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
6245
0
  if XNN_UNPREDICTABLE(mr < 2) {
6246
0
    a1 = a0;
6247
0
    c1 = c0;
6248
0
  }
6249
0
  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
6250
0
  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
6251
0
  if XNN_UNPREDICTABLE(mr <= 2) {
6252
0
    a2 = a1;
6253
0
    c2 = c1;
6254
0
  }
6255
0
  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
6256
0
  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
6257
0
  if XNN_UNPREDICTABLE(mr != 4) {
6258
0
    a3 = a2;
6259
0
    c3 = c2;
6260
0
  }
6261
6262
0
  const float vmin = params->scalar.min;
6263
0
  const float vmax = params->scalar.max;
6264
0
  do {
6265
0
    float vacc00 = w[0];
6266
0
    float vacc01 = w[1];
6267
0
    float vacc02 = w[2];
6268
0
    float vacc03 = w[3];
6269
0
    w += 4;
6270
0
    float vacc10 = vacc00;
6271
0
    float vacc11 = vacc01;
6272
0
    float vacc12 = vacc02;
6273
0
    float vacc13 = vacc03;
6274
0
    float vacc20 = vacc00;
6275
0
    float vacc21 = vacc01;
6276
0
    float vacc22 = vacc02;
6277
0
    float vacc23 = vacc03;
6278
0
    float vacc30 = vacc00;
6279
0
    float vacc31 = vacc01;
6280
0
    float vacc32 = vacc02;
6281
0
    float vacc33 = vacc03;
6282
6283
0
    size_t k = kc;
6284
0
    do {
6285
0
      const float va0 = *a0++;
6286
0
      const float va1 = *a1++;
6287
0
      const float va2 = *a2++;
6288
0
      const float va3 = *a3++;
6289
6290
0
      const float vb0 = w[0];
6291
0
      const float vb1 = w[1];
6292
0
      const float vb2 = w[2];
6293
0
      const float vb3 = w[3];
6294
0
      w += 4;
6295
6296
0
      vacc00 = math_muladd_f32(va0, vb0, vacc00);
6297
0
      vacc01 = math_muladd_f32(va0, vb1, vacc01);
6298
0
      vacc02 = math_muladd_f32(va0, vb2, vacc02);
6299
0
      vacc03 = math_muladd_f32(va0, vb3, vacc03);
6300
0
      vacc10 = math_muladd_f32(va1, vb0, vacc10);
6301
0
      vacc11 = math_muladd_f32(va1, vb1, vacc11);
6302
0
      vacc12 = math_muladd_f32(va1, vb2, vacc12);
6303
0
      vacc13 = math_muladd_f32(va1, vb3, vacc13);
6304
0
      vacc20 = math_muladd_f32(va2, vb0, vacc20);
6305
0
      vacc21 = math_muladd_f32(va2, vb1, vacc21);
6306
0
      vacc22 = math_muladd_f32(va2, vb2, vacc22);
6307
0
      vacc23 = math_muladd_f32(va2, vb3, vacc23);
6308
0
      vacc30 = math_muladd_f32(va3, vb0, vacc30);
6309
0
      vacc31 = math_muladd_f32(va3, vb1, vacc31);
6310
0
      vacc32 = math_muladd_f32(va3, vb2, vacc32);
6311
0
      vacc33 = math_muladd_f32(va3, vb3, vacc33);
6312
6313
0
      k -= sizeof(float);
6314
0
    } while (k != 0);
6315
6316
0
    vacc00 = math_max_f32(vacc00, vmin);
6317
0
    vacc01 = math_max_f32(vacc01, vmin);
6318
0
    vacc02 = math_max_f32(vacc02, vmin);
6319
0
    vacc03 = math_max_f32(vacc03, vmin);
6320
0
    vacc10 = math_max_f32(vacc10, vmin);
6321
0
    vacc11 = math_max_f32(vacc11, vmin);
6322
0
    vacc12 = math_max_f32(vacc12, vmin);
6323
0
    vacc13 = math_max_f32(vacc13, vmin);
6324
0
    vacc20 = math_max_f32(vacc20, vmin);
6325
0
    vacc21 = math_max_f32(vacc21, vmin);
6326
0
    vacc22 = math_max_f32(vacc22, vmin);
6327
0
    vacc23 = math_max_f32(vacc23, vmin);
6328
0
    vacc30 = math_max_f32(vacc30, vmin);
6329
0
    vacc31 = math_max_f32(vacc31, vmin);
6330
0
    vacc32 = math_max_f32(vacc32, vmin);
6331
0
    vacc33 = math_max_f32(vacc33, vmin);
6332
6333
0
    vacc00 = math_min_f32(vacc00, vmax);
6334
0
    vacc01 = math_min_f32(vacc01, vmax);
6335
0
    vacc02 = math_min_f32(vacc02, vmax);
6336
0
    vacc03 = math_min_f32(vacc03, vmax);
6337
0
    vacc10 = math_min_f32(vacc10, vmax);
6338
0
    vacc11 = math_min_f32(vacc11, vmax);
6339
0
    vacc12 = math_min_f32(vacc12, vmax);
6340
0
    vacc13 = math_min_f32(vacc13, vmax);
6341
0
    vacc20 = math_min_f32(vacc20, vmax);
6342
0
    vacc21 = math_min_f32(vacc21, vmax);
6343
0
    vacc22 = math_min_f32(vacc22, vmax);
6344
0
    vacc23 = math_min_f32(vacc23, vmax);
6345
0
    vacc30 = math_min_f32(vacc30, vmax);
6346
0
    vacc31 = math_min_f32(vacc31, vmax);
6347
0
    vacc32 = math_min_f32(vacc32, vmax);
6348
0
    vacc33 = math_min_f32(vacc33, vmax);
6349
6350
0
    if XNN_LIKELY(nc >= 4) {
6351
0
      c3[0] = vacc30;
6352
0
      c3[1] = vacc31;
6353
0
      c3[2] = vacc32;
6354
0
      c3[3] = vacc33;
6355
0
      c3 = (float*) ((uintptr_t) c3 + cn_stride);
6356
0
      c2[0] = vacc20;
6357
0
      c2[1] = vacc21;
6358
0
      c2[2] = vacc22;
6359
0
      c2[3] = vacc23;
6360
0
      c2 = (float*) ((uintptr_t) c2 + cn_stride);
6361
0
      c1[0] = vacc10;
6362
0
      c1[1] = vacc11;
6363
0
      c1[2] = vacc12;
6364
0
      c1[3] = vacc13;
6365
0
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
6366
0
      c0[0] = vacc00;
6367
0
      c0[1] = vacc01;
6368
0
      c0[2] = vacc02;
6369
0
      c0[3] = vacc03;
6370
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
6371
6372
0
      a3 = (const void*) ((uintptr_t) a3 - kc);
6373
0
      a2 = (const void*) ((uintptr_t) a2 - kc);
6374
0
      a1 = (const void*) ((uintptr_t) a1 - kc);
6375
0
      a0 = (const void*) ((uintptr_t) a0 - kc);
6376
6377
0
      nc -= 4;
6378
0
    } else {
6379
0
      if (nc & 2) {
6380
0
        c3[0] = vacc30;
6381
0
        c3[1] = vacc31;
6382
0
        vacc30 = vacc32;
6383
0
        c3 += 2;
6384
0
        c2[0] = vacc20;
6385
0
        c2[1] = vacc21;
6386
0
        vacc20 = vacc22;
6387
0
        c2 += 2;
6388
0
        c1[0] = vacc10;
6389
0
        c1[1] = vacc11;
6390
0
        vacc10 = vacc12;
6391
0
        c1 += 2;
6392
0
        c0[0] = vacc00;
6393
0
        c0[1] = vacc01;
6394
0
        vacc00 = vacc02;
6395
0
        c0 += 2;
6396
0
      }
6397
0
      if (nc & 1) {
6398
0
        c3[0] = vacc30;
6399
0
        c2[0] = vacc20;
6400
0
        c1[0] = vacc10;
6401
0
        c0[0] = vacc00;
6402
0
      }
6403
6404
0
      nc = 0;
6405
0
    }
6406
0
  } while (nc != 0);
6407
0
}
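A microkernel such as the 4x4 minmax variant only produces up to mr rows per call and walks the full nc width internally, so covering an arbitrary M x N output is the caller's job: step A and C down by mr rows at a time and shrink mr at the bottom edge, letting the kernel clamp the duplicated row pointers. The driver sketch below follows the argument order visible in the kernel body; the tile_fn typedef is a stand-in with an opaque params pointer, not the XNNPACK prototype.

// Tile driver sketch: cover an M x N row-major output with 4x4 microkernel calls.
typedef void (*gemm_tile_fn)(size_t mr, size_t nc, size_t kc_bytes,
                             const float* a, size_t a_stride,
                             const float* w, float* c,
                             size_t cm_stride, size_t cn_stride,
                             const void* params);

static void gemm_driver(size_t M, size_t N, size_t K,
                        const float* a, const float* packed_w, float* c,
                        gemm_tile_fn ukernel, const void* params) {
  const size_t mr = 4;
  const size_t nr = 4;
  for (size_t m = 0; m < M; m += mr) {
    const size_t mr_tile = M - m < mr ? M - m : mr;  // short bottom edge; kernel clamps row pointers
    ukernel(mr_tile, N, K * sizeof(float),
            a + m * K, K * sizeof(float),            // A is M x K, row-major
            packed_w, c + m * N,                     // packed weights cover all N columns
            N * sizeof(float), nr * sizeof(float),   // cm_stride, cn_stride in bytes
            params);
  }
}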
6408
6409
void xnn_f32_gemm_relu_ukernel_4x4__scalar(
6410
    size_t mr,
6411
    size_t nc,
6412
    size_t kc,
6413
    const float* restrict a,
6414
    size_t a_stride,
6415
    const float* restrict w,
6416
    float* restrict c,
6417
    size_t cm_stride,
6418
    size_t cn_stride,
6419
    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
6420
0
{
6421
0
  assert(mr != 0);
6422
0
  assert(mr <= 4);
6423
0
  assert(nc != 0);
6424
0
  assert(kc != 0);
6425
0
  assert(kc % sizeof(float) == 0);
6426
0
  assert(a != NULL);
6427
0
  assert(w != NULL);
6428
0
  assert(c != NULL);
6429
6430
0
  const float* a0 = a;
6431
0
  float* c0 = c;
6432
0
  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
6433
0
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
6434
0
  if XNN_UNPREDICTABLE(mr < 2) {
6435
0
    a1 = a0;
6436
0
    c1 = c0;
6437
0
  }
6438
0
  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
6439
0
  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
6440
0
  if XNN_UNPREDICTABLE(mr <= 2) {
6441
0
    a2 = a1;
6442
0
    c2 = c1;
6443
0
  }
6444
0
  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
6445
0
  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
6446
0
  if XNN_UNPREDICTABLE(mr != 4) {
6447
0
    a3 = a2;
6448
0
    c3 = c2;
6449
0
  }
6450
6451
0
  do {
6452
0
    float vacc00 = w[0];
6453
0
    float vacc01 = w[1];
6454
0
    float vacc02 = w[2];
6455
0
    float vacc03 = w[3];
6456
0
    w += 4;
6457
0
    float vacc10 = vacc00;
6458
0
    float vacc11 = vacc01;
6459
0
    float vacc12 = vacc02;
6460
0
    float vacc13 = vacc03;
6461
0
    float vacc20 = vacc00;
6462
0
    float vacc21 = vacc01;
6463
0
    float vacc22 = vacc02;
6464
0
    float vacc23 = vacc03;
6465
0
    float vacc30 = vacc00;
6466
0
    float vacc31 = vacc01;
6467
0
    float vacc32 = vacc02;
6468
0
    float vacc33 = vacc03;
6469
6470
0
    size_t k = kc;
6471
0
    do {
6472
0
      const float va0 = *a0++;
6473
0
      const float va1 = *a1++;
6474
0
      const float va2 = *a2++;
6475
0
      const float va3 = *a3++;
6476
6477
0
      const float vb0 = w[0];
6478
0
      const float vb1 = w[1];
6479
0
      const float vb2 = w[2];
6480
0
      const float vb3 = w[3];
6481
0
      w += 4;
6482
6483
0
      vacc00 = math_muladd_f32(va0, vb0, vacc00);
6484
0
      vacc01 = math_muladd_f32(va0, vb1, vacc01);
6485
0
      vacc02 = math_muladd_f32(va0, vb2, vacc02);
6486
0
      vacc03 = math_muladd_f32(va0, vb3, vacc03);
6487
0
      vacc10 = math_muladd_f32(va1, vb0, vacc10);
6488
0
      vacc11 = math_muladd_f32(va1, vb1, vacc11);
6489
0
      vacc12 = math_muladd_f32(va1, vb2, vacc12);
6490
0
      vacc13 = math_muladd_f32(va1, vb3, vacc13);
6491
0
      vacc20 = math_muladd_f32(va2, vb0, vacc20);
6492
0
      vacc21 = math_muladd_f32(va2, vb1, vacc21);
6493
0
      vacc22 = math_muladd_f32(va2, vb2, vacc22);
6494
0
      vacc23 = math_muladd_f32(va2, vb3, vacc23);
6495
0
      vacc30 = math_muladd_f32(va3, vb0, vacc30);
6496
0
      vacc31 = math_muladd_f32(va3, vb1, vacc31);
6497
0
      vacc32 = math_muladd_f32(va3, vb2, vacc32);
6498
0
      vacc33 = math_muladd_f32(va3, vb3, vacc33);
6499
6500
0
      k -= sizeof(float);
6501
0
    } while (k != 0);
6502
6503
0
    vacc00 = math_max_f32(vacc00, 0.0f);
6504
0
    vacc01 = math_max_f32(vacc01, 0.0f);
6505
0
    vacc02 = math_max_f32(vacc02, 0.0f);
6506
0
    vacc03 = math_max_f32(vacc03, 0.0f);
6507
0
    vacc10 = math_max_f32(vacc10, 0.0f);
6508
0
    vacc11 = math_max_f32(vacc11, 0.0f);
6509
0
    vacc12 = math_max_f32(vacc12, 0.0f);
6510
0
    vacc13 = math_max_f32(vacc13, 0.0f);
6511
0
    vacc20 = math_max_f32(vacc20, 0.0f);
6512
0
    vacc21 = math_max_f32(vacc21, 0.0f);
6513
0
    vacc22 = math_max_f32(vacc22, 0.0f);
6514
0
    vacc23 = math_max_f32(vacc23, 0.0f);
6515
0
    vacc30 = math_max_f32(vacc30, 0.0f);
6516
0
    vacc31 = math_max_f32(vacc31, 0.0f);
6517
0
    vacc32 = math_max_f32(vacc32, 0.0f);
6518
0
    vacc33 = math_max_f32(vacc33, 0.0f);
6519
6520
0
    if XNN_LIKELY(nc >= 4) {
6521
0
      c3[0] = vacc30;
6522
0
      c3[1] = vacc31;
6523
0
      c3[2] = vacc32;
6524
0
      c3[3] = vacc33;
6525
0
      c3 = (float*) ((uintptr_t) c3 + cn_stride);
6526
0
      c2[0] = vacc20;
6527
0
      c2[1] = vacc21;
6528
0
      c2[2] = vacc22;
6529
0
      c2[3] = vacc23;
6530
0
      c2 = (float*) ((uintptr_t) c2 + cn_stride);
6531
0
      c1[0] = vacc10;
6532
0
      c1[1] = vacc11;
6533
0
      c1[2] = vacc12;
6534
0
      c1[3] = vacc13;
6535
0
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
6536
0
      c0[0] = vacc00;
6537
0
      c0[1] = vacc01;
6538
0
      c0[2] = vacc02;
6539
0
      c0[3] = vacc03;
6540
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
6541
6542
0
      a3 = (const void*) ((uintptr_t) a3 - kc);
6543
0
      a2 = (const void*) ((uintptr_t) a2 - kc);
6544
0
      a1 = (const void*) ((uintptr_t) a1 - kc);
6545
0
      a0 = (const void*) ((uintptr_t) a0 - kc);
6546
6547
0
      nc -= 4;
6548
0
    } else {
6549
0
      if (nc & 2) {
6550
0
        c3[0] = vacc30;
6551
0
        c3[1] = vacc31;
6552
0
        vacc30 = vacc32;
6553
0
        c3 += 2;
6554
0
        c2[0] = vacc20;
6555
0
        c2[1] = vacc21;
6556
0
        vacc20 = vacc22;
6557
0
        c2 += 2;
6558
0
        c1[0] = vacc10;
6559
0
        c1[1] = vacc11;
6560
0
        vacc10 = vacc12;
6561
0
        c1 += 2;
6562
0
        c0[0] = vacc00;
6563
0
        c0[1] = vacc01;
6564
0
        vacc00 = vacc02;
6565
0
        c0 += 2;
6566
0
      }
6567
0
      if (nc & 1) {
6568
0
        c3[0] = vacc30;
6569
0
        c2[0] = vacc20;
6570
0
        c1[0] = vacc10;
6571
0
        c0[0] = vacc00;
6572
0
      }
6573
6574
0
      nc = 0;
6575
0
    }
6576
0
  } while (nc != 0);
6577
0
}
6578
6579
void xnn_f32_gemm_ukernel_4x4__scalar(
6580
    size_t mr,
6581
    size_t nc,
6582
    size_t kc,
6583
    const float* restrict a,
6584
    size_t a_stride,
6585
    const float* restrict w,
6586
    float* restrict c,
6587
    size_t cm_stride,
6588
    size_t cn_stride,
6589
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
6590
0
{
6591
0
  assert(mr != 0);
6592
0
  assert(mr <= 4);
6593
0
  assert(nc != 0);
6594
0
  assert(kc != 0);
6595
0
  assert(kc % sizeof(float) == 0);
6596
0
  assert(a != NULL);
6597
0
  assert(w != NULL);
6598
0
  assert(c != NULL);
6599
6600
0
  const float* a0 = a;
6601
0
  float* c0 = c;
6602
0
  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
6603
0
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
6604
0
  if XNN_UNPREDICTABLE(mr < 2) {
6605
0
    a1 = a0;
6606
0
    c1 = c0;
6607
0
  }
6608
0
  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
6609
0
  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
6610
0
  if XNN_UNPREDICTABLE(mr <= 2) {
6611
0
    a2 = a1;
6612
0
    c2 = c1;
6613
0
  }
6614
0
  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
6615
0
  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
6616
0
  if XNN_UNPREDICTABLE(mr != 4) {
6617
0
    a3 = a2;
6618
0
    c3 = c2;
6619
0
  }
6620
6621
0
  do {
6622
0
    float vacc00 = w[0];
6623
0
    float vacc01 = w[1];
6624
0
    float vacc02 = w[2];
6625
0
    float vacc03 = w[3];
6626
0
    w += 4;
6627
0
    float vacc10 = vacc00;
6628
0
    float vacc11 = vacc01;
6629
0
    float vacc12 = vacc02;
6630
0
    float vacc13 = vacc03;
6631
0
    float vacc20 = vacc00;
6632
0
    float vacc21 = vacc01;
6633
0
    float vacc22 = vacc02;
6634
0
    float vacc23 = vacc03;
6635
0
    float vacc30 = vacc00;
6636
0
    float vacc31 = vacc01;
6637
0
    float vacc32 = vacc02;
6638
0
    float vacc33 = vacc03;
6639
6640
0
    size_t k = kc;
6641
0
    do {
6642
0
      const float va0 = *a0++;
6643
0
      const float va1 = *a1++;
6644
0
      const float va2 = *a2++;
6645
0
      const float va3 = *a3++;
6646
6647
0
      const float vb0 = w[0];
6648
0
      const float vb1 = w[1];
6649
0
      const float vb2 = w[2];
6650
0
      const float vb3 = w[3];
6651
0
      w += 4;
6652
6653
0
      vacc00 = math_muladd_f32(va0, vb0, vacc00);
6654
0
      vacc01 = math_muladd_f32(va0, vb1, vacc01);
6655
0
      vacc02 = math_muladd_f32(va0, vb2, vacc02);
6656
0
      vacc03 = math_muladd_f32(va0, vb3, vacc03);
6657
0
      vacc10 = math_muladd_f32(va1, vb0, vacc10);
6658
0
      vacc11 = math_muladd_f32(va1, vb1, vacc11);
6659
0
      vacc12 = math_muladd_f32(va1, vb2, vacc12);
6660
0
      vacc13 = math_muladd_f32(va1, vb3, vacc13);
6661
0
      vacc20 = math_muladd_f32(va2, vb0, vacc20);
6662
0
      vacc21 = math_muladd_f32(va2, vb1, vacc21);
6663
0
      vacc22 = math_muladd_f32(va2, vb2, vacc22);
6664
0
      vacc23 = math_muladd_f32(va2, vb3, vacc23);
6665
0
      vacc30 = math_muladd_f32(va3, vb0, vacc30);
6666
0
      vacc31 = math_muladd_f32(va3, vb1, vacc31);
6667
0
      vacc32 = math_muladd_f32(va3, vb2, vacc32);
6668
0
      vacc33 = math_muladd_f32(va3, vb3, vacc33);
6669
6670
0
      k -= sizeof(float);
6671
0
    } while (k != 0);
6672
6673
6674
0
    if XNN_LIKELY(nc >= 4) {
6675
0
      c3[0] = vacc30;
6676
0
      c3[1] = vacc31;
6677
0
      c3[2] = vacc32;
6678
0
      c3[3] = vacc33;
6679
0
      c3 = (float*) ((uintptr_t) c3 + cn_stride);
6680
0
      c2[0] = vacc20;
6681
0
      c2[1] = vacc21;
6682
0
      c2[2] = vacc22;
6683
0
      c2[3] = vacc23;
6684
0
      c2 = (float*) ((uintptr_t) c2 + cn_stride);
6685
0
      c1[0] = vacc10;
6686
0
      c1[1] = vacc11;
6687
0
      c1[2] = vacc12;
6688
0
      c1[3] = vacc13;
6689
0
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
6690
0
      c0[0] = vacc00;
6691
0
      c0[1] = vacc01;
6692
0
      c0[2] = vacc02;
6693
0
      c0[3] = vacc03;
6694
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
6695
6696
0
      a3 = (const void*) ((uintptr_t) a3 - kc);
6697
0
      a2 = (const void*) ((uintptr_t) a2 - kc);
6698
0
      a1 = (const void*) ((uintptr_t) a1 - kc);
6699
0
      a0 = (const void*) ((uintptr_t) a0 - kc);
6700
6701
0
      nc -= 4;
6702
0
    } else {
6703
0
      if (nc & 2) {
6704
0
        c3[0] = vacc30;
6705
0
        c3[1] = vacc31;
6706
0
        vacc30 = vacc32;
6707
0
        c3 += 2;
6708
0
        c2[0] = vacc20;
6709
0
        c2[1] = vacc21;
6710
0
        vacc20 = vacc22;
6711
0
        c2 += 2;
6712
0
        c1[0] = vacc10;
6713
0
        c1[1] = vacc11;
6714
0
        vacc10 = vacc12;
6715
0
        c1 += 2;
6716
0
        c0[0] = vacc00;
6717
0
        c0[1] = vacc01;
6718
0
        vacc00 = vacc02;
6719
0
        c0 += 2;
6720
0
      }
6721
0
      if (nc & 1) {
6722
0
        c3[0] = vacc30;
6723
0
        c2[0] = vacc20;
6724
0
        c1[0] = vacc10;
6725
0
        c0[0] = vacc00;
6726
0
      }
6727
6728
0
      nc = 0;
6729
0
    }
6730
0
  } while (nc != 0);
6731
0
}
6732
6733
void xnn_f32_ibilinear_chw_ukernel__scalar_p4(
6734
    size_t output_pixels,
6735
    size_t channels,
6736
    const float** restrict input,
6737
    size_t input_offset,
6738
    const float* restrict weights,
6739
    float* restrict output,
6740
    size_t input_increment)
6741
0
{
6742
0
  assert(output_pixels != 0);
6743
0
  assert(channels != 0);
6744
0
  assert(input_increment % sizeof(float) == 0);
6745
6746
0
  size_t c = channels;
6747
0
  do {
6748
0
    const float** i = input;
6749
0
    const float* w = weights;
6750
6751
0
    size_t p = output_pixels;
6752
0
    for (; p >= 4; p -= 4) {
6753
0
      const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
6754
0
      const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
6755
0
      const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
6756
0
      const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
6757
0
      const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset);
6758
0
      const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset);
6759
0
      const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset);
6760
0
      const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset);
6761
0
      i += 4 * 2;
6762
6763
0
      const float valphah0 = w[0];
6764
0
      const float valphav0 = w[1];
6765
0
      const float valphah1 = w[2];
6766
0
      const float valphav1 = w[3];
6767
0
      const float valphah2 = w[4];
6768
0
      const float valphav2 = w[5];
6769
0
      const float valphah3 = w[6];
6770
0
      const float valphav3 = w[7];
6771
0
      w += 4 * 2;
6772
6773
0
      const float vtl0 = itl0[0];
6774
0
      const float vtr0 = itl0[1];
6775
0
      const float vbl0 = ibl0[0];
6776
0
      const float vbr0 = ibl0[1];
6777
0
      const float vtl1 = itl1[0];
6778
0
      const float vtr1 = itl1[1];
6779
0
      const float vbl1 = ibl1[0];
6780
0
      const float vbr1 = ibl1[1];
6781
0
      const float vtl2 = itl2[0];
6782
0
      const float vtr2 = itl2[1];
6783
0
      const float vbl2 = ibl2[0];
6784
0
      const float vbr2 = ibl2[1];
6785
0
      const float vtl3 = itl3[0];
6786
0
      const float vtr3 = itl3[1];
6787
0
      const float vbl3 = ibl3[0];
6788
0
      const float vbr3 = ibl3[1];
6789
6790
0
      const float vtd0 = vtr0 - vtl0;
6791
0
      const float vbd0 = vbr0 - vbl0;
6792
0
      const float vtd1 = vtr1 - vtl1;
6793
0
      const float vbd1 = vbr1 - vbl1;
6794
0
      const float vtd2 = vtr2 - vtl2;
6795
0
      const float vbd2 = vbr2 - vbl2;
6796
0
      const float vtd3 = vtr3 - vtl3;
6797
0
      const float vbd3 = vbr3 - vbl3;
6798
6799
0
      const float vt0 = vtl0 + vtd0 * valphah0;
6800
0
      const float vb0 = vbl0 + vbd0 * valphah0;
6801
0
      const float vt1 = vtl1 + vtd1 * valphah1;
6802
0
      const float vb1 = vbl1 + vbd1 * valphah1;
6803
0
      const float vt2 = vtl2 + vtd2 * valphah2;
6804
0
      const float vb2 = vbl2 + vbd2 * valphah2;
6805
0
      const float vt3 = vtl3 + vtd3 * valphah3;
6806
0
      const float vb3 = vbl3 + vbd3 * valphah3;
6807
6808
0
      const float vd0 = vb0 - vt0;
6809
0
      const float vd1 = vb1 - vt1;
6810
0
      const float vd2 = vb2 - vt2;
6811
0
      const float vd3 = vb3 - vt3;
6812
6813
0
      const float vo0 = vt0 + vd0 * valphav0;
6814
0
      const float vo1 = vt1 + vd1 * valphav1;
6815
0
      const float vo2 = vt2 + vd2 * valphav2;
6816
0
      const float vo3 = vt3 + vd3 * valphav3;
6817
6818
0
      output[0] = vo0;
6819
0
      output[1] = vo1;
6820
0
      output[2] = vo2;
6821
0
      output[3] = vo3;
6822
0
      output += 4;
6823
0
    }
6824
6825
0
    for (; p >= 1; p -= 1) {
6826
0
      const float* itl = (const float*) ((uintptr_t) i[0] + input_offset);
6827
0
      const float* ibl = (const float*) ((uintptr_t) i[1] + input_offset);
6828
0
      i += 2;
6829
6830
0
      const float valphah = w[0];
6831
0
      const float valphav = w[1];
6832
0
      w += 2;
6833
6834
0
      const float vtl = itl[0];
6835
0
      const float vtr = itl[1];
6836
0
      const float vbl = ibl[0];
6837
0
      const float vbr = ibl[1];
6838
6839
0
      const float vtd = vtr - vtl;
6840
0
      const float vbd = vbr - vbl;
6841
6842
0
      const float vt = vtl + vtd * valphah;
6843
0
      const float vb = vbl + vbd * valphah;
6844
6845
0
      const float vd = vb - vt;
6846
6847
0
      const float vo = vt + vd * valphav;
6848
6849
0
      *output++ = vo;
6850
0
    }
6851
6852
0
    input_offset += input_increment;
6853
6854
0
    c--;
6855
0
  } while (c != 0);
6856
0
}
6857
6858
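// Editor's annotation (not part of scalar.c): xnn_f32_ibilinear_ukernel__scalar_c2 is the NHWC
// counterpart of the CHW kernel above. Here the indirection buffer supplies four pointers per
// output pixel (top-left, top-right, bottom-left, bottom-right corners) and one (alphah, alphav)
// weight pair per pixel; the inner loops interpolate the channels two at a time with a scalar
// tail loop, and output_increment advances the output pointer between pixels on top of the
// channels already written.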
void xnn_f32_ibilinear_ukernel__scalar_c2(
6859
    size_t output_pixels,
6860
    size_t channels,
6861
    const float** restrict input,
6862
    size_t input_offset,
6863
    const float* restrict weights,
6864
    float* restrict output,
6865
    size_t output_increment)
6866
0
{
6867
0
  assert(output_pixels != 0);
6868
0
  assert(channels != 0);
6869
0
  assert(channels % sizeof(float) == 0);
6870
6871
0
  do {
6872
0
    const float* i0 = (const float*) ((uintptr_t) input[0] + input_offset);
6873
0
    const float* i1 = (const float*) ((uintptr_t) input[1] + input_offset);
6874
0
    const float* i2 = (const float*) ((uintptr_t) input[2] + input_offset);
6875
0
    const float* i3 = (const float*) ((uintptr_t) input[3] + input_offset);
6876
0
    input += 4;
6877
6878
0
    const float valphah = weights[0];
6879
0
    const float valphav = weights[1];
6880
0
    weights += 2;
6881
6882
0
    size_t c = channels;
6883
0
    for (; c >= 2 * sizeof(float); c -= 2 * sizeof(float)) {
6884
0
      const float vtl0 = i0[0];
6885
0
      const float vtr0 = i1[0];
6886
0
      const float vbl0 = i2[0];
6887
0
      const float vbr0 = i3[0];
6888
0
      const float vtl1 = i0[1];
6889
0
      const float vtr1 = i1[1];
6890
0
      const float vbl1 = i2[1];
6891
0
      const float vbr1 = i3[1];
6892
0
      i0 += 2;
6893
0
      i1 += 2;
6894
0
      i2 += 2;
6895
0
      i3 += 2;
6896
6897
0
      const float vtd0 = vtr0 - vtl0;
6898
0
      const float vbd0 = vbr0 - vbl0;
6899
0
      const float vtd1 = vtr1 - vtl1;
6900
0
      const float vbd1 = vbr1 - vbl1;
6901
6902
0
      const float vt0 = vtl0 + vtd0 * valphah;
6903
0
      const float vb0 = vbl0 + vbd0 * valphah;
6904
0
      const float vt1 = vtl1 + vtd1 * valphah;
6905
0
      const float vb1 = vbl1 + vbd1 * valphah;
6906
6907
0
      const float vd0 = vb0 - vt0;
6908
0
      const float vd1 = vb1 - vt1;
6909
6910
0
      const float vo0 = vt0 + vd0 * valphav;
6911
0
      const float vo1 = vt1 + vd1 * valphav;
6912
6913
0
      output[0] = vo0;
6914
0
      output[1] = vo1;
6915
0
      output += 2;
6916
0
    }
6917
0
    for (; c >= sizeof(float); c -= sizeof(float)) {
6918
0
      const float vtl = *i0++;
6919
0
      const float vtr = *i1++;
6920
0
      const float vbl = *i2++;
6921
0
      const float vbr = *i3++;
6922
6923
0
      const float vtd = vtr - vtl;
6924
0
      const float vbd = vbr - vbl;
6925
6926
0
      const float vt = vtl + vtd * valphah;
6927
0
      const float vb = vbl + vbd * valphah;
6928
6929
0
      const float vd = vb - vt;
6930
6931
0
      const float vo = vt + vd * valphav;
6932
6933
0
      *output++ = vo;
6934
0
    }
6935
6936
0
    output = (float*) ((uintptr_t) output + output_increment);
6937
0
  } while (--output_pixels != 0);
6938
0
}
6939
6940
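// -- Editor's annotation (not part of scalar.c) ------------------------------------------------
// The xnn_f32_igemm_* micro-kernels below are indirect GEMMs: instead of a dense A matrix they
// take `a`, an indirection buffer holding one input-row pointer per kernel step (ks counts these
// pointers in bytes of void*). Pointers equal to `zero` reference a shared zero buffer and are
// used as-is; all other pointers are shifted by a_offset. The packed weights `w` interleave a
// bias vector (one float per output column) with the per-k weight vectors, and the _minmax
// variants clamp the result to [params->scalar.min, params->scalar.max].
//
// A minimal reference sketch of what the 1x4 minmax kernel computes for one full 4-wide tile
// (nc >= 4); the function name and the use of fminf/fmaxf are the editor's, not the library's:
static void igemm_1x4_minmax_reference_sketch(
    size_t kc, size_t ks, const float** a, const float* w, float* c,
    size_t a_offset, const float* zero, float vmin, float vmax)
{
  float acc[4] = { w[0], w[1], w[2], w[3] };  // accumulators start from the packed bias
  w += 4;
  for (size_t p = ks / sizeof(void*); p != 0; p--) {
    const float* a0 = *a++;
    if (a0 != zero) {  // only real rows get the a_offset adjustment
      a0 = (const float*) ((uintptr_t) a0 + a_offset);
    }
    for (size_t k = kc / sizeof(float); k != 0; k--) {
      const float va = *a0++;
      for (size_t n = 0; n < 4; n++) {
        acc[n] += va * w[n];  // same multiply-add the kernel performs with math_muladd_f32
      }
      w += 4;
    }
  }
  for (size_t n = 0; n < 4; n++) {
    c[n] = fminf(fmaxf(acc[n], vmin), vmax);  // clamp to [vmin, vmax]
  }
}
// ----------------------------------------------------------------------------------------------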
void xnn_f32_igemm_minmax_ukernel_1x4__scalar(
6941
    size_t mr,
6942
    size_t nc,
6943
    size_t kc,
6944
    size_t ks,
6945
    const float** restrict a,
6946
    const float* restrict w,
6947
    float* restrict c,
6948
    size_t cm_stride,
6949
    size_t cn_stride,
6950
    size_t a_offset,
6951
    const float* zero,
6952
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
6953
0
{
6954
0
  assert(mr != 0);
6955
0
  assert(mr <= 1);
6956
0
  assert(nc != 0);
6957
0
  assert(kc != 0);
6958
0
  assert(kc % sizeof(float) == 0);
6959
0
  assert(ks != 0);
6960
0
  assert(ks % (1 * sizeof(void*)) == 0);
6961
0
  assert(a_offset % sizeof(float) == 0);
6962
0
  assert(a != NULL);
6963
0
  assert(w != NULL);
6964
0
  assert(c != NULL);
6965
6966
0
  float* c0 = c;
6967
6968
0
  const float vmin = params->scalar.min;
6969
0
  const float vmax = params->scalar.max;
6970
0
  do {
6971
0
    float vacc00 = w[0];
6972
0
    float vacc01 = w[1];
6973
0
    float vacc02 = w[2];
6974
0
    float vacc03 = w[3];
6975
0
    w += 4;
6976
6977
0
    size_t p = ks;
6978
0
    do {
6979
0
      const float* restrict a0 = a[0];
6980
0
      assert(a0 != NULL);
6981
0
      if XNN_UNPREDICTABLE(a0 != zero) {
6982
0
        a0 = (const float*) ((uintptr_t) a0 + a_offset);
6983
0
      }
6984
0
      a += 1;
6985
6986
0
      size_t k = kc;
6987
0
      do {
6988
0
        const float va0 = *a0++;
6989
6990
0
        const float vb0 = w[0];
6991
0
        const float vb1 = w[1];
6992
0
        const float vb2 = w[2];
6993
0
        const float vb3 = w[3];
6994
0
        w += 4;
6995
6996
0
        vacc00 = math_muladd_f32(va0, vb0, vacc00);
6997
0
        vacc01 = math_muladd_f32(va0, vb1, vacc01);
6998
0
        vacc02 = math_muladd_f32(va0, vb2, vacc02);
6999
0
        vacc03 = math_muladd_f32(va0, vb3, vacc03);
7000
7001
0
        k -= sizeof(float);
7002
0
      } while (k != 0);
7003
0
      p -= 1 * sizeof(void*);
7004
0
    } while (p != 0);
7005
7006
0
    vacc00 = math_max_f32(vacc00, vmin);
7007
0
    vacc01 = math_max_f32(vacc01, vmin);
7008
0
    vacc02 = math_max_f32(vacc02, vmin);
7009
0
    vacc03 = math_max_f32(vacc03, vmin);
7010
7011
0
    vacc00 = math_min_f32(vacc00, vmax);
7012
0
    vacc01 = math_min_f32(vacc01, vmax);
7013
0
    vacc02 = math_min_f32(vacc02, vmax);
7014
0
    vacc03 = math_min_f32(vacc03, vmax);
7015
7016
0
    if XNN_LIKELY(nc >= 4) {
7017
0
      c0[0] = vacc00;
7018
0
      c0[1] = vacc01;
7019
0
      c0[2] = vacc02;
7020
0
      c0[3] = vacc03;
7021
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
7022
7023
0
      a = (const float**restrict) ((uintptr_t) a - ks);
7024
0
      nc -= 4;
7025
0
    } else {
7026
0
      if (nc & 2) {
7027
0
        c0[0] = vacc00;
7028
0
        c0[1] = vacc01;
7029
0
        vacc00 = vacc02;
7030
0
        c0 += 2;
7031
0
      }
7032
0
      if (nc & 1) {
7033
0
        c0[0] = vacc00;
7034
0
      }
7035
7036
0
      nc = 0;
7037
0
    }
7038
0
  } while (nc != 0);
7039
0
}
7040
7041
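// Editor's annotation (not part of scalar.c): the _relu and plain (unsuffixed) IGEMM variants of
// each tile reuse the corresponding minmax kernel's inner loop; only the epilogue differs. The
// _relu kernels clamp each accumulator at 0.0f from below, and the plain kernels store the raw
// accumulators without any clamping.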
void xnn_f32_igemm_relu_ukernel_1x4__scalar(
7042
    size_t mr,
7043
    size_t nc,
7044
    size_t kc,
7045
    size_t ks,
7046
    const float** restrict a,
7047
    const float* restrict w,
7048
    float* restrict c,
7049
    size_t cm_stride,
7050
    size_t cn_stride,
7051
    size_t a_offset,
7052
    const float* zero,
7053
    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
7054
0
{
7055
0
  assert(mr != 0);
7056
0
  assert(mr <= 1);
7057
0
  assert(nc != 0);
7058
0
  assert(kc != 0);
7059
0
  assert(kc % sizeof(float) == 0);
7060
0
  assert(ks != 0);
7061
0
  assert(ks % (1 * sizeof(void*)) == 0);
7062
0
  assert(a_offset % sizeof(float) == 0);
7063
0
  assert(a != NULL);
7064
0
  assert(w != NULL);
7065
0
  assert(c != NULL);
7066
7067
0
  float* c0 = c;
7068
7069
0
  do {
7070
0
    float vacc00 = w[0];
7071
0
    float vacc01 = w[1];
7072
0
    float vacc02 = w[2];
7073
0
    float vacc03 = w[3];
7074
0
    w += 4;
7075
7076
0
    size_t p = ks;
7077
0
    do {
7078
0
      const float* restrict a0 = a[0];
7079
0
      assert(a0 != NULL);
7080
0
      if XNN_UNPREDICTABLE(a0 != zero) {
7081
0
        a0 = (const float*) ((uintptr_t) a0 + a_offset);
7082
0
      }
7083
0
      a += 1;
7084
7085
0
      size_t k = kc;
7086
0
      do {
7087
0
        const float va0 = *a0++;
7088
7089
0
        const float vb0 = w[0];
7090
0
        const float vb1 = w[1];
7091
0
        const float vb2 = w[2];
7092
0
        const float vb3 = w[3];
7093
0
        w += 4;
7094
7095
0
        vacc00 = math_muladd_f32(va0, vb0, vacc00);
7096
0
        vacc01 = math_muladd_f32(va0, vb1, vacc01);
7097
0
        vacc02 = math_muladd_f32(va0, vb2, vacc02);
7098
0
        vacc03 = math_muladd_f32(va0, vb3, vacc03);
7099
7100
0
        k -= sizeof(float);
7101
0
      } while (k != 0);
7102
0
      p -= 1 * sizeof(void*);
7103
0
    } while (p != 0);
7104
7105
0
    vacc00 = math_max_f32(vacc00, 0.0f);
7106
0
    vacc01 = math_max_f32(vacc01, 0.0f);
7107
0
    vacc02 = math_max_f32(vacc02, 0.0f);
7108
0
    vacc03 = math_max_f32(vacc03, 0.0f);
7109
7110
0
    if XNN_LIKELY(nc >= 4) {
7111
0
      c0[0] = vacc00;
7112
0
      c0[1] = vacc01;
7113
0
      c0[2] = vacc02;
7114
0
      c0[3] = vacc03;
7115
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
7116
7117
0
      a = (const float**restrict) ((uintptr_t) a - ks);
7118
0
      nc -= 4;
7119
0
    } else {
7120
0
      if (nc & 2) {
7121
0
        c0[0] = vacc00;
7122
0
        c0[1] = vacc01;
7123
0
        vacc00 = vacc02;
7124
0
        c0 += 2;
7125
0
      }
7126
0
      if (nc & 1) {
7127
0
        c0[0] = vacc00;
7128
0
      }
7129
7130
0
      nc = 0;
7131
0
    }
7132
0
  } while (nc != 0);
7133
0
}
7134
7135
void xnn_f32_igemm_ukernel_1x4__scalar(
7136
    size_t mr,
7137
    size_t nc,
7138
    size_t kc,
7139
    size_t ks,
7140
    const float** restrict a,
7141
    const float* restrict w,
7142
    float* restrict c,
7143
    size_t cm_stride,
7144
    size_t cn_stride,
7145
    size_t a_offset,
7146
    const float* zero,
7147
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
7148
0
{
7149
0
  assert(mr != 0);
7150
0
  assert(mr <= 1);
7151
0
  assert(nc != 0);
7152
0
  assert(kc != 0);
7153
0
  assert(kc % sizeof(float) == 0);
7154
0
  assert(ks != 0);
7155
0
  assert(ks % (1 * sizeof(void*)) == 0);
7156
0
  assert(a_offset % sizeof(float) == 0);
7157
0
  assert(a != NULL);
7158
0
  assert(w != NULL);
7159
0
  assert(c != NULL);
7160
7161
0
  float* c0 = c;
7162
7163
0
  do {
7164
0
    float vacc00 = w[0];
7165
0
    float vacc01 = w[1];
7166
0
    float vacc02 = w[2];
7167
0
    float vacc03 = w[3];
7168
0
    w += 4;
7169
7170
0
    size_t p = ks;
7171
0
    do {
7172
0
      const float* restrict a0 = a[0];
7173
0
      assert(a0 != NULL);
7174
0
      if XNN_UNPREDICTABLE(a0 != zero) {
7175
0
        a0 = (const float*) ((uintptr_t) a0 + a_offset);
7176
0
      }
7177
0
      a += 1;
7178
7179
0
      size_t k = kc;
7180
0
      do {
7181
0
        const float va0 = *a0++;
7182
7183
0
        const float vb0 = w[0];
7184
0
        const float vb1 = w[1];
7185
0
        const float vb2 = w[2];
7186
0
        const float vb3 = w[3];
7187
0
        w += 4;
7188
7189
0
        vacc00 = math_muladd_f32(va0, vb0, vacc00);
7190
0
        vacc01 = math_muladd_f32(va0, vb1, vacc01);
7191
0
        vacc02 = math_muladd_f32(va0, vb2, vacc02);
7192
0
        vacc03 = math_muladd_f32(va0, vb3, vacc03);
7193
7194
0
        k -= sizeof(float);
7195
0
      } while (k != 0);
7196
0
      p -= 1 * sizeof(void*);
7197
0
    } while (p != 0);
7198
7199
7200
0
    if XNN_LIKELY(nc >= 4) {
7201
0
      c0[0] = vacc00;
7202
0
      c0[1] = vacc01;
7203
0
      c0[2] = vacc02;
7204
0
      c0[3] = vacc03;
7205
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
7206
7207
0
      a = (const float**restrict) ((uintptr_t) a - ks);
7208
0
      nc -= 4;
7209
0
    } else {
7210
0
      if (nc & 2) {
7211
0
        c0[0] = vacc00;
7212
0
        c0[1] = vacc01;
7213
0
        vacc00 = vacc02;
7214
0
        c0 += 2;
7215
0
      }
7216
0
      if (nc & 1) {
7217
0
        c0[0] = vacc00;
7218
0
      }
7219
7220
0
      nc = 0;
7221
0
    }
7222
0
  } while (nc != 0);
7223
0
}
7224
7225
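// Editor's annotation (not part of scalar.c): the multi-row IGEMM tiles that follow (2x4, 4x2,
// 4x4) extend the 1x4 pattern. Each ks step consumes 2 or 4 indirection pointers (one per output
// row), the packed bias is broadcast down the rows (vacc1x/vacc2x/vacc3x start as copies of
// vacc0x), and the inner loop keeps one accumulator per (row, column) pair.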
void xnn_f32_igemm_minmax_ukernel_2x4__scalar(
7226
    size_t mr,
7227
    size_t nc,
7228
    size_t kc,
7229
    size_t ks,
7230
    const float** restrict a,
7231
    const float* restrict w,
7232
    float* restrict c,
7233
    size_t cm_stride,
7234
    size_t cn_stride,
7235
    size_t a_offset,
7236
    const float* zero,
7237
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
7238
0
{
7239
0
  assert(mr != 0);
7240
0
  assert(mr <= 2);
7241
0
  assert(nc != 0);
7242
0
  assert(kc != 0);
7243
0
  assert(kc % sizeof(float) == 0);
7244
0
  assert(ks != 0);
7245
0
  assert(ks % (2 * sizeof(void*)) == 0);
7246
0
  assert(a_offset % sizeof(float) == 0);
7247
0
  assert(a != NULL);
7248
0
  assert(w != NULL);
7249
0
  assert(c != NULL);
7250
7251
0
  float* c0 = c;
7252
0
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
7253
0
  if XNN_UNPREDICTABLE(mr != 2) {
7254
0
    c1 = c0;
7255
0
  }
7256
7257
0
  const float vmin = params->scalar.min;
7258
0
  const float vmax = params->scalar.max;
7259
0
  do {
7260
0
    float vacc00 = w[0];
7261
0
    float vacc01 = w[1];
7262
0
    float vacc02 = w[2];
7263
0
    float vacc03 = w[3];
7264
0
    float vacc10 = vacc00;
7265
0
    float vacc11 = vacc01;
7266
0
    float vacc12 = vacc02;
7267
0
    float vacc13 = vacc03;
7268
0
    w += 4;
7269
7270
0
    size_t p = ks;
7271
0
    do {
7272
0
      const float* restrict a0 = a[0];
7273
0
      assert(a0 != NULL);
7274
0
      if XNN_UNPREDICTABLE(a0 != zero) {
7275
0
        a0 = (const float*) ((uintptr_t) a0 + a_offset);
7276
0
      }
7277
0
      const float* restrict a1 = a[1];
7278
0
      assert(a1 != NULL);
7279
0
      if XNN_UNPREDICTABLE(a1 != zero) {
7280
0
        a1 = (const float*) ((uintptr_t) a1 + a_offset);
7281
0
      }
7282
0
      a += 2;
7283
7284
0
      size_t k = kc;
7285
0
      do {
7286
0
        const float va0 = *a0++;
7287
0
        const float va1 = *a1++;
7288
7289
0
        const float vb0 = w[0];
7290
0
        const float vb1 = w[1];
7291
0
        const float vb2 = w[2];
7292
0
        const float vb3 = w[3];
7293
0
        w += 4;
7294
7295
0
        vacc00 = math_muladd_f32(va0, vb0, vacc00);
7296
0
        vacc01 = math_muladd_f32(va0, vb1, vacc01);
7297
0
        vacc02 = math_muladd_f32(va0, vb2, vacc02);
7298
0
        vacc03 = math_muladd_f32(va0, vb3, vacc03);
7299
0
        vacc10 = math_muladd_f32(va1, vb0, vacc10);
7300
0
        vacc11 = math_muladd_f32(va1, vb1, vacc11);
7301
0
        vacc12 = math_muladd_f32(va1, vb2, vacc12);
7302
0
        vacc13 = math_muladd_f32(va1, vb3, vacc13);
7303
7304
0
        k -= sizeof(float);
7305
0
      } while (k != 0);
7306
0
      p -= 2 * sizeof(void*);
7307
0
    } while (p != 0);
7308
7309
0
    vacc00 = math_max_f32(vacc00, vmin);
7310
0
    vacc01 = math_max_f32(vacc01, vmin);
7311
0
    vacc02 = math_max_f32(vacc02, vmin);
7312
0
    vacc03 = math_max_f32(vacc03, vmin);
7313
0
    vacc10 = math_max_f32(vacc10, vmin);
7314
0
    vacc11 = math_max_f32(vacc11, vmin);
7315
0
    vacc12 = math_max_f32(vacc12, vmin);
7316
0
    vacc13 = math_max_f32(vacc13, vmin);
7317
7318
0
    vacc00 = math_min_f32(vacc00, vmax);
7319
0
    vacc01 = math_min_f32(vacc01, vmax);
7320
0
    vacc02 = math_min_f32(vacc02, vmax);
7321
0
    vacc03 = math_min_f32(vacc03, vmax);
7322
0
    vacc10 = math_min_f32(vacc10, vmax);
7323
0
    vacc11 = math_min_f32(vacc11, vmax);
7324
0
    vacc12 = math_min_f32(vacc12, vmax);
7325
0
    vacc13 = math_min_f32(vacc13, vmax);
7326
7327
0
    if XNN_LIKELY(nc >= 4) {
7328
0
      c1[0] = vacc10;
7329
0
      c1[1] = vacc11;
7330
0
      c1[2] = vacc12;
7331
0
      c1[3] = vacc13;
7332
0
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
7333
0
      c0[0] = vacc00;
7334
0
      c0[1] = vacc01;
7335
0
      c0[2] = vacc02;
7336
0
      c0[3] = vacc03;
7337
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
7338
7339
0
      a = (const float**restrict) ((uintptr_t) a - ks);
7340
0
      nc -= 4;
7341
0
    } else {
7342
0
      if (nc & 2) {
7343
0
        c1[0] = vacc10;
7344
0
        c1[1] = vacc11;
7345
0
        vacc10 = vacc12;
7346
0
        c1 += 2;
7347
0
        c0[0] = vacc00;
7348
0
        c0[1] = vacc01;
7349
0
        vacc00 = vacc02;
7350
0
        c0 += 2;
7351
0
      }
7352
0
      if (nc & 1) {
7353
0
        c1[0] = vacc10;
7354
0
        c0[0] = vacc00;
7355
0
      }
7356
7357
0
      nc = 0;
7358
0
    }
7359
0
  } while (nc != 0);
7360
0
}
7361
7362
void xnn_f32_igemm_relu_ukernel_2x4__scalar(
7363
    size_t mr,
7364
    size_t nc,
7365
    size_t kc,
7366
    size_t ks,
7367
    const float** restrict a,
7368
    const float* restrict w,
7369
    float* restrict c,
7370
    size_t cm_stride,
7371
    size_t cn_stride,
7372
    size_t a_offset,
7373
    const float* zero,
7374
    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
7375
0
{
7376
0
  assert(mr != 0);
7377
0
  assert(mr <= 2);
7378
0
  assert(nc != 0);
7379
0
  assert(kc != 0);
7380
0
  assert(kc % sizeof(float) == 0);
7381
0
  assert(ks != 0);
7382
0
  assert(ks % (2 * sizeof(void*)) == 0);
7383
0
  assert(a_offset % sizeof(float) == 0);
7384
0
  assert(a != NULL);
7385
0
  assert(w != NULL);
7386
0
  assert(c != NULL);
7387
7388
0
  float* c0 = c;
7389
0
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
7390
0
  if XNN_UNPREDICTABLE(mr != 2) {
7391
0
    c1 = c0;
7392
0
  }
7393
7394
0
  do {
7395
0
    float vacc00 = w[0];
7396
0
    float vacc01 = w[1];
7397
0
    float vacc02 = w[2];
7398
0
    float vacc03 = w[3];
7399
0
    float vacc10 = vacc00;
7400
0
    float vacc11 = vacc01;
7401
0
    float vacc12 = vacc02;
7402
0
    float vacc13 = vacc03;
7403
0
    w += 4;
7404
7405
0
    size_t p = ks;
7406
0
    do {
7407
0
      const float* restrict a0 = a[0];
7408
0
      assert(a0 != NULL);
7409
0
      if XNN_UNPREDICTABLE(a0 != zero) {
7410
0
        a0 = (const float*) ((uintptr_t) a0 + a_offset);
7411
0
      }
7412
0
      const float* restrict a1 = a[1];
7413
0
      assert(a1 != NULL);
7414
0
      if XNN_UNPREDICTABLE(a1 != zero) {
7415
0
        a1 = (const float*) ((uintptr_t) a1 + a_offset);
7416
0
      }
7417
0
      a += 2;
7418
7419
0
      size_t k = kc;
7420
0
      do {
7421
0
        const float va0 = *a0++;
7422
0
        const float va1 = *a1++;
7423
7424
0
        const float vb0 = w[0];
7425
0
        const float vb1 = w[1];
7426
0
        const float vb2 = w[2];
7427
0
        const float vb3 = w[3];
7428
0
        w += 4;
7429
7430
0
        vacc00 = math_muladd_f32(va0, vb0, vacc00);
7431
0
        vacc01 = math_muladd_f32(va0, vb1, vacc01);
7432
0
        vacc02 = math_muladd_f32(va0, vb2, vacc02);
7433
0
        vacc03 = math_muladd_f32(va0, vb3, vacc03);
7434
0
        vacc10 = math_muladd_f32(va1, vb0, vacc10);
7435
0
        vacc11 = math_muladd_f32(va1, vb1, vacc11);
7436
0
        vacc12 = math_muladd_f32(va1, vb2, vacc12);
7437
0
        vacc13 = math_muladd_f32(va1, vb3, vacc13);
7438
7439
0
        k -= sizeof(float);
7440
0
      } while (k != 0);
7441
0
      p -= 2 * sizeof(void*);
7442
0
    } while (p != 0);
7443
7444
0
    vacc00 = math_max_f32(vacc00, 0.0f);
7445
0
    vacc01 = math_max_f32(vacc01, 0.0f);
7446
0
    vacc02 = math_max_f32(vacc02, 0.0f);
7447
0
    vacc03 = math_max_f32(vacc03, 0.0f);
7448
0
    vacc10 = math_max_f32(vacc10, 0.0f);
7449
0
    vacc11 = math_max_f32(vacc11, 0.0f);
7450
0
    vacc12 = math_max_f32(vacc12, 0.0f);
7451
0
    vacc13 = math_max_f32(vacc13, 0.0f);
7452
7453
0
    if XNN_LIKELY(nc >= 4) {
7454
0
      c1[0] = vacc10;
7455
0
      c1[1] = vacc11;
7456
0
      c1[2] = vacc12;
7457
0
      c1[3] = vacc13;
7458
0
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
7459
0
      c0[0] = vacc00;
7460
0
      c0[1] = vacc01;
7461
0
      c0[2] = vacc02;
7462
0
      c0[3] = vacc03;
7463
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
7464
7465
0
      a = (const float**restrict) ((uintptr_t) a - ks);
7466
0
      nc -= 4;
7467
0
    } else {
7468
0
      if (nc & 2) {
7469
0
        c1[0] = vacc10;
7470
0
        c1[1] = vacc11;
7471
0
        vacc10 = vacc12;
7472
0
        c1 += 2;
7473
0
        c0[0] = vacc00;
7474
0
        c0[1] = vacc01;
7475
0
        vacc00 = vacc02;
7476
0
        c0 += 2;
7477
0
      }
7478
0
      if (nc & 1) {
7479
0
        c1[0] = vacc10;
7480
0
        c0[0] = vacc00;
7481
0
      }
7482
7483
0
      nc = 0;
7484
0
    }
7485
0
  } while (nc != 0);
7486
0
}
7487
7488
void xnn_f32_igemm_ukernel_2x4__scalar(
7489
    size_t mr,
7490
    size_t nc,
7491
    size_t kc,
7492
    size_t ks,
7493
    const float** restrict a,
7494
    const float* restrict w,
7495
    float* restrict c,
7496
    size_t cm_stride,
7497
    size_t cn_stride,
7498
    size_t a_offset,
7499
    const float* zero,
7500
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
7501
0
{
7502
0
  assert(mr != 0);
7503
0
  assert(mr <= 2);
7504
0
  assert(nc != 0);
7505
0
  assert(kc != 0);
7506
0
  assert(kc % sizeof(float) == 0);
7507
0
  assert(ks != 0);
7508
0
  assert(ks % (2 * sizeof(void*)) == 0);
7509
0
  assert(a_offset % sizeof(float) == 0);
7510
0
  assert(a != NULL);
7511
0
  assert(w != NULL);
7512
0
  assert(c != NULL);
7513
7514
0
  float* c0 = c;
7515
0
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
7516
0
  if XNN_UNPREDICTABLE(mr != 2) {
7517
0
    c1 = c0;
7518
0
  }
7519
7520
0
  do {
7521
0
    float vacc00 = w[0];
7522
0
    float vacc01 = w[1];
7523
0
    float vacc02 = w[2];
7524
0
    float vacc03 = w[3];
7525
0
    float vacc10 = vacc00;
7526
0
    float vacc11 = vacc01;
7527
0
    float vacc12 = vacc02;
7528
0
    float vacc13 = vacc03;
7529
0
    w += 4;
7530
7531
0
    size_t p = ks;
7532
0
    do {
7533
0
      const float* restrict a0 = a[0];
7534
0
      assert(a0 != NULL);
7535
0
      if XNN_UNPREDICTABLE(a0 != zero) {
7536
0
        a0 = (const float*) ((uintptr_t) a0 + a_offset);
7537
0
      }
7538
0
      const float* restrict a1 = a[1];
7539
0
      assert(a1 != NULL);
7540
0
      if XNN_UNPREDICTABLE(a1 != zero) {
7541
0
        a1 = (const float*) ((uintptr_t) a1 + a_offset);
7542
0
      }
7543
0
      a += 2;
7544
7545
0
      size_t k = kc;
7546
0
      do {
7547
0
        const float va0 = *a0++;
7548
0
        const float va1 = *a1++;
7549
7550
0
        const float vb0 = w[0];
7551
0
        const float vb1 = w[1];
7552
0
        const float vb2 = w[2];
7553
0
        const float vb3 = w[3];
7554
0
        w += 4;
7555
7556
0
        vacc00 = math_muladd_f32(va0, vb0, vacc00);
7557
0
        vacc01 = math_muladd_f32(va0, vb1, vacc01);
7558
0
        vacc02 = math_muladd_f32(va0, vb2, vacc02);
7559
0
        vacc03 = math_muladd_f32(va0, vb3, vacc03);
7560
0
        vacc10 = math_muladd_f32(va1, vb0, vacc10);
7561
0
        vacc11 = math_muladd_f32(va1, vb1, vacc11);
7562
0
        vacc12 = math_muladd_f32(va1, vb2, vacc12);
7563
0
        vacc13 = math_muladd_f32(va1, vb3, vacc13);
7564
7565
0
        k -= sizeof(float);
7566
0
      } while (k != 0);
7567
0
      p -= 2 * sizeof(void*);
7568
0
    } while (p != 0);
7569
7570
7571
0
    if XNN_LIKELY(nc >= 4) {
7572
0
      c1[0] = vacc10;
7573
0
      c1[1] = vacc11;
7574
0
      c1[2] = vacc12;
7575
0
      c1[3] = vacc13;
7576
0
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
7577
0
      c0[0] = vacc00;
7578
0
      c0[1] = vacc01;
7579
0
      c0[2] = vacc02;
7580
0
      c0[3] = vacc03;
7581
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
7582
7583
0
      a = (const float**restrict) ((uintptr_t) a - ks);
7584
0
      nc -= 4;
7585
0
    } else {
7586
0
      if (nc & 2) {
7587
0
        c1[0] = vacc10;
7588
0
        c1[1] = vacc11;
7589
0
        vacc10 = vacc12;
7590
0
        c1 += 2;
7591
0
        c0[0] = vacc00;
7592
0
        c0[1] = vacc01;
7593
0
        vacc00 = vacc02;
7594
0
        c0 += 2;
7595
0
      }
7596
0
      if (nc & 1) {
7597
0
        c1[0] = vacc10;
7598
0
        c0[0] = vacc00;
7599
0
      }
7600
7601
0
      nc = 0;
7602
0
    }
7603
0
  } while (nc != 0);
7604
0
}
7605
7606
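// Editor's annotation (not part of scalar.c): the 4-row IGEMM tiles below derive the output row
// pointers from mr. When mr is smaller than the tile height, c1/c2/c3 collapse onto the previous
// row, so the stores for the missing rows alias a lower row and are overwritten by that row's own
// store, which always comes last. The nc remainder is handled by the (nc & 2) / (nc & 1) branches
// at the end of each kernel.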
void xnn_f32_igemm_minmax_ukernel_4x2__scalar(
7607
    size_t mr,
7608
    size_t nc,
7609
    size_t kc,
7610
    size_t ks,
7611
    const float** restrict a,
7612
    const float* restrict w,
7613
    float* restrict c,
7614
    size_t cm_stride,
7615
    size_t cn_stride,
7616
    size_t a_offset,
7617
    const float* zero,
7618
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
7619
0
{
7620
0
  assert(mr != 0);
7621
0
  assert(mr <= 4);
7622
0
  assert(nc != 0);
7623
0
  assert(kc != 0);
7624
0
  assert(kc % sizeof(float) == 0);
7625
0
  assert(ks != 0);
7626
0
  assert(ks % (4 * sizeof(void*)) == 0);
7627
0
  assert(a_offset % sizeof(float) == 0);
7628
0
  assert(a != NULL);
7629
0
  assert(w != NULL);
7630
0
  assert(c != NULL);
7631
7632
0
  float* c0 = c;
7633
0
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
7634
0
  if XNN_UNPREDICTABLE(mr < 2) {
7635
0
    c1 = c0;
7636
0
  }
7637
0
  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
7638
0
  if XNN_UNPREDICTABLE(mr <= 2) {
7639
0
    c2 = c1;
7640
0
  }
7641
0
  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
7642
0
  if XNN_UNPREDICTABLE(mr != 4) {
7643
0
    c3 = c2;
7644
0
  }
7645
7646
0
  const float vmin = params->scalar.min;
7647
0
  const float vmax = params->scalar.max;
7648
0
  do {
7649
0
    float vacc00 = w[0];
7650
0
    float vacc01 = w[1];
7651
0
    float vacc10 = vacc00;
7652
0
    float vacc11 = vacc01;
7653
0
    float vacc20 = vacc00;
7654
0
    float vacc21 = vacc01;
7655
0
    float vacc30 = vacc00;
7656
0
    float vacc31 = vacc01;
7657
0
    w += 2;
7658
7659
0
    size_t p = ks;
7660
0
    do {
7661
0
      const float* restrict a0 = a[0];
7662
0
      assert(a0 != NULL);
7663
0
      if XNN_UNPREDICTABLE(a0 != zero) {
7664
0
        a0 = (const float*) ((uintptr_t) a0 + a_offset);
7665
0
      }
7666
0
      const float* restrict a1 = a[1];
7667
0
      assert(a1 != NULL);
7668
0
      if XNN_UNPREDICTABLE(a1 != zero) {
7669
0
        a1 = (const float*) ((uintptr_t) a1 + a_offset);
7670
0
      }
7671
0
      const float* restrict a2 = a[2];
7672
0
      assert(a2 != NULL);
7673
0
      if XNN_UNPREDICTABLE(a2 != zero) {
7674
0
        a2 = (const float*) ((uintptr_t) a2 + a_offset);
7675
0
      }
7676
0
      const float* restrict a3 = a[3];
7677
0
      assert(a3 != NULL);
7678
0
      if XNN_UNPREDICTABLE(a3 != zero) {
7679
0
        a3 = (const float*) ((uintptr_t) a3 + a_offset);
7680
0
      }
7681
0
      a += 4;
7682
7683
0
      size_t k = kc;
7684
0
      do {
7685
0
        const float va0 = *a0++;
7686
0
        const float va1 = *a1++;
7687
0
        const float va2 = *a2++;
7688
0
        const float va3 = *a3++;
7689
7690
0
        const float vb0 = w[0];
7691
0
        const float vb1 = w[1];
7692
0
        w += 2;
7693
7694
0
        vacc00 = math_muladd_f32(va0, vb0, vacc00);
7695
0
        vacc01 = math_muladd_f32(va0, vb1, vacc01);
7696
0
        vacc10 = math_muladd_f32(va1, vb0, vacc10);
7697
0
        vacc11 = math_muladd_f32(va1, vb1, vacc11);
7698
0
        vacc20 = math_muladd_f32(va2, vb0, vacc20);
7699
0
        vacc21 = math_muladd_f32(va2, vb1, vacc21);
7700
0
        vacc30 = math_muladd_f32(va3, vb0, vacc30);
7701
0
        vacc31 = math_muladd_f32(va3, vb1, vacc31);
7702
7703
0
        k -= sizeof(float);
7704
0
      } while (k != 0);
7705
0
      p -= 4 * sizeof(void*);
7706
0
    } while (p != 0);
7707
7708
0
    vacc00 = math_max_f32(vacc00, vmin);
7709
0
    vacc01 = math_max_f32(vacc01, vmin);
7710
0
    vacc10 = math_max_f32(vacc10, vmin);
7711
0
    vacc11 = math_max_f32(vacc11, vmin);
7712
0
    vacc20 = math_max_f32(vacc20, vmin);
7713
0
    vacc21 = math_max_f32(vacc21, vmin);
7714
0
    vacc30 = math_max_f32(vacc30, vmin);
7715
0
    vacc31 = math_max_f32(vacc31, vmin);
7716
7717
0
    vacc00 = math_min_f32(vacc00, vmax);
7718
0
    vacc01 = math_min_f32(vacc01, vmax);
7719
0
    vacc10 = math_min_f32(vacc10, vmax);
7720
0
    vacc11 = math_min_f32(vacc11, vmax);
7721
0
    vacc20 = math_min_f32(vacc20, vmax);
7722
0
    vacc21 = math_min_f32(vacc21, vmax);
7723
0
    vacc30 = math_min_f32(vacc30, vmax);
7724
0
    vacc31 = math_min_f32(vacc31, vmax);
7725
7726
0
    if XNN_LIKELY(nc >= 2) {
7727
0
      c3[0] = vacc30;
7728
0
      c3[1] = vacc31;
7729
0
      c3 = (float*) ((uintptr_t) c3 + cn_stride);
7730
0
      c2[0] = vacc20;
7731
0
      c2[1] = vacc21;
7732
0
      c2 = (float*) ((uintptr_t) c2 + cn_stride);
7733
0
      c1[0] = vacc10;
7734
0
      c1[1] = vacc11;
7735
0
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
7736
0
      c0[0] = vacc00;
7737
0
      c0[1] = vacc01;
7738
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
7739
7740
0
      a = (const float**restrict) ((uintptr_t) a - ks);
7741
0
      nc -= 2;
7742
0
    } else {
7743
0
      if (nc & 1) {
7744
0
        c3[0] = vacc30;
7745
0
        c2[0] = vacc20;
7746
0
        c1[0] = vacc10;
7747
0
        c0[0] = vacc00;
7748
0
      }
7749
7750
0
      nc = 0;
7751
0
    }
7752
0
  } while (nc != 0);
7753
0
}
7754
7755
void xnn_f32_igemm_ukernel_4x2__scalar(
7756
    size_t mr,
7757
    size_t nc,
7758
    size_t kc,
7759
    size_t ks,
7760
    const float** restrict a,
7761
    const float* restrict w,
7762
    float* restrict c,
7763
    size_t cm_stride,
7764
    size_t cn_stride,
7765
    size_t a_offset,
7766
    const float* zero,
7767
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
7768
0
{
7769
0
  assert(mr != 0);
7770
0
  assert(mr <= 4);
7771
0
  assert(nc != 0);
7772
0
  assert(kc != 0);
7773
0
  assert(kc % sizeof(float) == 0);
7774
0
  assert(ks != 0);
7775
0
  assert(ks % (4 * sizeof(void*)) == 0);
7776
0
  assert(a_offset % sizeof(float) == 0);
7777
0
  assert(a != NULL);
7778
0
  assert(w != NULL);
7779
0
  assert(c != NULL);
7780
7781
0
  float* c0 = c;
7782
0
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
7783
0
  if XNN_UNPREDICTABLE(mr < 2) {
7784
0
    c1 = c0;
7785
0
  }
7786
0
  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
7787
0
  if XNN_UNPREDICTABLE(mr <= 2) {
7788
0
    c2 = c1;
7789
0
  }
7790
0
  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
7791
0
  if XNN_UNPREDICTABLE(mr != 4) {
7792
0
    c3 = c2;
7793
0
  }
7794
7795
0
  do {
7796
0
    float vacc00 = w[0];
7797
0
    float vacc01 = w[1];
7798
0
    float vacc10 = vacc00;
7799
0
    float vacc11 = vacc01;
7800
0
    float vacc20 = vacc00;
7801
0
    float vacc21 = vacc01;
7802
0
    float vacc30 = vacc00;
7803
0
    float vacc31 = vacc01;
7804
0
    w += 2;
7805
7806
0
    size_t p = ks;
7807
0
    do {
7808
0
      const float* restrict a0 = a[0];
7809
0
      assert(a0 != NULL);
7810
0
      if XNN_UNPREDICTABLE(a0 != zero) {
7811
0
        a0 = (const float*) ((uintptr_t) a0 + a_offset);
7812
0
      }
7813
0
      const float* restrict a1 = a[1];
7814
0
      assert(a1 != NULL);
7815
0
      if XNN_UNPREDICTABLE(a1 != zero) {
7816
0
        a1 = (const float*) ((uintptr_t) a1 + a_offset);
7817
0
      }
7818
0
      const float* restrict a2 = a[2];
7819
0
      assert(a2 != NULL);
7820
0
      if XNN_UNPREDICTABLE(a2 != zero) {
7821
0
        a2 = (const float*) ((uintptr_t) a2 + a_offset);
7822
0
      }
7823
0
      const float* restrict a3 = a[3];
7824
0
      assert(a3 != NULL);
7825
0
      if XNN_UNPREDICTABLE(a3 != zero) {
7826
0
        a3 = (const float*) ((uintptr_t) a3 + a_offset);
7827
0
      }
7828
0
      a += 4;
7829
7830
0
      size_t k = kc;
7831
0
      do {
7832
0
        const float va0 = *a0++;
7833
0
        const float va1 = *a1++;
7834
0
        const float va2 = *a2++;
7835
0
        const float va3 = *a3++;
7836
7837
0
        const float vb0 = w[0];
7838
0
        const float vb1 = w[1];
7839
0
        w += 2;
7840
7841
0
        vacc00 = math_muladd_f32(va0, vb0, vacc00);
7842
0
        vacc01 = math_muladd_f32(va0, vb1, vacc01);
7843
0
        vacc10 = math_muladd_f32(va1, vb0, vacc10);
7844
0
        vacc11 = math_muladd_f32(va1, vb1, vacc11);
7845
0
        vacc20 = math_muladd_f32(va2, vb0, vacc20);
7846
0
        vacc21 = math_muladd_f32(va2, vb1, vacc21);
7847
0
        vacc30 = math_muladd_f32(va3, vb0, vacc30);
7848
0
        vacc31 = math_muladd_f32(va3, vb1, vacc31);
7849
7850
0
        k -= sizeof(float);
7851
0
      } while (k != 0);
7852
0
      p -= 4 * sizeof(void*);
7853
0
    } while (p != 0);
7854
7855
7856
0
    if XNN_LIKELY(nc >= 2) {
7857
0
      c3[0] = vacc30;
7858
0
      c3[1] = vacc31;
7859
0
      c3 = (float*) ((uintptr_t) c3 + cn_stride);
7860
0
      c2[0] = vacc20;
7861
0
      c2[1] = vacc21;
7862
0
      c2 = (float*) ((uintptr_t) c2 + cn_stride);
7863
0
      c1[0] = vacc10;
7864
0
      c1[1] = vacc11;
7865
0
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
7866
0
      c0[0] = vacc00;
7867
0
      c0[1] = vacc01;
7868
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
7869
7870
0
      a = (const float**restrict) ((uintptr_t) a - ks);
7871
0
      nc -= 2;
7872
0
    } else {
7873
0
      if (nc & 1) {
7874
0
        c3[0] = vacc30;
7875
0
        c2[0] = vacc20;
7876
0
        c1[0] = vacc10;
7877
0
        c0[0] = vacc00;
7878
0
      }
7879
7880
0
      nc = 0;
7881
0
    }
7882
0
  } while (nc != 0);
7883
0
}
7884
7885
void xnn_f32_igemm_minmax_ukernel_4x4__scalar(
7886
    size_t mr,
7887
    size_t nc,
7888
    size_t kc,
7889
    size_t ks,
7890
    const float** restrict a,
7891
    const float* restrict w,
7892
    float* restrict c,
7893
    size_t cm_stride,
7894
    size_t cn_stride,
7895
    size_t a_offset,
7896
    const float* zero,
7897
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
7898
0
{
7899
0
  assert(mr != 0);
7900
0
  assert(mr <= 4);
7901
0
  assert(nc != 0);
7902
0
  assert(kc != 0);
7903
0
  assert(kc % sizeof(float) == 0);
7904
0
  assert(ks != 0);
7905
0
  assert(ks % (4 * sizeof(void*)) == 0);
7906
0
  assert(a_offset % sizeof(float) == 0);
7907
0
  assert(a != NULL);
7908
0
  assert(w != NULL);
7909
0
  assert(c != NULL);
7910
7911
0
  float* c0 = c;
7912
0
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
7913
0
  if XNN_UNPREDICTABLE(mr < 2) {
7914
0
    c1 = c0;
7915
0
  }
7916
0
  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
7917
0
  if XNN_UNPREDICTABLE(mr <= 2) {
7918
0
    c2 = c1;
7919
0
  }
7920
0
  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
7921
0
  if XNN_UNPREDICTABLE(mr != 4) {
7922
0
    c3 = c2;
7923
0
  }
7924
7925
0
  const float vmin = params->scalar.min;
7926
0
  const float vmax = params->scalar.max;
7927
0
  do {
7928
0
    float vacc00 = w[0];
7929
0
    float vacc01 = w[1];
7930
0
    float vacc02 = w[2];
7931
0
    float vacc03 = w[3];
7932
0
    float vacc10 = vacc00;
7933
0
    float vacc11 = vacc01;
7934
0
    float vacc12 = vacc02;
7935
0
    float vacc13 = vacc03;
7936
0
    float vacc20 = vacc00;
7937
0
    float vacc21 = vacc01;
7938
0
    float vacc22 = vacc02;
7939
0
    float vacc23 = vacc03;
7940
0
    float vacc30 = vacc00;
7941
0
    float vacc31 = vacc01;
7942
0
    float vacc32 = vacc02;
7943
0
    float vacc33 = vacc03;
7944
0
    w += 4;
7945
7946
0
    size_t p = ks;
7947
0
    do {
7948
0
      const float* restrict a0 = a[0];
7949
0
      assert(a0 != NULL);
7950
0
      if XNN_UNPREDICTABLE(a0 != zero) {
7951
0
        a0 = (const float*) ((uintptr_t) a0 + a_offset);
7952
0
      }
7953
0
      const float* restrict a1 = a[1];
7954
0
      assert(a1 != NULL);
7955
0
      if XNN_UNPREDICTABLE(a1 != zero) {
7956
0
        a1 = (const float*) ((uintptr_t) a1 + a_offset);
7957
0
      }
7958
0
      const float* restrict a2 = a[2];
7959
0
      assert(a2 != NULL);
7960
0
      if XNN_UNPREDICTABLE(a2 != zero) {
7961
0
        a2 = (const float*) ((uintptr_t) a2 + a_offset);
7962
0
      }
7963
0
      const float* restrict a3 = a[3];
7964
0
      assert(a3 != NULL);
7965
0
      if XNN_UNPREDICTABLE(a3 != zero) {
7966
0
        a3 = (const float*) ((uintptr_t) a3 + a_offset);
7967
0
      }
7968
0
      a += 4;
7969
7970
0
      size_t k = kc;
7971
0
      do {
7972
0
        const float va0 = *a0++;
7973
0
        const float va1 = *a1++;
7974
0
        const float va2 = *a2++;
7975
0
        const float va3 = *a3++;
7976
7977
0
        const float vb0 = w[0];
7978
0
        const float vb1 = w[1];
7979
0
        const float vb2 = w[2];
7980
0
        const float vb3 = w[3];
7981
0
        w += 4;
7982
7983
0
        vacc00 = math_muladd_f32(va0, vb0, vacc00);
7984
0
        vacc01 = math_muladd_f32(va0, vb1, vacc01);
7985
0
        vacc02 = math_muladd_f32(va0, vb2, vacc02);
7986
0
        vacc03 = math_muladd_f32(va0, vb3, vacc03);
7987
0
        vacc10 = math_muladd_f32(va1, vb0, vacc10);
7988
0
        vacc11 = math_muladd_f32(va1, vb1, vacc11);
7989
0
        vacc12 = math_muladd_f32(va1, vb2, vacc12);
7990
0
        vacc13 = math_muladd_f32(va1, vb3, vacc13);
7991
0
        vacc20 = math_muladd_f32(va2, vb0, vacc20);
7992
0
        vacc21 = math_muladd_f32(va2, vb1, vacc21);
7993
0
        vacc22 = math_muladd_f32(va2, vb2, vacc22);
7994
0
        vacc23 = math_muladd_f32(va2, vb3, vacc23);
7995
0
        vacc30 = math_muladd_f32(va3, vb0, vacc30);
7996
0
        vacc31 = math_muladd_f32(va3, vb1, vacc31);
7997
0
        vacc32 = math_muladd_f32(va3, vb2, vacc32);
7998
0
        vacc33 = math_muladd_f32(va3, vb3, vacc33);
7999
8000
0
        k -= sizeof(float);
8001
0
      } while (k != 0);
8002
0
      p -= 4 * sizeof(void*);
8003
0
    } while (p != 0);
8004
8005
0
    vacc00 = math_max_f32(vacc00, vmin);
8006
0
    vacc01 = math_max_f32(vacc01, vmin);
8007
0
    vacc02 = math_max_f32(vacc02, vmin);
8008
0
    vacc03 = math_max_f32(vacc03, vmin);
8009
0
    vacc10 = math_max_f32(vacc10, vmin);
8010
0
    vacc11 = math_max_f32(vacc11, vmin);
8011
0
    vacc12 = math_max_f32(vacc12, vmin);
8012
0
    vacc13 = math_max_f32(vacc13, vmin);
8013
0
    vacc20 = math_max_f32(vacc20, vmin);
8014
0
    vacc21 = math_max_f32(vacc21, vmin);
8015
0
    vacc22 = math_max_f32(vacc22, vmin);
8016
0
    vacc23 = math_max_f32(vacc23, vmin);
8017
0
    vacc30 = math_max_f32(vacc30, vmin);
8018
0
    vacc31 = math_max_f32(vacc31, vmin);
8019
0
    vacc32 = math_max_f32(vacc32, vmin);
8020
0
    vacc33 = math_max_f32(vacc33, vmin);
8021
8022
0
    vacc00 = math_min_f32(vacc00, vmax);
8023
0
    vacc01 = math_min_f32(vacc01, vmax);
8024
0
    vacc02 = math_min_f32(vacc02, vmax);
8025
0
    vacc03 = math_min_f32(vacc03, vmax);
8026
0
    vacc10 = math_min_f32(vacc10, vmax);
8027
0
    vacc11 = math_min_f32(vacc11, vmax);
8028
0
    vacc12 = math_min_f32(vacc12, vmax);
8029
0
    vacc13 = math_min_f32(vacc13, vmax);
8030
0
    vacc20 = math_min_f32(vacc20, vmax);
8031
0
    vacc21 = math_min_f32(vacc21, vmax);
8032
0
    vacc22 = math_min_f32(vacc22, vmax);
8033
0
    vacc23 = math_min_f32(vacc23, vmax);
8034
0
    vacc30 = math_min_f32(vacc30, vmax);
8035
0
    vacc31 = math_min_f32(vacc31, vmax);
8036
0
    vacc32 = math_min_f32(vacc32, vmax);
8037
0
    vacc33 = math_min_f32(vacc33, vmax);
8038
8039
0
    if XNN_LIKELY(nc >= 4) {
8040
0
      c3[0] = vacc30;
8041
0
      c3[1] = vacc31;
8042
0
      c3[2] = vacc32;
8043
0
      c3[3] = vacc33;
8044
0
      c3 = (float*) ((uintptr_t) c3 + cn_stride);
8045
0
      c2[0] = vacc20;
8046
0
      c2[1] = vacc21;
8047
0
      c2[2] = vacc22;
8048
0
      c2[3] = vacc23;
8049
0
      c2 = (float*) ((uintptr_t) c2 + cn_stride);
8050
0
      c1[0] = vacc10;
8051
0
      c1[1] = vacc11;
8052
0
      c1[2] = vacc12;
8053
0
      c1[3] = vacc13;
8054
0
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
8055
0
      c0[0] = vacc00;
8056
0
      c0[1] = vacc01;
8057
0
      c0[2] = vacc02;
8058
0
      c0[3] = vacc03;
8059
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
8060
8061
0
      a = (const float**restrict) ((uintptr_t) a - ks);
8062
0
      nc -= 4;
8063
0
    } else {
8064
0
      if (nc & 2) {
8065
0
        c3[0] = vacc30;
8066
0
        c3[1] = vacc31;
8067
0
        vacc30 = vacc32;
8068
0
        c3 += 2;
8069
0
        c2[0] = vacc20;
8070
0
        c2[1] = vacc21;
8071
0
        vacc20 = vacc22;
8072
0
        c2 += 2;
8073
0
        c1[0] = vacc10;
8074
0
        c1[1] = vacc11;
8075
0
        vacc10 = vacc12;
8076
0
        c1 += 2;
8077
0
        c0[0] = vacc00;
8078
0
        c0[1] = vacc01;
8079
0
        vacc00 = vacc02;
8080
0
        c0 += 2;
8081
0
      }
8082
0
      if (nc & 1) {
8083
0
        c3[0] = vacc30;
8084
0
        c2[0] = vacc20;
8085
0
        c1[0] = vacc10;
8086
0
        c0[0] = vacc00;
8087
0
      }
8088
8089
0
      nc = 0;
8090
0
    }
8091
0
  } while (nc != 0);
8092
0
}
8093
8094
void xnn_f32_igemm_relu_ukernel_4x4__scalar(
8095
    size_t mr,
8096
    size_t nc,
8097
    size_t kc,
8098
    size_t ks,
8099
    const float** restrict a,
8100
    const float* restrict w,
8101
    float* restrict c,
8102
    size_t cm_stride,
8103
    size_t cn_stride,
8104
    size_t a_offset,
8105
    const float* zero,
8106
    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
8107
0
{
8108
0
  assert(mr != 0);
8109
0
  assert(mr <= 4);
8110
0
  assert(nc != 0);
8111
0
  assert(kc != 0);
8112
0
  assert(kc % sizeof(float) == 0);
8113
0
  assert(ks != 0);
8114
0
  assert(ks % (4 * sizeof(void*)) == 0);
8115
0
  assert(a_offset % sizeof(float) == 0);
8116
0
  assert(a != NULL);
8117
0
  assert(w != NULL);
8118
0
  assert(c != NULL);
8119
8120
0
  float* c0 = c;
8121
0
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
8122
0
  if XNN_UNPREDICTABLE(mr < 2) {
8123
0
    c1 = c0;
8124
0
  }
8125
0
  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
8126
0
  if XNN_UNPREDICTABLE(mr <= 2) {
8127
0
    c2 = c1;
8128
0
  }
8129
0
  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
8130
0
  if XNN_UNPREDICTABLE(mr != 4) {
8131
0
    c3 = c2;
8132
0
  }
8133
8134
0
  do {
8135
0
    float vacc00 = w[0];
8136
0
    float vacc01 = w[1];
8137
0
    float vacc02 = w[2];
8138
0
    float vacc03 = w[3];
8139
0
    float vacc10 = vacc00;
8140
0
    float vacc11 = vacc01;
8141
0
    float vacc12 = vacc02;
8142
0
    float vacc13 = vacc03;
8143
0
    float vacc20 = vacc00;
8144
0
    float vacc21 = vacc01;
8145
0
    float vacc22 = vacc02;
8146
0
    float vacc23 = vacc03;
8147
0
    float vacc30 = vacc00;
8148
0
    float vacc31 = vacc01;
8149
0
    float vacc32 = vacc02;
8150
0
    float vacc33 = vacc03;
8151
0
    w += 4;
8152
8153
0
    size_t p = ks;
8154
0
    do {
8155
0
      const float* restrict a0 = a[0];
8156
0
      assert(a0 != NULL);
8157
0
      if XNN_UNPREDICTABLE(a0 != zero) {
8158
0
        a0 = (const float*) ((uintptr_t) a0 + a_offset);
8159
0
      }
8160
0
      const float* restrict a1 = a[1];
8161
0
      assert(a1 != NULL);
8162
0
      if XNN_UNPREDICTABLE(a1 != zero) {
8163
0
        a1 = (const float*) ((uintptr_t) a1 + a_offset);
8164
0
      }
8165
0
      const float* restrict a2 = a[2];
8166
0
      assert(a2 != NULL);
8167
0
      if XNN_UNPREDICTABLE(a2 != zero) {
8168
0
        a2 = (const float*) ((uintptr_t) a2 + a_offset);
8169
0
      }
8170
0
      const float* restrict a3 = a[3];
8171
0
      assert(a3 != NULL);
8172
0
      if XNN_UNPREDICTABLE(a3 != zero) {
8173
0
        a3 = (const float*) ((uintptr_t) a3 + a_offset);
8174
0
      }
8175
0
      a += 4;
8176
8177
0
      size_t k = kc;
8178
0
      do {
8179
0
        const float va0 = *a0++;
8180
0
        const float va1 = *a1++;
8181
0
        const float va2 = *a2++;
8182
0
        const float va3 = *a3++;
8183
8184
0
        const float vb0 = w[0];
8185
0
        const float vb1 = w[1];
8186
0
        const float vb2 = w[2];
8187
0
        const float vb3 = w[3];
8188
0
        w += 4;
8189
8190
0
        vacc00 = math_muladd_f32(va0, vb0, vacc00);
8191
0
        vacc01 = math_muladd_f32(va0, vb1, vacc01);
8192
0
        vacc02 = math_muladd_f32(va0, vb2, vacc02);
8193
0
        vacc03 = math_muladd_f32(va0, vb3, vacc03);
8194
0
        vacc10 = math_muladd_f32(va1, vb0, vacc10);
8195
0
        vacc11 = math_muladd_f32(va1, vb1, vacc11);
8196
0
        vacc12 = math_muladd_f32(va1, vb2, vacc12);
8197
0
        vacc13 = math_muladd_f32(va1, vb3, vacc13);
8198
0
        vacc20 = math_muladd_f32(va2, vb0, vacc20);
8199
0
        vacc21 = math_muladd_f32(va2, vb1, vacc21);
8200
0
        vacc22 = math_muladd_f32(va2, vb2, vacc22);
8201
0
        vacc23 = math_muladd_f32(va2, vb3, vacc23);
8202
0
        vacc30 = math_muladd_f32(va3, vb0, vacc30);
8203
0
        vacc31 = math_muladd_f32(va3, vb1, vacc31);
8204
0
        vacc32 = math_muladd_f32(va3, vb2, vacc32);
8205
0
        vacc33 = math_muladd_f32(va3, vb3, vacc33);
8206
8207
0
        k -= sizeof(float);
8208
0
      } while (k != 0);
8209
0
      p -= 4 * sizeof(void*);
8210
0
    } while (p != 0);
8211
8212
0
    vacc00 = math_max_f32(vacc00, 0.0f);
8213
0
    vacc01 = math_max_f32(vacc01, 0.0f);
8214
0
    vacc02 = math_max_f32(vacc02, 0.0f);
8215
0
    vacc03 = math_max_f32(vacc03, 0.0f);
8216
0
    vacc10 = math_max_f32(vacc10, 0.0f);
8217
0
    vacc11 = math_max_f32(vacc11, 0.0f);
8218
0
    vacc12 = math_max_f32(vacc12, 0.0f);
8219
0
    vacc13 = math_max_f32(vacc13, 0.0f);
8220
0
    vacc20 = math_max_f32(vacc20, 0.0f);
8221
0
    vacc21 = math_max_f32(vacc21, 0.0f);
8222
0
    vacc22 = math_max_f32(vacc22, 0.0f);
8223
0
    vacc23 = math_max_f32(vacc23, 0.0f);
8224
0
    vacc30 = math_max_f32(vacc30, 0.0f);
8225
0
    vacc31 = math_max_f32(vacc31, 0.0f);
8226
0
    vacc32 = math_max_f32(vacc32, 0.0f);
8227
0
    vacc33 = math_max_f32(vacc33, 0.0f);
8228
8229
0
    if XNN_LIKELY(nc >= 4) {
8230
0
      c3[0] = vacc30;
8231
0
      c3[1] = vacc31;
8232
0
      c3[2] = vacc32;
8233
0
      c3[3] = vacc33;
8234
0
      c3 = (float*) ((uintptr_t) c3 + cn_stride);
8235
0
      c2[0] = vacc20;
8236
0
      c2[1] = vacc21;
8237
0
      c2[2] = vacc22;
8238
0
      c2[3] = vacc23;
8239
0
      c2 = (float*) ((uintptr_t) c2 + cn_stride);
8240
0
      c1[0] = vacc10;
8241
0
      c1[1] = vacc11;
8242
0
      c1[2] = vacc12;
8243
0
      c1[3] = vacc13;
8244
0
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
8245
0
      c0[0] = vacc00;
8246
0
      c0[1] = vacc01;
8247
0
      c0[2] = vacc02;
8248
0
      c0[3] = vacc03;
8249
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
8250
8251
0
      a = (const float**restrict) ((uintptr_t) a - ks);
8252
0
      nc -= 4;
8253
0
    } else {
8254
0
      if (nc & 2) {
8255
0
        c3[0] = vacc30;
8256
0
        c3[1] = vacc31;
8257
0
        vacc30 = vacc32;
8258
0
        c3 += 2;
8259
0
        c2[0] = vacc20;
8260
0
        c2[1] = vacc21;
8261
0
        vacc20 = vacc22;
8262
0
        c2 += 2;
8263
0
        c1[0] = vacc10;
8264
0
        c1[1] = vacc11;
8265
0
        vacc10 = vacc12;
8266
0
        c1 += 2;
8267
0
        c0[0] = vacc00;
8268
0
        c0[1] = vacc01;
8269
0
        vacc00 = vacc02;
8270
0
        c0 += 2;
8271
0
      }
8272
0
      if (nc & 1) {
8273
0
        c3[0] = vacc30;
8274
0
        c2[0] = vacc20;
8275
0
        c1[0] = vacc10;
8276
0
        c0[0] = vacc00;
8277
0
      }
8278
8279
0
      nc = 0;
8280
0
    }
8281
0
  } while (nc != 0);
8282
0
}
8283
8284
void xnn_f32_igemm_ukernel_4x4__scalar(
8285
    size_t mr,
8286
    size_t nc,
8287
    size_t kc,
8288
    size_t ks,
8289
    const float** restrict a,
8290
    const float* restrict w,
8291
    float* restrict c,
8292
    size_t cm_stride,
8293
    size_t cn_stride,
8294
    size_t a_offset,
8295
    const float* zero,
8296
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
8297
0
{
8298
0
  assert(mr != 0);
8299
0
  assert(mr <= 4);
8300
0
  assert(nc != 0);
8301
0
  assert(kc != 0);
8302
0
  assert(kc % sizeof(float) == 0);
8303
0
  assert(ks != 0);
8304
0
  assert(ks % (4 * sizeof(void*)) == 0);
8305
0
  assert(a_offset % sizeof(float) == 0);
8306
0
  assert(a != NULL);
8307
0
  assert(w != NULL);
8308
0
  assert(c != NULL);
8309
8310
0
  float* c0 = c;
8311
0
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
8312
0
  if XNN_UNPREDICTABLE(mr < 2) {
8313
0
    c1 = c0;
8314
0
  }
8315
0
  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
8316
0
  if XNN_UNPREDICTABLE(mr <= 2) {
8317
0
    c2 = c1;
8318
0
  }
8319
0
  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
8320
0
  if XNN_UNPREDICTABLE(mr != 4) {
8321
0
    c3 = c2;
8322
0
  }
8323
8324
0
  do {
8325
0
    float vacc00 = w[0];
8326
0
    float vacc01 = w[1];
8327
0
    float vacc02 = w[2];
8328
0
    float vacc03 = w[3];
8329
0
    float vacc10 = vacc00;
8330
0
    float vacc11 = vacc01;
8331
0
    float vacc12 = vacc02;
8332
0
    float vacc13 = vacc03;
8333
0
    float vacc20 = vacc00;
8334
0
    float vacc21 = vacc01;
8335
0
    float vacc22 = vacc02;
8336
0
    float vacc23 = vacc03;
8337
0
    float vacc30 = vacc00;
8338
0
    float vacc31 = vacc01;
8339
0
    float vacc32 = vacc02;
8340
0
    float vacc33 = vacc03;
8341
0
    w += 4;
8342
8343
0
    size_t p = ks;
8344
0
    do {
8345
0
      const float* restrict a0 = a[0];
8346
0
      assert(a0 != NULL);
8347
0
      if XNN_UNPREDICTABLE(a0 != zero) {
8348
0
        a0 = (const float*) ((uintptr_t) a0 + a_offset);
8349
0
      }
8350
0
      const float* restrict a1 = a[1];
8351
0
      assert(a1 != NULL);
8352
0
      if XNN_UNPREDICTABLE(a1 != zero) {
8353
0
        a1 = (const float*) ((uintptr_t) a1 + a_offset);
8354
0
      }
8355
0
      const float* restrict a2 = a[2];
8356
0
      assert(a2 != NULL);
8357
0
      if XNN_UNPREDICTABLE(a2 != zero) {
8358
0
        a2 = (const float*) ((uintptr_t) a2 + a_offset);
8359
0
      }
8360
0
      const float* restrict a3 = a[3];
8361
0
      assert(a3 != NULL);
8362
0
      if XNN_UNPREDICTABLE(a3 != zero) {
8363
0
        a3 = (const float*) ((uintptr_t) a3 + a_offset);
8364
0
      }
8365
0
      a += 4;
8366
8367
0
      size_t k = kc;
8368
0
      do {
8369
0
        const float va0 = *a0++;
8370
0
        const float va1 = *a1++;
8371
0
        const float va2 = *a2++;
8372
0
        const float va3 = *a3++;
8373
8374
0
        const float vb0 = w[0];
8375
0
        const float vb1 = w[1];
8376
0
        const float vb2 = w[2];
8377
0
        const float vb3 = w[3];
8378
0
        w += 4;
8379
8380
0
        vacc00 = math_muladd_f32(va0, vb0, vacc00);
8381
0
        vacc01 = math_muladd_f32(va0, vb1, vacc01);
8382
0
        vacc02 = math_muladd_f32(va0, vb2, vacc02);
8383
0
        vacc03 = math_muladd_f32(va0, vb3, vacc03);
8384
0
        vacc10 = math_muladd_f32(va1, vb0, vacc10);
8385
0
        vacc11 = math_muladd_f32(va1, vb1, vacc11);
8386
0
        vacc12 = math_muladd_f32(va1, vb2, vacc12);
8387
0
        vacc13 = math_muladd_f32(va1, vb3, vacc13);
8388
0
        vacc20 = math_muladd_f32(va2, vb0, vacc20);
8389
0
        vacc21 = math_muladd_f32(va2, vb1, vacc21);
8390
0
        vacc22 = math_muladd_f32(va2, vb2, vacc22);
8391
0
        vacc23 = math_muladd_f32(va2, vb3, vacc23);
8392
0
        vacc30 = math_muladd_f32(va3, vb0, vacc30);
8393
0
        vacc31 = math_muladd_f32(va3, vb1, vacc31);
8394
0
        vacc32 = math_muladd_f32(va3, vb2, vacc32);
8395
0
        vacc33 = math_muladd_f32(va3, vb3, vacc33);
8396
8397
0
        k -= sizeof(float);
8398
0
      } while (k != 0);
8399
0
      p -= 4 * sizeof(void*);
8400
0
    } while (p != 0);
8401
8402
8403
0
    if XNN_LIKELY(nc >= 4) {
8404
0
      c3[0] = vacc30;
8405
0
      c3[1] = vacc31;
8406
0
      c3[2] = vacc32;
8407
0
      c3[3] = vacc33;
8408
0
      c3 = (float*) ((uintptr_t) c3 + cn_stride);
8409
0
      c2[0] = vacc20;
8410
0
      c2[1] = vacc21;
8411
0
      c2[2] = vacc22;
8412
0
      c2[3] = vacc23;
8413
0
      c2 = (float*) ((uintptr_t) c2 + cn_stride);
8414
0
      c1[0] = vacc10;
8415
0
      c1[1] = vacc11;
8416
0
      c1[2] = vacc12;
8417
0
      c1[3] = vacc13;
8418
0
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
8419
0
      c0[0] = vacc00;
8420
0
      c0[1] = vacc01;
8421
0
      c0[2] = vacc02;
8422
0
      c0[3] = vacc03;
8423
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
8424
8425
0
      a = (const float**restrict) ((uintptr_t) a - ks);
8426
0
      nc -= 4;
8427
0
    } else {
8428
0
      if (nc & 2) {
8429
0
        c3[0] = vacc30;
8430
0
        c3[1] = vacc31;
8431
0
        vacc30 = vacc32;
8432
0
        c3 += 2;
8433
0
        c2[0] = vacc20;
8434
0
        c2[1] = vacc21;
8435
0
        vacc20 = vacc22;
8436
0
        c2 += 2;
8437
0
        c1[0] = vacc10;
8438
0
        c1[1] = vacc11;
8439
0
        vacc10 = vacc12;
8440
0
        c1 += 2;
8441
0
        c0[0] = vacc00;
8442
0
        c0[1] = vacc01;
8443
0
        vacc00 = vacc02;
8444
0
        c0 += 2;
8445
0
      }
8446
0
      if (nc & 1) {
8447
0
        c3[0] = vacc30;
8448
0
        c2[0] = vacc20;
8449
0
        c1[0] = vacc10;
8450
0
        c0[0] = vacc00;
8451
0
      }
8452
8453
0
      nc = 0;
8454
0
    }
8455
0
  } while (nc != 0);
8456
0
}
8457
8458
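// Editor's annotation (not part of scalar.c): xnn_f32_maxpool_minmax_ukernel_9p8x__scalar_c1
// reduces each pooling window in passes. The first block consumes up to 9 input pointers (smaller
// windows alias the unused pointers to i0, which is harmless because max is idempotent), reduces
// them with a tree of math_max_f32 calls, clamps to [params->scalar.min, params->scalar.max], and
// writes the per-channel result. The for-loop then rewinds o to the start of the output and folds
// up to 8 more kernel elements per pass into it; the body of that pass continues past this excerpt.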
void xnn_f32_maxpool_minmax_ukernel_9p8x__scalar_c1(
8459
    size_t output_pixels,
8460
    size_t kernel_elements,
8461
    size_t channels,
8462
    const float** input,
8463
    size_t input_offset,
8464
    float* output,
8465
    size_t input_increment,
8466
    size_t output_increment,
8467
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
8468
0
{
8469
0
  assert(output_pixels != 0);
8470
0
  assert(kernel_elements != 0);
8471
0
  assert(channels != 0);
8472
8473
0
  const float voutput_min = params->scalar.min;
8474
0
  const float voutput_max = params->scalar.max;
8475
0
  do {
8476
0
    float* o = output;
8477
0
    {
8478
0
      const float* i0 = *input++;
8479
0
      const float* i1 = *input++;
8480
0
      const float* i2 = *input++;
8481
0
      const float* i3 = *input++;
8482
0
      const float* i4 = *input++;
8483
0
      const float* i5 = *input++;
8484
0
      const float* i6 = *input++;
8485
0
      const float* i7 = *input++;
8486
0
      const float* i8 = *input++;
8487
0
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
8488
0
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
8489
0
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
8490
0
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
8491
0
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
8492
0
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
8493
0
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
8494
0
      i7 = (const float*) ((uintptr_t) i7 + input_offset);
8495
0
      i8 = (const float*) ((uintptr_t) i8 + input_offset);
8496
0
      if (kernel_elements < 2) {
8497
0
        i1 = i0;
8498
0
      }
8499
0
      if (kernel_elements <= 2) {
8500
0
        i2 = i0;
8501
0
      }
8502
0
      if (kernel_elements < 4) {
8503
0
        i3 = i0;
8504
0
      }
8505
0
      if (kernel_elements <= 4) {
8506
0
        i4 = i0;
8507
0
      }
8508
0
      if (kernel_elements < 6) {
8509
0
        i5 = i0;
8510
0
      }
8511
0
      if (kernel_elements <= 6) {
8512
0
        i6 = i0;
8513
0
      }
8514
0
      if (kernel_elements < 8) {
8515
0
        i7 = i0;
8516
0
      }
8517
0
      if (kernel_elements <= 8) {
8518
0
        i8 = i0;
8519
0
      }
8520
8521
0
      size_t c = channels;
8522
0
      do {
8523
0
        const float vi0 = *i0++;
8524
0
        const float vi1 = *i1++;
8525
0
        const float vi2 = *i2++;
8526
0
        const float vi3 = *i3++;
8527
0
        const float vi4 = *i4++;
8528
0
        const float vi5 = *i5++;
8529
0
        const float vi6 = *i6++;
8530
0
        const float vi7 = *i7++;
8531
0
        const float vi8 = *i8++;
8532
8533
0
        const float vmax01 = math_max_f32(vi0, vi1);
8534
0
        const float vmax23 = math_max_f32(vi2, vi3);
8535
0
        const float vmax45 = math_max_f32(vi4, vi5);
8536
0
        const float vmax67 = math_max_f32(vi6, vi7);
8537
0
        const float vmax018 = math_max_f32(vmax01, vi8);
8538
8539
0
        const float vmax2345 = math_max_f32(vmax23, vmax45);
8540
0
        const float vmax01678 = math_max_f32(vmax018, vmax67);
8541
0
        float vout = math_max_f32(vmax2345, vmax01678);
8542
0
        vout = math_max_f32(vout, voutput_min);
8543
0
        vout = math_min_f32(vout, voutput_max);
8544
8545
0
        *o++ = vout;
8546
0
      } while (--c != 0);
8547
0
    }
8548
8549
0
    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
8550
0
      const float* i0 = *input++;
8551
0
      const float* i1 = *input++;
8552
0
      const float* i2 = *input++;
8553
0
      const float* i3 = *input++;
8554
0
      const float* i4 = *input++;
8555
0
      const float* i5 = *input++;
8556
0
      const float* i6 = *input++;
8557
0
      const float* i7 = *input++;
8558
0
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
8559
0
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
8560
0
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
8561
0
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
8562
0
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
8563
0
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
8564
0
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
8565
0
      i7 = (const float*) ((uintptr_t) i7 + input_offset);
8566
0
      if (k < 2) {
8567
0
        i1 = i0;
8568
0
      }
8569
0
      if (k <= 2) {
8570
0
        i2 = i0;
8571
0
      }
8572
0
      if (k < 4) {
8573
0
        i3 = i0;
8574
0
      }
8575
0
      if (k <= 4) {
8576
0
        i4 = i0;
8577
0
      }
8578
0
      if (k < 6) {
8579
0
        i5 = i0;
8580
0
      }
8581
0
      if (k <= 6) {
8582
0
        i6 = i0;
8583
0
      }
8584
0
      if (k < 8) {
8585
0
        i7 = i0;
8586
0
      }
8587
8588
0
      o = output;
8589
0
      size_t c = channels;
8590
0
      do {
8591
0
        const float vi0 = *i0++;
8592
0
        const float vi1 = *i1++;
8593
0
        const float vi2 = *i2++;
8594
0
        const float vi3 = *i3++;
8595
0
        const float vi4 = *i4++;
8596
0
        const float vi5 = *i5++;
8597
0
        const float vi6 = *i6++;
8598
0
        const float vi7 = *i7++;
8599
0
        const float vi8 = *o;
8600
8601
0
        const float vmax01 = math_max_f32(vi0, vi1);
8602
0
        const float vmax23 = math_max_f32(vi2, vi3);
8603
0
        const float vmax45 = math_max_f32(vi4, vi5);
8604
0
        const float vmax67 = math_max_f32(vi6, vi7);
8605
0
        const float vmax018 = math_max_f32(vmax01, vi8);
8606
8607
0
        const float vmax2345 = math_max_f32(vmax23, vmax45);
8608
0
        const float vmax01678 = math_max_f32(vmax018, vmax67);
8609
0
        float vout = math_max_f32(vmax2345, vmax01678);
8610
0
        vout = math_max_f32(vout, voutput_min);
8611
0
        vout = math_min_f32(vout, voutput_max);
8612
8613
0
        *o++ = vout;
8614
0
      } while (--c != 0);
8615
0
    }
8616
0
    input = (const float**) ((uintptr_t) input + input_increment);
8617
0
    output = (float*) ((uintptr_t) o + output_increment);
8618
0
  } while (--output_pixels != 0);
8619
0
}
8620
8621
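// Pixelwise average-pooling microkernel for pooling windows larger than 9
// elements. The first pass sums 9 inputs per channel into `buffer`, middle
// passes add 8 more inputs to the buffered sums, and the final pass adds the
// remaining inputs, multiplies by the per-pixel factor from `multiplier`, and
// clamps to [output_min, output_max]. Rows equal to the shared `zero` buffer
// (padding) are deliberately not offset by `input_offset`.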
void xnn_f32_pavgpool_minmax_ukernel_9p8x__scalar_c1(
8622
    size_t output_pixels,
8623
    size_t kernel_elements,
8624
    size_t channels,
8625
    const float** input,
8626
    size_t input_offset,
8627
    const float* zero,
8628
    const float* multiplier,
8629
    float* buffer,
8630
    float* output,
8631
    size_t input_increment,
8632
    size_t output_increment,
8633
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
8634
0
{
8635
0
  assert(output_pixels != 0);
8636
0
  assert(kernel_elements > 9);
8637
0
  assert(channels != 0);
8638
8639
0
  const float voutput_min = params->scalar.min;
8640
0
  const float voutput_max = params->scalar.max;
8641
8642
0
  do {
8643
0
    {
8644
0
      const float* i0 = *input++;
8645
0
      assert(i0 != NULL);
8646
0
      if XNN_UNPREDICTABLE(i0 != zero) {
8647
0
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
8648
0
      }
8649
0
      const float* i1 = *input++;
8650
0
      assert(i1 != NULL);
8651
0
      if XNN_UNPREDICTABLE(i1 != zero) {
8652
0
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
8653
0
      }
8654
0
      const float* i2 = *input++;
8655
0
      assert(i2 != NULL);
8656
0
      if XNN_UNPREDICTABLE(i2 != zero) {
8657
0
        i2 = (const float*) ((uintptr_t) i2 + input_offset);
8658
0
      }
8659
0
      const float* i3 = *input++;
8660
0
      assert(i3 != NULL);
8661
0
      if XNN_UNPREDICTABLE(i3 != zero) {
8662
0
        i3 = (const float*) ((uintptr_t) i3 + input_offset);
8663
0
      }
8664
0
      const float* i4 = *input++;
8665
0
      assert(i4 != NULL);
8666
0
      if XNN_UNPREDICTABLE(i4 != zero) {
8667
0
        i4 = (const float*) ((uintptr_t) i4 + input_offset);
8668
0
      }
8669
0
      const float* i5 = *input++;
8670
0
      assert(i5 != NULL);
8671
0
      if XNN_UNPREDICTABLE(i5 != zero) {
8672
0
        i5 = (const float*) ((uintptr_t) i5 + input_offset);
8673
0
      }
8674
0
      const float* i6 = *input++;
8675
0
      assert(i6 != NULL);
8676
0
      if XNN_UNPREDICTABLE(i6 != zero) {
8677
0
        i6 = (const float*) ((uintptr_t) i6 + input_offset);
8678
0
      }
8679
0
      const float* i7 = *input++;
8680
0
      assert(i7 != NULL);
8681
0
      if XNN_UNPREDICTABLE(i7 != zero) {
8682
0
        i7 = (const float*) ((uintptr_t) i7 + input_offset);
8683
0
      }
8684
0
      const float* i8 = *input++;
8685
0
      assert(i8 != NULL);
8686
0
      if XNN_UNPREDICTABLE(i8 != zero) {
8687
0
        i8 = (const float*) ((uintptr_t) i8 + input_offset);
8688
0
      }
8689
8690
0
      float* b = buffer;
8691
0
      size_t c = channels;
8692
0
      do {
8693
0
        const float vi0 = *i0++;
8694
0
        const float vi1 = *i1++;
8695
0
        const float vi2 = *i2++;
8696
0
        const float vi3 = *i3++;
8697
0
        const float vi4 = *i4++;
8698
0
        const float vi5 = *i5++;
8699
0
        const float vi6 = *i6++;
8700
0
        const float vi7 = *i7++;
8701
0
        const float vi8 = *i8++;
8702
8703
0
        const float vsum01 = vi0 + vi1;
8704
0
        const float vsum23 = vi2 + vi3;
8705
0
        const float vsum45 = vi4 + vi5;
8706
0
        const float vsum67 = vi6 + vi7;
8707
0
        const float vsum018 = vsum01 + vi8;
8708
0
        const float vsum2345 = vsum23 + vsum45;
8709
0
        const float vsum01678 = vsum018 + vsum67;
8710
0
        const float vsum = vsum2345 + vsum01678;
8711
8712
0
        *b++ = vsum;
8713
0
      } while (--c != 0);
8714
0
    }
8715
8716
0
    size_t k = kernel_elements;
8717
0
    for (k -= 9; k > 8; k -= 8) {
8718
0
      const float* i0 = *input++;
8719
0
      assert(i0 != NULL);
8720
0
      if XNN_UNPREDICTABLE(i0 != zero) {
8721
0
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
8722
0
      }
8723
0
      const float* i1 = *input++;
8724
0
      assert(i1 != NULL);
8725
0
      if XNN_UNPREDICTABLE(i1 != zero) {
8726
0
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
8727
0
      }
8728
0
      const float* i2 = *input++;
8729
0
      assert(i2 != NULL);
8730
0
      if XNN_UNPREDICTABLE(i2 != zero) {
8731
0
        i2 = (const float*) ((uintptr_t) i2 + input_offset);
8732
0
      }
8733
0
      const float* i3 = *input++;
8734
0
      assert(i3 != NULL);
8735
0
      if XNN_UNPREDICTABLE(i3 != zero) {
8736
0
        i3 = (const float*) ((uintptr_t) i3 + input_offset);
8737
0
      }
8738
0
      const float* i4 = *input++;
8739
0
      assert(i4 != NULL);
8740
0
      if XNN_UNPREDICTABLE(i4 != zero) {
8741
0
        i4 = (const float*) ((uintptr_t) i4 + input_offset);
8742
0
      }
8743
0
      const float* i5 = *input++;
8744
0
      assert(i5 != NULL);
8745
0
      if XNN_UNPREDICTABLE(i5 != zero) {
8746
0
        i5 = (const float*) ((uintptr_t) i5 + input_offset);
8747
0
      }
8748
0
      const float* i6 = *input++;
8749
0
      assert(i6 != NULL);
8750
0
      if XNN_UNPREDICTABLE(i6 != zero) {
8751
0
        i6 = (const float*) ((uintptr_t) i6 + input_offset);
8752
0
      }
8753
0
      const float* i7 = *input++;
8754
0
      assert(i7 != NULL);
8755
0
      if XNN_UNPREDICTABLE(i7 != zero) {
8756
0
        i7 = (const float*) ((uintptr_t) i7 + input_offset);
8757
0
      }
8758
8759
0
      float* b = buffer;
8760
0
      size_t c = channels;
8761
0
      do {
8762
0
        const float vi0 = *i0++;
8763
0
        const float vi1 = *i1++;
8764
0
        const float vi2 = *i2++;
8765
0
        const float vi3 = *i3++;
8766
0
        const float vi4 = *i4++;
8767
0
        const float vi5 = *i5++;
8768
0
        const float vi6 = *i6++;
8769
0
        const float vi7 = *i7++;
8770
0
        const float vacc = *b;
8771
8772
0
        const float vsum01 = vi0 + vi1;
8773
0
        const float vsum23 = vi2 + vi3;
8774
0
        const float vsum45 = vi4 + vi5;
8775
0
        const float vsum67 = vi6 + vi7;
8776
0
        const float vsum01a = vsum01 + vacc;
8777
0
        const float vsum2345 = vsum23 + vsum45;
8778
0
        const float vsum0167a = vsum01a + vsum67;
8779
0
        const float vsum = vsum2345 + vsum0167a;
8780
8781
0
        *b++ = vsum;
8782
0
      } while (--c != 0);
8783
0
    }
8784
8785
0
    {
8786
0
      const float* i0 = input[0];
8787
0
      assert(i0 != NULL);
8788
0
      const float* i1 = input[1];
8789
0
      const float* i2 = input[2];
8790
0
      const float* i3 = input[3];
8791
0
      const float* i4 = input[4];
8792
0
      const float* i5 = input[5];
8793
0
      const float* i6 = input[6];
8794
0
      const float* i7 = input[7];
8795
0
      input = (const float**) ((uintptr_t) input + input_increment);
8796
0
      if (k < 2) {
8797
0
        i1 = zero;
8798
0
      }
8799
0
      assert(i1 != NULL);
8800
0
      if (k <= 2) {
8801
0
        i2 = zero;
8802
0
      }
8803
0
      assert(i2 != NULL);
8804
0
      if (k < 4) {
8805
0
        i3 = zero;
8806
0
      }
8807
0
      assert(i3 != NULL);
8808
0
      if (k <= 4) {
8809
0
        i4 = zero;
8810
0
      }
8811
0
      assert(i4 != NULL);
8812
0
      if (k < 6) {
8813
0
        i5 = zero;
8814
0
      }
8815
0
      assert(i5 != NULL);
8816
0
      if (k <= 6) {
8817
0
        i6 = zero;
8818
0
      }
8819
0
      assert(i6 != NULL);
8820
0
      if (k < 8) {
8821
0
        i7 = zero;
8822
0
      }
8823
0
      assert(i7 != NULL);
8824
0
      if XNN_UNPREDICTABLE(i0 != zero) {
8825
0
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
8826
0
      }
8827
0
      if XNN_UNPREDICTABLE(i1 != zero) {
8828
0
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
8829
0
      }
8830
0
      if XNN_UNPREDICTABLE(i2 != zero) {
8831
0
        i2 = (const float*) ((uintptr_t) i2 + input_offset);
8832
0
      }
8833
0
      if XNN_UNPREDICTABLE(i3 != zero) {
8834
0
        i3 = (const float*) ((uintptr_t) i3 + input_offset);
8835
0
      }
8836
0
      if XNN_UNPREDICTABLE(i4 != zero) {
8837
0
        i4 = (const float*) ((uintptr_t) i4 + input_offset);
8838
0
      }
8839
0
      if XNN_UNPREDICTABLE(i5 != zero) {
8840
0
        i5 = (const float*) ((uintptr_t) i5 + input_offset);
8841
0
      }
8842
0
      if XNN_UNPREDICTABLE(i6 != zero) {
8843
0
        i6 = (const float*) ((uintptr_t) i6 + input_offset);
8844
0
      }
8845
0
      if XNN_UNPREDICTABLE(i7 != zero) {
8846
0
        i7 = (const float*) ((uintptr_t) i7 + input_offset);
8847
0
      }
8848
8849
0
      const float vmultiplier = *multiplier++;
8850
8851
0
      size_t c = channels;
8852
0
      float* b = buffer;
8853
0
      do {
8854
0
        const float vi0 = *i0++;
8855
0
        const float vi1 = *i1++;
8856
0
        const float vi2 = *i2++;
8857
0
        const float vi3 = *i3++;
8858
0
        const float vi4 = *i4++;
8859
0
        const float vi5 = *i5++;
8860
0
        const float vi6 = *i6++;
8861
0
        const float vi7 = *i7++;
8862
0
        const float vacc = *b++;
8863
8864
0
        const float vsum01 = vi0 + vi1;
8865
0
        const float vsum23 = vi2 + vi3;
8866
0
        const float vsum45 = vi4 + vi5;
8867
0
        const float vsum67 = vi6 + vi7;
8868
0
        const float vsum01a = vsum01 + vacc;
8869
0
        const float vsum2345 = vsum23 + vsum45;
8870
0
        const float vsum0167a = vsum01a + vsum67;
8871
0
        const float vsum = vsum2345 + vsum0167a;
8872
8873
0
        float vout = vsum * vmultiplier;
8874
0
        vout = math_max_f32(vout, voutput_min);
8875
0
        vout = math_min_f32(vout, voutput_max);
8876
8877
0
        *output++ = vout;
8878
0
      } while (--c != 0);
8879
0
    }
8880
0
    output = (float*) ((uintptr_t) output + output_increment);
8881
0
  } while (--output_pixels != 0);
8882
0
}
8883
8884
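// Single-pass variant of the pixelwise average-pooling kernel above, for
// pooling windows of at most 9 elements: all inputs of an output pixel are
// summed directly, scaled by the per-pixel multiplier, and clamped, with no
// intermediate buffer.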
void xnn_f32_pavgpool_minmax_ukernel_9x__scalar_c1(
8885
    size_t output_pixels,
8886
    size_t kernel_elements,
8887
    size_t channels,
8888
    const float** input,
8889
    size_t input_offset,
8890
    const float* zero,
8891
    const float* multiplier,
8892
    float* output,
8893
    size_t input_increment,
8894
    size_t output_increment,
8895
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
8896
0
{
8897
0
  assert(output_pixels != 0);
8898
0
  assert(kernel_elements != 0);
8899
0
  assert(kernel_elements <= 9);
8900
0
  assert(channels != 0);
8901
8902
0
  const float voutput_min = params->scalar.min;
8903
0
  const float voutput_max = params->scalar.max;
8904
8905
0
  do {
8906
0
    const float* i0 = input[0];
8907
0
    assert(i0 != NULL);
8908
0
    const float* i1 = input[1];
8909
0
    const float* i2 = input[2];
8910
0
    const float* i3 = input[3];
8911
0
    const float* i4 = input[4];
8912
0
    const float* i5 = input[5];
8913
0
    const float* i6 = input[6];
8914
0
    const float* i7 = input[7];
8915
0
    const float* i8 = input[8];
8916
0
    input = (const float**) ((uintptr_t) input + input_increment);
8917
0
    if (kernel_elements < 2) {
8918
0
      i1 = zero;
8919
0
    }
8920
0
    assert(i1 != NULL);
8921
0
    if (kernel_elements <= 2) {
8922
0
      i2 = zero;
8923
0
    }
8924
0
    assert(i2 != NULL);
8925
0
    if (kernel_elements < 4) {
8926
0
      i3 = zero;
8927
0
    }
8928
0
    assert(i3 != NULL);
8929
0
    if (kernel_elements <= 4) {
8930
0
      i4 = zero;
8931
0
    }
8932
0
    assert(i4 != NULL);
8933
0
    if (kernel_elements < 6) {
8934
0
      i5 = zero;
8935
0
    }
8936
0
    assert(i5 != NULL);
8937
0
    if (kernel_elements <= 6) {
8938
0
      i6 = zero;
8939
0
    }
8940
0
    assert(i6 != NULL);
8941
0
    if (kernel_elements < 8) {
8942
0
      i7 = zero;
8943
0
    }
8944
0
    assert(i7 != NULL);
8945
0
    if (kernel_elements <= 8) {
8946
0
      i8 = zero;
8947
0
    }
8948
0
    assert(i8 != NULL);
8949
0
    if XNN_UNPREDICTABLE(i0 != zero) {
8950
0
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
8951
0
    }
8952
0
    if XNN_UNPREDICTABLE(i1 != zero) {
8953
0
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
8954
0
    }
8955
0
    if XNN_UNPREDICTABLE(i2 != zero) {
8956
0
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
8957
0
    }
8958
0
    if XNN_UNPREDICTABLE(i3 != zero) {
8959
0
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
8960
0
    }
8961
0
    if XNN_UNPREDICTABLE(i4 != zero) {
8962
0
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
8963
0
    }
8964
0
    if XNN_UNPREDICTABLE(i5 != zero) {
8965
0
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
8966
0
    }
8967
0
    if XNN_UNPREDICTABLE(i6 != zero) {
8968
0
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
8969
0
    }
8970
0
    if XNN_UNPREDICTABLE(i7 != zero) {
8971
0
      i7 = (const float*) ((uintptr_t) i7 + input_offset);
8972
0
    }
8973
0
    if XNN_UNPREDICTABLE(i8 != zero) {
8974
0
      i8 = (const float*) ((uintptr_t) i8 + input_offset);
8975
0
    }
8976
8977
0
    const float vmultiplier = *multiplier++;
8978
8979
0
    size_t c = channels;
8980
0
    do {
8981
0
      const float vi0 = *i0++;
8982
0
      const float vi1 = *i1++;
8983
0
      const float vi2 = *i2++;
8984
0
      const float vi3 = *i3++;
8985
0
      const float vi4 = *i4++;
8986
0
      const float vi5 = *i5++;
8987
0
      const float vi6 = *i6++;
8988
0
      const float vi7 = *i7++;
8989
0
      const float vi8 = *i8++;
8990
8991
0
      const float vsum01 = vi0 + vi1;
8992
0
      const float vsum23 = vi2 + vi3;
8993
0
      const float vsum45 = vi4 + vi5;
8994
0
      const float vsum67 = vi6 + vi7;
8995
0
      const float vsum018 = vsum01 + vi8;
8996
0
      const float vsum2345 = vsum23 + vsum45;
8997
0
      const float vsum01678 = vsum018 + vsum67;
8998
0
      const float vsum = vsum2345 + vsum01678;
8999
9000
0
      float vout = vsum * vmultiplier;
9001
0
      vout = math_max_f32(vout, voutput_min);
9002
0
      vout = math_min_f32(vout, voutput_max);
9003
9004
0
      *output++ = vout;
9005
0
    } while (--c != 0);
9006
0
    output = (float*) ((uintptr_t) output + output_increment);
9007
0
  } while (--output_pixels != 0);
9008
0
}
9009
9010
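// PReLU microkernel processing 2 rows x 4 channels per iteration: negative
// inputs are multiplied by the per-channel weight, non-negative inputs pass
// through unchanged (e.g. with a weight of 0.25f, an input of -2.0f becomes
// -0.5f while 3.0f is left as-is; illustrative values only). A scalar loop
// handles channel counts that are not a multiple of 4, and an odd trailing row
// aliases the second row's pointers onto the first.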
void xnn_f32_prelu_ukernel__scalar_2x4(
9011
    size_t rows,
9012
    size_t channels,
9013
    const float* restrict input,
9014
    size_t input_stride,
9015
    const float* restrict weights,
9016
    float* restrict output,
9017
    size_t output_stride)
9018
0
{
9019
0
  assert(rows != 0);
9020
0
  assert(channels != 0);
9021
0
  assert(channels % sizeof(float) == 0);
9022
9023
0
  const float* i0 = input;
9024
0
  float* o0 = output;
9025
0
  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
9026
0
  float* o1 = (float*) ((uintptr_t) o0 + output_stride);
9027
9028
0
  const size_t input_increment = input_stride * 2 - channels;
9029
0
  const size_t output_increment = output_stride * 2 - channels;
9030
9031
0
  do {
9032
0
    if XNN_UNPREDICTABLE(rows < 2) {
9033
0
      i1 = i0;
9034
0
      o1 = o0;
9035
0
    }
9036
9037
0
    const float* w = weights;
9038
0
    size_t c = channels;
9039
0
    for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) {
9040
0
      const float vw0 = w[0];
9041
0
      const float vw1 = w[1];
9042
0
      const float vw2 = w[2];
9043
0
      const float vw3 = w[3];
9044
9045
0
      const float vi0x0 = i0[0];
9046
0
      const float vi0x1 = i0[1];
9047
0
      const float vi0x2 = i0[2];
9048
0
      const float vi0x3 = i0[3];
9049
0
      i0 += 4;
9050
0
      const float vi1x0 = i1[0];
9051
0
      const float vi1x1 = i1[1];
9052
0
      const float vi1x2 = i1[2];
9053
0
      const float vi1x3 = i1[3];
9054
0
      i1 += 4;
9055
9056
0
      const float vacc0x0 = XNN_UNPREDICTABLE(vi0x0 < 0.0f) ? vi0x0 * vw0 : vi0x0;
9057
0
      const float vacc0x1 = XNN_UNPREDICTABLE(vi0x1 < 0.0f) ? vi0x1 * vw1 : vi0x1;
9058
0
      const float vacc0x2 = XNN_UNPREDICTABLE(vi0x2 < 0.0f) ? vi0x2 * vw2 : vi0x2;
9059
0
      const float vacc0x3 = XNN_UNPREDICTABLE(vi0x3 < 0.0f) ? vi0x3 * vw3 : vi0x3;
9060
0
      const float vacc1x0 = XNN_UNPREDICTABLE(vi1x0 < 0.0f) ? vi1x0 * vw0 : vi1x0;
9061
0
      const float vacc1x1 = XNN_UNPREDICTABLE(vi1x1 < 0.0f) ? vi1x1 * vw1 : vi1x1;
9062
0
      const float vacc1x2 = XNN_UNPREDICTABLE(vi1x2 < 0.0f) ? vi1x2 * vw2 : vi1x2;
9063
0
      const float vacc1x3 = XNN_UNPREDICTABLE(vi1x3 < 0.0f) ? vi1x3 * vw3 : vi1x3;
9064
9065
0
      o0[0] = vacc0x0;
9066
0
      o0[1] = vacc0x1;
9067
0
      o0[2] = vacc0x2;
9068
0
      o0[3] = vacc0x3;
9069
0
      o0 += 4;
9070
0
      o1[0] = vacc1x0;
9071
0
      o1[1] = vacc1x1;
9072
0
      o1[2] = vacc1x2;
9073
0
      o1[3] = vacc1x3;
9074
0
      o1 += 4;
9075
9076
0
      w += 4;
9077
0
    }
9078
0
    for (; c != 0; c -= sizeof(float)) {
9079
0
      const float vw = *w++;
9080
9081
0
      const float vi0 = *i0++;
9082
0
      const float vi1 = *i1++;
9083
9084
0
      const float vacc0 = XNN_UNPREDICTABLE(vi0 < 0.0f) ? vi0 * vw : vi0;
9085
0
      const float vacc1 = XNN_UNPREDICTABLE(vi1 < 0.0f) ? vi1 * vw : vi1;
9086
9087
0
      *o0++ = vacc0;
9088
0
      *o1++ = vacc1;
9089
0
    }
9090
0
    i0 = (const float*) ((uintptr_t) i0 + input_increment);
9091
0
    o0 = (float*) ((uintptr_t) o0 + output_increment);
9092
0
    i1 = (const float*) ((uintptr_t) i1 + input_increment);
9093
0
    o1 = (float*) ((uintptr_t) o1 + output_increment);
9094
0
    rows = doz(rows, 2);
9095
0
  } while (rows != 0);
9096
0
}
9097
9098
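// 1x4 GEMM microkernel for f32 activations with per-channel-quantized 4-bit
// weights (QC4W). Each packed weight byte carries two 4-bit values: the low
// nibble feeds the even k step and the high nibble the odd one, and both are
// offset by the pre-negated kernel zero point before the multiply-add. As an
// illustration (values not taken from this report): with kernel_zero_point = 8,
// the byte 0x3A decodes to 0xA - 8 = 2.0f for the even step and 0x3 - 8 = -5.0f
// for the odd step. Accumulators start from the bias row of `w`, are scaled by
// the per-column scales that follow the packed weights, and are clamped to
// [min, max].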
void xnn_f32_qc4w_gemm_minmax_ukernel_1x4__scalar(
9099
    size_t mr,
9100
    size_t nc,
9101
    size_t kc,
9102
    const float* restrict a,
9103
    size_t a_stride,
9104
    const void* restrict w,
9105
    float* restrict c,
9106
    size_t cm_stride,
9107
    size_t cn_stride,
9108
    const union xnn_f32_qc4w_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
9109
0
{
9110
0
  assert(mr != 0);
9111
0
  assert(mr <= 1);
9112
0
  assert(nc != 0);
9113
0
  assert(kc != 0);
9114
0
  assert(kc % sizeof(float) == 0);
9115
0
  assert(a != NULL);
9116
0
  assert(w != NULL);
9117
0
  assert(c != NULL);
9118
9119
0
  const float* a0 = a;
9120
0
  float* c0 = c;
9121
9122
0
  const float vmin = params->scalar.min;
9123
0
  const float vmax = params->scalar.max;
9124
0
  const int32_t vminus_kernel_zero_point = params->scalar.minus_kernel_zero_point;
9125
0
  do {
9126
0
    float vacc00 = ((const float*)w)[0];
9127
0
    float vacc01 = ((const float*)w)[1];
9128
0
    float vacc02 = ((const float*)w)[2];
9129
0
    float vacc03 = ((const float*)w)[3];
9130
0
    w = (const float*) w + 4;
9131
9132
0
    size_t k = kc;
9133
0
    for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
9134
0
      const float va00 = *a0++;
9135
0
      const float va01 = *a0++;
9136
9137
0
      const uint8_t vbi0 = ((const uint8_t*) w)[0];
9138
0
      const uint8_t vbi1 = ((const uint8_t*) w)[1];
9139
0
      const uint8_t vbi2 = ((const uint8_t*) w)[2];
9140
0
      const uint8_t vbi3 = ((const uint8_t*) w)[3];
9141
0
      const float vb00 = (float) ((int32_t) (vbi0 & 0xF) + vminus_kernel_zero_point);
9142
0
      const float vb10 = (float) ((int32_t) (vbi1 & 0xF) + vminus_kernel_zero_point);
9143
0
      const float vb20 = (float) ((int32_t) (vbi2 & 0xF) + vminus_kernel_zero_point);
9144
0
      const float vb30 = (float) ((int32_t) (vbi3 & 0xF) + vminus_kernel_zero_point);
9145
0
      const float vb01 = (float) ((int32_t) (vbi0 >> 4) + vminus_kernel_zero_point);
9146
0
      const float vb11 = (float) ((int32_t) (vbi1 >> 4) + vminus_kernel_zero_point);
9147
0
      const float vb21 = (float) ((int32_t) (vbi2 >> 4) + vminus_kernel_zero_point);
9148
0
      const float vb31 = (float) ((int32_t) (vbi3 >> 4) + vminus_kernel_zero_point);
9149
0
      w = (const int8_t*) w + 4;
9150
9151
0
      vacc00 = math_muladd_f32(va00, vb00, vacc00);
9152
0
      vacc01 = math_muladd_f32(va00, vb10, vacc01);
9153
0
      vacc02 = math_muladd_f32(va00, vb20, vacc02);
9154
0
      vacc03 = math_muladd_f32(va00, vb30, vacc03);
9155
0
      vacc00 = math_muladd_f32(va01, vb01, vacc00);
9156
0
      vacc01 = math_muladd_f32(va01, vb11, vacc01);
9157
0
      vacc02 = math_muladd_f32(va01, vb21, vacc02);
9158
0
      vacc03 = math_muladd_f32(va01, vb31, vacc03);
9159
0
    }
9160
0
    if XNN_UNLIKELY(k != 0) {
9161
0
      const float va0 = *a0++;
9162
9163
0
      const uint8_t vbi0 = ((const uint8_t*) w)[0];
9164
0
      const uint8_t vbi1 = ((const uint8_t*) w)[1];
9165
0
      const uint8_t vbi2 = ((const uint8_t*) w)[2];
9166
0
      const uint8_t vbi3 = ((const uint8_t*) w)[3];
9167
0
      const float vb0 = (float) ((int32_t) vbi0 + vminus_kernel_zero_point);
9168
0
      const float vb1 = (float) ((int32_t) vbi1 + vminus_kernel_zero_point);
9169
0
      const float vb2 = (float) ((int32_t) vbi2 + vminus_kernel_zero_point);
9170
0
      const float vb3 = (float) ((int32_t) vbi3 + vminus_kernel_zero_point);
9171
0
      w = (const int8_t*) w + 4;
9172
9173
0
      vacc00 = math_muladd_f32(va0, vb0, vacc00);
9174
0
      vacc01 = math_muladd_f32(va0, vb1, vacc01);
9175
0
      vacc02 = math_muladd_f32(va0, vb2, vacc02);
9176
0
      vacc03 = math_muladd_f32(va0, vb3, vacc03);
9177
0
    }
9178
9179
0
    const float vscale0 = ((const float*)w)[0];
9180
0
    const float vscale1 = ((const float*)w)[1];
9181
0
    const float vscale2 = ((const float*)w)[2];
9182
0
    const float vscale3 = ((const float*)w)[3];
9183
0
    w = (const float*) w + 4;
9184
0
    vacc00 *= vscale0;
9185
0
    vacc01 *= vscale1;
9186
0
    vacc02 *= vscale2;
9187
0
    vacc03 *= vscale3;
9188
0
    vacc00 = math_max_f32(vacc00, vmin);
9189
0
    vacc01 = math_max_f32(vacc01, vmin);
9190
0
    vacc02 = math_max_f32(vacc02, vmin);
9191
0
    vacc03 = math_max_f32(vacc03, vmin);
9192
9193
0
    vacc00 = math_min_f32(vacc00, vmax);
9194
0
    vacc01 = math_min_f32(vacc01, vmax);
9195
0
    vacc02 = math_min_f32(vacc02, vmax);
9196
0
    vacc03 = math_min_f32(vacc03, vmax);
9197
9198
0
    if XNN_LIKELY(nc >= 4) {
9199
0
      c0[0] = vacc00;
9200
0
      c0[1] = vacc01;
9201
0
      c0[2] = vacc02;
9202
0
      c0[3] = vacc03;
9203
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
9204
9205
0
      a0 = (const void*) ((uintptr_t) a0 - kc);
9206
9207
0
      nc -= 4;
9208
0
    } else {
9209
0
      if (nc & 2) {
9210
0
        c0[0] = vacc00;
9211
0
        c0[1] = vacc01;
9212
0
        vacc00 = vacc02;
9213
0
        c0 += 2;
9214
0
      }
9215
0
      if (nc & 1) {
9216
0
        c0[0] = vacc00;
9217
0
      }
9218
9219
0
      nc = 0;
9220
0
    }
9221
0
  } while (nc != 0);
9222
0
}
9223
9224
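// 4x4 variant of the QC4W GEMM microkernel above: each decoded 4-bit weight is
// reused across four activation rows per k step, and rows beyond `mr` alias the
// previous row's pointers so the inner loop needs no bounds checks.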
void xnn_f32_qc4w_gemm_minmax_ukernel_4x4__scalar(
9225
    size_t mr,
9226
    size_t nc,
9227
    size_t kc,
9228
    const float* restrict a,
9229
    size_t a_stride,
9230
    const void* restrict w,
9231
    float* restrict c,
9232
    size_t cm_stride,
9233
    size_t cn_stride,
9234
    const union xnn_f32_qc4w_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
9235
0
{
9236
0
  assert(mr != 0);
9237
0
  assert(mr <= 4);
9238
0
  assert(nc != 0);
9239
0
  assert(kc != 0);
9240
0
  assert(kc % sizeof(float) == 0);
9241
0
  assert(a != NULL);
9242
0
  assert(w != NULL);
9243
0
  assert(c != NULL);
9244
9245
0
  const float* a0 = a;
9246
0
  float* c0 = c;
9247
0
  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
9248
0
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
9249
0
  if XNN_UNPREDICTABLE(mr < 2) {
9250
0
    a1 = a0;
9251
0
    c1 = c0;
9252
0
  }
9253
0
  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
9254
0
  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
9255
0
  if XNN_UNPREDICTABLE(mr <= 2) {
9256
0
    a2 = a1;
9257
0
    c2 = c1;
9258
0
  }
9259
0
  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
9260
0
  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
9261
0
  if XNN_UNPREDICTABLE(mr != 4) {
9262
0
    a3 = a2;
9263
0
    c3 = c2;
9264
0
  }
9265
9266
0
  const float vmin = params->scalar.min;
9267
0
  const float vmax = params->scalar.max;
9268
0
  const int32_t vminus_kernel_zero_point = params->scalar.minus_kernel_zero_point;
9269
0
  do {
9270
0
    float vacc00 = ((const float*)w)[0];
9271
0
    float vacc01 = ((const float*)w)[1];
9272
0
    float vacc02 = ((const float*)w)[2];
9273
0
    float vacc03 = ((const float*)w)[3];
9274
0
    w = (const float*) w + 4;
9275
0
    float vacc10 = vacc00;
9276
0
    float vacc11 = vacc01;
9277
0
    float vacc12 = vacc02;
9278
0
    float vacc13 = vacc03;
9279
0
    float vacc20 = vacc00;
9280
0
    float vacc21 = vacc01;
9281
0
    float vacc22 = vacc02;
9282
0
    float vacc23 = vacc03;
9283
0
    float vacc30 = vacc00;
9284
0
    float vacc31 = vacc01;
9285
0
    float vacc32 = vacc02;
9286
0
    float vacc33 = vacc03;
9287
9288
0
    size_t k = kc;
9289
0
    for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
9290
0
      const float va00 = *a0++;
9291
0
      const float va01 = *a0++;
9292
0
      const float va10 = *a1++;
9293
0
      const float va11 = *a1++;
9294
0
      const float va20 = *a2++;
9295
0
      const float va21 = *a2++;
9296
0
      const float va30 = *a3++;
9297
0
      const float va31 = *a3++;
9298
9299
0
      const uint8_t vbi0 = ((const uint8_t*) w)[0];
9300
0
      const uint8_t vbi1 = ((const uint8_t*) w)[1];
9301
0
      const uint8_t vbi2 = ((const uint8_t*) w)[2];
9302
0
      const uint8_t vbi3 = ((const uint8_t*) w)[3];
9303
0
      const float vb00 = (float) ((int32_t) (vbi0 & 0xF) + vminus_kernel_zero_point);
9304
0
      const float vb10 = (float) ((int32_t) (vbi1 & 0xF) + vminus_kernel_zero_point);
9305
0
      const float vb20 = (float) ((int32_t) (vbi2 & 0xF) + vminus_kernel_zero_point);
9306
0
      const float vb30 = (float) ((int32_t) (vbi3 & 0xF) + vminus_kernel_zero_point);
9307
0
      const float vb01 = (float) ((int32_t) (vbi0 >> 4) + vminus_kernel_zero_point);
9308
0
      const float vb11 = (float) ((int32_t) (vbi1 >> 4) + vminus_kernel_zero_point);
9309
0
      const float vb21 = (float) ((int32_t) (vbi2 >> 4) + vminus_kernel_zero_point);
9310
0
      const float vb31 = (float) ((int32_t) (vbi3 >> 4) + vminus_kernel_zero_point);
9311
0
      w = (const int8_t*) w + 4;
9312
9313
0
      vacc00 = math_muladd_f32(va00, vb00, vacc00);
9314
0
      vacc01 = math_muladd_f32(va00, vb10, vacc01);
9315
0
      vacc02 = math_muladd_f32(va00, vb20, vacc02);
9316
0
      vacc03 = math_muladd_f32(va00, vb30, vacc03);
9317
0
      vacc10 = math_muladd_f32(va10, vb00, vacc10);
9318
0
      vacc11 = math_muladd_f32(va10, vb10, vacc11);
9319
0
      vacc12 = math_muladd_f32(va10, vb20, vacc12);
9320
0
      vacc13 = math_muladd_f32(va10, vb30, vacc13);
9321
0
      vacc20 = math_muladd_f32(va20, vb00, vacc20);
9322
0
      vacc21 = math_muladd_f32(va20, vb10, vacc21);
9323
0
      vacc22 = math_muladd_f32(va20, vb20, vacc22);
9324
0
      vacc23 = math_muladd_f32(va20, vb30, vacc23);
9325
0
      vacc30 = math_muladd_f32(va30, vb00, vacc30);
9326
0
      vacc31 = math_muladd_f32(va30, vb10, vacc31);
9327
0
      vacc32 = math_muladd_f32(va30, vb20, vacc32);
9328
0
      vacc33 = math_muladd_f32(va30, vb30, vacc33);
9329
0
      vacc00 = math_muladd_f32(va01, vb01, vacc00);
9330
0
      vacc01 = math_muladd_f32(va01, vb11, vacc01);
9331
0
      vacc02 = math_muladd_f32(va01, vb21, vacc02);
9332
0
      vacc03 = math_muladd_f32(va01, vb31, vacc03);
9333
0
      vacc10 = math_muladd_f32(va11, vb01, vacc10);
9334
0
      vacc11 = math_muladd_f32(va11, vb11, vacc11);
9335
0
      vacc12 = math_muladd_f32(va11, vb21, vacc12);
9336
0
      vacc13 = math_muladd_f32(va11, vb31, vacc13);
9337
0
      vacc20 = math_muladd_f32(va21, vb01, vacc20);
9338
0
      vacc21 = math_muladd_f32(va21, vb11, vacc21);
9339
0
      vacc22 = math_muladd_f32(va21, vb21, vacc22);
9340
0
      vacc23 = math_muladd_f32(va21, vb31, vacc23);
9341
0
      vacc30 = math_muladd_f32(va31, vb01, vacc30);
9342
0
      vacc31 = math_muladd_f32(va31, vb11, vacc31);
9343
0
      vacc32 = math_muladd_f32(va31, vb21, vacc32);
9344
0
      vacc33 = math_muladd_f32(va31, vb31, vacc33);
9345
0
    }
9346
0
    if XNN_UNLIKELY(k != 0) {
9347
0
      const float va0 = *a0++;
9348
0
      const float va1 = *a1++;
9349
0
      const float va2 = *a2++;
9350
0
      const float va3 = *a3++;
9351
9352
0
      const uint8_t vbi0 = ((const uint8_t*) w)[0];
9353
0
      const uint8_t vbi1 = ((const uint8_t*) w)[1];
9354
0
      const uint8_t vbi2 = ((const uint8_t*) w)[2];
9355
0
      const uint8_t vbi3 = ((const uint8_t*) w)[3];
9356
0
      const float vb0 = (float) ((int32_t) vbi0 + vminus_kernel_zero_point);
9357
0
      const float vb1 = (float) ((int32_t) vbi1 + vminus_kernel_zero_point);
9358
0
      const float vb2 = (float) ((int32_t) vbi2 + vminus_kernel_zero_point);
9359
0
      const float vb3 = (float) ((int32_t) vbi3 + vminus_kernel_zero_point);
9360
0
      w = (const int8_t*) w + 4;
9361
9362
0
      vacc00 = math_muladd_f32(va0, vb0, vacc00);
9363
0
      vacc01 = math_muladd_f32(va0, vb1, vacc01);
9364
0
      vacc02 = math_muladd_f32(va0, vb2, vacc02);
9365
0
      vacc03 = math_muladd_f32(va0, vb3, vacc03);
9366
0
      vacc10 = math_muladd_f32(va1, vb0, vacc10);
9367
0
      vacc11 = math_muladd_f32(va1, vb1, vacc11);
9368
0
      vacc12 = math_muladd_f32(va1, vb2, vacc12);
9369
0
      vacc13 = math_muladd_f32(va1, vb3, vacc13);
9370
0
      vacc20 = math_muladd_f32(va2, vb0, vacc20);
9371
0
      vacc21 = math_muladd_f32(va2, vb1, vacc21);
9372
0
      vacc22 = math_muladd_f32(va2, vb2, vacc22);
9373
0
      vacc23 = math_muladd_f32(va2, vb3, vacc23);
9374
0
      vacc30 = math_muladd_f32(va3, vb0, vacc30);
9375
0
      vacc31 = math_muladd_f32(va3, vb1, vacc31);
9376
0
      vacc32 = math_muladd_f32(va3, vb2, vacc32);
9377
0
      vacc33 = math_muladd_f32(va3, vb3, vacc33);
9378
0
    }
9379
9380
0
    const float vscale0 = ((const float*)w)[0];
9381
0
    const float vscale1 = ((const float*)w)[1];
9382
0
    const float vscale2 = ((const float*)w)[2];
9383
0
    const float vscale3 = ((const float*)w)[3];
9384
0
    w = (const float*) w + 4;
9385
0
    vacc00 *= vscale0;
9386
0
    vacc10 *= vscale0;
9387
0
    vacc20 *= vscale0;
9388
0
    vacc30 *= vscale0;
9389
0
    vacc01 *= vscale1;
9390
0
    vacc11 *= vscale1;
9391
0
    vacc21 *= vscale1;
9392
0
    vacc31 *= vscale1;
9393
0
    vacc02 *= vscale2;
9394
0
    vacc12 *= vscale2;
9395
0
    vacc22 *= vscale2;
9396
0
    vacc32 *= vscale2;
9397
0
    vacc03 *= vscale3;
9398
0
    vacc13 *= vscale3;
9399
0
    vacc23 *= vscale3;
9400
0
    vacc33 *= vscale3;
9401
0
    vacc00 = math_max_f32(vacc00, vmin);
9402
0
    vacc01 = math_max_f32(vacc01, vmin);
9403
0
    vacc02 = math_max_f32(vacc02, vmin);
9404
0
    vacc03 = math_max_f32(vacc03, vmin);
9405
0
    vacc10 = math_max_f32(vacc10, vmin);
9406
0
    vacc11 = math_max_f32(vacc11, vmin);
9407
0
    vacc12 = math_max_f32(vacc12, vmin);
9408
0
    vacc13 = math_max_f32(vacc13, vmin);
9409
0
    vacc20 = math_max_f32(vacc20, vmin);
9410
0
    vacc21 = math_max_f32(vacc21, vmin);
9411
0
    vacc22 = math_max_f32(vacc22, vmin);
9412
0
    vacc23 = math_max_f32(vacc23, vmin);
9413
0
    vacc30 = math_max_f32(vacc30, vmin);
9414
0
    vacc31 = math_max_f32(vacc31, vmin);
9415
0
    vacc32 = math_max_f32(vacc32, vmin);
9416
0
    vacc33 = math_max_f32(vacc33, vmin);
9417
9418
0
    vacc00 = math_min_f32(vacc00, vmax);
9419
0
    vacc01 = math_min_f32(vacc01, vmax);
9420
0
    vacc02 = math_min_f32(vacc02, vmax);
9421
0
    vacc03 = math_min_f32(vacc03, vmax);
9422
0
    vacc10 = math_min_f32(vacc10, vmax);
9423
0
    vacc11 = math_min_f32(vacc11, vmax);
9424
0
    vacc12 = math_min_f32(vacc12, vmax);
9425
0
    vacc13 = math_min_f32(vacc13, vmax);
9426
0
    vacc20 = math_min_f32(vacc20, vmax);
9427
0
    vacc21 = math_min_f32(vacc21, vmax);
9428
0
    vacc22 = math_min_f32(vacc22, vmax);
9429
0
    vacc23 = math_min_f32(vacc23, vmax);
9430
0
    vacc30 = math_min_f32(vacc30, vmax);
9431
0
    vacc31 = math_min_f32(vacc31, vmax);
9432
0
    vacc32 = math_min_f32(vacc32, vmax);
9433
0
    vacc33 = math_min_f32(vacc33, vmax);
9434
9435
0
    if XNN_LIKELY(nc >= 4) {
9436
0
      c3[0] = vacc30;
9437
0
      c3[1] = vacc31;
9438
0
      c3[2] = vacc32;
9439
0
      c3[3] = vacc33;
9440
0
      c3 = (float*) ((uintptr_t) c3 + cn_stride);
9441
0
      c2[0] = vacc20;
9442
0
      c2[1] = vacc21;
9443
0
      c2[2] = vacc22;
9444
0
      c2[3] = vacc23;
9445
0
      c2 = (float*) ((uintptr_t) c2 + cn_stride);
9446
0
      c1[0] = vacc10;
9447
0
      c1[1] = vacc11;
9448
0
      c1[2] = vacc12;
9449
0
      c1[3] = vacc13;
9450
0
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
9451
0
      c0[0] = vacc00;
9452
0
      c0[1] = vacc01;
9453
0
      c0[2] = vacc02;
9454
0
      c0[3] = vacc03;
9455
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
9456
9457
0
      a3 = (const void*) ((uintptr_t) a3 - kc);
9458
0
      a2 = (const void*) ((uintptr_t) a2 - kc);
9459
0
      a1 = (const void*) ((uintptr_t) a1 - kc);
9460
0
      a0 = (const void*) ((uintptr_t) a0 - kc);
9461
9462
0
      nc -= 4;
9463
0
    } else {
9464
0
      if (nc & 2) {
9465
0
        c3[0] = vacc30;
9466
0
        c3[1] = vacc31;
9467
0
        vacc30 = vacc32;
9468
0
        c3 += 2;
9469
0
        c2[0] = vacc20;
9470
0
        c2[1] = vacc21;
9471
0
        vacc20 = vacc22;
9472
0
        c2 += 2;
9473
0
        c1[0] = vacc10;
9474
0
        c1[1] = vacc11;
9475
0
        vacc10 = vacc12;
9476
0
        c1 += 2;
9477
0
        c0[0] = vacc00;
9478
0
        c0[1] = vacc01;
9479
0
        vacc00 = vacc02;
9480
0
        c0 += 2;
9481
0
      }
9482
0
      if (nc & 1) {
9483
0
        c3[0] = vacc30;
9484
0
        c2[0] = vacc20;
9485
0
        c1[0] = vacc10;
9486
0
        c0[0] = vacc00;
9487
0
      }
9488
9489
0
      nc = 0;
9490
0
    }
9491
0
  } while (nc != 0);
9492
0
}
9493
9494
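// 1x4 GEMM microkernel for f32 activations with per-channel-quantized 8-bit
// weights (QC8W): each int8 weight is converted directly to float (no zero
// point), multiply-accumulated with the activation, then the four column
// accumulators are multiplied by their per-column scales and clamped to
// [min, max].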
void xnn_f32_qc8w_gemm_minmax_ukernel_1x4__scalar(
9495
    size_t mr,
9496
    size_t nc,
9497
    size_t kc,
9498
    const float* restrict a,
9499
    size_t a_stride,
9500
    const void* restrict w,
9501
    float* restrict c,
9502
    size_t cm_stride,
9503
    size_t cn_stride,
9504
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
9505
0
{
9506
0
  assert(mr != 0);
9507
0
  assert(mr <= 1);
9508
0
  assert(nc != 0);
9509
0
  assert(kc != 0);
9510
0
  assert(kc % sizeof(float) == 0);
9511
0
  assert(a != NULL);
9512
0
  assert(w != NULL);
9513
0
  assert(c != NULL);
9514
9515
0
  const float* a0 = a;
9516
0
  float* c0 = c;
9517
9518
0
  const float vmin = params->scalar.min;
9519
0
  const float vmax = params->scalar.max;
9520
0
  do {
9521
0
    float vacc00 = ((const float*)w)[0];
9522
0
    float vacc01 = ((const float*)w)[1];
9523
0
    float vacc02 = ((const float*)w)[2];
9524
0
    float vacc03 = ((const float*)w)[3];
9525
0
    w = (const float*) w + 4;
9526
9527
0
    size_t k = kc;
9528
0
    do {
9529
0
      const float va0 = *a0++;
9530
9531
0
      const float vb0 = (float) ((const int8_t*) w)[0];
9532
0
      const float vb1 = (float) ((const int8_t*) w)[1];
9533
0
      const float vb2 = (float) ((const int8_t*) w)[2];
9534
0
      const float vb3 = (float) ((const int8_t*) w)[3];
9535
0
      w = (const int8_t*) w + 4;
9536
9537
0
      vacc00 = math_muladd_f32(va0, vb0, vacc00);
9538
0
      vacc01 = math_muladd_f32(va0, vb1, vacc01);
9539
0
      vacc02 = math_muladd_f32(va0, vb2, vacc02);
9540
0
      vacc03 = math_muladd_f32(va0, vb3, vacc03);
9541
9542
0
      k -= sizeof(float);
9543
0
    } while (k != 0);
9544
9545
0
    const float vscale0 = ((const float*)w)[0];
9546
0
    const float vscale1 = ((const float*)w)[1];
9547
0
    const float vscale2 = ((const float*)w)[2];
9548
0
    const float vscale3 = ((const float*)w)[3];
9549
0
    w = (const float*) w + 4;
9550
0
    vacc00 *= vscale0;
9551
0
    vacc01 *= vscale1;
9552
0
    vacc02 *= vscale2;
9553
0
    vacc03 *= vscale3;
9554
0
    vacc00 = math_max_f32(vacc00, vmin);
9555
0
    vacc01 = math_max_f32(vacc01, vmin);
9556
0
    vacc02 = math_max_f32(vacc02, vmin);
9557
0
    vacc03 = math_max_f32(vacc03, vmin);
9558
9559
0
    vacc00 = math_min_f32(vacc00, vmax);
9560
0
    vacc01 = math_min_f32(vacc01, vmax);
9561
0
    vacc02 = math_min_f32(vacc02, vmax);
9562
0
    vacc03 = math_min_f32(vacc03, vmax);
9563
9564
0
    if XNN_LIKELY(nc >= 4) {
9565
0
      c0[0] = vacc00;
9566
0
      c0[1] = vacc01;
9567
0
      c0[2] = vacc02;
9568
0
      c0[3] = vacc03;
9569
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
9570
9571
0
      a0 = (const void*) ((uintptr_t) a0 - kc);
9572
9573
0
      nc -= 4;
9574
0
    } else {
9575
0
      if (nc & 2) {
9576
0
        c0[0] = vacc00;
9577
0
        c0[1] = vacc01;
9578
0
        vacc00 = vacc02;
9579
0
        c0 += 2;
9580
0
      }
9581
0
      if (nc & 1) {
9582
0
        c0[0] = vacc00;
9583
0
      }
9584
9585
0
      nc = 0;
9586
0
    }
9587
0
  } while (nc != 0);
9588
0
}
9589
9590
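// 4x4 variant of the QC8W GEMM microkernel above, reusing each converted int8
// weight across four activation rows per k step; the nc < 4 tail stores 2- and
// 1-column remainders exactly as in the 1x4 kernel.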
void xnn_f32_qc8w_gemm_minmax_ukernel_4x4__scalar(
9591
    size_t mr,
9592
    size_t nc,
9593
    size_t kc,
9594
    const float* restrict a,
9595
    size_t a_stride,
9596
    const void* restrict w,
9597
    float* restrict c,
9598
    size_t cm_stride,
9599
    size_t cn_stride,
9600
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
9601
0
{
9602
0
  assert(mr != 0);
9603
0
  assert(mr <= 4);
9604
0
  assert(nc != 0);
9605
0
  assert(kc != 0);
9606
0
  assert(kc % sizeof(float) == 0);
9607
0
  assert(a != NULL);
9608
0
  assert(w != NULL);
9609
0
  assert(c != NULL);
9610
9611
0
  const float* a0 = a;
9612
0
  float* c0 = c;
9613
0
  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
9614
0
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
9615
0
  if XNN_UNPREDICTABLE(mr < 2) {
9616
0
    a1 = a0;
9617
0
    c1 = c0;
9618
0
  }
9619
0
  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
9620
0
  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
9621
0
  if XNN_UNPREDICTABLE(mr <= 2) {
9622
0
    a2 = a1;
9623
0
    c2 = c1;
9624
0
  }
9625
0
  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
9626
0
  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
9627
0
  if XNN_UNPREDICTABLE(mr != 4) {
9628
0
    a3 = a2;
9629
0
    c3 = c2;
9630
0
  }
9631
9632
0
  const float vmin = params->scalar.min;
9633
0
  const float vmax = params->scalar.max;
9634
0
  do {
9635
0
    float vacc00 = ((const float*)w)[0];
9636
0
    float vacc01 = ((const float*)w)[1];
9637
0
    float vacc02 = ((const float*)w)[2];
9638
0
    float vacc03 = ((const float*)w)[3];
9639
0
    w = (const float*) w + 4;
9640
0
    float vacc10 = vacc00;
9641
0
    float vacc11 = vacc01;
9642
0
    float vacc12 = vacc02;
9643
0
    float vacc13 = vacc03;
9644
0
    float vacc20 = vacc00;
9645
0
    float vacc21 = vacc01;
9646
0
    float vacc22 = vacc02;
9647
0
    float vacc23 = vacc03;
9648
0
    float vacc30 = vacc00;
9649
0
    float vacc31 = vacc01;
9650
0
    float vacc32 = vacc02;
9651
0
    float vacc33 = vacc03;
9652
9653
0
    size_t k = kc;
9654
0
    do {
9655
0
      const float va0 = *a0++;
9656
0
      const float va1 = *a1++;
9657
0
      const float va2 = *a2++;
9658
0
      const float va3 = *a3++;
9659
9660
0
      const float vb0 = (float) ((const int8_t*) w)[0];
9661
0
      const float vb1 = (float) ((const int8_t*) w)[1];
9662
0
      const float vb2 = (float) ((const int8_t*) w)[2];
9663
0
      const float vb3 = (float) ((const int8_t*) w)[3];
9664
0
      w = (const int8_t*) w + 4;
9665
9666
0
      vacc00 = math_muladd_f32(va0, vb0, vacc00);
9667
0
      vacc01 = math_muladd_f32(va0, vb1, vacc01);
9668
0
      vacc02 = math_muladd_f32(va0, vb2, vacc02);
9669
0
      vacc03 = math_muladd_f32(va0, vb3, vacc03);
9670
0
      vacc10 = math_muladd_f32(va1, vb0, vacc10);
9671
0
      vacc11 = math_muladd_f32(va1, vb1, vacc11);
9672
0
      vacc12 = math_muladd_f32(va1, vb2, vacc12);
9673
0
      vacc13 = math_muladd_f32(va1, vb3, vacc13);
9674
0
      vacc20 = math_muladd_f32(va2, vb0, vacc20);
9675
0
      vacc21 = math_muladd_f32(va2, vb1, vacc21);
9676
0
      vacc22 = math_muladd_f32(va2, vb2, vacc22);
9677
0
      vacc23 = math_muladd_f32(va2, vb3, vacc23);
9678
0
      vacc30 = math_muladd_f32(va3, vb0, vacc30);
9679
0
      vacc31 = math_muladd_f32(va3, vb1, vacc31);
9680
0
      vacc32 = math_muladd_f32(va3, vb2, vacc32);
9681
0
      vacc33 = math_muladd_f32(va3, vb3, vacc33);
9682
9683
0
      k -= sizeof(float);
9684
0
    } while (k != 0);
9685
9686
0
    const float vscale0 = ((const float*)w)[0];
9687
0
    const float vscale1 = ((const float*)w)[1];
9688
0
    const float vscale2 = ((const float*)w)[2];
9689
0
    const float vscale3 = ((const float*)w)[3];
9690
0
    w = (const float*) w + 4;
9691
0
    vacc00 *= vscale0;
9692
0
    vacc10 *= vscale0;
9693
0
    vacc20 *= vscale0;
9694
0
    vacc30 *= vscale0;
9695
0
    vacc01 *= vscale1;
9696
0
    vacc11 *= vscale1;
9697
0
    vacc21 *= vscale1;
9698
0
    vacc31 *= vscale1;
9699
0
    vacc02 *= vscale2;
9700
0
    vacc12 *= vscale2;
9701
0
    vacc22 *= vscale2;
9702
0
    vacc32 *= vscale2;
9703
0
    vacc03 *= vscale3;
9704
0
    vacc13 *= vscale3;
9705
0
    vacc23 *= vscale3;
9706
0
    vacc33 *= vscale3;
9707
0
    vacc00 = math_max_f32(vacc00, vmin);
9708
0
    vacc01 = math_max_f32(vacc01, vmin);
9709
0
    vacc02 = math_max_f32(vacc02, vmin);
9710
0
    vacc03 = math_max_f32(vacc03, vmin);
9711
0
    vacc10 = math_max_f32(vacc10, vmin);
9712
0
    vacc11 = math_max_f32(vacc11, vmin);
9713
0
    vacc12 = math_max_f32(vacc12, vmin);
9714
0
    vacc13 = math_max_f32(vacc13, vmin);
9715
0
    vacc20 = math_max_f32(vacc20, vmin);
9716
0
    vacc21 = math_max_f32(vacc21, vmin);
9717
0
    vacc22 = math_max_f32(vacc22, vmin);
9718
0
    vacc23 = math_max_f32(vacc23, vmin);
9719
0
    vacc30 = math_max_f32(vacc30, vmin);
9720
0
    vacc31 = math_max_f32(vacc31, vmin);
9721
0
    vacc32 = math_max_f32(vacc32, vmin);
9722
0
    vacc33 = math_max_f32(vacc33, vmin);
9723
9724
0
    vacc00 = math_min_f32(vacc00, vmax);
9725
0
    vacc01 = math_min_f32(vacc01, vmax);
9726
0
    vacc02 = math_min_f32(vacc02, vmax);
9727
0
    vacc03 = math_min_f32(vacc03, vmax);
9728
0
    vacc10 = math_min_f32(vacc10, vmax);
9729
0
    vacc11 = math_min_f32(vacc11, vmax);
9730
0
    vacc12 = math_min_f32(vacc12, vmax);
9731
0
    vacc13 = math_min_f32(vacc13, vmax);
9732
0
    vacc20 = math_min_f32(vacc20, vmax);
9733
0
    vacc21 = math_min_f32(vacc21, vmax);
9734
0
    vacc22 = math_min_f32(vacc22, vmax);
9735
0
    vacc23 = math_min_f32(vacc23, vmax);
9736
0
    vacc30 = math_min_f32(vacc30, vmax);
9737
0
    vacc31 = math_min_f32(vacc31, vmax);
9738
0
    vacc32 = math_min_f32(vacc32, vmax);
9739
0
    vacc33 = math_min_f32(vacc33, vmax);
9740
9741
0
    if XNN_LIKELY(nc >= 4) {
9742
0
      c3[0] = vacc30;
9743
0
      c3[1] = vacc31;
9744
0
      c3[2] = vacc32;
9745
0
      c3[3] = vacc33;
9746
0
      c3 = (float*) ((uintptr_t) c3 + cn_stride);
9747
0
      c2[0] = vacc20;
9748
0
      c2[1] = vacc21;
9749
0
      c2[2] = vacc22;
9750
0
      c2[3] = vacc23;
9751
0
      c2 = (float*) ((uintptr_t) c2 + cn_stride);
9752
0
      c1[0] = vacc10;
9753
0
      c1[1] = vacc11;
9754
0
      c1[2] = vacc12;
9755
0
      c1[3] = vacc13;
9756
0
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
9757
0
      c0[0] = vacc00;
9758
0
      c0[1] = vacc01;
9759
0
      c0[2] = vacc02;
9760
0
      c0[3] = vacc03;
9761
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
9762
9763
0
      a3 = (const void*) ((uintptr_t) a3 - kc);
9764
0
      a2 = (const void*) ((uintptr_t) a2 - kc);
9765
0
      a1 = (const void*) ((uintptr_t) a1 - kc);
9766
0
      a0 = (const void*) ((uintptr_t) a0 - kc);
9767
9768
0
      nc -= 4;
9769
0
    } else {
9770
0
      if (nc & 2) {
9771
0
        c3[0] = vacc30;
9772
0
        c3[1] = vacc31;
9773
0
        vacc30 = vacc32;
9774
0
        c3 += 2;
9775
0
        c2[0] = vacc20;
9776
0
        c2[1] = vacc21;
9777
0
        vacc20 = vacc22;
9778
0
        c2 += 2;
9779
0
        c1[0] = vacc10;
9780
0
        c1[1] = vacc11;
9781
0
        vacc10 = vacc12;
9782
0
        c1 += 2;
9783
0
        c0[0] = vacc00;
9784
0
        c0[1] = vacc01;
9785
0
        vacc00 = vacc02;
9786
0
        c0 += 2;
9787
0
      }
9788
0
      if (nc & 1) {
9789
0
        c3[0] = vacc30;
9790
0
        c2[0] = vacc20;
9791
0
        c1[0] = vacc10;
9792
0
        c0[0] = vacc00;
9793
0
      }
9794
9795
0
      nc = 0;
9796
0
    }
9797
0
  } while (nc != 0);
9798
0
}
9799
9800
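// float -> int8 quantization using the "imagic" trick: after scaling, a large
// magic bias is added so that the rounded result lands in the low mantissa bits
// of the float; reinterpreting those bits as int32 yields the integer plus a
// constant. Clamping happens in the integer domain against magic_min/magic_max,
// and subtracting magic_bias_less_zero_point removes the constant while folding
// in the output zero point, avoiding lrintf() and any float-domain clamp.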
void xnn_f32_qs8_vcvt_ukernel__scalar_imagic_u4(
9801
    size_t batch,
9802
    const float* input,
9803
    int8_t* output,
9804
    const union xnn_f32_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
9805
0
{
9806
0
  assert(batch != 0);
9807
0
  assert(batch % sizeof(float) == 0);
9808
0
  assert(input != NULL);
9809
0
  assert(output != NULL);
9810
9811
0
  const float vscale = params->scalar_imagic.scale;
9812
0
  const float vmagic_bias = params->scalar_imagic.magic_bias;
9813
0
  const int32_t vmagic_min = params->scalar_imagic.magic_min;
9814
0
  const int32_t vmagic_max = params->scalar_imagic.magic_max;
9815
0
  const int32_t vmagic_bias_less_zero_point = params->scalar_imagic.magic_bias_less_zero_point;
9816
9817
0
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
9818
0
    float vx0 = input[0];
9819
0
    float vx1 = input[1];
9820
0
    float vx2 = input[2];
9821
0
    float vx3 = input[3];
9822
0
    input += 4;
9823
9824
0
    vx0 *= vscale;
9825
0
    vx1 *= vscale;
9826
0
    vx2 *= vscale;
9827
0
    vx3 *= vscale;
9828
9829
0
    vx0 += vmagic_bias;
9830
0
    vx1 += vmagic_bias;
9831
0
    vx2 += vmagic_bias;
9832
0
    vx3 += vmagic_bias;
9833
9834
0
    int32_t vy0 = (int32_t) float_as_uint32(vx0);
9835
0
    int32_t vy1 = (int32_t) float_as_uint32(vx1);
9836
0
    int32_t vy2 = (int32_t) float_as_uint32(vx2);
9837
0
    int32_t vy3 = (int32_t) float_as_uint32(vx3);
9838
9839
0
    vy0 = math_max_s32(vy0, vmagic_min);
9840
0
    vy1 = math_max_s32(vy1, vmagic_min);
9841
0
    vy2 = math_max_s32(vy2, vmagic_min);
9842
0
    vy3 = math_max_s32(vy3, vmagic_min);
9843
9844
0
    vy0 = math_min_s32(vy0, vmagic_max);
9845
0
    vy1 = math_min_s32(vy1, vmagic_max);
9846
0
    vy2 = math_min_s32(vy2, vmagic_max);
9847
0
    vy3 = math_min_s32(vy3, vmagic_max);
9848
9849
0
    vy0 -= vmagic_bias_less_zero_point;
9850
0
    vy1 -= vmagic_bias_less_zero_point;
9851
0
    vy2 -= vmagic_bias_less_zero_point;
9852
0
    vy3 -= vmagic_bias_less_zero_point;
9853
9854
0
    output[0] = (int8_t) vy0;
9855
0
    output[1] = (int8_t) vy1;
9856
0
    output[2] = (int8_t) vy2;
9857
0
    output[3] = (int8_t) vy3;
9858
0
    output += 4;
9859
0
  }
9860
0
  if XNN_UNLIKELY(batch != 0) {
9861
0
    do {
9862
0
      float vx = *input++;
9863
0
      vx *= vscale;
9864
0
      vx += vmagic_bias;
9865
9866
0
      int32_t vy = (int32_t) float_as_uint32(vx);
9867
0
      vy = math_max_s32(vy, vmagic_min);
9868
0
      vy = math_min_s32(vy, vmagic_max);
9869
0
      vy -= vmagic_bias_less_zero_point;
9870
9871
0
      *output++ = (int8_t) vy;
9872
9873
0
      batch -= sizeof(float);
9874
0
    } while (batch != 0);
9875
0
  }
9876
0
}
9877
9878
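// Alternative float -> int8 quantization path: clamp in the float domain
// against the output range shifted by the zero point, round with lrintf(),
// then add the zero point back before the final cast.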
void xnn_f32_qs8_vcvt_ukernel__scalar_lrintf_u4(
9879
    size_t batch,
9880
    const float* input,
9881
    int8_t* output,
9882
    const union xnn_f32_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
9883
0
{
9884
0
  assert(batch != 0);
9885
0
  assert(batch % sizeof(float) == 0);
9886
0
  assert(input != NULL);
9887
0
  assert(output != NULL);
9888
9889
0
  const float vscale = params->scalar_lrintf.scale;
9890
0
  const float voutput_min_less_zero_point = params->scalar_lrintf.output_min_less_zero_point;
9891
0
  const float voutput_max_less_zero_point = params->scalar_lrintf.output_max_less_zero_point;
9892
0
  const int32_t voutput_zero_point = params->scalar_lrintf.output_zero_point;
9893
9894
0
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
9895
0
    float vx0 = input[0];
9896
0
    float vx1 = input[1];
9897
0
    float vx2 = input[2];
9898
0
    float vx3 = input[3];
9899
0
    input += 4;
9900
9901
0
    vx0 *= vscale;
9902
0
    vx1 *= vscale;
9903
0
    vx2 *= vscale;
9904
0
    vx3 *= vscale;
9905
9906
0
    vx0 = math_max_f32(vx0, voutput_min_less_zero_point);
9907
0
    vx1 = math_max_f32(vx1, voutput_min_less_zero_point);
9908
0
    vx2 = math_max_f32(vx2, voutput_min_less_zero_point);
9909
0
    vx3 = math_max_f32(vx3, voutput_min_less_zero_point);
9910
9911
0
    vx0 = math_min_f32(vx0, voutput_max_less_zero_point);
9912
0
    vx1 = math_min_f32(vx1, voutput_max_less_zero_point);
9913
0
    vx2 = math_min_f32(vx2, voutput_max_less_zero_point);
9914
0
    vx3 = math_min_f32(vx3, voutput_max_less_zero_point);
9915
9916
0
    int32_t vy0 = (int32_t) lrintf(vx0);
9917
0
    int32_t vy1 = (int32_t) lrintf(vx1);
9918
0
    int32_t vy2 = (int32_t) lrintf(vx2);
9919
0
    int32_t vy3 = (int32_t) lrintf(vx3);
9920
9921
0
    vy0 += voutput_zero_point;
9922
0
    vy1 += voutput_zero_point;
9923
0
    vy2 += voutput_zero_point;
9924
0
    vy3 += voutput_zero_point;
9925
9926
0
    output[0] = (int8_t) vy0;
9927
0
    output[1] = (int8_t) vy1;
9928
0
    output[2] = (int8_t) vy2;
9929
0
    output[3] = (int8_t) vy3;
9930
0
    output += 4;
9931
0
  }
9932
0
  if XNN_UNLIKELY(batch != 0) {
9933
0
    do {
9934
0
      float vx = *input++;
9935
0
      vx *= vscale;
9936
0
      vx = math_max_f32(vx, voutput_min_less_zero_point);
9937
0
      vx = math_min_f32(vx, voutput_max_less_zero_point);
9938
9939
0
      int32_t vy = (int32_t) lrintf(vx);
9940
0
      vy += voutput_zero_point;
9941
9942
0
      *output++ = (int8_t) vy;
9943
9944
0
      batch -= sizeof(float);
9945
0
    } while (batch != 0);
9946
0
  }
9947
0
}
9948
9949
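// Unsigned (uint8) counterpart of the imagic quantization trick above, here
// processing a single element per iteration; aside from the final cast and the
// zero-point-derived constants, the arithmetic matches the int8 kernels.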
void xnn_f32_qu8_vcvt_ukernel__scalar_imagic_u1(
9950
    size_t batch,
9951
    const float* input,
9952
    uint8_t* output,
9953
    const union xnn_f32_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
9954
0
{
9955
0
  assert(batch != 0);
9956
0
  assert(batch % sizeof(float) == 0);
9957
0
  assert(input != NULL);
9958
0
  assert(output != NULL);
9959
9960
0
  const float vscale = params->scalar_imagic.scale;
9961
0
  const float vmagic_bias = params->scalar_imagic.magic_bias;
9962
0
  const int32_t vmagic_min = params->scalar_imagic.magic_min;
9963
0
  const int32_t vmagic_max = params->scalar_imagic.magic_max;
9964
0
  const int32_t vmagic_bias_less_zero_point = params->scalar_imagic.magic_bias_less_zero_point;
9965
9966
0
  do {
9967
0
    float vx = *input++;
9968
0
    vx *= vscale;
9969
0
    vx += vmagic_bias;
9970
9971
0
    int32_t vy = (int32_t) float_as_uint32(vx);
9972
0
    vy = math_max_s32(vy, vmagic_min);
9973
0
    vy = math_min_s32(vy, vmagic_max);
9974
0
    vy -= vmagic_bias_less_zero_point;
9975
9976
0
    *output++ = (uint8_t) vy;
9977
9978
0
    batch -= sizeof(float);
9979
0
  } while (batch != 0);
9980
0
}
9981
9982
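// 4-element-unrolled version of the uint8 imagic quantization kernel above,
// with a scalar remainder loop for batches that are not a multiple of 4 floats.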
void xnn_f32_qu8_vcvt_ukernel__scalar_imagic_u4(
9983
    size_t batch,
9984
    const float* input,
9985
    uint8_t* output,
9986
    const union xnn_f32_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
9987
0
{
9988
0
  assert(batch != 0);
9989
0
  assert(batch % sizeof(float) == 0);
9990
0
  assert(input != NULL);
9991
0
  assert(output != NULL);
9992
9993
0
  const float vscale = params->scalar_imagic.scale;
9994
0
  const float vmagic_bias = params->scalar_imagic.magic_bias;
9995
0
  const int32_t vmagic_min = params->scalar_imagic.magic_min;
9996
0
  const int32_t vmagic_max = params->scalar_imagic.magic_max;
9997
0
  const int32_t vmagic_bias_less_zero_point = params->scalar_imagic.magic_bias_less_zero_point;
9998
9999
0
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
10000
0
    float vx0 = input[0];
10001
0
    float vx1 = input[1];
10002
0
    float vx2 = input[2];
10003
0
    float vx3 = input[3];
10004
0
    input += 4;
10005
10006
0
    vx0 *= vscale;
10007
0
    vx1 *= vscale;
10008
0
    vx2 *= vscale;
10009
0
    vx3 *= vscale;
10010
10011
0
    vx0 += vmagic_bias;
10012
0
    vx1 += vmagic_bias;
10013
0
    vx2 += vmagic_bias;
10014
0
    vx3 += vmagic_bias;
10015
10016
0
    int32_t vy0 = (int32_t) float_as_uint32(vx0);
10017
0
    int32_t vy1 = (int32_t) float_as_uint32(vx1);
10018
0
    int32_t vy2 = (int32_t) float_as_uint32(vx2);
10019
0
    int32_t vy3 = (int32_t) float_as_uint32(vx3);
10020
10021
0
    vy0 = math_max_s32(vy0, vmagic_min);
10022
0
    vy1 = math_max_s32(vy1, vmagic_min);
10023
0
    vy2 = math_max_s32(vy2, vmagic_min);
10024
0
    vy3 = math_max_s32(vy3, vmagic_min);
10025
10026
0
    vy0 = math_min_s32(vy0, vmagic_max);
10027
0
    vy1 = math_min_s32(vy1, vmagic_max);
10028
0
    vy2 = math_min_s32(vy2, vmagic_max);
10029
0
    vy3 = math_min_s32(vy3, vmagic_max);
10030
10031
0
    vy0 -= vmagic_bias_less_zero_point;
10032
0
    vy1 -= vmagic_bias_less_zero_point;
10033
0
    vy2 -= vmagic_bias_less_zero_point;
10034
0
    vy3 -= vmagic_bias_less_zero_point;
10035
10036
0
    output[0] = (uint8_t) vy0;
10037
0
    output[1] = (uint8_t) vy1;
10038
0
    output[2] = (uint8_t) vy2;
10039
0
    output[3] = (uint8_t) vy3;
10040
0
    output += 4;
10041
0
  }
10042
0
  if XNN_UNLIKELY(batch != 0) {
10043
0
    do {
10044
0
      float vx = *input++;
10045
0
      vx *= vscale;
10046
0
      vx += vmagic_bias;
10047
10048
0
      int32_t vy = (int32_t) float_as_uint32(vx);
10049
0
      vy = math_max_s32(vy, vmagic_min);
10050
0
      vy = math_min_s32(vy, vmagic_max);
10051
0
      vy -= vmagic_bias_less_zero_point;
10052
10053
0
      *output++ = (uint8_t) vy;
10054
10055
0
      batch -= sizeof(float);
10056
0
    } while (batch != 0);
10057
0
  }
10058
0
}
10059
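/*
 * Illustrative sketch, not part of the generated XNNPACK source: the "imagic"
 * (integer magic bias) rounding trick used by the xnn_f32_qu8_vcvt_ukernel__scalar_imagic
 * kernels above. Adding 0x1.8p+23 to a float of magnitude below 2**22 lands it
 * in the exponent window where one ulp equals 1.0, so the FP addition itself
 * rounds to nearest-even and the rounded integer sits in the low mantissa bits;
 * the quantized value is then clamped and re-biased entirely in the integer
 * domain. The constants below are recomputed here for illustration only; the
 * real kernels receive scale, magic_bias, magic_min, magic_max and
 * magic_bias_less_zero_point through the params argument.
 */
#include <stdint.h>
#include <string.h>

static inline uint32_t sketch_float_as_uint32(float f) {
  uint32_t u;
  memcpy(&u, &f, sizeof(u));  // bit-cast without violating strict aliasing
  return u;
}

static inline uint8_t sketch_f32_to_qu8_imagic(float x, float scale, int32_t zero_point) {
  const float magic_bias = 12582912.0f;  // 0x1.8p+23
  // Bit pattern of the magic bias minus the zero point; subtracted at the end.
  const int32_t magic_bias_less_zero_point =
      (int32_t) sketch_float_as_uint32(magic_bias) - zero_point;
  // Clamping bounds for the full uint8 range, expressed in the biased integer domain.
  const int32_t magic_min = (int32_t) sketch_float_as_uint32(magic_bias + (float) (0 - zero_point));
  const int32_t magic_max = (int32_t) sketch_float_as_uint32(magic_bias + (float) (255 - zero_point));

  float v = x * scale + magic_bias;                   // rounds (x * scale) to nearest-even
  int32_t y = (int32_t) sketch_float_as_uint32(v);    // biased integer: bits(magic_bias) + round(x * scale)
  y = y < magic_min ? magic_min : y;                  // clamp in the integer domain
  y = y > magic_max ? magic_max : y;
  return (uint8_t) (y - magic_bias_less_zero_point);  // remove the bias, add the zero point
}
/*
 * Compared with the _lrintf variants, this avoids a float-to-int conversion at
 * the cost of a bit-level reinterpretation, which can be the cheaper option on
 * scalar targets without a fast float-to-int instruction.
 */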
10060
void xnn_f32_qu8_vcvt_ukernel__scalar_lrintf_u4(
10061
    size_t batch,
10062
    const float* input,
10063
    uint8_t* output,
10064
    const union xnn_f32_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
10065
0
{
10066
0
  assert(batch != 0);
10067
0
  assert(batch % sizeof(float) == 0);
10068
0
  assert(input != NULL);
10069
0
  assert(output != NULL);
10070
10071
0
  const float vscale = params->scalar_lrintf.scale;
10072
0
  const float voutput_min_less_zero_point = params->scalar_lrintf.output_min_less_zero_point;
10073
0
  const float voutput_max_less_zero_point = params->scalar_lrintf.output_max_less_zero_point;
10074
0
  const int32_t voutput_zero_point = params->scalar_lrintf.output_zero_point;
10075
10076
0
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
10077
0
    float vx0 = input[0];
10078
0
    float vx1 = input[1];
10079
0
    float vx2 = input[2];
10080
0
    float vx3 = input[3];
10081
0
    input += 4;
10082
10083
0
    vx0 *= vscale;
10084
0
    vx1 *= vscale;
10085
0
    vx2 *= vscale;
10086
0
    vx3 *= vscale;
10087
10088
0
    vx0 = math_max_f32(vx0, voutput_min_less_zero_point);
10089
0
    vx1 = math_max_f32(vx1, voutput_min_less_zero_point);
10090
0
    vx2 = math_max_f32(vx2, voutput_min_less_zero_point);
10091
0
    vx3 = math_max_f32(vx3, voutput_min_less_zero_point);
10092
10093
0
    vx0 = math_min_f32(vx0, voutput_max_less_zero_point);
10094
0
    vx1 = math_min_f32(vx1, voutput_max_less_zero_point);
10095
0
    vx2 = math_min_f32(vx2, voutput_max_less_zero_point);
10096
0
    vx3 = math_min_f32(vx3, voutput_max_less_zero_point);
10097
10098
0
    int32_t vy0 = (int32_t) lrintf(vx0);
10099
0
    int32_t vy1 = (int32_t) lrintf(vx1);
10100
0
    int32_t vy2 = (int32_t) lrintf(vx2);
10101
0
    int32_t vy3 = (int32_t) lrintf(vx3);
10102
10103
0
    vy0 += voutput_zero_point;
10104
0
    vy1 += voutput_zero_point;
10105
0
    vy2 += voutput_zero_point;
10106
0
    vy3 += voutput_zero_point;
10107
10108
0
    output[0] = (uint8_t) vy0;
10109
0
    output[1] = (uint8_t) vy1;
10110
0
    output[2] = (uint8_t) vy2;
10111
0
    output[3] = (uint8_t) vy3;
10112
0
    output += 4;
10113
0
  }
10114
0
  if XNN_UNLIKELY(batch != 0) {
10115
0
    do {
10116
0
      float vx = *input++;
10117
0
      vx *= vscale;
10118
0
      vx = math_max_f32(vx, voutput_min_less_zero_point);
10119
0
      vx = math_min_f32(vx, voutput_max_less_zero_point);
10120
10121
0
      int32_t vy = (int32_t) lrintf(vx);
10122
0
      vy += voutput_zero_point;
10123
10124
0
      *output++ = (uint8_t) vy;
10125
10126
0
      batch -= sizeof(float);
10127
0
    } while (batch != 0);
10128
0
  }
10129
0
}
10130
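/*
 * Illustrative sketch, not part of the generated XNNPACK source: the _lrintf
 * variants above implement the plain affine quantization
 * q = clamp(round(x * scale) + zero_point, qmin, qmax), with the clamp applied
 * in floating point relative to the zero point before rounding. One-element
 * reference assuming the full uint8 range [0, 255] and that the scale
 * parameter is the reciprocal of the quantization step; the kernels take the
 * scale and the (possibly tighter) bounds from the params argument.
 */
#include <math.h>
#include <stdint.h>

static inline uint8_t sketch_f32_to_qu8_lrintf(float x, float scale, int32_t zero_point) {
  float v = x * scale;
  // Bounds relative to the zero point, mirroring output_min_less_zero_point /
  // output_max_less_zero_point in the kernel parameters.
  const float lo = (float) (0 - zero_point);
  const float hi = (float) (255 - zero_point);
  v = v < lo ? lo : v;
  v = v > hi ? hi : v;
  // lrintf rounds to nearest (ties to even under the default rounding mode).
  return (uint8_t) ((int32_t) lrintf(v) + zero_point);
}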
10131
void xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_u4_acc2(
10132
    size_t batch,
10133
    const float* input,
10134
    const float* max,
10135
    float* output,
10136
    float* sum,
10137
    const union xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)])
10138
0
{
10139
0
  assert(batch != 0);
10140
0
  assert(batch % sizeof(float) == 0);
10141
0
  assert(input != NULL);
10142
0
  assert(max != NULL);
10143
0
  assert(output != NULL);
10144
0
  assert(sum != NULL);
10145
10146
0
  const float vi_max = *max;
10147
0
  const float vlog2e = params->scalar_rr2_p5.log2e;
10148
0
  const float vmagic_bias = params->scalar_rr2_p5.magic_bias;
10149
0
  const float vminus_ln2_hi = params->scalar_rr2_p5.minus_ln2_hi;
10150
0
  const float vminus_ln2_lo = params->scalar_rr2_p5.minus_ln2_lo;
10151
0
  const float vc5 = params->scalar_rr2_p5.c5;
10152
0
  const float vc4 = params->scalar_rr2_p5.c4;
10153
0
  const float vc3 = params->scalar_rr2_p5.c3;
10154
0
  const float vc2 = params->scalar_rr2_p5.c2;
10155
0
  const float vc1 = params->scalar_rr2_p5.c1;
10156
0
  const float vdenorm_cutoff = params->scalar_rr2_p5.denorm_cutoff;
10157
10158
0
  float vacc0 = 0.0f;
10159
0
  float vacc1 = 0.0f;
10160
0
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
10161
    // Load 4 inputs at a time.
10162
0
    const float vi0 = input[0];
10163
0
    const float vi1 = input[1];
10164
0
    const float vi2 = input[2];
10165
0
    const float vi3 = input[3];
10166
0
    input += 4;
10167
10168
    // Subtract maximum input x := i - i_max. This implies x <= 0.
10169
0
    const float vx0 = vi0 - vi_max;
10170
0
    const float vx1 = vi1 - vi_max;
10171
0
    const float vx2 = vi2 - vi_max;
10172
0
    const float vx3 = vi3 - vi_max;
10173
10174
    // Compute reduced argument n := round(x / log(2)).
10175
    // We do it by adding a large number (magic bias) to the product x * (1/log(2)), which causes rounding of the result
10176
    // to an integer, then subtracting the large number back. The trick of adding a large number is valid only within
10177
    // certain bounds (|x| <= 2**22), but that's ok, because inputs outside of [-87.336540, 0.0] underflow expf(x)
10178
    // anyway. We fix up the result for such inputs at the very end of the algorithm.
10179
0
    float vn0 = vx0 * vlog2e + vmagic_bias;
10180
0
    float vn1 = vx1 * vlog2e + vmagic_bias;
10181
0
    float vn2 = vx2 * vlog2e + vmagic_bias;
10182
0
    float vn3 = vx3 * vlog2e + vmagic_bias;
10183
10184
    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
10185
    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
10186
0
    const float vs0 = uint32_as_float(float_as_uint32(vn0) << 23);
10187
0
    const float vs1 = uint32_as_float(float_as_uint32(vn1) << 23);
10188
0
    const float vs2 = uint32_as_float(float_as_uint32(vn2) << 23);
10189
0
    const float vs3 = uint32_as_float(float_as_uint32(vn3) << 23);
10190
10191
    // Subtract the large number back to get final n := round(x / log(2)).
10192
0
    vn0 -= vmagic_bias;
10193
0
    vn1 -= vmagic_bias;
10194
0
    vn2 -= vmagic_bias;
10195
0
    vn3 -= vmagic_bias;
10196
10197
    // Compute reduced argument t := x - n * log(2).
10198
    // Use the Cody-Waite range reduction method (note the two constants used to represent log(2)) to improve accuracy.
10199
0
    float vt0 = vn0 * vminus_ln2_hi + vx0;
10200
0
    float vt1 = vn1 * vminus_ln2_hi + vx1;
10201
0
    float vt2 = vn2 * vminus_ln2_hi + vx2;
10202
0
    float vt3 = vn3 * vminus_ln2_hi + vx3;
10203
10204
0
    vt0 = vn0 * vminus_ln2_lo + vt0;
10205
0
    vt1 = vn1 * vminus_ln2_lo + vt1;
10206
0
    vt2 = vn2 * vminus_ln2_lo + vt2;
10207
0
    vt3 = vn3 * vminus_ln2_lo + vt3;
10208
10209
    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
10210
0
    float vp0 = vc5 * vt0 + vc4;
10211
0
    float vp1 = vc5 * vt1 + vc4;
10212
0
    float vp2 = vc5 * vt2 + vc4;
10213
0
    float vp3 = vc5 * vt3 + vc4;
10214
10215
0
    vp0 = vp0 * vt0 + vc3;
10216
0
    vp1 = vp1 * vt1 + vc3;
10217
0
    vp2 = vp2 * vt2 + vc3;
10218
0
    vp3 = vp3 * vt3 + vc3;
10219
10220
0
    vp0 = vp0 * vt0 + vc2;
10221
0
    vp1 = vp1 * vt1 + vc2;
10222
0
    vp2 = vp2 * vt2 + vc2;
10223
0
    vp3 = vp3 * vt3 + vc2;
10224
10225
0
    vp0 = vp0 * vt0 + vc1;
10226
0
    vp1 = vp1 * vt1 + vc1;
10227
0
    vp2 = vp2 * vt2 + vc1;
10228
0
    vp3 = vp3 * vt3 + vc1;
10229
10230
    // Reconstruct the final f value:
10231
    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
10232
    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
10233
    //     = s + (t * s) * p
10234
0
    vt0 *= vs0;
10235
0
    vt1 *= vs1;
10236
0
    vt2 *= vs2;
10237
0
    vt3 *= vs3;
10238
10239
0
    float vf0 = vt0 * vp0 + vs0;
10240
0
    float vf1 = vt1 * vp1 + vs1;
10241
0
    float vf2 = vt2 * vp2 + vs2;
10242
0
    float vf3 = vt3 * vp3 + vs3;
10243
10244
    // For inputs below denormal cutoff, replace output with +0.0f.
10245
    // Note that for NaN inputs the comparison result is false, so the outputs are left unchanged.
10246
0
    if XNN_UNPREDICTABLE(vx0 < vdenorm_cutoff) {
10247
0
      vf0 = 0.0f;
10248
0
    }
10249
0
    if XNN_UNPREDICTABLE(vx1 < vdenorm_cutoff) {
10250
0
      vf1 = 0.0f;
10251
0
    }
10252
0
    if XNN_UNPREDICTABLE(vx2 < vdenorm_cutoff) {
10253
0
      vf2 = 0.0f;
10254
0
    }
10255
0
    if XNN_UNPREDICTABLE(vx3 < vdenorm_cutoff) {
10256
0
      vf3 = 0.0f;
10257
0
    }
10258
10259
    // Store 4 outputs at a time.
10260
0
    output[0] = vf0;
10261
0
    output[1] = vf1;
10262
0
    output[2] = vf2;
10263
0
    output[3] = vf3;
10264
0
    output += 4;
10265
10266
    // Accumulate computed exponents.
10267
0
    vacc0 += vf0;
10268
0
    vacc1 += vf1;
10269
0
    vacc0 += vf2;
10270
0
    vacc1 += vf3;
10271
0
  }
10272
  // Add up all accumulators to vacc0
10273
0
  vacc0 += vacc1;
10274
10275
0
  float vacc = vacc0;
10276
0
  for (; batch >= sizeof(float); batch -= sizeof(float)) {
10277
    // Load 1 input at a time.
10278
0
    const float vi = *input++;
10279
10280
    // Subtract maximum input x := i - i_max. This implies x <= 0.
10281
0
    const float vx = vi - vi_max;
10282
10283
    // Compute reduced argument n := round(x / log(2)).
10284
    // We do it by adding a large number (magic bias) to the product x * (1/log(2)), which causes rounding of the result
10285
    // to an integer, then subtracting the large number back. The trick of adding a large number is valid only within
10286
    // certain bounds (|x| <= 2**22), but that's ok, because inputs outside of [-87.336540, 0.0] underflow expf(x)
10287
    // anyway. We fix up the result for such inputs at the very end of the algorithm.
10288
0
    float vn = vx * vlog2e + vmagic_bias;
10289
10290
    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
10291
    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
10292
0
    const float vs = uint32_as_float(float_as_uint32(vn) << 23);
10293
10294
    // Subtract the large number back to get final n := round(x / log(2)).
10295
0
    vn -= vmagic_bias;
10296
10297
    // Compute reduced argument t := x - n * log(2).
10298
    // Use the Cody-Waite range reduction method (note the two constants used to represent log(2)) to improve accuracy.
10299
0
    float vt = vn * vminus_ln2_hi + vx;
10300
0
    vt = vn * vminus_ln2_lo + vt;
10301
10302
    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
10303
0
    float vp = vc5 * vt + vc4;
10304
0
    vp = vp * vt + vc3;
10305
0
    vp = vp * vt + vc2;
10306
0
    vp = vp * vt + vc1;
10307
10308
    // Reconstruct the final f value:
10309
    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
10310
    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
10311
    //     = s + (t * s) * p
10312
0
    vt *= vs;
10313
0
    float vf = vt * vp + vs;
10314
10315
    // For inputs below denormal cutoff, replace output with +0.0f.
10316
    // Note that for NaN inputs the comparison result is false, so the outputs are left unchanged.
10317
0
    if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) {
10318
0
      vf = 0.0f;
10319
0
    }
10320
10321
    // Store 1 output at a time.
10322
0
    *output++ = vf;
10323
10324
    // Accumulate computed exponents.
10325
0
    vacc += vf;
10326
0
  }
10327
0
  *sum = vacc;
10328
0
}
10329
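/*
 * Illustrative sketch, not part of the generated XNNPACK source: a condensed
 * single-element version of the exp(x) approximation the kernel above applies
 * to x = i - i_max <= 0 (magic-bias rounding to n = round(x / ln 2),
 * reconstruction of 2**n through the exponent field, Cody-Waite range
 * reduction, degree-5 polynomial). The polynomial below uses plain Taylor
 * coefficients, whereas the kernel reads minimax coefficients and every other
 * constant from its params argument; the magic bias 0x1.8000FEp+23f is the
 * usual choice for this construction, its low mantissa bits holding the IEEE
 * exponent bias 127 so that shifting the biased value left by 23 yields 2**n.
 */
#include <stdint.h>
#include <string.h>

static inline uint32_t sketch_bits_of_f32(float f) { uint32_t u; memcpy(&u, &f, sizeof(u)); return u; }
static inline float sketch_f32_of_bits(uint32_t u) { float f; memcpy(&f, &u, sizeof(f)); return f; }

static inline float sketch_expf_nonpositive(float x) {  // expects -87.34 <= x <= 0
  // n := round(x / ln 2), rounded by the FP adder via the magic bias.
  float vn = x * 0x1.715476p+0f + 0x1.8000FEp+23f;
  // s := 2**n, built by shifting the still-biased integer into the exponent field.
  const float vs = sketch_f32_of_bits(sketch_bits_of_f32(vn) << 23);
  vn -= 0x1.8000FEp+23f;
  // t := x - n * ln 2, with ln 2 split into hi and lo parts (Cody-Waite).
  float vt = vn * -0x1.62E400p-1f + x;
  vt = vn * -0x1.7F7D1Cp-20f + vt;
  // Degree-5 polynomial for exp(t) on [-ln(2)/2, ln(2)/2]; plain Taylor terms here.
  float vp = (1.0f / 120.0f) * vt + (1.0f / 24.0f);
  vp = vp * vt + (1.0f / 6.0f);
  vp = vp * vt + 0.5f;
  vp = vp * vt + 1.0f;
  // exp(x) = s * (1 + t * p) = s + (t * s) * p.
  vt *= vs;
  float vf = vt * vp + vs;
  // Flush outputs for inputs below the underflow threshold to +0.0f.
  if (x < -0x1.5D589Ep+6f) {
    vf = 0.0f;
  }
  return vf;
}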
10330
void xnn_f32_rmax_ukernel__scalar(
10331
    size_t batch,
10332
    const float* input,
10333
    float* output,
10334
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
10335
0
{
10336
0
  assert(batch != 0);
10337
0
  assert(batch % sizeof(float) == 0);
10338
0
  assert(input != NULL);
10339
0
  assert(output != NULL);
10340
10341
0
  float vmax0 = *input;
10342
0
  float vmax1 = vmax0;
10343
0
  float vmax2 = vmax0;
10344
0
  float vmax3 = vmax0;
10345
0
  for (; batch >= 16; batch -= 16) {
10346
0
    const float vx0 = input[0];
10347
0
    const float vx1 = input[1];
10348
0
    const float vx2 = input[2];
10349
0
    const float vx3 = input[3];
10350
0
    input += 4;
10351
10352
0
    vmax0 = math_max_f32(vx0, vmax0);
10353
0
    vmax1 = math_max_f32(vx1, vmax1);
10354
0
    vmax2 = math_max_f32(vx2, vmax2);
10355
0
    vmax3 = math_max_f32(vx3, vmax3);
10356
0
  }
10357
0
  const float vmax01 = math_max_f32(vmax0, vmax1);
10358
0
  const float vmax23 = math_max_f32(vmax2, vmax3);
10359
0
  float vmax = math_max_f32(vmax01, vmax23);
10360
0
  if XNN_UNLIKELY(batch != 0) {
10361
0
    do {
10362
0
      const float vx = *input++;
10363
0
      vmax = math_max_f32(vx, vmax);
10364
0
      batch -= 4;
10365
0
    } while (batch != 0);
10366
0
  }
10367
0
  *output = vmax;
10368
0
}
10369
10370
void xnn_f32_rminmax_ukernel__scalar_u4_acc4(
10371
    size_t batch,
10372
    const float* input,
10373
    float* output,
10374
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
10375
0
{
10376
0
  assert(batch != 0);
10377
0
  assert(batch % sizeof(float) == 0);
10378
0
  assert(input != NULL);
10379
0
  assert(output != NULL);
10380
10381
0
  float vmin0 = *input;
10382
0
  float vmax0 = *input;
10383
0
  float vmin1 = vmin0;
10384
0
  float vmax1 = vmax0;
10385
0
  float vmin2 = vmin0;
10386
0
  float vmax2 = vmax0;
10387
0
  float vmin3 = vmin0;
10388
0
  float vmax3 = vmax0;
10389
0
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
10390
0
    const float vt0 = input[0];
10391
0
    const float vt1 = input[1];
10392
0
    const float vt2 = input[2];
10393
0
    const float vt3 = input[3];
10394
0
    input += 4;
10395
10396
0
    vmin0 = math_min_f32(vmin0, vt0);
10397
0
    vmax0 = math_max_f32(vmax0, vt0);
10398
0
    vmin1 = math_min_f32(vmin1, vt1);
10399
0
    vmax1 = math_max_f32(vmax1, vt1);
10400
0
    vmin2 = math_min_f32(vmin2, vt2);
10401
0
    vmax2 = math_max_f32(vmax2, vt2);
10402
0
    vmin3 = math_min_f32(vmin3, vt3);
10403
0
    vmax3 = math_max_f32(vmax3, vt3);
10404
0
  }
10405
0
  vmin0 = math_min_f32(vmin0, vmin1);
10406
0
  vmax0 = math_max_f32(vmax0, vmax1);
10407
0
  vmin2 = math_min_f32(vmin2, vmin3);
10408
0
  vmax2 = math_max_f32(vmax2, vmax3);
10409
0
  vmin0 = math_min_f32(vmin0, vmin2);
10410
0
  vmax0 = math_max_f32(vmax0, vmax2);
10411
10412
0
  if XNN_UNLIKELY(batch != 0) {
10413
0
    do {
10414
0
      const float vt = *input++;
10415
0
      vmin0 = math_min_f32(vmin0, vt);
10416
0
      vmax0 = math_max_f32(vmax0, vt);
10417
0
      batch -= sizeof(float);
10418
0
    } while (batch != 0);
10419
0
  }
10420
0
  output[0] = vmin0;
10421
0
  output[1] = vmax0;
10422
0
}
10423
10424
void xnn_f32_rsum_ukernel__scalar_u4_acc4(
10425
    size_t batch,
10426
    const float* input,
10427
    float* output,
10428
    const union xnn_f32_scale_params params[restrict XNN_MIN_ELEMENTS(1)])
10429
0
{
10430
0
  assert(batch != 0);
10431
0
  assert(batch % sizeof(float) == 0);
10432
0
  assert(input != NULL);
10433
0
  assert(output != NULL);
10434
10435
0
  float vacc0 = 0.0f;
10436
0
  float vacc1 = 0.0f;
10437
0
  float vacc2 = 0.0f;
10438
0
  float vacc3 = 0.0f;
10439
0
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
10440
0
    const float vt0 = input[0];
10441
0
    const float vt1 = input[1];
10442
0
    const float vt2 = input[2];
10443
0
    const float vt3 = input[3];
10444
0
    input += 4;
10445
10446
0
    vacc0 += vt0;
10447
0
    vacc1 += vt1;
10448
0
    vacc2 += vt2;
10449
0
    vacc3 += vt3;
10450
0
  }
10451
0
  vacc0 += vacc1;
10452
0
  vacc2 += vacc3;
10453
0
  vacc0 += vacc2;
10454
10455
0
  if XNN_UNLIKELY(batch != 0) {
10456
0
    do {
10457
0
      const float vt = *input++;
10458
0
      vacc0 += vt;
10459
0
      batch -= sizeof(float);
10460
0
    } while (batch != 0);
10461
0
  }
10462
0
  const float vscale = params->scalar.scale;
10463
0
  vacc0 *= vscale;
10464
0
  *output = vacc0;
10465
0
}
10466
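/*
 * Illustrative sketch, not part of the generated XNNPACK source: the _u4_acc4
 * reduction kernels above unroll by four elements and keep four independent
 * accumulators so consecutive adds (or min/max updates) do not form one long
 * dependency chain; the accumulators are folded pairwise before the scalar
 * remainder loop. The same pattern for a plain sum:
 */
#include <stddef.h>

static float sketch_sum_f32_u4_acc4(const float* x, size_t n) {
  float acc0 = 0.0f;
  float acc1 = 0.0f;
  float acc2 = 0.0f;
  float acc3 = 0.0f;
  size_t i = 0;
  for (; i + 4 <= n; i += 4) {  // main loop: four independent accumulation chains
    acc0 += x[i + 0];
    acc1 += x[i + 1];
    acc2 += x[i + 2];
    acc3 += x[i + 3];
  }
  acc0 += acc1;                 // pairwise combine, as in the kernels above
  acc2 += acc3;
  acc0 += acc2;
  for (; i < n; i++) {          // remainder elements fold into the first accumulator
    acc0 += x[i];
  }
  return acc0;
}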
10467
void xnn_f32_spmm_minmax_ukernel_8x1__scalar(
10468
    size_t mc,
10469
    size_t nc,
10470
    const float* input,
10471
    const float* weights,
10472
    const int32_t* widx_dmap,
10473
    const uint32_t* nidx_nnzmap,
10474
    float* output,
10475
    size_t output_stride,
10476
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
10477
0
{
10478
0
  assert(mc != 0);
10479
0
  assert(mc % sizeof(float) == 0);
10480
0
  assert(nc != 0);
10481
10482
0
  const float vmin = params->scalar.min;
10483
0
  const float vmax = params->scalar.max;
10484
0
  size_t output_decrement = output_stride * nc - 8 * sizeof(float);
10485
0
  while (mc >= 8 * sizeof(float)) {
10486
0
    const float* w = weights;
10487
0
    const int32_t* dmap = widx_dmap;
10488
0
    const uint32_t* nnzmap = nidx_nnzmap;
10489
0
    size_t n = nc;
10490
0
    while (n >= 1) {
10491
0
      uint32_t nnz = *nnzmap++;
10492
0
      float vacc0x0 = *w++;
10493
0
      float vacc1x0 = vacc0x0;
10494
0
      float vacc2x0 = vacc0x0;
10495
0
      float vacc3x0 = vacc0x0;
10496
0
      float vacc4x0 = vacc0x0;
10497
0
      float vacc5x0 = vacc0x0;
10498
0
      float vacc6x0 = vacc0x0;
10499
0
      float vacc7x0 = vacc0x0;
10500
0
      if XNN_LIKELY(nnz != 0) {
10501
0
        do {
10502
0
          const intptr_t diff = *dmap++;
10503
0
          const float vi0 = input[0];
10504
0
          const float vi1 = input[1];
10505
0
          const float vi2 = input[2];
10506
0
          const float vi3 = input[3];
10507
0
          const float vi4 = input[4];
10508
0
          const float vi5 = input[5];
10509
0
          const float vi6 = input[6];
10510
0
          const float vi7 = input[7];
10511
0
          input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
10512
0
          const float vw0 = *w++;
10513
0
          vacc0x0 += vi0 * vw0;
10514
0
          vacc1x0 += vi1 * vw0;
10515
0
          vacc2x0 += vi2 * vw0;
10516
0
          vacc3x0 += vi3 * vw0;
10517
0
          vacc4x0 += vi4 * vw0;
10518
0
          vacc5x0 += vi5 * vw0;
10519
0
          vacc6x0 += vi6 * vw0;
10520
0
          vacc7x0 += vi7 * vw0;
10521
0
        } while (--nnz != 0);
10522
0
      }
10523
0
      float vout0x0 = math_min_f32(vacc0x0, vmax);
10524
0
      float vout1x0 = math_min_f32(vacc1x0, vmax);
10525
0
      float vout2x0 = math_min_f32(vacc2x0, vmax);
10526
0
      float vout3x0 = math_min_f32(vacc3x0, vmax);
10527
0
      float vout4x0 = math_min_f32(vacc4x0, vmax);
10528
0
      float vout5x0 = math_min_f32(vacc5x0, vmax);
10529
0
      float vout6x0 = math_min_f32(vacc6x0, vmax);
10530
0
      float vout7x0 = math_min_f32(vacc7x0, vmax);
10531
0
      vout0x0 = math_max_f32(vout0x0, vmin);
10532
0
      vout1x0 = math_max_f32(vout1x0, vmin);
10533
0
      vout2x0 = math_max_f32(vout2x0, vmin);
10534
0
      vout3x0 = math_max_f32(vout3x0, vmin);
10535
0
      vout4x0 = math_max_f32(vout4x0, vmin);
10536
0
      vout5x0 = math_max_f32(vout5x0, vmin);
10537
0
      vout6x0 = math_max_f32(vout6x0, vmin);
10538
0
      vout7x0 = math_max_f32(vout7x0, vmin);
10539
0
      output[0] = vout0x0;
10540
0
      output[1] = vout1x0;
10541
0
      output[2] = vout2x0;
10542
0
      output[3] = vout3x0;
10543
0
      output[4] = vout4x0;
10544
0
      output[5] = vout5x0;
10545
0
      output[6] = vout6x0;
10546
0
      output[7] = vout7x0;
10547
0
      output[0] = vout0x0;
10548
0
      output[1] = vout1x0;
10549
0
      output[2] = vout2x0;
10550
0
      output[3] = vout3x0;
10551
0
      output[4] = vout4x0;
10552
0
      output[5] = vout5x0;
10553
0
      output[6] = vout6x0;
10554
0
      output[7] = vout7x0;
10555
0
      output = (float*restrict) ((uintptr_t) output + output_stride);
10556
0
      n -= 1;
10557
0
    }
10558
0
    if XNN_UNLIKELY(n != 0) {
10559
0
      do {
10560
0
        uint32_t nnz = *nnzmap++;
10561
0
        float vacc0 = *w++;
10562
0
        float vacc1 = vacc0;
10563
0
        float vacc2 = vacc0;
10564
0
        float vacc3 = vacc0;
10565
0
        float vacc4 = vacc0;
10566
0
        float vacc5 = vacc0;
10567
0
        float vacc6 = vacc0;
10568
0
        float vacc7 = vacc0;
10569
0
        if XNN_LIKELY(nnz != 0) {
10570
0
          do {
10571
0
            const intptr_t diff = *dmap++;
10572
0
            const float vi0 = input[0];
10573
0
            const float vi1 = input[1];
10574
0
            const float vi2 = input[2];
10575
0
            const float vi3 = input[3];
10576
0
            const float vi4 = input[4];
10577
0
            const float vi5 = input[5];
10578
0
            const float vi6 = input[6];
10579
0
            const float vi7 = input[7];
10580
0
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
10581
0
            const float vw = *w++;
10582
0
            vacc0 += vi0 * vw;
10583
0
            vacc1 += vi1 * vw;
10584
0
            vacc2 += vi2 * vw;
10585
0
            vacc3 += vi3 * vw;
10586
0
            vacc4 += vi4 * vw;
10587
0
            vacc5 += vi5 * vw;
10588
0
            vacc6 += vi6 * vw;
10589
0
            vacc7 += vi7 * vw;
10590
0
          } while (--nnz != 0);
10591
0
        }
10592
0
        float vout0 = math_min_f32(vacc0, vmax);
10593
0
        float vout1 = math_min_f32(vacc1, vmax);
10594
0
        float vout2 = math_min_f32(vacc2, vmax);
10595
0
        float vout3 = math_min_f32(vacc3, vmax);
10596
0
        float vout4 = math_min_f32(vacc4, vmax);
10597
0
        float vout5 = math_min_f32(vacc5, vmax);
10598
0
        float vout6 = math_min_f32(vacc6, vmax);
10599
0
        float vout7 = math_min_f32(vacc7, vmax);
10600
0
        vout0 = math_max_f32(vout0, vmin);
10601
0
        vout1 = math_max_f32(vout1, vmin);
10602
0
        vout2 = math_max_f32(vout2, vmin);
10603
0
        vout3 = math_max_f32(vout3, vmin);
10604
0
        vout4 = math_max_f32(vout4, vmin);
10605
0
        vout5 = math_max_f32(vout5, vmin);
10606
0
        vout6 = math_max_f32(vout6, vmin);
10607
0
        vout7 = math_max_f32(vout7, vmin);
10608
0
        output[0] = vout0;
10609
0
        output[1] = vout1;
10610
0
        output[2] = vout2;
10611
0
        output[3] = vout3;
10612
0
        output[4] = vout4;
10613
0
        output[5] = vout5;
10614
0
        output[6] = vout6;
10615
0
        output[7] = vout7;
10616
0
        output = (float*restrict) ((uintptr_t) output + output_stride);
10617
0
        n -= 1;
10618
0
      } while (n != 0);
10619
0
    }
10620
0
    output = (float*restrict) ((uintptr_t) output - output_decrement);
10621
0
    input += 8;
10622
0
    mc -= 8 * sizeof(float);
10623
0
  }
10624
0
  if XNN_UNLIKELY(mc != 0) {
10625
0
    output_decrement += 4 * sizeof(float);
10626
0
    if (mc & (4 * sizeof(float))) {
10627
0
      const float* w = weights;
10628
0
      const int32_t* dmap = widx_dmap;
10629
0
      const uint32_t* nnzmap = nidx_nnzmap;
10630
0
      size_t n = nc;
10631
0
      while (n >= 1) {
10632
0
        uint32_t nnz = *nnzmap++;
10633
0
        float vacc0x0 = *w++;
10634
0
        float vacc1x0 = vacc0x0;
10635
0
        float vacc2x0 = vacc0x0;
10636
0
        float vacc3x0 = vacc0x0;
10637
0
        if XNN_LIKELY(nnz != 0) {
10638
0
          do {
10639
0
            const intptr_t diff = *dmap++;
10640
0
            const float vi0 = input[0];
10641
0
            const float vi1 = input[1];
10642
0
            const float vi2 = input[2];
10643
0
            const float vi3 = input[3];
10644
0
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
10645
0
            const float vw0 = *w++;
10646
0
            vacc0x0 += vi0 * vw0;
10647
0
            vacc1x0 += vi1 * vw0;
10648
0
            vacc2x0 += vi2 * vw0;
10649
0
            vacc3x0 += vi3 * vw0;
10650
0
          } while (--nnz != 0);
10651
0
        }
10652
0
        float vout0x0 = math_min_f32(vacc0x0, vmax);
10653
0
        float vout1x0 = math_min_f32(vacc1x0, vmax);
10654
0
        float vout2x0 = math_min_f32(vacc2x0, vmax);
10655
0
        float vout3x0 = math_min_f32(vacc3x0, vmax);
10656
0
        vout0x0 = math_max_f32(vout0x0, vmin);
10657
0
        vout1x0 = math_max_f32(vout1x0, vmin);
10658
0
        vout2x0 = math_max_f32(vout2x0, vmin);
10659
0
        vout3x0 = math_max_f32(vout3x0, vmin);
10660
0
        output[0] = vout0x0;
10661
0
        output[1] = vout1x0;
10662
0
        output[2] = vout2x0;
10663
0
        output[3] = vout3x0;
10664
0
        output = (float*restrict) ((uintptr_t) output + output_stride);
10665
0
        n -= 1;
10666
0
      }
10667
0
      if XNN_UNLIKELY(n != 0) {
10668
0
        do {
10669
0
          uint32_t nnz = *nnzmap++;
10670
0
          float vacc0 = *w++;
10671
0
          float vacc1 = vacc0;
10672
0
          float vacc2 = vacc0;
10673
0
          float vacc3 = vacc0;
10674
0
          if XNN_LIKELY(nnz != 0) {
10675
0
            do {
10676
0
              const intptr_t diff = *dmap++;
10677
0
              const float vi0 = input[0];
10678
0
              const float vi1 = input[1];
10679
0
              const float vi2 = input[2];
10680
0
              const float vi3 = input[3];
10681
0
              input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
10682
0
              const float vw = *w++;
10683
0
              vacc0 += vi0 * vw;
10684
0
              vacc1 += vi1 * vw;
10685
0
              vacc2 += vi2 * vw;
10686
0
              vacc3 += vi3 * vw;
10687
0
            } while (--nnz != 0);
10688
0
          }
10689
0
          float vout0 = math_min_f32(vacc0, vmax);
10690
0
          float vout1 = math_min_f32(vacc1, vmax);
10691
0
          float vout2 = math_min_f32(vacc2, vmax);
10692
0
          float vout3 = math_min_f32(vacc3, vmax);
10693
0
          vout0 = math_max_f32(vout0, vmin);
10694
0
          vout1 = math_max_f32(vout1, vmin);
10695
0
          vout2 = math_max_f32(vout2, vmin);
10696
0
          vout3 = math_max_f32(vout3, vmin);
10697
0
          output[0] = vout0;
10698
0
          output[1] = vout1;
10699
0
          output[2] = vout2;
10700
0
          output[3] = vout3;
10701
0
          output = (float*restrict) ((uintptr_t) output + output_stride);
10702
0
          n -= 1;
10703
0
        } while (n != 0);
10704
0
      }
10705
0
      output = (float*restrict) ((uintptr_t) output - output_decrement);
10706
0
      input += 4;
10707
0
    }
10708
0
    output_decrement += 2 * sizeof(float);
10709
0
    if (mc & (2 * sizeof(float))) {
10710
0
      const float* w = weights;
10711
0
      const int32_t* dmap = widx_dmap;
10712
0
      const uint32_t* nnzmap = nidx_nnzmap;
10713
0
      size_t n = nc;
10714
0
      while (n >= 1) {
10715
0
        uint32_t nnz = *nnzmap++;
10716
0
        float vacc0x0 = *w++;
10717
0
        float vacc1x0 = vacc0x0;
10718
0
        if XNN_LIKELY(nnz != 0) {
10719
0
          do {
10720
0
            const intptr_t diff = *dmap++;
10721
0
            const float vi0 = input[0];
10722
0
            const float vi1 = input[1];
10723
0
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
10724
0
            const float vw0 = *w++;
10725
0
            vacc0x0 += vi0 * vw0;
10726
0
            vacc1x0 += vi1 * vw0;
10727
0
          } while (--nnz != 0);
10728
0
        }
10729
0
        float vout0x0 = math_min_f32(vacc0x0, vmax);
10730
0
        float vout1x0 = math_min_f32(vacc1x0, vmax);
10731
0
        vout0x0 = math_max_f32(vout0x0, vmin);
10732
0
        vout1x0 = math_max_f32(vout1x0, vmin);
10733
0
        output[0] = vout0x0;
10734
0
        output[1] = vout1x0;
10735
0
        output = (float*restrict) ((uintptr_t) output + output_stride);
10736
0
        n -= 1;
10737
0
      }
10738
0
      if XNN_UNLIKELY(n != 0) {
10739
0
        do {
10740
0
          uint32_t nnz = *nnzmap++;
10741
0
          float vacc0 = *w++;
10742
0
          float vacc1 = vacc0;
10743
0
          if XNN_LIKELY(nnz != 0) {
10744
0
            do {
10745
0
              const intptr_t diff = *dmap++;
10746
0
              const float vi0 = input[0];
10747
0
              const float vi1 = input[1];
10748
0
              input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
10749
0
              const float vw = *w++;
10750
0
              vacc0 += vi0 * vw;
10751
0
              vacc1 += vi1 * vw;
10752
0
            } while (--nnz != 0);
10753
0
          }
10754
0
          float vout0 = math_min_f32(vacc0, vmax);
10755
0
          float vout1 = math_min_f32(vacc1, vmax);
10756
0
          vout0 = math_max_f32(vout0, vmin);
10757
0
          vout1 = math_max_f32(vout1, vmin);
10758
0
          output[0] = vout0;
10759
0
          output[1] = vout1;
10760
0
          output = (float*restrict) ((uintptr_t) output + output_stride);
10761
0
          n -= 1;
10762
0
        } while (n != 0);
10763
0
      }
10764
0
      output = (float*restrict) ((uintptr_t) output - output_decrement);
10765
0
      input += 2;
10766
0
    }
10767
0
    output_decrement += 1 * sizeof(float);
10768
0
    if (mc & (1 * sizeof(float))) {
10769
0
      const float* w = weights;
10770
0
      const int32_t* dmap = widx_dmap;
10771
0
      const uint32_t* nnzmap = nidx_nnzmap;
10772
0
      size_t n = nc;
10773
0
      while (n >= 1) {
10774
0
        uint32_t nnz = *nnzmap++;
10775
0
        float vacc0x0 = *w++;
10776
0
        if XNN_LIKELY(nnz != 0) {
10777
0
          do {
10778
0
            const intptr_t diff = *dmap++;
10779
0
            const float vi0 = input[0];
10780
0
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
10781
0
            const float vw0 = *w++;
10782
0
            vacc0x0 += vi0 * vw0;
10783
0
          } while (--nnz != 0);
10784
0
        }
10785
0
        float vout0x0 = math_min_f32(vacc0x0, vmax);
10786
0
        vout0x0 = math_max_f32(vout0x0, vmin);
10787
0
        output[0] = vout0x0;
10788
0
        output = (float*restrict) ((uintptr_t) output + output_stride);
10789
0
        n -= 1;
10790
0
      }
10791
0
      if XNN_UNLIKELY(n != 0) {
10792
0
        do {
10793
0
          uint32_t nnz = *nnzmap++;
10794
0
          float vacc0 = *w++;
10795
0
          if XNN_LIKELY(nnz != 0) {
10796
0
            do {
10797
0
              const intptr_t diff = *dmap++;
10798
0
              const float vi0 = input[0];
10799
0
              input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
10800
0
              const float vw = *w++;
10801
0
              vacc0 += vi0 * vw;
10802
0
            } while (--nnz != 0);
10803
0
          }
10804
0
          float vout0 = math_min_f32(vacc0, vmax);
10805
0
          vout0 = math_max_f32(vout0, vmin);
10806
0
          output[0] = vout0;
10807
0
          output = (float*restrict) ((uintptr_t) output + output_stride);
10808
0
          n -= 1;
10809
0
        } while (n != 0);
10810
0
      }
10811
0
      output = (float*restrict) ((uintptr_t) output - output_decrement);
10812
0
      input += 1;
10813
0
    }
10814
0
  }
10815
0
}
10816
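/*
 * Illustrative sketch, not part of the generated XNNPACK source: a single-pixel
 * version of the sparse traversal performed by xnn_f32_spmm_minmax_ukernel_8x1__scalar
 * above (and extended by the wider 8x2/8x4 variants that follow). Per output
 * channel the weights stream holds a bias followed by that channel's nonzero
 * weights, nidx_nnzmap holds the nonzero count per channel, and widx_dmap
 * holds, for each nonzero, the byte offset by which the input pointer advances
 * to reach the input element paired with the next nonzero weight (the packed
 * deltas also cover the jumps between channels; how they are generated is
 * outside this sketch). The MR=8 kernel applies the same walk to eight
 * adjacent pixels at once. Names here are illustrative, not XNNPACK API.
 */
#include <stddef.h>
#include <stdint.h>

static void sketch_spmm_minmax_1x1(
    size_t nc,                    // number of output channels
    const float* input,           // dense activations for one pixel
    const float* weights,         // per channel: [bias, nonzero weights...]
    const int32_t* widx_dmap,     // byte deltas applied to the input pointer
    const uint32_t* nidx_nnzmap,  // nonzeros per output channel
    float* output,                // one value per output channel (contiguous here)
    float output_min,
    float output_max)
{
  for (size_t i = 0; i < nc; i++) {
    uint32_t nnz = *nidx_nnzmap++;
    float vacc = *weights++;                     // start from the channel bias
    while (nnz != 0) {
      const intptr_t diff = (intptr_t) *widx_dmap++;
      const float vi = *input;                   // input paired with this nonzero
      input = (const float*) ((uintptr_t) input + (uintptr_t) diff);
      vacc += vi * (*weights++);
      nnz -= 1;
    }
    float vout = vacc < output_max ? vacc : output_max;  // clamp: min with max first, as in the kernels
    vout = vout > output_min ? vout : output_min;
    output[i] = vout;
  }
  // The real kernels write strided output (see output_stride) rather than a
  // contiguous vector; the contiguous store above is a simplification.
}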
10817
void xnn_f32_spmm_minmax_ukernel_8x2__scalar(
10818
    size_t mc,
10819
    size_t nc,
10820
    const float* input,
10821
    const float* weights,
10822
    const int32_t* widx_dmap,
10823
    const uint32_t* nidx_nnzmap,
10824
    float* output,
10825
    size_t output_stride,
10826
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
10827
0
{
10828
0
  assert(mc != 0);
10829
0
  assert(mc % sizeof(float) == 0);
10830
0
  assert(nc != 0);
10831
10832
0
  const float vmin = params->scalar.min;
10833
0
  const float vmax = params->scalar.max;
10834
0
  size_t output_decrement = output_stride * nc - 8 * sizeof(float);
10835
0
  while (mc >= 8 * sizeof(float)) {
10836
0
    const float* w = weights;
10837
0
    const int32_t* dmap = widx_dmap;
10838
0
    const uint32_t* nnzmap = nidx_nnzmap;
10839
0
    size_t n = nc;
10840
0
    while (n >= 2) {
10841
0
      uint32_t nnz = *nnzmap++;
10842
0
      float vacc0x0 = *w++;
10843
0
      float vacc0x1 = *w++;
10844
0
      float vacc1x0 = vacc0x0;
10845
0
      float vacc1x1 = vacc0x1;
10846
0
      float vacc2x0 = vacc0x0;
10847
0
      float vacc2x1 = vacc0x1;
10848
0
      float vacc3x0 = vacc0x0;
10849
0
      float vacc3x1 = vacc0x1;
10850
0
      float vacc4x0 = vacc0x0;
10851
0
      float vacc4x1 = vacc0x1;
10852
0
      float vacc5x0 = vacc0x0;
10853
0
      float vacc5x1 = vacc0x1;
10854
0
      float vacc6x0 = vacc0x0;
10855
0
      float vacc6x1 = vacc0x1;
10856
0
      float vacc7x0 = vacc0x0;
10857
0
      float vacc7x1 = vacc0x1;
10858
0
      if XNN_LIKELY(nnz != 0) {
10859
0
        do {
10860
0
          const intptr_t diff = *dmap++;
10861
0
          const float vi0 = input[0];
10862
0
          const float vi1 = input[1];
10863
0
          const float vi2 = input[2];
10864
0
          const float vi3 = input[3];
10865
0
          const float vi4 = input[4];
10866
0
          const float vi5 = input[5];
10867
0
          const float vi6 = input[6];
10868
0
          const float vi7 = input[7];
10869
0
          input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
10870
0
          const float vw0 = *w++;
10871
0
          const float vw1 = *w++;
10872
0
          vacc0x0 += vi0 * vw0;
10873
0
          vacc1x0 += vi1 * vw0;
10874
0
          vacc2x0 += vi2 * vw0;
10875
0
          vacc3x0 += vi3 * vw0;
10876
0
          vacc4x0 += vi4 * vw0;
10877
0
          vacc5x0 += vi5 * vw0;
10878
0
          vacc6x0 += vi6 * vw0;
10879
0
          vacc7x0 += vi7 * vw0;
10880
0
          vacc0x1 += vi0 * vw1;
10881
0
          vacc1x1 += vi1 * vw1;
10882
0
          vacc2x1 += vi2 * vw1;
10883
0
          vacc3x1 += vi3 * vw1;
10884
0
          vacc4x1 += vi4 * vw1;
10885
0
          vacc5x1 += vi5 * vw1;
10886
0
          vacc6x1 += vi6 * vw1;
10887
0
          vacc7x1 += vi7 * vw1;
10888
0
        } while (--nnz != 0);
10889
0
      }
10890
0
      float vout0x0 = math_min_f32(vacc0x0, vmax);
10891
0
      float vout1x0 = math_min_f32(vacc1x0, vmax);
10892
0
      float vout2x0 = math_min_f32(vacc2x0, vmax);
10893
0
      float vout3x0 = math_min_f32(vacc3x0, vmax);
10894
0
      float vout4x0 = math_min_f32(vacc4x0, vmax);
10895
0
      float vout5x0 = math_min_f32(vacc5x0, vmax);
10896
0
      float vout6x0 = math_min_f32(vacc6x0, vmax);
10897
0
      float vout7x0 = math_min_f32(vacc7x0, vmax);
10898
0
      float vout0x1 = math_min_f32(vacc0x1, vmax);
10899
0
      float vout1x1 = math_min_f32(vacc1x1, vmax);
10900
0
      float vout2x1 = math_min_f32(vacc2x1, vmax);
10901
0
      float vout3x1 = math_min_f32(vacc3x1, vmax);
10902
0
      float vout4x1 = math_min_f32(vacc4x1, vmax);
10903
0
      float vout5x1 = math_min_f32(vacc5x1, vmax);
10904
0
      float vout6x1 = math_min_f32(vacc6x1, vmax);
10905
0
      float vout7x1 = math_min_f32(vacc7x1, vmax);
10906
0
      vout0x0 = math_max_f32(vout0x0, vmin);
10907
0
      vout1x0 = math_max_f32(vout1x0, vmin);
10908
0
      vout2x0 = math_max_f32(vout2x0, vmin);
10909
0
      vout3x0 = math_max_f32(vout3x0, vmin);
10910
0
      vout4x0 = math_max_f32(vout4x0, vmin);
10911
0
      vout5x0 = math_max_f32(vout5x0, vmin);
10912
0
      vout6x0 = math_max_f32(vout6x0, vmin);
10913
0
      vout7x0 = math_max_f32(vout7x0, vmin);
10914
0
      vout0x1 = math_max_f32(vout0x1, vmin);
10915
0
      vout1x1 = math_max_f32(vout1x1, vmin);
10916
0
      vout2x1 = math_max_f32(vout2x1, vmin);
10917
0
      vout3x1 = math_max_f32(vout3x1, vmin);
10918
0
      vout4x1 = math_max_f32(vout4x1, vmin);
10919
0
      vout5x1 = math_max_f32(vout5x1, vmin);
10920
0
      vout6x1 = math_max_f32(vout6x1, vmin);
10921
0
      vout7x1 = math_max_f32(vout7x1, vmin);
10922
0
      output[0] = vout0x1;
10923
0
      output[1] = vout1x1;
10924
0
      output[2] = vout2x1;
10925
0
      output[3] = vout3x1;
10926
0
      output[4] = vout4x1;
10927
0
      output[5] = vout5x1;
10928
0
      output[6] = vout6x1;
10929
0
      output[7] = vout7x1;
10930
0
      output[0] = vout0x0;
10931
0
      output[1] = vout1x0;
10932
0
      output[2] = vout2x0;
10933
0
      output[3] = vout3x0;
10934
0
      output[4] = vout4x0;
10935
0
      output[5] = vout5x0;
10936
0
      output[6] = vout6x0;
10937
0
      output[7] = vout7x0;
10938
0
      output = (float*restrict) ((uintptr_t) output + output_stride);
10939
0
      output[0] = vout0x1;
10940
0
      output[1] = vout1x1;
10941
0
      output[2] = vout2x1;
10942
0
      output[3] = vout3x1;
10943
0
      output[4] = vout4x1;
10944
0
      output[5] = vout5x1;
10945
0
      output[6] = vout6x1;
10946
0
      output[7] = vout7x1;
10947
0
      output = (float*restrict) ((uintptr_t) output + output_stride);
10948
0
      n -= 2;
10949
0
    }
10950
0
    if XNN_UNLIKELY(n != 0) {
10951
0
      do {
10952
0
        uint32_t nnz = *nnzmap++;
10953
0
        float vacc0 = *w++;
10954
0
        float vacc1 = vacc0;
10955
0
        float vacc2 = vacc0;
10956
0
        float vacc3 = vacc0;
10957
0
        float vacc4 = vacc0;
10958
0
        float vacc5 = vacc0;
10959
0
        float vacc6 = vacc0;
10960
0
        float vacc7 = vacc0;
10961
0
        if XNN_LIKELY(nnz != 0) {
10962
0
          do {
10963
0
            const intptr_t diff = *dmap++;
10964
0
            const float vi0 = input[0];
10965
0
            const float vi1 = input[1];
10966
0
            const float vi2 = input[2];
10967
0
            const float vi3 = input[3];
10968
0
            const float vi4 = input[4];
10969
0
            const float vi5 = input[5];
10970
0
            const float vi6 = input[6];
10971
0
            const float vi7 = input[7];
10972
0
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
10973
0
            const float vw = *w++;
10974
0
            vacc0 += vi0 * vw;
10975
0
            vacc1 += vi1 * vw;
10976
0
            vacc2 += vi2 * vw;
10977
0
            vacc3 += vi3 * vw;
10978
0
            vacc4 += vi4 * vw;
10979
0
            vacc5 += vi5 * vw;
10980
0
            vacc6 += vi6 * vw;
10981
0
            vacc7 += vi7 * vw;
10982
0
          } while (--nnz != 0);
10983
0
        }
10984
0
        float vout0 = math_min_f32(vacc0, vmax);
10985
0
        float vout1 = math_min_f32(vacc1, vmax);
10986
0
        float vout2 = math_min_f32(vacc2, vmax);
10987
0
        float vout3 = math_min_f32(vacc3, vmax);
10988
0
        float vout4 = math_min_f32(vacc4, vmax);
10989
0
        float vout5 = math_min_f32(vacc5, vmax);
10990
0
        float vout6 = math_min_f32(vacc6, vmax);
10991
0
        float vout7 = math_min_f32(vacc7, vmax);
10992
0
        vout0 = math_max_f32(vout0, vmin);
10993
0
        vout1 = math_max_f32(vout1, vmin);
10994
0
        vout2 = math_max_f32(vout2, vmin);
10995
0
        vout3 = math_max_f32(vout3, vmin);
10996
0
        vout4 = math_max_f32(vout4, vmin);
10997
0
        vout5 = math_max_f32(vout5, vmin);
10998
0
        vout6 = math_max_f32(vout6, vmin);
10999
0
        vout7 = math_max_f32(vout7, vmin);
11000
0
        output[0] = vout0;
11001
0
        output[1] = vout1;
11002
0
        output[2] = vout2;
11003
0
        output[3] = vout3;
11004
0
        output[4] = vout4;
11005
0
        output[5] = vout5;
11006
0
        output[6] = vout6;
11007
0
        output[7] = vout7;
11008
0
        output = (float*restrict) ((uintptr_t) output + output_stride);
11009
0
        n -= 1;
11010
0
      } while (n != 0);
11011
0
    }
11012
0
    output = (float*restrict) ((uintptr_t) output - output_decrement);
11013
0
    input += 8;
11014
0
    mc -= 8 * sizeof(float);
11015
0
  }
11016
0
  if XNN_UNLIKELY(mc != 0) {
11017
0
    output_decrement += 4 * sizeof(float);
11018
0
    if (mc & (4 * sizeof(float))) {
11019
0
      const float* w = weights;
11020
0
      const int32_t* dmap = widx_dmap;
11021
0
      const uint32_t* nnzmap = nidx_nnzmap;
11022
0
      size_t n = nc;
11023
0
      while (n >= 2) {
11024
0
        uint32_t nnz = *nnzmap++;
11025
0
        float vacc0x0 = *w++;
11026
0
        float vacc0x1 = *w++;
11027
0
        float vacc1x0 = vacc0x0;
11028
0
        float vacc2x0 = vacc0x0;
11029
0
        float vacc3x0 = vacc0x0;
11030
0
        float vacc1x1 = vacc0x1;
11031
0
        float vacc2x1 = vacc0x1;
11032
0
        float vacc3x1 = vacc0x1;
11033
0
        if XNN_LIKELY(nnz != 0) {
11034
0
          do {
11035
0
            const intptr_t diff = *dmap++;
11036
0
            const float vi0 = input[0];
11037
0
            const float vi1 = input[1];
11038
0
            const float vi2 = input[2];
11039
0
            const float vi3 = input[3];
11040
0
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
11041
0
            const float vw0 = *w++;
11042
0
            const float vw1 = *w++;
11043
0
            vacc0x0 += vi0 * vw0;
11044
0
            vacc1x0 += vi1 * vw0;
11045
0
            vacc2x0 += vi2 * vw0;
11046
0
            vacc3x0 += vi3 * vw0;
11047
0
            vacc0x1 += vi0 * vw1;
11048
0
            vacc1x1 += vi1 * vw1;
11049
0
            vacc2x1 += vi2 * vw1;
11050
0
            vacc3x1 += vi3 * vw1;
11051
0
          } while (--nnz != 0);
11052
0
        }
11053
0
        float vout0x0 = math_min_f32(vacc0x0, vmax);
11054
0
        float vout1x0 = math_min_f32(vacc1x0, vmax);
11055
0
        float vout2x0 = math_min_f32(vacc2x0, vmax);
11056
0
        float vout3x0 = math_min_f32(vacc3x0, vmax);
11057
0
        float vout0x1 = math_min_f32(vacc0x1, vmax);
11058
0
        float vout1x1 = math_min_f32(vacc1x1, vmax);
11059
0
        float vout2x1 = math_min_f32(vacc2x1, vmax);
11060
0
        float vout3x1 = math_min_f32(vacc3x1, vmax);
11061
0
        vout0x0 = math_max_f32(vout0x0, vmin);
11062
0
        vout1x0 = math_max_f32(vout1x0, vmin);
11063
0
        vout2x0 = math_max_f32(vout2x0, vmin);
11064
0
        vout3x0 = math_max_f32(vout3x0, vmin);
11065
0
        vout0x1 = math_max_f32(vout0x1, vmin);
11066
0
        vout1x1 = math_max_f32(vout1x1, vmin);
11067
0
        vout2x1 = math_max_f32(vout2x1, vmin);
11068
0
        vout3x1 = math_max_f32(vout3x1, vmin);
11069
0
        output[0] = vout0x0;
11070
0
        output[1] = vout1x0;
11071
0
        output[2] = vout2x0;
11072
0
        output[3] = vout3x0;
11073
0
        output = (float*restrict) ((uintptr_t) output + output_stride);
11074
0
        output[0] = vout0x1;
11075
0
        output[1] = vout1x1;
11076
0
        output[2] = vout2x1;
11077
0
        output[3] = vout3x1;
11078
0
        output = (float*restrict) ((uintptr_t) output + output_stride);
11079
0
        n -= 2;
11080
0
      }
11081
0
      if XNN_UNLIKELY(n != 0) {
11082
0
        do {
11083
0
          uint32_t nnz = *nnzmap++;
11084
0
          float vacc0 = *w++;
11085
0
          float vacc1 = vacc0;
11086
0
          float vacc2 = vacc0;
11087
0
          float vacc3 = vacc0;
11088
0
          if XNN_LIKELY(nnz != 0) {
11089
0
            do {
11090
0
              const intptr_t diff = *dmap++;
11091
0
              const float vi0 = input[0];
11092
0
              const float vi1 = input[1];
11093
0
              const float vi2 = input[2];
11094
0
              const float vi3 = input[3];
11095
0
              input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
11096
0
              const float vw = *w++;
11097
0
              vacc0 += vi0 * vw;
11098
0
              vacc1 += vi1 * vw;
11099
0
              vacc2 += vi2 * vw;
11100
0
              vacc3 += vi3 * vw;
11101
0
            } while (--nnz != 0);
11102
0
          }
11103
0
          float vout0 = math_min_f32(vacc0, vmax);
11104
0
          float vout1 = math_min_f32(vacc1, vmax);
11105
0
          float vout2 = math_min_f32(vacc2, vmax);
11106
0
          float vout3 = math_min_f32(vacc3, vmax);
11107
0
          vout0 = math_max_f32(vout0, vmin);
11108
0
          vout1 = math_max_f32(vout1, vmin);
11109
0
          vout2 = math_max_f32(vout2, vmin);
11110
0
          vout3 = math_max_f32(vout3, vmin);
11111
0
          output[0] = vout0;
11112
0
          output[1] = vout1;
11113
0
          output[2] = vout2;
11114
0
          output[3] = vout3;
11115
0
          output = (float*restrict) ((uintptr_t) output + output_stride);
11116
0
          n -= 1;
11117
0
        } while (n != 0);
11118
0
      }
11119
0
      output = (float*restrict) ((uintptr_t) output - output_decrement);
11120
0
      input += 4;
11121
0
    }
11122
0
    output_decrement += 2 * sizeof(float);
11123
0
    if (mc & (2 * sizeof(float))) {
11124
0
      const float* w = weights;
11125
0
      const int32_t* dmap = widx_dmap;
11126
0
      const uint32_t* nnzmap = nidx_nnzmap;
11127
0
      size_t n = nc;
11128
0
      while (n >= 2) {
11129
0
        uint32_t nnz = *nnzmap++;
11130
0
        float vacc0x0 = *w++;
11131
0
        float vacc0x1 = *w++;
11132
0
        float vacc1x0 = vacc0x0;
11133
0
        float vacc1x1 = vacc0x1;
11134
0
        if XNN_LIKELY(nnz != 0) {
11135
0
          do {
11136
0
            const intptr_t diff = *dmap++;
11137
0
            const float vi0 = input[0];
11138
0
            const float vi1 = input[1];
11139
0
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
11140
0
            const float vw0 = *w++;
11141
0
            const float vw1 = *w++;
11142
0
            vacc0x0 += vi0 * vw0;
11143
0
            vacc1x0 += vi1 * vw0;
11144
0
            vacc0x1 += vi0 * vw1;
11145
0
            vacc1x1 += vi1 * vw1;
11146
0
          } while (--nnz != 0);
11147
0
        }
11148
0
        float vout0x0 = math_min_f32(vacc0x0, vmax);
11149
0
        float vout1x0 = math_min_f32(vacc1x0, vmax);
11150
0
        float vout0x1 = math_min_f32(vacc0x1, vmax);
11151
0
        float vout1x1 = math_min_f32(vacc1x1, vmax);
11152
0
        vout0x0 = math_max_f32(vout0x0, vmin);
11153
0
        vout1x0 = math_max_f32(vout1x0, vmin);
11154
0
        vout0x1 = math_max_f32(vout0x1, vmin);
11155
0
        vout1x1 = math_max_f32(vout1x1, vmin);
11156
0
        output[0] = vout0x0;
11157
0
        output[1] = vout1x0;
11158
0
        output = (float*restrict) ((uintptr_t) output + output_stride);
11159
0
        output[0] = vout0x1;
11160
0
        output[1] = vout1x1;
11161
0
        output = (float*restrict) ((uintptr_t) output + output_stride);
11162
0
        n -= 2;
11163
0
      }
11164
0
      if XNN_UNLIKELY(n != 0) {
11165
0
        do {
11166
0
          uint32_t nnz = *nnzmap++;
11167
0
          float vacc0 = *w++;
11168
0
          float vacc1 = vacc0;
11169
0
          if XNN_LIKELY(nnz != 0) {
11170
0
            do {
11171
0
              const intptr_t diff = *dmap++;
11172
0
              const float vi0 = input[0];
11173
0
              const float vi1 = input[1];
11174
0
              input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
11175
0
              const float vw = *w++;
11176
0
              vacc0 += vi0 * vw;
11177
0
              vacc1 += vi1 * vw;
11178
0
            } while (--nnz != 0);
11179
0
          }
11180
0
          float vout0 = math_min_f32(vacc0, vmax);
11181
0
          float vout1 = math_min_f32(vacc1, vmax);
11182
0
          vout0 = math_max_f32(vout0, vmin);
11183
0
          vout1 = math_max_f32(vout1, vmin);
11184
0
          output[0] = vout0;
11185
0
          output[1] = vout1;
11186
0
          output = (float*restrict) ((uintptr_t) output + output_stride);
11187
0
          n -= 1;
11188
0
        } while (n != 0);
11189
0
      }
11190
0
      output = (float*restrict) ((uintptr_t) output - output_decrement);
11191
0
      input += 2;
11192
0
    }
11193
0
    output_decrement += 1 * sizeof(float);
11194
0
    if (mc & (1 * sizeof(float))) {
11195
0
      const float* w = weights;
11196
0
      const int32_t* dmap = widx_dmap;
11197
0
      const uint32_t* nnzmap = nidx_nnzmap;
11198
0
      size_t n = nc;
11199
0
      while (n >= 2) {
11200
0
        uint32_t nnz = *nnzmap++;
11201
0
        float vacc0x0 = *w++;
11202
0
        float vacc0x1 = *w++;
11203
0
        if XNN_LIKELY(nnz != 0) {
11204
0
          do {
11205
0
            const intptr_t diff = *dmap++;
11206
0
            const float vi0 = input[0];
11207
0
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
11208
0
            const float vw0 = *w++;
11209
0
            const float vw1 = *w++;
11210
0
            vacc0x0 += vi0 * vw0;
11211
0
            vacc0x1 += vi0 * vw1;
11212
0
          } while (--nnz != 0);
11213
0
        }
11214
0
        float vout0x0 = math_min_f32(vacc0x0, vmax);
11215
0
        float vout0x1 = math_min_f32(vacc0x1, vmax);
11216
0
        vout0x0 = math_max_f32(vout0x0, vmin);
11217
0
        vout0x1 = math_max_f32(vout0x1, vmin);
11218
0
        output[0] = vout0x0;
11219
0
        output = (float*restrict) ((uintptr_t) output + output_stride);
11220
0
        output[0] = vout0x1;
11221
0
        output = (float*restrict) ((uintptr_t) output + output_stride);
11222
0
        n -= 2;
11223
0
      }
11224
0
      if XNN_UNLIKELY(n != 0) {
11225
0
        do {
11226
0
          uint32_t nnz = *nnzmap++;
11227
0
          float vacc0 = *w++;
11228
0
          if XNN_LIKELY(nnz != 0) {
11229
0
            do {
11230
0
              const intptr_t diff = *dmap++;
11231
0
              const float vi0 = input[0];
11232
0
              input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
11233
0
              const float vw = *w++;
11234
0
              vacc0 += vi0 * vw;
11235
0
            } while (--nnz != 0);
11236
0
          }
11237
0
          float vout0 = math_min_f32(vacc0, vmax);
11238
0
          vout0 = math_max_f32(vout0, vmin);
11239
0
          output[0] = vout0;
11240
0
          output = (float*restrict) ((uintptr_t) output + output_stride);
11241
0
          n -= 1;
11242
0
        } while (n != 0);
11243
0
      }
11244
0
      output = (float*restrict) ((uintptr_t) output - output_decrement);
11245
0
      input += 1;
11246
0
    }
11247
0
  }
11248
0
}
11249
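/*
 * Worked example, not part of the generated XNNPACK source, of the output
 * pointer arithmetic the spmm kernels above share: the output tensor is
 * channel-major, so the same block of pixels in consecutive output channels is
 * output_stride bytes apart. Writing all nc channels for one block advances the
 * pointer by nc * output_stride bytes, and
 * output_decrement = output_stride * nc - MR * sizeof(float) pulls it back so
 * the net movement is exactly MR floats, i.e. the same channel for the next
 * pixel block; each residual block (4, 2, 1 pixels) bumps output_decrement so
 * that its own net movement matches its pixel count. A tiny check with assumed
 * numbers (10 pixels, nc = 3, MR = 8):
 */
#include <assert.h>
#include <stddef.h>

static void sketch_spmm_output_walk_example(void) {
  const size_t nc = 3;                                            // output channels (assumed)
  const size_t output_stride = 10 * sizeof(float);                // 10 pixels per channel -> 40 bytes
  const size_t mr_bytes = 8 * sizeof(float);                      // one block of 8 pixels -> 32 bytes
  const size_t output_decrement = output_stride * nc - mr_bytes;  // 120 - 32 = 88
  // Advancing by output_stride per channel and pulling back by output_decrement
  // leaves a net advance of exactly one MR-wide block of floats.
  assert(nc * output_stride - output_decrement == mr_bytes);
}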
11250
void xnn_f32_spmm_minmax_ukernel_8x4__scalar(
11251
    size_t mc,
11252
    size_t nc,
11253
    const float* input,
11254
    const float* weights,
11255
    const int32_t* widx_dmap,
11256
    const uint32_t* nidx_nnzmap,
11257
    float* output,
11258
    size_t output_stride,
11259
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
11260
0
{
11261
0
  assert(mc != 0);
11262
0
  assert(mc % sizeof(float) == 0);
11263
0
  assert(nc != 0);
11264
11265
0
  const float vmin = params->scalar.min;
11266
0
  const float vmax = params->scalar.max;
11267
0
  size_t output_decrement = output_stride * nc - 8 * sizeof(float);
11268
0
  while (mc >= 8 * sizeof(float)) {
11269
0
    const float* w = weights;
11270
0
    const int32_t* dmap = widx_dmap;
11271
0
    const uint32_t* nnzmap = nidx_nnzmap;
11272
0
    size_t n = nc;
11273
0
    while (n >= 4) {
11274
0
      uint32_t nnz = *nnzmap++;
11275
0
      float vacc0x0 = *w++;
11276
0
      float vacc0x1 = *w++;
11277
0
      float vacc0x2 = *w++;
11278
0
      float vacc0x3 = *w++;
11279
0
      float vacc1x0 = vacc0x0;
11280
0
      float vacc1x1 = vacc0x1;
11281
0
      float vacc1x2 = vacc0x2;
11282
0
      float vacc1x3 = vacc0x3;
11283
0
      float vacc2x0 = vacc0x0;
11284
0
      float vacc2x1 = vacc0x1;
11285
0
      float vacc2x2 = vacc0x2;
11286
0
      float vacc2x3 = vacc0x3;
11287
0
      float vacc3x0 = vacc0x0;
11288
0
      float vacc3x1 = vacc0x1;
11289
0
      float vacc3x2 = vacc0x2;
11290
0
      float vacc3x3 = vacc0x3;
11291
0
      float vacc4x0 = vacc0x0;
11292
0
      float vacc4x1 = vacc0x1;
11293
0
      float vacc4x2 = vacc0x2;
11294
0
      float vacc4x3 = vacc0x3;
11295
0
      float vacc5x0 = vacc0x0;
11296
0
      float vacc5x1 = vacc0x1;
11297
0
      float vacc5x2 = vacc0x2;
11298
0
      float vacc5x3 = vacc0x3;
11299
0
      float vacc6x0 = vacc0x0;
11300
0
      float vacc6x1 = vacc0x1;
11301
0
      float vacc6x2 = vacc0x2;
11302
0
      float vacc6x3 = vacc0x3;
11303
0
      float vacc7x0 = vacc0x0;
11304
0
      float vacc7x1 = vacc0x1;
11305
0
      float vacc7x2 = vacc0x2;
11306
0
      float vacc7x3 = vacc0x3;
11307
0
      if XNN_LIKELY(nnz != 0) {
11308
0
        do {
11309
0
          const intptr_t diff = *dmap++;
11310
0
          const float vi0 = input[0];
11311
0
          const float vi1 = input[1];
11312
0
          const float vi2 = input[2];
11313
0
          const float vi3 = input[3];
11314
0
          const float vi4 = input[4];
11315
0
          const float vi5 = input[5];
11316
0
          const float vi6 = input[6];
11317
0
          const float vi7 = input[7];
11318
0
          input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
11319
0
          const float vw0 = *w++;
11320
0
          const float vw1 = *w++;
11321
0
          const float vw2 = *w++;
11322
0
          const float vw3 = *w++;
11323
0
          vacc0x0 += vi0 * vw0;
11324
0
          vacc1x0 += vi1 * vw0;
11325
0
          vacc2x0 += vi2 * vw0;
11326
0
          vacc3x0 += vi3 * vw0;
11327
0
          vacc4x0 += vi4 * vw0;
11328
0
          vacc5x0 += vi5 * vw0;
11329
0
          vacc6x0 += vi6 * vw0;
11330
0
          vacc7x0 += vi7 * vw0;
11331
0
          vacc0x1 += vi0 * vw1;
11332
0
          vacc1x1 += vi1 * vw1;
11333
0
          vacc2x1 += vi2 * vw1;
11334
0
          vacc3x1 += vi3 * vw1;
11335
0
          vacc4x1 += vi4 * vw1;
11336
0
          vacc5x1 += vi5 * vw1;
11337
0
          vacc6x1 += vi6 * vw1;
11338
0
          vacc7x1 += vi7 * vw1;
11339
0
          vacc0x2 += vi0 * vw2;
11340
0
          vacc1x2 += vi1 * vw2;
11341
0
          vacc2x2 += vi2 * vw2;
11342
0
          vacc3x2 += vi3 * vw2;
11343
0
          vacc4x2 += vi4 * vw2;
11344
0
          vacc5x2 += vi5 * vw2;
11345
0
          vacc6x2 += vi6 * vw2;
11346
0
          vacc7x2 += vi7 * vw2;
11347
0
          vacc0x3 += vi0 * vw3;
11348
0
          vacc1x3 += vi1 * vw3;
11349
0
          vacc2x3 += vi2 * vw3;
11350
0
          vacc3x3 += vi3 * vw3;
11351
0
          vacc4x3 += vi4 * vw3;
11352
0
          vacc5x3 += vi5 * vw3;
11353
0
          vacc6x3 += vi6 * vw3;
11354
0
          vacc7x3 += vi7 * vw3;
11355
0
        } while (--nnz != 0);
11356
0
      }
11357
0
      float vout0x0 = math_min_f32(vacc0x0, vmax);
11358
0
      float vout1x0 = math_min_f32(vacc1x0, vmax);
11359
0
      float vout2x0 = math_min_f32(vacc2x0, vmax);
11360
0
      float vout3x0 = math_min_f32(vacc3x0, vmax);
11361
0
      float vout4x0 = math_min_f32(vacc4x0, vmax);
11362
0
      float vout5x0 = math_min_f32(vacc5x0, vmax);
11363
0
      float vout6x0 = math_min_f32(vacc6x0, vmax);
11364
0
      float vout7x0 = math_min_f32(vacc7x0, vmax);
11365
0
      float vout0x1 = math_min_f32(vacc0x1, vmax);
11366
0
      float vout1x1 = math_min_f32(vacc1x1, vmax);
11367
0
      float vout2x1 = math_min_f32(vacc2x1, vmax);
11368
0
      float vout3x1 = math_min_f32(vacc3x1, vmax);
11369
0
      float vout4x1 = math_min_f32(vacc4x1, vmax);
11370
0
      float vout5x1 = math_min_f32(vacc5x1, vmax);
11371
0
      float vout6x1 = math_min_f32(vacc6x1, vmax);
11372
0
      float vout7x1 = math_min_f32(vacc7x1, vmax);
11373
0
      float vout0x2 = math_min_f32(vacc0x2, vmax);
11374
0
      float vout1x2 = math_min_f32(vacc1x2, vmax);
11375
0
      float vout2x2 = math_min_f32(vacc2x2, vmax);
11376
0
      float vout3x2 = math_min_f32(vacc3x2, vmax);
11377
0
      float vout4x2 = math_min_f32(vacc4x2, vmax);
11378
0
      float vout5x2 = math_min_f32(vacc5x2, vmax);
11379
0
      float vout6x2 = math_min_f32(vacc6x2, vmax);
11380
0
      float vout7x2 = math_min_f32(vacc7x2, vmax);
11381
0
      float vout0x3 = math_min_f32(vacc0x3, vmax);
11382
0
      float vout1x3 = math_min_f32(vacc1x3, vmax);
11383
0
      float vout2x3 = math_min_f32(vacc2x3, vmax);
11384
0
      float vout3x3 = math_min_f32(vacc3x3, vmax);
11385
0
      float vout4x3 = math_min_f32(vacc4x3, vmax);
11386
0
      float vout5x3 = math_min_f32(vacc5x3, vmax);
11387
0
      float vout6x3 = math_min_f32(vacc6x3, vmax);
11388
0
      float vout7x3 = math_min_f32(vacc7x3, vmax);
11389
0
      vout0x0 = math_max_f32(vout0x0, vmin);
11390
0
      vout1x0 = math_max_f32(vout1x0, vmin);
11391
0
      vout2x0 = math_max_f32(vout2x0, vmin);
11392
0
      vout3x0 = math_max_f32(vout3x0, vmin);
11393
0
      vout4x0 = math_max_f32(vout4x0, vmin);
11394
0
      vout5x0 = math_max_f32(vout5x0, vmin);
11395
0
      vout6x0 = math_max_f32(vout6x0, vmin);
11396
0
      vout7x0 = math_max_f32(vout7x0, vmin);
11397
0
      vout0x1 = math_max_f32(vout0x1, vmin);
11398
0
      vout1x1 = math_max_f32(vout1x1, vmin);
11399
0
      vout2x1 = math_max_f32(vout2x1, vmin);
11400
0
      vout3x1 = math_max_f32(vout3x1, vmin);
11401
0
      vout4x1 = math_max_f32(vout4x1, vmin);
11402
0
      vout5x1 = math_max_f32(vout5x1, vmin);
11403
0
      vout6x1 = math_max_f32(vout6x1, vmin);
11404
0
      vout7x1 = math_max_f32(vout7x1, vmin);
11405
0
      vout0x2 = math_max_f32(vout0x2, vmin);
11406
0
      vout1x2 = math_max_f32(vout1x2, vmin);
11407
0
      vout2x2 = math_max_f32(vout2x2, vmin);
11408
0
      vout3x2 = math_max_f32(vout3x2, vmin);
11409
0
      vout4x2 = math_max_f32(vout4x2, vmin);
11410
0
      vout5x2 = math_max_f32(vout5x2, vmin);
11411
0
      vout6x2 = math_max_f32(vout6x2, vmin);
11412
0
      vout7x2 = math_max_f32(vout7x2, vmin);
11413
0
      vout0x3 = math_max_f32(vout0x3, vmin);
11414
0
      vout1x3 = math_max_f32(vout1x3, vmin);
11415
0
      vout2x3 = math_max_f32(vout2x3, vmin);
11416
0
      vout3x3 = math_max_f32(vout3x3, vmin);
11417
0
      vout4x3 = math_max_f32(vout4x3, vmin);
11418
0
      vout5x3 = math_max_f32(vout5x3, vmin);
11419
0
      vout6x3 = math_max_f32(vout6x3, vmin);
11420
0
      vout7x3 = math_max_f32(vout7x3, vmin);
11421
0
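      // The next eight stores write column 3 at the current output row block and are
      // immediately overwritten by the column-0 stores that follow, before the output
      // pointer advances; the listed source carries this redundancy as generated.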
      output[0] = vout0x3;
11422
0
      output[1] = vout1x3;
11423
0
      output[2] = vout2x3;
11424
0
      output[3] = vout3x3;
11425
0
      output[4] = vout4x3;
11426
0
      output[5] = vout5x3;
11427
0
      output[6] = vout6x3;
11428
0
      output[7] = vout7x3;
11429
0
      output[0] = vout0x0;
11430
0
      output[1] = vout1x0;
11431
0
      output[2] = vout2x0;
11432
0
      output[3] = vout3x0;
11433
0
      output[4] = vout4x0;
11434
0
      output[5] = vout5x0;
11435
0
      output[6] = vout6x0;
11436
0
      output[7] = vout7x0;
11437
0
      output = (float*restrict) ((uintptr_t) output + output_stride);
11438
0
      output[0] = vout0x1;
11439
0
      output[1] = vout1x1;
11440
0
      output[2] = vout2x1;
11441
0
      output[3] = vout3x1;
11442
0
      output[4] = vout4x1;
11443
0
      output[5] = vout5x1;
11444
0
      output[6] = vout6x1;
11445
0
      output[7] = vout7x1;
11446
0
      output = (float*restrict) ((uintptr_t) output + output_stride);
11447
0
      output[0] = vout0x2;
11448
0
      output[1] = vout1x2;
11449
0
      output[2] = vout2x2;
11450
0
      output[3] = vout3x2;
11451
0
      output[4] = vout4x2;
11452
0
      output[5] = vout5x2;
11453
0
      output[6] = vout6x2;
11454
0
      output[7] = vout7x2;
11455
0
      output = (float*restrict) ((uintptr_t) output + output_stride);
11456
0
      output[0] = vout0x3;
11457
0
      output[1] = vout1x3;
11458
0
      output[2] = vout2x3;
11459
0
      output[3] = vout3x3;
11460
0
      output[4] = vout4x3;
11461
0
      output[5] = vout5x3;
11462
0
      output[6] = vout6x3;
11463
0
      output[7] = vout7x3;
11464
0
      output = (float*restrict) ((uintptr_t) output + output_stride);
11465
0
      n -= 4;
11466
0
    }
11467
0
    if XNN_UNLIKELY(n != 0) {
11468
0
      do {
11469
0
        uint32_t nnz = *nnzmap++;
11470
0
        float vacc0 = *w++;
11471
0
        float vacc1 = vacc0;
11472
0
        float vacc2 = vacc0;
11473
0
        float vacc3 = vacc0;
11474
0
        float vacc4 = vacc0;
11475
0
        float vacc5 = vacc0;
11476
0
        float vacc6 = vacc0;
11477
0
        float vacc7 = vacc0;
11478
0
        if XNN_LIKELY(nnz != 0) {
11479
0
          do {
11480
0
            const intptr_t diff = *dmap++;
11481
0
            const float vi0 = input[0];
11482
0
            const float vi1 = input[1];
11483
0
            const float vi2 = input[2];
11484
0
            const float vi3 = input[3];
11485
0
            const float vi4 = input[4];
11486
0
            const float vi5 = input[5];
11487
0
            const float vi6 = input[6];
11488
0
            const float vi7 = input[7];
11489
0
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
11490
0
            const float vw = *w++;
11491
0
            vacc0 += vi0 * vw;
11492
0
            vacc1 += vi1 * vw;
11493
0
            vacc2 += vi2 * vw;
11494
0
            vacc3 += vi3 * vw;
11495
0
            vacc4 += vi4 * vw;
11496
0
            vacc5 += vi5 * vw;
11497
0
            vacc6 += vi6 * vw;
11498
0
            vacc7 += vi7 * vw;
11499
0
          } while (--nnz != 0);
11500
0
        }
11501
0
        float vout0 = math_min_f32(vacc0, vmax);
11502
0
        float vout1 = math_min_f32(vacc1, vmax);
11503
0
        float vout2 = math_min_f32(vacc2, vmax);
11504
0
        float vout3 = math_min_f32(vacc3, vmax);
11505
0
        float vout4 = math_min_f32(vacc4, vmax);
11506
0
        float vout5 = math_min_f32(vacc5, vmax);
11507
0
        float vout6 = math_min_f32(vacc6, vmax);
11508
0
        float vout7 = math_min_f32(vacc7, vmax);
11509
0
        vout0 = math_max_f32(vout0, vmin);
11510
0
        vout1 = math_max_f32(vout1, vmin);
11511
0
        vout2 = math_max_f32(vout2, vmin);
11512
0
        vout3 = math_max_f32(vout3, vmin);
11513
0
        vout4 = math_max_f32(vout4, vmin);
11514
0
        vout5 = math_max_f32(vout5, vmin);
11515
0
        vout6 = math_max_f32(vout6, vmin);
11516
0
        vout7 = math_max_f32(vout7, vmin);
11517
0
        output[0] = vout0;
11518
0
        output[1] = vout1;
11519
0
        output[2] = vout2;
11520
0
        output[3] = vout3;
11521
0
        output[4] = vout4;
11522
0
        output[5] = vout5;
11523
0
        output[6] = vout6;
11524
0
        output[7] = vout7;
11525
0
        output = (float*restrict) ((uintptr_t) output + output_stride);
11526
0
        n -= 1;
11527
0
      } while (n != 0);
11528
0
    }
11529
0
    output = (float*restrict) ((uintptr_t) output - output_decrement);
11530
0
    input += 8;
11531
0
    mc -= 8 * sizeof(float);
11532
0
  }
11533
0
  if XNN_UNLIKELY(mc != 0) {
11534
0
    output_decrement += 4 * sizeof(float);
11535
0
    if (mc & (4 * sizeof(float))) {
11536
0
      const float* w = weights;
11537
0
      const int32_t* dmap = widx_dmap;
11538
0
      const uint32_t* nnzmap = nidx_nnzmap;
11539
0
      size_t n = nc;
11540
0
      while (n >= 4) {
11541
0
        uint32_t nnz = *nnzmap++;
11542
0
        float vacc0x0 = *w++;
11543
0
        float vacc0x1 = *w++;
11544
0
        float vacc0x2 = *w++;
11545
0
        float vacc0x3 = *w++;
11546
0
        float vacc1x0 = vacc0x0;
11547
0
        float vacc2x0 = vacc0x0;
11548
0
        float vacc3x0 = vacc0x0;
11549
0
        float vacc1x1 = vacc0x1;
11550
0
        float vacc2x1 = vacc0x1;
11551
0
        float vacc3x1 = vacc0x1;
11552
0
        float vacc1x2 = vacc0x2;
11553
0
        float vacc2x2 = vacc0x2;
11554
0
        float vacc3x2 = vacc0x2;
11555
0
        float vacc1x3 = vacc0x3;
11556
0
        float vacc2x3 = vacc0x3;
11557
0
        float vacc3x3 = vacc0x3;
11558
0
        if XNN_LIKELY(nnz != 0) {
11559
0
          do {
11560
0
            const intptr_t diff = *dmap++;
11561
0
            const float vi0 = input[0];
11562
0
            const float vi1 = input[1];
11563
0
            const float vi2 = input[2];
11564
0
            const float vi3 = input[3];
11565
0
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
11566
0
            const float vw0 = *w++;
11567
0
            const float vw1 = *w++;
11568
0
            const float vw2 = *w++;
11569
0
            const float vw3 = *w++;
11570
0
            vacc0x0 += vi0 * vw0;
11571
0
            vacc1x0 += vi1 * vw0;
11572
0
            vacc2x0 += vi2 * vw0;
11573
0
            vacc3x0 += vi3 * vw0;
11574
0
            vacc0x1 += vi0 * vw1;
11575
0
            vacc1x1 += vi1 * vw1;
11576
0
            vacc2x1 += vi2 * vw1;
11577
0
            vacc3x1 += vi3 * vw1;
11578
0
            vacc0x2 += vi0 * vw2;
11579
0
            vacc1x2 += vi1 * vw2;
11580
0
            vacc2x2 += vi2 * vw2;
11581
0
            vacc3x2 += vi3 * vw2;
11582
0
            vacc0x3 += vi0 * vw3;
11583
0
            vacc1x3 += vi1 * vw3;
11584
0
            vacc2x3 += vi2 * vw3;
11585
0
            vacc3x3 += vi3 * vw3;
11586
0
          } while (--nnz != 0);
11587
0
        }
11588
0
        float vout0x0 = math_min_f32(vacc0x0, vmax);
11589
0
        float vout1x0 = math_min_f32(vacc1x0, vmax);
11590
0
        float vout2x0 = math_min_f32(vacc2x0, vmax);
11591
0
        float vout3x0 = math_min_f32(vacc3x0, vmax);
11592
0
        float vout0x1 = math_min_f32(vacc0x1, vmax);
11593
0
        float vout1x1 = math_min_f32(vacc1x1, vmax);
11594
0
        float vout2x1 = math_min_f32(vacc2x1, vmax);
11595
0
        float vout3x1 = math_min_f32(vacc3x1, vmax);
11596
0
        float vout0x2 = math_min_f32(vacc0x2, vmax);
11597
0
        float vout1x2 = math_min_f32(vacc1x2, vmax);
11598
0
        float vout2x2 = math_min_f32(vacc2x2, vmax);
11599
0
        float vout3x2 = math_min_f32(vacc3x2, vmax);
11600
0
        float vout0x3 = math_min_f32(vacc0x3, vmax);
11601
0
        float vout1x3 = math_min_f32(vacc1x3, vmax);
11602
0
        float vout2x3 = math_min_f32(vacc2x3, vmax);
11603
0
        float vout3x3 = math_min_f32(vacc3x3, vmax);
11604
0
        vout0x0 = math_max_f32(vout0x0, vmin);
11605
0
        vout1x0 = math_max_f32(vout1x0, vmin);
11606
0
        vout2x0 = math_max_f32(vout2x0, vmin);
11607
0
        vout3x0 = math_max_f32(vout3x0, vmin);
11608
0
        vout0x1 = math_max_f32(vout0x1, vmin);
11609
0
        vout1x1 = math_max_f32(vout1x1, vmin);
11610
0
        vout2x1 = math_max_f32(vout2x1, vmin);
11611
0
        vout3x1 = math_max_f32(vout3x1, vmin);
11612
0
        vout0x2 = math_max_f32(vout0x2, vmin);
11613
0
        vout1x2 = math_max_f32(vout1x2, vmin);
11614
0
        vout2x2 = math_max_f32(vout2x2, vmin);
11615
0
        vout3x2 = math_max_f32(vout3x2, vmin);
11616
0
        vout0x3 = math_max_f32(vout0x3, vmin);
11617
0
        vout1x3 = math_max_f32(vout1x3, vmin);
11618
0
        vout2x3 = math_max_f32(vout2x3, vmin);
11619
0
        vout3x3 = math_max_f32(vout3x3, vmin);
11620
0
        output[0] = vout0x0;
11621
0
        output[1] = vout1x0;
11622
0
        output[2] = vout2x0;
11623
0
        output[3] = vout3x0;
11624
0
        output = (float*restrict) ((uintptr_t) output + output_stride);
11625
0
        output[0] = vout0x1;
11626
0
        output[1] = vout1x1;
11627
0
        output[2] = vout2x1;
11628
0
        output[3] = vout3x1;
11629
0
        output = (float*restrict) ((uintptr_t) output + output_stride);
11630
0
        output[0] = vout0x2;
11631
0
        output[1] = vout1x2;
11632
0
        output[2] = vout2x2;
11633
0
        output[3] = vout3x2;
11634
0
        output = (float*restrict) ((uintptr_t) output + output_stride);
11635
0
        output[0] = vout0x3;
11636
0
        output[1] = vout1x3;
11637
0
        output[2] = vout2x3;
11638
0
        output[3] = vout3x3;
11639
0
        output = (float*restrict) ((uintptr_t) output + output_stride);
11640
0
        n -= 4;
11641
0
      }
11642
0
      if XNN_UNLIKELY(n != 0) {
11643
0
        do {
11644
0
          uint32_t nnz = *nnzmap++;
11645
0
          float vacc0 = *w++;
11646
0
          float vacc1 = vacc0;
11647
0
          float vacc2 = vacc0;
11648
0
          float vacc3 = vacc0;
11649
0
          if XNN_LIKELY(nnz != 0) {
11650
0
            do {
11651
0
              const intptr_t diff = *dmap++;
11652
0
              const float vi0 = input[0];
11653
0
              const float vi1 = input[1];
11654
0
              const float vi2 = input[2];
11655
0
              const float vi3 = input[3];
11656
0
              input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
11657
0
              const float vw = *w++;
11658
0
              vacc0 += vi0 * vw;
11659
0
              vacc1 += vi1 * vw;
11660
0
              vacc2 += vi2 * vw;
11661
0
              vacc3 += vi3 * vw;
11662
0
            } while (--nnz != 0);
11663
0
          }
11664
0
          float vout0 = math_min_f32(vacc0, vmax);
11665
0
          float vout1 = math_min_f32(vacc1, vmax);
11666
0
          float vout2 = math_min_f32(vacc2, vmax);
11667
0
          float vout3 = math_min_f32(vacc3, vmax);
11668
0
          vout0 = math_max_f32(vout0, vmin);
11669
0
          vout1 = math_max_f32(vout1, vmin);
11670
0
          vout2 = math_max_f32(vout2, vmin);
11671
0
          vout3 = math_max_f32(vout3, vmin);
11672
0
          output[0] = vout0;
11673
0
          output[1] = vout1;
11674
0
          output[2] = vout2;
11675
0
          output[3] = vout3;
11676
0
          output = (float*restrict) ((uintptr_t) output + output_stride);
11677
0
          n -= 1;
11678
0
        } while (n != 0);
11679
0
      }
11680
0
      output = (float*restrict) ((uintptr_t) output - output_decrement);
11681
0
      input += 4;
11682
0
    }
11683
0
    output_decrement += 2 * sizeof(float);
11684
0
    if (mc & (2 * sizeof(float))) {
11685
0
      const float* w = weights;
11686
0
      const int32_t* dmap = widx_dmap;
11687
0
      const uint32_t* nnzmap = nidx_nnzmap;
11688
0
      size_t n = nc;
11689
0
      while (n >= 4) {
11690
0
        uint32_t nnz = *nnzmap++;
11691
0
        float vacc0x0 = *w++;
11692
0
        float vacc0x1 = *w++;
11693
0
        float vacc0x2 = *w++;
11694
0
        float vacc0x3 = *w++;
11695
0
        float vacc1x0 = vacc0x0;
11696
0
        float vacc1x1 = vacc0x1;
11697
0
        float vacc1x2 = vacc0x2;
11698
0
        float vacc1x3 = vacc0x3;
11699
0
        if XNN_LIKELY(nnz != 0) {
11700
0
          do {
11701
0
            const intptr_t diff = *dmap++;
11702
0
            const float vi0 = input[0];
11703
0
            const float vi1 = input[1];
11704
0
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
11705
0
            const float vw0 = *w++;
11706
0
            const float vw1 = *w++;
11707
0
            const float vw2 = *w++;
11708
0
            const float vw3 = *w++;
11709
0
            vacc0x0 += vi0 * vw0;
11710
0
            vacc1x0 += vi1 * vw0;
11711
0
            vacc0x1 += vi0 * vw1;
11712
0
            vacc1x1 += vi1 * vw1;
11713
0
            vacc0x2 += vi0 * vw2;
11714
0
            vacc1x2 += vi1 * vw2;
11715
0
            vacc0x3 += vi0 * vw3;
11716
0
            vacc1x3 += vi1 * vw3;
11717
0
          } while (--nnz != 0);
11718
0
        }
11719
0
        float vout0x0 = math_min_f32(vacc0x0, vmax);
11720
0
        float vout1x0 = math_min_f32(vacc1x0, vmax);
11721
0
        float vout0x1 = math_min_f32(vacc0x1, vmax);
11722
0
        float vout1x1 = math_min_f32(vacc1x1, vmax);
11723
0
        float vout0x2 = math_min_f32(vacc0x2, vmax);
11724
0
        float vout1x2 = math_min_f32(vacc1x2, vmax);
11725
0
        float vout0x3 = math_min_f32(vacc0x3, vmax);
11726
0
        float vout1x3 = math_min_f32(vacc1x3, vmax);
11727
0
        vout0x0 = math_max_f32(vout0x0, vmin);
11728
0
        vout1x0 = math_max_f32(vout1x0, vmin);
11729
0
        vout0x1 = math_max_f32(vout0x1, vmin);
11730
0
        vout1x1 = math_max_f32(vout1x1, vmin);
11731
0
        vout0x2 = math_max_f32(vout0x2, vmin);
11732
0
        vout1x2 = math_max_f32(vout1x2, vmin);
11733
0
        vout0x3 = math_max_f32(vout0x3, vmin);
11734
0
        vout1x3 = math_max_f32(vout1x3, vmin);
11735
0
        output[0] = vout0x0;
11736
0
        output[1] = vout1x0;
11737
0
        output = (float*restrict) ((uintptr_t) output + output_stride);
11738
0
        output[0] = vout0x1;
11739
0
        output[1] = vout1x1;
11740
0
        output = (float*restrict) ((uintptr_t) output + output_stride);
11741
0
        output[0] = vout0x2;
11742
0
        output[1] = vout1x2;
11743
0
        output = (float*restrict) ((uintptr_t) output + output_stride);
11744
0
        output[0] = vout0x3;
11745
0
        output[1] = vout1x3;
11746
0
        output = (float*restrict) ((uintptr_t) output + output_stride);
11747
0
        n -= 4;
11748
0
      }
11749
0
      if XNN_UNLIKELY(n != 0) {
11750
0
        do {
11751
0
          uint32_t nnz = *nnzmap++;
11752
0
          float vacc0 = *w++;
11753
0
          float vacc1 = vacc0;
11754
0
          if XNN_LIKELY(nnz != 0) {
11755
0
            do {
11756
0
              const intptr_t diff = *dmap++;
11757
0
              const float vi0 = input[0];
11758
0
              const float vi1 = input[1];
11759
0
              input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
11760
0
              const float vw = *w++;
11761
0
              vacc0 += vi0 * vw;
11762
0
              vacc1 += vi1 * vw;
11763
0
            } while (--nnz != 0);
11764
0
          }
11765
0
          float vout0 = math_min_f32(vacc0, vmax);
11766
0
          float vout1 = math_min_f32(vacc1, vmax);
11767
0
          vout0 = math_max_f32(vout0, vmin);
11768
0
          vout1 = math_max_f32(vout1, vmin);
11769
0
          output[0] = vout0;
11770
0
          output[1] = vout1;
11771
0
          output = (float*restrict) ((uintptr_t) output + output_stride);
11772
0
          n -= 1;
11773
0
        } while (n != 0);
11774
0
      }
11775
0
      output = (float*restrict) ((uintptr_t) output - output_decrement);
11776
0
      input += 2;
11777
0
    }
11778
0
    output_decrement += 1 * sizeof(float);
11779
0
    if (mc & (1 * sizeof(float))) {
11780
0
      const float* w = weights;
11781
0
      const int32_t* dmap = widx_dmap;
11782
0
      const uint32_t* nnzmap = nidx_nnzmap;
11783
0
      size_t n = nc;
11784
0
      while (n >= 4) {
11785
0
        uint32_t nnz = *nnzmap++;
11786
0
        float vacc0x0 = *w++;
11787
0
        float vacc0x1 = *w++;
11788
0
        float vacc0x2 = *w++;
11789
0
        float vacc0x3 = *w++;
11790
0
        if XNN_LIKELY(nnz != 0) {
11791
0
          do {
11792
0
            const intptr_t diff = *dmap++;
11793
0
            const float vi0 = input[0];
11794
0
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
11795
0
            const float vw0 = *w++;
11796
0
            const float vw1 = *w++;
11797
0
            const float vw2 = *w++;
11798
0
            const float vw3 = *w++;
11799
0
            vacc0x0 += vi0 * vw0;
11800
0
            vacc0x1 += vi0 * vw1;
11801
0
            vacc0x2 += vi0 * vw2;
11802
0
            vacc0x3 += vi0 * vw3;
11803
0
          } while (--nnz != 0);
11804
0
        }
11805
0
        float vout0x0 = math_min_f32(vacc0x0, vmax);
11806
0
        float vout0x1 = math_min_f32(vacc0x1, vmax);
11807
0
        float vout0x2 = math_min_f32(vacc0x2, vmax);
11808
0
        float vout0x3 = math_min_f32(vacc0x3, vmax);
11809
0
        vout0x0 = math_max_f32(vout0x0, vmin);
11810
0
        vout0x1 = math_max_f32(vout0x1, vmin);
11811
0
        vout0x2 = math_max_f32(vout0x2, vmin);
11812
0
        vout0x3 = math_max_f32(vout0x3, vmin);
11813
0
        output[0] = vout0x0;
11814
0
        output = (float*restrict) ((uintptr_t) output + output_stride);
11815
0
        output[0] = vout0x1;
11816
0
        output = (float*restrict) ((uintptr_t) output + output_stride);
11817
0
        output[0] = vout0x2;
11818
0
        output = (float*restrict) ((uintptr_t) output + output_stride);
11819
0
        output[0] = vout0x3;
11820
0
        output = (float*restrict) ((uintptr_t) output + output_stride);
11821
0
        n -= 4;
11822
0
      }
11823
0
      if XNN_UNLIKELY(n != 0) {
11824
0
        do {
11825
0
          uint32_t nnz = *nnzmap++;
11826
0
          float vacc0 = *w++;
11827
0
          if XNN_LIKELY(nnz != 0) {
11828
0
            do {
11829
0
              const intptr_t diff = *dmap++;
11830
0
              const float vi0 = input[0];
11831
0
              input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
11832
0
              const float vw = *w++;
11833
0
              vacc0 += vi0 * vw;
11834
0
            } while (--nnz != 0);
11835
0
          }
11836
0
          float vout0 = math_min_f32(vacc0, vmax);
11837
0
          vout0 = math_max_f32(vout0, vmin);
11838
0
          output[0] = vout0;
11839
0
          output = (float*restrict) ((uintptr_t) output + output_stride);
11840
0
          n -= 1;
11841
0
        } while (n != 0);
11842
0
      }
11843
0
      output = (float*restrict) ((uintptr_t) output - output_decrement);
11844
0
      input += 1;
11845
0
    }
11846
0
  }
11847
0
}
11848
11849
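The sparse (SpMM-style) kernel that closes above consumes three packed streams: weights (a bias per output channel followed by the non-zero weight values), nidx_nnzmap (the count of non-zeros per output-channel group), and widx_dmap (a byte offset added to the input pointer after each non-zero is read). In the listing the same offset is applied after reading eight consecutive lanes of one channel, so whatever packed widx_dmap must have scaled the offsets for the full tile; the sketch below keeps MR = 1 so the offsets are plain per-element byte strides. It is a minimal standalone illustration of the encoding with made-up example data, not XNNPACK's packing routine or entry point.

#include <stdint.h>
#include <stdio.h>

// Reference interpretation of the packed streams for one output channel, MR = 1.
// 'weights' starts with the bias, followed by one value per non-zero;
// 'dmap' holds the byte offset added to the input pointer after each non-zero;
// 'nnz' is the non-zero count for this channel (what nidx_nnzmap would supply).
static float spmm_reference_1x1(const float* weights,
                                const int32_t* dmap,
                                uint32_t nnz,
                                const float* input)
{
  float acc = *weights++;  // bias
  if (nnz != 0) {
    do {
      const intptr_t diff = (intptr_t) *dmap++;
      acc += input[0] * (*weights++);               // read the lane, then advance
      input = (const float*) ((uintptr_t) input + (uintptr_t) diff);
    } while (--nnz != 0);
  }
  return acc;
}

int main(void) {
  // Hypothetical example: 4 input channels, non-zeros at channels 0 and 2.
  const float input[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  const float weights[3] = {0.5f /* bias */, 2.0f /* w for ch 0 */, -1.0f /* w for ch 2 */};
  // After channel 0, jump forward 2 floats to channel 2; the last offset rewinds.
  const int32_t dmap[2] = {2 * (int32_t) sizeof(float), -2 * (int32_t) sizeof(float)};
  const float out = spmm_reference_1x1(weights, dmap, 2, input);
  printf("%f\n", out);  // 0.5 + 1*2 + 3*(-1) = -0.5
  return 0;
}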
void xnn_f32_vadd_minmax_ukernel__scalar_u8(
11850
    size_t batch,
11851
    const float* input_a,
11852
    const float* input_b,
11853
    float* output,
11854
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
11855
0
{
11856
0
  assert(batch != 0);
11857
0
  assert(batch % sizeof(float) == 0);
11858
0
  assert(input_a != NULL);
11859
0
  assert(input_b != NULL);
11860
0
  assert(output != NULL);
11861
11862
0
  const float voutput_min = params->scalar.min;
11863
0
  const float voutput_max = params->scalar.max;
11864
11865
0
  for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) {
11866
0
    const float va0 = input_a[0];
11867
0
    const float va1 = input_a[1];
11868
0
    const float va2 = input_a[2];
11869
0
    const float va3 = input_a[3];
11870
0
    const float va4 = input_a[4];
11871
0
    const float va5 = input_a[5];
11872
0
    const float va6 = input_a[6];
11873
0
    const float va7 = input_a[7];
11874
0
    input_a += 8;
11875
11876
0
    const float vb0 = input_b[0];
11877
0
    const float vb1 = input_b[1];
11878
0
    const float vb2 = input_b[2];
11879
0
    const float vb3 = input_b[3];
11880
0
    const float vb4 = input_b[4];
11881
0
    const float vb5 = input_b[5];
11882
0
    const float vb6 = input_b[6];
11883
0
    const float vb7 = input_b[7];
11884
0
    input_b += 8;
11885
11886
0
    float vacc0 = va0 + vb0;
11887
0
    float vacc1 = va1 + vb1;
11888
0
    float vacc2 = va2 + vb2;
11889
0
    float vacc3 = va3 + vb3;
11890
0
    float vacc4 = va4 + vb4;
11891
0
    float vacc5 = va5 + vb5;
11892
0
    float vacc6 = va6 + vb6;
11893
0
    float vacc7 = va7 + vb7;
11894
11895
11896
0
    vacc0 = math_max_f32(vacc0, voutput_min);
11897
0
    vacc1 = math_max_f32(vacc1, voutput_min);
11898
0
    vacc2 = math_max_f32(vacc2, voutput_min);
11899
0
    vacc3 = math_max_f32(vacc3, voutput_min);
11900
0
    vacc4 = math_max_f32(vacc4, voutput_min);
11901
0
    vacc5 = math_max_f32(vacc5, voutput_min);
11902
0
    vacc6 = math_max_f32(vacc6, voutput_min);
11903
0
    vacc7 = math_max_f32(vacc7, voutput_min);
11904
11905
0
    vacc0 = math_min_f32(vacc0, voutput_max);
11906
0
    vacc1 = math_min_f32(vacc1, voutput_max);
11907
0
    vacc2 = math_min_f32(vacc2, voutput_max);
11908
0
    vacc3 = math_min_f32(vacc3, voutput_max);
11909
0
    vacc4 = math_min_f32(vacc4, voutput_max);
11910
0
    vacc5 = math_min_f32(vacc5, voutput_max);
11911
0
    vacc6 = math_min_f32(vacc6, voutput_max);
11912
0
    vacc7 = math_min_f32(vacc7, voutput_max);
11913
11914
0
    output[0] = vacc0;
11915
0
    output[1] = vacc1;
11916
0
    output[2] = vacc2;
11917
0
    output[3] = vacc3;
11918
0
    output[4] = vacc4;
11919
0
    output[5] = vacc5;
11920
0
    output[6] = vacc6;
11921
0
    output[7] = vacc7;
11922
0
    output += 8;
11923
0
  }
11924
0
  if XNN_UNLIKELY(batch != 0) {
11925
0
    do {
11926
0
      const float va = *input_a++;
11927
0
      const float vb = *input_b++;
11928
0
      float vacc = va + vb;
11929
0
      vacc = math_max_f32(vacc, voutput_min);
11930
0
      vacc = math_min_f32(vacc, voutput_max);
11931
0
      *output++ = vacc;
11932
0
      batch -= sizeof(float);
11933
0
    } while (batch != 0);
11934
0
  }
11935
0
}
11936
11937
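The element-wise kernels in this file share a calling convention that is easy to misread from the signatures: batch is a byte count, not an element count (hence the batch % sizeof(float) == 0 assertion), and the output clamp comes from params->scalar.min and params->scalar.max. Below is a minimal standalone sketch of that contract, using a plain reference add-and-clamp loop in place of the XNNPACK entry point; the buffer sizes and clamp bounds are example values only.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

// Reference for the vadd_minmax contract: 'batch' is given in BYTES.
static void vadd_minmax_ref(size_t batch, const float* a, const float* b,
                            float* out, float out_min, float out_max)
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  for (size_t i = 0; i < batch / sizeof(float); i++) {
    float acc = a[i] + b[i];
    acc = acc < out_min ? out_min : acc;   // same effect as math_max_f32(acc, min)
    acc = acc > out_max ? out_max : acc;   // same effect as math_min_f32(acc, max)
    out[i] = acc;
  }
}

int main(void) {
  const float a[3] = {1.0f, -4.0f, 9.0f};
  const float b[3] = {2.0f,  1.0f, 1.0f};
  float out[3];
  // 3 floats -> pass 3 * sizeof(float) bytes; clamp to [0, 6] as an example.
  vadd_minmax_ref(3 * sizeof(float), a, b, out, 0.0f, 6.0f);
  printf("%f %f %f\n", out[0], out[1], out[2]);  // 3 0 6
  return 0;
}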
void xnn_f32_vaddc_minmax_ukernel__scalar_u8(
11938
    size_t batch,
11939
    const float* input_a,
11940
    const float* input_b,
11941
    float* output,
11942
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
11943
0
{
11944
0
  assert(batch != 0);
11945
0
  assert(batch % sizeof(float) == 0);
11946
0
  assert(input_a != NULL);
11947
0
  assert(input_b != NULL);
11948
0
  assert(output != NULL);
11949
11950
0
  const float voutput_min = params->scalar.min;
11951
0
  const float voutput_max = params->scalar.max;
11952
0
  const float vb = *input_b;
11953
11954
0
  for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) {
11955
0
    const float va0 = input_a[0];
11956
0
    const float va1 = input_a[1];
11957
0
    const float va2 = input_a[2];
11958
0
    const float va3 = input_a[3];
11959
0
    const float va4 = input_a[4];
11960
0
    const float va5 = input_a[5];
11961
0
    const float va6 = input_a[6];
11962
0
    const float va7 = input_a[7];
11963
0
    input_a += 8;
11964
11965
0
    float vacc0 = va0 + vb;
11966
0
    float vacc1 = va1 + vb;
11967
0
    float vacc2 = va2 + vb;
11968
0
    float vacc3 = va3 + vb;
11969
0
    float vacc4 = va4 + vb;
11970
0
    float vacc5 = va5 + vb;
11971
0
    float vacc6 = va6 + vb;
11972
0
    float vacc7 = va7 + vb;
11973
11974
11975
0
    vacc0 = math_max_f32(vacc0, voutput_min);
11976
0
    vacc1 = math_max_f32(vacc1, voutput_min);
11977
0
    vacc2 = math_max_f32(vacc2, voutput_min);
11978
0
    vacc3 = math_max_f32(vacc3, voutput_min);
11979
0
    vacc4 = math_max_f32(vacc4, voutput_min);
11980
0
    vacc5 = math_max_f32(vacc5, voutput_min);
11981
0
    vacc6 = math_max_f32(vacc6, voutput_min);
11982
0
    vacc7 = math_max_f32(vacc7, voutput_min);
11983
11984
0
    vacc0 = math_min_f32(vacc0, voutput_max);
11985
0
    vacc1 = math_min_f32(vacc1, voutput_max);
11986
0
    vacc2 = math_min_f32(vacc2, voutput_max);
11987
0
    vacc3 = math_min_f32(vacc3, voutput_max);
11988
0
    vacc4 = math_min_f32(vacc4, voutput_max);
11989
0
    vacc5 = math_min_f32(vacc5, voutput_max);
11990
0
    vacc6 = math_min_f32(vacc6, voutput_max);
11991
0
    vacc7 = math_min_f32(vacc7, voutput_max);
11992
11993
0
    output[0] = vacc0;
11994
0
    output[1] = vacc1;
11995
0
    output[2] = vacc2;
11996
0
    output[3] = vacc3;
11997
0
    output[4] = vacc4;
11998
0
    output[5] = vacc5;
11999
0
    output[6] = vacc6;
12000
0
    output[7] = vacc7;
12001
0
    output += 8;
12002
0
  }
12003
0
  if XNN_UNLIKELY(batch != 0) {
12004
0
    do {
12005
0
      const float va = *input_a++;
12006
0
      float vacc = va + vb;
12007
0
      vacc = math_max_f32(vacc, voutput_min);
12008
0
      vacc = math_min_f32(vacc, voutput_max);
12009
0
      *output++ = vacc;
12010
0
      batch -= sizeof(float);
12011
0
    } while (batch != 0);
12012
0
  }
12013
0
}
12014
12015
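In the *c kernels (vaddc above, and vdivc, vmulc, vsubc, vmaxc, vminc, vsqrdiffc, plus the reversed vr*c forms further down), input_b is not a second array: the kernel reads one scalar (const float vb = *input_b;) and broadcasts it across the whole batch. A short sketch of the difference, with hypothetical helper names and example data:

#include <stddef.h>
#include <stdio.h>

// Element-wise form: b must hold n floats.
static void add_elementwise(size_t n, const float* a, const float* b, float* out) {
  for (size_t i = 0; i < n; i++) out[i] = a[i] + b[i];
}

// "c" (broadcast-constant) form: only b[0] is ever read.
static void add_broadcast(size_t n, const float* a, const float* b, float* out) {
  const float vb = *b;  // mirrors 'const float vb = *input_b;' in the kernels
  for (size_t i = 0; i < n; i++) out[i] = a[i] + vb;
}

int main(void) {
  const float a[2] = {1.0f, 2.0f};
  const float b[2] = {10.0f, 20.0f};
  float out[2];
  add_elementwise(2, a, b, out);
  printf("%f %f\n", out[0], out[1]);  // 11 22
  add_broadcast(2, a, b, out);
  printf("%f %f\n", out[0], out[1]);  // 11 12: only b[0] = 10 is used
  return 0;
}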
void xnn_f32_vdiv_minmax_ukernel__scalar_u2(
12016
    size_t batch,
12017
    const float* input_a,
12018
    const float* input_b,
12019
    float* output,
12020
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
12021
0
{
12022
0
  assert(batch != 0);
12023
0
  assert(batch % sizeof(float) == 0);
12024
0
  assert(input_a != NULL);
12025
0
  assert(input_b != NULL);
12026
0
  assert(output != NULL);
12027
12028
0
  const float voutput_min = params->scalar.min;
12029
0
  const float voutput_max = params->scalar.max;
12030
12031
0
  for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) {
12032
0
    const float va0 = input_a[0];
12033
0
    const float va1 = input_a[1];
12034
0
    input_a += 2;
12035
12036
0
    const float vb0 = input_b[0];
12037
0
    const float vb1 = input_b[1];
12038
0
    input_b += 2;
12039
12040
0
    float vacc0 = va0 / vb0;
12041
0
    float vacc1 = va1 / vb1;
12042
12043
12044
0
    vacc0 = math_max_f32(vacc0, voutput_min);
12045
0
    vacc1 = math_max_f32(vacc1, voutput_min);
12046
12047
0
    vacc0 = math_min_f32(vacc0, voutput_max);
12048
0
    vacc1 = math_min_f32(vacc1, voutput_max);
12049
12050
0
    output[0] = vacc0;
12051
0
    output[1] = vacc1;
12052
0
    output += 2;
12053
0
  }
12054
0
  if XNN_UNLIKELY(batch != 0) {
12055
0
    assert(batch == sizeof(float));
12056
0
    const float va = *input_a;
12057
0
    const float vb = *input_b;
12058
0
    float vacc = va / vb;
12059
0
    vacc = math_max_f32(vacc, voutput_min);
12060
0
    vacc = math_min_f32(vacc, voutput_max);
12061
0
    *output = vacc;
12062
0
  }
12063
0
}
12064
12065
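The _u2 kernels above unroll by two elements, so after the main loop at most one element can remain; that is why their tail is a plain if with assert(batch == sizeof(float)), while the _u8 kernels need a do/while tail that can drain up to seven leftovers. A small sketch of the same structure, using a generic divide-and-clamp loop with example values rather than the XNNPACK entry point:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

// Divide with clamp over 'batch' BYTES of floats, unrolled by 2 like the _u2 kernels.
static void vdiv_u2_ref(size_t batch, const float* a, const float* b,
                        float* out, float lo, float hi)
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) {
    float acc0 = a[0] / b[0];
    float acc1 = a[1] / b[1];
    acc0 = acc0 < lo ? lo : (acc0 > hi ? hi : acc0);
    acc1 = acc1 < lo ? lo : (acc1 > hi ? hi : acc1);
    out[0] = acc0;
    out[1] = acc1;
    a += 2; b += 2; out += 2;
  }
  if (batch != 0) {
    // With a 2-wide main loop the remainder is exactly one element.
    assert(batch == sizeof(float));
    float acc = a[0] / b[0];
    acc = acc < lo ? lo : (acc > hi ? hi : acc);
    out[0] = acc;
  }
}

int main(void) {
  const float a[3] = {2.0f, 9.0f, 8.0f};
  const float b[3] = {1.0f, 3.0f, 2.0f};
  float out[3];
  vdiv_u2_ref(3 * sizeof(float), a, b, out, 0.0f, 10.0f);
  printf("%f %f %f\n", out[0], out[1], out[2]);  // 2 3 4
  return 0;
}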
void xnn_f32_vdivc_minmax_ukernel__scalar_u2(
12066
    size_t batch,
12067
    const float* input_a,
12068
    const float* input_b,
12069
    float* output,
12070
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
12071
0
{
12072
0
  assert(batch != 0);
12073
0
  assert(batch % sizeof(float) == 0);
12074
0
  assert(input_a != NULL);
12075
0
  assert(input_b != NULL);
12076
0
  assert(output != NULL);
12077
12078
0
  const float voutput_min = params->scalar.min;
12079
0
  const float voutput_max = params->scalar.max;
12080
0
  const float vb = *input_b;
12081
12082
0
  for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) {
12083
0
    const float va0 = input_a[0];
12084
0
    const float va1 = input_a[1];
12085
0
    input_a += 2;
12086
12087
0
    float vacc0 = va0 / vb;
12088
0
    float vacc1 = va1 / vb;
12089
12090
12091
0
    vacc0 = math_max_f32(vacc0, voutput_min);
12092
0
    vacc1 = math_max_f32(vacc1, voutput_min);
12093
12094
0
    vacc0 = math_min_f32(vacc0, voutput_max);
12095
0
    vacc1 = math_min_f32(vacc1, voutput_max);
12096
12097
0
    output[0] = vacc0;
12098
0
    output[1] = vacc1;
12099
0
    output += 2;
12100
0
  }
12101
0
  if XNN_UNLIKELY(batch != 0) {
12102
0
    assert(batch == sizeof(float));
12103
0
    const float va = *input_a;
12104
0
    float vacc = va / vb;
12105
0
    vacc = math_max_f32(vacc, voutput_min);
12106
0
    vacc = math_min_f32(vacc, voutput_max);
12107
0
    *output = vacc;
12108
0
  }
12109
0
}
12110
12111
void xnn_f32_vmax_ukernel__scalar_u8(
12112
    size_t batch,
12113
    const float* input_a,
12114
    const float* input_b,
12115
    float* output,
12116
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
12117
0
{
12118
0
  assert(batch != 0);
12119
0
  assert(batch % sizeof(float) == 0);
12120
0
  assert(input_a != NULL);
12121
0
  assert(input_b != NULL);
12122
0
  assert(output != NULL);
12123
12124
12125
0
  for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) {
12126
0
    const float va0 = input_a[0];
12127
0
    const float va1 = input_a[1];
12128
0
    const float va2 = input_a[2];
12129
0
    const float va3 = input_a[3];
12130
0
    const float va4 = input_a[4];
12131
0
    const float va5 = input_a[5];
12132
0
    const float va6 = input_a[6];
12133
0
    const float va7 = input_a[7];
12134
0
    input_a += 8;
12135
12136
0
    const float vb0 = input_b[0];
12137
0
    const float vb1 = input_b[1];
12138
0
    const float vb2 = input_b[2];
12139
0
    const float vb3 = input_b[3];
12140
0
    const float vb4 = input_b[4];
12141
0
    const float vb5 = input_b[5];
12142
0
    const float vb6 = input_b[6];
12143
0
    const float vb7 = input_b[7];
12144
0
    input_b += 8;
12145
12146
0
    float vacc0 = math_max_f32(va0, vb0);
12147
0
    float vacc1 = math_max_f32(va1, vb1);
12148
0
    float vacc2 = math_max_f32(va2, vb2);
12149
0
    float vacc3 = math_max_f32(va3, vb3);
12150
0
    float vacc4 = math_max_f32(va4, vb4);
12151
0
    float vacc5 = math_max_f32(va5, vb5);
12152
0
    float vacc6 = math_max_f32(va6, vb6);
12153
0
    float vacc7 = math_max_f32(va7, vb7);
12154
12155
12156
12157
0
    output[0] = vacc0;
12158
0
    output[1] = vacc1;
12159
0
    output[2] = vacc2;
12160
0
    output[3] = vacc3;
12161
0
    output[4] = vacc4;
12162
0
    output[5] = vacc5;
12163
0
    output[6] = vacc6;
12164
0
    output[7] = vacc7;
12165
0
    output += 8;
12166
0
  }
12167
0
  if XNN_UNLIKELY(batch != 0) {
12168
0
    do {
12169
0
      const float va = *input_a++;
12170
0
      const float vb = *input_b++;
12171
0
      float vacc = math_max_f32(va, vb);
12172
0
      *output++ = vacc;
12173
0
      batch -= sizeof(float);
12174
0
    } while (batch != 0);
12175
0
  }
12176
0
}
12177
12178
void xnn_f32_vmaxc_ukernel__scalar_u8(
12179
    size_t batch,
12180
    const float* input_a,
12181
    const float* input_b,
12182
    float* output,
12183
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
12184
0
{
12185
0
  assert(batch != 0);
12186
0
  assert(batch % sizeof(float) == 0);
12187
0
  assert(input_a != NULL);
12188
0
  assert(input_b != NULL);
12189
0
  assert(output != NULL);
12190
12191
0
  const float vb = *input_b;
12192
12193
0
  for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) {
12194
0
    const float va0 = input_a[0];
12195
0
    const float va1 = input_a[1];
12196
0
    const float va2 = input_a[2];
12197
0
    const float va3 = input_a[3];
12198
0
    const float va4 = input_a[4];
12199
0
    const float va5 = input_a[5];
12200
0
    const float va6 = input_a[6];
12201
0
    const float va7 = input_a[7];
12202
0
    input_a += 8;
12203
12204
0
    float vacc0 = math_max_f32(va0, vb);
12205
0
    float vacc1 = math_max_f32(va1, vb);
12206
0
    float vacc2 = math_max_f32(va2, vb);
12207
0
    float vacc3 = math_max_f32(va3, vb);
12208
0
    float vacc4 = math_max_f32(va4, vb);
12209
0
    float vacc5 = math_max_f32(va5, vb);
12210
0
    float vacc6 = math_max_f32(va6, vb);
12211
0
    float vacc7 = math_max_f32(va7, vb);
12212
12213
12214
12215
0
    output[0] = vacc0;
12216
0
    output[1] = vacc1;
12217
0
    output[2] = vacc2;
12218
0
    output[3] = vacc3;
12219
0
    output[4] = vacc4;
12220
0
    output[5] = vacc5;
12221
0
    output[6] = vacc6;
12222
0
    output[7] = vacc7;
12223
0
    output += 8;
12224
0
  }
12225
0
  if XNN_UNLIKELY(batch != 0) {
12226
0
    do {
12227
0
      const float va = *input_a++;
12228
0
      float vacc = math_max_f32(va, vb);
12229
0
      *output++ = vacc;
12230
0
      batch -= sizeof(float);
12231
0
    } while (batch != 0);
12232
0
  }
12233
0
}
12234
12235
void xnn_f32_vmin_ukernel__scalar_u8(
12236
    size_t batch,
12237
    const float* input_a,
12238
    const float* input_b,
12239
    float* output,
12240
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
12241
0
{
12242
0
  assert(batch != 0);
12243
0
  assert(batch % sizeof(float) == 0);
12244
0
  assert(input_a != NULL);
12245
0
  assert(input_b != NULL);
12246
0
  assert(output != NULL);
12247
12248
12249
0
  for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) {
12250
0
    const float va0 = input_a[0];
12251
0
    const float va1 = input_a[1];
12252
0
    const float va2 = input_a[2];
12253
0
    const float va3 = input_a[3];
12254
0
    const float va4 = input_a[4];
12255
0
    const float va5 = input_a[5];
12256
0
    const float va6 = input_a[6];
12257
0
    const float va7 = input_a[7];
12258
0
    input_a += 8;
12259
12260
0
    const float vb0 = input_b[0];
12261
0
    const float vb1 = input_b[1];
12262
0
    const float vb2 = input_b[2];
12263
0
    const float vb3 = input_b[3];
12264
0
    const float vb4 = input_b[4];
12265
0
    const float vb5 = input_b[5];
12266
0
    const float vb6 = input_b[6];
12267
0
    const float vb7 = input_b[7];
12268
0
    input_b += 8;
12269
12270
0
    float vacc0 = math_min_f32(va0, vb0);
12271
0
    float vacc1 = math_min_f32(va1, vb1);
12272
0
    float vacc2 = math_min_f32(va2, vb2);
12273
0
    float vacc3 = math_min_f32(va3, vb3);
12274
0
    float vacc4 = math_min_f32(va4, vb4);
12275
0
    float vacc5 = math_min_f32(va5, vb5);
12276
0
    float vacc6 = math_min_f32(va6, vb6);
12277
0
    float vacc7 = math_min_f32(va7, vb7);
12278
12279
12280
12281
0
    output[0] = vacc0;
12282
0
    output[1] = vacc1;
12283
0
    output[2] = vacc2;
12284
0
    output[3] = vacc3;
12285
0
    output[4] = vacc4;
12286
0
    output[5] = vacc5;
12287
0
    output[6] = vacc6;
12288
0
    output[7] = vacc7;
12289
0
    output += 8;
12290
0
  }
12291
0
  if XNN_UNLIKELY(batch != 0) {
12292
0
    do {
12293
0
      const float va = *input_a++;
12294
0
      const float vb = *input_b++;
12295
0
      float vacc = math_min_f32(va, vb);
12296
0
      *output++ = vacc;
12297
0
      batch -= sizeof(float);
12298
0
    } while (batch != 0);
12299
0
  }
12300
0
}
12301
12302
void xnn_f32_vminc_ukernel__scalar_u8(
12303
    size_t batch,
12304
    const float* input_a,
12305
    const float* input_b,
12306
    float* output,
12307
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
12308
0
{
12309
0
  assert(batch != 0);
12310
0
  assert(batch % sizeof(float) == 0);
12311
0
  assert(input_a != NULL);
12312
0
  assert(input_b != NULL);
12313
0
  assert(output != NULL);
12314
12315
0
  const float vb = *input_b;
12316
12317
0
  for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) {
12318
0
    const float va0 = input_a[0];
12319
0
    const float va1 = input_a[1];
12320
0
    const float va2 = input_a[2];
12321
0
    const float va3 = input_a[3];
12322
0
    const float va4 = input_a[4];
12323
0
    const float va5 = input_a[5];
12324
0
    const float va6 = input_a[6];
12325
0
    const float va7 = input_a[7];
12326
0
    input_a += 8;
12327
12328
0
    float vacc0 = math_min_f32(va0, vb);
12329
0
    float vacc1 = math_min_f32(va1, vb);
12330
0
    float vacc2 = math_min_f32(va2, vb);
12331
0
    float vacc3 = math_min_f32(va3, vb);
12332
0
    float vacc4 = math_min_f32(va4, vb);
12333
0
    float vacc5 = math_min_f32(va5, vb);
12334
0
    float vacc6 = math_min_f32(va6, vb);
12335
0
    float vacc7 = math_min_f32(va7, vb);
12336
12337
12338
12339
0
    output[0] = vacc0;
12340
0
    output[1] = vacc1;
12341
0
    output[2] = vacc2;
12342
0
    output[3] = vacc3;
12343
0
    output[4] = vacc4;
12344
0
    output[5] = vacc5;
12345
0
    output[6] = vacc6;
12346
0
    output[7] = vacc7;
12347
0
    output += 8;
12348
0
  }
12349
0
  if XNN_UNLIKELY(batch != 0) {
12350
0
    do {
12351
0
      const float va = *input_a++;
12352
0
      float vacc = math_min_f32(va, vb);
12353
0
      *output++ = vacc;
12354
0
      batch -= sizeof(float);
12355
0
    } while (batch != 0);
12356
0
  }
12357
0
}
12358
12359
void xnn_f32_vmul_minmax_ukernel__scalar_u8(
12360
    size_t batch,
12361
    const float* input_a,
12362
    const float* input_b,
12363
    float* output,
12364
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
12365
0
{
12366
0
  assert(batch != 0);
12367
0
  assert(batch % sizeof(float) == 0);
12368
0
  assert(input_a != NULL);
12369
0
  assert(input_b != NULL);
12370
0
  assert(output != NULL);
12371
12372
0
  const float voutput_min = params->scalar.min;
12373
0
  const float voutput_max = params->scalar.max;
12374
12375
0
  for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) {
12376
0
    const float va0 = input_a[0];
12377
0
    const float va1 = input_a[1];
12378
0
    const float va2 = input_a[2];
12379
0
    const float va3 = input_a[3];
12380
0
    const float va4 = input_a[4];
12381
0
    const float va5 = input_a[5];
12382
0
    const float va6 = input_a[6];
12383
0
    const float va7 = input_a[7];
12384
0
    input_a += 8;
12385
12386
0
    const float vb0 = input_b[0];
12387
0
    const float vb1 = input_b[1];
12388
0
    const float vb2 = input_b[2];
12389
0
    const float vb3 = input_b[3];
12390
0
    const float vb4 = input_b[4];
12391
0
    const float vb5 = input_b[5];
12392
0
    const float vb6 = input_b[6];
12393
0
    const float vb7 = input_b[7];
12394
0
    input_b += 8;
12395
12396
0
    float vacc0 = va0 * vb0;
12397
0
    float vacc1 = va1 * vb1;
12398
0
    float vacc2 = va2 * vb2;
12399
0
    float vacc3 = va3 * vb3;
12400
0
    float vacc4 = va4 * vb4;
12401
0
    float vacc5 = va5 * vb5;
12402
0
    float vacc6 = va6 * vb6;
12403
0
    float vacc7 = va7 * vb7;
12404
12405
12406
0
    vacc0 = math_max_f32(vacc0, voutput_min);
12407
0
    vacc1 = math_max_f32(vacc1, voutput_min);
12408
0
    vacc2 = math_max_f32(vacc2, voutput_min);
12409
0
    vacc3 = math_max_f32(vacc3, voutput_min);
12410
0
    vacc4 = math_max_f32(vacc4, voutput_min);
12411
0
    vacc5 = math_max_f32(vacc5, voutput_min);
12412
0
    vacc6 = math_max_f32(vacc6, voutput_min);
12413
0
    vacc7 = math_max_f32(vacc7, voutput_min);
12414
12415
0
    vacc0 = math_min_f32(vacc0, voutput_max);
12416
0
    vacc1 = math_min_f32(vacc1, voutput_max);
12417
0
    vacc2 = math_min_f32(vacc2, voutput_max);
12418
0
    vacc3 = math_min_f32(vacc3, voutput_max);
12419
0
    vacc4 = math_min_f32(vacc4, voutput_max);
12420
0
    vacc5 = math_min_f32(vacc5, voutput_max);
12421
0
    vacc6 = math_min_f32(vacc6, voutput_max);
12422
0
    vacc7 = math_min_f32(vacc7, voutput_max);
12423
12424
0
    output[0] = vacc0;
12425
0
    output[1] = vacc1;
12426
0
    output[2] = vacc2;
12427
0
    output[3] = vacc3;
12428
0
    output[4] = vacc4;
12429
0
    output[5] = vacc5;
12430
0
    output[6] = vacc6;
12431
0
    output[7] = vacc7;
12432
0
    output += 8;
12433
0
  }
12434
0
  if XNN_UNLIKELY(batch != 0) {
12435
0
    do {
12436
0
      const float va = *input_a++;
12437
0
      const float vb = *input_b++;
12438
0
      float vacc = va * vb;
12439
0
      vacc = math_max_f32(vacc, voutput_min);
12440
0
      vacc = math_min_f32(vacc, voutput_max);
12441
0
      *output++ = vacc;
12442
0
      batch -= sizeof(float);
12443
0
    } while (batch != 0);
12444
0
  }
12445
0
}
12446
12447
void xnn_f32_vmulc_minmax_ukernel__scalar_u8(
12448
    size_t batch,
12449
    const float* input_a,
12450
    const float* input_b,
12451
    float* output,
12452
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
12453
0
{
12454
0
  assert(batch != 0);
12455
0
  assert(batch % sizeof(float) == 0);
12456
0
  assert(input_a != NULL);
12457
0
  assert(input_b != NULL);
12458
0
  assert(output != NULL);
12459
12460
0
  const float voutput_min = params->scalar.min;
12461
0
  const float voutput_max = params->scalar.max;
12462
0
  const float vb = *input_b;
12463
12464
0
  for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) {
12465
0
    const float va0 = input_a[0];
12466
0
    const float va1 = input_a[1];
12467
0
    const float va2 = input_a[2];
12468
0
    const float va3 = input_a[3];
12469
0
    const float va4 = input_a[4];
12470
0
    const float va5 = input_a[5];
12471
0
    const float va6 = input_a[6];
12472
0
    const float va7 = input_a[7];
12473
0
    input_a += 8;
12474
12475
0
    float vacc0 = va0 * vb;
12476
0
    float vacc1 = va1 * vb;
12477
0
    float vacc2 = va2 * vb;
12478
0
    float vacc3 = va3 * vb;
12479
0
    float vacc4 = va4 * vb;
12480
0
    float vacc5 = va5 * vb;
12481
0
    float vacc6 = va6 * vb;
12482
0
    float vacc7 = va7 * vb;
12483
12484
12485
0
    vacc0 = math_max_f32(vacc0, voutput_min);
12486
0
    vacc1 = math_max_f32(vacc1, voutput_min);
12487
0
    vacc2 = math_max_f32(vacc2, voutput_min);
12488
0
    vacc3 = math_max_f32(vacc3, voutput_min);
12489
0
    vacc4 = math_max_f32(vacc4, voutput_min);
12490
0
    vacc5 = math_max_f32(vacc5, voutput_min);
12491
0
    vacc6 = math_max_f32(vacc6, voutput_min);
12492
0
    vacc7 = math_max_f32(vacc7, voutput_min);
12493
12494
0
    vacc0 = math_min_f32(vacc0, voutput_max);
12495
0
    vacc1 = math_min_f32(vacc1, voutput_max);
12496
0
    vacc2 = math_min_f32(vacc2, voutput_max);
12497
0
    vacc3 = math_min_f32(vacc3, voutput_max);
12498
0
    vacc4 = math_min_f32(vacc4, voutput_max);
12499
0
    vacc5 = math_min_f32(vacc5, voutput_max);
12500
0
    vacc6 = math_min_f32(vacc6, voutput_max);
12501
0
    vacc7 = math_min_f32(vacc7, voutput_max);
12502
12503
0
    output[0] = vacc0;
12504
0
    output[1] = vacc1;
12505
0
    output[2] = vacc2;
12506
0
    output[3] = vacc3;
12507
0
    output[4] = vacc4;
12508
0
    output[5] = vacc5;
12509
0
    output[6] = vacc6;
12510
0
    output[7] = vacc7;
12511
0
    output += 8;
12512
0
  }
12513
0
  if XNN_UNLIKELY(batch != 0) {
12514
0
    do {
12515
0
      const float va = *input_a++;
12516
0
      float vacc = va * vb;
12517
0
      vacc = math_max_f32(vacc, voutput_min);
12518
0
      vacc = math_min_f32(vacc, voutput_max);
12519
0
      *output++ = vacc;
12520
0
      batch -= sizeof(float);
12521
0
    } while (batch != 0);
12522
0
  }
12523
0
}
12524
12525
void xnn_f32_vrdivc_minmax_ukernel__scalar_u2(
12526
    size_t batch,
12527
    const float* input_a,
12528
    const float* input_b,
12529
    float* output,
12530
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
12531
0
{
12532
0
  assert(batch != 0);
12533
0
  assert(batch % sizeof(float) == 0);
12534
0
  assert(input_a != NULL);
12535
0
  assert(input_b != NULL);
12536
0
  assert(output != NULL);
12537
12538
0
  const float voutput_min = params->scalar.min;
12539
0
  const float voutput_max = params->scalar.max;
12540
0
  const float vb = *input_b;
12541
12542
0
  for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) {
12543
0
    const float va0 = input_a[0];
12544
0
    const float va1 = input_a[1];
12545
0
    input_a += 2;
12546
12547
0
    float vacc0 = vb / va0;
12548
0
    float vacc1 = vb / va1;
12549
12550
12551
0
    vacc0 = math_max_f32(vacc0, voutput_min);
12552
0
    vacc1 = math_max_f32(vacc1, voutput_min);
12553
12554
0
    vacc0 = math_min_f32(vacc0, voutput_max);
12555
0
    vacc1 = math_min_f32(vacc1, voutput_max);
12556
12557
0
    output[0] = vacc0;
12558
0
    output[1] = vacc1;
12559
0
    output += 2;
12560
0
  }
12561
0
  if XNN_UNLIKELY(batch != 0) {
12562
0
    assert(batch == sizeof(float));
12563
0
    const float va = *input_a;
12564
0
    float vacc = vb / va;
12565
0
    vacc = math_max_f32(vacc, voutput_min);
12566
0
    vacc = math_min_f32(vacc, voutput_max);
12567
0
    *output = vacc;
12568
0
  }
12569
0
}
12570
12571
void xnn_f32_vrsubc_minmax_ukernel__scalar_u8(
12572
    size_t batch,
12573
    const float* input_a,
12574
    const float* input_b,
12575
    float* output,
12576
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
12577
0
{
12578
0
  assert(batch != 0);
12579
0
  assert(batch % sizeof(float) == 0);
12580
0
  assert(input_a != NULL);
12581
0
  assert(input_b != NULL);
12582
0
  assert(output != NULL);
12583
12584
0
  const float voutput_min = params->scalar.min;
12585
0
  const float voutput_max = params->scalar.max;
12586
0
  const float vb = *input_b;
12587
12588
0
  for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) {
12589
0
    const float va0 = input_a[0];
12590
0
    const float va1 = input_a[1];
12591
0
    const float va2 = input_a[2];
12592
0
    const float va3 = input_a[3];
12593
0
    const float va4 = input_a[4];
12594
0
    const float va5 = input_a[5];
12595
0
    const float va6 = input_a[6];
12596
0
    const float va7 = input_a[7];
12597
0
    input_a += 8;
12598
12599
0
    float vacc0 = vb - va0;
12600
0
    float vacc1 = vb - va1;
12601
0
    float vacc2 = vb - va2;
12602
0
    float vacc3 = vb - va3;
12603
0
    float vacc4 = vb - va4;
12604
0
    float vacc5 = vb - va5;
12605
0
    float vacc6 = vb - va6;
12606
0
    float vacc7 = vb - va7;
12607
12608
12609
0
    vacc0 = math_max_f32(vacc0, voutput_min);
12610
0
    vacc1 = math_max_f32(vacc1, voutput_min);
12611
0
    vacc2 = math_max_f32(vacc2, voutput_min);
12612
0
    vacc3 = math_max_f32(vacc3, voutput_min);
12613
0
    vacc4 = math_max_f32(vacc4, voutput_min);
12614
0
    vacc5 = math_max_f32(vacc5, voutput_min);
12615
0
    vacc6 = math_max_f32(vacc6, voutput_min);
12616
0
    vacc7 = math_max_f32(vacc7, voutput_min);
12617
12618
0
    vacc0 = math_min_f32(vacc0, voutput_max);
12619
0
    vacc1 = math_min_f32(vacc1, voutput_max);
12620
0
    vacc2 = math_min_f32(vacc2, voutput_max);
12621
0
    vacc3 = math_min_f32(vacc3, voutput_max);
12622
0
    vacc4 = math_min_f32(vacc4, voutput_max);
12623
0
    vacc5 = math_min_f32(vacc5, voutput_max);
12624
0
    vacc6 = math_min_f32(vacc6, voutput_max);
12625
0
    vacc7 = math_min_f32(vacc7, voutput_max);
12626
12627
0
    output[0] = vacc0;
12628
0
    output[1] = vacc1;
12629
0
    output[2] = vacc2;
12630
0
    output[3] = vacc3;
12631
0
    output[4] = vacc4;
12632
0
    output[5] = vacc5;
12633
0
    output[6] = vacc6;
12634
0
    output[7] = vacc7;
12635
0
    output += 8;
12636
0
  }
12637
0
  if XNN_UNLIKELY(batch != 0) {
12638
0
    do {
12639
0
      const float va = *input_a++;
12640
0
      float vacc = vb - va;
12641
0
      vacc = math_max_f32(vacc, voutput_min);
12642
0
      vacc = math_min_f32(vacc, voutput_max);
12643
0
      *output++ = vacc;
12644
0
      batch -= sizeof(float);
12645
0
    } while (batch != 0);
12646
0
  }
12647
0
}
12648
12649
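The vr*c kernels above reverse the operand order relative to their v*c counterparts: vrdivc computes vb / va and vrsubc computes vb - va, where vb is the broadcast scalar, whereas vdivc and vsubc compute va / vb and va - vb. A tiny runnable illustration of the four forms with example operands:

#include <stdio.h>

int main(void) {
  const float a = 8.0f;   // stands in for an input_a element
  const float b = 2.0f;   // stands in for the broadcast *input_b
  printf("vsubc:  %f\n", a - b);  // a[i] - b =  6
  printf("vrsubc: %f\n", b - a);  // b - a[i] = -6
  printf("vdivc:  %f\n", a / b);  // a[i] / b =  4
  printf("vrdivc: %f\n", b / a);  // b / a[i] =  0.25
  return 0;
}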
void xnn_f32_vsqrdiff_ukernel__scalar_u8(
12650
    size_t batch,
12651
    const float* input_a,
12652
    const float* input_b,
12653
    float* output,
12654
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
12655
0
{
12656
0
  assert(batch != 0);
12657
0
  assert(batch % sizeof(float) == 0);
12658
0
  assert(input_a != NULL);
12659
0
  assert(input_b != NULL);
12660
0
  assert(output != NULL);
12661
12662
12663
0
  for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) {
12664
0
    const float va0 = input_a[0];
12665
0
    const float va1 = input_a[1];
12666
0
    const float va2 = input_a[2];
12667
0
    const float va3 = input_a[3];
12668
0
    const float va4 = input_a[4];
12669
0
    const float va5 = input_a[5];
12670
0
    const float va6 = input_a[6];
12671
0
    const float va7 = input_a[7];
12672
0
    input_a += 8;
12673
12674
0
    const float vb0 = input_b[0];
12675
0
    const float vb1 = input_b[1];
12676
0
    const float vb2 = input_b[2];
12677
0
    const float vb3 = input_b[3];
12678
0
    const float vb4 = input_b[4];
12679
0
    const float vb5 = input_b[5];
12680
0
    const float vb6 = input_b[6];
12681
0
    const float vb7 = input_b[7];
12682
0
    input_b += 8;
12683
12684
0
    float vacc0 = va0 - vb0;
12685
0
    float vacc1 = va1 - vb1;
12686
0
    float vacc2 = va2 - vb2;
12687
0
    float vacc3 = va3 - vb3;
12688
0
    float vacc4 = va4 - vb4;
12689
0
    float vacc5 = va5 - vb5;
12690
0
    float vacc6 = va6 - vb6;
12691
0
    float vacc7 = va7 - vb7;
12692
12693
0
    vacc0 = vacc0 * vacc0;
12694
0
    vacc1 = vacc1 * vacc1;
12695
0
    vacc2 = vacc2 * vacc2;
12696
0
    vacc3 = vacc3 * vacc3;
12697
0
    vacc4 = vacc4 * vacc4;
12698
0
    vacc5 = vacc5 * vacc5;
12699
0
    vacc6 = vacc6 * vacc6;
12700
0
    vacc7 = vacc7 * vacc7;
12701
12702
12703
0
    output[0] = vacc0;
12704
0
    output[1] = vacc1;
12705
0
    output[2] = vacc2;
12706
0
    output[3] = vacc3;
12707
0
    output[4] = vacc4;
12708
0
    output[5] = vacc5;
12709
0
    output[6] = vacc6;
12710
0
    output[7] = vacc7;
12711
0
    output += 8;
12712
0
  }
12713
0
  if XNN_UNLIKELY(batch != 0) {
12714
0
    do {
12715
0
      const float va = *input_a++;
12716
0
      const float vb = *input_b++;
12717
0
      float vacc = va - vb;
12718
0
      vacc = vacc * vacc;
12719
0
      *output++ = vacc;
12720
0
      batch -= sizeof(float);
12721
0
    } while (batch != 0);
12722
0
  }
12723
0
}
12724
12725
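vsqrdiff above computes the element-wise squared difference, (a[i] - b[i])^2; it takes xnn_f32_default_params and the params argument is not read in the listed body, so there is no clamp. A minimal reference loop with example data (not the XNNPACK entry point):

#include <stddef.h>
#include <stdio.h>

// Reference for vsqrdiff: out[i] = (a[i] - b[i])^2, 'batch' given in BYTES.
static void vsqrdiff_ref(size_t batch, const float* a, const float* b, float* out) {
  for (size_t i = 0; i < batch / sizeof(float); i++) {
    const float d = a[i] - b[i];
    out[i] = d * d;
  }
}

int main(void) {
  const float a[2] = {3.0f, -1.0f};
  const float b[2] = {1.0f,  2.0f};
  float out[2];
  vsqrdiff_ref(2 * sizeof(float), a, b, out);
  printf("%f %f\n", out[0], out[1]);  // 4 9
  return 0;
}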
void xnn_f32_vsqrdiffc_ukernel__scalar_u8(
12726
    size_t batch,
12727
    const float* input_a,
12728
    const float* input_b,
12729
    float* output,
12730
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
12731
0
{
12732
0
  assert(batch != 0);
12733
0
  assert(batch % sizeof(float) == 0);
12734
0
  assert(input_a != NULL);
12735
0
  assert(input_b != NULL);
12736
0
  assert(output != NULL);
12737
12738
0
  const float vb = *input_b;
12739
12740
0
  for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) {
12741
0
    const float va0 = input_a[0];
12742
0
    const float va1 = input_a[1];
12743
0
    const float va2 = input_a[2];
12744
0
    const float va3 = input_a[3];
12745
0
    const float va4 = input_a[4];
12746
0
    const float va5 = input_a[5];
12747
0
    const float va6 = input_a[6];
12748
0
    const float va7 = input_a[7];
12749
0
    input_a += 8;
12750
12751
0
    float vacc0 = va0 - vb;
12752
0
    float vacc1 = va1 - vb;
12753
0
    float vacc2 = va2 - vb;
12754
0
    float vacc3 = va3 - vb;
12755
0
    float vacc4 = va4 - vb;
12756
0
    float vacc5 = va5 - vb;
12757
0
    float vacc6 = va6 - vb;
12758
0
    float vacc7 = va7 - vb;
12759
12760
0
    vacc0 = vacc0 * vacc0;
12761
0
    vacc1 = vacc1 * vacc1;
12762
0
    vacc2 = vacc2 * vacc2;
12763
0
    vacc3 = vacc3 * vacc3;
12764
0
    vacc4 = vacc4 * vacc4;
12765
0
    vacc5 = vacc5 * vacc5;
12766
0
    vacc6 = vacc6 * vacc6;
12767
0
    vacc7 = vacc7 * vacc7;
12768
12769
12770
0
    output[0] = vacc0;
12771
0
    output[1] = vacc1;
12772
0
    output[2] = vacc2;
12773
0
    output[3] = vacc3;
12774
0
    output[4] = vacc4;
12775
0
    output[5] = vacc5;
12776
0
    output[6] = vacc6;
12777
0
    output[7] = vacc7;
12778
0
    output += 8;
12779
0
  }
12780
0
  if XNN_UNLIKELY(batch != 0) {
12781
0
    do {
12782
0
      const float va = *input_a++;
12783
0
      float vacc = va - vb;
12784
0
      vacc = vacc * vacc;
12785
0
      *output++ = vacc;
12786
0
      batch -= sizeof(float);
12787
0
    } while (batch != 0);
12788
0
  }
12789
0
}
12790
12791
void xnn_f32_vsub_minmax_ukernel__scalar_u8(
12792
    size_t batch,
12793
    const float* input_a,
12794
    const float* input_b,
12795
    float* output,
12796
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
12797
0
{
12798
0
  assert(batch != 0);
12799
0
  assert(batch % sizeof(float) == 0);
12800
0
  assert(input_a != NULL);
12801
0
  assert(input_b != NULL);
12802
0
  assert(output != NULL);
12803
12804
0
  const float voutput_min = params->scalar.min;
12805
0
  const float voutput_max = params->scalar.max;
12806
12807
0
  for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) {
12808
0
    const float va0 = input_a[0];
12809
0
    const float va1 = input_a[1];
12810
0
    const float va2 = input_a[2];
12811
0
    const float va3 = input_a[3];
12812
0
    const float va4 = input_a[4];
12813
0
    const float va5 = input_a[5];
12814
0
    const float va6 = input_a[6];
12815
0
    const float va7 = input_a[7];
12816
0
    input_a += 8;
12817
12818
0
    const float vb0 = input_b[0];
12819
0
    const float vb1 = input_b[1];
12820
0
    const float vb2 = input_b[2];
12821
0
    const float vb3 = input_b[3];
12822
0
    const float vb4 = input_b[4];
12823
0
    const float vb5 = input_b[5];
12824
0
    const float vb6 = input_b[6];
12825
0
    const float vb7 = input_b[7];
12826
0
    input_b += 8;
12827
12828
0
    float vacc0 = va0 - vb0;
12829
0
    float vacc1 = va1 - vb1;
12830
0
    float vacc2 = va2 - vb2;
12831
0
    float vacc3 = va3 - vb3;
12832
0
    float vacc4 = va4 - vb4;
12833
0
    float vacc5 = va5 - vb5;
12834
0
    float vacc6 = va6 - vb6;
12835
0
    float vacc7 = va7 - vb7;
12836
12837
12838
0
    vacc0 = math_max_f32(vacc0, voutput_min);
12839
0
    vacc1 = math_max_f32(vacc1, voutput_min);
12840
0
    vacc2 = math_max_f32(vacc2, voutput_min);
12841
0
    vacc3 = math_max_f32(vacc3, voutput_min);
12842
0
    vacc4 = math_max_f32(vacc4, voutput_min);
12843
0
    vacc5 = math_max_f32(vacc5, voutput_min);
12844
0
    vacc6 = math_max_f32(vacc6, voutput_min);
12845
0
    vacc7 = math_max_f32(vacc7, voutput_min);
12846
12847
0
    vacc0 = math_min_f32(vacc0, voutput_max);
12848
0
    vacc1 = math_min_f32(vacc1, voutput_max);
12849
0
    vacc2 = math_min_f32(vacc2, voutput_max);
12850
0
    vacc3 = math_min_f32(vacc3, voutput_max);
12851
0
    vacc4 = math_min_f32(vacc4, voutput_max);
12852
0
    vacc5 = math_min_f32(vacc5, voutput_max);
12853
0
    vacc6 = math_min_f32(vacc6, voutput_max);
12854
0
    vacc7 = math_min_f32(vacc7, voutput_max);
12855
12856
0
    output[0] = vacc0;
12857
0
    output[1] = vacc1;
12858
0
    output[2] = vacc2;
12859
0
    output[3] = vacc3;
12860
0
    output[4] = vacc4;
12861
0
    output[5] = vacc5;
12862
0
    output[6] = vacc6;
12863
0
    output[7] = vacc7;
12864
0
    output += 8;
12865
0
  }
12866
0
  if XNN_UNLIKELY(batch != 0) {
12867
0
    do {
12868
0
      const float va = *input_a++;
12869
0
      const float vb = *input_b++;
12870
0
      float vacc = va - vb;
12871
0
      vacc = math_max_f32(vacc, voutput_min);
12872
0
      vacc = math_min_f32(vacc, voutput_max);
12873
0
      *output++ = vacc;
12874
0
      batch -= sizeof(float);
12875
0
    } while (batch != 0);
12876
0
  }
12877
0
}
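/*
 * Illustrative reference, not part of the generated amalgam: both the 8x
 * unrolled loop of xnn_f32_vsub_minmax_ukernel__scalar_u8 above and its scalar
 * remainder loop compute output[i] = clamp(input_a[i] - input_b[i], min, max).
 * The hypothetical helper below sketches that contract, with libm's
 * fminf/fmaxf standing in for the library's math_min_f32/math_max_f32.
 */
#include <math.h>    // fminf, fmaxf
#include <stddef.h>  // size_t

static void f32_vsub_minmax_reference(
    size_t n,           // element count (the micro-kernel takes a byte count)
    const float* a,
    const float* b,
    float* y,
    float output_min,
    float output_max)
{
  for (size_t i = 0; i < n; i++) {
    const float acc = a[i] - b[i];
    y[i] = fminf(fmaxf(acc, output_min), output_max);  // clamp to [min, max]
  }
}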
12878
12879
void xnn_f32_vsubc_minmax_ukernel__scalar_u8(
12880
    size_t batch,
12881
    const float* input_a,
12882
    const float* input_b,
12883
    float* output,
12884
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
12885
0
{
12886
0
  assert(batch != 0);
12887
0
  assert(batch % sizeof(float) == 0);
12888
0
  assert(input_a != NULL);
12889
0
  assert(input_b != NULL);
12890
0
  assert(output != NULL);
12891
12892
0
  const float voutput_min = params->scalar.min;
12893
0
  const float voutput_max = params->scalar.max;
12894
0
  const float vb = *input_b;
12895
12896
0
  for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) {
12897
0
    const float va0 = input_a[0];
12898
0
    const float va1 = input_a[1];
12899
0
    const float va2 = input_a[2];
12900
0
    const float va3 = input_a[3];
12901
0
    const float va4 = input_a[4];
12902
0
    const float va5 = input_a[5];
12903
0
    const float va6 = input_a[6];
12904
0
    const float va7 = input_a[7];
12905
0
    input_a += 8;
12906
12907
0
    float vacc0 = va0 - vb;
12908
0
    float vacc1 = va1 - vb;
12909
0
    float vacc2 = va2 - vb;
12910
0
    float vacc3 = va3 - vb;
12911
0
    float vacc4 = va4 - vb;
12912
0
    float vacc5 = va5 - vb;
12913
0
    float vacc6 = va6 - vb;
12914
0
    float vacc7 = va7 - vb;
12915
12916
12917
0
    vacc0 = math_max_f32(vacc0, voutput_min);
12918
0
    vacc1 = math_max_f32(vacc1, voutput_min);
12919
0
    vacc2 = math_max_f32(vacc2, voutput_min);
12920
0
    vacc3 = math_max_f32(vacc3, voutput_min);
12921
0
    vacc4 = math_max_f32(vacc4, voutput_min);
12922
0
    vacc5 = math_max_f32(vacc5, voutput_min);
12923
0
    vacc6 = math_max_f32(vacc6, voutput_min);
12924
0
    vacc7 = math_max_f32(vacc7, voutput_min);
12925
12926
0
    vacc0 = math_min_f32(vacc0, voutput_max);
12927
0
    vacc1 = math_min_f32(vacc1, voutput_max);
12928
0
    vacc2 = math_min_f32(vacc2, voutput_max);
12929
0
    vacc3 = math_min_f32(vacc3, voutput_max);
12930
0
    vacc4 = math_min_f32(vacc4, voutput_max);
12931
0
    vacc5 = math_min_f32(vacc5, voutput_max);
12932
0
    vacc6 = math_min_f32(vacc6, voutput_max);
12933
0
    vacc7 = math_min_f32(vacc7, voutput_max);
12934
12935
0
    output[0] = vacc0;
12936
0
    output[1] = vacc1;
12937
0
    output[2] = vacc2;
12938
0
    output[3] = vacc3;
12939
0
    output[4] = vacc4;
12940
0
    output[5] = vacc5;
12941
0
    output[6] = vacc6;
12942
0
    output[7] = vacc7;
12943
0
    output += 8;
12944
0
  }
12945
0
  if XNN_UNLIKELY(batch != 0) {
12946
0
    do {
12947
0
      const float va = *input_a++;
12948
0
      float vacc = va - vb;
12949
0
      vacc = math_max_f32(vacc, voutput_min);
12950
0
      vacc = math_min_f32(vacc, voutput_max);
12951
0
      *output++ = vacc;
12952
0
      batch -= sizeof(float);
12953
0
    } while (batch != 0);
12954
0
  }
12955
0
}
12956
12957
void xnn_f32_vclamp_ukernel__scalar_u4(
12958
    size_t batch,
12959
    const float* input,
12960
    float* output,
12961
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
12962
0
{
12963
0
  assert(batch != 0);
12964
0
  assert(batch % sizeof(float) == 0);
12965
0
  assert(input != NULL);
12966
0
  assert(output != NULL);
12967
12968
0
  const float vy_min = params->scalar.min;
12969
0
  const float vy_max = params->scalar.max;
12970
12971
0
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
12972
0
    float vacc0 = input[0];
12973
0
    float vacc1 = input[1];
12974
0
    float vacc2 = input[2];
12975
0
    float vacc3 = input[3];
12976
0
    input += 4;
12977
12978
0
    vacc0 = math_max_f32(vacc0, vy_min);
12979
0
    vacc1 = math_max_f32(vacc1, vy_min);
12980
0
    vacc2 = math_max_f32(vacc2, vy_min);
12981
0
    vacc3 = math_max_f32(vacc3, vy_min);
12982
12983
0
    vacc0 = math_min_f32(vacc0, vy_max);
12984
0
    vacc1 = math_min_f32(vacc1, vy_max);
12985
0
    vacc2 = math_min_f32(vacc2, vy_max);
12986
0
    vacc3 = math_min_f32(vacc3, vy_max);
12987
12988
0
    output[0] = vacc0;
12989
0
    output[1] = vacc1;
12990
0
    output[2] = vacc2;
12991
0
    output[3] = vacc3;
12992
0
    output += 4;
12993
0
  }
12994
0
  if XNN_UNLIKELY(batch != 0) {
12995
0
    do {
12996
0
      float vacc = *input++;
12997
0
      vacc = math_max_f32(vacc, vy_min);
12998
0
      vacc = math_min_f32(vacc, vy_max);
12999
0
      *output++ = vacc;
13000
0
      batch -= sizeof(float);
13001
0
    } while (batch != 0);
13002
0
  }
13003
0
}
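/*
 * Hypothetical usage sketch, not taken from XNNPACK's documentation: the clamp
 * micro-kernel above takes the batch size in bytes and reads its bounds from
 * params->scalar.min and params->scalar.max.  Assuming the scalar variant of
 * union xnn_f32_minmax_params can be filled in directly as below, and that the
 * micro-kernel and params declarations from XNNPACK's headers are in scope,
 * clamping five floats to [0, 6] could look like this:
 */
static void vclamp_usage_example(void) {
  float x[5] = { -2.0f, -0.5f, 1.0f, 5.0f, 9.0f };
  float y[5];
  union xnn_f32_minmax_params params;
  params.scalar.min = 0.0f;  // lower bound, applied with math_max_f32
  params.scalar.max = 6.0f;  // upper bound, applied with math_min_f32
  xnn_f32_vclamp_ukernel__scalar_u4(5 * sizeof(float), x, y, &params);
  // y now holds { 0.0f, 0.0f, 1.0f, 5.0f, 6.0f }: four elements go through the
  // unrolled loop, the fifth through the remainder loop.
}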
13004
13005
void xnn_f32_vcmul_ukernel__scalar_u4(
13006
    size_t batch,
13007
    const float* input_a,
13008
    const float* input_b,
13009
    float* output,
13010
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
13011
0
{
13012
0
  assert(batch != 0);
13013
0
  assert(batch % sizeof(float) == 0);
13014
0
  assert(input_a != NULL);
13015
0
  assert(input_b != NULL);
13016
0
  assert(output != NULL);
13017
13018
0
  const float* ar = input_a;
13019
0
  const float* ai = (const float*) ((uintptr_t) input_a + batch);
13020
0
  const float* br = input_b;
13021
0
  const float* bi = (const float*) ((uintptr_t) input_b + batch);
13022
0
  float* or = output;
13023
0
  float* oi = (float*) ((uintptr_t) output + batch);
13024
0
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
13025
0
    const float va0r = ar[0];
13026
0
    const float va1r = ar[1];
13027
0
    const float va2r = ar[2];
13028
0
    const float va3r = ar[3];
13029
0
    ar += 4;
13030
13031
0
    const float va0i = ai[0];
13032
0
    const float va1i = ai[1];
13033
0
    const float va2i = ai[2];
13034
0
    const float va3i = ai[3];
13035
0
    ai += 4;
13036
13037
0
    const float vb0r = br[0];
13038
0
    const float vb1r = br[1];
13039
0
    const float vb2r = br[2];
13040
0
    const float vb3r = br[3];
13041
0
    br += 4;
13042
13043
0
    const float vb0i = bi[0];
13044
0
    const float vb1i = bi[1];
13045
0
    const float vb2i = bi[2];
13046
0
    const float vb3i = bi[3];
13047
0
    bi += 4;
13048
13049
0
    const float vacc0r = va0r * vb0r - va0i * vb0i;
13050
0
    const float vacc1r = va1r * vb1r - va1i * vb1i;
13051
0
    const float vacc2r = va2r * vb2r - va2i * vb2i;
13052
0
    const float vacc3r = va3r * vb3r - va3i * vb3i;
13053
13054
0
    const float vacc0i = va0r * vb0i + va0i * vb0r;
13055
0
    const float vacc1i = va1r * vb1i + va1i * vb1r;
13056
0
    const float vacc2i = va2r * vb2i + va2i * vb2r;
13057
0
    const float vacc3i = va3r * vb3i + va3i * vb3r;
13058
13059
0
    or[0] = vacc0r;
13060
0
    or[1] = vacc1r;
13061
0
    or[2] = vacc2r;
13062
0
    or[3] = vacc3r;
13063
0
    or += 4;
13064
13065
0
    oi[0] = vacc0i;
13066
0
    oi[1] = vacc1i;
13067
0
    oi[2] = vacc2i;
13068
0
    oi[3] = vacc3i;
13069
0
    oi += 4;
13070
0
  }
13071
0
  if XNN_UNLIKELY(batch != 0) {
13072
0
    do {
13073
0
      const float var = *ar++;
13074
0
      const float vai = *ai++;
13075
0
      const float vbr = *br++;
13076
0
      const float vbi = *bi++;
13077
0
      const float vaccr = var * vbr - vai * vbi;
13078
0
      const float vacci = var * vbi + vai * vbr;
13079
0
      *or++ = vaccr;
13080
0
      *oi++ = vacci;
13081
0
      batch -= sizeof(float);
13082
0
    } while (batch != 0);
13083
0
  }
13084
0
}
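/*
 * Illustrative reference, hypothetical helper: xnn_f32_vcmul_ukernel__scalar_u4
 * above uses a planar complex layout -- `batch` bytes of real parts followed by
 * `batch` bytes of imaginary parts for each operand and for the output -- and
 * evaluates the usual complex product
 * (ar + i*ai)(br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br) per element.
 */
#include <stddef.h>  // size_t

static void f32_vcmul_reference(
    size_t n,        // number of complex elements (batch / sizeof(float))
    const float* a,  // n real parts followed by n imaginary parts
    const float* b,  // same planar layout
    float* y)        // same planar layout
{
  const float* ai = a + n;
  const float* bi = b + n;
  float* yi = y + n;
  for (size_t i = 0; i < n; i++) {
    y[i]  = a[i] * b[i]  - ai[i] * bi[i];  // real part
    yi[i] = a[i] * bi[i] + ai[i] * b[i];   // imaginary part
  }
}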
13085
13086
extern XNN_INTERNAL const uint32_t xnn_table_exp2minus_k_over_16[16];
13087
13088
void xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_u2(
13089
    size_t batch,
13090
    const float* input,
13091
    float* output,
13092
    const union xnn_f32_elu_params params[restrict XNN_MIN_ELEMENTS(1)])
13093
0
{
13094
0
  assert(batch != 0);
13095
0
  assert(batch % sizeof(float) == 0);
13096
0
  assert(input != NULL);
13097
0
  assert(output != NULL);
13098
13099
0
  const float vprescale = params->scalar_rr2_lut16_p3.prescale;
13100
0
  const float valpha = params->scalar_rr2_lut16_p3.alpha;
13101
0
  const float vbeta = params->scalar_rr2_lut16_p3.beta;
13102
0
  const float vmagic_bias = params->scalar_rr2_lut16_p3.magic_bias;
13103
0
  const float vlog2e = params->scalar_rr2_lut16_p3.log2e;
13104
0
  const uint32_t vindex_mask = UINT32_C(0xF);
13105
0
  const float vsat_cutoff = params->scalar_rr2_lut16_p3.sat_cutoff;
13106
0
  const float vminus_ln2_hi = params->scalar_rr2_lut16_p3.minus_ln2_hi;
13107
0
  const float vminus_ln2_lo = params->scalar_rr2_lut16_p3.minus_ln2_lo;
13108
0
  const float vc3 = params->scalar_rr2_lut16_p3.c3;
13109
0
  const float vc2 = params->scalar_rr2_lut16_p3.c2;
13110
0
  const float vone = params->scalar_rr2_lut16_p3.one;
13111
13112
0
  for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) {
13113
0
    float vx0 = input[0];
13114
0
    float vx1 = input[1];
13115
0
    input += 2;
13116
13117
0
    const float vz0 = vx0 * vprescale;
13118
0
    const float vz1 = vx1 * vprescale;
13119
13120
0
    float vn0 = vz0 * vlog2e + vmagic_bias;
13121
0
    float vn1 = vz1 * vlog2e + vmagic_bias;
13122
13123
0
    const uint32_t ven0 = float_as_uint32(vn0) << 19;
13124
0
    const uint32_t vidx0 = float_as_uint32(vn0) & vindex_mask;
13125
0
    vn0 -= vmagic_bias;
13126
0
    const uint32_t ven1 = float_as_uint32(vn1) << 19;
13127
0
    const uint32_t vidx1 = float_as_uint32(vn1) & vindex_mask;
13128
0
    vn1 -= vmagic_bias;
13129
13130
0
    float vt0 = vn0 * vminus_ln2_hi + vz0;
13131
0
    float vs0 = uint32_as_float(xnn_table_exp2minus_k_over_16[vidx0] + ven0);
13132
0
    float vt1 = vn1 * vminus_ln2_hi + vz1;
13133
0
    float vs1 = uint32_as_float(xnn_table_exp2minus_k_over_16[vidx1] + ven1);
13134
13135
0
    vt0 = vn0 * vminus_ln2_lo + vt0;
13136
0
    if XNN_UNPREDICTABLE(vz0 <= vsat_cutoff) {
13137
0
      vs0 = 0.0f;
13138
0
      vt0 = 0.0f;
13139
0
    }
13140
0
    vt1 = vn1 * vminus_ln2_lo + vt1;
13141
0
    if XNN_UNPREDICTABLE(vz1 <= vsat_cutoff) {
13142
0
      vs1 = 0.0f;
13143
0
      vt1 = 0.0f;
13144
0
    }
13145
13146
0
    float vp0 = vc3 * vt0 + vc2;
13147
0
    float vp1 = vc3 * vt1 + vc2;
13148
13149
0
    vp0 *= vt0;
13150
0
    vp1 *= vt1;
13151
13152
0
    vt0 *= vs0;
13153
0
    vs0 -= vone;
13154
0
    vt1 *= vs1;
13155
0
    vs1 -= vone;
13156
13157
0
    vp0 = vp0 * vt0 + vt0;
13158
0
    vp1 = vp1 * vt1 + vt1;
13159
13160
0
    const float ve0 = (vp0 + vs0) * valpha;
13161
0
    float vy0 = vx0 * vbeta;
13162
0
    const float ve1 = (vp1 + vs1) * valpha;
13163
0
    float vy1 = vx1 * vbeta;
13164
13165
0
    if XNN_UNPREDICTABLE(vx0 < 0.0f) {
13166
0
      vy0 = ve0;
13167
0
    }
13168
0
    if XNN_UNPREDICTABLE(vx1 < 0.0f) {
13169
0
      vy1 = ve1;
13170
0
    }
13171
13172
0
    output[0] = vy0;
13173
0
    output[1] = vy1;
13174
0
    output += 2;
13175
0
  }
13176
0
  if XNN_UNLIKELY(batch != 0) {
13177
0
    float vx = *input;
13178
13179
0
    const float vz = vx * vprescale;
13180
13181
0
    float vn = vz * vlog2e + vmagic_bias;
13182
0
    const uint32_t ven = float_as_uint32(vn) << 19;
13183
0
    const uint32_t vidx = float_as_uint32(vn) & vindex_mask;
13184
0
    vn -= vmagic_bias;
13185
13186
0
    float vt = vn * vminus_ln2_hi + vz;
13187
0
    float vs = uint32_as_float(xnn_table_exp2minus_k_over_16[vidx] + ven);
13188
13189
0
    vt = vn * vminus_ln2_lo + vt;
13190
0
    if XNN_UNPREDICTABLE(vz <= vsat_cutoff) {
13191
0
      vs = 0.0f;
13192
0
      vt = 0.0f;
13193
0
    }
13194
13195
0
    float vp = vc3 * vt + vc2;
13196
0
    vp *= vt;
13197
13198
0
    vt *= vs;
13199
0
    vs -= vone;
13200
0
    vp = vp * vt + vt;
13201
0
    const float ve = (vp + vs) * valpha;
13202
13203
0
    float vy = vx * vbeta;
13204
0
    if XNN_UNPREDICTABLE(vx < 0.0f) {
13205
0
      vy = ve;
13206
0
    }
13207
13208
0
    *output = vy;
13209
0
  }
13210
0
}
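/*
 * Illustrative reference, hypothetical helper: the kernel above evaluates a
 * parameterized ELU, y = beta*x for x >= 0 and y = alpha*(exp(prescale*x) - 1)
 * for x < 0.  It approximates the exponential with the 16-entry
 * xnn_table_exp2minus_k_over_16 table and a degree-3 polynomial after a
 * two-step (minus_ln2_hi/minus_ln2_lo) range reduction; the sketch below swaps
 * that approximation for libm's expm1f, so it matches only to within the
 * approximation error.
 */
#include <math.h>  // expm1f

static float f32_elu_reference(float x, float prescale, float alpha, float beta)
{
  return x < 0.0f ? alpha * expm1f(x * prescale)  // saturating negative branch
                  : x * beta;                     // linear positive branch
}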
13211
13212
extern XNN_INTERNAL const uint32_t xnn_table_exp2minus_k_over_16[16];
13213
13214
void xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_u4(
13215
    size_t batch,
13216
    const float* input,
13217
    float* output,
13218
    const union xnn_f32_elu_params params[restrict XNN_MIN_ELEMENTS(1)])
13219
0
{
13220
0
  assert(batch != 0);
13221
0
  assert(batch % sizeof(float) == 0);
13222
0
  assert(input != NULL);
13223
0
  assert(output != NULL);
13224
13225
0
  const float vprescale = params->scalar_rr2_lut16_p3.prescale;
13226
0
  const float valpha = params->scalar_rr2_lut16_p3.alpha;
13227
0
  const float vbeta = params->scalar_rr2_lut16_p3.beta;
13228
0
  const float vmagic_bias = params->scalar_rr2_lut16_p3.magic_bias;
13229
0
  const float vlog2e = params->scalar_rr2_lut16_p3.log2e;
13230
0
  const uint32_t vindex_mask = UINT32_C(0xF);
13231
0
  const float vsat_cutoff = params->scalar_rr2_lut16_p3.sat_cutoff;
13232
0
  const float vminus_ln2_hi = params->scalar_rr2_lut16_p3.minus_ln2_hi;
13233
0
  const float vminus_ln2_lo = params->scalar_rr2_lut16_p3.minus_ln2_lo;
13234
0
  const float vc3 = params->scalar_rr2_lut16_p3.c3;
13235
0
  const float vc2 = params->scalar_rr2_lut16_p3.c2;
13236
0
  const float vone = params->scalar_rr2_lut16_p3.one;
13237
13238
0
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
13239
0
    float vx0 = input[0];
13240
0
    float vx1 = input[1];
13241
0
    float vx2 = input[2];
13242
0
    float vx3 = input[3];
13243
0
    input += 4;
13244
13245
0
    const float vz0 = vx0 * vprescale;
13246
0
    const float vz1 = vx1 * vprescale;
13247
0
    const float vz2 = vx2 * vprescale;
13248
0
    const float vz3 = vx3 * vprescale;
13249
13250
0
    float vn0 = vz0 * vlog2e + vmagic_bias;
13251
0
    float vn1 = vz1 * vlog2e + vmagic_bias;
13252
0
    float vn2 = vz2 * vlog2e + vmagic_bias;
13253
0
    float vn3 = vz3 * vlog2e + vmagic_bias;
13254
13255
0
    const uint32_t ven0 = float_as_uint32(vn0) << 19;
13256
0
    const uint32_t vidx0 = float_as_uint32(vn0) & vindex_mask;
13257
0
    vn0 -= vmagic_bias;
13258
0
    const uint32_t ven1 = float_as_uint32(vn1) << 19;
13259
0
    const uint32_t vidx1 = float_as_uint32(vn1) & vindex_mask;
13260
0
    vn1 -= vmagic_bias;
13261
0
    const uint32_t ven2 = float_as_uint32(vn2) << 19;
13262
0
    const uint32_t vidx2 = float_as_uint32(vn2) & vindex_mask;
13263
0
    vn2 -= vmagic_bias;
13264
0
    const uint32_t ven3 = float_as_uint32(vn3) << 19;
13265
0
    const uint32_t vidx3 = float_as_uint32(vn3) & vindex_mask;
13266
0
    vn3 -= vmagic_bias;
13267
13268
0
    float vt0 = vn0 * vminus_ln2_hi + vz0;
13269
0
    float vs0 = uint32_as_float(xnn_table_exp2minus_k_over_16[vidx0] + ven0);
13270
0
    float vt1 = vn1 * vminus_ln2_hi + vz1;
13271
0
    float vs1 = uint32_as_float(xnn_table_exp2minus_k_over_16[vidx1] + ven1);
13272
0
    float vt2 = vn2 * vminus_ln2_hi + vz2;
13273
0
    float vs2 = uint32_as_float(xnn_table_exp2minus_k_over_16[vidx2] + ven2);
13274
0
    float vt3 = vn3 * vminus_ln2_hi + vz3;
13275
0
    float vs3 = uint32_as_float(xnn_table_exp2minus_k_over_16[vidx3] + ven3);
13276
13277
0
    vt0 = vn0 * vminus_ln2_lo + vt0;
13278
0
    if XNN_UNPREDICTABLE(vz0 <= vsat_cutoff) {
13279
0
      vs0 = 0.0f;
13280
0
      vt0 = 0.0f;
13281
0
    }
13282
0
    vt1 = vn1 * vminus_ln2_lo + vt1;
13283
0
    if XNN_UNPREDICTABLE(vz1 <= vsat_cutoff) {
13284
0
      vs1 = 0.0f;
13285
0
      vt1 = 0.0f;
13286
0
    }
13287
0
    vt2 = vn2 * vminus_ln2_lo + vt2;
13288
0
    if XNN_UNPREDICTABLE(vz2 <= vsat_cutoff) {
13289
0
      vs2 = 0.0f;
13290
0
      vt2 = 0.0f;
13291
0
    }
13292
0
    vt3 = vn3 * vminus_ln2_lo + vt3;
13293
0
    if XNN_UNPREDICTABLE(vz3 <= vsat_cutoff) {
13294
0
      vs3 = 0.0f;
13295
0
      vt3 = 0.0f;
13296
0
    }
13297
13298
0
    float vp0 = vc3 * vt0 + vc2;
13299
0
    float vp1 = vc3 * vt1 + vc2;
13300
0
    float vp2 = vc3 * vt2 + vc2;
13301
0
    float vp3 = vc3 * vt3 + vc2;
13302
13303
0
    vp0 *= vt0;
13304
0
    vp1 *= vt1;
13305
0
    vp2 *= vt2;
13306
0
    vp3 *= vt3;
13307
13308
0
    vt0 *= vs0;
13309
0
    vs0 -= vone;
13310
0
    vt1 *= vs1;
13311
0
    vs1 -= vone;
13312
0
    vt2 *= vs2;
13313
0
    vs2 -= vone;
13314
0
    vt3 *= vs3;
13315
0
    vs3 -= vone;
13316
13317
0
    vp0 = vp0 * vt0 + vt0;
13318
0
    vp1 = vp1 * vt1 + vt1;
13319
0
    vp2 = vp2 * vt2 + vt2;
13320
0
    vp3 = vp3 * vt3 + vt3;
13321
13322
0
    const float ve0 = (vp0 + vs0) * valpha;
13323
0
    float vy0 = vx0 * vbeta;
13324
0
    const float ve1 = (vp1 + vs1) * valpha;
13325
0
    float vy1 = vx1 * vbeta;
13326
0
    const float ve2 = (vp2 + vs2) * valpha;
13327
0
    float vy2 = vx2 * vbeta;
13328
0
    const float ve3 = (vp3 + vs3) * valpha;
13329
0
    float vy3 = vx3 * vbeta;
13330
13331
0
    if XNN_UNPREDICTABLE(vx0 < 0.0f) {
13332
0
      vy0 = ve0;
13333
0
    }
13334
0
    if XNN_UNPREDICTABLE(vx1 < 0.0f) {
13335
0
      vy1 = ve1;
13336
0
    }
13337
0
    if XNN_UNPREDICTABLE(vx2 < 0.0f) {
13338
0
      vy2 = ve2;
13339
0
    }
13340
0
    if XNN_UNPREDICTABLE(vx3 < 0.0f) {
13341
0
      vy3 = ve3;
13342
0
    }
13343
13344
0
    output[0] = vy0;
13345
0
    output[1] = vy1;
13346
0
    output[2] = vy2;
13347
0
    output[3] = vy3;
13348
0
    output += 4;
13349
0
  }
13350
0
  if XNN_UNLIKELY(batch != 0) {
13351
0
    do {
13352
0
      float vx = *input++;
13353
13354
0
      const float vz = vx * vprescale;
13355
13356
0
      float vn = vz * vlog2e + vmagic_bias;
13357
0
      const uint32_t ven = float_as_uint32(vn) << 19;
13358
0
      const uint32_t vidx = float_as_uint32(vn) & vindex_mask;
13359
0
      vn -= vmagic_bias;
13360
13361
0
      float vt = vn * vminus_ln2_hi + vz;
13362
0
      float vs = uint32_as_float(xnn_table_exp2minus_k_over_16[vidx] + ven);
13363
13364
0
      vt = vn * vminus_ln2_lo + vt;
13365
0
      if XNN_UNPREDICTABLE(vz <= vsat_cutoff) {
13366
0
        vs = 0.0f;
13367
0
        vt = 0.0f;
13368
0
      }
13369
13370
0
      float vp = vc3 * vt + vc2;
13371
0
      vp *= vt;
13372
13373
0
      vt *= vs;
13374
0
      vs -= vone;
13375
0
      vp = vp * vt + vt;
13376
0
      const float ve = (vp + vs) * valpha;
13377
13378
0
      float vy = vx * vbeta;
13379
0
      if XNN_UNPREDICTABLE(vx < 0.0f) {
13380
0
        vy = ve;
13381
0
      }
13382
13383
0
      *output++ = vy;
13384
13385
0
      batch -= sizeof(float);
13386
0
    } while (batch != 0);
13387
0
  }
13388
0
}
13389
13390
void xnn_f32_vhswish_ukernel__scalar_u4(
13391
    size_t batch,
13392
    const float* input,
13393
    float* output,
13394
    const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)])
13395
0
{
13396
0
  assert(batch != 0);
13397
0
  assert(batch % sizeof(float) == 0);
13398
0
  assert(input != NULL);
13399
0
  assert(output != NULL);
13400
13401
0
  const float vsixth = params->scalar.sixth;
13402
0
  const float vthree = params->scalar.three;
13403
0
  const float vsix = params->scalar.six;
13404
0
  const float vzero = 0.0f;
13405
0
  assert(vthree == 3.0f);
13406
0
  assert(vsix == 6.0f);
13407
13408
0
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
13409
0
    float vx0 = input[0];
13410
0
    float vx1 = input[1];
13411
0
    float vx2 = input[2];
13412
0
    float vx3 = input[3];
13413
0
    input += 4;
13414
13415
0
    float vacc0 = vx0 + vthree;
13416
0
    vx0 *= vsixth;
13417
0
    float vacc1 = vx1 + vthree;
13418
0
    vx1 *= vsixth;
13419
0
    float vacc2 = vx2 + vthree;
13420
0
    vx2 *= vsixth;
13421
0
    float vacc3 = vx3 + vthree;
13422
0
    vx3 *= vsixth;
13423
13424
0
    vacc0 = math_max_f32(vacc0, vzero);
13425
0
    vacc1 = math_max_f32(vacc1, vzero);
13426
0
    vacc2 = math_max_f32(vacc2, vzero);
13427
0
    vacc3 = math_max_f32(vacc3, vzero);
13428
13429
0
    vacc0 = math_min_f32(vacc0, vsix);
13430
0
    vacc1 = math_min_f32(vacc1, vsix);
13431
0
    vacc2 = math_min_f32(vacc2, vsix);
13432
0
    vacc3 = math_min_f32(vacc3, vsix);
13433
13434
0
    vacc0 *= vx0;
13435
0
    vacc1 *= vx1;
13436
0
    vacc2 *= vx2;
13437
0
    vacc3 *= vx3;
13438
13439
0
    output[0] = vacc0;
13440
0
    output[1] = vacc1;
13441
0
    output[2] = vacc2;
13442
0
    output[3] = vacc3;
13443
0
    output += 4;
13444
0
  }
13445
0
  if XNN_UNLIKELY(batch != 0) {
13446
0
    do {
13447
0
      float vx = *input++;
13448
0
      float vacc = vx + vthree;
13449
0
      vx *= vsixth;
13450
0
      vacc = math_max_f32(vacc, vzero);
13451
0
      vacc = math_min_f32(vacc, vsix);
13452
0
      vacc *= vx;
13453
0
      *output++ = vacc;
13454
0
      batch -= sizeof(float);
13455
0
    } while (batch != 0);
13456
0
  }
13457
0
}
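/*
 * Illustrative reference, hypothetical helper: with params->scalar.three == 3
 * and params->scalar.six == 6 (both asserted above) and params->scalar.sixth
 * assumed to be 1/6, the kernel computes the standard HardSwish
 * y = x * min(max(x + 3, 0), 6) / 6.
 */
#include <math.h>  // fminf, fmaxf

static float f32_hswish_reference(float x)
{
  const float gate = fminf(fmaxf(x + 3.0f, 0.0f), 6.0f);  // clamp(x + 3, 0, 6)
  return x * gate * (1.0f / 6.0f);
}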
13458
13459
void xnn_f32_vlrelu_ukernel__scalar_u4(
13460
    size_t batch,
13461
    const float* input,
13462
    float* output,
13463
    const union xnn_f32_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)])
13464
0
{
13465
0
  assert(batch != 0);
13466
0
  assert(batch % sizeof(float) == 0);
13467
0
  assert(input != NULL);
13468
0
  assert(output != NULL);
13469
13470
0
  const float vslope = params->scalar.slope;
13471
13472
0
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
13473
0
    const float vx0 = input[0];
13474
0
    const float vx1 = input[1];
13475
0
    const float vx2 = input[2];
13476
0
    const float vx3 = input[3];
13477
0
    input += 4;
13478
13479
0
    float vacc0 = vx0 * vslope;
13480
0
    float vacc1 = vx1 * vslope;
13481
0
    float vacc2 = vx2 * vslope;
13482
0
    float vacc3 = vx3 * vslope;
13483
13484
0
    vacc0 = XNN_UNPREDICTABLE(vx0 < 0.0f) ? vacc0 : vx0;
13485
0
    vacc1 = XNN_UNPREDICTABLE(vx1 < 0.0f) ? vacc1 : vx1;
13486
0
    vacc2 = XNN_UNPREDICTABLE(vx2 < 0.0f) ? vacc2 : vx2;
13487
0
    vacc3 = XNN_UNPREDICTABLE(vx3 < 0.0f) ? vacc3 : vx3;
13488
13489
0
    output[0] = vacc0;
13490
0
    output[1] = vacc1;
13491
0
    output[2] = vacc2;
13492
0
    output[3] = vacc3;
13493
0
    output += 4;
13494
0
  }
13495
0
  if XNN_UNLIKELY(batch != 0) {
13496
0
    do {
13497
0
      const float vx = *input++;
13498
0
      float vacc = vx * vslope;
13499
0
      vacc = XNN_UNPREDICTABLE(vx < 0.0f) ? vacc : vx;
13500
0
      *output++ = vacc;
13501
0
      batch -= sizeof(float);
13502
0
    } while (batch != 0);
13503
0
  }
13504
0
}
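/*
 * Illustrative reference, hypothetical helper: the kernel above computes both
 * x and x*slope and selects between them with an XNN_UNPREDICTABLE ternary;
 * the effect is the ordinary LeakyReLU.
 */
static float f32_lrelu_reference(float x, float slope)
{
  return x < 0.0f ? x * slope : x;  // negative inputs are scaled by slope
}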
13505
13506
void xnn_f32_vmulcaddc_minmax_ukernel_c1__scalar_2x(
13507
    size_t rows,
13508
    size_t channels,
13509
    const float* restrict input,
13510
    size_t input_stride,
13511
    const float* restrict weights,
13512
    float* restrict output,
13513
    size_t output_stride,
13514
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
13515
0
{
13516
0
  assert(rows != 0);
13517
0
  assert(channels != 0);
13518
0
  assert(channels % sizeof(float) == 0);
13519
13520
0
  const size_t input_increment = input_stride * 2 - channels;
13521
0
  const size_t output_increment = output_stride * 2 - channels;
13522
13523
0
  const float* i0 = input;
13524
0
  float* o0 = output;
13525
0
  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
13526
0
  float* o1 = (float*) ((uintptr_t) o0 + output_stride);
13527
13528
0
  const float vmin = params->scalar.min;
13529
0
  const float vmax = params->scalar.max;
13530
0
  do {
13531
0
    if XNN_UNPREDICTABLE(rows < 2) {
13532
0
      i1 = i0;
13533
0
      o1 = o0;
13534
0
    }
13535
13536
0
    const float* w = weights;
13537
0
    size_t c = channels;
13538
0
    do {
13539
0
      const float vscale = w[0];
13540
13541
0
      float vacc0 = *i0++;
13542
0
      float vacc1 = *i1++;
13543
13544
0
      const float vbias = w[1];
13545
13546
0
      vacc0 = vacc0 * vscale + vbias;
13547
0
      vacc1 = vacc1 * vscale + vbias;
13548
13549
0
      vacc0 = math_max_f32(vacc0, vmin);
13550
0
      vacc1 = math_max_f32(vacc1, vmin);
13551
13552
0
      vacc0 = math_min_f32(vacc0, vmax);
13553
0
      vacc1 = math_min_f32(vacc1, vmax);
13554
13555
0
      *o0++ = vacc0;
13556
0
      *o1++ = vacc1;
13557
13558
0
      w += 2;
13559
0
      c -= sizeof(float);
13560
0
    } while (c != 0);
13561
0
    i0 = (const float*) ((uintptr_t) i0 + input_increment);
13562
0
    o0 = (float*) ((uintptr_t) o0 + output_increment);
13563
0
    i1 = (const float*) ((uintptr_t) i1 + input_increment);
13564
0
    o1 = (float*) ((uintptr_t) o1 + output_increment);
13565
0
    rows = doz(rows, 2);
13566
0
  } while (rows != 0);
13567
0
}
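/*
 * Illustrative reference, hypothetical helper: the c1/2x kernel above processes
 * two rows per pass, reading interleaved per-channel weights {scale, bias} and
 * clamping the result to [min, max].  The sketch below shows the same
 * per-element contract for densely packed rows; the real kernel additionally
 * supports arbitrary input/output row strides, which this omits.
 */
#include <math.h>    // fminf, fmaxf
#include <stddef.h>  // size_t

static void f32_vmulcaddc_minmax_reference(
    size_t rows,
    size_t channels,       // element count (the micro-kernel takes bytes)
    const float* input,    // rows x channels, densely packed
    const float* weights,  // channels pairs of {scale, bias}
    float* output,
    float vmin,
    float vmax)
{
  for (size_t r = 0; r < rows; r++) {
    for (size_t c = 0; c < channels; c++) {
      const float acc = input[r * channels + c] * weights[2 * c] + weights[2 * c + 1];
      output[r * channels + c] = fminf(fmaxf(acc, vmin), vmax);
    }
  }
}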
13568
13569
void xnn_f32_vrelu_ukernel__scalar_u8(
13570
    size_t batch,
13571
    const float* input,
13572
    float* output,
13573
    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
13574
0
{
13575
0
  assert(batch != 0);
13576
0
  assert(batch % sizeof(float) == 0);
13577
0
  assert(input != NULL);
13578
0
  assert(output != NULL);
13579
13580
0
  const uint32_t* i = (const uint32_t*) input;
13581
0
  uint32_t* o = (uint32_t*) output;
13582
13583
0
  for (; batch >= 8 * sizeof(uint32_t); batch -= 8 * sizeof(uint32_t)) {
13584
0
    uint32_t vacc0 = i[0];
13585
0
    uint32_t vacc1 = i[1];
13586
0
    uint32_t vacc2 = i[2];
13587
0
    uint32_t vacc3 = i[3];
13588
0
    uint32_t vacc4 = i[4];
13589
0
    uint32_t vacc5 = i[5];
13590
0
    uint32_t vacc6 = i[6];
13591
0
    uint32_t vacc7 = i[7];
13592
0
    i += 8;
13593
13594
0
    vacc0 = ((vacc0 >> 31) - 1) & vacc0;
13595
0
    vacc1 = ((vacc1 >> 31) - 1) & vacc1;
13596
0
    vacc2 = ((vacc2 >> 31) - 1) & vacc2;
13597
0
    vacc3 = ((vacc3 >> 31) - 1) & vacc3;
13598
0
    vacc4 = ((vacc4 >> 31) - 1) & vacc4;
13599
0
    vacc5 = ((vacc5 >> 31) - 1) & vacc5;
13600
0
    vacc6 = ((vacc6 >> 31) - 1) & vacc6;
13601
0
    vacc7 = ((vacc7 >> 31) - 1) & vacc7;
13602
13603
0
    o[0] = vacc0;
13604
0
    o[1] = vacc1;
13605
0
    o[2] = vacc2;
13606
0
    o[3] = vacc3;
13607
0
    o[4] = vacc4;
13608
0
    o[5] = vacc5;
13609
0
    o[6] = vacc6;
13610
0
    o[7] = vacc7;
13611
0
    o += 8;
13612
0
  }
13613
0
  if XNN_UNLIKELY(batch != 0) {
13614
0
    do {
13615
0
      uint32_t vacc = *i++;
13616
0
      vacc = ((vacc >> 31) - 1) & vacc;
13617
0
      *o++ = vacc;
13618
0
      batch -= sizeof(uint32_t);
13619
0
    } while (batch != 0);
13620
0
  }
13621
0
}
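/*
 * Illustrative sketch, hypothetical helper: the ReLU kernel above works on raw
 * IEEE-754 bits.  For a float reinterpreted as uint32_t, (v >> 31) is 1 when
 * the sign bit is set and 0 otherwise, so ((v >> 31) - 1) is an all-zeros mask
 * for negative inputs and an all-ones mask for non-negative inputs; ANDing it
 * with v yields max(x, 0) without a branch.  Negative zero (and any other
 * value with the sign bit set) collapses to +0.0f.
 */
#include <stdint.h>  // uint32_t
#include <string.h>  // memcpy

static float f32_relu_bits(float x)
{
  uint32_t v;
  memcpy(&v, &x, sizeof(v));  // reinterpret the float as its bit pattern
  v = ((v >> 31) - 1) & v;    // all-ones mask keeps x, all-zeros mask clears it
  memcpy(&x, &v, sizeof(x));
  return x;
}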
13622
13623
void xnn_f32_vrndd_ukernel__scalar_libm_u1(
13624
    size_t batch,
13625
    const float* input,
13626
    float* output,
13627
    const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
13628
0
{
13629
0
  assert(batch != 0);
13630
0
  assert(batch % sizeof(float) == 0);
13631
0
  assert(input != NULL);
13632
0
  assert(output != NULL);
13633
13634
0
  do {
13635
0
    const float vx = *input++;
13636
0
    const float vy = floorf(vx);
13637
0
    *output++ = vy;
13638
0
    batch -= sizeof(float);
13639
0
  } while (batch != 0);
13640
0
}
13641
13642
void xnn_f32_vrndd_ukernel__scalar_libm_u4(
13643
    size_t batch,
13644
    const float* input,
13645
    float* output,
13646
    const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
13647
0
{
13648
0
  assert(batch != 0);
13649
0
  assert(batch % sizeof(float) == 0);
13650
0
  assert(input != NULL);
13651
0
  assert(output != NULL);
13652
13653
0
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
13654
0
    const float vx0 = input[0];
13655
0
    const float vx1 = input[1];
13656
0
    const float vx2 = input[2];
13657
0
    const float vx3 = input[3];
13658
0
    input += 4;
13659
13660
0
    const float vy0 = floorf(vx0);
13661
0
    const float vy1 = floorf(vx1);
13662
0
    const float vy2 = floorf(vx2);
13663
0
    const float vy3 = floorf(vx3);
13664
13665
0
    output[0] = vy0;
13666
0
    output[1] = vy1;
13667
0
    output[2] = vy2;
13668
0
    output[3] = vy3;
13669
0
    output += 4;
13670
0
  }
13671
0
  if XNN_UNLIKELY(batch != 0) {
13672
0
    do {
13673
0
      const float vx = *input++;
13674
0
      const float vy = floorf(vx);
13675
0
      *output++ = vy;
13676
0
      batch -= sizeof(float);
13677
0
    } while (batch != 0);
13678
0
  }
13679
0
}
13680
13681
void xnn_f32_vrndne_ukernel__scalar_libm_u1(
13682
    size_t batch,
13683
    const float* input,
13684
    float* output,
13685
    const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
13686
0
{
13687
0
  assert(batch != 0);
13688
0
  assert(batch % sizeof(float) == 0);
13689
0
  assert(input != NULL);
13690
0
  assert(output != NULL);
13691
13692
0
  do {
13693
0
    const float vx = *input++;
13694
0
    const float vy = nearbyintf(vx);
13695
0
    *output++ = vy;
13696
0
    batch -= sizeof(float);
13697
0
  } while (batch != 0);
13698
0
}
13699
13700
void xnn_f32_vrndne_ukernel__scalar_libm_u4(
13701
    size_t batch,
13702
    const float* input,
13703
    float* output,
13704
    const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
13705
0
{
13706
0
  assert(batch != 0);
13707
0
  assert(batch % sizeof(float) == 0);
13708
0
  assert(input != NULL);
13709
0
  assert(output != NULL);
13710
13711
0
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
13712
0
    const float vx0 = input[0];
13713
0
    const float vx1 = input[1];
13714
0
    const float vx2 = input[2];
13715
0
    const float vx3 = input[3];
13716
0
    input += 4;
13717
13718
0
    const float vy0 = nearbyintf(vx0);
13719
0
    const float vy1 = nearbyintf(vx1);
13720
0
    const float vy2 = nearbyintf(vx2);
13721
0
    const float vy3 = nearbyintf(vx3);
13722
13723
0
    output[0] = vy0;
13724
0
    output[1] = vy1;
13725
0
    output[2] = vy2;
13726
0
    output[3] = vy3;
13727
0
    output += 4;
13728
0
  }
13729
0
  if XNN_UNLIKELY(batch != 0) {
13730
0
    do {
13731
0
      const float vx = *input++;
13732
0
      const float vy = nearbyintf(vx);
13733
0
      *output++ = vy;
13734
0
      batch -= sizeof(float);
13735
0
    } while (batch != 0);
13736
0
  }
13737
0
}
13738
13739
void xnn_f32_vrndu_ukernel__scalar_libm_u1(
13740
    size_t batch,
13741
    const float* input,
13742
    float* output,
13743
    const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
13744
0
{
13745
0
  assert(batch != 0);
13746
0
  assert(batch % sizeof(float) == 0);
13747
0
  assert(input != NULL);
13748
0
  assert(output != NULL);
13749
13750
0
  do {
13751
0
    const float vx = *input++;
13752
0
    const float vy = ceilf(vx);
13753
0
    *output++ = vy;
13754
0
    batch -= sizeof(float);
13755
0
  } while (batch != 0);
13756
0
}
13757
13758
void xnn_f32_vrndu_ukernel__scalar_libm_u4(
13759
    size_t batch,
13760
    const float* input,
13761
    float* output,
13762
    const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
13763
0
{
13764
0
  assert(batch != 0);
13765
0
  assert(batch % sizeof(float) == 0);
13766
0
  assert(input != NULL);
13767
0
  assert(output != NULL);
13768
13769
0
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
13770
0
    const float vx0 = input[0];
13771
0
    const float vx1 = input[1];
13772
0
    const float vx2 = input[2];
13773
0
    const float vx3 = input[3];
13774
0
    input += 4;
13775
13776
0
    const float vy0 = ceilf(vx0);
13777
0
    const float vy1 = ceilf(vx1);
13778
0
    const float vy2 = ceilf(vx2);
13779
0
    const float vy3 = ceilf(vx3);
13780
13781
0
    output[0] = vy0;
13782
0
    output[1] = vy1;
13783
0
    output[2] = vy2;
13784
0
    output[3] = vy3;
13785
0
    output += 4;
13786
0
  }
13787
0
  if XNN_UNLIKELY(batch != 0) {
13788
0
    do {
13789
0
      const float vx = *input++;
13790
0
      const float vy = ceilf(vx);
13791
0
      *output++ = vy;
13792
0
      batch -= sizeof(float);
13793
0
    } while (batch != 0);
13794
0
  }
13795
0
}
13796
13797
void xnn_f32_vrndz_ukernel__scalar_libm_u1(
13798
    size_t batch,
13799
    const float* input,
13800
    float* output,
13801
    const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
13802
0
{
13803
0
  assert(batch != 0);
13804
0
  assert(batch % sizeof(float) == 0);
13805
0
  assert(input != NULL);
13806
0
  assert(output != NULL);
13807
13808
0
  do {
13809
0
    const float vx = *input++;
13810
0
    const float vy = truncf(vx);
13811
0
    *output++ = vy;
13812
0
    batch -= sizeof(float);
13813
0
  } while (batch != 0);
13814
0
}
13815
13816
void xnn_f32_vrndz_ukernel__scalar_libm_u4(
13817
    size_t batch,
13818
    const float* input,
13819
    float* output,
13820
    const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
13821
0
{
13822
0
  assert(batch != 0);
13823
0
  assert(batch % sizeof(float) == 0);
13824
0
  assert(input != NULL);
13825
0
  assert(output != NULL);
13826
13827
0
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
13828
0
    const float vx0 = input[0];
13829
0
    const float vx1 = input[1];
13830
0
    const float vx2 = input[2];
13831
0
    const float vx3 = input[3];
13832
0
    input += 4;
13833
13834
0
    const float vy0 = truncf(vx0);
13835
0
    const float vy1 = truncf(vx1);
13836
0
    const float vy2 = truncf(vx2);
13837
0
    const float vy3 = truncf(vx3);
13838
13839
0
    output[0] = vy0;
13840
0
    output[1] = vy1;
13841
0
    output[2] = vy2;
13842
0
    output[3] = vy3;
13843
0
    output += 4;
13844
0
  }
13845
0
  if XNN_UNLIKELY(batch != 0) {
13846
0
    do {
13847
0
      const float vx = *input++;
13848
0
      const float vy = truncf(vx);
13849
0
      *output++ = vy;
13850
0
      batch -= sizeof(float);
13851
0
    } while (batch != 0);
13852
0
  }
13853
0
}
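/*
 * Summary sketch, hypothetical helper: the scalar_libm rounding kernels above
 * are thin wrappers over libm -- vrndd uses floorf (toward -infinity), vrndu
 * uses ceilf (toward +infinity), vrndz uses truncf (toward zero) and vrndne
 * uses nearbyintf, which rounds to nearest with ties to even under the default
 * FE_TONEAREST rounding mode.  For example, at x = 2.5f:
 */
#include <math.h>  // ceilf, floorf, nearbyintf, truncf

static void f32_rounding_example(void)
{
  const float x = 2.5f;
  const float down        = floorf(x);      // 2.0f
  const float up          = ceilf(x);       // 3.0f
  const float toward_zero = truncf(x);      // 2.0f
  const float nearest     = nearbyintf(x);  // 2.0f (tie rounds to even)
  (void) down; (void) up; (void) toward_zero; (void) nearest;
}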
13854
13855
extern XNN_INTERNAL const uint32_t xnn_table_exp2minus_k_over_64[64];
13856
13857
void xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_u2(
13858
    size_t batch,
13859
    const float* input,
13860
    float* output,
13861
    const union xnn_f32_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)])
13862
0
{
13863
0
  assert(batch != 0);
13864
0
  assert(batch % sizeof(float) == 0);
13865
0
  assert(input != NULL);
13866
0
  assert(output != NULL);
13867
13868
0
  const float vmagic_bias = params->scalar_rr2_lut64_p2.magic_bias;
13869
0
  const float vminus_log2e = params->scalar_rr2_lut64_p2.minus_log2e;
13870
0
  const uint32_t vindex_mask = UINT32_C(0x3F);
13871
0
  const float vln2_hi = params->scalar_rr2_lut64_p2.ln2_hi;
13872
0
  const float vln2_lo = params->scalar_rr2_lut64_p2.ln2_lo;
13873
0
  const float vc2 = params->scalar_rr2_lut64_p2.c2;
13874
0
  const float vone = params->scalar_rr2_lut64_p2.one;
13875
0
  const float vdenorm_cutoff = params->scalar_rr2_lut64_p2.denorm_cutoff;
13876
13877
0
  for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) {
13878
0
    const float vx0 = input[0];
13879
0
    const float vx1 = input[1];
13880
0
    input += 2;
13881
13882
0
    const float vz0 = fabsf(vx0);
13883
0
    const float vz1 = fabsf(vx1);
13884
13885
0
    float vn0 = vz0 * vminus_log2e + vmagic_bias;
13886
0
    float vn1 = vz1 * vminus_log2e + vmagic_bias;
13887
13888
0
    const uint32_t ve0 = float_as_uint32(vn0) << 17;
13889
0
    const uint32_t ve1 = float_as_uint32(vn1) << 17;
13890
13891
0
    const uint32_t vidx0 = float_as_uint32(vn0) & vindex_mask;
13892
0
    const float vs0 = uint32_as_float(xnn_table_exp2minus_k_over_64[vidx0] + ve0);
13893
0
    const uint32_t vidx1 = float_as_uint32(vn1) & vindex_mask;
13894
0
    const float vs1 = uint32_as_float(xnn_table_exp2minus_k_over_64[vidx1] + ve1);
13895
13896
0
    vn0 -= vmagic_bias;
13897
0
    vn1 -= vmagic_bias;
13898
13899
0
    float vt0 = vn0 * vln2_hi + vz0;
13900
0
    float vt1 = vn1 * vln2_hi + vz1;
13901
13902
0
    vt0 = vn0 * vln2_lo + vt0;
13903
0
    vt1 = vn1 * vln2_lo + vt1;
13904
13905
0
    float vp0 = vt0 * vc2;
13906
0
    float vp1 = vt1 * vc2;
13907
13908
0
    vp0 = vt0 - vp0 * vt0;
13909
0
    vp1 = vt1 - vp1 * vt1;
13910
13911
0
    const float vy0 = vs0 - vs0 * vp0;
13912
0
    const float vy1 = vs1 - vs1 * vp1;
13913
13914
0
    const float vd0 = vy0 + vone;
13915
0
    const float vd1 = vy1 + vone;
13916
13917
0
    float vf0 = vy0 / vd0;
13918
0
    float vf1 = vy1 / vd1;
13919
13920
0
    if XNN_UNPREDICTABLE(vz0 > vdenorm_cutoff) {
13921
0
      vf0 = 0.0f;
13922
0
    }
13923
0
    if XNN_UNPREDICTABLE(vz1 > vdenorm_cutoff) {
13924
0
      vf1 = 0.0f;
13925
0
    }
13926
13927
0
    if XNN_UNPREDICTABLE(vx0 > 0.0f) {
13928
0
      vf0 = vone - vf0;
13929
0
    }
13930
0
    if XNN_UNPREDICTABLE(vx1 > 0.0f) {
13931
0
      vf1 = vone - vf1;
13932
0
    }
13933
13934
0
    output[0] = vf0;
13935
0
    output[1] = vf1;
13936
0
    output += 2;
13937
0
  }
13938
0
  if XNN_UNLIKELY(batch != 0) {
13939
0
    const float vx = *input;
13940
13941
0
    const float vz = fabsf(vx);
13942
13943
0
    float vn = vz * vminus_log2e + vmagic_bias;
13944
0
    const uint32_t ve = float_as_uint32(vn) << 17;
13945
0
    const uint32_t vidx = float_as_uint32(vn) & vindex_mask;
13946
0
    const float vs = uint32_as_float(xnn_table_exp2minus_k_over_64[vidx] + ve);
13947
0
    vn -= vmagic_bias;
13948
13949
0
    float vt = vn * vln2_hi + vz;
13950
0
    vt = vn * vln2_lo + vt;
13951
13952
0
    float vp = vt * vc2;
13953
0
    vp = vt - vp * vt;
13954
13955
0
    const float vy = vs - vs * vp;
13956
0
    const float vd = vy + vone;
13957
13958
0
    float vf = vy / vd;
13959
0
    if XNN_UNPREDICTABLE(vz > vdenorm_cutoff) {
13960
0
      vf = 0.0f;
13961
0
    }
13962
0
    if XNN_UNPREDICTABLE(vx > 0.0f) {
13963
0
      vf = vone - vf;
13964
0
    }
13965
13966
0
    *output = vf;
13967
0
  }
13968
0
}
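/*
 * Illustrative reference, hypothetical helper: the kernel above evaluates the
 * logistic sigmoid by forming e ~= exp(-|x|) with the 64-entry
 * xnn_table_exp2minus_k_over_64 table and a degree-2 polynomial, computing
 * e / (1 + e) = sigmoid(-|x|), flushing the result to 0 beyond the denormal
 * cutoff, and reflecting it to 1 - f for positive inputs.  The sketch below
 * keeps the |x|-then-reflect structure but uses libm's expf, so it matches
 * only to within the approximation error.
 */
#include <math.h>  // expf, fabsf

static float f32_sigmoid_reference(float x)
{
  const float e = expf(-fabsf(x));  // exponential of a non-positive argument
  float f = e / (1.0f + e);         // sigmoid(-|x|), always in (0, 0.5]
  if (x > 0.0f) {
    f = 1.0f - f;                   // reflect for positive inputs
  }
  return f;
}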
13969
13970
void xnn_f32_vsqrt_ukernel__scalar_sqrt_u1(
13971
    size_t batch,
13972
    const float* input,
13973
    float* output,
13974
    const union xnn_f32_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)])
13975
0
{
13976
0
  assert(batch != 0);
13977
0
  assert(batch % sizeof(float) == 0);
13978
0
  assert(input != NULL);
13979
0
  assert(output != NULL);
13980
13981
0
  for (; batch >= sizeof(float); batch -= sizeof(float)) {
13982
0
    const float vx = *input++;
13983
0
    const float vy = sqrtf(vx);
13984
0
    *output++ = vy;
13985
0
  }
13986
0
}
13987
13988
extern XNN_INTERNAL const uint32_t xnn_table_exp2minus_k_over_8[8];
13989
13990
void xnn_f32_vtanh_ukernel__scalar_expm1minus_rr1_lut8_p4h3ts_div_u4(
13991
    size_t batch,
13992
    const float* input,
13993
    float* output,
13994
    const union xnn_f32_tanh_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
13995
0
{
13996
0
  assert(batch != 0);
13997
0
  assert(batch % sizeof(float) == 0);
13998
0
  assert(input != NULL);
13999
0
  assert(output != NULL);
14000
14001
0
  const float vsat_cutoff = params->scalar_expm1minus_rr1_lut8_p4h3.sat_cutoff;
14002
0
  const float vminus_log2e = params->scalar_expm1minus_rr1_lut8_p4h3.minus_log2e;
14003
0
  const float vmagic_bias = params->scalar_expm1minus_rr1_lut8_p4h3.magic_bias;
14004
0
  const uint32_t vindex_mask = UINT32_C(0x7);
14005
0
  const float vln2 = params->scalar_expm1minus_rr1_lut8_p4h3.ln2;
14006
0
  const float vc4 = params->scalar_expm1minus_rr1_lut8_p4h3.c4;
14007
0
  const float vc3 = params->scalar_expm1minus_rr1_lut8_p4h3.c3;
14008
0
  const float vc2 = params->scalar_expm1minus_rr1_lut8_p4h3.c2;
14009
0
  const float vminus_two = params->scalar_expm1minus_rr1_lut8_p4h3.minus_two;
14010
0
  const float vone = params->scalar_expm1minus_rr1_lut8_p4h3.one;
14011
14012
0
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
14013
0
    const float vx0 = input[0];
14014
0
    const float vx1 = input[1];
14015
0
    const float vx2 = input[2];
14016
0
    const float vx3 = input[3];
14017
0
    input += 4;
14018
14019
0
    float vz0 = fabsf(vx0);
14020
0
    float vz1 = fabsf(vx1);
14021
0
    float vz2 = fabsf(vx2);
14022
0
    float vz3 = fabsf(vx3);
14023
14024
0
    vz0 = math_pmin_f32(vz0, vsat_cutoff);
14025
0
    vz1 = math_pmin_f32(vz1, vsat_cutoff);
14026
0
    vz2 = math_pmin_f32(vz2, vsat_cutoff);
14027
0
    vz3 = math_pmin_f32(vz3, vsat_cutoff);
14028
14029
0
    float vn0 = vz0 * vminus_log2e + vmagic_bias;
14030
0
    float vn1 = vz1 * vminus_log2e + vmagic_bias;
14031
0
    float vn2 = vz2 * vminus_log2e + vmagic_bias;
14032
0
    float vn3 = vz3 * vminus_log2e + vmagic_bias;
14033
14034
0
    const uint32_t vb0 = float_as_uint32(vn0);
14035
0
    vn0 -= vmagic_bias;
14036
0
    const uint32_t vb1 = float_as_uint32(vn1);
14037
0
    vn1 -= vmagic_bias;
14038
0
    const uint32_t vb2 = float_as_uint32(vn2);
14039
0
    vn2 -= vmagic_bias;
14040
0
    const uint32_t vb3 = float_as_uint32(vn3);
14041
0
    vn3 -= vmagic_bias;
14042
14043
0
    const uint32_t vidx0 = vb0 & vindex_mask;
14044
0
    const uint32_t vidx1 = vb1 & vindex_mask;
14045
0
    const uint32_t vidx2 = vb2 & vindex_mask;
14046
0
    const uint32_t vidx3 = vb3 & vindex_mask;
14047
14048
0
    const uint32_t vl0 = xnn_table_exp2minus_k_over_8[vidx0];
14049
0
    uint32_t ve0 = vb0 << 20;
14050
0
    const uint32_t vl1 = xnn_table_exp2minus_k_over_8[vidx1];
14051
0
    uint32_t ve1 = vb1 << 20;
14052
0
    const uint32_t vl2 = xnn_table_exp2minus_k_over_8[vidx2];
14053
0
    uint32_t ve2 = vb2 << 20;
14054
0
    const uint32_t vl3 = xnn_table_exp2minus_k_over_8[vidx3];
14055
0
    uint32_t ve3 = vb3 << 20;
14056
14057
0
    ve0 += vl0;
14058
0
    ve1 += vl1;
14059
0
    ve2 += vl2;
14060
0
    ve3 += vl3;
14061
14062
0
    const float vt0 = vn0 * vln2 + vz0;
14063
0
    const float vs0 = uint32_as_float(ve0);
14064
0
    const float vt1 = vn1 * vln2 + vz1;
14065
0
    const float vs1 = uint32_as_float(ve1);
14066
0
    const float vt2 = vn2 * vln2 + vz2;
14067
0
    const float vs2 = uint32_as_float(ve2);
14068
0
    const float vt3 = vn3 * vln2 + vz3;
14069
0
    const float vs3 = uint32_as_float(ve3);
14070
14071
0
    float vp0 = vc4 * vt0 + vc3;
14072
0
    float vp1 = vc4 * vt1 + vc3;
14073
0
    float vp2 = vc4 * vt2 + vc3;
14074
0
    float vp3 = vc4 * vt3 + vc3;
14075
0
    vp0 = vp0 * vt0 + vc2;
14076
0
    vp1 = vp1 * vt1 + vc2;
14077
0
    vp2 = vp2 * vt2 + vc2;
14078
0
    vp3 = vp3 * vt3 + vc2;
14079
0
    vp0 = vp0 * vt0 + vminus_two;
14080
0
    vp1 = vp1 * vt1 + vminus_two;
14081
0
    vp2 = vp2 * vt2 + vminus_two;
14082
0
    vp3 = vp3 * vt3 + vminus_two;
14083
14084
0
    const float vts0 = vt0 * vs0;
14085
0
    const float vsmo0 = vs0 - vone;
14086
0
    const float vts1 = vt1 * vs1;
14087
0
    const float vsmo1 = vs1 - vone;
14088
0
    const float vts2 = vt2 * vs2;
14089
0
    const float vsmo2 = vs2 - vone;
14090
0
    const float vts3 = vt3 * vs3;
14091
0
    const float vsmo3 = vs3 - vone;
14092
14093
0
    const float vemo0 = vp0 * vts0 + vsmo0;
14094
0
    const float vemo1 = vp1 * vts1 + vsmo1;
14095
0
    const float vemo2 = vp2 * vts2 + vsmo2;
14096
0
    const float vemo3 = vp3 * vts3 + vsmo3;
14097
14098
0
    const float vepo0 = vemo0 - vminus_two;
14099
0
    const float vepo1 = vemo1 - vminus_two;
14100
0
    const float vepo2 = vemo2 - vminus_two;
14101
0
    const float vepo3 = vemo3 - vminus_two;
14102
14103
0
    float vy0 = vemo0 / vepo0;
14104
0
    float vy1 = vemo1 / vepo1;
14105
0
    float vy2 = vemo2 / vepo2;
14106
0
    float vy3 = vemo3 / vepo3;
14107
14108
0
    vy0 = copysignf(vy0, vx0);
14109
0
    vy1 = copysignf(vy1, vx1);
14110
0
    vy2 = copysignf(vy2, vx2);
14111
0
    vy3 = copysignf(vy3, vx3);
14112
14113
0
    output[0] = vy0;
14114
0
    output[1] = vy1;
14115
0
    output[2] = vy2;
14116
0
    output[3] = vy3;
14117
0
    output += 4;
14118
0
  }
14119
0
  if XNN_UNLIKELY(batch != 0) {
14120
0
    do {
14121
0
      const float vx = *input++;
14122
14123
0
      float vz = fabsf(vx);
14124
14125
0
      vz = math_pmin_f32(vz, vsat_cutoff);
14126
14127
0
      float vn = vz * vminus_log2e + vmagic_bias;
14128
14129
0
      const uint32_t vb = float_as_uint32(vn);
14130
0
      vn -= vmagic_bias;
14131
14132
0
      const uint32_t vidx = vb & vindex_mask;
14133
0
      const uint32_t vl = xnn_table_exp2minus_k_over_8[vidx];
14134
0
      uint32_t ve = vb << 20;
14135
0
      ve += vl;
14136
0
      const float vs = uint32_as_float(ve);
14137
14138
0
      const float vt = vn * vln2 + vz;
14139
14140
0
      float vp = vc4 * vt + vc3;
14141
0
      vp = vp * vt + vc2;
14142
0
      vp = vp * vt + vminus_two;
14143
14144
0
      const float vts = vt * vs;
14145
0
      const float vsmo = vs - vone;
14146
0
      const float vemo = vp * vts + vsmo;
14147
14148
0
      const float vepo = vemo - vminus_two;
14149
14150
0
      float vy = vemo / vepo;
14151
14152
0
      vy = copysignf(vy, vx);
14153
14154
0
      *output++ = vy;
14155
14156
0
      batch -= sizeof(float);
14157
0
    } while (batch != 0);
14158
0
  }
14159
0
}
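/*
 * Illustrative reference, hypothetical helper: the kernel above saturates
 * z = |x| at sat_cutoff and evaluates tanh through the identity
 * tanh(x) = copysign(expm1(-2|x|) / (expm1(-2|x|) + 2), x), approximating
 * expm1 with the 8-entry xnn_table_exp2minus_k_over_8 table and a degree-4
 * polynomial.  The sketch below substitutes libm's expm1f for that
 * approximation, so it matches only to within the approximation error.
 */
#include <math.h>  // copysignf, expm1f, fabsf

static float f32_tanh_reference(float x)
{
  const float emo = expm1f(-2.0f * fabsf(x));  // exp(-2|x|) - 1, in [-1, 0]
  const float y = emo / (emo + 2.0f);          // equals -tanh(|x|)
  return copysignf(y, x);                      // restore the sign of x
}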
14160
14161
void xnn_f32_vabs_ukernel__scalar_u4(
14162
    size_t batch,
14163
    const float* input,
14164
    float* output,
14165
    const union xnn_f32_abs_params params[restrict XNN_MIN_ELEMENTS(1)])
14166
0
{
14167
0
  assert(batch != 0);
14168
0
  assert(batch % sizeof(float) == 0);
14169
0
  assert(input != NULL);
14170
0
  assert(output != NULL);
14171
14172
0
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
14173
0
    const float vx0 = input[0];
14174
0
    const float vx1 = input[1];
14175
0
    const float vx2 = input[2];
14176
0
    const float vx3 = input[3];
14177
0
    input += 4;
14178
14179
0
    const float vy0 = fabsf(vx0);
14180
0
    const float vy1 = fabsf(vx1);
14181
0
    const float vy2 = fabsf(vx2);
14182
0
    const float vy3 = fabsf(vx3);
14183
14184
0
    output[0] = vy0;
14185
0
    output[1] = vy1;
14186
0
    output[2] = vy2;
14187
0
    output[3] = vy3;
14188
0
    output += 4;
14189
0
  }
14190
0
  if XNN_UNLIKELY(batch != 0) {
14191
0
    do {
14192
0
      const float vx = *input++;
14193
0
      const float vy = fabsf(vx);
14194
0
      *output++ = vy;
14195
0
      batch -= sizeof(float);
14196
0
    } while (batch != 0);
14197
0
  }
14198
0
}
14199
14200
void xnn_f32_vneg_ukernel__scalar_u4(
14201
    size_t batch,
14202
    const float* input,
14203
    float* output,
14204
    const union xnn_f32_neg_params params[restrict XNN_MIN_ELEMENTS(1)])
14205
0
{
14206
0
  assert(batch != 0);
14207
0
  assert(batch % sizeof(float) == 0);
14208
0
  assert(input != NULL);
14209
0
  assert(output != NULL);
14210
14211
0
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
14212
0
    const float vx0 = input[0];
14213
0
    const float vx1 = input[1];
14214
0
    const float vx2 = input[2];
14215
0
    const float vx3 = input[3];
14216
0
    input += 4;
14217
14218
0
    const float vy0 = -vx0;
14219
0
    const float vy1 = -vx1;
14220
0
    const float vy2 = -vx2;
14221
0
    const float vy3 = -vx3;
14222
14223
0
    output[0] = vy0;
14224
0
    output[1] = vy1;
14225
0
    output[2] = vy2;
14226
0
    output[3] = vy3;
14227
0
    output += 4;
14228
0
  }
14229
0
  if XNN_UNLIKELY(batch != 0) {
14230
0
    do {
14231
0
      const float vx = *input++;
14232
0
      const float vy = -vx;
14233
0
      *output++ = vy;
14234
0
      batch -= sizeof(float);
14235
0
    } while (batch != 0);
14236
0
  }
14237
0
}
14238
14239
void xnn_f32_vsqr_ukernel__scalar_u4(
14240
    size_t batch,
14241
    const float* input,
14242
    float* output,
14243
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
14244
0
{
14245
0
  assert(batch != 0);
14246
0
  assert(batch % sizeof(float) == 0);
14247
0
  assert(input != NULL);
14248
0
  assert(output != NULL);
14249
14250
0
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
14251
0
    const float vx0 = input[0];
14252
0
    const float vx1 = input[1];
14253
0
    const float vx2 = input[2];
14254
0
    const float vx3 = input[3];
14255
0
    input += 4;
14256
14257
0
    const float vy0 = vx0 * vx0;
14258
0
    const float vy1 = vx1 * vx1;
14259
0
    const float vy2 = vx2 * vx2;
14260
0
    const float vy3 = vx3 * vx3;
14261
14262
0
    output[0] = vy0;
14263
0
    output[1] = vy1;
14264
0
    output[2] = vy2;
14265
0
    output[3] = vy3;
14266
0
    output += 4;
14267
0
  }
14268
0
  if XNN_UNLIKELY(batch != 0) {
14269
0
    do {
14270
0
      const float vx = *input++;
14271
0
      const float vy = vx * vx;
14272
0
      *output++ = vy;
14273
0
      batch -= sizeof(float);
14274
0
    } while (batch != 0);
14275
0
  }
14276
0
}
14277
14278
void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x2__scalar(
14279
    size_t mr,
14280
    size_t nc,
14281
    size_t kc,
14282
    const int8_t* restrict a,
14283
    size_t a_stride,
14284
    const void* restrict w,
14285
    float* restrict c,
14286
    size_t cm_stride,
14287
    size_t cn_stride,
14288
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)],
14289
    const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)])
14290
0
{
14291
0
  assert(mr != 0);
14292
0
  assert(mr <= 1);
14293
0
  assert(nc != 0);
14294
0
  assert(kc != 0);
14295
14296
0
  const int8_t* a0 = a;
14297
0
  float* c0 = c;
14298
14299
0
  do {
14300
0
    const int32_t vksum0 = unaligned_indexed_load_s32(w, 0);
14301
0
    const int32_t vksum1 = unaligned_indexed_load_s32(w, 1);
14302
0
    const int32_t vinput_zero_point0 = quantization_params[0].zero_point;
14303
0
    int32_t vacc0x0 = vksum0 * vinput_zero_point0;
14304
0
    int32_t vacc0x1 = vksum1 * vinput_zero_point0;
14305
0
    w = (const int32_t*) w + 2;
14306
14307
0
    size_t k = kc;
14308
0
    do {
14309
0
      const int32_t va0 = (int32_t) *a0++;
14310
14311
0
      const int32_t vb0 = (int32_t) ((const int8_t*) w)[0];
14312
0
      const int32_t vb1 = (int32_t) ((const int8_t*) w)[1];
14313
0
      w = (const int8_t*) w + 2;
14314
14315
0
      vacc0x0 += va0 * vb0;
14316
0
      vacc0x1 += va0 * vb1;
14317
14318
0
      k -= sizeof(int8_t);
14319
0
    } while (k != 0);
14320
14321
0
    float vout0x0 = (float) vacc0x0;
14322
0
    float vout0x1 = (float) vacc0x1;
14323
14324
0
    const float vinput_scale0 = quantization_params[0].inv_scale;
14325
0
    vout0x0 *= vinput_scale0;
14326
0
    vout0x1 *= vinput_scale0;
14327
14328
0
    const float vfilter_output_scale0 = unaligned_indexed_load_f32(w, 0);
14329
0
    vout0x0 *= vfilter_output_scale0;
14330
0
    const float vfilter_output_scale1 = unaligned_indexed_load_f32(w, 1);
14331
0
    vout0x1 *= vfilter_output_scale1;
14332
14333
0
    const float vbias0 = unaligned_indexed_load_f32(w, 2);
14334
0
    vout0x0 += vbias0;
14335
0
    const float vbias1 = unaligned_indexed_load_f32(w, 3);
14336
0
    vout0x1 += vbias1;
14337
14338
0
    w = (const float*) w + 4;
14339
14340
0
    const float voutput_min = params->scalar.min;
14341
0
    vout0x0 = math_max_f32(vout0x0, voutput_min);
14342
0
    vout0x1 = math_max_f32(vout0x1, voutput_min);
14343
14344
0
    const float voutput_max = params->scalar.max;
14345
0
    vout0x0 = math_min_f32(vout0x0, voutput_max);
14346
0
    vout0x1 = math_min_f32(vout0x1, voutput_max);
14347
14348
0
    if XNN_LIKELY(nc >= 2) {
14349
0
      c0[0] = vout0x0;
14350
0
      c0[1] = vout0x1;
14351
14352
0
      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
14353
14354
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
14355
14356
0
      nc -= 2;
14357
0
    } else {
14358
0
      if (nc & 1) {
14359
0
        c0[0] = vout0x0;
14360
0
      }
14361
14362
0
      nc = 0;
14363
0
    }
14364
0
  } while (nc != 0);
14365
0
}
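/*
 * Illustrative sketch, hypothetical helper: in the qd8_f32_qc8w GEMM kernels
 * above, the packed weights carry, ahead of each block of int8 weights,
 * per-output-channel int32 weight-sum terms (folded into the integer
 * accumulator together with the activation zero point), followed by a float
 * scale and a float bias per output channel.  Once the int32 accumulator is
 * formed, the conversion back to float mirrors the sequence below, with the
 * clamp bounds taken from params->scalar.
 */
#include <math.h>    // fminf, fmaxf
#include <stdint.h>  // int32_t

static float qd8_f32_qc8w_dequantize(
    int32_t acc,                // zero-point-adjusted int32 dot product
    float input_inv_scale,      // quantization_params[m].inv_scale
    float filter_output_scale,  // per-output-channel scale from the packed weights
    float bias,                 // per-output-channel bias from the packed weights
    float output_min,
    float output_max)
{
  float out = (float) acc;
  out *= input_inv_scale;      // undo the dynamic activation quantization scale
  out *= filter_output_scale;  // undo the per-channel weight quantization scale
  out += bias;
  return fminf(fmaxf(out, output_min), output_max);
}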
14366
14367
void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4__scalar(
14368
    size_t mr,
14369
    size_t nc,
14370
    size_t kc,
14371
    const int8_t* restrict a,
14372
    size_t a_stride,
14373
    const void* restrict w,
14374
    float* restrict c,
14375
    size_t cm_stride,
14376
    size_t cn_stride,
14377
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)],
14378
    const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)])
14379
0
{
14380
0
  assert(mr != 0);
14381
0
  assert(mr <= 1);
14382
0
  assert(nc != 0);
14383
0
  assert(kc != 0);
14384
14385
0
  const int8_t* a0 = a;
14386
0
  float* c0 = c;
14387
14388
0
  do {
14389
0
    const int32_t vksum0 = ((const int32_t*) w)[0];
14390
0
    const int32_t vksum1 = ((const int32_t*) w)[1];
14391
0
    const int32_t vksum2 = ((const int32_t*) w)[2];
14392
0
    const int32_t vksum3 = ((const int32_t*) w)[3];
14393
0
    const int32_t vinput_zero_point0 = quantization_params[0].zero_point;
14394
0
    int32_t vacc0x0 = vksum0 * vinput_zero_point0;
14395
0
    int32_t vacc0x1 = vksum1 * vinput_zero_point0;
14396
0
    int32_t vacc0x2 = vksum2 * vinput_zero_point0;
14397
0
    int32_t vacc0x3 = vksum3 * vinput_zero_point0;
14398
0
    w = (const int32_t*) w + 4;
14399
14400
0
    size_t k = kc;
14401
0
    do {
14402
0
      const int32_t va0 = (int32_t) *a0++;
14403
14404
0
      const int32_t vb0 = (int32_t) ((const int8_t*) w)[0];
14405
0
      const int32_t vb1 = (int32_t) ((const int8_t*) w)[1];
14406
0
      const int32_t vb2 = (int32_t) ((const int8_t*) w)[2];
14407
0
      const int32_t vb3 = (int32_t) ((const int8_t*) w)[3];
14408
0
      w = (const int8_t*) w + 4;
14409
14410
0
      vacc0x0 += va0 * vb0;
14411
0
      vacc0x1 += va0 * vb1;
14412
0
      vacc0x2 += va0 * vb2;
14413
0
      vacc0x3 += va0 * vb3;
14414
14415
0
      k -= sizeof(int8_t);
14416
0
    } while (k != 0);
14417
14418
0
    float vout0x0 = (float) vacc0x0;
14419
0
    float vout0x1 = (float) vacc0x1;
14420
0
    float vout0x2 = (float) vacc0x2;
14421
0
    float vout0x3 = (float) vacc0x3;
14422
14423
0
    const float vinput_scale0 = quantization_params[0].inv_scale;
14424
0
    vout0x0 *= vinput_scale0;
14425
0
    vout0x1 *= vinput_scale0;
14426
0
    vout0x2 *= vinput_scale0;
14427
0
    vout0x3 *= vinput_scale0;
14428
14429
0
    const float vfilter_output_scale0 = ((const float*) w)[0];
14430
0
    vout0x0 *= vfilter_output_scale0;
14431
0
    const float vfilter_output_scale1 = ((const float*) w)[1];
14432
0
    vout0x1 *= vfilter_output_scale1;
14433
0
    const float vfilter_output_scale2 = ((const float*) w)[2];
14434
0
    vout0x2 *= vfilter_output_scale2;
14435
0
    const float vfilter_output_scale3 = ((const float*) w)[3];
14436
0
    vout0x3 *= vfilter_output_scale3;
14437
14438
0
    const float vbias0 = ((const float*) w)[4];
14439
0
    vout0x0 += vbias0;
14440
0
    const float vbias1 = ((const float*) w)[5];
14441
0
    vout0x1 += vbias1;
14442
0
    const float vbias2 = ((const float*) w)[6];
14443
0
    vout0x2 += vbias2;
14444
0
    const float vbias3 = ((const float*) w)[7];
14445
0
    vout0x3 += vbias3;
14446
14447
0
    w = (const float*) w + 8;
14448
14449
0
    const float voutput_min = params->scalar.min;
14450
0
    vout0x0 = math_max_f32(vout0x0, voutput_min);
14451
0
    vout0x1 = math_max_f32(vout0x1, voutput_min);
14452
0
    vout0x2 = math_max_f32(vout0x2, voutput_min);
14453
0
    vout0x3 = math_max_f32(vout0x3, voutput_min);
14454
14455
0
    const float voutput_max = params->scalar.max;
14456
0
    vout0x0 = math_min_f32(vout0x0, voutput_max);
14457
0
    vout0x1 = math_min_f32(vout0x1, voutput_max);
14458
0
    vout0x2 = math_min_f32(vout0x2, voutput_max);
14459
0
    vout0x3 = math_min_f32(vout0x3, voutput_max);
14460
14461
0
    if XNN_LIKELY(nc >= 4) {
14462
0
      c0[0] = vout0x0;
14463
0
      c0[1] = vout0x1;
14464
0
      c0[2] = vout0x2;
14465
0
      c0[3] = vout0x3;
14466
14467
0
      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
14468
14469
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
14470
14471
0
      nc -= 4;
14472
0
    } else {
14473
0
      if (nc & 2) {
14474
0
        c0[0] = vout0x0;
14475
0
        c0[1] = vout0x1;
14476
0
        vout0x0 = vout0x2;
14477
0
        c0 += 2;
14478
0
      }
14479
0
      if (nc & 1) {
14480
0
        c0[0] = vout0x0;
14481
0
      }
14482
14483
0
      nc = 0;
14484
0
    }
14485
0
  } while (nc != 0);
14486
0
}
14487
14488
void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x2__scalar(
14489
    size_t mr,
14490
    size_t nc,
14491
    size_t kc,
14492
    const int8_t* restrict a,
14493
    size_t a_stride,
14494
    const void* restrict w,
14495
    float* restrict c,
14496
    size_t cm_stride,
14497
    size_t cn_stride,
14498
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)],
14499
    const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)])
14500
0
{
14501
0
  assert(mr != 0);
14502
0
  assert(mr <= 2);
14503
0
  assert(nc != 0);
14504
0
  assert(kc != 0);
14505
14506
0
  const int8_t* a0 = a;
14507
0
  float* c0 = c;
14508
0
  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
14509
0
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
14510
0
  if XNN_UNPREDICTABLE(mr != 2) {
14511
0
    a1 = a0;
14512
0
    c1 = c0;
14513
0
  }
14514
14515
0
  do {
14516
0
    const int32_t vksum0 = unaligned_indexed_load_s32(w, 0);
14517
0
    const int32_t vksum1 = unaligned_indexed_load_s32(w, 1);
14518
0
    const int32_t vinput_zero_point0 = quantization_params[0].zero_point;
14519
0
    int32_t vacc0x0 = vksum0 * vinput_zero_point0;
14520
0
    int32_t vacc0x1 = vksum1 * vinput_zero_point0;
14521
0
    const int32_t vinput_zero_point1 = quantization_params[1].zero_point;
14522
0
    int32_t vacc1x0 = vksum0 * vinput_zero_point1;
14523
0
    int32_t vacc1x1 = vksum1 * vinput_zero_point1;
14524
0
    w = (const int32_t*) w + 2;
14525
14526
0
    size_t k = kc;
14527
0
    do {
14528
0
      const int32_t va0 = (int32_t) *a0++;
14529
0
      const int32_t va1 = (int32_t) *a1++;
14530
14531
0
      const int32_t vb0 = (int32_t) ((const int8_t*) w)[0];
14532
0
      const int32_t vb1 = (int32_t) ((const int8_t*) w)[1];
14533
0
      w = (const int8_t*) w + 2;
14534
14535
0
      vacc0x0 += va0 * vb0;
14536
0
      vacc0x1 += va0 * vb1;
14537
0
      vacc1x0 += va1 * vb0;
14538
0
      vacc1x1 += va1 * vb1;
14539
14540
0
      k -= sizeof(int8_t);
14541
0
    } while (k != 0);
14542
14543
0
    float vout0x0 = (float) vacc0x0;
14544
0
    float vout0x1 = (float) vacc0x1;
14545
0
    float vout1x0 = (float) vacc1x0;
14546
0
    float vout1x1 = (float) vacc1x1;
14547
14548
0
    const float vinput_scale0 = quantization_params[0].inv_scale;
14549
0
    vout0x0 *= vinput_scale0;
14550
0
    vout0x1 *= vinput_scale0;
14551
0
    const float vinput_scale1 = quantization_params[1].inv_scale;
14552
0
    vout1x0 *= vinput_scale1;
14553
0
    vout1x1 *= vinput_scale1;
14554
14555
0
    const float vfilter_output_scale0 = unaligned_indexed_load_f32(w, 0);
14556
0
    vout0x0 *= vfilter_output_scale0;
14557
0
    vout1x0 *= vfilter_output_scale0;
14558
0
    const float vfilter_output_scale1 = unaligned_indexed_load_f32(w, 1);
14559
0
    vout0x1 *= vfilter_output_scale1;
14560
0
    vout1x1 *= vfilter_output_scale1;
14561
14562
0
    const float vbias0 = unaligned_indexed_load_f32(w, 2);
14563
0
    vout0x0 += vbias0;
14564
0
    vout1x0 += vbias0;
14565
0
    const float vbias1 = unaligned_indexed_load_f32(w, 3);
14566
0
    vout0x1 += vbias1;
14567
0
    vout1x1 += vbias1;
14568
14569
0
    w = (const float*) w + 4;
14570
14571
0
    const float voutput_min = params->scalar.min;
14572
0
    vout0x0 = math_max_f32(vout0x0, voutput_min);
14573
0
    vout1x0 = math_max_f32(vout1x0, voutput_min);
14574
0
    vout0x1 = math_max_f32(vout0x1, voutput_min);
14575
0
    vout1x1 = math_max_f32(vout1x1, voutput_min);
14576
14577
0
    const float voutput_max = params->scalar.max;
14578
0
    vout0x0 = math_min_f32(vout0x0, voutput_max);
14579
0
    vout1x0 = math_min_f32(vout1x0, voutput_max);
14580
0
    vout0x1 = math_min_f32(vout0x1, voutput_max);
14581
0
    vout1x1 = math_min_f32(vout1x1, voutput_max);
14582
14583
0
    if XNN_LIKELY(nc >= 2) {
14584
0
      c1[0] = vout1x0;
14585
0
      c1[1] = vout1x1;
14586
0
      c0[0] = vout0x0;
14587
0
      c0[1] = vout0x1;
14588
14589
0
      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
14590
0
      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
14591
14592
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
14593
0
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
14594
14595
0
      nc -= 2;
14596
0
    } else {
14597
0
      if (nc & 1) {
14598
0
        c1[0] = vout1x0;
14599
0
        c0[0] = vout0x0;
14600
0
      }
14601
14602
0
      nc = 0;
14603
0
    }
14604
0
  } while (nc != 0);
14605
0
}
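/*
 * Illustrative sketch (not XNNPACK's packing routine): the 2x2 kernel above
 * consumes its packed weights `w` as a sequence of column groups of width
 * nr = 2: first 2 int32 "ksum" values, then kc pairs of int8 weights
 * interleaved k-major, then 2 float filter scales and 2 float biases, after
 * which the next group begins.  (The unaligned_indexed_load helpers are
 * presumably used because the int8 block can leave `w` without 4-byte
 * alignment when kc is not a multiple of 4.)  The packer below writes that
 * layout from plain arrays; it assumes n is a multiple of 2 and that the
 * ksum/scale/bias arrays are already computed.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void pack_qc8w_nr2(
    size_t n, size_t kc,
    const int8_t* weights,      /* n x kc, row-major: weights[col * kc + k] */
    const int32_t* ksum,        /* n per-column ksum terms */
    const float* scale,         /* n per-column filter scales */
    const float* bias,          /* n per-column biases */
    void* packed)               /* (n/2) * (2*4 + kc*2 + 2*4 + 2*4) bytes */
{
  uint8_t* out = (uint8_t*) packed;
  for (size_t col = 0; col < n; col += 2) {
    memcpy(out, &ksum[col], 2 * sizeof(int32_t));   out += 2 * sizeof(int32_t);
    for (size_t k = 0; k < kc; k++) {
      ((int8_t*) out)[0] = weights[(col + 0) * kc + k];
      ((int8_t*) out)[1] = weights[(col + 1) * kc + k];
      out += 2;
    }
    memcpy(out, &scale[col], 2 * sizeof(float));    out += 2 * sizeof(float);
    memcpy(out, &bias[col], 2 * sizeof(float));     out += 2 * sizeof(float);
  }
}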
14606
14607
void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4__scalar(
14608
    size_t mr,
14609
    size_t nc,
14610
    size_t kc,
14611
    const int8_t* restrict a,
14612
    size_t a_stride,
14613
    const void* restrict w,
14614
    float* restrict c,
14615
    size_t cm_stride,
14616
    size_t cn_stride,
14617
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)],
14618
    const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)])
14619
0
{
14620
0
  assert(mr != 0);
14621
0
  assert(mr <= 4);
14622
0
  assert(nc != 0);
14623
0
  assert(kc != 0);
14624
14625
0
  const int8_t* a0 = a;
14626
0
  float* c0 = c;
14627
0
  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
14628
0
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
14629
0
  if XNN_UNPREDICTABLE(mr < 2) {
14630
0
    a1 = a0;
14631
0
    c1 = c0;
14632
0
  }
14633
0
  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
14634
0
  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
14635
0
  if XNN_UNPREDICTABLE(mr <= 2) {
14636
0
    a2 = a1;
14637
0
    c2 = c1;
14638
0
  }
14639
0
  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
14640
0
  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
14641
0
  if XNN_UNPREDICTABLE(mr != 4) {
14642
0
    a3 = a2;
14643
0
    c3 = c2;
14644
0
  }
14645
14646
0
  do {
14647
0
    const int32_t vksum0 = ((const int32_t*) w)[0];
14648
0
    const int32_t vksum1 = ((const int32_t*) w)[1];
14649
0
    const int32_t vksum2 = ((const int32_t*) w)[2];
14650
0
    const int32_t vksum3 = ((const int32_t*) w)[3];
14651
0
    const int32_t vinput_zero_point0 = quantization_params[0].zero_point;
14652
0
    int32_t vacc0x0 = vksum0 * vinput_zero_point0;
14653
0
    int32_t vacc0x1 = vksum1 * vinput_zero_point0;
14654
0
    int32_t vacc0x2 = vksum2 * vinput_zero_point0;
14655
0
    int32_t vacc0x3 = vksum3 * vinput_zero_point0;
14656
0
    const int32_t vinput_zero_point1 = quantization_params[1].zero_point;
14657
0
    int32_t vacc1x0 = vksum0 * vinput_zero_point1;
14658
0
    int32_t vacc1x1 = vksum1 * vinput_zero_point1;
14659
0
    int32_t vacc1x2 = vksum2 * vinput_zero_point1;
14660
0
    int32_t vacc1x3 = vksum3 * vinput_zero_point1;
14661
0
    const int32_t vinput_zero_point2 = quantization_params[2].zero_point;
14662
0
    int32_t vacc2x0 = vksum0 * vinput_zero_point2;
14663
0
    int32_t vacc2x1 = vksum1 * vinput_zero_point2;
14664
0
    int32_t vacc2x2 = vksum2 * vinput_zero_point2;
14665
0
    int32_t vacc2x3 = vksum3 * vinput_zero_point2;
14666
0
    const int32_t vinput_zero_point3 = quantization_params[3].zero_point;
14667
0
    int32_t vacc3x0 = vksum0 * vinput_zero_point3;
14668
0
    int32_t vacc3x1 = vksum1 * vinput_zero_point3;
14669
0
    int32_t vacc3x2 = vksum2 * vinput_zero_point3;
14670
0
    int32_t vacc3x3 = vksum3 * vinput_zero_point3;
14671
0
    w = (const int32_t*) w + 4;
14672
14673
0
    size_t k = kc;
14674
0
    do {
14675
0
      const int32_t va0 = (int32_t) *a0++;
14676
0
      const int32_t va1 = (int32_t) *a1++;
14677
0
      const int32_t va2 = (int32_t) *a2++;
14678
0
      const int32_t va3 = (int32_t) *a3++;
14679
14680
0
      const int32_t vb0 = (int32_t) ((const int8_t*) w)[0];
14681
0
      const int32_t vb1 = (int32_t) ((const int8_t*) w)[1];
14682
0
      const int32_t vb2 = (int32_t) ((const int8_t*) w)[2];
14683
0
      const int32_t vb3 = (int32_t) ((const int8_t*) w)[3];
14684
0
      w = (const int8_t*) w + 4;
14685
14686
0
      vacc0x0 += va0 * vb0;
14687
0
      vacc0x1 += va0 * vb1;
14688
0
      vacc0x2 += va0 * vb2;
14689
0
      vacc0x3 += va0 * vb3;
14690
0
      vacc1x0 += va1 * vb0;
14691
0
      vacc1x1 += va1 * vb1;
14692
0
      vacc1x2 += va1 * vb2;
14693
0
      vacc1x3 += va1 * vb3;
14694
0
      vacc2x0 += va2 * vb0;
14695
0
      vacc2x1 += va2 * vb1;
14696
0
      vacc2x2 += va2 * vb2;
14697
0
      vacc2x3 += va2 * vb3;
14698
0
      vacc3x0 += va3 * vb0;
14699
0
      vacc3x1 += va3 * vb1;
14700
0
      vacc3x2 += va3 * vb2;
14701
0
      vacc3x3 += va3 * vb3;
14702
14703
0
      k -= sizeof(int8_t);
14704
0
    } while (k != 0);
14705
14706
0
    float vout0x0 = (float) vacc0x0;
14707
0
    float vout0x1 = (float) vacc0x1;
14708
0
    float vout0x2 = (float) vacc0x2;
14709
0
    float vout0x3 = (float) vacc0x3;
14710
0
    float vout1x0 = (float) vacc1x0;
14711
0
    float vout1x1 = (float) vacc1x1;
14712
0
    float vout1x2 = (float) vacc1x2;
14713
0
    float vout1x3 = (float) vacc1x3;
14714
0
    float vout2x0 = (float) vacc2x0;
14715
0
    float vout2x1 = (float) vacc2x1;
14716
0
    float vout2x2 = (float) vacc2x2;
14717
0
    float vout2x3 = (float) vacc2x3;
14718
0
    float vout3x0 = (float) vacc3x0;
14719
0
    float vout3x1 = (float) vacc3x1;
14720
0
    float vout3x2 = (float) vacc3x2;
14721
0
    float vout3x3 = (float) vacc3x3;
14722
14723
0
    const float vinput_scale0 = quantization_params[0].inv_scale;
14724
0
    vout0x0 *= vinput_scale0;
14725
0
    vout0x1 *= vinput_scale0;
14726
0
    vout0x2 *= vinput_scale0;
14727
0
    vout0x3 *= vinput_scale0;
14728
0
    const float vinput_scale1 = quantization_params[1].inv_scale;
14729
0
    vout1x0 *= vinput_scale1;
14730
0
    vout1x1 *= vinput_scale1;
14731
0
    vout1x2 *= vinput_scale1;
14732
0
    vout1x3 *= vinput_scale1;
14733
0
    const float vinput_scale2 = quantization_params[2].inv_scale;
14734
0
    vout2x0 *= vinput_scale2;
14735
0
    vout2x1 *= vinput_scale2;
14736
0
    vout2x2 *= vinput_scale2;
14737
0
    vout2x3 *= vinput_scale2;
14738
0
    const float vinput_scale3 = quantization_params[3].inv_scale;
14739
0
    vout3x0 *= vinput_scale3;
14740
0
    vout3x1 *= vinput_scale3;
14741
0
    vout3x2 *= vinput_scale3;
14742
0
    vout3x3 *= vinput_scale3;
14743
14744
0
    const float vfilter_output_scale0 = ((const float*) w)[0];
14745
0
    vout0x0 *= vfilter_output_scale0;
14746
0
    vout1x0 *= vfilter_output_scale0;
14747
0
    vout2x0 *= vfilter_output_scale0;
14748
0
    vout3x0 *= vfilter_output_scale0;
14749
0
    const float vfilter_output_scale1 = ((const float*) w)[1];
14750
0
    vout0x1 *= vfilter_output_scale1;
14751
0
    vout1x1 *= vfilter_output_scale1;
14752
0
    vout2x1 *= vfilter_output_scale1;
14753
0
    vout3x1 *= vfilter_output_scale1;
14754
0
    const float vfilter_output_scale2 = ((const float*) w)[2];
14755
0
    vout0x2 *= vfilter_output_scale2;
14756
0
    vout1x2 *= vfilter_output_scale2;
14757
0
    vout2x2 *= vfilter_output_scale2;
14758
0
    vout3x2 *= vfilter_output_scale2;
14759
0
    const float vfilter_output_scale3 = ((const float*) w)[3];
14760
0
    vout0x3 *= vfilter_output_scale3;
14761
0
    vout1x3 *= vfilter_output_scale3;
14762
0
    vout2x3 *= vfilter_output_scale3;
14763
0
    vout3x3 *= vfilter_output_scale3;
14764
14765
0
    const float vbias0 = ((const float*) w)[4];
14766
0
    vout0x0 += vbias0;
14767
0
    vout1x0 += vbias0;
14768
0
    vout2x0 += vbias0;
14769
0
    vout3x0 += vbias0;
14770
0
    const float vbias1 = ((const float*) w)[5];
14771
0
    vout0x1 += vbias1;
14772
0
    vout1x1 += vbias1;
14773
0
    vout2x1 += vbias1;
14774
0
    vout3x1 += vbias1;
14775
0
    const float vbias2 = ((const float*) w)[6];
14776
0
    vout0x2 += vbias2;
14777
0
    vout1x2 += vbias2;
14778
0
    vout2x2 += vbias2;
14779
0
    vout3x2 += vbias2;
14780
0
    const float vbias3 = ((const float*) w)[7];
14781
0
    vout0x3 += vbias3;
14782
0
    vout1x3 += vbias3;
14783
0
    vout2x3 += vbias3;
14784
0
    vout3x3 += vbias3;
14785
14786
0
    w = (const float*) w + 8;
14787
14788
0
    const float voutput_min = params->scalar.min;
14789
0
    vout0x0 = math_max_f32(vout0x0, voutput_min);
14790
0
    vout1x0 = math_max_f32(vout1x0, voutput_min);
14791
0
    vout2x0 = math_max_f32(vout2x0, voutput_min);
14792
0
    vout3x0 = math_max_f32(vout3x0, voutput_min);
14793
0
    vout0x1 = math_max_f32(vout0x1, voutput_min);
14794
0
    vout1x1 = math_max_f32(vout1x1, voutput_min);
14795
0
    vout2x1 = math_max_f32(vout2x1, voutput_min);
14796
0
    vout3x1 = math_max_f32(vout3x1, voutput_min);
14797
0
    vout0x2 = math_max_f32(vout0x2, voutput_min);
14798
0
    vout1x2 = math_max_f32(vout1x2, voutput_min);
14799
0
    vout2x2 = math_max_f32(vout2x2, voutput_min);
14800
0
    vout3x2 = math_max_f32(vout3x2, voutput_min);
14801
0
    vout0x3 = math_max_f32(vout0x3, voutput_min);
14802
0
    vout1x3 = math_max_f32(vout1x3, voutput_min);
14803
0
    vout2x3 = math_max_f32(vout2x3, voutput_min);
14804
0
    vout3x3 = math_max_f32(vout3x3, voutput_min);
14805
14806
0
    const float voutput_max = params->scalar.max;
14807
0
    vout0x0 = math_min_f32(vout0x0, voutput_max);
14808
0
    vout1x0 = math_min_f32(vout1x0, voutput_max);
14809
0
    vout2x0 = math_min_f32(vout2x0, voutput_max);
14810
0
    vout3x0 = math_min_f32(vout3x0, voutput_max);
14811
0
    vout0x1 = math_min_f32(vout0x1, voutput_max);
14812
0
    vout1x1 = math_min_f32(vout1x1, voutput_max);
14813
0
    vout2x1 = math_min_f32(vout2x1, voutput_max);
14814
0
    vout3x1 = math_min_f32(vout3x1, voutput_max);
14815
0
    vout0x2 = math_min_f32(vout0x2, voutput_max);
14816
0
    vout1x2 = math_min_f32(vout1x2, voutput_max);
14817
0
    vout2x2 = math_min_f32(vout2x2, voutput_max);
14818
0
    vout3x2 = math_min_f32(vout3x2, voutput_max);
14819
0
    vout0x3 = math_min_f32(vout0x3, voutput_max);
14820
0
    vout1x3 = math_min_f32(vout1x3, voutput_max);
14821
0
    vout2x3 = math_min_f32(vout2x3, voutput_max);
14822
0
    vout3x3 = math_min_f32(vout3x3, voutput_max);
14823
14824
0
    if XNN_LIKELY(nc >= 4) {
14825
0
      c3[0] = vout3x0;
14826
0
      c3[1] = vout3x1;
14827
0
      c3[2] = vout3x2;
14828
0
      c3[3] = vout3x3;
14829
0
      c2[0] = vout2x0;
14830
0
      c2[1] = vout2x1;
14831
0
      c2[2] = vout2x2;
14832
0
      c2[3] = vout2x3;
14833
0
      c1[0] = vout1x0;
14834
0
      c1[1] = vout1x1;
14835
0
      c1[2] = vout1x2;
14836
0
      c1[3] = vout1x3;
14837
0
      c0[0] = vout0x0;
14838
0
      c0[1] = vout0x1;
14839
0
      c0[2] = vout0x2;
14840
0
      c0[3] = vout0x3;
14841
14842
0
      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
14843
0
      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
14844
0
      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
14845
0
      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
14846
14847
0
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
14848
0
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
14849
0
      c2 = (float*) ((uintptr_t) c2 + cn_stride);
14850
0
      c3 = (float*) ((uintptr_t) c3 + cn_stride);
14851
14852
0
      nc -= 4;
14853
0
    } else {
14854
0
      if (nc & 2) {
14855
0
        c3[0] = vout3x0;
14856
0
        c3[1] = vout3x1;
14857
0
        vout3x0 = vout3x2;
14858
0
        c3 += 2;
14859
0
        c2[0] = vout2x0;
14860
0
        c2[1] = vout2x1;
14861
0
        vout2x0 = vout2x2;
14862
0
        c2 += 2;
14863
0
        c1[0] = vout1x0;
14864
0
        c1[1] = vout1x1;
14865
0
        vout1x0 = vout1x2;
14866
0
        c1 += 2;
14867
0
        c0[0] = vout0x0;
14868
0
        c0[1] = vout0x1;
14869
0
        vout0x0 = vout0x2;
14870
0
        c0 += 2;
14871
0
      }
14872
0
      if (nc & 1) {
14873
0
        c3[0] = vout3x0;
14874
0
        c2[0] = vout2x0;
14875
0
        c1[0] = vout1x0;
14876
0
        c0[0] = vout0x0;
14877
0
      }
14878
14879
0
      nc = 0;
14880
0
    }
14881
0
  } while (nc != 0);
14882
0
}
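/*
 * Illustrative sketch: the remainder path of the 4x4 kernel above (the `else`
 * branch taken when nc < 4) stores a partial row without a per-element loop.
 * It writes two lanes when bit 1 of nc is set, shifts lane 2 down into lane 0,
 * then writes one more lane when bit 0 is set.  The helper below does the
 * same for a single row of up to three remaining values; the name
 * store_partial_row is invented for illustration.
 */
#include <stddef.h>

static void store_partial_row(float* c, float v0, float v1, float v2, size_t nc)
{
  /* nc is assumed to be 1, 2, or 3. */
  if (nc & 2) {
    c[0] = v0;
    c[1] = v1;
    v0 = v2;   /* the next value to store, if any, is lane 2 */
    c += 2;
  }
  if (nc & 1) {
    c[0] = v0;
  }
}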
14883
14884
void xnn_qs16_qs8_vcvt_ukernel__scalar_u4(
14885
    size_t batch,
14886
    const int16_t* input,
14887
    int8_t* output,
14888
    const union xnn_qs16_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
14889
0
{
14890
0
  assert(batch != 0);
14891
0
  assert(batch % sizeof(int16_t) == 0);
14892
0
  assert(input != NULL);
14893
0
  assert(output != NULL);
14894
14895
0
  const int32_t vmultiplier = params->scalar.multiplier;
14896
0
  const int64_t vbias = (int64_t) params->scalar.bias;
14897
0
  for (; batch >= 4 * sizeof(int16_t); batch -= 4 * sizeof(int16_t)) {
14898
14899
0
    const int32_t vx0 = (int32_t) input[0];
14900
0
    const int32_t vx1 = (int32_t) input[1];
14901
0
    const int32_t vx2 = (int32_t) input[2];
14902
0
    const int32_t vx3 = (int32_t) input[3];
14903
0
    input += 4;
14904
14905
0
    int32_t vout0 = (int32_t) math_asr_s64(math_mulext_s32(vx0, vmultiplier) + vbias, 16);
14906
0
    int32_t vout1 = (int32_t) math_asr_s64(math_mulext_s32(vx1, vmultiplier) + vbias, 16);
14907
0
    int32_t vout2 = (int32_t) math_asr_s64(math_mulext_s32(vx2, vmultiplier) + vbias, 16);
14908
0
    int32_t vout3 = (int32_t) math_asr_s64(math_mulext_s32(vx3, vmultiplier) + vbias, 16);
14909
14910
0
    vout0 = math_max_s32(vout0, -128);
14911
0
    vout1 = math_max_s32(vout1, -128);
14912
0
    vout2 = math_max_s32(vout2, -128);
14913
0
    vout3 = math_max_s32(vout3, -128);
14914
14915
0
    vout0 = math_min_s32(vout0, 127);
14916
0
    vout1 = math_min_s32(vout1, 127);
14917
0
    vout2 = math_min_s32(vout2, 127);
14918
0
    vout3 = math_min_s32(vout3, 127);
14919
14920
0
    output[0] = (int8_t) vout0;
14921
0
    output[1] = (int8_t) vout1;
14922
0
    output[2] = (int8_t) vout2;
14923
0
    output[3] = (int8_t) vout3;
14924
0
    output += 4;
14925
0
  }
14926
0
  if XNN_UNLIKELY(batch != 0) {
14927
0
    do {
14928
0
      const int32_t vx = (int32_t) *input++;
14929
14930
0
      int32_t vout = (int32_t) math_asr_s64(math_mulext_s32(vx, vmultiplier) + vbias, 16);
14931
14932
0
      vout = math_max_s32(vout, -128);
14933
0
      vout = math_min_s32(vout, 127);
14934
0
      *output++ = (int8_t) vout;
14935
14936
0
      batch -= sizeof(int16_t);
14937
0
    } while (batch != 0);
14938
0
  }
14939
0
}
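/*
 * Illustrative sketch: the qs16->qs8 converter above computes, per element,
 *   y = clamp( ((int64_t) x * multiplier + bias) >> 16, -128, 127 )
 * using a widening 32x32->64-bit multiply (math_mulext_s32) and a 64-bit
 * arithmetic shift (math_asr_s64).  The plain-C restatement below assumes
 * that `>>` on a negative int64_t is an arithmetic shift, which is the
 * behavior the kernel's math_asr_s64 helper provides; the name
 * ref_qs16_to_qs8 is invented for illustration.
 */
#include <stdint.h>

static int8_t ref_qs16_to_qs8(int16_t x, int32_t multiplier, int64_t bias)
{
  int64_t acc = (int64_t) (int32_t) x * (int64_t) multiplier + bias;
  int32_t y = (int32_t) (acc >> 16);   /* arithmetic shift assumed */
  if (y < -128) { y = -128; }
  if (y > 127) { y = 127; }
  return (int8_t) y;
}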
14940
14941
void xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic(
14942
    size_t channels,
14943
    size_t output_width,
14944
    const int8_t** input,
14945
    const void* weights,
14946
    int8_t* output,
14947
    intptr_t input_stride,
14948
    size_t output_increment,
14949
    size_t input_offset,
14950
    const int8_t* zero,
14951
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
14952
0
{
14953
0
  assert(channels != 0);
14954
0
  assert(output_width != 0);
14955
14956
0
  const float vscale = params->fp32_scalar_fmagic.scale;
14957
0
  const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point;
14958
0
  const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point;
14959
0
  const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias;
14960
0
  const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point;
14961
0
  do {
14962
0
    const int8_t* i0 = input[0];
14963
0
    assert(i0 != NULL);
14964
0
    if XNN_UNPREDICTABLE(i0 != zero) {
14965
0
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
14966
0
    }
14967
0
    const int8_t* i1 = input[1];
14968
0
    assert(i1 != NULL);
14969
0
    if XNN_UNPREDICTABLE(i1 != zero) {
14970
0
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
14971
0
    }
14972
0
    const int8_t* i2 = input[2];
14973
0
    assert(i2 != NULL);
14974
0
    if XNN_UNPREDICTABLE(i2 != zero) {
14975
0
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
14976
0
    }
14977
0
    const int8_t* i3 = input[3];
14978
0
    assert(i3 != NULL);
14979
0
    if XNN_UNPREDICTABLE(i3 != zero) {
14980
0
      i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
14981
0
    }
14982
0
    const int8_t* i4 = input[4];
14983
0
    assert(i4 != NULL);
14984
0
    if XNN_UNPREDICTABLE(i4 != zero) {
14985
0
      i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
14986
0
    }
14987
0
    const int8_t* i5 = input[5];
14988
0
    assert(i5 != NULL);
14989
0
    if XNN_UNPREDICTABLE(i5 != zero) {
14990
0
      i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
14991
0
    }
14992
0
    const int8_t* i6 = input[6];
14993
0
    assert(i6 != NULL);
14994
0
    if XNN_UNPREDICTABLE(i6 != zero) {
14995
0
      i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
14996
0
    }
14997
0
    const int8_t* i7 = input[7];
14998
0
    assert(i7 != NULL);
14999
0
    if XNN_UNPREDICTABLE(i7 != zero) {
15000
0
      i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
15001
0
    }
15002
0
    const int8_t* i8 = input[8];
15003
0
    assert(i8 != NULL);
15004
0
    if XNN_UNPREDICTABLE(i8 != zero) {
15005
0
      i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
15006
0
    }
15007
0
    const int8_t* i9 = input[9];
15008
0
    assert(i9 != NULL);
15009
0
    if XNN_UNPREDICTABLE(i9 != zero) {
15010
0
      i9 = (const int8_t*) ((uintptr_t) i9 + input_offset);
15011
0
    }
15012
0
    const int8_t* i10 = input[10];
15013
0
    assert(i10 != NULL);
15014
0
    if XNN_UNPREDICTABLE(i10 != zero) {
15015
0
      i10 = (const int8_t*) ((uintptr_t) i10 + input_offset);
15016
0
    }
15017
0
    const int8_t* i11 = input[11];
15018
0
    assert(i11 != NULL);
15019
0
    if XNN_UNPREDICTABLE(i11 != zero) {
15020
0
      i11 = (const int8_t*) ((uintptr_t) i11 + input_offset);
15021
0
    }
15022
0
    const int8_t* i12 = input[12];
15023
0
    assert(i12 != NULL);
15024
0
    if XNN_UNPREDICTABLE(i12 != zero) {
15025
0
      i12 = (const int8_t*) ((uintptr_t) i12 + input_offset);
15026
0
    }
15027
0
    const int8_t* i13 = input[13];
15028
0
    assert(i13 != NULL);
15029
0
    if XNN_UNPREDICTABLE(i13 != zero) {
15030
0
      i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
15031
0
    }
15032
0
    const int8_t* i14 = input[14];
15033
0
    assert(i14 != NULL);
15034
0
    if XNN_UNPREDICTABLE(i14 != zero) {
15035
0
      i14 = (const int8_t*) ((uintptr_t) i14 + input_offset);
15036
0
    }
15037
0
    const int8_t* i15 = input[15];
15038
0
    assert(i15 != NULL);
15039
0
    if XNN_UNPREDICTABLE(i15 != zero) {
15040
0
      i15 = (const int8_t*) ((uintptr_t) i15 + input_offset);
15041
0
    }
15042
0
    const int8_t* i16 = input[16];
15043
0
    assert(i16 != NULL);
15044
0
    if XNN_UNPREDICTABLE(i16 != zero) {
15045
0
      i16 = (const int8_t*) ((uintptr_t) i16 + input_offset);
15046
0
    }
15047
0
    const int8_t* i17 = input[17];
15048
0
    assert(i17 != NULL);
15049
0
    if XNN_UNPREDICTABLE(i17 != zero) {
15050
0
      i17 = (const int8_t*) ((uintptr_t) i17 + input_offset);
15051
0
    }
15052
0
    const int8_t* i18 = input[18];
15053
0
    assert(i18 != NULL);
15054
0
    if XNN_UNPREDICTABLE(i18 != zero) {
15055
0
      i18 = (const int8_t*) ((uintptr_t) i18 + input_offset);
15056
0
    }
15057
0
    const int8_t* i19 = input[19];
15058
0
    assert(i19 != NULL);
15059
0
    if XNN_UNPREDICTABLE(i19 != zero) {
15060
0
      i19 = (const int8_t*) ((uintptr_t) i19 + input_offset);
15061
0
    }
15062
0
    const int8_t* i20 = input[20];
15063
0
    assert(i20 != NULL);
15064
0
    if XNN_UNPREDICTABLE(i20 != zero) {
15065
0
      i20 = (const int8_t*) ((uintptr_t) i20 + input_offset);
15066
0
    }
15067
0
    const int8_t* i21 = input[21];
15068
0
    assert(i21 != NULL);
15069
0
    if XNN_UNPREDICTABLE(i21 != zero) {
15070
0
      i21 = (const int8_t*) ((uintptr_t) i21 + input_offset);
15071
0
    }
15072
0
    const int8_t* i22 = input[22];
15073
0
    assert(i22 != NULL);
15074
0
    if XNN_UNPREDICTABLE(i22 != zero) {
15075
0
      i22 = (const int8_t*) ((uintptr_t) i22 + input_offset);
15076
0
    }
15077
0
    const int8_t* i23 = input[23];
15078
0
    assert(i23 != NULL);
15079
0
    if XNN_UNPREDICTABLE(i23 != zero) {
15080
0
      i23 = (const int8_t*) ((uintptr_t) i23 + input_offset);
15081
0
    }
15082
0
    const int8_t* i24 = input[24];
15083
0
    assert(i24 != NULL);
15084
0
    if XNN_UNPREDICTABLE(i24 != zero) {
15085
0
      i24 = (const int8_t*) ((uintptr_t) i24 + input_offset);
15086
0
    }
15087
0
    input = (const int8_t**) ((uintptr_t) input + input_stride);
15088
15089
0
    size_t c = channels;
15090
0
    const void* w = weights;
15091
0
    do {
15092
0
      int32_t vacc = unaligned_load_s32(w);
15093
15094
0
      const int32_t vi0 = (int32_t) *i0++;
15095
0
      const int32_t vk0 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[0];
15096
0
      vacc += vi0 * vk0;
15097
0
      const int32_t vi1 = (int32_t) *i1++;
15098
0
      const int32_t vk1 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[1];
15099
0
      vacc += vi1 * vk1;
15100
0
      const int32_t vi2 = (int32_t) *i2++;
15101
0
      const int32_t vk2 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[2];
15102
0
      vacc += vi2 * vk2;
15103
0
      const int32_t vi3 = (int32_t) *i3++;
15104
0
      const int32_t vk3 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[3];
15105
0
      vacc += vi3 * vk3;
15106
0
      const int32_t vi4 = (int32_t) *i4++;
15107
0
      const int32_t vk4 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[4];
15108
0
      vacc += vi4 * vk4;
15109
0
      const int32_t vi5 = (int32_t) *i5++;
15110
0
      const int32_t vk5 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[5];
15111
0
      vacc += vi5 * vk5;
15112
0
      const int32_t vi6 = (int32_t) *i6++;
15113
0
      const int32_t vk6 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[6];
15114
0
      vacc += vi6 * vk6;
15115
0
      const int32_t vi7 = (int32_t) *i7++;
15116
0
      const int32_t vk7 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[7];
15117
0
      vacc += vi7 * vk7;
15118
0
      const int32_t vi8 = (int32_t) *i8++;
15119
0
      const int32_t vk8 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[8];
15120
0
      vacc += vi8 * vk8;
15121
0
      const int32_t vi9 = (int32_t) *i9++;
15122
0
      const int32_t vk9 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[9];
15123
0
      vacc += vi9 * vk9;
15124
0
      const int32_t vi10 = (int32_t) *i10++;
15125
0
      const int32_t vk10 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[10];
15126
0
      vacc += vi10 * vk10;
15127
0
      const int32_t vi11 = (int32_t) *i11++;
15128
0
      const int32_t vk11 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[11];
15129
0
      vacc += vi11 * vk11;
15130
0
      const int32_t vi12 = (int32_t) *i12++;
15131
0
      const int32_t vk12 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[12];
15132
0
      vacc += vi12 * vk12;
15133
0
      const int32_t vi13 = (int32_t) *i13++;
15134
0
      const int32_t vk13 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[13];
15135
0
      vacc += vi13 * vk13;
15136
0
      const int32_t vi14 = (int32_t) *i14++;
15137
0
      const int32_t vk14 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[14];
15138
0
      vacc += vi14 * vk14;
15139
0
      const int32_t vi15 = (int32_t) *i15++;
15140
0
      const int32_t vk15 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[15];
15141
0
      vacc += vi15 * vk15;
15142
0
      const int32_t vi16 = (int32_t) *i16++;
15143
0
      const int32_t vk16 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[16];
15144
0
      vacc += vi16 * vk16;
15145
0
      const int32_t vi17 = (int32_t) *i17++;
15146
0
      const int32_t vk17 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[17];
15147
0
      vacc += vi17 * vk17;
15148
0
      const int32_t vi18 = (int32_t) *i18++;
15149
0
      const int32_t vk18 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[18];
15150
0
      vacc += vi18 * vk18;
15151
0
      const int32_t vi19 = (int32_t) *i19++;
15152
0
      const int32_t vk19 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[19];
15153
0
      vacc += vi19 * vk19;
15154
0
      const int32_t vi20 = (int32_t) *i20++;
15155
0
      const int32_t vk20 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[20];
15156
0
      vacc += vi20 * vk20;
15157
0
      const int32_t vi21 = (int32_t) *i21++;
15158
0
      const int32_t vk21 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[21];
15159
0
      vacc += vi21 * vk21;
15160
0
      const int32_t vi22 = (int32_t) *i22++;
15161
0
      const int32_t vk22 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[22];
15162
0
      vacc += vi22 * vk22;
15163
0
      const int32_t vi23 = (int32_t) *i23++;
15164
0
      const int32_t vk23 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[23];
15165
0
      vacc += vi23 * vk23;
15166
0
      const int32_t vi24 = (int32_t) *i24++;
15167
0
      const int32_t vk24 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[24];
15168
0
      vacc += vi24 * vk24;
15169
15170
0
      w = (const void*) ((uintptr_t) w + sizeof(int32_t) + 25 * sizeof(int8_t));
15171
15172
0
      float vfpacc = (float) vacc * vscale;
15173
15174
0
      vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
15175
0
      vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
15176
0
      vfpacc += vmagic_bias;
15177
0
      int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point;
15178
15179
0
      *output++ = (int8_t) vout;
15180
0
    } while (--c != 0);
15181
15182
0
    output = (int8_t*) ((uintptr_t) output + output_increment);
15183
0
  } while (--output_width != 0);
15184
0
}
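/*
 * Illustrative sketch: the "_fmagic" suffix above refers to the float->int
 * rounding trick at the end of the kernel.  After clamping, a large "magic"
 * constant is added so that the round-to-nearest integer lands in the float's
 * mantissa; the sum is then reinterpreted as an int32 and corrected by an
 * integer subtraction, which also appears to fold in the output zero point
 * (hence the name magic_bias_less_output_zero_point).  The demo below uses
 * 12582912.0f (2^23 + 2^22), a typical constant for this trick, and assumes
 * the default round-to-nearest FP mode; the exact parameter values XNNPACK
 * uses come from its params-init code, which is not part of this listing.
 */
#include <stdint.h>
#include <string.h>

static int32_t fmagic_round(float x)  /* valid for |x| well below 2^22 */
{
  const float magic = 12582912.0f;            /* 2^23 + 2^22 */
  float biased = x + magic;                   /* rounded integer now sits in the mantissa */
  uint32_t bits;
  memcpy(&bits, &biased, sizeof(bits));       /* reinterpret, like float_as_uint32 */
  return (int32_t) bits - (int32_t) UINT32_C(0x4B400000);  /* bit pattern of `magic` */
}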
15185
15186
void xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic(
15187
    size_t channels,
15188
    size_t output_width,
15189
    const int8_t** input,
15190
    const void* weights,
15191
    int8_t* output,
15192
    intptr_t input_stride,
15193
    size_t output_increment,
15194
    size_t input_offset,
15195
    const int8_t* zero,
15196
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
15197
0
{
15198
0
  assert(channels != 0);
15199
0
  assert(output_width != 0);
15200
15201
0
  const float vscale = params->fp32_scalar_imagic.scale;
15202
0
  const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
15203
0
  const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
15204
0
  const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
15205
0
  const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
15206
0
  do {
15207
0
    const int8_t* i0 = input[0];
15208
0
    assert(i0 != NULL);
15209
0
    if XNN_UNPREDICTABLE(i0 != zero) {
15210
0
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
15211
0
    }
15212
0
    const int8_t* i1 = input[1];
15213
0
    assert(i1 != NULL);
15214
0
    if XNN_UNPREDICTABLE(i1 != zero) {
15215
0
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
15216
0
    }
15217
0
    const int8_t* i2 = input[2];
15218
0
    assert(i2 != NULL);
15219
0
    if XNN_UNPREDICTABLE(i2 != zero) {
15220
0
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
15221
0
    }
15222
0
    const int8_t* i3 = input[3];
15223
0
    assert(i3 != NULL);
15224
0
    if XNN_UNPREDICTABLE(i3 != zero) {
15225
0
      i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
15226
0
    }
15227
0
    const int8_t* i4 = input[4];
15228
0
    assert(i4 != NULL);
15229
0
    if XNN_UNPREDICTABLE(i4 != zero) {
15230
0
      i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
15231
0
    }
15232
0
    const int8_t* i5 = input[5];
15233
0
    assert(i5 != NULL);
15234
0
    if XNN_UNPREDICTABLE(i5 != zero) {
15235
0
      i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
15236
0
    }
15237
0
    const int8_t* i6 = input[6];
15238
0
    assert(i6 != NULL);
15239
0
    if XNN_UNPREDICTABLE(i6 != zero) {
15240
0
      i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
15241
0
    }
15242
0
    const int8_t* i7 = input[7];
15243
0
    assert(i7 != NULL);
15244
0
    if XNN_UNPREDICTABLE(i7 != zero) {
15245
0
      i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
15246
0
    }
15247
0
    const int8_t* i8 = input[8];
15248
0
    assert(i8 != NULL);
15249
0
    if XNN_UNPREDICTABLE(i8 != zero) {
15250
0
      i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
15251
0
    }
15252
0
    const int8_t* i9 = input[9];
15253
0
    assert(i9 != NULL);
15254
0
    if XNN_UNPREDICTABLE(i9 != zero) {
15255
0
      i9 = (const int8_t*) ((uintptr_t) i9 + input_offset);
15256
0
    }
15257
0
    const int8_t* i10 = input[10];
15258
0
    assert(i10 != NULL);
15259
0
    if XNN_UNPREDICTABLE(i10 != zero) {
15260
0
      i10 = (const int8_t*) ((uintptr_t) i10 + input_offset);
15261
0
    }
15262
0
    const int8_t* i11 = input[11];
15263
0
    assert(i11 != NULL);
15264
0
    if XNN_UNPREDICTABLE(i11 != zero) {
15265
0
      i11 = (const int8_t*) ((uintptr_t) i11 + input_offset);
15266
0
    }
15267
0
    const int8_t* i12 = input[12];
15268
0
    assert(i12 != NULL);
15269
0
    if XNN_UNPREDICTABLE(i12 != zero) {
15270
0
      i12 = (const int8_t*) ((uintptr_t) i12 + input_offset);
15271
0
    }
15272
0
    const int8_t* i13 = input[13];
15273
0
    assert(i13 != NULL);
15274
0
    if XNN_UNPREDICTABLE(i13 != zero) {
15275
0
      i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
15276
0
    }
15277
0
    const int8_t* i14 = input[14];
15278
0
    assert(i14 != NULL);
15279
0
    if XNN_UNPREDICTABLE(i14 != zero) {
15280
0
      i14 = (const int8_t*) ((uintptr_t) i14 + input_offset);
15281
0
    }
15282
0
    const int8_t* i15 = input[15];
15283
0
    assert(i15 != NULL);
15284
0
    if XNN_UNPREDICTABLE(i15 != zero) {
15285
0
      i15 = (const int8_t*) ((uintptr_t) i15 + input_offset);
15286
0
    }
15287
0
    const int8_t* i16 = input[16];
15288
0
    assert(i16 != NULL);
15289
0
    if XNN_UNPREDICTABLE(i16 != zero) {
15290
0
      i16 = (const int8_t*) ((uintptr_t) i16 + input_offset);
15291
0
    }
15292
0
    const int8_t* i17 = input[17];
15293
0
    assert(i17 != NULL);
15294
0
    if XNN_UNPREDICTABLE(i17 != zero) {
15295
0
      i17 = (const int8_t*) ((uintptr_t) i17 + input_offset);
15296
0
    }
15297
0
    const int8_t* i18 = input[18];
15298
0
    assert(i18 != NULL);
15299
0
    if XNN_UNPREDICTABLE(i18 != zero) {
15300
0
      i18 = (const int8_t*) ((uintptr_t) i18 + input_offset);
15301
0
    }
15302
0
    const int8_t* i19 = input[19];
15303
0
    assert(i19 != NULL);
15304
0
    if XNN_UNPREDICTABLE(i19 != zero) {
15305
0
      i19 = (const int8_t*) ((uintptr_t) i19 + input_offset);
15306
0
    }
15307
0
    const int8_t* i20 = input[20];
15308
0
    assert(i20 != NULL);
15309
0
    if XNN_UNPREDICTABLE(i20 != zero) {
15310
0
      i20 = (const int8_t*) ((uintptr_t) i20 + input_offset);
15311
0
    }
15312
0
    const int8_t* i21 = input[21];
15313
0
    assert(i21 != NULL);
15314
0
    if XNN_UNPREDICTABLE(i21 != zero) {
15315
0
      i21 = (const int8_t*) ((uintptr_t) i21 + input_offset);
15316
0
    }
15317
0
    const int8_t* i22 = input[22];
15318
0
    assert(i22 != NULL);
15319
0
    if XNN_UNPREDICTABLE(i22 != zero) {
15320
0
      i22 = (const int8_t*) ((uintptr_t) i22 + input_offset);
15321
0
    }
15322
0
    const int8_t* i23 = input[23];
15323
0
    assert(i23 != NULL);
15324
0
    if XNN_UNPREDICTABLE(i23 != zero) {
15325
0
      i23 = (const int8_t*) ((uintptr_t) i23 + input_offset);
15326
0
    }
15327
0
    const int8_t* i24 = input[24];
15328
0
    assert(i24 != NULL);
15329
0
    if XNN_UNPREDICTABLE(i24 != zero) {
15330
0
      i24 = (const int8_t*) ((uintptr_t) i24 + input_offset);
15331
0
    }
15332
0
    input = (const int8_t**) ((uintptr_t) input + input_stride);
15333
15334
0
    size_t c = channels;
15335
0
    const void* w = weights;
15336
0
    do {
15337
0
      int32_t vacc = unaligned_load_s32(w);
15338
15339
0
      const int32_t vi0 = (int32_t) *i0++;
15340
0
      const int32_t vk0 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[0];
15341
0
      vacc += vi0 * vk0;
15342
0
      const int32_t vi1 = (int32_t) *i1++;
15343
0
      const int32_t vk1 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[1];
15344
0
      vacc += vi1 * vk1;
15345
0
      const int32_t vi2 = (int32_t) *i2++;
15346
0
      const int32_t vk2 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[2];
15347
0
      vacc += vi2 * vk2;
15348
0
      const int32_t vi3 = (int32_t) *i3++;
15349
0
      const int32_t vk3 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[3];
15350
0
      vacc += vi3 * vk3;
15351
0
      const int32_t vi4 = (int32_t) *i4++;
15352
0
      const int32_t vk4 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[4];
15353
0
      vacc += vi4 * vk4;
15354
0
      const int32_t vi5 = (int32_t) *i5++;
15355
0
      const int32_t vk5 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[5];
15356
0
      vacc += vi5 * vk5;
15357
0
      const int32_t vi6 = (int32_t) *i6++;
15358
0
      const int32_t vk6 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[6];
15359
0
      vacc += vi6 * vk6;
15360
0
      const int32_t vi7 = (int32_t) *i7++;
15361
0
      const int32_t vk7 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[7];
15362
0
      vacc += vi7 * vk7;
15363
0
      const int32_t vi8 = (int32_t) *i8++;
15364
0
      const int32_t vk8 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[8];
15365
0
      vacc += vi8 * vk8;
15366
0
      const int32_t vi9 = (int32_t) *i9++;
15367
0
      const int32_t vk9 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[9];
15368
0
      vacc += vi9 * vk9;
15369
0
      const int32_t vi10 = (int32_t) *i10++;
15370
0
      const int32_t vk10 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[10];
15371
0
      vacc += vi10 * vk10;
15372
0
      const int32_t vi11 = (int32_t) *i11++;
15373
0
      const int32_t vk11 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[11];
15374
0
      vacc += vi11 * vk11;
15375
0
      const int32_t vi12 = (int32_t) *i12++;
15376
0
      const int32_t vk12 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[12];
15377
0
      vacc += vi12 * vk12;
15378
0
      const int32_t vi13 = (int32_t) *i13++;
15379
0
      const int32_t vk13 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[13];
15380
0
      vacc += vi13 * vk13;
15381
0
      const int32_t vi14 = (int32_t) *i14++;
15382
0
      const int32_t vk14 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[14];
15383
0
      vacc += vi14 * vk14;
15384
0
      const int32_t vi15 = (int32_t) *i15++;
15385
0
      const int32_t vk15 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[15];
15386
0
      vacc += vi15 * vk15;
15387
0
      const int32_t vi16 = (int32_t) *i16++;
15388
0
      const int32_t vk16 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[16];
15389
0
      vacc += vi16 * vk16;
15390
0
      const int32_t vi17 = (int32_t) *i17++;
15391
0
      const int32_t vk17 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[17];
15392
0
      vacc += vi17 * vk17;
15393
0
      const int32_t vi18 = (int32_t) *i18++;
15394
0
      const int32_t vk18 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[18];
15395
0
      vacc += vi18 * vk18;
15396
0
      const int32_t vi19 = (int32_t) *i19++;
15397
0
      const int32_t vk19 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[19];
15398
0
      vacc += vi19 * vk19;
15399
0
      const int32_t vi20 = (int32_t) *i20++;
15400
0
      const int32_t vk20 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[20];
15401
0
      vacc += vi20 * vk20;
15402
0
      const int32_t vi21 = (int32_t) *i21++;
15403
0
      const int32_t vk21 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[21];
15404
0
      vacc += vi21 * vk21;
15405
0
      const int32_t vi22 = (int32_t) *i22++;
15406
0
      const int32_t vk22 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[22];
15407
0
      vacc += vi22 * vk22;
15408
0
      const int32_t vi23 = (int32_t) *i23++;
15409
0
      const int32_t vk23 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[23];
15410
0
      vacc += vi23 * vk23;
15411
0
      const int32_t vi24 = (int32_t) *i24++;
15412
0
      const int32_t vk24 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[24];
15413
0
      vacc += vi24 * vk24;
15414
15415
0
      w = (const void*) ((uintptr_t) w + sizeof(int32_t) + 25 * sizeof(int8_t));
15416
15417
0
      float vfpacc = (float) vacc * vscale;
15418
15419
0
      vfpacc += vmagic_bias;
15420
0
      int32_t vout = (int32_t) float_as_uint32(vfpacc);
15421
0
      vout = math_max_s32(vout, vmagic_min);
15422
0
      vout = math_min_s32(vout, vmagic_max);
15423
0
      vout -= vmagic_bias_less_zero_point;
15424
15425
0
      *output++ = (int8_t) vout;
15426
0
    } while (--c != 0);
15427
15428
0
    output = (int8_t*) ((uintptr_t) output + output_increment);
15429
0
  } while (--output_width != 0);
15430
0
}
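/*
 * Illustrative sketch: the "_imagic" variant above uses the same magic-bias
 * reinterpretation as the "_fmagic" kernel, but clamps on the biased integer
 * bit pattern (magic_min/magic_max) instead of on the float accumulator, and
 * only then subtracts magic_bias_less_zero_point to land in the signed output
 * domain.  The helper below restates those steps with the precomputed
 * parameters passed in explicitly; how XNNPACK derives magic_min/magic_max
 * from the requested output range is part of its params-init code and is not
 * shown in this listing.  The name imagic_requantize is invented for
 * illustration.
 */
#include <stdint.h>
#include <string.h>

static int8_t imagic_requantize(
    float acc_times_scale,               /* (float) vacc * vscale from the kernel */
    float magic_bias,
    int32_t magic_min,                   /* clamp bounds on the biased bit pattern */
    int32_t magic_max,
    int32_t magic_bias_less_zero_point)
{
  float biased = acc_times_scale + magic_bias;
  uint32_t bits;
  memcpy(&bits, &biased, sizeof(bits));  /* reinterpret, like float_as_uint32 */
  int32_t out = (int32_t) bits;
  if (out < magic_min) { out = magic_min; }
  if (out > magic_max) { out = magic_max; }
  return (int8_t) (out - magic_bias_less_zero_point);
}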
15431
15432
void xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf(
15433
    size_t channels,
15434
    size_t output_width,
15435
    const int8_t** input,
15436
    const void* weights,
15437
    int8_t* output,
15438
    intptr_t input_stride,
15439
    size_t output_increment,
15440
    size_t input_offset,
15441
    const int8_t* zero,
15442
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
15443
0
{
15444
0
  assert(channels != 0);
15445
0
  assert(output_width != 0);
15446
15447
0
  const float vscale = params->fp32_scalar_lrintf.scale;
15448
0
  const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
15449
0
  const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
15450
0
  const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
15451
0
  do {
15452
0
    const int8_t* i0 = input[0];
15453
0
    assert(i0 != NULL);
15454
0
    if XNN_UNPREDICTABLE(i0 != zero) {
15455
0
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
15456
0
    }
15457
0
    const int8_t* i1 = input[1];
15458
0
    assert(i1 != NULL);
15459
0
    if XNN_UNPREDICTABLE(i1 != zero) {
15460
0
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
15461
0
    }
15462
0
    const int8_t* i2 = input[2];
15463
0
    assert(i2 != NULL);
15464
0
    if XNN_UNPREDICTABLE(i2 != zero) {
15465
0
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
15466
0
    }
15467
0
    const int8_t* i3 = input[3];
15468
0
    assert(i3 != NULL);
15469
0
    if XNN_UNPREDICTABLE(i3 != zero) {
15470
0
      i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
15471
0
    }
15472
0
    const int8_t* i4 = input[4];
15473
0
    assert(i4 != NULL);
15474
0
    if XNN_UNPREDICTABLE(i4 != zero) {
15475
0
      i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
15476
0
    }
15477
0
    const int8_t* i5 = input[5];
15478
0
    assert(i5 != NULL);
15479
0
    if XNN_UNPREDICTABLE(i5 != zero) {
15480
0
      i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
15481
0
    }
15482
0
    const int8_t* i6 = input[6];
15483
0
    assert(i6 != NULL);
15484
0
    if XNN_UNPREDICTABLE(i6 != zero) {
15485
0
      i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
15486
0
    }
15487
0
    const int8_t* i7 = input[7];
15488
0
    assert(i7 != NULL);
15489
0
    if XNN_UNPREDICTABLE(i7 != zero) {
15490
0
      i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
15491
0
    }
15492
0
    const int8_t* i8 = input[8];
15493
0
    assert(i8 != NULL);
15494
0
    if XNN_UNPREDICTABLE(i8 != zero) {
15495
0
      i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
15496
0
    }
15497
0
    const int8_t* i9 = input[9];
15498
0
    assert(i9 != NULL);
15499
0
    if XNN_UNPREDICTABLE(i9 != zero) {
15500
0
      i9 = (const int8_t*) ((uintptr_t) i9 + input_offset);
15501
0
    }
15502
0
    const int8_t* i10 = input[10];
15503
0
    assert(i10 != NULL);
15504
0
    if XNN_UNPREDICTABLE(i10 != zero) {
15505
0
      i10 = (const int8_t*) ((uintptr_t) i10 + input_offset);
15506
0
    }
15507
0
    const int8_t* i11 = input[11];
15508
0
    assert(i11 != NULL);
15509
0
    if XNN_UNPREDICTABLE(i11 != zero) {
15510
0
      i11 = (const int8_t*) ((uintptr_t) i11 + input_offset);
15511
0
    }
15512
0
    const int8_t* i12 = input[12];
15513
0
    assert(i12 != NULL);
15514
0
    if XNN_UNPREDICTABLE(i12 != zero) {
15515
0
      i12 = (const int8_t*) ((uintptr_t) i12 + input_offset);
15516
0
    }
15517
0
    const int8_t* i13 = input[13];
15518
0
    assert(i13 != NULL);
15519
0
    if XNN_UNPREDICTABLE(i13 != zero) {
15520
0
      i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
15521
0
    }
15522
0
    const int8_t* i14 = input[14];
15523
0
    assert(i14 != NULL);
15524
0
    if XNN_UNPREDICTABLE(i14 != zero) {
15525
0
      i14 = (const int8_t*) ((uintptr_t) i14 + input_offset);
15526
0
    }
15527
0
    const int8_t* i15 = input[15];
15528
0
    assert(i15 != NULL);
15529
0
    if XNN_UNPREDICTABLE(i15 != zero) {
15530
0
      i15 = (const int8_t*) ((uintptr_t) i15 + input_offset);
15531
0
    }
15532
0
    const int8_t* i16 = input[16];
15533
0
    assert(i16 != NULL);
15534
0
    if XNN_UNPREDICTABLE(i16 != zero) {
15535
0
      i16 = (const int8_t*) ((uintptr_t) i16 + input_offset);
15536
0
    }
15537
0
    const int8_t* i17 = input[17];
15538
0
    assert(i17 != NULL);
15539
0
    if XNN_UNPREDICTABLE(i17 != zero) {
15540
0
      i17 = (const int8_t*) ((uintptr_t) i17 + input_offset);
15541
0
    }
15542
0
    const int8_t* i18 = input[18];
15543
0
    assert(i18 != NULL);
15544
0
    if XNN_UNPREDICTABLE(i18 != zero) {
15545
0
      i18 = (const int8_t*) ((uintptr_t) i18 + input_offset);
15546
0
    }
15547
0
    const int8_t* i19 = input[19];
15548
0
    assert(i19 != NULL);
15549
0
    if XNN_UNPREDICTABLE(i19 != zero) {
15550
0
      i19 = (const int8_t*) ((uintptr_t) i19 + input_offset);
15551
0
    }
15552
0
    const int8_t* i20 = input[20];
15553
0
    assert(i20 != NULL);
15554
0
    if XNN_UNPREDICTABLE(i20 != zero) {
15555
0
      i20 = (const int8_t*) ((uintptr_t) i20 + input_offset);
15556
0
    }
15557
0
    const int8_t* i21 = input[21];
15558
0
    assert(i21 != NULL);
15559
0
    if XNN_UNPREDICTABLE(i21 != zero) {
15560
0
      i21 = (const int8_t*) ((uintptr_t) i21 + input_offset);
15561
0
    }
15562
0
    const int8_t* i22 = input[22];
15563
0
    assert(i22 != NULL);
15564
0
    if XNN_UNPREDICTABLE(i22 != zero) {
15565
0
      i22 = (const int8_t*) ((uintptr_t) i22 + input_offset);
15566
0
    }
15567
0
    const int8_t* i23 = input[23];
15568
0
    assert(i23 != NULL);
15569
0
    if XNN_UNPREDICTABLE(i23 != zero) {
15570
0
      i23 = (const int8_t*) ((uintptr_t) i23 + input_offset);
15571
0
    }
15572
0
    const int8_t* i24 = input[24];
15573
0
    assert(i24 != NULL);
15574
0
    if XNN_UNPREDICTABLE(i24 != zero) {
15575
0
      i24 = (const int8_t*) ((uintptr_t) i24 + input_offset);
15576
0
    }
15577
0
    input = (const int8_t**) ((uintptr_t) input + input_stride);
15578
15579
0
    size_t c = channels;
15580
0
    const void* w = weights;
15581
0
    for (; c >= 2; c -= 2) {
15582
0
      int32_t vacc0 = unaligned_indexed_load_s32(w, 0);
15583
0
      int32_t vacc1 = unaligned_indexed_load_s32(w, 1);
15584
15585
15586
0
      const int32_t vi0x0 = (int32_t) i0[0];
15587
0
      const int32_t vi0x1 = (int32_t) i0[1];
15588
0
      i0 += 2;
15589
15590
0
      const int32_t vk0x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[0];
15591
0
      const int32_t vk0x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[1];
15592
15593
0
      vacc0 += vi0x0 * vk0x0;
15594
0
      vacc1 += vi0x1 * vk0x1;
15595
15596
0
      const int32_t vi1x0 = (int32_t) i1[0];
15597
0
      const int32_t vi1x1 = (int32_t) i1[1];
15598
0
      i1 += 2;
15599
15600
0
      const int32_t vk1x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[2];
15601
0
      const int32_t vk1x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[3];
15602
15603
0
      vacc0 += vi1x0 * vk1x0;
15604
0
      vacc1 += vi1x1 * vk1x1;
15605
15606
0
      const int32_t vi2x0 = (int32_t) i2[0];
15607
0
      const int32_t vi2x1 = (int32_t) i2[1];
15608
0
      i2 += 2;
15609
15610
0
      const int32_t vk2x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[4];
15611
0
      const int32_t vk2x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[5];
15612
15613
0
      vacc0 += vi2x0 * vk2x0;
15614
0
      vacc1 += vi2x1 * vk2x1;
15615
15616
0
      const int32_t vi3x0 = (int32_t) i3[0];
15617
0
      const int32_t vi3x1 = (int32_t) i3[1];
15618
0
      i3 += 2;
15619
15620
0
      const int32_t vk3x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[6];
15621
0
      const int32_t vk3x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[7];
15622
15623
0
      vacc0 += vi3x0 * vk3x0;
15624
0
      vacc1 += vi3x1 * vk3x1;
15625
15626
0
      const int32_t vi4x0 = (int32_t) i4[0];
15627
0
      const int32_t vi4x1 = (int32_t) i4[1];
15628
0
      i4 += 2;
15629
15630
0
      const int32_t vk4x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[8];
15631
0
      const int32_t vk4x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[9];
15632
15633
0
      vacc0 += vi4x0 * vk4x0;
15634
0
      vacc1 += vi4x1 * vk4x1;
15635
15636
0
      const int32_t vi5x0 = (int32_t) i5[0];
15637
0
      const int32_t vi5x1 = (int32_t) i5[1];
15638
0
      i5 += 2;
15639
15640
0
      const int32_t vk5x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[10];
15641
0
      const int32_t vk5x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[11];
15642
15643
0
      vacc0 += vi5x0 * vk5x0;
15644
0
      vacc1 += vi5x1 * vk5x1;
15645
15646
0
      const int32_t vi6x0 = (int32_t) i6[0];
15647
0
      const int32_t vi6x1 = (int32_t) i6[1];
15648
0
      i6 += 2;
15649
15650
0
      const int32_t vk6x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[12];
15651
0
      const int32_t vk6x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[13];
15652
15653
0
      vacc0 += vi6x0 * vk6x0;
15654
0
      vacc1 += vi6x1 * vk6x1;
15655
15656
0
      const int32_t vi7x0 = (int32_t) i7[0];
15657
0
      const int32_t vi7x1 = (int32_t) i7[1];
15658
0
      i7 += 2;
15659
15660
0
      const int32_t vk7x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[14];
15661
0
      const int32_t vk7x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[15];
15662
15663
0
      vacc0 += vi7x0 * vk7x0;
15664
0
      vacc1 += vi7x1 * vk7x1;
15665
15666
0
      const int32_t vi8x0 = (int32_t) i8[0];
15667
0
      const int32_t vi8x1 = (int32_t) i8[1];
15668
0
      i8 += 2;
15669
15670
0
      const int32_t vk8x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[16];
15671
0
      const int32_t vk8x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[17];
15672
15673
0
      vacc0 += vi8x0 * vk8x0;
15674
0
      vacc1 += vi8x1 * vk8x1;
15675
15676
0
      const int32_t vi9x0 = (int32_t) i9[0];
15677
0
      const int32_t vi9x1 = (int32_t) i9[1];
15678
0
      i9 += 2;
15679
15680
0
      const int32_t vk9x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[18];
15681
0
      const int32_t vk9x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[19];
15682
15683
0
      vacc0 += vi9x0 * vk9x0;
15684
0
      vacc1 += vi9x1 * vk9x1;
15685
15686
0
      const int32_t vi10x0 = (int32_t) i10[0];
15687
0
      const int32_t vi10x1 = (int32_t) i10[1];
15688
0
      i10 += 2;
15689
15690
0
      const int32_t vk10x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[20];
15691
0
      const int32_t vk10x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[21];
15692
15693
0
      vacc0 += vi10x0 * vk10x0;
15694
0
      vacc1 += vi10x1 * vk10x1;
15695
15696
0
      const int32_t vi11x0 = (int32_t) i11[0];
15697
0
      const int32_t vi11x1 = (int32_t) i11[1];
15698
0
      i11 += 2;
15699
15700
0
      const int32_t vk11x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[22];
15701
0
      const int32_t vk11x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[23];
15702
15703
0
      vacc0 += vi11x0 * vk11x0;
15704
0
      vacc1 += vi11x1 * vk11x1;
15705
15706
0
      const int32_t vi12x0 = (int32_t) i12[0];
15707
0
      const int32_t vi12x1 = (int32_t) i12[1];
15708
0
      i12 += 2;
15709
15710
0
      const int32_t vk12x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[24];
15711
0
      const int32_t vk12x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[25];
15712
15713
0
      vacc0 += vi12x0 * vk12x0;
15714
0
      vacc1 += vi12x1 * vk12x1;
15715
15716
0
      const int32_t vi13x0 = (int32_t) i13[0];
15717
0
      const int32_t vi13x1 = (int32_t) i13[1];
15718
0
      i13 += 2;
15719
15720
0
      const int32_t vk13x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[26];
15721
0
      const int32_t vk13x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[27];
15722
15723
0
      vacc0 += vi13x0 * vk13x0;
15724
0
      vacc1 += vi13x1 * vk13x1;
15725
15726
0
      const int32_t vi14x0 = (int32_t) i14[0];
15727
0
      const int32_t vi14x1 = (int32_t) i14[1];
15728
0
      i14 += 2;
15729
15730
0
      const int32_t vk14x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[28];
15731
0
      const int32_t vk14x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[29];
15732
15733
0
      vacc0 += vi14x0 * vk14x0;
15734
0
      vacc1 += vi14x1 * vk14x1;
15735
15736
0
      const int32_t vi15x0 = (int32_t) i15[0];
15737
0
      const int32_t vi15x1 = (int32_t) i15[1];
15738
0
      i15 += 2;
15739
15740
0
      const int32_t vk15x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[30];
15741
0
      const int32_t vk15x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[31];
15742
15743
0
      vacc0 += vi15x0 * vk15x0;
15744
0
      vacc1 += vi15x1 * vk15x1;
15745
15746
0
      const int32_t vi16x0 = (int32_t) i16[0];
15747
0
      const int32_t vi16x1 = (int32_t) i16[1];
15748
0
      i16 += 2;
15749
15750
0
      const int32_t vk16x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[32];
15751
0
      const int32_t vk16x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[33];
15752
15753
0
      vacc0 += vi16x0 * vk16x0;
15754
0
      vacc1 += vi16x1 * vk16x1;
15755
15756
0
      const int32_t vi17x0 = (int32_t) i17[0];
15757
0
      const int32_t vi17x1 = (int32_t) i17[1];
15758
0
      i17 += 2;
15759
15760
0
      const int32_t vk17x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[34];
15761
0
      const int32_t vk17x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[35];
15762
15763
0
      vacc0 += vi17x0 * vk17x0;
15764
0
      vacc1 += vi17x1 * vk17x1;
15765
15766
0
      const int32_t vi18x0 = (int32_t) i18[0];
15767
0
      const int32_t vi18x1 = (int32_t) i18[1];
15768
0
      i18 += 2;
15769
15770
0
      const int32_t vk18x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[36];
15771
0
      const int32_t vk18x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[37];
15772
15773
0
      vacc0 += vi18x0 * vk18x0;
15774
0
      vacc1 += vi18x1 * vk18x1;
15775
15776
0
      const int32_t vi19x0 = (int32_t) i19[0];
15777
0
      const int32_t vi19x1 = (int32_t) i19[1];
15778
0
      i19 += 2;
15779
15780
0
      const int32_t vk19x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[38];
15781
0
      const int32_t vk19x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[39];
15782
15783
0
      vacc0 += vi19x0 * vk19x0;
15784
0
      vacc1 += vi19x1 * vk19x1;
15785
15786
0
      const int32_t vi20x0 = (int32_t) i20[0];
15787
0
      const int32_t vi20x1 = (int32_t) i20[1];
15788
0
      i20 += 2;
15789
15790
0
      const int32_t vk20x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[40];
15791
0
      const int32_t vk20x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[41];
15792
15793
0
      vacc0 += vi20x0 * vk20x0;
15794
0
      vacc1 += vi20x1 * vk20x1;
15795
15796
0
      const int32_t vi21x0 = (int32_t) i21[0];
15797
0
      const int32_t vi21x1 = (int32_t) i21[1];
15798
0
      i21 += 2;
15799
15800
0
      const int32_t vk21x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[42];
15801
0
      const int32_t vk21x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[43];
15802
15803
0
      vacc0 += vi21x0 * vk21x0;
15804
0
      vacc1 += vi21x1 * vk21x1;
15805
15806
0
      const int32_t vi22x0 = (int32_t) i22[0];
15807
0
      const int32_t vi22x1 = (int32_t) i22[1];
15808
0
      i22 += 2;
15809
15810
0
      const int32_t vk22x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[44];
15811
0
      const int32_t vk22x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[45];
15812
15813
0
      vacc0 += vi22x0 * vk22x0;
15814
0
      vacc1 += vi22x1 * vk22x1;
15815
15816
0
      const int32_t vi23x0 = (int32_t) i23[0];
15817
0
      const int32_t vi23x1 = (int32_t) i23[1];
15818
0
      i23 += 2;
15819
15820
0
      const int32_t vk23x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[46];
15821
0
      const int32_t vk23x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[47];
15822
15823
0
      vacc0 += vi23x0 * vk23x0;
15824
0
      vacc1 += vi23x1 * vk23x1;
15825
15826
0
      const int32_t vi24x0 = (int32_t) i24[0];
15827
0
      const int32_t vi24x1 = (int32_t) i24[1];
15828
0
      i24 += 2;
15829
15830
0
      const int32_t vk24x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[48];
15831
0
      const int32_t vk24x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[49];
15832
15833
0
      vacc0 += vi24x0 * vk24x0;
15834
0
      vacc1 += vi24x1 * vk24x1;
15835
15836
0
      w = (const void*) ((uintptr_t) w + 2 * sizeof(int32_t) + 50 * sizeof(int8_t));
15837
15838
0
      float vfpacc0 = (float) vacc0;
15839
0
      float vfpacc1 = (float) vacc1;
15840
15841
0
      vfpacc0 *= vscale;
15842
0
      vfpacc1 *= vscale;
15843
15844
0
      vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point);
15845
0
      vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point);
15846
15847
0
      vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point);
15848
0
      vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point);
15849
15850
0
      const int32_t vrndacc0 = (int32_t) lrintf(vfpacc0);
15851
0
      const int32_t vrndacc1 = (int32_t) lrintf(vfpacc1);
15852
15853
0
      int32_t vout0 = (int32_t) vrndacc0 + voutput_zero_point;
15854
0
      int32_t vout1 = (int32_t) vrndacc1 + voutput_zero_point;
15855
15856
0
      output[0] = (int8_t) vout0;
15857
0
      output[1] = (int8_t) vout1;
15858
0
      output += 2;
15859
0
    }
15860
0
    if XNN_UNLIKELY(c != 0) {
15861
0
      int32_t vacc = unaligned_load_s32(w);
15862
15863
0
      const int32_t vi0 = (int32_t) *i0;
15864
0
      const int32_t vk0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[0];
15865
0
      vacc += vi0 * vk0;
15866
0
      const int32_t vi1 = (int32_t) *i1;
15867
0
      const int32_t vk1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[2];
15868
0
      vacc += vi1 * vk1;
15869
0
      const int32_t vi2 = (int32_t) *i2;
15870
0
      const int32_t vk2 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[4];
15871
0
      vacc += vi2 * vk2;
15872
0
      const int32_t vi3 = (int32_t) *i3;
15873
0
      const int32_t vk3 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[6];
15874
0
      vacc += vi3 * vk3;
15875
0
      const int32_t vi4 = (int32_t) *i4;
15876
0
      const int32_t vk4 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[8];
15877
0
      vacc += vi4 * vk4;
15878
0
      const int32_t vi5 = (int32_t) *i5;
15879
0
      const int32_t vk5 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[10];
15880
0
      vacc += vi5 * vk5;
15881
0
      const int32_t vi6 = (int32_t) *i6;
15882
0
      const int32_t vk6 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[12];
15883
0
      vacc += vi6 * vk6;
15884
0
      const int32_t vi7 = (int32_t) *i7;
15885
0
      const int32_t vk7 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[14];
15886
0
      vacc += vi7 * vk7;
15887
0
      const int32_t vi8 = (int32_t) *i8;
15888
0
      const int32_t vk8 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[16];
15889
0
      vacc += vi8 * vk8;
15890
0
      const int32_t vi9 = (int32_t) *i9;
15891
0
      const int32_t vk9 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[18];
15892
0
      vacc += vi9 * vk9;
15893
0
      const int32_t vi10 = (int32_t) *i10;
15894
0
      const int32_t vk10 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[20];
15895
0
      vacc += vi10 * vk10;
15896
0
      const int32_t vi11 = (int32_t) *i11;
15897
0
      const int32_t vk11 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[22];
15898
0
      vacc += vi11 * vk11;
15899
0
      const int32_t vi12 = (int32_t) *i12;
15900
0
      const int32_t vk12 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[24];
15901
0
      vacc += vi12 * vk12;
15902
0
      const int32_t vi13 = (int32_t) *i13;
15903
0
      const int32_t vk13 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[26];
15904
0
      vacc += vi13 * vk13;
15905
0
      const int32_t vi14 = (int32_t) *i14;
15906
0
      const int32_t vk14 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[28];
15907
0
      vacc += vi14 * vk14;
15908
0
      const int32_t vi15 = (int32_t) *i15;
15909
0
      const int32_t vk15 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[30];
15910
0
      vacc += vi15 * vk15;
15911
0
      const int32_t vi16 = (int32_t) *i16;
15912
0
      const int32_t vk16 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[32];
15913
0
      vacc += vi16 * vk16;
15914
0
      const int32_t vi17 = (int32_t) *i17;
15915
0
      const int32_t vk17 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[34];
15916
0
      vacc += vi17 * vk17;
15917
0
      const int32_t vi18 = (int32_t) *i18;
15918
0
      const int32_t vk18 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[36];
15919
0
      vacc += vi18 * vk18;
15920
0
      const int32_t vi19 = (int32_t) *i19;
15921
0
      const int32_t vk19 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[38];
15922
0
      vacc += vi19 * vk19;
15923
0
      const int32_t vi20 = (int32_t) *i20;
15924
0
      const int32_t vk20 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[40];
15925
0
      vacc += vi20 * vk20;
15926
0
      const int32_t vi21 = (int32_t) *i21;
15927
0
      const int32_t vk21 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[42];
15928
0
      vacc += vi21 * vk21;
15929
0
      const int32_t vi22 = (int32_t) *i22;
15930
0
      const int32_t vk22 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[44];
15931
0
      vacc += vi22 * vk22;
15932
0
      const int32_t vi23 = (int32_t) *i23;
15933
0
      const int32_t vk23 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[46];
15934
0
      vacc += vi23 * vk23;
15935
0
      const int32_t vi24 = (int32_t) *i24;
15936
0
      const int32_t vk24 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[48];
15937
0
      vacc += vi24 * vk24;
15938
15939
0
      float vfpacc = (float) vacc * vscale;
15940
15941
0
      vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
15942
0
      vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
15943
0
      const int32_t vrndacc = (int32_t) lrintf(vfpacc);
15944
0
      int32_t vout = vrndacc + voutput_zero_point;
15945
15946
0
      *output++ = (int8_t) vout;
15947
0
    }
15948
15949
0
    output = (int8_t*) ((uintptr_t) output + output_increment);
15950
0
  } while (--output_width != 0);
15951
0
}
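// A minimal standalone sketch of the fp32 requantization used by the
// *__scalar_lrintf depthwise kernels in this file: scale the int32
// accumulator, clamp against the precomputed (output_min/max - zero_point)
// bounds in float, round with lrintf(), then add the output zero point.
// requantize_scalar_lrintf is an illustrative name, not an XNNPACK entry
// point; it reuses math_max_f32/math_min_f32 from <xnnpack/math.h> and
// lrintf() from <math.h>, both already included at the top of this file.
static inline int8_t requantize_scalar_lrintf(
    int32_t acc,
    float scale,
    float output_min_less_zero_point,
    float output_max_less_zero_point,
    int32_t output_zero_point)
{
  float fpacc = (float) acc * scale;
  fpacc = math_max_f32(fpacc, output_min_less_zero_point);
  fpacc = math_min_f32(fpacc, output_max_less_zero_point);
  const int32_t rndacc = (int32_t) lrintf(fpacc);
  return (int8_t) (rndacc + output_zero_point);
}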
15952
15953
void xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic(
15954
    size_t channels,
15955
    size_t output_width,
15956
    const int8_t** input,
15957
    const void* weights,
15958
    int8_t* output,
15959
    intptr_t input_stride,
15960
    size_t output_increment,
15961
    size_t input_offset,
15962
    const int8_t* zero,
15963
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
15964
0
{
15965
0
  assert(channels != 0);
15966
0
  assert(output_width != 0);
15967
15968
0
  const float vscale = params->fp32_scalar_fmagic.scale;
15969
0
  const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point;
15970
0
  const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point;
15971
0
  const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias;
15972
0
  const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point;
15973
0
  do {
15974
0
    const int8_t* i0 = input[0];
15975
0
    assert(i0 != NULL);
15976
0
    if XNN_UNPREDICTABLE(i0 != zero) {
15977
0
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
15978
0
    }
15979
0
    const int8_t* i1 = input[1];
15980
0
    assert(i1 != NULL);
15981
0
    if XNN_UNPREDICTABLE(i1 != zero) {
15982
0
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
15983
0
    }
15984
0
    const int8_t* i2 = input[2];
15985
0
    assert(i2 != NULL);
15986
0
    if XNN_UNPREDICTABLE(i2 != zero) {
15987
0
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
15988
0
    }
15989
0
    const int8_t* i3 = input[3];
15990
0
    assert(i3 != NULL);
15991
0
    if XNN_UNPREDICTABLE(i3 != zero) {
15992
0
      i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
15993
0
    }
15994
0
    const int8_t* i4 = input[4];
15995
0
    assert(i4 != NULL);
15996
0
    if XNN_UNPREDICTABLE(i4 != zero) {
15997
0
      i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
15998
0
    }
15999
0
    const int8_t* i5 = input[5];
16000
0
    assert(i5 != NULL);
16001
0
    if XNN_UNPREDICTABLE(i5 != zero) {
16002
0
      i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
16003
0
    }
16004
0
    const int8_t* i6 = input[6];
16005
0
    assert(i6 != NULL);
16006
0
    if XNN_UNPREDICTABLE(i6 != zero) {
16007
0
      i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
16008
0
    }
16009
0
    const int8_t* i7 = input[7];
16010
0
    assert(i7 != NULL);
16011
0
    if XNN_UNPREDICTABLE(i7 != zero) {
16012
0
      i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
16013
0
    }
16014
0
    const int8_t* i8 = input[8];
16015
0
    assert(i8 != NULL);
16016
0
    if XNN_UNPREDICTABLE(i8 != zero) {
16017
0
      i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
16018
0
    }
16019
0
    input = (const int8_t**) ((uintptr_t) input + input_stride);
16020
16021
0
    size_t c = channels;
16022
0
    const void* w = weights;
16023
0
    do {
16024
0
      int32_t vacc = unaligned_load_s32(w);
16025
16026
0
      const int32_t vi0 = (int32_t) *i0++;
16027
0
      const int32_t vk0 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[0];
16028
0
      vacc += vi0 * vk0;
16029
0
      const int32_t vi1 = (int32_t) *i1++;
16030
0
      const int32_t vk1 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[1];
16031
0
      vacc += vi1 * vk1;
16032
0
      const int32_t vi2 = (int32_t) *i2++;
16033
0
      const int32_t vk2 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[2];
16034
0
      vacc += vi2 * vk2;
16035
0
      const int32_t vi3 = (int32_t) *i3++;
16036
0
      const int32_t vk3 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[3];
16037
0
      vacc += vi3 * vk3;
16038
0
      const int32_t vi4 = (int32_t) *i4++;
16039
0
      const int32_t vk4 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[4];
16040
0
      vacc += vi4 * vk4;
16041
0
      const int32_t vi5 = (int32_t) *i5++;
16042
0
      const int32_t vk5 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[5];
16043
0
      vacc += vi5 * vk5;
16044
0
      const int32_t vi6 = (int32_t) *i6++;
16045
0
      const int32_t vk6 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[6];
16046
0
      vacc += vi6 * vk6;
16047
0
      const int32_t vi7 = (int32_t) *i7++;
16048
0
      const int32_t vk7 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[7];
16049
0
      vacc += vi7 * vk7;
16050
0
      const int32_t vi8 = (int32_t) *i8++;
16051
0
      const int32_t vk8 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[8];
16052
0
      vacc += vi8 * vk8;
16053
16054
0
      w = (const void*) ((uintptr_t) w + sizeof(int32_t) + 9 * sizeof(int8_t));
16055
16056
0
      float vfpacc = (float) vacc * vscale;
16057
16058
0
      vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
16059
0
      vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
16060
0
      vfpacc += vmagic_bias;
16061
0
      int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point;
16062
16063
0
      *output++ = (int8_t) vout;
16064
0
    } while (--c != 0);
16065
16066
0
    output = (int8_t*) ((uintptr_t) output + output_increment);
16067
0
  } while (--output_width != 0);
16068
0
}
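// A minimal sketch of the "fmagic" requantization used by
// xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic above: after
// clamping in float, adding a magic bias places the rounded integer result
// in the low mantissa bits, so a bit reinterpretation plus one integer
// subtraction produces the quantized output without calling lrintf(). The
// helper name and parameter list are illustrative, not part of XNNPACK.
static inline int8_t requantize_scalar_fmagic(
    int32_t acc,
    float scale,
    float output_min_less_zero_point,
    float output_max_less_zero_point,
    float magic_bias,
    int32_t magic_bias_less_output_zero_point)
{
  float fpacc = (float) acc * scale;
  fpacc = math_max_f32(fpacc, output_min_less_zero_point);
  fpacc = math_min_f32(fpacc, output_max_less_zero_point);
  fpacc += magic_bias;
  return (int8_t) ((int32_t) float_as_uint32(fpacc) - magic_bias_less_output_zero_point);
}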
16069
16070
void xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic(
16071
    size_t channels,
16072
    size_t output_width,
16073
    const int8_t** input,
16074
    const void* weights,
16075
    int8_t* output,
16076
    intptr_t input_stride,
16077
    size_t output_increment,
16078
    size_t input_offset,
16079
    const int8_t* zero,
16080
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
16081
0
{
16082
0
  assert(channels != 0);
16083
0
  assert(output_width != 0);
16084
16085
0
  const float vscale = params->fp32_scalar_imagic.scale;
16086
0
  const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
16087
0
  const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
16088
0
  const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
16089
0
  const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
16090
0
  do {
16091
0
    const int8_t* i0 = input[0];
16092
0
    assert(i0 != NULL);
16093
0
    if XNN_UNPREDICTABLE(i0 != zero) {
16094
0
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
16095
0
    }
16096
0
    const int8_t* i1 = input[1];
16097
0
    assert(i1 != NULL);
16098
0
    if XNN_UNPREDICTABLE(i1 != zero) {
16099
0
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
16100
0
    }
16101
0
    const int8_t* i2 = input[2];
16102
0
    assert(i2 != NULL);
16103
0
    if XNN_UNPREDICTABLE(i2 != zero) {
16104
0
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
16105
0
    }
16106
0
    const int8_t* i3 = input[3];
16107
0
    assert(i3 != NULL);
16108
0
    if XNN_UNPREDICTABLE(i3 != zero) {
16109
0
      i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
16110
0
    }
16111
0
    const int8_t* i4 = input[4];
16112
0
    assert(i4 != NULL);
16113
0
    if XNN_UNPREDICTABLE(i4 != zero) {
16114
0
      i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
16115
0
    }
16116
0
    const int8_t* i5 = input[5];
16117
0
    assert(i5 != NULL);
16118
0
    if XNN_UNPREDICTABLE(i5 != zero) {
16119
0
      i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
16120
0
    }
16121
0
    const int8_t* i6 = input[6];
16122
0
    assert(i6 != NULL);
16123
0
    if XNN_UNPREDICTABLE(i6 != zero) {
16124
0
      i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
16125
0
    }
16126
0
    const int8_t* i7 = input[7];
16127
0
    assert(i7 != NULL);
16128
0
    if XNN_UNPREDICTABLE(i7 != zero) {
16129
0
      i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
16130
0
    }
16131
0
    const int8_t* i8 = input[8];
16132
0
    assert(i8 != NULL);
16133
0
    if XNN_UNPREDICTABLE(i8 != zero) {
16134
0
      i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
16135
0
    }
16136
0
    input = (const int8_t**) ((uintptr_t) input + input_stride);
16137
16138
0
    size_t c = channels;
16139
0
    const void* w = weights;
16140
0
    for (; c >= 2; c -= 2) {
16141
0
      int32_t vacc0 = unaligned_indexed_load_s32(w, 0);
16142
0
      int32_t vacc1 = unaligned_indexed_load_s32(w, 1);
16143
16144
16145
0
      const int32_t vi0x0 = (int32_t) i0[0];
16146
0
      const int32_t vi0x1 = (int32_t) i0[1];
16147
0
      i0 += 2;
16148
16149
0
      const int32_t vk0x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[0];
16150
0
      const int32_t vk0x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[1];
16151
16152
0
      vacc0 += vi0x0 * vk0x0;
16153
0
      vacc1 += vi0x1 * vk0x1;
16154
16155
0
      const int32_t vi1x0 = (int32_t) i1[0];
16156
0
      const int32_t vi1x1 = (int32_t) i1[1];
16157
0
      i1 += 2;
16158
16159
0
      const int32_t vk1x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[2];
16160
0
      const int32_t vk1x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[3];
16161
16162
0
      vacc0 += vi1x0 * vk1x0;
16163
0
      vacc1 += vi1x1 * vk1x1;
16164
16165
0
      const int32_t vi2x0 = (int32_t) i2[0];
16166
0
      const int32_t vi2x1 = (int32_t) i2[1];
16167
0
      i2 += 2;
16168
16169
0
      const int32_t vk2x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[4];
16170
0
      const int32_t vk2x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[5];
16171
16172
0
      vacc0 += vi2x0 * vk2x0;
16173
0
      vacc1 += vi2x1 * vk2x1;
16174
16175
0
      const int32_t vi3x0 = (int32_t) i3[0];
16176
0
      const int32_t vi3x1 = (int32_t) i3[1];
16177
0
      i3 += 2;
16178
16179
0
      const int32_t vk3x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[6];
16180
0
      const int32_t vk3x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[7];
16181
16182
0
      vacc0 += vi3x0 * vk3x0;
16183
0
      vacc1 += vi3x1 * vk3x1;
16184
16185
0
      const int32_t vi4x0 = (int32_t) i4[0];
16186
0
      const int32_t vi4x1 = (int32_t) i4[1];
16187
0
      i4 += 2;
16188
16189
0
      const int32_t vk4x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[8];
16190
0
      const int32_t vk4x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[9];
16191
16192
0
      vacc0 += vi4x0 * vk4x0;
16193
0
      vacc1 += vi4x1 * vk4x1;
16194
16195
0
      const int32_t vi5x0 = (int32_t) i5[0];
16196
0
      const int32_t vi5x1 = (int32_t) i5[1];
16197
0
      i5 += 2;
16198
16199
0
      const int32_t vk5x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[10];
16200
0
      const int32_t vk5x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[11];
16201
16202
0
      vacc0 += vi5x0 * vk5x0;
16203
0
      vacc1 += vi5x1 * vk5x1;
16204
16205
0
      const int32_t vi6x0 = (int32_t) i6[0];
16206
0
      const int32_t vi6x1 = (int32_t) i6[1];
16207
0
      i6 += 2;
16208
16209
0
      const int32_t vk6x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[12];
16210
0
      const int32_t vk6x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[13];
16211
16212
0
      vacc0 += vi6x0 * vk6x0;
16213
0
      vacc1 += vi6x1 * vk6x1;
16214
16215
0
      const int32_t vi7x0 = (int32_t) i7[0];
16216
0
      const int32_t vi7x1 = (int32_t) i7[1];
16217
0
      i7 += 2;
16218
16219
0
      const int32_t vk7x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[14];
16220
0
      const int32_t vk7x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[15];
16221
16222
0
      vacc0 += vi7x0 * vk7x0;
16223
0
      vacc1 += vi7x1 * vk7x1;
16224
16225
0
      const int32_t vi8x0 = (int32_t) i8[0];
16226
0
      const int32_t vi8x1 = (int32_t) i8[1];
16227
0
      i8 += 2;
16228
16229
0
      const int32_t vk8x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[16];
16230
0
      const int32_t vk8x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[17];
16231
16232
0
      vacc0 += vi8x0 * vk8x0;
16233
0
      vacc1 += vi8x1 * vk8x1;
16234
16235
0
      w = (const void*) ((uintptr_t) w + 2 * sizeof(int32_t) + 18 * sizeof(int8_t));
16236
16237
0
      float vfpacc0 = (float) vacc0;
16238
0
      float vfpacc1 = (float) vacc1;
16239
16240
0
      vfpacc0 *= vscale;
16241
0
      vfpacc1 *= vscale;
16242
16243
0
      vfpacc0 += vmagic_bias;
16244
0
      vfpacc1 += vmagic_bias;
16245
16246
0
      int32_t vout0 = (int32_t) float_as_uint32(vfpacc0);
16247
0
      int32_t vout1 = (int32_t) float_as_uint32(vfpacc1);
16248
16249
0
      vout0 = math_max_s32(vout0, vmagic_min);
16250
0
      vout1 = math_max_s32(vout1, vmagic_min);
16251
16252
0
      vout0 = math_min_s32(vout0, vmagic_max);
16253
0
      vout1 = math_min_s32(vout1, vmagic_max);
16254
16255
0
      vout0 -= vmagic_bias_less_zero_point;
16256
0
      vout1 -= vmagic_bias_less_zero_point;
16257
16258
0
      output[0] = (int8_t) vout0;
16259
0
      output[1] = (int8_t) vout1;
16260
0
      output += 2;
16261
0
    }
16262
0
    if XNN_UNLIKELY(c != 0) {
16263
0
      int32_t vacc = unaligned_load_s32(w);
16264
16265
0
      const int32_t vi0 = (int32_t) *i0;
16266
0
      const int32_t vk0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[0];
16267
0
      vacc += vi0 * vk0;
16268
0
      const int32_t vi1 = (int32_t) *i1;
16269
0
      const int32_t vk1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[2];
16270
0
      vacc += vi1 * vk1;
16271
0
      const int32_t vi2 = (int32_t) *i2;
16272
0
      const int32_t vk2 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[4];
16273
0
      vacc += vi2 * vk2;
16274
0
      const int32_t vi3 = (int32_t) *i3;
16275
0
      const int32_t vk3 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[6];
16276
0
      vacc += vi3 * vk3;
16277
0
      const int32_t vi4 = (int32_t) *i4;
16278
0
      const int32_t vk4 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[8];
16279
0
      vacc += vi4 * vk4;
16280
0
      const int32_t vi5 = (int32_t) *i5;
16281
0
      const int32_t vk5 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[10];
16282
0
      vacc += vi5 * vk5;
16283
0
      const int32_t vi6 = (int32_t) *i6;
16284
0
      const int32_t vk6 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[12];
16285
0
      vacc += vi6 * vk6;
16286
0
      const int32_t vi7 = (int32_t) *i7;
16287
0
      const int32_t vk7 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[14];
16288
0
      vacc += vi7 * vk7;
16289
0
      const int32_t vi8 = (int32_t) *i8;
16290
0
      const int32_t vk8 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[16];
16291
0
      vacc += vi8 * vk8;
16292
16293
0
      float vfpacc = (float) vacc * vscale;
16294
16295
0
      vfpacc += vmagic_bias;
16296
0
      int32_t vout = (int32_t) float_as_uint32(vfpacc);
16297
0
      vout = math_max_s32(vout, vmagic_min);
16298
0
      vout = math_min_s32(vout, vmagic_max);
16299
0
      vout -= vmagic_bias_less_zero_point;
16300
16301
0
      *output++ = (int8_t) vout;
16302
0
    }
16303
16304
0
    output = (int8_t*) ((uintptr_t) output + output_increment);
16305
0
  } while (--output_width != 0);
16306
0
}
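// A minimal sketch of the "imagic" requantization used by
// xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic above: the magic
// bias is added before any clamping, the float is reinterpreted as an
// integer, and the min/max clamp is then applied in the integer domain
// against precomputed magic_min/magic_max. The helper name is illustrative
// only, not an XNNPACK entry point.
static inline int8_t requantize_scalar_imagic(
    int32_t acc,
    float scale,
    float magic_bias,
    int32_t magic_min,
    int32_t magic_max,
    int32_t magic_bias_less_zero_point)
{
  float fpacc = (float) acc * scale;
  fpacc += magic_bias;
  int32_t out = (int32_t) float_as_uint32(fpacc);
  out = math_max_s32(out, magic_min);
  out = math_min_s32(out, magic_max);
  return (int8_t) (out - magic_bias_less_zero_point);
}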
16307
16308
void xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf(
16309
    size_t channels,
16310
    size_t output_width,
16311
    const int8_t** input,
16312
    const void* weights,
16313
    int8_t* output,
16314
    intptr_t input_stride,
16315
    size_t output_increment,
16316
    size_t input_offset,
16317
    const int8_t* zero,
16318
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
16319
0
{
16320
0
  assert(channels != 0);
16321
0
  assert(output_width != 0);
16322
16323
0
  const float vscale = params->fp32_scalar_lrintf.scale;
16324
0
  const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
16325
0
  const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
16326
0
  const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
16327
0
  do {
16328
0
    const int8_t* i0 = input[0];
16329
0
    assert(i0 != NULL);
16330
0
    if XNN_UNPREDICTABLE(i0 != zero) {
16331
0
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
16332
0
    }
16333
0
    const int8_t* i1 = input[1];
16334
0
    assert(i1 != NULL);
16335
0
    if XNN_UNPREDICTABLE(i1 != zero) {
16336
0
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
16337
0
    }
16338
0
    const int8_t* i2 = input[2];
16339
0
    assert(i2 != NULL);
16340
0
    if XNN_UNPREDICTABLE(i2 != zero) {
16341
0
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
16342
0
    }
16343
0
    const int8_t* i3 = input[3];
16344
0
    assert(i3 != NULL);
16345
0
    if XNN_UNPREDICTABLE(i3 != zero) {
16346
0
      i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
16347
0
    }
16348
0
    const int8_t* i4 = input[4];
16349
0
    assert(i4 != NULL);
16350
0
    if XNN_UNPREDICTABLE(i4 != zero) {
16351
0
      i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
16352
0
    }
16353
0
    const int8_t* i5 = input[5];
16354
0
    assert(i5 != NULL);
16355
0
    if XNN_UNPREDICTABLE(i5 != zero) {
16356
0
      i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
16357
0
    }
16358
0
    const int8_t* i6 = input[6];
16359
0
    assert(i6 != NULL);
16360
0
    if XNN_UNPREDICTABLE(i6 != zero) {
16361
0
      i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
16362
0
    }
16363
0
    const int8_t* i7 = input[7];
16364
0
    assert(i7 != NULL);
16365
0
    if XNN_UNPREDICTABLE(i7 != zero) {
16366
0
      i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
16367
0
    }
16368
0
    const int8_t* i8 = input[8];
16369
0
    assert(i8 != NULL);
16370
0
    if XNN_UNPREDICTABLE(i8 != zero) {
16371
0
      i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
16372
0
    }
16373
0
    input = (const int8_t**) ((uintptr_t) input + input_stride);
16374
16375
0
    size_t c = channels;
16376
0
    const void* w = weights;
16377
0
    for (; c >= 2; c -= 2) {
16378
0
      int32_t vacc0 = unaligned_indexed_load_s32(w, 0);
16379
0
      int32_t vacc1 = unaligned_indexed_load_s32(w, 1);
16380
16381
16382
0
      const int32_t vi0x0 = (int32_t) i0[0];
16383
0
      const int32_t vi0x1 = (int32_t) i0[1];
16384
0
      i0 += 2;
16385
16386
0
      const int32_t vk0x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[0];
16387
0
      const int32_t vk0x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[1];
16388
16389
0
      vacc0 += vi0x0 * vk0x0;
16390
0
      vacc1 += vi0x1 * vk0x1;
16391
16392
0
      const int32_t vi1x0 = (int32_t) i1[0];
16393
0
      const int32_t vi1x1 = (int32_t) i1[1];
16394
0
      i1 += 2;
16395
16396
0
      const int32_t vk1x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[2];
16397
0
      const int32_t vk1x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[3];
16398
16399
0
      vacc0 += vi1x0 * vk1x0;
16400
0
      vacc1 += vi1x1 * vk1x1;
16401
16402
0
      const int32_t vi2x0 = (int32_t) i2[0];
16403
0
      const int32_t vi2x1 = (int32_t) i2[1];
16404
0
      i2 += 2;
16405
16406
0
      const int32_t vk2x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[4];
16407
0
      const int32_t vk2x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[5];
16408
16409
0
      vacc0 += vi2x0 * vk2x0;
16410
0
      vacc1 += vi2x1 * vk2x1;
16411
16412
0
      const int32_t vi3x0 = (int32_t) i3[0];
16413
0
      const int32_t vi3x1 = (int32_t) i3[1];
16414
0
      i3 += 2;
16415
16416
0
      const int32_t vk3x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[6];
16417
0
      const int32_t vk3x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[7];
16418
16419
0
      vacc0 += vi3x0 * vk3x0;
16420
0
      vacc1 += vi3x1 * vk3x1;
16421
16422
0
      const int32_t vi4x0 = (int32_t) i4[0];
16423
0
      const int32_t vi4x1 = (int32_t) i4[1];
16424
0
      i4 += 2;
16425
16426
0
      const int32_t vk4x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[8];
16427
0
      const int32_t vk4x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[9];
16428
16429
0
      vacc0 += vi4x0 * vk4x0;
16430
0
      vacc1 += vi4x1 * vk4x1;
16431
16432
0
      const int32_t vi5x0 = (int32_t) i5[0];
16433
0
      const int32_t vi5x1 = (int32_t) i5[1];
16434
0
      i5 += 2;
16435
16436
0
      const int32_t vk5x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[10];
16437
0
      const int32_t vk5x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[11];
16438
16439
0
      vacc0 += vi5x0 * vk5x0;
16440
0
      vacc1 += vi5x1 * vk5x1;
16441
16442
0
      const int32_t vi6x0 = (int32_t) i6[0];
16443
0
      const int32_t vi6x1 = (int32_t) i6[1];
16444
0
      i6 += 2;
16445
16446
0
      const int32_t vk6x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[12];
16447
0
      const int32_t vk6x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[13];
16448
16449
0
      vacc0 += vi6x0 * vk6x0;
16450
0
      vacc1 += vi6x1 * vk6x1;
16451
16452
0
      const int32_t vi7x0 = (int32_t) i7[0];
16453
0
      const int32_t vi7x1 = (int32_t) i7[1];
16454
0
      i7 += 2;
16455
16456
0
      const int32_t vk7x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[14];
16457
0
      const int32_t vk7x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[15];
16458
16459
0
      vacc0 += vi7x0 * vk7x0;
16460
0
      vacc1 += vi7x1 * vk7x1;
16461
16462
0
      const int32_t vi8x0 = (int32_t) i8[0];
16463
0
      const int32_t vi8x1 = (int32_t) i8[1];
16464
0
      i8 += 2;
16465
16466
0
      const int32_t vk8x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[16];
16467
0
      const int32_t vk8x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[17];
16468
16469
0
      vacc0 += vi8x0 * vk8x0;
16470
0
      vacc1 += vi8x1 * vk8x1;
16471
16472
0
      w = (const void*) ((uintptr_t) w + 2 * sizeof(int32_t) + 18 * sizeof(int8_t));
16473
16474
0
      float vfpacc0 = (float) vacc0;
16475
0
      float vfpacc1 = (float) vacc1;
16476
16477
0
      vfpacc0 *= vscale;
16478
0
      vfpacc1 *= vscale;
16479
16480
0
      vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point);
16481
0
      vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point);
16482
16483
0
      vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point);
16484
0
      vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point);
16485
16486
0
      const int32_t vrndacc0 = (int32_t) lrintf(vfpacc0);
16487
0
      const int32_t vrndacc1 = (int32_t) lrintf(vfpacc1);
16488
16489
0
      int32_t vout0 = (int32_t) vrndacc0 + voutput_zero_point;
16490
0
      int32_t vout1 = (int32_t) vrndacc1 + voutput_zero_point;
16491
16492
0
      output[0] = (int8_t) vout0;
16493
0
      output[1] = (int8_t) vout1;
16494
0
      output += 2;
16495
0
    }
16496
0
    if XNN_UNLIKELY(c != 0) {
16497
0
      int32_t vacc = unaligned_load_s32(w);
16498
16499
0
      const int32_t vi0 = (int32_t) *i0;
16500
0
      const int32_t vk0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[0];
16501
0
      vacc += vi0 * vk0;
16502
0
      const int32_t vi1 = (int32_t) *i1;
16503
0
      const int32_t vk1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[2];
16504
0
      vacc += vi1 * vk1;
16505
0
      const int32_t vi2 = (int32_t) *i2;
16506
0
      const int32_t vk2 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[4];
16507
0
      vacc += vi2 * vk2;
16508
0
      const int32_t vi3 = (int32_t) *i3;
16509
0
      const int32_t vk3 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[6];
16510
0
      vacc += vi3 * vk3;
16511
0
      const int32_t vi4 = (int32_t) *i4;
16512
0
      const int32_t vk4 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[8];
16513
0
      vacc += vi4 * vk4;
16514
0
      const int32_t vi5 = (int32_t) *i5;
16515
0
      const int32_t vk5 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[10];
16516
0
      vacc += vi5 * vk5;
16517
0
      const int32_t vi6 = (int32_t) *i6;
16518
0
      const int32_t vk6 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[12];
16519
0
      vacc += vi6 * vk6;
16520
0
      const int32_t vi7 = (int32_t) *i7;
16521
0
      const int32_t vk7 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[14];
16522
0
      vacc += vi7 * vk7;
16523
0
      const int32_t vi8 = (int32_t) *i8;
16524
0
      const int32_t vk8 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[16];
16525
0
      vacc += vi8 * vk8;
16526
16527
0
      float vfpacc = (float) vacc * vscale;
16528
16529
0
      vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
16530
0
      vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
16531
0
      const int32_t vrndacc = (int32_t) lrintf(vfpacc);
16532
0
      int32_t vout = vrndacc + voutput_zero_point;
16533
16534
0
      *output++ = (int8_t) vout;
16535
0
    }
16536
16537
0
    output = (int8_t*) ((uintptr_t) output + output_increment);
16538
0
  } while (--output_width != 0);
16539
0
}
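// Sketch of the packed per-tile weight layout that the 9p2c kernels above
// walk with raw pointer arithmetic: each 2-channel tile holds two int32
// biases followed by 9 taps x 2 channels of int8 weights interleaved per
// tap, 2 * sizeof(int32_t) + 18 * sizeof(int8_t) = 26 bytes with no padding,
// which is why the kernels read through unaligned_load_s32()/
// unaligned_indexed_load_s32() and byte offsets rather than a struct.
// load_packed_k is an illustrative helper, not an XNNPACK function.
static inline int32_t load_packed_k(const void* w, size_t tap, size_t channel) {
  const int8_t* k = (const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t));
  return (int32_t) k[2 * tap + channel];  // e.g. tap 8, channel 1 -> index 17
}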
16540
16541
void xnn_qs8_f32_vcvt_ukernel__scalar_u1(
16542
    size_t batch,
16543
    const int8_t* input,
16544
    float* output,
16545
    const union xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
16546
0
{
16547
0
  assert(batch != 0);
16548
0
  assert(batch % sizeof(int8_t) == 0);
16549
0
  assert(input != NULL);
16550
0
  assert(output != NULL);
16551
16552
0
  const int32_t vzero_point = params->scalar.zero_point;
16553
0
  const float vscale = params->scalar.scale;
16554
16555
0
  do {
16556
0
    int32_t vx = *input++;
16557
0
    vx -= vzero_point;
16558
16559
0
    float vy = (float) vx;
16560
0
    vy *= vscale;
16561
0
    *output++ = vy;
16562
16563
0
    batch -= sizeof(int8_t);
16564
0
  } while (batch != 0);
16565
0
}
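// The QS8 -> F32 conversion above computes y = (x - zero_point) * scale for
// each element. A one-element sketch under that reading; dequantize_qs8 is
// an illustrative name, not part of the XNNPACK API.
static inline float dequantize_qs8(int8_t x, int32_t zero_point, float scale) {
  return (float) ((int32_t) x - zero_point) * scale;
}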
16566
16567
void xnn_qs8_f32_vcvt_ukernel__scalar_u4(
16568
    size_t batch,
16569
    const int8_t* input,
16570
    float* output,
16571
    const union xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
16572
0
{
16573
0
  assert(batch != 0);
16574
0
  assert(batch % sizeof(int8_t) == 0);
16575
0
  assert(input != NULL);
16576
0
  assert(output != NULL);
16577
16578
0
  const int32_t vzero_point = params->scalar.zero_point;
16579
0
  const float vscale = params->scalar.scale;
16580
16581
0
  for (; batch >= 4 * sizeof(int8_t); batch -= 4 * sizeof(int8_t)) {
16582
0
    int32_t vx0 = (int32_t) input[0];
16583
0
    int32_t vx1 = (int32_t) input[1];
16584
0
    int32_t vx2 = (int32_t) input[2];
16585
0
    int32_t vx3 = (int32_t) input[3];
16586
0
    input += 4;
16587
16588
0
    vx0 -= vzero_point;
16589
0
    vx1 -= vzero_point;
16590
0
    vx2 -= vzero_point;
16591
0
    vx3 -= vzero_point;
16592
16593
0
    float vy0 = (float) vx0;
16594
0
    float vy1 = (float) vx1;
16595
0
    float vy2 = (float) vx2;
16596
0
    float vy3 = (float) vx3;
16597
16598
0
    vy0 *= vscale;
16599
0
    vy1 *= vscale;
16600
0
    vy2 *= vscale;
16601
0
    vy3 *= vscale;
16602
16603
0
    output[0] = vy0;
16604
0
    output[1] = vy1;
16605
0
    output[2] = vy2;
16606
0
    output[3] = vy3;
16607
0
    output += 4;
16608
0
  }
16609
0
  if XNN_UNLIKELY(batch != 0) {
16610
0
    do {
16611
0
      int32_t vx = *input++;
16612
0
      vx -= vzero_point;
16613
16614
0
      float vy = (float) vx;
16615
0
      vy *= vscale;
16616
0
      *output++ = vy;
16617
16618
0
      batch -= sizeof(int8_t);
16619
0
    } while (batch != 0);
16620
0
  }
16621
0
}
16622
16623
void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1(
16624
    size_t rows,
16625
    size_t channels,
16626
    const int8_t* input,
16627
    size_t input_stride,
16628
    const int8_t* zero,
16629
    int32_t* buffer,
16630
    int8_t* output,
16631
    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
16632
0
{
16633
0
  assert(rows > 7);
16634
0
  assert(channels != 0);
16635
16636
0
  const int8_t* i0 = input;
16637
0
  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
16638
0
  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
16639
0
  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
16640
0
  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
16641
0
  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
16642
0
  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
16643
0
  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1) * sizeof(int8_t);
16644
16645
0
  const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
16646
0
  int32_t* b = buffer;
16647
0
  size_t c = channels;
16648
0
  do {
16649
0
    int32_t vacc = vinit_bias;
16650
0
    const int32_t vi0 = (int32_t) *i0++;
16651
0
    const int32_t vi1 = (int32_t) *i1++;
16652
16653
0
    vacc += vi0;
16654
0
    const int32_t vi2 = (int32_t) *i2++;
16655
0
    vacc += vi1;
16656
0
    const int32_t vi3 = (int32_t) *i3++;
16657
0
    vacc += vi2;
16658
0
    const int32_t vi4 = (int32_t) *i4++;
16659
0
    vacc += vi3;
16660
0
    const int32_t vi5 = (int32_t) *i5++;
16661
0
    vacc += vi4;
16662
0
    const int32_t vi6 = (int32_t) *i6++;
16663
16664
0
    vacc += vi5;
16665
0
    vacc += vi6;
16666
16667
0
    *b++ = vacc;
16668
0
  } while (--c != 0);
16669
16670
0
  for (rows -= 7; rows > 7; rows -= 7) {
16671
0
    i0 = (const int8_t*) ((uintptr_t) i0 + input_increment);
16672
0
    i1 = (const int8_t*) ((uintptr_t) i1 + input_increment);
16673
0
    i2 = (const int8_t*) ((uintptr_t) i2 + input_increment);
16674
0
    i3 = (const int8_t*) ((uintptr_t) i3 + input_increment);
16675
0
    i4 = (const int8_t*) ((uintptr_t) i4 + input_increment);
16676
0
    i5 = (const int8_t*) ((uintptr_t) i5 + input_increment);
16677
0
    i6 = (const int8_t*) ((uintptr_t) i6 + input_increment);
16678
16679
0
    int32_t* b = buffer;
16680
0
    size_t c = channels;
16681
0
    do {
16682
0
      int32_t vacc = *b;
16683
0
      const int32_t vi0 = (int32_t) *i0++;
16684
0
      const int32_t vi1 = (int32_t) *i1++;
16685
16686
0
      vacc += vi0;
16687
0
      const int32_t vi2 = (int32_t) *i2++;
16688
0
      vacc += vi1;
16689
0
      const int32_t vi3 = (int32_t) *i3++;
16690
0
      vacc += vi2;
16691
0
      const int32_t vi4 = (int32_t) *i4++;
16692
0
      vacc += vi3;
16693
0
      const int32_t vi5 = (int32_t) *i5++;
16694
0
      vacc += vi4;
16695
0
      const int32_t vi6 = (int32_t) *i6++;
16696
16697
0
      vacc += vi5;
16698
0
      vacc += vi6;
16699
16700
0
      *b++ = vacc;
16701
0
    } while (--c != 0);
16702
0
  }
16703
16704
0
  i0 = (const int8_t*) ((uintptr_t) i0 + input_increment);
16705
0
  i1 = (const int8_t*) ((uintptr_t) i1 + input_increment);
16706
0
  if XNN_UNPREDICTABLE(rows < 2) {
16707
0
    i1 = zero;
16708
0
  }
16709
0
  i2 = (const int8_t*) ((uintptr_t) i2 + input_increment);
16710
0
  if XNN_UNPREDICTABLE(rows <= 2) {
16711
0
    i2 = zero;
16712
0
  }
16713
0
  i3 = (const int8_t*) ((uintptr_t) i3 + input_increment);
16714
0
  if XNN_UNPREDICTABLE(rows < 4) {
16715
0
    i3 = zero;
16716
0
  }
16717
0
  i4 = (const int8_t*) ((uintptr_t) i4 + input_increment);
16718
0
  if XNN_UNPREDICTABLE(rows <= 4) {
16719
0
    i4 = zero;
16720
0
  }
16721
0
  i5 = (const int8_t*) ((uintptr_t) i5 + input_increment);
16722
0
  if XNN_UNPREDICTABLE(rows < 6) {
16723
0
    i5 = zero;
16724
0
  }
16725
0
  i6 = (const int8_t*) ((uintptr_t) i6 + input_increment);
16726
0
  if XNN_UNPREDICTABLE(rows <= 6) {
16727
0
    i6 = zero;
16728
0
  }
16729
16730
0
  const float vscale = params->fp32_scalar_imagic.scale;
16731
0
  const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
16732
0
  const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
16733
0
  const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
16734
0
  const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
16735
0
  do {
16736
0
    int32_t vacc = *buffer++;
16737
0
    const int32_t vi0 = (int32_t) *i0++;
16738
0
    const int32_t vi1 = (int32_t) *i1++;
16739
16740
0
    vacc += vi0;
16741
0
    const int32_t vi2 = (int32_t) *i2++;
16742
0
    vacc += vi1;
16743
0
    const int32_t vi3 = (int32_t) *i3++;
16744
0
    vacc += vi2;
16745
0
    const int32_t vi4 = (int32_t) *i4++;
16746
0
    vacc += vi3;
16747
0
    const int32_t vi5 = (int32_t) *i5++;
16748
0
    vacc += vi4;
16749
0
    const int32_t vi6 = (int32_t) *i6++;
16750
16751
0
    vacc += vi5;
16752
0
    vacc += vi6;
16753
16754
0
    float vfpacc = (float) vacc * vscale;
16755
0
    vfpacc += vmagic_bias;
16756
0
    int32_t vout = (int32_t) float_as_uint32(vfpacc);
16757
0
    vout = math_max_s32(vout, vmagic_min);
16758
0
    vout = math_min_s32(vout, vmagic_max);
16759
0
    vout -= vmagic_bias_less_zero_point;
16760
16761
0
    *output++ = (int8_t) vout;
16762
0
  } while (--channels != 0);
16763
0
}
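// Sketch of the multipass accumulation pattern used by the 7p7x global
// average pooling kernels: the first pass seeds each channel with init_bias
// plus seven rows, intermediate passes add seven more rows into the same
// int32 buffer, and only the final pass applies the imagic requantization.
// accumulate_7_rows is an illustrative helper, not part of XNNPACK.
static inline int32_t accumulate_7_rows(
    int32_t acc,
    const int8_t* i0, const int8_t* i1, const int8_t* i2, const int8_t* i3,
    const int8_t* i4, const int8_t* i5, const int8_t* i6,
    size_t c)
{
  acc += (int32_t) i0[c] + (int32_t) i1[c] + (int32_t) i2[c];
  acc += (int32_t) i3[c] + (int32_t) i4[c] + (int32_t) i5[c] + (int32_t) i6[c];
  return acc;
}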
16764
16765
void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4(
16766
    size_t rows,
16767
    size_t channels,
16768
    const int8_t* input,
16769
    size_t input_stride,
16770
    const int8_t* zero,
16771
    int32_t* buffer,
16772
    int8_t* output,
16773
    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
16774
0
{
16775
0
  assert(rows > 7);
16776
0
  assert(channels != 0);
16777
16778
0
  const int8_t* i0 = input;
16779
0
  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
16780
0
  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
16781
0
  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
16782
0
  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
16783
0
  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
16784
0
  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
16785
0
  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4) * sizeof(int8_t);
16786
16787
0
  const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
16788
0
  int32_t* b = buffer;
16789
0
  for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) {
16790
0
    const int32_t vi0x0 = (int32_t) i0[0];
16791
0
    const int32_t vi0x1 = (int32_t) i0[1];
16792
0
    const int32_t vi0x2 = (int32_t) i0[2];
16793
0
    const int32_t vi0x3 = (int32_t) i0[3];
16794
0
    i0 += 4;
16795
16796
0
    int32_t vacc0 = vi0x0 + vinit_bias;
16797
0
    const int32_t vi1x0 = (int32_t) i1[0];
16798
0
    int32_t vacc1 = vi0x1 + vinit_bias;
16799
0
    const int32_t vi1x1 = (int32_t) i1[1];
16800
0
    int32_t vacc2 = vi0x2 + vinit_bias;
16801
0
    const int32_t vi1x2 = (int32_t) i1[2];
16802
0
    int32_t vacc3 = vi0x3 + vinit_bias;
16803
0
    const int32_t vi1x3 = (int32_t) i1[3];
16804
0
    i1 += 4;
16805
16806
0
    vacc0 += vi1x0;
16807
0
    const int32_t vi2x0 = (int32_t) i2[0];
16808
0
    vacc1 += vi1x1;
16809
0
    const int32_t vi2x1 = (int32_t) i2[1];
16810
0
    vacc2 += vi1x2;
16811
0
    const int32_t vi2x2 = (int32_t) i2[2];
16812
0
    vacc3 += vi1x3;
16813
0
    const int32_t vi2x3 = (int32_t) i2[3];
16814
0
    i2 += 4;
16815
0
    vacc0 += vi2x0;
16816
0
    const int32_t vi3x0 = (int32_t) i3[0];
16817
0
    vacc1 += vi2x1;
16818
0
    const int32_t vi3x1 = (int32_t) i3[1];
16819
0
    vacc2 += vi2x2;
16820
0
    const int32_t vi3x2 = (int32_t) i3[2];
16821
0
    vacc3 += vi2x3;
16822
0
    const int32_t vi3x3 = (int32_t) i3[3];
16823
0
    i3 += 4;
16824
0
    vacc0 += vi3x0;
16825
0
    const int32_t vi4x0 = (int32_t) i4[0];
16826
0
    vacc1 += vi3x1;
16827
0
    const int32_t vi4x1 = (int32_t) i4[1];
16828
0
    vacc2 += vi3x2;
16829
0
    const int32_t vi4x2 = (int32_t) i4[2];
16830
0
    vacc3 += vi3x3;
16831
0
    const int32_t vi4x3 = (int32_t) i4[3];
16832
0
    i4 += 4;
16833
0
    vacc0 += vi4x0;
16834
0
    const int32_t vi5x0 = (int32_t) i5[0];
16835
0
    vacc1 += vi4x1;
16836
0
    const int32_t vi5x1 = (int32_t) i5[1];
16837
0
    vacc2 += vi4x2;
16838
0
    const int32_t vi5x2 = (int32_t) i5[2];
16839
0
    vacc3 += vi4x3;
16840
0
    const int32_t vi5x3 = (int32_t) i5[3];
16841
0
    i5 += 4;
16842
0
    vacc0 += vi5x0;
16843
0
    const int32_t vi6x0 = (int32_t) i6[0];
16844
0
    vacc1 += vi5x1;
16845
0
    const int32_t vi6x1 = (int32_t) i6[1];
16846
0
    vacc2 += vi5x2;
16847
0
    const int32_t vi6x2 = (int32_t) i6[2];
16848
0
    vacc3 += vi5x3;
16849
0
    const int32_t vi6x3 = (int32_t) i6[3];
16850
0
    i6 += 4;
16851
16852
0
    vacc0 += vi6x0;
16853
0
    vacc1 += vi6x1;
16854
0
    vacc2 += vi6x2;
16855
0
    vacc3 += vi6x3;
16856
16857
0
    b[0] = vacc0;
16858
0
    b[1] = vacc1;
16859
0
    b[2] = vacc2;
16860
0
    b[3] = vacc3;
16861
0
    b += 4;
16862
0
  }
16863
16864
0
  for (rows -= 7; rows > 7; rows -= 7) {
16865
0
    i0 = (const int8_t*) ((uintptr_t) i0 + input_increment);
16866
0
    i1 = (const int8_t*) ((uintptr_t) i1 + input_increment);
16867
0
    i2 = (const int8_t*) ((uintptr_t) i2 + input_increment);
16868
0
    i3 = (const int8_t*) ((uintptr_t) i3 + input_increment);
16869
0
    i4 = (const int8_t*) ((uintptr_t) i4 + input_increment);
16870
0
    i5 = (const int8_t*) ((uintptr_t) i5 + input_increment);
16871
0
    i6 = (const int8_t*) ((uintptr_t) i6 + input_increment);
16872
16873
0
    int32_t* b = buffer;
16874
0
    for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) {
16875
0
      int32_t vacc0 = b[0];
16876
0
      const int32_t vi0x0 = (int32_t) i0[0];
16877
0
      int32_t vacc1 = b[1];
16878
0
      const int32_t vi0x1 = (int32_t) i0[1];
16879
0
      int32_t vacc2 = b[2];
16880
0
      const int32_t vi0x2 = (int32_t) i0[2];
16881
0
      int32_t vacc3 = b[3];
16882
0
      const int32_t vi0x3 = (int32_t) i0[3];
16883
0
      i0 += 4;
16884
16885
0
      vacc0 += vi0x0;
16886
0
      const int32_t vi1x0 = (int32_t) i1[0];
16887
0
      vacc1 += vi0x1;
16888
0
      const int32_t vi1x1 = (int32_t) i1[1];
16889
0
      vacc2 += vi0x2;
16890
0
      const int32_t vi1x2 = (int32_t) i1[2];
16891
0
      vacc3 += vi0x3;
16892
0
      const int32_t vi1x3 = (int32_t) i1[3];
16893
0
      i1 += 4;
16894
0
      vacc0 += vi1x0;
16895
0
      const int32_t vi2x0 = (int32_t) i2[0];
16896
0
      vacc1 += vi1x1;
16897
0
      const int32_t vi2x1 = (int32_t) i2[1];
16898
0
      vacc2 += vi1x2;
16899
0
      const int32_t vi2x2 = (int32_t) i2[2];
16900
0
      vacc3 += vi1x3;
16901
0
      const int32_t vi2x3 = (int32_t) i2[3];
16902
0
      i2 += 4;
16903
0
      vacc0 += vi2x0;
16904
0
      const int32_t vi3x0 = (int32_t) i3[0];
16905
0
      vacc1 += vi2x1;
16906
0
      const int32_t vi3x1 = (int32_t) i3[1];
16907
0
      vacc2 += vi2x2;
16908
0
      const int32_t vi3x2 = (int32_t) i3[2];
16909
0
      vacc3 += vi2x3;
16910
0
      const int32_t vi3x3 = (int32_t) i3[3];
16911
0
      i3 += 4;
16912
0
      vacc0 += vi3x0;
16913
0
      const int32_t vi4x0 = (int32_t) i4[0];
16914
0
      vacc1 += vi3x1;
16915
0
      const int32_t vi4x1 = (int32_t) i4[1];
16916
0
      vacc2 += vi3x2;
16917
0
      const int32_t vi4x2 = (int32_t) i4[2];
16918
0
      vacc3 += vi3x3;
16919
0
      const int32_t vi4x3 = (int32_t) i4[3];
16920
0
      i4 += 4;
16921
0
      vacc0 += vi4x0;
16922
0
      const int32_t vi5x0 = (int32_t) i5[0];
16923
0
      vacc1 += vi4x1;
16924
0
      const int32_t vi5x1 = (int32_t) i5[1];
16925
0
      vacc2 += vi4x2;
16926
0
      const int32_t vi5x2 = (int32_t) i5[2];
16927
0
      vacc3 += vi4x3;
16928
0
      const int32_t vi5x3 = (int32_t) i5[3];
16929
0
      i5 += 4;
16930
0
      vacc0 += vi5x0;
16931
0
      const int32_t vi6x0 = (int32_t) i6[0];
16932
0
      vacc1 += vi5x1;
16933
0
      const int32_t vi6x1 = (int32_t) i6[1];
16934
0
      vacc2 += vi5x2;
16935
0
      const int32_t vi6x2 = (int32_t) i6[2];
16936
0
      vacc3 += vi5x3;
16937
0
      const int32_t vi6x3 = (int32_t) i6[3];
16938
0
      i6 += 4;
16939
16940
0
      vacc0 += vi6x0;
16941
0
      vacc1 += vi6x1;
16942
0
      vacc2 += vi6x2;
16943
0
      vacc3 += vi6x3;
16944
16945
0
      b[0] = vacc0;
16946
0
      b[1] = vacc1;
16947
0
      b[2] = vacc2;
16948
0
      b[3] = vacc3;
16949
0
      b += 4;
16950
0
    }
16951
0
  }
16952
16953
0
  i0 = (const int8_t*) ((uintptr_t) i0 + input_increment);
16954
0
  i1 = (const int8_t*) ((uintptr_t) i1 + input_increment);
16955
0
  if XNN_UNPREDICTABLE(rows < 2) {
16956
0
    i1 = zero;
16957
0
  }
16958
0
  i2 = (const int8_t*) ((uintptr_t) i2 + input_increment);
16959
0
  if XNN_UNPREDICTABLE(rows <= 2) {
16960
0
    i2 = zero;
16961
0
  }
16962
0
  i3 = (const int8_t*) ((uintptr_t) i3 + input_increment);
16963
0
  if XNN_UNPREDICTABLE(rows < 4) {
16964
0
    i3 = zero;
16965
0
  }
16966
0
  i4 = (const int8_t*) ((uintptr_t) i4 + input_increment);
16967
0
  if XNN_UNPREDICTABLE(rows <= 4) {
16968
0
    i4 = zero;
16969
0
  }
16970
0
  i5 = (const int8_t*) ((uintptr_t) i5 + input_increment);
16971
0
  if XNN_UNPREDICTABLE(rows < 6) {
16972
0
    i5 = zero;
16973
0
  }
16974
0
  i6 = (const int8_t*) ((uintptr_t) i6 + input_increment);
16975
0
  if XNN_UNPREDICTABLE(rows <= 6) {
16976
0
    i6 = zero;
16977
0
  }
16978
16979
0
  const float vscale = params->fp32_scalar_imagic.scale;
16980
0
  const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
16981
0
  const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
16982
0
  const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
16983
0
  const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
16984
0
  for (; channels >= 4; channels -= 4) {
16985
0
    int32_t vacc0 = buffer[0];
16986
0
    const int32_t vi0x0 = (int32_t) i0[0];
16987
0
    int32_t vacc1 = buffer[1];
16988
0
    const int32_t vi0x1 = (int32_t) i0[1];
16989
0
    int32_t vacc2 = buffer[2];
16990
0
    const int32_t vi0x2 = (int32_t) i0[2];
16991
0
    int32_t vacc3 = buffer[3];
16992
0
    const int32_t vi0x3 = (int32_t) i0[3];
16993
0
    buffer += 4;
16994
0
    i0 += 4;
16995
16996
0
    vacc0 += vi0x0;
16997
0
    const int32_t vi1x0 = (int32_t) i1[0];
16998
0
    vacc1 += vi0x1;
16999
0
    const int32_t vi1x1 = (int32_t) i1[1];
17000
0
    vacc2 += vi0x2;
17001
0
    const int32_t vi1x2 = (int32_t) i1[2];
17002
0
    vacc3 += vi0x3;
17003
0
    const int32_t vi1x3 = (int32_t) i1[3];
17004
0
    i1 += 4;
17005
0
    vacc0 += vi1x0;
17006
0
    const int32_t vi2x0 = (int32_t) i2[0];
17007
0
    vacc1 += vi1x1;
17008
0
    const int32_t vi2x1 = (int32_t) i2[1];
17009
0
    vacc2 += vi1x2;
17010
0
    const int32_t vi2x2 = (int32_t) i2[2];
17011
0
    vacc3 += vi1x3;
17012
0
    const int32_t vi2x3 = (int32_t) i2[3];
17013
0
    i2 += 4;
17014
0
    vacc0 += vi2x0;
17015
0
    const int32_t vi3x0 = (int32_t) i3[0];
17016
0
    vacc1 += vi2x1;
17017
0
    const int32_t vi3x1 = (int32_t) i3[1];
17018
0
    vacc2 += vi2x2;
17019
0
    const int32_t vi3x2 = (int32_t) i3[2];
17020
0
    vacc3 += vi2x3;
17021
0
    const int32_t vi3x3 = (int32_t) i3[3];
17022
0
    i3 += 4;
17023
0
    vacc0 += vi3x0;
17024
0
    const int32_t vi4x0 = (int32_t) i4[0];
17025
0
    vacc1 += vi3x1;
17026
0
    const int32_t vi4x1 = (int32_t) i4[1];
17027
0
    vacc2 += vi3x2;
17028
0
    const int32_t vi4x2 = (int32_t) i4[2];
17029
0
    vacc3 += vi3x3;
17030
0
    const int32_t vi4x3 = (int32_t) i4[3];
17031
0
    i4 += 4;
17032
0
    vacc0 += vi4x0;
17033
0
    const int32_t vi5x0 = (int32_t) i5[0];
17034
0
    vacc1 += vi4x1;
17035
0
    const int32_t vi5x1 = (int32_t) i5[1];
17036
0
    vacc2 += vi4x2;
17037
0
    const int32_t vi5x2 = (int32_t) i5[2];
17038
0
    vacc3 += vi4x3;
17039
0
    const int32_t vi5x3 = (int32_t) i5[3];
17040
0
    i5 += 4;
17041
0
    vacc0 += vi5x0;
17042
0
    const int32_t vi6x0 = (int32_t) i6[0];
17043
0
    vacc1 += vi5x1;
17044
0
    const int32_t vi6x1 = (int32_t) i6[1];
17045
0
    vacc2 += vi5x2;
17046
0
    const int32_t vi6x2 = (int32_t) i6[2];
17047
0
    vacc3 += vi5x3;
17048
0
    const int32_t vi6x3 = (int32_t) i6[3];
17049
0
    i6 += 4;
17050
17051
0
    vacc0 += vi6x0;
17052
0
    vacc1 += vi6x1;
17053
0
    vacc2 += vi6x2;
17054
0
    vacc3 += vi6x3;
17055
17056
0
    float vfpacc0 = (float) vacc0 * vscale;
17057
0
    float vfpacc1 = (float) vacc1 * vscale;
17058
0
    float vfpacc2 = (float) vacc2 * vscale;
17059
0
    float vfpacc3 = (float) vacc3 * vscale;
17060
17061
0
    vfpacc0 += vmagic_bias;
17062
0
    vfpacc1 += vmagic_bias;
17063
0
    vfpacc2 += vmagic_bias;
17064
0
    vfpacc3 += vmagic_bias;
17065
17066
0
    int32_t vout0 = (int32_t) float_as_uint32(vfpacc0);
17067
0
    int32_t vout1 = (int32_t) float_as_uint32(vfpacc1);
17068
0
    int32_t vout2 = (int32_t) float_as_uint32(vfpacc2);
17069
0
    int32_t vout3 = (int32_t) float_as_uint32(vfpacc3);
17070
17071
0
    vout0 = math_max_s32(vout0, vmagic_min);
17072
0
    vout1 = math_max_s32(vout1, vmagic_min);
17073
0
    vout2 = math_max_s32(vout2, vmagic_min);
17074
0
    vout3 = math_max_s32(vout3, vmagic_min);
17075
17076
0
    vout0 = math_min_s32(vout0, vmagic_max);
17077
0
    vout1 = math_min_s32(vout1, vmagic_max);
17078
0
    vout2 = math_min_s32(vout2, vmagic_max);
17079
0
    vout3 = math_min_s32(vout3, vmagic_max);
17080
17081
0
    vout0 -= vmagic_bias_less_zero_point;
17082
0
    vout1 -= vmagic_bias_less_zero_point;
17083
0
    vout2 -= vmagic_bias_less_zero_point;
17084
0
    vout3 -= vmagic_bias_less_zero_point;
17085
17086
0
    output[0] = (int8_t) vout0;
17087
0
    output[1] = (int8_t) vout1;
17088
0
    output[2] = (int8_t) vout2;
17089
0
    output[3] = (int8_t) vout3;
17090
0
    output += 4;
17091
0
  }
17092
0
  if XNN_UNLIKELY(channels != 0) {
17093
0
    do {
17094
0
      int32_t vacc = *buffer++;
17095
0
      const int32_t vi0 = (int32_t) *i0++;
17096
0
      const int32_t vi1 = (int32_t) *i1++;
17097
17098
0
      vacc += vi0;
17099
0
      const int32_t vi2 = (int32_t) *i2++;
17100
0
      vacc += vi1;
17101
0
      const int32_t vi3 = (int32_t) *i3++;
17102
0
      vacc += vi2;
17103
0
      const int32_t vi4 = (int32_t) *i4++;
17104
0
      vacc += vi3;
17105
0
      const int32_t vi5 = (int32_t) *i5++;
17106
0
      vacc += vi4;
17107
0
      const int32_t vi6 = (int32_t) *i6++;
17108
17109
0
      vacc += vi5;
17110
0
      vacc += vi6;
17111
17112
0
      float vfpacc = (float) vacc * vscale;
17113
0
      vfpacc += vmagic_bias;
17114
0
      int32_t vout = (int32_t) float_as_uint32(vfpacc);
17115
0
      vout = math_max_s32(vout, vmagic_min);
17116
0
      vout = math_min_s32(vout, vmagic_max);
17117
0
      vout -= vmagic_bias_less_zero_point;
17118
17119
0
      *output++ = (int8_t) vout;
17120
0
    } while (--channels != 0);
17121
0
  }
17122
0
}
17123
17124
void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1(
17125
    size_t rows,
17126
    size_t channels,
17127
    const int8_t* input,
17128
    size_t input_stride,
17129
    const int8_t* zero,
17130
    int8_t* output,
17131
    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
17132
0
{
17133
0
  assert(rows != 0);
17134
0
  assert(rows <= 7);
17135
0
  assert(channels != 0);
17136
17137
0
  const int8_t* i0 = input;
17138
0
  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
17139
0
  if XNN_UNPREDICTABLE(rows < 2) {
17140
0
    i1 = zero;
17141
0
  }
17142
0
  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
17143
0
  if XNN_UNPREDICTABLE(rows <= 2) {
17144
0
    i2 = zero;
17145
0
  }
17146
0
  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
17147
0
  if XNN_UNPREDICTABLE(rows < 4) {
17148
0
    i3 = zero;
17149
0
  }
17150
0
  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
17151
0
  if XNN_UNPREDICTABLE(rows <= 4) {
17152
0
    i4 = zero;
17153
0
  }
17154
0
  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
17155
0
  if XNN_UNPREDICTABLE(rows < 6) {
17156
0
    i5 = zero;
17157
0
  }
17158
0
  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
17159
0
  if XNN_UNPREDICTABLE(rows <= 6) {
17160
0
    i6 = zero;
17161
0
  }
17162
17163
0
  const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
17164
0
  const float vscale = params->fp32_scalar_imagic.scale;
17165
0
  const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
17166
0
  const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
17167
0
  const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
17168
0
  const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
17169
0
  do {
17170
0
    int32_t vacc = vinit_bias;
17171
0
    const int32_t vi0 = (int32_t) *i0++;
17172
0
    const int32_t vi1 = (int32_t) *i1++;
17173
17174
0
    vacc += vi0;
17175
0
    const int32_t vi2 = (int32_t) *i2++;
17176
0
    vacc += vi1;
17177
0
    const int32_t vi3 = (int32_t) *i3++;
17178
0
    vacc += vi2;
17179
0
    const int32_t vi4 = (int32_t) *i4++;
17180
0
    vacc += vi3;
17181
0
    const int32_t vi5 = (int32_t) *i5++;
17182
0
    vacc += vi4;
17183
0
    const int32_t vi6 = (int32_t) *i6++;
17184
17185
0
    vacc += vi5;
17186
0
    vacc += vi6;
17187
17188
0
    float vfpacc = (float) vacc * vscale;
17189
0
    vfpacc += vmagic_bias;
17190
0
    int32_t vout = (int32_t) float_as_uint32(vfpacc);
17191
0
    vout = math_max_s32(vout, vmagic_min);
17192
0
    vout = math_min_s32(vout, vmagic_max);
17193
0
    vout -= vmagic_bias_less_zero_point;
17194
17195
0
    *output++ = (int8_t) vout;
17196
0
  } while (--channels != 0);
17197
0
}
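
The qs8 gavgpool kernels above convert their int32 accumulators to int8 with the "imagic" scheme: scale in fp32, add a large magic bias so the rounded integer lands in the low mantissa bits, clamp against pre-biased bounds, then subtract the bias less the output zero point. The following self-contained C sketch reproduces that sequence outside XNNPACK; the helper name, the way the constants are derived, and the example values are illustrative assumptions, and it relies on the default round-to-nearest-even FP mode.

// Illustrative sketch (not XNNPACK API): fp32 -> int8 "imagic" requantization
// as performed at the tail of the kernels above.
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int32_t float_bits(float f) {
  int32_t bits;
  memcpy(&bits, &f, sizeof(bits));  // bit-cast without violating aliasing rules
  return bits;
}

static int8_t requantize_imagic(int32_t acc, float scale,
                                int32_t zero_point, int32_t qmin, int32_t qmax) {
  const float magic_bias = 12582912.0f;  // 0x1.8p+23f
  const int32_t magic_min = float_bits(magic_bias + (float) (qmin - zero_point));
  const int32_t magic_max = float_bits(magic_bias + (float) (qmax - zero_point));
  const int32_t magic_bias_less_zero_point = float_bits(magic_bias) - zero_point;

  float vfpacc = (float) acc * scale;          // scale the accumulator in fp32
  vfpacc += magic_bias;                        // rounding to integer happens here
  int32_t vout = float_bits(vfpacc);           // integer now sits in the mantissa
  vout = vout < magic_min ? magic_min : vout;  // clamp in the biased domain,
  vout = vout > magic_max ? magic_max : vout;  // i.e. to [qmin, qmax] after unbiasing
  return (int8_t) (vout - magic_bias_less_zero_point);
}

int main(void) {
  // 1234 * 0.0123 = 15.1782 -> rounds to 15, plus zero point 3 -> 18.
  printf("%d\n", requantize_imagic(1234, 0.0123f, 3, -128, 127));
  return 0;
}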
17198
17199
void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4(
17200
    size_t rows,
17201
    size_t channels,
17202
    const int8_t* input,
17203
    size_t input_stride,
17204
    const int8_t* zero,
17205
    int8_t* output,
17206
    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
17207
0
{
17208
0
  assert(rows != 0);
17209
0
  assert(rows <= 7);
17210
0
  assert(channels != 0);
17211
17212
0
  const int8_t* i0 = input;
17213
0
  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
17214
0
  if XNN_UNPREDICTABLE(rows < 2) {
17215
0
    i1 = zero;
17216
0
  }
17217
0
  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
17218
0
  if XNN_UNPREDICTABLE(rows <= 2) {
17219
0
    i2 = zero;
17220
0
  }
17221
0
  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
17222
0
  if XNN_UNPREDICTABLE(rows < 4) {
17223
0
    i3 = zero;
17224
0
  }
17225
0
  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
17226
0
  if XNN_UNPREDICTABLE(rows <= 4) {
17227
0
    i4 = zero;
17228
0
  }
17229
0
  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
17230
0
  if XNN_UNPREDICTABLE(rows < 6) {
17231
0
    i5 = zero;
17232
0
  }
17233
0
  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
17234
0
  if XNN_UNPREDICTABLE(rows <= 6) {
17235
0
    i6 = zero;
17236
0
  }
17237
17238
0
  const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
17239
0
  const float vscale = params->fp32_scalar_imagic.scale;
17240
0
  const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
17241
0
  const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
17242
0
  const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
17243
0
  const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
17244
0
  for (; channels >= 4; channels -= 4) {
17245
0
    const int32_t vi0x0 = (int32_t) i0[0];
17246
0
    const int32_t vi0x1 = (int32_t) i0[1];
17247
0
    const int32_t vi0x2 = (int32_t) i0[2];
17248
0
    const int32_t vi0x3 = (int32_t) i0[3];
17249
0
    i0 += 4;
17250
17251
0
    int32_t vacc0 = vi0x0 + vinit_bias;
17252
0
    const int32_t vi1x0 = (int32_t) i1[0];
17253
0
    int32_t vacc1 = vi0x1 + vinit_bias;
17254
0
    const int32_t vi1x1 = (int32_t) i1[1];
17255
0
    int32_t vacc2 = vi0x2 + vinit_bias;
17256
0
    const int32_t vi1x2 = (int32_t) i1[2];
17257
0
    int32_t vacc3 = vi0x3 + vinit_bias;
17258
0
    const int32_t vi1x3 = (int32_t) i1[3];
17259
0
    i1 += 4;
17260
17261
0
    vacc0 += vi1x0;
17262
0
    const int32_t vi2x0 = (int32_t) i2[0];
17263
0
    vacc1 += vi1x1;
17264
0
    const int32_t vi2x1 = (int32_t) i2[1];
17265
0
    vacc2 += vi1x2;
17266
0
    const int32_t vi2x2 = (int32_t) i2[2];
17267
0
    vacc3 += vi1x3;
17268
0
    const int32_t vi2x3 = (int32_t) i2[3];
17269
0
    i2 += 4;
17270
0
    vacc0 += vi2x0;
17271
0
    const int32_t vi3x0 = (int32_t) i3[0];
17272
0
    vacc1 += vi2x1;
17273
0
    const int32_t vi3x1 = (int32_t) i3[1];
17274
0
    vacc2 += vi2x2;
17275
0
    const int32_t vi3x2 = (int32_t) i3[2];
17276
0
    vacc3 += vi2x3;
17277
0
    const int32_t vi3x3 = (int32_t) i3[3];
17278
0
    i3 += 4;
17279
0
    vacc0 += vi3x0;
17280
0
    const int32_t vi4x0 = (int32_t) i4[0];
17281
0
    vacc1 += vi3x1;
17282
0
    const int32_t vi4x1 = (int32_t) i4[1];
17283
0
    vacc2 += vi3x2;
17284
0
    const int32_t vi4x2 = (int32_t) i4[2];
17285
0
    vacc3 += vi3x3;
17286
0
    const int32_t vi4x3 = (int32_t) i4[3];
17287
0
    i4 += 4;
17288
0
    vacc0 += vi4x0;
17289
0
    const int32_t vi5x0 = (int32_t) i5[0];
17290
0
    vacc1 += vi4x1;
17291
0
    const int32_t vi5x1 = (int32_t) i5[1];
17292
0
    vacc2 += vi4x2;
17293
0
    const int32_t vi5x2 = (int32_t) i5[2];
17294
0
    vacc3 += vi4x3;
17295
0
    const int32_t vi5x3 = (int32_t) i5[3];
17296
0
    i5 += 4;
17297
0
    vacc0 += vi5x0;
17298
0
    const int32_t vi6x0 = (int32_t) i6[0];
17299
0
    vacc1 += vi5x1;
17300
0
    const int32_t vi6x1 = (int32_t) i6[1];
17301
0
    vacc2 += vi5x2;
17302
0
    const int32_t vi6x2 = (int32_t) i6[2];
17303
0
    vacc3 += vi5x3;
17304
0
    const int32_t vi6x3 = (int32_t) i6[3];
17305
0
    i6 += 4;
17306
17307
0
    vacc0 += vi6x0;
17308
0
    vacc1 += vi6x1;
17309
0
    vacc2 += vi6x2;
17310
0
    vacc3 += vi6x3;
17311
17312
0
    float vfpacc0 = (float) vacc0 * vscale;
17313
0
    float vfpacc1 = (float) vacc1 * vscale;
17314
0
    float vfpacc2 = (float) vacc2 * vscale;
17315
0
    float vfpacc3 = (float) vacc3 * vscale;
17316
17317
0
    vfpacc0 += vmagic_bias;
17318
0
    vfpacc1 += vmagic_bias;
17319
0
    vfpacc2 += vmagic_bias;
17320
0
    vfpacc3 += vmagic_bias;
17321
17322
0
    int32_t vout0 = (int32_t) float_as_uint32(vfpacc0);
17323
0
    int32_t vout1 = (int32_t) float_as_uint32(vfpacc1);
17324
0
    int32_t vout2 = (int32_t) float_as_uint32(vfpacc2);
17325
0
    int32_t vout3 = (int32_t) float_as_uint32(vfpacc3);
17326
17327
0
    vout0 = math_max_s32(vout0, vmagic_min);
17328
0
    vout1 = math_max_s32(vout1, vmagic_min);
17329
0
    vout2 = math_max_s32(vout2, vmagic_min);
17330
0
    vout3 = math_max_s32(vout3, vmagic_min);
17331
17332
0
    vout0 = math_min_s32(vout0, vmagic_max);
17333
0
    vout1 = math_min_s32(vout1, vmagic_max);
17334
0
    vout2 = math_min_s32(vout2, vmagic_max);
17335
0
    vout3 = math_min_s32(vout3, vmagic_max);
17336
17337
0
    vout0 -= vmagic_bias_less_zero_point;
17338
0
    vout1 -= vmagic_bias_less_zero_point;
17339
0
    vout2 -= vmagic_bias_less_zero_point;
17340
0
    vout3 -= vmagic_bias_less_zero_point;
17341
17342
0
    output[0] = (int8_t) vout0;
17343
0
    output[1] = (int8_t) vout1;
17344
0
    output[2] = (int8_t) vout2;
17345
0
    output[3] = (int8_t) vout3;
17346
0
    output += 4;
17347
0
  }
17348
0
  if XNN_UNLIKELY(channels != 0) {
17349
0
    do {
17350
0
      int32_t vacc = vinit_bias;
17351
0
      const int32_t vi0 = (int32_t) *i0++;
17352
0
      const int32_t vi1 = (int32_t) *i1++;
17353
17354
0
      vacc += vi0;
17355
0
      const int32_t vi2 = (int32_t) *i2++;
17356
0
      vacc += vi1;
17357
0
      const int32_t vi3 = (int32_t) *i3++;
17358
0
      vacc += vi2;
17359
0
      const int32_t vi4 = (int32_t) *i4++;
17360
0
      vacc += vi3;
17361
0
      const int32_t vi5 = (int32_t) *i5++;
17362
0
      vacc += vi4;
17363
0
      const int32_t vi6 = (int32_t) *i6++;
17364
17365
0
      vacc += vi5;
17366
0
      vacc += vi6;
17367
17368
0
      float vfpacc = (float) vacc * vscale;
17369
0
      vfpacc += vmagic_bias;
17370
0
      int32_t vout = (int32_t) float_as_uint32(vfpacc);
17371
0
      vout = math_max_s32(vout, vmagic_min);
17372
0
      vout = math_min_s32(vout, vmagic_max);
17373
0
      vout -= vmagic_bias_less_zero_point;
17374
17375
0
      *output++ = (int8_t) vout;
17376
0
    } while (--channels != 0);
17377
0
  }
17378
0
}
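
Both 7x gavgpool variants rely on the same row-pointer setup: seven pointers are derived from input_stride, and any trailing rows that do not exist are redirected to the shared zero row, so the accumulation never branches on the row count. A minimal sketch of that setup with made-up sizes (and without the requantization step):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const size_t rows = 3;                               // only 3 of 7 rows exist
  const int8_t input[3][2] = {{1, 2}, {3, 4}, {5, 6}};
  static const int8_t zero[2] = {0, 0};                // shared padding row
  const size_t input_stride = 2 * sizeof(int8_t);

  const int8_t* i0 = &input[0][0];
  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
  if (rows < 2) { i1 = zero; }
  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
  if (rows <= 2) { i2 = zero; }
  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
  if (rows < 4) { i3 = zero; }
  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
  if (rows <= 4) { i4 = zero; }
  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
  if (rows < 6) { i5 = zero; }
  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
  if (rows <= 6) { i6 = zero; }

  for (size_t c = 0; c < 2; c++) {
    const int32_t sum = (int32_t) i0[c] + i1[c] + i2[c] + i3[c] + i4[c] + i5[c] + i6[c];
    printf("channel %zu: sum = %d\n", c, sum);         // 1+3+5 = 9 and 2+4+6 = 12
  }
  return 0;
}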
17379
17380
void xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic(
17381
    size_t mr,
17382
    size_t nc,
17383
    size_t kc,
17384
    const int8_t* restrict a,
17385
    size_t a_stride,
17386
    const void* restrict w,
17387
    int8_t* restrict c,
17388
    size_t cm_stride,
17389
    size_t cn_stride,
17390
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
17391
0
{
17392
0
  assert(mr != 0);
17393
0
  assert(mr <= 1);
17394
0
  assert(nc != 0);
17395
0
  assert(kc != 0);
17396
17397
0
  const int8_t* a0 = a;
17398
0
  int8_t* c0 = c;
17399
17400
0
  do {
17401
0
    int32_t vacc0x0 = unaligned_indexed_load_s32(w, 0);
17402
0
    int32_t vacc0x1 = unaligned_indexed_load_s32(w, 1);
17403
0
    w = (const int32_t*) w + 2;
17404
17405
0
    size_t k = kc;
17406
0
    do {
17407
0
      const int32_t va0 = (int32_t) *a0++;
17408
17409
0
      const int32_t vb0 = (int32_t) ((const int8_t*) w)[0];
17410
0
      const int32_t vb1 = (int32_t) ((const int8_t*) w)[1];
17411
0
      w = (const int8_t*) w + 2;
17412
17413
0
      vacc0x0 += va0 * vb0;
17414
0
      vacc0x1 += va0 * vb1;
17415
17416
0
      k -= sizeof(int8_t);
17417
0
    } while (k != 0);
17418
17419
0
    float vfpacc0x0 = (float) vacc0x0;
17420
0
    float vfpacc0x1 = (float) vacc0x1;
17421
17422
0
    const float vscale = params->fp32_scalar_imagic.scale;
17423
0
    vfpacc0x0 *= vscale;
17424
0
    vfpacc0x1 *= vscale;
17425
17426
0
    const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
17427
0
    vfpacc0x0 += vmagic_bias;
17428
0
    vfpacc0x1 += vmagic_bias;
17429
17430
0
    int32_t vout0x0 = (int32_t) float_as_uint32(vfpacc0x0);
17431
0
    int32_t vout0x1 = (int32_t) float_as_uint32(vfpacc0x1);
17432
17433
0
    const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
17434
0
    vout0x0 = math_max_s32(vout0x0, vmagic_min);
17435
0
    vout0x1 = math_max_s32(vout0x1, vmagic_min);
17436
17437
0
    const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
17438
0
    vout0x0 = math_min_s32(vout0x0, vmagic_max);
17439
0
    vout0x1 = math_min_s32(vout0x1, vmagic_max);
17440
17441
0
    const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
17442
0
    vout0x0 -= vmagic_bias_less_zero_point;
17443
0
    vout0x1 -= vmagic_bias_less_zero_point;
17444
17445
0
    if XNN_LIKELY(nc >= 2) {
17446
0
      c0[0] = (int8_t) vout0x0;
17447
0
      c0[1] = (int8_t) vout0x1;
17448
17449
0
      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
17450
17451
0
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
17452
17453
0
      nc -= 2;
17454
0
    } else {
17455
0
      if (nc & 1) {
17456
0
        c0[0] = (int8_t) vout0x0;
17457
0
      }
17458
17459
0
      nc = 0;
17460
0
    }
17461
0
  } while (nc != 0);
17462
0
}
17463
17464
void xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf(
17465
    size_t mr,
17466
    size_t nc,
17467
    size_t kc,
17468
    const int8_t* restrict a,
17469
    size_t a_stride,
17470
    const void* restrict w,
17471
    int8_t* restrict c,
17472
    size_t cm_stride,
17473
    size_t cn_stride,
17474
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
17475
0
{
17476
0
  assert(mr != 0);
17477
0
  assert(mr <= 1);
17478
0
  assert(nc != 0);
17479
0
  assert(kc != 0);
17480
17481
0
  const int8_t* a0 = a;
17482
0
  int8_t* c0 = c;
17483
17484
0
  do {
17485
0
    int32_t vacc0x0 = ((const int32_t*) w)[0];
17486
0
    int32_t vacc0x1 = ((const int32_t*) w)[1];
17487
0
    int32_t vacc0x2 = ((const int32_t*) w)[2];
17488
0
    int32_t vacc0x3 = ((const int32_t*) w)[3];
17489
0
    w = (const int32_t*) w + 4;
17490
17491
0
    size_t k = kc;
17492
0
    do {
17493
0
      const int32_t va0 = (int32_t) *a0++;
17494
17495
0
      const int32_t vb0 = (int32_t) ((const int8_t*) w)[0];
17496
0
      const int32_t vb1 = (int32_t) ((const int8_t*) w)[1];
17497
0
      const int32_t vb2 = (int32_t) ((const int8_t*) w)[2];
17498
0
      const int32_t vb3 = (int32_t) ((const int8_t*) w)[3];
17499
0
      w = (const int8_t*) w + 4;
17500
17501
0
      vacc0x0 += va0 * vb0;
17502
0
      vacc0x1 += va0 * vb1;
17503
0
      vacc0x2 += va0 * vb2;
17504
0
      vacc0x3 += va0 * vb3;
17505
17506
0
      k -= sizeof(int8_t);
17507
0
    } while (k != 0);
17508
17509
0
    float vfpacc0x0 = (float) vacc0x0;
17510
0
    float vfpacc0x1 = (float) vacc0x1;
17511
0
    float vfpacc0x2 = (float) vacc0x2;
17512
0
    float vfpacc0x3 = (float) vacc0x3;
17513
17514
0
    const float vscale = params->fp32_scalar_lrintf.scale;
17515
0
    vfpacc0x0 *= vscale;
17516
0
    vfpacc0x1 *= vscale;
17517
0
    vfpacc0x2 *= vscale;
17518
0
    vfpacc0x3 *= vscale;
17519
17520
0
    const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
17521
0
    vfpacc0x0 = math_max_f32(vfpacc0x0, voutput_min_less_zero_point);
17522
0
    vfpacc0x1 = math_max_f32(vfpacc0x1, voutput_min_less_zero_point);
17523
0
    vfpacc0x2 = math_max_f32(vfpacc0x2, voutput_min_less_zero_point);
17524
0
    vfpacc0x3 = math_max_f32(vfpacc0x3, voutput_min_less_zero_point);
17525
17526
0
    const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
17527
0
    vfpacc0x0 = math_min_f32(vfpacc0x0, voutput_max_less_zero_point);
17528
0
    vfpacc0x1 = math_min_f32(vfpacc0x1, voutput_max_less_zero_point);
17529
0
    vfpacc0x2 = math_min_f32(vfpacc0x2, voutput_max_less_zero_point);
17530
0
    vfpacc0x3 = math_min_f32(vfpacc0x3, voutput_max_less_zero_point);
17531
17532
0
    const int32_t vrndacc0x0 = (int32_t) lrintf(vfpacc0x0);
17533
0
    const int32_t vrndacc0x1 = (int32_t) lrintf(vfpacc0x1);
17534
0
    const int32_t vrndacc0x2 = (int32_t) lrintf(vfpacc0x2);
17535
0
    const int32_t vrndacc0x3 = (int32_t) lrintf(vfpacc0x3);
17536
17537
0
    const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
17538
0
    int32_t vout0x0 = vrndacc0x0 + voutput_zero_point;
17539
0
    int32_t vout0x1 = vrndacc0x1 + voutput_zero_point;
17540
0
    int32_t vout0x2 = vrndacc0x2 + voutput_zero_point;
17541
0
    int32_t vout0x3 = vrndacc0x3 + voutput_zero_point;
17542
17543
0
    if XNN_LIKELY(nc >= 4) {
17544
0
      c0[0] = (int8_t) vout0x0;
17545
0
      c0[1] = (int8_t) vout0x1;
17546
0
      c0[2] = (int8_t) vout0x2;
17547
0
      c0[3] = (int8_t) vout0x3;
17548
17549
0
      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
17550
17551
0
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
17552
17553
0
      nc -= 4;
17554
0
    } else {
17555
0
      if (nc & 2) {
17556
0
        c0[0] = (int8_t) vout0x0;
17557
0
        c0[1] = (int8_t) vout0x1;
17558
0
        vout0x0 = vout0x2;
17559
0
        c0 += 2;
17560
0
      }
17561
0
      if (nc & 1) {
17562
0
        c0[0] = (int8_t) vout0x0;
17563
0
      }
17564
17565
0
      nc = 0;
17566
0
    }
17567
0
  } while (nc != 0);
17568
0
}
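
The *_lrintf GEMM variants replace the magic-bias conversion with a straightforward sequence: clamp the scaled fp32 accumulator to the zero-point-adjusted output range, round it with lrintf(), then add the output zero point back. A standalone sketch of that step; the helper name and the sample values are illustrative, not XNNPACK API.

#include <math.h>
#include <stdint.h>
#include <stdio.h>

static int8_t requantize_lrintf(int32_t acc, float scale,
                                int32_t zero_point, int32_t qmin, int32_t qmax) {
  float vfpacc = (float) acc * scale;
  const float fmin = (float) (qmin - zero_point);  // output_min_less_zero_point
  const float fmax = (float) (qmax - zero_point);  // output_max_less_zero_point
  vfpacc = vfpacc < fmin ? fmin : vfpacc;          // clamp in the float domain
  vfpacc = vfpacc > fmax ? fmax : vfpacc;
  const int32_t vrndacc = (int32_t) lrintf(vfpacc);  // round (to-nearest by default)
  return (int8_t) (vrndacc + zero_point);
}

int main(void) {
  // -333 * 0.03125 = -10.40625 -> rounds to -10, plus zero point 1 -> -9.
  printf("%d\n", requantize_lrintf(-333, 0.03125f, 1, -128, 127));
  return 0;
}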
17569
17570
void xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic(
17571
    size_t mr,
17572
    size_t nc,
17573
    size_t kc,
17574
    const int8_t* restrict a,
17575
    size_t a_stride,
17576
    const void* restrict w,
17577
    int8_t* restrict c,
17578
    size_t cm_stride,
17579
    size_t cn_stride,
17580
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
17581
0
{
17582
0
  assert(mr != 0);
17583
0
  assert(mr <= 2);
17584
0
  assert(nc != 0);
17585
0
  assert(kc != 0);
17586
17587
0
  const int8_t* a0 = a;
17588
0
  int8_t* c0 = c;
17589
0
  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
17590
0
  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
17591
0
  if XNN_UNPREDICTABLE(mr != 2) {
17592
0
    a1 = a0;
17593
0
    c1 = c0;
17594
0
  }
17595
17596
0
  do {
17597
0
    int32_t vacc0x0 = unaligned_indexed_load_s32(w, 0);
17598
0
    int32_t vacc0x1 = unaligned_indexed_load_s32(w, 1);
17599
0
    int32_t vacc1x0 = vacc0x0;
17600
0
    int32_t vacc1x1 = vacc0x1;
17601
0
    w = (const int32_t*) w + 2;
17602
17603
0
    size_t k = kc;
17604
0
    do {
17605
0
      const int32_t va0 = (int32_t) *a0++;
17606
0
      const int32_t va1 = (int32_t) *a1++;
17607
17608
0
      const int32_t vb0 = (int32_t) ((const int8_t*) w)[0];
17609
0
      const int32_t vb1 = (int32_t) ((const int8_t*) w)[1];
17610
0
      w = (const int8_t*) w + 2;
17611
17612
0
      vacc0x0 += va0 * vb0;
17613
0
      vacc0x1 += va0 * vb1;
17614
0
      vacc1x0 += va1 * vb0;
17615
0
      vacc1x1 += va1 * vb1;
17616
17617
0
      k -= sizeof(int8_t);
17618
0
    } while (k != 0);
17619
17620
0
    float vfpacc0x0 = (float) vacc0x0;
17621
0
    float vfpacc0x1 = (float) vacc0x1;
17622
0
    float vfpacc1x0 = (float) vacc1x0;
17623
0
    float vfpacc1x1 = (float) vacc1x1;
17624
17625
0
    const float vscale = params->fp32_scalar_imagic.scale;
17626
0
    vfpacc0x0 *= vscale;
17627
0
    vfpacc0x1 *= vscale;
17628
0
    vfpacc1x0 *= vscale;
17629
0
    vfpacc1x1 *= vscale;
17630
17631
0
    const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
17632
0
    vfpacc0x0 += vmagic_bias;
17633
0
    vfpacc0x1 += vmagic_bias;
17634
0
    vfpacc1x0 += vmagic_bias;
17635
0
    vfpacc1x1 += vmagic_bias;
17636
17637
0
    int32_t vout0x0 = (int32_t) float_as_uint32(vfpacc0x0);
17638
0
    int32_t vout0x1 = (int32_t) float_as_uint32(vfpacc0x1);
17639
0
    int32_t vout1x0 = (int32_t) float_as_uint32(vfpacc1x0);
17640
0
    int32_t vout1x1 = (int32_t) float_as_uint32(vfpacc1x1);
17641
17642
0
    const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
17643
0
    vout0x0 = math_max_s32(vout0x0, vmagic_min);
17644
0
    vout0x1 = math_max_s32(vout0x1, vmagic_min);
17645
0
    vout1x0 = math_max_s32(vout1x0, vmagic_min);
17646
0
    vout1x1 = math_max_s32(vout1x1, vmagic_min);
17647
17648
0
    const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
17649
0
    vout0x0 = math_min_s32(vout0x0, vmagic_max);
17650
0
    vout0x1 = math_min_s32(vout0x1, vmagic_max);
17651
0
    vout1x0 = math_min_s32(vout1x0, vmagic_max);
17652
0
    vout1x1 = math_min_s32(vout1x1, vmagic_max);
17653
17654
0
    const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
17655
0
    vout0x0 -= vmagic_bias_less_zero_point;
17656
0
    vout0x1 -= vmagic_bias_less_zero_point;
17657
0
    vout1x0 -= vmagic_bias_less_zero_point;
17658
0
    vout1x1 -= vmagic_bias_less_zero_point;
17659
17660
0
    if XNN_LIKELY(nc >= 2) {
17661
0
      c0[0] = (int8_t) vout0x0;
17662
0
      c0[1] = (int8_t) vout0x1;
17663
0
      c1[0] = (int8_t) vout1x0;
17664
0
      c1[1] = (int8_t) vout1x1;
17665
17666
0
      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
17667
0
      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
17668
17669
0
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
17670
0
      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
17671
17672
0
      nc -= 2;
17673
0
    } else {
17674
0
      if (nc & 1) {
17675
0
        c0[0] = (int8_t) vout0x0;
17676
0
        c1[0] = (int8_t) vout1x0;
17677
0
      }
17678
17679
0
      nc = 0;
17680
0
    }
17681
0
  } while (nc != 0);
17682
0
}
17683
17684
void xnn_qs8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf(
17685
    size_t mr,
17686
    size_t nc,
17687
    size_t kc,
17688
    const int8_t* restrict a,
17689
    size_t a_stride,
17690
    const void* restrict w,
17691
    int8_t* restrict c,
17692
    size_t cm_stride,
17693
    size_t cn_stride,
17694
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
17695
0
{
17696
0
  assert(mr != 0);
17697
0
  assert(mr <= 3);
17698
0
  assert(nc != 0);
17699
0
  assert(kc != 0);
17700
17701
0
  const int8_t* a0 = a;
17702
0
  int8_t* c0 = c;
17703
0
  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
17704
0
  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
17705
0
  if XNN_UNPREDICTABLE(mr < 2) {
17706
0
    a1 = a0;
17707
0
    c1 = c0;
17708
0
  }
17709
0
  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
17710
0
  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
17711
0
  if XNN_UNPREDICTABLE(mr <= 2) {
17712
0
    a2 = a1;
17713
0
    c2 = c1;
17714
0
  }
17715
17716
0
  do {
17717
0
    int32_t vacc0x0 = ((const int32_t*) w)[0];
17718
0
    int32_t vacc0x1 = ((const int32_t*) w)[1];
17719
0
    int32_t vacc0x2 = ((const int32_t*) w)[2];
17720
0
    int32_t vacc0x3 = ((const int32_t*) w)[3];
17721
0
    int32_t vacc1x0 = vacc0x0;
17722
0
    int32_t vacc1x1 = vacc0x1;
17723
0
    int32_t vacc1x2 = vacc0x2;
17724
0
    int32_t vacc1x3 = vacc0x3;
17725
0
    int32_t vacc2x0 = vacc0x0;
17726
0
    int32_t vacc2x1 = vacc0x1;
17727
0
    int32_t vacc2x2 = vacc0x2;
17728
0
    int32_t vacc2x3 = vacc0x3;
17729
0
    w = (const int32_t*) w + 4;
17730
17731
0
    size_t k = kc;
17732
0
    do {
17733
0
      const int32_t va0 = (int32_t) *a0++;
17734
0
      const int32_t va1 = (int32_t) *a1++;
17735
0
      const int32_t va2 = (int32_t) *a2++;
17736
17737
0
      const int32_t vb0 = (int32_t) ((const int8_t*) w)[0];
17738
0
      const int32_t vb1 = (int32_t) ((const int8_t*) w)[1];
17739
0
      const int32_t vb2 = (int32_t) ((const int8_t*) w)[2];
17740
0
      const int32_t vb3 = (int32_t) ((const int8_t*) w)[3];
17741
0
      w = (const int8_t*) w + 4;
17742
17743
0
      vacc0x0 += va0 * vb0;
17744
0
      vacc0x1 += va0 * vb1;
17745
0
      vacc0x2 += va0 * vb2;
17746
0
      vacc0x3 += va0 * vb3;
17747
0
      vacc1x0 += va1 * vb0;
17748
0
      vacc1x1 += va1 * vb1;
17749
0
      vacc1x2 += va1 * vb2;
17750
0
      vacc1x3 += va1 * vb3;
17751
0
      vacc2x0 += va2 * vb0;
17752
0
      vacc2x1 += va2 * vb1;
17753
0
      vacc2x2 += va2 * vb2;
17754
0
      vacc2x3 += va2 * vb3;
17755
17756
0
      k -= sizeof(int8_t);
17757
0
    } while (k != 0);
17758
17759
0
    float vfpacc0x0 = (float) vacc0x0;
17760
0
    float vfpacc0x1 = (float) vacc0x1;
17761
0
    float vfpacc0x2 = (float) vacc0x2;
17762
0
    float vfpacc0x3 = (float) vacc0x3;
17763
0
    float vfpacc1x0 = (float) vacc1x0;
17764
0
    float vfpacc1x1 = (float) vacc1x1;
17765
0
    float vfpacc1x2 = (float) vacc1x2;
17766
0
    float vfpacc1x3 = (float) vacc1x3;
17767
0
    float vfpacc2x0 = (float) vacc2x0;
17768
0
    float vfpacc2x1 = (float) vacc2x1;
17769
0
    float vfpacc2x2 = (float) vacc2x2;
17770
0
    float vfpacc2x3 = (float) vacc2x3;
17771
17772
0
    const float vscale = params->fp32_scalar_lrintf.scale;
17773
0
    vfpacc0x0 *= vscale;
17774
0
    vfpacc0x1 *= vscale;
17775
0
    vfpacc0x2 *= vscale;
17776
0
    vfpacc0x3 *= vscale;
17777
0
    vfpacc1x0 *= vscale;
17778
0
    vfpacc1x1 *= vscale;
17779
0
    vfpacc1x2 *= vscale;
17780
0
    vfpacc1x3 *= vscale;
17781
0
    vfpacc2x0 *= vscale;
17782
0
    vfpacc2x1 *= vscale;
17783
0
    vfpacc2x2 *= vscale;
17784
0
    vfpacc2x3 *= vscale;
17785
17786
0
    const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
17787
0
    vfpacc0x0 = math_max_f32(vfpacc0x0, voutput_min_less_zero_point);
17788
0
    vfpacc0x1 = math_max_f32(vfpacc0x1, voutput_min_less_zero_point);
17789
0
    vfpacc0x2 = math_max_f32(vfpacc0x2, voutput_min_less_zero_point);
17790
0
    vfpacc0x3 = math_max_f32(vfpacc0x3, voutput_min_less_zero_point);
17791
0
    vfpacc1x0 = math_max_f32(vfpacc1x0, voutput_min_less_zero_point);
17792
0
    vfpacc1x1 = math_max_f32(vfpacc1x1, voutput_min_less_zero_point);
17793
0
    vfpacc1x2 = math_max_f32(vfpacc1x2, voutput_min_less_zero_point);
17794
0
    vfpacc1x3 = math_max_f32(vfpacc1x3, voutput_min_less_zero_point);
17795
0
    vfpacc2x0 = math_max_f32(vfpacc2x0, voutput_min_less_zero_point);
17796
0
    vfpacc2x1 = math_max_f32(vfpacc2x1, voutput_min_less_zero_point);
17797
0
    vfpacc2x2 = math_max_f32(vfpacc2x2, voutput_min_less_zero_point);
17798
0
    vfpacc2x3 = math_max_f32(vfpacc2x3, voutput_min_less_zero_point);
17799
17800
0
    const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
17801
0
    vfpacc0x0 = math_min_f32(vfpacc0x0, voutput_max_less_zero_point);
17802
0
    vfpacc0x1 = math_min_f32(vfpacc0x1, voutput_max_less_zero_point);
17803
0
    vfpacc0x2 = math_min_f32(vfpacc0x2, voutput_max_less_zero_point);
17804
0
    vfpacc0x3 = math_min_f32(vfpacc0x3, voutput_max_less_zero_point);
17805
0
    vfpacc1x0 = math_min_f32(vfpacc1x0, voutput_max_less_zero_point);
17806
0
    vfpacc1x1 = math_min_f32(vfpacc1x1, voutput_max_less_zero_point);
17807
0
    vfpacc1x2 = math_min_f32(vfpacc1x2, voutput_max_less_zero_point);
17808
0
    vfpacc1x3 = math_min_f32(vfpacc1x3, voutput_max_less_zero_point);
17809
0
    vfpacc2x0 = math_min_f32(vfpacc2x0, voutput_max_less_zero_point);
17810
0
    vfpacc2x1 = math_min_f32(vfpacc2x1, voutput_max_less_zero_point);
17811
0
    vfpacc2x2 = math_min_f32(vfpacc2x2, voutput_max_less_zero_point);
17812
0
    vfpacc2x3 = math_min_f32(vfpacc2x3, voutput_max_less_zero_point);
17813
17814
0
    const int32_t vrndacc0x0 = (int32_t) lrintf(vfpacc0x0);
17815
0
    const int32_t vrndacc0x1 = (int32_t) lrintf(vfpacc0x1);
17816
0
    const int32_t vrndacc0x2 = (int32_t) lrintf(vfpacc0x2);
17817
0
    const int32_t vrndacc0x3 = (int32_t) lrintf(vfpacc0x3);
17818
0
    const int32_t vrndacc1x0 = (int32_t) lrintf(vfpacc1x0);
17819
0
    const int32_t vrndacc1x1 = (int32_t) lrintf(vfpacc1x1);
17820
0
    const int32_t vrndacc1x2 = (int32_t) lrintf(vfpacc1x2);
17821
0
    const int32_t vrndacc1x3 = (int32_t) lrintf(vfpacc1x3);
17822
0
    const int32_t vrndacc2x0 = (int32_t) lrintf(vfpacc2x0);
17823
0
    const int32_t vrndacc2x1 = (int32_t) lrintf(vfpacc2x1);
17824
0
    const int32_t vrndacc2x2 = (int32_t) lrintf(vfpacc2x2);
17825
0
    const int32_t vrndacc2x3 = (int32_t) lrintf(vfpacc2x3);
17826
17827
0
    const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
17828
0
    int32_t vout0x0 = vrndacc0x0 + voutput_zero_point;
17829
0
    int32_t vout0x1 = vrndacc0x1 + voutput_zero_point;
17830
0
    int32_t vout0x2 = vrndacc0x2 + voutput_zero_point;
17831
0
    int32_t vout0x3 = vrndacc0x3 + voutput_zero_point;
17832
0
    int32_t vout1x0 = vrndacc1x0 + voutput_zero_point;
17833
0
    int32_t vout1x1 = vrndacc1x1 + voutput_zero_point;
17834
0
    int32_t vout1x2 = vrndacc1x2 + voutput_zero_point;
17835
0
    int32_t vout1x3 = vrndacc1x3 + voutput_zero_point;
17836
0
    int32_t vout2x0 = vrndacc2x0 + voutput_zero_point;
17837
0
    int32_t vout2x1 = vrndacc2x1 + voutput_zero_point;
17838
0
    int32_t vout2x2 = vrndacc2x2 + voutput_zero_point;
17839
0
    int32_t vout2x3 = vrndacc2x3 + voutput_zero_point;
17840
17841
0
    if XNN_LIKELY(nc >= 4) {
17842
0
      c0[0] = (int8_t) vout0x0;
17843
0
      c0[1] = (int8_t) vout0x1;
17844
0
      c0[2] = (int8_t) vout0x2;
17845
0
      c0[3] = (int8_t) vout0x3;
17846
0
      c1[0] = (int8_t) vout1x0;
17847
0
      c1[1] = (int8_t) vout1x1;
17848
0
      c1[2] = (int8_t) vout1x2;
17849
0
      c1[3] = (int8_t) vout1x3;
17850
0
      c2[0] = (int8_t) vout2x0;
17851
0
      c2[1] = (int8_t) vout2x1;
17852
0
      c2[2] = (int8_t) vout2x2;
17853
0
      c2[3] = (int8_t) vout2x3;
17854
17855
0
      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
17856
0
      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
17857
0
      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
17858
17859
0
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
17860
0
      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
17861
0
      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
17862
17863
0
      nc -= 4;
17864
0
    } else {
17865
0
      if (nc & 2) {
17866
0
        c0[0] = (int8_t) vout0x0;
17867
0
        c0[1] = (int8_t) vout0x1;
17868
0
        vout0x0 = vout0x2;
17869
0
        c0 += 2;
17870
0
        c1[0] = (int8_t) vout1x0;
17871
0
        c1[1] = (int8_t) vout1x1;
17872
0
        vout1x0 = vout1x2;
17873
0
        c1 += 2;
17874
0
        c2[0] = (int8_t) vout2x0;
17875
0
        c2[1] = (int8_t) vout2x1;
17876
0
        vout2x0 = vout2x2;
17877
0
        c2 += 2;
17878
0
      }
17879
0
      if (nc & 1) {
17880
0
        c0[0] = (int8_t) vout0x0;
17881
0
        c1[0] = (int8_t) vout1x0;
17882
0
        c2[0] = (int8_t) vout2x0;
17883
0
      }
17884
17885
0
      nc = 0;
17886
0
    }
17887
0
  } while (nc != 0);
17888
0
}
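
When fewer than four output columns remain, the 4-wide GEMM kernels above store two columns, slide the third result into slot 0, and let a final single-column store finish the row. A small sketch of just that remainder pattern; the function name and the values are made up for illustration.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static void store_tail(int8_t* c, size_t nc,
                       int32_t v0, int32_t v1, int32_t v2, int32_t v3) {
  if (nc & 2) {
    c[0] = (int8_t) v0;
    c[1] = (int8_t) v1;
    v0 = v2;        // slide column 2 into slot 0 for the odd-column store
    c += 2;
  }
  if (nc & 1) {
    c[0] = (int8_t) v0;
  }
  (void) v3;        // column 3 is only written on the full nc >= 4 path
}

int main(void) {
  int8_t out[3] = {0, 0, 0};
  store_tail(out, 3, 10, 20, 30, 40);
  printf("%d %d %d\n", out[0], out[1], out[2]);  // 10 20 30
  return 0;
}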
17889
17890
void xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic(
17891
    size_t mr,
17892
    size_t nc,
17893
    size_t kc,
17894
    size_t ks,
17895
    const int8_t** restrict a,
17896
    const void* restrict w,
17897
    int8_t* restrict c,
17898
    size_t cm_stride,
17899
    size_t cn_stride,
17900
    size_t a_offset,
17901
    const int8_t* zero,
17902
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
17903
0
{
17904
0
  assert(mr != 0);
17905
0
  assert(mr <= 1);
17906
0
  assert(nc != 0);
17907
0
  assert(kc != 0);
17908
0
  assert(ks != 0);
17909
0
  assert(ks % (1 * sizeof(void*)) == 0);
17910
0
  assert(a != NULL);
17911
0
  assert(w != NULL);
17912
0
  assert(c != NULL);
17913
17914
0
  int8_t* c0 = c;
17915
17916
0
  do {
17917
0
    int32_t vacc0x0 = unaligned_indexed_load_s32(w, 0);
17918
0
    int32_t vacc0x1 = unaligned_indexed_load_s32(w, 1);
17919
0
    w = (const void*) ((const int32_t*) w + 2);
17920
17921
0
    size_t p = ks;
17922
0
    do {
17923
0
      const int8_t* restrict a0 = a[0];
17924
0
      assert(a0 != NULL);
17925
0
      if XNN_UNPREDICTABLE(a0 != zero) {
17926
0
        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
17927
0
      }
17928
0
      a += 1;
17929
17930
0
      size_t k = kc;
17931
0
      do {
17932
0
        const int32_t va0 = (int32_t) *a0++;
17933
17934
0
        const int32_t vb0 = (int32_t) ((const int8_t*) w)[0];
17935
0
        const int32_t vb1 = (int32_t) ((const int8_t*) w)[1];
17936
0
        w = (const void*) ((const int8_t*) w + 2);
17937
17938
0
        vacc0x0 += va0 * vb0;
17939
0
        vacc0x1 += va0 * vb1;
17940
17941
0
        k -= sizeof(int8_t);
17942
0
      } while (k != 0);
17943
0
      p -= 1 * sizeof(void*);
17944
0
    } while (p != 0);
17945
17946
0
    float vfpacc0x0 = (float) vacc0x0;
17947
0
    float vfpacc0x1 = (float) vacc0x1;
17948
17949
0
    const float vscale = params->fp32_scalar_imagic.scale;
17950
0
    vfpacc0x0 *= vscale;
17951
0
    vfpacc0x1 *= vscale;
17952
17953
0
    const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
17954
0
    vfpacc0x0 += vmagic_bias;
17955
0
    vfpacc0x1 += vmagic_bias;
17956
17957
0
    int32_t vout0x0 = (int32_t) float_as_uint32(vfpacc0x0);
17958
0
    int32_t vout0x1 = (int32_t) float_as_uint32(vfpacc0x1);
17959
17960
0
    const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
17961
0
    vout0x0 = math_max_s32(vout0x0, vmagic_min);
17962
0
    vout0x1 = math_max_s32(vout0x1, vmagic_min);
17963
17964
0
    const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
17965
0
    vout0x0 = math_min_s32(vout0x0, vmagic_max);
17966
0
    vout0x1 = math_min_s32(vout0x1, vmagic_max);
17967
17968
0
    const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
17969
0
    vout0x0 -= vmagic_bias_less_zero_point;
17970
0
    vout0x1 -= vmagic_bias_less_zero_point;
17971
17972
0
    if XNN_LIKELY(nc >= 2) {
17973
0
      c0[0] = (int8_t) vout0x0;
17974
0
      c0[1] = (int8_t) vout0x1;
17975
17976
0
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
17977
17978
0
      a = (const int8_t**restrict) ((uintptr_t) a - ks);
17979
0
      nc -= 2;
17980
0
    } else {
17981
0
      if (nc & 1) {
17982
0
        c0[0] = (int8_t) vout0x0;
17983
0
      }
17984
17985
0
      nc = 0;
17986
0
    }
17987
0
  } while (nc != 0);
17988
0
}
17989
17990
void xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf(
17991
    size_t mr,
17992
    size_t nc,
17993
    size_t kc,
17994
    size_t ks,
17995
    const int8_t** restrict a,
17996
    const void* restrict w,
17997
    int8_t* restrict c,
17998
    size_t cm_stride,
17999
    size_t cn_stride,
18000
    size_t a_offset,
18001
    const int8_t* zero,
18002
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
18003
0
{
18004
0
  assert(mr != 0);
18005
0
  assert(mr <= 1);
18006
0
  assert(nc != 0);
18007
0
  assert(kc != 0);
18008
0
  assert(ks != 0);
18009
0
  assert(ks % (1 * sizeof(void*)) == 0);
18010
0
  assert(a != NULL);
18011
0
  assert(w != NULL);
18012
0
  assert(c != NULL);
18013
18014
0
  int8_t* c0 = c;
18015
18016
0
  do {
18017
0
    int32_t vacc0x0 = ((const int32_t*) w)[0];
18018
0
    int32_t vacc0x1 = ((const int32_t*) w)[1];
18019
0
    int32_t vacc0x2 = ((const int32_t*) w)[2];
18020
0
    int32_t vacc0x3 = ((const int32_t*) w)[3];
18021
0
    w = (const void*) ((const int32_t*) w + 4);
18022
18023
0
    size_t p = ks;
18024
0
    do {
18025
0
      const int8_t* restrict a0 = a[0];
18026
0
      assert(a0 != NULL);
18027
0
      if XNN_UNPREDICTABLE(a0 != zero) {
18028
0
        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
18029
0
      }
18030
0
      a += 1;
18031
18032
0
      size_t k = kc;
18033
0
      do {
18034
0
        const int32_t va0 = (int32_t) *a0++;
18035
18036
0
        const int32_t vb0 = (int32_t) ((const int8_t*) w)[0];
18037
0
        const int32_t vb1 = (int32_t) ((const int8_t*) w)[1];
18038
0
        const int32_t vb2 = (int32_t) ((const int8_t*) w)[2];
18039
0
        const int32_t vb3 = (int32_t) ((const int8_t*) w)[3];
18040
0
        w = (const void*) ((const int8_t*) w + 4);
18041
18042
0
        vacc0x0 += va0 * vb0;
18043
0
        vacc0x1 += va0 * vb1;
18044
0
        vacc0x2 += va0 * vb2;
18045
0
        vacc0x3 += va0 * vb3;
18046
18047
0
        k -= sizeof(int8_t);
18048
0
      } while (k != 0);
18049
0
      p -= 1 * sizeof(void*);
18050
0
    } while (p != 0);
18051
18052
0
    float vfpacc0x0 = (float) vacc0x0;
18053
0
    float vfpacc0x1 = (float) vacc0x1;
18054
0
    float vfpacc0x2 = (float) vacc0x2;
18055
0
    float vfpacc0x3 = (float) vacc0x3;
18056
18057
0
    const float vscale = params->fp32_scalar_lrintf.scale;
18058
0
    vfpacc0x0 *= vscale;
18059
0
    vfpacc0x1 *= vscale;
18060
0
    vfpacc0x2 *= vscale;
18061
0
    vfpacc0x3 *= vscale;
18062
18063
0
    const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
18064
0
    vfpacc0x0 = math_max_f32(vfpacc0x0, voutput_min_less_zero_point);
18065
0
    vfpacc0x1 = math_max_f32(vfpacc0x1, voutput_min_less_zero_point);
18066
0
    vfpacc0x2 = math_max_f32(vfpacc0x2, voutput_min_less_zero_point);
18067
0
    vfpacc0x3 = math_max_f32(vfpacc0x3, voutput_min_less_zero_point);
18068
18069
0
    const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
18070
0
    vfpacc0x0 = math_min_f32(vfpacc0x0, voutput_max_less_zero_point);
18071
0
    vfpacc0x1 = math_min_f32(vfpacc0x1, voutput_max_less_zero_point);
18072
0
    vfpacc0x2 = math_min_f32(vfpacc0x2, voutput_max_less_zero_point);
18073
0
    vfpacc0x3 = math_min_f32(vfpacc0x3, voutput_max_less_zero_point);
18074
18075
0
    const int32_t vrndacc0x0 = (int32_t) lrintf(vfpacc0x0);
18076
0
    const int32_t vrndacc0x1 = (int32_t) lrintf(vfpacc0x1);
18077
0
    const int32_t vrndacc0x2 = (int32_t) lrintf(vfpacc0x2);
18078
0
    const int32_t vrndacc0x3 = (int32_t) lrintf(vfpacc0x3);
18079
18080
0
    const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
18081
0
    int32_t vout0x0 = vrndacc0x0 + voutput_zero_point;
18082
0
    int32_t vout0x1 = vrndacc0x1 + voutput_zero_point;
18083
0
    int32_t vout0x2 = vrndacc0x2 + voutput_zero_point;
18084
0
    int32_t vout0x3 = vrndacc0x3 + voutput_zero_point;
18085
18086
0
    if XNN_LIKELY(nc >= 4) {
18087
0
      c0[0] = (int8_t) vout0x0;
18088
0
      c0[1] = (int8_t) vout0x1;
18089
0
      c0[2] = (int8_t) vout0x2;
18090
0
      c0[3] = (int8_t) vout0x3;
18091
18092
0
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
18093
18094
0
      a = (const int8_t**restrict) ((uintptr_t) a - ks);
18095
0
      nc -= 4;
18096
0
    } else {
18097
0
      if (nc & 2) {
18098
0
        c0[0] = (int8_t) vout0x0;
18099
0
        c0[1] = (int8_t) vout0x1;
18100
0
        vout0x0 = vout0x2;
18101
0
        c0 += 2;
18102
0
      }
18103
0
      if (nc & 1) {
18104
0
        c0[0] = (int8_t) vout0x0;
18105
0
      }
18106
18107
0
      nc = 0;
18108
0
    }
18109
0
  } while (nc != 0);
18110
0
}
18111
18112
void xnn_qs8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic(
18113
    size_t mr,
18114
    size_t nc,
18115
    size_t kc,
18116
    size_t ks,
18117
    const int8_t** restrict a,
18118
    const void* restrict w,
18119
    int8_t* restrict c,
18120
    size_t cm_stride,
18121
    size_t cn_stride,
18122
    size_t a_offset,
18123
    const int8_t* zero,
18124
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
18125
0
{
18126
0
  assert(mr != 0);
18127
0
  assert(mr <= 2);
18128
0
  assert(nc != 0);
18129
0
  assert(kc != 0);
18130
0
  assert(ks != 0);
18131
0
  assert(ks % (2 * sizeof(void*)) == 0);
18132
0
  assert(a != NULL);
18133
0
  assert(w != NULL);
18134
0
  assert(c != NULL);
18135
18136
0
  int8_t* c0 = c;
18137
0
  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
18138
0
  if XNN_UNPREDICTABLE(mr != 2) {
18139
0
    c1 = c0;
18140
0
  }
18141
18142
0
  do {
18143
0
    int32_t vacc0x0 = unaligned_indexed_load_s32(w, 0);
18144
0
    int32_t vacc0x1 = unaligned_indexed_load_s32(w, 1);
18145
0
    int32_t vacc1x0 = vacc0x0;
18146
0
    int32_t vacc1x1 = vacc0x1;
18147
0
    w = (const void*) ((const int32_t*) w + 2);
18148
18149
0
    size_t p = ks;
18150
0
    do {
18151
0
      const int8_t* restrict a0 = a[0];
18152
0
      assert(a0 != NULL);
18153
0
      if XNN_UNPREDICTABLE(a0 != zero) {
18154
0
        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
18155
0
      }
18156
0
      const int8_t* restrict a1 = a[1];
18157
0
      assert(a1 != NULL);
18158
0
      if XNN_UNPREDICTABLE(a1 != zero) {
18159
0
        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
18160
0
      }
18161
0
      a += 2;
18162
18163
0
      size_t k = kc;
18164
0
      do {
18165
0
        const int32_t va0 = (int32_t) *a0++;
18166
0
        const int32_t va1 = (int32_t) *a1++;
18167
18168
0
        const int32_t vb0 = (int32_t) ((const int8_t*) w)[0];
18169
0
        const int32_t vb1 = (int32_t) ((const int8_t*) w)[1];
18170
0
        w = (const void*) ((const int8_t*) w + 2);
18171
18172
0
        vacc0x0 += va0 * vb0;
18173
0
        vacc0x1 += va0 * vb1;
18174
0
        vacc1x0 += va1 * vb0;
18175
0
        vacc1x1 += va1 * vb1;
18176
18177
0
        k -= sizeof(int8_t);
18178
0
      } while (k != 0);
18179
0
      p -= 2 * sizeof(void*);
18180
0
    } while (p != 0);
18181
18182
0
    float vfpacc0x0 = (float) vacc0x0;
18183
0
    float vfpacc0x1 = (float) vacc0x1;
18184
0
    float vfpacc1x0 = (float) vacc1x0;
18185
0
    float vfpacc1x1 = (float) vacc1x1;
18186
18187
0
    const float vscale = params->fp32_scalar_imagic.scale;
18188
0
    vfpacc0x0 *= vscale;
18189
0
    vfpacc0x1 *= vscale;
18190
0
    vfpacc1x0 *= vscale;
18191
0
    vfpacc1x1 *= vscale;
18192
18193
0
    const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
18194
0
    vfpacc0x0 += vmagic_bias;
18195
0
    vfpacc0x1 += vmagic_bias;
18196
0
    vfpacc1x0 += vmagic_bias;
18197
0
    vfpacc1x1 += vmagic_bias;
18198
18199
0
    int32_t vout0x0 = (int32_t) float_as_uint32(vfpacc0x0);
18200
0
    int32_t vout0x1 = (int32_t) float_as_uint32(vfpacc0x1);
18201
0
    int32_t vout1x0 = (int32_t) float_as_uint32(vfpacc1x0);
18202
0
    int32_t vout1x1 = (int32_t) float_as_uint32(vfpacc1x1);
18203
18204
0
    const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
18205
0
    vout0x0 = math_max_s32(vout0x0, vmagic_min);
18206
0
    vout0x1 = math_max_s32(vout0x1, vmagic_min);
18207
0
    vout1x0 = math_max_s32(vout1x0, vmagic_min);
18208
0
    vout1x1 = math_max_s32(vout1x1, vmagic_min);
18209
18210
0
    const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
18211
0
    vout0x0 = math_min_s32(vout0x0, vmagic_max);
18212
0
    vout0x1 = math_min_s32(vout0x1, vmagic_max);
18213
0
    vout1x0 = math_min_s32(vout1x0, vmagic_max);
18214
0
    vout1x1 = math_min_s32(vout1x1, vmagic_max);
18215
18216
0
    const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
18217
0
    vout0x0 -= vmagic_bias_less_zero_point;
18218
0
    vout0x1 -= vmagic_bias_less_zero_point;
18219
0
    vout1x0 -= vmagic_bias_less_zero_point;
18220
0
    vout1x1 -= vmagic_bias_less_zero_point;
18221
18222
0
    if XNN_LIKELY(nc >= 2) {
18223
0
      c1[0] = (int8_t) vout1x0;
18224
0
      c1[1] = (int8_t) vout1x1;
18225
0
      c0[0] = (int8_t) vout0x0;
18226
0
      c0[1] = (int8_t) vout0x1;
18227
18228
0
      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
18229
0
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
18230
18231
0
      a = (const int8_t**restrict) ((uintptr_t) a - ks);
18232
0
      nc -= 2;
18233
0
    } else {
18234
0
      if (nc & 1) {
18235
0
        c1[0] = (int8_t) vout1x0;
18236
0
        c0[0] = (int8_t) vout0x0;
18237
0
      }
18238
18239
0
      nc = 0;
18240
0
    }
18241
0
  } while (nc != 0);
18242
0
}
18243
18244
void xnn_qs8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf(
18245
    size_t mr,
18246
    size_t nc,
18247
    size_t kc,
18248
    size_t ks,
18249
    const int8_t** restrict a,
18250
    const void* restrict w,
18251
    int8_t* restrict c,
18252
    size_t cm_stride,
18253
    size_t cn_stride,
18254
    size_t a_offset,
18255
    const int8_t* zero,
18256
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
18257
0
{
18258
0
  assert(mr != 0);
18259
0
  assert(mr <= 3);
18260
0
  assert(nc != 0);
18261
0
  assert(kc != 0);
18262
0
  assert(ks != 0);
18263
0
  assert(ks % (3 * sizeof(void*)) == 0);
18264
0
  assert(a != NULL);
18265
0
  assert(w != NULL);
18266
0
  assert(c != NULL);
18267
18268
0
  int8_t* c0 = c;
18269
0
  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
18270
0
  if XNN_UNPREDICTABLE(mr < 2) {
18271
0
    c1 = c0;
18272
0
  }
18273
0
  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
18274
0
  if XNN_UNPREDICTABLE(mr <= 2) {
18275
0
    c2 = c1;
18276
0
  }
18277
18278
0
  do {
18279
0
    int32_t vacc0x0 = ((const int32_t*) w)[0];
18280
0
    int32_t vacc0x1 = ((const int32_t*) w)[1];
18281
0
    int32_t vacc0x2 = ((const int32_t*) w)[2];
18282
0
    int32_t vacc0x3 = ((const int32_t*) w)[3];
18283
0
    int32_t vacc1x0 = vacc0x0;
18284
0
    int32_t vacc1x1 = vacc0x1;
18285
0
    int32_t vacc1x2 = vacc0x2;
18286
0
    int32_t vacc1x3 = vacc0x3;
18287
0
    int32_t vacc2x0 = vacc0x0;
18288
0
    int32_t vacc2x1 = vacc0x1;
18289
0
    int32_t vacc2x2 = vacc0x2;
18290
0
    int32_t vacc2x3 = vacc0x3;
18291
0
    w = (const void*) ((const int32_t*) w + 4);
18292
18293
0
    size_t p = ks;
18294
0
    do {
18295
0
      const int8_t* restrict a0 = a[0];
18296
0
      assert(a0 != NULL);
18297
0
      if XNN_UNPREDICTABLE(a0 != zero) {
18298
0
        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
18299
0
      }
18300
0
      const int8_t* restrict a1 = a[1];
18301
0
      assert(a1 != NULL);
18302
0
      if XNN_UNPREDICTABLE(a1 != zero) {
18303
0
        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
18304
0
      }
18305
0
      const int8_t* restrict a2 = a[2];
18306
0
      assert(a2 != NULL);
18307
0
      if XNN_UNPREDICTABLE(a2 != zero) {
18308
0
        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
18309
0
      }
18310
0
      a += 3;
18311
18312
0
      size_t k = kc;
18313
0
      do {
18314
0
        const int32_t va0 = (int32_t) *a0++;
18315
0
        const int32_t va1 = (int32_t) *a1++;
18316
0
        const int32_t va2 = (int32_t) *a2++;
18317
18318
0
        const int32_t vb0 = (int32_t) ((const int8_t*) w)[0];
18319
0
        const int32_t vb1 = (int32_t) ((const int8_t*) w)[1];
18320
0
        const int32_t vb2 = (int32_t) ((const int8_t*) w)[2];
18321
0
        const int32_t vb3 = (int32_t) ((const int8_t*) w)[3];
18322
0
        w = (const void*) ((const int8_t*) w + 4);
18323
18324
0
        vacc0x0 += va0 * vb0;
18325
0
        vacc0x1 += va0 * vb1;
18326
0
        vacc0x2 += va0 * vb2;
18327
0
        vacc0x3 += va0 * vb3;
18328
0
        vacc1x0 += va1 * vb0;
18329
0
        vacc1x1 += va1 * vb1;
18330
0
        vacc1x2 += va1 * vb2;
18331
0
        vacc1x3 += va1 * vb3;
18332
0
        vacc2x0 += va2 * vb0;
18333
0
        vacc2x1 += va2 * vb1;
18334
0
        vacc2x2 += va2 * vb2;
18335
0
        vacc2x3 += va2 * vb3;
18336
18337
0
        k -= sizeof(int8_t);
18338
0
      } while (k != 0);
18339
0
      p -= 3 * sizeof(void*);
18340
0
    } while (p != 0);
18341
18342
0
    float vfpacc0x0 = (float) vacc0x0;
18343
0
    float vfpacc0x1 = (float) vacc0x1;
18344
0
    float vfpacc0x2 = (float) vacc0x2;
18345
0
    float vfpacc0x3 = (float) vacc0x3;
18346
0
    float vfpacc1x0 = (float) vacc1x0;
18347
0
    float vfpacc1x1 = (float) vacc1x1;
18348
0
    float vfpacc1x2 = (float) vacc1x2;
18349
0
    float vfpacc1x3 = (float) vacc1x3;
18350
0
    float vfpacc2x0 = (float) vacc2x0;
18351
0
    float vfpacc2x1 = (float) vacc2x1;
18352
0
    float vfpacc2x2 = (float) vacc2x2;
18353
0
    float vfpacc2x3 = (float) vacc2x3;
18354
18355
0
    const float vscale = params->fp32_scalar_lrintf.scale;
18356
0
    vfpacc0x0 *= vscale;
18357
0
    vfpacc0x1 *= vscale;
18358
0
    vfpacc0x2 *= vscale;
18359
0
    vfpacc0x3 *= vscale;
18360
0
    vfpacc1x0 *= vscale;
18361
0
    vfpacc1x1 *= vscale;
18362
0
    vfpacc1x2 *= vscale;
18363
0
    vfpacc1x3 *= vscale;
18364
0
    vfpacc2x0 *= vscale;
18365
0
    vfpacc2x1 *= vscale;
18366
0
    vfpacc2x2 *= vscale;
18367
0
    vfpacc2x3 *= vscale;
18368
18369
0
    const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
18370
0
    vfpacc0x0 = math_max_f32(vfpacc0x0, voutput_min_less_zero_point);
18371
0
    vfpacc0x1 = math_max_f32(vfpacc0x1, voutput_min_less_zero_point);
18372
0
    vfpacc0x2 = math_max_f32(vfpacc0x2, voutput_min_less_zero_point);
18373
0
    vfpacc0x3 = math_max_f32(vfpacc0x3, voutput_min_less_zero_point);
18374
0
    vfpacc1x0 = math_max_f32(vfpacc1x0, voutput_min_less_zero_point);
18375
0
    vfpacc1x1 = math_max_f32(vfpacc1x1, voutput_min_less_zero_point);
18376
0
    vfpacc1x2 = math_max_f32(vfpacc1x2, voutput_min_less_zero_point);
18377
0
    vfpacc1x3 = math_max_f32(vfpacc1x3, voutput_min_less_zero_point);
18378
0
    vfpacc2x0 = math_max_f32(vfpacc2x0, voutput_min_less_zero_point);
18379
0
    vfpacc2x1 = math_max_f32(vfpacc2x1, voutput_min_less_zero_point);
18380
0
    vfpacc2x2 = math_max_f32(vfpacc2x2, voutput_min_less_zero_point);
18381
0
    vfpacc2x3 = math_max_f32(vfpacc2x3, voutput_min_less_zero_point);
18382
18383
0
    const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
18384
0
    vfpacc0x0 = math_min_f32(vfpacc0x0, voutput_max_less_zero_point);
18385
0
    vfpacc0x1 = math_min_f32(vfpacc0x1, voutput_max_less_zero_point);
18386
0
    vfpacc0x2 = math_min_f32(vfpacc0x2, voutput_max_less_zero_point);
18387
0
    vfpacc0x3 = math_min_f32(vfpacc0x3, voutput_max_less_zero_point);
18388
0
    vfpacc1x0 = math_min_f32(vfpacc1x0, voutput_max_less_zero_point);
18389
0
    vfpacc1x1 = math_min_f32(vfpacc1x1, voutput_max_less_zero_point);
18390
0
    vfpacc1x2 = math_min_f32(vfpacc1x2, voutput_max_less_zero_point);
18391
0
    vfpacc1x3 = math_min_f32(vfpacc1x3, voutput_max_less_zero_point);
18392
0
    vfpacc2x0 = math_min_f32(vfpacc2x0, voutput_max_less_zero_point);
18393
0
    vfpacc2x1 = math_min_f32(vfpacc2x1, voutput_max_less_zero_point);
18394
0
    vfpacc2x2 = math_min_f32(vfpacc2x2, voutput_max_less_zero_point);
18395
0
    vfpacc2x3 = math_min_f32(vfpacc2x3, voutput_max_less_zero_point);
18396
18397
0
    const int32_t vrndacc0x0 = (int32_t) lrintf(vfpacc0x0);
18398
0
    const int32_t vrndacc0x1 = (int32_t) lrintf(vfpacc0x1);
18399
0
    const int32_t vrndacc0x2 = (int32_t) lrintf(vfpacc0x2);
18400
0
    const int32_t vrndacc0x3 = (int32_t) lrintf(vfpacc0x3);
18401
0
    const int32_t vrndacc1x0 = (int32_t) lrintf(vfpacc1x0);
18402
0
    const int32_t vrndacc1x1 = (int32_t) lrintf(vfpacc1x1);
18403
0
    const int32_t vrndacc1x2 = (int32_t) lrintf(vfpacc1x2);
18404
0
    const int32_t vrndacc1x3 = (int32_t) lrintf(vfpacc1x3);
18405
0
    const int32_t vrndacc2x0 = (int32_t) lrintf(vfpacc2x0);
18406
0
    const int32_t vrndacc2x1 = (int32_t) lrintf(vfpacc2x1);
18407
0
    const int32_t vrndacc2x2 = (int32_t) lrintf(vfpacc2x2);
18408
0
    const int32_t vrndacc2x3 = (int32_t) lrintf(vfpacc2x3);
18409
18410
0
    const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
18411
0
    int32_t vout0x0 = vrndacc0x0 + voutput_zero_point;
18412
0
    int32_t vout0x1 = vrndacc0x1 + voutput_zero_point;
18413
0
    int32_t vout0x2 = vrndacc0x2 + voutput_zero_point;
18414
0
    int32_t vout0x3 = vrndacc0x3 + voutput_zero_point;
18415
0
    int32_t vout1x0 = vrndacc1x0 + voutput_zero_point;
18416
0
    int32_t vout1x1 = vrndacc1x1 + voutput_zero_point;
18417
0
    int32_t vout1x2 = vrndacc1x2 + voutput_zero_point;
18418
0
    int32_t vout1x3 = vrndacc1x3 + voutput_zero_point;
18419
0
    int32_t vout2x0 = vrndacc2x0 + voutput_zero_point;
18420
0
    int32_t vout2x1 = vrndacc2x1 + voutput_zero_point;
18421
0
    int32_t vout2x2 = vrndacc2x2 + voutput_zero_point;
18422
0
    int32_t vout2x3 = vrndacc2x3 + voutput_zero_point;
18423
18424
0
    if XNN_LIKELY(nc >= 4) {
18425
0
      c2[0] = (int8_t) vout2x0;
18426
0
      c2[1] = (int8_t) vout2x1;
18427
0
      c2[2] = (int8_t) vout2x2;
18428
0
      c2[3] = (int8_t) vout2x3;
18429
0
      c1[0] = (int8_t) vout1x0;
18430
0
      c1[1] = (int8_t) vout1x1;
18431
0
      c1[2] = (int8_t) vout1x2;
18432
0
      c1[3] = (int8_t) vout1x3;
18433
0
      c0[0] = (int8_t) vout0x0;
18434
0
      c0[1] = (int8_t) vout0x1;
18435
0
      c0[2] = (int8_t) vout0x2;
18436
0
      c0[3] = (int8_t) vout0x3;
18437
18438
0
      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
18439
0
      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
18440
0
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
18441
18442
0
      a = (const int8_t**restrict) ((uintptr_t) a - ks);
18443
0
      nc -= 4;
18444
0
    } else {
18445
0
      if (nc & 2) {
18446
0
        c2[0] = (int8_t) vout2x0;
18447
0
        c2[1] = (int8_t) vout2x1;
18448
0
        vout2x0 = vout2x2;
18449
0
        c2 += 2;
18450
0
        c1[0] = (int8_t) vout1x0;
18451
0
        c1[1] = (int8_t) vout1x1;
18452
0
        vout1x0 = vout1x2;
18453
0
        c1 += 2;
18454
0
        c0[0] = (int8_t) vout0x0;
18455
0
        c0[1] = (int8_t) vout0x1;
18456
0
        vout0x0 = vout0x2;
18457
0
        c0 += 2;
18458
0
      }
18459
0
      if (nc & 1) {
18460
0
        c2[0] = (int8_t) vout2x0;
18461
0
        c1[0] = (int8_t) vout1x0;
18462
0
        c0[0] = (int8_t) vout0x0;
18463
0
      }
18464
18465
0
      nc = 0;
18466
0
    }
18467
0
  } while (nc != 0);
18468
0
}
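
The IGEMM kernels consume an indirection buffer instead of a dense A matrix: each step loads mr row pointers from a, applies a_offset only to rows that are not the shared zero row, and once a column block is done the pointer array is rewound by ks bytes so the same entries serve the next block. A toy single-row walk-through with made-up sizes (all weights treated as 1, no quantization):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  static const int8_t zero[4] = {0, 0, 0, 0};     // shared padding row
  static const int8_t image[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  // One tile of the indirection buffer: a real row plus a padding row.
  const int8_t* indirection[2] = {image, zero};
  const size_t a_offset = 4;                      // byte offset applied to real rows
  const size_t kc = 4;                            // channels read per pointer
  const size_t ks = 2 * sizeof(void*);            // bytes of pointers per tile

  const int8_t** a = indirection;
  int32_t acc = 0;
  size_t p = ks;
  do {
    const int8_t* a0 = a[0];
    if (a0 != zero) {
      a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);  // only real rows get the offset
    }
    a += 1;
    for (size_t k = 0; k < kc; k++) {
      acc += (int32_t) *a0++;                     // weight of 1 keeps the sketch small
    }
    p -= sizeof(void*);
  } while (p != 0);

  // Rewind by ks bytes, exactly like "a - ks" in the kernels above, so the
  // same indirection entries can be reused for the next column block.
  a = (const int8_t**) ((uintptr_t) a - ks);
  assert(a == indirection);
  printf("acc = %d\n", acc);                      // 5 + 6 + 7 + 8 + 0 = 26
  return 0;
}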
18469
18470
void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic(
18471
    size_t channels,
18472
    size_t output_width,
18473
    const int8_t** input,
18474
    const void* weights,
18475
    int8_t* output,
18476
    intptr_t input_stride,
18477
    size_t output_increment,
18478
    size_t input_offset,
18479
    const int8_t* zero,
18480
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
18481
0
{
18482
0
  assert(channels != 0);
18483
0
  assert(output_width != 0);
18484
18485
0
  const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point;
18486
0
  const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point;
18487
0
  const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias;
18488
0
  const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point;
18489
0
  do {
18490
0
    const int8_t* i0 = input[0];
18491
0
    assert(i0 != NULL);
18492
0
    if XNN_UNPREDICTABLE(i0 != zero) {
18493
0
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
18494
0
    }
18495
0
    const int8_t* i1 = input[1];
18496
0
    assert(i1 != NULL);
18497
0
    if XNN_UNPREDICTABLE(i1 != zero) {
18498
0
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
18499
0
    }
18500
0
    const int8_t* i2 = input[2];
18501
0
    assert(i2 != NULL);
18502
0
    if XNN_UNPREDICTABLE(i2 != zero) {
18503
0
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
18504
0
    }
18505
0
    const int8_t* i3 = input[3];
18506
0
    assert(i3 != NULL);
18507
0
    if XNN_UNPREDICTABLE(i3 != zero) {
18508
0
      i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
18509
0
    }
18510
0
    const int8_t* i4 = input[4];
18511
0
    assert(i4 != NULL);
18512
0
    if XNN_UNPREDICTABLE(i4 != zero) {
18513
0
      i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
18514
0
    }
18515
0
    const int8_t* i5 = input[5];
18516
0
    assert(i5 != NULL);
18517
0
    if XNN_UNPREDICTABLE(i5 != zero) {
18518
0
      i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
18519
0
    }
18520
0
    const int8_t* i6 = input[6];
18521
0
    assert(i6 != NULL);
18522
0
    if XNN_UNPREDICTABLE(i6 != zero) {
18523
0
      i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
18524
0
    }
18525
0
    const int8_t* i7 = input[7];
18526
0
    assert(i7 != NULL);
18527
0
    if XNN_UNPREDICTABLE(i7 != zero) {
18528
0
      i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
18529
0
    }
18530
0
    const int8_t* i8 = input[8];
18531
0
    assert(i8 != NULL);
18532
0
    if XNN_UNPREDICTABLE(i8 != zero) {
18533
0
      i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
18534
0
    }
18535
0
    const int8_t* i9 = input[9];
18536
0
    assert(i9 != NULL);
18537
0
    if XNN_UNPREDICTABLE(i9 != zero) {
18538
0
      i9 = (const int8_t*) ((uintptr_t) i9 + input_offset);
18539
0
    }
18540
0
    const int8_t* i10 = input[10];
18541
0
    assert(i10 != NULL);
18542
0
    if XNN_UNPREDICTABLE(i10 != zero) {
18543
0
      i10 = (const int8_t*) ((uintptr_t) i10 + input_offset);
18544
0
    }
18545
0
    const int8_t* i11 = input[11];
18546
0
    assert(i11 != NULL);
18547
0
    if XNN_UNPREDICTABLE(i11 != zero) {
18548
0
      i11 = (const int8_t*) ((uintptr_t) i11 + input_offset);
18549
0
    }
18550
0
    const int8_t* i12 = input[12];
18551
0
    assert(i12 != NULL);
18552
0
    if XNN_UNPREDICTABLE(i12 != zero) {
18553
0
      i12 = (const int8_t*) ((uintptr_t) i12 + input_offset);
18554
0
    }
18555
0
    const int8_t* i13 = input[13];
18556
0
    assert(i13 != NULL);
18557
0
    if XNN_UNPREDICTABLE(i13 != zero) {
18558
0
      i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
18559
0
    }
18560
0
    const int8_t* i14 = input[14];
18561
0
    assert(i14 != NULL);
18562
0
    if XNN_UNPREDICTABLE(i14 != zero) {
18563
0
      i14 = (const int8_t*) ((uintptr_t) i14 + input_offset);
18564
0
    }
18565
0
    const int8_t* i15 = input[15];
18566
0
    assert(i15 != NULL);
18567
0
    if XNN_UNPREDICTABLE(i15 != zero) {
18568
0
      i15 = (const int8_t*) ((uintptr_t) i15 + input_offset);
18569
0
    }
18570
0
    const int8_t* i16 = input[16];
18571
0
    assert(i16 != NULL);
18572
0
    if XNN_UNPREDICTABLE(i16 != zero) {
18573
0
      i16 = (const int8_t*) ((uintptr_t) i16 + input_offset);
18574
0
    }
18575
0
    const int8_t* i17 = input[17];
18576
0
    assert(i17 != NULL);
18577
0
    if XNN_UNPREDICTABLE(i17 != zero) {
18578
0
      i17 = (const int8_t*) ((uintptr_t) i17 + input_offset);
18579
0
    }
18580
0
    const int8_t* i18 = input[18];
18581
0
    assert(i18 != NULL);
18582
0
    if XNN_UNPREDICTABLE(i18 != zero) {
18583
0
      i18 = (const int8_t*) ((uintptr_t) i18 + input_offset);
18584
0
    }
18585
0
    const int8_t* i19 = input[19];
18586
0
    assert(i19 != NULL);
18587
0
    if XNN_UNPREDICTABLE(i19 != zero) {
18588
0
      i19 = (const int8_t*) ((uintptr_t) i19 + input_offset);
18589
0
    }
18590
0
    const int8_t* i20 = input[20];
18591
0
    assert(i20 != NULL);
18592
0
    if XNN_UNPREDICTABLE(i20 != zero) {
18593
0
      i20 = (const int8_t*) ((uintptr_t) i20 + input_offset);
18594
0
    }
18595
0
    const int8_t* i21 = input[21];
18596
0
    assert(i21 != NULL);
18597
0
    if XNN_UNPREDICTABLE(i21 != zero) {
18598
0
      i21 = (const int8_t*) ((uintptr_t) i21 + input_offset);
18599
0
    }
18600
0
    const int8_t* i22 = input[22];
18601
0
    assert(i22 != NULL);
18602
0
    if XNN_UNPREDICTABLE(i22 != zero) {
18603
0
      i22 = (const int8_t*) ((uintptr_t) i22 + input_offset);
18604
0
    }
18605
0
    const int8_t* i23 = input[23];
18606
0
    assert(i23 != NULL);
18607
0
    if XNN_UNPREDICTABLE(i23 != zero) {
18608
0
      i23 = (const int8_t*) ((uintptr_t) i23 + input_offset);
18609
0
    }
18610
0
    const int8_t* i24 = input[24];
18611
0
    assert(i24 != NULL);
18612
0
    if XNN_UNPREDICTABLE(i24 != zero) {
18613
0
      i24 = (const int8_t*) ((uintptr_t) i24 + input_offset);
18614
0
    }
18615
0
    input = (const int8_t**) ((uintptr_t) input + input_stride);
18616
18617
0
    size_t c = channels;
18618
0
    const void* w = weights;
18619
0
    do {
18620
0
      int32_t vacc = unaligned_load_s32(w);
18621
18622
0
      const int32_t vi0 = (int32_t) *i0++;
18623
0
      const int32_t vk0 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[0];
18624
0
      vacc += vi0 * vk0;
18625
0
      const int32_t vi1 = (int32_t) *i1++;
18626
0
      const int32_t vk1 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[1];
18627
0
      vacc += vi1 * vk1;
18628
0
      const int32_t vi2 = (int32_t) *i2++;
18629
0
      const int32_t vk2 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[2];
18630
0
      vacc += vi2 * vk2;
18631
0
      const int32_t vi3 = (int32_t) *i3++;
18632
0
      const int32_t vk3 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[3];
18633
0
      vacc += vi3 * vk3;
18634
0
      const int32_t vi4 = (int32_t) *i4++;
18635
0
      const int32_t vk4 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[4];
18636
0
      vacc += vi4 * vk4;
18637
0
      const int32_t vi5 = (int32_t) *i5++;
18638
0
      const int32_t vk5 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[5];
18639
0
      vacc += vi5 * vk5;
18640
0
      const int32_t vi6 = (int32_t) *i6++;
18641
0
      const int32_t vk6 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[6];
18642
0
      vacc += vi6 * vk6;
18643
0
      const int32_t vi7 = (int32_t) *i7++;
18644
0
      const int32_t vk7 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[7];
18645
0
      vacc += vi7 * vk7;
18646
0
      const int32_t vi8 = (int32_t) *i8++;
18647
0
      const int32_t vk8 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[8];
18648
0
      vacc += vi8 * vk8;
18649
0
      const int32_t vi9 = (int32_t) *i9++;
18650
0
      const int32_t vk9 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[9];
18651
0
      vacc += vi9 * vk9;
18652
0
      const int32_t vi10 = (int32_t) *i10++;
18653
0
      const int32_t vk10 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[10];
18654
0
      vacc += vi10 * vk10;
18655
0
      const int32_t vi11 = (int32_t) *i11++;
18656
0
      const int32_t vk11 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[11];
18657
0
      vacc += vi11 * vk11;
18658
0
      const int32_t vi12 = (int32_t) *i12++;
18659
0
      const int32_t vk12 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[12];
18660
0
      vacc += vi12 * vk12;
18661
0
      const int32_t vi13 = (int32_t) *i13++;
18662
0
      const int32_t vk13 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[13];
18663
0
      vacc += vi13 * vk13;
18664
0
      const int32_t vi14 = (int32_t) *i14++;
18665
0
      const int32_t vk14 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[14];
18666
0
      vacc += vi14 * vk14;
18667
0
      const int32_t vi15 = (int32_t) *i15++;
18668
0
      const int32_t vk15 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[15];
18669
0
      vacc += vi15 * vk15;
18670
0
      const int32_t vi16 = (int32_t) *i16++;
18671
0
      const int32_t vk16 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[16];
18672
0
      vacc += vi16 * vk16;
18673
0
      const int32_t vi17 = (int32_t) *i17++;
18674
0
      const int32_t vk17 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[17];
18675
0
      vacc += vi17 * vk17;
18676
0
      const int32_t vi18 = (int32_t) *i18++;
18677
0
      const int32_t vk18 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[18];
18678
0
      vacc += vi18 * vk18;
18679
0
      const int32_t vi19 = (int32_t) *i19++;
18680
0
      const int32_t vk19 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[19];
18681
0
      vacc += vi19 * vk19;
18682
0
      const int32_t vi20 = (int32_t) *i20++;
18683
0
      const int32_t vk20 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[20];
18684
0
      vacc += vi20 * vk20;
18685
0
      const int32_t vi21 = (int32_t) *i21++;
18686
0
      const int32_t vk21 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[21];
18687
0
      vacc += vi21 * vk21;
18688
0
      const int32_t vi22 = (int32_t) *i22++;
18689
0
      const int32_t vk22 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[22];
18690
0
      vacc += vi22 * vk22;
18691
0
      const int32_t vi23 = (int32_t) *i23++;
18692
0
      const int32_t vk23 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[23];
18693
0
      vacc += vi23 * vk23;
18694
0
      const int32_t vi24 = (int32_t) *i24++;
18695
0
      const int32_t vk24 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[24];
18696
0
      vacc += vi24 * vk24;
18697
18698
0
      w = (const void*) ((uintptr_t) w + sizeof(int32_t) + 25 * sizeof(int8_t));
18699
18700
0
      const float vscale = unaligned_load_f32(w);
18701
0
      w = (const void*) ((const float*) w + 1);
18702
0
      float vfpacc = (float) vacc * vscale;
18703
18704
0
      vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
18705
0
      vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
18706
0
      vfpacc += vmagic_bias;
18707
0
      int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point;
18708
18709
0
      *output++ = (int8_t) vout;
18710
0
    } while (--c != 0);
18711
18712
0
    output = (int8_t*) ((uintptr_t) output + output_increment);
18713
0
  } while (--output_width != 0);
18714
0
}
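
The kernel that ends above follows the fmagic requantization path: the fp32 accumulator is clamped against output_min_less_zero_point / output_max_less_zero_point, 0x1.8p+23f is added so the rounded integer lands in the low mantissa bits, and the precomputed magic_bias_less_output_zero_point is subtracted from the float's bit pattern. A minimal standalone sketch of just that step, assuming round-to-nearest FP mode and a scaled value of magnitude below 2^22 (quantize_fmagic and bits_of are illustrative names, not part of scalar.c):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t bits_of(float f) {  // same role as float_as_uint32() in the kernels
  uint32_t u;
  memcpy(&u, &f, sizeof u);
  return u;
}

static int8_t quantize_fmagic(int32_t acc, float scale,
                              int8_t out_min, int8_t out_max, int32_t zero_point) {
  const float magic_bias = 12582912.0f;  // 0x1.8p+23
  float x = (float) acc * scale;
  const float lo = (float) ((int32_t) out_min - zero_point);  // output_min_less_zero_point
  const float hi = (float) ((int32_t) out_max - zero_point);  // output_max_less_zero_point
  x = x < lo ? lo : x;
  x = x > hi ? hi : x;
  x += magic_bias;  // the rounded integer now occupies the low mantissa bits
  const int32_t magic_bias_less_output_zero_point = (int32_t) bits_of(magic_bias) - zero_point;
  return (int8_t) ((int32_t) bits_of(x) - magic_bias_less_output_zero_point);
}

int main(void) {
  // -1234 * 0.05 = -61.7, rounds to -62; with output zero point 3 the stored int8 is -59
  printf("%d\n", quantize_fmagic(-1234, 0.05f, -128, 127, 3));
  return 0;
}
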
18715
18716
void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic(
18717
    size_t channels,
18718
    size_t output_width,
18719
    const int8_t** input,
18720
    const void* weights,
18721
    int8_t* output,
18722
    intptr_t input_stride,
18723
    size_t output_increment,
18724
    size_t input_offset,
18725
    const int8_t* zero,
18726
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
18727
0
{
18728
0
  assert(channels != 0);
18729
0
  assert(output_width != 0);
18730
18731
0
  const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
18732
0
  const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
18733
0
  const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
18734
0
  const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
18735
0
  do {
18736
0
    const int8_t* i0 = input[0];
18737
0
    assert(i0 != NULL);
18738
0
    if XNN_UNPREDICTABLE(i0 != zero) {
18739
0
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
18740
0
    }
18741
0
    const int8_t* i1 = input[1];
18742
0
    assert(i1 != NULL);
18743
0
    if XNN_UNPREDICTABLE(i1 != zero) {
18744
0
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
18745
0
    }
18746
0
    const int8_t* i2 = input[2];
18747
0
    assert(i2 != NULL);
18748
0
    if XNN_UNPREDICTABLE(i2 != zero) {
18749
0
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
18750
0
    }
18751
0
    const int8_t* i3 = input[3];
18752
0
    assert(i3 != NULL);
18753
0
    if XNN_UNPREDICTABLE(i3 != zero) {
18754
0
      i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
18755
0
    }
18756
0
    const int8_t* i4 = input[4];
18757
0
    assert(i4 != NULL);
18758
0
    if XNN_UNPREDICTABLE(i4 != zero) {
18759
0
      i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
18760
0
    }
18761
0
    const int8_t* i5 = input[5];
18762
0
    assert(i5 != NULL);
18763
0
    if XNN_UNPREDICTABLE(i5 != zero) {
18764
0
      i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
18765
0
    }
18766
0
    const int8_t* i6 = input[6];
18767
0
    assert(i6 != NULL);
18768
0
    if XNN_UNPREDICTABLE(i6 != zero) {
18769
0
      i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
18770
0
    }
18771
0
    const int8_t* i7 = input[7];
18772
0
    assert(i7 != NULL);
18773
0
    if XNN_UNPREDICTABLE(i7 != zero) {
18774
0
      i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
18775
0
    }
18776
0
    const int8_t* i8 = input[8];
18777
0
    assert(i8 != NULL);
18778
0
    if XNN_UNPREDICTABLE(i8 != zero) {
18779
0
      i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
18780
0
    }
18781
0
    const int8_t* i9 = input[9];
18782
0
    assert(i9 != NULL);
18783
0
    if XNN_UNPREDICTABLE(i9 != zero) {
18784
0
      i9 = (const int8_t*) ((uintptr_t) i9 + input_offset);
18785
0
    }
18786
0
    const int8_t* i10 = input[10];
18787
0
    assert(i10 != NULL);
18788
0
    if XNN_UNPREDICTABLE(i10 != zero) {
18789
0
      i10 = (const int8_t*) ((uintptr_t) i10 + input_offset);
18790
0
    }
18791
0
    const int8_t* i11 = input[11];
18792
0
    assert(i11 != NULL);
18793
0
    if XNN_UNPREDICTABLE(i11 != zero) {
18794
0
      i11 = (const int8_t*) ((uintptr_t) i11 + input_offset);
18795
0
    }
18796
0
    const int8_t* i12 = input[12];
18797
0
    assert(i12 != NULL);
18798
0
    if XNN_UNPREDICTABLE(i12 != zero) {
18799
0
      i12 = (const int8_t*) ((uintptr_t) i12 + input_offset);
18800
0
    }
18801
0
    const int8_t* i13 = input[13];
18802
0
    assert(i13 != NULL);
18803
0
    if XNN_UNPREDICTABLE(i13 != zero) {
18804
0
      i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
18805
0
    }
18806
0
    const int8_t* i14 = input[14];
18807
0
    assert(i14 != NULL);
18808
0
    if XNN_UNPREDICTABLE(i14 != zero) {
18809
0
      i14 = (const int8_t*) ((uintptr_t) i14 + input_offset);
18810
0
    }
18811
0
    const int8_t* i15 = input[15];
18812
0
    assert(i15 != NULL);
18813
0
    if XNN_UNPREDICTABLE(i15 != zero) {
18814
0
      i15 = (const int8_t*) ((uintptr_t) i15 + input_offset);
18815
0
    }
18816
0
    const int8_t* i16 = input[16];
18817
0
    assert(i16 != NULL);
18818
0
    if XNN_UNPREDICTABLE(i16 != zero) {
18819
0
      i16 = (const int8_t*) ((uintptr_t) i16 + input_offset);
18820
0
    }
18821
0
    const int8_t* i17 = input[17];
18822
0
    assert(i17 != NULL);
18823
0
    if XNN_UNPREDICTABLE(i17 != zero) {
18824
0
      i17 = (const int8_t*) ((uintptr_t) i17 + input_offset);
18825
0
    }
18826
0
    const int8_t* i18 = input[18];
18827
0
    assert(i18 != NULL);
18828
0
    if XNN_UNPREDICTABLE(i18 != zero) {
18829
0
      i18 = (const int8_t*) ((uintptr_t) i18 + input_offset);
18830
0
    }
18831
0
    const int8_t* i19 = input[19];
18832
0
    assert(i19 != NULL);
18833
0
    if XNN_UNPREDICTABLE(i19 != zero) {
18834
0
      i19 = (const int8_t*) ((uintptr_t) i19 + input_offset);
18835
0
    }
18836
0
    const int8_t* i20 = input[20];
18837
0
    assert(i20 != NULL);
18838
0
    if XNN_UNPREDICTABLE(i20 != zero) {
18839
0
      i20 = (const int8_t*) ((uintptr_t) i20 + input_offset);
18840
0
    }
18841
0
    const int8_t* i21 = input[21];
18842
0
    assert(i21 != NULL);
18843
0
    if XNN_UNPREDICTABLE(i21 != zero) {
18844
0
      i21 = (const int8_t*) ((uintptr_t) i21 + input_offset);
18845
0
    }
18846
0
    const int8_t* i22 = input[22];
18847
0
    assert(i22 != NULL);
18848
0
    if XNN_UNPREDICTABLE(i22 != zero) {
18849
0
      i22 = (const int8_t*) ((uintptr_t) i22 + input_offset);
18850
0
    }
18851
0
    const int8_t* i23 = input[23];
18852
0
    assert(i23 != NULL);
18853
0
    if XNN_UNPREDICTABLE(i23 != zero) {
18854
0
      i23 = (const int8_t*) ((uintptr_t) i23 + input_offset);
18855
0
    }
18856
0
    const int8_t* i24 = input[24];
18857
0
    assert(i24 != NULL);
18858
0
    if XNN_UNPREDICTABLE(i24 != zero) {
18859
0
      i24 = (const int8_t*) ((uintptr_t) i24 + input_offset);
18860
0
    }
18861
0
    input = (const int8_t**) ((uintptr_t) input + input_stride);
18862
18863
0
    size_t c = channels;
18864
0
    const void* w = weights;
18865
0
    do {
18866
0
      int32_t vacc = unaligned_load_s32(w);
18867
18868
0
      const int32_t vi0 = (int32_t) *i0++;
18869
0
      const int32_t vk0 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[0];
18870
0
      vacc += vi0 * vk0;
18871
0
      const int32_t vi1 = (int32_t) *i1++;
18872
0
      const int32_t vk1 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[1];
18873
0
      vacc += vi1 * vk1;
18874
0
      const int32_t vi2 = (int32_t) *i2++;
18875
0
      const int32_t vk2 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[2];
18876
0
      vacc += vi2 * vk2;
18877
0
      const int32_t vi3 = (int32_t) *i3++;
18878
0
      const int32_t vk3 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[3];
18879
0
      vacc += vi3 * vk3;
18880
0
      const int32_t vi4 = (int32_t) *i4++;
18881
0
      const int32_t vk4 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[4];
18882
0
      vacc += vi4 * vk4;
18883
0
      const int32_t vi5 = (int32_t) *i5++;
18884
0
      const int32_t vk5 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[5];
18885
0
      vacc += vi5 * vk5;
18886
0
      const int32_t vi6 = (int32_t) *i6++;
18887
0
      const int32_t vk6 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[6];
18888
0
      vacc += vi6 * vk6;
18889
0
      const int32_t vi7 = (int32_t) *i7++;
18890
0
      const int32_t vk7 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[7];
18891
0
      vacc += vi7 * vk7;
18892
0
      const int32_t vi8 = (int32_t) *i8++;
18893
0
      const int32_t vk8 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[8];
18894
0
      vacc += vi8 * vk8;
18895
0
      const int32_t vi9 = (int32_t) *i9++;
18896
0
      const int32_t vk9 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[9];
18897
0
      vacc += vi9 * vk9;
18898
0
      const int32_t vi10 = (int32_t) *i10++;
18899
0
      const int32_t vk10 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[10];
18900
0
      vacc += vi10 * vk10;
18901
0
      const int32_t vi11 = (int32_t) *i11++;
18902
0
      const int32_t vk11 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[11];
18903
0
      vacc += vi11 * vk11;
18904
0
      const int32_t vi12 = (int32_t) *i12++;
18905
0
      const int32_t vk12 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[12];
18906
0
      vacc += vi12 * vk12;
18907
0
      const int32_t vi13 = (int32_t) *i13++;
18908
0
      const int32_t vk13 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[13];
18909
0
      vacc += vi13 * vk13;
18910
0
      const int32_t vi14 = (int32_t) *i14++;
18911
0
      const int32_t vk14 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[14];
18912
0
      vacc += vi14 * vk14;
18913
0
      const int32_t vi15 = (int32_t) *i15++;
18914
0
      const int32_t vk15 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[15];
18915
0
      vacc += vi15 * vk15;
18916
0
      const int32_t vi16 = (int32_t) *i16++;
18917
0
      const int32_t vk16 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[16];
18918
0
      vacc += vi16 * vk16;
18919
0
      const int32_t vi17 = (int32_t) *i17++;
18920
0
      const int32_t vk17 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[17];
18921
0
      vacc += vi17 * vk17;
18922
0
      const int32_t vi18 = (int32_t) *i18++;
18923
0
      const int32_t vk18 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[18];
18924
0
      vacc += vi18 * vk18;
18925
0
      const int32_t vi19 = (int32_t) *i19++;
18926
0
      const int32_t vk19 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[19];
18927
0
      vacc += vi19 * vk19;
18928
0
      const int32_t vi20 = (int32_t) *i20++;
18929
0
      const int32_t vk20 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[20];
18930
0
      vacc += vi20 * vk20;
18931
0
      const int32_t vi21 = (int32_t) *i21++;
18932
0
      const int32_t vk21 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[21];
18933
0
      vacc += vi21 * vk21;
18934
0
      const int32_t vi22 = (int32_t) *i22++;
18935
0
      const int32_t vk22 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[22];
18936
0
      vacc += vi22 * vk22;
18937
0
      const int32_t vi23 = (int32_t) *i23++;
18938
0
      const int32_t vk23 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[23];
18939
0
      vacc += vi23 * vk23;
18940
0
      const int32_t vi24 = (int32_t) *i24++;
18941
0
      const int32_t vk24 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[24];
18942
0
      vacc += vi24 * vk24;
18943
18944
0
      w = (const void*) ((uintptr_t) w + sizeof(int32_t) + 25 * sizeof(int8_t));
18945
18946
0
      const float vscale = unaligned_load_f32(w);
18947
0
      w = (const void*) ((const float*) w + 1);
18948
0
      float vfpacc = (float) vacc * vscale;
18949
18950
0
      vfpacc += vmagic_bias;
18951
0
      int32_t vout = (int32_t) float_as_uint32(vfpacc);
18952
0
      vout = math_max_s32(vout, vmagic_min);
18953
0
      vout = math_min_s32(vout, vmagic_max);
18954
0
      vout -= vmagic_bias_less_zero_point;
18955
18956
0
      *output++ = (int8_t) vout;
18957
0
    } while (--c != 0);
18958
18959
0
    output = (int8_t*) ((uintptr_t) output + output_increment);
18960
0
  } while (--output_width != 0);
18961
0
}
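
xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic above uses the integer-magic variant instead: the magic bias is added before any clamping, and the saturation bounds (magic_min / magic_max) are applied to the biased float's bit pattern, which is monotone over the representable output range, before magic_bias_less_zero_point is subtracted. A small self-contained sketch of that requantization step (quantize_imagic, bits_of, s32_min and s32_max are illustrative names, not part of scalar.c; the bound derivation mirrors what the params struct appears to precompute):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t bits_of(float f) {  // same role as float_as_uint32() in the kernels
  uint32_t u;
  memcpy(&u, &f, sizeof u);
  return u;
}

static int32_t s32_max(int32_t a, int32_t b) { return a > b ? a : b; }
static int32_t s32_min(int32_t a, int32_t b) { return a < b ? a : b; }

static int8_t quantize_imagic(int32_t acc, float scale,
                              int8_t out_min, int8_t out_max, int32_t zero_point) {
  const float magic_bias = 12582912.0f;  // 0x1.8p+23
  // Derived inline for illustration; the kernels read these ready-made from params.
  const int32_t magic_min = (int32_t) bits_of(magic_bias + (float) ((int32_t) out_min - zero_point));
  const int32_t magic_max = (int32_t) bits_of(magic_bias + (float) ((int32_t) out_max - zero_point));
  const int32_t magic_bias_less_zero_point = (int32_t) bits_of(magic_bias) - zero_point;

  float x = (float) acc * scale + magic_bias;  // rounded integer lands in the low mantissa bits
  int32_t v = (int32_t) bits_of(x);
  v = s32_max(v, magic_min);  // clamping bit patterns clamps values: the mapping is monotone here
  v = s32_min(v, magic_max);
  return (int8_t) (v - magic_bias_less_zero_point);
}

int main(void) {
  // 5000 * 0.01 = 50; with output zero point -5 the stored int8 is 45
  printf("%d\n", quantize_imagic(5000, 0.01f, -128, 127, -5));
  return 0;
}
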
18962
18963
void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf(
18964
    size_t channels,
18965
    size_t output_width,
18966
    const int8_t** input,
18967
    const void* weights,
18968
    int8_t* output,
18969
    intptr_t input_stride,
18970
    size_t output_increment,
18971
    size_t input_offset,
18972
    const int8_t* zero,
18973
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
18974
0
{
18975
0
  assert(channels != 0);
18976
0
  assert(output_width != 0);
18977
18978
0
  const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
18979
0
  const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
18980
0
  const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
18981
0
  do {
18982
0
    const int8_t* i0 = input[0];
18983
0
    assert(i0 != NULL);
18984
0
    if XNN_UNPREDICTABLE(i0 != zero) {
18985
0
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
18986
0
    }
18987
0
    const int8_t* i1 = input[1];
18988
0
    assert(i1 != NULL);
18989
0
    if XNN_UNPREDICTABLE(i1 != zero) {
18990
0
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
18991
0
    }
18992
0
    const int8_t* i2 = input[2];
18993
0
    assert(i2 != NULL);
18994
0
    if XNN_UNPREDICTABLE(i2 != zero) {
18995
0
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
18996
0
    }
18997
0
    const int8_t* i3 = input[3];
18998
0
    assert(i3 != NULL);
18999
0
    if XNN_UNPREDICTABLE(i3 != zero) {
19000
0
      i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
19001
0
    }
19002
0
    const int8_t* i4 = input[4];
19003
0
    assert(i4 != NULL);
19004
0
    if XNN_UNPREDICTABLE(i4 != zero) {
19005
0
      i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
19006
0
    }
19007
0
    const int8_t* i5 = input[5];
19008
0
    assert(i5 != NULL);
19009
0
    if XNN_UNPREDICTABLE(i5 != zero) {
19010
0
      i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
19011
0
    }
19012
0
    const int8_t* i6 = input[6];
19013
0
    assert(i6 != NULL);
19014
0
    if XNN_UNPREDICTABLE(i6 != zero) {
19015
0
      i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
19016
0
    }
19017
0
    const int8_t* i7 = input[7];
19018
0
    assert(i7 != NULL);
19019
0
    if XNN_UNPREDICTABLE(i7 != zero) {
19020
0
      i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
19021
0
    }
19022
0
    const int8_t* i8 = input[8];
19023
0
    assert(i8 != NULL);
19024
0
    if XNN_UNPREDICTABLE(i8 != zero) {
19025
0
      i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
19026
0
    }
19027
0
    const int8_t* i9 = input[9];
19028
0
    assert(i9 != NULL);
19029
0
    if XNN_UNPREDICTABLE(i9 != zero) {
19030
0
      i9 = (const int8_t*) ((uintptr_t) i9 + input_offset);
19031
0
    }
19032
0
    const int8_t* i10 = input[10];
19033
0
    assert(i10 != NULL);
19034
0
    if XNN_UNPREDICTABLE(i10 != zero) {
19035
0
      i10 = (const int8_t*) ((uintptr_t) i10 + input_offset);
19036
0
    }
19037
0
    const int8_t* i11 = input[11];
19038
0
    assert(i11 != NULL);
19039
0
    if XNN_UNPREDICTABLE(i11 != zero) {
19040
0
      i11 = (const int8_t*) ((uintptr_t) i11 + input_offset);
19041
0
    }
19042
0
    const int8_t* i12 = input[12];
19043
0
    assert(i12 != NULL);
19044
0
    if XNN_UNPREDICTABLE(i12 != zero) {
19045
0
      i12 = (const int8_t*) ((uintptr_t) i12 + input_offset);
19046
0
    }
19047
0
    const int8_t* i13 = input[13];
19048
0
    assert(i13 != NULL);
19049
0
    if XNN_UNPREDICTABLE(i13 != zero) {
19050
0
      i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
19051
0
    }
19052
0
    const int8_t* i14 = input[14];
19053
0
    assert(i14 != NULL);
19054
0
    if XNN_UNPREDICTABLE(i14 != zero) {
19055
0
      i14 = (const int8_t*) ((uintptr_t) i14 + input_offset);
19056
0
    }
19057
0
    const int8_t* i15 = input[15];
19058
0
    assert(i15 != NULL);
19059
0
    if XNN_UNPREDICTABLE(i15 != zero) {
19060
0
      i15 = (const int8_t*) ((uintptr_t) i15 + input_offset);
19061
0
    }
19062
0
    const int8_t* i16 = input[16];
19063
0
    assert(i16 != NULL);
19064
0
    if XNN_UNPREDICTABLE(i16 != zero) {
19065
0
      i16 = (const int8_t*) ((uintptr_t) i16 + input_offset);
19066
0
    }
19067
0
    const int8_t* i17 = input[17];
19068
0
    assert(i17 != NULL);
19069
0
    if XNN_UNPREDICTABLE(i17 != zero) {
19070
0
      i17 = (const int8_t*) ((uintptr_t) i17 + input_offset);
19071
0
    }
19072
0
    const int8_t* i18 = input[18];
19073
0
    assert(i18 != NULL);
19074
0
    if XNN_UNPREDICTABLE(i18 != zero) {
19075
0
      i18 = (const int8_t*) ((uintptr_t) i18 + input_offset);
19076
0
    }
19077
0
    const int8_t* i19 = input[19];
19078
0
    assert(i19 != NULL);
19079
0
    if XNN_UNPREDICTABLE(i19 != zero) {
19080
0
      i19 = (const int8_t*) ((uintptr_t) i19 + input_offset);
19081
0
    }
19082
0
    const int8_t* i20 = input[20];
19083
0
    assert(i20 != NULL);
19084
0
    if XNN_UNPREDICTABLE(i20 != zero) {
19085
0
      i20 = (const int8_t*) ((uintptr_t) i20 + input_offset);
19086
0
    }
19087
0
    const int8_t* i21 = input[21];
19088
0
    assert(i21 != NULL);
19089
0
    if XNN_UNPREDICTABLE(i21 != zero) {
19090
0
      i21 = (const int8_t*) ((uintptr_t) i21 + input_offset);
19091
0
    }
19092
0
    const int8_t* i22 = input[22];
19093
0
    assert(i22 != NULL);
19094
0
    if XNN_UNPREDICTABLE(i22 != zero) {
19095
0
      i22 = (const int8_t*) ((uintptr_t) i22 + input_offset);
19096
0
    }
19097
0
    const int8_t* i23 = input[23];
19098
0
    assert(i23 != NULL);
19099
0
    if XNN_UNPREDICTABLE(i23 != zero) {
19100
0
      i23 = (const int8_t*) ((uintptr_t) i23 + input_offset);
19101
0
    }
19102
0
    const int8_t* i24 = input[24];
19103
0
    assert(i24 != NULL);
19104
0
    if XNN_UNPREDICTABLE(i24 != zero) {
19105
0
      i24 = (const int8_t*) ((uintptr_t) i24 + input_offset);
19106
0
    }
19107
0
    input = (const int8_t**) ((uintptr_t) input + input_stride);
19108
19109
0
    size_t c = channels;
19110
0
    const void* w = weights;
19111
0
    for (; c >= 2; c -= 2) {
19112
0
      int32_t vacc0 = unaligned_indexed_load_s32(w, 0);
19113
0
      int32_t vacc1 = unaligned_indexed_load_s32(w, 1);
19114
19115
19116
0
      const int32_t vi0x0 = (int32_t) i0[0];
19117
0
      const int32_t vi0x1 = (int32_t) i0[1];
19118
0
      i0 += 2;
19119
19120
0
      const int32_t vk0x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[0];
19121
0
      const int32_t vk0x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[1];
19122
19123
0
      vacc0 += vi0x0 * vk0x0;
19124
0
      vacc1 += vi0x1 * vk0x1;
19125
19126
0
      const int32_t vi1x0 = (int32_t) i1[0];
19127
0
      const int32_t vi1x1 = (int32_t) i1[1];
19128
0
      i1 += 2;
19129
19130
0
      const int32_t vk1x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[2];
19131
0
      const int32_t vk1x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[3];
19132
19133
0
      vacc0 += vi1x0 * vk1x0;
19134
0
      vacc1 += vi1x1 * vk1x1;
19135
19136
0
      const int32_t vi2x0 = (int32_t) i2[0];
19137
0
      const int32_t vi2x1 = (int32_t) i2[1];
19138
0
      i2 += 2;
19139
19140
0
      const int32_t vk2x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[4];
19141
0
      const int32_t vk2x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[5];
19142
19143
0
      vacc0 += vi2x0 * vk2x0;
19144
0
      vacc1 += vi2x1 * vk2x1;
19145
19146
0
      const int32_t vi3x0 = (int32_t) i3[0];
19147
0
      const int32_t vi3x1 = (int32_t) i3[1];
19148
0
      i3 += 2;
19149
19150
0
      const int32_t vk3x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[6];
19151
0
      const int32_t vk3x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[7];
19152
19153
0
      vacc0 += vi3x0 * vk3x0;
19154
0
      vacc1 += vi3x1 * vk3x1;
19155
19156
0
      const int32_t vi4x0 = (int32_t) i4[0];
19157
0
      const int32_t vi4x1 = (int32_t) i4[1];
19158
0
      i4 += 2;
19159
19160
0
      const int32_t vk4x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[8];
19161
0
      const int32_t vk4x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[9];
19162
19163
0
      vacc0 += vi4x0 * vk4x0;
19164
0
      vacc1 += vi4x1 * vk4x1;
19165
19166
0
      const int32_t vi5x0 = (int32_t) i5[0];
19167
0
      const int32_t vi5x1 = (int32_t) i5[1];
19168
0
      i5 += 2;
19169
19170
0
      const int32_t vk5x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[10];
19171
0
      const int32_t vk5x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[11];
19172
19173
0
      vacc0 += vi5x0 * vk5x0;
19174
0
      vacc1 += vi5x1 * vk5x1;
19175
19176
0
      const int32_t vi6x0 = (int32_t) i6[0];
19177
0
      const int32_t vi6x1 = (int32_t) i6[1];
19178
0
      i6 += 2;
19179
19180
0
      const int32_t vk6x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[12];
19181
0
      const int32_t vk6x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[13];
19182
19183
0
      vacc0 += vi6x0 * vk6x0;
19184
0
      vacc1 += vi6x1 * vk6x1;
19185
19186
0
      const int32_t vi7x0 = (int32_t) i7[0];
19187
0
      const int32_t vi7x1 = (int32_t) i7[1];
19188
0
      i7 += 2;
19189
19190
0
      const int32_t vk7x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[14];
19191
0
      const int32_t vk7x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[15];
19192
19193
0
      vacc0 += vi7x0 * vk7x0;
19194
0
      vacc1 += vi7x1 * vk7x1;
19195
19196
0
      const int32_t vi8x0 = (int32_t) i8[0];
19197
0
      const int32_t vi8x1 = (int32_t) i8[1];
19198
0
      i8 += 2;
19199
19200
0
      const int32_t vk8x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[16];
19201
0
      const int32_t vk8x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[17];
19202
19203
0
      vacc0 += vi8x0 * vk8x0;
19204
0
      vacc1 += vi8x1 * vk8x1;
19205
19206
0
      const int32_t vi9x0 = (int32_t) i9[0];
19207
0
      const int32_t vi9x1 = (int32_t) i9[1];
19208
0
      i9 += 2;
19209
19210
0
      const int32_t vk9x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[18];
19211
0
      const int32_t vk9x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[19];
19212
19213
0
      vacc0 += vi9x0 * vk9x0;
19214
0
      vacc1 += vi9x1 * vk9x1;
19215
19216
0
      const int32_t vi10x0 = (int32_t) i10[0];
19217
0
      const int32_t vi10x1 = (int32_t) i10[1];
19218
0
      i10 += 2;
19219
19220
0
      const int32_t vk10x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[20];
19221
0
      const int32_t vk10x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[21];
19222
19223
0
      vacc0 += vi10x0 * vk10x0;
19224
0
      vacc1 += vi10x1 * vk10x1;
19225
19226
0
      const int32_t vi11x0 = (int32_t) i11[0];
19227
0
      const int32_t vi11x1 = (int32_t) i11[1];
19228
0
      i11 += 2;
19229
19230
0
      const int32_t vk11x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[22];
19231
0
      const int32_t vk11x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[23];
19232
19233
0
      vacc0 += vi11x0 * vk11x0;
19234
0
      vacc1 += vi11x1 * vk11x1;
19235
19236
0
      const int32_t vi12x0 = (int32_t) i12[0];
19237
0
      const int32_t vi12x1 = (int32_t) i12[1];
19238
0
      i12 += 2;
19239
19240
0
      const int32_t vk12x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[24];
19241
0
      const int32_t vk12x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[25];
19242
19243
0
      vacc0 += vi12x0 * vk12x0;
19244
0
      vacc1 += vi12x1 * vk12x1;
19245
19246
0
      const int32_t vi13x0 = (int32_t) i13[0];
19247
0
      const int32_t vi13x1 = (int32_t) i13[1];
19248
0
      i13 += 2;
19249
19250
0
      const int32_t vk13x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[26];
19251
0
      const int32_t vk13x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[27];
19252
19253
0
      vacc0 += vi13x0 * vk13x0;
19254
0
      vacc1 += vi13x1 * vk13x1;
19255
19256
0
      const int32_t vi14x0 = (int32_t) i14[0];
19257
0
      const int32_t vi14x1 = (int32_t) i14[1];
19258
0
      i14 += 2;
19259
19260
0
      const int32_t vk14x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[28];
19261
0
      const int32_t vk14x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[29];
19262
19263
0
      vacc0 += vi14x0 * vk14x0;
19264
0
      vacc1 += vi14x1 * vk14x1;
19265
19266
0
      const int32_t vi15x0 = (int32_t) i15[0];
19267
0
      const int32_t vi15x1 = (int32_t) i15[1];
19268
0
      i15 += 2;
19269
19270
0
      const int32_t vk15x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[30];
19271
0
      const int32_t vk15x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[31];
19272
19273
0
      vacc0 += vi15x0 * vk15x0;
19274
0
      vacc1 += vi15x1 * vk15x1;
19275
19276
0
      const int32_t vi16x0 = (int32_t) i16[0];
19277
0
      const int32_t vi16x1 = (int32_t) i16[1];
19278
0
      i16 += 2;
19279
19280
0
      const int32_t vk16x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[32];
19281
0
      const int32_t vk16x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[33];
19282
19283
0
      vacc0 += vi16x0 * vk16x0;
19284
0
      vacc1 += vi16x1 * vk16x1;
19285
19286
0
      const int32_t vi17x0 = (int32_t) i17[0];
19287
0
      const int32_t vi17x1 = (int32_t) i17[1];
19288
0
      i17 += 2;
19289
19290
0
      const int32_t vk17x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[34];
19291
0
      const int32_t vk17x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[35];
19292
19293
0
      vacc0 += vi17x0 * vk17x0;
19294
0
      vacc1 += vi17x1 * vk17x1;
19295
19296
0
      const int32_t vi18x0 = (int32_t) i18[0];
19297
0
      const int32_t vi18x1 = (int32_t) i18[1];
19298
0
      i18 += 2;
19299
19300
0
      const int32_t vk18x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[36];
19301
0
      const int32_t vk18x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[37];
19302
19303
0
      vacc0 += vi18x0 * vk18x0;
19304
0
      vacc1 += vi18x1 * vk18x1;
19305
19306
0
      const int32_t vi19x0 = (int32_t) i19[0];
19307
0
      const int32_t vi19x1 = (int32_t) i19[1];
19308
0
      i19 += 2;
19309
19310
0
      const int32_t vk19x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[38];
19311
0
      const int32_t vk19x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[39];
19312
19313
0
      vacc0 += vi19x0 * vk19x0;
19314
0
      vacc1 += vi19x1 * vk19x1;
19315
19316
0
      const int32_t vi20x0 = (int32_t) i20[0];
19317
0
      const int32_t vi20x1 = (int32_t) i20[1];
19318
0
      i20 += 2;
19319
19320
0
      const int32_t vk20x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[40];
19321
0
      const int32_t vk20x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[41];
19322
19323
0
      vacc0 += vi20x0 * vk20x0;
19324
0
      vacc1 += vi20x1 * vk20x1;
19325
19326
0
      const int32_t vi21x0 = (int32_t) i21[0];
19327
0
      const int32_t vi21x1 = (int32_t) i21[1];
19328
0
      i21 += 2;
19329
19330
0
      const int32_t vk21x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[42];
19331
0
      const int32_t vk21x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[43];
19332
19333
0
      vacc0 += vi21x0 * vk21x0;
19334
0
      vacc1 += vi21x1 * vk21x1;
19335
19336
0
      const int32_t vi22x0 = (int32_t) i22[0];
19337
0
      const int32_t vi22x1 = (int32_t) i22[1];
19338
0
      i22 += 2;
19339
19340
0
      const int32_t vk22x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[44];
19341
0
      const int32_t vk22x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[45];
19342
19343
0
      vacc0 += vi22x0 * vk22x0;
19344
0
      vacc1 += vi22x1 * vk22x1;
19345
19346
0
      const int32_t vi23x0 = (int32_t) i23[0];
19347
0
      const int32_t vi23x1 = (int32_t) i23[1];
19348
0
      i23 += 2;
19349
19350
0
      const int32_t vk23x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[46];
19351
0
      const int32_t vk23x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[47];
19352
19353
0
      vacc0 += vi23x0 * vk23x0;
19354
0
      vacc1 += vi23x1 * vk23x1;
19355
19356
0
      const int32_t vi24x0 = (int32_t) i24[0];
19357
0
      const int32_t vi24x1 = (int32_t) i24[1];
19358
0
      i24 += 2;
19359
19360
0
      const int32_t vk24x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[48];
19361
0
      const int32_t vk24x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[49];
19362
19363
0
      vacc0 += vi24x0 * vk24x0;
19364
0
      vacc1 += vi24x1 * vk24x1;
19365
19366
0
      w = (const void*) ((uintptr_t) w + 2 * sizeof(int32_t) + 50 * sizeof(int8_t));
19367
19368
0
      float vfpacc0 = (float) vacc0;
19369
0
      float vfpacc1 = (float) vacc1;
19370
19371
0
      const float vscale0 = unaligned_indexed_load_f32(w, 0);
19372
0
      const float vscale1 = unaligned_indexed_load_f32(w, 1);
19373
0
      w = (const void*) ((const float*) w + 2);
19374
19375
0
      vfpacc0 *= vscale0;
19376
0
      vfpacc1 *= vscale1;
19377
19378
0
      vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point);
19379
0
      vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point);
19380
19381
0
      vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point);
19382
0
      vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point);
19383
19384
0
      const int32_t vrndacc0 = (int32_t) lrintf(vfpacc0);
19385
0
      const int32_t vrndacc1 = (int32_t) lrintf(vfpacc1);
19386
19387
0
      int32_t vout0 = (int32_t) vrndacc0 + voutput_zero_point;
19388
0
      int32_t vout1 = (int32_t) vrndacc1 + voutput_zero_point;
19389
19390
0
      output[0] = (int8_t) vout0;
19391
0
      output[1] = (int8_t) vout1;
19392
0
      output += 2;
19393
0
    }
19394
0
    if XNN_UNLIKELY(c != 0) {
19395
0
      int32_t vacc = unaligned_load_s32(w);
19396
19397
0
      const int32_t vi0 = (int32_t) *i0;
19398
0
      const int32_t vk0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[0];
19399
0
      vacc += vi0 * vk0;
19400
0
      const int32_t vi1 = (int32_t) *i1;
19401
0
      const int32_t vk1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[2];
19402
0
      vacc += vi1 * vk1;
19403
0
      const int32_t vi2 = (int32_t) *i2;
19404
0
      const int32_t vk2 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[4];
19405
0
      vacc += vi2 * vk2;
19406
0
      const int32_t vi3 = (int32_t) *i3;
19407
0
      const int32_t vk3 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[6];
19408
0
      vacc += vi3 * vk3;
19409
0
      const int32_t vi4 = (int32_t) *i4;
19410
0
      const int32_t vk4 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[8];
19411
0
      vacc += vi4 * vk4;
19412
0
      const int32_t vi5 = (int32_t) *i5;
19413
0
      const int32_t vk5 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[10];
19414
0
      vacc += vi5 * vk5;
19415
0
      const int32_t vi6 = (int32_t) *i6;
19416
0
      const int32_t vk6 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[12];
19417
0
      vacc += vi6 * vk6;
19418
0
      const int32_t vi7 = (int32_t) *i7;
19419
0
      const int32_t vk7 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[14];
19420
0
      vacc += vi7 * vk7;
19421
0
      const int32_t vi8 = (int32_t) *i8;
19422
0
      const int32_t vk8 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[16];
19423
0
      vacc += vi8 * vk8;
19424
0
      const int32_t vi9 = (int32_t) *i9;
19425
0
      const int32_t vk9 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[18];
19426
0
      vacc += vi9 * vk9;
19427
0
      const int32_t vi10 = (int32_t) *i10;
19428
0
      const int32_t vk10 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[20];
19429
0
      vacc += vi10 * vk10;
19430
0
      const int32_t vi11 = (int32_t) *i11;
19431
0
      const int32_t vk11 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[22];
19432
0
      vacc += vi11 * vk11;
19433
0
      const int32_t vi12 = (int32_t) *i12;
19434
0
      const int32_t vk12 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[24];
19435
0
      vacc += vi12 * vk12;
19436
0
      const int32_t vi13 = (int32_t) *i13;
19437
0
      const int32_t vk13 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[26];
19438
0
      vacc += vi13 * vk13;
19439
0
      const int32_t vi14 = (int32_t) *i14;
19440
0
      const int32_t vk14 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[28];
19441
0
      vacc += vi14 * vk14;
19442
0
      const int32_t vi15 = (int32_t) *i15;
19443
0
      const int32_t vk15 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[30];
19444
0
      vacc += vi15 * vk15;
19445
0
      const int32_t vi16 = (int32_t) *i16;
19446
0
      const int32_t vk16 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[32];
19447
0
      vacc += vi16 * vk16;
19448
0
      const int32_t vi17 = (int32_t) *i17;
19449
0
      const int32_t vk17 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[34];
19450
0
      vacc += vi17 * vk17;
19451
0
      const int32_t vi18 = (int32_t) *i18;
19452
0
      const int32_t vk18 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[36];
19453
0
      vacc += vi18 * vk18;
19454
0
      const int32_t vi19 = (int32_t) *i19;
19455
0
      const int32_t vk19 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[38];
19456
0
      vacc += vi19 * vk19;
19457
0
      const int32_t vi20 = (int32_t) *i20;
19458
0
      const int32_t vk20 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[40];
19459
0
      vacc += vi20 * vk20;
19460
0
      const int32_t vi21 = (int32_t) *i21;
19461
0
      const int32_t vk21 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[42];
19462
0
      vacc += vi21 * vk21;
19463
0
      const int32_t vi22 = (int32_t) *i22;
19464
0
      const int32_t vk22 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[44];
19465
0
      vacc += vi22 * vk22;
19466
0
      const int32_t vi23 = (int32_t) *i23;
19467
0
      const int32_t vk23 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[46];
19468
0
      vacc += vi23 * vk23;
19469
0
      const int32_t vi24 = (int32_t) *i24;
19470
0
      const int32_t vk24 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[48];
19471
0
      vacc += vi24 * vk24;
19472
19473
0
      const float vscale = unaligned_load_f32((const void*) ((uintptr_t) w + 2 * sizeof(int32_t) + 50 * sizeof(int8_t)));
19474
0
      float vfpacc = (float) vacc * vscale;
19475
19476
0
      vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
19477
0
      vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
19478
0
      const int32_t vrndacc = (int32_t) lrintf(vfpacc);
19479
0
      int32_t vout = vrndacc + voutput_zero_point;
19480
19481
0
      *output++ = (int8_t) vout;
19482
0
    }
19483
19484
0
    output = (int8_t*) ((uintptr_t) output + output_increment);
19485
0
  } while (--output_width != 0);
19486
0
}
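
xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf above processes two channels per iteration, with a one-channel tail, and takes the simplest requantization route: clamp the scaled accumulator in float, round with lrintf(), then re-add the output zero point. A standalone sketch of that step (quantize_lrintf is an illustrative name, not part of scalar.c; link with -lm):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

static int8_t quantize_lrintf(int32_t acc, float scale,
                              int8_t out_min, int8_t out_max, int32_t zero_point) {
  float x = (float) acc * scale;
  x = fmaxf(x, (float) ((int32_t) out_min - zero_point));  // output_min_less_zero_point
  x = fminf(x, (float) ((int32_t) out_max - zero_point));  // output_max_less_zero_point
  return (int8_t) ((int32_t) lrintf(x) + zero_point);      // round to nearest, then re-add zero point
}

int main(void) {
  // -1234 * 0.05 = -61.7 -> -62; with output zero point 3 the stored int8 is -59
  printf("%d\n", quantize_lrintf(-1234, 0.05f, -128, 127, 3));
  return 0;
}
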
19487
19488
void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p1c__scalar_fmagic(
19489
    size_t channels,
19490
    size_t output_width,
19491
    const int8_t** input,
19492
    const void* weights,
19493
    int8_t* output,
19494
    intptr_t input_stride,
19495
    size_t output_increment,
19496
    size_t input_offset,
19497
    const int8_t* zero,
19498
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
19499
0
{
19500
0
  assert(channels != 0);
19501
0
  assert(output_width != 0);
19502
19503
0
  const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point;
19504
0
  const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point;
19505
0
  const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias;
19506
0
  const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point;
19507
0
  do {
19508
0
    const int8_t* i0 = input[0];
19509
0
    assert(i0 != NULL);
19510
0
    if XNN_UNPREDICTABLE(i0 != zero) {
19511
0
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
19512
0
    }
19513
0
    const int8_t* i1 = input[1];
19514
0
    assert(i1 != NULL);
19515
0
    if XNN_UNPREDICTABLE(i1 != zero) {
19516
0
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
19517
0
    }
19518
0
    const int8_t* i2 = input[2];
19519
0
    assert(i2 != NULL);
19520
0
    if XNN_UNPREDICTABLE(i2 != zero) {
19521
0
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
19522
0
    }
19523
0
    input = (const int8_t**) ((uintptr_t) input + input_stride);
19524
19525
0
    size_t c = channels;
19526
0
    const void* w = weights;
19527
0
    do {
19528
0
      int32_t vacc = unaligned_load_s32(w);
19529
19530
0
      const int32_t vi0 = (int32_t) *i0++;
19531
0
      const int32_t vk0 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[0];
19532
0
      vacc += vi0 * vk0;
19533
0
      const int32_t vi1 = (int32_t) *i1++;
19534
0
      const int32_t vk1 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[1];
19535
0
      vacc += vi1 * vk1;
19536
0
      const int32_t vi2 = (int32_t) *i2++;
19537
0
      const int32_t vk2 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[2];
19538
0
      vacc += vi2 * vk2;
19539
19540
0
      w = (const void*) ((uintptr_t) w + sizeof(int32_t) + 3 * sizeof(int8_t));
19541
19542
0
      const float vscale = unaligned_load_f32(w);
19543
0
      w = (const void*) ((const float*) w + 1);
19544
0
      float vfpacc = (float) vacc * vscale;
19545
19546
0
      vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
19547
0
      vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
19548
0
      vfpacc += vmagic_bias;
19549
0
      int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point;
19550
19551
0
      *output++ = (int8_t) vout;
19552
0
    } while (--c != 0);
19553
19554
0
    output = (int8_t*) ((uintptr_t) output + output_increment);
19555
0
  } while (--output_width != 0);
19556
0
}
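
All of these qs8_qc8w dwconv kernels walk the packed weights with the layout the pointer arithmetic above implies: per channel group, the int32 biases, then the int8 taps stored tap by tap with the group's channels adjacent, then the per-channel float scales. A sketch of the resulting per-group stride (qc8w_dwconv_weights_stride is an illustrative helper derived from that arithmetic, not an XNNPACK API):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

// Per channel group of size cr: cr int32 biases, then taps*cr int8 weights, then cr float scales.
static size_t qc8w_dwconv_weights_stride(size_t cr, size_t taps) {
  return cr * sizeof(int32_t) + cr * taps * sizeof(int8_t) + cr * sizeof(float);
}

int main(void) {
  // 25p1c kernels step w by 4 + 25 + 4 = 33 bytes per channel;
  // 3p2c kernels step w by 8 + 6 + 8 = 22 bytes per pair of channels.
  printf("%zu %zu\n", qc8w_dwconv_weights_stride(1, 25), qc8w_dwconv_weights_stride(2, 3));
  return 0;
}
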
19557
19558
void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p2c__scalar_imagic(
19559
    size_t channels,
19560
    size_t output_width,
19561
    const int8_t** input,
19562
    const void* weights,
19563
    int8_t* output,
19564
    intptr_t input_stride,
19565
    size_t output_increment,
19566
    size_t input_offset,
19567
    const int8_t* zero,
19568
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
19569
0
{
19570
0
  assert(channels != 0);
19571
0
  assert(output_width != 0);
19572
19573
0
  const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
19574
0
  const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
19575
0
  const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
19576
0
  const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
19577
0
  do {
19578
0
    const int8_t* i0 = input[0];
19579
0
    assert(i0 != NULL);
19580
0
    if XNN_UNPREDICTABLE(i0 != zero) {
19581
0
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
19582
0
    }
19583
0
    const int8_t* i1 = input[1];
19584
0
    assert(i1 != NULL);
19585
0
    if XNN_UNPREDICTABLE(i1 != zero) {
19586
0
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
19587
0
    }
19588
0
    const int8_t* i2 = input[2];
19589
0
    assert(i2 != NULL);
19590
0
    if XNN_UNPREDICTABLE(i2 != zero) {
19591
0
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
19592
0
    }
19593
0
    input = (const int8_t**) ((uintptr_t) input + input_stride);
19594
19595
0
    size_t c = channels;
19596
0
    const void* w = weights;
19597
0
    for (; c >= 2; c -= 2) {
19598
0
      int32_t vacc0 = unaligned_indexed_load_s32(w, 0);
19599
0
      int32_t vacc1 = unaligned_indexed_load_s32(w, 1);
19600
19601
19602
0
      const int32_t vi0x0 = (int32_t) i0[0];
19603
0
      const int32_t vi0x1 = (int32_t) i0[1];
19604
0
      i0 += 2;
19605
19606
0
      const int32_t vk0x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[0];
19607
0
      const int32_t vk0x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[1];
19608
19609
0
      vacc0 += vi0x0 * vk0x0;
19610
0
      vacc1 += vi0x1 * vk0x1;
19611
19612
0
      const int32_t vi1x0 = (int32_t) i1[0];
19613
0
      const int32_t vi1x1 = (int32_t) i1[1];
19614
0
      i1 += 2;
19615
19616
0
      const int32_t vk1x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[2];
19617
0
      const int32_t vk1x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[3];
19618
19619
0
      vacc0 += vi1x0 * vk1x0;
19620
0
      vacc1 += vi1x1 * vk1x1;
19621
19622
0
      const int32_t vi2x0 = (int32_t) i2[0];
19623
0
      const int32_t vi2x1 = (int32_t) i2[1];
19624
0
      i2 += 2;
19625
19626
0
      const int32_t vk2x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[4];
19627
0
      const int32_t vk2x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[5];
19628
19629
0
      vacc0 += vi2x0 * vk2x0;
19630
0
      vacc1 += vi2x1 * vk2x1;
19631
19632
0
      w = (const void*) ((uintptr_t) w + 2 * sizeof(int32_t) + 6 * sizeof(int8_t));
19633
19634
0
      float vfpacc0 = (float) vacc0;
19635
0
      float vfpacc1 = (float) vacc1;
19636
19637
0
      const float vscale0 = unaligned_indexed_load_f32(w, 0);
19638
0
      const float vscale1 = unaligned_indexed_load_f32(w, 1);
19639
0
      w = (const void*) ((const float*) w + 2);
19640
19641
0
      vfpacc0 *= vscale0;
19642
0
      vfpacc1 *= vscale1;
19643
19644
0
      vfpacc0 += vmagic_bias;
19645
0
      vfpacc1 += vmagic_bias;
19646
19647
0
      int32_t vout0 = (int32_t) float_as_uint32(vfpacc0);
19648
0
      int32_t vout1 = (int32_t) float_as_uint32(vfpacc1);
19649
19650
0
      vout0 = math_max_s32(vout0, vmagic_min);
19651
0
      vout1 = math_max_s32(vout1, vmagic_min);
19652
19653
0
      vout0 = math_min_s32(vout0, vmagic_max);
19654
0
      vout1 = math_min_s32(vout1, vmagic_max);
19655
19656
0
      vout0 -= vmagic_bias_less_zero_point;
19657
0
      vout1 -= vmagic_bias_less_zero_point;
19658
19659
0
      output[0] = (int8_t) vout0;
19660
0
      output[1] = (int8_t) vout1;
19661
0
      output += 2;
19662
0
    }
19663
0
    if XNN_UNLIKELY(c != 0) {
19664
0
      int32_t vacc = unaligned_load_s32(w);
19665
19666
0
      const int32_t vi0 = (int32_t) *i0;
19667
0
      const int32_t vk0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[0];
19668
0
      vacc += vi0 * vk0;
19669
0
      const int32_t vi1 = (int32_t) *i1;
19670
0
      const int32_t vk1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[2];
19671
0
      vacc += vi1 * vk1;
19672
0
      const int32_t vi2 = (int32_t) *i2;
19673
0
      const int32_t vk2 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[4];
19674
0
      vacc += vi2 * vk2;
19675
19676
0
      const float vscale = unaligned_load_f32((const void*) ((uintptr_t) w + 2 * sizeof(int32_t) + 6 * sizeof(int8_t)));
19677
0
      float vfpacc = (float) vacc * vscale;
19678
19679
0
      vfpacc += vmagic_bias;
19680
0
      int32_t vout = (int32_t) float_as_uint32(vfpacc);
19681
0
      vout = math_max_s32(vout, vmagic_min);
19682
0
      vout = math_min_s32(vout, vmagic_max);
19683
0
      vout -= vmagic_bias_less_zero_point;
19684
19685
0
      *output++ = (int8_t) vout;
19686
0
    }
19687
19688
0
    output = (int8_t*) ((uintptr_t) output + output_increment);
19689
0
  } while (--output_width != 0);
19690
0
}
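
Every kernel in this family begins by resolving its indirection pointers the same way: a tap pointer equal to the shared zero buffer supplies padding and is used as-is, while every other pointer is rebased by input_offset. A sketch of that per-tap decision (resolve_tap is an illustrative name, not part of scalar.c):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static const int8_t* resolve_tap(const int8_t* tap, const int8_t* zero, size_t input_offset) {
  assert(tap != NULL);
  // Pointers into the shared zero buffer supply padding and are never rebased;
  // everything else is shifted into the current batch/image by input_offset.
  return (tap != zero) ? (const int8_t*) ((uintptr_t) tap + input_offset) : tap;
}

int main(void) {
  static const int8_t zero_buf[4] = {0, 0, 0, 0};
  static const int8_t row[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  printf("%d %d\n", *resolve_tap(zero_buf, zero_buf, 4), *resolve_tap(row, zero_buf, 4));  // prints 0 5
  return 0;
}
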
19691
19692
void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p2c__scalar_lrintf(
19693
    size_t channels,
19694
    size_t output_width,
19695
    const int8_t** input,
19696
    const void* weights,
19697
    int8_t* output,
19698
    intptr_t input_stride,
19699
    size_t output_increment,
19700
    size_t input_offset,
19701
    const int8_t* zero,
19702
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
19703
0
{
19704
0
  assert(channels != 0);
19705
0
  assert(output_width != 0);
19706
19707
0
  const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
19708
0
  const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
19709
0
  const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
19710
0
  do {
19711
0
    const int8_t* i0 = input[0];
19712
0
    assert(i0 != NULL);
19713
0
    if XNN_UNPREDICTABLE(i0 != zero) {
19714
0
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
19715
0
    }
19716
0
    const int8_t* i1 = input[1];
19717
0
    assert(i1 != NULL);
19718
0
    if XNN_UNPREDICTABLE(i1 != zero) {
19719
0
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
19720
0
    }
19721
0
    const int8_t* i2 = input[2];
19722
0
    assert(i2 != NULL);
19723
0
    if XNN_UNPREDICTABLE(i2 != zero) {
19724
0
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
19725
0
    }
19726
0
    input = (const int8_t**) ((uintptr_t) input + input_stride);
19727
19728
0
    size_t c = channels;
19729
0
    const void* w = weights;
19730
0
    for (; c >= 2; c -= 2) {
19731
0
      int32_t vacc0 = unaligned_indexed_load_s32(w, 0);
19732
0
      int32_t vacc1 = unaligned_indexed_load_s32(w, 1);
19733
19734
19735
0
      const int32_t vi0x0 = (int32_t) i0[0];
19736
0
      const int32_t vi0x1 = (int32_t) i0[1];
19737
0
      i0 += 2;
19738
19739
0
      const int32_t vk0x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[0];
19740
0
      const int32_t vk0x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[1];
19741
19742
0
      vacc0 += vi0x0 * vk0x0;
19743
0
      vacc1 += vi0x1 * vk0x1;
19744
19745
0
      const int32_t vi1x0 = (int32_t) i1[0];
19746
0
      const int32_t vi1x1 = (int32_t) i1[1];
19747
0
      i1 += 2;
19748
19749
0
      const int32_t vk1x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[2];
19750
0
      const int32_t vk1x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[3];
19751
19752
0
      vacc0 += vi1x0 * vk1x0;
19753
0
      vacc1 += vi1x1 * vk1x1;
19754
19755
0
      const int32_t vi2x0 = (int32_t) i2[0];
19756
0
      const int32_t vi2x1 = (int32_t) i2[1];
19757
0
      i2 += 2;
19758
19759
0
      const int32_t vk2x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[4];
19760
0
      const int32_t vk2x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[5];
19761
19762
0
      vacc0 += vi2x0 * vk2x0;
19763
0
      vacc1 += vi2x1 * vk2x1;
19764
19765
0
      w = (const void*) ((uintptr_t) w + 2 * sizeof(int32_t) + 6 * sizeof(int8_t));
19766
19767
0
      float vfpacc0 = (float) vacc0;
19768
0
      float vfpacc1 = (float) vacc1;
19769
19770
0
      const float vscale0 = unaligned_indexed_load_f32(w, 0);
19771
0
      const float vscale1 = unaligned_indexed_load_f32(w, 1);
19772
0
      w = (const void*) ((const float*) w + 2);
19773
19774
0
      vfpacc0 *= vscale0;
19775
0
      vfpacc1 *= vscale1;
19776
19777
0
      vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point);
19778
0
      vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point);
19779
19780
0
      vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point);
19781
0
      vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point);
19782
19783
0
      const int32_t vrndacc0 = (int32_t) lrintf(vfpacc0);
19784
0
      const int32_t vrndacc1 = (int32_t) lrintf(vfpacc1);
19785
19786
0
      int32_t vout0 = (int32_t) vrndacc0 + voutput_zero_point;
19787
0
      int32_t vout1 = (int32_t) vrndacc1 + voutput_zero_point;
19788
19789
0
      output[0] = (int8_t) vout0;
19790
0
      output[1] = (int8_t) vout1;
19791
0
      output += 2;
19792
0
    }
19793
0
    if XNN_UNLIKELY(c != 0) {
19794
0
      int32_t vacc = unaligned_load_s32(w);
19795
19796
0
      const int32_t vi0 = (int32_t) *i0;
19797
0
      const int32_t vk0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[0];
19798
0
      vacc += vi0 * vk0;
19799
0
      const int32_t vi1 = (int32_t) *i1;
19800
0
      const int32_t vk1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[2];
19801
0
      vacc += vi1 * vk1;
19802
0
      const int32_t vi2 = (int32_t) *i2;
19803
0
      const int32_t vk2 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[4];
19804
0
      vacc += vi2 * vk2;
19805
19806
0
      const float vscale = unaligned_load_f32((const void*) ((uintptr_t) w + 2 * sizeof(int32_t) + 6 * sizeof(int8_t)));
19807
0
      float vfpacc = (float) vacc * vscale;
19808
19809
0
      vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
19810
0
      vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
19811
0
      const int32_t vrndacc = (int32_t) lrintf(vfpacc);
19812
0
      int32_t vout = vrndacc + voutput_zero_point;
19813
19814
0
      *output++ = (int8_t) vout;
19815
0
    }
19816
19817
0
    output = (int8_t*) ((uintptr_t) output + output_increment);
19818
0
  } while (--output_width != 0);
19819
0
}
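
In the two-channel kernels, the odd-channel tail (the "if XNN_UNLIKELY(c != 0)" block above) reads its taps at indices 0, 2, 4, ... because tap k of channel x within a group of two sits at int8 index k*2 + x. A tiny sketch of that indexing (tap_index is an illustrative helper, not part of scalar.c):

#include <stdio.h>

// Within a channel group of size cr, tap k of channel x is stored at int8 index k*cr + x.
static int tap_index(int tap, int channel, int cr) {
  return tap * cr + channel;
}

int main(void) {
  for (int k = 0; k < 3; k++) {
    printf("tap %d: ch0 -> [%d]  ch1 -> [%d]\n", k, tap_index(k, 0, 2), tap_index(k, 1, 2));
  }
  return 0;
}
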
19820
19821
void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic(
19822
    size_t channels,
19823
    size_t output_width,
19824
    const int8_t** input,
19825
    const void* weights,
19826
    int8_t* output,
19827
    intptr_t input_stride,
19828
    size_t output_increment,
19829
    size_t input_offset,
19830
    const int8_t* zero,
19831
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
19832
0
{
19833
0
  assert(channels != 0);
19834
0
  assert(output_width != 0);
19835
19836
0
  const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point;
19837
0
  const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point;
19838
0
  const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias;
19839
0
  const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point;
19840
0
  do {
19841
0
    const int8_t* i0 = input[0];
19842
0
    assert(i0 != NULL);
19843
0
    if XNN_UNPREDICTABLE(i0 != zero) {
19844
0
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
19845
0
    }
19846
0
    const int8_t* i1 = input[1];
19847
0
    assert(i1 != NULL);
19848
0
    if XNN_UNPREDICTABLE(i1 != zero) {
19849
0
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
19850
0
    }
19851
0
    const int8_t* i2 = input[2];
19852
0
    assert(i2 != NULL);
19853
0
    if XNN_UNPREDICTABLE(i2 != zero) {
19854
0
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
19855
0
    }
19856
0
    const int8_t* i3 = input[3];
19857
0
    assert(i3 != NULL);
19858
0
    if XNN_UNPREDICTABLE(i3 != zero) {
19859
0
      i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
19860
0
    }
19861
0
    const int8_t* i4 = input[4];
19862
0
    assert(i4 != NULL);
19863
0
    if XNN_UNPREDICTABLE(i4 != zero) {
19864
0
      i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
19865
0
    }
19866
0
    const int8_t* i5 = input[5];
19867
0
    assert(i5 != NULL);
19868
0
    if XNN_UNPREDICTABLE(i5 != zero) {
19869
0
      i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
19870
0
    }
19871
0
    const int8_t* i6 = input[6];
19872
0
    assert(i6 != NULL);
19873
0
    if XNN_UNPREDICTABLE(i6 != zero) {
19874
0
      i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
19875
0
    }
19876
0
    const int8_t* i7 = input[7];
19877
0
    assert(i7 != NULL);
19878
0
    if XNN_UNPREDICTABLE(i7 != zero) {
19879
0
      i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
19880
0
    }
19881
0
    const int8_t* i8 = input[8];
19882
0
    assert(i8 != NULL);
19883
0
    if XNN_UNPREDICTABLE(i8 != zero) {
19884
0
      i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
19885
0
    }
19886
0
    input = (const int8_t**) ((uintptr_t) input + input_stride);
19887
19888
0
    size_t c = channels;
19889
0
    const void* w = weights;
19890
0
    do {
19891
0
      int32_t vacc = unaligned_load_s32(w);
19892
19893
0
      const int32_t vi0 = (int32_t) *i0++;
19894
0
      const int32_t vk0 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[0];
19895
0
      vacc += vi0 * vk0;
19896
0
      const int32_t vi1 = (int32_t) *i1++;
19897
0
      const int32_t vk1 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[1];
19898
0
      vacc += vi1 * vk1;
19899
0
      const int32_t vi2 = (int32_t) *i2++;
19900
0
      const int32_t vk2 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[2];
19901
0
      vacc += vi2 * vk2;
19902
0
      const int32_t vi3 = (int32_t) *i3++;
19903
0
      const int32_t vk3 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[3];
19904
0
      vacc += vi3 * vk3;
19905
0
      const int32_t vi4 = (int32_t) *i4++;
19906
0
      const int32_t vk4 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[4];
19907
0
      vacc += vi4 * vk4;
19908
0
      const int32_t vi5 = (int32_t) *i5++;
19909
0
      const int32_t vk5 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[5];
19910
0
      vacc += vi5 * vk5;
19911
0
      const int32_t vi6 = (int32_t) *i6++;
19912
0
      const int32_t vk6 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[6];
19913
0
      vacc += vi6 * vk6;
19914
0
      const int32_t vi7 = (int32_t) *i7++;
19915
0
      const int32_t vk7 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[7];
19916
0
      vacc += vi7 * vk7;
19917
0
      const int32_t vi8 = (int32_t) *i8++;
19918
0
      const int32_t vk8 = ((const int8_t*) ((uintptr_t) w + sizeof(int32_t)))[8];
19919
0
      vacc += vi8 * vk8;
19920
19921
0
      w = (const void*) ((uintptr_t) w + sizeof(int32_t) + 9 * sizeof(int8_t));
19922
19923
0
      const float vscale = unaligned_load_f32(w);
19924
0
      w = (const void*) ((const float*) w + 1);
19925
0
      float vfpacc = (float) vacc * vscale;
19926
19927
0
      vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
19928
0
      vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
19929
0
      vfpacc += vmagic_bias;
19930
0
      int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point;
19931
19932
0
      *output++ = (int8_t) vout;
19933
0
    } while (--c != 0);
19934
19935
0
    output = (int8_t*) ((uintptr_t) output + output_increment);
19936
0
  } while (--output_width != 0);
19937
0
}
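The fmagic variant above avoids an explicit float-to-int conversion: after clamping in the float domain it adds the magic bias 12582912.0f (2^23 + 2^22), reinterprets the float bits as an integer, and subtracts the precomputed magic_bias_less_output_zero_point. A self-contained sketch of the trick; fp32_to_bits stands in for float_as_uint32, and the constants the kernels read precomputed from the params struct are recomputed inline here.

#include <stdint.h>
#include <string.h>

// Stand-in for float_as_uint32 from the listing: reinterpret float bits.
static inline uint32_t fp32_to_bits(float f) {
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));
  return bits;
}

// "fmagic" rounding: for a clamped value x well inside +/-2^22, x + 12582912.0f
// lands in [2^23, 2^24), where the ULP is exactly 1, so the addition itself
// rounds x to the nearest integer and the raw bit pattern equals
// 0x4B400000 + round(x). Subtracting (0x4B400000 - output_zero_point) then
// yields round(x) + output_zero_point without a float-to-int instruction.
static inline int8_t requantize_fmagic(float clamped_fpacc, int32_t output_zero_point)
{
  const float magic_bias = 12582912.0f;
  const int32_t magic_bias_less_output_zero_point =
      (int32_t) fp32_to_bits(magic_bias) - output_zero_point;
  const float biased = clamped_fpacc + magic_bias;
  return (int8_t) ((int32_t) fp32_to_bits(biased) - magic_bias_less_output_zero_point);
}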
19938
19939
void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic(
19940
    size_t channels,
19941
    size_t output_width,
19942
    const int8_t** input,
19943
    const void* weights,
19944
    int8_t* output,
19945
    intptr_t input_stride,
19946
    size_t output_increment,
19947
    size_t input_offset,
19948
    const int8_t* zero,
19949
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
19950
0
{
19951
0
  assert(channels != 0);
19952
0
  assert(output_width != 0);
19953
19954
0
  const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
19955
0
  const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
19956
0
  const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
19957
0
  const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
19958
0
  do {
19959
0
    const int8_t* i0 = input[0];
19960
0
    assert(i0 != NULL);
19961
0
    if XNN_UNPREDICTABLE(i0 != zero) {
19962
0
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
19963
0
    }
19964
0
    const int8_t* i1 = input[1];
19965
0
    assert(i1 != NULL);
19966
0
    if XNN_UNPREDICTABLE(i1 != zero) {
19967
0
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
19968
0
    }
19969
0
    const int8_t* i2 = input[2];
19970
0
    assert(i2 != NULL);
19971
0
    if XNN_UNPREDICTABLE(i2 != zero) {
19972
0
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
19973
0
    }
19974
0
    const int8_t* i3 = input[3];
19975
0
    assert(i3 != NULL);
19976
0
    if XNN_UNPREDICTABLE(i3 != zero) {
19977
0
      i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
19978
0
    }
19979
0
    const int8_t* i4 = input[4];
19980
0
    assert(i4 != NULL);
19981
0
    if XNN_UNPREDICTABLE(i4 != zero) {
19982
0
      i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
19983
0
    }
19984
0
    const int8_t* i5 = input[5];
19985
0
    assert(i5 != NULL);
19986
0
    if XNN_UNPREDICTABLE(i5 != zero) {
19987
0
      i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
19988
0
    }
19989
0
    const int8_t* i6 = input[6];
19990
0
    assert(i6 != NULL);
19991
0
    if XNN_UNPREDICTABLE(i6 != zero) {
19992
0
      i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
19993
0
    }
19994
0
    const int8_t* i7 = input[7];
19995
0
    assert(i7 != NULL);
19996
0
    if XNN_UNPREDICTABLE(i7 != zero) {
19997
0
      i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
19998
0
    }
19999
0
    const int8_t* i8 = input[8];
20000
0
    assert(i8 != NULL);
20001
0
    if XNN_UNPREDICTABLE(i8 != zero) {
20002
0
      i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
20003
0
    }
20004
0
    input = (const int8_t**) ((uintptr_t) input + input_stride);
20005
20006
0
    size_t c = channels;
20007
0
    const void* w = weights;
20008
0
    for (; c >= 2; c -= 2) {
20009
0
      int32_t vacc0 = unaligned_indexed_load_s32(w, 0);
20010
0
      int32_t vacc1 = unaligned_indexed_load_s32(w, 1);
20011
20012
20013
0
      const int32_t vi0x0 = (int32_t) i0[0];
20014
0
      const int32_t vi0x1 = (int32_t) i0[1];
20015
0
      i0 += 2;
20016
20017
0
      const int32_t vk0x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[0];
20018
0
      const int32_t vk0x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[1];
20019
20020
0
      vacc0 += vi0x0 * vk0x0;
20021
0
      vacc1 += vi0x1 * vk0x1;
20022
20023
0
      const int32_t vi1x0 = (int32_t) i1[0];
20024
0
      const int32_t vi1x1 = (int32_t) i1[1];
20025
0
      i1 += 2;
20026
20027
0
      const int32_t vk1x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[2];
20028
0
      const int32_t vk1x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[3];
20029
20030
0
      vacc0 += vi1x0 * vk1x0;
20031
0
      vacc1 += vi1x1 * vk1x1;
20032
20033
0
      const int32_t vi2x0 = (int32_t) i2[0];
20034
0
      const int32_t vi2x1 = (int32_t) i2[1];
20035
0
      i2 += 2;
20036
20037
0
      const int32_t vk2x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[4];
20038
0
      const int32_t vk2x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[5];
20039
20040
0
      vacc0 += vi2x0 * vk2x0;
20041
0
      vacc1 += vi2x1 * vk2x1;
20042
20043
0
      const int32_t vi3x0 = (int32_t) i3[0];
20044
0
      const int32_t vi3x1 = (int32_t) i3[1];
20045
0
      i3 += 2;
20046
20047
0
      const int32_t vk3x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[6];
20048
0
      const int32_t vk3x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[7];
20049
20050
0
      vacc0 += vi3x0 * vk3x0;
20051
0
      vacc1 += vi3x1 * vk3x1;
20052
20053
0
      const int32_t vi4x0 = (int32_t) i4[0];
20054
0
      const int32_t vi4x1 = (int32_t) i4[1];
20055
0
      i4 += 2;
20056
20057
0
      const int32_t vk4x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[8];
20058
0
      const int32_t vk4x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[9];
20059
20060
0
      vacc0 += vi4x0 * vk4x0;
20061
0
      vacc1 += vi4x1 * vk4x1;
20062
20063
0
      const int32_t vi5x0 = (int32_t) i5[0];
20064
0
      const int32_t vi5x1 = (int32_t) i5[1];
20065
0
      i5 += 2;
20066
20067
0
      const int32_t vk5x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[10];
20068
0
      const int32_t vk5x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[11];
20069
20070
0
      vacc0 += vi5x0 * vk5x0;
20071
0
      vacc1 += vi5x1 * vk5x1;
20072
20073
0
      const int32_t vi6x0 = (int32_t) i6[0];
20074
0
      const int32_t vi6x1 = (int32_t) i6[1];
20075
0
      i6 += 2;
20076
20077
0
      const int32_t vk6x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[12];
20078
0
      const int32_t vk6x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[13];
20079
20080
0
      vacc0 += vi6x0 * vk6x0;
20081
0
      vacc1 += vi6x1 * vk6x1;
20082
20083
0
      const int32_t vi7x0 = (int32_t) i7[0];
20084
0
      const int32_t vi7x1 = (int32_t) i7[1];
20085
0
      i7 += 2;
20086
20087
0
      const int32_t vk7x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[14];
20088
0
      const int32_t vk7x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[15];
20089
20090
0
      vacc0 += vi7x0 * vk7x0;
20091
0
      vacc1 += vi7x1 * vk7x1;
20092
20093
0
      const int32_t vi8x0 = (int32_t) i8[0];
20094
0
      const int32_t vi8x1 = (int32_t) i8[1];
20095
0
      i8 += 2;
20096
20097
0
      const int32_t vk8x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[16];
20098
0
      const int32_t vk8x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[17];
20099
20100
0
      vacc0 += vi8x0 * vk8x0;
20101
0
      vacc1 += vi8x1 * vk8x1;
20102
20103
0
      w = (const void*) ((uintptr_t) w + 2 * sizeof(int32_t) + 18 * sizeof(int8_t));
20104
20105
0
      float vfpacc0 = (float) vacc0;
20106
0
      float vfpacc1 = (float) vacc1;
20107
20108
0
      const float vscale0 = unaligned_indexed_load_f32(w, 0);
20109
0
      const float vscale1 = unaligned_indexed_load_f32(w, 1);
20110
0
      w = (const void*) ((const float*) w + 2);
20111
20112
0
      vfpacc0 *= vscale0;
20113
0
      vfpacc1 *= vscale1;
20114
20115
0
      vfpacc0 += vmagic_bias;
20116
0
      vfpacc1 += vmagic_bias;
20117
20118
0
      int32_t vout0 = (int32_t) float_as_uint32(vfpacc0);
20119
0
      int32_t vout1 = (int32_t) float_as_uint32(vfpacc1);
20120
20121
0
      vout0 = math_max_s32(vout0, vmagic_min);
20122
0
      vout1 = math_max_s32(vout1, vmagic_min);
20123
20124
0
      vout0 = math_min_s32(vout0, vmagic_max);
20125
0
      vout1 = math_min_s32(vout1, vmagic_max);
20126
20127
0
      vout0 -= vmagic_bias_less_zero_point;
20128
0
      vout1 -= vmagic_bias_less_zero_point;
20129
20130
0
      output[0] = (int8_t) vout0;
20131
0
      output[1] = (int8_t) vout1;
20132
0
      output += 2;
20133
0
    }
20134
0
    if XNN_UNLIKELY(c != 0) {
20135
0
      int32_t vacc = unaligned_load_s32(w);
20136
20137
0
      const int32_t vi0 = (int32_t) *i0;
20138
0
      const int32_t vk0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[0];
20139
0
      vacc += vi0 * vk0;
20140
0
      const int32_t vi1 = (int32_t) *i1;
20141
0
      const int32_t vk1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[2];
20142
0
      vacc += vi1 * vk1;
20143
0
      const int32_t vi2 = (int32_t) *i2;
20144
0
      const int32_t vk2 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[4];
20145
0
      vacc += vi2 * vk2;
20146
0
      const int32_t vi3 = (int32_t) *i3;
20147
0
      const int32_t vk3 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[6];
20148
0
      vacc += vi3 * vk3;
20149
0
      const int32_t vi4 = (int32_t) *i4;
20150
0
      const int32_t vk4 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[8];
20151
0
      vacc += vi4 * vk4;
20152
0
      const int32_t vi5 = (int32_t) *i5;
20153
0
      const int32_t vk5 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[10];
20154
0
      vacc += vi5 * vk5;
20155
0
      const int32_t vi6 = (int32_t) *i6;
20156
0
      const int32_t vk6 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[12];
20157
0
      vacc += vi6 * vk6;
20158
0
      const int32_t vi7 = (int32_t) *i7;
20159
0
      const int32_t vk7 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[14];
20160
0
      vacc += vi7 * vk7;
20161
0
      const int32_t vi8 = (int32_t) *i8;
20162
0
      const int32_t vk8 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[16];
20163
0
      vacc += vi8 * vk8;
20164
20165
0
      const float vscale = unaligned_load_f32((const void*) ((uintptr_t) w + 2 * sizeof(int32_t) + 18 * sizeof(int8_t)));
20166
0
      float vfpacc = (float) vacc * vscale;
20167
20168
0
      vfpacc += vmagic_bias;
20169
0
      int32_t vout = (int32_t) float_as_uint32(vfpacc);
20170
0
      vout = math_max_s32(vout, vmagic_min);
20171
0
      vout = math_min_s32(vout, vmagic_max);
20172
0
      vout -= vmagic_bias_less_zero_point;
20173
20174
0
      *output++ = (int8_t) vout;
20175
0
    }
20176
20177
0
    output = (int8_t*) ((uintptr_t) output + output_increment);
20178
0
  } while (--output_width != 0);
20179
0
}
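The imagic variant above differs from fmagic only in where the clamping happens: the magic bias is added without clamping first, and the reinterpreted integer is then clamped against magic_min/magic_max, the bit patterns of the magic bias plus the two bounds. A hedged sketch; the kernels read these constants precomputed from params->fp32_scalar_imagic, and the helper recomputes them inline only to stay self-contained.

#include <stdint.h>
#include <string.h>

// Same bit-reinterpretation helper as in the previous sketch, repeated so this
// block stands alone.
static inline uint32_t fp32_to_bits(float f) {
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));
  return bits;
}

// "imagic" rounding: add the magic bias first, then clamp the reinterpreted
// integer. Because the bit patterns of 12582912.0f + n increase monotonically
// with n, clamping against the precomputed bit patterns of the two bounds is
// equivalent to clamping n itself.
static inline int8_t requantize_imagic(float fpacc, int8_t qmin, int8_t qmax,
                                       int32_t output_zero_point)
{
  const float magic_bias = 12582912.0f;
  const int32_t magic_min =
      (int32_t) fp32_to_bits(magic_bias + (float) ((int32_t) qmin - output_zero_point));
  const int32_t magic_max =
      (int32_t) fp32_to_bits(magic_bias + (float) ((int32_t) qmax - output_zero_point));
  const int32_t magic_bias_less_zero_point =
      (int32_t) fp32_to_bits(magic_bias) - output_zero_point;

  int32_t out = (int32_t) fp32_to_bits(fpacc + magic_bias);
  out = out < magic_min ? magic_min : out;
  out = out > magic_max ? magic_max : out;
  return (int8_t) (out - magic_bias_less_zero_point);
}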
20180
20181
void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf(
20182
    size_t channels,
20183
    size_t output_width,
20184
    const int8_t** input,
20185
    const void* weights,
20186
    int8_t* output,
20187
    intptr_t input_stride,
20188
    size_t output_increment,
20189
    size_t input_offset,
20190
    const int8_t* zero,
20191
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
20192
0
{
20193
0
  assert(channels != 0);
20194
0
  assert(output_width != 0);
20195
20196
0
  const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
20197
0
  const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
20198
0
  const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
20199
0
  do {
20200
0
    const int8_t* i0 = input[0];
20201
0
    assert(i0 != NULL);
20202
0
    if XNN_UNPREDICTABLE(i0 != zero) {
20203
0
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
20204
0
    }
20205
0
    const int8_t* i1 = input[1];
20206
0
    assert(i1 != NULL);
20207
0
    if XNN_UNPREDICTABLE(i1 != zero) {
20208
0
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
20209
0
    }
20210
0
    const int8_t* i2 = input[2];
20211
0
    assert(i2 != NULL);
20212
0
    if XNN_UNPREDICTABLE(i2 != zero) {
20213
0
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
20214
0
    }
20215
0
    const int8_t* i3 = input[3];
20216
0
    assert(i3 != NULL);
20217
0
    if XNN_UNPREDICTABLE(i3 != zero) {
20218
0
      i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
20219
0
    }
20220
0
    const int8_t* i4 = input[4];
20221
0
    assert(i4 != NULL);
20222
0
    if XNN_UNPREDICTABLE(i4 != zero) {
20223
0
      i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
20224
0
    }
20225
0
    const int8_t* i5 = input[5];
20226
0
    assert(i5 != NULL);
20227
0
    if XNN_UNPREDICTABLE(i5 != zero) {
20228
0
      i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
20229
0
    }
20230
0
    const int8_t* i6 = input[6];
20231
0
    assert(i6 != NULL);
20232
0
    if XNN_UNPREDICTABLE(i6 != zero) {
20233
0
      i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
20234
0
    }
20235
0
    const int8_t* i7 = input[7];
20236
0
    assert(i7 != NULL);
20237
0
    if XNN_UNPREDICTABLE(i7 != zero) {
20238
0
      i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
20239
0
    }
20240
0
    const int8_t* i8 = input[8];
20241
0
    assert(i8 != NULL);
20242
0
    if XNN_UNPREDICTABLE(i8 != zero) {
20243
0
      i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
20244
0
    }
20245
0
    input = (const int8_t**) ((uintptr_t) input + input_stride);
20246
20247
0
    size_t c = channels;
20248
0
    const void* w = weights;
20249
0
    for (; c >= 2; c -= 2) {
20250
0
      int32_t vacc0 = unaligned_indexed_load_s32(w, 0);
20251
0
      int32_t vacc1 = unaligned_indexed_load_s32(w, 1);
20252
20253
20254
0
      const int32_t vi0x0 = (int32_t) i0[0];
20255
0
      const int32_t vi0x1 = (int32_t) i0[1];
20256
0
      i0 += 2;
20257
20258
0
      const int32_t vk0x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[0];
20259
0
      const int32_t vk0x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[1];
20260
20261
0
      vacc0 += vi0x0 * vk0x0;
20262
0
      vacc1 += vi0x1 * vk0x1;
20263
20264
0
      const int32_t vi1x0 = (int32_t) i1[0];
20265
0
      const int32_t vi1x1 = (int32_t) i1[1];
20266
0
      i1 += 2;
20267
20268
0
      const int32_t vk1x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[2];
20269
0
      const int32_t vk1x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[3];
20270
20271
0
      vacc0 += vi1x0 * vk1x0;
20272
0
      vacc1 += vi1x1 * vk1x1;
20273
20274
0
      const int32_t vi2x0 = (int32_t) i2[0];
20275
0
      const int32_t vi2x1 = (int32_t) i2[1];
20276
0
      i2 += 2;
20277
20278
0
      const int32_t vk2x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[4];
20279
0
      const int32_t vk2x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[5];
20280
20281
0
      vacc0 += vi2x0 * vk2x0;
20282
0
      vacc1 += vi2x1 * vk2x1;
20283
20284
0
      const int32_t vi3x0 = (int32_t) i3[0];
20285
0
      const int32_t vi3x1 = (int32_t) i3[1];
20286
0
      i3 += 2;
20287
20288
0
      const int32_t vk3x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[6];
20289
0
      const int32_t vk3x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[7];
20290
20291
0
      vacc0 += vi3x0 * vk3x0;
20292
0
      vacc1 += vi3x1 * vk3x1;
20293
20294
0
      const int32_t vi4x0 = (int32_t) i4[0];
20295
0
      const int32_t vi4x1 = (int32_t) i4[1];
20296
0
      i4 += 2;
20297
20298
0
      const int32_t vk4x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[8];
20299
0
      const int32_t vk4x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[9];
20300
20301
0
      vacc0 += vi4x0 * vk4x0;
20302
0
      vacc1 += vi4x1 * vk4x1;
20303
20304
0
      const int32_t vi5x0 = (int32_t) i5[0];
20305
0
      const int32_t vi5x1 = (int32_t) i5[1];
20306
0
      i5 += 2;
20307
20308
0
      const int32_t vk5x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[10];
20309
0
      const int32_t vk5x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[11];
20310
20311
0
      vacc0 += vi5x0 * vk5x0;
20312
0
      vacc1 += vi5x1 * vk5x1;
20313
20314
0
      const int32_t vi6x0 = (int32_t) i6[0];
20315
0
      const int32_t vi6x1 = (int32_t) i6[1];
20316
0
      i6 += 2;
20317
20318
0
      const int32_t vk6x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[12];
20319
0
      const int32_t vk6x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[13];
20320
20321
0
      vacc0 += vi6x0 * vk6x0;
20322
0
      vacc1 += vi6x1 * vk6x1;
20323
20324
0
      const int32_t vi7x0 = (int32_t) i7[0];
20325
0
      const int32_t vi7x1 = (int32_t) i7[1];
20326
0
      i7 += 2;
20327
20328
0
      const int32_t vk7x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[14];
20329
0
      const int32_t vk7x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[15];
20330
20331
0
      vacc0 += vi7x0 * vk7x0;
20332
0
      vacc1 += vi7x1 * vk7x1;
20333
20334
0
      const int32_t vi8x0 = (int32_t) i8[0];
20335
0
      const int32_t vi8x1 = (int32_t) i8[1];
20336
0
      i8 += 2;
20337
20338
0
      const int32_t vk8x0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[16];
20339
0
      const int32_t vk8x1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[17];
20340
20341
0
      vacc0 += vi8x0 * vk8x0;
20342
0
      vacc1 += vi8x1 * vk8x1;
20343
20344
0
      w = (const void*) ((uintptr_t) w + 2 * sizeof(int32_t) + 18 * sizeof(int8_t));
20345
20346
0
      float vfpacc0 = (float) vacc0;
20347
0
      float vfpacc1 = (float) vacc1;
20348
20349
0
      const float vscale0 = unaligned_indexed_load_f32(w, 0);
20350
0
      const float vscale1 = unaligned_indexed_load_f32(w, 1);
20351
0
      w = (const void*) ((const float*) w + 2);
20352
20353
0
      vfpacc0 *= vscale0;
20354
0
      vfpacc1 *= vscale1;
20355
20356
0
      vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point);
20357
0
      vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point);
20358
20359
0
      vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point);
20360
0
      vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point);
20361
20362
0
      const int32_t vrndacc0 = (int32_t) lrintf(vfpacc0);
20363
0
      const int32_t vrndacc1 = (int32_t) lrintf(vfpacc1);
20364
20365
0
      int32_t vout0 = (int32_t) vrndacc0 + voutput_zero_point;
20366
0
      int32_t vout1 = (int32_t) vrndacc1 + voutput_zero_point;
20367
20368
0
      output[0] = (int8_t) vout0;
20369
0
      output[1] = (int8_t) vout1;
20370
0
      output += 2;
20371
0
    }
20372
0
    if XNN_UNLIKELY(c != 0) {
20373
0
      int32_t vacc = unaligned_load_s32(w);
20374
20375
0
      const int32_t vi0 = (int32_t) *i0;
20376
0
      const int32_t vk0 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[0];
20377
0
      vacc += vi0 * vk0;
20378
0
      const int32_t vi1 = (int32_t) *i1;
20379
0
      const int32_t vk1 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[2];
20380
0
      vacc += vi1 * vk1;
20381
0
      const int32_t vi2 = (int32_t) *i2;
20382
0
      const int32_t vk2 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[4];
20383
0
      vacc += vi2 * vk2;
20384
0
      const int32_t vi3 = (int32_t) *i3;
20385
0
      const int32_t vk3 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[6];
20386
0
      vacc += vi3 * vk3;
20387
0
      const int32_t vi4 = (int32_t) *i4;
20388
0
      const int32_t vk4 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[8];
20389
0
      vacc += vi4 * vk4;
20390
0
      const int32_t vi5 = (int32_t) *i5;
20391
0
      const int32_t vk5 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[10];
20392
0
      vacc += vi5 * vk5;
20393
0
      const int32_t vi6 = (int32_t) *i6;
20394
0
      const int32_t vk6 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[12];
20395
0
      vacc += vi6 * vk6;
20396
0
      const int32_t vi7 = (int32_t) *i7;
20397
0
      const int32_t vk7 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[14];
20398
0
      vacc += vi7 * vk7;
20399
0
      const int32_t vi8 = (int32_t) *i8;
20400
0
      const int32_t vk8 = (int32_t) ((const int8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[16];
20401
0
      vacc += vi8 * vk8;
20402
20403
0
      const float vscale = unaligned_load_f32((const void*) ((uintptr_t) w + 2 * sizeof(int32_t) + 18 * sizeof(int8_t)));
20404
0
      float vfpacc = (float) vacc * vscale;
20405
20406
0
      vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
20407
0
      vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
20408
0
      const int32_t vrndacc = (int32_t) lrintf(vfpacc);
20409
0
      int32_t vout = vrndacc + voutput_zero_point;
20410
20411
0
      *output++ = (int8_t) vout;
20412
0
    }
20413
20414
0
    output = (int8_t*) ((uintptr_t) output + output_increment);
20415
0
  } while (--output_width != 0);
20416
0
}
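All of the 9p2c depthwise kernels above walk the same packed weight layout: per group of two channels, two int32 biases, then the nine kernel taps interleaved tap-major (k0c0, k0c1, k1c0, k1c1, ...), then two float per-channel scales. Each group is 34 bytes, so the scales (and the biases of every following group) are not 4-byte aligned, which is presumably why the kernels go through the unaligned_* load helpers. A small illustrative packer for one group; in practice this buffer is produced by XNNPACK's weight-packing routines, not by this helper.

#include <stdint.h>
#include <string.h>

// Byte layout of one 2-channel group (34 bytes total):
//   bytes  0..7   2 x int32  per-channel bias
//   bytes  8..25  18 x int8  kernel taps, tap-major: k0c0 k0c1 k1c0 k1c1 ... k8c0 k8c1
//   bytes 26..33  2 x float  per-channel requantization scale
static void pack_qc8_dwconv_9p2c_group(void* packed,
                                       const int32_t bias[2],
                                       const int8_t kernel[9][2],  // [tap][channel]
                                       const float scale[2])
{
  uint8_t* out = (uint8_t*) packed;
  memcpy(out, bias, 2 * sizeof(int32_t));
  out += 2 * sizeof(int32_t);
  memcpy(out, kernel, 9 * 2 * sizeof(int8_t));
  out += 9 * 2 * sizeof(int8_t);
  memcpy(out, scale, 2 * sizeof(float));
}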
20417
20418
void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x2__scalar_imagic(
20419
    size_t mr,
20420
    size_t nc,
20421
    size_t kc,
20422
    const int8_t* restrict a,
20423
    size_t a_stride,
20424
    const void* restrict w,
20425
    int8_t* restrict c,
20426
    size_t cm_stride,
20427
    size_t cn_stride,
20428
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
20429
0
{
20430
0
  assert(mr != 0);
20431
0
  assert(mr <= 1);
20432
0
  assert(nc != 0);
20433
0
  assert(kc != 0);
20434
20435
0
  const int8_t* a0 = a;
20436
0
  int8_t* c0 = c;
20437
20438
0
  do {
20439
0
    int32_t vacc0x0 = unaligned_indexed_load_s32(w, 0);
20440
0
    int32_t vacc0x1 = unaligned_indexed_load_s32(w, 1);
20441
0
    w = (const int32_t*) w + 2;
20442
20443
0
    size_t k = kc;
20444
0
    do {
20445
0
      const int32_t va0 = (int32_t) *a0++;
20446
20447
0
      const int32_t vb0 = (int32_t) ((const int8_t*) w)[0];
20448
0
      const int32_t vb1 = (int32_t) ((const int8_t*) w)[1];
20449
0
      w = (const int8_t*) w + 2;
20450
20451
0
      vacc0x0 += va0 * vb0;
20452
0
      vacc0x1 += va0 * vb1;
20453
20454
0
      k -= sizeof(int8_t);
20455
0
    } while (k != 0);
20456
20457
0
    float vfpacc0x0 = (float) vacc0x0;
20458
0
    float vfpacc0x1 = (float) vacc0x1;
20459
20460
0
    const float vscale0 = unaligned_indexed_load_f32(w, 0);
20461
0
    vfpacc0x0 *= vscale0;
20462
0
    const float vscale1 = unaligned_indexed_load_f32(w, 1);
20463
0
    vfpacc0x1 *= vscale1;
20464
0
    w = (const void*) ((const float*) w + 2);
20465
20466
0
    const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
20467
0
    vfpacc0x0 += vmagic_bias;
20468
0
    vfpacc0x1 += vmagic_bias;
20469
20470
0
    int32_t vout0x0 = (int32_t) float_as_uint32(vfpacc0x0);
20471
0
    int32_t vout0x1 = (int32_t) float_as_uint32(vfpacc0x1);
20472
20473
0
    const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
20474
0
    vout0x0 = math_max_s32(vout0x0, vmagic_min);
20475
0
    vout0x1 = math_max_s32(vout0x1, vmagic_min);
20476
20477
0
    const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
20478
0
    vout0x0 = math_min_s32(vout0x0, vmagic_max);
20479
0
    vout0x1 = math_min_s32(vout0x1, vmagic_max);
20480
20481
0
    const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
20482
0
    vout0x0 -= vmagic_bias_less_zero_point;
20483
0
    vout0x1 -= vmagic_bias_less_zero_point;
20484
20485
0
    if XNN_LIKELY(nc >= 2) {
20486
0
      c0[0] = (int8_t) vout0x0;
20487
0
      c0[1] = (int8_t) vout0x1;
20488
20489
0
      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
20490
20491
0
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
20492
20493
0
      nc -= 2;
20494
0
    } else {
20495
0
      if (nc & 1) {
20496
0
        c0[0] = (int8_t) vout0x0;
20497
0
      }
20498
20499
0
      nc = 0;
20500
0
    }
20501
0
  } while (nc != 0);
20502
0
}
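A GEMM microkernel such as the one above handles up to MR rows per call and iterates over all N output columns internally, so a caller only tiles the rows. A hedged usage sketch: it assumes packed_w and params were prepared by XNNPACK's packing and parameter-initialization code, that kc is given in bytes (equal to the number of int8 K elements), and that the output is row-major so cn_stride is NR bytes.

#include <stddef.h>
#include <stdint.h>

#include <xnnpack/gemm.h>
#include <xnnpack/microparams.h>

// Drive the 1x2 microkernel over an M x N row-major output. MR is 1 for this
// kernel, so only the row loop is needed; the kernel walks all N columns
// internally, consuming one packed NR=2 column group per step.
void run_qc8_gemm_1x2(size_t M, size_t N, size_t K,
                      const int8_t* A, size_t a_stride,   // a_stride in bytes
                      const void* packed_w,
                      int8_t* C, size_t c_stride,         // c_stride in bytes
                      const union xnn_qs8_qc8w_conv_minmax_params* params)
{
  const size_t NR = 2;
  for (size_t m = 0; m < M; m += 1) {
    xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x2__scalar_imagic(
        /*mr=*/1, /*nc=*/N, /*kc=*/K,
        A + m * a_stride, a_stride,
        packed_w,
        C + m * c_stride, /*cm_stride=*/c_stride,
        /*cn_stride=*/NR * sizeof(int8_t),
        params);
  }
}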
20503
20504
void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf(
20505
    size_t mr,
20506
    size_t nc,
20507
    size_t kc,
20508
    const int8_t* restrict a,
20509
    size_t a_stride,
20510
    const void* restrict w,
20511
    int8_t* restrict c,
20512
    size_t cm_stride,
20513
    size_t cn_stride,
20514
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
20515
0
{
20516
0
  assert(mr != 0);
20517
0
  assert(mr <= 1);
20518
0
  assert(nc != 0);
20519
0
  assert(kc != 0);
20520
20521
0
  const int8_t* a0 = a;
20522
0
  int8_t* c0 = c;
20523
20524
0
  do {
20525
0
    int32_t vacc0x0 = ((const int32_t*) w)[0];
20526
0
    int32_t vacc0x1 = ((const int32_t*) w)[1];
20527
0
    int32_t vacc0x2 = ((const int32_t*) w)[2];
20528
0
    int32_t vacc0x3 = ((const int32_t*) w)[3];
20529
0
    w = (const int32_t*) w + 4;
20530
20531
0
    size_t k = kc;
20532
0
    do {
20533
0
      const int32_t va0 = (int32_t) *a0++;
20534
20535
0
      const int32_t vb0 = (int32_t) ((const int8_t*) w)[0];
20536
0
      const int32_t vb1 = (int32_t) ((const int8_t*) w)[1];
20537
0
      const int32_t vb2 = (int32_t) ((const int8_t*) w)[2];
20538
0
      const int32_t vb3 = (int32_t) ((const int8_t*) w)[3];
20539
0
      w = (const int8_t*) w + 4;
20540
20541
0
      vacc0x0 += va0 * vb0;
20542
0
      vacc0x1 += va0 * vb1;
20543
0
      vacc0x2 += va0 * vb2;
20544
0
      vacc0x3 += va0 * vb3;
20545
20546
0
      k -= sizeof(int8_t);
20547
0
    } while (k != 0);
20548
20549
0
    float vfpacc0x0 = (float) vacc0x0;
20550
0
    float vfpacc0x1 = (float) vacc0x1;
20551
0
    float vfpacc0x2 = (float) vacc0x2;
20552
0
    float vfpacc0x3 = (float) vacc0x3;
20553
20554
0
    const float vscale0 = ((const float*) w)[0];
20555
0
    vfpacc0x0 *= vscale0;
20556
0
    const float vscale1 = ((const float*) w)[1];
20557
0
    vfpacc0x1 *= vscale1;
20558
0
    const float vscale2 = ((const float*) w)[2];
20559
0
    vfpacc0x2 *= vscale2;
20560
0
    const float vscale3 = ((const float*) w)[3];
20561
0
    vfpacc0x3 *= vscale3;
20562
0
    w = (const void*) ((const float*) w + 4);
20563
20564
0
    const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
20565
0
    vfpacc0x0 = math_max_f32(vfpacc0x0, voutput_min_less_zero_point);
20566
0
    vfpacc0x1 = math_max_f32(vfpacc0x1, voutput_min_less_zero_point);
20567
0
    vfpacc0x2 = math_max_f32(vfpacc0x2, voutput_min_less_zero_point);
20568
0
    vfpacc0x3 = math_max_f32(vfpacc0x3, voutput_min_less_zero_point);
20569
20570
0
    const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
20571
0
    vfpacc0x0 = math_min_f32(vfpacc0x0, voutput_max_less_zero_point);
20572
0
    vfpacc0x1 = math_min_f32(vfpacc0x1, voutput_max_less_zero_point);
20573
0
    vfpacc0x2 = math_min_f32(vfpacc0x2, voutput_max_less_zero_point);
20574
0
    vfpacc0x3 = math_min_f32(vfpacc0x3, voutput_max_less_zero_point);
20575
20576
0
    const int32_t vrndacc0x0 = (int32_t) lrintf(vfpacc0x0);
20577
0
    const int32_t vrndacc0x1 = (int32_t) lrintf(vfpacc0x1);
20578
0
    const int32_t vrndacc0x2 = (int32_t) lrintf(vfpacc0x2);
20579
0
    const int32_t vrndacc0x3 = (int32_t) lrintf(vfpacc0x3);
20580
20581
0
    const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
20582
0
    int32_t vout0x0 = vrndacc0x0 + voutput_zero_point;
20583
0
    int32_t vout0x1 = vrndacc0x1 + voutput_zero_point;
20584
0
    int32_t vout0x2 = vrndacc0x2 + voutput_zero_point;
20585
0
    int32_t vout0x3 = vrndacc0x3 + voutput_zero_point;
20586
20587
0
    if XNN_LIKELY(nc >= 4) {
20588
0
      c0[0] = (int8_t) vout0x0;
20589
0
      c0[1] = (int8_t) vout0x1;
20590
0
      c0[2] = (int8_t) vout0x2;
20591
0
      c0[3] = (int8_t) vout0x3;
20592
20593
0
      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
20594
20595
0
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
20596
20597
0
      nc -= 4;
20598
0
    } else {
20599
0
      if (nc & 2) {
20600
0
        c0[0] = (int8_t) vout0x0;
20601
0
        c0[1] = (int8_t) vout0x1;
20602
0
        vout0x0 = vout0x2;
20603
0
        c0 += 2;
20604
0
      }
20605
0
      if (nc & 1) {
20606
0
        c0[0] = (int8_t) vout0x0;
20607
0
      }
20608
20609
0
      nc = 0;
20610
0
    }
20611
0
  } while (nc != 0);
20612
0
}
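Stripped of tiling and weight packing, the computation all of these QC8 GEMM kernels perform is an int32 dot product per output element plus a per-column bias, scaled by a per-column float, clamped, and rounded to int8 around the output zero point. A plain reference version for comparison; the layouts and names are illustrative, and the rounding mirrors the lrintf variant of the kernel just listed.

#include <math.h>
#include <stddef.h>
#include <stdint.h>

// Reference QC8 GEMM: C[m][n] = requantize(bias[n] + sum_k A[m][k] * B[k][n]).
void qc8_gemm_reference(size_t M, size_t N, size_t K,
                        const int8_t* A,       // M x K, row-major
                        const int8_t* B,       // K x N, row-major
                        const int32_t* bias,   // N entries
                        const float* scale,    // N entries, per output column
                        int8_t* C,             // M x N, row-major
                        float output_min_less_zero_point,
                        float output_max_less_zero_point,
                        int32_t output_zero_point)
{
  for (size_t m = 0; m < M; m++) {
    for (size_t n = 0; n < N; n++) {
      int32_t acc = bias[n];
      for (size_t k = 0; k < K; k++) {
        acc += (int32_t) A[m * K + k] * (int32_t) B[k * N + n];
      }
      float fpacc = (float) acc * scale[n];
      fpacc = fmaxf(fpacc, output_min_less_zero_point);
      fpacc = fminf(fpacc, output_max_less_zero_point);
      C[m * N + n] = (int8_t) ((int32_t) lrintf(fpacc) + output_zero_point);
    }
  }
}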
20613
20614
void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x2__scalar_imagic(
20615
    size_t mr,
20616
    size_t nc,
20617
    size_t kc,
20618
    const int8_t* restrict a,
20619
    size_t a_stride,
20620
    const void* restrict w,
20621
    int8_t* restrict c,
20622
    size_t cm_stride,
20623
    size_t cn_stride,
20624
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
20625
0
{
20626
0
  assert(mr != 0);
20627
0
  assert(mr <= 2);
20628
0
  assert(nc != 0);
20629
0
  assert(kc != 0);
20630
20631
0
  const int8_t* a0 = a;
20632
0
  int8_t* c0 = c;
20633
0
  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
20634
0
  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
20635
0
  if XNN_UNPREDICTABLE(mr != 2) {
20636
0
    a1 = a0;
20637
0
    c1 = c0;
20638
0
  }
20639
20640
0
  do {
20641
0
    int32_t vacc0x0 = unaligned_indexed_load_s32(w, 0);
20642
0
    int32_t vacc0x1 = unaligned_indexed_load_s32(w, 1);
20643
0
    int32_t vacc1x0 = vacc0x0;
20644
0
    int32_t vacc1x1 = vacc0x1;
20645
0
    w = (const int32_t*) w + 2;
20646
20647
0
    size_t k = kc;
20648
0
    do {
20649
0
      const int32_t va0 = (int32_t) *a0++;
20650
0
      const int32_t va1 = (int32_t) *a1++;
20651
20652
0
      const int32_t vb0 = (int32_t) ((const int8_t*) w)[0];
20653
0
      const int32_t vb1 = (int32_t) ((const int8_t*) w)[1];
20654
0
      w = (const int8_t*) w + 2;
20655
20656
0
      vacc0x0 += va0 * vb0;
20657
0
      vacc0x1 += va0 * vb1;
20658
0
      vacc1x0 += va1 * vb0;
20659
0
      vacc1x1 += va1 * vb1;
20660
20661
0
      k -= sizeof(int8_t);
20662
0
    } while (k != 0);
20663
20664
0
    float vfpacc0x0 = (float) vacc0x0;
20665
0
    float vfpacc0x1 = (float) vacc0x1;
20666
0
    float vfpacc1x0 = (float) vacc1x0;
20667
0
    float vfpacc1x1 = (float) vacc1x1;
20668
20669
0
    const float vscale0 = unaligned_indexed_load_f32(w, 0);
20670
0
    vfpacc0x0 *= vscale0;
20671
0
    vfpacc1x0 *= vscale0;
20672
0
    const float vscale1 = unaligned_indexed_load_f32(w, 1);
20673
0
    vfpacc0x1 *= vscale1;
20674
0
    vfpacc1x1 *= vscale1;
20675
0
    w = (const void*) ((const float*) w + 2);
20676
20677
0
    const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
20678
0
    vfpacc0x0 += vmagic_bias;
20679
0
    vfpacc0x1 += vmagic_bias;
20680
0
    vfpacc1x0 += vmagic_bias;
20681
0
    vfpacc1x1 += vmagic_bias;
20682
20683
0
    int32_t vout0x0 = (int32_t) float_as_uint32(vfpacc0x0);
20684
0
    int32_t vout0x1 = (int32_t) float_as_uint32(vfpacc0x1);
20685
0
    int32_t vout1x0 = (int32_t) float_as_uint32(vfpacc1x0);
20686
0
    int32_t vout1x1 = (int32_t) float_as_uint32(vfpacc1x1);
20687
20688
0
    const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
20689
0
    vout0x0 = math_max_s32(vout0x0, vmagic_min);
20690
0
    vout0x1 = math_max_s32(vout0x1, vmagic_min);
20691
0
    vout1x0 = math_max_s32(vout1x0, vmagic_min);
20692
0
    vout1x1 = math_max_s32(vout1x1, vmagic_min);
20693
20694
0
    const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
20695
0
    vout0x0 = math_min_s32(vout0x0, vmagic_max);
20696
0
    vout0x1 = math_min_s32(vout0x1, vmagic_max);
20697
0
    vout1x0 = math_min_s32(vout1x0, vmagic_max);
20698
0
    vout1x1 = math_min_s32(vout1x1, vmagic_max);
20699
20700
0
    const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
20701
0
    vout0x0 -= vmagic_bias_less_zero_point;
20702
0
    vout0x1 -= vmagic_bias_less_zero_point;
20703
0
    vout1x0 -= vmagic_bias_less_zero_point;
20704
0
    vout1x1 -= vmagic_bias_less_zero_point;
20705
20706
0
    if XNN_LIKELY(nc >= 2) {
20707
0
      c0[0] = (int8_t) vout0x0;
20708
0
      c0[1] = (int8_t) vout0x1;
20709
0
      c1[0] = (int8_t) vout1x0;
20710
0
      c1[1] = (int8_t) vout1x1;
20711
20712
0
      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
20713
0
      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
20714
20715
0
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
20716
0
      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
20717
20718
0
      nc -= 2;
20719
0
    } else {
20720
0
      if (nc & 1) {
20721
0
        c0[0] = (int8_t) vout0x0;
20722
0
        c1[0] = (int8_t) vout1x0;
20723
0
      }
20724
20725
0
      nc = 0;
20726
0
    }
20727
0
  } while (nc != 0);
20728
0
}
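The mr checks at the top of the multi-row kernels (a1 = a0, c1 = c0 when mr != 2 above) let one microkernel serve smaller row counts: surplus row pointers alias the previous row, so the duplicated loads and stores touch only memory the caller already passed in and simply rewrite the same results. The pattern, generalized as a small sketch; the helper is hypothetical and shown only to name the idea.

#include <stddef.h>
#include <stdint.h>

// Alias surplus row pointers to the previous row, as the multi-row kernels do
// internally, so a call with mr < MR reads and writes only rows the caller
// actually provided.
static inline void clamp_row_pointers(size_t mr, size_t MR,
                                      const int8_t** a_rows, int8_t** c_rows)
{
  for (size_t r = 1; r < MR; r++) {
    if (r >= mr) {
      a_rows[r] = a_rows[r - 1];
      c_rows[r] = c_rows[r - 1];
    }
  }
}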
20729
20730
void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf(
20731
    size_t mr,
20732
    size_t nc,
20733
    size_t kc,
20734
    const int8_t* restrict a,
20735
    size_t a_stride,
20736
    const void* restrict w,
20737
    int8_t* restrict c,
20738
    size_t cm_stride,
20739
    size_t cn_stride,
20740
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
20741
0
{
20742
0
  assert(mr != 0);
20743
0
  assert(mr <= 3);
20744
0
  assert(nc != 0);
20745
0
  assert(kc != 0);
20746
20747
0
  const int8_t* a0 = a;
20748
0
  int8_t* c0 = c;
20749
0
  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
20750
0
  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
20751
0
  if XNN_UNPREDICTABLE(mr < 2) {
20752
0
    a1 = a0;
20753
0
    c1 = c0;
20754
0
  }
20755
0
  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
20756
0
  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
20757
0
  if XNN_UNPREDICTABLE(mr <= 2) {
20758
0
    a2 = a1;
20759
0
    c2 = c1;
20760
0
  }
20761
20762
0
  do {
20763
0
    int32_t vacc0x0 = ((const int32_t*) w)[0];
20764
0
    int32_t vacc0x1 = ((const int32_t*) w)[1];
20765
0
    int32_t vacc0x2 = ((const int32_t*) w)[2];
20766
0
    int32_t vacc0x3 = ((const int32_t*) w)[3];
20767
0
    int32_t vacc1x0 = vacc0x0;
20768
0
    int32_t vacc1x1 = vacc0x1;
20769
0
    int32_t vacc1x2 = vacc0x2;
20770
0
    int32_t vacc1x3 = vacc0x3;
20771
0
    int32_t vacc2x0 = vacc0x0;
20772
0
    int32_t vacc2x1 = vacc0x1;
20773
0
    int32_t vacc2x2 = vacc0x2;
20774
0
    int32_t vacc2x3 = vacc0x3;
20775
0
    w = (const int32_t*) w + 4;
20776
20777
0
    size_t k = kc;
20778
0
    do {
20779
0
      const int32_t va0 = (int32_t) *a0++;
20780
0
      const int32_t va1 = (int32_t) *a1++;
20781
0
      const int32_t va2 = (int32_t) *a2++;
20782
20783
0
      const int32_t vb0 = (int32_t) ((const int8_t*) w)[0];
20784
0
      const int32_t vb1 = (int32_t) ((const int8_t*) w)[1];
20785
0
      const int32_t vb2 = (int32_t) ((const int8_t*) w)[2];
20786
0
      const int32_t vb3 = (int32_t) ((const int8_t*) w)[3];
20787
0
      w = (const int8_t*) w + 4;
20788
20789
0
      vacc0x0 += va0 * vb0;
20790
0
      vacc0x1 += va0 * vb1;
20791
0
      vacc0x2 += va0 * vb2;
20792
0
      vacc0x3 += va0 * vb3;
20793
0
      vacc1x0 += va1 * vb0;
20794
0
      vacc1x1 += va1 * vb1;
20795
0
      vacc1x2 += va1 * vb2;
20796
0
      vacc1x3 += va1 * vb3;
20797
0
      vacc2x0 += va2 * vb0;
20798
0
      vacc2x1 += va2 * vb1;
20799
0
      vacc2x2 += va2 * vb2;
20800
0
      vacc2x3 += va2 * vb3;
20801
20802
0
      k -= sizeof(int8_t);
20803
0
    } while (k != 0);
20804
20805
0
    float vfpacc0x0 = (float) vacc0x0;
20806
0
    float vfpacc0x1 = (float) vacc0x1;
20807
0
    float vfpacc0x2 = (float) vacc0x2;
20808
0
    float vfpacc0x3 = (float) vacc0x3;
20809
0
    float vfpacc1x0 = (float) vacc1x0;
20810
0
    float vfpacc1x1 = (float) vacc1x1;
20811
0
    float vfpacc1x2 = (float) vacc1x2;
20812
0
    float vfpacc1x3 = (float) vacc1x3;
20813
0
    float vfpacc2x0 = (float) vacc2x0;
20814
0
    float vfpacc2x1 = (float) vacc2x1;
20815
0
    float vfpacc2x2 = (float) vacc2x2;
20816
0
    float vfpacc2x3 = (float) vacc2x3;
20817
20818
0
    const float vscale0 = ((const float*) w)[0];
20819
0
    vfpacc0x0 *= vscale0;
20820
0
    vfpacc1x0 *= vscale0;
20821
0
    vfpacc2x0 *= vscale0;
20822
0
    const float vscale1 = ((const float*) w)[1];
20823
0
    vfpacc0x1 *= vscale1;
20824
0
    vfpacc1x1 *= vscale1;
20825
0
    vfpacc2x1 *= vscale1;
20826
0
    const float vscale2 = ((const float*) w)[2];
20827
0
    vfpacc0x2 *= vscale2;
20828
0
    vfpacc1x2 *= vscale2;
20829
0
    vfpacc2x2 *= vscale2;
20830
0
    const float vscale3 = ((const float*) w)[3];
20831
0
    vfpacc0x3 *= vscale3;
20832
0
    vfpacc1x3 *= vscale3;
20833
0
    vfpacc2x3 *= vscale3;
20834
0
    w = (const void*) ((const float*) w + 4);
20835
20836
0
    const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
20837
0
    vfpacc0x0 = math_max_f32(vfpacc0x0, voutput_min_less_zero_point);
20838
0
    vfpacc0x1 = math_max_f32(vfpacc0x1, voutput_min_less_zero_point);
20839
0
    vfpacc0x2 = math_max_f32(vfpacc0x2, voutput_min_less_zero_point);
20840
0
    vfpacc0x3 = math_max_f32(vfpacc0x3, voutput_min_less_zero_point);
20841
0
    vfpacc1x0 = math_max_f32(vfpacc1x0, voutput_min_less_zero_point);
20842
0
    vfpacc1x1 = math_max_f32(vfpacc1x1, voutput_min_less_zero_point);
20843
0
    vfpacc1x2 = math_max_f32(vfpacc1x2, voutput_min_less_zero_point);
20844
0
    vfpacc1x3 = math_max_f32(vfpacc1x3, voutput_min_less_zero_point);
20845
0
    vfpacc2x0 = math_max_f32(vfpacc2x0, voutput_min_less_zero_point);
20846
0
    vfpacc2x1 = math_max_f32(vfpacc2x1, voutput_min_less_zero_point);
20847
0
    vfpacc2x2 = math_max_f32(vfpacc2x2, voutput_min_less_zero_point);
20848
0
    vfpacc2x3 = math_max_f32(vfpacc2x3, voutput_min_less_zero_point);
20849
20850
0
    const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
20851
0
    vfpacc0x0 = math_min_f32(vfpacc0x0, voutput_max_less_zero_point);
20852
0
    vfpacc0x1 = math_min_f32(vfpacc0x1, voutput_max_less_zero_point);
20853
0
    vfpacc0x2 = math_min_f32(vfpacc0x2, voutput_max_less_zero_point);
20854
0
    vfpacc0x3 = math_min_f32(vfpacc0x3, voutput_max_less_zero_point);
20855
0
    vfpacc1x0 = math_min_f32(vfpacc1x0, voutput_max_less_zero_point);
20856
0
    vfpacc1x1 = math_min_f32(vfpacc1x1, voutput_max_less_zero_point);
20857
0
    vfpacc1x2 = math_min_f32(vfpacc1x2, voutput_max_less_zero_point);
20858
0
    vfpacc1x3 = math_min_f32(vfpacc1x3, voutput_max_less_zero_point);
20859
0
    vfpacc2x0 = math_min_f32(vfpacc2x0, voutput_max_less_zero_point);
20860
0
    vfpacc2x1 = math_min_f32(vfpacc2x1, voutput_max_less_zero_point);
20861
0
    vfpacc2x2 = math_min_f32(vfpacc2x2, voutput_max_less_zero_point);
20862
0
    vfpacc2x3 = math_min_f32(vfpacc2x3, voutput_max_less_zero_point);
20863
20864
0
    const int32_t vrndacc0x0 = (int32_t) lrintf(vfpacc0x0);
20865
0
    const int32_t vrndacc0x1 = (int32_t) lrintf(vfpacc0x1);
20866
0
    const int32_t vrndacc0x2 = (int32_t) lrintf(vfpacc0x2);
20867
0
    const int32_t vrndacc0x3 = (int32_t) lrintf(vfpacc0x3);
20868
0
    const int32_t vrndacc1x0 = (int32_t) lrintf(vfpacc1x0);
20869
0
    const int32_t vrndacc1x1 = (int32_t) lrintf(vfpacc1x1);
20870
0
    const int32_t vrndacc1x2 = (int32_t) lrintf(vfpacc1x2);
20871
0
    const int32_t vrndacc1x3 = (int32_t) lrintf(vfpacc1x3);
20872
0
    const int32_t vrndacc2x0 = (int32_t) lrintf(vfpacc2x0);
20873
0
    const int32_t vrndacc2x1 = (int32_t) lrintf(vfpacc2x1);
20874
0
    const int32_t vrndacc2x2 = (int32_t) lrintf(vfpacc2x2);
20875
0
    const int32_t vrndacc2x3 = (int32_t) lrintf(vfpacc2x3);
20876
20877
0
    const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
20878
0
    int32_t vout0x0 = vrndacc0x0 + voutput_zero_point;
20879
0
    int32_t vout0x1 = vrndacc0x1 + voutput_zero_point;
20880
0
    int32_t vout0x2 = vrndacc0x2 + voutput_zero_point;
20881
0
    int32_t vout0x3 = vrndacc0x3 + voutput_zero_point;
20882
0
    int32_t vout1x0 = vrndacc1x0 + voutput_zero_point;
20883
0
    int32_t vout1x1 = vrndacc1x1 + voutput_zero_point;
20884
0
    int32_t vout1x2 = vrndacc1x2 + voutput_zero_point;
20885
0
    int32_t vout1x3 = vrndacc1x3 + voutput_zero_point;
20886
0
    int32_t vout2x0 = vrndacc2x0 + voutput_zero_point;
20887
0
    int32_t vout2x1 = vrndacc2x1 + voutput_zero_point;
20888
0
    int32_t vout2x2 = vrndacc2x2 + voutput_zero_point;
20889
0
    int32_t vout2x3 = vrndacc2x3 + voutput_zero_point;
20890
20891
0
    if XNN_LIKELY(nc >= 4) {
20892
0
      c0[0] = (int8_t) vout0x0;
20893
0
      c0[1] = (int8_t) vout0x1;
20894
0
      c0[2] = (int8_t) vout0x2;
20895
0
      c0[3] = (int8_t) vout0x3;
20896
0
      c1[0] = (int8_t) vout1x0;
20897
0
      c1[1] = (int8_t) vout1x1;
20898
0
      c1[2] = (int8_t) vout1x2;
20899
0
      c1[3] = (int8_t) vout1x3;
20900
0
      c2[0] = (int8_t) vout2x0;
20901
0
      c2[1] = (int8_t) vout2x1;
20902
0
      c2[2] = (int8_t) vout2x2;
20903
0
      c2[3] = (int8_t) vout2x3;
20904
20905
0
      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
20906
0
      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
20907
0
      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
20908
20909
0
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
20910
0
      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
20911
0
      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
20912
20913
0
      nc -= 4;
20914
0
    } else {
20915
0
      if (nc & 2) {
20916
0
        c0[0] = (int8_t) vout0x0;
20917
0
        c0[1] = (int8_t) vout0x1;
20918
0
        vout0x0 = vout0x2;
20919
0
        c0 += 2;
20920
0
        c1[0] = (int8_t) vout1x0;
20921
0
        c1[1] = (int8_t) vout1x1;
20922
0
        vout1x0 = vout1x2;
20923
0
        c1 += 2;
20924
0
        c2[0] = (int8_t) vout2x0;
20925
0
        c2[1] = (int8_t) vout2x1;
20926
0
        vout2x0 = vout2x2;
20927
0
        c2 += 2;
20928
0
      }
20929
0
      if (nc & 1) {
20930
0
        c0[0] = (int8_t) vout0x0;
20931
0
        c1[0] = (int8_t) vout1x0;
20932
0
        c2[0] = (int8_t) vout2x0;
20933
0
      }
20934
20935
0
      nc = 0;
20936
0
    }
20937
0
  } while (nc != 0);
20938
0
}
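The 1x4 and 3x4 kernels read their biases and scales with plain indexed loads rather than the unaligned_* helpers used by the 1x2/2x2 kernels; with NR = 4 each packed column group occupies 16 + 4*K + 16 bytes, so every int32 and float field stays 4-byte aligned. The group layout they walk, as a small illustrative packer; XNNPACK's own packing routines produce this buffer in practice.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Layout of one NR=4 column group (16 + 4*K + 16 bytes):
//   4 x int32         per-column bias
//   K groups of 4     int8 weights, k-major: B[k][n..n+3] for k = 0..K-1
//   4 x float         per-column requantization scale
static void pack_qc8_gemm_x4_group(void* packed, size_t K,
                                   const int32_t bias[4],
                                   const int8_t (*b)[4],  // K rows of 4 columns
                                   const float scale[4])
{
  uint8_t* out = (uint8_t*) packed;
  memcpy(out, bias, 4 * sizeof(int32_t));
  out += 4 * sizeof(int32_t);
  memcpy(out, b, K * 4 * sizeof(int8_t));
  out += K * 4 * sizeof(int8_t);
  memcpy(out, scale, 4 * sizeof(float));
}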
20939
20940
void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x2__scalar_imagic(
20941
    size_t mr,
20942
    size_t nc,
20943
    size_t kc,
20944
    size_t ks,
20945
    const int8_t** restrict a,
20946
    const void* restrict w,
20947
    int8_t* restrict c,
20948
    size_t cm_stride,
20949
    size_t cn_stride,
20950
    size_t a_offset,
20951
    const int8_t* zero,
20952
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
20953
0
{
20954
0
  assert(mr != 0);
20955
0
  assert(mr <= 1);
20956
0
  assert(nc != 0);
20957
0
  assert(kc != 0);
20958
0
  assert(ks != 0);
20959
0
  assert(ks % (1 * sizeof(void*)) == 0);
20960
0
  assert(a != NULL);
20961
0
  assert(w != NULL);
20962
0
  assert(c != NULL);
20963
20964
0
  int8_t* c0 = c;
20965
20966
0
  do {
20967
0
    int32_t vacc0x0 = unaligned_indexed_load_s32(w, 0);
20968
0
    int32_t vacc0x1 = unaligned_indexed_load_s32(w, 1);
20969
0
    w = (const void*) ((const int32_t*) w + 2);
20970
20971
0
    size_t p = ks;
20972
0
    do {
20973
0
      const int8_t* restrict a0 = a[0];
20974
0
      assert(a0 != NULL);
20975
0
      if XNN_UNPREDICTABLE(a0 != zero) {
20976
0
        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
20977
0
      }
20978
0
      a += 1;
20979
20980
0
      size_t k = kc;
20981
0
      do {
20982
0
        const int32_t va0 = (int32_t) *a0++;
20983
20984
0
        const int32_t vb0 = (int32_t) ((const int8_t*) w)[0];
20985
0
        const int32_t vb1 = (int32_t) ((const int8_t*) w)[1];
20986
0
        w = (const void*) ((const int8_t*) w + 2);
20987
20988
0
        vacc0x0 += va0 * vb0;
20989
0
        vacc0x1 += va0 * vb1;
20990
20991
0
        k -= sizeof(int8_t);
20992
0
      } while (k != 0);
20993
0
      p -= 1 * sizeof(void*);
20994
0
    } while (p != 0);
20995
20996
0
    float vfpacc0x0 = (float) vacc0x0;
20997
0
    float vfpacc0x1 = (float) vacc0x1;
20998
20999
0
    const float vscale0 = unaligned_indexed_load_f32(w, 0);
21000
0
    vfpacc0x0 *= vscale0;
21001
0
    const float vscale1 = unaligned_indexed_load_f32(w, 1);
21002
0
    vfpacc0x1 *= vscale1;
21003
0
    w = (const void*) ((const float*) w + 2);
21004
21005
0
    const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
21006
0
    vfpacc0x0 += vmagic_bias;
21007
0
    vfpacc0x1 += vmagic_bias;
21008
21009
0
    int32_t vout0x0 = (int32_t) float_as_uint32(vfpacc0x0);
21010
0
    int32_t vout0x1 = (int32_t) float_as_uint32(vfpacc0x1);
21011
21012
0
    const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
21013
0
    vout0x0 = math_max_s32(vout0x0, vmagic_min);
21014
0
    vout0x1 = math_max_s32(vout0x1, vmagic_min);
21015
21016
0
    const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
21017
0
    vout0x0 = math_min_s32(vout0x0, vmagic_max);
21018
0
    vout0x1 = math_min_s32(vout0x1, vmagic_max);
21019
21020
0
    const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
21021
0
    vout0x0 -= vmagic_bias_less_zero_point;
21022
0
    vout0x1 -= vmagic_bias_less_zero_point;
21023
21024
0
    if XNN_LIKELY(nc >= 2) {
21025
0
      c0[0] = (int8_t) vout0x0;
21026
0
      c0[1] = (int8_t) vout0x1;
21027
21028
0
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
21029
21030
0
      a = (const int8_t**restrict) ((uintptr_t) a - ks);
21031
0
      nc -= 2;
21032
0
    } else {
21033
0
      if (nc & 1) {
21034
0
        c0[0] = (int8_t) vout0x0;
21035
0
      }
21036
21037
0
      nc = 0;
21038
0
    }
21039
0
  } while (nc != 0);
21040
0
}
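The IGEMM kernels differ from the GEMM kernels only in how A is addressed: `a` is an indirection buffer of ks/sizeof(void*) pointers per row tile, one per kernel tap, and entries equal to `zero` point at a shared zero buffer so padded taps skip the a_offset rebase and read zeros. A conceptual sketch of filling those entries for one output pixel; XNNPACK builds this buffer in its operator setup code, and batching and multi-row tiling details are omitted here.

#include <stddef.h>
#include <stdint.h>

// Fill the kernel_h * kernel_w indirection entries for one output pixel of an
// MR=1 tile. Taps that fall inside the input point at the corresponding pixel
// (the kernel later adds a_offset to such pointers); taps in the padding point
// at the shared zero buffer and are left untouched by a_offset.
static void fill_indirection_for_output_pixel(
    const int8_t** a,            // kernel_h * kernel_w entries for this pixel
    const int8_t* input,         // input tensor, assumed [H][W][C] layout
    const int8_t* zero,          // zero buffer of at least C bytes
    size_t H, size_t W, size_t C,
    size_t kernel_h, size_t kernel_w,
    ptrdiff_t oy, ptrdiff_t ox)  // top-left input coordinate read by this pixel
{
  size_t idx = 0;
  for (size_t ky = 0; ky < kernel_h; ky++) {
    for (size_t kx = 0; kx < kernel_w; kx++) {
      const ptrdiff_t iy = oy + (ptrdiff_t) ky;
      const ptrdiff_t ix = ox + (ptrdiff_t) kx;
      const int inside =
          iy >= 0 && iy < (ptrdiff_t) H && ix >= 0 && ix < (ptrdiff_t) W;
      a[idx++] = inside ? input + ((size_t) iy * W + (size_t) ix) * C : zero;
    }
  }
}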
21041
21042
void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf(
21043
    size_t mr,
21044
    size_t nc,
21045
    size_t kc,
21046
    size_t ks,
21047
    const int8_t** restrict a,
21048
    const void* restrict w,
21049
    int8_t* restrict c,
21050
    size_t cm_stride,
21051
    size_t cn_stride,
21052
    size_t a_offset,
21053
    const int8_t* zero,
21054
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
21055
0
{
21056
0
  assert(mr != 0);
21057
0
  assert(mr <= 1);
21058
0
  assert(nc != 0);
21059
0
  assert(kc != 0);
21060
0
  assert(ks != 0);
21061
0
  assert(ks % (1 * sizeof(void*)) == 0);
21062
0
  assert(a != NULL);
21063
0
  assert(w != NULL);
21064
0
  assert(c != NULL);
21065
21066
0
  int8_t* c0 = c;
21067
21068
0
  do {
21069
0
    int32_t vacc0x0 = ((const int32_t*) w)[0];
21070
0
    int32_t vacc0x1 = ((const int32_t*) w)[1];
21071
0
    int32_t vacc0x2 = ((const int32_t*) w)[2];
21072
0
    int32_t vacc0x3 = ((const int32_t*) w)[3];
21073
0
    w = (const void*) ((const int32_t*) w + 4);
21074
21075
0
    size_t p = ks;
21076
0
    do {
21077
0
      const int8_t* restrict a0 = a[0];
21078
0
      assert(a0 != NULL);
21079
0
      if XNN_UNPREDICTABLE(a0 != zero) {
21080
0
        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
21081
0
      }
21082
0
      a += 1;
21083
21084
0
      size_t k = kc;
21085
0
      do {
21086
0
        const int32_t va0 = (int32_t) *a0++;
21087
21088
0
        const int32_t vb0 = (int32_t) ((const int8_t*) w)[0];
21089
0
        const int32_t vb1 = (int32_t) ((const int8_t*) w)[1];
21090
0
        const int32_t vb2 = (int32_t) ((const int8_t*) w)[2];
21091
0
        const int32_t vb3 = (int32_t) ((const int8_t*) w)[3];
21092
0
        w = (const void*) ((const int8_t*) w + 4);
21093
21094
0
        vacc0x0 += va0 * vb0;
21095
0
        vacc0x1 += va0 * vb1;
21096
0
        vacc0x2 += va0 * vb2;
21097
0
        vacc0x3 += va0 * vb3;
21098
21099
0
        k -= sizeof(int8_t);
21100
0
      } while (k != 0);
21101
0
      p -= 1 * sizeof(void*);
21102
0
    } while (p != 0);
21103
21104
0
    float vfpacc0x0 = (float) vacc0x0;
21105
0
    float vfpacc0x1 = (float) vacc0x1;
21106
0
    float vfpacc0x2 = (float) vacc0x2;
21107
0
    float vfpacc0x3 = (float) vacc0x3;
21108
21109
0
    const float vscale0 = ((const float*) w)[0];
21110
0
    vfpacc0x0 *= vscale0;
21111
0
    const float vscale1 = ((const float*) w)[1];
21112
0
    vfpacc0x1 *= vscale1;
21113
0
    const float vscale2 = ((const float*) w)[2];
21114
0
    vfpacc0x2 *= vscale2;
21115
0
    const float vscale3 = ((const float*) w)[3];
21116
0
    vfpacc0x3 *= vscale3;
21117
0
    w = (const void*) ((const float*) w + 4);
21118
21119
0
    const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
21120
0
    vfpacc0x0 = math_max_f32(vfpacc0x0, voutput_min_less_zero_point);
21121
0
    vfpacc0x1 = math_max_f32(vfpacc0x1, voutput_min_less_zero_point);
21122
0
    vfpacc0x2 = math_max_f32(vfpacc0x2, voutput_min_less_zero_point);
21123
0
    vfpacc0x3 = math_max_f32(vfpacc0x3, voutput_min_less_zero_point);
21124
21125
0
    const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
21126
0
    vfpacc0x0 = math_min_f32(vfpacc0x0, voutput_max_less_zero_point);
21127
0
    vfpacc0x1 = math_min_f32(vfpacc0x1, voutput_max_less_zero_point);
21128
0
    vfpacc0x2 = math_min_f32(vfpacc0x2, voutput_max_less_zero_point);
21129
0
    vfpacc0x3 = math_min_f32(vfpacc0x3, voutput_max_less_zero_point);
21130
21131
0
    const int32_t vrndacc0x0 = (int32_t) lrintf(vfpacc0x0);
21132
0
    const int32_t vrndacc0x1 = (int32_t) lrintf(vfpacc0x1);
21133
0
    const int32_t vrndacc0x2 = (int32_t) lrintf(vfpacc0x2);
21134
0
    const int32_t vrndacc0x3 = (int32_t) lrintf(vfpacc0x3);
21135
21136
0
    const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
21137
0
    int32_t vout0x0 = vrndacc0x0 + voutput_zero_point;
21138
0
    int32_t vout0x1 = vrndacc0x1 + voutput_zero_point;
21139
0
    int32_t vout0x2 = vrndacc0x2 + voutput_zero_point;
21140
0
    int32_t vout0x3 = vrndacc0x3 + voutput_zero_point;
21141
21142
0
    if XNN_LIKELY(nc >= 4) {
21143
0
      c0[0] = (int8_t) vout0x0;
21144
0
      c0[1] = (int8_t) vout0x1;
21145
0
      c0[2] = (int8_t) vout0x2;
21146
0
      c0[3] = (int8_t) vout0x3;
21147
21148
0
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
21149
21150
0
      a = (const int8_t**restrict) ((uintptr_t) a - ks);
21151
0
      nc -= 4;
21152
0
    } else {
21153
0
      if (nc & 2) {
21154
0
        c0[0] = (int8_t) vout0x0;
21155
0
        c0[1] = (int8_t) vout0x1;
21156
0
        vout0x0 = vout0x2;
21157
0
        c0 += 2;
21158
0
      }
21159
0
      if (nc & 1) {
21160
0
        c0[0] = (int8_t) vout0x0;
21161
0
      }
21162
21163
0
      nc = 0;
21164
0
    }
21165
0
  } while (nc != 0);
21166
0
}
21167
21168
void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x2__scalar_imagic(
21169
    size_t mr,
21170
    size_t nc,
21171
    size_t kc,
21172
    size_t ks,
21173
    const int8_t** restrict a,
21174
    const void* restrict w,
21175
    int8_t* restrict c,
21176
    size_t cm_stride,
21177
    size_t cn_stride,
21178
    size_t a_offset,
21179
    const int8_t* zero,
21180
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
21181
0
{
21182
0
  assert(mr != 0);
21183
0
  assert(mr <= 2);
21184
0
  assert(nc != 0);
21185
0
  assert(kc != 0);
21186
0
  assert(ks != 0);
21187
0
  assert(ks % (2 * sizeof(void*)) == 0);
21188
0
  assert(a != NULL);
21189
0
  assert(w != NULL);
21190
0
  assert(c != NULL);
21191
21192
0
  int8_t* c0 = c;
21193
0
  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
21194
0
  if XNN_UNPREDICTABLE(mr != 2) {
21195
0
    c1 = c0;
21196
0
  }
21197
21198
0
  do {
21199
0
    int32_t vacc0x0 = unaligned_indexed_load_s32(w, 0);
21200
0
    int32_t vacc0x1 = unaligned_indexed_load_s32(w, 1);
21201
0
    int32_t vacc1x0 = vacc0x0;
21202
0
    int32_t vacc1x1 = vacc0x1;
21203
0
    w = (const void*) ((const int32_t*) w + 2);
21204
21205
0
    size_t p = ks;
21206
0
    do {
21207
0
      const int8_t* restrict a0 = a[0];
21208
0
      assert(a0 != NULL);
21209
0
      if XNN_UNPREDICTABLE(a0 != zero) {
21210
0
        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
21211
0
      }
21212
0
      const int8_t* restrict a1 = a[1];
21213
0
      assert(a1 != NULL);
21214
0
      if XNN_UNPREDICTABLE(a1 != zero) {
21215
0
        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
21216
0
      }
21217
0
      a += 2;
21218
21219
0
      size_t k = kc;
21220
0
      do {
21221
0
        const int32_t va0 = (int32_t) *a0++;
21222
0
        const int32_t va1 = (int32_t) *a1++;
21223
21224
0
        const int32_t vb0 = (int32_t) ((const int8_t*) w)[0];
21225
0
        const int32_t vb1 = (int32_t) ((const int8_t*) w)[1];
21226
0
        w = (const void*) ((const int8_t*) w + 2);
21227
21228
0
        vacc0x0 += va0 * vb0;
21229
0
        vacc0x1 += va0 * vb1;
21230
0
        vacc1x0 += va1 * vb0;
21231
0
        vacc1x1 += va1 * vb1;
21232
21233
0
        k -= sizeof(int8_t);
21234
0
      } while (k != 0);
21235
0
      p -= 2 * sizeof(void*);
21236
0
    } while (p != 0);
21237
21238
0
    float vfpacc0x0 = (float) vacc0x0;
21239
0
    float vfpacc0x1 = (float) vacc0x1;
21240
0
    float vfpacc1x0 = (float) vacc1x0;
21241
0
    float vfpacc1x1 = (float) vacc1x1;
21242
21243
0
    const float vscale0 = unaligned_indexed_load_f32(w, 0);
21244
0
    vfpacc0x0 *= vscale0;
21245
0
    vfpacc1x0 *= vscale0;
21246
0
    const float vscale1 = unaligned_indexed_load_f32(w, 1);
21247
0
    vfpacc0x1 *= vscale1;
21248
0
    vfpacc1x1 *= vscale1;
21249
0
    w = (const void*) ((const float*) w + 2);
21250
21251
0
    const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
21252
0
    vfpacc0x0 += vmagic_bias;
21253
0
    vfpacc0x1 += vmagic_bias;
21254
0
    vfpacc1x0 += vmagic_bias;
21255
0
    vfpacc1x1 += vmagic_bias;
21256
21257
0
    int32_t vout0x0 = (int32_t) float_as_uint32(vfpacc0x0);
21258
0
    int32_t vout0x1 = (int32_t) float_as_uint32(vfpacc0x1);
21259
0
    int32_t vout1x0 = (int32_t) float_as_uint32(vfpacc1x0);
21260
0
    int32_t vout1x1 = (int32_t) float_as_uint32(vfpacc1x1);
21261
21262
0
    const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
21263
0
    vout0x0 = math_max_s32(vout0x0, vmagic_min);
21264
0
    vout0x1 = math_max_s32(vout0x1, vmagic_min);
21265
0
    vout1x0 = math_max_s32(vout1x0, vmagic_min);
21266
0
    vout1x1 = math_max_s32(vout1x1, vmagic_min);
21267
21268
0
    const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
21269
0
    vout0x0 = math_min_s32(vout0x0, vmagic_max);
21270
0
    vout0x1 = math_min_s32(vout0x1, vmagic_max);
21271
0
    vout1x0 = math_min_s32(vout1x0, vmagic_max);
21272
0
    vout1x1 = math_min_s32(vout1x1, vmagic_max);
21273
21274
0
    const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
21275
0
    vout0x0 -= vmagic_bias_less_zero_point;
21276
0
    vout0x1 -= vmagic_bias_less_zero_point;
21277
0
    vout1x0 -= vmagic_bias_less_zero_point;
21278
0
    vout1x1 -= vmagic_bias_less_zero_point;
21279
21280
0
    if XNN_LIKELY(nc >= 2) {
21281
0
      c1[0] = (int8_t) vout1x0;
21282
0
      c1[1] = (int8_t) vout1x1;
21283
0
      c0[0] = (int8_t) vout0x0;
21284
0
      c0[1] = (int8_t) vout0x1;
21285
21286
0
      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
21287
0
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
21288
21289
0
      a = (const int8_t**restrict) ((uintptr_t) a - ks);
21290
0
      nc -= 2;
21291
0
    } else {
21292
0
      if (nc & 1) {
21293
0
        c1[0] = (int8_t) vout1x0;
21294
0
        c0[0] = (int8_t) vout0x0;
21295
0
      }
21296
21297
0
      nc = 0;
21298
0
    }
21299
0
  } while (nc != 0);
21300
0
}
21301
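// Annotation (illustrative, not part of the generated scalar.c): the
// *_scalar_imagic kernels above avoid lrintf() by adding a "magic bias"
// (typically 12582912.0f == 0x1.8p+23f) to the scaled accumulator.  For results
// of small enough magnitude this pins the float into a fixed binade, so the
// rounded integer lands in the low mantissa bits, the min/max clamp can be
// applied directly to the raw bit pattern, and subtracting
// magic_bias_less_zero_point leaves the quantized value with the output zero
// point already applied.  A minimal sketch of the same sequence, with the
// parameter fields passed in explicitly (their exact derivation lives in the
// params initialization, not in this file):
static inline int8_t qs8_requantize_imagic_sketch(
    float vfpacc,                         // accumulator already multiplied by the channel scale
    float vmagic_bias,                    // magic rounding constant, e.g. 12582912.0f
    int32_t vmagic_min,                   // lower clamp bound in the magic-biased integer domain
    int32_t vmagic_max,                   // upper clamp bound in the magic-biased integer domain
    int32_t vmagic_bias_less_zero_point)  // typically the bias' bit pattern minus the output zero point
{
  vfpacc += vmagic_bias;
  int32_t vout = (int32_t) float_as_uint32(vfpacc);
  vout = math_max_s32(vout, vmagic_min);
  vout = math_min_s32(vout, vmagic_max);
  vout -= vmagic_bias_less_zero_point;
  return (int8_t) vout;
}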
21302
void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf(
21303
    size_t mr,
21304
    size_t nc,
21305
    size_t kc,
21306
    size_t ks,
21307
    const int8_t** restrict a,
21308
    const void* restrict w,
21309
    int8_t* restrict c,
21310
    size_t cm_stride,
21311
    size_t cn_stride,
21312
    size_t a_offset,
21313
    const int8_t* zero,
21314
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
21315
0
{
21316
0
  assert(mr != 0);
21317
0
  assert(mr <= 3);
21318
0
  assert(nc != 0);
21319
0
  assert(kc != 0);
21320
0
  assert(ks != 0);
21321
0
  assert(ks % (3 * sizeof(void*)) == 0);
21322
0
  assert(a != NULL);
21323
0
  assert(w != NULL);
21324
0
  assert(c != NULL);
21325
21326
0
  int8_t* c0 = c;
21327
0
  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
21328
0
  if XNN_UNPREDICTABLE(mr < 2) {
21329
0
    c1 = c0;
21330
0
  }
21331
0
  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
21332
0
  if XNN_UNPREDICTABLE(mr <= 2) {
21333
0
    c2 = c1;
21334
0
  }
21335
21336
0
  do {
21337
0
    int32_t vacc0x0 = ((const int32_t*) w)[0];
21338
0
    int32_t vacc0x1 = ((const int32_t*) w)[1];
21339
0
    int32_t vacc0x2 = ((const int32_t*) w)[2];
21340
0
    int32_t vacc0x3 = ((const int32_t*) w)[3];
21341
0
    int32_t vacc1x0 = vacc0x0;
21342
0
    int32_t vacc1x1 = vacc0x1;
21343
0
    int32_t vacc1x2 = vacc0x2;
21344
0
    int32_t vacc1x3 = vacc0x3;
21345
0
    int32_t vacc2x0 = vacc0x0;
21346
0
    int32_t vacc2x1 = vacc0x1;
21347
0
    int32_t vacc2x2 = vacc0x2;
21348
0
    int32_t vacc2x3 = vacc0x3;
21349
0
    w = (const void*) ((const int32_t*) w + 4);
21350
21351
0
    size_t p = ks;
21352
0
    do {
21353
0
      const int8_t* restrict a0 = a[0];
21354
0
      assert(a0 != NULL);
21355
0
      if XNN_UNPREDICTABLE(a0 != zero) {
21356
0
        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
21357
0
      }
21358
0
      const int8_t* restrict a1 = a[1];
21359
0
      assert(a1 != NULL);
21360
0
      if XNN_UNPREDICTABLE(a1 != zero) {
21361
0
        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
21362
0
      }
21363
0
      const int8_t* restrict a2 = a[2];
21364
0
      assert(a2 != NULL);
21365
0
      if XNN_UNPREDICTABLE(a2 != zero) {
21366
0
        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
21367
0
      }
21368
0
      a += 3;
21369
21370
0
      size_t k = kc;
21371
0
      do {
21372
0
        const int32_t va0 = (int32_t) *a0++;
21373
0
        const int32_t va1 = (int32_t) *a1++;
21374
0
        const int32_t va2 = (int32_t) *a2++;
21375
21376
0
        const int32_t vb0 = (int32_t) ((const int8_t*) w)[0];
21377
0
        const int32_t vb1 = (int32_t) ((const int8_t*) w)[1];
21378
0
        const int32_t vb2 = (int32_t) ((const int8_t*) w)[2];
21379
0
        const int32_t vb3 = (int32_t) ((const int8_t*) w)[3];
21380
0
        w = (const void*) ((const int8_t*) w + 4);
21381
21382
0
        vacc0x0 += va0 * vb0;
21383
0
        vacc0x1 += va0 * vb1;
21384
0
        vacc0x2 += va0 * vb2;
21385
0
        vacc0x3 += va0 * vb3;
21386
0
        vacc1x0 += va1 * vb0;
21387
0
        vacc1x1 += va1 * vb1;
21388
0
        vacc1x2 += va1 * vb2;
21389
0
        vacc1x3 += va1 * vb3;
21390
0
        vacc2x0 += va2 * vb0;
21391
0
        vacc2x1 += va2 * vb1;
21392
0
        vacc2x2 += va2 * vb2;
21393
0
        vacc2x3 += va2 * vb3;
21394
21395
0
        k -= sizeof(int8_t);
21396
0
      } while (k != 0);
21397
0
      p -= 3 * sizeof(void*);
21398
0
    } while (p != 0);
21399
21400
0
    float vfpacc0x0 = (float) vacc0x0;
21401
0
    float vfpacc0x1 = (float) vacc0x1;
21402
0
    float vfpacc0x2 = (float) vacc0x2;
21403
0
    float vfpacc0x3 = (float) vacc0x3;
21404
0
    float vfpacc1x0 = (float) vacc1x0;
21405
0
    float vfpacc1x1 = (float) vacc1x1;
21406
0
    float vfpacc1x2 = (float) vacc1x2;
21407
0
    float vfpacc1x3 = (float) vacc1x3;
21408
0
    float vfpacc2x0 = (float) vacc2x0;
21409
0
    float vfpacc2x1 = (float) vacc2x1;
21410
0
    float vfpacc2x2 = (float) vacc2x2;
21411
0
    float vfpacc2x3 = (float) vacc2x3;
21412
21413
0
    const float vscale0 = ((const float*) w)[0];
21414
0
    vfpacc0x0 *= vscale0;
21415
0
    vfpacc1x0 *= vscale0;
21416
0
    vfpacc2x0 *= vscale0;
21417
0
    const float vscale1 = ((const float*) w)[1];
21418
0
    vfpacc0x1 *= vscale1;
21419
0
    vfpacc1x1 *= vscale1;
21420
0
    vfpacc2x1 *= vscale1;
21421
0
    const float vscale2 = ((const float*) w)[2];
21422
0
    vfpacc0x2 *= vscale2;
21423
0
    vfpacc1x2 *= vscale2;
21424
0
    vfpacc2x2 *= vscale2;
21425
0
    const float vscale3 = ((const float*) w)[3];
21426
0
    vfpacc0x3 *= vscale3;
21427
0
    vfpacc1x3 *= vscale3;
21428
0
    vfpacc2x3 *= vscale3;
21429
0
    w = (const void*) ((const float*) w + 4);
21430
21431
0
    const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
21432
0
    vfpacc0x0 = math_max_f32(vfpacc0x0, voutput_min_less_zero_point);
21433
0
    vfpacc0x1 = math_max_f32(vfpacc0x1, voutput_min_less_zero_point);
21434
0
    vfpacc0x2 = math_max_f32(vfpacc0x2, voutput_min_less_zero_point);
21435
0
    vfpacc0x3 = math_max_f32(vfpacc0x3, voutput_min_less_zero_point);
21436
0
    vfpacc1x0 = math_max_f32(vfpacc1x0, voutput_min_less_zero_point);
21437
0
    vfpacc1x1 = math_max_f32(vfpacc1x1, voutput_min_less_zero_point);
21438
0
    vfpacc1x2 = math_max_f32(vfpacc1x2, voutput_min_less_zero_point);
21439
0
    vfpacc1x3 = math_max_f32(vfpacc1x3, voutput_min_less_zero_point);
21440
0
    vfpacc2x0 = math_max_f32(vfpacc2x0, voutput_min_less_zero_point);
21441
0
    vfpacc2x1 = math_max_f32(vfpacc2x1, voutput_min_less_zero_point);
21442
0
    vfpacc2x2 = math_max_f32(vfpacc2x2, voutput_min_less_zero_point);
21443
0
    vfpacc2x3 = math_max_f32(vfpacc2x3, voutput_min_less_zero_point);
21444
21445
0
    const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
21446
0
    vfpacc0x0 = math_min_f32(vfpacc0x0, voutput_max_less_zero_point);
21447
0
    vfpacc0x1 = math_min_f32(vfpacc0x1, voutput_max_less_zero_point);
21448
0
    vfpacc0x2 = math_min_f32(vfpacc0x2, voutput_max_less_zero_point);
21449
0
    vfpacc0x3 = math_min_f32(vfpacc0x3, voutput_max_less_zero_point);
21450
0
    vfpacc1x0 = math_min_f32(vfpacc1x0, voutput_max_less_zero_point);
21451
0
    vfpacc1x1 = math_min_f32(vfpacc1x1, voutput_max_less_zero_point);
21452
0
    vfpacc1x2 = math_min_f32(vfpacc1x2, voutput_max_less_zero_point);
21453
0
    vfpacc1x3 = math_min_f32(vfpacc1x3, voutput_max_less_zero_point);
21454
0
    vfpacc2x0 = math_min_f32(vfpacc2x0, voutput_max_less_zero_point);
21455
0
    vfpacc2x1 = math_min_f32(vfpacc2x1, voutput_max_less_zero_point);
21456
0
    vfpacc2x2 = math_min_f32(vfpacc2x2, voutput_max_less_zero_point);
21457
0
    vfpacc2x3 = math_min_f32(vfpacc2x3, voutput_max_less_zero_point);
21458
21459
0
    const int32_t vrndacc0x0 = (int32_t) lrintf(vfpacc0x0);
21460
0
    const int32_t vrndacc0x1 = (int32_t) lrintf(vfpacc0x1);
21461
0
    const int32_t vrndacc0x2 = (int32_t) lrintf(vfpacc0x2);
21462
0
    const int32_t vrndacc0x3 = (int32_t) lrintf(vfpacc0x3);
21463
0
    const int32_t vrndacc1x0 = (int32_t) lrintf(vfpacc1x0);
21464
0
    const int32_t vrndacc1x1 = (int32_t) lrintf(vfpacc1x1);
21465
0
    const int32_t vrndacc1x2 = (int32_t) lrintf(vfpacc1x2);
21466
0
    const int32_t vrndacc1x3 = (int32_t) lrintf(vfpacc1x3);
21467
0
    const int32_t vrndacc2x0 = (int32_t) lrintf(vfpacc2x0);
21468
0
    const int32_t vrndacc2x1 = (int32_t) lrintf(vfpacc2x1);
21469
0
    const int32_t vrndacc2x2 = (int32_t) lrintf(vfpacc2x2);
21470
0
    const int32_t vrndacc2x3 = (int32_t) lrintf(vfpacc2x3);
21471
21472
0
    const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
21473
0
    int32_t vout0x0 = vrndacc0x0 + voutput_zero_point;
21474
0
    int32_t vout0x1 = vrndacc0x1 + voutput_zero_point;
21475
0
    int32_t vout0x2 = vrndacc0x2 + voutput_zero_point;
21476
0
    int32_t vout0x3 = vrndacc0x3 + voutput_zero_point;
21477
0
    int32_t vout1x0 = vrndacc1x0 + voutput_zero_point;
21478
0
    int32_t vout1x1 = vrndacc1x1 + voutput_zero_point;
21479
0
    int32_t vout1x2 = vrndacc1x2 + voutput_zero_point;
21480
0
    int32_t vout1x3 = vrndacc1x3 + voutput_zero_point;
21481
0
    int32_t vout2x0 = vrndacc2x0 + voutput_zero_point;
21482
0
    int32_t vout2x1 = vrndacc2x1 + voutput_zero_point;
21483
0
    int32_t vout2x2 = vrndacc2x2 + voutput_zero_point;
21484
0
    int32_t vout2x3 = vrndacc2x3 + voutput_zero_point;
21485
21486
0
    if XNN_LIKELY(nc >= 4) {
21487
0
      c2[0] = (int8_t) vout2x0;
21488
0
      c2[1] = (int8_t) vout2x1;
21489
0
      c2[2] = (int8_t) vout2x2;
21490
0
      c2[3] = (int8_t) vout2x3;
21491
0
      c1[0] = (int8_t) vout1x0;
21492
0
      c1[1] = (int8_t) vout1x1;
21493
0
      c1[2] = (int8_t) vout1x2;
21494
0
      c1[3] = (int8_t) vout1x3;
21495
0
      c0[0] = (int8_t) vout0x0;
21496
0
      c0[1] = (int8_t) vout0x1;
21497
0
      c0[2] = (int8_t) vout0x2;
21498
0
      c0[3] = (int8_t) vout0x3;
21499
21500
0
      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
21501
0
      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
21502
0
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
21503
21504
0
      a = (const int8_t**restrict) ((uintptr_t) a - ks);
21505
0
      nc -= 4;
21506
0
    } else {
21507
0
      if (nc & 2) {
21508
0
        c2[0] = (int8_t) vout2x0;
21509
0
        c2[1] = (int8_t) vout2x1;
21510
0
        vout2x0 = vout2x2;
21511
0
        c2 += 2;
21512
0
        c1[0] = (int8_t) vout1x0;
21513
0
        c1[1] = (int8_t) vout1x1;
21514
0
        vout1x0 = vout1x2;
21515
0
        c1 += 2;
21516
0
        c0[0] = (int8_t) vout0x0;
21517
0
        c0[1] = (int8_t) vout0x1;
21518
0
        vout0x0 = vout0x2;
21519
0
        c0 += 2;
21520
0
      }
21521
0
      if (nc & 1) {
21522
0
        c2[0] = (int8_t) vout2x0;
21523
0
        c1[0] = (int8_t) vout1x0;
21524
0
        c0[0] = (int8_t) vout0x0;
21525
0
      }
21526
21527
0
      nc = 0;
21528
0
    }
21529
0
  } while (nc != 0);
21530
0
}
21531
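// Annotation (illustrative, not part of the generated scalar.c): the
// *_scalar_lrintf kernels above take the straightforward requantization route:
// clamp the scaled accumulator in the float domain, round it with lrintf()
// (round-to-nearest-even under the default rounding mode), then re-add the
// output zero point.  Condensed into a helper for one accumulator:
static inline int8_t qs8_requantize_lrintf_sketch(
    float vfpacc,                         // accumulator already multiplied by the channel scale
    float voutput_min_less_zero_point,    // output_min - output_zero_point, as float
    float voutput_max_less_zero_point,    // output_max - output_zero_point, as float
    int32_t voutput_zero_point)
{
  vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
  vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
  const int32_t vrndacc = (int32_t) lrintf(vfpacc);
  return (int8_t) (vrndacc + voutput_zero_point);
}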
21532
void xnn_qs8_vadd_minmax_ukernel__scalar_u1(
21533
    size_t batch,
21534
    const int8_t* input_a,
21535
    const int8_t* input_b,
21536
    int8_t* output,
21537
    const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
21538
0
{
21539
0
  assert(batch != 0);
21540
0
  assert(batch % sizeof(int8_t) == 0);
21541
0
  assert(input_a != NULL);
21542
0
  assert(input_b != NULL);
21543
0
  assert(output != NULL);
21544
21545
0
  const int32_t vbias = params->scalar.bias;
21546
0
  const int32_t va_multiplier = params->scalar.a_multiplier;
21547
0
  const int32_t vb_multiplier = params->scalar.b_multiplier;
21548
0
  const uint32_t vshift = params->scalar.shift;
21549
0
  const int32_t voutput_min_less_zero_point = params->scalar.output_min_less_zero_point;
21550
0
  const int32_t voutput_max_less_zero_point = params->scalar.output_max_less_zero_point;
21551
0
  const int32_t voutput_zero_point = params->scalar.output_zero_point;
21552
21553
0
  do {
21554
0
    const int32_t va = *input_a++;
21555
0
    const int32_t vb = *input_b++;
21556
0
    const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier;
21557
21558
0
    int32_t vout = math_asr_s32(vacc, vshift);
21559
0
    vout = math_max_s32(vout, voutput_min_less_zero_point);
21560
0
    vout = math_min_s32(vout, voutput_max_less_zero_point);
21561
0
    *output++ = (int8_t) (vout + voutput_zero_point);
21562
21563
0
    batch -= sizeof(int8_t);
21564
0
  } while (batch != 0);
21565
0
}
21566
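// Annotation (illustrative, not part of the generated scalar.c): quantized
// addition in the kernel above (and its u4 counterpart below) stays entirely in
// 32-bit integers.  The parameter initialization folds the input zero points
// and scales into `bias`, `a_multiplier`, `b_multiplier`, and `shift`, so each
// element reduces to one fused expression followed by an arithmetic-shift
// requantization and a clamp expressed relative to the output zero point:
static inline int8_t qs8_add_element_sketch(
    int8_t a, int8_t b,
    int32_t vbias, int32_t va_multiplier, int32_t vb_multiplier,
    uint32_t vshift,
    int32_t voutput_min_less_zero_point, int32_t voutput_max_less_zero_point,
    int32_t voutput_zero_point)
{
  const int32_t vacc = vbias + (int32_t) a * va_multiplier + (int32_t) b * vb_multiplier;
  int32_t vout = math_asr_s32(vacc, vshift);
  vout = math_max_s32(vout, voutput_min_less_zero_point);
  vout = math_min_s32(vout, voutput_max_less_zero_point);
  return (int8_t) (vout + voutput_zero_point);
}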
21567
void xnn_qs8_vadd_minmax_ukernel__scalar_u4(
21568
    size_t batch,
21569
    const int8_t* input_a,
21570
    const int8_t* input_b,
21571
    int8_t* output,
21572
    const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
21573
0
{
21574
0
  assert(batch != 0);
21575
0
  assert(batch % sizeof(int8_t) == 0);
21576
0
  assert(input_a != NULL);
21577
0
  assert(input_b != NULL);
21578
0
  assert(output != NULL);
21579
21580
0
  const int32_t vbias = params->scalar.bias;
21581
0
  const int32_t va_multiplier = params->scalar.a_multiplier;
21582
0
  const int32_t vb_multiplier = params->scalar.b_multiplier;
21583
0
  const uint32_t vshift = params->scalar.shift;
21584
0
  const int32_t voutput_min_less_zero_point = params->scalar.output_min_less_zero_point;
21585
0
  const int32_t voutput_max_less_zero_point = params->scalar.output_max_less_zero_point;
21586
0
  const int32_t voutput_zero_point = params->scalar.output_zero_point;
21587
21588
0
  for (; batch >= 4 * sizeof(int8_t); batch -= 4 * sizeof(int8_t)) {
21589
0
    const int32_t va0 = input_a[0];
21590
0
    const int32_t va1 = input_a[1];
21591
0
    const int32_t va2 = input_a[2];
21592
0
    const int32_t va3 = input_a[3];
21593
0
    input_a += 4;
21594
21595
0
    const int32_t vb0 = input_b[0];
21596
0
    int32_t vacc0 = vbias + va0 * va_multiplier;
21597
0
    const int32_t vb1 = input_b[1];
21598
0
    int32_t vacc1 = vbias + va1 * va_multiplier;
21599
0
    const int32_t vb2 = input_b[2];
21600
0
    int32_t vacc2 = vbias + va2 * va_multiplier;
21601
0
    const int32_t vb3 = input_b[3];
21602
0
    int32_t vacc3 = vbias + va3 * va_multiplier;
21603
0
    input_b += 4;
21604
21605
0
    vacc0 += vb0 * vb_multiplier;
21606
0
    vacc1 += vb1 * vb_multiplier;
21607
0
    vacc2 += vb2 * vb_multiplier;
21608
0
    vacc3 += vb3 * vb_multiplier;
21609
21610
0
    int32_t vout0 = math_asr_s32(vacc0, vshift);
21611
0
    int32_t vout1 = math_asr_s32(vacc1, vshift);
21612
0
    int32_t vout2 = math_asr_s32(vacc2, vshift);
21613
0
    int32_t vout3 = math_asr_s32(vacc3, vshift);
21614
21615
0
    vout0 = math_max_s32(vout0, voutput_min_less_zero_point);
21616
0
    vout1 = math_max_s32(vout1, voutput_min_less_zero_point);
21617
0
    vout2 = math_max_s32(vout2, voutput_min_less_zero_point);
21618
0
    vout3 = math_max_s32(vout3, voutput_min_less_zero_point);
21619
21620
0
    vout0 = math_min_s32(vout0, voutput_max_less_zero_point);
21621
0
    vout1 = math_min_s32(vout1, voutput_max_less_zero_point);
21622
0
    vout2 = math_min_s32(vout2, voutput_max_less_zero_point);
21623
0
    vout3 = math_min_s32(vout3, voutput_max_less_zero_point);
21624
21625
0
    vout0 += voutput_zero_point;
21626
0
    vout1 += voutput_zero_point;
21627
0
    vout2 += voutput_zero_point;
21628
0
    vout3 += voutput_zero_point;
21629
21630
0
    output[0] = (int8_t) vout0;
21631
0
    output[1] = (int8_t) vout1;
21632
0
    output[2] = (int8_t) vout2;
21633
0
    output[3] = (int8_t) vout3;
21634
0
    output += 4;
21635
0
  }
21636
0
  if XNN_UNLIKELY(batch != 0) {
21637
0
    do {
21638
0
      const int32_t va = *input_a++;
21639
0
      const int32_t vb = *input_b++;
21640
0
      const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier;
21641
21642
0
      int32_t vout = math_asr_s32(vacc, vshift);
21643
0
      vout = math_max_s32(vout, voutput_min_less_zero_point);
21644
0
      vout = math_min_s32(vout, voutput_max_less_zero_point);
21645
0
      *output++ = (int8_t) (vout + voutput_zero_point);
21646
21647
0
      batch -= sizeof(int8_t);
21648
0
    } while (batch != 0);
21649
0
  }
21650
0
}
21651
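// Annotation (illustrative, not part of the generated scalar.c): the u4 kernels
// in this file all follow the same shape as the one above: a main loop that
// processes four elements per iteration with the loads, multiplies, shifts,
// clamps, and stores grouped by stage (giving the compiler four independent
// chains to schedule), followed by the same per-element code as the u1 kernel
// for the 1-3 element remainder, guarded by XNN_UNLIKELY.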
21652
void xnn_qs8_vaddc_minmax_ukernel__scalar_u1(
21653
    size_t batch,
21654
    const int8_t* input_a,
21655
    const int8_t* input_b,
21656
    int8_t* output,
21657
    const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
21658
0
{
21659
0
  assert(batch != 0);
21660
0
  assert(batch % sizeof(int8_t) == 0);
21661
0
  assert(input_a != NULL);
21662
0
  assert(input_b != NULL);
21663
0
  assert(output != NULL);
21664
21665
0
  const int32_t vbias = params->scalar.bias + (int32_t) *input_b * params->scalar.b_multiplier;
21666
0
  const int32_t va_multiplier = params->scalar.a_multiplier;
21667
0
  const uint32_t vshift = params->scalar.shift;
21668
0
  const int32_t voutput_min_less_zero_point = params->scalar.output_min_less_zero_point;
21669
0
  const int32_t voutput_max_less_zero_point = params->scalar.output_max_less_zero_point;
21670
0
  const int32_t voutput_zero_point = params->scalar.output_zero_point;
21671
21672
0
  do {
21673
0
    const int32_t va = *input_a++;
21674
0
    const int32_t vacc = vbias + va * va_multiplier;
21675
21676
0
    int32_t vout = math_asr_s32(vacc, vshift);
21677
0
    vout = math_max_s32(vout, voutput_min_less_zero_point);
21678
0
    vout = math_min_s32(vout, voutput_max_less_zero_point);
21679
0
    *output++ = (int8_t) (vout + voutput_zero_point);
21680
21681
0
    batch -= sizeof(int8_t);
21682
0
  } while (batch != 0);
21683
0
}
21684
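// Annotation (illustrative, not part of the generated scalar.c): the vaddc
// ("add a broadcast constant") kernels above and below reuse the vadd parameter
// block; the only difference is that the constant operand's contribution is
// folded into the bias once, outside the element loop:
//
//   const int32_t vbias = params->scalar.bias
//                       + (int32_t) *input_b * params->scalar.b_multiplier;
//
// after which each element is processed exactly as in the vadd kernels, with
// the per-element `vb * vb_multiplier` term dropped.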
21685
void xnn_qs8_vaddc_minmax_ukernel__scalar_u4(
21686
    size_t batch,
21687
    const int8_t* input_a,
21688
    const int8_t* input_b,
21689
    int8_t* output,
21690
    const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
21691
0
{
21692
0
  assert(batch != 0);
21693
0
  assert(batch % sizeof(int8_t) == 0);
21694
0
  assert(input_a != NULL);
21695
0
  assert(input_b != NULL);
21696
0
  assert(output != NULL);
21697
21698
0
  const int32_t vbias = params->scalar.bias + (int32_t) *input_b * params->scalar.b_multiplier;
21699
0
  const int32_t va_multiplier = params->scalar.a_multiplier;
21700
0
  const uint32_t vshift = params->scalar.shift;
21701
0
  const int32_t voutput_min_less_zero_point = params->scalar.output_min_less_zero_point;
21702
0
  const int32_t voutput_max_less_zero_point = params->scalar.output_max_less_zero_point;
21703
0
  const int32_t voutput_zero_point = params->scalar.output_zero_point;
21704
21705
0
  for (; batch >= 4 * sizeof(int8_t); batch -= 4 * sizeof(int8_t)) {
21706
0
    const int32_t va0 = input_a[0];
21707
0
    const int32_t va1 = input_a[1];
21708
0
    const int32_t va2 = input_a[2];
21709
0
    const int32_t va3 = input_a[3];
21710
0
    input_a += 4;
21711
21712
0
    const int32_t vacc0 = vbias + va0 * va_multiplier;
21713
0
    const int32_t vacc1 = vbias + va1 * va_multiplier;
21714
0
    const int32_t vacc2 = vbias + va2 * va_multiplier;
21715
0
    const int32_t vacc3 = vbias + va3 * va_multiplier;
21716
0
    input_b += 4;
21717
21718
0
    int32_t vout0 = math_asr_s32(vacc0, vshift);
21719
0
    int32_t vout1 = math_asr_s32(vacc1, vshift);
21720
0
    int32_t vout2 = math_asr_s32(vacc2, vshift);
21721
0
    int32_t vout3 = math_asr_s32(vacc3, vshift);
21722
21723
0
    vout0 = math_max_s32(vout0, voutput_min_less_zero_point);
21724
0
    vout1 = math_max_s32(vout1, voutput_min_less_zero_point);
21725
0
    vout2 = math_max_s32(vout2, voutput_min_less_zero_point);
21726
0
    vout3 = math_max_s32(vout3, voutput_min_less_zero_point);
21727
21728
0
    vout0 = math_min_s32(vout0, voutput_max_less_zero_point);
21729
0
    vout1 = math_min_s32(vout1, voutput_max_less_zero_point);
21730
0
    vout2 = math_min_s32(vout2, voutput_max_less_zero_point);
21731
0
    vout3 = math_min_s32(vout3, voutput_max_less_zero_point);
21732
21733
0
    vout0 += voutput_zero_point;
21734
0
    vout1 += voutput_zero_point;
21735
0
    vout2 += voutput_zero_point;
21736
0
    vout3 += voutput_zero_point;
21737
21738
0
    output[0] = (int8_t) vout0;
21739
0
    output[1] = (int8_t) vout1;
21740
0
    output[2] = (int8_t) vout2;
21741
0
    output[3] = (int8_t) vout3;
21742
0
    output += 4;
21743
0
  }
21744
0
  if XNN_UNLIKELY(batch != 0) {
21745
0
    do {
21746
0
      const int32_t va = *input_a++;
21747
0
      const int32_t vacc = vbias + va * va_multiplier;
21748
21749
0
      int32_t vout = math_asr_s32(vacc, vshift);
21750
0
      vout = math_max_s32(vout, voutput_min_less_zero_point);
21751
0
      vout = math_min_s32(vout, voutput_max_less_zero_point);
21752
0
      *output++ = (int8_t) (vout + voutput_zero_point);
21753
21754
0
      batch -= sizeof(int8_t);
21755
0
    } while (batch != 0);
21756
0
  }
21757
0
}
21758
21759
void xnn_qs8_vcvt_ukernel__scalar_u1(
21760
    size_t batch,
21761
    const int8_t* input,
21762
    int8_t* output,
21763
    const union xnn_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
21764
0
{
21765
0
  assert(batch != 0);
21766
0
  assert(batch % sizeof(int8_t) == 0);
21767
0
  assert(input != NULL);
21768
0
  assert(output != NULL);
21769
21770
0
  const int32_t vbias = params->scalar.bias;
21771
0
  const int32_t vmultiplier = params->scalar.multiplier;
21772
0
  do {
21773
0
    int32_t vacc = *input++;
21774
0
    vacc = vbias + vacc * vmultiplier;
21775
21776
0
    int32_t vout = math_asr_s32(vacc, 8);
21777
0
    vout = math_max_s32(vout, -128);
21778
0
    vout = math_min_s32(vout, 127);
21779
0
    *output++ = (int8_t) vout;
21780
21781
0
    batch -= sizeof(int8_t);
21782
0
  } while (batch != 0);
21783
0
}
21784
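// Annotation (illustrative, not part of the generated scalar.c): the qs8 vcvt
// kernels above and below re-quantize int8 data to a new scale/zero point with
// a single fused multiply-add in the integer domain.  Since the product is
// shifted right by 8, the multiplier presumably carries 8 fractional bits of
// the scale ratio, and the bias folds in both zero points; the exact derivation
// lives in the parameter initialization, not in this file.  One element:
static inline int8_t qs8_cvt_element_sketch(int8_t x, int32_t vbias, int32_t vmultiplier) {
  const int32_t vacc = vbias + (int32_t) x * vmultiplier;
  int32_t vout = math_asr_s32(vacc, 8);
  vout = math_max_s32(vout, -128);
  vout = math_min_s32(vout, 127);
  return (int8_t) vout;
}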
21785
void xnn_qs8_vcvt_ukernel__scalar_u4(
21786
    size_t batch,
21787
    const int8_t* input,
21788
    int8_t* output,
21789
    const union xnn_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
21790
0
{
21791
0
  assert(batch != 0);
21792
0
  assert(batch % sizeof(int8_t) == 0);
21793
0
  assert(input != NULL);
21794
0
  assert(output != NULL);
21795
21796
0
  const int32_t vbias = params->scalar.bias;
21797
0
  const int32_t vmultiplier = params->scalar.multiplier;
21798
0
  for (; batch >= 4 * sizeof(int8_t); batch -= 4 * sizeof(int8_t)) {
21799
0
    int32_t vacc0 = input[0];
21800
0
    int32_t vacc1 = input[1];
21801
0
    int32_t vacc2 = input[2];
21802
0
    int32_t vacc3 = input[3];
21803
0
    input += 4;
21804
21805
0
    vacc0 = vbias + vacc0 * vmultiplier;
21806
0
    vacc1 = vbias + vacc1 * vmultiplier;
21807
0
    vacc2 = vbias + vacc2 * vmultiplier;
21808
0
    vacc3 = vbias + vacc3 * vmultiplier;
21809
21810
0
    int32_t vout0 = math_asr_s32(vacc0, 8);
21811
0
    int32_t vout1 = math_asr_s32(vacc1, 8);
21812
0
    int32_t vout2 = math_asr_s32(vacc2, 8);
21813
0
    int32_t vout3 = math_asr_s32(vacc3, 8);
21814
21815
0
    vout0 = math_max_s32(vout0, -128);
21816
0
    vout1 = math_max_s32(vout1, -128);
21817
0
    vout2 = math_max_s32(vout2, -128);
21818
0
    vout3 = math_max_s32(vout3, -128);
21819
21820
0
    vout0 = math_min_s32(vout0, 127);
21821
0
    vout1 = math_min_s32(vout1, 127);
21822
0
    vout2 = math_min_s32(vout2, 127);
21823
0
    vout3 = math_min_s32(vout3, 127);
21824
21825
0
    output[0] = (int8_t) vout0;
21826
0
    output[1] = (int8_t) vout1;
21827
0
    output[2] = (int8_t) vout2;
21828
0
    output[3] = (int8_t) vout3;
21829
0
    output += 4;
21830
0
  }
21831
0
  if XNN_UNLIKELY(batch != 0) {
21832
0
    do {
21833
0
      int32_t vacc = *input++;
21834
0
      vacc = vbias + vacc * vmultiplier;
21835
21836
0
      int32_t vout = math_asr_s32(vacc, 8);
21837
0
      vout = math_max_s32(vout, -128);
21838
0
      vout = math_min_s32(vout, 127);
21839
0
      *output++ = (int8_t) vout;
21840
21841
0
      batch -= sizeof(int8_t);
21842
0
    } while (batch != 0);
21843
0
  }
21844
0
}
21845
21846
void xnn_qs8_vlrelu_ukernel__scalar_andxor_u4(
21847
    size_t batch,
21848
    const int8_t* input,
21849
    int8_t* output,
21850
    const union xnn_qs8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)])
21851
0
{
21852
0
  assert(batch != 0);
21853
0
  assert(batch % sizeof(int8_t) == 0);
21854
0
  assert(input != NULL);
21855
0
  assert(output != NULL);
21856
21857
0
  const int32_t vinput_zero_point = params->scalar_andxor.input_zero_point;
21858
0
  const int32_t vmultiplier_diff = params->scalar_andxor.multiplier_diff;
21859
0
  const int32_t vmultiplier_base = params->scalar_andxor.multiplier_base;
21860
0
  const int32_t vbias = params->scalar_andxor.bias;
21861
0
  for (; batch >= 4 * sizeof(int8_t); batch -= 4 * sizeof(int8_t)) {
21862
0
    int32_t vacc0 = (int32_t) input[0];
21863
0
    int32_t vacc1 = (int32_t) input[1];
21864
0
    int32_t vacc2 = (int32_t) input[2];
21865
0
    int32_t vacc3 = (int32_t) input[3];
21866
0
    input += 4;
21867
21868
0
    vacc0 -= vinput_zero_point;
21869
0
    vacc1 -= vinput_zero_point;
21870
0
    vacc2 -= vinput_zero_point;
21871
0
    vacc3 -= vinput_zero_point;
21872
21873
0
    int32_t vmultiplier0 = math_asr_s32(vacc0, 31);
21874
0
    int32_t vmultiplier1 = math_asr_s32(vacc1, 31);
21875
0
    int32_t vmultiplier2 = math_asr_s32(vacc2, 31);
21876
0
    int32_t vmultiplier3 = math_asr_s32(vacc3, 31);
21877
21878
0
    vmultiplier0 &= vmultiplier_diff;
21879
0
    vmultiplier1 &= vmultiplier_diff;
21880
0
    vmultiplier2 &= vmultiplier_diff;
21881
0
    vmultiplier3 &= vmultiplier_diff;
21882
21883
0
    vmultiplier0 ^= vmultiplier_base;
21884
0
    vmultiplier1 ^= vmultiplier_base;
21885
0
    vmultiplier2 ^= vmultiplier_base;
21886
0
    vmultiplier3 ^= vmultiplier_base;
21887
21888
0
    vacc0 = vbias + vacc0 * vmultiplier0;
21889
0
    vacc1 = vbias + vacc1 * vmultiplier1;
21890
0
    vacc2 = vbias + vacc2 * vmultiplier2;
21891
0
    vacc3 = vbias + vacc3 * vmultiplier3;
21892
21893
0
    int32_t vout0 = math_asr_s32(vacc0, 8);
21894
0
    int32_t vout1 = math_asr_s32(vacc1, 8);
21895
0
    int32_t vout2 = math_asr_s32(vacc2, 8);
21896
0
    int32_t vout3 = math_asr_s32(vacc3, 8);
21897
21898
0
    vout0 = math_max_s32(vout0, -128);
21899
0
    vout1 = math_max_s32(vout1, -128);
21900
0
    vout2 = math_max_s32(vout2, -128);
21901
0
    vout3 = math_max_s32(vout3, -128);
21902
21903
0
    vout0 = math_min_s32(vout0, 127);
21904
0
    vout1 = math_min_s32(vout1, 127);
21905
0
    vout2 = math_min_s32(vout2, 127);
21906
0
    vout3 = math_min_s32(vout3, 127);
21907
21908
0
    output[0] = (int8_t) vout0;
21909
0
    output[1] = (int8_t) vout1;
21910
0
    output[2] = (int8_t) vout2;
21911
0
    output[3] = (int8_t) vout3;
21912
0
    output += 4;
21913
0
  }
21914
0
  if XNN_UNLIKELY(batch != 0) {
21915
0
    do {
21916
0
      int32_t vacc = (int32_t) *input++ - vinput_zero_point;
21917
0
      const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & math_asr_s32(vacc, 31));
21918
0
      vacc = vbias + vacc * vmultiplier;
21919
21920
0
      int32_t vout = math_asr_s32(vacc, 8);
21921
0
      vout = math_max_s32(vout, -128);
21922
0
      vout = math_min_s32(vout, 127);
21923
0
      *output++ = (int8_t) vout;
21924
21925
0
      batch -= sizeof(int8_t);
21926
0
    } while (batch != 0);
21927
0
  }
21928
0
}
21929
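// Annotation (illustrative, not part of the generated scalar.c): the "andxor"
// leaky-ReLU variant above picks the slope without a branch.  `asr(vacc, 31)`
// yields 0 for non-negative inputs and all-ones for negative ones, so masking
// `multiplier_diff` with it and XOR-ing the result into `multiplier_base`
// selects `multiplier_base` for non-negative inputs and
// `multiplier_base ^ multiplier_diff` for negative ones:
static inline int32_t qs8_lrelu_andxor_multiplier_sketch(
    int32_t vacc, int32_t vmultiplier_base, int32_t vmultiplier_diff)
{
  const int32_t vsign = math_asr_s32(vacc, 31);          // 0 or 0xFFFFFFFF
  return vmultiplier_base ^ (vmultiplier_diff & vsign);  // base, or base ^ diff
}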
21930
void xnn_qs8_vlrelu_ukernel__scalar_select_u4(
21931
    size_t batch,
21932
    const int8_t* input,
21933
    int8_t* output,
21934
    const union xnn_qs8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)])
21935
0
{
21936
0
  assert(batch != 0);
21937
0
  assert(batch % sizeof(int8_t) == 0);
21938
0
  assert(input != NULL);
21939
0
  assert(output != NULL);
21940
21941
0
  const int32_t vinput_zero_point = params->scalar_select.input_zero_point;
21942
0
  const int32_t vpositive_multiplier = params->scalar_select.positive_multiplier;
21943
0
  const int32_t vnegative_multiplier = params->scalar_select.negative_multiplier;
21944
0
  const int32_t vbias = params->scalar_select.bias;
21945
0
  for (; batch >= 4 * sizeof(int8_t); batch -= 4 * sizeof(int8_t)) {
21946
0
    int32_t vacc0 = (int32_t) input[0];
21947
0
    int32_t vacc1 = (int32_t) input[1];
21948
0
    int32_t vacc2 = (int32_t) input[2];
21949
0
    int32_t vacc3 = (int32_t) input[3];
21950
0
    input += 4;
21951
21952
0
    vacc0 -= vinput_zero_point;
21953
0
    vacc1 -= vinput_zero_point;
21954
0
    vacc2 -= vinput_zero_point;
21955
0
    vacc3 -= vinput_zero_point;
21956
21957
0
    const int32_t vmultiplier0 = XNN_UNPREDICTABLE(vacc0 >= 0) ? vpositive_multiplier : vnegative_multiplier;
21958
0
    const int32_t vmultiplier1 = XNN_UNPREDICTABLE(vacc1 >= 0) ? vpositive_multiplier : vnegative_multiplier;
21959
0
    const int32_t vmultiplier2 = XNN_UNPREDICTABLE(vacc2 >= 0) ? vpositive_multiplier : vnegative_multiplier;
21960
0
    const int32_t vmultiplier3 = XNN_UNPREDICTABLE(vacc3 >= 0) ? vpositive_multiplier : vnegative_multiplier;
21961
21962
0
    vacc0 = vbias + vacc0 * vmultiplier0;
21963
0
    vacc1 = vbias + vacc1 * vmultiplier1;
21964
0
    vacc2 = vbias + vacc2 * vmultiplier2;
21965
0
    vacc3 = vbias + vacc3 * vmultiplier3;
21966
21967
0
    int32_t vout0 = math_asr_s32(vacc0, 8);
21968
0
    int32_t vout1 = math_asr_s32(vacc1, 8);
21969
0
    int32_t vout2 = math_asr_s32(vacc2, 8);
21970
0
    int32_t vout3 = math_asr_s32(vacc3, 8);
21971
21972
0
    vout0 = math_max_s32(vout0, -128);
21973
0
    vout1 = math_max_s32(vout1, -128);
21974
0
    vout2 = math_max_s32(vout2, -128);
21975
0
    vout3 = math_max_s32(vout3, -128);
21976
21977
0
    vout0 = math_min_s32(vout0, 127);
21978
0
    vout1 = math_min_s32(vout1, 127);
21979
0
    vout2 = math_min_s32(vout2, 127);
21980
0
    vout3 = math_min_s32(vout3, 127);
21981
21982
0
    output[0] = (int8_t) vout0;
21983
0
    output[1] = (int8_t) vout1;
21984
0
    output[2] = (int8_t) vout2;
21985
0
    output[3] = (int8_t) vout3;
21986
0
    output += 4;
21987
0
  }
21988
0
  if XNN_UNLIKELY(batch != 0) {
21989
0
    do {
21990
0
      int32_t vacc = (int32_t) *input++ - vinput_zero_point;
21991
0
      const int32_t vmultiplier = XNN_UNPREDICTABLE(vacc >= 0) ? vpositive_multiplier : vnegative_multiplier;
21992
0
      vacc = vbias + vacc * vmultiplier;
21993
21994
0
      int32_t vout = math_asr_s32(vacc, 8);
21995
0
      vout = math_max_s32(vout, -128);
21996
0
      vout = math_min_s32(vout, 127);
21997
0
      *output++ = (int8_t) vout;
21998
21999
0
      batch -= sizeof(int8_t);
22000
0
    } while (batch != 0);
22001
0
  }
22002
0
}
22003
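// Annotation (illustrative, not part of the generated scalar.c): the "select"
// leaky-ReLU variant above is the conditional counterpart of the "andxor"
// kernel: it keeps both multipliers in the parameter block and chooses one per
// element with a ternary.  XNN_UNPREDICTABLE marks the sign test as
// data-dependent, nudging the compiler toward a conditional move rather than a
// branch; past the multiplier selection, the fixed-point rescale and clamp are
// identical to the andxor variant.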
22004
void xnn_qs8_vmul_minmax_fp32_ukernel__scalar_u4(
22005
    size_t batch,
22006
    const int8_t* input_a,
22007
    const int8_t* input_b,
22008
    int8_t* output,
22009
    const union xnn_qs8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
22010
0
{
22011
0
  assert(batch != 0);
22012
0
  assert(batch % sizeof(int8_t) == 0);
22013
0
  assert(input_a != NULL);
22014
0
  assert(input_b != NULL);
22015
0
  assert(output != NULL);
22016
22017
0
  const int32_t va_zero_point = params->fp32_scalar.a_zero_point;
22018
0
  const int32_t vb_zero_point = params->fp32_scalar.b_zero_point;
22019
0
  const float vscale = params->fp32_scalar.scale;
22020
0
  const float voutput_min_less_zero_point = params->fp32_scalar.output_min_less_zero_point;
22021
0
  const float voutput_max_less_zero_point = params->fp32_scalar.output_max_less_zero_point;
22022
0
  const float vmagic_bias = params->fp32_scalar.magic_bias;
22023
0
  const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar.magic_bias_less_output_zero_point;
22024
22025
0
  for (; batch >= 4 * sizeof(int8_t); batch -= 4 * sizeof(int8_t)) {
22026
0
    const int32_t va0 = input_a[0] - va_zero_point;
22027
0
    const int32_t va1 = input_a[1] - va_zero_point;
22028
0
    const int32_t va2 = input_a[2] - va_zero_point;
22029
0
    const int32_t va3 = input_a[3] - va_zero_point;
22030
0
    input_a += 4;
22031
22032
0
    const int32_t vb0 = input_b[0] - vb_zero_point;
22033
0
    const int32_t vb1 = input_b[1] - vb_zero_point;
22034
0
    const int32_t vb2 = input_b[2] - vb_zero_point;
22035
0
    const int32_t vb3 = input_b[3] - vb_zero_point;
22036
0
    input_b += 4;
22037
22038
0
    const int32_t vacc0 = va0 * vb0;
22039
0
    const int32_t vacc1 = va1 * vb1;
22040
0
    const int32_t vacc2 = va2 * vb2;
22041
0
    const int32_t vacc3 = va3 * vb3;
22042
22043
0
    float vfpacc0 = (float) vacc0 * vscale;
22044
0
    float vfpacc1 = (float) vacc1 * vscale;
22045
0
    float vfpacc2 = (float) vacc2 * vscale;
22046
0
    float vfpacc3 = (float) vacc3 * vscale;
22047
22048
0
    vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point);
22049
0
    vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point);
22050
0
    vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point);
22051
0
    vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point);
22052
22053
0
    vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point);
22054
0
    vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point);
22055
0
    vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point);
22056
0
    vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point);
22057
22058
0
    vfpacc0 += vmagic_bias;
22059
0
    vfpacc1 += vmagic_bias;
22060
0
    vfpacc2 += vmagic_bias;
22061
0
    vfpacc3 += vmagic_bias;
22062
22063
0
    const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point;
22064
0
    const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point;
22065
0
    const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point;
22066
0
    const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point;
22067
22068
0
    output[0] = (int8_t) vout0;
22069
0
    output[1] = (int8_t) vout1;
22070
0
    output[2] = (int8_t) vout2;
22071
0
    output[3] = (int8_t) vout3;
22072
0
    output += 4;
22073
0
  }
22074
0
  if XNN_UNLIKELY(batch != 0) {
22075
0
    do {
22076
0
      const int32_t va = (int32_t) *input_a++ - va_zero_point;
22077
0
      const int32_t vb = (int32_t) *input_b++ - vb_zero_point;
22078
0
      const int32_t vacc = va * vb;
22079
22080
0
      float vfpacc = (float) vacc * vscale;
22081
0
      vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
22082
0
      vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
22083
0
      vfpacc += vmagic_bias;
22084
0
      const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point;
22085
0
      *output++ = (int8_t) vout;
22086
22087
0
      batch -= sizeof(int8_t);
22088
0
    } while (batch != 0);
22089
0
  }
22090
0
}
22091
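// Annotation (illustrative, not part of the generated scalar.c): quantized
// multiplication above works on zero-point-corrected operands, so a single
// int32 product per element is exact; only the final rescale goes through
// float.  Rounding again uses the magic-bias trick, with the clamp applied in
// the float domain before the bias is added.  One element, with the parameter
// fields passed in explicitly:
static inline int8_t qs8_mul_element_sketch(
    int8_t a, int8_t b,
    int32_t va_zero_point, int32_t vb_zero_point, float vscale,
    float voutput_min_less_zero_point, float voutput_max_less_zero_point,
    float vmagic_bias, int32_t vmagic_bias_less_output_zero_point)
{
  const int32_t vacc = ((int32_t) a - va_zero_point) * ((int32_t) b - vb_zero_point);
  float vfpacc = (float) vacc * vscale;
  vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
  vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
  vfpacc += vmagic_bias;
  return (int8_t) ((int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point);
}
// The vmulc kernel below applies the same sequence with the broadcast operand's
// zero-point correction hoisted out of the element loop.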
22092
void xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_u4(
22093
    size_t batch,
22094
    const int8_t* input_a,
22095
    const int8_t* input_b,
22096
    int8_t* output,
22097
    const union xnn_qs8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
22098
0
{
22099
0
  assert(batch != 0);
22100
0
  assert(batch % sizeof(int8_t) == 0);
22101
0
  assert(input_a != NULL);
22102
0
  assert(input_b != NULL);
22103
0
  assert(output != NULL);
22104
22105
0
  const int32_t va_zero_point = params->fp32_scalar.a_zero_point;
22106
0
  const float vscale = params->fp32_scalar.scale;
22107
0
  const float voutput_min_less_zero_point = params->fp32_scalar.output_min_less_zero_point;
22108
0
  const float voutput_max_less_zero_point = params->fp32_scalar.output_max_less_zero_point;
22109
0
  const float vmagic_bias = params->fp32_scalar.magic_bias;
22110
0
  const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar.magic_bias_less_output_zero_point;
22111
22112
0
  const int32_t vb = (int32_t) *input_b - params->fp32_scalar.b_zero_point;
22113
0
  for (; batch >= 4 * sizeof(int8_t); batch -= 4 * sizeof(int8_t)) {
22114
0
    const int32_t va0 = input_a[0] - va_zero_point;
22115
0
    const int32_t va1 = input_a[1] - va_zero_point;
22116
0
    const int32_t va2 = input_a[2] - va_zero_point;
22117
0
    const int32_t va3 = input_a[3] - va_zero_point;
22118
0
    input_a += 4;
22119
22120
0
    const int32_t vacc0 = va0 * vb;
22121
0
    const int32_t vacc1 = va1 * vb;
22122
0
    const int32_t vacc2 = va2 * vb;
22123
0
    const int32_t vacc3 = va3 * vb;
22124
22125
0
    float vfpacc0 = (float) vacc0 * vscale;
22126
0
    float vfpacc1 = (float) vacc1 * vscale;
22127
0
    float vfpacc2 = (float) vacc2 * vscale;
22128
0
    float vfpacc3 = (float) vacc3 * vscale;
22129
22130
0
    vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point);
22131
0
    vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point);
22132
0
    vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point);
22133
0
    vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point);
22134
22135
0
    vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point);
22136
0
    vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point);
22137
0
    vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point);
22138
0
    vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point);
22139
22140
0
    vfpacc0 += vmagic_bias;
22141
0
    vfpacc1 += vmagic_bias;
22142
0
    vfpacc2 += vmagic_bias;
22143
0
    vfpacc3 += vmagic_bias;
22144
22145
0
    const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point;
22146
0
    const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point;
22147
0
    const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point;
22148
0
    const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point;
22149
22150
0
    output[0] = (int8_t) vout0;
22151
0
    output[1] = (int8_t) vout1;
22152
0
    output[2] = (int8_t) vout2;
22153
0
    output[3] = (int8_t) vout3;
22154
0
    output += 4;
22155
0
  }
22156
0
  if XNN_UNLIKELY(batch != 0) {
22157
0
    do {
22158
0
      const int32_t va = (int32_t) *input_a++ - va_zero_point;
22159
0
      const int32_t vacc = va * vb;
22160
22161
0
      float vfpacc = (float) vacc * vscale;
22162
0
      vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
22163
0
      vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
22164
0
      vfpacc += vmagic_bias;
22165
0
      const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point;
22166
0
      *output++ = (int8_t) vout;
22167
22168
0
      batch -= sizeof(int8_t);
22169
0
    } while (batch != 0);
22170
0
  }
22171
0
}
22172
22173
void xnn_qu8_avgpool_minmax_fp32_ukernel_9p8x__scalar_imagic_c1(
22174
    size_t output_pixels,
22175
    size_t kernel_elements,
22176
    size_t channels,
22177
    const uint8_t** input,
22178
    size_t input_offset,
22179
    const uint8_t* zero,
22180
    int32_t* buffer,
22181
    uint8_t* output,
22182
    size_t input_increment,
22183
    size_t output_increment,
22184
    const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
22185
0
{
22186
0
  assert(output_pixels != 0);
22187
0
  assert(kernel_elements > 9);
22188
0
  assert(channels != 0);
22189
22190
0
  const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
22191
0
  const float vscale = params->fp32_scalar_imagic.scale;
22192
0
  const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
22193
0
  const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
22194
0
  const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
22195
0
  const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
22196
0
  do {
22197
    // First pass.
22198
0
    {
22199
0
      const uint8_t* i0 = *input++;
22200
0
      assert(i0 != NULL);
22201
0
      if XNN_UNPREDICTABLE(i0 != zero) {
22202
0
        i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
22203
0
      }
22204
0
      const uint8_t* i1 = *input++;
22205
0
      assert(i1 != NULL);
22206
0
      if XNN_UNPREDICTABLE(i1 != zero) {
22207
0
        i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
22208
0
      }
22209
0
      const uint8_t* i2 = *input++;
22210
0
      assert(i2 != NULL);
22211
0
      if XNN_UNPREDICTABLE(i2 != zero) {
22212
0
        i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
22213
0
      }
22214
0
      const uint8_t* i3 = *input++;
22215
0
      assert(i3 != NULL);
22216
0
      if XNN_UNPREDICTABLE(i3 != zero) {
22217
0
        i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
22218
0
      }
22219
0
      const uint8_t* i4 = *input++;
22220
0
      assert(i4 != NULL);
22221
0
      if XNN_UNPREDICTABLE(i4 != zero) {
22222
0
        i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
22223
0
      }
22224
0
      const uint8_t* i5 = *input++;
22225
0
      assert(i5 != NULL);
22226
0
      if XNN_UNPREDICTABLE(i5 != zero) {
22227
0
        i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
22228
0
      }
22229
0
      const uint8_t* i6 = *input++;
22230
0
      assert(i6 != NULL);
22231
0
      if XNN_UNPREDICTABLE(i6 != zero) {
22232
0
        i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
22233
0
      }
22234
0
      const uint8_t* i7 = *input++;
22235
0
      assert(i7 != NULL);
22236
0
      if XNN_UNPREDICTABLE(i7 != zero) {
22237
0
        i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
22238
0
      }
22239
0
      const uint8_t* i8 = *input++;
22240
0
      assert(i8 != NULL);
22241
0
      if XNN_UNPREDICTABLE(i8 != zero) {
22242
0
        i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
22243
0
      }
22244
22245
0
      int32_t* b = buffer;
22246
0
      size_t c = channels;
22247
0
      do {
22248
0
        int32_t vacc = vinit_bias;
22249
22250
0
        const int32_t vi0 = (int32_t) (uint32_t) *i0++;
22251
0
        vacc += vi0;
22252
0
        const int32_t vi1 = (int32_t) (uint32_t) *i1++;
22253
0
        vacc += vi1;
22254
0
        const int32_t vi2 = (int32_t) (uint32_t) *i2++;
22255
0
        vacc += vi2;
22256
0
        const int32_t vi3 = (int32_t) (uint32_t) *i3++;
22257
0
        vacc += vi3;
22258
0
        const int32_t vi4 = (int32_t) (uint32_t) *i4++;
22259
0
        vacc += vi4;
22260
0
        const int32_t vi5 = (int32_t) (uint32_t) *i5++;
22261
0
        vacc += vi5;
22262
0
        const int32_t vi6 = (int32_t) (uint32_t) *i6++;
22263
0
        vacc += vi6;
22264
0
        const int32_t vi7 = (int32_t) (uint32_t) *i7++;
22265
0
        vacc += vi7;
22266
0
        const int32_t vi8 = (int32_t) (uint32_t) *i8++;
22267
0
        vacc += vi8;
22268
22269
0
        *b++ = vacc;
22270
0
      } while (--c != 0);
22271
0
    }
22272
22273
0
    size_t k = kernel_elements;
22274
    // Intermediate passes.
22275
0
    for (k -= 9; k > 8; k -= 8) {
22276
0
      const uint8_t* i0 = *input++;
22277
0
      assert(i0 != NULL);
22278
0
      if XNN_UNPREDICTABLE(i0 != zero) {
22279
0
        i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
22280
0
      }
22281
0
      const uint8_t* i1 = *input++;
22282
0
      assert(i1 != NULL);
22283
0
      if XNN_UNPREDICTABLE(i1 != zero) {
22284
0
        i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
22285
0
      }
22286
0
      const uint8_t* i2 = *input++;
22287
0
      assert(i2 != NULL);
22288
0
      if XNN_UNPREDICTABLE(i2 != zero) {
22289
0
        i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
22290
0
      }
22291
0
      const uint8_t* i3 = *input++;
22292
0
      assert(i3 != NULL);
22293
0
      if XNN_UNPREDICTABLE(i3 != zero) {
22294
0
        i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
22295
0
      }
22296
0
      const uint8_t* i4 = *input++;
22297
0
      assert(i4 != NULL);
22298
0
      if XNN_UNPREDICTABLE(i4 != zero) {
22299
0
        i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
22300
0
      }
22301
0
      const uint8_t* i5 = *input++;
22302
0
      assert(i5 != NULL);
22303
0
      if XNN_UNPREDICTABLE(i5 != zero) {
22304
0
        i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
22305
0
      }
22306
0
      const uint8_t* i6 = *input++;
22307
0
      assert(i6 != NULL);
22308
0
      if XNN_UNPREDICTABLE(i6 != zero) {
22309
0
        i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
22310
0
      }
22311
0
      const uint8_t* i7 = *input++;
22312
0
      assert(i7 != NULL);
22313
0
      if XNN_UNPREDICTABLE(i7 != zero) {
22314
0
        i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
22315
0
      }
22316
22317
0
      int32_t* b = buffer;
22318
0
      size_t c = channels;
22319
0
      do {
22320
0
        int32_t vacc = *b;
22321
22322
0
        const int32_t vi0 = (int32_t) (uint32_t) *i0++;
22323
0
        vacc += vi0;
22324
0
        const int32_t vi1 = (int32_t) (uint32_t) *i1++;
22325
0
        vacc += vi1;
22326
0
        const int32_t vi2 = (int32_t) (uint32_t) *i2++;
22327
0
        vacc += vi2;
22328
0
        const int32_t vi3 = (int32_t) (uint32_t) *i3++;
22329
0
        vacc += vi3;
22330
0
        const int32_t vi4 = (int32_t) (uint32_t) *i4++;
22331
0
        vacc += vi4;
22332
0
        const int32_t vi5 = (int32_t) (uint32_t) *i5++;
22333
0
        vacc += vi5;
22334
0
        const int32_t vi6 = (int32_t) (uint32_t) *i6++;
22335
0
        vacc += vi6;
22336
0
        const int32_t vi7 = (int32_t) (uint32_t) *i7++;
22337
0
        vacc += vi7;
22338
22339
0
        *b++ = vacc;
22340
0
      } while (--c != 0);
22341
0
    }
22342
22343
    // Last pass.
22344
0
    {
22345
0
      const uint8_t* i0 = input[0];
22346
0
      assert(i0 != NULL);
22347
0
      const uint8_t* i1 = input[1];
22348
0
      const uint8_t* i2 = input[2];
22349
0
      const uint8_t* i3 = input[3];
22350
0
      const uint8_t* i4 = input[4];
22351
0
      const uint8_t* i5 = input[5];
22352
0
      const uint8_t* i6 = input[6];
22353
0
      const uint8_t* i7 = input[7];
22354
0
      input = (const uint8_t**) ((uintptr_t) input + input_increment);
22355
0
      if (k < 2) {
22356
0
        i1 = zero;
22357
0
      }
22358
0
      assert(i1 != NULL);
22359
0
      if (k <= 2) {
22360
0
        i2 = zero;
22361
0
      }
22362
0
      assert(i2 != NULL);
22363
0
      if (k < 4) {
22364
0
        i3 = zero;
22365
0
      }
22366
0
      assert(i3 != NULL);
22367
0
      if (k <= 4) {
22368
0
        i4 = zero;
22369
0
      }
22370
0
      assert(i4 != NULL);
22371
0
      if (k < 6) {
22372
0
        i5 = zero;
22373
0
      }
22374
0
      assert(i5 != NULL);
22375
0
      if (k <= 6) {
22376
0
        i6 = zero;
22377
0
      }
22378
0
      assert(i6 != NULL);
22379
0
      if (k < 8) {
22380
0
        i7 = zero;
22381
0
      }
22382
0
      assert(i7 != NULL);
22383
0
      if XNN_UNPREDICTABLE(i0 != zero) {
22384
0
        i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
22385
0
      }
22386
0
      if XNN_UNPREDICTABLE(i1 != zero) {
22387
0
        i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
22388
0
      }
22389
0
      if XNN_UNPREDICTABLE(i2 != zero) {
22390
0
        i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
22391
0
      }
22392
0
      if XNN_UNPREDICTABLE(i3 != zero) {
22393
0
        i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
22394
0
      }
22395
0
      if XNN_UNPREDICTABLE(i4 != zero) {
22396
0
        i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
22397
0
      }
22398
0
      if XNN_UNPREDICTABLE(i5 != zero) {
22399
0
        i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
22400
0
      }
22401
0
      if XNN_UNPREDICTABLE(i6 != zero) {
22402
0
        i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
22403
0
      }
22404
0
      if XNN_UNPREDICTABLE(i7 != zero) {
22405
0
        i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
22406
0
      }
22407
22408
0
      size_t c = channels;
22409
0
      int32_t* b = buffer;
22410
0
      do {
22411
0
        int32_t vacc = *b++;
22412
22413
0
        const int32_t vi0 = (int32_t) (uint32_t) *i0++;
22414
0
        vacc += vi0;
22415
0
        const int32_t vi1 = (int32_t) (uint32_t) *i1++;
22416
0
        vacc += vi1;
22417
0
        const int32_t vi2 = (int32_t) (uint32_t) *i2++;
22418
0
        vacc += vi2;
22419
0
        const int32_t vi3 = (int32_t) (uint32_t) *i3++;
22420
0
        vacc += vi3;
22421
0
        const int32_t vi4 = (int32_t) (uint32_t) *i4++;
22422
0
        vacc += vi4;
22423
0
        const int32_t vi5 = (int32_t) (uint32_t) *i5++;
22424
0
        vacc += vi5;
22425
0
        const int32_t vi6 = (int32_t) (uint32_t) *i6++;
22426
0
        vacc += vi6;
22427
0
        const int32_t vi7 = (int32_t) (uint32_t) *i7++;
22428
0
        vacc += vi7;
22429
22430
0
        float vfpacc = (float) vacc * vscale;
22431
0
        vfpacc += vmagic_bias;
22432
0
        int32_t vout = (int32_t) float_as_uint32(vfpacc);
22433
0
        vout = math_max_s32(vout, vmagic_min);
22434
0
        vout = math_min_s32(vout, vmagic_max);
22435
0
        vout -= vmagic_bias_less_zero_point;
22436
22437
0
        *output++ = (uint8_t) vout;
22438
0
      } while (--c != 0);
22439
0
    }
22440
0
    output = (uint8_t*) ((uintptr_t) output + output_increment);
22441
0
  } while (--output_pixels != 0);
22442
0
}
22443
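// Annotation (illustrative, not part of the generated scalar.c): the 9p8x
// average-pooling kernel above handles pooling windows larger than 9 elements
// in several passes over an int32 scratch buffer.  The first pass sums 9 rows
// per channel, seeded with init_bias; each intermediate pass adds 8 more rows;
// and the last pass adds the remaining 1-8 rows (pointers for unused taps are
// redirected to `zero`), then multiplies by the fp32 scale, which presumably
// already folds in the 1/pool_size factor since the kernel never divides by
// the window size, and finishes with the same imagic rounding/clamping used by
// the other *_imagic kernels.  The 9x kernel that follows is the single-pass
// specialization for windows of at most 9 elements.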
22444
void xnn_qu8_avgpool_minmax_fp32_ukernel_9x__scalar_imagic_c1(
22445
    size_t output_pixels,
22446
    size_t kernel_elements,
22447
    size_t channels,
22448
    const uint8_t** input,
22449
    size_t input_offset,
22450
    const uint8_t* zero,
22451
    uint8_t* output,
22452
    size_t input_increment,
22453
    size_t output_increment,
22454
    const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
22455
0
{
22456
0
  assert(output_pixels != 0);
22457
0
  assert(kernel_elements != 0);
22458
0
  assert(kernel_elements <= 9);
22459
0
  assert(channels != 0);
22460
22461
0
  const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
22462
0
  const float vscale = params->fp32_scalar_imagic.scale;
22463
0
  const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
22464
0
  const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
22465
0
  const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
22466
0
  const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
22467
0
  do {
22468
0
    const uint8_t* i0 = input[0];
22469
0
    assert(i0 != NULL);
22470
0
    const uint8_t* i1 = input[1];
22471
0
    const uint8_t* i2 = input[2];
22472
0
    const uint8_t* i3 = input[3];
22473
0
    const uint8_t* i4 = input[4];
22474
0
    const uint8_t* i5 = input[5];
22475
0
    const uint8_t* i6 = input[6];
22476
0
    const uint8_t* i7 = input[7];
22477
0
    const uint8_t* i8 = input[8];
22478
0
    input = (const uint8_t**) ((uintptr_t) input + input_increment);
22479
0
    if (kernel_elements < 2) {
22480
0
      i1 = zero;
22481
0
    }
22482
0
    assert(i1 != NULL);
22483
0
    if (kernel_elements <= 2) {
22484
0
      i2 = zero;
22485
0
    }
22486
0
    assert(i2 != NULL);
22487
0
    if (kernel_elements < 4) {
22488
0
      i3 = zero;
22489
0
    }
22490
0
    assert(i3 != NULL);
22491
0
    if (kernel_elements <= 4) {
22492
0
      i4 = zero;
22493
0
    }
22494
0
    assert(i4 != NULL);
22495
0
    if (kernel_elements < 6) {
22496
0
      i5 = zero;
22497
0
    }
22498
0
    assert(i5 != NULL);
22499
0
    if (kernel_elements <= 6) {
22500
0
      i6 = zero;
22501
0
    }
22502
0
    assert(i6 != NULL);
22503
0
    if (kernel_elements < 8) {
22504
0
      i7 = zero;
22505
0
    }
22506
0
    assert(i7 != NULL);
22507
0
    if (kernel_elements <= 8) {
22508
0
      i8 = zero;
22509
0
    }
22510
0
    assert(i8 != NULL);
22511
0
    if XNN_UNPREDICTABLE(i0 != zero) {
22512
0
      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
22513
0
    }
22514
0
    if XNN_UNPREDICTABLE(i1 != zero) {
22515
0
      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
22516
0
    }
22517
0
    if XNN_UNPREDICTABLE(i2 != zero) {
22518
0
      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
22519
0
    }
22520
0
    if XNN_UNPREDICTABLE(i3 != zero) {
22521
0
      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
22522
0
    }
22523
0
    if XNN_UNPREDICTABLE(i4 != zero) {
22524
0
      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
22525
0
    }
22526
0
    if XNN_UNPREDICTABLE(i5 != zero) {
22527
0
      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
22528
0
    }
22529
0
    if XNN_UNPREDICTABLE(i6 != zero) {
22530
0
      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
22531
0
    }
22532
0
    if XNN_UNPREDICTABLE(i7 != zero) {
22533
0
      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
22534
0
    }
22535
0
    if XNN_UNPREDICTABLE(i8 != zero) {
22536
0
      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
22537
0
    }
22538
22539
0
    size_t c = channels;
22540
0
    do {
22541
0
      int32_t vacc = vinit_bias;
22542
22543
0
      const int32_t vi0 = (int32_t) (uint32_t) *i0++;
22544
0
      vacc += vi0;
22545
0
      const int32_t vi1 = (int32_t) (uint32_t) *i1++;
22546
0
      vacc += vi1;
22547
0
      const int32_t vi2 = (int32_t) (uint32_t) *i2++;
22548
0
      vacc += vi2;
22549
0
      const int32_t vi3 = (int32_t) (uint32_t) *i3++;
22550
0
      vacc += vi3;
22551
0
      const int32_t vi4 = (int32_t) (uint32_t) *i4++;
22552
0
      vacc += vi4;
22553
0
      const int32_t vi5 = (int32_t) (uint32_t) *i5++;
22554
0
      vacc += vi5;
22555
0
      const int32_t vi6 = (int32_t) (uint32_t) *i6++;
22556
0
      vacc += vi6;
22557
0
      const int32_t vi7 = (int32_t) (uint32_t) *i7++;
22558
0
      vacc += vi7;
22559
0
      const int32_t vi8 = (int32_t) (uint32_t) *i8++;
22560
0
      vacc += vi8;
22561
22562
0
      float vfpacc = (float) vacc * vscale;
22563
0
      vfpacc += vmagic_bias;
22564
0
      int32_t vout = (int32_t) float_as_uint32(vfpacc);
22565
0
      vout = math_max_s32(vout, vmagic_min);
22566
0
      vout = math_min_s32(vout, vmagic_max);
22567
0
      vout -= vmagic_bias_less_zero_point;
22568
22569
0
      *output++ = (uint8_t) vout;
22570
0
    } while (--c != 0);
22571
0
    output = (uint8_t*) ((uintptr_t) output + output_increment);
22572
0
  } while (--output_pixels != 0);
22573
0
}
22574
22575
void xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic(
22576
    size_t channels,
22577
    size_t output_width,
22578
    const uint8_t** input,
22579
    const void* weights,
22580
    uint8_t* output,
22581
    intptr_t input_stride,
22582
    size_t output_increment,
22583
    size_t input_offset,
22584
    const uint8_t* zero,
22585
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
22586
0
{
22587
0
  assert(channels != 0);
22588
0
  assert(output_width != 0);
22589
22590
0
  const float vscale = params->fp32_scalar_fmagic.scale;
22591
0
  const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point;
22592
0
  const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point;
22593
0
  const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias;
22594
0
  const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point;
22595
0
  const int32_t vkernel_zero_point = params->fp32_scalar_fmagic.kernel_zero_point;
22596
0
  do {
22597
0
    const uint8_t* i0 = input[0];
22598
0
    assert(i0 != NULL);
22599
0
    if XNN_UNPREDICTABLE(i0 != zero) {
22600
0
      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
22601
0
    }
22602
0
    const uint8_t* i1 = input[1];
22603
0
    assert(i1 != NULL);
22604
0
    if XNN_UNPREDICTABLE(i1 != zero) {
22605
0
      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
22606
0
    }
22607
0
    const uint8_t* i2 = input[2];
22608
0
    assert(i2 != NULL);
22609
0
    if XNN_UNPREDICTABLE(i2 != zero) {
22610
0
      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
22611
0
    }
22612
0
    const uint8_t* i3 = input[3];
22613
0
    assert(i3 != NULL);
22614
0
    if XNN_UNPREDICTABLE(i3 != zero) {
22615
0
      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
22616
0
    }
22617
0
    const uint8_t* i4 = input[4];
22618
0
    assert(i4 != NULL);
22619
0
    if XNN_UNPREDICTABLE(i4 != zero) {
22620
0
      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
22621
0
    }
22622
0
    const uint8_t* i5 = input[5];
22623
0
    assert(i5 != NULL);
22624
0
    if XNN_UNPREDICTABLE(i5 != zero) {
22625
0
      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
22626
0
    }
22627
0
    const uint8_t* i6 = input[6];
22628
0
    assert(i6 != NULL);
22629
0
    if XNN_UNPREDICTABLE(i6 != zero) {
22630
0
      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
22631
0
    }
22632
0
    const uint8_t* i7 = input[7];
22633
0
    assert(i7 != NULL);
22634
0
    if XNN_UNPREDICTABLE(i7 != zero) {
22635
0
      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
22636
0
    }
22637
0
    const uint8_t* i8 = input[8];
22638
0
    assert(i8 != NULL);
22639
0
    if XNN_UNPREDICTABLE(i8 != zero) {
22640
0
      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
22641
0
    }
22642
0
    const uint8_t* i9 = input[9];
22643
0
    assert(i9 != NULL);
22644
0
    if XNN_UNPREDICTABLE(i9 != zero) {
22645
0
      i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset);
22646
0
    }
22647
0
    const uint8_t* i10 = input[10];
22648
0
    assert(i10 != NULL);
22649
0
    if XNN_UNPREDICTABLE(i10 != zero) {
22650
0
      i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset);
22651
0
    }
22652
0
    const uint8_t* i11 = input[11];
22653
0
    assert(i11 != NULL);
22654
0
    if XNN_UNPREDICTABLE(i11 != zero) {
22655
0
      i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset);
22656
0
    }
22657
0
    const uint8_t* i12 = input[12];
22658
0
    assert(i12 != NULL);
22659
0
    if XNN_UNPREDICTABLE(i12 != zero) {
22660
0
      i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset);
22661
0
    }
22662
0
    const uint8_t* i13 = input[13];
22663
0
    assert(i13 != NULL);
22664
0
    if XNN_UNPREDICTABLE(i13 != zero) {
22665
0
      i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset);
22666
0
    }
22667
0
    const uint8_t* i14 = input[14];
22668
0
    assert(i14 != NULL);
22669
0
    if XNN_UNPREDICTABLE(i14 != zero) {
22670
0
      i14 = (const uint8_t*) ((uintptr_t) i14 + input_offset);
22671
0
    }
22672
0
    const uint8_t* i15 = input[15];
22673
0
    assert(i15 != NULL);
22674
0
    if XNN_UNPREDICTABLE(i15 != zero) {
22675
0
      i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset);
22676
0
    }
22677
0
    const uint8_t* i16 = input[16];
22678
0
    assert(i16 != NULL);
22679
0
    if XNN_UNPREDICTABLE(i16 != zero) {
22680
0
      i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset);
22681
0
    }
22682
0
    const uint8_t* i17 = input[17];
22683
0
    assert(i17 != NULL);
22684
0
    if XNN_UNPREDICTABLE(i17 != zero) {
22685
0
      i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset);
22686
0
    }
22687
0
    const uint8_t* i18 = input[18];
22688
0
    assert(i18 != NULL);
22689
0
    if XNN_UNPREDICTABLE(i18 != zero) {
22690
0
      i18 = (const uint8_t*) ((uintptr_t) i18 + input_offset);
22691
0
    }
22692
0
    const uint8_t* i19 = input[19];
22693
0
    assert(i19 != NULL);
22694
0
    if XNN_UNPREDICTABLE(i19 != zero) {
22695
0
      i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset);
22696
0
    }
22697
0
    const uint8_t* i20 = input[20];
22698
0
    assert(i20 != NULL);
22699
0
    if XNN_UNPREDICTABLE(i20 != zero) {
22700
0
      i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset);
22701
0
    }
22702
0
    const uint8_t* i21 = input[21];
22703
0
    assert(i21 != NULL);
22704
0
    if XNN_UNPREDICTABLE(i21 != zero) {
22705
0
      i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset);
22706
0
    }
22707
0
    const uint8_t* i22 = input[22];
22708
0
    assert(i22 != NULL);
22709
0
    if XNN_UNPREDICTABLE(i22 != zero) {
22710
0
      i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset);
22711
0
    }
22712
0
    const uint8_t* i23 = input[23];
22713
0
    assert(i23 != NULL);
22714
0
    if XNN_UNPREDICTABLE(i23 != zero) {
22715
0
      i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset);
22716
0
    }
22717
0
    const uint8_t* i24 = input[24];
22718
0
    assert(i24 != NULL);
22719
0
    if XNN_UNPREDICTABLE(i24 != zero) {
22720
0
      i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset);
22721
0
    }
22722
0
    input = (const uint8_t**) ((uintptr_t) input + input_stride);
22723
22724
0
    size_t c = channels;
22725
0
    const void* w = weights;
22726
0
    do {
22727
0
      int32_t vacc = unaligned_load_s32(w);
22728
22729
0
      const int32_t vi0 = (int32_t) (uint32_t) *i0++;
22730
0
      const int32_t vk0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[0] - vkernel_zero_point;
22731
0
      vacc += vi0 * vk0;
22732
0
      const int32_t vi1 = (int32_t) (uint32_t) *i1++;
22733
0
      const int32_t vk1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[1] - vkernel_zero_point;
22734
0
      vacc += vi1 * vk1;
22735
0
      const int32_t vi2 = (int32_t) (uint32_t) *i2++;
22736
0
      const int32_t vk2 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[2] - vkernel_zero_point;
22737
0
      vacc += vi2 * vk2;
22738
0
      const int32_t vi3 = (int32_t) (uint32_t) *i3++;
22739
0
      const int32_t vk3 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[3] - vkernel_zero_point;
22740
0
      vacc += vi3 * vk3;
22741
0
      const int32_t vi4 = (int32_t) (uint32_t) *i4++;
22742
0
      const int32_t vk4 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[4] - vkernel_zero_point;
22743
0
      vacc += vi4 * vk4;
22744
0
      const int32_t vi5 = (int32_t) (uint32_t) *i5++;
22745
0
      const int32_t vk5 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[5] - vkernel_zero_point;
22746
0
      vacc += vi5 * vk5;
22747
0
      const int32_t vi6 = (int32_t) (uint32_t) *i6++;
22748
0
      const int32_t vk6 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[6] - vkernel_zero_point;
22749
0
      vacc += vi6 * vk6;
22750
0
      const int32_t vi7 = (int32_t) (uint32_t) *i7++;
22751
0
      const int32_t vk7 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[7] - vkernel_zero_point;
22752
0
      vacc += vi7 * vk7;
22753
0
      const int32_t vi8 = (int32_t) (uint32_t) *i8++;
22754
0
      const int32_t vk8 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[8] - vkernel_zero_point;
22755
0
      vacc += vi8 * vk8;
22756
0
      const int32_t vi9 = (int32_t) (uint32_t) *i9++;
22757
0
      const int32_t vk9 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[9] - vkernel_zero_point;
22758
0
      vacc += vi9 * vk9;
22759
0
      const int32_t vi10 = (int32_t) (uint32_t) *i10++;
22760
0
      const int32_t vk10 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[10] - vkernel_zero_point;
22761
0
      vacc += vi10 * vk10;
22762
0
      const int32_t vi11 = (int32_t) (uint32_t) *i11++;
22763
0
      const int32_t vk11 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[11] - vkernel_zero_point;
22764
0
      vacc += vi11 * vk11;
22765
0
      const int32_t vi12 = (int32_t) (uint32_t) *i12++;
22766
0
      const int32_t vk12 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[12] - vkernel_zero_point;
22767
0
      vacc += vi12 * vk12;
22768
0
      const int32_t vi13 = (int32_t) (uint32_t) *i13++;
22769
0
      const int32_t vk13 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[13] - vkernel_zero_point;
22770
0
      vacc += vi13 * vk13;
22771
0
      const int32_t vi14 = (int32_t) (uint32_t) *i14++;
22772
0
      const int32_t vk14 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[14] - vkernel_zero_point;
22773
0
      vacc += vi14 * vk14;
22774
0
      const int32_t vi15 = (int32_t) (uint32_t) *i15++;
22775
0
      const int32_t vk15 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[15] - vkernel_zero_point;
22776
0
      vacc += vi15 * vk15;
22777
0
      const int32_t vi16 = (int32_t) (uint32_t) *i16++;
22778
0
      const int32_t vk16 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[16] - vkernel_zero_point;
22779
0
      vacc += vi16 * vk16;
22780
0
      const int32_t vi17 = (int32_t) (uint32_t) *i17++;
22781
0
      const int32_t vk17 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[17] - vkernel_zero_point;
22782
0
      vacc += vi17 * vk17;
22783
0
      const int32_t vi18 = (int32_t) (uint32_t) *i18++;
22784
0
      const int32_t vk18 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[18] - vkernel_zero_point;
22785
0
      vacc += vi18 * vk18;
22786
0
      const int32_t vi19 = (int32_t) (uint32_t) *i19++;
22787
0
      const int32_t vk19 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[19] - vkernel_zero_point;
22788
0
      vacc += vi19 * vk19;
22789
0
      const int32_t vi20 = (int32_t) (uint32_t) *i20++;
22790
0
      const int32_t vk20 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[20] - vkernel_zero_point;
22791
0
      vacc += vi20 * vk20;
22792
0
      const int32_t vi21 = (int32_t) (uint32_t) *i21++;
22793
0
      const int32_t vk21 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[21] - vkernel_zero_point;
22794
0
      vacc += vi21 * vk21;
22795
0
      const int32_t vi22 = (int32_t) (uint32_t) *i22++;
22796
0
      const int32_t vk22 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[22] - vkernel_zero_point;
22797
0
      vacc += vi22 * vk22;
22798
0
      const int32_t vi23 = (int32_t) (uint32_t) *i23++;
22799
0
      const int32_t vk23 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[23] - vkernel_zero_point;
22800
0
      vacc += vi23 * vk23;
22801
0
      const int32_t vi24 = (int32_t) (uint32_t) *i24++;
22802
0
      const int32_t vk24 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[24] - vkernel_zero_point;
22803
0
      vacc += vi24 * vk24;
22804
22805
0
      w = (const void*) ((uintptr_t) w + sizeof(int32_t) + 25 * sizeof(uint8_t));
22806
22807
0
      float vfpacc = (float) vacc * vscale;
22808
22809
0
      vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
22810
0
      vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
22811
0
      vfpacc += vmagic_bias;
22812
0
      int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point;
22813
22814
0
      *output++ = (uint8_t) vout;
22815
0
    } while (--c != 0);
22816
22817
0
    output = (uint8_t*) ((uintptr_t) output + output_increment);
22818
0
  } while (--output_width != 0);
22819
0
}
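// The "fmagic" requantization used above differs from the "imagic" variant in
// where the clamp happens: the accumulator is clamped while still a float
// (against the output min/max less the zero point), then the magic-bias trick
// converts it to an integer and folds in the output zero point with a single
// subtraction. A minimal sketch of that step, assuming a uint8 output with
// zero point 128 and the full [0, 255] range; the constants and sketch_*
// helpers are illustrative, the real kernel takes them from its params struct.
#include <stdint.h>
#include <string.h>

static inline uint32_t sketch_float_as_uint32(float f) {
  uint32_t u;
  memcpy(&u, &f, sizeof(u));  // same bit-reinterpretation helper as above
  return u;
}

static inline uint8_t sketch_requantize_fmagic(int32_t acc, float scale) {
  const int32_t zero_point = 128;                                        // illustrative
  const float output_min_less_zero_point = 0.0f - (float) zero_point;    // -128.0f
  const float output_max_less_zero_point = 255.0f - (float) zero_point;  //  127.0f
  const float magic_bias = 12582912.0f;                                  // 1.5 * 2**23
  const int32_t magic_bias_less_output_zero_point =
      (int32_t) sketch_float_as_uint32(magic_bias) - zero_point;

  float fpacc = (float) acc * scale;
  // Clamp in the float domain, before the magic bias is applied.
  fpacc = fpacc < output_min_less_zero_point ? output_min_less_zero_point : fpacc;
  fpacc = fpacc > output_max_less_zero_point ? output_max_less_zero_point : fpacc;
  fpacc += magic_bias;  // low mantissa bits now hold round(acc * scale)
  return (uint8_t) ((int32_t) sketch_float_as_uint32(fpacc) -
                    magic_bias_less_output_zero_point);
}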
22820
22821
void xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic(
22822
    size_t channels,
22823
    size_t output_width,
22824
    const uint8_t** input,
22825
    const void* weights,
22826
    uint8_t* output,
22827
    intptr_t input_stride,
22828
    size_t output_increment,
22829
    size_t input_offset,
22830
    const uint8_t* zero,
22831
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
22832
0
{
22833
0
  assert(channels != 0);
22834
0
  assert(output_width != 0);
22835
22836
0
  const float vscale = params->fp32_scalar_imagic.scale;
22837
0
  const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
22838
0
  const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
22839
0
  const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
22840
0
  const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
22841
0
  const int32_t vkernel_zero_point = params->fp32_scalar_imagic.kernel_zero_point;
22842
0
  do {
22843
0
    const uint8_t* i0 = input[0];
22844
0
    assert(i0 != NULL);
22845
0
    if XNN_UNPREDICTABLE(i0 != zero) {
22846
0
      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
22847
0
    }
22848
0
    const uint8_t* i1 = input[1];
22849
0
    assert(i1 != NULL);
22850
0
    if XNN_UNPREDICTABLE(i1 != zero) {
22851
0
      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
22852
0
    }
22853
0
    const uint8_t* i2 = input[2];
22854
0
    assert(i2 != NULL);
22855
0
    if XNN_UNPREDICTABLE(i2 != zero) {
22856
0
      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
22857
0
    }
22858
0
    const uint8_t* i3 = input[3];
22859
0
    assert(i3 != NULL);
22860
0
    if XNN_UNPREDICTABLE(i3 != zero) {
22861
0
      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
22862
0
    }
22863
0
    const uint8_t* i4 = input[4];
22864
0
    assert(i4 != NULL);
22865
0
    if XNN_UNPREDICTABLE(i4 != zero) {
22866
0
      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
22867
0
    }
22868
0
    const uint8_t* i5 = input[5];
22869
0
    assert(i5 != NULL);
22870
0
    if XNN_UNPREDICTABLE(i5 != zero) {
22871
0
      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
22872
0
    }
22873
0
    const uint8_t* i6 = input[6];
22874
0
    assert(i6 != NULL);
22875
0
    if XNN_UNPREDICTABLE(i6 != zero) {
22876
0
      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
22877
0
    }
22878
0
    const uint8_t* i7 = input[7];
22879
0
    assert(i7 != NULL);
22880
0
    if XNN_UNPREDICTABLE(i7 != zero) {
22881
0
      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
22882
0
    }
22883
0
    const uint8_t* i8 = input[8];
22884
0
    assert(i8 != NULL);
22885
0
    if XNN_UNPREDICTABLE(i8 != zero) {
22886
0
      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
22887
0
    }
22888
0
    const uint8_t* i9 = input[9];
22889
0
    assert(i9 != NULL);
22890
0
    if XNN_UNPREDICTABLE(i9 != zero) {
22891
0
      i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset);
22892
0
    }
22893
0
    const uint8_t* i10 = input[10];
22894
0
    assert(i10 != NULL);
22895
0
    if XNN_UNPREDICTABLE(i10 != zero) {
22896
0
      i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset);
22897
0
    }
22898
0
    const uint8_t* i11 = input[11];
22899
0
    assert(i11 != NULL);
22900
0
    if XNN_UNPREDICTABLE(i11 != zero) {
22901
0
      i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset);
22902
0
    }
22903
0
    const uint8_t* i12 = input[12];
22904
0
    assert(i12 != NULL);
22905
0
    if XNN_UNPREDICTABLE(i12 != zero) {
22906
0
      i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset);
22907
0
    }
22908
0
    const uint8_t* i13 = input[13];
22909
0
    assert(i13 != NULL);
22910
0
    if XNN_UNPREDICTABLE(i13 != zero) {
22911
0
      i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset);
22912
0
    }
22913
0
    const uint8_t* i14 = input[14];
22914
0
    assert(i14 != NULL);
22915
0
    if XNN_UNPREDICTABLE(i14 != zero) {
22916
0
      i14 = (const uint8_t*) ((uintptr_t) i14 + input_offset);
22917
0
    }
22918
0
    const uint8_t* i15 = input[15];
22919
0
    assert(i15 != NULL);
22920
0
    if XNN_UNPREDICTABLE(i15 != zero) {
22921
0
      i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset);
22922
0
    }
22923
0
    const uint8_t* i16 = input[16];
22924
0
    assert(i16 != NULL);
22925
0
    if XNN_UNPREDICTABLE(i16 != zero) {
22926
0
      i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset);
22927
0
    }
22928
0
    const uint8_t* i17 = input[17];
22929
0
    assert(i17 != NULL);
22930
0
    if XNN_UNPREDICTABLE(i17 != zero) {
22931
0
      i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset);
22932
0
    }
22933
0
    const uint8_t* i18 = input[18];
22934
0
    assert(i18 != NULL);
22935
0
    if XNN_UNPREDICTABLE(i18 != zero) {
22936
0
      i18 = (const uint8_t*) ((uintptr_t) i18 + input_offset);
22937
0
    }
22938
0
    const uint8_t* i19 = input[19];
22939
0
    assert(i19 != NULL);
22940
0
    if XNN_UNPREDICTABLE(i19 != zero) {
22941
0
      i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset);
22942
0
    }
22943
0
    const uint8_t* i20 = input[20];
22944
0
    assert(i20 != NULL);
22945
0
    if XNN_UNPREDICTABLE(i20 != zero) {
22946
0
      i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset);
22947
0
    }
22948
0
    const uint8_t* i21 = input[21];
22949
0
    assert(i21 != NULL);
22950
0
    if XNN_UNPREDICTABLE(i21 != zero) {
22951
0
      i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset);
22952
0
    }
22953
0
    const uint8_t* i22 = input[22];
22954
0
    assert(i22 != NULL);
22955
0
    if XNN_UNPREDICTABLE(i22 != zero) {
22956
0
      i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset);
22957
0
    }
22958
0
    const uint8_t* i23 = input[23];
22959
0
    assert(i23 != NULL);
22960
0
    if XNN_UNPREDICTABLE(i23 != zero) {
22961
0
      i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset);
22962
0
    }
22963
0
    const uint8_t* i24 = input[24];
22964
0
    assert(i24 != NULL);
22965
0
    if XNN_UNPREDICTABLE(i24 != zero) {
22966
0
      i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset);
22967
0
    }
22968
0
    input = (const uint8_t**) ((uintptr_t) input + input_stride);
22969
22970
0
    size_t c = channels;
22971
0
    const void* w = weights;
22972
0
    do {
22973
0
      int32_t vacc = unaligned_load_s32(w);
22974
22975
0
      const int32_t vi0 = (int32_t) (uint32_t) *i0++;
22976
0
      const int32_t vk0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[0] - vkernel_zero_point;
22977
0
      vacc += vi0 * vk0;
22978
0
      const int32_t vi1 = (int32_t) (uint32_t) *i1++;
22979
0
      const int32_t vk1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[1] - vkernel_zero_point;
22980
0
      vacc += vi1 * vk1;
22981
0
      const int32_t vi2 = (int32_t) (uint32_t) *i2++;
22982
0
      const int32_t vk2 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[2] - vkernel_zero_point;
22983
0
      vacc += vi2 * vk2;
22984
0
      const int32_t vi3 = (int32_t) (uint32_t) *i3++;
22985
0
      const int32_t vk3 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[3] - vkernel_zero_point;
22986
0
      vacc += vi3 * vk3;
22987
0
      const int32_t vi4 = (int32_t) (uint32_t) *i4++;
22988
0
      const int32_t vk4 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[4] - vkernel_zero_point;
22989
0
      vacc += vi4 * vk4;
22990
0
      const int32_t vi5 = (int32_t) (uint32_t) *i5++;
22991
0
      const int32_t vk5 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[5] - vkernel_zero_point;
22992
0
      vacc += vi5 * vk5;
22993
0
      const int32_t vi6 = (int32_t) (uint32_t) *i6++;
22994
0
      const int32_t vk6 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[6] - vkernel_zero_point;
22995
0
      vacc += vi6 * vk6;
22996
0
      const int32_t vi7 = (int32_t) (uint32_t) *i7++;
22997
0
      const int32_t vk7 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[7] - vkernel_zero_point;
22998
0
      vacc += vi7 * vk7;
22999
0
      const int32_t vi8 = (int32_t) (uint32_t) *i8++;
23000
0
      const int32_t vk8 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[8] - vkernel_zero_point;
23001
0
      vacc += vi8 * vk8;
23002
0
      const int32_t vi9 = (int32_t) (uint32_t) *i9++;
23003
0
      const int32_t vk9 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[9] - vkernel_zero_point;
23004
0
      vacc += vi9 * vk9;
23005
0
      const int32_t vi10 = (int32_t) (uint32_t) *i10++;
23006
0
      const int32_t vk10 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[10] - vkernel_zero_point;
23007
0
      vacc += vi10 * vk10;
23008
0
      const int32_t vi11 = (int32_t) (uint32_t) *i11++;
23009
0
      const int32_t vk11 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[11] - vkernel_zero_point;
23010
0
      vacc += vi11 * vk11;
23011
0
      const int32_t vi12 = (int32_t) (uint32_t) *i12++;
23012
0
      const int32_t vk12 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[12] - vkernel_zero_point;
23013
0
      vacc += vi12 * vk12;
23014
0
      const int32_t vi13 = (int32_t) (uint32_t) *i13++;
23015
0
      const int32_t vk13 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[13] - vkernel_zero_point;
23016
0
      vacc += vi13 * vk13;
23017
0
      const int32_t vi14 = (int32_t) (uint32_t) *i14++;
23018
0
      const int32_t vk14 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[14] - vkernel_zero_point;
23019
0
      vacc += vi14 * vk14;
23020
0
      const int32_t vi15 = (int32_t) (uint32_t) *i15++;
23021
0
      const int32_t vk15 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[15] - vkernel_zero_point;
23022
0
      vacc += vi15 * vk15;
23023
0
      const int32_t vi16 = (int32_t) (uint32_t) *i16++;
23024
0
      const int32_t vk16 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[16] - vkernel_zero_point;
23025
0
      vacc += vi16 * vk16;
23026
0
      const int32_t vi17 = (int32_t) (uint32_t) *i17++;
23027
0
      const int32_t vk17 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[17] - vkernel_zero_point;
23028
0
      vacc += vi17 * vk17;
23029
0
      const int32_t vi18 = (int32_t) (uint32_t) *i18++;
23030
0
      const int32_t vk18 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[18] - vkernel_zero_point;
23031
0
      vacc += vi18 * vk18;
23032
0
      const int32_t vi19 = (int32_t) (uint32_t) *i19++;
23033
0
      const int32_t vk19 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[19] - vkernel_zero_point;
23034
0
      vacc += vi19 * vk19;
23035
0
      const int32_t vi20 = (int32_t) (uint32_t) *i20++;
23036
0
      const int32_t vk20 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[20] - vkernel_zero_point;
23037
0
      vacc += vi20 * vk20;
23038
0
      const int32_t vi21 = (int32_t) (uint32_t) *i21++;
23039
0
      const int32_t vk21 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[21] - vkernel_zero_point;
23040
0
      vacc += vi21 * vk21;
23041
0
      const int32_t vi22 = (int32_t) (uint32_t) *i22++;
23042
0
      const int32_t vk22 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[22] - vkernel_zero_point;
23043
0
      vacc += vi22 * vk22;
23044
0
      const int32_t vi23 = (int32_t) (uint32_t) *i23++;
23045
0
      const int32_t vk23 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[23] - vkernel_zero_point;
23046
0
      vacc += vi23 * vk23;
23047
0
      const int32_t vi24 = (int32_t) (uint32_t) *i24++;
23048
0
      const int32_t vk24 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[24] - vkernel_zero_point;
23049
0
      vacc += vi24 * vk24;
23050
23051
0
      w = (const void*) ((uintptr_t) w + sizeof(int32_t) + 25 * sizeof(uint8_t));
23052
23053
0
      float vfpacc = (float) vacc * vscale;
23054
23055
0
      vfpacc += vmagic_bias;
23056
0
      int32_t vout = (int32_t) float_as_uint32(vfpacc);
23057
0
      vout = math_max_s32(vout, vmagic_min);
23058
0
      vout = math_min_s32(vout, vmagic_max);
23059
0
      vout -= vmagic_bias_less_zero_point;
23060
23061
0
      *output++ = (uint8_t) vout;
23062
0
    } while (--c != 0);
23063
23064
0
    output = (uint8_t*) ((uintptr_t) output + output_increment);
23065
0
  } while (--output_width != 0);
23066
0
}
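// Both 25p1c kernels above walk the same packed weight layout: for each
// channel, one int32 bias is followed immediately by that channel's 25 uint8
// kernel taps, and the pointer then advances by
// sizeof(int32_t) + 25 * sizeof(uint8_t). The sketch below shows that
// traversal in isolation for a generic tap count; it illustrates the access
// pattern visible in the code, not XNNPACK's packing routine, and the names
// are hypothetical.
#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Accumulate one int32 result per channel from `kernel_size` packed taps.
static void sketch_dwconv_unit_channel(
    size_t channels, size_t kernel_size,
    const uint8_t** rows,        // kernel_size row pointers, one per tap;
                                 // each row holds `channels` consecutive bytes
    const void* packed_weights,  // per channel: [bias:int32][tap:uint8 x kernel_size]
    int32_t kernel_zero_point,
    int32_t* acc_out)            // one accumulator per channel
{
  const void* w = packed_weights;
  for (size_t c = 0; c < channels; c++) {
    int32_t acc;
    memcpy(&acc, w, sizeof(acc));  // bias may be unaligned, so copy it out
    const uint8_t* taps = (const uint8_t*) ((uintptr_t) w + sizeof(int32_t));
    for (size_t k = 0; k < kernel_size; k++) {
      const int32_t vi = (int32_t) rows[k][c];
      const int32_t vk = (int32_t) taps[k] - kernel_zero_point;
      acc += vi * vk;
    }
    acc_out[c] = acc;
    w = (const void*) ((uintptr_t) w + sizeof(int32_t) + kernel_size * sizeof(uint8_t));
  }
}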
23067
23068
void xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf(
23069
    size_t channels,
23070
    size_t output_width,
23071
    const uint8_t** input,
23072
    const void* weights,
23073
    uint8_t* output,
23074
    intptr_t input_stride,
23075
    size_t output_increment,
23076
    size_t input_offset,
23077
    const uint8_t* zero,
23078
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
23079
0
{
23080
0
  assert(channels != 0);
23081
0
  assert(output_width != 0);
23082
23083
0
  const float vscale = params->fp32_scalar_lrintf.scale;
23084
0
  const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
23085
0
  const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
23086
0
  const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
23087
0
  const int32_t vkernel_zero_point = params->fp32_scalar_lrintf.kernel_zero_point;
23088
0
  do {
23089
0
    const uint8_t* i0 = input[0];
23090
0
    assert(i0 != NULL);
23091
0
    if XNN_UNPREDICTABLE(i0 != zero) {
23092
0
      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
23093
0
    }
23094
0
    const uint8_t* i1 = input[1];
23095
0
    assert(i1 != NULL);
23096
0
    if XNN_UNPREDICTABLE(i1 != zero) {
23097
0
      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
23098
0
    }
23099
0
    const uint8_t* i2 = input[2];
23100
0
    assert(i2 != NULL);
23101
0
    if XNN_UNPREDICTABLE(i2 != zero) {
23102
0
      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
23103
0
    }
23104
0
    const uint8_t* i3 = input[3];
23105
0
    assert(i3 != NULL);
23106
0
    if XNN_UNPREDICTABLE(i3 != zero) {
23107
0
      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
23108
0
    }
23109
0
    const uint8_t* i4 = input[4];
23110
0
    assert(i4 != NULL);
23111
0
    if XNN_UNPREDICTABLE(i4 != zero) {
23112
0
      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
23113
0
    }
23114
0
    const uint8_t* i5 = input[5];
23115
0
    assert(i5 != NULL);
23116
0
    if XNN_UNPREDICTABLE(i5 != zero) {
23117
0
      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
23118
0
    }
23119
0
    const uint8_t* i6 = input[6];
23120
0
    assert(i6 != NULL);
23121
0
    if XNN_UNPREDICTABLE(i6 != zero) {
23122
0
      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
23123
0
    }
23124
0
    const uint8_t* i7 = input[7];
23125
0
    assert(i7 != NULL);
23126
0
    if XNN_UNPREDICTABLE(i7 != zero) {
23127
0
      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
23128
0
    }
23129
0
    const uint8_t* i8 = input[8];
23130
0
    assert(i8 != NULL);
23131
0
    if XNN_UNPREDICTABLE(i8 != zero) {
23132
0
      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
23133
0
    }
23134
0
    const uint8_t* i9 = input[9];
23135
0
    assert(i9 != NULL);
23136
0
    if XNN_UNPREDICTABLE(i9 != zero) {
23137
0
      i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset);
23138
0
    }
23139
0
    const uint8_t* i10 = input[10];
23140
0
    assert(i10 != NULL);
23141
0
    if XNN_UNPREDICTABLE(i10 != zero) {
23142
0
      i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset);
23143
0
    }
23144
0
    const uint8_t* i11 = input[11];
23145
0
    assert(i11 != NULL);
23146
0
    if XNN_UNPREDICTABLE(i11 != zero) {
23147
0
      i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset);
23148
0
    }
23149
0
    const uint8_t* i12 = input[12];
23150
0
    assert(i12 != NULL);
23151
0
    if XNN_UNPREDICTABLE(i12 != zero) {
23152
0
      i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset);
23153
0
    }
23154
0
    const uint8_t* i13 = input[13];
23155
0
    assert(i13 != NULL);
23156
0
    if XNN_UNPREDICTABLE(i13 != zero) {
23157
0
      i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset);
23158
0
    }
23159
0
    const uint8_t* i14 = input[14];
23160
0
    assert(i14 != NULL);
23161
0
    if XNN_UNPREDICTABLE(i14 != zero) {
23162
0
      i14 = (const uint8_t*) ((uintptr_t) i14 + input_offset);
23163
0
    }
23164
0
    const uint8_t* i15 = input[15];
23165
0
    assert(i15 != NULL);
23166
0
    if XNN_UNPREDICTABLE(i15 != zero) {
23167
0
      i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset);
23168
0
    }
23169
0
    const uint8_t* i16 = input[16];
23170
0
    assert(i16 != NULL);
23171
0
    if XNN_UNPREDICTABLE(i16 != zero) {
23172
0
      i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset);
23173
0
    }
23174
0
    const uint8_t* i17 = input[17];
23175
0
    assert(i17 != NULL);
23176
0
    if XNN_UNPREDICTABLE(i17 != zero) {
23177
0
      i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset);
23178
0
    }
23179
0
    const uint8_t* i18 = input[18];
23180
0
    assert(i18 != NULL);
23181
0
    if XNN_UNPREDICTABLE(i18 != zero) {
23182
0
      i18 = (const uint8_t*) ((uintptr_t) i18 + input_offset);
23183
0
    }
23184
0
    const uint8_t* i19 = input[19];
23185
0
    assert(i19 != NULL);
23186
0
    if XNN_UNPREDICTABLE(i19 != zero) {
23187
0
      i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset);
23188
0
    }
23189
0
    const uint8_t* i20 = input[20];
23190
0
    assert(i20 != NULL);
23191
0
    if XNN_UNPREDICTABLE(i20 != zero) {
23192
0
      i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset);
23193
0
    }
23194
0
    const uint8_t* i21 = input[21];
23195
0
    assert(i21 != NULL);
23196
0
    if XNN_UNPREDICTABLE(i21 != zero) {
23197
0
      i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset);
23198
0
    }
23199
0
    const uint8_t* i22 = input[22];
23200
0
    assert(i22 != NULL);
23201
0
    if XNN_UNPREDICTABLE(i22 != zero) {
23202
0
      i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset);
23203
0
    }
23204
0
    const uint8_t* i23 = input[23];
23205
0
    assert(i23 != NULL);
23206
0
    if XNN_UNPREDICTABLE(i23 != zero) {
23207
0
      i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset);
23208
0
    }
23209
0
    const uint8_t* i24 = input[24];
23210
0
    assert(i24 != NULL);
23211
0
    if XNN_UNPREDICTABLE(i24 != zero) {
23212
0
      i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset);
23213
0
    }
23214
0
    input = (const uint8_t**) ((uintptr_t) input + input_stride);
23215
23216
0
    size_t c = channels;
23217
0
    const void* w = weights;
23218
0
    for (; c >= 2; c -= 2) {
23219
0
      int32_t vacc0 = unaligned_indexed_load_s32(w, 0);
23220
0
      int32_t vacc1 = unaligned_indexed_load_s32(w, 1);
23221
23222
23223
0
      const int32_t vi0x0 = (int32_t) (uint32_t) i0[0];
23224
0
      const int32_t vi0x1 = (int32_t) (uint32_t) i0[1];
23225
0
      i0 += 2;
23226
23227
0
      const int32_t vk0x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[0] - vkernel_zero_point;
23228
0
      const int32_t vk0x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[1] - vkernel_zero_point;
23229
23230
0
      vacc0 += vi0x0 * vk0x0;
23231
0
      vacc1 += vi0x1 * vk0x1;
23232
23233
0
      const int32_t vi1x0 = (int32_t) (uint32_t) i1[0];
23234
0
      const int32_t vi1x1 = (int32_t) (uint32_t) i1[1];
23235
0
      i1 += 2;
23236
23237
0
      const int32_t vk1x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[2] - vkernel_zero_point;
23238
0
      const int32_t vk1x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[3] - vkernel_zero_point;
23239
23240
0
      vacc0 += vi1x0 * vk1x0;
23241
0
      vacc1 += vi1x1 * vk1x1;
23242
23243
0
      const int32_t vi2x0 = (int32_t) (uint32_t) i2[0];
23244
0
      const int32_t vi2x1 = (int32_t) (uint32_t) i2[1];
23245
0
      i2 += 2;
23246
23247
0
      const int32_t vk2x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[4] - vkernel_zero_point;
23248
0
      const int32_t vk2x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[5] - vkernel_zero_point;
23249
23250
0
      vacc0 += vi2x0 * vk2x0;
23251
0
      vacc1 += vi2x1 * vk2x1;
23252
23253
0
      const int32_t vi3x0 = (int32_t) (uint32_t) i3[0];
23254
0
      const int32_t vi3x1 = (int32_t) (uint32_t) i3[1];
23255
0
      i3 += 2;
23256
23257
0
      const int32_t vk3x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[6] - vkernel_zero_point;
23258
0
      const int32_t vk3x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[7] - vkernel_zero_point;
23259
23260
0
      vacc0 += vi3x0 * vk3x0;
23261
0
      vacc1 += vi3x1 * vk3x1;
23262
23263
0
      const int32_t vi4x0 = (int32_t) (uint32_t) i4[0];
23264
0
      const int32_t vi4x1 = (int32_t) (uint32_t) i4[1];
23265
0
      i4 += 2;
23266
23267
0
      const int32_t vk4x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[8] - vkernel_zero_point;
23268
0
      const int32_t vk4x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[9] - vkernel_zero_point;
23269
23270
0
      vacc0 += vi4x0 * vk4x0;
23271
0
      vacc1 += vi4x1 * vk4x1;
23272
23273
0
      const int32_t vi5x0 = (int32_t) (uint32_t) i5[0];
23274
0
      const int32_t vi5x1 = (int32_t) (uint32_t) i5[1];
23275
0
      i5 += 2;
23276
23277
0
      const int32_t vk5x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[10] - vkernel_zero_point;
23278
0
      const int32_t vk5x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[11] - vkernel_zero_point;
23279
23280
0
      vacc0 += vi5x0 * vk5x0;
23281
0
      vacc1 += vi5x1 * vk5x1;
23282
23283
0
      const int32_t vi6x0 = (int32_t) (uint32_t) i6[0];
23284
0
      const int32_t vi6x1 = (int32_t) (uint32_t) i6[1];
23285
0
      i6 += 2;
23286
23287
0
      const int32_t vk6x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[12] - vkernel_zero_point;
23288
0
      const int32_t vk6x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[13] - vkernel_zero_point;
23289
23290
0
      vacc0 += vi6x0 * vk6x0;
23291
0
      vacc1 += vi6x1 * vk6x1;
23292
23293
0
      const int32_t vi7x0 = (int32_t) (uint32_t) i7[0];
23294
0
      const int32_t vi7x1 = (int32_t) (uint32_t) i7[1];
23295
0
      i7 += 2;
23296
23297
0
      const int32_t vk7x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[14] - vkernel_zero_point;
23298
0
      const int32_t vk7x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[15] - vkernel_zero_point;
23299
23300
0
      vacc0 += vi7x0 * vk7x0;
23301
0
      vacc1 += vi7x1 * vk7x1;
23302
23303
0
      const int32_t vi8x0 = (int32_t) (uint32_t) i8[0];
23304
0
      const int32_t vi8x1 = (int32_t) (uint32_t) i8[1];
23305
0
      i8 += 2;
23306
23307
0
      const int32_t vk8x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[16] - vkernel_zero_point;
23308
0
      const int32_t vk8x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[17] - vkernel_zero_point;
23309
23310
0
      vacc0 += vi8x0 * vk8x0;
23311
0
      vacc1 += vi8x1 * vk8x1;
23312
23313
0
      const int32_t vi9x0 = (int32_t) (uint32_t) i9[0];
23314
0
      const int32_t vi9x1 = (int32_t) (uint32_t) i9[1];
23315
0
      i9 += 2;
23316
23317
0
      const int32_t vk9x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[18] - vkernel_zero_point;
23318
0
      const int32_t vk9x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[19] - vkernel_zero_point;
23319
23320
0
      vacc0 += vi9x0 * vk9x0;
23321
0
      vacc1 += vi9x1 * vk9x1;
23322
23323
0
      const int32_t vi10x0 = (int32_t) (uint32_t) i10[0];
23324
0
      const int32_t vi10x1 = (int32_t) (uint32_t) i10[1];
23325
0
      i10 += 2;
23326
23327
0
      const int32_t vk10x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[20] - vkernel_zero_point;
23328
0
      const int32_t vk10x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[21] - vkernel_zero_point;
23329
23330
0
      vacc0 += vi10x0 * vk10x0;
23331
0
      vacc1 += vi10x1 * vk10x1;
23332
23333
0
      const int32_t vi11x0 = (int32_t) (uint32_t) i11[0];
23334
0
      const int32_t vi11x1 = (int32_t) (uint32_t) i11[1];
23335
0
      i11 += 2;
23336
23337
0
      const int32_t vk11x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[22] - vkernel_zero_point;
23338
0
      const int32_t vk11x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[23] - vkernel_zero_point;
23339
23340
0
      vacc0 += vi11x0 * vk11x0;
23341
0
      vacc1 += vi11x1 * vk11x1;
23342
23343
0
      const int32_t vi12x0 = (int32_t) (uint32_t) i12[0];
23344
0
      const int32_t vi12x1 = (int32_t) (uint32_t) i12[1];
23345
0
      i12 += 2;
23346
23347
0
      const int32_t vk12x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[24] - vkernel_zero_point;
23348
0
      const int32_t vk12x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[25] - vkernel_zero_point;
23349
23350
0
      vacc0 += vi12x0 * vk12x0;
23351
0
      vacc1 += vi12x1 * vk12x1;
23352
23353
0
      const int32_t vi13x0 = (int32_t) (uint32_t) i13[0];
23354
0
      const int32_t vi13x1 = (int32_t) (uint32_t) i13[1];
23355
0
      i13 += 2;
23356
23357
0
      const int32_t vk13x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[26] - vkernel_zero_point;
23358
0
      const int32_t vk13x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[27] - vkernel_zero_point;
23359
23360
0
      vacc0 += vi13x0 * vk13x0;
23361
0
      vacc1 += vi13x1 * vk13x1;
23362
23363
0
      const int32_t vi14x0 = (int32_t) (uint32_t) i14[0];
23364
0
      const int32_t vi14x1 = (int32_t) (uint32_t) i14[1];
23365
0
      i14 += 2;
23366
23367
0
      const int32_t vk14x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[28] - vkernel_zero_point;
23368
0
      const int32_t vk14x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[29] - vkernel_zero_point;
23369
23370
0
      vacc0 += vi14x0 * vk14x0;
23371
0
      vacc1 += vi14x1 * vk14x1;
23372
23373
0
      const int32_t vi15x0 = (int32_t) (uint32_t) i15[0];
23374
0
      const int32_t vi15x1 = (int32_t) (uint32_t) i15[1];
23375
0
      i15 += 2;
23376
23377
0
      const int32_t vk15x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[30] - vkernel_zero_point;
23378
0
      const int32_t vk15x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[31] - vkernel_zero_point;
23379
23380
0
      vacc0 += vi15x0 * vk15x0;
23381
0
      vacc1 += vi15x1 * vk15x1;
23382
23383
0
      const int32_t vi16x0 = (int32_t) (uint32_t) i16[0];
23384
0
      const int32_t vi16x1 = (int32_t) (uint32_t) i16[1];
23385
0
      i16 += 2;
23386
23387
0
      const int32_t vk16x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[32] - vkernel_zero_point;
23388
0
      const int32_t vk16x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[33] - vkernel_zero_point;
23389
23390
0
      vacc0 += vi16x0 * vk16x0;
23391
0
      vacc1 += vi16x1 * vk16x1;
23392
23393
0
      const int32_t vi17x0 = (int32_t) (uint32_t) i17[0];
23394
0
      const int32_t vi17x1 = (int32_t) (uint32_t) i17[1];
23395
0
      i17 += 2;
23396
23397
0
      const int32_t vk17x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[34] - vkernel_zero_point;
23398
0
      const int32_t vk17x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[35] - vkernel_zero_point;
23399
23400
0
      vacc0 += vi17x0 * vk17x0;
23401
0
      vacc1 += vi17x1 * vk17x1;
23402
23403
0
      const int32_t vi18x0 = (int32_t) (uint32_t) i18[0];
23404
0
      const int32_t vi18x1 = (int32_t) (uint32_t) i18[1];
23405
0
      i18 += 2;
23406
23407
0
      const int32_t vk18x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[36] - vkernel_zero_point;
23408
0
      const int32_t vk18x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[37] - vkernel_zero_point;
23409
23410
0
      vacc0 += vi18x0 * vk18x0;
23411
0
      vacc1 += vi18x1 * vk18x1;
23412
23413
0
      const int32_t vi19x0 = (int32_t) (uint32_t) i19[0];
23414
0
      const int32_t vi19x1 = (int32_t) (uint32_t) i19[1];
23415
0
      i19 += 2;
23416
23417
0
      const int32_t vk19x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[38] - vkernel_zero_point;
23418
0
      const int32_t vk19x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[39] - vkernel_zero_point;
23419
23420
0
      vacc0 += vi19x0 * vk19x0;
23421
0
      vacc1 += vi19x1 * vk19x1;
23422
23423
0
      const int32_t vi20x0 = (int32_t) (uint32_t) i20[0];
23424
0
      const int32_t vi20x1 = (int32_t) (uint32_t) i20[1];
23425
0
      i20 += 2;
23426
23427
0
      const int32_t vk20x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[40] - vkernel_zero_point;
23428
0
      const int32_t vk20x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[41] - vkernel_zero_point;
23429
23430
0
      vacc0 += vi20x0 * vk20x0;
23431
0
      vacc1 += vi20x1 * vk20x1;
23432
23433
0
      const int32_t vi21x0 = (int32_t) (uint32_t) i21[0];
23434
0
      const int32_t vi21x1 = (int32_t) (uint32_t) i21[1];
23435
0
      i21 += 2;
23436
23437
0
      const int32_t vk21x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[42] - vkernel_zero_point;
23438
0
      const int32_t vk21x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[43] - vkernel_zero_point;
23439
23440
0
      vacc0 += vi21x0 * vk21x0;
23441
0
      vacc1 += vi21x1 * vk21x1;
23442
23443
0
      const int32_t vi22x0 = (int32_t) (uint32_t) i22[0];
23444
0
      const int32_t vi22x1 = (int32_t) (uint32_t) i22[1];
23445
0
      i22 += 2;
23446
23447
0
      const int32_t vk22x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[44] - vkernel_zero_point;
23448
0
      const int32_t vk22x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[45] - vkernel_zero_point;
23449
23450
0
      vacc0 += vi22x0 * vk22x0;
23451
0
      vacc1 += vi22x1 * vk22x1;
23452
23453
0
      const int32_t vi23x0 = (int32_t) (uint32_t) i23[0];
23454
0
      const int32_t vi23x1 = (int32_t) (uint32_t) i23[1];
23455
0
      i23 += 2;
23456
23457
0
      const int32_t vk23x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[46] - vkernel_zero_point;
23458
0
      const int32_t vk23x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[47] - vkernel_zero_point;
23459
23460
0
      vacc0 += vi23x0 * vk23x0;
23461
0
      vacc1 += vi23x1 * vk23x1;
23462
23463
0
      const int32_t vi24x0 = (int32_t) (uint32_t) i24[0];
23464
0
      const int32_t vi24x1 = (int32_t) (uint32_t) i24[1];
23465
0
      i24 += 2;
23466
23467
0
      const int32_t vk24x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[48] - vkernel_zero_point;
23468
0
      const int32_t vk24x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[49] - vkernel_zero_point;
23469
23470
0
      vacc0 += vi24x0 * vk24x0;
23471
0
      vacc1 += vi24x1 * vk24x1;
23472
23473
0
      w = (const void*) ((uintptr_t) w + 2 * sizeof(int32_t) + 50 * sizeof(uint8_t));
23474
23475
0
      float vfpacc0 = (float) vacc0;
23476
0
      float vfpacc1 = (float) vacc1;
23477
23478
0
      vfpacc0 *= vscale;
23479
0
      vfpacc1 *= vscale;
23480
23481
0
      vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point);
23482
0
      vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point);
23483
23484
0
      vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point);
23485
0
      vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point);
23486
23487
0
      const int32_t vrndacc0 = (int32_t) lrintf(vfpacc0);
23488
0
      const int32_t vrndacc1 = (int32_t) lrintf(vfpacc1);
23489
23490
0
      int32_t vout0 = (int32_t) vrndacc0 + voutput_zero_point;
23491
0
      int32_t vout1 = (int32_t) vrndacc1 + voutput_zero_point;
23492
23493
0
      output[0] = (uint8_t) vout0;
23494
0
      output[1] = (uint8_t) vout1;
23495
0
      output += 2;
23496
0
    }
23497
0
    if XNN_UNLIKELY(c != 0) {
23498
0
      int32_t vacc = unaligned_load_s32(w);
23499
23500
0
      const int32_t vi0 = (int32_t) (uint32_t) *i0;
23501
0
      const int32_t vk0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[0] - vkernel_zero_point;
23502
0
      vacc += vi0 * vk0;
23503
0
      const int32_t vi1 = (int32_t) (uint32_t) *i1;
23504
0
      const int32_t vk1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[2] - vkernel_zero_point;
23505
0
      vacc += vi1 * vk1;
23506
0
      const int32_t vi2 = (int32_t) (uint32_t) *i2;
23507
0
      const int32_t vk2 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[4] - vkernel_zero_point;
23508
0
      vacc += vi2 * vk2;
23509
0
      const int32_t vi3 = (int32_t) (uint32_t) *i3;
23510
0
      const int32_t vk3 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[6] - vkernel_zero_point;
23511
0
      vacc += vi3 * vk3;
23512
0
      const int32_t vi4 = (int32_t) (uint32_t) *i4;
23513
0
      const int32_t vk4 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[8] - vkernel_zero_point;
23514
0
      vacc += vi4 * vk4;
23515
0
      const int32_t vi5 = (int32_t) (uint32_t) *i5;
23516
0
      const int32_t vk5 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[10] - vkernel_zero_point;
23517
0
      vacc += vi5 * vk5;
23518
0
      const int32_t vi6 = (int32_t) (uint32_t) *i6;
23519
0
      const int32_t vk6 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[12] - vkernel_zero_point;
23520
0
      vacc += vi6 * vk6;
23521
0
      const int32_t vi7 = (int32_t) (uint32_t) *i7;
23522
0
      const int32_t vk7 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[14] - vkernel_zero_point;
23523
0
      vacc += vi7 * vk7;
23524
0
      const int32_t vi8 = (int32_t) (uint32_t) *i8;
23525
0
      const int32_t vk8 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[16] - vkernel_zero_point;
23526
0
      vacc += vi8 * vk8;
23527
0
      const int32_t vi9 = (int32_t) (uint32_t) *i9;
23528
0
      const int32_t vk9 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[18] - vkernel_zero_point;
23529
0
      vacc += vi9 * vk9;
23530
0
      const int32_t vi10 = (int32_t) (uint32_t) *i10;
23531
0
      const int32_t vk10 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[20] - vkernel_zero_point;
23532
0
      vacc += vi10 * vk10;
23533
0
      const int32_t vi11 = (int32_t) (uint32_t) *i11;
23534
0
      const int32_t vk11 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[22] - vkernel_zero_point;
23535
0
      vacc += vi11 * vk11;
23536
0
      const int32_t vi12 = (int32_t) (uint32_t) *i12;
23537
0
      const int32_t vk12 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[24] - vkernel_zero_point;
23538
0
      vacc += vi12 * vk12;
23539
0
      const int32_t vi13 = (int32_t) (uint32_t) *i13;
23540
0
      const int32_t vk13 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[26] - vkernel_zero_point;
23541
0
      vacc += vi13 * vk13;
23542
0
      const int32_t vi14 = (int32_t) (uint32_t) *i14;
23543
0
      const int32_t vk14 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[28] - vkernel_zero_point;
23544
0
      vacc += vi14 * vk14;
23545
0
      const int32_t vi15 = (int32_t) (uint32_t) *i15;
23546
0
      const int32_t vk15 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[30] - vkernel_zero_point;
23547
0
      vacc += vi15 * vk15;
23548
0
      const int32_t vi16 = (int32_t) (uint32_t) *i16;
23549
0
      const int32_t vk16 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[32] - vkernel_zero_point;
23550
0
      vacc += vi16 * vk16;
23551
0
      const int32_t vi17 = (int32_t) (uint32_t) *i17;
23552
0
      const int32_t vk17 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[34] - vkernel_zero_point;
23553
0
      vacc += vi17 * vk17;
23554
0
      const int32_t vi18 = (int32_t) (uint32_t) *i18;
23555
0
      const int32_t vk18 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[36] - vkernel_zero_point;
23556
0
      vacc += vi18 * vk18;
23557
0
      const int32_t vi19 = (int32_t) (uint32_t) *i19;
23558
0
      const int32_t vk19 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[38] - vkernel_zero_point;
23559
0
      vacc += vi19 * vk19;
23560
0
      const int32_t vi20 = (int32_t) (uint32_t) *i20;
23561
0
      const int32_t vk20 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[40] - vkernel_zero_point;
23562
0
      vacc += vi20 * vk20;
23563
0
      const int32_t vi21 = (int32_t) (uint32_t) *i21;
23564
0
      const int32_t vk21 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[42] - vkernel_zero_point;
23565
0
      vacc += vi21 * vk21;
23566
0
      const int32_t vi22 = (int32_t) (uint32_t) *i22;
23567
0
      const int32_t vk22 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[44] - vkernel_zero_point;
23568
0
      vacc += vi22 * vk22;
23569
0
      const int32_t vi23 = (int32_t) (uint32_t) *i23;
23570
0
      const int32_t vk23 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[46] - vkernel_zero_point;
23571
0
      vacc += vi23 * vk23;
23572
0
      const int32_t vi24 = (int32_t) (uint32_t) *i24;
23573
0
      const int32_t vk24 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[48] - vkernel_zero_point;
23574
0
      vacc += vi24 * vk24;
23575
23576
0
      float vfpacc = (float) vacc * vscale;
23577
23578
0
      vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
23579
0
      vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
23580
0
      const int32_t vrndacc = (int32_t) lrintf(vfpacc);
23581
0
      int32_t vout = vrndacc + voutput_zero_point;
23582
23583
0
      *output++ = (uint8_t) vout;
23584
0
    }
23585
23586
0
    output = (uint8_t*) ((uintptr_t) output + output_increment);
23587
0
  } while (--output_width != 0);
23588
0
}
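// The 25p2c kernel above handles two channels per iteration (two int32 biases,
// then the 25 taps stored as interleaved channel pairs) and rounds with
// lrintf() instead of the magic-bias bit trick: clamp in float, round to the
// nearest integer in the current rounding mode, then add the output zero
// point. A minimal sketch of that rounding step, with an illustrative zero
// point of 128 and [0, 255] range; the real kernel reads these from params.
#include <math.h>
#include <stdint.h>

static inline uint8_t sketch_requantize_lrintf(int32_t acc, float scale) {
  const int32_t zero_point = 128;                                        // illustrative
  const float output_min_less_zero_point = 0.0f - (float) zero_point;    // -128.0f
  const float output_max_less_zero_point = 255.0f - (float) zero_point;  //  127.0f

  float fpacc = (float) acc * scale;
  fpacc = fpacc < output_min_less_zero_point ? output_min_less_zero_point : fpacc;
  fpacc = fpacc > output_max_less_zero_point ? output_max_less_zero_point : fpacc;
  // Rounds to nearest (ties to even) under the default rounding mode.
  const int32_t rndacc = (int32_t) lrintf(fpacc);
  return (uint8_t) (rndacc + zero_point);
}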
23589
23590
void xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic(
23591
    size_t channels,
23592
    size_t output_width,
23593
    const uint8_t** input,
23594
    const void* weights,
23595
    uint8_t* output,
23596
    intptr_t input_stride,
23597
    size_t output_increment,
23598
    size_t input_offset,
23599
    const uint8_t* zero,
23600
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
23601
0
{
23602
0
  assert(channels != 0);
23603
0
  assert(output_width != 0);
23604
23605
0
  const float vscale = params->fp32_scalar_fmagic.scale;
23606
0
  const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point;
23607
0
  const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point;
23608
0
  const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias;
23609
0
  const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point;
23610
0
  const int32_t vkernel_zero_point = params->fp32_scalar_fmagic.kernel_zero_point;
23611
0
  do {
23612
0
    const uint8_t* i0 = input[0];
23613
0
    assert(i0 != NULL);
23614
0
    if XNN_UNPREDICTABLE(i0 != zero) {
23615
0
      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
23616
0
    }
23617
0
    const uint8_t* i1 = input[1];
23618
0
    assert(i1 != NULL);
23619
0
    if XNN_UNPREDICTABLE(i1 != zero) {
23620
0
      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
23621
0
    }
23622
0
    const uint8_t* i2 = input[2];
23623
0
    assert(i2 != NULL);
23624
0
    if XNN_UNPREDICTABLE(i2 != zero) {
23625
0
      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
23626
0
    }
23627
0
    const uint8_t* i3 = input[3];
23628
0
    assert(i3 != NULL);
23629
0
    if XNN_UNPREDICTABLE(i3 != zero) {
23630
0
      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
23631
0
    }
23632
0
    const uint8_t* i4 = input[4];
23633
0
    assert(i4 != NULL);
23634
0
    if XNN_UNPREDICTABLE(i4 != zero) {
23635
0
      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
23636
0
    }
23637
0
    const uint8_t* i5 = input[5];
23638
0
    assert(i5 != NULL);
23639
0
    if XNN_UNPREDICTABLE(i5 != zero) {
23640
0
      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
23641
0
    }
23642
0
    const uint8_t* i6 = input[6];
23643
0
    assert(i6 != NULL);
23644
0
    if XNN_UNPREDICTABLE(i6 != zero) {
23645
0
      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
23646
0
    }
23647
0
    const uint8_t* i7 = input[7];
23648
0
    assert(i7 != NULL);
23649
0
    if XNN_UNPREDICTABLE(i7 != zero) {
23650
0
      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
23651
0
    }
23652
0
    const uint8_t* i8 = input[8];
23653
0
    assert(i8 != NULL);
23654
0
    if XNN_UNPREDICTABLE(i8 != zero) {
23655
0
      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
23656
0
    }
23657
0
    input = (const uint8_t**) ((uintptr_t) input + input_stride);
23658
23659
0
    size_t c = channels;
23660
0
    const void* w = weights;
23661
0
    do {
23662
0
      int32_t vacc = unaligned_load_s32(w);
23663
23664
0
      const int32_t vi0 = (int32_t) (uint32_t) *i0++;
23665
0
      const int32_t vk0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[0] - vkernel_zero_point;
23666
0
      vacc += vi0 * vk0;
23667
0
      const int32_t vi1 = (int32_t) (uint32_t) *i1++;
23668
0
      const int32_t vk1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[1] - vkernel_zero_point;
23669
0
      vacc += vi1 * vk1;
23670
0
      const int32_t vi2 = (int32_t) (uint32_t) *i2++;
23671
0
      const int32_t vk2 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[2] - vkernel_zero_point;
23672
0
      vacc += vi2 * vk2;
23673
0
      const int32_t vi3 = (int32_t) (uint32_t) *i3++;
23674
0
      const int32_t vk3 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[3] - vkernel_zero_point;
23675
0
      vacc += vi3 * vk3;
23676
0
      const int32_t vi4 = (int32_t) (uint32_t) *i4++;
23677
0
      const int32_t vk4 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[4] - vkernel_zero_point;
23678
0
      vacc += vi4 * vk4;
23679
0
      const int32_t vi5 = (int32_t) (uint32_t) *i5++;
23680
0
      const int32_t vk5 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[5] - vkernel_zero_point;
23681
0
      vacc += vi5 * vk5;
23682
0
      const int32_t vi6 = (int32_t) (uint32_t) *i6++;
23683
0
      const int32_t vk6 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[6] - vkernel_zero_point;
23684
0
      vacc += vi6 * vk6;
23685
0
      const int32_t vi7 = (int32_t) (uint32_t) *i7++;
23686
0
      const int32_t vk7 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[7] - vkernel_zero_point;
23687
0
      vacc += vi7 * vk7;
23688
0
      const int32_t vi8 = (int32_t) (uint32_t) *i8++;
23689
0
      const int32_t vk8 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + sizeof(int32_t)))[8] - vkernel_zero_point;
23690
0
      vacc += vi8 * vk8;
23691
23692
0
      w = (const void*) ((uintptr_t) w + sizeof(int32_t) + 9 * sizeof(uint8_t));
23693
23694
0
      float vfpacc = (float) vacc * vscale;
23695
23696
0
      vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
23697
0
      vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
23698
0
      vfpacc += vmagic_bias;
23699
0
      int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point;
23700
23701
0
      *output++ = (uint8_t) vout;
23702
0
    } while (--c != 0);
23703
23704
0
    output = (uint8_t*) ((uintptr_t) output + output_increment);
23705
0
  } while (--output_width != 0);
23706
0
}
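The 9-tap depthwise-convolution kernel ending above requantizes with the "fmagic" scheme: the int32 accumulator is scaled in float, clamped against bounds that already have the output zero point subtracted, shifted by a magic bias of 0x1.8p+23 (12582912.0f) so the rounded value lands in the low mantissa bits, and then read back out of the float's bit pattern. A minimal standalone sketch of that tail follows; the full [0, 255] output range and the `fp32_bits` helper are illustrative assumptions, not the library's parameter structures.

```c
#include <stdint.h>
#include <string.h>

// Reinterpret a float's bits as uint32 (stand-in for XNNPACK's float_as_uint32).
static uint32_t fp32_bits(float f) {
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));
  return bits;
}

// "fmagic" requantization sketch: scale and clamp in the float domain, then
// round by adding 0x1.8p+23 and reading the result out of the mantissa bits.
static uint8_t requantize_fmagic(int32_t acc, float scale, int32_t output_zero_point) {
  const float magic_bias = 12582912.0f;  // 0x1.8p+23
  const float output_min_less_zero_point = (float) (0 - output_zero_point);
  const float output_max_less_zero_point = (float) (255 - output_zero_point);
  const int32_t magic_bias_less_output_zero_point =
      (int32_t) fp32_bits(magic_bias) - output_zero_point;

  float fpacc = (float) acc * scale;
  fpacc = fpacc < output_min_less_zero_point ? output_min_less_zero_point : fpacc;
  fpacc = fpacc > output_max_less_zero_point ? output_max_less_zero_point : fpacc;
  fpacc += magic_bias;
  return (uint8_t) ((int32_t) fp32_bits(fpacc) - magic_bias_less_output_zero_point);
}
```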
23707
23708
void xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic(
23709
    size_t channels,
23710
    size_t output_width,
23711
    const uint8_t** input,
23712
    const void* weights,
23713
    uint8_t* output,
23714
    intptr_t input_stride,
23715
    size_t output_increment,
23716
    size_t input_offset,
23717
    const uint8_t* zero,
23718
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
23719
0
{
23720
0
  assert(channels != 0);
23721
0
  assert(output_width != 0);
23722
23723
0
  const float vscale = params->fp32_scalar_imagic.scale;
23724
0
  const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
23725
0
  const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
23726
0
  const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
23727
0
  const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
23728
0
  const int32_t vkernel_zero_point = params->fp32_scalar_imagic.kernel_zero_point;
23729
0
  do {
23730
0
    const uint8_t* i0 = input[0];
23731
0
    assert(i0 != NULL);
23732
0
    if XNN_UNPREDICTABLE(i0 != zero) {
23733
0
      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
23734
0
    }
23735
0
    const uint8_t* i1 = input[1];
23736
0
    assert(i1 != NULL);
23737
0
    if XNN_UNPREDICTABLE(i1 != zero) {
23738
0
      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
23739
0
    }
23740
0
    const uint8_t* i2 = input[2];
23741
0
    assert(i2 != NULL);
23742
0
    if XNN_UNPREDICTABLE(i2 != zero) {
23743
0
      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
23744
0
    }
23745
0
    const uint8_t* i3 = input[3];
23746
0
    assert(i3 != NULL);
23747
0
    if XNN_UNPREDICTABLE(i3 != zero) {
23748
0
      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
23749
0
    }
23750
0
    const uint8_t* i4 = input[4];
23751
0
    assert(i4 != NULL);
23752
0
    if XNN_UNPREDICTABLE(i4 != zero) {
23753
0
      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
23754
0
    }
23755
0
    const uint8_t* i5 = input[5];
23756
0
    assert(i5 != NULL);
23757
0
    if XNN_UNPREDICTABLE(i5 != zero) {
23758
0
      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
23759
0
    }
23760
0
    const uint8_t* i6 = input[6];
23761
0
    assert(i6 != NULL);
23762
0
    if XNN_UNPREDICTABLE(i6 != zero) {
23763
0
      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
23764
0
    }
23765
0
    const uint8_t* i7 = input[7];
23766
0
    assert(i7 != NULL);
23767
0
    if XNN_UNPREDICTABLE(i7 != zero) {
23768
0
      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
23769
0
    }
23770
0
    const uint8_t* i8 = input[8];
23771
0
    assert(i8 != NULL);
23772
0
    if XNN_UNPREDICTABLE(i8 != zero) {
23773
0
      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
23774
0
    }
23775
0
    input = (const uint8_t**) ((uintptr_t) input + input_stride);
23776
23777
0
    size_t c = channels;
23778
0
    const void* w = weights;
23779
0
    for (; c >= 2; c -= 2) {
23780
0
      int32_t vacc0 = unaligned_indexed_load_s32(w, 0);
23781
0
      int32_t vacc1 = unaligned_indexed_load_s32(w, 1);
23782
23783
23784
0
      const int32_t vi0x0 = (int32_t) (uint32_t) i0[0];
23785
0
      const int32_t vi0x1 = (int32_t) (uint32_t) i0[1];
23786
0
      i0 += 2;
23787
23788
0
      const int32_t vk0x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[0] - vkernel_zero_point;
23789
0
      const int32_t vk0x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[1] - vkernel_zero_point;
23790
23791
0
      vacc0 += vi0x0 * vk0x0;
23792
0
      vacc1 += vi0x1 * vk0x1;
23793
23794
0
      const int32_t vi1x0 = (int32_t) (uint32_t) i1[0];
23795
0
      const int32_t vi1x1 = (int32_t) (uint32_t) i1[1];
23796
0
      i1 += 2;
23797
23798
0
      const int32_t vk1x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[2] - vkernel_zero_point;
23799
0
      const int32_t vk1x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[3] - vkernel_zero_point;
23800
23801
0
      vacc0 += vi1x0 * vk1x0;
23802
0
      vacc1 += vi1x1 * vk1x1;
23803
23804
0
      const int32_t vi2x0 = (int32_t) (uint32_t) i2[0];
23805
0
      const int32_t vi2x1 = (int32_t) (uint32_t) i2[1];
23806
0
      i2 += 2;
23807
23808
0
      const int32_t vk2x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[4] - vkernel_zero_point;
23809
0
      const int32_t vk2x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[5] - vkernel_zero_point;
23810
23811
0
      vacc0 += vi2x0 * vk2x0;
23812
0
      vacc1 += vi2x1 * vk2x1;
23813
23814
0
      const int32_t vi3x0 = (int32_t) (uint32_t) i3[0];
23815
0
      const int32_t vi3x1 = (int32_t) (uint32_t) i3[1];
23816
0
      i3 += 2;
23817
23818
0
      const int32_t vk3x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[6] - vkernel_zero_point;
23819
0
      const int32_t vk3x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[7] - vkernel_zero_point;
23820
23821
0
      vacc0 += vi3x0 * vk3x0;
23822
0
      vacc1 += vi3x1 * vk3x1;
23823
23824
0
      const int32_t vi4x0 = (int32_t) (uint32_t) i4[0];
23825
0
      const int32_t vi4x1 = (int32_t) (uint32_t) i4[1];
23826
0
      i4 += 2;
23827
23828
0
      const int32_t vk4x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[8] - vkernel_zero_point;
23829
0
      const int32_t vk4x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[9] - vkernel_zero_point;
23830
23831
0
      vacc0 += vi4x0 * vk4x0;
23832
0
      vacc1 += vi4x1 * vk4x1;
23833
23834
0
      const int32_t vi5x0 = (int32_t) (uint32_t) i5[0];
23835
0
      const int32_t vi5x1 = (int32_t) (uint32_t) i5[1];
23836
0
      i5 += 2;
23837
23838
0
      const int32_t vk5x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[10] - vkernel_zero_point;
23839
0
      const int32_t vk5x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[11] - vkernel_zero_point;
23840
23841
0
      vacc0 += vi5x0 * vk5x0;
23842
0
      vacc1 += vi5x1 * vk5x1;
23843
23844
0
      const int32_t vi6x0 = (int32_t) (uint32_t) i6[0];
23845
0
      const int32_t vi6x1 = (int32_t) (uint32_t) i6[1];
23846
0
      i6 += 2;
23847
23848
0
      const int32_t vk6x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[12] - vkernel_zero_point;
23849
0
      const int32_t vk6x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[13] - vkernel_zero_point;
23850
23851
0
      vacc0 += vi6x0 * vk6x0;
23852
0
      vacc1 += vi6x1 * vk6x1;
23853
23854
0
      const int32_t vi7x0 = (int32_t) (uint32_t) i7[0];
23855
0
      const int32_t vi7x1 = (int32_t) (uint32_t) i7[1];
23856
0
      i7 += 2;
23857
23858
0
      const int32_t vk7x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[14] - vkernel_zero_point;
23859
0
      const int32_t vk7x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[15] - vkernel_zero_point;
23860
23861
0
      vacc0 += vi7x0 * vk7x0;
23862
0
      vacc1 += vi7x1 * vk7x1;
23863
23864
0
      const int32_t vi8x0 = (int32_t) (uint32_t) i8[0];
23865
0
      const int32_t vi8x1 = (int32_t) (uint32_t) i8[1];
23866
0
      i8 += 2;
23867
23868
0
      const int32_t vk8x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[16] - vkernel_zero_point;
23869
0
      const int32_t vk8x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[17] - vkernel_zero_point;
23870
23871
0
      vacc0 += vi8x0 * vk8x0;
23872
0
      vacc1 += vi8x1 * vk8x1;
23873
23874
0
      w = (const void*) ((uintptr_t) w + 2 * sizeof(int32_t) + 18 * sizeof(uint8_t));
23875
23876
0
      float vfpacc0 = (float) vacc0;
23877
0
      float vfpacc1 = (float) vacc1;
23878
23879
0
      vfpacc0 *= vscale;
23880
0
      vfpacc1 *= vscale;
23881
23882
0
      vfpacc0 += vmagic_bias;
23883
0
      vfpacc1 += vmagic_bias;
23884
23885
0
      int32_t vout0 = (int32_t) float_as_uint32(vfpacc0);
23886
0
      int32_t vout1 = (int32_t) float_as_uint32(vfpacc1);
23887
23888
0
      vout0 = math_max_s32(vout0, vmagic_min);
23889
0
      vout1 = math_max_s32(vout1, vmagic_min);
23890
23891
0
      vout0 = math_min_s32(vout0, vmagic_max);
23892
0
      vout1 = math_min_s32(vout1, vmagic_max);
23893
23894
0
      vout0 -= vmagic_bias_less_zero_point;
23895
0
      vout1 -= vmagic_bias_less_zero_point;
23896
23897
0
      output[0] = (uint8_t) vout0;
23898
0
      output[1] = (uint8_t) vout1;
23899
0
      output += 2;
23900
0
    }
23901
0
    if XNN_UNLIKELY(c != 0) {
23902
0
      int32_t vacc = unaligned_load_s32(w);
23903
23904
0
      const int32_t vi0 = (int32_t) (uint32_t) *i0;
23905
0
      const int32_t vk0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[0] - vkernel_zero_point;
23906
0
      vacc += vi0 * vk0;
23907
0
      const int32_t vi1 = (int32_t) (uint32_t) *i1;
23908
0
      const int32_t vk1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[2] - vkernel_zero_point;
23909
0
      vacc += vi1 * vk1;
23910
0
      const int32_t vi2 = (int32_t) (uint32_t) *i2;
23911
0
      const int32_t vk2 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[4] - vkernel_zero_point;
23912
0
      vacc += vi2 * vk2;
23913
0
      const int32_t vi3 = (int32_t) (uint32_t) *i3;
23914
0
      const int32_t vk3 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[6] - vkernel_zero_point;
23915
0
      vacc += vi3 * vk3;
23916
0
      const int32_t vi4 = (int32_t) (uint32_t) *i4;
23917
0
      const int32_t vk4 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[8] - vkernel_zero_point;
23918
0
      vacc += vi4 * vk4;
23919
0
      const int32_t vi5 = (int32_t) (uint32_t) *i5;
23920
0
      const int32_t vk5 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[10] - vkernel_zero_point;
23921
0
      vacc += vi5 * vk5;
23922
0
      const int32_t vi6 = (int32_t) (uint32_t) *i6;
23923
0
      const int32_t vk6 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[12] - vkernel_zero_point;
23924
0
      vacc += vi6 * vk6;
23925
0
      const int32_t vi7 = (int32_t) (uint32_t) *i7;
23926
0
      const int32_t vk7 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[14] - vkernel_zero_point;
23927
0
      vacc += vi7 * vk7;
23928
0
      const int32_t vi8 = (int32_t) (uint32_t) *i8;
23929
0
      const int32_t vk8 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[16] - vkernel_zero_point;
23930
0
      vacc += vi8 * vk8;
23931
23932
0
      float vfpacc = (float) vacc * vscale;
23933
23934
0
      vfpacc += vmagic_bias;
23935
0
      int32_t vout = (int32_t) float_as_uint32(vfpacc);
23936
0
      vout = math_max_s32(vout, vmagic_min);
23937
0
      vout = math_min_s32(vout, vmagic_max);
23938
0
      vout -= vmagic_bias_less_zero_point;
23939
23940
0
      *output++ = (uint8_t) vout;
23941
0
    }
23942
23943
0
    output = (uint8_t*) ((uintptr_t) output + output_increment);
23944
0
  } while (--output_width != 0);
23945
0
}
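The `_imagic` suffix on the kernel above means the clamping moves into the integer domain: the magic bias is added before any clamping, the reinterpreted bits are clamped against precomputed `magic_min`/`magic_max` patterns, and the bias (with the zero point folded in) is subtracted last. A self-contained sketch under the same illustrative assumptions as before (full [0, 255] output range):

```c
#include <stdint.h>
#include <string.h>

// "imagic" requantization sketch: round via the magic bias first, then clamp
// the reinterpreted bit patterns as signed integers.
static uint8_t requantize_imagic(int32_t acc, float scale, int32_t output_zero_point) {
  const float magic_bias = 12582912.0f;  // 0x1.8p+23
  const float fmin = magic_bias + (float) (0 - output_zero_point);
  const float fmax = magic_bias + (float) (255 - output_zero_point);
  float fpacc = (float) acc * scale + magic_bias;

  int32_t magic_min, magic_max, magic_bias_bits, out;
  memcpy(&magic_min, &fmin, sizeof(magic_min));
  memcpy(&magic_max, &fmax, sizeof(magic_max));
  memcpy(&magic_bias_bits, &magic_bias, sizeof(magic_bias_bits));
  memcpy(&out, &fpacc, sizeof(out));

  out = out < magic_min ? magic_min : out;
  out = out > magic_max ? magic_max : out;
  return (uint8_t) (out - (magic_bias_bits - output_zero_point));
}
```

The arithmetic matches the fmagic tail shown earlier; only where the clamping happens differs.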
23946
23947
void xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf(
23948
    size_t channels,
23949
    size_t output_width,
23950
    const uint8_t** input,
23951
    const void* weights,
23952
    uint8_t* output,
23953
    intptr_t input_stride,
23954
    size_t output_increment,
23955
    size_t input_offset,
23956
    const uint8_t* zero,
23957
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
23958
0
{
23959
0
  assert(channels != 0);
23960
0
  assert(output_width != 0);
23961
23962
0
  const float vscale = params->fp32_scalar_lrintf.scale;
23963
0
  const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
23964
0
  const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
23965
0
  const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
23966
0
  const int32_t vkernel_zero_point = params->fp32_scalar_lrintf.kernel_zero_point;
23967
0
  do {
23968
0
    const uint8_t* i0 = input[0];
23969
0
    assert(i0 != NULL);
23970
0
    if XNN_UNPREDICTABLE(i0 != zero) {
23971
0
      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
23972
0
    }
23973
0
    const uint8_t* i1 = input[1];
23974
0
    assert(i1 != NULL);
23975
0
    if XNN_UNPREDICTABLE(i1 != zero) {
23976
0
      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
23977
0
    }
23978
0
    const uint8_t* i2 = input[2];
23979
0
    assert(i2 != NULL);
23980
0
    if XNN_UNPREDICTABLE(i2 != zero) {
23981
0
      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
23982
0
    }
23983
0
    const uint8_t* i3 = input[3];
23984
0
    assert(i3 != NULL);
23985
0
    if XNN_UNPREDICTABLE(i3 != zero) {
23986
0
      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
23987
0
    }
23988
0
    const uint8_t* i4 = input[4];
23989
0
    assert(i4 != NULL);
23990
0
    if XNN_UNPREDICTABLE(i4 != zero) {
23991
0
      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
23992
0
    }
23993
0
    const uint8_t* i5 = input[5];
23994
0
    assert(i5 != NULL);
23995
0
    if XNN_UNPREDICTABLE(i5 != zero) {
23996
0
      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
23997
0
    }
23998
0
    const uint8_t* i6 = input[6];
23999
0
    assert(i6 != NULL);
24000
0
    if XNN_UNPREDICTABLE(i6 != zero) {
24001
0
      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
24002
0
    }
24003
0
    const uint8_t* i7 = input[7];
24004
0
    assert(i7 != NULL);
24005
0
    if XNN_UNPREDICTABLE(i7 != zero) {
24006
0
      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
24007
0
    }
24008
0
    const uint8_t* i8 = input[8];
24009
0
    assert(i8 != NULL);
24010
0
    if XNN_UNPREDICTABLE(i8 != zero) {
24011
0
      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
24012
0
    }
24013
0
    input = (const uint8_t**) ((uintptr_t) input + input_stride);
24014
24015
0
    size_t c = channels;
24016
0
    const void* w = weights;
24017
0
    for (; c >= 2; c -= 2) {
24018
0
      int32_t vacc0 = unaligned_indexed_load_s32(w, 0);
24019
0
      int32_t vacc1 = unaligned_indexed_load_s32(w, 1);
24020
24021
24022
0
      const int32_t vi0x0 = (int32_t) (uint32_t) i0[0];
24023
0
      const int32_t vi0x1 = (int32_t) (uint32_t) i0[1];
24024
0
      i0 += 2;
24025
24026
0
      const int32_t vk0x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[0] - vkernel_zero_point;
24027
0
      const int32_t vk0x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[1] - vkernel_zero_point;
24028
24029
0
      vacc0 += vi0x0 * vk0x0;
24030
0
      vacc1 += vi0x1 * vk0x1;
24031
24032
0
      const int32_t vi1x0 = (int32_t) (uint32_t) i1[0];
24033
0
      const int32_t vi1x1 = (int32_t) (uint32_t) i1[1];
24034
0
      i1 += 2;
24035
24036
0
      const int32_t vk1x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[2] - vkernel_zero_point;
24037
0
      const int32_t vk1x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[3] - vkernel_zero_point;
24038
24039
0
      vacc0 += vi1x0 * vk1x0;
24040
0
      vacc1 += vi1x1 * vk1x1;
24041
24042
0
      const int32_t vi2x0 = (int32_t) (uint32_t) i2[0];
24043
0
      const int32_t vi2x1 = (int32_t) (uint32_t) i2[1];
24044
0
      i2 += 2;
24045
24046
0
      const int32_t vk2x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[4] - vkernel_zero_point;
24047
0
      const int32_t vk2x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[5] - vkernel_zero_point;
24048
24049
0
      vacc0 += vi2x0 * vk2x0;
24050
0
      vacc1 += vi2x1 * vk2x1;
24051
24052
0
      const int32_t vi3x0 = (int32_t) (uint32_t) i3[0];
24053
0
      const int32_t vi3x1 = (int32_t) (uint32_t) i3[1];
24054
0
      i3 += 2;
24055
24056
0
      const int32_t vk3x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[6] - vkernel_zero_point;
24057
0
      const int32_t vk3x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[7] - vkernel_zero_point;
24058
24059
0
      vacc0 += vi3x0 * vk3x0;
24060
0
      vacc1 += vi3x1 * vk3x1;
24061
24062
0
      const int32_t vi4x0 = (int32_t) (uint32_t) i4[0];
24063
0
      const int32_t vi4x1 = (int32_t) (uint32_t) i4[1];
24064
0
      i4 += 2;
24065
24066
0
      const int32_t vk4x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[8] - vkernel_zero_point;
24067
0
      const int32_t vk4x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[9] - vkernel_zero_point;
24068
24069
0
      vacc0 += vi4x0 * vk4x0;
24070
0
      vacc1 += vi4x1 * vk4x1;
24071
24072
0
      const int32_t vi5x0 = (int32_t) (uint32_t) i5[0];
24073
0
      const int32_t vi5x1 = (int32_t) (uint32_t) i5[1];
24074
0
      i5 += 2;
24075
24076
0
      const int32_t vk5x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[10] - vkernel_zero_point;
24077
0
      const int32_t vk5x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[11] - vkernel_zero_point;
24078
24079
0
      vacc0 += vi5x0 * vk5x0;
24080
0
      vacc1 += vi5x1 * vk5x1;
24081
24082
0
      const int32_t vi6x0 = (int32_t) (uint32_t) i6[0];
24083
0
      const int32_t vi6x1 = (int32_t) (uint32_t) i6[1];
24084
0
      i6 += 2;
24085
24086
0
      const int32_t vk6x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[12] - vkernel_zero_point;
24087
0
      const int32_t vk6x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[13] - vkernel_zero_point;
24088
24089
0
      vacc0 += vi6x0 * vk6x0;
24090
0
      vacc1 += vi6x1 * vk6x1;
24091
24092
0
      const int32_t vi7x0 = (int32_t) (uint32_t) i7[0];
24093
0
      const int32_t vi7x1 = (int32_t) (uint32_t) i7[1];
24094
0
      i7 += 2;
24095
24096
0
      const int32_t vk7x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[14] - vkernel_zero_point;
24097
0
      const int32_t vk7x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[15] - vkernel_zero_point;
24098
24099
0
      vacc0 += vi7x0 * vk7x0;
24100
0
      vacc1 += vi7x1 * vk7x1;
24101
24102
0
      const int32_t vi8x0 = (int32_t) (uint32_t) i8[0];
24103
0
      const int32_t vi8x1 = (int32_t) (uint32_t) i8[1];
24104
0
      i8 += 2;
24105
24106
0
      const int32_t vk8x0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[16] - vkernel_zero_point;
24107
0
      const int32_t vk8x1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[17] - vkernel_zero_point;
24108
24109
0
      vacc0 += vi8x0 * vk8x0;
24110
0
      vacc1 += vi8x1 * vk8x1;
24111
24112
0
      w = (const void*) ((uintptr_t) w + 2 * sizeof(int32_t) + 18 * sizeof(uint8_t));
24113
24114
0
      float vfpacc0 = (float) vacc0;
24115
0
      float vfpacc1 = (float) vacc1;
24116
24117
0
      vfpacc0 *= vscale;
24118
0
      vfpacc1 *= vscale;
24119
24120
0
      vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point);
24121
0
      vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point);
24122
24123
0
      vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point);
24124
0
      vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point);
24125
24126
0
      const int32_t vrndacc0 = (int32_t) lrintf(vfpacc0);
24127
0
      const int32_t vrndacc1 = (int32_t) lrintf(vfpacc1);
24128
24129
0
      int32_t vout0 = (int32_t) vrndacc0 + voutput_zero_point;
24130
0
      int32_t vout1 = (int32_t) vrndacc1 + voutput_zero_point;
24131
24132
0
      output[0] = (uint8_t) vout0;
24133
0
      output[1] = (uint8_t) vout1;
24134
0
      output += 2;
24135
0
    }
24136
0
    if XNN_UNLIKELY(c != 0) {
24137
0
      int32_t vacc = unaligned_load_s32(w);
24138
24139
0
      const int32_t vi0 = (int32_t) (uint32_t) *i0;
24140
0
      const int32_t vk0 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[0] - vkernel_zero_point;
24141
0
      vacc += vi0 * vk0;
24142
0
      const int32_t vi1 = (int32_t) (uint32_t) *i1;
24143
0
      const int32_t vk1 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[2] - vkernel_zero_point;
24144
0
      vacc += vi1 * vk1;
24145
0
      const int32_t vi2 = (int32_t) (uint32_t) *i2;
24146
0
      const int32_t vk2 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[4] - vkernel_zero_point;
24147
0
      vacc += vi2 * vk2;
24148
0
      const int32_t vi3 = (int32_t) (uint32_t) *i3;
24149
0
      const int32_t vk3 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[6] - vkernel_zero_point;
24150
0
      vacc += vi3 * vk3;
24151
0
      const int32_t vi4 = (int32_t) (uint32_t) *i4;
24152
0
      const int32_t vk4 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[8] - vkernel_zero_point;
24153
0
      vacc += vi4 * vk4;
24154
0
      const int32_t vi5 = (int32_t) (uint32_t) *i5;
24155
0
      const int32_t vk5 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[10] - vkernel_zero_point;
24156
0
      vacc += vi5 * vk5;
24157
0
      const int32_t vi6 = (int32_t) (uint32_t) *i6;
24158
0
      const int32_t vk6 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[12] - vkernel_zero_point;
24159
0
      vacc += vi6 * vk6;
24160
0
      const int32_t vi7 = (int32_t) (uint32_t) *i7;
24161
0
      const int32_t vk7 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[14] - vkernel_zero_point;
24162
0
      vacc += vi7 * vk7;
24163
0
      const int32_t vi8 = (int32_t) (uint32_t) *i8;
24164
0
      const int32_t vk8 = (int32_t) (uint32_t) ((const uint8_t*) ((uintptr_t) w + 2 * sizeof(int32_t)))[16] - vkernel_zero_point;
24165
0
      vacc += vi8 * vk8;
24166
24167
0
      float vfpacc = (float) vacc * vscale;
24168
24169
0
      vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
24170
0
      vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
24171
0
      const int32_t vrndacc = (int32_t) lrintf(vfpacc);
24172
0
      int32_t vout = vrndacc + voutput_zero_point;
24173
24174
0
      *output++ = (uint8_t) vout;
24175
0
    }
24176
24177
0
    output = (uint8_t*) ((uintptr_t) output + output_increment);
24178
0
  } while (--output_width != 0);
24179
0
}
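The `_lrintf` variant above drops the magic-bias trick and leans on the C library instead: clamp the scaled accumulator in float, round with `lrintf()`, then add the output zero point back. As a sketch, again hard-coding the full uint8 range for illustration:

```c
#include <math.h>
#include <stdint.h>

// "lrintf" requantization sketch: clamp in float with the zero point folded
// into the bounds, round to nearest (even), then restore the zero point.
static uint8_t requantize_lrintf(int32_t acc, float scale, int32_t output_zero_point) {
  const float output_min_less_zero_point = (float) (0 - output_zero_point);
  const float output_max_less_zero_point = (float) (255 - output_zero_point);

  float fpacc = (float) acc * scale;
  fpacc = fpacc < output_min_less_zero_point ? output_min_less_zero_point : fpacc;
  fpacc = fpacc > output_max_less_zero_point ? output_max_less_zero_point : fpacc;

  const int32_t rounded = (int32_t) lrintf(fpacc);
  return (uint8_t) (rounded + output_zero_point);
}
```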
24180
24181
void xnn_qu8_f32_vcvt_ukernel__scalar_u1(
24182
    size_t batch,
24183
    const uint8_t* input,
24184
    float* output,
24185
    const union xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
24186
0
{
24187
0
  assert(batch != 0);
24188
0
  assert(batch % sizeof(uint8_t) == 0);
24189
0
  assert(input != NULL);
24190
0
  assert(output != NULL);
24191
24192
0
  const int32_t vzero_point = params->scalar.zero_point;
24193
0
  const float vscale = params->scalar.scale;
24194
24195
0
  do {
24196
0
    int32_t vx = *input++;
24197
0
    vx -= vzero_point;
24198
24199
0
    float vy = (float) vx;
24200
0
    vy *= vscale;
24201
0
    *output++ = vy;
24202
24203
0
    batch -= sizeof(uint8_t);
24204
0
  } while (batch != 0);
24205
0
}
24206
24207
void xnn_qu8_f32_vcvt_ukernel__scalar_u4(
24208
    size_t batch,
24209
    const uint8_t* input,
24210
    float* output,
24211
    const union xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
24212
0
{
24213
0
  assert(batch != 0);
24214
0
  assert(batch % sizeof(uint8_t) == 0);
24215
0
  assert(input != NULL);
24216
0
  assert(output != NULL);
24217
24218
0
  const int32_t vzero_point = params->scalar.zero_point;
24219
0
  const float vscale = params->scalar.scale;
24220
24221
0
  for (; batch >= 4 * sizeof(uint8_t); batch -= 4 * sizeof(uint8_t)) {
24222
0
    int32_t vx0 = (int32_t) input[0];
24223
0
    int32_t vx1 = (int32_t) input[1];
24224
0
    int32_t vx2 = (int32_t) input[2];
24225
0
    int32_t vx3 = (int32_t) input[3];
24226
0
    input += 4;
24227
24228
0
    vx0 -= vzero_point;
24229
0
    vx1 -= vzero_point;
24230
0
    vx2 -= vzero_point;
24231
0
    vx3 -= vzero_point;
24232
24233
0
    float vy0 = (float) vx0;
24234
0
    float vy1 = (float) vx1;
24235
0
    float vy2 = (float) vx2;
24236
0
    float vy3 = (float) vx3;
24237
24238
0
    vy0 *= vscale;
24239
0
    vy1 *= vscale;
24240
0
    vy2 *= vscale;
24241
0
    vy3 *= vscale;
24242
24243
0
    output[0] = vy0;
24244
0
    output[1] = vy1;
24245
0
    output[2] = vy2;
24246
0
    output[3] = vy3;
24247
0
    output += 4;
24248
0
  }
24249
0
  if XNN_UNLIKELY(batch != 0) {
24250
0
    do {
24251
0
      int32_t vx = *input++;
24252
0
      vx -= vzero_point;
24253
24254
0
      float vy = (float) vx;
24255
0
      vy *= vscale;
24256
0
      *output++ = vy;
24257
24258
0
      batch -= sizeof(uint8_t);
24259
0
    } while (batch != 0);
24260
0
  }
24261
0
}
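Both `xnn_qu8_f32_vcvt` variants above compute the same dequantization, y = (x - zero_point) * scale; the `_u4` body simply unrolls it four elements at a time and reuses the one-element loop for the remainder. A tiny driver with made-up quantization parameters shows the same arithmetic:

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
  // Illustrative quantization parameters, not taken from any real model.
  const int32_t zero_point = 128;
  const float scale = 0.5f;

  const uint8_t input[5] = {0, 127, 128, 129, 255};
  float output[5];

  // Same per-element arithmetic as the scalar vcvt kernels: subtract the zero
  // point as an integer, then scale in float.
  for (int i = 0; i < 5; i++) {
    output[i] = (float) ((int32_t) input[i] - zero_point) * scale;
  }
  for (int i = 0; i < 5; i++) {
    printf("%3u -> %g\n", input[i], output[i]);
  }
  return 0;
}
```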
24262
24263
void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1(
24264
    size_t rows,
24265
    size_t channels,
24266
    const uint8_t* input,
24267
    size_t input_stride,
24268
    const uint8_t* zero,
24269
    int32_t* buffer,
24270
    uint8_t* output,
24271
    const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
24272
0
{
24273
0
  assert(rows > 7);
24274
0
  assert(channels != 0);
24275
24276
0
  const uint8_t* i0 = input;
24277
0
  const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
24278
0
  const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
24279
0
  const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
24280
0
  const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
24281
0
  const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
24282
0
  const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
24283
0
  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1) * sizeof(uint8_t);
24284
24285
0
  const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
24286
0
  int32_t* b = buffer;
24287
0
  size_t c = channels;
24288
0
  do {
24289
0
    int32_t vacc = vinit_bias;
24290
0
    const int32_t vi0 = (int32_t) *i0++;
24291
0
    const int32_t vi1 = (int32_t) *i1++;
24292
24293
0
    vacc += vi0;
24294
0
    const int32_t vi2 = (int32_t) *i2++;
24295
0
    vacc += vi1;
24296
0
    const int32_t vi3 = (int32_t) *i3++;
24297
0
    vacc += vi2;
24298
0
    const int32_t vi4 = (int32_t) *i4++;
24299
0
    vacc += vi3;
24300
0
    const int32_t vi5 = (int32_t) *i5++;
24301
0
    vacc += vi4;
24302
0
    const int32_t vi6 = (int32_t) *i6++;
24303
24304
0
    vacc += vi5;
24305
0
    vacc += vi6;
24306
24307
0
    *b++ = vacc;
24308
0
  } while (--c != 0);
24309
24310
0
  for (rows -= 7; rows > 7; rows -= 7) {
24311
0
    i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
24312
0
    i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
24313
0
    i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
24314
0
    i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
24315
0
    i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
24316
0
    i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
24317
0
    i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
24318
24319
0
    int32_t* b = buffer;
24320
0
    size_t c = channels;
24321
0
    do {
24322
0
      int32_t vacc = *b;
24323
0
      const int32_t vi0 = (int32_t) *i0++;
24324
0
      const int32_t vi1 = (int32_t) *i1++;
24325
24326
0
      vacc += vi0;
24327
0
      const int32_t vi2 = (int32_t) *i2++;
24328
0
      vacc += vi1;
24329
0
      const int32_t vi3 = (int32_t) *i3++;
24330
0
      vacc += vi2;
24331
0
      const int32_t vi4 = (int32_t) *i4++;
24332
0
      vacc += vi3;
24333
0
      const int32_t vi5 = (int32_t) *i5++;
24334
0
      vacc += vi4;
24335
0
      const int32_t vi6 = (int32_t) *i6++;
24336
24337
0
      vacc += vi5;
24338
0
      vacc += vi6;
24339
24340
0
      *b++ = vacc;
24341
0
    } while (--c != 0);
24342
0
  }
24343
24344
0
  i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
24345
0
  i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
24346
0
  if XNN_UNPREDICTABLE(rows < 2) {
24347
0
    i1 = zero;
24348
0
  }
24349
0
  i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
24350
0
  if XNN_UNPREDICTABLE(rows <= 2) {
24351
0
    i2 = zero;
24352
0
  }
24353
0
  i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
24354
0
  if XNN_UNPREDICTABLE(rows < 4) {
24355
0
    i3 = zero;
24356
0
  }
24357
0
  i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
24358
0
  if XNN_UNPREDICTABLE(rows <= 4) {
24359
0
    i4 = zero;
24360
0
  }
24361
0
  i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
24362
0
  if XNN_UNPREDICTABLE(rows < 6) {
24363
0
    i5 = zero;
24364
0
  }
24365
0
  i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
24366
0
  if XNN_UNPREDICTABLE(rows <= 6) {
24367
0
    i6 = zero;
24368
0
  }
24369
24370
0
  const float vscale = params->fp32_scalar_imagic.scale;
24371
0
  const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
24372
0
  const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
24373
0
  const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
24374
0
  const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
24375
0
  do {
24376
0
    int32_t vacc = *buffer++;
24377
0
    const int32_t vi0 = (int32_t) *i0++;
24378
0
    const int32_t vi1 = (int32_t) *i1++;
24379
24380
0
    vacc += vi0;
24381
0
    const int32_t vi2 = (int32_t) *i2++;
24382
0
    vacc += vi1;
24383
0
    const int32_t vi3 = (int32_t) *i3++;
24384
0
    vacc += vi2;
24385
0
    const int32_t vi4 = (int32_t) *i4++;
24386
0
    vacc += vi3;
24387
0
    const int32_t vi5 = (int32_t) *i5++;
24388
0
    vacc += vi4;
24389
0
    const int32_t vi6 = (int32_t) *i6++;
24390
24391
0
    vacc += vi5;
24392
0
    vacc += vi6;
24393
24394
0
    float vfpacc = (float) vacc * vscale;
24395
0
    vfpacc += vmagic_bias;
24396
0
    int32_t vout = (int32_t) float_as_uint32(vfpacc);
24397
0
    vout = math_max_s32(vout, vmagic_min);
24398
0
    vout = math_min_s32(vout, vmagic_max);
24399
0
    vout -= vmagic_bias_less_zero_point;
24400
24401
0
    *output++ = (uint8_t) vout;
24402
0
  } while (--channels != 0);
24403
0
}
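The `7p7x` gavgpool kernels are multipass: the first pass sums rows 0..6 (plus `init_bias`) into an int32 buffer, each middle pass folds in seven more rows, and the final pass handles the at-most-seven remaining rows (padding with the `zero` row) before requantizing. A simplified accumulation-only sketch of that loop structure, leaving out the bias, the pointer-increment bookkeeping, and the requantization tail:

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

// Accumulation skeleton of a "7p7x" multipass reduction over a rows x channels
// block of uint8 data. The real kernel seeds the buffer with init_bias and
// finishes with a requantization pass; both are omitted here.
static void accumulate_7p7x(size_t rows, size_t channels,
                            const uint8_t* input, size_t input_stride,
                            int32_t* buffer) {
  assert(rows > 7);

  // First pass: sum rows 0..6 into the buffer.
  for (size_t c = 0; c < channels; c++) {
    int32_t acc = 0;
    for (size_t r = 0; r < 7; r++) {
      acc += (int32_t) input[r * input_stride + c];
    }
    buffer[c] = acc;
  }
  input += 7 * input_stride;

  // Middle passes: add seven more rows per pass while more than seven remain.
  for (rows -= 7; rows > 7; rows -= 7) {
    for (size_t c = 0; c < channels; c++) {
      int32_t acc = buffer[c];
      for (size_t r = 0; r < 7; r++) {
        acc += (int32_t) input[r * input_stride + c];
      }
      buffer[c] = acc;
    }
    input += 7 * input_stride;
  }
  // The remaining 1..7 rows are added in the final pass, where rows past the
  // end are read from the caller-provided zero row instead.
}
```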
24404
24405
void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4(
24406
    size_t rows,
24407
    size_t channels,
24408
    const uint8_t* input,
24409
    size_t input_stride,
24410
    const uint8_t* zero,
24411
    int32_t* buffer,
24412
    uint8_t* output,
24413
    const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
24414
0
{
24415
0
  assert(rows > 7);
24416
0
  assert(channels != 0);
24417
24418
0
  const uint8_t* i0 = input;
24419
0
  const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
24420
0
  const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
24421
0
  const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
24422
0
  const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
24423
0
  const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
24424
0
  const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
24425
0
  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4) * sizeof(uint8_t);
24426
24427
0
  const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
24428
0
  int32_t* b = buffer;
24429
0
  for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) {
24430
0
    const int32_t vi0x0 = (int32_t) i0[0];
24431
0
    const int32_t vi0x1 = (int32_t) i0[1];
24432
0
    const int32_t vi0x2 = (int32_t) i0[2];
24433
0
    const int32_t vi0x3 = (int32_t) i0[3];
24434
0
    i0 += 4;
24435
24436
0
    int32_t vacc0 = vi0x0 + vinit_bias;
24437
0
    const int32_t vi1x0 = (int32_t) i1[0];
24438
0
    int32_t vacc1 = vi0x1 + vinit_bias;
24439
0
    const int32_t vi1x1 = (int32_t) i1[1];
24440
0
    int32_t vacc2 = vi0x2 + vinit_bias;
24441
0
    const int32_t vi1x2 = (int32_t) i1[2];
24442
0
    int32_t vacc3 = vi0x3 + vinit_bias;
24443
0
    const int32_t vi1x3 = (int32_t) i1[3];
24444
0
    i1 += 4;
24445
24446
0
    vacc0 += vi1x0;
24447
0
    const int32_t vi2x0 = (int32_t) i2[0];
24448
0
    vacc1 += vi1x1;
24449
0
    const int32_t vi2x1 = (int32_t) i2[1];
24450
0
    vacc2 += vi1x2;
24451
0
    const int32_t vi2x2 = (int32_t) i2[2];
24452
0
    vacc3 += vi1x3;
24453
0
    const int32_t vi2x3 = (int32_t) i2[3];
24454
0
    i2 += 4;
24455
0
    vacc0 += vi2x0;
24456
0
    const int32_t vi3x0 = (int32_t) i3[0];
24457
0
    vacc1 += vi2x1;
24458
0
    const int32_t vi3x1 = (int32_t) i3[1];
24459
0
    vacc2 += vi2x2;
24460
0
    const int32_t vi3x2 = (int32_t) i3[2];
24461
0
    vacc3 += vi2x3;
24462
0
    const int32_t vi3x3 = (int32_t) i3[3];
24463
0
    i3 += 4;
24464
0
    vacc0 += vi3x0;
24465
0
    const int32_t vi4x0 = (int32_t) i4[0];
24466
0
    vacc1 += vi3x1;
24467
0
    const int32_t vi4x1 = (int32_t) i4[1];
24468
0
    vacc2 += vi3x2;
24469
0
    const int32_t vi4x2 = (int32_t) i4[2];
24470
0
    vacc3 += vi3x3;
24471
0
    const int32_t vi4x3 = (int32_t) i4[3];
24472
0
    i4 += 4;
24473
0
    vacc0 += vi4x0;
24474
0
    const int32_t vi5x0 = (int32_t) i5[0];
24475
0
    vacc1 += vi4x1;
24476
0
    const int32_t vi5x1 = (int32_t) i5[1];
24477
0
    vacc2 += vi4x2;
24478
0
    const int32_t vi5x2 = (int32_t) i5[2];
24479
0
    vacc3 += vi4x3;
24480
0
    const int32_t vi5x3 = (int32_t) i5[3];
24481
0
    i5 += 4;
24482
0
    vacc0 += vi5x0;
24483
0
    const int32_t vi6x0 = (int32_t) i6[0];
24484
0
    vacc1 += vi5x1;
24485
0
    const int32_t vi6x1 = (int32_t) i6[1];
24486
0
    vacc2 += vi5x2;
24487
0
    const int32_t vi6x2 = (int32_t) i6[2];
24488
0
    vacc3 += vi5x3;
24489
0
    const int32_t vi6x3 = (int32_t) i6[3];
24490
0
    i6 += 4;
24491
24492
0
    vacc0 += vi6x0;
24493
0
    vacc1 += vi6x1;
24494
0
    vacc2 += vi6x2;
24495
0
    vacc3 += vi6x3;
24496
24497
0
    b[0] = vacc0;
24498
0
    b[1] = vacc1;
24499
0
    b[2] = vacc2;
24500
0
    b[3] = vacc3;
24501
0
    b += 4;
24502
0
  }
24503
24504
0
  for (rows -= 7; rows > 7; rows -= 7) {
24505
0
    i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
24506
0
    i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
24507
0
    i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
24508
0
    i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
24509
0
    i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
24510
0
    i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
24511
0
    i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
24512
24513
0
    int32_t* b = buffer;
24514
0
    for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) {
24515
0
      int32_t vacc0 = b[0];
24516
0
      const int32_t vi0x0 = (int32_t) i0[0];
24517
0
      int32_t vacc1 = b[1];
24518
0
      const int32_t vi0x1 = (int32_t) i0[1];
24519
0
      int32_t vacc2 = b[2];
24520
0
      const int32_t vi0x2 = (int32_t) i0[2];
24521
0
      int32_t vacc3 = b[3];
24522
0
      const int32_t vi0x3 = (int32_t) i0[3];
24523
0
      i0 += 4;
24524
24525
0
      vacc0 += vi0x0;
24526
0
      const int32_t vi1x0 = (int32_t) i1[0];
24527
0
      vacc1 += vi0x1;
24528
0
      const int32_t vi1x1 = (int32_t) i1[1];
24529
0
      vacc2 += vi0x2;
24530
0
      const int32_t vi1x2 = (int32_t) i1[2];
24531
0
      vacc3 += vi0x3;
24532
0
      const int32_t vi1x3 = (int32_t) i1[3];
24533
0
      i1 += 4;
24534
0
      vacc0 += vi1x0;
24535
0
      const int32_t vi2x0 = (int32_t) i2[0];
24536
0
      vacc1 += vi1x1;
24537
0
      const int32_t vi2x1 = (int32_t) i2[1];
24538
0
      vacc2 += vi1x2;
24539
0
      const int32_t vi2x2 = (int32_t) i2[2];
24540
0
      vacc3 += vi1x3;
24541
0
      const int32_t vi2x3 = (int32_t) i2[3];
24542
0
      i2 += 4;
24543
0
      vacc0 += vi2x0;
24544
0
      const int32_t vi3x0 = (int32_t) i3[0];
24545
0
      vacc1 += vi2x1;
24546
0
      const int32_t vi3x1 = (int32_t) i3[1];
24547
0
      vacc2 += vi2x2;
24548
0
      const int32_t vi3x2 = (int32_t) i3[2];
24549
0
      vacc3 += vi2x3;
24550
0
      const int32_t vi3x3 = (int32_t) i3[3];
24551
0
      i3 += 4;
24552
0
      vacc0 += vi3x0;
24553
0
      const int32_t vi4x0 = (int32_t) i4[0];
24554
0
      vacc1 += vi3x1;
24555
0
      const int32_t vi4x1 = (int32_t) i4[1];
24556
0
      vacc2 += vi3x2;
24557
0
      const int32_t vi4x2 = (int32_t) i4[2];
24558
0
      vacc3 += vi3x3;
24559
0
      const int32_t vi4x3 = (int32_t) i4[3];
24560
0
      i4 += 4;
24561
0
      vacc0 += vi4x0;
24562
0
      const int32_t vi5x0 = (int32_t) i5[0];
24563
0
      vacc1 += vi4x1;
24564
0
      const int32_t vi5x1 = (int32_t) i5[1];
24565
0
      vacc2 += vi4x2;
24566
0
      const int32_t vi5x2 = (int32_t) i5[2];
24567
0
      vacc3 += vi4x3;
24568
0
      const int32_t vi5x3 = (int32_t) i5[3];
24569
0
      i5 += 4;
24570
0
      vacc0 += vi5x0;
24571
0
      const int32_t vi6x0 = (int32_t) i6[0];
24572
0
      vacc1 += vi5x1;
24573
0
      const int32_t vi6x1 = (int32_t) i6[1];
24574
0
      vacc2 += vi5x2;
24575
0
      const int32_t vi6x2 = (int32_t) i6[2];
24576
0
      vacc3 += vi5x3;
24577
0
      const int32_t vi6x3 = (int32_t) i6[3];
24578
0
      i6 += 4;
24579
24580
0
      vacc0 += vi6x0;
24581
0
      vacc1 += vi6x1;
24582
0
      vacc2 += vi6x2;
24583
0
      vacc3 += vi6x3;
24584
24585
0
      b[0] = vacc0;
24586
0
      b[1] = vacc1;
24587
0
      b[2] = vacc2;
24588
0
      b[3] = vacc3;
24589
0
      b += 4;
24590
0
    }
24591
0
  }
24592
24593
0
  i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
24594
0
  i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
24595
0
  if XNN_UNPREDICTABLE(rows < 2) {
24596
0
    i1 = zero;
24597
0
  }
24598
0
  i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
24599
0
  if XNN_UNPREDICTABLE(rows <= 2) {
24600
0
    i2 = zero;
24601
0
  }
24602
0
  i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
24603
0
  if XNN_UNPREDICTABLE(rows < 4) {
24604
0
    i3 = zero;
24605
0
  }
24606
0
  i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
24607
0
  if XNN_UNPREDICTABLE(rows <= 4) {
24608
0
    i4 = zero;
24609
0
  }
24610
0
  i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
24611
0
  if XNN_UNPREDICTABLE(rows < 6) {
24612
0
    i5 = zero;
24613
0
  }
24614
0
  i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
24615
0
  if XNN_UNPREDICTABLE(rows <= 6) {
24616
0
    i6 = zero;
24617
0
  }
24618
24619
0
  const float vscale = params->fp32_scalar_imagic.scale;
24620
0
  const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
24621
0
  const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
24622
0
  const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
24623
0
  const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
24624
0
  for (; channels >= 4; channels -= 4) {
24625
0
    int32_t vacc0 = buffer[0];
24626
0
    const int32_t vi0x0 = (int32_t) i0[0];
24627
0
    int32_t vacc1 = buffer[1];
24628
0
    const int32_t vi0x1 = (int32_t) i0[1];
24629
0
    int32_t vacc2 = buffer[2];
24630
0
    const int32_t vi0x2 = (int32_t) i0[2];
24631
0
    int32_t vacc3 = buffer[3];
24632
0
    const int32_t vi0x3 = (int32_t) i0[3];
24633
0
    buffer += 4;
24634
0
    i0 += 4;
24635
24636
0
    vacc0 += vi0x0;
24637
0
    const int32_t vi1x0 = (int32_t) i1[0];
24638
0
    vacc1 += vi0x1;
24639
0
    const int32_t vi1x1 = (int32_t) i1[1];
24640
0
    vacc2 += vi0x2;
24641
0
    const int32_t vi1x2 = (int32_t) i1[2];
24642
0
    vacc3 += vi0x3;
24643
0
    const int32_t vi1x3 = (int32_t) i1[3];
24644
0
    i1 += 4;
24645
0
    vacc0 += vi1x0;
24646
0
    const int32_t vi2x0 = (int32_t) i2[0];
24647
0
    vacc1 += vi1x1;
24648
0
    const int32_t vi2x1 = (int32_t) i2[1];
24649
0
    vacc2 += vi1x2;
24650
0
    const int32_t vi2x2 = (int32_t) i2[2];
24651
0
    vacc3 += vi1x3;
24652
0
    const int32_t vi2x3 = (int32_t) i2[3];
24653
0
    i2 += 4;
24654
0
    vacc0 += vi2x0;
24655
0
    const int32_t vi3x0 = (int32_t) i3[0];
24656
0
    vacc1 += vi2x1;
24657
0
    const int32_t vi3x1 = (int32_t) i3[1];
24658
0
    vacc2 += vi2x2;
24659
0
    const int32_t vi3x2 = (int32_t) i3[2];
24660
0
    vacc3 += vi2x3;
24661
0
    const int32_t vi3x3 = (int32_t) i3[3];
24662
0
    i3 += 4;
24663
0
    vacc0 += vi3x0;
24664
0
    const int32_t vi4x0 = (int32_t) i4[0];
24665
0
    vacc1 += vi3x1;
24666
0
    const int32_t vi4x1 = (int32_t) i4[1];
24667
0
    vacc2 += vi3x2;
24668
0
    const int32_t vi4x2 = (int32_t) i4[2];
24669
0
    vacc3 += vi3x3;
24670
0
    const int32_t vi4x3 = (int32_t) i4[3];
24671
0
    i4 += 4;
24672
0
    vacc0 += vi4x0;
24673
0
    const int32_t vi5x0 = (int32_t) i5[0];
24674
0
    vacc1 += vi4x1;
24675
0
    const int32_t vi5x1 = (int32_t) i5[1];
24676
0
    vacc2 += vi4x2;
24677
0
    const int32_t vi5x2 = (int32_t) i5[2];
24678
0
    vacc3 += vi4x3;
24679
0
    const int32_t vi5x3 = (int32_t) i5[3];
24680
0
    i5 += 4;
24681
0
    vacc0 += vi5x0;
24682
0
    const int32_t vi6x0 = (int32_t) i6[0];
24683
0
    vacc1 += vi5x1;
24684
0
    const int32_t vi6x1 = (int32_t) i6[1];
24685
0
    vacc2 += vi5x2;
24686
0
    const int32_t vi6x2 = (int32_t) i6[2];
24687
0
    vacc3 += vi5x3;
24688
0
    const int32_t vi6x3 = (int32_t) i6[3];
24689
0
    i6 += 4;
24690
24691
0
    vacc0 += vi6x0;
24692
0
    vacc1 += vi6x1;
24693
0
    vacc2 += vi6x2;
24694
0
    vacc3 += vi6x3;
24695
24696
0
    float vfpacc0 = (float) vacc0 * vscale;
24697
0
    float vfpacc1 = (float) vacc1 * vscale;
24698
0
    float vfpacc2 = (float) vacc2 * vscale;
24699
0
    float vfpacc3 = (float) vacc3 * vscale;
24700
24701
0
    vfpacc0 += vmagic_bias;
24702
0
    vfpacc1 += vmagic_bias;
24703
0
    vfpacc2 += vmagic_bias;
24704
0
    vfpacc3 += vmagic_bias;
24705
24706
0
    int32_t vout0 = (int32_t) float_as_uint32(vfpacc0);
24707
0
    int32_t vout1 = (int32_t) float_as_uint32(vfpacc1);
24708
0
    int32_t vout2 = (int32_t) float_as_uint32(vfpacc2);
24709
0
    int32_t vout3 = (int32_t) float_as_uint32(vfpacc3);
24710
24711
0
    vout0 = math_max_s32(vout0, vmagic_min);
24712
0
    vout1 = math_max_s32(vout1, vmagic_min);
24713
0
    vout2 = math_max_s32(vout2, vmagic_min);
24714
0
    vout3 = math_max_s32(vout3, vmagic_min);
24715
24716
0
    vout0 = math_min_s32(vout0, vmagic_max);
24717
0
    vout1 = math_min_s32(vout1, vmagic_max);
24718
0
    vout2 = math_min_s32(vout2, vmagic_max);
24719
0
    vout3 = math_min_s32(vout3, vmagic_max);
24720
24721
0
    vout0 -= vmagic_bias_less_zero_point;
24722
0
    vout1 -= vmagic_bias_less_zero_point;
24723
0
    vout2 -= vmagic_bias_less_zero_point;
24724
0
    vout3 -= vmagic_bias_less_zero_point;
24725
24726
0
    output[0] = (uint8_t) vout0;
24727
0
    output[1] = (uint8_t) vout1;
24728
0
    output[2] = (uint8_t) vout2;
24729
0
    output[3] = (uint8_t) vout3;
24730
0
    output += 4;
24731
0
  }
24732
0
  if XNN_UNLIKELY(channels != 0) {
24733
0
    do {
24734
0
      int32_t vacc = *buffer++;
24735
0
      const int32_t vi0 = (int32_t) *i0++;
24736
0
      const int32_t vi1 = (int32_t) *i1++;
24737
24738
0
      vacc += vi0;
24739
0
      const int32_t vi2 = (int32_t) *i2++;
24740
0
      vacc += vi1;
24741
0
      const int32_t vi3 = (int32_t) *i3++;
24742
0
      vacc += vi2;
24743
0
      const int32_t vi4 = (int32_t) *i4++;
24744
0
      vacc += vi3;
24745
0
      const int32_t vi5 = (int32_t) *i5++;
24746
0
      vacc += vi4;
24747
0
      const int32_t vi6 = (int32_t) *i6++;
24748
24749
0
      vacc += vi5;
24750
0
      vacc += vi6;
24751
24752
0
      float vfpacc = (float) vacc * vscale;
24753
0
      vfpacc += vmagic_bias;
24754
0
      int32_t vout = (int32_t) float_as_uint32(vfpacc);
24755
0
      vout = math_max_s32(vout, vmagic_min);
24756
0
      vout = math_min_s32(vout, vmagic_max);
24757
0
      vout -= vmagic_bias_less_zero_point;
24758
24759
0
      *output++ = (uint8_t) vout;
24760
0
    } while (--channels != 0);
24761
0
  }
24762
0
}
24763
24764
void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1(
24765
    size_t rows,
24766
    size_t channels,
24767
    const uint8_t* input,
24768
    size_t input_stride,
24769
    const uint8_t* zero,
24770
    uint8_t* output,
24771
    const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
24772
0
{
24773
0
  assert(rows != 0);
24774
0
  assert(rows <= 7);
24775
0
  assert(channels != 0);
24776
24777
0
  const uint8_t* i0 = input;
24778
0
  const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
24779
0
  if XNN_UNPREDICTABLE(rows < 2) {
24780
0
    i1 = zero;
24781
0
  }
24782
0
  const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
24783
0
  if XNN_UNPREDICTABLE(rows <= 2) {
24784
0
    i2 = zero;
24785
0
  }
24786
0
  const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
24787
0
  if XNN_UNPREDICTABLE(rows < 4) {
24788
0
    i3 = zero;
24789
0
  }
24790
0
  const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
24791
0
  if XNN_UNPREDICTABLE(rows <= 4) {
24792
0
    i4 = zero;
24793
0
  }
24794
0
  const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
24795
0
  if XNN_UNPREDICTABLE(rows < 6) {
24796
0
    i5 = zero;
24797
0
  }
24798
0
  const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
24799
0
  if XNN_UNPREDICTABLE(rows <= 6) {
24800
0
    i6 = zero;
24801
0
  }
24802
24803
0
  const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
24804
0
  const float vscale = params->fp32_scalar_imagic.scale;
24805
0
  const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
24806
0
  const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
24807
0
  const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
24808
0
  const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
24809
0
  do {
24810
0
    int32_t vacc = vinit_bias;
24811
0
    const int32_t vi0 = (int32_t) *i0++;
24812
0
    const int32_t vi1 = (int32_t) *i1++;
24813
24814
0
    vacc += vi0;
24815
0
    const int32_t vi2 = (int32_t) *i2++;
24816
0
    vacc += vi1;
24817
0
    const int32_t vi3 = (int32_t) *i3++;
24818
0
    vacc += vi2;
24819
0
    const int32_t vi4 = (int32_t) *i4++;
24820
0
    vacc += vi3;
24821
0
    const int32_t vi5 = (int32_t) *i5++;
24822
0
    vacc += vi4;
24823
0
    const int32_t vi6 = (int32_t) *i6++;
24824
24825
0
    vacc += vi5;
24826
0
    vacc += vi6;
24827
24828
0
    float vfpacc = (float) vacc * vscale;
24829
0
    vfpacc += vmagic_bias;
24830
0
    int32_t vout = (int32_t) float_as_uint32(vfpacc);
24831
0
    vout = math_max_s32(vout, vmagic_min);
24832
0
    vout = math_min_s32(vout, vmagic_max);
24833
0
    vout -= vmagic_bias_less_zero_point;
24834
24835
0
    *output++ = (uint8_t) vout;
24836
0
  } while (--channels != 0);
24837
0
}
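The single-pass `7x` kernels handle fewer than seven rows by redirecting the unused row pointers at the caller-supplied `zero` row (the `rows < 2`, `rows <= 2`, ... ladder above), so the accumulation loop always reads seven valid inputs. The same setup, written as a loop, might look like the following; the helper name is made up:

```c
#include <stddef.h>
#include <stdint.h>

// Point i[1]..i[6] at successive rows of `input`, falling back to the shared
// `zero` row once r reaches `rows`, so a fixed 7-row loop reads valid data.
static void setup_7_row_pointers(size_t rows, const uint8_t* input,
                                 size_t input_stride, const uint8_t* zero,
                                 const uint8_t* i[7]) {
  i[0] = input;
  for (size_t r = 1; r < 7; r++) {
    i[r] = (r < rows) ? i[r - 1] + input_stride : zero;
  }
}
```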
24838
24839
void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4(
24840
    size_t rows,
24841
    size_t channels,
24842
    const uint8_t* input,
24843
    size_t input_stride,
24844
    const uint8_t* zero,
24845
    uint8_t* output,
24846
    const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
24847
0
{
24848
0
  assert(rows != 0);
24849
0
  assert(rows <= 7);
24850
0
  assert(channels != 0);
24851
24852
0
  const uint8_t* i0 = input;
24853
0
  const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
24854
0
  if XNN_UNPREDICTABLE(rows < 2) {
24855
0
    i1 = zero;
24856
0
  }
24857
0
  const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
24858
0
  if XNN_UNPREDICTABLE(rows <= 2) {
24859
0
    i2 = zero;
24860
0
  }
24861
0
  const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
24862
0
  if XNN_UNPREDICTABLE(rows < 4) {
24863
0
    i3 = zero;
24864
0
  }
24865
0
  const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
24866
0
  if XNN_UNPREDICTABLE(rows <= 4) {
24867
0
    i4 = zero;
24868
0
  }
24869
0
  const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
24870
0
  if XNN_UNPREDICTABLE(rows < 6) {
24871
0
    i5 = zero;
24872
0
  }
24873
0
  const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
24874
0
  if XNN_UNPREDICTABLE(rows <= 6) {
24875
0
    i6 = zero;
24876
0
  }
24877
24878
0
  const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
24879
0
  const float vscale = params->fp32_scalar_imagic.scale;
24880
0
  const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
24881
0
  const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
24882
0
  const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
24883
0
  const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
24884
0
  for (; channels >= 4; channels -= 4) {
24885
0
    const int32_t vi0x0 = (int32_t) i0[0];
24886
0
    const int32_t vi0x1 = (int32_t) i0[1];
24887
0
    const int32_t vi0x2 = (int32_t) i0[2];
24888
0
    const int32_t vi0x3 = (int32_t) i0[3];
24889
0
    i0 += 4;
24890
24891
0
    int32_t vacc0 = vi0x0 + vinit_bias;
24892
0
    const int32_t vi1x0 = (int32_t) i1[0];
24893
0
    int32_t vacc1 = vi0x1 + vinit_bias;
24894
0
    const int32_t vi1x1 = (int32_t) i1[1];
24895
0
    int32_t vacc2 = vi0x2 + vinit_bias;
24896
0
    const int32_t vi1x2 = (int32_t) i1[2];
24897
0
    int32_t vacc3 = vi0x3 + vinit_bias;
24898
0
    const int32_t vi1x3 = (int32_t) i1[3];
24899
0
    i1 += 4;
24900
24901
0
    vacc0 += vi1x0;
24902
0
    const int32_t vi2x0 = (int32_t) i2[0];
24903
0
    vacc1 += vi1x1;
24904
0
    const int32_t vi2x1 = (int32_t) i2[1];
24905
0
    vacc2 += vi1x2;
24906
0
    const int32_t vi2x2 = (int32_t) i2[2];
24907
0
    vacc3 += vi1x3;
24908
0
    const int32_t vi2x3 = (int32_t) i2[3];
24909
0
    i2 += 4;
24910
0
    vacc0 += vi2x0;
24911
0
    const int32_t vi3x0 = (int32_t) i3[0];
24912
0
    vacc1 += vi2x1;
24913
0
    const int32_t vi3x1 = (int32_t) i3[1];
24914
0
    vacc2 += vi2x2;
24915
0
    const int32_t vi3x2 = (int32_t) i3[2];
24916
0
    vacc3 += vi2x3;
24917
0
    const int32_t vi3x3 = (int32_t) i3[3];
24918
0
    i3 += 4;
24919
0
    vacc0 += vi3x0;
24920
0
    const int32_t vi4x0 = (int32_t) i4[0];
24921
0
    vacc1 += vi3x1;
24922
0
    const int32_t vi4x1 = (int32_t) i4[1];
24923
0
    vacc2 += vi3x2;
24924
0
    const int32_t vi4x2 = (int32_t) i4[2];
24925
0
    vacc3 += vi3x3;
24926
0
    const int32_t vi4x3 = (int32_t) i4[3];
24927
0
    i4 += 4;
24928
0
    vacc0 += vi4x0;
24929
0
    const int32_t vi5x0 = (int32_t) i5[0];
24930
0
    vacc1 += vi4x1;
24931
0
    const int32_t vi5x1 = (int32_t) i5[1];
24932
0
    vacc2 += vi4x2;
24933
0
    const int32_t vi5x2 = (int32_t) i5[2];
24934
0
    vacc3 += vi4x3;
24935
0
    const int32_t vi5x3 = (int32_t) i5[3];
24936
0
    i5 += 4;
24937
0
    vacc0 += vi5x0;
24938
0
    const int32_t vi6x0 = (int32_t) i6[0];
24939
0
    vacc1 += vi5x1;
24940
0
    const int32_t vi6x1 = (int32_t) i6[1];
24941
0
    vacc2 += vi5x2;
24942
0
    const int32_t vi6x2 = (int32_t) i6[2];
24943
0
    vacc3 += vi5x3;
24944
0
    const int32_t vi6x3 = (int32_t) i6[3];
24945
0
    i6 += 4;
24946
24947
0
    vacc0 += vi6x0;
24948
0
    vacc1 += vi6x1;
24949
0
    vacc2 += vi6x2;
24950
0
    vacc3 += vi6x3;
24951
24952
0
    float vfpacc0 = (float) vacc0 * vscale;
24953
0
    float vfpacc1 = (float) vacc1 * vscale;
24954
0
    float vfpacc2 = (float) vacc2 * vscale;
24955
0
    float vfpacc3 = (float) vacc3 * vscale;
24956
24957
0
    vfpacc0 += vmagic_bias;
24958
0
    vfpacc1 += vmagic_bias;
24959
0
    vfpacc2 += vmagic_bias;
24960
0
    vfpacc3 += vmagic_bias;
24961
24962
0
    int32_t vout0 = (int32_t) float_as_uint32(vfpacc0);
24963
0
    int32_t vout1 = (int32_t) float_as_uint32(vfpacc1);
24964
0
    int32_t vout2 = (int32_t) float_as_uint32(vfpacc2);
24965
0
    int32_t vout3 = (int32_t) float_as_uint32(vfpacc3);
24966
24967
0
    vout0 = math_max_s32(vout0, vmagic_min);
24968
0
    vout1 = math_max_s32(vout1, vmagic_min);
24969
0
    vout2 = math_max_s32(vout2, vmagic_min);
24970
0
    vout3 = math_max_s32(vout3, vmagic_min);
24971
24972
0
    vout0 = math_min_s32(vout0, vmagic_max);
24973
0
    vout1 = math_min_s32(vout1, vmagic_max);
24974
0
    vout2 = math_min_s32(vout2, vmagic_max);
24975
0
    vout3 = math_min_s32(vout3, vmagic_max);
24976
24977
0
    vout0 -= vmagic_bias_less_zero_point;
24978
0
    vout1 -= vmagic_bias_less_zero_point;
24979
0
    vout2 -= vmagic_bias_less_zero_point;
24980
0
    vout3 -= vmagic_bias_less_zero_point;
24981
24982
0
    output[0] = (uint8_t) vout0;
24983
0
    output[1] = (uint8_t) vout1;
24984
0
    output[2] = (uint8_t) vout2;
24985
0
    output[3] = (uint8_t) vout3;
24986
0
    output += 4;
24987
0
  }
24988
0
  if XNN_UNLIKELY(channels != 0) {
24989
0
    do {
24990
0
      int32_t vacc = vinit_bias;
24991
0
      const int32_t vi0 = (int32_t) *i0++;
24992
0
      const int32_t vi1 = (int32_t) *i1++;
24993
24994
0
      vacc += vi0;
24995
0
      const int32_t vi2 = (int32_t) *i2++;
24996
0
      vacc += vi1;
24997
0
      const int32_t vi3 = (int32_t) *i3++;
24998
0
      vacc += vi2;
24999
0
      const int32_t vi4 = (int32_t) *i4++;
25000
0
      vacc += vi3;
25001
0
      const int32_t vi5 = (int32_t) *i5++;
25002
0
      vacc += vi4;
25003
0
      const int32_t vi6 = (int32_t) *i6++;
25004
25005
0
      vacc += vi5;
25006
0
      vacc += vi6;
25007
25008
0
      float vfpacc = (float) vacc * vscale;
25009
0
      vfpacc += vmagic_bias;
25010
0
      int32_t vout = (int32_t) float_as_uint32(vfpacc);
25011
0
      vout = math_max_s32(vout, vmagic_min);
25012
0
      vout = math_min_s32(vout, vmagic_max);
25013
0
      vout -= vmagic_bias_less_zero_point;
25014
25015
0
      *output++ = (uint8_t) vout;
25016
0
    } while (--channels != 0);
25017
0
  }
25018
0
}
25019
25020
void xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic(
25021
    size_t mr,
25022
    size_t nc,
25023
    size_t kc,
25024
    const uint8_t* restrict a,
25025
    size_t a_stride,
25026
    const void* restrict w,
25027
    uint8_t* restrict c,
25028
    size_t cm_stride,
25029
    size_t cn_stride,
25030
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
25031
0
{
25032
0
  assert(mr != 0);
25033
0
  assert(mr <= 1);
25034
0
  assert(nc != 0);
25035
0
  assert(kc != 0);
25036
25037
0
  const uint8_t* a0 = a;
25038
0
  uint8_t* c0 = c;
25039
25040
0
  const int32_t vb_zero_point = params->fp32_scalar_imagic.kernel_zero_point;
25041
0
  do {
25042
0
    int32_t vacc0x0 = unaligned_indexed_load_s32(w, 0);
25043
0
    int32_t vacc0x1 = unaligned_indexed_load_s32(w, 1);
25044
0
    w = (const int32_t*) w + 2;
25045
25046
0
    size_t k = kc;
25047
0
    do {
25048
0
      const int32_t va0 = (int32_t) (uint32_t) *a0++;
25049
25050
0
      const int32_t vb0 = (int32_t) (uint32_t) ((const uint8_t*) w)[0] - vb_zero_point;
25051
0
      const int32_t vb1 = (int32_t) (uint32_t) ((const uint8_t*) w)[1] - vb_zero_point;
25052
0
      w = (const uint8_t*) w + 2;
25053
25054
0
      vacc0x0 += va0 * vb0;
25055
0
      vacc0x1 += va0 * vb1;
25056
25057
0
      k -= sizeof(uint8_t);
25058
0
    } while (k != 0);
25059
25060
0
    float vfpacc0x0 = (float) vacc0x0;
25061
0
    float vfpacc0x1 = (float) vacc0x1;
25062
25063
0
    const float vscale = params->fp32_scalar_imagic.scale;
25064
0
    vfpacc0x0 *= vscale;
25065
0
    vfpacc0x1 *= vscale;
25066
25067
0
    const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
25068
0
    vfpacc0x0 += vmagic_bias;
25069
0
    vfpacc0x1 += vmagic_bias;
25070
25071
0
    int32_t vout0x0 = (int32_t) float_as_uint32(vfpacc0x0);
25072
0
    int32_t vout0x1 = (int32_t) float_as_uint32(vfpacc0x1);
25073
25074
0
    const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
25075
0
    vout0x0 = math_max_s32(vout0x0, vmagic_min);
25076
0
    vout0x1 = math_max_s32(vout0x1, vmagic_min);
25077
25078
0
    const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
25079
0
    vout0x0 = math_min_s32(vout0x0, vmagic_max);
25080
0
    vout0x1 = math_min_s32(vout0x1, vmagic_max);
25081
25082
0
    const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
25083
0
    vout0x0 -= vmagic_bias_less_zero_point;
25084
0
    vout0x1 -= vmagic_bias_less_zero_point;
25085
25086
0
    if XNN_LIKELY(nc >= 2) {
25087
0
      c0[0] = (uint8_t) vout0x0;
25088
0
      c0[1] = (uint8_t) vout0x1;
25089
25090
0
      a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
25091
25092
0
      c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
25093
25094
0
      nc -= 2;
25095
0
    } else {
25096
0
      if (nc & 1) {
25097
0
        c0[0] = (uint8_t) vout0x0;
25098
0
      }
25099
25100
0
      nc = 0;
25101
0
    }
25102
0
  } while (nc != 0);
25103
0
}
25104
25105
void xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf(
25106
    size_t mr,
25107
    size_t nc,
25108
    size_t kc,
25109
    const uint8_t* restrict a,
25110
    size_t a_stride,
25111
    const void* restrict w,
25112
    uint8_t* restrict c,
25113
    size_t cm_stride,
25114
    size_t cn_stride,
25115
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
25116
0
{
25117
0
  assert(mr != 0);
25118
0
  assert(mr <= 1);
25119
0
  assert(nc != 0);
25120
0
  assert(kc != 0);
25121
25122
0
  const uint8_t* a0 = a;
25123
0
  uint8_t* c0 = c;
25124
25125
0
  const int32_t vb_zero_point = params->fp32_scalar_lrintf.kernel_zero_point;
25126
0
  do {
25127
0
    int32_t vacc0x0 = ((const int32_t*) w)[0];
25128
0
    int32_t vacc0x1 = ((const int32_t*) w)[1];
25129
0
    int32_t vacc0x2 = ((const int32_t*) w)[2];
25130
0
    int32_t vacc0x3 = ((const int32_t*) w)[3];
25131
0
    w = (const int32_t*) w + 4;
25132
25133
0
    size_t k = kc;
25134
0
    do {
25135
0
      const int32_t va0 = (int32_t) (uint32_t) *a0++;
25136
25137
0
      const int32_t vb0 = (int32_t) (uint32_t) ((const uint8_t*) w)[0] - vb_zero_point;
25138
0
      const int32_t vb1 = (int32_t) (uint32_t) ((const uint8_t*) w)[1] - vb_zero_point;
25139
0
      const int32_t vb2 = (int32_t) (uint32_t) ((const uint8_t*) w)[2] - vb_zero_point;
25140
0
      const int32_t vb3 = (int32_t) (uint32_t) ((const uint8_t*) w)[3] - vb_zero_point;
25141
0
      w = (const uint8_t*) w + 4;
25142
25143
0
      vacc0x0 += va0 * vb0;
25144
0
      vacc0x1 += va0 * vb1;
25145
0
      vacc0x2 += va0 * vb2;
25146
0
      vacc0x3 += va0 * vb3;
25147
25148
0
      k -= sizeof(uint8_t);
25149
0
    } while (k != 0);
25150
25151
0
    float vfpacc0x0 = (float) vacc0x0;
25152
0
    float vfpacc0x1 = (float) vacc0x1;
25153
0
    float vfpacc0x2 = (float) vacc0x2;
25154
0
    float vfpacc0x3 = (float) vacc0x3;
25155
25156
0
    const float vscale = params->fp32_scalar_lrintf.scale;
25157
0
    vfpacc0x0 *= vscale;
25158
0
    vfpacc0x1 *= vscale;
25159
0
    vfpacc0x2 *= vscale;
25160
0
    vfpacc0x3 *= vscale;
25161
25162
0
    const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
25163
0
    vfpacc0x0 = math_max_f32(vfpacc0x0, voutput_min_less_zero_point);
25164
0
    vfpacc0x1 = math_max_f32(vfpacc0x1, voutput_min_less_zero_point);
25165
0
    vfpacc0x2 = math_max_f32(vfpacc0x2, voutput_min_less_zero_point);
25166
0
    vfpacc0x3 = math_max_f32(vfpacc0x3, voutput_min_less_zero_point);
25167
25168
0
    const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
25169
0
    vfpacc0x0 = math_min_f32(vfpacc0x0, voutput_max_less_zero_point);
25170
0
    vfpacc0x1 = math_min_f32(vfpacc0x1, voutput_max_less_zero_point);
25171
0
    vfpacc0x2 = math_min_f32(vfpacc0x2, voutput_max_less_zero_point);
25172
0
    vfpacc0x3 = math_min_f32(vfpacc0x3, voutput_max_less_zero_point);
25173
25174
0
    const int32_t vrndacc0x0 = (int32_t) lrintf(vfpacc0x0);
25175
0
    const int32_t vrndacc0x1 = (int32_t) lrintf(vfpacc0x1);
25176
0
    const int32_t vrndacc0x2 = (int32_t) lrintf(vfpacc0x2);
25177
0
    const int32_t vrndacc0x3 = (int32_t) lrintf(vfpacc0x3);
25178
25179
0
    const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
25180
0
    int32_t vout0x0 = vrndacc0x0 + voutput_zero_point;
25181
0
    int32_t vout0x1 = vrndacc0x1 + voutput_zero_point;
25182
0
    int32_t vout0x2 = vrndacc0x2 + voutput_zero_point;
25183
0
    int32_t vout0x3 = vrndacc0x3 + voutput_zero_point;
25184
25185
0
    if XNN_LIKELY(nc >= 4) {
25186
0
      c0[0] = (uint8_t) vout0x0;
25187
0
      c0[1] = (uint8_t) vout0x1;
25188
0
      c0[2] = (uint8_t) vout0x2;
25189
0
      c0[3] = (uint8_t) vout0x3;
25190
25191
0
      a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
25192
25193
0
      c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
25194
25195
0
      nc -= 4;
25196
0
    } else {
25197
0
      if (nc & 2) {
25198
0
        c0[0] = (uint8_t) vout0x0;
25199
0
        c0[1] = (uint8_t) vout0x1;
25200
0
        vout0x0 = vout0x2;
25201
0
        c0 += 2;
25202
0
      }
25203
0
      if (nc & 1) {
25204
0
        c0[0] = (uint8_t) vout0x0;
25205
0
      }
25206
25207
0
      nc = 0;
25208
0
    }
25209
0
  } while (nc != 0);
25210
0
}
25211
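The *_lrintf variants take the more direct route: clamp the scaled accumulator while it is still a float, round with lrintf, then re-add the output zero point. A per-element sketch follows, assuming a single accumulator and the default round-to-nearest rounding mode; the kernel above applies the same steps to every element of its 1x4 tile.

#include <math.h>
#include <stdint.h>

/* Requantize one int32 accumulator the way the fp32_scalar_lrintf path does. */
static uint8_t requantize_lrintf(int32_t acc, float scale, int32_t zero_point,
                                 uint8_t qmin, uint8_t qmax) {
  float v = (float) acc * scale;
  const float lo = (float) ((int32_t) qmin - zero_point);  /* output_min_less_zero_point */
  const float hi = (float) ((int32_t) qmax - zero_point);  /* output_max_less_zero_point */
  v = fmaxf(v, lo);
  v = fminf(v, hi);
  const int32_t rounded = (int32_t) lrintf(v);  /* round to nearest, ties to even by default */
  return (uint8_t) (rounded + zero_point);
}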
25212
void xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic(
25213
    size_t mr,
25214
    size_t nc,
25215
    size_t kc,
25216
    const uint8_t* restrict a,
25217
    size_t a_stride,
25218
    const void* restrict w,
25219
    uint8_t* restrict c,
25220
    size_t cm_stride,
25221
    size_t cn_stride,
25222
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
25223
0
{
25224
0
  assert(mr != 0);
25225
0
  assert(mr <= 2);
25226
0
  assert(nc != 0);
25227
0
  assert(kc != 0);
25228
25229
0
  const uint8_t* a0 = a;
25230
0
  uint8_t* c0 = c;
25231
0
  const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
25232
0
  uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
25233
0
  if XNN_UNPREDICTABLE(mr != 2) {
25234
0
    a1 = a0;
25235
0
    c1 = c0;
25236
0
  }
25237
25238
0
  const int32_t vb_zero_point = params->fp32_scalar_imagic.kernel_zero_point;
25239
0
  do {
25240
0
    int32_t vacc0x0 = unaligned_indexed_load_s32(w, 0);
25241
0
    int32_t vacc0x1 = unaligned_indexed_load_s32(w, 1);
25242
0
    int32_t vacc1x0 = vacc0x0;
25243
0
    int32_t vacc1x1 = vacc0x1;
25244
0
    w = (const int32_t*) w + 2;
25245
25246
0
    size_t k = kc;
25247
0
    do {
25248
0
      const int32_t va0 = (int32_t) (uint32_t) *a0++;
25249
0
      const int32_t va1 = (int32_t) (uint32_t) *a1++;
25250
25251
0
      const int32_t vb0 = (int32_t) (uint32_t) ((const uint8_t*) w)[0] - vb_zero_point;
25252
0
      const int32_t vb1 = (int32_t) (uint32_t) ((const uint8_t*) w)[1] - vb_zero_point;
25253
0
      w = (const uint8_t*) w + 2;
25254
25255
0
      vacc0x0 += va0 * vb0;
25256
0
      vacc0x1 += va0 * vb1;
25257
0
      vacc1x0 += va1 * vb0;
25258
0
      vacc1x1 += va1 * vb1;
25259
25260
0
      k -= sizeof(uint8_t);
25261
0
    } while (k != 0);
25262
25263
0
    float vfpacc0x0 = (float) vacc0x0;
25264
0
    float vfpacc0x1 = (float) vacc0x1;
25265
0
    float vfpacc1x0 = (float) vacc1x0;
25266
0
    float vfpacc1x1 = (float) vacc1x1;
25267
25268
0
    const float vscale = params->fp32_scalar_imagic.scale;
25269
0
    vfpacc0x0 *= vscale;
25270
0
    vfpacc0x1 *= vscale;
25271
0
    vfpacc1x0 *= vscale;
25272
0
    vfpacc1x1 *= vscale;
25273
25274
0
    const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
25275
0
    vfpacc0x0 += vmagic_bias;
25276
0
    vfpacc0x1 += vmagic_bias;
25277
0
    vfpacc1x0 += vmagic_bias;
25278
0
    vfpacc1x1 += vmagic_bias;
25279
25280
0
    int32_t vout0x0 = (int32_t) float_as_uint32(vfpacc0x0);
25281
0
    int32_t vout0x1 = (int32_t) float_as_uint32(vfpacc0x1);
25282
0
    int32_t vout1x0 = (int32_t) float_as_uint32(vfpacc1x0);
25283
0
    int32_t vout1x1 = (int32_t) float_as_uint32(vfpacc1x1);
25284
25285
0
    const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
25286
0
    vout0x0 = math_max_s32(vout0x0, vmagic_min);
25287
0
    vout0x1 = math_max_s32(vout0x1, vmagic_min);
25288
0
    vout1x0 = math_max_s32(vout1x0, vmagic_min);
25289
0
    vout1x1 = math_max_s32(vout1x1, vmagic_min);
25290
25291
0
    const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
25292
0
    vout0x0 = math_min_s32(vout0x0, vmagic_max);
25293
0
    vout0x1 = math_min_s32(vout0x1, vmagic_max);
25294
0
    vout1x0 = math_min_s32(vout1x0, vmagic_max);
25295
0
    vout1x1 = math_min_s32(vout1x1, vmagic_max);
25296
25297
0
    const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
25298
0
    vout0x0 -= vmagic_bias_less_zero_point;
25299
0
    vout0x1 -= vmagic_bias_less_zero_point;
25300
0
    vout1x0 -= vmagic_bias_less_zero_point;
25301
0
    vout1x1 -= vmagic_bias_less_zero_point;
25302
25303
0
    if XNN_LIKELY(nc >= 2) {
25304
0
      c0[0] = (uint8_t) vout0x0;
25305
0
      c0[1] = (uint8_t) vout0x1;
25306
0
      c1[0] = (uint8_t) vout1x0;
25307
0
      c1[1] = (uint8_t) vout1x1;
25308
25309
0
      a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
25310
0
      a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
25311
25312
0
      c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
25313
0
      c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
25314
25315
0
      nc -= 2;
25316
0
    } else {
25317
0
      if (nc & 1) {
25318
0
        c0[0] = (uint8_t) vout0x0;
25319
0
        c1[0] = (uint8_t) vout1x0;
25320
0
      }
25321
25322
0
      nc = 0;
25323
0
    }
25324
0
  } while (nc != 0);
25325
0
}
25326
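How a caller tiles a full matrix multiply over the 2x2 microkernel just shown is not part of this file; the sketch below is a hypothetical driver only. It assumes row-major uint8 A (M x K) and C (M x N), byte-valued strides, weights pre-packed per two-column block as two int32 biases followed by K pairs of uint8 weights (the layout the kernel walks through), and a params struct filled in elsewhere by the library's init helper.

#include <stddef.h>
#include <stdint.h>
#include <xnnpack/gemm.h>
#include <xnnpack/microparams.h>

static void gemm_2x2_driver(size_t M, size_t N, size_t K,
                            const uint8_t* a, const void* packed_w, uint8_t* c,
                            const union xnn_qu8_conv_minmax_params* params) {
  for (size_t m = 0; m < M; m += 2) {
    const size_t mr = (M - m < 2) ? (M - m) : 2;  /* the kernel aliases row 1 to row 0 when mr == 1 */
    xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic(
        mr, N, K /* kc: bytes in one row of A */,
        a + m * K, K /* a_stride in bytes */,
        packed_w,
        c + m * N, N /* cm_stride in bytes */, 2 /* cn_stride: one 2-wide output block */,
        params);
  }
}

Each call walks all of N internally, advancing w block by block, so the driver only tiles the M dimension.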
25327
void xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf(
25328
    size_t mr,
25329
    size_t nc,
25330
    size_t kc,
25331
    const uint8_t* restrict a,
25332
    size_t a_stride,
25333
    const void* restrict w,
25334
    uint8_t* restrict c,
25335
    size_t cm_stride,
25336
    size_t cn_stride,
25337
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
25338
0
{
25339
0
  assert(mr != 0);
25340
0
  assert(mr <= 3);
25341
0
  assert(nc != 0);
25342
0
  assert(kc != 0);
25343
25344
0
  const uint8_t* a0 = a;
25345
0
  uint8_t* c0 = c;
25346
0
  const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
25347
0
  uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
25348
0
  if XNN_UNPREDICTABLE(mr < 2) {
25349
0
    a1 = a0;
25350
0
    c1 = c0;
25351
0
  }
25352
0
  const uint8_t* a2 = (const uint8_t*) ((uintptr_t) a1 + a_stride);
25353
0
  uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
25354
0
  if XNN_UNPREDICTABLE(mr <= 2) {
25355
0
    a2 = a1;
25356
0
    c2 = c1;
25357
0
  }
25358
25359
0
  const int32_t vb_zero_point = params->fp32_scalar_lrintf.kernel_zero_point;
25360
0
  do {
25361
0
    int32_t vacc0x0 = ((const int32_t*) w)[0];
25362
0
    int32_t vacc0x1 = ((const int32_t*) w)[1];
25363
0
    int32_t vacc0x2 = ((const int32_t*) w)[2];
25364
0
    int32_t vacc0x3 = ((const int32_t*) w)[3];
25365
0
    int32_t vacc1x0 = vacc0x0;
25366
0
    int32_t vacc1x1 = vacc0x1;
25367
0
    int32_t vacc1x2 = vacc0x2;
25368
0
    int32_t vacc1x3 = vacc0x3;
25369
0
    int32_t vacc2x0 = vacc0x0;
25370
0
    int32_t vacc2x1 = vacc0x1;
25371
0
    int32_t vacc2x2 = vacc0x2;
25372
0
    int32_t vacc2x3 = vacc0x3;
25373
0
    w = (const int32_t*) w + 4;
25374
25375
0
    size_t k = kc;
25376
0
    do {
25377
0
      const int32_t va0 = (int32_t) (uint32_t) *a0++;
25378
0
      const int32_t va1 = (int32_t) (uint32_t) *a1++;
25379
0
      const int32_t va2 = (int32_t) (uint32_t) *a2++;
25380
25381
0
      const int32_t vb0 = (int32_t) (uint32_t) ((const uint8_t*) w)[0] - vb_zero_point;
25382
0
      const int32_t vb1 = (int32_t) (uint32_t) ((const uint8_t*) w)[1] - vb_zero_point;
25383
0
      const int32_t vb2 = (int32_t) (uint32_t) ((const uint8_t*) w)[2] - vb_zero_point;
25384
0
      const int32_t vb3 = (int32_t) (uint32_t) ((const uint8_t*) w)[3] - vb_zero_point;
25385
0
      w = (const uint8_t*) w + 4;
25386
25387
0
      vacc0x0 += va0 * vb0;
25388
0
      vacc0x1 += va0 * vb1;
25389
0
      vacc0x2 += va0 * vb2;
25390
0
      vacc0x3 += va0 * vb3;
25391
0
      vacc1x0 += va1 * vb0;
25392
0
      vacc1x1 += va1 * vb1;
25393
0
      vacc1x2 += va1 * vb2;
25394
0
      vacc1x3 += va1 * vb3;
25395
0
      vacc2x0 += va2 * vb0;
25396
0
      vacc2x1 += va2 * vb1;
25397
0
      vacc2x2 += va2 * vb2;
25398
0
      vacc2x3 += va2 * vb3;
25399
25400
0
      k -= sizeof(uint8_t);
25401
0
    } while (k != 0);
25402
25403
0
    float vfpacc0x0 = (float) vacc0x0;
25404
0
    float vfpacc0x1 = (float) vacc0x1;
25405
0
    float vfpacc0x2 = (float) vacc0x2;
25406
0
    float vfpacc0x3 = (float) vacc0x3;
25407
0
    float vfpacc1x0 = (float) vacc1x0;
25408
0
    float vfpacc1x1 = (float) vacc1x1;
25409
0
    float vfpacc1x2 = (float) vacc1x2;
25410
0
    float vfpacc1x3 = (float) vacc1x3;
25411
0
    float vfpacc2x0 = (float) vacc2x0;
25412
0
    float vfpacc2x1 = (float) vacc2x1;
25413
0
    float vfpacc2x2 = (float) vacc2x2;
25414
0
    float vfpacc2x3 = (float) vacc2x3;
25415
25416
0
    const float vscale = params->fp32_scalar_lrintf.scale;
25417
0
    vfpacc0x0 *= vscale;
25418
0
    vfpacc0x1 *= vscale;
25419
0
    vfpacc0x2 *= vscale;
25420
0
    vfpacc0x3 *= vscale;
25421
0
    vfpacc1x0 *= vscale;
25422
0
    vfpacc1x1 *= vscale;
25423
0
    vfpacc1x2 *= vscale;
25424
0
    vfpacc1x3 *= vscale;
25425
0
    vfpacc2x0 *= vscale;
25426
0
    vfpacc2x1 *= vscale;
25427
0
    vfpacc2x2 *= vscale;
25428
0
    vfpacc2x3 *= vscale;
25429
25430
0
    const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
25431
0
    vfpacc0x0 = math_max_f32(vfpacc0x0, voutput_min_less_zero_point);
25432
0
    vfpacc0x1 = math_max_f32(vfpacc0x1, voutput_min_less_zero_point);
25433
0
    vfpacc0x2 = math_max_f32(vfpacc0x2, voutput_min_less_zero_point);
25434
0
    vfpacc0x3 = math_max_f32(vfpacc0x3, voutput_min_less_zero_point);
25435
0
    vfpacc1x0 = math_max_f32(vfpacc1x0, voutput_min_less_zero_point);
25436
0
    vfpacc1x1 = math_max_f32(vfpacc1x1, voutput_min_less_zero_point);
25437
0
    vfpacc1x2 = math_max_f32(vfpacc1x2, voutput_min_less_zero_point);
25438
0
    vfpacc1x3 = math_max_f32(vfpacc1x3, voutput_min_less_zero_point);
25439
0
    vfpacc2x0 = math_max_f32(vfpacc2x0, voutput_min_less_zero_point);
25440
0
    vfpacc2x1 = math_max_f32(vfpacc2x1, voutput_min_less_zero_point);
25441
0
    vfpacc2x2 = math_max_f32(vfpacc2x2, voutput_min_less_zero_point);
25442
0
    vfpacc2x3 = math_max_f32(vfpacc2x3, voutput_min_less_zero_point);
25443
25444
0
    const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
25445
0
    vfpacc0x0 = math_min_f32(vfpacc0x0, voutput_max_less_zero_point);
25446
0
    vfpacc0x1 = math_min_f32(vfpacc0x1, voutput_max_less_zero_point);
25447
0
    vfpacc0x2 = math_min_f32(vfpacc0x2, voutput_max_less_zero_point);
25448
0
    vfpacc0x3 = math_min_f32(vfpacc0x3, voutput_max_less_zero_point);
25449
0
    vfpacc1x0 = math_min_f32(vfpacc1x0, voutput_max_less_zero_point);
25450
0
    vfpacc1x1 = math_min_f32(vfpacc1x1, voutput_max_less_zero_point);
25451
0
    vfpacc1x2 = math_min_f32(vfpacc1x2, voutput_max_less_zero_point);
25452
0
    vfpacc1x3 = math_min_f32(vfpacc1x3, voutput_max_less_zero_point);
25453
0
    vfpacc2x0 = math_min_f32(vfpacc2x0, voutput_max_less_zero_point);
25454
0
    vfpacc2x1 = math_min_f32(vfpacc2x1, voutput_max_less_zero_point);
25455
0
    vfpacc2x2 = math_min_f32(vfpacc2x2, voutput_max_less_zero_point);
25456
0
    vfpacc2x3 = math_min_f32(vfpacc2x3, voutput_max_less_zero_point);
25457
25458
0
    const int32_t vrndacc0x0 = (int32_t) lrintf(vfpacc0x0);
25459
0
    const int32_t vrndacc0x1 = (int32_t) lrintf(vfpacc0x1);
25460
0
    const int32_t vrndacc0x2 = (int32_t) lrintf(vfpacc0x2);
25461
0
    const int32_t vrndacc0x3 = (int32_t) lrintf(vfpacc0x3);
25462
0
    const int32_t vrndacc1x0 = (int32_t) lrintf(vfpacc1x0);
25463
0
    const int32_t vrndacc1x1 = (int32_t) lrintf(vfpacc1x1);
25464
0
    const int32_t vrndacc1x2 = (int32_t) lrintf(vfpacc1x2);
25465
0
    const int32_t vrndacc1x3 = (int32_t) lrintf(vfpacc1x3);
25466
0
    const int32_t vrndacc2x0 = (int32_t) lrintf(vfpacc2x0);
25467
0
    const int32_t vrndacc2x1 = (int32_t) lrintf(vfpacc2x1);
25468
0
    const int32_t vrndacc2x2 = (int32_t) lrintf(vfpacc2x2);
25469
0
    const int32_t vrndacc2x3 = (int32_t) lrintf(vfpacc2x3);
25470
25471
0
    const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
25472
0
    int32_t vout0x0 = vrndacc0x0 + voutput_zero_point;
25473
0
    int32_t vout0x1 = vrndacc0x1 + voutput_zero_point;
25474
0
    int32_t vout0x2 = vrndacc0x2 + voutput_zero_point;
25475
0
    int32_t vout0x3 = vrndacc0x3 + voutput_zero_point;
25476
0
    int32_t vout1x0 = vrndacc1x0 + voutput_zero_point;
25477
0
    int32_t vout1x1 = vrndacc1x1 + voutput_zero_point;
25478
0
    int32_t vout1x2 = vrndacc1x2 + voutput_zero_point;
25479
0
    int32_t vout1x3 = vrndacc1x3 + voutput_zero_point;
25480
0
    int32_t vout2x0 = vrndacc2x0 + voutput_zero_point;
25481
0
    int32_t vout2x1 = vrndacc2x1 + voutput_zero_point;
25482
0
    int32_t vout2x2 = vrndacc2x2 + voutput_zero_point;
25483
0
    int32_t vout2x3 = vrndacc2x3 + voutput_zero_point;
25484
25485
0
    if XNN_LIKELY(nc >= 4) {
25486
0
      c0[0] = (uint8_t) vout0x0;
25487
0
      c0[1] = (uint8_t) vout0x1;
25488
0
      c0[2] = (uint8_t) vout0x2;
25489
0
      c0[3] = (uint8_t) vout0x3;
25490
0
      c1[0] = (uint8_t) vout1x0;
25491
0
      c1[1] = (uint8_t) vout1x1;
25492
0
      c1[2] = (uint8_t) vout1x2;
25493
0
      c1[3] = (uint8_t) vout1x3;
25494
0
      c2[0] = (uint8_t) vout2x0;
25495
0
      c2[1] = (uint8_t) vout2x1;
25496
0
      c2[2] = (uint8_t) vout2x2;
25497
0
      c2[3] = (uint8_t) vout2x3;
25498
25499
0
      a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
25500
0
      a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
25501
0
      a2 = (const uint8_t*) ((uintptr_t) a2 - kc);
25502
25503
0
      c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
25504
0
      c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
25505
0
      c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
25506
25507
0
      nc -= 4;
25508
0
    } else {
25509
0
      if (nc & 2) {
25510
0
        c0[0] = (uint8_t) vout0x0;
25511
0
        c0[1] = (uint8_t) vout0x1;
25512
0
        vout0x0 = vout0x2;
25513
0
        c0 += 2;
25514
0
        c1[0] = (uint8_t) vout1x0;
25515
0
        c1[1] = (uint8_t) vout1x1;
25516
0
        vout1x0 = vout1x2;
25517
0
        c1 += 2;
25518
0
        c2[0] = (uint8_t) vout2x0;
25519
0
        c2[1] = (uint8_t) vout2x1;
25520
0
        vout2x0 = vout2x2;
25521
0
        c2 += 2;
25522
0
      }
25523
0
      if (nc & 1) {
25524
0
        c0[0] = (uint8_t) vout0x0;
25525
0
        c1[0] = (uint8_t) vout1x0;
25526
0
        c2[0] = (uint8_t) vout2x0;
25527
0
      }
25528
25529
0
      nc = 0;
25530
0
    }
25531
0
  } while (nc != 0);
25532
0
}
25533
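The nc & 2 / nc & 1 tail in the GEMM stores above writes a partial row without a scalar loop: after storing the first pair it slides the third value into slot 0 so the final odd store needs no extra indexing. A small standalone version of that store pattern, with hypothetical names:

#include <stddef.h>
#include <stdint.h>

/* Store n (0..3) leftover bytes of a 4-wide row of results, mirroring the tail above. */
static void store_tail(uint8_t* c, const int32_t v[4], size_t n) {
  int32_t v0 = v[0];
  const int32_t v1 = v[1];
  const int32_t v2 = v[2];
  if (n & 2) {
    c[0] = (uint8_t) v0;
    c[1] = (uint8_t) v1;
    v0 = v2;   /* third value becomes the candidate for the odd tail */
    c += 2;
  }
  if (n & 1) {
    c[0] = (uint8_t) v0;
  }
}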
25534
void xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic(
25535
    size_t mr,
25536
    size_t nc,
25537
    size_t kc,
25538
    size_t ks,
25539
    const uint8_t** restrict a,
25540
    const void* restrict w,
25541
    uint8_t* restrict c,
25542
    size_t cm_stride,
25543
    size_t cn_stride,
25544
    size_t a_offset,
25545
    const uint8_t* zero,
25546
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
25547
0
{
25548
0
  assert(mr != 0);
25549
0
  assert(mr <= 1);
25550
0
  assert(nc != 0);
25551
0
  assert(kc != 0);
25552
0
  assert(ks != 0);
25553
0
  assert(ks % (1 * sizeof(void*)) == 0);
25554
0
  assert(a != NULL);
25555
0
  assert(w != NULL);
25556
0
  assert(c != NULL);
25557
25558
0
  uint8_t* c0 = c;
25559
25560
0
  const int32_t vb_zero_point = params->fp32_scalar_imagic.kernel_zero_point;
25561
0
  do {
25562
0
    int32_t vacc0x0 = unaligned_indexed_load_s32(w, 0);
25563
0
    int32_t vacc0x1 = unaligned_indexed_load_s32(w, 1);
25564
0
    w = (const void*) ((const int32_t*) w + 2);
25565
25566
0
    size_t p = ks;
25567
0
    do {
25568
0
      const uint8_t* restrict a0 = a[0];
25569
0
      assert(a0 != NULL);
25570
0
      if XNN_UNPREDICTABLE(a0 != zero) {
25571
0
        a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
25572
0
      }
25573
0
      a += 1;
25574
25575
0
      size_t k = kc;
25576
0
      do {
25577
0
        const int32_t va0 = (int32_t) (uint32_t) *a0++;
25578
25579
0
        const int32_t vb0 = (int32_t) (uint32_t) ((const uint8_t*) w)[0] - vb_zero_point;
25580
0
        const int32_t vb1 = (int32_t) (uint32_t) ((const uint8_t*) w)[1] - vb_zero_point;
25581
0
        w = (const void*) ((const uint8_t*) w + 2);
25582
25583
0
        vacc0x0 += va0 * vb0;
25584
0
        vacc0x1 += va0 * vb1;
25585
25586
0
        k -= sizeof(uint8_t);
25587
0
      } while (k != 0);
25588
0
      p -= 1 * sizeof(void*);
25589
0
    } while (p != 0);
25590
25591
0
    float vfpacc0x0 = (float) vacc0x0;
25592
0
    float vfpacc0x1 = (float) vacc0x1;
25593
25594
0
    const float vscale = params->fp32_scalar_imagic.scale;
25595
0
    vfpacc0x0 *= vscale;
25596
0
    vfpacc0x1 *= vscale;
25597
25598
0
    const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
25599
0
    vfpacc0x0 += vmagic_bias;
25600
0
    vfpacc0x1 += vmagic_bias;
25601
25602
0
    int32_t vout0x0 = (int32_t) float_as_uint32(vfpacc0x0);
25603
0
    int32_t vout0x1 = (int32_t) float_as_uint32(vfpacc0x1);
25604
25605
0
    const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
25606
0
    vout0x0 = math_max_s32(vout0x0, vmagic_min);
25607
0
    vout0x1 = math_max_s32(vout0x1, vmagic_min);
25608
25609
0
    const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
25610
0
    vout0x0 = math_min_s32(vout0x0, vmagic_max);
25611
0
    vout0x1 = math_min_s32(vout0x1, vmagic_max);
25612
25613
0
    const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
25614
0
    vout0x0 -= vmagic_bias_less_zero_point;
25615
0
    vout0x1 -= vmagic_bias_less_zero_point;
25616
25617
0
    if XNN_LIKELY(nc >= 2) {
25618
0
      c0[0] = (uint8_t) vout0x0;
25619
0
      c0[1] = (uint8_t) vout0x1;
25620
25621
0
      c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
25622
25623
0
      a = (const uint8_t**restrict) ((uintptr_t) a - ks);
25624
0
      nc -= 2;
25625
0
    } else {
25626
0
      if (nc & 1) {
25627
0
        c0[0] = (uint8_t) vout0x0;
25628
0
      }
25629
25630
0
      nc = 0;
25631
0
    }
25632
0
  } while (nc != 0);
25633
0
}
25634
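The IGEMM kernels differ from the GEMM ones only in how they fetch A: instead of a base pointer plus stride they consume ks / sizeof(void*) row pointers per output pixel from the indirection array a, and any entry equal to the caller-supplied zero buffer is used as-is so implicit padding reads zeros. The a_offset rebase applies only to real rows, as in this sketch (the name is illustrative):

#include <stddef.h>
#include <stdint.h>

/* Resolve one indirection entry the way the igemm kernels above do. */
static const uint8_t* resolve_row(const uint8_t* p, const uint8_t* zero, size_t a_offset) {
  return (p != zero) ? (const uint8_t*) ((uintptr_t) p + a_offset) : p;
}

Once an output block is written, the kernel rewinds a by ks so the same pointer set is reused for the next block of columns.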
25635
void xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf(
25636
    size_t mr,
25637
    size_t nc,
25638
    size_t kc,
25639
    size_t ks,
25640
    const uint8_t** restrict a,
25641
    const void* restrict w,
25642
    uint8_t* restrict c,
25643
    size_t cm_stride,
25644
    size_t cn_stride,
25645
    size_t a_offset,
25646
    const uint8_t* zero,
25647
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
25648
0
{
25649
0
  assert(mr != 0);
25650
0
  assert(mr <= 1);
25651
0
  assert(nc != 0);
25652
0
  assert(kc != 0);
25653
0
  assert(ks != 0);
25654
0
  assert(ks % (1 * sizeof(void*)) == 0);
25655
0
  assert(a != NULL);
25656
0
  assert(w != NULL);
25657
0
  assert(c != NULL);
25658
25659
0
  uint8_t* c0 = c;
25660
25661
0
  const int32_t vb_zero_point = params->fp32_scalar_lrintf.kernel_zero_point;
25662
0
  do {
25663
0
    int32_t vacc0x0 = ((const int32_t*) w)[0];
25664
0
    int32_t vacc0x1 = ((const int32_t*) w)[1];
25665
0
    int32_t vacc0x2 = ((const int32_t*) w)[2];
25666
0
    int32_t vacc0x3 = ((const int32_t*) w)[3];
25667
0
    w = (const void*) ((const int32_t*) w + 4);
25668
25669
0
    size_t p = ks;
25670
0
    do {
25671
0
      const uint8_t* restrict a0 = a[0];
25672
0
      assert(a0 != NULL);
25673
0
      if XNN_UNPREDICTABLE(a0 != zero) {
25674
0
        a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
25675
0
      }
25676
0
      a += 1;
25677
25678
0
      size_t k = kc;
25679
0
      do {
25680
0
        const int32_t va0 = (int32_t) (uint32_t) *a0++;
25681
25682
0
        const int32_t vb0 = (int32_t) (uint32_t) ((const uint8_t*) w)[0] - vb_zero_point;
25683
0
        const int32_t vb1 = (int32_t) (uint32_t) ((const uint8_t*) w)[1] - vb_zero_point;
25684
0
        const int32_t vb2 = (int32_t) (uint32_t) ((const uint8_t*) w)[2] - vb_zero_point;
25685
0
        const int32_t vb3 = (int32_t) (uint32_t) ((const uint8_t*) w)[3] - vb_zero_point;
25686
0
        w = (const void*) ((const uint8_t*) w + 4);
25687
25688
0
        vacc0x0 += va0 * vb0;
25689
0
        vacc0x1 += va0 * vb1;
25690
0
        vacc0x2 += va0 * vb2;
25691
0
        vacc0x3 += va0 * vb3;
25692
25693
0
        k -= sizeof(uint8_t);
25694
0
      } while (k != 0);
25695
0
      p -= 1 * sizeof(void*);
25696
0
    } while (p != 0);
25697
25698
0
    float vfpacc0x0 = (float) vacc0x0;
25699
0
    float vfpacc0x1 = (float) vacc0x1;
25700
0
    float vfpacc0x2 = (float) vacc0x2;
25701
0
    float vfpacc0x3 = (float) vacc0x3;
25702
25703
0
    const float vscale = params->fp32_scalar_lrintf.scale;
25704
0
    vfpacc0x0 *= vscale;
25705
0
    vfpacc0x1 *= vscale;
25706
0
    vfpacc0x2 *= vscale;
25707
0
    vfpacc0x3 *= vscale;
25708
25709
0
    const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
25710
0
    vfpacc0x0 = math_max_f32(vfpacc0x0, voutput_min_less_zero_point);
25711
0
    vfpacc0x1 = math_max_f32(vfpacc0x1, voutput_min_less_zero_point);
25712
0
    vfpacc0x2 = math_max_f32(vfpacc0x2, voutput_min_less_zero_point);
25713
0
    vfpacc0x3 = math_max_f32(vfpacc0x3, voutput_min_less_zero_point);
25714
25715
0
    const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
25716
0
    vfpacc0x0 = math_min_f32(vfpacc0x0, voutput_max_less_zero_point);
25717
0
    vfpacc0x1 = math_min_f32(vfpacc0x1, voutput_max_less_zero_point);
25718
0
    vfpacc0x2 = math_min_f32(vfpacc0x2, voutput_max_less_zero_point);
25719
0
    vfpacc0x3 = math_min_f32(vfpacc0x3, voutput_max_less_zero_point);
25720
25721
0
    const int32_t vrndacc0x0 = (int32_t) lrintf(vfpacc0x0);
25722
0
    const int32_t vrndacc0x1 = (int32_t) lrintf(vfpacc0x1);
25723
0
    const int32_t vrndacc0x2 = (int32_t) lrintf(vfpacc0x2);
25724
0
    const int32_t vrndacc0x3 = (int32_t) lrintf(vfpacc0x3);
25725
25726
0
    const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
25727
0
    int32_t vout0x0 = vrndacc0x0 + voutput_zero_point;
25728
0
    int32_t vout0x1 = vrndacc0x1 + voutput_zero_point;
25729
0
    int32_t vout0x2 = vrndacc0x2 + voutput_zero_point;
25730
0
    int32_t vout0x3 = vrndacc0x3 + voutput_zero_point;
25731
25732
0
    if XNN_LIKELY(nc >= 4) {
25733
0
      c0[0] = (uint8_t) vout0x0;
25734
0
      c0[1] = (uint8_t) vout0x1;
25735
0
      c0[2] = (uint8_t) vout0x2;
25736
0
      c0[3] = (uint8_t) vout0x3;
25737
25738
0
      c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
25739
25740
0
      a = (const uint8_t**restrict) ((uintptr_t) a - ks);
25741
0
      nc -= 4;
25742
0
    } else {
25743
0
      if (nc & 2) {
25744
0
        c0[0] = (uint8_t) vout0x0;
25745
0
        c0[1] = (uint8_t) vout0x1;
25746
0
        vout0x0 = vout0x2;
25747
0
        c0 += 2;
25748
0
      }
25749
0
      if (nc & 1) {
25750
0
        c0[0] = (uint8_t) vout0x0;
25751
0
      }
25752
25753
0
      nc = 0;
25754
0
    }
25755
0
  } while (nc != 0);
25756
0
}
25757
25758
void xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic(
25759
    size_t mr,
25760
    size_t nc,
25761
    size_t kc,
25762
    size_t ks,
25763
    const uint8_t** restrict a,
25764
    const void* restrict w,
25765
    uint8_t* restrict c,
25766
    size_t cm_stride,
25767
    size_t cn_stride,
25768
    size_t a_offset,
25769
    const uint8_t* zero,
25770
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
25771
0
{
25772
0
  assert(mr != 0);
25773
0
  assert(mr <= 2);
25774
0
  assert(nc != 0);
25775
0
  assert(kc != 0);
25776
0
  assert(ks != 0);
25777
0
  assert(ks % (2 * sizeof(void*)) == 0);
25778
0
  assert(a != NULL);
25779
0
  assert(w != NULL);
25780
0
  assert(c != NULL);
25781
25782
0
  uint8_t* c0 = c;
25783
0
  uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
25784
0
  if XNN_UNPREDICTABLE(mr != 2) {
25785
0
    c1 = c0;
25786
0
  }
25787
25788
0
  const int32_t vb_zero_point = params->fp32_scalar_imagic.kernel_zero_point;
25789
0
  do {
25790
0
    int32_t vacc0x0 = unaligned_indexed_load_s32(w, 0);
25791
0
    int32_t vacc0x1 = unaligned_indexed_load_s32(w, 1);
25792
0
    int32_t vacc1x0 = vacc0x0;
25793
0
    int32_t vacc1x1 = vacc0x1;
25794
0
    w = (const void*) ((const int32_t*) w + 2);
25795
25796
0
    size_t p = ks;
25797
0
    do {
25798
0
      const uint8_t* restrict a0 = a[0];
25799
0
      assert(a0 != NULL);
25800
0
      if XNN_UNPREDICTABLE(a0 != zero) {
25801
0
        a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
25802
0
      }
25803
0
      const uint8_t* restrict a1 = a[1];
25804
0
      assert(a1 != NULL);
25805
0
      if XNN_UNPREDICTABLE(a1 != zero) {
25806
0
        a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
25807
0
      }
25808
0
      a += 2;
25809
25810
0
      size_t k = kc;
25811
0
      do {
25812
0
        const int32_t va0 = (int32_t) (uint32_t) *a0++;
25813
0
        const int32_t va1 = (int32_t) (uint32_t) *a1++;
25814
25815
0
        const int32_t vb0 = (int32_t) (uint32_t) ((const uint8_t*) w)[0] - vb_zero_point;
25816
0
        const int32_t vb1 = (int32_t) (uint32_t) ((const uint8_t*) w)[1] - vb_zero_point;
25817
0
        w = (const void*) ((const uint8_t*) w + 2);
25818
25819
0
        vacc0x0 += va0 * vb0;
25820
0
        vacc0x1 += va0 * vb1;
25821
0
        vacc1x0 += va1 * vb0;
25822
0
        vacc1x1 += va1 * vb1;
25823
25824
0
        k -= sizeof(uint8_t);
25825
0
      } while (k != 0);
25826
0
      p -= 2 * sizeof(void*);
25827
0
    } while (p != 0);
25828
25829
0
    float vfpacc0x0 = (float) vacc0x0;
25830
0
    float vfpacc0x1 = (float) vacc0x1;
25831
0
    float vfpacc1x0 = (float) vacc1x0;
25832
0
    float vfpacc1x1 = (float) vacc1x1;
25833
25834
0
    const float vscale = params->fp32_scalar_imagic.scale;
25835
0
    vfpacc0x0 *= vscale;
25836
0
    vfpacc0x1 *= vscale;
25837
0
    vfpacc1x0 *= vscale;
25838
0
    vfpacc1x1 *= vscale;
25839
25840
0
    const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
25841
0
    vfpacc0x0 += vmagic_bias;
25842
0
    vfpacc0x1 += vmagic_bias;
25843
0
    vfpacc1x0 += vmagic_bias;
25844
0
    vfpacc1x1 += vmagic_bias;
25845
25846
0
    int32_t vout0x0 = (int32_t) float_as_uint32(vfpacc0x0);
25847
0
    int32_t vout0x1 = (int32_t) float_as_uint32(vfpacc0x1);
25848
0
    int32_t vout1x0 = (int32_t) float_as_uint32(vfpacc1x0);
25849
0
    int32_t vout1x1 = (int32_t) float_as_uint32(vfpacc1x1);
25850
25851
0
    const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
25852
0
    vout0x0 = math_max_s32(vout0x0, vmagic_min);
25853
0
    vout0x1 = math_max_s32(vout0x1, vmagic_min);
25854
0
    vout1x0 = math_max_s32(vout1x0, vmagic_min);
25855
0
    vout1x1 = math_max_s32(vout1x1, vmagic_min);
25856
25857
0
    const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
25858
0
    vout0x0 = math_min_s32(vout0x0, vmagic_max);
25859
0
    vout0x1 = math_min_s32(vout0x1, vmagic_max);
25860
0
    vout1x0 = math_min_s32(vout1x0, vmagic_max);
25861
0
    vout1x1 = math_min_s32(vout1x1, vmagic_max);
25862
25863
0
    const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
25864
0
    vout0x0 -= vmagic_bias_less_zero_point;
25865
0
    vout0x1 -= vmagic_bias_less_zero_point;
25866
0
    vout1x0 -= vmagic_bias_less_zero_point;
25867
0
    vout1x1 -= vmagic_bias_less_zero_point;
25868
25869
0
    if XNN_LIKELY(nc >= 2) {
25870
0
      c1[0] = (uint8_t) vout1x0;
25871
0
      c1[1] = (uint8_t) vout1x1;
25872
0
      c0[0] = (uint8_t) vout0x0;
25873
0
      c0[1] = (uint8_t) vout0x1;
25874
25875
0
      c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
25876
0
      c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
25877
25878
0
      a = (const uint8_t**restrict) ((uintptr_t) a - ks);
25879
0
      nc -= 2;
25880
0
    } else {
25881
0
      if (nc & 1) {
25882
0
        c1[0] = (uint8_t) vout1x0;
25883
0
        c0[0] = (uint8_t) vout0x0;
25884
0
      }
25885
25886
0
      nc = 0;
25887
0
    }
25888
0
  } while (nc != 0);
25889
0
}
25890
25891
void xnn_qu8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf(
25892
    size_t mr,
25893
    size_t nc,
25894
    size_t kc,
25895
    size_t ks,
25896
    const uint8_t** restrict a,
25897
    const void* restrict w,
25898
    uint8_t* restrict c,
25899
    size_t cm_stride,
25900
    size_t cn_stride,
25901
    size_t a_offset,
25902
    const uint8_t* zero,
25903
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
25904
0
{
25905
0
  assert(mr != 0);
25906
0
  assert(mr <= 3);
25907
0
  assert(nc != 0);
25908
0
  assert(kc != 0);
25909
0
  assert(ks != 0);
25910
0
  assert(ks % (3 * sizeof(void*)) == 0);
25911
0
  assert(a != NULL);
25912
0
  assert(w != NULL);
25913
0
  assert(c != NULL);
25914
25915
0
  uint8_t* c0 = c;
25916
0
  uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
25917
0
  if XNN_UNPREDICTABLE(mr < 2) {
25918
0
    c1 = c0;
25919
0
  }
25920
0
  uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
25921
0
  if XNN_UNPREDICTABLE(mr <= 2) {
25922
0
    c2 = c1;
25923
0
  }
25924
25925
0
  const int32_t vb_zero_point = params->fp32_scalar_lrintf.kernel_zero_point;
25926
0
  do {
25927
0
    int32_t vacc0x0 = ((const int32_t*) w)[0];
25928
0
    int32_t vacc0x1 = ((const int32_t*) w)[1];
25929
0
    int32_t vacc0x2 = ((const int32_t*) w)[2];
25930
0
    int32_t vacc0x3 = ((const int32_t*) w)[3];
25931
0
    int32_t vacc1x0 = vacc0x0;
25932
0
    int32_t vacc1x1 = vacc0x1;
25933
0
    int32_t vacc1x2 = vacc0x2;
25934
0
    int32_t vacc1x3 = vacc0x3;
25935
0
    int32_t vacc2x0 = vacc0x0;
25936
0
    int32_t vacc2x1 = vacc0x1;
25937
0
    int32_t vacc2x2 = vacc0x2;
25938
0
    int32_t vacc2x3 = vacc0x3;
25939
0
    w = (const void*) ((const int32_t*) w + 4);
25940
25941
0
    size_t p = ks;
25942
0
    do {
25943
0
      const uint8_t* restrict a0 = a[0];
25944
0
      assert(a0 != NULL);
25945
0
      if XNN_UNPREDICTABLE(a0 != zero) {
25946
0
        a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
25947
0
      }
25948
0
      const uint8_t* restrict a1 = a[1];
25949
0
      assert(a1 != NULL);
25950
0
      if XNN_UNPREDICTABLE(a1 != zero) {
25951
0
        a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
25952
0
      }
25953
0
      const uint8_t* restrict a2 = a[2];
25954
0
      assert(a2 != NULL);
25955
0
      if XNN_UNPREDICTABLE(a2 != zero) {
25956
0
        a2 = (const uint8_t*) ((uintptr_t) a2 + a_offset);
25957
0
      }
25958
0
      a += 3;
25959
25960
0
      size_t k = kc;
25961
0
      do {
25962
0
        const int32_t va0 = (int32_t) (uint32_t) *a0++;
25963
0
        const int32_t va1 = (int32_t) (uint32_t) *a1++;
25964
0
        const int32_t va2 = (int32_t) (uint32_t) *a2++;
25965
25966
0
        const int32_t vb0 = (int32_t) (uint32_t) ((const uint8_t*) w)[0] - vb_zero_point;
25967
0
        const int32_t vb1 = (int32_t) (uint32_t) ((const uint8_t*) w)[1] - vb_zero_point;
25968
0
        const int32_t vb2 = (int32_t) (uint32_t) ((const uint8_t*) w)[2] - vb_zero_point;
25969
0
        const int32_t vb3 = (int32_t) (uint32_t) ((const uint8_t*) w)[3] - vb_zero_point;
25970
0
        w = (const void*) ((const uint8_t*) w + 4);
25971
25972
0
        vacc0x0 += va0 * vb0;
25973
0
        vacc0x1 += va0 * vb1;
25974
0
        vacc0x2 += va0 * vb2;
25975
0
        vacc0x3 += va0 * vb3;
25976
0
        vacc1x0 += va1 * vb0;
25977
0
        vacc1x1 += va1 * vb1;
25978
0
        vacc1x2 += va1 * vb2;
25979
0
        vacc1x3 += va1 * vb3;
25980
0
        vacc2x0 += va2 * vb0;
25981
0
        vacc2x1 += va2 * vb1;
25982
0
        vacc2x2 += va2 * vb2;
25983
0
        vacc2x3 += va2 * vb3;
25984
25985
0
        k -= sizeof(uint8_t);
25986
0
      } while (k != 0);
25987
0
      p -= 3 * sizeof(void*);
25988
0
    } while (p != 0);
25989
25990
0
    float vfpacc0x0 = (float) vacc0x0;
25991
0
    float vfpacc0x1 = (float) vacc0x1;
25992
0
    float vfpacc0x2 = (float) vacc0x2;
25993
0
    float vfpacc0x3 = (float) vacc0x3;
25994
0
    float vfpacc1x0 = (float) vacc1x0;
25995
0
    float vfpacc1x1 = (float) vacc1x1;
25996
0
    float vfpacc1x2 = (float) vacc1x2;
25997
0
    float vfpacc1x3 = (float) vacc1x3;
25998
0
    float vfpacc2x0 = (float) vacc2x0;
25999
0
    float vfpacc2x1 = (float) vacc2x1;
26000
0
    float vfpacc2x2 = (float) vacc2x2;
26001
0
    float vfpacc2x3 = (float) vacc2x3;
26002
26003
0
    const float vscale = params->fp32_scalar_lrintf.scale;
26004
0
    vfpacc0x0 *= vscale;
26005
0
    vfpacc0x1 *= vscale;
26006
0
    vfpacc0x2 *= vscale;
26007
0
    vfpacc0x3 *= vscale;
26008
0
    vfpacc1x0 *= vscale;
26009
0
    vfpacc1x1 *= vscale;
26010
0
    vfpacc1x2 *= vscale;
26011
0
    vfpacc1x3 *= vscale;
26012
0
    vfpacc2x0 *= vscale;
26013
0
    vfpacc2x1 *= vscale;
26014
0
    vfpacc2x2 *= vscale;
26015
0
    vfpacc2x3 *= vscale;
26016
26017
0
    const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
26018
0
    vfpacc0x0 = math_max_f32(vfpacc0x0, voutput_min_less_zero_point);
26019
0
    vfpacc0x1 = math_max_f32(vfpacc0x1, voutput_min_less_zero_point);
26020
0
    vfpacc0x2 = math_max_f32(vfpacc0x2, voutput_min_less_zero_point);
26021
0
    vfpacc0x3 = math_max_f32(vfpacc0x3, voutput_min_less_zero_point);
26022
0
    vfpacc1x0 = math_max_f32(vfpacc1x0, voutput_min_less_zero_point);
26023
0
    vfpacc1x1 = math_max_f32(vfpacc1x1, voutput_min_less_zero_point);
26024
0
    vfpacc1x2 = math_max_f32(vfpacc1x2, voutput_min_less_zero_point);
26025
0
    vfpacc1x3 = math_max_f32(vfpacc1x3, voutput_min_less_zero_point);
26026
0
    vfpacc2x0 = math_max_f32(vfpacc2x0, voutput_min_less_zero_point);
26027
0
    vfpacc2x1 = math_max_f32(vfpacc2x1, voutput_min_less_zero_point);
26028
0
    vfpacc2x2 = math_max_f32(vfpacc2x2, voutput_min_less_zero_point);
26029
0
    vfpacc2x3 = math_max_f32(vfpacc2x3, voutput_min_less_zero_point);
26030
26031
0
    const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
26032
0
    vfpacc0x0 = math_min_f32(vfpacc0x0, voutput_max_less_zero_point);
26033
0
    vfpacc0x1 = math_min_f32(vfpacc0x1, voutput_max_less_zero_point);
26034
0
    vfpacc0x2 = math_min_f32(vfpacc0x2, voutput_max_less_zero_point);
26035
0
    vfpacc0x3 = math_min_f32(vfpacc0x3, voutput_max_less_zero_point);
26036
0
    vfpacc1x0 = math_min_f32(vfpacc1x0, voutput_max_less_zero_point);
26037
0
    vfpacc1x1 = math_min_f32(vfpacc1x1, voutput_max_less_zero_point);
26038
0
    vfpacc1x2 = math_min_f32(vfpacc1x2, voutput_max_less_zero_point);
26039
0
    vfpacc1x3 = math_min_f32(vfpacc1x3, voutput_max_less_zero_point);
26040
0
    vfpacc2x0 = math_min_f32(vfpacc2x0, voutput_max_less_zero_point);
26041
0
    vfpacc2x1 = math_min_f32(vfpacc2x1, voutput_max_less_zero_point);
26042
0
    vfpacc2x2 = math_min_f32(vfpacc2x2, voutput_max_less_zero_point);
26043
0
    vfpacc2x3 = math_min_f32(vfpacc2x3, voutput_max_less_zero_point);
26044
26045
0
    const int32_t vrndacc0x0 = (int32_t) lrintf(vfpacc0x0);
26046
0
    const int32_t vrndacc0x1 = (int32_t) lrintf(vfpacc0x1);
26047
0
    const int32_t vrndacc0x2 = (int32_t) lrintf(vfpacc0x2);
26048
0
    const int32_t vrndacc0x3 = (int32_t) lrintf(vfpacc0x3);
26049
0
    const int32_t vrndacc1x0 = (int32_t) lrintf(vfpacc1x0);
26050
0
    const int32_t vrndacc1x1 = (int32_t) lrintf(vfpacc1x1);
26051
0
    const int32_t vrndacc1x2 = (int32_t) lrintf(vfpacc1x2);
26052
0
    const int32_t vrndacc1x3 = (int32_t) lrintf(vfpacc1x3);
26053
0
    const int32_t vrndacc2x0 = (int32_t) lrintf(vfpacc2x0);
26054
0
    const int32_t vrndacc2x1 = (int32_t) lrintf(vfpacc2x1);
26055
0
    const int32_t vrndacc2x2 = (int32_t) lrintf(vfpacc2x2);
26056
0
    const int32_t vrndacc2x3 = (int32_t) lrintf(vfpacc2x3);
26057
26058
0
    const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
26059
0
    int32_t vout0x0 = vrndacc0x0 + voutput_zero_point;
26060
0
    int32_t vout0x1 = vrndacc0x1 + voutput_zero_point;
26061
0
    int32_t vout0x2 = vrndacc0x2 + voutput_zero_point;
26062
0
    int32_t vout0x3 = vrndacc0x3 + voutput_zero_point;
26063
0
    int32_t vout1x0 = vrndacc1x0 + voutput_zero_point;
26064
0
    int32_t vout1x1 = vrndacc1x1 + voutput_zero_point;
26065
0
    int32_t vout1x2 = vrndacc1x2 + voutput_zero_point;
26066
0
    int32_t vout1x3 = vrndacc1x3 + voutput_zero_point;
26067
0
    int32_t vout2x0 = vrndacc2x0 + voutput_zero_point;
26068
0
    int32_t vout2x1 = vrndacc2x1 + voutput_zero_point;
26069
0
    int32_t vout2x2 = vrndacc2x2 + voutput_zero_point;
26070
0
    int32_t vout2x3 = vrndacc2x3 + voutput_zero_point;
26071
26072
0
    if XNN_LIKELY(nc >= 4) {
26073
0
      c2[0] = (uint8_t) vout2x0;
26074
0
      c2[1] = (uint8_t) vout2x1;
26075
0
      c2[2] = (uint8_t) vout2x2;
26076
0
      c2[3] = (uint8_t) vout2x3;
26077
0
      c1[0] = (uint8_t) vout1x0;
26078
0
      c1[1] = (uint8_t) vout1x1;
26079
0
      c1[2] = (uint8_t) vout1x2;
26080
0
      c1[3] = (uint8_t) vout1x3;
26081
0
      c0[0] = (uint8_t) vout0x0;
26082
0
      c0[1] = (uint8_t) vout0x1;
26083
0
      c0[2] = (uint8_t) vout0x2;
26084
0
      c0[3] = (uint8_t) vout0x3;
26085
26086
0
      c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
26087
0
      c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
26088
0
      c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
26089
26090
0
      a = (const uint8_t**restrict) ((uintptr_t) a - ks);
26091
0
      nc -= 4;
26092
0
    } else {
26093
0
      if (nc & 2) {
26094
0
        c2[0] = (uint8_t) vout2x0;
26095
0
        c2[1] = (uint8_t) vout2x1;
26096
0
        vout2x0 = vout2x2;
26097
0
        c2 += 2;
26098
0
        c1[0] = (uint8_t) vout1x0;
26099
0
        c1[1] = (uint8_t) vout1x1;
26100
0
        vout1x0 = vout1x2;
26101
0
        c1 += 2;
26102
0
        c0[0] = (uint8_t) vout0x0;
26103
0
        c0[1] = (uint8_t) vout0x1;
26104
0
        vout0x0 = vout0x2;
26105
0
        c0 += 2;
26106
0
      }
26107
0
      if (nc & 1) {
26108
0
        c2[0] = (uint8_t) vout2x0;
26109
0
        c1[0] = (uint8_t) vout1x0;
26110
0
        c0[0] = (uint8_t) vout0x0;
26111
0
      }
26112
26113
0
      nc = 0;
26114
0
    }
26115
0
  } while (nc != 0);
26116
0
}
26117
26118
void xnn_qu8_vadd_minmax_ukernel__scalar_u1(
26119
    size_t batch,
26120
    const uint8_t* input_a,
26121
    const uint8_t* input_b,
26122
    uint8_t* output,
26123
    const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
26124
0
{
26125
0
  assert(batch != 0);
26126
0
  assert(batch % sizeof(uint8_t) == 0);
26127
0
  assert(input_a != NULL);
26128
0
  assert(input_b != NULL);
26129
0
  assert(output != NULL);
26130
26131
0
  const int32_t vbias = params->scalar.bias;
26132
0
  const int32_t va_multiplier = params->scalar.a_multiplier;
26133
0
  const int32_t vb_multiplier = params->scalar.b_multiplier;
26134
0
  const uint32_t vshift = params->scalar.shift;
26135
0
  const int32_t voutput_min_less_zero_point = params->scalar.output_min_less_zero_point;
26136
0
  const int32_t voutput_max_less_zero_point = params->scalar.output_max_less_zero_point;
26137
0
  const int32_t voutput_zero_point = params->scalar.output_zero_point;
26138
26139
0
  do {
26140
0
    const int32_t va = *input_a++;
26141
0
    const int32_t vb = *input_b++;
26142
0
    const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier;
26143
26144
0
    int32_t vout = math_asr_s32(vacc, vshift);
26145
0
    vout = math_max_s32(vout, voutput_min_less_zero_point);
26146
0
    vout = math_min_s32(vout, voutput_max_less_zero_point);
26147
0
    *output++ = (uint8_t) (vout + voutput_zero_point);
26148
26149
0
    batch -= sizeof(uint8_t);
26150
0
  } while (batch != 0);
26151
0
}
26152
26153
void xnn_qu8_vadd_minmax_ukernel__scalar_u4(
26154
    size_t batch,
26155
    const uint8_t* input_a,
26156
    const uint8_t* input_b,
26157
    uint8_t* output,
26158
    const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
26159
0
{
26160
0
  assert(batch != 0);
26161
0
  assert(batch % sizeof(uint8_t) == 0);
26162
0
  assert(input_a != NULL);
26163
0
  assert(input_b != NULL);
26164
0
  assert(output != NULL);
26165
26166
0
  const int32_t vbias = params->scalar.bias;
26167
0
  const int32_t va_multiplier = params->scalar.a_multiplier;
26168
0
  const int32_t vb_multiplier = params->scalar.b_multiplier;
26169
0
  const uint32_t vshift = params->scalar.shift;
26170
0
  const int32_t voutput_min_less_zero_point = params->scalar.output_min_less_zero_point;
26171
0
  const int32_t voutput_max_less_zero_point = params->scalar.output_max_less_zero_point;
26172
0
  const int32_t voutput_zero_point = params->scalar.output_zero_point;
26173
26174
0
  for (; batch >= 4 * sizeof(uint8_t); batch -= 4 * sizeof(uint8_t)) {
26175
0
    const int32_t va0 = input_a[0];
26176
0
    const int32_t va1 = input_a[1];
26177
0
    const int32_t va2 = input_a[2];
26178
0
    const int32_t va3 = input_a[3];
26179
0
    input_a += 4;
26180
26181
0
    const int32_t vb0 = input_b[0];
26182
0
    int32_t vacc0 = vbias + va0 * va_multiplier;
26183
0
    const int32_t vb1 = input_b[1];
26184
0
    int32_t vacc1 = vbias + va1 * va_multiplier;
26185
0
    const int32_t vb2 = input_b[2];
26186
0
    int32_t vacc2 = vbias + va2 * va_multiplier;
26187
0
    const int32_t vb3 = input_b[3];
26188
0
    int32_t vacc3 = vbias + va3 * va_multiplier;
26189
0
    input_b += 4;
26190
26191
0
    vacc0 += vb0 * vb_multiplier;
26192
0
    vacc1 += vb1 * vb_multiplier;
26193
0
    vacc2 += vb2 * vb_multiplier;
26194
0
    vacc3 += vb3 * vb_multiplier;
26195
26196
0
    int32_t vout0 = math_asr_s32(vacc0, vshift);
26197
0
    int32_t vout1 = math_asr_s32(vacc1, vshift);
26198
0
    int32_t vout2 = math_asr_s32(vacc2, vshift);
26199
0
    int32_t vout3 = math_asr_s32(vacc3, vshift);
26200
26201
0
    vout0 = math_max_s32(vout0, voutput_min_less_zero_point);
26202
0
    vout1 = math_max_s32(vout1, voutput_min_less_zero_point);
26203
0
    vout2 = math_max_s32(vout2, voutput_min_less_zero_point);
26204
0
    vout3 = math_max_s32(vout3, voutput_min_less_zero_point);
26205
26206
0
    vout0 = math_min_s32(vout0, voutput_max_less_zero_point);
26207
0
    vout1 = math_min_s32(vout1, voutput_max_less_zero_point);
26208
0
    vout2 = math_min_s32(vout2, voutput_max_less_zero_point);
26209
0
    vout3 = math_min_s32(vout3, voutput_max_less_zero_point);
26210
26211
0
    vout0 += voutput_zero_point;
26212
0
    vout1 += voutput_zero_point;
26213
0
    vout2 += voutput_zero_point;
26214
0
    vout3 += voutput_zero_point;
26215
26216
0
    output[0] = (uint8_t) vout0;
26217
0
    output[1] = (uint8_t) vout1;
26218
0
    output[2] = (uint8_t) vout2;
26219
0
    output[3] = (uint8_t) vout3;
26220
0
    output += 4;
26221
0
  }
26222
0
  if XNN_UNLIKELY(batch != 0) {
26223
0
    do {
26224
0
      const int32_t va = *input_a++;
26225
0
      const int32_t vb = *input_b++;
26226
0
      const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier;
26227
26228
0
      int32_t vout = math_asr_s32(vacc, vshift);
26229
0
      vout = math_max_s32(vout, voutput_min_less_zero_point);
26230
0
      vout = math_min_s32(vout, voutput_max_less_zero_point);
26231
0
      *output++ = (uint8_t) (vout + voutput_zero_point);
26232
26233
0
      batch -= sizeof(uint8_t);
26234
0
    } while (batch != 0);
26235
0
  }
26236
0
}
26237
26238
void xnn_qu8_vaddc_minmax_ukernel__scalar_u1(
26239
    size_t batch,
26240
    const uint8_t* input_a,
26241
    const uint8_t* input_b,
26242
    uint8_t* output,
26243
    const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
26244
0
{
26245
0
  assert(batch != 0);
26246
0
  assert(batch % sizeof(uint8_t) == 0);
26247
0
  assert(input_a != NULL);
26248
0
  assert(input_b != NULL);
26249
0
  assert(output != NULL);
26250
26251
0
  const int32_t vbias = params->scalar.bias + (int32_t) *input_b * params->scalar.b_multiplier;
26252
0
  const int32_t va_multiplier = params->scalar.a_multiplier;
26253
0
  const uint32_t vshift = params->scalar.shift;
26254
0
  const int32_t voutput_min_less_zero_point = params->scalar.output_min_less_zero_point;
26255
0
  const int32_t voutput_max_less_zero_point = params->scalar.output_max_less_zero_point;
26256
0
  const int32_t voutput_zero_point = params->scalar.output_zero_point;
26257
26258
0
  do {
26259
0
    const int32_t va = *input_a++;
26260
0
    const int32_t vacc = vbias + va * va_multiplier;
26261
26262
0
    int32_t vout = math_asr_s32(vacc, vshift);
26263
0
    vout = math_max_s32(vout, voutput_min_less_zero_point);
26264
0
    vout = math_min_s32(vout, voutput_max_less_zero_point);
26265
0
    *output++ = (uint8_t) (vout + voutput_zero_point);
26266
26267
0
    batch -= sizeof(uint8_t);
26268
0
  } while (batch != 0);
26269
0
}
26270
26271
void xnn_qu8_vaddc_minmax_ukernel__scalar_u4(
26272
    size_t batch,
26273
    const uint8_t* input_a,
26274
    const uint8_t* input_b,
26275
    uint8_t* output,
26276
    const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
26277
0
{
26278
0
  assert(batch != 0);
26279
0
  assert(batch % sizeof(uint8_t) == 0);
26280
0
  assert(input_a != NULL);
26281
0
  assert(input_b != NULL);
26282
0
  assert(output != NULL);
26283
26284
0
  const int32_t vbias = params->scalar.bias + (int32_t) *input_b * params->scalar.b_multiplier;
26285
0
  const int32_t va_multiplier = params->scalar.a_multiplier;
26286
0
  const uint32_t vshift = params->scalar.shift;
26287
0
  const int32_t voutput_min_less_zero_point = params->scalar.output_min_less_zero_point;
26288
0
  const int32_t voutput_max_less_zero_point = params->scalar.output_max_less_zero_point;
26289
0
  const int32_t voutput_zero_point = params->scalar.output_zero_point;
26290
26291
0
  for (; batch >= 4 * sizeof(uint8_t); batch -= 4 * sizeof(uint8_t)) {
26292
0
    const int32_t va0 = input_a[0];
26293
0
    const int32_t va1 = input_a[1];
26294
0
    const int32_t va2 = input_a[2];
26295
0
    const int32_t va3 = input_a[3];
26296
0
    input_a += 4;
26297
26298
0
    const int32_t vacc0 = vbias + va0 * va_multiplier;
26299
0
    const int32_t vacc1 = vbias + va1 * va_multiplier;
26300
0
    const int32_t vacc2 = vbias + va2 * va_multiplier;
26301
0
    const int32_t vacc3 = vbias + va3 * va_multiplier;
26302
0
    input_b += 4;
26303
26304
0
    int32_t vout0 = math_asr_s32(vacc0, vshift);
26305
0
    int32_t vout1 = math_asr_s32(vacc1, vshift);
26306
0
    int32_t vout2 = math_asr_s32(vacc2, vshift);
26307
0
    int32_t vout3 = math_asr_s32(vacc3, vshift);
26308
26309
0
    vout0 = math_max_s32(vout0, voutput_min_less_zero_point);
26310
0
    vout1 = math_max_s32(vout1, voutput_min_less_zero_point);
26311
0
    vout2 = math_max_s32(vout2, voutput_min_less_zero_point);
26312
0
    vout3 = math_max_s32(vout3, voutput_min_less_zero_point);
26313
26314
0
    vout0 = math_min_s32(vout0, voutput_max_less_zero_point);
26315
0
    vout1 = math_min_s32(vout1, voutput_max_less_zero_point);
26316
0
    vout2 = math_min_s32(vout2, voutput_max_less_zero_point);
26317
0
    vout3 = math_min_s32(vout3, voutput_max_less_zero_point);
26318
26319
0
    vout0 += voutput_zero_point;
26320
0
    vout1 += voutput_zero_point;
26321
0
    vout2 += voutput_zero_point;
26322
0
    vout3 += voutput_zero_point;
26323
26324
0
    output[0] = (uint8_t) vout0;
26325
0
    output[1] = (uint8_t) vout1;
26326
0
    output[2] = (uint8_t) vout2;
26327
0
    output[3] = (uint8_t) vout3;
26328
0
    output += 4;
26329
0
  }
26330
0
  if XNN_UNLIKELY(batch != 0) {
26331
0
    do {
26332
0
      const int32_t va = *input_a++;
26333
0
      const int32_t vacc = vbias + va * va_multiplier;
26334
26335
0
      int32_t vout = math_asr_s32(vacc, vshift);
26336
0
      vout = math_max_s32(vout, voutput_min_less_zero_point);
26337
0
      vout = math_min_s32(vout, voutput_max_less_zero_point);
26338
0
      *output++ = (uint8_t) (vout + voutput_zero_point);
26339
26340
0
      batch -= sizeof(uint8_t);
26341
0
    } while (batch != 0);
26342
0
  }
26343
0
}
26344
26345
void xnn_qu8_vcvt_ukernel__scalar_u1(
26346
    size_t batch,
26347
    const uint8_t* input,
26348
    uint8_t* output,
26349
    const union xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
26350
0
{
26351
0
  assert(batch != 0);
26352
0
  assert(batch % sizeof(uint8_t) == 0);
26353
0
  assert(input != NULL);
26354
0
  assert(output != NULL);
26355
26356
0
  const int32_t vbias = params->scalar.bias;
26357
0
  const int32_t vmultiplier = params->scalar.multiplier;
26358
0
  do {
26359
0
    int32_t vacc = *input++;
26360
0
    vacc = vbias + vacc * vmultiplier;
26361
26362
0
    int32_t vout = math_asr_s32(vacc, 8);
26363
0
    vout = math_max_s32(vout, 0);
26364
0
    vout = math_min_s32(vout, 255);
26365
0
    *output++ = (uint8_t) vout;
26366
26367
0
    batch -= sizeof(uint8_t);
26368
0
  } while (batch != 0);
26369
0
}
26370
26371
void xnn_qu8_vcvt_ukernel__scalar_u4(
26372
    size_t batch,
26373
    const uint8_t* input,
26374
    uint8_t* output,
26375
    const union xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
26376
0
{
26377
0
  assert(batch != 0);
26378
0
  assert(batch % sizeof(uint8_t) == 0);
26379
0
  assert(input != NULL);
26380
0
  assert(output != NULL);
26381
26382
0
  const int32_t vbias = params->scalar.bias;
26383
0
  const int32_t vmultiplier = params->scalar.multiplier;
26384
0
  for (; batch >= 4 * sizeof(uint8_t); batch -= 4 * sizeof(uint8_t)) {
26385
0
    int32_t vacc0 = input[0];
26386
0
    int32_t vacc1 = input[1];
26387
0
    int32_t vacc2 = input[2];
26388
0
    int32_t vacc3 = input[3];
26389
0
    input += 4;
26390
26391
0
    vacc0 = vbias + vacc0 * vmultiplier;
26392
0
    vacc1 = vbias + vacc1 * vmultiplier;
26393
0
    vacc2 = vbias + vacc2 * vmultiplier;
26394
0
    vacc3 = vbias + vacc3 * vmultiplier;
26395
26396
0
    int32_t vout0 = math_asr_s32(vacc0, 8);
26397
0
    int32_t vout1 = math_asr_s32(vacc1, 8);
26398
0
    int32_t vout2 = math_asr_s32(vacc2, 8);
26399
0
    int32_t vout3 = math_asr_s32(vacc3, 8);
26400
26401
0
    vout0 = math_max_s32(vout0, 0);
26402
0
    vout1 = math_max_s32(vout1, 0);
26403
0
    vout2 = math_max_s32(vout2, 0);
26404
0
    vout3 = math_max_s32(vout3, 0);
26405
26406
0
    vout0 = math_min_s32(vout0, 255);
26407
0
    vout1 = math_min_s32(vout1, 255);
26408
0
    vout2 = math_min_s32(vout2, 255);
26409
0
    vout3 = math_min_s32(vout3, 255);
26410
26411
0
    output[0] = (uint8_t) vout0;
26412
0
    output[1] = (uint8_t) vout1;
26413
0
    output[2] = (uint8_t) vout2;
26414
0
    output[3] = (uint8_t) vout3;
26415
0
    output += 4;
26416
0
  }
26417
0
  if XNN_UNLIKELY(batch != 0) {
26418
0
    do {
26419
0
      int32_t vacc = *input++;
26420
0
      vacc = vbias + vacc * vmultiplier;
26421
26422
0
      int32_t vout = math_asr_s32(vacc, 8);
26423
0
      vout = math_max_s32(vout, 0);
26424
0
      vout = math_min_s32(vout, 255);
26425
0
      *output++ = (uint8_t) vout;
26426
26427
0
      batch -= sizeof(uint8_t);
26428
0
    } while (batch != 0);
26429
0
  }
26430
0
}
26431
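xnn_qu8_vcvt re-quantizes a tensor to a new scale and zero point with one multiply, a fixed right shift by 8, and a clamp to [0, 255]. In the sketch below the bias is assumed to fold the old and new zero points (plus rounding) together and the multiplier to carry the scale ratio in fixed point, both prepared outside the kernel; only the per-element arithmetic mirrors the code above.

#include <stdint.h>

static uint8_t qu8_cvt_one(uint8_t x, int32_t bias, int32_t multiplier) {
  const int32_t acc = bias + (int32_t) x * multiplier;
  int32_t out = acc >> 8;   /* arithmetic shift, as math_asr_s32(acc, 8) above */
  if (out < 0)   out = 0;
  if (out > 255) out = 255;
  return (uint8_t) out;
}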
26432
void xnn_qu8_vlrelu_ukernel__scalar_andxor_u4(
26433
    size_t batch,
26434
    const uint8_t* input,
26435
    uint8_t* output,
26436
    const union xnn_qu8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)])
26437
0
{
26438
0
  assert(batch != 0);
26439
0
  assert(batch % sizeof(uint8_t) == 0);
26440
0
  assert(input != NULL);
26441
0
  assert(output != NULL);
26442
26443
0
  const int32_t vinput_zero_point = params->scalar_andxor.input_zero_point;
26444
0
  const int32_t vmultiplier_diff = params->scalar_andxor.multiplier_diff;
26445
0
  const int32_t vmultiplier_base = params->scalar_andxor.multiplier_base;
26446
0
  const int32_t vbias = params->scalar_andxor.bias;
26447
0
  for (; batch >= 4 * sizeof(uint8_t); batch -= 4 * sizeof(uint8_t)) {
26448
0
    int32_t vacc0 = (int32_t) input[0];
26449
0
    int32_t vacc1 = (int32_t) input[1];
26450
0
    int32_t vacc2 = (int32_t) input[2];
26451
0
    int32_t vacc3 = (int32_t) input[3];
26452
0
    input += 4;
26453
26454
0
    vacc0 -= vinput_zero_point;
26455
0
    vacc1 -= vinput_zero_point;
26456
0
    vacc2 -= vinput_zero_point;
26457
0
    vacc3 -= vinput_zero_point;
26458
26459
0
    int32_t vmultiplier0 = math_asr_s32(vacc0, 31);
26460
0
    int32_t vmultiplier1 = math_asr_s32(vacc1, 31);
26461
0
    int32_t vmultiplier2 = math_asr_s32(vacc2, 31);
26462
0
    int32_t vmultiplier3 = math_asr_s32(vacc3, 31);
26463
26464
0
    vmultiplier0 &= vmultiplier_diff;
26465
0
    vmultiplier1 &= vmultiplier_diff;
26466
0
    vmultiplier2 &= vmultiplier_diff;
26467
0
    vmultiplier3 &= vmultiplier_diff;
26468
26469
0
    vmultiplier0 ^= vmultiplier_base;
26470
0
    vmultiplier1 ^= vmultiplier_base;
26471
0
    vmultiplier2 ^= vmultiplier_base;
26472
0
    vmultiplier3 ^= vmultiplier_base;
26473
26474
0
    vacc0 = vbias + vacc0 * vmultiplier0;
26475
0
    vacc1 = vbias + vacc1 * vmultiplier1;
26476
0
    vacc2 = vbias + vacc2 * vmultiplier2;
26477
0
    vacc3 = vbias + vacc3 * vmultiplier3;
26478
26479
0
    int32_t vout0 = math_asr_s32(vacc0, 8);
26480
0
    int32_t vout1 = math_asr_s32(vacc1, 8);
26481
0
    int32_t vout2 = math_asr_s32(vacc2, 8);
26482
0
    int32_t vout3 = math_asr_s32(vacc3, 8);
26483
26484
0
    vout0 = math_max_s32(vout0, 0);
26485
0
    vout1 = math_max_s32(vout1, 0);
26486
0
    vout2 = math_max_s32(vout2, 0);
26487
0
    vout3 = math_max_s32(vout3, 0);
26488
26489
0
    vout0 = math_min_s32(vout0, 255);
26490
0
    vout1 = math_min_s32(vout1, 255);
26491
0
    vout2 = math_min_s32(vout2, 255);
26492
0
    vout3 = math_min_s32(vout3, 255);
26493
26494
0
    output[0] = (uint8_t) vout0;
26495
0
    output[1] = (uint8_t) vout1;
26496
0
    output[2] = (uint8_t) vout2;
26497
0
    output[3] = (uint8_t) vout3;
26498
0
    output += 4;
26499
0
  }
26500
0
  if XNN_UNLIKELY(batch != 0) {
26501
0
    do {
26502
0
      int32_t vacc = (int32_t) *input++ - vinput_zero_point;
26503
0
      const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & math_asr_s32(vacc, 31));
26504
0
      vacc = vbias + vacc * vmultiplier;
26505
26506
0
      int32_t vout = math_asr_s32(vacc, 8);
26507
0
      vout = math_max_s32(vout, 0);
26508
0
      vout = math_min_s32(vout, 255);
26509
0
      *output++ = (uint8_t) vout;
26510
26511
0
      batch -= sizeof(uint8_t);
26512
0
    } while (batch != 0);
26513
0
  }
26514
0
}
26515
26516
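// The "andxor" leaky-ReLU variant above picks its per-element multiplier
// without a branch: asr(x, 31) is 0 for x >= 0 and all-ones for x < 0, so
// base ^ (diff & (x >> 31)) yields `base` for non-negative inputs and
// `base ^ diff` for negative ones. The sketch below shows just that selection
// trick on plain int32_t values; the concrete base/diff encoding produced by
// XNNPACK's param initializers is not reproduced here, and `>> 31` is assumed
// to be an arithmetic shift (as math_asr_s32 guarantees).
#include <assert.h>
#include <stdint.h>

static int32_t andxor_select(int32_t x, int32_t pos_mult, int32_t neg_mult) {
  const int32_t diff = pos_mult ^ neg_mult;   // bits that differ between the two
  const int32_t base = pos_mult;              // value chosen when x >= 0
  const int32_t sign = x >> 31;               // 0 or all-ones sign mask
  return base ^ (diff & sign);
}

int main(void) {
  assert(andxor_select(+7, 256, 32) == 256);  // non-negative input -> positive multiplier
  assert(andxor_select(-7, 256, 32) == 32);   // negative input -> negative multiplier
  return 0;
}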
void xnn_qu8_vlrelu_ukernel__scalar_select_u4(
26517
    size_t batch,
26518
    const uint8_t* input,
26519
    uint8_t* output,
26520
    const union xnn_qu8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)])
26521
0
{
26522
0
  assert(batch != 0);
26523
0
  assert(batch % sizeof(uint8_t) == 0);
26524
0
  assert(input != NULL);
26525
0
  assert(output != NULL);
26526
26527
0
  const int32_t vinput_zero_point = params->scalar_select.input_zero_point;
26528
0
  const int32_t vpositive_multiplier = params->scalar_select.positive_multiplier;
26529
0
  const int32_t vnegative_multiplier = params->scalar_select.negative_multiplier;
26530
0
  const int32_t vbias = params->scalar_select.bias;
26531
0
  for (; batch >= 4 * sizeof(uint8_t); batch -= 4 * sizeof(uint8_t)) {
26532
0
    int32_t vacc0 = (int32_t) input[0];
26533
0
    int32_t vacc1 = (int32_t) input[1];
26534
0
    int32_t vacc2 = (int32_t) input[2];
26535
0
    int32_t vacc3 = (int32_t) input[3];
26536
0
    input += 4;
26537
26538
0
    vacc0 -= vinput_zero_point;
26539
0
    vacc1 -= vinput_zero_point;
26540
0
    vacc2 -= vinput_zero_point;
26541
0
    vacc3 -= vinput_zero_point;
26542
26543
0
    const int32_t vmultiplier0 = XNN_UNPREDICTABLE(vacc0 >= 0) ? vpositive_multiplier : vnegative_multiplier;
26544
0
    const int32_t vmultiplier1 = XNN_UNPREDICTABLE(vacc1 >= 0) ? vpositive_multiplier : vnegative_multiplier;
26545
0
    const int32_t vmultiplier2 = XNN_UNPREDICTABLE(vacc2 >= 0) ? vpositive_multiplier : vnegative_multiplier;
26546
0
    const int32_t vmultiplier3 = XNN_UNPREDICTABLE(vacc3 >= 0) ? vpositive_multiplier : vnegative_multiplier;
26547
26548
0
    vacc0 = vbias + vacc0 * vmultiplier0;
26549
0
    vacc1 = vbias + vacc1 * vmultiplier1;
26550
0
    vacc2 = vbias + vacc2 * vmultiplier2;
26551
0
    vacc3 = vbias + vacc3 * vmultiplier3;
26552
26553
0
    int32_t vout0 = math_asr_s32(vacc0, 8);
26554
0
    int32_t vout1 = math_asr_s32(vacc1, 8);
26555
0
    int32_t vout2 = math_asr_s32(vacc2, 8);
26556
0
    int32_t vout3 = math_asr_s32(vacc3, 8);
26557
26558
0
    vout0 = math_max_s32(vout0, 0);
26559
0
    vout1 = math_max_s32(vout1, 0);
26560
0
    vout2 = math_max_s32(vout2, 0);
26561
0
    vout3 = math_max_s32(vout3, 0);
26562
26563
0
    vout0 = math_min_s32(vout0, 255);
26564
0
    vout1 = math_min_s32(vout1, 255);
26565
0
    vout2 = math_min_s32(vout2, 255);
26566
0
    vout3 = math_min_s32(vout3, 255);
26567
26568
0
    output[0] = (uint8_t) vout0;
26569
0
    output[1] = (uint8_t) vout1;
26570
0
    output[2] = (uint8_t) vout2;
26571
0
    output[3] = (uint8_t) vout3;
26572
0
    output += 4;
26573
0
  }
26574
0
  if XNN_UNLIKELY(batch != 0) {
26575
0
    do {
26576
0
      int32_t vacc = (int32_t) *input++ - vinput_zero_point;
26577
0
      const int32_t vmultiplier = XNN_UNPREDICTABLE(vacc >= 0) ? vpositive_multiplier : vnegative_multiplier;
26578
0
      vacc = vbias + vacc * vmultiplier;
26579
26580
0
      int32_t vout = math_asr_s32(vacc, 8);
26581
0
      vout = math_max_s32(vout, 0);
26582
0
      vout = math_min_s32(vout, 255);
26583
0
      *output++ = (uint8_t) vout;
26584
26585
0
      batch -= sizeof(uint8_t);
26586
0
    } while (batch != 0);
26587
0
  }
26588
0
}
26589
26590
void xnn_qu8_vmul_minmax_fp32_ukernel__scalar_u4(
26591
    size_t batch,
26592
    const uint8_t* input_a,
26593
    const uint8_t* input_b,
26594
    uint8_t* output,
26595
    const union xnn_qu8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
26596
0
{
26597
0
  assert(batch != 0);
26598
0
  assert(batch % sizeof(uint8_t) == 0);
26599
0
  assert(input_a != NULL);
26600
0
  assert(input_b != NULL);
26601
0
  assert(output != NULL);
26602
26603
0
  const int32_t va_zero_point = params->fp32_scalar.a_zero_point;
26604
0
  const int32_t vb_zero_point = params->fp32_scalar.b_zero_point;
26605
0
  const float vscale = params->fp32_scalar.scale;
26606
0
  const float voutput_min_less_zero_point = params->fp32_scalar.output_min_less_zero_point;
26607
0
  const float voutput_max_less_zero_point = params->fp32_scalar.output_max_less_zero_point;
26608
0
  const float vmagic_bias = params->fp32_scalar.magic_bias;
26609
0
  const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar.magic_bias_less_output_zero_point;
26610
26611
0
  for (; batch >= 4 * sizeof(uint8_t); batch -= 4 * sizeof(uint8_t)) {
26612
0
    const int32_t va0 = input_a[0] - va_zero_point;
26613
0
    const int32_t va1 = input_a[1] - va_zero_point;
26614
0
    const int32_t va2 = input_a[2] - va_zero_point;
26615
0
    const int32_t va3 = input_a[3] - va_zero_point;
26616
0
    input_a += 4;
26617
26618
0
    const int32_t vb0 = input_b[0] - vb_zero_point;
26619
0
    const int32_t vb1 = input_b[1] - vb_zero_point;
26620
0
    const int32_t vb2 = input_b[2] - vb_zero_point;
26621
0
    const int32_t vb3 = input_b[3] - vb_zero_point;
26622
0
    input_b += 4;
26623
26624
0
    const int32_t vacc0 = va0 * vb0;
26625
0
    const int32_t vacc1 = va1 * vb1;
26626
0
    const int32_t vacc2 = va2 * vb2;
26627
0
    const int32_t vacc3 = va3 * vb3;
26628
26629
0
    float vfpacc0 = (float) vacc0 * vscale;
26630
0
    float vfpacc1 = (float) vacc1 * vscale;
26631
0
    float vfpacc2 = (float) vacc2 * vscale;
26632
0
    float vfpacc3 = (float) vacc3 * vscale;
26633
26634
0
    vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point);
26635
0
    vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point);
26636
0
    vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point);
26637
0
    vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point);
26638
26639
0
    vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point);
26640
0
    vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point);
26641
0
    vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point);
26642
0
    vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point);
26643
26644
0
    vfpacc0 += vmagic_bias;
26645
0
    vfpacc1 += vmagic_bias;
26646
0
    vfpacc2 += vmagic_bias;
26647
0
    vfpacc3 += vmagic_bias;
26648
26649
0
    const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point;
26650
0
    const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point;
26651
0
    const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point;
26652
0
    const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point;
26653
26654
0
    output[0] = (uint8_t) vout0;
26655
0
    output[1] = (uint8_t) vout1;
26656
0
    output[2] = (uint8_t) vout2;
26657
0
    output[3] = (uint8_t) vout3;
26658
0
    output += 4;
26659
0
  }
26660
0
  if XNN_UNLIKELY(batch != 0) {
26661
0
    do {
26662
0
      const int32_t va = (int32_t) *input_a++ - va_zero_point;
26663
0
      const int32_t vb = (int32_t) *input_b++ - vb_zero_point;
26664
0
      const int32_t vacc = va * vb;
26665
26666
0
      float vfpacc = (float) vacc * vscale;
26667
0
      vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
26668
0
      vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
26669
0
      vfpacc += vmagic_bias;
26670
0
      const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point;
26671
0
      *output++ = (uint8_t) vout;
26672
26673
0
      batch -= sizeof(uint8_t);
26674
0
    } while (batch != 0);
26675
0
  }
26676
0
}
26677
26678
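// The fp32 requantization above rounds with the "magic bias" trick: adding
// 2**23 * 1.5 (12582912.0f) to a float already clamped to an 8-bit range
// leaves the round-to-nearest-even integer in the low mantissa bits, so
// reinterpreting the sum as uint32_t and subtracting the bias bit pattern
// (less the output zero point) recovers the rounded, zero-point-adjusted
// value. A minimal standalone sketch, assuming IEEE-754 binary32 and the
// default rounding mode:
#include <assert.h>
#include <stdint.h>
#include <string.h>

static int32_t magic_bias_round(float x, int32_t output_zero_point) {
  const float vmagic_bias = 12582912.0f;                   // 1.5 * 2**23
  uint32_t bias_bits;
  memcpy(&bias_bits, &vmagic_bias, sizeof(bias_bits));
  const int32_t magic_less_zp = (int32_t) bias_bits - output_zero_point;

  const float biased = x + vmagic_bias;                    // integer now sits in the mantissa
  uint32_t bits;
  memcpy(&bits, &biased, sizeof(bits));
  return (int32_t) bits - magic_less_zp;                   // rounded x plus the zero point
}

int main(void) {
  assert(magic_bias_round(36.4f, 0) == 36);    // rounds to nearest
  assert(magic_bias_round(-3.5f, 128) == 124); // -4 + 128, ties to even
  return 0;
}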
void xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_u4(
26679
    size_t batch,
26680
    const uint8_t* input_a,
26681
    const uint8_t* input_b,
26682
    uint8_t* output,
26683
    const union xnn_qu8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
26684
0
{
26685
0
  assert(batch != 0);
26686
0
  assert(batch % sizeof(uint8_t) == 0);
26687
0
  assert(input_a != NULL);
26688
0
  assert(input_b != NULL);
26689
0
  assert(output != NULL);
26690
26691
0
  const int32_t va_zero_point = params->fp32_scalar.a_zero_point;
26692
0
  const float vscale = params->fp32_scalar.scale;
26693
0
  const float voutput_min_less_zero_point = params->fp32_scalar.output_min_less_zero_point;
26694
0
  const float voutput_max_less_zero_point = params->fp32_scalar.output_max_less_zero_point;
26695
0
  const float vmagic_bias = params->fp32_scalar.magic_bias;
26696
0
  const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar.magic_bias_less_output_zero_point;
26697
26698
0
  const int32_t vb = (int32_t) *input_b - params->fp32_scalar.b_zero_point;
26699
0
  for (; batch >= 4 * sizeof(uint8_t); batch -= 4 * sizeof(uint8_t)) {
26700
0
    const int32_t va0 = input_a[0] - va_zero_point;
26701
0
    const int32_t va1 = input_a[1] - va_zero_point;
26702
0
    const int32_t va2 = input_a[2] - va_zero_point;
26703
0
    const int32_t va3 = input_a[3] - va_zero_point;
26704
0
    input_a += 4;
26705
26706
0
    const int32_t vacc0 = va0 * vb;
26707
0
    const int32_t vacc1 = va1 * vb;
26708
0
    const int32_t vacc2 = va2 * vb;
26709
0
    const int32_t vacc3 = va3 * vb;
26710
26711
0
    float vfpacc0 = (float) vacc0 * vscale;
26712
0
    float vfpacc1 = (float) vacc1 * vscale;
26713
0
    float vfpacc2 = (float) vacc2 * vscale;
26714
0
    float vfpacc3 = (float) vacc3 * vscale;
26715
26716
0
    vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point);
26717
0
    vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point);
26718
0
    vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point);
26719
0
    vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point);
26720
26721
0
    vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point);
26722
0
    vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point);
26723
0
    vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point);
26724
0
    vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point);
26725
26726
0
    vfpacc0 += vmagic_bias;
26727
0
    vfpacc1 += vmagic_bias;
26728
0
    vfpacc2 += vmagic_bias;
26729
0
    vfpacc3 += vmagic_bias;
26730
26731
0
    const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point;
26732
0
    const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point;
26733
0
    const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point;
26734
0
    const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point;
26735
26736
0
    output[0] = (uint8_t) vout0;
26737
0
    output[1] = (uint8_t) vout1;
26738
0
    output[2] = (uint8_t) vout2;
26739
0
    output[3] = (uint8_t) vout3;
26740
0
    output += 4;
26741
0
  }
26742
0
  if XNN_UNLIKELY(batch != 0) {
26743
0
    do {
26744
0
      const int32_t va = (int32_t) *input_a++ - va_zero_point;
26745
0
      const int32_t vacc = va * vb;
26746
26747
0
      float vfpacc = (float) vacc * vscale;
26748
0
      vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
26749
0
      vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
26750
0
      vfpacc += vmagic_bias;
26751
0
      const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point;
26752
0
      *output++ = (uint8_t) vout;
26753
26754
0
      batch -= sizeof(uint8_t);
26755
0
    } while (batch != 0);
26756
0
  }
26757
0
}
26758
26759
void xnn_s8_ibilinear_ukernel__scalar_c1(
26760
    size_t output_pixels,
26761
    size_t channels,
26762
    const int8_t** restrict input,
26763
    size_t input_offset,
26764
    const int16_t* restrict weights,
26765
    int8_t* restrict output,
26766
    size_t output_increment)
26767
0
{
26768
0
  assert(output_pixels != 0);
26769
0
  assert(channels != 0);
26770
26771
0
  do {
26772
0
    const int8_t* i0 = (const int8_t*) ((uintptr_t) input[0] + input_offset);
26773
0
    const int8_t* i1 = (const int8_t*) ((uintptr_t) input[1] + input_offset);
26774
0
    const int8_t* i2 = (const int8_t*) ((uintptr_t) input[2] + input_offset);
26775
0
    const int8_t* i3 = (const int8_t*) ((uintptr_t) input[3] + input_offset);
26776
0
    input += 4;
26777
26778
0
    const int32_t valphah = (int32_t) (uint32_t) (uint16_t) weights[0];
26779
0
    const int32_t valphav = (int32_t) (uint32_t) (uint16_t) weights[1];
26780
0
    weights += 2;
26781
26782
0
    const int32_t vrounding = INT32_C(0x00200000);
26783
26784
0
    size_t c = channels;
26785
0
    do {
26786
0
      const int32_t vtl = (int32_t) *i0++;
26787
0
      const int32_t vtr = (int32_t) *i1++;
26788
0
      const int32_t vbl = (int32_t) *i2++;
26789
0
      const int32_t vbr = (int32_t) *i3++;
26790
26791
0
      const int32_t vtd = vtr - vtl;
26792
0
      const int32_t vbd = vbr - vbl;
26793
26794
0
      const int32_t vt = (int32_t) ((uint32_t) vtl << 11) + vtd * valphah;
26795
0
      const int32_t vb = (int32_t) ((uint32_t) vbl << 11) + vbd * valphah;
26796
26797
0
      const int32_t vd = vb - vt;
26798
26799
0
      const int32_t vacc = (int32_t) ((uint32_t) vt << 11) + vd * valphav;
26800
26801
0
      const int32_t vo = math_asr_s32(vacc + vrounding, 22);
26802
26803
0
      *output++ = vo;
26804
26805
0
      c -= sizeof(int8_t);
26806
0
    } while (c != 0);
26807
26808
0
    output = (int8_t*) ((uintptr_t) output + output_increment);
26809
0
  } while (--output_pixels != 0);
26810
0
}
26811
26812
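// The scalar ibilinear kernels above interpolate in Q11 fixed point: the
// horizontal and vertical weights are 11-bit fractions (0..2048), the two
// lerps accumulate 22 fractional bits, and the result is rounded by adding
// 2**21 before an arithmetic shift by 22. A standalone restatement of that
// math (the plain `>>` stands in for math_asr_s32):
#include <assert.h>
#include <stdint.h>

static int32_t ibilinear_q11(int32_t tl, int32_t tr, int32_t bl, int32_t br,
                             int32_t alpha_h, int32_t alpha_v) {
  // alpha_h and alpha_v are in [0, 2048], i.e. [0.0, 1.0] in Q11.
  const int32_t top    = (int32_t) ((uint32_t) tl << 11) + (tr - tl) * alpha_h;      // Q11
  const int32_t bottom = (int32_t) ((uint32_t) bl << 11) + (br - bl) * alpha_h;      // Q11
  const int32_t acc    = (int32_t) ((uint32_t) top << 11) + (bottom - top) * alpha_v; // Q22
  return (acc + (1 << 21)) >> 22;                                                    // round to nearest
}

int main(void) {
  // Exactly halfway in both directions: the average of the four corners.
  assert(ibilinear_q11(0, 4, 8, 12, 1024, 1024) == 6);
  // Both weights zero: the top-left corner passes through unchanged.
  assert(ibilinear_q11(-5, 100, 100, 100, 0, 0) == -5);
  return 0;
}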
void xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1(
26813
    size_t output_pixels,
26814
    size_t kernel_elements,
26815
    size_t channels,
26816
    const int8_t** input,
26817
    size_t input_offset,
26818
    int8_t* output,
26819
    size_t input_increment,
26820
    size_t output_increment,
26821
    const union xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
26822
0
{
26823
0
  assert(output_pixels != 0);
26824
0
  assert(kernel_elements != 0);
26825
0
  assert(channels != 0);
26826
26827
0
  const int32_t voutput_max = params->scalar.max;
26828
0
  const int32_t voutput_min = params->scalar.min;
26829
0
  do {
26830
0
    int8_t* o = output;
26831
0
    {
26832
0
      const int8_t* i0 = *input++;
26833
0
      const int8_t* i1 = *input++;
26834
0
      const int8_t* i2 = *input++;
26835
0
      const int8_t* i3 = *input++;
26836
0
      const int8_t* i4 = *input++;
26837
0
      const int8_t* i5 = *input++;
26838
0
      const int8_t* i6 = *input++;
26839
0
      const int8_t* i7 = *input++;
26840
0
      const int8_t* i8 = *input++;
26841
0
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
26842
0
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
26843
0
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
26844
0
      i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
26845
0
      i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
26846
0
      i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
26847
0
      i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
26848
0
      i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
26849
0
      i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
26850
0
      if (kernel_elements < 2) {
26851
0
        i1 = i0;
26852
0
      }
26853
0
      if (kernel_elements <= 2) {
26854
0
        i2 = i0;
26855
0
      }
26856
0
      if (kernel_elements < 4) {
26857
0
        i3 = i0;
26858
0
      }
26859
0
      if (kernel_elements <= 4) {
26860
0
        i4 = i0;
26861
0
      }
26862
0
      if (kernel_elements < 6) {
26863
0
        i5 = i0;
26864
0
      }
26865
0
      if (kernel_elements <= 6) {
26866
0
        i6 = i0;
26867
0
      }
26868
0
      if (kernel_elements < 8) {
26869
0
        i7 = i0;
26870
0
      }
26871
0
      if (kernel_elements <= 8) {
26872
0
        i8 = i0;
26873
0
      }
26874
26875
0
      size_t c = channels;
26876
0
      do {
26877
0
        const int32_t vi0 = (int32_t) *i0++;
26878
0
        const int32_t vi1 = (int32_t) *i1++;
26879
0
        const int32_t vi2 = (int32_t) *i2++;
26880
0
        const int32_t vi3 = (int32_t) *i3++;
26881
0
        const int32_t vi4 = (int32_t) *i4++;
26882
0
        const int32_t vi5 = (int32_t) *i5++;
26883
0
        const int32_t vi6 = (int32_t) *i6++;
26884
0
        const int32_t vi7 = (int32_t) *i7++;
26885
0
        const int32_t vi8 = (int32_t) *i8++;
26886
26887
0
        const int32_t vmax01 = math_max_s32(vi0, vi1);
26888
0
        const int32_t vmax23 = math_max_s32(vi2, vi3);
26889
0
        const int32_t vmax45 = math_max_s32(vi4, vi5);
26890
0
        const int32_t vmax67 = math_max_s32(vi6, vi7);
26891
0
        const int32_t vmax018 = math_max_s32(vmax01, vi8);
26892
26893
0
        const int32_t vmax2345 = math_max_s32(vmax23, vmax45);
26894
0
        const int32_t vmax01678 = math_max_s32(vmax018, vmax67);
26895
26896
0
        int32_t vout = math_max_s32(vmax2345, vmax01678);
26897
0
        vout = math_min_s32(vout, voutput_max);
26898
0
        vout = math_max_s32(vout, voutput_min);
26899
26900
0
        *o++ = (int8_t) vout;
26901
0
      } while (--c != 0);
26902
0
    }
26903
26904
0
    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
26905
0
      const int8_t* i0 = *input++;
26906
0
      const int8_t* i1 = *input++;
26907
0
      const int8_t* i2 = *input++;
26908
0
      const int8_t* i3 = *input++;
26909
0
      const int8_t* i4 = *input++;
26910
0
      const int8_t* i5 = *input++;
26911
0
      const int8_t* i6 = *input++;
26912
0
      const int8_t* i7 = *input++;
26913
0
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
26914
0
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
26915
0
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
26916
0
      i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
26917
0
      i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
26918
0
      i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
26919
0
      i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
26920
0
      i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
26921
0
      if (k < 2) {
26922
0
        i1 = i0;
26923
0
      }
26924
0
      if (k <= 2) {
26925
0
        i2 = i0;
26926
0
      }
26927
0
      if (k < 4) {
26928
0
        i3 = i0;
26929
0
      }
26930
0
      if (k <= 4) {
26931
0
        i4 = i0;
26932
0
      }
26933
0
      if (k < 6) {
26934
0
        i5 = i0;
26935
0
      }
26936
0
      if (k <= 6) {
26937
0
        i6 = i0;
26938
0
      }
26939
0
      if (k < 8) {
26940
0
        i7 = i0;
26941
0
      }
26942
26943
0
      o = output;
26944
0
      size_t c = channels;
26945
0
      do {
26946
0
        const int32_t vi0 = (int32_t) *i0++;
26947
0
        const int32_t vi1 = (int32_t) *i1++;
26948
0
        const int32_t vi2 = (int32_t) *i2++;
26949
0
        const int32_t vi3 = (int32_t) *i3++;
26950
0
        const int32_t vi4 = (int32_t) *i4++;
26951
0
        const int32_t vi5 = (int32_t) *i5++;
26952
0
        const int32_t vi6 = (int32_t) *i6++;
26953
0
        const int32_t vi7 = (int32_t) *i7++;
26954
0
        const int32_t vi8 = (int32_t) *o;
26955
26956
0
        const int32_t vmax01 = math_max_s32(vi0, vi1);
26957
0
        const int32_t vmax23 = math_max_s32(vi2, vi3);
26958
0
        const int32_t vmax45 = math_max_s32(vi4, vi5);
26959
0
        const int32_t vmax67 = math_max_s32(vi6, vi7);
26960
0
        const int32_t vmax018 = math_max_s32(vmax01, vi8);
26961
26962
0
        const int32_t vmax2345 = math_max_s32(vmax23, vmax45);
26963
0
        const int32_t vmax01678 = math_max_s32(vmax018, vmax67);
26964
26965
0
        int32_t vout = math_max_s32(vmax2345, vmax01678);
26966
0
        vout = math_min_s32(vout, voutput_max);
26967
0
        vout = math_max_s32(vout, voutput_min);
26968
26969
0
        *o++ = (int8_t) vout;
26970
0
      } while (--c != 0);
26971
0
    }
26972
0
    input = (const int8_t**) ((uintptr_t) input + input_increment);
26973
0
    output = (int8_t*) ((uintptr_t) o + output_increment);
26974
0
  } while (--output_pixels != 0);
26975
0
}
26976
26977
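// The 9p8x max-pooling kernels above reduce a window of up to 9 inputs in the
// first pass and then fold in up to 8 more per pass, re-reading the partially
// written output row as the ninth operand. The sketch below reproduces that
// reduction order for a single channel to show it equals a plain max over the
// whole window (the pointer duplication used for short windows is omitted).
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static int8_t maxpool_9p8x_ref(const int8_t* window, size_t kernel_elements) {
  const size_t first = kernel_elements < 9 ? kernel_elements : 9;
  int8_t out = window[0];
  for (size_t i = 0; i < first; i++) {                   // first pass: up to 9 inputs
    out = window[i] > out ? window[i] : out;
  }
  for (size_t k = first; k < kernel_elements; k += 8) {  // remaining passes: 8 at a time
    const size_t end = k + 8 < kernel_elements ? k + 8 : kernel_elements;
    int8_t acc = out;                                    // previous output acts as the 9th input
    for (size_t i = k; i < end; i++) {
      acc = window[i] > acc ? window[i] : acc;
    }
    out = acc;
  }
  return out;
}

int main(void) {
  const int8_t w[13] = {3, -7, 12, 5, 0, -1, 9, 9, 2, 4, 11, -6, 7};
  assert(maxpool_9p8x_ref(w, 13) == 12);
  return 0;
}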
void xnn_s8_vclamp_ukernel__scalar_u4(
26978
    size_t batch,
26979
    const int8_t* input,
26980
    int8_t* output,
26981
    const union xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
26982
0
{
26983
0
  assert(batch != 0);
26984
0
  assert(batch % sizeof(int8_t) == 0);
26985
0
  assert(input != NULL);
26986
0
  assert(output != NULL);
26987
26988
0
  const int32_t voutput_max = params->scalar.max;
26989
0
  const int32_t voutput_min = params->scalar.min;
26990
26991
0
  for (; batch >= 4 * sizeof(int8_t); batch -= 4 * sizeof(int8_t)) {
26992
0
    int32_t vt0 = (int32_t) input[0];
26993
0
    int32_t vt1 = (int32_t) input[1];
26994
0
    int32_t vt2 = (int32_t) input[2];
26995
0
    int32_t vt3 = (int32_t) input[3];
26996
0
    input += 4;
26997
26998
0
    vt0 = math_max_s32(vt0, voutput_min);
26999
0
    vt1 = math_max_s32(vt1, voutput_min);
27000
0
    vt2 = math_max_s32(vt2, voutput_min);
27001
0
    vt3 = math_max_s32(vt3, voutput_min);
27002
27003
0
    vt0 = math_min_s32(vt0, voutput_max);
27004
0
    vt1 = math_min_s32(vt1, voutput_max);
27005
0
    vt2 = math_min_s32(vt2, voutput_max);
27006
0
    vt3 = math_min_s32(vt3, voutput_max);
27007
27008
0
    output[0] = (int8_t) vt0;
27009
0
    output[1] = (int8_t) vt1;
27010
0
    output[2] = (int8_t) vt2;
27011
0
    output[3] = (int8_t) vt3;
27012
0
    output += 4;
27013
0
  }
27014
27015
0
  if XNN_UNLIKELY(batch != 0) {
27016
0
    do {
27017
0
      int32_t vt = (int32_t) *input++;
27018
0
      vt = math_max_s32(vt, voutput_min);
27019
0
      vt = math_min_s32(vt, voutput_max);
27020
0
      *output++ = (int8_t) vt;
27021
27022
0
      batch -= sizeof(int8_t);
27023
0
    } while (batch != 0);
27024
0
  }
27025
0
}
27026
27027
void xnn_u8_ibilinear_ukernel__scalar_c1(
27028
    size_t output_pixels,
27029
    size_t channels,
27030
    const uint8_t** restrict input,
27031
    size_t input_offset,
27032
    const int16_t* restrict weights,
27033
    uint8_t* restrict output,
27034
    size_t output_increment)
27035
0
{
27036
0
  assert(output_pixels != 0);
27037
0
  assert(channels != 0);
27038
27039
0
  do {
27040
0
    const uint8_t* i0 = (const uint8_t*) ((uintptr_t) input[0] + input_offset);
27041
0
    const uint8_t* i1 = (const uint8_t*) ((uintptr_t) input[1] + input_offset);
27042
0
    const uint8_t* i2 = (const uint8_t*) ((uintptr_t) input[2] + input_offset);
27043
0
    const uint8_t* i3 = (const uint8_t*) ((uintptr_t) input[3] + input_offset);
27044
0
    input += 4;
27045
27046
0
    const int32_t valphah = (int32_t) (uint32_t) (uint16_t) weights[0];
27047
0
    const int32_t valphav = (int32_t) (uint32_t) (uint16_t) weights[1];
27048
0
    weights += 2;
27049
27050
0
    const int32_t vrounding = INT32_C(0x00200000);
27051
27052
0
    size_t c = channels;
27053
0
    do {
27054
0
      const int32_t vtl = (int32_t) *i0++;
27055
0
      const int32_t vtr = (int32_t) *i1++;
27056
0
      const int32_t vbl = (int32_t) *i2++;
27057
0
      const int32_t vbr = (int32_t) *i3++;
27058
27059
0
      const int32_t vtd = vtr - vtl;
27060
0
      const int32_t vbd = vbr - vbl;
27061
27062
0
      const int32_t vt = (int32_t) ((uint32_t) vtl << 11) + vtd * valphah;
27063
0
      const int32_t vb = (int32_t) ((uint32_t) vbl << 11) + vbd * valphah;
27064
27065
0
      const int32_t vd = vb - vt;
27066
27067
0
      const int32_t vacc = (int32_t) ((uint32_t) vt << 11) + vd * valphav;
27068
27069
0
      const int32_t vo = math_asr_s32(vacc + vrounding, 22);
27070
27071
0
      *output++ = vo;
27072
27073
0
      c -= sizeof(uint8_t);
27074
0
    } while (c != 0);
27075
27076
0
    output = (uint8_t*) ((uintptr_t) output + output_increment);
27077
0
  } while (--output_pixels != 0);
27078
0
}
27079
27080
static inline uint32_t compute_sum(
27081
    size_t n,
27082
    const uint8_t* x,
27083
    const uint32_t* t)
27084
0
{
27085
0
  assert(n != 0);
27086
27087
0
  uint32_t vsum = 0;
27088
0
  do {
27089
0
    const size_t vx = *x++;
27090
0
    vsum += t[vx];
27091
0
  } while (--n != 0);
27092
0
  return vsum;
27093
0
}
27094
27095
void xnn_u8_lut32norm_ukernel__scalar(
27096
    size_t n,
27097
    const uint8_t* x,
27098
    const uint32_t* t,
27099
    uint8_t* y)
27100
0
{
27101
0
  assert(n != 0);
27102
27103
0
  const uint32_t vsum = compute_sum(n, x, t);
27104
0
  assert(vsum != 0);
27105
27106
0
  struct fxdiv_divisor_uint32_t vsum_divisor = fxdiv_init_uint32_t(vsum);
27107
0
  const uint32_t vrounding = (vsum >> 1);
27108
0
  do {
27109
0
    const size_t vx = *x++;
27110
0
    const uint32_t vt = t[vx];
27111
0
    const uint32_t vq = fxdiv_quotient_uint32_t((vt << 8) + vrounding, vsum_divisor);
27112
0
    const uint8_t vy = vq > 255 ? UINT8_C(255) : (uint8_t) vq;
27113
0
    *y++ = vy;
27114
0
  } while (--n != 0);
27115
0
}
27116
27117
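// xnn_u8_lut32norm_ukernel__scalar normalizes looked-up 32-bit weights to the
// 0..255 range: y[i] = min(255, round(t[x[i]] * 256 / sum_j t[x[j]])). The
// fxdiv divisor only speeds up repeated division by the same runtime sum; the
// sketch below uses plain integer division to produce the same result.
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static void lut32norm_ref(size_t n, const uint8_t* x, const uint32_t* t, uint8_t* y) {
  uint32_t sum = 0;
  for (size_t i = 0; i < n; i++) {
    sum += t[x[i]];
  }
  const uint32_t rounding = sum >> 1;          // gives round-to-nearest division
  for (size_t i = 0; i < n; i++) {
    const uint32_t q = ((t[x[i]] << 8) + rounding) / sum;
    y[i] = q > 255 ? 255 : (uint8_t) q;
  }
}

int main(void) {
  uint32_t t[256] = {0};
  t[1] = 1;
  t[2] = 3;
  const uint8_t x[2] = {1, 2};
  uint8_t y[2];
  lut32norm_ref(2, x, t, y);
  assert(y[0] == 64 && y[1] == 192);           // 1/4 and 3/4 of 256
  return 0;
}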
void xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1(
27118
    size_t output_pixels,
27119
    size_t kernel_elements,
27120
    size_t channels,
27121
    const uint8_t** input,
27122
    size_t input_offset,
27123
    uint8_t* output,
27124
    size_t input_increment,
27125
    size_t output_increment,
27126
    const union xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
27127
0
{
27128
0
  assert(output_pixels != 0);
27129
0
  assert(kernel_elements != 0);
27130
0
  assert(channels != 0);
27131
27132
0
  const uint32_t voutput_min = params->scalar.min;
27133
0
  const uint32_t voutput_max = params->scalar.max;
27134
0
  do {
27135
0
    uint8_t* o = output;
27136
0
    {
27137
0
      const uint8_t* i0 = *input++;
27138
0
      const uint8_t* i1 = *input++;
27139
0
      const uint8_t* i2 = *input++;
27140
0
      const uint8_t* i3 = *input++;
27141
0
      const uint8_t* i4 = *input++;
27142
0
      const uint8_t* i5 = *input++;
27143
0
      const uint8_t* i6 = *input++;
27144
0
      const uint8_t* i7 = *input++;
27145
0
      const uint8_t* i8 = *input++;
27146
0
      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
27147
0
      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
27148
0
      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
27149
0
      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
27150
0
      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
27151
0
      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
27152
0
      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
27153
0
      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
27154
0
      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
27155
0
      if (kernel_elements < 2) {
27156
0
        i1 = i0;
27157
0
      }
27158
0
      if (kernel_elements <= 2) {
27159
0
        i2 = i0;
27160
0
      }
27161
0
      if (kernel_elements < 4) {
27162
0
        i3 = i0;
27163
0
      }
27164
0
      if (kernel_elements <= 4) {
27165
0
        i4 = i0;
27166
0
      }
27167
0
      if (kernel_elements < 6) {
27168
0
        i5 = i0;
27169
0
      }
27170
0
      if (kernel_elements <= 6) {
27171
0
        i6 = i0;
27172
0
      }
27173
0
      if (kernel_elements < 8) {
27174
0
        i7 = i0;
27175
0
      }
27176
0
      if (kernel_elements <= 8) {
27177
0
        i8 = i0;
27178
0
      }
27179
27180
0
      size_t c = channels;
27181
0
      do {
27182
0
        const uint32_t vi0 = (uint32_t) *i0++;
27183
0
        const uint32_t vi1 = (uint32_t) *i1++;
27184
0
        const uint32_t vi2 = (uint32_t) *i2++;
27185
0
        const uint32_t vi3 = (uint32_t) *i3++;
27186
0
        const uint32_t vi4 = (uint32_t) *i4++;
27187
0
        const uint32_t vi5 = (uint32_t) *i5++;
27188
0
        const uint32_t vi6 = (uint32_t) *i6++;
27189
0
        const uint32_t vi7 = (uint32_t) *i7++;
27190
0
        const uint32_t vi8 = (uint32_t) *i8++;
27191
27192
0
        const uint32_t vmax01 = math_max_u32(vi0, vi1);
27193
0
        const uint32_t vmax23 = math_max_u32(vi2, vi3);
27194
0
        const uint32_t vmax45 = math_max_u32(vi4, vi5);
27195
0
        const uint32_t vmax67 = math_max_u32(vi6, vi7);
27196
0
        const uint32_t vmax018 = math_max_u32(vmax01, vi8);
27197
27198
0
        const uint32_t vmax2345 = math_max_u32(vmax23, vmax45);
27199
0
        const uint32_t vmax01678 = math_max_u32(vmax018, vmax67);
27200
27201
0
        uint32_t vout = math_max_u32(vmax2345, vmax01678);
27202
0
        vout = math_max_u32(vout, voutput_min);
27203
0
        vout = math_min_u32(vout, voutput_max);
27204
27205
0
        *o++ = vout;
27206
0
      } while (--c != 0);
27207
0
    }
27208
27209
0
    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
27210
0
      const uint8_t* i0 = *input++;
27211
0
      const uint8_t* i1 = *input++;
27212
0
      const uint8_t* i2 = *input++;
27213
0
      const uint8_t* i3 = *input++;
27214
0
      const uint8_t* i4 = *input++;
27215
0
      const uint8_t* i5 = *input++;
27216
0
      const uint8_t* i6 = *input++;
27217
0
      const uint8_t* i7 = *input++;
27218
0
      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
27219
0
      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
27220
0
      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
27221
0
      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
27222
0
      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
27223
0
      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
27224
0
      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
27225
0
      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
27226
0
      if (k < 2) {
27227
0
        i1 = i0;
27228
0
      }
27229
0
      if (k <= 2) {
27230
0
        i2 = i0;
27231
0
      }
27232
0
      if (k < 4) {
27233
0
        i3 = i0;
27234
0
      }
27235
0
      if (k <= 4) {
27236
0
        i4 = i0;
27237
0
      }
27238
0
      if (k < 6) {
27239
0
        i5 = i0;
27240
0
      }
27241
0
      if (k <= 6) {
27242
0
        i6 = i0;
27243
0
      }
27244
0
      if (k < 8) {
27245
0
        i7 = i0;
27246
0
      }
27247
27248
0
      o = output;
27249
0
      size_t c = channels;
27250
0
      do {
27251
0
        const uint32_t vi0 = (uint32_t) *i0++;
27252
0
        const uint32_t vi1 = (uint32_t) *i1++;
27253
0
        const uint32_t vi2 = (uint32_t) *i2++;
27254
0
        const uint32_t vi3 = (uint32_t) *i3++;
27255
0
        const uint32_t vi4 = (uint32_t) *i4++;
27256
0
        const uint32_t vi5 = (uint32_t) *i5++;
27257
0
        const uint32_t vi6 = (uint32_t) *i6++;
27258
0
        const uint32_t vi7 = (uint32_t) *i7++;
27259
0
        const uint32_t vi8 = (uint32_t) *o;
27260
27261
0
        const uint32_t vmax01 = math_max_u32(vi0, vi1);
27262
0
        const uint32_t vmax23 = math_max_u32(vi2, vi3);
27263
0
        const uint32_t vmax45 = math_max_u32(vi4, vi5);
27264
0
        const uint32_t vmax67 = math_max_u32(vi6, vi7);
27265
0
        const uint32_t vmax018 = math_max_u32(vmax01, vi8);
27266
27267
0
        const uint32_t vmax2345 = math_max_u32(vmax23, vmax45);
27268
0
        const uint32_t vmax01678 = math_max_u32(vmax018, vmax67);
27269
27270
0
        uint32_t vout = math_max_u32(vmax2345, vmax01678);
27271
0
        vout = math_max_u32(vout, voutput_min);
27272
0
        vout = math_min_u32(vout, voutput_max);
27273
27274
0
        *o++ = vout;
27275
0
      } while (--c != 0);
27276
0
    }
27277
0
    input = (const uint8_t**) ((uintptr_t) input + input_increment);
27278
0
    output = (uint8_t*) ((uintptr_t) o + output_increment);
27279
0
  } while (--output_pixels != 0);
27280
0
}
27281
27282
void xnn_u8_rmax_ukernel__scalar(
27283
    size_t batch,
27284
    const uint8_t* input,
27285
    uint8_t* output,
27286
    const void* params)
27287
0
{
27288
0
  assert(batch != 0);
27289
0
  assert(batch % sizeof(uint8_t) == 0);
27290
0
  assert(input != NULL);
27291
0
  assert(output != NULL);
27292
27293
0
  uint8_t vmax0 = 0;
27294
0
  uint8_t vmax1 = 0;
27295
0
  for (; batch >= 2 * sizeof(uint8_t); batch -= 2 * sizeof(uint8_t)) {
27296
0
    const uint8_t vt0 = input[0];
27297
0
    const uint8_t vt1 = input[1];
27298
0
    input += 2;
27299
27300
0
    vmax0 = vt0 > vmax0 ? vt0 : vmax0;
27301
0
    vmax1 = vt1 > vmax1 ? vt1 : vmax1;
27302
0
  }
27303
0
  uint8_t vmax = vmax0 > vmax1 ? vmax0 : vmax1;
27304
0
  if (batch != 0) {
27305
0
    const uint8_t vt = *input++;
27306
0
    vmax = vt > vmax ? vt : vmax;
27307
0
  }
27308
0
  *output = vmax;
27309
0
}
27310
27311
void xnn_u8_vclamp_ukernel__scalar_u4(
27312
    size_t batch,
27313
    const uint8_t* input,
27314
    uint8_t* output,
27315
    const union xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
27316
0
{
27317
0
  assert(batch != 0);
27318
0
  assert(batch % sizeof(uint8_t) == 0);
27319
0
  assert(input != NULL);
27320
0
  assert(output != NULL);
27321
27322
0
  const uint32_t voutput_max = params->scalar.max;
27323
0
  const uint32_t voutput_min = params->scalar.min;
27324
27325
0
  for (; batch >= 4 * sizeof(uint8_t); batch -= 4 * sizeof(uint8_t)) {
27326
0
    uint32_t vt0 = (uint32_t) input[0];
27327
0
    uint32_t vt1 = (uint32_t) input[1];
27328
0
    uint32_t vt2 = (uint32_t) input[2];
27329
0
    uint32_t vt3 = (uint32_t) input[3];
27330
0
    input += 4;
27331
27332
0
    vt0 = math_max_u32(vt0, voutput_min);
27333
0
    vt1 = math_max_u32(vt1, voutput_min);
27334
0
    vt2 = math_max_u32(vt2, voutput_min);
27335
0
    vt3 = math_max_u32(vt3, voutput_min);
27336
27337
0
    vt0 = math_min_u32(vt0, voutput_max);
27338
0
    vt1 = math_min_u32(vt1, voutput_max);
27339
0
    vt2 = math_min_u32(vt2, voutput_max);
27340
0
    vt3 = math_min_u32(vt3, voutput_max);
27341
27342
0
    output[0] = (uint8_t) vt0;
27343
0
    output[1] = (uint8_t) vt1;
27344
0
    output[2] = (uint8_t) vt2;
27345
0
    output[3] = (uint8_t) vt3;
27346
0
    output += 4;
27347
0
  }
27348
27349
0
  if XNN_UNLIKELY(batch != 0) {
27350
0
    do {
27351
0
      uint32_t vt = (uint32_t) *input++;
27352
0
      vt = math_max_u32(vt, voutput_min);
27353
0
      vt = math_min_u32(vt, voutput_max);
27354
0
      *output++ = (uint8_t) vt;
27355
27356
0
      batch -= sizeof(uint8_t);
27357
0
    } while (batch != 0);
27358
0
  }
27359
0
}
27360
27361
void xnn_x16_transposec_ukernel__2x4_scalar_int(
27362
    const uint16_t *input,
27363
    uint16_t * output,
27364
    size_t input_stride,
27365
    size_t output_stride,
27366
    size_t block_width,
27367
    size_t block_height,
27368
    const union xnn_x16_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
27369
0
{
27370
0
  assert(output_stride >= block_height * sizeof(int16_t));
27371
0
  assert(input_stride >= block_width * sizeof(int16_t));
27372
27373
0
  const size_t tile_height = 2;
27374
0
  const size_t tile_width = 4;
27375
0
  const size_t tile_wbytes = tile_width * sizeof(int16_t);
27376
0
  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
27377
0
  const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(int16_t);
27378
0
  const size_t input_offset = tile_height * input_stride;
27379
27380
0
  const int16_t* i0 = (const int16_t*) input;
27381
0
  const int16_t* i1 = (const int16_t*) ((uintptr_t) i0 + input_stride);
27382
27383
0
  int16_t* o0 = (int16_t*) output;
27384
0
  int16_t* o1 = (int16_t*) ((uintptr_t) o0 + output_stride);
27385
0
  int16_t* o2 = (int16_t*) ((uintptr_t) o1 + output_stride);
27386
0
  int16_t* o3 = (int16_t*) ((uintptr_t) o2 + output_stride);
27387
27388
0
  do {
27389
0
    if XNN_UNPREDICTABLE(block_width < 2) {
27390
0
      o1 = o0;
27391
0
    }
27392
0
    if XNN_UNPREDICTABLE(block_width <= 2) {
27393
0
      o2 = o0;
27394
0
    }
27395
0
    if XNN_UNPREDICTABLE(block_width < 4) {
27396
0
      o3 = o0;
27397
0
    }
27398
0
    size_t bh = block_height;
27399
0
    for (; bh >= 2; bh -= 2) {
27400
0
      *o3++ = i0[3];
27401
0
      *o3++ = i1[3];
27402
0
      *o2++ = i0[2];
27403
0
      *o2++ = i1[2];
27404
0
      *o1++ = i0[1];
27405
0
      *o1++ = i1[1];
27406
0
      *o0++ = i0[0];
27407
0
      *o0++ = i1[0];
27408
0
      i0 = (const int16_t*) ((uintptr_t) i0 + input_offset);
27409
0
      i1 = (const int16_t*) ((uintptr_t) i1 + input_offset);
27410
0
    }
27411
0
    if (bh & 1) {
27412
0
      o3[0] = i0[3];
27413
0
      o2[0] = i0[2];
27414
0
      o1[0] = i0[1];
27415
0
      o0[0] = i0[0];
27416
0
    }
27417
27418
0
    i0 = (const int16_t*) ((uintptr_t) i0 + input_reset);
27419
0
    i1 = (const int16_t*) ((uintptr_t) i0 + input_stride);
27420
0
    o0 = (int16_t*) ((uintptr_t) o0 + output_reset);
27421
0
    o1 = (int16_t*) ((uintptr_t) o1 + output_reset);
27422
0
    o2 = (int16_t*) ((uintptr_t) o2 + output_reset);
27423
0
    o3 = (int16_t*) ((uintptr_t) o3 + output_reset);
27424
0
    block_width = doz(block_width, tile_width);
27425
0
  } while (block_width != 0);
27426
0
}
27427
27428
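// The transpose micro-kernels above work on fixed tiles (2x4 here, 4x2 for
// x64) with raw byte strides; the net effect is output[j][i] = input[i][j]
// for i < block_height, j < block_width. A plain reference with the same
// byte-stride convention, for comparison against the tiled pointer walk:
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static void transpose_ref_x16(const uint16_t* input, uint16_t* output,
                              size_t input_stride, size_t output_stride,
                              size_t block_width, size_t block_height) {
  // Strides are in bytes, exactly like the micro-kernel arguments.
  for (size_t i = 0; i < block_height; i++) {
    const uint16_t* row = (const uint16_t*) ((uintptr_t) input + i * input_stride);
    for (size_t j = 0; j < block_width; j++) {
      uint16_t* col = (uint16_t*) ((uintptr_t) output + j * output_stride);
      col[i] = row[j];
    }
  }
}

int main(void) {
  const uint16_t in[2][4] = {{1, 2, 3, 4}, {5, 6, 7, 8}};
  uint16_t out[4][2];
  transpose_ref_x16(&in[0][0], &out[0][0], sizeof(in[0]), sizeof(out[0]), 4, 2);
  assert(out[2][0] == 3 && out[2][1] == 7);    // column 2 of the input becomes row 2
  return 0;
}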
void xnn_x24_transposec_ukernel__1x2_scalar(
27429
    const void *input,
27430
    void * output,
27431
    size_t input_stride,
27432
    size_t output_stride,
27433
    size_t block_width,
27434
    size_t block_height,
27435
    const union xnn_x24_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
27436
0
{
27437
0
  assert(output_stride >= block_height * 3);
27438
0
  assert(input_stride >= block_width * 3);
27439
27440
0
  const size_t input_reset = 6 - block_height * input_stride;
27441
0
  const size_t output_reset = 2 * output_stride - block_height * 3;
27442
0
  const size_t input_offset = 1 * input_stride;
27443
27444
0
  const uint8_t* i0 = (const uint8_t*) input;
27445
27446
0
  uint8_t* o0 = (uint8_t*) output;
27447
0
  uint8_t* o1 = (uint8_t*) ((uintptr_t) o0 + output_stride);
27448
27449
0
  do {
27450
0
    if XNN_UNPREDICTABLE(block_width < 2) {
27451
0
      o1 = o0;
27452
0
    }
27453
0
    size_t bh = block_height;
27454
0
    for (; bh >= 1; bh -= 1) {
27455
0
      o1[0] = i0[3];
27456
0
      o1[1] = i0[4];
27457
0
      o1[2] = i0[5];
27458
0
      o1 += 3;
27459
0
      o0[0] = i0[0];
27460
0
      o0[1] = i0[1];
27461
0
      o0[2] = i0[2];
27462
0
      o0 += 3;
27463
0
      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
27464
0
    }
27465
27466
0
    i0 = (const uint8_t*) ((uintptr_t) i0 + input_reset);
27467
0
    o0 = (uint8_t*) ((uintptr_t) o0 + output_reset);
27468
0
    o1 = (uint8_t*) ((uintptr_t) o1 + output_reset);
27469
0
    block_width = doz(block_width, 2);
27470
0
  } while (block_width != 0);
27471
0
}
27472
27473
void xnn_x32_packw_gemm_goi_ukernel_x2__scalar_float_u4(
27474
  size_t g,
27475
  size_t nc,
27476
  size_t kc,
27477
  size_t nr,
27478
  size_t kr,
27479
  size_t sr,
27480
  const uint32_t* weights,
27481
  const uint32_t* bias,
27482
  const void* scale,
27483
  uint32_t* packed_weights,
27484
  size_t extra_bytes,
27485
  const void* params)
27486
0
{
27487
0
  assert(g != 0);
27488
0
  assert(nc != 0);
27489
0
  assert(kc != 0);
27490
0
  assert(nr == 2);
27491
0
  assert(kr == 1);
27492
0
  assert(sr == 1);
27493
0
  assert(weights != NULL);
27494
0
  assert(packed_weights != NULL);
27495
27496
0
  float* out = (float*) packed_weights;
27497
0
  const float* b = (const float*) bias;
27498
27499
0
  do {
27500
    // NC main loop multiple of 2
27501
0
    const float* w0 = (const float*) weights;
27502
0
    size_t n = nc;
27503
0
    for (;n >= 2; n -= 2) {
27504
0
      if XNN_LIKELY(b != NULL) {
27505
0
        out[0] = b[0];
27506
0
        out[1] = b[1];
27507
0
        b += 2;
27508
0
      } else {
27509
0
        out[0] = 0;
27510
0
        out[1] = 0;
27511
0
      }
27512
0
      out += 2;
27513
27514
0
      const float* w1 = w0 + kc;
27515
27516
      // KC main loop multiple of 2x4
27517
0
      size_t k = kc;
27518
0
      for (; k >= 4; k -= 4) {
27519
0
        const float v00 = w0[0];
27520
0
        const float v01 = w0[1];
27521
0
        const float v02 = w0[2];
27522
0
        const float v03 = w0[3];
27523
0
        w0 += 4;
27524
0
        const float v10 = w1[0];
27525
0
        const float v11 = w1[1];
27526
0
        const float v12 = w1[2];
27527
0
        const float v13 = w1[3];
27528
0
        w1 += 4;
27529
0
        out[0] = v00;
27530
0
        out[1] = v10;
27531
0
        out[2] = v01;
27532
0
        out[3] = v11;
27533
0
        out[4] = v02;
27534
0
        out[5] = v12;
27535
0
        out[6] = v03;
27536
0
        out[7] = v13;
27537
0
        out += 8;
27538
0
      }
27539
27540
      // KC remainder
27541
0
      for (; k != 0; --k) {
27542
0
        const float v0 = *w0++;
27543
0
        out[0] = v0;
27544
0
        const float v1 = *w1++;
27545
0
        out[1] = v1;
27546
0
        out += 2;
27547
0
      }
27548
0
      out = (float*) ((uintptr_t) out + extra_bytes);
27549
0
      w0 = w1;
27550
0
    }
27551
27552
    // NC remainder (1..1)
27553
0
    if XNN_UNLIKELY(n != 0) {
27554
0
      if XNN_LIKELY(b != NULL) {
27555
0
        size_t nb = n;
27556
0
        do {
27557
0
          *out++ = *b++;
27558
0
        } while (--nb != 0);
27559
0
      } else {
27560
0
        size_t nb = n;
27561
0
        do {
27562
0
          *out++ = 0;
27563
0
        } while (--nb != 0);
27564
0
      }
27565
0
      out += (2 - n);
27566
27567
27568
      // KC main loop multiple of 2x4
27569
0
      size_t k = kc;
27570
0
      for (; k >= 4; k -= 4) {
27571
0
        const float v00 = w0[0];
27572
0
        const float v01 = w0[1];
27573
0
        const float v02 = w0[2];
27574
0
        const float v03 = w0[3];
27575
0
        w0 += 4;
27576
0
        out[0] = v00;
27577
0
        out[2] = v01;
27578
0
        out[4] = v02;
27579
0
        out[6] = v03;
27580
0
        out += 8;
27581
0
      }
27582
27583
      // KC remainder of 1..3
27584
0
      for (; k != 0; --k) {
27585
0
        const float v0 = *w0++;
27586
0
        out[0] = v0;
27587
0
        out += 2;
27588
0
      }
27589
0
      out = (float*) ((uintptr_t) out + extra_bytes);
27590
0
    }
27591
0
    weights += nc * kc;
27592
0
  } while (--g != 0);
27593
0
}
27594
27595
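// The packw GOI kernels above rearrange row-major weights of shape [nc][kc]
// into blocks of { nr biases, then kc interleaved columns of nr weights },
// one block per group of nr output channels. A reference packer for nr == 2
// matching the x2 kernel's layout, without the 4x unrolling; unlike the
// kernel it zero-fills the padding lanes of the last block (the kernel leaves
// them unwritten) and does not model `extra_bytes` or the group loop.
#include <assert.h>
#include <stddef.h>

static void packw_goi_x2_ref(size_t nc, size_t kc, const float* weights,
                             const float* bias, float* out) {
  for (size_t n = 0; n < nc; n += 2) {
    const size_t valid = (nc - n) < 2 ? (nc - n) : 2;
    for (size_t i = 0; i < 2; i++) {            // nr biases, zero if absent
      *out++ = (i < valid && bias != NULL) ? bias[n + i] : 0.0f;
    }
    for (size_t k = 0; k < kc; k++) {           // kc interleaved pairs of weights
      for (size_t i = 0; i < 2; i++) {
        *out++ = i < valid ? weights[(n + i) * kc + k] : 0.0f;
      }
    }
  }
}

int main(void) {
  const float w[3 * 2] = {1, 2, 3, 4, 5, 6};    // nc = 3, kc = 2, row-major
  const float b[3] = {10, 20, 30};
  float packed[2 * (2 + 2 * 2)];
  packw_goi_x2_ref(3, 2, w, b, packed);
  // First block: biases 10, 20, then k-interleaved rows 0 and 1: 1, 3, 2, 4.
  assert(packed[0] == 10 && packed[2] == 1 && packed[3] == 3 && packed[5] == 4);
  return 0;
}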
void xnn_x32_packw_gemm_goi_ukernel_x4__scalar_float_u4(
27596
  size_t g,
27597
  size_t nc,
27598
  size_t kc,
27599
  size_t nr,
27600
  size_t kr,
27601
  size_t sr,
27602
  const uint32_t* weights,
27603
  const uint32_t* bias,
27604
  const void* scale,
27605
  uint32_t* packed_weights,
27606
  size_t extra_bytes,
27607
  const void* params)
27608
0
{
27609
0
  assert(g != 0);
27610
0
  assert(nc != 0);
27611
0
  assert(kc != 0);
27612
0
  assert(nr == 4);
27613
0
  assert(kr == 1);
27614
0
  assert(sr == 1);
27615
0
  assert(weights != NULL);
27616
0
  assert(packed_weights != NULL);
27617
27618
0
  float* out = (float*) packed_weights;
27619
0
  const float* b = (const float*) bias;
27620
27621
0
  do {
27622
    // NC main loop multiple of 4
27623
0
    const float* w0 = (const float*) weights;
27624
0
    size_t n = nc;
27625
0
    for (;n >= 4; n -= 4) {
27626
0
      if XNN_LIKELY(b != NULL) {
27627
0
        out[0] = b[0];
27628
0
        out[1] = b[1];
27629
0
        out[2] = b[2];
27630
0
        out[3] = b[3];
27631
0
        b += 4;
27632
0
      } else {
27633
0
        out[0] = 0;
27634
0
        out[1] = 0;
27635
0
        out[2] = 0;
27636
0
        out[3] = 0;
27637
0
      }
27638
0
      out += 4;
27639
27640
0
      const float* w1 = w0 + kc;
27641
0
      const float* w2 = w1 + kc;
27642
0
      const float* w3 = w2 + kc;
27643
27644
      // KC main loop multiple of 4x4
27645
0
      size_t k = kc;
27646
0
      for (; k >= 4; k -= 4) {
27647
0
        const float v00 = w0[0];
27648
0
        const float v01 = w0[1];
27649
0
        const float v02 = w0[2];
27650
0
        const float v03 = w0[3];
27651
0
        w0 += 4;
27652
0
        const float v10 = w1[0];
27653
0
        const float v11 = w1[1];
27654
0
        const float v12 = w1[2];
27655
0
        const float v13 = w1[3];
27656
0
        w1 += 4;
27657
0
        const float v20 = w2[0];
27658
0
        const float v21 = w2[1];
27659
0
        const float v22 = w2[2];
27660
0
        const float v23 = w2[3];
27661
0
        w2 += 4;
27662
0
        const float v30 = w3[0];
27663
0
        const float v31 = w3[1];
27664
0
        const float v32 = w3[2];
27665
0
        const float v33 = w3[3];
27666
0
        w3 += 4;
27667
0
        out[0] = v00;
27668
0
        out[1] = v10;
27669
0
        out[2] = v20;
27670
0
        out[3] = v30;
27671
0
        out[4] = v01;
27672
0
        out[5] = v11;
27673
0
        out[6] = v21;
27674
0
        out[7] = v31;
27675
0
        out[8] = v02;
27676
0
        out[9] = v12;
27677
0
        out[10] = v22;
27678
0
        out[11] = v32;
27679
0
        out[12] = v03;
27680
0
        out[13] = v13;
27681
0
        out[14] = v23;
27682
0
        out[15] = v33;
27683
0
        out += 16;
27684
0
      }
27685
27686
      // KC remainder
27687
0
      for (; k != 0; --k) {
27688
0
        const float v0 = *w0++;
27689
0
        out[0] = v0;
27690
0
        const float v1 = *w1++;
27691
0
        out[1] = v1;
27692
0
        const float v2 = *w2++;
27693
0
        out[2] = v2;
27694
0
        const float v3 = *w3++;
27695
0
        out[3] = v3;
27696
0
        out += 4;
27697
0
      }
27698
0
      out = (float*) ((uintptr_t) out + extra_bytes);
27699
0
      w0 = w3;
27700
0
    }
27701
27702
    // NC remainder (1..3)
27703
0
    if XNN_UNLIKELY(n != 0) {
27704
0
      if XNN_LIKELY(b != NULL) {
27705
0
        size_t nb = n;
27706
0
        do {
27707
0
          *out++ = *b++;
27708
0
        } while (--nb != 0);
27709
0
      } else {
27710
0
        size_t nb = n;
27711
0
        do {
27712
0
          *out++ = 0;
27713
0
        } while (--nb != 0);
27714
0
      }
27715
0
      out += (4 - n);
27716
27717
      // NR remainder has less than 4 rows so last row is not loaded
27718
0
      const float* w1 = w0 + kc;
27719
0
      if XNN_UNPREDICTABLE(n < 2) {
27720
0
        w1 = w0;
27721
0
      }
27722
0
      const float* w2 = w1 + kc;
27723
0
      if XNN_UNPREDICTABLE(n <= 2) {
27724
0
        w2 = w1;
27725
0
      }
27726
27727
      // KC main loop multiple of 4x4
27728
0
      size_t k = kc;
27729
0
      for (; k >= 4; k -= 4) {
27730
0
        const float v00 = w0[0];
27731
0
        const float v01 = w0[1];
27732
0
        const float v02 = w0[2];
27733
0
        const float v03 = w0[3];
27734
0
        w0 += 4;
27735
0
        const float v10 = w1[0];
27736
0
        const float v11 = w1[1];
27737
0
        const float v12 = w1[2];
27738
0
        const float v13 = w1[3];
27739
0
        w1 += 4;
27740
0
        const float v20 = w2[0];
27741
0
        const float v21 = w2[1];
27742
0
        const float v22 = w2[2];
27743
0
        const float v23 = w2[3];
27744
0
        w2 += 4;
27745
0
        out[0] = v00;
27746
0
        out[1] = v10;
27747
0
        out[2] = v20;
27748
0
        out[4] = v01;
27749
0
        out[5] = v11;
27750
0
        out[6] = v21;
27751
0
        out[8] = v02;
27752
0
        out[9] = v12;
27753
0
        out[10] = v22;
27754
0
        out[12] = v03;
27755
0
        out[13] = v13;
27756
0
        out[14] = v23;
27757
0
        out += 16;
27758
0
      }
27759
27760
      // KC remainder of 1..3
27761
0
      for (; k != 0; --k) {
27762
0
        const float v0 = *w0++;
27763
0
        out[0] = v0;
27764
0
        const float v1 = *w1++;
27765
0
        out[1] = v1;
27766
0
        const float v2 = *w2++;
27767
0
        out[2] = v2;
27768
0
        out += 4;
27769
0
      }
27770
0
      out = (float*) ((uintptr_t) out + extra_bytes);
27771
0
    }
27772
0
    weights += nc * kc;
27773
0
  } while (--g != 0);
27774
0
}
27775
27776
void xnn_x32_transposec_ukernel__2x4_scalar_int(
27777
    const uint32_t *input,
27778
    uint32_t * output,
27779
    size_t input_stride,
27780
    size_t output_stride,
27781
    size_t block_width,
27782
    size_t block_height,
27783
    const union xnn_x32_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
27784
0
{
27785
0
  assert(output_stride >= block_height * sizeof(int));
27786
0
  assert(input_stride >= block_width * sizeof(int));
27787
27788
0
  const size_t tile_height = 2;
27789
0
  const size_t tile_width = 4;
27790
0
  const size_t tile_wbytes = tile_width * sizeof(int);
27791
0
  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
27792
0
  const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(int);
27793
0
  const size_t input_offset = tile_height * input_stride;
27794
27795
0
  const int* i0 = (const int*) input;
27796
0
  const int* i1 = (const int*) ((uintptr_t) i0 + input_stride);
27797
27798
0
  int* o0 = (int*) output;
27799
0
  int* o1 = (int*) ((uintptr_t) o0 + output_stride);
27800
0
  int* o2 = (int*) ((uintptr_t) o1 + output_stride);
27801
0
  int* o3 = (int*) ((uintptr_t) o2 + output_stride);
27802
27803
0
  do {
27804
0
    if XNN_UNPREDICTABLE(block_width < 2) {
27805
0
      o1 = o0;
27806
0
    }
27807
0
    if XNN_UNPREDICTABLE(block_width <= 2) {
27808
0
      o2 = o0;
27809
0
    }
27810
0
    if XNN_UNPREDICTABLE(block_width < 4) {
27811
0
      o3 = o0;
27812
0
    }
27813
0
    size_t bh = block_height;
27814
0
    for (; bh >= 2; bh -= 2) {
27815
0
      *o3++ = i0[3];
27816
0
      *o3++ = i1[3];
27817
0
      *o2++ = i0[2];
27818
0
      *o2++ = i1[2];
27819
0
      *o1++ = i0[1];
27820
0
      *o1++ = i1[1];
27821
0
      *o0++ = i0[0];
27822
0
      *o0++ = i1[0];
27823
0
      i0 = (const int*) ((uintptr_t) i0 + input_offset);
27824
0
      i1 = (const int*) ((uintptr_t) i1 + input_offset);
27825
0
    }
27826
0
    if (bh & 1) {
27827
0
      o3[0] = i0[3];
27828
0
      o2[0] = i0[2];
27829
0
      o1[0] = i0[1];
27830
0
      o0[0] = i0[0];
27831
0
    }
27832
27833
0
    i0 = (const int*) ((uintptr_t) i0 + input_reset);
27834
0
    i1 = (const int*) ((uintptr_t) i0 + input_stride);
27835
0
    o0 = (int*) ((uintptr_t) o0 + output_reset);
27836
0
    o1 = (int*) ((uintptr_t) o1 + output_reset);
27837
0
    o2 = (int*) ((uintptr_t) o2 + output_reset);
27838
0
    o3 = (int*) ((uintptr_t) o3 + output_reset);
27839
0
    block_width = doz(block_width, tile_width);
27840
0
  } while (block_width != 0);
27841
0
}
27842
27843
void xnn_x32_unpool_ukernel__scalar(
27844
    size_t kernel_elements,
27845
    size_t channels,
27846
    uint32_t fill,
27847
    const uint32_t* input,
27848
    const uint32_t* index,
27849
    uint32_t** output)
27850
0
{
27851
  // Pre-initialize outputs with constant.
27852
0
  uint32_t** os = output;
27853
0
  do {
27854
0
    uint32_t* o = *os++;
27855
0
    size_t c = channels;
27856
0
    do {
27857
0
      *o++ = fill;
27858
0
    } while (--c != 0);
27859
0
  } while (--kernel_elements != 0);
27860
27861
  // Copy indexed elements to output.
27862
0
  size_t offset = 0;
27863
0
  do {
27864
0
    const uint32_t i = *index++;
27865
0
    *((uint32_t*) ((uintptr_t) output[i] + offset)) = *input++;
27866
0
    offset += sizeof(uint32_t);
27867
0
  } while (--channels != 0);
27868
0
}
27869
27870
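// xnn_x32_unpool_ukernel__scalar inverts max pooling for one output pixel: it
// first fills every pooling position with `fill`, then scatters the c-th input
// value into output[index[c]] at channel offset c. A usage-style sketch with a
// hypothetical 2-position pooling window and 3 channels, mirroring the two
// steps without calling the kernel itself:
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

int main(void) {
  uint32_t win0[3], win1[3];                   // two pooling positions, 3 channels
  uint32_t* outputs[2] = {win0, win1};
  const uint32_t values[3] = {11, 22, 33};     // pooled maxima per channel
  const uint32_t indices[3] = {1, 0, 1};       // which position each maximum came from

  // Step 1: pre-initialize every output position with the fill constant.
  for (size_t k = 0; k < 2; k++) {
    for (size_t c = 0; c < 3; c++) {
      outputs[k][c] = 0;
    }
  }
  // Step 2: indexed scatter of the pooled values.
  for (size_t c = 0; c < 3; c++) {
    outputs[indices[c]][c] = values[c];
  }

  assert(win0[1] == 22 && win1[0] == 11 && win1[2] == 33 && win0[0] == 0);
  return 0;
}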
void xnn_x32_zip_x2_ukernel__scalar(
27871
    size_t n,
27872
    const uint32_t* input,
27873
    uint32_t* output)
27874
0
{
27875
0
  assert(n != 0);
27876
0
  assert(n % 4 == 0);
27877
27878
0
  const uint32_t* x = input;
27879
0
  const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n);
27880
27881
0
  do {
27882
0
    const uint32_t vx = *x++;
27883
0
    const uint32_t vy = *y++;
27884
0
    output[0] = vx;
27885
0
    output[1] = vy;
27886
0
    output += 2;
27887
27888
0
    n -= 4;
27889
0
  } while (n != 0);
27890
0
}
27891
27892
void xnn_x32_zip_x3_ukernel__scalar(
27893
    size_t n,
27894
    const uint32_t* input,
27895
    uint32_t* output)
27896
0
{
27897
0
  assert(n != 0);
27898
0
  assert(n % 4 == 0);
27899
27900
0
  const uint32_t* x = input;
27901
0
  const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n);
27902
0
  const uint32_t* z = (const uint32_t*) ((uintptr_t) y + n);
27903
0
  uint32_t* o = output;
27904
27905
0
  do {
27906
0
    const uint32_t vx = *x++;
27907
0
    const uint32_t vy = *y++;
27908
0
    const uint32_t vz = *z++;
27909
0
    o[0] = vx;
27910
0
    o[1] = vy;
27911
0
    o[2] = vz;
27912
0
    o += 3;
27913
27914
0
    n -= 4;
27915
0
  } while (n != 0);
27916
0
}
27917
27918
void xnn_x32_zip_x4_ukernel__scalar(
27919
    size_t n,
27920
    const uint32_t* input,
27921
    uint32_t* output)
27922
0
{
27923
0
  assert(n != 0);
27924
0
  assert(n % 4 == 0);
27925
27926
0
  const uint32_t* x = input;
27927
0
  const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n);
27928
0
  const uint32_t* z = (const uint32_t*) ((uintptr_t) y + n);
27929
0
  const uint32_t* w = (const uint32_t*) ((uintptr_t) z + n);
27930
0
  uint32_t* o = output;
27931
27932
0
  do {
27933
0
    const uint32_t vx = *x++;
27934
0
    const uint32_t vy = *y++;
27935
0
    const uint32_t vz = *z++;
27936
0
    const uint32_t vw = *w++;
27937
0
    o[0] = vx;
27938
0
    o[1] = vy;
27939
0
    o[2] = vz;
27940
0
    o[3] = vw;
27941
0
    o += 4;
27942
27943
0
    n -= 4;
27944
0
  } while (n != 0);
27945
0
}
27946
27947
void xnn_x32_zip_xm_ukernel__scalar(
27948
    size_t n,
27949
    size_t m,
27950
    const uint32_t* input,
27951
    uint32_t* output)
27952
0
{
27953
0
  assert(n != 0);
27954
0
  assert(n % 4 == 0);
27955
0
  assert(m >= 4);
27956
27957
0
  size_t k = n;
27958
0
  do {
27959
0
    size_t l = m;
27960
0
    const uint32_t* input_column = input++;
27961
0
    do {
27962
0
      *output++ = *input_column;
27963
0
      input_column = (uint32_t*) ((uintptr_t) input_column + n);
27964
0
    } while (--l != 0);
27965
0
    k -= 4;
27966
0
  } while (k != 0);
27967
0
}
27968
27969
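// The x32 zip kernels above interleave 2, 3, 4, or m contiguous planes of n
// bytes each into a single channel-interleaved stream; the xm variant walks
// the planes column by column. A reference for the general m-plane case (the
// actual xm kernel additionally requires m >= 4 and n % 4 == 0):
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static void zip_xm_ref(size_t n_bytes, size_t m, const uint32_t* input, uint32_t* output) {
  const size_t n_elements = n_bytes / sizeof(uint32_t);  // n is a byte count, like the kernel's
  for (size_t i = 0; i < n_elements; i++) {
    for (size_t j = 0; j < m; j++) {
      *output++ = input[j * n_elements + i];             // element i of plane j
    }
  }
}

int main(void) {
  const uint32_t planes[2 * 3] = {1, 2, 3, 101, 102, 103};  // two planes of 3 elements
  uint32_t zipped[6];
  zip_xm_ref(3 * sizeof(uint32_t), 2, planes, zipped);
  assert(zipped[0] == 1 && zipped[1] == 101 && zipped[4] == 3 && zipped[5] == 103);
  return 0;
}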
void xnn_x64_transposec_ukernel__4x2_scalar_int(
27970
    const uint64_t *input,
27971
    uint64_t * output,
27972
    size_t input_stride,
27973
    size_t output_stride,
27974
    size_t block_width,
27975
    size_t block_height,
27976
    const union xnn_x64_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
27977
0
{
27978
0
  assert(output_stride >= block_height * sizeof(int64_t));
27979
0
  assert(input_stride >= block_width * sizeof(int64_t));
27980
27981
0
  const size_t tile_height = 4;
27982
0
  const size_t tile_width = 2;
27983
0
  const size_t tile_wbytes = tile_width * sizeof(int64_t);
27984
0
  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
27985
0
  const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(int64_t);
27986
0
  const size_t input_offset = tile_height * input_stride;
27987
27988
0
  const int64_t* i0 = (const int64_t*) input;
27989
0
  const int64_t* i1 = (const int64_t*) ((uintptr_t) i0 + input_stride);
27990
0
  const int64_t* i2 = (const int64_t*) ((uintptr_t) i1 + input_stride);
27991
0
  const int64_t* i3 = (const int64_t*) ((uintptr_t) i2 + input_stride);
27992
27993
0
  int64_t* o0 = (int64_t*) output;
27994
0
  int64_t* o1 = (int64_t*) ((uintptr_t) o0 + output_stride);
27995
27996
0
  do {
27997
0
    if XNN_UNPREDICTABLE(block_width < 2) {
27998
0
      o1 = o0;
27999
0
    }
28000
0
    size_t bh = block_height;
28001
0
    for (; bh >= 4; bh -= 4) {
28002
0
      *o1++ = i0[1];
28003
0
      *o1++ = i1[1];
28004
0
      *o1++ = i2[1];
28005
0
      *o1++ = i3[1];
28006
0
      *o0++ = i0[0];
28007
0
      *o0++ = i1[0];
28008
0
      *o0++ = i2[0];
28009
0
      *o0++ = i3[0];
28010
0
      i0 = (const int64_t*) ((uintptr_t) i0 + input_offset);
28011
0
      i1 = (const int64_t*) ((uintptr_t) i1 + input_offset);
28012
0
      i2 = (const int64_t*) ((uintptr_t) i2 + input_offset);
28013
0
      i3 = (const int64_t*) ((uintptr_t) i3 + input_offset);
28014
0
    }
28015
0
    const int64_t* i = i0;
28016
0
    if (bh & 2) {
28017
0
      o1[0] = i0[1];
28018
0
      o1[1] = i1[1];
28019
0
      o1 += 2;
28020
0
      o0[0] = i0[0];
28021
0
      o0[1] = i1[0];
28022
0
      o0 += 2;
28023
0
      i = i2;
28024
0
    }
28025
0
    if (bh & 1) {
28026
0
      o1[0] = i[1];
28027
0
      o0[0] = i[0];
28028
0
    }
28029
28030
0
    i0 = (const int64_t*) ((uintptr_t) i0 + input_reset);
28031
0
    i1 = (const int64_t*) ((uintptr_t) i0 + input_stride);
28032
0
    i2 = (const int64_t*) ((uintptr_t) i1 + input_stride);
28033
0
    i3 = (const int64_t*) ((uintptr_t) i2 + input_stride);
28034
0
    o0 = (int64_t*) ((uintptr_t) o0 + output_reset);
28035
0
    o1 = (int64_t*) ((uintptr_t) o1 + output_reset);
28036
0
    block_width = doz(block_width, tile_width);
28037
0
  } while (block_width != 0);
28038
0
}
28039
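The 4x2 kernel above walks the input in 4-row-by-2-column tiles and patches up the leftover rows and columns afterwards, but the net effect is an ordinary transpose of a block_height x block_width matrix of 64-bit values with byte strides. A naive reference with the same stride convention, useful as a mental model or test oracle (sketch only, not part of the library; assumes <stdint.h>/<stddef.h>):

// Naive 64-bit transpose with byte strides, matching what the tiled kernel computes:
// output[col][row] = input[row][col] for row < block_height, col < block_width.
static void transpose64_reference(
    const uint64_t* input, uint64_t* output,
    size_t input_stride, size_t output_stride,   // strides in bytes
    size_t block_width, size_t block_height)
{
  for (size_t row = 0; row < block_height; row++) {
    const uint64_t* in_row = (const uint64_t*) ((uintptr_t) input + row * input_stride);
    for (size_t col = 0; col < block_width; col++) {
      uint64_t* out_row = (uint64_t*) ((uintptr_t) output + col * output_stride);
      out_row[row] = in_row[col];
    }
  }
}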
28040
void xnn_x8_lut_ukernel__scalar_u1(
28041
    size_t batch,
28042
    const uint8_t* input,
28043
    uint8_t* output,
28044
    const uint8_t table[restrict XNN_MIN_ELEMENTS(256)])
28045
0
{
28046
0
  assert(batch != 0);
28047
0
  assert(batch % sizeof(uint8_t) == 0);
28048
0
  assert(input != NULL);
28049
0
  assert(output != NULL);
28050
28051
0
  do {
28052
0
    const size_t vx = (size_t) *input++;
28053
0
    const uint32_t vt = (uint32_t) table[vx];
28054
0
    *output++ = (uint8_t) vt;
28055
0
    batch -= sizeof(uint8_t);
28056
0
  } while (batch != 0);
28057
0
}
28058
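Both x8 LUT kernels simply map each input byte through a caller-supplied 256-entry table; the u4 variant below only unrolls the loop by four. A hedged usage sketch follows (the wrapper name and the table contents are invented for illustration):

// Apply an arbitrary byte-wise mapping using the scalar LUT kernel.
void apply_byte_table_example(size_t batch, const uint8_t* input, uint8_t* output)
{
  uint8_t table[256];
  for (size_t i = 0; i < 256; i++) {
    table[i] = (uint8_t) ~i;  // example mapping: bitwise NOT of the index
  }
  xnn_x8_lut_ukernel__scalar_u1(batch, input, output, table);
}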
28059
void xnn_x8_lut_ukernel__scalar_u4(
28060
    size_t batch,
28061
    const uint8_t* input,
28062
    uint8_t* output,
28063
    const uint8_t table[restrict XNN_MIN_ELEMENTS(256)])
28064
0
{
28065
0
  assert(batch != 0);
28066
0
  assert(batch % sizeof(uint8_t) == 0);
28067
0
  assert(input != NULL);
28068
0
  assert(output != NULL);
28069
28070
0
  for (; batch >= 4 * sizeof(uint8_t); batch -= 4 * sizeof(uint8_t)) {
28071
0
    const size_t vx0 = (size_t) input[0];
28072
0
    const size_t vx1 = (size_t) input[1];
28073
0
    const size_t vx2 = (size_t) input[2];
28074
0
    const size_t vx3 = (size_t) input[3];
28075
0
    input += 4;
28076
28077
0
    const uint32_t vt0 = (uint32_t) table[vx0];
28078
0
    const uint32_t vt1 = (uint32_t) table[vx1];
28079
0
    const uint32_t vt2 = (uint32_t) table[vx2];
28080
0
    const uint32_t vt3 = (uint32_t) table[vx3];
28081
28082
0
    output[0] = (uint8_t) vt0;
28083
0
    output[1] = (uint8_t) vt1;
28084
0
    output[2] = (uint8_t) vt2;
28085
0
    output[3] = (uint8_t) vt3;
28086
0
    output += 4;
28087
0
  }
28088
0
  if XNN_UNLIKELY(batch != 0) {
28089
0
    do {
28090
0
      const size_t vx = (size_t) *input++;
28091
0
      const uint32_t vt = (uint32_t) table[vx];
28092
0
      *output++ = (uint8_t) vt;
28093
0
      batch -= sizeof(uint8_t);
28094
0
    } while (batch != 0);
28095
0
  }
28096
0
}
28097
28098
void xnn_x8_packw_gemm_goi_ukernel_x16__scalar_int_u2(
28099
  size_t g,
28100
  size_t nc,
28101
  size_t kc,
28102
  size_t nr,
28103
  size_t kr,
28104
  size_t sr,
28105
  const int8_t* weights,
28106
  const uint32_t* bias,
28107
  const void* scale,
28108
  int8_t* packed_weights,
28109
  size_t extra_bytes,
28110
  const void* params)
28111
0
{
28112
0
  assert(g != 0);
28113
0
  assert(nc != 0);
28114
0
  assert(kc != 0);
28115
0
  assert(nr == 16);   // This kernel is for NR=16
28116
0
  assert(kr == 1);
28117
0
  assert(sr == 1);
28118
0
  assert(weights != NULL);
28119
0
  assert(packed_weights != NULL);
28120
28121
0
  int8_t* out = (int8_t*) packed_weights;
28122
0
  const uint32_t* b = (const uint32_t*) bias;
28123
28124
0
  do {
28125
    // NC main loop multiple of 16
28126
0
    const int8_t* w0 = (const int8_t*) weights;
28127
0
    size_t n = nc;
28128
0
    for (; n >= 16; n -= 16) {
28129
0
      if XNN_LIKELY(b != NULL) {
28130
0
        ((uint32_t*) out)[0] = b[0];
28131
0
        ((uint32_t*) out)[1] = b[1];
28132
0
        ((uint32_t*) out)[2] = b[2];
28133
0
        ((uint32_t*) out)[3] = b[3];
28134
0
        ((uint32_t*) out)[4] = b[4];
28135
0
        ((uint32_t*) out)[5] = b[5];
28136
0
        ((uint32_t*) out)[6] = b[6];
28137
0
        ((uint32_t*) out)[7] = b[7];
28138
0
        ((uint32_t*) out)[8] = b[8];
28139
0
        ((uint32_t*) out)[9] = b[9];
28140
0
        ((uint32_t*) out)[10] = b[10];
28141
0
        ((uint32_t*) out)[11] = b[11];
28142
0
        ((uint32_t*) out)[12] = b[12];
28143
0
        ((uint32_t*) out)[13] = b[13];
28144
0
        ((uint32_t*) out)[14] = b[14];
28145
0
        ((uint32_t*) out)[15] = b[15];
28146
0
        b += 16;
28147
0
      } else {
28148
0
        ((uint32_t*) out)[0] = 0;
28149
0
        ((uint32_t*) out)[1] = 0;
28150
0
        ((uint32_t*) out)[2] = 0;
28151
0
        ((uint32_t*) out)[3] = 0;
28152
0
        ((uint32_t*) out)[4] = 0;
28153
0
        ((uint32_t*) out)[5] = 0;
28154
0
        ((uint32_t*) out)[6] = 0;
28155
0
        ((uint32_t*) out)[7] = 0;
28156
0
        ((uint32_t*) out)[8] = 0;
28157
0
        ((uint32_t*) out)[9] = 0;
28158
0
        ((uint32_t*) out)[10] = 0;
28159
0
        ((uint32_t*) out)[11] = 0;
28160
0
        ((uint32_t*) out)[12] = 0;
28161
0
        ((uint32_t*) out)[13] = 0;
28162
0
        ((uint32_t*) out)[14] = 0;
28163
0
        ((uint32_t*) out)[15] = 0;
28164
0
      }
28165
0
      out += 16 * sizeof(uint32_t);
28166
28167
0
      const int8_t* w1 = w0 + kc;
28168
0
      const int8_t* w2 = w1 + kc;
28169
0
      const int8_t* w3 = w2 + kc;
28170
0
      const int8_t* w4 = w3 + kc;
28171
0
      const int8_t* w5 = w4 + kc;
28172
0
      const int8_t* w6 = w5 + kc;
28173
0
      const int8_t* w7 = w6 + kc;
28174
0
      const int8_t* w8 = w7 + kc;
28175
0
      const int8_t* w9 = w8 + kc;
28176
0
      const int8_t* w10 = w9 + kc;
28177
0
      const int8_t* w11 = w10 + kc;
28178
0
      const int8_t* w12 = w11 + kc;
28179
0
      const int8_t* w13 = w12 + kc;
28180
0
      const int8_t* w14 = w13 + kc;
28181
0
      const int8_t* w15 = w14 + kc;
28182
28183
      // KC main loop multiple of 16x2
28184
0
      size_t k = kc;
28185
0
      for (; k >= 2; k -= 2) {
28186
0
        const int8_t v00 = w0[0];
28187
0
        const int8_t v01 = w0[1];
28188
0
        w0 += 2;
28189
0
        const int8_t v10 = w1[0];
28190
0
        const int8_t v11 = w1[1];
28191
0
        w1 += 2;
28192
0
        const int8_t v20 = w2[0];
28193
0
        const int8_t v21 = w2[1];
28194
0
        w2 += 2;
28195
0
        const int8_t v30 = w3[0];
28196
0
        const int8_t v31 = w3[1];
28197
0
        w3 += 2;
28198
0
        const int8_t v40 = w4[0];
28199
0
        const int8_t v41 = w4[1];
28200
0
        w4 += 2;
28201
0
        const int8_t v50 = w5[0];
28202
0
        const int8_t v51 = w5[1];
28203
0
        w5 += 2;
28204
0
        const int8_t v60 = w6[0];
28205
0
        const int8_t v61 = w6[1];
28206
0
        w6 += 2;
28207
0
        const int8_t v70 = w7[0];
28208
0
        const int8_t v71 = w7[1];
28209
0
        w7 += 2;
28210
0
        const int8_t v80 = w8[0];
28211
0
        const int8_t v81 = w8[1];
28212
0
        w8 += 2;
28213
0
        const int8_t v90 = w9[0];
28214
0
        const int8_t v91 = w9[1];
28215
0
        w9 += 2;
28216
0
        const int8_t v100 = w10[0];
28217
0
        const int8_t v101 = w10[1];
28218
0
        w10 += 2;
28219
0
        const int8_t v110 = w11[0];
28220
0
        const int8_t v111 = w11[1];
28221
0
        w11 += 2;
28222
0
        const int8_t v120 = w12[0];
28223
0
        const int8_t v121 = w12[1];
28224
0
        w12 += 2;
28225
0
        const int8_t v130 = w13[0];
28226
0
        const int8_t v131 = w13[1];
28227
0
        w13 += 2;
28228
0
        const int8_t v140 = w14[0];
28229
0
        const int8_t v141 = w14[1];
28230
0
        w14 += 2;
28231
0
        const int8_t v150 = w15[0];
28232
0
        const int8_t v151 = w15[1];
28233
0
        w15 += 2;
28234
0
        out[0] = v00;
28235
0
        out[1] = v10;
28236
0
        out[2] = v20;
28237
0
        out[3] = v30;
28238
0
        out[4] = v40;
28239
0
        out[5] = v50;
28240
0
        out[6] = v60;
28241
0
        out[7] = v70;
28242
0
        out[8] = v80;
28243
0
        out[9] = v90;
28244
0
        out[10] = v100;
28245
0
        out[11] = v110;
28246
0
        out[12] = v120;
28247
0
        out[13] = v130;
28248
0
        out[14] = v140;
28249
0
        out[15] = v150;
28250
0
        out[16] = v01;
28251
0
        out[17] = v11;
28252
0
        out[18] = v21;
28253
0
        out[19] = v31;
28254
0
        out[20] = v41;
28255
0
        out[21] = v51;
28256
0
        out[22] = v61;
28257
0
        out[23] = v71;
28258
0
        out[24] = v81;
28259
0
        out[25] = v91;
28260
0
        out[26] = v101;
28261
0
        out[27] = v111;
28262
0
        out[28] = v121;
28263
0
        out[29] = v131;
28264
0
        out[30] = v141;
28265
0
        out[31] = v151;
28266
0
        out += 32;
28267
0
      }
28268
28269
      // KC remainder
28270
0
      for (; k != 0; --k) {
28271
0
        const int8_t v0 = *w0++;
28272
0
        out[0] = v0;
28273
0
        const int8_t v1 = *w1++;
28274
0
        out[1] = v1;
28275
0
        const int8_t v2 = *w2++;
28276
0
        out[2] = v2;
28277
0
        const int8_t v3 = *w3++;
28278
0
        out[3] = v3;
28279
0
        const int8_t v4 = *w4++;
28280
0
        out[4] = v4;
28281
0
        const int8_t v5 = *w5++;
28282
0
        out[5] = v5;
28283
0
        const int8_t v6 = *w6++;
28284
0
        out[6] = v6;
28285
0
        const int8_t v7 = *w7++;
28286
0
        out[7] = v7;
28287
0
        const int8_t v8 = *w8++;
28288
0
        out[8] = v8;
28289
0
        const int8_t v9 = *w9++;
28290
0
        out[9] = v9;
28291
0
        const int8_t v10 = *w10++;
28292
0
        out[10] = v10;
28293
0
        const int8_t v11 = *w11++;
28294
0
        out[11] = v11;
28295
0
        const int8_t v12 = *w12++;
28296
0
        out[12] = v12;
28297
0
        const int8_t v13 = *w13++;
28298
0
        out[13] = v13;
28299
0
        const int8_t v14 = *w14++;
28300
0
        out[14] = v14;
28301
0
        const int8_t v15 = *w15++;
28302
0
        out[15] = v15;
28303
0
        out += 16;
28304
0
      }
28305
0
      out = (int8_t*) ((uintptr_t) out + extra_bytes);
28306
0
      w0 = w15;
28307
0
    }
28308
28309
    // NC remainder (1..15)
28310
0
    if XNN_UNLIKELY(n != 0) {
28311
0
      if XNN_LIKELY(b != NULL) {
28312
0
        size_t nb = n;
28313
0
        do {
28314
0
          *((uint32_t*) out) = *b++;
28315
0
          out += sizeof(uint32_t);
28316
0
        } while (--nb != 0);
28317
0
      } else {
28318
0
        size_t nb = n;
28319
0
        do {
28320
0
          *((uint32_t*) out) = 0;
28321
0
          out += sizeof(uint32_t);
28322
0
        } while (--nb != 0);
28323
0
      }
28324
0
      out += (16 - n) * sizeof(uint32_t);
28325
28326
      // NR remainder has fewer than 16 rows, so the last row is not loaded
28327
0
      const int8_t* w1 = w0 + kc;
28328
0
      if XNN_UNPREDICTABLE(n < 2) {
28329
0
        w1 = w0;
28330
0
      }
28331
0
      const int8_t* w2 = w1 + kc;
28332
0
      if XNN_UNPREDICTABLE(n <= 2) {
28333
0
        w2 = w1;
28334
0
      }
28335
0
      const int8_t* w3 = w2 + kc;
28336
0
      if XNN_UNPREDICTABLE(n < 4) {
28337
0
        w3 = w2;
28338
0
      }
28339
0
      const int8_t* w4 = w3 + kc;
28340
0
      if XNN_UNPREDICTABLE(n <= 4) {
28341
0
        w4 = w3;
28342
0
      }
28343
0
      const int8_t* w5 = w4 + kc;
28344
0
      if XNN_UNPREDICTABLE(n < 6) {
28345
0
        w5 = w4;
28346
0
      }
28347
0
      const int8_t* w6 = w5 + kc;
28348
0
      if XNN_UNPREDICTABLE(n <= 6) {
28349
0
        w6 = w5;
28350
0
      }
28351
0
      const int8_t* w7 = w6 + kc;
28352
0
      if XNN_UNPREDICTABLE(n < 8) {
28353
0
        w7 = w6;
28354
0
      }
28355
0
      const int8_t* w8 = w7 + kc;
28356
0
      if XNN_UNPREDICTABLE(n <= 8) {
28357
0
        w8 = w7;
28358
0
      }
28359
0
      const int8_t* w9 = w8 + kc;
28360
0
      if XNN_UNPREDICTABLE(n < 10) {
28361
0
        w9 = w8;
28362
0
      }
28363
0
      const int8_t* w10 = w9 + kc;
28364
0
      if XNN_UNPREDICTABLE(n <= 10) {
28365
0
        w10 = w9;
28366
0
      }
28367
0
      const int8_t* w11 = w10 + kc;
28368
0
      if XNN_UNPREDICTABLE(n < 12) {
28369
0
        w11 = w10;
28370
0
      }
28371
0
      const int8_t* w12 = w11 + kc;
28372
0
      if XNN_UNPREDICTABLE(n <= 12) {
28373
0
        w12 = w11;
28374
0
      }
28375
0
      const int8_t* w13 = w12 + kc;
28376
0
      if XNN_UNPREDICTABLE(n < 14) {
28377
0
        w13 = w12;
28378
0
      }
28379
0
      const int8_t* w14 = w13 + kc;
28380
0
      if XNN_UNPREDICTABLE(n <= 14) {
28381
0
        w14 = w13;
28382
0
      }
28383
28384
      // KC main loop multiple of 16x2
28385
0
      size_t k = kc;
28386
0
      for (; k >= 2; k -= 2) {
28387
0
        const int8_t v00 = w0[0];
28388
0
        const int8_t v01 = w0[1];
28389
0
        w0 += 2;
28390
0
        const int8_t v10 = w1[0];
28391
0
        const int8_t v11 = w1[1];
28392
0
        w1 += 2;
28393
0
        const int8_t v20 = w2[0];
28394
0
        const int8_t v21 = w2[1];
28395
0
        w2 += 2;
28396
0
        const int8_t v30 = w3[0];
28397
0
        const int8_t v31 = w3[1];
28398
0
        w3 += 2;
28399
0
        const int8_t v40 = w4[0];
28400
0
        const int8_t v41 = w4[1];
28401
0
        w4 += 2;
28402
0
        const int8_t v50 = w5[0];
28403
0
        const int8_t v51 = w5[1];
28404
0
        w5 += 2;
28405
0
        const int8_t v60 = w6[0];
28406
0
        const int8_t v61 = w6[1];
28407
0
        w6 += 2;
28408
0
        const int8_t v70 = w7[0];
28409
0
        const int8_t v71 = w7[1];
28410
0
        w7 += 2;
28411
0
        const int8_t v80 = w8[0];
28412
0
        const int8_t v81 = w8[1];
28413
0
        w8 += 2;
28414
0
        const int8_t v90 = w9[0];
28415
0
        const int8_t v91 = w9[1];
28416
0
        w9 += 2;
28417
0
        const int8_t v100 = w10[0];
28418
0
        const int8_t v101 = w10[1];
28419
0
        w10 += 2;
28420
0
        const int8_t v110 = w11[0];
28421
0
        const int8_t v111 = w11[1];
28422
0
        w11 += 2;
28423
0
        const int8_t v120 = w12[0];
28424
0
        const int8_t v121 = w12[1];
28425
0
        w12 += 2;
28426
0
        const int8_t v130 = w13[0];
28427
0
        const int8_t v131 = w13[1];
28428
0
        w13 += 2;
28429
0
        const int8_t v140 = w14[0];
28430
0
        const int8_t v141 = w14[1];
28431
0
        w14 += 2;
28432
0
        out[0] = v00;
28433
0
        out[1] = v10;
28434
0
        out[2] = v20;
28435
0
        out[3] = v30;
28436
0
        out[4] = v40;
28437
0
        out[5] = v50;
28438
0
        out[6] = v60;
28439
0
        out[7] = v70;
28440
0
        out[8] = v80;
28441
0
        out[9] = v90;
28442
0
        out[10] = v100;
28443
0
        out[11] = v110;
28444
0
        out[12] = v120;
28445
0
        out[13] = v130;
28446
0
        out[14] = v140;
28447
0
        out[16] = v01;
28448
0
        out[17] = v11;
28449
0
        out[18] = v21;
28450
0
        out[19] = v31;
28451
0
        out[20] = v41;
28452
0
        out[21] = v51;
28453
0
        out[22] = v61;
28454
0
        out[23] = v71;
28455
0
        out[24] = v81;
28456
0
        out[25] = v91;
28457
0
        out[26] = v101;
28458
0
        out[27] = v111;
28459
0
        out[28] = v121;
28460
0
        out[29] = v131;
28461
0
        out[30] = v141;
28462
0
        out += 32;
28463
0
      }
28464
28465
      // KC remainder (at most 1 element)
28466
0
      for (; k != 0; --k) {
28467
0
        const int8_t v0 = *w0++;
28468
0
        out[0] = v0;
28469
0
        const int8_t v1 = *w1++;
28470
0
        out[1] = v1;
28471
0
        const int8_t v2 = *w2++;
28472
0
        out[2] = v2;
28473
0
        const int8_t v3 = *w3++;
28474
0
        out[3] = v3;
28475
0
        const int8_t v4 = *w4++;
28476
0
        out[4] = v4;
28477
0
        const int8_t v5 = *w5++;
28478
0
        out[5] = v5;
28479
0
        const int8_t v6 = *w6++;
28480
0
        out[6] = v6;
28481
0
        const int8_t v7 = *w7++;
28482
0
        out[7] = v7;
28483
0
        const int8_t v8 = *w8++;
28484
0
        out[8] = v8;
28485
0
        const int8_t v9 = *w9++;
28486
0
        out[9] = v9;
28487
0
        const int8_t v10 = *w10++;
28488
0
        out[10] = v10;
28489
0
        const int8_t v11 = *w11++;
28490
0
        out[11] = v11;
28491
0
        const int8_t v12 = *w12++;
28492
0
        out[12] = v12;
28493
0
        const int8_t v13 = *w13++;
28494
0
        out[13] = v13;
28495
0
        const int8_t v14 = *w14++;
28496
0
        out[14] = v14;
28497
0
        out += 16;
28498
0
      }
28499
0
      out = (int8_t*) ((uintptr_t) out + extra_bytes);
28500
0
    }
28501
0
    weights += nc * kc;
28502
0
  } while (--g != 0);
28503
0
}
28504
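This and the following packw_gemm_goi kernels (x32, x4, x8 below) all emit the same layout and differ only in NR: each tile of NR output channels starts with NR 32-bit bias words (zero-filled when `bias` is NULL), followed by the weights regrouped so that, for each k, the bytes of all NR channels sit next to each other; the 2x unrolled main loops simply produce two such k-groups per iteration. Below is a compact reference packer for a single full tile that ignores the NC/KC remainder and `extra_bytes` handling shown above (illustrative only, not the library API):

// Sketch of the packed layout for one full tile of NR output channels:
//   [NR x uint32 bias][k=0: NR weights][k=1: NR weights]...[k=kc-1: NR weights]
// `weights` is row-major [nr][kc]; `out` is assumed 4-byte aligned for the bias
// words, matching the kernels above.
static void pack_goi_tile_reference(
    size_t nr, size_t kc,
    const int8_t* weights, const uint32_t* bias, int8_t* out)
{
  uint32_t* out32 = (uint32_t*) out;
  for (size_t n = 0; n < nr; n++) {
    out32[n] = (bias != NULL) ? bias[n] : 0;
  }
  out += nr * sizeof(uint32_t);
  for (size_t k = 0; k < kc; k++) {
    for (size_t n = 0; n < nr; n++) {
      *out++ = weights[n * kc + k];  // channel n, position k
    }
  }
}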
28505
void xnn_x8_packw_gemm_goi_ukernel_x32__scalar_int_u2(
28506
  size_t g,
28507
  size_t nc,
28508
  size_t kc,
28509
  size_t nr,
28510
  size_t kr,
28511
  size_t sr,
28512
  const int8_t* weights,
28513
  const uint32_t* bias,
28514
  const void* scale,
28515
  int8_t* packed_weights,
28516
  size_t extra_bytes,
28517
  const void* params)
28518
0
{
28519
0
  assert(g != 0);
28520
0
  assert(nc != 0);
28521
0
  assert(kc != 0);
28522
0
  assert(nr == 32);   // This kernel is for NR=32
28523
0
  assert(kr == 1);
28524
0
  assert(sr == 1);
28525
0
  assert(weights != NULL);
28526
0
  assert(packed_weights != NULL);
28527
28528
0
  int8_t* out = (int8_t*) packed_weights;
28529
0
  const uint32_t* b = (const uint32_t*) bias;
28530
28531
0
  do {
28532
    // NC main loop multiple of 32
28533
0
    const int8_t* w0 = (const int8_t*) weights;
28534
0
    size_t n = nc;
28535
0
    for (; n >= 32; n -= 32) {
28536
0
      if XNN_LIKELY(b != NULL) {
28537
0
        ((uint32_t*) out)[0] = b[0];
28538
0
        ((uint32_t*) out)[1] = b[1];
28539
0
        ((uint32_t*) out)[2] = b[2];
28540
0
        ((uint32_t*) out)[3] = b[3];
28541
0
        ((uint32_t*) out)[4] = b[4];
28542
0
        ((uint32_t*) out)[5] = b[5];
28543
0
        ((uint32_t*) out)[6] = b[6];
28544
0
        ((uint32_t*) out)[7] = b[7];
28545
0
        ((uint32_t*) out)[8] = b[8];
28546
0
        ((uint32_t*) out)[9] = b[9];
28547
0
        ((uint32_t*) out)[10] = b[10];
28548
0
        ((uint32_t*) out)[11] = b[11];
28549
0
        ((uint32_t*) out)[12] = b[12];
28550
0
        ((uint32_t*) out)[13] = b[13];
28551
0
        ((uint32_t*) out)[14] = b[14];
28552
0
        ((uint32_t*) out)[15] = b[15];
28553
0
        ((uint32_t*) out)[16] = b[16];
28554
0
        ((uint32_t*) out)[17] = b[17];
28555
0
        ((uint32_t*) out)[18] = b[18];
28556
0
        ((uint32_t*) out)[19] = b[19];
28557
0
        ((uint32_t*) out)[20] = b[20];
28558
0
        ((uint32_t*) out)[21] = b[21];
28559
0
        ((uint32_t*) out)[22] = b[22];
28560
0
        ((uint32_t*) out)[23] = b[23];
28561
0
        ((uint32_t*) out)[24] = b[24];
28562
0
        ((uint32_t*) out)[25] = b[25];
28563
0
        ((uint32_t*) out)[26] = b[26];
28564
0
        ((uint32_t*) out)[27] = b[27];
28565
0
        ((uint32_t*) out)[28] = b[28];
28566
0
        ((uint32_t*) out)[29] = b[29];
28567
0
        ((uint32_t*) out)[30] = b[30];
28568
0
        ((uint32_t*) out)[31] = b[31];
28569
0
        b += 32;
28570
0
      } else {
28571
0
        ((uint32_t*) out)[0] = 0;
28572
0
        ((uint32_t*) out)[1] = 0;
28573
0
        ((uint32_t*) out)[2] = 0;
28574
0
        ((uint32_t*) out)[3] = 0;
28575
0
        ((uint32_t*) out)[4] = 0;
28576
0
        ((uint32_t*) out)[5] = 0;
28577
0
        ((uint32_t*) out)[6] = 0;
28578
0
        ((uint32_t*) out)[7] = 0;
28579
0
        ((uint32_t*) out)[8] = 0;
28580
0
        ((uint32_t*) out)[9] = 0;
28581
0
        ((uint32_t*) out)[10] = 0;
28582
0
        ((uint32_t*) out)[11] = 0;
28583
0
        ((uint32_t*) out)[12] = 0;
28584
0
        ((uint32_t*) out)[13] = 0;
28585
0
        ((uint32_t*) out)[14] = 0;
28586
0
        ((uint32_t*) out)[15] = 0;
28587
0
        ((uint32_t*) out)[16] = 0;
28588
0
        ((uint32_t*) out)[17] = 0;
28589
0
        ((uint32_t*) out)[18] = 0;
28590
0
        ((uint32_t*) out)[19] = 0;
28591
0
        ((uint32_t*) out)[20] = 0;
28592
0
        ((uint32_t*) out)[21] = 0;
28593
0
        ((uint32_t*) out)[22] = 0;
28594
0
        ((uint32_t*) out)[23] = 0;
28595
0
        ((uint32_t*) out)[24] = 0;
28596
0
        ((uint32_t*) out)[25] = 0;
28597
0
        ((uint32_t*) out)[26] = 0;
28598
0
        ((uint32_t*) out)[27] = 0;
28599
0
        ((uint32_t*) out)[28] = 0;
28600
0
        ((uint32_t*) out)[29] = 0;
28601
0
        ((uint32_t*) out)[30] = 0;
28602
0
        ((uint32_t*) out)[31] = 0;
28603
0
      }
28604
0
      out += 32 * sizeof(uint32_t);
28605
28606
0
      const int8_t* w1 = w0 + kc;
28607
0
      const int8_t* w2 = w1 + kc;
28608
0
      const int8_t* w3 = w2 + kc;
28609
0
      const int8_t* w4 = w3 + kc;
28610
0
      const int8_t* w5 = w4 + kc;
28611
0
      const int8_t* w6 = w5 + kc;
28612
0
      const int8_t* w7 = w6 + kc;
28613
0
      const int8_t* w8 = w7 + kc;
28614
0
      const int8_t* w9 = w8 + kc;
28615
0
      const int8_t* w10 = w9 + kc;
28616
0
      const int8_t* w11 = w10 + kc;
28617
0
      const int8_t* w12 = w11 + kc;
28618
0
      const int8_t* w13 = w12 + kc;
28619
0
      const int8_t* w14 = w13 + kc;
28620
0
      const int8_t* w15 = w14 + kc;
28621
0
      const int8_t* w16 = w15 + kc;
28622
0
      const int8_t* w17 = w16 + kc;
28623
0
      const int8_t* w18 = w17 + kc;
28624
0
      const int8_t* w19 = w18 + kc;
28625
0
      const int8_t* w20 = w19 + kc;
28626
0
      const int8_t* w21 = w20 + kc;
28627
0
      const int8_t* w22 = w21 + kc;
28628
0
      const int8_t* w23 = w22 + kc;
28629
0
      const int8_t* w24 = w23 + kc;
28630
0
      const int8_t* w25 = w24 + kc;
28631
0
      const int8_t* w26 = w25 + kc;
28632
0
      const int8_t* w27 = w26 + kc;
28633
0
      const int8_t* w28 = w27 + kc;
28634
0
      const int8_t* w29 = w28 + kc;
28635
0
      const int8_t* w30 = w29 + kc;
28636
0
      const int8_t* w31 = w30 + kc;
28637
28638
      // KC main loop multiple of 32x2
28639
0
      size_t k = kc;
28640
0
      for (; k >= 2; k -= 2) {
28641
0
        const int8_t v00 = w0[0];
28642
0
        const int8_t v01 = w0[1];
28643
0
        w0 += 2;
28644
0
        const int8_t v10 = w1[0];
28645
0
        const int8_t v11 = w1[1];
28646
0
        w1 += 2;
28647
0
        const int8_t v20 = w2[0];
28648
0
        const int8_t v21 = w2[1];
28649
0
        w2 += 2;
28650
0
        const int8_t v30 = w3[0];
28651
0
        const int8_t v31 = w3[1];
28652
0
        w3 += 2;
28653
0
        const int8_t v40 = w4[0];
28654
0
        const int8_t v41 = w4[1];
28655
0
        w4 += 2;
28656
0
        const int8_t v50 = w5[0];
28657
0
        const int8_t v51 = w5[1];
28658
0
        w5 += 2;
28659
0
        const int8_t v60 = w6[0];
28660
0
        const int8_t v61 = w6[1];
28661
0
        w6 += 2;
28662
0
        const int8_t v70 = w7[0];
28663
0
        const int8_t v71 = w7[1];
28664
0
        w7 += 2;
28665
0
        const int8_t v80 = w8[0];
28666
0
        const int8_t v81 = w8[1];
28667
0
        w8 += 2;
28668
0
        const int8_t v90 = w9[0];
28669
0
        const int8_t v91 = w9[1];
28670
0
        w9 += 2;
28671
0
        const int8_t v100 = w10[0];
28672
0
        const int8_t v101 = w10[1];
28673
0
        w10 += 2;
28674
0
        const int8_t v110 = w11[0];
28675
0
        const int8_t v111 = w11[1];
28676
0
        w11 += 2;
28677
0
        const int8_t v120 = w12[0];
28678
0
        const int8_t v121 = w12[1];
28679
0
        w12 += 2;
28680
0
        const int8_t v130 = w13[0];
28681
0
        const int8_t v131 = w13[1];
28682
0
        w13 += 2;
28683
0
        const int8_t v140 = w14[0];
28684
0
        const int8_t v141 = w14[1];
28685
0
        w14 += 2;
28686
0
        const int8_t v150 = w15[0];
28687
0
        const int8_t v151 = w15[1];
28688
0
        w15 += 2;
28689
0
        const int8_t v160 = w16[0];
28690
0
        const int8_t v161 = w16[1];
28691
0
        w16 += 2;
28692
0
        const int8_t v170 = w17[0];
28693
0
        const int8_t v171 = w17[1];
28694
0
        w17 += 2;
28695
0
        const int8_t v180 = w18[0];
28696
0
        const int8_t v181 = w18[1];
28697
0
        w18 += 2;
28698
0
        const int8_t v190 = w19[0];
28699
0
        const int8_t v191 = w19[1];
28700
0
        w19 += 2;
28701
0
        const int8_t v200 = w20[0];
28702
0
        const int8_t v201 = w20[1];
28703
0
        w20 += 2;
28704
0
        const int8_t v210 = w21[0];
28705
0
        const int8_t v211 = w21[1];
28706
0
        w21 += 2;
28707
0
        const int8_t v220 = w22[0];
28708
0
        const int8_t v221 = w22[1];
28709
0
        w22 += 2;
28710
0
        const int8_t v230 = w23[0];
28711
0
        const int8_t v231 = w23[1];
28712
0
        w23 += 2;
28713
0
        const int8_t v240 = w24[0];
28714
0
        const int8_t v241 = w24[1];
28715
0
        w24 += 2;
28716
0
        const int8_t v250 = w25[0];
28717
0
        const int8_t v251 = w25[1];
28718
0
        w25 += 2;
28719
0
        const int8_t v260 = w26[0];
28720
0
        const int8_t v261 = w26[1];
28721
0
        w26 += 2;
28722
0
        const int8_t v270 = w27[0];
28723
0
        const int8_t v271 = w27[1];
28724
0
        w27 += 2;
28725
0
        const int8_t v280 = w28[0];
28726
0
        const int8_t v281 = w28[1];
28727
0
        w28 += 2;
28728
0
        const int8_t v290 = w29[0];
28729
0
        const int8_t v291 = w29[1];
28730
0
        w29 += 2;
28731
0
        const int8_t v300 = w30[0];
28732
0
        const int8_t v301 = w30[1];
28733
0
        w30 += 2;
28734
0
        const int8_t v310 = w31[0];
28735
0
        const int8_t v311 = w31[1];
28736
0
        w31 += 2;
28737
0
        out[0] = v00;
28738
0
        out[1] = v10;
28739
0
        out[2] = v20;
28740
0
        out[3] = v30;
28741
0
        out[4] = v40;
28742
0
        out[5] = v50;
28743
0
        out[6] = v60;
28744
0
        out[7] = v70;
28745
0
        out[8] = v80;
28746
0
        out[9] = v90;
28747
0
        out[10] = v100;
28748
0
        out[11] = v110;
28749
0
        out[12] = v120;
28750
0
        out[13] = v130;
28751
0
        out[14] = v140;
28752
0
        out[15] = v150;
28753
0
        out[16] = v160;
28754
0
        out[17] = v170;
28755
0
        out[18] = v180;
28756
0
        out[19] = v190;
28757
0
        out[20] = v200;
28758
0
        out[21] = v210;
28759
0
        out[22] = v220;
28760
0
        out[23] = v230;
28761
0
        out[24] = v240;
28762
0
        out[25] = v250;
28763
0
        out[26] = v260;
28764
0
        out[27] = v270;
28765
0
        out[28] = v280;
28766
0
        out[29] = v290;
28767
0
        out[30] = v300;
28768
0
        out[31] = v310;
28769
0
        out[32] = v01;
28770
0
        out[33] = v11;
28771
0
        out[34] = v21;
28772
0
        out[35] = v31;
28773
0
        out[36] = v41;
28774
0
        out[37] = v51;
28775
0
        out[38] = v61;
28776
0
        out[39] = v71;
28777
0
        out[40] = v81;
28778
0
        out[41] = v91;
28779
0
        out[42] = v101;
28780
0
        out[43] = v111;
28781
0
        out[44] = v121;
28782
0
        out[45] = v131;
28783
0
        out[46] = v141;
28784
0
        out[47] = v151;
28785
0
        out[48] = v161;
28786
0
        out[49] = v171;
28787
0
        out[50] = v181;
28788
0
        out[51] = v191;
28789
0
        out[52] = v201;
28790
0
        out[53] = v211;
28791
0
        out[54] = v221;
28792
0
        out[55] = v231;
28793
0
        out[56] = v241;
28794
0
        out[57] = v251;
28795
0
        out[58] = v261;
28796
0
        out[59] = v271;
28797
0
        out[60] = v281;
28798
0
        out[61] = v291;
28799
0
        out[62] = v301;
28800
0
        out[63] = v311;
28801
0
        out += 64;
28802
0
      }
28803
28804
      // KC remainder
28805
0
      for (; k != 0; --k) {
28806
0
        const int8_t v0 = *w0++;
28807
0
        out[0] = v0;
28808
0
        const int8_t v1 = *w1++;
28809
0
        out[1] = v1;
28810
0
        const int8_t v2 = *w2++;
28811
0
        out[2] = v2;
28812
0
        const int8_t v3 = *w3++;
28813
0
        out[3] = v3;
28814
0
        const int8_t v4 = *w4++;
28815
0
        out[4] = v4;
28816
0
        const int8_t v5 = *w5++;
28817
0
        out[5] = v5;
28818
0
        const int8_t v6 = *w6++;
28819
0
        out[6] = v6;
28820
0
        const int8_t v7 = *w7++;
28821
0
        out[7] = v7;
28822
0
        const int8_t v8 = *w8++;
28823
0
        out[8] = v8;
28824
0
        const int8_t v9 = *w9++;
28825
0
        out[9] = v9;
28826
0
        const int8_t v10 = *w10++;
28827
0
        out[10] = v10;
28828
0
        const int8_t v11 = *w11++;
28829
0
        out[11] = v11;
28830
0
        const int8_t v12 = *w12++;
28831
0
        out[12] = v12;
28832
0
        const int8_t v13 = *w13++;
28833
0
        out[13] = v13;
28834
0
        const int8_t v14 = *w14++;
28835
0
        out[14] = v14;
28836
0
        const int8_t v15 = *w15++;
28837
0
        out[15] = v15;
28838
0
        const int8_t v16 = *w16++;
28839
0
        out[16] = v16;
28840
0
        const int8_t v17 = *w17++;
28841
0
        out[17] = v17;
28842
0
        const int8_t v18 = *w18++;
28843
0
        out[18] = v18;
28844
0
        const int8_t v19 = *w19++;
28845
0
        out[19] = v19;
28846
0
        const int8_t v20 = *w20++;
28847
0
        out[20] = v20;
28848
0
        const int8_t v21 = *w21++;
28849
0
        out[21] = v21;
28850
0
        const int8_t v22 = *w22++;
28851
0
        out[22] = v22;
28852
0
        const int8_t v23 = *w23++;
28853
0
        out[23] = v23;
28854
0
        const int8_t v24 = *w24++;
28855
0
        out[24] = v24;
28856
0
        const int8_t v25 = *w25++;
28857
0
        out[25] = v25;
28858
0
        const int8_t v26 = *w26++;
28859
0
        out[26] = v26;
28860
0
        const int8_t v27 = *w27++;
28861
0
        out[27] = v27;
28862
0
        const int8_t v28 = *w28++;
28863
0
        out[28] = v28;
28864
0
        const int8_t v29 = *w29++;
28865
0
        out[29] = v29;
28866
0
        const int8_t v30 = *w30++;
28867
0
        out[30] = v30;
28868
0
        const int8_t v31 = *w31++;
28869
0
        out[31] = v31;
28870
0
        out += 32;
28871
0
      }
28872
0
      out = (int8_t*) ((uintptr_t) out + extra_bytes);
28873
0
      w0 = w31;
28874
0
    }
28875
28876
    // NC remainder (1..31)
28877
0
    if XNN_UNLIKELY(n != 0) {
28878
0
      if XNN_LIKELY(b != NULL) {
28879
0
        size_t nb = n;
28880
0
        do {
28881
0
          *((uint32_t*) out) = *b++;
28882
0
          out += sizeof(uint32_t);
28883
0
        } while (--nb != 0);
28884
0
      } else {
28885
0
        size_t nb = n;
28886
0
        do {
28887
0
          *((uint32_t*) out) = 0;
28888
0
          out += sizeof(uint32_t);
28889
0
        } while (--nb != 0);
28890
0
      }
28891
0
      out += (32 - n) * sizeof(uint32_t);
28892
28893
      // NR remainder has fewer than 32 rows, so the last row is not loaded
28894
0
      const int8_t* w1 = w0 + kc;
28895
0
      if XNN_UNPREDICTABLE(n < 2) {
28896
0
        w1 = w0;
28897
0
      }
28898
0
      const int8_t* w2 = w1 + kc;
28899
0
      if XNN_UNPREDICTABLE(n <= 2) {
28900
0
        w2 = w1;
28901
0
      }
28902
0
      const int8_t* w3 = w2 + kc;
28903
0
      if XNN_UNPREDICTABLE(n < 4) {
28904
0
        w3 = w2;
28905
0
      }
28906
0
      const int8_t* w4 = w3 + kc;
28907
0
      if XNN_UNPREDICTABLE(n <= 4) {
28908
0
        w4 = w3;
28909
0
      }
28910
0
      const int8_t* w5 = w4 + kc;
28911
0
      if XNN_UNPREDICTABLE(n < 6) {
28912
0
        w5 = w4;
28913
0
      }
28914
0
      const int8_t* w6 = w5 + kc;
28915
0
      if XNN_UNPREDICTABLE(n <= 6) {
28916
0
        w6 = w5;
28917
0
      }
28918
0
      const int8_t* w7 = w6 + kc;
28919
0
      if XNN_UNPREDICTABLE(n < 8) {
28920
0
        w7 = w6;
28921
0
      }
28922
0
      const int8_t* w8 = w7 + kc;
28923
0
      if XNN_UNPREDICTABLE(n <= 8) {
28924
0
        w8 = w7;
28925
0
      }
28926
0
      const int8_t* w9 = w8 + kc;
28927
0
      if XNN_UNPREDICTABLE(n < 10) {
28928
0
        w9 = w8;
28929
0
      }
28930
0
      const int8_t* w10 = w9 + kc;
28931
0
      if XNN_UNPREDICTABLE(n <= 10) {
28932
0
        w10 = w9;
28933
0
      }
28934
0
      const int8_t* w11 = w10 + kc;
28935
0
      if XNN_UNPREDICTABLE(n < 12) {
28936
0
        w11 = w10;
28937
0
      }
28938
0
      const int8_t* w12 = w11 + kc;
28939
0
      if XNN_UNPREDICTABLE(n <= 12) {
28940
0
        w12 = w11;
28941
0
      }
28942
0
      const int8_t* w13 = w12 + kc;
28943
0
      if XNN_UNPREDICTABLE(n < 14) {
28944
0
        w13 = w12;
28945
0
      }
28946
0
      const int8_t* w14 = w13 + kc;
28947
0
      if XNN_UNPREDICTABLE(n <= 14) {
28948
0
        w14 = w13;
28949
0
      }
28950
0
      const int8_t* w15 = w14 + kc;
28951
0
      if XNN_UNPREDICTABLE(n < 16) {
28952
0
        w15 = w14;
28953
0
      }
28954
0
      const int8_t* w16 = w15 + kc;
28955
0
      if XNN_UNPREDICTABLE(n <= 16) {
28956
0
        w16 = w15;
28957
0
      }
28958
0
      const int8_t* w17 = w16 + kc;
28959
0
      if XNN_UNPREDICTABLE(n < 18) {
28960
0
        w17 = w16;
28961
0
      }
28962
0
      const int8_t* w18 = w17 + kc;
28963
0
      if XNN_UNPREDICTABLE(n <= 18) {
28964
0
        w18 = w17;
28965
0
      }
28966
0
      const int8_t* w19 = w18 + kc;
28967
0
      if XNN_UNPREDICTABLE(n < 20) {
28968
0
        w19 = w18;
28969
0
      }
28970
0
      const int8_t* w20 = w19 + kc;
28971
0
      if XNN_UNPREDICTABLE(n <= 20) {
28972
0
        w20 = w19;
28973
0
      }
28974
0
      const int8_t* w21 = w20 + kc;
28975
0
      if XNN_UNPREDICTABLE(n < 22) {
28976
0
        w21 = w20;
28977
0
      }
28978
0
      const int8_t* w22 = w21 + kc;
28979
0
      if XNN_UNPREDICTABLE(n <= 22) {
28980
0
        w22 = w21;
28981
0
      }
28982
0
      const int8_t* w23 = w22 + kc;
28983
0
      if XNN_UNPREDICTABLE(n < 24) {
28984
0
        w23 = w22;
28985
0
      }
28986
0
      const int8_t* w24 = w23 + kc;
28987
0
      if XNN_UNPREDICTABLE(n <= 24) {
28988
0
        w24 = w23;
28989
0
      }
28990
0
      const int8_t* w25 = w24 + kc;
28991
0
      if XNN_UNPREDICTABLE(n < 26) {
28992
0
        w25 = w24;
28993
0
      }
28994
0
      const int8_t* w26 = w25 + kc;
28995
0
      if XNN_UNPREDICTABLE(n <= 26) {
28996
0
        w26 = w25;
28997
0
      }
28998
0
      const int8_t* w27 = w26 + kc;
28999
0
      if XNN_UNPREDICTABLE(n < 28) {
29000
0
        w27 = w26;
29001
0
      }
29002
0
      const int8_t* w28 = w27 + kc;
29003
0
      if XNN_UNPREDICTABLE(n <= 28) {
29004
0
        w28 = w27;
29005
0
      }
29006
0
      const int8_t* w29 = w28 + kc;
29007
0
      if XNN_UNPREDICTABLE(n < 30) {
29008
0
        w29 = w28;
29009
0
      }
29010
0
      const int8_t* w30 = w29 + kc;
29011
0
      if XNN_UNPREDICTABLE(n <= 30) {
29012
0
        w30 = w29;
29013
0
      }
29014
29015
      // KC main loop multiple of 32x2
29016
0
      size_t k = kc;
29017
0
      for (; k >= 2; k -= 2) {
29018
0
        const int8_t v00 = w0[0];
29019
0
        const int8_t v01 = w0[1];
29020
0
        w0 += 2;
29021
0
        const int8_t v10 = w1[0];
29022
0
        const int8_t v11 = w1[1];
29023
0
        w1 += 2;
29024
0
        const int8_t v20 = w2[0];
29025
0
        const int8_t v21 = w2[1];
29026
0
        w2 += 2;
29027
0
        const int8_t v30 = w3[0];
29028
0
        const int8_t v31 = w3[1];
29029
0
        w3 += 2;
29030
0
        const int8_t v40 = w4[0];
29031
0
        const int8_t v41 = w4[1];
29032
0
        w4 += 2;
29033
0
        const int8_t v50 = w5[0];
29034
0
        const int8_t v51 = w5[1];
29035
0
        w5 += 2;
29036
0
        const int8_t v60 = w6[0];
29037
0
        const int8_t v61 = w6[1];
29038
0
        w6 += 2;
29039
0
        const int8_t v70 = w7[0];
29040
0
        const int8_t v71 = w7[1];
29041
0
        w7 += 2;
29042
0
        const int8_t v80 = w8[0];
29043
0
        const int8_t v81 = w8[1];
29044
0
        w8 += 2;
29045
0
        const int8_t v90 = w9[0];
29046
0
        const int8_t v91 = w9[1];
29047
0
        w9 += 2;
29048
0
        const int8_t v100 = w10[0];
29049
0
        const int8_t v101 = w10[1];
29050
0
        w10 += 2;
29051
0
        const int8_t v110 = w11[0];
29052
0
        const int8_t v111 = w11[1];
29053
0
        w11 += 2;
29054
0
        const int8_t v120 = w12[0];
29055
0
        const int8_t v121 = w12[1];
29056
0
        w12 += 2;
29057
0
        const int8_t v130 = w13[0];
29058
0
        const int8_t v131 = w13[1];
29059
0
        w13 += 2;
29060
0
        const int8_t v140 = w14[0];
29061
0
        const int8_t v141 = w14[1];
29062
0
        w14 += 2;
29063
0
        const int8_t v150 = w15[0];
29064
0
        const int8_t v151 = w15[1];
29065
0
        w15 += 2;
29066
0
        const int8_t v160 = w16[0];
29067
0
        const int8_t v161 = w16[1];
29068
0
        w16 += 2;
29069
0
        const int8_t v170 = w17[0];
29070
0
        const int8_t v171 = w17[1];
29071
0
        w17 += 2;
29072
0
        const int8_t v180 = w18[0];
29073
0
        const int8_t v181 = w18[1];
29074
0
        w18 += 2;
29075
0
        const int8_t v190 = w19[0];
29076
0
        const int8_t v191 = w19[1];
29077
0
        w19 += 2;
29078
0
        const int8_t v200 = w20[0];
29079
0
        const int8_t v201 = w20[1];
29080
0
        w20 += 2;
29081
0
        const int8_t v210 = w21[0];
29082
0
        const int8_t v211 = w21[1];
29083
0
        w21 += 2;
29084
0
        const int8_t v220 = w22[0];
29085
0
        const int8_t v221 = w22[1];
29086
0
        w22 += 2;
29087
0
        const int8_t v230 = w23[0];
29088
0
        const int8_t v231 = w23[1];
29089
0
        w23 += 2;
29090
0
        const int8_t v240 = w24[0];
29091
0
        const int8_t v241 = w24[1];
29092
0
        w24 += 2;
29093
0
        const int8_t v250 = w25[0];
29094
0
        const int8_t v251 = w25[1];
29095
0
        w25 += 2;
29096
0
        const int8_t v260 = w26[0];
29097
0
        const int8_t v261 = w26[1];
29098
0
        w26 += 2;
29099
0
        const int8_t v270 = w27[0];
29100
0
        const int8_t v271 = w27[1];
29101
0
        w27 += 2;
29102
0
        const int8_t v280 = w28[0];
29103
0
        const int8_t v281 = w28[1];
29104
0
        w28 += 2;
29105
0
        const int8_t v290 = w29[0];
29106
0
        const int8_t v291 = w29[1];
29107
0
        w29 += 2;
29108
0
        const int8_t v300 = w30[0];
29109
0
        const int8_t v301 = w30[1];
29110
0
        w30 += 2;
29111
0
        out[0] = v00;
29112
0
        out[1] = v10;
29113
0
        out[2] = v20;
29114
0
        out[3] = v30;
29115
0
        out[4] = v40;
29116
0
        out[5] = v50;
29117
0
        out[6] = v60;
29118
0
        out[7] = v70;
29119
0
        out[8] = v80;
29120
0
        out[9] = v90;
29121
0
        out[10] = v100;
29122
0
        out[11] = v110;
29123
0
        out[12] = v120;
29124
0
        out[13] = v130;
29125
0
        out[14] = v140;
29126
0
        out[15] = v150;
29127
0
        out[16] = v160;
29128
0
        out[17] = v170;
29129
0
        out[18] = v180;
29130
0
        out[19] = v190;
29131
0
        out[20] = v200;
29132
0
        out[21] = v210;
29133
0
        out[22] = v220;
29134
0
        out[23] = v230;
29135
0
        out[24] = v240;
29136
0
        out[25] = v250;
29137
0
        out[26] = v260;
29138
0
        out[27] = v270;
29139
0
        out[28] = v280;
29140
0
        out[29] = v290;
29141
0
        out[30] = v300;
29142
0
        out[32] = v01;
29143
0
        out[33] = v11;
29144
0
        out[34] = v21;
29145
0
        out[35] = v31;
29146
0
        out[36] = v41;
29147
0
        out[37] = v51;
29148
0
        out[38] = v61;
29149
0
        out[39] = v71;
29150
0
        out[40] = v81;
29151
0
        out[41] = v91;
29152
0
        out[42] = v101;
29153
0
        out[43] = v111;
29154
0
        out[44] = v121;
29155
0
        out[45] = v131;
29156
0
        out[46] = v141;
29157
0
        out[47] = v151;
29158
0
        out[48] = v161;
29159
0
        out[49] = v171;
29160
0
        out[50] = v181;
29161
0
        out[51] = v191;
29162
0
        out[52] = v201;
29163
0
        out[53] = v211;
29164
0
        out[54] = v221;
29165
0
        out[55] = v231;
29166
0
        out[56] = v241;
29167
0
        out[57] = v251;
29168
0
        out[58] = v261;
29169
0
        out[59] = v271;
29170
0
        out[60] = v281;
29171
0
        out[61] = v291;
29172
0
        out[62] = v301;
29173
0
        out += 64;
29174
0
      }
29175
29176
      // KC remainder (at most 1 element)
29177
0
      for (; k != 0; --k) {
29178
0
        const int8_t v0 = *w0++;
29179
0
        out[0] = v0;
29180
0
        const int8_t v1 = *w1++;
29181
0
        out[1] = v1;
29182
0
        const int8_t v2 = *w2++;
29183
0
        out[2] = v2;
29184
0
        const int8_t v3 = *w3++;
29185
0
        out[3] = v3;
29186
0
        const int8_t v4 = *w4++;
29187
0
        out[4] = v4;
29188
0
        const int8_t v5 = *w5++;
29189
0
        out[5] = v5;
29190
0
        const int8_t v6 = *w6++;
29191
0
        out[6] = v6;
29192
0
        const int8_t v7 = *w7++;
29193
0
        out[7] = v7;
29194
0
        const int8_t v8 = *w8++;
29195
0
        out[8] = v8;
29196
0
        const int8_t v9 = *w9++;
29197
0
        out[9] = v9;
29198
0
        const int8_t v10 = *w10++;
29199
0
        out[10] = v10;
29200
0
        const int8_t v11 = *w11++;
29201
0
        out[11] = v11;
29202
0
        const int8_t v12 = *w12++;
29203
0
        out[12] = v12;
29204
0
        const int8_t v13 = *w13++;
29205
0
        out[13] = v13;
29206
0
        const int8_t v14 = *w14++;
29207
0
        out[14] = v14;
29208
0
        const int8_t v15 = *w15++;
29209
0
        out[15] = v15;
29210
0
        const int8_t v16 = *w16++;
29211
0
        out[16] = v16;
29212
0
        const int8_t v17 = *w17++;
29213
0
        out[17] = v17;
29214
0
        const int8_t v18 = *w18++;
29215
0
        out[18] = v18;
29216
0
        const int8_t v19 = *w19++;
29217
0
        out[19] = v19;
29218
0
        const int8_t v20 = *w20++;
29219
0
        out[20] = v20;
29220
0
        const int8_t v21 = *w21++;
29221
0
        out[21] = v21;
29222
0
        const int8_t v22 = *w22++;
29223
0
        out[22] = v22;
29224
0
        const int8_t v23 = *w23++;
29225
0
        out[23] = v23;
29226
0
        const int8_t v24 = *w24++;
29227
0
        out[24] = v24;
29228
0
        const int8_t v25 = *w25++;
29229
0
        out[25] = v25;
29230
0
        const int8_t v26 = *w26++;
29231
0
        out[26] = v26;
29232
0
        const int8_t v27 = *w27++;
29233
0
        out[27] = v27;
29234
0
        const int8_t v28 = *w28++;
29235
0
        out[28] = v28;
29236
0
        const int8_t v29 = *w29++;
29237
0
        out[29] = v29;
29238
0
        const int8_t v30 = *w30++;
29239
0
        out[30] = v30;
29240
0
        out += 32;
29241
0
      }
29242
0
      out = (int8_t*) ((uintptr_t) out + extra_bytes);
29243
0
    }
29244
0
    weights += nc * kc;
29245
0
  } while (--g != 0);
29246
0
}
29247
29248
void xnn_x8_packw_gemm_goi_ukernel_x4__scalar_int_u2(
29249
  size_t g,
29250
  size_t nc,
29251
  size_t kc,
29252
  size_t nr,
29253
  size_t kr,
29254
  size_t sr,
29255
  const int8_t* weights,
29256
  const uint32_t* bias,
29257
  const void* scale,
29258
  int8_t* packed_weights,
29259
  size_t extra_bytes,
29260
  const void* params)
29261
0
{
29262
0
  assert(g != 0);
29263
0
  assert(nc != 0);
29264
0
  assert(kc != 0);
29265
0
  assert(nr == 4);   // This kernel is for NR=4
29266
0
  assert(kr == 1);
29267
0
  assert(sr == 1);
29268
0
  assert(weights != NULL);
29269
0
  assert(packed_weights != NULL);
29270
29271
0
  int8_t* out = (int8_t*) packed_weights;
29272
0
  const uint32_t* b = (const uint32_t*) bias;
29273
29274
0
  do {
29275
    // NC main loop multiple of 4
29276
0
    const int8_t* w0 = (const int8_t*) weights;
29277
0
    size_t n = nc;
29278
0
    for (; n >= 4; n -= 4) {
29279
0
      if XNN_LIKELY(b != NULL) {
29280
0
        ((uint32_t*) out)[0] = b[0];
29281
0
        ((uint32_t*) out)[1] = b[1];
29282
0
        ((uint32_t*) out)[2] = b[2];
29283
0
        ((uint32_t*) out)[3] = b[3];
29284
0
        b += 4;
29285
0
      } else {
29286
0
        ((uint32_t*) out)[0] = 0;
29287
0
        ((uint32_t*) out)[1] = 0;
29288
0
        ((uint32_t*) out)[2] = 0;
29289
0
        ((uint32_t*) out)[3] = 0;
29290
0
      }
29291
0
      out += 4 * sizeof(uint32_t);
29292
29293
0
      const int8_t* w1 = w0 + kc;
29294
0
      const int8_t* w2 = w1 + kc;
29295
0
      const int8_t* w3 = w2 + kc;
29296
29297
      // KC main loop multiple of 4x2
29298
0
      size_t k = kc;
29299
0
      for (; k >= 2; k -= 2) {
29300
0
        const int8_t v00 = w0[0];
29301
0
        const int8_t v01 = w0[1];
29302
0
        w0 += 2;
29303
0
        const int8_t v10 = w1[0];
29304
0
        const int8_t v11 = w1[1];
29305
0
        w1 += 2;
29306
0
        const int8_t v20 = w2[0];
29307
0
        const int8_t v21 = w2[1];
29308
0
        w2 += 2;
29309
0
        const int8_t v30 = w3[0];
29310
0
        const int8_t v31 = w3[1];
29311
0
        w3 += 2;
29312
0
        out[0] = v00;
29313
0
        out[1] = v10;
29314
0
        out[2] = v20;
29315
0
        out[3] = v30;
29316
0
        out[4] = v01;
29317
0
        out[5] = v11;
29318
0
        out[6] = v21;
29319
0
        out[7] = v31;
29320
0
        out += 8;
29321
0
      }
29322
29323
      // KC remainder
29324
0
      for (; k != 0; --k) {
29325
0
        const int8_t v0 = *w0++;
29326
0
        out[0] = v0;
29327
0
        const int8_t v1 = *w1++;
29328
0
        out[1] = v1;
29329
0
        const int8_t v2 = *w2++;
29330
0
        out[2] = v2;
29331
0
        const int8_t v3 = *w3++;
29332
0
        out[3] = v3;
29333
0
        out += 4;
29334
0
      }
29335
0
      out = (int8_t*) ((uintptr_t) out + extra_bytes);
29336
0
      w0 = w3;
29337
0
    }
29338
29339
    // NC remainder (1..3)
29340
0
    if XNN_UNLIKELY(n != 0) {
29341
0
      if XNN_LIKELY(b != NULL) {
29342
0
        size_t nb = n;
29343
0
        do {
29344
0
          *((uint32_t*) out) = *b++;
29345
0
          out += sizeof(uint32_t);
29346
0
        } while (--nb != 0);
29347
0
      } else {
29348
0
        size_t nb = n;
29349
0
        do {
29350
0
          *((uint32_t*) out) = 0;
29351
0
          out += sizeof(uint32_t);
29352
0
        } while (--nb != 0);
29353
0
      }
29354
0
      out += (4 - n) * sizeof(uint32_t);
29355
29356
      // NR remainder has fewer than 4 rows, so the last row is not loaded
29357
0
      const int8_t* w1 = w0 + kc;
29358
0
      if XNN_UNPREDICTABLE(n < 2) {
29359
0
        w1 = w0;
29360
0
      }
29361
0
      const int8_t* w2 = w1 + kc;
29362
0
      if XNN_UNPREDICTABLE(n <= 2) {
29363
0
        w2 = w1;
29364
0
      }
29365
29366
      // KC main loop multiple of 4x2
29367
0
      size_t k = kc;
29368
0
      for (; k >= 2; k -= 2) {
29369
0
        const int8_t v00 = w0[0];
29370
0
        const int8_t v01 = w0[1];
29371
0
        w0 += 2;
29372
0
        const int8_t v10 = w1[0];
29373
0
        const int8_t v11 = w1[1];
29374
0
        w1 += 2;
29375
0
        const int8_t v20 = w2[0];
29376
0
        const int8_t v21 = w2[1];
29377
0
        w2 += 2;
29378
0
        out[0] = v00;
29379
0
        out[1] = v10;
29380
0
        out[2] = v20;
29381
0
        out[4] = v01;
29382
0
        out[5] = v11;
29383
0
        out[6] = v21;
29384
0
        out += 8;
29385
0
      }
29386
29387
      // KC remainder (at most 1 element)
29388
0
      for (; k != 0; --k) {
29389
0
        const int8_t v0 = *w0++;
29390
0
        out[0] = v0;
29391
0
        const int8_t v1 = *w1++;
29392
0
        out[1] = v1;
29393
0
        const int8_t v2 = *w2++;
29394
0
        out[2] = v2;
29395
0
        out += 4;
29396
0
      }
29397
0
      out = (int8_t*) ((uintptr_t) out + extra_bytes);
29398
0
    }
29399
0
    weights += nc * kc;
29400
0
  } while (--g != 0);
29401
0
}
29402
29403
void xnn_x8_packw_gemm_goi_ukernel_x8__scalar_int_u2(
29404
  size_t g,
29405
  size_t nc,
29406
  size_t kc,
29407
  size_t nr,
29408
  size_t kr,
29409
  size_t sr,
29410
  const int8_t* weights,
29411
  const uint32_t* bias,
29412
  const void* scale,
29413
  int8_t* packed_weights,
29414
  size_t extra_bytes,
29415
  const void* params)
29416
0
{
29417
0
  assert(g != 0);
29418
0
  assert(nc != 0);
29419
0
  assert(kc != 0);
29420
0
  assert(nr == 8);   // This kernel is for NR=8
29421
0
  assert(kr == 1);
29422
0
  assert(sr == 1);
29423
0
  assert(weights != NULL);
29424
0
  assert(packed_weights != NULL);
29425
29426
0
  int8_t* out = (int8_t*) packed_weights;
29427
0
  const uint32_t* b = (const uint32_t*) bias;
29428
29429
0
  do {
29430
    // NC main loop multiple of 8
29431
0
    const int8_t* w0 = (const int8_t*) weights;
29432
0
    size_t n = nc;
29433
0
    for (; n >= 8; n -= 8) {
29434
0
      if XNN_LIKELY(b != NULL) {
29435
0
        ((uint32_t*) out)[0] = b[0];
29436
0
        ((uint32_t*) out)[1] = b[1];
29437
0
        ((uint32_t*) out)[2] = b[2];
29438
0
        ((uint32_t*) out)[3] = b[3];
29439
0
        ((uint32_t*) out)[4] = b[4];
29440
0
        ((uint32_t*) out)[5] = b[5];
29441
0
        ((uint32_t*) out)[6] = b[6];
29442
0
        ((uint32_t*) out)[7] = b[7];
29443
0
        b += 8;
29444
0
      } else {
29445
0
        ((uint32_t*) out)[0] = 0;
29446
0
        ((uint32_t*) out)[1] = 0;
29447
0
        ((uint32_t*) out)[2] = 0;
29448
0
        ((uint32_t*) out)[3] = 0;
29449
0
        ((uint32_t*) out)[4] = 0;
29450
0
        ((uint32_t*) out)[5] = 0;
29451
0
        ((uint32_t*) out)[6] = 0;
29452
0
        ((uint32_t*) out)[7] = 0;
29453
0
      }
29454
0
      out += 8 * sizeof(uint32_t);
29455
29456
0
      const int8_t* w1 = w0 + kc;
29457
0
      const int8_t* w2 = w1 + kc;
29458
0
      const int8_t* w3 = w2 + kc;
29459
0
      const int8_t* w4 = w3 + kc;
29460
0
      const int8_t* w5 = w4 + kc;
29461
0
      const int8_t* w6 = w5 + kc;
29462
0
      const int8_t* w7 = w6 + kc;
29463
29464
      // KC main loop multiple of 8x2
29465
0
      size_t k = kc;
29466
0
      for (; k >= 2; k -= 2) {
29467
0
        const int8_t v00 = w0[0];
29468
0
        const int8_t v01 = w0[1];
29469
0
        w0 += 2;
29470
0
        const int8_t v10 = w1[0];
29471
0
        const int8_t v11 = w1[1];
29472
0
        w1 += 2;
29473
0
        const int8_t v20 = w2[0];
29474
0
        const int8_t v21 = w2[1];
29475
0
        w2 += 2;
29476
0
        const int8_t v30 = w3[0];
29477
0
        const int8_t v31 = w3[1];
29478
0
        w3 += 2;
29479
0
        const int8_t v40 = w4[0];
29480
0
        const int8_t v41 = w4[1];
29481
0
        w4 += 2;
29482
0
        const int8_t v50 = w5[0];
29483
0
        const int8_t v51 = w5[1];
29484
0
        w5 += 2;
29485
0
        const int8_t v60 = w6[0];
29486
0
        const int8_t v61 = w6[1];
29487
0
        w6 += 2;
29488
0
        const int8_t v70 = w7[0];
29489
0
        const int8_t v71 = w7[1];
29490
0
        w7 += 2;
29491
0
        out[0] = v00;
29492
0
        out[1] = v10;
29493
0
        out[2] = v20;
29494
0
        out[3] = v30;
29495
0
        out[4] = v40;
29496
0
        out[5] = v50;
29497
0
        out[6] = v60;
29498
0
        out[7] = v70;
29499
0
        out[8] = v01;
29500
0
        out[9] = v11;
29501
0
        out[10] = v21;
29502
0
        out[11] = v31;
29503
0
        out[12] = v41;
29504
0
        out[13] = v51;
29505
0
        out[14] = v61;
29506
0
        out[15] = v71;
29507
0
        out += 16;
29508
0
      }
29509
29510
      // KC remainder
29511
0
      for (; k != 0; --k) {
29512
0
        const int8_t v0 = *w0++;
29513
0
        out[0] = v0;
29514
0
        const int8_t v1 = *w1++;
29515
0
        out[1] = v1;
29516
0
        const int8_t v2 = *w2++;
29517
0
        out[2] = v2;
29518
0
        const int8_t v3 = *w3++;
29519
0
        out[3] = v3;
29520
0
        const int8_t v4 = *w4++;
29521
0
        out[4] = v4;
29522
0
        const int8_t v5 = *w5++;
29523
0
        out[5] = v5;
29524
0
        const int8_t v6 = *w6++;
29525
0
        out[6] = v6;
29526
0
        const int8_t v7 = *w7++;
29527
0
        out[7] = v7;
29528
0
        out += 8;
29529
0
      }
29530
0
      out = (int8_t*) ((uintptr_t) out + extra_bytes);
29531
0
      w0 = w7;
29532
0
    }
29533
29534
    // NC remainder (1..7)
29535
0
    if XNN_UNLIKELY(n != 0) {
29536
0
      if XNN_LIKELY(b != NULL) {
29537
0
        size_t nb = n;
29538
0
        do {
29539
0
          *((uint32_t*) out) = *b++;
29540
0
          out += sizeof(uint32_t);
29541
0
        } while (--nb != 0);
29542
0
      } else {
29543
0
        size_t nb = n;
29544
0
        do {
29545
0
          *((uint32_t*) out) = 0;
29546
0
          out += sizeof(uint32_t);
29547
0
        } while (--nb != 0);
29548
0
      }
29549
0
      out += (8 - n) * sizeof(uint32_t);
29550
29551
      // NR remainder has fewer than 8 rows, so the last row is not loaded
29552
0
      const int8_t* w1 = w0 + kc;
29553
0
      if XNN_UNPREDICTABLE(n < 2) {
29554
0
        w1 = w0;
29555
0
      }
29556
0
      const int8_t* w2 = w1 + kc;
29557
0
      if XNN_UNPREDICTABLE(n <= 2) {
29558
0
        w2 = w1;
29559
0
      }
29560
0
      const int8_t* w3 = w2 + kc;
29561
0
      if XNN_UNPREDICTABLE(n < 4) {
29562
0
        w3 = w2;
29563
0
      }
29564
0
      const int8_t* w4 = w3 + kc;
29565
0
      if XNN_UNPREDICTABLE(n <= 4) {
29566
0
        w4 = w3;
29567
0
      }
29568
0
      const int8_t* w5 = w4 + kc;
29569
0
      if XNN_UNPREDICTABLE(n < 6) {
29570
0
        w5 = w4;
29571
0
      }
29572
0
      const int8_t* w6 = w5 + kc;
29573
0
      if XNN_UNPREDICTABLE(n <= 6) {
29574
0
        w6 = w5;
29575
0
      }
29576
29577
      // KC main loop multiple of 8x2
29578
0
      size_t k = kc;
29579
0
      for (; k >= 2; k -= 2) {
29580
0
        const int8_t v00 = w0[0];
29581
0
        const int8_t v01 = w0[1];
29582
0
        w0 += 2;
29583
0
        const int8_t v10 = w1[0];
29584
0
        const int8_t v11 = w1[1];
29585
0
        w1 += 2;
29586
0
        const int8_t v20 = w2[0];
29587
0
        const int8_t v21 = w2[1];
29588
0
        w2 += 2;
29589
0
        const int8_t v30 = w3[0];
29590
0
        const int8_t v31 = w3[1];
29591
0
        w3 += 2;
29592
0
        const int8_t v40 = w4[0];
29593
0
        const int8_t v41 = w4[1];
29594
0
        w4 += 2;
29595
0
        const int8_t v50 = w5[0];
29596
0
        const int8_t v51 = w5[1];
29597
0
        w5 += 2;
29598
0
        const int8_t v60 = w6[0];
29599
0
        const int8_t v61 = w6[1];
29600
0
        w6 += 2;
29601
0
        out[0] = v00;
29602
0
        out[1] = v10;
29603
0
        out[2] = v20;
29604
0
        out[3] = v30;
29605
0
        out[4] = v40;
29606
0
        out[5] = v50;
29607
0
        out[6] = v60;
29608
0
        out[8] = v01;
29609
0
        out[9] = v11;
29610
0
        out[10] = v21;
29611
0
        out[11] = v31;
29612
0
        out[12] = v41;
29613
0
        out[13] = v51;
29614
0
        out[14] = v61;
29615
0
        out += 16;
29616
0
      }
29617
29618
      // KC remainder (at most 1 element)
29619
0
      for (; k != 0; --k) {
29620
0
        const int8_t v0 = *w0++;
29621
0
        out[0] = v0;
29622
0
        const int8_t v1 = *w1++;
29623
0
        out[1] = v1;
29624
0
        const int8_t v2 = *w2++;
29625
0
        out[2] = v2;
29626
0
        const int8_t v3 = *w3++;
29627
0
        out[3] = v3;
29628
0
        const int8_t v4 = *w4++;
29629
0
        out[4] = v4;
29630
0
        const int8_t v5 = *w5++;
29631
0
        out[5] = v5;
29632
0
        const int8_t v6 = *w6++;
29633
0
        out[6] = v6;
29634
0
        out += 8;
29635
0
      }
29636
0
      out = (int8_t*) ((uintptr_t) out + extra_bytes);
29637
0
    }
29638
0
    weights += nc * kc;
29639
0
  } while (--g != 0);
29640
0
}
29641
29642
void xnn_x8_transposec_ukernel__2x4_scalar_int(
29643
    const uint8_t* input,
29644
    uint8_t* output,
29645
    size_t input_stride,
29646
    size_t output_stride,
29647
    size_t block_width,
29648
    size_t block_height,
29649
    const union xnn_x8_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
29650
0
{
29651
0
  assert(output_stride >= block_height * sizeof(int8_t));
29652
0
  assert(input_stride >= block_width * sizeof(int8_t));
29653
29654
0
  const size_t tile_height = 2;
29655
0
  const size_t tile_width = 4;
29656
0
  const size_t tile_wbytes = tile_width * sizeof(int8_t);
29657
0
  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
29658
0
  const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(int8_t);
29659
0
  const size_t input_offset = tile_height * input_stride;
29660
29661
0
  const int8_t* i0 = (const int8_t*) input;
29662
0
  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
29663
29664
0
  int8_t* o0 = (int8_t*) output;
29665
0
  int8_t* o1 = (int8_t*) ((uintptr_t) o0 + output_stride);
29666
0
  int8_t* o2 = (int8_t*) ((uintptr_t) o1 + output_stride);
29667
0
  int8_t* o3 = (int8_t*) ((uintptr_t) o2 + output_stride);
29668
29669
0
  do {
29670
0
    if XNN_UNPREDICTABLE(block_width < 2) {
29671
0
      o1 = o0;
29672
0
    }
29673
0
    if XNN_UNPREDICTABLE(block_width <= 2) {
29674
0
      o2 = o0;
29675
0
    }
29676
0
    if XNN_UNPREDICTABLE(block_width < 4) {
29677
0
      o3 = o0;
29678
0
    }
29679
0
    size_t bh = block_height;
29680
0
    for (; bh >= 2; bh -= 2) {
29681
0
      *o3++ = i0[3];
29682
0
      *o3++ = i1[3];
29683
0
      *o2++ = i0[2];
29684
0
      *o2++ = i1[2];
29685
0
      *o1++ = i0[1];
29686
0
      *o1++ = i1[1];
29687
0
      *o0++ = i0[0];
29688
0
      *o0++ = i1[0];
29689
0
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
29690
0
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
29691
0
    }
29692
0
    if (bh & 1) {
29693
0
      o3[0] = i0[3];
29694
0
      o2[0] = i0[2];
29695
0
      o1[0] = i0[1];
29696
0
      o0[0] = i0[0];
29697
0
    }
29698
29699
0
    i0 = (const int8_t*) ((uintptr_t) i0 + input_reset);
29700
0
    i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
29701
0
    o0 = (int8_t*) ((uintptr_t) o0 + output_reset);
29702
0
    o1 = (int8_t*) ((uintptr_t) o1 + output_reset);
29703
0
    o2 = (int8_t*) ((uintptr_t) o2 + output_reset);
29704
0
    o3 = (int8_t*) ((uintptr_t) o3 + output_reset);
29705
0
    block_width = doz(block_width, tile_width);
29706
0
  } while (block_width != 0);
29707
0
}
29708
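For orientation, here is a minimal sketch of one plausible way to drive the 2x4 scalar transpose kernel above. block_height counts input rows and block_width counts input columns, and both strides are byte counts. The 3x4 source matrix, the dense row-major strides, the zero-initialized params value, and the helper name are illustrative assumptions, not taken from the amalgamated source; the listing above suggests the scalar variant never reads params.

#include <stdint.h>
#include <string.h>
#include <xnnpack/microparams.h>
#include <xnnpack/transpose.h>

// Hypothetical harness: transpose a 3x4 byte matrix into a 4x3 matrix.
static void transposec_2x4_example(void) {
  const uint8_t src[3][4] = {
    { 1,  2,  3,  4},
    { 5,  6,  7,  8},
    { 9, 10, 11, 12},
  };
  uint8_t dst[4][3];
  union xnn_x8_transpose_params params;
  memset(&params, 0, sizeof(params));  // assumed unused by the scalar variant
  xnn_x8_transposec_ukernel__2x4_scalar_int(
      &src[0][0], &dst[0][0],
      /*input_stride=*/4 * sizeof(uint8_t),   // bytes between input rows
      /*output_stride=*/3 * sizeof(uint8_t),  // bytes between output rows
      /*block_width=*/4, /*block_height=*/3,
      &params);
  // dst[c][r] now equals src[r][c].
}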
29709
void xnn_x8_zip_x2_ukernel__scalar(
29710
    size_t n,
29711
    const uint8_t* input,
29712
    uint8_t* output)
29713
0
{
29714
0
  assert(n != 0);
29715
29716
0
  const uint8_t* x = input;
29717
0
  const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n);
29718
0
  uint8_t* o = output;
29719
29720
0
  do {
29721
0
    const uint8_t vx = *x++;
29722
0
    const uint8_t vy = *y++;
29723
0
    o[0] = vx;
29724
0
    o[1] = vy;
29725
0
    o += 2;
29726
29727
0
    n -= sizeof(uint8_t);
29728
0
  } while (n != 0);
29729
0
}
29730
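A minimal usage sketch for the two-plane zip above, under the assumption that both n-byte planes are stored back to back starting at input; the 4-byte plane size, the buffer contents, and the helper name are illustrative.

#include <stdint.h>
#include <xnnpack/zip.h>

// Hypothetical example: interleave two 4-byte planes stored back to back.
static void zip_x2_example(void) {
  const uint8_t planes[2 * 4] = {
    1, 2, 3, 4,      // plane x
    10, 20, 30, 40,  // plane y (starts at input + n)
  };
  uint8_t interleaved[2 * 4];
  // n is the per-plane size in bytes.
  xnn_x8_zip_x2_ukernel__scalar(4 * sizeof(uint8_t), planes, interleaved);
  // interleaved == { 1, 10, 2, 20, 3, 30, 4, 40 }
}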
29731
void xnn_x8_zip_x3_ukernel__scalar(
29732
    size_t n,
29733
    const uint8_t* input,
29734
    uint8_t* output)
29735
0
{
29736
0
  const uint8_t* x = input;
29737
0
  const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n);
29738
0
  const uint8_t* z = (const uint8_t*) ((uintptr_t) y + n);
29739
0
  uint8_t* o = output;
29740
29741
0
  do {
29742
0
    const uint8_t vx = *x++;
29743
0
    const uint8_t vy = *y++;
29744
0
    const uint8_t vz = *z++;
29745
0
    o[0] = vx;
29746
0
    o[1] = vy;
29747
0
    o[2] = vz;
29748
0
    o += 3;
29749
29750
0
    n -= sizeof(uint8_t);
29751
0
  } while (n != 0);
29752
0
}
29753
29754
void xnn_x8_zip_x4_ukernel__scalar(
29755
    size_t n,
29756
    const uint8_t* input,
29757
    uint8_t* output)
29758
0
{
29759
0
  assert(n != 0);
29760
29761
0
  const uint8_t* x = input;
29762
0
  const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n);
29763
0
  const uint8_t* z = (const uint8_t*) ((uintptr_t) y + n);
29764
0
  const uint8_t* w = (const uint8_t*) ((uintptr_t) z + n);
29765
0
  uint8_t* o = output;
29766
29767
0
  do {
29768
0
    const uint8_t vx = *x++;
29769
0
    const uint8_t vy = *y++;
29770
0
    const uint8_t vz = *z++;
29771
0
    const uint8_t vw = *w++;
29772
0
    o[0] = vx;
29773
0
    o[1] = vy;
29774
0
    o[2] = vz;
29775
0
    o[3] = vw;
29776
0
    o += 4;
29777
29778
0
    n -= sizeof(uint8_t);
29779
0
  } while (n != 0);
29780
0
}
29781
29782
void xnn_x8_zip_xm_ukernel__scalar(
29783
    size_t n,
29784
    size_t m,
29785
    const uint8_t* input,
29786
    uint8_t* output)
29787
0
{
29788
0
  assert(n != 0);
29789
0
  assert(m >= 4);
29790
29791
0
  size_t k = n;
29792
0
  do {
29793
0
    size_t l = m;
29794
0
    const uint8_t* input_column = input++;
29795
0
    do {
29796
0
      *output++ = *input_column;
29797
0
      input_column = (uint8_t*) ((uintptr_t) input_column + n);
29798
0
    } while (--l != 0);
29799
0
    k -= sizeof(uint8_t);
29800
0
  } while (k != 0);
29801
0
}
29802
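The m-plane variant above generalizes the same interleaving; a hedged sketch with m = 4 planes of n = 3 bytes each follows (both values and the helper name chosen for illustration; the kernel asserts m >= 4).

#include <stdint.h>
#include <xnnpack/zip.h>

// Hypothetical example: interleave m = 4 planes of n = 3 bytes each.
static void zip_xm_example(void) {
  const uint8_t planes[4 * 3] = {
    1,  2,  3,   // plane 0
    4,  5,  6,   // plane 1
    7,  8,  9,   // plane 2
    10, 11, 12,  // plane 3
  };
  uint8_t interleaved[4 * 3];
  // n = bytes per plane, m = number of planes.
  xnn_x8_zip_xm_ukernel__scalar(3 * sizeof(uint8_t), 4, planes, interleaved);
  // interleaved == { 1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 12 }
}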
29803
0
void xnn_xx_copy_ukernel__scalar_memcpy(size_t batch, const void* input, void* output, const void* params) {
29804
0
  assert(batch != 0);
29805
0
  assert(input != NULL);
29806
0
  assert(output != NULL);
29807
29808
0
  memcpy(output, input, batch);
29809
0
}
29810
29811
void xnn_xx_fill_ukernel__scalar_x16(
29812
    size_t rows,
29813
    size_t channels,
29814
    void* output,
29815
    size_t output_stride,
29816
    const uint32_t fill_pattern)
29817
0
{
29818
0
  assert(rows != 0);
29819
0
  assert(channels != 0);
29820
29821
0
  const size_t output_increment = output_stride - channels;
29822
29823
0
  do {
29824
0
    uint32_t vfill_pattern = fill_pattern;
29825
0
    size_t c = channels;
29826
0
    for (; c >= 16 * sizeof(uint8_t); c -= 16 * sizeof(uint8_t)) {
29827
0
      unaligned_indexed_store_u32(output, 0, vfill_pattern);
29828
0
      unaligned_indexed_store_u32(output, 1, vfill_pattern);
29829
0
      unaligned_indexed_store_u32(output, 2, vfill_pattern);
29830
0
      unaligned_indexed_store_u32(output, 3, vfill_pattern);
29831
0
      output = ((uint8_t*) output + 16);
29832
0
    }
29833
0
    if XNN_UNLIKELY(c != 0) {
29834
0
      if XNN_LIKELY(c & (8 * sizeof(uint8_t))) {
29835
0
        unaligned_indexed_store_u32(output, 0, vfill_pattern);
29836
0
        unaligned_indexed_store_u32(output, 1, vfill_pattern);
29837
0
        output = ((uint8_t*) output + 8);
29838
0
      }
29839
0
      if XNN_LIKELY(c & (4 * sizeof(uint8_t))) {
29840
0
        unaligned_store_u32(output, vfill_pattern);
29841
0
        output = ((uint8_t*) output + 4);
29842
0
      }
29843
0
      if XNN_LIKELY(c & (2 * sizeof(uint8_t))) {
29844
0
        unaligned_store_u16(output, (uint16_t) vfill_pattern);
29845
0
        vfill_pattern >>= 16;
29846
0
        output = ((uint8_t*) output + 2);
29847
0
      }
29848
0
      if XNN_LIKELY(c & (1 * sizeof(uint8_t))) {
29849
0
        *((uint8_t*) output) = (uint8_t) vfill_pattern;
29850
0
        output = ((uint8_t*) output + 1);
29851
0
      }
29852
0
    }
29853
0
    output = (void*) ((uintptr_t) output + output_increment);
29854
0
  } while (--rows != 0);
29855
0
}
29856
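A hedged sketch of one way to call the fill kernel above: channels and output_stride are byte counts, and the 32-bit fill_pattern is written four bytes at a time with the tail handled word- and byte-wise. The 2x8 buffer, the 5-byte channel width, the 0xAB pattern, and the helper name are illustrative assumptions.

#include <stdint.h>
#include <xnnpack/fill.h>

// Hypothetical example: fill 5 bytes of each of 2 rows (row stride 8 bytes)
// with the byte 0xAB, replicated into a 32-bit pattern.
static void fill_example(void) {
  uint8_t buffer[2 * 8];
  xnn_xx_fill_ukernel__scalar_x16(
      /*rows=*/2, /*channels=*/5 * sizeof(uint8_t),
      buffer, /*output_stride=*/8 * sizeof(uint8_t),
      /*fill_pattern=*/UINT32_C(0xABABABAB));
  // The first 5 bytes of each 8-byte row are now 0xAB; the trailing 3 bytes
  // of each row are left untouched.
}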
29857
void xnn_xx_pad_ukernel__scalar(
29858
    size_t rows,
29859
    size_t channels,
29860
    size_t pre_padding,
29861
    size_t post_padding,
29862
    const void* input,
29863
    size_t input_stride,
29864
    void* output,
29865
    size_t output_stride,
29866
    const uint32_t fill_pattern) XNN_OOB_READS
29867
0
{
29868
0
  const size_t input_increment = input_stride - channels;
29869
0
  const size_t output_increment = output_stride - (pre_padding + channels + post_padding);
29870
29871
0
  do {
29872
    // Pre-pad input channels.
29873
0
    size_t l = pre_padding;
29874
0
    if XNN_LIKELY(l != 0) {
29875
0
      uint32_t vfill_pattern = fill_pattern;
29876
0
      for (; l >= 4 * sizeof(uint8_t); l -= 4 * sizeof(uint8_t)) {
29877
0
        unaligned_store_u32(output, vfill_pattern);
29878
0
        output = (uint8_t*) output + 4;
29879
0
      }
29880
0
      if XNN_LIKELY(l & (2 * sizeof(uint8_t))) {
29881
0
        unaligned_store_u16(output, (uint16_t) vfill_pattern);
29882
0
        vfill_pattern >>= 16;
29883
0
        output = (uint8_t*) output + 2;
29884
0
      }
29885
0
      if XNN_LIKELY(l & (1 * sizeof(uint8_t))) {
29886
0
        *((uint8_t*) output) = (uint8_t) vfill_pattern;
29887
0
        output = (uint8_t*) output + 1;
29888
0
      }
29889
0
    }
29890
29891
    // Copy input channels.
29892
0
    size_t c = channels;
29893
0
    for (; c >= 16 * sizeof(uint8_t); c -= 16 * sizeof(uint8_t)) {
29894
0
      const uint32_t vdata0 = unaligned_indexed_load_u32(input, 0);
29895
0
      const uint32_t vdata1 = unaligned_indexed_load_u32(input, 1);
29896
0
      const uint32_t vdata2 = unaligned_indexed_load_u32(input, 2);
29897
0
      const uint32_t vdata3 = unaligned_indexed_load_u32(input, 3);
29898
0
      input = (const uint8_t*) input + 16;
29899
29900
0
      unaligned_indexed_store_u32(output, 0, vdata0);
29901
0
      unaligned_indexed_store_u32(output, 1, vdata1);
29902
0
      unaligned_indexed_store_u32(output, 2, vdata2);
29903
0
      unaligned_indexed_store_u32(output, 3, vdata3);
29904
0
      output = (uint8_t*) output + 16;
29905
0
    }
29906
0
    if XNN_UNLIKELY(c != 0) {
29907
0
      for (; c >= 4 * sizeof(uint8_t); c -= 4 * sizeof(uint8_t)) {
29908
0
        unaligned_store_u32(output, unaligned_load_u32(input));
29909
0
        input = (const uint8_t*) input + 4;
29910
0
        output = (uint8_t*) output + 4;
29911
0
      }
29912
0
      if XNN_UNLIKELY(c != 0) {
29913
0
        uint32_t vdata = unaligned_load_u32(input);
29914
0
        input = (const void*) ((uintptr_t) input + c);
29915
29916
0
        if XNN_LIKELY(c & (2 * sizeof(uint8_t))) {
29917
0
          unaligned_store_u16(output, (uint16_t) vdata);
29918
0
          vdata >>= 16;
29919
0
          output = (uint8_t*) output + 2;
29920
0
        }
29921
0
        if XNN_LIKELY(c & (1 * sizeof(uint8_t))) {
29922
0
          *((uint8_t*) output) = (uint8_t) vdata;
29923
0
          output = (uint8_t*) output + 1;
29924
0
        }
29925
0
      }
29926
0
    }
29927
29928
    // Post-pad input channels.
29929
0
    size_t r = post_padding;
29930
0
    if XNN_LIKELY(r != 0) {
29931
0
      uint32_t vfill_pattern = fill_pattern;
29932
0
      for (; r >= 4 * sizeof(uint8_t); r -= 4 * sizeof(uint8_t)) {
29933
0
        unaligned_store_u32(output, vfill_pattern);
29934
0
        output = (uint8_t*) output + 4;
29935
0
      }
29936
0
      if XNN_LIKELY(r & (2 * sizeof(uint8_t))) {
29937
0
        unaligned_store_u16(output, (uint16_t) vfill_pattern);
29938
0
        vfill_pattern >>= 16;
29939
0
        output = (uint8_t*) output + 2;
29940
0
      }
29941
0
      if XNN_LIKELY(r & (1 * sizeof(uint8_t))) {
29942
0
        *((uint8_t*) output) = (uint8_t) vfill_pattern;
29943
0
        output = (uint8_t*) output + 1;
29944
0
      }
29945
0
    }
29946
29947
0
    input = (const uint32_t*) ((uintptr_t) input + input_increment);
29948
0
    output = (uint32_t*) ((uintptr_t) output + output_increment);
29949
0
  } while (--rows != 0);
29950
0
}
29951
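A hedged sketch of the pad kernel above: each row of channels bytes is copied and framed by pre_padding and post_padding bytes drawn from fill_pattern, with all sizes and strides in bytes. The 2x4 input, the zero fill, the padding widths, and the helper name are illustrative assumptions.

#include <stdint.h>
#include <xnnpack/pad.h>

// Hypothetical example: pad each 4-byte row with 2 zero bytes on the left and
// 1 zero byte on the right, producing 7-byte output rows.
static void pad_example(void) {
  const uint8_t input[2][4] = {
    {1, 2, 3, 4},
    {5, 6, 7, 8},
  };
  uint8_t output[2][7];
  xnn_xx_pad_ukernel__scalar(
      /*rows=*/2, /*channels=*/4 * sizeof(uint8_t),
      /*pre_padding=*/2, /*post_padding=*/1,
      input, /*input_stride=*/4 * sizeof(uint8_t),
      output, /*output_stride=*/7 * sizeof(uint8_t),
      /*fill_pattern=*/UINT32_C(0));
  // Each output row is now { 0, 0, r0, r1, r2, r3, 0 }.
}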
29952
void xnn_xx_transposev_ukernel__1x1_scalar_memcpy(
29953
    const void* input,
29954
    void* output,
29955
    size_t input_row_stride,
29956
    size_t output_row_stride,
29957
    size_t input_element_stride,
29958
    size_t output_element_stride,
29959
    size_t element_size,
29960
    size_t block_width,
29961
    size_t block_height)
29962
0
{
29963
0
  const size_t input_reset = input_element_stride - block_height * input_row_stride;
29964
0
  const size_t output_reset = output_row_stride - block_height * output_element_stride;
29965
29966
0
  const void* i = (const void*) input;
29967
0
  void* o = (void*) output;
29968
29969
0
  do {
29970
0
    size_t bh = block_height;
29971
0
    for (; bh >= 1; bh -= 1) {
29972
0
      memcpy(o, i, element_size);
29973
0
      i = (const void*) ((uintptr_t) i + input_row_stride);
29974
0
      o = (void*) ((uintptr_t) o + output_element_stride);
29975
0
    }
29976
29977
0
    i = (const void*) ((uintptr_t) i + input_reset);
29978
0
    o = (void*) ((uintptr_t) o + output_reset);
29979
0
    block_width -= 1;
29980
0
  } while (block_width != 0);
29981
0
}
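Finally, a hedged sketch of the memcpy-based variable-size transpose above. The element at input + r*input_row_stride + c*input_element_stride is copied to output + c*output_row_stride + r*output_element_stride, so dense row-major strides on both sides give an ordinary matrix transpose; the 2x3 matrix of 2-byte elements and the helper name below are illustrative assumptions.

#include <stddef.h>
#include <stdint.h>
#include <xnnpack/transpose.h>

// Hypothetical example: transpose a dense 2x3 matrix of 2-byte elements into a
// dense 3x2 matrix; all strides are in bytes.
static void transposev_example(void) {
  const uint8_t input[2][3][2] = {
    {{0x10, 0x11}, {0x20, 0x21}, {0x30, 0x31}},
    {{0x40, 0x41}, {0x50, 0x51}, {0x60, 0x61}},
  };
  uint8_t output[3][2][2];
  const size_t element_size = 2;
  xnn_xx_transposev_ukernel__1x1_scalar_memcpy(
      input, output,
      /*input_row_stride=*/3 * element_size,
      /*output_row_stride=*/2 * element_size,
      /*input_element_stride=*/element_size,
      /*output_element_stride=*/element_size,
      element_size, /*block_width=*/3, /*block_height=*/2);
  // output[c][r] now holds the 2-byte element that was at input[r][c].
}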