Coverage Report

Created: 2023-09-25 06:31

/src/xnnpack/src/packing.c
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) Facebook, Inc. and its affiliates.
2
// All rights reserved.
3
//
4
// Copyright 2019 Google LLC
5
//
6
// This source code is licensed under the BSD-style license found in the
7
// LICENSE file in the root directory of this source tree.
8
9
#include <stdint.h>
10
#include <stddef.h>
11
#include <string.h>
12
13
#include <fp16/fp16.h>
14
15
#include <xnnpack/log.h>
16
#include <xnnpack/math.h>
17
#include <xnnpack/operator.h>
18
#include <xnnpack/pack.h>
19
#include <xnnpack/unaligned.h>
20
21
22
void xnn_pack_f32_gemm_goi_w(
23
  size_t g,
24
  size_t nc,
25
  size_t kc,
26
  size_t nr,
27
  size_t kr,
28
  size_t sr,
29
  const float* k,
30
  const float* b,
31
  const void* scale,
32
  float* packed_weights,
33
  size_t extra_bytes,
34
  const void* params)
35
0
{
36
0
  assert(g != 0);
37
0
  assert(nr >= sr);
38
0
  assert(k != NULL);
39
0
  assert(packed_weights != NULL);
40
41
0
  const size_t skr = sr * kr;
42
0
  do {
43
0
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
44
0
      const size_t nr_block_size = min(nc - nr_block_start, nr);
45
0
      if XNN_LIKELY(b != NULL) {
46
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
47
0
          packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
48
0
        }
49
0
      } else {
50
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
51
0
          packed_weights[nr_block_offset] = 0;
52
0
        }
53
0
      }
54
0
      packed_weights += nr;
55
56
0
      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
57
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
58
0
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
59
0
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
60
0
            if (kc_idx < kc) {
61
0
              packed_weights[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
62
0
            }
63
0
          }
64
0
          packed_weights += kr;
65
0
        }
66
0
        packed_weights += (nr - nr_block_size) * kr;
67
0
      }
68
0
      packed_weights = (float*) ((uintptr_t) packed_weights + extra_bytes);
69
0
    }
70
0
    k += nc * kc;
71
0
    if XNN_UNPREDICTABLE(b != NULL) {
72
0
      b += nc;
73
0
    }
74
0
  } while (--g != 0);
75
0
}
76
77
void xnn_pack_f16_gemm_goi_w(
78
  size_t g,
79
  size_t nc,
80
  size_t kc,
81
  size_t nr,
82
  size_t kr,
83
  size_t sr,
84
  const uint16_t* k,
85
  const uint16_t* b,
86
  const void* scale,
87
  uint16_t* packed_weights,
88
  size_t extra_bytes,
89
  const void* params)
90
0
{
91
0
  assert(g != 0);
92
0
  assert(nr >= sr);
93
0
  assert(k != NULL);
94
0
  assert(packed_weights != NULL);
95
96
0
  const size_t skr = sr * kr;
97
0
  do {
98
0
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
99
0
      const size_t nr_block_size = min(nc - nr_block_start, nr);
100
0
      if XNN_LIKELY(b != NULL) {
101
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
102
0
          packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
103
0
        }
104
0
      } else {
105
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
106
0
          packed_weights[nr_block_offset] = 0;
107
0
        }
108
0
      }
109
0
      packed_weights += nr;
110
111
0
      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
112
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
113
0
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
114
0
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
115
0
            if (kc_idx < kc) {
116
0
              packed_weights[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
117
0
            }
118
0
          }
119
0
          packed_weights += kr;
120
0
        }
121
0
        packed_weights += (nr - nr_block_size) * kr;
122
0
      }
123
0
      packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
124
0
    }
125
0
    k += nc * kc;
126
0
    if XNN_UNPREDICTABLE(b != NULL) {
127
0
      b += nc;
128
0
    }
129
0
  } while (--g != 0);
130
0
}
131
132
void xnn_pack_f32_to_f16_gemm_goi_w(
133
  size_t g,
134
  size_t nc,
135
  size_t kc,
136
  size_t nr,
137
  size_t kr,
138
  size_t sr,
139
  const float* k,
140
  const float* b,
141
  const void* scale,
142
  uint16_t* packed_weights,
143
  size_t extra_bytes,
144
  const void* params)
145
0
{
146
0
  assert(g != 0);
147
0
  assert(nr >= sr);
148
0
  assert(k != NULL);
149
0
  assert(packed_weights != NULL);
150
151
0
  const size_t skr = sr * kr;
152
0
  do {
153
0
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
154
0
      const size_t nr_block_size = min(nc - nr_block_start, nr);
155
0
      if XNN_LIKELY(b != NULL) {
156
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
157
0
          packed_weights[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
158
0
        }
159
0
      } else {
160
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
161
0
          packed_weights[nr_block_offset] = 0;
162
0
        }
163
0
      }
164
0
      packed_weights += nr;
165
166
0
      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
167
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
168
0
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
169
0
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
170
0
            if (kc_idx < kc) {
171
0
              packed_weights[kr_block_offset] = fp16_ieee_from_fp32_value(k[(nr_block_start + nr_block_offset) * kc + kc_idx]);
172
0
            }
173
0
          }
174
0
          packed_weights += kr;
175
0
        }
176
0
        packed_weights += (nr - nr_block_size) * kr;
177
0
      }
178
0
      packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
179
0
    }
180
0
    k += nc * kc;
181
0
    if XNN_UNPREDICTABLE(b != NULL) {
182
0
      b += nc;
183
0
    }
184
0
  } while (--g != 0);
185
0
}
186
187
void xnn_pack_qu8_gemm_goi_w(
188
  size_t g,
189
  size_t nc,
190
  size_t kc,
191
  size_t nr,
192
  size_t kr,
193
  size_t sr,
194
  const uint8_t* k,
195
  const int32_t* b,
196
  const void* scale,
197
  void* packed_weights,
198
  size_t extra_bytes,
199
  const struct xnn_qu8_packing_params* params)
200
0
{
201
0
  assert(g != 0);
202
0
  assert(nr >= sr);
203
0
  assert(k != NULL);
204
0
  assert(packed_weights != NULL);
205
206
0
  const size_t skr = sr * kr;
207
0
  const int32_t izp = (int32_t) params->input_zero_point;
208
0
  const int32_t bzp = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
209
0
  do {
210
0
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
211
0
      const size_t nr_block_size = min(nc - nr_block_start, nr);
212
0
      int32_t* packed_b = (int32_t*) packed_weights;
213
0
      if XNN_LIKELY(b != NULL) {
214
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
215
0
          unaligned_store_s32(packed_weights, bzp + b[nr_block_start + nr_block_offset]);
216
0
          packed_weights = (int32_t*) packed_weights + 1;
217
0
        }
218
0
      } else {
219
0
        size_t n = nr_block_size;
220
0
        do {
221
0
          unaligned_store_s32(packed_weights, bzp);
222
0
          packed_weights = (int32_t*) packed_weights + 1;
223
0
        } while (--n != 0);
224
0
      }
225
0
      packed_weights = (int32_t*) packed_weights + (nr - nr_block_size);
226
227
0
      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
228
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
229
0
          int32_t ksum = 0;
230
0
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
231
0
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
232
0
            if (kc_idx < kc) {
233
0
              const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
234
0
              ksum += (int32_t) kv;
235
0
              ((uint8_t*) packed_weights)[kr_block_offset] = kv;
236
0
            }
237
0
          }
238
0
          unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - ksum * izp);
239
0
          packed_weights = (uint8_t*) packed_weights + kr;
240
0
        }
241
0
        packed_weights = (uint8_t*) packed_weights + (nr - nr_block_size) * kr;
242
0
      }
243
0
      packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
244
0
    }
245
0
    k += nc * kc;
246
0
    if XNN_UNPREDICTABLE(b != NULL) {
247
0
      b += nc;
248
0
    }
249
0
  } while (--g != 0);
250
0
}
251
252
void xnn_pack_qs8_gemm_goi_w(
253
  size_t g,
254
  size_t nc,
255
  size_t kc,
256
  size_t nr,
257
  size_t kr,
258
  size_t sr,
259
  const int8_t* k,
260
  const int32_t* b,
261
  const float* scale,
262
  void* packed_weights,
263
  size_t extra_bytes,
264
  const struct xnn_qs8_packing_params* params)
265
0
{
266
0
  assert(g != 0);
267
0
  assert(nr >= sr);
268
0
  assert(k != NULL);
269
0
  assert(packed_weights != NULL);
270
271
0
  const size_t skr = sr * kr;
272
0
  const uint32_t izp = (uint32_t) params->input_zero_point;
273
0
  do {
274
0
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
275
0
      const size_t nr_block_size = min(nc - nr_block_start, nr);
276
0
      int32_t* packed_b = (int32_t*) packed_weights;
277
0
      if XNN_LIKELY(b != NULL) {
278
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
279
0
          unaligned_store_s32(packed_weights, b[nr_block_start + nr_block_offset]);
280
0
          packed_weights = (int32_t*) packed_weights + 1;
281
0
        }
282
0
      } else {
283
0
        size_t n = nr_block_size;
284
0
        do {
285
0
          unaligned_store_s32(packed_weights, 0);
286
0
          packed_weights = (int32_t*) packed_weights + 1;
287
0
        } while (--n != 0);
288
0
      }
289
0
      packed_weights = (int32_t*) packed_weights + (nr - nr_block_size);
290
291
0
      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
292
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
293
0
          uint32_t ksum = 0;
294
0
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
295
0
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
296
0
            if (kc_idx < kc) {
297
0
              const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
298
0
              ksum += (uint32_t) kv;
299
0
              ((int8_t*) packed_weights)[kr_block_offset] = kv;
300
0
            }
301
0
          }
302
0
          unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
303
0
          packed_weights = (int8_t*) packed_weights + kr;
304
0
        }
305
0
        packed_weights = (int8_t*) packed_weights + (nr - nr_block_size) * kr;
306
0
      }
307
0
      packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
308
0
    }
309
0
    k += nc * kc;
310
0
    if XNN_UNPREDICTABLE(b != NULL) {
311
0
      b += nc;
312
0
    }
313
0
  } while (--g != 0);
314
0
}
315
316
// Same as the qs8 packing, but ksum accumulates both 4-bit nibbles of every weight byte.
317
void xnn_pack_qs8_qc4w_gemm_goi_w(
318
  size_t g,
319
  size_t nc,
320
  size_t kc,
321
  size_t nr,
322
  size_t kr,
323
  size_t sr,
324
  const uint8_t* k,
325
  const int32_t* b,
326
  const float* scale,
327
  void* packed_weights,
328
  size_t extra_bytes,
329
  const struct xnn_qs8_qc4w_packing_params* params)
330
0
{
331
0
  assert(g != 0);
332
0
  assert(nr >= sr);
333
0
  assert(k != NULL);
334
0
  assert(packed_weights != NULL);
335
336
0
  const size_t kb = (kc + 1) >> 1;
337
0
  const size_t skr = sr * kr;
338
0
  const uint32_t izp = (uint32_t) params->input_zero_point;
339
0
  const uint32_t kzp = (uint32_t) params->kernel_zero_point;
340
0
  assert(kzp == 8);
341
0
  do {
342
0
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
343
0
      const size_t nr_block_size = min(nc - nr_block_start, nr);
344
0
      int32_t* packed_b = (int32_t*) packed_weights;
345
0
      if XNN_LIKELY(b != NULL) {
346
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
347
0
          unaligned_store_s32(packed_weights, b[nr_block_start + nr_block_offset]);
348
0
          packed_weights = (int32_t*) packed_weights + 1;
349
0
        }
350
0
      } else {
351
0
        size_t n = nr_block_size;
352
0
        do {
353
0
          unaligned_store_s32(packed_weights, 0);
354
0
          packed_weights = (int32_t*) packed_weights + 1;
355
0
        } while (--n != 0);
356
0
      }
357
0
      packed_weights = (int32_t*) packed_weights + (nr - nr_block_size);
358
359
0
      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kb, skr); kr_block_start += kr) {
360
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
361
0
          uint32_t ksum = 0;
362
0
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
363
0
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
364
0
            if ((kc_idx + 1) == kb && (kc & 1)) {
365
0
              const uint8_t kv = (k[(nr_block_start + nr_block_offset) * kb + kc_idx] & UINT8_C(0xF)) | (kzp << 4);
366
0
              ksum += ((uint32_t) kv & UINT8_C(0xF)) - kzp;
367
0
              ((uint8_t*) packed_weights)[kr_block_offset] = kv ^ 0x88;
368
0
            } else if (kc_idx < kb) {
369
0
              const uint8_t kv = k[(nr_block_start + nr_block_offset) * kb + kc_idx];
370
0
              ksum += ((uint32_t) kv & UINT8_C(0xF)) + ((uint32_t) (kv >> 4)) - kzp * 2;
371
0
              ((uint8_t*) packed_weights)[kr_block_offset] = kv ^ 0x88;;
372
0
            } else {
373
0
              const uint8_t kv = kzp | (kzp << 4);
374
0
              ((uint8_t*) packed_weights)[kr_block_offset] = kv ^ 0x88;;
375
0
            }
376
0
          }
377
0
          unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp * 16);
378
0
          packed_weights = (uint8_t*) packed_weights + kr;
379
0
        }
380
0
        packed_weights = (uint8_t*) packed_weights + (nr - nr_block_size) * kr;
381
0
      }
382
0
      packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
383
0
    }
384
0
    k += nc * kb;
385
0
    if XNN_UNPREDICTABLE(b != NULL) {
386
0
      b += nc;
387
0
    }
388
0
  } while (--g != 0);
389
0
}
390
391
void xnn_pack_qs8_qc4w_gemm_gio_w(
392
  size_t g,
393
  size_t nc,
394
  size_t kc,
395
  size_t nr,
396
  size_t kr,
397
  size_t sr,
398
  size_t k_stride,
399
  const uint8_t* k,
400
  const int32_t* b,
401
  const float* scale,
402
  void* packed_weights,
403
  size_t extra_bytes,
404
  const struct xnn_qs8_qc4w_packing_params* params)
405
0
{
406
0
  assert(g != 0);
407
0
  assert(nr >= sr);
408
0
  assert(k != NULL);
409
0
  assert(packed_weights != NULL);
410
411
0
  const size_t kb = (kc + 1) >> 1;
412
  // Weights in GIO are padded with 0 in the O dimension if O is odd. k_stride is given in number of elements, convert
413
  // it to bytes since each element is half-byte.
414
0
  const size_t k_stride_nibble = (k_stride + 1) >> 1;
415
0
  const size_t skr = sr * kr;
416
0
  const uint32_t izp = (uint32_t) params->input_zero_point;
417
0
  const uint32_t kzp = (uint32_t) params->kernel_zero_point;
418
0
  assert(kzp == 8);
419
0
  do {
420
0
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
421
0
      const size_t nr_block_size = min(nc - nr_block_start, nr);
422
0
      int32_t* packed_b = (int32_t*) packed_weights;
423
0
      if XNN_LIKELY(b != NULL) {
424
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
425
0
          unaligned_store_s32(packed_weights, b[nr_block_start + nr_block_offset]);
426
0
          packed_weights = (int32_t*) packed_weights + 1;
427
0
        }
428
0
      } else {
429
0
        size_t n = nr_block_size;
430
0
        do {
431
0
          unaligned_store_s32(packed_weights, 0);
432
0
          packed_weights = (int32_t*) packed_weights + 1;
433
0
        } while (--n != 0);
434
0
      }
435
0
      packed_weights = (int32_t*) packed_weights + (nr - nr_block_size);
436
437
      // GEMM microkernels require that 2 half-byte weights are packed side by side. Since the layout is GIO, we need to
438
      // pack weights from row i and row i + 1.
439
0
      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kb, skr); kr_block_start += kr) {
440
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
441
0
          uint32_t ksum = 0;
442
          // Output channels only take up half-bytes, so we need to round it down to index into uint8_t correctly.
443
0
          const size_t rounded_nr_index = (nr_block_start + nr_block_offset) >> 1;
444
0
          bool odd_nr_index = ((nr_block_start + nr_block_offset) & 1) != 0;
445
0
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
446
0
            const size_t kc_idx =
447
0
              round_down_po2(kr_block_start, skr) +
448
0
              ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
449
0
            if ((kc_idx + 1) == kb && (kc & 1)) {
450
              // kc_idx needs to be multiplied by 2 because when we are writing K-th input channel of the packed weight,
451
              // it is reading (K*2)-th row of the weight, since 2 rows make up 1 byte.
452
0
              uint8_t kv = (k[(kc_idx * 2) * k_stride_nibble + rounded_nr_index]);
453
0
              kv = odd_nr_index ? (kv >> 4) : (kv & 0xF);
454
0
              ksum += (uint32_t) kv - kzp;
455
              // Fill in the top with kernel zero point.
456
0
              kv = kv | (kzp << 4);
457
0
              ((uint8_t*) packed_weights)[kr_block_offset] = kv ^ 0x88;
458
0
            } else if (kc_idx < kb) {
459
              // Read 2 rows of half-byte weights.
460
0
              uint8_t kv_bot = k[(kc_idx * 2) * k_stride_nibble + rounded_nr_index];
461
0
              uint8_t kv_top = k[(kc_idx * 2 + 1) * k_stride_nibble + rounded_nr_index];
462
0
              kv_bot = odd_nr_index ? (kv_bot >> 4) : (kv_bot & 0xF);
463
0
              kv_top = odd_nr_index ? (kv_top >> 4) : (kv_top & 0xF);
464
0
              ksum += (uint32_t) kv_top + (uint32_t) kv_bot - kzp * 2;
465
0
              const uint8_t kv = (kv_top << 4) | kv_bot;
466
0
              ((uint8_t*) packed_weights)[kr_block_offset] = kv ^ 0x88;
467
0
            } else {
468
0
              const uint8_t kv = kzp | (kzp << 4);
469
0
              ((uint8_t*) packed_weights)[kr_block_offset] = kv ^ 0x88;
470
0
            }
471
0
          }
472
0
          unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp * 16);
473
0
          packed_weights = (uint8_t*) packed_weights + kr;
474
0
        }
475
0
        packed_weights = (uint8_t*) packed_weights + (nr - nr_block_size) * kr;
476
0
      }
477
0
      packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
478
0
    }
479
0
    const size_t padded_nc_in_bytes = (nc + 1) >> 1;
480
0
    k += padded_nc_in_bytes * kc;
481
0
    if XNN_UNPREDICTABLE(b != NULL) {
482
0
      b += nc;
483
0
    }
484
0
  } while (--g != 0);
485
0
}
486
487
void xnn_pack_f32_qs8w_gemm_goi_w(
488
  size_t g,
489
  size_t nc,
490
  size_t kc,
491
  size_t nr,
492
  size_t kr,
493
  size_t sr,
494
  const int8_t* k,
495
  const float* bias,
496
  const float* scale,
497
  void* packed_weights,
498
  size_t extra_bytes,
499
  const void* params)
500
0
{
501
0
  assert(g != 0);
502
0
  assert(nr >= sr);
503
0
  assert(k != NULL);
504
0
  assert(packed_weights != NULL);
505
506
0
  const int32_t* b = (const int32_t*) bias;
507
0
  const size_t skr = sr * kr;
508
0
  do {
509
0
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
510
0
      const size_t nr_block_size = min(nc - nr_block_start, nr);
511
0
      if XNN_LIKELY(b != NULL) {
512
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
513
0
          unaligned_store_s32(packed_weights, b[nr_block_start + nr_block_offset]);
514
0
          packed_weights = (int32_t*) packed_weights + 1;
515
0
        }
516
0
      } else {
517
0
        size_t n = nr_block_size;
518
0
        do {
519
0
          unaligned_store_s32(packed_weights, 0);
520
0
          packed_weights = (int32_t*) packed_weights + 1;
521
0
        } while (--n != 0);
522
0
      }
523
0
      packed_weights = (int32_t*) packed_weights + (nr - nr_block_size);
524
525
0
      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
526
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
527
0
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
528
0
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
529
0
            if (kc_idx < kc) {
530
0
              const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
531
0
              ((int8_t*) packed_weights)[kr_block_offset] = kv;
532
0
            }
533
0
          }
534
0
          packed_weights = (int8_t*) packed_weights + kr;
535
0
        }
536
0
        packed_weights = (int8_t*) packed_weights + (nr - nr_block_size) * kr;
537
0
      }
538
0
      packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
539
0
    }
540
0
    k += nc * kc;
541
0
    if XNN_UNPREDICTABLE(b != NULL) {
542
0
      b += nc;
543
0
    }
544
0
  } while (--g != 0);
545
0
}
546
547
// qs4 packs 2 columns into 2 rows.
548
// kc can be odd.  assume k values in a row are padded to a byte boundary
549
void xnn_pack_f32_qc4w_gemm_goi_w(
550
  size_t g,
551
  size_t nc,
552
  size_t kc,
553
  size_t nr,
554
  size_t kr,
555
  size_t sr,
556
  const void* k,  // 4 bit values
557
  const float* bias,
558
  const float* scale,
559
  void* packed_weights,
560
  size_t extra_bytes,
561
  const void* params)
562
0
{
563
0
  assert(g != 0);
564
0
  assert(nr >= sr);
565
0
  assert(k != NULL);
566
0
  assert(packed_weights != NULL);
567
568
0
  kc = (kc + 1) >> 1;
569
0
  const int32_t* b = (const int32_t*) bias;
570
0
  const size_t skr = sr * kr;
571
0
  do {
572
0
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
573
0
      const size_t nr_block_size = min(nc - nr_block_start, nr);
574
0
      if XNN_LIKELY(b != NULL) {
575
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
576
0
          unaligned_store_s32(packed_weights, b[nr_block_start + nr_block_offset]);
577
0
          packed_weights = (int32_t*) packed_weights + 1;
578
0
        }
579
0
      } else {
580
0
        size_t n = nr_block_size;
581
0
        do {
582
0
          unaligned_store_s32(packed_weights, 0);
583
0
          packed_weights = (int32_t*) packed_weights + 1;
584
0
        } while (--n != 0);
585
0
      }
586
0
      packed_weights = (int32_t*) packed_weights + (nr - nr_block_size);
587
588
0
      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
589
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
590
0
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
591
0
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
592
0
            if (kc_idx < kc) {
593
0
              const uint8_t kv = ((const uint8_t*) k)[(nr_block_start + nr_block_offset) * kc + kc_idx];
594
0
              ((uint8_t*) packed_weights)[kr_block_offset] = kv;
595
0
            }
596
0
          }
597
0
          packed_weights = (uint8_t*) packed_weights + kr;
598
0
        }
599
0
        packed_weights = (uint8_t*) packed_weights + (nr - nr_block_size) * kr;
600
0
      }
601
0
      packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
602
0
    }
603
0
    k = (const uint8_t*) k + nc * kc;
604
0
    if XNN_UNPREDICTABLE(b != NULL) {
605
0
      b += nc;
606
0
    }
607
0
  } while (--g != 0);
608
0
}
609
610
void xnn_pack_qs8_gemm_xw_goi_w(
611
  size_t g,
612
  size_t nc,
613
  size_t kc,
614
  size_t nr,
615
  size_t kr,
616
  size_t sr,
617
  const int8_t* k,
618
  const int32_t* b,
619
  const float* scale,
620
  void* packed_weights,
621
  size_t extra_bytes,
622
  const struct xnn_qs8_packing_params* params)
623
0
{
624
0
  assert(g != 0);
625
0
  assert(nr >= sr);
626
0
  assert(k != NULL);
627
0
  assert(packed_weights != NULL);
628
629
0
  const size_t skr = sr * kr;
630
0
  const uint32_t izp = (uint32_t) params->input_zero_point;
631
0
  do {
632
0
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
633
0
      const size_t nr_block_size = min(nc - nr_block_start, nr);
634
0
      int32_t* packed_b = (int32_t*) packed_weights;
635
0
      if XNN_LIKELY(b != NULL) {
636
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
637
0
          unaligned_store_s32(packed_weights, b[nr_block_start + nr_block_offset]);
638
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
639
0
        }
640
0
      } else {
641
0
        size_t n = nr_block_size;
642
0
        do {
643
0
          unaligned_store_s32(packed_weights, 0);
644
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
645
0
        } while (--n != 0);
646
0
      }
647
0
      packed_weights = (void*) ((uintptr_t) packed_weights + (nr - nr_block_size) * sizeof(int32_t));
648
649
0
      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
650
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
651
0
          uint32_t ksum = 0;
652
0
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
653
0
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
654
0
            if (kc_idx < kc) {
655
0
              const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
656
0
              ksum += (uint32_t) kv;
657
0
              ((int16_t*) packed_weights)[kr_block_offset] = (int16_t) kv;
658
0
            }
659
0
          }
660
0
          unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
661
0
          packed_weights = (int16_t*) packed_weights + kr;
662
0
        }
663
0
        packed_weights = (int16_t*) packed_weights + (nr - nr_block_size) * kr;
664
0
      }
665
0
      packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
666
0
    }
667
0
    k += nc * kc;
668
0
    if XNN_UNPREDICTABLE(b != NULL) {
669
0
      b += nc;
670
0
    }
671
0
  } while (--g != 0);
672
0
}
673
674
void xnn_pack_f32_gemm_gio_w(
675
  size_t g,
676
  size_t nc,
677
  size_t kc,
678
  size_t nr,
679
  size_t kr,
680
  size_t sr,
681
  size_t k_stride,
682
  const float* k,
683
  const float* b,
684
  const void* scale,
685
  float* packed_weights,
686
  size_t extra_bytes,
687
  const void* params)
688
0
{
689
0
  assert(g != 0);
690
0
  assert(nr >= sr);
691
0
  assert(k != NULL);
692
0
  assert(packed_weights != NULL);
693
694
0
  const size_t skr = sr * kr;
695
0
  do {
696
0
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
697
0
      const size_t nr_block_size = min(nc - nr_block_start, nr);
698
0
      if XNN_LIKELY(b != NULL) {
699
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
700
0
          packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
701
0
        }
702
0
      } else {
703
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
704
0
          packed_weights[nr_block_offset] = 0.0f;
705
0
        }
706
0
      }
707
0
      packed_weights += nr;
708
709
0
      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
710
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
711
0
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
712
0
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
713
0
            if (kc_idx < kc) {
714
0
              packed_weights[kr_block_offset] = k[kc_idx * k_stride + nr_block_start + nr_block_offset];
715
0
            }
716
0
          }
717
0
          packed_weights += kr;
718
0
        }
719
0
        packed_weights += (nr - nr_block_size) * kr;
720
0
      }
721
0
      packed_weights = (float*) ((uintptr_t) packed_weights + extra_bytes);
722
0
    }
723
0
    k += nc * kc;
724
0
    if XNN_UNPREDICTABLE(b != NULL) {
725
0
      b += nc;
726
0
    }
727
0
  } while (--g != 0);
728
0
}
729
730
void xnn_pack_f16_gemm_gio_w(
731
  size_t g,
732
  size_t nc,
733
  size_t kc,
734
  size_t nr,
735
  size_t kr,
736
  size_t sr,
737
  size_t k_stride,
738
  const uint16_t* k,
739
  const uint16_t* b,
740
  const void* scale,
741
  uint16_t* packed_weights,
742
  size_t extra_bytes,
743
  const void* params)
744
0
{
745
0
  assert(g != 0);
746
0
  assert(nr >= sr);
747
0
  assert(k != NULL);
748
0
  assert(packed_weights != NULL);
749
750
0
  const size_t skr = sr * kr;
751
0
  do {
752
0
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
753
0
      const size_t nr_block_size = min(nc - nr_block_start, nr);
754
0
      if XNN_LIKELY(b != NULL) {
755
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
756
0
          packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
757
0
        }
758
0
      } else {
759
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
760
0
          packed_weights[nr_block_offset] = UINT16_C(0);
761
0
        }
762
0
      }
763
0
      packed_weights += nr;
764
765
0
      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
766
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
767
0
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
768
0
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
769
0
            if (kc_idx < kc) {
770
0
              packed_weights[kr_block_offset] = k[kc_idx * k_stride + nr_block_start + nr_block_offset];
771
0
            }
772
0
          }
773
0
          packed_weights += kr;
774
0
        }
775
0
        packed_weights += (nr - nr_block_size) * kr;
776
0
      }
777
0
      packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
778
0
    }
779
0
    k += nc * kc;
780
0
    if XNN_UNPREDICTABLE(b != NULL) {
781
0
      b += nc;
782
0
    }
783
0
  } while (--g != 0);
784
0
}
785
786
void xnn_pack_f32_to_f16_gemm_gio_w(
787
  size_t g,
788
  size_t nc,
789
  size_t kc,
790
  size_t nr,
791
  size_t kr,
792
  size_t sr,
793
  size_t k_stride,
794
  const float* k,
795
  const float* b,
796
  const void* scale,
797
  uint16_t* packed_weights,
798
  size_t extra_bytes,
799
  const void* params)
800
0
{
801
0
  assert(g != 0);
802
0
  assert(nr >= sr);
803
0
  assert(k != NULL);
804
0
  assert(packed_weights != NULL);
805
806
0
  const size_t skr = sr * kr;
807
0
  do {
808
0
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
809
0
      const size_t nr_block_size = min(nc - nr_block_start, nr);
810
0
      if XNN_LIKELY(b != NULL) {
811
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
812
0
          packed_weights[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
813
0
        }
814
0
      } else {
815
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
816
0
          packed_weights[nr_block_offset] = 0.0f;
817
0
        }
818
0
      }
819
0
      packed_weights += nr;
820
821
0
      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
822
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
823
0
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
824
0
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
825
0
            if (kc_idx < kc) {
826
0
              packed_weights[kr_block_offset] = fp16_ieee_from_fp32_value(k[kc_idx * k_stride + nr_block_start + nr_block_offset]);
827
0
            }
828
0
          }
829
0
          packed_weights += kr;
830
0
        }
831
0
        packed_weights += (nr - nr_block_size) * kr;
832
0
      }
833
0
      packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
834
0
    }
835
0
    k += nc * kc;
836
0
    if XNN_UNPREDICTABLE(b != NULL) {
837
0
      b += nc;
838
0
    }
839
0
  } while (--g != 0);
840
0
}
841
842
void xnn_pack_qu8_gemm_gio_w(
843
  size_t g,
844
  size_t nc,
845
  size_t kc,
846
  size_t nr,
847
  size_t kr,
848
  size_t sr,
849
  size_t k_stride,
850
  const uint8_t* k,
851
  const int32_t* b,
852
  const void* scale,
853
  void* packed_weights,
854
  size_t extra_bytes,
855
  const struct xnn_qu8_packing_params* params)
856
0
{
857
0
  assert(g != 0);
858
0
  assert(nr >= sr);
859
0
  assert(k != NULL);
860
0
  assert(packed_weights != NULL);
861
862
0
  const size_t skr = sr * kr;
863
0
  const int32_t izp = (int32_t) params->input_zero_point;
864
0
  const int32_t bzp = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
865
0
  do {
866
0
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
867
0
      const size_t nr_block_size = min(nc - nr_block_start, nr);
868
0
      int32_t* packed_b = (int32_t*) packed_weights;
869
0
      if XNN_LIKELY(b != NULL) {
870
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
871
0
          unaligned_store_s32(packed_weights, bzp + b[nr_block_start + nr_block_offset]);
872
0
          packed_weights = (int32_t*) packed_weights + 1;
873
0
        }
874
0
      } else {
875
0
        size_t n = nr_block_size;
876
0
        do {
877
0
          unaligned_store_s32(packed_weights, bzp);
878
0
          packed_weights = (int32_t*) packed_weights + 1;
879
0
        } while (--n != 0);
880
0
      }
881
0
      packed_weights = (int32_t*) packed_weights + (nr - nr_block_size);
882
883
0
      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
884
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
885
0
          int32_t ksum = 0;
886
0
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
887
0
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
888
0
            if (kc_idx < kc) {
889
0
              const uint8_t kv = k[kc_idx * k_stride + (nr_block_start + nr_block_offset)];
890
0
              ksum += (int32_t) kv;
891
0
              ((uint8_t*) packed_weights)[kr_block_offset] = kv;
892
0
            }
893
0
          }
894
0
          unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - ksum * izp);
895
0
          packed_weights = (uint8_t*) packed_weights + kr;
896
0
        }
897
0
        packed_weights = (uint8_t*) packed_weights + (nr - nr_block_size) * kr;
898
0
      }
899
0
      packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
900
0
    }
901
0
    k += nc * kc;
902
0
    if XNN_UNPREDICTABLE(b != NULL) {
903
0
      b += nc;
904
0
    }
905
0
  } while (--g != 0);
906
0
}
907
908
void xnn_pack_qs8_gemm_gio_w(
909
  size_t g,
910
  size_t nc,
911
  size_t kc,
912
  size_t nr,
913
  size_t kr,
914
  size_t sr,
915
  size_t k_stride,
916
  const int8_t* k,
917
  const int32_t* b,
918
  const float* scale,
919
  void* packed_weights,
920
  size_t extra_bytes,
921
  const struct xnn_qs8_packing_params* params)
922
0
{
923
0
  assert(g != 0);
924
0
  assert(nr >= sr);
925
0
  assert(k != NULL);
926
0
  assert(packed_weights != NULL);
927
928
0
  const size_t skr = sr * kr;
929
0
  const uint32_t izp = (uint32_t) params->input_zero_point;
930
0
  do {
931
0
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
932
0
      const size_t nr_block_size = min(nc - nr_block_start, nr);
933
0
      int32_t* packed_b = (int32_t*) packed_weights;
934
0
      if XNN_LIKELY(b != NULL) {
935
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
936
0
          unaligned_store_s32(packed_weights, b[nr_block_start + nr_block_offset]);
937
0
          packed_weights = (int32_t*) packed_weights + 1;
938
0
        }
939
0
      } else {
940
0
        size_t n = nr_block_size;
941
0
        do {
942
0
          unaligned_store_s32(packed_weights, 0);
943
0
          packed_weights = (int32_t*) packed_weights + 1;
944
0
        } while (--n != 0);
945
0
      }
946
0
      packed_weights = (uint32_t*) packed_weights + (nr - nr_block_size);
947
948
0
      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
949
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
950
0
          uint32_t ksum = 0;
951
0
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
952
0
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
953
0
            if (kc_idx < kc) {
954
0
              const int8_t kv = k[kc_idx * k_stride + (nr_block_start + nr_block_offset)];
955
0
              ksum += (uint32_t) kv;
956
0
              ((int8_t*) packed_weights)[kr_block_offset] = kv;
957
0
            }
958
0
          }
959
0
          unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
960
0
          packed_weights = (int8_t*) packed_weights + kr;
961
0
        }
962
0
        packed_weights = (int8_t*) packed_weights + (nr - nr_block_size) * kr;
963
0
      }
964
0
      packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
965
0
    }
966
0
    k += nc * kc;
967
0
    if XNN_UNPREDICTABLE(b != NULL) {
968
0
      b += nc;
969
0
    }
970
0
  } while (--g != 0);
971
0
}
972
973
void xnn_pack_f32_qs8w_gemm_gio_w(
974
  size_t g,
975
  size_t nc,
976
  size_t kc,
977
  size_t nr,
978
  size_t kr,
979
  size_t sr,
980
  size_t k_stride,
981
  const int8_t* k,
982
  const float* bias,
983
  const float* scale,
984
  void* packed_weights,
985
  size_t extra_bytes,
986
  const void* params)
987
0
{
988
0
  assert(g != 0);
989
0
  assert(nr >= sr);
990
0
  assert(k != NULL);
991
0
  assert(packed_weights != NULL);
992
993
0
  const int32_t* b = (const int32_t*) bias;
994
0
  const size_t skr = sr * kr;
995
0
  do {
996
0
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
997
0
      const size_t nr_block_size = min(nc - nr_block_start, nr);
998
0
      if XNN_LIKELY(b != NULL) {
999
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1000
0
          unaligned_store_s32(packed_weights, b[nr_block_start + nr_block_offset]);
1001
0
          packed_weights = (int32_t*) packed_weights + 1;
1002
0
        }
1003
0
      } else {
1004
0
        size_t n = nr_block_size;
1005
0
        do {
1006
0
          unaligned_store_s32(packed_weights, 0);
1007
0
          packed_weights = (int32_t*) packed_weights + 1;
1008
0
        } while (--n != 0);
1009
0
      }
1010
0
      packed_weights = (int32_t*) packed_weights + (nr - nr_block_size);
1011
1012
0
      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1013
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1014
0
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1015
0
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1016
0
            if (kc_idx < kc) {
1017
0
              const int8_t kv = k[kc_idx * k_stride + (nr_block_start + nr_block_offset)];
1018
0
              ((int8_t*) packed_weights)[kr_block_offset] = kv;
1019
0
            }
1020
0
          }
1021
0
          packed_weights = (int8_t*) packed_weights + kr;
1022
0
        }
1023
0
        packed_weights = (int8_t*) packed_weights + (nr - nr_block_size) * kr;
1024
0
      }
1025
0
      packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
1026
0
    }
1027
0
    k += nc * kc;
1028
0
    if XNN_UNPREDICTABLE(b != NULL) {
1029
0
      b += nc;
1030
0
    }
1031
0
  } while (--g != 0);
1032
0
}
1033
1034
void xnn_pack_f32_conv_goki_w(
1035
  size_t g,
1036
  size_t nc,
1037
  size_t ks,
1038
  size_t kc,
1039
  size_t nr,
1040
  size_t kr,
1041
  size_t sr,
1042
  const float* k,
1043
  const float* b,
1044
  const void* scale,
1045
  float* packed_weights,
1046
  size_t extra_bytes,
1047
  const void* params)
1048
0
{
1049
0
  assert(g != 0);
1050
0
  assert(nr >= sr);
1051
0
  assert(k != NULL);
1052
0
  assert(packed_weights != NULL);
1053
1054
0
  const size_t skr = sr * kr;
1055
0
  do {
1056
0
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1057
0
      const size_t nr_block_size = min(nc - nr_block_start, nr);
1058
0
      if XNN_LIKELY(b != NULL) {
1059
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1060
0
          packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
1061
0
        }
1062
0
      }
1063
0
      packed_weights += nr;
1064
1065
0
      for (size_t ki = 0; ki < ks; ki++) {
1066
0
        for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1067
0
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1068
0
            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1069
0
              const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1070
0
              if (kc_idx < kc) {
1071
0
                packed_weights[kr_block_offset] = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
1072
0
              }
1073
0
            }
1074
0
            packed_weights += kr;
1075
0
          }
1076
0
          packed_weights += (nr - nr_block_size) * kr;
1077
0
        }
1078
0
      }
1079
0
      packed_weights = (float*) ((uintptr_t) packed_weights + extra_bytes);
1080
0
    }
1081
0
    k += ks * kc * nc;
1082
0
    if XNN_UNPREDICTABLE(b != NULL) {
1083
0
      b += nc;
1084
0
    }
1085
0
  } while (--g != 0);
1086
0
}
1087
1088
void xnn_pack_f16_conv_goki_w(
1089
  size_t g,
1090
  size_t nc,
1091
  size_t ks,
1092
  size_t kc,
1093
  size_t nr,
1094
  size_t kr,
1095
  size_t sr,
1096
  const uint16_t* k,
1097
  const uint16_t* b,
1098
  const void* scale,
1099
  uint16_t* packed_weights,
1100
  size_t extra_bytes,
1101
  const void* params)
1102
0
{
1103
0
  assert(g != 0);
1104
0
  assert(nr >= sr);
1105
0
  assert(k != NULL);
1106
0
  assert(packed_weights != NULL);
1107
1108
0
  const size_t skr = sr * kr;
1109
0
  do {
1110
0
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1111
0
      const size_t nr_block_size = min(nc - nr_block_start, nr);
1112
0
      if XNN_LIKELY(b != NULL) {
1113
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1114
0
          packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
1115
0
        }
1116
0
      }
1117
0
      packed_weights += nr;
1118
1119
0
      for (size_t ki = 0; ki < ks; ki++) {
1120
0
        for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1121
0
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1122
0
            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1123
0
              const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1124
0
              if (kc_idx < kc) {
1125
0
                packed_weights[kr_block_offset] = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
1126
0
              }
1127
0
            }
1128
0
            packed_weights += kr;
1129
0
          }
1130
0
          packed_weights += (nr - nr_block_size) * kr;
1131
0
        }
1132
0
      }
1133
0
      packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
1134
0
    }
1135
0
    k += ks * kc * nc;
1136
0
    if XNN_UNPREDICTABLE(b != NULL) {
1137
0
      b += nc;
1138
0
    }
1139
0
  } while (--g != 0);
1140
0
}
1141
1142
void xnn_pack_f32_to_f16_conv_goki_w(
1143
  size_t g,
1144
  size_t nc,
1145
  size_t ks,
1146
  size_t kc,
1147
  size_t nr,
1148
  size_t kr,
1149
  size_t sr,
1150
  const float* k,
1151
  const float* b,
1152
  const void* scale,
1153
  uint16_t* packed_weights,
1154
  size_t extra_bytes,
1155
  const void* params)
1156
0
{
1157
0
  assert(g != 0);
1158
0
  assert(nr >= sr);
1159
0
  assert(k != NULL);
1160
0
  assert(packed_weights != NULL);
1161
1162
0
  const size_t skr = sr * kr;
1163
0
  do {
1164
0
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1165
0
      const size_t nr_block_size = min(nc - nr_block_start, nr);
1166
0
      if XNN_LIKELY(b != NULL) {
1167
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1168
0
          packed_weights[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
1169
0
        }
1170
0
      }
1171
0
      packed_weights += nr;
1172
1173
0
      for (size_t ki = 0; ki < ks; ki++) {
1174
0
        for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1175
0
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1176
0
            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1177
0
              const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1178
0
              if (kc_idx < kc) {
1179
0
                packed_weights[kr_block_offset] = fp16_ieee_from_fp32_value(k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx]);
1180
0
              }
1181
0
            }
1182
0
            packed_weights += kr;
1183
0
          }
1184
0
          packed_weights += (nr - nr_block_size) * kr;
1185
0
        }
1186
0
      }
1187
0
      packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
1188
0
    }
1189
0
    k += ks * kc * nc;
1190
0
    if XNN_UNPREDICTABLE(b != NULL) {
1191
0
      b += nc;
1192
0
    }
1193
0
  } while (--g != 0);
1194
0
}
1195
1196
void xnn_pack_qu8_conv_goki_w(
1197
  size_t g,
1198
  size_t nc,
1199
  size_t ks,
1200
  size_t kc,
1201
  size_t nr,
1202
  size_t kr,
1203
  size_t sr,
1204
  const uint8_t* k,
1205
  const int32_t* b,
1206
  const void* scale,
1207
  void* packed_weights,
1208
  size_t extra_bytes,
1209
  const struct xnn_qu8_packing_params* params)
1210
0
{
1211
0
  assert(g != 0);
1212
0
  assert(nr >= sr);
1213
0
  assert(k != NULL);
1214
0
  assert(packed_weights != NULL);
1215
1216
0
  const size_t skr = sr * kr;
1217
0
  const int32_t izp = (int32_t) params->input_zero_point;
1218
0
  const int32_t bzp = (int32_t) ks * (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
1219
0
  do {
1220
0
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1221
0
      const size_t nr_block_size = min(nc - nr_block_start, nr);
1222
0
      int32_t* packed_b = (int32_t*) packed_weights;
1223
0
      if XNN_LIKELY(b != NULL) {
1224
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1225
0
          unaligned_store_s32(packed_weights, bzp + b[nr_block_start + nr_block_offset]);
1226
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1227
0
        }
1228
0
      } else {
1229
0
        size_t n = nr_block_size;
1230
0
        do {
1231
0
          unaligned_store_s32(packed_weights, bzp);
1232
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1233
0
        } while (--n != 0);
1234
0
      }
1235
0
      packed_weights = (void*) ((uintptr_t) packed_weights + (nr - nr_block_size) * sizeof(int32_t));
1236
1237
0
      for (size_t ki = 0; ki < ks; ki++) {
1238
0
        for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1239
0
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1240
0
            int32_t ksum = 0;
1241
0
            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1242
0
              const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1243
0
              if (kc_idx < kc) {
1244
0
                const uint8_t kv = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
1245
0
                ksum += (int32_t) kv;
1246
0
                ((uint8_t*) packed_weights)[kr_block_offset] = kv;
1247
0
              }
1248
0
            }
1249
0
            unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - ksum * izp);
1250
0
            packed_weights = (uint8_t*) packed_weights + kr;
1251
0
          }
1252
0
          packed_weights = (uint8_t*) packed_weights + (nr - nr_block_size) * kr;
1253
0
        }
1254
0
      }
1255
0
      packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
1256
0
    }
1257
0
    k += ks * kc * nc;
1258
0
    if XNN_UNPREDICTABLE(b != NULL) {
1259
0
      b += nc;
1260
0
    }
1261
0
  } while (--g != 0);
1262
0
}
1263
1264
void xnn_pack_qs8_conv_goki_w(
1265
  size_t g,
1266
  size_t nc,
1267
  size_t ks,
1268
  size_t kc,
1269
  size_t nr,
1270
  size_t kr,
1271
  size_t sr,
1272
  const int8_t* k,
1273
  const int32_t* b,
1274
  const float* scale,
1275
  void* packed_weights,
1276
  size_t extra_bytes,
1277
  const struct xnn_qs8_packing_params* params)
1278
0
{
1279
0
  assert(g != 0);
1280
0
  assert(nr >= sr);
1281
0
  assert(k != NULL);
1282
0
  assert(packed_weights != NULL);
1283
1284
0
  const size_t skr = sr * kr;
1285
0
  const uint32_t izp = (int32_t) params->input_zero_point;
1286
0
  do {
1287
0
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1288
0
      const size_t nr_block_size = min(nc - nr_block_start, nr);
1289
0
      int32_t* packed_b = (int32_t*) packed_weights;
1290
0
      if XNN_LIKELY(b != NULL) {
1291
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1292
0
          unaligned_store_s32(packed_weights, b[nr_block_start + nr_block_offset]);
1293
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1294
0
        }
1295
0
      } else {
1296
0
        size_t n = nr_block_size;
1297
0
        do {
1298
0
          unaligned_store_s32(packed_weights, 0);
1299
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1300
0
        } while (--n != 0);
1301
0
      }
1302
0
      packed_weights = (void*) ((uintptr_t) packed_weights + (nr - nr_block_size) * sizeof(int32_t));
1303
1304
0
      for (size_t ki = 0; ki < ks; ki++) {
1305
0
        for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1306
0
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1307
0
            uint32_t ksum = 0;
1308
0
            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1309
0
              const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1310
0
              if (kc_idx < kc) {
1311
0
                const int8_t kv = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
1312
0
                ksum += (uint32_t) kv;
1313
0
                ((int8_t*) packed_weights)[kr_block_offset] = kv;
1314
0
              }
1315
0
            }
1316
0
            unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
1317
0
            packed_weights = (int8_t*) packed_weights + kr;
1318
0
          }
1319
0
          packed_weights = (int8_t*) packed_weights + (nr - nr_block_size) * kr;
1320
0
        }
1321
0
      }
1322
0
      packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
1323
0
    }
1324
0
    k += ks * kc * nc;
1325
0
    if XNN_UNPREDICTABLE(b != NULL) {
1326
0
      b += nc;
1327
0
    }
1328
0
  } while (--g != 0);
1329
0
}
1330
1331
void xnn_pack_f32_conv_kgo_w(
1332
  size_t g,
1333
  size_t nc,
1334
  size_t ks,
1335
  size_t nr,
1336
  size_t kr,
1337
  size_t sr,
1338
  const float* k,
1339
  const float* b,
1340
  const void* scale,
1341
  float* packed_weights,
1342
  size_t extra_bytes,
1343
  const void* params)
1344
0
{
1345
0
  assert(g != 0);
1346
0
  assert(nr >= sr);
1347
0
  assert(k != NULL);
1348
0
  assert(packed_weights != NULL);
1349
1350
0
  for (size_t i = 0; i < g; i++) {
1351
0
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1352
0
      const size_t nr_block_size = min(nc - nr_block_start, nr);
1353
0
      if XNN_LIKELY(b != NULL) {
1354
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1355
0
          packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
1356
0
        }
1357
0
      }
1358
0
      packed_weights += nr;
1359
1360
0
      for (size_t ki = 0; ki < ks; ki++) {
1361
0
        for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
1362
0
          for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
1363
0
            packed_weights[nr_block_offset * kr] = k[ki * g * nc + (nr_block_start + nr_block_offset)];
1364
0
          }
1365
0
          packed_weights += nr * kr;
1366
0
        }
1367
0
      }
1368
0
      packed_weights = (float*) ((uintptr_t) packed_weights + extra_bytes);
1369
0
    }
1370
0
    k += nc;
1371
0
    if XNN_UNPREDICTABLE(b != NULL) {
1372
0
      b += nc;
1373
0
    }
1374
0
  }
1375
0
}
1376
1377
void xnn_pack_f16_conv_kgo_w(
1378
  size_t g,
1379
  size_t nc,
1380
  size_t ks,
1381
  size_t nr,
1382
  size_t kr,
1383
  size_t sr,
1384
  const uint16_t* k,
1385
  const uint16_t* b,
1386
  const void* scale,
1387
  uint16_t* packed_weights,
1388
  size_t extra_bytes,
1389
  const void* params)
1390
0
{
1391
0
  assert(g != 0);
1392
0
  assert(nr >= sr);
1393
0
  assert(k != NULL);
1394
0
  assert(packed_weights != NULL);
1395
1396
0
  for (size_t i = 0; i < g; i++) {
1397
0
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1398
0
      const size_t nr_block_size = min(nc - nr_block_start, nr);
1399
0
      if XNN_LIKELY(b != NULL) {
1400
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1401
0
          packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
1402
0
        }
1403
0
      }
1404
0
      packed_weights += nr;
1405
1406
0
      for (size_t ki = 0; ki < ks; ki++) {
1407
0
        for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
1408
0
          for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
1409
0
            packed_weights[nr_block_offset * kr] = k[ki * g * nc + (nr_block_start + nr_block_offset)];
1410
0
          }
1411
0
          packed_weights += nr * kr;
1412
0
        }
1413
0
      }
1414
0
      packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
1415
0
    }
1416
0
    k += nc;
1417
0
    if XNN_UNPREDICTABLE(b != NULL) {
1418
0
      b += nc;
1419
0
    }
1420
0
  }
1421
0
}
1422
1423
void xnn_pack_f32_to_f16_conv_kgo_w(
1424
  size_t g,
1425
  size_t nc,
1426
  size_t ks,
1427
  size_t nr,
1428
  size_t kr,
1429
  size_t sr,
1430
  const float* k,
1431
  const float* b,
1432
  const void* scale,
1433
  uint16_t* packed_weights,
1434
  size_t extra_bytes,
1435
  const void* params)
1436
0
{
1437
0
  assert(g != 0);
1438
0
  assert(nr >= sr);
1439
0
  assert(k != NULL);
1440
0
  assert(packed_weights != NULL);
1441
1442
0
  for (size_t i = 0; i < g; i++) {
1443
0
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1444
0
      const size_t nr_block_size = min(nc - nr_block_start, nr);
1445
0
      if XNN_LIKELY(b != NULL) {
1446
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1447
0
          packed_weights[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
1448
0
        }
1449
0
      }
1450
0
      packed_weights += nr;
1451
1452
0
      for (size_t ki = 0; ki < ks; ki++) {
1453
0
        for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
1454
0
          for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
1455
0
            packed_weights[nr_block_offset * kr] = fp16_ieee_from_fp32_value(k[ki * g * nc + (nr_block_start + nr_block_offset)]);
1456
0
          }
1457
0
          packed_weights += nr * kr;
1458
0
        }
1459
0
      }
1460
0
      packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
1461
0
    }
1462
0
    k += nc;
1463
0
    if XNN_UNPREDICTABLE(b != NULL) {
1464
0
      b += nc;
1465
0
    }
1466
0
  }
1467
0
}
1468
1469
void xnn_pack_qu8_conv_kgo_w(
1470
  size_t g,
1471
  size_t nc,
1472
  size_t ks,
1473
  size_t nr,
1474
  size_t kr,
1475
  size_t sr,
1476
  const uint8_t* k,
1477
  const int32_t* b,
1478
  const void* scale,
1479
  void* packed_weights,
1480
  size_t extra_bytes,
1481
  const struct xnn_qu8_packing_params* params)
1482
0
{
1483
0
  assert(g != 0);
1484
0
  assert(nr >= sr);
1485
0
  assert(k != NULL);
1486
0
  assert(packed_weights != NULL);
1487
1488
0
  const int32_t izp = (int32_t) params->input_zero_point;
1489
0
  const int32_t bzp = (int32_t) ks * izp * (int32_t) params->kernel_zero_point;
1490
0
  for (size_t i = 0; i < g; i++) {
1491
0
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1492
0
      const size_t nr_block_size = min(nc - nr_block_start, nr);
1493
0
      int32_t* packed_b = (int32_t*) packed_weights;
1494
0
      if XNN_LIKELY(b != NULL) {
1495
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1496
0
          unaligned_store_s32(packed_weights, bzp + b[nr_block_start + nr_block_offset]);
1497
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1498
0
        }
1499
0
      } else {
1500
0
        size_t n = nr_block_size;
1501
0
        do {
1502
0
          unaligned_store_s32(packed_weights, bzp);
1503
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1504
0
        } while (--n != 0);
1505
0
      }
1506
0
      packed_weights = (void*) ((uintptr_t) packed_weights + (nr - nr_block_size) * sizeof(int32_t));
1507
1508
0
      for (size_t ki = 0; ki < ks; ki++) {
1509
0
        for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
1510
0
          for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
1511
0
            const uint8_t kv = k[ki * g * nc + (nr_block_start + nr_block_offset)];
1512
0
            ((uint8_t*) packed_weights)[nr_block_offset * kr] = kv;
1513
0
            unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - (int32_t) kv * izp);
1514
0
          }
1515
0
          packed_weights = (uint8_t*) packed_weights + nr * kr;
1516
0
        }
1517
0
      }
1518
0
      packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
1519
0
    }
1520
0
    k += nc;
1521
0
    if XNN_UNPREDICTABLE(b != NULL) {
1522
0
      b += nc;
1523
0
    }
1524
0
  }
1525
0
}
1526
1527
void xnn_pack_qs8_conv_kgo_w(
1528
  size_t g,
1529
  size_t nc,
1530
  size_t ks,
1531
  size_t nr,
1532
  size_t kr,
1533
  size_t sr,
1534
  const int8_t* k,
1535
  const int32_t* b,
1536
  const float* scale,
1537
  void* packed_weights,
1538
  size_t extra_bytes,
1539
  const struct xnn_qs8_packing_params* params)
1540
0
{
1541
0
  assert(g != 0);
1542
0
  assert(nr >= sr);
1543
0
  assert(k != NULL);
1544
0
  assert(packed_weights != NULL);
1545
1546
0
  const uint32_t izp = (uint32_t) params->input_zero_point;
1547
0
  for (size_t i = 0; i < g; i++) {
1548
0
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1549
0
      const size_t nr_block_size = min(nc - nr_block_start, nr);
1550
0
      int32_t* packed_b = (int32_t*) packed_weights;
1551
0
      if XNN_LIKELY(b != NULL) {
1552
0
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1553
0
          unaligned_store_s32(packed_weights, b[nr_block_start + nr_block_offset]);
1554
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1555
0
        }
1556
0
      } else {
1557
0
        size_t n = nr_block_size;
1558
0
        do {
1559
0
          unaligned_store_s32(packed_weights, 0);
1560
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1561
0
        } while (--n != 0);
1562
0
      }
1563
0
      packed_weights = (void*) ((uintptr_t) packed_weights + (nr - nr_block_size) * sizeof(int32_t));
1564
1565
0
      for (size_t ki = 0; ki < ks; ki++) {
1566
0
        for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
1567
0
          for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
1568
0
            const int8_t kv = k[ki * g * nc + (nr_block_start + nr_block_offset)];
1569
0
            ((int8_t*) packed_weights)[nr_block_offset * kr] = kv;
1570
0
            unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - (uint32_t) kv * izp);
1571
0
          }
1572
0
          packed_weights = (int8_t*) packed_weights + nr * kr;
1573
0
        }
1574
0
      }
1575
0
      packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
1576
0
    }
1577
0
    k += nc;
1578
0
    if XNN_UNPREDICTABLE(b != NULL) {
1579
0
      b += nc;
1580
0
    }
1581
0
  }
1582
0
}
1583
1584
void xnn_pack_f32_deconv_goki_w(
1585
  size_t g,
1586
  size_t nc,
1587
  size_t kh,
1588
  size_t kw,
1589
  size_t kc,
1590
  size_t sh,
1591
  size_t sw,
1592
  size_t nr,
1593
  size_t kr,
1594
  size_t sr,
1595
  const float* k,
1596
  const float* b,
1597
  const void* scale,
1598
  float* packed_weights,
1599
  struct subconvolution_params* subconv_params,
1600
  const void* params)
1601
0
{
1602
0
  assert(g != 0);
1603
0
  assert(nr >= sr);
1604
0
  assert(k != NULL);
1605
0
  assert(packed_weights != NULL);
1606
1607
0
  const size_t skr = sr * kr;
1608
0
  for (size_t i = 0; i < g; i++) {
1609
0
    for (size_t oy = 0; oy < sh; oy++) {
1610
0
      for (size_t ox = 0; ox < sw; ox++) {
1611
0
        if (i == 0) {
1612
0
          (*subconv_params++).weights = packed_weights;
1613
0
        }
1614
0
        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1615
0
          const size_t nr_block_size = min(nc - nr_block_start, nr);
1616
0
          if XNN_LIKELY(b != NULL) {
1617
0
            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1618
0
              packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
1619
0
            }
1620
0
          }
1621
0
          packed_weights += nr;
1622
0
          for (size_t ky = oy; ky < kh; ky += sh) {
1623
0
            for (size_t kx = ox; kx < kw; kx += sw) {
1624
0
              for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1625
0
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1626
0
                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1627
0
                    const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1628
0
                    if (kc_idx < kc) {
1629
0
                      packed_weights[kr_block_offset] = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
1630
0
                    }
1631
0
                  }
1632
0
                  packed_weights += kr;
1633
0
                }
1634
0
                packed_weights += (nr - nr_block_size) * kr;
1635
0
              }
1636
0
            }
1637
0
          }
1638
0
        }
1639
0
      }
1640
0
    }
1641
0
    k += kh * kw * kc * nc;
1642
0
    if XNN_UNPREDICTABLE(b != NULL) {
1643
0
      b += nc;
1644
0
    }
1645
0
  }
1646
0
}
1647
1648
void xnn_pack_f16_deconv_goki_w(
1649
  size_t g,
1650
  size_t nc,
1651
  size_t kh,
1652
  size_t kw,
1653
  size_t kc,
1654
  size_t sh,
1655
  size_t sw,
1656
  size_t nr,
1657
  size_t kr,
1658
  size_t sr,
1659
  const uint16_t* k,
1660
  const uint16_t* b,
1661
  const void* scale,
1662
  uint16_t* packed_weights,
1663
  struct subconvolution_params* subconv_params,
1664
  const void* params)
1665
0
{
1666
0
  assert(g != 0);
1667
0
  assert(nr >= sr);
1668
0
  assert(k != NULL);
1669
0
  assert(packed_weights != NULL);
1670
1671
0
  const size_t skr = sr * kr;
1672
0
  for (size_t i = 0; i < g; i++) {
1673
0
    for (size_t oy = 0; oy < sh; oy++) {
1674
0
      for (size_t ox = 0; ox < sw; ox++) {
1675
0
        if (i == 0) {
1676
0
          (*subconv_params++).weights = packed_weights;
1677
0
        }
1678
0
        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1679
0
          const size_t nr_block_size = min(nc - nr_block_start, nr);
1680
0
          if XNN_LIKELY(b != NULL) {
1681
0
            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1682
0
              packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
1683
0
            }
1684
0
          }
1685
0
          packed_weights += nr;
1686
0
          for (size_t ky = oy; ky < kh; ky += sh) {
1687
0
            for (size_t kx = ox; kx < kw; kx += sw) {
1688
0
              for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1689
0
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1690
0
                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1691
0
                    const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1692
0
                    if (kc_idx < kc) {
1693
0
                      packed_weights[kr_block_offset] = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
1694
0
                    }
1695
0
                  }
1696
0
                  packed_weights += kr;
1697
0
                }
1698
0
                packed_weights += (nr - nr_block_size) * kr;
1699
0
              }
1700
0
            }
1701
0
          }
1702
0
        }
1703
0
      }
1704
0
    }
1705
0
    k += kh * kw * kc * nc;
1706
0
    if XNN_UNPREDICTABLE(b != NULL) {
1707
0
      b += nc;
1708
0
    }
1709
0
  }
1710
0
}
1711
1712
void xnn_pack_f32_to_f16_deconv_goki_w(
1713
  size_t g,
1714
  size_t nc,
1715
  size_t kh,
1716
  size_t kw,
1717
  size_t kc,
1718
  size_t sh,
1719
  size_t sw,
1720
  size_t nr,
1721
  size_t kr,
1722
  size_t sr,
1723
  const float* k,
1724
  const float* b,
1725
  const void* scale,
1726
  uint16_t* packed_weights,
1727
  struct subconvolution_params* subconv_params,
1728
  const void* params)
1729
0
{
1730
0
  assert(g != 0);
1731
0
  assert(nr >= sr);
1732
0
  assert(k != NULL);
1733
0
  assert(packed_weights != NULL);
1734
1735
0
  const size_t skr = sr * kr;
1736
0
  for (size_t i = 0; i < g; i++) {
1737
0
    for (size_t oy = 0; oy < sh; oy++) {
1738
0
      for (size_t ox = 0; ox < sw; ox++) {
1739
0
        if (i == 0) {
1740
0
          (*subconv_params++).weights = packed_weights;
1741
0
        }
1742
0
        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1743
0
          const size_t nr_block_size = min(nc - nr_block_start, nr);
1744
0
          if XNN_LIKELY(b != NULL) {
1745
0
            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1746
0
              packed_weights[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
1747
0
            }
1748
0
          }
1749
0
          packed_weights += nr;
1750
0
          for (size_t ky = oy; ky < kh; ky += sh) {
1751
0
            for (size_t kx = ox; kx < kw; kx += sw) {
1752
0
              for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1753
0
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1754
0
                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1755
0
                    const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1756
0
                    if (kc_idx < kc) {
1757
0
                      packed_weights[kr_block_offset] = fp16_ieee_from_fp32_value(k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx]);
1758
0
                    }
1759
0
                  }
1760
0
                  packed_weights += kr;
1761
0
                }
1762
0
                packed_weights += (nr - nr_block_size) * kr;
1763
0
              }
1764
0
            }
1765
0
          }
1766
0
        }
1767
0
      }
1768
0
    }
1769
0
    k += kh * kw * kc * nc;
1770
0
    if XNN_UNPREDICTABLE(b != NULL) {
1771
0
      b += nc;
1772
0
    }
1773
0
  }
1774
0
}
1775
1776
void xnn_pack_qs8_deconv_goki_w(
1777
  size_t g,
1778
  size_t nc,
1779
  size_t kh,
1780
  size_t kw,
1781
  size_t kc,
1782
  size_t sh,
1783
  size_t sw,
1784
  size_t nr,
1785
  size_t kr,
1786
  size_t sr,
1787
  const int8_t* k,
1788
  const int32_t* b,
1789
  const float* scale,
1790
  void* packed_weights,
1791
  struct subconvolution_params* subconv_params,
1792
  const struct xnn_qs8_packing_params* params)
1793
0
{
1794
0
  assert(g != 0);
1795
0
  assert(nr >= sr);
1796
0
  assert(k != NULL);
1797
0
  assert(packed_weights != NULL);
1798
1799
0
  const size_t skr = sr * kr;
1800
0
  const uint32_t izp = (uint32_t) params->input_zero_point;
1801
0
  for (size_t i = 0; i < g; i++) {
1802
0
    for (size_t oy = 0; oy < sh; oy++) {
1803
0
      for (size_t ox = 0; ox < sw; ox++) {
1804
0
        if (i == 0) {
1805
0
          (*subconv_params++).weights = packed_weights;
1806
0
        }
1807
0
        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1808
0
          const size_t nr_block_size = min(nc - nr_block_start, nr);
1809
0
          int32_t* packed_b = (int32_t*) packed_weights;
1810
0
          if XNN_LIKELY(b != 0) {
1811
0
            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1812
0
              unaligned_store_s32(packed_weights, b[nr_block_start + nr_block_offset]);
1813
0
              packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1814
0
            }
1815
0
          } else {
1816
0
            size_t n = nr_block_size;
1817
0
            do {
1818
0
              unaligned_store_s32(packed_weights, 0);
1819
0
              packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1820
0
            } while (--n != 0);
1821
0
          }
1822
0
          packed_weights = (void*) ((uintptr_t) packed_weights + (nr - nr_block_size) * sizeof(int32_t));
1823
0
          for (size_t ky = oy; ky < kh; ky += sh) {
1824
0
            for (size_t kx = ox; kx < kw; kx += sw) {
1825
0
              for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1826
0
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1827
0
                  uint32_t ksum = 0;
1828
0
                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1829
0
                    const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1830
0
                    if (kc_idx < kc) {
1831
0
                      const int8_t kv = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
1832
0
                      ksum += (uint32_t) kv;
1833
0
                      ((int8_t*) packed_weights)[kr_block_offset] = kv;
1834
0
                    }
1835
0
                  }
1836
0
                  unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
1837
0
                  packed_weights = (int8_t*) packed_weights + kr;
1838
0
                }
1839
0
                packed_weights = (int8_t*) packed_weights + (nr - nr_block_size) * kr;
1840
0
              }
1841
0
            }
1842
0
          }
1843
0
        }
1844
0
      }
1845
0
    }
1846
0
    k += kh * kw * kc * nc;
1847
0
    if XNN_UNPREDICTABLE(b != NULL) {
1848
0
      b += nc;
1849
0
    }
1850
0
  }
1851
0
}
1852
1853
void xnn_pack_qu8_deconv_goki_w(
1854
  size_t g,
1855
  size_t nc,
1856
  size_t kh,
1857
  size_t kw,
1858
  size_t kc,
1859
  size_t sh,
1860
  size_t sw,
1861
  size_t nr,
1862
  size_t kr,
1863
  size_t sr,
1864
  const uint8_t* k,
1865
  const int32_t* b,
1866
  const void* scale,
1867
  void* packed_weights,
1868
  struct subconvolution_params* subconv_params,
1869
  const struct xnn_qu8_packing_params* params)
1870
0
{
1871
0
  assert(g != 0);
1872
0
  assert(nr >= sr);
1873
0
  assert(k != NULL);
1874
0
  assert(packed_weights != NULL);
1875
1876
0
  const size_t skr = sr * kr;
1877
0
  const int32_t izp = (int32_t) params->input_zero_point;
1878
0
  const int32_t kzp = (int32_t) params->kernel_zero_point;
1879
0
  for (size_t i = 0; i < g; i++) {
1880
0
    for (size_t oy = 0; oy < sh; oy++) {
1881
0
      for (size_t ox = 0; ox < sw; ox++) {
1882
0
        if (i == 0) {
1883
0
          (*subconv_params++).weights = packed_weights;
1884
0
        }
1885
0
        const int32_t bzp = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * izp * kzp;
1886
0
        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1887
0
          const size_t nr_block_size = min(nc - nr_block_start, nr);
1888
0
          int32_t* packed_b = (int32_t*) packed_weights;
1889
0
          if XNN_LIKELY(b != 0) {
1890
0
            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1891
0
              unaligned_store_s32(packed_weights, bzp + b[nr_block_start + nr_block_offset]);
1892
0
              packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1893
0
            }
1894
0
          } else {
1895
0
            size_t n = nr_block_size;
1896
0
            do {
1897
0
              unaligned_store_s32(packed_weights, bzp);
1898
0
              packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1899
0
            } while (--n != 0);
1900
0
          }
1901
0
          packed_weights = (void*) ((uintptr_t) packed_weights + (nr - nr_block_size) * sizeof(int32_t));
1902
0
          for (size_t ky = oy; ky < kh; ky += sh) {
1903
0
            for (size_t kx = ox; kx < kw; kx += sw) {
1904
0
              for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1905
0
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1906
0
                  int32_t ksum = 0;
1907
0
                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1908
0
                    const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1909
0
                    if (kc_idx < kc) {
1910
0
                      const uint8_t kv = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
1911
0
                      ksum += (int32_t) kv;
1912
0
                      ((uint8_t*) packed_weights)[kr_block_offset] = kv;
1913
0
                    }
1914
0
                  }
1915
0
                  unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - ksum * izp);
1916
0
                  packed_weights = (uint8_t*) packed_weights + kr;
1917
0
                }
1918
0
                packed_weights = (uint8_t*) packed_weights + (nr - nr_block_size) * kr;
1919
0
              }
1920
0
            }
1921
0
          }
1922
0
        }
1923
0
      }
1924
0
    }
1925
0
    k += kh * kw * kc * nc;
1926
0
    if XNN_UNPREDICTABLE(b != NULL) {
1927
0
      b += nc;
1928
0
    }
1929
0
  }
1930
0
}
1931
1932
// Steps the (x, y) cursor to the next position: y is incremented first, and
// when it reaches h it wraps back to 0 and x is incremented instead.
static inline void advance_x_y(size_t h, size_t* x, size_t* y) {
  const size_t next_y = *y + 1;
  if (next_y != h) {
    *y = next_y;
    return;
  }
  *y = 0;
  *x += 1;
}
1939
1940
void xnn_pack_f32_dwconv_ghw_w(
1941
  size_t first_pass_tile,
1942
  size_t middle_pass_tile,
1943
  size_t last_pass_tile,
1944
  size_t h,
1945
  size_t w,
1946
  size_t c,
1947
  size_t channel_tile,
1948
  size_t channel_subtile,
1949
  size_t channel_round,
1950
  const float* k,
1951
  const float* b,
1952
  const void* scale,
1953
  float* packed_weights,
1954
  size_t per_tile_extra_bytes,
1955
  size_t per_subtile_extra_bytes,
1956
  const void* params)
1957
0
{
1958
0
  assert(k != NULL);
1959
0
  assert(packed_weights != NULL);
1960
0
  size_t kernel_size = h * w;
1961
0
  if (middle_pass_tile == 0) {
1962
    // Uni-pass DWCONV.
1963
0
    assert(last_pass_tile == 0);
1964
0
  } else {
1965
    // Multi-pass DWCONV.
1966
0
    assert(kernel_size > first_pass_tile);
1967
0
  }
1968
1969
  // Stores the x and y index that should be processed next.
1970
0
  size_t processed_x = 0;
1971
0
  size_t processed_y = 0;
1972
0
  size_t x = 0;
1973
0
  size_t y = 0;
1974
  // First and middle pass packs in sizes of channel_tile to tiled_c, then in sizes of channel_subtile.
1975
0
  const size_t tiled_c = round_down_po2(round_up_po2(c, channel_round), channel_tile);
1976
1977
  // Pack in blocks of channel_tile, then in blocks of channel_subtile.
1978
0
  {
1979
0
    size_t cr_block_start = 0;
1980
0
    for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
1981
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
1982
0
      if XNN_LIKELY(b != NULL) {
1983
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1984
0
          *packed_weights++ = b[cr_block_start + cr_block_offset];
1985
0
        }
1986
0
      } else {
1987
0
        size_t n = cr_block_size;
1988
0
        do {
1989
0
          *packed_weights++ = 0.0f;
1990
0
        } while (--n != 0);
1991
0
      }
1992
0
      packed_weights += channel_tile - cr_block_size;
1993
1994
0
      x = 0;
1995
0
      y = 0;
1996
      // kernel_size can be less than the first_pass_tile, in this case, pack up
1997
      // to the smaller of the two.
1998
0
      for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
1999
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2000
0
          const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2001
0
          *packed_weights++ = kv;
2002
0
        }
2003
0
        packed_weights += channel_tile - cr_block_size;
2004
0
        advance_x_y(h, &x, &y);
2005
0
      }
2006
      // And make sure to skip weights if kernel_size < first_pass_tile.
2007
0
      packed_weights += doz(first_pass_tile, kernel_size) * cr_block_size;
2008
0
    }
2009
2010
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
2011
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
2012
0
      if XNN_LIKELY(b != NULL) {
2013
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2014
0
          *packed_weights++ = b[cr_block_start + cr_block_offset];
2015
0
        }
2016
0
      } else {
2017
0
        size_t n = cr_block_size;
2018
0
        do {
2019
0
          *packed_weights++ = 0.0f;
2020
0
        } while (--n != 0);
2021
0
      }
2022
0
      packed_weights += channel_subtile - cr_block_size;
2023
2024
0
      x = 0;
2025
0
      y = 0;
2026
      // kernel_size can be less than the first_pass_tile, in this case, pack up
2027
      // to the smaller of the two.
2028
0
      for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
2029
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2030
0
          const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2031
0
          *packed_weights++ = kv;
2032
0
        }
2033
0
        packed_weights += channel_subtile - cr_block_size;
2034
0
        advance_x_y(h, &x, &y);
2035
0
      }
2036
      // And make sure to skip weights if kernel_size < first_pass_tile.
2037
0
      packed_weights += doz(first_pass_tile, kernel_size) * cr_block_size;
2038
0
    }
2039
0
  }
2040
2041
0
  if (kernel_size <= first_pass_tile) {
2042
0
    return;
2043
0
  }
2044
2045
0
  kernel_size -= first_pass_tile;
2046
2047
0
  processed_x = x;
2048
0
  processed_y = y;
2049
2050
  // Middle pass. (kernel_size / middle_pass_tile) blocks, within each block is
2051
  // middle_pass_tile * cr weights.
2052
0
  for (; kernel_size > last_pass_tile; kernel_size -= middle_pass_tile) {
2053
0
    assert(kernel_size >= middle_pass_tile);
2054
0
    size_t cr_block_start = 0;
2055
0
    for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
2056
0
      x = processed_x;
2057
0
      y = processed_y;
2058
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
2059
0
      for (size_t j = 0; j < middle_pass_tile; j++) {
2060
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2061
0
          const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2062
0
          *packed_weights++ = kv;
2063
0
        }
2064
0
        packed_weights += channel_tile - cr_block_size;
2065
0
        advance_x_y(h, &x, &y);
2066
0
      }
2067
0
    }
2068
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
2069
0
      x = processed_x;
2070
0
      y = processed_y;
2071
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
2072
0
      for (size_t j = 0; j < middle_pass_tile; j++) {
2073
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2074
0
          const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2075
0
          *packed_weights++ = kv;
2076
0
        }
2077
0
        packed_weights += channel_subtile - cr_block_size;
2078
0
        advance_x_y(h, &x, &y);
2079
0
      }
2080
0
    }
2081
0
    processed_x = x;
2082
0
    processed_y = y;
2083
0
  }
2084
2085
  // Last pass.
2086
0
  {
2087
0
    assert(kernel_size <= last_pass_tile);
2088
0
    size_t cr_block_start = 0;
2089
0
    for (; cr_block_start < round_down_po2(c, channel_tile); cr_block_start += channel_tile) {
2090
      // Last pass does not pack to rounded c, since it handles remainder.
2091
0
      x = processed_x;
2092
0
      y = processed_y;
2093
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
2094
0
      for (size_t i = 0; i < kernel_size; i++) {
2095
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2096
0
          const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2097
0
          *packed_weights++ = kv;
2098
0
        }
2099
0
        packed_weights += channel_tile - cr_block_size;
2100
0
        advance_x_y(h, &x, &y);
2101
0
      }
2102
      // Pad so that we can always read last_pass_tile weights in the last pass.
2103
0
      packed_weights += (last_pass_tile - kernel_size) * channel_tile;
2104
0
      packed_weights = (float*) ((uintptr_t) packed_weights + per_tile_extra_bytes);
2105
0
    }
2106
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
2107
      // Last pass does not pack to rounded c, since it handles remainder.
2108
0
      x = processed_x;
2109
0
      y = processed_y;
2110
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
2111
0
      for (size_t i = 0; i < kernel_size; i++) {
2112
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2113
0
          const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2114
0
          *packed_weights++ = kv;
2115
0
        }
2116
0
        packed_weights += channel_subtile - cr_block_size;
2117
0
        advance_x_y(h, &x, &y);
2118
0
      }
2119
      // Pad so that we can always read last_pass_tile weights in the last pass.
2120
0
      packed_weights += (last_pass_tile - kernel_size) * channel_subtile;
2121
0
      packed_weights = (float*) ((uintptr_t) packed_weights + per_subtile_extra_bytes);
2122
0
    }
2123
0
  }
2124
0
}
2125
2126
void xnn_pack_f16_dwconv_ghw_w(
2127
  size_t first_pass_tile,
2128
  size_t middle_pass_tile,
2129
  size_t last_pass_tile,
2130
  size_t h,
2131
  size_t w,
2132
  size_t c,
2133
  size_t channel_tile,
2134
  size_t channel_subtile,
2135
  size_t channel_round,
2136
  const uint16_t* k,
2137
  const uint16_t* b,
2138
  const void* scale,
2139
  uint16_t* packed_weights,
2140
  size_t per_tile_extra_bytes,
2141
  size_t per_subtile_extra_bytes,
2142
  const void* params)
2143
0
{
2144
0
  assert(k != NULL);
2145
0
  assert(packed_weights != NULL);
2146
0
  size_t kernel_size = h * w;
2147
0
  if (middle_pass_tile == 0) {
2148
    // Uni-pass DWCONV.
2149
0
    assert(last_pass_tile == 0);
2150
0
  } else {
2151
    // Multi-pass DWCONV.
2152
0
    assert(kernel_size > first_pass_tile);
2153
0
  }
2154
2155
  // Stores the x and y index that should be processed next.
2156
0
  size_t processed_x = 0;
2157
0
  size_t processed_y = 0;
2158
0
  size_t x = 0;
2159
0
  size_t y = 0;
2160
  // First and middle pass packs in sizes of channel_tile to tiled_c, then in sizes of channel_subtile.
2161
0
  const size_t tiled_c = round_down_po2(round_up_po2(c, channel_round), channel_tile);
2162
2163
  // Pack in blocks of channel_tile, then in blocks of channel_subtile.
2164
0
  {
2165
0
    size_t cr_block_start = 0;
2166
0
    for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
2167
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
2168
0
      if XNN_LIKELY(b != NULL) {
2169
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2170
0
          *packed_weights++ = b[cr_block_start + cr_block_offset];
2171
0
        }
2172
0
      } else {
2173
0
        size_t n = cr_block_size;
2174
0
        do {
2175
0
          *packed_weights++ = 0.0f;
2176
0
        } while (--n != 0);
2177
0
      }
2178
0
      packed_weights += channel_tile - cr_block_size;
2179
2180
0
      x = 0;
2181
0
      y = 0;
2182
      // kernel_size can be less than the first_pass_tile, in this case, pack up
2183
      // to the smaller of the two.
2184
0
      for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
2185
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2186
0
          const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2187
0
          *packed_weights++ = kv;
2188
0
        }
2189
0
        packed_weights += channel_tile - cr_block_size;
2190
0
        advance_x_y(h, &x, &y);
2191
0
      }
2192
      // And make sure to skip weights if kernel_size < first_pass_tile.
2193
0
      packed_weights += doz(first_pass_tile, kernel_size) * cr_block_size;
2194
0
    }
2195
2196
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
2197
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
2198
0
      if XNN_LIKELY(b != NULL) {
2199
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2200
0
          *packed_weights++ = b[cr_block_start + cr_block_offset];
2201
0
        }
2202
0
      } else {
2203
0
        size_t n = cr_block_size;
2204
0
        do {
2205
0
          *packed_weights++ = 0.0f;
2206
0
        } while (--n != 0);
2207
0
      }
2208
0
      packed_weights += channel_subtile - cr_block_size;
2209
2210
0
      x = 0;
2211
0
      y = 0;
2212
      // kernel_size can be less than the first_pass_tile, in this case, pack up
2213
      // to the smaller of the two.
2214
0
      for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
2215
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2216
0
          const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2217
0
          *packed_weights++ = kv;
2218
0
        }
2219
0
        packed_weights += channel_subtile - cr_block_size;
2220
0
        advance_x_y(h, &x, &y);
2221
0
      }
2222
      // And make sure to skip weights if kernel_size < first_pass_tile.
2223
0
      packed_weights += doz(first_pass_tile, kernel_size) * cr_block_size;
2224
0
    }
2225
0
  }
2226
2227
0
  if (kernel_size <= first_pass_tile) {
2228
0
    return;
2229
0
  }
2230
2231
0
  kernel_size -= first_pass_tile;
2232
2233
0
  processed_x = x;
2234
0
  processed_y = y;
2235
2236
  // Middle pass. (kernel_size / middle_pass_tile) blocks, within each block is
2237
  // middle_pass_tile * cr weights.
2238
0
  for (; kernel_size > last_pass_tile; kernel_size -= middle_pass_tile) {
2239
0
    assert(kernel_size >= middle_pass_tile);
2240
0
    size_t cr_block_start = 0;
2241
0
    for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
2242
0
      x = processed_x;
2243
0
      y = processed_y;
2244
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
2245
0
      for (size_t j = 0; j < middle_pass_tile; j++) {
2246
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2247
0
          const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2248
0
          *packed_weights++ = kv;
2249
0
        }
2250
0
        packed_weights += channel_tile - cr_block_size;
2251
0
        advance_x_y(h, &x, &y);
2252
0
      }
2253
0
    }
2254
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
2255
0
      x = processed_x;
2256
0
      y = processed_y;
2257
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
2258
0
      for (size_t j = 0; j < middle_pass_tile; j++) {
2259
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2260
0
          const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2261
0
          *packed_weights++ = kv;
2262
0
        }
2263
0
        packed_weights += channel_subtile - cr_block_size;
2264
0
        advance_x_y(h, &x, &y);
2265
0
      }
2266
0
    }
2267
0
    processed_x = x;
2268
0
    processed_y = y;
2269
0
  }
2270
2271
  // Last pass.
2272
0
  {
2273
0
    assert(kernel_size <= last_pass_tile);
2274
0
    size_t cr_block_start = 0;
2275
0
    for (; cr_block_start < round_down_po2(c, channel_tile); cr_block_start += channel_tile) {
2276
      // Last pass does not pack to rounded c, since it handles remainder.
2277
0
      x = processed_x;
2278
0
      y = processed_y;
2279
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
2280
0
      for (size_t i = 0; i < kernel_size; i++) {
2281
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2282
0
          const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2283
0
          *packed_weights++ = kv;
2284
0
        }
2285
0
        packed_weights += channel_tile - cr_block_size;
2286
0
        advance_x_y(h, &x, &y);
2287
0
      }
2288
      // Pad so that we can always read last_pass_tile weights in the last pass.
2289
0
      packed_weights += (last_pass_tile - kernel_size) * channel_tile;
2290
0
      packed_weights = (uint16_t*) ((uintptr_t) packed_weights + per_tile_extra_bytes);
2291
0
    }
2292
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
2293
      // Last pass does not pack to rounded c, since it handles remainder.
2294
0
      x = processed_x;
2295
0
      y = processed_y;
2296
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
2297
0
      for (size_t i = 0; i < kernel_size; i++) {
2298
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2299
0
          const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2300
0
          *packed_weights++ = kv;
2301
0
        }
2302
0
        packed_weights += channel_subtile - cr_block_size;
2303
0
        advance_x_y(h, &x, &y);
2304
0
      }
2305
      // Pad so that we can always read last_pass_tile weights in the last pass.
2306
0
      packed_weights += (last_pass_tile - kernel_size) * channel_subtile;
2307
0
      packed_weights = (uint16_t*) ((uintptr_t) packed_weights + per_subtile_extra_bytes);
2308
0
    }
2309
0
  }
2310
0
}
2311
2312
void xnn_pack_f32_to_f16_dwconv_ghw_w(
2313
  size_t first_pass_tile,
2314
  size_t middle_pass_tile,
2315
  size_t last_pass_tile,
2316
  size_t h,
2317
  size_t w,
2318
  size_t c,
2319
  size_t channel_tile,
2320
  size_t channel_subtile,
2321
  size_t channel_round,
2322
  const float* k,
2323
  const float* b,
2324
  const void* scale,
2325
  uint16_t* packed_weights,
2326
  size_t per_tile_extra_bytes,
2327
  size_t per_subtile_extra_bytes,
2328
  const void* params)
2329
0
{
2330
0
  assert(k != NULL);
2331
0
  assert(packed_weights != NULL);
2332
0
  size_t kernel_size = h * w;
2333
0
  if (middle_pass_tile == 0) {
2334
    // Uni-pass DWCONV.
2335
0
    assert(last_pass_tile == 0);
2336
0
  } else {
2337
    // Multi-pass DWCONV.
2338
0
    assert(kernel_size > first_pass_tile);
2339
0
  }
2340
2341
  // Stores the x and y index that should be processed next.
2342
0
  size_t processed_x = 0;
2343
0
  size_t processed_y = 0;
2344
0
  size_t x = 0;
2345
0
  size_t y = 0;
2346
  // First and middle pass packs in sizes of channel_tile to tiled_c, then in sizes of channel_subtile.
2347
0
  const size_t tiled_c = round_down_po2(round_up_po2(c, channel_round), channel_tile);
2348
2349
  // Pack in blocks of channel_tile, then in blocks of channel_subtile.
2350
0
  {
2351
0
    size_t cr_block_start = 0;
2352
0
    for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
2353
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
2354
0
      if XNN_LIKELY(b != NULL) {
2355
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2356
0
          *packed_weights++ = fp16_ieee_from_fp32_value(b[cr_block_start + cr_block_offset]);
2357
0
        }
2358
0
      } else {
2359
0
        size_t n = cr_block_size;
2360
0
        do {
2361
0
          *packed_weights++ = 0;
2362
0
        } while (--n != 0);
2363
0
      }
2364
0
      packed_weights += channel_tile - cr_block_size;
2365
2366
0
      x = 0;
2367
0
      y = 0;
2368
      // kernel_size can be less than the first_pass_tile, in this case, pack up
2369
      // to the smaller of the two.
2370
0
      for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
2371
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2372
0
          const uint16_t kv = fp16_ieee_from_fp32_value(k[((cr_block_start + cr_block_offset) * h + y) * w + x]);
2373
0
          *packed_weights++ = kv;
2374
0
        }
2375
0
        packed_weights += channel_tile - cr_block_size;
2376
0
        advance_x_y(h, &x, &y);
2377
0
      }
2378
      // And make sure to skip weights if kernel_size < first_pass_tile.
2379
0
      packed_weights += doz(first_pass_tile, kernel_size) * cr_block_size;
2380
0
    }
2381
2382
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
2383
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
2384
0
      if XNN_LIKELY(b != NULL) {
2385
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2386
0
          *packed_weights++ = fp16_ieee_from_fp32_value(b[cr_block_start + cr_block_offset]);
2387
0
        }
2388
0
      } else {
2389
0
        size_t n = cr_block_size;
2390
0
        do {
2391
0
          *packed_weights++ = 0;
2392
0
        } while (--n != 0);
2393
0
      }
2394
0
      packed_weights += channel_subtile - cr_block_size;
2395
2396
0
      x = 0;
2397
0
      y = 0;
2398
      // kernel_size can be less than the first_pass_tile, in this case, pack up
2399
      // to the smaller of the two.
2400
0
      for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
2401
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2402
0
          const uint16_t kv = fp16_ieee_from_fp32_value(k[((cr_block_start + cr_block_offset) * h + y) * w + x]);
2403
0
          *packed_weights++ = kv;
2404
0
        }
2405
0
        packed_weights += channel_subtile - cr_block_size;
2406
0
        advance_x_y(h, &x, &y);
2407
0
      }
2408
      // And make sure to skip weights if kernel_size < first_pass_tile.
2409
0
      packed_weights += doz(first_pass_tile, kernel_size) * cr_block_size;
2410
0
    }
2411
0
  }
2412
2413
0
  if (kernel_size <= first_pass_tile) {
2414
0
    return;
2415
0
  }
2416
2417
0
  kernel_size -= first_pass_tile;
2418
2419
0
  processed_x = x;
2420
0
  processed_y = y;
2421
2422
  // Middle pass. (kernel_size / middle_pass_tile) blocks, within each block is
2423
  // middle_pass_tile * cr weights.
2424
0
  for (; kernel_size > last_pass_tile; kernel_size -= middle_pass_tile) {
2425
0
    assert(kernel_size >= middle_pass_tile);
2426
0
    size_t cr_block_start = 0;
2427
0
    for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
2428
0
      x = processed_x;
2429
0
      y = processed_y;
2430
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
2431
0
      for (size_t j = 0; j < middle_pass_tile; j++) {
2432
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2433
0
          const uint16_t kv = fp16_ieee_from_fp32_value(k[((cr_block_start + cr_block_offset) * h + y) * w + x]);
2434
0
          *packed_weights++ = kv;
2435
0
        }
2436
0
        packed_weights += channel_tile - cr_block_size;
2437
0
        advance_x_y(h, &x, &y);
2438
0
      }
2439
0
    }
2440
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
2441
0
      x = processed_x;
2442
0
      y = processed_y;
2443
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
2444
0
      for (size_t j = 0; j < middle_pass_tile; j++) {
2445
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2446
0
          const uint16_t kv = fp16_ieee_from_fp32_value(k[((cr_block_start + cr_block_offset) * h + y) * w + x]);
2447
0
          *packed_weights++ = kv;
2448
0
        }
2449
0
        packed_weights += channel_subtile - cr_block_size;
2450
0
        advance_x_y(h, &x, &y);
2451
0
      }
2452
0
    }
2453
0
    processed_x = x;
2454
0
    processed_y = y;
2455
0
  }
2456
2457
  // Last pass.
2458
0
  {
2459
0
    assert(kernel_size <= last_pass_tile);
2460
0
    size_t cr_block_start = 0;
2461
0
    for (; cr_block_start < round_down_po2(c, channel_tile); cr_block_start += channel_tile) {
2462
      // Last pass does not pack to rounded c, since it handles remainder.
2463
0
      x = processed_x;
2464
0
      y = processed_y;
2465
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
2466
0
      for (size_t i = 0; i < kernel_size; i++) {
2467
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2468
0
          const uint16_t kv = fp16_ieee_from_fp32_value(k[((cr_block_start + cr_block_offset) * h + y) * w + x]);
2469
0
          *packed_weights++ = kv;
2470
0
        }
2471
0
        packed_weights += channel_tile - cr_block_size;
2472
0
        advance_x_y(h, &x, &y);
2473
0
      }
2474
      // Pad so that we can always read last_pass_tile weights in the last pass.
2475
0
      packed_weights += (last_pass_tile - kernel_size) * channel_tile;
2476
0
      packed_weights = (uint16_t*) ((uintptr_t) packed_weights + per_tile_extra_bytes);
2477
0
    }
2478
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
2479
      // Last pass does not pack to rounded c, since it handles remainder.
2480
0
      x = processed_x;
2481
0
      y = processed_y;
2482
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
2483
0
      for (size_t i = 0; i < kernel_size; i++) {
2484
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2485
0
          const uint16_t kv = fp16_ieee_from_fp32_value(k[((cr_block_start + cr_block_offset) * h + y) * w + x]);
2486
0
          *packed_weights++ = kv;
2487
0
        }
2488
0
        packed_weights += channel_subtile - cr_block_size;
2489
0
        advance_x_y(h, &x, &y);
2490
0
      }
2491
      // Pad so that we can always read last_pass_tile weights in the last pass.
2492
0
      packed_weights += (last_pass_tile - kernel_size) * channel_subtile;
2493
0
      packed_weights = (uint16_t*) ((uintptr_t) packed_weights + per_subtile_extra_bytes);
2494
0
    }
2495
0
  }
2496
0
}
2497
2498
2499
void xnn_pack_qu8_dwconv_ghw_w(
2500
  size_t first_pass_tile,
2501
  size_t middle_pass_tile,
2502
  size_t last_pass_tile,
2503
  size_t h,
2504
  size_t w,
2505
  size_t c,
2506
  size_t channel_tile,
2507
  size_t channel_subtile,
2508
  size_t channel_round,
2509
  const uint8_t* k,
2510
  const int32_t* b,
2511
  const void* scale,
2512
  void* packed_weights,
2513
  size_t per_tile_extra_bytes,
2514
  size_t per_subtile_extra_bytes,
2515
  const struct xnn_qu8_packing_params* params)
2516
0
{
2517
0
  assert(k != NULL);
2518
0
  assert(packed_weights != NULL);
2519
0
  size_t kernel_size = h * w;
2520
0
  if (middle_pass_tile == 0) {
2521
    // Uni-pass DWCONV.
2522
0
    assert(last_pass_tile == 0);
2523
0
  } else {
2524
    // Multi-pass DWCONV.
2525
0
    assert(kernel_size > first_pass_tile);
2526
0
  }
2527
2528
0
  const int32_t izp = (int32_t) params->input_zero_point;
2529
0
  const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
2530
  // Stores the x and y index that should be processed next.
2531
0
  size_t processed_x = 0;
2532
0
  size_t processed_y = 0;
2533
0
  size_t x = 0;
2534
0
  size_t y = 0;
2535
  // First and middle pass packs in sizes of channel_tile to tiled_c, then in sizes of channel_subtile.
2536
0
  const size_t tiled_c = round_down_po2(round_up_po2(c, channel_round), channel_tile);
2537
2538
  // Pack in blocks of channel_tile, then in blocks of channel_subtile.
2539
0
  {
2540
0
    size_t cr_block_start = 0;
2541
0
    for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
2542
0
      int32_t* packed_b = (int32_t*) packed_weights;
2543
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
2544
0
      if XNN_LIKELY(b != NULL) {
2545
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2546
0
          unaligned_store_s32(packed_weights, boff + b[cr_block_start + cr_block_offset]);
2547
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
2548
0
        }
2549
0
      } else {
2550
0
        size_t n = cr_block_size;
2551
0
        do {
2552
0
          unaligned_store_s32(packed_weights, boff);
2553
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
2554
0
        } while (--n != 0);
2555
0
      }
2556
2557
0
      packed_weights = (void*) ((uintptr_t) packed_weights + (channel_tile - cr_block_size) * sizeof(int32_t));
2558
2559
      // Biases need to be offset by all kernel values.
2560
0
      for (size_t x = 0; x < w; x++) {
2561
0
        for (size_t y = 0; y < h; y++) {
2562
0
          for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2563
0
            const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2564
0
            unaligned_indexed_store_s32(packed_b, cr_block_offset,
2565
0
                                        unaligned_indexed_load_s32(packed_b, cr_block_offset) - (int32_t) kv * izp);
2566
0
          }
2567
0
        }
2568
0
      }
2569
2570
0
      x = 0;
2571
0
      y = 0;
2572
      // kernel_size can be less than the first_pass_tile, in this case, pack up
2573
      // to the smaller of the two.
2574
0
      for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
2575
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2576
0
          const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2577
0
          *((uint8_t*) packed_weights) = kv;
2578
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(uint8_t));
2579
0
        }
2580
0
        packed_weights = (void*) ((uintptr_t) packed_weights + (channel_tile - cr_block_size) * sizeof(uint8_t));
2581
0
        advance_x_y(h, &x, &y);
2582
0
      }
2583
      // And make sure to skip weights if kernel_size < first_pass_tile.
2584
0
      packed_weights = (void*) ((uintptr_t) packed_weights + doz(first_pass_tile, kernel_size) * cr_block_size);
2585
0
    }
2586
2587
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
2588
0
      int32_t* packed_b = (int32_t*) packed_weights;
2589
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
2590
0
      if XNN_LIKELY(b != NULL) {
2591
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2592
0
          unaligned_store_s32(packed_weights, boff + b[cr_block_start + cr_block_offset]);
2593
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
2594
0
        }
2595
0
      } else {
2596
0
        size_t n = cr_block_size;
2597
0
        do {
2598
0
          unaligned_store_s32(packed_weights, boff);
2599
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
2600
0
        } while (--n != 0);
2601
0
      }
2602
0
      packed_weights = (void*) ((uintptr_t) packed_weights + (channel_subtile - cr_block_size) * sizeof(int32_t));
2603
2604
      // Biases need to be offset by all kernel values.
2605
0
      for (size_t x = 0; x < w; x++) {
2606
0
        for (size_t y = 0; y < h; y++) {
2607
0
          for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2608
0
            const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2609
0
            unaligned_indexed_store_s32(packed_b, cr_block_offset,
2610
0
                                        unaligned_indexed_load_s32(packed_b, cr_block_offset) - (int32_t) kv * izp);
2611
0
          }
2612
0
        }
2613
0
      }
2614
2615
0
      x = 0;
2616
0
      y = 0;
2617
      // kernel_size can be less than the first_pass_tile, in this case, pack up
2618
      // to the smaller of the two.
2619
0
      for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
2620
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2621
0
          const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2622
0
          *((uint8_t*) packed_weights) = kv;
2623
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(uint8_t));
2624
0
        }
2625
0
        packed_weights = (void*) ((uintptr_t) packed_weights + (channel_subtile - cr_block_size) * sizeof(uint8_t));
2626
0
        advance_x_y(h, &x, &y);
2627
0
      }
2628
      // And make sure to skip weights if kernel_size < first_pass_tile.
2629
0
      packed_weights = (void*) ((uintptr_t) packed_weights + doz(first_pass_tile, kernel_size) * cr_block_size);
2630
0
    }
2631
0
  }
2632
2633
0
  if (kernel_size <= first_pass_tile) {
2634
0
    return;
2635
0
  }
2636
2637
0
  kernel_size -= first_pass_tile;
2638
2639
0
  processed_x = x;
2640
0
  processed_y = y;
2641
2642
  // Middle pass. (kernel_size / middle_pass_tile) blocks, within each block is
2643
  // middle_pass_tile * cr weights.
2644
0
  for (; kernel_size > last_pass_tile; kernel_size -= middle_pass_tile) {
2645
0
    assert(kernel_size >= middle_pass_tile);
2646
0
    size_t cr_block_start = 0;
2647
0
    for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
2648
0
      x = processed_x;
2649
0
      y = processed_y;
2650
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
2651
0
      for (size_t j = 0; j < middle_pass_tile; j++) {
2652
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2653
0
          const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2654
0
          *((uint8_t*) packed_weights) = kv;
2655
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(uint8_t));
2656
0
        }
2657
0
        packed_weights = (void*) ((uintptr_t) packed_weights + (channel_tile - cr_block_size) * sizeof(uint8_t));
2658
0
        advance_x_y(h, &x, &y);
2659
0
      }
2660
0
    }
2661
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
2662
0
      x = processed_x;
2663
0
      y = processed_y;
2664
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
2665
0
      for (size_t j = 0; j < middle_pass_tile; j++) {
2666
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2667
0
          const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2668
0
          *((uint8_t*) packed_weights) = kv;
2669
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(uint8_t));
2670
0
        }
2671
0
        packed_weights = (void*) ((uintptr_t) packed_weights + (channel_subtile - cr_block_size) * sizeof(uint8_t));
2672
0
        advance_x_y(h, &x, &y);
2673
0
      }
2674
0
    }
2675
0
    processed_x = x;
2676
0
    processed_y = y;
2677
0
  }
2678
2679
  // Last pass.
2680
0
  {
2681
0
    assert(kernel_size <= last_pass_tile);
2682
0
    size_t cr_block_start = 0;
2683
0
    for (; cr_block_start < round_down_po2(c, channel_tile); cr_block_start += channel_tile) {
2684
      // Last pass does not pack to rounded c, since it handles remainder.
2685
0
      x = processed_x;
2686
0
      y = processed_y;
2687
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
2688
0
      for (size_t i = 0; i < kernel_size; i++) {
2689
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2690
0
          const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2691
0
          *((uint8_t*) packed_weights) = kv;
2692
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(uint8_t));
2693
0
        }
2694
0
        packed_weights = (void*) ((uintptr_t) packed_weights + (channel_tile - cr_block_size) * sizeof(uint8_t));
2695
0
        advance_x_y(h, &x, &y);
2696
0
      }
2697
      // Pad so that we can always read last_pass_tile weights in the last pass.
2698
0
      packed_weights = (void*) ((uintptr_t) packed_weights + (last_pass_tile - kernel_size) * channel_tile);
2699
0
      packed_weights = (void*) ((uintptr_t) packed_weights + per_tile_extra_bytes);
2700
0
    }
2701
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
2702
      // Last pass does not pack to rounded c, since it handles remainder.
2703
0
      x = processed_x;
2704
0
      y = processed_y;
2705
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
2706
0
      for (size_t i = 0; i < kernel_size; i++) {
2707
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2708
0
          const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2709
0
          *((uint8_t*) packed_weights) = kv;
2710
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(uint8_t));
2711
0
        }
2712
0
        packed_weights = (void*) ((uintptr_t) packed_weights + (channel_subtile - cr_block_size) * sizeof(uint8_t));
2713
0
        advance_x_y(h, &x, &y);
2714
0
      }
2715
      // Pad so that we can always read last_pass_tile weights in the last pass.
2716
0
      packed_weights = (void*) ((uintptr_t) packed_weights + (last_pass_tile - kernel_size) * channel_subtile);
2717
0
      packed_weights = (void*) ((uintptr_t) packed_weights + per_subtile_extra_bytes);
2718
0
    }
2719
0
  }
2720
0
}
2721
2722
void xnn_pack_qs8_dwconv_ghw_w(
2723
  size_t first_pass_tile,
2724
  size_t middle_pass_tile,
2725
  size_t last_pass_tile,
2726
  size_t h,
2727
  size_t w,
2728
  size_t c,
2729
  size_t channel_tile,
2730
  size_t channel_subtile,
2731
  size_t channel_round,
2732
  const int8_t* k,
2733
  const int32_t* b,
2734
  const float* scale,
2735
  void* packed_weights,
2736
  size_t per_tile_extra_bytes,
2737
  size_t per_subtile_extra_bytes,
2738
  const struct xnn_qs8_packing_params* params)
2739
0
{
2740
0
  assert(k != NULL);
2741
0
  assert(packed_weights != NULL);
2742
0
  size_t kernel_size = h * w;
2743
0
  if (middle_pass_tile == 0) {
2744
    // Uni-pass DWCONV.
2745
0
    assert(last_pass_tile == 0);
2746
0
  } else {
2747
    // Multi-pass DWCONV.
2748
0
    assert(kernel_size > first_pass_tile);
2749
0
  }
2750
2751
0
  const uint32_t izp = (uint32_t) params->input_zero_point;
2752
  // Stores the x and y index that should be processed next.
2753
0
  size_t processed_x = 0;
2754
0
  size_t processed_y = 0;
2755
0
  size_t x = 0;
2756
0
  size_t y = 0;
2757
  // First and middle pass packs in sizes of channel_tile to tiled_c, then in sizes of channel_subtile.
2758
0
  const size_t tiled_c = round_down_po2(round_up_po2(c, channel_round), channel_tile);
2759
2760
  // Pack in blocks of channel_tile, then in blocks of channel_subtile.
2761
0
  {
2762
0
    size_t cr_block_start = 0;
2763
0
    for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
2764
0
      int32_t* packed_b = (int32_t*) packed_weights;
2765
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
2766
0
      if XNN_LIKELY(b != NULL) {
2767
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2768
0
          unaligned_store_s32(packed_weights, b[cr_block_start + cr_block_offset]);
2769
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
2770
0
        }
2771
0
      } else {
2772
0
        size_t n = cr_block_size;
2773
0
        do {
2774
0
          unaligned_store_s32(packed_weights, 0);
2775
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
2776
0
        } while (--n != 0);
2777
0
      }
2778
2779
0
      packed_weights = (void*) ((uintptr_t) packed_weights + (channel_tile - cr_block_size) * sizeof(int32_t));
2780
2781
      // Biases need to be offset by all kernel values.
2782
0
      for (size_t x = 0; x < w; x++) {
2783
0
        for (size_t y = 0; y < h; y++) {
2784
0
          for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2785
0
            const int8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2786
0
            unaligned_indexed_store_u32(packed_b, cr_block_offset,
2787
0
                                        unaligned_indexed_load_u32(packed_b, cr_block_offset) - (uint32_t) kv * izp);
2788
0
          }
2789
0
        }
2790
0
      }
2791
2792
0
      x = 0;
2793
0
      y = 0;
2794
      // kernel_size can be less than the first_pass_tile, in this case, pack up
2795
      // to the smaller of the two.
2796
0
      for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
2797
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2798
0
          const int8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2799
0
          *((int8_t*) packed_weights) = kv;
2800
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int8_t));
2801
0
        }
2802
0
        packed_weights = (void*) ((uintptr_t) packed_weights + (channel_tile - cr_block_size) * sizeof(int8_t));
2803
0
        advance_x_y(h, &x, &y);
2804
0
      }
2805
      // And make sure to skip weights if kernel_size < first_pass_tile.
2806
0
      packed_weights = (void*) ((uintptr_t) packed_weights + doz(first_pass_tile, kernel_size) * cr_block_size);
2807
      // If unipass and QC8, we need to pack extra bytes for scale values here.
2808
0
      if (middle_pass_tile == 0) {
2809
0
        packed_weights = (void*) ((uintptr_t) packed_weights + per_tile_extra_bytes);
2810
0
      }
2811
0
    }
2812
2813
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
2814
0
      int32_t* packed_b = (int32_t*) packed_weights;
2815
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
2816
0
      if XNN_LIKELY(b != NULL) {
2817
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2818
0
          unaligned_store_s32(packed_weights, b[cr_block_start + cr_block_offset]);
2819
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
2820
0
        }
2821
0
      } else {
2822
0
        size_t n = cr_block_size;
2823
0
        do {
2824
0
          unaligned_store_s32(packed_weights, 0);
2825
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
2826
0
        } while (--n != 0);
2827
0
      }
2828
0
      packed_weights = (void*) ((uintptr_t) packed_weights + (channel_subtile - cr_block_size) * sizeof(int32_t));
2829
2830
      // Biases need to be offset by all kernel values.
2831
0
      for (size_t x = 0; x < w; x++) {
2832
0
        for (size_t y = 0; y < h; y++) {
2833
0
          for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2834
0
            const int8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2835
0
            unaligned_indexed_store_u32(packed_b, cr_block_offset,
2836
0
                                        unaligned_indexed_load_u32(packed_b, cr_block_offset) - (uint32_t) kv * izp);
2837
0
          }
2838
0
        }
2839
0
      }
2840
2841
0
      x = 0;
2842
0
      y = 0;
2843
      // kernel_size can be less than the first_pass_tile, in this case, pack up
2844
      // to the smaller of the two.
2845
0
      for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
2846
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2847
0
          const int8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2848
0
          *((int8_t*) packed_weights) = kv;
2849
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int8_t));
2850
0
        }
2851
0
        packed_weights = (void*) ((uintptr_t) packed_weights + (channel_subtile - cr_block_size) * sizeof(int8_t));
2852
0
        advance_x_y(h, &x, &y);
2853
0
      }
2854
      // And make sure to skip weights if kernel_size < first_pass_tile.
2855
0
      packed_weights = (void*) ((uintptr_t) packed_weights + doz(first_pass_tile, kernel_size) * cr_block_size);
2856
      // If unipass and QC8, we need to pack extra bytes for scale values here.
2857
0
      if (middle_pass_tile == 0) {
2858
0
        packed_weights = (void*) ((uintptr_t) packed_weights + per_subtile_extra_bytes);
2859
0
      }
2860
0
    }
2861
0
  }
2862
2863
0
  if (kernel_size <= first_pass_tile) {
2864
0
    return;
2865
0
  }
2866
2867
0
  kernel_size -= first_pass_tile;
2868
2869
0
  processed_x = x;
2870
0
  processed_y = y;
2871
2872
  // Middle pass. (kernel_size / middle_pass_tile) blocks, within each block is
2873
  // middle_pass_tile * cr weights.
2874
0
  for (; kernel_size > last_pass_tile; kernel_size -= middle_pass_tile) {
2875
0
    assert(kernel_size >= middle_pass_tile);
2876
0
    size_t cr_block_start = 0;
2877
0
    for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
2878
0
      x = processed_x;
2879
0
      y = processed_y;
2880
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
2881
0
      for (size_t j = 0; j < middle_pass_tile; j++) {
2882
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2883
0
          const int8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2884
0
          *((int8_t*) packed_weights) = kv;
2885
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int8_t));
2886
0
        }
2887
0
        packed_weights = (void*) ((uintptr_t) packed_weights + (channel_tile - cr_block_size) * sizeof(int8_t));
2888
0
        advance_x_y(h, &x, &y);
2889
0
      }
2890
0
    }
2891
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
2892
0
      x = processed_x;
2893
0
      y = processed_y;
2894
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
2895
0
      for (size_t j = 0; j < middle_pass_tile; j++) {
2896
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2897
0
          const int8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2898
0
          *((int8_t*) packed_weights) = kv;
2899
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int8_t));
2900
0
        }
2901
0
        packed_weights = (void*) ((uintptr_t) packed_weights + (channel_subtile - cr_block_size) * sizeof(int8_t));
2902
0
        advance_x_y(h, &x, &y);
2903
0
      }
2904
0
    }
2905
0
    processed_x = x;
2906
0
    processed_y = y;
2907
0
  }
2908
2909
  // Last pass.
2910
0
  {
2911
0
    assert(kernel_size <= last_pass_tile);
2912
0
    size_t cr_block_start = 0;
2913
0
    for (; cr_block_start < round_down_po2(c, channel_tile); cr_block_start += channel_tile) {
2914
      // Last pass does not pack to rounded c, since it handles remainder.
2915
0
      x = processed_x;
2916
0
      y = processed_y;
2917
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
2918
0
      for (size_t i = 0; i < kernel_size; i++) {
2919
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2920
0
          const int8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2921
0
          *((int8_t*) packed_weights) = kv;
2922
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int8_t));
2923
0
        }
2924
0
        packed_weights = (void*) ((uintptr_t) packed_weights + (channel_tile - cr_block_size) * sizeof(int8_t));
2925
0
        advance_x_y(h, &x, &y);
2926
0
      }
2927
      // Pad so that we can always read last_pass_tile weights in the last pass.
2928
0
      packed_weights = (void*) ((uintptr_t) packed_weights + (last_pass_tile - kernel_size) * channel_tile);
2929
0
      packed_weights = (void*) ((uintptr_t) packed_weights + per_tile_extra_bytes);
2930
0
    }
2931
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
2932
      // Last pass does not pack to rounded c, since it handles remainder.
2933
0
      x = processed_x;
2934
0
      y = processed_y;
2935
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
2936
0
      for (size_t i = 0; i < kernel_size; i++) {
2937
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2938
0
          const int8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
2939
0
          *((int8_t*) packed_weights) = kv;
2940
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int8_t));
2941
0
        }
2942
0
        packed_weights = (void*) ((uintptr_t) packed_weights + (channel_subtile - cr_block_size) * sizeof(int8_t));
2943
0
        advance_x_y(h, &x, &y);
2944
0
      }
2945
      // Pad so that we can always read last_pass_tile weights in the last pass.
2946
0
      packed_weights = (void*) ((uintptr_t) packed_weights + (last_pass_tile - kernel_size) * channel_subtile);
2947
0
      packed_weights = (void*) ((uintptr_t) packed_weights + per_subtile_extra_bytes);
2948
0
    }
2949
0
  }
2950
0
}
2951
2952
void xnn_pack_f32_dwconv_hwg_w(
2953
  size_t first_pass_tile,
2954
  size_t middle_pass_tile,
2955
  size_t last_pass_tile,
2956
  size_t h,
2957
  size_t w,
2958
  size_t c,
2959
  size_t channel_tile,
2960
  size_t channel_subtile,
2961
  size_t channel_round,
2962
  const float* k,
2963
  const float* b,
2964
  const void* scale,
2965
  float* packed_weights,
2966
  size_t per_tile_extra_bytes,
2967
  size_t per_subtile_extra_bytes,
2968
  const void* params)
2969
0
{
2970
0
  assert(k != NULL);
2971
0
  assert(packed_weights != NULL);
2972
0
  size_t kernel_size = h * w;
2973
0
  if (middle_pass_tile == 0) {
2974
    // Uni-pass DWCONV.
2975
0
    assert(last_pass_tile == 0);
2976
0
  } else {
2977
    // Multi-pass DWCONV.
2978
0
    assert(kernel_size > first_pass_tile);
2979
0
  }
2980
2981
  // Stores the x and y index that should be processed next.
2982
0
  size_t processed_x = 0;
2983
0
  size_t processed_y = 0;
2984
0
  size_t x = 0;
2985
0
  size_t y = 0;
2986
  // First and middle pass packs in sizes of channel_tile to tiled_c, then in sizes of channel_subtile.
2987
0
  const size_t tiled_c = round_down_po2(round_up_po2(c, channel_round), channel_tile);
2988
2989
  // Pack in blocks of channel_tile, then in blocks of channel_subtile.
2990
0
  {
2991
0
    size_t cr_block_start = 0;
2992
0
    for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
2993
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
2994
0
      if XNN_LIKELY(b != NULL) {
2995
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2996
0
          *packed_weights++ = b[cr_block_start + cr_block_offset];
2997
0
        }
2998
0
      } else {
2999
0
        size_t n = cr_block_size;
3000
0
        do {
3001
0
          *packed_weights++ = 0.0f;
3002
0
        } while (--n != 0);
3003
0
      }
3004
0
      packed_weights += channel_tile - cr_block_size;
3005
3006
0
      x = processed_x;
3007
0
      y = processed_y;
3008
      // kernel_size can be less than the first_pass_tile, in this case, pack up
3009
      // to the smaller of the two.
3010
0
      for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
3011
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3012
0
          const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3013
0
          *packed_weights++ = kv;
3014
0
        }
3015
0
        packed_weights += channel_tile - cr_block_size;
3016
0
        if (++y == h) {
3017
0
          y = 0;
3018
0
          x++;
3019
0
        }
3020
0
      }
3021
      // And make sure to skip weights if kernel_size < first_pass_tile.
3022
0
      packed_weights += doz(first_pass_tile, kernel_size) * cr_block_size;
3023
0
    }
3024
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
3025
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
3026
0
      if XNN_LIKELY(b != NULL) {
3027
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3028
0
          *packed_weights++ = b[cr_block_start + cr_block_offset];
3029
0
        }
3030
0
      } else {
3031
0
        size_t n = cr_block_size;
3032
0
        do {
3033
0
          *packed_weights++ = 0.0f;
3034
0
        } while (--n != 0);
3035
0
      }
3036
0
      packed_weights += channel_subtile - cr_block_size;
3037
3038
0
      x = processed_x;
3039
0
      y = processed_y;
3040
      // kernel_size can be less than the first_pass_tile, in this case, pack up
3041
      // to the smaller of the two.
3042
0
      for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
3043
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3044
0
          const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3045
0
          *packed_weights++ = kv;
3046
0
        }
3047
0
        packed_weights += channel_subtile - cr_block_size;
3048
0
        if (++y == h) {
3049
0
          y = 0;
3050
0
          x++;
3051
0
        }
3052
0
      }
3053
      // And make sure to skip weights if kernel_size < first_pass_tile.
3054
0
      packed_weights += doz(first_pass_tile, kernel_size) * cr_block_size;
3055
0
    }
3056
0
  }
3057
3058
0
  if (kernel_size <= first_pass_tile) {
3059
0
    return;
3060
0
  }
3061
3062
0
  kernel_size -= first_pass_tile;
3063
3064
0
  processed_x = x;
3065
0
  processed_y = y;
3066
3067
  // Middle pass. (kernel_size / middle_pass_tile) blocks, within each block is
3068
  // middle_pass_tile * cr weights.
3069
0
  for (; kernel_size > last_pass_tile; kernel_size -= middle_pass_tile) {
3070
0
    assert(kernel_size >= middle_pass_tile);
3071
0
    size_t cr_block_start = 0;
3072
0
    for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
3073
0
      x = processed_x;
3074
0
      y = processed_y;
3075
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
3076
0
      for (size_t j = 0; j < middle_pass_tile; j++) {
3077
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3078
0
          const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3079
0
          *packed_weights++ = kv;
3080
0
        }
3081
0
        packed_weights += channel_tile - cr_block_size;
3082
0
        if (++y == h) {
3083
0
          y = 0;
3084
0
          x++;
3085
0
        }
3086
0
      }
3087
0
    }
3088
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
3089
0
      x = processed_x;
3090
0
      y = processed_y;
3091
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
3092
0
      for (size_t j = 0; j < middle_pass_tile; j++) {
3093
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3094
0
          const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3095
0
          *packed_weights++ = kv;
3096
0
        }
3097
0
        packed_weights += channel_subtile - cr_block_size;
3098
0
        if (++y == h) {
3099
0
          y = 0;
3100
0
          x++;
3101
0
        }
3102
0
      }
3103
0
    }
3104
0
    processed_x = x;
3105
0
    processed_y = y;
3106
0
  }
3107
3108
  // Last pass.
3109
0
  {
3110
0
    assert(kernel_size <= last_pass_tile);
3111
0
    size_t cr_block_start = 0;
3112
0
    for (; cr_block_start < round_down_po2(c, channel_tile); cr_block_start += channel_tile) {
3113
0
      x = processed_x;
3114
0
      y = processed_y;
3115
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
3116
0
      for (size_t i = 0; i < kernel_size; i++) {
3117
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3118
0
          const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3119
0
          *packed_weights++ = kv;
3120
0
        }
3121
0
        packed_weights += channel_tile - cr_block_size;
3122
0
        if (++y == h) {
3123
0
          y = 0;
3124
0
          x++;
3125
0
        }
3126
0
      }
3127
      // Pad so that we can always read last_pass_tile weights in the last pass.
3128
0
      packed_weights += (last_pass_tile - kernel_size) * channel_tile;
3129
0
      packed_weights = (float*) ((uintptr_t) packed_weights + per_tile_extra_bytes);
3130
0
    }
3131
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
3132
0
      x = processed_x;
3133
0
      y = processed_y;
3134
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
3135
0
      for (size_t i = 0; i < kernel_size; i++) {
3136
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3137
0
          const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3138
0
          *packed_weights++ = kv;
3139
0
        }
3140
0
        packed_weights += channel_subtile - cr_block_size;
3141
0
        if (++y == h) {
3142
0
          y = 0;
3143
0
          x++;
3144
0
        }
3145
0
      }
3146
      // Pad so that we can always read last_pass_tile weights in the last pass.
3147
0
      packed_weights += (last_pass_tile - kernel_size) * channel_subtile;
3148
0
      packed_weights = (float*) ((uintptr_t) packed_weights + per_subtile_extra_bytes);
3149
0
    }
3150
0
  }
3151
0
}
3152
3153
void xnn_pack_f16_dwconv_hwg_w(
3154
  size_t first_pass_tile,
3155
  size_t middle_pass_tile,
3156
  size_t last_pass_tile,
3157
  size_t h,
3158
  size_t w,
3159
  size_t c,
3160
  size_t channel_tile,
3161
  size_t channel_subtile,
3162
  size_t channel_round,
3163
  const uint16_t* k,
3164
  const uint16_t* b,
3165
  const void* scale,
3166
  uint16_t* packed_weights,
3167
  size_t per_tile_extra_bytes,
3168
  size_t per_subtile_extra_bytes,
3169
  const void* params)
3170
0
{
3171
0
  assert(k != NULL);
3172
0
  assert(packed_weights != NULL);
3173
0
  size_t kernel_size = h * w;
3174
0
  if (middle_pass_tile == 0) {
3175
    // Uni-pass DWCONV.
3176
0
    assert(last_pass_tile == 0);
3177
0
  } else {
3178
    // Multi-pass DWCONV.
3179
0
    assert(kernel_size > first_pass_tile);
3180
0
  }
3181
3182
  // Stores the x and y index that should be processed next.
3183
0
  size_t processed_x = 0;
3184
0
  size_t processed_y = 0;
3185
0
  size_t x = 0;
3186
0
  size_t y = 0;
3187
  // First and middle pass packs in sizes of channel_tile to tiled_c, then in sizes of channel_subtile.
3188
0
  const size_t tiled_c = round_down_po2(round_up_po2(c, channel_round), channel_tile);
3189
3190
  // Pack in blocks of channel_tile, then in blocks of channel_subtile.
3191
0
  {
3192
0
    size_t cr_block_start = 0;
3193
0
    for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
3194
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
3195
0
      if XNN_LIKELY(b != NULL) {
3196
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3197
0
          *packed_weights++ = b[cr_block_start + cr_block_offset];
3198
0
        }
3199
0
      } else {
3200
0
        size_t n = cr_block_size;
3201
0
        do {
3202
0
          *packed_weights++ = 0.0f;
3203
0
        } while (--n != 0);
3204
0
      }
3205
0
      packed_weights += channel_tile - cr_block_size;
3206
3207
0
      x = processed_x;
3208
0
      y = processed_y;
3209
      // kernel_size can be less than the first_pass_tile, in this case, pack up
3210
      // to the smaller of the two.
3211
0
      for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
3212
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3213
0
          const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3214
0
          *packed_weights++ = kv;
3215
0
        }
3216
0
        packed_weights += channel_tile - cr_block_size;
3217
0
        if (++y == h) {
3218
0
          y = 0;
3219
0
          x++;
3220
0
        }
3221
0
      }
3222
      // And make sure to skip weights if kernel_size < first_pass_tile.
3223
0
      packed_weights += doz(first_pass_tile, kernel_size) * cr_block_size;
3224
0
    }
3225
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
3226
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
3227
0
      if XNN_LIKELY(b != NULL) {
3228
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3229
0
          *packed_weights++ = b[cr_block_start + cr_block_offset];
3230
0
        }
3231
0
      } else {
3232
0
        size_t n = cr_block_size;
3233
0
        do {
3234
0
          *packed_weights++ = 0.0f;
3235
0
        } while (--n != 0);
3236
0
      }
3237
0
      packed_weights += channel_subtile - cr_block_size;
3238
3239
0
      x = processed_x;
3240
0
      y = processed_y;
3241
      // kernel_size can be less than the first_pass_tile, in this case, pack up
3242
      // to the smaller of the two.
3243
0
      for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
3244
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3245
0
          const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3246
0
          *packed_weights++ = kv;
3247
0
        }
3248
0
        packed_weights += channel_subtile - cr_block_size;
3249
0
        if (++y == h) {
3250
0
          y = 0;
3251
0
          x++;
3252
0
        }
3253
0
      }
3254
      // And make sure to skip weights if kernel_size < first_pass_tile.
3255
0
      packed_weights += doz(first_pass_tile, kernel_size) * cr_block_size;
3256
0
    }
3257
0
  }
3258
3259
0
  if (kernel_size <= first_pass_tile) {
3260
0
    return;
3261
0
  }
3262
3263
0
  kernel_size -= first_pass_tile;
3264
3265
0
  processed_x = x;
3266
0
  processed_y = y;
3267
3268
  // Middle pass. (kernel_size / middle_pass_tile) blocks, within each block is
3269
  // middle_pass_tile * cr weights.
3270
0
  for (; kernel_size > last_pass_tile; kernel_size -= middle_pass_tile) {
3271
0
    assert(kernel_size >= middle_pass_tile);
3272
0
    size_t cr_block_start = 0;
3273
0
    for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
3274
0
      x = processed_x;
3275
0
      y = processed_y;
3276
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
3277
0
      for (size_t j = 0; j < middle_pass_tile; j++) {
3278
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3279
0
          const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3280
0
          *packed_weights++ = kv;
3281
0
        }
3282
0
        packed_weights += channel_tile - cr_block_size;
3283
0
        if (++y == h) {
3284
0
          y = 0;
3285
0
          x++;
3286
0
        }
3287
0
      }
3288
0
    }
3289
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
3290
0
      x = processed_x;
3291
0
      y = processed_y;
3292
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
3293
0
      for (size_t j = 0; j < middle_pass_tile; j++) {
3294
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3295
0
          const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3296
0
          *packed_weights++ = kv;
3297
0
        }
3298
0
        packed_weights += channel_subtile - cr_block_size;
3299
0
        if (++y == h) {
3300
0
          y = 0;
3301
0
          x++;
3302
0
        }
3303
0
      }
3304
0
    }
3305
0
    processed_x = x;
3306
0
    processed_y = y;
3307
0
  }
3308
3309
  // Last pass.
3310
0
  {
3311
0
    assert(kernel_size <= last_pass_tile);
3312
0
    size_t cr_block_start = 0;
3313
0
    for (; cr_block_start < round_down_po2(c, channel_tile); cr_block_start += channel_tile) {
3314
0
      x = processed_x;
3315
0
      y = processed_y;
3316
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
3317
0
      for (size_t i = 0; i < kernel_size; i++) {
3318
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3319
0
          const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3320
0
          *packed_weights++ = kv;
3321
0
        }
3322
0
        packed_weights += channel_tile - cr_block_size;
3323
0
        if (++y == h) {
3324
0
          y = 0;
3325
0
          x++;
3326
0
        }
3327
0
      }
3328
      // Pad so that we can always read last_pass_tile weights in the last pass.
3329
0
      packed_weights += (last_pass_tile - kernel_size) * channel_tile;
3330
0
      packed_weights = (uint16_t*) ((uintptr_t) packed_weights + per_tile_extra_bytes);
3331
0
    }
3332
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
3333
0
      x = processed_x;
3334
0
      y = processed_y;
3335
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
3336
0
      for (size_t i = 0; i < kernel_size; i++) {
3337
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3338
0
          const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3339
0
          *packed_weights++ = kv;
3340
0
        }
3341
0
        packed_weights += channel_subtile - cr_block_size;
3342
0
        if (++y == h) {
3343
0
          y = 0;
3344
0
          x++;
3345
0
        }
3346
0
      }
3347
      // Pad so that we can always read last_pass_tile weights in the last pass.
3348
0
      packed_weights += (last_pass_tile - kernel_size) * channel_subtile;
3349
0
      packed_weights = (uint16_t*) ((uintptr_t) packed_weights + per_subtile_extra_bytes);
3350
0
    }
3351
0
  }
3352
0
}
3353
3354
void xnn_pack_f32_to_f16_dwconv_hwg_w(
3355
  size_t first_pass_tile,
3356
  size_t middle_pass_tile,
3357
  size_t last_pass_tile,
3358
  size_t h,
3359
  size_t w,
3360
  size_t c,
3361
  size_t channel_tile,
3362
  size_t channel_subtile,
3363
  size_t channel_round,
3364
  const float* k,
3365
  const float* b,
3366
  const void* scale,
3367
  uint16_t* packed_weights,
3368
  size_t per_tile_extra_bytes,
3369
  size_t per_subtile_extra_bytes,
3370
  const void* params)
3371
0
{
3372
0
  assert(k != NULL);
3373
0
  assert(packed_weights != NULL);
3374
0
  size_t kernel_size = h * w;
3375
0
  if (middle_pass_tile == 0) {
3376
    // Uni-pass DWCONV.
3377
0
    assert(last_pass_tile == 0);
3378
0
  } else {
3379
    // Multi-pass DWCONV.
3380
0
    assert(kernel_size > first_pass_tile);
3381
0
  }
3382
3383
  // Stores the x and y index that should be processed next.
3384
0
  size_t processed_x = 0;
3385
0
  size_t processed_y = 0;
3386
0
  size_t x = 0;
3387
0
  size_t y = 0;
3388
  // First and middle pass packs in sizes of channel_tile to tiled_c, then in sizes of channel_subtile.
3389
0
  const size_t tiled_c = round_down_po2(round_up_po2(c, channel_round), channel_tile);
3390
3391
  // Pack in blocks of channel_tile, then in blocks of channel_subtile.
3392
0
  {
3393
0
    size_t cr_block_start = 0;
3394
0
    for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
3395
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
3396
0
      if XNN_LIKELY(b != NULL) {
3397
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3398
0
          *packed_weights++ = fp16_ieee_from_fp32_value(b[cr_block_start + cr_block_offset]);
3399
0
        }
3400
0
      } else {
3401
0
        size_t n = cr_block_size;
3402
0
        do {
3403
0
          *packed_weights++ = 0;
3404
0
        } while (--n != 0);
3405
0
      }
3406
0
      packed_weights += channel_tile - cr_block_size;
3407
3408
0
      x = processed_x;
3409
0
      y = processed_y;
3410
      // kernel_size can be less than the first_pass_tile, in this case, pack up
3411
      // to the smaller of the two.
3412
0
      for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
3413
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3414
0
          const uint16_t kv = fp16_ieee_from_fp32_value(k[(y * w + x) * c + (cr_block_start + cr_block_offset)]);
3415
0
          *packed_weights++ = kv;
3416
0
        }
3417
0
        packed_weights += channel_tile - cr_block_size;
3418
0
        if (++y == h) {
3419
0
          y = 0;
3420
0
          x++;
3421
0
        }
3422
0
      }
3423
      // And make sure to skip weights if kernel_size < first_pass_tile.
3424
0
      packed_weights += doz(first_pass_tile, kernel_size) * cr_block_size;
3425
0
    }
3426
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
3427
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
3428
0
      if XNN_LIKELY(b != NULL) {
3429
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3430
0
          *packed_weights++ = fp16_ieee_from_fp32_value(b[cr_block_start + cr_block_offset]);
3431
0
        }
3432
0
      } else {
3433
0
        size_t n = cr_block_size;
3434
0
        do {
3435
0
          *packed_weights++ = 0;
3436
0
        } while (--n != 0);
3437
0
      }
3438
0
      packed_weights += channel_subtile - cr_block_size;
3439
3440
0
      x = processed_x;
3441
0
      y = processed_y;
3442
      // kernel_size can be less than the first_pass_tile, in this case, pack up
3443
      // to the smaller of the two.
3444
0
      for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
3445
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3446
0
          const uint16_t kv = fp16_ieee_from_fp32_value(k[(y * w + x) * c + (cr_block_start + cr_block_offset)]);
3447
0
          *packed_weights++ = kv;
3448
0
        }
3449
0
        packed_weights += channel_subtile - cr_block_size;
3450
0
        if (++y == h) {
3451
0
          y = 0;
3452
0
          x++;
3453
0
        }
3454
0
      }
3455
      // And make sure to skip weights if kernel_size < first_pass_tile.
3456
0
      packed_weights += doz(first_pass_tile, kernel_size) * cr_block_size;
3457
0
    }
3458
0
  }
3459
3460
0
  if (kernel_size <= first_pass_tile) {
3461
0
    return;
3462
0
  }
3463
3464
0
  kernel_size -= first_pass_tile;
3465
3466
0
  processed_x = x;
3467
0
  processed_y = y;
3468
3469
  // Middle pass. (kernel_size / middle_pass_tile) blocks, within each block is
3470
  // middle_pass_tile * cr weights.
3471
0
  for (; kernel_size > last_pass_tile; kernel_size -= middle_pass_tile) {
3472
0
    assert(kernel_size >= middle_pass_tile);
3473
0
    size_t cr_block_start = 0;
3474
0
    for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
3475
0
      x = processed_x;
3476
0
      y = processed_y;
3477
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
3478
0
      for (size_t j = 0; j < middle_pass_tile; j++) {
3479
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3480
0
          const uint16_t kv = fp16_ieee_from_fp32_value(k[(y * w + x) * c + (cr_block_start + cr_block_offset)]);
3481
0
          *packed_weights++ = kv;
3482
0
        }
3483
0
        packed_weights += channel_tile - cr_block_size;
3484
0
        if (++y == h) {
3485
0
          y = 0;
3486
0
          x++;
3487
0
        }
3488
0
      }
3489
0
    }
3490
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
3491
0
      x = processed_x;
3492
0
      y = processed_y;
3493
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
3494
0
      for (size_t j = 0; j < middle_pass_tile; j++) {
3495
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3496
0
          const uint16_t kv = fp16_ieee_from_fp32_value(k[(y * w + x) * c + (cr_block_start + cr_block_offset)]);
3497
0
          *packed_weights++ = kv;
3498
0
        }
3499
0
        packed_weights += channel_subtile - cr_block_size;
3500
0
        if (++y == h) {
3501
0
          y = 0;
3502
0
          x++;
3503
0
        }
3504
0
      }
3505
0
    }
3506
0
    processed_x = x;
3507
0
    processed_y = y;
3508
0
  }
3509
3510
  // Last pass.
3511
0
  {
3512
0
    assert(kernel_size <= last_pass_tile);
3513
0
    size_t cr_block_start = 0;
3514
0
    for (; cr_block_start < round_down_po2(c, channel_tile); cr_block_start += channel_tile) {
3515
0
      x = processed_x;
3516
0
      y = processed_y;
3517
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
3518
0
      for (size_t i = 0; i < kernel_size; i++) {
3519
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3520
0
          const uint16_t kv = fp16_ieee_from_fp32_value(k[(y * w + x) * c + (cr_block_start + cr_block_offset)]);
3521
0
          *packed_weights++ = kv;
3522
0
        }
3523
0
        packed_weights += channel_tile - cr_block_size;
3524
0
        if (++y == h) {
3525
0
          y = 0;
3526
0
          x++;
3527
0
        }
3528
0
      }
3529
      // Pad so that we can always read last_pass_tile weights in the last pass.
3530
0
      packed_weights += (last_pass_tile - kernel_size) * channel_tile;
3531
0
      packed_weights = (uint16_t*) ((uintptr_t) packed_weights + per_tile_extra_bytes);
3532
0
    }
3533
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
3534
0
      x = processed_x;
3535
0
      y = processed_y;
3536
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
3537
0
      for (size_t i = 0; i < kernel_size; i++) {
3538
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3539
0
          const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3540
0
          *packed_weights++ = fp16_ieee_from_fp32_value(kv);
3541
0
        }
3542
0
        packed_weights += channel_subtile - cr_block_size;
3543
0
        if (++y == h) {
3544
0
          y = 0;
3545
0
          x++;
3546
0
        }
3547
0
      }
3548
      // Pad so that we can always read last_pass_tile weights in the last pass.
3549
0
      packed_weights += (last_pass_tile - kernel_size) * channel_subtile;
3550
0
      packed_weights = (uint16_t*) ((uintptr_t) packed_weights + per_subtile_extra_bytes);
3551
0
    }
3552
0
  }
3553
0
}
3554
3555
void xnn_pack_qu8_dwconv_hwg_w(
3556
  size_t first_pass_tile,
3557
  size_t middle_pass_tile,
3558
  size_t last_pass_tile,
3559
  size_t h,
3560
  size_t w,
3561
  size_t c,
3562
  size_t channel_tile,
3563
  size_t channel_subtile,
3564
  size_t channel_round,
3565
  const uint8_t* k,
3566
  const int32_t* b,
3567
  const void* scale,
3568
  void* packed_weights,
3569
  size_t per_tile_extra_bytes,
3570
  size_t per_subtile_extra_bytes,
3571
  const struct xnn_qu8_packing_params* params)
3572
0
{
3573
0
  assert(k != NULL);
3574
0
  assert(packed_weights != NULL);
3575
0
  size_t kernel_size = h * w;
3576
0
  if (middle_pass_tile == 0) {
3577
    // Uni-pass DWCONV.
3578
0
    assert(last_pass_tile == 0);
3579
0
  } else {
3580
    // Multi-pass DWCONV.
3581
0
    assert(kernel_size > first_pass_tile);
3582
0
  }
3583
3584
0
  const int32_t izp = (int32_t) params->input_zero_point;
3585
0
  const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
3586
  // Stores the x and y index that should be processed next.
3587
0
  size_t processed_x = 0;
3588
0
  size_t processed_y = 0;
3589
0
  size_t x = 0;
3590
0
  size_t y = 0;
3591
  // First and middle pass packs in sizes of channel_tile to tiled_c, then in sizes of channel_subtile.
3592
0
  const size_t tiled_c = round_down_po2(round_up_po2(c, channel_round), channel_tile);
3593
3594
  // Pack in blocks of channel_tile, then in blocks of channel_subtile.
3595
0
  {
3596
0
    size_t cr_block_start = 0;
3597
0
    for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
3598
0
      int32_t* packed_b = (int32_t*) packed_weights;
3599
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
3600
0
      if XNN_LIKELY(b != NULL) {
3601
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3602
0
          unaligned_store_s32(packed_weights, boff + b[cr_block_start + cr_block_offset]);
3603
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
3604
0
        }
3605
0
      } else {
3606
0
        size_t n = cr_block_size;
3607
0
        do {
3608
0
          unaligned_store_s32(packed_weights, boff);
3609
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
3610
0
        } while (--n != 0);
3611
0
      }
3612
3613
0
      packed_weights = (void*) ((uintptr_t) packed_weights + (channel_tile - cr_block_size) * sizeof(int32_t));
3614
3615
      // Biases need to be offset by all kernel values.
3616
0
      for (size_t x = 0; x < w; x++) {
3617
0
        for (size_t y = 0; y < h; y++) {
3618
0
          for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3619
0
            const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3620
0
            unaligned_indexed_store_s32(packed_b, cr_block_offset,
3621
0
                                        unaligned_indexed_load_s32(packed_b, cr_block_offset) - (int32_t) kv * izp);
3622
0
          }
3623
0
        }
3624
0
      }
3625
3626
0
      x = 0;
3627
0
      y = 0;
3628
      // kernel_size can be less than the first_pass_tile, in this case, pack up
3629
      // to the smaller of the two.
3630
0
      for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
3631
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3632
0
          const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3633
0
          *((uint8_t*) packed_weights) = kv;
3634
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(uint8_t));
3635
0
        }
3636
0
        packed_weights = (void*) ((uintptr_t) packed_weights + (channel_tile - cr_block_size) * sizeof(uint8_t));
3637
0
        advance_x_y(h, &x, &y);
3638
0
      }
3639
      // And make sure to skip weights if kernel_size < first_pass_tile.
3640
0
      packed_weights = (void*) ((uintptr_t) packed_weights + doz(first_pass_tile, kernel_size) * cr_block_size);
3641
0
    }
3642
3643
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
3644
0
      int32_t* packed_b = (int32_t*) packed_weights;
3645
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
3646
0
      if XNN_LIKELY(b != NULL) {
3647
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3648
0
          unaligned_store_s32(packed_weights, boff + b[cr_block_start + cr_block_offset]);
3649
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
3650
0
        }
3651
0
      } else {
3652
0
        size_t n = cr_block_size;
3653
0
        do {
3654
0
          unaligned_store_s32(packed_weights, boff);
3655
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
3656
0
        } while (--n != 0);
3657
0
      }
3658
0
      packed_weights = (void*) ((uintptr_t) packed_weights + (channel_subtile - cr_block_size) * sizeof(int32_t));
3659
3660
      // Biases need to be offset by all kernel values.
3661
0
      for (size_t x = 0; x < w; x++) {
3662
0
        for (size_t y = 0; y < h; y++) {
3663
0
          for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3664
0
            const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3665
0
            unaligned_indexed_store_s32(packed_b, cr_block_offset,
3666
0
                                        unaligned_indexed_load_s32(packed_b, cr_block_offset) - (int32_t) kv * izp);
3667
0
          }
3668
0
        }
3669
0
      }
3670
3671
0
      x = 0;
3672
0
      y = 0;
3673
      // kernel_size can be less than the first_pass_tile, in this case, pack up
3674
      // to the smaller of the two.
3675
0
      for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
3676
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3677
0
          const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3678
0
          *((uint8_t*) packed_weights) = kv;
3679
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(uint8_t));
3680
0
        }
3681
0
        packed_weights = (void*) ((uintptr_t) packed_weights + (channel_subtile - cr_block_size) * sizeof(uint8_t));
3682
0
        advance_x_y(h, &x, &y);
3683
0
      }
3684
      // And make sure to skip weights if kernel_size < first_pass_tile.
3685
0
      packed_weights = (void*) ((uintptr_t) packed_weights + doz(first_pass_tile, kernel_size) * cr_block_size);
3686
0
    }
3687
0
  }
3688
3689
0
  if (kernel_size <= first_pass_tile) {
3690
0
    return;
3691
0
  }
3692
3693
0
  kernel_size -= first_pass_tile;
3694
3695
0
  processed_x = x;
3696
0
  processed_y = y;
3697
3698
  // Middle pass. (kernel_size / middle_pass_tile) blocks, within each block is
3699
  // middle_pass_tile * cr weights.
3700
0
  for (; kernel_size > last_pass_tile; kernel_size -= middle_pass_tile) {
3701
0
    assert(kernel_size >= middle_pass_tile);
3702
0
    size_t cr_block_start = 0;
3703
0
    for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
3704
0
      x = processed_x;
3705
0
      y = processed_y;
3706
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
3707
0
      for (size_t j = 0; j < middle_pass_tile; j++) {
3708
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3709
0
          const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3710
0
          *((uint8_t*) packed_weights) = kv;
3711
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(uint8_t));
3712
0
        }
3713
0
        packed_weights = (void*) ((uintptr_t) packed_weights + (channel_tile - cr_block_size) * sizeof(uint8_t));
3714
0
        advance_x_y(h, &x, &y);
3715
0
      }
3716
0
    }
3717
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
3718
0
      x = processed_x;
3719
0
      y = processed_y;
3720
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
3721
0
      for (size_t j = 0; j < middle_pass_tile; j++) {
3722
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3723
0
          const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3724
0
          *((uint8_t*) packed_weights) = kv;
3725
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(uint8_t));
3726
0
        }
3727
0
        packed_weights = (void*) ((uintptr_t) packed_weights + (channel_subtile - cr_block_size) * sizeof(uint8_t));
3728
0
        advance_x_y(h, &x, &y);
3729
0
      }
3730
0
    }
3731
0
    processed_x = x;
3732
0
    processed_y = y;
3733
0
  }
3734
3735
  // Last pass.
3736
0
  {
3737
0
    assert(kernel_size <= last_pass_tile);
3738
0
    size_t cr_block_start = 0;
3739
0
    for (; cr_block_start < round_down_po2(c, channel_tile); cr_block_start += channel_tile) {
3740
      // Last pass does not pack to rounded c, since it handles remainder.
3741
0
      x = processed_x;
3742
0
      y = processed_y;
3743
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
3744
0
      for (size_t i = 0; i < kernel_size; i++) {
3745
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3746
0
          const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3747
0
          *((uint8_t*) packed_weights) = kv;
3748
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(uint8_t));
3749
0
        }
3750
0
        packed_weights = (void*) ((uintptr_t) packed_weights + (channel_tile - cr_block_size) * sizeof(uint8_t));
3751
0
        advance_x_y(h, &x, &y);
3752
0
      }
3753
      // Pad so that we can always read last_pass_tile weights in the last pass.
3754
0
      packed_weights = (void*) ((uintptr_t) packed_weights + (last_pass_tile - kernel_size) * channel_tile);
3755
0
      packed_weights = (void*) ((uintptr_t) packed_weights + per_tile_extra_bytes);
3756
0
    }
3757
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
3758
      // Last pass does not pack to rounded c, since it handles remainder.
3759
0
      x = processed_x;
3760
0
      y = processed_y;
3761
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
3762
0
      for (size_t i = 0; i < kernel_size; i++) {
3763
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3764
0
          const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3765
0
          *((uint8_t*) packed_weights) = kv;
3766
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(uint8_t));
3767
0
        }
3768
0
        packed_weights = (void*) ((uintptr_t) packed_weights + (channel_subtile - cr_block_size) * sizeof(uint8_t));
3769
0
        advance_x_y(h, &x, &y);
3770
0
      }
3771
      // Pad so that we can always read last_pass_tile weights in the last pass.
3772
0
      packed_weights = (void*) ((uintptr_t) packed_weights + (last_pass_tile - kernel_size) * channel_subtile);
3773
0
      packed_weights = (void*) ((uintptr_t) packed_weights + per_subtile_extra_bytes);
3774
0
    }
3775
0
  }
3776
0
}
3777
3778
void xnn_pack_qs8_dwconv_hwg_w(
3779
  size_t first_pass_tile,
3780
  size_t middle_pass_tile,
3781
  size_t last_pass_tile,
3782
  size_t h,
3783
  size_t w,
3784
  size_t c,
3785
  size_t channel_tile,
3786
  size_t channel_subtile,
3787
  size_t channel_round,
3788
  const int8_t* k,
3789
  const int32_t* b,
3790
  const float* scale,
3791
  void* packed_weights,
3792
  size_t per_tile_extra_bytes,
3793
  size_t per_subtile_extra_bytes,
3794
  const struct xnn_qs8_packing_params* params)
3795
0
{
3796
0
  assert(k != NULL);
3797
0
  assert(packed_weights != NULL);
3798
0
  size_t kernel_size = h * w;
3799
0
  if (middle_pass_tile == 0) {
3800
    // Uni-pass DWCONV.
3801
0
    assert(last_pass_tile == 0);
3802
0
  } else {
3803
    // Multi-pass DWCONV.
3804
0
    assert(kernel_size > first_pass_tile);
3805
0
  }
3806
3807
0
  const uint32_t izp = (uint32_t) params->input_zero_point;
3808
  // Stores the x and y index that should be processed next.
3809
0
  size_t processed_x = 0;
3810
0
  size_t processed_y = 0;
3811
0
  size_t x = 0;
3812
0
  size_t y = 0;
3813
  // First and middle pass packs in sizes of channel_tile to tiled_c, then in sizes of channel_subtile.
3814
0
  const size_t tiled_c = round_down_po2(round_up_po2(c, channel_round), channel_tile);
3815
3816
  // Pack in blocks of channel_tile, then in blocks of channel_subtile.
3817
0
  {
3818
0
    size_t cr_block_start = 0;
3819
0
    for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
3820
0
      int32_t* packed_b = (int32_t*) packed_weights;
3821
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
3822
0
      if XNN_LIKELY(b != NULL) {
3823
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3824
0
          unaligned_store_s32(packed_weights, b[cr_block_start + cr_block_offset]);
3825
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
3826
0
        }
3827
0
      } else {
3828
0
        size_t n = cr_block_size;
3829
0
        do {
3830
0
          unaligned_store_s32(packed_weights, 0);
3831
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
3832
0
        } while (--n != 0);
3833
0
      }
3834
3835
0
      packed_weights = (void*) ((uintptr_t) packed_weights + (channel_tile - cr_block_size) * sizeof(int32_t));
3836
3837
      // Biases need to be offset by all kernel values.
3838
0
      for (size_t x = 0; x < w; x++) {
3839
0
        for (size_t y = 0; y < h; y++) {
3840
0
          for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3841
0
            const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3842
0
            unaligned_indexed_store_u32(packed_b, cr_block_offset,
3843
0
                                        unaligned_indexed_load_u32(packed_b, cr_block_offset) - (uint32_t) kv * izp);
3844
0
          }
3845
0
        }
3846
0
      }
3847
3848
0
      x = 0;
3849
0
      y = 0;
3850
      // kernel_size can be less than the first_pass_tile, in this case, pack up
3851
      // to the smaller of the two.
3852
0
      for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
3853
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3854
0
          const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3855
0
          *((int8_t*) packed_weights) = kv;
3856
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int8_t));
3857
0
        }
3858
0
        packed_weights = (void*) ((uintptr_t) packed_weights + (channel_tile - cr_block_size) * sizeof(int8_t));
3859
0
        advance_x_y(h, &x, &y);
3860
0
      }
3861
      // And make sure to skip weights if kernel_size < first_pass_tile.
3862
0
      packed_weights = (void*) ((uintptr_t) packed_weights + doz(first_pass_tile, kernel_size) * cr_block_size);
3863
      // If unipass and QC8, we need to pack extra bytes for scale values here.
3864
0
      if (middle_pass_tile == 0) {
3865
0
        packed_weights = (void*) ((uintptr_t) packed_weights + per_tile_extra_bytes);
3866
0
      }
3867
0
    }
3868
3869
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
3870
0
      int32_t* packed_b = (int32_t*) packed_weights;
3871
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
3872
0
      if XNN_LIKELY(b != NULL) {
3873
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3874
0
          unaligned_store_s32(packed_weights, b[cr_block_start + cr_block_offset]);
3875
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
3876
0
        }
3877
0
      } else {
3878
0
        size_t n = cr_block_size;
3879
0
        do {
3880
0
          unaligned_store_s32(packed_weights, 0);
3881
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
3882
0
        } while (--n != 0);
3883
0
      }
3884
0
      packed_weights = (void*) ((uintptr_t) packed_weights + (channel_subtile - cr_block_size) * sizeof(int32_t));
3885
3886
      // Biases need to be offset by all kernel values.
3887
0
      for (size_t x = 0; x < w; x++) {
3888
0
        for (size_t y = 0; y < h; y++) {
3889
0
          for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3890
0
            const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3891
0
            unaligned_indexed_store_u32(packed_b, cr_block_offset,
3892
0
                                        unaligned_indexed_load_u32(packed_b, cr_block_offset) - (uint32_t) kv * izp);
3893
0
          }
3894
0
        }
3895
0
      }
3896
3897
0
      x = 0;
3898
0
      y = 0;
3899
      // kernel_size can be less than the first_pass_tile, in this case, pack up
3900
      // to the smaller of the two.
3901
0
      for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
3902
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3903
0
          const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3904
0
          *((int8_t*) packed_weights) = kv;
3905
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int8_t));
3906
0
        }
3907
0
        packed_weights = (void*) ((uintptr_t) packed_weights + (channel_subtile - cr_block_size) * sizeof(int8_t));
3908
0
        advance_x_y(h, &x, &y);
3909
0
      }
3910
      // And make sure to skip weights if kernel_size < first_pass_tile.
3911
0
      packed_weights = (void*) ((uintptr_t) packed_weights + doz(first_pass_tile, kernel_size) * cr_block_size);
3912
      // If unipass and QC8, we need to pack extra bytes for scale values here.
3913
0
      if (middle_pass_tile == 0) {
3914
0
        packed_weights = (void*) ((uintptr_t) packed_weights + per_subtile_extra_bytes);
3915
0
      }
3916
0
    }
3917
0
  }
3918
3919
0
  if (kernel_size <= first_pass_tile) {
3920
0
    return;
3921
0
  }
3922
3923
0
  kernel_size -= first_pass_tile;
3924
3925
0
  processed_x = x;
3926
0
  processed_y = y;
3927
3928
  // Middle pass. (kernel_size / middle_pass_tile) blocks, within each block is
3929
  // middle_pass_tile * cr weights.
3930
0
  for (; kernel_size > last_pass_tile; kernel_size -= middle_pass_tile) {
3931
0
    assert(kernel_size >= middle_pass_tile);
3932
0
    size_t cr_block_start = 0;
3933
0
    for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
3934
0
      x = processed_x;
3935
0
      y = processed_y;
3936
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
3937
0
      for (size_t j = 0; j < middle_pass_tile; j++) {
3938
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3939
0
          const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3940
0
          *((int8_t*) packed_weights) = kv;
3941
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int8_t));
3942
0
        }
3943
0
        packed_weights = (void*) ((uintptr_t) packed_weights + (channel_tile - cr_block_size) * sizeof(int8_t));
3944
0
        advance_x_y(h, &x, &y);
3945
0
      }
3946
0
    }
3947
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
3948
0
      x = processed_x;
3949
0
      y = processed_y;
3950
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
3951
0
      for (size_t j = 0; j < middle_pass_tile; j++) {
3952
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3953
0
          const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3954
0
          *((int8_t*) packed_weights) = kv;
3955
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int8_t));
3956
0
        }
3957
0
        packed_weights = (void*) ((uintptr_t) packed_weights + (channel_subtile - cr_block_size) * sizeof(int8_t));
3958
0
        advance_x_y(h, &x, &y);
3959
0
      }
3960
0
    }
3961
0
    processed_x = x;
3962
0
    processed_y = y;
3963
0
  }
3964
3965
  // Last pass.
3966
0
  {
3967
0
    assert(kernel_size <= last_pass_tile);
3968
0
    size_t cr_block_start = 0;
3969
0
    for (; cr_block_start < round_down_po2(c, channel_tile); cr_block_start += channel_tile) {
3970
      // Last pass does not pack to rounded c, since it handles remainder.
3971
0
      x = processed_x;
3972
0
      y = processed_y;
3973
0
      const size_t cr_block_size = min(c - cr_block_start, channel_tile);
3974
0
      for (size_t i = 0; i < kernel_size; i++) {
3975
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3976
0
          const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3977
0
          *((int8_t*) packed_weights) = kv;
3978
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int8_t));
3979
0
        }
3980
0
        packed_weights = (void*) ((uintptr_t) packed_weights + (channel_tile - cr_block_size) * sizeof(int8_t));
3981
0
        advance_x_y(h, &x, &y);
3982
0
      }
3983
      // Pad so that we can always read last_pass_tile weights in the last pass.
3984
0
      packed_weights = (void*) ((uintptr_t) packed_weights + (last_pass_tile - kernel_size) * channel_tile);
3985
0
      packed_weights = (void*) ((uintptr_t) packed_weights + per_tile_extra_bytes);
3986
0
    }
3987
0
    for (; cr_block_start < c; cr_block_start += channel_subtile) {
3988
      // Last pass does not pack to rounded c, since it handles remainder.
3989
0
      x = processed_x;
3990
0
      y = processed_y;
3991
0
      const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
3992
0
      for (size_t i = 0; i < kernel_size; i++) {
3993
0
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
3994
0
          const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
3995
0
          *((int8_t*) packed_weights) = kv;
3996
0
          packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int8_t));
3997
0
        }
3998
0
        packed_weights = (void*) ((uintptr_t) packed_weights + (channel_subtile - cr_block_size) * sizeof(int8_t));
3999
0
        advance_x_y(h, &x, &y);
4000
0
      }
4001
      // Pad so that we can always read last_pass_tile weights in the last pass.
4002
0
      packed_weights = (void*) ((uintptr_t) packed_weights + (last_pass_tile - kernel_size) * channel_subtile);
4003
0
      packed_weights = (void*) ((uintptr_t) packed_weights + per_subtile_extra_bytes);
4004
0
    }
4005
0
  }
4006
0
}
4007
4008
// Packs float weights, indexed as k[group][n][kc_idx] with nc * kc elements per group, into
// packed_weights for a GEMMINC microkernel (no bias is packed).
//
// Output rows are processed in blocks of nr and the kc dimension in slices of kr, up to kc rounded
// up to sr * kr. Within each sr * kr window the slice read for row n is rotated by n * kr.
// Elements whose source index falls at or beyond kc, and the rows of a partial nr block, are
// skipped in the output without being written. `params` is not read.
void xnn_pack_f32_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  float* packed_weights,
  const void* params)
{
  assert(g != 0);
  assert(nr >= sr);
  assert(k != NULL);
  assert(packed_weights != NULL);

  const size_t skr = sr * kr;
  const size_t skr_mask = skr - 1;
  const size_t padded_kc = round_up_po2(kc, skr);
  do {
    for (size_t n_start = 0; n_start < nc; n_start += nr) {
      const size_t n_count = min(nc - n_start, nr);

      for (size_t k_start = 0; k_start < padded_kc; k_start += kr) {
        // Start of the sr * kr window this slice belongs to.
        const size_t window_start = round_down_po2(k_start, skr);
        for (size_t n = 0; n < n_count; n++) {
          const size_t row_offset = (n_start + n) * kc;
          // Per-row rotation inside the window.
          const size_t rotation = k_start + n * kr;
          for (size_t i = 0; i < kr; i++) {
            const size_t kc_idx = window_start + ((rotation + i) & skr_mask);
            if (kc_idx < kc) {
              packed_weights[i] = k[row_offset + kc_idx];
            }
          }
          packed_weights += kr;
        }
        // Leave room for the missing rows of a partial block.
        packed_weights += (nr - n_count) * kr;
      }
    }
    k += nc * kc;
  } while (--g != 0);
}
4045
4046
// Packs 16-bit weights, indexed as k[group][n][kc_idx] with nc * kc elements per group, into
// packed_weights for a GEMMINC microkernel (no bias is packed). Values are copied bit-for-bit.
//
// Output rows are processed in blocks of nr and the kc dimension in slices of kr, up to kc rounded
// up to sr * kr. Within each sr * kr window the slice read for row n is rotated by n * kr.
// Elements whose source index falls at or beyond kc, and the rows of a partial nr block, are
// skipped in the output without being written. `params` is not read.
void xnn_pack_f16_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  uint16_t* packed_weights,
  const void* params)
{
  assert(g != 0);
  assert(nr >= sr);
  assert(k != NULL);
  assert(packed_weights != NULL);

  const size_t skr = sr * kr;
  const size_t skr_mask = skr - 1;
  const size_t padded_kc = round_up_po2(kc, skr);
  do {
    for (size_t n_start = 0; n_start < nc; n_start += nr) {
      const size_t n_count = min(nc - n_start, nr);

      for (size_t k_start = 0; k_start < padded_kc; k_start += kr) {
        // Start of the sr * kr window this slice belongs to.
        const size_t window_start = round_down_po2(k_start, skr);
        for (size_t n = 0; n < n_count; n++) {
          const size_t row_offset = (n_start + n) * kc;
          // Per-row rotation inside the window.
          const size_t rotation = k_start + n * kr;
          for (size_t i = 0; i < kr; i++) {
            const size_t kc_idx = window_start + ((rotation + i) & skr_mask);
            if (kc_idx < kc) {
              packed_weights[i] = k[row_offset + kc_idx];
            }
          }
          packed_weights += kr;
        }
        // Leave room for the missing rows of a partial block.
        packed_weights += (nr - n_count) * kr;
      }
    }
    k += nc * kc;
  } while (--g != 0);
}
4083
4084
void xnn_pack_f32_dconv_oki_w(
4085
  size_t nc,
4086
  size_t kc,
4087
  size_t nr,
4088
  size_t kh,
4089
  size_t kw,
4090
  const float* k,
4091
  const float* b,
4092
  float* packed_weights,
4093
  const void* params)
4094
0
{
4095
0
  assert(k != NULL);
4096
0
  assert(packed_weights != NULL);
4097
4098
0
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
4099
0
    const size_t nr_block_size = min(nc - nr_block_start, nr);
4100
0
    if XNN_LIKELY(b != NULL) {
4101
0
      for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
4102
0
        *packed_weights++ = b[min(nr_block_offset, nr_block_size - 1)];
4103
0
      }
4104
0
    } else {
4105
0
      size_t n = nr;
4106
0
      do {
4107
0
        *packed_weights++ = 0.0f;
4108
0
      } while (--n != 0);
4109
0
    }
4110
4111
0
    for (size_t kx = 0; kx < kw; kx++) {
4112
0
      for (size_t c = 0; c < kc; c++) {
4113
0
        for (size_t ky = 0; ky < kh; ky++) {
4114
0
          for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
4115
0
            *packed_weights++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
4116
0
          }
4117
0
        }
4118
0
      }
4119
0
    }
4120
0
    if XNN_UNPREDICTABLE(b != NULL) {
4121
0
      b += nr;
4122
0
    }
4123
0
  }
4124
0
}
4125
4126
void xnn_pack_f32_to_f16_dconv_oki_w(
4127
  size_t nc,
4128
  size_t kc,
4129
  size_t nr,
4130
  size_t kh,
4131
  size_t kw,
4132
  const float* k,
4133
  const float* b,
4134
  uint16_t* packed_weights,
4135
  const void* params)
4136
0
{
4137
0
  assert(k != NULL);
4138
0
  assert(packed_weights != NULL);
4139
4140
0
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
4141
0
    const size_t nr_block_size = min(nc - nr_block_start, nr);
4142
0
    if XNN_LIKELY(b != NULL) {
4143
0
      for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
4144
0
        *packed_weights++ = fp16_ieee_from_fp32_value(b[min(nr_block_offset, nr_block_size - 1)]);
4145
0
      }
4146
0
    } else {
4147
0
      size_t n = nr;
4148
0
      do {
4149
0
        *packed_weights++ = 0;
4150
0
      } while (--n != 0);
4151
0
    }
4152
4153
0
    for (size_t kx = 0; kx < kw; kx++) {
4154
0
      for (size_t c = 0; c < kc; c++) {
4155
0
        for (size_t ky = 0; ky < kh; ky++) {
4156
0
          for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
4157
0
            *packed_weights++ = fp16_ieee_from_fp32_value(k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c]);
4158
0
          }
4159
0
        }
4160
0
      }
4161
0
    }
4162
0
    if XNN_UNPREDICTABLE(b != NULL) {
4163
0
      b += nr;
4164
0
    }
4165
0
  }
4166
0
}
4167
4168
void xnn_pack_f16_dconv_oki_w(
4169
  size_t nc,
4170
  size_t kc,
4171
  size_t nr,
4172
  size_t kh,
4173
  size_t kw,
4174
  const uint16_t* k,
4175
  const uint16_t* b,
4176
  uint16_t* packed_weights,
4177
  const void* params)
4178
0
{
4179
0
  assert(k != NULL);
4180
0
  assert(packed_weights != NULL);
4181
4182
0
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
4183
0
    const size_t nr_block_size = min(nc - nr_block_start, nr);
4184
0
    if XNN_LIKELY(b != NULL) {
4185
0
      for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
4186
0
        *packed_weights++ = b[min(nr_block_offset, nr_block_size - 1)];
4187
0
      }
4188
0
    } else {
4189
0
      size_t n = nr;
4190
0
      do {
4191
0
        *packed_weights++ = 0;
4192
0
      } while (--n != 0);
4193
0
    }
4194
4195
0
    for (size_t kx = 0; kx < kw; kx++) {
4196
0
      for (size_t c = 0; c < kc; c++) {
4197
0
        for (size_t ky = 0; ky < kh; ky++) {
4198
0
          for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
4199
0
            *packed_weights++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
4200
0
          }
4201
0
        }
4202
0
      }
4203
0
    }
4204
0
    if XNN_UNPREDICTABLE(b != NULL) {
4205
0
      b += nr;
4206
0
    }
4207
0
  }
4208
0
}
4209
4210
void xnn_pack_f32_chw_dwconv_ghw_w(
4211
  size_t kernel_size,
4212
  size_t groups,
4213
  const float* k,
4214
  const float* b,
4215
  float* packed_weights,
4216
  const void* params)
4217
0
{
4218
0
  assert(k != NULL);
4219
0
  assert(packed_weights != NULL);
4220
4221
0
  for (size_t g = 0; g < groups; g++) {
4222
0
    if XNN_LIKELY(b != NULL) {
4223
0
      *packed_weights = *b++;
4224
0
    } else {
4225
0
      *packed_weights = 0.0f;
4226
0
    }
4227
0
    packed_weights += 1;
4228
0
    for (size_t i = 0; i < kernel_size; i++) {
4229
0
      *packed_weights++ = k[g * kernel_size + i];
4230
0
    }
4231
0
  }
4232
0
}
4233
4234
void xnn_pack_f32_to_f16_chw_dwconv_ghw_w(
4235
  size_t kernel_size,
4236
  size_t groups,
4237
  const float* k,
4238
  const float* b,
4239
  uint16_t* packed_weights,
4240
  const void* params)
4241
0
{
4242
0
  assert(k != NULL);
4243
0
  assert(packed_weights != NULL);
4244
4245
0
  for (size_t g = 0; g < groups; g++) {
4246
0
    if XNN_LIKELY(b != NULL) {
4247
0
      *packed_weights = fp16_ieee_from_fp32_value(*b++);
4248
0
    } else {
4249
0
      *packed_weights = 0;
4250
0
    }
4251
0
    packed_weights += 1;
4252
0
    for (size_t i = 0; i < kernel_size; i++) {
4253
0
      *packed_weights++ = fp16_ieee_from_fp32_value(k[g * kernel_size + i]);
4254
0
    }
4255
0
  }
4256
0
}
4257
4258
void xnn_pack_f16_chw_dwconv_ghw_w(
4259
  size_t kernel_size,
4260
  size_t groups,
4261
  const uint16_t* k,
4262
  const uint16_t* b,
4263
  uint16_t* packed_weights,
4264
  const void* params)
4265
0
{
4266
0
  assert(k != NULL);
4267
0
  assert(packed_weights != NULL);
4268
4269
0
  for (size_t g = 0; g < groups; g++) {
4270
0
    if XNN_LIKELY(b != NULL) {
4271
0
      *packed_weights = *b++;
4272
0
    } else {
4273
0
      *packed_weights = 0;
4274
0
    }
4275
0
    packed_weights += 1;
4276
0
    for (size_t i = 0; i < kernel_size; i++) {
4277
0
      *packed_weights++ = k[g * kernel_size + i];
4278
0
    }
4279
0
  }
4280
0
}
4281
4282
void xnn_pack_f32_chw_dwconv_hwg_w(
4283
  size_t kernel_size,
4284
  size_t groups,
4285
  const float* k,
4286
  const float* b,
4287
  float* packed_weights,
4288
  const void* params)
4289
0
{
4290
0
  assert(k != NULL);
4291
0
  assert(packed_weights != NULL);
4292
4293
0
  for (size_t g = 0; g < groups; g++) {
4294
0
    if XNN_LIKELY(b != NULL) {
4295
0
      *packed_weights = *b++;
4296
0
    } else {
4297
0
      *packed_weights = 0.0f;
4298
0
    }
4299
0
    packed_weights += 1;
4300
0
    for (size_t i = 0; i < kernel_size; i++) {
4301
0
      *packed_weights++ = k[i * groups + g];
4302
0
    }
4303
0
  }
4304
0
}
4305
4306
void xnn_pack_f16_chw_dwconv_hwg_w(
4307
  size_t kernel_size,
4308
  size_t groups,
4309
  const uint16_t* k,
4310
  const uint16_t* b,
4311
  uint16_t* packed_weights,
4312
  const void* params)
4313
0
{
4314
0
  assert(k != NULL);
4315
0
  assert(packed_weights != NULL);
4316
4317
0
  for (size_t g = 0; g < groups; g++) {
4318
0
    if XNN_LIKELY(b != NULL) {
4319
0
      *packed_weights = *b++;
4320
0
    } else {
4321
0
      *packed_weights = 0;
4322
0
    }
4323
0
    packed_weights += 1;
4324
0
    for (size_t i = 0; i < kernel_size; i++) {
4325
0
      *packed_weights++ = k[i * groups + g];
4326
0
    }
4327
0
  }
4328
0
}
4329
4330
void xnn_pack_f32_to_f16_chw_dwconv_hwg_w(
4331
  size_t kernel_size,
4332
  size_t groups,
4333
  const float* k,
4334
  const float* b,
4335
  uint16_t* packed_weights,
4336
  const void* params)
4337
0
{
4338
0
  assert(k != NULL);
4339
0
  assert(packed_weights != NULL);
4340
4341
0
  for (size_t g = 0; g < groups; g++) {
4342
0
    if XNN_LIKELY(b != NULL) {
4343
0
      *packed_weights = fp16_ieee_from_fp32_value(*b++);
4344
0
    } else {
4345
0
      *packed_weights = 0;
4346
0
    }
4347
0
    packed_weights += 1;
4348
0
    for (size_t i = 0; i < kernel_size; i++) {
4349
0
      *packed_weights++ = fp16_ieee_from_fp32_value(k[i * groups + g]);
4350
0
    }
4351
0
  }
4352
0
}
4353
4354
4355
void xnn_pack_f32_vmulcaddc_w(
4356
  size_t c,
4357
  size_t cr,
4358
  const float* s,
4359
  const float* b,
4360
  float* packed_weights,
4361
  const void* params)
4362
0
{
4363
0
  assert(s != NULL);
4364
0
  assert(packed_weights != NULL);
4365
4366
0
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
4367
0
    const size_t cr_block_size = min(c - cr_block_start, cr);
4368
0
    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
4369
0
      *packed_weights++ = s[cr_block_start + cr_block_offset];
4370
0
    }
4371
0
    packed_weights += cr - cr_block_size;
4372
0
    if XNN_LIKELY(b != NULL) {
4373
0
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
4374
0
        *packed_weights++ = b[cr_block_start + cr_block_offset];
4375
0
      }
4376
0
    } else {
4377
0
      size_t n = cr_block_size;
4378
0
      do {
4379
0
        *packed_weights++ = 0.0f;
4380
0
      } while (--n != 0);
4381
0
    }
4382
0
    packed_weights += cr - cr_block_size;
4383
0
  }
4384
0
}
4385
4386
void xnn_pack_f16_vmulcaddc_w(
4387
  size_t c,
4388
  size_t cr,
4389
  const uint16_t* s,
4390
  const uint16_t* b,
4391
  uint16_t* packed_weights,
4392
  const void* params)
4393
0
{
4394
0
  assert(s != NULL);
4395
0
  assert(packed_weights != NULL);
4396
4397
0
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
4398
0
    const size_t cr_block_size = min(c - cr_block_start, cr);
4399
0
    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
4400
0
      *packed_weights++ = s[cr_block_start + cr_block_offset];
4401
0
    }
4402
0
    packed_weights += cr - cr_block_size;
4403
0
    if XNN_LIKELY(b != NULL) {
4404
0
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
4405
0
        *packed_weights++ = b[cr_block_start + cr_block_offset];
4406
0
      }
4407
0
    } else {
4408
0
      size_t n = cr_block_size;
4409
0
      do {
4410
0
        *packed_weights++ = 0;
4411
0
      } while (--n != 0);
4412
0
    }
4413
0
    packed_weights += cr - cr_block_size;
4414
0
  }
4415
0
}
4416
4417
void xnn_pack_f32_to_f16_vmulcaddc_w(
4418
  size_t c,
4419
  size_t cr,
4420
  const float* s,
4421
  const float* b,
4422
  uint16_t* packed_weights,
4423
  const void* params)
4424
0
{
4425
0
  assert(s != NULL);
4426
0
  assert(packed_weights != NULL);
4427
4428
0
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
4429
0
    const size_t cr_block_size = min(c - cr_block_start, cr);
4430
0
    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
4431
0
      *packed_weights++ = fp16_ieee_from_fp32_value(s[cr_block_start + cr_block_offset]);
4432
0
    }
4433
0
    packed_weights += cr - cr_block_size;
4434
0
    if XNN_LIKELY(b != NULL) {
4435
0
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
4436
0
        *packed_weights++ = fp16_ieee_from_fp32_value(b[cr_block_start + cr_block_offset]);
4437
0
      }
4438
0
    } else {
4439
0
      size_t n = cr_block_size;
4440
0
      do {
4441
0
        *packed_weights++ = 0;
4442
0
      } while (--n != 0);
4443
0
    }
4444
0
    packed_weights += cr - cr_block_size;
4445
0
  }
4446
0
}
4447
4448
// Copies the c float values of s into packed_weights unchanged (a plain memcpy).
// Both pointers must be non-NULL.
void xnn_pack_f32_prelu_w(
  size_t c,
  const float* s,
  float* packed_weights)
{
  assert(s != NULL);
  assert(packed_weights != NULL);

  const size_t num_bytes = c * sizeof(float);
  memcpy(packed_weights, s, num_bytes);
}
4458
4459
void xnn_pack_f16_prelu_w(
4460
  size_t c,
4461
  const uint16_t* s,
4462
  uint16_t* packed_weights)
4463
0
{
4464
0
  assert(s != NULL);
4465
0
  assert(packed_weights != NULL);
4466
4467
0
  memcpy(packed_weights, s, c * sizeof(uint16