Coverage Report

Created: 2025-09-27 07:04

File: /src/xnnpack/src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x8-minmax-sse41-dup.c
Every instrumented line below reports an execution count of 0: this kernel was never executed during the run. The listing collapses the report's Line and Count columns into plain source:

// clang-format off
// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/sse-dup.c.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <smmintrin.h>

#include "src/xnnpack/common.h"
#include "src/xnnpack/microparams.h"
#include "src/xnnpack/gemm.h"
#include "src/xnnpack/unaligned.h"


void xnn_f32_qc4w_gemm_minmax_ukernel_4x8__sse41_dup(
    size_t mr,
    size_t nc,
    size_t kc,
    const float* restrict a,
    size_t a_stride,
    const void* restrict w,
    float* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    const struct xnn_f32_qc4w_minmax_params* restrict params)
{
  assert(mr != 0);
  assert(mr <= 4);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(float) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  const float* a0 = a;
  float* c0 = c;
  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 2) {
    a1 = a0;
    c1 = c0;
  }
  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 2) {
    a2 = a1;
    c2 = c1;
  }
  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
  if XNN_UNPREDICTABLE(mr != 4) {
    a3 = a2;
    c3 = c2;
  }
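  /* Annotation (coverage-report note, not part of the generated file): when
     mr < 4, the unused row pointers are redirected to the previous row, so
     rows 1..3 recompute a duplicate of an earlier row and all loads and
     stores stay in bounds; the redundant stores write identical values.
     XNN_UNPREDICTABLE marks these branches as data-dependent for the
     compiler. */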
  const __m128i vmagic_bias_c0 = _mm_set1_epi32(0x4B0000F0);
  const __m128i vmagic_bias_c1 = _mm_set1_epi32(0x4900000F);
  const __m128 vmagic_bias_plus_kernel_zero_point_c0 = _mm_set1_ps(0x1.0001E0p+23f + (float) params->scalar.kernel_zero_point);
  const __m128 vmagic_bias_plus_kernel_zero_point_c1 = _mm_set1_ps(0x1.00001Ep+19f + (float) params->scalar.kernel_zero_point);
  XNN_FORCE_REALIZATION(vmagic_bias_c0);
  XNN_FORCE_REALIZATION(vmagic_bias_c1);
  XNN_FORCE_REALIZATION(vmagic_bias_plus_kernel_zero_point_c0);
  XNN_FORCE_REALIZATION(vmagic_bias_plus_kernel_zero_point_c1);

  const __m128 vmin = _mm_set1_ps(params->scalar.min);
  const __m128 vmax = _mm_set1_ps(params->scalar.max);
  XNN_FORCE_REALIZATION(vmin);
  XNN_FORCE_REALIZATION(vmax);
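  /* Annotation: each packed weight byte carries two 4-bit values. ORing the
     zero-extended byte into the mantissa of a float "magic" constant and
     subtracting (magic + kernel_zero_point) yields (nibble - zero_point) as
     a float, with no shifts or int-to-float conversions. The c0 constants
     (exponent 2^23, where the mantissa LSB weighs 1.0) recover the low
     nibble; the c1 constants (exponent 2^19) recover the high nibble, whose
     16x positional weight the smaller exponent cancels. The unwanted nibble
     is absorbed by the pre-set mantissa bits (0xF0 / 0x0F) and folded into
     the subtracted constant. XNN_FORCE_REALIZATION forces each constant to
     be materialized once, ahead of the loop. */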

  do {
    __m128 vacc0x0123 = _mm_loadu_ps((const float*) w + 0);
    __m128 vacc0x4567 = _mm_loadu_ps((const float*) w + 4);
    __m128 vacc1x0123 = vacc0x0123;
    __m128 vacc1x4567 = vacc0x4567;
    __m128 vacc2x0123 = vacc0x0123;
    __m128 vacc2x4567 = vacc0x4567;
    __m128 vacc3x0123 = vacc0x0123;
    __m128 vacc3x4567 = vacc0x4567;
    w = (const float*) w + 8;

    size_t k = kc;
    for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
      const __m128 va0 = _mm_loadu_ps(a0);
      a0 += 4;
      const __m128 va1 = _mm_loadu_ps(a1);
      a1 += 4;
      const __m128 va2 = _mm_loadu_ps(a2);
      a2 += 4;
      const __m128 va3 = _mm_loadu_ps(a3);
      a3 += 4;


      const __m128 va0c0000 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va0), _MM_SHUFFLE(0, 0, 0, 0)));
      const __m128 va1c0000 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va1), _MM_SHUFFLE(0, 0, 0, 0)));
      const __m128 va2c0000 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va2), _MM_SHUFFLE(0, 0, 0, 0)));
      const __m128 va3c0000 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va3), _MM_SHUFFLE(0, 0, 0, 0)));

      const __m128i vbi0123c01 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((uint32_t) unaligned_load_u32((const uint8_t*) w + 0)));
      const __m128i vbi4567c01 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((uint32_t) unaligned_load_u32((const uint8_t*) w + 4)));
      const __m128i vbi0123c23 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((uint32_t) unaligned_load_u32((const uint8_t*) w + 8)));
      const __m128i vbi4567c23 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((uint32_t) unaligned_load_u32((const uint8_t*) w + 12)));
      const __m128 vbm0123c0 = _mm_castsi128_ps(_mm_or_si128(vbi0123c01, vmagic_bias_c0));
      const __m128 vbm0123c1 = _mm_castsi128_ps(_mm_or_si128(vbi0123c01, vmagic_bias_c1));
      const __m128 vbm0123c2 = _mm_castsi128_ps(_mm_or_si128(vbi0123c23, vmagic_bias_c0));
      const __m128 vbm0123c3 = _mm_castsi128_ps(_mm_or_si128(vbi0123c23, vmagic_bias_c1));
      const __m128 vbm4567c0 = _mm_castsi128_ps(_mm_or_si128(vbi4567c01, vmagic_bias_c0));
      const __m128 vbm4567c1 = _mm_castsi128_ps(_mm_or_si128(vbi4567c01, vmagic_bias_c1));
      const __m128 vbm4567c2 = _mm_castsi128_ps(_mm_or_si128(vbi4567c23, vmagic_bias_c0));
      const __m128 vbm4567c3 = _mm_castsi128_ps(_mm_or_si128(vbi4567c23, vmagic_bias_c1));
      const __m128 vb0123c0 = _mm_sub_ps(vbm0123c0, vmagic_bias_plus_kernel_zero_point_c0);
      const __m128 vb0123c1 = _mm_sub_ps(vbm0123c1, vmagic_bias_plus_kernel_zero_point_c1);
      const __m128 vb0123c2 = _mm_sub_ps(vbm0123c2, vmagic_bias_plus_kernel_zero_point_c0);
      const __m128 vb0123c3 = _mm_sub_ps(vbm0123c3, vmagic_bias_plus_kernel_zero_point_c1);
      const __m128 vb4567c0 = _mm_sub_ps(vbm4567c0, vmagic_bias_plus_kernel_zero_point_c0);
      const __m128 vb4567c1 = _mm_sub_ps(vbm4567c1, vmagic_bias_plus_kernel_zero_point_c1);
      const __m128 vb4567c2 = _mm_sub_ps(vbm4567c2, vmagic_bias_plus_kernel_zero_point_c0);
      const __m128 vb4567c3 = _mm_sub_ps(vbm4567c3, vmagic_bias_plus_kernel_zero_point_c1);

      vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0c0000, vb0123c0));
      vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1c0000, vb0123c0));
      vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c0000, vb0123c0));
      vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c0000, vb0123c0));
      vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0c0000, vb4567c0));
      vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1c0000, vb4567c0));
      vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2c0000, vb4567c0));
      vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3c0000, vb4567c0));

      const __m128 va0c1111 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va0), _MM_SHUFFLE(1, 1, 1, 1)));
      const __m128 va1c1111 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va1), _MM_SHUFFLE(1, 1, 1, 1)));
      const __m128 va2c1111 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va2), _MM_SHUFFLE(1, 1, 1, 1)));
      const __m128 va3c1111 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va3), _MM_SHUFFLE(1, 1, 1, 1)));


      vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0c1111, vb0123c1));
      vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1c1111, vb0123c1));
      vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c1111, vb0123c1));
      vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c1111, vb0123c1));
      vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0c1111, vb4567c1));
      vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1c1111, vb4567c1));
      vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2c1111, vb4567c1));
      vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3c1111, vb4567c1));

      const __m128 va0c2222 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va0), _MM_SHUFFLE(2, 2, 2, 2)));
      const __m128 va1c2222 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va1), _MM_SHUFFLE(2, 2, 2, 2)));
      const __m128 va2c2222 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va2), _MM_SHUFFLE(2, 2, 2, 2)));
      const __m128 va3c2222 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va3), _MM_SHUFFLE(2, 2, 2, 2)));


      vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0c2222, vb0123c2));
      vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1c2222, vb0123c2));
      vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c2222, vb0123c2));
      vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c2222, vb0123c2));
      vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0c2222, vb4567c2));
      vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1c2222, vb4567c2));
      vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2c2222, vb4567c2));
      vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3c2222, vb4567c2));

      const __m128 va0c3333 = _mm_shuffle_ps(va0, va0, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128 va1c3333 = _mm_shuffle_ps(va1, va1, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128 va2c3333 = _mm_shuffle_ps(va2, va2, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128 va3c3333 = _mm_shuffle_ps(va3, va3, _MM_SHUFFLE(3, 3, 3, 3));


      vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0c3333, vb0123c3));
      vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1c3333, vb0123c3));
      vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c3333, vb0123c3));
      vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c3333, vb0123c3));
      vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0c3333, vb4567c3));
      vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1c3333, vb4567c3));
      vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2c3333, vb4567c3));
      vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3c3333, vb4567c3));

      w = (const int8_t*) w + 16;
    }
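    /* Annotation: one main-loop iteration consumes 4 floats from each A row
       and 16 bytes of packed weights (8 columns x 4 k-steps x 4 bits). The
       "dup" variant broadcasts one A element per k-step via a shuffle, and
       each broadcast feeds two 4-wide multiply-adds (columns 0-3 and 4-7)
       for all four rows. */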
    if XNN_UNLIKELY(k >= 2 * sizeof(float)) {
      const __m128 va0 = _mm_castsi128_ps(_mm_loadl_epi64((const __m128i *) a0));
      a0 += 2;
      const __m128 va1 = _mm_castsi128_ps(_mm_loadl_epi64((const __m128i *) a1));
      a1 += 2;
      const __m128 va2 = _mm_castsi128_ps(_mm_loadl_epi64((const __m128i *) a2));
      a2 += 2;
      const __m128 va3 = _mm_castsi128_ps(_mm_loadl_epi64((const __m128i *) a3));
      a3 += 2;

      const __m128 va0c0000 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va0), _MM_SHUFFLE(0, 0, 0, 0)));
      const __m128 va1c0000 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va1), _MM_SHUFFLE(0, 0, 0, 0)));
      const __m128 va2c0000 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va2), _MM_SHUFFLE(0, 0, 0, 0)));
      const __m128 va3c0000 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va3), _MM_SHUFFLE(0, 0, 0, 0)));

      const __m128i vbi0123c01 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((uint32_t) unaligned_load_u32((const uint8_t*) w + 0)));
      const __m128i vbi4567c01 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((uint32_t) unaligned_load_u32((const uint8_t*) w + 4)));
      const __m128 vbm0123c0 = _mm_castsi128_ps(_mm_or_si128(vbi0123c01, vmagic_bias_c0));
      const __m128 vbm0123c1 = _mm_castsi128_ps(_mm_or_si128(vbi0123c01, vmagic_bias_c1));
      const __m128 vbm4567c0 = _mm_castsi128_ps(_mm_or_si128(vbi4567c01, vmagic_bias_c0));
      const __m128 vbm4567c1 = _mm_castsi128_ps(_mm_or_si128(vbi4567c01, vmagic_bias_c1));
      const __m128 vb0123c0 = _mm_sub_ps(vbm0123c0, vmagic_bias_plus_kernel_zero_point_c0);
      const __m128 vb0123c1 = _mm_sub_ps(vbm0123c1, vmagic_bias_plus_kernel_zero_point_c1);
      const __m128 vb4567c0 = _mm_sub_ps(vbm4567c0, vmagic_bias_plus_kernel_zero_point_c0);
      const __m128 vb4567c1 = _mm_sub_ps(vbm4567c1, vmagic_bias_plus_kernel_zero_point_c1);

      vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0c0000, vb0123c0));
      vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1c0000, vb0123c0));
      vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c0000, vb0123c0));
      vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c0000, vb0123c0));
      vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0c0000, vb4567c0));
      vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1c0000, vb4567c0));
      vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2c0000, vb4567c0));
      vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3c0000, vb4567c0));

      const __m128 va0c1111 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va0), _MM_SHUFFLE(1, 1, 1, 1)));
      const __m128 va1c1111 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va1), _MM_SHUFFLE(1, 1, 1, 1)));
      const __m128 va2c1111 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va2), _MM_SHUFFLE(1, 1, 1, 1)));
      const __m128 va3c1111 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va3), _MM_SHUFFLE(1, 1, 1, 1)));


      vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0c1111, vb0123c1));
      vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1c1111, vb0123c1));
      vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c1111, vb0123c1));
      vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c1111, vb0123c1));
      vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0c1111, vb4567c1));
      vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1c1111, vb4567c1));
      vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2c1111, vb4567c1));
      vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3c1111, vb4567c1));

      w = (const int8_t*) w + 8;
      k -= 2 * sizeof(float);
    }
    if XNN_UNLIKELY(k != 0) {
      const __m128 va0 = _mm_load1_ps(a0);
      a0 += 1;
      const __m128 va1 = _mm_load1_ps(a1);
      a1 += 1;
      const __m128 va2 = _mm_load1_ps(a2);
      a2 += 1;
      const __m128 va3 = _mm_load1_ps(a3);
      a3 += 1;

      const __m128i vbi0123 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((uint32_t) unaligned_load_u32((const uint8_t*) w + 0)));
      const __m128i vbi4567 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((uint32_t) unaligned_load_u32((const uint8_t*) w + 4)));
      const __m128 vbm0123 = _mm_castsi128_ps(_mm_or_si128(vbi0123, vmagic_bias_c0));
      const __m128 vbm4567 = _mm_castsi128_ps(_mm_or_si128(vbi4567, vmagic_bias_c0));
      const __m128 vb0123 = _mm_sub_ps(vbm0123, vmagic_bias_plus_kernel_zero_point_c0);
      const __m128 vb4567 = _mm_sub_ps(vbm4567, vmagic_bias_plus_kernel_zero_point_c0);
      w = (const int8_t*) w + 8;

      vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123));
      vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123));
      vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123));
      vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123));
      vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
      vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1, vb4567));
      vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567));
      vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567));

      k -= sizeof(float);
    }
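    /* Annotation: the two tails above handle kc values that are not
       multiples of 4 floats: a 2-float step consuming 8 packed bytes (low
       nibbles for the first k-step, high nibbles for the second), then a
       1-float step that reads a fresh 8-byte group and uses only its low
       nibbles, which suggests the packer places a lone final k-step in the
       low nibbles of its own group. */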

    const __m128 vscale0123 = _mm_loadu_ps((const float*) w + 0);
    vacc0x0123 = _mm_mul_ps(vacc0x0123, vscale0123);
    vacc1x0123 = _mm_mul_ps(vacc1x0123, vscale0123);
    vacc2x0123 = _mm_mul_ps(vacc2x0123, vscale0123);
    vacc3x0123 = _mm_mul_ps(vacc3x0123, vscale0123);
    const __m128 vscale4567 = _mm_loadu_ps((const float*) w + 4);
    vacc0x4567 = _mm_mul_ps(vacc0x4567, vscale4567);
    vacc1x4567 = _mm_mul_ps(vacc1x4567, vscale4567);
    vacc2x4567 = _mm_mul_ps(vacc2x4567, vscale4567);
    vacc3x4567 = _mm_mul_ps(vacc3x4567, vscale4567);
    w = (const float*) w + 8;
    vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
    vacc1x0123 = _mm_min_ps(vacc1x0123, vmax);
    vacc2x0123 = _mm_min_ps(vacc2x0123, vmax);
    vacc3x0123 = _mm_min_ps(vacc3x0123, vmax);
    vacc0x4567 = _mm_min_ps(vacc0x4567, vmax);
    vacc1x4567 = _mm_min_ps(vacc1x4567, vmax);
    vacc2x4567 = _mm_min_ps(vacc2x4567, vmax);
    vacc3x4567 = _mm_min_ps(vacc3x4567, vmax);

    vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
    vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
    vacc2x0123 = _mm_max_ps(vacc2x0123, vmin);
    vacc3x0123 = _mm_max_ps(vacc3x0123, vmin);
    vacc0x4567 = _mm_max_ps(vacc0x4567, vmin);
    vacc1x4567 = _mm_max_ps(vacc1x4567, vmin);
    vacc2x4567 = _mm_max_ps(vacc2x4567, vmin);
    vacc3x4567 = _mm_max_ps(vacc3x4567, vmin);
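    /* Annotation: per-column scales are stored after the packed weights for
       this column block; the scaled accumulators are then clamped to
       [params->scalar.min, params->scalar.max]. */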

    if XNN_LIKELY(nc >= 8) {
      _mm_storeu_ps(c0, vacc0x0123);
      _mm_storeu_ps(c0 + 4, vacc0x4567);
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
      _mm_storeu_ps(c1, vacc1x0123);
      _mm_storeu_ps(c1 + 4, vacc1x4567);
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
      _mm_storeu_ps(c2, vacc2x0123);
      _mm_storeu_ps(c2 + 4, vacc2x4567);
      c2 = (float*) ((uintptr_t) c2 + cn_stride);
      _mm_storeu_ps(c3, vacc3x0123);
      _mm_storeu_ps(c3 + 4, vacc3x4567);
      c3 = (float*) ((uintptr_t) c3 + cn_stride);

      a0 = (const float*) ((uintptr_t) a0 - kc);
      a1 = (const float*) ((uintptr_t) a1 - kc);
      a2 = (const float*) ((uintptr_t) a2 - kc);
      a3 = (const float*) ((uintptr_t) a3 - kc);

      nc -= 8;
    } else {
      if (nc & 4) {
        _mm_storeu_ps(c0, vacc0x0123);
        _mm_storeu_ps(c1, vacc1x0123);
        _mm_storeu_ps(c2, vacc2x0123);
        _mm_storeu_ps(c3, vacc3x0123);

        vacc0x0123 = vacc0x4567;
        vacc1x0123 = vacc1x4567;
        vacc2x0123 = vacc2x4567;
        vacc3x0123 = vacc3x4567;

        c0 += 4;
        c1 += 4;
        c2 += 4;
        c3 += 4;
      }
      if (nc & 2) {
        _mm_storel_pi((__m64*) c0, vacc0x0123);
        _mm_storel_pi((__m64*) c1, vacc1x0123);
        _mm_storel_pi((__m64*) c2, vacc2x0123);
        _mm_storel_pi((__m64*) c3, vacc3x0123);

        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
        vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);

        c0 += 2;
        c1 += 2;
        c2 += 2;
        c3 += 2;
      }
      if (nc & 1) {
        _mm_store_ss(c0, vacc0x0123);
        _mm_store_ss(c1, vacc1x0123);
        _mm_store_ss(c2, vacc2x0123);
        _mm_store_ss(c3, vacc3x0123);
      }

      nc = 0;
    }
  } while (nc != 0);
}
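
For orientation, the sketch below shows one way this uncovered kernel could be exercised. It is a hypothetical smoke test, not part of XNNPACK: the include paths follow the kernel's own #include lines, the packed-weight layout (8 bias floats, then 16 bytes of packed nibbles per group of 4 k-steps, then 8 scale floats) is inferred from the loads above, and all test values are made up. With all-zero nibbles and kernel_zero_point == 0, every dequantized weight is 0, so each output element should equal clamp(bias[n] * scale[n], min, max).

/* Hypothetical smoke test for the kernel above; assumes it is compiled from
 * the XNNPACK source root with SSE4.1 enabled (e.g. -msse4.1) and that the
 * kernel's prototype is visible via src/xnnpack/gemm.h. */
#include <stdio.h>
#include <string.h>

#include "src/xnnpack/gemm.h"
#include "src/xnnpack/microparams.h"

int main(void) {
  enum { MR = 4, NR = 8, KC = 4 };  /* one full 4-float k-block */

  float a[MR * KC];
  for (int i = 0; i < MR * KC; i++) a[i] = 1.0f;  /* arbitrary inputs */

  /* Packed weights, per the loads in the kernel: NR bias floats, then
   * 16 bytes of 4-bit weights (left zero here, so each dequantizes to
   * 0 - kernel_zero_point), then NR scale floats. 16 bytes occupy the
   * space of 4 floats. */
  float w[NR + 4 + NR];
  memset(w, 0, sizeof(w));
  for (int n = 0; n < NR; n++) w[n] = (float) n;       /* bias  */
  for (int n = 0; n < NR; n++) w[NR + 4 + n] = 0.5f;   /* scale */

  float c[MR * NR];
  struct xnn_f32_qc4w_minmax_params params;
  params.scalar.min = -100.0f;
  params.scalar.max = 100.0f;
  params.scalar.kernel_zero_point = 0;

  xnn_f32_qc4w_gemm_minmax_ukernel_4x8__sse41_dup(
      MR, NR, KC * sizeof(float),   /* mr, nc, kc (kc in bytes)  */
      a, KC * sizeof(float),        /* a, a_stride (bytes)       */
      w,
      c, NR * sizeof(float),        /* c, cm_stride (bytes)      */
      NR * sizeof(float),           /* cn_stride (bytes)         */
      &params);

  /* Expect every row to read 0.00 0.50 1.00 ... 3.50 (bias[n] * scale[n]). */
  for (int m = 0; m < MR; m++) {
    for (int n = 0; n < NR; n++) printf("%5.2f ", c[m * NR + n]);
    printf("\n");
  }
  return 0;
}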