/src/xnnpack/src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x8-minmax-sse41-dup.c
Line | Count | Source |
1 | | // clang-format off |
2 | | // Auto-generated file. Do not edit! |
3 | | // Template: src/f32-gemm/sse-dup.c.in |
4 | | // Generator: tools/xngen |
5 | | // |
6 | | // Copyright 2019 Google LLC |
7 | | // |
8 | | // This source code is licensed under the BSD-style license found in the |
9 | | // LICENSE file in the root directory of this source tree. |
10 | | |
11 | | #include <assert.h> |
12 | | #include <stddef.h> |
13 | | #include <stdint.h> |
14 | | |
15 | | #include <smmintrin.h> |
16 | | |
17 | | #include "src/xnnpack/common.h" |
18 | | #include "src/xnnpack/microparams.h" |
19 | | #include "src/xnnpack/gemm.h" |
20 | | #include "src/xnnpack/unaligned.h" |
21 | | |
22 | | |
23 | | void xnn_f32_qc4w_gemm_minmax_ukernel_4x8__sse41_dup( |
24 | | size_t mr, |
25 | | size_t nc, |
26 | | size_t kc, |
27 | | const float* restrict a, |
28 | | size_t a_stride, |
29 | | const void* restrict w, |
30 | | float* restrict c, |
31 | | size_t cm_stride, |
32 | | size_t cn_stride, |
33 | | const struct xnn_f32_qc4w_minmax_params* restrict params) |
34 | 0 | { |
35 | 0 | assert(mr != 0); |
36 | 0 | assert(mr <= 4); |
37 | 0 | assert(nc != 0); |
38 | 0 | assert(kc != 0); |
39 | 0 | assert(kc % sizeof(float) == 0); |
40 | 0 | assert(a != NULL); |
41 | 0 | assert(w != NULL); |
42 | 0 | assert(c != NULL); |
43 | | |
44 | 0 | const float* a0 = a; |
45 | 0 | float* c0 = c; |
46 | 0 | const float* a1 = (const float*) ((uintptr_t) a0 + a_stride); |
47 | 0 | float* c1 = (float*) ((uintptr_t) c0 + cm_stride); |
48 | 0 | if XNN_UNPREDICTABLE(mr < 2) { |
49 | 0 | a1 = a0; |
50 | 0 | c1 = c0; |
51 | 0 | } |
52 | 0 | const float* a2 = (const float*) ((uintptr_t) a1 + a_stride); |
53 | 0 | float* c2 = (float*) ((uintptr_t) c1 + cm_stride); |
54 | 0 | if XNN_UNPREDICTABLE(mr <= 2) { |
55 | 0 | a2 = a1; |
56 | 0 | c2 = c1; |
57 | 0 | } |
58 | 0 | const float* a3 = (const float*) ((uintptr_t) a2 + a_stride); |
59 | 0 | float* c3 = (float*) ((uintptr_t) c2 + cm_stride); |
60 | 0 | if XNN_UNPREDICTABLE(mr != 4) { |
61 | 0 | a3 = a2; |
62 | 0 | c3 = c2; |
63 | 0 | } |
64 | 0 | const __m128i vmagic_bias_c0 = _mm_set1_epi32(0x4B0000F0); |
65 | 0 | const __m128i vmagic_bias_c1 = _mm_set1_epi32(0x4900000F); |
66 | 0 | const __m128 vmagic_bias_plus_kernel_zero_point_c0 = _mm_set1_ps(0x1.0001E0p+23f + (float) params->scalar.kernel_zero_point); |
67 | 0 | const __m128 vmagic_bias_plus_kernel_zero_point_c1 = _mm_set1_ps(0x1.00001Ep+19f + (float) params->scalar.kernel_zero_point); |
68 | 0 | XNN_FORCE_REALIZATION(vmagic_bias_c0); |
69 | 0 | XNN_FORCE_REALIZATION(vmagic_bias_c1); |
70 | 0 | XNN_FORCE_REALIZATION(vmagic_bias_plus_kernel_zero_point_c0); |
71 | 0 | XNN_FORCE_REALIZATION(vmagic_bias_plus_kernel_zero_point_c1); |
72 | |
73 | 0 | const __m128 vmin = _mm_set1_ps(params->scalar.min); |
74 | 0 | const __m128 vmax = _mm_set1_ps(params->scalar.max); |
75 | 0 | XNN_FORCE_REALIZATION(vmin); |
76 | 0 | XNN_FORCE_REALIZATION(vmax); |
77 | |
78 | 0 | do { |
79 | 0 | __m128 vacc0x0123 = _mm_loadu_ps((const float*) w + 0); |
80 | 0 | __m128 vacc0x4567 = _mm_loadu_ps((const float*) w + 4); |
81 | 0 | __m128 vacc1x0123 = vacc0x0123; |
82 | 0 | __m128 vacc1x4567 = vacc0x4567; |
83 | 0 | __m128 vacc2x0123 = vacc0x0123; |
84 | 0 | __m128 vacc2x4567 = vacc0x4567; |
85 | 0 | __m128 vacc3x0123 = vacc0x0123; |
86 | 0 | __m128 vacc3x4567 = vacc0x4567; |
87 | 0 | w = (const float*) w + 8; |
88 | |
89 | 0 | size_t k = kc; |
90 | 0 | for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) { |
91 | 0 | const __m128 va0 = _mm_loadu_ps(a0); |
92 | 0 | a0 += 4; |
93 | 0 | const __m128 va1 = _mm_loadu_ps(a1); |
94 | 0 | a1 += 4; |
95 | 0 | const __m128 va2 = _mm_loadu_ps(a2); |
96 | 0 | a2 += 4; |
97 | 0 | const __m128 va3 = _mm_loadu_ps(a3); |
98 | 0 | a3 += 4; |
99 | | |
100 | |
101 | 0 | const __m128 va0c0000 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va0), _MM_SHUFFLE(0, 0, 0, 0))); |
102 | 0 | const __m128 va1c0000 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va1), _MM_SHUFFLE(0, 0, 0, 0))); |
103 | 0 | const __m128 va2c0000 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va2), _MM_SHUFFLE(0, 0, 0, 0))); |
104 | 0 | const __m128 va3c0000 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va3), _MM_SHUFFLE(0, 0, 0, 0))); |
105 | |
106 | 0 | const __m128i vbi0123c01 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((uint32_t) unaligned_load_u32((const uint8_t*) w + 0))); |
107 | 0 | const __m128i vbi4567c01 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((uint32_t) unaligned_load_u32((const uint8_t*) w + 4))); |
108 | 0 | const __m128i vbi0123c23 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((uint32_t) unaligned_load_u32((const uint8_t*) w + 8))); |
109 | 0 | const __m128i vbi4567c23 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((uint32_t) unaligned_load_u32((const uint8_t*) w + 12))); |
110 | 0 | const __m128 vbm0123c0 = _mm_castsi128_ps(_mm_or_si128(vbi0123c01, vmagic_bias_c0)); |
111 | 0 | const __m128 vbm0123c1 = _mm_castsi128_ps(_mm_or_si128(vbi0123c01, vmagic_bias_c1)); |
112 | 0 | const __m128 vbm0123c2 = _mm_castsi128_ps(_mm_or_si128(vbi0123c23, vmagic_bias_c0)); |
113 | 0 | const __m128 vbm0123c3 = _mm_castsi128_ps(_mm_or_si128(vbi0123c23, vmagic_bias_c1)); |
114 | 0 | const __m128 vbm4567c0 = _mm_castsi128_ps(_mm_or_si128(vbi4567c01, vmagic_bias_c0)); |
115 | 0 | const __m128 vbm4567c1 = _mm_castsi128_ps(_mm_or_si128(vbi4567c01, vmagic_bias_c1)); |
116 | 0 | const __m128 vbm4567c2 = _mm_castsi128_ps(_mm_or_si128(vbi4567c23, vmagic_bias_c0)); |
117 | 0 | const __m128 vbm4567c3 = _mm_castsi128_ps(_mm_or_si128(vbi4567c23, vmagic_bias_c1)); |
118 | 0 | const __m128 vb0123c0 = _mm_sub_ps(vbm0123c0, vmagic_bias_plus_kernel_zero_point_c0); |
119 | 0 | const __m128 vb0123c1 = _mm_sub_ps(vbm0123c1, vmagic_bias_plus_kernel_zero_point_c1); |
120 | 0 | const __m128 vb0123c2 = _mm_sub_ps(vbm0123c2, vmagic_bias_plus_kernel_zero_point_c0); |
121 | 0 | const __m128 vb0123c3 = _mm_sub_ps(vbm0123c3, vmagic_bias_plus_kernel_zero_point_c1); |
122 | 0 | const __m128 vb4567c0 = _mm_sub_ps(vbm4567c0, vmagic_bias_plus_kernel_zero_point_c0); |
123 | 0 | const __m128 vb4567c1 = _mm_sub_ps(vbm4567c1, vmagic_bias_plus_kernel_zero_point_c1); |
124 | 0 | const __m128 vb4567c2 = _mm_sub_ps(vbm4567c2, vmagic_bias_plus_kernel_zero_point_c0); |
125 | 0 | const __m128 vb4567c3 = _mm_sub_ps(vbm4567c3, vmagic_bias_plus_kernel_zero_point_c1); |
126 | |
127 | 0 | vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0c0000, vb0123c0)); |
128 | 0 | vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1c0000, vb0123c0)); |
129 | 0 | vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c0000, vb0123c0)); |
130 | 0 | vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c0000, vb0123c0)); |
131 | 0 | vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0c0000, vb4567c0)); |
132 | 0 | vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1c0000, vb4567c0)); |
133 | 0 | vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2c0000, vb4567c0)); |
134 | 0 | vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3c0000, vb4567c0)); |
135 | |
136 | 0 | const __m128 va0c1111 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va0), _MM_SHUFFLE(1, 1, 1, 1))); |
137 | 0 | const __m128 va1c1111 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va1), _MM_SHUFFLE(1, 1, 1, 1))); |
138 | 0 | const __m128 va2c1111 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va2), _MM_SHUFFLE(1, 1, 1, 1))); |
139 | 0 | const __m128 va3c1111 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va3), _MM_SHUFFLE(1, 1, 1, 1))); |
140 | | |
141 | |
142 | 0 | vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0c1111, vb0123c1)); |
143 | 0 | vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1c1111, vb0123c1)); |
144 | 0 | vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c1111, vb0123c1)); |
145 | 0 | vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c1111, vb0123c1)); |
146 | 0 | vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0c1111, vb4567c1)); |
147 | 0 | vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1c1111, vb4567c1)); |
148 | 0 | vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2c1111, vb4567c1)); |
149 | 0 | vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3c1111, vb4567c1)); |
150 | |
151 | 0 | const __m128 va0c2222 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va0), _MM_SHUFFLE(2, 2, 2, 2))); |
152 | 0 | const __m128 va1c2222 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va1), _MM_SHUFFLE(2, 2, 2, 2))); |
153 | 0 | const __m128 va2c2222 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va2), _MM_SHUFFLE(2, 2, 2, 2))); |
154 | 0 | const __m128 va3c2222 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va3), _MM_SHUFFLE(2, 2, 2, 2))); |
155 | | |
156 | |
157 | 0 | vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0c2222, vb0123c2)); |
158 | 0 | vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1c2222, vb0123c2)); |
159 | 0 | vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c2222, vb0123c2)); |
160 | 0 | vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c2222, vb0123c2)); |
161 | 0 | vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0c2222, vb4567c2)); |
162 | 0 | vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1c2222, vb4567c2)); |
163 | 0 | vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2c2222, vb4567c2)); |
164 | 0 | vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3c2222, vb4567c2)); |
165 | |
166 | 0 | const __m128 va0c3333 = _mm_shuffle_ps(va0, va0, _MM_SHUFFLE(3, 3, 3, 3)); |
167 | 0 | const __m128 va1c3333 = _mm_shuffle_ps(va1, va1, _MM_SHUFFLE(3, 3, 3, 3)); |
168 | 0 | const __m128 va2c3333 = _mm_shuffle_ps(va2, va2, _MM_SHUFFLE(3, 3, 3, 3)); |
169 | 0 | const __m128 va3c3333 = _mm_shuffle_ps(va3, va3, _MM_SHUFFLE(3, 3, 3, 3)); |
170 | | |
171 | |
172 | 0 | vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0c3333, vb0123c3)); |
173 | 0 | vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1c3333, vb0123c3)); |
174 | 0 | vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c3333, vb0123c3)); |
175 | 0 | vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c3333, vb0123c3)); |
176 | 0 | vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0c3333, vb4567c3)); |
177 | 0 | vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1c3333, vb4567c3)); |
178 | 0 | vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2c3333, vb4567c3)); |
179 | 0 | vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3c3333, vb4567c3)); |
180 | |
181 | 0 | w = (const int8_t*) w + 16; |
182 | 0 | } |
183 | 0 | if XNN_UNLIKELY(k >= 2 * sizeof(float)) { |
184 | 0 | const __m128 va0 = _mm_castsi128_ps(_mm_loadl_epi64((const __m128i *) a0)); |
185 | 0 | a0 += 2; |
186 | 0 | const __m128 va1 = _mm_castsi128_ps(_mm_loadl_epi64((const __m128i *) a1)); |
187 | 0 | a1 += 2; |
188 | 0 | const __m128 va2 = _mm_castsi128_ps(_mm_loadl_epi64((const __m128i *) a2)); |
189 | 0 | a2 += 2; |
190 | 0 | const __m128 va3 = _mm_castsi128_ps(_mm_loadl_epi64((const __m128i *) a3)); |
191 | 0 | a3 += 2; |
192 | |
193 | 0 | const __m128 va0c0000 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va0), _MM_SHUFFLE(0, 0, 0, 0))); |
194 | 0 | const __m128 va1c0000 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va1), _MM_SHUFFLE(0, 0, 0, 0))); |
195 | 0 | const __m128 va2c0000 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va2), _MM_SHUFFLE(0, 0, 0, 0))); |
196 | 0 | const __m128 va3c0000 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va3), _MM_SHUFFLE(0, 0, 0, 0))); |
197 | |
198 | 0 | const __m128i vbi0123c01 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((uint32_t) unaligned_load_u32((const uint8_t*) w + 0))); |
199 | 0 | const __m128i vbi4567c01 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((uint32_t) unaligned_load_u32((const uint8_t*) w + 4))); |
200 | 0 | const __m128 vbm0123c0 = _mm_castsi128_ps(_mm_or_si128(vbi0123c01, vmagic_bias_c0)); |
201 | 0 | const __m128 vbm0123c1 = _mm_castsi128_ps(_mm_or_si128(vbi0123c01, vmagic_bias_c1)); |
202 | 0 | const __m128 vbm4567c0 = _mm_castsi128_ps(_mm_or_si128(vbi4567c01, vmagic_bias_c0)); |
203 | 0 | const __m128 vbm4567c1 = _mm_castsi128_ps(_mm_or_si128(vbi4567c01, vmagic_bias_c1)); |
204 | 0 | const __m128 vb0123c0 = _mm_sub_ps(vbm0123c0, vmagic_bias_plus_kernel_zero_point_c0); |
205 | 0 | const __m128 vb0123c1 = _mm_sub_ps(vbm0123c1, vmagic_bias_plus_kernel_zero_point_c1); |
206 | 0 | const __m128 vb4567c0 = _mm_sub_ps(vbm4567c0, vmagic_bias_plus_kernel_zero_point_c0); |
207 | 0 | const __m128 vb4567c1 = _mm_sub_ps(vbm4567c1, vmagic_bias_plus_kernel_zero_point_c1); |
208 | |
209 | 0 | vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0c0000, vb0123c0)); |
210 | 0 | vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1c0000, vb0123c0)); |
211 | 0 | vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c0000, vb0123c0)); |
212 | 0 | vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c0000, vb0123c0)); |
213 | 0 | vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0c0000, vb4567c0)); |
214 | 0 | vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1c0000, vb4567c0)); |
215 | 0 | vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2c0000, vb4567c0)); |
216 | 0 | vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3c0000, vb4567c0)); |
217 | |
218 | 0 | const __m128 va0c1111 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va0), _MM_SHUFFLE(1, 1, 1, 1))); |
219 | 0 | const __m128 va1c1111 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va1), _MM_SHUFFLE(1, 1, 1, 1))); |
220 | 0 | const __m128 va2c1111 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va2), _MM_SHUFFLE(1, 1, 1, 1))); |
221 | 0 | const __m128 va3c1111 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(va3), _MM_SHUFFLE(1, 1, 1, 1))); |
222 | | |
223 | |
224 | 0 | vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0c1111, vb0123c1)); |
225 | 0 | vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1c1111, vb0123c1)); |
226 | 0 | vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2c1111, vb0123c1)); |
227 | 0 | vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3c1111, vb0123c1)); |
228 | 0 | vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0c1111, vb4567c1)); |
229 | 0 | vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1c1111, vb4567c1)); |
230 | 0 | vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2c1111, vb4567c1)); |
231 | 0 | vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3c1111, vb4567c1)); |
232 | |
233 | 0 | w = (const int8_t*) w + 8; |
234 | 0 | k -= 2 * sizeof(float); |
235 | 0 | } |
236 | 0 | if XNN_UNLIKELY(k != 0) { |
237 | 0 | const __m128 va0 = _mm_load1_ps(a0); |
238 | 0 | a0 += 1; |
239 | 0 | const __m128 va1 = _mm_load1_ps(a1); |
240 | 0 | a1 += 1; |
241 | 0 | const __m128 va2 = _mm_load1_ps(a2); |
242 | 0 | a2 += 1; |
243 | 0 | const __m128 va3 = _mm_load1_ps(a3); |
244 | 0 | a3 += 1; |
245 | |
246 | 0 | const __m128i vbi0123 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((uint32_t) unaligned_load_u32((const uint8_t*) w + 0))); |
247 | 0 | const __m128i vbi4567 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((uint32_t) unaligned_load_u32((const uint8_t*) w + 4))); |
248 | 0 | const __m128 vbm0123 = _mm_castsi128_ps(_mm_or_si128(vbi0123, vmagic_bias_c0)); |
249 | 0 | const __m128 vbm4567 = _mm_castsi128_ps(_mm_or_si128(vbi4567, vmagic_bias_c0)); |
250 | 0 | const __m128 vb0123 = _mm_sub_ps(vbm0123, vmagic_bias_plus_kernel_zero_point_c0); |
251 | 0 | const __m128 vb4567 = _mm_sub_ps(vbm4567, vmagic_bias_plus_kernel_zero_point_c0); |
252 | 0 | w = (const int8_t*) w + 8; |
253 | |
254 | 0 | vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123)); |
255 | 0 | vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123)); |
256 | 0 | vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123)); |
257 | 0 | vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123)); |
258 | 0 | vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567)); |
259 | 0 | vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1, vb4567)); |
260 | 0 | vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567)); |
261 | 0 | vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567)); |
262 | |
263 | 0 | k -= sizeof(float); |
264 | 0 | } |
265 | |
266 | 0 | const __m128 vscale0123 = _mm_loadu_ps((const float*) w + 0); |
267 | 0 | vacc0x0123 = _mm_mul_ps(vacc0x0123, vscale0123); |
268 | 0 | vacc1x0123 = _mm_mul_ps(vacc1x0123, vscale0123); |
269 | 0 | vacc2x0123 = _mm_mul_ps(vacc2x0123, vscale0123); |
270 | 0 | vacc3x0123 = _mm_mul_ps(vacc3x0123, vscale0123); |
271 | 0 | const __m128 vscale4567 = _mm_loadu_ps((const float*) w + 4); |
272 | 0 | vacc0x4567 = _mm_mul_ps(vacc0x4567, vscale4567); |
273 | 0 | vacc1x4567 = _mm_mul_ps(vacc1x4567, vscale4567); |
274 | 0 | vacc2x4567 = _mm_mul_ps(vacc2x4567, vscale4567); |
275 | 0 | vacc3x4567 = _mm_mul_ps(vacc3x4567, vscale4567); |
276 | 0 | w = (const float*) w + 8; |
277 | 0 | vacc0x0123 = _mm_min_ps(vacc0x0123, vmax); |
278 | 0 | vacc1x0123 = _mm_min_ps(vacc1x0123, vmax); |
279 | 0 | vacc2x0123 = _mm_min_ps(vacc2x0123, vmax); |
280 | 0 | vacc3x0123 = _mm_min_ps(vacc3x0123, vmax); |
281 | 0 | vacc0x4567 = _mm_min_ps(vacc0x4567, vmax); |
282 | 0 | vacc1x4567 = _mm_min_ps(vacc1x4567, vmax); |
283 | 0 | vacc2x4567 = _mm_min_ps(vacc2x4567, vmax); |
284 | 0 | vacc3x4567 = _mm_min_ps(vacc3x4567, vmax); |
285 | |
286 | 0 | vacc0x0123 = _mm_max_ps(vacc0x0123, vmin); |
287 | 0 | vacc1x0123 = _mm_max_ps(vacc1x0123, vmin); |
288 | 0 | vacc2x0123 = _mm_max_ps(vacc2x0123, vmin); |
289 | 0 | vacc3x0123 = _mm_max_ps(vacc3x0123, vmin); |
290 | 0 | vacc0x4567 = _mm_max_ps(vacc0x4567, vmin); |
291 | 0 | vacc1x4567 = _mm_max_ps(vacc1x4567, vmin); |
292 | 0 | vacc2x4567 = _mm_max_ps(vacc2x4567, vmin); |
293 | 0 | vacc3x4567 = _mm_max_ps(vacc3x4567, vmin); |
294 | |
295 | 0 | if XNN_LIKELY(nc >= 8) { |
296 | 0 | _mm_storeu_ps(c0, vacc0x0123); |
297 | 0 | _mm_storeu_ps(c0 + 4, vacc0x4567); |
298 | 0 | c0 = (float*) ((uintptr_t) c0 + cn_stride); |
299 | 0 | _mm_storeu_ps(c1, vacc1x0123); |
300 | 0 | _mm_storeu_ps(c1 + 4, vacc1x4567); |
301 | 0 | c1 = (float*) ((uintptr_t) c1 + cn_stride); |
302 | 0 | _mm_storeu_ps(c2, vacc2x0123); |
303 | 0 | _mm_storeu_ps(c2 + 4, vacc2x4567); |
304 | 0 | c2 = (float*) ((uintptr_t) c2 + cn_stride); |
305 | 0 | _mm_storeu_ps(c3, vacc3x0123); |
306 | 0 | _mm_storeu_ps(c3 + 4, vacc3x4567); |
307 | 0 | c3 = (float*) ((uintptr_t) c3 + cn_stride); |
308 | |
309 | 0 | a0 = (const float*) ((uintptr_t) a0 - kc); |
310 | 0 | a1 = (const float*) ((uintptr_t) a1 - kc); |
311 | 0 | a2 = (const float*) ((uintptr_t) a2 - kc); |
312 | 0 | a3 = (const float*) ((uintptr_t) a3 - kc); |
313 | |
314 | 0 | nc -= 8; |
315 | 0 | } else { |
316 | 0 | if (nc & 4) { |
317 | 0 | _mm_storeu_ps(c0, vacc0x0123); |
318 | 0 | _mm_storeu_ps(c1, vacc1x0123); |
319 | 0 | _mm_storeu_ps(c2, vacc2x0123); |
320 | 0 | _mm_storeu_ps(c3, vacc3x0123); |
321 | |
322 | 0 | vacc0x0123 = vacc0x4567; |
323 | 0 | vacc1x0123 = vacc1x4567; |
324 | 0 | vacc2x0123 = vacc2x4567; |
325 | 0 | vacc3x0123 = vacc3x4567; |
326 | |
327 | 0 | c0 += 4; |
328 | 0 | c1 += 4; |
329 | 0 | c2 += 4; |
330 | 0 | c3 += 4; |
331 | 0 | } |
332 | 0 | if (nc & 2) { |
333 | 0 | _mm_storel_pi((__m64*) c0, vacc0x0123); |
334 | 0 | _mm_storel_pi((__m64*) c1, vacc1x0123); |
335 | 0 | _mm_storel_pi((__m64*) c2, vacc2x0123); |
336 | 0 | _mm_storel_pi((__m64*) c3, vacc3x0123); |
337 | |
338 | 0 | vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123); |
339 | 0 | vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123); |
340 | 0 | vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123); |
341 | 0 | vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123); |
342 | |
343 | 0 | c0 += 2; |
344 | 0 | c1 += 2; |
345 | 0 | c2 += 2; |
346 | 0 | c3 += 2; |
347 | 0 | } |
348 | 0 | if (nc & 1) { |
349 | 0 | _mm_store_ss(c0, vacc0x0123); |
350 | 0 | _mm_store_ss(c1, vacc1x0123); |
351 | 0 | _mm_store_ss(c2, vacc2x0123); |
352 | 0 | _mm_store_ss(c3, vacc3x0123); |
353 | 0 | } |
354 | |
355 | 0 | nc = 0; |
356 | 0 | } |
357 | 0 | } while (nc != 0); |
358 | 0 | } |
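
Note on the dequantization constants: the kernel in the listing dequantizes two 4-bit weights per byte without any shift instructions. Each zero-extended weight byte is OR-ed into the mantissa of a float with a fixed exponent (vmagic_bias_c0 for the low nibble, vmagic_bias_c1 for the high nibble), and the magic bias plus the kernel zero point is then subtracted, exactly as the _mm_or_si128/_mm_sub_ps pairs in the inner loop do. The following scalar sketch is for illustration only and uses hypothetical helper names (fp32_from_bits, dequant_c0, dequant_c1); it is not an XNNPACK API.

#include <stdint.h>
#include <string.h>

/* Bit-cast a uint32_t to float, mirroring what _mm_castsi128_ps does per lane. */
static inline float fp32_from_bits(uint32_t bits) {
  float f;
  memcpy(&f, &bits, sizeof f);
  return f;
}

/* Low nibble (even k step): OR the byte into 0x4B0000F0 = 0x1.0001E0p+23.
   Mantissa bits 4-7 are already set, so only the low nibble changes the value,
   giving 0x1.0001E0p+23 + (b & 0x0F). */
static inline float dequant_c0(uint8_t b, float kernel_zero_point) {
  const float magic = fp32_from_bits(0x4B0000F0u | (uint32_t) b);
  return magic - (0x1.0001E0p+23f + kernel_zero_point);  /* == (b & 0x0F) - zp */
}

/* High nibble (odd k step): OR the byte into 0x4900000F = 0x1.00001Ep+19.
   With exponent 2^19, mantissa bit 4 has weight 1.0, so the high nibble
   contributes its integer value directly; bits 0-3 are already set. */
static inline float dequant_c1(uint8_t b, float kernel_zero_point) {
  const float magic = fp32_from_bits(0x4900000Fu | (uint32_t) b);
  return magic - (0x1.00001Ep+19f + kernel_zero_point);  /* == (b >> 4) - zp */
}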
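
For reference, the arithmetic the whole microkernel performs can be written as a plain scalar loop. As the pointer arithmetic on w in the listing shows, the packed stream interleaves, per tile of 8 output channels, 8 float biases, then the nibble-packed weights, then 8 float scales; the sketch below instead takes already-unpacked weight values (0..15) and separate bias/scale arrays, so it is an assumed reference for the math only, not a drop-in replacement, and f32_qc4w_gemm_reference is a hypothetical name.

#include <stddef.h>
#include <stdint.h>

/* C[m][n] = clamp(scale[n] * (bias[n] + sum_k A[m][k] * (W[k][n] - zero_point)), min, max),
   matching the kernel's order of operations: the bias loaded from w seeds the
   accumulator, and the per-channel scale is applied after the k loop, before clamping. */
static void f32_qc4w_gemm_reference(
    size_t M, size_t N, size_t K,
    const float* a,            /* M x K activations, row-major */
    const uint8_t* w4,         /* K x N unpacked 4-bit weight values in [0, 15] */
    const float* bias,         /* N per-channel biases */
    const float* scale,        /* N per-channel scales */
    float kernel_zero_point,
    float min, float max,
    float* c)                  /* M x N outputs, row-major */
{
  for (size_t m = 0; m < M; m++) {
    for (size_t n = 0; n < N; n++) {
      float acc = bias[n];
      for (size_t k = 0; k < K; k++) {
        acc += a[m * K + k] * ((float) w4[k * N + n] - kernel_zero_point);
      }
      acc *= scale[n];
      acc = acc > max ? max : acc;   /* corresponds to _mm_min_ps(acc, vmax) */
      acc = acc < min ? min : acc;   /* corresponds to _mm_max_ps(acc, vmin) */
      c[m * N + n] = acc;
    }
  }
}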