/src/xnnpack/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p32c-minmax-fp32-avx512skx-mul32.c
Line | Count | Source |
1 | | // clang-format off |
2 | | // Auto-generated file. Do not edit! |
3 | | // Template: src/qs8-dwconv/unipass-avx512skx-mul32.c.in |
4 | | // Generator: tools/xngen |
5 | | // |
6 | | // Copyright 2020 Google LLC |
7 | | // |
8 | | // This source code is licensed under the BSD-style license found in the |
9 | | // LICENSE file in the root directory of this source tree. |
10 | | |
11 | | #include <assert.h> |
12 | | #include <stddef.h> |
13 | | #include <stdint.h> |
14 | | |
15 | | #include <immintrin.h> |
16 | | |
17 | | #include "src/xnnpack/common.h" |
18 | | #include "src/xnnpack/microparams.h" |
19 | | #include "src/xnnpack/dwconv.h" |
20 | | #include "src/xnnpack/intrinsics-polyfill.h" |
21 | | |
22 | | |
23 | | void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32( |
24 | | size_t channels, |
25 | | size_t output_width, |
26 | | const int8_t** input, |
27 | | const void* weights, |
28 | | int8_t* output, |
29 | | intptr_t input_stride, |
30 | | size_t output_increment, |
31 | | size_t input_offset, |
32 | | size_t input_pixel_stride, |
33 | | const int8_t* zero, |
34 | | const union xnn_qs8_qc8w_conv_minmax_params* restrict params) XNN_OOB_READS |
35 | 0 | { |
36 | 0 | assert(channels != 0); |
37 | 0 | assert(output_width != 0); |
38 | | |
39 | 0 | const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); |
40 | | // XNN_FORCE_REALIZATION(voutput_max_less_zero_point); |
41 | 0 | const __m512i voutput_zero_point = _mm512_set1_epi16((int16_t) params->fp32_scalar.output_zero_point); |
42 | 0 | const __m256i voutput_min = _mm256_set1_epi8(params->fp32_scalar.output_min); |
43 | 0 | const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 5, 1, 6, 2, 4, 0); |
44 | | // XNN_FORCE_REALIZATION(voutput_zero_point); |
45 | | // XNN_FORCE_REALIZATION(voutput_min); |
46 | |
|
47 | 0 | do { |
48 | 0 | const int8_t* i0 = input[0]; |
49 | 0 | assert(i0 != NULL); |
50 | 0 | if XNN_UNPREDICTABLE(i0 != zero) { |
51 | 0 | i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); |
52 | 0 | } |
53 | 0 | const int8_t* i1 = input[1]; |
54 | 0 | assert(i1 != NULL); |
55 | 0 | if XNN_UNPREDICTABLE(i1 != zero) { |
56 | 0 | i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); |
57 | 0 | } |
58 | 0 | const int8_t* i2 = input[2]; |
59 | 0 | assert(i2 != NULL); |
60 | 0 | if XNN_UNPREDICTABLE(i2 != zero) { |
61 | 0 | i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); |
62 | 0 | } |
63 | 0 | const int8_t* i3 = input[3]; |
64 | 0 | assert(i3 != NULL); |
65 | 0 | if XNN_UNPREDICTABLE(i3 != zero) { |
66 | 0 | i3 = (const int8_t*) ((uintptr_t) i3 + input_offset); |
67 | 0 | } |
68 | 0 | const int8_t* i4 = input[4]; |
69 | 0 | assert(i4 != NULL); |
70 | 0 | if XNN_UNPREDICTABLE(i4 != zero) { |
71 | 0 | i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); |
72 | 0 | } |
73 | 0 | const int8_t* i5 = input[5]; |
74 | 0 | assert(i5 != NULL); |
75 | 0 | if XNN_UNPREDICTABLE(i5 != zero) { |
76 | 0 | i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); |
77 | 0 | } |
78 | 0 | const int8_t* i6 = input[6]; |
79 | 0 | assert(i6 != NULL); |
80 | 0 | if XNN_UNPREDICTABLE(i6 != zero) { |
81 | 0 | i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); |
82 | 0 | } |
83 | 0 | const int8_t* i7 = input[7]; |
84 | 0 | assert(i7 != NULL); |
85 | 0 | if XNN_UNPREDICTABLE(i7 != zero) { |
86 | 0 | i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); |
87 | 0 | } |
88 | 0 | const int8_t* i8 = input[8]; |
89 | 0 | assert(i8 != NULL); |
90 | 0 | if XNN_UNPREDICTABLE(i8 != zero) { |
91 | 0 | i8 = (const int8_t*) ((uintptr_t) i8 + input_offset); |
92 | 0 | } |
93 | 0 | const int8_t* i9 = input[9]; |
94 | 0 | assert(i9 != NULL); |
95 | 0 | if XNN_UNPREDICTABLE(i9 != zero) { |
96 | 0 | i9 = (const int8_t*) ((uintptr_t) i9 + input_offset); |
97 | 0 | } |
98 | 0 | const int8_t* i10 = input[10]; |
99 | 0 | assert(i10 != NULL); |
100 | 0 | if XNN_UNPREDICTABLE(i10 != zero) { |
101 | 0 | i10 = (const int8_t*) ((uintptr_t) i10 + input_offset); |
102 | 0 | } |
103 | 0 | const int8_t* i11 = input[11]; |
104 | 0 | assert(i11 != NULL); |
105 | 0 | if XNN_UNPREDICTABLE(i11 != zero) { |
106 | 0 | i11 = (const int8_t*) ((uintptr_t) i11 + input_offset); |
107 | 0 | } |
108 | 0 | const int8_t* i12 = input[12]; |
109 | 0 | assert(i12 != NULL); |
110 | 0 | if XNN_UNPREDICTABLE(i12 != zero) { |
111 | 0 | i12 = (const int8_t*) ((uintptr_t) i12 + input_offset); |
112 | 0 | } |
113 | 0 | const int8_t* i13 = input[13]; |
114 | 0 | assert(i13 != NULL); |
115 | 0 | if XNN_UNPREDICTABLE(i13 != zero) { |
116 | 0 | i13 = (const int8_t*) ((uintptr_t) i13 + input_offset); |
117 | 0 | } |
118 | 0 | const int8_t* i14 = input[14]; |
119 | 0 | assert(i14 != NULL); |
120 | 0 | if XNN_UNPREDICTABLE(i14 != zero) { |
121 | 0 | i14 = (const int8_t*) ((uintptr_t) i14 + input_offset); |
122 | 0 | } |
123 | 0 | const int8_t* i15 = input[15]; |
124 | 0 | assert(i15 != NULL); |
125 | 0 | if XNN_UNPREDICTABLE(i15 != zero) { |
126 | 0 | i15 = (const int8_t*) ((uintptr_t) i15 + input_offset); |
127 | 0 | } |
128 | 0 | const int8_t* i16 = input[16]; |
129 | 0 | assert(i16 != NULL); |
130 | 0 | if XNN_UNPREDICTABLE(i16 != zero) { |
131 | 0 | i16 = (const int8_t*) ((uintptr_t) i16 + input_offset); |
132 | 0 | } |
133 | 0 | const int8_t* i17 = input[17]; |
134 | 0 | assert(i17 != NULL); |
135 | 0 | if XNN_UNPREDICTABLE(i17 != zero) { |
136 | 0 | i17 = (const int8_t*) ((uintptr_t) i17 + input_offset); |
137 | 0 | } |
138 | 0 | const int8_t* i18 = input[18]; |
139 | 0 | assert(i18 != NULL); |
140 | 0 | if XNN_UNPREDICTABLE(i18 != zero) { |
141 | 0 | i18 = (const int8_t*) ((uintptr_t) i18 + input_offset); |
142 | 0 | } |
143 | 0 | const int8_t* i19 = input[19]; |
144 | 0 | assert(i19 != NULL); |
145 | 0 | if XNN_UNPREDICTABLE(i19 != zero) { |
146 | 0 | i19 = (const int8_t*) ((uintptr_t) i19 + input_offset); |
147 | 0 | } |
148 | 0 | const int8_t* i20 = input[20]; |
149 | 0 | assert(i20 != NULL); |
150 | 0 | if XNN_UNPREDICTABLE(i20 != zero) { |
151 | 0 | i20 = (const int8_t*) ((uintptr_t) i20 + input_offset); |
152 | 0 | } |
153 | 0 | const int8_t* i21 = input[21]; |
154 | 0 | assert(i21 != NULL); |
155 | 0 | if XNN_UNPREDICTABLE(i21 != zero) { |
156 | 0 | i21 = (const int8_t*) ((uintptr_t) i21 + input_offset); |
157 | 0 | } |
158 | 0 | const int8_t* i22 = input[22]; |
159 | 0 | assert(i22 != NULL); |
160 | 0 | if XNN_UNPREDICTABLE(i22 != zero) { |
161 | 0 | i22 = (const int8_t*) ((uintptr_t) i22 + input_offset); |
162 | 0 | } |
163 | 0 | const int8_t* i23 = input[23]; |
164 | 0 | assert(i23 != NULL); |
165 | 0 | if XNN_UNPREDICTABLE(i23 != zero) { |
166 | 0 | i23 = (const int8_t*) ((uintptr_t) i23 + input_offset); |
167 | 0 | } |
168 | 0 | const int8_t* i24 = input[24]; |
169 | 0 | assert(i24 != NULL); |
170 | 0 | if XNN_UNPREDICTABLE(i24 != zero) { |
171 | 0 | i24 = (const int8_t*) ((uintptr_t) i24 + input_offset); |
172 | 0 | } |
173 | 0 | input = (const int8_t**) ((uintptr_t) input + input_stride); |
174 | |
|
175 | 0 | size_t c = channels; |
176 | 0 | const void* w = weights; |
177 | 0 | for (; c >= 32; c -= 32) { |
178 | 0 | __m512i vacc0123456789ABCDEF = _mm512_loadu_si512(w); |
179 | 0 | __m512i vaccGHIJKLMNOPQRSTUV = _mm512_loadu_si512((const void*) ((uintptr_t) w + 16 * sizeof(int32_t))); |
180 | | |
181 | |
|
182 | 0 | const __m512i vi0x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i0)); |
183 | 0 | const __m512i vk0x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 0 * sizeof(int8_t)))); |
184 | 0 | const __m512i vi0xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i0 + 16))); |
185 | 0 | const __m512i vk0xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 16 * sizeof(int8_t)))); |
186 | 0 | i0 += 32; |
187 | |
|
188 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi0x0123456789ABCDEF, vk0x0123456789ABCDEF)); |
189 | 0 | vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi0xGHIJKLMNOPQRSTUV, vk0xGHIJKLMNOPQRSTUV)); |
190 | |
|
191 | 0 | const __m512i vi1x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i1)); |
192 | 0 | const __m512i vk1x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 32 * sizeof(int8_t)))); |
193 | 0 | const __m512i vi1xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i1 + 16))); |
194 | 0 | const __m512i vk1xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 48 * sizeof(int8_t)))); |
195 | 0 | i1 += 32; |
196 | |
|
197 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi1x0123456789ABCDEF, vk1x0123456789ABCDEF)); |
198 | 0 | vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi1xGHIJKLMNOPQRSTUV, vk1xGHIJKLMNOPQRSTUV)); |
199 | |
|
200 | 0 | const __m512i vi2x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i2)); |
201 | 0 | const __m512i vk2x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 64 * sizeof(int8_t)))); |
202 | 0 | const __m512i vi2xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i2 + 16))); |
203 | 0 | const __m512i vk2xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 80 * sizeof(int8_t)))); |
204 | 0 | i2 += 32; |
205 | |
|
206 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi2x0123456789ABCDEF, vk2x0123456789ABCDEF)); |
207 | 0 | vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi2xGHIJKLMNOPQRSTUV, vk2xGHIJKLMNOPQRSTUV)); |
208 | |
|
209 | 0 | const __m512i vi3x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i3)); |
210 | 0 | const __m512i vk3x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 96 * sizeof(int8_t)))); |
211 | 0 | const __m512i vi3xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i3 + 16))); |
212 | 0 | const __m512i vk3xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 112 * sizeof(int8_t)))); |
213 | 0 | i3 += 32; |
214 | |
|
215 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi3x0123456789ABCDEF, vk3x0123456789ABCDEF)); |
216 | 0 | vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi3xGHIJKLMNOPQRSTUV, vk3xGHIJKLMNOPQRSTUV)); |
217 | |
|
218 | 0 | const __m512i vi4x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i4)); |
219 | 0 | const __m512i vk4x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 128 * sizeof(int8_t)))); |
220 | 0 | const __m512i vi4xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i4 + 16))); |
221 | 0 | const __m512i vk4xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 144 * sizeof(int8_t)))); |
222 | 0 | i4 += 32; |
223 | |
|
224 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi4x0123456789ABCDEF, vk4x0123456789ABCDEF)); |
225 | 0 | vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi4xGHIJKLMNOPQRSTUV, vk4xGHIJKLMNOPQRSTUV)); |
226 | |
|
227 | 0 | const __m512i vi5x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i5)); |
228 | 0 | const __m512i vk5x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 160 * sizeof(int8_t)))); |
229 | 0 | const __m512i vi5xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i5 + 16))); |
230 | 0 | const __m512i vk5xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 176 * sizeof(int8_t)))); |
231 | 0 | i5 += 32; |
232 | |
|
233 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi5x0123456789ABCDEF, vk5x0123456789ABCDEF)); |
234 | 0 | vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi5xGHIJKLMNOPQRSTUV, vk5xGHIJKLMNOPQRSTUV)); |
235 | |
|
236 | 0 | const __m512i vi6x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i6)); |
237 | 0 | const __m512i vk6x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 192 * sizeof(int8_t)))); |
238 | 0 | const __m512i vi6xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i6 + 16))); |
239 | 0 | const __m512i vk6xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 208 * sizeof(int8_t)))); |
240 | 0 | i6 += 32; |
241 | |
|
242 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi6x0123456789ABCDEF, vk6x0123456789ABCDEF)); |
243 | 0 | vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi6xGHIJKLMNOPQRSTUV, vk6xGHIJKLMNOPQRSTUV)); |
244 | |
|
245 | 0 | const __m512i vi7x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i7)); |
246 | 0 | const __m512i vk7x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 224 * sizeof(int8_t)))); |
247 | 0 | const __m512i vi7xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i7 + 16))); |
248 | 0 | const __m512i vk7xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 240 * sizeof(int8_t)))); |
249 | 0 | i7 += 32; |
250 | |
|
251 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi7x0123456789ABCDEF, vk7x0123456789ABCDEF)); |
252 | 0 | vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi7xGHIJKLMNOPQRSTUV, vk7xGHIJKLMNOPQRSTUV)); |
253 | |
|
254 | 0 | const __m512i vi8x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i8)); |
255 | 0 | const __m512i vk8x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 256 * sizeof(int8_t)))); |
256 | 0 | const __m512i vi8xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i8 + 16))); |
257 | 0 | const __m512i vk8xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 272 * sizeof(int8_t)))); |
258 | 0 | i8 += 32; |
259 | |
|
260 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi8x0123456789ABCDEF, vk8x0123456789ABCDEF)); |
261 | 0 | vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi8xGHIJKLMNOPQRSTUV, vk8xGHIJKLMNOPQRSTUV)); |
262 | |
|
263 | 0 | const __m512i vi9x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i9)); |
264 | 0 | const __m512i vk9x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 288 * sizeof(int8_t)))); |
265 | 0 | const __m512i vi9xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i9 + 16))); |
266 | 0 | const __m512i vk9xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 304 * sizeof(int8_t)))); |
267 | 0 | i9 += 32; |
268 | |
|
269 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi9x0123456789ABCDEF, vk9x0123456789ABCDEF)); |
270 | 0 | vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi9xGHIJKLMNOPQRSTUV, vk9xGHIJKLMNOPQRSTUV)); |
271 | |
|
272 | 0 | const __m512i vi10x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i10)); |
273 | 0 | const __m512i vk10x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 320 * sizeof(int8_t)))); |
274 | 0 | const __m512i vi10xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i10 + 16))); |
275 | 0 | const __m512i vk10xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 336 * sizeof(int8_t)))); |
276 | 0 | i10 += 32; |
277 | |
|
278 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi10x0123456789ABCDEF, vk10x0123456789ABCDEF)); |
279 | 0 | vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi10xGHIJKLMNOPQRSTUV, vk10xGHIJKLMNOPQRSTUV)); |
280 | |
|
281 | 0 | const __m512i vi11x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i11)); |
282 | 0 | const __m512i vk11x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 352 * sizeof(int8_t)))); |
283 | 0 | const __m512i vi11xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i11 + 16))); |
284 | 0 | const __m512i vk11xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 368 * sizeof(int8_t)))); |
285 | 0 | i11 += 32; |
286 | |
|
287 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi11x0123456789ABCDEF, vk11x0123456789ABCDEF)); |
288 | 0 | vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi11xGHIJKLMNOPQRSTUV, vk11xGHIJKLMNOPQRSTUV)); |
289 | |
|
290 | 0 | const __m512i vi12x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i12)); |
291 | 0 | const __m512i vk12x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 384 * sizeof(int8_t)))); |
292 | 0 | const __m512i vi12xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i12 + 16))); |
293 | 0 | const __m512i vk12xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 400 * sizeof(int8_t)))); |
294 | 0 | i12 += 32; |
295 | |
|
296 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi12x0123456789ABCDEF, vk12x0123456789ABCDEF)); |
297 | 0 | vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi12xGHIJKLMNOPQRSTUV, vk12xGHIJKLMNOPQRSTUV)); |
298 | |
|
299 | 0 | const __m512i vi13x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i13)); |
300 | 0 | const __m512i vk13x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 416 * sizeof(int8_t)))); |
301 | 0 | const __m512i vi13xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i13 + 16))); |
302 | 0 | const __m512i vk13xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 432 * sizeof(int8_t)))); |
303 | 0 | i13 += 32; |
304 | |
|
305 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi13x0123456789ABCDEF, vk13x0123456789ABCDEF)); |
306 | 0 | vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi13xGHIJKLMNOPQRSTUV, vk13xGHIJKLMNOPQRSTUV)); |
307 | |
|
308 | 0 | const __m512i vi14x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i14)); |
309 | 0 | const __m512i vk14x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 448 * sizeof(int8_t)))); |
310 | 0 | const __m512i vi14xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i14 + 16))); |
311 | 0 | const __m512i vk14xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 464 * sizeof(int8_t)))); |
312 | 0 | i14 += 32; |
313 | |
|
314 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi14x0123456789ABCDEF, vk14x0123456789ABCDEF)); |
315 | 0 | vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi14xGHIJKLMNOPQRSTUV, vk14xGHIJKLMNOPQRSTUV)); |
316 | |
|
317 | 0 | const __m512i vi15x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i15)); |
318 | 0 | const __m512i vk15x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 480 * sizeof(int8_t)))); |
319 | 0 | const __m512i vi15xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i15 + 16))); |
320 | 0 | const __m512i vk15xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 496 * sizeof(int8_t)))); |
321 | 0 | i15 += 32; |
322 | |
|
323 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi15x0123456789ABCDEF, vk15x0123456789ABCDEF)); |
324 | 0 | vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi15xGHIJKLMNOPQRSTUV, vk15xGHIJKLMNOPQRSTUV)); |
325 | |
|
326 | 0 | const __m512i vi16x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i16)); |
327 | 0 | const __m512i vk16x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 512 * sizeof(int8_t)))); |
328 | 0 | const __m512i vi16xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i16 + 16))); |
329 | 0 | const __m512i vk16xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 528 * sizeof(int8_t)))); |
330 | 0 | i16 += 32; |
331 | |
|
332 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi16x0123456789ABCDEF, vk16x0123456789ABCDEF)); |
333 | 0 | vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi16xGHIJKLMNOPQRSTUV, vk16xGHIJKLMNOPQRSTUV)); |
334 | |
|
335 | 0 | const __m512i vi17x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i17)); |
336 | 0 | const __m512i vk17x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 544 * sizeof(int8_t)))); |
337 | 0 | const __m512i vi17xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i17 + 16))); |
338 | 0 | const __m512i vk17xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 560 * sizeof(int8_t)))); |
339 | 0 | i17 += 32; |
340 | |
|
341 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi17x0123456789ABCDEF, vk17x0123456789ABCDEF)); |
342 | 0 | vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi17xGHIJKLMNOPQRSTUV, vk17xGHIJKLMNOPQRSTUV)); |
343 | |
|
344 | 0 | const __m512i vi18x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i18)); |
345 | 0 | const __m512i vk18x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 576 * sizeof(int8_t)))); |
346 | 0 | const __m512i vi18xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i18 + 16))); |
347 | 0 | const __m512i vk18xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 592 * sizeof(int8_t)))); |
348 | 0 | i18 += 32; |
349 | |
|
350 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi18x0123456789ABCDEF, vk18x0123456789ABCDEF)); |
351 | 0 | vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi18xGHIJKLMNOPQRSTUV, vk18xGHIJKLMNOPQRSTUV)); |
352 | |
|
353 | 0 | const __m512i vi19x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i19)); |
354 | 0 | const __m512i vk19x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 608 * sizeof(int8_t)))); |
355 | 0 | const __m512i vi19xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i19 + 16))); |
356 | 0 | const __m512i vk19xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 624 * sizeof(int8_t)))); |
357 | 0 | i19 += 32; |
358 | |
|
359 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi19x0123456789ABCDEF, vk19x0123456789ABCDEF)); |
360 | 0 | vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi19xGHIJKLMNOPQRSTUV, vk19xGHIJKLMNOPQRSTUV)); |
361 | |
|
362 | 0 | const __m512i vi20x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i20)); |
363 | 0 | const __m512i vk20x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 640 * sizeof(int8_t)))); |
364 | 0 | const __m512i vi20xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i20 + 16))); |
365 | 0 | const __m512i vk20xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 656 * sizeof(int8_t)))); |
366 | 0 | i20 += 32; |
367 | |
|
368 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi20x0123456789ABCDEF, vk20x0123456789ABCDEF)); |
369 | 0 | vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi20xGHIJKLMNOPQRSTUV, vk20xGHIJKLMNOPQRSTUV)); |
370 | |
|
371 | 0 | const __m512i vi21x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i21)); |
372 | 0 | const __m512i vk21x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 672 * sizeof(int8_t)))); |
373 | 0 | const __m512i vi21xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i21 + 16))); |
374 | 0 | const __m512i vk21xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 688 * sizeof(int8_t)))); |
375 | 0 | i21 += 32; |
376 | |
|
377 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi21x0123456789ABCDEF, vk21x0123456789ABCDEF)); |
378 | 0 | vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi21xGHIJKLMNOPQRSTUV, vk21xGHIJKLMNOPQRSTUV)); |
379 | |
|
380 | 0 | const __m512i vi22x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i22)); |
381 | 0 | const __m512i vk22x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 704 * sizeof(int8_t)))); |
382 | 0 | const __m512i vi22xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i22 + 16))); |
383 | 0 | const __m512i vk22xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 720 * sizeof(int8_t)))); |
384 | 0 | i22 += 32; |
385 | |
|
386 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi22x0123456789ABCDEF, vk22x0123456789ABCDEF)); |
387 | 0 | vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi22xGHIJKLMNOPQRSTUV, vk22xGHIJKLMNOPQRSTUV)); |
388 | |
|
389 | 0 | const __m512i vi23x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i23)); |
390 | 0 | const __m512i vk23x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 736 * sizeof(int8_t)))); |
391 | 0 | const __m512i vi23xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i23 + 16))); |
392 | 0 | const __m512i vk23xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 752 * sizeof(int8_t)))); |
393 | 0 | i23 += 32; |
394 | |
|
395 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi23x0123456789ABCDEF, vk23x0123456789ABCDEF)); |
396 | 0 | vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi23xGHIJKLMNOPQRSTUV, vk23xGHIJKLMNOPQRSTUV)); |
397 | |
|
398 | 0 | const __m512i vi24x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i24)); |
399 | 0 | const __m512i vk24x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 768 * sizeof(int8_t)))); |
400 | 0 | const __m512i vi24xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i24 + 16))); |
401 | 0 | const __m512i vk24xGHIJKLMNOPQRSTUV = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int32_t) + 784 * sizeof(int8_t)))); |
402 | 0 | i24 += 32; |
403 | |
|
404 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi24x0123456789ABCDEF, vk24x0123456789ABCDEF)); |
405 | 0 | vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vi24xGHIJKLMNOPQRSTUV, vk24xGHIJKLMNOPQRSTUV)); |
406 | |
|
407 | 0 | w = (const void*) ((uintptr_t) w + 32 * sizeof(int32_t) + 800 * sizeof(int8_t)); |
408 | |
|
409 | 0 | __m512 vscaled0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0123456789ABCDEF); |
410 | 0 | __m512 vscaledGHIJKLMNOPQRSTUV = _mm512_cvtepi32_ps(vaccGHIJKLMNOPQRSTUV); |
411 | |
|
412 | 0 | const __m512 vscale0123456789ABCDEF = _mm512_loadu_ps(w); |
413 | 0 | const __m512 vscaleGHIJKLMNOPQRSTUV = _mm512_loadu_ps((const void*) ((uintptr_t) w + 16 * sizeof(float))); |
414 | 0 | w = (const void*) ((uintptr_t) w + 32 * sizeof(float)); |
415 | 0 | vscaled0123456789ABCDEF = _mm512_mul_ps(vscaled0123456789ABCDEF, vscale0123456789ABCDEF); |
416 | 0 | vscaledGHIJKLMNOPQRSTUV = _mm512_mul_ps(vscaledGHIJKLMNOPQRSTUV, vscaleGHIJKLMNOPQRSTUV); |
417 | |
|
418 | 0 | vscaled0123456789ABCDEF = _mm512_min_ps(vscaled0123456789ABCDEF, voutput_max_less_zero_point); |
419 | 0 | vscaledGHIJKLMNOPQRSTUV = _mm512_min_ps(vscaledGHIJKLMNOPQRSTUV, voutput_max_less_zero_point); |
420 | |
|
421 | 0 | vacc0123456789ABCDEF = _mm512_cvtps_epi32(vscaled0123456789ABCDEF); |
422 | 0 | vaccGHIJKLMNOPQRSTUV = _mm512_cvtps_epi32(vscaledGHIJKLMNOPQRSTUV); |
423 | |
|
424 | 0 | __m512i vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV = _mm512_adds_epi16(_mm512_packs_epi32(vacc0123456789ABCDEF, vaccGHIJKLMNOPQRSTUV), voutput_zero_point); |
425 | 0 | __m256i voutGHIJOPQRKLMNSTUV = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vaccGHIJKLMNOPQRSTUV), _mm512_extracti32x8_epi32(vaccGHIJKLMNOPQRSTUV, 1)), _mm512_castsi512_si256(voutput_zero_point)); |
426 | |
|
427 | 0 | const __m256i vout0123GHIJ4567KLMN = _mm512_castsi512_si256(vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV); |
428 | 0 | const __m256i vout89ABOPQRCDEFSTUV = _mm512_extracti32x8_epi32(vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV, 1); |
429 | 0 | const __m256i vout0123GHIJ89ABOPQR4567KLMNCDEFSTUV = _mm256_packs_epi16(vout0123GHIJ4567KLMN, vout89ABOPQRCDEFSTUV); |
430 | 0 | __m256i vout0123456789ABCDEFGHIJKLMNOPQRSTUV = _mm256_permutevar8x32_epi32(vout0123GHIJ89ABOPQR4567KLMNCDEFSTUV, vpermute_mask); |
431 | 0 | const __m128i voutGHIJOPQR = _mm256_castsi256_si128(voutGHIJOPQRKLMNSTUV); |
432 | 0 | const __m128i voutKLMNSTUV = _mm256_extracti128_si256(voutGHIJOPQRKLMNSTUV, 1); |
433 | 0 | __m128i voutGHIJKLMNOPQRSTUV = _mm_shuffle_epi32(_mm_packs_epi16(voutGHIJOPQR, voutKLMNSTUV), _MM_SHUFFLE(3, 1, 2, 0)); |
434 | |
|
435 | 0 | vout0123456789ABCDEFGHIJKLMNOPQRSTUV = _mm256_max_epi8(vout0123456789ABCDEFGHIJKLMNOPQRSTUV, voutput_min); |
436 | 0 | voutGHIJKLMNOPQRSTUV = _mm_max_epi8(voutGHIJKLMNOPQRSTUV, _mm256_castsi256_si128(voutput_min)); |
437 | |
|
438 | 0 | _mm256_storeu_si256((__m256i*) output, vout0123456789ABCDEFGHIJKLMNOPQRSTUV); |
439 | 0 | _mm_storeu_si128((__m128i*) (output + 16), voutGHIJKLMNOPQRSTUV); |
440 | 0 | output += 32; |
441 | 0 | } |
442 | 0 | if XNN_UNLIKELY(c != 0) { |
443 | | // Prepare mask for the valid 8-bit elements of the final partial store (depends on c & 15). |
444 | 0 | const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << (c & 15)) - UINT32_C(1))); |
445 | 0 | const int8_t* k = (const int8_t*) ((uintptr_t) w + 32 * sizeof(int32_t)); |
446 | 0 | do { |
447 | 0 | __m512i vacc0123456789ABCDEF = _mm512_loadu_si512(w); |
448 | | |
449 | |
|
450 | 0 | const __m512i vi0x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i0)); |
451 | 0 | const __m512i vk0x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) k)); |
452 | 0 | i0 += 16; |
453 | |
|
454 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi0x0123456789ABCDEF, vk0x0123456789ABCDEF)); |
455 | |
|
456 | 0 | const __m512i vi1x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i1)); |
457 | 0 | const __m512i vk1x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 32))); |
458 | 0 | i1 += 16; |
459 | |
|
460 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi1x0123456789ABCDEF, vk1x0123456789ABCDEF)); |
461 | |
|
462 | 0 | const __m512i vi2x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i2)); |
463 | 0 | const __m512i vk2x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 64))); |
464 | 0 | i2 += 16; |
465 | |
|
466 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi2x0123456789ABCDEF, vk2x0123456789ABCDEF)); |
467 | |
|
468 | 0 | const __m512i vi3x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i3)); |
469 | 0 | const __m512i vk3x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 96))); |
470 | 0 | i3 += 16; |
471 | |
|
472 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi3x0123456789ABCDEF, vk3x0123456789ABCDEF)); |
473 | |
|
474 | 0 | const __m512i vi4x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i4)); |
475 | 0 | const __m512i vk4x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 128))); |
476 | 0 | i4 += 16; |
477 | |
|
478 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi4x0123456789ABCDEF, vk4x0123456789ABCDEF)); |
479 | |
|
480 | 0 | const __m512i vi5x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i5)); |
481 | 0 | const __m512i vk5x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 160))); |
482 | 0 | i5 += 16; |
483 | |
|
484 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi5x0123456789ABCDEF, vk5x0123456789ABCDEF)); |
485 | |
|
486 | 0 | const __m512i vi6x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i6)); |
487 | 0 | const __m512i vk6x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 192))); |
488 | 0 | i6 += 16; |
489 | |
|
490 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi6x0123456789ABCDEF, vk6x0123456789ABCDEF)); |
491 | |
|
492 | 0 | const __m512i vi7x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i7)); |
493 | 0 | const __m512i vk7x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 224))); |
494 | 0 | i7 += 16; |
495 | |
|
496 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi7x0123456789ABCDEF, vk7x0123456789ABCDEF)); |
497 | |
|
498 | 0 | const __m512i vi8x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i8)); |
499 | 0 | const __m512i vk8x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 256))); |
500 | 0 | i8 += 16; |
501 | |
|
502 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi8x0123456789ABCDEF, vk8x0123456789ABCDEF)); |
503 | |
|
504 | 0 | const __m512i vi9x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i9)); |
505 | 0 | const __m512i vk9x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 288))); |
506 | 0 | i9 += 16; |
507 | |
|
508 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi9x0123456789ABCDEF, vk9x0123456789ABCDEF)); |
509 | |
|
510 | 0 | const __m512i vi10x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i10)); |
511 | 0 | const __m512i vk10x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 320))); |
512 | 0 | i10 += 16; |
513 | |
|
514 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi10x0123456789ABCDEF, vk10x0123456789ABCDEF)); |
515 | |
|
516 | 0 | const __m512i vi11x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i11)); |
517 | 0 | const __m512i vk11x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 352))); |
518 | 0 | i11 += 16; |
519 | |
|
520 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi11x0123456789ABCDEF, vk11x0123456789ABCDEF)); |
521 | |
|
522 | 0 | const __m512i vi12x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i12)); |
523 | 0 | const __m512i vk12x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 384))); |
524 | 0 | i12 += 16; |
525 | |
|
526 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi12x0123456789ABCDEF, vk12x0123456789ABCDEF)); |
527 | |
|
528 | 0 | const __m512i vi13x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i13)); |
529 | 0 | const __m512i vk13x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 416))); |
530 | 0 | i13 += 16; |
531 | |
|
532 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi13x0123456789ABCDEF, vk13x0123456789ABCDEF)); |
533 | |
|
534 | 0 | const __m512i vi14x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i14)); |
535 | 0 | const __m512i vk14x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 448))); |
536 | 0 | i14 += 16; |
537 | |
|
538 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi14x0123456789ABCDEF, vk14x0123456789ABCDEF)); |
539 | |
|
540 | 0 | const __m512i vi15x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i15)); |
541 | 0 | const __m512i vk15x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 480))); |
542 | 0 | i15 += 16; |
543 | |
|
544 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi15x0123456789ABCDEF, vk15x0123456789ABCDEF)); |
545 | |
|
546 | 0 | const __m512i vi16x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i16)); |
547 | 0 | const __m512i vk16x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 512))); |
548 | 0 | i16 += 16; |
549 | |
|
550 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi16x0123456789ABCDEF, vk16x0123456789ABCDEF)); |
551 | |
|
552 | 0 | const __m512i vi17x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i17)); |
553 | 0 | const __m512i vk17x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 544))); |
554 | 0 | i17 += 16; |
555 | |
|
556 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi17x0123456789ABCDEF, vk17x0123456789ABCDEF)); |
557 | |
|
558 | 0 | const __m512i vi18x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i18)); |
559 | 0 | const __m512i vk18x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 576))); |
560 | 0 | i18 += 16; |
561 | |
|
562 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi18x0123456789ABCDEF, vk18x0123456789ABCDEF)); |
563 | |
|
564 | 0 | const __m512i vi19x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i19)); |
565 | 0 | const __m512i vk19x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 608))); |
566 | 0 | i19 += 16; |
567 | |
|
568 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi19x0123456789ABCDEF, vk19x0123456789ABCDEF)); |
569 | |
|
570 | 0 | const __m512i vi20x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i20)); |
571 | 0 | const __m512i vk20x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 640))); |
572 | 0 | i20 += 16; |
573 | |
|
574 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi20x0123456789ABCDEF, vk20x0123456789ABCDEF)); |
575 | |
|
576 | 0 | const __m512i vi21x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i21)); |
577 | 0 | const __m512i vk21x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 672))); |
578 | 0 | i21 += 16; |
579 | |
|
580 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi21x0123456789ABCDEF, vk21x0123456789ABCDEF)); |
581 | |
|
582 | 0 | const __m512i vi22x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i22)); |
583 | 0 | const __m512i vk22x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 704))); |
584 | 0 | i22 += 16; |
585 | |
|
586 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi22x0123456789ABCDEF, vk22x0123456789ABCDEF)); |
587 | |
|
588 | 0 | const __m512i vi23x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i23)); |
589 | 0 | const __m512i vk23x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 736))); |
590 | 0 | i23 += 16; |
591 | |
|
592 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi23x0123456789ABCDEF, vk23x0123456789ABCDEF)); |
593 | |
|
594 | 0 | const __m512i vi24x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i24)); |
595 | 0 | const __m512i vk24x0123456789ABCDEF = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (k + 768))); |
596 | 0 | i24 += 16; |
597 | |
|
598 | 0 | vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vi24x0123456789ABCDEF, vk24x0123456789ABCDEF)); |
599 | |
|
600 | 0 | k += 16; |
601 | |
|
602 | 0 | __m512 vscaled0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0123456789ABCDEF); |
603 | 0 | const __m512 vscale0123456789ABCDEF = _mm512_loadu_ps((const void*) ((uintptr_t) w + 32 * sizeof(int32_t) + 800 * sizeof(int8_t))); |
604 | 0 | vscaled0123456789ABCDEF = _mm512_mul_ps(vscaled0123456789ABCDEF, vscale0123456789ABCDEF); |
605 | 0 | vscaled0123456789ABCDEF = _mm512_min_ps(vscaled0123456789ABCDEF, voutput_max_less_zero_point); |
606 | 0 | vacc0123456789ABCDEF = _mm512_cvtps_epi32(vscaled0123456789ABCDEF); |
607 | |
|
608 | 0 | w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t)); |
609 | |
|
610 | 0 | __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), _mm512_castsi512_si256(voutput_zero_point)); |
611 | |
|
612 | 0 | const __m128i vout012389AB = _mm256_castsi256_si128(vout012389AB4567CDEF); |
613 | 0 | const __m128i vout4567CDEF = _mm256_extracti128_si256(vout012389AB4567CDEF, 1); |
614 | 0 | __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(vout012389AB, vout4567CDEF), _MM_SHUFFLE(3, 1, 2, 0)); |
615 | 0 | vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, _mm256_castsi256_si128(voutput_min)); |
616 | |
|
617 | 0 | if XNN_LIKELY(c >= 16) { |
618 | 0 | _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF); |
619 | 0 | output += 16; |
620 | 0 | c -= 16; |
621 | 0 | } else { |
622 | 0 | _mm_mask_storeu_epi8(output, vmask, vout0123456789ABCDEF); |
623 | 0 | output = (int8_t*) ((uintptr_t) output + c); |
624 | 0 | c = 0; |
625 | 0 | } |
626 | 0 | } while (c != 0); |
627 | 0 | } |
628 | |
|
629 | 0 | input_offset += input_pixel_stride; |
630 | 0 | output = (int8_t*) ((uintptr_t) output + output_increment); |
631 | 0 | } while (--output_width != 0); |
632 | 0 | } |