/src/Simd/src/Simd/SimdAvx2DescrIntEnc.cpp
Line | Count | Source |
1 | | /* |
2 | | * Simd Library (http://ermig1979.github.io/Simd). |
3 | | * |
4 | | * Copyright (c) 2011-2023 Yermalayeu Ihar. |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person obtaining a copy |
7 | | * of this software and associated documentation files (the "Software"), to deal |
8 | | * in the Software without restriction, including without limitation the rights |
9 | | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
10 | | * copies of the Software, and to permit persons to whom the Software is |
11 | | * furnished to do so, subject to the following conditions: |
12 | | * |
13 | | * The above copyright notice and this permission notice shall be included in |
14 | | * all copies or substantial portions of the Software. |
15 | | * |
16 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
17 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
18 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
19 | | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
20 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
21 | | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
22 | | * SOFTWARE. |
23 | | */ |
24 | | #include "Simd/SimdMemory.h" |
25 | | #include "Simd/SimdStore.h" |
26 | | #include "Simd/SimdExtract.h" |
27 | | #include "Simd/SimdArray.h" |
28 | | #include "Simd/SimdUnpack.h" |
29 | | #include "Simd/SimdDescrInt.h" |
30 | | #include "Simd/SimdDescrIntCommon.h" |
31 | | #include "Simd/SimdCpu.h" |
32 | | |
33 | | namespace Simd |
34 | | { |
35 | | #ifdef SIMD_AVX2_ENABLE |
36 | | namespace Avx2 |
37 | | { |
        // Quantizes 8 packed floats: value[i] = int-round((src[i] - min) * scale),
        // and accumulates the quantized values into 'sum' and their squares into
        // 'sqsum' (per 32-bit lane; callers reduce across lanes at the end).
        // NOTE(review): _mm256_cvtps_epi32 rounds with the current MXCSR mode
        // (round-to-nearest-even by default), and _mm256_madd_epi16(value, value)
        // equals value^2 per lane only while each quantized value fits in 16 bits --
        // presumably guaranteed by the <= 8-bit target depths; confirm at call sites.
        SIMD_INLINE __m256i Encode32f(__m256 src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            __m256i value = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_sub_ps(src, min), scale));
            sum = _mm256_add_epi32(value, sum);
            sqsum = _mm256_add_epi32(_mm256_madd_epi16(value, value), sqsum);
            return value;
        }
45 | | |
46 | | SIMD_INLINE __m256i Encode32f(const float* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum) |
47 | 0 | { |
48 | 0 | return Encode32f(_mm256_loadu_ps(src), scale, min, sum, sqsum); |
49 | 0 | } |
50 | | |
        // Packs 8 floats into 4-bit codes: 4 meaningful bytes in the low part of the
        // result (two codes per byte), upper bytes zero. E4_MULLO presumably shifts
        // each code inside its 16-bit lane so the >>12 pairs them up before the
        // packus compaction -- defined by Sse41::E4_MULLO (not visible here).
        static SIMD_INLINE __m128i Encode32f4x8(const float* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            __m256i i0 = Encode32f(src + 0 * 8, scale, min, sum, sqsum);
            __m128i s0 = _mm_srli_epi32(_mm_mullo_epi16(_mm256_castsi256_si128(PackU32ToI16(i0, _mm256_setzero_si256())), Sse41::E4_MULLO), 12);
            return _mm_packus_epi16(_mm_packus_epi32(s0, Sse41::K_ZERO), Sse41::K_ZERO);
        }
57 | | |
        // Packs 32 floats into 16 bytes of 4-bit codes (two values per byte).
        // Each PackU32ToI16 merges two 8-value groups; the final packus sequence
        // interleaves the two 128-bit lanes back into source order.
        static SIMD_INLINE __m128i Encode32f4x32(const float* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            __m256i i0 = Encode32f(src + 0 * 8, scale, min, sum, sqsum);
            __m256i i1 = Encode32f(src + 1 * 8, scale, min, sum, sqsum);
            __m256i s0 = _mm256_srli_epi32(_mm256_mullo_epi16(PackU32ToI16(i0, i1), E4_MULLO), 12);
            __m256i i2 = Encode32f(src + 2 * 8, scale, min, sum, sqsum);
            __m256i i3 = Encode32f(src + 3 * 8, scale, min, sum, sqsum);
            __m256i s1 = _mm256_srli_epi32(_mm256_mullo_epi16(PackU32ToI16(i2, i3), E4_MULLO), 12);
            return _mm_packus_epi16(_mm_packus_epi32(_mm256_castsi256_si128(s0), _mm256_extracti128_si256(s0, 1)),
                _mm_packus_epi32(_mm256_castsi256_si128(s1), _mm256_extracti128_si256(s1, 1)));
        }
69 | | |
70 | | static void Encode32f4(const float* src, float scale, float min, size_t size, int32_t& sum, int32_t& sqsum, uint8_t* dst) |
71 | 0 | { |
72 | 0 | assert(size % 8 == 0); |
73 | 0 | size_t i = 0, size32 = AlignLo(size, 32); |
74 | 0 | __m256 _scale = _mm256_set1_ps(scale); |
75 | 0 | __m256 _min = _mm256_set1_ps(min); |
76 | 0 | __m256i _sum = _mm256_setzero_si256(); |
77 | 0 | __m256i _sqsum = _mm256_setzero_si256(); |
78 | 0 | for (; i < size32; i += 32, src += 32, dst += 16) |
79 | 0 | _mm_storeu_si128((__m128i*)dst, Encode32f4x32(src, _scale, _min, _sum, _sqsum)); |
80 | 0 | for (; i < size; i += 8, src += 8, dst += 4) |
81 | 0 | *(uint32_t*)(dst) = _mm_extract_epi32(Encode32f4x8(src, _scale, _min, _sum, _sqsum), 0); |
82 | 0 | sum = ExtractSum<uint32_t>(_sum); |
83 | 0 | sqsum = ExtractSum<uint32_t>(_sqsum); |
84 | 0 | } |
85 | | |
        // Packs 8 floats into 5-bit codes: 5 meaningful bytes in the low part of the
        // result. The three E5_SHFL* shuffles presumably scatter the bit pieces to
        // disjoint byte positions so the ORs assemble the packed stream -- defined by
        // the Sse41 constants (not visible here). Bytes past the 5th are assumed
        // zero; callers may store 8 bytes and overwrite the excess next iteration.
        static SIMD_INLINE __m128i Encode32f5x1(const float* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            __m256i i0 = Encode32f(src + 0, scale, min, sum, sqsum);
            __m128i s0 = _mm_mullo_epi16(_mm256_castsi256_si128(PackU32ToI16(i0, _mm256_setzero_si256())), Sse41::E5_MULLO);
            return _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s0, Sse41::E5_SHFL0), _mm_shuffle_epi8(s0, Sse41::E5_SHFL1)), _mm_shuffle_epi8(s0, Sse41::E5_SHFL2));
        }
92 | | |
        // Packs 16 floats into 10 meaningful bytes of 5-bit codes. The low and high
        // 128-bit lanes each pack one 8-value group; the final OR merges them --
        // the AVX2 E5_SHFL* constants presumably place the halves at disjoint byte
        // offsets (0..4 and 5..9). TODO confirm against SimdDescrIntCommon.h.
        static SIMD_INLINE __m128i Encode32f5x2(const float* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            __m256i i0 = Encode32f(src + 0, scale, min, sum, sqsum);
            __m256i i8 = Encode32f(src + 8, scale, min, sum, sqsum);
            __m256i s0 = _mm256_mullo_epi16(PackU32ToI16(i0, i8), E5_MULLO);
            __m256i e0 = _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s0, E5_SHFL0), _mm256_shuffle_epi8(s0, E5_SHFL1)), _mm256_shuffle_epi8(s0, E5_SHFL2));
            return _mm_or_si128(_mm256_castsi256_si128(e0), _mm256_extracti128_si256(e0, 1));
        }
101 | | |
102 | | static void Encode32f5(const float* src, float scale, float min, size_t size, int32_t& sum, int32_t& sqsum, uint8_t* dst) |
103 | 0 | { |
104 | 0 | assert(size % 8 == 0); |
105 | 0 | size_t i = 0, main = size - 8, main16 = AlignLo(main, 16); |
106 | 0 | __m256 _scale = _mm256_set1_ps(scale); |
107 | 0 | __m256 _min = _mm256_set1_ps(min); |
108 | 0 | __m256i _sum = _mm256_setzero_si256(); |
109 | 0 | __m256i _sqsum = _mm256_setzero_si256(); |
110 | 0 | for (; i < main16; i += 16, src += 16, dst += 10) |
111 | 0 | _mm_storeu_si128((__m128i*)dst, Encode32f5x2(src, _scale, _min, _sum, _sqsum)); |
112 | 0 | for (; i < main; i += 8, src += 8, dst += 5) |
113 | 0 | _mm_storel_epi64((__m128i*)dst, Encode32f5x1(src, _scale, _min, _sum, _sqsum)); |
114 | 0 | for (; i < size; i += 8, src += 8, dst += 5) |
115 | 0 | { |
116 | 0 | __m128i d0 = Encode32f5x1(src, _scale, _min, _sum, _sqsum); |
117 | 0 | *(uint32_t*)(dst + 0) = _mm_extract_epi32(d0, 0); |
118 | 0 | *(uint8_t*)(dst + 4) = _mm_extract_epi8(d0, 4); |
119 | 0 | } |
120 | 0 | sum = ExtractSum<uint32_t>(_sum); |
121 | 0 | sqsum = ExtractSum<uint32_t>(_sqsum); |
122 | 0 | } |
123 | | |
        // Packs 8 floats into 6-bit codes: 6 meaningful bytes in the low part of the
        // result. E6_MULLO/E6_SHFL* (Sse41 constants, not visible here) presumably
        // position the bit pieces so the OR assembles the packed stream.
        static SIMD_INLINE __m128i Encode32f6x1(const float* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            __m256i i0 = Encode32f(src + 0, scale, min, sum, sqsum);
            __m128i s0 = _mm_mullo_epi16(_mm256_castsi256_si128(PackU32ToI16(i0, _mm256_setzero_si256())), Sse41::E6_MULLO);
            return _mm_or_si128(_mm_shuffle_epi8(s0, Sse41::E6_SHFL0), _mm_shuffle_epi8(s0, Sse41::E6_SHFL1));
        }
130 | | |
        // Packs 16 floats into 12 meaningful bytes of 6-bit codes; each 128-bit lane
        // packs one 8-value group and the final OR merges the two 6-byte halves --
        // presumably at disjoint byte offsets (0..5 and 6..11). TODO confirm.
        static SIMD_INLINE __m128i Encode32f6x2(const float* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            __m256i i0 = Encode32f(src + 0, scale, min, sum, sqsum);
            __m256i i8 = Encode32f(src + 8, scale, min, sum, sqsum);
            __m256i s0 = _mm256_mullo_epi16(PackU32ToI16(i0, i8), E6_MULLO);
            __m256i e0 = _mm256_or_si256(_mm256_shuffle_epi8(s0, E6_SHFL0), _mm256_shuffle_epi8(s0, E6_SHFL1));
            return _mm_or_si128(_mm256_castsi256_si128(e0), _mm256_extracti128_si256(e0, 1));
        }
139 | | |
140 | | static void Encode32f6(const float* src, float scale, float min, size_t size, int32_t& sum, int32_t& sqsum, uint8_t* dst) |
141 | 0 | { |
142 | 0 | assert(size % 8 == 0); |
143 | 0 | size_t i = 0, main = size - 8, main16 = AlignLo(main, 16); |
144 | 0 | __m256 _scale = _mm256_set1_ps(scale); |
145 | 0 | __m256 _min = _mm256_set1_ps(min); |
146 | 0 | __m256i _sum = _mm256_setzero_si256(); |
147 | 0 | __m256i _sqsum = _mm256_setzero_si256(); |
148 | 0 | for (; i < main16; i += 16, src += 16, dst += 12) |
149 | 0 | _mm_storeu_si128((__m128i*)dst, Encode32f6x2(src, _scale, _min, _sum, _sqsum)); |
150 | 0 | for (; i < main; i += 8, src += 8, dst += 6) |
151 | 0 | _mm_storel_epi64((__m128i*)dst, Encode32f6x1(src, _scale, _min, _sum, _sqsum)); |
152 | 0 | for (; i < size; i += 8, src += 8, dst += 6) |
153 | 0 | { |
154 | 0 | __m128i d0 = Encode32f6x1(src, _scale, _min, _sum, _sqsum); |
155 | 0 | *(uint32_t*)(dst + 0) = _mm_extract_epi32(d0, 0); |
156 | 0 | *(uint16_t*)(dst + 4) = _mm_extract_epi16(d0, 2); |
157 | 0 | } |
158 | 0 | sum = ExtractSum<uint32_t>(_sum); |
159 | 0 | sqsum = ExtractSum<uint32_t>(_sqsum); |
160 | 0 | } |
161 | | |
        // Packs 8 floats into 7-bit codes: 7 meaningful bytes in the low part of the
        // result. Layout is defined by the Sse41::E7_* constants (not visible here).
        static SIMD_INLINE __m128i Encode32f7x1(const float* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            __m256i i0 = Encode32f(src + 0, scale, min, sum, sqsum);
            __m128i s0 = _mm_mullo_epi16(_mm256_castsi256_si128(PackU32ToI16(i0, _mm256_setzero_si256())), Sse41::E7_MULLO);
            return _mm_or_si128(_mm_shuffle_epi8(s0, Sse41::E7_SHFL0), _mm_shuffle_epi8(s0, Sse41::E7_SHFL1));
        }
168 | | |
        // Packs 16 floats into 14 meaningful bytes of 7-bit codes; the two lanes
        // pack one 8-value group each, merged by the final OR -- presumably at
        // disjoint byte offsets (0..6 and 7..13). TODO confirm.
        static SIMD_INLINE __m128i Encode32f7x2(const float* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            __m256i i0 = Encode32f(src + 0, scale, min, sum, sqsum);
            __m256i i8 = Encode32f(src + 8, scale, min, sum, sqsum);
            __m256i s0 = _mm256_mullo_epi16(PackU32ToI16(i0, i8), E7_MULLO);
            __m256i e0 = _mm256_or_si256(_mm256_shuffle_epi8(s0, E7_SHFL0), _mm256_shuffle_epi8(s0, E7_SHFL1));
            return _mm_or_si128(_mm256_castsi256_si128(e0), _mm256_extracti128_si256(e0, 1));
        }
177 | | |
178 | | static void Encode32f7(const float* src, float scale, float min, size_t size, int32_t& sum, int32_t& sqsum, uint8_t* dst) |
179 | 0 | { |
180 | 0 | assert(size % 8 == 0); |
181 | 0 | size_t i = 0, main = size - 8, main16 = AlignLo(main, 16); |
182 | 0 | __m256 _scale = _mm256_set1_ps(scale); |
183 | 0 | __m256 _min = _mm256_set1_ps(min); |
184 | 0 | __m256i _sum = _mm256_setzero_si256(); |
185 | 0 | __m256i _sqsum = _mm256_setzero_si256(); |
186 | 0 | for (; i < main16; i += 16, src += 16, dst += 14) |
187 | 0 | _mm_storeu_si128((__m128i*)dst, Encode32f7x2(src, _scale, _min, _sum, _sqsum)); |
188 | 0 | for (; i < main; i += 8, src += 8, dst += 7) |
189 | 0 | _mm_storel_epi64((__m128i*)dst, Encode32f7x1(src, _scale, _min, _sum, _sqsum)); |
190 | 0 | for (; i < size; i += 8, src += 8, dst += 7) |
191 | 0 | { |
192 | 0 | __m128i d0 = Encode32f7x1(src, _scale, _min, _sum, _sqsum); |
193 | 0 | *(uint32_t*)(dst + 0) = _mm_extract_epi32(d0, 0); |
194 | 0 | *(uint16_t*)(dst + 4) = _mm_extract_epi16(d0, 2); |
195 | 0 | *(uint8_t*)(dst + 6) = _mm_extract_epi8(d0, 6); |
196 | 0 | } |
197 | 0 | sum = ExtractSum<uint32_t>(_sum); |
198 | 0 | sqsum = ExtractSum<uint32_t>(_sqsum); |
199 | 0 | } |
200 | | |
201 | | static void Encode32f8(const float* src, float scale, float min, size_t size, int32_t& sum, int32_t& sqsum, uint8_t* dst) |
202 | 0 | { |
203 | 0 | assert(size % 8 == 0); |
204 | 0 | size_t sizeA = AlignLo(size, A), i = 0; |
205 | 0 | __m256 _scale = _mm256_set1_ps(scale); |
206 | 0 | __m256 _min = _mm256_set1_ps(min); |
207 | 0 | __m256i _sum = _mm256_setzero_si256(); |
208 | 0 | __m256i _sqsum = _mm256_setzero_si256(); |
209 | 0 | for (; i < sizeA; i += A) |
210 | 0 | { |
211 | 0 | __m256i d0 = Encode32f(src + i + 0 * F, _scale, _min, _sum, _sqsum); |
212 | 0 | __m256i d1 = Encode32f(src + i + 1 * F, _scale, _min, _sum, _sqsum); |
213 | 0 | __m256i d2 = Encode32f(src + i + 2 * F, _scale, _min, _sum, _sqsum); |
214 | 0 | __m256i d3 = Encode32f(src + i + 3 * F, _scale, _min, _sum, _sqsum); |
215 | 0 | _mm256_storeu_si256((__m256i*)(dst + i), PackI16ToU8(PackI32ToI16(d0, d1), PackI32ToI16(d2, d3))); |
216 | 0 | } |
217 | 0 | for (; i < size; i += F) |
218 | 0 | { |
219 | 0 | __m256i d0 = Encode32f(src + i, _scale, _min, _sum, _sqsum); |
220 | 0 | _mm_storel_epi64((__m128i*)(dst + i), _mm256_castsi256_si128(PackI16ToU8(PackI32ToI16(d0, _mm256_setzero_si256()), _mm256_setzero_si256()))); |
221 | 0 | } |
222 | 0 | sum = ExtractSum<uint32_t>(_sum); |
223 | 0 | sqsum = ExtractSum<uint32_t>(_sqsum); |
224 | 0 | } |
225 | | |
226 | | //------------------------------------------------------------------------------------------------- |
227 | | |
        // FP16 variant of Encode32f4x8: loads 8 half-precision values, converts
        // them to float via _mm256_cvtph_ps, then packs to 4-bit codes occupying
        // the low 4 bytes of the result (upper bytes zero).
        static SIMD_INLINE __m128i Encode16f4x8(const uint16_t* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            __m256i i0 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128((__m128i*)src)), scale, min, sum, sqsum);
            __m128i s0 = _mm_srli_epi32(_mm_mullo_epi16(_mm256_castsi256_si128(PackU32ToI16(i0, _mm256_setzero_si256())), Sse41::E4_MULLO), 12);
            return _mm_packus_epi16(_mm_packus_epi32(s0, Sse41::K_ZERO), Sse41::K_ZERO);
        }
234 | | |
        // FP16 variant of Encode32f4x32: converts four 8-value half-precision groups
        // to float and packs 32 values into 16 bytes of 4-bit codes.
        static SIMD_INLINE __m128i Encode16f4x32(const uint16_t* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            __m256i i0 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128((__m128i*)src + 0)), scale, min, sum, sqsum);
            __m256i i1 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128((__m128i*)src + 1)), scale, min, sum, sqsum);
            __m256i s0 = _mm256_srli_epi32(_mm256_mullo_epi16(PackU32ToI16(i0, i1), E4_MULLO), 12);
            __m256i i2 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128((__m128i*)src + 2)), scale, min, sum, sqsum);
            __m256i i3 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128((__m128i*)src + 3)), scale, min, sum, sqsum);
            __m256i s1 = _mm256_srli_epi32(_mm256_mullo_epi16(PackU32ToI16(i2, i3), E4_MULLO), 12);
            return _mm_packus_epi16(_mm_packus_epi32(_mm256_castsi256_si128(s0), _mm256_extracti128_si256(s0, 1)),
                _mm_packus_epi32(_mm256_castsi256_si128(s1), _mm256_extracti128_si256(s1, 1)));
        }
246 | | |
247 | | static void Encode16f4(const uint16_t* src, float scale, float min, size_t size, int32_t& sum, int32_t& sqsum, uint8_t* dst) |
248 | 0 | { |
249 | 0 | assert(size % 8 == 0); |
250 | 0 | size_t i = 0, size32 = AlignLo(size, 32); |
251 | 0 | __m256 _scale = _mm256_set1_ps(scale); |
252 | 0 | __m256 _min = _mm256_set1_ps(min); |
253 | 0 | __m256i _sum = _mm256_setzero_si256(); |
254 | 0 | __m256i _sqsum = _mm256_setzero_si256(); |
255 | 0 | for (; i < size32; i += 32, src += 32, dst += 16) |
256 | 0 | _mm_storeu_si128((__m128i*)dst, Encode16f4x32(src, _scale, _min, _sum, _sqsum)); |
257 | 0 | for (; i < size; i += 8, src += 8, dst += 4) |
258 | 0 | *(uint32_t*)(dst) = _mm_extract_epi32(Encode16f4x8(src, _scale, _min, _sum, _sqsum), 0); |
259 | 0 | sum = ExtractSum<uint32_t>(_sum); |
260 | 0 | sqsum = ExtractSum<uint32_t>(_sqsum); |
261 | 0 | } |
262 | | |
        // FP16 variant of Encode32f5x1: converts 8 half-precision values to float,
        // then packs them into 5-bit codes (5 meaningful bytes in the low result).
        static SIMD_INLINE __m128i Encode16f5x1(const uint16_t* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            __m256i i0 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128((__m128i*)src)), scale, min, sum, sqsum);
            __m128i s0 = _mm_mullo_epi16(_mm256_castsi256_si128(PackU32ToI16(i0, _mm256_setzero_si256())), Sse41::E5_MULLO);
            return _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s0, Sse41::E5_SHFL0), _mm_shuffle_epi8(s0, Sse41::E5_SHFL1)), _mm_shuffle_epi8(s0, Sse41::E5_SHFL2));
        }
269 | | |
        // FP16 variant of Encode32f5x2: converts two 8-value half-precision groups
        // to float and packs 16 values into 10 meaningful bytes of 5-bit codes.
        static SIMD_INLINE __m128i Encode16f5x2(const uint16_t* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            __m256i i0 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128((__m128i*)src + 0)), scale, min, sum, sqsum);
            __m256i i8 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128((__m128i*)src + 1)), scale, min, sum, sqsum);
            __m256i s0 = _mm256_mullo_epi16(PackU32ToI16(i0, i8), E5_MULLO);
            __m256i e0 = _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s0, E5_SHFL0), _mm256_shuffle_epi8(s0, E5_SHFL1)), _mm256_shuffle_epi8(s0, E5_SHFL2));
            return _mm_or_si128(_mm256_castsi256_si128(e0), _mm256_extracti128_si256(e0, 1));
        }
278 | | |
279 | | static void Encode16f5(const uint16_t* src, float scale, float min, size_t size, int32_t& sum, int32_t& sqsum, uint8_t* dst) |
280 | 0 | { |
281 | 0 | assert(size % 8 == 0); |
282 | 0 | size_t i = 0, main = size - 8, main16 = AlignLo(main, 16); |
283 | 0 | __m256 _scale = _mm256_set1_ps(scale); |
284 | 0 | __m256 _min = _mm256_set1_ps(min); |
285 | 0 | __m256i _sum = _mm256_setzero_si256(); |
286 | 0 | __m256i _sqsum = _mm256_setzero_si256(); |
287 | 0 | for (; i < main16; i += 16, src += 16, dst += 10) |
288 | 0 | _mm_storeu_si128((__m128i*)dst, Encode16f5x2(src, _scale, _min, _sum, _sqsum)); |
289 | 0 | for (; i < main; i += 8, src += 8, dst += 5) |
290 | 0 | _mm_storel_epi64((__m128i*)dst, Encode16f5x1(src, _scale, _min, _sum, _sqsum)); |
291 | 0 | for (; i < size; i += 8, src += 8, dst += 5) |
292 | 0 | { |
293 | 0 | __m128i d0 = Encode16f5x1(src, _scale, _min, _sum, _sqsum); |
294 | 0 | *(uint32_t*)(dst + 0) = _mm_extract_epi32(d0, 0); |
295 | 0 | *(uint8_t*)(dst + 4) = _mm_extract_epi8(d0, 4); |
296 | 0 | } |
297 | 0 | sum = ExtractSum<uint32_t>(_sum); |
298 | 0 | sqsum = ExtractSum<uint32_t>(_sqsum); |
299 | 0 | } |
300 | | |
        // FP16 variant of Encode32f6x1: converts 8 half-precision values to float,
        // then packs them into 6-bit codes (6 meaningful bytes in the low result).
        static SIMD_INLINE __m128i Encode16f6x1(const uint16_t* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            __m256i i0 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128((__m128i*)src)), scale, min, sum, sqsum);
            __m128i s0 = _mm_mullo_epi16(_mm256_castsi256_si128(PackU32ToI16(i0, _mm256_setzero_si256())), Sse41::E6_MULLO);
            return _mm_or_si128(_mm_shuffle_epi8(s0, Sse41::E6_SHFL0), _mm_shuffle_epi8(s0, Sse41::E6_SHFL1));
        }
307 | | |
        // FP16 variant of Encode32f6x2: converts two 8-value half-precision groups
        // to float and packs 16 values into 12 meaningful bytes of 6-bit codes.
        static SIMD_INLINE __m128i Encode16f6x2(const uint16_t* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            __m256i i0 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128((__m128i*)src + 0)), scale, min, sum, sqsum);
            __m256i i8 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128((__m128i*)src + 1)), scale, min, sum, sqsum);
            __m256i s0 = _mm256_mullo_epi16(PackU32ToI16(i0, i8), E6_MULLO);
            __m256i e0 = _mm256_or_si256(_mm256_shuffle_epi8(s0, E6_SHFL0), _mm256_shuffle_epi8(s0, E6_SHFL1));
            return _mm_or_si128(_mm256_castsi256_si128(e0), _mm256_extracti128_si256(e0, 1));
        }
316 | | |
317 | | static void Encode16f6(const uint16_t* src, float scale, float min, size_t size, int32_t& sum, int32_t& sqsum, uint8_t* dst) |
318 | 0 | { |
319 | 0 | assert(size % 8 == 0); |
320 | 0 | size_t i = 0, main = size - 8, main16 = AlignLo(main, 16); |
321 | 0 | __m256 _scale = _mm256_set1_ps(scale); |
322 | 0 | __m256 _min = _mm256_set1_ps(min); |
323 | 0 | __m256i _sum = _mm256_setzero_si256(); |
324 | 0 | __m256i _sqsum = _mm256_setzero_si256(); |
325 | 0 | for (; i < main16; i += 16, src += 16, dst += 12) |
326 | 0 | _mm_storeu_si128((__m128i*)dst, Encode16f6x2(src, _scale, _min, _sum, _sqsum)); |
327 | 0 | for (; i < main; i += 8, src += 8, dst += 6) |
328 | 0 | _mm_storel_epi64((__m128i*)dst, Encode16f6x1(src, _scale, _min, _sum, _sqsum)); |
329 | 0 | for (; i < size; i += 8, src += 8, dst += 6) |
330 | 0 | { |
331 | 0 | __m128i d0 = Encode16f6x1(src, _scale, _min, _sum, _sqsum); |
332 | 0 | *(uint32_t*)(dst + 0) = _mm_extract_epi32(d0, 0); |
333 | 0 | *(uint16_t*)(dst + 4) = _mm_extract_epi16(d0, 2); |
334 | 0 | } |
335 | 0 | sum = ExtractSum<uint32_t>(_sum); |
336 | 0 | sqsum = ExtractSum<uint32_t>(_sqsum); |
337 | 0 | } |
338 | | |
        // FP16 variant of Encode32f7x1: converts 8 half-precision values to float,
        // then packs them into 7-bit codes (7 meaningful bytes in the low result).
        static SIMD_INLINE __m128i Encode16f7x1(const uint16_t* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            __m256i i0 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128((__m128i*)src)), scale, min, sum, sqsum);
            __m128i s0 = _mm_mullo_epi16(_mm256_castsi256_si128(PackU32ToI16(i0, _mm256_setzero_si256())), Sse41::E7_MULLO);
            return _mm_or_si128(_mm_shuffle_epi8(s0, Sse41::E7_SHFL0), _mm_shuffle_epi8(s0, Sse41::E7_SHFL1));
        }
345 | | |
        // FP16 variant of Encode32f7x2: converts two 8-value half-precision groups
        // to float and packs 16 values into 14 meaningful bytes of 7-bit codes.
        static SIMD_INLINE __m128i Encode16f7x2(const uint16_t* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            __m256i i0 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128((__m128i*)src + 0)), scale, min, sum, sqsum);
            __m256i i8 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128((__m128i*)src + 1)), scale, min, sum, sqsum);
            __m256i s0 = _mm256_mullo_epi16(PackU32ToI16(i0, i8), E7_MULLO);
            __m256i e0 = _mm256_or_si256(_mm256_shuffle_epi8(s0, E7_SHFL0), _mm256_shuffle_epi8(s0, E7_SHFL1));
            return _mm_or_si128(_mm256_castsi256_si128(e0), _mm256_extracti128_si256(e0, 1));
        }
354 | | |
355 | | static void Encode16f7(const uint16_t* src, float scale, float min, size_t size, int32_t& sum, int32_t& sqsum, uint8_t* dst) |
356 | 0 | { |
357 | 0 | assert(size % 8 == 0); |
358 | 0 | size_t i = 0, main = size - 8, main16 = AlignLo(main, 16); |
359 | 0 | __m256 _scale = _mm256_set1_ps(scale); |
360 | 0 | __m256 _min = _mm256_set1_ps(min); |
361 | 0 | __m256i _sum = _mm256_setzero_si256(); |
362 | 0 | __m256i _sqsum = _mm256_setzero_si256(); |
363 | 0 | for (; i < main16; i += 16, src += 16, dst += 14) |
364 | 0 | _mm_storeu_si128((__m128i*)dst, Encode16f7x2(src, _scale, _min, _sum, _sqsum)); |
365 | 0 | for (; i < main; i += 8, src += 8, dst += 7) |
366 | 0 | _mm_storel_epi64((__m128i*)dst, Encode16f7x1(src, _scale, _min, _sum, _sqsum)); |
367 | 0 | for (; i < size; i += 8, src += 8, dst += 7) |
368 | 0 | { |
369 | 0 | __m128i d0 = Encode16f7x1(src, _scale, _min, _sum, _sqsum); |
370 | 0 | *(uint32_t*)(dst + 0) = _mm_extract_epi32(d0, 0); |
371 | 0 | *(uint16_t*)(dst + 4) = _mm_extract_epi16(d0, 2); |
372 | 0 | *(uint8_t*)(dst + 6) = _mm_extract_epi8(d0, 6); |
373 | 0 | } |
374 | 0 | sum = ExtractSum<uint32_t>(_sum); |
375 | 0 | sqsum = ExtractSum<uint32_t>(_sqsum); |
376 | 0 | } |
377 | | |
378 | | static void Encode16f8(const uint16_t* src, float scale, float min, size_t size, int32_t& sum, int32_t& sqsum, uint8_t* dst) |
379 | 0 | { |
380 | 0 | assert(size % 8 == 0); |
381 | 0 | size_t sizeA = AlignLo(size, A), i = 0; |
382 | 0 | __m256 _scale = _mm256_set1_ps(scale); |
383 | 0 | __m256 _min = _mm256_set1_ps(min); |
384 | 0 | __m256i _sum = _mm256_setzero_si256(); |
385 | 0 | __m256i _sqsum = _mm256_setzero_si256(); |
386 | 0 | for (; i < sizeA; i += A) |
387 | 0 | { |
388 | 0 | __m256i d0 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src + i) + 0)), _scale, _min, _sum, _sqsum); |
389 | 0 | __m256i d1 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src + i) + 1)), _scale, _min, _sum, _sqsum); |
390 | 0 | __m256i d2 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src + i) + 2)), _scale, _min, _sum, _sqsum); |
391 | 0 | __m256i d3 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src + i) + 3)), _scale, _min, _sum, _sqsum); |
392 | 0 | _mm256_storeu_si256((__m256i*)(dst + i), PackI16ToU8(PackI32ToI16(d0, d1), PackI32ToI16(d2, d3))); |
393 | 0 | } |
394 | 0 | for (; i < size; i += F) |
395 | 0 | { |
396 | 0 | __m256i d0 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src + i))), _scale, _min, _sum, _sqsum); |
397 | 0 | _mm_storel_epi64((__m128i*)(dst + i), _mm256_castsi256_si128(PackI16ToU8(PackI32ToI16(d0, _mm256_setzero_si256()), _mm256_setzero_si256()))); |
398 | 0 | } |
399 | 0 | sum = ExtractSum<uint32_t>(_sum); |
400 | 0 | sqsum = ExtractSum<uint32_t>(_sqsum); |
401 | 0 | } |
402 | | |
403 | | //------------------------------------------------------------------------------------------------- |
404 | | |
405 | | Base::DescrInt::Encode32fPtr GetEncode32f(size_t depth) |
406 | 0 | { |
407 | 0 | switch (depth) |
408 | 0 | { |
409 | 0 | case 4: return Encode32f4; |
410 | 0 | case 5: return Encode32f5; |
411 | 0 | case 6: return Encode32f6; |
412 | 0 | case 7: return Encode32f7; |
413 | 0 | case 8: return Encode32f8; |
414 | 0 | default: assert(0); return NULL; |
415 | 0 | } |
416 | 0 | } |
417 | | |
418 | | Base::DescrInt::Encode16fPtr GetEncode16f(size_t depth) |
419 | 0 | { |
420 | 0 | switch (depth) |
421 | 0 | { |
422 | 0 | case 4: return Encode16f4; |
423 | 0 | case 5: return Encode16f5; |
424 | 0 | case 6: return Encode16f6; |
425 | 0 | case 7: return Encode16f7; |
426 | 0 | case 8: return Encode16f8; |
427 | 0 | default: assert(0); return NULL; |
428 | 0 | } |
429 | 0 | } |
430 | | } |
431 | | #endif |
432 | | } |