Coverage Report

Created: 2026-04-09 07:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/Simd/src/Simd/SimdAvx2DescrIntEnc.cpp
Line
Count
Source
1
/*
2
* Simd Library (http://ermig1979.github.io/Simd).
3
*
4
* Copyright (c) 2011-2023 Yermalayeu Ihar.
5
*
6
* Permission is hereby granted, free of charge, to any person obtaining a copy
7
* of this software and associated documentation files (the "Software"), to deal
8
* in the Software without restriction, including without limitation the rights
9
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
* copies of the Software, and to permit persons to whom the Software is
11
* furnished to do so, subject to the following conditions:
12
*
13
* The above copyright notice and this permission notice shall be included in
14
* all copies or substantial portions of the Software.
15
*
16
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
* SOFTWARE.
23
*/
24
#include "Simd/SimdMemory.h"
25
#include "Simd/SimdStore.h"
26
#include "Simd/SimdExtract.h"
27
#include "Simd/SimdArray.h"
28
#include "Simd/SimdUnpack.h"
29
#include "Simd/SimdDescrInt.h"
30
#include "Simd/SimdDescrIntCommon.h"
31
#include "Simd/SimdCpu.h"
32
33
namespace Simd
34
{
35
#ifdef SIMD_AVX2_ENABLE    
36
    namespace Avx2
37
    {
38
        // Quantizes eight floats as (src - min) * scale, converted to int32 by _mm256_cvtps_epi32,
        // and folds the result into the running accumulators.
        //   sum   - receives the eight quantized values, added lane by lane.
        //   sqsum - receives their 16-bit multiply-add products, added lane by lane.
        // Returns the eight quantized values.
        SIMD_INLINE __m256i Encode32f(__m256 src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            const __m256 shifted = _mm256_sub_ps(src, min);
            const __m256 scaled = _mm256_mul_ps(shifted, scale);
            const __m256i quantized = _mm256_cvtps_epi32(scaled);
            sum = _mm256_add_epi32(quantized, sum);
            // madd multiplies signed 16-bit halves and adds the pair, so this is the true square
            // only while every quantized value fits in a non-negative signed 16-bit integer.
            const __m256i squares = _mm256_madd_epi16(quantized, quantized);
            sqsum = _mm256_add_epi32(squares, sqsum);
            return quantized;
        }
45
46
        // Pointer overload: reads eight floats from 'src' with an unaligned load and forwards
        // them to the register overload, updating 'sum' and 'sqsum' the same way.
        SIMD_INLINE __m256i Encode32f(const float* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            const __m256 loaded = _mm256_loadu_ps(src);
            return Encode32f(loaded, scale, min, sum, sqsum);
        }
50
51
        // Encodes 8 floats for the 4-bit format; the caller stores the low 4 bytes of the result.
        // NOTE(review): Sse41::E4_MULLO and PackU32ToI16 are defined elsewhere - presumably the
        // multiply followed by the 12-bit shift merges two 4-bit codes per 32-bit lane; confirm.
        static SIMD_INLINE __m128i Encode32f4x8(const float* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            const __m256i quantized = Encode32f(src, scale, min, sum, sqsum);
            const __m256i words = PackU32ToI16(quantized, _mm256_setzero_si256());
            const __m128i weighted = _mm_mullo_epi16(_mm256_castsi256_si128(words), Sse41::E4_MULLO);
            const __m128i merged = _mm_srli_epi32(weighted, 12);
            // Two unsigned-saturating narrowing steps (32 -> 16 -> 8 bits), padding with zeros.
            const __m128i narrowed = _mm_packus_epi32(merged, Sse41::K_ZERO);
            return _mm_packus_epi16(narrowed, Sse41::K_ZERO);
        }
57
58
        // Encodes 32 floats for the 4-bit format and returns the 16 packed bytes.
        // NOTE(review): E4_MULLO and PackU32ToI16 are defined elsewhere - presumably the multiply
        // followed by the 12-bit shift merges two 4-bit codes per 32-bit lane; confirm.
        static SIMD_INLINE __m128i Encode32f4x32(const float* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            // Quantize the four groups of eight in source order.
            const __m256i q0 = Encode32f(src + 0, scale, min, sum, sqsum);
            const __m256i q1 = Encode32f(src + 8, scale, min, sum, sqsum);
            const __m256i q2 = Encode32f(src + 16, scale, min, sum, sqsum);
            const __m256i q3 = Encode32f(src + 24, scale, min, sum, sqsum);

            const __m256i first = _mm256_srli_epi32(_mm256_mullo_epi16(PackU32ToI16(q0, q1), E4_MULLO), 12);
            const __m256i second = _mm256_srli_epi32(_mm256_mullo_epi16(PackU32ToI16(q2, q3), E4_MULLO), 12);

            // Narrow each 256-bit half-result to 16-bit, then both together to bytes.
            const __m128i first16 = _mm_packus_epi32(_mm256_castsi256_si128(first), _mm256_extracti128_si256(first, 1));
            const __m128i second16 = _mm_packus_epi32(_mm256_castsi256_si128(second), _mm256_extracti128_si256(second, 1));
            return _mm_packus_epi16(first16, second16);
        }
69
70
        static void Encode32f4(const float* src, float scale, float min, size_t size, int32_t& sum, int32_t& sqsum, uint8_t* dst)
71
0
        {
72
0
            assert(size % 8 == 0);
73
0
            size_t i = 0, size32 = AlignLo(size, 32);
74
0
            __m256 _scale = _mm256_set1_ps(scale);
75
0
            __m256 _min = _mm256_set1_ps(min);
76
0
            __m256i _sum = _mm256_setzero_si256();
77
0
            __m256i _sqsum = _mm256_setzero_si256();
78
0
            for (; i < size32; i += 32, src += 32, dst += 16)
79
0
                _mm_storeu_si128((__m128i*)dst, Encode32f4x32(src, _scale, _min, _sum, _sqsum));
80
0
            for (; i < size; i += 8, src += 8, dst += 4)
81
0
                *(uint32_t*)(dst) = _mm_extract_epi32(Encode32f4x8(src, _scale, _min, _sum, _sqsum), 0);
82
0
            sum = ExtractSum<uint32_t>(_sum);
83
0
            sqsum = ExtractSum<uint32_t>(_sqsum);
84
0
        }
85
86
        // Encodes 8 floats for the 5-bit format; callers use the low 5 bytes of the result.
        // NOTE(review): Sse41::E5_MULLO / E5_SHFL0..2 are defined elsewhere - presumably the multiply
        // moves each code to its bit offset and the shuffles gather the bytes; confirm.
        static SIMD_INLINE __m128i Encode32f5x1(const float* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            const __m256i quantized = Encode32f(src, scale, min, sum, sqsum);
            const __m128i words = _mm256_castsi256_si128(PackU32ToI16(quantized, _mm256_setzero_si256()));
            const __m128i shifted = _mm_mullo_epi16(words, Sse41::E5_MULLO);
            // Three byte-shuffles of the same register are OR-ed into the final layout.
            __m128i packed = _mm_shuffle_epi8(shifted, Sse41::E5_SHFL0);
            packed = _mm_or_si128(packed, _mm_shuffle_epi8(shifted, Sse41::E5_SHFL1));
            packed = _mm_or_si128(packed, _mm_shuffle_epi8(shifted, Sse41::E5_SHFL2));
            return packed;
        }
92
93
        // Encodes 16 floats for the 5-bit format; callers advance the output by 10 bytes per call.
        // NOTE(review): E5_MULLO / E5_SHFL0..2 are defined elsewhere - presumably each 128-bit lane is
        // shuffled to a disjoint byte range so the final OR of the two lanes concatenates them; confirm.
        static SIMD_INLINE __m128i Encode32f5x2(const float* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            const __m256i lower = Encode32f(src, scale, min, sum, sqsum);
            const __m256i upper = Encode32f(src + 8, scale, min, sum, sqsum);
            const __m256i shifted = _mm256_mullo_epi16(PackU32ToI16(lower, upper), E5_MULLO);
            __m256i lanes = _mm256_shuffle_epi8(shifted, E5_SHFL0);
            lanes = _mm256_or_si256(lanes, _mm256_shuffle_epi8(shifted, E5_SHFL1));
            lanes = _mm256_or_si256(lanes, _mm256_shuffle_epi8(shifted, E5_SHFL2));
            // Fold the high 128-bit lane onto the low one.
            return _mm_or_si128(_mm256_castsi256_si128(lanes), _mm256_extracti128_si256(lanes, 1));
        }
101
102
        static void Encode32f5(const float* src, float scale, float min, size_t size, int32_t& sum, int32_t& sqsum, uint8_t* dst)
103
0
        {
104
0
            assert(size % 8 == 0);
105
0
            size_t i = 0, main = size - 8, main16 = AlignLo(main, 16);
106
0
            __m256 _scale = _mm256_set1_ps(scale);
107
0
            __m256 _min = _mm256_set1_ps(min);
108
0
            __m256i _sum = _mm256_setzero_si256();
109
0
            __m256i _sqsum = _mm256_setzero_si256();
110
0
            for (; i < main16; i += 16, src += 16, dst += 10)
111
0
                _mm_storeu_si128((__m128i*)dst, Encode32f5x2(src, _scale, _min, _sum, _sqsum));
112
0
            for (; i < main; i += 8, src += 8, dst += 5)
113
0
                _mm_storel_epi64((__m128i*)dst, Encode32f5x1(src, _scale, _min, _sum, _sqsum));
114
0
            for (; i < size; i += 8, src += 8, dst += 5)
115
0
            {
116
0
                __m128i d0 = Encode32f5x1(src, _scale, _min, _sum, _sqsum);
117
0
                *(uint32_t*)(dst + 0) = _mm_extract_epi32(d0, 0);
118
0
                *(uint8_t*)(dst + 4) = _mm_extract_epi8(d0, 4);
119
0
            }
120
0
            sum = ExtractSum<uint32_t>(_sum);
121
0
            sqsum = ExtractSum<uint32_t>(_sqsum);
122
0
        }
123
124
        // Encodes 8 floats for the 6-bit format; callers use the low 6 bytes of the result.
        // NOTE(review): Sse41::E6_MULLO / E6_SHFL0..1 are defined elsewhere - presumably the multiply
        // moves each code to its bit offset and the shuffles gather the bytes; confirm.
        static SIMD_INLINE __m128i Encode32f6x1(const float* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            const __m256i quantized = Encode32f(src, scale, min, sum, sqsum);
            const __m128i words = _mm256_castsi256_si128(PackU32ToI16(quantized, _mm256_setzero_si256()));
            const __m128i shifted = _mm_mullo_epi16(words, Sse41::E6_MULLO);
            const __m128i partA = _mm_shuffle_epi8(shifted, Sse41::E6_SHFL0);
            const __m128i partB = _mm_shuffle_epi8(shifted, Sse41::E6_SHFL1);
            return _mm_or_si128(partA, partB);
        }
130
131
        // Encodes 16 floats for the 6-bit format; callers advance the output by 12 bytes per call.
        // NOTE(review): E6_MULLO / E6_SHFL0..1 are defined elsewhere - presumably each 128-bit lane is
        // shuffled to a disjoint byte range so the final OR of the two lanes concatenates them; confirm.
        static SIMD_INLINE __m128i Encode32f6x2(const float* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            const __m256i lower = Encode32f(src, scale, min, sum, sqsum);
            const __m256i upper = Encode32f(src + 8, scale, min, sum, sqsum);
            const __m256i shifted = _mm256_mullo_epi16(PackU32ToI16(lower, upper), E6_MULLO);
            const __m256i partA = _mm256_shuffle_epi8(shifted, E6_SHFL0);
            const __m256i partB = _mm256_shuffle_epi8(shifted, E6_SHFL1);
            const __m256i lanes = _mm256_or_si256(partA, partB);
            // Fold the high 128-bit lane onto the low one.
            return _mm_or_si128(_mm256_castsi256_si128(lanes), _mm256_extracti128_si256(lanes, 1));
        }
139
140
        static void Encode32f6(const float* src, float scale, float min, size_t size, int32_t& sum, int32_t& sqsum, uint8_t* dst)
141
0
        {
142
0
            assert(size % 8 == 0);
143
0
            size_t i = 0, main = size - 8, main16 = AlignLo(main, 16);
144
0
            __m256 _scale = _mm256_set1_ps(scale);
145
0
            __m256 _min = _mm256_set1_ps(min);
146
0
            __m256i _sum = _mm256_setzero_si256();
147
0
            __m256i _sqsum = _mm256_setzero_si256();
148
0
            for (; i < main16; i += 16, src += 16, dst += 12)
149
0
                _mm_storeu_si128((__m128i*)dst, Encode32f6x2(src, _scale, _min, _sum, _sqsum));
150
0
            for (; i < main; i += 8, src += 8, dst += 6)
151
0
                _mm_storel_epi64((__m128i*)dst, Encode32f6x1(src, _scale, _min, _sum, _sqsum));
152
0
            for (; i < size; i += 8, src += 8, dst += 6)
153
0
            {
154
0
                __m128i d0 = Encode32f6x1(src, _scale, _min, _sum, _sqsum);
155
0
                *(uint32_t*)(dst + 0) = _mm_extract_epi32(d0, 0);
156
0
                *(uint16_t*)(dst + 4) = _mm_extract_epi16(d0, 2);
157
0
            }
158
0
            sum = ExtractSum<uint32_t>(_sum);
159
0
            sqsum = ExtractSum<uint32_t>(_sqsum);
160
0
        }
161
162
        // Encodes 8 floats for the 7-bit format; callers use the low 7 bytes of the result.
        // NOTE(review): Sse41::E7_MULLO / E7_SHFL0..1 are defined elsewhere - presumably the multiply
        // moves each code to its bit offset and the shuffles gather the bytes; confirm.
        static SIMD_INLINE __m128i Encode32f7x1(const float* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            const __m256i quantized = Encode32f(src, scale, min, sum, sqsum);
            const __m128i words = _mm256_castsi256_si128(PackU32ToI16(quantized, _mm256_setzero_si256()));
            const __m128i shifted = _mm_mullo_epi16(words, Sse41::E7_MULLO);
            const __m128i partA = _mm_shuffle_epi8(shifted, Sse41::E7_SHFL0);
            const __m128i partB = _mm_shuffle_epi8(shifted, Sse41::E7_SHFL1);
            return _mm_or_si128(partA, partB);
        }
168
169
        // Encodes 16 floats for the 7-bit format; callers advance the output by 14 bytes per call.
        // NOTE(review): E7_MULLO / E7_SHFL0..1 are defined elsewhere - presumably each 128-bit lane is
        // shuffled to a disjoint byte range so the final OR of the two lanes concatenates them; confirm.
        static SIMD_INLINE __m128i Encode32f7x2(const float* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            const __m256i lower = Encode32f(src, scale, min, sum, sqsum);
            const __m256i upper = Encode32f(src + 8, scale, min, sum, sqsum);
            const __m256i shifted = _mm256_mullo_epi16(PackU32ToI16(lower, upper), E7_MULLO);
            const __m256i partA = _mm256_shuffle_epi8(shifted, E7_SHFL0);
            const __m256i partB = _mm256_shuffle_epi8(shifted, E7_SHFL1);
            const __m256i lanes = _mm256_or_si256(partA, partB);
            // Fold the high 128-bit lane onto the low one.
            return _mm_or_si128(_mm256_castsi256_si128(lanes), _mm256_extracti128_si256(lanes, 1));
        }
177
178
        static void Encode32f7(const float* src, float scale, float min, size_t size, int32_t& sum, int32_t& sqsum, uint8_t* dst)
179
0
        {
180
0
            assert(size % 8 == 0);
181
0
            size_t i = 0, main = size - 8, main16 = AlignLo(main, 16);
182
0
            __m256 _scale = _mm256_set1_ps(scale);
183
0
            __m256 _min = _mm256_set1_ps(min);
184
0
            __m256i _sum = _mm256_setzero_si256();
185
0
            __m256i _sqsum = _mm256_setzero_si256();
186
0
            for (; i < main16; i += 16, src += 16, dst += 14)
187
0
                _mm_storeu_si128((__m128i*)dst, Encode32f7x2(src, _scale, _min, _sum, _sqsum));
188
0
            for (; i < main; i += 8, src += 8, dst += 7)
189
0
                _mm_storel_epi64((__m128i*)dst, Encode32f7x1(src, _scale, _min, _sum, _sqsum));
190
0
            for (; i < size; i += 8, src += 8, dst += 7)
191
0
            {
192
0
                __m128i d0 = Encode32f7x1(src, _scale, _min, _sum, _sqsum);
193
0
                *(uint32_t*)(dst + 0) = _mm_extract_epi32(d0, 0);
194
0
                *(uint16_t*)(dst + 4) = _mm_extract_epi16(d0, 2);
195
0
                *(uint8_t*)(dst + 6) = _mm_extract_epi8(d0, 6);
196
0
            }
197
0
            sum = ExtractSum<uint32_t>(_sum);
198
0
            sqsum = ExtractSum<uint32_t>(_sqsum);
199
0
        }
200
201
        static void Encode32f8(const float* src, float scale, float min, size_t size, int32_t& sum, int32_t& sqsum, uint8_t* dst)
202
0
        {
203
0
            assert(size % 8 == 0);
204
0
            size_t sizeA = AlignLo(size, A), i = 0;
205
0
            __m256 _scale = _mm256_set1_ps(scale);
206
0
            __m256 _min = _mm256_set1_ps(min);
207
0
            __m256i _sum = _mm256_setzero_si256();
208
0
            __m256i _sqsum = _mm256_setzero_si256();
209
0
            for (; i < sizeA; i += A)
210
0
            {
211
0
                __m256i d0 = Encode32f(src + i + 0 * F, _scale, _min, _sum, _sqsum);
212
0
                __m256i d1 = Encode32f(src + i + 1 * F, _scale, _min, _sum, _sqsum);
213
0
                __m256i d2 = Encode32f(src + i + 2 * F, _scale, _min, _sum, _sqsum);
214
0
                __m256i d3 = Encode32f(src + i + 3 * F, _scale, _min, _sum, _sqsum);
215
0
                _mm256_storeu_si256((__m256i*)(dst + i), PackI16ToU8(PackI32ToI16(d0, d1), PackI32ToI16(d2, d3)));
216
0
            }
217
0
            for (; i < size; i += F)
218
0
            {
219
0
                __m256i d0 = Encode32f(src + i, _scale, _min, _sum, _sqsum);
220
0
                _mm_storel_epi64((__m128i*)(dst + i), _mm256_castsi256_si128(PackI16ToU8(PackI32ToI16(d0, _mm256_setzero_si256()), _mm256_setzero_si256())));
221
0
            }
222
0
            sum = ExtractSum<uint32_t>(_sum);
223
0
            sqsum = ExtractSum<uint32_t>(_sqsum);
224
0
        }
225
226
        //-------------------------------------------------------------------------------------------------
227
228
        // Half-precision variant of the 4-bit 8-value encoder: loads 8 fp16 values, widens them
        // to fp32 and encodes them; the caller stores the low 4 bytes of the result.
        // NOTE(review): Sse41::E4_MULLO and PackU32ToI16 are defined elsewhere - presumably the
        // multiply followed by the 12-bit shift merges two 4-bit codes per 32-bit lane; confirm.
        static SIMD_INLINE __m128i Encode16f4x8(const uint16_t* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            const __m256 widened = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)src));
            const __m256i quantized = Encode32f(widened, scale, min, sum, sqsum);
            const __m256i words = PackU32ToI16(quantized, _mm256_setzero_si256());
            const __m128i weighted = _mm_mullo_epi16(_mm256_castsi256_si128(words), Sse41::E4_MULLO);
            const __m128i merged = _mm_srli_epi32(weighted, 12);
            // Two unsigned-saturating narrowing steps (32 -> 16 -> 8 bits), padding with zeros.
            const __m128i narrowed = _mm_packus_epi32(merged, Sse41::K_ZERO);
            return _mm_packus_epi16(narrowed, Sse41::K_ZERO);
        }
234
235
        // Half-precision variant of the 4-bit 32-value encoder: four 8-value fp16 loads are widened
        // to fp32, encoded, and packed into 16 output bytes.
        // NOTE(review): E4_MULLO and PackU32ToI16 are defined elsewhere - presumably the multiply
        // followed by the 12-bit shift merges two 4-bit codes per 32-bit lane; confirm.
        static SIMD_INLINE __m128i Encode16f4x32(const uint16_t* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            // Each __m128i step covers 8 fp16 values.
            const __m128i* blocks = (__m128i*)src;
            const __m256i q0 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128(blocks + 0)), scale, min, sum, sqsum);
            const __m256i q1 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128(blocks + 1)), scale, min, sum, sqsum);
            const __m256i q2 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128(blocks + 2)), scale, min, sum, sqsum);
            const __m256i q3 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128(blocks + 3)), scale, min, sum, sqsum);

            const __m256i first = _mm256_srli_epi32(_mm256_mullo_epi16(PackU32ToI16(q0, q1), E4_MULLO), 12);
            const __m256i second = _mm256_srli_epi32(_mm256_mullo_epi16(PackU32ToI16(q2, q3), E4_MULLO), 12);

            // Narrow each 256-bit half-result to 16-bit, then both together to bytes.
            const __m128i first16 = _mm_packus_epi32(_mm256_castsi256_si128(first), _mm256_extracti128_si256(first, 1));
            const __m128i second16 = _mm_packus_epi32(_mm256_castsi256_si128(second), _mm256_extracti128_si256(second, 1));
            return _mm_packus_epi16(first16, second16);
        }
246
247
        static void Encode16f4(const uint16_t* src, float scale, float min, size_t size, int32_t& sum, int32_t& sqsum, uint8_t* dst)
248
0
        {
249
0
            assert(size % 8 == 0);
250
0
            size_t i = 0, size32 = AlignLo(size, 32);
251
0
            __m256 _scale = _mm256_set1_ps(scale);
252
0
            __m256 _min = _mm256_set1_ps(min);
253
0
            __m256i _sum = _mm256_setzero_si256();
254
0
            __m256i _sqsum = _mm256_setzero_si256();
255
0
            for (; i < size32; i += 32, src += 32, dst += 16)
256
0
                _mm_storeu_si128((__m128i*)dst, Encode16f4x32(src, _scale, _min, _sum, _sqsum));
257
0
            for (; i < size; i += 8, src += 8, dst += 4)
258
0
                *(uint32_t*)(dst) = _mm_extract_epi32(Encode16f4x8(src, _scale, _min, _sum, _sqsum), 0);
259
0
            sum = ExtractSum<uint32_t>(_sum);
260
0
            sqsum = ExtractSum<uint32_t>(_sqsum);
261
0
        }
262
263
        // Half-precision variant of the 5-bit 8-value encoder: loads 8 fp16 values, widens them to
        // fp32 and encodes them; callers use the low 5 bytes of the result.
        // NOTE(review): Sse41::E5_MULLO / E5_SHFL0..2 are defined elsewhere - presumably the multiply
        // moves each code to its bit offset and the shuffles gather the bytes; confirm.
        static SIMD_INLINE __m128i Encode16f5x1(const uint16_t* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            const __m256 widened = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)src));
            const __m256i quantized = Encode32f(widened, scale, min, sum, sqsum);
            const __m128i words = _mm256_castsi256_si128(PackU32ToI16(quantized, _mm256_setzero_si256()));
            const __m128i shifted = _mm_mullo_epi16(words, Sse41::E5_MULLO);
            // Three byte-shuffles of the same register are OR-ed into the final layout.
            __m128i packed = _mm_shuffle_epi8(shifted, Sse41::E5_SHFL0);
            packed = _mm_or_si128(packed, _mm_shuffle_epi8(shifted, Sse41::E5_SHFL1));
            packed = _mm_or_si128(packed, _mm_shuffle_epi8(shifted, Sse41::E5_SHFL2));
            return packed;
        }
269
270
        // Half-precision variant of the 5-bit 16-value encoder; callers advance the output by
        // 10 bytes per call.
        // NOTE(review): E5_MULLO / E5_SHFL0..2 are defined elsewhere - presumably each 128-bit lane is
        // shuffled to a disjoint byte range so the final OR of the two lanes concatenates them; confirm.
        static SIMD_INLINE __m128i Encode16f5x2(const uint16_t* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            // Each __m128i step covers 8 fp16 values.
            const __m128i* blocks = (__m128i*)src;
            const __m256i lower = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128(blocks + 0)), scale, min, sum, sqsum);
            const __m256i upper = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128(blocks + 1)), scale, min, sum, sqsum);
            const __m256i shifted = _mm256_mullo_epi16(PackU32ToI16(lower, upper), E5_MULLO);
            __m256i lanes = _mm256_shuffle_epi8(shifted, E5_SHFL0);
            lanes = _mm256_or_si256(lanes, _mm256_shuffle_epi8(shifted, E5_SHFL1));
            lanes = _mm256_or_si256(lanes, _mm256_shuffle_epi8(shifted, E5_SHFL2));
            // Fold the high 128-bit lane onto the low one.
            return _mm_or_si128(_mm256_castsi256_si128(lanes), _mm256_extracti128_si256(lanes, 1));
        }
278
279
        static void Encode16f5(const uint16_t* src, float scale, float min, size_t size, int32_t& sum, int32_t& sqsum, uint8_t* dst)
280
0
        {
281
0
            assert(size % 8 == 0);
282
0
            size_t i = 0, main = size - 8, main16 = AlignLo(main, 16);
283
0
            __m256 _scale = _mm256_set1_ps(scale);
284
0
            __m256 _min = _mm256_set1_ps(min);
285
0
            __m256i _sum = _mm256_setzero_si256();
286
0
            __m256i _sqsum = _mm256_setzero_si256();
287
0
            for (; i < main16; i += 16, src += 16, dst += 10)
288
0
                _mm_storeu_si128((__m128i*)dst, Encode16f5x2(src, _scale, _min, _sum, _sqsum));
289
0
            for (; i < main; i += 8, src += 8, dst += 5)
290
0
                _mm_storel_epi64((__m128i*)dst, Encode16f5x1(src, _scale, _min, _sum, _sqsum));
291
0
            for (; i < size; i += 8, src += 8, dst += 5)
292
0
            {
293
0
                __m128i d0 = Encode16f5x1(src, _scale, _min, _sum, _sqsum);
294
0
                *(uint32_t*)(dst + 0) = _mm_extract_epi32(d0, 0);
295
0
                *(uint8_t*)(dst + 4) = _mm_extract_epi8(d0, 4);
296
0
            }
297
0
            sum = ExtractSum<uint32_t>(_sum);
298
0
            sqsum = ExtractSum<uint32_t>(_sqsum);
299
0
        }
300
301
        // Half-precision variant of the 6-bit 8-value encoder: loads 8 fp16 values, widens them to
        // fp32 and encodes them; callers use the low 6 bytes of the result.
        // NOTE(review): Sse41::E6_MULLO / E6_SHFL0..1 are defined elsewhere - presumably the multiply
        // moves each code to its bit offset and the shuffles gather the bytes; confirm.
        static SIMD_INLINE __m128i Encode16f6x1(const uint16_t* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            const __m256 widened = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)src));
            const __m256i quantized = Encode32f(widened, scale, min, sum, sqsum);
            const __m128i words = _mm256_castsi256_si128(PackU32ToI16(quantized, _mm256_setzero_si256()));
            const __m128i shifted = _mm_mullo_epi16(words, Sse41::E6_MULLO);
            const __m128i partA = _mm_shuffle_epi8(shifted, Sse41::E6_SHFL0);
            const __m128i partB = _mm_shuffle_epi8(shifted, Sse41::E6_SHFL1);
            return _mm_or_si128(partA, partB);
        }
307
308
        // Half-precision variant of the 6-bit 16-value encoder; callers advance the output by
        // 12 bytes per call.
        // NOTE(review): E6_MULLO / E6_SHFL0..1 are defined elsewhere - presumably each 128-bit lane is
        // shuffled to a disjoint byte range so the final OR of the two lanes concatenates them; confirm.
        static SIMD_INLINE __m128i Encode16f6x2(const uint16_t* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            // Each __m128i step covers 8 fp16 values.
            const __m128i* blocks = (__m128i*)src;
            const __m256i lower = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128(blocks + 0)), scale, min, sum, sqsum);
            const __m256i upper = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128(blocks + 1)), scale, min, sum, sqsum);
            const __m256i shifted = _mm256_mullo_epi16(PackU32ToI16(lower, upper), E6_MULLO);
            const __m256i partA = _mm256_shuffle_epi8(shifted, E6_SHFL0);
            const __m256i partB = _mm256_shuffle_epi8(shifted, E6_SHFL1);
            const __m256i lanes = _mm256_or_si256(partA, partB);
            // Fold the high 128-bit lane onto the low one.
            return _mm_or_si128(_mm256_castsi256_si128(lanes), _mm256_extracti128_si256(lanes, 1));
        }
316
317
        static void Encode16f6(const uint16_t* src, float scale, float min, size_t size, int32_t& sum, int32_t& sqsum, uint8_t* dst)
318
0
        {
319
0
            assert(size % 8 == 0);
320
0
            size_t i = 0, main = size - 8, main16 = AlignLo(main, 16);
321
0
            __m256 _scale = _mm256_set1_ps(scale);
322
0
            __m256 _min = _mm256_set1_ps(min);
323
0
            __m256i _sum = _mm256_setzero_si256();
324
0
            __m256i _sqsum = _mm256_setzero_si256();
325
0
            for (; i < main16; i += 16, src += 16, dst += 12)
326
0
                _mm_storeu_si128((__m128i*)dst, Encode16f6x2(src, _scale, _min, _sum, _sqsum));
327
0
            for (; i < main; i += 8, src += 8, dst += 6)
328
0
                _mm_storel_epi64((__m128i*)dst, Encode16f6x1(src, _scale, _min, _sum, _sqsum));
329
0
            for (; i < size; i += 8, src += 8, dst += 6)
330
0
            {
331
0
                __m128i d0 = Encode16f6x1(src, _scale, _min, _sum, _sqsum);
332
0
                *(uint32_t*)(dst + 0) = _mm_extract_epi32(d0, 0);
333
0
                *(uint16_t*)(dst + 4) = _mm_extract_epi16(d0, 2);
334
0
            }
335
0
            sum = ExtractSum<uint32_t>(_sum);
336
0
            sqsum = ExtractSum<uint32_t>(_sqsum);
337
0
        }
338
339
        // Half-precision variant of the 7-bit 8-value encoder: loads 8 fp16 values, widens them to
        // fp32 and encodes them; callers use the low 7 bytes of the result.
        // NOTE(review): Sse41::E7_MULLO / E7_SHFL0..1 are defined elsewhere - presumably the multiply
        // moves each code to its bit offset and the shuffles gather the bytes; confirm.
        static SIMD_INLINE __m128i Encode16f7x1(const uint16_t* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            const __m256 widened = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)src));
            const __m256i quantized = Encode32f(widened, scale, min, sum, sqsum);
            const __m128i words = _mm256_castsi256_si128(PackU32ToI16(quantized, _mm256_setzero_si256()));
            const __m128i shifted = _mm_mullo_epi16(words, Sse41::E7_MULLO);
            const __m128i partA = _mm_shuffle_epi8(shifted, Sse41::E7_SHFL0);
            const __m128i partB = _mm_shuffle_epi8(shifted, Sse41::E7_SHFL1);
            return _mm_or_si128(partA, partB);
        }
345
346
        // Half-precision variant of the 7-bit 16-value encoder; callers advance the output by
        // 14 bytes per call.
        // NOTE(review): E7_MULLO / E7_SHFL0..1 are defined elsewhere - presumably each 128-bit lane is
        // shuffled to a disjoint byte range so the final OR of the two lanes concatenates them; confirm.
        static SIMD_INLINE __m128i Encode16f7x2(const uint16_t* src, __m256 scale, __m256 min, __m256i& sum, __m256i& sqsum)
        {
            // Each __m128i step covers 8 fp16 values.
            const __m128i* blocks = (__m128i*)src;
            const __m256i lower = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128(blocks + 0)), scale, min, sum, sqsum);
            const __m256i upper = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128(blocks + 1)), scale, min, sum, sqsum);
            const __m256i shifted = _mm256_mullo_epi16(PackU32ToI16(lower, upper), E7_MULLO);
            const __m256i partA = _mm256_shuffle_epi8(shifted, E7_SHFL0);
            const __m256i partB = _mm256_shuffle_epi8(shifted, E7_SHFL1);
            const __m256i lanes = _mm256_or_si256(partA, partB);
            // Fold the high 128-bit lane onto the low one.
            return _mm_or_si128(_mm256_castsi256_si128(lanes), _mm256_extracti128_si256(lanes, 1));
        }
354
355
        static void Encode16f7(const uint16_t* src, float scale, float min, size_t size, int32_t& sum, int32_t& sqsum, uint8_t* dst)
356
0
        {
357
0
            assert(size % 8 == 0);
358
0
            size_t i = 0, main = size - 8, main16 = AlignLo(main, 16);
359
0
            __m256 _scale = _mm256_set1_ps(scale);
360
0
            __m256 _min = _mm256_set1_ps(min);
361
0
            __m256i _sum = _mm256_setzero_si256();
362
0
            __m256i _sqsum = _mm256_setzero_si256();
363
0
            for (; i < main16; i += 16, src += 16, dst += 14)
364
0
                _mm_storeu_si128((__m128i*)dst, Encode16f7x2(src, _scale, _min, _sum, _sqsum));
365
0
            for (; i < main; i += 8, src += 8, dst += 7)
366
0
                _mm_storel_epi64((__m128i*)dst, Encode16f7x1(src, _scale, _min, _sum, _sqsum));
367
0
            for (; i < size; i += 8, src += 8, dst += 7)
368
0
            {
369
0
                __m128i d0 = Encode16f7x1(src, _scale, _min, _sum, _sqsum);
370
0
                *(uint32_t*)(dst + 0) = _mm_extract_epi32(d0, 0);
371
0
                *(uint16_t*)(dst + 4) = _mm_extract_epi16(d0, 2);
372
0
                *(uint8_t*)(dst + 6) = _mm_extract_epi8(d0, 6);
373
0
            }
374
0
            sum = ExtractSum<uint32_t>(_sum);
375
0
            sqsum = ExtractSum<uint32_t>(_sqsum);
376
0
        }
377
378
        static void Encode16f8(const uint16_t* src, float scale, float min, size_t size, int32_t& sum, int32_t& sqsum, uint8_t* dst)
379
0
        {
380
0
            assert(size % 8 == 0);
381
0
            size_t sizeA = AlignLo(size, A), i = 0;
382
0
            __m256 _scale = _mm256_set1_ps(scale);
383
0
            __m256 _min = _mm256_set1_ps(min);
384
0
            __m256i _sum = _mm256_setzero_si256();
385
0
            __m256i _sqsum = _mm256_setzero_si256();
386
0
            for (; i < sizeA; i += A)
387
0
            {
388
0
                __m256i d0 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src + i) + 0)), _scale, _min, _sum, _sqsum);
389
0
                __m256i d1 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src + i) + 1)), _scale, _min, _sum, _sqsum);
390
0
                __m256i d2 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src + i) + 2)), _scale, _min, _sum, _sqsum);
391
0
                __m256i d3 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src + i) + 3)), _scale, _min, _sum, _sqsum);
392
0
                _mm256_storeu_si256((__m256i*)(dst + i), PackI16ToU8(PackI32ToI16(d0, d1), PackI32ToI16(d2, d3)));
393
0
            }
394
0
            for (; i < size; i += F)
395
0
            {
396
0
                __m256i d0 = Encode32f(_mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src + i))), _scale, _min, _sum, _sqsum);
397
0
                _mm_storel_epi64((__m128i*)(dst + i), _mm256_castsi256_si128(PackI16ToU8(PackI32ToI16(d0, _mm256_setzero_si256()), _mm256_setzero_si256())));
398
0
            }
399
0
            sum = ExtractSum<uint32_t>(_sum);
400
0
            sqsum = ExtractSum<uint32_t>(_sqsum);
401
0
        }
402
403
        //-------------------------------------------------------------------------------------------------
404
405
        // Selects the AVX2 fp32 encoder for the requested bit depth.
        // Depths 4 through 8 are supported; any other value trips the assert and yields NULL.
        Base::DescrInt::Encode32fPtr GetEncode32f(size_t depth)
        {
            if (depth == 4)
                return Encode32f4;
            if (depth == 5)
                return Encode32f5;
            if (depth == 6)
                return Encode32f6;
            if (depth == 7)
                return Encode32f7;
            if (depth == 8)
                return Encode32f8;
            assert(0);
            return NULL;
        }
417
418
        // Selects the AVX2 fp16 encoder for the requested bit depth.
        // Depths 4 through 8 are supported; any other value trips the assert and yields NULL.
        Base::DescrInt::Encode16fPtr GetEncode16f(size_t depth)
        {
            if (depth == 4)
                return Encode16f4;
            if (depth == 5)
                return Encode16f5;
            if (depth == 6)
                return Encode16f6;
            if (depth == 7)
                return Encode16f7;
            if (depth == 8)
                return Encode16f8;
            assert(0);
            return NULL;
        }
430
    }
431
#endif
432
}