Coverage Report

Created: 2026-02-14 07:40

/src/Simd/src/Simd/SimdAvx2DescrIntDec.cpp
Every executable line in this file has an execution count of 0: none of the AVX2 DescrInt decode kernels below were exercised. Source:

/*
* Simd Library (http://ermig1979.github.io/Simd).
*
* Copyright (c) 2011-2023 Yermalayeu Ihar.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "Simd/SimdMemory.h"
#include "Simd/SimdStore.h"
#include "Simd/SimdExtract.h"
#include "Simd/SimdArray.h"
#include "Simd/SimdUnpack.h"
#include "Simd/SimdDescrInt.h"
#include "Simd/SimdDescrIntCommon.h"
#include "Simd/SimdCpu.h"

namespace Simd
{
#ifdef SIMD_AVX2_ENABLE
    namespace Avx2
    {
        static void Decode32f4(const uint8_t* src, float scale, float shift, size_t size, float* dst)
        {
            assert(size % 8 == 0);
            __m256 _scale = _mm256_set1_ps(scale);
            __m256 _shift = _mm256_set1_ps(shift);
            size_t i = 0, size16 = AlignLo(size - 1, 16);
            for (; i < size16; i += 16)
            {
                __m256i s4 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)src));
                __m256i s16 = _mm256_srli_epi16(_mm256_mullo_epi16(_mm256_shuffle_epi8(s4, C4_SHFL), C4_MULLO), 12);
                _mm256_storeu_ps(dst + 0, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 0))), _scale, _shift));
                _mm256_storeu_ps(dst + 8, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 1))), _scale, _shift));
                src += 8;
                dst += 16;
            }
            for (; i < size; i += 8)
            {
                __m128i s16 = _mm_srli_epi16(_mm_mullo_epi16(_mm_shuffle_epi8(Sse41::LoadLast8<4>(src), Sse41::C4_SHFL0), Sse41::C4_MULLO), 12);
                _mm256_storeu_ps(dst + 0, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(s16)), _scale, _shift));
                src += 4;
                dst += 8;
            }
        }

        static void Decode32f5(const uint8_t* src, float scale, float shift, size_t size, float* dst)
        {
            assert(size % 8 == 0);
            __m256 _scale = _mm256_set1_ps(scale);
            __m256 _shift = _mm256_set1_ps(shift);
            size_t i = 0, size16 = AlignLo(size - 1, 16);
            for (; i < size16; i += 16)
            {
                __m256i s5 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)src));
                __m256i s16 = _mm256_srli_epi16(_mm256_mullo_epi16(_mm256_shuffle_epi8(s5, C5_SHFL), C5_MULLO), 11);
                _mm256_storeu_ps(dst + 0, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 0))), _scale, _shift));
                _mm256_storeu_ps(dst + 8, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 1))), _scale, _shift));
                src += 10;
                dst += 16;
            }
            for (; i < size; i += 8)
            {
                __m128i s16 = _mm_srli_epi16(_mm_mullo_epi16(_mm_shuffle_epi8(Sse41::LoadLast8<5>(src), Sse41::C5_SHFL0), Sse41::C5_MULLO), 11);
                _mm256_storeu_ps(dst + 0, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(s16)), _scale, _shift));
                src += 5;
                dst += 8;
            }
        }

        static void Decode32f6(const uint8_t* src, float scale, float shift, size_t size, float* dst)
        {
            assert(size % 8 == 0);
            __m256 _scale = _mm256_set1_ps(scale);
            __m256 _shift = _mm256_set1_ps(shift);
            size_t i = 0, size16 = AlignLo(size - 1, 16);
            for (; i < size16; i += 16)
            {
                __m256i s6 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)src));
                __m256i s16 = _mm256_srli_epi16(_mm256_mullo_epi16(_mm256_shuffle_epi8(s6, C6_SHFL), C6_MULLO), 10);
                _mm256_storeu_ps(dst + 0, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 0))), _scale, _shift));
                _mm256_storeu_ps(dst + 8, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 1))), _scale, _shift));
                src += 12;
                dst += 16;
            }
            for (; i < size; i += 8)
            {
                __m128i s16 = _mm_srli_epi16(_mm_mullo_epi16(_mm_shuffle_epi8(Sse41::LoadLast8<6>(src), Sse41::C6_SHFL0), Sse41::C6_MULLO), 10);
                _mm256_storeu_ps(dst + 0, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(s16)), _scale, _shift));
                src += 6;
                dst += 8;
            }
        }

        static void Decode32f7(const uint8_t* src, float scale, float shift, size_t size, float* dst)
        {
            assert(size % 8 == 0);
            __m256 _scale = _mm256_set1_ps(scale);
            __m256 _shift = _mm256_set1_ps(shift);
            size_t i = 0, size16 = AlignLo(size - 1, 16);
            for (; i < size16; i += 16)
            {
                __m256i s6 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)src));
                __m256i s16 = _mm256_srli_epi16(_mm256_mullo_epi16(_mm256_shuffle_epi8(s6, C7_SHFL), C7_MULLO), 9);
                _mm256_storeu_ps(dst + 0, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 0))), _scale, _shift));
                _mm256_storeu_ps(dst + 8, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 1))), _scale, _shift));
                src += 14;
                dst += 16;
            }
            for (; i < size; i += 8)
            {
                __m128i s16 = _mm_srli_epi16(_mm_mullo_epi16(_mm_shuffle_epi8(Sse41::LoadLast8<7>(src), Sse41::C7_SHFL0), Sse41::C7_MULLO), 9);
                _mm256_storeu_ps(dst + 0, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(s16)), _scale, _shift));
                src += 7;
                dst += 8;
            }
        }

        static void Decode32f8(const uint8_t* src, float scale, float shift, size_t size, float* dst)
        {
            assert(size % 8 == 0);
            __m256 _scale = _mm256_set1_ps(scale);
            __m256 _shift = _mm256_set1_ps(shift);
            size_t i = 0, size16 = AlignLo(size, 16);
            for (; i < size16; i += 16)
            {
                __m128i u8 = _mm_loadu_si128((__m128i*)(src + i));
                _mm256_storeu_ps(dst + i + 0, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(u8)), _scale, _shift));
                _mm256_storeu_ps(dst + i + F, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_srli_si128(u8, 8))), _scale, _shift));
            }
            for (; i < size; i += 8)
            {
                __m256 _src = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)(src + i))));
                _mm256_storeu_ps(dst + i, _mm256_fmadd_ps(_src, _scale, _shift));
            }
        }

        //-------------------------------------------------------------------------------------------------

        static void Decode16f4(const uint8_t* src, float scale, float shift, size_t size, uint16_t* dst)
        {
            assert(size % 8 == 0);
            __m256 _scale = _mm256_set1_ps(scale);
            __m256 _shift = _mm256_set1_ps(shift);
            size_t i = 0, size16 = AlignLo(size - 1, 16);
            for (; i < size16; i += 16)
            {
                __m256i s4 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)src));
                __m256i s16 = _mm256_srli_epi16(_mm256_mullo_epi16(_mm256_shuffle_epi8(s4, C4_SHFL), C4_MULLO), 12);
                _mm_storeu_si128((__m128i*)dst + 0, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 0))), _scale, _shift), 0));
                _mm_storeu_si128((__m128i*)dst + 1, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 1))), _scale, _shift), 0));
                src += 8;
                dst += 16;
            }
            for (; i < size; i += 8)
            {
                __m128i s16 = _mm_srli_epi16(_mm_mullo_epi16(_mm_shuffle_epi8(Sse41::LoadLast8<4>(src), Sse41::C4_SHFL0), Sse41::C4_MULLO), 12);
                _mm_storeu_si128((__m128i*)dst, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(s16)), _scale, _shift), 0));
                src += 4;
                dst += 8;
            }
        }

        static void Decode16f5(const uint8_t* src, float scale, float shift, size_t size, uint16_t* dst)
        {
            assert(size % 8 == 0);
            __m256 _scale = _mm256_set1_ps(scale);
            __m256 _shift = _mm256_set1_ps(shift);
            size_t i = 0, size16 = AlignLo(size - 1, 16);
            for (; i < size16; i += 16)
            {
                __m256i s5 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)src));
                __m256i s16 = _mm256_srli_epi16(_mm256_mullo_epi16(_mm256_shuffle_epi8(s5, C5_SHFL), C5_MULLO), 11);
                _mm_storeu_si128((__m128i*)dst + 0, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 0))), _scale, _shift), 0));
                _mm_storeu_si128((__m128i*)dst + 1, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 1))), _scale, _shift), 0));
                src += 10;
                dst += 16;
            }
            for (; i < size; i += 8)
            {
                __m128i s16 = _mm_srli_epi16(_mm_mullo_epi16(_mm_shuffle_epi8(Sse41::LoadLast8<5>(src), Sse41::C5_SHFL0), Sse41::C5_MULLO), 11);
                _mm_storeu_si128((__m128i*)dst, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(s16)), _scale, _shift), 0));
                src += 5;
                dst += 8;
            }
        }

        static void Decode16f6(const uint8_t* src, float scale, float shift, size_t size, uint16_t* dst)
        {
            assert(size % 8 == 0);
            __m256 _scale = _mm256_set1_ps(scale);
            __m256 _shift = _mm256_set1_ps(shift);
            size_t i = 0, size16 = AlignLo(size - 1, 16);
            for (; i < size16; i += 16)
            {
                __m256i s6 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)src));
                __m256i s16 = _mm256_srli_epi16(_mm256_mullo_epi16(_mm256_shuffle_epi8(s6, C6_SHFL), C6_MULLO), 10);
                _mm_storeu_si128((__m128i*)dst + 0, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 0))), _scale, _shift), 0));
                _mm_storeu_si128((__m128i*)dst + 1, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 1))), _scale, _shift), 0));
                src += 12;
                dst += 16;
            }
            for (; i < size; i += 8)
            {
                __m128i s16 = _mm_srli_epi16(_mm_mullo_epi16(_mm_shuffle_epi8(Sse41::LoadLast8<6>(src), Sse41::C6_SHFL0), Sse41::C6_MULLO), 10);
                _mm_storeu_si128((__m128i*)dst, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(s16)), _scale, _shift), 0));
                src += 6;
                dst += 8;
            }
        }

        static void Decode16f7(const uint8_t* src, float scale, float shift, size_t size, uint16_t* dst)
        {
            assert(size % 8 == 0);
            __m256 _scale = _mm256_set1_ps(scale);
            __m256 _shift = _mm256_set1_ps(shift);
            size_t i = 0, size16 = AlignLo(size - 1, 16);
            for (; i < size16; i += 16)
            {
                __m256i s6 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)src));
                __m256i s16 = _mm256_srli_epi16(_mm256_mullo_epi16(_mm256_shuffle_epi8(s6, C7_SHFL), C7_MULLO), 9);
                _mm_storeu_si128((__m128i*)dst + 0, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 0))), _scale, _shift), 0));
                _mm_storeu_si128((__m128i*)dst + 1, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 1))), _scale, _shift), 0));
                src += 14;
                dst += 16;
            }
            for (; i < size; i += 8)
            {
                __m128i s16 = _mm_srli_epi16(_mm_mullo_epi16(_mm_shuffle_epi8(Sse41::LoadLast8<7>(src), Sse41::C7_SHFL0), Sse41::C7_MULLO), 9);
                _mm_storeu_si128((__m128i*)dst, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(s16)), _scale, _shift), 0));
                src += 7;
                dst += 8;
            }
        }

        static void Decode16f8(const uint8_t* src, float scale, float shift, size_t size, uint16_t* dst)
        {
            assert(size % 8 == 0);
            __m256 _scale = _mm256_set1_ps(scale);
            __m256 _shift = _mm256_set1_ps(shift);
            size_t i = 0, size16 = AlignLo(size, 16);
            for (; i < size16; i += 16)
            {
                __m128i u8 = _mm_loadu_si128((__m128i*)(src + i));
                _mm_storeu_si128((__m128i*)(dst + i) + 0, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(u8)), _scale, _shift), 0));
                _mm_storeu_si128((__m128i*)(dst + i) + 1, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_srli_si128(u8, 8))), _scale, _shift), 0));
            }
            for (; i < size; i += 8)
            {
                __m256 _src = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)(src + i))));
                _mm_storeu_si128((__m128i*)(dst + i), _mm256_cvtps_ph(_mm256_fmadd_ps(_src, _scale, _shift), 0));
            }
        }

        //-------------------------------------------------------------------------------------------------

        Base::DescrInt::Decode32fPtr GetDecode32f(size_t depth)
        {
            switch (depth)
            {
            case 4: return Decode32f4;
            case 5: return Decode32f5;
            case 6: return Decode32f6;
            case 7: return Decode32f7;
            case 8: return Decode32f8;
            default: assert(0); return NULL;
            }
        }

        Base::DescrInt::Decode16fPtr GetDecode16f(size_t depth)
        {
            switch (depth)
            {
            case 4: return Decode16f4;
            case 5: return Decode16f5;
            case 6: return Decode16f6;
            case 7: return Decode16f7;
            case 8: return Decode16f8;
            default: assert(0); return NULL;
            }
        }
    }
#endif
}
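
Note on what the uncovered kernels compute: each Decode32fN/Decode16fN above reads descriptor components stored as N-bit unsigned codes (N = 4..8) packed into a byte stream and writes code * scale + shift, as 32-bit float or as FP16 via _mm256_cvtps_ph. The 4..7-bit variants unpack with a byte shuffle, a per-lane 16-bit multiply that left-aligns each code (the C<N>_SHFL / C<N>_MULLO constants), and a logical right shift; their tail loop uses Sse41::LoadLast8<N> together with size16 = AlignLo(size - 1, 16), presumably so the full-width 128-bit load never reads past the end of the packed input. The scalar sketch below restates that decode, e.g. as an oracle for a unit test that would cover these paths; the helper name and the LSB-first packing order are assumptions inferred from the constants above, not part of the Simd library API.

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    // Hypothetical scalar reference for the Decode32f<N> kernels above (not part
    // of the Simd library). Assumes components are packed LSB-first, depth bits
    // each, and decoded with the same affine mapping code * scale + shift.
    static void ScalarDecode32f(size_t depth, const uint8_t* src, float scale, float shift, size_t size, float* dst)
    {
        assert(depth >= 4 && depth <= 8 && size % 8 == 0);
        const uint32_t mask = (1u << depth) - 1;
        for (size_t i = 0, bit = 0; i < size; ++i, bit += depth)
        {
            size_t byte = bit / 8, rest = bit % 8;
            uint32_t bits = src[byte];
            if (rest + depth > 8)                      // code straddles a byte boundary
                bits |= uint32_t(src[byte + 1]) << 8;  // depth <= 8, so two bytes always suffice
            uint32_t code = (bits >> rest) & mask;
            dst[i] = float(code) * scale + shift;      // same mapping the AVX2 kernels apply per lane
        }
    }

Under those assumptions, a test could compare this reference against the pointer returned by GetDecode32f(depth) (the Decode32fN functions themselves are file-local statics, so the dispatchers are the only entry points) for each depth from 4 to 8, which would cover every function listed in this report.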