Coverage Report

Created: 2025-09-27 07:34

/src/Simd/src/Simd/SimdAvx2SynetQuantizedShuffle.cpp
  Line| Count|Source
     1|      |/*
     2|      | * Simd Library (http://ermig1979.github.io/Simd).
     3|      | *
     4|      | * Copyright (c) 2011-2025 Yermalayeu Ihar.
     5|      | *
     6|      | * Permission is hereby granted, free of charge, to any person obtaining a copy
     7|      | * of this software and associated documentation files (the "Software"), to deal
     8|      | * in the Software without restriction, including without limitation the rights
     9|      | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    10|      | * copies of the Software, and to permit persons to whom the Software is
    11|      | * furnished to do so, subject to the following conditions:
    12|      | *
    13|      | * The above copyright notice and this permission notice shall be included in
    14|      | * all copies or substantial portions of the Software.
    15|      | *
    16|      | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    17|      | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    18|      | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    19|      | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    20|      | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    21|      | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    22|      | * SOFTWARE.
    23|      | */
    24|      |#include "Simd/SimdSynetQuantizeLinear.h"
    25|      |#include "Simd/SimdMath.h"
    26|      |#include "Simd/SimdDeinterleave.h"
    27|      |#include "Simd/SimdSet.h"
    28|      |
    29|      |namespace Simd
    30|      |{
    31|      |#if defined(SIMD_AVX2_ENABLE) && defined(SIMD_SYNET_ENABLE)
    32|      |    namespace Avx2
    33|      |    {
    34|      |        void SynetQuantizedShuffleLayerForwardNchw0(const uint8_t* src0, int bias0, float norm0, size_t srcC0,
    35|      |            const uint8_t* src1, int bias1, float norm1, size_t srcC1, size_t spatial, uint8_t* dst0, uint8_t* dst1, float scale, int zero)
    36|     0|        {
    37|     0|            size_t dstC = (srcC0 + srcC1) / 2, cd = 0, spatial8 = AlignLo(spatial, 8), spatial32 = AlignLo(spatial, 32), s;
    38|     0|            __m256i _bias0 = _mm256_set1_epi32(bias0), _bias1 = _mm256_set1_epi32(bias1), _zero = _mm256_set1_epi32(zero);
    39|     0|            __m256 _norm0 = _mm256_set1_ps(norm0), _norm1 = _mm256_set1_ps(norm1), _scale = _mm256_set1_ps(scale);
    40|     0|            for (size_t cs = 0; cs < srcC0; cs += 2, cd += 1)
    41|     0|            {
    42|     0|                for (s = 0; s < spatial32; s += 32)
    43|     0|                    DequantizeQuantizeLinear32(src0 + s, _bias0, _norm0, _scale, _zero, dst0 + s);
    44|     0|                for (; s < spatial8; s += 8)
    45|     0|                    DequantizeQuantizeLinear8(src0 + s, _bias0, _norm0, _scale, _zero, dst0 + s);
    46|     0|                for (; s < spatial; s += 1)
    47|     0|                    DequantizeQuantizeLinear1(src0 + s, _bias0, _norm0, _scale, _zero, dst0 + s);
    48|     0|                src0 += spatial;
    49|     0|                dst0 += spatial;
    50|     0|                for (s = 0; s < spatial32; s += 32)
    51|     0|                    DequantizeQuantizeLinear32(src0 + s, _bias0, _norm0, _scale, _zero, dst1 + s);
    52|     0|                for (; s < spatial8; s += 8)
    53|     0|                    DequantizeQuantizeLinear8(src0 + s, _bias0, _norm0, _scale, _zero, dst1 + s);
    54|     0|                for (; s < spatial; s += 1)
    55|     0|                    DequantizeQuantizeLinear1(src0 + s, _bias0, _norm0, _scale, _zero, dst1 + s);
    56|     0|                src0 += spatial;
    57|     0|                dst1 += spatial;
    58|     0|            }
    59|     0|            for (size_t cs = 0; cs < srcC1; cs += 2, cd += 1)
    60|     0|            {
    61|     0|                for (s = 0; s < spatial32; s += 32)
    62|     0|                    DequantizeQuantizeLinear32(src1 + s, _bias1, _norm1, _scale, _zero, dst0 + s);
    63|     0|                for (; s < spatial8; s += 8)
    64|     0|                    DequantizeQuantizeLinear8(src1 + s, _bias1, _norm1, _scale, _zero, dst0 + s);
    65|     0|                for (; s < spatial; s += 1)
    66|     0|                    DequantizeQuantizeLinear1(src1 + s, _bias1, _norm1, _scale, _zero, dst0 + s);
    67|     0|                src1 += spatial;
    68|     0|                dst0 += spatial;
    69|     0|                for (s = 0; s < spatial32; s += 32)
    70|     0|                    DequantizeQuantizeLinear32(src1 + s, _bias1, _norm1, _scale, _zero, dst1 + s);
    71|     0|                for (; s < spatial8; s += 8)
    72|     0|                    DequantizeQuantizeLinear8(src1 + s, _bias1, _norm1, _scale, _zero, dst1 + s);
    73|     0|                for (; s < spatial; s += 1)
    74|     0|                    DequantizeQuantizeLinear1(src1 + s, _bias1, _norm1, _scale, _zero, dst1 + s);
    75|     0|                src1 += spatial;
    76|     0|                dst1 += spatial;
    77|     0|            }
    78|     0|        }
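
Note: every executable line in this file has an execution count of 0, so none of these AVX2 kernels is exercised by the test run. Each DequantizeQuantizeLinear{1,8,32} call above performs a dequantize-requantize round trip per element. Below is a minimal scalar sketch of one lane, assuming DequantizeLinear(x, bias, norm) computes float(x + bias) * norm and QuantizeLinear(f, scale, zero) computes round(f * scale) + zero; these semantics are inferred from the call sites, and the helper name DequantizeQuantizeScalar is hypothetical:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdint>

    // Hypothetical scalar model of one lane of DequantizeQuantizeLinear{1,8,32}.
    // The [0, 255] saturation mirrors what the packs/packus instructions do in
    // the vector code; the arithmetic is an assumption, not the library's code.
    static uint8_t DequantizeQuantizeScalar(uint8_t src, int bias, float norm, float scale, int zero)
    {
        float dequantized = float(int(src) + bias) * norm;                  // uint8 -> float
        int requantized = int(std::nearbyint(dequantized * scale)) + zero;  // float -> int32
        return (uint8_t)std::min(std::max(requantized, 0), 255);           // saturate to uint8
    }

In SynetQuantizedShuffleLayerForwardNchw0 this round trip is applied plane by plane: for each pair of consecutive input channels, the even channel's plane is written to dst0 and the odd channel's plane to dst1.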
    79|      |
    80|      |        //--------------------------------------------------------------------------------------------------
    81|      |
    82|      |        SIMD_INLINE void DequantizeQuantizeLinearNhwc0_1(const uint8_t* src, const __m256i& bias, const __m256& norm, const __m256& scale, const __m256i& zero, uint8_t* dst0, uint8_t* dst1)
    83|     0|        {
    84|     0|            __m256i d0 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_set1_epi32(((int16_t*)src)[0])), bias, norm), scale, zero);
    85|     0|            __m256i u0 = _mm256_packus_epi16(_mm256_packs_epi32(d0, K_ZERO), K_ZERO);
    86|     0|            dst0[0] = _mm256_extract_epi8(u0, 0);
    87|     0|            dst1[0] = _mm256_extract_epi8(u0, 1);
    88|     0|        }
    89|      |
    90|      |        SIMD_INLINE void DequantizeQuantizeLinearNhwc0_8(const uint8_t* src, const __m256i& bias, const __m256& norm, const __m256& scale, const __m256i& zero, uint8_t* dst0, uint8_t* dst1)
    91|     0|        {
    92|     0|            __m128i _src = _mm_loadu_si128((__m128i*)src);
    93|     0|            __m256i d0 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(_src, 0)), bias, norm), scale, zero);
    94|     0|            __m256i d1 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(_src, 8)), bias, norm), scale, zero);
    95|     0|            __m256i u0 = Deinterleave8To64(PackI16ToU8(PackI32ToI16(d0, d1), K_ZERO));
    96|     0|#if defined(SIMD_X64_ENABLE)
    97|     0|            ((uint64_t*)dst0)[0] = _mm256_extract_epi64(u0, 0);
    98|     0|            ((uint64_t*)dst1)[0] = _mm256_extract_epi64(u0, 1);
    99|      |#else
   100|      |            SIMD_ALIGNED(32) uint64_t tmp[4];
   101|      |            _mm256_store_si256((__m256i*)tmp, u0);
   102|      |            ((uint64_t*)dst0)[0] = tmp[0];
   103|      |            ((uint64_t*)dst1)[0] = tmp[1];
   104|      |#endif
   105|     0|        }
   106|      |
   107|      |        SIMD_INLINE void DequantizeQuantizeLinearNhwc0_32(const uint8_t* src, const __m256i& bias, const __m256& norm, const __m256& scale, const __m256i& zero, uint8_t* dst0, uint8_t* dst1)
   108|     0|        {
   109|     0|            __m256i d0, d1, d2, d3, u0, u1;
   110|     0|            __m128i s0;
   111|     0|            s0 = _mm_loadu_si128((__m128i*)src + 0);
   112|     0|            d0 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 0)), bias, norm), scale, zero);
   113|     0|            d1 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 8)), bias, norm), scale, zero);
   114|     0|            s0 = _mm_loadu_si128((__m128i*)src + 1);
   115|     0|            d2 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 0)), bias, norm), scale, zero);
   116|     0|            d3 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 8)), bias, norm), scale, zero);
   117|     0|            u0 = Deinterleave8To64(PackI16ToU8(PackI32ToI16(d0, d1), PackI32ToI16(d2, d3)));
   118|     0|            s0 = _mm_loadu_si128((__m128i*)src + 2);
   119|     0|            d0 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 0)), bias, norm), scale, zero);
   120|     0|            d1 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 8)), bias, norm), scale, zero);
   121|     0|            s0 = _mm_loadu_si128((__m128i*)src + 3);
   122|     0|            d2 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 0)), bias, norm), scale, zero);
   123|     0|            d3 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 8)), bias, norm), scale, zero);
   124|     0|            u1 = Deinterleave8To64(PackI16ToU8(PackI32ToI16(d0, d1), PackI32ToI16(d2, d3)));
   125|     0|            _mm256_storeu_si256((__m256i*)dst0, Deinterleave64<0>(u0, u1));
   126|     0|            _mm256_storeu_si256((__m256i*)dst1, Deinterleave64<1>(u0, u1));
   127|     0|        }
   128|      |
   129|      |        void SynetQuantizedShuffleLayerForwardNhwc0(const uint8_t* src0, int bias0, float norm0, size_t srcC0,
   130|      |            const uint8_t* src1, int bias1, float norm1, size_t srcC1, size_t spatial, uint8_t* dst0, uint8_t* dst1, float scale, int zero)
   131|     0|        {
   132|     0|            size_t dstC = (srcC0 + srcC1) / 2, cs, cd, srcC0_16 = AlignLo(srcC0, 16), srcC1_16 = AlignLo(srcC1, 16), srcC0_64 = AlignLo(srcC0, 64), srcC1_64 = AlignLo(srcC1, 64);
   133|     0|            __m256i _bias0 = _mm256_set1_epi32(bias0), _bias1 = _mm256_set1_epi32(bias1), _zero = _mm256_set1_epi32(zero);
   134|     0|            __m256 _norm0 = _mm256_set1_ps(norm0), _norm1 = _mm256_set1_ps(norm1), _scale = _mm256_set1_ps(scale);
   135|     0|            for (size_t s = 0; s < spatial; ++s)
   136|     0|            {
   137|     0|                cd = 0, cs = 0;
   138|     0|                for (; cs < srcC0_64; cs += 64, cd += 32)
   139|     0|                    DequantizeQuantizeLinearNhwc0_32(src0 + cs, _bias0, _norm0, _scale, _zero, dst0 + cd, dst1 + cd);
   140|     0|                for (; cs < srcC0_16; cs += 16, cd += 8)
   141|     0|                    DequantizeQuantizeLinearNhwc0_8(src0 + cs, _bias0, _norm0, _scale, _zero, dst0 + cd, dst1 + cd);
   142|     0|                for (; cs < srcC0; cs += 2, cd += 1)
   143|     0|                    DequantizeQuantizeLinearNhwc0_1(src0 + cs, _bias0, _norm0, _scale, _zero, dst0 + cd, dst1 + cd);
   144|     0|                cs = 0;
   145|     0|                for (; cs < srcC1_64; cs += 64, cd += 32)
   146|     0|                    DequantizeQuantizeLinearNhwc0_32(src1 + cs, _bias1, _norm1, _scale, _zero, dst0 + cd, dst1 + cd);
   147|     0|                for (; cs < srcC1_16; cs += 16, cd += 8)
   148|     0|                    DequantizeQuantizeLinearNhwc0_8(src1 + cs, _bias1, _norm1, _scale, _zero, dst0 + cd, dst1 + cd);
   149|     0|                for (; cs < srcC1; cs += 2, cd += 1)
   150|     0|                    DequantizeQuantizeLinearNhwc0_1(src1 + cs, _bias1, _norm1, _scale, _zero, dst0 + cd, dst1 + cd);
   151|     0|                src0 += srcC0;
   152|     0|                src1 += srcC1;
   153|     0|                dst0 += dstC;
   154|     0|                dst1 += dstC;
   155|     0|            }
   156|     0|        }
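
Note: the Nhwc0 path performs the same type-0 shuffle per pixel across interleaved channels; the _1/_8/_32 helpers above route even input channels to dst0 and odd input channels to dst1 (Deinterleave8To64 and Deinterleave64 undo the byte interleaving left by the pack instructions). A per-pixel scalar reference, reusing the includes and the hypothetical DequantizeQuantizeScalar helper from the earlier note:

    // Hypothetical per-pixel reference for SynetQuantizedShuffleLayerForwardNhwc0:
    // even input channels go to dst0, odd input channels go to dst1.
    static void ShuffleNhwc0PixelScalar(const uint8_t* src0, int bias0, float norm0, size_t srcC0,
        const uint8_t* src1, int bias1, float norm1, size_t srcC1, uint8_t* dst0, uint8_t* dst1, float scale, int zero)
    {
        size_t cd = 0;
        for (size_t cs = 0; cs < srcC0; cs += 2, cd += 1)
        {
            dst0[cd] = DequantizeQuantizeScalar(src0[cs + 0], bias0, norm0, scale, zero);
            dst1[cd] = DequantizeQuantizeScalar(src0[cs + 1], bias0, norm0, scale, zero);
        }
        for (size_t cs = 0; cs < srcC1; cs += 2, cd += 1)
        {
            dst0[cd] = DequantizeQuantizeScalar(src1[cs + 0], bias1, norm1, scale, zero);
            dst1[cd] = DequantizeQuantizeScalar(src1[cs + 1], bias1, norm1, scale, zero);
        }
    }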
   157|      |
   158|      |        //--------------------------------------------------------------------------------------------------
   159|      |
   160|      |        void SynetQuantizedShuffleLayerForwardNchw1(const uint8_t* src0, int bias0, float norm0, size_t srcC0,
   161|      |            const uint8_t* src1, int bias1, float norm1, size_t srcC1, size_t spatial, uint8_t* dst0, uint8_t* dst1, float scale, int zero)
   162|     0|        {
   163|     0|            size_t dstC = (srcC0 + srcC1) / 2, cs = 0, spatial8 = AlignLo(spatial, 8), spatial32 = AlignLo(spatial, 32), s;
   164|     0|            __m256i _bias0 = _mm256_set1_epi32(bias0), _bias1 = _mm256_set1_epi32(bias1), _zero = _mm256_set1_epi32(zero);
   165|     0|            __m256 _norm0 = _mm256_set1_ps(norm0), _norm1 = _mm256_set1_ps(norm1), _scale = _mm256_set1_ps(scale);
   166|     0|            for (size_t cd = 0; cd < srcC0; cs += 1, cd += 2)
   167|     0|            {
   168|     0|                for (s = 0; s < spatial32; s += 32)
   169|     0|                    DequantizeQuantizeLinear32(src0 + s, _bias0, _norm0, _scale, _zero, dst0 + s);
   170|     0|                for (; s < spatial8; s += 8)
   171|     0|                    DequantizeQuantizeLinear8(src0 + s, _bias0, _norm0, _scale, _zero, dst0 + s);
   172|     0|                for (; s < spatial; s += 1)
   173|     0|                    DequantizeQuantizeLinear1(src0 + s, _bias0, _norm0, _scale, _zero, dst0 + s);
   174|     0|                src0 += spatial;
   175|     0|                dst0 += spatial;
   176|     0|                for (s = 0; s < spatial32; s += 32)
   177|     0|                    DequantizeQuantizeLinear32(src1 + s, _bias1, _norm1, _scale, _zero, dst0 + s);
   178|     0|                for (; s < spatial8; s += 8)
   179|     0|                    DequantizeQuantizeLinear8(src1 + s, _bias1, _norm1, _scale, _zero, dst0 + s);
   180|     0|                for (; s < spatial; s += 1)
   181|     0|                    DequantizeQuantizeLinear1(src1 + s, _bias1, _norm1, _scale, _zero, dst0 + s);
   182|     0|                src1 += spatial;
   183|     0|                dst0 += spatial;
   184|     0|            }
   185|     0|            for (size_t cd = 0; cd < srcC1; cs += 1, cd += 2)
   186|     0|            {
   187|     0|                for (s = 0; s < spatial32; s += 32)
   188|     0|                    DequantizeQuantizeLinear32(src0 + s, _bias0, _norm0, _scale, _zero, dst1 + s);
   189|     0|                for (; s < spatial8; s += 8)
   190|     0|                    DequantizeQuantizeLinear8(src0 + s, _bias0, _norm0, _scale, _zero, dst1 + s);
   191|     0|                for (; s < spatial; s += 1)
   192|     0|                    DequantizeQuantizeLinear1(src0 + s, _bias0, _norm0, _scale, _zero, dst1 + s);
   193|     0|                src0 += spatial;
   194|     0|                dst1 += spatial;
   195|     0|                for (s = 0; s < spatial32; s += 32)
   196|     0|                    DequantizeQuantizeLinear32(src1 + s, _bias1, _norm1, _scale, _zero, dst1 + s);
   197|     0|                for (; s < spatial8; s += 8)
   198|     0|                    DequantizeQuantizeLinear8(src1 + s, _bias1, _norm1, _scale, _zero, dst1 + s);
   199|     0|                for (; s < spatial; s += 1)
   200|     0|                    DequantizeQuantizeLinear1(src1 + s, _bias1, _norm1, _scale, _zero, dst1 + s);
   201|     0|                src1 += spatial;
   202|     0|                dst1 += spatial;
   203|     0|            }
   204|     0|        }
   205|      |
   206|      |        //--------------------------------------------------------------------------------------------------
   207|      |
   208|      |        SIMD_INLINE void DequantizeQuantizeLinearNhwc1_1(const uint8_t* src0, const uint8_t* src1, const __m256i& bias, const __m256& norm, const __m256& scale, const __m256i& zero, uint8_t* dst)
   209|     0|        {
   210|     0|            __m128i s0 = _mm_set1_epi8(src0[0]);
   211|     0|            __m128i s1 = _mm_set1_epi8(src1[0]);
   212|     0|            __m128i s01 = _mm_unpacklo_epi8(s0, s1);
   213|     0|            __m256i d0 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(s01), bias, norm), scale, zero);
   214|     0|            __m256i u0 = _mm256_packus_epi16(_mm256_packs_epi32(d0, K_ZERO), K_ZERO);
   215|     0|            ((uint16_t*)dst)[0] = _mm_cvtsi128_si32(_mm256_castsi256_si128(u0));
   216|     0|        }
   217|      |
   218|      |        SIMD_INLINE void DequantizeQuantizeLinearNhwc1_8(const uint8_t* src0, const uint8_t* src1, const __m256i& bias, const __m256& norm, const __m256& scale, const __m256i& zero, uint8_t* dst)
   219|     0|        {
   220|     0|            __m128i _src0 = _mm_loadl_epi64((__m128i*)src0);
   221|     0|            __m128i _src1 = _mm_loadl_epi64((__m128i*)src1);
   222|     0|            __m128i s0 = _mm_unpacklo_epi8(_src0, _src1);
   223|     0|            __m256i d0 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 0)), bias, norm), scale, zero);
   224|     0|            __m256i d1 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 8)), bias, norm), scale, zero);
   225|     0|            __m256i u0 = PackI16ToU8(PackI32ToI16(d0, d1), K_ZERO);
   226|     0|            _mm_storeu_si128((__m128i*)dst, _mm256_castsi256_si128(u0));
   227|     0|        }
   228|      |
   229|      |        SIMD_INLINE void DequantizeQuantizeLinearNhwc1_16(const uint8_t* src0, const uint8_t* src1, const __m256i& bias, const __m256& norm, const __m256& scale, const __m256i& zero, uint8_t* dst)
   230|     0|        {
   231|     0|            __m128i _src0, _src1, s0;
   232|     0|            __m256i d0, d1, d2, d3;
   233|     0|            _src0 = _mm_loadu_si128((__m128i*)src0 + 0);
   234|     0|            _src1 = _mm_loadu_si128((__m128i*)src1 + 0);
   235|     0|            s0 = _mm_unpacklo_epi8(_src0, _src1);
   236|     0|            d0 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 0)), bias, norm), scale, zero);
   237|     0|            d1 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 8)), bias, norm), scale, zero);
   238|     0|            s0 = _mm_unpackhi_epi8(_src0, _src1);
   239|     0|            d2 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 0)), bias, norm), scale, zero);
   240|     0|            d3 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 8)), bias, norm), scale, zero);
   241|     0|            _mm256_storeu_si256((__m256i*)dst, PackI16ToU8(PackI32ToI16(d0, d1), PackI32ToI16(d2, d3)));
   242|     0|        }
   243|      |
   244|      |        void SynetQuantizedShuffleLayerForwardNhwc1(const uint8_t* src0, int bias0, float norm0, size_t srcC0,
   245|      |            const uint8_t* src1, int bias1, float norm1, size_t srcC1, size_t spatial, uint8_t* dst0, uint8_t* dst1, float scale, int zero)
   246|     0|        {
   247|     0|            size_t dstC = (srcC0 + srcC1) / 2, srcC0_16 = AlignLo(srcC0, 16), srcC1_16 = AlignLo(srcC1, 16), srcC0_32 = AlignLo(srcC0, 32), srcC1_32 = AlignLo(srcC1, 32);
   248|     0|            __m256i _bias01 = SetInt32(bias0, bias1), _zero = _mm256_set1_epi32(zero);
   249|     0|            __m256 _norm01 = SetFloat(norm0, norm1), _scale = _mm256_set1_ps(scale);
   250|     0|            for (size_t s = 0; s < spatial; ++s)
   251|     0|            {
   252|     0|                size_t cs = 0, cd = 0;
   253|     0|                for (; cd < srcC0_32; cd += 32, cs += 16)
   254|     0|                    DequantizeQuantizeLinearNhwc1_16(src0 + cs, src1 + cs, _bias01, _norm01, _scale, _zero, dst0 + cd);
   255|     0|                for (; cd < srcC0_16; cd += 16, cs += 8)
   256|     0|                    DequantizeQuantizeLinearNhwc1_8(src0 + cs, src1 + cs, _bias01, _norm01, _scale, _zero, dst0 + cd);
   257|     0|                for (; cd < srcC0; cd += 2, cs += 1)
   258|     0|                    DequantizeQuantizeLinearNhwc1_1(src0 + cs, src1 + cs, _bias01, _norm01, _scale, _zero, dst0 + cd);
   259|     0|                cd = 0;
   260|     0|                for (; cd < srcC1_32; cd += 32, cs += 16)
   261|     0|                    DequantizeQuantizeLinearNhwc1_16(src0 + cs, src1 + cs, _bias01, _norm01, _scale, _zero, dst1 + cd);
   262|     0|                for (; cd < srcC1_16; cd += 16, cs += 8)
   263|     0|                    DequantizeQuantizeLinearNhwc1_8(src0 + cs, src1 + cs, _bias01, _norm01, _scale, _zero, dst1 + cd);
   264|     0|                for (; cd < srcC1; cd += 2, cs += 1)
   265|     0|                    DequantizeQuantizeLinearNhwc1_1(src0 + cs, src1 + cs, _bias01, _norm01, _scale, _zero, dst1 + cd);
   266|     0|                src0 += dstC;
   267|     0|                src1 += dstC;
   268|     0|                dst0 += srcC0;
   269|     0|                dst1 += srcC1;
   270|     0|            }
   271|     0|        }
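
Note: shuffle type 1 is the inverse permutation: one channel from src0 and one from src1 are interleaved into consecutive output channels, which is why the helpers use _mm_unpacklo_epi8 on the two sources and why SetInt32/SetFloat build constants that appear to alternate bias0/bias1 and norm0/norm1 across lanes. The trailing pointer increments in SynetQuantizedShuffleLayerForwardNhwc1 show that here srcC0 and srcC1 bound the two outputs while each input row is dstC channels wide. A per-pixel scalar reference, under the same assumptions as the earlier sketches:

    // Hypothetical per-pixel reference for SynetQuantizedShuffleLayerForwardNhwc1:
    // output channels alternate between the two inputs (inverse of shuffle type 0).
    static void ShuffleNhwc1PixelScalar(const uint8_t* src0, int bias0, float norm0, size_t srcC0,
        const uint8_t* src1, int bias1, float norm1, size_t srcC1, uint8_t* dst0, uint8_t* dst1, float scale, int zero)
    {
        size_t cs = 0;
        for (size_t cd = 0; cd < srcC0; cd += 2, cs += 1)
        {
            dst0[cd + 0] = DequantizeQuantizeScalar(src0[cs], bias0, norm0, scale, zero);
            dst0[cd + 1] = DequantizeQuantizeScalar(src1[cs], bias1, norm1, scale, zero);
        }
        for (size_t cd = 0; cd < srcC1; cd += 2, cs += 1)
        {
            dst1[cd + 0] = DequantizeQuantizeScalar(src0[cs], bias0, norm0, scale, zero);
            dst1[cd + 1] = DequantizeQuantizeScalar(src1[cs], bias1, norm1, scale, zero);
        }
    }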
   272|      |
   273|      |        //--------------------------------------------------------------------------------------------------
   274|      |
   275|      |        void SynetQuantizedShuffleLayerForward(const uint8_t* src0, int bias0, const float* norm0, size_t srcC0, const uint8_t* src1, int bias1, const float* norm1, size_t srcC1,
   276|      |            size_t spatial, uint8_t* dst0, uint8_t* dst1, const float* scale, int zero, SimdTensorFormatType format, int shuffleType)
   277|     0|        {
   278|     0|            switch (shuffleType)
   279|     0|            {
   280|     0|            case 0:
   281|     0|                if (format == SimdTensorFormatNhwc)
   282|     0|                    SynetQuantizedShuffleLayerForwardNhwc0(src0, bias0, *norm0, srcC0, src1, bias1, *norm1, srcC1, spatial, dst0, dst1, *scale, zero);
   283|     0|                else
   284|     0|                    SynetQuantizedShuffleLayerForwardNchw0(src0, bias0, *norm0, srcC0, src1, bias1, *norm1, srcC1, spatial, dst0, dst1, *scale, zero);
   285|     0|                break;
   286|     0|            case 1:
   287|     0|                if (format == SimdTensorFormatNhwc)
   288|     0|                    SynetQuantizedShuffleLayerForwardNhwc1(src0, bias0, *norm0, srcC0, src1, bias1, *norm1, srcC1, spatial, dst0, dst1, *scale, zero);
   289|     0|                else
   290|     0|                    SynetQuantizedShuffleLayerForwardNchw1(src0, bias0, *norm0, srcC0, src1, bias1, *norm1, srcC1, spatial, dst0, dst1, *scale, zero);
   291|     0|                break;
   292|     0|            }
   293|     0|        }
   294|      |    }
   295|      |#endif
   296|      |}
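
Note: since the whole translation unit is uncovered, one test per (format, shuffleType) combination routed through the dispatcher above would cover all four branches. A minimal illustrative driver for one of them follows; the buffer shapes and constants are arbitrary, and the include path for the internal Avx2 declarations is an assumption:

    #include <cstdint>
    #include <vector>
    #include "Simd/SimdAvx2.h" // assumed header declaring the internal Avx2 entry points

    // Illustrative call that would exercise the NHWC, shuffle-type-0 branch.
    void CoverQuantizedShuffleNhwc0()
    {
        const size_t srcC0 = 64, srcC1 = 64, spatial = 16, dstC = (srcC0 + srcC1) / 2;
        std::vector<uint8_t> src0(srcC0 * spatial, 1), src1(srcC1 * spatial, 2);
        std::vector<uint8_t> dst0(dstC * spatial), dst1(dstC * spatial);
        float norm0 = 0.5f, norm1 = 0.25f, scale = 4.0f;
        Simd::Avx2::SynetQuantizedShuffleLayerForward(src0.data(), -128, &norm0, srcC0,
            src1.data(), -128, &norm1, srcC1, spatial, dst0.data(), dst1.data(),
            &scale, 128, SimdTensorFormatNhwc, 0);
    }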