Coverage Report

Created: 2026-02-14 07:40

/src/Simd/src/Simd/SimdAvx512bwSynetQuantizedScale.cpp
All executable lines below have an execution count of 0: nothing in this file is covered.
/*
* Simd Library (http://ermig1979.github.io/Simd).
*
* Copyright (c) 2011-2025 Yermalayeu Ihar.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "Simd/SimdSynetQuantizeLinear.h"
#include "Simd/SimdFmadd.h"

namespace Simd
{
#if defined(SIMD_AVX512BW_ENABLE) && defined(SIMD_SYNET_ENABLE)
    namespace Avx512bw
    {
        // Dequantizes 16 int32 lanes to float, applies scale * x + bias, and
        // requantizes the result back to int32.
        SIMD_INLINE __m512i QuantizedScale(const __m512i& src, const __m512i& sBias, const __m512& sNorm, const __m512& scale, const __m512& bias, const __m512& dNorm, const __m512i& dZero)
        {
            __m512 _src = DequantizeLinear(src, sBias, sNorm);
            __m512 _dst = Fmadd<false>(_src, scale, bias);
            return QuantizeLinear(_dst, dNorm, dZero);
        }
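
        // For reference, a minimal scalar sketch of the same per-element
        // transform (an assumption based on the usual linear-quantization
        // semantics of DequantizeLinear and QuantizeLinear; exact rounding and
        // saturation follow the library's own helpers and may differ slightly).
        // QuantizedScaleRef is illustrative only and is not part of the library:
        SIMD_INLINE uint8_t QuantizedScaleRef(uint8_t src, int srcZero, float srcScale,
            float scale, float bias, float dstScaleInv, int dstZero)
        {
            float value = float(int(src) - srcZero) * srcScale;     // dequantize
            float scaled = value * scale + bias;                    // scale * x + bias
            float r = scaled * dstScaleInv;                         // requantize ...
            int q = int(r + (r < 0.0f ? -0.5f : 0.5f)) + dstZero;   // ... round, shift by zero point
            return (uint8_t)(q < 0 ? 0 : (q > 255 ? 255 : q));      // saturate to u8
        }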

        // Processes up to 16 uint8_t elements in one register.
        SIMD_INLINE void QuantizedScale16(const uint8_t* src, const __m512i& sBias, const __m512& sNorm, const __m512& scale, const __m512& bias, uint8_t* dst, const __m512& dNorm, const __m512i& dZero, __mmask16 tail = -1)
        {
            __m512i _src = _mm512_cvtepu8_epi32(_mm_maskz_loadu_epi8(tail, src));
            __m512i d0 = QuantizedScale(_src, sBias, sNorm, scale, bias, dNorm, dZero);
            _mm_mask_storeu_epi8(dst, tail, _mm512_castsi512_si128(PackI16ToU8(PackI32ToI16(d0, K_ZERO), K_ZERO)));
        }
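
        // With the default mask of -1 (all 16 lanes set), QuantizedScale16 handles a
        // full 16-element block; callers pass a narrower __mmask16 to process a
        // partial tail without reading or writing past the end of the buffers.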

        // Processes 64 elements with a single broadcast scale and bias.
        SIMD_INLINE void QuantizedScale64(const uint8_t* src, const __m512i& sBias, const __m512& sNorm, const __m512& scale, const __m512& bias, uint8_t* dst, const __m512& dNorm, const __m512i& dZero)
        {
            __m512i d0 = QuantizedScale(_mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)src + 0)), sBias, sNorm, scale, bias, dNorm, dZero);
            __m512i d1 = QuantizedScale(_mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)src + 1)), sBias, sNorm, scale, bias, dNorm, dZero);
            __m512i d2 = QuantizedScale(_mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)src + 2)), sBias, sNorm, scale, bias, dNorm, dZero);
            __m512i d3 = QuantizedScale(_mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)src + 3)), sBias, sNorm, scale, bias, dNorm, dZero);
            _mm512_storeu_si512((__m512i*)dst, PackI16ToU8(PackI32ToI16(d0, d1), PackI32ToI16(d2, d3)));
        }

        // Processes 64 elements with per-element scale and bias loaded from memory,
        // one register of F floats at a time.
        SIMD_INLINE void QuantizedScale64(const uint8_t* src, const __m512i& sBias, const __m512& sNorm, const float* scale, const float* bias, uint8_t* dst, const __m512& dNorm, const __m512i& dZero)
        {
            __m512i d0 = QuantizedScale(_mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)src + 0)), sBias, sNorm, _mm512_loadu_ps(scale + 0 * F), _mm512_loadu_ps(bias + 0 * F), dNorm, dZero);
            __m512i d1 = QuantizedScale(_mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)src + 1)), sBias, sNorm, _mm512_loadu_ps(scale + 1 * F), _mm512_loadu_ps(bias + 1 * F), dNorm, dZero);
            __m512i d2 = QuantizedScale(_mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)src + 2)), sBias, sNorm, _mm512_loadu_ps(scale + 2 * F), _mm512_loadu_ps(bias + 2 * F), dNorm, dZero);
            __m512i d3 = QuantizedScale(_mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)src + 3)), sBias, sNorm, _mm512_loadu_ps(scale + 3 * F), _mm512_loadu_ps(bias + 3 * F), dNorm, dZero);
            _mm512_storeu_si512((__m512i*)dst, PackI16ToU8(PackI32ToI16(d0, d1), PackI32ToI16(d2, d3)));
        }
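
        // The two QuantizedScale64 overloads differ only in how scale and bias
        // arrive: the first broadcasts one scale/bias pair across all 64 elements
        // (used by the NCHW path below, one pair per channel), the second loads
        // 64 per-channel values from memory (used by the NHWC path, where
        // consecutive elements belong to consecutive channels).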

        void SynetQuantizedScaleLayerForward(const uint8_t* src, const float* srcScale, int srcZero, size_t channels, size_t spatial, const float* scale, const float* bias, uint8_t* dst, const float* dstScale, int dstZero, SimdTensorFormatType format)
        {
            Array32f defaultBias;
            if (bias == NULL)
            {
                defaultBias.Resize(channels, true); // a missing bias defaults to zeros
                bias = defaultBias.data;
            }
            __m512i sBias = _mm512_set1_epi32(-srcZero), dZero = _mm512_set1_epi32(dstZero);
            __m512 sNorm = _mm512_set1_ps(srcScale[0]), dNorm = _mm512_set1_ps(1.0f / dstScale[0]);
            if (format == SimdTensorFormatNhwc)
            {
                // NHWC: per-channel scale and bias vary along the innermost dimension.
                // Note the aligned _mm512_load_ps: scale and bias are expected to be
                // 64-byte aligned here.
                size_t channels16 = AlignLo(channels, 16), channels64 = AlignLo(channels, 64);
                __mmask16 tail = TailMask16(channels - channels16);
                for (size_t s = 0; s < spatial; ++s)
                {
                    size_t c = 0;
                    for (; c < channels64; c += 64)
                        QuantizedScale64(src + c, sBias, sNorm, scale + c, bias + c, dst + c, dNorm, dZero);
                    for (; c < channels16; c += 16)
                        QuantizedScale16(src + c, sBias, sNorm, _mm512_load_ps(scale + c), _mm512_load_ps(bias + c), dst + c, dNorm, dZero);
                    if (tail)
                        QuantizedScale16(src + c, sBias, sNorm, _mm512_maskz_load_ps(tail, scale + c), _mm512_maskz_load_ps(tail, bias + c), dst + c, dNorm, dZero, tail);
                    src += channels;
                    dst += channels;
                }
            }
            else
            {
                // NCHW: one scale/bias pair is broadcast across each channel's
                // spatial extent.
                size_t spatial16 = AlignLo(spatial, 16), spatial64 = AlignLo(spatial, 64);
                __mmask16 tail = TailMask16(spatial - spatial16);
                for (size_t c = 0; c < channels; ++c)
                {
                    __m512 _scale = _mm512_set1_ps(scale[c]);
                    __m512 _bias = _mm512_set1_ps(bias[c]);
                    size_t s = 0;
                    for (; s < spatial64; s += 64)
                        QuantizedScale64(src + s, sBias, sNorm, _scale, _bias, dst + s, dNorm, dZero);
                    for (; s < spatial16; s += 16)
                        QuantizedScale16(src + s, sBias, sNorm, _scale, _bias, dst + s, dNorm, dZero);
                    if (tail)
                        QuantizedScale16(src + s, sBias, sNorm, _scale, _bias, dst + s, dNorm, dZero, tail);
                    src += spatial;
                    dst += spatial;
                }
            }
        }
    }
#endif
}
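
A hedged usage sketch follows, indicating the kind of call a test would need in order to exercise this uncovered file. The shapes and quantization parameters are illustrative assumptions, and in practice this code is normally reached through the library's public API dispatch rather than by calling the Avx512bw namespace directly (which requires an AVX-512BW build and the internal headers):

    #include <cstdint>
    #include <vector>

    void CoverQuantizedScaleNchw()
    {
        // 88 = 64 + 16 + 8: exercises the 64-wide, 16-wide and masked-tail paths.
        const size_t channels = 3, spatial = 88;
        std::vector<uint8_t> src(channels * spatial, 128), dst(channels * spatial);
        std::vector<float> scale(channels, 0.5f), bias(channels, 1.0f);
        float srcScale = 0.02f, dstScale = 0.04f; // only element [0] of each is read
        Simd::Avx512bw::SynetQuantizedScaleLayerForward(src.data(), &srcScale, 128,
            channels, spatial, scale.data(), bias.data(), dst.data(), &dstScale, 128,
            SimdTensorFormatNchw);
    }

NCHW is used here because the NHWC path loads scale and bias with the aligned _mm512_load_ps, which would additionally require 64-byte-aligned arrays rather than plain std::vector storage.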