/src/Simd/src/Simd/SimdAvx512bwSynetQuantizedScale.cpp
/*
* Simd Library (http://ermig1979.github.io/Simd).
*
* Copyright (c) 2011-2025 Yermalayeu Ihar.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "Simd/SimdSynetQuantizeLinear.h"
#include "Simd/SimdFmadd.h"

namespace Simd
{
#if defined(SIMD_AVX512BW_ENABLE) && defined(SIMD_SYNET_ENABLE)
    namespace Avx512bw
    {
        // Requantization core: dequantize 16 int32 values to float, apply the
        // affine scale + bias transform, then quantize back to int32.
        SIMD_INLINE __m512i QuantizedScale(const __m512i& src, const __m512i& sBias, const __m512& sNorm, const __m512& scale, const __m512& bias, const __m512& dNorm, const __m512i& dZero)
        {
            __m512 _src = DequantizeLinear(src, sBias, sNorm);
            __m512 _dst = Fmadd<false>(_src, scale, bias);
            return QuantizeLinear(_dst, dNorm, dZero);
        }
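
        // For reference, a minimal scalar sketch of the same per-element math,
        // assuming the usual linear (de)quantization semantics of
        // DequantizeLinear / QuantizeLinear (rounding-mode details aside):
        //
        //     uint8_t QuantizedScaleScalar(uint8_t src, int sBias, float sNorm,
        //         float scale, float bias, float dNorm, int dZero)
        //     {
        //         float f = float(int(src) + sBias) * sNorm;  // dequantize: sBias = -srcZero, sNorm = srcScale
        //         f = f * scale + bias;                       // affine transform in float
        //         int i = int(roundf(f * dNorm)) + dZero;     // requantize: dNorm = 1 / dstScale
        //         return (uint8_t)std::min(std::max(i, 0), 255); // saturate to uint8_t
        //     }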

        // Transforms up to 16 uint8_t values; 'tail' masks both the load and the
        // store, so the same kernel handles a partial final block.
        SIMD_INLINE void QuantizedScale16(const uint8_t* src, const __m512i& sBias, const __m512& sNorm, const __m512& scale, const __m512& bias, uint8_t* dst, const __m512& dNorm, const __m512i& dZero, __mmask16 tail = -1)
        {
            __m512i _src = _mm512_cvtepu8_epi32(_mm_maskz_loadu_epi8(tail, src));
            __m512i d0 = QuantizedScale(_src, sBias, sNorm, scale, bias, dNorm, dZero);
            _mm_mask_storeu_epi8(dst, tail, _mm512_castsi512_si128(PackI16ToU8(PackI32ToI16(d0, K_ZERO), K_ZERO)));
        }
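
        // The tail mask comes from TailMask16(n), which (as used below) yields a
        // 16-bit mask with the low n bits set, so a final block of n < 16
        // elements reuses this kernel without a scalar cleanup loop.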

        // Transforms 64 values with a single broadcast scale/bias pair (used for
        // channel-major layouts, where the coefficients are constant along a row).
        SIMD_INLINE void QuantizedScale64(const uint8_t* src, const __m512i& sBias, const __m512& sNorm, const __m512& scale, const __m512& bias, uint8_t* dst, const __m512& dNorm, const __m512i& dZero)
        {
            __m512i d0 = QuantizedScale(_mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)src + 0)), sBias, sNorm, scale, bias, dNorm, dZero);
            __m512i d1 = QuantizedScale(_mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)src + 1)), sBias, sNorm, scale, bias, dNorm, dZero);
            __m512i d2 = QuantizedScale(_mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)src + 2)), sBias, sNorm, scale, bias, dNorm, dZero);
            __m512i d3 = QuantizedScale(_mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)src + 3)), sBias, sNorm, scale, bias, dNorm, dZero);
            _mm512_storeu_si512((__m512i*)dst, PackI16ToU8(PackI32ToI16(d0, d1), PackI32ToI16(d2, d3)));
        }

        // Same as above, but loads a distinct scale/bias for each of the 64 lanes
        // (used for the NHWC layout, where coefficients vary along a row).
        SIMD_INLINE void QuantizedScale64(const uint8_t* src, const __m512i& sBias, const __m512& sNorm, const float* scale, const float* bias, uint8_t* dst, const __m512& dNorm, const __m512i& dZero)
        {
            __m512i d0 = QuantizedScale(_mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)src + 0)), sBias, sNorm, _mm512_loadu_ps(scale + 0 * F), _mm512_loadu_ps(bias + 0 * F), dNorm, dZero);
            __m512i d1 = QuantizedScale(_mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)src + 1)), sBias, sNorm, _mm512_loadu_ps(scale + 1 * F), _mm512_loadu_ps(bias + 1 * F), dNorm, dZero);
            __m512i d2 = QuantizedScale(_mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)src + 2)), sBias, sNorm, _mm512_loadu_ps(scale + 2 * F), _mm512_loadu_ps(bias + 2 * F), dNorm, dZero);
            __m512i d3 = QuantizedScale(_mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)src + 3)), sBias, sNorm, _mm512_loadu_ps(scale + 3 * F), _mm512_loadu_ps(bias + 3 * F), dNorm, dZero);
            _mm512_storeu_si512((__m512i*)dst, PackI16ToU8(PackI32ToI16(d0, d1), PackI32ToI16(d2, d3)));
        }
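
        // Note: PackI32ToI16 / PackI16ToU8 are Simd helpers that, besides the
        // saturating packs, are expected to undo the 128-bit lane interleaving of
        // _mm512_packs_epi32 / _mm512_packus_epi16, so the 64 result bytes land
        // in their original order.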

        // Per-channel affine transform of a quantized uint8_t tensor:
        // dst[i] = Quantize(Dequantize(src[i]) * scale[c] + bias[c]).
        void SynetQuantizedScaleLayerForward(const uint8_t* src, const float* srcScale, int srcZero, size_t channels, size_t spatial, const float* scale, const float* bias, uint8_t* dst, const float* dstScale, int dstZero, SimdTensorFormatType format)
        {
            Array32f defaultBias;
            if (bias == NULL)
            {
                // Substitute an all-zero bias when none is provided.
                defaultBias.Resize(channels, true);
                bias = defaultBias.data;
            }
            __m512i sBias = _mm512_set1_epi32(-srcZero), dZero = _mm512_set1_epi32(dstZero);
            __m512 sNorm = _mm512_set1_ps(srcScale[0]), dNorm = _mm512_set1_ps(1.0f / dstScale[0]);
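            // Combined per-element transform, written out:
            // dst = round(((src - srcZero) * srcScale * scale + bias) / dstScale) + dstZero,
            // saturated to [0, 255].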
            if (format == SimdTensorFormatNhwc)
            {
                // NHWC: coefficients vary along the contiguous channel axis. The
                // caller's scale/bias carry no alignment guarantee, so use
                // unaligned loads.
                size_t channels16 = AlignLo(channels, 16), channels64 = AlignLo(channels, 64);
                __mmask16 tail = TailMask16(channels - channels16);
                for (size_t s = 0; s < spatial; ++s)
                {
                    size_t c = 0;
                    for (; c < channels64; c += 64)
                        QuantizedScale64(src + c, sBias, sNorm, scale + c, bias + c, dst + c, dNorm, dZero);
                    for (; c < channels16; c += 16)
                        QuantizedScale16(src + c, sBias, sNorm, _mm512_loadu_ps(scale + c), _mm512_loadu_ps(bias + c), dst + c, dNorm, dZero);
                    if (tail)
                        QuantizedScale16(src + c, sBias, sNorm, _mm512_maskz_loadu_ps(tail, scale + c), _mm512_maskz_loadu_ps(tail, bias + c), dst + c, dNorm, dZero, tail);
                    src += channels;
                    dst += channels;
                }
            }
            else
            {
                // NCHW: coefficients are constant across the spatial axis, so
                // broadcast them once per channel.
                size_t spatial16 = AlignLo(spatial, 16), spatial64 = AlignLo(spatial, 64);
                __mmask16 tail = TailMask16(spatial - spatial16);
                for (size_t c = 0; c < channels; ++c)
                {
                    __m512 _scale = _mm512_set1_ps(scale[c]);
                    __m512 _bias = _mm512_set1_ps(bias[c]);
                    size_t s = 0;
                    for (; s < spatial64; s += 64)
                        QuantizedScale64(src + s, sBias, sNorm, _scale, _bias, dst + s, dNorm, dZero);
                    for (; s < spatial16; s += 16)
                        QuantizedScale16(src + s, sBias, sNorm, _scale, _bias, dst + s, dNorm, dZero);
                    if (tail)
                        QuantizedScale16(src + s, sBias, sNorm, _scale, _bias, dst + s, dNorm, dZero, tail);
                    src += spatial;
                    dst += spatial;
                }
            }
        }
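
        // A minimal usage sketch with hypothetical values (real callers normally
        // reach this kernel through Simd's API dispatch rather than calling the
        // Avx512bw function directly):
        //
        //     const size_t channels = 3, spatial = 4;
        //     uint8_t src[channels * spatial] = { 0 }, dst[channels * spatial];
        //     float srcScale = 0.02f, dstScale = 0.04f;
        //     float scale[channels] = { 1.0f, 0.5f, 2.0f };
        //     SynetQuantizedScaleLayerForward(src, &srcScale, 128, channels, spatial,
        //         scale, NULL, dst, &dstScale, 128, SimdTensorFormatNhwc); // NULL bias -> zeros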
    }
#endif
}