/src/Simd/src/Simd/SimdSse41SynetQuantizedScale.cpp
Line | Count | Source |
1 | | /* |
2 | | * Simd Library (http://ermig1979.github.io/Simd). |
3 | | * |
4 | | * Copyright (c) 2011-2026 Yermalayeu Ihar. |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person obtaining a copy |
7 | | * of this software and associated documentation files (the "Software"), to deal |
8 | | * in the Software without restriction, including without limitation the rights |
9 | | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
10 | | * copies of the Software, and to permit persons to whom the Software is |
11 | | * furnished to do so, subject to the following conditions: |
12 | | * |
13 | | * The above copyright notice and this permission notice shall be included in |
14 | | * all copies or substantial portions of the Software. |
15 | | * |
16 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
17 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
18 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
19 | | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
20 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
21 | | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
22 | | * SOFTWARE. |
23 | | */ |
24 | | #include "Simd/SimdSynetQuantizeLinear.h" |
25 | | #include "Simd/SimdFmadd.h" |
26 | | |
27 | | namespace Simd |
28 | | { |
29 | | #if defined(SIMD_SSE41_ENABLE) && defined(SIMD_SYNET_ENABLE) |
30 | | namespace Sse41 |
31 | | { |
32 | | SIMD_INLINE __m128i QuantizedScale(const __m128i& src, const __m128i& sBias, const __m128& sNorm, const __m128& scale, const __m128& bias, const __m128& dNorm, const __m128i& dZero) |
33 | 0 | { |
34 | 0 | __m128 _src = DequantizeLinear(src, sBias, sNorm); |
35 | 0 | __m128 _dst = Fmadd<false>(_src, scale, bias); |
36 | 0 | return QuantizeLinear(_dst, dNorm, dZero); |
37 | 0 | } |
38 | | |
39 | | SIMD_INLINE void QuantizedScale1(const uint8_t* src, const __m128i& sBias, const __m128& sNorm, const __m128& scale, const __m128& bias, uint8_t* dst, const __m128& dNorm, const __m128i& dZero) |
40 | 0 | { |
41 | 0 | __m128i _src = _mm_set1_epi32(src[0]); |
42 | 0 | __m128i d0 = QuantizedScale(_src, sBias, sNorm, scale, bias, dNorm, dZero); |
43 | 0 | dst[0] = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packs_epi32(d0, K_ZERO), K_ZERO)); |
44 | 0 | } |
45 | | |
46 | | SIMD_INLINE void QuantizedScale4(const uint8_t* src, const __m128i& sBias, const __m128& sNorm, const __m128& scale, const __m128& bias, uint8_t* dst, const __m128& dNorm, const __m128i& dZero) |
47 | 0 | { |
48 | 0 | __m128i _src = _mm_cvtepu8_epi32(_mm_set1_epi32(((int32_t*)src)[0])); |
49 | 0 | __m128i d0 = QuantizedScale(_src, sBias, sNorm, scale, bias, dNorm, dZero); |
50 | 0 | ((uint32_t*)dst)[0] = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packs_epi32(d0, K_ZERO), K_ZERO)); |
51 | 0 | } |
52 | | |
53 | | SIMD_INLINE void QuantizedScale16(const uint8_t* src, const __m128i& sBias, const __m128& sNorm, const __m128& scale, const __m128& bias, uint8_t* dst, const __m128& dNorm, const __m128i& dZero) |
54 | 0 | { |
55 | 0 | __m128i _src = _mm_loadu_si128((__m128i*)src); |
56 | 0 | __m128i d0 = QuantizedScale(_mm_cvtepu8_epi32(_mm_srli_si128(_src, 0 * 4)), sBias, sNorm, scale, bias, dNorm, dZero); |
57 | 0 | __m128i d1 = QuantizedScale(_mm_cvtepu8_epi32(_mm_srli_si128(_src, 1 * 4)), sBias, sNorm, scale, bias, dNorm, dZero); |
58 | 0 | __m128i d2 = QuantizedScale(_mm_cvtepu8_epi32(_mm_srli_si128(_src, 2 * 4)), sBias, sNorm, scale, bias, dNorm, dZero); |
59 | 0 | __m128i d3 = QuantizedScale(_mm_cvtepu8_epi32(_mm_srli_si128(_src, 3 * 4)), sBias, sNorm, scale, bias, dNorm, dZero); |
60 | 0 | _mm_storeu_si128((__m128i*)dst, _mm_packus_epi16(_mm_packs_epi32(d0, d1), _mm_packs_epi32(d2, d3))); |
61 | 0 | } |
62 | | |
63 | | SIMD_INLINE void QuantizedScale16(const uint8_t* src, const __m128i& sBias, const __m128& sNorm, const float* scale, const float* bias, uint8_t* dst, const __m128& dNorm, const __m128i& dZero) |
64 | 0 | { |
65 | 0 | __m128i _src = _mm_loadu_si128((__m128i*)src); |
66 | 0 | __m128i d0 = QuantizedScale(_mm_cvtepu8_epi32(_mm_srli_si128(_src, 0 * 4)), sBias, sNorm, _mm_loadu_ps(scale + 0 * 4), _mm_loadu_ps(bias + 0 * 4), dNorm, dZero); |
67 | 0 | __m128i d1 = QuantizedScale(_mm_cvtepu8_epi32(_mm_srli_si128(_src, 1 * 4)), sBias, sNorm, _mm_loadu_ps(scale + 1 * 4), _mm_loadu_ps(bias + 1 * 4), dNorm, dZero); |
68 | 0 | __m128i d2 = QuantizedScale(_mm_cvtepu8_epi32(_mm_srli_si128(_src, 2 * 4)), sBias, sNorm, _mm_loadu_ps(scale + 2 * 4), _mm_loadu_ps(bias + 2 * 4), dNorm, dZero); |
69 | 0 | __m128i d3 = QuantizedScale(_mm_cvtepu8_epi32(_mm_srli_si128(_src, 3 * 4)), sBias, sNorm, _mm_loadu_ps(scale + 3 * 4), _mm_loadu_ps(bias + 3 * 4), dNorm, dZero); |
70 | 0 | _mm_storeu_si128((__m128i*)dst, _mm_packus_epi16(_mm_packs_epi32(d0, d1), _mm_packs_epi32(d2, d3))); |
71 | 0 | } |
72 | | |
        // Quantized scale layer: for every element re-quantizes dst = Q(scale[c] * D(src) + bias[c]),
        // where D applies (src - srcZero) * srcScale and Q divides by dstScale and shifts by dstZero.
        // Dispatches on tensor format: NHWC uses vector loads of per-channel scale/bias,
        // NCHW broadcasts one channel's scale/bias over its spatial extent.
        void SynetQuantizedScaleLayerForward(const uint8_t* src, const float* srcScale, int srcZero, size_t channels, size_t spatial, const float* scale, const float* bias, uint8_t* dst, const float* dstScale, int dstZero, SimdTensorFormatType format)
        {
            // Substitute an all-zero bias when the caller passes none, so the inner
            // kernels can unconditionally compute scale * x + bias.
            Array32f defaultBias;
            if (bias == NULL)
            {
                defaultBias.Resize(channels, true);
                bias = defaultBias.data;
            }
            // sBias carries the negated source zero point; dNorm is the reciprocal of the
            // destination scale so quantization multiplies instead of dividing per element.
            __m128i sBias = _mm_set1_epi32(-srcZero), dZero = _mm_set1_epi32(dstZero);
            __m128 sNorm = _mm_set1_ps(srcScale[0]), dNorm = _mm_set1_ps(1.0f / dstScale[0]);
            if (format == SimdTensorFormatNhwc)
            {
                // NHWC: channels are innermost, so scale/bias advance with the channel index
                // and can be loaded as vectors. Tail loops cover the non-multiple-of-16/4 remainder.
                size_t channels4 = AlignLo(channels, 4), channels16 = AlignLo(channels, 16);
                for (size_t s = 0; s < spatial; ++s)
                {
                    size_t c = 0;
                    for (; c < channels16; c += 16)
                        QuantizedScale16(src + c, sBias, sNorm, scale + c, bias + c, dst + c, dNorm, dZero);
                    for (; c < channels4; c += 4)
                        QuantizedScale4(src + c, sBias, sNorm, _mm_loadu_ps(scale + c), _mm_loadu_ps(bias + c), dst + c, dNorm, dZero);
                    for (; c < channels; ++c)
                        QuantizedScale1(src + c, sBias, sNorm, _mm_load_ss(scale + c), _mm_load_ss(bias + c), dst + c, dNorm, dZero);
                    src += channels;
                    dst += channels;
                }
            }
            else
            {
                // NCHW (and other non-NHWC formats): one channel's spatial block is contiguous,
                // so its scale/bias are broadcast once and reused across the whole block.
                size_t spatial4 = AlignLo(spatial, 4), spatial16 = AlignLo(spatial, 16);
                for (size_t c = 0; c < channels; ++c)
                {
                    __m128 _scale = _mm_set1_ps(scale[c]);
                    __m128 _bias = _mm_set1_ps(bias[c]);
                    size_t s = 0;
                    for (; s < spatial16; s += 16)
                        QuantizedScale16(src + s, sBias, sNorm, _scale, _bias, dst + s, dNorm, dZero);
                    for (; s < spatial4; s += 4)
                        QuantizedScale4(src + s, sBias, sNorm, _scale, _bias, dst + s, dNorm, dZero);
                    for (; s < spatial; ++s)
                        QuantizedScale1(src + s, sBias, sNorm, _scale, _bias, dst + s, dNorm, dZero);
                    src += spatial;
                    dst += spatial;
                }
            }
        }
118 | | } |
119 | | #endif |
120 | | } |