/src/Simd/src/Simd/SimdAvx512bwSynetQuantizedActivation.cpp
Line | Count | Source |
1 | | /* |
2 | | * Simd Library (http://ermig1979.github.io/Simd). |
3 | | * |
4 | | * Copyright (c) 2011-2025 Yermalayeu Ihar. |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person obtaining a copy |
7 | | * of this software and associated documentation files (the "Software"), to deal |
8 | | * in the Software without restriction, including without limitation the rights |
9 | | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
10 | | * copies of the Software, and to permit persons to whom the Software is |
11 | | * furnished to do so, subject to the following conditions: |
12 | | * |
13 | | * The above copyright notice and this permission notice shall be included in |
14 | | * all copies or substantial portions of the Software. |
15 | | * |
16 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
17 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
18 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
19 | | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
20 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
21 | | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
22 | | * SOFTWARE. |
23 | | */ |
24 | | #include "Simd/SimdSynetQuantizeLinear.h" |
25 | | #include "Simd/SimdFmadd.h" |
26 | | |
27 | | namespace Simd |
28 | | { |
29 | | #if defined(SIMD_AVX512BW_ENABLE) && defined(SIMD_SYNET_ENABLE) |
30 | | namespace Avx512bw |
31 | | { |
32 | | SIMD_INLINE __m512i QuantizedPrelu(const __m512i& src, const __m512i& sBias, const __m512& sNorm, const __m512& slope, const __m512& dNorm, const __m512i& dZero) |
33 | 0 | { |
34 | 0 | __m512 _src = DequantizeLinear(src, sBias, sNorm); |
35 | 0 | __m512 pos = _mm512_max_ps(_mm512_setzero_ps(), _src); |
36 | 0 | __m512 neg = _mm512_min_ps(_mm512_setzero_ps(), _src); |
37 | 0 | __m512 _dst = Fmadd<false>(slope, neg, pos); |
38 | 0 | return QuantizeLinear(_dst, dNorm, dZero); |
39 | 0 | } |
40 | | |
41 | | SIMD_INLINE void QuantizedPrelu16(const uint8_t* src, const __m512i& sBias, const __m512& sNorm, const __m512& slope, uint8_t* dst, const __m512& dNorm, const __m512i& dZero, __mmask16 tail = -1) |
42 | 0 | { |
43 | 0 | __m512i _src = _mm512_cvtepu8_epi32(_mm_maskz_loadu_epi8(tail, src)); |
44 | 0 | __m512i d0 = QuantizedPrelu(_src, sBias, sNorm, slope, dNorm, dZero); |
45 | 0 | _mm_mask_storeu_epi8(dst, tail, _mm512_castsi512_si128(PackI16ToU8(PackI32ToI16(d0, K_ZERO), K_ZERO))); |
46 | 0 | } |
47 | | |
48 | | SIMD_INLINE void QuantizedPrelu64(const uint8_t* src, const __m512i& sBias, const __m512& sNorm, const __m512& slope, uint8_t* dst, const __m512& dNorm, const __m512i& dZero) |
49 | 0 | { |
50 | 0 | __m512i d0 = QuantizedPrelu(_mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)src + 0)), sBias, sNorm, slope, dNorm, dZero); |
51 | 0 | __m512i d1 = QuantizedPrelu(_mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)src + 1)), sBias, sNorm, slope, dNorm, dZero); |
52 | 0 | __m512i d2 = QuantizedPrelu(_mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)src + 2)), sBias, sNorm, slope, dNorm, dZero); |
53 | 0 | __m512i d3 = QuantizedPrelu(_mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)src + 3)), sBias, sNorm, slope, dNorm, dZero); |
54 | 0 | _mm512_storeu_si512((__m512i*)dst, PackI16ToU8(PackI32ToI16(d0, d1), PackI32ToI16(d2, d3))); |
55 | 0 | } |
56 | | |
57 | | SIMD_INLINE void QuantizedPrelu64(const uint8_t* src, const __m512i& sBias, const __m512& sNorm, const float* slope, uint8_t* dst, const __m512& dNorm, const __m512i& dZero) |
58 | 0 | { |
59 | 0 | __m512i d0 = QuantizedPrelu(_mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)src + 0)), sBias, sNorm, _mm512_loadu_ps(slope + 0 * F), dNorm, dZero); |
60 | 0 | __m512i d1 = QuantizedPrelu(_mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)src + 1)), sBias, sNorm, _mm512_loadu_ps(slope + 1 * F), dNorm, dZero); |
61 | 0 | __m512i d2 = QuantizedPrelu(_mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)src + 2)), sBias, sNorm, _mm512_loadu_ps(slope + 2 * F), dNorm, dZero); |
62 | 0 | __m512i d3 = QuantizedPrelu(_mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)src + 3)), sBias, sNorm, _mm512_loadu_ps(slope + 3 * F), dNorm, dZero); |
63 | 0 | _mm512_storeu_si512((__m512i*)dst, PackI16ToU8(PackI32ToI16(d0, d1), PackI32ToI16(d2, d3))); |
64 | 0 | } |
65 | | |
66 | | void SynetQuantizedPreluLayerForward(const uint8_t* src, const float* srcPrelu, int srcZero, size_t channels, size_t spatial, const float* slope, uint8_t* dst, const float* dstPrelu, int dstZero, SimdTensorFormatType format) |
67 | 0 | { |
68 | 0 | __m512i sBias = _mm512_set1_epi32(-srcZero), dZero = _mm512_set1_epi32(dstZero); |
69 | 0 | __m512 sNorm = _mm512_set1_ps(srcPrelu[0]), dNorm = _mm512_set1_ps(1.0f / dstPrelu[0]); |
70 | 0 | if (format == SimdTensorFormatNhwc) |
71 | 0 | { |
72 | 0 | size_t channels16 = AlignLo(channels, 16), channels64 = AlignLo(channels, 64); |
73 | 0 | __mmask16 tail = TailMask16(channels - channels16); |
74 | 0 | for (size_t s = 0; s < spatial; ++s) |
75 | 0 | { |
76 | 0 | size_t c = 0; |
77 | 0 | for (; c < channels64; c += 64) |
78 | 0 | QuantizedPrelu64(src + c, sBias, sNorm, slope + c, dst + c, dNorm, dZero); |
79 | 0 | for (; c < channels16; c += 16) |
80 | 0 | QuantizedPrelu16(src + c, sBias, sNorm, _mm512_load_ps(slope + c), dst + c, dNorm, dZero); |
81 | 0 | if(tail) |
82 | 0 | QuantizedPrelu16(src + c, sBias, sNorm, _mm512_maskz_load_ps(tail, slope + c), dst + c, dNorm, dZero, tail); |
83 | 0 | src += channels; |
84 | 0 | dst += channels; |
85 | 0 | } |
86 | 0 | } |
87 | 0 | else |
88 | 0 | { |
89 | 0 | size_t spatial16 = AlignLo(spatial, 16), spatial64 = AlignLo(spatial, 64); |
90 | 0 | __mmask16 tail = TailMask16(spatial - spatial16); |
91 | 0 | for (size_t c = 0; c < channels; ++c) |
92 | 0 | { |
93 | 0 | __m512 _slope = _mm512_set1_ps(slope[c]); |
94 | 0 | size_t s = 0; |
95 | 0 | for (; s < spatial64; s += 64) |
96 | 0 | QuantizedPrelu64(src + s, sBias, sNorm, _slope, dst + s, dNorm, dZero); |
97 | 0 | for (; s < spatial16; s += 16) |
98 | 0 | QuantizedPrelu16(src + s, sBias, sNorm, _slope, dst + s, dNorm, dZero); |
99 | 0 | if (tail) |
100 | 0 | QuantizedPrelu16(src + s, sBias, sNorm, _slope, dst + s, dNorm, dZero, tail); |
101 | 0 | src += spatial; |
102 | 0 | dst += spatial; |
103 | 0 | } |
104 | 0 | } |
105 | 0 | } |
106 | | } |
107 | | #endif |
108 | | } |