/src/Simd/src/Simd/SimdAvx2SynetQuantizedShuffle.cpp
/*
* Simd Library (http://ermig1979.github.io/Simd).
*
* Copyright (c) 2011-2025 Yermalayeu Ihar.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "Simd/SimdSynetQuantizeLinear.h"
#include "Simd/SimdMath.h"
#include "Simd/SimdDeinterleave.h"
#include "Simd/SimdSet.h"

namespace Simd
{
#if defined(SIMD_AVX2_ENABLE) && defined(SIMD_SYNET_ENABLE)
    namespace Avx2
    {
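        // Shuffle type 0, NCHW layout: the channels of the concatenated inputs [src0, src1] are
        // dequantized with (bias, norm), requantized with (scale, zero) and deinterleaved, even
        // channels going to dst0 and odd channels to dst1, one spatial plane at a time.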
        void SynetQuantizedShuffleLayerForwardNchw0(const uint8_t* src0, int bias0, float norm0, size_t srcC0,
            const uint8_t* src1, int bias1, float norm1, size_t srcC1, size_t spatial, uint8_t* dst0, uint8_t* dst1, float scale, int zero)
        {
            size_t dstC = (srcC0 + srcC1) / 2, cd = 0, spatial8 = AlignLo(spatial, 8), spatial32 = AlignLo(spatial, 32), s;
            __m256i _bias0 = _mm256_set1_epi32(bias0), _bias1 = _mm256_set1_epi32(bias1), _zero = _mm256_set1_epi32(zero);
            __m256 _norm0 = _mm256_set1_ps(norm0), _norm1 = _mm256_set1_ps(norm1), _scale = _mm256_set1_ps(scale);
            for (size_t cs = 0; cs < srcC0; cs += 2, cd += 1)
            {
                for (s = 0; s < spatial32; s += 32)
                    DequantizeQuantizeLinear32(src0 + s, _bias0, _norm0, _scale, _zero, dst0 + s);
                for (; s < spatial8; s += 8)
                    DequantizeQuantizeLinear8(src0 + s, _bias0, _norm0, _scale, _zero, dst0 + s);
                for (; s < spatial; s += 1)
                    DequantizeQuantizeLinear1(src0 + s, _bias0, _norm0, _scale, _zero, dst0 + s);
                src0 += spatial;
                dst0 += spatial;
                for (s = 0; s < spatial32; s += 32)
                    DequantizeQuantizeLinear32(src0 + s, _bias0, _norm0, _scale, _zero, dst1 + s);
                for (; s < spatial8; s += 8)
                    DequantizeQuantizeLinear8(src0 + s, _bias0, _norm0, _scale, _zero, dst1 + s);
                for (; s < spatial; s += 1)
                    DequantizeQuantizeLinear1(src0 + s, _bias0, _norm0, _scale, _zero, dst1 + s);
                src0 += spatial;
                dst1 += spatial;
            }
            for (size_t cs = 0; cs < srcC1; cs += 2, cd += 1)
            {
                for (s = 0; s < spatial32; s += 32)
                    DequantizeQuantizeLinear32(src1 + s, _bias1, _norm1, _scale, _zero, dst0 + s);
                for (; s < spatial8; s += 8)
                    DequantizeQuantizeLinear8(src1 + s, _bias1, _norm1, _scale, _zero, dst0 + s);
                for (; s < spatial; s += 1)
                    DequantizeQuantizeLinear1(src1 + s, _bias1, _norm1, _scale, _zero, dst0 + s);
                src1 += spatial;
                dst0 += spatial;
                for (s = 0; s < spatial32; s += 32)
                    DequantizeQuantizeLinear32(src1 + s, _bias1, _norm1, _scale, _zero, dst1 + s);
                for (; s < spatial8; s += 8)
                    DequantizeQuantizeLinear8(src1 + s, _bias1, _norm1, _scale, _zero, dst1 + s);
                for (; s < spatial; s += 1)
                    DequantizeQuantizeLinear1(src1 + s, _bias1, _norm1, _scale, _zero, dst1 + s);
                src1 += spatial;
                dst1 += spatial;
            }
        }

        //--------------------------------------------------------------------------------------------------

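        // NHWC type 0 helpers: each call loads 2, 16 or 64 consecutive interleaved channels from one
        // source, applies dequantize + requantize, and splits even-indexed values to dst0 and
        // odd-indexed values to dst1 (1, 8 or 32 bytes per destination).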
        SIMD_INLINE void DequantizeQuantizeLinearNhwc0_1(const uint8_t* src, const __m256i& bias, const __m256& norm, const __m256& scale, const __m256i& zero, uint8_t* dst0, uint8_t* dst1)
        {
            __m256i d0 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_set1_epi32(((int16_t*)src)[0])), bias, norm), scale, zero);
            __m256i u0 = _mm256_packus_epi16(_mm256_packs_epi32(d0, K_ZERO), K_ZERO);
            dst0[0] = _mm256_extract_epi8(u0, 0);
            dst1[0] = _mm256_extract_epi8(u0, 1);
        }

        SIMD_INLINE void DequantizeQuantizeLinearNhwc0_8(const uint8_t* src, const __m256i& bias, const __m256& norm, const __m256& scale, const __m256i& zero, uint8_t* dst0, uint8_t* dst1)
        {
            __m128i _src = _mm_loadu_si128((__m128i*)src);
            __m256i d0 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(_src, 0)), bias, norm), scale, zero);
            __m256i d1 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(_src, 8)), bias, norm), scale, zero);
            __m256i u0 = Deinterleave8To64(PackI16ToU8(PackI32ToI16(d0, d1), K_ZERO));
#if defined(SIMD_X64_ENABLE)
            ((uint64_t*)dst0)[0] = _mm256_extract_epi64(u0, 0);
            ((uint64_t*)dst1)[0] = _mm256_extract_epi64(u0, 1);
#else
            SIMD_ALIGNED(32) uint64_t tmp[4];
            _mm256_store_si256((__m256i*)tmp, u0);
            ((uint64_t*)dst0)[0] = tmp[0];
            ((uint64_t*)dst1)[0] = tmp[1];
#endif
        }

        SIMD_INLINE void DequantizeQuantizeLinearNhwc0_32(const uint8_t* src, const __m256i& bias, const __m256& norm, const __m256& scale, const __m256i& zero, uint8_t* dst0, uint8_t* dst1)
        {
            __m256i d0, d1, d2, d3, u0, u1;
            __m128i s0;
            s0 = _mm_loadu_si128((__m128i*)src + 0);
            d0 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 0)), bias, norm), scale, zero);
            d1 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 8)), bias, norm), scale, zero);
            s0 = _mm_loadu_si128((__m128i*)src + 1);
            d2 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 0)), bias, norm), scale, zero);
            d3 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 8)), bias, norm), scale, zero);
            u0 = Deinterleave8To64(PackI16ToU8(PackI32ToI16(d0, d1), PackI32ToI16(d2, d3)));
            s0 = _mm_loadu_si128((__m128i*)src + 2);
            d0 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 0)), bias, norm), scale, zero);
            d1 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 8)), bias, norm), scale, zero);
            s0 = _mm_loadu_si128((__m128i*)src + 3);
            d2 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 0)), bias, norm), scale, zero);
            d3 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 8)), bias, norm), scale, zero);
            u1 = Deinterleave8To64(PackI16ToU8(PackI32ToI16(d0, d1), PackI32ToI16(d2, d3)));
            _mm256_storeu_si256((__m256i*)dst0, Deinterleave64<0>(u0, u1));
            _mm256_storeu_si256((__m256i*)dst1, Deinterleave64<1>(u0, u1));
        }

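        // Shuffle type 0, NHWC layout: for every pixel the channel vectors of src0 and src1 are
        // processed back to back; the wide kernels handle 64- and 16-channel blocks and a scalar
        // tail covers the remaining channel pairs.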
        void SynetQuantizedShuffleLayerForwardNhwc0(const uint8_t* src0, int bias0, float norm0, size_t srcC0,
            const uint8_t* src1, int bias1, float norm1, size_t srcC1, size_t spatial, uint8_t* dst0, uint8_t* dst1, float scale, int zero)
        {
            size_t dstC = (srcC0 + srcC1) / 2, cs, cd, srcC0_16 = AlignLo(srcC0, 16), srcC1_16 = AlignLo(srcC1, 16), srcC0_64 = AlignLo(srcC0, 64), srcC1_64 = AlignLo(srcC1, 64);
            __m256i _bias0 = _mm256_set1_epi32(bias0), _bias1 = _mm256_set1_epi32(bias1), _zero = _mm256_set1_epi32(zero);
            __m256 _norm0 = _mm256_set1_ps(norm0), _norm1 = _mm256_set1_ps(norm1), _scale = _mm256_set1_ps(scale);
            for (size_t s = 0; s < spatial; ++s)
            {
                cd = 0, cs = 0;
                for (; cs < srcC0_64; cs += 64, cd += 32)
                    DequantizeQuantizeLinearNhwc0_32(src0 + cs, _bias0, _norm0, _scale, _zero, dst0 + cd, dst1 + cd);
                for (; cs < srcC0_16; cs += 16, cd += 8)
                    DequantizeQuantizeLinearNhwc0_8(src0 + cs, _bias0, _norm0, _scale, _zero, dst0 + cd, dst1 + cd);
                for (; cs < srcC0; cs += 2, cd += 1)
                    DequantizeQuantizeLinearNhwc0_1(src0 + cs, _bias0, _norm0, _scale, _zero, dst0 + cd, dst1 + cd);
                cs = 0;
                for (; cs < srcC1_64; cs += 64, cd += 32)
                    DequantizeQuantizeLinearNhwc0_32(src1 + cs, _bias1, _norm1, _scale, _zero, dst0 + cd, dst1 + cd);
                for (; cs < srcC1_16; cs += 16, cd += 8)
                    DequantizeQuantizeLinearNhwc0_8(src1 + cs, _bias1, _norm1, _scale, _zero, dst0 + cd, dst1 + cd);
                for (; cs < srcC1; cs += 2, cd += 1)
                    DequantizeQuantizeLinearNhwc0_1(src1 + cs, _bias1, _norm1, _scale, _zero, dst0 + cd, dst1 + cd);
                src0 += srcC0;
                src1 += srcC1;
                dst0 += dstC;
                dst1 += dstC;
            }
        }

        //--------------------------------------------------------------------------------------------------

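        // Shuffle type 1, NCHW layout (inverse of type 0): the channels of src0 and src1 are
        // re-interleaved, pairs (src0[c], src1[c]) filling dst0 first and then dst1, one spatial
        // plane at a time.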
        void SynetQuantizedShuffleLayerForwardNchw1(const uint8_t* src0, int bias0, float norm0, size_t srcC0,
            const uint8_t* src1, int bias1, float norm1, size_t srcC1, size_t spatial, uint8_t* dst0, uint8_t* dst1, float scale, int zero)
        {
            size_t dstC = (srcC0 + srcC1) / 2, cs = 0, spatial8 = AlignLo(spatial, 8), spatial32 = AlignLo(spatial, 32), s;
            __m256i _bias0 = _mm256_set1_epi32(bias0), _bias1 = _mm256_set1_epi32(bias1), _zero = _mm256_set1_epi32(zero);
            __m256 _norm0 = _mm256_set1_ps(norm0), _norm1 = _mm256_set1_ps(norm1), _scale = _mm256_set1_ps(scale);
            for (size_t cd = 0; cd < srcC0; cs += 1, cd += 2)
            {
                for (s = 0; s < spatial32; s += 32)
                    DequantizeQuantizeLinear32(src0 + s, _bias0, _norm0, _scale, _zero, dst0 + s);
                for (; s < spatial8; s += 8)
                    DequantizeQuantizeLinear8(src0 + s, _bias0, _norm0, _scale, _zero, dst0 + s);
                for (; s < spatial; s += 1)
                    DequantizeQuantizeLinear1(src0 + s, _bias0, _norm0, _scale, _zero, dst0 + s);
                src0 += spatial;
                dst0 += spatial;
                for (s = 0; s < spatial32; s += 32)
                    DequantizeQuantizeLinear32(src1 + s, _bias1, _norm1, _scale, _zero, dst0 + s);
                for (; s < spatial8; s += 8)
                    DequantizeQuantizeLinear8(src1 + s, _bias1, _norm1, _scale, _zero, dst0 + s);
                for (; s < spatial; s += 1)
                    DequantizeQuantizeLinear1(src1 + s, _bias1, _norm1, _scale, _zero, dst0 + s);
                src1 += spatial;
                dst0 += spatial;
            }
            for (size_t cd = 0; cd < srcC1; cs += 1, cd += 2)
            {
                for (s = 0; s < spatial32; s += 32)
                    DequantizeQuantizeLinear32(src0 + s, _bias0, _norm0, _scale, _zero, dst1 + s);
                for (; s < spatial8; s += 8)
                    DequantizeQuantizeLinear8(src0 + s, _bias0, _norm0, _scale, _zero, dst1 + s);
                for (; s < spatial; s += 1)
                    DequantizeQuantizeLinear1(src0 + s, _bias0, _norm0, _scale, _zero, dst1 + s);
                src0 += spatial;
                dst1 += spatial;
                for (s = 0; s < spatial32; s += 32)
                    DequantizeQuantizeLinear32(src1 + s, _bias1, _norm1, _scale, _zero, dst1 + s);
                for (; s < spatial8; s += 8)
                    DequantizeQuantizeLinear8(src1 + s, _bias1, _norm1, _scale, _zero, dst1 + s);
                for (; s < spatial; s += 1)
                    DequantizeQuantizeLinear1(src1 + s, _bias1, _norm1, _scale, _zero, dst1 + s);
                src1 += spatial;
                dst1 += spatial;
            }
        }

        //--------------------------------------------------------------------------------------------------

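        // NHWC type 1 helpers: each call loads matching channel blocks from src0 and src1,
        // interleaves them byte-wise, applies dequantize + requantize with the per-source
        // bias/norm pairs packed into alternating lanes, and stores 2, 16 or 32 output channels.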
        SIMD_INLINE void DequantizeQuantizeLinearNhwc1_1(const uint8_t* src0, const uint8_t* src1, const __m256i& bias, const __m256& norm, const __m256& scale, const __m256i& zero, uint8_t* dst)
        {
            __m128i s0 = _mm_set1_epi8(src0[0]);
            __m128i s1 = _mm_set1_epi8(src1[0]);
            __m128i s01 = _mm_unpacklo_epi8(s0, s1);
            __m256i d0 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(s01), bias, norm), scale, zero);
            __m256i u0 = _mm256_packus_epi16(_mm256_packs_epi32(d0, K_ZERO), K_ZERO);
            ((uint16_t*)dst)[0] = _mm_cvtsi128_si32(_mm256_castsi256_si128(u0));
        }

        SIMD_INLINE void DequantizeQuantizeLinearNhwc1_8(const uint8_t* src0, const uint8_t* src1, const __m256i& bias, const __m256& norm, const __m256& scale, const __m256i& zero, uint8_t* dst)
        {
            __m128i _src0 = _mm_loadl_epi64((__m128i*)src0);
            __m128i _src1 = _mm_loadl_epi64((__m128i*)src1);
            __m128i s0 = _mm_unpacklo_epi8(_src0, _src1);
            __m256i d0 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 0)), bias, norm), scale, zero);
            __m256i d1 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 8)), bias, norm), scale, zero);
            __m256i u0 = PackI16ToU8(PackI32ToI16(d0, d1), K_ZERO);
            _mm_storeu_si128((__m128i*)dst, _mm256_castsi256_si128(u0));
        }

        SIMD_INLINE void DequantizeQuantizeLinearNhwc1_16(const uint8_t* src0, const uint8_t* src1, const __m256i& bias, const __m256& norm, const __m256& scale, const __m256i& zero, uint8_t* dst)
        {
            __m128i _src0, _src1, s0;
            __m256i d0, d1, d2, d3;
            _src0 = _mm_loadu_si128((__m128i*)src0 + 0);
            _src1 = _mm_loadu_si128((__m128i*)src1 + 0);
            s0 = _mm_unpacklo_epi8(_src0, _src1);
            d0 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 0)), bias, norm), scale, zero);
            d1 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 8)), bias, norm), scale, zero);
            s0 = _mm_unpackhi_epi8(_src0, _src1);
            d2 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 0)), bias, norm), scale, zero);
            d3 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 8)), bias, norm), scale, zero);
            _mm256_storeu_si256((__m256i*)dst, PackI16ToU8(PackI32ToI16(d0, d1), PackI32ToI16(d2, d3)));
        }

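        // Shuffle type 1, NHWC layout: for every pixel the interleaved pairs are written to dst0
        // (srcC0 channels) and then to dst1 (srcC1 channels); src0 and src1 each advance by
        // dstC = (srcC0 + srcC1) / 2 channels per pixel.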
        void SynetQuantizedShuffleLayerForwardNhwc1(const uint8_t* src0, int bias0, float norm0, size_t srcC0,
            const uint8_t* src1, int bias1, float norm1, size_t srcC1, size_t spatial, uint8_t* dst0, uint8_t* dst1, float scale, int zero)
        {
            size_t dstC = (srcC0 + srcC1) / 2, srcC0_16 = AlignLo(srcC0, 16), srcC1_16 = AlignLo(srcC1, 16), srcC0_32 = AlignLo(srcC0, 32), srcC1_32 = AlignLo(srcC1, 32);
            __m256i _bias01 = SetInt32(bias0, bias1), _zero = _mm256_set1_epi32(zero);
            __m256 _norm01 = SetFloat(norm0, norm1), _scale = _mm256_set1_ps(scale);
            for (size_t s = 0; s < spatial; ++s)
            {
                size_t cs = 0, cd = 0;
                for (; cd < srcC0_32; cd += 32, cs += 16)
                    DequantizeQuantizeLinearNhwc1_16(src0 + cs, src1 + cs, _bias01, _norm01, _scale, _zero, dst0 + cd);
                for (; cd < srcC0_16; cd += 16, cs += 8)
                    DequantizeQuantizeLinearNhwc1_8(src0 + cs, src1 + cs, _bias01, _norm01, _scale, _zero, dst0 + cd);
                for (; cd < srcC0; cd += 2, cs += 1)
                    DequantizeQuantizeLinearNhwc1_1(src0 + cs, src1 + cs, _bias01, _norm01, _scale, _zero, dst0 + cd);
                cd = 0;
                for (; cd < srcC1_32; cd += 32, cs += 16)
                    DequantizeQuantizeLinearNhwc1_16(src0 + cs, src1 + cs, _bias01, _norm01, _scale, _zero, dst1 + cd);
                for (; cd < srcC1_16; cd += 16, cs += 8)
                    DequantizeQuantizeLinearNhwc1_8(src0 + cs, src1 + cs, _bias01, _norm01, _scale, _zero, dst1 + cd);
                for (; cd < srcC1; cd += 2, cs += 1)
                    DequantizeQuantizeLinearNhwc1_1(src0 + cs, src1 + cs, _bias01, _norm01, _scale, _zero, dst1 + cd);
                src0 += dstC;
                src1 += dstC;
                dst0 += srcC0;
                dst1 += srcC1;
            }
        }

        //--------------------------------------------------------------------------------------------------

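        // Dispatcher over tensor format (NCHW / NHWC) and shuffle type (0 = deinterleave, 1 = interleave).
        // Per element every path computes QuantizeLinear(DequantizeLinear(src, bias, norm), scale, zero);
        // see SimdSynetQuantizeLinear.h for the exact definitions (roughly round((src + bias) * norm * scale) + zero,
        // saturated to [0, 255] by the byte packing).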
        void SynetQuantizedShuffleLayerForward(const uint8_t* src0, int bias0, const float* norm0, size_t srcC0, const uint8_t* src1, int bias1, const float* norm1, size_t srcC1,
            size_t spatial, uint8_t* dst0, uint8_t* dst1, const float* scale, int zero, SimdTensorFormatType format, int shuffleType)
        {
            switch (shuffleType)
            {
            case 0:
                if (format == SimdTensorFormatNhwc)
                    SynetQuantizedShuffleLayerForwardNhwc0(src0, bias0, *norm0, srcC0, src1, bias1, *norm1, srcC1, spatial, dst0, dst1, *scale, zero);
                else
                    SynetQuantizedShuffleLayerForwardNchw0(src0, bias0, *norm0, srcC0, src1, bias1, *norm1, srcC1, spatial, dst0, dst1, *scale, zero);
                break;
            case 1:
                if (format == SimdTensorFormatNhwc)
                    SynetQuantizedShuffleLayerForwardNhwc1(src0, bias0, *norm0, srcC0, src1, bias1, *norm1, srcC1, spatial, dst0, dst1, *scale, zero);
                else
                    SynetQuantizedShuffleLayerForwardNchw1(src0, bias0, *norm0, srcC0, src1, bias1, *norm1, srcC1, spatial, dst0, dst1, *scale, zero);
                break;
            }
        }
    }
#endif
}