Coverage Report

Created: 2025-09-27 07:34

/src/Simd/src/Simd/SimdAvx2SynetQuantizedShuffle.cpp
  Line| Count|Source
     1|      |/*
     2|      | * Simd Library (http://ermig1979.github.io/Simd).
     3|      | *
     4|      | * Copyright (c) 2011-2025 Yermalayeu Ihar.
     5|      | *
     6|      | * Permission is hereby granted, free of charge, to any person obtaining a copy
     7|      | * of this software and associated documentation files (the "Software"), to deal
     8|      | * in the Software without restriction, including without limitation the rights
     9|      | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    10|      | * copies of the Software, and to permit persons to whom the Software is
    11|      | * furnished to do so, subject to the following conditions:
    12|      | *
    13|      | * The above copyright notice and this permission notice shall be included in
    14|      | * all copies or substantial portions of the Software.
    15|      | *
    16|      | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    17|      | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    18|      | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    19|      | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    20|      | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    21|      | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    22|      | * SOFTWARE.
    23|      | */
    24|      |#include "Simd/SimdSynetQuantizeLinear.h"
    25|      |#include "Simd/SimdMath.h"
    26|      |#include "Simd/SimdDeinterleave.h"
    27|      |#include "Simd/SimdSet.h"
    28|      |
    29|      |namespace Simd
    30|      |{
    31|      |#if defined(SIMD_AVX2_ENABLE) && defined(SIMD_SYNET_ENABLE)
    32|      |    namespace Avx2
    33|      |    {
    34|      |        void SynetQuantizedShuffleLayerForwardNchw0(const uint8_t* src0, int bias0, float norm0, size_t srcC0,
    35|      |            const uint8_t* src1, int bias1, float norm1, size_t srcC1, size_t spatial, uint8_t* dst0, uint8_t* dst1, float scale, int zero)
    36|     0|        {
    37|     0|            size_t dstC = (srcC0 + srcC1) / 2, cd = 0, spatial8 = AlignLo(spatial, 8), spatial32 = AlignLo(spatial, 32), s;
    38|     0|            __m256i _bias0 = _mm256_set1_epi32(bias0), _bias1 = _mm256_set1_epi32(bias1), _zero = _mm256_set1_epi32(zero);
    39|     0|            __m256 _norm0 = _mm256_set1_ps(norm0), _norm1 = _mm256_set1_ps(norm1), _scale = _mm256_set1_ps(scale);
    40|     0|            for (size_t cs = 0; cs < srcC0; cs += 2, cd += 1)
    41|     0|            {
    42|     0|                for (s = 0; s < spatial32; s += 32)
    43|     0|                    DequantizeQuantizeLinear32(src0 + s, _bias0, _norm0, _scale, _zero, dst0 + s);
    44|     0|                for (; s < spatial8; s += 8)
    45|     0|                    DequantizeQuantizeLinear8(src0 + s, _bias0, _norm0, _scale, _zero, dst0 + s);
    46|     0|                for (; s < spatial; s += 1)
    47|     0|                    DequantizeQuantizeLinear1(src0 + s, _bias0, _norm0, _scale, _zero, dst0 + s);
    48|     0|                src0 += spatial;
    49|     0|                dst0 += spatial;
    50|     0|                for (s = 0; s < spatial32; s += 32)
    51|     0|                    DequantizeQuantizeLinear32(src0 + s, _bias0, _norm0, _scale, _zero, dst1 + s);
    52|     0|                for (; s < spatial8; s += 8)
    53|     0|                    DequantizeQuantizeLinear8(src0 + s, _bias0, _norm0, _scale, _zero, dst1 + s);
    54|     0|                for (; s < spatial; s += 1)
    55|     0|                    DequantizeQuantizeLinear1(src0 + s, _bias0, _norm0, _scale, _zero, dst1 + s);
    56|     0|                src0 += spatial;
    57|     0|                dst1 += spatial;
    58|     0|            }
    59|     0|            for (size_t cs = 0; cs < srcC1; cs += 2, cd += 1)
    60|     0|            {
    61|     0|                for (s = 0; s < spatial32; s += 32)
    62|     0|                    DequantizeQuantizeLinear32(src1 + s, _bias1, _norm1, _scale, _zero, dst0 + s);
    63|     0|                for (; s < spatial8; s += 8)
    64|     0|                    DequantizeQuantizeLinear8(src1 + s, _bias1, _norm1, _scale, _zero, dst0 + s);
    65|     0|                for (; s < spatial; s += 1)
    66|     0|                    DequantizeQuantizeLinear1(src1 + s, _bias1, _norm1, _scale, _zero, dst0 + s);
    67|     0|                src1 += spatial;
    68|     0|                dst0 += spatial;
    69|     0|                for (s = 0; s < spatial32; s += 32)
    70|     0|                    DequantizeQuantizeLinear32(src1 + s, _bias1, _norm1, _scale, _zero, dst1 + s);
    71|     0|                for (; s < spatial8; s += 8)
    72|     0|                    DequantizeQuantizeLinear8(src1 + s, _bias1, _norm1, _scale, _zero, dst1 + s);
    73|     0|                for (; s < spatial; s += 1)
    74|     0|                    DequantizeQuantizeLinear1(src1 + s, _bias1, _norm1, _scale, _zero, dst1 + s);
    75|     0|                src1 += spatial;
    76|     0|                dst1 += spatial;
    77|     0|            }
    78|     0|        }
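
Note: every executable line in this file has an execution count of 0, so none of these AVX2 kernels is exercised by the test run. Each DequantizeQuantizeLinear{1,8,32} call above performs a dequantize-requantize round trip per element. Below is a minimal scalar sketch of one lane, assuming DequantizeLinear(x, bias, norm) computes float(x + bias) * norm and QuantizeLinear(f, scale, zero) computes round(f * scale) + zero; these semantics are inferred from the call sites, and the helper name DequantizeQuantizeScalar is hypothetical:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdint>

    // Hypothetical scalar model of one lane of DequantizeQuantizeLinear{1,8,32}.
    // The [0, 255] saturation mirrors what the packs/packus instructions do in
    // the vector code; the arithmetic is an assumption, not the library's code.
    static uint8_t DequantizeQuantizeScalar(uint8_t src, int bias, float norm, float scale, int zero)
    {
        float dequantized = float(int(src) + bias) * norm;                  // uint8 -> float
        int requantized = int(std::nearbyint(dequantized * scale)) + zero;  // float -> int32
        return (uint8_t)std::min(std::max(requantized, 0), 255);           // saturate to uint8
    }

In SynetQuantizedShuffleLayerForwardNchw0 this round trip is applied plane by plane: for each pair of consecutive input channels, the even channel's plane is written to dst0 and the odd channel's plane to dst1.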
    79|      |
    80|      |        //--------------------------------------------------------------------------------------------------
    81|      |
    82|      |        SIMD_INLINE void DequantizeQuantizeLinearNhwc0_1(const uint8_t* src, const __m256i& bias, const __m256& norm, const __m256& scale, const __m256i& zero, uint8_t* dst0, uint8_t* dst1)
    83|     0|        {
    84|     0|            __m256i d0 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_set1_epi32(((int16_t*)src)[0])), bias, norm), scale, zero);
    85|     0|            __m256i u0 = _mm256_packus_epi16(_mm256_packs_epi32(d0, K_ZERO), K_ZERO);
    86|     0|            dst0[0] = _mm256_extract_epi8(u0, 0);
    87|     0|            dst1[0] = _mm256_extract_epi8(u0, 1);
    88|     0|        }
    89|      |
    90|      |        SIMD_INLINE void DequantizeQuantizeLinearNhwc0_8(const uint8_t* src, const __m256i& bias, const __m256& norm, const __m256& scale, const __m256i& zero, uint8_t* dst0, uint8_t* dst1)
    91|     0|        {
    92|     0|            __m128i _src = _mm_loadu_si128((__m128i*)src);
    93|     0|            __m256i d0 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(_src, 0)), bias, norm), scale, zero);
    94|     0|            __m256i d1 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(_src, 8)), bias, norm), scale, zero);
    95|     0|            __m256i u0 = Deinterleave8To64(PackI16ToU8(PackI32ToI16(d0, d1), K_ZERO));
    96|     0|#if defined(SIMD_X64_ENABLE)
    97|     0|            ((uint64_t*)dst0)[0] = _mm256_extract_epi64(u0, 0);
    98|     0|            ((uint64_t*)dst1)[0] = _mm256_extract_epi64(u0, 1);
    99|      |#else
   100|      |            SIMD_ALIGNED(32) uint64_t tmp[4];
   101|      |            _mm256_store_si256((__m256i*)tmp, u0);
   102|      |            ((uint64_t*)dst0)[0] = tmp[0];
   103|      |            ((uint64_t*)dst1)[0] = tmp[1];
   104|      |#endif
   105|     0|        }
   106|      |
   107|      |        SIMD_INLINE void DequantizeQuantizeLinearNhwc0_32(const uint8_t* src, const __m256i& bias, const __m256& norm, const __m256& scale, const __m256i& zero, uint8_t* dst0, uint8_t* dst1)
   108|     0|        {
   109|     0|            __m256i d0, d1, d2, d3, u0, u1;
   110|     0|            __m128i s0;
   111|     0|            s0 = _mm_loadu_si128((__m128i*)src + 0);
   112|     0|            d0 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 0)), bias, norm), scale, zero);
   113|     0|            d1 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 8)), bias, norm), scale, zero);
   114|     0|            s0 = _mm_loadu_si128((__m128i*)src + 1);
   115|     0|            d2 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 0)), bias, norm), scale, zero);
   116|     0|            d3 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 8)), bias, norm), scale, zero);
   117|     0|            u0 = Deinterleave8To64(PackI16ToU8(PackI32ToI16(d0, d1), PackI32ToI16(d2, d3)));
   118|     0|            s0 = _mm_loadu_si128((__m128i*)src + 2);
   119|     0|            d0 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 0)), bias, norm), scale, zero);
   120|     0|            d1 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 8)), bias, norm), scale, zero);
   121|     0|            s0 = _mm_loadu_si128((__m128i*)src + 3);
   122|     0|            d2 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 0)), bias, norm), scale, zero);
   123|     0|            d3 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 8)), bias, norm), scale, zero);
   124|     0|            u1 = Deinterleave8To64(PackI16ToU8(PackI32ToI16(d0, d1), PackI32ToI16(d2, d3)));
   125|     0|            _mm256_storeu_si256((__m256i*)dst0, Deinterleave64<0>(u0, u1));
   126|     0|            _mm256_storeu_si256((__m256i*)dst1, Deinterleave64<1>(u0, u1));
   127|     0|        }
   128|      |
   129|      |        void SynetQuantizedShuffleLayerForwardNhwc0(const uint8_t* src0, int bias0, float norm0, size_t srcC0,
   130|      |            const uint8_t* src1, int bias1, float norm1, size_t srcC1, size_t spatial, uint8_t* dst0, uint8_t* dst1, float scale, int zero)
   131|     0|        {
   132|     0|            size_t dstC = (srcC0 + srcC1) / 2, cs, cd, srcC0_16 = AlignLo(srcC0, 16), srcC1_16 = AlignLo(srcC1, 16), srcC0_64 = AlignLo(srcC0, 64), srcC1_64 = AlignLo(srcC1, 64);
   133|     0|            __m256i _bias0 = _mm256_set1_epi32(bias0), _bias1 = _mm256_set1_epi32(bias1), _zero = _mm256_set1_epi32(zero);
   134|     0|            __m256 _norm0 = _mm256_set1_ps(norm0), _norm1 = _mm256_set1_ps(norm1), _scale = _mm256_set1_ps(scale);
   135|     0|            for (size_t s = 0; s < spatial; ++s)
   136|     0|            {
   137|     0|                cd = 0, cs = 0;
   138|     0|                for (; cs < srcC0_64; cs += 64, cd += 32)
   139|     0|                    DequantizeQuantizeLinearNhwc0_32(src0 + cs, _bias0, _norm0, _scale, _zero, dst0 + cd, dst1 + cd);
   140|     0|                for (; cs < srcC0_16; cs += 16, cd += 8)
   141|     0|                    DequantizeQuantizeLinearNhwc0_8(src0 + cs, _bias0, _norm0, _scale, _zero, dst0 + cd, dst1 + cd);
   142|     0|                for (; cs < srcC0; cs += 2, cd += 1)
   143|     0|                    DequantizeQuantizeLinearNhwc0_1(src0 + cs, _bias0, _norm0, _scale, _zero, dst0 + cd, dst1 + cd);
   144|     0|                cs = 0;
   145|     0|                for (; cs < srcC1_64; cs += 64, cd += 32)
   146|     0|                    DequantizeQuantizeLinearNhwc0_32(src1 + cs, _bias1, _norm1, _scale, _zero, dst0 + cd, dst1 + cd);
   147|     0|                for (; cs < srcC1_16; cs += 16, cd += 8)
   148|     0|                    DequantizeQuantizeLinearNhwc0_8(src1 + cs, _bias1, _norm1, _scale, _zero, dst0 + cd, dst1 + cd);
   149|     0|                for (; cs < srcC1; cs += 2, cd += 1)
   150|     0|                    DequantizeQuantizeLinearNhwc0_1(src1 + cs, _bias1, _norm1, _scale, _zero, dst0 + cd, dst1 + cd);
   151|     0|                src0 += srcC0;
   152|     0|                src1 += srcC1;
   153|     0|                dst0 += dstC;
   154|     0|                dst1 += dstC;
   155|     0|            }
   156|     0|        }
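
Note: the Nhwc0 path performs the same type-0 shuffle per pixel across interleaved channels; the _1/_8/_32 helpers above route even input channels to dst0 and odd input channels to dst1 (Deinterleave8To64 and Deinterleave64 undo the byte interleaving left by the pack instructions). A per-pixel scalar reference, reusing the includes and the hypothetical DequantizeQuantizeScalar helper from the earlier note:

    // Hypothetical per-pixel reference for SynetQuantizedShuffleLayerForwardNhwc0:
    // even input channels go to dst0, odd input channels go to dst1.
    static void ShuffleNhwc0PixelScalar(const uint8_t* src0, int bias0, float norm0, size_t srcC0,
        const uint8_t* src1, int bias1, float norm1, size_t srcC1, uint8_t* dst0, uint8_t* dst1, float scale, int zero)
    {
        size_t cd = 0;
        for (size_t cs = 0; cs < srcC0; cs += 2, cd += 1)
        {
            dst0[cd] = DequantizeQuantizeScalar(src0[cs + 0], bias0, norm0, scale, zero);
            dst1[cd] = DequantizeQuantizeScalar(src0[cs + 1], bias0, norm0, scale, zero);
        }
        for (size_t cs = 0; cs < srcC1; cs += 2, cd += 1)
        {
            dst0[cd] = DequantizeQuantizeScalar(src1[cs + 0], bias1, norm1, scale, zero);
            dst1[cd] = DequantizeQuantizeScalar(src1[cs + 1], bias1, norm1, scale, zero);
        }
    }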
   157|      |
   158|      |        //--------------------------------------------------------------------------------------------------
   159|      |
   160|      |        void SynetQuantizedShuffleLayerForwardNchw1(const uint8_t* src0, int bias0, float norm0, size_t srcC0,
   161|      |            const uint8_t* src1, int bias1, float norm1, size_t srcC1, size_t spatial, uint8_t* dst0, uint8_t* dst1, float scale, int zero)
   162|     0|        {
   163|     0|            size_t dstC = (srcC0 + srcC1) / 2, cs = 0, spatial8 = AlignLo(spatial, 8), spatial32 = AlignLo(spatial, 32), s;
   164|     0|            __m256i _bias0 = _mm256_set1_epi32(bias0), _bias1 = _mm256_set1_epi32(bias1), _zero = _mm256_set1_epi32(zero);
   165|     0|            __m256 _norm0 = _mm256_set1_ps(norm0), _norm1 = _mm256_set1_ps(norm1), _scale = _mm256_set1_ps(scale);
   166|     0|            for (size_t cd = 0; cd < srcC0; cs += 1, cd += 2)
   167|     0|            {
   168|     0|                for (s = 0; s < spatial32; s += 32)
   169|     0|                    DequantizeQuantizeLinear32(src0 + s, _bias0, _norm0, _scale, _zero, dst0 + s);
   170|     0|                for (; s < spatial8; s += 8)
   171|     0|                    DequantizeQuantizeLinear8(src0 + s, _bias0, _norm0, _scale, _zero, dst0 + s);
   172|     0|                for (; s < spatial; s += 1)
   173|     0|                    DequantizeQuantizeLinear1(src0 + s, _bias0, _norm0, _scale, _zero, dst0 + s);
   174|     0|                src0 += spatial;
   175|     0|                dst0 += spatial;
   176|     0|                for (s = 0; s < spatial32; s += 32)
   177|     0|                    DequantizeQuantizeLinear32(src1 + s, _bias1, _norm1, _scale, _zero, dst0 + s);
   178|     0|                for (; s < spatial8; s += 8)
   179|     0|                    DequantizeQuantizeLinear8(src1 + s, _bias1, _norm1, _scale, _zero, dst0 + s);
   180|     0|                for (; s < spatial; s += 1)
   181|     0|                    DequantizeQuantizeLinear1(src1 + s, _bias1, _norm1, _scale, _zero, dst0 + s);
   182|     0|                src1 += spatial;
   183|     0|                dst0 += spatial;
   184|     0|            }
   185|     0|            for (size_t cd = 0; cd < srcC1; cs += 1, cd += 2)
   186|     0|            {
   187|     0|                for (s = 0; s < spatial32; s += 32)
   188|     0|                    DequantizeQuantizeLinear32(src0 + s, _bias0, _norm0, _scale, _zero, dst1 + s);
   189|     0|                for (; s < spatial8; s += 8)
   190|     0|                    DequantizeQuantizeLinear8(src0 + s, _bias0, _norm0, _scale, _zero, dst1 + s);
   191|     0|                for (; s < spatial; s += 1)
   192|     0|                    DequantizeQuantizeLinear1(src0 + s, _bias0, _norm0, _scale, _zero, dst1 + s);
   193|     0|                src0 += spatial;
   194|     0|                dst1 += spatial;
   195|     0|                for (s = 0; s < spatial32; s += 32)
   196|     0|                    DequantizeQuantizeLinear32(src1 + s, _bias1, _norm1, _scale, _zero, dst1 + s);
   197|     0|                for (; s < spatial8; s += 8)
   198|     0|                    DequantizeQuantizeLinear8(src1 + s, _bias1, _norm1, _scale, _zero, dst1 + s);
   199|     0|                for (; s < spatial; s += 1)
   200|     0|                    DequantizeQuantizeLinear1(src1 + s, _bias1, _norm1, _scale, _zero, dst1 + s);
   201|     0|                src1 += spatial;
   202|     0|                dst1 += spatial;
   203|     0|            }
   204|     0|        }
   205|      |
   206|      |        //--------------------------------------------------------------------------------------------------
   207|      |
   208|      |        SIMD_INLINE void DequantizeQuantizeLinearNhwc1_1(const uint8_t* src0, const uint8_t* src1, const __m256i& bias, const __m256& norm, const __m256& scale, const __m256i& zero, uint8_t* dst)
   209|     0|        {
   210|     0|            __m128i s0 = _mm_set1_epi8(src0[0]);
   211|     0|            __m128i s1 = _mm_set1_epi8(src1[0]);
   212|     0|            __m128i s01 = _mm_unpacklo_epi8(s0, s1);
   213|     0|            __m256i d0 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(s01), bias, norm), scale, zero);
   214|     0|            __m256i u0 = _mm256_packus_epi16(_mm256_packs_epi32(d0, K_ZERO), K_ZERO);
   215|     0|            ((uint16_t*)dst)[0] = _mm_cvtsi128_si32(_mm256_castsi256_si128(u0));
   216|     0|        }
   217|      |
   218|      |        SIMD_INLINE void DequantizeQuantizeLinearNhwc1_8(const uint8_t* src0, const uint8_t* src1, const __m256i& bias, const __m256& norm, const __m256& scale, const __m256i& zero, uint8_t* dst)
   219|     0|        {
   220|     0|            __m128i _src0 = _mm_loadl_epi64((__m128i*)src0);
   221|     0|            __m128i _src1 = _mm_loadl_epi64((__m128i*)src1);
   222|     0|            __m128i s0 = _mm_unpacklo_epi8(_src0, _src1);
   223|     0|            __m256i d0 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 0)), bias, norm), scale, zero);
   224|     0|            __m256i d1 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 8)), bias, norm), scale, zero);
   225|     0|            __m256i u0 = PackI16ToU8(PackI32ToI16(d0, d1), K_ZERO);
   226|     0|            _mm_storeu_si128((__m128i*)dst, _mm256_castsi256_si128(u0));
   227|     0|        }
   228|      |
   229|      |        SIMD_INLINE void DequantizeQuantizeLinearNhwc1_16(const uint8_t* src0, const uint8_t* src1, const __m256i& bias, const __m256& norm, const __m256& scale, const __m256i& zero, uint8_t* dst)
   230|     0|        {
   231|     0|            __m128i _src0, _src1, s0;
   232|     0|            __m256i d0, d1, d2, d3;
   233|     0|            _src0 = _mm_loadu_si128((__m128i*)src0 + 0);
   234|     0|            _src1 = _mm_loadu_si128((__m128i*)src1 + 0);
   235|     0|            s0 = _mm_unpacklo_epi8(_src0, _src1);
   236|     0|            d0 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 0)), bias, norm), scale, zero);
   237|     0|            d1 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 8)), bias, norm), scale, zero);
   238|     0|            s0 = _mm_unpackhi_epi8(_src0, _src1);
   239|     0|            d2 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 0)), bias, norm), scale, zero);
   240|     0|            d3 = QuantizeLinear(DequantizeLinear(_mm256_cvtepu8_epi32(_mm_srli_si128(s0, 8)), bias, norm), scale, zero);
   241|     0|            _mm256_storeu_si256((__m256i*)dst, PackI16ToU8(PackI32ToI16(d0, d1), PackI32ToI16(d2, d3)));
   242|     0|        }
   243|      |
   244|      |        void SynetQuantizedShuffleLayerForwardNhwc1(const uint8_t* src0, int bias0, float norm0, size_t srcC0,
   245|      |            const uint8_t* src1, int bias1, float norm1, size_t srcC1, size_t spatial, uint8_t* dst0, uint8_t* dst1, float scale, int zero)
   246|     0|        {
   247|     0|            size_t dstC = (srcC0 + srcC1) / 2, srcC0_16 = AlignLo(srcC0, 16), srcC1_16 = AlignLo(srcC1, 16), srcC0_32 = AlignLo(srcC0, 32), srcC1_32 = AlignLo(srcC1, 32);
   248|     0|            __m256i _bias01 = SetInt32(bias0, bias1), _zero = _mm256_set1_epi32(zero);
   249|     0|            __m256 _norm01 = SetFloat(norm0, norm1), _scale = _mm256_set1_ps(scale);
   250|     0|            for (size_t s = 0; s < spatial; ++s)
   251|     0|            {
   252|     0|                size_t cs = 0, cd = 0;
   253|     0|                for (; cd < srcC0_32; cd += 32, cs += 16)
   254|     0|                    DequantizeQuantizeLinearNhwc1_16(src0 + cs, src1 + cs, _bias01, _norm01, _scale, _zero, dst0 + cd);
   255|     0|                for (; cd < srcC0_16; cd += 16, cs += 8)
   256|     0|                    DequantizeQuantizeLinearNhwc1_8(src0 + cs, src1 + cs, _bias01, _norm01, _scale, _zero, dst0 + cd);
   257|     0|                for (; cd < srcC0; cd += 2, cs += 1)
   258|     0|                    DequantizeQuantizeLinearNhwc1_1(src0 + cs, src1 + cs, _bias01, _norm01, _scale, _zero, dst0 + cd);
   259|     0|                cd = 0;
   260|     0|                for (; cd < srcC1_32; cd += 32, cs += 16)
   261|     0|                    DequantizeQuantizeLinearNhwc1_16(src0 + cs, src1 + cs, _bias01, _norm01, _scale, _zero, dst1 + cd);
   262|     0|                for (; cd < srcC1_16; cd += 16, cs += 8)
   263|     0|                    DequantizeQuantizeLinearNhwc1_8(src0 + cs, src1 + cs, _bias01, _norm01, _scale, _zero, dst1 + cd);
   264|     0|                for (; cd < srcC1; cd += 2, cs += 1)
   265|     0|                    DequantizeQuantizeLinearNhwc1_1(src0 + cs, src1 + cs, _bias01, _norm01, _scale, _zero, dst1 + cd);
   266|     0|                src0 += dstC;
   267|     0|                src1 += dstC;
   268|     0|                dst0 += srcC0;
   269|     0|                dst1 += srcC1;
   270|     0|            }
   271|     0|        }
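
Note: shuffle type 1 is the inverse permutation: one channel from src0 and one from src1 are interleaved into consecutive output channels, which is why the helpers use _mm_unpacklo_epi8 on the two sources and why SetInt32/SetFloat build constants that appear to alternate bias0/bias1 and norm0/norm1 across lanes. The trailing pointer increments in SynetQuantizedShuffleLayerForwardNhwc1 show that here srcC0 and srcC1 bound the two outputs while each input row is dstC channels wide. A per-pixel scalar reference, under the same assumptions as the earlier sketches:

    // Hypothetical per-pixel reference for SynetQuantizedShuffleLayerForwardNhwc1:
    // output channels alternate between the two inputs (inverse of shuffle type 0).
    static void ShuffleNhwc1PixelScalar(const uint8_t* src0, int bias0, float norm0, size_t srcC0,
        const uint8_t* src1, int bias1, float norm1, size_t srcC1, uint8_t* dst0, uint8_t* dst1, float scale, int zero)
    {
        size_t cs = 0;
        for (size_t cd = 0; cd < srcC0; cd += 2, cs += 1)
        {
            dst0[cd + 0] = DequantizeQuantizeScalar(src0[cs], bias0, norm0, scale, zero);
            dst0[cd + 1] = DequantizeQuantizeScalar(src1[cs], bias1, norm1, scale, zero);
        }
        for (size_t cd = 0; cd < srcC1; cd += 2, cs += 1)
        {
            dst1[cd + 0] = DequantizeQuantizeScalar(src0[cs], bias0, norm0, scale, zero);
            dst1[cd + 1] = DequantizeQuantizeScalar(src1[cs], bias1, norm1, scale, zero);
        }
    }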
   272|      |
   273|      |        //--------------------------------------------------------------------------------------------------
   274|      |
   275|      |        void SynetQuantizedShuffleLayerForward(const uint8_t* src0, int bias0, const float* norm0, size_t srcC0, const uint8_t* src1, int bias1, const float* norm1, size_t srcC1,
   276|      |            size_t spatial, uint8_t* dst0, uint8_t* dst1, const float* scale, int zero, SimdTensorFormatType format, int shuffleType)
   277|     0|        {
   278|     0|            switch (shuffleType)
   279|     0|            {
   280|     0|            case 0:
   281|     0|                if (format == SimdTensorFormatNhwc)
   282|     0|                    SynetQuantizedShuffleLayerForwardNhwc0(src0, bias0, *norm0, srcC0, src1, bias1, *norm1, srcC1, spatial, dst0, dst1, *scale, zero);
   283|     0|                else
   284|     0|                    SynetQuantizedShuffleLayerForwardNchw0(src0, bias0, *norm0, srcC0, src1, bias1, *norm1, srcC1, spatial, dst0, dst1, *scale, zero);
   285|     0|                break;
   286|     0|            case 1:
   287|     0|                if (format == SimdTensorFormatNhwc)
   288|     0|                    SynetQuantizedShuffleLayerForwardNhwc1(src0, bias0, *norm0, srcC0, src1, bias1, *norm1, srcC1, spatial, dst0, dst1, *scale, zero);
   289|     0|                else
   290|     0|                    SynetQuantizedShuffleLayerForwardNchw1(src0, bias0, *norm0, srcC0, src1, bias1, *norm1, srcC1, spatial, dst0, dst1, *scale, zero);
   291|     0|                break;
   292|     0|            }
   293|     0|        }
   294|      |    }
   295|      |#endif
   296|      |}
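
Note: since the whole translation unit is uncovered, one test per (format, shuffleType) combination routed through the dispatcher above would cover all four branches. A minimal illustrative driver for one of them follows; the buffer shapes and constants are arbitrary, and the include path for the internal Avx2 declarations is an assumption:

    #include <cstdint>
    #include <vector>
    #include "Simd/SimdAvx2.h" // assumed header declaring the internal Avx2 entry points

    // Illustrative call that would exercise the NHWC, shuffle-type-0 branch.
    void CoverQuantizedShuffleNhwc0()
    {
        const size_t srcC0 = 64, srcC1 = 64, spatial = 16, dstC = (srcC0 + srcC1) / 2;
        std::vector<uint8_t> src0(srcC0 * spatial, 1), src1(srcC1 * spatial, 2);
        std::vector<uint8_t> dst0(dstC * spatial), dst1(dstC * spatial);
        float norm0 = 0.5f, norm1 = 0.25f, scale = 4.0f;
        Simd::Avx2::SynetQuantizedShuffleLayerForward(src0.data(), -128, &norm0, srcC0,
            src1.data(), -128, &norm1, srcC1, spatial, dst0.data(), dst1.data(),
            &scale, 128, SimdTensorFormatNhwc, 0);
    }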