/src/Simd/src/Simd/SimdAvx2DescrIntDec.cpp
Line | Count | Source |
1 | | /* |
2 | | * Simd Library (http://ermig1979.github.io/Simd). |
3 | | * |
4 | | * Copyright (c) 2011-2023 Yermalayeu Ihar. |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person obtaining a copy |
7 | | * of this software and associated documentation files (the "Software"), to deal |
8 | | * in the Software without restriction, including without limitation the rights |
9 | | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
10 | | * copies of the Software, and to permit persons to whom the Software is |
11 | | * furnished to do so, subject to the following conditions: |
12 | | * |
13 | | * The above copyright notice and this permission notice shall be included in |
14 | | * all copies or substantial portions of the Software. |
15 | | * |
16 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
17 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
18 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
19 | | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
20 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
21 | | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
22 | | * SOFTWARE. |
23 | | */ |
24 | | #include "Simd/SimdMemory.h" |
25 | | #include "Simd/SimdStore.h" |
26 | | #include "Simd/SimdExtract.h" |
27 | | #include "Simd/SimdArray.h" |
28 | | #include "Simd/SimdUnpack.h" |
29 | | #include "Simd/SimdDescrInt.h" |
30 | | #include "Simd/SimdDescrIntCommon.h" |
31 | | #include "Simd/SimdCpu.h" |
32 | | |
33 | | namespace Simd |
34 | | { |
35 | | #ifdef SIMD_AVX2_ENABLE |
36 | | namespace Avx2 |
37 | | { |
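 | | // Annotation (not part of the original source): Decode32f4..Decode32f7 share one unpack scheme. |
 | | // Each main-loop step broadcasts a 128-bit load, uses a byte shuffle (C4_SHFL..C7_SHFL) to place |
 | | // every packed code into its own 16-bit lane, multiplies by C4_MULLO..C7_MULLO to push the code |
 | | // to the top of the lane, and shifts right by (16 - depth) to leave the raw integer code. The |
 | | // codes are then widened to 32 bit, converted to float, and mapped to code * scale + shift with |
 | | // a single FMA. The loop bound AlignLo(size - 1, 16) deliberately leaves at least the last eight |
 | | // elements to the tail loop, whose Sse41::LoadLast8<depth> helper is designed not to read past |
 | | // the end of the packed input, unlike the full 16-byte _mm_loadu_si128 used in the main loop. |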
38 | | static void Decode32f4(const uint8_t* src, float scale, float shift, size_t size, float* dst) |
39 | 0 | { |
40 | 0 | assert(size % 8 == 0); |
41 | 0 | __m256 _scale = _mm256_set1_ps(scale); |
42 | 0 | __m256 _shift = _mm256_set1_ps(shift); |
43 | 0 | size_t i = 0, size16 = AlignLo(size - 1, 16); |
44 | 0 | for (; i < size16; i += 16) |
45 | 0 | { |
46 | 0 | __m256i s4 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)src)); |
47 | 0 | __m256i s16 = _mm256_srli_epi16(_mm256_mullo_epi16(_mm256_shuffle_epi8(s4, C4_SHFL), C4_MULLO), 12); |
48 | 0 | _mm256_storeu_ps(dst + 0, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 0))), _scale, _shift)); |
49 | 0 | _mm256_storeu_ps(dst + 8, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 1))), _scale, _shift)); |
50 | 0 | src += 8; |
51 | 0 | dst += 16; |
52 | 0 | } |
53 | 0 | for (; i < size; i += 8) |
54 | 0 | { |
55 | 0 | __m128i s16 = _mm_srli_epi16(_mm_mullo_epi16(_mm_shuffle_epi8(Sse41::LoadLast8<4>(src), Sse41::C4_SHFL0), Sse41::C4_MULLO), 12); |
56 | 0 | _mm256_storeu_ps(dst + 0, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(s16)), _scale, _shift)); |
57 | 0 | src += 4; |
58 | 0 | dst += 8; |
59 | 0 | } |
60 | 0 | } |
61 | | |
62 | | static void Decode32f5(const uint8_t* src, float scale, float shift, size_t size, float* dst) |
63 | 0 | { |
64 | 0 | assert(size % 8 == 0); |
65 | 0 | __m256 _scale = _mm256_set1_ps(scale); |
66 | 0 | __m256 _shift = _mm256_set1_ps(shift); |
67 | 0 | size_t i = 0, size16 = AlignLo(size - 1, 16); |
68 | 0 | for (; i < size16; i += 16) |
69 | 0 | { |
70 | 0 | __m256i s5 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)src)); |
71 | 0 | __m256i s16 = _mm256_srli_epi16(_mm256_mullo_epi16(_mm256_shuffle_epi8(s5, C5_SHFL), C5_MULLO), 11); |
72 | 0 | _mm256_storeu_ps(dst + 0, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 0))), _scale, _shift)); |
73 | 0 | _mm256_storeu_ps(dst + 8, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 1))), _scale, _shift)); |
74 | 0 | src += 10; |
75 | 0 | dst += 16; |
76 | 0 | } |
77 | 0 | for (; i < size; i += 8) |
78 | 0 | { |
79 | 0 | __m128i s16 = _mm_srli_epi16(_mm_mullo_epi16(_mm_shuffle_epi8(Sse41::LoadLast8<5>(src), Sse41::C5_SHFL0), Sse41::C5_MULLO), 11); |
80 | 0 | _mm256_storeu_ps(dst + 0, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(s16)), _scale, _shift)); |
81 | 0 | src += 5; |
82 | 0 | dst += 8; |
83 | 0 | } |
84 | 0 | } |
85 | | |
86 | | static void Decode32f6(const uint8_t* src, float scale, float shift, size_t size, float* dst) |
87 | 0 | { |
88 | 0 | assert(size % 8 == 0); |
89 | 0 | __m256 _scale = _mm256_set1_ps(scale); |
90 | 0 | __m256 _shift = _mm256_set1_ps(shift); |
91 | 0 | size_t i = 0, size16 = AlignLo(size - 1, 16); |
92 | 0 | for (; i < size16; i += 16) |
93 | 0 | { |
94 | 0 | __m256i s6 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)src)); |
95 | 0 | __m256i s16 = _mm256_srli_epi16(_mm256_mullo_epi16(_mm256_shuffle_epi8(s6, C6_SHFL), C6_MULLO), 10); |
96 | 0 | _mm256_storeu_ps(dst + 0, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 0))), _scale, _shift)); |
97 | 0 | _mm256_storeu_ps(dst + 8, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 1))), _scale, _shift)); |
98 | 0 | src += 12; |
99 | 0 | dst += 16; |
100 | 0 | } |
101 | 0 | for (; i < size; i += 8) |
102 | 0 | { |
103 | 0 | __m128i s16 = _mm_srli_epi16(_mm_mullo_epi16(_mm_shuffle_epi8(Sse41::LoadLast8<6>(src), Sse41::C6_SHFL0), Sse41::C6_MULLO), 10); |
104 | 0 | _mm256_storeu_ps(dst + 0, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(s16)), _scale, _shift)); |
105 | 0 | src += 6; |
106 | 0 | dst += 8; |
107 | 0 | } |
108 | 0 | } |
109 | | |
110 | | static void Decode32f7(const uint8_t* src, float scale, float shift, size_t size, float* dst) |
111 | 0 | { |
112 | 0 | assert(size % 8 == 0); |
113 | 0 | __m256 _scale = _mm256_set1_ps(scale); |
114 | 0 | __m256 _shift = _mm256_set1_ps(shift); |
115 | 0 | size_t i = 0, size16 = AlignLo(size - 1, 16); |
116 | 0 | for (; i < size16; i += 16) |
117 | 0 | { |
118 | 0 | __m256i s7 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)src)); |
119 | 0 | __m256i s16 = _mm256_srli_epi16(_mm256_mullo_epi16(_mm256_shuffle_epi8(s7, C7_SHFL), C7_MULLO), 9); |
120 | 0 | _mm256_storeu_ps(dst + 0, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 0))), _scale, _shift)); |
121 | 0 | _mm256_storeu_ps(dst + 8, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 1))), _scale, _shift)); |
122 | 0 | src += 14; |
123 | 0 | dst += 16; |
124 | 0 | } |
125 | 0 | for (; i < size; i += 8) |
126 | 0 | { |
127 | 0 | __m128i s16 = _mm_srli_epi16(_mm_mullo_epi16(_mm_shuffle_epi8(Sse41::LoadLast8<7>(src), Sse41::C7_SHFL0), Sse41::C7_MULLO), 9); |
128 | 0 | _mm256_storeu_ps(dst + 0, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(s16)), _scale, _shift)); |
129 | 0 | src += 7; |
130 | 0 | dst += 8; |
131 | 0 | } |
132 | 0 | } |
133 | | |
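 | | // Annotation: the 8-bit path needs no shuffle/multiply unpacking; bytes are widened directly |
 | | // with _mm256_cvtepu8_epi32, and each main-loop step reads exactly the 16 bytes it consumes, |
 | | // so the loop can cover the full AlignLo(size, 16) range (F is the 8-float width of __m256). |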
134 | | static void Decode32f8(const uint8_t* src, float scale, float shift, size_t size, float* dst) |
135 | 0 | { |
136 | 0 | assert(size % 8 == 0); |
137 | 0 | __m256 _scale = _mm256_set1_ps(scale); |
138 | 0 | __m256 _shift = _mm256_set1_ps(shift); |
139 | 0 | size_t i = 0, size16 = AlignLo(size, 16); |
140 | 0 | for (; i < size16; i += 16) |
141 | 0 | { |
142 | 0 | __m128i u8 = _mm_loadu_si128((__m128i*)(src + i)); |
143 | 0 | _mm256_storeu_ps(dst + i + 0, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(u8)), _scale, _shift)); |
144 | 0 | _mm256_storeu_ps(dst + i + F, _mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_srli_si128(u8, 8))), _scale, _shift)); |
145 | 0 | } |
146 | 0 | for (; i < size; i += 8) |
147 | 0 | { |
148 | 0 | __m256 _src = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)(src + i)))); |
149 | 0 | _mm256_storeu_ps(dst + i, _mm256_fmadd_ps(_src, _scale, _shift)); |
150 | 0 | } |
151 | 0 | } |
152 | | |
153 | | //------------------------------------------------------------------------------------------------- |
154 | | |
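 | | // Annotation: the Decode16f* variants repeat the Decode32f* unpacking, then convert each batch |
 | | // of 8 floats to half precision with _mm256_cvtps_ph (imm8 = 0, round-to-nearest-even) and |
 | | // store the result as 8 uint16_t values per 128-bit store. |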
155 | | static void Decode16f4(const uint8_t* src, float scale, float shift, size_t size, uint16_t* dst) |
156 | 0 | { |
157 | 0 | assert(size % 8 == 0); |
158 | 0 | __m256 _scale = _mm256_set1_ps(scale); |
159 | 0 | __m256 _shift = _mm256_set1_ps(shift); |
160 | 0 | size_t i = 0, size16 = AlignLo(size - 1, 16); |
161 | 0 | for (; i < size16; i += 16) |
162 | 0 | { |
163 | 0 | __m256i s4 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)src)); |
164 | 0 | __m256i s16 = _mm256_srli_epi16(_mm256_mullo_epi16(_mm256_shuffle_epi8(s4, C4_SHFL), C4_MULLO), 12); |
165 | 0 | _mm_storeu_si128((__m128i*)dst + 0, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 0))), _scale, _shift), 0)); |
166 | 0 | _mm_storeu_si128((__m128i*)dst + 1, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 1))), _scale, _shift), 0)); |
167 | 0 | src += 8; |
168 | 0 | dst += 16; |
169 | 0 | } |
170 | 0 | for (; i < size; i += 8) |
171 | 0 | { |
172 | 0 | __m128i s16 = _mm_srli_epi16(_mm_mullo_epi16(_mm_shuffle_epi8(Sse41::LoadLast8<4>(src), Sse41::C4_SHFL0), Sse41::C4_MULLO), 12); |
173 | 0 | _mm_storeu_si128((__m128i*)dst, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(s16)), _scale, _shift), 0)); |
174 | 0 | src += 4; |
175 | 0 | dst += 8; |
176 | 0 | } |
177 | 0 | } |
178 | | |
179 | | static void Decode16f5(const uint8_t* src, float scale, float shift, size_t size, uint16_t* dst) |
180 | 0 | { |
181 | 0 | assert(size % 8 == 0); |
182 | 0 | __m256 _scale = _mm256_set1_ps(scale); |
183 | 0 | __m256 _shift = _mm256_set1_ps(shift); |
184 | 0 | size_t i = 0, size16 = AlignLo(size - 1, 16); |
185 | 0 | for (; i < size16; i += 16) |
186 | 0 | { |
187 | 0 | __m256i s5 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)src)); |
188 | 0 | __m256i s16 = _mm256_srli_epi16(_mm256_mullo_epi16(_mm256_shuffle_epi8(s5, C5_SHFL), C5_MULLO), 11); |
189 | 0 | _mm_storeu_si128((__m128i*)dst + 0, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 0))), _scale, _shift), 0)); |
190 | 0 | _mm_storeu_si128((__m128i*)dst + 1, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 1))), _scale, _shift), 0)); |
191 | 0 | src += 10; |
192 | 0 | dst += 16; |
193 | 0 | } |
194 | 0 | for (; i < size; i += 8) |
195 | 0 | { |
196 | 0 | __m128i s16 = _mm_srli_epi16(_mm_mullo_epi16(_mm_shuffle_epi8(Sse41::LoadLast8<5>(src), Sse41::C5_SHFL0), Sse41::C5_MULLO), 11); |
197 | 0 | _mm_storeu_si128((__m128i*)dst, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(s16)), _scale, _shift), 0)); |
198 | 0 | src += 5; |
199 | 0 | dst += 8; |
200 | 0 | } |
201 | 0 | } |
202 | | |
203 | | static void Decode16f6(const uint8_t* src, float scale, float shift, size_t size, uint16_t* dst) |
204 | 0 | { |
205 | 0 | assert(size % 8 == 0); |
206 | 0 | __m256 _scale = _mm256_set1_ps(scale); |
207 | 0 | __m256 _shift = _mm256_set1_ps(shift); |
208 | 0 | size_t i = 0, size16 = AlignLo(size - 1, 16); |
209 | 0 | for (; i < size16; i += 16) |
210 | 0 | { |
211 | 0 | __m256i s6 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)src)); |
212 | 0 | __m256i s16 = _mm256_srli_epi16(_mm256_mullo_epi16(_mm256_shuffle_epi8(s6, C6_SHFL), C6_MULLO), 10); |
213 | 0 | _mm_storeu_si128((__m128i*)dst + 0, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 0))), _scale, _shift), 0)); |
214 | 0 | _mm_storeu_si128((__m128i*)dst + 1, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 1))), _scale, _shift), 0)); |
215 | 0 | src += 12; |
216 | 0 | dst += 16; |
217 | 0 | } |
218 | 0 | for (; i < size; i += 8) |
219 | 0 | { |
220 | 0 | __m128i s16 = _mm_srli_epi16(_mm_mullo_epi16(_mm_shuffle_epi8(Sse41::LoadLast8<6>(src), Sse41::C6_SHFL0), Sse41::C6_MULLO), 10); |
221 | 0 | _mm_storeu_si128((__m128i*)dst, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(s16)), _scale, _shift), 0)); |
222 | 0 | src += 6; |
223 | 0 | dst += 8; |
224 | 0 | } |
225 | 0 | } |
226 | | |
227 | | static void Decode16f7(const uint8_t* src, float scale, float shift, size_t size, uint16_t* dst) |
228 | 0 | { |
229 | 0 | assert(size % 8 == 0); |
230 | 0 | __m256 _scale = _mm256_set1_ps(scale); |
231 | 0 | __m256 _shift = _mm256_set1_ps(shift); |
232 | 0 | size_t i = 0, size16 = AlignLo(size - 1, 16); |
233 | 0 | for (; i < size16; i += 16) |
234 | 0 | { |
235 | 0 | __m256i s7 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)src)); |
236 | 0 | __m256i s16 = _mm256_srli_epi16(_mm256_mullo_epi16(_mm256_shuffle_epi8(s7, C7_SHFL), C7_MULLO), 9); |
237 | 0 | _mm_storeu_si128((__m128i*)dst + 0, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 0))), _scale, _shift), 0)); |
238 | 0 | _mm_storeu_si128((__m128i*)dst + 1, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256(s16, 1))), _scale, _shift), 0)); |
239 | 0 | src += 14; |
240 | 0 | dst += 16; |
241 | 0 | } |
242 | 0 | for (; i < size; i += 8) |
243 | 0 | { |
244 | 0 | __m128i s16 = _mm_srli_epi16(_mm_mullo_epi16(_mm_shuffle_epi8(Sse41::LoadLast8<7>(src), Sse41::C7_SHFL0), Sse41::C7_MULLO), 9); |
245 | 0 | _mm_storeu_si128((__m128i*)dst, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(s16)), _scale, _shift), 0)); |
246 | 0 | src += 7; |
247 | 0 | dst += 8; |
248 | 0 | } |
249 | 0 | } |
250 | | |
251 | | static void Decode16f8(const uint8_t* src, float scale, float shift, size_t size, uint16_t* dst) |
252 | 0 | { |
253 | 0 | assert(size % 8 == 0); |
254 | 0 | __m256 _scale = _mm256_set1_ps(scale); |
255 | 0 | __m256 _shift = _mm256_set1_ps(shift); |
256 | 0 | size_t i = 0, size16 = AlignLo(size, 16); |
257 | 0 | for (; i < size16; i += 16) |
258 | 0 | { |
259 | 0 | __m128i u8 = _mm_loadu_si128((__m128i*)(src + i)); |
260 | 0 | _mm_storeu_si128((__m128i*)(dst + i) + 0, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(u8)), _scale, _shift), 0)); |
261 | 0 | _mm_storeu_si128((__m128i*)(dst + i) + 1, _mm256_cvtps_ph(_mm256_fmadd_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_srli_si128(u8, 8))), _scale, _shift), 0)); |
262 | 0 | } |
263 | 0 | for (; i < size; i += 8) |
264 | 0 | { |
265 | 0 | __m256 _src = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)(src + i)))); |
266 | 0 | _mm_storeu_si128((__m128i*)(dst + i), _mm256_cvtps_ph(_mm256_fmadd_ps(_src, _scale, _shift), 0)); |
267 | 0 | } |
268 | 0 | } |
269 | | |
270 | | //------------------------------------------------------------------------------------------------- |
271 | | |
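 | | // Annotation: the getters below map the DescrInt bit depth (4..8 bits per component) to the |
 | | // matching AVX2 decoder; any other depth is unsupported here and trips the assert. |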
272 | | Base::DescrInt::Decode32fPtr GetDecode32f(size_t depth) |
273 | 0 | { |
274 | 0 | switch (depth) |
275 | 0 | { |
276 | 0 | case 4: return Decode32f4; |
277 | 0 | case 5: return Decode32f5; |
278 | 0 | case 6: return Decode32f6; |
279 | 0 | case 7: return Decode32f7; |
280 | 0 | case 8: return Decode32f8; |
281 | 0 | default: assert(0); return NULL; |
282 | 0 | } |
283 | 0 | } |
284 | | |
285 | | Base::DescrInt::Decode16fPtr GetDecode16f(size_t depth) |
286 | 0 | { |
287 | 0 | switch (depth) |
288 | 0 | { |
289 | 0 | case 4: return Decode16f4; |
290 | 0 | case 5: return Decode16f5; |
291 | 0 | case 6: return Decode16f6; |
292 | 0 | case 7: return Decode16f7; |
293 | 0 | case 8: return Decode16f8; |
294 | 0 | default: assert(0); return NULL; |
295 | 0 | } |
296 | 0 | } |
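 | | // Hypothetical usage sketch (annotation; `packed`, `dst`, `scale` and `shift` are assumed to |
 | | // come from a DescrInt-encoded descriptor and are not defined in this file): |
 | | //   Base::DescrInt::Decode32fPtr decode = GetDecode32f(6); |
 | | //   decode(packed, scale, shift, 256, dst); // 256 components * 6 bits = 192 packed bytes |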
297 | | } |
298 | | #endif |
299 | | } |