Coverage Report

Created: 2026-04-09 07:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/Simd/src/Simd/SimdAvx2Reduce.cpp
Line
Count
Source
1
/*
2
* Simd Library (http://ermig1979.github.io/Simd).
3
*
4
* Copyright (c) 2011-2020 Yermalayeu Ihar.
5
*
6
* Permission is hereby granted, free of charge, to any person obtaining a copy
7
* of this software and associated documentation files (the "Software"), to deal
8
* in the Software without restriction, including without limitation the rights
9
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
* copies of the Software, and to permit persons to whom the Software is
11
* furnished to do so, subject to the following conditions:
12
*
13
* The above copyright notice and this permission notice shall be included in
14
* all copies or substantial portions of the Software.
15
*
16
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
* SOFTWARE.
23
*/
24
#include "Simd/SimdMemory.h"
25
#include "Simd/SimdStore.h"
26
27
namespace Simd
28
{
29
#ifdef SIMD_AVX2_ENABLE    
30
    namespace Avx2
31
    {
32
#ifdef SIMD_MADDUBS_ERROR
33
        // Rounded average of a 2x2 block of 8-bit samples, fallback path for
        // compilers where _mm256_maddubs_epi16 misbehaves (SIMD_MADDUBS_ERROR):
        // even/odd bytes are widened to 16-bit lanes with masks and byte shifts
        // instead of maddubs, then averaged and packed back to 8-bit.
        // NOTE(review): calls a 4-argument Average16 overload declared elsewhere
        // in the project; _mm256_srli_si256 shifts within each 128-bit lane,
        // which matches the per-lane byte pairing assumed here.
        SIMD_INLINE __m256i Average8(const __m256i & s00, const __m256i & s01, const __m256i & s10, const __m256i & s11)
        {
            __m256i lo = Average16(
                _mm256_and_si256(s00, K16_00FF),                        // even bytes of row 0, low half
                _mm256_and_si256(_mm256_srli_si256(s00, 1), K16_00FF),  // odd bytes of row 0, low half
                _mm256_and_si256(s10, K16_00FF),                        // even bytes of row 1, low half
                _mm256_and_si256(_mm256_srli_si256(s10, 1), K16_00FF)); // odd bytes of row 1, low half
            __m256i hi = Average16(
                _mm256_and_si256(s01, K16_00FF),
                _mm256_and_si256(_mm256_srli_si256(s01, 1), K16_00FF),
                _mm256_and_si256(s11, K16_00FF),
                _mm256_and_si256(_mm256_srli_si256(s11, 1), K16_00FF));
            return PackI16ToU8(lo, hi); // pack 16-bit averages back to unsigned bytes
        }
47
#else
48
        // Sums horizontally adjacent byte pairs of s0 and s1 with maddubs
        // (unit weights K8_01 give b[2i] + b[2i+1] in each 16-bit lane), adds
        // the two row sums plus a rounding bias of 2 (K16_0002) and shifts
        // right by 2: the rounded mean of four 8-bit samples, left in 16-bit lanes.
        SIMD_INLINE __m256i Average16(const __m256i & s0, const __m256i & s1)
        {
            return _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(s0, K8_01), _mm256_maddubs_epi16(s1, K8_01)), K16_0002), 2);
        }
52
53
        // Rounded 2x2 average of 8-bit samples: sRC holds row R, 32-byte
        // column chunk C. Averages the low chunks (s00/s10) and high chunks
        // (s01/s11) into 16-bit lanes, then packs both results into one
        // 8-bit output vector.
        SIMD_INLINE __m256i Average8(const __m256i & s00, const __m256i & s01, const __m256i & s10, const __m256i & s11)
        {
            return PackI16ToU8(Average16(s00, s10), Average16(s01, s11));
        }
57
#endif
58
59
        // Channel-aware 2x2 average: each specialization reorders bytes so the
        // two horizontally adjacent samples of every channel become a
        // neighboring byte pair before the plain byte-pair Average8 is applied.
        template <size_t channelCount> __m256i Average8(const __m256i & s00, const __m256i & s01, const __m256i & s10, const __m256i & s11);
60
61
        // 1-channel (gray) case: adjacent bytes already are adjacent pixels,
        // so no reordering is needed before the byte-pair average.
        template<> SIMD_INLINE __m256i Average8<1>(const __m256i & s00, const __m256i & s01, const __m256i & s10, const __m256i & s11)
        {
            return Average8(s00, s01, s10, s11);
        }
65
66
        // Shuffle mask for 2-channel pixels: within every 4-byte group the two
        // middle bytes are swapped (0,2,1,3, ...), so the same channel of two
        // horizontally adjacent pixels ends up as a neighboring byte pair.
        const __m256i K8_RC2 = SIMD_MM256_SETR_EPI8(
            0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF,
            0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF);
69
70
        // 2-channel case: regroup bytes with K8_RC2 so matching channels of
        // neighboring pixels sit side by side, then do the byte-pair average.
        template<> SIMD_INLINE __m256i Average8<2>(const __m256i & s00, const __m256i & s01, const __m256i & s10, const __m256i & s11)
        {
            const __m256i r00 = _mm256_shuffle_epi8(s00, K8_RC2);
            const __m256i r01 = _mm256_shuffle_epi8(s01, K8_RC2);
            const __m256i r10 = _mm256_shuffle_epi8(s10, K8_RC2);
            const __m256i r11 = _mm256_shuffle_epi8(s11, K8_RC2);
            return Average8(r00, r01, r10, r11);
        }
74
75
        // Shuffle mask for 4-channel pixels: within every 8-byte group bytes
        // are interleaved (0,4,1,5,2,6,3,7, ...), pairing the same channel of
        // two horizontally adjacent pixels.
        const __m256i K8_RC4 = SIMD_MM256_SETR_EPI8(
            0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF,
            0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF);
78
79
        // 4-channel case: regroup bytes with K8_RC4 so matching channels of
        // neighboring pixels sit side by side, then do the byte-pair average.
        template<> SIMD_INLINE __m256i Average8<4>(const __m256i & s00, const __m256i & s01, const __m256i & s10, const __m256i & s11)
        {
            const __m256i r00 = _mm256_shuffle_epi8(s00, K8_RC4);
            const __m256i r01 = _mm256_shuffle_epi8(s01, K8_RC4);
            const __m256i r10 = _mm256_shuffle_epi8(s10, K8_RC4);
            const __m256i r11 = _mm256_shuffle_epi8(s11, K8_RC4);
            return Average8(r00, r01, r10, r11);
        }
83
84
        // Reduces one 64-byte-wide 2x2 block: loads two vectors from each of
        // the two source rows, averages them channel-aware and stores a single
        // 32-byte destination vector.
        template <size_t channelCount, bool align> SIMD_INLINE void ReduceColor2x2(const uint8_t * src0, const uint8_t * src1, uint8_t * dst)
        {
            const __m256i row0lo = Load<align>((__m256i*)src0 + 0);
            const __m256i row0hi = Load<align>((__m256i*)src0 + 1);
            const __m256i row1lo = Load<align>((__m256i*)src1 + 0);
            const __m256i row1hi = Load<align>((__m256i*)src1 + 1);
            Store<align>((__m256i*)dst, Average8<channelCount>(row0lo, row0hi, row1lo, row1hi));
        }
Unexecuted instantiation: void Simd::Avx2::ReduceColor2x2<1ul, true>(unsigned char const*, unsigned char const*, unsigned char*)
Unexecuted instantiation: void Simd::Avx2::ReduceColor2x2<1ul, false>(unsigned char const*, unsigned char const*, unsigned char*)
Unexecuted instantiation: void Simd::Avx2::ReduceColor2x2<2ul, true>(unsigned char const*, unsigned char const*, unsigned char*)
Unexecuted instantiation: void Simd::Avx2::ReduceColor2x2<2ul, false>(unsigned char const*, unsigned char const*, unsigned char*)
Unexecuted instantiation: void Simd::Avx2::ReduceColor2x2<4ul, true>(unsigned char const*, unsigned char const*, unsigned char*)
Unexecuted instantiation: void Simd::Avx2::ReduceColor2x2<4ul, false>(unsigned char const*, unsigned char const*, unsigned char*)
92
93
        template <size_t channelCount, bool align> void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstStride)
94
0
        {
95
0
            size_t evenWidth = AlignLo(srcWidth, 2);
96
0
            size_t evenSize = evenWidth * channelCount;
97
0
            size_t alignedSize = AlignLo(evenSize, DA);
98
0
            for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2)
99
0
            {
100
0
                const uint8_t *src0 = src;
101
0
                const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride);
102
0
                size_t srcOffset = 0, dstOffset = 0;
103
0
                for (; srcOffset < alignedSize; srcOffset += DA, dstOffset += A)
104
0
                    ReduceColor2x2<channelCount, align>(src0 + srcOffset, src1 + srcOffset, dst + dstOffset);
105
0
                if (alignedSize != evenSize)
106
0
                {
107
0
                    srcOffset = evenSize - DA;
108
0
                    dstOffset = srcOffset / 2;
109
0
                    ReduceColor2x2<channelCount, false>(src0 + srcOffset, src1 + srcOffset, dst + dstOffset);
110
0
                }
111
0
                if (evenWidth != srcWidth)
112
0
                {
113
0
                    for (size_t c = 0; c < channelCount; ++c)
114
0
                        dst[evenSize / 2 + c] = Base::Average(src0[evenSize + c], src1[evenSize + c]);
115
0
                }
116
0
                src += 2 * srcStride;
117
0
                dst += dstStride;
118
0
            }
119
0
        }
Unexecuted instantiation: void Simd::Avx2::ReduceColor2x2<1ul, true>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char*, unsigned long)
Unexecuted instantiation: void Simd::Avx2::ReduceColor2x2<2ul, true>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char*, unsigned long)
Unexecuted instantiation: void Simd::Avx2::ReduceColor2x2<4ul, true>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char*, unsigned long)
Unexecuted instantiation: void Simd::Avx2::ReduceColor2x2<1ul, false>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char*, unsigned long)
Unexecuted instantiation: void Simd::Avx2::ReduceColor2x2<2ul, false>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char*, unsigned long)
Unexecuted instantiation: void Simd::Avx2::ReduceColor2x2<4ul, false>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char*, unsigned long)
120
121
        // Shuffle/merge masks used by ReduceBgr2x2 to regroup interleaved BGR
        // bytes so that the two samples of each channel of a horizontal pixel
        // pair become a neighboring byte pair. _mm256_shuffle_epi8 cannot move
        // bytes across 128-bit lanes, so bytes sourced from a neighboring lane
        // come from a second/third mask applied to a lane-permuted copy;
        // entries of -1 (high bit set) produce zero bytes and the partial
        // results are combined with bitwise OR.
        const __m256i K8_BGR0 = SIMD_MM256_SETR_EPI8(
            0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1,
            -1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1);
        const __m256i K8_BGR1 = SIMD_MM256_SETR_EPI8(
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
        const __m256i K8_BGR2 = SIMD_MM256_SETR_EPI8(
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1);
        const __m256i K8_BGR3 = SIMD_MM256_SETR_EPI8(
            -1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF,
            0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1);
        const __m256i K8_BGR4 = SIMD_MM256_SETR_EPI8(
            0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
        const __m256i K8_BGR5 = SIMD_MM256_SETR_EPI8(
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0);
        const __m256i K8_BGR6 = SIMD_MM256_SETR_EPI8(
            -1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1,
            -1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF);
        const __m256i K8_BGR7 = SIMD_MM256_SETR_EPI8(
            0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
        const __m256i K8_BGR8 = SIMD_MM256_SETR_EPI8(
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
148
149
        // Reduces 2x2 blocks of a BGR image: consumes 6 vectors (192 bytes =
        // 64 BGR pixels) from each of the two source rows and stores 3 vectors
        // (96 bytes = 32 BGR pixels) of channel-wise rounded averages.
        // Naming: sRCC = source row R, 128-bit lane chunks CC (hex digits);
        // mRN = row R's N-th 32-byte group with bytes regrouped by the K8_BGR*
        // masks so matching channels of adjacent pixels are byte pairs.
        // NOTE(review): statement order is significant — the lane-permuted
        // s034/s078/s09a values are reused across the three output sections.
        template <bool align> SIMD_INLINE void ReduceBgr2x2(const uint8_t * src0, const uint8_t * src1, uint8_t * dst)
        {
            // First 2 output groups: needs lanes 0..2 of both rows.
            __m256i s001 = Load<align>((__m256i*)src0 + 0);
            __m256i s023 = Load<align>((__m256i*)src0 + 1);
            __m256i s045 = Load<align>((__m256i*)src0 + 2);
            __m256i s101 = Load<align>((__m256i*)src1 + 0);
            __m256i s123 = Load<align>((__m256i*)src1 + 1);
            __m256i s145 = Load<align>((__m256i*)src1 + 2);
            // Build lane-shifted copies so the in-lane shuffles can borrow
            // bytes from the neighboring 128-bit lane.
            __m256i s000 = _mm256_permute2x128_si256(s001, s001, 0x00);
            __m256i s100 = _mm256_permute2x128_si256(s101, s101, 0x00);
            __m256i s012 = _mm256_permute2x128_si256(s001, s023, 0x21);
            __m256i s112 = _mm256_permute2x128_si256(s101, s123, 0x21);
            __m256i s034 = _mm256_permute2x128_si256(s023, s045, 0x21);
            __m256i s134 = _mm256_permute2x128_si256(s123, s145, 0x21);
            // Regroup BGR bytes (three ORed partial shuffles per group).
            __m256i m00 = _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s001, K8_BGR0), _mm256_shuffle_epi8(s000, K8_BGR1)), _mm256_shuffle_epi8(s012, K8_BGR2));
            __m256i m01 = _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s023, K8_BGR3), _mm256_shuffle_epi8(s012, K8_BGR4)), _mm256_shuffle_epi8(s034, K8_BGR5));
            __m256i m10 = _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s101, K8_BGR0), _mm256_shuffle_epi8(s100, K8_BGR1)), _mm256_shuffle_epi8(s112, K8_BGR2));
            __m256i m11 = _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s123, K8_BGR3), _mm256_shuffle_epi8(s112, K8_BGR4)), _mm256_shuffle_epi8(s134, K8_BGR5));
            Store<align>((__m256i*)dst + 0, Average8(m00, m01, m10, m11));
            // Second output vector: lanes 4..8 of both rows.
            __m256i s067 = Load<align>((__m256i*)src0 + 3);
            __m256i s089 = Load<align>((__m256i*)src0 + 4);
            __m256i s167 = Load<align>((__m256i*)src1 + 3);
            __m256i s189 = Load<align>((__m256i*)src1 + 4);
            __m256i s056 = _mm256_permute2x128_si256(s045, s067, 0x21);
            __m256i s156 = _mm256_permute2x128_si256(s145, s167, 0x21);
            __m256i s078 = _mm256_permute2x128_si256(s067, s089, 0x21);
            __m256i s178 = _mm256_permute2x128_si256(s167, s189, 0x21);
            __m256i m02 = _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s045, K8_BGR6), _mm256_shuffle_epi8(s034, K8_BGR7)), _mm256_shuffle_epi8(s056, K8_BGR8));
            __m256i m03 = _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s067, K8_BGR0), _mm256_shuffle_epi8(s056, K8_BGR1)), _mm256_shuffle_epi8(s078, K8_BGR2));
            __m256i m12 = _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s145, K8_BGR6), _mm256_shuffle_epi8(s134, K8_BGR7)), _mm256_shuffle_epi8(s156, K8_BGR8));
            __m256i m13 = _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s167, K8_BGR0), _mm256_shuffle_epi8(s156, K8_BGR1)), _mm256_shuffle_epi8(s178, K8_BGR2));
            Store<align>((__m256i*)dst + 1, Average8(m02, m03, m12, m13));
            // Third output vector: lanes 8..B; 0x33 duplicates the final lane
            // so the last group has a (zero-contributing) neighbor to borrow from.
            __m256i s0ab = Load<align>((__m256i*)src0 + 5);
            __m256i s1ab = Load<align>((__m256i*)src1 + 5);
            __m256i s09a = _mm256_permute2x128_si256(s089, s0ab, 0x21);
            __m256i s19a = _mm256_permute2x128_si256(s189, s1ab, 0x21);
            __m256i s0bb = _mm256_permute2x128_si256(s0ab, s0ab, 0x33);
            __m256i s1bb = _mm256_permute2x128_si256(s1ab, s1ab, 0x33);
            __m256i m04 = _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s089, K8_BGR3), _mm256_shuffle_epi8(s078, K8_BGR4)), _mm256_shuffle_epi8(s09a, K8_BGR5));
            __m256i m05 = _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s0ab, K8_BGR6), _mm256_shuffle_epi8(s09a, K8_BGR7)), _mm256_shuffle_epi8(s0bb, K8_BGR8));
            __m256i m14 = _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s189, K8_BGR3), _mm256_shuffle_epi8(s178, K8_BGR4)), _mm256_shuffle_epi8(s19a, K8_BGR5));
            __m256i m15 = _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s1ab, K8_BGR6), _mm256_shuffle_epi8(s19a, K8_BGR7)), _mm256_shuffle_epi8(s1bb, K8_BGR8));
            Store<align>((__m256i*)dst + 2, Average8(m04, m05, m14, m15));
        }
Unexecuted instantiation: void Simd::Avx2::ReduceBgr2x2<true>(unsigned char const*, unsigned char const*, unsigned char*)
Unexecuted instantiation: void Simd::Avx2::ReduceBgr2x2<false>(unsigned char const*, unsigned char const*, unsigned char*)
193
194
        template <bool align> void ReduceBgr2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstStride)
195
0
        {
196
0
            size_t evenWidth = AlignLo(srcWidth, 2);
197
0
            size_t alignedWidth = AlignLo(srcWidth, DA);
198
0
            size_t evenSize = evenWidth * 3;
199
0
            size_t alignedSize = alignedWidth * 3;
200
0
            size_t srcStep = DA * 3, dstStep = A * 3;
201
0
            for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2)
202
0
            {
203
0
                const uint8_t *src0 = src;
204
0
                const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride);
205
0
                size_t srcOffset = 0, dstOffset = 0;
206
0
                for (; srcOffset < alignedSize; srcOffset += srcStep, dstOffset += dstStep)
207
0
                    ReduceBgr2x2<align>(src0 + srcOffset, src1 + srcOffset, dst + dstOffset);
208
0
                if (alignedSize != evenSize)
209
0
                {
210
0
                    srcOffset = evenSize - srcStep;
211
0
                    dstOffset = srcOffset / 2;
212
0
                    ReduceBgr2x2<false>(src0 + srcOffset, src1 + srcOffset, dst + dstOffset);
213
0
                }
214
0
                if (evenWidth != srcWidth)
215
0
                {
216
0
                    for (size_t c = 0; c < 3; ++c)
217
0
                        dst[evenSize / 2 + c] = Base::Average(src0[evenSize + c], src1[evenSize + c]);
218
0
                }
219
0
                src += 2 * srcStride;
220
0
                dst += dstStride;
221
0
            }
222
0
        }
Unexecuted instantiation: void Simd::Avx2::ReduceBgr2x2<true>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char*, unsigned long)
Unexecuted instantiation: void Simd::Avx2::ReduceBgr2x2<false>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char*, unsigned long)
223
224
        template <bool align> void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
225
            uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount)
226
0
        {
227
0
            assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth >= DA);
228
0
            if (align)
229
0
            {
230
0
                assert(Aligned(src) && Aligned(srcStride));
231
0
                assert(Aligned(dst) && Aligned(dstStride));
232
0
            }
233
234
0
            switch (channelCount)
235
0
            {
236
0
            case 1: ReduceColor2x2<1, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break;
237
0
            case 2: ReduceColor2x2<2, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break;
238
0
            case 3: ReduceBgr2x2<align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break;
239
0
            case 4: ReduceColor2x2<4, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break;
240
0
            default: assert(0);
241
0
            }
242
0
        }
Unexecuted instantiation: void Simd::Avx2::ReduceColor2x2<true>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char*, unsigned long, unsigned long, unsigned long, unsigned long)
Unexecuted instantiation: void Simd::Avx2::ReduceColor2x2<false>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char*, unsigned long, unsigned long, unsigned long, unsigned long)
243
244
        void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
245
            uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount)
246
0
        {
247
0
            if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride))
248
0
                ReduceColor2x2<true>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount);
249
0
            else
250
0
                ReduceColor2x2<false>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount);
251
0
        }
252
    }
253
#endif// SIMD_AVX2_ENABLE
254
}