Coverage Report

Created: 2025-07-23 07:53

/src/Simd/src/Simd/SimdAvx2StatisticMoments.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
* Simd Library (http://ermig1979.github.io/Simd).
3
*
4
* Copyright (c) 2011-2019 Yermalayeu Ihar.
5
*
6
* Permission is hereby granted, free of charge, to any person obtaining a copy
7
* of this software and associated documentation files (the "Software"), to deal
8
* in the Software without restriction, including without limitation the rights
9
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
* copies of the Software, and to permit persons to whom the Software is
11
* furnished to do so, subject to the following conditions:
12
*
13
* The above copyright notice and this permission notice shall be included in
14
* all copies or substantial portions of the Software.
15
*
16
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
* SOFTWARE.
23
*/
24
#include "Simd/SimdMemory.h"
25
#include "Simd/SimdStore.h"
26
#include "Simd/SimdExtract.h"
27
#include "Simd/SimdSet.h"
28
29
namespace Simd
30
{
31
#ifdef SIMD_AVX2_ENABLE    
32
    namespace Avx2
33
    {
34
        // Accumulates first- and second-order horizontal moments for one vector of 16-bit lanes.
        //   src - values to weight, one per 16-bit lane;
        //   col - column coordinate associated with each 16-bit lane;
        //   sx  - in/out 32-bit accumulators, receives pairwise sums of col * src;
        //   sxx - in/out 32-bit accumulators, receives pairwise sums of (col * col) * src.
        // The square of the column is taken with a 16-bit low multiply, so the caller has to
        // keep column values small enough for col * col to survive in a 16-bit lane.
        SIMD_INLINE void GetObjectMoments16(__m256i src, __m256i col, __m256i & sx, __m256i & sxx)
        {
            const __m256i colSquared = _mm256_mullo_epi16(col, col);
            const __m256i firstOrder = _mm256_madd_epi16(col, src);
            const __m256i secondOrder = _mm256_madd_epi16(src, colSquared);
            sx = _mm256_add_epi32(sx, firstOrder);
            sxx = _mm256_add_epi32(sxx, secondOrder);
        }
39
40
        // Processes one 256-bit vector of bytes: applies the byte mask, then updates the pixel
        // count, the value sum and the column-weighted sums.
        //   src  - byte values;
        //   mask - per-byte mask ANDed with the values (and with K8_01 for counting);
        //   col  - in/out column coordinates of the 16-bit lanes; advanced past this vector on return;
        //   n    - in/out 64-bit accumulators of the number of selected bytes;
        //   s    - in/out 64-bit accumulators of the sum of selected byte values;
        //   sx, sxx - in/out accumulators forwarded to GetObjectMoments16.
        // NOTE(review): K16_0008 / K16_0018 are project constants not visible here; presumably 8 and
        // 0x18 = 24 per 16-bit lane, i.e. a total advance of 32 columns per call - confirm.
        SIMD_INLINE void GetObjectMoments8(__m256i src, __m256i mask, __m256i& col, __m256i & n, __m256i & s, __m256i & sx, __m256i & sxx)
        {
            const __m256i selected = _mm256_and_si256(src, mask);
            const __m256i counted = _mm256_and_si256(K8_01, mask);

            // Sum of absolute differences against zero is a horizontal byte sum per 64-bit lane.
            n = _mm256_add_epi64(n, _mm256_sad_epu8(counted, K_ZERO));
            s = _mm256_add_epi64(s, _mm256_sad_epu8(selected, K_ZERO));

            // Low halves of both 128-bit lanes, widened to 16 bit.
            const __m256i lower = _mm256_unpacklo_epi8(selected, K_ZERO);
            GetObjectMoments16(lower, col, sx, sxx);
            col = _mm256_add_epi16(col, K16_0008);

            // High halves of both 128-bit lanes, widened to 16 bit.
            const __m256i upper = _mm256_unpackhi_epi8(selected, K_ZERO);
            GetObjectMoments16(upper, col, sx, sxx);
            col = _mm256_add_epi16(col, K16_0018);
        }
50
51
        template <bool align> void GetObjectMoments(const uint8_t* src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t index,
52
            __m256i & n, __m256i & s, __m256i & sx, __m256i & sy, __m256i & sxx, __m256i& sxy, __m256i& syy)
53
0
        {
54
0
            size_t widthA = AlignLo(width, A);
55
0
            const size_t B = AlignLo(181, A);
56
0
            size_t widthB = AlignLoAny(width, B);
57
0
            __m256i tailMask = SetMask<uint8_t>(0, A - width + widthA, 0xFF);
58
59
0
            const __m256i K16_I = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23);
60
0
            const __m256i _index = _mm256_set1_epi8(index);
61
0
            const __m256i tailCol = _mm256_add_epi16(K16_I, _mm256_set1_epi16((int16_t)(width - A - widthB)));
62
63
0
            for (size_t row = 0; row < height; ++row)
64
0
            {
65
0
                for (size_t colB = 0; colB < width;)
66
0
                {
67
0
                    size_t colE = Simd::Min(colB + B, widthA);
68
0
                    __m256i _col = K16_I;
69
0
                    __m256i _n = _mm256_setzero_si256();
70
0
                    __m256i _s = _mm256_setzero_si256();
71
0
                    __m256i _sx = _mm256_setzero_si256();
72
0
                    __m256i _sxx = _mm256_setzero_si256();
73
0
                    if (mask == NULL)
74
0
                    {
75
0
                        for (size_t col = colB; col < colE; col += A)
76
0
                        {
77
0
                            __m256i _src = Load<align>((__m256i*)(src + col));
78
0
                            GetObjectMoments8(_src, K_INV_ZERO, _col, _n, _s, _sx, _sxx);
79
0
                        }
80
0
                        if (colB == widthB && widthA < width)
81
0
                        {
82
0
                            __m256i _src = Load<false>((__m256i*)(src + width - A));
83
0
                            _col = tailCol;
84
0
                            GetObjectMoments8(_src, tailMask, _col, _n, _s, _sx, _sxx);
85
0
                            colE = width;
86
0
                        }                        
87
0
                    }
88
0
                    else if (src == NULL)
89
0
                    {
90
0
                        for (size_t col = colB; col < colE; col += A)
91
0
                        {
92
0
                            __m256i _mask = _mm256_cmpeq_epi8(Load<align>((__m256i*)(mask + col)), _index);
93
0
                            GetObjectMoments8(K8_01, _mask, _col, _n, _s, _sx, _sxx);
94
0
                        }
95
0
                        if (colB == widthB && widthA < width)
96
0
                        {
97
0
                            __m256i _mask = _mm256_and_si256(_mm256_cmpeq_epi8(Load<false>((__m256i*)(mask + width - A)), _index), tailMask);
98
0
                            _col = tailCol;
99
0
                            GetObjectMoments8(K8_01, _mask, _col, _n, _s, _sx, _sxx);
100
0
                            colE = width;
101
0
                        }
102
0
                    }
103
0
                    else
104
0
                    {
105
0
                        for (size_t col = colB; col < colE; col += A)
106
0
                        {
107
0
                            __m256i _src = Load<align>((__m256i*)(src + col));
108
0
                            __m256i _mask = _mm256_cmpeq_epi8(Load<align>((__m256i*)(mask + col)), _index);
109
0
                            GetObjectMoments8(_src, _mask, _col, _n, _s, _sx, _sxx);
110
0
                        }
111
0
                        if (colB == widthB && widthA < width)
112
0
                        {
113
0
                            __m256i _mask = _mm256_and_si256(_mm256_cmpeq_epi8(Load<false>((__m256i*)(mask + width - A)), _index), tailMask);
114
0
                            __m256i _src = Load<false>((__m256i*)(src + width - A));
115
0
                            _col = tailCol;
116
0
                            GetObjectMoments8(_src, _mask, _col, _n, _s, _sx, _sxx);
117
0
                            colE = width;
118
0
                        }
119
0
                    }
120
0
                    _sx = HorizontalSum32(_sx);
121
0
                    _sxx = HorizontalSum32(_sxx);
122
123
0
                    __m256i _y = _mm256_set1_epi32((int32_t)row);
124
0
                    __m256i _x0 = _mm256_set1_epi32((int32_t)colB);
125
126
0
                    n = _mm256_add_epi64(n, _n);
127
128
0
                    s = _mm256_add_epi64(s, _s);
129
130
0
                    sx = _mm256_add_epi64(sx, _sx);
131
0
                    __m256i _sx0 = _mm256_mul_epu32(_s, _x0);
132
0
                    sx = _mm256_add_epi64(sx, _sx0);
133
134
0
                    __m256i _sy = _mm256_mul_epu32(_s, _y);
135
0
                    sy = _mm256_add_epi64(sy, _sy);
136
137
0
                    sxx = _mm256_add_epi64(sxx, _sxx);
138
0
                    sxx = _mm256_add_epi64(sxx, _mm256_mul_epu32(_sx, _mm256_add_epi64(_x0, _x0)));
139
0
                    sxx = _mm256_add_epi64(sxx, _mm256_mul_epu32(_sx0, _x0));
140
141
0
                    sxy = _mm256_add_epi64(sxy, _mm256_mul_epu32(_sx, _y));
142
0
                    sxy = _mm256_add_epi64(sxy, _mm256_mul_epu32(_sx0, _y));
143
144
0
                    syy = _mm256_add_epi64(syy, _mm256_mul_epu32(_sy, _y));
145
146
0
                    colB = colE;
147
0
                }
148
0
                if(src)
149
0
                    src += srcStride;
150
0
                if(mask)
151
0
                    mask += maskStride;
152
0
            }
153
0
        }
Unexecuted instantiation: void Simd::Avx2::GetObjectMoments<true>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char const*, unsigned long, unsigned char, long long __vector(4)&, long long __vector(4)&, long long __vector(4)&, long long __vector(4)&, long long __vector(4)&, long long __vector(4)&, long long __vector(4)&)
Unexecuted instantiation: void Simd::Avx2::GetObjectMoments<false>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char const*, unsigned long, unsigned char, long long __vector(4)&, long long __vector(4)&, long long __vector(4)&, long long __vector(4)&, long long __vector(4)&, long long __vector(4)&, long long __vector(4)&)
154
155
        template<bool align> void GetObjectMoments(const uint8_t* src, size_t srcStride, size_t width, size_t height, const uint8_t* mask, size_t maskStride, uint8_t index,
156
            uint64_t* n, uint64_t* s, uint64_t* sx, uint64_t* sy, uint64_t* sxx, uint64_t* sxy, uint64_t* syy)
157
0
        {
158
0
            assert(width >= A && (src || mask));
159
0
            if (align)
160
0
                assert((src == NULL || (Aligned(src) && Aligned(srcStride))) && (mask == NULL || (Aligned(mask) && Aligned(maskStride))));
161
162
0
            __m256i _n = _mm256_setzero_si256();
163
0
            __m256i _s = _mm256_setzero_si256();
164
0
            __m256i _sx = _mm256_setzero_si256();
165
0
            __m256i _sy = _mm256_setzero_si256();
166
0
            __m256i _sxx = _mm256_setzero_si256();
167
0
            __m256i _sxy = _mm256_setzero_si256();
168
0
            __m256i _syy = _mm256_setzero_si256();
169
170
0
            GetObjectMoments<align>(src, srcStride, width, height, mask, maskStride, index, _n, _s, _sx, _sy, _sxx, _sxy, _syy);
171
172
0
            *n = ExtractSum<uint64_t>(_n);
173
0
            *s = ExtractSum<uint64_t>(_s);
174
0
            *sx = ExtractSum<uint64_t>(_sx);
175
0
            *sy = ExtractSum<uint64_t>(_sy);
176
0
            *sxx = ExtractSum<uint64_t>(_sxx);
177
0
            *sxy = ExtractSum<uint64_t>(_sxy);
178
0
            *syy = ExtractSum<uint64_t>(_syy);
179
0
        }
Unexecuted instantiation: void Simd::Avx2::GetObjectMoments<true>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char const*, unsigned long, unsigned char, unsigned long*, unsigned long*, unsigned long*, unsigned long*, unsigned long*, unsigned long*, unsigned long*)
Unexecuted instantiation: void Simd::Avx2::GetObjectMoments<false>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char const*, unsigned long, unsigned char, unsigned long*, unsigned long*, unsigned long*, unsigned long*, unsigned long*, unsigned long*, unsigned long*)
180
181
        void GetObjectMoments(const uint8_t* src, size_t srcStride, size_t width, size_t height, const uint8_t* mask, size_t maskStride, uint8_t index,
182
            uint64_t* n, uint64_t* s, uint64_t* sx, uint64_t* sy, uint64_t* sxx, uint64_t* sxy, uint64_t* syy)
183
0
        {
184
0
            if ((src == NULL || (Aligned(src) && Aligned(srcStride))) && (mask == NULL || (Aligned(mask) && Aligned(maskStride))))
185
0
                GetObjectMoments<true>(src, srcStride, width, height, mask, maskStride, index, n, s, sx, sy, sxx, sxy, syy);
186
0
            else
187
0
                GetObjectMoments<false>(src, srcStride, width, height, mask, maskStride, index, n, s, sx, sy, sxx, sxy, syy);
188
0
        }
189
190
        void GetMoments(const uint8_t* mask, size_t stride, size_t width, size_t height, uint8_t index,
191
            uint64_t* area, uint64_t* x, uint64_t* y, uint64_t* xx, uint64_t* xy, uint64_t* yy)
192
0
        {
193
0
            uint64_t stub;
194
0
            GetObjectMoments(NULL, 0, width, height, mask, stride, index, &stub, area, x, y, xx, xy, yy);
195
0
        }
196
    }
197
#endif// SIMD_AVX2_ENABLE
198
}