/src/Simd/src/Simd/SimdAvx2StatisticMoments.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Simd Library (http://ermig1979.github.io/Simd). |
3 | | * |
4 | | * Copyright (c) 2011-2019 Yermalayeu Ihar. |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person obtaining a copy |
7 | | * of this software and associated documentation files (the "Software"), to deal |
8 | | * in the Software without restriction, including without limitation the rights |
9 | | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
10 | | * copies of the Software, and to permit persons to whom the Software is |
11 | | * furnished to do so, subject to the following conditions: |
12 | | * |
13 | | * The above copyright notice and this permission notice shall be included in |
14 | | * all copies or substantial portions of the Software. |
15 | | * |
16 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
17 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
18 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
19 | | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
20 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
21 | | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
22 | | * SOFTWARE. |
23 | | */ |
24 | | #include "Simd/SimdMemory.h" |
25 | | #include "Simd/SimdStore.h" |
26 | | #include "Simd/SimdExtract.h" |
27 | | #include "Simd/SimdSet.h" |
28 | | |
29 | | namespace Simd |
30 | | { |
31 | | #ifdef SIMD_AVX2_ENABLE |
32 | | namespace Avx2 |
33 | | { |
34 | | SIMD_INLINE void GetObjectMoments16(__m256i src, __m256i col, __m256i & sx, __m256i & sxx) |
35 | 0 | { |
36 | 0 | sx = _mm256_add_epi32(sx, _mm256_madd_epi16(col, src)); |
37 | 0 | sxx = _mm256_add_epi32(sxx, _mm256_madd_epi16(src, _mm256_mullo_epi16(col, col))); |
38 | 0 | } |
39 | | |
40 | | SIMD_INLINE void GetObjectMoments8(__m256i src, __m256i mask, __m256i& col, __m256i & n, __m256i & s, __m256i & sx, __m256i & sxx) |
41 | 0 | { |
42 | 0 | src = _mm256_and_si256(src, mask); |
43 | 0 | n = _mm256_add_epi64(n, _mm256_sad_epu8(_mm256_and_si256(K8_01, mask), K_ZERO)); |
44 | 0 | s = _mm256_add_epi64(s, _mm256_sad_epu8(src, K_ZERO)); |
45 | 0 | GetObjectMoments16(_mm256_unpacklo_epi8(src, K_ZERO), col, sx, sxx); |
46 | 0 | col = _mm256_add_epi16(col, K16_0008); |
47 | 0 | GetObjectMoments16(_mm256_unpackhi_epi8(src, K_ZERO), col, sx, sxx); |
48 | 0 | col = _mm256_add_epi16(col, K16_0018); |
49 | 0 | } |
50 | | |
51 | | template <bool align> void GetObjectMoments(const uint8_t* src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t index, |
52 | | __m256i & n, __m256i & s, __m256i & sx, __m256i & sy, __m256i & sxx, __m256i& sxy, __m256i& syy) |
53 | 0 | { |
54 | 0 | size_t widthA = AlignLo(width, A); |
55 | 0 | const size_t B = AlignLo(181, A); |
56 | 0 | size_t widthB = AlignLoAny(width, B); |
57 | 0 | __m256i tailMask = SetMask<uint8_t>(0, A - width + widthA, 0xFF); |
58 | |
|
59 | 0 | const __m256i K16_I = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23); |
60 | 0 | const __m256i _index = _mm256_set1_epi8(index); |
61 | 0 | const __m256i tailCol = _mm256_add_epi16(K16_I, _mm256_set1_epi16((int16_t)(width - A - widthB))); |
62 | |
|
63 | 0 | for (size_t row = 0; row < height; ++row) |
64 | 0 | { |
65 | 0 | for (size_t colB = 0; colB < width;) |
66 | 0 | { |
67 | 0 | size_t colE = Simd::Min(colB + B, widthA); |
68 | 0 | __m256i _col = K16_I; |
69 | 0 | __m256i _n = _mm256_setzero_si256(); |
70 | 0 | __m256i _s = _mm256_setzero_si256(); |
71 | 0 | __m256i _sx = _mm256_setzero_si256(); |
72 | 0 | __m256i _sxx = _mm256_setzero_si256(); |
73 | 0 | if (mask == NULL) |
74 | 0 | { |
75 | 0 | for (size_t col = colB; col < colE; col += A) |
76 | 0 | { |
77 | 0 | __m256i _src = Load<align>((__m256i*)(src + col)); |
78 | 0 | GetObjectMoments8(_src, K_INV_ZERO, _col, _n, _s, _sx, _sxx); |
79 | 0 | } |
80 | 0 | if (colB == widthB && widthA < width) |
81 | 0 | { |
82 | 0 | __m256i _src = Load<false>((__m256i*)(src + width - A)); |
83 | 0 | _col = tailCol; |
84 | 0 | GetObjectMoments8(_src, tailMask, _col, _n, _s, _sx, _sxx); |
85 | 0 | colE = width; |
86 | 0 | } |
87 | 0 | } |
88 | 0 | else if (src == NULL) |
89 | 0 | { |
90 | 0 | for (size_t col = colB; col < colE; col += A) |
91 | 0 | { |
92 | 0 | __m256i _mask = _mm256_cmpeq_epi8(Load<align>((__m256i*)(mask + col)), _index); |
93 | 0 | GetObjectMoments8(K8_01, _mask, _col, _n, _s, _sx, _sxx); |
94 | 0 | } |
95 | 0 | if (colB == widthB && widthA < width) |
96 | 0 | { |
97 | 0 | __m256i _mask = _mm256_and_si256(_mm256_cmpeq_epi8(Load<false>((__m256i*)(mask + width - A)), _index), tailMask); |
98 | 0 | _col = tailCol; |
99 | 0 | GetObjectMoments8(K8_01, _mask, _col, _n, _s, _sx, _sxx); |
100 | 0 | colE = width; |
101 | 0 | } |
102 | 0 | } |
103 | 0 | else |
104 | 0 | { |
105 | 0 | for (size_t col = colB; col < colE; col += A) |
106 | 0 | { |
107 | 0 | __m256i _src = Load<align>((__m256i*)(src + col)); |
108 | 0 | __m256i _mask = _mm256_cmpeq_epi8(Load<align>((__m256i*)(mask + col)), _index); |
109 | 0 | GetObjectMoments8(_src, _mask, _col, _n, _s, _sx, _sxx); |
110 | 0 | } |
111 | 0 | if (colB == widthB && widthA < width) |
112 | 0 | { |
113 | 0 | __m256i _mask = _mm256_and_si256(_mm256_cmpeq_epi8(Load<false>((__m256i*)(mask + width - A)), _index), tailMask); |
114 | 0 | __m256i _src = Load<false>((__m256i*)(src + width - A)); |
115 | 0 | _col = tailCol; |
116 | 0 | GetObjectMoments8(_src, _mask, _col, _n, _s, _sx, _sxx); |
117 | 0 | colE = width; |
118 | 0 | } |
119 | 0 | } |
120 | 0 | _sx = HorizontalSum32(_sx); |
121 | 0 | _sxx = HorizontalSum32(_sxx); |
122 | |
|
123 | 0 | __m256i _y = _mm256_set1_epi32((int32_t)row); |
124 | 0 | __m256i _x0 = _mm256_set1_epi32((int32_t)colB); |
125 | |
|
126 | 0 | n = _mm256_add_epi64(n, _n); |
127 | |
|
128 | 0 | s = _mm256_add_epi64(s, _s); |
129 | |
|
130 | 0 | sx = _mm256_add_epi64(sx, _sx); |
131 | 0 | __m256i _sx0 = _mm256_mul_epu32(_s, _x0); |
132 | 0 | sx = _mm256_add_epi64(sx, _sx0); |
133 | |
|
134 | 0 | __m256i _sy = _mm256_mul_epu32(_s, _y); |
135 | 0 | sy = _mm256_add_epi64(sy, _sy); |
136 | |
|
137 | 0 | sxx = _mm256_add_epi64(sxx, _sxx); |
138 | 0 | sxx = _mm256_add_epi64(sxx, _mm256_mul_epu32(_sx, _mm256_add_epi64(_x0, _x0))); |
139 | 0 | sxx = _mm256_add_epi64(sxx, _mm256_mul_epu32(_sx0, _x0)); |
140 | |
|
141 | 0 | sxy = _mm256_add_epi64(sxy, _mm256_mul_epu32(_sx, _y)); |
142 | 0 | sxy = _mm256_add_epi64(sxy, _mm256_mul_epu32(_sx0, _y)); |
143 | |
|
144 | 0 | syy = _mm256_add_epi64(syy, _mm256_mul_epu32(_sy, _y)); |
145 | |
|
146 | 0 | colB = colE; |
147 | 0 | } |
148 | 0 | if(src) |
149 | 0 | src += srcStride; |
150 | 0 | if(mask) |
151 | 0 | mask += maskStride; |
152 | 0 | } |
153 | 0 | } Unexecuted instantiation: void Simd::Avx2::GetObjectMoments<true>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char const*, unsigned long, unsigned char, long long __vector(4)&, long long __vector(4)&, long long __vector(4)&, long long __vector(4)&, long long __vector(4)&, long long __vector(4)&, long long __vector(4)&) Unexecuted instantiation: void Simd::Avx2::GetObjectMoments<false>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char const*, unsigned long, unsigned char, long long __vector(4)&, long long __vector(4)&, long long __vector(4)&, long long __vector(4)&, long long __vector(4)&, long long __vector(4)&, long long __vector(4)&) |
154 | | |
155 | | template<bool align> void GetObjectMoments(const uint8_t* src, size_t srcStride, size_t width, size_t height, const uint8_t* mask, size_t maskStride, uint8_t index, |
156 | | uint64_t* n, uint64_t* s, uint64_t* sx, uint64_t* sy, uint64_t* sxx, uint64_t* sxy, uint64_t* syy) |
157 | 0 | { |
158 | 0 | assert(width >= A && (src || mask)); |
159 | 0 | if (align) |
160 | 0 | assert((src == NULL || (Aligned(src) && Aligned(srcStride))) && (mask == NULL || (Aligned(mask) && Aligned(maskStride)))); |
161 | |
|
162 | 0 | __m256i _n = _mm256_setzero_si256(); |
163 | 0 | __m256i _s = _mm256_setzero_si256(); |
164 | 0 | __m256i _sx = _mm256_setzero_si256(); |
165 | 0 | __m256i _sy = _mm256_setzero_si256(); |
166 | 0 | __m256i _sxx = _mm256_setzero_si256(); |
167 | 0 | __m256i _sxy = _mm256_setzero_si256(); |
168 | 0 | __m256i _syy = _mm256_setzero_si256(); |
169 | |
|
170 | 0 | GetObjectMoments<align>(src, srcStride, width, height, mask, maskStride, index, _n, _s, _sx, _sy, _sxx, _sxy, _syy); |
171 | |
|
172 | 0 | *n = ExtractSum<uint64_t>(_n); |
173 | 0 | *s = ExtractSum<uint64_t>(_s); |
174 | 0 | *sx = ExtractSum<uint64_t>(_sx); |
175 | 0 | *sy = ExtractSum<uint64_t>(_sy); |
176 | 0 | *sxx = ExtractSum<uint64_t>(_sxx); |
177 | 0 | *sxy = ExtractSum<uint64_t>(_sxy); |
178 | 0 | *syy = ExtractSum<uint64_t>(_syy); |
179 | 0 | } Unexecuted instantiation: void Simd::Avx2::GetObjectMoments<true>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char const*, unsigned long, unsigned char, unsigned long*, unsigned long*, unsigned long*, unsigned long*, unsigned long*, unsigned long*, unsigned long*) Unexecuted instantiation: void Simd::Avx2::GetObjectMoments<false>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char const*, unsigned long, unsigned char, unsigned long*, unsigned long*, unsigned long*, unsigned long*, unsigned long*, unsigned long*, unsigned long*) |
180 | | |
181 | | void GetObjectMoments(const uint8_t* src, size_t srcStride, size_t width, size_t height, const uint8_t* mask, size_t maskStride, uint8_t index, |
182 | | uint64_t* n, uint64_t* s, uint64_t* sx, uint64_t* sy, uint64_t* sxx, uint64_t* sxy, uint64_t* syy) |
183 | 0 | { |
184 | 0 | if ((src == NULL || (Aligned(src) && Aligned(srcStride))) && (mask == NULL || (Aligned(mask) && Aligned(maskStride)))) |
185 | 0 | GetObjectMoments<true>(src, srcStride, width, height, mask, maskStride, index, n, s, sx, sy, sxx, sxy, syy); |
186 | 0 | else |
187 | 0 | GetObjectMoments<false>(src, srcStride, width, height, mask, maskStride, index, n, s, sx, sy, sxx, sxy, syy); |
188 | 0 | } |
189 | | |
190 | | void GetMoments(const uint8_t* mask, size_t stride, size_t width, size_t height, uint8_t index, |
191 | | uint64_t* area, uint64_t* x, uint64_t* y, uint64_t* xx, uint64_t* xy, uint64_t* yy) |
192 | 0 | { |
193 | 0 | uint64_t stub; |
194 | 0 | GetObjectMoments(NULL, 0, width, height, mask, stride, index, &stub, area, x, y, xx, xy, yy); |
195 | 0 | } |
196 | | } |
197 | | #endif// SIMD_AVX2_ENABLE |
198 | | } |