/src/Simd/src/Simd/SimdAvx512bwTexture.cpp
Line | Count | Source |
1 | | /* |
2 | | * Simd Library (http://ermig1979.github.io/Simd). |
3 | | * |
4 | | * Copyright (c) 2011-2025 Yermalayeu Ihar, |
5 | | * 2025-2025 Ger Hobbelt. |
6 | | * |
7 | | * Permission is hereby granted, free of charge, to any person obtaining a copy |
8 | | * of this software and associated documentation files (the "Software"), to deal |
9 | | * in the Software without restriction, including without limitation the rights |
10 | | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
11 | | * copies of the Software, and to permit persons to whom the Software is |
12 | | * furnished to do so, subject to the following conditions: |
13 | | * |
14 | | * The above copyright notice and this permission notice shall be included in |
15 | | * all copies or substantial portions of the Software. |
16 | | * |
17 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
18 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
19 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
20 | | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
21 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
22 | | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
23 | | * SOFTWARE. |
24 | | */ |
25 | | #include "Simd/SimdMemory.h" |
26 | | #include "Simd/SimdStore.h" |
27 | | #include "Simd/SimdSet.h" |
28 | | #include "Simd/SimdExtract.h" |
29 | | #include "Simd/SimdBase.h" |
30 | | #include "Simd/SimdUnpack.h" |
31 | | |
32 | | namespace Simd |
33 | | { |
34 | | #ifdef SIMD_AVX512BW_ENABLE |
35 | | namespace Avx512bw |
36 | | { |
37 | | SIMD_INLINE __m512i TextureBoostedSaturatedGradient16(const __m512i & difference, const __m512i & saturation, const __m512i & boost) |
38 | 0 | { |
39 | 0 | return _mm512_mullo_epi16(_mm512_max_epi16(K_ZERO, _mm512_add_epi16(saturation, _mm512_min_epi16(difference, saturation))), boost); |
40 | 0 | } |
41 | | |
42 | | SIMD_INLINE __m512i TextureBoostedSaturatedGradient8(const __m512i & a, const __m512i & b, const __m512i & saturation, const __m512i & boost) |
43 | 0 | { |
44 | 0 | __m512i lo = TextureBoostedSaturatedGradient16(SubUnpackedU8<0>(b, a), saturation, boost); |
45 | 0 | __m512i hi = TextureBoostedSaturatedGradient16(SubUnpackedU8<1>(b, a), saturation, boost); |
46 | 0 | return _mm512_packus_epi16(lo, hi); |
47 | 0 | } |
48 | | |
49 | | template<bool align, bool mask> SIMD_INLINE void TextureBoostedSaturatedGradient(const uint8_t * src, uint8_t * dx, uint8_t * dy, |
50 | | size_t stride, const __m512i & saturation, const __m512i & boost, __mmask64 tail = -1) |
51 | 0 | { |
52 | 0 | const __m512i s10 = Load<false, mask>(src - 1, tail); |
53 | 0 | const __m512i s12 = Load<false, mask>(src + 1, tail); |
54 | 0 | const __m512i s01 = Load<align, mask>(src - stride, tail); |
55 | 0 | const __m512i s21 = Load<align, mask>(src + stride, tail); |
56 | 0 | Store<align, mask>(dx, TextureBoostedSaturatedGradient8(s10, s12, saturation, boost), tail); |
57 | 0 | Store<align, mask>(dy, TextureBoostedSaturatedGradient8(s01, s21, saturation, boost), tail); |
58 | 0 | } Unexecuted instantiation: void Simd::Avx512bw::TextureBoostedSaturatedGradient<true, false>(unsigned char const*, unsigned char*, unsigned char*, unsigned long, long long __vector(8) const&, long long __vector(8) const&, unsigned long long) Unexecuted instantiation: void Simd::Avx512bw::TextureBoostedSaturatedGradient<false, true>(unsigned char const*, unsigned char*, unsigned char*, unsigned long, long long __vector(8) const&, long long __vector(8) const&, unsigned long long) Unexecuted instantiation: void Simd::Avx512bw::TextureBoostedSaturatedGradient<false, false>(unsigned char const*, unsigned char*, unsigned char*, unsigned long, long long __vector(8) const&, long long __vector(8) const&, unsigned long long) |
59 | | |
60 | | template<bool align> void TextureBoostedSaturatedGradient(const uint8_t * src, size_t srcStride, size_t width, size_t height, |
61 | | uint8_t saturation, uint8_t boost, uint8_t * dx, size_t dxStride, uint8_t * dy, size_t dyStride) |
62 | 0 | { |
63 | 0 | assert(int(2)*saturation*boost <= 0xFF); |
64 | 0 | if (align) |
65 | 0 | assert(Aligned(src) && Aligned(srcStride) && Aligned(dx) && Aligned(dxStride) && Aligned(dy) && Aligned(dyStride)); |
66 | |
|
67 | 0 | size_t alignedWidth = AlignLo(width, A); |
68 | 0 | __mmask64 tailMask = TailMask64(width - alignedWidth); |
69 | 0 | __m512i _saturation = _mm512_set1_epi16(saturation); |
70 | 0 | __m512i _boost = _mm512_set1_epi16(boost); |
71 | |
|
72 | 0 | memset(dx, 0, width); |
73 | 0 | memset(dy, 0, width); |
74 | 0 | src += srcStride; |
75 | 0 | dx += dxStride; |
76 | 0 | dy += dyStride; |
77 | 0 | for (size_t row = 2; row < height; ++row) |
78 | 0 | { |
79 | 0 | size_t col = 0; |
80 | 0 | for (; col < alignedWidth; col += A) |
81 | 0 | TextureBoostedSaturatedGradient<align, false>(src + col, dx + col, dy + col, srcStride, _saturation, _boost); |
82 | 0 | if (col < width) |
83 | 0 | TextureBoostedSaturatedGradient<false, true>(src + col, dx + col, dy + col, srcStride, _saturation, _boost, tailMask); |
84 | |
|
85 | 0 | dx[0] = 0; |
86 | 0 | dy[0] = 0; |
87 | 0 | dx[width - 1] = 0; |
88 | 0 | dy[width - 1] = 0; |
89 | |
|
90 | 0 | src += srcStride; |
91 | 0 | dx += dxStride; |
92 | 0 | dy += dyStride; |
93 | 0 | } |
94 | 0 | memset(dx, 0, width); |
95 | 0 | memset(dy, 0, width); |
96 | 0 | } Unexecuted instantiation: void Simd::Avx512bw::TextureBoostedSaturatedGradient<true>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char, unsigned char, unsigned char*, unsigned long, unsigned char*, unsigned long) Unexecuted instantiation: void Simd::Avx512bw::TextureBoostedSaturatedGradient<false>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char, unsigned char, unsigned char*, unsigned long, unsigned char*, unsigned long) |
97 | | |
98 | | void TextureBoostedSaturatedGradient(const uint8_t * src, size_t srcStride, size_t width, size_t height, |
99 | | uint8_t saturation, uint8_t boost, uint8_t * dx, size_t dxStride, uint8_t * dy, size_t dyStride) |
100 | 0 | { |
101 | 0 | if (Aligned(src) && Aligned(srcStride) && Aligned(dx) && Aligned(dxStride) && Aligned(dy) && Aligned(dyStride)) |
102 | 0 | TextureBoostedSaturatedGradient<true>(src, srcStride, width, height, saturation, boost, dx, dxStride, dy, dyStride); |
103 | 0 | else |
104 | 0 | TextureBoostedSaturatedGradient<false>(src, srcStride, width, height, saturation, boost, dx, dxStride, dy, dyStride); |
105 | 0 | } |
106 | | |
107 | | template<bool align, bool mask> SIMD_INLINE void TextureBoostedUv(const uint8_t * src, uint8_t * dst, |
108 | | const __m512i & min8, const __m512i & max8, const __m512i & boost16, __mmask64 tail = -1) |
109 | 0 | { |
110 | 0 | const __m512i _src = Load<align, mask>(src, tail); |
111 | 0 | const __m512i saturated = _mm512_sub_epi8(_mm512_max_epu8(min8, _mm512_min_epu8(max8, _src)), min8); |
112 | 0 | const __m512i lo = _mm512_mullo_epi16(_mm512_unpacklo_epi8(saturated, K_ZERO), boost16); |
113 | 0 | const __m512i hi = _mm512_mullo_epi16(_mm512_unpackhi_epi8(saturated, K_ZERO), boost16); |
114 | 0 | Store<align, mask>(dst, _mm512_packus_epi16(lo, hi), tail); |
115 | 0 | } Unexecuted instantiation: void Simd::Avx512bw::TextureBoostedUv<true, false>(unsigned char const*, unsigned char*, long long __vector(8) const&, long long __vector(8) const&, long long __vector(8) const&, unsigned long long) Unexecuted instantiation: void Simd::Avx512bw::TextureBoostedUv<false, true>(unsigned char const*, unsigned char*, long long __vector(8) const&, long long __vector(8) const&, long long __vector(8) const&, unsigned long long) Unexecuted instantiation: void Simd::Avx512bw::TextureBoostedUv<false, false>(unsigned char const*, unsigned char*, long long __vector(8) const&, long long __vector(8) const&, long long __vector(8) const&, unsigned long long) |
116 | | |
117 | | template<bool align> void TextureBoostedUv(const uint8_t * src, size_t srcStride, size_t width, size_t height, |
118 | | uint8_t boost, uint8_t * dst, size_t dstStride) |
119 | 0 | { |
120 | 0 | assert(boost < 0x80); |
121 | 0 | if (align) |
122 | 0 | assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)); |
123 | |
|
124 | 0 | size_t alignedWidth = AlignLo(width, A); |
125 | 0 | __mmask64 tailMask = TailMask64(width - alignedWidth); |
126 | 0 | int min = 128 - (128 / boost); |
127 | 0 | int max = 255 - min; |
128 | 0 | __m512i min8 = _mm512_set1_epi8(min); |
129 | 0 | __m512i max8 = _mm512_set1_epi8(max); |
130 | 0 | __m512i boost16 = _mm512_set1_epi16(boost); |
131 | 0 | for (size_t row = 0; row < height; ++row) |
132 | 0 | { |
133 | 0 | size_t col = 0; |
134 | 0 | for (; col < alignedWidth; col += A) |
135 | 0 | TextureBoostedUv<align, false>(src + col, dst + col, min8, max8, boost16); |
136 | 0 | if (col < width) |
137 | 0 | TextureBoostedUv<false, true>(src + col, dst + col, min8, max8, boost16, tailMask); |
138 | 0 | src += srcStride; |
139 | 0 | dst += dstStride; |
140 | 0 | } |
141 | 0 | } Unexecuted instantiation: void Simd::Avx512bw::TextureBoostedUv<true>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char, unsigned char*, unsigned long) Unexecuted instantiation: void Simd::Avx512bw::TextureBoostedUv<false>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char, unsigned char*, unsigned long) |
142 | | |
143 | | void TextureBoostedUv(const uint8_t * src, size_t srcStride, size_t width, size_t height, |
144 | | uint8_t boost, uint8_t * dst, size_t dstStride) |
145 | 0 | { |
146 | 0 | if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) |
147 | 0 | TextureBoostedUv<true>(src, srcStride, width, height, boost, dst, dstStride); |
148 | 0 | else |
149 | 0 | TextureBoostedUv<false>(src, srcStride, width, height, boost, dst, dstStride); |
150 | 0 | } |
151 | | |
152 | | SIMD_INLINE void TextureGetDifferenceSum(const __m512i & current, const __m512i & average, __m512i & positive, __m512i & negative) |
153 | 0 | { |
154 | 0 | positive = _mm512_add_epi64(positive, _mm512_sad_epu8(_mm512_subs_epu8(current, average), K_ZERO)); |
155 | 0 | negative = _mm512_add_epi64(negative, _mm512_sad_epu8(_mm512_subs_epu8(average, current), K_ZERO)); |
156 | 0 | } |
157 | | |
158 | | template <bool align, bool mask> SIMD_INLINE void TextureGetDifferenceSum(const uint8_t * src, const uint8_t * lo, const uint8_t * hi, |
159 | | __m512i & positive, __m512i & negative, __mmask64 tail = -1) |
160 | 0 | { |
161 | 0 | const __m512i current = Load<align, mask>(src, tail); |
162 | 0 | const __m512i _lo = Load<align, mask>(lo, tail); |
163 | 0 | const __m512i _hi = Load<align, mask>(hi, tail); |
164 | 0 | const __m512i average = _mm512_avg_epu8(_lo, _hi); |
165 | 0 | TextureGetDifferenceSum(current, average, positive, negative); |
166 | 0 | } Unexecuted instantiation: void Simd::Avx512bw::TextureGetDifferenceSum<true, false>(unsigned char const*, unsigned char const*, unsigned char const*, long long __vector(8)&, long long __vector(8)&, unsigned long long) Unexecuted instantiation: void Simd::Avx512bw::TextureGetDifferenceSum<true, true>(unsigned char const*, unsigned char const*, unsigned char const*, long long __vector(8)&, long long __vector(8)&, unsigned long long) Unexecuted instantiation: void Simd::Avx512bw::TextureGetDifferenceSum<false, false>(unsigned char const*, unsigned char const*, unsigned char const*, long long __vector(8)&, long long __vector(8)&, unsigned long long) Unexecuted instantiation: void Simd::Avx512bw::TextureGetDifferenceSum<false, true>(unsigned char const*, unsigned char const*, unsigned char const*, long long __vector(8)&, long long __vector(8)&, unsigned long long) |
167 | | |
168 | | template <bool align> SIMD_INLINE void TextureGetDifferenceSum4(const uint8_t * src, const uint8_t * lo, const uint8_t * hi, __m512i & positive, __m512i & negative) |
169 | 0 | { |
170 | 0 | TextureGetDifferenceSum(Load<align>(src + 0 * A), _mm512_avg_epu8(Load<align>(hi + 0 * A), Load<align>(lo + 0 * A)), positive, negative); |
171 | 0 | TextureGetDifferenceSum(Load<align>(src + 1 * A), _mm512_avg_epu8(Load<align>(hi + 1 * A), Load<align>(lo + 1 * A)), positive, negative); |
172 | 0 | TextureGetDifferenceSum(Load<align>(src + 2 * A), _mm512_avg_epu8(Load<align>(hi + 2 * A), Load<align>(lo + 2 * A)), positive, negative); |
173 | 0 | TextureGetDifferenceSum(Load<align>(src + 3 * A), _mm512_avg_epu8(Load<align>(hi + 3 * A), Load<align>(lo + 3 * A)), positive, negative); |
174 | 0 | } Unexecuted instantiation: void Simd::Avx512bw::TextureGetDifferenceSum4<true>(unsigned char const*, unsigned char const*, unsigned char const*, long long __vector(8)&, long long __vector(8)&) Unexecuted instantiation: void Simd::Avx512bw::TextureGetDifferenceSum4<false>(unsigned char const*, unsigned char const*, unsigned char const*, long long __vector(8)&, long long __vector(8)&) |
175 | | |
176 | | template <bool align> void TextureGetDifferenceSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, |
177 | | const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, int64_t * sum) |
178 | 0 | { |
179 | 0 | assert(sum != nullptr); |
180 | 0 | if (align) |
181 | 0 | assert(Aligned(src) && Aligned(srcStride) && Aligned(lo) && Aligned(loStride) && Aligned(hi) && Aligned(hiStride)); |
182 | |
|
183 | 0 | size_t alignedWidth = AlignLo(width, A); |
184 | 0 | size_t fullAlignedWidth = AlignLo(width, QA); |
185 | 0 | __mmask64 tailMask = TailMask64(width - alignedWidth); |
186 | 0 | __m512i positive = _mm512_setzero_si512(); |
187 | 0 | __m512i negative = _mm512_setzero_si512(); |
188 | 0 | for (size_t row = 0; row < height; ++row) |
189 | 0 | { |
190 | 0 | size_t col = 0; |
191 | 0 | for (; col < fullAlignedWidth; col += QA) |
192 | 0 | TextureGetDifferenceSum4<align>(src + col, lo + col, hi + col, positive, negative); |
193 | 0 | for (; col < alignedWidth; col += A) |
194 | 0 | TextureGetDifferenceSum<align, false>(src + col, lo + col, hi + col, positive, negative); |
195 | 0 | if (col < width) |
196 | 0 | TextureGetDifferenceSum<align, true>(src + col, lo + col, hi + col, positive, negative, tailMask); |
197 | 0 | src += srcStride; |
198 | 0 | lo += loStride; |
199 | 0 | hi += hiStride; |
200 | 0 | } |
201 | 0 | *sum = ExtractSum<int64_t>(positive) - ExtractSum<int64_t>(negative); |
202 | 0 | } Unexecuted instantiation: void Simd::Avx512bw::TextureGetDifferenceSum<true>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char const*, unsigned long, unsigned char const*, unsigned long, long*) Unexecuted instantiation: void Simd::Avx512bw::TextureGetDifferenceSum<false>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char const*, unsigned long, unsigned char const*, unsigned long, long*) |
203 | | |
204 | | void TextureGetDifferenceSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, |
205 | | const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, int64_t * sum) |
206 | 0 | { |
207 | 0 | if (Aligned(src) && Aligned(srcStride) && Aligned(lo) && Aligned(loStride) && Aligned(hi) && Aligned(hiStride)) |
208 | 0 | TextureGetDifferenceSum<true>(src, srcStride, width, height, lo, loStride, hi, hiStride, sum); |
209 | 0 | else |
210 | 0 | TextureGetDifferenceSum<false>(src, srcStride, width, height, lo, loStride, hi, hiStride, sum); |
211 | 0 | } |
212 | | |
213 | | template <bool align> void TexturePerformCompensation(const uint8_t * src, size_t srcStride, size_t width, size_t height, int shift, uint8_t * dst, size_t dstStride) |
214 | 0 | { |
215 | 0 | assert(shift > -0xFF && shift < 0xFF && shift != 0); |
216 | 0 | if (align) |
217 | 0 | assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)); |
218 | |
|
219 | 0 | size_t alignedWidth = AlignLo(width, A); |
220 | 0 | size_t fullAlignedWidth = AlignLo(width, QA); |
221 | 0 | __mmask64 tailMask = TailMask64(width - alignedWidth); |
222 | 0 | if (shift > 0) |
223 | 0 | { |
224 | 0 | __m512i _shift = _mm512_set1_epi8((char)shift); |
225 | 0 | for (size_t row = 0; row < height; ++row) |
226 | 0 | { |
227 | 0 | size_t col = 0; |
228 | 0 | for (; col < fullAlignedWidth; col += QA) |
229 | 0 | { |
230 | 0 | Store<align>(dst + col + 0 * A, _mm512_adds_epu8(Load<align>(src + col + 0 * A), _shift)); |
231 | 0 | Store<align>(dst + col + 1 * A, _mm512_adds_epu8(Load<align>(src + col + 1 * A), _shift)); |
232 | 0 | Store<align>(dst + col + 2 * A, _mm512_adds_epu8(Load<align>(src + col + 2 * A), _shift)); |
233 | 0 | Store<align>(dst + col + 3 * A, _mm512_adds_epu8(Load<align>(src + col + 3 * A), _shift)); |
234 | 0 | } |
235 | 0 | for (; col < alignedWidth; col += A) |
236 | 0 | Store<align>(dst + col, _mm512_adds_epu8(Load<align>(src + col), _shift)); |
237 | 0 | if (col < width) |
238 | 0 | Store<align, true>(dst + col, _mm512_adds_epu8((Load<align, true>(src + col, tailMask)), _shift), tailMask); |
239 | 0 | src += srcStride; |
240 | 0 | dst += dstStride; |
241 | 0 | } |
242 | 0 | } |
243 | 0 | if (shift < 0) |
244 | 0 | { |
245 | 0 | __m512i _shift = _mm512_set1_epi8((char)-shift); |
246 | 0 | for (size_t row = 0; row < height; ++row) |
247 | 0 | { |
248 | 0 | size_t col = 0; |
249 | 0 | for (; col < fullAlignedWidth; col += QA) |
250 | 0 | { |
251 | 0 | Store<align>(dst + col + 0 * A, _mm512_subs_epu8(Load<align>(src + col + 0 * A), _shift)); |
252 | 0 | Store<align>(dst + col + 1 * A, _mm512_subs_epu8(Load<align>(src + col + 1 * A), _shift)); |
253 | 0 | Store<align>(dst + col + 2 * A, _mm512_subs_epu8(Load<align>(src + col + 2 * A), _shift)); |
254 | 0 | Store<align>(dst + col + 3 * A, _mm512_subs_epu8(Load<align>(src + col + 3 * A), _shift)); |
255 | 0 | } |
256 | 0 | for (; col < alignedWidth; col += A) |
257 | 0 | Store<align>(dst + col, _mm512_subs_epu8(Load<align>(src + col), _shift)); |
258 | 0 | if (col < width) |
259 | 0 | Store<align, true>(dst + col, _mm512_subs_epu8((Load<align, true>(src + col, tailMask)), _shift), tailMask); |
260 | 0 | src += srcStride; |
261 | 0 | dst += dstStride; |
262 | 0 | } |
263 | 0 | } |
264 | 0 | } Unexecuted instantiation: void Simd::Avx512bw::TexturePerformCompensation<true>(unsigned char const*, unsigned long, unsigned long, unsigned long, int, unsigned char*, unsigned long) Unexecuted instantiation: void Simd::Avx512bw::TexturePerformCompensation<false>(unsigned char const*, unsigned long, unsigned long, unsigned long, int, unsigned char*, unsigned long) |
265 | | |
266 | | void TexturePerformCompensation(const uint8_t * src, size_t srcStride, size_t width, size_t height, |
267 | | int shift, uint8_t * dst, size_t dstStride) |
268 | 0 | { |
269 | 0 | if (shift == 0) |
270 | 0 | { |
271 | 0 | if (src != dst) |
272 | 0 | Base::Copy(src, srcStride, width, height, 1, dst, dstStride); |
273 | 0 | return; |
274 | 0 | } |
275 | 0 | if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) |
276 | 0 | TexturePerformCompensation<true>(src, srcStride, width, height, shift, dst, dstStride); |
277 | 0 | else |
278 | 0 | TexturePerformCompensation<false>(src, srcStride, width, height, shift, dst, dstStride); |
279 | 0 | } |
280 | | } |
281 | | #endif// SIMD_AVX512BW_ENABLE |
282 | | } |