/src/skia/src/opts/SkBitmapProcState_opts.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright 2018 Google Inc. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license that can be |
5 | | * found in the LICENSE file. |
6 | | */ |
7 | | |
8 | | #ifndef SkBitmapProcState_opts_DEFINED |
9 | | #define SkBitmapProcState_opts_DEFINED |
10 | | |
11 | | #include "include/private/SkVx.h" |
12 | | #include "src/core/SkBitmapProcState.h" |
13 | | #include "src/core/SkMSAN.h" |
14 | | |
15 | | // SkBitmapProcState optimized Shader, Sample, or Matrix procs. |
16 | | // |
17 | | // Only S32_alpha_D32_filter_DX exploits instructions beyond |
18 | | // our common baseline SSE2/NEON instruction sets, so that's |
19 | | // all that lives here. |
20 | | // |
21 | | // The rest are scattershot at the moment but I want to get them |
22 | | // all migrated to be normal code inside SkBitmapProcState.cpp. |
23 | | |
24 | | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
25 | | #include <immintrin.h> |
26 | | #elif defined(SK_ARM_HAS_NEON) |
27 | | #include <arm_neon.h> |
28 | | #endif |
29 | | |
30 | | namespace SK_OPTS_NS { |
31 | | |
32 | | // This same basic packing scheme is used throughout the file. |
template <typename U32, typename Out>
static void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) {
    // Each packed entry carries two integer coordinates and one 4-bit lerp weight:
    //   bits [31:18] -> v0  (integer coordinate x0 or y0)
    //   bits [17:14] -> w   (weight applied to v1; v0's weight is 16-w)
    //   bits [13: 0] -> v1  (integer coordinate x1 or y1)
    *w  = (packed >> 14) & 0xf;
    *v1 = (packed & 0x3fff);
    *v0 = (packed >> 18);
}
39 | | |
40 | | #if 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 |
// AVX2 bilinear sampler: emits 8 output pixels per main-loop iteration using
// vpgatherdd loads and _mm256_maddubs_epi16 lerps, with a masked load/store
// tail for the final 1-7 pixels.
/*not static*/ inline
void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                             const uint32_t* xy, int count, uint32_t* colors) {
    SkASSERT(count > 0 && colors != nullptr);
    SkASSERT(s.fBilerp);
    SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
    SkASSERT(s.fAlphaScale <= 256);

    // In a _DX variant only X varies; all samples share y0/y1 coordinates and wy weight.
    int y0, y1, wy;
    decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

    const uint32_t* row0 = s.fPixmap.addr32(0,y0);
    const uint32_t* row1 = s.fPixmap.addr32(0,y1);

    // Bilerp 8 packed x-coordinate/weight entries into 8 output pixels.
    auto bilerp = [&](skvx::Vec<8,uint32_t> packed_x_coordinates) -> skvx::Vec<8,uint32_t> {
        // Decode up to 8 output pixels' x-coordinates and weights.
        skvx::Vec<8,uint32_t> x0,x1,wx;
        decode_packed_coordinates_and_weight(packed_x_coordinates, &x0, &x1, &wx);

        // Splat wx to each color channel.
        wx = (wx <<  0)
           | (wx <<  8)
           | (wx << 16)
           | (wx << 24);

        auto gather = [](const uint32_t* ptr, skvx::Vec<8,uint32_t> ix) {
        #if 1
            // Drop into AVX2 intrinsics for vpgatherdd.
            return skvx::bit_pun<skvx::Vec<8,uint32_t>>(
                    _mm256_i32gather_epi32((const int*)ptr, skvx::bit_pun<__m256i>(ix), 4));
        #else
            // Portable version... sometimes I don't trust vpgatherdd.
            return skvx::Vec<8,uint32_t>{
                ptr[ix[0]], ptr[ix[1]], ptr[ix[2]], ptr[ix[3]],
                ptr[ix[4]], ptr[ix[5]], ptr[ix[6]], ptr[ix[7]],
            };
        #endif
        };

        // Gather the 32 32-bit pixels that we'll bilerp into our 8 output pixels.
        skvx::Vec<8,uint32_t> tl = gather(row0, x0), tr = gather(row0, x1),
                              bl = gather(row1, x0), br = gather(row1, x1);

    #if 1
        // We'll use _mm256_maddubs_epi16() to lerp much like in the SSSE3 code.
        auto lerp_x = [&](skvx::Vec<8,uint32_t> L, skvx::Vec<8,uint32_t> R) {
            __m256i l = skvx::bit_pun<__m256i>(L),
                    r = skvx::bit_pun<__m256i>(R),
                   wr = skvx::bit_pun<__m256i>(wx),
                   wl = _mm256_sub_epi8(_mm256_set1_epi8(16), wr);

            // Interlace l,r bytewise and line them up with their weights, then lerp.
            __m256i lo = _mm256_maddubs_epi16(_mm256_unpacklo_epi8( l, r),
                                              _mm256_unpacklo_epi8(wl,wr));
            __m256i hi = _mm256_maddubs_epi16(_mm256_unpackhi_epi8( l, r),
                                              _mm256_unpackhi_epi8(wl,wr));

            // Those _mm256_unpack??_epi8() calls left us in a bit of an odd order:
            //
            //    if   l = a b c d | e f g h
            //    and  r = A B C D | E F G H
            //
            //    then lo = a A b B | e E f F   (low  half of each input)
            //    and  hi = c C d D | g G h H   (high half of each input)
            //
            // To get everything back in original order we need to transpose that.
            __m256i abcd = _mm256_permute2x128_si256(lo, hi, 0x20),
                    efgh = _mm256_permute2x128_si256(lo, hi, 0x31);

            return skvx::join(skvx::bit_pun<skvx::Vec<16,uint16_t>>(abcd),
                              skvx::bit_pun<skvx::Vec<16,uint16_t>>(efgh));
        };

        // Lerp in Y as 16*top + (bot-top)*wy, saving a multiply vs. the naive form.
        skvx::Vec<32, uint16_t> top = lerp_x(tl, tr),
                                bot = lerp_x(bl, br),
                                sum = 16*top + (bot-top)*wy;
    #else
        // Treat 32-bit pixels as 4 8-bit values, and expand to 16-bit for room to multiply.
        auto to_16x4 = [](auto v) -> skvx::Vec<32, uint16_t> {
            return skvx::cast<uint16_t>(skvx::bit_pun<skvx::Vec<32, uint8_t>>(v));
        };

        // Sum up weighted sample pixels.  The naive, redundant math would be,
        //
        //    sum = tl * (16-wy) * (16-wx)
        //        + bl * (   wy) * (16-wx)
        //        + tr * (16-wy) * (   wx)
        //        + br * (   wy) * (   wx)
        //
        // But we refactor to eliminate a bunch of those common factors.
        auto lerp = [](auto lo, auto hi, auto w) {
            return 16*lo + (hi-lo)*w;
        };
        skvx::Vec<32, uint16_t> sum = lerp(lerp(to_16x4(tl), to_16x4(bl), wy),
                                           lerp(to_16x4(tr), to_16x4(br), wy), to_16x4(wx));
    #endif

        // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
        sum >>= 8;

        // Scale by alpha if needed.
        if(s.fAlphaScale < 256) {
            sum *= s.fAlphaScale;
            sum >>= 8;
        }

        // Pack back to 8-bit channels, undoing to_16x4().
        return skvx::bit_pun<skvx::Vec<8,uint32_t>>(skvx::cast<uint8_t>(sum));
    };

    while (count >= 8) {
        bilerp(skvx::Vec<8,uint32_t>::Load(xy)).store(colors);
        xy     += 8;
        colors += 8;
        count  -= 8;
    }
    if (count > 0) {
        // Tail: mask off the lanes beyond `count` for both the load and the store.
        __m256i active = skvx::bit_pun<__m256i>( count > skvx::Vec<8,int>{0,1,2,3, 4,5,6,7} ),
                coords = _mm256_maskload_epi32((const int*)xy, active),
                pixels;

        bilerp(skvx::bit_pun<skvx::Vec<8,uint32_t>>(coords)).store(&pixels);
        _mm256_maskstore_epi32((int*)colors, active, pixels);

        sk_msan_mark_initialized(colors, colors+count,
                                 "MSAN still doesn't understand AVX2 mask loads and stores.");
    }
}
170 | | |
171 | | #elif 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
172 | | |
// SSSE3 bilinear sampler: emits 4 output pixels per main-loop iteration,
// using _mm_maddubs_epi16 to lerp two pixels at a time in X.
/*not static*/ inline
void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                             const uint32_t* xy, int count, uint32_t* colors) {
    SkASSERT(count > 0 && colors != nullptr);
    SkASSERT(s.fBilerp);
    SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
    SkASSERT(s.fAlphaScale <= 256);

    // interpolate_in_x() is the crux of the SSSE3 implementation,
    // interpolating in X for up to two output pixels (A and B) using _mm_maddubs_epi16().
    auto interpolate_in_x = [](uint32_t A0, uint32_t A1,
                               uint32_t B0, uint32_t B1,
                               __m128i interlaced_x_weights) {
        // _mm_maddubs_epi16() is a little idiosyncratic, but great as the core of a lerp.
        //
        // It takes two arguments interlaced byte-wise:
        //    - first  arg: [ l,r, ... 7 more pairs of unsigned 8-bit values ...]
        //    - second arg: [ w,W, ... 7 more pairs of signed   8-bit values ...]
        // and returns 8 signed 16-bit values: [ l*w + r*W, ... 7 more ... ].
        //
        // That's why we go to all this trouble to make interlaced_x_weights,
        // and here we're about to interlace A0 with A1 and B0 with B1 to match.
        //
        // Our interlaced_x_weights are all in [0,16], and so we need not worry about
        // the signedness of that input nor about the signedness of the output.

        __m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)),
                interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1));

        return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B),
                                 interlaced_x_weights);
    };

    // Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B.
    // Returns two pixels, with each color channel in a 16-bit lane of the __m128i.
    auto interpolate_in_x_and_y = [&](uint32_t A0, uint32_t A1,
                                      uint32_t A2, uint32_t A3,
                                      uint32_t B0, uint32_t B1,
                                      uint32_t B2, uint32_t B3,
                                      __m128i interlaced_x_weights,
                                      int wy) {
        // Interpolate each row in X, leaving 16-bit lanes scaled by interlaced_x_weights.
        __m128i top = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights),
                bot = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);

        // Interpolate in Y.  As in the SSE2 code, we calculate top*(16-wy) + bot*wy
        // as 16*top + (bot-top)*wy to save a multiply.
        __m128i px = _mm_add_epi16(_mm_slli_epi16(top, 4),
                                   _mm_mullo_epi16(_mm_sub_epi16(bot, top),
                                                   _mm_set1_epi16(wy)));

        // Scale down by total max weight 16x16 = 256.
        px = _mm_srli_epi16(px, 8);

        // Scale by alpha if needed.
        if (s.fAlphaScale < 256) {
            px = _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(s.fAlphaScale)), 8);
        }
        return px;
    };

    // We're in _DX mode here, so we're only varying in X.
    // That means the first entry of xy is our constant pair of Y coordinates and weight in Y.
    // All the other entries in xy will be pairs of X coordinates and the X weight.
    int y0, y1, wy;
    decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

    auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),
         row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());

    while (count >= 4) {
        // We can really get going, loading 4 X-pairs at a time to produce 4 output pixels.
        int x0[4],
            x1[4];
        __m128i wx;

        // decode_packed_coordinates_and_weight(), 4x.
        __m128i packed = _mm_loadu_si128((const __m128i*)xy);
        _mm_storeu_si128((__m128i*)x0, _mm_srli_epi32(packed, 18));
        _mm_storeu_si128((__m128i*)x1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff)));
        wx = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf));  // [0,15]

        // Splat each x weight 4x (for each color channel) as wr for pixels on the right at x1,
        // and sixteen minus that as wl for pixels on the left at x0.
        __m128i wr = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)),
                wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);

        // We need to interlace wl and wr for _mm_maddubs_epi16().
        __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wl,wr),
                interlaced_x_weights_CD = _mm_unpackhi_epi8(wl,wr);

        enum { A,B,C,D };

        // interpolate_in_x_and_y() can produce two output pixels (A and B) at a time
        // from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each.
        __m128i AB = interpolate_in_x_and_y(row0[x0[A]], row0[x1[A]],
                                            row1[x0[A]], row1[x1[A]],
                                            row0[x0[B]], row0[x1[B]],
                                            row1[x0[B]], row1[x1[B]],
                                            interlaced_x_weights_AB, wy);

        // Once more with the other half of the x-weights for two more pixels C,D.
        __m128i CD = interpolate_in_x_and_y(row0[x0[C]], row0[x1[C]],
                                            row1[x0[C]], row1[x1[C]],
                                            row0[x0[D]], row0[x1[D]],
                                            row1[x0[D]], row1[x1[D]],
                                            interlaced_x_weights_CD, wy);

        // Scale by alpha, pack back together to 8-bit lanes, and write out four pixels!
        _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(AB, CD));
        xy     += 4;
        colors += 4;
        count  -= 4;
    }

    while (count --> 0) {
        // This is exactly the same flow as the count >= 4 loop above, but writing one pixel.
        int x0, x1, wx;
        decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

        // As above, splat out wx four times as wr, and sixteen minus that as wl.
        __m128i wr = _mm_set1_epi8(wx),   // This splats it out 16 times, but that's fine.
                wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);

        __m128i interlaced_x_weights = _mm_unpacklo_epi8(wl, wr);

        // The B pixel slot is fed zeroes; only pixel A's result is kept below.
        __m128i A = interpolate_in_x_and_y(row0[x0], row0[x1],
                                           row1[x0], row1[x1],
                                           0, 0,
                                           0, 0,
                                           interlaced_x_weights, wy);

        *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(A, _mm_setzero_si128()));
    }
}
308 | | |
309 | | |
310 | | #elif 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
311 | | |
// SSE2 bilinear sampler: one output pixel per iteration.  xy[0] packs the
// shared y0/y1 rows and Y weight; each remaining entry packs an x0/x1 pair
// with its X weight.  Results are scaled by s.fAlphaScale when it is < 256.
/*not static*/ inline
void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                             const uint32_t* xy, int count, uint32_t* colors) {
    SkASSERT(count > 0 && colors != nullptr);
    SkASSERT(s.fBilerp);
    SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
    SkASSERT(s.fAlphaScale <= 256);

    int y0, y1, wy;
    decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

    auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
         row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );

    // We'll put one pixel in the low 4 16-bit lanes to line up with wy,
    // and another in the upper 4 16-bit lanes to line up with 16 - wy.
    const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16(   wy),   // Bottom pixel goes here.
                                            _mm_set1_epi16(16-wy));  // Top pixel goes here.

    while (count --> 0) {
        int x0, x1, wx;
        decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

        // Load the 4 pixels we're interpolating, in this grid:
        //    | tl  tr |
        //    | bl  br |
        const __m128i tl = _mm_cvtsi32_si128(row0[x0]), tr = _mm_cvtsi32_si128(row0[x1]),
                      bl = _mm_cvtsi32_si128(row1[x0]), br = _mm_cvtsi32_si128(row1[x1]);

        // We want to calculate a sum of 4 pixels weighted in two directions:
        //
        //    sum = tl * (16-wy) * (16-wx)
        //        + bl * (   wy) * (16-wx)
        //        + tr * (16-wy) * (   wx)
        //        + br * (   wy) * (   wx)
        //
        // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
        //
        // We've already prepared allY as a vector containing [wy, 16-wy] as a way
        // to apply those y-direction weights.  So we'll start on the x-direction
        // first, grouping into left and right halves, lined up with allY:
        //
        //     L = [bl, tl]
        //     R = [br, tr]
        //
        //   sum = horizontalSum( allY * (L*(16-wx) + R*wx) )
        //
        // Rewriting that one more step, we can replace a multiply with a shift:
        //
        //   sum = horizontalSum( allY * (16*L + (R-L)*wx) )
        //
        // That's how we'll actually do this math.

        __m128i L = _mm_unpacklo_epi8(_mm_unpacklo_epi32(bl, tl), _mm_setzero_si128()),
                R = _mm_unpacklo_epi8(_mm_unpacklo_epi32(br, tr), _mm_setzero_si128());

        __m128i inner = _mm_add_epi16(_mm_slli_epi16(L, 4),
                                      _mm_mullo_epi16(_mm_sub_epi16(R,L), _mm_set1_epi16(wx)));

        __m128i sum_in_x = _mm_mullo_epi16(inner, allY);

        // sum = horizontalSum( ... ), folding the high 8 bytes onto the low 8.
        __m128i sum = _mm_add_epi16(sum_in_x, _mm_srli_si128(sum_in_x, 8));

        // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
        sum = _mm_srli_epi16(sum, 8);

        if (s.fAlphaScale < 256) {
            // Scale by alpha, which is in [0,256].
            sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale));
            sum = _mm_srli_epi16(sum, 8);
        }

        // Pack back into 8-bit values and store.
        *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128()));
    }
}
389 | | |
390 | | #else |
391 | | |
392 | | // The NEON code only actually differs from the portable code in the |
393 | | // filtering step after we've loaded all four pixels we want to bilerp. |
394 | | |
395 | | #if defined(SK_ARM_HAS_NEON) |
// NEON bilinear filter for a single output pixel: blends the 2x2 neighborhood
// {a00,a01,a10,a11} with 4-bit weights x (horizontal) and y (vertical), then
// applies `scale` when it is < 256, storing the result to *dst.
static void filter_and_scale_by_alpha(unsigned x, unsigned y,
                                      SkPMColor a00, SkPMColor a01,
                                      SkPMColor a10, SkPMColor a11,
                                      SkPMColor *dst,
                                      uint16_t scale) {
    uint8x8_t vy, vconst16_8, v16_y, vres;
    uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
    uint32x2_t va0, va1;
    uint16x8_t tmp1, tmp2;

    vy = vdup_n_u8(y);                // duplicate y into vy
    vconst16_8 = vmov_n_u8(16);       // set up constant in vconst16_8
    v16_y = vsub_u8(vconst16_8, vy);  // v16_y = 16-y

    va0 = vdup_n_u32(a00);            // duplicate a00
    va1 = vdup_n_u32(a10);            // duplicate a10
    va0 = vset_lane_u32(a01, va0, 1); // set top to a01
    va1 = vset_lane_u32(a11, va1, 1); // set top to a11

    // Widening multiplies: each 8-bit channel * 8-bit weight -> 16-bit lane.
    tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
    tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);    // tmp2 = [a11|a10] * y

    vx = vdup_n_u16(x);                // duplicate x into vx
    vconst16_16 = vmov_n_u16(16);      // set up constant in vconst16_16
    v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x

    // Accumulate the X lerp into tmp with multiply-accumulates.
    tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp  = a01 * x
    tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += a11 * x
    tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x)
    tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x)

    if (scale < 256) {
        vscale = vdup_n_u16(scale);        // duplicate scale
        tmp = vshr_n_u16(tmp, 8);          // shift down result by 8
        tmp = vmul_u16(tmp, vscale);       // multiply result by scale
    }

    // Narrowing shift undoes the remaining factor-of-256 weight (16x16).
    vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16((uint64_t)0)), 8); // shift down result by 8
    vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);                   // store result
}
436 | | #else |
437 | | static void filter_and_scale_by_alpha(unsigned x, unsigned y, |
438 | | SkPMColor a00, SkPMColor a01, |
439 | | SkPMColor a10, SkPMColor a11, |
440 | | SkPMColor* dstColor, |
441 | | unsigned alphaScale) { |
442 | | SkASSERT((unsigned)x <= 0xF); |
443 | | SkASSERT((unsigned)y <= 0xF); |
444 | | SkASSERT(alphaScale <= 256); |
445 | | |
446 | | int xy = x * y; |
447 | | const uint32_t mask = 0xFF00FF; |
448 | | |
449 | | int scale = 256 - 16*y - 16*x + xy; |
450 | | uint32_t lo = (a00 & mask) * scale; |
451 | | uint32_t hi = ((a00 >> 8) & mask) * scale; |
452 | | |
453 | | scale = 16*x - xy; |
454 | | lo += (a01 & mask) * scale; |
455 | | hi += ((a01 >> 8) & mask) * scale; |
456 | | |
457 | | scale = 16*y - xy; |
458 | | lo += (a10 & mask) * scale; |
459 | | hi += ((a10 >> 8) & mask) * scale; |
460 | | |
461 | | lo += (a11 & mask) * xy; |
462 | | hi += ((a11 >> 8) & mask) * xy; |
463 | | |
464 | | if (alphaScale < 256) { |
465 | | lo = ((lo >> 8) & mask) * alphaScale; |
466 | | hi = ((hi >> 8) & mask) * alphaScale; |
467 | | } |
468 | | |
469 | | *dstColor = ((lo >> 8) & mask) | (hi & ~mask); |
470 | | } |
471 | | #endif |
472 | | |
473 | | |
474 | | /*not static*/ inline |
475 | | void S32_alpha_D32_filter_DX(const SkBitmapProcState& s, |
476 | | const uint32_t* xy, int count, SkPMColor* colors) { |
477 | | SkASSERT(count > 0 && colors != nullptr); |
478 | | SkASSERT(s.fBilerp); |
479 | | SkASSERT(4 == s.fPixmap.info().bytesPerPixel()); |
480 | | SkASSERT(s.fAlphaScale <= 256); |
481 | | |
482 | | int y0, y1, wy; |
483 | | decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy); |
484 | | |
485 | | auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ), |
486 | | row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() ); |
487 | | |
488 | | while (count --> 0) { |
489 | | int x0, x1, wx; |
490 | | decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx); |
491 | | |
492 | | filter_and_scale_by_alpha(wx, wy, |
493 | | row0[x0], row0[x1], |
494 | | row1[x0], row1[x1], |
495 | | colors++, |
496 | | s.fAlphaScale); |
497 | | } |
498 | | } |
499 | | |
500 | | #endif |
501 | | |
502 | | #if defined(SK_ARM_HAS_NEON) |
503 | | /*not static*/ inline |
504 | | void S32_alpha_D32_filter_DXDY(const SkBitmapProcState& s, |
505 | | const uint32_t* xy, int count, SkPMColor* colors) { |
506 | | SkASSERT(count > 0 && colors != nullptr); |
507 | | SkASSERT(s.fBilerp); |
508 | | SkASSERT(4 == s.fPixmap.info().bytesPerPixel()); |
509 | | SkASSERT(s.fAlphaScale <= 256); |
510 | | |
511 | | auto src = (const char*)s.fPixmap.addr(); |
512 | | size_t rb = s.fPixmap.rowBytes(); |
513 | | |
514 | | while (count --> 0) { |
515 | | int y0, y1, wy, |
516 | | x0, x1, wx; |
517 | | decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy); |
518 | | decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx); |
519 | | |
520 | | auto row0 = (const uint32_t*)(src + y0*rb), |
521 | | row1 = (const uint32_t*)(src + y1*rb); |
522 | | |
523 | | filter_and_scale_by_alpha(wx, wy, |
524 | | row0[x0], row0[x1], |
525 | | row1[x0], row1[x1], |
526 | | colors++, |
527 | | s.fAlphaScale); |
528 | | } |
529 | | } |
530 | | #else |
// It's not yet clear whether it's worthwhile specializing for SSE2/SSSE3/AVX2.
// NOTE(review): a null proc here presumably makes the caller fall back to a
// portable implementation — confirm against SkBitmapProcState.cpp.
constexpr static void (*S32_alpha_D32_filter_DXDY)(const SkBitmapProcState&,
                                                   const uint32_t*, int, SkPMColor*) = nullptr;
534 | | #endif |
535 | | |
536 | | } // namespace SK_OPTS_NS |
537 | | |
538 | | #endif |