/src/skia/src/opts/SkBitmapProcState_opts.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright 2018 Google Inc. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license that can be |
5 | | * found in the LICENSE file. |
6 | | */ |
7 | | |
8 | | #ifndef SkBitmapProcState_opts_DEFINED |
9 | | #define SkBitmapProcState_opts_DEFINED |
10 | | |
11 | | #include "src/base/SkMSAN.h" |
12 | | #include "src/base/SkVx.h" |
13 | | #include "src/core/SkBitmapProcState.h" |
14 | | |
15 | | // SkBitmapProcState optimized Shader, Sample, or Matrix procs. |
16 | | // |
17 | | // Only S32_alpha_D32_filter_DX exploits instructions beyond |
18 | | // our common baseline SSE2/NEON instruction sets, so that's |
19 | | // all that lives here. |
20 | | // |
21 | | // The rest are scattershot at the moment but I want to get them |
22 | | // all migrated to be normal code inside SkBitmapProcState.cpp. |
23 | | |
24 | | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
25 | | #include <immintrin.h> |
26 | | #elif defined(SK_ARM_HAS_NEON) |
27 | | #include <arm_neon.h> |
28 | | #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX |
29 | | #include <lasxintrin.h> |
30 | | #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX |
31 | | #include <lsxintrin.h> |
32 | | #endif |
33 | | |
34 | | namespace SK_OPTS_NS { |
35 | | |
36 | | // Packing scheme shared by every proc in this file: one word holds two integer coordinates and a 4-bit lerp weight. |
37 | | template <typename U32, typename Out> |
38 | 2.82M | static void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) { |
39 | 2.82M | *v0 = (packed >> 18); // Bits 18 and up: integer coordinate x0 or y0. |
40 | 2.82M | *v1 = (packed & 0x3fff); // Low 14 bits: integer coordinate x1 or y1. |
41 | 2.82M | *w = (packed >> 14) & 0xf; // Bits 14..17: lerp weight for v1; weight for v0 is 16-w. |
42 | 2.82M | } SkBitmapProcState_opts.cpp:void sse2::decode_packed_coordinates_and_weight<unsigned int, int>(unsigned int, int*, int*, int*) Line | Count | Source | 38 | 2.82M | static void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) { | 39 | 2.82M | *v0 = (packed >> 18); // Integer coordinate x0 or y0. | 40 | 2.82M | *v1 = (packed & 0x3fff); // Integer coordinate x1 or y1. | 41 | 2.82M | *w = (packed >> 14) & 0xf; // Lerp weight for v1; weight for v0 is 16-w. | 42 | 2.82M | } |
Unexecuted instantiation: SkBitmapProcState_opts_ssse3.cpp:void ssse3::decode_packed_coordinates_and_weight<unsigned int, int>(unsigned int, int*, int*, int*) |
43 | | |
44 | | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
45 | | |
// SSSE3 version: bilinearly filters `count` 32-bit pixels for one run of output pixels.
// xy[0] packs the row pair (y0, y1) and the Y weight; each following word packs (x0, x1) and
// an X weight.  Results, scaled by s.fAlphaScale when that is below 256, are written to colors.
46 | | /*not static*/ inline |
47 | | void S32_alpha_D32_filter_DX(const SkBitmapProcState& s, |
48 | 0 | const uint32_t* xy, int count, uint32_t* colors) { |
49 | 0 | SkASSERT(count > 0 && colors != nullptr); |
50 | 0 | SkASSERT(s.fBilerp); |
51 | 0 | SkASSERT(kN32_SkColorType == s.fPixmap.colorType()); |
52 | 0 | SkASSERT(s.fAlphaScale <= 256); |
53 | | |
54 | | // interpolate_in_x() is the crux of the SSSE3 implementation, |
55 | | // interpolating in X for up to two output pixels (A and B) using _mm_maddubs_epi16(). |
56 | 0 | auto interpolate_in_x = [](uint32_t A0, uint32_t A1, |
57 | 0 | uint32_t B0, uint32_t B1, |
58 | 0 | __m128i interlaced_x_weights) { |
59 | | // _mm_maddubs_epi16() is a little idiosyncratic, but great as the core of a lerp. |
60 | | // |
61 | | // It takes two arguments interlaced byte-wise: |
62 | | // - first arg: [ l,r, ... 7 more pairs of unsigned 8-bit values ...] |
63 | | // - second arg: [ w,W, ... 7 more pairs of signed 8-bit values ...] |
64 | | // and returns 8 signed 16-bit values: [ l*w + r*W, ... 7 more ... ]. |
65 | | // |
66 | | // That's why we go to all this trouble to make interlaced_x_weights, |
67 | | // and here we're about to interlace A0 with A1 and B0 with B1 to match. |
68 | | // |
69 | | // Our interlaced_x_weights are all in [0,16], and so we need not worry about |
70 | | // the signedness of that input nor about the signedness of the output. |
71 | |

72 | 0 | __m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)), |
73 | 0 | interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1)); |
74 | |

75 | 0 | return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B), |
76 | 0 | interlaced_x_weights); |
77 | 0 | }; |
78 | | |
79 | | // Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B. |
80 | | // Returns two pixels, with each color channel in a 16-bit lane of the __m128i. |
81 | 0 | auto interpolate_in_x_and_y = [&](uint32_t A0, uint32_t A1, |
82 | 0 | uint32_t A2, uint32_t A3, |
83 | 0 | uint32_t B0, uint32_t B1, |
84 | 0 | uint32_t B2, uint32_t B3, |
85 | 0 | __m128i interlaced_x_weights, |
86 | 0 | int wy) { |
87 | | // Interpolate each row in X, leaving 16-bit lanes scaled by interlaced_x_weights. |
88 | 0 | __m128i top = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights), |
89 | 0 | bot = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights); |
90 | | |
91 | | // Interpolate in Y. As in the SSE2 code, we calculate top*(16-wy) + bot*wy |
92 | | // as 16*top + (bot-top)*wy to save a multiply. |
93 | 0 | __m128i px = _mm_add_epi16(_mm_slli_epi16(top, 4), |
94 | 0 | _mm_mullo_epi16(_mm_sub_epi16(bot, top), |
95 | 0 | _mm_set1_epi16(wy))); |
96 | | |
97 | | // Scale down by total max weight 16x16 = 256. |
98 | 0 | px = _mm_srli_epi16(px, 8); |
99 | | |
100 | | // Scale by alpha if needed. |
101 | 0 | if (s.fAlphaScale < 256) { |
102 | 0 | px = _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(s.fAlphaScale)), 8); |
103 | 0 | } |
104 | 0 | return px; |
105 | 0 | }; |
106 | | |
107 | | // We're in _DX mode here, so we're only varying in X. |
108 | | // That means the first entry of xy is our constant pair of Y coordinates and weight in Y. |
109 | | // All the other entries in xy will be pairs of X coordinates and the X weight. |
110 | 0 | int y0, y1, wy; |
111 | 0 | decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy); |
112 | |

113 | 0 | auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()), |
114 | 0 | row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes()); |
115 | |

116 | 0 | while (count >= 4) { |
117 | | // We can really get going, loading 4 X-pairs at a time to produce 4 output pixels. |
118 | 0 | int x0[4], |
119 | 0 | x1[4]; |
120 | 0 | __m128i wx; |
121 | | |
122 | | // decode_packed_coordinates_and_weight(), 4x. |
123 | 0 | __m128i packed = _mm_loadu_si128((const __m128i*)xy); |
124 | 0 | _mm_storeu_si128((__m128i*)x0, _mm_srli_epi32(packed, 18)); |
125 | 0 | _mm_storeu_si128((__m128i*)x1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff))); |
126 | 0 | wx = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf)); // [0,15] |
127 | | |
128 | | // Splat each x weight 4x (for each color channel) as wr for pixels on the right at x1, |
129 | | // and sixteen minus that as wl for pixels on the left at x0. |
130 | 0 | __m128i wr = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)), |
131 | 0 | wl = _mm_sub_epi8(_mm_set1_epi8(16), wr); |
132 | | |
133 | | // We need to interlace wl and wr for _mm_maddubs_epi16(). |
134 | 0 | __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wl,wr), |
135 | 0 | interlaced_x_weights_CD = _mm_unpackhi_epi8(wl,wr); |
136 | |

137 | 0 | enum { A,B,C,D }; |
138 | | |
139 | | // interpolate_in_x_and_y() can produce two output pixels (A and B) at a time |
140 | | // from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each. |
141 | 0 | __m128i AB = interpolate_in_x_and_y(row0[x0[A]], row0[x1[A]], |
142 | 0 | row1[x0[A]], row1[x1[A]], |
143 | 0 | row0[x0[B]], row0[x1[B]], |
144 | 0 | row1[x0[B]], row1[x1[B]], |
145 | 0 | interlaced_x_weights_AB, wy); |
146 | | |
147 | | // Once more with the other half of the x-weights for two more pixels C,D. |
148 | 0 | __m128i CD = interpolate_in_x_and_y(row0[x0[C]], row0[x1[C]], |
149 | 0 | row1[x0[C]], row1[x1[C]], |
150 | 0 | row0[x0[D]], row0[x1[D]], |
151 | 0 | row1[x0[D]], row1[x1[D]], |
152 | 0 | interlaced_x_weights_CD, wy); |
153 | | |
154 | | // Alpha was already applied inside interpolate_in_x_and_y(); pack back to 8-bit lanes and write out four pixels! |
155 | 0 | _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(AB, CD)); |
156 | 0 | xy += 4; |
157 | 0 | colors += 4; |
158 | 0 | count -= 4; |
159 | 0 | } |
160 | |

161 | 0 | while (count --> 0) { |
162 | | // This is exactly the same flow as the count >= 4 loop above, but writing one pixel. |
163 | 0 | int x0, x1, wx; |
164 | 0 | decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx); |
165 | | |
166 | | // As above, splat out wx four times as wr, and sixteen minus that as wl. |
167 | 0 | __m128i wr = _mm_set1_epi8(wx), // This splats it out 16 times, but that's fine. |
168 | 0 | wl = _mm_sub_epi8(_mm_set1_epi8(16), wr); |
169 | |

170 | 0 | __m128i interlaced_x_weights = _mm_unpacklo_epi8(wl, wr); |
171 | |

172 | 0 | __m128i A = interpolate_in_x_and_y(row0[x0], row0[x1], |
173 | 0 | row1[x0], row1[x1], |
174 | 0 | 0, 0, |
175 | 0 | 0, 0, |
176 | 0 | interlaced_x_weights, wy); |
177 | |

178 | 0 | *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(A, _mm_setzero_si128())); |
179 | 0 | } |
180 | 0 | } Unexecuted instantiation: ssse3::S32_alpha_D32_filter_DX(SkBitmapProcState const&, unsigned int const*, int, unsigned int*) Unexecuted instantiation: ssse3::S32_alpha_D32_filter_DX(SkBitmapProcState const&, unsigned int const*, int, unsigned int*) |
181 | | |
182 | | |
183 | | #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
184 | | |
// SSE2 version: bilinearly filters `count` 32-bit pixels, one output pixel per loop iteration.
// xy[0] packs the row pair (y0, y1) and the Y weight; each following word packs (x0, x1) and
// an X weight.  Results, scaled by s.fAlphaScale when that is below 256, are written to colors.
185 | | /*not static*/ inline |
186 | | void S32_alpha_D32_filter_DX(const SkBitmapProcState& s, |
187 | 80.3k | const uint32_t* xy, int count, uint32_t* colors) { |
188 | 80.3k | SkASSERT(count > 0 && colors != nullptr); |
189 | 80.3k | SkASSERT(s.fBilerp); |
190 | 80.3k | SkASSERT(kN32_SkColorType == s.fPixmap.colorType()); |
191 | 80.3k | SkASSERT(s.fAlphaScale <= 256); |
192 | | |
193 | 80.3k | int y0, y1, wy; |
194 | 80.3k | decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy); |
195 | | |
196 | 80.3k | auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ), |
197 | 80.3k | row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() ); |
198 | | |
199 | | // We'll put one pixel in the low 4 16-bit lanes to line up with wy, |
200 | | // and another in the upper 4 16-bit lanes to line up with 16 - wy. |
201 | 80.3k | const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16( wy), // Bottom pixel goes here. |
202 | 80.3k | _mm_set1_epi16(16-wy)); // Top pixel goes here. |
203 | | |
204 | 2.82M | while (count --> 0) { |
205 | 2.74M | int x0, x1, wx; |
206 | 2.74M | decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx); |
207 | | |
208 | | // Load the 4 pixels we're interpolating, in this grid: |
209 | | // | tl tr | |
210 | | // | bl br | |
211 | 2.74M | const __m128i tl = _mm_cvtsi32_si128(row0[x0]), tr = _mm_cvtsi32_si128(row0[x1]), |
212 | 2.74M | bl = _mm_cvtsi32_si128(row1[x0]), br = _mm_cvtsi32_si128(row1[x1]); |
213 | | |
214 | | // We want to calculate a sum of 4 pixels weighted in two directions: |
215 | | // |
216 | | // sum = tl * (16-wy) * (16-wx) |
217 | | // + bl * ( wy) * (16-wx) |
218 | | // + tr * (16-wy) * ( wx) |
219 | | // + br * ( wy) * ( wx) |
220 | | // |
221 | | // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.) |
222 | | // |
223 | | // We've already prepared allY as a vector containing [wy, 16-wy] as a way |
224 | | // to apply those y-direction weights. So we'll start on the x-direction |
225 | | // first, grouping into left and right halves, lined up with allY: |
226 | | // |
227 | | // L = [bl, tl] |
228 | | // R = [br, tr] |
229 | | // |
230 | | // sum = horizontalSum( allY * (L*(16-wx) + R*wx) ) |
231 | | // |
232 | | // Rewriting that one more step, we can replace a multiply with a shift: |
233 | | // |
234 | | // sum = horizontalSum( allY * (16*L + (R-L)*wx) ) |
235 | | // |
236 | | // That's how we'll actually do this math. |
237 | | |
238 | 2.74M | __m128i L = _mm_unpacklo_epi8(_mm_unpacklo_epi32(bl, tl), _mm_setzero_si128()), |
239 | 2.74M | R = _mm_unpacklo_epi8(_mm_unpacklo_epi32(br, tr), _mm_setzero_si128()); |
240 | | |
241 | 2.74M | __m128i inner = _mm_add_epi16(_mm_slli_epi16(L, 4), |
242 | 2.74M | _mm_mullo_epi16(_mm_sub_epi16(R,L), _mm_set1_epi16(wx))); |
243 | | |
244 | 2.74M | __m128i sum_in_x = _mm_mullo_epi16(inner, allY); |
245 | | |
246 | | // sum = horizontalSum( ... ): shift the upper 8 bytes (top row) down and add them onto the lower 8 bytes (bottom row). |
247 | 2.74M | __m128i sum = _mm_add_epi16(sum_in_x, _mm_srli_si128(sum_in_x, 8)); |
248 | | |
249 | | // Get back to [0,255] by dividing by maximum weight 16x16 = 256. |
250 | 2.74M | sum = _mm_srli_epi16(sum, 8); |
251 | | |
252 | 2.74M | if (s.fAlphaScale < 256) { |
253 | | // Scale by alpha, which is in [0,256]. |
254 | 0 | sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale)); |
255 | 0 | sum = _mm_srli_epi16(sum, 8); |
256 | 0 | } |
257 | | |
258 | | // Pack back into 8-bit values and store. |
259 | 2.74M | *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128())); |
260 | 2.74M | } |
261 | 80.3k | } sse2::S32_alpha_D32_filter_DX(SkBitmapProcState const&, unsigned int const*, int, unsigned int*) Line | Count | Source | 187 | 80.3k | const uint32_t* xy, int count, uint32_t* colors) { | 188 | 80.3k | SkASSERT(count > 0 && colors != nullptr); | 189 | 80.3k | SkASSERT(s.fBilerp); | 190 | 80.3k | SkASSERT(kN32_SkColorType == s.fPixmap.colorType()); | 191 | 80.3k | SkASSERT(s.fAlphaScale <= 256); | 192 | | | 193 | 80.3k | int y0, y1, wy; | 194 | 80.3k | decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy); | 195 | | | 196 | 80.3k | auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ), | 197 | 80.3k | row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() ); | 198 | | | 199 | | // We'll put one pixel in the low 4 16-bit lanes to line up with wy, | 200 | | // and another in the upper 4 16-bit lanes to line up with 16 - wy. | 201 | 80.3k | const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16( wy), // Bottom pixel goes here. | 202 | 80.3k | _mm_set1_epi16(16-wy)); // Top pixel goes here. | 203 | | | 204 | 2.82M | while (count --> 0) { | 205 | 2.74M | int x0, x1, wx; | 206 | 2.74M | decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx); | 207 | | | 208 | | // Load the 4 pixels we're interpolating, in this grid: | 209 | | // | tl tr | | 210 | | // | bl br | | 211 | 2.74M | const __m128i tl = _mm_cvtsi32_si128(row0[x0]), tr = _mm_cvtsi32_si128(row0[x1]), | 212 | 2.74M | bl = _mm_cvtsi32_si128(row1[x0]), br = _mm_cvtsi32_si128(row1[x1]); | 213 | | | 214 | | // We want to calculate a sum of 4 pixels weighted in two directions: | 215 | | // | 216 | | // sum = tl * (16-wy) * (16-wx) | 217 | | // + bl * ( wy) * (16-wx) | 218 | | // + tr * (16-wy) * ( wx) | 219 | | // + br * ( wy) * ( wx) | 220 | | // | 221 | | // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.) 
| 222 | | // | 223 | | // We've already prepared allY as a vector containing [wy, 16-wy] as a way | 224 | | // to apply those y-direction weights. So we'll start on the x-direction | 225 | | // first, grouping into left and right halves, lined up with allY: | 226 | | // | 227 | | // L = [bl, tl] | 228 | | // R = [br, tr] | 229 | | // | 230 | | // sum = horizontalSum( allY * (L*(16-wx) + R*wx) ) | 231 | | // | 232 | | // Rewriting that one more step, we can replace a multiply with a shift: | 233 | | // | 234 | | // sum = horizontalSum( allY * (16*L + (R-L)*wx) ) | 235 | | // | 236 | | // That's how we'll actually do this math. | 237 | | | 238 | 2.74M | __m128i L = _mm_unpacklo_epi8(_mm_unpacklo_epi32(bl, tl), _mm_setzero_si128()), | 239 | 2.74M | R = _mm_unpacklo_epi8(_mm_unpacklo_epi32(br, tr), _mm_setzero_si128()); | 240 | | | 241 | 2.74M | __m128i inner = _mm_add_epi16(_mm_slli_epi16(L, 4), | 242 | 2.74M | _mm_mullo_epi16(_mm_sub_epi16(R,L), _mm_set1_epi16(wx))); | 243 | | | 244 | 2.74M | __m128i sum_in_x = _mm_mullo_epi16(inner, allY); | 245 | | | 246 | | // sum = horizontalSum( ... ) | 247 | 2.74M | __m128i sum = _mm_add_epi16(sum_in_x, _mm_srli_si128(sum_in_x, 8)); | 248 | | | 249 | | // Get back to [0,255] by dividing by maximum weight 16x16 = 256. | 250 | 2.74M | sum = _mm_srli_epi16(sum, 8); | 251 | | | 252 | 2.74M | if (s.fAlphaScale < 256) { | 253 | | // Scale by alpha, which is in [0,256]. | 254 | 0 | sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale)); | 255 | 0 | sum = _mm_srli_epi16(sum, 8); | 256 | 0 | } | 257 | | | 258 | | // Pack back into 8-bit values and store. | 259 | 2.74M | *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128())); | 260 | 2.74M | } | 261 | 80.3k | } |
Unexecuted instantiation: sse2::S32_alpha_D32_filter_DX(SkBitmapProcState const&, unsigned int const*, int, unsigned int*) |
262 | | |
263 | | #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX |
// LASX version: bilinearly filters `count` 32-bit pixels, one output pixel per loop iteration.
// xy[0] packs the row pair (y0, y1) and the Y weight; each following word packs (x0, x1) and
// an X weight.  Results, scaled by s.fAlphaScale when that is below 256, are written to colors.
264 | | /*not static*/ inline |
265 | | void S32_alpha_D32_filter_DX(const SkBitmapProcState& s, |
266 | | const uint32_t* xy, int count, uint32_t* colors) { |
267 | | SkASSERT(count > 0 && colors != nullptr); |
268 | | SkASSERT(s.fBilerp); |
269 | | SkASSERT(kN32_SkColorType == s.fPixmap.colorType()); |
270 | | SkASSERT(s.fAlphaScale <= 256); |
271 | | |
272 | | int y0, y1, wy; |
273 | | decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy); |
274 | | |
275 | | auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ), |
276 | | row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() ); |
277 | | |
278 | | // We'll put one pixel in the low 4 16-bit lanes to line up with wy, |
279 | | // and another in the next 4 16-bit lanes to line up with 16 - wy. |
280 | | __m256i allY = __lasx_xvilvl_d(__lasx_xvreplgr2vr_h(16-wy), __lasx_xvreplgr2vr_h(wy)); |
281 | | |
282 | | while (count --> 0) { |
283 | | int x0, x1, wx; |
284 | | decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx); |
285 | | |
286 | | // Load the 4 pixels we're interpolating, in this grid: |
287 | | // | tl tr | |
288 | | // | bl br | |
289 | | |
290 | | const __m256i zeros = __lasx_xvldi(0); |
291 | | const __m256i tl = __lasx_xvinsgr2vr_w(zeros, row0[x0], 0), |
292 | | tr = __lasx_xvinsgr2vr_w(zeros, row0[x1], 0), |
293 | | bl = __lasx_xvinsgr2vr_w(zeros, row1[x0], 0), |
294 | | br = __lasx_xvinsgr2vr_w(zeros, row1[x1], 0); |
295 | | |
296 | | // We want to calculate a sum of 4 pixels weighted in two directions: |
297 | | // |
298 | | // sum = tl * (16-wy) * (16-wx) |
299 | | // + bl * ( wy) * (16-wx) |
300 | | // + tr * (16-wy) * ( wx) |
301 | | // + br * ( wy) * ( wx) |
302 | | // |
303 | | // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.) |
304 | | // |
305 | | // We've already prepared allY as a vector containing [wy, 16-wy] as a way |
306 | | // to apply those y-direction weights. So we'll start on the x-direction |
307 | | // first, grouping into left and right halves, lined up with allY: |
308 | | // |
309 | | // L = [bl, tl] |
310 | | // R = [br, tr] |
311 | | // |
312 | | // sum = horizontalSum( allY * (L*(16-wx) + R*wx) ) |
313 | | // |
314 | | // Rewriting that one more step, we can replace a multiply with a shift: |
315 | | // |
316 | | // sum = horizontalSum( allY * (16*L + (R-L)*wx) ) |
317 | | // |
318 | | // That's how we'll actually do this math. |
319 | | |
320 | | __m256i L = __lasx_xvilvl_b(__lasx_xvldi(0), __lasx_xvilvl_w(tl, bl)), |
321 | | R = __lasx_xvilvl_b(__lasx_xvldi(0), __lasx_xvilvl_w(tr, br)); |
322 | | |
323 | | __m256i inner = __lasx_xvadd_h(__lasx_xvslli_h(L, 4), |
324 | | __lasx_xvmul_h(__lasx_xvsub_h(R,L), |
325 | | __lasx_xvreplgr2vr_h(wx))); |
326 | | |
327 | | __m256i sum_in_x = __lasx_xvmul_h(inner, allY); |
328 | | |
329 | | // sum = horizontalSum( ... ) |
330 | | __m256i sum = __lasx_xvadd_h(sum_in_x, __lasx_xvbsrl_v(sum_in_x, 8)); |
331 | | |
332 | | // Get back to [0,255] by dividing by maximum weight 16x16 = 256. |
333 | | sum = __lasx_xvsrli_h(sum, 8); |
334 | | |
335 | | if (s.fAlphaScale < 256) { |
336 | | // Scale by alpha, which is in [0,256]. |
337 | | sum = __lasx_xvmul_h(sum, __lasx_xvreplgr2vr_h(s.fAlphaScale)); |
338 | | sum = __lasx_xvsrli_h(sum, 8); |
339 | | } |
340 | | |
341 | | // Pack back into 8-bit values and store. |
342 | | *colors++ = __lasx_xvpickve2gr_w(__lasx_xvpickev_b(__lasx_xvldi(0), |
343 | | __lasx_xvsat_hu(sum, 8)), 0); |
344 | | } |
345 | | } |
346 | | |
347 | | #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX |
348 | | |
// LSX version: bilinearly filters `count` 32-bit pixels, one output pixel per loop iteration.
// xy[0] packs the row pair (y0, y1) and the Y weight; each following word packs (x0, x1) and
// an X weight.  Results, scaled by s.fAlphaScale when that is below 256, are written to colors.
349 | | /*not static*/ inline |
350 | | void S32_alpha_D32_filter_DX(const SkBitmapProcState& s, |
351 | | const uint32_t* xy, int count, uint32_t* colors) { |
352 | | SkASSERT(count > 0 && colors != nullptr); |
353 | | SkASSERT(s.fBilerp); |
354 | | SkASSERT(kN32_SkColorType == s.fPixmap.colorType()); |
355 | | SkASSERT(s.fAlphaScale <= 256); |
356 | | |
357 | | int y0, y1, wy; |
358 | | decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy); |
359 | | |
360 | | auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ), |
361 | | row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() ); |
362 | | |
363 | | // We'll put one pixel in the low 4 16-bit lanes to line up with wy, |
364 | | // and another in the upper 4 16-bit lanes to line up with 16 - wy. |
365 | | __m128i allY = __lsx_vilvl_d(__lsx_vreplgr2vr_h(16-wy), __lsx_vreplgr2vr_h(wy)); |
366 | | |
367 | | while (count --> 0) { |
368 | | int x0, x1, wx; |
369 | | decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx); |
370 | | |
371 | | // Load the 4 pixels we're interpolating, in this grid: |
372 | | // | tl tr | |
373 | | // | bl br | |
374 | | const __m128i zeros = __lsx_vldi(0); |
375 | | const __m128i tl = __lsx_vinsgr2vr_w(zeros, row0[x0], 0), |
376 | | tr = __lsx_vinsgr2vr_w(zeros, row0[x1], 0), |
377 | | bl = __lsx_vinsgr2vr_w(zeros, row1[x0], 0), |
378 | | br = __lsx_vinsgr2vr_w(zeros, row1[x1], 0); |
379 | | |
380 | | // We want to calculate a sum of 4 pixels weighted in two directions: |
381 | | // |
382 | | // sum = tl * (16-wy) * (16-wx) |
383 | | // + bl * ( wy) * (16-wx) |
384 | | // + tr * (16-wy) * ( wx) |
385 | | // + br * ( wy) * ( wx) |
386 | | // |
387 | | // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.) |
388 | | // |
389 | | // We've already prepared allY as a vector containing [wy, 16-wy] as a way |
390 | | // to apply those y-direction weights. So we'll start on the x-direction |
391 | | // first, grouping into left and right halves, lined up with allY: |
392 | | // |
393 | | // L = [bl, tl] |
394 | | // R = [br, tr] |
395 | | // |
396 | | // sum = horizontalSum( allY * (L*(16-wx) + R*wx) ) |
397 | | // |
398 | | // Rewriting that one more step, we can replace a multiply with a shift: |
399 | | // |
400 | | // sum = horizontalSum( allY * (16*L + (R-L)*wx) ) |
401 | | // |
402 | | // That's how we'll actually do this math. |
403 | | |
404 | | |
405 | | __m128i L = __lsx_vilvl_b(__lsx_vldi(0), __lsx_vilvl_w(tl, bl)), |
406 | | R = __lsx_vilvl_b(__lsx_vldi(0), __lsx_vilvl_w(tr, br)); |
407 | | |
408 | | __m128i inner = __lsx_vadd_h(__lsx_vslli_h(L, 4), |
409 | | __lsx_vmul_h(__lsx_vsub_h(R,L), |
410 | | __lsx_vreplgr2vr_h(wx))); |
411 | | |
412 | | __m128i sum_in_x = __lsx_vmul_h(inner, allY); |
413 | | |
414 | | // sum = horizontalSum( ... ) |
415 | | __m128i sum = __lsx_vadd_h(sum_in_x, __lsx_vbsrl_v(sum_in_x, 8)); |
416 | | |
417 | | // Get back to [0,255] by dividing by maximum weight 16x16 = 256. |
418 | | sum = __lsx_vsrli_h(sum, 8); |
419 | | |
420 | | if (s.fAlphaScale < 256) { |
421 | | // Scale by alpha, which is in [0,256]. |
422 | | sum = __lsx_vmul_h(sum, __lsx_vreplgr2vr_h(s.fAlphaScale)); |
423 | | sum = __lsx_vsrli_h(sum, 8); |
424 | | } |
425 | | |
426 | | // Pack back into 8-bit values and store. |
427 | | *colors++ = __lsx_vpickve2gr_w(__lsx_vpickev_b(__lsx_vldi(0), |
428 | | __lsx_vsat_hu(sum, 8)), 0); |
429 | | } |
430 | | } |
431 | | |
432 | | #else |
433 | | |
434 | | // The NEON code only actually differs from the portable code in the |
435 | | // filtering step after we've loaded all four pixels we want to bilerp. |
436 | | |
437 | | #if defined(SK_ARM_HAS_NEON) |
// NEON bilerp of one pixel.  x weights the a01/a11 column and y weights the a10/a11 row, each
// against 16 minus itself; callers in this file pass the decoded 4-bit wx and wy.
// The blended pixel, additionally multiplied by scale/256 when scale < 256, is stored to *dst.
438 | | static void filter_and_scale_by_alpha(unsigned x, unsigned y, |
439 | | SkPMColor a00, SkPMColor a01, |
440 | | SkPMColor a10, SkPMColor a11, |
441 | | SkPMColor *dst, |
442 | | uint16_t scale) { |
443 | | uint8x8_t vy, vconst16_8, v16_y, vres; |
444 | | uint16x4_t vx, vconst16_16, v16_x, tmp, vscale; |
445 | | uint32x2_t va0, va1; |
446 | | uint16x8_t tmp1, tmp2; |
447 | | |
448 | | vy = vdup_n_u8(y); // duplicate y into vy |
449 | | vconst16_8 = vmov_n_u8(16); // set up constant in vconst16_8 |
450 | | v16_y = vsub_u8(vconst16_8, vy); // v16_y = 16-y |
451 | | |
452 | | va0 = vdup_n_u32(a00); // duplicate a00 |
453 | | va1 = vdup_n_u32(a10); // duplicate a10 |
454 | | va0 = vset_lane_u32(a01, va0, 1); // set top to a01 |
455 | | va1 = vset_lane_u32(a11, va1, 1); // set top to a11 |
456 | | |
457 | | tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y) |
458 | | tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy); // tmp2 = [a11|a10] * y |
459 | | |
460 | | vx = vdup_n_u16(x); // duplicate x into vx |
461 | | vconst16_16 = vmov_n_u16(16); // set up constant in vconst16_16 |
462 | | v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x |
463 | | |
464 | | tmp = vmul_u16(vget_high_u16(tmp1), vx); // tmp = a01 * (16-y) * x |
465 | | tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx); // tmp += a11 * y * x |
466 | | tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-y) * (16-x) |
467 | | tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * y * (16-x) |
468 | | |
469 | | if (scale < 256) { |
470 | | vscale = vdup_n_u16(scale); // duplicate scale |
471 | | tmp = vshr_n_u16(tmp, 8); // divide by the total weight 16x16 = 256 |
472 | | tmp = vmul_u16(tmp, vscale); // multiply result by scale |
473 | | } |
474 | | |
475 | | vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16((uint64_t)0)), 8); // shift down by 8 and narrow to 8 bits per channel |
476 | | vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); // store result |
477 | | } |
478 | | #else |
// Portable bilerp of one pixel.  x weights the a01/a11 column and y weights the a10/a11 row,
// both asserted to be in [0,15].  The blend is multiplied by alphaScale/256 when
// alphaScale < 256, and the result is stored to *dstColor.
// Two 8-bit channels are processed per multiply: mask 0xFF00FF keeps them 16 bits apart.
479 | | static void filter_and_scale_by_alpha(unsigned x, unsigned y, |
480 | | SkPMColor a00, SkPMColor a01, |
481 | | SkPMColor a10, SkPMColor a11, |
482 | | SkPMColor* dstColor, |
483 | | unsigned alphaScale) { |
484 | | SkASSERT((unsigned)x <= 0xF); |
485 | | SkASSERT((unsigned)y <= 0xF); |
486 | | SkASSERT(alphaScale <= 256); |
487 | | |
488 | | int xy = x * y; |
489 | | const uint32_t mask = 0xFF00FF; |
490 | | |
// Weight of a00 is (16-x)*(16-y), expanded.  lo carries bytes 0 and 2, hi carries bytes 1 and 3.
491 | | int scale = 256 - 16*y - 16*x + xy; |
492 | | uint32_t lo = (a00 & mask) * scale; |
493 | | uint32_t hi = ((a00 >> 8) & mask) * scale; |
494 | | |
// Weight of a01 is x*(16-y).
495 | | scale = 16*x - xy; |
496 | | lo += (a01 & mask) * scale; |
497 | | hi += ((a01 >> 8) & mask) * scale; |
498 | | |
// Weight of a10 is (16-x)*y.
499 | | scale = 16*y - xy; |
500 | | lo += (a10 & mask) * scale; |
501 | | hi += ((a10 >> 8) & mask) * scale; |
502 | | |
// Weight of a11 is x*y.  The four weights sum to 256, so each channel sum fits in 16 bits.
503 | | lo += (a11 & mask) * xy; |
504 | | hi += ((a11 >> 8) & mask) * xy; |
505 | | |
506 | | if (alphaScale < 256) { |
507 | | lo = ((lo >> 8) & mask) * alphaScale; |
508 | | hi = ((hi >> 8) & mask) * alphaScale; |
509 | | } |
510 | | |
// Divide by 256 and re-interleave: lo's results shift down into bytes 0 and 2, while hi's
// results already sit in bytes 1 and 3.
511 | | *dstColor = ((lo >> 8) & mask) | (hi & ~mask); |
512 | | } |
513 | | #endif |
514 | | |
515 | | |
// Fallback version used when none of the SSE/LSX branches above is selected.
// xy[0] packs the row pair (y0, y1) and the Y weight, decoded once; each following word packs
// (x0, x1) and an X weight.  Every output pixel hands its 2x2 neighborhood to
// filter_and_scale_by_alpha(), which writes the filtered, alpha-scaled pixel into colors.
516 | | /*not static*/ inline |
517 | | void S32_alpha_D32_filter_DX(const SkBitmapProcState& s, |
518 | | const uint32_t* xy, int count, SkPMColor* colors) { |
519 | | SkASSERT(count > 0 && colors != nullptr); |
520 | | SkASSERT(s.fBilerp); |
521 | | SkASSERT(4 == s.fPixmap.info().bytesPerPixel()); |
522 | | SkASSERT(s.fAlphaScale <= 256); |
523 | | |
524 | | int y0, y1, wy; |
525 | | decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy); |
526 | | |
527 | | auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ), |
528 | | row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() ); |
529 | | |
530 | | while (count --> 0) { |
531 | | int x0, x1, wx; |
532 | | decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx); |
533 | | |
534 | | filter_and_scale_by_alpha(wx, wy, |
535 | | row0[x0], row0[x1], |
536 | | row1[x0], row1[x1], |
537 | | colors++, |
538 | | s.fAlphaScale); |
539 | | } |
540 | | } |
541 | | |
542 | | #endif |
543 | | |
544 | | #if defined(SK_ARM_HAS_NEON) |
// NEON-only DXDY proc.  Unlike the _DX procs, every output pixel consumes two packed words from
// xy: first (y0, y1, wy), then (x0, x1, wx).  The two source rows are therefore looked up again
// for each pixel before filter_and_scale_by_alpha() writes the filtered pixel into colors.
545 | | /*not static*/ inline |
546 | | void S32_alpha_D32_filter_DXDY(const SkBitmapProcState& s, |
547 | | const uint32_t* xy, int count, SkPMColor* colors) { |
548 | | SkASSERT(count > 0 && colors != nullptr); |
549 | | SkASSERT(s.fBilerp); |
550 | | SkASSERT(4 == s.fPixmap.info().bytesPerPixel()); |
551 | | SkASSERT(s.fAlphaScale <= 256); |
552 | | |
553 | | auto src = (const char*)s.fPixmap.addr(); |
554 | | size_t rb = s.fPixmap.rowBytes(); |
555 | | |
556 | | while (count --> 0) { |
557 | | int y0, y1, wy, |
558 | | x0, x1, wx; |
559 | | decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy); |
560 | | decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx); |
561 | | |
562 | | auto row0 = (const uint32_t*)(src + y0*rb), |
563 | | row1 = (const uint32_t*)(src + y1*rb); |
564 | | |
565 | | filter_and_scale_by_alpha(wx, wy, |
566 | | row0[x0], row0[x1], |
567 | | row1[x0], row1[x1], |
568 | | colors++, |
569 | | s.fAlphaScale); |
570 | | } |
571 | | } |
572 | | #else |
573 | | // Without NEON there is no specialized DXDY proc, so this is a null function pointer; it's not yet clear whether it's worthwhile specializing for other architectures. |
574 | | constexpr static void (*S32_alpha_D32_filter_DXDY)(const SkBitmapProcState&, |
575 | | const uint32_t*, int, SkPMColor*) = nullptr; |
576 | | #endif |
577 | | |
578 | | } // namespace SK_OPTS_NS |
579 | | |
// Forwarding wrapper around the static SK_OPTS_NS::decode_packed_coordinates_and_weight() helper,
// reachable as sktests::decode_packed_coordinates_and_weight(); presumably for unit tests, going by
// the namespace name (confirm at the call sites).
580 | | namespace sktests { |
581 | | template <typename U32, typename Out> |
582 | | void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) { |
583 | | SK_OPTS_NS::decode_packed_coordinates_and_weight<U32, Out>(packed, v0, v1, w); |
584 | | } |
585 | | } |
586 | | |
587 | | #endif |