Coverage Report

Created: 2021-08-22 09:07

/src/skia/src/opts/SkBitmapProcState_opts.h
Line   Count  Source
   1          /*
   2           * Copyright 2018 Google Inc.
   3           *
   4           * Use of this source code is governed by a BSD-style license that can be
   5           * found in the LICENSE file.
   6           */
   7
   8          #ifndef SkBitmapProcState_opts_DEFINED
   9          #define SkBitmapProcState_opts_DEFINED
  10
  11          #include "include/private/SkVx.h"
  12          #include "src/core/SkBitmapProcState.h"
  13          #include "src/core/SkMSAN.h"
  14
  15          // SkBitmapProcState optimized Shader, Sample, or Matrix procs.
  16          //
  17          // Only S32_alpha_D32_filter_DX exploits instructions beyond
  18          // our common baseline SSE2/NEON instruction sets, so that's
  19          // all that lives here.
  20          //
  21          // The rest are scattershot at the moment but I want to get them
  22          // all migrated to be normal code inside SkBitmapProcState.cpp.
  23
  24          #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
  25              #include <immintrin.h>
  26          #elif defined(SK_ARM_HAS_NEON)
  27              #include <arm_neon.h>
  28          #endif
  29
  30          namespace SK_OPTS_NS {
  31
  32          // This same basic packing scheme is used throughout the file.
  33          template <typename U32, typename Out>
  34   5.38k  static void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) {
  35   5.38k      *v0 = (packed >> 18);       // Integer coordinate x0 or y0.
  36   5.38k      *v1 = (packed & 0x3fff);    // Integer coordinate x1 or y1.
  37   5.38k      *w  = (packed >> 14) & 0xf; // Lerp weight for v1; weight for v0 is 16-w.
  38   5.38k  }
              SkOpts.cpp:void sse2::decode_packed_coordinates_and_weight<unsigned int, int>(unsigned int, int*, int*, int*) (5.38k; per-line counts identical to the template above)
              Unexecuted instantiation: SkOpts_hsw.cpp:void hsw::decode_packed_coordinates_and_weight<unsigned int, int>(unsigned int, int*, int*, int*)
              Unexecuted instantiation: SkOpts_hsw.cpp:void hsw::decode_packed_coordinates_and_weight<skvx::Vec<8, unsigned int>, skvx::Vec<8, unsigned int> >(skvx::Vec<8, unsigned int>, skvx::Vec<8, unsigned int>*, skvx::Vec<8, unsigned int>*, skvx::Vec<8, unsigned int>*)
              Unexecuted instantiation: SkOpts_ssse3.cpp:void ssse3::decode_packed_coordinates_and_weight<unsigned int, int>(unsigned int, int*, int*, int*)
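
              // A worked example of the packing decoded above (an editorial sketch; the
              // encoder shown is hypothetical, not part of Skia).  The 32-bit layout is
              //     [ v0 : 14 bits | w : 4 bits | v1 : 14 bits ]
              // so packing x0=3, w=5, x1=4 gives
              //     uint32_t packed = 3u << 18 | 5u << 14 | 4u;   // == 0x000D4004
              // and the decode above returns v0 == 3, w == 5, v1 == 4, with v0 weighted
              // by 16-5 == 11 and v1 by 5.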
  39
  40          #if 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
  41              /*not static*/ inline
  42              void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
  43       0                                   const uint32_t* xy, int count, uint32_t* colors) {
  44       0          SkASSERT(count > 0 && colors != nullptr);
  45       0          SkASSERT(s.fBilerp);
  46       0          SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
  47       0          SkASSERT(s.fAlphaScale <= 256);
  48
  49                  // In a _DX variant only X varies; all samples share the y0/y1 coordinates and the wy weight.
  50       0          int y0, y1, wy;
  51       0          decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
  52
  53       0          const uint32_t* row0 = s.fPixmap.addr32(0,y0);
  54       0          const uint32_t* row1 = s.fPixmap.addr32(0,y1);
  55
  56       0          auto bilerp = [&](skvx::Vec<8,uint32_t> packed_x_coordinates) -> skvx::Vec<8,uint32_t> {
  57                      // Decode up to 8 output pixels' x-coordinates and weights.
  58       0              skvx::Vec<8,uint32_t> x0,x1,wx;
  59       0              decode_packed_coordinates_and_weight(packed_x_coordinates, &x0, &x1, &wx);
  60
  61                      // Splat wx to each color channel.
  62       0              wx = (wx <<  0)
  63       0                 | (wx <<  8)
  64       0                 | (wx << 16)
  65       0                 | (wx << 24);
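                          // (For example: a decoded weight wx of 5 in a 32-bit lane becomes
                          //  0x05050505 here, one copy of the weight byte lined up with each
                          //  of the four 8-bit color channels.)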
  66
  67       0              auto gather = [](const uint32_t* ptr, skvx::Vec<8,uint32_t> ix) {
  68       0              #if 1
  69                          // Drop into AVX2 intrinsics for vpgatherdd.
  70       0                  return skvx::bit_pun<skvx::Vec<8,uint32_t>>(
  71       0                          _mm256_i32gather_epi32((const int*)ptr, skvx::bit_pun<__m256i>(ix), 4));
  72                      #else
  73                          // Portable version... sometimes I don't trust vpgatherdd.
  74                          return skvx::Vec<8,uint32_t>{
  75                              ptr[ix[0]], ptr[ix[1]], ptr[ix[2]], ptr[ix[3]],
  76                              ptr[ix[4]], ptr[ix[5]], ptr[ix[6]], ptr[ix[7]],
  77                          };
  78                      #endif
  79       0              };
  80
  81                      // Gather the 32 32-bit pixels that we'll bilerp into our 8 output pixels.
  82       0              skvx::Vec<8,uint32_t> tl = gather(row0, x0), tr = gather(row0, x1),
  83       0                                    bl = gather(row1, x0), br = gather(row1, x1);
  84
  85       0          #if 1
  86                      // We'll use _mm256_maddubs_epi16() to lerp much like in the SSSE3 code.
  87       0              auto lerp_x = [&](skvx::Vec<8,uint32_t> L, skvx::Vec<8,uint32_t> R) {
  88       0                  __m256i l = skvx::bit_pun<__m256i>(L),
  89       0                          r = skvx::bit_pun<__m256i>(R),
  90       0                         wr = skvx::bit_pun<__m256i>(wx),
  91       0                         wl = _mm256_sub_epi8(_mm256_set1_epi8(16), wr);
  92
  93                          // Interlace l,r bytewise and line them up with their weights, then lerp.
  94       0                  __m256i lo = _mm256_maddubs_epi16(_mm256_unpacklo_epi8( l, r),
  95       0                                                    _mm256_unpacklo_epi8(wl,wr));
  96       0                  __m256i hi = _mm256_maddubs_epi16(_mm256_unpackhi_epi8( l, r),
  97       0                                                    _mm256_unpackhi_epi8(wl,wr));
  98
  99                          // Those _mm256_unpack??_epi8() calls left us in a bit of an odd order:
 100                          //
 101                          //    if   l = a b c d | e f g h
 102                          //   and   r = A B C D | E F G H
 103                          //
 104                          // then   lo = a A b B | e E f F   (low  half of each input)
 105                          //  and   hi = c C d D | g G h H   (high half of each input)
 106                          //
 107                          // To get everything back in original order we need to transpose that.
 108       0                  __m256i abcd = _mm256_permute2x128_si256(lo, hi, 0x20),
 109       0                          efgh = _mm256_permute2x128_si256(lo, hi, 0x31);
 110
 111       0                  return skvx::join(skvx::bit_pun<skvx::Vec<16,uint16_t>>(abcd),
 112       0                                    skvx::bit_pun<skvx::Vec<16,uint16_t>>(efgh));
 113       0              };
 114
 115       0              skvx::Vec<32, uint16_t> top = lerp_x(tl, tr),
 116       0                                      bot = lerp_x(bl, br),
 117       0                                      sum = 16*top + (bot-top)*wy;
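                          // (This uses the same identity as the SSE2 and SSSE3 paths below:
                          //  top*(16-wy) + bot*wy == 16*top + (bot-top)*wy, which saves a
                          //  multiply.)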
 118          #else
 119              // Treat 32-bit pixels as 4 8-bit values, and expand to 16-bit for room to multiply.
 120              auto to_16x4 = [](auto v) -> skvx::Vec<32, uint16_t> {
 121                  return skvx::cast<uint16_t>(skvx::bit_pun<skvx::Vec<32, uint8_t>>(v));
 122              };
 123
 124              // Sum up weighted sample pixels.  The naive, redundant math would be,
 125              //
 126              //   sum = tl * (16-wy) * (16-wx)
 127              //       + bl * (   wy) * (16-wx)
 128              //       + tr * (16-wy) * (   wx)
 129              //       + br * (   wy) * (   wx)
 130              //
 131              // But we refactor to eliminate a bunch of those common factors.
 132              auto lerp = [](auto lo, auto hi, auto w) {
 133                  return 16*lo + (hi-lo)*w;
 134              };
 135              skvx::Vec<32, uint16_t> sum = lerp(lerp(to_16x4(tl), to_16x4(bl), wy),
 136                                                 lerp(to_16x4(tr), to_16x4(br), wy), to_16x4(wx));
 137          #endif
 138
 139                      // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
 140       0              sum >>= 8;
 141
 142                      // Scale by alpha if needed.
 143       0              if (s.fAlphaScale < 256) {
 144       0                  sum *= s.fAlphaScale;
 145       0                  sum >>= 8;
 146       0              }
 147
 148                      // Pack back to 8-bit channels, undoing to_16x4().
 149       0              return skvx::bit_pun<skvx::Vec<8,uint32_t>>(skvx::cast<uint8_t>(sum));
 150       0          };
 151
 152       0          while (count >= 8) {
 153       0              bilerp(skvx::Vec<8,uint32_t>::Load(xy)).store(colors);
 154       0              xy     += 8;
 155       0              colors += 8;
 156       0              count  -= 8;
 157       0          }
 158       0          if (count > 0) {
 159       0              __m256i active = skvx::bit_pun<__m256i>( count > skvx::Vec<8,int>{0,1,2,3, 4,5,6,7} ),
 160       0                      coords = _mm256_maskload_epi32((const int*)xy, active),
 161       0                      pixels;
 162
 163       0              bilerp(skvx::bit_pun<skvx::Vec<8,uint32_t>>(coords)).store(&pixels);
 164       0              _mm256_maskstore_epi32((int*)colors, active, pixels);
 165
 166       0              sk_msan_mark_initialized(colors, colors+count,
 167       0                                       "MSAN still doesn't understand AVX2 mask loads and stores.");
 168       0          }
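                      // (A worked example of the masking: for a tail of count == 3, the
                      //  compare count > {0,1,2,3,4,5,6,7} is true in lanes 0-2 only, so
                      //  active is all-ones in its first three 32-bit lanes and the masked
                      //  load and store touch exactly those three pixels in memory.)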
 169       0      }
              Unexecuted instantiation: hsw::S32_alpha_D32_filter_DX(SkBitmapProcState const&, unsigned int const*, int, unsigned int*)
 170
 171          #elif 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
 172
 173              /*not static*/ inline
 174              void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
 175       0                                   const uint32_t* xy, int count, uint32_t* colors) {
 176       0          SkASSERT(count > 0 && colors != nullptr);
 177       0          SkASSERT(s.fBilerp);
 178       0          SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
 179       0          SkASSERT(s.fAlphaScale <= 256);
 180
 181                  // interpolate_in_x() is the crux of the SSSE3 implementation,
 182                  // interpolating in X for up to two output pixels (A and B) using _mm_maddubs_epi16().
 183       0          auto interpolate_in_x = [](uint32_t A0, uint32_t A1,
 184       0                                     uint32_t B0, uint32_t B1,
 185       0                                     __m128i interlaced_x_weights) {
 186                      // _mm_maddubs_epi16() is a little idiosyncratic, but great as the core of a lerp.
 187                      //
 188                      // It takes two arguments interlaced byte-wise:
 189                      //    - first  arg: [ l,r, ... 7 more pairs of unsigned 8-bit values ...]
 190                      //    - second arg: [ w,W, ... 7 more pairs of   signed 8-bit values ...]
 191                      // and returns 8 signed 16-bit values: [ l*w + r*W, ... 7 more ... ].
 192                      //
 193                      // That's why we go to all this trouble to make interlaced_x_weights,
 194                      // and here we're about to interlace A0 with A1 and B0 with B1 to match.
 195                      //
 196                      // Our interlaced_x_weights are all in [0,16], and so we need not worry about
 197                      // the signedness of that input nor about the signedness of the output.
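                          //
                          // As a scalar model (an editorial sketch, not Skia code), 16-bit lane i
                          // of the result is
                          //     (uint8_t)a[2i] * (int8_t)b[2i] + (uint8_t)a[2i+1] * (int8_t)b[2i+1]
                          // with signed saturation.  The saturation can never fire here: the worst
                          // case is 255*16 + 255*0 == 4080, well under 32767.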
 198
 199       0              __m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)),
 200       0                      interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1));
 201
 202       0              return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B),
 203       0                                       interlaced_x_weights);
 204       0          };
 205
 206                  // Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B.
 207                  // Returns two pixels, with each color channel in a 16-bit lane of the __m128i.
 208       0          auto interpolate_in_x_and_y = [&](uint32_t A0, uint32_t A1,
 209       0                                            uint32_t A2, uint32_t A3,
 210       0                                            uint32_t B0, uint32_t B1,
 211       0                                            uint32_t B2, uint32_t B3,
 212       0                                            __m128i interlaced_x_weights,
 213       0                                            int wy) {
 214                      // Interpolate each row in X, leaving 16-bit lanes scaled by interlaced_x_weights.
 215       0              __m128i top = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights),
 216       0                      bot = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);
 217
 218                      // Interpolate in Y.  As in the SSE2 code, we calculate top*(16-wy) + bot*wy
 219                      // as 16*top + (bot-top)*wy to save a multiply.
 220       0              __m128i px = _mm_add_epi16(_mm_slli_epi16(top, 4),
 221       0                                         _mm_mullo_epi16(_mm_sub_epi16(bot, top),
 222       0                                                         _mm_set1_epi16(wy)));
 223
 224                      // Scale down by total max weight 16x16 = 256.
 225       0              px = _mm_srli_epi16(px, 8);
 226
 227                      // Scale by alpha if needed.
 228       0              if (s.fAlphaScale < 256) {
 229       0                  px = _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(s.fAlphaScale)), 8);
 230       0              }
 231       0              return px;
 232       0          };
 233
 234                  // We're in _DX mode here, so we're only varying in X.
 235                  // That means the first entry of xy is our constant pair of Y coordinates and weight in Y.
 236                  // All the other entries in xy will be pairs of X coordinates and the X weight.
 237       0          int y0, y1, wy;
 238       0          decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
 239
 240       0          auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),
 241       0               row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());
 242
 243       0          while (count >= 4) {
 244                      // We can really get going, loading 4 X-pairs at a time to produce 4 output pixels.
 245       0              int x0[4],
 246       0                  x1[4];
 247       0              __m128i wx;
 248
 249                      // decode_packed_coordinates_and_weight(), 4x.
 250       0              __m128i packed = _mm_loadu_si128((const __m128i*)xy);
 251       0              _mm_storeu_si128((__m128i*)x0, _mm_srli_epi32(packed, 18));
 252       0              _mm_storeu_si128((__m128i*)x1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff)));
 253       0              wx = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf));  // [0,15]
 254
 255                      // Splat each x weight 4x (for each color channel) as wr for pixels on the right at x1,
 256                      // and sixteen minus that as wl for pixels on the left at x0.
 257       0              __m128i wr = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)),
 258       0                      wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);
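                          // (Concretely: after the decode above, each 32-bit lane of wx holds one
                          //  weight in its low byte, e.g. bytes [5,0,0,0, 9,0,0,0, 2,0,0,0, 7,0,0,0];
                          //  the byte shuffle broadcasts each of those across its lane, giving
                          //  wr = [5,5,5,5, 9,9,9,9, 2,2,2,2, 7,7,7,7] and wl = 16 minus each byte.)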
 259
 260                      // We need to interlace wl and wr for _mm_maddubs_epi16().
 261       0              __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wl,wr),
 262       0                      interlaced_x_weights_CD = _mm_unpackhi_epi8(wl,wr);
 263
 264       0              enum { A,B,C,D };
 265
 266                      // interpolate_in_x_and_y() can produce two output pixels (A and B) at a time
 267                      // from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each.
 268       0              __m128i AB = interpolate_in_x_and_y(row0[x0[A]], row0[x1[A]],
 269       0                                                  row1[x0[A]], row1[x1[A]],
 270       0                                                  row0[x0[B]], row0[x1[B]],
 271       0                                                  row1[x0[B]], row1[x1[B]],
 272       0                                                  interlaced_x_weights_AB, wy);
 273
 274                      // Once more with the other half of the x-weights for two more pixels C,D.
 275       0              __m128i CD = interpolate_in_x_and_y(row0[x0[C]], row0[x1[C]],
 276       0                                                  row1[x0[C]], row1[x1[C]],
 277       0                                                  row0[x0[D]], row0[x1[D]],
 278       0                                                  row1[x0[D]], row1[x1[D]],
 279       0                                                  interlaced_x_weights_CD, wy);
 280
 281                      // Scale by alpha, pack back together to 8-bit lanes, and write out four pixels!
 282       0              _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(AB, CD));
 283       0              xy     += 4;
 284       0              colors += 4;
 285       0              count  -= 4;
 286       0          }
 287
 288       0          while (count --> 0) {
 289                      // This is exactly the same flow as the count >= 4 loop above, but writing one pixel.
 290       0              int x0, x1, wx;
 291       0              decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
 292
 293                      // As above, splat out wx four times as wr, and sixteen minus that as wl.
 294       0              __m128i wr = _mm_set1_epi8(wx),     // This splats it out 16 times, but that's fine.
 295       0                      wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);
 296
 297       0              __m128i interlaced_x_weights = _mm_unpacklo_epi8(wl, wr);
 298
 299       0              __m128i A = interpolate_in_x_and_y(row0[x0], row0[x1],
 300       0                                                 row1[x0], row1[x1],
 301       0                                                        0,        0,
 302       0                                                        0,        0,
 303       0                                                 interlaced_x_weights, wy);
 304
 305       0              *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(A, _mm_setzero_si128()));
 306       0          }
 307       0      }
              Unexecuted instantiation: ssse3::S32_alpha_D32_filter_DX(SkBitmapProcState const&, unsigned int const*, int, unsigned int*)
 308
 309
 310          #elif 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
 311
 312              /*not static*/ inline
 313              void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
 314   2.69k                                   const uint32_t* xy, int count, uint32_t* colors) {
 315   2.69k          SkASSERT(count > 0 && colors != nullptr);
 316   2.69k          SkASSERT(s.fBilerp);
 317   2.69k          SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
 318   2.69k          SkASSERT(s.fAlphaScale <= 256);
 319
 320   2.69k          int y0, y1, wy;
 321   2.69k          decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
 322
 323   2.69k          auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
 324   2.69k               row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
 325
 326                  // We'll put one pixel in the low 4 16-bit lanes to line up with wy,
 327                  // and another in the upper 4 16-bit lanes to line up with 16 - wy.
 328   2.69k          const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16(   wy),   // Bottom pixel goes here.
 329   2.69k                                                  _mm_set1_epi16(16-wy));  // Top pixel goes here.
 330
 331   5.38k          while (count --> 0) {
 332   2.69k              int x0, x1, wx;
 333   2.69k              decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
 334
 335                      // Load the 4 pixels we're interpolating, in this grid:
 336                      //    | tl  tr |
 337                      //    | bl  br |
 338   2.69k              const __m128i tl = _mm_cvtsi32_si128(row0[x0]), tr = _mm_cvtsi32_si128(row0[x1]),
 339   2.69k                            bl = _mm_cvtsi32_si128(row1[x0]), br = _mm_cvtsi32_si128(row1[x1]);
 340
 341                      // We want to calculate a sum of 4 pixels weighted in two directions:
 342                      //
 343                      //  sum = tl * (16-wy) * (16-wx)
 344                      //      + bl * (   wy) * (16-wx)
 345                      //      + tr * (16-wy) * (   wx)
 346                      //      + br * (   wy) * (   wx)
 347                      //
 348                      // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
 349                      //
 350                      // We've already prepared allY as a vector containing [wy, 16-wy] as a way
 351                      // to apply those y-direction weights.  So we'll start on the x-direction
 352                      // first, grouping into left and right halves, lined up with allY:
 353                      //
 354                      //     L = [bl, tl]
 355                      //     R = [br, tr]
 356                      //
 357                      //   sum = horizontalSum( allY * (L*(16-wx) + R*wx) )
 358                      //
 359                      // Rewriting that one more step, we can replace a multiply with a shift:
 360                      //
 361                      //   sum = horizontalSum( allY * (16*L + (R-L)*wx) )
 362                      //
 363                      // That's how we'll actually do this math.
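                          //
                          // The same math for a single 8-bit channel in plain C++ (an editorial
                          // sketch under the same weights-in-[0,16] setup, not Skia's code):
                          //
                          //     static uint8_t bilerp_channel(uint8_t tl, uint8_t tr,
                          //                                   uint8_t bl, uint8_t br,
                          //                                   int wx, int wy) {
                          //         int top = 16*tl + (tr - tl)*wx;   // lerp across the top row
                          //         int bot = 16*bl + (br - bl)*wx;   // lerp across the bottom row
                          //         int sum = (16-wy)*top + wy*bot;   // lerp between the rows
                          //         return (uint8_t)(sum >> 8);       // total weight 16*16 == 256
                          //     }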
 364
 365   2.69k              __m128i L = _mm_unpacklo_epi8(_mm_unpacklo_epi32(bl, tl), _mm_setzero_si128()),
 366   2.69k                      R = _mm_unpacklo_epi8(_mm_unpacklo_epi32(br, tr), _mm_setzero_si128());
 367
 368   2.69k              __m128i inner = _mm_add_epi16(_mm_slli_epi16(L, 4),
 369   2.69k                                            _mm_mullo_epi16(_mm_sub_epi16(R,L), _mm_set1_epi16(wx)));
 370
 371   2.69k              __m128i sum_in_x = _mm_mullo_epi16(inner, allY);
 372
 373                      // sum = horizontalSum( ... )
 374   2.69k              __m128i sum = _mm_add_epi16(sum_in_x, _mm_srli_si128(sum_in_x, 8));
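                          // (At this point the low four 16-bit lanes hold wy * the bottom-row
                          //  lerp and the high four hold (16-wy) * the top-row lerp; shifting
                          //  right by 8 bytes lines the halves up so this one add finishes the
                          //  weighted sum for each channel.)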
 375
 376                      // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
 377   2.69k              sum = _mm_srli_epi16(sum, 8);
 378
 379   2.69k              if (s.fAlphaScale < 256) {
 380                          // Scale by alpha, which is in [0,256].
 381       0                  sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale));
 382       0                  sum = _mm_srli_epi16(sum, 8);
 383       0              }
 384
 385                      // Pack back into 8-bit values and store.
 386   2.69k              *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128()));
 387   2.69k          }
 388   2.69k      }
 389
 390          #else
 391
 392              // The NEON code only actually differs from the portable code in the
 393              // filtering step after we've loaded all four pixels we want to bilerp.
 394
 395              #if defined(SK_ARM_HAS_NEON)
 396                  static void filter_and_scale_by_alpha(unsigned x, unsigned y,
 397                                                        SkPMColor a00, SkPMColor a01,
 398                                                        SkPMColor a10, SkPMColor a11,
 399                                                        SkPMColor *dst,
 400                                                        uint16_t scale) {
 401                      uint8x8_t vy, vconst16_8, v16_y, vres;
 402                      uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
 403                      uint32x2_t va0, va1;
 404                      uint16x8_t tmp1, tmp2;
 405
 406                      vy = vdup_n_u8(y);                // duplicate y into vy
 407                      vconst16_8 = vmov_n_u8(16);       // set up constant in vconst16_8
 408                      v16_y = vsub_u8(vconst16_8, vy);  // v16_y = 16-y
 409
 410                      va0 = vdup_n_u32(a00);            // duplicate a00
 411                      va1 = vdup_n_u32(a10);            // duplicate a10
 412                      va0 = vset_lane_u32(a01, va0, 1); // set top to a01
 413                      va1 = vset_lane_u32(a11, va1, 1); // set top to a11
 414
 415                      tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
 416                      tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);    // tmp2 = [a11|a10] * y
 417
 418                      vx = vdup_n_u16(x);                // duplicate x into vx
 419                      vconst16_16 = vmov_n_u16(16);      // set up constant in vconst16_16
 420                      v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x
 421
 422                      tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp  = a01 * x
 423                      tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += a11 * x
 424                      tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x)
 425                      tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x)
 426
 427                      if (scale < 256) {
 428                          vscale = vdup_n_u16(scale);        // duplicate scale
 429                          tmp = vshr_n_u16(tmp, 8);          // shift down result by 8
 430                          tmp = vmul_u16(tmp, vscale);       // multiply result by scale
 431                      }
 432
 433                      vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16((uint64_t)0)), 8); // shift down result by 8
 434                      vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);                   // store result
 435                  }
 436              #else
 437                  static void filter_and_scale_by_alpha(unsigned x, unsigned y,
 438                                                        SkPMColor a00, SkPMColor a01,
 439                                                        SkPMColor a10, SkPMColor a11,
 440                                                        SkPMColor* dstColor,
 441                                                        unsigned alphaScale) {
 442                      SkASSERT((unsigned)x <= 0xF);
 443                      SkASSERT((unsigned)y <= 0xF);
 444                      SkASSERT(alphaScale <= 256);
 445
 446                      int xy = x * y;
 447                      const uint32_t mask = 0xFF00FF;
 448
 449                      int scale = 256 - 16*y - 16*x + xy;
 450                      uint32_t lo = (a00 & mask) * scale;
 451                      uint32_t hi = ((a00 >> 8) & mask) * scale;
 452
 453                      scale = 16*x - xy;
 454                      lo += (a01 & mask) * scale;
 455                      hi += ((a01 >> 8) & mask) * scale;
 456
 457                      scale = 16*y - xy;
 458                      lo += (a10 & mask) * scale;
 459                      hi += ((a10 >> 8) & mask) * scale;
 460
 461                      lo += (a11 & mask) * xy;
 462                      hi += ((a11 >> 8) & mask) * xy;
 463
 464                      if (alphaScale < 256) {
 465                          lo = ((lo >> 8) & mask) * alphaScale;
 466                          hi = ((hi >> 8) & mask) * alphaScale;
 467                      }
 468
 469                      *dstColor = ((lo >> 8) & mask) | (hi & ~mask);
 470                  }
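
                      // (Why the 0xFF00FF trick works: the mask keeps two of the four 8-bit
                      //  channels, in bits 0-7 and 16-23, so one 32-bit multiply scales both
                      //  at once while hi carries the other two.  Each weighted sum stays
                      //  below 255*256 < 2^16, so paired channels never overflow into each
                      //  other.  The corner weights are just the expansion of
                      //      (16-x)*(16-y) == 256 - 16*x - 16*y + x*y,
                      //  plus x*(16-y) == 16*x - x*y, y*(16-x) == 16*y - x*y, and x*y;
                      //  they sum to 256, which the final >> 8 divides back out.)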
 471              #endif
 472
 473
 474              /*not static*/ inline
 475              void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
 476                                           const uint32_t* xy, int count, SkPMColor* colors) {
 477                  SkASSERT(count > 0 && colors != nullptr);
 478                  SkASSERT(s.fBilerp);
 479                  SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
 480                  SkASSERT(s.fAlphaScale <= 256);
 481
 482                  int y0, y1, wy;
 483                  decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
 484
 485                  auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
 486                       row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
 487
 488                  while (count --> 0) {
 489                      int x0, x1, wx;
 490                      decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
 491
 492                      filter_and_scale_by_alpha(wx, wy,
 493                                                row0[x0], row0[x1],
 494                                                row1[x0], row1[x1],
 495                                                colors++,
 496                                                s.fAlphaScale);
 497                  }
 498              }
 499
 500          #endif
 501
 502          #if defined(SK_ARM_HAS_NEON)
 503              /*not static*/ inline
 504              void S32_alpha_D32_filter_DXDY(const SkBitmapProcState& s,
 505                                             const uint32_t* xy, int count, SkPMColor* colors) {
 506                  SkASSERT(count > 0 && colors != nullptr);
 507                  SkASSERT(s.fBilerp);
 508                  SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
 509                  SkASSERT(s.fAlphaScale <= 256);
 510
 511                  auto src = (const char*)s.fPixmap.addr();
 512                  size_t rb = s.fPixmap.rowBytes();
 513
 514                  while (count --> 0) {
 515                      int y0, y1, wy,
 516                          x0, x1, wx;
 517                      decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
 518                      decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
 519
 520                      auto row0 = (const uint32_t*)(src + y0*rb),
 521                           row1 = (const uint32_t*)(src + y1*rb);
 522
 523                      filter_and_scale_by_alpha(wx, wy,
 524                                                row0[x0], row0[x1],
 525                                                row1[x0], row1[x1],
 526                                                colors++,
 527                                                s.fAlphaScale);
 528                  }
 529              }
 530          #else
 531              // It's not yet clear whether it's worthwhile specializing for SSE2/SSSE3/AVX2.
 532              constexpr static void (*S32_alpha_D32_filter_DXDY)(const SkBitmapProcState&,
 533                                                                 const uint32_t*, int, SkPMColor*) = nullptr;
 534          #endif
 535
 536          }  // namespace SK_OPTS_NS
 537
 538          #endif