Coverage Report

Created: 2024-09-14 07:19

/src/skia/src/opts/SkBitmapProcState_opts.h
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright 2018 Google Inc.
3
 *
4
 * Use of this source code is governed by a BSD-style license that can be
5
 * found in the LICENSE file.
6
 */
7
8
#ifndef SkBitmapProcState_opts_DEFINED
9
#define SkBitmapProcState_opts_DEFINED
10
11
#include "src/base/SkMSAN.h"
12
#include "src/base/SkVx.h"
13
#include "src/core/SkBitmapProcState.h"
14
15
// SkBitmapProcState optimized Shader, Sample, or Matrix procs.
16
//
17
// Only S32_alpha_D32_filter_DX exploits instructions beyond
18
// our common baseline SSE2/NEON instruction sets, so that's
19
// all that lives here.
20
//
21
// The rest are scattershot at the moment but I want to get them
22
// all migrated to be normal code inside SkBitmapProcState.cpp.
23
24
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
25
    #include <immintrin.h>
26
#elif defined(SK_ARM_HAS_NEON)
27
    #include <arm_neon.h>
28
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
29
    #include <lasxintrin.h>
30
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
31
    #include <lsxintrin.h>
32
#endif
33
34
namespace SK_OPTS_NS {
35
36
// This same basic packing scheme is used throughout the file.
37
// Splits one packed filter entry into its two integer coordinates and a lerp weight.
// Bit layout of `packed`, high to low:
//   [31:18]  first coordinate  (x0 or y0)             -> *v0
//   [17:14]  4-bit weight for the second coordinate   -> *w   (the first gets 16 - w)
//   [13: 0]  second coordinate (x1 or y1)             -> *v1
template <typename U32, typename Out>
static void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) {
    const auto first_coord  = packed >> 18;
    const auto second_coord = packed & 0x3fff;
    const auto second_wt    = (packed >> 14) & 0xf;

    *v0 = first_coord;
    *v1 = second_coord;
    *w  = second_wt;
}
SkBitmapProcState_opts.cpp:void sse2::decode_packed_coordinates_and_weight<unsigned int, int>(unsigned int, int*, int*, int*)
Line
Count
Source
38
2.82M
static void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) {
39
2.82M
    *v0 = (packed >> 18);       // Integer coordinate x0 or y0.
40
2.82M
    *v1 = (packed & 0x3fff);    // Integer coordinate x1 or y1.
41
2.82M
    *w  = (packed >> 14) & 0xf; // Lerp weight for v1; weight for v0 is 16-w.
42
2.82M
}
Unexecuted instantiation: SkBitmapProcState_opts_ssse3.cpp:void ssse3::decode_packed_coordinates_and_weight<unsigned int, int>(unsigned int, int*, int*, int*)
43
44
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
45
46
    /*not static*/ inline
47
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
48
0
                                 const uint32_t* xy, int count, uint32_t* colors) {
49
0
        SkASSERT(count > 0 && colors != nullptr);
50
0
        SkASSERT(s.fBilerp);
51
0
        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
52
0
        SkASSERT(s.fAlphaScale <= 256);
53
54
        // interpolate_in_x() is the crux of the SSSE3 implementation,
55
        // interpolating in X for up to two output pixels (A and B) using _mm_maddubs_epi16().
56
0
        auto interpolate_in_x = [](uint32_t A0, uint32_t A1,
57
0
                                   uint32_t B0, uint32_t B1,
58
0
                                   __m128i interlaced_x_weights) {
59
            // _mm_maddubs_epi16() is a little idiosyncratic, but great as the core of a lerp.
60
            //
61
            // It takes two arguments interlaced byte-wise:
62
            //    - first  arg: [ l,r, ... 7 more pairs of unsigned 8-bit values ...]
63
            //    - second arg: [ w,W, ... 7 more pairs of   signed 8-bit values ...]
64
            // and returns 8 signed 16-bit values: [ l*w + r*W, ... 7 more ... ].
65
            //
66
            // That's why we go to all this trouble to make interlaced_x_weights,
67
            // and here we're about to interlace A0 with A1 and B0 with B1 to match.
68
            //
69
            // Our interlaced_x_weights are all in [0,16], and so we need not worry about
70
            // the signedness of that input nor about the signedness of the output.
71
72
0
            __m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)),
73
0
                    interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1));
74
75
0
            return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B),
76
0
                                     interlaced_x_weights);
77
0
        };
78
79
        // Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B.
80
        // Returns two pixels, with each color channel in a 16-bit lane of the __m128i.
81
0
        auto interpolate_in_x_and_y = [&](uint32_t A0, uint32_t A1,
82
0
                                          uint32_t A2, uint32_t A3,
83
0
                                          uint32_t B0, uint32_t B1,
84
0
                                          uint32_t B2, uint32_t B3,
85
0
                                          __m128i interlaced_x_weights,
86
0
                                          int wy) {
87
            // Interpolate each row in X, leaving 16-bit lanes scaled by interlaced_x_weights.
88
0
            __m128i top = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights),
89
0
                    bot = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);
90
91
            // Interpolate in Y.  As in the SSE2 code, we calculate top*(16-wy) + bot*wy
92
            // as 16*top + (bot-top)*wy to save a multiply.
93
0
            __m128i px = _mm_add_epi16(_mm_slli_epi16(top, 4),
94
0
                                       _mm_mullo_epi16(_mm_sub_epi16(bot, top),
95
0
                                                       _mm_set1_epi16(wy)));
96
97
            // Scale down by total max weight 16x16 = 256.
98
0
            px = _mm_srli_epi16(px, 8);
99
100
            // Scale by alpha if needed.
101
0
            if (s.fAlphaScale < 256) {
102
0
                px = _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(s.fAlphaScale)), 8);
103
0
            }
104
0
            return px;
105
0
        };
106
107
        // We're in _DX mode here, so we're only varying in X.
108
        // That means the first entry of xy is our constant pair of Y coordinates and weight in Y.
109
        // All the other entries in xy will be pairs of X coordinates and the X weight.
110
0
        int y0, y1, wy;
111
0
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
112
113
0
        auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),
114
0
             row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());
115
116
0
        while (count >= 4) {
117
            // We can really get going, loading 4 X-pairs at a time to produce 4 output pixels.
118
0
            int x0[4],
119
0
                x1[4];
120
0
            __m128i wx;
121
122
            // decode_packed_coordinates_and_weight(), 4x.
123
0
            __m128i packed = _mm_loadu_si128((const __m128i*)xy);
124
0
            _mm_storeu_si128((__m128i*)x0, _mm_srli_epi32(packed, 18));
125
0
            _mm_storeu_si128((__m128i*)x1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff)));
126
0
            wx = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf));  // [0,15]
127
128
            // Splat each x weight 4x (for each color channel) as wr for pixels on the right at x1,
129
            // and sixteen minus that as wl for pixels on the left at x0.
130
0
            __m128i wr = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)),
131
0
                    wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);
132
133
            // We need to interlace wl and wr for _mm_maddubs_epi16().
134
0
            __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wl,wr),
135
0
                    interlaced_x_weights_CD = _mm_unpackhi_epi8(wl,wr);
136
137
0
            enum { A,B,C,D };
138
139
            // interpolate_in_x_and_y() can produce two output pixels (A and B) at a time
140
            // from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each.
141
0
            __m128i AB = interpolate_in_x_and_y(row0[x0[A]], row0[x1[A]],
142
0
                                                row1[x0[A]], row1[x1[A]],
143
0
                                                row0[x0[B]], row0[x1[B]],
144
0
                                                row1[x0[B]], row1[x1[B]],
145
0
                                                interlaced_x_weights_AB, wy);
146
147
            // Once more with the other half of the x-weights for two more pixels C,D.
148
0
            __m128i CD = interpolate_in_x_and_y(row0[x0[C]], row0[x1[C]],
149
0
                                                row1[x0[C]], row1[x1[C]],
150
0
                                                row0[x0[D]], row0[x1[D]],
151
0
                                                row1[x0[D]], row1[x1[D]],
152
0
                                                interlaced_x_weights_CD, wy);
153
154
            // Scale by alpha, pack back together to 8-bit lanes, and write out four pixels!
155
0
            _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(AB, CD));
156
0
            xy     += 4;
157
0
            colors += 4;
158
0
            count  -= 4;
159
0
        }
160
161
0
        while (count --> 0) {
162
            // This is exactly the same flow as the count >= 4 loop above, but writing one pixel.
163
0
            int x0, x1, wx;
164
0
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
165
166
            // As above, splat out wx four times as wr, and sixteen minus that as wl.
167
0
            __m128i wr = _mm_set1_epi8(wx),     // This splats it out 16 times, but that's fine.
168
0
                    wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);
169
170
0
            __m128i interlaced_x_weights = _mm_unpacklo_epi8(wl, wr);
171
172
0
            __m128i A = interpolate_in_x_and_y(row0[x0], row0[x1],
173
0
                                               row1[x0], row1[x1],
174
0
                                                      0,        0,
175
0
                                                      0,        0,
176
0
                                               interlaced_x_weights, wy);
177
178
0
            *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(A, _mm_setzero_si128()));
179
0
        }
180
0
    }
Unexecuted instantiation: ssse3::S32_alpha_D32_filter_DX(SkBitmapProcState const&, unsigned int const*, int, unsigned int*)
Unexecuted instantiation: ssse3::S32_alpha_D32_filter_DX(SkBitmapProcState const&, unsigned int const*, int, unsigned int*)
181
182
183
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
184
185
    /*not static*/ inline
186
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
187
80.3k
                                 const uint32_t* xy, int count, uint32_t* colors) {
188
80.3k
        SkASSERT(count > 0 && colors != nullptr);
189
80.3k
        SkASSERT(s.fBilerp);
190
80.3k
        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
191
80.3k
        SkASSERT(s.fAlphaScale <= 256);
192
193
80.3k
        int y0, y1, wy;
194
80.3k
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
195
196
80.3k
        auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
197
80.3k
             row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
198
199
        // We'll put one pixel in the low 4 16-bit lanes to line up with wy,
200
        // and another in the upper 4 16-bit lanes to line up with 16 - wy.
201
80.3k
        const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16(   wy),   // Bottom pixel goes here.
202
80.3k
                                                _mm_set1_epi16(16-wy));  // Top pixel goes here.
203
204
2.82M
        while (count --> 0) {
205
2.74M
            int x0, x1, wx;
206
2.74M
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
207
208
            // Load the 4 pixels we're interpolating, in this grid:
209
            //    | tl  tr |
210
            //    | bl  br |
211
2.74M
            const __m128i tl = _mm_cvtsi32_si128(row0[x0]), tr = _mm_cvtsi32_si128(row0[x1]),
212
2.74M
                          bl = _mm_cvtsi32_si128(row1[x0]), br = _mm_cvtsi32_si128(row1[x1]);
213
214
            // We want to calculate a sum of 4 pixels weighted in two directions:
215
            //
216
            //  sum = tl * (16-wy) * (16-wx)
217
            //      + bl * (   wy) * (16-wx)
218
            //      + tr * (16-wy) * (   wx)
219
            //      + br * (   wy) * (   wx)
220
            //
221
            // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
222
            //
223
            // We've already prepared allY as a vector containing [wy, 16-wy] as a way
224
            // to apply those y-direction weights.  So we'll start on the x-direction
225
            // first, grouping into left and right halves, lined up with allY:
226
            //
227
            //     L = [bl, tl]
228
            //     R = [br, tr]
229
            //
230
            //   sum = horizontalSum( allY * (L*(16-wx) + R*wx) )
231
            //
232
            // Rewriting that one more step, we can replace a multiply with a shift:
233
            //
234
            //   sum = horizontalSum( allY * (16*L + (R-L)*wx) )
235
            //
236
            // That's how we'll actually do this math.
237
238
2.74M
            __m128i L = _mm_unpacklo_epi8(_mm_unpacklo_epi32(bl, tl), _mm_setzero_si128()),
239
2.74M
                    R = _mm_unpacklo_epi8(_mm_unpacklo_epi32(br, tr), _mm_setzero_si128());
240
241
2.74M
            __m128i inner = _mm_add_epi16(_mm_slli_epi16(L, 4),
242
2.74M
                                          _mm_mullo_epi16(_mm_sub_epi16(R,L), _mm_set1_epi16(wx)));
243
244
2.74M
            __m128i sum_in_x = _mm_mullo_epi16(inner, allY);
245
246
            // sum = horizontalSum( ... )
247
2.74M
            __m128i sum = _mm_add_epi16(sum_in_x, _mm_srli_si128(sum_in_x, 8));
248
249
            // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
250
2.74M
            sum = _mm_srli_epi16(sum, 8);
251
252
2.74M
            if (s.fAlphaScale < 256) {
253
                // Scale by alpha, which is in [0,256].
254
0
                sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale));
255
0
                sum = _mm_srli_epi16(sum, 8);
256
0
            }
257
258
            // Pack back into 8-bit values and store.
259
2.74M
            *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128()));
260
2.74M
        }
261
80.3k
    }
sse2::S32_alpha_D32_filter_DX(SkBitmapProcState const&, unsigned int const*, int, unsigned int*)
Line
Count
Source
187
80.3k
                                 const uint32_t* xy, int count, uint32_t* colors) {
188
80.3k
        SkASSERT(count > 0 && colors != nullptr);
189
80.3k
        SkASSERT(s.fBilerp);
190
80.3k
        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
191
80.3k
        SkASSERT(s.fAlphaScale <= 256);
192
193
80.3k
        int y0, y1, wy;
194
80.3k
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
195
196
80.3k
        auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
197
80.3k
             row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
198
199
        // We'll put one pixel in the low 4 16-bit lanes to line up with wy,
200
        // and another in the upper 4 16-bit lanes to line up with 16 - wy.
201
80.3k
        const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16(   wy),   // Bottom pixel goes here.
202
80.3k
                                                _mm_set1_epi16(16-wy));  // Top pixel goes here.
203
204
2.82M
        while (count --> 0) {
205
2.74M
            int x0, x1, wx;
206
2.74M
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
207
208
            // Load the 4 pixels we're interpolating, in this grid:
209
            //    | tl  tr |
210
            //    | bl  br |
211
2.74M
            const __m128i tl = _mm_cvtsi32_si128(row0[x0]), tr = _mm_cvtsi32_si128(row0[x1]),
212
2.74M
                          bl = _mm_cvtsi32_si128(row1[x0]), br = _mm_cvtsi32_si128(row1[x1]);
213
214
            // We want to calculate a sum of 4 pixels weighted in two directions:
215
            //
216
            //  sum = tl * (16-wy) * (16-wx)
217
            //      + bl * (   wy) * (16-wx)
218
            //      + tr * (16-wy) * (   wx)
219
            //      + br * (   wy) * (   wx)
220
            //
221
            // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
222
            //
223
            // We've already prepared allY as a vector containing [wy, 16-wy] as a way
224
            // to apply those y-direction weights.  So we'll start on the x-direction
225
            // first, grouping into left and right halves, lined up with allY:
226
            //
227
            //     L = [bl, tl]
228
            //     R = [br, tr]
229
            //
230
            //   sum = horizontalSum( allY * (L*(16-wx) + R*wx) )
231
            //
232
            // Rewriting that one more step, we can replace a multiply with a shift:
233
            //
234
            //   sum = horizontalSum( allY * (16*L + (R-L)*wx) )
235
            //
236
            // That's how we'll actually do this math.
237
238
2.74M
            __m128i L = _mm_unpacklo_epi8(_mm_unpacklo_epi32(bl, tl), _mm_setzero_si128()),
239
2.74M
                    R = _mm_unpacklo_epi8(_mm_unpacklo_epi32(br, tr), _mm_setzero_si128());
240
241
2.74M
            __m128i inner = _mm_add_epi16(_mm_slli_epi16(L, 4),
242
2.74M
                                          _mm_mullo_epi16(_mm_sub_epi16(R,L), _mm_set1_epi16(wx)));
243
244
2.74M
            __m128i sum_in_x = _mm_mullo_epi16(inner, allY);
245
246
            // sum = horizontalSum( ... )
247
2.74M
            __m128i sum = _mm_add_epi16(sum_in_x, _mm_srli_si128(sum_in_x, 8));
248
249
            // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
250
2.74M
            sum = _mm_srli_epi16(sum, 8);
251
252
2.74M
            if (s.fAlphaScale < 256) {
253
                // Scale by alpha, which is in [0,256].
254
0
                sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale));
255
0
                sum = _mm_srli_epi16(sum, 8);
256
0
            }
257
258
            // Pack back into 8-bit values and store.
259
2.74M
            *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128()));
260
2.74M
        }
261
80.3k
    }
Unexecuted instantiation: sse2::S32_alpha_D32_filter_DX(SkBitmapProcState const&, unsigned int const*, int, unsigned int*)
262
263
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
264
    /*not static*/ inline
265
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
266
                                 const uint32_t* xy, int count, uint32_t* colors) {
267
        SkASSERT(count > 0 && colors != nullptr);
268
        SkASSERT(s.fBilerp);
269
        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
270
        SkASSERT(s.fAlphaScale <= 256);
271
272
        int y0, y1, wy;
273
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
274
275
        auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
276
             row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
277
278
        // We'll put one pixel in the low 16 16-bit lanes to line up with wy,
279
        // and another in the upper 16 16-bit lanes to line up with 16 - wy.
280
        __m256i allY = __lasx_xvilvl_d(__lasx_xvreplgr2vr_h(16-wy), __lasx_xvreplgr2vr_h(wy));
281
282
        while (count --> 0) {
283
            int x0, x1, wx;
284
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
285
286
            // Load the 4 pixels we're interpolating, in this grid:
287
            //    | tl  tr |
288
            //    | bl  br |
289
290
            const __m256i zeros = __lasx_xvldi(0);
291
            const __m256i tl = __lasx_xvinsgr2vr_w(zeros, row0[x0], 0),
292
                          tr = __lasx_xvinsgr2vr_w(zeros, row0[x1], 0),
293
                          bl = __lasx_xvinsgr2vr_w(zeros, row1[x0], 0),
294
                          br = __lasx_xvinsgr2vr_w(zeros, row1[x1], 0);
295
296
            // We want to calculate a sum of 8 pixels weighted in two directions:
297
            //
298
            //  sum = tl * (16-wy) * (16-wx)
299
            //      + bl * (   wy) * (16-wx)
300
            //      + tr * (16-wy) * (   wx)
301
            //      + br * (   wy) * (   wx)
302
            //
303
            // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
304
            //
305
            // We've already prepared allY as a vector containing [wy, 16-wy] as a way
306
            // to apply those y-direction weights.  So we'll start on the x-direction
307
            // first, grouping into left and right halves, lined up with allY:
308
            //
309
            //     L = [bl, tl]
310
            //     R = [br, tr]
311
            //
312
            //   sum = horizontalSum( allY * (L*(16-wx) + R*wx) )
313
            //
314
            // Rewriting that one more step, we can replace a multiply with a shift:
315
            //
316
            //   sum = horizontalSum( allY * (16*L + (R-L)*wx) )
317
            //
318
            // That's how we'll actually do this math.
319
320
            __m256i L = __lasx_xvilvl_b(__lasx_xvldi(0), __lasx_xvilvl_w(tl, bl)),
321
                    R = __lasx_xvilvl_b(__lasx_xvldi(0), __lasx_xvilvl_w(tr, br));
322
323
            __m256i inner = __lasx_xvadd_h(__lasx_xvslli_h(L, 4),
324
                                           __lasx_xvmul_h(__lasx_xvsub_h(R,L),
325
                                                          __lasx_xvreplgr2vr_h(wx)));
326
327
            __m256i sum_in_x = __lasx_xvmul_h(inner, allY);
328
329
            // sum = horizontalSum( ... )
330
            __m256i sum = __lasx_xvadd_h(sum_in_x, __lasx_xvbsrl_v(sum_in_x, 8));
331
332
            // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
333
            sum = __lasx_xvsrli_h(sum, 8);
334
335
            if (s.fAlphaScale < 256) {
336
                // Scale by alpha, which is in [0,256].
337
                sum = __lasx_xvmul_h(sum, __lasx_xvreplgr2vr_h(s.fAlphaScale));
338
                sum = __lasx_xvsrli_h(sum, 8);
339
            }
340
341
            // Pack back into 8-bit values and store.
342
            *colors++ = __lasx_xvpickve2gr_w(__lasx_xvpickev_b(__lasx_xvldi(0),
343
                                                               __lasx_xvsat_hu(sum, 8)), 0);
344
        }
345
    }
346
347
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
348
349
    /*not static*/ inline
350
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
351
                                 const uint32_t* xy, int count, uint32_t* colors) {
352
        SkASSERT(count > 0 && colors != nullptr);
353
        SkASSERT(s.fBilerp);
354
        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
355
        SkASSERT(s.fAlphaScale <= 256);
356
357
        int y0, y1, wy;
358
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
359
360
        auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
361
             row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
362
363
        // We'll put one pixel in the low 8 16-bit lanes to line up with wy,
364
        // and another in the upper 8 16-bit lanes to line up with 16 - wy.
365
        __m128i allY = __lsx_vilvl_d(__lsx_vreplgr2vr_h(16-wy), __lsx_vreplgr2vr_h(wy));
366
367
        while (count --> 0) {
368
            int x0, x1, wx;
369
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
370
371
            // Load the 4 pixels we're interpolating, in this grid:
372
            //    | tl  tr |
373
            //    | bl  br |
374
            const __m128i zeros = __lsx_vldi(0);
375
            const __m128i tl = __lsx_vinsgr2vr_w(zeros, row0[x0], 0),
376
                          tr = __lsx_vinsgr2vr_w(zeros, row0[x1], 0),
377
                          bl = __lsx_vinsgr2vr_w(zeros, row1[x0], 0),
378
                          br = __lsx_vinsgr2vr_w(zeros, row1[x1], 0);
379
380
            // We want to calculate a sum of 8 pixels weighted in two directions:
381
            //
382
            //  sum = tl * (16-wy) * (16-wx)
383
            //      + bl * (   wy) * (16-wx)
384
            //      + tr * (16-wy) * (   wx)
385
            //      + br * (   wy) * (   wx)
386
            //
387
            // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)
388
            //
389
            // We've already prepared allY as a vector containing [wy, 16-wy] as a way
390
            // to apply those y-direction weights.  So we'll start on the x-direction
391
            // first, grouping into left and right halves, lined up with allY:
392
            //
393
            //     L = [bl, tl]
394
            //     R = [br, tr]
395
            //
396
            //   sum = horizontalSum( allY * (L*(16-wx) + R*wx) )
397
            //
398
            // Rewriting that one more step, we can replace a multiply with a shift:
399
            //
400
            //   sum = horizontalSum( allY * (16*L + (R-L)*wx) )
401
            //
402
            // That's how we'll actually do this math.
403
404
405
            __m128i L = __lsx_vilvl_b(__lsx_vldi(0), __lsx_vilvl_w(tl, bl)),
406
                    R = __lsx_vilvl_b(__lsx_vldi(0), __lsx_vilvl_w(tr, br));
407
408
            __m128i inner = __lsx_vadd_h(__lsx_vslli_h(L, 4),
409
                                         __lsx_vmul_h(__lsx_vsub_h(R,L),
410
                                                      __lsx_vreplgr2vr_h(wx)));
411
412
            __m128i sum_in_x = __lsx_vmul_h(inner, allY);
413
414
            // sum = horizontalSum( ... )
415
            __m128i sum = __lsx_vadd_h(sum_in_x, __lsx_vbsrl_v(sum_in_x, 8));
416
417
            // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
418
            sum = __lsx_vsrli_h(sum, 8);
419
420
            if (s.fAlphaScale < 256) {
421
                // Scale by alpha, which is in [0,256].
422
                sum = __lsx_vmul_h(sum, __lsx_vreplgr2vr_h(s.fAlphaScale));
423
                sum = __lsx_vsrli_h(sum, 8);
424
            }
425
426
            // Pack back into 8-bit values and store.
427
            *colors++ = __lsx_vpickve2gr_w(__lsx_vpickev_b(__lsx_vldi(0),
428
                                                           __lsx_vsat_hu(sum, 8)), 0);
429
        }
430
    }
431
432
#else
433
434
    // The NEON code only actually differs from the portable code in the
435
    // filtering step after we've loaded all four pixels we want to bilerp.
436
437
    #if defined(SK_ARM_HAS_NEON)
438
        static void filter_and_scale_by_alpha(unsigned x, unsigned y,
439
                                              SkPMColor a00, SkPMColor a01,
440
                                              SkPMColor a10, SkPMColor a11,
441
                                              SkPMColor *dst,
442
                                              uint16_t scale) {
443
            uint8x8_t vy, vconst16_8, v16_y, vres;
444
            uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
445
            uint32x2_t va0, va1;
446
            uint16x8_t tmp1, tmp2;
447
448
            vy = vdup_n_u8(y);                // duplicate y into vy
449
            vconst16_8 = vmov_n_u8(16);       // set up constant in vconst16_8
450
            v16_y = vsub_u8(vconst16_8, vy);  // v16_y = 16-y
451
452
            va0 = vdup_n_u32(a00);            // duplicate a00
453
            va1 = vdup_n_u32(a10);            // duplicate a10
454
            va0 = vset_lane_u32(a01, va0, 1); // set top to a01
455
            va1 = vset_lane_u32(a11, va1, 1); // set top to a11
456
457
            tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
458
            tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);    // tmp2 = [a11|a10] * y
459
460
            vx = vdup_n_u16(x);                // duplicate x into vx
461
            vconst16_16 = vmov_n_u16(16);      // set up constant in vconst16_16
462
            v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x
463
464
            tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp  = a01 * x
465
            tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += a11 * x
466
            tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x)
467
            tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x)
468
469
            if (scale < 256) {
470
                vscale = vdup_n_u16(scale);        // duplicate scale
471
                tmp = vshr_n_u16(tmp, 8);          // shift down result by 8
472
                tmp = vmul_u16(tmp, vscale);       // multiply result by scale
473
            }
474
475
            vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16((uint64_t)0)), 8); // shift down result by 8
476
            vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);         // store result
477
        }
478
    #else
479
        static void filter_and_scale_by_alpha(unsigned x, unsigned y,
480
                                              SkPMColor a00, SkPMColor a01,
481
                                              SkPMColor a10, SkPMColor a11,
482
                                              SkPMColor* dstColor,
483
                                              unsigned alphaScale) {
484
            SkASSERT((unsigned)x <= 0xF);
485
            SkASSERT((unsigned)y <= 0xF);
486
            SkASSERT(alphaScale <= 256);
487
488
            int xy = x * y;
489
            const uint32_t mask = 0xFF00FF;
490
491
            int scale = 256 - 16*y - 16*x + xy;
492
            uint32_t lo = (a00 & mask) * scale;
493
            uint32_t hi = ((a00 >> 8) & mask) * scale;
494
495
            scale = 16*x - xy;
496
            lo += (a01 & mask) * scale;
497
            hi += ((a01 >> 8) & mask) * scale;
498
499
            scale = 16*y - xy;
500
            lo += (a10 & mask) * scale;
501
            hi += ((a10 >> 8) & mask) * scale;
502
503
            lo += (a11 & mask) * xy;
504
            hi += ((a11 >> 8) & mask) * xy;
505
506
            if (alphaScale < 256) {
507
                lo = ((lo >> 8) & mask) * alphaScale;
508
                hi = ((hi >> 8) & mask) * alphaScale;
509
            }
510
511
            *dstColor = ((lo >> 8) & mask) | (hi & ~mask);
512
        }
513
    #endif
514
515
516
    /*not static*/ inline
517
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
518
                                 const uint32_t* xy, int count, SkPMColor* colors) {
519
        SkASSERT(count > 0 && colors != nullptr);
520
        SkASSERT(s.fBilerp);
521
        SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
522
        SkASSERT(s.fAlphaScale <= 256);
523
524
        int y0, y1, wy;
525
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
526
527
        auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
528
             row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
529
530
        while (count --> 0) {
531
            int x0, x1, wx;
532
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
533
534
            filter_and_scale_by_alpha(wx, wy,
535
                                      row0[x0], row0[x1],
536
                                      row1[x0], row1[x1],
537
                                      colors++,
538
                                      s.fAlphaScale);
539
        }
540
    }
541
542
#endif
543
544
#if defined(SK_ARM_HAS_NEON)
545
    /*not static*/ inline
546
    void S32_alpha_D32_filter_DXDY(const SkBitmapProcState& s,
547
                                   const uint32_t* xy, int count, SkPMColor* colors) {
548
        SkASSERT(count > 0 && colors != nullptr);
549
        SkASSERT(s.fBilerp);
550
        SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
551
        SkASSERT(s.fAlphaScale <= 256);
552
553
        auto src = (const char*)s.fPixmap.addr();
554
        size_t rb = s.fPixmap.rowBytes();
555
556
        while (count --> 0) {
557
            int y0, y1, wy,
558
                x0, x1, wx;
559
            decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
560
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
561
562
            auto row0 = (const uint32_t*)(src + y0*rb),
563
                 row1 = (const uint32_t*)(src + y1*rb);
564
565
            filter_and_scale_by_alpha(wx, wy,
566
                                      row0[x0], row0[x1],
567
                                      row1[x0], row1[x1],
568
                                      colors++,
569
                                      s.fAlphaScale);
570
        }
571
    }
572
#else
573
    // It's not yet clear whether it's worthwhile specializing for other architectures.
574
    constexpr static void (*S32_alpha_D32_filter_DXDY)(const SkBitmapProcState&,
575
                                                       const uint32_t*, int, SkPMColor*) = nullptr;
576
#endif
577
578
}  // namespace SK_OPTS_NS
579
580
namespace sktests {
581
    template <typename U32, typename Out>
582
    void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) {
583
        SK_OPTS_NS::decode_packed_coordinates_and_weight<U32, Out>(packed, v0, v1, w);
584
    }
585
}
586
587
#endif