/src/skia/src/core/Sk4px.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright 2015 Google Inc. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license that can be |
5 | | * found in the LICENSE file. |
6 | | */ |
7 | | |
8 | | #ifndef Sk4px_DEFINED |
9 | | #define Sk4px_DEFINED |
10 | | |
11 | | #include "include/core/SkColor.h" |
12 | | #include "include/private/SkColorData.h" |
13 | | #include "src/base/SkVx.h" |
14 | | |
15 | | // 1, 2 or 4 SkPMColors, generally vectorized. |
16 | | class Sk4px { |
17 | | public: |
18 | 47.8M | Sk4px(const skvx::byte16& v) : fV(v) {} |
19 | | |
20 | 4.46k | static Sk4px DupPMColor(SkPMColor c) { |
21 | 4.46k | skvx::uint4 splat(c); |
22 | | |
23 | 4.46k | Sk4px v; |
24 | 4.46k | memcpy((void*)&v, &splat, 16); |
25 | 4.46k | return v; |
26 | 4.46k | } |
27 | | |
28 | | // RGBA rgba XYZW xyzw -> AAAA aaaa WWWW wwww |
29 | 397k | Sk4px alphas() const { |
30 | 397k | static_assert(SK_A32_SHIFT == 24, "This method assumes little-endian."); |
31 | 397k | return Sk4px(skvx::shuffle<3,3,3,3, 7,7,7,7, 11,11,11,11, 15,15,15,15>(fV)); |
32 | 397k | } |
33 | 8.17M | Sk4px inv() const { return Sk4px(skvx::byte16(255) - fV); } |
34 | | |
35 | | // When loading or storing fewer than 4 SkPMColors, we use the low lanes. |
36 | 7.86M | static Sk4px Load4(const SkPMColor px[4]) { |
37 | 7.86M | Sk4px v; |
38 | 7.86M | memcpy((void*)&v, px, 16); |
39 | 7.86M | return v; |
40 | 7.86M | } |
41 | 144k | static Sk4px Load2(const SkPMColor px[2]) { |
42 | 144k | Sk4px v; |
43 | 144k | memcpy((void*)&v, px, 8); |
44 | 144k | return v; |
45 | 144k | } |
46 | 159k | static Sk4px Load1(const SkPMColor px[1]) { |
47 | 159k | Sk4px v; |
48 | 159k | memcpy((void*)&v, px, 4); |
49 | 159k | return v; |
50 | 159k | } |
51 | | |
52 | | // Ditto for Alphas... Load2Alphas fills the low two lanes of Sk4px. |
53 | | // AaXx -> AAAA aaaa XXXX xxxx |
54 | 7.86M | static Sk4px Load4Alphas(const SkAlpha alphas[4]) { |
55 | 7.86M | skvx::byte4 a = skvx::byte4::Load(alphas); |
56 | 7.86M | return Sk4px(skvx::shuffle<0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3>(a)); |
57 | 7.86M | } |
58 | | // Aa -> AAAA aaaa ???? ???? |
59 | 144k | static Sk4px Load2Alphas(const SkAlpha alphas[2]) { |
60 | 144k | skvx::byte2 a = skvx::byte2::Load(alphas); |
61 | 144k | return Sk4px(join(skvx::shuffle<0,0,0,0, 1,1,1,1>(a), skvx::byte8())); |
62 | 144k | } |
63 | | |
64 | 7.86M | void store4(SkPMColor px[4]) const { memcpy(px, this, 16); } |
65 | 144k | void store2(SkPMColor px[2]) const { memcpy(px, this, 8); } |
66 | 159k | void store1(SkPMColor px[1]) const { memcpy(px, this, 4); } |
67 | | |
68 | | // 1, 2, or 4 SkPMColors with 16-bit components. |
69 | | // This is most useful as the result of a multiply, e.g. from mulWiden(). |
70 | | class Wide { |
71 | | public: |
72 | 0 | Wide(const skvx::Vec<16, uint16_t>& v) : fV(v) {} |
73 | | |
74 | | // Rounds, i.e. (x+127) / 255. |
75 | 0 | Sk4px div255() const { return Sk4px(skvx::div255(fV)); } |
76 | | |
77 | 0 | Wide operator * (const Wide& o) const { return Wide(fV * o.fV); } |
78 | 0 | Wide operator + (const Wide& o) const { return Wide(fV + o.fV); } |
79 | 0 | Wide operator - (const Wide& o) const { return Wide(fV - o.fV); } |
80 | 0 | Wide operator >> (int bits) const { return Wide(fV >> bits); } |
81 | 0 | Wide operator << (int bits) const { return Wide(fV << bits); } |
82 | | |
83 | | private: |
84 | | skvx::Vec<16, uint16_t> fV; |
85 | | }; |
86 | | |
87 | | // Widen 8-bit values to low 8-bits of 16-bit lanes. |
88 | 0 | Wide widen() const { return Wide(skvx::cast<uint16_t>(fV)); } |
89 | | // 8-bit x 8-bit -> 16-bit components. |
90 | 0 | Wide mulWiden(const skvx::byte16& o) const { return Wide(mull(fV, o)); } |
91 | | |
92 | | // The only 8-bit multiply we use is 8-bit x 8-bit -> 16-bit. Might as well make it pithy. |
93 | 0 | Wide operator * (const Sk4px& o) const { return this->mulWiden(o.fV); } |
94 | | |
95 | 8.17M | Sk4px operator + (const Sk4px& o) const { return Sk4px(fV + o.fV); } |
96 | 0 | Sk4px operator - (const Sk4px& o) const { return Sk4px(fV - o.fV); } |
97 | 0 | Sk4px operator < (const Sk4px& o) const { return Sk4px(fV < o.fV); } |
98 | 6.63M | Sk4px operator & (const Sk4px& o) const { return Sk4px(fV & o.fV); } |
99 | 0 | Sk4px thenElse(const Sk4px& t, const Sk4px& e) const { |
100 | 0 | return Sk4px(if_then_else(fV, t.fV, e.fV)); |
101 | 0 | } |
102 | | |
103 | | // Generally faster than (*this * o).div255(). |
104 | | // May be incorrect by +-1, but is always exactly correct when *this or o is 0 or 255. |
105 | 9.70M | Sk4px approxMulDiv255(const Sk4px& o) const { |
106 | 9.70M | return Sk4px(approx_scale(fV, o.fV)); |
107 | 9.70M | } |
108 | | |
109 | 0 | Sk4px saturatedAdd(const Sk4px& o) const { |
110 | 0 | return Sk4px(saturated_add(fV, o.fV)); |
111 | 0 | } |
112 | | |
113 | | // A generic driver that maps fn over a src array into a dst array. |
114 | | // fn should take an Sk4px (4 src pixels) and return an Sk4px (4 dst pixels). |
115 | | template <typename Fn> |
116 | | [[maybe_unused]] static void MapSrc(int n, SkPMColor* dst, const SkPMColor* src, const Fn& fn) { |
117 | | SkASSERT(dst); |
118 | | SkASSERT(src); |
119 | | // This looks a bit odd, but it helps loop-invariant hoisting across different calls to fn. |
120 | | // Basically, we need to make sure we keep things inside a single loop. |
121 | | while (n > 0) { |
122 | | if (n >= 8) { |
123 | | Sk4px dst0 = fn(Load4(src+0)), |
124 | | dst4 = fn(Load4(src+4)); |
125 | | dst0.store4(dst+0); |
126 | | dst4.store4(dst+4); |
127 | | dst += 8; src += 8; n -= 8; |
128 | | continue; // Keep our stride at 8 pixels as long as possible. |
129 | | } |
130 | | SkASSERT(n <= 7); |
131 | | if (n >= 4) { |
132 | | fn(Load4(src)).store4(dst); |
133 | | dst += 4; src += 4; n -= 4; |
134 | | } |
135 | | if (n >= 2) { |
136 | | fn(Load2(src)).store2(dst); |
137 | | dst += 2; src += 2; n -= 2; |
138 | | } |
139 | | if (n >= 1) { |
140 | | fn(Load1(src)).store1(dst); |
141 | | } |
142 | | break; |
143 | | } |
144 | | } |
145 | | |
146 | | // As above, but with dst4' = fn(dst4, src4). |
147 | | template <typename Fn> |
148 | | [[maybe_unused]] static void MapDstSrc(int n, SkPMColor* dst, const SkPMColor* src, |
149 | | const Fn& fn) { |
150 | | SkASSERT(dst); |
151 | | SkASSERT(src); |
152 | | while (n > 0) { |
153 | | if (n >= 8) { |
154 | | Sk4px dst0 = fn(Load4(dst+0), Load4(src+0)), |
155 | | dst4 = fn(Load4(dst+4), Load4(src+4)); |
156 | | dst0.store4(dst+0); |
157 | | dst4.store4(dst+4); |
158 | | dst += 8; src += 8; n -= 8; |
159 | | continue; // Keep our stride at 8 pixels as long as possible. |
160 | | } |
161 | | SkASSERT(n <= 7); |
162 | | if (n >= 4) { |
163 | | fn(Load4(dst), Load4(src)).store4(dst); |
164 | | dst += 4; src += 4; n -= 4; |
165 | | } |
166 | | if (n >= 2) { |
167 | | fn(Load2(dst), Load2(src)).store2(dst); |
168 | | dst += 2; src += 2; n -= 2; |
169 | | } |
170 | | if (n >= 1) { |
171 | | fn(Load1(dst), Load1(src)).store1(dst); |
172 | | } |
173 | | break; |
174 | | } |
175 | | } |
176 | | |
177 | | // As above, but with dst4' = fn(dst4, alpha4). |
178 | | template <typename Fn> |
179 | | [[maybe_unused]] static void MapDstAlpha(int n, SkPMColor* dst, const SkAlpha* a, |
180 | 386k | const Fn& fn) { |
181 | 386k | SkASSERT(dst); |
182 | 386k | SkASSERT(a); |
183 | 4.24M | while (n > 0) { |
184 | 4.13M | if (n >= 8) { |
185 | 3.85M | Sk4px dst0 = fn(Load4(dst+0), Load4Alphas(a+0)), |
186 | 3.85M | dst4 = fn(Load4(dst+4), Load4Alphas(a+4)); |
187 | 3.85M | dst0.store4(dst+0); |
188 | 3.85M | dst4.store4(dst+4); |
189 | 3.85M | dst += 8; a += 8; n -= 8; |
190 | 3.85M | continue; // Keep our stride at 8 pixels as long as possible. |
191 | 3.85M | } |
192 | 277k | SkASSERT(n <= 7); |
193 | 277k | if (n >= 4) { |
194 | 157k | fn(Load4(dst), Load4Alphas(a)).store4(dst); |
195 | 157k | dst += 4; a += 4; n -= 4; |
196 | 157k | } |
197 | 277k | if (n >= 2) { |
198 | 144k | fn(Load2(dst), Load2Alphas(a)).store2(dst); |
199 | 144k | dst += 2; a += 2; n -= 2; |
200 | 144k | } |
201 | 277k | if (n >= 1) { |
202 | 159k | fn(Load1(dst), skvx::byte16(*a)).store1(dst); |
203 | 159k | } |
204 | 277k | break; |
205 | 4.13M | } |
206 | 386k | } SkBlitMask_opts.cpp:void Sk4px::MapDstAlpha<sse2::blit_mask_d32_a8_black(unsigned int*, unsigned long, unsigned char const*, unsigned long, int, int)::$_0>(int, unsigned int*, unsigned char const*, sse2::blit_mask_d32_a8_black(unsigned int*, unsigned long, unsigned char const*, unsigned long, int, int)::$_0 const&) Line | Count | Source | 180 | 302k | const Fn& fn) { | 181 | 302k | SkASSERT(dst); | 182 | 302k | SkASSERT(a); | 183 | 3.45M | while (n > 0) { | 184 | 3.35M | if (n >= 8) { | 185 | 3.15M | Sk4px dst0 = fn(Load4(dst+0), Load4Alphas(a+0)), | 186 | 3.15M | dst4 = fn(Load4(dst+4), Load4Alphas(a+4)); | 187 | 3.15M | dst0.store4(dst+0); | 188 | 3.15M | dst4.store4(dst+4); | 189 | 3.15M | dst += 8; a += 8; n -= 8; | 190 | 3.15M | continue; // Keep our stride at 8 pixels as long as possible. | 191 | 3.15M | } | 192 | 203k | SkASSERT(n <= 7); | 193 | 203k | if (n >= 4) { | 194 | 109k | fn(Load4(dst), Load4Alphas(a)).store4(dst); | 195 | 109k | dst += 4; a += 4; n -= 4; | 196 | 109k | } | 197 | 203k | if (n >= 2) { | 198 | 96.9k | fn(Load2(dst), Load2Alphas(a)).store2(dst); | 199 | 96.9k | dst += 2; a += 2; n -= 2; | 200 | 96.9k | } | 201 | 203k | if (n >= 1) { | 202 | 127k | fn(Load1(dst), skvx::byte16(*a)).store1(dst); | 203 | 127k | } | 204 | 203k | break; | 205 | 3.35M | } | 206 | 302k | } |
SkBlitMask_opts.cpp:void Sk4px::MapDstAlpha<sse2::blit_mask_d32_a8_opaque(unsigned int*, unsigned long, unsigned char const*, unsigned long, unsigned int, int, int)::$_0>(int, unsigned int*, unsigned char const*, sse2::blit_mask_d32_a8_opaque(unsigned int*, unsigned long, unsigned char const*, unsigned long, unsigned int, int, int)::$_0 const&) Line | Count | Source | 180 | 55.2k | const Fn& fn) { | 181 | 55.2k | SkASSERT(dst); | 182 | 55.2k | SkASSERT(a); | 183 | 578k | while (n > 0) { | 184 | 574k | if (n >= 8) { | 185 | 522k | Sk4px dst0 = fn(Load4(dst+0), Load4Alphas(a+0)), | 186 | 522k | dst4 = fn(Load4(dst+4), Load4Alphas(a+4)); | 187 | 522k | dst0.store4(dst+0); | 188 | 522k | dst4.store4(dst+4); | 189 | 522k | dst += 8; a += 8; n -= 8; | 190 | 522k | continue; // Keep our stride at 8 pixels as long as possible. | 191 | 522k | } | 192 | 51.7k | SkASSERT(n <= 7); | 193 | 51.7k | if (n >= 4) { | 194 | 37.4k | fn(Load4(dst), Load4Alphas(a)).store4(dst); | 195 | 37.4k | dst += 4; a += 4; n -= 4; | 196 | 37.4k | } | 197 | 51.7k | if (n >= 2) { | 198 | 38.7k | fn(Load2(dst), Load2Alphas(a)).store2(dst); | 199 | 38.7k | dst += 2; a += 2; n -= 2; | 200 | 38.7k | } | 201 | 51.7k | if (n >= 1) { | 202 | 16.1k | fn(Load1(dst), skvx::byte16(*a)).store1(dst); | 203 | 16.1k | } | 204 | 51.7k | break; | 205 | 574k | } | 206 | 55.2k | } |
SkBlitMask_opts.cpp:void Sk4px::MapDstAlpha<sse2::blit_mask_d32_a8_general(unsigned int*, unsigned long, unsigned char const*, unsigned long, unsigned int, int, int)::$_0>(int, unsigned int*, unsigned char const*, sse2::blit_mask_d32_a8_general(unsigned int*, unsigned long, unsigned char const*, unsigned long, unsigned int, int, int)::$_0 const&) Line | Count | Source | 180 | 29.1k | const Fn& fn) { | 181 | 29.1k | SkASSERT(dst); | 182 | 29.1k | SkASSERT(a); | 183 | 210k | while (n > 0) { | 184 | 203k | if (n >= 8) { | 185 | 181k | Sk4px dst0 = fn(Load4(dst+0), Load4Alphas(a+0)), | 186 | 181k | dst4 = fn(Load4(dst+4), Load4Alphas(a+4)); | 187 | 181k | dst0.store4(dst+0); | 188 | 181k | dst4.store4(dst+4); | 189 | 181k | dst += 8; a += 8; n -= 8; | 190 | 181k | continue; // Keep our stride at 8 pixels as long as possible. | 191 | 181k | } | 192 | 22.4k | SkASSERT(n <= 7); | 193 | 22.4k | if (n >= 4) { | 194 | 10.3k | fn(Load4(dst), Load4Alphas(a)).store4(dst); | 195 | 10.3k | dst += 4; a += 4; n -= 4; | 196 | 10.3k | } | 197 | 22.4k | if (n >= 2) { | 198 | 8.67k | fn(Load2(dst), Load2Alphas(a)).store2(dst); | 199 | 8.67k | dst += 2; a += 2; n -= 2; | 200 | 8.67k | } | 201 | 22.4k | if (n >= 1) { | 202 | 15.5k | fn(Load1(dst), skvx::byte16(*a)).store1(dst); | 203 | 15.5k | } | 204 | 22.4k | break; | 205 | 203k | } | 206 | 29.1k | } |
Unexecuted instantiation: SkBlitMask_opts_ssse3.cpp:void Sk4px::MapDstAlpha<ssse3::blit_mask_d32_a8_black(unsigned int*, unsigned long, unsigned char const*, unsigned long, int, int)::$_0>(int, unsigned int*, unsigned char const*, ssse3::blit_mask_d32_a8_black(unsigned int*, unsigned long, unsigned char const*, unsigned long, int, int)::$_0 const&) Unexecuted instantiation: SkBlitMask_opts_ssse3.cpp:void Sk4px::MapDstAlpha<ssse3::blit_mask_d32_a8_opaque(unsigned int*, unsigned long, unsigned char const*, unsigned long, unsigned int, int, int)::$_0>(int, unsigned int*, unsigned char const*, ssse3::blit_mask_d32_a8_opaque(unsigned int*, unsigned long, unsigned char const*, unsigned long, unsigned int, int, int)::$_0 const&) Unexecuted instantiation: SkBlitMask_opts_ssse3.cpp:void Sk4px::MapDstAlpha<ssse3::blit_mask_d32_a8_general(unsigned int*, unsigned long, unsigned char const*, unsigned long, unsigned int, int, int)::$_0>(int, unsigned int*, unsigned char const*, ssse3::blit_mask_d32_a8_general(unsigned int*, unsigned long, unsigned char const*, unsigned long, unsigned int, int, int)::$_0 const&) Unexecuted instantiation: SkBlitMask_opts_ssse3.cpp:void Sk4px::MapDstAlpha<ssse3::blit_mask_d32_a8_black(unsigned int*, unsigned long, unsigned char const*, unsigned long, int, int)::$_0>(int, unsigned int*, unsigned char const*, ssse3::blit_mask_d32_a8_black(unsigned int*, unsigned long, unsigned char const*, unsigned long, int, int)::$_0 const&) Unexecuted instantiation: SkBlitMask_opts_ssse3.cpp:void Sk4px::MapDstAlpha<ssse3::blit_mask_d32_a8_opaque(unsigned int*, unsigned long, unsigned char const*, unsigned long, unsigned int, int, int)::$_0>(int, unsigned int*, unsigned char const*, ssse3::blit_mask_d32_a8_opaque(unsigned int*, unsigned long, unsigned char const*, unsigned long, unsigned int, int, int)::$_0 const&) Unexecuted instantiation: SkBlitMask_opts_ssse3.cpp:void Sk4px::MapDstAlpha<ssse3::blit_mask_d32_a8_general(unsigned int*, unsigned long, unsigned char 
const*, unsigned long, unsigned int, int, int)::$_0>(int, unsigned int*, unsigned char const*, ssse3::blit_mask_d32_a8_general(unsigned int*, unsigned long, unsigned char const*, unsigned long, unsigned int, int, int)::$_0 const&) |
207 | | |
208 | | // As above, but with dst4' = fn(dst4, src4, alpha4). |
209 | | template <typename Fn> |
210 | | [[maybe_unused]] static void MapDstSrcAlpha(int n, SkPMColor* dst, const SkPMColor* src, |
211 | | const SkAlpha* a, const Fn& fn) { |
212 | | SkASSERT(dst); |
213 | | SkASSERT(src); |
214 | | SkASSERT(a); |
215 | | while (n > 0) { |
216 | | if (n >= 8) { |
217 | | Sk4px dst0 = fn(Load4(dst+0), Load4(src+0), Load4Alphas(a+0)), |
218 | | dst4 = fn(Load4(dst+4), Load4(src+4), Load4Alphas(a+4)); |
219 | | dst0.store4(dst+0); |
220 | | dst4.store4(dst+4); |
221 | | dst += 8; src += 8; a += 8; n -= 8; |
222 | | continue; // Keep our stride at 8 pixels as long as possible. |
223 | | } |
224 | | SkASSERT(n <= 7); |
225 | | if (n >= 4) { |
226 | | fn(Load4(dst), Load4(src), Load4Alphas(a)).store4(dst); |
227 | | dst += 4; src += 4; a += 4; n -= 4; |
228 | | } |
229 | | if (n >= 2) { |
230 | | fn(Load2(dst), Load2(src), Load2Alphas(a)).store2(dst); |
231 | | dst += 2; src += 2; a += 2; n -= 2; |
232 | | } |
233 | | if (n >= 1) { |
234 | | fn(Load1(dst), Load1(src), skvx::byte16(*a)).store1(dst); |
235 | | } |
236 | | break; |
237 | | } |
238 | | } |
239 | | |
240 | | private: |
241 | 8.17M | Sk4px() = default; |
242 | | |
243 | | skvx::byte16 fV; |
244 | | }; |
245 | | |
// Layout guards: the Load*/store*/DupPMColor helpers memcpy up to 16 bytes to and from the
// Sk4px object itself, so Sk4px must have the same size (and, per the second assert, the same
// alignment) as the skvx::byte16 it wraps.
246 | | static_assert(sizeof(Sk4px) == sizeof(skvx::byte16)); |
247 | | static_assert(alignof(Sk4px) == alignof(skvx::byte16)); |
248 | | |
249 | | #endif // Sk4px_DEFINED |