/src/skia/src/opts/SkBlitRow_opts.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright 2015 Google Inc. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license that can be |
5 | | * found in the LICENSE file. |
6 | | */ |
7 | | |
8 | | #ifndef SkBlitRow_opts_DEFINED |
9 | | #define SkBlitRow_opts_DEFINED |
10 | | |
11 | | #include "include/private/SkColorData.h" |
12 | | #include "include/private/SkVx.h" |
13 | | #include "src/core/SkMSAN.h" |
14 | | |
15 | | // Helpers for blit_row_s32a_opaque(), |
16 | | // then blit_row_s32a_opaque() itself, |
17 | | // then unrelated blit_row_color32() at the bottom. |
18 | | // |
19 | | // To keep Skia resistant to timing attacks, it's important not to branch on pixel data. |
20 | | // In particular, don't be tempted to [v]ptest, pmovmskb, etc. to branch on the source alpha. |
21 | | |
22 | | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX |
23 | | #include <immintrin.h> |
24 | | |
25 | | static inline __m512i SkPMSrcOver_SKX(const __m512i& src, const __m512i& dst) { |
26 | | // Srcover-blends the 16 32-bit pixels of src over dst; detailed explanations in SkPMSrcOver_AVX2. |
27 | | // b = s + ((d*(256-srcA)) >> 8) |
28 | | |
29 | | // Shuffle each pixel's srcA to the low byte of each 16-bit half of the pixel. The byte shuffle only looks at the low 4 bits of an index within its own 128-bit lane, so entries 19..63 behave like the 3/7/11/15 pattern repeated per lane. |
30 | | const uint8_t _ = -1; // 0xFF: a shuffle index with its high bit set produces a zero byte. |
31 | | const uint8_t mask[64] = { 3, _,3, _, 7, _,7, _, 11,_,11,_, 15,_,15,_, |
32 | | 19,_,19,_, 23,_,23,_, 27,_,27,_, 31,_,31,_, |
33 | | 35,_,35,_, 39,_,39,_, 43,_,43,_, 47,_,47,_, |
34 | | 51,_,51,_, 55,_,55,_, 59,_,59,_, 63,_,63,_ }; |
35 | | __m512i srcA_x2 = _mm512_shuffle_epi8(src, _mm512_loadu_si512(mask)); |
36 | | __m512i scale_x2 = _mm512_sub_epi16(_mm512_set1_epi16(256), |
37 | | srcA_x2); |
38 | | |
39 | | // Scale red and blue (bytes 0 and 2 of each pixel), leaving results in the low byte of each 16-bit lane. |
40 | | __m512i rb = _mm512_and_si512(_mm512_set1_epi32(0x00ff00ff), dst); |
41 | | rb = _mm512_mullo_epi16(rb, scale_x2); |
42 | | rb = _mm512_srli_epi16(rb, 8); |
43 | | |
44 | | // Scale green and alpha (bytes 1 and 3), leaving results in the high byte of each 16-bit lane and masking off the low byte. |
45 | | __m512i ga = _mm512_srli_epi16(dst, 8); |
46 | | ga = _mm512_mullo_epi16(ga, scale_x2); |
47 | | ga = _mm512_andnot_si512(_mm512_set1_epi32(0x00ff00ff), ga); |
48 | | |
49 | | return _mm512_add_epi32(src, _mm512_or_si512(rb, ga)); |
50 | | } |
51 | | #endif |
52 | | |
53 | | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 |
54 | | #include <immintrin.h> |
55 | | |
56 | 0 | static inline __m256i SkPMSrcOver_AVX2(const __m256i& src, const __m256i& dst) { |
57 | | // Abstractly srcover is |
58 | | // b = s + d*(1-srcA) |
59 | | // |
60 | | // In terms of unorm8 bytes, that works out to |
61 | | // b = s + (d*(255-srcA) + 127) / 255 |
62 | | // |
63 | | // But we approximate that to within a bit with |
64 | | // b = s + (d*(255-srcA) + d) / 256 |
65 | | // a.k.a |
66 | | // b = s + ((d*(256-srcA)) >> 8) |
67 | | |
68 | | // The bottleneck of this math is the multiply, and we want to do it as |
69 | | // narrowly as possible, here getting inputs into 16-bit lanes and |
70 | | // using 16-bit multiplies. We can do twice as many multiplies at once |
71 | | // as using naive 32-bit multiplies, and on top of that, the 16-bit multiplies |
72 | | // are themselves a couple cycles quicker. Win-win. |
73 | | |
74 | | // We'll get everything in 16-bit lanes for two multiplies, one |
75 | | // handling dst red and blue, the other green and alpha. (They're |
76 | | // conveniently 16-bits apart, you see.) We don't need the individual |
77 | | // src channels beyond alpha until the very end when we do the "s + " |
78 | | // add, and we don't even need to unpack them; the adds cannot overflow. |
79 | | |
80 | | // Shuffle each pixel's srcA to the low byte of each 16-bit half of the pixel. The index pattern is written twice because the byte shuffle works within each 128-bit lane. |
81 | 0 | const int _ = -1; // -1: a shuffle index with its high bit set produces a zero byte. |
82 | 0 | __m256i srcA_x2 = _mm256_shuffle_epi8(src, |
83 | 0 | _mm256_setr_epi8(3,_,3,_, 7,_,7,_, 11,_,11,_, 15,_,15,_, |
84 | 0 | 3,_,3,_, 7,_,7,_, 11,_,11,_, 15,_,15,_)); |
85 | 0 | __m256i scale_x2 = _mm256_sub_epi16(_mm256_set1_epi16(256), |
86 | 0 | srcA_x2); |
87 | | |
88 | | // Scale red and blue (bytes 0 and 2 of each pixel), leaving results in the low byte of each 16-bit lane. |
89 | 0 | __m256i rb = _mm256_and_si256(_mm256_set1_epi32(0x00ff00ff), dst); |
90 | 0 | rb = _mm256_mullo_epi16(rb, scale_x2); |
91 | 0 | rb = _mm256_srli_epi16 (rb, 8); |
92 | | |
93 | | // Scale green and alpha (bytes 1 and 3), leaving results in the high byte of each 16-bit lane and masking off the low byte. |
94 | 0 | __m256i ga = _mm256_srli_epi16(dst, 8); |
95 | 0 | ga = _mm256_mullo_epi16(ga, scale_x2); |
96 | 0 | ga = _mm256_andnot_si256(_mm256_set1_epi32(0x00ff00ff), ga); |
97 | |
|
98 | 0 | return _mm256_add_epi32(src, _mm256_or_si256(rb, ga)); |
99 | 0 | } |
100 | | #endif |
101 | | |
102 | | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
103 | | #include <immintrin.h> |
104 | | |
105 | 1.11G | static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) { | // Srcover for the 4 32-bit pixels in src/dst: per channel, src + ((dst * (256 - srcA)) >> 8).
106 | 1.11G | __m128i scale = _mm_sub_epi32(_mm_set1_epi32(256), | // scale = 256 - srcA, one value per 32-bit pixel (srcA is the top byte, hence the shift by 24).
107 | 1.11G | _mm_srli_epi32(src, 24)); |
108 | 1.11G | __m128i scale_x2 = _mm_or_si128(_mm_slli_epi32(scale, 16), scale); | // Copy the scale into both 16-bit halves of each pixel so one 16-bit multiply scales two channels.
109 | | |
110 | 1.11G | __m128i rb = _mm_and_si128(_mm_set1_epi32(0x00ff00ff), dst); | // Bytes 0 and 2 of each dst pixel, each alone in a 16-bit lane.
111 | 1.11G | rb = _mm_mullo_epi16(rb, scale_x2); |
112 | 1.11G | rb = _mm_srli_epi16(rb, 8); | // Scaled result ends up in the low byte of each 16-bit lane.
113 | | |
114 | 1.11G | __m128i ga = _mm_srli_epi16(dst, 8); | // Bytes 1 and 3 of each dst pixel, moved down to the low byte of each 16-bit lane.
115 | 1.11G | ga = _mm_mullo_epi16(ga, scale_x2); |
116 | 1.11G | ga = _mm_andnot_si128(_mm_set1_epi32(0x00ff00ff), ga); | // Keep only the high byte of each product, which is already in its final byte position.
117 | | |
118 | 1.11G | return _mm_add_epi32(src, _mm_or_si128(rb, ga)); |
119 | 1.11G | } SkOpts.cpp:SkPMSrcOver_SSE2(long long __vector(2) const&, long long __vector(2) const&) Line | Count | Source | 105 | 1.11G | static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) { | 106 | 1.11G | __m128i scale = _mm_sub_epi32(_mm_set1_epi32(256), | 107 | 1.11G | _mm_srli_epi32(src, 24)); | 108 | 1.11G | __m128i scale_x2 = _mm_or_si128(_mm_slli_epi32(scale, 16), scale); | 109 | | | 110 | 1.11G | __m128i rb = _mm_and_si128(_mm_set1_epi32(0x00ff00ff), dst); | 111 | 1.11G | rb = _mm_mullo_epi16(rb, scale_x2); | 112 | 1.11G | rb = _mm_srli_epi16(rb, 8); | 113 | | | 114 | 1.11G | __m128i ga = _mm_srli_epi16(dst, 8); | 115 | 1.11G | ga = _mm_mullo_epi16(ga, scale_x2); | 116 | 1.11G | ga = _mm_andnot_si128(_mm_set1_epi32(0x00ff00ff), ga); | 117 | | | 118 | 1.11G | return _mm_add_epi32(src, _mm_or_si128(rb, ga)); | 119 | 1.11G | } |
Unexecuted instantiation: SkOpts_hsw.cpp:SkPMSrcOver_SSE2(long long __vector(2) const&, long long __vector(2) const&) |
120 | | #endif |
121 | | |
122 | | #if defined(SK_ARM_HAS_NEON) |
123 | | #include <arm_neon.h> |
124 | | |
125 | | // SkMulDiv255Round() applied to each of the 8 byte lanes: a rounded x*y/255, computed on the widened 16-bit product as (prod + ((prod + 128) >> 8) + 128) >> 8. |
126 | | static inline uint8x8_t SkMulDiv255Round_neon8(uint8x8_t x, uint8x8_t y) { |
127 | | uint16x8_t prod = vmull_u8(x, y); | // Widening multiply: 8-bit lanes in, 16-bit products out.
128 | | return vraddhn_u16(prod, vrshrq_n_u16(prod, 8)); | // Rounding shift, then rounding add that keeps the high byte of each 16-bit sum.
129 | | } |
130 | | |
131 | | static inline uint8x8x4_t SkPMSrcOver_neon8(uint8x8x4_t dst, uint8x8x4_t src) { | // Srcover for 8 pixels held as 4 separate channel vectors. Note the argument order is (dst, src), unlike the x86 helpers above.
132 | | uint8x8_t nalphas = vmvn_u8(src.val[3]); // 255 - alpha (bitwise NOT of the alpha channel vector) |
133 | | return { |
134 | | vadd_u8(src.val[0], SkMulDiv255Round_neon8(nalphas, dst.val[0])), |
135 | | vadd_u8(src.val[1], SkMulDiv255Round_neon8(nalphas, dst.val[1])), |
136 | | vadd_u8(src.val[2], SkMulDiv255Round_neon8(nalphas, dst.val[2])), |
137 | | vadd_u8(src.val[3], SkMulDiv255Round_neon8(nalphas, dst.val[3])), |
138 | | }; |
139 | | } |
140 | | |
141 | | // Variant for two consecutive interleaved pixels: dst and src each hold the 8 color-component bytes of 2 pixels. Argument order is (dst, src). |
142 | | static inline uint8x8_t SkPMSrcOver_neon2(uint8x8_t dst, uint8x8_t src) { |
143 | | const uint8x8_t alpha_indices = vcreate_u8(0x0707070703030303); | // Table indices: lanes 0-3 select byte 3 (first pixel's alpha), lanes 4-7 select byte 7 (second pixel's alpha).
144 | | uint8x8_t nalphas = vmvn_u8(vtbl1_u8(src, alpha_indices)); | // Broadcast each pixel's alpha across its 4 bytes, then invert to get 255 - alpha.
145 | | return vadd_u8(src, SkMulDiv255Round_neon8(nalphas, dst)); |
146 | | } |
147 | | |
148 | | #endif |
149 | | |
150 | | namespace SK_OPTS_NS { |
151 | | |
152 | | /*not static*/ |
153 | 99.3M | inline void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) { | // Srcover-blends len pixels of src onto dst in place. Each SIMD loop that is compiled in consumes as many whole vectors as it can and leaves the remainder to the next, narrower path.
154 | 99.3M | SkASSERT(alpha == 0xFF); | // alpha is not used beyond this check.
155 | 99.3M | sk_msan_assert_initialized(src, src+len); |
156 | | |
157 | | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX |
158 | | while (len >= 16) { |
159 | | _mm512_storeu_si512((__m512*)dst, | // NOTE(review): cast is to __m512* while the loads below cast to __m512i*; TODO confirm this is harmless (the store intrinsic is documented as taking void*).
160 | | SkPMSrcOver_SKX(_mm512_loadu_si512((const __m512i*)src), |
161 | | _mm512_loadu_si512((const __m512i*)dst))); |
162 | | src += 16; |
163 | | dst += 16; |
164 | | len -= 16; |
165 | | } |
166 | | #endif |
167 | | |
168 | | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 |
169 | 0 | while (len >= 8) { |
170 | 0 | _mm256_storeu_si256((__m256i*)dst, |
171 | 0 | SkPMSrcOver_AVX2(_mm256_loadu_si256((const __m256i*)src), |
172 | 0 | _mm256_loadu_si256((const __m256i*)dst))); |
173 | 0 | src += 8; |
174 | 0 | dst += 8; |
175 | 0 | len -= 8; |
176 | 0 | } |
177 | | #endif |
178 | | |
179 | 99.3M | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
180 | 1.21G | while (len >= 4) { |
181 | 1.11G | _mm_storeu_si128((__m128i*)dst, SkPMSrcOver_SSE2(_mm_loadu_si128((const __m128i*)src), |
182 | 1.11G | _mm_loadu_si128((const __m128i*)dst))); |
183 | 1.11G | src += 4; |
184 | 1.11G | dst += 4; |
185 | 1.11G | len -= 4; |
186 | 1.11G | } |
187 | 99.3M | #endif |
188 | | |
189 | | #if defined(SK_ARM_HAS_NEON) |
190 | | while (len >= 8) { | // 8 pixels per iteration, de-interleaved into channel vectors by vld4_u8 and re-interleaved by vst4_u8.
191 | | vst4_u8((uint8_t*)dst, SkPMSrcOver_neon8(vld4_u8((const uint8_t*)dst), |
192 | | vld4_u8((const uint8_t*)src))); |
193 | | src += 8; |
194 | | dst += 8; |
195 | | len -= 8; |
196 | | } |
197 | | |
198 | | while (len >= 2) { | // 2 interleaved pixels (8 bytes) per iteration.
199 | | vst1_u8((uint8_t*)dst, SkPMSrcOver_neon2(vld1_u8((const uint8_t*)dst), |
200 | | vld1_u8((const uint8_t*)src))); |
201 | | src += 2; |
202 | | dst += 2; |
203 | | len -= 2; |
204 | | } |
205 | | |
206 | | if (len != 0) { | // One pixel left: widen it to a 64-bit vector, blend, and store only the low 32 bits.
207 | | uint8x8_t result = SkPMSrcOver_neon2(vcreate_u8((uint64_t)*dst), |
208 | | vcreate_u8((uint64_t)*src)); |
209 | | vst1_lane_u32(dst, vreinterpret_u32_u8(result), 0); |
210 | | } |
211 | | return; | // The NEON loops above consumed every pixel, so the scalar tail below is skipped.
212 | | #endif |
213 | | |
214 | 278M | while (len --> 0) { | // Scalar tail: whatever the SIMD loops left over (or everything when no SIMD path is compiled in).
215 | 178M | *dst = SkPMSrcOver(*src, *dst); |
216 | 178M | src++; |
217 | 178M | dst++; |
218 | 178M | } |
219 | 99.3M | } sse2::blit_row_s32a_opaque(unsigned int*, unsigned int const*, int, unsigned int) Line | Count | Source | 153 | 99.3M | inline void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) { | 154 | 99.3M | SkASSERT(alpha == 0xFF); | 155 | 99.3M | sk_msan_assert_initialized(src, src+len); | 156 | | | 157 | | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX | 158 | | while (len >= 16) { | 159 | | _mm512_storeu_si512((__m512*)dst, | 160 | | SkPMSrcOver_SKX(_mm512_loadu_si512((const __m512i*)src), | 161 | | _mm512_loadu_si512((const __m512i*)dst))); | 162 | | src += 16; | 163 | | dst += 16; | 164 | | len -= 16; | 165 | | } | 166 | | #endif | 167 | | | 168 | | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 | 169 | | while (len >= 8) { | 170 | | _mm256_storeu_si256((__m256i*)dst, | 171 | | SkPMSrcOver_AVX2(_mm256_loadu_si256((const __m256i*)src), | 172 | | _mm256_loadu_si256((const __m256i*)dst))); | 173 | | src += 8; | 174 | | dst += 8; | 175 | | len -= 8; | 176 | | } | 177 | | #endif | 178 | | | 179 | 99.3M | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | 180 | 1.21G | while (len >= 4) { | 181 | 1.11G | _mm_storeu_si128((__m128i*)dst, SkPMSrcOver_SSE2(_mm_loadu_si128((const __m128i*)src), | 182 | 1.11G | _mm_loadu_si128((const __m128i*)dst))); | 183 | 1.11G | src += 4; | 184 | 1.11G | dst += 4; | 185 | 1.11G | len -= 4; | 186 | 1.11G | } | 187 | 99.3M | #endif | 188 | | | 189 | | #if defined(SK_ARM_HAS_NEON) | 190 | | while (len >= 8) { | 191 | | vst4_u8((uint8_t*)dst, SkPMSrcOver_neon8(vld4_u8((const uint8_t*)dst), | 192 | | vld4_u8((const uint8_t*)src))); | 193 | | src += 8; | 194 | | dst += 8; | 195 | | len -= 8; | 196 | | } | 197 | | | 198 | | while (len >= 2) { | 199 | | vst1_u8((uint8_t*)dst, SkPMSrcOver_neon2(vld1_u8((const uint8_t*)dst), | 200 | | vld1_u8((const uint8_t*)src))); | 201 | | src += 2; | 202 | | dst += 2; | 203 | | len -= 2; | 204 | | } | 205 | | | 206 | | if (len != 0) { | 207 | | uint8x8_t result = 
SkPMSrcOver_neon2(vcreate_u8((uint64_t)*dst), | 208 | | vcreate_u8((uint64_t)*src)); | 209 | | vst1_lane_u32(dst, vreinterpret_u32_u8(result), 0); | 210 | | } | 211 | | return; | 212 | | #endif | 213 | | | 214 | 278M | while (len --> 0) { | 215 | 178M | *dst = SkPMSrcOver(*src, *dst); | 216 | 178M | src++; | 217 | 178M | dst++; | 218 | 178M | } | 219 | 99.3M | } |
Unexecuted instantiation: hsw::blit_row_s32a_opaque(unsigned int*, unsigned int const*, int, unsigned int) Unexecuted instantiation: hsw::blit_row_s32a_opaque(unsigned int*, unsigned int const*, int, unsigned int) |
220 | | |
221 | | // Blend constant color over count src pixels, writing into dst: per channel, dst = (src*invA + (color << 8) + 128) >> 8, where invA is 255 - alpha(color) rescaled to a 0..256 range. |
222 | | /*not static*/ |
223 | 1.45M | inline void blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor color) { |
224 | 1.45M | constexpr int N = 4; // pixels per vector step; 8, 16 also reasonable choices |
225 | 1.45M | using U32 = skvx::Vec< N, uint32_t>; |
226 | 1.45M | using U16 = skvx::Vec<4*N, uint16_t>; |
227 | 1.45M | using U8 = skvx::Vec<4*N, uint8_t>; |
228 | | |
229 | 3.47M | auto kernel = [color](U32 src) { | // Blends N pixels at once; invA depends only on color, yet is recomputed on each call.
230 | 3.47M | unsigned invA = 255 - SkGetPackedA32(color); |
231 | 3.47M | invA += invA >> 7; | // Adds 1 when invA >= 128, stretching 0..255 to 0..256.
232 | 3.47M | SkASSERT(0 < invA && invA < 256); // Excludes alpha == 255 (invA == 0) and alpha == 0 (invA == 256); the original note says those are handled specially (not visible in this function). |
233 | | |
234 | | // (src * invA + (color << 8) + 128) >> 8 |
235 | | // Should all fit in 16 bits. NOTE(review): the bound works out only if each channel of color is <= its alpha (premultiplied) - confirm callers guarantee this. |
236 | 3.47M | U8 s = skvx::bit_pun<U8>(src), |
237 | 3.47M | a = U8(invA); |
238 | 3.47M | U16 c = skvx::cast<uint16_t>(skvx::bit_pun<U8>(U32(color))), |
239 | 3.47M | d = (mull(s,a) + (c << 8) + 128)>>8; |
240 | 3.47M | return skvx::bit_pun<U32>(skvx::cast<uint8_t>(d)); |
241 | 3.47M | }; sse2::blit_row_color32(unsigned int*, unsigned int const*, int, unsigned int)::{lambda(skvx::Vec<4, unsigned int>)#1}::operator()(skvx::Vec<4, unsigned int>) const Line | Count | Source | 229 | 3.47M | auto kernel = [color](U32 src) { | 230 | 3.47M | unsigned invA = 255 - SkGetPackedA32(color); | 231 | 3.47M | invA += invA >> 7; | 232 | 3.47M | SkASSERT(0 < invA && invA < 256); // We handle alpha == 0 or alpha == 255 specially. | 233 | | | 234 | | // (src * invA + (color << 8) + 128) >> 8 | 235 | | // Should all fit in 16 bits. | 236 | 3.47M | U8 s = skvx::bit_pun<U8>(src), | 237 | 3.47M | a = U8(invA); | 238 | 3.47M | U16 c = skvx::cast<uint16_t>(skvx::bit_pun<U8>(U32(color))), | 239 | 3.47M | d = (mull(s,a) + (c << 8) + 128)>>8; | 240 | 3.47M | return skvx::bit_pun<U32>(skvx::cast<uint8_t>(d)); | 241 | 3.47M | }; |
Unexecuted instantiation: hsw::blit_row_color32(unsigned int*, unsigned int const*, int, unsigned int)::{lambda(skvx::Vec<4, unsigned int>)#1}::operator()(skvx::Vec<4, unsigned int>) const Unexecuted instantiation: hsw::blit_row_color32(unsigned int*, unsigned int const*, int, unsigned int)::{lambda(skvx::Vec<4, unsigned int>)#1}::operator()(skvx::Vec<4, unsigned int>) const |
242 | | |
243 | 3.43M | while (count >= N) { | // Whole vectors of N pixels.
244 | 1.98M | kernel(U32::Load(src)).store(dst); |
245 | 1.98M | src += N; |
246 | 1.98M | dst += N; |
247 | 1.98M | count -= N; |
248 | 1.98M | } |
249 | 2.94M | while (count --> 0) { | // Tail: run the same kernel on one pixel at a time and keep only lane 0.
250 | 1.48M | *dst++ = kernel(U32{*src++})[0]; |
251 | 1.48M | } |
252 | 1.45M | } sse2::blit_row_color32(unsigned int*, unsigned int const*, int, unsigned int) Line | Count | Source | 223 | 1.45M | inline void blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor color) { | 224 | 1.45M | constexpr int N = 4; // 8, 16 also reasonable choices | 225 | 1.45M | using U32 = skvx::Vec< N, uint32_t>; | 226 | 1.45M | using U16 = skvx::Vec<4*N, uint16_t>; | 227 | 1.45M | using U8 = skvx::Vec<4*N, uint8_t>; | 228 | | | 229 | 1.45M | auto kernel = [color](U32 src) { | 230 | 1.45M | unsigned invA = 255 - SkGetPackedA32(color); | 231 | 1.45M | invA += invA >> 7; | 232 | 1.45M | SkASSERT(0 < invA && invA < 256); // We handle alpha == 0 or alpha == 255 specially. | 233 | | | 234 | | // (src * invA + (color << 8) + 128) >> 8 | 235 | | // Should all fit in 16 bits. | 236 | 1.45M | U8 s = skvx::bit_pun<U8>(src), | 237 | 1.45M | a = U8(invA); | 238 | 1.45M | U16 c = skvx::cast<uint16_t>(skvx::bit_pun<U8>(U32(color))), | 239 | 1.45M | d = (mull(s,a) + (c << 8) + 128)>>8; | 240 | 1.45M | return skvx::bit_pun<U32>(skvx::cast<uint8_t>(d)); | 241 | 1.45M | }; | 242 | | | 243 | 3.43M | while (count >= N) { | 244 | 1.98M | kernel(U32::Load(src)).store(dst); | 245 | 1.98M | src += N; | 246 | 1.98M | dst += N; | 247 | 1.98M | count -= N; | 248 | 1.98M | } | 249 | 2.94M | while (count --> 0) { | 250 | 1.48M | *dst++ = kernel(U32{*src++})[0]; | 251 | 1.48M | } | 252 | 1.45M | } |
Unexecuted instantiation: hsw::blit_row_color32(unsigned int*, unsigned int const*, int, unsigned int) |
253 | | |
254 | | } // namespace SK_OPTS_NS |
255 | | |
256 | | #endif//SkBlitRow_opts_DEFINED |