/src/libwebp/src/dsp/alpha_processing_sse2.c
Line | Count | Source |
1 | | // Copyright 2014 Google Inc. All Rights Reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style license |
4 | | // that can be found in the COPYING file in the root of the source |
5 | | // tree. An additional intellectual property rights grant can be found |
6 | | // in the file PATENTS. All contributing project authors may |
7 | | // be found in the AUTHORS file in the root of the source tree. |
8 | | // ----------------------------------------------------------------------------- |
9 | | // |
10 | | // Utilities for processing transparent channel. |
11 | | // |
12 | | // Author: Skal (pascal.massimino@gmail.com) |
13 | | |
14 | | #include "src/dsp/dsp.h" |
15 | | |
16 | | #if defined(WEBP_USE_SSE2) |
17 | | #include <emmintrin.h> |
18 | | |
19 | | #include "src/dsp/cpu.h" |
20 | | #include "src/webp/types.h" |
21 | | |
22 | | //------------------------------------------------------------------------------ |
23 | | |
// Copies the 8-bit 'alpha' plane into the alpha channel (byte 0 of each
// 32-bit quadruplet) of the interleaved 'dst' buffer, leaving the other three
// bytes of every pixel untouched (masked stores). Returns non-zero if at
// least one alpha value differs from 0xff, i.e. the image has real
// transparency.
// NOTE(review): the caller must guarantee 3 accessible bytes past
// 'dst[4 * width - 4]' -- see the comment below.
static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha,
                              int alpha_stride, int width, int height,
                              uint8_t* WEBP_RESTRICT dst, int dst_stride) {
  // alpha_and stores an 'and' operation of all the alpha[] values. The final
  // value is not 0xff if any of the alpha[] is not equal to 0xff.
  uint32_t alpha_and = 0xff;
  int i, j;
  const __m128i zero = _mm_setzero_si128();
  const __m128i alpha_mask = _mm_set1_epi32((int)0xff);  // to preserve A
  const __m128i all_0xff = _mm_set1_epi8((char)0xff);
  __m128i all_alphas16 = all_0xff;  // 'and' accumulator for the 16-wide loop
  __m128i all_alphas8 = all_0xff;   // 'and' accumulator for the 8-wide step

  // We must be able to access 3 extra bytes after the last written byte
  // 'dst[4 * width - 4]', because we don't know if alpha is the first or the
  // last byte of the quadruplet.
  for (j = 0; j < height; ++j) {
    char* ptr = (char*)dst;
    // 'width - 1' (not 'width'): the last pixel is always left to the scalar
    // loop so the masked store never touches bytes past the row end.
    for (i = 0; i + 16 <= width - 1; i += 16) {
      // load 16 alpha bytes
      const __m128i a0 = _mm_loadu_si128((const __m128i*)&alpha[i]);
      // zero-extend each alpha byte into byte 0 of a 32-bit lane.
      const __m128i a1_lo = _mm_unpacklo_epi8(a0, zero);
      const __m128i a1_hi = _mm_unpackhi_epi8(a0, zero);
      const __m128i a2_lo_lo = _mm_unpacklo_epi16(a1_lo, zero);
      const __m128i a2_lo_hi = _mm_unpackhi_epi16(a1_lo, zero);
      const __m128i a2_hi_lo = _mm_unpacklo_epi16(a1_hi, zero);
      const __m128i a2_hi_hi = _mm_unpackhi_epi16(a1_hi, zero);
      // masked store: only byte 0 of each 32-bit pixel is written.
      _mm_maskmoveu_si128(a2_lo_lo, alpha_mask, ptr + 0);
      _mm_maskmoveu_si128(a2_lo_hi, alpha_mask, ptr + 16);
      _mm_maskmoveu_si128(a2_hi_lo, alpha_mask, ptr + 32);
      _mm_maskmoveu_si128(a2_hi_hi, alpha_mask, ptr + 48);
      // accumulate 16 alpha 'and' in parallel
      all_alphas16 = _mm_and_si128(all_alphas16, a0);
      ptr += 64;
    }
    if (i + 8 <= width - 1) {
      // load 8 alpha bytes
      const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[i]);
      const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
      const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
      const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
      _mm_maskmoveu_si128(a2_lo, alpha_mask, ptr);
      _mm_maskmoveu_si128(a2_hi, alpha_mask, ptr + 16);
      // accumulate 8 alpha 'and' in parallel
      all_alphas8 = _mm_and_si128(all_alphas8, a0);
      i += 8;
    }
    // Scalar leftovers (always at least the last pixel of the row).
    for (; i < width; ++i) {
      const uint32_t alpha_value = alpha[i];
      dst[4 * i] = alpha_value;
      alpha_and &= alpha_value;
    }
    alpha += alpha_stride;
    dst += dst_stride;
  }
  // Combine the eight alpha 'and' into a 8-bit mask. Only the low 8 bytes of
  // 'all_alphas8' carry data (_mm_loadl_epi64 zeroes the upper half), hence
  // the '& 0xff'.
  alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas8, all_0xff)) & 0xff;
  return (alpha_and != 0xff ||
          _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas16, all_0xff)) != 0xffff);
}
84 | | |
85 | | static void DispatchAlphaToGreen_SSE2(const uint8_t* WEBP_RESTRICT alpha, |
86 | | int alpha_stride, int width, int height, |
87 | | uint32_t* WEBP_RESTRICT dst, |
88 | 21.8k | int dst_stride) { |
89 | 21.8k | int i, j; |
90 | 21.8k | const __m128i zero = _mm_setzero_si128(); |
91 | 21.8k | const int limit = width & ~15; |
92 | 2.29M | for (j = 0; j < height; ++j) { |
93 | 8.25M | for (i = 0; i < limit; i += 16) { // process 16 alpha bytes |
94 | 5.98M | const __m128i a0 = _mm_loadu_si128((const __m128i*)&alpha[i]); |
95 | 5.98M | const __m128i a1 = _mm_unpacklo_epi8(zero, a0); // note the 'zero' first! |
96 | 5.98M | const __m128i b1 = _mm_unpackhi_epi8(zero, a0); |
97 | 5.98M | const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero); |
98 | 5.98M | const __m128i b2_lo = _mm_unpacklo_epi16(b1, zero); |
99 | 5.98M | const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero); |
100 | 5.98M | const __m128i b2_hi = _mm_unpackhi_epi16(b1, zero); |
101 | 5.98M | _mm_storeu_si128((__m128i*)&dst[i + 0], a2_lo); |
102 | 5.98M | _mm_storeu_si128((__m128i*)&dst[i + 4], a2_hi); |
103 | 5.98M | _mm_storeu_si128((__m128i*)&dst[i + 8], b2_lo); |
104 | 5.98M | _mm_storeu_si128((__m128i*)&dst[i + 12], b2_hi); |
105 | 5.98M | } |
106 | 13.7M | for (; i < width; ++i) dst[i] = alpha[i] << 8; |
107 | 2.26M | alpha += alpha_stride; |
108 | 2.26M | dst += dst_stride; |
109 | 2.26M | } |
110 | 21.8k | } |
111 | | |
// Extracts the alpha channel (byte 0 of each 32-bit pixel) of 'argb' into the
// 8-bit 'alpha' plane. Returns non-zero iff *all* alpha values equal 0xff
// (fully opaque) -- note this is the opposite polarity of
// DispatchAlpha_SSE2's return value.
static int ExtractAlpha_SSE2(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
                             int width, int height,
                             uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
  // alpha_and stores an 'and' operation of all the alpha[] values. The final
  // value is not 0xff if any of the alpha[] is not equal to 0xff.
  uint32_t alpha_and = 0xff;
  int i, j;
  const __m128i a_mask = _mm_set1_epi32(0xff);  // to preserve alpha
  // Only the low 8 bytes are all-ones: the SIMD loop below produces 8 valid
  // alpha bytes per iteration, so only those lanes take part in the compare.
  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0, ~0);
  __m128i all_alphas = all_0xff;

  // We must be able to access 3 extra bytes after the last written byte
  // 'src[4 * width - 4]', because we don't know if alpha is the first or the
  // last byte of the quadruplet.
  const int limit = (width - 1) & ~7;

  for (j = 0; j < height; ++j) {
    const __m128i* src = (const __m128i*)argb;
    for (i = 0; i < limit; i += 8) {
      // load 32 argb bytes
      const __m128i a0 = _mm_loadu_si128(src + 0);
      const __m128i a1 = _mm_loadu_si128(src + 1);
      // keep byte 0 of every pixel, then narrow 8x32-bit down to 8x8-bit.
      const __m128i b0 = _mm_and_si128(a0, a_mask);
      const __m128i b1 = _mm_and_si128(a1, a_mask);
      const __m128i c0 = _mm_packs_epi32(b0, b1);
      const __m128i d0 = _mm_packus_epi16(c0, c0);
      // store
      _mm_storel_epi64((__m128i*)&alpha[i], d0);
      // accumulate eight alpha 'and' in parallel
      all_alphas = _mm_and_si128(all_alphas, d0);
      src += 2;
    }
    // Scalar leftovers (always at least the last pixel of the row).
    for (; i < width; ++i) {
      const uint32_t alpha_value = argb[4 * i];
      alpha[i] = alpha_value;
      alpha_and &= alpha_value;
    }
    argb += argb_stride;
    alpha += alpha_stride;
  }
  // Combine the eight alpha 'and' into a 8-bit mask.
  alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
  return (alpha_and == 0xff);
}
156 | | |
157 | | static void ExtractGreen_SSE2(const uint32_t* WEBP_RESTRICT argb, |
158 | 7.59k | uint8_t* WEBP_RESTRICT alpha, int size) { |
159 | 7.59k | int i; |
160 | 7.59k | const __m128i mask = _mm_set1_epi32(0xff); |
161 | 7.59k | const __m128i* src = (const __m128i*)argb; |
162 | | |
163 | 291k | for (i = 0; i + 16 <= size; i += 16, src += 4) { |
164 | 284k | const __m128i a0 = _mm_loadu_si128(src + 0); |
165 | 284k | const __m128i a1 = _mm_loadu_si128(src + 1); |
166 | 284k | const __m128i a2 = _mm_loadu_si128(src + 2); |
167 | 284k | const __m128i a3 = _mm_loadu_si128(src + 3); |
168 | 284k | const __m128i b0 = _mm_srli_epi32(a0, 8); |
169 | 284k | const __m128i b1 = _mm_srli_epi32(a1, 8); |
170 | 284k | const __m128i b2 = _mm_srli_epi32(a2, 8); |
171 | 284k | const __m128i b3 = _mm_srli_epi32(a3, 8); |
172 | 284k | const __m128i c0 = _mm_and_si128(b0, mask); |
173 | 284k | const __m128i c1 = _mm_and_si128(b1, mask); |
174 | 284k | const __m128i c2 = _mm_and_si128(b2, mask); |
175 | 284k | const __m128i c3 = _mm_and_si128(b3, mask); |
176 | 284k | const __m128i d0 = _mm_packs_epi32(c0, c1); |
177 | 284k | const __m128i d1 = _mm_packs_epi32(c2, c3); |
178 | 284k | const __m128i e = _mm_packus_epi16(d0, d1); |
179 | | // store |
180 | 284k | _mm_storeu_si128((__m128i*)&alpha[i], e); |
181 | 284k | } |
182 | 7.59k | if (i + 8 <= size) { |
183 | 915 | const __m128i a0 = _mm_loadu_si128(src + 0); |
184 | 915 | const __m128i a1 = _mm_loadu_si128(src + 1); |
185 | 915 | const __m128i b0 = _mm_srli_epi32(a0, 8); |
186 | 915 | const __m128i b1 = _mm_srli_epi32(a1, 8); |
187 | 915 | const __m128i c0 = _mm_and_si128(b0, mask); |
188 | 915 | const __m128i c1 = _mm_and_si128(b1, mask); |
189 | 915 | const __m128i d = _mm_packs_epi32(c0, c1); |
190 | 915 | const __m128i e = _mm_packus_epi16(d, d); |
191 | 915 | _mm_storel_epi64((__m128i*)&alpha[i], e); |
192 | 915 | i += 8; |
193 | 915 | } |
194 | 13.9k | for (; i < size; ++i) alpha[i] = argb[i] >> 8; |
195 | 7.59k | } |
196 | | |
197 | | //------------------------------------------------------------------------------ |
198 | | // Non-dither premultiplied modes |
199 | | |
// Fixed-point 1/255 helpers: v / 255 == (v * 0x8081) >> 23 for 16-bit v.
#define MULTIPLIER(a) ((a) * 0x8081)
#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)

// We can't use a 'const int' for the SHUFFLE value, because it has to be an
// immediate in the _mm_shufflexx_epi16() instruction. We really need a macro.
// We use: v / 255 = (v * 0x8081) >> 23, where v = alpha * {r,g,b} is a 16bit
// value.
#define APPLY_ALPHA(RGBX, SHUFFLE)                                         \
  do {                                                                     \
    const __m128i px = _mm_loadu_si128((const __m128i*)&(RGBX));           \
    const __m128i px_lo = _mm_unpacklo_epi8(px, zero);                     \
    const __m128i px_hi = _mm_unpackhi_epi8(px, zero);                     \
    /* OR in 0xff at the lanes the shuffle moves to the alpha slot, */     \
    /* so alpha ends up multiplied by 0xff (i.e. preserved). */            \
    const __m128i m_lo = _mm_or_si128(px_lo, kMask);                       \
    const __m128i m_hi = _mm_or_si128(px_hi, kMask);                       \
    /* broadcast alpha: [ff a0 a0 a0][ff a1 a1 a1] */                      \
    const __m128i s_lo =                                                   \
        _mm_shufflehi_epi16(_mm_shufflelo_epi16(m_lo, SHUFFLE), SHUFFLE);  \
    const __m128i s_hi =                                                   \
        _mm_shufflehi_epi16(_mm_shufflelo_epi16(m_hi, SHUFFLE), SHUFFLE);  \
    /* (a * v * 0x8081) >> 23, i.e. (a * v) / 255 */                       \
    const __m128i mul_lo = _mm_mullo_epi16(s_lo, px_lo);                   \
    const __m128i mul_hi = _mm_mullo_epi16(s_hi, px_hi);                   \
    const __m128i div_lo = _mm_srli_epi16(_mm_mulhi_epu16(mul_lo, kMult), 7); \
    const __m128i div_hi = _mm_srli_epi16(_mm_mulhi_epu16(mul_hi, kMult), 7); \
    _mm_storeu_si128((__m128i*)&(RGBX), _mm_packus_epi16(div_lo, div_hi)); \
  } while (0)

// Premultiplies every {r,g,b} byte by its pixel's alpha, in place:
// rgb = rgb * a / 255; the alpha byte itself is left unchanged.
// 'alpha_first' selects Argb layout (alpha at byte 0) vs rgbA (byte 3).
static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first, int w,
                                    int h, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i kMult = _mm_set1_epi16((short)0x8081);  // ~ (1 << 23) / 255
  const __m128i kMask = _mm_set_epi16(0, 0xff, 0xff, 0, 0, 0xff, 0xff, 0);
  const int kSpan = 4;  // pixels per SIMD iteration
  while (h-- > 0) {
    uint32_t* const rgbx = (uint32_t*)rgba;
    int i;
    if (!alpha_first) {
      for (i = 0; i + kSpan <= w; i += kSpan) {
        APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(2, 3, 3, 3));
      }
    } else {
      for (i = 0; i + kSpan <= w; i += kSpan) {
        APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 0, 0, 1));
      }
    }
    // Finish with left-overs, in plain C.
    {
      uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
      const uint8_t* const aptr = rgba + (alpha_first ? 0 : 3);
      for (; i < w; ++i) {
        const uint32_t a = aptr[4 * i];
        if (a != 0xff) {
          const uint32_t mult = MULTIPLIER(a);
          rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
          rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
          rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
        }
      }
    }
    rgba += stride;
  }
}
#undef MULTIPLIER
#undef PREMULTIPLY
264 | | |
265 | | //------------------------------------------------------------------------------ |
266 | | // Alpha detection |
267 | | |
// Returns 1 as soon as one of the 'length' bytes of 'src' differs from 0xff,
// 0 if they are all 0xff (fully opaque alpha plane).
static int HasAlpha8b_SSE2(const uint8_t* src, int length) {
  const __m128i all_opaque = _mm_set1_epi8((char)0xff);
  int pos = 0;
  // Compare 16 bytes at a time against 0xff.
  while (pos + 16 <= length) {
    const __m128i chunk = _mm_loadu_si128((const __m128i*)(src + pos));
    const int eq_mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, all_opaque));
    if (eq_mask != 0xffff) return 1;  // at least one byte is not 0xff
    pos += 16;
  }
  // Scalar tail.
  while (pos < length) {
    if (src[pos] != 0xff) return 1;
    ++pos;
  }
  return 0;
}
282 | | |
// Returns 1 if any alpha value among the 'length' 32-bit pixels starting at
// 'src' is not 0xff, 0 otherwise. 'src' points at the first alpha byte, and
// alpha occupies byte 0 of each quadruplet.
static int HasAlpha32b_SSE2(const uint8_t* src, int length) {
  const __m128i alpha_mask = _mm_set1_epi32(0xff);
  const __m128i all_opaque = _mm_set1_epi8((char)0xff);
  int pos = 0;
  // We don't know if we can access the last 3 bytes after the last alpha
  // value 'src[4 * length - 4]' (because we don't know if alpha is the first
  // or the last byte of the quadruplet). Hence the '-3' protection below.
  length = length * 4 - 3;  // size in bytes
  // 16 pixels (64 bytes) per iteration.
  while (pos + 64 <= length) {
    const __m128i q0 = _mm_loadu_si128((const __m128i*)(src + pos + 0));
    const __m128i q1 = _mm_loadu_si128((const __m128i*)(src + pos + 16));
    const __m128i q2 = _mm_loadu_si128((const __m128i*)(src + pos + 32));
    const __m128i q3 = _mm_loadu_si128((const __m128i*)(src + pos + 48));
    // Isolate the alpha bytes, then narrow the 16 dwords down to 16 bytes.
    const __m128i n0 = _mm_packs_epi32(_mm_and_si128(q0, alpha_mask),
                                       _mm_and_si128(q1, alpha_mask));
    const __m128i n1 = _mm_packs_epi32(_mm_and_si128(q2, alpha_mask),
                                       _mm_and_si128(q3, alpha_mask));
    const __m128i alphas = _mm_packus_epi16(n0, n1);
    if (_mm_movemask_epi8(_mm_cmpeq_epi8(alphas, all_opaque)) != 0xffff) {
      return 1;
    }
    pos += 64;
  }
  // 8 pixels (32 bytes) per iteration.
  while (pos + 32 <= length) {
    const __m128i q0 = _mm_loadu_si128((const __m128i*)(src + pos + 0));
    const __m128i q1 = _mm_loadu_si128((const __m128i*)(src + pos + 16));
    const __m128i n = _mm_packs_epi32(_mm_and_si128(q0, alpha_mask),
                                      _mm_and_si128(q1, alpha_mask));
    const __m128i alphas = _mm_packus_epi16(n, n);
    if (_mm_movemask_epi8(_mm_cmpeq_epi8(alphas, all_opaque)) != 0xffff) {
      return 1;
    }
    pos += 32;
  }
  // Remaining alpha bytes, one pixel at a time ('<=' because 'length' was
  // shortened by 3 bytes above).
  for (; pos <= length; pos += 4) {
    if (src[pos] != 0xff) return 1;
  }
  return 0;
}
323 | | |
// Replaces every fully-transparent pixel (alpha == 0, alpha being the top
// byte of the 32-bit value) of 'src' with 'color', in place.
static void AlphaReplace_SSE2(uint32_t* src, int length, uint32_t color) {
  const __m128i fill = _mm_set1_epi32((int)color);
  const __m128i zero = _mm_setzero_si128();
  int pos = 0;
  // 8 pixels per iteration: build a per-pixel 'alpha == 0' mask and blend
  // 'color' in where it is set.
  while (pos + 8 <= length) {
    const __m128i p0 = _mm_loadu_si128((const __m128i*)(src + pos + 0));
    const __m128i p1 = _mm_loadu_si128((const __m128i*)(src + pos + 4));
    // The arithmetic shift sign-extends the alpha byte over the whole lane;
    // the result is zero exactly when alpha itself is zero.
    const __m128i is_clear0 = _mm_cmpeq_epi32(_mm_srai_epi32(p0, 24), zero);
    const __m128i is_clear1 = _mm_cmpeq_epi32(_mm_srai_epi32(p1, 24), zero);
    const __m128i blended0 = _mm_or_si128(_mm_and_si128(is_clear0, fill),
                                          _mm_andnot_si128(is_clear0, p0));
    const __m128i blended1 = _mm_or_si128(_mm_and_si128(is_clear1, fill),
                                          _mm_andnot_si128(is_clear1, p1));
    _mm_storeu_si128((__m128i*)(src + pos + 0), blended0);
    _mm_storeu_si128((__m128i*)(src + pos + 4), blended1);
    pos += 8;
  }
  // Scalar tail.
  for (; pos < length; ++pos) {
    if ((src[pos] >> 24) == 0) src[pos] = color;
  }
}
346 | | |
347 | | // ----------------------------------------------------------------------------- |
348 | | // Apply alpha value to rows |
349 | | |
// Multiplies a row of 'width' ARGB pixels by their own alpha value, in place
// (premultiplication). The SIMD path only handles 'inverse == 0', two pixels
// per iteration; the tail pixels -- and the whole 'inverse' (division) mode
// -- are delegated to the C fallback WebPMultARGBRow_C().
static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const int kSpan = 2;
    const __m128i zero = _mm_setzero_si128();
    const __m128i k128 = _mm_set1_epi16(128);
    const __m128i kMult = _mm_set1_epi16(0x0101);
    // 0xff is OR'ed into the lanes that the shuffles below move into the
    // alpha position, so alpha is multiplied by 0xff and thus preserved.
    const __m128i kMask = _mm_set_epi16(0, 0xff, 0, 0, 0, 0xff, 0, 0);
    for (x = 0; x + kSpan <= width; x += kSpan) {
      // To compute 'result = (int)(a * x / 255. + .5)', we use:
      //   tmp = a * v + 128, result = (tmp * 0x0101u) >> 16
      const __m128i A0 = _mm_loadl_epi64((const __m128i*)&ptr[x]);
      const __m128i A1 = _mm_unpacklo_epi8(A0, zero);  // 8 bytes -> 8 words
      const __m128i A2 = _mm_or_si128(A1, kMask);
      const __m128i A3 = _mm_shufflelo_epi16(A2, _MM_SHUFFLE(2, 3, 3, 3));
      const __m128i A4 = _mm_shufflehi_epi16(A3, _MM_SHUFFLE(2, 3, 3, 3));
      // here, A4 = [ff a0 a0 a0][ff a1 a1 a1]
      const __m128i A5 = _mm_mullo_epi16(A4, A1);
      const __m128i A6 = _mm_add_epi16(A5, k128);
      const __m128i A7 = _mm_mulhi_epu16(A6, kMult);
      const __m128i A10 = _mm_packus_epi16(A7, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], A10);
    }
  }
  width -= x;
  // C fallback for the leftover pixels and the 'inverse' mode.
  if (width > 0) WebPMultARGBRow_C(ptr + x, width, inverse);
}
377 | | |
// Multiplies each byte of the plane row 'ptr' by the matching alpha value:
// ptr[x] = round(ptr[x] * alpha[x] / 255), 8 bytes per SIMD iteration.
// The 'inverse' (division) mode and the non-multiple-of-8 tail are delegated
// to the C fallback WebPMultRow_C().
static void MultRow_SSE2(uint8_t* WEBP_RESTRICT const ptr,
                         const uint8_t* WEBP_RESTRICT const alpha, int width,
                         int inverse) {
  int x = 0;
  if (!inverse) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i k128 = _mm_set1_epi16(128);
    const __m128i kMult = _mm_set1_epi16(0x0101);
    for (x = 0; x + 8 <= width; x += 8) {
      // rounded v * a / 255 == ((v * a + 128) * 0x0101) >> 16
      const __m128i v0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
      const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[x]);
      const __m128i v1 = _mm_unpacklo_epi8(v0, zero);  // bytes -> words
      const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
      const __m128i v2 = _mm_mullo_epi16(v1, a1);
      const __m128i v3 = _mm_add_epi16(v2, k128);
      const __m128i v4 = _mm_mulhi_epu16(v3, kMult);
      const __m128i v5 = _mm_packus_epi16(v4, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], v5);
    }
  }
  width -= x;
  // C fallback for the leftover bytes and the 'inverse' mode.
  if (width > 0) WebPMultRow_C(ptr + x, alpha + x, width, inverse);
}
401 | | |
402 | | //------------------------------------------------------------------------------ |
403 | | // Entry point |
404 | | |
405 | | extern void WebPInitAlphaProcessingSSE2(void); |
406 | | |
// Installs the SSE2 implementations above into the global function pointers
// declared elsewhere (dsp.h). Invoked by the alpha-processing init code once
// SSE2 support has been detected.
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
  WebPMultARGBRow = MultARGBRow_SSE2;
  WebPMultRow = MultRow_SSE2;
  WebPApplyAlphaMultiply = ApplyAlphaMultiply_SSE2;
  WebPDispatchAlpha = DispatchAlpha_SSE2;
  WebPDispatchAlphaToGreen = DispatchAlphaToGreen_SSE2;
  WebPExtractAlpha = ExtractAlpha_SSE2;
  WebPExtractGreen = ExtractGreen_SSE2;

  WebPHasAlpha8b = HasAlpha8b_SSE2;
  WebPHasAlpha32b = HasAlpha32b_SSE2;
  WebPAlphaReplace = AlphaReplace_SSE2;
}
420 | | |
421 | | #else // !WEBP_USE_SSE2 |
422 | | |
423 | | WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingSSE2) |
424 | | |
425 | | #endif // WEBP_USE_SSE2 |