/src/libwebp/src/dsp/lossless_enc_sse2.c
Line | Count | Source |
1 | | // Copyright 2015 Google Inc. All Rights Reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style license |
4 | | // that can be found in the COPYING file in the root of the source |
5 | | // tree. An additional intellectual property rights grant can be found |
6 | | // in the file PATENTS. All contributing project authors may |
7 | | // be found in the AUTHORS file in the root of the source tree. |
8 | | // ----------------------------------------------------------------------------- |
9 | | // |
10 | | // SSE2 variant of methods for lossless encoder |
11 | | // |
12 | | // Author: Skal (pascal.massimino@gmail.com) |
13 | | |
14 | | #include "src/dsp/dsp.h" |
15 | | |
16 | | #if defined(WEBP_USE_SSE2) |
17 | | #include <assert.h> |
18 | | #include <emmintrin.h> |
19 | | #include <string.h> |
20 | | |
21 | | #include "src/dsp/cpu.h" |
22 | | #include "src/dsp/lossless.h" |
23 | | #include "src/dsp/lossless_common.h" |
24 | | #include "src/utils/utils.h" |
25 | | #include "src/webp/format_constants.h" |
26 | | #include "src/webp/types.h" |
27 | | |
28 | | // For sign-extended multiplying constants, pre-shifted by 5: |
29 | | #define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5) |
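
The constant trick above pairs with _mm_mulhi_epi16 in the kernels below: CST_5b(m) equals the sign-extended multiplier times 8, and with the channel byte pre-positioned in the high half of a 16-bit lane, the high 16 bits of the signed product reproduce the transform's (m * c) >> 5 delta exactly. A minimal scalar check of that identity, assuming arithmetic right shifts of negatives (Mulhi16 and CheckCst5b are illustrative helpers, not part of the source):

#include <assert.h>
#include <stdint.h>

// What _mm_mulhi_epi16 computes per lane: the high 16 bits of the
// signed 16x16 product.
static int16_t Mulhi16(int16_t a, int16_t b) {
  return (int16_t)(((int32_t)a * b) >> 16);
}

static void CheckCst5b(int8_t m, int8_t c) {
  // CST_5b(m) == (int8_t)m * 8 after the shift pair.
  const int16_t cst = (int16_t)((uint16_t)(uint8_t)m << 8) >> 5;
  // Channel byte pre-positioned in the high half of the lane, as in the
  // kernels below (the lane value is c * 256 as a signed 16-bit integer).
  const int16_t lane = (int16_t)((uint16_t)(uint8_t)c << 8);
  // The scalar delta, as in ColorTransformDelta: ((int)m * c) >> 5.
  const int16_t delta = (int16_t)(((int)m * c) >> 5);
  assert(Mulhi16(cst, lane) == delta);
}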
30 | | |
31 | | //------------------------------------------------------------------------------ |
32 | | // Subtract-Green Transform |
33 | | |
34 | | static void SubtractGreenFromBlueAndRed_SSE2(uint32_t* argb_data, |
35 | 0 | int num_pixels) { |
36 | 0 | int i; |
37 | 0 | for (i = 0; i + 4 <= num_pixels; i += 4) { |
38 | 0 | const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb |
39 | 0 | const __m128i A = _mm_srli_epi16(in, 8); // 0 a 0 g |
40 | 0 | const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0)); |
41 | 0 | const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // 0g0g |
42 | 0 | const __m128i out = _mm_sub_epi8(in, C); |
43 | 0 | _mm_storeu_si128((__m128i*)&argb_data[i], out); |
44 | 0 | } |
45 | | // fallthrough and finish off with plain-C |
46 | 0 | if (i != num_pixels) { |
47 | 0 | VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i); |
48 | 0 | } |
49 | 0 | } |
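
For reference, the shuffle sequence above computes, per pixel, the same thing as the plain-C fallback it dispatches to: green is broadcast into the red and blue byte lanes and subtracted modulo 256. A scalar sketch (SubtractGreenScalar is an illustrative name, not the library's):

#include <stdint.h>

static void SubtractGreenScalar(uint32_t* argb_data, int num_pixels) {
  int i;
  for (i = 0; i < num_pixels; ++i) {
    const uint32_t argb = argb_data[i];
    const uint32_t green = (argb >> 8) & 0xff;
    const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
    const uint32_t new_b = ((argb & 0xff) - green) & 0xff;
    // Alpha and green are untouched; red and blue wrap modulo 256,
    // matching the byte-wise _mm_sub_epi8 above.
    argb_data[i] = (argb & 0xff00ff00u) | (new_r << 16) | new_b;
  }
}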
50 | | |
51 | | //------------------------------------------------------------------------------ |
52 | | // Color Transform |
53 | | |
54 | | #define MK_CST_16(HI, LO) \ |
55 | 0 | _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff))) |
56 | | |
57 | | static void TransformColor_SSE2(const VP8LMultipliers* WEBP_RESTRICT const m, |
58 | | uint32_t* WEBP_RESTRICT argb_data, |
59 | 0 | int num_pixels) { |
60 | 0 | const __m128i mults_rb = |
61 | 0 | MK_CST_16(CST_5b(m->green_to_red), CST_5b(m->green_to_blue)); |
62 | 0 | const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue), 0); |
63 | 0 | const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00); // alpha-green masks |
64 | 0 | const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff); // red-blue masks |
65 | 0 | int i; |
66 | 0 | for (i = 0; i + 4 <= num_pixels; i += 4) { |
67 | 0 | const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb |
68 | 0 | const __m128i A = _mm_and_si128(in, mask_ag); // a 0 g 0 |
69 | 0 | const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0)); |
70 | 0 | const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // g0g0 |
71 | 0 | const __m128i D = _mm_mulhi_epi16(C, mults_rb); // x dr x db1 |
72 | 0 | const __m128i E = _mm_slli_epi16(in, 8); // r 0 b 0 |
73 | 0 | const __m128i F = _mm_mulhi_epi16(E, mults_b2); // x db2 0 0 |
74 | 0 | const __m128i G = _mm_srli_epi32(F, 16); // 0 0 x db2 |
75 | 0 | const __m128i H = _mm_add_epi8(G, D); // x dr x db |
76 | 0 | const __m128i I = _mm_and_si128(H, mask_rb); // 0 dr 0 db |
77 | 0 | const __m128i out = _mm_sub_epi8(in, I); |
78 | 0 | _mm_storeu_si128((__m128i*)&argb_data[i], out); |
79 | 0 | } |
80 | | // fallthrough and finish off with plain-C |
81 | 0 | if (i != num_pixels) { |
82 | 0 | VP8LTransformColor_C(m, argb_data + i, num_pixels - i); |
83 | 0 | } |
84 | 0 | } |
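
Per pixel, the kernel above decorrelates red and blue from green, and blue additionally from the original (untransformed) red, with every delta computed as (multiplier * channel) >> 5 on signed 8-bit values. A scalar sketch of the same transform (TransformColorScalar and ColorDelta are illustrative helpers; the real fallback is VP8LTransformColor_C):

#include <stdint.h>

// Deltas use signed 8-bit channel values, as in the library's
// ColorTransformDelta: ((int)pred * color) >> 5.
static int ColorDelta(int8_t pred, int8_t color) {
  return ((int)pred * color) >> 5;
}

static void TransformColorScalar(uint8_t green_to_red, uint8_t green_to_blue,
                                 uint8_t red_to_blue, uint32_t* argb_data,
                                 int num_pixels) {
  int i;
  for (i = 0; i < num_pixels; ++i) {
    const uint32_t argb = argb_data[i];
    const int8_t green = (int8_t)(argb >> 8);
    const int8_t red = (int8_t)(argb >> 16);
    int new_red = (int)((argb >> 16) & 0xff);
    int new_blue = (int)(argb & 0xff);
    new_red -= ColorDelta((int8_t)green_to_red, green);
    new_blue -= ColorDelta((int8_t)green_to_blue, green);
    new_blue -= ColorDelta((int8_t)red_to_blue, red);  // original red
    argb_data[i] = (argb & 0xff00ff00u) |
                   ((uint32_t)(new_red & 0xff) << 16) |
                   (uint32_t)(new_blue & 0xff);
  }
}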
85 | | |
86 | | //------------------------------------------------------------------------------ |
87 | 0 | #define SPAN 8 |
88 | | static void CollectColorBlueTransforms_SSE2(const uint32_t* WEBP_RESTRICT argb, |
89 | | int stride, int tile_width, |
90 | | int tile_height, int green_to_blue, |
91 | 0 | int red_to_blue, uint32_t histo[]) { |
92 | 0 | const __m128i mults_r = MK_CST_16(CST_5b(red_to_blue), 0); |
93 | 0 | const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_blue)); |
94 | 0 | const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask |
95 | 0 | const __m128i mask_b = _mm_set1_epi32(0x0000ff); // blue mask |
96 | 0 | int y; |
97 | 0 | for (y = 0; y < tile_height; ++y) { |
98 | 0 | const uint32_t* const src = argb + y * stride; |
99 | 0 | int i, x; |
100 | 0 | for (x = 0; x + SPAN <= tile_width; x += SPAN) { |
101 | 0 | uint16_t values[SPAN]; |
102 | 0 | const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]); |
103 | 0 | const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]); |
104 | 0 | const __m128i A0 = _mm_slli_epi16(in0, 8); // r 0 | b 0 |
105 | 0 | const __m128i A1 = _mm_slli_epi16(in1, 8); |
106 | 0 | const __m128i B0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0 |
107 | 0 | const __m128i B1 = _mm_and_si128(in1, mask_g); |
108 | 0 | const __m128i C0 = _mm_mulhi_epi16(A0, mults_r); // x db | 0 0 |
109 | 0 | const __m128i C1 = _mm_mulhi_epi16(A1, mults_r); |
110 | 0 | const __m128i D0 = _mm_mulhi_epi16(B0, mults_g); // 0 0 | x db |
111 | 0 | const __m128i D1 = _mm_mulhi_epi16(B1, mults_g); |
112 | 0 | const __m128i E0 = _mm_sub_epi8(in0, D0); // x x | x b' |
113 | 0 | const __m128i E1 = _mm_sub_epi8(in1, D1); |
114 | 0 | const __m128i F0 = _mm_srli_epi32(C0, 16); // 0 0 | x db |
115 | 0 | const __m128i F1 = _mm_srli_epi32(C1, 16); |
116 | 0 | const __m128i G0 = _mm_sub_epi8(E0, F0); // 0 0 | x b' |
117 | 0 | const __m128i G1 = _mm_sub_epi8(E1, F1); |
118 | 0 | const __m128i H0 = _mm_and_si128(G0, mask_b); // 0 0 | 0 b |
119 | 0 | const __m128i H1 = _mm_and_si128(G1, mask_b); |
120 | 0 | const __m128i I = _mm_packs_epi32(H0, H1); // 0 b' | 0 b' |
121 | 0 | _mm_storeu_si128((__m128i*)values, I); |
122 | 0 | for (i = 0; i < SPAN; ++i) ++histo[values[i]]; |
123 | 0 | } |
124 | 0 | } |
125 | 0 | { |
126 | 0 | const int left_over = tile_width & (SPAN - 1); |
127 | 0 | if (left_over > 0) { |
128 | 0 | VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride, |
129 | 0 | left_over, tile_height, green_to_blue, |
130 | 0 | red_to_blue, histo); |
131 | 0 | } |
132 | 0 | } |
133 | 0 | } |
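
The collection above histograms, for each pixel of the tile, the blue value the cross-color transform would emit for the candidate (green_to_blue, red_to_blue) pair; the encoder uses these histograms to choose multipliers. A scalar sketch of the same accounting (CollectBlueScalar and TransformedBlue are illustrative names):

#include <stdint.h>

// The blue value the cross-color transform would emit for one pixel.
static uint8_t TransformedBlue(uint8_t blue, int8_t green, int8_t red,
                               int green_to_blue, int red_to_blue) {
  int new_blue = blue;
  new_blue -= ((int)(int8_t)green_to_blue * green) >> 5;
  new_blue -= ((int)(int8_t)red_to_blue * red) >> 5;
  return (uint8_t)(new_blue & 0xff);
}

static void CollectBlueScalar(const uint32_t* argb, int stride,
                              int tile_width, int tile_height,
                              int green_to_blue, int red_to_blue,
                              uint32_t histo[256]) {
  int x, y;
  for (y = 0; y < tile_height; ++y) {
    for (x = 0; x < tile_width; ++x) {
      const uint32_t p = argb[x + y * stride];
      ++histo[TransformedBlue((uint8_t)p, (int8_t)(p >> 8), (int8_t)(p >> 16),
                              green_to_blue, red_to_blue)];
    }
  }
}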
134 | | |
135 | | static void CollectColorRedTransforms_SSE2(const uint32_t* WEBP_RESTRICT argb, |
136 | | int stride, int tile_width, |
137 | | int tile_height, int green_to_red, |
138 | 0 | uint32_t histo[]) { |
139 | 0 | const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_red)); |
140 | 0 | const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask |
142 | 0 | const __m128i mask = _mm_set1_epi32(0xff); // red mask |
143 | |
143 | 0 | int y; |
144 | 0 | for (y = 0; y < tile_height; ++y) { |
145 | 0 | const uint32_t* const src = argb + y * stride; |
146 | 0 | int i, x; |
147 | 0 | for (x = 0; x + SPAN <= tile_width; x += SPAN) { |
148 | 0 | uint16_t values[SPAN]; |
149 | 0 | const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]); |
150 | 0 | const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]); |
151 | 0 | const __m128i A0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0 |
152 | 0 | const __m128i A1 = _mm_and_si128(in1, mask_g); |
153 | 0 | const __m128i B0 = _mm_srli_epi32(in0, 16); // 0 0 | x r |
154 | 0 | const __m128i B1 = _mm_srli_epi32(in1, 16); |
155 | 0 | const __m128i C0 = _mm_mulhi_epi16(A0, mults_g); // 0 0 | x dr |
156 | 0 | const __m128i C1 = _mm_mulhi_epi16(A1, mults_g); |
157 | 0 | const __m128i E0 = _mm_sub_epi8(B0, C0); // x x | x r' |
158 | 0 | const __m128i E1 = _mm_sub_epi8(B1, C1); |
159 | 0 | const __m128i F0 = _mm_and_si128(E0, mask); // 0 0 | 0 r' |
160 | 0 | const __m128i F1 = _mm_and_si128(E1, mask); |
161 | 0 | const __m128i I = _mm_packs_epi32(F0, F1); |
162 | 0 | _mm_storeu_si128((__m128i*)values, I); |
163 | 0 | for (i = 0; i < SPAN; ++i) ++histo[values[i]]; |
164 | 0 | } |
165 | 0 | } |
166 | 0 | { |
167 | 0 | const int left_over = tile_width & (SPAN - 1); |
168 | 0 | if (left_over > 0) { |
169 | 0 | VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride, |
170 | 0 | left_over, tile_height, green_to_red, |
171 | 0 | histo); |
172 | 0 | } |
173 | 0 | } |
174 | 0 | } |
175 | | #undef SPAN |
176 | | #undef MK_CST_16 |
177 | | |
178 | | //------------------------------------------------------------------------------ |
179 | | |
180 | | // Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But |
181 | | // that's ok since the histogram values are less than 1<<28 (max picture size). |
182 | | static void AddVector_SSE2(const uint32_t* WEBP_RESTRICT a, |
183 | | const uint32_t* WEBP_RESTRICT b, |
184 | 1.03M | uint32_t* WEBP_RESTRICT out, int size) { |
185 | 1.03M | int i = 0; |
186 | 1.03M | int aligned_size = size & ~15; |
187 | | // Size is, at minimum, NUM_DISTANCE_CODES (40) and may be as large as |
188 | | // NUM_LITERAL_CODES (256) + NUM_LENGTH_CODES (24) + (0 or a non-zero power of |
189 | | // 2). See the usage in VP8LHistogramAdd(). |
190 | 1.03M | assert(size >= 16); |
191 | 1.03M | assert(size % 2 == 0); |
192 | | |
193 | 14.1M | do { |
194 | 14.1M | const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]); |
195 | 14.1M | const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]); |
196 | 14.1M | const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]); |
197 | 14.1M | const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]); |
198 | 14.1M | const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i + 0]); |
199 | 14.1M | const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]); |
200 | 14.1M | const __m128i b2 = _mm_loadu_si128((const __m128i*)&b[i + 8]); |
201 | 14.1M | const __m128i b3 = _mm_loadu_si128((const __m128i*)&b[i + 12]); |
202 | 14.1M | _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0)); |
203 | 14.1M | _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1)); |
204 | 14.1M | _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2)); |
205 | 14.1M | _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3)); |
206 | 14.1M | i += 16; |
207 | 14.1M | } while (i != aligned_size); |
208 | | |
209 | 1.03M | if ((size & 8) != 0) { |
210 | 572k | const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]); |
211 | 572k | const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]); |
212 | 572k | const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i + 0]); |
213 | 572k | const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]); |
214 | 572k | _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0)); |
215 | 572k | _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1)); |
216 | 572k | i += 8; |
217 | 572k | } |
218 | | |
219 | 1.03M | size &= 7; |
220 | 1.03M | if (size == 4) { |
221 | 30.8k | const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]); |
222 | 30.8k | const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i]); |
223 | 30.8k | _mm_storeu_si128((__m128i*)&out[i], _mm_add_epi32(a0, b0)); |
224 | 1.00M | } else if (size == 2) { |
225 | 9.45k | const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[i]); |
226 | 9.45k | const __m128i b0 = _mm_loadl_epi64((const __m128i*)&b[i]); |
227 | 9.45k | _mm_storel_epi64((__m128i*)&out[i], _mm_add_epi32(a0, b0)); |
228 | 9.45k | } |
229 | 1.03M | } |
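
Only 4- and 2-element tails get dedicated SIMD paths above because, given the asserted even size and the call-site sizes described in the comment, a remainder of 6 after the 16- and 8-wide steps is unreachable. A small check of that claim under those assumptions (CheckTailSizes is illustrative, not part of the source):

#include <assert.h>

// Sizes assumed reachable per the comment above: 40, or 280 plus zero or
// a non-zero (even, per the assert) power of two.
static void CheckTailSizes(void) {
  int k;
  assert((40 & 7) == 0);
  assert((280 & 7) == 0);
  for (k = 1; k <= 12; ++k) {
    const int tail = (280 + (1 << k)) & 7;        // remainder after the 8-step
    assert(tail == 0 || tail == 2 || tail == 4);  // a tail of 6 never occurs
  }
}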
230 | | |
231 | | static void AddVectorEq_SSE2(const uint32_t* WEBP_RESTRICT a, |
232 | 2.45M | uint32_t* WEBP_RESTRICT out, int size) { |
233 | 2.45M | int i = 0; |
234 | 2.45M | int aligned_size = size & ~15; |
235 | | // Size is, at minimum, NUM_DISTANCE_CODES (40) and may be as large as |
236 | | // NUM_LITERAL_CODES (256) + NUM_LENGTH_CODES (24) + (0 or a non-zero power of |
237 | | // 2). See the usage in VP8LHistogramAdd(). |
238 | 2.45M | assert(size >= 16); |
239 | 2.45M | assert(size % 2 == 0); |
240 | | |
241 | 68.8M | do { |
242 | 68.8M | const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]); |
243 | 68.8M | const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]); |
244 | 68.8M | const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]); |
245 | 68.8M | const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]); |
246 | 68.8M | const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]); |
247 | 68.8M | const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]); |
248 | 68.8M | const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i + 8]); |
249 | 68.8M | const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]); |
250 | 68.8M | _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0)); |
251 | 68.8M | _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1)); |
252 | 68.8M | _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2)); |
253 | 68.8M | _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3)); |
254 | 68.8M | i += 16; |
255 | 68.8M | } while (i != aligned_size); |
256 | | |
257 | 2.45M | if ((size & 8) != 0) { |
258 | 947k | const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]); |
259 | 947k | const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]); |
260 | 947k | const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]); |
261 | 947k | const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]); |
262 | 947k | _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0)); |
263 | 947k | _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1)); |
264 | 947k | i += 8; |
265 | 947k | } |
266 | | |
267 | 2.45M | size &= 7; |
268 | 2.45M | if (size == 4) { |
269 | 69.9k | const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]); |
270 | 69.9k | const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i]); |
271 | 69.9k | _mm_storeu_si128((__m128i*)&out[i], _mm_add_epi32(a0, b0)); |
272 | 2.38M | } else if (size == 2) { |
273 | 22.8k | const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[i]); |
274 | 22.8k | const __m128i b0 = _mm_loadl_epi64((const __m128i*)&out[i]); |
275 | 22.8k | _mm_storel_epi64((__m128i*)&out[i], _mm_add_epi32(a0, b0)); |
276 | 22.8k | } |
277 | 2.45M | } |
278 | | |
279 | | //------------------------------------------------------------------------------ |
280 | | // Entropy |
281 | | |
282 | | #if !defined(WEBP_HAVE_SLOW_CLZ_CTZ) |
283 | | |
284 | | static uint64_t CombinedShannonEntropy_SSE2(const uint32_t X[256], |
285 | 39.6M | const uint32_t Y[256]) { |
286 | 39.6M | int i; |
287 | 39.6M | uint64_t retval = 0; |
288 | 39.6M | uint32_t sumX = 0, sumXY = 0; |
289 | 39.6M | const __m128i zero = _mm_setzero_si128(); |
290 | | |
291 | 673M | for (i = 0; i < 256; i += 16) { |
292 | 633M | const __m128i x0 = _mm_loadu_si128((const __m128i*)(X + i + 0)); |
293 | 633M | const __m128i y0 = _mm_loadu_si128((const __m128i*)(Y + i + 0)); |
294 | 633M | const __m128i x1 = _mm_loadu_si128((const __m128i*)(X + i + 4)); |
295 | 633M | const __m128i y1 = _mm_loadu_si128((const __m128i*)(Y + i + 4)); |
296 | 633M | const __m128i x2 = _mm_loadu_si128((const __m128i*)(X + i + 8)); |
297 | 633M | const __m128i y2 = _mm_loadu_si128((const __m128i*)(Y + i + 8)); |
298 | 633M | const __m128i x3 = _mm_loadu_si128((const __m128i*)(X + i + 12)); |
299 | 633M | const __m128i y3 = _mm_loadu_si128((const __m128i*)(Y + i + 12)); |
300 | 633M | const __m128i x4 = |
301 | 633M | _mm_packs_epi16(_mm_packs_epi32(x0, x1), _mm_packs_epi32(x2, x3)); |
302 | 633M | const __m128i y4 = |
303 | 633M | _mm_packs_epi16(_mm_packs_epi32(y0, y1), _mm_packs_epi32(y2, y3)); |
304 | 633M | const int32_t mx = _mm_movemask_epi8(_mm_cmpgt_epi8(x4, zero)); |
305 | 633M | int32_t my = _mm_movemask_epi8(_mm_cmpgt_epi8(y4, zero)) | mx; |
306 | 1.04G | while (my) { |
307 | 409M | const int32_t j = BitsCtz(my); |
308 | 409M | uint32_t xy; |
309 | 409M | if ((mx >> j) & 1) { |
310 | 257M | const int x = X[i + j]; |
311 | 257M | sumXY += x; |
312 | 257M | retval += VP8LFastSLog2(x); |
313 | 257M | } |
314 | 409M | xy = X[i + j] + Y[i + j]; |
315 | 409M | sumX += xy; |
316 | 409M | retval += VP8LFastSLog2(xy); |
317 | 409M | my &= my - 1; |
318 | 409M | } |
319 | 633M | } |
320 | 39.6M | retval = VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY) - retval; |
321 | 39.6M | return retval; |
322 | 39.6M | } |
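
The pack/movemask/ctz machinery above only decides which histogram bins to visit; the quantity computed is the plain combined Shannon entropy, with VP8LFastSLog2(v) a fixed-point approximation of v * log2(v). A scalar form of the same computation (variable names kept from the source, where sumX accumulates x + y and sumXY accumulates x):

static uint64_t CombinedShannonEntropyScalar(const uint32_t X[256],
                                             const uint32_t Y[256]) {
  int i;
  uint64_t retval = 0;
  uint32_t sumX = 0, sumXY = 0;
  for (i = 0; i < 256; ++i) {
    const uint32_t x = X[i];
    const uint32_t xy = x + Y[i];
    if (x != 0) {
      sumXY += x;
      retval += VP8LFastSLog2(x);
    }
    if (xy != 0) {  // the SSE2 loop skips these zero bins via movemask/ctz
      sumX += xy;
      retval += VP8LFastSLog2(xy);
    }
  }
  return VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY) - retval;
}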
323 | | |
324 | | #else |
325 | | |
326 | | #define DONT_USE_COMBINED_SHANNON_ENTROPY_SSE2_FUNC // won't be faster |
327 | | |
328 | | #endif |
329 | | |
330 | | //------------------------------------------------------------------------------ |
331 | | |
332 | | static int VectorMismatch_SSE2(const uint32_t* const array1, |
333 | 275M | const uint32_t* const array2, int length) { |
334 | 275M | int match_len; |
335 | | |
336 | 275M | if (length >= 12) { |
337 | 275M | __m128i A0 = _mm_loadu_si128((const __m128i*)&array1[0]); |
338 | 275M | __m128i A1 = _mm_loadu_si128((const __m128i*)&array2[0]); |
339 | 275M | match_len = 0; |
340 | 664M | do { |
341 | | // Loop unrolling and early load both provide a speedup of 10% for the |
342 | | // current function. Also, length can be MAX_LENGTH=4096 at most. |
343 | 664M | const __m128i cmpA = _mm_cmpeq_epi32(A0, A1); |
344 | 664M | const __m128i B0 = |
345 | 664M | _mm_loadu_si128((const __m128i*)&array1[match_len + 4]); |
346 | 664M | const __m128i B1 = |
347 | 664M | _mm_loadu_si128((const __m128i*)&array2[match_len + 4]); |
348 | 664M | if (_mm_movemask_epi8(cmpA) != 0xffff) break; |
349 | 429M | match_len += 4; |
350 | | |
351 | 429M | { |
352 | 429M | const __m128i cmpB = _mm_cmpeq_epi32(B0, B1); |
353 | 429M | A0 = _mm_loadu_si128((const __m128i*)&array1[match_len + 4]); |
354 | 429M | A1 = _mm_loadu_si128((const __m128i*)&array2[match_len + 4]); |
355 | 429M | if (_mm_movemask_epi8(cmpB) != 0xffff) break; |
356 | 389M | match_len += 4; |
357 | 389M | } |
358 | 389M | } while (match_len + 12 < length); |
359 | 275M | } else { |
360 | 54.5k | match_len = 0; |
361 | | // Unroll the potential first two loops. |
362 | 54.5k | if (length >= 4 && |
363 | 16.5k | _mm_movemask_epi8(_mm_cmpeq_epi32( |
364 | 16.5k | _mm_loadu_si128((const __m128i*)&array1[0]), |
365 | 16.5k | _mm_loadu_si128((const __m128i*)&array2[0]))) == 0xffff) { |
366 | 4.55k | match_len = 4; |
367 | 4.55k | if (length >= 8 && |
368 | 2.29k | _mm_movemask_epi8(_mm_cmpeq_epi32( |
369 | 2.29k | _mm_loadu_si128((const __m128i*)&array1[4]), |
370 | 2.29k | _mm_loadu_si128((const __m128i*)&array2[4]))) == 0xffff) { |
371 | 1.03k | match_len = 8; |
372 | 1.03k | } |
373 | 4.55k | } |
374 | 54.5k | } |
375 | | |
376 | 774M | while (match_len < length && array1[match_len] == array2[match_len]) { |
377 | 498M | ++match_len; |
378 | 498M | } |
379 | 275M | return match_len; |
380 | 275M | } |
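
Observable behavior of the function above, as a scalar reference: it returns the length of the common prefix of the two arrays; the SIMD paths only accelerate the search in 4-element strides (VectorMismatchScalar is an illustrative name):

static int VectorMismatchScalar(const uint32_t* const array1,
                                const uint32_t* const array2, int length) {
  int match_len = 0;
  while (match_len < length && array1[match_len] == array2[match_len]) {
    ++match_len;
  }
  return match_len;
}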
381 | | |
382 | | // Bundles multiple (1, 2, 4 or 8) pixels into a single pixel. |
383 | | static void BundleColorMap_SSE2(const uint8_t* WEBP_RESTRICT const row, |
384 | | int width, int xbits, |
385 | 1.03M | uint32_t* WEBP_RESTRICT dst) { |
386 | 1.03M | int x; |
387 | 1.03M | assert(xbits >= 0); |
388 | 1.03M | assert(xbits <= 3); |
389 | 1.03M | switch (xbits) { |
390 | 371k | case 0: { |
391 | 371k | const __m128i ff = _mm_set1_epi16((short)0xff00); |
392 | 371k | const __m128i zero = _mm_setzero_si128(); |
393 | | // Store 0xff000000 | (row[x] << 8). |
394 | 17.5M | for (x = 0; x + 16 <= width; x += 16, dst += 16) { |
395 | 17.1M | const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]); |
396 | 17.1M | const __m128i in_lo = _mm_unpacklo_epi8(zero, in); |
397 | 17.1M | const __m128i dst0 = _mm_unpacklo_epi16(in_lo, ff); |
398 | 17.1M | const __m128i dst1 = _mm_unpackhi_epi16(in_lo, ff); |
399 | 17.1M | const __m128i in_hi = _mm_unpackhi_epi8(zero, in); |
400 | 17.1M | const __m128i dst2 = _mm_unpacklo_epi16(in_hi, ff); |
401 | 17.1M | const __m128i dst3 = _mm_unpackhi_epi16(in_hi, ff); |
402 | 17.1M | _mm_storeu_si128((__m128i*)&dst[0], dst0); |
403 | 17.1M | _mm_storeu_si128((__m128i*)&dst[4], dst1); |
404 | 17.1M | _mm_storeu_si128((__m128i*)&dst[8], dst2); |
405 | 17.1M | _mm_storeu_si128((__m128i*)&dst[12], dst3); |
406 | 17.1M | } |
407 | 371k | break; |
408 | 0 | } |
409 | 116k | case 1: { |
410 | 116k | const __m128i ff = _mm_set1_epi16((short)0xff00); |
411 | 116k | const __m128i mul = _mm_set1_epi16(0x110); |
412 | 5.40M | for (x = 0; x + 16 <= width; x += 16, dst += 8) { |
413 | | // 0a0b | (where a/b are 4 bits). |
414 | 5.28M | const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]); |
415 | 5.28M | const __m128i tmp = _mm_mullo_epi16(in, mul); // aba0 |
416 | 5.28M | const __m128i pack = _mm_and_si128(tmp, ff); // ab00 |
417 | 5.28M | const __m128i dst0 = _mm_unpacklo_epi16(pack, ff); |
418 | 5.28M | const __m128i dst1 = _mm_unpackhi_epi16(pack, ff); |
419 | 5.28M | _mm_storeu_si128((__m128i*)&dst[0], dst0); |
420 | 5.28M | _mm_storeu_si128((__m128i*)&dst[4], dst1); |
421 | 5.28M | } |
422 | 116k | break; |
423 | 0 | } |
424 | 100k | case 2: { |
425 | 100k | const __m128i mask_or = _mm_set1_epi32((int)0xff000000); |
426 | 100k | const __m128i mul_cst = _mm_set1_epi16(0x0104); |
427 | 100k | const __m128i mask_mul = _mm_set1_epi16(0x0f00); |
428 | 3.81M | for (x = 0; x + 16 <= width; x += 16, dst += 4) { |
429 | | // 000a000b000c000d | (where a/b/c/d are 2 bits). |
430 | 3.71M | const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]); |
431 | 3.71M | const __m128i mul = _mm_mullo_epi16(in, mul_cst); // 00ab00b000cd00d0 |
432 | 3.71M | const __m128i tmp = _mm_and_si128(mul, mask_mul); // 00ab000000cd0000 |
433 | 3.71M | const __m128i shift = _mm_srli_epi32(tmp, 12); // 00000000ab000000 |
434 | 3.71M | const __m128i pack = _mm_or_si128(shift, tmp); // 00000000abcd0000 |
435 | | // Convert to 0xff00**00. |
436 | 3.71M | const __m128i res = _mm_or_si128(pack, mask_or); |
437 | 3.71M | _mm_storeu_si128((__m128i*)dst, res); |
438 | 3.71M | } |
439 | 100k | break; |
440 | 0 | } |
441 | 442k | default: { |
442 | 442k | assert(xbits == 3); |
443 | 27.5M | for (x = 0; x + 16 <= width; x += 16, dst += 2) { |
444 | | // 0000000a00000000b... | (where a/b are 1 bit). |
445 | 27.0M | const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]); |
446 | 27.0M | const __m128i shift = _mm_slli_epi64(in, 7); |
447 | 27.0M | const uint32_t move = _mm_movemask_epi8(shift); |
448 | 27.0M | dst[0] = 0xff000000 | ((move & 0xff) << 8); |
449 | 27.0M | dst[1] = 0xff000000 | (move & 0xff00); |
450 | 27.0M | } |
451 | 442k | break; |
452 | 0 | } |
453 | 1.03M | } |
454 | 1.03M | if (x != width) { |
455 | 905k | VP8LBundleColorMap_C(row + x, width - x, xbits, dst); |
456 | 905k | } |
457 | 1.03M | } |
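
All four cases above implement the same packing rule: 1 << xbits consecutive palette indices, each 8 >> xbits bits wide, are packed little end first into the green byte (and upward) of a single 0xff000000-based output pixel. A scalar sketch of that rule (BundleColorMapScalar is an illustrative name; the real fallback is VP8LBundleColorMap_C):

#include <stdint.h>

static void BundleColorMapScalar(const uint8_t* const row, int width,
                                 int xbits, uint32_t* dst) {
  const int bit_depth = 8 >> xbits;       // bits per packed index
  const int pixels_per_dst = 1 << xbits;  // indices per output pixel
  int x;
  for (x = 0; x < width; ++x) {
    const int xsub = x & (pixels_per_dst - 1);
    if (xsub == 0) dst[x >> xbits] = 0xff000000u;
    dst[x >> xbits] |= (uint32_t)row[x] << (8 + bit_depth * xsub);
  }
}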
458 | | |
459 | | //------------------------------------------------------------------------------ |
460 | | // Batch version of Predictor Transform subtraction |
461 | | |
462 | | static WEBP_INLINE void Average2_m128i(const __m128i* const a0, |
463 | | const __m128i* const a1, |
464 | 1.63G | __m128i* const avg) { |
465 | | // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1) |
466 | 1.63G | const __m128i ones = _mm_set1_epi8(1); |
467 | 1.63G | const __m128i avg1 = _mm_avg_epu8(*a0, *a1); |
468 | 1.63G | const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones); |
469 | 1.63G | *avg = _mm_sub_epi8(avg1, one); |
470 | 1.63G | } |
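
The identity in the comment deserves a spelled-out check: _mm_avg_epu8 computes the rounded-up average (a + b + 1) >> 1 per byte, and a + b is odd exactly when the low bits of a and b differ, so subtracting (a ^ b) & 1 yields the floor average the predictors need. A scalar verification (CheckAverage2 is illustrative):

#include <assert.h>
#include <stdint.h>

static void CheckAverage2(uint8_t a, uint8_t b) {
  const uint8_t avg_up = (uint8_t)(((unsigned)a + b + 1) >> 1);  // avg_epu8
  const uint8_t fix = (uint8_t)((a ^ b) & 1);  // 1 iff a + b is odd
  assert((uint8_t)(avg_up - fix) == (uint8_t)(((unsigned)a + b) >> 1));
}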
471 | | |
472 | | // Predictor0: ARGB_BLACK. |
473 | | static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper, |
474 | 22.3M | int num_pixels, uint32_t* WEBP_RESTRICT out) { |
475 | 22.3M | int i; |
476 | 22.3M | const __m128i black = _mm_set1_epi32((int)ARGB_BLACK); |
477 | 196M | for (i = 0; i + 4 <= num_pixels; i += 4) { |
478 | 174M | const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); |
479 | 174M | const __m128i res = _mm_sub_epi8(src, black); |
480 | 174M | _mm_storeu_si128((__m128i*)&out[i], res); |
481 | 174M | } |
482 | 22.3M | if (i != num_pixels) { |
483 | 1.31M | VP8LPredictorsSub_C[0](in + i, NULL, num_pixels - i, out + i); |
484 | 1.31M | } |
485 | 22.3M | (void)upper; |
486 | 22.3M | } |
487 | | |
488 | | #define GENERATE_PREDICTOR_1(X, IN) \ |
489 | | static void PredictorSub##X##_SSE2( \ |
490 | | const uint32_t* const in, const uint32_t* const upper, int num_pixels, \ |
491 | 106M | uint32_t* WEBP_RESTRICT const out) { \ |
492 | 106M | int i; \ |
493 | 927M | for (i = 0; i + 4 <= num_pixels; i += 4) { \ |
494 | 821M | const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \ |
495 | 821M | const __m128i pred = _mm_loadu_si128((const __m128i*)&(IN)); \ |
496 | 821M | const __m128i res = _mm_sub_epi8(src, pred); \ |
497 | 821M | _mm_storeu_si128((__m128i*)&out[i], res); \ |
498 | 821M | } \ |
499 | 106M | if (i != num_pixels) { \ |
500 | 17.6M | VP8LPredictorsSub_C[(X)](in + i, WEBP_OFFSET_PTR(upper, i), \ |
501 | 17.6M | num_pixels - i, out + i); \ |
502 | 17.6M | } \ |
503 | 106M | } |
504 | | |
505 | 25.6M | GENERATE_PREDICTOR_1(1, in[i - 1]) // Predictor1: L |
506 | 34.6M | GENERATE_PREDICTOR_1(2, upper[i]) // Predictor2: T |
507 | 23.3M | GENERATE_PREDICTOR_1(3, upper[i + 1]) // Predictor3: TR |
508 | 22.8M | GENERATE_PREDICTOR_1(4, upper[i - 1]) // Predictor4: TL |
509 | | #undef GENERATE_PREDICTOR_1 |
510 | | |
511 | | // Predictor5: avg2(avg2(L, TR), T) |
512 | | static void PredictorSub5_SSE2(const uint32_t* in, const uint32_t* upper, |
513 | 22.5M | int num_pixels, uint32_t* WEBP_RESTRICT out) { |
514 | 22.5M | int i; |
515 | 198M | for (i = 0; i + 4 <= num_pixels; i += 4) { |
516 | 176M | const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]); |
517 | 176M | const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); |
518 | 176M | const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]); |
519 | 176M | const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); |
520 | 176M | __m128i avg, pred, res; |
521 | 176M | Average2_m128i(&L, &TR, &avg); |
522 | 176M | Average2_m128i(&avg, &T, &pred); |
523 | 176M | res = _mm_sub_epi8(src, pred); |
524 | 176M | _mm_storeu_si128((__m128i*)&out[i], res); |
525 | 176M | } |
526 | 22.5M | if (i != num_pixels) { |
527 | 1.30M | VP8LPredictorsSub_C[5](in + i, upper + i, num_pixels - i, out + i); |
528 | 1.30M | } |
529 | 22.5M | } |
530 | | |
531 | | #define GENERATE_PREDICTOR_2(X, A, B) \ |
532 | | static void PredictorSub##X##_SSE2(const uint32_t* in, \ |
533 | | const uint32_t* upper, int num_pixels, \ |
534 | 90.7M | uint32_t* WEBP_RESTRICT out) { \ |
535 | 90.7M | int i; \ |
536 | 803M | for (i = 0; i + 4 <= num_pixels; i += 4) { \ |
537 | 713M | const __m128i tA = _mm_loadu_si128((const __m128i*)&(A)); \ |
538 | 713M | const __m128i tB = _mm_loadu_si128((const __m128i*)&(B)); \ |
539 | 713M | const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \ |
540 | 713M | __m128i pred, res; \ |
541 | 713M | Average2_m128i(&tA, &tB, &pred); \ |
542 | 713M | res = _mm_sub_epi8(src, pred); \ |
543 | 713M | _mm_storeu_si128((__m128i*)&out[i], res); \ |
544 | 713M | } \ |
545 | 90.7M | if (i != num_pixels) { \ |
546 | 5.27M | VP8LPredictorsSub_C[(X)](in + i, upper + i, num_pixels - i, out + i); \ |
547 | 5.27M | } \ |
548 | 90.7M | }
[Per-instantiation coverage tables for the GENERATE_PREDICTOR_2 body omitted: they duplicate the macro lines above with per-instantiation counts. Entry counts: PredictorSub6_SSE2 22.6M, PredictorSub7_SSE2 22.2M, PredictorSub8_SSE2 22.7M, PredictorSub9_SSE2 23.1M.]
549 | | |
550 | | GENERATE_PREDICTOR_2(6, in[i - 1], upper[i - 1]) // Predictor6: avg(L, TL) |
551 | | GENERATE_PREDICTOR_2(7, in[i - 1], upper[i]) // Predictor7: avg(L, T) |
552 | | GENERATE_PREDICTOR_2(8, upper[i - 1], upper[i]) // Predictor8: avg(TL, T) |
553 | | GENERATE_PREDICTOR_2(9, upper[i], upper[i + 1]) // Predictor9: avg(T, TR) |
554 | | #undef GENERATE_PREDICTOR_2 |
555 | | |
556 | | // Predictor10: avg(avg(L,TL), avg(T, TR)). |
557 | | static void PredictorSub10_SSE2(const uint32_t* in, const uint32_t* upper, |
558 | 23.5M | int num_pixels, uint32_t* WEBP_RESTRICT out) { |
559 | 23.5M | int i; |
560 | 212M | for (i = 0; i + 4 <= num_pixels; i += 4) { |
561 | 188M | const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]); |
562 | 188M | const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); |
563 | 188M | const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]); |
564 | 188M | const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); |
565 | 188M | const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]); |
566 | 188M | __m128i avgTTR, avgLTL, avg, res; |
567 | 188M | Average2_m128i(&T, &TR, &avgTTR); |
568 | 188M | Average2_m128i(&L, &TL, &avgLTL); |
569 | 188M | Average2_m128i(&avgTTR, &avgLTL, &avg); |
570 | 188M | res = _mm_sub_epi8(src, avg); |
571 | 188M | _mm_storeu_si128((__m128i*)&out[i], res); |
572 | 188M | } |
573 | 23.5M | if (i != num_pixels) { |
574 | 1.39M | VP8LPredictorsSub_C[10](in + i, upper + i, num_pixels - i, out + i); |
575 | 1.39M | } |
576 | 23.5M | } |
577 | | |
578 | | // Predictor11: select. |
579 | | static void GetSumAbsDiff32_SSE2(const __m128i* const A, const __m128i* const B, |
580 | 355M | __m128i* const out) { |
581 | | // We can unpack with any value on the upper 32 bits, provided it's the same |
582 | | // on both operands (so that their sum of abs diff is zero). Here we use *A. |
583 | 355M | const __m128i A_lo = _mm_unpacklo_epi32(*A, *A); |
584 | 355M | const __m128i B_lo = _mm_unpacklo_epi32(*B, *A); |
585 | 355M | const __m128i A_hi = _mm_unpackhi_epi32(*A, *A); |
586 | 355M | const __m128i B_hi = _mm_unpackhi_epi32(*B, *A); |
587 | 355M | const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo); |
588 | 355M | const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi); |
589 | 355M | *out = _mm_packs_epi32(s_lo, s_hi); |
590 | 355M | } |
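
The unpack trick above pads each 32-bit pixel of *B with the matching pixel of *A, so the filler contributes |a - a| = 0 to the 8-byte _mm_sad_epu8 sums and each 64-bit half holds one per-pixel sum of absolute channel differences. Per pixel, that sum is (SumAbsDiff32Scalar is an illustrative name):

#include <stdint.h>

static uint32_t SumAbsDiff32Scalar(uint32_t a, uint32_t b) {
  uint32_t sum = 0;
  int c;
  for (c = 0; c < 4; ++c) {  // a, r, g, b channels
    const int pa = (int)((a >> (8 * c)) & 0xff);
    const int pb = (int)((b >> (8 * c)) & 0xff);
    sum += (uint32_t)(pa > pb ? pa - pb : pb - pa);
  }
  return sum;
}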
591 | | |
592 | | static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper, |
593 | 22.6M | int num_pixels, uint32_t* WEBP_RESTRICT out) { |
594 | 22.6M | int i; |
595 | 200M | for (i = 0; i + 4 <= num_pixels; i += 4) { |
596 | 177M | const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]); |
597 | 177M | const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); |
598 | 177M | const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]); |
599 | 177M | const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); |
600 | 177M | __m128i pa, pb; |
601 | 177M | GetSumAbsDiff32_SSE2(&T, &TL, &pa); // pa = sum |T-TL| |
602 | 177M | GetSumAbsDiff32_SSE2(&L, &TL, &pb); // pb = sum |L-TL| |
603 | 177M | { |
604 | 177M | const __m128i mask = _mm_cmpgt_epi32(pb, pa); |
605 | 177M | const __m128i A = _mm_and_si128(mask, L); |
606 | 177M | const __m128i B = _mm_andnot_si128(mask, T); |
607 | 177M | const __m128i pred = _mm_or_si128(A, B); // pred = (pb > pa)? L : T |
608 | 177M | const __m128i res = _mm_sub_epi8(src, pred); |
609 | 177M | _mm_storeu_si128((__m128i*)&out[i], res); |
610 | 177M | } |
611 | 177M | } |
612 | 22.6M | if (i != num_pixels) { |
613 | 1.34M | VP8LPredictorsSub_C[11](in + i, upper + i, num_pixels - i, out + i); |
614 | 1.34M | } |
615 | 22.6M | } |
616 | | |
617 | | // Predictor12: ClampedAddSubtractFull. |
618 | | static void PredictorSub12_SSE2(const uint32_t* in, const uint32_t* upper, |
619 | 22.7M | int num_pixels, uint32_t* WEBP_RESTRICT out) { |
620 | 22.7M | int i; |
621 | 22.7M | const __m128i zero = _mm_setzero_si128(); |
622 | 205M | for (i = 0; i + 4 <= num_pixels; i += 4) { |
623 | 182M | const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); |
624 | 182M | const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]); |
625 | 182M | const __m128i L_lo = _mm_unpacklo_epi8(L, zero); |
626 | 182M | const __m128i L_hi = _mm_unpackhi_epi8(L, zero); |
627 | 182M | const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); |
628 | 182M | const __m128i T_lo = _mm_unpacklo_epi8(T, zero); |
629 | 182M | const __m128i T_hi = _mm_unpackhi_epi8(T, zero); |
630 | 182M | const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]); |
631 | 182M | const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero); |
632 | 182M | const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero); |
633 | 182M | const __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo); |
634 | 182M | const __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi); |
635 | 182M | const __m128i pred_lo = _mm_add_epi16(L_lo, diff_lo); |
636 | 182M | const __m128i pred_hi = _mm_add_epi16(L_hi, diff_hi); |
637 | 182M | const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi); |
638 | 182M | const __m128i res = _mm_sub_epi8(src, pred); |
639 | 182M | _mm_storeu_si128((__m128i*)&out[i], res); |
640 | 182M | } |
641 | 22.7M | if (i != num_pixels) { |
642 | 1.35M | VP8LPredictorsSub_C[12](in + i, upper + i, num_pixels - i, out + i); |
643 | 1.35M | } |
644 | 22.7M | } |
645 | | |
646 | | // Predictor13: ClampedAddSubtractHalf |
647 | | static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper, |
648 | 22.4M | int num_pixels, uint32_t* WEBP_RESTRICT out) { |
649 | 22.4M | int i; |
650 | 22.4M | const __m128i zero = _mm_setzero_si128(); |
651 | 198M | for (i = 0; i + 4 <= num_pixels; i += 4) { |
652 | 175M | const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]); |
653 | 175M | const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); |
654 | 175M | const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); |
655 | 175M | const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]); |
656 | 175M | __m128i A4_lo, A4_hi; |
657 | | // lo. |
658 | 175M | { |
659 | 175M | const __m128i L_lo = _mm_unpacklo_epi8(L, zero); |
660 | 175M | const __m128i T_lo = _mm_unpacklo_epi8(T, zero); |
661 | 175M | const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero); |
662 | 175M | const __m128i sum_lo = _mm_add_epi16(T_lo, L_lo); |
663 | 175M | const __m128i avg_lo = _mm_srli_epi16(sum_lo, 1); |
664 | 175M | const __m128i A1_lo = _mm_sub_epi16(avg_lo, TL_lo); |
665 | 175M | const __m128i bit_fix_lo = _mm_cmpgt_epi16(TL_lo, avg_lo); |
666 | 175M | const __m128i A2_lo = _mm_sub_epi16(A1_lo, bit_fix_lo); |
667 | 175M | const __m128i A3_lo = _mm_srai_epi16(A2_lo, 1); |
668 | 175M | A4_lo = _mm_add_epi16(avg_lo, A3_lo); |
669 | 175M | } |
670 | | // hi. |
671 | 175M | { |
672 | 175M | const __m128i L_hi = _mm_unpackhi_epi8(L, zero); |
673 | 175M | const __m128i T_hi = _mm_unpackhi_epi8(T, zero); |
674 | 175M | const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero); |
675 | 175M | const __m128i sum_hi = _mm_add_epi16(T_hi, L_hi); |
676 | 175M | const __m128i avg_hi = _mm_srli_epi16(sum_hi, 1); |
677 | 175M | const __m128i A1_hi = _mm_sub_epi16(avg_hi, TL_hi); |
678 | 175M | const __m128i bit_fix_hi = _mm_cmpgt_epi16(TL_hi, avg_hi); |
679 | 175M | const __m128i A2_hi = _mm_sub_epi16(A1_hi, bit_fix_hi); |
680 | 175M | const __m128i A3_hi = _mm_srai_epi16(A2_hi, 1); |
681 | 175M | A4_hi = _mm_add_epi16(avg_hi, A3_hi); |
682 | 175M | } |
683 | 175M | { |
684 | 175M | const __m128i pred = _mm_packus_epi16(A4_lo, A4_hi); |
685 | 175M | const __m128i res = _mm_sub_epi8(src, pred); |
686 | 175M | _mm_storeu_si128((__m128i*)&out[i], res); |
687 | 175M | } |
688 | 175M | } |
689 | 22.4M | if (i != num_pixels) { |
690 | 1.30M | VP8LPredictorsSub_C[13](in + i, upper + i, num_pixels - i, out + i); |
691 | 1.30M | } |
692 | 22.4M | } |
693 | | |
694 | | //------------------------------------------------------------------------------ |
695 | | // Entry point |
696 | | |
697 | | extern void VP8LEncDspInitSSE2(void); |
698 | | |
699 | 3 | WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) { |
700 | | // SSE exports for AVX and above. |
701 | 3 | VP8LSubtractGreenFromBlueAndRed_SSE = SubtractGreenFromBlueAndRed_SSE2; |
702 | 3 | VP8LTransformColor_SSE = TransformColor_SSE2; |
703 | 3 | VP8LCollectColorBlueTransforms_SSE = CollectColorBlueTransforms_SSE2; |
704 | 3 | VP8LCollectColorRedTransforms_SSE = CollectColorRedTransforms_SSE2; |
705 | 3 | VP8LBundleColorMap_SSE = BundleColorMap_SSE2; |
706 | | |
707 | 3 | VP8LSubtractGreenFromBlueAndRed = VP8LSubtractGreenFromBlueAndRed_SSE; |
708 | 3 | VP8LTransformColor = VP8LTransformColor_SSE; |
709 | 3 | VP8LCollectColorBlueTransforms = VP8LCollectColorBlueTransforms_SSE; |
710 | 3 | VP8LCollectColorRedTransforms = VP8LCollectColorRedTransforms_SSE; |
711 | 3 | VP8LAddVector = AddVector_SSE2; |
712 | 3 | VP8LAddVectorEq = AddVectorEq_SSE2; |
713 | 3 | #if !defined(DONT_USE_COMBINED_SHANNON_ENTROPY_SSE2_FUNC) |
714 | 3 | VP8LCombinedShannonEntropy = CombinedShannonEntropy_SSE2; |
715 | 3 | #endif |
716 | 3 | VP8LVectorMismatch = VectorMismatch_SSE2; |
717 | 3 | VP8LBundleColorMap = VP8LBundleColorMap_SSE; |
718 | | |
719 | | // SSE exports for AVX and above. |
720 | 3 | VP8LPredictorsSub_SSE[0] = PredictorSub0_SSE2; |
721 | 3 | VP8LPredictorsSub_SSE[1] = PredictorSub1_SSE2; |
722 | 3 | VP8LPredictorsSub_SSE[2] = PredictorSub2_SSE2; |
723 | 3 | VP8LPredictorsSub_SSE[3] = PredictorSub3_SSE2; |
724 | 3 | VP8LPredictorsSub_SSE[4] = PredictorSub4_SSE2; |
725 | 3 | VP8LPredictorsSub_SSE[5] = PredictorSub5_SSE2; |
726 | 3 | VP8LPredictorsSub_SSE[6] = PredictorSub6_SSE2; |
727 | 3 | VP8LPredictorsSub_SSE[7] = PredictorSub7_SSE2; |
728 | 3 | VP8LPredictorsSub_SSE[8] = PredictorSub8_SSE2; |
729 | 3 | VP8LPredictorsSub_SSE[9] = PredictorSub9_SSE2; |
730 | 3 | VP8LPredictorsSub_SSE[10] = PredictorSub10_SSE2; |
731 | 3 | VP8LPredictorsSub_SSE[11] = PredictorSub11_SSE2; |
732 | 3 | VP8LPredictorsSub_SSE[12] = PredictorSub12_SSE2; |
733 | 3 | VP8LPredictorsSub_SSE[13] = PredictorSub13_SSE2; |
734 | | // padding security sentinels |
735 | 3 | VP8LPredictorsSub_SSE[14] = PredictorSub0_SSE2; |
736 | 3 | VP8LPredictorsSub_SSE[15] = PredictorSub0_SSE2; |
737 | 3 | memcpy(VP8LPredictorsSub, VP8LPredictorsSub_SSE, sizeof(VP8LPredictorsSub)); |
738 | 3 | } |
739 | | |
740 | | #else // !WEBP_USE_SSE2 |
741 | | |
742 | | WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE2) |
743 | | |
744 | | #endif // WEBP_USE_SSE2 |