/src/libwebp/src/dsp/lossless_sse41.c
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2021 Google Inc. All Rights Reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style license |
4 | | // that can be found in the COPYING file in the root of the source |
5 | | // tree. An additional intellectual property rights grant can be found |
6 | | // in the file PATENTS. All contributing project authors may |
7 | | // be found in the AUTHORS file in the root of the source tree. |
8 | | // ----------------------------------------------------------------------------- |
9 | | // |
10 | | // SSE41 variant of methods for lossless decoder |
11 | | |
12 | | #include "src/dsp/dsp.h" |
13 | | |
14 | | #if defined(WEBP_USE_SSE41) |
15 | | #include <emmintrin.h> |
16 | | #include <smmintrin.h> |
17 | | |
18 | | #include "src/webp/types.h" |
19 | | #include "src/dsp/cpu.h" |
20 | | #include "src/dsp/lossless.h" |
21 | | |
22 | | //------------------------------------------------------------------------------ |
23 | | // Color-space conversion functions |
24 | | |
25 | | static void TransformColorInverse_SSE41(const VP8LMultipliers* const m, |
26 | | const uint32_t* const src, |
27 | 12.3M | int num_pixels, uint32_t* dst) { |
28 | | // sign-extended multiplying constants, pre-shifted by 5. |
29 | 37.1M | #define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend |
30 | 12.3M | const __m128i mults_rb = |
31 | 12.3M | _mm_set1_epi32((int)((uint32_t)CST(green_to_red) << 16 | |
32 | 12.3M | (CST(green_to_blue) & 0xffff))); |
33 | 12.3M | const __m128i mults_b2 = _mm_set1_epi32(CST(red_to_blue)); |
34 | 12.3M | #undef CST |
35 | 12.3M | const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00); |
36 | 12.3M | const __m128i perm1 = _mm_setr_epi8(-1, 1, -1, 1, -1, 5, -1, 5, |
37 | 12.3M | -1, 9, -1, 9, -1, 13, -1, 13); |
38 | 12.3M | const __m128i perm2 = _mm_setr_epi8(-1, 2, -1, -1, -1, 6, -1, -1, |
39 | 12.3M | -1, 10, -1, -1, -1, 14, -1, -1); |
40 | 12.3M | int i; |
41 | 37.7M | for (i = 0; i + 4 <= num_pixels; i += 4) { |
42 | 25.4M | const __m128i A = _mm_loadu_si128((const __m128i*)(src + i)); |
43 | 25.4M | const __m128i B = _mm_shuffle_epi8(A, perm1); // argb -> g0g0 |
44 | 25.4M | const __m128i C = _mm_mulhi_epi16(B, mults_rb); |
45 | 25.4M | const __m128i D = _mm_add_epi8(A, C); |
46 | 25.4M | const __m128i E = _mm_shuffle_epi8(D, perm2); |
47 | 25.4M | const __m128i F = _mm_mulhi_epi16(E, mults_b2); |
48 | 25.4M | const __m128i G = _mm_add_epi8(D, F); |
49 | 25.4M | const __m128i out = _mm_blendv_epi8(G, A, mask_ag); |
50 | 25.4M | _mm_storeu_si128((__m128i*)&dst[i], out); |
51 | 25.4M | } |
52 | | // Fall-back to C-version for left-overs. |
53 | 12.3M | if (i != num_pixels) { |
54 | 64.5k | VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i); |
55 | 64.5k | } |
56 | 12.3M | } |
57 | | |
58 | | //------------------------------------------------------------------------------ |
59 | | |
// Shared inner loop of the 4-bytes-to-3-bytes converters. It expects the
// following names at the expansion site:
//   in, out        : __m128i pointers to the source and destination,
//   num_pixels     : remaining pixel count (modified),
//   perm0..perm3   : byte shuffles that gather the 12 kept bytes of a 16-byte
//                    input, each at a different 32-bit rotation.
// Each iteration reads four vectors (16 pixels), shuffles them, and blends
// neighbouring results into three fully packed output vectors (48 bytes).
// 'in', 'out' and 'num_pixels' are advanced, so on exit fewer than 16 pixels
// remain for the caller to handle.
#define ARGB_TO_RGB_SSE41 do {                                        \
  for (; num_pixels >= 16; num_pixels -= 16, in += 4, out += 3) {     \
    const __m128i src0 = _mm_loadu_si128(in + 0);                     \
    const __m128i src1 = _mm_loadu_si128(in + 1);                     \
    const __m128i src2 = _mm_loadu_si128(in + 2);                     \
    const __m128i src3 = _mm_loadu_si128(in + 3);                     \
    const __m128i pack0 = _mm_shuffle_epi8(src0, perm0);              \
    const __m128i pack1 = _mm_shuffle_epi8(src1, perm1);              \
    const __m128i pack2 = _mm_shuffle_epi8(src2, perm2);              \
    const __m128i pack3 = _mm_shuffle_epi8(src3, perm3);              \
    const __m128i dst0 = _mm_blend_epi16(pack0, pack1, 0xc0);         \
    const __m128i dst1 = _mm_blend_epi16(pack1, pack2, 0xf0);         \
    const __m128i dst2 = _mm_blend_epi16(pack2, pack3, 0xfc);         \
    _mm_storeu_si128(out + 0, dst0);                                  \
    _mm_storeu_si128(out + 1, dst1);                                  \
    _mm_storeu_si128(out + 2, dst2);                                  \
  }                                                                   \
} while (0)
81 | | |
82 | | static void ConvertBGRAToRGB_SSE41(const uint32_t* WEBP_RESTRICT src, |
83 | 0 | int num_pixels, uint8_t* WEBP_RESTRICT dst) { |
84 | 0 | const __m128i* in = (const __m128i*)src; |
85 | 0 | __m128i* out = (__m128i*)dst; |
86 | 0 | const __m128i perm0 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, |
87 | 0 | 8, 14, 13, 12, -1, -1, -1, -1); |
88 | 0 | const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39); |
89 | 0 | const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e); |
90 | 0 | const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93); |
91 | |
|
92 | 0 | ARGB_TO_RGB_SSE41; |
93 | | |
94 | | // left-overs |
95 | 0 | if (num_pixels > 0) { |
96 | 0 | VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out); |
97 | 0 | } |
98 | 0 | } |
99 | | |
100 | | static void ConvertBGRAToBGR_SSE41(const uint32_t* WEBP_RESTRICT src, |
101 | 0 | int num_pixels, uint8_t* WEBP_RESTRICT dst) { |
102 | 0 | const __m128i* in = (const __m128i*)src; |
103 | 0 | __m128i* out = (__m128i*)dst; |
104 | 0 | const __m128i perm0 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, |
105 | 0 | 12, 13, 14, -1, -1, -1, -1); |
106 | 0 | const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39); |
107 | 0 | const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e); |
108 | 0 | const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93); |
109 | |
|
110 | 0 | ARGB_TO_RGB_SSE41; |
111 | | |
112 | | // left-overs |
113 | 0 | if (num_pixels > 0) { |
114 | 0 | VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, (uint8_t*)out); |
115 | 0 | } |
116 | 0 | } |
117 | | |
118 | | #undef ARGB_TO_RGB_SSE41 |
119 | | |
120 | | //------------------------------------------------------------------------------ |
121 | | // Entry point |
122 | | |
123 | | extern void VP8LDspInitSSE41(void); |
124 | | |
125 | 1 | WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE41(void) { |
126 | 1 | VP8LTransformColorInverse = TransformColorInverse_SSE41; |
127 | 1 | VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE41; |
128 | 1 | VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE41; |
129 | | |
130 | | // SSE exports for AVX and above. |
131 | 1 | VP8LTransformColorInverse_SSE = TransformColorInverse_SSE41; |
132 | 1 | VP8LConvertBGRAToRGB_SSE = ConvertBGRAToRGB_SSE41; |
133 | 1 | } |
134 | | |
135 | | #else // !WEBP_USE_SSE41 |
136 | | |
// Build without WEBP_USE_SSE41: no SSE4.1 code is compiled from this file.
// NOTE(review): WEBP_DSP_INIT_STUB is defined outside this file; presumably it
// provides an empty VP8LDspInitSSE41() so the symbol still exists -- confirm.
137 | | WEBP_DSP_INIT_STUB(VP8LDspInitSSE41) |
138 | | |
139 | | #endif // WEBP_USE_SSE41 |