/src/libwebp/src/dsp/lossless_enc_sse41.c
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2015 Google Inc. All Rights Reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style license |
4 | | // that can be found in the COPYING file in the root of the source |
5 | | // tree. An additional intellectual property rights grant can be found |
6 | | // in the file PATENTS. All contributing project authors may |
7 | | // be found in the AUTHORS file in the root of the source tree. |
8 | | // ----------------------------------------------------------------------------- |
9 | | // |
10 | | // SSE4.1 variant of methods for lossless encoder |
11 | | // |
12 | | // Author: Skal (pascal.massimino@gmail.com) |
13 | | |
14 | | #include "src/dsp/dsp.h" |
15 | | |
16 | | #if defined(WEBP_USE_SSE41) |
17 | | #include <assert.h> |
18 | | #include <smmintrin.h> |
19 | | #include "src/dsp/lossless.h" |
20 | | |
21 | | //------------------------------------------------------------------------------ |
22 | | // Cost operations. |
23 | | |
24 | 0 | static WEBP_INLINE uint32_t HorizontalSum_SSE41(__m128i cost) { |
25 | 0 | cost = _mm_add_epi32(cost, _mm_srli_si128(cost, 8)); |
26 | 0 | cost = _mm_add_epi32(cost, _mm_srli_si128(cost, 4)); |
27 | 0 | return _mm_cvtsi128_si32(cost); |
28 | 0 | } |
29 | | |
// Returns a weighted sum (modulo 2^32) of the entries of 'a': each pair
// (a[2 * p], a[2 * p + 1]) with p >= 2 contributes (p - 1) times its sum.
// Entries a[0..3] do not contribute. 'length' must be a multiple of 8 and
// a[4..7] are always read, so at least 8 entries must be readable.
static uint32_t ExtraCost_SSE41(const uint32_t* const a, int length) {
  // The pairs (a[4], a[5]) and (a[6], a[7]) carry the weights 1 and 2.
  __m128i acc = _mm_set_epi32(2 * a[7], 2 * a[6], a[5], a[4]);
  int pos = 8;
  assert(length % 8 == 0);

  // Each iteration consumes eight entries, i.e. four consecutive pairs.
  while (pos + 8 <= length) {
    const int w0 = (pos - 2) >> 1;   // weight of the first pair in this group
    const __m128i lo = _mm_loadu_si128((const __m128i*)(a + pos));
    const __m128i hi = _mm_loadu_si128((const __m128i*)(a + pos + 4));
    const __m128i weights = _mm_set_epi32(w0 + 3, w0 + 2, w0 + 1, w0);
    // Horizontal add yields the four pair sums, in increasing index order.
    const __m128i pairs = _mm_hadd_epi32(lo, hi);
    acc = _mm_add_epi32(_mm_mullo_epi32(pairs, weights), acc);
    pos += 8;
  }
  return HorizontalSum_SSE41(acc);
}
46 | | |
// Same weighted sum as ExtraCost_SSE41(), but computed on the element-wise
// sum of the two arrays 'a' and 'b': each pair of entries at indices
// (2 * p, 2 * p + 1) with p >= 2 contributes (p - 1) times the sum of the
// four values taken from both arrays. 'length' must be a multiple of 8 and
// entries 4..7 of both arrays are always read.
static uint32_t ExtraCostCombined_SSE41(const uint32_t* const a,
                                        const uint32_t* const b, int length) {
  // Entries 4..7 of each array: weights 1, 1, 2, 2.
  __m128i acc = _mm_add_epi32(_mm_set_epi32(2 * a[7], 2 * a[6], a[5], a[4]),
                              _mm_set_epi32(2 * b[7], 2 * b[6], b[5], b[4]));
  int pos = 8;
  assert(length % 8 == 0);

  // Each iteration consumes eight entries (four pairs) from each array.
  while (pos + 8 <= length) {
    const int w0 = (pos - 2) >> 1;   // weight of the first pair in this group
    const __m128i a_lo = _mm_loadu_si128((const __m128i*)(a + pos));
    const __m128i a_hi = _mm_loadu_si128((const __m128i*)(a + pos + 4));
    const __m128i b_lo = _mm_loadu_si128((const __m128i*)(b + pos));
    const __m128i b_hi = _mm_loadu_si128((const __m128i*)(b + pos + 4));
    const __m128i weights = _mm_set_epi32(w0 + 3, w0 + 2, w0 + 1, w0);
    // Pair sums for each array, then merged before applying the weights.
    const __m128i a_pairs = _mm_hadd_epi32(a_lo, a_hi);
    const __m128i b_pairs = _mm_hadd_epi32(b_lo, b_hi);
    const __m128i both = _mm_add_epi32(a_pairs, b_pairs);
    acc = _mm_add_epi32(_mm_mullo_epi32(both, weights), acc);
    pos += 8;
  }
  return HorizontalSum_SSE41(acc);
}
68 | | |
69 | | //------------------------------------------------------------------------------ |
70 | | // Subtract-Green Transform |
71 | | |
// In-place transform of 'num_pixels' 32-bit pixels: within every pixel, the
// value of byte 1 is subtracted (modulo 256) from byte 0 and from byte 2,
// while bytes 1 and 3 are left untouched. Pixels are processed four at a
// time; whatever remains is handed to VP8LSubtractGreenFromBlueAndRed_C().
static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
                                              int num_pixels) {
  // Shuffle control: replicate byte 1 of each pixel into byte positions 0
  // and 2; a negative index makes the shuffle write zero (bytes 1 and 3).
  const __m128i kReplicateByte1 =
      _mm_setr_epi8(1, -1, 1, -1, 5, -1, 5, -1,
                    9, -1, 9, -1, 13, -1, 13, -1);
  int done = 0;
  while (done + 4 <= num_pixels) {
    __m128i* const quad = (__m128i*)(argb_data + done);
    const __m128i pixels = _mm_loadu_si128(quad);
    const __m128i subtrahend = _mm_shuffle_epi8(pixels, kReplicateByte1);
    _mm_storeu_si128(quad, _mm_sub_epi8(pixels, subtrahend));
    done += 4;
  }
  // Finish off the pixels not covered by the vector loop with plain-C.
  if (done != num_pixels) {
    VP8LSubtractGreenFromBlueAndRed_C(argb_data + done, num_pixels - done);
  }
}
88 | | |
89 | | //------------------------------------------------------------------------------ |
90 | | // Color Transform |
91 | | |
// Sign-extended multiplying constant, pre-shifted by 5: the low 8 bits of V
// are moved to the upper byte of a 16-bit value, which is then read as signed
// and shifted right by 5. With the usual arithmetic right shift this equals
// the signed 8-bit value of V multiplied by 8.
#define CST_5b(V) (((int16_t)((uint16_t)(V) << 8)) >> 5)

// Builds a vector whose four 32-bit lanes all hold the same value: UPPER in
// the upper 16 bits and the low 16 bits of LOWER in the lower 16 bits.
#define MK_CST_16(UPPER, LOWER) \
  _mm_set1_epi32((int)(((uint32_t)(UPPER) << 16) | ((LOWER) & 0xffff)))
97 | | |
98 | | static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride, |
99 | | int tile_width, int tile_height, |
100 | | int green_to_blue, int red_to_blue, |
101 | 0 | uint32_t histo[]) { |
102 | 0 | const __m128i mult = |
103 | 0 | MK_CST_16(CST_5b(red_to_blue) + 256,CST_5b(green_to_blue)); |
104 | 0 | const __m128i perm = |
105 | 0 | _mm_setr_epi8(-1, 1, -1, 2, -1, 5, -1, 6, -1, 9, -1, 10, -1, 13, -1, 14); |
106 | 0 | if (tile_width >= 4) { |
107 | 0 | int y; |
108 | 0 | for (y = 0; y < tile_height; ++y) { |
109 | 0 | const uint32_t* const src = argb + y * stride; |
110 | 0 | const __m128i A1 = _mm_loadu_si128((const __m128i*)src); |
111 | 0 | const __m128i B1 = _mm_shuffle_epi8(A1, perm); |
112 | 0 | const __m128i C1 = _mm_mulhi_epi16(B1, mult); |
113 | 0 | const __m128i D1 = _mm_sub_epi16(A1, C1); |
114 | 0 | __m128i E = _mm_add_epi16(_mm_srli_epi32(D1, 16), D1); |
115 | 0 | int x; |
116 | 0 | for (x = 4; x + 4 <= tile_width; x += 4) { |
117 | 0 | const __m128i A2 = _mm_loadu_si128((const __m128i*)(src + x)); |
118 | 0 | __m128i B2, C2, D2; |
119 | 0 | ++histo[_mm_extract_epi8(E, 0)]; |
120 | 0 | B2 = _mm_shuffle_epi8(A2, perm); |
121 | 0 | ++histo[_mm_extract_epi8(E, 4)]; |
122 | 0 | C2 = _mm_mulhi_epi16(B2, mult); |
123 | 0 | ++histo[_mm_extract_epi8(E, 8)]; |
124 | 0 | D2 = _mm_sub_epi16(A2, C2); |
125 | 0 | ++histo[_mm_extract_epi8(E, 12)]; |
126 | 0 | E = _mm_add_epi16(_mm_srli_epi32(D2, 16), D2); |
127 | 0 | } |
128 | 0 | ++histo[_mm_extract_epi8(E, 0)]; |
129 | 0 | ++histo[_mm_extract_epi8(E, 4)]; |
130 | 0 | ++histo[_mm_extract_epi8(E, 8)]; |
131 | 0 | ++histo[_mm_extract_epi8(E, 12)]; |
132 | 0 | } |
133 | 0 | } |
134 | 0 | { |
135 | 0 | const int left_over = tile_width & 3; |
136 | 0 | if (left_over > 0) { |
137 | 0 | VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride, |
138 | 0 | left_over, tile_height, |
139 | 0 | green_to_blue, red_to_blue, histo); |
140 | 0 | } |
141 | 0 | } |
142 | 0 | } |
143 | | |
144 | | static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride, |
145 | | int tile_width, int tile_height, |
146 | | int green_to_red, |
147 | 0 | uint32_t histo[]) { |
148 | 0 | const __m128i mult = MK_CST_16(0, CST_5b(green_to_red)); |
149 | 0 | const __m128i mask_g = _mm_set1_epi32(0x0000ff00); |
150 | 0 | if (tile_width >= 4) { |
151 | 0 | int y; |
152 | 0 | for (y = 0; y < tile_height; ++y) { |
153 | 0 | const uint32_t* const src = argb + y * stride; |
154 | 0 | const __m128i A1 = _mm_loadu_si128((const __m128i*)src); |
155 | 0 | const __m128i B1 = _mm_and_si128(A1, mask_g); |
156 | 0 | const __m128i C1 = _mm_madd_epi16(B1, mult); |
157 | 0 | __m128i D = _mm_sub_epi16(A1, C1); |
158 | 0 | int x; |
159 | 0 | for (x = 4; x + 4 <= tile_width; x += 4) { |
160 | 0 | const __m128i A2 = _mm_loadu_si128((const __m128i*)(src + x)); |
161 | 0 | __m128i B2, C2; |
162 | 0 | ++histo[_mm_extract_epi8(D, 2)]; |
163 | 0 | B2 = _mm_and_si128(A2, mask_g); |
164 | 0 | ++histo[_mm_extract_epi8(D, 6)]; |
165 | 0 | C2 = _mm_madd_epi16(B2, mult); |
166 | 0 | ++histo[_mm_extract_epi8(D, 10)]; |
167 | 0 | ++histo[_mm_extract_epi8(D, 14)]; |
168 | 0 | D = _mm_sub_epi16(A2, C2); |
169 | 0 | } |
170 | 0 | ++histo[_mm_extract_epi8(D, 2)]; |
171 | 0 | ++histo[_mm_extract_epi8(D, 6)]; |
172 | 0 | ++histo[_mm_extract_epi8(D, 10)]; |
173 | 0 | ++histo[_mm_extract_epi8(D, 14)]; |
174 | 0 | } |
175 | 0 | } |
176 | 0 | { |
177 | 0 | const int left_over = tile_width & 3; |
178 | 0 | if (left_over > 0) { |
179 | 0 | VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride, |
180 | 0 | left_over, tile_height, green_to_red, |
181 | 0 | histo); |
182 | 0 | } |
183 | 0 | } |
184 | 0 | } |
185 | | |
186 | | #undef MK_CST_16 |
187 | | |
188 | | //------------------------------------------------------------------------------ |
189 | | // Entry point |
190 | | |
191 | | extern void VP8LEncDspInitSSE41(void); |
192 | | |
193 | 0 | WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) { |
194 | 0 | VP8LExtraCost = ExtraCost_SSE41; |
195 | 0 | VP8LExtraCostCombined = ExtraCostCombined_SSE41; |
196 | 0 | VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41; |
197 | 0 | VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE41; |
198 | 0 | VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE41; |
199 | 0 | } |
200 | | |
201 | | #else // !WEBP_USE_SSE41 |
202 | | |
203 | | WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE41) |
204 | | |
205 | | #endif // WEBP_USE_SSE41 |