/src/libwebp/src/dsp/lossless_enc_sse41.c
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2015 Google Inc. All Rights Reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style license |
4 | | // that can be found in the COPYING file in the root of the source |
5 | | // tree. An additional intellectual property rights grant can be found |
6 | | // in the file PATENTS. All contributing project authors may |
7 | | // be found in the AUTHORS file in the root of the source tree. |
8 | | // ----------------------------------------------------------------------------- |
9 | | // |
10 | | // SSE4.1 variant of methods for lossless encoder |
11 | | // |
12 | | // Author: Skal (pascal.massimino@gmail.com) |
13 | | |
14 | | #include "src/dsp/dsp.h" |
15 | | |
16 | | #if defined(WEBP_USE_SSE41) |
17 | | #include <emmintrin.h> |
18 | | #include <smmintrin.h> |
19 | | |
20 | | #include <assert.h> |
21 | | |
22 | | #include "src/dsp/cpu.h" |
23 | | #include "src/dsp/lossless.h" |
24 | | #include "src/webp/types.h" |
25 | | |
26 | | //------------------------------------------------------------------------------ |
27 | | // Cost operations. |
28 | | |
29 | 0 | static WEBP_INLINE uint32_t HorizontalSum_SSE41(__m128i cost) { |
30 | 0 | cost = _mm_add_epi32(cost, _mm_srli_si128(cost, 8)); |
31 | 0 | cost = _mm_add_epi32(cost, _mm_srli_si128(cost, 4)); |
32 | 0 | return _mm_cvtsi128_si32(cost); |
33 | 0 | } |
34 | | |
// Returns the weighted sum of a[m] * ((m >> 1) - 1) for m in [4, length),
// computed with wrapping 32-bit arithmetic. a[0..3] are never read.
// 'length' must be a multiple of 8. Note that a[4..7] are read
// unconditionally, so the array has to hold at least 8 entries.
static uint32_t ExtraCost_SSE41(const uint32_t* const a, int length) {
  int pos;
  // Weight applied to the pair (a[pos], a[pos + 1]); it starts at 3 for
  // pos == 8 and grows by one per pair, i.e. by 4 per group of 8 entries.
  int weight = 3;
  // Entries 4..7 carry the weights 1, 1, 2, 2.
  __m128i total = _mm_setr_epi32(a[4], a[5], 2 * a[6], 2 * a[7]);
  assert(length % 8 == 0);

  for (pos = 8; pos + 8 <= length; pos += 8, weight += 4) {
    const __m128i lo = _mm_loadu_si128((const __m128i*)&a[pos]);
    const __m128i hi = _mm_loadu_si128((const __m128i*)&a[pos + 4]);
    // Adjacent entries share a weight, so add them pairwise first.
    const __m128i pairs = _mm_hadd_epi32(lo, hi);
    const __m128i weights =
        _mm_setr_epi32(weight, weight + 1, weight + 2, weight + 3);
    total = _mm_add_epi32(_mm_mullo_epi32(pairs, weights), total);
  }
  return HorizontalSum_SSE41(total);
}
51 | | |
52 | | //------------------------------------------------------------------------------ |
53 | | // Subtract-Green Transform |
54 | | |
// Subtracts, in place and modulo 256, byte 1 of every pixel in 'argb_data'
// from byte 0 and byte 2 of that same pixel. Bytes 1 and 3 are left unchanged.
// Pixels are processed four at a time; whatever remains is handed to
// VP8LSubtractGreenFromBlueAndRed_C.
static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
                                              int num_pixels) {
  // For each pixel, copies byte 1 into bytes 0 and 2 and zeroes bytes 1 and 3.
  const __m128i kSpreadGreen = _mm_setr_epi8(1, -1, 1, -1, 5, -1, 5, -1,
                                             9, -1, 9, -1, 13, -1, 13, -1);
  int done = 0;
  while (done + 4 <= num_pixels) {
    __m128i* const quad = (__m128i*)&argb_data[done];
    const __m128i pixels = _mm_loadu_si128(quad);
    const __m128i green = _mm_shuffle_epi8(pixels, kSpreadGreen);
    _mm_storeu_si128(quad, _mm_sub_epi8(pixels, green));
    done += 4;
  }
  // Finish off the remaining pixels with plain-C.
  if (done != num_pixels) {
    VP8LSubtractGreenFromBlueAndRed_C(argb_data + done, num_pixels - done);
  }
}
71 | | |
72 | | //------------------------------------------------------------------------------ |
73 | | // Color Transform |
74 | | |
// Multiplying constants for the color transforms. CST_5b(X) reinterprets the
// low 8 bits of X as a signed value and scales it by 8 (shift left by 8
// within 16 bits, then arithmetic shift right by 5):
#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5)

// Broadcasts to every 32-bit lane a word made of the low 16 bits of HI (upper
// half) and the low 16 bits of LO (lower half).
#define MK_CST_16(HI, LO) \
  _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
80 | | |
81 | | static void CollectColorBlueTransforms_SSE41(const uint32_t* WEBP_RESTRICT argb, |
82 | | int stride, |
83 | | int tile_width, int tile_height, |
84 | | int green_to_blue, int red_to_blue, |
85 | 0 | uint32_t histo[]) { |
86 | 0 | const __m128i mult = |
87 | 0 | MK_CST_16(CST_5b(red_to_blue) + 256,CST_5b(green_to_blue)); |
88 | 0 | const __m128i perm = |
89 | 0 | _mm_setr_epi8(-1, 1, -1, 2, -1, 5, -1, 6, -1, 9, -1, 10, -1, 13, -1, 14); |
90 | 0 | if (tile_width >= 4) { |
91 | 0 | int y; |
92 | 0 | for (y = 0; y < tile_height; ++y) { |
93 | 0 | const uint32_t* const src = argb + y * stride; |
94 | 0 | const __m128i A1 = _mm_loadu_si128((const __m128i*)src); |
95 | 0 | const __m128i B1 = _mm_shuffle_epi8(A1, perm); |
96 | 0 | const __m128i C1 = _mm_mulhi_epi16(B1, mult); |
97 | 0 | const __m128i D1 = _mm_sub_epi16(A1, C1); |
98 | 0 | __m128i E = _mm_add_epi16(_mm_srli_epi32(D1, 16), D1); |
99 | 0 | int x; |
100 | 0 | for (x = 4; x + 4 <= tile_width; x += 4) { |
101 | 0 | const __m128i A2 = _mm_loadu_si128((const __m128i*)(src + x)); |
102 | 0 | __m128i B2, C2, D2; |
103 | 0 | ++histo[_mm_extract_epi8(E, 0)]; |
104 | 0 | B2 = _mm_shuffle_epi8(A2, perm); |
105 | 0 | ++histo[_mm_extract_epi8(E, 4)]; |
106 | 0 | C2 = _mm_mulhi_epi16(B2, mult); |
107 | 0 | ++histo[_mm_extract_epi8(E, 8)]; |
108 | 0 | D2 = _mm_sub_epi16(A2, C2); |
109 | 0 | ++histo[_mm_extract_epi8(E, 12)]; |
110 | 0 | E = _mm_add_epi16(_mm_srli_epi32(D2, 16), D2); |
111 | 0 | } |
112 | 0 | ++histo[_mm_extract_epi8(E, 0)]; |
113 | 0 | ++histo[_mm_extract_epi8(E, 4)]; |
114 | 0 | ++histo[_mm_extract_epi8(E, 8)]; |
115 | 0 | ++histo[_mm_extract_epi8(E, 12)]; |
116 | 0 | } |
117 | 0 | } |
118 | 0 | { |
119 | 0 | const int left_over = tile_width & 3; |
120 | 0 | if (left_over > 0) { |
121 | 0 | VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride, |
122 | 0 | left_over, tile_height, |
123 | 0 | green_to_blue, red_to_blue, histo); |
124 | 0 | } |
125 | 0 | } |
126 | 0 | } |
127 | | |
128 | | static void CollectColorRedTransforms_SSE41(const uint32_t* WEBP_RESTRICT argb, |
129 | | int stride, |
130 | | int tile_width, int tile_height, |
131 | | int green_to_red, |
132 | 0 | uint32_t histo[]) { |
133 | 0 | const __m128i mult = MK_CST_16(0, CST_5b(green_to_red)); |
134 | 0 | const __m128i mask_g = _mm_set1_epi32(0x0000ff00); |
135 | 0 | if (tile_width >= 4) { |
136 | 0 | int y; |
137 | 0 | for (y = 0; y < tile_height; ++y) { |
138 | 0 | const uint32_t* const src = argb + y * stride; |
139 | 0 | const __m128i A1 = _mm_loadu_si128((const __m128i*)src); |
140 | 0 | const __m128i B1 = _mm_and_si128(A1, mask_g); |
141 | 0 | const __m128i C1 = _mm_madd_epi16(B1, mult); |
142 | 0 | __m128i D = _mm_sub_epi16(A1, C1); |
143 | 0 | int x; |
144 | 0 | for (x = 4; x + 4 <= tile_width; x += 4) { |
145 | 0 | const __m128i A2 = _mm_loadu_si128((const __m128i*)(src + x)); |
146 | 0 | __m128i B2, C2; |
147 | 0 | ++histo[_mm_extract_epi8(D, 2)]; |
148 | 0 | B2 = _mm_and_si128(A2, mask_g); |
149 | 0 | ++histo[_mm_extract_epi8(D, 6)]; |
150 | 0 | C2 = _mm_madd_epi16(B2, mult); |
151 | 0 | ++histo[_mm_extract_epi8(D, 10)]; |
152 | 0 | ++histo[_mm_extract_epi8(D, 14)]; |
153 | 0 | D = _mm_sub_epi16(A2, C2); |
154 | 0 | } |
155 | 0 | ++histo[_mm_extract_epi8(D, 2)]; |
156 | 0 | ++histo[_mm_extract_epi8(D, 6)]; |
157 | 0 | ++histo[_mm_extract_epi8(D, 10)]; |
158 | 0 | ++histo[_mm_extract_epi8(D, 14)]; |
159 | 0 | } |
160 | 0 | } |
161 | 0 | { |
162 | 0 | const int left_over = tile_width & 3; |
163 | 0 | if (left_over > 0) { |
164 | 0 | VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride, |
165 | 0 | left_over, tile_height, green_to_red, |
166 | 0 | histo); |
167 | 0 | } |
168 | 0 | } |
169 | 0 | } |
170 | | |
171 | | #undef MK_CST_16 |
172 | | |
173 | | //------------------------------------------------------------------------------ |
174 | | // Entry point |
175 | | |
176 | | extern void VP8LEncDspInitSSE41(void); |
177 | | |
178 | 0 | WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) { |
179 | 0 | VP8LExtraCost = ExtraCost_SSE41; |
180 | 0 | VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41; |
181 | 0 | VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE41; |
182 | 0 | VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE41; |
183 | | |
184 | | // SSE exports for AVX and above. |
185 | 0 | VP8LSubtractGreenFromBlueAndRed_SSE = SubtractGreenFromBlueAndRed_SSE41; |
186 | 0 | VP8LCollectColorBlueTransforms_SSE = CollectColorBlueTransforms_SSE41; |
187 | 0 | VP8LCollectColorRedTransforms_SSE = CollectColorRedTransforms_SSE41; |
188 | 0 | } |
189 | | |
190 | | #else // !WEBP_USE_SSE41 |
191 | | |
192 | | WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE41) |
193 | | |
194 | | #endif // WEBP_USE_SSE41 |