/src/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_ports/mem.h"

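// Widen eight signed 16-bit lanes into two vectors of four signed 32-bit
// lanes by interleaving each lane with its sign mask.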
static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero,
                                                    __m128i *out_lo,
                                                    __m128i *out_hi) {
  const __m128i sign_bits = _mm_cmplt_epi16(in, zero);
  *out_lo = _mm_unpacklo_epi16(in, sign_bits);
  *out_hi = _mm_unpackhi_epi16(in, sign_bits);
}

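// Compute the per-pixel absolute difference between the 8x8 blocks at s
// (stride p) and d (stride dp), and return the smallest and largest absolute
// difference over the block in *min and *max.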
void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
                         int *min, int *max) {
  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
  u0 = _mm_setzero_si128();
  // Row 0
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff0 = _mm_max_epi16(diff, negdiff);
  // Row 1
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
  minabsdiff = _mm_min_epi16(absdiff0, absdiff);
  // Row 2
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 3
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 4
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 5
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 6
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 7
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);

  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
  *max = _mm_extract_epi16(maxabsdiff, 0);

  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
  *min = _mm_extract_epi16(minabsdiff, 0);
}

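// Return the rounded average of the 64 pixels in the 8x8 block at s
// (stride p): (sum + 32) >> 6.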
unsigned int vpx_avg_8x8_sse2(const uint8_t *s, int p) {
  __m128i s0, s1, u0;
  unsigned int avg = 0;
  u0 = _mm_setzero_si128();
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);

  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
  avg = _mm_extract_epi16(s0, 0);
  return (avg + 32) >> 6;
}

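// Return the rounded average of the 16 pixels in the 4x4 block at s
// (stride p): (sum + 8) >> 4.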
unsigned int vpx_avg_4x4_sse2(const uint8_t *s, int p) {
  __m128i s0, s1, u0;
  unsigned int avg = 0;
  u0 = _mm_setzero_si128();
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);

  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
  avg = _mm_extract_epi16(s0, 0);
  return (avg + 8) >> 4;
}

#if CONFIG_VP9_HIGHBITDEPTH
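// High bitdepth variant of vpx_avg_8x8: s8 is reinterpreted as a uint16_t
// pointer via CONVERT_TO_SHORTPTR, and the row sums are widened to 32 bits
// before rounding.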
unsigned int vpx_highbd_avg_8x8_sse2(const uint8_t *s8, int p) {
  __m128i s0, s1;
  unsigned int avg;
  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
  const __m128i zero = _mm_setzero_si128();
  s0 = _mm_loadu_si128((const __m128i *)(s));
  s1 = _mm_loadu_si128((const __m128i *)(s + p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_loadu_si128((const __m128i *)(s + 2 * p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_loadu_si128((const __m128i *)(s + 3 * p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_loadu_si128((const __m128i *)(s + 4 * p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_loadu_si128((const __m128i *)(s + 5 * p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_loadu_si128((const __m128i *)(s + 6 * p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_loadu_si128((const __m128i *)(s + 7 * p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpackhi_epi16(s0, zero);
  s0 = _mm_unpacklo_epi16(s0, zero);
  s0 = _mm_add_epi32(s0, s1);
  s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 8));
  s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 4));
  avg = (unsigned int)_mm_cvtsi128_si32(s0);

  return (avg + 32) >> 6;
}

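// High bitdepth variant of vpx_avg_4x4 operating on uint16_t pixels.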
unsigned int vpx_highbd_avg_4x4_sse2(const uint8_t *s8, int p) {
  __m128i s0, s1;
  unsigned int avg;
  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
  s0 = _mm_loadl_epi64((const __m128i *)(s));
  s1 = _mm_loadl_epi64((const __m128i *)(s + p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_loadl_epi64((const __m128i *)(s + 2 * p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_loadl_epi64((const __m128i *)(s + 3 * p));
  s0 = _mm_adds_epu16(s0, s1);
  s0 = _mm_add_epi16(s0, _mm_srli_si128(s0, 4));
  s0 = _mm_add_epi16(s0, _mm_srli_si128(s0, 2));
  avg = _mm_extract_epi16(s0, 0);

  return (avg + 8) >> 4;
}
#endif  // CONFIG_VP9_HIGHBITDEPTH

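// One pass of the 8-point Hadamard butterfly over eight vectors of eight
// int16 values. The iter == 0 pass also transposes the 8x8 block so that the
// iter == 1 pass operates along the other dimension.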
static void hadamard_col8_sse2(__m128i *in, int iter) {
  __m128i a0 = in[0];
  __m128i a1 = in[1];
  __m128i a2 = in[2];
  __m128i a3 = in[3];
  __m128i a4 = in[4];
  __m128i a5 = in[5];
  __m128i a6 = in[6];
  __m128i a7 = in[7];

  __m128i b0 = _mm_add_epi16(a0, a1);
  __m128i b1 = _mm_sub_epi16(a0, a1);
  __m128i b2 = _mm_add_epi16(a2, a3);
  __m128i b3 = _mm_sub_epi16(a2, a3);
  __m128i b4 = _mm_add_epi16(a4, a5);
  __m128i b5 = _mm_sub_epi16(a4, a5);
  __m128i b6 = _mm_add_epi16(a6, a7);
  __m128i b7 = _mm_sub_epi16(a6, a7);

  a0 = _mm_add_epi16(b0, b2);
  a1 = _mm_add_epi16(b1, b3);
  a2 = _mm_sub_epi16(b0, b2);
  a3 = _mm_sub_epi16(b1, b3);
  a4 = _mm_add_epi16(b4, b6);
  a5 = _mm_add_epi16(b5, b7);
  a6 = _mm_sub_epi16(b4, b6);
  a7 = _mm_sub_epi16(b5, b7);

  if (iter == 0) {
    b0 = _mm_add_epi16(a0, a4);
    b7 = _mm_add_epi16(a1, a5);
    b3 = _mm_add_epi16(a2, a6);
    b4 = _mm_add_epi16(a3, a7);
    b2 = _mm_sub_epi16(a0, a4);
    b6 = _mm_sub_epi16(a1, a5);
    b1 = _mm_sub_epi16(a2, a6);
    b5 = _mm_sub_epi16(a3, a7);

    a0 = _mm_unpacklo_epi16(b0, b1);
    a1 = _mm_unpacklo_epi16(b2, b3);
    a2 = _mm_unpackhi_epi16(b0, b1);
    a3 = _mm_unpackhi_epi16(b2, b3);
    a4 = _mm_unpacklo_epi16(b4, b5);
    a5 = _mm_unpacklo_epi16(b6, b7);
    a6 = _mm_unpackhi_epi16(b4, b5);
    a7 = _mm_unpackhi_epi16(b6, b7);

    b0 = _mm_unpacklo_epi32(a0, a1);
    b1 = _mm_unpacklo_epi32(a4, a5);
    b2 = _mm_unpackhi_epi32(a0, a1);
    b3 = _mm_unpackhi_epi32(a4, a5);
    b4 = _mm_unpacklo_epi32(a2, a3);
    b5 = _mm_unpacklo_epi32(a6, a7);
    b6 = _mm_unpackhi_epi32(a2, a3);
    b7 = _mm_unpackhi_epi32(a6, a7);

    in[0] = _mm_unpacklo_epi64(b0, b1);
    in[1] = _mm_unpackhi_epi64(b0, b1);
    in[2] = _mm_unpacklo_epi64(b2, b3);
    in[3] = _mm_unpackhi_epi64(b2, b3);
    in[4] = _mm_unpacklo_epi64(b4, b5);
    in[5] = _mm_unpackhi_epi64(b4, b5);
    in[6] = _mm_unpacklo_epi64(b6, b7);
    in[7] = _mm_unpackhi_epi64(b6, b7);
  } else {
    in[0] = _mm_add_epi16(a0, a4);
    in[7] = _mm_add_epi16(a1, a5);
    in[3] = _mm_add_epi16(a2, a6);
    in[4] = _mm_add_epi16(a3, a7);
    in[2] = _mm_sub_epi16(a0, a4);
    in[6] = _mm_sub_epi16(a1, a5);
    in[1] = _mm_sub_epi16(a2, a6);
    in[5] = _mm_sub_epi16(a3, a7);
  }
}

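// 8x8 Hadamard transform of the int16 residual block at src_diff. When
// is_final is true the result is written as tran_low_t via store_tran_low();
// otherwise it is written as raw int16 for use as an intermediate stage.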
static INLINE void hadamard_8x8_sse2(const int16_t *src_diff,
                                     ptrdiff_t src_stride, tran_low_t *coeff,
                                     int is_final) {
  __m128i src[8];
  src[0] = _mm_load_si128((const __m128i *)src_diff);
  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride));

  hadamard_col8_sse2(src, 0);
  hadamard_col8_sse2(src, 1);

  if (is_final) {
    store_tran_low(src[0], coeff);
    coeff += 8;
    store_tran_low(src[1], coeff);
    coeff += 8;
    store_tran_low(src[2], coeff);
    coeff += 8;
    store_tran_low(src[3], coeff);
    coeff += 8;
    store_tran_low(src[4], coeff);
    coeff += 8;
    store_tran_low(src[5], coeff);
    coeff += 8;
    store_tran_low(src[6], coeff);
    coeff += 8;
    store_tran_low(src[7], coeff);
  } else {
    int16_t *coeff16 = (int16_t *)coeff;
    _mm_store_si128((__m128i *)coeff16, src[0]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[1]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[2]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[3]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[4]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[5]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[6]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[7]);
  }
}

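// Public 8x8 Hadamard entry point: always performs the final store.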
void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                           tran_low_t *coeff) {
  hadamard_8x8_sse2(src_diff, src_stride, coeff, 1);
}

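// 16x16 Hadamard built from four 8x8 transforms followed by a cross-block
// butterfly; the intermediate sums are shifted right by 1 to keep the result
// within 16-bit range.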
static INLINE void hadamard_16x16_sse2(const int16_t *src_diff,
                                       ptrdiff_t src_stride, tran_low_t *coeff,
                                       int is_final) {
#if CONFIG_VP9_HIGHBITDEPTH
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage. Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
  int16_t *t_coeff = temp_coeff;
#else
  int16_t *t_coeff = coeff;
#endif
  int16_t *coeff16 = (int16_t *)coeff;
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64),
                      0);
  }

  for (idx = 0; idx < 64; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 1);
    b1 = _mm_srai_epi16(b1, 1);
    b2 = _mm_srai_epi16(b2, 1);
    b3 = _mm_srai_epi16(b3, 1);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);

    if (is_final) {
      store_tran_low(coeff0, coeff);
      store_tran_low(coeff1, coeff + 64);
      store_tran_low(coeff2, coeff + 128);
      store_tran_low(coeff3, coeff + 192);
      coeff += 8;
    } else {
      _mm_store_si128((__m128i *)coeff16, coeff0);
      _mm_store_si128((__m128i *)(coeff16 + 64), coeff1);
      _mm_store_si128((__m128i *)(coeff16 + 128), coeff2);
      _mm_store_si128((__m128i *)(coeff16 + 192), coeff3);
      coeff16 += 8;
    }

    t_coeff += 8;
  }
}

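// Public 16x16 Hadamard entry point: always performs the final store.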
void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
  hadamard_16x16_sse2(src_diff, src_stride, coeff, 1);
}

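// 32x32 Hadamard built from four 16x16 transforms. The cross-block butterfly
// is done in 32 bits (after sign extension) to avoid overflow, shifted right
// by 2, then packed back to 16 bits with saturation before the final store.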
void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
#if CONFIG_VP9_HIGHBITDEPTH
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage. Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
  int16_t *t_coeff = temp_coeff;
#else
  int16_t *t_coeff = coeff;
#endif
  int idx;
  __m128i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo,
      b3_lo;
  __m128i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi,
      b3_hi;
  __m128i b0, b1, b2, b3;
  const __m128i zero = _mm_setzero_si128();
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    hadamard_16x16_sse2(src_ptr, src_stride,
                        (tran_low_t *)(t_coeff + idx * 256), 0);
  }

  for (idx = 0; idx < 256; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768));

    // Sign extend 16 bit to 32 bit.
    sign_extend_16bit_to_32bit_sse2(coeff0, zero, &coeff0_lo, &coeff0_hi);
    sign_extend_16bit_to_32bit_sse2(coeff1, zero, &coeff1_lo, &coeff1_hi);
    sign_extend_16bit_to_32bit_sse2(coeff2, zero, &coeff2_lo, &coeff2_hi);
    sign_extend_16bit_to_32bit_sse2(coeff3, zero, &coeff3_lo, &coeff3_hi);

    b0_lo = _mm_add_epi32(coeff0_lo, coeff1_lo);
    b0_hi = _mm_add_epi32(coeff0_hi, coeff1_hi);

    b1_lo = _mm_sub_epi32(coeff0_lo, coeff1_lo);
    b1_hi = _mm_sub_epi32(coeff0_hi, coeff1_hi);

    b2_lo = _mm_add_epi32(coeff2_lo, coeff3_lo);
    b2_hi = _mm_add_epi32(coeff2_hi, coeff3_hi);

    b3_lo = _mm_sub_epi32(coeff2_lo, coeff3_lo);
    b3_hi = _mm_sub_epi32(coeff2_hi, coeff3_hi);

    b0_lo = _mm_srai_epi32(b0_lo, 2);
    b1_lo = _mm_srai_epi32(b1_lo, 2);
    b2_lo = _mm_srai_epi32(b2_lo, 2);
    b3_lo = _mm_srai_epi32(b3_lo, 2);

    b0_hi = _mm_srai_epi32(b0_hi, 2);
    b1_hi = _mm_srai_epi32(b1_hi, 2);
    b2_hi = _mm_srai_epi32(b2_hi, 2);
    b3_hi = _mm_srai_epi32(b3_hi, 2);

    b0 = _mm_packs_epi32(b0_lo, b0_hi);
    b1 = _mm_packs_epi32(b1_lo, b1_hi);
    b2 = _mm_packs_epi32(b2_lo, b2_hi);
    b3 = _mm_packs_epi32(b3_lo, b3_hi);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    store_tran_low(coeff0, coeff);
    store_tran_low(coeff1, coeff + 256);

    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);
    store_tran_low(coeff2, coeff + 512);
    store_tran_low(coeff3, coeff + 768);

    coeff += 8;
    t_coeff += 8;
  }
}

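// Sum of the absolute values of `length` transform coefficients, accumulated
// in 32 bits (the SATD when applied to Hadamard output).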
int vpx_satd_sse2(const tran_low_t *coeff, int length) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  __m128i accum = zero;

  for (i = 0; i < length; i += 8) {
    const __m128i src_line = load_tran_low(coeff);
    const __m128i inv = _mm_sub_epi16(zero, src_line);
    const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
    const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
    const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
    const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
    accum = _mm_add_epi32(accum, sum);
    coeff += 8;
  }

  {  // cascading summation of accum
    __m128i hi = _mm_srli_si128(accum, 8);
    accum = _mm_add_epi32(accum, hi);
    hi = _mm_srli_epi64(accum, 32);
    accum = _mm_add_epi32(accum, hi);
  }

  return _mm_cvtsi128_si32(accum);
}

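// Vertical projection: sum each of the 16 columns over `height` rows with
// saturation, scale by a height-dependent shift (5 for 64, 4 for 32, else 3),
// and store the 16 results in hbuf.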
void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref,
                          const int ref_stride, const int height) {
  int idx;
  __m128i zero = _mm_setzero_si128();
  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
  __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
  __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
  __m128i t0, t1;
  int height_1 = height - 1;
  ref += ref_stride;

  for (idx = 1; idx < height_1; idx += 2) {
    src_line = _mm_loadu_si128((const __m128i *)ref);
    t0 = _mm_unpacklo_epi8(src_line, zero);
    t1 = _mm_unpackhi_epi8(src_line, zero);
    s0 = _mm_adds_epu16(s0, t0);
    s1 = _mm_adds_epu16(s1, t1);
    ref += ref_stride;

    src_line = _mm_loadu_si128((const __m128i *)ref);
    t0 = _mm_unpacklo_epi8(src_line, zero);
    t1 = _mm_unpackhi_epi8(src_line, zero);
    s0 = _mm_adds_epu16(s0, t0);
    s1 = _mm_adds_epu16(s1, t1);
    ref += ref_stride;
  }

  src_line = _mm_loadu_si128((const __m128i *)ref);
  t0 = _mm_unpacklo_epi8(src_line, zero);
  t1 = _mm_unpackhi_epi8(src_line, zero);
  s0 = _mm_adds_epu16(s0, t0);
  s1 = _mm_adds_epu16(s1, t1);

  if (height == 64) {
    s0 = _mm_srai_epi16(s0, 5);
    s1 = _mm_srai_epi16(s1, 5);
  } else if (height == 32) {
    s0 = _mm_srai_epi16(s0, 4);
    s1 = _mm_srai_epi16(s1, 4);
  } else {
    s0 = _mm_srai_epi16(s0, 3);
    s1 = _mm_srai_epi16(s1, 3);
  }

  _mm_storeu_si128((__m128i *)hbuf, s0);
  hbuf += 8;
  _mm_storeu_si128((__m128i *)hbuf, s1);
}

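// Horizontal projection: return the sum of `width` pixels from a single row,
// computed with SAD against zero; width is expected to be a multiple of 16.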
int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width) {
  __m128i zero = _mm_setzero_si128();
  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
  __m128i s0 = _mm_sad_epu8(src_line, zero);
  __m128i s1;
  int i;

  for (i = 16; i < width; i += 16) {
    ref += 16;
    src_line = _mm_loadu_si128((const __m128i *)ref);
    s1 = _mm_sad_epu8(src_line, zero);
    s0 = _mm_adds_epu16(s0, s1);
  }

  s1 = _mm_srli_si128(s0, 8);
  s0 = _mm_adds_epu16(s0, s1);

  return _mm_extract_epi16(s0, 0);
}

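// For the element-wise difference of two int16 vectors of length 4 << bwl,
// return the sum of squared differences minus (sum * sum) >> (bwl + 2),
// i.e. an un-normalized variance of the difference vector.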
int vpx_vector_var_sse2(const int16_t *ref, const int16_t *src, const int bwl) {
  int idx;
  int width = 4 << bwl;
  int16_t mean;
  __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
  __m128i v1 = _mm_load_si128((const __m128i *)src);
  __m128i diff = _mm_subs_epi16(v0, v1);
  __m128i sum = diff;
  __m128i sse = _mm_madd_epi16(diff, diff);

  ref += 8;
  src += 8;

  for (idx = 8; idx < width; idx += 8) {
    v0 = _mm_loadu_si128((const __m128i *)ref);
    v1 = _mm_load_si128((const __m128i *)src);
    diff = _mm_subs_epi16(v0, v1);

    sum = _mm_add_epi16(sum, diff);
    v0 = _mm_madd_epi16(diff, diff);
    sse = _mm_add_epi32(sse, v0);

    ref += 8;
    src += 8;
  }

  v0 = _mm_srli_si128(sum, 8);
  sum = _mm_add_epi16(sum, v0);
  v0 = _mm_srli_epi64(sum, 32);
  sum = _mm_add_epi16(sum, v0);
  v0 = _mm_srli_epi32(sum, 16);
  sum = _mm_add_epi16(sum, v0);

  v1 = _mm_srli_si128(sse, 8);
  sse = _mm_add_epi32(sse, v1);
  v1 = _mm_srli_epi64(sse, 32);
  sse = _mm_add_epi32(sse, v1);

  mean = (int16_t)_mm_extract_epi16(sum, 0);

  return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
}