/src/libvpx/vpx_dsp/x86/variance_sse2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | |
11 | | #include <assert.h> |
12 | | #include <emmintrin.h> // SSE2 |
13 | | |
14 | | #include "./vpx_config.h" |
15 | | #include "./vpx_dsp_rtcd.h" |
16 | | #include "vpx_ports/mem.h" |
17 | | #include "vpx_dsp/x86/mem_sse2.h" |
18 | | |
19 | 305M | static INLINE unsigned int add32x4_sse2(__m128i val) { |
20 | 305M | val = _mm_add_epi32(val, _mm_srli_si128(val, 8)); |
21 | 305M | val = _mm_add_epi32(val, _mm_srli_si128(val, 4)); |
22 | 305M | return (unsigned int)_mm_cvtsi128_si32(val); |
23 | 305M | } |
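The two shift-and-add steps above fold the four 32-bit lanes of val into lane 0, so the function returns the sum of all four lanes. A scalar sketch of the same reduction, assuming the register is viewed as a plain array of four unsigned ints (the helper below is illustrative, not part of libvpx):

    static unsigned int add32x4_scalar(const unsigned int lanes[4]) {
      /* _mm_srli_si128(val, 8) + add folds lanes 2..3 onto lanes 0..1;  */
      /* _mm_srli_si128(val, 4) + add then folds lane 1 onto lane 0.     */
      return (lanes[0] + lanes[2]) + (lanes[1] + lanes[3]);
    }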
24 | | |
25 | 0 | unsigned int vpx_get_mb_ss_sse2(const int16_t *src_ptr) { |
26 | 0 | __m128i vsum = _mm_setzero_si128(); |
27 | 0 | int i; |
28 | |
29 | 0 | for (i = 0; i < 32; ++i) { |
30 | 0 | const __m128i v = _mm_loadu_si128((const __m128i *)src_ptr); |
31 | 0 | vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v)); |
32 | 0 | src_ptr += 8; |
33 | 0 | } |
34 | |
35 | 0 | return add32x4_sse2(vsum); |
36 | 0 | } |
37 | | |
38 | 1.25G | static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) { |
39 | 1.25G | const __m128i p0 = _mm_cvtsi32_si128(loadu_int32(p + 0 * stride)); |
40 | 1.25G | const __m128i p1 = _mm_cvtsi32_si128(loadu_int32(p + 1 * stride)); |
41 | 1.25G | const __m128i p01 = _mm_unpacklo_epi32(p0, p1); |
42 | 1.25G | return _mm_unpacklo_epi8(p01, _mm_setzero_si128()); |
43 | 1.25G | } |
44 | | |
45 | | static INLINE void variance_kernel_sse2(const __m128i src_ptr, |
46 | | const __m128i ref_ptr, |
47 | | __m128i *const sse, |
48 | 627M | __m128i *const sum) { |
49 | 627M | const __m128i diff = _mm_sub_epi16(src_ptr, ref_ptr); |
50 | 627M | *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff)); |
51 | 627M | *sum = _mm_add_epi16(*sum, diff); |
52 | 627M | } |
53 | | |
54 | | // Can handle 128 pixels' diff sum (such as 8x16 or 16x8) |
55 | | // Slightly faster than variance_final_256_pel_sse2() |
56 | | static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum, |
57 | | unsigned int *const sse, |
58 | 305M | int *const sum) { |
59 | 305M | *sse = add32x4_sse2(vsse); |
60 | | |
61 | 305M | vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); |
62 | 305M | vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); |
63 | 305M | vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); |
64 | 305M | *sum = (int16_t)_mm_extract_epi16(vsum, 0); |
65 | 305M | } |
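Why the 128-pixel limit holds (a rough bound, not stated in the source): each 8-bit diff lies in [-255, 255], so after folding all eight 16-bit lanes into one, the worst-case magnitude of the sum is 128 * 255 = 32640, which still fits in a signed 16-bit value (32767). With 256 pixels the worst case would be 256 * 255 = 65280, which can wrap; that is why variance_final_256_pel_sse2() below stops folding one step earlier and adds the last two 16-bit lanes in scalar code.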
66 | | |
67 | | // Can handle 256 pixels' diff sum (such as 16x16) |
68 | | static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum, |
69 | | unsigned int *const sse, |
70 | 0 | int *const sum) { |
71 | 0 | *sse = add32x4_sse2(vsse); |
72 | |
73 | 0 | vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); |
74 | 0 | vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); |
75 | 0 | *sum = (int16_t)_mm_extract_epi16(vsum, 0); |
76 | 0 | *sum += (int16_t)_mm_extract_epi16(vsum, 1); |
77 | 0 | } |
78 | | |
79 | | // Can handle 512 pixels' diff sum (such as 16x32 or 32x16) |
80 | | static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum, |
81 | | unsigned int *const sse, |
82 | 0 | int *const sum) { |
83 | 0 | *sse = add32x4_sse2(vsse); |
84 | |
85 | 0 | vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); |
86 | 0 | vsum = _mm_unpacklo_epi16(vsum, vsum); |
87 | 0 | vsum = _mm_srai_epi32(vsum, 16); |
88 | 0 | *sum = (int)add32x4_sse2(vsum); |
89 | 0 | } |
90 | | |
91 | 0 | static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) { |
92 | 0 | const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16); |
93 | 0 | const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16); |
94 | 0 | return _mm_add_epi32(sum_lo, sum_hi); |
95 | 0 | } |
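The unpack-with-self followed by an arithmetic right shift in sum_to_32bit_sse2() is the usual SSE2 idiom for sign-extending 16-bit lane sums to 32 bits (SSE2 has no _mm_cvtepi16_epi32; that intrinsic is SSE4.1). Per lane it is equivalent to the scalar cast below (illustrative only):

    /* Sign-extend one 16-bit lane sum to 32 bits (scalar equivalent). */
    static int32_t widen_lane(int16_t lane16) { return (int32_t)lane16; }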
96 | | |
97 | | // Can handle 1024 pixels' diff sum (such as 32x32) |
98 | 0 | static INLINE int sum_final_sse2(const __m128i sum) { |
99 | 0 | const __m128i t = sum_to_32bit_sse2(sum); |
100 | 0 | return (int)add32x4_sse2(t); |
101 | 0 | } |
102 | | |
103 | | static INLINE void variance4_sse2(const uint8_t *src_ptr, const int src_stride, |
104 | | const uint8_t *ref_ptr, const int ref_stride, |
105 | | const int h, __m128i *const sse, |
106 | 305M | __m128i *const sum) { |
107 | 305M | int i; |
108 | | |
109 | 305M | assert(h <= 256); // May overflow for larger height. |
110 | 305M | *sse = _mm_setzero_si128(); |
111 | 305M | *sum = _mm_setzero_si128(); |
112 | | |
113 | 932M | for (i = 0; i < h; i += 2) { |
114 | 627M | const __m128i s = load4x2_sse2(src_ptr, src_stride); |
115 | 627M | const __m128i r = load4x2_sse2(ref_ptr, ref_stride); |
116 | | |
117 | 627M | variance_kernel_sse2(s, r, sse, sum); |
118 | 627M | src_ptr += 2 * src_stride; |
119 | 627M | ref_ptr += 2 * ref_stride; |
120 | 627M | } |
121 | 305M | } |
122 | | |
123 | | static INLINE void variance8_sse2(const uint8_t *src_ptr, const int src_stride, |
124 | | const uint8_t *ref_ptr, const int ref_stride, |
125 | | const int h, __m128i *const sse, |
126 | 0 | __m128i *const sum) { |
127 | 0 | const __m128i zero = _mm_setzero_si128(); |
128 | 0 | int i; |
129 | |
130 | 0 | assert(h <= 128); // May overflow for larger height. |
131 | 0 | *sse = _mm_setzero_si128(); |
132 | 0 | *sum = _mm_setzero_si128(); |
133 | |
134 | 0 | for (i = 0; i < h; i++) { |
135 | 0 | const __m128i s = |
136 | 0 | _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)src_ptr), zero); |
137 | 0 | const __m128i r = |
138 | 0 | _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ref_ptr), zero); |
139 | |
140 | 0 | variance_kernel_sse2(s, r, sse, sum); |
141 | 0 | src_ptr += src_stride; |
142 | 0 | ref_ptr += ref_stride; |
143 | 0 | } |
144 | 0 | } |
145 | | |
146 | | static INLINE void variance16_kernel_sse2(const uint8_t *const src_ptr, |
147 | | const uint8_t *const ref_ptr, |
148 | | __m128i *const sse, |
149 | 0 | __m128i *const sum) { |
150 | 0 | const __m128i zero = _mm_setzero_si128(); |
151 | 0 | const __m128i s = _mm_loadu_si128((const __m128i *)src_ptr); |
152 | 0 | const __m128i r = _mm_loadu_si128((const __m128i *)ref_ptr); |
153 | 0 | const __m128i src0 = _mm_unpacklo_epi8(s, zero); |
154 | 0 | const __m128i ref0 = _mm_unpacklo_epi8(r, zero); |
155 | 0 | const __m128i src1 = _mm_unpackhi_epi8(s, zero); |
156 | 0 | const __m128i ref1 = _mm_unpackhi_epi8(r, zero); |
157 | |
158 | 0 | variance_kernel_sse2(src0, ref0, sse, sum); |
159 | 0 | variance_kernel_sse2(src1, ref1, sse, sum); |
160 | 0 | } |
161 | | |
162 | | static INLINE void variance16_sse2(const uint8_t *src_ptr, const int src_stride, |
163 | | const uint8_t *ref_ptr, const int ref_stride, |
164 | | const int h, __m128i *const sse, |
165 | 0 | __m128i *const sum) { |
166 | 0 | int i; |
167 | |
168 | 0 | assert(h <= 64); // May overflow for larger height. |
169 | 0 | *sse = _mm_setzero_si128(); |
170 | 0 | *sum = _mm_setzero_si128(); |
171 | |
172 | 0 | for (i = 0; i < h; ++i) { |
173 | 0 | variance16_kernel_sse2(src_ptr, ref_ptr, sse, sum); |
174 | 0 | src_ptr += src_stride; |
175 | 0 | ref_ptr += ref_stride; |
176 | 0 | } |
177 | 0 | } |
178 | | |
179 | | static INLINE void variance32_sse2(const uint8_t *src_ptr, const int src_stride, |
180 | | const uint8_t *ref_ptr, const int ref_stride, |
181 | | const int h, __m128i *const sse, |
182 | 0 | __m128i *const sum) { |
183 | 0 | int i; |
184 | |
185 | 0 | assert(h <= 32); // May overflow for larger height. |
186 | | // Don't initialize sse here since it's an accumulation. |
187 | 0 | *sum = _mm_setzero_si128(); |
188 | |
189 | 0 | for (i = 0; i < h; ++i) { |
190 | 0 | variance16_kernel_sse2(src_ptr + 0, ref_ptr + 0, sse, sum); |
191 | 0 | variance16_kernel_sse2(src_ptr + 16, ref_ptr + 16, sse, sum); |
192 | 0 | src_ptr += src_stride; |
193 | 0 | ref_ptr += ref_stride; |
194 | 0 | } |
195 | 0 | } |
196 | | |
197 | | static INLINE void variance64_sse2(const uint8_t *src_ptr, const int src_stride, |
198 | | const uint8_t *ref_ptr, const int ref_stride, |
199 | | const int h, __m128i *const sse, |
200 | 0 | __m128i *const sum) { |
201 | 0 | int i; |
202 | |
203 | 0 | assert(h <= 16); // May overflow for larger height. |
204 | | // Don't initialize sse here since it's an accumulation. |
205 | 0 | *sum = _mm_setzero_si128(); |
206 | |
207 | 0 | for (i = 0; i < h; ++i) { |
208 | 0 | variance16_kernel_sse2(src_ptr + 0, ref_ptr + 0, sse, sum); |
209 | 0 | variance16_kernel_sse2(src_ptr + 16, ref_ptr + 16, sse, sum); |
210 | 0 | variance16_kernel_sse2(src_ptr + 32, ref_ptr + 32, sse, sum); |
211 | 0 | variance16_kernel_sse2(src_ptr + 48, ref_ptr + 48, sse, sum); |
212 | 0 | src_ptr += src_stride; |
213 | 0 | ref_ptr += ref_stride; |
214 | 0 | } |
215 | 0 | } |
216 | | |
217 | | void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, |
218 | | const uint8_t *ref_ptr, int ref_stride, |
219 | 0 | unsigned int *sse, int *sum) { |
220 | 0 | __m128i vsse, vsum; |
221 | 0 | variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); |
222 | 0 | variance_final_128_pel_sse2(vsse, vsum, sse, sum); |
223 | 0 | } |
224 | | |
225 | | void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride, |
226 | | const uint8_t *ref_ptr, int ref_stride, |
227 | 0 | unsigned int *sse, int *sum) { |
228 | 0 | __m128i vsse, vsum; |
229 | 0 | variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); |
230 | 0 | variance_final_256_pel_sse2(vsse, vsum, sse, sum); |
231 | 0 | } |
232 | | |
233 | | unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, |
234 | | const uint8_t *ref_ptr, int ref_stride, |
235 | 296M | unsigned int *sse) { |
236 | 296M | __m128i vsse, vsum; |
237 | 296M | int sum; |
238 | 296M | variance4_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum); |
239 | 296M | variance_final_128_pel_sse2(vsse, vsum, sse, &sum); |
240 | 296M | return *sse - ((sum * sum) >> 4); |
241 | 296M | } |
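For reference, the return expression implements the usual variance identity var = SSE - sum^2 / N with N = 4 * 4 = 16 pixels, hence the >> 4. A plain-C sketch of the same computation for a 4x4 block (the function name is illustrative only, not part of libvpx):

    static unsigned int variance4x4_c_sketch(const uint8_t *src, int src_stride,
                                             const uint8_t *ref, int ref_stride,
                                             unsigned int *sse) {
      int sum = 0;
      unsigned int sse_acc = 0;
      int i, j;
      for (i = 0; i < 4; ++i) {
        for (j = 0; j < 4; ++j) {
          const int diff = src[i * src_stride + j] - ref[i * ref_stride + j];
          sum += diff;
          sse_acc += (unsigned int)(diff * diff);
        }
      }
      *sse = sse_acc;
      /* variance = SSE - (sum * sum) / 16 */
      return sse_acc - (unsigned int)((sum * sum) >> 4);
    }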
242 | | |
243 | | unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, |
244 | | const uint8_t *ref_ptr, int ref_stride, |
245 | 8.46M | unsigned int *sse) { |
246 | 8.46M | __m128i vsse, vsum; |
247 | 8.46M | int sum; |
248 | 8.46M | variance4_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); |
249 | 8.46M | variance_final_128_pel_sse2(vsse, vsum, sse, &sum); |
250 | 8.46M | return *sse - ((sum * sum) >> 5); |
251 | 8.46M | } |
252 | | |
253 | | unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, |
254 | | const uint8_t *ref_ptr, int ref_stride, |
255 | 0 | unsigned int *sse) { |
256 | 0 | __m128i vsse, vsum; |
257 | 0 | int sum; |
258 | 0 | variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum); |
259 | 0 | variance_final_128_pel_sse2(vsse, vsum, sse, &sum); |
260 | 0 | return *sse - ((sum * sum) >> 5); |
261 | 0 | } |
262 | | |
263 | | unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, |
264 | | const uint8_t *ref_ptr, int ref_stride, |
265 | 0 | unsigned int *sse) { |
266 | 0 | __m128i vsse, vsum; |
267 | 0 | int sum; |
268 | 0 | variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); |
269 | 0 | variance_final_128_pel_sse2(vsse, vsum, sse, &sum); |
270 | 0 | return *sse - ((sum * sum) >> 6); |
271 | 0 | } |
272 | | |
273 | | unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, |
274 | | const uint8_t *ref_ptr, int ref_stride, |
275 | 0 | unsigned int *sse) { |
276 | 0 | __m128i vsse, vsum; |
277 | 0 | int sum; |
278 | 0 | variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); |
279 | 0 | variance_final_128_pel_sse2(vsse, vsum, sse, &sum); |
280 | 0 | return *sse - ((sum * sum) >> 7); |
281 | 0 | } |
282 | | |
283 | | unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, |
284 | | const uint8_t *ref_ptr, int ref_stride, |
285 | 0 | unsigned int *sse) { |
286 | 0 | __m128i vsse, vsum; |
287 | 0 | int sum; |
288 | 0 | variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); |
289 | 0 | variance_final_128_pel_sse2(vsse, vsum, sse, &sum); |
290 | 0 | return *sse - ((sum * sum) >> 7); |
291 | 0 | } |
292 | | |
293 | | unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, |
294 | | const uint8_t *ref_ptr, int ref_stride, |
295 | 0 | unsigned int *sse) { |
296 | 0 | __m128i vsse, vsum; |
297 | 0 | int sum; |
298 | 0 | variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); |
299 | 0 | variance_final_256_pel_sse2(vsse, vsum, sse, &sum); |
300 | 0 | return *sse - (uint32_t)(((int64_t)sum * sum) >> 8); |
301 | 0 | } |
302 | | |
303 | | unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, |
304 | | const uint8_t *ref_ptr, int ref_stride, |
305 | 0 | unsigned int *sse) { |
306 | 0 | __m128i vsse, vsum; |
307 | 0 | int sum; |
308 | 0 | variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); |
309 | 0 | variance_final_512_pel_sse2(vsse, vsum, sse, &sum); |
310 | 0 | return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); |
311 | 0 | } |
312 | | |
313 | | unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, |
314 | | const uint8_t *ref_ptr, int ref_stride, |
315 | 0 | unsigned int *sse) { |
316 | 0 | __m128i vsse = _mm_setzero_si128(); |
317 | 0 | __m128i vsum; |
318 | 0 | int sum; |
319 | 0 | variance32_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); |
320 | 0 | variance_final_512_pel_sse2(vsse, vsum, sse, &sum); |
321 | 0 | return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); |
322 | 0 | } |
323 | | |
324 | | unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, |
325 | | const uint8_t *ref_ptr, int ref_stride, |
326 | 0 | unsigned int *sse) { |
327 | 0 | __m128i vsse = _mm_setzero_si128(); |
328 | 0 | __m128i vsum; |
329 | 0 | int sum; |
330 | 0 | variance32_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); |
331 | 0 | *sse = add32x4_sse2(vsse); |
332 | 0 | sum = sum_final_sse2(vsum); |
333 | 0 | return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); |
334 | 0 | } |
335 | | |
336 | | unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, |
337 | | const uint8_t *ref_ptr, int ref_stride, |
338 | 0 | unsigned int *sse) { |
339 | 0 | __m128i vsse = _mm_setzero_si128(); |
340 | 0 | __m128i vsum = _mm_setzero_si128(); |
341 | 0 | int sum; |
342 | 0 | int i = 0; |
343 | |
344 | 0 | for (i = 0; i < 2; i++) { |
345 | 0 | __m128i vsum16; |
346 | 0 | variance32_sse2(src_ptr + 32 * i * src_stride, src_stride, |
347 | 0 | ref_ptr + 32 * i * ref_stride, ref_stride, 32, &vsse, |
348 | 0 | &vsum16); |
349 | 0 | vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); |
350 | 0 | } |
351 | 0 | *sse = add32x4_sse2(vsse); |
352 | 0 | sum = (int)add32x4_sse2(vsum); |
353 | 0 | return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); |
354 | 0 | } |
355 | | |
356 | | unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, |
357 | | const uint8_t *ref_ptr, int ref_stride, |
358 | 0 | unsigned int *sse) { |
359 | 0 | __m128i vsse = _mm_setzero_si128(); |
360 | 0 | __m128i vsum = _mm_setzero_si128(); |
361 | 0 | int sum; |
362 | 0 | int i = 0; |
363 | |
364 | 0 | for (i = 0; i < 2; i++) { |
365 | 0 | __m128i vsum16; |
366 | 0 | variance64_sse2(src_ptr + 16 * i * src_stride, src_stride, |
367 | 0 | ref_ptr + 16 * i * ref_stride, ref_stride, 16, &vsse, |
368 | 0 | &vsum16); |
369 | 0 | vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); |
370 | 0 | } |
371 | 0 | *sse = add32x4_sse2(vsse); |
372 | 0 | sum = (int)add32x4_sse2(vsum); |
373 | 0 | return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); |
374 | 0 | } |
375 | | |
376 | | unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, |
377 | | const uint8_t *ref_ptr, int ref_stride, |
378 | 0 | unsigned int *sse) { |
379 | 0 | __m128i vsse = _mm_setzero_si128(); |
380 | 0 | __m128i vsum = _mm_setzero_si128(); |
381 | 0 | int sum; |
382 | 0 | int i = 0; |
383 | |
384 | 0 | for (i = 0; i < 4; i++) { |
385 | 0 | __m128i vsum16; |
386 | 0 | variance64_sse2(src_ptr + 16 * i * src_stride, src_stride, |
387 | 0 | ref_ptr + 16 * i * ref_stride, ref_stride, 16, &vsse, |
388 | 0 | &vsum16); |
389 | 0 | vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); |
390 | 0 | } |
391 | 0 | *sse = add32x4_sse2(vsse); |
392 | 0 | sum = (int)add32x4_sse2(vsum); |
393 | 0 | return *sse - (unsigned int)(((int64_t)sum * sum) >> 12); |
394 | 0 | } |
395 | | |
396 | | unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride, |
397 | | const uint8_t *ref_ptr, int ref_stride, |
398 | 0 | unsigned int *sse) { |
399 | 0 | vpx_variance8x8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse); |
400 | 0 | return *sse; |
401 | 0 | } |
402 | | |
403 | | unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride, |
404 | | const uint8_t *ref_ptr, int ref_stride, |
405 | 0 | unsigned int *sse) { |
406 | 0 | vpx_variance8x16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse); |
407 | 0 | return *sse; |
408 | 0 | } |
409 | | |
410 | | unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride, |
411 | | const uint8_t *ref_ptr, int ref_stride, |
412 | 0 | unsigned int *sse) { |
413 | 0 | vpx_variance16x8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse); |
414 | 0 | return *sse; |
415 | 0 | } |
416 | | |
417 | | unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride, |
418 | | const uint8_t *ref_ptr, int ref_stride, |
419 | 0 | unsigned int *sse) { |
420 | 0 | vpx_variance16x16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse); |
421 | 0 | return *sse; |
422 | 0 | } |
423 | | |
424 | | // The 2 unused parameters are place holders for PIC enabled build. |
425 | | // These definitions are for functions defined in subpel_variance.asm |
426 | | #define DECL(w, opt) \ |
427 | | int vpx_sub_pixel_variance##w##xh_##opt( \ |
428 | | const uint8_t *src_ptr, ptrdiff_t src_stride, int x_offset, \ |
429 | | int y_offset, const uint8_t *ref_ptr, ptrdiff_t ref_stride, int height, \ |
430 | | unsigned int *sse, void *unused0, void *unused) |
431 | | #define DECLS(opt1, opt2) \ |
432 | | DECL(4, opt1); \ |
433 | | DECL(8, opt1); \ |
434 | | DECL(16, opt1) |
435 | | |
436 | | DECLS(sse2, sse2); |
437 | | DECLS(ssse3, ssse3); |
438 | | #undef DECLS |
439 | | #undef DECL |
440 | | |
441 | | #define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ |
442 | | unsigned int vpx_sub_pixel_variance##w##x##h##_##opt( \ |
443 | | const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ |
444 | 253M | const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { \ |
445 | 253M | unsigned int sse_tmp; \ |
446 | 253M | int se = vpx_sub_pixel_variance##wf##xh_##opt( \ |
447 | 253M | src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, h, \ |
448 | 253M | &sse_tmp, NULL, NULL); \ |
449 | 253M | if (w > wf) { \ |
450 | 3.45M | unsigned int sse2; \ |
451 | 3.45M | int se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ |
452 | 3.45M | src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16, \ |
453 | 3.45M | ref_stride, h, &sse2, NULL, NULL); \ |
454 | 3.45M | se += se2; \ |
455 | 3.45M | sse_tmp += sse2; \ |
456 | 3.45M | if (w > wf * 2) { \ |
457 | 658k | se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ |
458 | 658k | src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32, \ |
459 | 658k | ref_stride, h, &sse2, NULL, NULL); \ |
460 | 658k | se += se2; \ |
461 | 658k | sse_tmp += sse2; \ |
462 | 658k | se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ |
463 | 658k | src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48, \ |
464 | 658k | ref_stride, h, &sse2, NULL, NULL); \ |
465 | 658k | se += se2; \ |
466 | 658k | sse_tmp += sse2; \ |
467 | 658k | } \ |
468 | 3.45M | } \ |
469 | 253M | *sse = sse_tmp; \ |
470 | 253M | return sse_tmp - \ |
471 | 253M | (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ |
472 | 253M | } |
473 | | |
474 | | #define FNS(opt1, opt2) \ |
475 | | FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)) \ |
476 | | FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)) \ |
477 | | FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)) \ |
478 | | FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)) \ |
479 | | FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)) \ |
480 | | FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)) \ |
481 | | FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \ |
482 | | FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)) \ |
483 | | FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)) \ |
484 | | FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)) \ |
485 | | FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)) \ |
486 | | FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)) \ |
487 | | FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t)) |
488 | | |
489 | | FNS(sse2, sse2) |
490 | | FNS(ssse3, ssse3) |
491 | | |
492 | | #undef FNS |
493 | | #undef FN |
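To make the FN macro above concrete: for a 32x16 block (w = 32, wf = 16, wlog2 = 5, hlog2 = 4) the expansion behaves roughly like the sketch below. The left and right 16-wide columns are handled by the 16xh assembly kernel declared earlier in this file, and the partial sums are combined before applying the variance identity (a simplified sketch, not the literal preprocessor output):

    static unsigned int vpx_sub_pixel_variance32x16_sse2_sketch(
        const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,
        const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) {
      unsigned int sse0, sse1;
      /* Left 16 columns, 16 rows. */
      int se = vpx_sub_pixel_variance16xh_sse2(src_ptr, src_stride, x_offset,
                                               y_offset, ref_ptr, ref_stride, 16,
                                               &sse0, NULL, NULL);
      /* Right 16 columns (the w > wf branch); w > wf * 2 is false for 32. */
      se += vpx_sub_pixel_variance16xh_sse2(src_ptr + 16, src_stride, x_offset,
                                            y_offset, ref_ptr + 16, ref_stride,
                                            16, &sse1, NULL, NULL);
      *sse = sse0 + sse1;
      /* wlog2 + hlog2 = 5 + 4 = 9: divide sum^2 by 32 * 16 = 512 pixels. */
      return *sse - (unsigned int)(((int64_t)se * se) >> 9);
    }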
494 | | |
495 | | // The 2 unused parameters are place holders for PIC enabled build. |
496 | | #define DECL(w, opt) \ |
497 | | int vpx_sub_pixel_avg_variance##w##xh_##opt( \ |
498 | | const uint8_t *src_ptr, ptrdiff_t src_stride, int x_offset, \ |
499 | | int y_offset, const uint8_t *ref_ptr, ptrdiff_t ref_stride, \ |
500 | | const uint8_t *second_pred, ptrdiff_t second_stride, int height, \ |
501 | | unsigned int *sse, void *unused0, void *unused) |
502 | | #define DECLS(opt1, opt2) \ |
503 | | DECL(4, opt1); \ |
504 | | DECL(8, opt1); \ |
505 | | DECL(16, opt1) |
506 | | |
507 | | DECLS(sse2, sse2); |
508 | | DECLS(ssse3, ssse3); |
509 | | #undef DECL |
510 | | #undef DECLS |
511 | | |
512 | | #define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ |
513 | | unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt( \ |
514 | | const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ |
515 | | const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, \ |
516 | 0 | const uint8_t *second_pred) { \ |
517 | 0 | unsigned int sse_tmp; \ |
518 | 0 | int se = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ |
519 | 0 | src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, \ |
520 | 0 | second_pred, w, h, &sse_tmp, NULL, NULL); \ |
521 | 0 | if (w > wf) { \ |
522 | 0 | unsigned int sse2; \ |
523 | 0 | int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ |
524 | 0 | src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16, \ |
525 | 0 | ref_stride, second_pred + 16, w, h, &sse2, NULL, NULL); \ |
526 | 0 | se += se2; \ |
527 | 0 | sse_tmp += sse2; \ |
528 | 0 | if (w > wf * 2) { \ |
529 | 0 | se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ |
530 | 0 | src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32, \ |
531 | 0 | ref_stride, second_pred + 32, w, h, &sse2, NULL, NULL); \ |
532 | 0 | se += se2; \ |
533 | 0 | sse_tmp += sse2; \ |
534 | 0 | se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ |
535 | 0 | src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48, \ |
536 | 0 | ref_stride, second_pred + 48, w, h, &sse2, NULL, NULL); \ |
537 | 0 | se += se2; \ |
538 | 0 | sse_tmp += sse2; \ |
539 | 0 | } \ |
540 | 0 | } \ |
541 | 0 | *sse = sse_tmp; \ |
542 | 0 | return sse_tmp - \ |
543 | 0 | (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ |
544 | 0 | } |
545 | | |
546 | | #define FNS(opt1, opt2) \ |
547 | | FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)) \ |
548 | | FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)) \ |
549 | | FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)) \ |
550 | | FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)) \ |
551 | | FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)) \ |
552 | | FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)) \ |
553 | | FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \ |
554 | | FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)) \ |
555 | | FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)) \ |
556 | | FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)) \ |
557 | | FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)) \ |
558 | | FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)) \ |
559 | | FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t)) |
560 | | |
561 | | FNS(sse2, sse) |
562 | | FNS(ssse3, ssse3) |
563 | | |
564 | | #undef FNS |
565 | | #undef FN |