/src/libvpx/vpx_dsp/x86/sum_squares_sse2.c
/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <emmintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/mem_sse2.h"

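// A minimal scalar sketch (illustrative only; not part of libvpx, and the
// name is hypothetical) of what the routine below computes: the 64-bit sum
// of squares of a size x size block of int16_t samples with row pitch
// `stride` (in elements).
#if 0
static uint64_t sum_squares_2d_i16_sketch(const int16_t *src, int stride,
                                          int size) {
  uint64_t ss = 0;
  int r, c;
  for (r = 0; r < size; ++r) {
    for (c = 0; c < size; ++c) {
      const int v = src[r * stride + c];
      ss += (uint64_t)(v * v);
    }
  }
  return ss;
}
#endif
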
uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) {
  // Over 75% of all calls are with size == 4.
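  // (In the profiled run, roughly 38.3M of the 47.4M recorded calls, about
  // 81%, took this branch.)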
  if (size == 4) {
    __m128i s[2], sq[2], ss;

    s[0] = _mm_loadl_epi64((const __m128i *)(src + 0 * stride));
    s[0] = loadh_epi64(s[0], src + 1 * stride);
    s[1] = _mm_loadl_epi64((const __m128i *)(src + 2 * stride));
    s[1] = loadh_epi64(s[1], src + 3 * stride);
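    // Each register now packs two 4-sample rows. _mm_madd_epi16 squares
    // all eight 16-bit lanes and adds adjacent pairs, leaving four 32-bit
    // partial sums per register.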
    sq[0] = _mm_madd_epi16(s[0], s[0]);
    sq[1] = _mm_madd_epi16(s[1], s[1]);
    sq[0] = _mm_add_epi32(sq[0], sq[1]);
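    // Horizontal reduction: add the upper 64 bits onto the lower half,
    // then the upper 32 bits of the result onto the low lane.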
    ss = _mm_add_epi32(sq[0], _mm_srli_si128(sq[0], 8));
    ss = _mm_add_epi32(ss, _mm_srli_epi64(ss, 32));

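    // Extracting a 32-bit result assumes the 4x4 total fits in 32 bits;
    // that holds for the residual magnitudes this function is fed, though
    // not for arbitrary full-range int16 input.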
    return (uint64_t)_mm_cvtsi128_si32(ss);
  } else {
    // Generic case
    int r = size;
    const __m128i v_zext_mask_q = _mm_set_epi32(0, -1, 0, -1);
    __m128i v_acc_q = _mm_setzero_si128();

    assert(size % 8 == 0);

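    // The outer loop consumes the block 8 rows per iteration; the inner
    // loop walks those rows 8 columns (one aligned 16-byte load per row)
    // at a time, which is why size must be a multiple of 8.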
    do {
      int c = 0;
      __m128i v_acc_d = _mm_setzero_si128();

      do {
        const int16_t *const b = src + c;
        const __m128i v_val_0_w =
            _mm_load_si128((const __m128i *)(b + 0 * stride));
        const __m128i v_val_1_w =
            _mm_load_si128((const __m128i *)(b + 1 * stride));
        const __m128i v_val_2_w =
            _mm_load_si128((const __m128i *)(b + 2 * stride));
        const __m128i v_val_3_w =
            _mm_load_si128((const __m128i *)(b + 3 * stride));
        const __m128i v_val_4_w =
            _mm_load_si128((const __m128i *)(b + 4 * stride));
        const __m128i v_val_5_w =
            _mm_load_si128((const __m128i *)(b + 5 * stride));
        const __m128i v_val_6_w =
            _mm_load_si128((const __m128i *)(b + 6 * stride));
        const __m128i v_val_7_w =
            _mm_load_si128((const __m128i *)(b + 7 * stride));

        const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
        const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
        const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
        const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
        const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
        const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
        const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
        const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);

        const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
        const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
        const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
        const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);

        const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
        const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);

        v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
        v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d);
        c += 8;
      } while (c < size);

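      // Widen the 32-bit lane sums to 64 bits before accumulating:
      // v_zext_mask_q keeps lanes 0 and 2 zero-extended in the even 64-bit
      // halves, and the shift brings lanes 1 and 3 down. Each 8-row strip
      // is widened before the next is added, so sums over the whole block
      // cannot overflow 32 bits.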
      v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
      v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));

      src += 8 * stride;
      r -= 8;
    } while (r);

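    // Fold the upper 64-bit half of the accumulator onto the lower half.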
    v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));

#if VPX_ARCH_X86_64
    return (uint64_t)_mm_cvtsi128_si64(v_acc_q);
#else
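    // _mm_cvtsi128_si64 is unavailable on 32-bit x86, so spill the low 64
    // bits through memory instead.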
    {
      uint64_t tmp;
      _mm_storel_epi64((__m128i *)&tmp, v_acc_q);
      return tmp;
    }
#endif
  }
}
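
// Illustrative usage sketch (hypothetical; not part of this file): summing
// the squares of a 16-byte-aligned 8x8 block, which takes the generic path.
#if 0
#include "vpx_ports/mem.h"  // for DECLARE_ALIGNED
static uint64_t example_8x8(void) {
  DECLARE_ALIGNED(16, int16_t, diff[8 * 8]) = { 0 };
  diff[0] = 3;    // Contributes 3 * 3 = 9 to the total.
  diff[63] = -4;  // Contributes (-4) * (-4) = 16.
  return vpx_sum_squares_2d_i16_sse2(diff, 8, 8);  // == 25
}
#endif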