/src/aom/av1/common/x86/cfl_sse2.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2017, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #include <emmintrin.h> |
13 | | |
14 | | #include "av1/common/cfl.h" |
15 | | #include "config/av1_rtcd.h" |
16 | | |
// Horizontal add of the four 32-bit lanes of `l0`. The total is returned
// replicated in every lane of the result.
static inline __m128i fill_sum_epi32(__m128i l0) {
  // Exchange the low and high 64-bit halves, so that after the add each lane
  // holds the sum of itself and the lane two positions away.
  const __m128i halves_swapped =
      _mm_shuffle_epi32(l0, _MM_SHUFFLE(1, 0, 3, 2));
  const __m128i pair_sums = _mm_add_epi32(l0, halves_swapped);

  // Exchange neighbouring lanes; adding the two partial sums yields the
  // full four-lane total in all positions.
  const __m128i neighbours_swapped =
      _mm_shuffle_epi32(pair_sums, _MM_SHUFFLE(2, 3, 0, 1));
  return _mm_add_epi32(pair_sums, neighbours_swapped);
}
21 | | |
22 | | static inline void subtract_average_sse2(const uint16_t *src_ptr, |
23 | | int16_t *dst_ptr, int width, |
24 | | int height, int round_offset, |
25 | 1.00M | int num_pel_log2) { |
26 | 1.00M | const __m128i zeros = _mm_setzero_si128(); |
27 | 1.00M | const __m128i round_offset_epi32 = _mm_set1_epi32(round_offset); |
28 | 1.00M | const __m128i *src = (__m128i *)src_ptr; |
29 | 1.00M | const __m128i *const end = src + height * CFL_BUF_LINE_I128; |
30 | 1.00M | const int step = CFL_BUF_LINE_I128 * (1 + (width == 8) + 3 * (width == 4)); |
31 | | |
32 | 1.00M | __m128i sum = zeros; |
33 | 3.54M | do { |
34 | 3.54M | __m128i l0; |
35 | 3.54M | if (width == 4) { |
36 | 852k | l0 = _mm_add_epi16(_mm_loadl_epi64(src), |
37 | 852k | _mm_loadl_epi64(src + CFL_BUF_LINE_I128)); |
38 | 852k | __m128i l1 = _mm_add_epi16(_mm_loadl_epi64(src + 2 * CFL_BUF_LINE_I128), |
39 | 852k | _mm_loadl_epi64(src + 3 * CFL_BUF_LINE_I128)); |
40 | 852k | sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros), |
41 | 852k | _mm_unpacklo_epi16(l1, zeros))); |
42 | 2.69M | } else { |
43 | 2.69M | if (width == 8) { |
44 | 2.69M | l0 = _mm_add_epi16(_mm_loadu_si128(src), |
45 | 2.69M | _mm_loadu_si128(src + CFL_BUF_LINE_I128)); |
46 | 2.69M | } else { |
47 | 0 | l0 = _mm_add_epi16(_mm_loadu_si128(src), _mm_loadu_si128(src + 1)); |
48 | 0 | } |
49 | 2.69M | sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros), |
50 | 2.69M | _mm_unpackhi_epi16(l0, zeros))); |
51 | 2.69M | if (width == 32) { |
52 | 0 | l0 = _mm_add_epi16(_mm_loadu_si128(src + 2), _mm_loadu_si128(src + 3)); |
53 | 0 | sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros), |
54 | 0 | _mm_unpackhi_epi16(l0, zeros))); |
55 | 0 | } |
56 | 2.69M | } |
57 | 3.54M | src += step; |
58 | 3.54M | } while (src < end); |
59 | | |
60 | 1.00M | sum = fill_sum_epi32(sum); |
61 | | |
62 | 1.00M | __m128i avg_epi16 = |
63 | 1.00M | _mm_srli_epi32(_mm_add_epi32(sum, round_offset_epi32), num_pel_log2); |
64 | 1.00M | avg_epi16 = _mm_packs_epi32(avg_epi16, avg_epi16); |
65 | | |
66 | 1.00M | src = (__m128i *)src_ptr; |
67 | 1.00M | __m128i *dst = (__m128i *)dst_ptr; |
68 | 8.79M | do { |
69 | 8.79M | if (width == 4) { |
70 | 3.41M | _mm_storel_epi64(dst, _mm_sub_epi16(_mm_loadl_epi64(src), avg_epi16)); |
71 | 5.38M | } else { |
72 | 5.38M | _mm_storeu_si128(dst, _mm_sub_epi16(_mm_loadu_si128(src), avg_epi16)); |
73 | 5.38M | if (width > 8) { |
74 | 0 | _mm_storeu_si128(dst + 1, |
75 | 0 | _mm_sub_epi16(_mm_loadu_si128(src + 1), avg_epi16)); |
76 | 0 | if (width == 32) { |
77 | 0 | _mm_storeu_si128(dst + 2, |
78 | 0 | _mm_sub_epi16(_mm_loadu_si128(src + 2), avg_epi16)); |
79 | 0 | _mm_storeu_si128(dst + 3, |
80 | 0 | _mm_sub_epi16(_mm_loadu_si128(src + 3), avg_epi16)); |
81 | 0 | } |
82 | 0 | } |
83 | 5.38M | } |
84 | 8.79M | src += CFL_BUF_LINE_I128; |
85 | 8.79M | dst += CFL_BUF_LINE_I128; |
86 | 8.79M | } while (src < end); |
87 | 1.00M | } |
88 | | |
// Invokes the CFL_SUB_AVG_FN macro with the `sse2` suffix. The macro is not
// defined in this file (presumably it comes from av1/common/cfl.h, included
// above). NOTE(review): it presumably generates the per-block-size entry
// points that call subtract_average_sse2() -- confirm against the macro
// definition.
89 | | CFL_SUB_AVG_FN(sse2)