/src/aom/av1/common/x86/reconinter_ssse3.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2018, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #include "config/av1_rtcd.h" |
13 | | |
14 | | #if CONFIG_AV1_HIGHBITDEPTH |
15 | | |
#include <assert.h>
#include <tmmintrin.h>

#include "aom/aom_integer.h"
#include "aom_dsp/blend.h"
#include "aom_dsp/x86/synonyms.h"
#include "av1/common/blockd.h"
22 | | |
23 | | void av1_build_compound_diffwtd_mask_highbd_ssse3( |
24 | | uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, |
25 | | int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, |
26 | 0 | int bd) { |
27 | 0 | if (w < 8) { |
28 | 0 | av1_build_compound_diffwtd_mask_highbd_c(mask, mask_type, src0, src0_stride, |
29 | 0 | src1, src1_stride, h, w, bd); |
30 | 0 | } else { |
31 | 0 | assert(bd >= 8); |
32 | 0 | assert((w % 8) == 0); |
33 | 0 | assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV); |
34 | 0 | const __m128i x0 = _mm_setzero_si128(); |
35 | 0 | const __m128i xAOM_BLEND_A64_MAX_ALPHA = |
36 | 0 | _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); |
37 | 0 | const int mask_base = 38; |
38 | 0 | const __m128i xmask_base = _mm_set1_epi16(mask_base); |
39 | 0 | const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0); |
40 | 0 | const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1); |
41 | 0 | if (bd == 8) { |
42 | 0 | if (mask_type == DIFFWTD_38_INV) { |
43 | 0 | for (int i = 0; i < h; ++i) { |
44 | 0 | for (int j = 0; j < w; j += 8) { |
45 | 0 | __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); |
46 | 0 | __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); |
47 | 0 | __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), |
48 | 0 | DIFF_FACTOR_LOG2); |
49 | 0 | __m128i m = _mm_min_epi16( |
50 | 0 | _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), |
51 | 0 | xAOM_BLEND_A64_MAX_ALPHA); |
52 | 0 | m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m); |
53 | 0 | m = _mm_packus_epi16(m, m); |
54 | 0 | _mm_storel_epi64((__m128i *)&mask[j], m); |
55 | 0 | } |
56 | 0 | ssrc0 += src0_stride; |
57 | 0 | ssrc1 += src1_stride; |
58 | 0 | mask += w; |
59 | 0 | } |
60 | 0 | } else { |
61 | 0 | for (int i = 0; i < h; ++i) { |
62 | 0 | for (int j = 0; j < w; j += 8) { |
63 | 0 | __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); |
64 | 0 | __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); |
65 | 0 | __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), |
66 | 0 | DIFF_FACTOR_LOG2); |
67 | 0 | __m128i m = _mm_min_epi16( |
68 | 0 | _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), |
69 | 0 | xAOM_BLEND_A64_MAX_ALPHA); |
70 | 0 | m = _mm_packus_epi16(m, m); |
71 | 0 | _mm_storel_epi64((__m128i *)&mask[j], m); |
72 | 0 | } |
73 | 0 | ssrc0 += src0_stride; |
74 | 0 | ssrc1 += src1_stride; |
75 | 0 | mask += w; |
76 | 0 | } |
77 | 0 | } |
78 | 0 | } else { |
79 | 0 | const __m128i xshift = _mm_set1_epi64x(bd - 8 + DIFF_FACTOR_LOG2); |
80 | 0 | if (mask_type == DIFFWTD_38_INV) { |
81 | 0 | for (int i = 0; i < h; ++i) { |
82 | 0 | for (int j = 0; j < w; j += 8) { |
83 | 0 | __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); |
84 | 0 | __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); |
85 | 0 | __m128i diff = |
86 | 0 | _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift); |
87 | 0 | __m128i m = _mm_min_epi16( |
88 | 0 | _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), |
89 | 0 | xAOM_BLEND_A64_MAX_ALPHA); |
90 | 0 | m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m); |
91 | 0 | m = _mm_packus_epi16(m, m); |
92 | 0 | _mm_storel_epi64((__m128i *)&mask[j], m); |
93 | 0 | } |
94 | 0 | ssrc0 += src0_stride; |
95 | 0 | ssrc1 += src1_stride; |
96 | 0 | mask += w; |
97 | 0 | } |
98 | 0 | } else { |
99 | 0 | for (int i = 0; i < h; ++i) { |
100 | 0 | for (int j = 0; j < w; j += 8) { |
101 | 0 | __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); |
102 | 0 | __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); |
103 | 0 | __m128i diff = |
104 | 0 | _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift); |
105 | 0 | __m128i m = _mm_min_epi16( |
106 | 0 | _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), |
107 | 0 | xAOM_BLEND_A64_MAX_ALPHA); |
108 | 0 | m = _mm_packus_epi16(m, m); |
109 | 0 | _mm_storel_epi64((__m128i *)&mask[j], m); |
110 | 0 | } |
111 | 0 | ssrc0 += src0_stride; |
112 | 0 | ssrc1 += src1_stride; |
113 | 0 | mask += w; |
114 | 0 | } |
115 | 0 | } |
116 | 0 | } |
117 | 0 | } |
118 | 0 | } |
119 | | |
120 | | #endif // CONFIG_AV1_HIGHBITDEPTH |