/src/aom/aom_dsp/x86/blend_mask_sse4.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2018, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #ifndef AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ |
13 | | #define AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ |
14 | | #include <smmintrin.h> // SSE4.1 |
15 | | |
16 | | #include <assert.h> |
17 | | |
18 | | #include "aom/aom_integer.h" |
19 | | #include "aom_ports/mem.h" |
20 | | #include "aom_dsp/aom_dsp_common.h" |
21 | | #include "aom_dsp/blend.h" |
22 | | |
23 | | #include "aom_dsp/x86/synonyms.h" |
24 | | |
25 | | #include "config/aom_dsp_rtcd.h" |
26 | | |
27 | | static inline void blend_a64_d16_mask_w4_sse41( |
28 | | uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, |
29 | | const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval, |
30 | 193k | int shift) { |
31 | 193k | const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); |
32 | 193k | const __m128i s0 = xx_loadl_64(src0); |
33 | 193k | const __m128i s1 = xx_loadl_64(src1); |
34 | 193k | const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1); |
35 | 193k | const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m); |
36 | 193k | const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m); |
37 | 193k | const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset); |
38 | 193k | const __m128i res_d = _mm_srai_epi32(res_c, shift); |
39 | 193k | const __m128i res_e = _mm_packs_epi32(res_d, res_d); |
40 | 193k | const __m128i res = _mm_packus_epi16(res_e, res_e); |
41 | | |
42 | 193k | xx_storel_32(dst, res); |
43 | 193k | } Unexecuted instantiation: blend_a64_mask_sse4.c:blend_a64_d16_mask_w4_sse41 blend_a64_mask_avx2.c:blend_a64_d16_mask_w4_sse41 Line | Count | Source | 30 | 193k | int shift) { | 31 | 193k | const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); | 32 | 193k | const __m128i s0 = xx_loadl_64(src0); | 33 | 193k | const __m128i s1 = xx_loadl_64(src1); | 34 | 193k | const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1); | 35 | 193k | const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m); | 36 | 193k | const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m); | 37 | 193k | const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset); | 38 | 193k | const __m128i res_d = _mm_srai_epi32(res_c, shift); | 39 | 193k | const __m128i res_e = _mm_packs_epi32(res_d, res_d); | 40 | 193k | const __m128i res = _mm_packus_epi16(res_e, res_e); | 41 | | | 42 | 193k | xx_storel_32(dst, res); | 43 | 193k | } |
|
44 | | |
45 | | static inline void blend_a64_d16_mask_w8_sse41( |
46 | | uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, |
47 | | const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval, |
48 | 615k | int shift) { |
49 | 615k | const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); |
50 | 615k | const __m128i s0 = xx_loadu_128(src0); |
51 | 615k | const __m128i s1 = xx_loadu_128(src1); |
52 | 615k | __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1), |
53 | 615k | _mm_unpacklo_epi16(*m, max_minus_m)); |
54 | 615k | __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1), |
55 | 615k | _mm_unpackhi_epi16(*m, max_minus_m)); |
56 | 615k | res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift); |
57 | 615k | res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift); |
58 | 615k | const __m128i res_e = _mm_packs_epi32(res_lo, res_hi); |
59 | 615k | const __m128i res = _mm_packus_epi16(res_e, res_e); |
60 | | |
61 | 615k | _mm_storel_epi64((__m128i *)(dst), res); |
62 | 615k | } Unexecuted instantiation: blend_a64_mask_sse4.c:blend_a64_d16_mask_w8_sse41 blend_a64_mask_avx2.c:blend_a64_d16_mask_w8_sse41 Line | Count | Source | 48 | 615k | int shift) { | 49 | 615k | const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); | 50 | 615k | const __m128i s0 = xx_loadu_128(src0); | 51 | 615k | const __m128i s1 = xx_loadu_128(src1); | 52 | 615k | __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1), | 53 | 615k | _mm_unpacklo_epi16(*m, max_minus_m)); | 54 | 615k | __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1), | 55 | 615k | _mm_unpackhi_epi16(*m, max_minus_m)); | 56 | 615k | res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift); | 57 | 615k | res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift); | 58 | 615k | const __m128i res_e = _mm_packs_epi32(res_lo, res_hi); | 59 | 615k | const __m128i res = _mm_packus_epi16(res_e, res_e); | 60 | | | 61 | 615k | _mm_storel_epi64((__m128i *)(dst), res); | 62 | 615k | } |
|
63 | | |
64 | | static inline void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( |
65 | | uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, |
66 | | uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, |
67 | | const uint8_t *mask, uint32_t mask_stride, int h, |
68 | 0 | const __m128i *round_offset, int shift) { |
69 | 0 | const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); |
70 | 0 | for (int i = 0; i < h; ++i) { |
71 | 0 | const __m128i m0 = xx_loadl_32(mask); |
72 | 0 | const __m128i m = _mm_cvtepu8_epi16(m0); |
73 | |
|
74 | 0 | blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, |
75 | 0 | shift); |
76 | 0 | mask += mask_stride; |
77 | 0 | dst += dst_stride; |
78 | 0 | src0 += src0_stride; |
79 | 0 | src1 += src1_stride; |
80 | 0 | } |
81 | 0 | } Unexecuted instantiation: blend_a64_mask_sse4.c:aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1 Unexecuted instantiation: blend_a64_mask_avx2.c:aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1 |
82 | | |
83 | | static inline void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( |
84 | | uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, |
85 | | uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, |
86 | | const uint8_t *mask, uint32_t mask_stride, int h, |
87 | 16.4k | const __m128i *round_offset, int shift) { |
88 | 16.4k | const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); |
89 | 218k | for (int i = 0; i < h; ++i) { |
90 | 201k | const __m128i m0 = xx_loadl_64(mask); |
91 | 201k | const __m128i m = _mm_cvtepu8_epi16(m0); |
92 | 201k | blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, |
93 | 201k | shift); |
94 | 201k | mask += mask_stride; |
95 | 201k | dst += dst_stride; |
96 | 201k | src0 += src0_stride; |
97 | 201k | src1 += src1_stride; |
98 | 201k | } |
99 | 16.4k | } Unexecuted instantiation: blend_a64_mask_sse4.c:aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1 blend_a64_mask_avx2.c:aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1 Line | Count | Source | 87 | 16.4k | const __m128i *round_offset, int shift) { | 88 | 16.4k | const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); | 89 | 218k | for (int i = 0; i < h; ++i) { | 90 | 201k | const __m128i m0 = xx_loadl_64(mask); | 91 | 201k | const __m128i m = _mm_cvtepu8_epi16(m0); | 92 | 201k | blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, | 93 | 201k | shift); | 94 | 201k | mask += mask_stride; | 95 | 201k | dst += dst_stride; | 96 | 201k | src0 += src0_stride; | 97 | 201k | src1 += src1_stride; | 98 | 201k | } | 99 | 16.4k | } |
|
100 | | |
101 | | static inline void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( |
102 | | uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, |
103 | | uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, |
104 | | const uint8_t *mask, uint32_t mask_stride, int h, |
105 | 31.5k | const __m128i *round_offset, int shift) { |
106 | 31.5k | const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); |
107 | 31.5k | const __m128i one_b = _mm_set1_epi8(1); |
108 | 31.5k | const __m128i two_w = _mm_set1_epi16(2); |
109 | 224k | for (int i = 0; i < h; ++i) { |
110 | 193k | const __m128i m_i0 = xx_loadl_64(mask); |
111 | 193k | const __m128i m_i1 = xx_loadl_64(mask + mask_stride); |
112 | 193k | const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); |
113 | 193k | const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b); |
114 | 193k | const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); |
115 | 193k | const __m128i m = _mm_srli_epi16(m_acbd_2, 2); |
116 | | |
117 | 193k | blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, |
118 | 193k | shift); |
119 | 193k | mask += mask_stride << 1; |
120 | 193k | dst += dst_stride; |
121 | 193k | src0 += src0_stride; |
122 | 193k | src1 += src1_stride; |
123 | 193k | } |
124 | 31.5k | } Unexecuted instantiation: blend_a64_mask_sse4.c:aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1 blend_a64_mask_avx2.c:aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1 Line | Count | Source | 105 | 31.5k | const __m128i *round_offset, int shift) { | 106 | 31.5k | const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); | 107 | 31.5k | const __m128i one_b = _mm_set1_epi8(1); | 108 | 31.5k | const __m128i two_w = _mm_set1_epi16(2); | 109 | 224k | for (int i = 0; i < h; ++i) { | 110 | 193k | const __m128i m_i0 = xx_loadl_64(mask); | 111 | 193k | const __m128i m_i1 = xx_loadl_64(mask + mask_stride); | 112 | 193k | const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); | 113 | 193k | const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b); | 114 | 193k | const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); | 115 | 193k | const __m128i m = _mm_srli_epi16(m_acbd_2, 2); | 116 | | | 117 | 193k | blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, | 118 | 193k | shift); | 119 | 193k | mask += mask_stride << 1; | 120 | 193k | dst += dst_stride; | 121 | 193k | src0 += src0_stride; | 122 | 193k | src1 += src1_stride; | 123 | 193k | } | 124 | 31.5k | } |
|
125 | | |
126 | | static inline void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( |
127 | | uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, |
128 | | uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, |
129 | | const uint8_t *mask, uint32_t mask_stride, int h, |
130 | 30.9k | const __m128i *round_offset, int shift) { |
131 | 30.9k | const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); |
132 | 30.9k | const __m128i one_b = _mm_set1_epi8(1); |
133 | 30.9k | const __m128i two_w = _mm_set1_epi16(2); |
134 | 443k | for (int i = 0; i < h; ++i) { |
135 | 412k | const __m128i m_i0 = xx_loadu_128(mask); |
136 | 412k | const __m128i m_i1 = xx_loadu_128(mask + mask_stride); |
137 | 412k | const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); |
138 | 412k | const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b); |
139 | 412k | const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); |
140 | 412k | const __m128i m = _mm_srli_epi16(m_acbd_2, 2); |
141 | | |
142 | 412k | blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, |
143 | 412k | shift); |
144 | 412k | mask += mask_stride << 1; |
145 | 412k | dst += dst_stride; |
146 | 412k | src0 += src0_stride; |
147 | 412k | src1 += src1_stride; |
148 | 412k | } |
149 | 30.9k | } Unexecuted instantiation: blend_a64_mask_sse4.c:aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1 blend_a64_mask_avx2.c:aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1 Line | Count | Source | 130 | 30.9k | const __m128i *round_offset, int shift) { | 131 | 30.9k | const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); | 132 | 30.9k | const __m128i one_b = _mm_set1_epi8(1); | 133 | 30.9k | const __m128i two_w = _mm_set1_epi16(2); | 134 | 443k | for (int i = 0; i < h; ++i) { | 135 | 412k | const __m128i m_i0 = xx_loadu_128(mask); | 136 | 412k | const __m128i m_i1 = xx_loadu_128(mask + mask_stride); | 137 | 412k | const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); | 138 | 412k | const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b); | 139 | 412k | const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); | 140 | 412k | const __m128i m = _mm_srli_epi16(m_acbd_2, 2); | 141 | | | 142 | 412k | blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, | 143 | 412k | shift); | 144 | 412k | mask += mask_stride << 1; | 145 | 412k | dst += dst_stride; | 146 | 412k | src0 += src0_stride; | 147 | 412k | src1 += src1_stride; | 148 | 412k | } | 149 | 30.9k | } |
|
150 | | |
151 | | static inline void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( |
152 | | uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, |
153 | | uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, |
154 | | const uint8_t *mask, uint32_t mask_stride, int h, |
155 | 36 | const __m128i *round_offset, int shift) { |
156 | 36 | const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); |
157 | 36 | const __m128i one_b = _mm_set1_epi8(1); |
158 | 36 | const __m128i zeros = _mm_setzero_si128(); |
159 | 324 | for (int i = 0; i < h; ++i) { |
160 | 288 | const __m128i m_i0 = xx_loadl_64(mask); |
161 | 288 | const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); |
162 | 288 | const __m128i m = _mm_avg_epu16(m_ac, zeros); |
163 | | |
164 | 288 | blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, |
165 | 288 | shift); |
166 | 288 | mask += mask_stride; |
167 | 288 | dst += dst_stride; |
168 | 288 | src0 += src0_stride; |
169 | 288 | src1 += src1_stride; |
170 | 288 | } |
171 | 36 | } Unexecuted instantiation: blend_a64_mask_sse4.c:aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1 blend_a64_mask_avx2.c:aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1 Line | Count | Source | 155 | 36 | const __m128i *round_offset, int shift) { | 156 | 36 | const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); | 157 | 36 | const __m128i one_b = _mm_set1_epi8(1); | 158 | 36 | const __m128i zeros = _mm_setzero_si128(); | 159 | 324 | for (int i = 0; i < h; ++i) { | 160 | 288 | const __m128i m_i0 = xx_loadl_64(mask); | 161 | 288 | const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); | 162 | 288 | const __m128i m = _mm_avg_epu16(m_ac, zeros); | 163 | | | 164 | 288 | blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, | 165 | 288 | shift); | 166 | 288 | mask += mask_stride; | 167 | 288 | dst += dst_stride; | 168 | 288 | src0 += src0_stride; | 169 | 288 | src1 += src1_stride; | 170 | 288 | } | 171 | 36 | } |
|
172 | | |
173 | | static inline void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( |
174 | | uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, |
175 | | uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, |
176 | | const uint8_t *mask, uint32_t mask_stride, int h, |
177 | 50 | const __m128i *round_offset, int shift) { |
178 | 50 | const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); |
179 | 50 | const __m128i one_b = _mm_set1_epi8(1); |
180 | 50 | const __m128i zeros = _mm_setzero_si128(); |
181 | 690 | for (int i = 0; i < h; ++i) { |
182 | 640 | const __m128i m_i0 = xx_loadu_128(mask); |
183 | 640 | const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); |
184 | 640 | const __m128i m = _mm_avg_epu16(m_ac, zeros); |
185 | | |
186 | 640 | blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, |
187 | 640 | shift); |
188 | 640 | mask += mask_stride; |
189 | 640 | dst += dst_stride; |
190 | 640 | src0 += src0_stride; |
191 | 640 | src1 += src1_stride; |
192 | 640 | } |
193 | 50 | } Unexecuted instantiation: blend_a64_mask_sse4.c:aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1 blend_a64_mask_avx2.c:aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1 Line | Count | Source | 177 | 50 | const __m128i *round_offset, int shift) { | 178 | 50 | const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); | 179 | 50 | const __m128i one_b = _mm_set1_epi8(1); | 180 | 50 | const __m128i zeros = _mm_setzero_si128(); | 181 | 690 | for (int i = 0; i < h; ++i) { | 182 | 640 | const __m128i m_i0 = xx_loadu_128(mask); | 183 | 640 | const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); | 184 | 640 | const __m128i m = _mm_avg_epu16(m_ac, zeros); | 185 | | | 186 | 640 | blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, | 187 | 640 | shift); | 188 | 640 | mask += mask_stride; | 189 | 640 | dst += dst_stride; | 190 | 640 | src0 += src0_stride; | 191 | 640 | src1 += src1_stride; | 192 | 640 | } | 193 | 50 | } |
|
194 | | static inline void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( |
195 | | uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, |
196 | | uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, |
197 | | const uint8_t *mask, uint32_t mask_stride, int h, |
198 | 0 | const __m128i *round_offset, int shift) { |
199 | 0 | const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); |
200 | 0 | const __m128i zeros = _mm_setzero_si128(); |
201 | 0 | for (int i = 0; i < h; ++i) { |
202 | 0 | const __m128i m_i0 = xx_loadl_64(mask); |
203 | 0 | const __m128i m_i1 = xx_loadl_64(mask + mask_stride); |
204 | 0 | const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); |
205 | 0 | const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros)); |
206 | |
|
207 | 0 | blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, |
208 | 0 | shift); |
209 | 0 | mask += mask_stride << 1; |
210 | 0 | dst += dst_stride; |
211 | 0 | src0 += src0_stride; |
212 | 0 | src1 += src1_stride; |
213 | 0 | } |
214 | 0 | } Unexecuted instantiation: blend_a64_mask_sse4.c:aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1 Unexecuted instantiation: blend_a64_mask_avx2.c:aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1 |
215 | | |
216 | | static inline void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( |
217 | | uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, |
218 | | uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, |
219 | | const uint8_t *mask, uint32_t mask_stride, int h, |
220 | 0 | const __m128i *round_offset, int shift) { |
221 | 0 | const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); |
222 | 0 | const __m128i zeros = _mm_setzero_si128(); |
223 | 0 | for (int i = 0; i < h; ++i) { |
224 | 0 | const __m128i m_i0 = xx_loadl_64(mask); |
225 | 0 | const __m128i m_i1 = xx_loadl_64(mask + mask_stride); |
226 | 0 | const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); |
227 | 0 | const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros)); |
228 | |
|
229 | 0 | blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, |
230 | 0 | shift); |
231 | 0 | mask += mask_stride << 1; |
232 | 0 | dst += dst_stride; |
233 | 0 | src0 += src0_stride; |
234 | 0 | src1 += src1_stride; |
235 | 0 | } |
236 | 0 | } Unexecuted instantiation: blend_a64_mask_sse4.c:aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1 Unexecuted instantiation: blend_a64_mask_avx2.c:aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1 |
237 | | #endif // AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ |