/src/aom/aom_dsp/x86/intrapred_sse4.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2021, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #include <emmintrin.h> // SSE2 |
13 | | #include <smmintrin.h> /* SSE4.1 */ |
14 | | |
15 | | #include "config/av1_rtcd.h" |
16 | | #include "aom_dsp/x86/intrapred_x86.h" |
17 | | #include "aom_dsp/x86/intrapred_utils.h" |
18 | | #include "aom_dsp/x86/lpf_common_sse2.h" |
19 | | |
20 | | // Low bit depth functions |
21 | | static DECLARE_ALIGNED(16, uint8_t, Mask[2][33][16]) = { |
22 | | { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
23 | | { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
24 | | { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
25 | | { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
26 | | { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
27 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
28 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
29 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
30 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 }, |
31 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, |
32 | | 0 }, |
33 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, |
34 | | 0 }, |
35 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, |
36 | | 0, 0 }, |
37 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, |
38 | | 0, 0, 0 }, |
39 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
40 | | 0xff, 0, 0, 0 }, |
41 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
42 | | 0xff, 0xff, 0, 0 }, |
43 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
44 | | 0xff, 0xff, 0xff, 0 }, |
45 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
46 | | 0xff, 0xff, 0xff, 0xff }, |
47 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
48 | | 0xff, 0xff, 0xff, 0xff }, |
49 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
50 | | 0xff, 0xff, 0xff, 0xff }, |
51 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
52 | | 0xff, 0xff, 0xff, 0xff }, |
53 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
54 | | 0xff, 0xff, 0xff, 0xff }, |
55 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
56 | | 0xff, 0xff, 0xff, 0xff }, |
57 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
58 | | 0xff, 0xff, 0xff, 0xff }, |
59 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
60 | | 0xff, 0xff, 0xff, 0xff }, |
61 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
62 | | 0xff, 0xff, 0xff, 0xff }, |
63 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
64 | | 0xff, 0xff, 0xff, 0xff }, |
65 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
66 | | 0xff, 0xff, 0xff, 0xff }, |
67 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
68 | | 0xff, 0xff, 0xff, 0xff }, |
69 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
70 | | 0xff, 0xff, 0xff, 0xff }, |
71 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
72 | | 0xff, 0xff, 0xff, 0xff }, |
73 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
74 | | 0xff, 0xff, 0xff, 0xff }, |
75 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
76 | | 0xff, 0xff, 0xff, 0xff }, |
77 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
78 | | 0xff, 0xff, 0xff, 0xff } }, |
79 | | { |
80 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
81 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
82 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
83 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
84 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
85 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
86 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
87 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
88 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
89 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
90 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
91 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
92 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
93 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
94 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
95 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
96 | | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
97 | | { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
98 | | { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
99 | | { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
100 | | { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
101 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
102 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
103 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
104 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, |
105 | | 0 }, |
106 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, |
107 | | 0 }, |
108 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, |
109 | | 0, 0 }, |
110 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, |
111 | | 0, 0, 0 }, |
112 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
113 | | 0, 0, 0, 0 }, |
114 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
115 | | 0xff, 0, 0, 0 }, |
116 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
117 | | 0xff, 0xff, 0, 0 }, |
118 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
119 | | 0xff, 0xff, 0xff, 0 }, |
120 | | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
121 | | 0xff, 0xff, 0xff, 0xff }, |
122 | | }, |
123 | | }; |
124 | | |
125 | | /* clang-format on */ |
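// Annotation (hedged, not part of the measured file): Mask[0][n] has 0xff in
// its first n byte lanes (all 16 once n >= 16), and Mask[1][n] extends the
// same selection to lanes 16-31 of a 32-wide row. A minimal sketch of how the
// kernels below consume the table:
static __m128i keep_first_n(__m128i fallback, __m128i computed, int n) {
  // lanes 0..n-1 come from 'computed', the rest from 'fallback'
  return _mm_blendv_epi8(fallback, computed, *(__m128i *)Mask[0][n]);
}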
126 | | static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_sse4_1( |
127 | | int H, int W, __m128i *dst, const uint8_t *above, int upsample_above, |
128 | 0 | int dx) { |
129 | 0 | const int frac_bits = 6 - upsample_above; |
130 | 0 | const int max_base_x = ((W + H) - 1) << upsample_above; |
131 | | |
132 | 0 | assert(dx > 0); |
133 | | // pre-filter above pixels |
134 | | // store in temp buffers: |
135 | | // above[x] * 32 + 16 |
136 | | // above[x+1] - above[x] |
137 | | // final pixels will be calculated as: |
138 | | // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 |
139 | 0 | __m128i a0, a1, a32, a16; |
140 | 0 | __m128i diff, c3f; |
141 | 0 | __m128i a_mbase_x; |
142 | | |
143 | 0 | a16 = _mm_set1_epi16(16); |
144 | 0 | a_mbase_x = _mm_set1_epi8((char)above[max_base_x]); |
145 | 0 | c3f = _mm_set1_epi16(0x3f); |
146 | | |
147 | 0 | int x = dx; |
148 | 0 | for (int r = 0; r < W; r++) { |
149 | 0 | __m128i b, res, res1, shift; |
150 | 0 | __m128i a0_above, a1_above; |
151 | | |
152 | 0 | int base = x >> frac_bits; |
153 | 0 | int base_max_diff = (max_base_x - base) >> upsample_above; |
154 | 0 | if (base_max_diff <= 0) { |
155 | 0 | for (int i = r; i < W; ++i) { |
156 | 0 | dst[i] = a_mbase_x; // fill remaining rows with the max-base pixel |
157 | 0 | } |
158 | 0 | return; |
159 | 0 | } |
160 | 0 | if (base_max_diff > H) base_max_diff = H; |
161 | 0 | a0_above = _mm_loadu_si128((__m128i *)(above + base)); |
162 | 0 | a1_above = _mm_loadu_si128((__m128i *)(above + base + 1)); |
163 | | |
164 | 0 | if (upsample_above) { |
165 | 0 | a0_above = _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[0]); |
166 | 0 | a1_above = _mm_srli_si128(a0_above, 8); |
167 | | |
168 | 0 | shift = _mm_srli_epi16( |
169 | 0 | _mm_and_si128(_mm_slli_epi16(_mm_set1_epi16(x), upsample_above), c3f), |
170 | 0 | 1); |
171 | 0 | } else { |
172 | 0 | shift = _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1); |
173 | 0 | } |
174 | | // lower half |
175 | 0 | a0 = _mm_cvtepu8_epi16(a0_above); |
176 | 0 | a1 = _mm_cvtepu8_epi16(a1_above); |
177 | | |
178 | 0 | diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x] |
179 | 0 | a32 = _mm_slli_epi16(a0, 5); // a[x] * 32 |
180 | 0 | a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 |
181 | | |
182 | 0 | b = _mm_mullo_epi16(diff, shift); |
183 | 0 | res = _mm_add_epi16(a32, b); |
184 | 0 | res = _mm_srli_epi16(res, 5); |
185 | | |
186 | | // upper half |
187 | 0 | a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8)); |
188 | 0 | a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8)); |
189 | | |
190 | 0 | diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x] |
191 | 0 | a32 = _mm_slli_epi16(a0, 5); // a[x] * 32 |
192 | 0 | a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 |
193 | | |
194 | 0 | b = _mm_mullo_epi16(diff, shift); |
195 | 0 | res1 = _mm_add_epi16(a32, b); |
196 | 0 | res1 = _mm_srli_epi16(res1, 5); |
197 | | |
198 | 0 | res = _mm_packus_epi16(res, res1); |
199 | | |
200 | 0 | dst[r] = |
201 | 0 | _mm_blendv_epi8(a_mbase_x, res, *(__m128i *)Mask[0][base_max_diff]); |
202 | 0 | x += dx; |
203 | 0 | } |
204 | 0 | } |
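// Annotation (hedged scalar sketch, upsampling ignored): the vector math in
// dr_prediction_z1_HxW_internal_sse4_1 reduces to the per-pixel form below.
// Row r sits at x = (r + 1) * dx in 6-bit fixed point; lanes whose base
// passes max_base_x are clamped to the last edge pixel.
static void z1_scalar_sketch(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                             const uint8_t *above, int dx) {
  const int max_base_x = bw + bh - 1;
  for (int r = 0; r < bh; ++r) {
    const int x = (r + 1) * dx;
    const int shift = (x & 0x3f) >> 1;  // 5-bit interpolation weight
    for (int c = 0; c < bw; ++c) {
      const int base = (x >> 6) + c;
      dst[r * stride + c] =
          (base >= max_base_x)
              ? above[max_base_x]  // past the edge: replicate last pixel
              : (uint8_t)((above[base] * 32 + 16 +
                           (above[base + 1] - above[base]) * shift) >> 5);
    }
  }
}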
205 | | |
206 | | static void dr_prediction_z1_4xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride, |
207 | | const uint8_t *above, |
208 | 0 | int upsample_above, int dx) { |
209 | 0 | __m128i dstvec[16]; |
210 | | |
211 | 0 | dr_prediction_z1_HxW_internal_sse4_1(4, N, dstvec, above, upsample_above, dx); |
212 | 0 | for (int i = 0; i < N; i++) { |
213 | 0 | *(int *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]); |
214 | 0 | } |
215 | 0 | } |
216 | | |
217 | | static void dr_prediction_z1_8xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride, |
218 | | const uint8_t *above, |
219 | 0 | int upsample_above, int dx) { |
220 | 0 | __m128i dstvec[32]; |
221 | | |
222 | 0 | dr_prediction_z1_HxW_internal_sse4_1(8, N, dstvec, above, upsample_above, dx); |
223 | 0 | for (int i = 0; i < N; i++) { |
224 | 0 | _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]); |
225 | 0 | } |
226 | 0 | } |
227 | | |
228 | | static void dr_prediction_z1_16xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride, |
229 | | const uint8_t *above, |
230 | 0 | int upsample_above, int dx) { |
231 | 0 | __m128i dstvec[64]; |
232 | |
|
233 | 0 | dr_prediction_z1_HxW_internal_sse4_1(16, N, dstvec, above, upsample_above, |
234 | 0 | dx); |
235 | 0 | for (int i = 0; i < N; i++) { |
236 | 0 | _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]); |
237 | 0 | } |
238 | 0 | } |
239 | | |
240 | | static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_sse4_1( |
241 | | int N, __m128i *dstvec, __m128i *dstvec_h, const uint8_t *above, |
242 | 0 | int upsample_above, int dx) { |
243 | | // here upsample_above is 0 by design of av1_use_intra_edge_upsample |
244 | 0 | (void)upsample_above; |
245 | 0 | const int frac_bits = 6; |
246 | 0 | const int max_base_x = ((32 + N) - 1); |
247 | | |
248 | | // pre-filter above pixels |
249 | | // store in temp buffers: |
250 | | // above[x] * 32 + 16 |
251 | | // above[x+1] - above[x] |
252 | | // final pixels will be calculated as: |
253 | | // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 |
254 | 0 | __m128i a0, a1, a32, a16; |
255 | 0 | __m128i a_mbase_x, diff, c3f; |
256 | | |
257 | 0 | a16 = _mm_set1_epi16(16); |
258 | 0 | a_mbase_x = _mm_set1_epi8((char)above[max_base_x]); |
259 | 0 | c3f = _mm_set1_epi16(0x3f); |
260 | | |
261 | 0 | int x = dx; |
262 | 0 | for (int r = 0; r < N; r++) { |
263 | 0 | __m128i b, res, res1, res16[2]; |
264 | 0 | __m128i a0_above, a1_above; |
265 | | |
266 | 0 | int base = x >> frac_bits; |
267 | 0 | int base_max_diff = (max_base_x - base); |
268 | 0 | if (base_max_diff <= 0) { |
269 | 0 | for (int i = r; i < N; ++i) { |
270 | 0 | dstvec[i] = a_mbase_x; // save 32 values |
271 | 0 | dstvec_h[i] = a_mbase_x; |
272 | 0 | } |
273 | 0 | return; |
274 | 0 | } |
275 | 0 | if (base_max_diff > 32) base_max_diff = 32; |
276 | 0 | __m128i shift = _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1); |
277 | | |
278 | 0 | for (int j = 0, jj = 0; j < 32; j += 16, jj++) { |
279 | 0 | int mdiff = base_max_diff - j; |
280 | 0 | if (mdiff <= 0) { |
281 | 0 | res16[jj] = a_mbase_x; |
282 | 0 | } else { |
283 | 0 | a0_above = _mm_loadu_si128((__m128i *)(above + base + j)); |
284 | 0 | a1_above = _mm_loadu_si128((__m128i *)(above + base + j + 1)); |
285 | | |
286 | | // lower half |
287 | 0 | a0 = _mm_cvtepu8_epi16(a0_above); |
288 | 0 | a1 = _mm_cvtepu8_epi16(a1_above); |
289 | | |
290 | 0 | diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x] |
291 | 0 | a32 = _mm_slli_epi16(a0, 5); // a[x] * 32 |
292 | 0 | a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 |
293 | 0 | b = _mm_mullo_epi16(diff, shift); |
294 | | |
295 | 0 | res = _mm_add_epi16(a32, b); |
296 | 0 | res = _mm_srli_epi16(res, 5); |
297 | | |
298 | | // upper half |
299 | 0 | a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8)); |
300 | 0 | a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8)); |
301 | | |
302 | 0 | diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x] |
303 | 0 | a32 = _mm_slli_epi16(a0, 5); // a[x] * 32 |
304 | 0 | a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 |
305 | | |
306 | 0 | b = _mm_mullo_epi16(diff, shift); |
307 | 0 | res1 = _mm_add_epi16(a32, b); |
308 | 0 | res1 = _mm_srli_epi16(res1, 5); |
309 | | |
310 | 0 | res16[jj] = _mm_packus_epi16(res, res1); // 16 8bit values |
311 | 0 | } |
312 | 0 | } |
313 | | |
314 | 0 | dstvec[r] = |
315 | 0 | _mm_blendv_epi8(a_mbase_x, res16[0], |
316 | 0 | *(__m128i *)Mask[0][base_max_diff]); // 16 8bit values |
317 | | |
318 | 0 | dstvec_h[r] = |
319 | 0 | _mm_blendv_epi8(a_mbase_x, res16[1], |
320 | 0 | *(__m128i *)Mask[1][base_max_diff]); // 16 8bit values |
321 | 0 | x += dx; |
322 | 0 | } |
323 | 0 | } |
324 | | |
325 | | static void dr_prediction_z1_32xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride, |
326 | | const uint8_t *above, |
327 | 0 | int upsample_above, int dx) { |
328 | 0 | __m128i dstvec[64], dstvec_h[64]; |
329 | 0 | dr_prediction_z1_32xN_internal_sse4_1(N, dstvec, dstvec_h, above, |
330 | 0 | upsample_above, dx); |
331 | 0 | for (int i = 0; i < N; i++) { |
332 | 0 | _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]); |
333 | 0 | _mm_storeu_si128((__m128i *)(dst + stride * i + 16), dstvec_h[i]); |
334 | 0 | } |
335 | 0 | } |
336 | | |
337 | | static void dr_prediction_z1_64xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride, |
338 | | const uint8_t *above, |
339 | 0 | int upsample_above, int dx) { |
340 | | // here upsample_above is 0 by design of av1_use_intra_edge_upsample |
341 | 0 | (void)upsample_above; |
342 | 0 | const int frac_bits = 6; |
343 | 0 | const int max_base_x = ((64 + N) - 1); |
344 | | |
345 | | // pre-filter above pixels |
346 | | // store in temp buffers: |
347 | | // above[x] * 32 + 16 |
348 | | // above[x+1] - above[x] |
349 | | // final pixels will be calculated as: |
350 | | // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 |
351 | 0 | __m128i a0, a1, a32, a16; |
352 | 0 | __m128i a_mbase_x, diff, c3f; |
353 | 0 | __m128i max_base, base_inc, mask; |
354 | | |
355 | 0 | a16 = _mm_set1_epi16(16); |
356 | 0 | a_mbase_x = _mm_set1_epi8((char)above[max_base_x]); |
357 | 0 | max_base = _mm_set1_epi8(max_base_x); |
358 | 0 | c3f = _mm_set1_epi16(0x3f); |
359 | | |
360 | 0 | int x = dx; |
361 | 0 | for (int r = 0; r < N; r++, dst += stride) { |
362 | 0 | __m128i b, res, res1; |
363 | 0 | int base = x >> frac_bits; |
364 | 0 | if (base >= max_base_x) { |
365 | 0 | for (int i = r; i < N; ++i) { |
366 | 0 | _mm_storeu_si128((__m128i *)dst, a_mbase_x); // fill the 64-pixel row |
367 | 0 | _mm_storeu_si128((__m128i *)(dst + 16), a_mbase_x); |
368 | 0 | _mm_storeu_si128((__m128i *)(dst + 32), a_mbase_x); |
369 | 0 | _mm_storeu_si128((__m128i *)(dst + 48), a_mbase_x); |
370 | 0 | dst += stride; |
371 | 0 | } |
372 | 0 | return; |
373 | 0 | } |
374 | | |
375 | 0 | __m128i shift = |
376 | 0 | _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1); // 8 elements |
377 | | |
378 | 0 | __m128i a0_above, a1_above, res_val; |
379 | 0 | for (int j = 0; j < 64; j += 16) { |
380 | 0 | int mdif = max_base_x - (base + j); |
381 | 0 | if (mdif <= 0) { |
382 | 0 | _mm_storeu_si128((__m128i *)(dst + j), a_mbase_x); |
383 | 0 | } else { |
384 | 0 | a0_above = |
385 | 0 | _mm_loadu_si128((__m128i *)(above + base + j)); // load 16 elements |
386 | 0 | a1_above = _mm_loadu_si128((__m128i *)(above + base + 1 + j)); |
387 | | |
388 | | // lower half |
389 | 0 | a0 = _mm_cvtepu8_epi16(a0_above); |
390 | 0 | a1 = _mm_cvtepu8_epi16(a1_above); |
391 | | |
392 | 0 | diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x] |
393 | 0 | a32 = _mm_slli_epi16(a0, 5); // a[x] * 32 |
394 | 0 | a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 |
395 | 0 | b = _mm_mullo_epi16(diff, shift); |
396 | | |
397 | 0 | res = _mm_add_epi16(a32, b); |
398 | 0 | res = _mm_srli_epi16(res, 5); |
399 | | |
400 | | // upper half |
401 | 0 | a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8)); |
402 | 0 | a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8)); |
403 | | |
404 | 0 | diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x] |
405 | 0 | a32 = _mm_slli_epi16(a0, 5); // a[x] * 32 |
406 | 0 | a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 |
407 | | |
408 | 0 | b = _mm_mullo_epi16(diff, shift); |
409 | 0 | res1 = _mm_add_epi16(a32, b); |
410 | 0 | res1 = _mm_srli_epi16(res1, 5); |
411 | | |
412 | 0 | res = _mm_packus_epi16(res, res1); // 16 8bit values |
413 | | |
414 | 0 | base_inc = |
415 | 0 | _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1), |
416 | 0 | (int8_t)(base + j + 2), (int8_t)(base + j + 3), |
417 | 0 | (int8_t)(base + j + 4), (int8_t)(base + j + 5), |
418 | 0 | (int8_t)(base + j + 6), (int8_t)(base + j + 7), |
419 | 0 | (int8_t)(base + j + 8), (int8_t)(base + j + 9), |
420 | 0 | (int8_t)(base + j + 10), (int8_t)(base + j + 11), |
421 | 0 | (int8_t)(base + j + 12), (int8_t)(base + j + 13), |
422 | 0 | (int8_t)(base + j + 14), (int8_t)(base + j + 15)); |
423 | | |
424 | 0 | mask = _mm_cmpgt_epi8(_mm_subs_epu8(max_base, base_inc), |
425 | 0 | _mm_setzero_si128()); |
426 | 0 | res_val = _mm_blendv_epi8(a_mbase_x, res, mask); |
427 | 0 | _mm_storeu_si128((__m128i *)(dst + j), res_val); |
428 | 0 | } |
429 | 0 | } |
430 | 0 | x += dx; |
431 | 0 | } |
432 | 0 | } |
433 | | |
434 | | // Directional prediction, zone 1: 0 < angle < 90 |
435 | | void av1_dr_prediction_z1_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh, |
436 | | const uint8_t *above, const uint8_t *left, |
437 | 0 | int upsample_above, int dx, int dy) { |
438 | 0 | (void)left; |
439 | 0 | (void)dy; |
440 | 0 | switch (bw) { |
441 | 0 | case 4: |
442 | 0 | dr_prediction_z1_4xN_sse4_1(bh, dst, stride, above, upsample_above, dx); |
443 | 0 | break; |
444 | 0 | case 8: |
445 | 0 | dr_prediction_z1_8xN_sse4_1(bh, dst, stride, above, upsample_above, dx); |
446 | 0 | break; |
447 | 0 | case 16: |
448 | 0 | dr_prediction_z1_16xN_sse4_1(bh, dst, stride, above, upsample_above, dx); |
449 | 0 | break; |
450 | 0 | case 32: |
451 | 0 | dr_prediction_z1_32xN_sse4_1(bh, dst, stride, above, upsample_above, dx); |
452 | 0 | break; |
453 | 0 | case 64: |
454 | 0 | dr_prediction_z1_64xN_sse4_1(bh, dst, stride, above, upsample_above, dx); |
455 | 0 | break; |
456 | 0 | default: assert(0 && "Invalid block size"); |
457 | 0 | } |
458 | 0 | return; |
459 | 0 | } |
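// Annotation (hedged usage sketch, not from this file): a direct call for one
// 16x16 zone-1 block. In libaom these kernels are normally reached through
// the av1_rtcd dispatch table; the caller prepares the padded reference rows.
// dx = 64 advances one full pixel per row, i.e. the 45-degree diagonal, and
// dy is unused in zone 1.
static void z1_usage_sketch(uint8_t *pred /* 16x16 */,
                            const uint8_t *above_row,
                            const uint8_t *left_col) {
  av1_dr_prediction_z1_sse4_1(pred, /*stride=*/16, /*bw=*/16, /*bh=*/16,
                              above_row, left_col, /*upsample_above=*/0,
                              /*dx=*/64, /*dy=*/0);
}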
460 | | |
461 | | static void dr_prediction_z2_Nx4_sse4_1(int N, uint8_t *dst, ptrdiff_t stride, |
462 | | const uint8_t *above, |
463 | | const uint8_t *left, int upsample_above, |
464 | 0 | int upsample_left, int dx, int dy) { |
465 | 0 | const int min_base_x = -(1 << upsample_above); |
466 | 0 | const int min_base_y = -(1 << upsample_left); |
467 | 0 | const int frac_bits_x = 6 - upsample_above; |
468 | 0 | const int frac_bits_y = 6 - upsample_left; |
469 | | |
470 | 0 | assert(dx > 0); |
471 | | // pre-filter above pixels |
472 | | // store in temp buffers: |
473 | | // above[x] * 32 + 16 |
474 | | // above[x+1] - above[x] |
475 | | // final pixels will be calculated as: |
476 | | // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 |
477 | 0 | __m128i a0_x, a1_x, a32, diff; |
478 | | |
479 | 0 | const __m128i c3f = _mm_set1_epi16(0x3f); |
480 | 0 | const __m128i min_y_base = _mm_set1_epi16(min_base_y); |
481 | 0 | const __m128i c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0); |
482 | 0 | const __m128i dy_reg = _mm_set1_epi16(dy); |
483 | 0 | const __m128i a16 = _mm_set1_epi16(16); |
484 | | |
485 | 0 | for (int r = 0; r < N; r++) { |
486 | 0 | __m128i b, res, shift, r6, ydx; |
487 | 0 | __m128i resx, resy, resxy; |
488 | 0 | __m128i a0_above, a1_above; |
489 | 0 | int y = r + 1; |
490 | 0 | int base_x = (-y * dx) >> frac_bits_x; |
491 | 0 | int base_shift = 0; |
492 | 0 | if (base_x < (min_base_x - 1)) { |
493 | 0 | base_shift = (min_base_x - base_x - 1) >> upsample_above; |
494 | 0 | } |
495 | 0 | int base_min_diff = |
496 | 0 | (min_base_x - base_x + upsample_above) >> upsample_above; |
497 | 0 | if (base_min_diff > 4) { |
498 | 0 | base_min_diff = 4; |
499 | 0 | } else { |
500 | 0 | if (base_min_diff < 0) base_min_diff = 0; |
501 | 0 | } |
502 | | |
503 | 0 | if (base_shift > 3) { |
504 | 0 | a0_x = _mm_setzero_si128(); |
505 | 0 | a1_x = _mm_setzero_si128(); |
506 | 0 | shift = _mm_setzero_si128(); |
507 | 0 | } else { |
508 | 0 | a0_above = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); |
509 | 0 | ydx = _mm_set1_epi16(y * dx); |
510 | 0 | r6 = _mm_slli_epi16(c1234, 6); |
511 | | |
512 | 0 | if (upsample_above) { |
513 | 0 | a0_above = |
514 | 0 | _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[base_shift]); |
515 | 0 | a1_above = _mm_srli_si128(a0_above, 8); |
516 | | |
517 | 0 | shift = _mm_srli_epi16( |
518 | 0 | _mm_and_si128( |
519 | 0 | _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f), |
520 | 0 | 1); |
521 | 0 | } else { |
522 | 0 | a0_above = |
523 | 0 | _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]); |
524 | 0 | a1_above = _mm_srli_si128(a0_above, 1); |
525 | | |
526 | 0 | shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1); |
527 | 0 | } |
528 | 0 | a0_x = _mm_cvtepu8_epi16(a0_above); |
529 | 0 | a1_x = _mm_cvtepu8_epi16(a1_above); |
530 | 0 | } |
531 | | // y calc |
532 | 0 | __m128i a0_y, a1_y, shifty; |
533 | 0 | if (base_x < min_base_x) { |
534 | 0 | DECLARE_ALIGNED(32, int16_t, base_y_c[8]); |
535 | 0 | __m128i y_c, base_y_c_reg, mask, c1234_; |
536 | 0 | c1234_ = _mm_srli_si128(c1234, 2); |
537 | 0 | r6 = _mm_set1_epi16(r << 6); |
538 | 0 | y_c = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy_reg)); |
539 | 0 | base_y_c_reg = _mm_srai_epi16(y_c, frac_bits_y); |
540 | 0 | mask = _mm_cmpgt_epi16(min_y_base, base_y_c_reg); |
541 | 0 | base_y_c_reg = _mm_andnot_si128(mask, base_y_c_reg); |
542 | 0 | _mm_store_si128((__m128i *)base_y_c, base_y_c_reg); |
543 | | |
544 | 0 | a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], |
545 | 0 | left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0); |
546 | 0 | base_y_c_reg = _mm_add_epi16(base_y_c_reg, _mm_srli_epi16(a16, 4)); |
547 | 0 | _mm_store_si128((__m128i *)base_y_c, base_y_c_reg); |
548 | 0 | a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], |
549 | 0 | left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0); |
550 | | |
551 | 0 | if (upsample_left) { |
552 | 0 | shifty = _mm_srli_epi16( |
553 | 0 | _mm_and_si128(_mm_slli_epi16(y_c, upsample_left), c3f), 1); |
554 | 0 | } else { |
555 | 0 | shifty = _mm_srli_epi16(_mm_and_si128(y_c, c3f), 1); |
556 | 0 | } |
557 | 0 | a0_x = _mm_unpacklo_epi64(a0_x, a0_y); |
558 | 0 | a1_x = _mm_unpacklo_epi64(a1_x, a1_y); |
559 | 0 | shift = _mm_unpacklo_epi64(shift, shifty); |
560 | 0 | } |
561 | | |
562 | 0 | diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] |
563 | 0 | a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32 |
564 | 0 | a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 |
565 | | |
566 | 0 | b = _mm_mullo_epi16(diff, shift); |
567 | 0 | res = _mm_add_epi16(a32, b); |
568 | 0 | res = _mm_srli_epi16(res, 5); |
569 | | |
570 | 0 | resx = _mm_packus_epi16(res, res); |
571 | 0 | resy = _mm_srli_si128(resx, 4); |
572 | | |
573 | 0 | resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]); |
574 | 0 | *(int *)(dst) = _mm_cvtsi128_si32(resxy); |
575 | 0 | dst += stride; |
576 | 0 | } |
577 | 0 | } |
578 | | |
579 | | static void dr_prediction_z2_Nx8_sse4_1(int N, uint8_t *dst, ptrdiff_t stride, |
580 | | const uint8_t *above, |
581 | | const uint8_t *left, int upsample_above, |
582 | 0 | int upsample_left, int dx, int dy) { |
583 | 0 | const int min_base_x = -(1 << upsample_above); |
584 | 0 | const int min_base_y = -(1 << upsample_left); |
585 | 0 | const int frac_bits_x = 6 - upsample_above; |
586 | 0 | const int frac_bits_y = 6 - upsample_left; |
587 | | |
588 | | // pre-filter above pixels |
589 | | // store in temp buffers: |
590 | | // above[x] * 32 + 16 |
591 | | // above[x+1] - above[x] |
592 | | // final pixels will be calculated as: |
593 | | // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 |
594 | 0 | __m128i diff, a32; |
595 | 0 | __m128i a0_x, a1_x, a0_y, a1_y; |
596 | 0 | __m128i a0_above, a1_above; |
597 | | |
598 | 0 | const __m128i a16 = _mm_set1_epi16(16); |
599 | 0 | const __m128i c3f = _mm_set1_epi16(0x3f); |
600 | 0 | const __m128i min_y_base = _mm_set1_epi16(min_base_y); |
601 | 0 | const __m128i dy_reg = _mm_set1_epi16(dy); |
602 | 0 | const __m128i c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); |
603 | | |
604 | 0 | for (int r = 0; r < N; r++) { |
605 | 0 | __m128i b, res, res1, shift; |
606 | 0 | __m128i resx, resy, resxy, r6, ydx; |
607 | | |
608 | 0 | int y = r + 1; |
609 | 0 | int base_x = (-y * dx) >> frac_bits_x; |
610 | 0 | int base_shift = 0; |
611 | 0 | if (base_x < (min_base_x - 1)) { |
612 | 0 | base_shift = (min_base_x - base_x - 1) >> upsample_above; |
613 | 0 | } |
614 | 0 | int base_min_diff = |
615 | 0 | (min_base_x - base_x + upsample_above) >> upsample_above; |
616 | 0 | if (base_min_diff > 8) { |
617 | 0 | base_min_diff = 8; |
618 | 0 | } else { |
619 | 0 | if (base_min_diff < 0) base_min_diff = 0; |
620 | 0 | } |
621 | | |
622 | 0 | if (base_shift > 7) { |
623 | 0 | resx = _mm_setzero_si128(); |
624 | 0 | } else { |
625 | 0 | a0_above = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); |
626 | 0 | ydx = _mm_set1_epi16(y * dx); |
627 | 0 | r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6); |
628 | 0 | if (upsample_above) { |
629 | 0 | a0_above = |
630 | 0 | _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[base_shift]); |
631 | 0 | a1_above = _mm_srli_si128(a0_above, 8); |
632 | | |
633 | 0 | shift = _mm_srli_epi16( |
634 | 0 | _mm_and_si128( |
635 | 0 | _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f), |
636 | 0 | 1); |
637 | 0 | } else { |
638 | 0 | a1_above = _mm_srli_si128(a0_above, 1); |
639 | 0 | a0_above = |
640 | 0 | _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]); |
641 | 0 | a1_above = |
642 | 0 | _mm_shuffle_epi8(a1_above, *(__m128i *)LoadMaskx[base_shift]); |
643 | | |
644 | 0 | shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1); |
645 | 0 | } |
646 | 0 | a0_x = _mm_cvtepu8_epi16(a0_above); |
647 | 0 | a1_x = _mm_cvtepu8_epi16(a1_above); |
648 | | |
649 | 0 | diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] |
650 | 0 | a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32 |
651 | 0 | a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 |
652 | | |
653 | 0 | b = _mm_mullo_epi16(diff, shift); |
654 | 0 | res = _mm_add_epi16(a32, b); |
655 | 0 | res = _mm_srli_epi16(res, 5); |
656 | 0 | resx = _mm_packus_epi16(res, res); |
657 | 0 | } |
658 | | |
659 | | // y calc |
660 | 0 | if (base_x < min_base_x) { |
661 | 0 | DECLARE_ALIGNED(32, int16_t, base_y_c[16]); |
662 | 0 | __m128i y_c, base_y_c_reg, mask; |
663 | 0 | r6 = _mm_set1_epi16(r << 6); |
664 | 0 | y_c = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy_reg)); |
665 | 0 | base_y_c_reg = _mm_srai_epi16(y_c, frac_bits_y); |
666 | 0 | mask = _mm_cmpgt_epi16(min_y_base, base_y_c_reg); |
667 | 0 | base_y_c_reg = _mm_andnot_si128(mask, base_y_c_reg); |
668 | 0 | _mm_store_si128((__m128i *)base_y_c, base_y_c_reg); |
669 | | |
670 | 0 | a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], |
671 | 0 | left[base_y_c[2]], left[base_y_c[3]], |
672 | 0 | left[base_y_c[4]], left[base_y_c[5]], |
673 | 0 | left[base_y_c[6]], left[base_y_c[7]]); |
674 | 0 | base_y_c_reg = _mm_add_epi16(base_y_c_reg, _mm_srli_epi16(a16, 4)); |
675 | 0 | _mm_store_si128((__m128i *)base_y_c, base_y_c_reg); |
676 | | |
677 | 0 | a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], |
678 | 0 | left[base_y_c[2]], left[base_y_c[3]], |
679 | 0 | left[base_y_c[4]], left[base_y_c[5]], |
680 | 0 | left[base_y_c[6]], left[base_y_c[7]]); |
681 | | |
682 | 0 | if (upsample_left) { |
683 | 0 | shift = _mm_srli_epi16( |
684 | 0 | _mm_and_si128(_mm_slli_epi16(y_c, upsample_left), c3f), 1); |
685 | 0 | } else { |
686 | 0 | shift = _mm_srli_epi16(_mm_and_si128(y_c, c3f), 1); |
687 | 0 | } |
688 | | |
689 | 0 | diff = _mm_sub_epi16(a1_y, a0_y); // a[x+1] - a[x] |
690 | 0 | a32 = _mm_slli_epi16(a0_y, 5); // a[x] * 32 |
691 | 0 | a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 |
692 | | |
693 | 0 | b = _mm_mullo_epi16(diff, shift); |
694 | 0 | res1 = _mm_add_epi16(a32, b); |
695 | 0 | res1 = _mm_srli_epi16(res1, 5); |
696 | |
|
697 | 0 | resy = _mm_packus_epi16(res1, res1); |
698 | 0 | resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]); |
699 | 0 | _mm_storel_epi64((__m128i *)dst, resxy); |
700 | 0 | } else { |
701 | 0 | _mm_storel_epi64((__m128i *)dst, resx); |
702 | 0 | } |
703 | |
|
704 | 0 | dst += stride; |
705 | 0 | } |
706 | 0 | } |
707 | | |
708 | | static void dr_prediction_z2_HxW_sse4_1(int H, int W, uint8_t *dst, |
709 | | ptrdiff_t stride, const uint8_t *above, |
710 | | const uint8_t *left, int upsample_above, |
711 | 0 | int upsample_left, int dx, int dy) { |
712 | | // here upsample_above and upsample_left are 0 by design of |
713 | | // av1_use_intra_edge_upsample |
714 | 0 | const int min_base_x = -1; |
715 | 0 | const int min_base_y = -1; |
716 | 0 | (void)upsample_above; |
717 | 0 | (void)upsample_left; |
718 | 0 | const int frac_bits_x = 6; |
719 | 0 | const int frac_bits_y = 6; |
720 | | |
721 | 0 | __m128i a0_x, a1_x, a0_y, a1_y, a0_y_h, a1_y_h, a32; |
722 | 0 | __m128i diff, shifty, shifty_h; |
723 | 0 | __m128i a0_above, a1_above; |
724 | | |
725 | 0 | DECLARE_ALIGNED(32, int16_t, base_y_c[16]); |
726 | 0 | const __m128i a16 = _mm_set1_epi16(16); |
727 | 0 | const __m128i c1 = _mm_srli_epi16(a16, 4); |
728 | 0 | const __m128i min_y_base = _mm_set1_epi16(min_base_y); |
729 | 0 | const __m128i c3f = _mm_set1_epi16(0x3f); |
730 | 0 | const __m128i dy256 = _mm_set1_epi16(dy); |
731 | 0 | const __m128i c0123 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); |
732 | 0 | const __m128i c0123_h = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15); |
733 | 0 | const __m128i c1234 = _mm_add_epi16(c0123, c1); |
734 | 0 | const __m128i c1234_h = _mm_add_epi16(c0123_h, c1); |
735 | | |
736 | 0 | for (int r = 0; r < H; r++) { |
737 | 0 | __m128i b, res, res1, shift, reg_j, r6, ydx; |
738 | 0 | __m128i resx, resy; |
739 | 0 | __m128i resxy; |
740 | 0 | int y = r + 1; |
741 | 0 | ydx = _mm_set1_epi16((int16_t)(y * dx)); |
742 | | |
743 | 0 | int base_x = (-y * dx) >> frac_bits_x; |
744 | 0 | for (int j = 0; j < W; j += 16) { |
745 | 0 | reg_j = _mm_set1_epi16(j); |
746 | 0 | int base_shift = 0; |
747 | 0 | if ((base_x + j) < (min_base_x - 1)) { |
748 | 0 | base_shift = (min_base_x - (base_x + j) - 1); |
749 | 0 | } |
750 | 0 | int base_min_diff = (min_base_x - base_x - j); |
751 | 0 | if (base_min_diff > 16) { |
752 | 0 | base_min_diff = 16; |
753 | 0 | } else { |
754 | 0 | if (base_min_diff < 0) base_min_diff = 0; |
755 | 0 | } |
756 | | |
757 | 0 | if (base_shift < 16) { |
758 | 0 | a0_above = |
759 | 0 | _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j)); |
760 | 0 | a1_above = |
761 | 0 | _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j)); |
762 | 0 | a0_above = |
763 | 0 | _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]); |
764 | 0 | a1_above = |
765 | 0 | _mm_shuffle_epi8(a1_above, *(__m128i *)LoadMaskx[base_shift]); |
766 | | |
767 | 0 | a0_x = _mm_cvtepu8_epi16(a0_above); |
768 | 0 | a1_x = _mm_cvtepu8_epi16(a1_above); |
769 | | |
770 | 0 | r6 = _mm_slli_epi16(_mm_add_epi16(c0123, reg_j), 6); |
771 | 0 | shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1); |
772 | | |
773 | 0 | diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] |
774 | 0 | a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32 |
775 | 0 | a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 |
776 | | |
777 | 0 | b = _mm_mullo_epi16(diff, shift); |
778 | 0 | res = _mm_add_epi16(a32, b); |
779 | 0 | res = _mm_srli_epi16(res, 5); // 16 16-bit values |
780 | | |
781 | 0 | a0_x = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8)); |
782 | 0 | a1_x = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8)); |
783 | | |
784 | 0 | r6 = _mm_slli_epi16(_mm_add_epi16(c0123_h, reg_j), 6); |
785 | 0 | shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1); |
786 | | |
787 | 0 | diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] |
788 | 0 | a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32 |
789 | 0 | a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 |
790 | | |
791 | 0 | b = _mm_mullo_epi16(diff, shift); |
792 | 0 | res1 = _mm_add_epi16(a32, b); |
793 | 0 | res1 = _mm_srli_epi16(res1, 5); // 16 16-bit values |
794 | | |
795 | 0 | resx = _mm_packus_epi16(res, res1); |
796 | 0 | } else { |
797 | 0 | resx = _mm_setzero_si128(); |
798 | 0 | } |
799 | | |
800 | | // y calc |
801 | 0 | if (base_x < min_base_x) { |
802 | 0 | __m128i c_reg, c_reg_h, y_reg, y_reg_h, base_y, base_y_h; |
803 | 0 | __m128i mask, mask_h, mul16, mul16_h; |
804 | 0 | r6 = _mm_set1_epi16(r << 6); |
805 | 0 | c_reg = _mm_add_epi16(reg_j, c1234); |
806 | 0 | c_reg_h = _mm_add_epi16(reg_j, c1234_h); |
807 | 0 | mul16 = _mm_min_epu16(_mm_mullo_epi16(c_reg, dy256), |
808 | 0 | _mm_srli_epi16(min_y_base, 1)); |
809 | 0 | mul16_h = _mm_min_epu16(_mm_mullo_epi16(c_reg_h, dy256), |
810 | 0 | _mm_srli_epi16(min_y_base, 1)); |
811 | 0 | y_reg = _mm_sub_epi16(r6, mul16); |
812 | 0 | y_reg_h = _mm_sub_epi16(r6, mul16_h); |
813 | | |
814 | 0 | base_y = _mm_srai_epi16(y_reg, frac_bits_y); |
815 | 0 | base_y_h = _mm_srai_epi16(y_reg_h, frac_bits_y); |
816 | 0 | mask = _mm_cmpgt_epi16(min_y_base, base_y); |
817 | 0 | mask_h = _mm_cmpgt_epi16(min_y_base, base_y_h); |
818 | | |
819 | 0 | base_y = _mm_blendv_epi8(base_y, min_y_base, mask); |
820 | 0 | base_y_h = _mm_blendv_epi8(base_y_h, min_y_base, mask_h); |
821 | 0 | int16_t min_y = (int16_t)_mm_extract_epi16(base_y_h, 7); |
822 | 0 | int16_t max_y = (int16_t)_mm_extract_epi16(base_y, 0); |
823 | 0 | int16_t offset_diff = max_y - min_y; |
824 | | |
825 | 0 | if (offset_diff < 16) { |
826 | 0 | __m128i min_y_reg = _mm_set1_epi16(min_y); |
827 | | |
828 | 0 | __m128i base_y_offset = _mm_sub_epi16(base_y, min_y_reg); |
829 | 0 | __m128i base_y_offset_h = _mm_sub_epi16(base_y_h, min_y_reg); |
830 | 0 | __m128i y_offset = _mm_packs_epi16(base_y_offset, base_y_offset_h); |
831 | | |
832 | 0 | __m128i a0_mask = _mm_loadu_si128((__m128i *)(left + min_y)); |
833 | 0 | __m128i a1_mask = _mm_loadu_si128((__m128i *)(left + min_y + 1)); |
834 | 0 | __m128i LoadMask = |
835 | 0 | _mm_loadu_si128((__m128i *)(LoadMaskz2[offset_diff / 4])); |
836 | | |
837 | 0 | a0_mask = _mm_and_si128(a0_mask, LoadMask); |
838 | 0 | a1_mask = _mm_and_si128(a1_mask, LoadMask); |
839 | | |
840 | 0 | a0_mask = _mm_shuffle_epi8(a0_mask, y_offset); |
841 | 0 | a1_mask = _mm_shuffle_epi8(a1_mask, y_offset); |
842 | 0 | a0_y = _mm_cvtepu8_epi16(a0_mask); |
843 | 0 | a1_y = _mm_cvtepu8_epi16(a1_mask); |
844 | 0 | a0_y_h = _mm_cvtepu8_epi16(_mm_srli_si128(a0_mask, 8)); |
845 | 0 | a1_y_h = _mm_cvtepu8_epi16(_mm_srli_si128(a1_mask, 8)); |
846 | 0 | } else { |
847 | 0 | base_y = _mm_andnot_si128(mask, base_y); |
848 | 0 | base_y_h = _mm_andnot_si128(mask_h, base_y_h); |
849 | 0 | _mm_store_si128((__m128i *)base_y_c, base_y); |
850 | 0 | _mm_store_si128((__m128i *)&base_y_c[8], base_y_h); |
851 | | |
852 | 0 | a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], |
853 | 0 | left[base_y_c[2]], left[base_y_c[3]], |
854 | 0 | left[base_y_c[4]], left[base_y_c[5]], |
855 | 0 | left[base_y_c[6]], left[base_y_c[7]]); |
856 | 0 | a0_y_h = _mm_setr_epi16(left[base_y_c[8]], left[base_y_c[9]], |
857 | 0 | left[base_y_c[10]], left[base_y_c[11]], |
858 | 0 | left[base_y_c[12]], left[base_y_c[13]], |
859 | 0 | left[base_y_c[14]], left[base_y_c[15]]); |
860 | 0 | base_y = _mm_add_epi16(base_y, c1); |
861 | 0 | base_y_h = _mm_add_epi16(base_y_h, c1); |
862 | 0 | _mm_store_si128((__m128i *)base_y_c, base_y); |
863 | 0 | _mm_store_si128((__m128i *)&base_y_c[8], base_y_h); |
864 | | |
865 | 0 | a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], |
866 | 0 | left[base_y_c[2]], left[base_y_c[3]], |
867 | 0 | left[base_y_c[4]], left[base_y_c[5]], |
868 | 0 | left[base_y_c[6]], left[base_y_c[7]]); |
869 | 0 | a1_y_h = _mm_setr_epi16(left[base_y_c[8]], left[base_y_c[9]], |
870 | 0 | left[base_y_c[10]], left[base_y_c[11]], |
871 | 0 | left[base_y_c[12]], left[base_y_c[13]], |
872 | 0 | left[base_y_c[14]], left[base_y_c[15]]); |
873 | 0 | } |
874 | 0 | shifty = _mm_srli_epi16(_mm_and_si128(y_reg, c3f), 1); |
875 | 0 | shifty_h = _mm_srli_epi16(_mm_and_si128(y_reg_h, c3f), 1); |
876 | | |
877 | 0 | diff = _mm_sub_epi16(a1_y, a0_y); // a[x+1] - a[x] |
878 | 0 | a32 = _mm_slli_epi16(a0_y, 5); // a[x] * 32 |
879 | 0 | a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 |
880 | | |
881 | 0 | b = _mm_mullo_epi16(diff, shifty); |
882 | 0 | res = _mm_add_epi16(a32, b); |
883 | 0 | res = _mm_srli_epi16(res, 5); // 16 16-bit values |
884 | | |
885 | 0 | diff = _mm_sub_epi16(a1_y_h, a0_y_h); // a[x+1] - a[x] |
886 | 0 | a32 = _mm_slli_epi16(a0_y_h, 5); // a[x] * 32 |
887 | 0 | a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 |
888 | | |
889 | 0 | b = _mm_mullo_epi16(diff, shifty_h); |
890 | 0 | res1 = _mm_add_epi16(a32, b); |
891 | 0 | res1 = _mm_srli_epi16(res1, 5); // 16 16-bit values |
892 | 0 | resy = _mm_packus_epi16(res, res1); |
893 | 0 | } else { |
894 | 0 | resy = _mm_setzero_si128(); |
895 | 0 | } |
896 | 0 | resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]); |
897 | 0 | _mm_storeu_si128((__m128i *)(dst + j), resxy); |
898 | 0 | } // for j |
899 | 0 | dst += stride; |
900 | 0 | } |
901 | 0 | } |
902 | | |
903 | | // Directional prediction, zone 2: 90 < angle < 180 |
904 | | void av1_dr_prediction_z2_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh, |
905 | | const uint8_t *above, const uint8_t *left, |
906 | | int upsample_above, int upsample_left, int dx, |
907 | 0 | int dy) { |
908 | 0 | assert(dx > 0); |
909 | 0 | assert(dy > 0); |
910 | 0 | switch (bw) { |
911 | 0 | case 4: |
912 | 0 | dr_prediction_z2_Nx4_sse4_1(bh, dst, stride, above, left, upsample_above, |
913 | 0 | upsample_left, dx, dy); |
914 | 0 | break; |
915 | 0 | case 8: |
916 | 0 | dr_prediction_z2_Nx8_sse4_1(bh, dst, stride, above, left, upsample_above, |
917 | 0 | upsample_left, dx, dy); |
918 | 0 | break; |
919 | 0 | default: |
920 | 0 | dr_prediction_z2_HxW_sse4_1(bh, bw, dst, stride, above, left, |
921 | 0 | upsample_above, upsample_left, dx, dy); |
922 | 0 | } |
923 | 0 | return; |
924 | 0 | } |
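// Annotation (hedged scalar sketch, upsampling ignored so min_base_x = -1,
// edge clamping omitted): per pixel, the zone-2 kernels above implement this
// selection rule. Columns whose projection falls left of 'above' interpolate
// along 'left' instead; index -1 is the shared top-left corner pixel.
static uint8_t z2_pixel_sketch(const uint8_t *above, const uint8_t *left,
                               int r, int c, int dx, int dy) {
  const int x = (c << 6) - (r + 1) * dx;  // projection onto the top edge
  if ((x >> 6) >= -1) {
    const int base = x >> 6;
    const int shift = (x & 0x3f) >> 1;
    return (uint8_t)((above[base] * 32 + 16 +
                      (above[base + 1] - above[base]) * shift) >> 5);
  }
  const int y = (r << 6) - (c + 1) * dy;  // projection onto the left edge
  const int base = y >> 6;
  const int shift = (y & 0x3f) >> 1;
  return (uint8_t)((left[base] * 32 + 16 +
                    (left[base + 1] - left[base]) * shift) >> 5);
}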
925 | | |
926 | | // z3 functions |
927 | | static void dr_prediction_z3_4x4_sse4_1(uint8_t *dst, ptrdiff_t stride, |
928 | | const uint8_t *left, int upsample_left, |
929 | 0 | int dy) { |
930 | 0 | __m128i dstvec[4], d[4]; |
931 | | |
932 | 0 | dr_prediction_z1_HxW_internal_sse4_1(4, 4, dstvec, left, upsample_left, dy); |
933 | 0 | transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], |
934 | 0 | &d[0], &d[1], &d[2], &d[3]); |
935 | | |
936 | 0 | *(int *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]); |
937 | 0 | *(int *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]); |
938 | 0 | *(int *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]); |
939 | 0 | *(int *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]); |
940 | 0 | return; |
941 | 0 | } |
942 | | |
943 | | static void dr_prediction_z3_8x8_sse4_1(uint8_t *dst, ptrdiff_t stride, |
944 | | const uint8_t *left, int upsample_left, |
945 | 0 | int dy) { |
946 | 0 | __m128i dstvec[8], d[8]; |
947 | | |
948 | 0 | dr_prediction_z1_HxW_internal_sse4_1(8, 8, dstvec, left, upsample_left, dy); |
949 | 0 | transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], |
950 | 0 | &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2], |
951 | 0 | &d[3]); |
952 | | |
953 | 0 | _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); |
954 | 0 | _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8)); |
955 | 0 | _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]); |
956 | 0 | _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8)); |
957 | 0 | _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]); |
958 | 0 | _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8)); |
959 | 0 | _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]); |
960 | 0 | _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8)); |
961 | 0 | } |
962 | | |
963 | | static void dr_prediction_z3_4x8_sse4_1(uint8_t *dst, ptrdiff_t stride, |
964 | | const uint8_t *left, int upsample_left, |
965 | 0 | int dy) { |
966 | 0 | __m128i dstvec[4], d[8]; |
967 | | |
968 | 0 | dr_prediction_z1_HxW_internal_sse4_1(8, 4, dstvec, left, upsample_left, dy); |
969 | 0 | transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0], |
970 | 0 | &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); |
971 | 0 | for (int i = 0; i < 8; i++) { |
972 | 0 | *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]); |
973 | 0 | } |
974 | 0 | } |
975 | | |
976 | | static void dr_prediction_z3_8x4_sse4_1(uint8_t *dst, ptrdiff_t stride, |
977 | | const uint8_t *left, int upsample_left, |
978 | 0 | int dy) { |
979 | 0 | __m128i dstvec[8], d[4]; |
980 | | |
981 | 0 | dr_prediction_z1_HxW_internal_sse4_1(4, 8, dstvec, left, upsample_left, dy); |
982 | 0 | transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], |
983 | 0 | &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0], |
984 | 0 | &d[1], &d[2], &d[3]); |
985 | 0 | _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); |
986 | 0 | _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]); |
987 | 0 | _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]); |
988 | 0 | _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]); |
989 | 0 | } |
990 | | |
991 | | static void dr_prediction_z3_8x16_sse4_1(uint8_t *dst, ptrdiff_t stride, |
992 | | const uint8_t *left, int upsample_left, |
993 | 0 | int dy) { |
994 | 0 | __m128i dstvec[8], d[8]; |
995 | | |
996 | 0 | dr_prediction_z1_HxW_internal_sse4_1(16, 8, dstvec, left, upsample_left, dy); |
997 | 0 | transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3, |
998 | 0 | dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d, |
999 | 0 | d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7); |
1000 | 0 | for (int i = 0; i < 8; i++) { |
1001 | 0 | _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]); |
1002 | 0 | _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride), |
1003 | 0 | _mm_srli_si128(d[i], 8)); |
1004 | 0 | } |
1005 | 0 | } |
1006 | | |
1007 | | static void dr_prediction_z3_16x8_sse4_1(uint8_t *dst, ptrdiff_t stride, |
1008 | | const uint8_t *left, int upsample_left, |
1009 | 0 | int dy) { |
1010 | 0 | __m128i dstvec[16], d[16]; |
1011 | | |
1012 | 0 | dr_prediction_z1_HxW_internal_sse4_1(8, 16, dstvec, left, upsample_left, dy); |
1013 | 0 | transpose16x8_8x16_sse2( |
1014 | 0 | &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], |
1015 | 0 | &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], |
1016 | 0 | &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], |
1017 | 0 | &d[3], &d[4], &d[5], &d[6], &d[7]); |
1018 | | |
1019 | 0 | for (int i = 0; i < 8; i++) { |
1020 | 0 | _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); |
1021 | 0 | } |
1022 | 0 | } |
1023 | | |
1024 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
1025 | | static void dr_prediction_z3_4x16_sse4_1(uint8_t *dst, ptrdiff_t stride, |
1026 | | const uint8_t *left, int upsample_left, |
1027 | 0 | int dy) { |
1028 | 0 | __m128i dstvec[4], d[16]; |
1029 | | |
1030 | 0 | dr_prediction_z1_HxW_internal_sse4_1(16, 4, dstvec, left, upsample_left, dy); |
1031 | 0 | transpose4x16_sse2(dstvec, d); |
1032 | 0 | for (int i = 0; i < 16; i++) { |
1033 | 0 | *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]); |
1034 | 0 | } |
1035 | 0 | } |
1036 | | |
1037 | | static void dr_prediction_z3_16x4_sse4_1(uint8_t *dst, ptrdiff_t stride, |
1038 | | const uint8_t *left, int upsample_left, |
1039 | 0 | int dy) { |
1040 | 0 | __m128i dstvec[16], d[8]; |
1041 | | |
1042 | 0 | dr_prediction_z1_HxW_internal_sse4_1(4, 16, dstvec, left, upsample_left, dy); |
1043 | 0 | for (int i = 4; i < 8; i++) { |
1044 | 0 | d[i] = _mm_setzero_si128(); |
1045 | 0 | } |
1046 | 0 | transpose16x8_8x16_sse2( |
1047 | 0 | &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], |
1048 | 0 | &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], |
1049 | 0 | &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], |
1050 | 0 | &d[3], &d[4], &d[5], &d[6], &d[7]); |
1051 | | |
1052 | 0 | for (int i = 0; i < 4; i++) { |
1053 | 0 | _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); |
1054 | 0 | } |
1055 | 0 | } |
1056 | | |
1057 | | static void dr_prediction_z3_8x32_sse4_1(uint8_t *dst, ptrdiff_t stride, |
1058 | | const uint8_t *left, int upsample_left, |
1059 | 0 | int dy) { |
1060 | 0 | __m128i dstvec[16], d[16], dstvec_h[16], d_h[16]; |
1061 | | |
1062 | 0 | dr_prediction_z1_32xN_internal_sse4_1(8, dstvec, dstvec_h, left, |
1063 | 0 | upsample_left, dy); |
1064 | 0 | for (int i = 8; i < 16; i++) { |
1065 | 0 | dstvec[i] = _mm_setzero_si128(); |
1066 | 0 | dstvec_h[i] = _mm_setzero_si128(); |
1067 | 0 | } |
1068 | 0 | transpose16x16_sse2(dstvec, d); |
1069 | 0 | transpose16x16_sse2(dstvec_h, d_h); |
1070 | | |
1071 | 0 | for (int i = 0; i < 16; i++) { |
1072 | 0 | _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]); |
1073 | 0 | } |
1074 | 0 | for (int i = 0; i < 16; i++) { |
1075 | 0 | _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride), d_h[i]); |
1076 | 0 | } |
1077 | 0 | } |
1078 | | |
1079 | | static void dr_prediction_z3_32x8_sse4_1(uint8_t *dst, ptrdiff_t stride, |
1080 | | const uint8_t *left, int upsample_left, |
1081 | 0 | int dy) { |
1082 | 0 | __m128i dstvec[32], d[16]; |
1083 | | |
1084 | 0 | dr_prediction_z1_HxW_internal_sse4_1(8, 32, dstvec, left, upsample_left, dy); |
1085 | | |
1086 | 0 | transpose16x8_8x16_sse2( |
1087 | 0 | &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], |
1088 | 0 | &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], |
1089 | 0 | &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], |
1090 | 0 | &d[3], &d[4], &d[5], &d[6], &d[7]); |
1091 | 0 | transpose16x8_8x16_sse2( |
1092 | 0 | &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16], |
1093 | 0 | &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16], |
1094 | 0 | &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16], |
1095 | 0 | &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16], |
1096 | 0 | &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8], |
1097 | 0 | &d[6 + 8], &d[7 + 8]); |
1098 | | |
1099 | 0 | for (int i = 0; i < 8; i++) { |
1100 | 0 | _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); |
1101 | 0 | _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]); |
1102 | 0 | } |
1103 | 0 | } |
1104 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
1105 | | |
1106 | | static void dr_prediction_z3_16x16_sse4_1(uint8_t *dst, ptrdiff_t stride, |
1107 | | const uint8_t *left, |
1108 | 0 | int upsample_left, int dy) { |
1109 | 0 | __m128i dstvec[16], d[16]; |
1110 | | |
1111 | 0 | dr_prediction_z1_HxW_internal_sse4_1(16, 16, dstvec, left, upsample_left, dy); |
1112 | 0 | transpose16x16_sse2(dstvec, d); |
1113 | | |
1114 | 0 | for (int i = 0; i < 16; i++) { |
1115 | 0 | _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); |
1116 | 0 | } |
1117 | 0 | } |
1118 | | |
1119 | | static void dr_prediction_z3_32x32_sse4_1(uint8_t *dst, ptrdiff_t stride, |
1120 | | const uint8_t *left, |
1121 | 0 | int upsample_left, int dy) { |
1122 | 0 | __m128i dstvec[32], d[32], dstvec_h[32], d_h[32]; |
1123 | | |
1124 | 0 | dr_prediction_z1_32xN_internal_sse4_1(32, dstvec, dstvec_h, left, |
1125 | 0 | upsample_left, dy); |
1126 | 0 | transpose16x16_sse2(dstvec, d); |
1127 | 0 | transpose16x16_sse2(dstvec_h, d_h); |
1128 | 0 | transpose16x16_sse2(dstvec + 16, d + 16); |
1129 | 0 | transpose16x16_sse2(dstvec_h + 16, d_h + 16); |
1130 | 0 | for (int j = 0; j < 16; j++) { |
1131 | 0 | _mm_storeu_si128((__m128i *)(dst + j * stride), d[j]); |
1132 | 0 | _mm_storeu_si128((__m128i *)(dst + j * stride + 16), d[j + 16]); |
1133 | 0 | } |
1134 | 0 | for (int j = 0; j < 16; j++) { |
1135 | 0 | _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), d_h[j]); |
1136 | 0 | _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16), d_h[j + 16]); |
1137 | 0 | } |
1138 | 0 | } |
1139 | | |
1140 | | static void dr_prediction_z3_64x64_sse4_1(uint8_t *dst, ptrdiff_t stride, |
1141 | | const uint8_t *left, |
1142 | 0 | int upsample_left, int dy) { |
1143 | 0 | uint8_t dstT[64 * 64]; |
1144 | 0 | dr_prediction_z1_64xN_sse4_1(64, dstT, 64, left, upsample_left, dy); |
1145 | 0 | transpose(dstT, 64, dst, stride, 64, 64); |
1146 | 0 | } |
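// Annotation (hedged): this transpose trick is the pattern behind every z3
// helper in this file. Zone 3 projects onto the left edge, so it equals the
// zone-1 prediction computed over 'left' with the block transposed. Scalar
// sketch, upsampling and edge clamping omitted:
static void z3_scalar_sketch(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                             const uint8_t *left, int dy) {
  for (int c = 0; c < bw; ++c) {
    const int y = (c + 1) * dy;        // column c steps along 'left'
    const int shift = (y & 0x3f) >> 1;
    for (int r = 0; r < bh; ++r) {
      const int base = (y >> 6) + r;
      dst[r * stride + c] =
          (uint8_t)((left[base] * 32 + 16 +
                     (left[base + 1] - left[base]) * shift) >> 5);
    }
  }
}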
1147 | | |
1148 | | static void dr_prediction_z3_16x32_sse4_1(uint8_t *dst, ptrdiff_t stride, |
1149 | | const uint8_t *left, |
1150 | 0 | int upsample_left, int dy) { |
1151 | 0 | __m128i dstvec[16], d[16], dstvec_h[16], d_h[16]; |
1152 | | |
1153 | 0 | dr_prediction_z1_32xN_internal_sse4_1(16, dstvec, dstvec_h, left, |
1154 | 0 | upsample_left, dy); |
1155 | 0 | transpose16x16_sse2(dstvec, d); |
1156 | 0 | transpose16x16_sse2(dstvec_h, d_h); |
1157 | | // store |
1158 | 0 | for (int j = 0; j < 16; j++) { |
1159 | 0 | _mm_storeu_si128((__m128i *)(dst + j * stride), d[j]); |
1160 | 0 | _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), d_h[j]); |
1161 | 0 | } |
1162 | 0 | } |
1163 | | |
1164 | | static void dr_prediction_z3_32x16_sse4_1(uint8_t *dst, ptrdiff_t stride, |
1165 | | const uint8_t *left, |
1166 | 0 | int upsample_left, int dy) { |
1167 | 0 | __m128i dstvec[32], d[16]; |
1168 | | |
1169 | 0 | dr_prediction_z1_HxW_internal_sse4_1(16, 32, dstvec, left, upsample_left, dy); |
1170 | 0 | for (int i = 0; i < 32; i += 16) { |
1171 | 0 | transpose16x16_sse2((dstvec + i), d); |
1172 | 0 | for (int j = 0; j < 16; j++) { |
1173 | 0 | _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]); |
1174 | 0 | } |
1175 | 0 | } |
1176 | 0 | } |
1177 | | |
1178 | | static void dr_prediction_z3_32x64_sse4_1(uint8_t *dst, ptrdiff_t stride, |
1179 | | const uint8_t *left, |
1180 | 0 | int upsample_left, int dy) { |
1181 | 0 | uint8_t dstT[64 * 32]; |
1182 | 0 | dr_prediction_z1_64xN_sse4_1(32, dstT, 64, left, upsample_left, dy); |
1183 | 0 | transpose(dstT, 64, dst, stride, 32, 64); |
1184 | 0 | } |
1185 | | |
1186 | | static void dr_prediction_z3_64x32_sse4_1(uint8_t *dst, ptrdiff_t stride, |
1187 | | const uint8_t *left, |
1188 | 0 | int upsample_left, int dy) { |
1189 | 0 | uint8_t dstT[32 * 64]; |
1190 | 0 | dr_prediction_z1_32xN_sse4_1(64, dstT, 32, left, upsample_left, dy); |
1191 | 0 | transpose(dstT, 32, dst, stride, 64, 32); |
1192 | 0 | return; |
1193 | 0 | } |
1194 | | |
1195 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
1196 | | static void dr_prediction_z3_16x64_sse4_1(uint8_t *dst, ptrdiff_t stride, |
1197 | | const uint8_t *left, |
1198 | 0 | int upsample_left, int dy) { |
1199 | 0 | uint8_t dstT[64 * 16]; |
1200 | 0 | dr_prediction_z1_64xN_sse4_1(16, dstT, 64, left, upsample_left, dy); |
1201 | 0 | transpose(dstT, 64, dst, stride, 16, 64); |
1202 | 0 | } |
1203 | | |
1204 | | static void dr_prediction_z3_64x16_sse4_1(uint8_t *dst, ptrdiff_t stride, |
1205 | | const uint8_t *left, |
1206 | 0 | int upsample_left, int dy) { |
1207 | 0 | __m128i dstvec[64], d[16]; |
1208 | | |
1209 | 0 | dr_prediction_z1_HxW_internal_sse4_1(16, 64, dstvec, left, upsample_left, dy); |
1210 | 0 | for (int i = 0; i < 64; i += 16) { |
1211 | 0 | transpose16x16_sse2(dstvec + i, d); |
1212 | 0 | for (int j = 0; j < 16; j++) { |
1213 | 0 | _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]); |
1214 | 0 | } |
1215 | 0 | } |
1216 | 0 | } |
1217 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
1218 | | |
1219 | | void av1_dr_prediction_z3_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh, |
1220 | | const uint8_t *above, const uint8_t *left, |
1221 | 0 | int upsample_left, int dx, int dy) { |
1222 | 0 | (void)above; |
1223 | 0 | (void)dx; |
1224 | 0 | assert(dx == 1); |
1225 | 0 | assert(dy > 0); |
1226 | | |
1227 | 0 | if (bw == bh) { |
1228 | 0 | switch (bw) { |
1229 | 0 | case 4: |
1230 | 0 | dr_prediction_z3_4x4_sse4_1(dst, stride, left, upsample_left, dy); |
1231 | 0 | break; |
1232 | 0 | case 8: |
1233 | 0 | dr_prediction_z3_8x8_sse4_1(dst, stride, left, upsample_left, dy); |
1234 | 0 | break; |
1235 | 0 | case 16: |
1236 | 0 | dr_prediction_z3_16x16_sse4_1(dst, stride, left, upsample_left, dy); |
1237 | 0 | break; |
1238 | 0 | case 32: |
1239 | 0 | dr_prediction_z3_32x32_sse4_1(dst, stride, left, upsample_left, dy); |
1240 | 0 | break; |
1241 | 0 | case 64: |
1242 | 0 | dr_prediction_z3_64x64_sse4_1(dst, stride, left, upsample_left, dy); |
1243 | 0 | break; |
1244 | 0 | default: assert(0 && "Invalid block size"); |
1245 | 0 | } |
1246 | 0 | } else { |
1247 | 0 | if (bw < bh) { |
1248 | 0 | if (bw + bw == bh) { |
1249 | 0 | switch (bw) { |
1250 | 0 | case 4: |
1251 | 0 | dr_prediction_z3_4x8_sse4_1(dst, stride, left, upsample_left, dy); |
1252 | 0 | break; |
1253 | 0 | case 8: |
1254 | 0 | dr_prediction_z3_8x16_sse4_1(dst, stride, left, upsample_left, dy); |
1255 | 0 | break; |
1256 | 0 | case 16: |
1257 | 0 | dr_prediction_z3_16x32_sse4_1(dst, stride, left, upsample_left, dy); |
1258 | 0 | break; |
1259 | 0 | case 32: |
1260 | 0 | dr_prediction_z3_32x64_sse4_1(dst, stride, left, upsample_left, dy); |
1261 | 0 | break; |
1262 | 0 | default: assert(0 && "Invalid block size"); |
1263 | 0 | } |
1264 | 0 | } else { |
1265 | 0 | switch (bw) { |
1266 | 0 | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
1267 | 0 | case 4: |
1268 | 0 | dr_prediction_z3_4x16_sse4_1(dst, stride, left, upsample_left, dy); |
1269 | 0 | break; |
1270 | 0 | case 8: |
1271 | 0 | dr_prediction_z3_8x32_sse4_1(dst, stride, left, upsample_left, dy); |
1272 | 0 | break; |
1273 | 0 | case 16: |
1274 | 0 | dr_prediction_z3_16x64_sse4_1(dst, stride, left, upsample_left, dy); |
1275 | 0 | break; |
1276 | 0 | default: assert(0 && "Invalid block size"); |
1277 | 0 | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
1278 | 0 | } |
1279 | 0 | } |
1280 | 0 | } else { |
1281 | 0 | if (bh + bh == bw) { |
1282 | 0 | switch (bh) { |
1283 | 0 | case 4: |
1284 | 0 | dr_prediction_z3_8x4_sse4_1(dst, stride, left, upsample_left, dy); |
1285 | 0 | break; |
1286 | 0 | case 8: |
1287 | 0 | dr_prediction_z3_16x8_sse4_1(dst, stride, left, upsample_left, dy); |
1288 | 0 | break; |
1289 | 0 | case 16: |
1290 | 0 | dr_prediction_z3_32x16_sse4_1(dst, stride, left, upsample_left, dy); |
1291 | 0 | break; |
1292 | 0 | case 32: |
1293 | 0 | dr_prediction_z3_64x32_sse4_1(dst, stride, left, upsample_left, dy); |
1294 | 0 | break; |
1295 | 0 | default: assert(0 && "Invalid block size"); |
1296 | 0 | } |
1297 | 0 | } else { |
1298 | 0 | switch (bh) { |
1299 | 0 | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
1300 | 0 | case 4: |
1301 | 0 | dr_prediction_z3_16x4_sse4_1(dst, stride, left, upsample_left, dy); |
1302 | 0 | break; |
1303 | 0 | case 8: |
1304 | 0 | dr_prediction_z3_32x8_sse4_1(dst, stride, left, upsample_left, dy); |
1305 | 0 | break; |
1306 | 0 | case 16: |
1307 | 0 | dr_prediction_z3_64x16_sse4_1(dst, stride, left, upsample_left, dy); |
1308 | 0 | break; |
1309 | 0 | default: assert(0 && "Invalid block size"); |
1310 | 0 | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
1311 | 0 | } |
1312 | 0 | } |
1313 | 0 | } |
1314 | 0 | } |
1315 | 0 | } |
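// Annotation (hedged dispatch sketch): the three entry points partition the
// directional intra angles; dx/dy come from av1's derivative tables for the
// chosen angle. Note z3 asserts dx == 1 and z1/z2 assert their deltas > 0.
static void dr_predict_sketch(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                              const uint8_t *above, const uint8_t *left,
                              int angle, int dx, int dy) {
  if (angle > 0 && angle < 90) {            // zone 1: only 'above' is read
    av1_dr_prediction_z1_sse4_1(dst, stride, bw, bh, above, left,
                                /*upsample_above=*/0, dx, dy);
  } else if (angle > 90 && angle < 180) {   // zone 2: both edges
    av1_dr_prediction_z2_sse4_1(dst, stride, bw, bh, above, left,
                                /*upsample_above=*/0, /*upsample_left=*/0,
                                dx, dy);
  } else if (angle > 180 && angle < 270) {  // zone 3: only 'left' is read
    av1_dr_prediction_z3_sse4_1(dst, stride, bw, bh, above, left,
                                /*upsample_left=*/0, /*dx=*/1, dy);
  }
}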