/src/aom/av1/common/x86/cfl_ssse3.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2017, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
#include <string.h>
#include <tmmintrin.h>

#include "config/av1_rtcd.h"

#include "av1/common/cfl.h"

#include "av1/common/x86/cfl_simd.h"
19 | | |
// Load a 32-bit integer from memory into the lowest element of the returned
// vector. _mm_cvtsi32_si128() zeroes the remaining (upper) elements.
//
// The value is fetched with memcpy() instead of dereferencing a casted int
// pointer: callers in this file pass addresses derived from uint8_t pixel
// pointers, so a direct dereference would be a potentially misaligned access
// through an incompatible type. Compilers typically lower this fixed-size
// memcpy() to a single 32-bit load.
static inline __m128i _mm_loadh_epi32(__m128i const *mem_addr) {
  int val;
  memcpy(&val, mem_addr, sizeof(val));
  return _mm_cvtsi32_si128(val);
}
24 | | |
// Store the lowest 32-bit element of a into memory.
//
// NOTE: mem_addr is const-qualified in the signature although the function
// writes through it; the qualifier is kept so the interface stays unchanged.
// The write goes through memcpy() instead of a casted int pointer so that it
// is valid for any alignment and does not access the destination through an
// incompatible type. Exactly 4 bytes are written.
static inline void _mm_storeh_epi32(__m128i const *mem_addr, __m128i a) {
  const int val = _mm_cvtsi128_si32(a);
  memcpy((void *)mem_addr, &val, sizeof(val));
}
29 | | |
30 | | /** |
31 | | * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more |
32 | | * precise version of a box filter 4:2:0 pixel subsampling in Q3. |
33 | | * |
34 | | * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the |
35 | | * active area is specified using width and height. |
36 | | * |
37 | | * Note: We don't need to worry about going over the active area, as long as we |
38 | | * stay inside the CfL prediction buffer. |
39 | | */ |
40 | | static inline void cfl_luma_subsampling_420_lbd_ssse3(const uint8_t *input, |
41 | | int input_stride, |
42 | | uint16_t *pred_buf_q3, |
43 | 914k | int width, int height) { |
44 | 914k | const __m128i twos = _mm_set1_epi8(2); |
45 | 914k | __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; |
46 | 914k | const __m128i *end = pred_buf_m128i + (height >> 1) * CFL_BUF_LINE_I128; |
47 | 914k | const int luma_stride = input_stride << 1; |
48 | 3.73M | do { |
49 | 3.73M | if (width == 4) { |
50 | 1.78M | __m128i top = _mm_loadh_epi32((__m128i *)input); |
51 | 1.78M | top = _mm_maddubs_epi16(top, twos); |
52 | 1.78M | __m128i bot = _mm_loadh_epi32((__m128i *)(input + input_stride)); |
53 | 1.78M | bot = _mm_maddubs_epi16(bot, twos); |
54 | 1.78M | const __m128i sum = _mm_add_epi16(top, bot); |
55 | 1.78M | _mm_storeh_epi32(pred_buf_m128i, sum); |
56 | 1.95M | } else if (width == 8) { |
57 | 957k | __m128i top = _mm_loadl_epi64((__m128i *)input); |
58 | 957k | top = _mm_maddubs_epi16(top, twos); |
59 | 957k | __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride)); |
60 | 957k | bot = _mm_maddubs_epi16(bot, twos); |
61 | 957k | const __m128i sum = _mm_add_epi16(top, bot); |
62 | 957k | _mm_storel_epi64(pred_buf_m128i, sum); |
63 | 994k | } else { |
64 | 994k | __m128i top = _mm_loadu_si128((__m128i *)input); |
65 | 994k | top = _mm_maddubs_epi16(top, twos); |
66 | 994k | __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride)); |
67 | 994k | bot = _mm_maddubs_epi16(bot, twos); |
68 | 994k | const __m128i sum = _mm_add_epi16(top, bot); |
69 | 994k | _mm_storeu_si128(pred_buf_m128i, sum); |
70 | 994k | if (width == 32) { |
71 | 0 | __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); |
72 | 0 | __m128i bot_1 = |
73 | 0 | _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1); |
74 | 0 | top_1 = _mm_maddubs_epi16(top_1, twos); |
75 | 0 | bot_1 = _mm_maddubs_epi16(bot_1, twos); |
76 | 0 | __m128i sum_1 = _mm_add_epi16(top_1, bot_1); |
77 | 0 | _mm_storeu_si128(pred_buf_m128i + 1, sum_1); |
78 | 0 | } |
79 | 994k | } |
80 | 3.73M | input += luma_stride; |
81 | 3.73M | pred_buf_m128i += CFL_BUF_LINE_I128; |
82 | 3.73M | } while (pred_buf_m128i < end); |
83 | 914k | } |
84 | | |
85 | | /** |
86 | | * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more |
87 | | * precise version of a box filter 4:2:2 pixel subsampling in Q3. |
88 | | * |
89 | | * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the |
90 | | * active area is specified using width and height. |
91 | | * |
92 | | * Note: We don't need to worry about going over the active area, as long as we |
93 | | * stay inside the CfL prediction buffer. |
94 | | */ |
95 | | static inline void cfl_luma_subsampling_422_lbd_ssse3(const uint8_t *input, |
96 | | int input_stride, |
97 | | uint16_t *pred_buf_q3, |
98 | 1.79k | int width, int height) { |
99 | 1.79k | const __m128i fours = _mm_set1_epi8(4); |
100 | 1.79k | __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; |
101 | 1.79k | const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128; |
102 | 12.6k | do { |
103 | 12.6k | if (width == 4) { |
104 | 2.67k | __m128i top = _mm_loadh_epi32((__m128i *)input); |
105 | 2.67k | top = _mm_maddubs_epi16(top, fours); |
106 | 2.67k | _mm_storeh_epi32(pred_buf_m128i, top); |
107 | 9.94k | } else if (width == 8) { |
108 | 3.56k | __m128i top = _mm_loadl_epi64((__m128i *)input); |
109 | 3.56k | top = _mm_maddubs_epi16(top, fours); |
110 | 3.56k | _mm_storel_epi64(pred_buf_m128i, top); |
111 | 6.38k | } else { |
112 | 6.38k | __m128i top = _mm_loadu_si128((__m128i *)input); |
113 | 6.38k | top = _mm_maddubs_epi16(top, fours); |
114 | 6.38k | _mm_storeu_si128(pred_buf_m128i, top); |
115 | 6.38k | if (width == 32) { |
116 | 0 | __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); |
117 | 0 | top_1 = _mm_maddubs_epi16(top_1, fours); |
118 | 0 | _mm_storeu_si128(pred_buf_m128i + 1, top_1); |
119 | 0 | } |
120 | 6.38k | } |
121 | 12.6k | input += input_stride; |
122 | 12.6k | pred_buf_m128i += CFL_BUF_LINE_I128; |
123 | 12.6k | } while (pred_buf_m128i < end); |
124 | 1.79k | } |
125 | | |
126 | | /** |
127 | | * Multiplies the pixels by 8 (scaling in Q3). |
128 | | * |
129 | | * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the |
130 | | * active area is specified using width and height. |
131 | | * |
132 | | * Note: We don't need to worry about going over the active area, as long as we |
133 | | * stay inside the CfL prediction buffer. |
134 | | */ |
135 | | static inline void cfl_luma_subsampling_444_lbd_ssse3(const uint8_t *input, |
136 | | int input_stride, |
137 | | uint16_t *pred_buf_q3, |
138 | 378k | int width, int height) { |
139 | 378k | const __m128i zeros = _mm_setzero_si128(); |
140 | 378k | const int luma_stride = input_stride; |
141 | 378k | __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; |
142 | 378k | const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128; |
143 | 3.89M | do { |
144 | 3.89M | if (width == 4) { |
145 | 884k | __m128i row = _mm_loadh_epi32((__m128i *)input); |
146 | 884k | row = _mm_unpacklo_epi8(row, zeros); |
147 | 884k | _mm_storel_epi64(pred_buf_m128i, _mm_slli_epi16(row, 3)); |
148 | 3.00M | } else if (width == 8) { |
149 | 1.66M | __m128i row = _mm_loadl_epi64((__m128i *)input); |
150 | 1.66M | row = _mm_unpacklo_epi8(row, zeros); |
151 | 1.66M | _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row, 3)); |
152 | 1.66M | } else { |
153 | 1.34M | __m128i row = _mm_loadu_si128((__m128i *)input); |
154 | 1.34M | const __m128i row_lo = _mm_unpacklo_epi8(row, zeros); |
155 | 1.34M | const __m128i row_hi = _mm_unpackhi_epi8(row, zeros); |
156 | 1.34M | _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row_lo, 3)); |
157 | 1.34M | _mm_storeu_si128(pred_buf_m128i + 1, _mm_slli_epi16(row_hi, 3)); |
158 | 1.34M | if (width == 32) { |
159 | 0 | __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1); |
160 | 0 | const __m128i row_1_lo = _mm_unpacklo_epi8(row_1, zeros); |
161 | 0 | const __m128i row_1_hi = _mm_unpackhi_epi8(row_1, zeros); |
162 | 0 | _mm_storeu_si128(pred_buf_m128i + 2, _mm_slli_epi16(row_1_lo, 3)); |
163 | 0 | _mm_storeu_si128(pred_buf_m128i + 3, _mm_slli_epi16(row_1_hi, 3)); |
164 | 0 | } |
165 | 1.34M | } |
166 | 3.89M | input += luma_stride; |
167 | 3.89M | pred_buf_m128i += CFL_BUF_LINE_I128; |
168 | 3.89M | } while (pred_buf_m128i < end); |
169 | 378k | } |
170 | | |
171 | | #if CONFIG_AV1_HIGHBITDEPTH |
172 | | /** |
173 | | * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more |
174 | | * precise version of a box filter 4:2:0 pixel subsampling in Q3. |
175 | | * |
176 | | * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the |
177 | | * active area is specified using width and height. |
178 | | * |
179 | | * Note: We don't need to worry about going over the active area, as long as we |
180 | | * stay inside the CfL prediction buffer. |
181 | | */ |
182 | | static inline void cfl_luma_subsampling_420_hbd_ssse3(const uint16_t *input, |
183 | | int input_stride, |
184 | | uint16_t *pred_buf_q3, |
185 | 850k | int width, int height) { |
186 | 850k | const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE; |
187 | 850k | const int luma_stride = input_stride << 1; |
188 | 3.28M | do { |
189 | 3.28M | if (width == 4) { |
190 | 1.73M | const __m128i top = _mm_loadl_epi64((__m128i *)input); |
191 | 1.73M | const __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride)); |
192 | 1.73M | __m128i sum = _mm_add_epi16(top, bot); |
193 | 1.73M | sum = _mm_hadd_epi16(sum, sum); |
194 | 1.73M | *((int *)pred_buf_q3) = _mm_cvtsi128_si32(_mm_add_epi16(sum, sum)); |
195 | 1.73M | } else { |
196 | 1.54M | const __m128i top = _mm_loadu_si128((__m128i *)input); |
197 | 1.54M | const __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride)); |
198 | 1.54M | __m128i sum = _mm_add_epi16(top, bot); |
199 | 1.54M | if (width == 8) { |
200 | 813k | sum = _mm_hadd_epi16(sum, sum); |
201 | 813k | _mm_storel_epi64((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum)); |
202 | 813k | } else { |
203 | 733k | const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); |
204 | 733k | const __m128i bot_1 = |
205 | 733k | _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1); |
206 | 733k | sum = _mm_hadd_epi16(sum, _mm_add_epi16(top_1, bot_1)); |
207 | 733k | _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum)); |
208 | 733k | if (width == 32) { |
209 | 0 | const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2); |
210 | 0 | const __m128i bot_2 = |
211 | 0 | _mm_loadu_si128(((__m128i *)(input + input_stride)) + 2); |
212 | 0 | const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3); |
213 | 0 | const __m128i bot_3 = |
214 | 0 | _mm_loadu_si128(((__m128i *)(input + input_stride)) + 3); |
215 | 0 | const __m128i sum_2 = _mm_add_epi16(top_2, bot_2); |
216 | 0 | const __m128i sum_3 = _mm_add_epi16(top_3, bot_3); |
217 | 0 | __m128i next_sum = _mm_hadd_epi16(sum_2, sum_3); |
218 | 0 | _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1, |
219 | 0 | _mm_add_epi16(next_sum, next_sum)); |
220 | 0 | } |
221 | 733k | } |
222 | 1.54M | } |
223 | 3.28M | input += luma_stride; |
224 | 3.28M | } while ((pred_buf_q3 += CFL_BUF_LINE) < end); |
225 | 850k | } |
226 | | |
227 | | /** |
228 | | * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more |
229 | | * precise version of a box filter 4:2:2 pixel subsampling in Q3. |
230 | | * |
231 | | * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the |
232 | | * active area is specified using width and height. |
233 | | * |
234 | | * Note: We don't need to worry about going over the active area, as long as we |
235 | | * stay inside the CfL prediction buffer. |
236 | | */ |
237 | | static inline void cfl_luma_subsampling_422_hbd_ssse3(const uint16_t *input, |
238 | | int input_stride, |
239 | | uint16_t *pred_buf_q3, |
240 | 1.76k | int width, int height) { |
241 | 1.76k | __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; |
242 | 1.76k | const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128; |
243 | 8.83k | do { |
244 | 8.83k | if (width == 4) { |
245 | 4.89k | const __m128i top = _mm_loadl_epi64((__m128i *)input); |
246 | 4.89k | const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2); |
247 | 4.89k | _mm_storeh_epi32(pred_buf_m128i, sum); |
248 | 4.89k | } else { |
249 | 3.94k | const __m128i top = _mm_loadu_si128((__m128i *)input); |
250 | 3.94k | if (width == 8) { |
251 | 2.30k | const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2); |
252 | 2.30k | _mm_storel_epi64(pred_buf_m128i, sum); |
253 | 2.30k | } else { |
254 | 1.64k | const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); |
255 | 1.64k | const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top_1), 2); |
256 | 1.64k | _mm_storeu_si128(pred_buf_m128i, sum); |
257 | 1.64k | if (width == 32) { |
258 | 0 | const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2); |
259 | 0 | const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3); |
260 | 0 | const __m128i sum_1 = _mm_slli_epi16(_mm_hadd_epi16(top_2, top_3), 2); |
261 | 0 | _mm_storeu_si128(pred_buf_m128i + 1, sum_1); |
262 | 0 | } |
263 | 1.64k | } |
264 | 3.94k | } |
265 | 8.83k | pred_buf_m128i += CFL_BUF_LINE_I128; |
266 | 8.83k | input += input_stride; |
267 | 8.83k | } while (pred_buf_m128i < end); |
268 | 1.76k | } |
269 | | |
270 | | static inline void cfl_luma_subsampling_444_hbd_ssse3(const uint16_t *input, |
271 | | int input_stride, |
272 | | uint16_t *pred_buf_q3, |
273 | 705k | int width, int height) { |
274 | 705k | const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; |
275 | 6.14M | do { |
276 | 6.14M | if (width == 4) { |
277 | 1.40M | const __m128i row = _mm_slli_epi16(_mm_loadl_epi64((__m128i *)input), 3); |
278 | 1.40M | _mm_storel_epi64((__m128i *)pred_buf_q3, row); |
279 | 4.73M | } else { |
280 | 4.73M | const __m128i row = _mm_slli_epi16(_mm_loadu_si128((__m128i *)input), 3); |
281 | 4.73M | _mm_storeu_si128((__m128i *)pred_buf_q3, row); |
282 | 4.73M | if (width >= 16) { |
283 | 1.51M | __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1); |
284 | 1.51M | row_1 = _mm_slli_epi16(row_1, 3); |
285 | 1.51M | _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1, row_1); |
286 | 1.51M | if (width == 32) { |
287 | 0 | __m128i row_2 = _mm_loadu_si128(((__m128i *)input) + 2); |
288 | 0 | row_2 = _mm_slli_epi16(row_2, 3); |
289 | 0 | _mm_storeu_si128(((__m128i *)pred_buf_q3) + 2, row_2); |
290 | 0 | __m128i row_3 = _mm_loadu_si128(((__m128i *)input) + 3); |
291 | 0 | row_3 = _mm_slli_epi16(row_3, 3); |
292 | 0 | _mm_storeu_si128(((__m128i *)pred_buf_q3) + 3, row_3); |
293 | 0 | } |
294 | 1.51M | } |
295 | 4.73M | } |
296 | 6.14M | input += input_stride; |
297 | 6.14M | pred_buf_q3 += CFL_BUF_LINE; |
298 | 6.14M | } while (pred_buf_q3 < end); |
299 | 705k | } |
300 | | #endif // CONFIG_AV1_HIGHBITDEPTH |
301 | | |
302 | | CFL_GET_SUBSAMPLE_FUNCTION(ssse3) |
303 | | |
304 | | static inline __m128i predict_unclipped(const __m128i *input, __m128i alpha_q12, |
305 | 25.1M | __m128i alpha_sign, __m128i dc_q0) { |
306 | 25.1M | __m128i ac_q3 = _mm_loadu_si128(input); |
307 | 25.1M | __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3); |
308 | 25.1M | __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12); |
309 | 25.1M | scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign); |
310 | 25.1M | return _mm_add_epi16(scaled_luma_q0, dc_q0); |
311 | 25.1M | } |
312 | | |
313 | | static inline void cfl_predict_lbd_ssse3(const int16_t *pred_buf_q3, |
314 | | uint8_t *dst, int dst_stride, |
315 | 1.31M | int alpha_q3, int width, int height) { |
316 | 1.31M | const __m128i alpha_sign = _mm_set1_epi16(alpha_q3); |
317 | 1.31M | const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9); |
318 | 1.31M | const __m128i dc_q0 = _mm_set1_epi16(*dst); |
319 | 1.31M | __m128i *row = (__m128i *)pred_buf_q3; |
320 | 1.31M | const __m128i *row_end = row + height * CFL_BUF_LINE_I128; |
321 | 11.9M | do { |
322 | 11.9M | __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); |
323 | 11.9M | if (width < 16) { |
324 | 8.22M | res = _mm_packus_epi16(res, res); |
325 | 8.22M | if (width == 4) |
326 | 3.44M | _mm_storeh_epi32((__m128i *)dst, res); |
327 | 4.77M | else |
328 | 4.77M | _mm_storel_epi64((__m128i *)dst, res); |
329 | 8.22M | } else { |
330 | 3.76M | __m128i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); |
331 | 3.76M | res = _mm_packus_epi16(res, next); |
332 | 3.76M | _mm_storeu_si128((__m128i *)dst, res); |
333 | 3.76M | if (width == 32) { |
334 | 0 | res = predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0); |
335 | 0 | next = predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0); |
336 | 0 | res = _mm_packus_epi16(res, next); |
337 | 0 | _mm_storeu_si128((__m128i *)(dst + 16), res); |
338 | 0 | } |
339 | 3.76M | } |
340 | 11.9M | dst += dst_stride; |
341 | 11.9M | } while ((row += CFL_BUF_LINE_I128) < row_end); |
342 | 1.31M | } |
343 | | |
344 | | CFL_PREDICT_FN(ssse3, lbd) |
345 | | |
346 | | #if CONFIG_AV1_HIGHBITDEPTH |
// Returns a vector whose eight 16-bit lanes all hold (1 << bd) - 1, the
// largest pixel value for bit depth bd.
static inline __m128i highbd_max_epi16(int bd) {
  const __m128i all_ones = _mm_set1_epi16(-1);
  // (-1 << bd) has every bit at position bd and above set; flipping all bits
  // (XOR with -1) leaves exactly the low bd bits set: (1 << bd) - 1.
  const __m128i high_bits = _mm_slli_epi16(all_ones, bd);
  return _mm_xor_si128(high_bits, all_ones);
}
352 | | |
// Clamps each signed 16-bit lane of u: the upper bound max is applied first,
// then the lower bound zero.
static inline __m128i highbd_clamp_epi16(__m128i u, __m128i zero, __m128i max) {
  const __m128i upper_bounded = _mm_min_epi16(u, max);
  return _mm_max_epi16(upper_bounded, zero);
}
356 | | |
357 | | static inline void cfl_predict_hbd_ssse3(const int16_t *pred_buf_q3, |
358 | | uint16_t *dst, int dst_stride, |
359 | | int alpha_q3, int bd, int width, |
360 | 1.03M | int height) { |
361 | 1.03M | const __m128i alpha_sign = _mm_set1_epi16(alpha_q3); |
362 | 1.03M | const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9); |
363 | 1.03M | const __m128i dc_q0 = _mm_set1_epi16(*dst); |
364 | 1.03M | const __m128i max = highbd_max_epi16(bd); |
365 | 1.03M | const __m128i zeros = _mm_setzero_si128(); |
366 | 1.03M | __m128i *row = (__m128i *)pred_buf_q3; |
367 | 1.03M | const __m128i *row_end = row + height * CFL_BUF_LINE_I128; |
368 | 9.36M | do { |
369 | 9.36M | __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); |
370 | 9.36M | res = highbd_clamp_epi16(res, zeros, max); |
371 | 9.36M | if (width == 4) { |
372 | 3.37M | _mm_storel_epi64((__m128i *)dst, res); |
373 | 5.98M | } else { |
374 | 5.98M | _mm_storeu_si128((__m128i *)dst, res); |
375 | 5.98M | } |
376 | 9.36M | if (width >= 16) { |
377 | 0 | const __m128i res_1 = |
378 | 0 | predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); |
379 | 0 | _mm_storeu_si128(((__m128i *)dst) + 1, |
380 | 0 | highbd_clamp_epi16(res_1, zeros, max)); |
381 | 0 | } |
382 | 9.36M | if (width == 32) { |
383 | 0 | const __m128i res_2 = |
384 | 0 | predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0); |
385 | 0 | _mm_storeu_si128((__m128i *)(dst + 16), |
386 | 0 | highbd_clamp_epi16(res_2, zeros, max)); |
387 | 0 | const __m128i res_3 = |
388 | 0 | predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0); |
389 | 0 | _mm_storeu_si128((__m128i *)(dst + 24), |
390 | 0 | highbd_clamp_epi16(res_3, zeros, max)); |
391 | 0 | } |
392 | 9.36M | dst += dst_stride; |
393 | 9.36M | } while ((row += CFL_BUF_LINE_I128) < row_end); |
394 | 1.03M | } |
395 | | |
396 | | CFL_PREDICT_FN(ssse3, hbd) |
397 | | #endif // CONFIG_AV1_HIGHBITDEPTH |