/work/svt-av1/Source/Lib/Codec/deblocking_common.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright(c) 2019 Intel Corporation |
3 | | * Copyright (c) 2016, Alliance for Open Media. All rights reserved |
4 | | * |
5 | | * This source code is subject to the terms of the BSD 2 Clause License and |
6 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
7 | | * was not distributed with this source code in the LICENSE file, you can |
8 | | * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open |
9 | | * Media Patent License 1.0 was not distributed with this source code in the |
10 | | * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license. |
11 | | */ |
12 | | |
13 | | #include "deblocking_common.h" |
14 | | #include "common_utils.h" |
15 | | |
16 | | static const int delta_lf_id_lut[MAX_PLANES][2] = {{0, 1}, {2, 2}, {3, 3}}; |
17 | | |
18 | | static const SEG_LVL_FEATURES seg_lvl_lf_lut[MAX_PLANES][2] = {{SEG_LVL_ALT_LF_Y_V, SEG_LVL_ALT_LF_Y_H}, |
19 | | {SEG_LVL_ALT_LF_U, SEG_LVL_ALT_LF_U}, |
20 | | {SEG_LVL_ALT_LF_V, SEG_LVL_ALT_LF_V}}; |
21 | | |
22 | 12.0k | static INLINE int svt_aom_seg_feature_active(SegmentationParams* seg, int segment_id, SEG_LVL_FEATURES feature_id) { |
23 | 12.0k | return seg->segmentation_enabled && seg->feature_enabled[segment_id][feature_id]; |
24 | 12.0k | } |
25 | | |
26 | 0 | static INLINE int get_segdata(SegmentationParams* seg, int segment_id, SEG_LVL_FEATURES feature_id) { |
27 | 0 | return seg->feature_data[segment_id][feature_id]; |
28 | 0 | } |
29 | | |
30 | 0 | static INLINE int8_t signed_char_clamp(int32_t t) { |
31 | 0 | return (int8_t)clamp(t, -128, 127); |
32 | 0 | } |
33 | | |
34 | 0 | static INLINE int16_t signed_char_clamp_high(int32_t t, int32_t bd) { |
35 | 0 | switch (bd) { |
36 | 0 | case 10: |
37 | 0 | return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1); |
38 | 0 | case 12: |
39 | 0 | return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1); |
40 | 0 | case 8: |
41 | 0 | default: |
42 | 0 | return (int16_t)clamp(t, -128, 128 - 1); |
43 | 0 | } |
44 | 0 | } |
45 | | |
46 | | uint8_t svt_aom_get_filter_level_delta_lf(FrameHeader* frm_hdr, const int32_t dir_idx, int32_t plane, |
47 | | int32_t* sb_delta_lf, uint8_t seg_id, PredictionMode pred_mode, |
48 | 0 | MvReferenceFrame ref_frame_0) { |
49 | 0 | int32_t delta_lf = -1; |
50 | 0 | if (frm_hdr->delta_lf_params.delta_lf_multi) { |
51 | 0 | const int32_t delta_lf_idx = delta_lf_id_lut[plane][dir_idx]; |
52 | 0 | delta_lf = sb_delta_lf[delta_lf_idx]; |
53 | 0 | } else { |
54 | 0 | delta_lf = sb_delta_lf[0]; |
55 | 0 | } |
56 | 0 | int32_t base_level; |
57 | 0 | if (plane == 0) { |
58 | 0 | base_level = frm_hdr->loop_filter_params.filter_level[dir_idx]; |
59 | 0 | } else if (plane == 1) { |
60 | 0 | base_level = frm_hdr->loop_filter_params.filter_level_u; |
61 | 0 | } else { |
62 | 0 | base_level = frm_hdr->loop_filter_params.filter_level_v; |
63 | 0 | } |
64 | 0 | int32_t lvl_seg = clamp(delta_lf + base_level, 0, MAX_LOOP_FILTER); |
65 | 0 | assert(plane >= 0 && plane <= 2); |
66 | 0 | const int32_t seg_lf_feature_id = seg_lvl_lf_lut[plane][dir_idx]; |
67 | 0 | if (svt_aom_seg_feature_active(&frm_hdr->segmentation_params, seg_id, seg_lf_feature_id)) { |
68 | 0 | const int32_t data = get_segdata(&frm_hdr->segmentation_params, seg_id, seg_lf_feature_id); |
69 | 0 | lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER); |
70 | 0 | } |
71 | |
|
72 | 0 | if (frm_hdr->loop_filter_params.mode_ref_delta_enabled) { |
73 | 0 | const int32_t scale = 1 << (lvl_seg >> 5); |
74 | 0 | lvl_seg += frm_hdr->loop_filter_params.ref_deltas[ref_frame_0] * scale; |
75 | 0 | if (ref_frame_0 > INTRA_FRAME) { |
76 | 0 | lvl_seg += frm_hdr->loop_filter_params.mode_deltas[mode_lf_lut[pred_mode]] * scale; |
77 | 0 | } |
78 | 0 | lvl_seg = clamp(lvl_seg, 0, MAX_LOOP_FILTER); |
79 | 0 | } |
80 | 0 | return lvl_seg; |
81 | 0 | } |
82 | | |
83 | | // Update the loop filter for the current frame. |
84 | | // This should be called before loop_filter_rows(), |
85 | | // svt_av1_loop_filter_frame() calls this function directly. |
86 | | void svt_av1_loop_filter_frame_init(FrameHeader* frm_hdr, LoopFilterInfoN* lfi, int32_t plane_start, |
87 | 274 | int32_t plane_end) { |
88 | 274 | int32_t filt_lvl[MAX_PLANES], filt_lvl_r[MAX_PLANES]; |
89 | 274 | int32_t plane; |
90 | 274 | int32_t seg_id; |
91 | | // n_shift is the multiplier for lf_deltas |
92 | | // the multiplier is 1 for when filter_lvl is between 0 and 31; |
93 | | // 2 when filter_lvl is between 32 and 63 |
94 | | |
95 | 274 | LoopFilter* const lf = &frm_hdr->loop_filter_params; |
96 | | // const struct segmentation *const seg = &pcs->ppcs->seg; |
97 | | |
98 | | // update sharpness limits |
99 | 274 | svt_aom_update_sharpness(lfi, lf->sharpness_level); |
100 | | |
101 | 274 | filt_lvl[0] = frm_hdr->loop_filter_params.filter_level[0]; |
102 | 274 | filt_lvl[1] = frm_hdr->loop_filter_params.filter_level_u; |
103 | 274 | filt_lvl[2] = frm_hdr->loop_filter_params.filter_level_v; |
104 | | |
105 | 274 | filt_lvl_r[0] = frm_hdr->loop_filter_params.filter_level[1]; |
106 | 274 | filt_lvl_r[1] = frm_hdr->loop_filter_params.filter_level_u; |
107 | 274 | filt_lvl_r[2] = frm_hdr->loop_filter_params.filter_level_v; |
108 | | |
109 | 1.04k | for (plane = plane_start; plane < plane_end; plane++) { |
110 | 788 | if (plane == 0 && !filt_lvl[0] && !filt_lvl_r[0]) { |
111 | 17 | break; |
112 | 771 | } else if (plane == 1 && !filt_lvl[1]) { |
113 | 9 | continue; |
114 | 762 | } else if (plane == 2 && !filt_lvl[2]) { |
115 | 9 | continue; |
116 | 9 | } |
117 | | |
118 | 6.77k | for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) { |
119 | 18.0k | for (int32_t dir = 0; dir < 2; ++dir) { |
120 | 12.0k | int32_t lvl_seg = (dir == 0) ? filt_lvl[plane] : filt_lvl_r[plane]; |
121 | 12.0k | assert(plane >= 0 && plane <= 2); |
122 | 12.0k | const int32_t seg_lf_feature_id = seg_lvl_lf_lut[plane][dir]; |
123 | 12.0k | if (svt_aom_seg_feature_active(&frm_hdr->segmentation_params, seg_id, seg_lf_feature_id)) { |
124 | 0 | const int32_t data = get_segdata(&frm_hdr->segmentation_params, seg_id, seg_lf_feature_id); |
125 | 0 | lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER); |
126 | 0 | } |
127 | | |
128 | 12.0k | if (!lf->mode_ref_delta_enabled) { |
129 | | // we could get rid of this if we assume that deltas are set to |
130 | | // zero when not in use; encoder always uses deltas |
131 | 12.0k | memset(lfi->lvl[plane][seg_id][dir], lvl_seg, sizeof(lfi->lvl[plane][seg_id][dir])); |
132 | 12.0k | } else { |
133 | 0 | int32_t ref, mode; |
134 | 0 | const int32_t scale = 1 << (lvl_seg >> 5); |
135 | 0 | const int32_t intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale; |
136 | 0 | lfi->lvl[plane][seg_id][dir][INTRA_FRAME][0] = (uint8_t)clamp(intra_lvl, 0, MAX_LOOP_FILTER); |
137 | |
|
138 | 0 | for (ref = LAST_FRAME; ref < REF_FRAMES; ++ref) { |
139 | 0 | for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) { |
140 | 0 | const int32_t inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale + |
141 | 0 | lf->mode_deltas[mode] * scale; |
142 | 0 | lfi->lvl[plane][seg_id][dir][ref][mode] = (uint8_t)clamp(inter_lvl, 0, MAX_LOOP_FILTER); |
143 | 0 | } |
144 | 0 | } |
145 | 0 | } |
146 | 12.0k | } |
147 | 6.02k | } |
148 | 753 | } |
149 | 274 | } |
150 | | |
151 | | // should we apply any filter at all: 11111111 yes, 00000000 no |
152 | 0 | static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1) { |
153 | 0 | int8_t mask = 0; |
154 | 0 | mask |= (abs(p1 - p0) > limit) * -1; |
155 | 0 | mask |= (abs(q1 - q0) > limit) * -1; |
156 | 0 | mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; |
157 | 0 | return ~mask; |
158 | 0 | } |
159 | | |
160 | | static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0, |
161 | 0 | uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3) { |
162 | 0 | int8_t mask = 0; |
163 | 0 | mask |= (abs(p3 - p2) > limit) * -1; |
164 | 0 | mask |= (abs(p2 - p1) > limit) * -1; |
165 | 0 | mask |= (abs(p1 - p0) > limit) * -1; |
166 | 0 | mask |= (abs(q1 - q0) > limit) * -1; |
167 | 0 | mask |= (abs(q2 - q1) > limit) * -1; |
168 | 0 | mask |= (abs(q3 - q2) > limit) * -1; |
169 | 0 | mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; |
170 | 0 | return ~mask; |
171 | 0 | } |
172 | | |
173 | | static INLINE int8_t filter_mask3_chroma(uint8_t limit, uint8_t blimit, uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, |
174 | 0 | uint8_t q1, uint8_t q2) { |
175 | 0 | int8_t mask = 0; |
176 | 0 | mask |= (abs(p2 - p1) > limit) * -1; |
177 | 0 | mask |= (abs(p1 - p0) > limit) * -1; |
178 | 0 | mask |= (abs(q1 - q0) > limit) * -1; |
179 | 0 | mask |= (abs(q2 - q1) > limit) * -1; |
180 | 0 | mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; |
181 | 0 | return ~mask; |
182 | 0 | } |
183 | | |
184 | | static INLINE int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1, |
185 | 0 | uint8_t q2) { |
186 | 0 | int8_t mask = 0; |
187 | 0 | mask |= (abs(p1 - p0) > thresh) * -1; |
188 | 0 | mask |= (abs(q1 - q0) > thresh) * -1; |
189 | 0 | mask |= (abs(p2 - p0) > thresh) * -1; |
190 | 0 | mask |= (abs(q2 - q0) > thresh) * -1; |
191 | 0 | return ~mask; |
192 | 0 | } |
193 | | |
194 | | static INLINE int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2, uint16_t p1, uint16_t p0, uint16_t q0, |
195 | 0 | uint16_t q1, uint16_t q2, int bd) { |
196 | 0 | int8_t mask = 0; |
197 | 0 | int16_t thresh16 = (uint16_t)thresh << (bd - 8); |
198 | 0 | mask |= (abs(p1 - p0) > thresh16) * -1; |
199 | 0 | mask |= (abs(q1 - q0) > thresh16) * -1; |
200 | 0 | mask |= (abs(p2 - p0) > thresh16) * -1; |
201 | 0 | mask |= (abs(q2 - q0) > thresh16) * -1; |
202 | 0 | return ~mask; |
203 | 0 | } |
204 | | |
205 | | static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1, |
206 | 0 | uint8_t q2, uint8_t q3) { |
207 | 0 | int8_t mask = 0; |
208 | 0 | mask |= (abs(p1 - p0) > thresh) * -1; |
209 | 0 | mask |= (abs(q1 - q0) > thresh) * -1; |
210 | 0 | mask |= (abs(p2 - p0) > thresh) * -1; |
211 | 0 | mask |= (abs(q2 - q0) > thresh) * -1; |
212 | 0 | mask |= (abs(p3 - p0) > thresh) * -1; |
213 | 0 | mask |= (abs(q3 - q0) > thresh) * -1; |
214 | 0 | return ~mask; |
215 | 0 | } |
216 | | |
217 | | // is there high edge variance internal edge: 11111111 yes, 00000000 no |
218 | 0 | static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1) { |
219 | 0 | int8_t hev = 0; |
220 | 0 | hev |= (abs(p1 - p0) > thresh) * -1; |
221 | 0 | hev |= (abs(q1 - q0) > thresh) * -1; |
222 | 0 | return hev; |
223 | 0 | } |
224 | | |
225 | 0 | static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t* op1, uint8_t* op0, uint8_t* oq0, uint8_t* oq1) { |
226 | 0 | int8_t filter1, filter2; |
227 | 0 | const int8_t ps1 = (int8_t)(*op1 ^ 0x80); |
228 | 0 | const int8_t ps0 = (int8_t)(*op0 ^ 0x80); |
229 | 0 | const int8_t qs0 = (int8_t)(*oq0 ^ 0x80); |
230 | 0 | const int8_t qs1 = (int8_t)(*oq1 ^ 0x80); |
231 | 0 | const int8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); |
232 | | |
233 | | // add outer taps if we have high edge variance |
234 | 0 | int8_t filter = signed_char_clamp(ps1 - qs1) & hev; |
235 | | |
236 | | // inner taps |
237 | 0 | filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; |
238 | | |
239 | | // save bottom 3 bits so that we round one side +4 and the other +3 |
240 | | // if it equals 4 we'll set to adjust by -1 to account for the fact |
241 | | // we'd round 3 the other way |
242 | 0 | filter1 = signed_char_clamp(filter + 4) >> 3; |
243 | 0 | filter2 = signed_char_clamp(filter + 3) >> 3; |
244 | 0 | *oq0 = (uint8_t)(signed_char_clamp(qs0 - filter1) ^ 0x80); |
245 | 0 | *op0 = (uint8_t)(signed_char_clamp(ps0 + filter2) ^ 0x80); |
246 | | |
247 | | // outer tap adjustments |
248 | 0 | filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; |
249 | 0 | *oq1 = (uint8_t)(signed_char_clamp(qs1 - filter) ^ 0x80); |
250 | 0 | *op1 = (uint8_t)(signed_char_clamp(ps1 + filter) ^ 0x80); |
251 | 0 | } |
252 | | |
// Applies the 4-tap filter across a horizontal edge for 4 consecutive columns.
// `s` points at the first sample below the edge; `p` is the pitch between rows.
void svt_aom_lpf_horizontal_4_c(uint8_t* s, int32_t p /* pitch */, const uint8_t* blimit, const uint8_t* limit,
                                const uint8_t* thresh) {
    // loop filter designed to work using chars so that we can make maximum use
    // of 8 bit simd instructions.
    for (int32_t col = 0; col < 4; ++col, ++s) {
        const uint8_t above1 = s[-2 * p];
        const uint8_t above0 = s[-p];
        const uint8_t below0 = s[0 * p];
        const uint8_t below1 = s[1 * p];
        const int8_t  mask   = filter_mask2(*limit, *blimit, above1, above0, below0, below1);
        filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
    }
}
268 | | |
// Applies the 4-tap filter across a vertical edge for 4 consecutive rows.
// `s` points at the first sample right of the edge; `pitch` is the distance between rows.
void svt_aom_lpf_vertical_4_c(uint8_t* s, int32_t pitch, const uint8_t* blimit, const uint8_t* limit,
                              const uint8_t* thresh) {
    // loop filter designed to work using chars so that we can make maximum use
    // of 8 bit simd instructions.
    for (int32_t row = 0; row < 4; ++row, s += pitch) {
        const uint8_t left1  = s[-2];
        const uint8_t left0  = s[-1];
        const uint8_t right0 = s[0];
        const uint8_t right1 = s[1];
        const int8_t  mask   = filter_mask2(*limit, *blimit, left1, left0, right0, right1);
        filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
    }
}
284 | | |
285 | | static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat, uint8_t* op2, uint8_t* op1, uint8_t* op0, |
286 | 0 | uint8_t* oq0, uint8_t* oq1, uint8_t* oq2) { |
287 | 0 | if (flat && mask) { |
288 | 0 | const uint8_t p2 = *op2, p1 = *op1, p0 = *op0; |
289 | 0 | const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2; |
290 | | |
291 | | // 5-tap filter [1, 2, 2, 2, 1] |
292 | 0 | *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3); |
293 | 0 | *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3); |
294 | 0 | *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3); |
295 | 0 | *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3); |
296 | 0 | } else { |
297 | 0 | filter4(mask, thresh, op1, op0, oq0, oq1); |
298 | 0 | } |
299 | 0 | } |
300 | | |
301 | | static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat, uint8_t* op3, uint8_t* op2, uint8_t* op1, |
302 | 0 | uint8_t* op0, uint8_t* oq0, uint8_t* oq1, uint8_t* oq2, uint8_t* oq3) { |
303 | 0 | if (flat && mask) { |
304 | 0 | const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; |
305 | 0 | const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; |
306 | | |
307 | | // 7-tap filter [1, 1, 1, 2, 1, 1, 1] |
308 | 0 | *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3); |
309 | 0 | *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3); |
310 | 0 | *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3); |
311 | 0 | *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3); |
312 | 0 | *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); |
313 | 0 | *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); |
314 | 0 | } else { |
315 | 0 | filter4(mask, thresh, op1, op0, oq0, oq1); |
316 | 0 | } |
317 | 0 | } |
318 | | |
// Applies the 6-sample filter across a horizontal edge for 4 consecutive columns.
// `s` points at the first sample below the edge; `p` is the pitch between rows.
void svt_aom_lpf_horizontal_6_c(uint8_t* s, int32_t p, const uint8_t* blimit, const uint8_t* limit,
                                const uint8_t* thresh) {
    // loop filter designed to work using chars so that we can make maximum use
    // of 8 bit simd instructions.
    for (int32_t col = 0; col < 4; ++col, ++s) {
        const uint8_t p2 = s[-3 * p];
        const uint8_t p1 = s[-2 * p];
        const uint8_t p0 = s[-p];
        const uint8_t q0 = s[0 * p];
        const uint8_t q1 = s[1 * p];
        const uint8_t q2 = s[2 * p];

        const int8_t mask = filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2);
        const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
        filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p);
    }
}
336 | | |
// Applies the 6-sample filter across a vertical edge for 4 consecutive rows.
// `s` points at the first sample right of the edge; `pitch` is the distance between rows.
void svt_aom_lpf_vertical_6_c(uint8_t* s, int32_t pitch, const uint8_t* blimit, const uint8_t* limit,
                              const uint8_t* thresh) {
    // loop filter designed to work using chars so that we can make maximum use
    // of 8 bit simd instructions.
    for (int32_t row = 0; row < 4; ++row, s += pitch) {
        const uint8_t p2 = s[-3];
        const uint8_t p1 = s[-2];
        const uint8_t p0 = s[-1];
        const uint8_t q0 = s[0];
        const uint8_t q1 = s[1];
        const uint8_t q2 = s[2];

        const int8_t mask = filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2);
        const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
        filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2);
    }
}
354 | | |
// Applies the 8-sample filter across a horizontal edge for 4 consecutive columns.
// `s` points at the first sample below the edge; `p` is the pitch between rows.
void svt_aom_lpf_horizontal_8_c(uint8_t* s, int32_t p, const uint8_t* blimit, const uint8_t* limit,
                                const uint8_t* thresh) {
    // loop filter designed to work using chars so that we can make maximum use
    // of 8 bit simd instructions.
    for (int32_t col = 0; col < 4; ++col, ++s) {
        const uint8_t p3 = s[-4 * p];
        const uint8_t p2 = s[-3 * p];
        const uint8_t p1 = s[-2 * p];
        const uint8_t p0 = s[-p];
        const uint8_t q0 = s[0 * p];
        const uint8_t q1 = s[1 * p];
        const uint8_t q2 = s[2 * p];
        const uint8_t q3 = s[3 * p];

        const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
        const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
        filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p);
    }
}
372 | | |
// Applies the 8-sample filter across a vertical edge for 4 consecutive rows.
// `s` points at the first sample right of the edge; `pitch` is the distance between rows.
void svt_aom_lpf_vertical_8_c(uint8_t* s, int32_t pitch, const uint8_t* blimit, const uint8_t* limit,
                              const uint8_t* thresh) {
    for (int32_t row = 0; row < 4; ++row, s += pitch) {
        const uint8_t p3 = s[-4];
        const uint8_t p2 = s[-3];
        const uint8_t p1 = s[-2];
        const uint8_t p0 = s[-1];
        const uint8_t q0 = s[0];
        const uint8_t q1 = s[1];
        const uint8_t q2 = s[2];
        const uint8_t q3 = s[3];

        const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
        const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
        filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3);
    }
}
387 | | |
388 | | // Should we apply any filter at all: 11111111 yes, 00000000 no ? |
389 | | static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit, uint16_t p1, uint16_t p0, uint16_t q0, |
390 | 0 | uint16_t q1, int32_t bd) { |
391 | 0 | int8_t mask = 0; |
392 | 0 | int16_t limit16 = (uint16_t)limit << (bd - 8); |
393 | 0 | int16_t blimit16 = (uint16_t)blimit << (bd - 8); |
394 | 0 | mask |= (abs(p1 - p0) > limit16) * -1; |
395 | 0 | mask |= (abs(q1 - q0) > limit16) * -1; |
396 | 0 | mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; |
397 | 0 | return ~mask; |
398 | 0 | } |
399 | | |
400 | | // Should we apply any filter at all: 11111111 yes, 00000000 no ? |
401 | | static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit, uint16_t p3, uint16_t p2, uint16_t p1, |
402 | 0 | uint16_t p0, uint16_t q0, uint16_t q1, uint16_t q2, uint16_t q3, int32_t bd) { |
403 | 0 | int8_t mask = 0; |
404 | 0 | int16_t limit16 = (uint16_t)limit << (bd - 8); |
405 | 0 | int16_t blimit16 = (uint16_t)blimit << (bd - 8); |
406 | 0 | mask |= (abs(p3 - p2) > limit16) * -1; |
407 | 0 | mask |= (abs(p2 - p1) > limit16) * -1; |
408 | 0 | mask |= (abs(p1 - p0) > limit16) * -1; |
409 | 0 | mask |= (abs(q1 - q0) > limit16) * -1; |
410 | 0 | mask |= (abs(q2 - q1) > limit16) * -1; |
411 | 0 | mask |= (abs(q3 - q2) > limit16) * -1; |
412 | 0 | mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; |
413 | 0 | return ~mask; |
414 | 0 | } |
415 | | |
416 | | static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2, uint16_t p1, uint16_t p0, uint16_t q0, |
417 | 0 | uint16_t q1, uint16_t q2, uint16_t q3, int32_t bd) { |
418 | 0 | int8_t mask = 0; |
419 | 0 | int16_t thresh16 = (uint16_t)thresh << (bd - 8); |
420 | 0 | mask |= (abs(p1 - p0) > thresh16) * -1; |
421 | 0 | mask |= (abs(q1 - q0) > thresh16) * -1; |
422 | 0 | mask |= (abs(p2 - p0) > thresh16) * -1; |
423 | 0 | mask |= (abs(q2 - q0) > thresh16) * -1; |
424 | 0 | mask |= (abs(p3 - p0) > thresh16) * -1; |
425 | 0 | mask |= (abs(q3 - q0) > thresh16) * -1; |
426 | 0 | return ~mask; |
427 | 0 | } |
428 | | |
429 | | // Is there high edge variance internal edge: |
430 | | // 11111111_11111111 yes, 00000000_00000000 no ? |
431 | 0 | static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0, uint16_t q0, uint16_t q1, int32_t bd) { |
432 | 0 | int16_t hev = 0; |
433 | 0 | int16_t thresh16 = (uint16_t)thresh << (bd - 8); |
434 | 0 | hev |= (abs(p1 - p0) > thresh16) * -1; |
435 | 0 | hev |= (abs(q1 - q0) > thresh16) * -1; |
436 | 0 | return hev; |
437 | 0 | } |
438 | | |
439 | | static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t* op1, uint16_t* op0, uint16_t* oq0, |
440 | 0 | uint16_t* oq1, int32_t bd) { |
441 | 0 | int16_t filter1, filter2; |
442 | | // ^0x80 equivalent to subtracting 0x80 from the values to turn them |
443 | | // into -128 to +127 instead of 0 to 255. |
444 | 0 | int32_t shift = bd - 8; |
445 | 0 | const int16_t ps1 = (int16_t)*op1 - (0x80 << shift); |
446 | 0 | const int16_t ps0 = (int16_t)*op0 - (0x80 << shift); |
447 | 0 | const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift); |
448 | 0 | const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift); |
449 | 0 | const int16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd); |
450 | | |
451 | | // Add outer taps if we have high edge variance. |
452 | 0 | int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev; |
453 | | |
454 | | // Inner taps. |
455 | 0 | filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask; |
456 | | |
457 | | // Save bottom 3 bits so that we round one side +4 and the other +3 |
458 | | // if it equals 4 we'll set to adjust by -1 to account for the fact |
459 | | // we'd round 3 the other way. |
460 | 0 | filter1 = signed_char_clamp_high(filter + 4, bd) >> 3; |
461 | 0 | filter2 = signed_char_clamp_high(filter + 3, bd) >> 3; |
462 | |
|
463 | 0 | *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift); |
464 | 0 | *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift); |
465 | | |
466 | | // Outer tap adjustments. |
467 | 0 | filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; |
468 | |
|
469 | 0 | *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift); |
470 | 0 | *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift); |
471 | 0 | } |
472 | | |
// Applies the 4-tap filter across a horizontal edge for 4 consecutive columns of 16-bit samples.
// `s` points at the first sample below the edge; `p` is the pitch between rows, in samples.
void svt_aom_highbd_lpf_horizontal_4_c(uint16_t* s, int32_t p /* pitch */, const uint8_t* blimit, const uint8_t* limit,
                                       const uint8_t* thresh, int32_t bd) {
    // loop filter designed to work using chars so that we can make maximum use
    // of 8 bit simd instructions.
    for (int32_t col = 0; col < 4; ++col, ++s) {
        const uint16_t above1 = s[-2 * p];
        const uint16_t above0 = s[-p];
        const uint16_t below0 = s[0 * p];
        const uint16_t below1 = s[1 * p];
        const int8_t   mask   = highbd_filter_mask2(*limit, *blimit, above1, above0, below0, below1, bd);
        highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
    }
}
490 | | |
// Applies the 4-tap filter across a vertical edge for 4 consecutive rows of 16-bit samples.
// `s` points at the first sample right of the edge; `pitch` is the distance between rows, in samples.
void svt_aom_highbd_lpf_vertical_4_c(uint16_t* s, int32_t pitch, const uint8_t* blimit, const uint8_t* limit,
                                     const uint8_t* thresh, int32_t bd) {
    // loop filter designed to work using chars so that we can make maximum use
    // of 8 bit simd instructions.
    for (int32_t row = 0; row < 4; ++row, s += pitch) {
        const uint16_t left1  = s[-2];
        const uint16_t left0  = s[-1];
        const uint16_t right0 = s[0];
        const uint16_t right1 = s[1];
        const int8_t   mask   = highbd_filter_mask2(*limit, *blimit, left1, left0, right0, right1, bd);
        highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
    }
}
506 | | |
507 | | static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat, uint16_t* op3, uint16_t* op2, uint16_t* op1, |
508 | | uint16_t* op0, uint16_t* oq0, uint16_t* oq1, uint16_t* oq2, uint16_t* oq3, |
509 | 0 | int32_t bd) { |
510 | 0 | if (flat && mask) { |
511 | 0 | const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; |
512 | 0 | const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; |
513 | | |
514 | | // 7-tap filter [1, 1, 1, 2, 1, 1, 1] |
515 | 0 | *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3); |
516 | 0 | *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3); |
517 | 0 | *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3); |
518 | 0 | *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3); |
519 | 0 | *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); |
520 | 0 | *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); |
521 | 0 | } else { |
522 | 0 | highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd); |
523 | 0 | } |
524 | 0 | } |
525 | | |
// Applies the 8-sample filter across a horizontal edge for 4 consecutive columns of 16-bit samples.
// `s` points at the first sample below the edge; `p` is the pitch between rows, in samples.
void svt_aom_highbd_lpf_horizontal_8_c(uint16_t* s, int32_t p, const uint8_t* blimit, const uint8_t* limit,
                                       const uint8_t* thresh, int32_t bd) {
    // loop filter designed to work using chars so that we can make maximum use
    // of 8 bit simd instructions.
    for (int32_t col = 0; col < 4; ++col, ++s) {
        const uint16_t p3 = s[-4 * p];
        const uint16_t p2 = s[-3 * p];
        const uint16_t p1 = s[-2 * p];
        const uint16_t p0 = s[-p];
        const uint16_t q0 = s[0 * p];
        const uint16_t q1 = s[1 * p];
        const uint16_t q2 = s[2 * p];
        const uint16_t q3 = s[3 * p];

        const int8_t mask = highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
        const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
        highbd_filter8(
            mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
    }
}
544 | | |
// Applies the 8-sample filter across a vertical edge for 4 consecutive rows of 16-bit samples.
// `s` points at the first sample right of the edge; `pitch` is the distance between rows, in samples.
void svt_aom_highbd_lpf_vertical_8_c(uint16_t* s, int32_t pitch, const uint8_t* blimit, const uint8_t* limit,
                                     const uint8_t* thresh, int32_t bd) {
    for (int32_t row = 0; row < 4; ++row, s += pitch) {
        const uint16_t p3 = s[-4];
        const uint16_t p2 = s[-3];
        const uint16_t p1 = s[-2];
        const uint16_t p0 = s[-1];
        const uint16_t q0 = s[0];
        const uint16_t q1 = s[1];
        const uint16_t q2 = s[2];
        const uint16_t q3 = s[3];

        const int8_t mask = highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
        const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
        highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, bd);
    }
}
559 | | |
560 | | //**********************************************************************************************************************// |
561 | | |
562 | | //static const SEG_LVL_FEATURES seg_lvl_lf_lut[MAX_PLANES][2] = { |
563 | | // { SEG_LVL_ALT_LF_Y_V, SEG_LVL_ALT_LF_Y_H }, |
564 | | // { SEG_LVL_ALT_LF_U, SEG_LVL_ALT_LF_U }, |
565 | | // { SEG_LVL_ALT_LF_V, SEG_LVL_ALT_LF_V } |
566 | | //}; |
567 | | |
568 | 548 | void svt_aom_update_sharpness(LoopFilterInfoN* lfi, int32_t sharpness_lvl) { |
569 | 548 | int32_t lvl; |
570 | | |
571 | | // For each possible value for the loop filter fill out limits |
572 | 35.6k | for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) { |
573 | | // Set loop filter parameters that control sharpness. |
574 | 35.0k | int32_t block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4)); |
575 | | |
576 | 35.0k | if (sharpness_lvl > 0) { |
577 | 0 | if (block_inside_limit > (9 - sharpness_lvl)) { |
578 | 0 | block_inside_limit = (9 - sharpness_lvl); |
579 | 0 | } |
580 | 0 | } |
581 | | |
582 | 35.0k | if (block_inside_limit < 1) { |
583 | 548 | block_inside_limit = 1; |
584 | 548 | } |
585 | | |
586 | 35.0k | memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH); |
587 | 35.0k | memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit), SIMD_WIDTH); |
588 | 35.0k | } |
589 | 548 | } |
590 | | |
591 | | static INLINE void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat, int8_t flat2, uint16_t* op6, uint16_t* op5, |
592 | | uint16_t* op4, uint16_t* op3, uint16_t* op2, uint16_t* op1, uint16_t* op0, |
593 | | uint16_t* oq0, uint16_t* oq1, uint16_t* oq2, uint16_t* oq3, uint16_t* oq4, |
594 | 0 | uint16_t* oq5, uint16_t* oq6, int bd) { |
595 | 0 | if (flat2 && flat && mask) { |
596 | 0 | const uint16_t p6 = *op6; |
597 | 0 | const uint16_t p5 = *op5; |
598 | 0 | const uint16_t p4 = *op4; |
599 | 0 | const uint16_t p3 = *op3; |
600 | 0 | const uint16_t p2 = *op2; |
601 | 0 | const uint16_t p1 = *op1; |
602 | 0 | const uint16_t p0 = *op0; |
603 | 0 | const uint16_t q0 = *oq0; |
604 | 0 | const uint16_t q1 = *oq1; |
605 | 0 | const uint16_t q2 = *oq2; |
606 | 0 | const uint16_t q3 = *oq3; |
607 | 0 | const uint16_t q4 = *oq4; |
608 | 0 | const uint16_t q5 = *oq5; |
609 | 0 | const uint16_t q6 = *oq6; |
610 | | |
611 | | // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1] |
612 | 0 | *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0, 4); |
613 | 0 | *op4 = ROUND_POWER_OF_TWO(p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4); |
614 | 0 | *op3 = ROUND_POWER_OF_TWO(p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4); |
615 | 0 | *op2 = ROUND_POWER_OF_TWO(p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3, 4); |
616 | 0 | *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + q0 + q1 + q2 + q3 + q4, 4); |
617 | 0 | *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + q2 + q3 + q4 + q5, 4); |
618 | 0 | *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2 + q3 + q4 + q5 + q6, 4); |
619 | 0 | *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 + q2 * 2 + q3 + q4 + q5 + q6 * 2, 4); |
620 | 0 | *oq2 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3, 4); |
621 | 0 | *oq3 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4); |
622 | 0 | *oq4 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4); |
623 | 0 | *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7, 4); |
624 | 0 | } else { |
625 | 0 | highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3, bd); |
626 | 0 | } |
627 | 0 | } |
628 | | |
// Applies the 14-sample filter across a horizontal edge for 4 * `count` consecutive columns
// of 16-bit samples. `s` points at the first sample below the edge; `p` is the row pitch.
static void highbd_mb_lpf_horizontal_edge_w(uint16_t* s, int p, const uint8_t* blimit, const uint8_t* limit,
                                            const uint8_t* thresh, int count, int bd) {
    const int columns = 4 * count;

    // loop filter designed to work using chars so that we can make maximum use
    // of 8 bit simd instructions.
    for (int col = 0; col < columns; ++col, ++s) {
        const uint16_t p3 = s[-4 * p];
        const uint16_t p2 = s[-3 * p];
        const uint16_t p1 = s[-2 * p];
        const uint16_t p0 = s[-p];
        const uint16_t q0 = s[0 * p];
        const uint16_t q1 = s[1 * p];
        const uint16_t q2 = s[2 * p];
        const uint16_t q3 = s[3 * p];

        const int8_t mask = highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
        const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);

        // Second flatness test over the three outermost samples on each side.
        const int8_t flat2 = highbd_flat_mask4(
            1, s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p], s[5 * p], s[6 * p], bd);

        highbd_filter14(mask, *thresh, flat, flat2,
                        s - 7 * p, s - 6 * p, s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
                        s, s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p,
                        bd);
    }
}
673 | | |
/*
 * C implementation of the high-bitdepth 14-tap "horizontal" loop filter.
 * Delegates to highbd_mb_lpf_horizontal_edge_w with a count of 1, which makes the
 * helper process 4 consecutive sample positions starting at s.
 */
void svt_aom_highbd_lpf_horizontal_14_c(uint16_t* s, int pitch, const uint8_t* blimit, const uint8_t* limit,
                                        const uint8_t* thresh, int bd) {
    const int num_groups = 1;  // the helper handles 4 positions per group
    highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, num_groups, bd);
}
678 | | |
679 | | static INLINE int8_t highbd_filter_mask3_chroma(uint8_t limit, uint8_t blimit, uint16_t p2, uint16_t p1, uint16_t p0, |
680 | 0 | uint16_t q0, uint16_t q1, uint16_t q2, int bd) { |
681 | 0 | int8_t mask = 0; |
682 | 0 | int16_t limit16 = (uint16_t)limit << (bd - 8); |
683 | 0 | int16_t blimit16 = (uint16_t)blimit << (bd - 8); |
684 | 0 | mask |= (abs(p2 - p1) > limit16) * -1; |
685 | 0 | mask |= (abs(p1 - p0) > limit16) * -1; |
686 | 0 | mask |= (abs(q1 - q0) > limit16) * -1; |
687 | 0 | mask |= (abs(q2 - q1) > limit16) * -1; |
688 | 0 | mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; |
689 | 0 | return ~mask; |
690 | 0 | } |
691 | | |
692 | | static INLINE void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat, uint16_t* op2, uint16_t* op1, uint16_t* op0, |
693 | 0 | uint16_t* oq0, uint16_t* oq1, uint16_t* oq2, int bd) { |
694 | 0 | if (flat && mask) { |
695 | 0 | const uint16_t p2 = *op2, p1 = *op1, p0 = *op0; |
696 | 0 | const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2; |
697 | | |
698 | | // 5-tap filter [1, 2, 2, 2, 1] |
699 | 0 | *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3); |
700 | 0 | *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3); |
701 | 0 | *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3); |
702 | 0 | *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3); |
703 | 0 | } else { |
704 | 0 | highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd); |
705 | 0 | } |
706 | 0 | } |
707 | | |
/*
 * C implementation of the high-bitdepth 6-tap "vertical" loop filter.
 *
 * Processes 4 lines; on each line the six samples s[-3] .. s[2] are examined and
 * possibly filtered in place, then s moves forward by pitch samples.
 * *limit, *blimit and *thresh are read on every line; bd is forwarded to the helpers.
 */
void svt_aom_highbd_lpf_vertical_6_c(uint16_t* s, int pitch, const uint8_t* blimit, const uint8_t* limit,
                                     const uint8_t* thresh, int bd) {
    for (int row = 0; row < 4; ++row, s += pitch) {
        // px[0..2] are p2..p0 and px[3..5] are q0..q2.
        uint16_t* const px = s - 3;

        const int8_t mask = highbd_filter_mask3_chroma(*limit, *blimit, px[0], px[1], px[2], px[3], px[4], px[5], bd);
        const int8_t flat = highbd_flat_mask3_chroma(1, px[0], px[1], px[2], px[3], px[4], px[5], bd);

        highbd_filter6(mask, *thresh, flat, px, px + 1, px + 2, px + 3, px + 4, px + 5, bd);
    }
}
722 | | |
/*
 * C implementation of the high-bitdepth 6-tap "horizontal" loop filter.
 *
 * Processes 4 consecutive sample positions starting at s. For each position the
 * six taps s[-3 * p] .. s[2 * p] are examined and possibly filtered in place.
 * *limit, *blimit and *thresh are read on every position; bd is forwarded to the helpers.
 */
void svt_aom_highbd_lpf_horizontal_6_c(uint16_t* s, int p, const uint8_t* blimit, const uint8_t* limit,
                                       const uint8_t* thresh, int bd) {
    for (int col = 0; col < 4; ++col, ++s) {
        uint16_t* const pp2 = s - 3 * p;
        uint16_t* const pp1 = s - 2 * p;
        uint16_t* const pp0 = s - p;
        uint16_t* const pq0 = s;
        uint16_t* const pq1 = s + p;
        uint16_t* const pq2 = s + 2 * p;

        const int8_t mask = highbd_filter_mask3_chroma(*limit, *blimit, *pp2, *pp1, *pp0, *pq0, *pq1, *pq2, bd);
        const int8_t flat = highbd_flat_mask3_chroma(1, *pp2, *pp1, *pp0, *pq0, *pq1, *pq2, bd);

        highbd_filter6(mask, *thresh, flat, pp2, pp1, pp0, pq0, pq1, pq2, bd);
    }
}
740 | | |
/*
 * High-bitdepth 14-tap edge filter applied to count lines.
 *
 * On each line the fourteen adjacent samples s[-7] .. s[6] are examined and
 * possibly filtered in place; s then advances by p samples to the next line.
 * *limit, *blimit and *thresh are read on every line; bd is forwarded to the helpers.
 */
static void highbd_mb_lpf_vertical_edge_w(uint16_t* s, int p, const uint8_t* blimit, const uint8_t* limit,
                                          const uint8_t* thresh, int count, int bd) {
    for (int row = 0; row < count; ++row, s += p) {
        // left[0..6] are p6..p0 and left[7..13] are q0..q6.
        uint16_t* const left = s - 7;

        const uint16_t p3 = left[3], p2 = left[4], p1 = left[5], p0 = left[6];
        const uint16_t q0 = left[7], q1 = left[8], q2 = left[9], q3 = left[10];

        const int8_t mask = highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
        // Inner flatness uses p3..q3; outer flatness compares p6/p5/p4 and q4/q5/q6 against p0/q0.
        const int8_t flat  = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
        const int8_t flat2 = highbd_flat_mask4(1, left[0], left[1], left[2], p0, q0, left[11], left[12], left[13], bd);

        highbd_filter14(mask,
                        *thresh,
                        flat,
                        flat2,
                        left,
                        left + 1,
                        left + 2,
                        left + 3,
                        left + 4,
                        left + 5,
                        left + 6,
                        left + 7,
                        left + 8,
                        left + 9,
                        left + 10,
                        left + 11,
                        left + 12,
                        left + 13,
                        bd);
    }
}
780 | | |
/*
 * C implementation of the high-bitdepth 14-tap "vertical" loop filter.
 * Delegates to highbd_mb_lpf_vertical_edge_w, asking it to process 4 lines.
 */
void svt_aom_highbd_lpf_vertical_14_c(uint16_t* s, int p, const uint8_t* blimit, const uint8_t* limit,
                                      const uint8_t* thresh, int bd) {
    const int num_lines = 4;
    highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, num_lines, bd);
}
785 | | |
786 | | static INLINE void filter14(int8_t mask, uint8_t thresh, int8_t flat, int8_t flat2, uint8_t* op6, uint8_t* op5, |
787 | | uint8_t* op4, uint8_t* op3, uint8_t* op2, uint8_t* op1, uint8_t* op0, uint8_t* oq0, |
788 | 0 | uint8_t* oq1, uint8_t* oq2, uint8_t* oq3, uint8_t* oq4, uint8_t* oq5, uint8_t* oq6) { |
789 | 0 | if (flat2 && flat && mask) { |
790 | 0 | const uint8_t p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; |
791 | 0 | const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4, q5 = *oq5, q6 = *oq6; |
792 | | |
793 | | // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1] |
794 | 0 | *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0, 4); |
795 | 0 | *op4 = ROUND_POWER_OF_TWO(p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4); |
796 | 0 | *op3 = ROUND_POWER_OF_TWO(p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4); |
797 | 0 | *op2 = ROUND_POWER_OF_TWO(p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3, 4); |
798 | 0 | *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + q0 + q1 + q2 + q3 + q4, 4); |
799 | 0 | *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + q2 + q3 + q4 + q5, 4); |
800 | 0 | *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2 + q3 + q4 + q5 + q6, 4); |
801 | 0 | *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 + q2 * 2 + q3 + q4 + q5 + q6 * 2, 4); |
802 | 0 | *oq2 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3, 4); |
803 | 0 | *oq3 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4); |
804 | 0 | *oq4 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4); |
805 | 0 | *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7, 4); |
806 | 0 | } else { |
807 | 0 | filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); |
808 | 0 | } |
809 | 0 | } |
810 | | |
/*
 * 8-bit 14-tap edge filter applied at 4 * count consecutive sample positions.
 *
 * s      : addresses the q0 sample of the first position; advanced by one sample per position.
 * p      : distance, in samples, between successive taps of one position (taps span
 *          s[-7 * p] .. s[6 * p]) -- presumably the buffer pitch; confirm against callers.
 * blimit, limit, thresh : each dereferenced once per position and handed to the mask/filter helpers.
 * count  : number of 4-position groups to process.
 */
static void mb_lpf_horizontal_edge_w(uint8_t* s, int p, const uint8_t* blimit, const uint8_t* limit,
                                     const uint8_t* thresh, int count) {
    const int num_positions = 4 * count;

    for (int pos = 0; pos < num_positions; ++pos, ++s) {
        // tap[0..6] address p6..p0 and tap[7..13] address q0..q6.
        uint8_t* tap[14];
        for (int k = 0; k < 14; ++k) {
            tap[k] = s + (k - 7) * p;
        }

        const uint8_t p6 = *tap[0], p5 = *tap[1], p4 = *tap[2], p3 = *tap[3];
        const uint8_t p2 = *tap[4], p1 = *tap[5], p0 = *tap[6];
        const uint8_t q0 = *tap[7], q1 = *tap[8], q2 = *tap[9], q3 = *tap[10];
        const uint8_t q4 = *tap[11], q5 = *tap[12], q6 = *tap[13];

        const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
        // Inner flatness uses p3..q3; outer flatness compares p6/p5/p4 and q4/q5/q6 against p0/q0.
        const int8_t flat  = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
        const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6);

        filter14(mask,
                 *thresh,
                 flat,
                 flat2,
                 tap[0],
                 tap[1],
                 tap[2],
                 tap[3],
                 tap[4],
                 tap[5],
                 tap[6],
                 tap[7],
                 tap[8],
                 tap[9],
                 tap[10],
                 tap[11],
                 tap[12],
                 tap[13]);
    }
}
848 | | |
/*
 * C implementation of the 8-bit 14-tap "horizontal" loop filter.
 * Delegates to mb_lpf_horizontal_edge_w with a count of 1, which makes the helper
 * process 4 consecutive sample positions starting at s.
 */
void svt_aom_lpf_horizontal_14_c(uint8_t* s, int p, const uint8_t* blimit, const uint8_t* limit,
                                 const uint8_t* thresh) {
    const int num_groups = 1;  // the helper handles 4 positions per group
    mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, num_groups);
}
853 | | |
/*
 * 8-bit 14-tap edge filter applied to count lines.
 *
 * On each line the fourteen adjacent samples s[-7] .. s[6] are examined and
 * possibly filtered in place; s then advances by p samples to the next line.
 * *limit, *blimit and *thresh are read on every line.
 */
static void mb_lpf_vertical_edge_w(uint8_t* s, int p, const uint8_t* blimit, const uint8_t* limit,
                                   const uint8_t* thresh, int count) {
    for (int row = 0; row < count; ++row, s += p) {
        // left[0..6] are p6..p0 and left[7..13] are q0..q6.
        uint8_t* const left = s - 7;

        const uint8_t p6 = left[0], p5 = left[1], p4 = left[2], p3 = left[3];
        const uint8_t p2 = left[4], p1 = left[5], p0 = left[6];
        const uint8_t q0 = left[7], q1 = left[8], q2 = left[9], q3 = left[10];
        const uint8_t q4 = left[11], q5 = left[12], q6 = left[13];

        const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
        // Inner flatness uses p3..q3; outer flatness compares p6/p5/p4 and q4/q5/q6 against p0/q0.
        const int8_t flat  = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
        const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6);

        filter14(mask,
                 *thresh,
                 flat,
                 flat2,
                 left,
                 left + 1,
                 left + 2,
                 left + 3,
                 left + 4,
                 left + 5,
                 left + 6,
                 left + 7,
                 left + 8,
                 left + 9,
                 left + 10,
                 left + 11,
                 left + 12,
                 left + 13);
    }
}
886 | | |
/*
 * C implementation of the 8-bit 14-tap "vertical" loop filter.
 * Delegates to mb_lpf_vertical_edge_w, asking it to process 4 lines.
 */
void svt_aom_lpf_vertical_14_c(uint8_t* s, int p, const uint8_t* blimit, const uint8_t* limit, const uint8_t* thresh) {
    const int num_lines = 4;
    mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, num_lines);
}