Coverage Report

Created: 2026-05-16 06:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/svt-av1/Source/Lib/Codec/deblocking_common.c
Line
Count
Source
1
/*
2
* Copyright(c) 2019 Intel Corporation
3
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
4
*
5
* This source code is subject to the terms of the BSD 2 Clause License and
6
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
7
* was not distributed with this source code in the LICENSE file, you can
8
* obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
9
* Media Patent License 1.0 was not distributed with this source code in the
10
* PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
11
*/
12
13
#include "deblocking_common.h"
14
#include "common_utils.h"
15
16
static const int delta_lf_id_lut[MAX_PLANES][2] = {{0, 1}, {2, 2}, {3, 3}};
17
18
static const SEG_LVL_FEATURES seg_lvl_lf_lut[MAX_PLANES][2] = {{SEG_LVL_ALT_LF_Y_V, SEG_LVL_ALT_LF_Y_H},
19
                                                               {SEG_LVL_ALT_LF_U, SEG_LVL_ALT_LF_U},
20
                                                               {SEG_LVL_ALT_LF_V, SEG_LVL_ALT_LF_V}};
21
22
12.0k
// Returns 1 when segmentation is enabled and `feature_id` is switched on for
// `segment_id`; returns 0 otherwise.
static INLINE int svt_aom_seg_feature_active(SegmentationParams* seg, int segment_id, SEG_LVL_FEATURES feature_id) {
    if (!seg->segmentation_enabled) {
        return 0;
    }
    return seg->feature_enabled[segment_id][feature_id] ? 1 : 0;
}
25
26
0
// Reads the stored data value of `feature_id` for `segment_id`.
// Does not check whether the feature is enabled.
static INLINE int get_segdata(SegmentationParams* seg, int segment_id, SEG_LVL_FEATURES feature_id) {
    const int value = seg->feature_data[segment_id][feature_id];
    return value;
}
29
30
0
// Saturates `t` to the signed 8-bit range [-128, 127].
static INLINE int8_t signed_char_clamp(int32_t t) {
    const int32_t saturated = clamp(t, -128, 127);
    return (int8_t)saturated;
}
33
34
0
// High-bit-depth counterpart of signed_char_clamp(): saturates `t` to
// [-128 << s, (128 << s) - 1], where s is 2 for bd == 10, 4 for bd == 12 and
// 0 for every other bit depth (including 8).
static INLINE int16_t signed_char_clamp_high(int32_t t, int32_t bd) {
    int32_t extra_bits = 0;
    if (bd == 10) {
        extra_bits = 2;
    } else if (bd == 12) {
        extra_bits = 4;
    }
    const int32_t bound = 128 << extra_bits;
    return (int16_t)clamp(t, -bound, bound - 1);
}
45
46
uint8_t svt_aom_get_filter_level_delta_lf(FrameHeader* frm_hdr, const int32_t dir_idx, int32_t plane,
47
                                          int32_t* sb_delta_lf, uint8_t seg_id, PredictionMode pred_mode,
48
0
                                          MvReferenceFrame ref_frame_0) {
49
0
    int32_t delta_lf = -1;
50
0
    if (frm_hdr->delta_lf_params.delta_lf_multi) {
51
0
        const int32_t delta_lf_idx = delta_lf_id_lut[plane][dir_idx];
52
0
        delta_lf                   = sb_delta_lf[delta_lf_idx];
53
0
    } else {
54
0
        delta_lf = sb_delta_lf[0];
55
0
    }
56
0
    int32_t base_level;
57
0
    if (plane == 0) {
58
0
        base_level = frm_hdr->loop_filter_params.filter_level[dir_idx];
59
0
    } else if (plane == 1) {
60
0
        base_level = frm_hdr->loop_filter_params.filter_level_u;
61
0
    } else {
62
0
        base_level = frm_hdr->loop_filter_params.filter_level_v;
63
0
    }
64
0
    int32_t lvl_seg = clamp(delta_lf + base_level, 0, MAX_LOOP_FILTER);
65
0
    assert(plane >= 0 && plane <= 2);
66
0
    const int32_t seg_lf_feature_id = seg_lvl_lf_lut[plane][dir_idx];
67
0
    if (svt_aom_seg_feature_active(&frm_hdr->segmentation_params, seg_id, seg_lf_feature_id)) {
68
0
        const int32_t data = get_segdata(&frm_hdr->segmentation_params, seg_id, seg_lf_feature_id);
69
0
        lvl_seg            = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER);
70
0
    }
71
72
0
    if (frm_hdr->loop_filter_params.mode_ref_delta_enabled) {
73
0
        const int32_t scale = 1 << (lvl_seg >> 5);
74
0
        lvl_seg += frm_hdr->loop_filter_params.ref_deltas[ref_frame_0] * scale;
75
0
        if (ref_frame_0 > INTRA_FRAME) {
76
0
            lvl_seg += frm_hdr->loop_filter_params.mode_deltas[mode_lf_lut[pred_mode]] * scale;
77
0
        }
78
0
        lvl_seg = clamp(lvl_seg, 0, MAX_LOOP_FILTER);
79
0
    }
80
0
    return lvl_seg;
81
0
}
82
83
// Update the loop filter for the current frame.
84
// This should be called before loop_filter_rows(),
85
// svt_av1_loop_filter_frame() calls this function directly.
86
void svt_av1_loop_filter_frame_init(FrameHeader* frm_hdr, LoopFilterInfoN* lfi, int32_t plane_start,
87
274
                                    int32_t plane_end) {
88
274
    int32_t filt_lvl[MAX_PLANES], filt_lvl_r[MAX_PLANES];
89
274
    int32_t plane;
90
274
    int32_t seg_id;
91
    // n_shift is the multiplier for lf_deltas
92
    // the multiplier is 1 for when filter_lvl is between 0 and 31;
93
    // 2 when filter_lvl is between 32 and 63
94
95
274
    LoopFilter* const lf = &frm_hdr->loop_filter_params;
96
    // const struct segmentation *const seg = &pcs->ppcs->seg;
97
98
    // update sharpness limits
99
274
    svt_aom_update_sharpness(lfi, lf->sharpness_level);
100
101
274
    filt_lvl[0] = frm_hdr->loop_filter_params.filter_level[0];
102
274
    filt_lvl[1] = frm_hdr->loop_filter_params.filter_level_u;
103
274
    filt_lvl[2] = frm_hdr->loop_filter_params.filter_level_v;
104
105
274
    filt_lvl_r[0] = frm_hdr->loop_filter_params.filter_level[1];
106
274
    filt_lvl_r[1] = frm_hdr->loop_filter_params.filter_level_u;
107
274
    filt_lvl_r[2] = frm_hdr->loop_filter_params.filter_level_v;
108
109
1.04k
    for (plane = plane_start; plane < plane_end; plane++) {
110
788
        if (plane == 0 && !filt_lvl[0] && !filt_lvl_r[0]) {
111
17
            break;
112
771
        } else if (plane == 1 && !filt_lvl[1]) {
113
9
            continue;
114
762
        } else if (plane == 2 && !filt_lvl[2]) {
115
9
            continue;
116
9
        }
117
118
6.77k
        for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
119
18.0k
            for (int32_t dir = 0; dir < 2; ++dir) {
120
12.0k
                int32_t lvl_seg = (dir == 0) ? filt_lvl[plane] : filt_lvl_r[plane];
121
12.0k
                assert(plane >= 0 && plane <= 2);
122
12.0k
                const int32_t seg_lf_feature_id = seg_lvl_lf_lut[plane][dir];
123
12.0k
                if (svt_aom_seg_feature_active(&frm_hdr->segmentation_params, seg_id, seg_lf_feature_id)) {
124
0
                    const int32_t data = get_segdata(&frm_hdr->segmentation_params, seg_id, seg_lf_feature_id);
125
0
                    lvl_seg            = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER);
126
0
                }
127
128
12.0k
                if (!lf->mode_ref_delta_enabled) {
129
                    // we could get rid of this if we assume that deltas are set to
130
                    // zero when not in use; encoder always uses deltas
131
12.0k
                    memset(lfi->lvl[plane][seg_id][dir], lvl_seg, sizeof(lfi->lvl[plane][seg_id][dir]));
132
12.0k
                } else {
133
0
                    int32_t       ref, mode;
134
0
                    const int32_t scale                          = 1 << (lvl_seg >> 5);
135
0
                    const int32_t intra_lvl                      = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale;
136
0
                    lfi->lvl[plane][seg_id][dir][INTRA_FRAME][0] = (uint8_t)clamp(intra_lvl, 0, MAX_LOOP_FILTER);
137
138
0
                    for (ref = LAST_FRAME; ref < REF_FRAMES; ++ref) {
139
0
                        for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
140
0
                            const int32_t inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale +
141
0
                                lf->mode_deltas[mode] * scale;
142
0
                            lfi->lvl[plane][seg_id][dir][ref][mode] = (uint8_t)clamp(inter_lvl, 0, MAX_LOOP_FILTER);
143
0
                        }
144
0
                    }
145
0
                }
146
12.0k
            }
147
6.02k
        }
148
753
    }
149
274
}
150
151
// should we apply any filter at all: 11111111 yes, 00000000 no
152
0
static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1) {
153
0
    int8_t mask = 0;
154
0
    mask |= (abs(p1 - p0) > limit) * -1;
155
0
    mask |= (abs(q1 - q0) > limit) * -1;
156
0
    mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
157
0
    return ~mask;
158
0
}
159
160
// Filter gate for the 8-sample edge: returns -1 (all bits set) when every
// neighbouring same-side difference is within `limit` and the cross-edge
// measure is within `blimit`; returns 0 otherwise.
static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                                 uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3) {
    const int p_side_over = abs(p3 - p2) > limit || abs(p2 - p1) > limit || abs(p1 - p0) > limit;
    const int q_side_over = abs(q1 - q0) > limit || abs(q2 - q1) > limit || abs(q3 - q2) > limit;
    const int edge_over   = abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit;
    return (p_side_over || q_side_over || edge_over) ? 0 : -1;
}
172
173
// Filter gate for the 6-sample edge: returns -1 (all bits set) when every
// neighbouring same-side difference is within `limit` and the cross-edge
// measure is within `blimit`; returns 0 otherwise.
static INLINE int8_t filter_mask3_chroma(uint8_t limit, uint8_t blimit, uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
                                         uint8_t q1, uint8_t q2) {
    const int p_side_over = abs(p2 - p1) > limit || abs(p1 - p0) > limit;
    const int q_side_over = abs(q1 - q0) > limit || abs(q2 - q1) > limit;
    const int edge_over   = abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit;
    return (p_side_over || q_side_over || edge_over) ? 0 : -1;
}
183
184
// Flatness test over three samples per side: returns -1 (all bits set) when
// p1, p2 are within `thresh` of p0 and q1, q2 are within `thresh` of q0;
// returns 0 otherwise.
static INLINE int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
                                       uint8_t q2) {
    const int p_not_flat = abs(p1 - p0) > thresh || abs(p2 - p0) > thresh;
    const int q_not_flat = abs(q1 - q0) > thresh || abs(q2 - q0) > thresh;
    return (p_not_flat || q_not_flat) ? 0 : -1;
}
193
194
// 16-bit sample version of flat_mask3_chroma(): `thresh` is scaled up by
// (bd - 8) bits before the comparisons. Returns -1 when flat, 0 otherwise.
static INLINE int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2, uint16_t p1, uint16_t p0, uint16_t q0,
                                              uint16_t q1, uint16_t q2, int bd) {
    const int16_t thresh16   = (uint16_t)thresh << (bd - 8);
    const int     p_not_flat = abs(p1 - p0) > thresh16 || abs(p2 - p0) > thresh16;
    const int     q_not_flat = abs(q1 - q0) > thresh16 || abs(q2 - q0) > thresh16;
    return (p_not_flat || q_not_flat) ? 0 : -1;
}
204
205
// Flatness test over four samples per side: returns -1 (all bits set) when
// p1..p3 are within `thresh` of p0 and q1..q3 are within `thresh` of q0;
// returns 0 otherwise.
static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
                                uint8_t q2, uint8_t q3) {
    const int p_not_flat = abs(p1 - p0) > thresh || abs(p2 - p0) > thresh || abs(p3 - p0) > thresh;
    const int q_not_flat = abs(q1 - q0) > thresh || abs(q2 - q0) > thresh || abs(q3 - q0) > thresh;
    return (p_not_flat || q_not_flat) ? 0 : -1;
}
216
217
// is there high edge variance internal edge: 11111111 yes, 00000000 no
218
0
static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1) {
219
0
    int8_t hev = 0;
220
0
    hev |= (abs(p1 - p0) > thresh) * -1;
221
0
    hev |= (abs(q1 - q0) > thresh) * -1;
222
0
    return hev;
223
0
}
224
225
0
// Narrow edge filter over two samples per side. `mask` gates the whole filter
// (0 leaves all four samples unchanged); `thresh` feeds the high-edge-variance
// test, which decides whether the outer samples contribute and are adjusted.
static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t* op1, uint8_t* op0, uint8_t* oq0, uint8_t* oq1) {
    // Flip the top bit so the unsigned samples become signed values centred on zero.
    const int8_t sp1 = (int8_t)(*op1 ^ 0x80);
    const int8_t sp0 = (int8_t)(*op0 ^ 0x80);
    const int8_t sq0 = (int8_t)(*oq0 ^ 0x80);
    const int8_t sq1 = (int8_t)(*oq1 ^ 0x80);

    const int8_t high_var = hev_mask(thresh, *op1, *op0, *oq0, *oq1);

    // Outer-tap term, kept only when the edge has high variance.
    int8_t base = signed_char_clamp(sp1 - sq1) & high_var;

    // Add the inner taps and apply the filter gate.
    base = signed_char_clamp(base + 3 * (sq0 - sp0)) & mask;

    // The two sides use different rounding offsets (+4 for q0, +3 for p0).
    const int8_t adj_q0 = signed_char_clamp(base + 4) >> 3;
    const int8_t adj_p0 = signed_char_clamp(base + 3) >> 3;

    *oq0 = (uint8_t)(signed_char_clamp(sq0 - adj_q0) ^ 0x80);
    *op0 = (uint8_t)(signed_char_clamp(sp0 + adj_p0) ^ 0x80);

    // Outer samples get half of the q0 adjustment, and only on low-variance edges.
    const int8_t adj_outer = ROUND_POWER_OF_TWO(adj_q0, 1) & ~high_var;

    *oq1 = (uint8_t)(signed_char_clamp(sq1 - adj_outer) ^ 0x80);
    *op1 = (uint8_t)(signed_char_clamp(sp1 + adj_outer) ^ 0x80);
}
252
253
// Runs filter4() on 4 consecutive columns starting at `s`. Samples on each
// side of the edge are addressed at multiples of the pitch `p`.
void svt_aom_lpf_horizontal_4_c(uint8_t* s, int32_t p /* pitch */, const uint8_t* blimit, const uint8_t* limit,
                                const uint8_t* thresh) {
    for (int32_t col = 0; col < 4; ++col) {
        uint8_t* const px   = s + col;
        const int8_t   mask = filter_mask2(*limit, *blimit, px[-2 * p], px[-p], px[0], px[p]);
        filter4(mask, *thresh, px - 2 * p, px - p, px, px + p);
    }
}
268
269
// Runs filter4() on 4 consecutive rows starting at `s`, advancing by `pitch`
// per row. Samples on each side of the edge are adjacent in memory.
void svt_aom_lpf_vertical_4_c(uint8_t* s, int32_t pitch, const uint8_t* blimit, const uint8_t* limit,
                              const uint8_t* thresh) {
    uint8_t* row = s;
    for (int32_t r = 0; r < 4; ++r) {
        const int8_t mask = filter_mask2(*limit, *blimit, row[-2], row[-1], row[0], row[1]);
        filter4(mask, *thresh, row - 2, row - 1, row, row + 1);
        row += pitch;
    }
}
284
285
// Three-samples-per-side edge filter. When both `flat` and `mask` are non-zero
// the inner two samples on each side are replaced by weighted averages
// (weights sum to 8); otherwise it falls back to filter4().
static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat, uint8_t* op2, uint8_t* op1, uint8_t* op0,
                           uint8_t* oq0, uint8_t* oq1, uint8_t* oq2) {
    if (!(flat && mask)) {
        filter4(mask, thresh, op1, op0, oq0, oq1);
        return;
    }

    // Snapshot all inputs before any output is written.
    const uint8_t p2 = *op2, p1 = *op1, p0 = *op0;
    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2;

    *op1 = ROUND_POWER_OF_TWO(3 * p2 + 2 * p1 + 2 * p0 + q0, 3);
    *op0 = ROUND_POWER_OF_TWO(p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1, 3);
    *oq0 = ROUND_POWER_OF_TWO(p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2, 3);
    *oq1 = ROUND_POWER_OF_TWO(p0 + 2 * q0 + 2 * q1 + 3 * q2, 3);
}
300
301
// Four-samples-per-side edge filter. When both `flat` and `mask` are non-zero
// the inner three samples on each side are replaced by weighted averages
// (weights sum to 8); otherwise it falls back to filter4().
static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat, uint8_t* op3, uint8_t* op2, uint8_t* op1,
                           uint8_t* op0, uint8_t* oq0, uint8_t* oq1, uint8_t* oq2, uint8_t* oq3) {
    if (!(flat && mask)) {
        filter4(mask, thresh, op1, op0, oq0, oq1);
        return;
    }

    // Snapshot all inputs before any output is written.
    const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;

    *op2 = ROUND_POWER_OF_TWO(3 * p3 + 2 * p2 + p1 + p0 + q0, 3);
    *op1 = ROUND_POWER_OF_TWO(2 * p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
    *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
    *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
    *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3, 3);
    *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + 3 * q3, 3);
}
318
319
// Runs filter6() on 4 consecutive columns starting at `s`. Samples on each
// side of the edge are addressed at multiples of the pitch `p`; the flatness
// threshold is fixed at 1.
void svt_aom_lpf_horizontal_6_c(uint8_t* s, int32_t p, const uint8_t* blimit, const uint8_t* limit,
                                const uint8_t* thresh) {
    for (int32_t col = 0; col < 4; ++col) {
        uint8_t* const px = s + col;
        const uint8_t  p2 = px[-3 * p], p1 = px[-2 * p], p0 = px[-p];
        const uint8_t  q0 = px[0], q1 = px[p], q2 = px[2 * p];

        const int8_t mask = filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2);
        const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
        filter6(mask, *thresh, flat, px - 3 * p, px - 2 * p, px - p, px, px + p, px + 2 * p);
    }
}
336
337
// Runs filter6() on 4 consecutive rows starting at `s`, advancing by `pitch`
// per row. Samples on each side of the edge are adjacent in memory; the
// flatness threshold is fixed at 1.
void svt_aom_lpf_vertical_6_c(uint8_t* s, int32_t pitch, const uint8_t* blimit, const uint8_t* limit,
                              const uint8_t* thresh) {
    uint8_t* row = s;
    for (int32_t r = 0; r < 4; ++r) {
        const uint8_t p2 = row[-3], p1 = row[-2], p0 = row[-1];
        const uint8_t q0 = row[0], q1 = row[1], q2 = row[2];

        const int8_t mask = filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2);
        const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
        filter6(mask, *thresh, flat, row - 3, row - 2, row - 1, row, row + 1, row + 2);
        row += pitch;
    }
}
354
355
// Runs filter8() on 4 consecutive columns starting at `s`. Samples on each
// side of the edge are addressed at multiples of the pitch `p`; the flatness
// threshold is fixed at 1.
void svt_aom_lpf_horizontal_8_c(uint8_t* s, int32_t p, const uint8_t* blimit, const uint8_t* limit,
                                const uint8_t* thresh) {
    for (int32_t col = 0; col < 4; ++col) {
        uint8_t* const px = s + col;
        const uint8_t  p3 = px[-4 * p], p2 = px[-3 * p], p1 = px[-2 * p], p0 = px[-p];
        const uint8_t  q0 = px[0], q1 = px[p], q2 = px[2 * p], q3 = px[3 * p];

        const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
        const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
        filter8(mask, *thresh, flat, px - 4 * p, px - 3 * p, px - 2 * p, px - p, px, px + p, px + 2 * p, px + 3 * p);
    }
}
372
373
// Runs filter8() on 4 consecutive rows starting at `s`, advancing by `pitch`
// per row. Samples on each side of the edge are adjacent in memory; the
// flatness threshold is fixed at 1.
void svt_aom_lpf_vertical_8_c(uint8_t* s, int32_t pitch, const uint8_t* blimit, const uint8_t* limit,
                              const uint8_t* thresh) {
    uint8_t* row = s;
    for (int32_t r = 0; r < 4; ++r) {
        const uint8_t p3 = row[-4], p2 = row[-3], p1 = row[-2], p0 = row[-1];
        const uint8_t q0 = row[0], q1 = row[1], q2 = row[2], q3 = row[3];

        const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
        const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
        filter8(mask, *thresh, flat, row - 4, row - 3, row - 2, row - 1, row, row + 1, row + 2, row + 3);
        row += pitch;
    }
}
387
388
// Should we apply any filter at all: 11111111 yes, 00000000 no ?
389
static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit, uint16_t p1, uint16_t p0, uint16_t q0,
390
0
                                         uint16_t q1, int32_t bd) {
391
0
    int8_t  mask     = 0;
392
0
    int16_t limit16  = (uint16_t)limit << (bd - 8);
393
0
    int16_t blimit16 = (uint16_t)blimit << (bd - 8);
394
0
    mask |= (abs(p1 - p0) > limit16) * -1;
395
0
    mask |= (abs(q1 - q0) > limit16) * -1;
396
0
    mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
397
0
    return ~mask;
398
0
}
399
400
// Should we apply any filter at all: 11111111 yes, 00000000 no ?
401
static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit, uint16_t p3, uint16_t p2, uint16_t p1,
402
0
                                        uint16_t p0, uint16_t q0, uint16_t q1, uint16_t q2, uint16_t q3, int32_t bd) {
403
0
    int8_t  mask     = 0;
404
0
    int16_t limit16  = (uint16_t)limit << (bd - 8);
405
0
    int16_t blimit16 = (uint16_t)blimit << (bd - 8);
406
0
    mask |= (abs(p3 - p2) > limit16) * -1;
407
0
    mask |= (abs(p2 - p1) > limit16) * -1;
408
0
    mask |= (abs(p1 - p0) > limit16) * -1;
409
0
    mask |= (abs(q1 - q0) > limit16) * -1;
410
0
    mask |= (abs(q2 - q1) > limit16) * -1;
411
0
    mask |= (abs(q3 - q2) > limit16) * -1;
412
0
    mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
413
0
    return ~mask;
414
0
}
415
416
// 16-bit sample version of flat_mask4(): `thresh` is scaled up by (bd - 8)
// bits before the comparisons. Returns -1 (all bits set) when flat, 0 otherwise.
static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2, uint16_t p1, uint16_t p0, uint16_t q0,
                                       uint16_t q1, uint16_t q2, uint16_t q3, int32_t bd) {
    const int16_t thresh16   = (uint16_t)thresh << (bd - 8);
    const int     p_not_flat = abs(p1 - p0) > thresh16 || abs(p2 - p0) > thresh16 || abs(p3 - p0) > thresh16;
    const int     q_not_flat = abs(q1 - q0) > thresh16 || abs(q2 - q0) > thresh16 || abs(q3 - q0) > thresh16;
    return (p_not_flat || q_not_flat) ? 0 : -1;
}
428
429
// Is there high edge variance internal edge:
430
// 11111111_11111111 yes, 00000000_00000000 no ?
431
0
static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0, uint16_t q0, uint16_t q1, int32_t bd) {
432
0
    int16_t hev      = 0;
433
0
    int16_t thresh16 = (uint16_t)thresh << (bd - 8);
434
0
    hev |= (abs(p1 - p0) > thresh16) * -1;
435
0
    hev |= (abs(q1 - q0) > thresh16) * -1;
436
0
    return hev;
437
0
}
438
439
// 16-bit sample version of filter4(). Samples are re-centred by subtracting
// 0x80 << (bd - 8), filtered with saturation to the bit-depth-dependent signed
// range, then shifted back by the same offset.
static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t* op1, uint16_t* op0, uint16_t* oq0,
                                  uint16_t* oq1, int32_t bd) {
    const int32_t shift  = bd - 8;
    const int32_t centre = 0x80 << shift;

    const int16_t sp1 = (int16_t)*op1 - centre;
    const int16_t sp0 = (int16_t)*op0 - centre;
    const int16_t sq0 = (int16_t)*oq0 - centre;
    const int16_t sq1 = (int16_t)*oq1 - centre;

    const int16_t high_var = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);

    // Outer-tap term, kept only when the edge has high variance.
    int16_t base = signed_char_clamp_high(sp1 - sq1, bd) & high_var;

    // Add the inner taps and apply the filter gate.
    base = signed_char_clamp_high(base + 3 * (sq0 - sp0), bd) & mask;

    // The two sides use different rounding offsets (+4 for q0, +3 for p0).
    const int16_t adj_q0 = signed_char_clamp_high(base + 4, bd) >> 3;
    const int16_t adj_p0 = signed_char_clamp_high(base + 3, bd) >> 3;

    *oq0 = signed_char_clamp_high(sq0 - adj_q0, bd) + centre;
    *op0 = signed_char_clamp_high(sp0 + adj_p0, bd) + centre;

    // Outer samples get half of the q0 adjustment, and only on low-variance edges.
    const int16_t adj_outer = ROUND_POWER_OF_TWO(adj_q0, 1) & ~high_var;

    *oq1 = signed_char_clamp_high(sq1 - adj_outer, bd) + centre;
    *op1 = signed_char_clamp_high(sp1 + adj_outer, bd) + centre;
}
472
473
// Runs highbd_filter4() on 4 consecutive columns of 16-bit samples starting at
// `s`. Samples on each side of the edge are addressed at multiples of the pitch `p`.
void svt_aom_highbd_lpf_horizontal_4_c(uint16_t* s, int32_t p /* pitch */, const uint8_t* blimit, const uint8_t* limit,
                                       const uint8_t* thresh, int32_t bd) {
    for (int32_t col = 0; col < 4; ++col) {
        uint16_t* const px   = s + col;
        const int8_t    mask = highbd_filter_mask2(*limit, *blimit, px[-2 * p], px[-p], px[0], px[p], bd);
        highbd_filter4(mask, *thresh, px - 2 * p, px - p, px, px + p, bd);
    }
}
490
491
// Runs highbd_filter4() on 4 consecutive rows of 16-bit samples starting at
// `s`, advancing by `pitch` per row. Samples on each side of the edge are
// adjacent in memory.
void svt_aom_highbd_lpf_vertical_4_c(uint16_t* s, int32_t pitch, const uint8_t* blimit, const uint8_t* limit,
                                     const uint8_t* thresh, int32_t bd) {
    uint16_t* row = s;
    for (int32_t r = 0; r < 4; ++r) {
        const int8_t mask = highbd_filter_mask2(*limit, *blimit, row[-2], row[-1], row[0], row[1], bd);
        highbd_filter4(mask, *thresh, row - 2, row - 1, row, row + 1, bd);
        row += pitch;
    }
}
506
507
// 16-bit sample version of filter8(). When both `flat` and `mask` are non-zero
// the inner three samples on each side are replaced by weighted averages
// (weights sum to 8); otherwise it falls back to highbd_filter4().
static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat, uint16_t* op3, uint16_t* op2, uint16_t* op1,
                                  uint16_t* op0, uint16_t* oq0, uint16_t* oq1, uint16_t* oq2, uint16_t* oq3,
                                  int32_t bd) {
    if (!(flat && mask)) {
        highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
        return;
    }

    // Snapshot all inputs before any output is written.
    const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
    const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;

    *op2 = ROUND_POWER_OF_TWO(3 * p3 + 2 * p2 + p1 + p0 + q0, 3);
    *op1 = ROUND_POWER_OF_TWO(2 * p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
    *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
    *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
    *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3, 3);
    *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + 3 * q3, 3);
}
525
526
// Runs highbd_filter8() on 4 consecutive columns of 16-bit samples starting at
// `s`. Samples on each side of the edge are addressed at multiples of the
// pitch `p`; the flatness threshold is fixed at 1 (before bit-depth scaling).
void svt_aom_highbd_lpf_horizontal_8_c(uint16_t* s, int32_t p, const uint8_t* blimit, const uint8_t* limit,
                                       const uint8_t* thresh, int32_t bd) {
    for (int32_t col = 0; col < 4; ++col) {
        uint16_t* const px = s + col;
        const uint16_t  p3 = px[-4 * p], p2 = px[-3 * p], p1 = px[-2 * p], p0 = px[-p];
        const uint16_t  q0 = px[0], q1 = px[p], q2 = px[2 * p], q3 = px[3 * p];

        const int8_t mask = highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
        const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
        highbd_filter8(
            mask, *thresh, flat, px - 4 * p, px - 3 * p, px - 2 * p, px - p, px, px + p, px + 2 * p, px + 3 * p, bd);
    }
}
544
545
// Runs highbd_filter8() on 4 consecutive rows of 16-bit samples starting at
// `s`, advancing by `pitch` per row. Samples on each side of the edge are
// adjacent in memory; the flatness threshold is fixed at 1 (before bit-depth
// scaling).
void svt_aom_highbd_lpf_vertical_8_c(uint16_t* s, int32_t pitch, const uint8_t* blimit, const uint8_t* limit,
                                     const uint8_t* thresh, int32_t bd) {
    uint16_t* row = s;
    for (int32_t r = 0; r < 4; ++r) {
        const uint16_t p3 = row[-4], p2 = row[-3], p1 = row[-2], p0 = row[-1];
        const uint16_t q0 = row[0], q1 = row[1], q2 = row[2], q3 = row[3];

        const int8_t mask = highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
        const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
        highbd_filter8(mask, *thresh, flat, row - 4, row - 3, row - 2, row - 1, row, row + 1, row + 2, row + 3, bd);
        row += pitch;
    }
}
559
560
//**********************************************************************************************************************//
561
562
//static const SEG_LVL_FEATURES seg_lvl_lf_lut[MAX_PLANES][2] = {
563
//    { SEG_LVL_ALT_LF_Y_V, SEG_LVL_ALT_LF_Y_H },
564
//    { SEG_LVL_ALT_LF_U, SEG_LVL_ALT_LF_U },
565
//    { SEG_LVL_ALT_LF_V, SEG_LVL_ALT_LF_V }
566
//};
567
568
548
// Rebuilds the per-level limit tables in `lfi` for the given sharpness.
// For every level 0..MAX_LOOP_FILTER it fills SIMD_WIDTH bytes of:
//   lim   - the inside-block limit (level shifted down by sharpness, capped at
//           9 - sharpness_lvl when sharpness is non-zero, never below 1)
//   mblim - 2 * (level + 2) + that inside-block limit
void svt_aom_update_sharpness(LoopFilterInfoN* lfi, int32_t sharpness_lvl) {
    // Shift is 0 for sharpness 0, 1 for sharpness 1..4 and 2 above that.
    const int32_t shift = (sharpness_lvl > 0) + (sharpness_lvl > 4);

    for (int32_t lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) {
        int32_t inside_limit = lvl >> shift;

        if (sharpness_lvl > 0 && inside_limit > 9 - sharpness_lvl) {
            inside_limit = 9 - sharpness_lvl;
        }
        if (inside_limit < 1) {
            inside_limit = 1;
        }

        memset(lfi->lfthr[lvl].lim, inside_limit, SIMD_WIDTH);
        memset(lfi->lfthr[lvl].mblim, 2 * (lvl + 2) + inside_limit, SIMD_WIDTH);
    }
}
590
591
// Seven-samples-per-side edge filter on 16-bit samples. When `flat2`, `flat`
// and `mask` are all non-zero, the inner six samples on each side are replaced
// by weighted averages (weights sum to 16); the outermost samples p6/q6 are
// read but never written. Otherwise it falls back to highbd_filter8().
static INLINE void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat, int8_t flat2, uint16_t* op6, uint16_t* op5,
                                   uint16_t* op4, uint16_t* op3, uint16_t* op2, uint16_t* op1, uint16_t* op0,
                                   uint16_t* oq0, uint16_t* oq1, uint16_t* oq2, uint16_t* oq3, uint16_t* oq4,
                                   uint16_t* oq5, uint16_t* oq6, int bd) {
    if (!(flat2 && flat && mask)) {
        highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3, bd);
        return;
    }

    // Snapshot all inputs before any output is written.
    const uint16_t p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
    const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4, q5 = *oq5, q6 = *oq6;

    *op5 = ROUND_POWER_OF_TWO(7 * p6 + 2 * p5 + 2 * p4 + p3 + p2 + p1 + p0 + q0, 4);
    *op4 = ROUND_POWER_OF_TWO(5 * p6 + 2 * p5 + 2 * p4 + 2 * p3 + p2 + p1 + p0 + q0 + q1, 4);
    *op3 = ROUND_POWER_OF_TWO(4 * p6 + p5 + 2 * p4 + 2 * p3 + 2 * p2 + p1 + p0 + q0 + q1 + q2, 4);
    *op2 = ROUND_POWER_OF_TWO(3 * p6 + p5 + p4 + 2 * p3 + 2 * p2 + 2 * p1 + p0 + q0 + q1 + q2 + q3, 4);
    *op1 = ROUND_POWER_OF_TWO(2 * p6 + p5 + p4 + p3 + 2 * p2 + 2 * p1 + 2 * p0 + q0 + q1 + q2 + q3 + q4, 4);
    *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + q2 + q3 + q4 + q5, 4);
    *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + q3 + q4 + q5 + q6, 4);
    *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + 2 * q0 + 2 * q1 + 2 * q2 + q3 + q4 + q5 + 2 * q6, 4);
    *oq2 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + 2 * q1 + 2 * q2 + 2 * q3 + q4 + q5 + 3 * q6, 4);
    *oq3 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + 2 * q2 + 2 * q3 + 2 * q4 + q5 + 4 * q6, 4);
    *oq4 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + 2 * q3 + 2 * q4 + 2 * q5 + 5 * q6, 4);
    *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + 2 * q4 + 2 * q5 + 7 * q6, 4);
}
628
629
/*
 * Runs the wide high bit-depth filter across a horizontal edge.
 *
 * s      - points at the q0 tap of the first column; p-side taps are read at
 *          s[-1 * p] .. s[-7 * p] and q-side taps at s[0] .. s[6 * p].
 * p      - element offset between two consecutive taps of the same column.
 * blimit, limit, thresh - pointers whose first byte is read for every column and
 *          passed to the mask / filter helpers.
 * count  - number of 4-column groups; 4 * count adjacent columns are processed.
 * bd     - bit depth, forwarded unchanged to the helpers.
 */
static void highbd_mb_lpf_horizontal_edge_w(uint16_t* s, int p, const uint8_t* blimit, const uint8_t* limit,
                                            const uint8_t* thresh, int count, int bd) {
    const int total_cols = 4 * count;

    for (int col = 0; col < total_cols; ++col, ++s) {
        // Tap addresses for this column: tp6 is the farthest p-side tap, tq6 the farthest q-side tap.
        uint16_t* const tp6 = s - 7 * p;
        uint16_t* const tp5 = s - 6 * p;
        uint16_t* const tp4 = s - 5 * p;
        uint16_t* const tp3 = s - 4 * p;
        uint16_t* const tp2 = s - 3 * p;
        uint16_t* const tp1 = s - 2 * p;
        uint16_t* const tp0 = s - 1 * p;
        uint16_t* const tq0 = s;
        uint16_t* const tq1 = s + 1 * p;
        uint16_t* const tq2 = s + 2 * p;
        uint16_t* const tq3 = s + 3 * p;
        uint16_t* const tq4 = s + 4 * p;
        uint16_t* const tq5 = s + 5 * p;
        uint16_t* const tq6 = s + 6 * p;

        // Edge mask and inner flatness are evaluated on the four taps nearest the edge on each side.
        const int8_t mask = highbd_filter_mask(*limit, *blimit, *tp3, *tp2, *tp1, *tp0, *tq0, *tq1, *tq2, *tq3, bd);
        const int8_t flat = highbd_flat_mask4(1, *tp3, *tp2, *tp1, *tp0, *tq0, *tq1, *tq2, *tq3, bd);

        // Outer flatness compares the three farthest taps of each side against p0 / q0.
        const int8_t flat2 = highbd_flat_mask4(1, *tp6, *tp5, *tp4, *tp0, *tq0, *tq4, *tq5, *tq6, bd);

        highbd_filter14(
            mask, *thresh, flat, flat2, tp6, tp5, tp4, tp3, tp2, tp1, tp0, tq0, tq1, tq2, tq3, tq4, tq5, tq6, bd);
    }
}
673
674
/*
 * C implementation of the 14-tap high bit-depth horizontal-edge loop filter.
 * Delegates to highbd_mb_lpf_horizontal_edge_w() with a single 4-column group.
 */
void svt_aom_highbd_lpf_horizontal_14_c(uint16_t* s, int pitch, const uint8_t* blimit, const uint8_t* limit,
                                        const uint8_t* thresh, int bd) {
    const int num_groups = 1; // one group of 4 columns

    highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, num_groups, bd);
}
678
679
/*
 * Builds the edge mask for the 6-tap high bit-depth filter.
 *
 * limit and blimit are scaled to the working bit depth by a left shift of (bd - 8).
 * Returns -1 (all bits set) when every neighbouring-tap difference is within the scaled
 * limit and the cross-edge term |p0 - q0| * 2 + |p1 - q1| / 2 is within the scaled
 * blimit; returns 0 as soon as any of those bounds is exceeded.
 */
static INLINE int8_t highbd_filter_mask3_chroma(uint8_t limit, uint8_t blimit, uint16_t p2, uint16_t p1, uint16_t p0,
                                                uint16_t q0, uint16_t q1, uint16_t q2, int bd) {
    const int16_t scaled_limit  = (int16_t)((uint16_t)limit << (bd - 8));
    const int16_t scaled_blimit = (int16_t)((uint16_t)blimit << (bd - 8));

    // Differences between adjacent taps on each side of the edge.
    if (abs(p2 - p1) > scaled_limit || abs(p1 - p0) > scaled_limit) {
        return 0;
    }
    if (abs(q1 - q0) > scaled_limit || abs(q2 - q1) > scaled_limit) {
        return 0;
    }

    // Combined step across the edge itself.
    if (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > scaled_blimit) {
        return 0;
    }

    return -1;
}
691
692
/*
 * 6-tap high bit-depth filter for one line of samples across an edge.
 *
 * When both flat and mask are non-zero, op1, op0, oq0 and oq1 are overwritten with a
 * 5-tap [1, 2, 2, 2, 1] smoothing of the six input taps (op2 and oq2 are only read).
 * Otherwise the four inner taps are handed to highbd_filter4() together with mask,
 * thresh and bd.
 */
static INLINE void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat, uint16_t* op2, uint16_t* op1, uint16_t* op0,
                                  uint16_t* oq0, uint16_t* oq1, uint16_t* oq2, int bd) {
    if (!(flat && mask)) {
        highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
        return;
    }

    // Snapshot every tap before any output is written: a* are p-side taps, b* are q-side taps.
    const int a2 = *op2;
    const int a1 = *op1;
    const int a0 = *op0;
    const int b0 = *oq0;
    const int b1 = *oq1;
    const int b2 = *oq2;

    // Each weight set sums to 8, hence the rounding shift by 3.
    *op1 = ROUND_POWER_OF_TWO(3 * a2 + 2 * (a1 + a0) + b0, 3);
    *op0 = ROUND_POWER_OF_TWO(a2 + 2 * (a1 + a0 + b0) + b1, 3);
    *oq0 = ROUND_POWER_OF_TWO(a1 + 2 * (a0 + b0 + b1) + b2, 3);
    *oq1 = ROUND_POWER_OF_TWO(a0 + 2 * (b0 + b1) + 3 * b2, 3);
}
707
708
/*
 * C implementation of the 6-tap high bit-depth loop filter across a vertical edge.
 *
 * s points at the q0 sample of the first row; taps s[-3] .. s[2] of each row are
 * examined. Four rows are processed, stepping s by pitch elements between rows.
 * The first byte behind blimit, limit and thresh is read for every row.
 */
void svt_aom_highbd_lpf_vertical_6_c(uint16_t* s, int pitch, const uint8_t* blimit, const uint8_t* limit,
                                     const uint8_t* thresh, int bd) {
    int rows_left = 4;

    while (rows_left-- > 0) {
        const uint16_t left2 = s[-3];
        const uint16_t left1 = s[-2];
        const uint16_t left0 = s[-1];
        const uint16_t right0 = s[0];
        const uint16_t right1 = s[1];
        const uint16_t right2 = s[2];

        const int8_t mask = highbd_filter_mask3_chroma(*limit, *blimit, left2, left1, left0, right0, right1, right2, bd);
        const int8_t flat = highbd_flat_mask3_chroma(1, left2, left1, left0, right0, right1, right2, bd);

        highbd_filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2, bd);

        s += pitch;
    }
}
722
723
/*
 * C implementation of the 6-tap high bit-depth loop filter across a horizontal edge.
 *
 * s points at the q0 sample of the first column; taps s[-3 * p] .. s[2 * p] of each
 * column are examined. Four adjacent columns are processed, advancing s by one element
 * per column. The first byte behind blimit, limit and thresh is read for every column.
 */
void svt_aom_highbd_lpf_horizontal_6_c(uint16_t* s, int p, const uint8_t* blimit, const uint8_t* limit,
                                       const uint8_t* thresh, int bd) {
    int cols_left = 4;

    while (cols_left-- > 0) {
        // Tap addresses for this column: "above" is the p side, "below" the q side.
        uint16_t* const above2 = s - 3 * p;
        uint16_t* const above1 = s - 2 * p;
        uint16_t* const above0 = s - 1 * p;
        uint16_t* const below0 = s;
        uint16_t* const below1 = s + 1 * p;
        uint16_t* const below2 = s + 2 * p;

        const int8_t mask = highbd_filter_mask3_chroma(
            *limit, *blimit, *above2, *above1, *above0, *below0, *below1, *below2, bd);
        const int8_t flat = highbd_flat_mask3_chroma(1, *above2, *above1, *above0, *below0, *below1, *below2, bd);

        highbd_filter6(mask, *thresh, flat, above2, above1, above0, below0, below1, below2, bd);

        ++s;
    }
}
740
741
/*
 * Runs the wide high bit-depth filter across a vertical edge.
 *
 * s      - points at the q0 tap of the first row; p-side taps are read at s[-1] .. s[-7]
 *          and q-side taps at s[0] .. s[6].
 * p      - element offset applied to s after each row.
 * blimit, limit, thresh - pointers whose first byte is read for every row and passed to
 *          the mask / filter helpers.
 * count  - number of rows to process.
 * bd     - bit depth, forwarded unchanged to the helpers.
 */
static void highbd_mb_lpf_vertical_edge_w(uint16_t* s, int p, const uint8_t* blimit, const uint8_t* limit,
                                          const uint8_t* thresh, int count, int bd) {
    for (int row = 0; row < count; ++row, s += p) {
        // Tap addresses for this row: tp6 is the farthest p-side tap, tq6 the farthest q-side tap.
        uint16_t* const tp6 = s - 7;
        uint16_t* const tp5 = s - 6;
        uint16_t* const tp4 = s - 5;
        uint16_t* const tp3 = s - 4;
        uint16_t* const tp2 = s - 3;
        uint16_t* const tp1 = s - 2;
        uint16_t* const tp0 = s - 1;
        uint16_t* const tq0 = s;
        uint16_t* const tq1 = s + 1;
        uint16_t* const tq2 = s + 2;
        uint16_t* const tq3 = s + 3;
        uint16_t* const tq4 = s + 4;
        uint16_t* const tq5 = s + 5;
        uint16_t* const tq6 = s + 6;

        // Edge mask and inner flatness use the four taps nearest the edge on each side.
        const int8_t mask = highbd_filter_mask(*limit, *blimit, *tp3, *tp2, *tp1, *tp0, *tq0, *tq1, *tq2, *tq3, bd);
        const int8_t flat = highbd_flat_mask4(1, *tp3, *tp2, *tp1, *tp0, *tq0, *tq1, *tq2, *tq3, bd);

        // Outer flatness compares the three farthest taps of each side against p0 / q0.
        const int8_t flat2 = highbd_flat_mask4(1, *tp6, *tp5, *tp4, *tp0, *tq0, *tq4, *tq5, *tq6, bd);

        highbd_filter14(
            mask, *thresh, flat, flat2, tp6, tp5, tp4, tp3, tp2, tp1, tp0, tq0, tq1, tq2, tq3, tq4, tq5, tq6, bd);
    }
}
780
781
/*
 * C implementation of the 14-tap high bit-depth vertical-edge loop filter.
 * Delegates to highbd_mb_lpf_vertical_edge_w() for 4 rows.
 */
void svt_aom_highbd_lpf_vertical_14_c(uint16_t* s, int p, const uint8_t* blimit, const uint8_t* limit,
                                      const uint8_t* thresh, int bd) {
    const int num_rows = 4;

    highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, num_rows, bd);
}
785
786
/*
 * 14-tap 8-bit filter for one line of samples across an edge.
 *
 * When flat2, flat and mask are all non-zero, the twelve taps op5 .. oq5 are overwritten
 * with a 13-tap [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1] smoothing of the fourteen input
 * taps (op6 and oq6 are only read). Otherwise the eight inner taps are handed to
 * filter8() together with mask, thresh and flat.
 */
static INLINE void filter14(int8_t mask, uint8_t thresh, int8_t flat, int8_t flat2, uint8_t* op6, uint8_t* op5,
                            uint8_t* op4, uint8_t* op3, uint8_t* op2, uint8_t* op1, uint8_t* op0, uint8_t* oq0,
                            uint8_t* oq1, uint8_t* oq2, uint8_t* oq3, uint8_t* oq4, uint8_t* oq5, uint8_t* oq6) {
    if (!(flat2 && flat && mask)) {
        filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
        return;
    }

    // Snapshot every tap before any output is written: a* are p-side taps, b* are q-side taps.
    const int a6 = *op6;
    const int a5 = *op5;
    const int a4 = *op4;
    const int a3 = *op3;
    const int a2 = *op2;
    const int a1 = *op1;
    const int a0 = *op0;
    const int b0 = *oq0;
    const int b1 = *oq1;
    const int b2 = *oq2;
    const int b3 = *oq3;
    const int b4 = *oq4;
    const int b5 = *oq5;
    const int b6 = *oq6;

    // Each weight set sums to 16, hence the rounding shift by 4. Taps that would fall
    // beyond a6 / b6 are folded into the a6 / b6 weight.
    *op5 = ROUND_POWER_OF_TWO(7 * a6 + 2 * (a5 + a4) + a3 + a2 + a1 + a0 + b0, 4);
    *op4 = ROUND_POWER_OF_TWO(5 * a6 + 2 * (a5 + a4 + a3) + a2 + a1 + a0 + b0 + b1, 4);
    *op3 = ROUND_POWER_OF_TWO(4 * a6 + a5 + 2 * (a4 + a3 + a2) + a1 + a0 + b0 + b1 + b2, 4);
    *op2 = ROUND_POWER_OF_TWO(3 * a6 + a5 + a4 + 2 * (a3 + a2 + a1) + a0 + b0 + b1 + b2 + b3, 4);
    *op1 = ROUND_POWER_OF_TWO(2 * a6 + a5 + a4 + a3 + 2 * (a2 + a1 + a0) + b0 + b1 + b2 + b3 + b4, 4);
    *op0 = ROUND_POWER_OF_TWO(a6 + a5 + a4 + a3 + a2 + 2 * (a1 + a0 + b0) + b1 + b2 + b3 + b4 + b5, 4);
    *oq0 = ROUND_POWER_OF_TWO(a5 + a4 + a3 + a2 + a1 + 2 * (a0 + b0 + b1) + b2 + b3 + b4 + b5 + b6, 4);
    *oq1 = ROUND_POWER_OF_TWO(a4 + a3 + a2 + a1 + a0 + 2 * (b0 + b1 + b2) + b3 + b4 + b5 + 2 * b6, 4);
    *oq2 = ROUND_POWER_OF_TWO(a3 + a2 + a1 + a0 + b0 + 2 * (b1 + b2 + b3) + b4 + b5 + 3 * b6, 4);
    *oq3 = ROUND_POWER_OF_TWO(a2 + a1 + a0 + b0 + b1 + 2 * (b2 + b3 + b4) + b5 + 4 * b6, 4);
    *oq4 = ROUND_POWER_OF_TWO(a1 + a0 + b0 + b1 + b2 + 2 * (b3 + b4 + b5) + 5 * b6, 4);
    *oq5 = ROUND_POWER_OF_TWO(a0 + b0 + b1 + b2 + b3 + 2 * (b4 + b5) + 7 * b6, 4);
}
810
811
/*
 * Runs the wide 8-bit filter across a horizontal edge.
 *
 * s      - points at the q0 tap of the first column; p-side taps are read at
 *          s[-1 * p] .. s[-7 * p] and q-side taps at s[0] .. s[6 * p].
 * p      - element offset between two consecutive taps of the same column.
 * blimit, limit, thresh - pointers whose first byte is read for every column and
 *          passed to the mask / filter helpers.
 * count  - number of 4-column groups; 4 * count adjacent columns are processed.
 */
static void mb_lpf_horizontal_edge_w(uint8_t* s, int p, const uint8_t* blimit, const uint8_t* limit,
                                     const uint8_t* thresh, int count) {
    const int total_cols = 4 * count;

    for (int col = 0; col < total_cols; ++col, ++s) {
        // Tap addresses for this column: tp6 is the farthest p-side tap, tq6 the farthest q-side tap.
        uint8_t* const tp6 = s - 7 * p;
        uint8_t* const tp5 = s - 6 * p;
        uint8_t* const tp4 = s - 5 * p;
        uint8_t* const tp3 = s - 4 * p;
        uint8_t* const tp2 = s - 3 * p;
        uint8_t* const tp1 = s - 2 * p;
        uint8_t* const tp0 = s - 1 * p;
        uint8_t* const tq0 = s;
        uint8_t* const tq1 = s + 1 * p;
        uint8_t* const tq2 = s + 2 * p;
        uint8_t* const tq3 = s + 3 * p;
        uint8_t* const tq4 = s + 4 * p;
        uint8_t* const tq5 = s + 5 * p;
        uint8_t* const tq6 = s + 6 * p;

        // Edge mask and inner flatness use the four taps nearest the edge on each side.
        const int8_t mask = filter_mask(*limit, *blimit, *tp3, *tp2, *tp1, *tp0, *tq0, *tq1, *tq2, *tq3);
        const int8_t flat = flat_mask4(1, *tp3, *tp2, *tp1, *tp0, *tq0, *tq1, *tq2, *tq3);

        // Outer flatness compares the three farthest taps of each side against p0 / q0.
        const int8_t flat2 = flat_mask4(1, *tp6, *tp5, *tp4, *tp0, *tq0, *tq4, *tq5, *tq6);

        filter14(mask, *thresh, flat, flat2, tp6, tp5, tp4, tp3, tp2, tp1, tp0, tq0, tq1, tq2, tq3, tq4, tq5, tq6);
    }
}
848
849
/*
 * C implementation of the 14-tap 8-bit horizontal-edge loop filter.
 * Delegates to mb_lpf_horizontal_edge_w() with a single 4-column group.
 */
void svt_aom_lpf_horizontal_14_c(uint8_t* s, int p, const uint8_t* blimit, const uint8_t* limit,
                                 const uint8_t* thresh) {
    const int num_groups = 1; // one group of 4 columns

    mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, num_groups);
}
853
854
/*
 * Runs the wide 8-bit filter across a vertical edge.
 *
 * s      - points at the q0 tap of the first row; p-side taps are read at s[-1] .. s[-7]
 *          and q-side taps at s[0] .. s[6].
 * p      - element offset applied to s after each row.
 * blimit, limit, thresh - pointers whose first byte is read for every row and passed to
 *          the mask / filter helpers.
 * count  - number of rows to process.
 */
static void mb_lpf_vertical_edge_w(uint8_t* s, int p, const uint8_t* blimit, const uint8_t* limit,
                                   const uint8_t* thresh, int count) {
    for (int row = 0; row < count; ++row, s += p) {
        // Tap addresses for this row: tp6 is the farthest p-side tap, tq6 the farthest q-side tap.
        uint8_t* const tp6 = s - 7;
        uint8_t* const tp5 = s - 6;
        uint8_t* const tp4 = s - 5;
        uint8_t* const tp3 = s - 4;
        uint8_t* const tp2 = s - 3;
        uint8_t* const tp1 = s - 2;
        uint8_t* const tp0 = s - 1;
        uint8_t* const tq0 = s;
        uint8_t* const tq1 = s + 1;
        uint8_t* const tq2 = s + 2;
        uint8_t* const tq3 = s + 3;
        uint8_t* const tq4 = s + 4;
        uint8_t* const tq5 = s + 5;
        uint8_t* const tq6 = s + 6;

        // Edge mask and inner flatness use the four taps nearest the edge on each side.
        const int8_t mask = filter_mask(*limit, *blimit, *tp3, *tp2, *tp1, *tp0, *tq0, *tq1, *tq2, *tq3);
        const int8_t flat = flat_mask4(1, *tp3, *tp2, *tp1, *tp0, *tq0, *tq1, *tq2, *tq3);

        // Outer flatness compares the three farthest taps of each side against p0 / q0.
        const int8_t flat2 = flat_mask4(1, *tp6, *tp5, *tp4, *tp0, *tq0, *tq4, *tq5, *tq6);

        filter14(mask, *thresh, flat, flat2, tp6, tp5, tp4, tp3, tp2, tp1, tp0, tq0, tq1, tq2, tq3, tq4, tq5, tq6);
    }
}
886
887
0
/*
 * C implementation of the 14-tap 8-bit vertical-edge loop filter.
 * Delegates to mb_lpf_vertical_edge_w() for 4 rows.
 */
void svt_aom_lpf_vertical_14_c(uint8_t* s, int p, const uint8_t* blimit, const uint8_t* limit, const uint8_t* thresh) {
    const int num_rows = 4;

    mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, num_rows);
}