Coverage Report

Created: 2025-08-28 07:12

/src/ffmpeg/libavcodec/h264_mb.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * H.26L/H.264/AVC/JVT/14496-10/... decoder
3
 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
4
 *
5
 * This file is part of FFmpeg.
6
 *
7
 * FFmpeg is free software; you can redistribute it and/or
8
 * modify it under the terms of the GNU Lesser General Public
9
 * License as published by the Free Software Foundation; either
10
 * version 2.1 of the License, or (at your option) any later version.
11
 *
12
 * FFmpeg is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
 * Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public
18
 * License along with FFmpeg; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
 */
21
22
/**
23
 * @file
24
 * H.264 / AVC / MPEG-4 part10 macroblock decoding
25
 */
26
27
#include <stdint.h>
28
29
#include "config.h"
30
31
#include "libavutil/common.h"
32
#include "libavutil/intreadwrite.h"
33
#include "avcodec.h"
34
#include "h264dec.h"
35
#include "h264_ps.h"
36
#include "qpeldsp.h"
37
#include "rectangle.h"
38
#include "threadframe.h"
39
40
/**
 * Compute the lowest reference row touched by one partition for one list.
 *
 * The vertical motion vector component of block n is read from the MV cache;
 * its integer part (value >> 2) is added to y_offset and the partition
 * height. When the low two bits of the vector are non-zero, three extra rows
 * are added (the original names this margin "filter_height_down").
 *
 * @param sl       slice context holding the motion vector cache
 * @param n        block index, mapped through scan8[]
 * @param height   partition height in rows, must be >= 0
 * @param y_offset row offset of the partition
 * @param list     reference list index used to select the MV cache
 * @return the computed bottom row, clamped so it is never negative
 */
static inline int get_lowest_part_list_y(H264SliceContext *sl,
                                         int n, int height, int y_offset, int list)
{
    const int mv_y = sl->mv_cache[list][scan8[n]][1];
    int lowest_row = (mv_y >> 2) + y_offset;

    /* a fractional vertical vector needs three more rows below the block */
    if (mv_y & 3)
        lowest_row += 3;
    lowest_row += height;

    av_assert2(height >= 0);

    return lowest_row < 0 ? 0 : lowest_row;
}
52
53
/**
 * Record, per reference picture, the lowest row one partition needs.
 *
 * For each requested list the reference index of block n is looked up and
 * the bottom row reported by get_lowest_part_list_y() is merged (as a
 * maximum) into refs[list][ref_n]. nrefs[list] is incremented the first time
 * an entry of that list leaves its negative "unused" state.
 *
 * @param h        decoder context (current picture, picture structure)
 * @param sl       slice context (reference cache, reference lists, MB position)
 * @param refs     per-list, per-reference lowest row; negative means unused
 * @param n        block index, mapped through scan8[]
 * @param height   partition height in rows
 * @param y_offset row offset of the partition inside the macroblock
 * @param list0    non-zero if the partition predicts from list 0
 * @param list1    non-zero if the partition predicts from list 1
 * @param nrefs    per-list count of distinct references recorded in refs
 */
static inline void get_lowest_part_y(const H264Context *h, H264SliceContext *sl,
                                     int16_t refs[2][48], int n,
                                     int height, int y_offset, int list0,
                                     int list1, int *nrefs)
{
    const int list_used[2] = { list0, list1 };
    int list;

    y_offset += 16 * (sl->mb_y >> MB_FIELD(sl));

    for (list = 0; list < 2; list++) {
        H264Ref *ref;
        int ref_n, lowest;

        if (!list_used[list])
            continue;

        ref_n = sl->ref_cache[list][scan8[n]];
        ref   = &sl->ref_list[list][ref_n];

        // Error resilience puts the current picture in the ref list.
        // Don't try to wait on these as it will cause a deadlock.
        // Fields can wait on each other, though.
        if (ref->parent->tf.progress == h->cur_pic.tf.progress &&
            (ref->reference & 3) == h->picture_structure)
            continue;

        lowest = get_lowest_part_list_y(sl, n, height, y_offset, list);
        if (refs[list][ref_n] < 0)
            nrefs[list] += 1;
        refs[list][ref_n] = FFMAX(refs[list][ref_n], lowest);
    }
}
91
92
/**
93
 * Wait until all reference frames are available for MC operations.
94
 *
95
 * @param h the H.264 context
96
 */
97
static void await_references(const H264Context *h, H264SliceContext *sl)
98
0
{
99
0
    const int mb_xy   = sl->mb_xy;
100
0
    const int mb_type = h->cur_pic.mb_type[mb_xy];
101
0
    int16_t refs[2][48];
102
0
    int nrefs[2] = { 0 };
103
0
    int ref, list;
104
105
0
    memset(refs, -1, sizeof(refs));
106
107
0
    if (IS_16X16(mb_type)) {
108
0
        get_lowest_part_y(h, sl, refs, 0, 16, 0,
109
0
                          IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs);
110
0
    } else if (IS_16X8(mb_type)) {
111
0
        get_lowest_part_y(h, sl, refs, 0, 8, 0,
112
0
                          IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs);
113
0
        get_lowest_part_y(h, sl, refs, 8, 8, 8,
114
0
                          IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), nrefs);
115
0
    } else if (IS_8X16(mb_type)) {
116
0
        get_lowest_part_y(h, sl, refs, 0, 16, 0,
117
0
                          IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs);
118
0
        get_lowest_part_y(h, sl, refs, 4, 16, 0,
119
0
                          IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), nrefs);
120
0
    } else {
121
0
        int i;
122
123
0
        av_assert2(IS_8X8(mb_type));
124
125
0
        for (i = 0; i < 4; i++) {
126
0
            const int sub_mb_type = sl->sub_mb_type[i];
127
0
            const int n           = 4 * i;
128
0
            int y_offset          = (i & 2) << 2;
129
130
0
            if (IS_SUB_8X8(sub_mb_type)) {
131
0
                get_lowest_part_y(h, sl, refs, n, 8, y_offset,
132
0
                                  IS_DIR(sub_mb_type, 0, 0),
133
0
                                  IS_DIR(sub_mb_type, 0, 1),
134
0
                                  nrefs);
135
0
            } else if (IS_SUB_8X4(sub_mb_type)) {
136
0
                get_lowest_part_y(h, sl, refs, n, 4, y_offset,
137
0
                                  IS_DIR(sub_mb_type, 0, 0),
138
0
                                  IS_DIR(sub_mb_type, 0, 1),
139
0
                                  nrefs);
140
0
                get_lowest_part_y(h, sl, refs, n + 2, 4, y_offset + 4,
141
0
                                  IS_DIR(sub_mb_type, 0, 0),
142
0
                                  IS_DIR(sub_mb_type, 0, 1),
143
0
                                  nrefs);
144
0
            } else if (IS_SUB_4X8(sub_mb_type)) {
145
0
                get_lowest_part_y(h, sl, refs, n, 8, y_offset,
146
0
                                  IS_DIR(sub_mb_type, 0, 0),
147
0
                                  IS_DIR(sub_mb_type, 0, 1),
148
0
                                  nrefs);
149
0
                get_lowest_part_y(h, sl, refs, n + 1, 8, y_offset,
150
0
                                  IS_DIR(sub_mb_type, 0, 0),
151
0
                                  IS_DIR(sub_mb_type, 0, 1),
152
0
                                  nrefs);
153
0
            } else {
154
0
                int j;
155
0
                av_assert2(IS_SUB_4X4(sub_mb_type));
156
0
                for (j = 0; j < 4; j++) {
157
0
                    int sub_y_offset = y_offset + 2 * (j & 2);
158
0
                    get_lowest_part_y(h, sl, refs, n + j, 4, sub_y_offset,
159
0
                                      IS_DIR(sub_mb_type, 0, 0),
160
0
                                      IS_DIR(sub_mb_type, 0, 1),
161
0
                                      nrefs);
162
0
                }
163
0
            }
164
0
        }
165
0
    }
166
167
0
    for (list = sl->list_count - 1; list >= 0; list--)
168
0
        for (ref = 0; ref < 48 && nrefs[list]; ref++) {
169
0
            int row = refs[list][ref];
170
0
            if (row >= 0) {
171
0
                H264Ref *ref_pic  = &sl->ref_list[list][ref];
172
0
                int ref_field         = ref_pic->reference - 1;
173
0
                int ref_field_picture = ref_pic->parent->field_picture;
174
0
                int pic_height        = 16 * h->mb_height >> ref_field_picture;
175
176
0
                row <<= MB_MBAFF(sl);
177
0
                nrefs[list]--;
178
179
0
                if (!FIELD_PICTURE(h) && ref_field_picture) { // frame referencing two fields
180
0
                    av_assert2((ref_pic->parent->reference & 3) == 3);
181
0
                    ff_thread_await_progress(&ref_pic->parent->tf,
182
0
                                             FFMIN((row >> 1) - !(row & 1),
183
0
                                                   pic_height - 1),
184
0
                                             1);
185
0
                    ff_thread_await_progress(&ref_pic->parent->tf,
186
0
                                             FFMIN((row >> 1), pic_height - 1),
187
0
                                             0);
188
0
                } else if (FIELD_PICTURE(h) && !ref_field_picture) { // field referencing one field of a frame
189
0
                    ff_thread_await_progress(&ref_pic->parent->tf,
190
0
                                             FFMIN(row * 2 + ref_field,
191
0
                                                   pic_height - 1),
192
0
                                             0);
193
0
                } else if (FIELD_PICTURE(h)) {
194
0
                    ff_thread_await_progress(&ref_pic->parent->tf,
195
0
                                             FFMIN(row, pic_height - 1),
196
0
                                             ref_field);
197
0
                } else {
198
0
                    ff_thread_await_progress(&ref_pic->parent->tf,
199
0
                                             FFMIN(row, pic_height - 1),
200
0
                                             0);
201
0
                }
202
0
            }
203
0
        }
204
0
}
205
206
static av_always_inline void mc_dir_part(const H264Context *h, H264SliceContext *sl,
207
                                         H264Ref *pic,
208
                                         int n, int square, int height,
209
                                         int delta, int list,
210
                                         uint8_t *dest_y, uint8_t *dest_cb,
211
                                         uint8_t *dest_cr,
212
                                         int src_x_offset, int src_y_offset,
213
                                         const qpel_mc_func *qpix_op,
214
                                         h264_chroma_mc_func chroma_op,
215
                                         int pixel_shift, int chroma_idc)
216
139M
{
217
139M
    const int mx      = sl->mv_cache[list][scan8[n]][0] + src_x_offset * 8;
218
139M
    int my            = sl->mv_cache[list][scan8[n]][1] + src_y_offset * 8;
219
139M
    const int luma_xy = (mx & 3) + ((my & 3) << 2);
220
139M
    ptrdiff_t offset  = (mx >> 2) * (1 << pixel_shift) + (my >> 2) * sl->mb_linesize;
221
139M
    uint8_t *src_y    = pic->data[0] + offset;
222
139M
    uint8_t *src_cb, *src_cr;
223
139M
    int extra_width  = 0;
224
139M
    int extra_height = 0;
225
139M
    int emu = 0;
226
139M
    const int full_mx    = mx >> 2;
227
139M
    const int full_my    = my >> 2;
228
139M
    const int pic_width  = 16 * h->mb_width;
229
139M
    const int pic_height = 16 * h->mb_height >> MB_FIELD(sl);
230
139M
    int ysh;
231
232
139M
    if (mx & 7)
233
22.3M
        extra_width -= 3;
234
139M
    if (my & 7)
235
19.3M
        extra_height -= 3;
236
237
139M
    if (full_mx                <          0 - extra_width  ||
238
139M
        full_my                <          0 - extra_height ||
239
139M
        full_mx + 16 /*FIXME*/ > pic_width  + extra_width  ||
240
139M
        full_my + 16 /*FIXME*/ > pic_height + extra_height) {
241
15.8M
        h->vdsp.emulated_edge_mc(sl->edge_emu_buffer,
242
15.8M
                                 src_y - (2 << pixel_shift) - 2 * sl->mb_linesize,
243
15.8M
                                 sl->mb_linesize, sl->mb_linesize,
244
15.8M
                                 16 + 5, 16 + 5 /*FIXME*/, full_mx - 2,
245
15.8M
                                 full_my - 2, pic_width, pic_height);
246
15.8M
        src_y = sl->edge_emu_buffer + (2 << pixel_shift) + 2 * sl->mb_linesize;
247
15.8M
        emu   = 1;
248
15.8M
    }
249
250
139M
    qpix_op[luma_xy](dest_y, src_y, sl->mb_linesize); // FIXME try variable height perhaps?
251
139M
    if (!square)
252
11.5M
        qpix_op[luma_xy](dest_y + delta, src_y + delta, sl->mb_linesize);
253
254
139M
    if (CONFIG_GRAY && h->flags & AV_CODEC_FLAG_GRAY)
255
0
        return;
256
257
139M
    if (chroma_idc == 3 /* yuv444 */) {
258
13.9M
        src_cb = pic->data[1] + offset;
259
13.9M
        if (emu) {
260
1.66M
            h->vdsp.emulated_edge_mc(sl->edge_emu_buffer,
261
1.66M
                                     src_cb - (2 << pixel_shift) - 2 * sl->mb_linesize,
262
1.66M
                                     sl->mb_linesize, sl->mb_linesize,
263
1.66M
                                     16 + 5, 16 + 5 /*FIXME*/,
264
1.66M
                                     full_mx - 2, full_my - 2,
265
1.66M
                                     pic_width, pic_height);
266
1.66M
            src_cb = sl->edge_emu_buffer + (2 << pixel_shift) + 2 * sl->mb_linesize;
267
1.66M
        }
268
13.9M
        qpix_op[luma_xy](dest_cb, src_cb, sl->mb_linesize); // FIXME try variable height perhaps?
269
13.9M
        if (!square)
270
842k
            qpix_op[luma_xy](dest_cb + delta, src_cb + delta, sl->mb_linesize);
271
272
13.9M
        src_cr = pic->data[2] + offset;
273
13.9M
        if (emu) {
274
1.66M
            h->vdsp.emulated_edge_mc(sl->edge_emu_buffer,
275
1.66M
                                     src_cr - (2 << pixel_shift) - 2 * sl->mb_linesize,
276
1.66M
                                     sl->mb_linesize, sl->mb_linesize,
277
1.66M
                                     16 + 5, 16 + 5 /*FIXME*/,
278
1.66M
                                     full_mx - 2, full_my - 2,
279
1.66M
                                     pic_width, pic_height);
280
1.66M
            src_cr = sl->edge_emu_buffer + (2 << pixel_shift) + 2 * sl->mb_linesize;
281
1.66M
        }
282
13.9M
        qpix_op[luma_xy](dest_cr, src_cr, sl->mb_linesize); // FIXME try variable height perhaps?
283
13.9M
        if (!square)
284
842k
            qpix_op[luma_xy](dest_cr + delta, src_cr + delta, sl->mb_linesize);
285
13.9M
        return;
286
13.9M
    }
287
288
125M
    ysh = 3 - (chroma_idc == 2 /* yuv422 */);
289
125M
    if (chroma_idc == 1 /* yuv420 */ && MB_FIELD(sl)) {
290
        // chroma offset when predicting from a field of opposite parity
291
16.3M
        my  += 2 * ((sl->mb_y & 1) - (pic->reference - 1));
292
16.3M
        emu |= (my >> 3) < 0 || (my >> 3) + 8 >= (pic_height >> 1);
293
16.3M
    }
294
295
125M
    src_cb = pic->data[1] + ((mx >> 3) * (1 << pixel_shift)) +
296
125M
             (my >> ysh) * sl->mb_uvlinesize;
297
125M
    src_cr = pic->data[2] + ((mx >> 3) * (1 << pixel_shift)) +
298
125M
             (my >> ysh) * sl->mb_uvlinesize;
299
300
125M
    if (emu) {
301
15.2M
        h->vdsp.emulated_edge_mc(sl->edge_emu_buffer, src_cb,
302
15.2M
                                 sl->mb_uvlinesize, sl->mb_uvlinesize,
303
15.2M
                                 9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh),
304
15.2M
                                 pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */));
305
15.2M
        src_cb = sl->edge_emu_buffer;
306
15.2M
    }
307
125M
    chroma_op(dest_cb, src_cb, sl->mb_uvlinesize,
308
125M
              height >> (chroma_idc == 1 /* yuv420 */),
309
125M
              mx & 7, ((unsigned)my << (chroma_idc == 2 /* yuv422 */)) & 7);
310
311
125M
    if (emu) {
312
15.2M
        h->vdsp.emulated_edge_mc(sl->edge_emu_buffer, src_cr,
313
15.2M
                                 sl->mb_uvlinesize, sl->mb_uvlinesize,
314
15.2M
                                 9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh),
315
15.2M
                                 pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */));
316
15.2M
        src_cr = sl->edge_emu_buffer;
317
15.2M
    }
318
125M
    chroma_op(dest_cr, src_cr, sl->mb_uvlinesize, height >> (chroma_idc == 1 /* yuv420 */),
319
125M
              mx & 7, ((unsigned)my << (chroma_idc == 2 /* yuv422 */)) & 7);
320
125M
}
321
322
static av_always_inline void mc_part_std(const H264Context *h, H264SliceContext *sl,
323
                                         int n, int square,
324
                                         int height, int delta,
325
                                         uint8_t *dest_y, uint8_t *dest_cb,
326
                                         uint8_t *dest_cr,
327
                                         int x_offset, int y_offset,
328
                                         const qpel_mc_func *qpix_put,
329
                                         h264_chroma_mc_func chroma_put,
330
                                         const qpel_mc_func *qpix_avg,
331
                                         h264_chroma_mc_func chroma_avg,
332
                                         int list0, int list1,
333
                                         int pixel_shift, int chroma_idc)
334
72.2M
{
335
72.2M
    const qpel_mc_func *qpix_op   = qpix_put;
336
72.2M
    h264_chroma_mc_func chroma_op = chroma_put;
337
338
72.2M
    dest_y += (2 * x_offset << pixel_shift) + 2 * y_offset * sl->mb_linesize;
339
72.2M
    if (chroma_idc == 3 /* yuv444 */) {
340
7.49M
        dest_cb += (2 * x_offset << pixel_shift) + 2 * y_offset * sl->mb_linesize;
341
7.49M
        dest_cr += (2 * x_offset << pixel_shift) + 2 * y_offset * sl->mb_linesize;
342
64.7M
    } else if (chroma_idc == 2 /* yuv422 */) {
343
25.4M
        dest_cb += (x_offset << pixel_shift) + 2 * y_offset * sl->mb_uvlinesize;
344
25.4M
        dest_cr += (x_offset << pixel_shift) + 2 * y_offset * sl->mb_uvlinesize;
345
39.3M
    } else { /* yuv420 */
346
39.3M
        dest_cb += (x_offset << pixel_shift) + y_offset * sl->mb_uvlinesize;
347
39.3M
        dest_cr += (x_offset << pixel_shift) + y_offset * sl->mb_uvlinesize;
348
39.3M
    }
349
72.2M
    x_offset += 8 * sl->mb_x;
350
72.2M
    y_offset += 8 * (sl->mb_y >> MB_FIELD(sl));
351
352
72.2M
    if (list0) {
353
66.7M
        H264Ref *ref = &sl->ref_list[0][sl->ref_cache[0][scan8[n]]];
354
66.7M
        mc_dir_part(h, sl, ref, n, square, height, delta, 0,
355
66.7M
                    dest_y, dest_cb, dest_cr, x_offset, y_offset,
356
66.7M
                    qpix_op, chroma_op, pixel_shift, chroma_idc);
357
358
66.7M
        qpix_op   = qpix_avg;
359
66.7M
        chroma_op = chroma_avg;
360
66.7M
    }
361
362
72.2M
    if (list1) {
363
31.7M
        H264Ref *ref = &sl->ref_list[1][sl->ref_cache[1][scan8[n]]];
364
31.7M
        mc_dir_part(h, sl, ref, n, square, height, delta, 1,
365
31.7M
                    dest_y, dest_cb, dest_cr, x_offset, y_offset,
366
31.7M
                    qpix_op, chroma_op, pixel_shift, chroma_idc);
367
31.7M
    }
368
72.2M
}
369
370
static av_always_inline void mc_part_weighted(const H264Context *h, H264SliceContext *sl,
371
                                              int n, int square,
372
                                              int height, int delta,
373
                                              uint8_t *dest_y, uint8_t *dest_cb,
374
                                              uint8_t *dest_cr,
375
                                              int x_offset, int y_offset,
376
                                              const qpel_mc_func *qpix_put,
377
                                              h264_chroma_mc_func chroma_put,
378
                                              h264_weight_func luma_weight_op,
379
                                              h264_weight_func chroma_weight_op,
380
                                              h264_biweight_func luma_weight_avg,
381
                                              h264_biweight_func chroma_weight_avg,
382
                                              int list0, int list1,
383
                                              int pixel_shift, int chroma_idc)
384
36.4M
{
385
36.4M
    int chroma_height;
386
387
36.4M
    dest_y += (2 * x_offset << pixel_shift) + 2 * y_offset * sl->mb_linesize;
388
36.4M
    if (chroma_idc == 3 /* yuv444 */) {
389
3.46M
        chroma_height     = height;
390
3.46M
        chroma_weight_avg = luma_weight_avg;
391
3.46M
        chroma_weight_op  = luma_weight_op;
392
3.46M
        dest_cb += (2 * x_offset << pixel_shift) + 2 * y_offset * sl->mb_linesize;
393
3.46M
        dest_cr += (2 * x_offset << pixel_shift) + 2 * y_offset * sl->mb_linesize;
394
32.9M
    } else if (chroma_idc == 2 /* yuv422 */) {
395
12.0M
        chroma_height = height;
396
12.0M
        dest_cb      += (x_offset << pixel_shift) + 2 * y_offset * sl->mb_uvlinesize;
397
12.0M
        dest_cr      += (x_offset << pixel_shift) + 2 * y_offset * sl->mb_uvlinesize;
398
20.9M
    } else { /* yuv420 */
399
20.9M
        chroma_height = height >> 1;
400
20.9M
        dest_cb      += (x_offset << pixel_shift) + y_offset * sl->mb_uvlinesize;
401
20.9M
        dest_cr      += (x_offset << pixel_shift) + y_offset * sl->mb_uvlinesize;
402
20.9M
    }
403
36.4M
    x_offset += 8 * sl->mb_x;
404
36.4M
    y_offset += 8 * (sl->mb_y >> MB_FIELD(sl));
405
406
36.4M
    if (list0 && list1) {
407
        /* don't optimize for luma-only case, since B-frames usually
408
         * use implicit weights => chroma too. */
409
4.37M
        uint8_t *tmp_cb = sl->bipred_scratchpad;
410
4.37M
        uint8_t *tmp_cr = sl->bipred_scratchpad + (8 << pixel_shift + (chroma_idc == 3));
411
4.37M
        uint8_t *tmp_y  = sl->bipred_scratchpad + 16 * sl->mb_uvlinesize;
412
4.37M
        int refn0       = sl->ref_cache[0][scan8[n]];
413
4.37M
        int refn1       = sl->ref_cache[1][scan8[n]];
414
415
4.37M
        mc_dir_part(h, sl, &sl->ref_list[0][refn0], n, square, height, delta, 0,
416
4.37M
                    dest_y, dest_cb, dest_cr,
417
4.37M
                    x_offset, y_offset, qpix_put, chroma_put,
418
4.37M
                    pixel_shift, chroma_idc);
419
4.37M
        mc_dir_part(h, sl, &sl->ref_list[1][refn1], n, square, height, delta, 1,
420
4.37M
                    tmp_y, tmp_cb, tmp_cr,
421
4.37M
                    x_offset, y_offset, qpix_put, chroma_put,
422
4.37M
                    pixel_shift, chroma_idc);
423
424
4.37M
        if (sl->pwt.use_weight == 2) {
425
2.88M
            int weight0 = sl->pwt.implicit_weight[refn0][refn1][sl->mb_y & 1];
426
2.88M
            int weight1 = 64 - weight0;
427
2.88M
            luma_weight_avg(dest_y, tmp_y, sl->mb_linesize,
428
2.88M
                            height, 5, weight0, weight1, 0);
429
2.88M
            if (!CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
430
2.88M
                chroma_weight_avg(dest_cb, tmp_cb, sl->mb_uvlinesize,
431
2.88M
                                  chroma_height, 5, weight0, weight1, 0);
432
2.88M
                chroma_weight_avg(dest_cr, tmp_cr, sl->mb_uvlinesize,
433
2.88M
                                  chroma_height, 5, weight0, weight1, 0);
434
2.88M
            }
435
2.88M
        } else {
436
1.49M
            luma_weight_avg(dest_y, tmp_y, sl->mb_linesize, height,
437
1.49M
                            sl->pwt.luma_log2_weight_denom,
438
1.49M
                            sl->pwt.luma_weight[refn0][0][0],
439
1.49M
                            sl->pwt.luma_weight[refn1][1][0],
440
1.49M
                            sl->pwt.luma_weight[refn0][0][1] +
441
1.49M
                            sl->pwt.luma_weight[refn1][1][1]);
442
1.49M
            if (!CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
443
1.49M
                chroma_weight_avg(dest_cb, tmp_cb, sl->mb_uvlinesize, chroma_height,
444
1.49M
                                  sl->pwt.chroma_log2_weight_denom,
445
1.49M
                                  sl->pwt.chroma_weight[refn0][0][0][0],
446
1.49M
                                  sl->pwt.chroma_weight[refn1][1][0][0],
447
1.49M
                                  sl->pwt.chroma_weight[refn0][0][0][1] +
448
1.49M
                                  sl->pwt.chroma_weight[refn1][1][0][1]);
449
1.49M
                chroma_weight_avg(dest_cr, tmp_cr, sl->mb_uvlinesize, chroma_height,
450
1.49M
                                  sl->pwt.chroma_log2_weight_denom,
451
1.49M
                                  sl->pwt.chroma_weight[refn0][0][1][0],
452
1.49M
                                  sl->pwt.chroma_weight[refn1][1][1][0],
453
1.49M
                                  sl->pwt.chroma_weight[refn0][0][1][1] +
454
1.49M
                                  sl->pwt.chroma_weight[refn1][1][1][1]);
455
1.49M
            }
456
1.49M
        }
457
32.0M
    } else {
458
32.0M
        int list     = list1 ? 1 : 0;
459
32.0M
        int refn     = sl->ref_cache[list][scan8[n]];
460
32.0M
        H264Ref *ref = &sl->ref_list[list][refn];
461
32.0M
        mc_dir_part(h, sl, ref, n, square, height, delta, list,
462
32.0M
                    dest_y, dest_cb, dest_cr, x_offset, y_offset,
463
32.0M
                    qpix_put, chroma_put, pixel_shift, chroma_idc);
464
465
32.0M
        luma_weight_op(dest_y, sl->mb_linesize, height,
466
32.0M
                       sl->pwt.luma_log2_weight_denom,
467
32.0M
                       sl->pwt.luma_weight[refn][list][0],
468
32.0M
                       sl->pwt.luma_weight[refn][list][1]);
469
32.0M
        if (!CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
470
32.0M
            if (sl->pwt.use_weight_chroma) {
471
29.6M
                chroma_weight_op(dest_cb, sl->mb_uvlinesize, chroma_height,
472
29.6M
                                 sl->pwt.chroma_log2_weight_denom,
473
29.6M
                                 sl->pwt.chroma_weight[refn][list][0][0],
474
29.6M
                                 sl->pwt.chroma_weight[refn][list][0][1]);
475
29.6M
                chroma_weight_op(dest_cr, sl->mb_uvlinesize, chroma_height,
476
29.6M
                                 sl->pwt.chroma_log2_weight_denom,
477
29.6M
                                 sl->pwt.chroma_weight[refn][list][1][0],
478
29.6M
                                 sl->pwt.chroma_weight[refn][list][1][1]);
479
29.6M
            }
480
32.0M
        }
481
32.0M
    }
482
36.4M
}
483
484
/**
 * Issue prefetches into the reference picture of block 0 for one list.
 *
 * fetch pixels for estimated mv 4 macroblocks ahead
 * optimized for 64byte cache lines
 *
 * Nothing is done when block 0 has a negative reference index in this list.
 *
 * @param h           decoder context providing vdsp.prefetch
 * @param sl          slice context (MV/ref caches, MB position, line sizes)
 * @param list        reference list index
 * @param pixel_shift left-shift applied to horizontal sample offsets
 * @param chroma_idc  3 selects the yuv444 path, where both chroma planes are
 *                    prefetched at the luma offset
 */
static av_always_inline void prefetch_motion(const H264Context *h, H264SliceContext *sl,
                                             int list, int pixel_shift,
                                             int chroma_idc)
{
    const int refn = sl->ref_cache[list][scan8[0]];
    uint8_t **planes;
    int mx, my, luma_off;

    if (refn < 0)
        return;

    mx       = (sl->mv_cache[list][scan8[0]][0] >> 2) + 16 * sl->mb_x + 8;
    my       = (sl->mv_cache[list][scan8[0]][1] >> 2) + 16 * sl->mb_y;
    planes   = sl->ref_list[list][refn].data;
    luma_off = mx * (1 << pixel_shift) +
               (my + (sl->mb_x & 3) * 4) * sl->mb_linesize +
               (64 << pixel_shift);

    h->vdsp.prefetch(planes[0] + luma_off, sl->linesize, 4);

    if (chroma_idc == 3 /* yuv444 */) {
        h->vdsp.prefetch(planes[1] + luma_off, sl->linesize, 4);
        h->vdsp.prefetch(planes[2] + luma_off, sl->linesize, 4);
    } else {
        /* one call covers both chroma planes: the stride passed is the
         * distance between the Cb and Cr planes */
        const int chroma_off = ((mx >> 1) + 64) * (1 << pixel_shift) +
                               ((my >> 1) + (sl->mb_x & 7)) * sl->uvlinesize;

        h->vdsp.prefetch(planes[1] + chroma_off, planes[2] - planes[1], 2);
    }
}
508
509
/**
 * Exchange (or copy) the row above the current macroblock with the saved
 * top-border rows in sl->top_borders.
 *
 * The row directly above the macroblock in src_y / src_cb / src_cr is
 * swapped with the matching entries of sl->top_borders[top_idx] for the left
 * neighbour, the current macroblock and, when it exists, the right
 * neighbour. Nothing is touched when the top neighbour is not available
 * (deblock_top is zero), and the top-left part is skipped when
 * deblock_topleft is zero.
 *
 * @param h           decoder context (slice table, picture width, flags)
 * @param sl          slice context (MB position, top borders, deblocking mode)
 * @param src_y       luma samples of the current macroblock
 * @param src_cb      Cb samples of the current macroblock
 * @param src_cr      Cr samples of the current macroblock
 * @param linesize    luma line size in bytes
 * @param uvlinesize  chroma line size in bytes
 * @param xchg        non-zero to swap the first half of the current
 *                    macroblock's row, zero to only copy it from the border
 * @param chroma444   non-zero if the chroma planes are as wide as luma
 * @param simple      non-zero for the simple decoding path (no MBAFF handling,
 *                    chroma always processed)
 * @param pixel_shift 1 when samples take two bytes, otherwise 0
 */
static av_always_inline void xchg_mb_border(const H264Context *h, H264SliceContext *sl,
                                            uint8_t *src_y,
                                            uint8_t *src_cb, uint8_t *src_cr,
                                            int linesize, int uvlinesize,
                                            int xchg, int chroma444,
                                            int simple, int pixel_shift)
{
    uint8_t *top_border_m1;
    uint8_t *top_border;
    int deblock_topleft;
    int deblock_top;
    int top_idx = 1;

    if (!simple && FRAME_MBAFF(h)) {
        if (!(sl->mb_y & 1))
            top_idx = MB_MBAFF(sl) ? 0 : 1;
        else if (!MB_MBAFF(sl))
            return;
    }

    if (sl->deblocking_filter == 2) {
        /* only neighbours that belong to the current slice count */
        deblock_topleft = h->slice_table[sl->mb_xy - 1 - (h->mb_stride << MB_FIELD(sl))] == sl->slice_num;
        deblock_top     = sl->top_type;
    } else {
        deblock_topleft = (sl->mb_x > 0);
        deblock_top     = (sl->mb_y > !!MB_FIELD(sl));
    }

    /* step back to the sample above and to the left of the macroblock */
    src_y  -= linesize   + 1 + pixel_shift;
    src_cb -= uvlinesize + 1 + pixel_shift;
    src_cr -= uvlinesize + 1 + pixel_shift;

    top_border_m1 = sl->top_borders[top_idx][sl->mb_x - 1];
    top_border    = sl->top_borders[top_idx][sl->mb_x];

/* Swap 8 samples between border a and picture row b when xchg is set,
 * otherwise copy them from a into b. */
#define XCHG(a, b, xchg)                        \
    if (pixel_shift) {                          \
        if (xchg) {                             \
            AV_SWAP64(b + 0, a + 0);            \
            AV_SWAP64(b + 8, a + 8);            \
        } else {                                \
            AV_COPY128(b, a);                   \
        }                                       \
    } else if (xchg)                            \
        AV_SWAP64(b, a);                        \
    else                                        \
        AV_COPY64(b, a);

    if (!deblock_top)
        return;

    if (deblock_topleft) {
        XCHG(top_border_m1 + (8 << pixel_shift),
             src_y - (7 << pixel_shift), 1);
    }
    XCHG(top_border + (0 << pixel_shift), src_y + (1 << pixel_shift), xchg);
    XCHG(top_border + (8 << pixel_shift), src_y + (9 << pixel_shift), 1);
    if (sl->mb_x + 1 < h->mb_width) {
        XCHG(sl->top_borders[top_idx][sl->mb_x + 1],
             src_y + (17 << pixel_shift), 1);
    }

    /* chroma is skipped only on the non-simple path in gray mode */
    if (!simple && CONFIG_GRAY && (h->flags & AV_CODEC_FLAG_GRAY))
        return;

    if (chroma444) {
        if (deblock_topleft) {
            XCHG(top_border_m1 + (24 << pixel_shift), src_cb - (7 << pixel_shift), 1);
            XCHG(top_border_m1 + (40 << pixel_shift), src_cr - (7 << pixel_shift), 1);
        }
        XCHG(top_border + (16 << pixel_shift), src_cb + (1 << pixel_shift), xchg);
        XCHG(top_border + (24 << pixel_shift), src_cb + (9 << pixel_shift), 1);
        XCHG(top_border + (32 << pixel_shift), src_cr + (1 << pixel_shift), xchg);
        XCHG(top_border + (40 << pixel_shift), src_cr + (9 << pixel_shift), 1);
        if (sl->mb_x + 1 < h->mb_width) {
            XCHG(sl->top_borders[top_idx][sl->mb_x + 1] + (16 << pixel_shift), src_cb + (17 << pixel_shift), 1);
            XCHG(sl->top_borders[top_idx][sl->mb_x + 1] + (32 << pixel_shift), src_cr + (17 << pixel_shift), 1);
        }
    } else {
        if (deblock_topleft) {
            XCHG(top_border_m1 + (16 << pixel_shift), src_cb - (7 << pixel_shift), 1);
            XCHG(top_border_m1 + (24 << pixel_shift), src_cr - (7 << pixel_shift), 1);
        }
        XCHG(top_border + (16 << pixel_shift), src_cb + 1 + pixel_shift, 1);
        XCHG(top_border + (24 << pixel_shift), src_cr + 1 + pixel_shift, 1);
    }
}
595
596
/**
 * Read one transform coefficient from a coefficient buffer.
 *
 * @param mb             coefficient buffer; reinterpreted as an array of
 *                       32-bit values when high_bit_depth is nonzero
 * @param high_bit_depth nonzero selects 32-bit coefficient storage,
 *                       zero selects 16-bit storage
 * @param index          coefficient position, counted in coefficients of
 *                       the selected width
 * @return the value loaded by AV_RN32A() or AV_RN16A() respectively
 */
static av_always_inline int dctcoef_get(int16_t *mb, int high_bit_depth,
                                        int index)
{
    if (!high_bit_depth)
        return AV_RN16A(mb + index);

    return AV_RN32A((int32_t *)mb + index);
}
604
605
/**
 * Store one transform coefficient into a coefficient buffer.
 *
 * @param mb             coefficient buffer; reinterpreted as an array of
 *                       32-bit values when high_bit_depth is nonzero
 * @param high_bit_depth nonzero selects 32-bit coefficient storage,
 *                       zero selects 16-bit storage
 * @param index          coefficient position, counted in coefficients of
 *                       the selected width
 * @param value          value written with AV_WN32A() or AV_WN16A()
 */
static av_always_inline void dctcoef_set(int16_t *mb, int high_bit_depth,
                                         int index, int value)
{
    if (!high_bit_depth) {
        AV_WN16A(mb + index, value);
        return;
    }

    AV_WN32A((int32_t *)mb + index, value);
}
613
614
static av_always_inline void hl_decode_mb_predict_luma(const H264Context *h,
615
                                                       H264SliceContext *sl,
616
                                                       int mb_type, int simple,
617
                                                       int transform_bypass,
618
                                                       int pixel_shift,
619
                                                       const int *block_offset,
620
                                                       int linesize,
621
                                                       uint8_t *dest_y, int p)
622
3.30M
{
623
3.30M
    void (*idct_add)(uint8_t *dst, int16_t *block, int stride);
624
3.30M
    void (*idct_dc_add)(uint8_t *dst, int16_t *block, int stride);
625
3.30M
    int i;
626
3.30M
    int qscale = p == 0 ? sl->qscale : sl->chroma_qp[p - 1];
627
3.30M
    block_offset += 16 * p;
628
3.30M
    if (IS_INTRA4x4(mb_type)) {
629
2.27M
        if (IS_8x8DCT(mb_type)) {
630
1.73M
            if (transform_bypass) {
631
387k
                idct_dc_add =
632
387k
                idct_add    = h->h264dsp.h264_add_pixels8_clear;
633
1.34M
            } else {
634
1.34M
                idct_dc_add = h->h264dsp.h264_idct8_dc_add;
635
1.34M
                idct_add    = h->h264dsp.h264_idct8_add;
636
1.34M
            }
637
8.66M
            for (i = 0; i < 16; i += 4) {
638
6.93M
                uint8_t *const ptr = dest_y + block_offset[i];
639
6.93M
                const int dir      = sl->intra4x4_pred_mode_cache[scan8[i]];
640
6.93M
                if (transform_bypass && h->ps.sps->profile_idc == 244 && dir <= 1) {
641
284k
                    if (h->x264_build < 151U) {
642
94.2k
                        h->hpc.pred8x8l_add[dir](ptr, sl->mb + (i * 16 + p * 256 << pixel_shift), linesize);
643
94.2k
                    } else
644
190k
                        h->hpc.pred8x8l_filter_add[dir](ptr, sl->mb + (i * 16 + p * 256 << pixel_shift),
645
190k
                                                        (sl-> topleft_samples_available << i) & 0x8000,
646
190k
                                                        (sl->topright_samples_available << i) & 0x4000, linesize);
647
6.64M
                } else {
648
6.64M
                    const int nnz = sl->non_zero_count_cache[scan8[i + p * 16]];
649
6.64M
                    h->hpc.pred8x8l[dir](ptr, (sl->topleft_samples_available << i) & 0x8000,
650
6.64M
                                         (sl->topright_samples_available << i) & 0x4000, linesize);
651
6.64M
                    if (nnz) {
652
4.72M
                        if (nnz == 1 && dctcoef_get(sl->mb, pixel_shift, i * 16 + p * 256))
653
740k
                            idct_dc_add(ptr, sl->mb + (i * 16 + p * 256 << pixel_shift), linesize);
654
3.98M
                        else
655
3.98M
                            idct_add(ptr, sl->mb + (i * 16 + p * 256 << pixel_shift), linesize);
656
4.72M
                    }
657
6.64M
                }
658
6.93M
            }
659
1.73M
        } else {
660
546k
            if (transform_bypass) {
661
46.7k
                idct_dc_add  =
662
46.7k
                idct_add     = h->h264dsp.h264_add_pixels4_clear;
663
499k
            } else {
664
499k
                idct_dc_add = h->h264dsp.h264_idct_dc_add;
665
499k
                idct_add    = h->h264dsp.h264_idct_add;
666
499k
            }
667
9.28M
            for (i = 0; i < 16; i++) {
668
8.73M
                uint8_t *const ptr = dest_y + block_offset[i];
669
8.73M
                const int dir      = sl->intra4x4_pred_mode_cache[scan8[i]];
670
671
8.73M
                if (transform_bypass && h->ps.sps->profile_idc == 244 && dir <= 1) {
672
385k
                    h->hpc.pred4x4_add[dir](ptr, sl->mb + (i * 16 + p * 256 << pixel_shift), linesize);
673
8.35M
                } else {
674
8.35M
                    uint8_t *topright;
675
8.35M
                    int nnz, tr;
676
8.35M
                    uint64_t tr_high;
677
8.35M
                    if (dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED) {
678
628k
                        const int topright_avail = (sl->topright_samples_available << i) & 0x8000;
679
628k
                        av_assert2(sl->mb_y || linesize <= block_offset[i]);
680
628k
                        if (!topright_avail) {
681
197k
                            if (pixel_shift) {
682
81.0k
                                tr_high  = ((uint16_t *)ptr)[3 - linesize / 2] * 0x0001000100010001ULL;
683
81.0k
                                topright = (uint8_t *)&tr_high;
684
116k
                            } else {
685
116k
                                tr       = ptr[3 - linesize] * 0x01010101u;
686
116k
                                topright = (uint8_t *)&tr;
687
116k
                            }
688
197k
                        } else
689
430k
                            topright = ptr + (4 << pixel_shift) - linesize;
690
628k
                    } else
691
7.72M
                        topright = NULL;
692
693
8.35M
                    h->hpc.pred4x4[dir](ptr, topright, linesize);
694
8.35M
                    nnz = sl->non_zero_count_cache[scan8[i + p * 16]];
695
8.35M
                    if (nnz) {
696
3.36M
                        if (nnz == 1 && dctcoef_get(sl->mb, pixel_shift, i * 16 + p * 256))
697
853k
                            idct_dc_add(ptr, sl->mb + (i * 16 + p * 256 << pixel_shift), linesize);
698
2.51M
                        else
699
2.51M
                            idct_add(ptr, sl->mb + (i * 16 + p * 256 << pixel_shift), linesize);
700
3.36M
                    }
701
8.35M
                }
702
8.73M
            }
703
546k
        }
704
2.27M
    } else {
705
1.02M
        h->hpc.pred16x16[sl->intra16x16_pred_mode](dest_y, linesize);
706
1.02M
        if (sl->non_zero_count_cache[scan8[LUMA_DC_BLOCK_INDEX + p]]) {
707
557k
            if (!transform_bypass)
708
494k
                h->h264dsp.h264_luma_dc_dequant_idct(sl->mb + (p * 256 << pixel_shift),
709
494k
                                                     sl->mb_luma_dc[p],
710
494k
                                                     h->ps.pps->dequant4_coeff[p][qscale][0]);
711
63.2k
            else {
712
63.2k
                static const uint8_t dc_mapping[16] = {
713
63.2k
                     0 * 16,  1 * 16,  4 * 16,  5 * 16,
714
63.2k
                     2 * 16,  3 * 16,  6 * 16,  7 * 16,
715
63.2k
                     8 * 16,  9 * 16, 12 * 16, 13 * 16,
716
63.2k
                    10 * 16, 11 * 16, 14 * 16, 15 * 16
717
63.2k
                };
718
1.07M
                for (i = 0; i < 16; i++)
719
1.01M
                    dctcoef_set(sl->mb + (p * 256 << pixel_shift),
720
1.01M
                                pixel_shift, dc_mapping[i],
721
1.01M
                                dctcoef_get(sl->mb_luma_dc[p],
722
1.01M
                                            pixel_shift, i));
723
63.2k
            }
724
557k
        }
725
1.02M
    }
726
3.30M
}
727
728
/**
 * Add the residual of one plane of the current macroblock to dest_y.
 *
 * Intra4x4 macroblocks are skipped here; hl_decode_mb_predict_luma() adds
 * their residual right after predicting each block. Intra16x16 macroblocks
 * are always processed; any other type only when the low four bits of
 * sl->cbp are not all zero.
 *
 * @param h                decoder context (DSP/prediction function tables,
 *                         active SPS)
 * @param sl               slice context holding caches and coefficients
 * @param mb_type          type flags of the current macroblock
 * @param simple           not referenced by this function
 * @param transform_bypass nonzero if coefficients are added without a
 *                         transform
 * @param pixel_shift      scales coefficient offsets into sl->mb
 * @param block_offset     destination offsets, 16 entries per plane
 * @param linesize         stride of dest_y
 * @param dest_y           destination of the plane's macroblock
 * @param p                plane index
 */
static av_always_inline void hl_decode_mb_idct_luma(const H264Context *h, H264SliceContext *sl,
                                                    int mb_type, int simple,
                                                    int transform_bypass,
                                                    int pixel_shift,
                                                    const int *block_offset,
                                                    int linesize,
                                                    uint8_t *dest_y, int p)
{
    void (*add_residual)(uint8_t *dst, int16_t *block, int stride);
    int blk;

    if (IS_INTRA4x4(mb_type))
        return;

    block_offset += 16 * p;

    if (IS_INTRA16x16(mb_type)) {
        if (!transform_bypass) {
            h->h264dsp.h264_idct_add16intra(dest_y, block_offset,
                                            sl->mb + (p * 256 << pixel_shift),
                                            linesize,
                                            sl->non_zero_count_cache + p * 5 * 8);
            return;
        }

        /* Bypass with vertical/horizontal prediction uses the combined
         * predict-and-add routine when profile_idc is 244. */
        if (h->ps.sps->profile_idc == 244 &&
            (sl->intra16x16_pred_mode == VERT_PRED8x8 ||
             sl->intra16x16_pred_mode == HOR_PRED8x8)) {
            h->hpc.pred16x16_add[sl->intra16x16_pred_mode](dest_y, block_offset,
                                                           sl->mb + (p * 256 << pixel_shift),
                                                           linesize);
            return;
        }

        /* Otherwise add each 4x4 block that is flagged as coded or whose
         * first coefficient is nonzero. */
        for (blk = 0; blk < 16; blk++) {
            if (sl->non_zero_count_cache[scan8[blk + p * 16]] ||
                dctcoef_get(sl->mb, pixel_shift, blk * 16 + p * 256))
                h->h264dsp.h264_add_pixels4_clear(dest_y + block_offset[blk],
                                                  sl->mb + (blk * 16 + p * 256 << pixel_shift),
                                                  linesize);
        }
        return;
    }

    if (!(sl->cbp & 15))
        return;

    if (transform_bypass) {
        int step;

        /* 8x8 blocks advance the block index by 4, 4x4 blocks by 1. */
        if (IS_8x8DCT(mb_type)) {
            step         = 4;
            add_residual = h->h264dsp.h264_add_pixels8_clear;
        } else {
            step         = 1;
            add_residual = h->h264dsp.h264_add_pixels4_clear;
        }

        for (blk = 0; blk < 16; blk += step) {
            if (!sl->non_zero_count_cache[scan8[blk + p * 16]])
                continue;
            add_residual(dest_y + block_offset[blk],
                         sl->mb + (blk * 16 + p * 256 << pixel_shift),
                         linesize);
        }
        return;
    }

    if (IS_8x8DCT(mb_type)) {
        h->h264dsp.h264_idct8_add4(dest_y, block_offset,
                                   sl->mb + (p * 256 << pixel_shift),
                                   linesize,
                                   sl->non_zero_count_cache + p * 5 * 8);
    } else {
        h->h264dsp.h264_idct_add16(dest_y, block_offset,
                                   sl->mb + (p * 256 << pixel_shift),
                                   linesize,
                                   sl->non_zero_count_cache + p * 5 * 8);
    }
}
787
788
141M
/*
 * Include h264_mb_template.c three times, each time with a different
 * combination of the BITS and SIMPLE macros.
 * NOTE(review): the template is not visible from here; presumably it derives
 * the hl_decode_mb_* functions called by ff_h264_hl_decode_mb() from these
 * two macros -- confirm against h264_mb_template.c.
 */
#define BITS   8
789
235M
#define SIMPLE 1
790
/* First inclusion: BITS == 8, SIMPLE == 1. */
#include "h264_mb_template.c"
791
792
#undef  BITS
793
84.0M
#define BITS   16
794
/* Second inclusion: BITS == 16, SIMPLE still 1 from above. */
#include "h264_mb_template.c"
795
796
#undef  SIMPLE
797
554M
#define SIMPLE 0
798
/* Third inclusion: SIMPLE == 0, BITS left at 16. */
#include "h264_mb_template.c"
799
800
/**
 * Decode the current macroblock of a slice by dispatching to one of the
 * hl_decode_mb_* variants.
 *
 * The "complex" variant is chosen when CONFIG_SMALL is set, the slice is
 * flagged as complex, the macroblock is Intra PCM, or sl->qscale is 0.
 * For CHROMA444 streams a nonzero h->pixel_shift also selects the complex
 * 4:4:4 variant; otherwise h->pixel_shift picks between the 16-bit and
 * 8-bit simple variants.
 *
 * @param h  decoder context
 * @param sl slice context whose mb_xy identifies the macroblock to decode
 */
void ff_h264_hl_decode_mb(const H264Context *h, H264SliceContext *sl)
{
    const int mb_type   = h->cur_pic.mb_type[sl->mb_xy];
    const int need_full = CONFIG_SMALL || sl->is_complex ||
                          IS_INTRA_PCM(mb_type) || sl->qscale == 0;

    if (CHROMA444(h)) {
        if (need_full || h->pixel_shift)
            hl_decode_mb_444_complex(h, sl);
        else
            hl_decode_mb_444_simple_8(h, sl);
        return;
    }

    if (need_full) {
        hl_decode_mb_complex(h, sl);
        return;
    }

    if (h->pixel_shift)
        hl_decode_mb_simple_16(h, sl);
    else
        hl_decode_mb_simple_8(h, sl);
}