Coverage Report

Created: 2026-06-15 06:25

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/dav1d/src/recon_tmpl.c
Line
Count
Source
1
/*
2
 * Copyright © 2018-2021, VideoLAN and dav1d authors
3
 * Copyright © 2018, Two Orioles, LLC
4
 * All rights reserved.
5
 *
6
 * Redistribution and use in source and binary forms, with or without
7
 * modification, are permitted provided that the following conditions are met:
8
 *
9
 * 1. Redistributions of source code must retain the above copyright notice, this
10
 *    list of conditions and the following disclaimer.
11
 *
12
 * 2. Redistributions in binary form must reproduce the above copyright notice,
13
 *    this list of conditions and the following disclaimer in the documentation
14
 *    and/or other materials provided with the distribution.
15
 *
16
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
 */
27
28
#include "config.h"
29
30
#include <string.h>
31
#include <stdio.h>
32
33
#include "common/attributes.h"
34
#include "common/bitdepth.h"
35
#include "common/dump.h"
36
#include "common/frame.h"
37
#include "common/intops.h"
38
39
#include "src/cdef_apply.h"
40
#include "src/ctx.h"
41
#include "src/ipred_prepare.h"
42
#include "src/lf_apply.h"
43
#include "src/lr_apply.h"
44
#include "src/recon.h"
45
#include "src/scan.h"
46
#include "src/tables.h"
47
#include "src/wedge.h"
48
49
1.23M
/*
 * Read a variable-length unsigned value from the symbol decoder using
 * equiprobable bits.
 *
 * A run of zero bits (at most 32 are counted) gives the number of payload
 * bits; those payload bits are then shifted in below an initial value of 1.
 * The function returns that value minus one, so the shortest code (a single
 * one bit, no payload) yields 0.
 *
 * msac - symbol decoder state the bits are pulled from.
 */
static inline unsigned read_golomb(MsacContext *const msac) {
50
1.23M
    int len = 0;
51
1.23M
    unsigned val = 1;
52
53
2.22M
    /* Count the zero-bit prefix; stop at the first one bit or once len is 32. */
    while (!dav1d_msac_decode_bool_equi(msac) && len < 32) len++;
54
2.22M
    /* Append len payload bits, most significant first. */
    while (len--) val = (val << 1) + dav1d_msac_decode_bool_equi(msac);
55
56
1.23M
    return val - 1;
57
1.23M
}
58
59
/*
 * Compute the context index that decode_coefs() uses to select the CDF for
 * the "whole transform block is zero" flag.
 *
 * t_dim  - transform dimensions; lw/lh select how many context bytes are read
 * bs     - block size, used to index dav1d_block_dimensions
 * a, l   - the two neighbouring context byte arrays (presumably "above" and
 *          "left"; confirm against the callers). They are read through
 *          wider integer pointers, so several bytes are tested at once.
 * chroma - non-zero for chroma planes
 * layout - pixel layout, used to derive the chroma subsampling flags
 *
 * Chroma: returns 7 + 3 * not_one_blk + ca + cl, where ca/cl are 1 when the
 * examined context bytes are not all 0x40 (0x40 is the value decode_coefs()
 * stores for a fully skipped block) and not_one_blk is set when the
 * (subsampled) block dimension exceeds the transform dimension.
 * Luma: returns 0 when b_dim[2]/b_dim[3] equal the transform's lw/lh;
 * otherwise ORs the examined context bytes together per direction and looks
 * the result up in dav1d_skip_ctx using the low 6 bits clamped to 4.
 */
static inline unsigned get_skip_ctx(const TxfmInfo *const t_dim,
60
                                    const enum BlockSize bs,
61
                                    const uint8_t *const a,
62
                                    const uint8_t *const l,
63
                                    const int chroma,
64
                                    const enum Dav1dPixelLayout layout)
65
9.41M
{
66
9.41M
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
67
68
9.41M
    if (chroma) {
69
4.93M
        const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
70
4.93M
        const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
71
4.93M
        /* Non-zero block dimensions are reduced by one when that direction
         * is subsampled before being compared with the transform size. */
        const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw ||
72
2.81M
                                b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh;
73
4.93M
        unsigned ca, cl;
74
75
4.93M
#define MERGE_CTX(dir, type, no_val) \
76
9.86M
        c##dir = *(const type *) dir != no_val; \
77
9.86M
        break
78
79
4.93M
        switch (t_dim->lw) {
80
        /* For some reason the MSVC CRT _wassert() function is not flagged as
81
         * __declspec(noreturn), so when using those headers the compiler will
82
         * expect execution to continue after an assertion has been triggered
83
         * and will therefore complain about the use of uninitialized variables
84
         * when compiled in debug mode if we put the default case at the end. */
85
0
        default: assert(0); /* fall-through */
86
1.81M
        case TX_4X4:   MERGE_CTX(a, uint8_t,  0x40);
87
898k
        case TX_8X8:   MERGE_CTX(a, uint16_t, 0x4040);
88
703k
        case TX_16X16: MERGE_CTX(a, uint32_t, 0x40404040U);
89
1.51M
        case TX_32X32: MERGE_CTX(a, uint64_t, 0x4040404040404040ULL);
90
4.93M
        }
91
4.93M
        switch (t_dim->lh) {
92
0
        default: assert(0); /* fall-through */
93
2.02M
        case TX_4X4:   MERGE_CTX(l, uint8_t,  0x40);
94
888k
        case TX_8X8:   MERGE_CTX(l, uint16_t, 0x4040);
95
560k
        case TX_16X16: MERGE_CTX(l, uint32_t, 0x40404040U);
96
1.46M
        case TX_32X32: MERGE_CTX(l, uint64_t, 0x4040404040404040ULL);
97
4.93M
        }
98
4.93M
#undef MERGE_CTX
99
100
4.93M
        return 7 + not_one_blk * 3 + ca + cl;
101
4.93M
    } else if (b_dim[2] == t_dim->lw && b_dim[3] == t_dim->lh) {
102
1.46M
        return 0;
103
3.01M
    } else {
104
3.01M
        unsigned la, ll;
105
106
3.01M
#define MERGE_CTX(dir, type, tx) \
107
6.06M
        if (tx == TX_64X64) { \
108
497k
            uint64_t tmp = *(const uint64_t *) dir; \
109
497k
            tmp |= *(const uint64_t *) &dir[8]; \
110
497k
            l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \
111
497k
        } else \
112
6.06M
            l##dir = *(const type *) dir; \
113
6.06M
        if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \
114
6.06M
        if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
115
6.06M
        if (tx >= TX_8X8)   l##dir |= l##dir >> 8; \
116
6.06M
        break
117
118
3.01M
        /* Each case ORs all examined context bytes of one direction down
         * into the low byte of la / ll. */
        switch (t_dim->lw) {
119
0
        default: assert(0); /* fall-through */
120
1.87M
        case TX_4X4:   MERGE_CTX(a, uint8_t,  TX_4X4);
121
467k
        case TX_8X8:   MERGE_CTX(a, uint16_t, TX_8X8);
122
406k
        case TX_16X16: MERGE_CTX(a, uint32_t, TX_16X16);
123
34.9k
        case TX_32X32: MERGE_CTX(a, uint32_t, TX_32X32);
124
248k
        case TX_64X64: MERGE_CTX(a, uint32_t, TX_64X64);
125
3.01M
        }
126
3.03M
        switch (t_dim->lh) {
127
0
        default: assert(0); /* fall-through */
128
1.88M
        case TX_4X4:   MERGE_CTX(l, uint8_t,  TX_4X4);
129
455k
        case TX_8X8:   MERGE_CTX(l, uint16_t, TX_8X8);
130
405k
        case TX_16X16: MERGE_CTX(l, uint32_t, TX_16X16);
131
35.5k
        case TX_32X32: MERGE_CTX(l, uint32_t, TX_32X32);
132
248k
        case TX_64X64: MERGE_CTX(l, uint32_t, TX_64X64);
133
3.03M
        }
134
3.03M
#undef MERGE_CTX
135
136
3.03M
        return dav1d_skip_ctx[umin(la & 0x3F, 4)][umin(ll & 0x3F, 4)];
137
3.03M
    }
138
9.41M
}
139
140
/*
 * Compute the context index used by decode_coefs() to select the CDF for the
 * DC sign bit.
 *
 * tx   - transform size (square TX_* or rectangular RTX_* value); it decides
 *        how many bytes of each context array are examined
 * a, l - the two neighbouring context byte arrays (presumably "above" and
 *        "left"; confirm against the callers)
 *
 * Only bits 6-7 of each context byte are used (mask 0xC0 per byte). Every
 * case sums those 2-bit fields over the examined bytes of a and l, then
 * subtracts the number of bytes examined, giving a signed balance s.
 * decode_coefs() stores 2 in that field when dc_sign is 0, 0 when dc_sign is
 * 1 and 1 when the DC token is zero, so each byte contributes +1, -1 or 0.
 *
 * Returns 0 when s == 0, 1 when s < 0 and 2 when s > 0.
 */
static inline unsigned get_dc_sign_ctx(const int /*enum RectTxfmSize*/ tx,
141
                                       const uint8_t *const a,
142
                                       const uint8_t *const l)
143
4.25M
{
144
4.25M
    uint64_t mask = 0xC0C0C0C0C0C0C0C0ULL, mul = 0x0101010101010101ULL;
145
4.25M
    int s;
146
147
4.25M
#if ARCH_X86_64 && defined(__GNUC__)
148
    /* Coerce compilers into producing better code. For some reason
149
     * every x86-64 compiler is awful at handling 64-bit constants. */
150
4.25M
    __asm__("" : "+r"(mask), "+r"(mul));
151
4.25M
#endif
152
153
4.25M
    /* In the multi-byte cases the multiplication accumulates the per-byte
     * fields into the most significant byte, which the final shift extracts. */
    switch(tx) {
154
0
    default: assert(0); /* fall-through */
155
1.90M
    case TX_4X4: {
156
1.90M
        int t = *(const uint8_t *) a >> 6;
157
1.90M
        t    += *(const uint8_t *) l >> 6;
158
1.90M
        s = t - 1 - 1;
159
1.90M
        break;
160
0
    }
161
365k
    case TX_8X8: {
162
365k
        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
163
365k
        t         += *(const uint16_t *) l & (uint32_t) mask;
164
365k
        t *= 0x04040404U;
165
365k
        s = (int) (t >> 24) - 2 - 2;
166
365k
        break;
167
0
    }
168
295k
    case TX_16X16: {
169
295k
        uint32_t t = (*(const uint32_t *) a & (uint32_t) mask) >> 6;
170
295k
        t         += (*(const uint32_t *) l & (uint32_t) mask) >> 6;
171
295k
        t *= (uint32_t) mul;
172
295k
        s = (int) (t >> 24) - 4 - 4;
173
295k
        break;
174
0
    }
175
450k
    case TX_32X32: {
176
450k
        uint64_t t = (*(const uint64_t *) a & mask) >> 6;
177
450k
        t         += (*(const uint64_t *) l & mask) >> 6;
178
450k
        t *= mul;
179
450k
        s = (int) (t >> 56) - 8 - 8;
180
450k
        break;
181
0
    }
182
199k
    case TX_64X64: {
183
199k
        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
184
199k
        t         += (*(const uint64_t *) &a[8] & mask) >> 6;
185
199k
        t         += (*(const uint64_t *) &l[0] & mask) >> 6;
186
199k
        t         += (*(const uint64_t *) &l[8] & mask) >> 6;
187
199k
        t *= mul;
188
199k
        s = (int) (t >> 56) - 16 - 16;
189
199k
        break;
190
0
    }
191
112k
    case RTX_4X8: {
192
112k
        uint32_t t = *(const uint8_t  *) a & (uint32_t) mask;
193
112k
        t         += *(const uint16_t *) l & (uint32_t) mask;
194
112k
        t *= 0x04040404U;
195
112k
        s = (int) (t >> 24) - 1 - 2;
196
112k
        break;
197
0
    }
198
169k
    case RTX_8X4: {
199
169k
        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
200
169k
        t         += *(const uint8_t  *) l & (uint32_t) mask;
201
169k
        t *= 0x04040404U;
202
169k
        s = (int) (t >> 24) - 2 - 1;
203
169k
        break;
204
0
    }
205
101k
    case RTX_8X16: {
206
101k
        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
207
101k
        t         += *(const uint32_t *) l & (uint32_t) mask;
208
101k
        t = (t >> 6) * (uint32_t) mul;
209
101k
        s = (int) (t >> 24) - 2 - 4;
210
101k
        break;
211
0
    }
212
182k
    case RTX_16X8: {
213
182k
        uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
214
182k
        t         += *(const uint16_t *) l & (uint32_t) mask;
215
182k
        t = (t >> 6) * (uint32_t) mul;
216
182k
        s = (int) (t >> 24) - 4 - 2;
217
182k
        break;
218
0
    }
219
62.2k
    case RTX_16X32: {
220
62.2k
        uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
221
62.2k
        t         += *(const uint64_t *) l & mask;
222
62.2k
        t = (t >> 6) * mul;
223
62.2k
        s = (int) (t >> 56) - 4 - 8;
224
62.2k
        break;
225
0
    }
226
95.5k
    case RTX_32X16: {
227
95.5k
        uint64_t t = *(const uint64_t *) a & mask;
228
95.5k
        t         += *(const uint32_t *) l & (uint32_t) mask;
229
95.5k
        t = (t >> 6) * mul;
230
95.5k
        s = (int) (t >> 56) - 8 - 4;
231
95.5k
        break;
232
0
    }
233
38.9k
    case RTX_32X64: {
234
38.9k
        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
235
38.9k
        t         += (*(const uint64_t *) &l[0] & mask) >> 6;
236
38.9k
        t         += (*(const uint64_t *) &l[8] & mask) >> 6;
237
38.9k
        t *= mul;
238
38.9k
        s = (int) (t >> 56) - 8 - 16;
239
38.9k
        break;
240
0
    }
241
35.1k
    case RTX_64X32: {
242
35.1k
        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
243
35.1k
        t         += (*(const uint64_t *) &a[8] & mask) >> 6;
244
35.1k
        t         += (*(const uint64_t *) &l[0] & mask) >> 6;
245
35.1k
        t *= mul;
246
35.1k
        s = (int) (t >> 56) - 16 - 8;
247
35.1k
        break;
248
0
    }
249
50.3k
    case RTX_4X16: {
250
50.3k
        uint32_t t = *(const uint8_t  *) a & (uint32_t) mask;
251
50.3k
        t         += *(const uint32_t *) l & (uint32_t) mask;
252
50.3k
        t = (t >> 6) * (uint32_t) mul;
253
50.3k
        s = (int) (t >> 24) - 1 - 4;
254
50.3k
        break;
255
0
    }
256
98.5k
    case RTX_16X4: {
257
98.5k
        uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
258
98.5k
        t         += *(const uint8_t  *) l & (uint32_t) mask;
259
98.5k
        t = (t >> 6) * (uint32_t) mul;
260
98.5k
        s = (int) (t >> 24) - 4 - 1;
261
98.5k
        break;
262
0
    }
263
30.5k
    case RTX_8X32: {
264
30.5k
        uint64_t t = *(const uint16_t *) a & (uint32_t) mask;
265
30.5k
        t         += *(const uint64_t *) l & mask;
266
30.5k
        t = (t >> 6) * mul;
267
30.5k
        s = (int) (t >> 56) - 2 - 8;
268
30.5k
        break;
269
0
    }
270
44.0k
    case RTX_32X8: {
271
44.0k
        uint64_t t = *(const uint64_t *) a & mask;
272
44.0k
        t         += *(const uint16_t *) l & (uint32_t) mask;
273
44.0k
        t = (t >> 6) * mul;
274
44.0k
        s = (int) (t >> 56) - 8 - 2;
275
44.0k
        break;
276
0
    }
277
9.08k
    case RTX_16X64: {
278
9.08k
        uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
279
9.08k
        t         += *(const uint64_t *) &l[0] & mask;
280
9.08k
        t = (t >> 6) + ((*(const uint64_t *) &l[8] & mask) >> 6);
281
9.08k
        t *= mul;
282
9.08k
        s = (int) (t >> 56) - 4 - 16;
283
9.08k
        break;
284
0
    }
285
8.70k
    case RTX_64X16: {
286
8.70k
        uint64_t t = *(const uint64_t *) &a[0] & mask;
287
8.70k
        t         += *(const uint32_t *) l & (uint32_t) mask;
288
8.70k
        t = (t >> 6) + ((*(const uint64_t *) &a[8] & mask) >> 6);
289
8.70k
        t *= mul;
290
8.70k
        s = (int) (t >> 56) - 16 - 4;
291
8.70k
        break;
292
0
    }
293
4.25M
    }
294
295
4.25M
    /* Map the signed balance to 0 (zero), 1 (negative) or 2 (positive). */
    return (s != 0) + (s > 0);
296
4.25M
}
297
298
/*
 * Compute the context index for a base-level token from the already decoded
 * neighbouring entries of the levels scratch array.
 *
 * levels      - pointer to the current position in the levels array; each
 *               byte holds tok in bits 0-5 and lo_tok in bits 6-7 (see the
 *               declaration of `levels` in decode_coefs())
 * tx_class    - transform class; TX_CLASS_2D uses a 2-D neighbourhood, any
 *               other class uses four consecutive entries in the same row
 * hi_mag      - out: sum of the three nearest neighbours, which the caller
 *               masks with 63 to derive the high-token context
 * ctx_offsets - 2-D offset table, only read for TX_CLASS_2D
 * x, y        - coefficient position as computed by the caller
 * stride      - distance in bytes between rows of the levels array
 *
 * Returns offset + min((mag + 64) >> 7, 4), where mag is the sum over all
 * five neighbours.
 */
static inline unsigned get_lo_ctx(const uint8_t *const levels,
299
                                  const enum TxClass tx_class,
300
                                  unsigned *const hi_mag,
301
                                  const uint8_t (*const ctx_offsets)[5],
302
                                  const unsigned x, const unsigned y,
303
                                  const ptrdiff_t stride)
304
79.7M
{
305
79.7M
    unsigned mag = levels[0 * stride + 1] + levels[1 * stride + 0];
306
79.7M
    unsigned offset;
307
79.7M
    if (tx_class == TX_CLASS_2D) {
308
75.6M
        mag += levels[1 * stride + 1];
309
75.6M
        *hi_mag = mag;
310
75.6M
        mag += levels[0 * stride + 2] + levels[2 * stride + 0];
311
75.6M
        offset = ctx_offsets[umin(y, 4)][umin(x, 4)];
312
75.6M
    } else {
313
4.03M
        mag += levels[0 * stride + 2];
314
4.03M
        *hi_mag = mag;
315
4.03M
        mag += levels[0 * stride + 3] + levels[0 * stride + 4];
316
4.03M
        /* 26 for y == 0, 31 for y == 1, 36 for anything larger */
        offset = 26 + (y > 1 ? 10 : y * 5);
317
4.03M
    }
318
79.7M
    return offset + (mag > 512 ? 4 : (mag + 64) >> 7);
319
79.7M
}
320
321
/*
 * Decode the coefficients of one transform block from the tile's symbol
 * decoder and dequantise them into cf[].
 *
 * Stages, in order: all-zero ("skip") flag, transform type, end-of-block
 * position, base/high tokens for every position below eob, then DC sign,
 * Golomb-coded residuals, AC signs and dequantisation.
 *
 * t       - task context; supplies the tile state (t->ts), the frame context
 *           (t->f) and the levels scratch buffer
 * a, l    - neighbouring context byte arrays, passed to get_skip_ctx() and
 *           get_dc_sign_ctx()
 * tx      - transform size of this block
 * bs      - block size (only forwarded to get_skip_ctx())
 * b       - block info; seg_id, uv_mode, y_mode and y_angle are read
 * intra   - non-zero for intra blocks
 * plane   - plane index; any non-zero value is treated as chroma
 * cf      - out: dequantised coefficients. While tokens are parsed, each
 *           non-zero entry temporarily holds (tok << 11) | next_rc, forming
 *           a chain of non-zero positions that the dequant loops follow
 * txtp    - in/out: transform type. It is read for inter chroma blocks (via
 *           get_uv_inter_txtp()) and written in every path
 * res_ctx - out: context byte for this block; 0x40 when everything is
 *           skipped, otherwise min(cul_level, 63) | dc_sign_level
 *
 * Returns -1 when the block is entirely skipped, otherwise the eob value.
 */
static int decode_coefs(Dav1dTaskContext *const t,
322
                        uint8_t *const a, uint8_t *const l,
323
                        const enum RectTxfmSize tx, const enum BlockSize bs,
324
                        const Av1Block *const b, const int intra,
325
                        const int plane, coef *cf,
326
                        enum TxfmType *const txtp, uint8_t *res_ctx)
327
9.41M
{
328
9.41M
    Dav1dTileState *const ts = t->ts;
329
9.41M
    const int chroma = !!plane;
330
9.41M
    const Dav1dFrameContext *const f = t->f;
331
9.41M
    const int lossless = f->frame_hdr->segmentation.lossless[b->seg_id];
332
9.41M
    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
333
9.41M
    /* The trailing "&& 0" forces dbg to 0, so the printf tracing below is
     * disabled unless this line is edited. */
    const int dbg = DEBUG_BLOCK_INFO && plane && 0;
334
335
9.41M
    if (dbg)
336
0
        printf("Start: r=%d\n", ts->msac.rng);
337
338
    // does this block have any non-zero coefficients
339
9.41M
    const int sctx = get_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.layout);
340
9.41M
    const int all_skip = dav1d_msac_decode_bool_adapt(&ts->msac,
341
9.41M
                             ts->cdf.coef.skip[t_dim->ctx][sctx]);
342
9.41M
    if (dbg)
343
0
        printf("Post-non-zero[%d][%d][%d]: r=%d\n",
344
0
               t_dim->ctx, sctx, all_skip, ts->msac.rng);
345
9.41M
    if (all_skip) {
346
4.46M
        *res_ctx = 0x40;
347
4.46M
        *txtp = lossless * WHT_WHT; /* lossless ? WHT_WHT : DCT_DCT */
348
4.46M
        return -1;
349
4.46M
    }
350
351
    // transform type (chroma: derived, luma: explicitly coded)
352
4.95M
    if (lossless) {
353
1.78M
        assert(t_dim->max == TX_4X4);
354
1.78M
        *txtp = WHT_WHT;
355
3.16M
    } else if (t_dim->max + intra >= TX_64X64) {
356
987k
        *txtp = DCT_DCT;
357
2.18M
    } else if (chroma) {
358
        // inferred from either the luma txtp (inter) or a LUT (intra)
359
586k
        *txtp = intra ? dav1d_txtp_from_uvmode[b->uv_mode] :
360
586k
                        get_uv_inter_txtp(t_dim, *txtp);
361
1.59M
    } else if (!f->frame_hdr->segmentation.qidx[b->seg_id]) {
362
        // In libaom, lossless is checked by a literal qidx == 0, but not all
363
        // such blocks are actually lossless. The remainder gets an implicit
364
        // transform type (for luma)
365
18.6k
        *txtp = DCT_DCT;
366
1.57M
    } else {
367
1.57M
        unsigned idx;
368
1.57M
        if (intra) {
369
1.22M
            const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ?
370
1.04M
                dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode;
371
1.22M
            if (f->frame_hdr->reduced_txtp_set || t_dim->min == TX_16X16) {
372
534k
                idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
373
534k
                          ts->cdf.m.txtp_intra2[t_dim->min][y_mode_nofilt], 4);
374
534k
                *txtp = dav1d_tx_types_per_set[idx + 0];
375
686k
            } else {
376
686k
                idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
377
686k
                          ts->cdf.m.txtp_intra1[t_dim->min][y_mode_nofilt], 6);
378
686k
                *txtp = dav1d_tx_types_per_set[idx + 5];
379
686k
            }
380
1.22M
            if (dbg)
381
0
                printf("Post-txtp-intra[%d->%d][%d][%d->%d]: r=%d\n",
382
0
                       tx, t_dim->min, y_mode_nofilt, idx, *txtp, ts->msac.rng);
383
1.22M
        } else {
384
355k
            if (f->frame_hdr->reduced_txtp_set || t_dim->max == TX_32X32) {
385
114k
                idx = dav1d_msac_decode_bool_adapt(&ts->msac,
386
114k
                          ts->cdf.m.txtp_inter3[t_dim->min]);
387
114k
                *txtp = (idx - 1) & IDTX; /* idx ? DCT_DCT : IDTX */
388
240k
            } else if (t_dim->min == TX_16X16) {
389
25.5k
                idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
390
25.5k
                          ts->cdf.m.txtp_inter2, 11);
391
25.5k
                *txtp = dav1d_tx_types_per_set[idx + 12];
392
214k
            } else {
393
214k
                idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
394
214k
                          ts->cdf.m.txtp_inter1[t_dim->min], 15);
395
214k
                *txtp = dav1d_tx_types_per_set[idx + 24];
396
214k
            }
397
355k
            if (dbg)
398
0
                printf("Post-txtp-inter[%d->%d][%d->%d]: r=%d\n",
399
0
                       tx, t_dim->min, idx, *txtp, ts->msac.rng);
400
355k
        }
401
1.57M
    }
402
403
    // find end-of-block (eob)
404
4.95M
    int eob;
405
4.95M
    const int slw = imin(t_dim->lw, TX_32X32), slh = imin(t_dim->lh, TX_32X32);
406
4.95M
    const int tx2dszctx = slw + slh;
407
4.95M
    const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
408
4.95M
    const int is_1d = tx_class != TX_CLASS_2D;
409
4.95M
    switch (tx2dszctx) {
410
0
#define case_sz(sz, bin, ns, is_1d) \
411
4.95M
    case sz: { \
412
4.95M
        uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma]is_1d; \
413
4.95M
        eob = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \
414
4.95M
        break; \
415
4.95M
    }
416
2.13M
    case_sz(0,   16,  8, [is_1d]);
417
366k
    case_sz(1,   32,  8, [is_1d]);
418
657k
    case_sz(2,   64,  8, [is_1d]);
419
366k
    case_sz(3,  128,  8, [is_1d]);
420
460k
    case_sz(4,  256, 16, [is_1d]);
421
201k
    case_sz(5,  512, 16,        );
422
763k
    case_sz(6, 1024, 16,        );
423
4.95M
#undef case_sz
424
4.95M
    }
425
4.94M
    if (dbg)
426
0
        printf("Post-eob_bin_%d[%d][%d][%d]: r=%d\n",
427
0
               16 << tx2dszctx, chroma, is_1d, eob, ts->msac.rng);
428
4.94M
    /* Symbols above 1 only select a bin: one adaptive high bit plus eob_bin
     * raw bits are read to build the actual eob position. */
    if (eob > 1) {
429
2.84M
        const int eob_bin = eob - 2;
430
2.84M
        uint16_t *const eob_hi_bit_cdf =
431
2.84M
            ts->cdf.coef.eob_hi_bit[t_dim->ctx][chroma][eob_bin];
432
2.84M
        const int eob_hi_bit = dav1d_msac_decode_bool_adapt(&ts->msac, eob_hi_bit_cdf);
433
2.84M
        if (dbg)
434
0
            printf("Post-eob_hi_bit[%d][%d][%d][%d]: r=%d\n",
435
0
                   t_dim->ctx, chroma, eob_bin, eob_hi_bit, ts->msac.rng);
436
2.84M
        eob = ((eob_hi_bit | 2) << eob_bin) | dav1d_msac_decode_bools(&ts->msac, eob_bin);
437
2.84M
        if (dbg)
438
0
            printf("Post-eob[%d]: r=%d\n", eob, ts->msac.rng);
439
2.84M
    }
440
4.94M
    assert(eob >= 0);
441
442
    // base tokens
443
4.94M
    uint16_t (*const eob_cdf)[4] = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma];
444
4.94M
    uint16_t (*const hi_cdf)[4] = ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
445
4.94M
    unsigned rc, dc_tok;
446
447
4.94M
    if (eob) {
448
3.01M
        uint16_t (*const lo_cdf)[4] = ts->cdf.coef.base_tok[t_dim->ctx][chroma];
449
3.01M
        uint8_t *const levels = t->scratch.levels; // bits 0-5: tok, 6-7: lo_tok
450
451
        /* eob */
452
3.01M
        unsigned ctx = 1 + (eob > 2 << tx2dszctx) + (eob > 4 << tx2dszctx);
453
3.01M
        int eob_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[ctx], 2);
454
3.01M
        int tok = eob_tok + 1;
455
3.01M
        int level_tok = tok * 0x41;
456
3.01M
        unsigned mag;
457
458
3.01M
#define DECODE_COEFS_CLASS(tx_class) \
459
3.01M
        unsigned x, y; \
460
3.01M
        uint8_t *level; \
461
3.01M
        if (tx_class == TX_CLASS_2D) \
462
3.01M
            rc = scan[eob], x = rc >> shift, y = rc & mask; \
463
3.01M
        else if (tx_class == TX_CLASS_H) \
464
            /* Transposing reduces the stride and padding requirements */ \
465
198k
            x = eob & mask, y = eob >> shift, rc = eob; \
466
198k
        else /* tx_class == TX_CLASS_V */ \
467
198k
            x = eob & mask, y = eob >> shift, rc = (x << shift2) | y; \
468
3.01M
        if (dbg) \
469
3.01M
            printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
470
0
                   t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng); \
471
3.01M
        if (eob_tok == 2) { \
472
72.7k
            ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : y != 0) ? 14 : 7; \
473
72.7k
            tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
474
72.7k
            level_tok = tok + (3 << 6); \
475
72.7k
            if (dbg) \
476
72.7k
                printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
477
0
                       imin(t_dim->ctx, 3), chroma, ctx, eob, rc, tok, \
478
0
                       ts->msac.rng); \
479
72.7k
        } \
480
3.01M
        cf[rc] = tok << 11; \
481
3.01M
        if (tx_class == TX_CLASS_2D) \
482
3.01M
            level = levels + rc; \
483
3.01M
        else \
484
3.01M
            level = levels + x * stride + y; \
485
3.01M
        *level = (uint8_t) level_tok; \
486
82.5M
        for (int i = eob - 1; i > 0; i--) { /* ac */ \
487
79.5M
            unsigned rc_i; \
488
79.5M
            if (tx_class == TX_CLASS_2D) \
489
79.5M
                rc_i = scan[i], x = rc_i >> shift, y = rc_i & mask; \
490
79.5M
            else if (tx_class == TX_CLASS_H) \
491
3.96M
                x = i & mask, y = i >> shift, rc_i = i; \
492
3.96M
            else /* tx_class == TX_CLASS_V */ \
493
3.96M
                x = i & mask, y = i >> shift, rc_i = (x << shift2) | y; \
494
79.5M
            assert(x < 32 && y < 32); \
495
79.5M
            if (tx_class == TX_CLASS_2D) \
496
79.5M
                level = levels + rc_i; \
497
79.5M
            else \
498
79.5M
                level = levels + x * stride + y; \
499
79.5M
            ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \
500
79.5M
            if (tx_class == TX_CLASS_2D) \
501
79.5M
                y |= x; \
502
79.5M
            tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
503
79.5M
            if (dbg) \
504
79.5M
                printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
505
0
                       t_dim->ctx, chroma, ctx, i, rc_i, tok, ts->msac.rng); \
506
79.5M
            if (tok == 3) { \
507
6.72M
                mag &= 63; \
508
6.72M
                ctx = (y > (tx_class == TX_CLASS_2D) ? 14 : 7) + \
509
6.72M
                      (mag > 12 ? 6 : (mag + 1) >> 1); \
510
6.72M
                tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
511
6.72M
                if (dbg) \
512
6.72M
                    printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
513
0
                           imin(t_dim->ctx, 3), chroma, ctx, i, rc_i, tok, \
514
0
                           ts->msac.rng); \
515
6.72M
                *level = (uint8_t) (tok + (3 << 6)); \
516
6.72M
                cf[rc_i] = (tok << 11) | rc; \
517
6.72M
                rc = rc_i; \
518
72.8M
            } else { \
519
                /* 0x1 for tok, 0x7ff as bitmask for rc, 0x41 for level_tok */ \
520
72.8M
                tok *= 0x17ff41; \
521
72.8M
                *level = (uint8_t) tok; \
522
                /* tok ? (tok << 11) | rc : 0 */ \
523
72.8M
                tok = (tok >> 9) & (rc + ~0x7ffu); \
524
72.8M
                if (tok) rc = rc_i; \
525
72.8M
                cf[rc_i] = tok; \
526
72.8M
            } \
527
79.5M
        } \
528
        /* dc */ \
529
3.01M
        ctx = (tx_class == TX_CLASS_2D) ? 0 : \
530
3.01M
            get_lo_ctx(levels, tx_class, &mag, lo_ctx_offsets, 0, 0, stride); \
531
3.01M
        dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
532
3.01M
        if (dbg) \
533
3.01M
            printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n", \
534
0
                   t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng); \
535
3.01M
        if (dc_tok == 3) { \
536
1.16M
            if (tx_class == TX_CLASS_2D) \
537
1.16M
                mag = levels[0 * stride + 1] + levels[1 * stride + 0] + \
538
1.14M
                      levels[1 * stride + 1]; \
539
1.16M
            mag &= 63; \
540
1.16M
            ctx = mag > 12 ? 6 : (mag + 1) >> 1; \
541
1.16M
            dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
542
1.16M
            if (dbg) \
543
1.16M
                printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n", \
544
0
                       imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng); \
545
1.16M
        } \
546
3.01M
        break
547
548
3.01M
        const uint16_t *scan;
549
3.01M
        /* Each case sets up the per-class constants and clears the levels
         * scratch area before expanding the shared decoding macro. */
        switch (tx_class) {
550
2.81M
        case TX_CLASS_2D: {
551
2.81M
            const unsigned nonsquare_tx = tx >= RTX_4X8;
552
2.81M
            const uint8_t (*const lo_ctx_offsets)[5] =
553
2.81M
                dav1d_lo_ctx_offsets[nonsquare_tx + (tx & nonsquare_tx)];
554
2.81M
            scan = dav1d_scans[tx];
555
2.81M
            const ptrdiff_t stride = 4 << slh;
556
2.81M
            const unsigned shift = slh + 2, shift2 = 0;
557
2.81M
            const unsigned mask = (4 << slh) - 1;
558
2.81M
            memset(levels, 0, stride * ((4 << slw) + 2));
559
2.81M
            DECODE_COEFS_CLASS(TX_CLASS_2D);
560
2.81M
        }
561
129k
        case TX_CLASS_H: {
562
129k
            const uint8_t (*const lo_ctx_offsets)[5] = NULL;
563
129k
            const ptrdiff_t stride = 16;
564
129k
            const unsigned shift = slh + 2, shift2 = 0;
565
129k
            const unsigned mask = (4 << slh) - 1;
566
129k
            memset(levels, 0, stride * ((4 << slh) + 2));
567
129k
            DECODE_COEFS_CLASS(TX_CLASS_H);
568
129k
        }
569
68.6k
        case TX_CLASS_V: {
570
68.6k
            const uint8_t (*const lo_ctx_offsets)[5] = NULL;
571
68.6k
            const ptrdiff_t stride = 16;
572
68.6k
            const unsigned shift = slw + 2, shift2 = slh + 2;
573
68.6k
            const unsigned mask = (4 << slw) - 1;
574
68.6k
            memset(levels, 0, stride * ((4 << slw) + 2));
575
68.6k
            DECODE_COEFS_CLASS(TX_CLASS_V);
576
68.6k
        }
577
0
#undef DECODE_COEFS_CLASS
578
0
        default: assert(0);
579
3.01M
        }
580
3.01M
    } else { // dc-only
581
1.93M
        int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[0], 2);
582
1.93M
        dc_tok = 1 + tok_br;
583
1.93M
        if (dbg)
584
0
            printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n",
585
0
                   t_dim->ctx, chroma, 0, dc_tok, ts->msac.rng);
586
1.93M
        if (tok_br == 2) {
587
52.7k
            dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[0]);
588
52.7k
            if (dbg)
589
0
                printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n",
590
0
                       imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng);
591
52.7k
        }
592
1.93M
        rc = 0;
593
1.93M
    }
594
595
    // residual and sign
596
4.94M
    const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
597
4.94M
    const uint8_t *const qm_tbl = *txtp < IDTX ? f->qm[tx][plane] : NULL;
598
4.94M
    const int dq_shift = imax(0, t_dim->ctx - 2);
599
4.94M
    const int cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc));
600
4.94M
    unsigned cul_level, dc_sign_level;
601
602
4.94M
    /* A zero DC token needs no sign or dequantisation: record the neutral
     * sign class and jump to the AC loop label inside the matching branch. */
    if (!dc_tok) {
603
703k
        cul_level = 0;
604
703k
        dc_sign_level = 1 << 6;
605
703k
        if (qm_tbl) goto ac_qm;
606
497k
        goto ac_noqm;
607
703k
    }
608
609
4.24M
    const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l);
610
4.24M
    uint16_t *const dc_sign_cdf = ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
611
4.24M
    const int dc_sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
612
4.24M
    if (dbg)
613
0
        printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
614
0
               chroma, dc_sign_ctx, dc_sign, ts->msac.rng);
615
616
4.24M
    int dc_dq = dq_tbl[0];
617
4.24M
    /* 0x80 when dc_sign is 0, 0x00 when dc_sign is 1 */
    dc_sign_level = (dc_sign - 1) & (2 << 6);
618
619
4.24M
    if (qm_tbl) {
620
1.23M
        dc_dq = (dc_dq * qm_tbl[0] + 16) >> 5;
621
622
1.23M
        if (dc_tok == 15) {
623
40.0k
            dc_tok = read_golomb(&ts->msac) + 15;
624
40.0k
            if (dbg)
625
0
                printf("Post-dc_residual[%d->%d]: r=%d\n",
626
0
                       dc_tok - 15, dc_tok, ts->msac.rng);
627
628
40.0k
            dc_tok &= 0xfffff;
629
40.0k
            dc_dq = (dc_dq * dc_tok) & 0xffffff;
630
1.19M
        } else {
631
1.19M
            dc_dq *= dc_tok;
632
1.19M
            assert(dc_dq <= 0xffffff);
633
1.19M
        }
634
1.23M
        cul_level = dc_tok;
635
1.23M
        dc_dq >>= dq_shift;
636
1.23M
        dc_dq = umin(dc_dq, cf_max + dc_sign);
637
1.23M
        cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);
638
639
1.66M
        if (rc) ac_qm: {
640
1.66M
            const unsigned ac_dq = dq_tbl[1];
641
13.4M
            do {
642
13.4M
                const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
643
13.4M
                if (dbg)
644
0
                    printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
645
13.4M
                const unsigned rc_tok = cf[rc];
646
13.4M
                unsigned tok, dq = (ac_dq * qm_tbl[rc] + 16) >> 5;
647
13.4M
                int dq_sat;
648
649
13.4M
                if (rc_tok >= (15 << 11)) {
650
690k
                    tok = read_golomb(&ts->msac) + 15;
651
690k
                    if (dbg)
652
0
                        printf("Post-residual[%d=%d->%d]: r=%d\n",
653
0
                               rc, tok - 15, tok, ts->msac.rng);
654
655
690k
                    tok &= 0xfffff;
656
690k
                    dq = (dq * tok) & 0xffffff;
657
12.8M
                } else {
658
12.8M
                    tok = rc_tok >> 11;
659
12.8M
                    dq *= tok;
660
12.8M
                    assert(dq <= 0xffffff);
661
12.8M
                }
662
13.4M
                cul_level += tok;
663
13.4M
                dq >>= dq_shift;
664
13.4M
                dq_sat = umin(dq, cf_max + sign);
665
13.4M
                cf[rc] = (coef) (sign ? -dq_sat : dq_sat);
666
667
13.4M
                rc = rc_tok & 0x3ff;
668
13.4M
            } while (rc);
669
1.66M
        }
670
3.00M
    } else {
671
        // non-qmatrix is the common case and allows for additional optimizations
672
3.00M
        if (dc_tok == 15) {
673
103k
            dc_tok = read_golomb(&ts->msac) + 15;
674
103k
            if (dbg)
675
0
                printf("Post-dc_residual[%d->%d]: r=%d\n",
676
0
                       dc_tok - 15, dc_tok, ts->msac.rng);
677
678
103k
            dc_tok &= 0xfffff;
679
103k
            dc_dq = ((dc_dq * dc_tok) & 0xffffff) >> dq_shift;
680
103k
            dc_dq = umin(dc_dq, cf_max + dc_sign);
681
2.90M
        } else {
682
2.90M
            dc_dq = ((dc_dq * dc_tok) >> dq_shift);
683
2.90M
            assert(dc_dq <= cf_max);
684
2.90M
        }
685
3.00M
        cul_level = dc_tok;
686
3.00M
        cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);
687
688
3.65M
        if (rc) ac_noqm: {
689
3.65M
            const unsigned ac_dq = dq_tbl[1];
690
18.5M
            do {
691
18.5M
                const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
692
18.5M
                if (dbg)
693
0
                    printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
694
18.5M
                const unsigned rc_tok = cf[rc];
695
18.5M
                unsigned tok;
696
18.5M
                int dq;
697
698
                // residual
699
18.5M
                if (rc_tok >= (15 << 11)) {
700
405k
                    tok = read_golomb(&ts->msac) + 15;
701
405k
                    if (dbg)
702
0
                        printf("Post-residual[%d=%d->%d]: r=%d\n",
703
0
                               rc, tok - 15, tok, ts->msac.rng);
704
705
                    // coefficient parsing, see 5.11.39
706
405k
                    tok &= 0xfffff;
707
708
                    // dequant, see 7.12.3
709
405k
                    dq = ((ac_dq * tok) & 0xffffff) >> dq_shift;
710
405k
                    dq = umin(dq, cf_max + sign);
711
18.1M
                } else {
712
                    // cannot exceed cf_max, so we can avoid the clipping
713
18.1M
                    tok = rc_tok >> 11;
714
18.1M
                    dq = ((ac_dq * tok) >> dq_shift);
715
18.1M
                    assert(dq <= cf_max);
716
18.1M
                }
717
18.5M
                cul_level += tok;
718
18.5M
                cf[rc] = (coef) (sign ? -dq : dq);
719
720
18.5M
                rc = rc_tok & 0x3ff; // next non-zero rc, zero if eob
721
18.5M
            } while (rc);
722
3.65M
        }
723
3.00M
    }
724
725
    // context
726
4.93M
    /* bits 0-5: summed token values capped at 63; bits 6-7: DC sign class */
    *res_ctx = umin(cul_level, 63) | dc_sign_level;
727
728
4.93M
    return eob;
729
4.24M
}
730
731
static void read_coef_tree(Dav1dTaskContext *const t,
732
                           const enum BlockSize bs, const Av1Block *const b,
733
                           const enum RectTxfmSize ytx, const int depth,
734
                           const uint16_t *const tx_split,
735
                           const int x_off, const int y_off, pixel *dst)
736
1.14M
{
737
1.14M
    const Dav1dFrameContext *const f = t->f;
738
1.14M
    Dav1dTileState *const ts = t->ts;
739
1.14M
    const Dav1dDSPContext *const dsp = f->dsp;
740
1.14M
    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[ytx];
741
1.14M
    const int txw = t_dim->w, txh = t_dim->h;
742
743
    /* y_off can be larger than 3 since lossless blocks use TX_4X4 but can't
744
     * be splitted. Aviods an undefined left shift. */
745
1.14M
    if (depth < 2 && tx_split[depth] &&
746
103k
        tx_split[depth] & (1 << (y_off * 4 + x_off)))
747
80.0k
    {
748
80.0k
        const enum RectTxfmSize sub = t_dim->sub;
749
80.0k
        const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub];
750
80.0k
        const int txsw = sub_t_dim->w, txsh = sub_t_dim->h;
751
752
80.0k
        read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
753
80.0k
                       x_off * 2 + 0, y_off * 2 + 0, dst);
754
80.0k
        t->bx += txsw;
755
80.0k
        if (txw >= txh && t->bx < f->bw)
756
57.1k
            read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
757
57.1k
                           y_off * 2 + 0, dst ? &dst[4 * txsw] : NULL);
758
80.0k
        t->bx -= txsw;
759
80.0k
        t->by += txsh;
760
80.0k
        if (txh >= txw && t->by < f->bh) {
761
54.8k
            if (dst)
762
15.6k
                dst += 4 * txsh * PXSTRIDE(f->cur.stride[0]);
763
54.8k
            read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
764
54.8k
                           x_off * 2 + 0, y_off * 2 + 1, dst);
765
54.8k
            t->bx += txsw;
766
54.8k
            if (txw >= txh && t->bx < f->bw)
767
33.0k
                read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
768
33.0k
                               y_off * 2 + 1, dst ? &dst[4 * txsw] : NULL);
769
54.8k
            t->bx -= txsw;
770
54.8k
        }
771
80.0k
        t->by -= txsh;
772
1.06M
    } else {
773
1.06M
        const int bx4 = t->bx & 31, by4 = t->by & 31;
774
1.06M
        enum TxfmType txtp;
775
1.06M
        uint8_t cf_ctx;
776
1.06M
        int eob;
777
1.06M
        coef *cf;
778
779
1.06M
        if (t->frame_thread.pass) {
780
1.06M
            const int p = t->frame_thread.pass & 1;
781
1.06M
            assert(ts->frame_thread[p].cf);
782
1.06M
            cf = ts->frame_thread[p].cf;
783
1.06M
            ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
784
18.4E
        } else {
785
18.4E
            cf = bitfn(t->cf);
786
18.4E
        }
787
1.06M
        if (t->frame_thread.pass != 2) {
788
735k
            eob = decode_coefs(t, &t->a->lcoef[bx4], &t->l.lcoef[by4],
789
735k
                               ytx, bs, b, 0, 0, cf, &txtp, &cf_ctx);
790
735k
            if (DEBUG_BLOCK_INFO)
791
0
                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
792
0
                       ytx, txtp, eob, ts->msac.rng);
793
735k
            dav1d_memset_likely_pow2(&t->a->lcoef[bx4], cf_ctx, imin(txw, f->bw - t->bx));
794
735k
            dav1d_memset_likely_pow2(&t->l.lcoef[by4], cf_ctx, imin(txh, f->bh - t->by));
795
735k
#define set_ctx(rep_macro) \
796
2.77M
            for (int y = 0; y < txh; y++) { \
797
2.03M
                rep_macro(txtp_map, 0, txtp); \
798
2.03M
                txtp_map += 32; \
799
2.03M
            }
800
735k
            uint8_t *txtp_map = &t->scratch.txtp_map[by4 * 32 + bx4];
801
735k
            case_set_upto16(t_dim->lw);
802
735k
#undef set_ctx
803
735k
            if (t->frame_thread.pass == 1)
804
735k
                *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
805
735k
        } else {
806
324k
            const int cbi = *ts->frame_thread[0].cbi++;
807
324k
            eob  = cbi >> 5;
808
324k
            txtp = cbi & 0x1f;
809
324k
        }
810
1.06M
        if (!(t->frame_thread.pass & 1)) {
811
325k
            assert(dst);
812
325k
            if (eob >= 0) {
813
210k
                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
814
0
                    coef_dump(cf, imin(t_dim->h, 8) * 4, imin(t_dim->w, 8) * 4, 3, "dq");
815
210k
                dsp->itx.itxfm_add[ytx][txtp](dst, f->cur.stride[0], cf, eob
816
210k
                                              HIGHBD_CALL_SUFFIX);
817
210k
                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
818
0
                    hex_dump(dst, f->cur.stride[0], t_dim->w * 4, t_dim->h * 4, "recon");
819
210k
            }
820
325k
        }
821
1.06M
    }
822
1.14M
}
823
824
void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
825
                                    const enum BlockSize bs, const Av1Block *const b)
826
4.60M
{
827
4.60M
    const Dav1dFrameContext *const f = t->f;
828
4.60M
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
829
4.60M
    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
830
4.60M
    const int bx4 = t->bx & 31, by4 = t->by & 31;
831
4.60M
    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
832
4.60M
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
833
4.60M
    const int bw4 = b_dim[0], bh4 = b_dim[1];
834
4.60M
    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
835
4.60M
    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
836
4.12M
                           (bw4 > ss_hor || t->bx & 1) &&
837
3.83M
                           (bh4 > ss_ver || t->by & 1);
838
839
4.60M
    if (b->skip) {
840
2.78M
        BlockContext *const a = t->a;
841
2.78M
        dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40);
842
2.78M
        dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40);
843
2.78M
        if (has_chroma) {
844
2.10M
            dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)];
845
2.10M
            dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)];
846
2.10M
            memset_cw(&a->ccoef[0][cbx4], 0x40);
847
2.10M
            memset_cw(&a->ccoef[1][cbx4], 0x40);
848
2.10M
            memset_ch(&t->l.ccoef[0][cby4], 0x40);
849
2.10M
            memset_ch(&t->l.ccoef[1][cby4], 0x40);
850
2.10M
        }
851
2.78M
        return;
852
2.78M
    }
853
854
1.82M
    Dav1dTileState *const ts = t->ts;
855
1.82M
    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
856
1.82M
    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
857
1.82M
    assert(t->frame_thread.pass == 1);
858
1.82M
    assert(!b->skip);
859
1.82M
    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
860
1.82M
    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? b->tx : b->max_ytx];
861
1.82M
    const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
862
863
3.72M
    for (int init_y = 0; init_y < h4; init_y += 16) {
864
1.90M
        const int sub_h4 = imin(h4, 16 + init_y);
865
3.91M
        for (int init_x = 0; init_x < w4; init_x += 16) {
866
2.01M
            const int sub_w4 = imin(w4, init_x + 16);
867
2.01M
            int y_off = !!init_y, y, x;
868
4.43M
            for (y = init_y, t->by += init_y; y < sub_h4;
869
2.41M
                 y += t_dim->h, t->by += t_dim->h, y_off++)
870
2.41M
            {
871
2.41M
                int x_off = !!init_x;
872
6.81M
                for (x = init_x, t->bx += init_x; x < sub_w4;
873
4.39M
                     x += t_dim->w, t->bx += t_dim->w, x_off++)
874
4.39M
                {
875
4.39M
                    if (!b->intra) {
876
628k
                        read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
877
628k
                                       x_off, y_off, NULL);
878
3.76M
                    } else {
879
3.76M
                        uint8_t cf_ctx = 0x40;
880
3.76M
                        enum TxfmType txtp;
881
3.76M
                        const int eob =
882
3.76M
                            decode_coefs(t, &t->a->lcoef[bx4 + x],
883
3.76M
                                         &t->l.lcoef[by4 + y], b->tx, bs, b, 1,
884
3.76M
                                         0, ts->frame_thread[1].cf, &txtp, &cf_ctx);
885
3.76M
                        if (DEBUG_BLOCK_INFO)
886
0
                            printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
887
0
                                   b->tx, txtp, eob, ts->msac.rng);
888
3.76M
                        *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
889
3.76M
                        ts->frame_thread[1].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
890
3.76M
                        dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx));
891
3.76M
                        dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by));
892
3.76M
                    }
893
4.39M
                }
894
2.41M
                t->bx -= x;
895
2.41M
            }
896
2.01M
            t->by -= y;
897
898
2.01M
            if (!has_chroma) continue;
899
900
1.63M
            const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
901
1.63M
            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
902
4.89M
            for (int pl = 0; pl < 2; pl++) {
903
6.96M
                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
904
3.70M
                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
905
3.70M
                {
906
8.63M
                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
907
4.93M
                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
908
4.93M
                    {
909
4.93M
                        uint8_t cf_ctx = 0x40;
910
4.93M
                        enum TxfmType txtp;
911
4.93M
                        if (!b->intra)
912
880k
                            txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 +
913
880k
                                                        bx4 + (x << ss_hor)];
914
4.93M
                        const int eob =
915
4.93M
                            decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
916
4.93M
                                         &t->l.ccoef[pl][cby4 + y], b->uvtx, bs,
917
4.93M
                                         b, b->intra, 1 + pl, ts->frame_thread[1].cf,
918
4.93M
                                         &txtp, &cf_ctx);
919
4.93M
                        if (DEBUG_BLOCK_INFO)
920
0
                            printf("Post-uv-cf-blk[pl=%d,tx=%d,"
921
0
                                   "txtp=%d,eob=%d]: r=%d\n",
922
0
                                   pl, b->uvtx, txtp, eob, ts->msac.rng);
923
4.93M
                        *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
924
4.93M
                        ts->frame_thread[1].cf += uv_t_dim->w * uv_t_dim->h * 16;
925
4.93M
                        int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor);
926
4.93M
                        int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver);
927
4.93M
                        dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
928
4.93M
                        dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
929
4.93M
                    }
930
3.70M
                    t->bx -= x << ss_hor;
931
3.70M
                }
932
3.26M
                t->by -= y << ss_ver;
933
3.26M
            }
934
1.63M
        }
935
1.90M
    }
936
1.82M
}
dav1d_read_coef_blocks_8bpc
Line
Count
Source
826
2.23M
{
827
2.23M
    const Dav1dFrameContext *const f = t->f;
828
2.23M
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
829
2.23M
    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
830
2.23M
    const int bx4 = t->bx & 31, by4 = t->by & 31;
831
2.23M
    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
832
2.23M
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
833
2.23M
    const int bw4 = b_dim[0], bh4 = b_dim[1];
834
2.23M
    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
835
2.23M
    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
836
2.04M
                           (bw4 > ss_hor || t->bx & 1) &&
837
1.89M
                           (bh4 > ss_ver || t->by & 1);
838
839
2.23M
    if (b->skip) {
840
1.39M
        BlockContext *const a = t->a;
841
1.39M
        dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40);
842
1.39M
        dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40);
843
1.39M
        if (has_chroma) {
844
1.07M
            dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)];
845
1.07M
            dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)];
846
1.07M
            memset_cw(&a->ccoef[0][cbx4], 0x40);
847
1.07M
            memset_cw(&a->ccoef[1][cbx4], 0x40);
848
1.07M
            memset_ch(&t->l.ccoef[0][cby4], 0x40);
849
1.07M
            memset_ch(&t->l.ccoef[1][cby4], 0x40);
850
1.07M
        }
851
1.39M
        return;
852
1.39M
    }
853
854
837k
    Dav1dTileState *const ts = t->ts;
855
837k
    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
856
837k
    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
857
837k
    assert(t->frame_thread.pass == 1);
858
837k
    assert(!b->skip);
859
837k
    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
860
837k
    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? b->tx : b->max_ytx];
861
837k
    const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
862
863
1.70M
    for (int init_y = 0; init_y < h4; init_y += 16) {
864
868k
        const int sub_h4 = imin(h4, 16 + init_y);
865
1.77M
        for (int init_x = 0; init_x < w4; init_x += 16) {
866
909k
            const int sub_w4 = imin(w4, init_x + 16);
867
909k
            int y_off = !!init_y, y, x;
868
2.01M
            for (y = init_y, t->by += init_y; y < sub_h4;
869
1.10M
                 y += t_dim->h, t->by += t_dim->h, y_off++)
870
1.10M
            {
871
1.10M
                int x_off = !!init_x;
872
3.35M
                for (x = init_x, t->bx += init_x; x < sub_w4;
873
2.25M
                     x += t_dim->w, t->bx += t_dim->w, x_off++)
874
2.25M
                {
875
2.25M
                    if (!b->intra) {
876
380k
                        read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
877
380k
                                       x_off, y_off, NULL);
878
1.87M
                    } else {
879
1.87M
                        uint8_t cf_ctx = 0x40;
880
1.87M
                        enum TxfmType txtp;
881
1.87M
                        const int eob =
882
1.87M
                            decode_coefs(t, &t->a->lcoef[bx4 + x],
883
1.87M
                                         &t->l.lcoef[by4 + y], b->tx, bs, b, 1,
884
1.87M
                                         0, ts->frame_thread[1].cf, &txtp, &cf_ctx);
885
1.87M
                        if (DEBUG_BLOCK_INFO)
886
0
                            printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
887
0
                                   b->tx, txtp, eob, ts->msac.rng);
888
1.87M
                        *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
889
1.87M
                        ts->frame_thread[1].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
890
1.87M
                        dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx));
891
1.87M
                        dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by));
892
1.87M
                    }
893
2.25M
                }
894
1.10M
                t->bx -= x;
895
1.10M
            }
896
909k
            t->by -= y;
897
898
909k
            if (!has_chroma) continue;
899
900
752k
            const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
901
752k
            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
902
2.25M
            for (int pl = 0; pl < 2; pl++) {
903
3.21M
                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
904
1.70M
                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
905
1.70M
                {
906
4.01M
                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
907
2.30M
                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
908
2.30M
                    {
909
2.30M
                        uint8_t cf_ctx = 0x40;
910
2.30M
                        enum TxfmType txtp;
911
2.30M
                        if (!b->intra)
912
495k
                            txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 +
913
495k
                                                        bx4 + (x << ss_hor)];
914
2.30M
                        const int eob =
915
2.30M
                            decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
916
2.30M
                                         &t->l.ccoef[pl][cby4 + y], b->uvtx, bs,
917
2.30M
                                         b, b->intra, 1 + pl, ts->frame_thread[1].cf,
918
2.30M
                                         &txtp, &cf_ctx);
919
2.30M
                        if (DEBUG_BLOCK_INFO)
920
0
                            printf("Post-uv-cf-blk[pl=%d,tx=%d,"
921
0
                                   "txtp=%d,eob=%d]: r=%d\n",
922
0
                                   pl, b->uvtx, txtp, eob, ts->msac.rng);
923
2.30M
                        *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
924
2.30M
                        ts->frame_thread[1].cf += uv_t_dim->w * uv_t_dim->h * 16;
925
2.30M
                        int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor);
926
2.30M
                        int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver);
927
2.30M
                        dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
928
2.30M
                        dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
929
2.30M
                    }
930
1.70M
                    t->bx -= x << ss_hor;
931
1.70M
                }
932
1.50M
                t->by -= y << ss_ver;
933
1.50M
            }
934
752k
        }
935
868k
    }
936
837k
}
dav1d_read_coef_blocks_16bpc
Line
Count
Source
826
2.37M
{
827
2.37M
    const Dav1dFrameContext *const f = t->f;
828
2.37M
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
829
2.37M
    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
830
2.37M
    const int bx4 = t->bx & 31, by4 = t->by & 31;
831
2.37M
    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
832
2.37M
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
833
2.37M
    const int bw4 = b_dim[0], bh4 = b_dim[1];
834
2.37M
    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
835
2.37M
    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
836
2.08M
                           (bw4 > ss_hor || t->bx & 1) &&
837
1.94M
                           (bh4 > ss_ver || t->by & 1);
838
839
2.37M
    if (b->skip) {
840
1.38M
        BlockContext *const a = t->a;
841
1.38M
        dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40);
842
1.38M
        dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40);
843
1.38M
        if (has_chroma) {
844
1.02M
            dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)];
845
1.02M
            dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)];
846
1.02M
            memset_cw(&a->ccoef[0][cbx4], 0x40);
847
1.02M
            memset_cw(&a->ccoef[1][cbx4], 0x40);
848
1.02M
            memset_ch(&t->l.ccoef[0][cby4], 0x40);
849
1.02M
            memset_ch(&t->l.ccoef[1][cby4], 0x40);
850
1.02M
        }
851
1.38M
        return;
852
1.38M
    }
853
854
985k
    Dav1dTileState *const ts = t->ts;
855
985k
    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
856
985k
    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
857
985k
    assert(t->frame_thread.pass == 1);
858
985k
    assert(!b->skip);
859
985k
    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
860
985k
    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? b->tx : b->max_ytx];
861
985k
    const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
862
863
2.01M
    for (int init_y = 0; init_y < h4; init_y += 16) {
864
1.03M
        const int sub_h4 = imin(h4, 16 + init_y);
865
2.13M
        for (int init_x = 0; init_x < w4; init_x += 16) {
866
1.10M
            const int sub_w4 = imin(w4, init_x + 16);
867
1.10M
            int y_off = !!init_y, y, x;
868
2.41M
            for (y = init_y, t->by += init_y; y < sub_h4;
869
1.31M
                 y += t_dim->h, t->by += t_dim->h, y_off++)
870
1.31M
            {
871
1.31M
                int x_off = !!init_x;
872
3.45M
                for (x = init_x, t->bx += init_x; x < sub_w4;
873
2.14M
                     x += t_dim->w, t->bx += t_dim->w, x_off++)
874
2.14M
                {
875
2.14M
                    if (!b->intra) {
876
248k
                        read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
877
248k
                                       x_off, y_off, NULL);
878
1.89M
                    } else {
879
1.89M
                        uint8_t cf_ctx = 0x40;
880
1.89M
                        enum TxfmType txtp;
881
1.89M
                        const int eob =
882
1.89M
                            decode_coefs(t, &t->a->lcoef[bx4 + x],
883
1.89M
                                         &t->l.lcoef[by4 + y], b->tx, bs, b, 1,
884
1.89M
                                         0, ts->frame_thread[1].cf, &txtp, &cf_ctx);
885
1.89M
                        if (DEBUG_BLOCK_INFO)
886
0
                            printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
887
0
                                   b->tx, txtp, eob, ts->msac.rng);
888
1.89M
                        *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
889
1.89M
                        ts->frame_thread[1].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
890
1.89M
                        dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx));
891
1.89M
                        dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by));
892
1.89M
                    }
893
2.14M
                }
894
1.31M
                t->bx -= x;
895
1.31M
            }
896
1.10M
            t->by -= y;
897
898
1.10M
            if (!has_chroma) continue;
899
900
878k
            const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
901
878k
            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
902
2.63M
            for (int pl = 0; pl < 2; pl++) {
903
3.74M
                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
904
1.99M
                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
905
1.99M
                {
906
4.61M
                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
907
2.62M
                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
908
2.62M
                    {
909
2.62M
                        uint8_t cf_ctx = 0x40;
910
2.62M
                        enum TxfmType txtp;
911
2.62M
                        if (!b->intra)
912
384k
                            txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 +
913
384k
                                                        bx4 + (x << ss_hor)];
914
2.62M
                        const int eob =
915
2.62M
                            decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
916
2.62M
                                         &t->l.ccoef[pl][cby4 + y], b->uvtx, bs,
917
2.62M
                                         b, b->intra, 1 + pl, ts->frame_thread[1].cf,
918
2.62M
                                         &txtp, &cf_ctx);
919
2.62M
                        if (DEBUG_BLOCK_INFO)
920
0
                            printf("Post-uv-cf-blk[pl=%d,tx=%d,"
921
0
                                   "txtp=%d,eob=%d]: r=%d\n",
922
0
                                   pl, b->uvtx, txtp, eob, ts->msac.rng);
923
2.62M
                        *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
924
2.62M
                        ts->frame_thread[1].cf += uv_t_dim->w * uv_t_dim->h * 16;
925
2.62M
                        int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor);
926
2.62M
                        int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver);
927
2.62M
                        dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
928
2.62M
                        dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
929
2.62M
                    }
930
1.99M
                    t->bx -= x << ss_hor;
931
1.99M
                }
932
1.75M
                t->by -= y << ss_ver;
933
1.75M
            }
934
878k
        }
935
1.03M
    }
936
985k
}
937
938
static int mc(Dav1dTaskContext *const t,
939
              pixel *const dst8, int16_t *const dst16, const ptrdiff_t dst_stride,
940
              const int bw4, const int bh4,
941
              const int bx, const int by, const int pl,
942
              const mv mv, const Dav1dThreadPicture *const refp, const int refidx,
943
              const enum Filter2d filter_2d)
944
1.06M
{
945
1.06M
    assert((dst8 != NULL) ^ (dst16 != NULL));
946
1.06M
    const Dav1dFrameContext *const f = t->f;
947
1.06M
    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
948
1.06M
    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
949
1.06M
    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
950
1.06M
    const int mvx = mv.x, mvy = mv.y;
951
1.06M
    const int mx = mvx & (15 >> !ss_hor), my = mvy & (15 >> !ss_ver);
952
1.06M
    ptrdiff_t ref_stride = refp->p.stride[!!pl];
953
1.06M
    const pixel *ref;
954
955
1.06M
    if (refp->p.p.w == f->cur.p.w && refp->p.p.h == f->cur.p.h) {
956
806k
        const int dx = bx * h_mul + (mvx >> (3 + ss_hor));
957
806k
        const int dy = by * v_mul + (mvy >> (3 + ss_ver));
958
806k
        int w, h;
959
960
806k
        if (refp->p.data[0] != f->cur.data[0]) { // i.e. not for intrabc
961
505k
            w = (f->cur.p.w + ss_hor) >> ss_hor;
962
505k
            h = (f->cur.p.h + ss_ver) >> ss_ver;
963
505k
        } else {
964
301k
            w = f->bw * 4 >> ss_hor;
965
301k
            h = f->bh * 4 >> ss_ver;
966
301k
        }
967
806k
        if (dx < !!mx * 3 || dy < !!my * 3 ||
968
671k
            dx + bw4 * h_mul + !!mx * 4 > w ||
969
509k
            dy + bh4 * v_mul + !!my * 4 > h)
970
385k
        {
971
385k
            pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
972
385k
            f->dsp->mc.emu_edge(bw4 * h_mul + !!mx * 7, bh4 * v_mul + !!my * 7,
973
385k
                                w, h, dx - !!mx * 3, dy - !!my * 3,
974
385k
                                emu_edge_buf, 192 * sizeof(pixel),
975
385k
                                refp->p.data[pl], ref_stride);
976
385k
            ref = &emu_edge_buf[192 * !!my * 3 + !!mx * 3];
977
385k
            ref_stride = 192 * sizeof(pixel);
978
421k
        } else {
979
421k
            ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
980
421k
        }
981
982
806k
        if (dst8 != NULL) {
983
696k
            f->dsp->mc.mc[filter_2d](dst8, dst_stride, ref, ref_stride, bw4 * h_mul,
984
696k
                                     bh4 * v_mul, mx << !ss_hor, my << !ss_ver
985
696k
                                     HIGHBD_CALL_SUFFIX);
986
696k
        } else {
987
110k
            f->dsp->mc.mct[filter_2d](dst16, ref, ref_stride, bw4 * h_mul,
988
110k
                                      bh4 * v_mul, mx << !ss_hor, my << !ss_ver
989
110k
                                      HIGHBD_CALL_SUFFIX);
990
110k
        }
991
806k
    } else {
992
260k
        assert(refp != &f->sr_cur);
993
994
260k
        const int orig_pos_y = (by * v_mul << 4) + mvy * (1 << !ss_ver);
995
260k
        const int orig_pos_x = (bx * h_mul << 4) + mvx * (1 << !ss_hor);
996
521k
#define scale_mv(res, val, scale) do { \
997
521k
            const int64_t tmp = (int64_t)(val) * scale + (scale - 0x4000) * 8; \
998
521k
            res = apply_sign64((int) ((llabs(tmp) + 128) >> 8), tmp) + 32;     \
999
521k
        } while (0)
1000
260k
        int pos_y, pos_x;
1001
260k
        scale_mv(pos_x, orig_pos_x, f->svc[refidx][0].scale);
1002
260k
        scale_mv(pos_y, orig_pos_y, f->svc[refidx][1].scale);
1003
260k
#undef scale_mv
1004
260k
        const int left = pos_x >> 10;
1005
260k
        const int top = pos_y >> 10;
1006
260k
        const int right =
1007
260k
            ((pos_x + (bw4 * h_mul - 1) * f->svc[refidx][0].step) >> 10) + 1;
1008
260k
        const int bottom =
1009
260k
            ((pos_y + (bh4 * v_mul - 1) * f->svc[refidx][1].step) >> 10) + 1;
1010
1011
260k
        if (DEBUG_BLOCK_INFO)
1012
0
            printf("Off %dx%d [%d,%d,%d], size %dx%d [%d,%d]\n",
1013
0
                   left, top, orig_pos_x, f->svc[refidx][0].scale, refidx,
1014
0
                   right-left, bottom-top,
1015
0
                   f->svc[refidx][0].step, f->svc[refidx][1].step);
1016
1017
260k
        const int w = (refp->p.p.w + ss_hor) >> ss_hor;
1018
260k
        const int h = (refp->p.p.h + ss_ver) >> ss_ver;
1019
260k
        if (left < 3 || top < 3 || right + 4 > w || bottom + 4 > h) {
1020
214k
            pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
1021
214k
            f->dsp->mc.emu_edge(right - left + 7, bottom - top + 7,
1022
214k
                                w, h, left - 3, top - 3,
1023
214k
                                emu_edge_buf, 320 * sizeof(pixel),
1024
214k
                                refp->p.data[pl], ref_stride);
1025
214k
            ref = &emu_edge_buf[320 * 3 + 3];
1026
214k
            ref_stride = 320 * sizeof(pixel);
1027
214k
            if (DEBUG_BLOCK_INFO) printf("Emu\n");
1028
214k
        } else {
1029
45.8k
            ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * top + left;
1030
45.8k
        }
1031
1032
260k
        if (dst8 != NULL) {
1033
208k
            f->dsp->mc.mc_scaled[filter_2d](dst8, dst_stride, ref, ref_stride,
1034
208k
                                            bw4 * h_mul, bh4 * v_mul,
1035
208k
                                            pos_x & 0x3ff, pos_y & 0x3ff,
1036
208k
                                            f->svc[refidx][0].step,
1037
208k
                                            f->svc[refidx][1].step
1038
208k
                                            HIGHBD_CALL_SUFFIX);
1039
208k
        } else {
1040
52.3k
            f->dsp->mc.mct_scaled[filter_2d](dst16, ref, ref_stride,
1041
52.3k
                                             bw4 * h_mul, bh4 * v_mul,
1042
52.3k
                                             pos_x & 0x3ff, pos_y & 0x3ff,
1043
52.3k
                                             f->svc[refidx][0].step,
1044
52.3k
                                             f->svc[refidx][1].step
1045
52.3k
                                             HIGHBD_CALL_SUFFIX);
1046
52.3k
        }
1047
260k
    }
1048
1049
1.06M
    return 0;
1050
1.06M
}
1051
1052
static int obmc(Dav1dTaskContext *const t,
1053
                pixel *const dst, const ptrdiff_t dst_stride,
1054
                const uint8_t *const b_dim, const int pl,
1055
                const int bx4, const int by4, const int w4, const int h4)
1056
80.8k
{
1057
80.8k
    assert(!(t->bx & 1) && !(t->by & 1));
1058
80.8k
    const Dav1dFrameContext *const f = t->f;
1059
80.8k
    /*const*/ refmvs_block **r = &t->rt.r[(t->by & 31) + 5];
1060
80.8k
    pixel *const lap = bitfn(t->scratch.lap);
1061
80.8k
    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
1062
80.8k
    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
1063
80.8k
    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
1064
80.8k
    int res;
1065
1066
80.8k
    if (t->by > t->ts->tiling.row_start &&
1067
66.4k
        (!pl || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16))
1068
57.0k
    {
1069
119k
        for (int i = 0, x = 0; x < w4 && i < imin(b_dim[2], 4); ) {
1070
            // only odd blocks are considered for overlap handling, hence +1
1071
62.7k
            const refmvs_block *const a_r = &r[-1][t->bx + x + 1];
1072
62.7k
            const uint8_t *const a_b_dim = dav1d_block_dimensions[a_r->bs];
1073
62.7k
            const int step4 = iclip(a_b_dim[0], 2, 16);
1074
1075
62.7k
            if (a_r->ref.ref[0] > 0) {
1076
62.1k
                const int ow4 = imin(step4, b_dim[0]);
1077
62.1k
                const int oh4 = imin(b_dim[1], 16) >> 1;
1078
62.1k
                res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, (oh4 * 3 + 3) >> 2,
1079
62.1k
                         t->bx + x, t->by, pl, a_r->mv.mv[0],
1080
62.1k
                         &f->refp[a_r->ref.ref[0] - 1], a_r->ref.ref[0] - 1,
1081
62.1k
                         dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);
1082
62.1k
                if (res) return res;
1083
62.1k
                f->dsp->mc.blend_h(&dst[x * h_mul], dst_stride, lap,
1084
62.1k
                                   h_mul * ow4, v_mul * oh4);
1085
62.1k
                i++;
1086
62.1k
            }
1087
62.7k
            x += step4;
1088
62.7k
        }
1089
57.0k
    }
1090
1091
80.8k
    if (t->bx > t->ts->tiling.col_start)
1092
127k
        for (int i = 0, y = 0; y < h4 && i < imin(b_dim[3], 4); ) {
1093
            // only odd blocks are considered for overlap handling, hence +1
1094
66.2k
            const refmvs_block *const l_r = &r[y + 1][t->bx - 1];
1095
66.2k
            const uint8_t *const l_b_dim = dav1d_block_dimensions[l_r->bs];
1096
66.2k
            const int step4 = iclip(l_b_dim[1], 2, 16);
1097
1098
66.2k
            if (l_r->ref.ref[0] > 0) {
1099
65.3k
                const int ow4 = imin(b_dim[0], 16) >> 1;
1100
65.3k
                const int oh4 = imin(step4, b_dim[1]);
1101
65.3k
                res = mc(t, lap, NULL, h_mul * ow4 * sizeof(pixel), ow4, oh4,
1102
65.3k
                         t->bx, t->by + y, pl, l_r->mv.mv[0],
1103
65.3k
                         &f->refp[l_r->ref.ref[0] - 1], l_r->ref.ref[0] - 1,
1104
65.3k
                         dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]]);
1105
65.3k
                if (res) return res;
1106
65.3k
                f->dsp->mc.blend_v(&dst[y * v_mul * PXSTRIDE(dst_stride)],
1107
65.3k
                                   dst_stride, lap, h_mul * ow4, v_mul * oh4);
1108
65.3k
                i++;
1109
65.3k
            }
1110
66.2k
            y += step4;
1111
66.2k
        }
1112
80.8k
    return 0;
1113
80.8k
}
1114
1115
static int warp_affine(Dav1dTaskContext *const t,
1116
                       pixel *dst8, int16_t *dst16, const ptrdiff_t dstride,
1117
                       const uint8_t *const b_dim, const int pl,
1118
                       const Dav1dThreadPicture *const refp,
1119
                       const Dav1dWarpedMotionParams *const wmp)
1120
30.2k
{
1121
30.2k
    assert((dst8 != NULL) ^ (dst16 != NULL));
1122
30.2k
    const Dav1dFrameContext *const f = t->f;
1123
30.2k
    const Dav1dDSPContext *const dsp = f->dsp;
1124
30.2k
    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
1125
30.2k
    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
1126
30.2k
    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
1127
30.2k
    assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7));
1128
30.2k
    const int32_t *const mat = wmp->matrix;
1129
30.2k
    const int width = (refp->p.p.w + ss_hor) >> ss_hor;
1130
30.2k
    const int height = (refp->p.p.h + ss_ver) >> ss_ver;
1131
1132
100k
    for (int y = 0; y < b_dim[1] * v_mul; y += 8) {
1133
70.2k
        const int src_y = t->by * 4 + ((y + 4) << ss_ver);
1134
70.2k
        const int64_t mat3_y = (int64_t) mat[3] * src_y + mat[0];
1135
70.2k
        const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1];
1136
379k
        for (int x = 0; x < b_dim[0] * h_mul; x += 8) {
1137
            // calculate transformation relative to center of 8x8 block in
1138
            // luma pixel units
1139
309k
            const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
1140
309k
            const int64_t mvx = ((int64_t) mat[2] * src_x + mat3_y) >> ss_hor;
1141
309k
            const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver;
1142
1143
309k
            const int dx = (int) (mvx >> 16) - 4;
1144
309k
            const int mx = (((int) mvx & 0xffff) - wmp->u.p.alpha * 4 -
1145
309k
                                                   wmp->u.p.beta  * 7) & ~0x3f;
1146
309k
            const int dy = (int) (mvy >> 16) - 4;
1147
309k
            const int my = (((int) mvy & 0xffff) - wmp->u.p.gamma * 4 -
1148
309k
                                                   wmp->u.p.delta * 4) & ~0x3f;
1149
1150
309k
            const pixel *ref_ptr;
1151
309k
            ptrdiff_t ref_stride = refp->p.stride[!!pl];
1152
1153
309k
            if (dx < 3 || dx + 8 + 4 > width || dy < 3 || dy + 8 + 4 > height) {
1154
226k
                pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
1155
226k
                f->dsp->mc.emu_edge(15, 15, width, height, dx - 3, dy - 3,
1156
226k
                                    emu_edge_buf, 32 * sizeof(pixel),
1157
226k
                                    refp->p.data[pl], ref_stride);
1158
226k
                ref_ptr = &emu_edge_buf[32 * 3 + 3];
1159
226k
                ref_stride = 32 * sizeof(pixel);
1160
226k
            } else {
1161
83.2k
                ref_ptr = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
1162
83.2k
            }
1163
309k
            if (dst16 != NULL)
1164
52.5k
                dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride,
1165
52.5k
                                 wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
1166
257k
            else
1167
257k
                dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride,
1168
257k
                                wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
1169
309k
        }
1170
70.2k
        if (dst8) dst8  += 8 * PXSTRIDE(dstride);
1171
15.6k
        else      dst16 += 8 * dstride;
1172
70.2k
    }
1173
30.2k
    return 0;
1174
30.2k
}
1175
1176
void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize bs,
1177
                                 const enum EdgeFlags intra_edge_flags,
1178
                                 const Av1Block *const b)
1179
1.79M
{
1180
1.79M
    Dav1dTileState *const ts = t->ts;
1181
1.79M
    const Dav1dFrameContext *const f = t->f;
1182
1.79M
    const Dav1dDSPContext *const dsp = f->dsp;
1183
1.79M
    const int bx4 = t->bx & 31, by4 = t->by & 31;
1184
1.79M
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
1185
1.79M
    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
1186
1.79M
    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
1187
1.79M
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
1188
1.79M
    const int bw4 = b_dim[0], bh4 = b_dim[1];
1189
1.79M
    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
1190
1.79M
    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
1191
1.79M
    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
1192
1.62M
                           (bw4 > ss_hor || t->bx & 1) &&
1193
1.52M
                           (bh4 > ss_ver || t->by & 1);
1194
1.79M
    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx];
1195
1.79M
    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
1196
1197
    // coefficient coding
1198
1.79M
    pixel *const edge = bitfn(t->scratch.edge) + 128;
1199
1.79M
    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
1200
1201
1.79M
    const int intra_edge_filter_flag = f->seq_hdr->intra_edge_filter << 10;
1202
1203
3.69M
    for (int init_y = 0; init_y < h4; init_y += 16) {
1204
1.89M
        const int sub_h4 = imin(h4, 16 + init_y);
1205
1.89M
        const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
1206
3.93M
        for (int init_x = 0; init_x < w4; init_x += 16) {
1207
2.04M
            if (b->pal_sz[0]) {
1208
8.81k
                pixel *dst = ((pixel *) f->cur.data[0]) +
1209
8.81k
                             4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
1210
8.81k
                const uint8_t *pal_idx;
1211
8.81k
                if (t->frame_thread.pass) {
1212
8.81k
                    const int p = t->frame_thread.pass & 1;
1213
8.81k
                    assert(ts->frame_thread[p].pal_idx);
1214
8.81k
                    pal_idx = ts->frame_thread[p].pal_idx;
1215
8.81k
                    ts->frame_thread[p].pal_idx += bw4 * bh4 * 8;
1216
8.81k
                } else {
1217
0
                    pal_idx = t->scratch.pal_idx_y;
1218
0
                }
1219
8.81k
                const pixel *const pal = t->frame_thread.pass ?
1220
8.81k
                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
1221
8.81k
                                        ((t->bx >> 1) + (t->by & 1))][0] :
1222
8.81k
                    bytefn(t->scratch.pal)[0];
1223
8.81k
                f->dsp->ipred.pal_pred(dst, f->cur.stride[0], pal,
1224
8.81k
                                       pal_idx, bw4 * 4, bh4 * 4);
1225
8.81k
                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1226
0
                    hex_dump(dst, PXSTRIDE(f->cur.stride[0]),
1227
0
                             bw4 * 4, bh4 * 4, "y-pal-pred");
1228
8.81k
            }
1229
1230
2.04M
            const int intra_flags = (sm_flag(t->a, bx4) |
1231
2.04M
                                     sm_flag(&t->l, by4) |
1232
2.04M
                                     intra_edge_filter_flag);
1233
2.04M
            const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 :
1234
1.89M
                              intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT;
1235
2.04M
            const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 :
1236
1.89M
                              intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM;
1237
2.04M
            int y, x;
1238
2.04M
            const int sub_w4 = imin(w4, init_x + 16);
1239
5.04M
            for (y = init_y, t->by += init_y; y < sub_h4;
1240
3.00M
                 y += t_dim->h, t->by += t_dim->h)
1241
3.00M
            {
1242
3.00M
                pixel *dst = ((pixel *) f->cur.data[0]) +
1243
3.00M
                               4 * (t->by * PXSTRIDE(f->cur.stride[0]) +
1244
3.00M
                                    t->bx + init_x);
1245
14.6M
                for (x = init_x, t->bx += init_x; x < sub_w4;
1246
11.6M
                     x += t_dim->w, t->bx += t_dim->w)
1247
11.6M
                {
1248
11.6M
                    if (b->pal_sz[0]) goto skip_y_pred;
1249
1250
11.6M
                    int angle = b->y_angle;
1251
11.6M
                    const enum EdgeFlags edge_flags =
1252
11.6M
                        (((y > init_y || !sb_has_tr) && (x + t_dim->w >= sub_w4)) ?
1253
10.1M
                             0 : EDGE_I444_TOP_HAS_RIGHT) |
1254
11.6M
                        ((x > init_x || (!sb_has_bl && y + t_dim->h >= sub_h4)) ?
1255
9.81M
                             0 : EDGE_I444_LEFT_HAS_BOTTOM);
1256
11.6M
                    const pixel *top_sb_edge = NULL;
1257
11.6M
                    if (!(t->by & (f->sb_step - 1))) {
1258
1.17M
                        top_sb_edge = f->ipred_edge[0];
1259
1.17M
                        const int sby = t->by >> f->sb_shift;
1260
1.17M
                        top_sb_edge += f->sb128w * 128 * (sby - 1);
1261
1.17M
                    }
1262
11.6M
                    const enum IntraPredMode m =
1263
11.6M
                        bytefn(dav1d_prepare_intra_edges)(t->bx,
1264
11.6M
                                                          t->bx > ts->tiling.col_start,
1265
11.6M
                                                          t->by,
1266
11.6M
                                                          t->by > ts->tiling.row_start,
1267
11.6M
                                                          ts->tiling.col_end,
1268
11.6M
                                                          ts->tiling.row_end,
1269
11.6M
                                                          edge_flags, dst,
1270
11.6M
                                                          f->cur.stride[0], top_sb_edge,
1271
11.6M
                                                          b->y_mode, &angle,
1272
11.6M
                                                          t_dim->w, t_dim->h,
1273
11.6M
                                                          f->seq_hdr->intra_edge_filter,
1274
11.6M
                                                          edge HIGHBD_CALL_SUFFIX);
1275
11.6M
                    dsp->ipred.intra_pred[m](dst, f->cur.stride[0], edge,
1276
11.6M
                                             t_dim->w * 4, t_dim->h * 4,
1277
11.6M
                                             angle | intra_flags,
1278
11.6M
                                             4 * f->bw - 4 * t->bx,
1279
11.6M
                                             4 * f->bh - 4 * t->by
1280
11.6M
                                             HIGHBD_CALL_SUFFIX);
1281
1282
11.6M
                    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
1283
0
                        hex_dump(edge - t_dim->h * 4, t_dim->h * 4,
1284
0
                                 t_dim->h * 4, 2, "l");
1285
0
                        hex_dump(edge, 0, 1, 1, "tl");
1286
0
                        hex_dump(edge + 1, t_dim->w * 4,
1287
0
                                 t_dim->w * 4, 2, "t");
1288
0
                        hex_dump(dst, f->cur.stride[0],
1289
0
                                 t_dim->w * 4, t_dim->h * 4, "y-intra-pred");
1290
0
                    }
1291
1292
11.6M
                skip_y_pred: {}
1293
11.6M
                    if (!b->skip) {
1294
1.55M
                        coef *cf;
1295
1.55M
                        int eob;
1296
1.55M
                        enum TxfmType txtp;
1297
1.55M
                        if (t->frame_thread.pass) {
1298
1.55M
                            const int p = t->frame_thread.pass & 1;
1299
1.55M
                            const int cbi = *ts->frame_thread[p].cbi++;
1300
1.55M
                            cf = ts->frame_thread[p].cf;
1301
1.55M
                            ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
1302
1.55M
                            eob  = cbi >> 5;
1303
1.55M
                            txtp = cbi & 0x1f;
1304
18.4E
                        } else {
1305
18.4E
                            uint8_t cf_ctx;
1306
18.4E
                            cf = bitfn(t->cf);
1307
18.4E
                            eob = decode_coefs(t, &t->a->lcoef[bx4 + x],
1308
18.4E
                                               &t->l.lcoef[by4 + y], b->tx, bs,
1309
18.4E
                                               b, 1, 0, cf, &txtp, &cf_ctx);
1310
18.4E
                            if (DEBUG_BLOCK_INFO)
1311
0
                                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
1312
0
                                       b->tx, txtp, eob, ts->msac.rng);
1313
18.4E
                            dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx));
1314
18.4E
                            dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by));
1315
18.4E
                        }
1316
1.55M
                        if (eob >= 0) {
1317
1.14M
                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1318
0
                                coef_dump(cf, imin(t_dim->h, 8) * 4,
1319
0
                                          imin(t_dim->w, 8) * 4, 3, "dq");
1320
1.14M
                            dsp->itx.itxfm_add[b->tx]
1321
1.14M
                                              [txtp](dst,
1322
1.14M
                                                     f->cur.stride[0],
1323
1.14M
                                                     cf, eob HIGHBD_CALL_SUFFIX);
1324
1.14M
                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1325
0
                                hex_dump(dst, f->cur.stride[0],
1326
0
                                         t_dim->w * 4, t_dim->h * 4, "recon");
1327
1.14M
                        }
1328
10.1M
                    } else if (!t->frame_thread.pass) {
1329
0
                        dav1d_memset_pow2[t_dim->lw](&t->a->lcoef[bx4 + x], 0x40);
1330
0
                        dav1d_memset_pow2[t_dim->lh](&t->l.lcoef[by4 + y], 0x40);
1331
0
                    }
1332
11.6M
                    dst += 4 * t_dim->w;
1333
11.6M
                }
1334
3.00M
                t->bx -= x;
1335
3.00M
            }
1336
2.04M
            t->by -= y;
1337
1338
2.04M
            if (!has_chroma) continue;
1339
1340
1.59M
            const ptrdiff_t stride = f->cur.stride[1];
1341
1342
1.59M
            if (b->uv_mode == CFL_PRED) {
1343
315k
                assert(!init_x && !init_y);
1344
1345
315k
                int16_t *const ac = t->scratch.ac;
1346
315k
                pixel *y_src = ((pixel *) f->cur.data[0]) + 4 * (t->bx & ~ss_hor) +
1347
315k
                                 4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.stride[0]);
1348
315k
                const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) +
1349
315k
                                              (t->by >> ss_ver) * PXSTRIDE(stride));
1350
315k
                pixel *const uv_dst[2] = { ((pixel *) f->cur.data[1]) + uv_off,
1351
315k
                                           ((pixel *) f->cur.data[2]) + uv_off };
1352
1353
315k
                const int furthest_r =
1354
315k
                    ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1);
1355
315k
                const int furthest_b =
1356
315k
                    ((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1);
1357
315k
                dsp->ipred.cfl_ac[f->cur.p.layout - 1](ac, y_src, f->cur.stride[0],
1358
315k
                                                         cbw4 - (furthest_r >> ss_hor),
1359
315k
                                                         cbh4 - (furthest_b >> ss_ver),
1360
315k
                                                         cbw4 * 4, cbh4 * 4);
1361
946k
                for (int pl = 0; pl < 2; pl++) {
1362
631k
                    if (!b->cfl_alpha[pl]) continue;
1363
526k
                    int angle = 0;
1364
526k
                    const pixel *top_sb_edge = NULL;
1365
526k
                    if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
1366
139k
                        top_sb_edge = f->ipred_edge[pl + 1];
1367
139k
                        const int sby = t->by >> f->sb_shift;
1368
139k
                        top_sb_edge += f->sb128w * 128 * (sby - 1);
1369
139k
                    }
1370
526k
                    const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
1371
526k
                    const int xstart = ts->tiling.col_start >> ss_hor;
1372
526k
                    const int ystart = ts->tiling.row_start >> ss_ver;
1373
526k
                    const enum IntraPredMode m =
1374
526k
                        bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
1375
526k
                                                          ypos, ypos > ystart,
1376
526k
                                                          ts->tiling.col_end >> ss_hor,
1377
526k
                                                          ts->tiling.row_end >> ss_ver,
1378
526k
                                                          0, uv_dst[pl], stride,
1379
526k
                                                          top_sb_edge, DC_PRED, &angle,
1380
526k
                                                          uv_t_dim->w, uv_t_dim->h, 0,
1381
526k
                                                          edge HIGHBD_CALL_SUFFIX);
1382
526k
                    dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge,
1383
526k
                                           uv_t_dim->w * 4,
1384
526k
                                           uv_t_dim->h * 4,
1385
526k
                                           ac, b->cfl_alpha[pl]
1386
526k
                                           HIGHBD_CALL_SUFFIX);
1387
526k
                }
1388
315k
                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
1389
0
                    ac_dump(ac, 4*cbw4, 4*cbh4, "ac");
1390
0
                    hex_dump(uv_dst[0], stride, cbw4 * 4, cbh4 * 4, "u-cfl-pred");
1391
0
                    hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred");
1392
0
                }
1393
1.27M
            } else if (b->pal_sz[1]) {
1394
3.75k
                const ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
1395
3.75k
                                              (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
1396
3.75k
                const pixel (*pal)[8];
1397
3.75k
                const uint8_t *pal_idx;
1398
3.75k
                if (t->frame_thread.pass) {
1399
3.75k
                    const int p = t->frame_thread.pass & 1;
1400
3.75k
                    assert(ts->frame_thread[p].pal_idx);
1401
3.75k
                    pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
1402
3.75k
                                              ((t->bx >> 1) + (t->by & 1))];
1403
3.75k
                    pal_idx = ts->frame_thread[p].pal_idx;
1404
3.75k
                    ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8;
1405
3.75k
                } else {
1406
0
                    pal = bytefn(t->scratch.pal);
1407
0
                    pal_idx = t->scratch.pal_idx_uv;
1408
0
                }
1409
1410
3.75k
                f->dsp->ipred.pal_pred(((pixel *) f->cur.data[1]) + uv_dstoff,
1411
3.75k
                                       f->cur.stride[1], pal[1],
1412
3.75k
                                       pal_idx, cbw4 * 4, cbh4 * 4);
1413
3.75k
                f->dsp->ipred.pal_pred(((pixel *) f->cur.data[2]) + uv_dstoff,
1414
3.75k
                                       f->cur.stride[1], pal[2],
1415
3.75k
                                       pal_idx, cbw4 * 4, cbh4 * 4);
1416
3.75k
                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
1417
0
                    hex_dump(((pixel *) f->cur.data[1]) + uv_dstoff,
1418
0
                             PXSTRIDE(f->cur.stride[1]),
1419
0
                             cbw4 * 4, cbh4 * 4, "u-pal-pred");
1420
0
                    hex_dump(((pixel *) f->cur.data[2]) + uv_dstoff,
1421
0
                             PXSTRIDE(f->cur.stride[1]),
1422
0
                             cbw4 * 4, cbh4 * 4, "v-pal-pred");
1423
0
                }
1424
3.75k
            }
1425
1426
1.59M
            const int sm_uv_fl = sm_uv_flag(t->a, cbx4) |
1427
1.59M
                                 sm_uv_flag(&t->l, cby4);
1428
1.59M
            const int uv_sb_has_tr =
1429
1.59M
                ((init_x + 16) >> ss_hor) < cw4 ? 1 : init_y ? 0 :
1430
1.49M
                intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.layout - 1));
1431
1.59M
            const int uv_sb_has_bl =
1432
1.59M
                init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 1 :
1433
1.49M
                intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.layout - 1));
1434
1.59M
            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
1435
4.76M
            for (int pl = 0; pl < 2; pl++) {
1436
6.83M
                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
1437
3.65M
                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
1438
3.65M
                {
1439
3.65M
                    pixel *dst = ((pixel *) f->cur.data[1 + pl]) +
1440
3.65M
                                   4 * ((t->by >> ss_ver) * PXSTRIDE(stride) +
1441
3.65M
                                        ((t->bx + init_x) >> ss_hor));
1442
8.62M
                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
1443
4.97M
                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
1444
4.97M
                    {
1445
4.97M
                        if ((b->uv_mode == CFL_PRED && b->cfl_alpha[pl]) ||
1446
4.44M
                            b->pal_sz[1])
1447
535k
                        {
1448
535k
                            goto skip_uv_pred;
1449
535k
                        }
1450
1451
4.43M
                        int angle = b->uv_angle;
1452
                        // this probably looks weird because we're using
1453
                        // luma flags in a chroma loop, but that's because
1454
                        // prepare_intra_edges() expects luma flags as input
1455
4.43M
                        const enum EdgeFlags edge_flags =
1456
4.43M
                            (((y > (init_y >> ss_ver) || !uv_sb_has_tr) &&
1457
2.25M
                              (x + uv_t_dim->w >= sub_cw4)) ?
1458
3.22M
                                 0 : EDGE_I444_TOP_HAS_RIGHT) |
1459
4.43M
                            ((x > (init_x >> ss_hor) ||
1460
3.12M
                              (!uv_sb_has_bl && y + uv_t_dim->h >= sub_ch4)) ?
1461
2.81M
                                 0 : EDGE_I444_LEFT_HAS_BOTTOM);
1462
4.43M
                        const pixel *top_sb_edge = NULL;
1463
4.43M
                        if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
1464
1.26M
                            top_sb_edge = f->ipred_edge[1 + pl];
1465
1.26M
                            const int sby = t->by >> f->sb_shift;
1466
1.26M
                            top_sb_edge += f->sb128w * 128 * (sby - 1);
1467
1.26M
                        }
1468
4.43M
                        const enum IntraPredMode uv_mode =
1469
4.43M
                             b->uv_mode == CFL_PRED ? DC_PRED : b->uv_mode;
1470
4.43M
                        const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
1471
4.43M
                        const int xstart = ts->tiling.col_start >> ss_hor;
1472
4.43M
                        const int ystart = ts->tiling.row_start >> ss_ver;
1473
4.43M
                        const enum IntraPredMode m =
1474
4.43M
                            bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
1475
4.43M
                                                              ypos, ypos > ystart,
1476
4.43M
                                                              ts->tiling.col_end >> ss_hor,
1477
4.43M
                                                              ts->tiling.row_end >> ss_ver,
1478
4.43M
                                                              edge_flags, dst, stride,
1479
4.43M
                                                              top_sb_edge, uv_mode,
1480
4.43M
                                                              &angle, uv_t_dim->w,
1481
4.43M
                                                              uv_t_dim->h,
1482
4.43M
                                                              f->seq_hdr->intra_edge_filter,
1483
4.43M
                                                              edge HIGHBD_CALL_SUFFIX);
1484
4.43M
                        angle |= intra_edge_filter_flag;
1485
4.43M
                        dsp->ipred.intra_pred[m](dst, stride, edge,
1486
4.43M
                                                 uv_t_dim->w * 4,
1487
4.43M
                                                 uv_t_dim->h * 4,
1488
4.43M
                                                 angle | sm_uv_fl,
1489
4.43M
                                                 (4 * f->bw + ss_hor -
1490
4.43M
                                                  4 * (t->bx & ~ss_hor)) >> ss_hor,
1491
4.43M
                                                 (4 * f->bh + ss_ver -
1492
4.43M
                                                  4 * (t->by & ~ss_ver)) >> ss_ver
1493
4.43M
                                                 HIGHBD_CALL_SUFFIX);
1494
4.43M
                        if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
1495
0
                            hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4,
1496
0
                                     uv_t_dim->h * 4, 2, "l");
1497
0
                            hex_dump(edge, 0, 1, 1, "tl");
1498
0
                            hex_dump(edge + 1, uv_t_dim->w * 4,
1499
0
                                     uv_t_dim->w * 4, 2, "t");
1500
0
                            hex_dump(dst, stride, uv_t_dim->w * 4,
1501
0
                                     uv_t_dim->h * 4, pl ? "v-intra-pred" : "u-intra-pred");
1502
0
                        }
1503
1504
4.97M
                    skip_uv_pred: {}
1505
4.97M
                        if (!b->skip) {
1506
1.60M
                            enum TxfmType txtp;
1507
1.60M
                            int eob;
1508
1.60M
                            coef *cf;
1509
1.60M
                            if (t->frame_thread.pass) {
1510
1.60M
                                const int p = t->frame_thread.pass & 1;
1511
1.60M
                                const int cbi = *ts->frame_thread[p].cbi++;
1512
1.60M
                                cf = ts->frame_thread[p].cf;
1513
1.60M
                                ts->frame_thread[p].cf += uv_t_dim->w * uv_t_dim->h * 16;
1514
1.60M
                                eob  = cbi >> 5;
1515
1.60M
                                txtp = cbi & 0x1f;
1516
1.60M
                            } else {
1517
0
                                uint8_t cf_ctx;
1518
0
                                cf = bitfn(t->cf);
1519
0
                                eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
1520
0
                                                   &t->l.ccoef[pl][cby4 + y],
1521
0
                                                   b->uvtx, bs, b, 1, 1 + pl, cf,
1522
0
                                                   &txtp, &cf_ctx);
1523
0
                                if (DEBUG_BLOCK_INFO)
1524
0
                                    printf("Post-uv-cf-blk[pl=%d,tx=%d,"
1525
0
                                           "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n",
1526
0
                                           pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);
1527
0
                                int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor);
1528
0
                                int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver);
1529
0
                                dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
1530
0
                                dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
1531
0
                            }
1532
1.60M
                            if (eob >= 0) {
1533
585k
                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1534
0
                                    coef_dump(cf, uv_t_dim->h * 4,
1535
0
                                              uv_t_dim->w * 4, 3, "dq");
1536
585k
                                dsp->itx.itxfm_add[b->uvtx]
1537
585k
                                                  [txtp](dst, stride,
1538
585k
                                                         cf, eob HIGHBD_CALL_SUFFIX);
1539
585k
                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1540
0
                                    hex_dump(dst, stride, uv_t_dim->w * 4,
1541
0
                                             uv_t_dim->h * 4, "recon");
1542
585k
                            }
1543
3.36M
                        } else if (!t->frame_thread.pass) {
1544
0
                            dav1d_memset_pow2[uv_t_dim->lw](&t->a->ccoef[pl][cbx4 + x], 0x40);
1545
0
                            dav1d_memset_pow2[uv_t_dim->lh](&t->l.ccoef[pl][cby4 + y], 0x40);
1546
0
                        }
1547
4.97M
                        dst += uv_t_dim->w * 4;
1548
4.97M
                    }
1549
3.65M
                    t->bx -= x << ss_hor;
1550
3.65M
                }
1551
3.17M
                t->by -= y << ss_ver;
1552
3.17M
            }
1553
1.59M
        }
1554
1.89M
    }
1555
1.79M
}
dav1d_recon_b_intra_8bpc
Line
Count
Source
1179
794k
{
1180
794k
    Dav1dTileState *const ts = t->ts;
1181
794k
    const Dav1dFrameContext *const f = t->f;
1182
794k
    const Dav1dDSPContext *const dsp = f->dsp;
1183
794k
    const int bx4 = t->bx & 31, by4 = t->by & 31;
1184
794k
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
1185
794k
    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
1186
794k
    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
1187
794k
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
1188
794k
    const int bw4 = b_dim[0], bh4 = b_dim[1];
1189
794k
    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
1190
794k
    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
1191
794k
    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
1192
744k
                           (bw4 > ss_hor || t->bx & 1) &&
1193
697k
                           (bh4 > ss_ver || t->by & 1);
1194
794k
    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx];
1195
794k
    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
1196
1197
    // coefficient coding
1198
794k
    pixel *const edge = bitfn(t->scratch.edge) + 128;
1199
794k
    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
1200
1201
794k
    const int intra_edge_filter_flag = f->seq_hdr->intra_edge_filter << 10;
1202
1203
1.62M
    for (int init_y = 0; init_y < h4; init_y += 16) {
1204
833k
        const int sub_h4 = imin(h4, 16 + init_y);
1205
833k
        const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
1206
1.72M
        for (int init_x = 0; init_x < w4; init_x += 16) {
1207
891k
            if (b->pal_sz[0]) {
1208
4.61k
                pixel *dst = ((pixel *) f->cur.data[0]) +
1209
4.61k
                             4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
1210
4.61k
                const uint8_t *pal_idx;
1211
4.61k
                if (t->frame_thread.pass) {
1212
4.61k
                    const int p = t->frame_thread.pass & 1;
1213
4.61k
                    assert(ts->frame_thread[p].pal_idx);
1214
4.61k
                    pal_idx = ts->frame_thread[p].pal_idx;
1215
4.61k
                    ts->frame_thread[p].pal_idx += bw4 * bh4 * 8;
1216
4.61k
                } else {
1217
0
                    pal_idx = t->scratch.pal_idx_y;
1218
0
                }
1219
4.61k
                const pixel *const pal = t->frame_thread.pass ?
1220
4.61k
                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
1221
4.61k
                                        ((t->bx >> 1) + (t->by & 1))][0] :
1222
4.61k
                    bytefn(t->scratch.pal)[0];
1223
4.61k
                f->dsp->ipred.pal_pred(dst, f->cur.stride[0], pal,
1224
4.61k
                                       pal_idx, bw4 * 4, bh4 * 4);
1225
4.61k
                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1226
0
                    hex_dump(dst, PXSTRIDE(f->cur.stride[0]),
1227
0
                             bw4 * 4, bh4 * 4, "y-pal-pred");
1228
4.61k
            }
1229
1230
891k
            const int intra_flags = (sm_flag(t->a, bx4) |
1231
891k
                                     sm_flag(&t->l, by4) |
1232
891k
                                     intra_edge_filter_flag);
1233
891k
            const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 :
1234
833k
                              intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT;
1235
891k
            const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 :
1236
833k
                              intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM;
1237
891k
            int y, x;
1238
891k
            const int sub_w4 = imin(w4, init_x + 16);
1239
2.16M
            for (y = init_y, t->by += init_y; y < sub_h4;
1240
1.27M
                 y += t_dim->h, t->by += t_dim->h)
1241
1.27M
            {
1242
1.27M
                pixel *dst = ((pixel *) f->cur.data[0]) +
1243
1.27M
                               4 * (t->by * PXSTRIDE(f->cur.stride[0]) +
1244
1.27M
                                    t->bx + init_x);
1245
5.81M
                for (x = init_x, t->bx += init_x; x < sub_w4;
1246
4.54M
                     x += t_dim->w, t->bx += t_dim->w)
1247
4.54M
                {
1248
4.54M
                    if (b->pal_sz[0]) goto skip_y_pred;
1249
1250
4.53M
                    int angle = b->y_angle;
1251
4.53M
                    const enum EdgeFlags edge_flags =
1252
4.53M
                        (((y > init_y || !sb_has_tr) && (x + t_dim->w >= sub_w4)) ?
1253
3.91M
                             0 : EDGE_I444_TOP_HAS_RIGHT) |
1254
4.53M
                        ((x > init_x || (!sb_has_bl && y + t_dim->h >= sub_h4)) ?
1255
3.76M
                             0 : EDGE_I444_LEFT_HAS_BOTTOM);
1256
4.53M
                    const pixel *top_sb_edge = NULL;
1257
4.53M
                    if (!(t->by & (f->sb_step - 1))) {
1258
506k
                        top_sb_edge = f->ipred_edge[0];
1259
506k
                        const int sby = t->by >> f->sb_shift;
1260
506k
                        top_sb_edge += f->sb128w * 128 * (sby - 1);
1261
506k
                    }
1262
4.53M
                    const enum IntraPredMode m =
1263
4.53M
                        bytefn(dav1d_prepare_intra_edges)(t->bx,
1264
4.53M
                                                          t->bx > ts->tiling.col_start,
1265
4.53M
                                                          t->by,
1266
4.53M
                                                          t->by > ts->tiling.row_start,
1267
4.53M
                                                          ts->tiling.col_end,
1268
4.53M
                                                          ts->tiling.row_end,
1269
4.53M
                                                          edge_flags, dst,
1270
4.53M
                                                          f->cur.stride[0], top_sb_edge,
1271
4.53M
                                                          b->y_mode, &angle,
1272
4.53M
                                                          t_dim->w, t_dim->h,
1273
4.53M
                                                          f->seq_hdr->intra_edge_filter,
1274
4.53M
                                                          edge HIGHBD_CALL_SUFFIX);
1275
4.53M
                    dsp->ipred.intra_pred[m](dst, f->cur.stride[0], edge,
1276
4.53M
                                             t_dim->w * 4, t_dim->h * 4,
1277
4.53M
                                             angle | intra_flags,
1278
4.53M
                                             4 * f->bw - 4 * t->bx,
1279
4.53M
                                             4 * f->bh - 4 * t->by
1280
4.53M
                                             HIGHBD_CALL_SUFFIX);
1281
1282
4.53M
                    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
1283
0
                        hex_dump(edge - t_dim->h * 4, t_dim->h * 4,
1284
0
                                 t_dim->h * 4, 2, "l");
1285
0
                        hex_dump(edge, 0, 1, 1, "tl");
1286
0
                        hex_dump(edge + 1, t_dim->w * 4,
1287
0
                                 t_dim->w * 4, 2, "t");
1288
0
                        hex_dump(dst, f->cur.stride[0],
1289
0
                                 t_dim->w * 4, t_dim->h * 4, "y-intra-pred");
1290
0
                    }
1291
1292
4.54M
                skip_y_pred: {}
1293
4.54M
                    if (!b->skip) {
1294
749k
                        coef *cf;
1295
749k
                        int eob;
1296
749k
                        enum TxfmType txtp;
1297
749k
                        if (t->frame_thread.pass) {
1298
749k
                            const int p = t->frame_thread.pass & 1;
1299
749k
                            const int cbi = *ts->frame_thread[p].cbi++;
1300
749k
                            cf = ts->frame_thread[p].cf;
1301
749k
                            ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
1302
749k
                            eob  = cbi >> 5;
1303
749k
                            txtp = cbi & 0x1f;
1304
749k
                        } else {
1305
0
                            uint8_t cf_ctx;
1306
0
                            cf = bitfn(t->cf);
1307
0
                            eob = decode_coefs(t, &t->a->lcoef[bx4 + x],
1308
0
                                               &t->l.lcoef[by4 + y], b->tx, bs,
1309
0
                                               b, 1, 0, cf, &txtp, &cf_ctx);
1310
0
                            if (DEBUG_BLOCK_INFO)
1311
0
                                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
1312
0
                                       b->tx, txtp, eob, ts->msac.rng);
1313
0
                            dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx));
1314
0
                            dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by));
1315
0
                        }
1316
749k
                        if (eob >= 0) {
1317
590k
                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1318
0
                                coef_dump(cf, imin(t_dim->h, 8) * 4,
1319
0
                                          imin(t_dim->w, 8) * 4, 3, "dq");
1320
590k
                            dsp->itx.itxfm_add[b->tx]
1321
590k
                                              [txtp](dst,
1322
590k
                                                     f->cur.stride[0],
1323
590k
                                                     cf, eob HIGHBD_CALL_SUFFIX);
1324
590k
                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1325
0
                                hex_dump(dst, f->cur.stride[0],
1326
0
                                         t_dim->w * 4, t_dim->h * 4, "recon");
1327
590k
                        }
1328
3.79M
                    } else if (!t->frame_thread.pass) {
1329
0
                        dav1d_memset_pow2[t_dim->lw](&t->a->lcoef[bx4 + x], 0x40);
1330
0
                        dav1d_memset_pow2[t_dim->lh](&t->l.lcoef[by4 + y], 0x40);
1331
0
                    }
1332
4.54M
                    dst += 4 * t_dim->w;
1333
4.54M
                }
1334
1.27M
                t->bx -= x;
1335
1.27M
            }
1336
892k
            t->by -= y;
1337
1338
892k
            if (!has_chroma) continue;
1339
1340
734k
            const ptrdiff_t stride = f->cur.stride[1];
1341
1342
734k
            if (b->uv_mode == CFL_PRED) {
1343
146k
                assert(!init_x && !init_y);
1344
1345
146k
                int16_t *const ac = t->scratch.ac;
1346
146k
                pixel *y_src = ((pixel *) f->cur.data[0]) + 4 * (t->bx & ~ss_hor) +
1347
146k
                                 4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.stride[0]);
1348
146k
                const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) +
1349
146k
                                              (t->by >> ss_ver) * PXSTRIDE(stride));
1350
146k
                pixel *const uv_dst[2] = { ((pixel *) f->cur.data[1]) + uv_off,
1351
146k
                                           ((pixel *) f->cur.data[2]) + uv_off };
1352
1353
146k
                const int furthest_r =
1354
146k
                    ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1);
1355
146k
                const int furthest_b =
1356
146k
                    ((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1);
1357
146k
                dsp->ipred.cfl_ac[f->cur.p.layout - 1](ac, y_src, f->cur.stride[0],
1358
146k
                                                         cbw4 - (furthest_r >> ss_hor),
1359
146k
                                                         cbh4 - (furthest_b >> ss_ver),
1360
146k
                                                         cbw4 * 4, cbh4 * 4);
1361
438k
                for (int pl = 0; pl < 2; pl++) {
1362
291k
                    if (!b->cfl_alpha[pl]) continue;
1363
244k
                    int angle = 0;
1364
244k
                    const pixel *top_sb_edge = NULL;
1365
244k
                    if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
1366
70.6k
                        top_sb_edge = f->ipred_edge[pl + 1];
1367
70.6k
                        const int sby = t->by >> f->sb_shift;
1368
70.6k
                        top_sb_edge += f->sb128w * 128 * (sby - 1);
1369
70.6k
                    }
1370
244k
                    const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
1371
244k
                    const int xstart = ts->tiling.col_start >> ss_hor;
1372
244k
                    const int ystart = ts->tiling.row_start >> ss_ver;
1373
244k
                    const enum IntraPredMode m =
1374
244k
                        bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
1375
244k
                                                          ypos, ypos > ystart,
1376
244k
                                                          ts->tiling.col_end >> ss_hor,
1377
244k
                                                          ts->tiling.row_end >> ss_ver,
1378
244k
                                                          0, uv_dst[pl], stride,
1379
244k
                                                          top_sb_edge, DC_PRED, &angle,
1380
244k
                                                          uv_t_dim->w, uv_t_dim->h, 0,
1381
244k
                                                          edge HIGHBD_CALL_SUFFIX);
1382
244k
                    dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge,
1383
244k
                                           uv_t_dim->w * 4,
1384
244k
                                           uv_t_dim->h * 4,
1385
244k
                                           ac, b->cfl_alpha[pl]
1386
244k
                                           HIGHBD_CALL_SUFFIX);
1387
244k
                }
1388
146k
                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
1389
0
                    ac_dump(ac, 4*cbw4, 4*cbh4, "ac");
1390
0
                    hex_dump(uv_dst[0], stride, cbw4 * 4, cbh4 * 4, "u-cfl-pred");
1391
0
                    hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred");
1392
0
                }
1393
588k
            } else if (b->pal_sz[1]) {
1394
2.45k
                const ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
1395
2.45k
                                              (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
1396
2.45k
                const pixel (*pal)[8];
1397
2.45k
                const uint8_t *pal_idx;
1398
2.45k
                if (t->frame_thread.pass) {
1399
2.45k
                    const int p = t->frame_thread.pass & 1;
1400
2.45k
                    assert(ts->frame_thread[p].pal_idx);
1401
2.45k
                    pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
1402
2.45k
                                              ((t->bx >> 1) + (t->by & 1))];
1403
2.45k
                    pal_idx = ts->frame_thread[p].pal_idx;
1404
2.45k
                    ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8;
1405
2.45k
                } else {
1406
0
                    pal = bytefn(t->scratch.pal);
1407
0
                    pal_idx = t->scratch.pal_idx_uv;
1408
0
                }
1409
1410
2.45k
                f->dsp->ipred.pal_pred(((pixel *) f->cur.data[1]) + uv_dstoff,
1411
2.45k
                                       f->cur.stride[1], pal[1],
1412
2.45k
                                       pal_idx, cbw4 * 4, cbh4 * 4);
1413
2.45k
                f->dsp->ipred.pal_pred(((pixel *) f->cur.data[2]) + uv_dstoff,
1414
2.45k
                                       f->cur.stride[1], pal[2],
1415
2.45k
                                       pal_idx, cbw4 * 4, cbh4 * 4);
1416
2.45k
                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
1417
0
                    hex_dump(((pixel *) f->cur.data[1]) + uv_dstoff,
1418
0
                             PXSTRIDE(f->cur.stride[1]),
1419
0
                             cbw4 * 4, cbh4 * 4, "u-pal-pred");
1420
0
                    hex_dump(((pixel *) f->cur.data[2]) + uv_dstoff,
1421
0
                             PXSTRIDE(f->cur.stride[1]),
1422
0
                             cbw4 * 4, cbh4 * 4, "v-pal-pred");
1423
0
                }
1424
2.45k
            }
1425
1426
734k
            const int sm_uv_fl = sm_uv_flag(t->a, cbx4) |
1427
734k
                                 sm_uv_flag(&t->l, cby4);
1428
734k
            const int uv_sb_has_tr =
1429
734k
                ((init_x + 16) >> ss_hor) < cw4 ? 1 : init_y ? 0 :
1430
686k
                intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.layout - 1));
1431
734k
            const int uv_sb_has_bl =
1432
734k
                init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 1 :
1433
686k
                intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.layout - 1));
1434
734k
            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
1435
2.20M
            for (int pl = 0; pl < 2; pl++) {
1436
3.18M
                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
1437
1.71M
                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
1438
1.71M
                {
1439
1.71M
                    pixel *dst = ((pixel *) f->cur.data[1 + pl]) +
1440
1.71M
                                   4 * ((t->by >> ss_ver) * PXSTRIDE(stride) +
1441
1.71M
                                        ((t->bx + init_x) >> ss_hor));
1442
4.08M
                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
1443
2.36M
                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
1444
2.36M
                    {
1445
2.36M
                        if ((b->uv_mode == CFL_PRED && b->cfl_alpha[pl]) ||
1446
2.12M
                            b->pal_sz[1])
1447
250k
                        {
1448
250k
                            goto skip_uv_pred;
1449
250k
                        }
1450
1451
2.11M
                        int angle = b->uv_angle;
1452
                        // this probably looks weird because we're using
1453
                        // luma flags in a chroma loop, but that's because
1454
                        // prepare_intra_edges() expects luma flags as input
1455
2.11M
                        const enum EdgeFlags edge_flags =
1456
2.11M
                            (((y > (init_y >> ss_ver) || !uv_sb_has_tr) &&
1457
1.08M
                              (x + uv_t_dim->w >= sub_cw4)) ?
1458
1.55M
                                 0 : EDGE_I444_TOP_HAS_RIGHT) |
1459
2.11M
                            ((x > (init_x >> ss_hor) ||
1460
1.46M
                              (!uv_sb_has_bl && y + uv_t_dim->h >= sub_ch4)) ?
1461
1.34M
                                 0 : EDGE_I444_LEFT_HAS_BOTTOM);
1462
2.11M
                        const pixel *top_sb_edge = NULL;
1463
2.11M
                        if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
1464
611k
                            top_sb_edge = f->ipred_edge[1 + pl];
1465
611k
                            const int sby = t->by >> f->sb_shift;
1466
611k
                            top_sb_edge += f->sb128w * 128 * (sby - 1);
1467
611k
                        }
1468
2.11M
                        const enum IntraPredMode uv_mode =
1469
2.11M
                             b->uv_mode == CFL_PRED ? DC_PRED : b->uv_mode;
1470
2.11M
                        const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
1471
2.11M
                        const int xstart = ts->tiling.col_start >> ss_hor;
1472
2.11M
                        const int ystart = ts->tiling.row_start >> ss_ver;
1473
2.11M
                        const enum IntraPredMode m =
1474
2.11M
                            bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
1475
2.11M
                                                              ypos, ypos > ystart,
1476
2.11M
                                                              ts->tiling.col_end >> ss_hor,
1477
2.11M
                                                              ts->tiling.row_end >> ss_ver,
1478
2.11M
                                                              edge_flags, dst, stride,
1479
2.11M
                                                              top_sb_edge, uv_mode,
1480
2.11M
                                                              &angle, uv_t_dim->w,
1481
2.11M
                                                              uv_t_dim->h,
1482
2.11M
                                                              f->seq_hdr->intra_edge_filter,
1483
2.11M
                                                              edge HIGHBD_CALL_SUFFIX);
1484
2.11M
                        angle |= intra_edge_filter_flag;
1485
2.11M
                        dsp->ipred.intra_pred[m](dst, stride, edge,
1486
2.11M
                                                 uv_t_dim->w * 4,
1487
2.11M
                                                 uv_t_dim->h * 4,
1488
2.11M
                                                 angle | sm_uv_fl,
1489
2.11M
                                                 (4 * f->bw + ss_hor -
1490
2.11M
                                                  4 * (t->bx & ~ss_hor)) >> ss_hor,
1491
2.11M
                                                 (4 * f->bh + ss_ver -
1492
2.11M
                                                  4 * (t->by & ~ss_ver)) >> ss_ver
1493
2.11M
                                                 HIGHBD_CALL_SUFFIX);
1494
2.11M
                        if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
1495
0
                            hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4,
1496
0
                                     uv_t_dim->h * 4, 2, "l");
1497
0
                            hex_dump(edge, 0, 1, 1, "tl");
1498
0
                            hex_dump(edge + 1, uv_t_dim->w * 4,
1499
0
                                     uv_t_dim->w * 4, 2, "t");
1500
0
                            hex_dump(dst, stride, uv_t_dim->w * 4,
1501
0
                                     uv_t_dim->h * 4, pl ? "v-intra-pred" : "u-intra-pred");
1502
0
                        }
1503
1504
2.36M
                    skip_uv_pred: {}
1505
2.36M
                        if (!b->skip) {
1506
679k
                            enum TxfmType txtp;
1507
679k
                            int eob;
1508
679k
                            coef *cf;
1509
679k
                            if (t->frame_thread.pass) {
1510
679k
                                const int p = t->frame_thread.pass & 1;
1511
679k
                                const int cbi = *ts->frame_thread[p].cbi++;
1512
679k
                                cf = ts->frame_thread[p].cf;
1513
679k
                                ts->frame_thread[p].cf += uv_t_dim->w * uv_t_dim->h * 16;
1514
679k
                                eob  = cbi >> 5;
1515
679k
                                txtp = cbi & 0x1f;
1516
679k
                            } else {
1517
1
                                uint8_t cf_ctx;
1518
1
                                cf = bitfn(t->cf);
1519
1
                                eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
1520
1
                                                   &t->l.ccoef[pl][cby4 + y],
1521
1
                                                   b->uvtx, bs, b, 1, 1 + pl, cf,
1522
1
                                                   &txtp, &cf_ctx);
1523
1
                                if (DEBUG_BLOCK_INFO)
1524
0
                                    printf("Post-uv-cf-blk[pl=%d,tx=%d,"
1525
0
                                           "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n",
1526
0
                                           pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);
1527
1
                                int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor);
1528
1
                                int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver);
1529
1
                                dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
1530
1
                                dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
1531
1
                            }
1532
679k
                            if (eob >= 0) {
1533
289k
                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1534
0
                                    coef_dump(cf, uv_t_dim->h * 4,
1535
0
                                              uv_t_dim->w * 4, 3, "dq");
1536
289k
                                dsp->itx.itxfm_add[b->uvtx]
1537
289k
                                                  [txtp](dst, stride,
1538
289k
                                                         cf, eob HIGHBD_CALL_SUFFIX);
1539
289k
                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1540
0
                                    hex_dump(dst, stride, uv_t_dim->w * 4,
1541
0
                                             uv_t_dim->h * 4, "recon");
1542
289k
                            }
1543
1.68M
                        } else if (!t->frame_thread.pass) {
1544
0
                            dav1d_memset_pow2[uv_t_dim->lw](&t->a->ccoef[pl][cbx4 + x], 0x40);
1545
0
                            dav1d_memset_pow2[uv_t_dim->lh](&t->l.ccoef[pl][cby4 + y], 0x40);
1546
0
                        }
1547
2.36M
                        dst += uv_t_dim->w * 4;
1548
2.36M
                    }
1549
1.71M
                    t->bx -= x << ss_hor;
1550
1.71M
                }
1551
1.46M
                t->by -= y << ss_ver;
1552
1.46M
            }
1553
734k
        }
1554
833k
    }
1555
794k
}
dav1d_recon_b_intra_16bpc
Line
Count
Source
1179
1.00M
{
1180
1.00M
    Dav1dTileState *const ts = t->ts;
1181
1.00M
    const Dav1dFrameContext *const f = t->f;
1182
1.00M
    const Dav1dDSPContext *const dsp = f->dsp;
1183
1.00M
    const int bx4 = t->bx & 31, by4 = t->by & 31;
1184
1.00M
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
1185
1.00M
    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
1186
1.00M
    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
1187
1.00M
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
1188
1.00M
    const int bw4 = b_dim[0], bh4 = b_dim[1];
1189
1.00M
    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
1190
1.00M
    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
1191
1.00M
    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
1192
879k
                           (bw4 > ss_hor || t->bx & 1) &&
1193
828k
                           (bh4 > ss_ver || t->by & 1);
1194
1.00M
    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx];
1195
1.00M
    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
1196
1197
    // coefficient coding
1198
1.00M
    pixel *const edge = bitfn(t->scratch.edge) + 128;
1199
1.00M
    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
1200
1201
1.00M
    const int intra_edge_filter_flag = f->seq_hdr->intra_edge_filter << 10;
1202
1203
2.06M
    for (int init_y = 0; init_y < h4; init_y += 16) {
1204
1.05M
        const int sub_h4 = imin(h4, 16 + init_y);
1205
1.05M
        const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
1206
2.20M
        for (int init_x = 0; init_x < w4; init_x += 16) {
1207
1.14M
            if (b->pal_sz[0]) {
1208
4.20k
                pixel *dst = ((pixel *) f->cur.data[0]) +
1209
4.20k
                             4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
1210
4.20k
                const uint8_t *pal_idx;
1211
4.20k
                if (t->frame_thread.pass) {
1212
4.20k
                    const int p = t->frame_thread.pass & 1;
1213
4.20k
                    assert(ts->frame_thread[p].pal_idx);
1214
4.20k
                    pal_idx = ts->frame_thread[p].pal_idx;
1215
4.20k
                    ts->frame_thread[p].pal_idx += bw4 * bh4 * 8;
1216
4.20k
                } else {
1217
0
                    pal_idx = t->scratch.pal_idx_y;
1218
0
                }
1219
4.20k
                const pixel *const pal = t->frame_thread.pass ?
1220
4.20k
                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
1221
4.20k
                                        ((t->bx >> 1) + (t->by & 1))][0] :
1222
4.20k
                    bytefn(t->scratch.pal)[0];
1223
4.20k
                f->dsp->ipred.pal_pred(dst, f->cur.stride[0], pal,
1224
4.20k
                                       pal_idx, bw4 * 4, bh4 * 4);
1225
4.20k
                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1226
0
                    hex_dump(dst, PXSTRIDE(f->cur.stride[0]),
1227
0
                             bw4 * 4, bh4 * 4, "y-pal-pred");
1228
4.20k
            }
1229
1230
1.14M
            const int intra_flags = (sm_flag(t->a, bx4) |
1231
1.14M
                                     sm_flag(&t->l, by4) |
1232
1.14M
                                     intra_edge_filter_flag);
1233
1.14M
            const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 :
1234
1.05M
                              intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT;
1235
1.14M
            const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 :
1236
1.05M
                              intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM;
1237
1.14M
            int y, x;
1238
1.14M
            const int sub_w4 = imin(w4, init_x + 16);
1239
2.88M
            for (y = init_y, t->by += init_y; y < sub_h4;
1240
1.73M
                 y += t_dim->h, t->by += t_dim->h)
1241
1.73M
            {
1242
1.73M
                pixel *dst = ((pixel *) f->cur.data[0]) +
1243
1.73M
                               4 * (t->by * PXSTRIDE(f->cur.stride[0]) +
1244
1.73M
                                    t->bx + init_x);
1245
8.87M
                for (x = init_x, t->bx += init_x; x < sub_w4;
1246
7.14M
                     x += t_dim->w, t->bx += t_dim->w)
1247
7.13M
                {
1248
7.13M
                    if (b->pal_sz[0]) goto skip_y_pred;
1249
1250
7.13M
                    int angle = b->y_angle;
1251
7.13M
                    const enum EdgeFlags edge_flags =
1252
7.13M
                        (((y > init_y || !sb_has_tr) && (x + t_dim->w >= sub_w4)) ?
1253
6.21M
                             0 : EDGE_I444_TOP_HAS_RIGHT) |
1254
7.13M
                        ((x > init_x || (!sb_has_bl && y + t_dim->h >= sub_h4)) ?
1255
6.04M
                             0 : EDGE_I444_LEFT_HAS_BOTTOM);
1256
7.13M
                    const pixel *top_sb_edge = NULL;
1257
7.13M
                    if (!(t->by & (f->sb_step - 1))) {
1258
670k
                        top_sb_edge = f->ipred_edge[0];
1259
670k
                        const int sby = t->by >> f->sb_shift;
1260
670k
                        top_sb_edge += f->sb128w * 128 * (sby - 1);
1261
670k
                    }
1262
7.13M
                    const enum IntraPredMode m =
1263
7.13M
                        bytefn(dav1d_prepare_intra_edges)(t->bx,
1264
7.13M
                                                          t->bx > ts->tiling.col_start,
1265
7.13M
                                                          t->by,
1266
7.13M
                                                          t->by > ts->tiling.row_start,
1267
7.13M
                                                          ts->tiling.col_end,
1268
7.13M
                                                          ts->tiling.row_end,
1269
7.13M
                                                          edge_flags, dst,
1270
7.13M
                                                          f->cur.stride[0], top_sb_edge,
1271
7.13M
                                                          b->y_mode, &angle,
1272
7.13M
                                                          t_dim->w, t_dim->h,
1273
7.13M
                                                          f->seq_hdr->intra_edge_filter,
1274
7.13M
                                                          edge HIGHBD_CALL_SUFFIX);
1275
7.13M
                    dsp->ipred.intra_pred[m](dst, f->cur.stride[0], edge,
1276
7.13M
                                             t_dim->w * 4, t_dim->h * 4,
1277
7.13M
                                             angle | intra_flags,
1278
7.13M
                                             4 * f->bw - 4 * t->bx,
1279
7.13M
                                             4 * f->bh - 4 * t->by
1280
7.13M
                                             HIGHBD_CALL_SUFFIX);
1281
1282
7.13M
                    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
1283
0
                        hex_dump(edge - t_dim->h * 4, t_dim->h * 4,
1284
0
                                 t_dim->h * 4, 2, "l");
1285
0
                        hex_dump(edge, 0, 1, 1, "tl");
1286
0
                        hex_dump(edge + 1, t_dim->w * 4,
1287
0
                                 t_dim->w * 4, 2, "t");
1288
0
                        hex_dump(dst, f->cur.stride[0],
1289
0
                                 t_dim->w * 4, t_dim->h * 4, "y-intra-pred");
1290
0
                    }
1291
1292
7.14M
                skip_y_pred: {}
1293
7.14M
                    if (!b->skip) {
1294
803k
                        coef *cf;
1295
803k
                        int eob;
1296
803k
                        enum TxfmType txtp;
1297
803k
                        if (t->frame_thread.pass) {
1298
803k
                            const int p = t->frame_thread.pass & 1;
1299
803k
                            const int cbi = *ts->frame_thread[p].cbi++;
1300
803k
                            cf = ts->frame_thread[p].cf;
1301
803k
                            ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
1302
803k
                            eob  = cbi >> 5;
1303
803k
                            txtp = cbi & 0x1f;
1304
18.4E
                        } else {
1305
18.4E
                            uint8_t cf_ctx;
1306
18.4E
                            cf = bitfn(t->cf);
1307
18.4E
                            eob = decode_coefs(t, &t->a->lcoef[bx4 + x],
1308
18.4E
                                               &t->l.lcoef[by4 + y], b->tx, bs,
1309
18.4E
                                               b, 1, 0, cf, &txtp, &cf_ctx);
1310
18.4E
                            if (DEBUG_BLOCK_INFO)
1311
0
                                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
1312
0
                                       b->tx, txtp, eob, ts->msac.rng);
1313
18.4E
                            dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx));
1314
18.4E
                            dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by));
1315
18.4E
                        }
1316
803k
                        if (eob >= 0) {
1317
553k
                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1318
0
                                coef_dump(cf, imin(t_dim->h, 8) * 4,
1319
0
                                          imin(t_dim->w, 8) * 4, 3, "dq");
1320
553k
                            dsp->itx.itxfm_add[b->tx]
1321
553k
                                              [txtp](dst,
1322
553k
                                                     f->cur.stride[0],
1323
553k
                                                     cf, eob HIGHBD_CALL_SUFFIX);
1324
553k
                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1325
0
                                hex_dump(dst, f->cur.stride[0],
1326
0
                                         t_dim->w * 4, t_dim->h * 4, "recon");
1327
553k
                        }
1328
6.33M
                    } else if (!t->frame_thread.pass) {
1329
0
                        dav1d_memset_pow2[t_dim->lw](&t->a->lcoef[bx4 + x], 0x40);
1330
0
                        dav1d_memset_pow2[t_dim->lh](&t->l.lcoef[by4 + y], 0x40);
1331
0
                    }
1332
7.14M
                    dst += 4 * t_dim->w;
1333
7.14M
                }
1334
1.73M
                t->bx -= x;
1335
1.73M
            }
1336
1.15M
            t->by -= y;
1337
1338
1.15M
            if (!has_chroma) continue;
1339
1340
855k
            const ptrdiff_t stride = f->cur.stride[1];
1341
1342
855k
            if (b->uv_mode == CFL_PRED) {
1343
169k
                assert(!init_x && !init_y);
1344
1345
169k
                int16_t *const ac = t->scratch.ac;
1346
169k
                pixel *y_src = ((pixel *) f->cur.data[0]) + 4 * (t->bx & ~ss_hor) +
1347
169k
                                 4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.stride[0]);
1348
169k
                const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) +
1349
169k
                                              (t->by >> ss_ver) * PXSTRIDE(stride));
1350
169k
                pixel *const uv_dst[2] = { ((pixel *) f->cur.data[1]) + uv_off,
1351
169k
                                           ((pixel *) f->cur.data[2]) + uv_off };
1352
1353
169k
                const int furthest_r =
1354
169k
                    ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1);
1355
169k
                const int furthest_b =
1356
169k
                    ((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1);
1357
169k
                dsp->ipred.cfl_ac[f->cur.p.layout - 1](ac, y_src, f->cur.stride[0],
1358
169k
                                                         cbw4 - (furthest_r >> ss_hor),
1359
169k
                                                         cbh4 - (furthest_b >> ss_ver),
1360
169k
                                                         cbw4 * 4, cbh4 * 4);
1361
508k
                for (int pl = 0; pl < 2; pl++) {
1362
339k
                    if (!b->cfl_alpha[pl]) continue;
1363
281k
                    int angle = 0;
1364
281k
                    const pixel *top_sb_edge = NULL;
1365
281k
                    if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
1366
69.0k
                        top_sb_edge = f->ipred_edge[pl + 1];
1367
69.0k
                        const int sby = t->by >> f->sb_shift;
1368
69.0k
                        top_sb_edge += f->sb128w * 128 * (sby - 1);
1369
69.0k
                    }
1370
281k
                    const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
1371
281k
                    const int xstart = ts->tiling.col_start >> ss_hor;
1372
281k
                    const int ystart = ts->tiling.row_start >> ss_ver;
1373
281k
                    const enum IntraPredMode m =
1374
281k
                        bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
1375
281k
                                                          ypos, ypos > ystart,
1376
281k
                                                          ts->tiling.col_end >> ss_hor,
1377
281k
                                                          ts->tiling.row_end >> ss_ver,
1378
281k
                                                          0, uv_dst[pl], stride,
1379
281k
                                                          top_sb_edge, DC_PRED, &angle,
1380
281k
                                                          uv_t_dim->w, uv_t_dim->h, 0,
1381
281k
                                                          edge HIGHBD_CALL_SUFFIX);
1382
281k
                    dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge,
1383
281k
                                           uv_t_dim->w * 4,
1384
281k
                                           uv_t_dim->h * 4,
1385
281k
                                           ac, b->cfl_alpha[pl]
1386
281k
                                           HIGHBD_CALL_SUFFIX);
1387
281k
                }
1388
169k
                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
1389
0
                    ac_dump(ac, 4*cbw4, 4*cbh4, "ac");
1390
0
                    hex_dump(uv_dst[0], stride, cbw4 * 4, cbh4 * 4, "u-cfl-pred");
1391
0
                    hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred");
1392
0
                }
1393
686k
            } else if (b->pal_sz[1]) {
1394
1.30k
                const ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
1395
1.30k
                                              (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
1396
1.30k
                const pixel (*pal)[8];
1397
1.30k
                const uint8_t *pal_idx;
1398
1.30k
                if (t->frame_thread.pass) {
1399
1.30k
                    const int p = t->frame_thread.pass & 1;
1400
1.30k
                    assert(ts->frame_thread[p].pal_idx);
1401
1.30k
                    pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
1402
1.30k
                                              ((t->bx >> 1) + (t->by & 1))];
1403
1.30k
                    pal_idx = ts->frame_thread[p].pal_idx;
1404
1.30k
                    ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8;
1405
1.30k
                } else {
1406
0
                    pal = bytefn(t->scratch.pal);
1407
0
                    pal_idx = t->scratch.pal_idx_uv;
1408
0
                }
1409
1410
1.30k
                f->dsp->ipred.pal_pred(((pixel *) f->cur.data[1]) + uv_dstoff,
1411
1.30k
                                       f->cur.stride[1], pal[1],
1412
1.30k
                                       pal_idx, cbw4 * 4, cbh4 * 4);
1413
1.30k
                f->dsp->ipred.pal_pred(((pixel *) f->cur.data[2]) + uv_dstoff,
1414
1.30k
                                       f->cur.stride[1], pal[2],
1415
1.30k
                                       pal_idx, cbw4 * 4, cbh4 * 4);
1416
1.30k
                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
1417
0
                    hex_dump(((pixel *) f->cur.data[1]) + uv_dstoff,
1418
0
                             PXSTRIDE(f->cur.stride[1]),
1419
0
                             cbw4 * 4, cbh4 * 4, "u-pal-pred");
1420
0
                    hex_dump(((pixel *) f->cur.data[2]) + uv_dstoff,
1421
0
                             PXSTRIDE(f->cur.stride[1]),
1422
0
                             cbw4 * 4, cbh4 * 4, "v-pal-pred");
1423
0
                }
1424
1.30k
            }
1425
1426
855k
            const int sm_uv_fl = sm_uv_flag(t->a, cbx4) |
1427
855k
                                 sm_uv_flag(&t->l, cby4);
1428
855k
            const int uv_sb_has_tr =
1429
855k
                ((init_x + 16) >> ss_hor) < cw4 ? 1 : init_y ? 0 :
1430
810k
                intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.layout - 1));
1431
855k
            const int uv_sb_has_bl =
1432
855k
                init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 1 :
1433
810k
                intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.layout - 1));
1434
855k
            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
1435
2.56M
            for (int pl = 0; pl < 2; pl++) {
1436
3.65M
                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
1437
1.94M
                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
1438
1.94M
                {
1439
1.94M
                    pixel *dst = ((pixel *) f->cur.data[1 + pl]) +
1440
1.94M
                                   4 * ((t->by >> ss_ver) * PXSTRIDE(stride) +
1441
1.94M
                                        ((t->bx + init_x) >> ss_hor));
1442
4.54M
                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
1443
2.60M
                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
1444
2.60M
                    {
1445
2.60M
                        if ((b->uv_mode == CFL_PRED && b->cfl_alpha[pl]) ||
1446
2.32M
                            b->pal_sz[1])
1447
284k
                        {
1448
284k
                            goto skip_uv_pred;
1449
284k
                        }
1450
1451
2.31M
                        int angle = b->uv_angle;
1452
                        // this probably looks weird because we're using
1453
                        // luma flags in a chroma loop, but that's because
1454
                        // prepare_intra_edges() expects luma flags as input
1455
2.31M
                        const enum EdgeFlags edge_flags =
1456
2.31M
                            (((y > (init_y >> ss_ver) || !uv_sb_has_tr) &&
1457
1.16M
                              (x + uv_t_dim->w >= sub_cw4)) ?
1458
1.66M
                                 0 : EDGE_I444_TOP_HAS_RIGHT) |
1459
2.31M
                            ((x > (init_x >> ss_hor) ||
1460
1.66M
                              (!uv_sb_has_bl && y + uv_t_dim->h >= sub_ch4)) ?
1461
1.47M
                                 0 : EDGE_I444_LEFT_HAS_BOTTOM);
1462
2.31M
                        const pixel *top_sb_edge = NULL;
1463
2.31M
                        if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
1464
656k
                            top_sb_edge = f->ipred_edge[1 + pl];
1465
656k
                            const int sby = t->by >> f->sb_shift;
1466
656k
                            top_sb_edge += f->sb128w * 128 * (sby - 1);
1467
656k
                        }
1468
2.31M
                        const enum IntraPredMode uv_mode =
1469
2.31M
                             b->uv_mode == CFL_PRED ? DC_PRED : b->uv_mode;
1470
2.31M
                        const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
1471
2.31M
                        const int xstart = ts->tiling.col_start >> ss_hor;
1472
2.31M
                        const int ystart = ts->tiling.row_start >> ss_ver;
1473
2.31M
                        const enum IntraPredMode m =
1474
2.31M
                            bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
1475
2.31M
                                                              ypos, ypos > ystart,
1476
2.31M
                                                              ts->tiling.col_end >> ss_hor,
1477
2.31M
                                                              ts->tiling.row_end >> ss_ver,
1478
2.31M
                                                              edge_flags, dst, stride,
1479
2.31M
                                                              top_sb_edge, uv_mode,
1480
2.31M
                                                              &angle, uv_t_dim->w,
1481
2.31M
                                                              uv_t_dim->h,
1482
2.31M
                                                              f->seq_hdr->intra_edge_filter,
1483
2.31M
                                                              edge HIGHBD_CALL_SUFFIX);
1484
2.31M
                        angle |= intra_edge_filter_flag;
1485
2.31M
                        dsp->ipred.intra_pred[m](dst, stride, edge,
1486
2.31M
                                                 uv_t_dim->w * 4,
1487
2.31M
                                                 uv_t_dim->h * 4,
1488
2.31M
                                                 angle | sm_uv_fl,
1489
2.31M
                                                 (4 * f->bw + ss_hor -
1490
2.31M
                                                  4 * (t->bx & ~ss_hor)) >> ss_hor,
1491
2.31M
                                                 (4 * f->bh + ss_ver -
1492
2.31M
                                                  4 * (t->by & ~ss_ver)) >> ss_ver
1493
2.31M
                                                 HIGHBD_CALL_SUFFIX);
1494
2.31M
                        if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
1495
0
                            hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4,
1496
0
                                     uv_t_dim->h * 4, 2, "l");
1497
0
                            hex_dump(edge, 0, 1, 1, "tl");
1498
0
                            hex_dump(edge + 1, uv_t_dim->w * 4,
1499
0
                                     uv_t_dim->w * 4, 2, "t");
1500
0
                            hex_dump(dst, stride, uv_t_dim->w * 4,
1501
0
                                     uv_t_dim->h * 4, pl ? "v-intra-pred" : "u-intra-pred");
1502
0
                        }
1503
1504
2.60M
                    skip_uv_pred: {}
1505
2.60M
                        if (!b->skip) {
1506
929k
                            enum TxfmType txtp;
1507
929k
                            int eob;
1508
929k
                            coef *cf;
1509
929k
                            if (t->frame_thread.pass) {
1510
929k
                                const int p = t->frame_thread.pass & 1;
1511
929k
                                const int cbi = *ts->frame_thread[p].cbi++;
1512
929k
                                cf = ts->frame_thread[p].cf;
1513
929k
                                ts->frame_thread[p].cf += uv_t_dim->w * uv_t_dim->h * 16;
1514
929k
                                eob  = cbi >> 5;
1515
929k
                                txtp = cbi & 0x1f;
1516
18.4E
                            } else {
1517
18.4E
                                uint8_t cf_ctx;
1518
18.4E
                                cf = bitfn(t->cf);
1519
18.4E
                                eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
1520
18.4E
                                                   &t->l.ccoef[pl][cby4 + y],
1521
18.4E
                                                   b->uvtx, bs, b, 1, 1 + pl, cf,
1522
18.4E
                                                   &txtp, &cf_ctx);
1523
18.4E
                                if (DEBUG_BLOCK_INFO)
1524
0
                                    printf("Post-uv-cf-blk[pl=%d,tx=%d,"
1525
0
                                           "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n",
1526
0
                                           pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);
1527
18.4E
                                int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor);
1528
18.4E
                                int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver);
1529
18.4E
                                dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
1530
18.4E
                                dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
1531
18.4E
                            }
1532
929k
                            if (eob >= 0) {
1533
295k
                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1534
0
                                    coef_dump(cf, uv_t_dim->h * 4,
1535
0
                                              uv_t_dim->w * 4, 3, "dq");
1536
295k
                                dsp->itx.itxfm_add[b->uvtx]
1537
295k
                                                  [txtp](dst, stride,
1538
295k
                                                         cf, eob HIGHBD_CALL_SUFFIX);
1539
295k
                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1540
0
                                    hex_dump(dst, stride, uv_t_dim->w * 4,
1541
0
                                             uv_t_dim->h * 4, "recon");
1542
295k
                            }
1543
1.67M
                        } else if (!t->frame_thread.pass) {
1544
0
                            dav1d_memset_pow2[uv_t_dim->lw](&t->a->ccoef[pl][cbx4 + x], 0x40);
1545
0
                            dav1d_memset_pow2[uv_t_dim->lh](&t->l.ccoef[pl][cby4 + y], 0x40);
1546
0
                        }
1547
2.60M
                        dst += uv_t_dim->w * 4;
1548
2.60M
                    }
1549
1.94M
                    t->bx -= x << ss_hor;
1550
1.94M
                }
1551
1.70M
                t->by -= y << ss_ver;
1552
1.70M
            }
1553
855k
        }
1554
1.05M
    }
1555
1.00M
}
1556
1557
int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize bs,
1558
                                const Av1Block *const b)
1559
356k
{
1560
356k
    Dav1dTileState *const ts = t->ts;
1561
356k
    const Dav1dFrameContext *const f = t->f;
1562
356k
    const Dav1dDSPContext *const dsp = f->dsp;
1563
356k
    const int bx4 = t->bx & 31, by4 = t->by & 31;
1564
356k
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
1565
356k
    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
1566
356k
    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
1567
356k
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
1568
356k
    const int bw4 = b_dim[0], bh4 = b_dim[1];
1569
356k
    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
1570
356k
    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
1571
274k
                           (bw4 > ss_hor || t->bx & 1) &&
1572
260k
                           (bh4 > ss_ver || t->by & 1);
1573
356k
    const int chr_layout_idx = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 :
1574
356k
                               DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.layout;
1575
356k
    int res;
1576
1577
    // prediction
1578
356k
    const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor;
1579
356k
    pixel *dst = ((pixel *) f->cur.data[0]) +
1580
356k
        4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
1581
356k
    const ptrdiff_t uvdstoff =
1582
356k
        4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
1583
356k
    if (IS_KEY_OR_INTRA(f->frame_hdr)) {
1584
        // intrabc
1585
135k
        assert(!f->frame_hdr->super_res.enabled);
1586
135k
        res = mc(t, dst, NULL, f->cur.stride[0], bw4, bh4, t->bx, t->by, 0,
1587
135k
                 b->mv[0], &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
1588
135k
        if (res) return res;
1589
249k
        if (has_chroma) for (int pl = 1; pl < 3; pl++) {
1590
166k
            res = mc(t, ((pixel *)f->cur.data[pl]) + uvdstoff, NULL, f->cur.stride[1],
1591
166k
                     bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
1592
166k
                     t->bx & ~ss_hor, t->by & ~ss_ver, pl, b->mv[0],
1593
166k
                     &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
1594
166k
            if (res) return res;
1595
166k
        }
1596
220k
    } else if (b->comp_type == COMP_INTER_NONE) {
1597
185k
        const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]];
1598
185k
        const enum Filter2d filter_2d = b->filter2d;
1599
1600
185k
        if (imin(bw4, bh4) > 1 &&
1601
111k
            ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
1602
106k
             (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
1603
8.03k
        {
1604
8.03k
            res = warp_affine(t, dst, NULL, f->cur.stride[0], b_dim, 0, refp,
1605
8.03k
                              b->motion_mode == MM_WARP ? &t->warpmv :
1606
8.03k
                                  &f->frame_hdr->gmv[b->ref[0]]);
1607
8.03k
            if (res) return res;
1608
177k
        } else {
1609
177k
            res = mc(t, dst, NULL, f->cur.stride[0],
1610
177k
                     bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, b->ref[0], filter_2d);
1611
177k
            if (res) return res;
1612
177k
            if (b->motion_mode == MM_OBMC) {
1613
30.1k
                res = obmc(t, dst, f->cur.stride[0], b_dim, 0, bx4, by4, w4, h4);
1614
30.1k
                if (res) return res;
1615
30.1k
            }
1616
177k
        }
1617
185k
        if (b->interintra_type) {
1618
5.88k
            pixel *const tl_edge = bitfn(t->scratch.edge) + 32;
1619
5.88k
            enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ?
1620
4.80k
                                   SMOOTH_PRED : b->interintra_mode;
1621
5.88k
            pixel *const tmp = bitfn(t->scratch.interintra);
1622
5.88k
            int angle = 0;
1623
5.88k
            const pixel *top_sb_edge = NULL;
1624
5.88k
            if (!(t->by & (f->sb_step - 1))) {
1625
2.81k
                top_sb_edge = f->ipred_edge[0];
1626
2.81k
                const int sby = t->by >> f->sb_shift;
1627
2.81k
                top_sb_edge += f->sb128w * 128 * (sby - 1);
1628
2.81k
            }
1629
5.88k
            m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start,
1630
5.88k
                                                  t->by, t->by > ts->tiling.row_start,
1631
5.88k
                                                  ts->tiling.col_end, ts->tiling.row_end,
1632
5.88k
                                                  0, dst, f->cur.stride[0], top_sb_edge,
1633
5.88k
                                                  m, &angle, bw4, bh4, 0, tl_edge
1634
5.88k
                                                  HIGHBD_CALL_SUFFIX);
1635
5.88k
            dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
1636
5.88k
                                     tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0
1637
5.88k
                                     HIGHBD_CALL_SUFFIX);
1638
5.88k
            dsp->mc.blend(dst, f->cur.stride[0], tmp,
1639
5.88k
                          bw4 * 4, bh4 * 4, II_MASK(0, bs, b));
1640
5.88k
        }
1641
1642
185k
        if (!has_chroma) goto skip_inter_chroma_pred;
1643
1644
        // sub8x8 derivation
1645
140k
        int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
1646
140k
        refmvs_block *const *r;
1647
140k
        if (is_sub8x8) {
1648
10.3k
            assert(ss_hor == 1);
1649
10.3k
            r = &t->rt.r[(t->by & 31) + 5];
1650
10.3k
            if (bw4 == 1) is_sub8x8 &= r[0][t->bx - 1].ref.ref[0] > 0;
1651
10.3k
            if (bh4 == ss_ver) is_sub8x8 &= r[-1][t->bx].ref.ref[0] > 0;
1652
10.3k
            if (bw4 == 1 && bh4 == ss_ver)
1653
2.55k
                is_sub8x8 &= r[-1][t->bx - 1].ref.ref[0] > 0;
1654
10.3k
        }
1655
1656
        // chroma prediction
1657
140k
        if (is_sub8x8) {
1658
10.2k
            assert(ss_hor == 1);
1659
10.2k
            ptrdiff_t h_off = 0, v_off = 0;
1660
10.2k
            if (bw4 == 1 && bh4 == ss_ver) {
1661
7.54k
                for (int pl = 0; pl < 2; pl++) {
1662
5.03k
                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
1663
5.03k
                             NULL, f->cur.stride[1],
1664
5.03k
                             bw4, bh4, t->bx - 1, t->by - 1, 1 + pl,
1665
5.03k
                             r[-1][t->bx - 1].mv.mv[0],
1666
5.03k
                             &f->refp[r[-1][t->bx - 1].ref.ref[0] - 1],
1667
5.03k
                             r[-1][t->bx - 1].ref.ref[0] - 1,
1668
5.03k
                             t->frame_thread.pass != 2 ? t->tl_4x4_filter :
1669
5.03k
                                 f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);
1670
5.03k
                    if (res) return res;
1671
5.03k
                }
1672
2.51k
                v_off = 2 * PXSTRIDE(f->cur.stride[1]);
1673
2.51k
                h_off = 2;
1674
2.51k
            }
1675
10.2k
            if (bw4 == 1) {
1676
6.40k
                const enum Filter2d left_filter_2d =
1677
6.40k
                    dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]];
1678
19.2k
                for (int pl = 0; pl < 2; pl++) {
1679
12.8k
                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + v_off, NULL,
1680
12.8k
                             f->cur.stride[1], bw4, bh4, t->bx - 1,
1681
12.8k
                             t->by, 1 + pl, r[0][t->bx - 1].mv.mv[0],
1682
12.8k
                             &f->refp[r[0][t->bx - 1].ref.ref[0] - 1],
1683
12.8k
                             r[0][t->bx - 1].ref.ref[0] - 1,
1684
12.8k
                             t->frame_thread.pass != 2 ? left_filter_2d :
1685
12.8k
                                 f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d);
1686
12.8k
                    if (res) return res;
1687
12.8k
                }
1688
6.40k
                h_off = 2;
1689
6.40k
            }
1690
10.2k
            if (bh4 == ss_ver) {
1691
6.34k
                const enum Filter2d top_filter_2d =
1692
6.34k
                    dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]];
1693
19.0k
                for (int pl = 0; pl < 2; pl++) {
1694
12.6k
                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off, NULL,
1695
12.6k
                             f->cur.stride[1], bw4, bh4, t->bx, t->by - 1,
1696
12.6k
                             1 + pl, r[-1][t->bx].mv.mv[0],
1697
12.6k
                             &f->refp[r[-1][t->bx].ref.ref[0] - 1],
1698
12.6k
                             r[-1][t->bx].ref.ref[0] - 1,
1699
12.6k
                             t->frame_thread.pass != 2 ? top_filter_2d :
1700
12.6k
                                 f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);
1701
12.6k
                    if (res) return res;
1702
12.6k
                }
1703
6.34k
                v_off = 2 * PXSTRIDE(f->cur.stride[1]);
1704
6.34k
            }
1705
30.6k
            for (int pl = 0; pl < 2; pl++) {
1706
20.4k
                res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.stride[1],
1707
20.4k
                         bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0],
1708
20.4k
                         refp, b->ref[0], filter_2d);
1709
20.4k
                if (res) return res;
1710
20.4k
            }
1711
130k
        } else {
1712
130k
            if (imin(cbw4, cbh4) > 1 &&
1713
69.8k
                ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
1714
65.8k
                 (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
1715
6.54k
            {
1716
19.6k
                for (int pl = 0; pl < 2; pl++) {
1717
13.0k
                    res = warp_affine(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, NULL,
1718
13.0k
                                      f->cur.stride[1], b_dim, 1 + pl, refp,
1719
13.0k
                                      b->motion_mode == MM_WARP ? &t->warpmv :
1720
13.0k
                                          &f->frame_hdr->gmv[b->ref[0]]);
1721
13.0k
                    if (res) return res;
1722
13.0k
                }
1723
123k
            } else {
1724
370k
                for (int pl = 0; pl < 2; pl++) {
1725
247k
                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
1726
247k
                             NULL, f->cur.stride[1],
1727
247k
                             bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
1728
247k
                             t->bx & ~ss_hor, t->by & ~ss_ver,
1729
247k
                             1 + pl, b->mv[0], refp, b->ref[0], filter_2d);
1730
247k
                    if (res) return res;
1731
247k
                    if (b->motion_mode == MM_OBMC) {
1732
50.6k
                        res = obmc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
1733
50.6k
                                   f->cur.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4);
1734
50.6k
                        if (res) return res;
1735
50.6k
                    }
1736
247k
                }
1737
123k
            }
1738
130k
            if (b->interintra_type) {
1739
                // FIXME for 8x32 with 4:2:2 subsampling, this probably does
1740
                // the wrong thing since it will select 4x16, not 4x32, as a
1741
                // transform size...
1742
4.20k
                const uint8_t *const ii_mask = II_MASK(chr_layout_idx, bs, b);
1743
1744
12.6k
                for (int pl = 0; pl < 2; pl++) {
1745
8.40k
                    pixel *const tmp = bitfn(t->scratch.interintra);
1746
8.40k
                    pixel *const tl_edge = bitfn(t->scratch.edge) + 32;
1747
8.40k
                    enum IntraPredMode m =
1748
8.40k
                        b->interintra_mode == II_SMOOTH_PRED ?
1749
6.91k
                        SMOOTH_PRED : b->interintra_mode;
1750
8.40k
                    int angle = 0;
1751
8.40k
                    pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
1752
8.40k
                    const pixel *top_sb_edge = NULL;
1753
8.40k
                    if (!(t->by & (f->sb_step - 1))) {
1754
4.09k
                        top_sb_edge = f->ipred_edge[pl + 1];
1755
4.09k
                        const int sby = t->by >> f->sb_shift;
1756
4.09k
                        top_sb_edge += f->sb128w * 128 * (sby - 1);
1757
4.09k
                    }
1758
8.40k
                    m = bytefn(dav1d_prepare_intra_edges)(t->bx >> ss_hor,
1759
8.40k
                                                          (t->bx >> ss_hor) >
1760
8.40k
                                                              (ts->tiling.col_start >> ss_hor),
1761
8.40k
                                                          t->by >> ss_ver,
1762
8.40k
                                                          (t->by >> ss_ver) >
1763
8.40k
                                                              (ts->tiling.row_start >> ss_ver),
1764
8.40k
                                                          ts->tiling.col_end >> ss_hor,
1765
8.40k
                                                          ts->tiling.row_end >> ss_ver,
1766
8.40k
                                                          0, uvdst, f->cur.stride[1],
1767
8.40k
                                                          top_sb_edge, m,
1768
8.40k
                                                          &angle, cbw4, cbh4, 0, tl_edge
1769
8.40k
                                                          HIGHBD_CALL_SUFFIX);
1770
8.40k
                    dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
1771
8.40k
                                             tl_edge, cbw4 * 4, cbh4 * 4, 0, 0, 0
1772
8.40k
                                             HIGHBD_CALL_SUFFIX);
1773
8.40k
                    dsp->mc.blend(uvdst, f->cur.stride[1], tmp,
1774
8.40k
                                  cbw4 * 4, cbh4 * 4, ii_mask);
1775
8.40k
                }
1776
4.20k
            }
1777
130k
        }
1778
1779
185k
    skip_inter_chroma_pred: {}
1780
185k
        t->tl_4x4_filter = filter_2d;
1781
185k
    } else {
1782
35.4k
        const enum Filter2d filter_2d = b->filter2d;
1783
        // Maximum super block size is 128x128
1784
35.4k
        int16_t (*tmp)[128 * 128] = t->scratch.compinter;
1785
35.4k
        int jnt_weight;
1786
35.4k
        uint8_t *const seg_mask = t->scratch.seg_mask;
1787
35.4k
        const uint8_t *mask;
1788
1789
106k
        for (int i = 0; i < 2; i++) {
1790
70.9k
            const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
1791
1792
70.9k
            if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) {
1793
3.70k
                res = warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp,
1794
3.70k
                                  &f->frame_hdr->gmv[b->ref[i]]);
1795
3.70k
                if (res) return res;
1796
67.2k
            } else {
1797
67.2k
                res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0,
1798
67.2k
                         b->mv[i], refp, b->ref[i], filter_2d);
1799
67.2k
                if (res) return res;
1800
67.2k
            }
1801
70.9k
        }
1802
35.4k
        switch (b->comp_type) {
1803
25.2k
        case COMP_INTER_AVG:
1804
25.2k
            dsp->mc.avg(dst, f->cur.stride[0], tmp[0], tmp[1],
1805
25.2k
                        bw4 * 4, bh4 * 4 HIGHBD_CALL_SUFFIX);
1806
25.2k
            break;
1807
3.49k
        case COMP_INTER_WEIGHTED_AVG:
1808
3.49k
            jnt_weight = f->jnt_weights[b->ref[0]][b->ref[1]];
1809
3.49k
            dsp->mc.w_avg(dst, f->cur.stride[0], tmp[0], tmp[1],
1810
3.49k
                          bw4 * 4, bh4 * 4, jnt_weight HIGHBD_CALL_SUFFIX);
1811
3.49k
            break;
1812
4.84k
        case COMP_INTER_SEG:
1813
4.84k
            dsp->mc.w_mask[chr_layout_idx](dst, f->cur.stride[0],
1814
4.84k
                                           tmp[b->mask_sign], tmp[!b->mask_sign],
1815
4.84k
                                           bw4 * 4, bh4 * 4, seg_mask,
1816
4.84k
                                           b->mask_sign HIGHBD_CALL_SUFFIX);
1817
4.84k
            mask = seg_mask;
1818
4.84k
            break;
1819
1.89k
        case COMP_INTER_WEDGE:
1820
1.89k
            mask = WEDGE_MASK(0, bs, 0, b->wedge_idx);
1821
1.89k
            dsp->mc.mask(dst, f->cur.stride[0],
1822
1.89k
                         tmp[b->mask_sign], tmp[!b->mask_sign],
1823
1.89k
                         bw4 * 4, bh4 * 4, mask HIGHBD_CALL_SUFFIX);
1824
1.89k
            if (has_chroma)
1825
1.34k
                mask = WEDGE_MASK(chr_layout_idx, bs, b->mask_sign, b->wedge_idx);
1826
1.89k
            break;
1827
35.4k
        }
1828
1829
        // chroma
1830
75.7k
        if (has_chroma) for (int pl = 0; pl < 2; pl++) {
1831
151k
            for (int i = 0; i < 2; i++) {
1832
101k
                const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
1833
101k
                if (b->inter_mode == GLOBALMV_GLOBALMV &&
1834
17.4k
                    imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]])
1835
5.43k
                {
1836
5.43k
                    res = warp_affine(t, NULL, tmp[i], bw4 * 4 >> ss_hor,
1837
5.43k
                                      b_dim, 1 + pl,
1838
5.43k
                                      refp, &f->frame_hdr->gmv[b->ref[i]]);
1839
5.43k
                    if (res) return res;
1840
95.6k
                } else {
1841
95.6k
                    res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by,
1842
95.6k
                             1 + pl, b->mv[i], refp, b->ref[i], filter_2d);
1843
95.6k
                    if (res) return res;
1844
95.6k
                }
1845
101k
            }
1846
50.5k
            pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
1847
50.5k
            switch (b->comp_type) {
1848
36.2k
            case COMP_INTER_AVG:
1849
36.2k
                dsp->mc.avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
1850
36.2k
                            bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver
1851
36.2k
                            HIGHBD_CALL_SUFFIX);
1852
36.2k
                break;
1853
4.94k
            case COMP_INTER_WEIGHTED_AVG:
1854
4.94k
                dsp->mc.w_avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
1855
4.94k
                              bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight
1856
4.94k
                              HIGHBD_CALL_SUFFIX);
1857
4.94k
                break;
1858
2.68k
            case COMP_INTER_WEDGE:
1859
9.37k
            case COMP_INTER_SEG:
1860
9.37k
                dsp->mc.mask(uvdst, f->cur.stride[1],
1861
9.37k
                             tmp[b->mask_sign], tmp[!b->mask_sign],
1862
9.37k
                             bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask
1863
9.37k
                             HIGHBD_CALL_SUFFIX);
1864
9.37k
                break;
1865
50.5k
            }
1866
50.5k
        }
1867
35.4k
    }
1868
1869
356k
    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
1870
0
        hex_dump(dst, f->cur.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred");
1871
0
        if (has_chroma) {
1872
0
            hex_dump(&((pixel *) f->cur.data[1])[uvdstoff], f->cur.stride[1],
1873
0
                     cbw4 * 4, cbh4 * 4, "u-pred");
1874
0
            hex_dump(&((pixel *) f->cur.data[2])[uvdstoff], f->cur.stride[1],
1875
0
                     cbw4 * 4, cbh4 * 4, "v-pred");
1876
0
        }
1877
0
    }
1878
1879
356k
    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
1880
1881
356k
    if (b->skip) {
1882
        // reset coef contexts
1883
181k
        BlockContext *const a = t->a;
1884
181k
        dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40);
1885
181k
        dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40);
1886
181k
        if (has_chroma) {
1887
113k
            dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)];
1888
113k
            dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)];
1889
113k
            memset_cw(&a->ccoef[0][cbx4], 0x40);
1890
113k
            memset_cw(&a->ccoef[1][cbx4], 0x40);
1891
113k
            memset_ch(&t->l.ccoef[0][cby4], 0x40);
1892
113k
            memset_ch(&t->l.ccoef[1][cby4], 0x40);
1893
113k
        }
1894
181k
        return 0;
1895
181k
    }
1896
1897
174k
    const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx];
1898
174k
    const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
1899
174k
    const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
1900
1901
358k
    for (int init_y = 0; init_y < bh4; init_y += 16) {
1902
370k
        for (int init_x = 0; init_x < bw4; init_x += 16) {
1903
            // coefficient coding & inverse transforms
1904
187k
            int y_off = !!init_y, y;
1905
187k
            dst += PXSTRIDE(f->cur.stride[0]) * 4 * init_y;
1906
397k
            for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16);
1907
210k
                 y += ytx->h, y_off++)
1908
210k
            {
1909
210k
                int x, x_off = !!init_x;
1910
497k
                for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16);
1911
287k
                     x += ytx->w, x_off++)
1912
287k
                {
1913
287k
                    read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
1914
287k
                                   x_off, y_off, &dst[x * 4]);
1915
287k
                    t->bx += ytx->w;
1916
287k
                }
1917
210k
                dst += PXSTRIDE(f->cur.stride[0]) * 4 * ytx->h;
1918
210k
                t->bx -= x;
1919
210k
                t->by += ytx->h;
1920
210k
            }
1921
187k
            dst -= PXSTRIDE(f->cur.stride[0]) * 4 * y;
1922
187k
            t->by -= y;
1923
1924
            // chroma coefs and inverse transform
1925
437k
            if (has_chroma) for (int pl = 0; pl < 2; pl++) {
1926
291k
                pixel *uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff +
1927
291k
                    (PXSTRIDE(f->cur.stride[1]) * init_y * 4 >> ss_ver);
1928
291k
                for (y = init_y >> ss_ver, t->by += init_y;
1929
632k
                     y < imin(ch4, (init_y + 16) >> ss_ver); y += uvtx->h)
1930
340k
                {
1931
340k
                    int x;
1932
340k
                    for (x = init_x >> ss_hor, t->bx += init_x;
1933
822k
                         x < imin(cw4, (init_x + 16) >> ss_hor); x += uvtx->w)
1934
481k
                    {
1935
481k
                        coef *cf;
1936
481k
                        int eob;
1937
481k
                        enum TxfmType txtp;
1938
481k
                        if (t->frame_thread.pass) {
1939
481k
                            const int p = t->frame_thread.pass & 1;
1940
481k
                            const int cbi = *ts->frame_thread[p].cbi++;
1941
481k
                            cf = ts->frame_thread[p].cf;
1942
481k
                            ts->frame_thread[p].cf += uvtx->w * uvtx->h * 16;
1943
481k
                            eob  = cbi >> 5;
1944
481k
                            txtp = cbi & 0x1f;
1945
18.4E
                        } else {
1946
18.4E
                            uint8_t cf_ctx;
1947
18.4E
                            cf = bitfn(t->cf);
1948
18.4E
                            txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 +
1949
18.4E
                                                        bx4 + (x << ss_hor)];
1950
18.4E
                            eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
1951
18.4E
                                               &t->l.ccoef[pl][cby4 + y],
1952
18.4E
                                               b->uvtx, bs, b, 0, 1 + pl,
1953
18.4E
                                               cf, &txtp, &cf_ctx);
1954
18.4E
                            if (DEBUG_BLOCK_INFO)
1955
0
                                printf("Post-uv-cf-blk[pl=%d,tx=%d,"
1956
0
                                       "txtp=%d,eob=%d]: r=%d\n",
1957
0
                                       pl, b->uvtx, txtp, eob, ts->msac.rng);
1958
18.4E
                            int ctw = imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor);
1959
18.4E
                            int cth = imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver);
1960
18.4E
                            dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
1961
18.4E
                            dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
1962
18.4E
                        }
1963
481k
                        if (eob >= 0) {
1964
167k
                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1965
0
                                coef_dump(cf, uvtx->h * 4, uvtx->w * 4, 3, "dq");
1966
167k
                            dsp->itx.itxfm_add[b->uvtx]
1967
167k
                                              [txtp](&uvdst[4 * x],
1968
167k
                                                     f->cur.stride[1],
1969
167k
                                                     cf, eob HIGHBD_CALL_SUFFIX);
1970
167k
                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1971
0
                                hex_dump(&uvdst[4 * x], f->cur.stride[1],
1972
0
                                         uvtx->w * 4, uvtx->h * 4, "recon");
1973
167k
                        }
1974
481k
                        t->bx += uvtx->w << ss_hor;
1975
481k
                    }
1976
340k
                    uvdst += PXSTRIDE(f->cur.stride[1]) * 4 * uvtx->h;
1977
340k
                    t->bx -= x << ss_hor;
1978
340k
                    t->by += uvtx->h << ss_ver;
1979
340k
                }
1980
291k
                t->by -= y << ss_ver;
1981
291k
            }
1982
187k
        }
1983
183k
    }
1984
174k
    return 0;
1985
356k
}
dav1d_recon_b_inter_8bpc
Line
Count
Source
1559
217k
{
1560
217k
    Dav1dTileState *const ts = t->ts;
1561
217k
    const Dav1dFrameContext *const f = t->f;
1562
217k
    const Dav1dDSPContext *const dsp = f->dsp;
1563
217k
    const int bx4 = t->bx & 31, by4 = t->by & 31;
1564
217k
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
1565
217k
    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
1566
217k
    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
1567
217k
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
1568
217k
    const int bw4 = b_dim[0], bh4 = b_dim[1];
1569
217k
    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
1570
217k
    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
1571
165k
                           (bw4 > ss_hor || t->bx & 1) &&
1572
156k
                           (bh4 > ss_ver || t->by & 1);
1573
217k
    const int chr_layout_idx = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 :
1574
217k
                               DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.layout;
1575
217k
    int res;
1576
1577
    // prediction
1578
217k
    const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor;
1579
217k
    pixel *dst = ((pixel *) f->cur.data[0]) +
1580
217k
        4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
1581
217k
    const ptrdiff_t uvdstoff =
1582
217k
        4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
1583
217k
    if (IS_KEY_OR_INTRA(f->frame_hdr)) {
1584
        // intrabc
1585
88.8k
        assert(!f->frame_hdr->super_res.enabled);
1586
88.8k
        res = mc(t, dst, NULL, f->cur.stride[0], bw4, bh4, t->bx, t->by, 0,
1587
88.8k
                 b->mv[0], &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
1588
88.8k
        if (res) return res;
1589
143k
        if (has_chroma) for (int pl = 1; pl < 3; pl++) {
1590
95.8k
            res = mc(t, ((pixel *)f->cur.data[pl]) + uvdstoff, NULL, f->cur.stride[1],
1591
95.8k
                     bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
1592
95.8k
                     t->bx & ~ss_hor, t->by & ~ss_ver, pl, b->mv[0],
1593
95.8k
                     &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
1594
95.8k
            if (res) return res;
1595
95.8k
        }
1596
128k
    } else if (b->comp_type == COMP_INTER_NONE) {
1597
111k
        const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]];
1598
111k
        const enum Filter2d filter_2d = b->filter2d;
1599
1600
111k
        if (imin(bw4, bh4) > 1 &&
1601
71.0k
            ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
1602
67.9k
             (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
1603
5.01k
        {
1604
5.01k
            res = warp_affine(t, dst, NULL, f->cur.stride[0], b_dim, 0, refp,
1605
5.01k
                              b->motion_mode == MM_WARP ? &t->warpmv :
1606
5.01k
                                  &f->frame_hdr->gmv[b->ref[0]]);
1607
5.01k
            if (res) return res;
1608
106k
        } else {
1609
106k
            res = mc(t, dst, NULL, f->cur.stride[0],
1610
106k
                     bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, b->ref[0], filter_2d);
1611
106k
            if (res) return res;
1612
106k
            if (b->motion_mode == MM_OBMC) {
1613
18.7k
                res = obmc(t, dst, f->cur.stride[0], b_dim, 0, bx4, by4, w4, h4);
1614
18.7k
                if (res) return res;
1615
18.7k
            }
1616
106k
        }
1617
111k
        if (b->interintra_type) {
1618
2.83k
            pixel *const tl_edge = bitfn(t->scratch.edge) + 32;
1619
2.83k
            enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ?
1620
2.36k
                                   SMOOTH_PRED : b->interintra_mode;
1621
2.83k
            pixel *const tmp = bitfn(t->scratch.interintra);
1622
2.83k
            int angle = 0;
1623
2.83k
            const pixel *top_sb_edge = NULL;
1624
2.83k
            if (!(t->by & (f->sb_step - 1))) {
1625
1.35k
                top_sb_edge = f->ipred_edge[0];
1626
1.35k
                const int sby = t->by >> f->sb_shift;
1627
1.35k
                top_sb_edge += f->sb128w * 128 * (sby - 1);
1628
1.35k
            }
1629
2.83k
            m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start,
1630
2.83k
                                                  t->by, t->by > ts->tiling.row_start,
1631
2.83k
                                                  ts->tiling.col_end, ts->tiling.row_end,
1632
2.83k
                                                  0, dst, f->cur.stride[0], top_sb_edge,
1633
2.83k
                                                  m, &angle, bw4, bh4, 0, tl_edge
1634
2.83k
                                                  HIGHBD_CALL_SUFFIX);
1635
2.83k
            dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
1636
2.83k
                                     tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0
1637
2.83k
                                     HIGHBD_CALL_SUFFIX);
1638
2.83k
            dsp->mc.blend(dst, f->cur.stride[0], tmp,
1639
2.83k
                          bw4 * 4, bh4 * 4, II_MASK(0, bs, b));
1640
2.83k
        }
1641
1642
111k
        if (!has_chroma) goto skip_inter_chroma_pred;
1643
1644
        // sub8x8 derivation
1645
89.8k
        int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
1646
89.8k
        refmvs_block *const *r;
1647
89.8k
        if (is_sub8x8) {
1648
6.33k
            assert(ss_hor == 1);
1649
6.33k
            r = &t->rt.r[(t->by & 31) + 5];
1650
6.33k
            if (bw4 == 1) is_sub8x8 &= r[0][t->bx - 1].ref.ref[0] > 0;
1651
6.33k
            if (bh4 == ss_ver) is_sub8x8 &= r[-1][t->bx].ref.ref[0] > 0;
1652
6.33k
            if (bw4 == 1 && bh4 == ss_ver)
1653
1.42k
                is_sub8x8 &= r[-1][t->bx - 1].ref.ref[0] > 0;
1654
6.33k
        }
1655
1656
        // chroma prediction
1657
89.8k
        if (is_sub8x8) {
1658
6.26k
            assert(ss_hor == 1);
1659
6.26k
            ptrdiff_t h_off = 0, v_off = 0;
1660
6.26k
            if (bw4 == 1 && bh4 == ss_ver) {
1661
4.20k
                for (int pl = 0; pl < 2; pl++) {
1662
2.80k
                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
1663
2.80k
                             NULL, f->cur.stride[1],
1664
2.80k
                             bw4, bh4, t->bx - 1, t->by - 1, 1 + pl,
1665
2.80k
                             r[-1][t->bx - 1].mv.mv[0],
1666
2.80k
                             &f->refp[r[-1][t->bx - 1].ref.ref[0] - 1],
1667
2.80k
                             r[-1][t->bx - 1].ref.ref[0] - 1,
1668
2.80k
                             t->frame_thread.pass != 2 ? t->tl_4x4_filter :
1669
2.80k
                                 f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);
1670
2.80k
                    if (res) return res;
1671
2.80k
                }
1672
1.40k
                v_off = 2 * PXSTRIDE(f->cur.stride[1]);
1673
1.40k
                h_off = 2;
1674
1.40k
            }
1675
6.26k
            if (bw4 == 1) {
1676
3.80k
                const enum Filter2d left_filter_2d =
1677
3.80k
                    dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]];
1678
11.4k
                for (int pl = 0; pl < 2; pl++) {
1679
7.60k
                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + v_off, NULL,
1680
7.60k
                             f->cur.stride[1], bw4, bh4, t->bx - 1,
1681
7.60k
                             t->by, 1 + pl, r[0][t->bx - 1].mv.mv[0],
1682
7.60k
                             &f->refp[r[0][t->bx - 1].ref.ref[0] - 1],
1683
7.60k
                             r[0][t->bx - 1].ref.ref[0] - 1,
1684
7.60k
                             t->frame_thread.pass != 2 ? left_filter_2d :
1685
7.60k
                                 f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d);
1686
7.60k
                    if (res) return res;
1687
7.60k
                }
1688
3.80k
                h_off = 2;
1689
3.80k
            }
1690
6.26k
            if (bh4 == ss_ver) {
1691
3.86k
                const enum Filter2d top_filter_2d =
1692
3.86k
                    dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]];
1693
11.5k
                for (int pl = 0; pl < 2; pl++) {
1694
7.72k
                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off, NULL,
1695
7.72k
                             f->cur.stride[1], bw4, bh4, t->bx, t->by - 1,
1696
7.72k
                             1 + pl, r[-1][t->bx].mv.mv[0],
1697
7.72k
                             &f->refp[r[-1][t->bx].ref.ref[0] - 1],
1698
7.72k
                             r[-1][t->bx].ref.ref[0] - 1,
1699
7.72k
                             t->frame_thread.pass != 2 ? top_filter_2d :
1700
7.72k
                                 f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);
1701
7.72k
                    if (res) return res;
1702
7.72k
                }
1703
3.86k
                v_off = 2 * PXSTRIDE(f->cur.stride[1]);
1704
3.86k
            }
1705
18.7k
            for (int pl = 0; pl < 2; pl++) {
1706
12.5k
                res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.stride[1],
1707
12.5k
                         bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0],
1708
12.5k
                         refp, b->ref[0], filter_2d);
1709
12.5k
                if (res) return res;
1710
12.5k
            }
1711
83.5k
        } else {
1712
83.5k
            if (imin(cbw4, cbh4) > 1 &&
1713
48.1k
                ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
1714
45.6k
                 (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
1715
4.28k
            {
1716
12.8k
                for (int pl = 0; pl < 2; pl++) {
1717
8.57k
                    res = warp_affine(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, NULL,
1718
8.57k
                                      f->cur.stride[1], b_dim, 1 + pl, refp,
1719
8.57k
                                      b->motion_mode == MM_WARP ? &t->warpmv :
1720
8.57k
                                          &f->frame_hdr->gmv[b->ref[0]]);
1721
8.57k
                    if (res) return res;
1722
8.57k
                }
1723
79.3k
            } else {
1724
237k
                for (int pl = 0; pl < 2; pl++) {
1725
158k
                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
1726
158k
                             NULL, f->cur.stride[1],
1727
158k
                             bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
1728
158k
                             t->bx & ~ss_hor, t->by & ~ss_ver,
1729
158k
                             1 + pl, b->mv[0], refp, b->ref[0], filter_2d);
1730
158k
                    if (res) return res;
1731
158k
                    if (b->motion_mode == MM_OBMC) {
1732
34.3k
                        res = obmc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
1733
34.3k
                                   f->cur.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4);
1734
34.3k
                        if (res) return res;
1735
34.3k
                    }
1736
158k
                }
1737
79.3k
            }
1738
83.5k
            if (b->interintra_type) {
1739
                // FIXME for 8x32 with 4:2:2 subsampling, this probably does
1740
                // the wrong thing since it will select 4x16, not 4x32, as a
1741
                // transform size...
1742
2.13k
                const uint8_t *const ii_mask = II_MASK(chr_layout_idx, bs, b);
1743
1744
6.39k
                for (int pl = 0; pl < 2; pl++) {
1745
4.26k
                    pixel *const tmp = bitfn(t->scratch.interintra);
1746
4.26k
                    pixel *const tl_edge = bitfn(t->scratch.edge) + 32;
1747
4.26k
                    enum IntraPredMode m =
1748
4.26k
                        b->interintra_mode == II_SMOOTH_PRED ?
1749
3.63k
                        SMOOTH_PRED : b->interintra_mode;
1750
4.26k
                    int angle = 0;
1751
4.26k
                    pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
1752
4.26k
                    const pixel *top_sb_edge = NULL;
1753
4.26k
                    if (!(t->by & (f->sb_step - 1))) {
1754
2.00k
                        top_sb_edge = f->ipred_edge[pl + 1];
1755
2.00k
                        const int sby = t->by >> f->sb_shift;
1756
2.00k
                        top_sb_edge += f->sb128w * 128 * (sby - 1);
1757
2.00k
                    }
1758
4.26k
                    m = bytefn(dav1d_prepare_intra_edges)(t->bx >> ss_hor,
1759
4.26k
                                                          (t->bx >> ss_hor) >
1760
4.26k
                                                              (ts->tiling.col_start >> ss_hor),
1761
4.26k
                                                          t->by >> ss_ver,
1762
4.26k
                                                          (t->by >> ss_ver) >
1763
4.26k
                                                              (ts->tiling.row_start >> ss_ver),
1764
4.26k
                                                          ts->tiling.col_end >> ss_hor,
1765
4.26k
                                                          ts->tiling.row_end >> ss_ver,
1766
4.26k
                                                          0, uvdst, f->cur.stride[1],
1767
4.26k
                                                          top_sb_edge, m,
1768
4.26k
                                                          &angle, cbw4, cbh4, 0, tl_edge
1769
4.26k
                                                          HIGHBD_CALL_SUFFIX);
1770
4.26k
                    dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
1771
4.26k
                                             tl_edge, cbw4 * 4, cbh4 * 4, 0, 0, 0
1772
4.26k
                                             HIGHBD_CALL_SUFFIX);
1773
4.26k
                    dsp->mc.blend(uvdst, f->cur.stride[1], tmp,
1774
4.26k
                                  cbw4 * 4, cbh4 * 4, ii_mask);
1775
4.26k
                }
1776
2.13k
            }
1777
83.5k
        }
1778
1779
111k
    skip_inter_chroma_pred: {}
1780
111k
        t->tl_4x4_filter = filter_2d;
1781
111k
    } else {
1782
17.3k
        const enum Filter2d filter_2d = b->filter2d;
1783
        // Maximum super block size is 128x128
1784
17.3k
        int16_t (*tmp)[128 * 128] = t->scratch.compinter;
1785
17.3k
        int jnt_weight;
1786
17.3k
        uint8_t *const seg_mask = t->scratch.seg_mask;
1787
17.3k
        const uint8_t *mask;
1788
1789
51.9k
        for (int i = 0; i < 2; i++) {
1790
34.6k
            const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
1791
1792
34.6k
            if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) {
1793
2.31k
                res = warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp,
1794
2.31k
                                  &f->frame_hdr->gmv[b->ref[i]]);
1795
2.31k
                if (res) return res;
1796
32.2k
            } else {
1797
32.2k
                res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0,
1798
32.2k
                         b->mv[i], refp, b->ref[i], filter_2d);
1799
32.2k
                if (res) return res;
1800
32.2k
            }
1801
34.6k
        }
1802
17.3k
        switch (b->comp_type) {
1803
12.7k
        case COMP_INTER_AVG:
1804
12.7k
            dsp->mc.avg(dst, f->cur.stride[0], tmp[0], tmp[1],
1805
12.7k
                        bw4 * 4, bh4 * 4 HIGHBD_CALL_SUFFIX);
1806
12.7k
            break;
1807
1.43k
        case COMP_INTER_WEIGHTED_AVG:
1808
1.43k
            jnt_weight = f->jnt_weights[b->ref[0]][b->ref[1]];
1809
1.43k
            dsp->mc.w_avg(dst, f->cur.stride[0], tmp[0], tmp[1],
1810
1.43k
                          bw4 * 4, bh4 * 4, jnt_weight HIGHBD_CALL_SUFFIX);
1811
1.43k
            break;
1812
2.24k
        case COMP_INTER_SEG:
1813
2.24k
            dsp->mc.w_mask[chr_layout_idx](dst, f->cur.stride[0],
1814
2.24k
                                           tmp[b->mask_sign], tmp[!b->mask_sign],
1815
2.24k
                                           bw4 * 4, bh4 * 4, seg_mask,
1816
2.24k
                                           b->mask_sign HIGHBD_CALL_SUFFIX);
1817
2.24k
            mask = seg_mask;
1818
2.24k
            break;
1819
865
        case COMP_INTER_WEDGE:
1820
865
            mask = WEDGE_MASK(0, bs, 0, b->wedge_idx);
1821
865
            dsp->mc.mask(dst, f->cur.stride[0],
1822
865
                         tmp[b->mask_sign], tmp[!b->mask_sign],
1823
865
                         bw4 * 4, bh4 * 4, mask HIGHBD_CALL_SUFFIX);
1824
865
            if (has_chroma)
1825
579
                mask = WEDGE_MASK(chr_layout_idx, bs, b->mask_sign, b->wedge_idx);
1826
865
            break;
1827
17.3k
        }
1828
1829
        // chroma
1830
36.1k
        if (has_chroma) for (int pl = 0; pl < 2; pl++) {
1831
72.3k
            for (int i = 0; i < 2; i++) {
1832
48.2k
                const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
1833
48.2k
                if (b->inter_mode == GLOBALMV_GLOBALMV &&
1834
8.96k
                    imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]])
1835
3.41k
                {
1836
3.41k
                    res = warp_affine(t, NULL, tmp[i], bw4 * 4 >> ss_hor,
1837
3.41k
                                      b_dim, 1 + pl,
1838
3.41k
                                      refp, &f->frame_hdr->gmv[b->ref[i]]);
1839
3.41k
                    if (res) return res;
1840
44.8k
                } else {
1841
44.8k
                    res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by,
1842
44.8k
                             1 + pl, b->mv[i], refp, b->ref[i], filter_2d);
1843
44.8k
                    if (res) return res;
1844
44.8k
                }
1845
48.2k
            }
1846
24.1k
            pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
1847
24.1k
            switch (b->comp_type) {
1848
17.7k
            case COMP_INTER_AVG:
1849
17.7k
                dsp->mc.avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
1850
17.7k
                            bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver
1851
17.7k
                            HIGHBD_CALL_SUFFIX);
1852
17.7k
                break;
1853
2.14k
            case COMP_INTER_WEIGHTED_AVG:
1854
2.14k
                dsp->mc.w_avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
1855
2.14k
                              bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight
1856
2.14k
                              HIGHBD_CALL_SUFFIX);
1857
2.14k
                break;
1858
1.15k
            case COMP_INTER_WEDGE:
1859
4.21k
            case COMP_INTER_SEG:
1860
4.21k
                dsp->mc.mask(uvdst, f->cur.stride[1],
1861
4.21k
                             tmp[b->mask_sign], tmp[!b->mask_sign],
1862
4.21k
                             bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask
1863
4.21k
                             HIGHBD_CALL_SUFFIX);
1864
4.21k
                break;
1865
24.1k
            }
1866
24.1k
        }
1867
17.3k
    }
1868
1869
217k
    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
1870
0
        hex_dump(dst, f->cur.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred");
1871
0
        if (has_chroma) {
1872
0
            hex_dump(&((pixel *) f->cur.data[1])[uvdstoff], f->cur.stride[1],
1873
0
                     cbw4 * 4, cbh4 * 4, "u-pred");
1874
0
            hex_dump(&((pixel *) f->cur.data[2])[uvdstoff], f->cur.stride[1],
1875
0
                     cbw4 * 4, cbh4 * 4, "v-pred");
1876
0
        }
1877
0
    }
1878
1879
217k
    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
1880
1881
217k
    if (b->skip) {
1882
        // reset coef contexts
1883
123k
        BlockContext *const a = t->a;
1884
123k
        dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40);
1885
123k
        dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40);
1886
123k
        if (has_chroma) {
1887
76.3k
            dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)];
1888
76.3k
            dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)];
1889
76.3k
            memset_cw(&a->ccoef[0][cbx4], 0x40);
1890
76.3k
            memset_cw(&a->ccoef[1][cbx4], 0x40);
1891
76.3k
            memset_ch(&t->l.ccoef[0][cby4], 0x40);
1892
76.3k
            memset_ch(&t->l.ccoef[1][cby4], 0x40);
1893
76.3k
        }
1894
123k
        return 0;
1895
123k
    }
1896
1897
94.0k
    const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx];
1898
94.0k
    const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
1899
94.0k
    const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
1900
1901
192k
    for (int init_y = 0; init_y < bh4; init_y += 16) {
1902
199k
        for (int init_x = 0; init_x < bw4; init_x += 16) {
1903
            // coefficient coding & inverse transforms
1904
100k
            int y_off = !!init_y, y;
1905
100k
            dst += PXSTRIDE(f->cur.stride[0]) * 4 * init_y;
1906
218k
            for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16);
1907
117k
                 y += ytx->h, y_off++)
1908
117k
            {
1909
117k
                int x, x_off = !!init_x;
1910
287k
                for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16);
1911
170k
                     x += ytx->w, x_off++)
1912
170k
                {
1913
170k
                    read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
1914
170k
                                   x_off, y_off, &dst[x * 4]);
1915
170k
                    t->bx += ytx->w;
1916
170k
                }
1917
117k
                dst += PXSTRIDE(f->cur.stride[0]) * 4 * ytx->h;
1918
117k
                t->bx -= x;
1919
117k
                t->by += ytx->h;
1920
117k
            }
1921
100k
            dst -= PXSTRIDE(f->cur.stride[0]) * 4 * y;
1922
100k
            t->by -= y;
1923
1924
            // chroma coefs and inverse transform
1925
238k
            if (has_chroma) for (int pl = 0; pl < 2; pl++) {
1926
158k
                pixel *uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff +
1927
158k
                    (PXSTRIDE(f->cur.stride[1]) * init_y * 4 >> ss_ver);
1928
158k
                for (y = init_y >> ss_ver, t->by += init_y;
1929
350k
                     y < imin(ch4, (init_y + 16) >> ss_ver); y += uvtx->h)
1930
191k
                {
1931
191k
                    int x;
1932
191k
                    for (x = init_x >> ss_hor, t->bx += init_x;
1933
478k
                         x < imin(cw4, (init_x + 16) >> ss_hor); x += uvtx->w)
1934
286k
                    {
1935
286k
                        coef *cf;
1936
286k
                        int eob;
1937
286k
                        enum TxfmType txtp;
1938
286k
                        if (t->frame_thread.pass) {
1939
286k
                            const int p = t->frame_thread.pass & 1;
1940
286k
                            const int cbi = *ts->frame_thread[p].cbi++;
1941
286k
                            cf = ts->frame_thread[p].cf;
1942
286k
                            ts->frame_thread[p].cf += uvtx->w * uvtx->h * 16;
1943
286k
                            eob  = cbi >> 5;
1944
286k
                            txtp = cbi & 0x1f;
1945
18.4E
                        } else {
1946
18.4E
                            uint8_t cf_ctx;
1947
18.4E
                            cf = bitfn(t->cf);
1948
18.4E
                            txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 +
1949
18.4E
                                                        bx4 + (x << ss_hor)];
1950
18.4E
                            eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
1951
18.4E
                                               &t->l.ccoef[pl][cby4 + y],
1952
18.4E
                                               b->uvtx, bs, b, 0, 1 + pl,
1953
18.4E
                                               cf, &txtp, &cf_ctx);
1954
18.4E
                            if (DEBUG_BLOCK_INFO)
1955
0
                                printf("Post-uv-cf-blk[pl=%d,tx=%d,"
1956
0
                                       "txtp=%d,eob=%d]: r=%d\n",
1957
0
                                       pl, b->uvtx, txtp, eob, ts->msac.rng);
1958
18.4E
                            int ctw = imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor);
1959
18.4E
                            int cth = imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver);
1960
18.4E
                            dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
1961
18.4E
                            dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
1962
18.4E
                        }
1963
286k
                        if (eob >= 0) {
1964
106k
                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1965
0
                                coef_dump(cf, uvtx->h * 4, uvtx->w * 4, 3, "dq");
1966
106k
                            dsp->itx.itxfm_add[b->uvtx]
1967
106k
                                              [txtp](&uvdst[4 * x],
1968
106k
                                                     f->cur.stride[1],
1969
106k
                                                     cf, eob HIGHBD_CALL_SUFFIX);
1970
106k
                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1971
0
                                hex_dump(&uvdst[4 * x], f->cur.stride[1],
1972
0
                                         uvtx->w * 4, uvtx->h * 4, "recon");
1973
106k
                        }
1974
286k
                        t->bx += uvtx->w << ss_hor;
1975
286k
                    }
1976
191k
                    uvdst += PXSTRIDE(f->cur.stride[1]) * 4 * uvtx->h;
1977
191k
                    t->bx -= x << ss_hor;
1978
191k
                    t->by += uvtx->h << ss_ver;
1979
191k
                }
1980
158k
                t->by -= y << ss_ver;
1981
158k
            }
1982
100k
        }
1983
98.5k
    }
1984
94.0k
    return 0;
1985
217k
}
dav1d_recon_b_inter_16bpc
Line
Count
Source
1559
138k
{
1560
138k
    Dav1dTileState *const ts = t->ts;
1561
138k
    const Dav1dFrameContext *const f = t->f;
1562
138k
    const Dav1dDSPContext *const dsp = f->dsp;
1563
138k
    const int bx4 = t->bx & 31, by4 = t->by & 31;
1564
138k
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
1565
138k
    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
1566
138k
    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
1567
138k
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
1568
138k
    const int bw4 = b_dim[0], bh4 = b_dim[1];
1569
138k
    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
1570
138k
    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
1571
108k
                           (bw4 > ss_hor || t->bx & 1) &&
1572
103k
                           (bh4 > ss_ver || t->by & 1);
1573
138k
    const int chr_layout_idx = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 :
1574
138k
                               DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.layout;
1575
138k
    int res;
1576
1577
    // prediction
1578
138k
    const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor;
1579
138k
    pixel *dst = ((pixel *) f->cur.data[0]) +
1580
138k
        4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
1581
138k
    const ptrdiff_t uvdstoff =
1582
138k
        4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
1583
138k
    if (IS_KEY_OR_INTRA(f->frame_hdr)) {
1584
        // intrabc
1585
46.7k
        assert(!f->frame_hdr->super_res.enabled);
1586
46.7k
        res = mc(t, dst, NULL, f->cur.stride[0], bw4, bh4, t->bx, t->by, 0,
1587
46.7k
                 b->mv[0], &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
1588
46.7k
        if (res) return res;
1589
105k
        if (has_chroma) for (int pl = 1; pl < 3; pl++) {
1590
70.4k
            res = mc(t, ((pixel *)f->cur.data[pl]) + uvdstoff, NULL, f->cur.stride[1],
1591
70.4k
                     bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
1592
70.4k
                     t->bx & ~ss_hor, t->by & ~ss_ver, pl, b->mv[0],
1593
70.4k
                     &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
1594
70.4k
            if (res) return res;
1595
70.4k
        }
1596
92.0k
    } else if (b->comp_type == COMP_INTER_NONE) {
1597
73.8k
        const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]];
1598
73.8k
        const enum Filter2d filter_2d = b->filter2d;
1599
1600
73.8k
        if (imin(bw4, bh4) > 1 &&
1601
40.9k
            ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
1602
39.0k
             (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
1603
3.02k
        {
1604
3.02k
            res = warp_affine(t, dst, NULL, f->cur.stride[0], b_dim, 0, refp,
1605
3.02k
                              b->motion_mode == MM_WARP ? &t->warpmv :
1606
3.02k
                                  &f->frame_hdr->gmv[b->ref[0]]);
1607
3.02k
            if (res) return res;
1608
70.8k
        } else {
1609
70.8k
            res = mc(t, dst, NULL, f->cur.stride[0],
1610
70.8k
                     bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, b->ref[0], filter_2d);
1611
70.8k
            if (res) return res;
1612
70.8k
            if (b->motion_mode == MM_OBMC) {
1613
11.4k
                res = obmc(t, dst, f->cur.stride[0], b_dim, 0, bx4, by4, w4, h4);
1614
11.4k
                if (res) return res;
1615
11.4k
            }
1616
70.8k
        }
1617
73.8k
        if (b->interintra_type) {
1618
3.04k
            pixel *const tl_edge = bitfn(t->scratch.edge) + 32;
1619
3.04k
            enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ?
1620
2.43k
                                   SMOOTH_PRED : b->interintra_mode;
1621
3.04k
            pixel *const tmp = bitfn(t->scratch.interintra);
1622
3.04k
            int angle = 0;
1623
3.04k
            const pixel *top_sb_edge = NULL;
1624
3.04k
            if (!(t->by & (f->sb_step - 1))) {
1625
1.45k
                top_sb_edge = f->ipred_edge[0];
1626
1.45k
                const int sby = t->by >> f->sb_shift;
1627
1.45k
                top_sb_edge += f->sb128w * 128 * (sby - 1);
1628
1.45k
            }
1629
3.04k
            m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start,
1630
3.04k
                                                  t->by, t->by > ts->tiling.row_start,
1631
3.04k
                                                  ts->tiling.col_end, ts->tiling.row_end,
1632
3.04k
                                                  0, dst, f->cur.stride[0], top_sb_edge,
1633
3.04k
                                                  m, &angle, bw4, bh4, 0, tl_edge
1634
3.04k
                                                  HIGHBD_CALL_SUFFIX);
1635
3.04k
            dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
1636
3.04k
                                     tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0
1637
3.04k
                                     HIGHBD_CALL_SUFFIX);
1638
3.04k
            dsp->mc.blend(dst, f->cur.stride[0], tmp,
1639
3.04k
                          bw4 * 4, bh4 * 4, II_MASK(0, bs, b));
1640
3.04k
        }
1641
1642
73.8k
        if (!has_chroma) goto skip_inter_chroma_pred;
1643
1644
        // sub8x8 derivation
1645
50.4k
        int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
1646
50.4k
        refmvs_block *const *r;
1647
50.4k
        if (is_sub8x8) {
1648
4.05k
            assert(ss_hor == 1);
1649
4.05k
            r = &t->rt.r[(t->by & 31) + 5];
1650
4.05k
            if (bw4 == 1) is_sub8x8 &= r[0][t->bx - 1].ref.ref[0] > 0;
1651
4.05k
            if (bh4 == ss_ver) is_sub8x8 &= r[-1][t->bx].ref.ref[0] > 0;
1652
4.05k
            if (bw4 == 1 && bh4 == ss_ver)
1653
1.13k
                is_sub8x8 &= r[-1][t->bx - 1].ref.ref[0] > 0;
1654
4.05k
        }
1655
1656
        // chroma prediction
1657
50.4k
        if (is_sub8x8) {
1658
3.96k
            assert(ss_hor == 1);
1659
3.96k
            ptrdiff_t h_off = 0, v_off = 0;
1660
3.96k
            if (bw4 == 1 && bh4 == ss_ver) {
1661
3.33k
                for (int pl = 0; pl < 2; pl++) {
1662
2.22k
                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
1663
2.22k
                             NULL, f->cur.stride[1],
1664
2.22k
                             bw4, bh4, t->bx - 1, t->by - 1, 1 + pl,
1665
2.22k
                             r[-1][t->bx - 1].mv.mv[0],
1666
2.22k
                             &f->refp[r[-1][t->bx - 1].ref.ref[0] - 1],
1667
2.22k
                             r[-1][t->bx - 1].ref.ref[0] - 1,
1668
2.22k
                             t->frame_thread.pass != 2 ? t->tl_4x4_filter :
1669
2.22k
                                 f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);
1670
2.22k
                    if (res) return res;
1671
2.22k
                }
1672
1.11k
                v_off = 2 * PXSTRIDE(f->cur.stride[1]);
1673
1.11k
                h_off = 2;
1674
1.11k
            }
1675
3.96k
            if (bw4 == 1) {
1676
2.59k
                const enum Filter2d left_filter_2d =
1677
2.59k
                    dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]];
1678
7.79k
                for (int pl = 0; pl < 2; pl++) {
1679
5.19k
                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + v_off, NULL,
1680
5.19k
                             f->cur.stride[1], bw4, bh4, t->bx - 1,
1681
5.19k
                             t->by, 1 + pl, r[0][t->bx - 1].mv.mv[0],
1682
5.19k
                             &f->refp[r[0][t->bx - 1].ref.ref[0] - 1],
1683
5.19k
                             r[0][t->bx - 1].ref.ref[0] - 1,
1684
5.19k
                             t->frame_thread.pass != 2 ? left_filter_2d :
1685
5.19k
                                 f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d);
1686
5.19k
                    if (res) return res;
1687
5.19k
                }
1688
2.59k
                h_off = 2;
1689
2.59k
            }
1690
3.96k
            if (bh4 == ss_ver) {
1691
2.47k
                const enum Filter2d top_filter_2d =
1692
2.47k
                    dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]];
1693
7.42k
                for (int pl = 0; pl < 2; pl++) {
1694
4.95k
                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off, NULL,
1695
4.95k
                             f->cur.stride[1], bw4, bh4, t->bx, t->by - 1,
1696
4.95k
                             1 + pl, r[-1][t->bx].mv.mv[0],
1697
4.95k
                             &f->refp[r[-1][t->bx].ref.ref[0] - 1],
1698
4.95k
                             r[-1][t->bx].ref.ref[0] - 1,
1699
4.95k
                             t->frame_thread.pass != 2 ? top_filter_2d :
1700
4.95k
                                 f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);
1701
4.95k
                    if (res) return res;
1702
4.95k
                }
1703
2.47k
                v_off = 2 * PXSTRIDE(f->cur.stride[1]);
1704
2.47k
            }
1705
11.8k
            for (int pl = 0; pl < 2; pl++) {
1706
7.92k
                res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.stride[1],
1707
7.92k
                         bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0],
1708
7.92k
                         refp, b->ref[0], filter_2d);
1709
7.92k
                if (res) return res;
1710
7.92k
            }
1711
46.5k
        } else {
1712
46.5k
            if (imin(cbw4, cbh4) > 1 &&
1713
21.7k
                ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
1714
20.2k
                 (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
1715
2.25k
            {
1716
6.77k
                for (int pl = 0; pl < 2; pl++) {
1717
4.51k
                    res = warp_affine(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, NULL,
1718
4.51k
                                      f->cur.stride[1], b_dim, 1 + pl, refp,
1719
4.51k
                                      b->motion_mode == MM_WARP ? &t->warpmv :
1720
4.51k
                                          &f->frame_hdr->gmv[b->ref[0]]);
1721
4.51k
                    if (res) return res;
1722
4.51k
                }
1723
44.2k
            } else {
1724
132k
                for (int pl = 0; pl < 2; pl++) {
1725
88.4k
                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
1726
88.4k
                             NULL, f->cur.stride[1],
1727
88.4k
                             bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
1728
88.4k
                             t->bx & ~ss_hor, t->by & ~ss_ver,
1729
88.4k
                             1 + pl, b->mv[0], refp, b->ref[0], filter_2d);
1730
88.4k
                    if (res) return res;
1731
88.4k
                    if (b->motion_mode == MM_OBMC) {
1732
16.3k
                        res = obmc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
1733
16.3k
                                   f->cur.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4);
1734
16.3k
                        if (res) return res;
1735
16.3k
                    }
1736
88.4k
                }
1737
44.2k
            }
1738
46.5k
            if (b->interintra_type) {
1739
                // FIXME for 8x32 with 4:2:2 subsampling, this probably does
1740
                // the wrong thing since it will select 4x16, not 4x32, as a
1741
                // transform size...
1742
2.07k
                const uint8_t *const ii_mask = II_MASK(chr_layout_idx, bs, b);
1743
1744
6.21k
                for (int pl = 0; pl < 2; pl++) {
1745
4.14k
                    pixel *const tmp = bitfn(t->scratch.interintra);
1746
4.14k
                    pixel *const tl_edge = bitfn(t->scratch.edge) + 32;
1747
4.14k
                    enum IntraPredMode m =
1748
4.14k
                        b->interintra_mode == II_SMOOTH_PRED ?
1749
3.27k
                        SMOOTH_PRED : b->interintra_mode;
1750
4.14k
                    int angle = 0;
1751
4.14k
                    pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
1752
4.14k
                    const pixel *top_sb_edge = NULL;
1753
4.14k
                    if (!(t->by & (f->sb_step - 1))) {
1754
2.09k
                        top_sb_edge = f->ipred_edge[pl + 1];
1755
2.09k
                        const int sby = t->by >> f->sb_shift;
1756
2.09k
                        top_sb_edge += f->sb128w * 128 * (sby - 1);
1757
2.09k
                    }
1758
4.14k
                    m = bytefn(dav1d_prepare_intra_edges)(t->bx >> ss_hor,
1759
4.14k
                                                          (t->bx >> ss_hor) >
1760
4.14k
                                                              (ts->tiling.col_start >> ss_hor),
1761
4.14k
                                                          t->by >> ss_ver,
1762
4.14k
                                                          (t->by >> ss_ver) >
1763
4.14k
                                                              (ts->tiling.row_start >> ss_ver),
1764
4.14k
                                                          ts->tiling.col_end >> ss_hor,
1765
4.14k
                                                          ts->tiling.row_end >> ss_ver,
1766
4.14k
                                                          0, uvdst, f->cur.stride[1],
1767
4.14k
                                                          top_sb_edge, m,
1768
4.14k
                                                          &angle, cbw4, cbh4, 0, tl_edge
1769
4.14k
                                                          HIGHBD_CALL_SUFFIX);
1770
4.14k
                    dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
1771
4.14k
                                             tl_edge, cbw4 * 4, cbh4 * 4, 0, 0, 0
1772
4.14k
                                             HIGHBD_CALL_SUFFIX);
1773
4.14k
                    dsp->mc.blend(uvdst, f->cur.stride[1], tmp,
1774
4.14k
                                  cbw4 * 4, cbh4 * 4, ii_mask);
1775
4.14k
                }
1776
2.07k
            }
1777
46.5k
        }
1778
1779
73.8k
    skip_inter_chroma_pred: {}
1780
73.8k
        t->tl_4x4_filter = filter_2d;
1781
73.8k
    } else {
1782
18.1k
        const enum Filter2d filter_2d = b->filter2d;
1783
        // Maximum super block size is 128x128
1784
18.1k
        int16_t (*tmp)[128 * 128] = t->scratch.compinter;
1785
18.1k
        int jnt_weight;
1786
18.1k
        uint8_t *const seg_mask = t->scratch.seg_mask;
1787
18.1k
        const uint8_t *mask;
1788
1789
54.4k
        for (int i = 0; i < 2; i++) {
1790
36.3k
            const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
1791
1792
36.3k
            if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) {
1793
1.39k
                res = warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp,
1794
1.39k
                                  &f->frame_hdr->gmv[b->ref[i]]);
1795
1.39k
                if (res) return res;
1796
34.9k
            } else {
1797
34.9k
                res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0,
1798
34.9k
                         b->mv[i], refp, b->ref[i], filter_2d);
1799
34.9k
                if (res) return res;
1800
34.9k
            }
1801
36.3k
        }
1802
18.1k
        switch (b->comp_type) {
1803
12.4k
        case COMP_INTER_AVG:
1804
12.4k
            dsp->mc.avg(dst, f->cur.stride[0], tmp[0], tmp[1],
1805
12.4k
                        bw4 * 4, bh4 * 4 HIGHBD_CALL_SUFFIX);
1806
12.4k
            break;
1807
2.05k
        case COMP_INTER_WEIGHTED_AVG:
1808
2.05k
            jnt_weight = f->jnt_weights[b->ref[0]][b->ref[1]];
1809
2.05k
            dsp->mc.w_avg(dst, f->cur.stride[0], tmp[0], tmp[1],
1810
2.05k
                          bw4 * 4, bh4 * 4, jnt_weight HIGHBD_CALL_SUFFIX);
1811
2.05k
            break;
1812
2.60k
        case COMP_INTER_SEG:
1813
2.60k
            dsp->mc.w_mask[chr_layout_idx](dst, f->cur.stride[0],
1814
2.60k
                                           tmp[b->mask_sign], tmp[!b->mask_sign],
1815
2.60k
                                           bw4 * 4, bh4 * 4, seg_mask,
1816
2.60k
                                           b->mask_sign HIGHBD_CALL_SUFFIX);
1817
2.60k
            mask = seg_mask;
1818
2.60k
            break;
1819
1.02k
        case COMP_INTER_WEDGE:
1820
1.02k
            mask = WEDGE_MASK(0, bs, 0, b->wedge_idx);
1821
1.02k
            dsp->mc.mask(dst, f->cur.stride[0],
1822
1.02k
                         tmp[b->mask_sign], tmp[!b->mask_sign],
1823
1.02k
                         bw4 * 4, bh4 * 4, mask HIGHBD_CALL_SUFFIX);
1824
1.02k
            if (has_chroma)
1825
763
                mask = WEDGE_MASK(chr_layout_idx, bs, b->mask_sign, b->wedge_idx);
1826
1.02k
            break;
1827
18.1k
        }
1828
1829
        // chroma
1830
39.6k
        if (has_chroma) for (int pl = 0; pl < 2; pl++) {
1831
79.2k
            for (int i = 0; i < 2; i++) {
1832
52.8k
                const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
1833
52.8k
                if (b->inter_mode == GLOBALMV_GLOBALMV &&
1834
8.46k
                    imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]])
1835
2.01k
                {
1836
2.01k
                    res = warp_affine(t, NULL, tmp[i], bw4 * 4 >> ss_hor,
1837
2.01k
                                      b_dim, 1 + pl,
1838
2.01k
                                      refp, &f->frame_hdr->gmv[b->ref[i]]);
1839
2.01k
                    if (res) return res;
1840
50.7k
                } else {
1841
50.7k
                    res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by,
1842
50.7k
                             1 + pl, b->mv[i], refp, b->ref[i], filter_2d);
1843
50.7k
                    if (res) return res;
1844
50.7k
                }
1845
52.8k
            }
1846
26.4k
            pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
1847
26.4k
            switch (b->comp_type) {
1848
18.4k
            case COMP_INTER_AVG:
1849
18.4k
                dsp->mc.avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
1850
18.4k
                            bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver
1851
18.4k
                            HIGHBD_CALL_SUFFIX);
1852
18.4k
                break;
1853
2.80k
            case COMP_INTER_WEIGHTED_AVG:
1854
2.80k
                dsp->mc.w_avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
1855
2.80k
                              bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight
1856
2.80k
                              HIGHBD_CALL_SUFFIX);
1857
2.80k
                break;
1858
1.52k
            case COMP_INTER_WEDGE:
1859
5.15k
            case COMP_INTER_SEG:
1860
5.15k
                dsp->mc.mask(uvdst, f->cur.stride[1],
1861
5.15k
                             tmp[b->mask_sign], tmp[!b->mask_sign],
1862
5.15k
                             bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask
1863
5.15k
                             HIGHBD_CALL_SUFFIX);
1864
5.15k
                break;
1865
26.4k
            }
1866
26.4k
        }
1867
18.1k
    }
1868
1869
138k
    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
1870
0
        hex_dump(dst, f->cur.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred");
1871
0
        if (has_chroma) {
1872
0
            hex_dump(&((pixel *) f->cur.data[1])[uvdstoff], f->cur.stride[1],
1873
0
                     cbw4 * 4, cbh4 * 4, "u-pred");
1874
0
            hex_dump(&((pixel *) f->cur.data[2])[uvdstoff], f->cur.stride[1],
1875
0
                     cbw4 * 4, cbh4 * 4, "v-pred");
1876
0
        }
1877
0
    }
1878
1879
138k
    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
1880
1881
138k
    if (b->skip) {
1882
        // reset coef contexts
1883
58.0k
        BlockContext *const a = t->a;
1884
58.0k
        dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40);
1885
58.0k
        dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40);
1886
58.0k
        if (has_chroma) {
1887
36.7k
            dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)];
1888
36.7k
            dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)];
1889
36.7k
            memset_cw(&a->ccoef[0][cbx4], 0x40);
1890
36.7k
            memset_cw(&a->ccoef[1][cbx4], 0x40);
1891
36.7k
            memset_ch(&t->l.ccoef[0][cby4], 0x40);
1892
36.7k
            memset_ch(&t->l.ccoef[1][cby4], 0x40);
1893
36.7k
        }
1894
58.0k
        return 0;
1895
58.0k
    }
1896
1897
80.6k
    const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx];
1898
80.6k
    const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
1899
80.6k
    const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
1900
1901
165k
    for (int init_y = 0; init_y < bh4; init_y += 16) {
1902
171k
        for (int init_x = 0; init_x < bw4; init_x += 16) {
1903
            // coefficient coding & inverse transforms
1904
86.4k
            int y_off = !!init_y, y;
1905
86.4k
            dst += PXSTRIDE(f->cur.stride[0]) * 4 * init_y;
1906
179k
            for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16);
1907
92.9k
                 y += ytx->h, y_off++)
1908
92.9k
            {
1909
92.9k
                int x, x_off = !!init_x;
1910
209k
                for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16);
1911
116k
                     x += ytx->w, x_off++)
1912
116k
                {
1913
116k
                    read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
1914
116k
                                   x_off, y_off, &dst[x * 4]);
1915
116k
                    t->bx += ytx->w;
1916
116k
                }
1917
92.9k
                dst += PXSTRIDE(f->cur.stride[0]) * 4 * ytx->h;
1918
92.9k
                t->bx -= x;
1919
92.9k
                t->by += ytx->h;
1920
92.9k
            }
1921
86.4k
            dst -= PXSTRIDE(f->cur.stride[0]) * 4 * y;
1922
86.4k
            t->by -= y;
1923
1924
            // chroma coefs and inverse transform
1925
199k
            if (has_chroma) for (int pl = 0; pl < 2; pl++) {
1926
133k
                pixel *uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff +
1927
133k
                    (PXSTRIDE(f->cur.stride[1]) * init_y * 4 >> ss_ver);
1928
133k
                for (y = init_y >> ss_ver, t->by += init_y;
1929
281k
                     y < imin(ch4, (init_y + 16) >> ss_ver); y += uvtx->h)
1930
148k
                {
1931
148k
                    int x;
1932
148k
                    for (x = init_x >> ss_hor, t->bx += init_x;
1933
343k
                         x < imin(cw4, (init_x + 16) >> ss_hor); x += uvtx->w)
1934
194k
                    {
1935
194k
                        coef *cf;
1936
194k
                        int eob;
1937
194k
                        enum TxfmType txtp;
1938
194k
                        if (t->frame_thread.pass) {
1939
194k
                            const int p = t->frame_thread.pass & 1;
1940
194k
                            const int cbi = *ts->frame_thread[p].cbi++;
1941
194k
                            cf = ts->frame_thread[p].cf;
1942
194k
                            ts->frame_thread[p].cf += uvtx->w * uvtx->h * 16;
1943
194k
                            eob  = cbi >> 5;
1944
194k
                            txtp = cbi & 0x1f;
1945
18.4E
                        } else {
1946
18.4E
                            uint8_t cf_ctx;
1947
18.4E
                            cf = bitfn(t->cf);
1948
18.4E
                            txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 +
1949
18.4E
                                                        bx4 + (x << ss_hor)];
1950
18.4E
                            eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
1951
18.4E
                                               &t->l.ccoef[pl][cby4 + y],
1952
18.4E
                                               b->uvtx, bs, b, 0, 1 + pl,
1953
18.4E
                                               cf, &txtp, &cf_ctx);
1954
18.4E
                            if (DEBUG_BLOCK_INFO)
1955
0
                                printf("Post-uv-cf-blk[pl=%d,tx=%d,"
1956
0
                                       "txtp=%d,eob=%d]: r=%d\n",
1957
0
                                       pl, b->uvtx, txtp, eob, ts->msac.rng);
1958
18.4E
                            int ctw = imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor);
1959
18.4E
                            int cth = imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver);
1960
18.4E
                            dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
1961
18.4E
                            dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
1962
18.4E
                        }
1963
194k
                        if (eob >= 0) {
1964
60.1k
                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1965
0
                                coef_dump(cf, uvtx->h * 4, uvtx->w * 4, 3, "dq");
1966
60.1k
                            dsp->itx.itxfm_add[b->uvtx]
1967
60.1k
                                              [txtp](&uvdst[4 * x],
1968
60.1k
                                                     f->cur.stride[1],
1969
60.1k
                                                     cf, eob HIGHBD_CALL_SUFFIX);
1970
60.1k
                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1971
0
                                hex_dump(&uvdst[4 * x], f->cur.stride[1],
1972
0
                                         uvtx->w * 4, uvtx->h * 4, "recon");
1973
60.1k
                        }
1974
194k
                        t->bx += uvtx->w << ss_hor;
1975
194k
                    }
1976
148k
                    uvdst += PXSTRIDE(f->cur.stride[1]) * 4 * uvtx->h;
1977
148k
                    t->bx -= x << ss_hor;
1978
148k
                    t->by += uvtx->h << ss_ver;
1979
148k
                }
1980
133k
                t->by -= y << ss_ver;
1981
133k
            }
1982
86.4k
        }
1983
84.9k
    }
1984
80.6k
    return 0;
1985
138k
}
1986
1987
185k
void bytefn(dav1d_filter_sbrow_deblock_cols)(Dav1dFrameContext *const f, const int sby) {
1988
185k
    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK) ||
1989
185k
        (!f->frame_hdr->loopfilter.level_y[0] && !f->frame_hdr->loopfilter.level_y[1]))
1990
0
    {
1991
0
        return;
1992
0
    }
1993
185k
    const int y = sby * f->sb_step * 4;
1994
185k
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
1995
185k
    pixel *const p[3] = {
1996
185k
        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
1997
185k
        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
1998
185k
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
1999
185k
    };
2000
185k
    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
2001
185k
    bytefn(dav1d_loopfilter_sbrow_cols)(f, p, mask, sby,
2002
185k
                                        f->lf.start_of_tile_row[sby]);
2003
185k
}
dav1d_filter_sbrow_deblock_cols_8bpc
Line
Count
Source
1987
93.4k
void bytefn(dav1d_filter_sbrow_deblock_cols)(Dav1dFrameContext *const f, const int sby) {
1988
93.4k
    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK) ||
1989
93.4k
        (!f->frame_hdr->loopfilter.level_y[0] && !f->frame_hdr->loopfilter.level_y[1]))
1990
0
    {
1991
0
        return;
1992
0
    }
1993
93.4k
    const int y = sby * f->sb_step * 4;
1994
93.4k
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
1995
93.4k
    pixel *const p[3] = {
1996
93.4k
        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
1997
93.4k
        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
1998
93.4k
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
1999
93.4k
    };
2000
93.4k
    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
2001
93.4k
    bytefn(dav1d_loopfilter_sbrow_cols)(f, p, mask, sby,
2002
93.4k
                                        f->lf.start_of_tile_row[sby]);
2003
93.4k
}
dav1d_filter_sbrow_deblock_cols_16bpc
Line
Count
Source
1987
92.5k
void bytefn(dav1d_filter_sbrow_deblock_cols)(Dav1dFrameContext *const f, const int sby) {
1988
92.5k
    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK) ||
1989
92.5k
        (!f->frame_hdr->loopfilter.level_y[0] && !f->frame_hdr->loopfilter.level_y[1]))
1990
0
    {
1991
0
        return;
1992
0
    }
1993
92.5k
    const int y = sby * f->sb_step * 4;
1994
92.5k
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
1995
92.5k
    pixel *const p[3] = {
1996
92.5k
        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
1997
92.5k
        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
1998
92.5k
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
1999
92.5k
    };
2000
92.5k
    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
2001
92.5k
    bytefn(dav1d_loopfilter_sbrow_cols)(f, p, mask, sby,
2002
92.5k
                                        f->lf.start_of_tile_row[sby]);
2003
92.5k
}
2004
2005
242k
void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const int sby) {
2006
242k
    const int y = sby * f->sb_step * 4;
2007
242k
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2008
242k
    pixel *const p[3] = {
2009
242k
        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
2010
242k
        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
2011
242k
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
2012
242k
    };
2013
242k
    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
2014
242k
    if (f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK &&
2015
242k
        (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]))
2016
185k
    {
2017
185k
        bytefn(dav1d_loopfilter_sbrow_rows)(f, p, mask, sby);
2018
185k
    }
2019
242k
    if (f->seq_hdr->cdef || f->lf.restore_planes) {
2020
        // Store loop filtered pixels required by CDEF / LR
2021
207k
        bytefn(dav1d_copy_lpf)(f, p, sby);
2022
207k
    }
2023
242k
}
dav1d_filter_sbrow_deblock_rows_8bpc
Line
Count
Source
2005
118k
void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const int sby) {
2006
118k
    const int y = sby * f->sb_step * 4;
2007
118k
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2008
118k
    pixel *const p[3] = {
2009
118k
        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
2010
118k
        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
2011
118k
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
2012
118k
    };
2013
118k
    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
2014
118k
    if (f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK &&
2015
118k
        (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]))
2016
93.2k
    {
2017
93.2k
        bytefn(dav1d_loopfilter_sbrow_rows)(f, p, mask, sby);
2018
93.2k
    }
2019
118k
    if (f->seq_hdr->cdef || f->lf.restore_planes) {
2020
        // Store loop filtered pixels required by CDEF / LR
2021
98.7k
        bytefn(dav1d_copy_lpf)(f, p, sby);
2022
98.7k
    }
2023
118k
}
dav1d_filter_sbrow_deblock_rows_16bpc
Line
Count
Source
2005
124k
void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const int sby) {
2006
124k
    const int y = sby * f->sb_step * 4;
2007
124k
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2008
124k
    pixel *const p[3] = {
2009
124k
        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
2010
124k
        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
2011
124k
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
2012
124k
    };
2013
124k
    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
2014
124k
    if (f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK &&
2015
124k
        (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]))
2016
92.4k
    {
2017
92.4k
        bytefn(dav1d_loopfilter_sbrow_rows)(f, p, mask, sby);
2018
92.4k
    }
2019
124k
    if (f->seq_hdr->cdef || f->lf.restore_planes) {
2020
        // Store loop filtered pixels required by CDEF / LR
2021
108k
        bytefn(dav1d_copy_lpf)(f, p, sby);
2022
108k
    }
2023
124k
}
2024
2025
160k
void bytefn(dav1d_filter_sbrow_cdef)(Dav1dTaskContext *const tc, const int sby) {
2026
160k
    const Dav1dFrameContext *const f = tc->f;
2027
160k
    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_CDEF)) return;
2028
160k
    const int sbsz = f->sb_step;
2029
160k
    const int y = sby * sbsz * 4;
2030
160k
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2031
160k
    pixel *const p[3] = {
2032
160k
        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
2033
160k
        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
2034
160k
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
2035
160k
    };
2036
160k
    Av1Filter *prev_mask = f->lf.mask + ((sby - 1) >> !f->seq_hdr->sb128) * f->sb128w;
2037
160k
    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
2038
160k
    const int start = sby * sbsz;
2039
160k
    if (sby) {
2040
97.0k
        const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2041
97.0k
        pixel *p_up[3] = {
2042
97.0k
            p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
2043
97.0k
            p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
2044
97.0k
            p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
2045
97.0k
        };
2046
97.0k
        bytefn(dav1d_cdef_brow)(tc, p_up, prev_mask, start - 2, start, 1, sby);
2047
97.0k
    }
2048
160k
    const int n_blks = sbsz - 2 * (sby + 1 < f->sbh);
2049
160k
    const int end = imin(start + n_blks, f->bh);
2050
160k
    bytefn(dav1d_cdef_brow)(tc, p, mask, start, end, 0, sby);
2051
160k
}
dav1d_filter_sbrow_cdef_8bpc
Line
Count
Source
2025
74.8k
void bytefn(dav1d_filter_sbrow_cdef)(Dav1dTaskContext *const tc, const int sby) {
2026
74.8k
    const Dav1dFrameContext *const f = tc->f;
2027
74.8k
    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_CDEF)) return;
2028
74.8k
    const int sbsz = f->sb_step;
2029
74.8k
    const int y = sby * sbsz * 4;
2030
74.8k
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2031
74.8k
    pixel *const p[3] = {
2032
74.8k
        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
2033
74.8k
        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
2034
74.8k
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
2035
74.8k
    };
2036
74.8k
    Av1Filter *prev_mask = f->lf.mask + ((sby - 1) >> !f->seq_hdr->sb128) * f->sb128w;
2037
74.8k
    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
2038
74.8k
    const int start = sby * sbsz;
2039
74.8k
    if (sby) {
2040
40.3k
        const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2041
40.3k
        pixel *p_up[3] = {
2042
40.3k
            p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
2043
40.3k
            p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
2044
40.3k
            p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
2045
40.3k
        };
2046
40.3k
        bytefn(dav1d_cdef_brow)(tc, p_up, prev_mask, start - 2, start, 1, sby);
2047
40.3k
    }
2048
74.8k
    const int n_blks = sbsz - 2 * (sby + 1 < f->sbh);
2049
74.8k
    const int end = imin(start + n_blks, f->bh);
2050
74.8k
    bytefn(dav1d_cdef_brow)(tc, p, mask, start, end, 0, sby);
2051
74.8k
}
dav1d_filter_sbrow_cdef_16bpc
Line
Count
Source
2025
85.5k
void bytefn(dav1d_filter_sbrow_cdef)(Dav1dTaskContext *const tc, const int sby) {
2026
85.5k
    const Dav1dFrameContext *const f = tc->f;
2027
85.5k
    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_CDEF)) return;
2028
85.5k
    const int sbsz = f->sb_step;
2029
85.5k
    const int y = sby * sbsz * 4;
2030
85.5k
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2031
85.5k
    pixel *const p[3] = {
2032
85.5k
        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
2033
85.5k
        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
2034
85.5k
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
2035
85.5k
    };
2036
85.5k
    Av1Filter *prev_mask = f->lf.mask + ((sby - 1) >> !f->seq_hdr->sb128) * f->sb128w;
2037
85.5k
    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
2038
85.5k
    const int start = sby * sbsz;
2039
85.5k
    if (sby) {
2040
56.6k
        const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2041
56.6k
        pixel *p_up[3] = {
2042
56.6k
            p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
2043
56.6k
            p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
2044
56.6k
            p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
2045
56.6k
        };
2046
56.6k
        bytefn(dav1d_cdef_brow)(tc, p_up, prev_mask, start - 2, start, 1, sby);
2047
56.6k
    }
2048
85.5k
    const int n_blks = sbsz - 2 * (sby + 1 < f->sbh);
2049
85.5k
    const int end = imin(start + n_blks, f->bh);
2050
85.5k
    bytefn(dav1d_cdef_brow)(tc, p, mask, start, end, 0, sby);
2051
85.5k
}
2052
2053
46.0k
void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby) {
2054
46.0k
    const int sbsz = f->sb_step;
2055
46.0k
    const int y = sby * sbsz * 4;
2056
46.0k
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2057
46.0k
    const pixel *const p[3] = {
2058
46.0k
        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
2059
46.0k
        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
2060
46.0k
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
2061
46.0k
    };
2062
46.0k
    pixel *const sr_p[3] = {
2063
46.0k
        f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
2064
46.0k
        f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
2065
46.0k
        f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
2066
46.0k
    };
2067
46.0k
    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
2068
156k
    for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) {
2069
110k
        const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2070
110k
        const int h_start = 8 * !!sby >> ss_ver;
2071
110k
        const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl];
2072
110k
        pixel *dst = sr_p[pl] - h_start * PXSTRIDE(dst_stride);
2073
110k
        const ptrdiff_t src_stride = f->cur.stride[!!pl];
2074
110k
        const pixel *src = p[pl] - h_start * PXSTRIDE(src_stride);
2075
110k
        const int h_end = 4 * (sbsz - 2 * (sby + 1 < f->sbh)) >> ss_ver;
2076
110k
        const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
2077
110k
        const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
2078
110k
        const int src_w = (4 * f->bw + ss_hor) >> ss_hor;
2079
110k
        const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver;
2080
2081
110k
        f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w,
2082
110k
                          imin(img_h, h_end) + h_start, src_w,
2083
110k
                          f->resize_step[!!pl], f->resize_start[!!pl]
2084
110k
                          HIGHBD_CALL_SUFFIX);
2085
110k
    }
2086
46.0k
}
dav1d_filter_sbrow_resize_8bpc
Line
Count
Source
2053
22.6k
void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby) {
2054
22.6k
    const int sbsz = f->sb_step;
2055
22.6k
    const int y = sby * sbsz * 4;
2056
22.6k
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2057
22.6k
    const pixel *const p[3] = {
2058
22.6k
        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
2059
22.6k
        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
2060
22.6k
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
2061
22.6k
    };
2062
22.6k
    pixel *const sr_p[3] = {
2063
22.6k
        f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
2064
22.6k
        f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
2065
22.6k
        f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
2066
22.6k
    };
2067
22.6k
    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
2068
82.4k
    for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) {
2069
59.7k
        const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2070
59.7k
        const int h_start = 8 * !!sby >> ss_ver;
2071
59.7k
        const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl];
2072
59.7k
        pixel *dst = sr_p[pl] - h_start * PXSTRIDE(dst_stride);
2073
59.7k
        const ptrdiff_t src_stride = f->cur.stride[!!pl];
2074
59.7k
        const pixel *src = p[pl] - h_start * PXSTRIDE(src_stride);
2075
59.7k
        const int h_end = 4 * (sbsz - 2 * (sby + 1 < f->sbh)) >> ss_ver;
2076
59.7k
        const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
2077
59.7k
        const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
2078
59.7k
        const int src_w = (4 * f->bw + ss_hor) >> ss_hor;
2079
59.7k
        const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver;
2080
2081
59.7k
        f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w,
2082
59.7k
                          imin(img_h, h_end) + h_start, src_w,
2083
59.7k
                          f->resize_step[!!pl], f->resize_start[!!pl]
2084
59.7k
                          HIGHBD_CALL_SUFFIX);
2085
59.7k
    }
2086
22.6k
}
dav1d_filter_sbrow_resize_16bpc
Line
Count
Source
2053
23.3k
void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby) {
2054
23.3k
    const int sbsz = f->sb_step;
2055
23.3k
    const int y = sby * sbsz * 4;
2056
23.3k
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2057
23.3k
    const pixel *const p[3] = {
2058
23.3k
        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
2059
23.3k
        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
2060
23.3k
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
2061
23.3k
    };
2062
23.3k
    pixel *const sr_p[3] = {
2063
23.3k
        f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
2064
23.3k
        f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
2065
23.3k
        f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
2066
23.3k
    };
2067
23.3k
    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
2068
73.8k
    for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) {
2069
50.5k
        const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2070
50.5k
        const int h_start = 8 * !!sby >> ss_ver;
2071
50.5k
        const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl];
2072
50.5k
        pixel *dst = sr_p[pl] - h_start * PXSTRIDE(dst_stride);
2073
50.5k
        const ptrdiff_t src_stride = f->cur.stride[!!pl];
2074
50.5k
        const pixel *src = p[pl] - h_start * PXSTRIDE(src_stride);
2075
50.5k
        const int h_end = 4 * (sbsz - 2 * (sby + 1 < f->sbh)) >> ss_ver;
2076
50.5k
        const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
2077
50.5k
        const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
2078
50.5k
        const int src_w = (4 * f->bw + ss_hor) >> ss_hor;
2079
50.5k
        const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver;
2080
2081
50.5k
        f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w,
2082
50.5k
                          imin(img_h, h_end) + h_start, src_w,
2083
50.5k
                          f->resize_step[!!pl], f->resize_start[!!pl]
2084
50.5k
                          HIGHBD_CALL_SUFFIX);
2085
50.5k
    }
2086
23.3k
}
2087
2088
110k
void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) {
2089
110k
    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_RESTORATION)) return;
2090
110k
    const int y = sby * f->sb_step * 4;
2091
110k
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2092
110k
    pixel *const sr_p[3] = {
2093
110k
        f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
2094
110k
        f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
2095
110k
        f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
2096
110k
    };
2097
110k
    bytefn(dav1d_lr_sbrow)(f, sr_p, sby);
2098
110k
}
dav1d_filter_sbrow_lr_8bpc
Line
Count
Source
2088
61.0k
void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) {
2089
61.0k
    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_RESTORATION)) return;
2090
61.0k
    const int y = sby * f->sb_step * 4;
2091
61.0k
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2092
61.0k
    pixel *const sr_p[3] = {
2093
61.0k
        f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
2094
61.0k
        f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
2095
61.0k
        f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
2096
61.0k
    };
2097
61.0k
    bytefn(dav1d_lr_sbrow)(f, sr_p, sby);
2098
61.0k
}
dav1d_filter_sbrow_lr_16bpc
Line
Count
Source
2088
49.3k
void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) {
2089
49.3k
    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_RESTORATION)) return;
2090
49.3k
    const int y = sby * f->sb_step * 4;
2091
49.3k
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2092
49.3k
    pixel *const sr_p[3] = {
2093
49.3k
        f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
2094
49.3k
        f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
2095
49.3k
        f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
2096
49.3k
    };
2097
49.3k
    bytefn(dav1d_lr_sbrow)(f, sr_p, sby);
2098
49.3k
}
2099
2100
0
void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
2101
0
    bytefn(dav1d_filter_sbrow_deblock_cols)(f, sby);
2102
0
    bytefn(dav1d_filter_sbrow_deblock_rows)(f, sby);
2103
0
    if (f->seq_hdr->cdef)
2104
0
        bytefn(dav1d_filter_sbrow_cdef)(f->c->tc, sby);
2105
0
    if (f->frame_hdr->width[0] != f->frame_hdr->width[1])
2106
0
        bytefn(dav1d_filter_sbrow_resize)(f, sby);
2107
0
    if (f->lf.restore_planes)
2108
0
        bytefn(dav1d_filter_sbrow_lr)(f, sby);
2109
0
}
Unexecuted instantiation: dav1d_filter_sbrow_8bpc
Unexecuted instantiation: dav1d_filter_sbrow_16bpc
2110
2111
294k
void bytefn(dav1d_backup_ipred_edge)(Dav1dTaskContext *const t) {
2112
294k
    const Dav1dFrameContext *const f = t->f;
2113
294k
    Dav1dTileState *const ts = t->ts;
2114
294k
    const int sby = t->by >> f->sb_shift;
2115
294k
    const int sby_off = f->sb128w * 128 * sby;
2116
294k
    const int x_off = ts->tiling.col_start;
2117
2118
294k
    const pixel *const y =
2119
294k
        ((const pixel *) f->cur.data[0]) + x_off * 4 +
2120
294k
                    ((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.stride[0]);
2121
294k
    pixel_copy(&f->ipred_edge[0][sby_off + x_off * 4], y,
2122
294k
               4 * (ts->tiling.col_end - x_off));
2123
2124
294k
    if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
2125
231k
        const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2126
231k
        const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
2127
2128
231k
        const ptrdiff_t uv_off = (x_off * 4 >> ss_hor) +
2129
231k
            (((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.stride[1]);
2130
693k
        for (int pl = 1; pl <= 2; pl++)
2131
462k
            pixel_copy(&f->ipred_edge[pl][sby_off + (x_off * 4 >> ss_hor)],
2132
238k
                       &((const pixel *) f->cur.data[pl])[uv_off],
2133
238k
                       4 * (ts->tiling.col_end - x_off) >> ss_hor);
2134
231k
    }
2135
294k
}
dav1d_backup_ipred_edge_8bpc
Line
Count
Source
2111
146k
void bytefn(dav1d_backup_ipred_edge)(Dav1dTaskContext *const t) {
2112
146k
    const Dav1dFrameContext *const f = t->f;
2113
146k
    Dav1dTileState *const ts = t->ts;
2114
146k
    const int sby = t->by >> f->sb_shift;
2115
146k
    const int sby_off = f->sb128w * 128 * sby;
2116
146k
    const int x_off = ts->tiling.col_start;
2117
2118
146k
    const pixel *const y =
2119
146k
        ((const pixel *) f->cur.data[0]) + x_off * 4 +
2120
146k
                    ((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.stride[0]);
2121
146k
    pixel_copy(&f->ipred_edge[0][sby_off + x_off * 4], y,
2122
146k
               4 * (ts->tiling.col_end - x_off));
2123
2124
146k
    if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
2125
119k
        const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2126
119k
        const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
2127
2128
119k
        const ptrdiff_t uv_off = (x_off * 4 >> ss_hor) +
2129
119k
            (((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.stride[1]);
2130
358k
        for (int pl = 1; pl <= 2; pl++)
2131
238k
            pixel_copy(&f->ipred_edge[pl][sby_off + (x_off * 4 >> ss_hor)],
2132
238k
                       &((const pixel *) f->cur.data[pl])[uv_off],
2133
238k
                       4 * (ts->tiling.col_end - x_off) >> ss_hor);
2134
119k
    }
2135
146k
}
dav1d_backup_ipred_edge_16bpc
Line
Count
Source
2111
148k
void bytefn(dav1d_backup_ipred_edge)(Dav1dTaskContext *const t) {
2112
148k
    const Dav1dFrameContext *const f = t->f;
2113
148k
    Dav1dTileState *const ts = t->ts;
2114
148k
    const int sby = t->by >> f->sb_shift;
2115
148k
    const int sby_off = f->sb128w * 128 * sby;
2116
148k
    const int x_off = ts->tiling.col_start;
2117
2118
148k
    const pixel *const y =
2119
148k
        ((const pixel *) f->cur.data[0]) + x_off * 4 +
2120
148k
                    ((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.stride[0]);
2121
148k
    pixel_copy(&f->ipred_edge[0][sby_off + x_off * 4], y,
2122
148k
               4 * (ts->tiling.col_end - x_off));
2123
2124
148k
    if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
2125
111k
        const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2126
111k
        const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
2127
2128
111k
        const ptrdiff_t uv_off = (x_off * 4 >> ss_hor) +
2129
111k
            (((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.stride[1]);
2130
335k
        for (int pl = 1; pl <= 2; pl++)
2131
223k
            pixel_copy(&f->ipred_edge[pl][sby_off + (x_off * 4 >> ss_hor)],
2132
111k
                       &((const pixel *) f->cur.data[pl])[uv_off],
2133
111k
                       4 * (ts->tiling.col_end - x_off) >> ss_hor);
2134
111k
    }
2135
148k
}
2136
2137
void bytefn(dav1d_copy_pal_block_y)(Dav1dTaskContext *const t,
2138
                                    const int bx4, const int by4,
2139
                                    const int bw4, const int bh4)
2140
2141
38.6k
{
2142
38.6k
    const Dav1dFrameContext *const f = t->f;
2143
38.6k
    pixel *const pal = t->frame_thread.pass ?
2144
38.6k
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
2145
38.6k
                            ((t->bx >> 1) + (t->by & 1))][0] :
2146
38.6k
        bytefn(t->scratch.pal)[0];
2147
183k
    for (int x = 0; x < bw4; x++)
2148
145k
        memcpy(bytefn(t->al_pal)[0][bx4 + x][0], pal, 8 * sizeof(pixel));
2149
158k
    for (int y = 0; y < bh4; y++)
2150
119k
        memcpy(bytefn(t->al_pal)[1][by4 + y][0], pal, 8 * sizeof(pixel));
2151
38.6k
}
dav1d_copy_pal_block_y_8bpc
Line
Count
Source
2141
20.1k
{
2142
20.1k
    const Dav1dFrameContext *const f = t->f;
2143
20.1k
    pixel *const pal = t->frame_thread.pass ?
2144
20.1k
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
2145
20.1k
                            ((t->bx >> 1) + (t->by & 1))][0] :
2146
20.1k
        bytefn(t->scratch.pal)[0];
2147
95.1k
    for (int x = 0; x < bw4; x++)
2148
74.9k
        memcpy(bytefn(t->al_pal)[0][bx4 + x][0], pal, 8 * sizeof(pixel));
2149
80.6k
    for (int y = 0; y < bh4; y++)
2150
60.4k
        memcpy(bytefn(t->al_pal)[1][by4 + y][0], pal, 8 * sizeof(pixel));
2151
20.1k
}
dav1d_copy_pal_block_y_16bpc
Line
Count
Source
2141
18.4k
{
2142
18.4k
    const Dav1dFrameContext *const f = t->f;
2143
18.4k
    pixel *const pal = t->frame_thread.pass ?
2144
18.4k
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
2145
18.4k
                            ((t->bx >> 1) + (t->by & 1))][0] :
2146
18.4k
        bytefn(t->scratch.pal)[0];
2147
88.6k
    for (int x = 0; x < bw4; x++)
2148
70.1k
        memcpy(bytefn(t->al_pal)[0][bx4 + x][0], pal, 8 * sizeof(pixel));
2149
77.5k
    for (int y = 0; y < bh4; y++)
2150
59.0k
        memcpy(bytefn(t->al_pal)[1][by4 + y][0], pal, 8 * sizeof(pixel));
2151
18.4k
}
2152
2153
void bytefn(dav1d_copy_pal_block_uv)(Dav1dTaskContext *const t,
2154
                                     const int bx4, const int by4,
2155
                                     const int bw4, const int bh4)
2156
2157
13.1k
{
2158
13.1k
    const Dav1dFrameContext *const f = t->f;
2159
13.1k
    const pixel (*const pal)[8] = t->frame_thread.pass ?
2160
13.1k
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
2161
13.1k
                            ((t->bx >> 1) + (t->by & 1))] :
2162
13.1k
        bytefn(t->scratch.pal);
2163
    // see aomedia bug 2183 for why we use luma coordinates here
2164
39.4k
    for (int pl = 1; pl <= 2; pl++) {
2165
146k
        for (int x = 0; x < bw4; x++)
2166
120k
            memcpy(bytefn(t->al_pal)[0][bx4 + x][pl], pal[pl], 8 * sizeof(pixel));
2167
132k
        for (int y = 0; y < bh4; y++)
2168
105k
            memcpy(bytefn(t->al_pal)[1][by4 + y][pl], pal[pl], 8 * sizeof(pixel));
2169
26.3k
    }
2170
13.1k
}
dav1d_copy_pal_block_uv_8bpc
Line
Count
Source
2157
7.39k
{
2158
7.39k
    const Dav1dFrameContext *const f = t->f;
2159
7.39k
    const pixel (*const pal)[8] = t->frame_thread.pass ?
2160
7.39k
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
2161
7.39k
                            ((t->bx >> 1) + (t->by & 1))] :
2162
7.39k
        bytefn(t->scratch.pal);
2163
    // see aomedia bug 2183 for why we use luma coordinates here
2164
22.1k
    for (int pl = 1; pl <= 2; pl++) {
2165
82.5k
        for (int x = 0; x < bw4; x++)
2166
67.7k
            memcpy(bytefn(t->al_pal)[0][bx4 + x][pl], pal[pl], 8 * sizeof(pixel));
2167
75.2k
        for (int y = 0; y < bh4; y++)
2168
60.4k
            memcpy(bytefn(t->al_pal)[1][by4 + y][pl], pal[pl], 8 * sizeof(pixel));
2169
14.7k
    }
2170
7.39k
}
dav1d_copy_pal_block_uv_16bpc
Line
Count
Source
2157
5.76k
{
2158
5.76k
    const Dav1dFrameContext *const f = t->f;
2159
5.76k
    const pixel (*const pal)[8] = t->frame_thread.pass ?
2160
5.76k
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
2161
5.76k
                            ((t->bx >> 1) + (t->by & 1))] :
2162
5.76k
        bytefn(t->scratch.pal);
2163
    // see aomedia bug 2183 for why we use luma coordinates here
2164
17.2k
    for (int pl = 1; pl <= 2; pl++) {
2165
63.9k
        for (int x = 0; x < bw4; x++)
2166
52.4k
            memcpy(bytefn(t->al_pal)[0][bx4 + x][pl], pal[pl], 8 * sizeof(pixel));
2167
56.8k
        for (int y = 0; y < bh4; y++)
2168
45.3k
            memcpy(bytefn(t->al_pal)[1][by4 + y][pl], pal[pl], 8 * sizeof(pixel));
2169
11.5k
    }
2170
5.76k
}
2171
2172
/*
 * Decode the palette of plane `pl` for block `b` from the tile's symbol
 * decoder (ts->msac) and write it to the palette output buffer.
 *
 * Work is done in bitstream order:
 *  1. Read the palette size (decoded symbol + 2) and record it in
 *     b->pal_sz[pl].
 *  2. Build a candidate cache by merging the left and above neighbour
 *     palettes found in t->al_pal, keeping each value once.
 *  3. Read one boolean per cache entry to pick the entries that are reused.
 *  4. If entries are still missing, read them explicitly: the first as a
 *     bpc-bit value, the following ones as deltas added to the previous entry.
 *  5. Merge the reused and the explicitly coded entries by value.
 *
 * Parameters:
 *  t        task context; provides the tile state, the frame context, the
 *           neighbour palettes/sizes and the output buffers.
 *  b        block whose pal_sz[pl] is written.
 *  pl       plane index. 0 takes neighbour sizes from t->l / t->a, non-zero
 *           takes them from t->pal_sz_uv; plane 0 also gets a minimum delta
 *           step of 1 (the `!pl` terms below).
 *  sz_ctx   index into ts->cdf.m.pal_sz[pl][] used for the size symbol.
 *  bx4, by4 indices of the above ([0][bx4]) and left ([1][by4]) neighbour
 *           entries.
 *
 * Output goes to f->frame_thread.pal[...][pl] when t->frame_thread.pass is
 * non-zero, otherwise to t->scratch.pal[pl].
 *
 * NOTE(review): the cache merge only yields a sorted, duplicate-free list if
 * both neighbour palettes are already stored in ascending order -- confirm
 * against the code that fills t->al_pal.
 */
void bytefn(dav1d_read_pal_plane)(Dav1dTaskContext *const t, Av1Block *const b,
2173
                                  const int pl, const int sz_ctx,
2174
                                  const int bx4, const int by4)
2175
51.7k
{
2176
51.7k
    Dav1dTileState *const ts = t->ts;
2177
51.7k
    const Dav1dFrameContext *const f = t->f;
2178
51.7k
    // number of palette entries = decoded symbol + 2; also stored in the block
    const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
2179
51.7k
                                           ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2;
2180
51.7k
    // cache: merged neighbour palettes; used_cache: the entries picked for reuse
    // NOTE(review): sizes 16/8 presumably mean at most 8 entries per neighbour -- confirm
    pixel cache[16], used_cache[8];
2181
51.7k
    // l_cache / a_cache count the left / above entries not yet merged
    int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
2182
51.7k
    int n_cache = 0;
2183
    // don't reuse above palette outside SB64 boundaries
2184
51.7k
    int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0;
2185
51.7k
    const pixel *l = bytefn(t->al_pal)[1][by4][pl];
2186
51.7k
    const pixel *a = bytefn(t->al_pal)[0][bx4][pl];
2187
2188
    // fill/sort cache
2189
84.3k
    // two-way merge while both neighbours have entries left: append the
    // smaller head unless it repeats the last cached value
    while (l_cache && a_cache) {
2190
32.5k
        if (*l < *a) {
2191
11.8k
            if (!n_cache || cache[n_cache - 1] != *l)
2192
11.7k
                cache[n_cache++] = *l;
2193
11.8k
            l++;
2194
11.8k
            l_cache--;
2195
20.6k
        } else {
2196
20.6k
            // equal heads: drop the left copy so the value is considered once
            if (*a == *l) {
2197
8.35k
                l++;
2198
8.35k
                l_cache--;
2199
8.35k
            }
2200
20.6k
            if (!n_cache || cache[n_cache - 1] != *a)
2201
19.9k
                cache[n_cache++] = *a;
2202
20.6k
            a++;
2203
20.6k
            a_cache--;
2204
20.6k
        }
2205
32.5k
    }
2206
51.7k
    // at most one neighbour still has entries; append them, skipping values
    // equal to the last cached one
    if (l_cache) {
2207
61.7k
        do {
2208
61.7k
            if (!n_cache || cache[n_cache - 1] != *l)
2209
50.3k
                cache[n_cache++] = *l;
2210
61.7k
            l++;
2211
61.7k
        } while (--l_cache > 0);
2212
36.9k
    } else if (a_cache) {
2213
46.1k
        do {
2214
46.1k
            if (!n_cache || cache[n_cache - 1] != *a)
2215
36.8k
                cache[n_cache++] = *a;
2216
46.1k
            a++;
2217
46.1k
        } while (--a_cache > 0);
2218
11.1k
    }
2219
2220
    // find reused cache entries
2221
51.7k
    int i = 0;
2222
157k
    // one decoded boolean per cache entry; stops early once pal_sz entries
    // have been selected, so no flags are read for the remaining entries
    for (int n = 0; n < n_cache && i < pal_sz; n++)
2223
105k
        if (dav1d_msac_decode_bool_equi(&ts->msac))
2224
52.4k
            used_cache[i++] = cache[n];
2225
51.7k
    const int n_used_cache = i;
2226
2227
    // parse new entries
2228
51.7k
    // output buffer: frame-level storage when t->frame_thread.pass is
    // non-zero, otherwise the task-local scratch palette
    pixel *const pal = t->frame_thread.pass ?
2229
51.7k
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
2230
51.7k
                            ((t->bx >> 1) + (t->by & 1))][pl] :
2231
18.4E
        bytefn(t->scratch.pal)[pl];
2232
51.7k
    // explicitly coded entries are first stored at pal[n_used_cache .. pal_sz)
    if (i < pal_sz) {
2233
45.1k
        const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
2234
45.1k
        // first new entry is a plain bpc-bit value
        int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, bpc);
2235
2236
45.1k
        if (i < pal_sz) {
2237
40.7k
            // initial delta width: bpc - 3 plus a 2-bit value from the bitstream
            int bits = bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2);
2238
40.7k
            const int max = (1 << bpc) - 1;
2239
2240
95.0k
            do {
2241
95.0k
                const int delta = dav1d_msac_decode_bools(&ts->msac, bits);
2242
95.0k
                // `!pl` adds a step of 1 for plane 0 only; result is clamped to max
                prev = pal[i++] = imin(prev + delta + !pl, max);
2243
95.0k
                // no larger value can follow: fill the rest with max without
                // reading any more bits
                if (prev + !pl >= max) {
2244
53.6k
                    for (; i < pal_sz; i++)
2245
35.0k
                        pal[i] = max;
2246
18.5k
                    break;
2247
18.5k
                }
2248
76.4k
                // narrow the delta width to what the remaining headroom needs
                // (presumably ulog2() is floor(log2) -- confirm)
                bits = imin(bits, 1 + ulog2(max - prev - !pl));
2249
76.4k
            } while (i < pal_sz);
2250
40.7k
        }
2251
2252
        // merge cache+new entries
2253
45.1k
        // in-place merge: n walks used_cache, m walks the new entries in pal.
        // The write index satisfies i == n + (m - n_used_cache) <= m, so a new
        // entry is never overwritten before it has been read.
        int n = 0, m = n_used_cache;
2254
255k
        for (i = 0; i < pal_sz; i++) {
2255
210k
            // on equal values the reused (cached) entry is emitted first
            if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) {
2256
34.8k
                pal[i] = used_cache[n++];
2257
175k
            } else {
2258
175k
                assert(m < pal_sz);
2259
175k
                pal[i] = pal[m++];
2260
175k
            }
2261
210k
        }
2262
45.1k
    } else {
2263
6.59k
        // all pal_sz entries were reused from the cache; copy them as they are
        memcpy(pal, used_cache, n_used_cache * sizeof(*used_cache));
2264
6.59k
    }
2265
2266
51.7k
    // optional trace: decoder range, candidate cache and resulting palette
    if (DEBUG_BLOCK_INFO) {
2267
0
        printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=",
2268
0
               pl, pal_sz, n_cache, n_used_cache, ts->msac.rng);
2269
0
        for (int n = 0; n < n_cache; n++)
2270
0
            printf("%c%02x", n ? ' ' : '[', cache[n]);
2271
0
        printf("%s, pal=", n_cache ? "]" : "[]");
2272
0
        for (int n = 0; n < pal_sz; n++)
2273
0
            printf("%c%02x", n ? ' ' : '[', pal[n]);
2274
0
        printf("]\n");
2275
0
    }
2276
51.7k
}
dav1d_read_pal_plane_8bpc
Line
Count
Source
2175
27.5k
{
2176
27.5k
    Dav1dTileState *const ts = t->ts;
2177
27.5k
    const Dav1dFrameContext *const f = t->f;
2178
27.5k
    const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
2179
27.5k
                                           ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2;
2180
27.5k
    pixel cache[16], used_cache[8];
2181
27.5k
    int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
2182
27.5k
    int n_cache = 0;
2183
    // don't reuse above palette outside SB64 boundaries
2184
27.5k
    int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0;
2185
27.5k
    const pixel *l = bytefn(t->al_pal)[1][by4][pl];
2186
27.5k
    const pixel *a = bytefn(t->al_pal)[0][bx4][pl];
2187
2188
    // fill/sort cache
2189
48.1k
    while (l_cache && a_cache) {
2190
20.6k
        if (*l < *a) {
2191
7.44k
            if (!n_cache || cache[n_cache - 1] != *l)
2192
7.36k
                cache[n_cache++] = *l;
2193
7.44k
            l++;
2194
7.44k
            l_cache--;
2195
13.1k
        } else {
2196
13.1k
            if (*a == *l) {
2197
5.28k
                l++;
2198
5.28k
                l_cache--;
2199
5.28k
            }
2200
13.1k
            if (!n_cache || cache[n_cache - 1] != *a)
2201
12.6k
                cache[n_cache++] = *a;
2202
13.1k
            a++;
2203
13.1k
            a_cache--;
2204
13.1k
        }
2205
20.6k
    }
2206
27.5k
    if (l_cache) {
2207
33.1k
        do {
2208
33.1k
            if (!n_cache || cache[n_cache - 1] != *l)
2209
27.0k
                cache[n_cache++] = *l;
2210
33.1k
            l++;
2211
33.1k
        } while (--l_cache > 0);
2212
19.4k
    } else if (a_cache) {
2213
24.8k
        do {
2214
24.8k
            if (!n_cache || cache[n_cache - 1] != *a)
2215
19.7k
                cache[n_cache++] = *a;
2216
24.8k
            a++;
2217
24.8k
        } while (--a_cache > 0);
2218
6.16k
    }
2219
2220
    // find reused cache entries
2221
27.5k
    int i = 0;
2222
86.4k
    for (int n = 0; n < n_cache && i < pal_sz; n++)
2223
58.8k
        if (dav1d_msac_decode_bool_equi(&ts->msac))
2224
29.2k
            used_cache[i++] = cache[n];
2225
27.5k
    const int n_used_cache = i;
2226
2227
    // parse new entries
2228
27.5k
    pixel *const pal = t->frame_thread.pass ?
2229
27.5k
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
2230
27.5k
                            ((t->bx >> 1) + (t->by & 1))][pl] :
2231
18.4E
        bytefn(t->scratch.pal)[pl];
2232
27.5k
    if (i < pal_sz) {
2233
23.8k
        const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
2234
23.8k
        int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, bpc);
2235
2236
23.8k
        if (i < pal_sz) {
2237
21.4k
            int bits = bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2);
2238
21.4k
            const int max = (1 << bpc) - 1;
2239
2240
49.3k
            do {
2241
49.3k
                const int delta = dav1d_msac_decode_bools(&ts->msac, bits);
2242
49.3k
                prev = pal[i++] = imin(prev + delta + !pl, max);
2243
49.3k
                if (prev + !pl >= max) {
2244
28.6k
                    for (; i < pal_sz; i++)
2245
18.7k
                        pal[i] = max;
2246
9.84k
                    break;
2247
9.84k
                }
2248
39.5k
                bits = imin(bits, 1 + ulog2(max - prev - !pl));
2249
39.5k
            } while (i < pal_sz);
2250
21.4k
        }
2251
2252
        // merge cache+new entries
2253
23.8k
        int n = 0, m = n_used_cache;
2254
135k
        for (i = 0; i < pal_sz; i++) {
2255
111k
            if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) {
2256
19.4k
                pal[i] = used_cache[n++];
2257
91.9k
            } else {
2258
91.9k
                assert(m < pal_sz);
2259
91.9k
                pal[i] = pal[m++];
2260
91.9k
            }
2261
111k
        }
2262
23.8k
    } else {
2263
3.65k
        memcpy(pal, used_cache, n_used_cache * sizeof(*used_cache));
2264
3.65k
    }
2265
2266
27.5k
    if (DEBUG_BLOCK_INFO) {
2267
0
        printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=",
2268
0
               pl, pal_sz, n_cache, n_used_cache, ts->msac.rng);
2269
0
        for (int n = 0; n < n_cache; n++)
2270
0
            printf("%c%02x", n ? ' ' : '[', cache[n]);
2271
0
        printf("%s, pal=", n_cache ? "]" : "[]");
2272
0
        for (int n = 0; n < pal_sz; n++)
2273
0
            printf("%c%02x", n ? ' ' : '[', pal[n]);
2274
0
        printf("]\n");
2275
0
    }
2276
27.5k
}
dav1d_read_pal_plane_16bpc
Line
Count
Source
2175
24.2k
{
2176
24.2k
    Dav1dTileState *const ts = t->ts;
2177
24.2k
    const Dav1dFrameContext *const f = t->f;
2178
24.2k
    const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
2179
24.2k
                                           ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2;
2180
24.2k
    pixel cache[16], used_cache[8];
2181
24.2k
    int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
2182
24.2k
    int n_cache = 0;
2183
    // don't reuse above palette outside SB64 boundaries
2184
24.2k
    int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0;
2185
24.2k
    const pixel *l = bytefn(t->al_pal)[1][by4][pl];
2186
24.2k
    const pixel *a = bytefn(t->al_pal)[0][bx4][pl];
2187
2188
    // fill/sort cache
2189
36.1k
    while (l_cache && a_cache) {
2190
11.9k
        if (*l < *a) {
2191
4.44k
            if (!n_cache || cache[n_cache - 1] != *l)
2192
4.41k
                cache[n_cache++] = *l;
2193
4.44k
            l++;
2194
4.44k
            l_cache--;
2195
7.50k
        } else {
2196
7.50k
            if (*a == *l) {
2197
3.07k
                l++;
2198
3.07k
                l_cache--;
2199
3.07k
            }
2200
7.50k
            if (!n_cache || cache[n_cache - 1] != *a)
2201
7.27k
                cache[n_cache++] = *a;
2202
7.50k
            a++;
2203
7.50k
            a_cache--;
2204
7.50k
        }
2205
11.9k
    }
2206
24.2k
    if (l_cache) {
2207
28.5k
        do {
2208
28.5k
            if (!n_cache || cache[n_cache - 1] != *l)
2209
23.3k
                cache[n_cache++] = *l;
2210
28.5k
            l++;
2211
28.5k
        } while (--l_cache > 0);
2212
17.4k
    } else if (a_cache) {
2213
21.3k
        do {
2214
21.3k
            if (!n_cache || cache[n_cache - 1] != *a)
2215
17.0k
                cache[n_cache++] = *a;
2216
21.3k
            a++;
2217
21.3k
        } while (--a_cache > 0);
2218
4.97k
    }
2219
2220
    // find reused cache entries
2221
24.2k
    int i = 0;
2222
70.7k
    for (int n = 0; n < n_cache && i < pal_sz; n++)
2223
46.4k
        if (dav1d_msac_decode_bool_equi(&ts->msac))
2224
23.1k
            used_cache[i++] = cache[n];
2225
24.2k
    const int n_used_cache = i;
2226
2227
    // parse new entries
2228
24.2k
    pixel *const pal = t->frame_thread.pass ?
2229
24.2k
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
2230
24.2k
                            ((t->bx >> 1) + (t->by & 1))][pl] :
2231
24.2k
        bytefn(t->scratch.pal)[pl];
2232
24.2k
    if (i < pal_sz) {
2233
21.3k
        const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
2234
21.3k
        int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, bpc);
2235
2236
21.3k
        if (i < pal_sz) {
2237
19.3k
            int bits = bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2);
2238
19.3k
            const int max = (1 << bpc) - 1;
2239
2240
45.6k
            do {
2241
45.6k
                const int delta = dav1d_msac_decode_bools(&ts->msac, bits);
2242
45.6k
                prev = pal[i++] = imin(prev + delta + !pl, max);
2243
45.6k
                if (prev + !pl >= max) {
2244
25.0k
                    for (; i < pal_sz; i++)
2245
16.2k
                        pal[i] = max;
2246
8.73k
                    break;
2247
8.73k
                }
2248
36.9k
                bits = imin(bits, 1 + ulog2(max - prev - !pl));
2249
36.9k
            } while (i < pal_sz);
2250
19.3k
        }
2251
2252
        // merge cache+new entries
2253
21.3k
        int n = 0, m = n_used_cache;
2254
120k
        for (i = 0; i < pal_sz; i++) {
2255
98.7k
            if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) {
2256
15.4k
                pal[i] = used_cache[n++];
2257
83.2k
            } else {
2258
83.2k
                assert(m < pal_sz);
2259
83.2k
                pal[i] = pal[m++];
2260
83.2k
            }
2261
98.7k
        }
2262
21.3k
    } else {
2263
2.94k
        memcpy(pal, used_cache, n_used_cache * sizeof(*used_cache));
2264
2.94k
    }
2265
2266
24.2k
    if (DEBUG_BLOCK_INFO) {
2267
0
        printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=",
2268
0
               pl, pal_sz, n_cache, n_used_cache, ts->msac.rng);
2269
0
        for (int n = 0; n < n_cache; n++)
2270
0
            printf("%c%02x", n ? ' ' : '[', cache[n]);
2271
0
        printf("%s, pal=", n_cache ? "]" : "[]");
2272
0
        for (int n = 0; n < pal_sz; n++)
2273
0
            printf("%c%02x", n ? ' ' : '[', pal[n]);
2274
0
        printf("]\n");
2275
0
    }
2276
24.2k
}
2277
2278
/*
 * Decode the palettes of planes 1 and 2 for block `b` (the U and V palettes,
 * going by the function name and the "V pal coding" comment below).
 *
 * Plane 1 is read by dav1d_read_pal_plane() with pl = 1, which also sets
 * b->pal_sz[1]. Plane 2 has the same number of entries (b->pal_sz[1]) and is
 * read here: one boolean selects between delta coding (signed deltas from the
 * previous entry, wrapped to bpc bits) and raw bpc-bit values.
 *
 * Parameters:
 *  t        task context; provides the tile state, the frame context and the
 *           output buffers.
 *  b        block being parsed; its pal_sz[1] bounds the plane-2 loops.
 *  sz_ctx, bx4, by4
 *           forwarded unchanged to dav1d_read_pal_plane().
 *
 * Plane-2 output goes to f->frame_thread.pal[...][2] when
 * t->frame_thread.pass is non-zero, otherwise to t->scratch.pal[2].
 */
void bytefn(dav1d_read_pal_uv)(Dav1dTaskContext *const t, Av1Block *const b,
2279
                               const int sz_ctx, const int bx4, const int by4)
2280
13.1k
{
2281
13.1k
    // plane 1 uses the shared per-plane reader (cache reuse + deltas)
    bytefn(dav1d_read_pal_plane)(t, b, 1, sz_ctx, bx4, by4);
2282
2283
    // V pal coding
2284
13.1k
    Dav1dTileState *const ts = t->ts;
2285
13.1k
    const Dav1dFrameContext *const f = t->f;
2286
13.1k
    // same buffer selection as for plane 1, but for plane index 2
    pixel *const pal = t->frame_thread.pass ?
2287
13.1k
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
2288
13.1k
                            ((t->bx >> 1) + (t->by & 1))][2] :
2289
13.1k
        bytefn(t->scratch.pal)[2];
2290
13.1k
    const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
2291
13.1k
    // one boolean chooses delta coding (true) or raw values (false)
    if (dav1d_msac_decode_bool_equi(&ts->msac)) {
2292
6.64k
        // delta magnitude width: bpc - 4 plus a 2-bit value from the bitstream
        const int bits = bpc - 4 + dav1d_msac_decode_bools(&ts->msac, 2);
2293
6.64k
        int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, bpc);
2294
6.64k
        const int max = (1 << bpc) - 1;
2295
27.8k
        for (int i = 1; i < b->pal_sz[1]; i++) {
2296
21.1k
            int delta = dav1d_msac_decode_bools(&ts->msac, bits);
2297
21.1k
            // a sign flag is only read for non-zero magnitudes
            if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta;
2298
21.1k
            // masking wraps the sum modulo (1 << bpc) instead of clamping it
            prev = pal[i] = (prev + delta) & max;
2299
21.1k
        }
2300
6.64k
    } else {
2301
33.1k
        // raw mode: every entry is an independent bpc-bit value
        for (int i = 0; i < b->pal_sz[1]; i++)
2302
26.6k
            pal[i] = dav1d_msac_decode_bools(&ts->msac, bpc);
2303
6.50k
    }
2304
13.1k
    // optional trace: decoder range and the plane-2 palette just read
    if (DEBUG_BLOCK_INFO) {
2305
0
        printf("Post-pal[pl=2]: r=%d ", ts->msac.rng);
2306
0
        for (int n = 0; n < b->pal_sz[1]; n++)
2307
0
            printf("%c%02x", n ? ' ' : '[', pal[n]);
2308
0
        printf("]\n");
2309
0
    }
2310
13.1k
}
dav1d_read_pal_uv_8bpc
Line
Count
Source
2280
7.39k
{
2281
7.39k
    bytefn(dav1d_read_pal_plane)(t, b, 1, sz_ctx, bx4, by4);
2282
2283
    // V pal coding
2284
7.39k
    Dav1dTileState *const ts = t->ts;
2285
7.39k
    const Dav1dFrameContext *const f = t->f;
2286
7.39k
    pixel *const pal = t->frame_thread.pass ?
2287
7.39k
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
2288
7.39k
                            ((t->bx >> 1) + (t->by & 1))][2] :
2289
7.39k
        bytefn(t->scratch.pal)[2];
2290
7.39k
    const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
2291
7.39k
    if (dav1d_msac_decode_bool_equi(&ts->msac)) {
2292
3.64k
        const int bits = bpc - 4 + dav1d_msac_decode_bools(&ts->msac, 2);
2293
3.64k
        int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, bpc);
2294
3.64k
        const int max = (1 << bpc) - 1;
2295
15.5k
        for (int i = 1; i < b->pal_sz[1]; i++) {
2296
11.9k
            int delta = dav1d_msac_decode_bools(&ts->msac, bits);
2297
11.9k
            if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta;
2298
11.9k
            prev = pal[i] = (prev + delta) & max;
2299
11.9k
        }
2300
3.75k
    } else {
2301
19.2k
        for (int i = 0; i < b->pal_sz[1]; i++)
2302
15.5k
            pal[i] = dav1d_msac_decode_bools(&ts->msac, bpc);
2303
3.75k
    }
2304
7.39k
    if (DEBUG_BLOCK_INFO) {
2305
0
        printf("Post-pal[pl=2]: r=%d ", ts->msac.rng);
2306
0
        for (int n = 0; n < b->pal_sz[1]; n++)
2307
0
            printf("%c%02x", n ? ' ' : '[', pal[n]);
2308
0
        printf("]\n");
2309
0
    }
2310
7.39k
}
dav1d_read_pal_uv_16bpc
Line
Count
Source
2280
5.76k
{
2281
5.76k
    bytefn(dav1d_read_pal_plane)(t, b, 1, sz_ctx, bx4, by4);
2282
2283
    // V pal coding
2284
5.76k
    Dav1dTileState *const ts = t->ts;
2285
5.76k
    const Dav1dFrameContext *const f = t->f;
2286
5.76k
    pixel *const pal = t->frame_thread.pass ?
2287
5.76k
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
2288
5.76k
                            ((t->bx >> 1) + (t->by & 1))][2] :
2289
5.76k
        bytefn(t->scratch.pal)[2];
2290
5.76k
    const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
2291
5.76k
    if (dav1d_msac_decode_bool_equi(&ts->msac)) {
2292
3.00k
        const int bits = bpc - 4 + dav1d_msac_decode_bools(&ts->msac, 2);
2293
3.00k
        int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, bpc);
2294
3.00k
        const int max = (1 << bpc) - 1;
2295
12.2k
        for (int i = 1; i < b->pal_sz[1]; i++) {
2296
9.26k
            int delta = dav1d_msac_decode_bools(&ts->msac, bits);
2297
9.26k
            if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta;
2298
9.26k
            prev = pal[i] = (prev + delta) & max;
2299
9.26k
        }
2300
3.00k
    } else {
2301
13.8k
        for (int i = 0; i < b->pal_sz[1]; i++)
2302
11.1k
            pal[i] = dav1d_msac_decode_bools(&ts->msac, bpc);
2303
2.75k
    }
2304
5.76k
    if (DEBUG_BLOCK_INFO) {
2305
0
        printf("Post-pal[pl=2]: r=%d ", ts->msac.rng);
2306
0
        for (int n = 0; n < b->pal_sz[1]; n++)
2307
0
            printf("%c%02x", n ? ' ' : '[', pal[n]);
2308
0
        printf("]\n");
2309
0
    }
2310
5.76k
}