/work/dav1d/src/recon_tmpl.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright © 2018-2021, VideoLAN and dav1d authors |
3 | | * Copyright © 2018, Two Orioles, LLC |
4 | | * All rights reserved. |
5 | | * |
6 | | * Redistribution and use in source and binary forms, with or without |
7 | | * modification, are permitted provided that the following conditions are met: |
8 | | * |
9 | | * 1. Redistributions of source code must retain the above copyright notice, this |
10 | | * list of conditions and the following disclaimer. |
11 | | * |
12 | | * 2. Redistributions in binary form must reproduce the above copyright notice, |
13 | | * this list of conditions and the following disclaimer in the documentation |
14 | | * and/or other materials provided with the distribution. |
15 | | * |
16 | | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
17 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
18 | | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
19 | | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
20 | | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
21 | | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
22 | | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
23 | | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
24 | | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
25 | | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
26 | | */ |
27 | | |
28 | | #include "config.h" |
29 | | |
30 | | #include <string.h> |
31 | | #include <stdio.h> |
32 | | |
33 | | #include "common/attributes.h" |
34 | | #include "common/bitdepth.h" |
35 | | #include "common/dump.h" |
36 | | #include "common/frame.h" |
37 | | #include "common/intops.h" |
38 | | |
39 | | #include "src/cdef_apply.h" |
40 | | #include "src/ctx.h" |
41 | | #include "src/ipred_prepare.h" |
42 | | #include "src/lf_apply.h" |
43 | | #include "src/lr_apply.h" |
44 | | #include "src/recon.h" |
45 | | #include "src/scan.h" |
46 | | #include "src/tables.h" |
47 | | #include "src/wedge.h" |
48 | | |
/* Reads one Golomb-style coded value from the msac decoder.
 *
 * The code consists of a run of zero bits, a terminating non-zero bit and
 * then as many suffix bits as there were zeros. `val` starts at 1 so that it
 * acts as the implicit leading bit above the suffix; the final `- 1` makes
 * the shortest code (no zeros, no suffix) return 0. */
49 | 1.25M | static inline unsigned read_golomb(MsacContext *const msac) { |
50 | 1.25M | int len = 0; |
51 | 1.25M | unsigned val = 1; |
52 | | |
/* Prefix: count zero bits, capped at 32. The bit is decoded before the
 * `len < 32` test, so one more bit is consumed when the cap is reached. */
53 | 2.25M | while (!dav1d_msac_decode_bool_equi(msac) && len < 32) len++; |
/* Suffix: shift in `len` further bits, most significant first. */
54 | 2.25M | while (len--) val = (val << 1) + dav1d_msac_decode_bool_equi(msac); |
55 | | |
56 | 1.25M | return val - 1; |
57 | 1.25M | } |
58 | | |
/* Computes the context index that decode_coefs() uses to select the
 * ts->cdf.coef.skip[t_dim->ctx][...] CDF for the all-skip flag.
 *
 * t_dim:  dimensions of the transform; lw/lh select how many bytes of
 *         a/l are inspected.
 * bs:     block size, used to look up dav1d_block_dimensions[].
 * a, l:   context byte arrays (presumably the above and left neighbour
 *         contexts -- confirm against callers).
 * chroma: non-zero selects the chroma rule, zero the luma rule.
 * layout: pixel layout, only consulted for chroma to derive subsampling.
 *
 * Returns 7..12 for chroma, 0 for a luma transform that has exactly the
 * block's dimensions, and a dav1d_skip_ctx[][] entry otherwise.
 *
 * NOTE(review): a/l are read through wider integer pointer casts, so the
 * code relies on those loads being acceptable for the target (alignment,
 * aliasing) -- confirm against the build configuration. */
59 | | static inline unsigned get_skip_ctx(const TxfmInfo *const t_dim, |
60 | | const enum BlockSize bs, |
61 | | const uint8_t *const a, |
62 | | const uint8_t *const l, |
63 | | const int chroma, |
64 | | const enum Dav1dPixelLayout layout) |
65 | 9.34M | { |
66 | 9.34M | const uint8_t *const b_dim = dav1d_block_dimensions[bs]; |
67 | | |
68 | 9.34M | if (chroma) { |
69 | 4.92M | const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420; |
70 | 4.92M | const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444; |
/* Set when the block, after reducing each non-zero dimension by one for
 * subsampling, is still larger than the transform in either direction. */
71 | 4.92M | const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw || |
72 | 2.81M | b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh; |
73 | 4.92M | unsigned ca, cl; |
74 | | |
/* Chroma MERGE_CTX: sets ca/cl to 1 unless every inspected byte equals
 * 0x40, which is the value decode_coefs() stores in *res_ctx for an
 * all-skip block. The macro ends in `break` so each use is a complete
 * switch case. */
75 | 4.92M | #define MERGE_CTX(dir, type, no_val) \ |
76 | 9.87M | c##dir = *(const type *) dir != no_val; \ |
77 | 9.87M | break |
78 | | |
79 | 4.92M | switch (t_dim->lw) { |
80 | | /* For some reason the MSVC CRT _wassert() function is not flagged as |
81 | | * __declspec(noreturn), so when using those headers the compiler will |
82 | | * expect execution to continue after an assertion has been triggered |
83 | | * and will therefore complain about the use of uninitialized variables |
84 | | * when compiled in debug mode if we put the default case at the end. */ |
85 | 0 | default: assert(0); /* fall-through */ |
86 | 1.82M | case TX_4X4: MERGE_CTX(a, uint8_t, 0x40); |
87 | 901k | case TX_8X8: MERGE_CTX(a, uint16_t, 0x4040); |
88 | 705k | case TX_16X16: MERGE_CTX(a, uint32_t, 0x40404040U); |
89 | 1.50M | case TX_32X32: MERGE_CTX(a, uint64_t, 0x4040404040404040ULL); |
90 | 4.92M | } |
91 | 4.92M | switch (t_dim->lh) { |
92 | 0 | default: assert(0); /* fall-through */ |
93 | 2.03M | case TX_4X4: MERGE_CTX(l, uint8_t, 0x40); |
94 | 891k | case TX_8X8: MERGE_CTX(l, uint16_t, 0x4040); |
95 | 561k | case TX_16X16: MERGE_CTX(l, uint32_t, 0x40404040U); |
96 | 1.45M | case TX_32X32: MERGE_CTX(l, uint64_t, 0x4040404040404040ULL); |
97 | 4.92M | } |
98 | 4.92M | #undef MERGE_CTX |
99 | | |
/* Base 7, plus 3 when the block spans more than one transform, plus one
 * for each direction that is not entirely 0x40. */
100 | 4.92M | return 7 + not_one_blk * 3 + ca + cl; |
101 | 4.92M | } else if (b_dim[2] == t_dim->lw && b_dim[3] == t_dim->lh) { |
102 | 1.47M | return 0; |
103 | 2.95M | } else { |
104 | 2.95M | unsigned la, ll; |
105 | | |
/* Luma MERGE_CTX: ORs all inspected bytes of a/l into the low byte of
 * la/ll. TX_64X64 ORs two 64-bit loads and folds the halves to 32 bits;
 * TX_32X32 ORs in a second load at offset sizeof(type); the shifts by 16
 * and 8 then fold the remaining bytes down. Ends in `break` like the
 * chroma variant. */
106 | 2.95M | #define MERGE_CTX(dir, type, tx) \ |
107 | 6.09M | if (tx == TX_64X64) { \ |
108 | 497k | uint64_t tmp = *(const uint64_t *) dir; \ |
109 | 497k | tmp |= *(const uint64_t *) &dir[8]; \ |
110 | 497k | l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \ |
111 | 497k | } else \ |
112 | 6.09M | l##dir = *(const type *) dir; \ |
113 | 6.09M | if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \ |
114 | 6.09M | if (tx >= TX_16X16) l##dir |= l##dir >> 16; \ |
115 | 6.09M | if (tx >= TX_8X8) l##dir |= l##dir >> 8; \ |
116 | 6.09M | break |
117 | | |
118 | 2.95M | switch (t_dim->lw) { |
119 | 0 | default: assert(0); /* fall-through */ |
120 | 1.88M | case TX_4X4: MERGE_CTX(a, uint8_t, TX_4X4); |
121 | 470k | case TX_8X8: MERGE_CTX(a, uint16_t, TX_8X8); |
122 | 407k | case TX_16X16: MERGE_CTX(a, uint32_t, TX_16X16); |
123 | 35.3k | case TX_32X32: MERGE_CTX(a, uint32_t, TX_32X32); |
124 | 248k | case TX_64X64: MERGE_CTX(a, uint32_t, TX_64X64); |
125 | 2.95M | } |
126 | 3.04M | switch (t_dim->lh) { |
127 | 0 | default: assert(0); /* fall-through */ |
128 | 1.90M | case TX_4X4: MERGE_CTX(l, uint8_t, TX_4X4); |
129 | 458k | case TX_8X8: MERGE_CTX(l, uint16_t, TX_8X8); |
130 | 405k | case TX_16X16: MERGE_CTX(l, uint32_t, TX_16X16); |
131 | 35.8k | case TX_32X32: MERGE_CTX(l, uint32_t, TX_32X32); |
132 | 248k | case TX_64X64: MERGE_CTX(l, uint32_t, TX_64X64); |
133 | 3.04M | } |
134 | 3.04M | #undef MERGE_CTX |
135 | | |
/* Only the low 6 bits of the merged byte are used (decode_coefs() keeps
 * the clamped cul_level there); each index is further clamped to 4. */
136 | 3.04M | return dav1d_skip_ctx[umin(la & 0x3F, 4)][umin(ll & 0x3F, 4)]; |
137 | 3.04M | } |
138 | 9.34M | } |
139 | | |
/* Computes the context index (0, 1 or 2) that decode_coefs() uses to pick
 * the ts->cdf.coef.dc_sign[chroma][...] CDF.
 *
 * tx:   transform size; selects how many bytes of `a` (transform width)
 *       and `l` (transform height) are examined.
 * a, l: context byte arrays. Only bits 6-7 of each byte are used. Assuming
 *       the bytes were produced through decode_coefs()'s *res_ctx, those
 *       bits hold 0 for a negative DC, 1 for a zero DC or all-skip block
 *       and 2 for a positive DC -- confirm against callers.
 *
 * Every case sums the 2-bit fields of all examined bytes and subtracts the
 * number of bytes examined (the two trailing constants: bytes from `a`,
 * then bytes from `l`), leaving `s` as the sum of (field - 1).
 *
 * Returns 0 when s == 0, 1 when s < 0 and 2 when s > 0.
 *
 * NOTE(review): a/l are read through wider integer pointer casts; this
 * relies on such loads being acceptable for the target -- confirm. */
140 | | static inline unsigned get_dc_sign_ctx(const int /*enum RectTxfmSize*/ tx, |
141 | | const uint8_t *const a, |
142 | | const uint8_t *const l) |
143 | 4.18M | { |
/* mask keeps bits 6-7 of every byte; mul has a 1 in every byte, so a
 * multiply by it accumulates all byte lanes into the top byte. */
144 | 4.18M | uint64_t mask = 0xC0C0C0C0C0C0C0C0ULL, mul = 0x0101010101010101ULL; |
145 | 4.18M | int s; |
146 | | |
147 | 4.18M | #if ARCH_X86_64 && defined(__GNUC__) |
148 | | /* Coerce compilers into producing better code. For some reason |
149 | | * every x86-64 compiler is awful at handling 64-bit constants. */ |
150 | 4.18M | __asm__("" : "+r"(mask), "+r"(mul)); |
151 | 4.18M | #endif |
152 | | |
153 | 4.18M | switch(tx) { |
154 | 0 | default: assert(0); /* fall-through */ |
/* One byte from each side: the fields can simply be added. */
155 | 1.85M | case TX_4X4: { |
156 | 1.85M | int t = *(const uint8_t *) a >> 6; |
157 | 1.85M | t += *(const uint8_t *) l >> 6; |
158 | 1.85M | s = t - 1 - 1; |
159 | 1.85M | break; |
160 | 0 | } |
/* The fields are left at bit 6 here; multiplying by 0x04040404 both
 * moves them to a byte boundary and sums the lanes into bits 24-31. */
161 | 367k | case TX_8X8: { |
162 | 367k | uint32_t t = *(const uint16_t *) a & (uint32_t) mask; |
163 | 367k | t += *(const uint16_t *) l & (uint32_t) mask; |
164 | 367k | t *= 0x04040404U; |
165 | 367k | s = (int) (t >> 24) - 2 - 2; |
166 | 367k | break; |
167 | 0 | } |
/* From here on the fields are shifted down by 6 (before or after the
 * addition) and the lanes are summed by multiplying with `mul`. */
168 | 295k | case TX_16X16: { |
169 | 295k | uint32_t t = (*(const uint32_t *) a & (uint32_t) mask) >> 6; |
170 | 295k | t += (*(const uint32_t *) l & (uint32_t) mask) >> 6; |
171 | 295k | t *= (uint32_t) mul; |
172 | 295k | s = (int) (t >> 24) - 4 - 4; |
173 | 295k | break; |
174 | 0 | } |
175 | 445k | case TX_32X32: { |
176 | 445k | uint64_t t = (*(const uint64_t *) a & mask) >> 6; |
177 | 445k | t += (*(const uint64_t *) l & mask) >> 6; |
178 | 445k | t *= mul; |
179 | 445k | s = (int) (t >> 56) - 8 - 8; |
180 | 445k | break; |
181 | 0 | } |
/* 64-wide/high sides need two 8-byte loads (offsets 0 and 8). */
182 | 199k | case TX_64X64: { |
183 | 199k | uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6; |
184 | 199k | t += (*(const uint64_t *) &a[8] & mask) >> 6; |
185 | 199k | t += (*(const uint64_t *) &l[0] & mask) >> 6; |
186 | 199k | t += (*(const uint64_t *) &l[8] & mask) >> 6; |
187 | 199k | t *= mul; |
188 | 199k | s = (int) (t >> 56) - 16 - 16; |
189 | 199k | break; |
190 | 0 | } |
/* Rectangular sizes: same scheme, with different load widths for a and l. */
191 | 113k | case RTX_4X8: { |
192 | 113k | uint32_t t = *(const uint8_t *) a & (uint32_t) mask; |
193 | 113k | t += *(const uint16_t *) l & (uint32_t) mask; |
194 | 113k | t *= 0x04040404U; |
195 | 113k | s = (int) (t >> 24) - 1 - 2; |
196 | 113k | break; |
197 | 0 | } |
198 | 170k | case RTX_8X4: { |
199 | 170k | uint32_t t = *(const uint16_t *) a & (uint32_t) mask; |
200 | 170k | t += *(const uint8_t *) l & (uint32_t) mask; |
201 | 170k | t *= 0x04040404U; |
202 | 170k | s = (int) (t >> 24) - 2 - 1; |
203 | 170k | break; |
204 | 0 | } |
205 | 101k | case RTX_8X16: { |
206 | 101k | uint32_t t = *(const uint16_t *) a & (uint32_t) mask; |
207 | 101k | t += *(const uint32_t *) l & (uint32_t) mask; |
208 | 101k | t = (t >> 6) * (uint32_t) mul; |
209 | 101k | s = (int) (t >> 24) - 2 - 4; |
210 | 101k | break; |
211 | 0 | } |
212 | 184k | case RTX_16X8: { |
213 | 184k | uint32_t t = *(const uint32_t *) a & (uint32_t) mask; |
214 | 184k | t += *(const uint16_t *) l & (uint32_t) mask; |
215 | 184k | t = (t >> 6) * (uint32_t) mul; |
216 | 184k | s = (int) (t >> 24) - 4 - 2; |
217 | 184k | break; |
218 | 0 | } |
219 | 62.1k | case RTX_16X32: { |
220 | 62.1k | uint64_t t = *(const uint32_t *) a & (uint32_t) mask; |
221 | 62.1k | t += *(const uint64_t *) l & mask; |
222 | 62.1k | t = (t >> 6) * mul; |
223 | 62.1k | s = (int) (t >> 56) - 4 - 8; |
224 | 62.1k | break; |
225 | 0 | } |
226 | 96.1k | case RTX_32X16: { |
227 | 96.1k | uint64_t t = *(const uint64_t *) a & mask; |
228 | 96.1k | t += *(const uint32_t *) l & (uint32_t) mask; |
229 | 96.1k | t = (t >> 6) * mul; |
230 | 96.1k | s = (int) (t >> 56) - 8 - 4; |
231 | 96.1k | break; |
232 | 0 | } |
233 | 37.8k | case RTX_32X64: { |
234 | 37.8k | uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6; |
235 | 37.8k | t += (*(const uint64_t *) &l[0] & mask) >> 6; |
236 | 37.8k | t += (*(const uint64_t *) &l[8] & mask) >> 6; |
237 | 37.8k | t *= mul; |
238 | 37.8k | s = (int) (t >> 56) - 8 - 16; |
239 | 37.8k | break; |
240 | 0 | } |
241 | 35.2k | case RTX_64X32: { |
242 | 35.2k | uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6; |
243 | 35.2k | t += (*(const uint64_t *) &a[8] & mask) >> 6; |
244 | 35.2k | t += (*(const uint64_t *) &l[0] & mask) >> 6; |
245 | 35.2k | t *= mul; |
246 | 35.2k | s = (int) (t >> 56) - 16 - 8; |
247 | 35.2k | break; |
248 | 0 | } |
249 | 50.2k | case RTX_4X16: { |
250 | 50.2k | uint32_t t = *(const uint8_t *) a & (uint32_t) mask; |
251 | 50.2k | t += *(const uint32_t *) l & (uint32_t) mask; |
252 | 50.2k | t = (t >> 6) * (uint32_t) mul; |
253 | 50.2k | s = (int) (t >> 24) - 1 - 4; |
254 | 50.2k | break; |
255 | 0 | } |
256 | 98.8k | case RTX_16X4: { |
257 | 98.8k | uint32_t t = *(const uint32_t *) a & (uint32_t) mask; |
258 | 98.8k | t += *(const uint8_t *) l & (uint32_t) mask; |
259 | 98.8k | t = (t >> 6) * (uint32_t) mul; |
260 | 98.8k | s = (int) (t >> 24) - 4 - 1; |
261 | 98.8k | break; |
262 | 0 | } |
263 | 30.5k | case RTX_8X32: { |
264 | 30.5k | uint64_t t = *(const uint16_t *) a & (uint32_t) mask; |
265 | 30.5k | t += *(const uint64_t *) l & mask; |
266 | 30.5k | t = (t >> 6) * mul; |
267 | 30.5k | s = (int) (t >> 56) - 2 - 8; |
268 | 30.5k | break; |
269 | 0 | } |
270 | 44.1k | case RTX_32X8: { |
271 | 44.1k | uint64_t t = *(const uint64_t *) a & mask; |
272 | 44.1k | t += *(const uint16_t *) l & (uint32_t) mask; |
273 | 44.1k | t = (t >> 6) * mul; |
274 | 44.1k | s = (int) (t >> 56) - 8 - 2; |
275 | 44.1k | break; |
276 | 0 | } |
277 | 8.97k | case RTX_16X64: { |
278 | 8.97k | uint64_t t = *(const uint32_t *) a & (uint32_t) mask; |
279 | 8.97k | t += *(const uint64_t *) &l[0] & mask; |
280 | 8.97k | t = (t >> 6) + ((*(const uint64_t *) &l[8] & mask) >> 6); |
281 | 8.97k | t *= mul; |
282 | 8.97k | s = (int) (t >> 56) - 4 - 16; |
283 | 8.97k | break; |
284 | 0 | } |
285 | 8.58k | case RTX_64X16: { |
286 | 8.58k | uint64_t t = *(const uint64_t *) &a[0] & mask; |
287 | 8.58k | t += *(const uint32_t *) l & (uint32_t) mask; |
288 | 8.58k | t = (t >> 6) + ((*(const uint64_t *) &a[8] & mask) >> 6); |
289 | 8.58k | t *= mul; |
290 | 8.58k | s = (int) (t >> 56) - 16 - 4; |
291 | 8.58k | break; |
292 | 0 | } |
293 | 4.18M | } |
294 | | |
/* Map the sign of s to a context: 0 -> 0, negative -> 1, positive -> 2. */
295 | 4.18M | return (s != 0) + (s > 0); |
296 | 4.18M | } |
297 | | |
/* Computes the context index used by decode_coefs() to select the
 * lo_cdf[] entry for one coefficient position, from the already stored
 * level bytes around it.
 *
 * levels:      pointer to the level byte of the current position; entries
 *              at positive offsets (+1, +2, ..., +stride, ...) are read.
 * tx_class:    TX_CLASS_2D selects the two-dimensional neighbourhood, any
 *              other class a one-dimensional run of four entries.
 * hi_mag:      receives the partial sum of the first three neighbours; the
 *              caller masks it with 63 before deriving the hi-token
 *              context.
 * ctx_offsets: 5x5 offset table, only dereferenced for TX_CLASS_2D (the
 *              1D callers pass NULL).
 * x, y:        position of the coefficient.
 * stride:      distance in bytes between consecutive rows of `levels`.
 *
 * Returns the position-dependent offset plus a magnitude term in 0..4. */
298 | | static inline unsigned get_lo_ctx(const uint8_t *const levels, |
299 | | const enum TxClass tx_class, |
300 | | unsigned *const hi_mag, |
301 | | const uint8_t (*const ctx_offsets)[5], |
302 | | const unsigned x, const unsigned y, |
303 | | const ptrdiff_t stride) |
304 | 76.9M | { |
/* Both classes start from the two directly adjacent entries. */
305 | 76.9M | unsigned mag = levels[0 * stride + 1] + levels[1 * stride + 0]; |
306 | 76.9M | unsigned offset; |
307 | 76.9M | if (tx_class == TX_CLASS_2D) { |
308 | 72.9M | mag += levels[1 * stride + 1]; |
309 | 72.9M | *hi_mag = mag; |
310 | 72.9M | mag += levels[0 * stride + 2] + levels[2 * stride + 0]; |
311 | 72.9M | offset = ctx_offsets[umin(y, 4)][umin(x, 4)]; |
312 | 72.9M | } else { |
313 | 3.98M | mag += levels[0 * stride + 2]; |
314 | 3.98M | *hi_mag = mag; |
315 | 3.98M | mag += levels[0 * stride + 3] + levels[0 * stride + 4]; |
/* 1D offsets: 26 for y == 0, 31 for y == 1, 36 for everything else. */
316 | 3.98M | offset = 26 + (y > 1 ? 10 : y * 5); |
317 | 3.98M | } |
/* Rounded mag / 128, saturated at 4. decode_coefs() documents the level
 * bytes as "bits 0-5: tok, 6-7: lo_tok". */
318 | 76.9M | return offset + (mag > 512 ? 4 : (mag + 64) >> 7); |
319 | 76.9M | } |
320 | | |
/*
 * Decodes the coefficients of one transform block into cf[] and reports
 * the block's context byte through *res_ctx.
 *
 * Steps, in order: all-skip flag, transform type, end-of-block position,
 * base/high tokens for every position up to eob (DECODE_COEFS_CLASS),
 * then Golomb residuals, signs and dequantization.
 *
 * t:       task context; uses t->ts (msac decoder state, CDFs, dq tables),
 *          t->f (frame header, qm tables, layout, bpc) and
 *          t->scratch.levels.
 * a, l:    context byte arrays handed to get_skip_ctx() and
 *          get_dc_sign_ctx(); presumably the above/left neighbours --
 *          confirm against callers.
 * tx:      transform size; indexes dav1d_txfm_dimensions[], dav1d_scans[]
 *          and f->qm[].
 * bs:      block size, forwarded to get_skip_ctx().
 * b:       block being decoded; seg_id, uv_mode, y_mode and y_angle are
 *          read.
 * intra:   non-zero selects the intra transform-type paths.
 * plane:   0 selects the luma CDFs, anything else the chroma ones
 *          (chroma = !!plane); also indexes ts->dq[][] and f->qm[][].
 * cf:      coefficient buffer. When eob > 0 it first holds, per visited
 *          position, (tok << 11) | next_rc (or 0), and is then overwritten
 *          with the signed, dequantized values.
 * txtp:    transform type. Written on every path; additionally read as
 *          input when an inter chroma type is derived through
 *          get_uv_inter_txtp().
 * res_ctx: receives 0x40 for an all-skip block, otherwise
 *          umin(cul_level, 63) | dc_sign_level.
 *
 * Returns -1 for an all-skip block, otherwise the decoded eob value.
 */
321 | | static int decode_coefs(Dav1dTaskContext *const t, |
322 | | uint8_t *const a, uint8_t *const l, |
323 | | const enum RectTxfmSize tx, const enum BlockSize bs, |
324 | | const Av1Block *const b, const int intra, |
325 | | const int plane, coef *cf, |
326 | | enum TxfmType *const txtp, uint8_t *res_ctx) |
327 | 9.33M | { |
328 | 9.33M | Dav1dTileState *const ts = t->ts; |
329 | 9.33M | const int chroma = !!plane; |
330 | 9.33M | const Dav1dFrameContext *const f = t->f; |
331 | 9.33M | const int lossless = f->frame_hdr->segmentation.lossless[b->seg_id]; |
332 | 9.33M | const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx]; |
/* The trailing `&& 0` forces dbg to 0, so every printf trace below is
 * dead code unless this expression is edited. */
333 | 9.33M | const int dbg = DEBUG_BLOCK_INFO && plane && 0; |
334 | | |
335 | 9.33M | if (dbg) |
336 | 0 | printf("Start: r=%d\n", ts->msac.rng); |
337 | | |
338 | | // does this block have any non-zero coefficients |
339 | 9.33M | const int sctx = get_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.layout); |
340 | 9.33M | const int all_skip = dav1d_msac_decode_bool_adapt(&ts->msac, |
341 | 9.33M | ts->cdf.coef.skip[t_dim->ctx][sctx]); |
342 | 9.33M | if (dbg) |
343 | 0 | printf("Post-non-zero[%d][%d][%d]: r=%d\n", |
344 | 0 | t_dim->ctx, sctx, all_skip, ts->msac.rng); |
/* 0x40 equals a zero cul_level combined with the "zero DC" sign code
 * (1 << 6) that is also used further below when dc_tok is 0. */
345 | 9.33M | if (all_skip) { |
346 | 4.45M | *res_ctx = 0x40; |
347 | 4.45M | *txtp = lossless * WHT_WHT; /* lossless ? WHT_WHT : DCT_DCT */ |
348 | 4.45M | return -1; |
349 | 4.45M | } |
350 | | |
351 | | // transform type (chroma: derived, luma: explicitly coded) |
352 | 4.88M | if (lossless) { |
353 | 1.76M | assert(t_dim->max == TX_4X4); |
354 | 1.76M | *txtp = WHT_WHT; |
355 | 3.11M | } else if (t_dim->max + intra >= TX_64X64) { |
356 | 981k | *txtp = DCT_DCT; |
357 | 2.13M | } else if (chroma) { |
358 | | // inferred from either the luma txtp (inter) or a LUT (intra) |
359 | 587k | *txtp = intra ? dav1d_txtp_from_uvmode[b->uv_mode] : |
360 | 587k | get_uv_inter_txtp(t_dim, *txtp); |
361 | 1.54M | } else if (!f->frame_hdr->segmentation.qidx[b->seg_id]) { |
362 | | // In libaom, lossless is checked by a literal qidx == 0, but not all |
363 | | // such blocks are actually lossless. The remainder gets an implicit |
364 | | // transform type (for luma) |
365 | 18.7k | *txtp = DCT_DCT; |
366 | 1.52M | } else { |
/* Explicitly coded luma transform type: the decoded symbol `idx` is
 * mapped through dav1d_tx_types_per_set[] at a set-specific base offset
 * (0, 5, 12 or 24), or directly to DCT_DCT/IDTX for the two-way set. */
367 | 1.52M | unsigned idx; |
368 | 1.52M | if (intra) { |
369 | 1.22M | const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ? |
370 | 1.04M | dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode; |
371 | 1.22M | if (f->frame_hdr->reduced_txtp_set || t_dim->min == TX_16X16) { |
372 | 535k | idx = dav1d_msac_decode_symbol_adapt8(&ts->msac, |
373 | 535k | ts->cdf.m.txtp_intra2[t_dim->min][y_mode_nofilt], 4); |
374 | 535k | *txtp = dav1d_tx_types_per_set[idx + 0]; |
375 | 690k | } else { |
376 | 690k | idx = dav1d_msac_decode_symbol_adapt8(&ts->msac, |
377 | 690k | ts->cdf.m.txtp_intra1[t_dim->min][y_mode_nofilt], 6); |
378 | 690k | *txtp = dav1d_tx_types_per_set[idx + 5]; |
379 | 690k | } |
380 | 1.22M | if (dbg) |
381 | 0 | printf("Post-txtp-intra[%d->%d][%d][%d->%d]: r=%d\n", |
382 | 0 | tx, t_dim->min, y_mode_nofilt, idx, *txtp, ts->msac.rng); |
383 | 1.22M | } else { |
384 | 299k | if (f->frame_hdr->reduced_txtp_set || t_dim->max == TX_32X32) { |
385 | 115k | idx = dav1d_msac_decode_bool_adapt(&ts->msac, |
386 | 115k | ts->cdf.m.txtp_inter3[t_dim->min]); |
387 | 115k | *txtp = (idx - 1) & IDTX; /* idx ? DCT_DCT : IDTX */ |
388 | 183k | } else if (t_dim->min == TX_16X16) { |
389 | 25.3k | idx = dav1d_msac_decode_symbol_adapt16(&ts->msac, |
390 | 25.3k | ts->cdf.m.txtp_inter2, 11); |
391 | 25.3k | *txtp = dav1d_tx_types_per_set[idx + 12]; |
392 | 158k | } else { |
393 | 158k | idx = dav1d_msac_decode_symbol_adapt16(&ts->msac, |
394 | 158k | ts->cdf.m.txtp_inter1[t_dim->min], 15); |
395 | 158k | *txtp = dav1d_tx_types_per_set[idx + 24]; |
396 | 158k | } |
397 | 299k | if (dbg) |
398 | 0 | printf("Post-txtp-inter[%d->%d][%d->%d]: r=%d\n", |
399 | 0 | tx, t_dim->min, idx, *txtp, ts->msac.rng); |
400 | 299k | } |
401 | 1.52M | } |
402 | | |
403 | | // find end-of-block (eob) |
404 | 4.88M | int eob; |
/* slw/slh are the log2 dimensions clamped to the TX_32X32 value; their sum
 * (0..6) selects which eob_bin_* CDF family is used. */
405 | 4.88M | const int slw = imin(t_dim->lw, TX_32X32), slh = imin(t_dim->lh, TX_32X32); |
406 | 4.88M | const int tx2dszctx = slw + slh; |
407 | 4.88M | const enum TxClass tx_class = dav1d_tx_type_class[*txtp]; |
408 | 4.88M | const int is_1d = tx_class != TX_CLASS_2D; |
409 | 4.88M | switch (tx2dszctx) { |
/* case_sz(sz, bin, ns, is_1d): decodes the eob bin for size context `sz`
 * from ts->cdf.coef.eob_bin_<bin>, using the adapt<ns> symbol decoder.
 * The last argument is pasted as an optional extra subscript: the two
 * largest families are invoked without it. */
410 | 0 | #define case_sz(sz, bin, ns, is_1d) \ |
411 | 4.93M | case sz: { \ |
412 | 4.93M | uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma]is_1d; \ |
413 | 4.93M | eob = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \ |
414 | 4.93M | break; \ |
415 | 4.93M | } |
416 | 2.12M | case_sz(0, 16, 8, [is_1d]); |
417 | 367k | case_sz(1, 32, 8, [is_1d]); |
418 | 659k | case_sz(2, 64, 8, [is_1d]); |
419 | 368k | case_sz(3, 128, 8, [is_1d]); |
420 | 460k | case_sz(4, 256, 16, [is_1d]); |
421 | 202k | case_sz(5, 512, 16, ); |
422 | 757k | case_sz(6, 1024, 16, ); |
423 | 4.88M | #undef case_sz |
424 | 4.88M | } |
425 | 4.89M | if (dbg) |
426 | 0 | printf("Post-eob_bin_%d[%d][%d][%d]: r=%d\n", |
427 | 0 | 16 << tx2dszctx, chroma, is_1d, eob, ts->msac.rng); |
/* For bins above 1 the final eob is assembled as: a leading 1 bit, the
 * adaptively coded eob_hi_bit, then the value returned by
 * dav1d_msac_decode_bools() for eob_bin (presumably that many raw bits --
 * confirm against the msac implementation). */
428 | 4.89M | if (eob > 1) { |
429 | 2.83M | const int eob_bin = eob - 2; |
430 | 2.83M | uint16_t *const eob_hi_bit_cdf = |
431 | 2.83M | ts->cdf.coef.eob_hi_bit[t_dim->ctx][chroma][eob_bin]; |
432 | 2.83M | const int eob_hi_bit = dav1d_msac_decode_bool_adapt(&ts->msac, eob_hi_bit_cdf); |
433 | 2.83M | if (dbg) |
434 | 0 | printf("Post-eob_hi_bit[%d][%d][%d][%d]: r=%d\n", |
435 | 0 | t_dim->ctx, chroma, eob_bin, eob_hi_bit, ts->msac.rng); |
436 | 2.83M | eob = ((eob_hi_bit | 2) << eob_bin) | dav1d_msac_decode_bools(&ts->msac, eob_bin); |
437 | 2.83M | if (dbg) |
438 | 0 | printf("Post-eob[%d]: r=%d\n", eob, ts->msac.rng); |
439 | 2.83M | } |
440 | 4.89M | assert(eob >= 0); |
441 | | |
442 | | // base tokens |
443 | 4.89M | uint16_t (*const eob_cdf)[4] = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma]; |
444 | 4.89M | uint16_t (*const hi_cdf)[4] = ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma]; |
/* rc: position of the next non-zero AC coefficient to process (head of
 * the chain built in cf[]); dc_tok: token of the DC coefficient. */
445 | 4.89M | unsigned rc, dc_tok; |
446 | | |
447 | 4.89M | if (eob) { |
448 | 2.95M | uint16_t (*const lo_cdf)[4] = ts->cdf.coef.base_tok[t_dim->ctx][chroma]; |
449 | 2.95M | uint8_t *const levels = t->scratch.levels; // bits 0-5: tok, 6-7: lo_tok |
450 | | |
451 | | /* eob */ |
452 | 2.95M | unsigned ctx = 1 + (eob > 2 << tx2dszctx) + (eob > 4 << tx2dszctx); |
453 | 2.95M | int eob_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[ctx], 2); |
454 | 2.95M | int tok = eob_tok + 1; |
455 | 2.95M | int level_tok = tok * 0x41; |
456 | 2.95M | unsigned mag; |
457 | | |
/* DECODE_COEFS_CLASS(tx_class): token decoding for one transform class.
 * It is expanded once per switch case below with a constant argument and
 * relies on the case-local scan/stride/shift/shift2/mask/lo_ctx_offsets.
 *
 *  1. Coefficient at index `eob`: derive x/y/rc, optionally decode a high
 *     token, store tok << 11 in cf[rc] (link field 0 ends the chain) and
 *     the level byte in levels[].
 *  2. Indices eob-1 .. 1: derive the lo context with get_lo_ctx(), decode
 *     the token and, for non-zero tokens, store (tok << 11) | rc in
 *     cf[rc_i] and make rc_i the new chain head; zero tokens store 0.
 *  3. DC: decode dc_tok (context 0 for 2D, get_lo_ctx() otherwise) and,
 *     if it is 3, the high token.
 *
 * The macro ends in `break`, terminating the enclosing switch case. */
458 | 2.95M | #define DECODE_COEFS_CLASS(tx_class) \ |
459 | 2.96M | unsigned x, y; \ |
460 | 2.96M | uint8_t *level; \ |
461 | 2.96M | if (tx_class == TX_CLASS_2D) \ |
462 | 2.96M | rc = scan[eob], x = rc >> shift, y = rc & mask; \ |
463 | 2.96M | else if (tx_class == TX_CLASS_H) \ |
464 | | /* Transposing reduces the stride and padding requirements */ \ |
465 | 187k | x = eob & mask, y = eob >> shift, rc = eob; \ |
466 | 187k | else /* tx_class == TX_CLASS_V */ \ |
467 | 187k | x = eob & mask, y = eob >> shift, rc = (x << shift2) | y; \ |
468 | 2.96M | if (dbg) \ |
469 | 2.96M | printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \ |
470 | 0 | t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng); \ |
471 | 2.96M | if (eob_tok == 2) { \ |
472 | 75.7k | ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : y != 0) ? 14 : 7; \ |
473 | 75.7k | tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \ |
474 | 75.7k | level_tok = tok + (3 << 6); \ |
475 | 75.7k | if (dbg) \ |
476 | 75.7k | printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \ |
477 | 0 | imin(t_dim->ctx, 3), chroma, ctx, eob, rc, tok, \ |
478 | 0 | ts->msac.rng); \ |
479 | 75.7k | } \ |
480 | 2.96M | cf[rc] = tok << 11; \ |
481 | 2.96M | if (tx_class == TX_CLASS_2D) \ |
482 | 2.96M | level = levels + rc; \ |
483 | 2.96M | else \ |
484 | 2.96M | level = levels + x * stride + y; \ |
485 | 2.96M | *level = (uint8_t) level_tok; \ |
486 | 79.8M | for (int i = eob - 1; i > 0; i--) { /* ac */ \ |
487 | 76.8M | unsigned rc_i; \ |
488 | 76.8M | if (tx_class == TX_CLASS_2D) \ |
489 | 76.8M | rc_i = scan[i], x = rc_i >> shift, y = rc_i & mask; \ |
490 | 76.8M | else if (tx_class == TX_CLASS_H) \ |
491 | 3.97M | x = i & mask, y = i >> shift, rc_i = i; \ |
492 | 3.97M | else /* tx_class == TX_CLASS_V */ \ |
493 | 3.97M | x = i & mask, y = i >> shift, rc_i = (x << shift2) | y; \ |
494 | 76.8M | assert(x < 32 && y < 32); \ |
495 | 76.8M | if (tx_class == TX_CLASS_2D) \ |
496 | 76.8M | level = levels + rc_i; \ |
497 | 76.8M | else \ |
498 | 76.8M | level = levels + x * stride + y; \ |
499 | 76.8M | ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \ |
500 | 76.8M | if (tx_class == TX_CLASS_2D) \ |
501 | 76.8M | y |= x; \ |
502 | 76.8M | tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \ |
503 | 76.8M | if (dbg) \ |
504 | 76.8M | printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \ |
505 | 0 | t_dim->ctx, chroma, ctx, i, rc_i, tok, ts->msac.rng); \ |
506 | 76.8M | if (tok == 3) { \ |
507 | 6.64M | mag &= 63; \ |
508 | 6.64M | ctx = (y > (tx_class == TX_CLASS_2D) ? 14 : 7) + \ |
509 | 6.64M | (mag > 12 ? 6 : (mag + 1) >> 1); \ |
510 | 6.64M | tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \ |
511 | 6.64M | if (dbg) \ |
512 | 6.64M | printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \ |
513 | 0 | imin(t_dim->ctx, 3), chroma, ctx, i, rc_i, tok, \ |
514 | 0 | ts->msac.rng); \ |
515 | 6.64M | *level = (uint8_t) (tok + (3 << 6)); \ |
516 | 6.64M | cf[rc_i] = (tok << 11) | rc; \ |
517 | 6.64M | rc = rc_i; \ |
518 | 70.2M | } else { \ |
519 | | /* 0x1 for tok, 0x7ff as bitmask for rc, 0x41 for level_tok */ \ |
520 | 70.2M | tok *= 0x17ff41; \ |
521 | 70.2M | *level = (uint8_t) tok; \ |
522 | | /* tok ? (tok << 11) | rc : 0 */ \ |
523 | 70.2M | tok = (tok >> 9) & (rc + ~0x7ffu); \ |
524 | 70.2M | if (tok) rc = rc_i; \ |
525 | 70.2M | cf[rc_i] = tok; \ |
526 | 70.2M | } \ |
527 | 76.8M | } \ |
528 | | /* dc */ \ |
529 | 2.96M | ctx = (tx_class == TX_CLASS_2D) ? 0 : \ |
530 | 2.96M | get_lo_ctx(levels, tx_class, &mag, lo_ctx_offsets, 0, 0, stride); \ |
531 | 2.96M | dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \ |
532 | 2.96M | if (dbg) \ |
533 | 2.96M | printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n", \ |
534 | 0 | t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng); \ |
535 | 2.96M | if (dc_tok == 3) { \ |
536 | 1.16M | if (tx_class == TX_CLASS_2D) \ |
537 | 1.16M | mag = levels[0 * stride + 1] + levels[1 * stride + 0] + \ |
538 | 1.14M | levels[1 * stride + 1]; \ |
539 | 1.16M | mag &= 63; \ |
540 | 1.16M | ctx = mag > 12 ? 6 : (mag + 1) >> 1; \ |
541 | 1.16M | dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \ |
542 | 1.16M | if (dbg) \ |
543 | 1.16M | printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n", \ |
544 | 0 | imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng); \ |
545 | 1.16M | } \ |
546 | 2.96M | break |
547 | | |
/* Each case defines the geometry the macro needs and zeroes the part of
 * the levels scratch buffer it will read, including two extra rows of
 * padding. Only the 2D case uses a scan table and context offsets. */
548 | 2.95M | const uint16_t *scan; |
549 | 2.95M | switch (tx_class) { |
550 | 2.76M | case TX_CLASS_2D: { |
551 | 2.76M | const unsigned nonsquare_tx = tx >= RTX_4X8; |
552 | 2.76M | const uint8_t (*const lo_ctx_offsets)[5] = |
553 | 2.76M | dav1d_lo_ctx_offsets[nonsquare_tx + (tx & nonsquare_tx)]; |
554 | 2.76M | scan = dav1d_scans[tx]; |
555 | 2.76M | const ptrdiff_t stride = 4 << slh; |
556 | 2.76M | const unsigned shift = slh + 2, shift2 = 0; |
557 | 2.76M | const unsigned mask = (4 << slh) - 1; |
558 | 2.76M | memset(levels, 0, stride * ((4 << slw) + 2)); |
559 | 2.76M | DECODE_COEFS_CLASS(TX_CLASS_2D); |
560 | 2.76M | } |
561 | 130k | case TX_CLASS_H: { |
562 | 130k | const uint8_t (*const lo_ctx_offsets)[5] = NULL; |
563 | 130k | const ptrdiff_t stride = 16; |
564 | 130k | const unsigned shift = slh + 2, shift2 = 0; |
565 | 130k | const unsigned mask = (4 << slh) - 1; |
566 | 130k | memset(levels, 0, stride * ((4 << slh) + 2)); |
567 | 130k | DECODE_COEFS_CLASS(TX_CLASS_H); |
568 | 130k | } |
569 | 68.7k | case TX_CLASS_V: { |
570 | 68.7k | const uint8_t (*const lo_ctx_offsets)[5] = NULL; |
571 | 68.7k | const ptrdiff_t stride = 16; |
572 | 68.7k | const unsigned shift = slw + 2, shift2 = slh + 2; |
573 | 68.7k | const unsigned mask = (4 << slw) - 1; |
574 | 68.7k | memset(levels, 0, stride * ((4 << slw) + 2)); |
575 | 68.7k | DECODE_COEFS_CLASS(TX_CLASS_V); |
576 | 68.7k | } |
577 | 0 | #undef DECODE_COEFS_CLASS |
578 | 0 | default: assert(0); |
579 | 2.95M | } |
580 | 2.95M | } else { // dc-only |
581 | 1.93M | int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[0], 2); |
582 | 1.93M | dc_tok = 1 + tok_br; |
583 | 1.93M | if (dbg) |
584 | 0 | printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n", |
585 | 0 | t_dim->ctx, chroma, 0, dc_tok, ts->msac.rng); |
586 | 1.93M | if (tok_br == 2) { |
587 | 53.3k | dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[0]); |
588 | 53.3k | if (dbg) |
589 | 0 | printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n", |
590 | 0 | imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng); |
591 | 53.3k | } |
/* Empty AC chain: the `if (rc)` tests below skip both AC loops. */
592 | 1.93M | rc = 0; |
593 | 1.93M | } |
594 | | |
595 | | // residual and sign |
596 | 4.90M | const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane]; |
/* A quantizer matrix is only applied for transform types below IDTX. */
597 | 4.90M | const uint8_t *const qm_tbl = *txtp < IDTX ? f->qm[tx][plane] : NULL; |
598 | 4.90M | const int dq_shift = imax(0, t_dim->ctx - 2); |
/* cf_max = (1 << (7 + bits)) - 1 with bits = 8 for 8-bit builds and
 * f->cur.p.bpc otherwise. The clamps below use cf_max + sign, allowing
 * one extra step of magnitude for negative values. */
599 | 4.90M | const int cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc)); |
600 | 4.90M | unsigned cul_level, dc_sign_level; |
601 | | |
/* Zero DC: no sign or residual is decoded for it. Jump straight to the
 * matching AC loop; the labels sit inside the `if (rc)` bodies, so the
 * rc test itself is bypassed. */
602 | 4.90M | if (!dc_tok) { |
603 | 711k | cul_level = 0; |
604 | 711k | dc_sign_level = 1 << 6; |
605 | 711k | if (qm_tbl) goto ac_qm; |
606 | 504k | goto ac_noqm; |
607 | 711k | } |
608 | | |
609 | 4.19M | const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l); |
610 | 4.19M | uint16_t *const dc_sign_cdf = ts->cdf.coef.dc_sign[chroma][dc_sign_ctx]; |
611 | 4.19M | const int dc_sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf); |
612 | 4.19M | if (dbg) |
613 | 0 | printf("Post-dc_sign[%d][%d][%d]: r=%d\n", |
614 | 0 | chroma, dc_sign_ctx, dc_sign, ts->msac.rng); |
615 | | |
616 | 4.19M | int dc_dq = dq_tbl[0]; |
/* 0x80 for a positive DC (dc_sign == 0), 0x00 for a negative one. */
617 | 4.19M | dc_sign_level = (dc_sign - 1) & (2 << 6); |
618 | | |
619 | 4.19M | if (qm_tbl) { |
620 | 1.22M | dc_dq = (dc_dq * qm_tbl[0] + 16) >> 5; |
621 | | |
/* Token 15 is the escape value: the remainder is Golomb coded. */
622 | 1.22M | if (dc_tok == 15) { |
623 | 40.0k | dc_tok = read_golomb(&ts->msac) + 15; |
624 | 40.0k | if (dbg) |
625 | 0 | printf("Post-dc_residual[%d->%d]: r=%d\n", |
626 | 0 | dc_tok - 15, dc_tok, ts->msac.rng); |
627 | | |
628 | 40.0k | dc_tok &= 0xfffff; |
629 | 40.0k | dc_dq = (dc_dq * dc_tok) & 0xffffff; |
630 | 1.18M | } else { |
631 | 1.18M | dc_dq *= dc_tok; |
632 | 1.18M | assert(dc_dq <= 0xffffff); |
633 | 1.18M | } |
634 | 1.22M | cul_level = dc_tok; |
635 | 1.22M | dc_dq >>= dq_shift; |
636 | 1.22M | dc_dq = umin(dc_dq, cf_max + dc_sign); |
637 | 1.22M | cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq); |
638 | | |
/* AC loop with quantizer matrix; also the goto target for a zero DC. */
639 | 1.66M | if (rc) ac_qm: { |
640 | 1.66M | const unsigned ac_dq = dq_tbl[1]; |
641 | 13.3M | do { |
642 | 13.3M | const int sign = dav1d_msac_decode_bool_equi(&ts->msac); |
643 | 13.3M | if (dbg) |
644 | 0 | printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng); |
/* cf[rc] still holds (tok << 11) | next_rc from the token pass. */
645 | 13.3M | const unsigned rc_tok = cf[rc]; |
646 | 13.3M | unsigned tok, dq = (ac_dq * qm_tbl[rc] + 16) >> 5; |
647 | 13.3M | int dq_sat; |
648 | | |
649 | 13.3M | if (rc_tok >= (15 << 11)) { |
650 | 690k | tok = read_golomb(&ts->msac) + 15; |
651 | 690k | if (dbg) |
652 | 0 | printf("Post-residual[%d=%d->%d]: r=%d\n", |
653 | 0 | rc, tok - 15, tok, ts->msac.rng); |
654 | | |
655 | 690k | tok &= 0xfffff; |
656 | 690k | dq = (dq * tok) & 0xffffff; |
657 | 12.6M | } else { |
658 | 12.6M | tok = rc_tok >> 11; |
659 | 12.6M | dq *= tok; |
660 | 12.6M | assert(dq <= 0xffffff); |
661 | 12.6M | } |
662 | 13.3M | cul_level += tok; |
663 | 13.3M | dq >>= dq_shift; |
664 | 13.3M | dq_sat = umin(dq, cf_max + sign); |
665 | 13.3M | cf[rc] = (coef) (sign ? -dq_sat : dq_sat); |
666 | | |
/* Follow the chain link; 0 terminates the loop. */
667 | 13.3M | rc = rc_tok & 0x3ff; |
668 | 13.3M | } while (rc); |
669 | 1.66M | } |
670 | 2.96M | } else { |
671 | | // non-qmatrix is the common case and allows for additional optimizations |
672 | 2.96M | if (dc_tok == 15) { |
673 | 108k | dc_tok = read_golomb(&ts->msac) + 15; |
674 | 108k | if (dbg) |
675 | 0 | printf("Post-dc_residual[%d->%d]: r=%d\n", |
676 | 0 | dc_tok - 15, dc_tok, ts->msac.rng); |
677 | | |
678 | 108k | dc_tok &= 0xfffff; |
679 | 108k | dc_dq = ((dc_dq * dc_tok) & 0xffffff) >> dq_shift; |
680 | 108k | dc_dq = umin(dc_dq, cf_max + dc_sign); |
681 | 2.86M | } else { |
682 | 2.86M | dc_dq = ((dc_dq * dc_tok) >> dq_shift); |
683 | 2.86M | assert(dc_dq <= cf_max); |
684 | 2.86M | } |
685 | 2.96M | cul_level = dc_tok; |
686 | 2.96M | cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq); |
687 | | |
/* AC loop without quantizer matrix; also the goto target for a zero DC. */
688 | 3.59M | if (rc) ac_noqm: { |
689 | 3.59M | const unsigned ac_dq = dq_tbl[1]; |
690 | 18.2M | do { |
691 | 18.2M | const int sign = dav1d_msac_decode_bool_equi(&ts->msac); |
692 | 18.2M | if (dbg) |
693 | 0 | printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng); |
694 | 18.2M | const unsigned rc_tok = cf[rc]; |
695 | 18.2M | unsigned tok; |
696 | 18.2M | int dq; |
697 | | |
698 | | // residual |
699 | 18.2M | if (rc_tok >= (15 << 11)) { |
700 | 418k | tok = read_golomb(&ts->msac) + 15; |
701 | 418k | if (dbg) |
702 | 0 | printf("Post-residual[%d=%d->%d]: r=%d\n", |
703 | 0 | rc, tok - 15, tok, ts->msac.rng); |
704 | | |
705 | | // coefficient parsing, see 5.11.39 |
706 | 418k | tok &= 0xfffff; |
707 | | |
708 | | // dequant, see 7.12.3 |
709 | 418k | dq = ((ac_dq * tok) & 0xffffff) >> dq_shift; |
710 | 418k | dq = umin(dq, cf_max + sign); |
711 | 17.8M | } else { |
712 | | // cannot exceed cf_max, so we can avoid the clipping |
713 | 17.8M | tok = rc_tok >> 11; |
714 | 17.8M | dq = ((ac_dq * tok) >> dq_shift); |
715 | 17.8M | assert(dq <= cf_max); |
716 | 17.8M | } |
717 | 18.2M | cul_level += tok; |
718 | 18.2M | cf[rc] = (coef) (sign ? -dq : dq); |
719 | | |
720 | 18.2M | rc = rc_tok & 0x3ff; // next non-zero rc, zero if eob |
721 | 18.2M | } while (rc); |
722 | 3.59M | } |
723 | 2.96M | } |
724 | | |
725 | | // context |
/* Low 6 bits: sum of all tokens clamped to 63; bits 6-7: DC sign code. */
726 | 4.87M | *res_ctx = umin(cul_level, 63) | dc_sign_level; |
727 | | |
728 | 4.87M | return eob; |
729 | 4.19M | } |
730 | | |
/*
 * Recursively walk the luma transform split tree for one transform block
 * and handle each leaf transform.
 *
 * t         task context; t->bx / t->by are advanced temporarily while
 *           visiting sub-transforms and restored before returning.
 * bs, b     block size and block info, forwarded to decode_coefs().
 * ytx       transform size of the current tree node.
 * depth     current split depth; splitting is only considered for depth < 2.
 * tx_split  split masks indexed by depth; bit (y_off * 4 + x_off) says
 *           whether the node at that position is split further.
 * x_off,
 * y_off     position of this node inside the split mask.
 * dst       destination pixels for reconstruction; may be NULL when the
 *           caller only parses (it is asserted non-NULL whenever
 *           frame_thread.pass is even, i.e. when reconstruction runs).
 *
 * Split node: recurses into up to four children of size t_dim->sub. The
 * right-hand children are only visited when txw >= txh and the bottom ones
 * only when txh >= txw; children starting at or beyond f->bw / f->bh are
 * skipped.
 *
 * Leaf node:
 *  - selects the coefficient buffer: the per-pass frame-thread buffer when
 *    frame_thread.pass is non-zero (its pointer is advanced by
 *    imin(w, 8) * imin(h, 8) * 16), otherwise t->cf;
 *  - when pass != 2, decodes the coefficients, updates the above/left
 *    lcoef contexts and writes txtp into t->scratch.txtp_map for txh rows;
 *    in pass 1 it also appends eob * 32 + txtp to frame_thread[1].cbi;
 *  - when pass == 2, reads eob/txtp back from frame_thread[0].cbi;
 *  - when pass is even and eob >= 0, calls the inverse transform which adds
 *    into dst.
 */
731 | | static void read_coef_tree(Dav1dTaskContext *const t, |
732 | | const enum BlockSize bs, const Av1Block *const b, |
733 | | const enum RectTxfmSize ytx, const int depth, |
734 | | const uint16_t *const tx_split, |
735 | | const int x_off, const int y_off, pixel *dst) |
736 | 1.14M | { |
737 | 1.14M | const Dav1dFrameContext *const f = t->f; |
738 | 1.14M | Dav1dTileState *const ts = t->ts; |
739 | 1.14M | const Dav1dDSPContext *const dsp = f->dsp; |
740 | 1.14M | const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[ytx]; |
741 | 1.14M | const int txw = t_dim->w, txh = t_dim->h; |
742 | | |
743 | | /* y_off can be larger than 3 since lossless blocks use TX_4X4 but can't |
744 | | * be split. Avoids an undefined left shift. */ |
745 | 1.14M | if (depth < 2 && tx_split[depth] && |
746 | 103k | tx_split[depth] & (1 << (y_off * 4 + x_off))) |
747 | 80.0k | { |
748 | 80.0k | const enum RectTxfmSize sub = t_dim->sub; |
749 | 80.0k | const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub]; |
750 | 80.0k | const int txsw = sub_t_dim->w, txsh = sub_t_dim->h; |
751 | | |
752 | 80.0k | read_coef_tree(t, bs, b, sub, depth + 1, tx_split, |
753 | 80.0k | x_off * 2 + 0, y_off * 2 + 0, dst); |
754 | 80.0k | t->bx += txsw; |
755 | 80.0k | if (txw >= txh && t->bx < f->bw) |
756 | 57.3k | read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1, |
757 | 57.3k | y_off * 2 + 0, dst ? &dst[4 * txsw] : NULL); |
758 | 80.0k | t->bx -= txsw; |
759 | 80.0k | t->by += txsh; |
760 | 80.0k | if (txh >= txw && t->by < f->bh) { |
761 | 54.6k | if (dst) |
762 | 15.5k | dst += 4 * txsh * PXSTRIDE(f->cur.stride[0]); |
763 | 54.6k | read_coef_tree(t, bs, b, sub, depth + 1, tx_split, |
764 | 54.6k | x_off * 2 + 0, y_off * 2 + 1, dst); |
765 | 54.6k | t->bx += txsw; |
766 | 54.6k | if (txw >= txh && t->bx < f->bw) |
767 | 33.1k | read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1, |
768 | 33.1k | y_off * 2 + 1, dst ? &dst[4 * txsw] : NULL); |
769 | 54.6k | t->bx -= txsw; |
770 | 54.6k | } |
771 | 80.0k | t->by -= txsh; |
772 | 1.06M | } else { |
773 | 1.06M | const int bx4 = t->bx & 31, by4 = t->by & 31; |
774 | 1.06M | enum TxfmType txtp; |
775 | 1.06M | uint8_t cf_ctx; |
776 | 1.06M | int eob; |
777 | 1.06M | coef *cf; |
778 | | |
779 | 1.06M | if (t->frame_thread.pass) { |
780 | 1.06M | const int p = t->frame_thread.pass & 1; |
781 | 1.06M | assert(ts->frame_thread[p].cf); |
782 | 1.06M | cf = ts->frame_thread[p].cf; |
783 | 1.06M | ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16; |
784 | 1.06M | } else { |
785 | 10 | cf = bitfn(t->cf); |
786 | 10 | } |
787 | 1.06M | if (t->frame_thread.pass != 2) { |
788 | 740k | eob = decode_coefs(t, &t->a->lcoef[bx4], &t->l.lcoef[by4], |
789 | 740k | ytx, bs, b, 0, 0, cf, &txtp, &cf_ctx); |
790 | 740k | if (DEBUG_BLOCK_INFO) |
791 | 0 | printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n", |
792 | 0 | ytx, txtp, eob, ts->msac.rng); |
793 | 740k | dav1d_memset_likely_pow2(&t->a->lcoef[bx4], cf_ctx, imin(txw, f->bw - t->bx)); |
794 | 740k | dav1d_memset_likely_pow2(&t->l.lcoef[by4], cf_ctx, imin(txh, f->bh - t->by)); |
795 | 740k | #define set_ctx(rep_macro) \ |
796 | 2.77M | for (int y = 0; y < txh; y++) { \ |
797 | 2.03M | rep_macro(txtp_map, 0, txtp); \ |
798 | 2.03M | txtp_map += 32; \ |
799 | 2.03M | } |
800 | 740k | uint8_t *txtp_map = &t->scratch.txtp_map[by4 * 32 + bx4]; |
801 | 740k | case_set_upto16(t_dim->lw); |
802 | 740k | #undef set_ctx |
803 | 740k | if (t->frame_thread.pass == 1) |
804 | 740k | *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp; |
805 | 740k | } else { |
806 | 322k | const int cbi = *ts->frame_thread[0].cbi++; |
807 | 322k | eob = cbi >> 5; |
808 | 322k | txtp = cbi & 0x1f; |
809 | 322k | } |
810 | 1.06M | if (!(t->frame_thread.pass & 1)) { |
811 | 323k | assert(dst); |
812 | 323k | if (eob >= 0) { |
813 | 210k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) |
814 | 0 | coef_dump(cf, imin(t_dim->h, 8) * 4, imin(t_dim->w, 8) * 4, 3, "dq"); |
815 | 210k | dsp->itx.itxfm_add[ytx][txtp](dst, f->cur.stride[0], cf, eob |
816 | 210k | HIGHBD_CALL_SUFFIX); |
817 | 210k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) |
818 | 0 | hex_dump(dst, f->cur.stride[0], t_dim->w * 4, t_dim->h * 4, "recon"); |
819 | 210k | } |
820 | 323k | } |
821 | 1.06M | } |
822 | 1.14M | } |
823 | | |
/*
 * Parse (but do not reconstruct) all coefficients of one block; the body
 * asserts that it only runs with frame_thread.pass == 1.
 *
 * t   task context; t->bx / t->by are advanced while iterating over the
 *     transform blocks and restored afterwards.
 * bs  block size, used to look up dav1d_block_dimensions[].
 * b   block info (skip, intra, tx, max_ytx, uvtx, tx_split0/1 are read).
 *
 * Chroma is handled only when the layout is not I400 and the block either
 * spans more than the subsampling factor or sits at an odd t->bx / t->by
 * (see has_chroma below).
 *
 * Skipped blocks only reset the above/left coefficient contexts to 0x40
 * (luma, plus both chroma planes when has_chroma) and return.
 *
 * Otherwise the block is processed in steps of 16 units in each direction
 * (init_x / init_y). Within each step:
 *  - luma: inter blocks go through read_coef_tree() with a NULL dst; intra
 *    blocks call decode_coefs() directly, append eob * 32 + txtp to
 *    frame_thread[1].cbi, advance frame_thread[1].cf by
 *    imin(w, 8) * imin(h, 8) * 16 and update the above/left lcoef contexts;
 *  - chroma (both planes, when has_chroma): each uvtx-sized transform is
 *    decoded the same way; for inter blocks the transform type is taken
 *    from t->scratch.txtp_map at the co-located luma position, and the cf
 *    pointer advances by uv_t_dim->w * uv_t_dim->h * 16.
 * Context updates are clipped to the part of the transform inside the
 * frame (f->bw / f->bh).
 */
824 | | void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t, |
825 | | const enum BlockSize bs, const Av1Block *const b) |
826 | 4.62M | { |
827 | 4.62M | const Dav1dFrameContext *const f = t->f; |
828 | 4.62M | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; |
829 | 4.62M | const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; |
830 | 4.62M | const int bx4 = t->bx & 31, by4 = t->by & 31; |
831 | 4.62M | const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver; |
832 | 4.62M | const uint8_t *const b_dim = dav1d_block_dimensions[bs]; |
833 | 4.62M | const int bw4 = b_dim[0], bh4 = b_dim[1]; |
834 | 4.62M | const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver; |
835 | 4.62M | const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 && |
836 | 4.14M | (bw4 > ss_hor || t->bx & 1) && |
837 | 3.84M | (bh4 > ss_ver || t->by & 1); |
838 | | |
839 | 4.62M | if (b->skip) { |
840 | 2.79M | BlockContext *const a = t->a; |
841 | 2.79M | dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40); |
842 | 2.79M | dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40); |
843 | 2.79M | if (has_chroma) { |
844 | 2.12M | dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)]; |
845 | 2.12M | dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)]; |
846 | 2.12M | memset_cw(&a->ccoef[0][cbx4], 0x40); |
847 | 2.12M | memset_cw(&a->ccoef[1][cbx4], 0x40); |
848 | 2.12M | memset_ch(&t->l.ccoef[0][cby4], 0x40); |
849 | 2.12M | memset_ch(&t->l.ccoef[1][cby4], 0x40); |
850 | 2.12M | } |
851 | 2.79M | return; |
852 | 2.79M | } |
853 | | |
854 | 1.82M | Dav1dTileState *const ts = t->ts; |
855 | 1.82M | const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by); |
856 | 1.82M | const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver; |
857 | 1.82M | assert(t->frame_thread.pass == 1); |
858 | 1.82M | assert(!b->skip); |
859 | 1.82M | const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx]; |
860 | 1.82M | const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? b->tx : b->max_ytx]; |
861 | 1.82M | const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 }; |
862 | | |
863 | 3.72M | for (int init_y = 0; init_y < h4; init_y += 16) { |
864 | 1.90M | const int sub_h4 = imin(h4, 16 + init_y); |
865 | 3.92M | for (int init_x = 0; init_x < w4; init_x += 16) { |
866 | 2.02M | const int sub_w4 = imin(w4, init_x + 16); |
867 | 2.02M | int y_off = !!init_y, y, x; |
868 | 4.45M | for (y = init_y, t->by += init_y; y < sub_h4; |
869 | 2.42M | y += t_dim->h, t->by += t_dim->h, y_off++) |
870 | 2.42M | { |
871 | 2.42M | int x_off = !!init_x; |
872 | 6.79M | for (x = init_x, t->bx += init_x; x < sub_w4; |
873 | 4.36M | x += t_dim->w, t->bx += t_dim->w, x_off++) |
874 | 4.36M | { |
875 | 4.36M | if (!b->intra) { |
876 | 633k | read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split, |
877 | 633k | x_off, y_off, NULL); |
878 | 3.73M | } else { |
879 | 3.73M | uint8_t cf_ctx = 0x40; |
880 | 3.73M | enum TxfmType txtp; |
881 | 3.73M | const int eob = |
882 | 3.73M | decode_coefs(t, &t->a->lcoef[bx4 + x], |
883 | 3.73M | &t->l.lcoef[by4 + y], b->tx, bs, b, 1, |
884 | 3.73M | 0, ts->frame_thread[1].cf, &txtp, &cf_ctx); |
885 | 3.73M | if (DEBUG_BLOCK_INFO) |
886 | 0 | printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n", |
887 | 0 | b->tx, txtp, eob, ts->msac.rng); |
888 | 3.73M | *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp; |
889 | 3.73M | ts->frame_thread[1].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16; |
890 | 3.73M | dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx)); |
891 | 3.73M | dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by)); |
892 | 3.73M | } |
893 | 4.36M | } |
894 | 2.42M | t->bx -= x; |
895 | 2.42M | } |
896 | 2.02M | t->by -= y; |
897 | | |
898 | 2.02M | if (!has_chroma) continue; |
899 | | |
900 | 1.63M | const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver); |
901 | 1.63M | const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor); |
902 | 4.88M | for (int pl = 0; pl < 2; pl++) { |
903 | 6.94M | for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4; |
904 | 3.69M | y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver) |
905 | 3.69M | { |
906 | 8.57M | for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4; |
907 | 4.88M | x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor) |
908 | 4.88M | { |
909 | 4.88M | uint8_t cf_ctx = 0x40; |
910 | 4.88M | enum TxfmType txtp; |
911 | 4.88M | if (!b->intra) |
912 | 880k | txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 + |
913 | 880k | bx4 + (x << ss_hor)]; |
914 | 4.88M | const int eob = |
915 | 4.88M | decode_coefs(t, &t->a->ccoef[pl][cbx4 + x], |
916 | 4.88M | &t->l.ccoef[pl][cby4 + y], b->uvtx, bs, |
917 | 4.88M | b, b->intra, 1 + pl, ts->frame_thread[1].cf, |
918 | 4.88M | &txtp, &cf_ctx); |
919 | 4.88M | if (DEBUG_BLOCK_INFO) |
920 | 0 | printf("Post-uv-cf-blk[pl=%d,tx=%d," |
921 | 0 | "txtp=%d,eob=%d]: r=%d\n", |
922 | 0 | pl, b->uvtx, txtp, eob, ts->msac.rng); |
923 | 4.88M | *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp; |
924 | 4.88M | ts->frame_thread[1].cf += uv_t_dim->w * uv_t_dim->h * 16; |
925 | 4.88M | int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor); |
926 | 4.88M | int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver); |
927 | 4.88M | dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw); |
928 | 4.88M | dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth); |
929 | 4.88M | } |
930 | 3.69M | t->bx -= x << ss_hor; |
931 | 3.69M | } |
932 | 3.25M | t->by -= y << ss_ver; |
933 | 3.25M | } |
934 | 1.63M | } |
935 | 1.90M | } |
936 | 1.82M | } dav1d_read_coef_blocks_8bpc Line | Count | Source | 826 | 2.25M | { | 827 | 2.25M | const Dav1dFrameContext *const f = t->f; | 828 | 2.25M | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 829 | 2.25M | const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; | 830 | 2.25M | const int bx4 = t->bx & 31, by4 = t->by & 31; | 831 | 2.25M | const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver; | 832 | 2.25M | const uint8_t *const b_dim = dav1d_block_dimensions[bs]; | 833 | 2.25M | const int bw4 = b_dim[0], bh4 = b_dim[1]; | 834 | 2.25M | const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver; | 835 | 2.25M | const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 && | 836 | 2.05M | (bw4 > ss_hor || t->bx & 1) && | 837 | 1.90M | (bh4 > ss_ver || t->by & 1); | 838 | | | 839 | 2.25M | if (b->skip) { | 840 | 1.41M | BlockContext *const a = t->a; | 841 | 1.41M | dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40); | 842 | 1.41M | dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40); | 843 | 1.41M | if (has_chroma) { | 844 | 1.09M | dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)]; | 845 | 1.09M | dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)]; | 846 | 1.09M | memset_cw(&a->ccoef[0][cbx4], 0x40); | 847 | 1.09M | memset_cw(&a->ccoef[1][cbx4], 0x40); | 848 | 1.09M | memset_ch(&t->l.ccoef[0][cby4], 0x40); | 849 | 1.09M | memset_ch(&t->l.ccoef[1][cby4], 0x40); | 850 | 1.09M | } | 851 | 1.41M | return; | 852 | 1.41M | } | 853 | | | 854 | 839k | Dav1dTileState *const ts = t->ts; | 855 | 839k | const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by); | 856 | 839k | const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver; | 857 | 839k | assert(t->frame_thread.pass == 1); | 858 | 839k | assert(!b->skip); | 859 | 839k | const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx]; | 860 | 839k | const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra 
? b->tx : b->max_ytx]; | 861 | 839k | const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 }; | 862 | | | 863 | 1.71M | for (int init_y = 0; init_y < h4; init_y += 16) { | 864 | 874k | const int sub_h4 = imin(h4, 16 + init_y); | 865 | 1.78M | for (int init_x = 0; init_x < w4; init_x += 16) { | 866 | 915k | const int sub_w4 = imin(w4, init_x + 16); | 867 | 915k | int y_off = !!init_y, y, x; | 868 | 2.03M | for (y = init_y, t->by += init_y; y < sub_h4; | 869 | 1.11M | y += t_dim->h, t->by += t_dim->h, y_off++) | 870 | 1.11M | { | 871 | 1.11M | int x_off = !!init_x; | 872 | 3.38M | for (x = init_x, t->bx += init_x; x < sub_w4; | 873 | 2.27M | x += t_dim->w, t->bx += t_dim->w, x_off++) | 874 | 2.27M | { | 875 | 2.27M | if (!b->intra) { | 876 | 388k | read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split, | 877 | 388k | x_off, y_off, NULL); | 878 | 1.88M | } else { | 879 | 1.88M | uint8_t cf_ctx = 0x40; | 880 | 1.88M | enum TxfmType txtp; | 881 | 1.88M | const int eob = | 882 | 1.88M | decode_coefs(t, &t->a->lcoef[bx4 + x], | 883 | 1.88M | &t->l.lcoef[by4 + y], b->tx, bs, b, 1, | 884 | 1.88M | 0, ts->frame_thread[1].cf, &txtp, &cf_ctx); | 885 | 1.88M | if (DEBUG_BLOCK_INFO) | 886 | 0 | printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n", | 887 | 0 | b->tx, txtp, eob, ts->msac.rng); | 888 | 1.88M | *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp; | 889 | 1.88M | ts->frame_thread[1].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16; | 890 | 1.88M | dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx)); | 891 | 1.88M | dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by)); | 892 | 1.88M | } | 893 | 2.27M | } | 894 | 1.11M | t->bx -= x; | 895 | 1.11M | } | 896 | 915k | t->by -= y; | 897 | | | 898 | 915k | if (!has_chroma) continue; | 899 | | | 900 | 756k | const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver); | 901 | 756k | const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor); | 902 | 2.26M | for (int 
pl = 0; pl < 2; pl++) { | 903 | 3.22M | for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4; | 904 | 1.71M | y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver) | 905 | 1.71M | { | 906 | 4.01M | for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4; | 907 | 2.30M | x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor) | 908 | 2.30M | { | 909 | 2.30M | uint8_t cf_ctx = 0x40; | 910 | 2.30M | enum TxfmType txtp; | 911 | 2.30M | if (!b->intra) | 912 | 502k | txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 + | 913 | 502k | bx4 + (x << ss_hor)]; | 914 | 2.30M | const int eob = | 915 | 2.30M | decode_coefs(t, &t->a->ccoef[pl][cbx4 + x], | 916 | 2.30M | &t->l.ccoef[pl][cby4 + y], b->uvtx, bs, | 917 | 2.30M | b, b->intra, 1 + pl, ts->frame_thread[1].cf, | 918 | 2.30M | &txtp, &cf_ctx); | 919 | 2.30M | if (DEBUG_BLOCK_INFO) | 920 | 0 | printf("Post-uv-cf-blk[pl=%d,tx=%d," | 921 | 0 | "txtp=%d,eob=%d]: r=%d\n", | 922 | 0 | pl, b->uvtx, txtp, eob, ts->msac.rng); | 923 | 2.30M | *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp; | 924 | 2.30M | ts->frame_thread[1].cf += uv_t_dim->w * uv_t_dim->h * 16; | 925 | 2.30M | int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor); | 926 | 2.30M | int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver); | 927 | 2.30M | dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw); | 928 | 2.30M | dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth); | 929 | 2.30M | } | 930 | 1.71M | t->bx -= x << ss_hor; | 931 | 1.71M | } | 932 | 1.50M | t->by -= y << ss_ver; | 933 | 1.50M | } | 934 | 756k | } | 935 | 874k | } | 936 | 839k | } |
dav1d_read_coef_blocks_16bpc Line | Count | Source | 826 | 2.37M | { | 827 | 2.37M | const Dav1dFrameContext *const f = t->f; | 828 | 2.37M | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 829 | 2.37M | const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; | 830 | 2.37M | const int bx4 = t->bx & 31, by4 = t->by & 31; | 831 | 2.37M | const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver; | 832 | 2.37M | const uint8_t *const b_dim = dav1d_block_dimensions[bs]; | 833 | 2.37M | const int bw4 = b_dim[0], bh4 = b_dim[1]; | 834 | 2.37M | const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver; | 835 | 2.37M | const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 && | 836 | 2.09M | (bw4 > ss_hor || t->bx & 1) && | 837 | 1.94M | (bh4 > ss_ver || t->by & 1); | 838 | | | 839 | 2.37M | if (b->skip) { | 840 | 1.38M | BlockContext *const a = t->a; | 841 | 1.38M | dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40); | 842 | 1.38M | dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40); | 843 | 1.38M | if (has_chroma) { | 844 | 1.02M | dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)]; | 845 | 1.02M | dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)]; | 846 | 1.02M | memset_cw(&a->ccoef[0][cbx4], 0x40); | 847 | 1.02M | memset_cw(&a->ccoef[1][cbx4], 0x40); | 848 | 1.02M | memset_ch(&t->l.ccoef[0][cby4], 0x40); | 849 | 1.02M | memset_ch(&t->l.ccoef[1][cby4], 0x40); | 850 | 1.02M | } | 851 | 1.38M | return; | 852 | 1.38M | } | 853 | | | 854 | 982k | Dav1dTileState *const ts = t->ts; | 855 | 982k | const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by); | 856 | 982k | const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver; | 857 | 982k | assert(t->frame_thread.pass == 1); | 858 | 982k | assert(!b->skip); | 859 | 982k | const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx]; | 860 | 982k | const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? 
b->tx : b->max_ytx]; | 861 | 982k | const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 }; | 862 | | | 863 | 2.01M | for (int init_y = 0; init_y < h4; init_y += 16) { | 864 | 1.03M | const int sub_h4 = imin(h4, 16 + init_y); | 865 | 2.13M | for (int init_x = 0; init_x < w4; init_x += 16) { | 866 | 1.10M | const int sub_w4 = imin(w4, init_x + 16); | 867 | 1.10M | int y_off = !!init_y, y, x; | 868 | 2.41M | for (y = init_y, t->by += init_y; y < sub_h4; | 869 | 1.31M | y += t_dim->h, t->by += t_dim->h, y_off++) | 870 | 1.31M | { | 871 | 1.31M | int x_off = !!init_x; | 872 | 3.40M | for (x = init_x, t->bx += init_x; x < sub_w4; | 873 | 2.09M | x += t_dim->w, t->bx += t_dim->w, x_off++) | 874 | 2.09M | { | 875 | 2.09M | if (!b->intra) { | 876 | 244k | read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split, | 877 | 244k | x_off, y_off, NULL); | 878 | 1.84M | } else { | 879 | 1.84M | uint8_t cf_ctx = 0x40; | 880 | 1.84M | enum TxfmType txtp; | 881 | 1.84M | const int eob = | 882 | 1.84M | decode_coefs(t, &t->a->lcoef[bx4 + x], | 883 | 1.84M | &t->l.lcoef[by4 + y], b->tx, bs, b, 1, | 884 | 1.84M | 0, ts->frame_thread[1].cf, &txtp, &cf_ctx); | 885 | 1.84M | if (DEBUG_BLOCK_INFO) | 886 | 0 | printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n", | 887 | 0 | b->tx, txtp, eob, ts->msac.rng); | 888 | 1.84M | *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp; | 889 | 1.84M | ts->frame_thread[1].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16; | 890 | 1.84M | dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx)); | 891 | 1.84M | dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by)); | 892 | 1.84M | } | 893 | 2.09M | } | 894 | 1.31M | t->bx -= x; | 895 | 1.31M | } | 896 | 1.10M | t->by -= y; | 897 | | | 898 | 1.10M | if (!has_chroma) continue; | 899 | | | 900 | 876k | const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver); | 901 | 876k | const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor); | 902 | 2.62M | for 
(int pl = 0; pl < 2; pl++) { | 903 | 3.72M | for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4; | 904 | 1.98M | y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver) | 905 | 1.98M | { | 906 | 4.56M | for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4; | 907 | 2.57M | x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor) | 908 | 2.57M | { | 909 | 2.57M | uint8_t cf_ctx = 0x40; | 910 | 2.57M | enum TxfmType txtp; | 911 | 2.57M | if (!b->intra) | 912 | 377k | txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 + | 913 | 377k | bx4 + (x << ss_hor)]; | 914 | 2.57M | const int eob = | 915 | 2.57M | decode_coefs(t, &t->a->ccoef[pl][cbx4 + x], | 916 | 2.57M | &t->l.ccoef[pl][cby4 + y], b->uvtx, bs, | 917 | 2.57M | b, b->intra, 1 + pl, ts->frame_thread[1].cf, | 918 | 2.57M | &txtp, &cf_ctx); | 919 | 2.57M | if (DEBUG_BLOCK_INFO) | 920 | 0 | printf("Post-uv-cf-blk[pl=%d,tx=%d," | 921 | 0 | "txtp=%d,eob=%d]: r=%d\n", | 922 | 0 | pl, b->uvtx, txtp, eob, ts->msac.rng); | 923 | 2.57M | *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp; | 924 | 2.57M | ts->frame_thread[1].cf += uv_t_dim->w * uv_t_dim->h * 16; | 925 | 2.57M | int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor); | 926 | 2.57M | int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver); | 927 | 2.57M | dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw); | 928 | 2.57M | dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth); | 929 | 2.57M | } | 930 | 1.98M | t->bx -= x << ss_hor; | 931 | 1.98M | } | 932 | 1.74M | t->by -= y << ss_ver; | 933 | 1.74M | } | 934 | 876k | } | 935 | 1.03M | } | 936 | 982k | } |
|
937 | | |
/*
 * Motion-compensated prediction of one block in one plane.
 *
 * dst8 / dst16  exactly one must be non-NULL (asserted): dst8 selects the
 *               pixel-output functions (mc / mc_scaled), dst16 the
 *               intermediate-output ones (mct / mct_scaled).
 * dst_stride    stride passed through for the dst8 variants.
 * bw4, bh4      block size; multiplied by h_mul / v_mul (4 >> subsampling)
 *               to obtain the size in pixels of this plane.
 * bx, by        block position, scaled to pixels the same way.
 * pl            plane index; subsampling is only applied when pl != 0.
 * mv            motion vector; its low bits form the sub-pixel phase
 *               (mx, my), the remaining bits the integer offset.
 * refp, refidx  reference picture and its index into f->svc[] (the latter
 *               is only used on the scaled path).
 * filter_2d     index into the dsp filter function tables.
 *
 * Unscaled path (reference has the same w/h as the current picture): if
 * the filter footprint (3 pixels before, 4 after, only in directions with
 * a non-zero phase) would leave the reference area, emu_edge() builds a
 * padded copy in t->scratch.emu_edge with a stride of 192 pixels.
 *
 * Scaled path: the start position is scaled with f->svc[refidx] and kept
 * with 10 fractional bits (>> 10 for the integer part, & 0x3ff for the
 * phase); the emu_edge buffer stride is 320 pixels there.
 *
 * Returns 0 on every path visible here.
 */
938 | | static int mc(Dav1dTaskContext *const t, |
939 | | pixel *const dst8, int16_t *const dst16, const ptrdiff_t dst_stride, |
940 | | const int bw4, const int bh4, |
941 | | const int bx, const int by, const int pl, |
942 | | const mv mv, const Dav1dThreadPicture *const refp, const int refidx, |
943 | | const enum Filter2d filter_2d) |
944 | 1.06M | { |
945 | 1.06M | assert((dst8 != NULL) ^ (dst16 != NULL)); |
946 | 1.06M | const Dav1dFrameContext *const f = t->f; |
947 | 1.06M | const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; |
948 | 1.06M | const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; |
949 | 1.06M | const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver; |
950 | 1.06M | const int mvx = mv.x, mvy = mv.y; |
951 | 1.06M | const int mx = mvx & (15 >> !ss_hor), my = mvy & (15 >> !ss_ver); |
952 | 1.06M | ptrdiff_t ref_stride = refp->p.stride[!!pl]; |
953 | 1.06M | const pixel *ref; |
954 | | |
955 | 1.06M | if (refp->p.p.w == f->cur.p.w && refp->p.p.h == f->cur.p.h) { |
956 | 805k | const int dx = bx * h_mul + (mvx >> (3 + ss_hor)); |
957 | 805k | const int dy = by * v_mul + (mvy >> (3 + ss_ver)); |
958 | 805k | int w, h; |
959 | | |
960 | 805k | if (refp->p.data[0] != f->cur.data[0]) { // i.e. not for intrabc |
961 | 503k | w = (f->cur.p.w + ss_hor) >> ss_hor; |
962 | 503k | h = (f->cur.p.h + ss_ver) >> ss_ver; |
963 | 503k | } else { |
964 | 301k | w = f->bw * 4 >> ss_hor; |
965 | 301k | h = f->bh * 4 >> ss_ver; |
966 | 301k | } |
967 | 805k | if (dx < !!mx * 3 || dy < !!my * 3 || |
968 | 669k | dx + bw4 * h_mul + !!mx * 4 > w || |
969 | 508k | dy + bh4 * v_mul + !!my * 4 > h) |
970 | 384k | { |
971 | 384k | pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge); |
972 | 384k | f->dsp->mc.emu_edge(bw4 * h_mul + !!mx * 7, bh4 * v_mul + !!my * 7, |
973 | 384k | w, h, dx - !!mx * 3, dy - !!my * 3, |
974 | 384k | emu_edge_buf, 192 * sizeof(pixel), |
975 | 384k | refp->p.data[pl], ref_stride); |
976 | 384k | ref = &emu_edge_buf[192 * !!my * 3 + !!mx * 3]; |
977 | 384k | ref_stride = 192 * sizeof(pixel); |
978 | 420k | } else { |
979 | 420k | ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx; |
980 | 420k | } |
981 | | |
982 | 805k | if (dst8 != NULL) { |
983 | 694k | f->dsp->mc.mc[filter_2d](dst8, dst_stride, ref, ref_stride, bw4 * h_mul, |
984 | 694k | bh4 * v_mul, mx << !ss_hor, my << !ss_ver |
985 | 694k | HIGHBD_CALL_SUFFIX); |
986 | 694k | } else { |
987 | 110k | f->dsp->mc.mct[filter_2d](dst16, ref, ref_stride, bw4 * h_mul, |
988 | 110k | bh4 * v_mul, mx << !ss_hor, my << !ss_ver |
989 | 110k | HIGHBD_CALL_SUFFIX); |
990 | 110k | } |
991 | 805k | } else { |
992 | 261k | assert(refp != &f->sr_cur); |
993 | | |
994 | 261k | const int orig_pos_y = (by * v_mul << 4) + mvy * (1 << !ss_ver); |
995 | 261k | const int orig_pos_x = (bx * h_mul << 4) + mvx * (1 << !ss_hor); |
996 | 522k | #define scale_mv(res, val, scale) do { \ |
997 | 522k | const int64_t tmp = (int64_t)(val) * scale + (scale - 0x4000) * 8; \ |
998 | 522k | res = apply_sign64((int) ((llabs(tmp) + 128) >> 8), tmp) + 32; \ |
999 | 522k | } while (0) |
1000 | 261k | int pos_y, pos_x; |
1001 | 261k | scale_mv(pos_x, orig_pos_x, f->svc[refidx][0].scale); |
1002 | 261k | scale_mv(pos_y, orig_pos_y, f->svc[refidx][1].scale); |
1003 | 261k | #undef scale_mv |
1004 | 261k | const int left = pos_x >> 10; |
1005 | 261k | const int top = pos_y >> 10; |
1006 | 261k | const int right = |
1007 | 261k | ((pos_x + (bw4 * h_mul - 1) * f->svc[refidx][0].step) >> 10) + 1; |
1008 | 261k | const int bottom = |
1009 | 261k | ((pos_y + (bh4 * v_mul - 1) * f->svc[refidx][1].step) >> 10) + 1; |
1010 | | |
1011 | 261k | if (DEBUG_BLOCK_INFO) |
1012 | 0 | printf("Off %dx%d [%d,%d,%d], size %dx%d [%d,%d]\n", |
1013 | 0 | left, top, orig_pos_x, f->svc[refidx][0].scale, refidx, |
1014 | 0 | right-left, bottom-top, |
1015 | 0 | f->svc[refidx][0].step, f->svc[refidx][1].step); |
1016 | | |
1017 | 261k | const int w = (refp->p.p.w + ss_hor) >> ss_hor; |
1018 | 261k | const int h = (refp->p.p.h + ss_ver) >> ss_ver; |
1019 | 261k | if (left < 3 || top < 3 || right + 4 > w || bottom + 4 > h) { |
1020 | 216k | pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge); |
1021 | 216k | f->dsp->mc.emu_edge(right - left + 7, bottom - top + 7, |
1022 | 216k | w, h, left - 3, top - 3, |
1023 | 216k | emu_edge_buf, 320 * sizeof(pixel), |
1024 | 216k | refp->p.data[pl], ref_stride); |
1025 | 216k | ref = &emu_edge_buf[320 * 3 + 3]; |
1026 | 216k | ref_stride = 320 * sizeof(pixel); |
1027 | 216k | if (DEBUG_BLOCK_INFO) printf("Emu\n"); |
1028 | 216k | } else { |
1029 | 45.1k | ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * top + left; |
1030 | 45.1k | } |
1031 | | |
1032 | 261k | if (dst8 != NULL) { |
1033 | 207k | f->dsp->mc.mc_scaled[filter_2d](dst8, dst_stride, ref, ref_stride, |
1034 | 207k | bw4 * h_mul, bh4 * v_mul, |
1035 | 207k | pos_x & 0x3ff, pos_y & 0x3ff, |
1036 | 207k | f->svc[refidx][0].step, |
1037 | 207k | f->svc[refidx][1].step |
1038 | 207k | HIGHBD_CALL_SUFFIX); |
1039 | 207k | } else { |
1040 | 53.4k | f->dsp->mc.mct_scaled[filter_2d](dst16, ref, ref_stride, |
1041 | 53.4k | bw4 * h_mul, bh4 * v_mul, |
1042 | 53.4k | pos_x & 0x3ff, pos_y & 0x3ff, |
1043 | 53.4k | f->svc[refidx][0].step, |
1044 | 53.4k | f->svc[refidx][1].step |
1045 | 53.4k | HIGHBD_CALL_SUFFIX); |
1046 | 53.4k | } |
1047 | 261k | } |
1048 | | |
1049 | 1.06M | return 0; |
1050 | 1.06M | } |
1051 | | |
/*
 * Overlap handling for one block in one plane (presumably "overlapped
 * block motion compensation", going by the function name): predictions
 * made with the motion vectors of the above and left neighbours are
 * blended into the already predicted dst.
 *
 * t            task context; t->bx and t->by must both be even (asserted).
 * dst, dst_stride  destination pixels of the current block.
 * b_dim        dimension entry of the current block; b_dim[0] / b_dim[1]
 *              bound the overlap size, b_dim[2] / b_dim[3] bound the
 *              number of neighbours used (at most 4 per edge).
 * pl           plane index; subsampling is only applied when pl != 0.
 * bx4, by4     position used to index the above / left filter contexts.
 * w4, h4       extent over which above / left neighbours are scanned.
 *
 * Above edge: only when the block is below the tile's first row and, for
 * chroma, when b_dim[0] * h_mul + b_dim[1] * v_mul >= 16. Each neighbour
 * whose first reference is > 0 is predicted into t->scratch.lap via mc()
 * and merged with blend_h().
 *
 * Left edge: only when the block is right of the tile's first column;
 * same scheme, merged with blend_v().
 *
 * Returns the first non-zero result of mc(), otherwise 0.
 */
1052 | | static int obmc(Dav1dTaskContext *const t, |
1053 | | pixel *const dst, const ptrdiff_t dst_stride, |
1054 | | const uint8_t *const b_dim, const int pl, |
1055 | | const int bx4, const int by4, const int w4, const int h4) |
1056 | 80.6k | { |
1057 | 80.6k | assert(!(t->bx & 1) && !(t->by & 1)); |
1058 | 80.6k | const Dav1dFrameContext *const f = t->f; |
1059 | 80.6k | /*const*/ refmvs_block **r = &t->rt.r[(t->by & 31) + 5]; |
1060 | 80.6k | pixel *const lap = bitfn(t->scratch.lap); |
1061 | 80.6k | const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; |
1062 | 80.6k | const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; |
1063 | 80.6k | const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver; |
1064 | 80.6k | int res; |
1065 | | |
1066 | 80.6k | if (t->by > t->ts->tiling.row_start && |
1067 | 66.3k | (!pl || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16)) |
1068 | 56.9k | { |
1069 | 119k | for (int i = 0, x = 0; x < w4 && i < imin(b_dim[2], 4); ) { |
1070 | | // only odd blocks are considered for overlap handling, hence +1 |
1071 | 62.7k | const refmvs_block *const a_r = &r[-1][t->bx + x + 1]; |
1072 | 62.7k | const uint8_t *const a_b_dim = dav1d_block_dimensions[a_r->bs]; |
1073 | 62.7k | const int step4 = iclip(a_b_dim[0], 2, 16); |
1074 | | |
1075 | 62.7k | if (a_r->ref.ref[0] > 0) { |
1076 | 62.1k | const int ow4 = imin(step4, b_dim[0]); |
1077 | 62.1k | const int oh4 = imin(b_dim[1], 16) >> 1; |
1078 | 62.1k | res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, (oh4 * 3 + 3) >> 2, |
1079 | 62.1k | t->bx + x, t->by, pl, a_r->mv.mv[0], |
1080 | 62.1k | &f->refp[a_r->ref.ref[0] - 1], a_r->ref.ref[0] - 1, |
1081 | 62.1k | dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]); |
1082 | 62.1k | if (res) return res; |
1083 | 62.1k | f->dsp->mc.blend_h(&dst[x * h_mul], dst_stride, lap, |
1084 | 62.1k | h_mul * ow4, v_mul * oh4); |
1085 | 62.1k | i++; |
1086 | 62.1k | } |
1087 | 62.7k | x += step4; |
1088 | 62.7k | } |
1089 | 56.9k | } |
1090 | | |
1091 | 80.6k | if (t->bx > t->ts->tiling.col_start) |
1092 | 127k | for (int i = 0, y = 0; y < h4 && i < imin(b_dim[3], 4); ) { |
1093 | | // only odd blocks are considered for overlap handling, hence +1 |
1094 | 66.1k | const refmvs_block *const l_r = &r[y + 1][t->bx - 1]; |
1095 | 66.1k | const uint8_t *const l_b_dim = dav1d_block_dimensions[l_r->bs]; |
1096 | 66.1k | const int step4 = iclip(l_b_dim[1], 2, 16); |
1097 | | |
1098 | 66.1k | if (l_r->ref.ref[0] > 0) { |
1099 | 65.2k | const int ow4 = imin(b_dim[0], 16) >> 1; |
1100 | 65.2k | const int oh4 = imin(step4, b_dim[1]); |
1101 | 65.2k | res = mc(t, lap, NULL, h_mul * ow4 * sizeof(pixel), ow4, oh4, |
1102 | 65.2k | t->bx, t->by + y, pl, l_r->mv.mv[0], |
1103 | 65.2k | &f->refp[l_r->ref.ref[0] - 1], l_r->ref.ref[0] - 1, |
1104 | 65.2k | dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]]); |
1105 | 65.2k | if (res) return res; |
1106 | 65.2k | f->dsp->mc.blend_v(&dst[y * v_mul * PXSTRIDE(dst_stride)], |
1107 | 65.2k | dst_stride, lap, h_mul * ow4, v_mul * oh4); |
1108 | 65.2k | i++; |
1109 | 65.2k | } |
1110 | 66.1k | y += step4; |
1111 | 66.1k | } |
1112 | 80.6k | return 0; |
1113 | 80.6k | } |
1114 | | |
/*
 * Warped prediction of one block in one plane, done in 8x8 tiles.
 *
 * dst8 / dst16  exactly one must be non-NULL (asserted); dst16 selects
 *               warp8x8t, dst8 selects warp8x8.
 * dstride       destination stride; applied through PXSTRIDE() for dst8
 *               and directly (in elements) for dst16.
 * b_dim         block dimensions; both b_dim[0] * h_mul and
 *               b_dim[1] * v_mul must be multiples of 8 (asserted).
 * pl            plane index; subsampling is only applied when pl != 0.
 * refp          reference picture to read from.
 * wmp           warp parameters: matrix[0..5] plus the alpha / beta /
 *               gamma / delta values and the abcd array handed to the dsp
 *               function.
 *
 * For every 8x8 tile the centre position (in luma pixel units) is run
 * through the matrix using 64-bit arithmetic. The result is split into an
 * integer offset (>> 16, minus 4) and a 16-bit fractional part, which is
 * adjusted by the alpha..delta terms and rounded down to a multiple of 64.
 * If the 15x15 source window around the tile leaves the reference
 * picture, emu_edge() builds a padded copy with a stride of 32 pixels.
 *
 * Returns 0 on every path visible here.
 */
1115 | | static int warp_affine(Dav1dTaskContext *const t, |
1116 | | pixel *dst8, int16_t *dst16, const ptrdiff_t dstride, |
1117 | | const uint8_t *const b_dim, const int pl, |
1118 | | const Dav1dThreadPicture *const refp, |
1119 | | const Dav1dWarpedMotionParams *const wmp) |
1120 | 30.0k | { |
1121 | 30.0k | assert((dst8 != NULL) ^ (dst16 != NULL)); |
1122 | 30.0k | const Dav1dFrameContext *const f = t->f; |
1123 | 30.0k | const Dav1dDSPContext *const dsp = f->dsp; |
1124 | 30.0k | const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; |
1125 | 30.0k | const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; |
1126 | 30.0k | const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver; |
1127 | 30.0k | assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7)); |
1128 | 30.0k | const int32_t *const mat = wmp->matrix; |
1129 | 30.0k | const int width = (refp->p.p.w + ss_hor) >> ss_hor; |
1130 | 30.0k | const int height = (refp->p.p.h + ss_ver) >> ss_ver; |
1131 | | |
1132 | 99.4k | for (int y = 0; y < b_dim[1] * v_mul; y += 8) { |
1133 | 69.4k | const int src_y = t->by * 4 + ((y + 4) << ss_ver); |
1134 | 69.4k | const int64_t mat3_y = (int64_t) mat[3] * src_y + mat[0]; |
1135 | 69.4k | const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1]; |
1136 | 371k | for (int x = 0; x < b_dim[0] * h_mul; x += 8) { |
1137 | | // calculate transformation relative to center of 8x8 block in |
1138 | | // luma pixel units |
1139 | 301k | const int src_x = t->bx * 4 + ((x + 4) << ss_hor); |
1140 | 301k | const int64_t mvx = ((int64_t) mat[2] * src_x + mat3_y) >> ss_hor; |
1141 | 301k | const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver; |
1142 | | |
1143 | 301k | const int dx = (int) (mvx >> 16) - 4; |
1144 | 301k | const int mx = (((int) mvx & 0xffff) - wmp->u.p.alpha * 4 - |
1145 | 301k | wmp->u.p.beta * 7) & ~0x3f; |
1146 | 301k | const int dy = (int) (mvy >> 16) - 4; |
1147 | 301k | const int my = (((int) mvy & 0xffff) - wmp->u.p.gamma * 4 - |
1148 | 301k | wmp->u.p.delta * 4) & ~0x3f; |
1149 | | |
1150 | 301k | const pixel *ref_ptr; |
1151 | 301k | ptrdiff_t ref_stride = refp->p.stride[!!pl]; |
1152 | | |
1153 | 301k | if (dx < 3 || dx + 8 + 4 > width || dy < 3 || dy + 8 + 4 > height) { |
1154 | 220k | pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge); |
1155 | 220k | f->dsp->mc.emu_edge(15, 15, width, height, dx - 3, dy - 3, |
1156 | 220k | emu_edge_buf, 32 * sizeof(pixel), |
1157 | 220k | refp->p.data[pl], ref_stride); |
1158 | 220k | ref_ptr = &emu_edge_buf[32 * 3 + 3]; |
1159 | 220k | ref_stride = 32 * sizeof(pixel); |
1160 | 220k | } else { |
1161 | 81.3k | ref_ptr = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx; |
1162 | 81.3k | } |
1163 | 301k | if (dst16 != NULL) |
1164 | 51.2k | dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride, |
1165 | 51.2k | wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX); |
1166 | 250k | else |
1167 | 250k | dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride, |
1168 | 250k | wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX); |
1169 | 301k | } |
1170 | 69.4k | if (dst8) dst8 += 8 * PXSTRIDE(dstride); |
1171 | 15.4k | else dst16 += 8 * dstride; |
1172 | 69.4k | } |
1173 | 30.0k | return 0; |
1174 | 30.0k | } |
1175 | | |
/*
 * dav1d_recon_b_intra(): intra reconstruction of one block.
 *
 * The block (size `bs`, clipped against f->bw / f->bh) is walked in chunks of
 * at most 16x16 4-pixel units. Within each chunk luma is processed first and,
 * if the block carries chroma, both chroma planes follow:
 *   1. prediction: palette (pal_pred), CfL (cfl_ac + cfl_pred, chroma only),
 *      or regular intra prediction (dav1d_prepare_intra_edges + intra_pred);
 *   2. residual: unless b->skip is set, coefficients are taken from the
 *      per-tile frame-thread buffers when t->frame_thread.pass is non-zero,
 *      or decoded in place with decode_coefs() otherwise, and are added to
 *      the prediction through itxfm_add when eob >= 0.
 *
 * t->bx / t->by are advanced while iterating over transform blocks and are
 * restored after each loop nest. `intra_edge_flags` supplies the
 * top-right / bottom-left availability for the block as a whole.
 */
1176 | | void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize bs, |
1177 | | const enum EdgeFlags intra_edge_flags, |
1178 | | const Av1Block *const b) |
1179 | 1.81M | { |
1180 | 1.81M | Dav1dTileState *const ts = t->ts; |
1181 | 1.81M | const Dav1dFrameContext *const f = t->f; |
1182 | 1.81M | const Dav1dDSPContext *const dsp = f->dsp; |
// Block position modulo 32 units; used below to index the above/left
// context arrays (t->a, t->l).
1183 | 1.81M | const int bx4 = t->bx & 31, by4 = t->by & 31; |
1184 | 1.81M | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; |
1185 | 1.81M | const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; |
1186 | 1.81M | const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver; |
1187 | 1.81M | const uint8_t *const b_dim = dav1d_block_dimensions[bs]; |
1188 | 1.81M | const int bw4 = b_dim[0], bh4 = b_dim[1]; |
// w4/h4: block size clipped to the frame; cw4/ch4: the same in chroma units
// (rounded up).
1189 | 1.81M | const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by); |
1190 | 1.81M | const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver; |
// Chroma is absent for I400. For a block that is only one unit wide/tall in a
// subsampled direction, chroma is handled only at odd bx/by positions.
1191 | 1.81M | const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 && |
1192 | 1.63M | (bw4 > ss_hor || t->bx & 1) && |
1193 | 1.53M | (bh4 > ss_ver || t->by & 1); |
1194 | 1.81M | const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx]; |
1195 | 1.81M | const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx]; |
1196 | | |
1197 | | // coefficient coding |
// Edge buffer pointer, offset by 128 elements. Per the debug dumps below,
// left pixels sit before `edge`, the top-left pixel at `edge`, and the top
// row from `edge + 1`.
1198 | 1.81M | pixel *const edge = bitfn(t->scratch.edge) + 128; |
1199 | 1.81M | const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver; |
1200 | | |
// Sequence-level intra_edge_filter flag moved to bit 10; it is OR'ed into the
// angle argument passed to intra_pred.
1201 | 1.81M | const int intra_edge_filter_flag = f->seq_hdr->intra_edge_filter << 10; |
1202 | | |
// Outer loops: step through the block in chunks of up to 16x16 units.
1203 | 3.72M | for (int init_y = 0; init_y < h4; init_y += 16) { |
1204 | 1.90M | const int sub_h4 = imin(h4, 16 + init_y); |
1205 | 1.90M | const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver); |
1206 | 3.96M | for (int init_x = 0; init_x < w4; init_x += 16) { |
// Luma palette prediction: fills bw4*4 x bh4*4 pixels at the block origin.
// Indices come from the frame-thread buffer (which is then advanced) or from
// the scratch area; the palette likewise comes from f->frame_thread.pal or
// scratch depending on t->frame_thread.pass.
1207 | 2.06M | if (b->pal_sz[0]) { |
1208 | 8.79k | pixel *dst = ((pixel *) f->cur.data[0]) + |
1209 | 8.79k | 4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx); |
1210 | 8.79k | const uint8_t *pal_idx; |
1211 | 8.79k | if (t->frame_thread.pass) { |
1212 | 8.79k | const int p = t->frame_thread.pass & 1; |
1213 | 8.79k | assert(ts->frame_thread[p].pal_idx); |
1214 | 8.79k | pal_idx = ts->frame_thread[p].pal_idx; |
1215 | 8.79k | ts->frame_thread[p].pal_idx += bw4 * bh4 * 8; |
1216 | 8.79k | } else { |
1217 | 0 | pal_idx = t->scratch.pal_idx_y; |
1218 | 0 | } |
1219 | 8.79k | const pixel *const pal = t->frame_thread.pass ? |
1220 | 8.79k | f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + |
1221 | 8.79k | ((t->bx >> 1) + (t->by & 1))][0] : |
1222 | 8.79k | bytefn(t->scratch.pal)[0]; |
1223 | 8.79k | f->dsp->ipred.pal_pred(dst, f->cur.stride[0], pal, |
1224 | 8.79k | pal_idx, bw4 * 4, bh4 * 4); |
1225 | 8.79k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) |
1226 | 0 | hex_dump(dst, PXSTRIDE(f->cur.stride[0]), |
1227 | 0 | bw4 * 4, bh4 * 4, "y-pal-pred"); |
1228 | 8.79k | } |
1229 | | |
// Extra flag bits for intra_pred: sm_flag() of the above and left contexts
// (NOTE(review): presumably smooth-neighbour flags, confirm in sm_flag())
// plus the edge-filter bit.
1230 | 2.06M | const int intra_flags = (sm_flag(t->a, bx4) | |
1231 | 2.06M | sm_flag(&t->l, by4) | |
1232 | 2.06M | intra_edge_filter_flag); |
// Top-right availability for this chunk: 1 if another chunk follows to the
// right, 0 for chunks below the first chunk row, else taken from the block's
// flags. Bottom-left is the mirrored rule.
1233 | 2.06M | const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 : |
1234 | 1.90M | intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT; |
1235 | 2.06M | const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 : |
1236 | 1.90M | intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM; |
1237 | 2.06M | int y, x; |
1238 | 2.06M | const int sub_w4 = imin(w4, init_x + 16); |
// Luma: iterate over the transform blocks of this chunk. t->by / t->bx track
// the absolute position and are undone after the loops.
1239 | 5.09M | for (y = init_y, t->by += init_y; y < sub_h4; |
1240 | 3.03M | y += t_dim->h, t->by += t_dim->h) |
1241 | 3.03M | { |
1242 | 3.03M | pixel *dst = ((pixel *) f->cur.data[0]) + |
1243 | 3.03M | 4 * (t->by * PXSTRIDE(f->cur.stride[0]) + |
1244 | 3.03M | t->bx + init_x); |
1245 | 14.7M | for (x = init_x, t->bx += init_x; x < sub_w4; |
1246 | 11.7M | x += t_dim->w, t->bx += t_dim->w) |
1247 | 11.7M | { |
// Palette blocks were already predicted above; go straight to the residual.
1248 | 11.7M | if (b->pal_sz[0]) goto skip_y_pred; |
1249 | | |
1250 | 11.7M | int angle = b->y_angle; |
// Per-transform-block availability: top-right is cleared for the right-most
// transform block unless it is in the first row and the chunk has top-right;
// bottom-left is cleared for everything but the first column, and for the
// last row when the chunk has no bottom-left.
1251 | 11.7M | const enum EdgeFlags edge_flags = |
1252 | 11.7M | (((y > init_y || !sb_has_tr) && (x + t_dim->w >= sub_w4)) ? |
1253 | 10.1M | 0 : EDGE_I444_TOP_HAS_RIGHT) | |
1254 | 11.7M | ((x > init_x || (!sb_has_bl && y + t_dim->h >= sub_h4)) ? |
1255 | 9.84M | 0 : EDGE_I444_LEFT_HAS_BOTTOM); |
// On the first row of a superblock, pass a pointer into f->ipred_edge[0] for
// the superblock row above (sby - 1); otherwise NULL.
1256 | 11.7M | const pixel *top_sb_edge = NULL; |
1257 | 11.7M | if (!(t->by & (f->sb_step - 1))) { |
1258 | 1.18M | top_sb_edge = f->ipred_edge[0]; |
1259 | 1.18M | const int sby = t->by >> f->sb_shift; |
1260 | 1.18M | top_sb_edge += f->sb128w * 128 * (sby - 1); |
1261 | 1.18M | } |
// Fill `edge` and obtain the prediction mode to dispatch on; `angle` may be
// updated through the pointer.
1262 | 11.7M | const enum IntraPredMode m = |
1263 | 11.7M | bytefn(dav1d_prepare_intra_edges)(t->bx, |
1264 | 11.7M | t->bx > ts->tiling.col_start, |
1265 | 11.7M | t->by, |
1266 | 11.7M | t->by > ts->tiling.row_start, |
1267 | 11.7M | ts->tiling.col_end, |
1268 | 11.7M | ts->tiling.row_end, |
1269 | 11.7M | edge_flags, dst, |
1270 | 11.7M | f->cur.stride[0], top_sb_edge, |
1271 | 11.7M | b->y_mode, &angle, |
1272 | 11.7M | t_dim->w, t_dim->h, |
1273 | 11.7M | f->seq_hdr->intra_edge_filter, |
1274 | 11.7M | edge HIGHBD_CALL_SUFFIX); |
// The last two arguments are the pixel distances from this transform block
// to the right and bottom frame boundaries.
1275 | 11.7M | dsp->ipred.intra_pred[m](dst, f->cur.stride[0], edge, |
1276 | 11.7M | t_dim->w * 4, t_dim->h * 4, |
1277 | 11.7M | angle | intra_flags, |
1278 | 11.7M | 4 * f->bw - 4 * t->bx, |
1279 | 11.7M | 4 * f->bh - 4 * t->by |
1280 | 11.7M | HIGHBD_CALL_SUFFIX); |
1281 | | |
1282 | 11.7M | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) { |
1283 | 0 | hex_dump(edge - t_dim->h * 4, t_dim->h * 4, |
1284 | 0 | t_dim->h * 4, 2, "l"); |
1285 | 0 | hex_dump(edge, 0, 1, 1, "tl"); |
1286 | 0 | hex_dump(edge + 1, t_dim->w * 4, |
1287 | 0 | t_dim->w * 4, 2, "t"); |
1288 | 0 | hex_dump(dst, f->cur.stride[0], |
1289 | 0 | t_dim->w * 4, t_dim->h * 4, "y-intra-pred"); |
1290 | 0 | } |
1291 | | |
1292 | 11.7M | skip_y_pred: {} |
// Luma residual.
1293 | 11.7M | if (!b->skip) { |
1294 | 1.70M | coef *cf; |
1295 | 1.70M | int eob; |
1296 | 1.70M | enum TxfmType txtp; |
// Frame-threaded pass: coefficients were stored earlier. cbi packs eob in the
// bits above bit 4 and the transform type in the low 5 bits; the coefficient
// pointer advances by min(w,8) * min(h,8) * 16 entries.
1297 | 1.70M | if (t->frame_thread.pass) { |
1298 | 1.70M | const int p = t->frame_thread.pass & 1; |
1299 | 1.70M | const int cbi = *ts->frame_thread[p].cbi++; |
1300 | 1.70M | cf = ts->frame_thread[p].cf; |
1301 | 1.70M | ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16; |
1302 | 1.70M | eob = cbi >> 5; |
1303 | 1.70M | txtp = cbi & 0x1f; |
1304 | 18.4E | } else { |
// Single-pass: decode coefficients now and store the resulting context byte
// in the above/left luma coefficient contexts, clipped to the frame.
1305 | 18.4E | uint8_t cf_ctx; |
1306 | 18.4E | cf = bitfn(t->cf); |
1307 | 18.4E | eob = decode_coefs(t, &t->a->lcoef[bx4 + x], |
1308 | 18.4E | &t->l.lcoef[by4 + y], b->tx, bs, |
1309 | 18.4E | b, 1, 0, cf, &txtp, &cf_ctx); |
1310 | 18.4E | if (DEBUG_BLOCK_INFO) |
1311 | 0 | printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n", |
1312 | 0 | b->tx, txtp, eob, ts->msac.rng); |
1313 | 18.4E | dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx)); |
1314 | 18.4E | dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by)); |
1315 | 18.4E | } |
// A negative eob means there is nothing to add for this transform block.
1316 | 1.70M | if (eob >= 0) { |
1317 | 1.28M | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) |
1318 | 0 | coef_dump(cf, imin(t_dim->h, 8) * 4, |
1319 | 0 | imin(t_dim->w, 8) * 4, 3, "dq"); |
1320 | 1.28M | dsp->itx.itxfm_add[b->tx] |
1321 | 1.28M | [txtp](dst, |
1322 | 1.28M | f->cur.stride[0], |
1323 | 1.28M | cf, eob HIGHBD_CALL_SUFFIX); |
1324 | 1.28M | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) |
1325 | 0 | hex_dump(dst, f->cur.stride[0], |
1326 | 0 | t_dim->w * 4, t_dim->h * 4, "recon"); |
1327 | 1.28M | } |
// Skipped block in single-pass mode: set the coefficient contexts to 0x40.
1328 | 10.0M | } else if (!t->frame_thread.pass) { |
1329 | 0 | dav1d_memset_pow2[t_dim->lw](&t->a->lcoef[bx4 + x], 0x40); |
1330 | 0 | dav1d_memset_pow2[t_dim->lh](&t->l.lcoef[by4 + y], 0x40); |
1331 | 0 | } |
1332 | 11.7M | dst += 4 * t_dim->w; |
1333 | 11.7M | } |
// Undo the horizontal advance made by the inner loop.
1334 | 3.03M | t->bx -= x; |
1335 | 3.03M | } |
// Undo the vertical advance made by the luma loop.
1336 | 2.05M | t->by -= y; |
1337 | | |
1338 | 2.05M | if (!has_chroma) continue; |
1339 | | |
1340 | 1.59M | const ptrdiff_t stride = f->cur.stride[1]; |
1341 | | |
// Chroma-from-luma: only valid for a single-chunk block (asserted).
1342 | 1.59M | if (b->uv_mode == CFL_PRED) { |
1343 | 323k | assert(!init_x && !init_y); |
1344 | | |
1345 | 323k | int16_t *const ac = t->scratch.ac; |
// Luma source starts at the block position with the subsampled low bit
// cleared, i.e. aligned to the chroma grid.
1346 | 323k | pixel *y_src = ((pixel *) f->cur.data[0]) + 4 * (t->bx & ~ss_hor) + |
1347 | 323k | 4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.stride[0]); |
1348 | 323k | const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) + |
1349 | 323k | (t->by >> ss_ver) * PXSTRIDE(stride)); |
1350 | 323k | pixel *const uv_dst[2] = { ((pixel *) f->cur.data[1]) + uv_off, |
1351 | 323k | ((pixel *) f->cur.data[2]) + uv_off }; |
1352 | | |
// Visible extent rounded up to a multiple of the luma transform size; the
// difference to the full chroma block size is passed to cfl_ac
// (NOTE(review): presumably as right/bottom padding amounts, confirm against
// the cfl_ac prototype).
1353 | 323k | const int furthest_r = |
1354 | 323k | ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1); |
1355 | 323k | const int furthest_b = |
1356 | 323k | ((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1); |
1357 | 323k | dsp->ipred.cfl_ac[f->cur.p.layout - 1](ac, y_src, f->cur.stride[0], |
1358 | 323k | cbw4 - (furthest_r >> ss_hor), |
1359 | 323k | cbh4 - (furthest_b >> ss_ver), |
1360 | 323k | cbw4 * 4, cbh4 * 4); |
// Apply CfL per chroma plane; planes with a zero alpha are left for the
// regular prediction loop further down.
1361 | 971k | for (int pl = 0; pl < 2; pl++) { |
1362 | 647k | if (!b->cfl_alpha[pl]) continue; |
1363 | 541k | int angle = 0; |
1364 | 541k | const pixel *top_sb_edge = NULL; |
1365 | 541k | if (!((t->by & ~ss_ver) & (f->sb_step - 1))) { |
1366 | 142k | top_sb_edge = f->ipred_edge[pl + 1]; |
1367 | 142k | const int sby = t->by >> f->sb_shift; |
1368 | 142k | top_sb_edge += f->sb128w * 128 * (sby - 1); |
1369 | 142k | } |
1370 | 541k | const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver; |
1371 | 541k | const int xstart = ts->tiling.col_start >> ss_hor; |
1372 | 541k | const int ystart = ts->tiling.row_start >> ss_ver; |
// Edges are prepared as for DC_PRED, with no edge flags and no edge filter.
1373 | 541k | const enum IntraPredMode m = |
1374 | 541k | bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart, |
1375 | 541k | ypos, ypos > ystart, |
1376 | 541k | ts->tiling.col_end >> ss_hor, |
1377 | 541k | ts->tiling.row_end >> ss_ver, |
1378 | 541k | 0, uv_dst[pl], stride, |
1379 | 541k | top_sb_edge, DC_PRED, &angle, |
1380 | 541k | uv_t_dim->w, uv_t_dim->h, 0, |
1381 | 541k | edge HIGHBD_CALL_SUFFIX); |
1382 | 541k | dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge, |
1383 | 541k | uv_t_dim->w * 4, |
1384 | 541k | uv_t_dim->h * 4, |
1385 | 541k | ac, b->cfl_alpha[pl] |
1386 | 541k | HIGHBD_CALL_SUFFIX); |
1387 | 541k | } |
1388 | 323k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) { |
1389 | 0 | ac_dump(ac, 4*cbw4, 4*cbh4, "ac"); |
1390 | 0 | hex_dump(uv_dst[0], stride, cbw4 * 4, cbh4 * 4, "u-cfl-pred"); |
1391 | 0 | hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred"); |
1392 | 0 | } |
// Chroma palette: both planes share one index map; pal[1] / pal[2] hold the
// U / V palettes.
1393 | 1.27M | } else if (b->pal_sz[1]) { |
1394 | 4.07k | const ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) + |
1395 | 4.07k | (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1])); |
1396 | 4.07k | const pixel (*pal)[8]; |
1397 | 4.07k | const uint8_t *pal_idx; |
1398 | 4.07k | if (t->frame_thread.pass) { |
1399 | 4.07k | const int p = t->frame_thread.pass & 1; |
1400 | 4.07k | assert(ts->frame_thread[p].pal_idx); |
1401 | 4.07k | pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + |
1402 | 4.07k | ((t->bx >> 1) + (t->by & 1))]; |
1403 | 4.07k | pal_idx = ts->frame_thread[p].pal_idx; |
1404 | 4.07k | ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8; |
1405 | 4.07k | } else { |
1406 | 0 | pal = bytefn(t->scratch.pal); |
1407 | 0 | pal_idx = t->scratch.pal_idx_uv; |
1408 | 0 | } |
1409 | | |
1410 | 4.07k | f->dsp->ipred.pal_pred(((pixel *) f->cur.data[1]) + uv_dstoff, |
1411 | 4.07k | f->cur.stride[1], pal[1], |
1412 | 4.07k | pal_idx, cbw4 * 4, cbh4 * 4); |
1413 | 4.07k | f->dsp->ipred.pal_pred(((pixel *) f->cur.data[2]) + uv_dstoff, |
1414 | 4.07k | f->cur.stride[1], pal[2], |
1415 | 4.07k | pal_idx, cbw4 * 4, cbh4 * 4); |
1416 | 4.07k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) { |
1417 | 0 | hex_dump(((pixel *) f->cur.data[1]) + uv_dstoff, |
1418 | 0 | PXSTRIDE(f->cur.stride[1]), |
1419 | 0 | cbw4 * 4, cbh4 * 4, "u-pal-pred"); |
1420 | 0 | hex_dump(((pixel *) f->cur.data[2]) + uv_dstoff, |
1421 | 0 | PXSTRIDE(f->cur.stride[1]), |
1422 | 0 | cbw4 * 4, cbh4 * 4, "v-pal-pred"); |
1423 | 0 | } |
1424 | 4.07k | } |
1425 | | |
// Chroma counterparts of intra_flags / sb_has_tr / sb_has_bl. The
// availability bits are selected by shifting the I420 flag by (layout - 1).
1426 | 1.59M | const int sm_uv_fl = sm_uv_flag(t->a, cbx4) | |
1427 | 1.59M | sm_uv_flag(&t->l, cby4); |
1428 | 1.59M | const int uv_sb_has_tr = |
1429 | 1.59M | ((init_x + 16) >> ss_hor) < cw4 ? 1 : init_y ? 0 : |
1430 | 1.50M | intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.layout - 1)); |
1431 | 1.59M | const int uv_sb_has_bl = |
1432 | 1.59M | init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 1 : |
1433 | 1.50M | intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.layout - 1)); |
1434 | 1.59M | const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor); |
// Chroma: for each plane, iterate over the chroma transform blocks. x / y
// count chroma units while t->bx / t->by keep advancing in luma units.
1435 | 4.79M | for (int pl = 0; pl < 2; pl++) { |
1436 | 6.88M | for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4; |
1437 | 3.68M | y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver) |
1438 | 3.68M | { |
1439 | 3.68M | pixel *dst = ((pixel *) f->cur.data[1 + pl]) + |
1440 | 3.68M | 4 * ((t->by >> ss_ver) * PXSTRIDE(stride) + |
1441 | 3.68M | ((t->bx + init_x) >> ss_hor)); |
1442 | 8.56M | for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4; |
1443 | 4.88M | x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor) |
1444 | 4.88M | { |
// Prediction was already produced above for CfL planes with non-zero alpha
// and for palette blocks.
1445 | 4.88M | if ((b->uv_mode == CFL_PRED && b->cfl_alpha[pl]) || |
1446 | 4.34M | b->pal_sz[1]) |
1447 | 551k | { |
1448 | 551k | goto skip_uv_pred; |
1449 | 551k | } |
1450 | | |
1451 | 4.33M | int angle = b->uv_angle; |
1452 | | // this probably looks weird because we're using |
1453 | | // luma flags in a chroma loop, but that's because |
1454 | | // prepare_intra_edges() expects luma flags as input |
1455 | 4.33M | const enum EdgeFlags edge_flags = |
1456 | 4.33M | (((y > (init_y >> ss_ver) || !uv_sb_has_tr) && |
1457 | 2.16M | (x + uv_t_dim->w >= sub_cw4)) ? |
1458 | 3.12M | 0 : EDGE_I444_TOP_HAS_RIGHT) | |
1459 | 4.33M | ((x > (init_x >> ss_hor) || |
1460 | 3.13M | (!uv_sb_has_bl && y + uv_t_dim->h >= sub_ch4)) ? |
1461 | 2.72M | 0 : EDGE_I444_LEFT_HAS_BOTTOM); |
1462 | 4.33M | const pixel *top_sb_edge = NULL; |
1463 | 4.33M | if (!((t->by & ~ss_ver) & (f->sb_step - 1))) { |
1464 | 1.26M | top_sb_edge = f->ipred_edge[1 + pl]; |
1465 | 1.26M | const int sby = t->by >> f->sb_shift; |
1466 | 1.26M | top_sb_edge += f->sb128w * 128 * (sby - 1); |
1467 | 1.26M | } |
// A CfL plane with zero alpha reaches this point and is predicted as DC_PRED.
1468 | 4.33M | const enum IntraPredMode uv_mode = |
1469 | 4.33M | b->uv_mode == CFL_PRED ? DC_PRED : b->uv_mode; |
1470 | 4.33M | const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver; |
1471 | 4.33M | const int xstart = ts->tiling.col_start >> ss_hor; |
1472 | 4.33M | const int ystart = ts->tiling.row_start >> ss_ver; |
1473 | 4.33M | const enum IntraPredMode m = |
1474 | 4.33M | bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart, |
1475 | 4.33M | ypos, ypos > ystart, |
1476 | 4.33M | ts->tiling.col_end >> ss_hor, |
1477 | 4.33M | ts->tiling.row_end >> ss_ver, |
1478 | 4.33M | edge_flags, dst, stride, |
1479 | 4.33M | top_sb_edge, uv_mode, |
1480 | 4.33M | &angle, uv_t_dim->w, |
1481 | 4.33M | uv_t_dim->h, |
1482 | 4.33M | f->seq_hdr->intra_edge_filter, |
1483 | 4.33M | edge HIGHBD_CALL_SUFFIX); |
1484 | 4.33M | angle |= intra_edge_filter_flag; |
// The last two arguments are the distances to the right/bottom frame
// boundary in chroma pixels, measured from the chroma-aligned position.
1485 | 4.33M | dsp->ipred.intra_pred[m](dst, stride, edge, |
1486 | 4.33M | uv_t_dim->w * 4, |
1487 | 4.33M | uv_t_dim->h * 4, |
1488 | 4.33M | angle | sm_uv_fl, |
1489 | 4.33M | (4 * f->bw + ss_hor - |
1490 | 4.33M | 4 * (t->bx & ~ss_hor)) >> ss_hor, |
1491 | 4.33M | (4 * f->bh + ss_ver - |
1492 | 4.33M | 4 * (t->by & ~ss_ver)) >> ss_ver |
1493 | 4.33M | HIGHBD_CALL_SUFFIX); |
1494 | 4.33M | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) { |
1495 | 0 | hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4, |
1496 | 0 | uv_t_dim->h * 4, 2, "l"); |
1497 | 0 | hex_dump(edge, 0, 1, 1, "tl"); |
1498 | 0 | hex_dump(edge + 1, uv_t_dim->w * 4, |
1499 | 0 | uv_t_dim->w * 4, 2, "t"); |
1500 | 0 | hex_dump(dst, stride, uv_t_dim->w * 4, |
1501 | 0 | uv_t_dim->h * 4, pl ? "v-intra-pred" : "u-intra-pred"); |
1502 | 0 | } |
1503 | | |
1504 | 4.88M | skip_uv_pred: {} |
// Chroma residual; same scheme as luma, except that the frame-thread
// coefficient pointer advances by w * h * 16 without the clamp to 8.
1505 | 4.88M | if (!b->skip) { |
1506 | 1.60M | enum TxfmType txtp; |
1507 | 1.60M | int eob; |
1508 | 1.60M | coef *cf; |
1509 | 1.60M | if (t->frame_thread.pass) { |
1510 | 1.60M | const int p = t->frame_thread.pass & 1; |
1511 | 1.60M | const int cbi = *ts->frame_thread[p].cbi++; |
1512 | 1.60M | cf = ts->frame_thread[p].cf; |
1513 | 1.60M | ts->frame_thread[p].cf += uv_t_dim->w * uv_t_dim->h * 16; |
1514 | 1.60M | eob = cbi >> 5; |
1515 | 1.60M | txtp = cbi & 0x1f; |
1516 | 18.4E | } else { |
1517 | 18.4E | uint8_t cf_ctx; |
1518 | 18.4E | cf = bitfn(t->cf); |
1519 | 18.4E | eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x], |
1520 | 18.4E | &t->l.ccoef[pl][cby4 + y], |
1521 | 18.4E | b->uvtx, bs, b, 1, 1 + pl, cf, |
1522 | 18.4E | &txtp, &cf_ctx); |
1523 | 18.4E | if (DEBUG_BLOCK_INFO) |
1524 | 0 | printf("Post-uv-cf-blk[pl=%d,tx=%d," |
1525 | 0 | "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n", |
1526 | 0 | pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4); |
// Context update widths, clipped to the frame in chroma units.
1527 | 18.4E | int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor); |
1528 | 18.4E | int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver); |
1529 | 18.4E | dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw); |
1530 | 18.4E | dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth); |
1531 | 18.4E | } |
1532 | 1.60M | if (eob >= 0) { |
1533 | 579k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) |
1534 | 0 | coef_dump(cf, uv_t_dim->h * 4, |
1535 | 0 | uv_t_dim->w * 4, 3, "dq"); |
1536 | 579k | dsp->itx.itxfm_add[b->uvtx] |
1537 | 579k | [txtp](dst, stride, |
1538 | 579k | cf, eob HIGHBD_CALL_SUFFIX); |
1539 | 579k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) |
1540 | 0 | hex_dump(dst, stride, uv_t_dim->w * 4, |
1541 | 0 | uv_t_dim->h * 4, "recon"); |
1542 | 579k | } |
1543 | 3.27M | } else if (!t->frame_thread.pass) { |
1544 | 0 | dav1d_memset_pow2[uv_t_dim->lw](&t->a->ccoef[pl][cbx4 + x], 0x40); |
1545 | 0 | dav1d_memset_pow2[uv_t_dim->lh](&t->l.ccoef[pl][cby4 + y], 0x40); |
1546 | 0 | } |
1547 | 4.88M | dst += uv_t_dim->w * 4; |
1548 | 4.88M | } |
// Undo the advances: x / y are in chroma units, t->bx / t->by in luma units.
1549 | 3.68M | t->bx -= x << ss_hor; |
1550 | 3.68M | } |
1551 | 3.19M | t->by -= y << ss_ver; |
1552 | 3.19M | } |
1553 | 1.59M | } |
1554 | 1.90M | } |
1555 | 1.81M | } Line | Count | Source | 1179 | 809k | { | 1180 | 809k | Dav1dTileState *const ts = t->ts; | 1181 | 809k | const Dav1dFrameContext *const f = t->f; | 1182 | 809k | const Dav1dDSPContext *const dsp = f->dsp; | 1183 | 809k | const int bx4 = t->bx & 31, by4 = t->by & 31; | 1184 | 809k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 1185 | 809k | const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; | 1186 | 809k | const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver; | 1187 | 809k | const uint8_t *const b_dim = dav1d_block_dimensions[bs]; | 1188 | 809k | const int bw4 = b_dim[0], bh4 = b_dim[1]; | 1189 | 809k | const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by); | 1190 | 809k | const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver; | 1191 | 809k | const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 && | 1192 | 756k | (bw4 > ss_hor || t->bx & 1) && | 1193 | 709k | (bh4 > ss_ver || t->by & 1); | 1194 | 809k | const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx]; | 1195 | 809k | const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx]; | 1196 | | | 1197 | | // coefficient coding | 1198 | 809k | pixel *const edge = bitfn(t->scratch.edge) + 128; | 1199 | 809k | const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver; | 1200 | | | 1201 | 809k | const int intra_edge_filter_flag = f->seq_hdr->intra_edge_filter << 10; | 1202 | | | 1203 | 1.65M | for (int init_y = 0; init_y < h4; init_y += 16) { | 1204 | 848k | const int sub_h4 = imin(h4, 16 + init_y); | 1205 | 848k | const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver); | 1206 | 1.75M | for (int init_x = 0; init_x < w4; init_x += 16) { | 1207 | 907k | if (b->pal_sz[0]) { | 1208 | 4.67k | pixel *dst = ((pixel *) f->cur.data[0]) + | 1209 | 4.67k | 4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx); | 1210 | 4.67k | const uint8_t *pal_idx; | 1211 | 4.67k | if (t->frame_thread.pass) { | 1212 | 4.67k | 
const int p = t->frame_thread.pass & 1; | 1213 | 4.67k | assert(ts->frame_thread[p].pal_idx); | 1214 | 4.67k | pal_idx = ts->frame_thread[p].pal_idx; | 1215 | 4.67k | ts->frame_thread[p].pal_idx += bw4 * bh4 * 8; | 1216 | 4.67k | } else { | 1217 | 0 | pal_idx = t->scratch.pal_idx_y; | 1218 | 0 | } | 1219 | 4.67k | const pixel *const pal = t->frame_thread.pass ? | 1220 | 4.67k | f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + | 1221 | 4.67k | ((t->bx >> 1) + (t->by & 1))][0] : | 1222 | 4.67k | bytefn(t->scratch.pal)[0]; | 1223 | 4.67k | f->dsp->ipred.pal_pred(dst, f->cur.stride[0], pal, | 1224 | 4.67k | pal_idx, bw4 * 4, bh4 * 4); | 1225 | 4.67k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) | 1226 | 0 | hex_dump(dst, PXSTRIDE(f->cur.stride[0]), | 1227 | 0 | bw4 * 4, bh4 * 4, "y-pal-pred"); | 1228 | 4.67k | } | 1229 | | | 1230 | 907k | const int intra_flags = (sm_flag(t->a, bx4) | | 1231 | 907k | sm_flag(&t->l, by4) | | 1232 | 907k | intra_edge_filter_flag); | 1233 | 907k | const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 : | 1234 | 848k | intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT; | 1235 | 907k | const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 : | 1236 | 848k | intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM; | 1237 | 907k | int y, x; | 1238 | 907k | const int sub_w4 = imin(w4, init_x + 16); | 1239 | 2.20M | for (y = init_y, t->by += init_y; y < sub_h4; | 1240 | 1.30M | y += t_dim->h, t->by += t_dim->h) | 1241 | 1.29M | { | 1242 | 1.29M | pixel *dst = ((pixel *) f->cur.data[0]) + | 1243 | 1.29M | 4 * (t->by * PXSTRIDE(f->cur.stride[0]) + | 1244 | 1.29M | t->bx + init_x); | 1245 | 5.95M | for (x = init_x, t->bx += init_x; x < sub_w4; | 1246 | 4.65M | x += t_dim->w, t->bx += t_dim->w) | 1247 | 4.65M | { | 1248 | 4.65M | if (b->pal_sz[0]) goto skip_y_pred; | 1249 | | | 1250 | 4.65M | int angle = b->y_angle; | 1251 | 4.65M | const enum EdgeFlags edge_flags = | 1252 | 4.65M | (((y > init_y || !sb_has_tr) && (x + t_dim->w >= sub_w4)) ? 
| 1253 | 4.01M | 0 : EDGE_I444_TOP_HAS_RIGHT) | | 1254 | 4.65M | ((x > init_x || (!sb_has_bl && y + t_dim->h >= sub_h4)) ? | 1255 | 3.86M | 0 : EDGE_I444_LEFT_HAS_BOTTOM); | 1256 | 4.65M | const pixel *top_sb_edge = NULL; | 1257 | 4.65M | if (!(t->by & (f->sb_step - 1))) { | 1258 | 513k | top_sb_edge = f->ipred_edge[0]; | 1259 | 513k | const int sby = t->by >> f->sb_shift; | 1260 | 513k | top_sb_edge += f->sb128w * 128 * (sby - 1); | 1261 | 513k | } | 1262 | 4.65M | const enum IntraPredMode m = | 1263 | 4.65M | bytefn(dav1d_prepare_intra_edges)(t->bx, | 1264 | 4.65M | t->bx > ts->tiling.col_start, | 1265 | 4.65M | t->by, | 1266 | 4.65M | t->by > ts->tiling.row_start, | 1267 | 4.65M | ts->tiling.col_end, | 1268 | 4.65M | ts->tiling.row_end, | 1269 | 4.65M | edge_flags, dst, | 1270 | 4.65M | f->cur.stride[0], top_sb_edge, | 1271 | 4.65M | b->y_mode, &angle, | 1272 | 4.65M | t_dim->w, t_dim->h, | 1273 | 4.65M | f->seq_hdr->intra_edge_filter, | 1274 | 4.65M | edge HIGHBD_CALL_SUFFIX); | 1275 | 4.65M | dsp->ipred.intra_pred[m](dst, f->cur.stride[0], edge, | 1276 | 4.65M | t_dim->w * 4, t_dim->h * 4, | 1277 | 4.65M | angle | intra_flags, | 1278 | 4.65M | 4 * f->bw - 4 * t->bx, | 1279 | 4.65M | 4 * f->bh - 4 * t->by | 1280 | 4.65M | HIGHBD_CALL_SUFFIX); | 1281 | | | 1282 | 4.65M | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) { | 1283 | 0 | hex_dump(edge - t_dim->h * 4, t_dim->h * 4, | 1284 | 0 | t_dim->h * 4, 2, "l"); | 1285 | 0 | hex_dump(edge, 0, 1, 1, "tl"); | 1286 | 0 | hex_dump(edge + 1, t_dim->w * 4, | 1287 | 0 | t_dim->w * 4, 2, "t"); | 1288 | 0 | hex_dump(dst, f->cur.stride[0], | 1289 | 0 | t_dim->w * 4, t_dim->h * 4, "y-intra-pred"); | 1290 | 0 | } | 1291 | | | 1292 | 4.65M | skip_y_pred: {} | 1293 | 4.65M | if (!b->skip) { | 1294 | 893k | coef *cf; | 1295 | 893k | int eob; | 1296 | 893k | enum TxfmType txtp; | 1297 | 893k | if (t->frame_thread.pass) { | 1298 | 893k | const int p = t->frame_thread.pass & 1; | 1299 | 893k | const int cbi = *ts->frame_thread[p].cbi++; | 
1300 | 893k | cf = ts->frame_thread[p].cf; | 1301 | 893k | ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16; | 1302 | 893k | eob = cbi >> 5; | 1303 | 893k | txtp = cbi & 0x1f; | 1304 | 18.4E | } else { | 1305 | 18.4E | uint8_t cf_ctx; | 1306 | 18.4E | cf = bitfn(t->cf); | 1307 | 18.4E | eob = decode_coefs(t, &t->a->lcoef[bx4 + x], | 1308 | 18.4E | &t->l.lcoef[by4 + y], b->tx, bs, | 1309 | 18.4E | b, 1, 0, cf, &txtp, &cf_ctx); | 1310 | 18.4E | if (DEBUG_BLOCK_INFO) | 1311 | 0 | printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n", | 1312 | 0 | b->tx, txtp, eob, ts->msac.rng); | 1313 | 18.4E | dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx)); | 1314 | 18.4E | dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by)); | 1315 | 18.4E | } | 1316 | 893k | if (eob >= 0) { | 1317 | 733k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) | 1318 | 0 | coef_dump(cf, imin(t_dim->h, 8) * 4, | 1319 | 0 | imin(t_dim->w, 8) * 4, 3, "dq"); | 1320 | 733k | dsp->itx.itxfm_add[b->tx] | 1321 | 733k | [txtp](dst, | 1322 | 733k | f->cur.stride[0], | 1323 | 733k | cf, eob HIGHBD_CALL_SUFFIX); | 1324 | 733k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) | 1325 | 0 | hex_dump(dst, f->cur.stride[0], | 1326 | 0 | t_dim->w * 4, t_dim->h * 4, "recon"); | 1327 | 733k | } | 1328 | 3.76M | } else if (!t->frame_thread.pass) { | 1329 | 0 | dav1d_memset_pow2[t_dim->lw](&t->a->lcoef[bx4 + x], 0x40); | 1330 | 0 | dav1d_memset_pow2[t_dim->lh](&t->l.lcoef[by4 + y], 0x40); | 1331 | 0 | } | 1332 | 4.65M | dst += 4 * t_dim->w; | 1333 | 4.65M | } | 1334 | 1.30M | t->bx -= x; | 1335 | 1.30M | } | 1336 | 908k | t->by -= y; | 1337 | | | 1338 | 908k | if (!has_chroma) continue; | 1339 | | | 1340 | 746k | const ptrdiff_t stride = f->cur.stride[1]; | 1341 | | | 1342 | 746k | if (b->uv_mode == CFL_PRED) { | 1343 | 153k | assert(!init_x && !init_y); | 1344 | | | 1345 | 153k | int16_t *const ac = t->scratch.ac; | 1346 | 153k | pixel *y_src = 
((pixel *) f->cur.data[0]) + 4 * (t->bx & ~ss_hor) + | 1347 | 153k | 4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.stride[0]); | 1348 | 153k | const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) + | 1349 | 153k | (t->by >> ss_ver) * PXSTRIDE(stride)); | 1350 | 153k | pixel *const uv_dst[2] = { ((pixel *) f->cur.data[1]) + uv_off, | 1351 | 153k | ((pixel *) f->cur.data[2]) + uv_off }; | 1352 | | | 1353 | 153k | const int furthest_r = | 1354 | 153k | ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1); | 1355 | 153k | const int furthest_b = | 1356 | 153k | ((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1); | 1357 | 153k | dsp->ipred.cfl_ac[f->cur.p.layout - 1](ac, y_src, f->cur.stride[0], | 1358 | 153k | cbw4 - (furthest_r >> ss_hor), | 1359 | 153k | cbh4 - (furthest_b >> ss_ver), | 1360 | 153k | cbw4 * 4, cbh4 * 4); | 1361 | 459k | for (int pl = 0; pl < 2; pl++) { | 1362 | 306k | if (!b->cfl_alpha[pl]) continue; | 1363 | 257k | int angle = 0; | 1364 | 257k | const pixel *top_sb_edge = NULL; | 1365 | 257k | if (!((t->by & ~ss_ver) & (f->sb_step - 1))) { | 1366 | 72.0k | top_sb_edge = f->ipred_edge[pl + 1]; | 1367 | 72.0k | const int sby = t->by >> f->sb_shift; | 1368 | 72.0k | top_sb_edge += f->sb128w * 128 * (sby - 1); | 1369 | 72.0k | } | 1370 | 257k | const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver; | 1371 | 257k | const int xstart = ts->tiling.col_start >> ss_hor; | 1372 | 257k | const int ystart = ts->tiling.row_start >> ss_ver; | 1373 | 257k | const enum IntraPredMode m = | 1374 | 257k | bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart, | 1375 | 257k | ypos, ypos > ystart, | 1376 | 257k | ts->tiling.col_end >> ss_hor, | 1377 | 257k | ts->tiling.row_end >> ss_ver, | 1378 | 257k | 0, uv_dst[pl], stride, | 1379 | 257k | top_sb_edge, DC_PRED, &angle, | 1380 | 257k | uv_t_dim->w, uv_t_dim->h, 0, | 1381 | 257k | edge HIGHBD_CALL_SUFFIX); | 1382 | 257k | dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge, | 1383 | 257k | uv_t_dim->w * 4, | 1384 | 257k | uv_t_dim->h * 
4, | 1385 | 257k | ac, b->cfl_alpha[pl] | 1386 | 257k | HIGHBD_CALL_SUFFIX); | 1387 | 257k | } | 1388 | 153k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) { | 1389 | 0 | ac_dump(ac, 4*cbw4, 4*cbh4, "ac"); | 1390 | 0 | hex_dump(uv_dst[0], stride, cbw4 * 4, cbh4 * 4, "u-cfl-pred"); | 1391 | 0 | hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred"); | 1392 | 0 | } | 1393 | 592k | } else if (b->pal_sz[1]) { | 1394 | 2.72k | const ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) + | 1395 | 2.72k | (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1])); | 1396 | 2.72k | const pixel (*pal)[8]; | 1397 | 2.72k | const uint8_t *pal_idx; | 1398 | 2.72k | if (t->frame_thread.pass) { | 1399 | 2.72k | const int p = t->frame_thread.pass & 1; | 1400 | 2.72k | assert(ts->frame_thread[p].pal_idx); | 1401 | 2.72k | pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + | 1402 | 2.72k | ((t->bx >> 1) + (t->by & 1))]; | 1403 | 2.72k | pal_idx = ts->frame_thread[p].pal_idx; | 1404 | 2.72k | ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8; | 1405 | 2.72k | } else { | 1406 | 0 | pal = bytefn(t->scratch.pal); | 1407 | 0 | pal_idx = t->scratch.pal_idx_uv; | 1408 | 0 | } | 1409 | | | 1410 | 2.72k | f->dsp->ipred.pal_pred(((pixel *) f->cur.data[1]) + uv_dstoff, | 1411 | 2.72k | f->cur.stride[1], pal[1], | 1412 | 2.72k | pal_idx, cbw4 * 4, cbh4 * 4); | 1413 | 2.72k | f->dsp->ipred.pal_pred(((pixel *) f->cur.data[2]) + uv_dstoff, | 1414 | 2.72k | f->cur.stride[1], pal[2], | 1415 | 2.72k | pal_idx, cbw4 * 4, cbh4 * 4); | 1416 | 2.72k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) { | 1417 | 0 | hex_dump(((pixel *) f->cur.data[1]) + uv_dstoff, | 1418 | 0 | PXSTRIDE(f->cur.stride[1]), | 1419 | 0 | cbw4 * 4, cbh4 * 4, "u-pal-pred"); | 1420 | 0 | hex_dump(((pixel *) f->cur.data[2]) + uv_dstoff, | 1421 | 0 | PXSTRIDE(f->cur.stride[1]), | 1422 | 0 | cbw4 * 4, cbh4 * 4, "v-pal-pred"); | 1423 | 0 | } | 1424 | 2.72k | } | 1425 | | | 1426 | 746k | const int sm_uv_fl = sm_uv_flag(t->a, cbx4) | | 
1427 | 746k | sm_uv_flag(&t->l, cby4); | 1428 | 746k | const int uv_sb_has_tr = | 1429 | 746k | ((init_x + 16) >> ss_hor) < cw4 ? 1 : init_y ? 0 : | 1430 | 697k | intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.layout - 1)); | 1431 | 746k | const int uv_sb_has_bl = | 1432 | 746k | init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 1 : | 1433 | 697k | intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.layout - 1)); | 1434 | 746k | const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor); | 1435 | 2.23M | for (int pl = 0; pl < 2; pl++) { | 1436 | 3.22M | for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4; | 1437 | 1.73M | y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver) | 1438 | 1.73M | { | 1439 | 1.73M | pixel *dst = ((pixel *) f->cur.data[1 + pl]) + | 1440 | 1.73M | 4 * ((t->by >> ss_ver) * PXSTRIDE(stride) + | 1441 | 1.73M | ((t->bx + init_x) >> ss_hor)); | 1442 | 4.09M | for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4; | 1443 | 2.35M | x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor) | 1444 | 2.35M | { | 1445 | 2.35M | if ((b->uv_mode == CFL_PRED && b->cfl_alpha[pl]) || | 1446 | 2.09M | b->pal_sz[1]) | 1447 | 264k | { | 1448 | 264k | goto skip_uv_pred; | 1449 | 264k | } | 1450 | | | 1451 | 2.09M | int angle = b->uv_angle; | 1452 | | // this probably looks weird because we're using | 1453 | | // luma flags in a chroma loop, but that's because | 1454 | | // prepare_intra_edges() expects luma flags as input | 1455 | 2.09M | const enum EdgeFlags edge_flags = | 1456 | 2.09M | (((y > (init_y >> ss_ver) || !uv_sb_has_tr) && | 1457 | 1.06M | (x + uv_t_dim->w >= sub_cw4)) ? | 1458 | 1.53M | 0 : EDGE_I444_TOP_HAS_RIGHT) | | 1459 | 2.09M | ((x > (init_x >> ss_hor) || | 1460 | 1.47M | (!uv_sb_has_bl && y + uv_t_dim->h >= sub_ch4)) ? 
| 1461 | 1.31M | 0 : EDGE_I444_LEFT_HAS_BOTTOM); | 1462 | 2.09M | const pixel *top_sb_edge = NULL; | 1463 | 2.09M | if (!((t->by & ~ss_ver) & (f->sb_step - 1))) { | 1464 | 609k | top_sb_edge = f->ipred_edge[1 + pl]; | 1465 | 609k | const int sby = t->by >> f->sb_shift; | 1466 | 609k | top_sb_edge += f->sb128w * 128 * (sby - 1); | 1467 | 609k | } | 1468 | 2.09M | const enum IntraPredMode uv_mode = | 1469 | 2.09M | b->uv_mode == CFL_PRED ? DC_PRED : b->uv_mode; | 1470 | 2.09M | const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver; | 1471 | 2.09M | const int xstart = ts->tiling.col_start >> ss_hor; | 1472 | 2.09M | const int ystart = ts->tiling.row_start >> ss_ver; | 1473 | 2.09M | const enum IntraPredMode m = | 1474 | 2.09M | bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart, | 1475 | 2.09M | ypos, ypos > ystart, | 1476 | 2.09M | ts->tiling.col_end >> ss_hor, | 1477 | 2.09M | ts->tiling.row_end >> ss_ver, | 1478 | 2.09M | edge_flags, dst, stride, | 1479 | 2.09M | top_sb_edge, uv_mode, | 1480 | 2.09M | &angle, uv_t_dim->w, | 1481 | 2.09M | uv_t_dim->h, | 1482 | 2.09M | f->seq_hdr->intra_edge_filter, | 1483 | 2.09M | edge HIGHBD_CALL_SUFFIX); | 1484 | 2.09M | angle |= intra_edge_filter_flag; | 1485 | 2.09M | dsp->ipred.intra_pred[m](dst, stride, edge, | 1486 | 2.09M | uv_t_dim->w * 4, | 1487 | 2.09M | uv_t_dim->h * 4, | 1488 | 2.09M | angle | sm_uv_fl, | 1489 | 2.09M | (4 * f->bw + ss_hor - | 1490 | 2.09M | 4 * (t->bx & ~ss_hor)) >> ss_hor, | 1491 | 2.09M | (4 * f->bh + ss_ver - | 1492 | 2.09M | 4 * (t->by & ~ss_ver)) >> ss_ver | 1493 | 2.09M | HIGHBD_CALL_SUFFIX); | 1494 | 2.09M | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) { | 1495 | 0 | hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4, | 1496 | 0 | uv_t_dim->h * 4, 2, "l"); | 1497 | 0 | hex_dump(edge, 0, 1, 1, "tl"); | 1498 | 0 | hex_dump(edge + 1, uv_t_dim->w * 4, | 1499 | 0 | uv_t_dim->w * 4, 2, "t"); | 1500 | 0 | hex_dump(dst, stride, uv_t_dim->w * 4, | 1501 | 0 | uv_t_dim->h * 4, pl ? 
"v-intra-pred" : "u-intra-pred"); | 1502 | 0 | } | 1503 | | | 1504 | 2.35M | skip_uv_pred: {} | 1505 | 2.35M | if (!b->skip) { | 1506 | 679k | enum TxfmType txtp; | 1507 | 679k | int eob; | 1508 | 679k | coef *cf; | 1509 | 679k | if (t->frame_thread.pass) { | 1510 | 679k | const int p = t->frame_thread.pass & 1; | 1511 | 679k | const int cbi = *ts->frame_thread[p].cbi++; | 1512 | 679k | cf = ts->frame_thread[p].cf; | 1513 | 679k | ts->frame_thread[p].cf += uv_t_dim->w * uv_t_dim->h * 16; | 1514 | 679k | eob = cbi >> 5; | 1515 | 679k | txtp = cbi & 0x1f; | 1516 | 18.4E | } else { | 1517 | 18.4E | uint8_t cf_ctx; | 1518 | 18.4E | cf = bitfn(t->cf); | 1519 | 18.4E | eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x], | 1520 | 18.4E | &t->l.ccoef[pl][cby4 + y], | 1521 | 18.4E | b->uvtx, bs, b, 1, 1 + pl, cf, | 1522 | 18.4E | &txtp, &cf_ctx); | 1523 | 18.4E | if (DEBUG_BLOCK_INFO) | 1524 | 0 | printf("Post-uv-cf-blk[pl=%d,tx=%d," | 1525 | 0 | "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n", | 1526 | 0 | pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4); | 1527 | 18.4E | int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor); | 1528 | 18.4E | int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver); | 1529 | 18.4E | dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw); | 1530 | 18.4E | dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth); | 1531 | 18.4E | } | 1532 | 679k | if (eob >= 0) { | 1533 | 286k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) | 1534 | 0 | coef_dump(cf, uv_t_dim->h * 4, | 1535 | 0 | uv_t_dim->w * 4, 3, "dq"); | 1536 | 286k | dsp->itx.itxfm_add[b->uvtx] | 1537 | 286k | [txtp](dst, stride, | 1538 | 286k | cf, eob HIGHBD_CALL_SUFFIX); | 1539 | 286k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) | 1540 | 0 | hex_dump(dst, stride, uv_t_dim->w * 4, | 1541 | 0 | uv_t_dim->h * 4, "recon"); | 1542 | 286k | } | 1543 | 1.67M | } else if (!t->frame_thread.pass) { | 1544 | 0 | dav1d_memset_pow2[uv_t_dim->lw](&t->a->ccoef[pl][cbx4 + x], 0x40); 
| 1545 | 0 | dav1d_memset_pow2[uv_t_dim->lh](&t->l.ccoef[pl][cby4 + y], 0x40); | 1546 | 0 | } | 1547 | 2.35M | dst += uv_t_dim->w * 4; | 1548 | 2.35M | } | 1549 | 1.73M | t->bx -= x << ss_hor; | 1550 | 1.73M | } | 1551 | 1.49M | t->by -= y << ss_ver; | 1552 | 1.49M | } | 1553 | 746k | } | 1554 | 848k | } | 1555 | 809k | } |
dav1d_recon_b_intra_16bpc Line | Count | Source | 1179 | 1.00M | { | 1180 | 1.00M | Dav1dTileState *const ts = t->ts; | 1181 | 1.00M | const Dav1dFrameContext *const f = t->f; | 1182 | 1.00M | const Dav1dDSPContext *const dsp = f->dsp; | 1183 | 1.00M | const int bx4 = t->bx & 31, by4 = t->by & 31; | 1184 | 1.00M | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 1185 | 1.00M | const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; | 1186 | 1.00M | const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver; | 1187 | 1.00M | const uint8_t *const b_dim = dav1d_block_dimensions[bs]; | 1188 | 1.00M | const int bw4 = b_dim[0], bh4 = b_dim[1]; | 1189 | 1.00M | const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by); | 1190 | 1.00M | const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver; | 1191 | 1.00M | const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 && | 1192 | 880k | (bw4 > ss_hor || t->bx & 1) && | 1193 | 829k | (bh4 > ss_ver || t->by & 1); | 1194 | 1.00M | const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx]; | 1195 | 1.00M | const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx]; | 1196 | | | 1197 | | // coefficient coding | 1198 | 1.00M | pixel *const edge = bitfn(t->scratch.edge) + 128; | 1199 | 1.00M | const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver; | 1200 | | | 1201 | 1.00M | const int intra_edge_filter_flag = f->seq_hdr->intra_edge_filter << 10; | 1202 | | | 1203 | 2.06M | for (int init_y = 0; init_y < h4; init_y += 16) { | 1204 | 1.06M | const int sub_h4 = imin(h4, 16 + init_y); | 1205 | 1.06M | const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver); | 1206 | 2.21M | for (int init_x = 0; init_x < w4; init_x += 16) { | 1207 | 1.15M | if (b->pal_sz[0]) { | 1208 | 4.11k | pixel *dst = ((pixel *) f->cur.data[0]) + | 1209 | 4.11k | 4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx); | 1210 | 4.11k | const uint8_t *pal_idx; | 1211 | 4.11k | if 
(t->frame_thread.pass) { | 1212 | 4.11k | const int p = t->frame_thread.pass & 1; | 1213 | 4.11k | assert(ts->frame_thread[p].pal_idx); | 1214 | 4.11k | pal_idx = ts->frame_thread[p].pal_idx; | 1215 | 4.11k | ts->frame_thread[p].pal_idx += bw4 * bh4 * 8; | 1216 | 4.11k | } else { | 1217 | 0 | pal_idx = t->scratch.pal_idx_y; | 1218 | 0 | } | 1219 | 4.11k | const pixel *const pal = t->frame_thread.pass ? | 1220 | 4.11k | f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + | 1221 | 4.11k | ((t->bx >> 1) + (t->by & 1))][0] : | 1222 | 4.11k | bytefn(t->scratch.pal)[0]; | 1223 | 4.11k | f->dsp->ipred.pal_pred(dst, f->cur.stride[0], pal, | 1224 | 4.11k | pal_idx, bw4 * 4, bh4 * 4); | 1225 | 4.11k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) | 1226 | 0 | hex_dump(dst, PXSTRIDE(f->cur.stride[0]), | 1227 | 0 | bw4 * 4, bh4 * 4, "y-pal-pred"); | 1228 | 4.11k | } | 1229 | | | 1230 | 1.15M | const int intra_flags = (sm_flag(t->a, bx4) | | 1231 | 1.15M | sm_flag(&t->l, by4) | | 1232 | 1.15M | intra_edge_filter_flag); | 1233 | 1.15M | const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 : | 1234 | 1.06M | intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT; | 1235 | 1.15M | const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 
1 : | 1236 | 1.06M | intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM; | 1237 | 1.15M | int y, x; | 1238 | 1.15M | const int sub_w4 = imin(w4, init_x + 16); | 1239 | 2.88M | for (y = init_y, t->by += init_y; y < sub_h4; | 1240 | 1.73M | y += t_dim->h, t->by += t_dim->h) | 1241 | 1.73M | { | 1242 | 1.73M | pixel *dst = ((pixel *) f->cur.data[0]) + | 1243 | 1.73M | 4 * (t->by * PXSTRIDE(f->cur.stride[0]) + | 1244 | 1.73M | t->bx + init_x); | 1245 | 8.79M | for (x = init_x, t->bx += init_x; x < sub_w4; | 1246 | 7.06M | x += t_dim->w, t->bx += t_dim->w) | 1247 | 7.06M | { | 1248 | 7.06M | if (b->pal_sz[0]) goto skip_y_pred; | 1249 | | | 1250 | 7.05M | int angle = b->y_angle; | 1251 | 7.05M | const enum EdgeFlags edge_flags = | 1252 | 7.05M | (((y > init_y || !sb_has_tr) && (x + t_dim->w >= sub_w4)) ? | 1253 | 6.13M | 0 : EDGE_I444_TOP_HAS_RIGHT) | | 1254 | 7.05M | ((x > init_x || (!sb_has_bl && y + t_dim->h >= sub_h4)) ? | 1255 | 5.98M | 0 : EDGE_I444_LEFT_HAS_BOTTOM); | 1256 | 7.05M | const pixel *top_sb_edge = NULL; | 1257 | 7.05M | if (!(t->by & (f->sb_step - 1))) { | 1258 | 671k | top_sb_edge = f->ipred_edge[0]; | 1259 | 671k | const int sby = t->by >> f->sb_shift; | 1260 | 671k | top_sb_edge += f->sb128w * 128 * (sby - 1); | 1261 | 671k | } | 1262 | 7.05M | const enum IntraPredMode m = | 1263 | 7.05M | bytefn(dav1d_prepare_intra_edges)(t->bx, | 1264 | 7.05M | t->bx > ts->tiling.col_start, | 1265 | 7.05M | t->by, | 1266 | 7.05M | t->by > ts->tiling.row_start, | 1267 | 7.05M | ts->tiling.col_end, | 1268 | 7.05M | ts->tiling.row_end, | 1269 | 7.05M | edge_flags, dst, | 1270 | 7.05M | f->cur.stride[0], top_sb_edge, | 1271 | 7.05M | b->y_mode, &angle, | 1272 | 7.05M | t_dim->w, t_dim->h, | 1273 | 7.05M | f->seq_hdr->intra_edge_filter, | 1274 | 7.05M | edge HIGHBD_CALL_SUFFIX); | 1275 | 7.05M | dsp->ipred.intra_pred[m](dst, f->cur.stride[0], edge, | 1276 | 7.05M | t_dim->w * 4, t_dim->h * 4, | 1277 | 7.05M | angle | intra_flags, | 1278 | 7.05M | 4 * f->bw - 4 * t->bx, | 
1279 | 7.05M | 4 * f->bh - 4 * t->by | 1280 | 7.05M | HIGHBD_CALL_SUFFIX); | 1281 | | | 1282 | 7.05M | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) { | 1283 | 0 | hex_dump(edge - t_dim->h * 4, t_dim->h * 4, | 1284 | 0 | t_dim->h * 4, 2, "l"); | 1285 | 0 | hex_dump(edge, 0, 1, 1, "tl"); | 1286 | 0 | hex_dump(edge + 1, t_dim->w * 4, | 1287 | 0 | t_dim->w * 4, 2, "t"); | 1288 | 0 | hex_dump(dst, f->cur.stride[0], | 1289 | 0 | t_dim->w * 4, t_dim->h * 4, "y-intra-pred"); | 1290 | 0 | } | 1291 | | | 1292 | 7.06M | skip_y_pred: {} | 1293 | 7.06M | if (!b->skip) { | 1294 | 808k | coef *cf; | 1295 | 808k | int eob; | 1296 | 808k | enum TxfmType txtp; | 1297 | 808k | if (t->frame_thread.pass) { | 1298 | 808k | const int p = t->frame_thread.pass & 1; | 1299 | 808k | const int cbi = *ts->frame_thread[p].cbi++; | 1300 | 808k | cf = ts->frame_thread[p].cf; | 1301 | 808k | ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16; | 1302 | 808k | eob = cbi >> 5; | 1303 | 808k | txtp = cbi & 0x1f; | 1304 | 18.4E | } else { | 1305 | 18.4E | uint8_t cf_ctx; | 1306 | 18.4E | cf = bitfn(t->cf); | 1307 | 18.4E | eob = decode_coefs(t, &t->a->lcoef[bx4 + x], | 1308 | 18.4E | &t->l.lcoef[by4 + y], b->tx, bs, | 1309 | 18.4E | b, 1, 0, cf, &txtp, &cf_ctx); | 1310 | 18.4E | if (DEBUG_BLOCK_INFO) | 1311 | 0 | printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n", | 1312 | 0 | b->tx, txtp, eob, ts->msac.rng); | 1313 | 18.4E | dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx)); | 1314 | 18.4E | dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by)); | 1315 | 18.4E | } | 1316 | 808k | if (eob >= 0) { | 1317 | 556k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) | 1318 | 0 | coef_dump(cf, imin(t_dim->h, 8) * 4, | 1319 | 0 | imin(t_dim->w, 8) * 4, 3, "dq"); | 1320 | 556k | dsp->itx.itxfm_add[b->tx] | 1321 | 556k | [txtp](dst, | 1322 | 556k | f->cur.stride[0], | 1323 | 556k | cf, eob HIGHBD_CALL_SUFFIX); | 1324 | 556k | if 
(DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) | 1325 | 0 | hex_dump(dst, f->cur.stride[0], | 1326 | 0 | t_dim->w * 4, t_dim->h * 4, "recon"); | 1327 | 556k | } | 1328 | 6.25M | } else if (!t->frame_thread.pass) { | 1329 | 0 | dav1d_memset_pow2[t_dim->lw](&t->a->lcoef[bx4 + x], 0x40); | 1330 | 0 | dav1d_memset_pow2[t_dim->lh](&t->l.lcoef[by4 + y], 0x40); | 1331 | 0 | } | 1332 | 7.06M | dst += 4 * t_dim->w; | 1333 | 7.06M | } | 1334 | 1.73M | t->bx -= x; | 1335 | 1.73M | } | 1336 | 1.15M | t->by -= y; | 1337 | | | 1338 | 1.15M | if (!has_chroma) continue; | 1339 | | | 1340 | 853k | const ptrdiff_t stride = f->cur.stride[1]; | 1341 | | | 1342 | 853k | if (b->uv_mode == CFL_PRED) { | 1343 | 170k | assert(!init_x && !init_y); | 1344 | | | 1345 | 170k | int16_t *const ac = t->scratch.ac; | 1346 | 170k | pixel *y_src = ((pixel *) f->cur.data[0]) + 4 * (t->bx & ~ss_hor) + | 1347 | 170k | 4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.stride[0]); | 1348 | 170k | const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) + | 1349 | 170k | (t->by >> ss_ver) * PXSTRIDE(stride)); | 1350 | 170k | pixel *const uv_dst[2] = { ((pixel *) f->cur.data[1]) + uv_off, | 1351 | 170k | ((pixel *) f->cur.data[2]) + uv_off }; | 1352 | | | 1353 | 170k | const int furthest_r = | 1354 | 170k | ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1); | 1355 | 170k | const int furthest_b = | 1356 | 170k | ((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1); | 1357 | 170k | dsp->ipred.cfl_ac[f->cur.p.layout - 1](ac, y_src, f->cur.stride[0], | 1358 | 170k | cbw4 - (furthest_r >> ss_hor), | 1359 | 170k | cbh4 - (furthest_b >> ss_ver), | 1360 | 170k | cbw4 * 4, cbh4 * 4); | 1361 | 511k | for (int pl = 0; pl < 2; pl++) { | 1362 | 341k | if (!b->cfl_alpha[pl]) continue; | 1363 | 284k | int angle = 0; | 1364 | 284k | const pixel *top_sb_edge = NULL; | 1365 | 284k | if (!((t->by & ~ss_ver) & (f->sb_step - 1))) { | 1366 | 70.5k | top_sb_edge = f->ipred_edge[pl + 1]; | 1367 | 70.5k | const int sby = t->by >> f->sb_shift; | 1368 | 70.5k | 
top_sb_edge += f->sb128w * 128 * (sby - 1); | 1369 | 70.5k | } | 1370 | 284k | const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver; | 1371 | 284k | const int xstart = ts->tiling.col_start >> ss_hor; | 1372 | 284k | const int ystart = ts->tiling.row_start >> ss_ver; | 1373 | 284k | const enum IntraPredMode m = | 1374 | 284k | bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart, | 1375 | 284k | ypos, ypos > ystart, | 1376 | 284k | ts->tiling.col_end >> ss_hor, | 1377 | 284k | ts->tiling.row_end >> ss_ver, | 1378 | 284k | 0, uv_dst[pl], stride, | 1379 | 284k | top_sb_edge, DC_PRED, &angle, | 1380 | 284k | uv_t_dim->w, uv_t_dim->h, 0, | 1381 | 284k | edge HIGHBD_CALL_SUFFIX); | 1382 | 284k | dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge, | 1383 | 284k | uv_t_dim->w * 4, | 1384 | 284k | uv_t_dim->h * 4, | 1385 | 284k | ac, b->cfl_alpha[pl] | 1386 | 284k | HIGHBD_CALL_SUFFIX); | 1387 | 284k | } | 1388 | 170k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) { | 1389 | 0 | ac_dump(ac, 4*cbw4, 4*cbh4, "ac"); | 1390 | 0 | hex_dump(uv_dst[0], stride, cbw4 * 4, cbh4 * 4, "u-cfl-pred"); | 1391 | 0 | hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred"); | 1392 | 0 | } | 1393 | 682k | } else if (b->pal_sz[1]) { | 1394 | 1.35k | const ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) + | 1395 | 1.35k | (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1])); | 1396 | 1.35k | const pixel (*pal)[8]; | 1397 | 1.35k | const uint8_t *pal_idx; | 1398 | 1.35k | if (t->frame_thread.pass) { | 1399 | 1.35k | const int p = t->frame_thread.pass & 1; | 1400 | 1.35k | assert(ts->frame_thread[p].pal_idx); | 1401 | 1.35k | pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + | 1402 | 1.35k | ((t->bx >> 1) + (t->by & 1))]; | 1403 | 1.35k | pal_idx = ts->frame_thread[p].pal_idx; | 1404 | 1.35k | ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8; | 1405 | 1.35k | } else { | 1406 | 0 | pal = bytefn(t->scratch.pal); | 1407 | 0 | pal_idx = t->scratch.pal_idx_uv; | 1408 | 0 | } | 
1409 | | | 1410 | 1.35k | f->dsp->ipred.pal_pred(((pixel *) f->cur.data[1]) + uv_dstoff, | 1411 | 1.35k | f->cur.stride[1], pal[1], | 1412 | 1.35k | pal_idx, cbw4 * 4, cbh4 * 4); | 1413 | 1.35k | f->dsp->ipred.pal_pred(((pixel *) f->cur.data[2]) + uv_dstoff, | 1414 | 1.35k | f->cur.stride[1], pal[2], | 1415 | 1.35k | pal_idx, cbw4 * 4, cbh4 * 4); | 1416 | 1.35k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) { | 1417 | 0 | hex_dump(((pixel *) f->cur.data[1]) + uv_dstoff, | 1418 | 0 | PXSTRIDE(f->cur.stride[1]), | 1419 | 0 | cbw4 * 4, cbh4 * 4, "u-pal-pred"); | 1420 | 0 | hex_dump(((pixel *) f->cur.data[2]) + uv_dstoff, | 1421 | 0 | PXSTRIDE(f->cur.stride[1]), | 1422 | 0 | cbw4 * 4, cbh4 * 4, "v-pal-pred"); | 1423 | 0 | } | 1424 | 1.35k | } | 1425 | | | 1426 | 853k | const int sm_uv_fl = sm_uv_flag(t->a, cbx4) | | 1427 | 853k | sm_uv_flag(&t->l, cby4); | 1428 | 853k | const int uv_sb_has_tr = | 1429 | 853k | ((init_x + 16) >> ss_hor) < cw4 ? 1 : init_y ? 0 : | 1430 | 807k | intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.layout - 1)); | 1431 | 853k | const int uv_sb_has_bl = | 1432 | 853k | init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 
1 : | 1433 | 807k | intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.layout - 1)); | 1434 | 853k | const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor); | 1435 | 2.56M | for (int pl = 0; pl < 2; pl++) { | 1436 | 3.65M | for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4; | 1437 | 1.94M | y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver) | 1438 | 1.94M | { | 1439 | 1.94M | pixel *dst = ((pixel *) f->cur.data[1 + pl]) + | 1440 | 1.94M | 4 * ((t->by >> ss_ver) * PXSTRIDE(stride) + | 1441 | 1.94M | ((t->bx + init_x) >> ss_hor)); | 1442 | 4.47M | for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4; | 1443 | 2.52M | x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor) | 1444 | 2.52M | { | 1445 | 2.52M | if ((b->uv_mode == CFL_PRED && b->cfl_alpha[pl]) || | 1446 | 2.24M | b->pal_sz[1]) | 1447 | 287k | { | 1448 | 287k | goto skip_uv_pred; | 1449 | 287k | } | 1450 | | | 1451 | 2.23M | int angle = b->uv_angle; | 1452 | | // this probably looks weird because we're using | 1453 | | // luma flags in a chroma loop, but that's because | 1454 | | // prepare_intra_edges() expects luma flags as input | 1455 | 2.23M | const enum EdgeFlags edge_flags = | 1456 | 2.23M | (((y > (init_y >> ss_ver) || !uv_sb_has_tr) && | 1457 | 1.09M | (x + uv_t_dim->w >= sub_cw4)) ? | 1458 | 1.59M | 0 : EDGE_I444_TOP_HAS_RIGHT) | | 1459 | 2.23M | ((x > (init_x >> ss_hor) || | 1460 | 1.66M | (!uv_sb_has_bl && y + uv_t_dim->h >= sub_ch4)) ? | 1461 | 1.40M | 0 : EDGE_I444_LEFT_HAS_BOTTOM); | 1462 | 2.23M | const pixel *top_sb_edge = NULL; | 1463 | 2.23M | if (!((t->by & ~ss_ver) & (f->sb_step - 1))) { | 1464 | 652k | top_sb_edge = f->ipred_edge[1 + pl]; | 1465 | 652k | const int sby = t->by >> f->sb_shift; | 1466 | 652k | top_sb_edge += f->sb128w * 128 * (sby - 1); | 1467 | 652k | } | 1468 | 2.23M | const enum IntraPredMode uv_mode = | 1469 | 2.23M | b->uv_mode == CFL_PRED ? 
DC_PRED : b->uv_mode; | 1470 | 2.23M | const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver; | 1471 | 2.23M | const int xstart = ts->tiling.col_start >> ss_hor; | 1472 | 2.23M | const int ystart = ts->tiling.row_start >> ss_ver; | 1473 | 2.23M | const enum IntraPredMode m = | 1474 | 2.23M | bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart, | 1475 | 2.23M | ypos, ypos > ystart, | 1476 | 2.23M | ts->tiling.col_end >> ss_hor, | 1477 | 2.23M | ts->tiling.row_end >> ss_ver, | 1478 | 2.23M | edge_flags, dst, stride, | 1479 | 2.23M | top_sb_edge, uv_mode, | 1480 | 2.23M | &angle, uv_t_dim->w, | 1481 | 2.23M | uv_t_dim->h, | 1482 | 2.23M | f->seq_hdr->intra_edge_filter, | 1483 | 2.23M | edge HIGHBD_CALL_SUFFIX); | 1484 | 2.23M | angle |= intra_edge_filter_flag; | 1485 | 2.23M | dsp->ipred.intra_pred[m](dst, stride, edge, | 1486 | 2.23M | uv_t_dim->w * 4, | 1487 | 2.23M | uv_t_dim->h * 4, | 1488 | 2.23M | angle | sm_uv_fl, | 1489 | 2.23M | (4 * f->bw + ss_hor - | 1490 | 2.23M | 4 * (t->bx & ~ss_hor)) >> ss_hor, | 1491 | 2.23M | (4 * f->bh + ss_ver - | 1492 | 2.23M | 4 * (t->by & ~ss_ver)) >> ss_ver | 1493 | 2.23M | HIGHBD_CALL_SUFFIX); | 1494 | 2.23M | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) { | 1495 | 0 | hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4, | 1496 | 0 | uv_t_dim->h * 4, 2, "l"); | 1497 | 0 | hex_dump(edge, 0, 1, 1, "tl"); | 1498 | 0 | hex_dump(edge + 1, uv_t_dim->w * 4, | 1499 | 0 | uv_t_dim->w * 4, 2, "t"); | 1500 | 0 | hex_dump(dst, stride, uv_t_dim->w * 4, | 1501 | 0 | uv_t_dim->h * 4, pl ? 
"v-intra-pred" : "u-intra-pred"); | 1502 | 0 | } | 1503 | | | 1504 | 2.52M | skip_uv_pred: {} | 1505 | 2.52M | if (!b->skip) { | 1506 | 927k | enum TxfmType txtp; | 1507 | 927k | int eob; | 1508 | 927k | coef *cf; | 1509 | 927k | if (t->frame_thread.pass) { | 1510 | 927k | const int p = t->frame_thread.pass & 1; | 1511 | 927k | const int cbi = *ts->frame_thread[p].cbi++; | 1512 | 927k | cf = ts->frame_thread[p].cf; | 1513 | 927k | ts->frame_thread[p].cf += uv_t_dim->w * uv_t_dim->h * 16; | 1514 | 927k | eob = cbi >> 5; | 1515 | 927k | txtp = cbi & 0x1f; | 1516 | 18.4E | } else { | 1517 | 18.4E | uint8_t cf_ctx; | 1518 | 18.4E | cf = bitfn(t->cf); | 1519 | 18.4E | eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x], | 1520 | 18.4E | &t->l.ccoef[pl][cby4 + y], | 1521 | 18.4E | b->uvtx, bs, b, 1, 1 + pl, cf, | 1522 | 18.4E | &txtp, &cf_ctx); | 1523 | 18.4E | if (DEBUG_BLOCK_INFO) | 1524 | 0 | printf("Post-uv-cf-blk[pl=%d,tx=%d," | 1525 | 0 | "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n", | 1526 | 0 | pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4); | 1527 | 18.4E | int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor); | 1528 | 18.4E | int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver); | 1529 | 18.4E | dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw); | 1530 | 18.4E | dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth); | 1531 | 18.4E | } | 1532 | 927k | if (eob >= 0) { | 1533 | 292k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) | 1534 | 0 | coef_dump(cf, uv_t_dim->h * 4, | 1535 | 0 | uv_t_dim->w * 4, 3, "dq"); | 1536 | 292k | dsp->itx.itxfm_add[b->uvtx] | 1537 | 292k | [txtp](dst, stride, | 1538 | 292k | cf, eob HIGHBD_CALL_SUFFIX); | 1539 | 292k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) | 1540 | 0 | hex_dump(dst, stride, uv_t_dim->w * 4, | 1541 | 0 | uv_t_dim->h * 4, "recon"); | 1542 | 292k | } | 1543 | 1.59M | } else if (!t->frame_thread.pass) { | 1544 | 0 | dav1d_memset_pow2[uv_t_dim->lw](&t->a->ccoef[pl][cbx4 + x], 0x40); 
| 1545 | 0 | dav1d_memset_pow2[uv_t_dim->lh](&t->l.ccoef[pl][cby4 + y], 0x40); | 1546 | 0 | } | 1547 | 2.52M | dst += uv_t_dim->w * 4; | 1548 | 2.52M | } | 1549 | 1.94M | t->bx -= x << ss_hor; | 1550 | 1.94M | } | 1551 | 1.70M | t->by -= y << ss_ver; | 1552 | 1.70M | } | 1553 | 853k | } | 1554 | 1.06M | } | 1555 | 1.00M | } |
|
1556 | | |
/*
 * dav1d_recon_b_inter: reconstruct one inter-predicted (or intrabc) block.
 *
 * Listing format: every row below is `source line | execution count | source |`.
 *
 * The block `b` of size `bs` is located at the current task position
 * (t->bx, t->by), expressed in units of 4 pixels (pixel offsets are obtained
 * by multiplying by 4). The function first writes the prediction into the
 * current frame (f->cur) and then, unless b->skip is set, handles the residual.
 *
 * Prediction takes one of three paths:
 *  - IS_KEY_OR_INTRA(f->frame_hdr): intrabc, i.e. mc() from f->sr_cur with
 *    FILTER_2D_BILINEAR;
 *  - b->comp_type == COMP_INTER_NONE: single-reference prediction through
 *    warp_affine() or mc(), optionally followed by obmc() and by an
 *    inter-intra blend when b->interintra_type is non-zero;
 *  - otherwise: compound prediction, where one prediction per reference is
 *    rendered into t->scratch.compinter and the two are combined with
 *    avg / w_avg / w_mask / mask depending on b->comp_type.
 *
 * Residual: when b->skip is set, the coefficient contexts covered by the
 * block are set to 0x40 and the function returns 0. Otherwise luma transform
 * blocks are handed to read_coef_tree(), and chroma coefficients are either
 * taken from the per-tile frame_thread buffers (t->frame_thread.pass != 0)
 * or decoded with decode_coefs(); chroma blocks with eob >= 0 are then passed
 * to dsp->itx.itxfm_add.
 *
 * t->bx and t->by are advanced while iterating over transform blocks and are
 * brought back to their entry values by the matching subtractions.
 *
 * Returns 0 on success, or the first non-zero value returned by mc(),
 * warp_affine() or obmc().
 */
1557 | | int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize bs, |
1558 | | const Av1Block *const b) |
1559 | 357k | { |
1560 | 357k | Dav1dTileState *const ts = t->ts; |
1561 | 357k | const Dav1dFrameContext *const f = t->f; |
1562 | 357k | const Dav1dDSPContext *const dsp = f->dsp; |
// Block position masked to 0..31; bx4/by4 index the above (t->a) and left (t->l) context arrays below.
1563 | 357k | const int bx4 = t->bx & 31, by4 = t->by & 31; |
// ss_ver is 1 only for I420, ss_hor is 1 for every layout except I444; both are used
// as right-shifts that turn luma coordinates/sizes into chroma ones.
1564 | 357k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; |
1565 | 357k | const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; |
1566 | 357k | const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver; |
1567 | 357k | const uint8_t *const b_dim = dav1d_block_dimensions[bs]; |
1568 | 357k | const int bw4 = b_dim[0], bh4 = b_dim[1]; |
// w4/h4: block size clipped to what is left of the frame (f->bw / f->bh).
1569 | 357k | const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by); |
// Chroma is handled by this call only if the layout is not I400 and, in each direction,
// either the block is larger than the subsampling shift or the position is odd.
1570 | 357k | const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 && |
1571 | 272k | (bw4 > ss_hor || t->bx & 1) && |
1572 | 258k | (bh4 > ss_ver || t->by & 1); |
1573 | 357k | const int chr_layout_idx = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 : |
1574 | 357k | DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.layout; |
1575 | 357k | int res; |
1576 | | |
1577 | | // prediction |
1578 | 357k | const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor; |
// dst: luma destination of the block. uvdstoff: element offset of the block inside a
// chroma plane; the same offset is applied to f->cur.data[1] and f->cur.data[2].
1579 | 357k | pixel *dst = ((pixel *) f->cur.data[0]) + |
1580 | 357k | 4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx); |
1581 | 357k | const ptrdiff_t uvdstoff = |
1582 | 357k | 4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1])); |
1583 | 357k | if (IS_KEY_OR_INTRA(f->frame_hdr)) { |
1584 | | // intrabc |
1585 | 137k | assert(!f->frame_hdr->super_res.enabled); |
1586 | 137k | res = mc(t, dst, NULL, f->cur.stride[0], bw4, bh4, t->bx, t->by, 0, |
1587 | 137k | b->mv[0], &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR); |
1588 | 137k | if (res) return res; |
1589 | 246k | if (has_chroma) for (int pl = 1; pl < 3; pl++) { |
// When bw4 == ss_hor (or bh4 == ss_ver) the size is doubled and the position is
// rounded down to an even unit before predicting chroma.
1590 | 164k | res = mc(t, ((pixel *)f->cur.data[pl]) + uvdstoff, NULL, f->cur.stride[1], |
1591 | 164k | bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver), |
1592 | 164k | t->bx & ~ss_hor, t->by & ~ss_ver, pl, b->mv[0], |
1593 | 164k | &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR); |
1594 | 164k | if (res) return res; |
1595 | 164k | } |
1596 | 220k | } else if (b->comp_type == COMP_INTER_NONE) { |
1597 | 184k | const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]]; |
1598 | 184k | const enum Filter2d filter_2d = b->filter2d; |
1599 | | |
// Luma: warp_affine() is chosen for blocks larger than one unit in both directions when
// either GLOBALMV is used with warping allowed for this reference, or the block uses
// MM_WARP with a warp model beyond pure translation. Everything else goes through mc().
1600 | 184k | if (imin(bw4, bh4) > 1 && |
1601 | 111k | ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) || |
1602 | 106k | (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION))) |
1603 | 7.83k | { |
1604 | 7.83k | res = warp_affine(t, dst, NULL, f->cur.stride[0], b_dim, 0, refp, |
1605 | 7.83k | b->motion_mode == MM_WARP ? &t->warpmv : |
1606 | 7.83k | &f->frame_hdr->gmv[b->ref[0]]); |
1607 | 7.83k | if (res) return res; |
1608 | 176k | } else { |
1609 | 176k | res = mc(t, dst, NULL, f->cur.stride[0], |
1610 | 176k | bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, b->ref[0], filter_2d); |
1611 | 176k | if (res) return res; |
1612 | 176k | if (b->motion_mode == MM_OBMC) { |
1613 | 30.0k | res = obmc(t, dst, f->cur.stride[0], b_dim, 0, bx4, by4, w4, h4); |
1614 | 30.0k | if (res) return res; |
1615 | 30.0k | } |
1616 | 176k | } |
// Inter-intra (luma): an intra prediction is generated into t->scratch.interintra and
// blended over the inter prediction in dst using II_MASK.
1617 | 184k | if (b->interintra_type) { |
1618 | 5.73k | pixel *const tl_edge = bitfn(t->scratch.edge) + 32; |
1619 | 5.73k | enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ? |
1620 | 4.67k | SMOOTH_PRED : b->interintra_mode; |
1621 | 5.73k | pixel *const tmp = bitfn(t->scratch.interintra); |
1622 | 5.73k | int angle = 0; |
1623 | 5.73k | const pixel *top_sb_edge = NULL; |
// When the low bits of t->by selected by (f->sb_step - 1) are all zero, pass a pointer
// into f->ipred_edge[0] offset for superblock row sby - 1; otherwise NULL is passed.
1624 | 5.73k | if (!(t->by & (f->sb_step - 1))) { |
1625 | 2.80k | top_sb_edge = f->ipred_edge[0]; |
1626 | 2.80k | const int sby = t->by >> f->sb_shift; |
1627 | 2.80k | top_sb_edge += f->sb128w * 128 * (sby - 1); |
1628 | 2.80k | } |
1629 | 5.73k | m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start, |
1630 | 5.73k | t->by, t->by > ts->tiling.row_start, |
1631 | 5.73k | ts->tiling.col_end, ts->tiling.row_end, |
1632 | 5.73k | 0, dst, f->cur.stride[0], top_sb_edge, |
1633 | 5.73k | m, &angle, bw4, bh4, 0, tl_edge |
1634 | 5.73k | HIGHBD_CALL_SUFFIX); |
1635 | 5.73k | dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel), |
1636 | 5.73k | tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0 |
1637 | 5.73k | HIGHBD_CALL_SUFFIX); |
1638 | 5.73k | dsp->mc.blend(dst, f->cur.stride[0], tmp, |
1639 | 5.73k | bw4 * 4, bh4 * 4, II_MASK(0, bs, b)); |
1640 | 5.73k | } |
1641 | | |
1642 | 184k | if (!has_chroma) goto skip_inter_chroma_pred; |
1643 | | |
1644 | | // sub8x8 derivation |
// A block is a sub8x8 candidate when bw4 == ss_hor or bh4 == ss_ver. It stays one only if
// every neighbour consulted below (left, above, above-left) has ref.ref[0] > 0
// (presumably meaning the neighbour is inter-coded - TODO confirm against refmvs).
1645 | 140k | int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver; |
1646 | 140k | refmvs_block *const *r; |
1647 | 140k | if (is_sub8x8) { |
1648 | 10.1k | assert(ss_hor == 1); |
1649 | 10.1k | r = &t->rt.r[(t->by & 31) + 5]; |
1650 | 10.1k | if (bw4 == 1) is_sub8x8 &= r[0][t->bx - 1].ref.ref[0] > 0; |
1651 | 10.1k | if (bh4 == ss_ver) is_sub8x8 &= r[-1][t->bx].ref.ref[0] > 0; |
1652 | 10.1k | if (bw4 == 1 && bh4 == ss_ver) |
1653 | 2.49k | is_sub8x8 &= r[-1][t->bx - 1].ref.ref[0] > 0; |
1654 | 10.1k | } |
1655 | | |
1656 | | // chroma prediction |
// Sub8x8 path: each contributing neighbour is predicted with its own mv, reference and
// filter. h_off / v_off (2 pixels / 2 rows) shift the destination of later predictions.
1657 | 140k | if (is_sub8x8) { |
1658 | 10.0k | assert(ss_hor == 1); |
1659 | 10.0k | ptrdiff_t h_off = 0, v_off = 0; |
1660 | 10.0k | if (bw4 == 1 && bh4 == ss_ver) { |
1661 | 7.35k | for (int pl = 0; pl < 2; pl++) { |
1662 | 4.90k | res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, |
1663 | 4.90k | NULL, f->cur.stride[1], |
1664 | 4.90k | bw4, bh4, t->bx - 1, t->by - 1, 1 + pl, |
1665 | 4.90k | r[-1][t->bx - 1].mv.mv[0], |
1666 | 4.90k | &f->refp[r[-1][t->bx - 1].ref.ref[0] - 1], |
1667 | 4.90k | r[-1][t->bx - 1].ref.ref[0] - 1, |
// Neighbour filter: unless pass == 2 it comes from running state (t->tl_4x4_filter here,
// the left/above filter contexts further down); in pass 2 it is read from f->frame_thread.b.
1668 | 4.90k | t->frame_thread.pass != 2 ? t->tl_4x4_filter : |
1669 | 4.90k | f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d); |
1670 | 4.90k | if (res) return res; |
1671 | 4.90k | } |
1672 | 2.45k | v_off = 2 * PXSTRIDE(f->cur.stride[1]); |
1673 | 2.45k | h_off = 2; |
1674 | 2.45k | } |
1675 | 10.0k | if (bw4 == 1) { |
1676 | 6.18k | const enum Filter2d left_filter_2d = |
1677 | 6.18k | dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]]; |
1678 | 18.5k | for (int pl = 0; pl < 2; pl++) { |
1679 | 12.3k | res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + v_off, NULL, |
1680 | 12.3k | f->cur.stride[1], bw4, bh4, t->bx - 1, |
1681 | 12.3k | t->by, 1 + pl, r[0][t->bx - 1].mv.mv[0], |
1682 | 12.3k | &f->refp[r[0][t->bx - 1].ref.ref[0] - 1], |
1683 | 12.3k | r[0][t->bx - 1].ref.ref[0] - 1, |
1684 | 12.3k | t->frame_thread.pass != 2 ? left_filter_2d : |
1685 | 12.3k | f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d); |
1686 | 12.3k | if (res) return res; |
1687 | 12.3k | } |
1688 | 6.18k | h_off = 2; |
1689 | 6.18k | } |
1690 | 10.0k | if (bh4 == ss_ver) { |
1691 | 6.29k | const enum Filter2d top_filter_2d = |
1692 | 6.29k | dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]]; |
1693 | 18.8k | for (int pl = 0; pl < 2; pl++) { |
1694 | 12.5k | res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off, NULL, |
1695 | 12.5k | f->cur.stride[1], bw4, bh4, t->bx, t->by - 1, |
1696 | 12.5k | 1 + pl, r[-1][t->bx].mv.mv[0], |
1697 | 12.5k | &f->refp[r[-1][t->bx].ref.ref[0] - 1], |
1698 | 12.5k | r[-1][t->bx].ref.ref[0] - 1, |
1699 | 12.5k | t->frame_thread.pass != 2 ? top_filter_2d : |
1700 | 12.5k | f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d); |
1701 | 12.5k | if (res) return res; |
1702 | 12.5k | } |
1703 | 6.29k | v_off = 2 * PXSTRIDE(f->cur.stride[1]); |
1704 | 6.29k | } |
// Finally the current block itself, placed at the accumulated h_off / v_off.
1705 | 30.0k | for (int pl = 0; pl < 2; pl++) { |
1706 | 20.0k | res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.stride[1], |
1707 | 20.0k | bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0], |
1708 | 20.0k | refp, b->ref[0], filter_2d); |
1709 | 20.0k | if (res) return res; |
1710 | 20.0k | } |
1711 | 129k | } else { |
// Regular chroma path: same warp-vs-mc decision as for luma, but the size test uses the
// chroma block dimensions cbw4 / cbh4.
1712 | 129k | if (imin(cbw4, cbh4) > 1 && |
1713 | 69.9k | ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) || |
1714 | 65.9k | (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION))) |
1715 | 6.53k | { |
1716 | 19.6k | for (int pl = 0; pl < 2; pl++) { |
1717 | 13.0k | res = warp_affine(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, NULL, |
1718 | 13.0k | f->cur.stride[1], b_dim, 1 + pl, refp, |
1719 | 13.0k | b->motion_mode == MM_WARP ? &t->warpmv : |
1720 | 13.0k | &f->frame_hdr->gmv[b->ref[0]]); |
1721 | 13.0k | if (res) return res; |
1722 | 13.0k | } |
1723 | 123k | } else { |
1724 | 370k | for (int pl = 0; pl < 2; pl++) { |
1725 | 246k | res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, |
1726 | 246k | NULL, f->cur.stride[1], |
1727 | 246k | bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver), |
1728 | 246k | t->bx & ~ss_hor, t->by & ~ss_ver, |
1729 | 246k | 1 + pl, b->mv[0], refp, b->ref[0], filter_2d); |
1730 | 246k | if (res) return res; |
1731 | 246k | if (b->motion_mode == MM_OBMC) { |
1732 | 50.5k | res = obmc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, |
1733 | 50.5k | f->cur.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4); |
1734 | 50.5k | if (res) return res; |
1735 | 50.5k | } |
1736 | 246k | } |
1737 | 123k | } |
// Inter-intra (chroma): same steps as for luma, per plane, with chroma-scaled positions,
// tile bounds and block size, and the mask selected by chr_layout_idx.
1738 | 129k | if (b->interintra_type) { |
1739 | | // FIXME for 8x32 with 4:2:2 subsampling, this probably does |
1740 | | // the wrong thing since it will select 4x16, not 4x32, as a |
1741 | | // transform size... |
1742 | 4.08k | const uint8_t *const ii_mask = II_MASK(chr_layout_idx, bs, b); |
1743 | | |
1744 | 12.2k | for (int pl = 0; pl < 2; pl++) { |
1745 | 8.16k | pixel *const tmp = bitfn(t->scratch.interintra); |
1746 | 8.16k | pixel *const tl_edge = bitfn(t->scratch.edge) + 32; |
1747 | 8.16k | enum IntraPredMode m = |
1748 | 8.16k | b->interintra_mode == II_SMOOTH_PRED ? |
1749 | 6.68k | SMOOTH_PRED : b->interintra_mode; |
1750 | 8.16k | int angle = 0; |
1751 | 8.16k | pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff; |
1752 | 8.16k | const pixel *top_sb_edge = NULL; |
1753 | 8.16k | if (!(t->by & (f->sb_step - 1))) { |
1754 | 4.06k | top_sb_edge = f->ipred_edge[pl + 1]; |
1755 | 4.06k | const int sby = t->by >> f->sb_shift; |
1756 | 4.06k | top_sb_edge += f->sb128w * 128 * (sby - 1); |
1757 | 4.06k | } |
1758 | 8.16k | m = bytefn(dav1d_prepare_intra_edges)(t->bx >> ss_hor, |
1759 | 8.16k | (t->bx >> ss_hor) > |
1760 | 8.16k | (ts->tiling.col_start >> ss_hor), |
1761 | 8.16k | t->by >> ss_ver, |
1762 | 8.16k | (t->by >> ss_ver) > |
1763 | 8.16k | (ts->tiling.row_start >> ss_ver), |
1764 | 8.16k | ts->tiling.col_end >> ss_hor, |
1765 | 8.16k | ts->tiling.row_end >> ss_ver, |
1766 | 8.16k | 0, uvdst, f->cur.stride[1], |
1767 | 8.16k | top_sb_edge, m, |
1768 | 8.16k | &angle, cbw4, cbh4, 0, tl_edge |
1769 | 8.16k | HIGHBD_CALL_SUFFIX); |
1770 | 8.16k | dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel), |
1771 | 8.16k | tl_edge, cbw4 * 4, cbh4 * 4, 0, 0, 0 |
1772 | 8.16k | HIGHBD_CALL_SUFFIX); |
1773 | 8.16k | dsp->mc.blend(uvdst, f->cur.stride[1], tmp, |
1774 | 8.16k | cbw4 * 4, cbh4 * 4, ii_mask); |
1775 | 8.16k | } |
1776 | 4.08k | } |
1777 | 129k | } |
1778 | | |
1779 | 184k | skip_inter_chroma_pred: {} |
// Remember the filter of this block; t->tl_4x4_filter is read back as the above-left
// neighbour filter in the sub8x8 path of a later call.
1780 | 184k | t->tl_4x4_filter = filter_2d; |
1781 | 184k | } else { |
// Compound prediction: one intermediate int16_t prediction per reference goes into
// tmp[0] / tmp[1], then the pair is combined into the destination plane.
1782 | 35.7k | const enum Filter2d filter_2d = b->filter2d; |
1783 | | // Maximum super block size is 128x128 |
1784 | 35.7k | int16_t (*tmp)[128 * 128] = t->scratch.compinter; |
1785 | 35.7k | int jnt_weight; |
1786 | 35.7k | uint8_t *const seg_mask = t->scratch.seg_mask; |
1787 | 35.7k | const uint8_t *mask; |
1788 | | |
1789 | 107k | for (int i = 0; i < 2; i++) { |
1790 | 71.4k | const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]]; |
1791 | | |
1792 | 71.4k | if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) { |
1793 | 3.69k | res = warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp, |
1794 | 3.69k | &f->frame_hdr->gmv[b->ref[i]]); |
1795 | 3.69k | if (res) return res; |
1796 | 67.7k | } else { |
1797 | 67.7k | res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0, |
1798 | 67.7k | b->mv[i], refp, b->ref[i], filter_2d); |
1799 | 67.7k | if (res) return res; |
1800 | 67.7k | } |
1801 | 71.4k | } |
// Luma combine. jnt_weight (WEIGHTED_AVG) and mask (SEG / WEDGE) are assigned here and
// reused unchanged by the chroma switch further down, which has no assignments of its own.
1802 | 35.7k | switch (b->comp_type) { |
1803 | 25.3k | case COMP_INTER_AVG: |
1804 | 25.3k | dsp->mc.avg(dst, f->cur.stride[0], tmp[0], tmp[1], |
1805 | 25.3k | bw4 * 4, bh4 * 4 HIGHBD_CALL_SUFFIX); |
1806 | 25.3k | break; |
1807 | 3.45k | case COMP_INTER_WEIGHTED_AVG: |
1808 | 3.45k | jnt_weight = f->jnt_weights[b->ref[0]][b->ref[1]]; |
1809 | 3.45k | dsp->mc.w_avg(dst, f->cur.stride[0], tmp[0], tmp[1], |
1810 | 3.45k | bw4 * 4, bh4 * 4, jnt_weight HIGHBD_CALL_SUFFIX); |
1811 | 3.45k | break; |
1812 | 4.96k | case COMP_INTER_SEG: |
// w_mask is handed seg_mask, which then serves as the chroma mask
// (presumably w_mask fills it - TODO confirm against the mc DSP definition).
1813 | 4.96k | dsp->mc.w_mask[chr_layout_idx](dst, f->cur.stride[0], |
1814 | 4.96k | tmp[b->mask_sign], tmp[!b->mask_sign], |
1815 | 4.96k | bw4 * 4, bh4 * 4, seg_mask, |
1816 | 4.96k | b->mask_sign HIGHBD_CALL_SUFFIX); |
1817 | 4.96k | mask = seg_mask; |
1818 | 4.96k | break; |
1819 | 1.90k | case COMP_INTER_WEDGE: |
1820 | 1.90k | mask = WEDGE_MASK(0, bs, 0, b->wedge_idx); |
1821 | 1.90k | dsp->mc.mask(dst, f->cur.stride[0], |
1822 | 1.90k | tmp[b->mask_sign], tmp[!b->mask_sign], |
1823 | 1.90k | bw4 * 4, bh4 * 4, mask HIGHBD_CALL_SUFFIX); |
// Switch to the chroma-layout wedge mask for the chroma pass below.
1824 | 1.90k | if (has_chroma) |
1825 | 1.34k | mask = WEDGE_MASK(chr_layout_idx, bs, b->mask_sign, b->wedge_idx); |
1826 | 1.90k | break; |
1827 | 35.7k | } |
1828 | | |
1829 | | // chroma |
1830 | 76.3k | if (has_chroma) for (int pl = 0; pl < 2; pl++) { |
1831 | 152k | for (int i = 0; i < 2; i++) { |
1832 | 101k | const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]]; |
1833 | 101k | if (b->inter_mode == GLOBALMV_GLOBALMV && |
1834 | 17.5k | imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]]) |
1835 | 5.40k | { |
1836 | 5.40k | res = warp_affine(t, NULL, tmp[i], bw4 * 4 >> ss_hor, |
1837 | 5.40k | b_dim, 1 + pl, |
1838 | 5.40k | refp, &f->frame_hdr->gmv[b->ref[i]]); |
1839 | 5.40k | if (res) return res; |
1840 | 96.3k | } else { |
1841 | 96.3k | res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, |
1842 | 96.3k | 1 + pl, b->mv[i], refp, b->ref[i], filter_2d); |
1843 | 96.3k | if (res) return res; |
1844 | 96.3k | } |
1845 | 101k | } |
1846 | 50.8k | pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff; |
1847 | 50.8k | switch (b->comp_type) { |
1848 | 36.4k | case COMP_INTER_AVG: |
1849 | 36.4k | dsp->mc.avg(uvdst, f->cur.stride[1], tmp[0], tmp[1], |
1850 | 36.4k | bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver |
1851 | 36.4k | HIGHBD_CALL_SUFFIX); |
1852 | 36.4k | break; |
1853 | 4.94k | case COMP_INTER_WEIGHTED_AVG: |
1854 | 4.94k | dsp->mc.w_avg(uvdst, f->cur.stride[1], tmp[0], tmp[1], |
1855 | 4.94k | bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight |
1856 | 4.94k | HIGHBD_CALL_SUFFIX); |
1857 | 4.94k | break; |
1858 | 2.69k | case COMP_INTER_WEDGE: |
1859 | 9.53k | case COMP_INTER_SEG: |
1860 | 9.53k | dsp->mc.mask(uvdst, f->cur.stride[1], |
1861 | 9.53k | tmp[b->mask_sign], tmp[!b->mask_sign], |
1862 | 9.53k | bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask |
1863 | 9.53k | HIGHBD_CALL_SUFFIX); |
1864 | 9.53k | break; |
1865 | 50.8k | } |
1866 | 50.8k | } |
1867 | 35.6k | } |
1868 | | |
1869 | 357k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) { |
1870 | 0 | hex_dump(dst, f->cur.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred"); |
1871 | 0 | if (has_chroma) { |
1872 | 0 | hex_dump(&((pixel *) f->cur.data[1])[uvdstoff], f->cur.stride[1], |
1873 | 0 | cbw4 * 4, cbh4 * 4, "u-pred"); |
1874 | 0 | hex_dump(&((pixel *) f->cur.data[2])[uvdstoff], f->cur.stride[1], |
1875 | 0 | cbw4 * 4, cbh4 * 4, "v-pred"); |
1876 | 0 | } |
1877 | 0 | } |
1878 | | |
// cw4 / ch4: frame-clipped block size converted to chroma units, rounding up.
1879 | 357k | const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver; |
1880 | | |
1881 | 357k | if (b->skip) { |
1882 | | // reset coef contexts |
1883 | 182k | BlockContext *const a = t->a; |
1884 | 182k | dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40); |
1885 | 182k | dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40); |
1886 | 182k | if (has_chroma) { |
1887 | 112k | dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)]; |
1888 | 112k | dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)]; |
1889 | 112k | memset_cw(&a->ccoef[0][cbx4], 0x40); |
1890 | 112k | memset_cw(&a->ccoef[1][cbx4], 0x40); |
1891 | 112k | memset_ch(&t->l.ccoef[0][cby4], 0x40); |
1892 | 112k | memset_ch(&t->l.ccoef[1][cby4], 0x40); |
1893 | 112k | } |
1894 | 182k | return 0; |
1895 | 182k | } |
1896 | | |
// Transform geometry: uvtx describes the chroma transform (b->uvtx), ytx the largest
// luma transform of the block (b->max_ytx).
1897 | 175k | const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx]; |
1898 | 175k | const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx]; |
1899 | 175k | const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 }; |
1900 | | |
// The block is walked in steps of 16 units (64 pixels) in each direction; inside each
// step luma is processed first, then both chroma planes.
1901 | 358k | for (int init_y = 0; init_y < bh4; init_y += 16) { |
1902 | 371k | for (int init_x = 0; init_x < bw4; init_x += 16) { |
1903 | | // coefficient coding & inverse transforms |
1904 | 187k | int y_off = !!init_y, y; |
1905 | 187k | dst += PXSTRIDE(f->cur.stride[0]) * 4 * init_y; |
1906 | 397k | for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16); |
1907 | 209k | y += ytx->h, y_off++) |
1908 | 209k | { |
1909 | 209k | int x, x_off = !!init_x; |
1910 | 495k | for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16); |
1911 | 285k | x += ytx->w, x_off++) |
1912 | 285k | { |
1913 | 285k | read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split, |
1914 | 285k | x_off, y_off, &dst[x * 4]); |
1915 | 285k | t->bx += ytx->w; |
1916 | 285k | } |
1917 | 209k | dst += PXSTRIDE(f->cur.stride[0]) * 4 * ytx->h; |
1918 | 209k | t->bx -= x; |
1919 | 209k | t->by += ytx->h; |
1920 | 209k | } |
// Undo the advances made by the luma loops so dst and t->by are back at the block origin.
1921 | 187k | dst -= PXSTRIDE(f->cur.stride[0]) * 4 * y; |
1922 | 187k | t->by -= y; |
1923 | | |
1924 | | // chroma coefs and inverse transform |
1925 | 435k | if (has_chroma) for (int pl = 0; pl < 2; pl++) { |
1926 | 290k | pixel *uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff + |
1927 | 290k | (PXSTRIDE(f->cur.stride[1]) * init_y * 4 >> ss_ver); |
1928 | 290k | for (y = init_y >> ss_ver, t->by += init_y; |
1929 | 629k | y < imin(ch4, (init_y + 16) >> ss_ver); y += uvtx->h) |
1930 | 338k | { |
1931 | 338k | int x; |
1932 | 338k | for (x = init_x >> ss_hor, t->bx += init_x; |
1933 | 815k | x < imin(cw4, (init_x + 16) >> ss_hor); x += uvtx->w) |
1934 | 476k | { |
1935 | 476k | coef *cf; |
1936 | 476k | int eob; |
1937 | 476k | enum TxfmType txtp; |
// Non-zero pass: take the coefficient pointer and the packed cbi value from the per-tile
// frame_thread buffers; cbi carries txtp in its low 5 bits and eob in the bits above.
1938 | 476k | if (t->frame_thread.pass) { |
1939 | 476k | const int p = t->frame_thread.pass & 1; |
1940 | 476k | const int cbi = *ts->frame_thread[p].cbi++; |
1941 | 476k | cf = ts->frame_thread[p].cf; |
1942 | 476k | ts->frame_thread[p].cf += uvtx->w * uvtx->h * 16; |
1943 | 476k | eob = cbi >> 5; |
1944 | 476k | txtp = cbi & 0x1f; |
1945 | 476k | } else { |
// Pass 0: decode the coefficients now, seeding txtp from t->scratch.txtp_map at the
// co-located luma position, then write cf_ctx into the above/left chroma contexts over
// the frame-clipped transform extent.
1946 | 0 | uint8_t cf_ctx; |
1947 | 0 | cf = bitfn(t->cf); |
1948 | 0 | txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 + |
1949 | 0 | bx4 + (x << ss_hor)]; |
1950 | 0 | eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x], |
1951 | 0 | &t->l.ccoef[pl][cby4 + y], |
1952 | 0 | b->uvtx, bs, b, 0, 1 + pl, |
1953 | 0 | cf, &txtp, &cf_ctx); |
1954 | 0 | if (DEBUG_BLOCK_INFO) |
1955 | 0 | printf("Post-uv-cf-blk[pl=%d,tx=%d," |
1956 | 0 | "txtp=%d,eob=%d]: r=%d\n", |
1957 | 0 | pl, b->uvtx, txtp, eob, ts->msac.rng); |
1958 | 0 | int ctw = imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor); |
1959 | 0 | int cth = imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver); |
1960 | 0 | dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw); |
1961 | 0 | dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth); |
1962 | 0 | } |
// Only transform blocks with eob >= 0 are handed to itxfm_add, selected by transform
// size (b->uvtx) and type (txtp) and applied at the chroma destination.
1963 | 476k | if (eob >= 0) { |
1964 | 163k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) |
1965 | 0 | coef_dump(cf, uvtx->h * 4, uvtx->w * 4, 3, "dq"); |
1966 | 163k | dsp->itx.itxfm_add[b->uvtx] |
1967 | 163k | [txtp](&uvdst[4 * x], |
1968 | 163k | f->cur.stride[1], |
1969 | 163k | cf, eob HIGHBD_CALL_SUFFIX); |
1970 | 163k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) |
1971 | 0 | hex_dump(&uvdst[4 * x], f->cur.stride[1], |
1972 | 0 | uvtx->w * 4, uvtx->h * 4, "recon"); |
1973 | 163k | } |
1974 | 476k | t->bx += uvtx->w << ss_hor; |
1975 | 476k | } |
1976 | 338k | uvdst += PXSTRIDE(f->cur.stride[1]) * 4 * uvtx->h; |
1977 | 338k | t->bx -= x << ss_hor; |
1978 | 338k | t->by += uvtx->h << ss_ver; |
1979 | 338k | } |
1980 | 290k | t->by -= y << ss_ver; |
1981 | 290k | } |
1982 | 187k | } |
1983 | 183k | } |
1984 | 175k | return 0; |
1985 | 357k | } Line | Count | Source | 1559 | 222k | { | 1560 | 222k | Dav1dTileState *const ts = t->ts; | 1561 | 222k | const Dav1dFrameContext *const f = t->f; | 1562 | 222k | const Dav1dDSPContext *const dsp = f->dsp; | 1563 | 222k | const int bx4 = t->bx & 31, by4 = t->by & 31; | 1564 | 222k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 1565 | 222k | const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; | 1566 | 222k | const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver; | 1567 | 222k | const uint8_t *const b_dim = dav1d_block_dimensions[bs]; | 1568 | 222k | const int bw4 = b_dim[0], bh4 = b_dim[1]; | 1569 | 222k | const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by); | 1570 | 222k | const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 && | 1571 | 166k | (bw4 > ss_hor || t->bx & 1) && | 1572 | 157k | (bh4 > ss_ver || t->by & 1); | 1573 | 222k | const int chr_layout_idx = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 : | 1574 | 222k | DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.layout; | 1575 | 222k | int res; | 1576 | | | 1577 | | // prediction | 1578 | 222k | const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor; | 1579 | 222k | pixel *dst = ((pixel *) f->cur.data[0]) + | 1580 | 222k | 4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx); | 1581 | 222k | const ptrdiff_t uvdstoff = | 1582 | 222k | 4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1])); | 1583 | 222k | if (IS_KEY_OR_INTRA(f->frame_hdr)) { | 1584 | | // intrabc | 1585 | 93.3k | assert(!f->frame_hdr->super_res.enabled); | 1586 | 93.3k | res = mc(t, dst, NULL, f->cur.stride[0], bw4, bh4, t->bx, t->by, 0, | 1587 | 93.3k | b->mv[0], &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR); | 1588 | 93.3k | if (res) return res; | 1589 | 147k | if (has_chroma) for (int pl = 1; pl < 3; pl++) { | 1590 | 98.0k | res = mc(t, ((pixel *)f->cur.data[pl]) + uvdstoff, NULL, f->cur.stride[1], | 1591 | 98.0k | bw4 << (bw4 == ss_hor), 
bh4 << (bh4 == ss_ver), | 1592 | 98.0k | t->bx & ~ss_hor, t->by & ~ss_ver, pl, b->mv[0], | 1593 | 98.0k | &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR); | 1594 | 98.0k | if (res) return res; | 1595 | 98.0k | } | 1596 | 128k | } else if (b->comp_type == COMP_INTER_NONE) { | 1597 | 111k | const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]]; | 1598 | 111k | const enum Filter2d filter_2d = b->filter2d; | 1599 | | | 1600 | 111k | if (imin(bw4, bh4) > 1 && | 1601 | 71.0k | ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) || | 1602 | 68.0k | (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION))) | 1603 | 4.90k | { | 1604 | 4.90k | res = warp_affine(t, dst, NULL, f->cur.stride[0], b_dim, 0, refp, | 1605 | 4.90k | b->motion_mode == MM_WARP ? &t->warpmv : | 1606 | 4.90k | &f->frame_hdr->gmv[b->ref[0]]); | 1607 | 4.90k | if (res) return res; | 1608 | 106k | } else { | 1609 | 106k | res = mc(t, dst, NULL, f->cur.stride[0], | 1610 | 106k | bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, b->ref[0], filter_2d); | 1611 | 106k | if (res) return res; | 1612 | 106k | if (b->motion_mode == MM_OBMC) { | 1613 | 18.8k | res = obmc(t, dst, f->cur.stride[0], b_dim, 0, bx4, by4, w4, h4); | 1614 | 18.8k | if (res) return res; | 1615 | 18.8k | } | 1616 | 106k | } | 1617 | 111k | if (b->interintra_type) { | 1618 | 2.76k | pixel *const tl_edge = bitfn(t->scratch.edge) + 32; | 1619 | 2.76k | enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ? 
| 1620 | 2.29k | SMOOTH_PRED : b->interintra_mode; | 1621 | 2.76k | pixel *const tmp = bitfn(t->scratch.interintra); | 1622 | 2.76k | int angle = 0; | 1623 | 2.76k | const pixel *top_sb_edge = NULL; | 1624 | 2.76k | if (!(t->by & (f->sb_step - 1))) { | 1625 | 1.34k | top_sb_edge = f->ipred_edge[0]; | 1626 | 1.34k | const int sby = t->by >> f->sb_shift; | 1627 | 1.34k | top_sb_edge += f->sb128w * 128 * (sby - 1); | 1628 | 1.34k | } | 1629 | 2.76k | m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start, | 1630 | 2.76k | t->by, t->by > ts->tiling.row_start, | 1631 | 2.76k | ts->tiling.col_end, ts->tiling.row_end, | 1632 | 2.76k | 0, dst, f->cur.stride[0], top_sb_edge, | 1633 | 2.76k | m, &angle, bw4, bh4, 0, tl_edge | 1634 | 2.76k | HIGHBD_CALL_SUFFIX); | 1635 | 2.76k | dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel), | 1636 | 2.76k | tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0 | 1637 | 2.76k | HIGHBD_CALL_SUFFIX); | 1638 | 2.76k | dsp->mc.blend(dst, f->cur.stride[0], tmp, | 1639 | 2.76k | bw4 * 4, bh4 * 4, II_MASK(0, bs, b)); | 1640 | 2.76k | } | 1641 | | | 1642 | 111k | if (!has_chroma) goto skip_inter_chroma_pred; | 1643 | | | 1644 | | // sub8x8 derivation | 1645 | 89.8k | int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver; | 1646 | 89.8k | refmvs_block *const *r; | 1647 | 89.8k | if (is_sub8x8) { | 1648 | 6.19k | assert(ss_hor == 1); | 1649 | 6.19k | r = &t->rt.r[(t->by & 31) + 5]; | 1650 | 6.19k | if (bw4 == 1) is_sub8x8 &= r[0][t->bx - 1].ref.ref[0] > 0; | 1651 | 6.19k | if (bh4 == ss_ver) is_sub8x8 &= r[-1][t->bx].ref.ref[0] > 0; | 1652 | 6.19k | if (bw4 == 1 && bh4 == ss_ver) | 1653 | 1.37k | is_sub8x8 &= r[-1][t->bx - 1].ref.ref[0] > 0; | 1654 | 6.19k | } | 1655 | | | 1656 | | // chroma prediction | 1657 | 89.8k | if (is_sub8x8) { | 1658 | 6.13k | assert(ss_hor == 1); | 1659 | 6.13k | ptrdiff_t h_off = 0, v_off = 0; | 1660 | 6.13k | if (bw4 == 1 && bh4 == ss_ver) { | 1661 | 4.06k | for (int pl = 0; pl < 2; pl++) { | 1662 | 2.71k | res = mc(t, 
((pixel *) f->cur.data[1 + pl]) + uvdstoff, | 1663 | 2.71k | NULL, f->cur.stride[1], | 1664 | 2.71k | bw4, bh4, t->bx - 1, t->by - 1, 1 + pl, | 1665 | 2.71k | r[-1][t->bx - 1].mv.mv[0], | 1666 | 2.71k | &f->refp[r[-1][t->bx - 1].ref.ref[0] - 1], | 1667 | 2.71k | r[-1][t->bx - 1].ref.ref[0] - 1, | 1668 | 2.71k | t->frame_thread.pass != 2 ? t->tl_4x4_filter : | 1669 | 2.71k | f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d); | 1670 | 2.71k | if (res) return res; | 1671 | 2.71k | } | 1672 | 1.35k | v_off = 2 * PXSTRIDE(f->cur.stride[1]); | 1673 | 1.35k | h_off = 2; | 1674 | 1.35k | } | 1675 | 6.13k | if (bw4 == 1) { | 1676 | 3.66k | const enum Filter2d left_filter_2d = | 1677 | 3.66k | dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]]; | 1678 | 10.9k | for (int pl = 0; pl < 2; pl++) { | 1679 | 7.32k | res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + v_off, NULL, | 1680 | 7.32k | f->cur.stride[1], bw4, bh4, t->bx - 1, | 1681 | 7.32k | t->by, 1 + pl, r[0][t->bx - 1].mv.mv[0], | 1682 | 7.32k | &f->refp[r[0][t->bx - 1].ref.ref[0] - 1], | 1683 | 7.32k | r[0][t->bx - 1].ref.ref[0] - 1, | 1684 | 7.32k | t->frame_thread.pass != 2 ? left_filter_2d : | 1685 | 7.32k | f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d); | 1686 | 7.32k | if (res) return res; | 1687 | 7.32k | } | 1688 | 3.66k | h_off = 2; | 1689 | 3.66k | } | 1690 | 6.13k | if (bh4 == ss_ver) { | 1691 | 3.82k | const enum Filter2d top_filter_2d = | 1692 | 3.82k | dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]]; | 1693 | 11.4k | for (int pl = 0; pl < 2; pl++) { | 1694 | 7.64k | res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off, NULL, | 1695 | 7.64k | f->cur.stride[1], bw4, bh4, t->bx, t->by - 1, | 1696 | 7.64k | 1 + pl, r[-1][t->bx].mv.mv[0], | 1697 | 7.64k | &f->refp[r[-1][t->bx].ref.ref[0] - 1], | 1698 | 7.64k | r[-1][t->bx].ref.ref[0] - 1, | 1699 | 7.64k | t->frame_thread.pass != 2 ? 
top_filter_2d : | 1700 | 7.64k | f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d); | 1701 | 7.64k | if (res) return res; | 1702 | 7.64k | } | 1703 | 3.82k | v_off = 2 * PXSTRIDE(f->cur.stride[1]); | 1704 | 3.82k | } | 1705 | 18.3k | for (int pl = 0; pl < 2; pl++) { | 1706 | 12.2k | res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.stride[1], | 1707 | 12.2k | bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0], | 1708 | 12.2k | refp, b->ref[0], filter_2d); | 1709 | 12.2k | if (res) return res; | 1710 | 12.2k | } | 1711 | 83.7k | } else { | 1712 | 83.7k | if (imin(cbw4, cbh4) > 1 && | 1713 | 48.2k | ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) || | 1714 | 45.7k | (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION))) | 1715 | 4.25k | { | 1716 | 12.7k | for (int pl = 0; pl < 2; pl++) { | 1717 | 8.51k | res = warp_affine(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, NULL, | 1718 | 8.51k | f->cur.stride[1], b_dim, 1 + pl, refp, | 1719 | 8.51k | b->motion_mode == MM_WARP ? 
&t->warpmv : | 1720 | 8.51k | &f->frame_hdr->gmv[b->ref[0]]); | 1721 | 8.51k | if (res) return res; | 1722 | 8.51k | } | 1723 | 79.4k | } else { | 1724 | 238k | for (int pl = 0; pl < 2; pl++) { | 1725 | 158k | res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, | 1726 | 158k | NULL, f->cur.stride[1], | 1727 | 158k | bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver), | 1728 | 158k | t->bx & ~ss_hor, t->by & ~ss_ver, | 1729 | 158k | 1 + pl, b->mv[0], refp, b->ref[0], filter_2d); | 1730 | 158k | if (res) return res; | 1731 | 158k | if (b->motion_mode == MM_OBMC) { | 1732 | 34.5k | res = obmc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, | 1733 | 34.5k | f->cur.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4); | 1734 | 34.5k | if (res) return res; | 1735 | 34.5k | } | 1736 | 158k | } | 1737 | 79.4k | } | 1738 | 83.7k | if (b->interintra_type) { | 1739 | | // FIXME for 8x32 with 4:2:2 subsampling, this probably does | 1740 | | // the wrong thing since it will select 4x16, not 4x32, as a | 1741 | | // transform size... | 1742 | 2.06k | const uint8_t *const ii_mask = II_MASK(chr_layout_idx, bs, b); | 1743 | | | 1744 | 6.19k | for (int pl = 0; pl < 2; pl++) { | 1745 | 4.13k | pixel *const tmp = bitfn(t->scratch.interintra); | 1746 | 4.13k | pixel *const tl_edge = bitfn(t->scratch.edge) + 32; | 1747 | 4.13k | enum IntraPredMode m = | 1748 | 4.13k | b->interintra_mode == II_SMOOTH_PRED ? 
| 1749 | 3.51k | SMOOTH_PRED : b->interintra_mode; | 1750 | 4.13k | int angle = 0; | 1751 | 4.13k | pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff; | 1752 | 4.13k | const pixel *top_sb_edge = NULL; | 1753 | 4.13k | if (!(t->by & (f->sb_step - 1))) { | 1754 | 2.00k | top_sb_edge = f->ipred_edge[pl + 1]; | 1755 | 2.00k | const int sby = t->by >> f->sb_shift; | 1756 | 2.00k | top_sb_edge += f->sb128w * 128 * (sby - 1); | 1757 | 2.00k | } | 1758 | 4.13k | m = bytefn(dav1d_prepare_intra_edges)(t->bx >> ss_hor, | 1759 | 4.13k | (t->bx >> ss_hor) > | 1760 | 4.13k | (ts->tiling.col_start >> ss_hor), | 1761 | 4.13k | t->by >> ss_ver, | 1762 | 4.13k | (t->by >> ss_ver) > | 1763 | 4.13k | (ts->tiling.row_start >> ss_ver), | 1764 | 4.13k | ts->tiling.col_end >> ss_hor, | 1765 | 4.13k | ts->tiling.row_end >> ss_ver, | 1766 | 4.13k | 0, uvdst, f->cur.stride[1], | 1767 | 4.13k | top_sb_edge, m, | 1768 | 4.13k | &angle, cbw4, cbh4, 0, tl_edge | 1769 | 4.13k | HIGHBD_CALL_SUFFIX); | 1770 | 4.13k | dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel), | 1771 | 4.13k | tl_edge, cbw4 * 4, cbh4 * 4, 0, 0, 0 | 1772 | 4.13k | HIGHBD_CALL_SUFFIX); | 1773 | 4.13k | dsp->mc.blend(uvdst, f->cur.stride[1], tmp, | 1774 | 4.13k | cbw4 * 4, cbh4 * 4, ii_mask); | 1775 | 4.13k | } | 1776 | 2.06k | } | 1777 | 83.7k | } | 1778 | | | 1779 | 111k | skip_inter_chroma_pred: {} | 1780 | 111k | t->tl_4x4_filter = filter_2d; | 1781 | 111k | } else { | 1782 | 17.4k | const enum Filter2d filter_2d = b->filter2d; | 1783 | | // Maximum super block size is 128x128 | 1784 | 17.4k | int16_t (*tmp)[128 * 128] = t->scratch.compinter; | 1785 | 17.4k | int jnt_weight; | 1786 | 17.4k | uint8_t *const seg_mask = t->scratch.seg_mask; | 1787 | 17.4k | const uint8_t *mask; | 1788 | | | 1789 | 52.3k | for (int i = 0; i < 2; i++) { | 1790 | 34.9k | const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]]; | 1791 | | | 1792 | 34.9k | if (b->inter_mode == GLOBALMV_GLOBALMV && 
f->gmv_warp_allowed[b->ref[i]]) { | 1793 | 2.32k | res = warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp, | 1794 | 2.32k | &f->frame_hdr->gmv[b->ref[i]]); | 1795 | 2.32k | if (res) return res; | 1796 | 32.5k | } else { | 1797 | 32.5k | res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0, | 1798 | 32.5k | b->mv[i], refp, b->ref[i], filter_2d); | 1799 | 32.5k | if (res) return res; | 1800 | 32.5k | } | 1801 | 34.9k | } | 1802 | 17.4k | switch (b->comp_type) { | 1803 | 12.8k | case COMP_INTER_AVG: | 1804 | 12.8k | dsp->mc.avg(dst, f->cur.stride[0], tmp[0], tmp[1], | 1805 | 12.8k | bw4 * 4, bh4 * 4 HIGHBD_CALL_SUFFIX); | 1806 | 12.8k | break; | 1807 | 1.42k | case COMP_INTER_WEIGHTED_AVG: | 1808 | 1.42k | jnt_weight = f->jnt_weights[b->ref[0]][b->ref[1]]; | 1809 | 1.42k | dsp->mc.w_avg(dst, f->cur.stride[0], tmp[0], tmp[1], | 1810 | 1.42k | bw4 * 4, bh4 * 4, jnt_weight HIGHBD_CALL_SUFFIX); | 1811 | 1.42k | break; | 1812 | 2.31k | case COMP_INTER_SEG: | 1813 | 2.31k | dsp->mc.w_mask[chr_layout_idx](dst, f->cur.stride[0], | 1814 | 2.31k | tmp[b->mask_sign], tmp[!b->mask_sign], | 1815 | 2.31k | bw4 * 4, bh4 * 4, seg_mask, | 1816 | 2.31k | b->mask_sign HIGHBD_CALL_SUFFIX); | 1817 | 2.31k | mask = seg_mask; | 1818 | 2.31k | break; | 1819 | 887 | case COMP_INTER_WEDGE: | 1820 | 887 | mask = WEDGE_MASK(0, bs, 0, b->wedge_idx); | 1821 | 887 | dsp->mc.mask(dst, f->cur.stride[0], | 1822 | 887 | tmp[b->mask_sign], tmp[!b->mask_sign], | 1823 | 887 | bw4 * 4, bh4 * 4, mask HIGHBD_CALL_SUFFIX); | 1824 | 887 | if (has_chroma) | 1825 | 587 | mask = WEDGE_MASK(chr_layout_idx, bs, b->mask_sign, b->wedge_idx); | 1826 | 887 | break; | 1827 | 17.4k | } | 1828 | | | 1829 | | // chroma | 1830 | 36.4k | if (has_chroma) for (int pl = 0; pl < 2; pl++) { | 1831 | 72.8k | for (int i = 0; i < 2; i++) { | 1832 | 48.5k | const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]]; | 1833 | 48.5k | if (b->inter_mode == GLOBALMV_GLOBALMV && | 1834 | 9.01k | imin(cbw4, cbh4) > 1 && 
f->gmv_warp_allowed[b->ref[i]]) | 1835 | 3.42k | { | 1836 | 3.42k | res = warp_affine(t, NULL, tmp[i], bw4 * 4 >> ss_hor, | 1837 | 3.42k | b_dim, 1 + pl, | 1838 | 3.42k | refp, &f->frame_hdr->gmv[b->ref[i]]); | 1839 | 3.42k | if (res) return res; | 1840 | 45.1k | } else { | 1841 | 45.1k | res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, | 1842 | 45.1k | 1 + pl, b->mv[i], refp, b->ref[i], filter_2d); | 1843 | 45.1k | if (res) return res; | 1844 | 45.1k | } | 1845 | 48.5k | } | 1846 | 24.2k | pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff; | 1847 | 24.2k | switch (b->comp_type) { | 1848 | 17.8k | case COMP_INTER_AVG: | 1849 | 17.8k | dsp->mc.avg(uvdst, f->cur.stride[1], tmp[0], tmp[1], | 1850 | 17.8k | bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver | 1851 | 17.8k | HIGHBD_CALL_SUFFIX); | 1852 | 17.8k | break; | 1853 | 2.14k | case COMP_INTER_WEIGHTED_AVG: | 1854 | 2.14k | dsp->mc.w_avg(uvdst, f->cur.stride[1], tmp[0], tmp[1], | 1855 | 2.14k | bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight | 1856 | 2.14k | HIGHBD_CALL_SUFFIX); | 1857 | 2.14k | break; | 1858 | 1.17k | case COMP_INTER_WEDGE: | 1859 | 4.33k | case COMP_INTER_SEG: | 1860 | 4.33k | dsp->mc.mask(uvdst, f->cur.stride[1], | 1861 | 4.33k | tmp[b->mask_sign], tmp[!b->mask_sign], | 1862 | 4.33k | bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask | 1863 | 4.33k | HIGHBD_CALL_SUFFIX); | 1864 | 4.33k | break; | 1865 | 24.2k | } | 1866 | 24.2k | } | 1867 | 17.4k | } | 1868 | | | 1869 | 222k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) { | 1870 | 0 | hex_dump(dst, f->cur.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred"); | 1871 | 0 | if (has_chroma) { | 1872 | 0 | hex_dump(&((pixel *) f->cur.data[1])[uvdstoff], f->cur.stride[1], | 1873 | 0 | cbw4 * 4, cbh4 * 4, "u-pred"); | 1874 | 0 | hex_dump(&((pixel *) f->cur.data[2])[uvdstoff], f->cur.stride[1], | 1875 | 0 | cbw4 * 4, cbh4 * 4, "v-pred"); | 1876 | 0 | } | 1877 | 0 | } | 1878 | | | 1879 | 222k | const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> 
ss_ver; | 1880 | | | 1881 | 222k | if (b->skip) { | 1882 | | // reset coef contexts | 1883 | 126k | BlockContext *const a = t->a; | 1884 | 126k | dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40); | 1885 | 126k | dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40); | 1886 | 126k | if (has_chroma) { | 1887 | 77.1k | dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)]; | 1888 | 77.1k | dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)]; | 1889 | 77.1k | memset_cw(&a->ccoef[0][cbx4], 0x40); | 1890 | 77.1k | memset_cw(&a->ccoef[1][cbx4], 0x40); | 1891 | 77.1k | memset_ch(&t->l.ccoef[0][cby4], 0x40); | 1892 | 77.1k | memset_ch(&t->l.ccoef[1][cby4], 0x40); | 1893 | 77.1k | } | 1894 | 126k | return 0; | 1895 | 126k | } | 1896 | | | 1897 | 95.2k | const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx]; | 1898 | 95.2k | const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx]; | 1899 | 95.2k | const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 }; | 1900 | | | 1901 | 194k | for (int init_y = 0; init_y < bh4; init_y += 16) { | 1902 | 201k | for (int init_x = 0; init_x < bw4; init_x += 16) { | 1903 | | // coefficient coding & inverse transforms | 1904 | 102k | int y_off = !!init_y, y; | 1905 | 102k | dst += PXSTRIDE(f->cur.stride[0]) * 4 * init_y; | 1906 | 220k | for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16); | 1907 | 118k | y += ytx->h, y_off++) | 1908 | 118k | { | 1909 | 118k | int x, x_off = !!init_x; | 1910 | 290k | for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16); | 1911 | 171k | x += ytx->w, x_off++) | 1912 | 171k | { | 1913 | 171k | read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split, | 1914 | 171k | x_off, y_off, &dst[x * 4]); | 1915 | 171k | t->bx += ytx->w; | 1916 | 171k | } | 1917 | 118k | dst += PXSTRIDE(f->cur.stride[0]) * 4 * ytx->h; | 1918 | 118k | t->bx -= x; | 1919 | 118k | t->by += ytx->h; | 1920 | 118k | } | 1921 | 102k | dst -= PXSTRIDE(f->cur.stride[0]) * 4 * y; | 1922 | 102k | t->by -= y; | 
1923 | | | 1924 | | // chroma coefs and inverse transform | 1925 | 239k | if (has_chroma) for (int pl = 0; pl < 2; pl++) { | 1926 | 159k | pixel *uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff + | 1927 | 159k | (PXSTRIDE(f->cur.stride[1]) * init_y * 4 >> ss_ver); | 1928 | 159k | for (y = init_y >> ss_ver, t->by += init_y; | 1929 | 352k | y < imin(ch4, (init_y + 16) >> ss_ver); y += uvtx->h) | 1930 | 192k | { | 1931 | 192k | int x; | 1932 | 192k | for (x = init_x >> ss_hor, t->bx += init_x; | 1933 | 480k | x < imin(cw4, (init_x + 16) >> ss_hor); x += uvtx->w) | 1934 | 287k | { | 1935 | 287k | coef *cf; | 1936 | 287k | int eob; | 1937 | 287k | enum TxfmType txtp; | 1938 | 287k | if (t->frame_thread.pass) { | 1939 | 287k | const int p = t->frame_thread.pass & 1; | 1940 | 287k | const int cbi = *ts->frame_thread[p].cbi++; | 1941 | 287k | cf = ts->frame_thread[p].cf; | 1942 | 287k | ts->frame_thread[p].cf += uvtx->w * uvtx->h * 16; | 1943 | 287k | eob = cbi >> 5; | 1944 | 287k | txtp = cbi & 0x1f; | 1945 | 287k | } else { | 1946 | 0 | uint8_t cf_ctx; | 1947 | 0 | cf = bitfn(t->cf); | 1948 | 0 | txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 + | 1949 | 0 | bx4 + (x << ss_hor)]; | 1950 | 0 | eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x], | 1951 | 0 | &t->l.ccoef[pl][cby4 + y], | 1952 | 0 | b->uvtx, bs, b, 0, 1 + pl, | 1953 | 0 | cf, &txtp, &cf_ctx); | 1954 | 0 | if (DEBUG_BLOCK_INFO) | 1955 | 0 | printf("Post-uv-cf-blk[pl=%d,tx=%d," | 1956 | 0 | "txtp=%d,eob=%d]: r=%d\n", | 1957 | 0 | pl, b->uvtx, txtp, eob, ts->msac.rng); | 1958 | 0 | int ctw = imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor); | 1959 | 0 | int cth = imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver); | 1960 | 0 | dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw); | 1961 | 0 | dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth); | 1962 | 0 | } | 1963 | 287k | if (eob >= 0) { | 1964 | 107k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) | 1965 | 0 | coef_dump(cf, 
uvtx->h * 4, uvtx->w * 4, 3, "dq"); | 1966 | 107k | dsp->itx.itxfm_add[b->uvtx] | 1967 | 107k | [txtp](&uvdst[4 * x], | 1968 | 107k | f->cur.stride[1], | 1969 | 107k | cf, eob HIGHBD_CALL_SUFFIX); | 1970 | 107k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) | 1971 | 0 | hex_dump(&uvdst[4 * x], f->cur.stride[1], | 1972 | 0 | uvtx->w * 4, uvtx->h * 4, "recon"); | 1973 | 107k | } | 1974 | 287k | t->bx += uvtx->w << ss_hor; | 1975 | 287k | } | 1976 | 192k | uvdst += PXSTRIDE(f->cur.stride[1]) * 4 * uvtx->h; | 1977 | 192k | t->bx -= x << ss_hor; | 1978 | 192k | t->by += uvtx->h << ss_ver; | 1979 | 192k | } | 1980 | 159k | t->by -= y << ss_ver; | 1981 | 159k | } | 1982 | 102k | } | 1983 | 99.7k | } | 1984 | 95.2k | return 0; | 1985 | 222k | } |
dav1d_recon_b_inter_16bpc Line | Count | Source | 1559 | 135k | { | 1560 | 135k | Dav1dTileState *const ts = t->ts; | 1561 | 135k | const Dav1dFrameContext *const f = t->f; | 1562 | 135k | const Dav1dDSPContext *const dsp = f->dsp; | 1563 | 135k | const int bx4 = t->bx & 31, by4 = t->by & 31; | 1564 | 135k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 1565 | 135k | const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; | 1566 | 135k | const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver; | 1567 | 135k | const uint8_t *const b_dim = dav1d_block_dimensions[bs]; | 1568 | 135k | const int bw4 = b_dim[0], bh4 = b_dim[1]; | 1569 | 135k | const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by); | 1570 | 135k | const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 && | 1571 | 106k | (bw4 > ss_hor || t->bx & 1) && | 1572 | 101k | (bh4 > ss_ver || t->by & 1); | 1573 | 135k | const int chr_layout_idx = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 
0 : | 1574 | 135k | DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.layout; | 1575 | 135k | int res; | 1576 | | | 1577 | | // prediction | 1578 | 135k | const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor; | 1579 | 135k | pixel *dst = ((pixel *) f->cur.data[0]) + | 1580 | 135k | 4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx); | 1581 | 135k | const ptrdiff_t uvdstoff = | 1582 | 135k | 4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1])); | 1583 | 135k | if (IS_KEY_OR_INTRA(f->frame_hdr)) { | 1584 | | // intrabc | 1585 | 44.1k | assert(!f->frame_hdr->super_res.enabled); | 1586 | 44.1k | res = mc(t, dst, NULL, f->cur.stride[0], bw4, bh4, t->bx, t->by, 0, | 1587 | 44.1k | b->mv[0], &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR); | 1588 | 44.1k | if (res) return res; | 1589 | 99.4k | if (has_chroma) for (int pl = 1; pl < 3; pl++) { | 1590 | 66.2k | res = mc(t, ((pixel *)f->cur.data[pl]) + uvdstoff, NULL, f->cur.stride[1], | 1591 | 66.2k | bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver), | 1592 | 66.2k | t->bx & ~ss_hor, t->by & ~ss_ver, pl, b->mv[0], | 1593 | 66.2k | &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR); | 1594 | 66.2k | if (res) return res; | 1595 | 66.2k | } | 1596 | 91.5k | } else if (b->comp_type == COMP_INTER_NONE) { | 1597 | 73.3k | const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]]; | 1598 | 73.3k | const enum Filter2d filter_2d = b->filter2d; | 1599 | | | 1600 | 73.3k | if (imin(bw4, bh4) > 1 && | 1601 | 40.6k | ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) || | 1602 | 38.8k | (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION))) | 1603 | 2.93k | { | 1604 | 2.93k | res = warp_affine(t, dst, NULL, f->cur.stride[0], b_dim, 0, refp, | 1605 | 2.93k | b->motion_mode == MM_WARP ? 
&t->warpmv : | 1606 | 2.93k | &f->frame_hdr->gmv[b->ref[0]]); | 1607 | 2.93k | if (res) return res; | 1608 | 70.3k | } else { | 1609 | 70.3k | res = mc(t, dst, NULL, f->cur.stride[0], | 1610 | 70.3k | bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, b->ref[0], filter_2d); | 1611 | 70.3k | if (res) return res; | 1612 | 70.3k | if (b->motion_mode == MM_OBMC) { | 1613 | 11.2k | res = obmc(t, dst, f->cur.stride[0], b_dim, 0, bx4, by4, w4, h4); | 1614 | 11.2k | if (res) return res; | 1615 | 11.2k | } | 1616 | 70.3k | } | 1617 | 73.3k | if (b->interintra_type) { | 1618 | 2.97k | pixel *const tl_edge = bitfn(t->scratch.edge) + 32; | 1619 | 2.97k | enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ? | 1620 | 2.37k | SMOOTH_PRED : b->interintra_mode; | 1621 | 2.97k | pixel *const tmp = bitfn(t->scratch.interintra); | 1622 | 2.97k | int angle = 0; | 1623 | 2.97k | const pixel *top_sb_edge = NULL; | 1624 | 2.97k | if (!(t->by & (f->sb_step - 1))) { | 1625 | 1.45k | top_sb_edge = f->ipred_edge[0]; | 1626 | 1.45k | const int sby = t->by >> f->sb_shift; | 1627 | 1.45k | top_sb_edge += f->sb128w * 128 * (sby - 1); | 1628 | 1.45k | } | 1629 | 2.97k | m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start, | 1630 | 2.97k | t->by, t->by > ts->tiling.row_start, | 1631 | 2.97k | ts->tiling.col_end, ts->tiling.row_end, | 1632 | 2.97k | 0, dst, f->cur.stride[0], top_sb_edge, | 1633 | 2.97k | m, &angle, bw4, bh4, 0, tl_edge | 1634 | 2.97k | HIGHBD_CALL_SUFFIX); | 1635 | 2.97k | dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel), | 1636 | 2.97k | tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0 | 1637 | 2.97k | HIGHBD_CALL_SUFFIX); | 1638 | 2.97k | dsp->mc.blend(dst, f->cur.stride[0], tmp, | 1639 | 2.97k | bw4 * 4, bh4 * 4, II_MASK(0, bs, b)); | 1640 | 2.97k | } | 1641 | | | 1642 | 73.3k | if (!has_chroma) goto skip_inter_chroma_pred; | 1643 | | | 1644 | | // sub8x8 derivation | 1645 | 50.1k | int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver; | 1646 | 50.1k | refmvs_block *const 
*r; | 1647 | 50.1k | if (is_sub8x8) { | 1648 | 3.99k | assert(ss_hor == 1); | 1649 | 3.99k | r = &t->rt.r[(t->by & 31) + 5]; | 1650 | 3.99k | if (bw4 == 1) is_sub8x8 &= r[0][t->bx - 1].ref.ref[0] > 0; | 1651 | 3.99k | if (bh4 == ss_ver) is_sub8x8 &= r[-1][t->bx].ref.ref[0] > 0; | 1652 | 3.99k | if (bw4 == 1 && bh4 == ss_ver) | 1653 | 1.11k | is_sub8x8 &= r[-1][t->bx - 1].ref.ref[0] > 0; | 1654 | 3.99k | } | 1655 | | | 1656 | | // chroma prediction | 1657 | 50.1k | if (is_sub8x8) { | 1658 | 3.90k | assert(ss_hor == 1); | 1659 | 3.90k | ptrdiff_t h_off = 0, v_off = 0; | 1660 | 3.90k | if (bw4 == 1 && bh4 == ss_ver) { | 1661 | 3.28k | for (int pl = 0; pl < 2; pl++) { | 1662 | 2.19k | res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, | 1663 | 2.19k | NULL, f->cur.stride[1], | 1664 | 2.19k | bw4, bh4, t->bx - 1, t->by - 1, 1 + pl, | 1665 | 2.19k | r[-1][t->bx - 1].mv.mv[0], | 1666 | 2.19k | &f->refp[r[-1][t->bx - 1].ref.ref[0] - 1], | 1667 | 2.19k | r[-1][t->bx - 1].ref.ref[0] - 1, | 1668 | 2.19k | t->frame_thread.pass != 2 ? t->tl_4x4_filter : | 1669 | 2.19k | f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d); | 1670 | 2.19k | if (res) return res; | 1671 | 2.19k | } | 1672 | 1.09k | v_off = 2 * PXSTRIDE(f->cur.stride[1]); | 1673 | 1.09k | h_off = 2; | 1674 | 1.09k | } | 1675 | 3.90k | if (bw4 == 1) { | 1676 | 2.52k | const enum Filter2d left_filter_2d = | 1677 | 2.52k | dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]]; | 1678 | 7.57k | for (int pl = 0; pl < 2; pl++) { | 1679 | 5.05k | res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + v_off, NULL, | 1680 | 5.05k | f->cur.stride[1], bw4, bh4, t->bx - 1, | 1681 | 5.05k | t->by, 1 + pl, r[0][t->bx - 1].mv.mv[0], | 1682 | 5.05k | &f->refp[r[0][t->bx - 1].ref.ref[0] - 1], | 1683 | 5.05k | r[0][t->bx - 1].ref.ref[0] - 1, | 1684 | 5.05k | t->frame_thread.pass != 2 ? 
left_filter_2d : | 1685 | 5.05k | f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d); | 1686 | 5.05k | if (res) return res; | 1687 | 5.05k | } | 1688 | 2.52k | h_off = 2; | 1689 | 2.52k | } | 1690 | 3.90k | if (bh4 == ss_ver) { | 1691 | 2.47k | const enum Filter2d top_filter_2d = | 1692 | 2.47k | dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]]; | 1693 | 7.41k | for (int pl = 0; pl < 2; pl++) { | 1694 | 4.94k | res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off, NULL, | 1695 | 4.94k | f->cur.stride[1], bw4, bh4, t->bx, t->by - 1, | 1696 | 4.94k | 1 + pl, r[-1][t->bx].mv.mv[0], | 1697 | 4.94k | &f->refp[r[-1][t->bx].ref.ref[0] - 1], | 1698 | 4.94k | r[-1][t->bx].ref.ref[0] - 1, | 1699 | 4.94k | t->frame_thread.pass != 2 ? top_filter_2d : | 1700 | 4.94k | f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d); | 1701 | 4.94k | if (res) return res; | 1702 | 4.94k | } | 1703 | 2.47k | v_off = 2 * PXSTRIDE(f->cur.stride[1]); | 1704 | 2.47k | } | 1705 | 11.7k | for (int pl = 0; pl < 2; pl++) { | 1706 | 7.80k | res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.stride[1], | 1707 | 7.80k | bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0], | 1708 | 7.80k | refp, b->ref[0], filter_2d); | 1709 | 7.80k | if (res) return res; | 1710 | 7.80k | } | 1711 | 46.2k | } else { | 1712 | 46.2k | if (imin(cbw4, cbh4) > 1 && | 1713 | 21.6k | ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) || | 1714 | 20.2k | (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION))) | 1715 | 2.28k | { | 1716 | 6.84k | for (int pl = 0; pl < 2; pl++) { | 1717 | 4.56k | res = warp_affine(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, NULL, | 1718 | 4.56k | f->cur.stride[1], b_dim, 1 + pl, refp, | 1719 | 4.56k | b->motion_mode == MM_WARP ? 
&t->warpmv : | 1720 | 4.56k | &f->frame_hdr->gmv[b->ref[0]]); | 1721 | 4.56k | if (res) return res; | 1722 | 4.56k | } | 1723 | 43.9k | } else { | 1724 | 131k | for (int pl = 0; pl < 2; pl++) { | 1725 | 87.9k | res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, | 1726 | 87.9k | NULL, f->cur.stride[1], | 1727 | 87.9k | bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver), | 1728 | 87.9k | t->bx & ~ss_hor, t->by & ~ss_ver, | 1729 | 87.9k | 1 + pl, b->mv[0], refp, b->ref[0], filter_2d); | 1730 | 87.9k | if (res) return res; | 1731 | 87.9k | if (b->motion_mode == MM_OBMC) { | 1732 | 16.0k | res = obmc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, | 1733 | 16.0k | f->cur.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4); | 1734 | 16.0k | if (res) return res; | 1735 | 16.0k | } | 1736 | 87.9k | } | 1737 | 43.9k | } | 1738 | 46.2k | if (b->interintra_type) { | 1739 | | // FIXME for 8x32 with 4:2:2 subsampling, this probably does | 1740 | | // the wrong thing since it will select 4x16, not 4x32, as a | 1741 | | // transform size... | 1742 | 2.01k | const uint8_t *const ii_mask = II_MASK(chr_layout_idx, bs, b); | 1743 | | | 1744 | 6.05k | for (int pl = 0; pl < 2; pl++) { | 1745 | 4.03k | pixel *const tmp = bitfn(t->scratch.interintra); | 1746 | 4.03k | pixel *const tl_edge = bitfn(t->scratch.edge) + 32; | 1747 | 4.03k | enum IntraPredMode m = | 1748 | 4.03k | b->interintra_mode == II_SMOOTH_PRED ? 
| 1749 | 3.17k | SMOOTH_PRED : b->interintra_mode; | 1750 | 4.03k | int angle = 0; | 1751 | 4.03k | pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff; | 1752 | 4.03k | const pixel *top_sb_edge = NULL; | 1753 | 4.03k | if (!(t->by & (f->sb_step - 1))) { | 1754 | 2.06k | top_sb_edge = f->ipred_edge[pl + 1]; | 1755 | 2.06k | const int sby = t->by >> f->sb_shift; | 1756 | 2.06k | top_sb_edge += f->sb128w * 128 * (sby - 1); | 1757 | 2.06k | } | 1758 | 4.03k | m = bytefn(dav1d_prepare_intra_edges)(t->bx >> ss_hor, | 1759 | 4.03k | (t->bx >> ss_hor) > | 1760 | 4.03k | (ts->tiling.col_start >> ss_hor), | 1761 | 4.03k | t->by >> ss_ver, | 1762 | 4.03k | (t->by >> ss_ver) > | 1763 | 4.03k | (ts->tiling.row_start >> ss_ver), | 1764 | 4.03k | ts->tiling.col_end >> ss_hor, | 1765 | 4.03k | ts->tiling.row_end >> ss_ver, | 1766 | 4.03k | 0, uvdst, f->cur.stride[1], | 1767 | 4.03k | top_sb_edge, m, | 1768 | 4.03k | &angle, cbw4, cbh4, 0, tl_edge | 1769 | 4.03k | HIGHBD_CALL_SUFFIX); | 1770 | 4.03k | dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel), | 1771 | 4.03k | tl_edge, cbw4 * 4, cbh4 * 4, 0, 0, 0 | 1772 | 4.03k | HIGHBD_CALL_SUFFIX); | 1773 | 4.03k | dsp->mc.blend(uvdst, f->cur.stride[1], tmp, | 1774 | 4.03k | cbw4 * 4, cbh4 * 4, ii_mask); | 1775 | 4.03k | } | 1776 | 2.01k | } | 1777 | 46.2k | } | 1778 | | | 1779 | 73.3k | skip_inter_chroma_pred: {} | 1780 | 73.3k | t->tl_4x4_filter = filter_2d; | 1781 | 73.3k | } else { | 1782 | 18.2k | const enum Filter2d filter_2d = b->filter2d; | 1783 | | // Maximum super block size is 128x128 | 1784 | 18.2k | int16_t (*tmp)[128 * 128] = t->scratch.compinter; | 1785 | 18.2k | int jnt_weight; | 1786 | 18.2k | uint8_t *const seg_mask = t->scratch.seg_mask; | 1787 | 18.2k | const uint8_t *mask; | 1788 | | | 1789 | 54.7k | for (int i = 0; i < 2; i++) { | 1790 | 36.4k | const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]]; | 1791 | | | 1792 | 36.4k | if (b->inter_mode == GLOBALMV_GLOBALMV && 
f->gmv_warp_allowed[b->ref[i]]) { | 1793 | 1.36k | res = warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp, | 1794 | 1.36k | &f->frame_hdr->gmv[b->ref[i]]); | 1795 | 1.36k | if (res) return res; | 1796 | 35.1k | } else { | 1797 | 35.1k | res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0, | 1798 | 35.1k | b->mv[i], refp, b->ref[i], filter_2d); | 1799 | 35.1k | if (res) return res; | 1800 | 35.1k | } | 1801 | 36.4k | } | 1802 | 18.2k | switch (b->comp_type) { | 1803 | 12.5k | case COMP_INTER_AVG: | 1804 | 12.5k | dsp->mc.avg(dst, f->cur.stride[0], tmp[0], tmp[1], | 1805 | 12.5k | bw4 * 4, bh4 * 4 HIGHBD_CALL_SUFFIX); | 1806 | 12.5k | break; | 1807 | 2.02k | case COMP_INTER_WEIGHTED_AVG: | 1808 | 2.02k | jnt_weight = f->jnt_weights[b->ref[0]][b->ref[1]]; | 1809 | 2.02k | dsp->mc.w_avg(dst, f->cur.stride[0], tmp[0], tmp[1], | 1810 | 2.02k | bw4 * 4, bh4 * 4, jnt_weight HIGHBD_CALL_SUFFIX); | 1811 | 2.02k | break; | 1812 | 2.64k | case COMP_INTER_SEG: | 1813 | 2.64k | dsp->mc.w_mask[chr_layout_idx](dst, f->cur.stride[0], | 1814 | 2.64k | tmp[b->mask_sign], tmp[!b->mask_sign], | 1815 | 2.64k | bw4 * 4, bh4 * 4, seg_mask, | 1816 | 2.64k | b->mask_sign HIGHBD_CALL_SUFFIX); | 1817 | 2.64k | mask = seg_mask; | 1818 | 2.64k | break; | 1819 | 1.02k | case COMP_INTER_WEDGE: | 1820 | 1.02k | mask = WEDGE_MASK(0, bs, 0, b->wedge_idx); | 1821 | 1.02k | dsp->mc.mask(dst, f->cur.stride[0], | 1822 | 1.02k | tmp[b->mask_sign], tmp[!b->mask_sign], | 1823 | 1.02k | bw4 * 4, bh4 * 4, mask HIGHBD_CALL_SUFFIX); | 1824 | 1.02k | if (has_chroma) | 1825 | 762 | mask = WEDGE_MASK(chr_layout_idx, bs, b->mask_sign, b->wedge_idx); | 1826 | 1.02k | break; | 1827 | 18.2k | } | 1828 | | | 1829 | | // chroma | 1830 | 39.8k | if (has_chroma) for (int pl = 0; pl < 2; pl++) { | 1831 | 79.7k | for (int i = 0; i < 2; i++) { | 1832 | 53.1k | const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]]; | 1833 | 53.1k | if (b->inter_mode == GLOBALMV_GLOBALMV && | 1834 | 8.50k | imin(cbw4, cbh4) > 1 && 
f->gmv_warp_allowed[b->ref[i]]) | 1835 | 1.97k | { | 1836 | 1.97k | res = warp_affine(t, NULL, tmp[i], bw4 * 4 >> ss_hor, | 1837 | 1.97k | b_dim, 1 + pl, | 1838 | 1.97k | refp, &f->frame_hdr->gmv[b->ref[i]]); | 1839 | 1.97k | if (res) return res; | 1840 | 51.2k | } else { | 1841 | 51.2k | res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, | 1842 | 51.2k | 1 + pl, b->mv[i], refp, b->ref[i], filter_2d); | 1843 | 51.2k | if (res) return res; | 1844 | 51.2k | } | 1845 | 53.1k | } | 1846 | 26.5k | pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff; | 1847 | 26.5k | switch (b->comp_type) { | 1848 | 18.6k | case COMP_INTER_AVG: | 1849 | 18.6k | dsp->mc.avg(uvdst, f->cur.stride[1], tmp[0], tmp[1], | 1850 | 18.6k | bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver | 1851 | 18.6k | HIGHBD_CALL_SUFFIX); | 1852 | 18.6k | break; | 1853 | 2.79k | case COMP_INTER_WEIGHTED_AVG: | 1854 | 2.79k | dsp->mc.w_avg(uvdst, f->cur.stride[1], tmp[0], tmp[1], | 1855 | 2.79k | bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight | 1856 | 2.79k | HIGHBD_CALL_SUFFIX); | 1857 | 2.79k | break; | 1858 | 1.52k | case COMP_INTER_WEDGE: | 1859 | 5.19k | case COMP_INTER_SEG: | 1860 | 5.19k | dsp->mc.mask(uvdst, f->cur.stride[1], | 1861 | 5.19k | tmp[b->mask_sign], tmp[!b->mask_sign], | 1862 | 5.19k | bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask | 1863 | 5.19k | HIGHBD_CALL_SUFFIX); | 1864 | 5.19k | break; | 1865 | 26.5k | } | 1866 | 26.5k | } | 1867 | 18.2k | } | 1868 | | | 1869 | 135k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) { | 1870 | 0 | hex_dump(dst, f->cur.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred"); | 1871 | 0 | if (has_chroma) { | 1872 | 0 | hex_dump(&((pixel *) f->cur.data[1])[uvdstoff], f->cur.stride[1], | 1873 | 0 | cbw4 * 4, cbh4 * 4, "u-pred"); | 1874 | 0 | hex_dump(&((pixel *) f->cur.data[2])[uvdstoff], f->cur.stride[1], | 1875 | 0 | cbw4 * 4, cbh4 * 4, "v-pred"); | 1876 | 0 | } | 1877 | 0 | } | 1878 | | | 1879 | 135k | const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> 
ss_ver; | 1880 | | | 1881 | 135k | if (b->skip) { | 1882 | | // reset coef contexts | 1883 | 55.8k | BlockContext *const a = t->a; | 1884 | 55.8k | dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40); | 1885 | 55.8k | dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40); | 1886 | 55.8k | if (has_chroma) { | 1887 | 35.4k | dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)]; | 1888 | 35.4k | dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)]; | 1889 | 35.4k | memset_cw(&a->ccoef[0][cbx4], 0x40); | 1890 | 35.4k | memset_cw(&a->ccoef[1][cbx4], 0x40); | 1891 | 35.4k | memset_ch(&t->l.ccoef[0][cby4], 0x40); | 1892 | 35.4k | memset_ch(&t->l.ccoef[1][cby4], 0x40); | 1893 | 35.4k | } | 1894 | 55.8k | return 0; | 1895 | 55.8k | } | 1896 | | | 1897 | 79.8k | const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx]; | 1898 | 79.8k | const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx]; | 1899 | 79.8k | const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 }; | 1900 | | | 1901 | 163k | for (int init_y = 0; init_y < bh4; init_y += 16) { | 1902 | 169k | for (int init_x = 0; init_x < bw4; init_x += 16) { | 1903 | | // coefficient coding & inverse transforms | 1904 | 85.5k | int y_off = !!init_y, y; | 1905 | 85.5k | dst += PXSTRIDE(f->cur.stride[0]) * 4 * init_y; | 1906 | 177k | for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16); | 1907 | 91.6k | y += ytx->h, y_off++) | 1908 | 91.6k | { | 1909 | 91.6k | int x, x_off = !!init_x; | 1910 | 205k | for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16); | 1911 | 114k | x += ytx->w, x_off++) | 1912 | 114k | { | 1913 | 114k | read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split, | 1914 | 114k | x_off, y_off, &dst[x * 4]); | 1915 | 114k | t->bx += ytx->w; | 1916 | 114k | } | 1917 | 91.6k | dst += PXSTRIDE(f->cur.stride[0]) * 4 * ytx->h; | 1918 | 91.6k | t->bx -= x; | 1919 | 91.6k | t->by += ytx->h; | 1920 | 91.6k | } | 1921 | 85.5k | dst -= PXSTRIDE(f->cur.stride[0]) * 4 * y; | 1922 | 
85.5k | t->by -= y; | 1923 | | | 1924 | | // chroma coefs and inverse transform | 1925 | 196k | if (has_chroma) for (int pl = 0; pl < 2; pl++) { | 1926 | 130k | pixel *uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff + | 1927 | 130k | (PXSTRIDE(f->cur.stride[1]) * init_y * 4 >> ss_ver); | 1928 | 130k | for (y = init_y >> ss_ver, t->by += init_y; | 1929 | 276k | y < imin(ch4, (init_y + 16) >> ss_ver); y += uvtx->h) | 1930 | 145k | { | 1931 | 145k | int x; | 1932 | 145k | for (x = init_x >> ss_hor, t->bx += init_x; | 1933 | 335k | x < imin(cw4, (init_x + 16) >> ss_hor); x += uvtx->w) | 1934 | 189k | { | 1935 | 189k | coef *cf; | 1936 | 189k | int eob; | 1937 | 189k | enum TxfmType txtp; | 1938 | 189k | if (t->frame_thread.pass) { | 1939 | 189k | const int p = t->frame_thread.pass & 1; | 1940 | 189k | const int cbi = *ts->frame_thread[p].cbi++; | 1941 | 189k | cf = ts->frame_thread[p].cf; | 1942 | 189k | ts->frame_thread[p].cf += uvtx->w * uvtx->h * 16; | 1943 | 189k | eob = cbi >> 5; | 1944 | 189k | txtp = cbi & 0x1f; | 1945 | 189k | } else { | 1946 | 0 | uint8_t cf_ctx; | 1947 | 0 | cf = bitfn(t->cf); | 1948 | 0 | txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 + | 1949 | 0 | bx4 + (x << ss_hor)]; | 1950 | 0 | eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x], | 1951 | 0 | &t->l.ccoef[pl][cby4 + y], | 1952 | 0 | b->uvtx, bs, b, 0, 1 + pl, | 1953 | 0 | cf, &txtp, &cf_ctx); | 1954 | 0 | if (DEBUG_BLOCK_INFO) | 1955 | 0 | printf("Post-uv-cf-blk[pl=%d,tx=%d," | 1956 | 0 | "txtp=%d,eob=%d]: r=%d\n", | 1957 | 0 | pl, b->uvtx, txtp, eob, ts->msac.rng); | 1958 | 0 | int ctw = imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor); | 1959 | 0 | int cth = imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver); | 1960 | 0 | dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw); | 1961 | 0 | dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth); | 1962 | 0 | } | 1963 | 189k | if (eob >= 0) { | 1964 | 56.3k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) | 1965 | 0 
| coef_dump(cf, uvtx->h * 4, uvtx->w * 4, 3, "dq"); | 1966 | 56.3k | dsp->itx.itxfm_add[b->uvtx] | 1967 | 56.3k | [txtp](&uvdst[4 * x], | 1968 | 56.3k | f->cur.stride[1], | 1969 | 56.3k | cf, eob HIGHBD_CALL_SUFFIX); | 1970 | 56.3k | if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) | 1971 | 0 | hex_dump(&uvdst[4 * x], f->cur.stride[1], | 1972 | 0 | uvtx->w * 4, uvtx->h * 4, "recon"); | 1973 | 56.3k | } | 1974 | 189k | t->bx += uvtx->w << ss_hor; | 1975 | 189k | } | 1976 | 145k | uvdst += PXSTRIDE(f->cur.stride[1]) * 4 * uvtx->h; | 1977 | 145k | t->bx -= x << ss_hor; | 1978 | 145k | t->by += uvtx->h << ss_ver; | 1979 | 145k | } | 1980 | 130k | t->by -= y << ss_ver; | 1981 | 130k | } | 1982 | 85.5k | } | 1983 | 84.1k | } | 1984 | 79.8k | return 0; | 1985 | 135k | } |
|
1986 | | |
1987 | 185k | void bytefn(dav1d_filter_sbrow_deblock_cols)(Dav1dFrameContext *const f, const int sby) { |
1988 | 185k | if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK) || |
1989 | 185k | (!f->frame_hdr->loopfilter.level_y[0] && !f->frame_hdr->loopfilter.level_y[1])) |
1990 | 0 | { |
1991 | 0 | return; |
1992 | 0 | } |
1993 | 185k | const int y = sby * f->sb_step * 4; |
1994 | 185k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; |
1995 | 185k | pixel *const p[3] = { |
1996 | 185k | f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]), |
1997 | 185k | f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver), |
1998 | 185k | f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver) |
1999 | 185k | }; |
2000 | 185k | Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w; |
2001 | 185k | bytefn(dav1d_loopfilter_sbrow_cols)(f, p, mask, sby, |
2002 | 185k | f->lf.start_of_tile_row[sby]); |
2003 | 185k | } dav1d_filter_sbrow_deblock_cols_8bpc Line | Count | Source | 1987 | 93.7k | void bytefn(dav1d_filter_sbrow_deblock_cols)(Dav1dFrameContext *const f, const int sby) { | 1988 | 93.7k | if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK) || | 1989 | 93.7k | (!f->frame_hdr->loopfilter.level_y[0] && !f->frame_hdr->loopfilter.level_y[1])) | 1990 | 0 | { | 1991 | 0 | return; | 1992 | 0 | } | 1993 | 93.7k | const int y = sby * f->sb_step * 4; | 1994 | 93.7k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 1995 | 93.7k | pixel *const p[3] = { | 1996 | 93.7k | f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]), | 1997 | 93.7k | f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver), | 1998 | 93.7k | f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver) | 1999 | 93.7k | }; | 2000 | 93.7k | Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w; | 2001 | 93.7k | bytefn(dav1d_loopfilter_sbrow_cols)(f, p, mask, sby, | 2002 | 93.7k | f->lf.start_of_tile_row[sby]); | 2003 | 93.7k | } |
dav1d_filter_sbrow_deblock_cols_16bpc Line | Count | Source | 1987 | 91.7k | void bytefn(dav1d_filter_sbrow_deblock_cols)(Dav1dFrameContext *const f, const int sby) { | 1988 | 91.7k | if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK) || | 1989 | 91.7k | (!f->frame_hdr->loopfilter.level_y[0] && !f->frame_hdr->loopfilter.level_y[1])) | 1990 | 0 | { | 1991 | 0 | return; | 1992 | 0 | } | 1993 | 91.7k | const int y = sby * f->sb_step * 4; | 1994 | 91.7k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 1995 | 91.7k | pixel *const p[3] = { | 1996 | 91.7k | f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]), | 1997 | 91.7k | f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver), | 1998 | 91.7k | f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver) | 1999 | 91.7k | }; | 2000 | 91.7k | Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w; | 2001 | 91.7k | bytefn(dav1d_loopfilter_sbrow_cols)(f, p, mask, sby, | 2002 | 91.7k | f->lf.start_of_tile_row[sby]); | 2003 | 91.7k | } |
|
2004 | | |
2005 | 243k | void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const int sby) { |
2006 | 243k | const int y = sby * f->sb_step * 4; |
2007 | 243k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; |
2008 | 243k | pixel *const p[3] = { |
2009 | 243k | f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]), |
2010 | 243k | f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver), |
2011 | 243k | f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver) |
2012 | 243k | }; |
2013 | 243k | Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w; |
2014 | 243k | if (f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK && |
2015 | 243k | (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1])) |
2016 | 185k | { |
2017 | 185k | bytefn(dav1d_loopfilter_sbrow_rows)(f, p, mask, sby); |
2018 | 185k | } |
2019 | 243k | if (f->seq_hdr->cdef || f->lf.restore_planes) { |
2020 | | // Store loop filtered pixels required by CDEF / LR |
2021 | 208k | bytefn(dav1d_copy_lpf)(f, p, sby); |
2022 | 208k | } |
2023 | 243k | } dav1d_filter_sbrow_deblock_rows_8bpc Line | Count | Source | 2005 | 120k | void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const int sby) { | 2006 | 120k | const int y = sby * f->sb_step * 4; | 2007 | 120k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 2008 | 120k | pixel *const p[3] = { | 2009 | 120k | f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]), | 2010 | 120k | f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver), | 2011 | 120k | f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver) | 2012 | 120k | }; | 2013 | 120k | Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w; | 2014 | 120k | if (f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK && | 2015 | 120k | (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1])) | 2016 | 93.5k | { | 2017 | 93.5k | bytefn(dav1d_loopfilter_sbrow_rows)(f, p, mask, sby); | 2018 | 93.5k | } | 2019 | 120k | if (f->seq_hdr->cdef || f->lf.restore_planes) { | 2020 | | // Store loop filtered pixels required by CDEF / LR | 2021 | 101k | bytefn(dav1d_copy_lpf)(f, p, sby); | 2022 | 101k | } | 2023 | 120k | } |
dav1d_filter_sbrow_deblock_rows_16bpc Line | Count | Source | 2005 | 123k | void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const int sby) { | 2006 | 123k | const int y = sby * f->sb_step * 4; | 2007 | 123k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 2008 | 123k | pixel *const p[3] = { | 2009 | 123k | f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]), | 2010 | 123k | f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver), | 2011 | 123k | f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver) | 2012 | 123k | }; | 2013 | 123k | Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w; | 2014 | 123k | if (f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK && | 2015 | 123k | (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1])) | 2016 | 91.6k | { | 2017 | 91.6k | bytefn(dav1d_loopfilter_sbrow_rows)(f, p, mask, sby); | 2018 | 91.6k | } | 2019 | 123k | if (f->seq_hdr->cdef || f->lf.restore_planes) { | 2020 | | // Store loop filtered pixels required by CDEF / LR | 2021 | 107k | bytefn(dav1d_copy_lpf)(f, p, sby); | 2022 | 107k | } | 2023 | 123k | } |
|
2024 | | |
2025 | 161k | void bytefn(dav1d_filter_sbrow_cdef)(Dav1dTaskContext *const tc, const int sby) { |
2026 | 161k | const Dav1dFrameContext *const f = tc->f; |
2027 | 161k | if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_CDEF)) return; |
2028 | 161k | const int sbsz = f->sb_step; |
2029 | 161k | const int y = sby * sbsz * 4; |
2030 | 161k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; |
2031 | 161k | pixel *const p[3] = { |
2032 | 161k | f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]), |
2033 | 161k | f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver), |
2034 | 161k | f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver) |
2035 | 161k | }; |
2036 | 161k | Av1Filter *prev_mask = f->lf.mask + ((sby - 1) >> !f->seq_hdr->sb128) * f->sb128w; |
2037 | 161k | Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w; |
2038 | 161k | const int start = sby * sbsz; |
2039 | 161k | if (sby) { |
2040 | 97.8k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; |
2041 | 97.8k | pixel *p_up[3] = { |
2042 | 97.8k | p[0] - 8 * PXSTRIDE(f->cur.stride[0]), |
2043 | 97.8k | p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver), |
2044 | 97.8k | p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver), |
2045 | 97.8k | }; |
2046 | 97.8k | bytefn(dav1d_cdef_brow)(tc, p_up, prev_mask, start - 2, start, 1, sby); |
2047 | 97.8k | } |
2048 | 161k | const int n_blks = sbsz - 2 * (sby + 1 < f->sbh); |
2049 | 161k | const int end = imin(start + n_blks, f->bh); |
2050 | 161k | bytefn(dav1d_cdef_brow)(tc, p, mask, start, end, 0, sby); |
2051 | 161k | } dav1d_filter_sbrow_cdef_8bpc Line | Count | Source | 2025 | 76.7k | void bytefn(dav1d_filter_sbrow_cdef)(Dav1dTaskContext *const tc, const int sby) { | 2026 | 76.7k | const Dav1dFrameContext *const f = tc->f; | 2027 | 76.7k | if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_CDEF)) return; | 2028 | 76.7k | const int sbsz = f->sb_step; | 2029 | 76.7k | const int y = sby * sbsz * 4; | 2030 | 76.7k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 2031 | 76.7k | pixel *const p[3] = { | 2032 | 76.7k | f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]), | 2033 | 76.7k | f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver), | 2034 | 76.7k | f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver) | 2035 | 76.7k | }; | 2036 | 76.7k | Av1Filter *prev_mask = f->lf.mask + ((sby - 1) >> !f->seq_hdr->sb128) * f->sb128w; | 2037 | 76.7k | Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w; | 2038 | 76.7k | const int start = sby * sbsz; | 2039 | 76.7k | if (sby) { | 2040 | 42.1k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 2041 | 42.1k | pixel *p_up[3] = { | 2042 | 42.1k | p[0] - 8 * PXSTRIDE(f->cur.stride[0]), | 2043 | 42.1k | p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver), | 2044 | 42.1k | p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver), | 2045 | 42.1k | }; | 2046 | 42.1k | bytefn(dav1d_cdef_brow)(tc, p_up, prev_mask, start - 2, start, 1, sby); | 2047 | 42.1k | } | 2048 | 76.7k | const int n_blks = sbsz - 2 * (sby + 1 < f->sbh); | 2049 | 76.7k | const int end = imin(start + n_blks, f->bh); | 2050 | 76.7k | bytefn(dav1d_cdef_brow)(tc, p, mask, start, end, 0, sby); | 2051 | 76.7k | } |
dav1d_filter_sbrow_cdef_16bpc Line | Count | Source | 2025 | 84.4k | void bytefn(dav1d_filter_sbrow_cdef)(Dav1dTaskContext *const tc, const int sby) { | 2026 | 84.4k | const Dav1dFrameContext *const f = tc->f; | 2027 | 84.4k | if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_CDEF)) return; | 2028 | 84.4k | const int sbsz = f->sb_step; | 2029 | 84.4k | const int y = sby * sbsz * 4; | 2030 | 84.4k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 2031 | 84.4k | pixel *const p[3] = { | 2032 | 84.4k | f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]), | 2033 | 84.4k | f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver), | 2034 | 84.4k | f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver) | 2035 | 84.4k | }; | 2036 | 84.4k | Av1Filter *prev_mask = f->lf.mask + ((sby - 1) >> !f->seq_hdr->sb128) * f->sb128w; | 2037 | 84.4k | Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w; | 2038 | 84.4k | const int start = sby * sbsz; | 2039 | 84.4k | if (sby) { | 2040 | 55.7k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 2041 | 55.7k | pixel *p_up[3] = { | 2042 | 55.7k | p[0] - 8 * PXSTRIDE(f->cur.stride[0]), | 2043 | 55.7k | p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver), | 2044 | 55.7k | p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver), | 2045 | 55.7k | }; | 2046 | 55.7k | bytefn(dav1d_cdef_brow)(tc, p_up, prev_mask, start - 2, start, 1, sby); | 2047 | 55.7k | } | 2048 | 84.4k | const int n_blks = sbsz - 2 * (sby + 1 < f->sbh); | 2049 | 84.4k | const int end = imin(start + n_blks, f->bh); | 2050 | 84.4k | bytefn(dav1d_cdef_brow)(tc, p, mask, start, end, 0, sby); | 2051 | 84.4k | } |
|
2052 | | |
2053 | 44.4k | void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby) { |
2054 | 44.4k | const int sbsz = f->sb_step; |
2055 | 44.4k | const int y = sby * sbsz * 4; |
2056 | 44.4k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; |
2057 | 44.4k | const pixel *const p[3] = { |
2058 | 44.4k | f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]), |
2059 | 44.4k | f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver), |
2060 | 44.4k | f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver) |
2061 | 44.4k | }; |
2062 | 44.4k | pixel *const sr_p[3] = { |
2063 | 44.4k | f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]), |
2064 | 44.4k | f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver), |
2065 | 44.4k | f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver) |
2066 | 44.4k | }; |
2067 | 44.4k | const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400; |
2068 | 149k | for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) { |
2069 | 105k | const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; |
2070 | 105k | const int h_start = 8 * !!sby >> ss_ver; |
2071 | 105k | const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl]; |
2072 | 105k | pixel *dst = sr_p[pl] - h_start * PXSTRIDE(dst_stride); |
2073 | 105k | const ptrdiff_t src_stride = f->cur.stride[!!pl]; |
2074 | 105k | const pixel *src = p[pl] - h_start * PXSTRIDE(src_stride); |
2075 | 105k | const int h_end = 4 * (sbsz - 2 * (sby + 1 < f->sbh)) >> ss_ver; |
2076 | 105k | const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; |
2077 | 105k | const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor; |
2078 | 105k | const int src_w = (4 * f->bw + ss_hor) >> ss_hor; |
2079 | 105k | const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver; |
2080 | | |
2081 | 105k | f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w, |
2082 | 105k | imin(img_h, h_end) + h_start, src_w, |
2083 | 105k | f->resize_step[!!pl], f->resize_start[!!pl] |
2084 | 105k | HIGHBD_CALL_SUFFIX); |
2085 | 105k | } |
2086 | 44.4k | } dav1d_filter_sbrow_resize_8bpc Line | Count | Source | 2053 | 21.8k | void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby) { | 2054 | 21.8k | const int sbsz = f->sb_step; | 2055 | 21.8k | const int y = sby * sbsz * 4; | 2056 | 21.8k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 2057 | 21.8k | const pixel *const p[3] = { | 2058 | 21.8k | f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]), | 2059 | 21.8k | f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver), | 2060 | 21.8k | f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver) | 2061 | 21.8k | }; | 2062 | 21.8k | pixel *const sr_p[3] = { | 2063 | 21.8k | f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]), | 2064 | 21.8k | f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver), | 2065 | 21.8k | f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver) | 2066 | 21.8k | }; | 2067 | 21.8k | const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400; | 2068 | 79.3k | for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) { | 2069 | 57.4k | const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 2070 | 57.4k | const int h_start = 8 * !!sby >> ss_ver; | 2071 | 57.4k | const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl]; | 2072 | 57.4k | pixel *dst = sr_p[pl] - h_start * PXSTRIDE(dst_stride); | 2073 | 57.4k | const ptrdiff_t src_stride = f->cur.stride[!!pl]; | 2074 | 57.4k | const pixel *src = p[pl] - h_start * PXSTRIDE(src_stride); | 2075 | 57.4k | const int h_end = 4 * (sbsz - 2 * (sby + 1 < f->sbh)) >> ss_ver; | 2076 | 57.4k | const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; | 2077 | 57.4k | const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor; | 2078 | 57.4k | const int src_w = (4 * f->bw + ss_hor) >> ss_hor; | 2079 | 57.4k | const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver; | 2080 | | | 2081 | 57.4k | f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w, | 2082 | 57.4k | 
imin(img_h, h_end) + h_start, src_w, | 2083 | 57.4k | f->resize_step[!!pl], f->resize_start[!!pl] | 2084 | 57.4k | HIGHBD_CALL_SUFFIX); | 2085 | 57.4k | } | 2086 | 21.8k | } |
dav1d_filter_sbrow_resize_16bpc Line | Count | Source | 2053 | 22.5k | void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby) { | 2054 | 22.5k | const int sbsz = f->sb_step; | 2055 | 22.5k | const int y = sby * sbsz * 4; | 2056 | 22.5k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 2057 | 22.5k | const pixel *const p[3] = { | 2058 | 22.5k | f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]), | 2059 | 22.5k | f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver), | 2060 | 22.5k | f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver) | 2061 | 22.5k | }; | 2062 | 22.5k | pixel *const sr_p[3] = { | 2063 | 22.5k | f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]), | 2064 | 22.5k | f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver), | 2065 | 22.5k | f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver) | 2066 | 22.5k | }; | 2067 | 22.5k | const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400; | 2068 | 70.5k | for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) { | 2069 | 47.9k | const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 2070 | 47.9k | const int h_start = 8 * !!sby >> ss_ver; | 2071 | 47.9k | const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl]; | 2072 | 47.9k | pixel *dst = sr_p[pl] - h_start * PXSTRIDE(dst_stride); | 2073 | 47.9k | const ptrdiff_t src_stride = f->cur.stride[!!pl]; | 2074 | 47.9k | const pixel *src = p[pl] - h_start * PXSTRIDE(src_stride); | 2075 | 47.9k | const int h_end = 4 * (sbsz - 2 * (sby + 1 < f->sbh)) >> ss_ver; | 2076 | 47.9k | const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; | 2077 | 47.9k | const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor; | 2078 | 47.9k | const int src_w = (4 * f->bw + ss_hor) >> ss_hor; | 2079 | 47.9k | const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver; | 2080 | | | 2081 | 47.9k | f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w, | 2082 | 47.9k | imin(img_h, h_end) + 
h_start, src_w, | 2083 | 47.9k | f->resize_step[!!pl], f->resize_start[!!pl] | 2084 | 47.9k | HIGHBD_CALL_SUFFIX); | 2085 | 47.9k | } | 2086 | 22.5k | } |
|
2087 | | |
2088 | 111k | void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) { |
2089 | 111k | if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_RESTORATION)) return; |
2090 | 111k | const int y = sby * f->sb_step * 4; |
2091 | 111k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; |
2092 | 111k | pixel *const sr_p[3] = { |
2093 | 111k | f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]), |
2094 | 111k | f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver), |
2095 | 111k | f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver) |
2096 | 111k | }; |
2097 | 111k | bytefn(dav1d_lr_sbrow)(f, sr_p, sby); |
2098 | 111k | } dav1d_filter_sbrow_lr_8bpc Line | Count | Source | 2088 | 62.3k | void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) { | 2089 | 62.3k | if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_RESTORATION)) return; | 2090 | 62.3k | const int y = sby * f->sb_step * 4; | 2091 | 62.3k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 2092 | 62.3k | pixel *const sr_p[3] = { | 2093 | 62.3k | f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]), | 2094 | 62.3k | f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver), | 2095 | 62.3k | f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver) | 2096 | 62.3k | }; | 2097 | 62.3k | bytefn(dav1d_lr_sbrow)(f, sr_p, sby); | 2098 | 62.3k | } |
dav1d_filter_sbrow_lr_16bpc Line | Count | Source | 2088 | 49.3k | void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) { | 2089 | 49.3k | if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_RESTORATION)) return; | 2090 | 49.3k | const int y = sby * f->sb_step * 4; | 2091 | 49.3k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 2092 | 49.3k | pixel *const sr_p[3] = { | 2093 | 49.3k | f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]), | 2094 | 49.3k | f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver), | 2095 | 49.3k | f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver) | 2096 | 49.3k | }; | 2097 | 49.3k | bytefn(dav1d_lr_sbrow)(f, sr_p, sby); | 2098 | 49.3k | } |
|
2099 | | |
2100 | 0 | void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) { |
2101 | 0 | bytefn(dav1d_filter_sbrow_deblock_cols)(f, sby); |
2102 | 0 | bytefn(dav1d_filter_sbrow_deblock_rows)(f, sby); |
2103 | 0 | if (f->seq_hdr->cdef) |
2104 | 0 | bytefn(dav1d_filter_sbrow_cdef)(f->c->tc, sby); |
2105 | 0 | if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) |
2106 | 0 | bytefn(dav1d_filter_sbrow_resize)(f, sby); |
2107 | 0 | if (f->lf.restore_planes) |
2108 | 0 | bytefn(dav1d_filter_sbrow_lr)(f, sby); |
2109 | 0 | } Unexecuted instantiation: dav1d_filter_sbrow_8bpc Unexecuted instantiation: dav1d_filter_sbrow_16bpc |
2110 | | |
2111 | 296k | void bytefn(dav1d_backup_ipred_edge)(Dav1dTaskContext *const t) { |
2112 | 296k | const Dav1dFrameContext *const f = t->f; |
2113 | 296k | Dav1dTileState *const ts = t->ts; |
2114 | 296k | const int sby = t->by >> f->sb_shift; |
2115 | 296k | const int sby_off = f->sb128w * 128 * sby; |
2116 | 296k | const int x_off = ts->tiling.col_start; |
2117 | | |
2118 | 296k | const pixel *const y = |
2119 | 296k | ((const pixel *) f->cur.data[0]) + x_off * 4 + |
2120 | 296k | ((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.stride[0]); |
2121 | 296k | pixel_copy(&f->ipred_edge[0][sby_off + x_off * 4], y, |
2122 | 296k | 4 * (ts->tiling.col_end - x_off)); |
2123 | | |
2124 | 296k | if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) { |
2125 | 232k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; |
2126 | 232k | const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; |
2127 | | |
2128 | 232k | const ptrdiff_t uv_off = (x_off * 4 >> ss_hor) + |
2129 | 232k | (((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.stride[1]); |
2130 | 696k | for (int pl = 1; pl <= 2; pl++) |
2131 | 464k | pixel_copy(&f->ipred_edge[pl][sby_off + (x_off * 4 >> ss_hor)], |
2132 | 242k | &((const pixel *) f->cur.data[pl])[uv_off], |
2133 | 242k | 4 * (ts->tiling.col_end - x_off) >> ss_hor); |
2134 | 232k | } |
2135 | 296k | } dav1d_backup_ipred_edge_8bpc Line | Count | Source | 2111 | 149k | void bytefn(dav1d_backup_ipred_edge)(Dav1dTaskContext *const t) { | 2112 | 149k | const Dav1dFrameContext *const f = t->f; | 2113 | 149k | Dav1dTileState *const ts = t->ts; | 2114 | 149k | const int sby = t->by >> f->sb_shift; | 2115 | 149k | const int sby_off = f->sb128w * 128 * sby; | 2116 | 149k | const int x_off = ts->tiling.col_start; | 2117 | | | 2118 | 149k | const pixel *const y = | 2119 | 149k | ((const pixel *) f->cur.data[0]) + x_off * 4 + | 2120 | 149k | ((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.stride[0]); | 2121 | 149k | pixel_copy(&f->ipred_edge[0][sby_off + x_off * 4], y, | 2122 | 149k | 4 * (ts->tiling.col_end - x_off)); | 2123 | | | 2124 | 149k | if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) { | 2125 | 121k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 2126 | 121k | const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; | 2127 | | | 2128 | 121k | const ptrdiff_t uv_off = (x_off * 4 >> ss_hor) + | 2129 | 121k | (((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.stride[1]); | 2130 | 364k | for (int pl = 1; pl <= 2; pl++) | 2131 | 242k | pixel_copy(&f->ipred_edge[pl][sby_off + (x_off * 4 >> ss_hor)], | 2132 | 242k | &((const pixel *) f->cur.data[pl])[uv_off], | 2133 | 242k | 4 * (ts->tiling.col_end - x_off) >> ss_hor); | 2134 | 121k | } | 2135 | 149k | } |
dav1d_backup_ipred_edge_16bpc Line | Count | Source | 2111 | 147k | void bytefn(dav1d_backup_ipred_edge)(Dav1dTaskContext *const t) { | 2112 | 147k | const Dav1dFrameContext *const f = t->f; | 2113 | 147k | Dav1dTileState *const ts = t->ts; | 2114 | 147k | const int sby = t->by >> f->sb_shift; | 2115 | 147k | const int sby_off = f->sb128w * 128 * sby; | 2116 | 147k | const int x_off = ts->tiling.col_start; | 2117 | | | 2118 | 147k | const pixel *const y = | 2119 | 147k | ((const pixel *) f->cur.data[0]) + x_off * 4 + | 2120 | 147k | ((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.stride[0]); | 2121 | 147k | pixel_copy(&f->ipred_edge[0][sby_off + x_off * 4], y, | 2122 | 147k | 4 * (ts->tiling.col_end - x_off)); | 2123 | | | 2124 | 147k | if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) { | 2125 | 110k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 2126 | 110k | const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; | 2127 | | | 2128 | 110k | const ptrdiff_t uv_off = (x_off * 4 >> ss_hor) + | 2129 | 110k | (((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.stride[1]); | 2130 | 332k | for (int pl = 1; pl <= 2; pl++) | 2131 | 221k | pixel_copy(&f->ipred_edge[pl][sby_off + (x_off * 4 >> ss_hor)], | 2132 | 110k | &((const pixel *) f->cur.data[pl])[uv_off], | 2133 | 110k | 4 * (ts->tiling.col_end - x_off) >> ss_hor); | 2134 | 110k | } | 2135 | 147k | } |
|
2136 | | |
2137 | | void bytefn(dav1d_copy_pal_block_y)(Dav1dTaskContext *const t, |
2138 | | const int bx4, const int by4, |
2139 | | const int bw4, const int bh4) |
2140 | | |
2141 | 38.6k | { |
2142 | 38.6k | const Dav1dFrameContext *const f = t->f; |
2143 | 38.6k | pixel *const pal = t->frame_thread.pass ? |
2144 | 38.6k | f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + |
2145 | 38.6k | ((t->bx >> 1) + (t->by & 1))][0] : |
2146 | 38.6k | bytefn(t->scratch.pal)[0]; |
2147 | 184k | for (int x = 0; x < bw4; x++) |
2148 | 145k | memcpy(bytefn(t->al_pal)[0][bx4 + x][0], pal, 8 * sizeof(pixel)); |
2149 | 158k | for (int y = 0; y < bh4; y++) |
2150 | 119k | memcpy(bytefn(t->al_pal)[1][by4 + y][0], pal, 8 * sizeof(pixel)); |
2151 | 38.6k | } dav1d_copy_pal_block_y_8bpc Line | Count | Source | 2141 | 19.8k | { | 2142 | 19.8k | const Dav1dFrameContext *const f = t->f; | 2143 | 19.8k | pixel *const pal = t->frame_thread.pass ? | 2144 | 19.8k | f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + | 2145 | 19.8k | ((t->bx >> 1) + (t->by & 1))][0] : | 2146 | 19.8k | bytefn(t->scratch.pal)[0]; | 2147 | 93.7k | for (int x = 0; x < bw4; x++) | 2148 | 73.8k | memcpy(bytefn(t->al_pal)[0][bx4 + x][0], pal, 8 * sizeof(pixel)); | 2149 | 79.6k | for (int y = 0; y < bh4; y++) | 2150 | 59.7k | memcpy(bytefn(t->al_pal)[1][by4 + y][0], pal, 8 * sizeof(pixel)); | 2151 | 19.8k | } |
dav1d_copy_pal_block_y_16bpc Line | Count | Source | 2141 | 18.8k | { | 2142 | 18.8k | const Dav1dFrameContext *const f = t->f; | 2143 | 18.8k | pixel *const pal = t->frame_thread.pass ? | 2144 | 18.8k | f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + | 2145 | 18.8k | ((t->bx >> 1) + (t->by & 1))][0] : | 2146 | 18.8k | bytefn(t->scratch.pal)[0]; | 2147 | 90.5k | for (int x = 0; x < bw4; x++) | 2148 | 71.7k | memcpy(bytefn(t->al_pal)[0][bx4 + x][0], pal, 8 * sizeof(pixel)); | 2149 | 78.8k | for (int y = 0; y < bh4; y++) | 2150 | 60.0k | memcpy(bytefn(t->al_pal)[1][by4 + y][0], pal, 8 * sizeof(pixel)); | 2151 | 18.8k | } |
|
2152 | | |
2153 | | void bytefn(dav1d_copy_pal_block_uv)(Dav1dTaskContext *const t, |
2154 | | const int bx4, const int by4, |
2155 | | const int bw4, const int bh4) |
2156 | | |
2157 | 13.2k | { |
2158 | 13.2k | const Dav1dFrameContext *const f = t->f; |
2159 | 13.2k | const pixel (*const pal)[8] = t->frame_thread.pass ? |
2160 | 13.2k | f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + |
2161 | 13.2k | ((t->bx >> 1) + (t->by & 1))] : |
2162 | 13.2k | bytefn(t->scratch.pal); |
2163 | | // see aomedia bug 2183 for why we use luma coordinates here |
2164 | 39.8k | for (int pl = 1; pl <= 2; pl++) { |
2165 | 148k | for (int x = 0; x < bw4; x++) |
2166 | 121k | memcpy(bytefn(t->al_pal)[0][bx4 + x][pl], pal[pl], 8 * sizeof(pixel)); |
2167 | 134k | for (int y = 0; y < bh4; y++) |
2168 | 107k | memcpy(bytefn(t->al_pal)[1][by4 + y][pl], pal[pl], 8 * sizeof(pixel)); |
2169 | 26.5k | } |
2170 | 13.2k | } dav1d_copy_pal_block_uv_8bpc Line | Count | Source | 2157 | 7.30k | { | 2158 | 7.30k | const Dav1dFrameContext *const f = t->f; | 2159 | 7.30k | const pixel (*const pal)[8] = t->frame_thread.pass ? | 2160 | 7.30k | f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + | 2161 | 7.30k | ((t->bx >> 1) + (t->by & 1))] : | 2162 | 7.30k | bytefn(t->scratch.pal); | 2163 | | // see aomedia bug 2183 for why we use luma coordinates here | 2164 | 21.9k | for (int pl = 1; pl <= 2; pl++) { | 2165 | 81.4k | for (int x = 0; x < bw4; x++) | 2166 | 66.8k | memcpy(bytefn(t->al_pal)[0][bx4 + x][pl], pal[pl], 8 * sizeof(pixel)); | 2167 | 74.7k | for (int y = 0; y < bh4; y++) | 2168 | 60.1k | memcpy(bytefn(t->al_pal)[1][by4 + y][pl], pal[pl], 8 * sizeof(pixel)); | 2169 | 14.6k | } | 2170 | 7.30k | } |
dav1d_copy_pal_block_uv_16bpc Line | Count | Source | 2157 | 5.97k | { | 2158 | 5.97k | const Dav1dFrameContext *const f = t->f; | 2159 | 5.97k | const pixel (*const pal)[8] = t->frame_thread.pass ? | 2160 | 5.97k | f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + | 2161 | 5.97k | ((t->bx >> 1) + (t->by & 1))] : | 2162 | 5.97k | bytefn(t->scratch.pal); | 2163 | | // see aomedia bug 2183 for why we use luma coordinates here | 2164 | 17.9k | for (int pl = 1; pl <= 2; pl++) { | 2165 | 66.6k | for (int x = 0; x < bw4; x++) | 2166 | 54.7k | memcpy(bytefn(t->al_pal)[0][bx4 + x][pl], pal[pl], 8 * sizeof(pixel)); | 2167 | 59.4k | for (int y = 0; y < bh4; y++) | 2168 | 47.5k | memcpy(bytefn(t->al_pal)[1][by4 + y][pl], pal[pl], 8 * sizeof(pixel)); | 2169 | 11.9k | } | 2170 | 5.97k | } |
|
2171 | | |
2172 | | void bytefn(dav1d_read_pal_plane)(Dav1dTaskContext *const t, Av1Block *const b, |
2173 | | const int pl, const int sz_ctx, |
2174 | | const int bx4, const int by4) |
2175 | 51.9k | { |
2176 | 51.9k | Dav1dTileState *const ts = t->ts; |
2177 | 51.9k | const Dav1dFrameContext *const f = t->f; |
2178 | 51.9k | const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac, |
2179 | 51.9k | ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2; |
2180 | 51.9k | pixel cache[16], used_cache[8]; |
2181 | 51.9k | int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4]; |
2182 | 51.9k | int n_cache = 0; |
2183 | | // don't reuse above palette outside SB64 boundaries |
2184 | 51.9k | int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0; |
2185 | 51.9k | const pixel *l = bytefn(t->al_pal)[1][by4][pl]; |
2186 | 51.9k | const pixel *a = bytefn(t->al_pal)[0][bx4][pl]; |
2187 | | |
2188 | | // fill/sort cache |
2189 | 84.5k | while (l_cache && a_cache) { |
2190 | 32.5k | if (*l < *a) { |
2191 | 11.8k | if (!n_cache || cache[n_cache - 1] != *l) |
2192 | 11.7k | cache[n_cache++] = *l; |
2193 | 11.8k | l++; |
2194 | 11.8k | l_cache--; |
2195 | 20.6k | } else { |
2196 | 20.6k | if (*a == *l) { |
2197 | 8.31k | l++; |
2198 | 8.31k | l_cache--; |
2199 | 8.31k | } |
2200 | 20.6k | if (!n_cache || cache[n_cache - 1] != *a) |
2201 | 19.9k | cache[n_cache++] = *a; |
2202 | 20.6k | a++; |
2203 | 20.6k | a_cache--; |
2204 | 20.6k | } |
2205 | 32.5k | } |
2206 | 51.9k | if (l_cache) { |
2207 | 62.2k | do { |
2208 | 62.2k | if (!n_cache || cache[n_cache - 1] != *l) |
2209 | 50.7k | cache[n_cache++] = *l; |
2210 | 62.2k | l++; |
2211 | 62.2k | } while (--l_cache > 0); |
2212 | 36.9k | } else if (a_cache) { |
2213 | 46.0k | do { |
2214 | 46.0k | if (!n_cache || cache[n_cache - 1] != *a) |
2215 | 36.7k | cache[n_cache++] = *a; |
2216 | 46.0k | a++; |
2217 | 46.0k | } while (--a_cache > 0); |
2218 | 11.1k | } |
2219 | | |
2220 | | // find reused cache entries |
2221 | 51.9k | int i = 0; |
2222 | 157k | for (int n = 0; n < n_cache && i < pal_sz; n++) |
2223 | 105k | if (dav1d_msac_decode_bool_equi(&ts->msac)) |
2224 | 52.5k | used_cache[i++] = cache[n]; |
2225 | 51.9k | const int n_used_cache = i; |
2226 | | |
2227 | | // parse new entries |
2228 | 51.9k | pixel *const pal = t->frame_thread.pass ? |
2229 | 51.9k | f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + |
2230 | 51.9k | ((t->bx >> 1) + (t->by & 1))][pl] : |
2231 | 51.9k | bytefn(t->scratch.pal)[pl]; |
2232 | 51.9k | if (i < pal_sz) { |
2233 | 45.3k | const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc; |
2234 | 45.3k | int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, bpc); |
2235 | | |
2236 | 45.3k | if (i < pal_sz) { |
2237 | 40.9k | int bits = bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2); |
2238 | 40.9k | const int max = (1 << bpc) - 1; |
2239 | | |
2240 | 95.4k | do { |
2241 | 95.4k | const int delta = dav1d_msac_decode_bools(&ts->msac, bits); |
2242 | 95.4k | prev = pal[i++] = imin(prev + delta + !pl, max); |
2243 | 95.4k | if (prev + !pl >= max) { |
2244 | 53.8k | for (; i < pal_sz; i++) |
2245 | 35.2k | pal[i] = max; |
2246 | 18.6k | break; |
2247 | 18.6k | } |
2248 | 76.8k | bits = imin(bits, 1 + ulog2(max - prev - !pl)); |
2249 | 76.8k | } while (i < pal_sz); |
2250 | 40.9k | } |
2251 | | |
2252 | | // merge cache+new entries |
2253 | 45.3k | int n = 0, m = n_used_cache; |
2254 | 256k | for (i = 0; i < pal_sz; i++) { |
2255 | 210k | if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) { |
2256 | 34.9k | pal[i] = used_cache[n++]; |
2257 | 175k | } else { |
2258 | 175k | assert(m < pal_sz); |
2259 | 175k | pal[i] = pal[m++]; |
2260 | 175k | } |
2261 | 210k | } |
2262 | 45.3k | } else { |
2263 | 6.63k | memcpy(pal, used_cache, n_used_cache * sizeof(*used_cache)); |
2264 | 6.63k | } |
2265 | | |
2266 | 51.9k | if (DEBUG_BLOCK_INFO) { |
2267 | 0 | printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=", |
2268 | 0 | pl, pal_sz, n_cache, n_used_cache, ts->msac.rng); |
2269 | 0 | for (int n = 0; n < n_cache; n++) |
2270 | 0 | printf("%c%02x", n ? ' ' : '[', cache[n]); |
2271 | 0 | printf("%s, pal=", n_cache ? "]" : "[]"); |
2272 | 0 | for (int n = 0; n < pal_sz; n++) |
2273 | 0 | printf("%c%02x", n ? ' ' : '[', pal[n]); |
2274 | 0 | printf("]\n"); |
2275 | 0 | } |
2276 | 51.9k | } dav1d_read_pal_plane_8bpc Line | Count | Source | 2175 | 27.1k | { | 2176 | 27.1k | Dav1dTileState *const ts = t->ts; | 2177 | 27.1k | const Dav1dFrameContext *const f = t->f; | 2178 | 27.1k | const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac, | 2179 | 27.1k | ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2; | 2180 | 27.1k | pixel cache[16], used_cache[8]; | 2181 | 27.1k | int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4]; | 2182 | 27.1k | int n_cache = 0; | 2183 | | // don't reuse above palette outside SB64 boundaries | 2184 | 27.1k | int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0; | 2185 | 27.1k | const pixel *l = bytefn(t->al_pal)[1][by4][pl]; | 2186 | 27.1k | const pixel *a = bytefn(t->al_pal)[0][bx4][pl]; | 2187 | | | 2188 | | // fill/sort cache | 2189 | 47.2k | while (l_cache && a_cache) { | 2190 | 20.0k | if (*l < *a) { | 2191 | 7.22k | if (!n_cache || cache[n_cache - 1] != *l) | 2192 | 7.14k | cache[n_cache++] = *l; | 2193 | 7.22k | l++; | 2194 | 7.22k | l_cache--; | 2195 | 12.8k | } else { | 2196 | 12.8k | if (*a == *l) { | 2197 | 5.12k | l++; | 2198 | 5.12k | l_cache--; | 2199 | 5.12k | } | 2200 | 12.8k | if (!n_cache || cache[n_cache - 1] != *a) | 2201 | 12.4k | cache[n_cache++] = *a; | 2202 | 12.8k | a++; | 2203 | 12.8k | a_cache--; | 2204 | 12.8k | } | 2205 | 20.0k | } | 2206 | 27.1k | if (l_cache) { | 2207 | 32.7k | do { | 2208 | 32.7k | if (!n_cache || cache[n_cache - 1] != *l) | 2209 | 26.7k | cache[n_cache++] = *l; | 2210 | 32.7k | l++; | 2211 | 32.7k | } while (--l_cache > 0); | 2212 | 19.1k | } else if (a_cache) { | 2213 | 24.3k | do { | 2214 | 24.3k | if (!n_cache || cache[n_cache - 1] != *a) | 2215 | 19.3k | cache[n_cache++] = *a; | 2216 | 24.3k | a++; | 2217 | 24.3k | } while (--a_cache > 0); | 2218 | 6.03k | } | 2219 | | | 2220 | | // find reused cache entries | 2221 | 27.1k | int i = 0; | 2222 | 84.9k | for (int n = 0; n < n_cache && i < pal_sz; n++) | 2223 | 57.7k | if 
(dav1d_msac_decode_bool_equi(&ts->msac)) | 2224 | 28.7k | used_cache[i++] = cache[n]; | 2225 | 27.1k | const int n_used_cache = i; | 2226 | | | 2227 | | // parse new entries | 2228 | 27.1k | pixel *const pal = t->frame_thread.pass ? | 2229 | 27.1k | f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + | 2230 | 27.1k | ((t->bx >> 1) + (t->by & 1))][pl] : | 2231 | 27.1k | bytefn(t->scratch.pal)[pl]; | 2232 | 27.1k | if (i < pal_sz) { | 2233 | 23.5k | const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc; | 2234 | 23.5k | int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, bpc); | 2235 | | | 2236 | 23.5k | if (i < pal_sz) { | 2237 | 21.1k | int bits = bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2); | 2238 | 21.1k | const int max = (1 << bpc) - 1; | 2239 | | | 2240 | 48.7k | do { | 2241 | 48.7k | const int delta = dav1d_msac_decode_bools(&ts->msac, bits); | 2242 | 48.7k | prev = pal[i++] = imin(prev + delta + !pl, max); | 2243 | 48.7k | if (prev + !pl >= max) { | 2244 | 28.2k | for (; i < pal_sz; i++) | 2245 | 18.5k | pal[i] = max; | 2246 | 9.71k | break; | 2247 | 9.71k | } | 2248 | 39.0k | bits = imin(bits, 1 + ulog2(max - prev - !pl)); | 2249 | 39.0k | } while (i < pal_sz); | 2250 | 21.1k | } | 2251 | | | 2252 | | // merge cache+new entries | 2253 | 23.5k | int n = 0, m = n_used_cache; | 2254 | 133k | for (i = 0; i < pal_sz; i++) { | 2255 | 109k | if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) { | 2256 | 19.0k | pal[i] = used_cache[n++]; | 2257 | 90.8k | } else { | 2258 | 90.8k | assert(m < pal_sz); | 2259 | 90.8k | pal[i] = pal[m++]; | 2260 | 90.8k | } | 2261 | 109k | } | 2262 | 23.5k | } else { | 2263 | 3.58k | memcpy(pal, used_cache, n_used_cache * sizeof(*used_cache)); | 2264 | 3.58k | } | 2265 | | | 2266 | 27.1k | if (DEBUG_BLOCK_INFO) { | 2267 | 0 | printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=", | 2268 | 0 | pl, pal_sz, n_cache, n_used_cache, ts->msac.rng); | 2269 | 0 | for (int n = 0; n < 
n_cache; n++) | 2270 | 0 | printf("%c%02x", n ? ' ' : '[', cache[n]); | 2271 | 0 | printf("%s, pal=", n_cache ? "]" : "[]"); | 2272 | 0 | for (int n = 0; n < pal_sz; n++) | 2273 | 0 | printf("%c%02x", n ? ' ' : '[', pal[n]); | 2274 | 0 | printf("]\n"); | 2275 | 0 | } | 2276 | 27.1k | } |
dav1d_read_pal_plane_16bpc Line | Count | Source | 2175 | 24.7k | { | 2176 | 24.7k | Dav1dTileState *const ts = t->ts; | 2177 | 24.7k | const Dav1dFrameContext *const f = t->f; | 2178 | 24.7k | const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac, | 2179 | 24.7k | ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2; | 2180 | 24.7k | pixel cache[16], used_cache[8]; | 2181 | 24.7k | int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4]; | 2182 | 24.7k | int n_cache = 0; | 2183 | | // don't reuse above palette outside SB64 boundaries | 2184 | 24.7k | int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0; | 2185 | 24.7k | const pixel *l = bytefn(t->al_pal)[1][by4][pl]; | 2186 | 24.7k | const pixel *a = bytefn(t->al_pal)[0][bx4][pl]; | 2187 | | | 2188 | | // fill/sort cache | 2189 | 37.2k | while (l_cache && a_cache) { | 2190 | 12.4k | if (*l < *a) { | 2191 | 4.63k | if (!n_cache || cache[n_cache - 1] != *l) | 2192 | 4.61k | cache[n_cache++] = *l; | 2193 | 4.63k | l++; | 2194 | 4.63k | l_cache--; | 2195 | 7.83k | } else { | 2196 | 7.83k | if (*a == *l) { | 2197 | 3.19k | l++; | 2198 | 3.19k | l_cache--; | 2199 | 3.19k | } | 2200 | 7.83k | if (!n_cache || cache[n_cache - 1] != *a) | 2201 | 7.59k | cache[n_cache++] = *a; | 2202 | 7.83k | a++; | 2203 | 7.83k | a_cache--; | 2204 | 7.83k | } | 2205 | 12.4k | } | 2206 | 24.7k | if (l_cache) { | 2207 | 29.4k | do { | 2208 | 29.4k | if (!n_cache || cache[n_cache - 1] != *l) | 2209 | 24.0k | cache[n_cache++] = *l; | 2210 | 29.4k | l++; | 2211 | 29.4k | } while (--l_cache > 0); | 2212 | 17.7k | } else if (a_cache) { | 2213 | 21.6k | do { | 2214 | 21.6k | if (!n_cache || cache[n_cache - 1] != *a) | 2215 | 17.4k | cache[n_cache++] = *a; | 2216 | 21.6k | a++; | 2217 | 21.6k | } while (--a_cache > 0); | 2218 | 5.06k | } | 2219 | | | 2220 | | // find reused cache entries | 2221 | 24.7k | int i = 0; | 2222 | 72.5k | for (int n = 0; n < n_cache && i < pal_sz; n++) | 2223 | 47.7k | if 
(dav1d_msac_decode_bool_equi(&ts->msac)) | 2224 | 23.8k | used_cache[i++] = cache[n]; | 2225 | 24.7k | const int n_used_cache = i; | 2226 | | | 2227 | | // parse new entries | 2228 | 24.7k | pixel *const pal = t->frame_thread.pass ? | 2229 | 24.7k | f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + | 2230 | 24.7k | ((t->bx >> 1) + (t->by & 1))][pl] : | 2231 | 24.7k | bytefn(t->scratch.pal)[pl]; | 2232 | 24.7k | if (i < pal_sz) { | 2233 | 21.7k | const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc; | 2234 | 21.7k | int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, bpc); | 2235 | | | 2236 | 21.7k | if (i < pal_sz) { | 2237 | 19.7k | int bits = bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2); | 2238 | 19.7k | const int max = (1 << bpc) - 1; | 2239 | | | 2240 | 46.6k | do { | 2241 | 46.6k | const int delta = dav1d_msac_decode_bools(&ts->msac, bits); | 2242 | 46.6k | prev = pal[i++] = imin(prev + delta + !pl, max); | 2243 | 46.6k | if (prev + !pl >= max) { | 2244 | 25.5k | for (; i < pal_sz; i++) | 2245 | 16.6k | pal[i] = max; | 2246 | 8.89k | break; | 2247 | 8.89k | } | 2248 | 37.7k | bits = imin(bits, 1 + ulog2(max - prev - !pl)); | 2249 | 37.7k | } while (i < pal_sz); | 2250 | 19.7k | } | 2251 | | | 2252 | | // merge cache+new entries | 2253 | 21.7k | int n = 0, m = n_used_cache; | 2254 | 122k | for (i = 0; i < pal_sz; i++) { | 2255 | 100k | if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) { | 2256 | 15.8k | pal[i] = used_cache[n++]; | 2257 | 85.0k | } else { | 2258 | 85.0k | assert(m < pal_sz); | 2259 | 85.0k | pal[i] = pal[m++]; | 2260 | 85.0k | } | 2261 | 100k | } | 2262 | 21.7k | } else { | 2263 | 3.04k | memcpy(pal, used_cache, n_used_cache * sizeof(*used_cache)); | 2264 | 3.04k | } | 2265 | | | 2266 | 24.7k | if (DEBUG_BLOCK_INFO) { | 2267 | 0 | printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=", | 2268 | 0 | pl, pal_sz, n_cache, n_used_cache, ts->msac.rng); | 2269 | 0 | for (int n = 0; n < 
n_cache; n++) | 2270 | 0 | printf("%c%02x", n ? ' ' : '[', cache[n]); | 2271 | 0 | printf("%s, pal=", n_cache ? "]" : "[]"); | 2272 | 0 | for (int n = 0; n < pal_sz; n++) | 2273 | 0 | printf("%c%02x", n ? ' ' : '[', pal[n]); | 2274 | 0 | printf("]\n"); | 2275 | 0 | } | 2276 | 24.7k | } |
|
2277 | | |
2278 | | void bytefn(dav1d_read_pal_uv)(Dav1dTaskContext *const t, Av1Block *const b, |
2279 | | const int sz_ctx, const int bx4, const int by4) |
2280 | 13.2k | { |
2281 | 13.2k | bytefn(dav1d_read_pal_plane)(t, b, 1, sz_ctx, bx4, by4); |
2282 | | |
2283 | | // V pal coding |
2284 | 13.2k | Dav1dTileState *const ts = t->ts; |
2285 | 13.2k | const Dav1dFrameContext *const f = t->f; |
2286 | 13.2k | pixel *const pal = t->frame_thread.pass ? |
2287 | 13.2k | f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + |
2288 | 13.2k | ((t->bx >> 1) + (t->by & 1))][2] : |
2289 | 13.2k | bytefn(t->scratch.pal)[2]; |
2290 | 13.2k | const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc; |
2291 | 13.2k | if (dav1d_msac_decode_bool_equi(&ts->msac)) { |
2292 | 6.77k | const int bits = bpc - 4 + dav1d_msac_decode_bools(&ts->msac, 2); |
2293 | 6.77k | int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, bpc); |
2294 | 6.77k | const int max = (1 << bpc) - 1; |
2295 | 28.4k | for (int i = 1; i < b->pal_sz[1]; i++) { |
2296 | 21.6k | int delta = dav1d_msac_decode_bools(&ts->msac, bits); |
2297 | 21.6k | if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta; |
2298 | 21.6k | prev = pal[i] = (prev + delta) & max; |
2299 | 21.6k | } |
2300 | 6.77k | } else { |
2301 | 33.1k | for (int i = 0; i < b->pal_sz[1]; i++) |
2302 | 26.6k | pal[i] = dav1d_msac_decode_bools(&ts->msac, bpc); |
2303 | 6.50k | } |
2304 | 13.2k | if (DEBUG_BLOCK_INFO) { |
2305 | 0 | printf("Post-pal[pl=2]: r=%d ", ts->msac.rng); |
2306 | 0 | for (int n = 0; n < b->pal_sz[1]; n++) |
2307 | 0 | printf("%c%02x", n ? ' ' : '[', pal[n]); |
2308 | 0 | printf("]\n"); |
2309 | 0 | } |
2310 | 13.2k | } Line | Count | Source | 2280 | 7.31k | { | 2281 | 7.31k | bytefn(dav1d_read_pal_plane)(t, b, 1, sz_ctx, bx4, by4); | 2282 | | | 2283 | | // V pal coding | 2284 | 7.31k | Dav1dTileState *const ts = t->ts; | 2285 | 7.31k | const Dav1dFrameContext *const f = t->f; | 2286 | 7.31k | pixel *const pal = t->frame_thread.pass ? | 2287 | 7.30k | f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + | 2288 | 7.30k | ((t->bx >> 1) + (t->by & 1))][2] : | 2289 | 7.31k | bytefn(t->scratch.pal)[2]; | 2290 | 7.31k | const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc; | 2291 | 7.31k | if (dav1d_msac_decode_bool_equi(&ts->msac)) { | 2292 | 3.62k | const int bits = bpc - 4 + dav1d_msac_decode_bools(&ts->msac, 2); | 2293 | 3.62k | int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, bpc); | 2294 | 3.62k | const int max = (1 << bpc) - 1; | 2295 | 15.5k | for (int i = 1; i < b->pal_sz[1]; i++) { | 2296 | 11.9k | int delta = dav1d_msac_decode_bools(&ts->msac, bits); | 2297 | 11.9k | if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta; | 2298 | 11.9k | prev = pal[i] = (prev + delta) & max; | 2299 | 11.9k | } | 2300 | 3.68k | } else { | 2301 | 18.9k | for (int i = 0; i < b->pal_sz[1]; i++) | 2302 | 15.2k | pal[i] = dav1d_msac_decode_bools(&ts->msac, bpc); | 2303 | 3.68k | } | 2304 | 7.31k | if (DEBUG_BLOCK_INFO) { | 2305 | 0 | printf("Post-pal[pl=2]: r=%d ", ts->msac.rng); | 2306 | 0 | for (int n = 0; n < b->pal_sz[1]; n++) | 2307 | 0 | printf("%c%02x", n ? ' ' : '[', pal[n]); | 2308 | 0 | printf("]\n"); | 2309 | 0 | } | 2310 | 7.31k | } |
Line | Count | Source | 2280 | 5.97k | { | 2281 | 5.97k | bytefn(dav1d_read_pal_plane)(t, b, 1, sz_ctx, bx4, by4); | 2282 | | | 2283 | | // V pal coding | 2284 | 5.97k | Dav1dTileState *const ts = t->ts; | 2285 | 5.97k | const Dav1dFrameContext *const f = t->f; | 2286 | 5.97k | pixel *const pal = t->frame_thread.pass ? | 2287 | 5.97k | f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + | 2288 | 5.97k | ((t->bx >> 1) + (t->by & 1))][2] : | 2289 | 5.97k | bytefn(t->scratch.pal)[2]; | 2290 | 5.97k | const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc; | 2291 | 5.97k | if (dav1d_msac_decode_bool_equi(&ts->msac)) { | 2292 | 3.14k | const int bits = bpc - 4 + dav1d_msac_decode_bools(&ts->msac, 2); | 2293 | 3.14k | int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, bpc); | 2294 | 3.14k | const int max = (1 << bpc) - 1; | 2295 | 12.9k | for (int i = 1; i < b->pal_sz[1]; i++) { | 2296 | 9.77k | int delta = dav1d_msac_decode_bools(&ts->msac, bits); | 2297 | 9.77k | if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta; | 2298 | 9.77k | prev = pal[i] = (prev + delta) & max; | 2299 | 9.77k | } | 2300 | 3.14k | } else { | 2301 | 14.2k | for (int i = 0; i < b->pal_sz[1]; i++) | 2302 | 11.4k | pal[i] = dav1d_msac_decode_bools(&ts->msac, bpc); | 2303 | 2.82k | } | 2304 | 5.97k | if (DEBUG_BLOCK_INFO) { | 2305 | 0 | printf("Post-pal[pl=2]: r=%d ", ts->msac.rng); | 2306 | 0 | for (int n = 0; n < b->pal_sz[1]; n++) | 2307 | 0 | printf("%c%02x", n ? ' ' : '[', pal[n]); | 2308 | 0 | printf("]\n"); | 2309 | 0 | } | 2310 | 5.97k | } |
|