/src/dav1d/src/lf_apply_tmpl.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright © 2018, VideoLAN and dav1d authors |
3 | | * Copyright © 2018, Two Orioles, LLC |
4 | | * All rights reserved. |
5 | | * |
6 | | * Redistribution and use in source and binary forms, with or without |
7 | | * modification, are permitted provided that the following conditions are met: |
8 | | * |
9 | | * 1. Redistributions of source code must retain the above copyright notice, this |
10 | | * list of conditions and the following disclaimer. |
11 | | * |
12 | | * 2. Redistributions in binary form must reproduce the above copyright notice, |
13 | | * this list of conditions and the following disclaimer in the documentation |
14 | | * and/or other materials provided with the distribution. |
15 | | * |
16 | | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
17 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
18 | | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
19 | | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
20 | | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
21 | | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
22 | | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
23 | | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
24 | | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
25 | | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
26 | | */ |
27 | | |
28 | | #include "config.h" |
29 | | |
30 | | #include <string.h> |
31 | | |
32 | | #include "common/intops.h" |
33 | | |
34 | | #include "src/lf_apply.h" |
35 | | #include "src/lr_apply.h" |
36 | | |
37 | | // The loop filter buffer stores 12 rows of pixels. A superblock block will |
38 | | // contain at most 2 stripes. Each stripe requires 4 rows pixels (2 above |
39 | | // and 2 below) the final 4 rows are used to swap the bottom of the last |
40 | | // stripe with the top of the next super block row. |
41 | | static void backup_lpf(const Dav1dFrameContext *const f, |
42 | | pixel *dst, const ptrdiff_t dst_stride, |
43 | | const pixel *src, const ptrdiff_t src_stride, |
44 | | const int ss_ver, const int sb128, |
45 | | int row, const int row_h, const int src_w, |
46 | | const int h, const int ss_hor, const int lr_backup) |
47 | 1.52M | { |
48 | 1.52M | const int cdef_backup = !lr_backup; |
49 | 1.52M | const int dst_w = f->frame_hdr->super_res.enabled ? |
50 | 1.42M | (f->frame_hdr->width[1] + ss_hor) >> ss_hor : src_w; |
51 | | |
52 | | // The first stripe of the frame is shorter by 8 luma pixel rows. |
53 | 1.52M | int stripe_h = ((64 << (cdef_backup & sb128)) - 8 * !row) >> ss_ver; |
54 | 1.52M | src += (stripe_h - 2) * PXSTRIDE(src_stride); |
55 | | |
56 | 1.52M | if (f->c->n_tc == 1) { |
57 | 100k | if (row) { |
58 | 77.7k | const int top = 4 << sb128; |
59 | | // Copy the top part of the stored loop filtered pixels from the |
60 | | // previous sb row needed above the first stripe of this sb row. |
61 | 77.7k | pixel_copy(&dst[PXSTRIDE(dst_stride) * 0], |
62 | 77.7k | &dst[PXSTRIDE(dst_stride) * top], dst_w); |
63 | 77.7k | pixel_copy(&dst[PXSTRIDE(dst_stride) * 1], |
64 | 77.7k | &dst[PXSTRIDE(dst_stride) * (top + 1)], dst_w); |
65 | 77.7k | pixel_copy(&dst[PXSTRIDE(dst_stride) * 2], |
66 | 77.7k | &dst[PXSTRIDE(dst_stride) * (top + 2)], dst_w); |
67 | 77.7k | pixel_copy(&dst[PXSTRIDE(dst_stride) * 3], |
68 | 77.7k | &dst[PXSTRIDE(dst_stride) * (top + 3)], dst_w); |
69 | 77.7k | } |
70 | 100k | dst += 4 * PXSTRIDE(dst_stride); |
71 | 100k | } |
72 | | |
73 | 1.52M | if (lr_backup && (f->frame_hdr->width[0] != f->frame_hdr->width[1])) { |
74 | 82.8k | while (row + stripe_h <= row_h) { |
75 | 48.6k | const int n_lines = 4 - (row + stripe_h + 1 == h); |
76 | 48.6k | f->dsp->mc.resize(dst, dst_stride, src, src_stride, |
77 | 48.6k | dst_w, n_lines, src_w, f->resize_step[ss_hor], |
78 | 48.6k | f->resize_start[ss_hor] HIGHBD_CALL_SUFFIX); |
79 | 48.6k | row += stripe_h; // unmodified stripe_h for the 1st stripe |
80 | 48.6k | stripe_h = 64 >> ss_ver; |
81 | 48.6k | src += stripe_h * PXSTRIDE(src_stride); |
82 | 48.6k | dst += n_lines * PXSTRIDE(dst_stride); |
83 | 48.6k | if (n_lines == 3) { |
84 | 5.90k | pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], dst_w); |
85 | 5.90k | dst += PXSTRIDE(dst_stride); |
86 | 5.90k | } |
87 | 48.6k | } |
88 | 1.48M | } else { |
89 | 3.03M | while (row + stripe_h <= row_h) { |
90 | 1.54M | const int n_lines = 4 - (row + stripe_h + 1 == h); |
91 | 7.71M | for (int i = 0; i < 4; i++) { |
92 | 6.17M | pixel_copy(dst, i == n_lines ? &dst[-PXSTRIDE(dst_stride)] : |
93 | 6.17M | src, src_w); |
94 | 6.17M | dst += PXSTRIDE(dst_stride); |
95 | 6.17M | src += PXSTRIDE(src_stride); |
96 | 6.17M | } |
97 | 1.54M | row += stripe_h; // unmodified stripe_h for the 1st stripe |
98 | 1.54M | stripe_h = 64 >> ss_ver; |
99 | 1.54M | src += (stripe_h - 4) * PXSTRIDE(src_stride); |
100 | 1.54M | } |
101 | 1.48M | } |
102 | 1.52M | } |
103 | | |
104 | | void bytefn(dav1d_copy_lpf)(Dav1dFrameContext *const f, |
105 | | /*const*/ pixel *const src[3], const int sby) |
106 | 976k | { |
107 | 976k | const int have_tt = f->c->n_tc > 1; |
108 | 976k | const int resize = f->frame_hdr->width[0] != f->frame_hdr->width[1]; |
109 | 976k | const int offset = 8 * !!sby; |
110 | 976k | const ptrdiff_t *const src_stride = f->cur.stride; |
111 | 976k | const ptrdiff_t *const lr_stride = f->sr_cur.p.stride; |
112 | 976k | const int tt_off = have_tt * sby * (4 << f->seq_hdr->sb128); |
113 | 976k | pixel *const dst[3] = { |
114 | 976k | f->lf.lr_lpf_line[0] + tt_off * PXSTRIDE(lr_stride[0]), |
115 | 976k | f->lf.lr_lpf_line[1] + tt_off * PXSTRIDE(lr_stride[1]), |
116 | 976k | f->lf.lr_lpf_line[2] + tt_off * PXSTRIDE(lr_stride[1]) |
117 | 976k | }; |
118 | | |
119 | | // TODO Also check block level restore type to reduce copying. |
120 | 976k | const int restore_planes = f->lf.restore_planes; |
121 | | |
122 | 976k | if (f->seq_hdr->cdef || restore_planes & LR_RESTORE_Y) { |
123 | 967k | const int h = f->cur.p.h; |
124 | 967k | const int w = f->bw << 2; |
125 | 967k | const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h - 1); |
126 | 967k | const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset; |
127 | 967k | if (restore_planes & LR_RESTORE_Y || !resize) |
128 | 939k | backup_lpf(f, dst[0], lr_stride[0], |
129 | 939k | src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0], |
130 | 939k | 0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0, 1); |
131 | 967k | if (have_tt && resize) { |
132 | 35.4k | const ptrdiff_t cdef_off_y = sby * 4 * PXSTRIDE(src_stride[0]); |
133 | 35.4k | backup_lpf(f, f->lf.cdef_lpf_line[0] + cdef_off_y, src_stride[0], |
134 | 35.4k | src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0], |
135 | 35.4k | 0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0, 0); |
136 | 35.4k | } |
137 | 967k | } |
138 | 976k | if ((f->seq_hdr->cdef || restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) && |
139 | 976k | f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) |
140 | 278k | { |
141 | 278k | const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420; |
142 | 278k | const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444; |
143 | 278k | const int h = (f->cur.p.h + ss_ver) >> ss_ver; |
144 | 278k | const int w = f->bw << (2 - ss_hor); |
145 | 278k | const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h - 1); |
146 | 278k | const int offset_uv = offset >> ss_ver; |
147 | 278k | const int y_stripe = (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv; |
148 | 278k | const ptrdiff_t cdef_off_uv = sby * 4 * PXSTRIDE(src_stride[1]); |
149 | 278k | if (f->seq_hdr->cdef || restore_planes & LR_RESTORE_U) { |
150 | 267k | if (restore_planes & LR_RESTORE_U || !resize) |
151 | 259k | backup_lpf(f, dst[1], lr_stride[1], |
152 | 259k | src[1] - offset_uv * PXSTRIDE(src_stride[1]), |
153 | 259k | src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe, |
154 | 259k | row_h, w, h, ss_hor, 1); |
155 | 267k | if (have_tt && resize) |
156 | 10.4k | backup_lpf(f, f->lf.cdef_lpf_line[1] + cdef_off_uv, src_stride[1], |
157 | 10.4k | src[1] - offset_uv * PXSTRIDE(src_stride[1]), |
158 | 10.4k | src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe, |
159 | 10.4k | row_h, w, h, ss_hor, 0); |
160 | 267k | } |
161 | 278k | if (f->seq_hdr->cdef || restore_planes & LR_RESTORE_V) { |
162 | 275k | if (restore_planes & LR_RESTORE_V || !resize) |
163 | 266k | backup_lpf(f, dst[2], lr_stride[1], |
164 | 266k | src[2] - offset_uv * PXSTRIDE(src_stride[1]), |
165 | 266k | src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe, |
166 | 266k | row_h, w, h, ss_hor, 1); |
167 | 275k | if (have_tt && resize) |
168 | 10.8k | backup_lpf(f, f->lf.cdef_lpf_line[2] + cdef_off_uv, src_stride[1], |
169 | 10.8k | src[2] - offset_uv * PXSTRIDE(src_stride[1]), |
170 | 10.8k | src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe, |
171 | 10.8k | row_h, w, h, ss_hor, 0); |
172 | 275k | } |
173 | 278k | } |
174 | 976k | } Line | Count | Source | 106 | 188k | { | 107 | 188k | const int have_tt = f->c->n_tc > 1; | 108 | 188k | const int resize = f->frame_hdr->width[0] != f->frame_hdr->width[1]; | 109 | 188k | const int offset = 8 * !!sby; | 110 | 188k | const ptrdiff_t *const src_stride = f->cur.stride; | 111 | 188k | const ptrdiff_t *const lr_stride = f->sr_cur.p.stride; | 112 | 188k | const int tt_off = have_tt * sby * (4 << f->seq_hdr->sb128); | 113 | 188k | pixel *const dst[3] = { | 114 | 188k | f->lf.lr_lpf_line[0] + tt_off * PXSTRIDE(lr_stride[0]), | 115 | 188k | f->lf.lr_lpf_line[1] + tt_off * PXSTRIDE(lr_stride[1]), | 116 | 188k | f->lf.lr_lpf_line[2] + tt_off * PXSTRIDE(lr_stride[1]) | 117 | 188k | }; | 118 | | | 119 | | // TODO Also check block level restore type to reduce copying. | 120 | 188k | const int restore_planes = f->lf.restore_planes; | 121 | | | 122 | 188k | if (f->seq_hdr->cdef || restore_planes & LR_RESTORE_Y) { | 123 | 183k | const int h = f->cur.p.h; | 124 | 183k | const int w = f->bw << 2; | 125 | 183k | const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h - 1); | 126 | 183k | const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset; | 127 | 183k | if (restore_planes & LR_RESTORE_Y || !resize) | 128 | 179k | backup_lpf(f, dst[0], lr_stride[0], | 129 | 179k | src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0], | 130 | 179k | 0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0, 1); | 131 | 183k | if (have_tt && resize) { | 132 | 5.16k | const ptrdiff_t cdef_off_y = sby * 4 * PXSTRIDE(src_stride[0]); | 133 | 5.16k | backup_lpf(f, f->lf.cdef_lpf_line[0] + cdef_off_y, src_stride[0], | 134 | 5.16k | src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0], | 135 | 5.16k | 0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0, 0); | 136 | 5.16k | } | 137 | 183k | } | 138 | 188k | if ((f->seq_hdr->cdef || restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) && | 139 | 188k | f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) | 140 | 53.1k | { | 141 | 
53.1k | const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 142 | 53.1k | const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444; | 143 | 53.1k | const int h = (f->cur.p.h + ss_ver) >> ss_ver; | 144 | 53.1k | const int w = f->bw << (2 - ss_hor); | 145 | 53.1k | const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h - 1); | 146 | 53.1k | const int offset_uv = offset >> ss_ver; | 147 | 53.1k | const int y_stripe = (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv; | 148 | 53.1k | const ptrdiff_t cdef_off_uv = sby * 4 * PXSTRIDE(src_stride[1]); | 149 | 53.1k | if (f->seq_hdr->cdef || restore_planes & LR_RESTORE_U) { | 150 | 48.2k | if (restore_planes & LR_RESTORE_U || !resize) | 151 | 44.4k | backup_lpf(f, dst[1], lr_stride[1], | 152 | 44.4k | src[1] - offset_uv * PXSTRIDE(src_stride[1]), | 153 | 44.4k | src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe, | 154 | 44.4k | row_h, w, h, ss_hor, 1); | 155 | 48.2k | if (have_tt && resize) | 156 | 4.38k | backup_lpf(f, f->lf.cdef_lpf_line[1] + cdef_off_uv, src_stride[1], | 157 | 4.38k | src[1] - offset_uv * PXSTRIDE(src_stride[1]), | 158 | 4.38k | src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe, | 159 | 4.38k | row_h, w, h, ss_hor, 0); | 160 | 48.2k | } | 161 | 53.1k | if (f->seq_hdr->cdef || restore_planes & LR_RESTORE_V) { | 162 | 51.5k | if (restore_planes & LR_RESTORE_V || !resize) | 163 | 47.7k | backup_lpf(f, dst[2], lr_stride[1], | 164 | 47.7k | src[2] - offset_uv * PXSTRIDE(src_stride[1]), | 165 | 47.7k | src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe, | 166 | 47.7k | row_h, w, h, ss_hor, 1); | 167 | 51.5k | if (have_tt && resize) | 168 | 4.87k | backup_lpf(f, f->lf.cdef_lpf_line[2] + cdef_off_uv, src_stride[1], | 169 | 4.87k | src[2] - offset_uv * PXSTRIDE(src_stride[1]), | 170 | 4.87k | src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe, | 171 | 4.87k | row_h, w, h, ss_hor, 0); | 172 | 51.5k | } | 173 | 53.1k | } | 174 | 188k | } |
Line | Count | Source | 106 | 788k | { | 107 | 788k | const int have_tt = f->c->n_tc > 1; | 108 | 788k | const int resize = f->frame_hdr->width[0] != f->frame_hdr->width[1]; | 109 | 788k | const int offset = 8 * !!sby; | 110 | 788k | const ptrdiff_t *const src_stride = f->cur.stride; | 111 | 788k | const ptrdiff_t *const lr_stride = f->sr_cur.p.stride; | 112 | 788k | const int tt_off = have_tt * sby * (4 << f->seq_hdr->sb128); | 113 | 788k | pixel *const dst[3] = { | 114 | 788k | f->lf.lr_lpf_line[0] + tt_off * PXSTRIDE(lr_stride[0]), | 115 | 788k | f->lf.lr_lpf_line[1] + tt_off * PXSTRIDE(lr_stride[1]), | 116 | 788k | f->lf.lr_lpf_line[2] + tt_off * PXSTRIDE(lr_stride[1]) | 117 | 788k | }; | 118 | | | 119 | | // TODO Also check block level restore type to reduce copying. | 120 | 788k | const int restore_planes = f->lf.restore_planes; | 121 | | | 122 | 788k | if (f->seq_hdr->cdef || restore_planes & LR_RESTORE_Y) { | 123 | 783k | const int h = f->cur.p.h; | 124 | 783k | const int w = f->bw << 2; | 125 | 783k | const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h - 1); | 126 | 783k | const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset; | 127 | 783k | if (restore_planes & LR_RESTORE_Y || !resize) | 128 | 759k | backup_lpf(f, dst[0], lr_stride[0], | 129 | 759k | src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0], | 130 | 759k | 0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0, 1); | 131 | 783k | if (have_tt && resize) { | 132 | 30.2k | const ptrdiff_t cdef_off_y = sby * 4 * PXSTRIDE(src_stride[0]); | 133 | 30.2k | backup_lpf(f, f->lf.cdef_lpf_line[0] + cdef_off_y, src_stride[0], | 134 | 30.2k | src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0], | 135 | 30.2k | 0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0, 0); | 136 | 30.2k | } | 137 | 783k | } | 138 | 788k | if ((f->seq_hdr->cdef || restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) && | 139 | 788k | f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) | 140 | 224k | { | 141 | 224k | const int 
ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 142 | 224k | const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444; | 143 | 224k | const int h = (f->cur.p.h + ss_ver) >> ss_ver; | 144 | 224k | const int w = f->bw << (2 - ss_hor); | 145 | 224k | const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h - 1); | 146 | 224k | const int offset_uv = offset >> ss_ver; | 147 | 224k | const int y_stripe = (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv; | 148 | 224k | const ptrdiff_t cdef_off_uv = sby * 4 * PXSTRIDE(src_stride[1]); | 149 | 224k | if (f->seq_hdr->cdef || restore_planes & LR_RESTORE_U) { | 150 | 219k | if (restore_planes & LR_RESTORE_U || !resize) | 151 | 215k | backup_lpf(f, dst[1], lr_stride[1], | 152 | 215k | src[1] - offset_uv * PXSTRIDE(src_stride[1]), | 153 | 215k | src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe, | 154 | 215k | row_h, w, h, ss_hor, 1); | 155 | 219k | if (have_tt && resize) | 156 | 6.05k | backup_lpf(f, f->lf.cdef_lpf_line[1] + cdef_off_uv, src_stride[1], | 157 | 6.05k | src[1] - offset_uv * PXSTRIDE(src_stride[1]), | 158 | 6.05k | src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe, | 159 | 6.05k | row_h, w, h, ss_hor, 0); | 160 | 219k | } | 161 | 224k | if (f->seq_hdr->cdef || restore_planes & LR_RESTORE_V) { | 162 | 224k | if (restore_planes & LR_RESTORE_V || !resize) | 163 | 219k | backup_lpf(f, dst[2], lr_stride[1], | 164 | 219k | src[2] - offset_uv * PXSTRIDE(src_stride[1]), | 165 | 219k | src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe, | 166 | 219k | row_h, w, h, ss_hor, 1); | 167 | 224k | if (have_tt && resize) | 168 | 6.00k | backup_lpf(f, f->lf.cdef_lpf_line[2] + cdef_off_uv, src_stride[1], | 169 | 6.00k | src[2] - offset_uv * PXSTRIDE(src_stride[1]), | 170 | 6.00k | src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe, | 171 | 6.00k | row_h, w, h, ss_hor, 0); | 172 | 224k | } | 173 | 224k | } | 174 | 788k | } |
|
175 | | |
176 | | static inline void filter_plane_cols_y(const Dav1dFrameContext *const f, |
177 | | const int have_left, |
178 | | const uint8_t (*lvl)[4], |
179 | | const ptrdiff_t b4_stride, |
180 | | const uint16_t (*const mask)[3][2], |
181 | | pixel *dst, const ptrdiff_t ls, |
182 | | const int w, |
183 | | const int starty4, const int endy4) |
184 | 834k | { |
185 | 834k | const Dav1dDSPContext *const dsp = f->dsp; |
186 | | |
187 | | // filter edges between columns (e.g. block1 | block2) |
188 | 21.7M | for (int x = 0; x < w; x++) { |
189 | 20.8M | if (!have_left && !x) continue; |
190 | 20.1M | uint32_t hmask[4]; |
191 | 20.1M | if (!starty4) { |
192 | 18.4M | hmask[0] = mask[x][0][0]; |
193 | 18.4M | hmask[1] = mask[x][1][0]; |
194 | 18.4M | hmask[2] = mask[x][2][0]; |
195 | 18.4M | if (endy4 > 16) { |
196 | 16.3M | hmask[0] |= (unsigned) mask[x][0][1] << 16; |
197 | 16.3M | hmask[1] |= (unsigned) mask[x][1][1] << 16; |
198 | 16.3M | hmask[2] |= (unsigned) mask[x][2][1] << 16; |
199 | 16.3M | } |
200 | 18.4M | } else { |
201 | 1.71M | hmask[0] = mask[x][0][1]; |
202 | 1.71M | hmask[1] = mask[x][1][1]; |
203 | 1.71M | hmask[2] = mask[x][2][1]; |
204 | 1.71M | } |
205 | 20.1M | hmask[3] = 0; |
206 | 20.1M | dsp->lf.loop_filter_sb[0][0](&dst[x * 4], ls, hmask, |
207 | 20.1M | (const uint8_t(*)[4]) &lvl[x][0], b4_stride, |
208 | 20.1M | &f->lf.lim_lut, endy4 - starty4 HIGHBD_CALL_SUFFIX); |
209 | 20.1M | } |
210 | 834k | } |
211 | | |
212 | | static inline void filter_plane_rows_y(const Dav1dFrameContext *const f, |
213 | | const int have_top, |
214 | | const uint8_t (*lvl)[4], |
215 | | const ptrdiff_t b4_stride, |
216 | | const uint16_t (*const mask)[3][2], |
217 | | pixel *dst, const ptrdiff_t ls, |
218 | | const int w, |
219 | | const int starty4, const int endy4) |
220 | 834k | { |
221 | 834k | const Dav1dDSPContext *const dsp = f->dsp; |
222 | | |
223 | | // block1 |
224 | | // filter edges between rows (e.g. ------) |
225 | | // block2 |
226 | 24.4M | for (int y = starty4; y < endy4; |
227 | 23.5M | y++, dst += 4 * PXSTRIDE(ls), lvl += b4_stride) |
228 | 23.5M | { |
229 | 23.5M | if (!have_top && !y) continue; |
230 | 23.5M | const uint32_t vmask[4] = { |
231 | 23.5M | mask[y][0][0] | ((unsigned) mask[y][0][1] << 16), |
232 | 23.5M | mask[y][1][0] | ((unsigned) mask[y][1][1] << 16), |
233 | 23.5M | mask[y][2][0] | ((unsigned) mask[y][2][1] << 16), |
234 | 23.5M | 0, |
235 | 23.5M | }; |
236 | 23.5M | dsp->lf.loop_filter_sb[0][1](dst, ls, vmask, |
237 | 23.5M | (const uint8_t(*)[4]) &lvl[0][1], b4_stride, |
238 | 23.5M | &f->lf.lim_lut, w HIGHBD_CALL_SUFFIX); |
239 | 23.5M | } |
240 | 834k | } |
241 | | |
242 | | static inline void filter_plane_cols_uv(const Dav1dFrameContext *const f, |
243 | | const int have_left, |
244 | | const uint8_t (*lvl)[4], |
245 | | const ptrdiff_t b4_stride, |
246 | | const uint16_t (*const mask)[2][2], |
247 | | pixel *const u, pixel *const v, |
248 | | const ptrdiff_t ls, const int w, |
249 | | const int starty4, const int endy4, |
250 | | const int ss_ver) |
251 | 161k | { |
252 | 161k | const Dav1dDSPContext *const dsp = f->dsp; |
253 | | |
254 | | // filter edges between columns (e.g. block1 | block2) |
255 | 2.45M | for (int x = 0; x < w; x++) { |
256 | 2.28M | if (!have_left && !x) continue; |
257 | 2.17M | uint32_t hmask[3]; |
258 | 2.17M | if (!starty4) { |
259 | 2.08M | hmask[0] = mask[x][0][0]; |
260 | 2.08M | hmask[1] = mask[x][1][0]; |
261 | 2.08M | if (endy4 > (16 >> ss_ver)) { |
262 | 1.87M | hmask[0] |= (unsigned) mask[x][0][1] << (16 >> ss_ver); |
263 | 1.87M | hmask[1] |= (unsigned) mask[x][1][1] << (16 >> ss_ver); |
264 | 1.87M | } |
265 | 2.08M | } else { |
266 | 94.8k | hmask[0] = mask[x][0][1]; |
267 | 94.8k | hmask[1] = mask[x][1][1]; |
268 | 94.8k | } |
269 | 2.17M | hmask[2] = 0; |
270 | 2.17M | dsp->lf.loop_filter_sb[1][0](&u[x * 4], ls, hmask, |
271 | 2.17M | (const uint8_t(*)[4]) &lvl[x][2], b4_stride, |
272 | 2.17M | &f->lf.lim_lut, endy4 - starty4 HIGHBD_CALL_SUFFIX); |
273 | 2.17M | dsp->lf.loop_filter_sb[1][0](&v[x * 4], ls, hmask, |
274 | 2.17M | (const uint8_t(*)[4]) &lvl[x][3], b4_stride, |
275 | 2.17M | &f->lf.lim_lut, endy4 - starty4 HIGHBD_CALL_SUFFIX); |
276 | 2.17M | } |
277 | 161k | } |
278 | | |
279 | | static inline void filter_plane_rows_uv(const Dav1dFrameContext *const f, |
280 | | const int have_top, |
281 | | const uint8_t (*lvl)[4], |
282 | | const ptrdiff_t b4_stride, |
283 | | const uint16_t (*const mask)[2][2], |
284 | | pixel *const u, pixel *const v, |
285 | | const ptrdiff_t ls, const int w, |
286 | | const int starty4, const int endy4, |
287 | | const int ss_hor) |
288 | 161k | { |
289 | 161k | const Dav1dDSPContext *const dsp = f->dsp; |
290 | 161k | ptrdiff_t off_l = 0; |
291 | | |
292 | | // block1 |
293 | | // filter edges between rows (e.g. ------) |
294 | | // block2 |
295 | 2.75M | for (int y = starty4; y < endy4; |
296 | 2.59M | y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride) |
297 | 2.59M | { |
298 | 2.59M | if (!have_top && !y) continue; |
299 | 2.56M | const uint32_t vmask[3] = { |
300 | 2.56M | mask[y][0][0] | ((unsigned) mask[y][0][1] << (16 >> ss_hor)), |
301 | 2.56M | mask[y][1][0] | ((unsigned) mask[y][1][1] << (16 >> ss_hor)), |
302 | 2.56M | 0, |
303 | 2.56M | }; |
304 | 2.56M | dsp->lf.loop_filter_sb[1][1](&u[off_l], ls, vmask, |
305 | 2.56M | (const uint8_t(*)[4]) &lvl[0][2], b4_stride, |
306 | 2.56M | &f->lf.lim_lut, w HIGHBD_CALL_SUFFIX); |
307 | 2.56M | dsp->lf.loop_filter_sb[1][1](&v[off_l], ls, vmask, |
308 | 2.56M | (const uint8_t(*)[4]) &lvl[0][3], b4_stride, |
309 | 2.56M | &f->lf.lim_lut, w HIGHBD_CALL_SUFFIX); |
310 | 2.56M | } |
311 | 161k | } |
312 | | |
313 | | void bytefn(dav1d_loopfilter_sbrow_cols)(const Dav1dFrameContext *const f, |
314 | | pixel *const p[3], Av1Filter *const lflvl, |
315 | | int sby, const int start_of_tile_row) |
316 | 758k | { |
317 | 758k | int x, have_left; |
318 | | // Don't filter outside the frame |
319 | 758k | const int is_sb64 = !f->seq_hdr->sb128; |
320 | 758k | const int starty4 = (sby & is_sb64) << 4; |
321 | 758k | const int sbsz = 32 >> is_sb64; |
322 | 758k | const int sbl2 = 5 - is_sb64; |
323 | 758k | const int halign = (f->bh + 31) & ~31; |
324 | 758k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; |
325 | 758k | const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; |
326 | 758k | const int vmask = 16 >> ss_ver, hmask = 16 >> ss_hor; |
327 | 758k | const unsigned vmax = 1U << vmask, hmax = 1U << hmask; |
328 | 758k | const unsigned endy4 = starty4 + imin(f->h4 - sby * sbsz, sbsz); |
329 | 758k | const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver; |
330 | | |
331 | | // fix lpf strength at tile col boundaries |
332 | 758k | const uint8_t *lpf_y = &f->lf.tx_lpf_right_edge[0][sby << sbl2]; |
333 | 758k | const uint8_t *lpf_uv = &f->lf.tx_lpf_right_edge[1][sby << (sbl2 - ss_ver)]; |
334 | 798k | for (int tile_col = 1;; tile_col++) { |
335 | 798k | x = f->frame_hdr->tiling.col_start_sb[tile_col]; |
336 | 798k | if ((x << sbl2) >= f->bw) break; |
337 | 39.7k | const int bx4 = x & is_sb64 ? 16 : 0, cbx4 = bx4 >> ss_hor; |
338 | 39.7k | x >>= is_sb64; |
339 | | |
340 | 39.7k | uint16_t (*const y_hmask)[2] = lflvl[x].filter_y[0][bx4]; |
341 | 734k | for (unsigned y = starty4, mask = 1 << y; y < endy4; y++, mask <<= 1) { |
342 | 694k | const int sidx = mask >= 0x10000U; |
343 | 694k | const unsigned smask = mask >> (sidx << 4); |
344 | 694k | const int idx = 2 * !!(y_hmask[2][sidx] & smask) + |
345 | 694k | !!(y_hmask[1][sidx] & smask); |
346 | 694k | y_hmask[2][sidx] &= ~smask; |
347 | 694k | y_hmask[1][sidx] &= ~smask; |
348 | 694k | y_hmask[0][sidx] &= ~smask; |
349 | 694k | y_hmask[imin(idx, lpf_y[y - starty4])][sidx] |= smask; |
350 | 694k | } |
351 | | |
352 | 39.7k | if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) { |
353 | 5.24k | uint16_t (*const uv_hmask)[2] = lflvl[x].filter_uv[0][cbx4]; |
354 | 67.8k | for (unsigned y = starty4 >> ss_ver, uv_mask = 1 << y; y < uv_endy4; |
355 | 62.6k | y++, uv_mask <<= 1) |
356 | 62.6k | { |
357 | 62.6k | const int sidx = uv_mask >= vmax; |
358 | 62.6k | const unsigned smask = uv_mask >> (sidx << (4 - ss_ver)); |
359 | 62.6k | const int idx = !!(uv_hmask[1][sidx] & smask); |
360 | 62.6k | uv_hmask[1][sidx] &= ~smask; |
361 | 62.6k | uv_hmask[0][sidx] &= ~smask; |
362 | 62.6k | uv_hmask[imin(idx, lpf_uv[y - (starty4 >> ss_ver)])][sidx] |= smask; |
363 | 62.6k | } |
364 | 5.24k | } |
365 | 39.7k | lpf_y += halign; |
366 | 39.7k | lpf_uv += halign >> ss_ver; |
367 | 39.7k | } |
368 | | |
369 | | // fix lpf strength at tile row boundaries |
370 | 758k | if (start_of_tile_row) { |
371 | 2.40k | const BlockContext *a; |
372 | 2.40k | for (x = 0, a = &f->a[f->sb128w * (start_of_tile_row - 1)]; |
373 | 9.44k | x < f->sb128w; x++, a++) |
374 | 7.04k | { |
375 | 7.04k | uint16_t (*const y_vmask)[2] = lflvl[x].filter_y[1][starty4]; |
376 | 7.04k | const unsigned w = imin(32, f->w4 - (x << 5)); |
377 | 213k | for (unsigned mask = 1, i = 0; i < w; mask <<= 1, i++) { |
378 | 206k | const int sidx = mask >= 0x10000U; |
379 | 206k | const unsigned smask = mask >> (sidx << 4); |
380 | 206k | const int idx = 2 * !!(y_vmask[2][sidx] & smask) + |
381 | 206k | !!(y_vmask[1][sidx] & smask); |
382 | 206k | y_vmask[2][sidx] &= ~smask; |
383 | 206k | y_vmask[1][sidx] &= ~smask; |
384 | 206k | y_vmask[0][sidx] &= ~smask; |
385 | 206k | y_vmask[imin(idx, a->tx_lpf_y[i])][sidx] |= smask; |
386 | 206k | } |
387 | | |
388 | 7.04k | if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) { |
389 | 5.03k | const unsigned cw = (w + ss_hor) >> ss_hor; |
390 | 5.03k | uint16_t (*const uv_vmask)[2] = lflvl[x].filter_uv[1][starty4 >> ss_ver]; |
391 | 89.8k | for (unsigned uv_mask = 1, i = 0; i < cw; uv_mask <<= 1, i++) { |
392 | 84.8k | const int sidx = uv_mask >= hmax; |
393 | 84.8k | const unsigned smask = uv_mask >> (sidx << (4 - ss_hor)); |
394 | 84.8k | const int idx = !!(uv_vmask[1][sidx] & smask); |
395 | 84.8k | uv_vmask[1][sidx] &= ~smask; |
396 | 84.8k | uv_vmask[0][sidx] &= ~smask; |
397 | 84.8k | uv_vmask[imin(idx, a->tx_lpf_uv[i])][sidx] |= smask; |
398 | 84.8k | } |
399 | 5.03k | } |
400 | 7.04k | } |
401 | 2.40k | } |
402 | | |
403 | 758k | pixel *ptr; |
404 | 758k | uint8_t (*level_ptr)[4] = f->lf.level + f->b4_stride * sby * sbsz; |
405 | 1.59M | for (ptr = p[0], have_left = 0, x = 0; x < f->sb128w; |
406 | 834k | x++, have_left = 1, ptr += 128, level_ptr += 32) |
407 | 834k | { |
408 | 834k | filter_plane_cols_y(f, have_left, level_ptr, f->b4_stride, |
409 | 834k | lflvl[x].filter_y[0], ptr, f->cur.stride[0], |
410 | 834k | imin(32, f->w4 - x * 32), starty4, endy4); |
411 | 834k | } |
412 | | |
413 | 758k | if (!f->frame_hdr->loopfilter.level_u && !f->frame_hdr->loopfilter.level_v) |
414 | 644k | return; |
415 | | |
416 | 114k | ptrdiff_t uv_off; |
417 | 114k | level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver); |
418 | 276k | for (uv_off = 0, have_left = 0, x = 0; x < f->sb128w; |
419 | 161k | x++, have_left = 1, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor) |
420 | 161k | { |
421 | 161k | filter_plane_cols_uv(f, have_left, level_ptr, f->b4_stride, |
422 | 161k | lflvl[x].filter_uv[0], |
423 | 161k | &p[1][uv_off], &p[2][uv_off], f->cur.stride[1], |
424 | 161k | (imin(32, f->w4 - x * 32) + ss_hor) >> ss_hor, |
425 | 161k | starty4 >> ss_ver, uv_endy4, ss_ver); |
426 | 161k | } |
427 | 114k | } dav1d_loopfilter_sbrow_cols_8bpc Line | Count | Source | 316 | 156k | { | 317 | 156k | int x, have_left; | 318 | | // Don't filter outside the frame | 319 | 156k | const int is_sb64 = !f->seq_hdr->sb128; | 320 | 156k | const int starty4 = (sby & is_sb64) << 4; | 321 | 156k | const int sbsz = 32 >> is_sb64; | 322 | 156k | const int sbl2 = 5 - is_sb64; | 323 | 156k | const int halign = (f->bh + 31) & ~31; | 324 | 156k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 325 | 156k | const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; | 326 | 156k | const int vmask = 16 >> ss_ver, hmask = 16 >> ss_hor; | 327 | 156k | const unsigned vmax = 1U << vmask, hmax = 1U << hmask; | 328 | 156k | const unsigned endy4 = starty4 + imin(f->h4 - sby * sbsz, sbsz); | 329 | 156k | const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver; | 330 | | | 331 | | // fix lpf strength at tile col boundaries | 332 | 156k | const uint8_t *lpf_y = &f->lf.tx_lpf_right_edge[0][sby << sbl2]; | 333 | 156k | const uint8_t *lpf_uv = &f->lf.tx_lpf_right_edge[1][sby << (sbl2 - ss_ver)]; | 334 | 191k | for (int tile_col = 1;; tile_col++) { | 335 | 191k | x = f->frame_hdr->tiling.col_start_sb[tile_col]; | 336 | 191k | if ((x << sbl2) >= f->bw) break; | 337 | 35.7k | const int bx4 = x & is_sb64 ? 
16 : 0, cbx4 = bx4 >> ss_hor; | 338 | 35.7k | x >>= is_sb64; | 339 | | | 340 | 35.7k | uint16_t (*const y_hmask)[2] = lflvl[x].filter_y[0][bx4]; | 341 | 644k | for (unsigned y = starty4, mask = 1 << y; y < endy4; y++, mask <<= 1) { | 342 | 608k | const int sidx = mask >= 0x10000U; | 343 | 608k | const unsigned smask = mask >> (sidx << 4); | 344 | 608k | const int idx = 2 * !!(y_hmask[2][sidx] & smask) + | 345 | 608k | !!(y_hmask[1][sidx] & smask); | 346 | 608k | y_hmask[2][sidx] &= ~smask; | 347 | 608k | y_hmask[1][sidx] &= ~smask; | 348 | 608k | y_hmask[0][sidx] &= ~smask; | 349 | 608k | y_hmask[imin(idx, lpf_y[y - starty4])][sidx] |= smask; | 350 | 608k | } | 351 | | | 352 | 35.7k | if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) { | 353 | 4.02k | uint16_t (*const uv_hmask)[2] = lflvl[x].filter_uv[0][cbx4]; | 354 | 46.5k | for (unsigned y = starty4 >> ss_ver, uv_mask = 1 << y; y < uv_endy4; | 355 | 42.5k | y++, uv_mask <<= 1) | 356 | 42.5k | { | 357 | 42.5k | const int sidx = uv_mask >= vmax; | 358 | 42.5k | const unsigned smask = uv_mask >> (sidx << (4 - ss_ver)); | 359 | 42.5k | const int idx = !!(uv_hmask[1][sidx] & smask); | 360 | 42.5k | uv_hmask[1][sidx] &= ~smask; | 361 | 42.5k | uv_hmask[0][sidx] &= ~smask; | 362 | 42.5k | uv_hmask[imin(idx, lpf_uv[y - (starty4 >> ss_ver)])][sidx] |= smask; | 363 | 42.5k | } | 364 | 4.02k | } | 365 | 35.7k | lpf_y += halign; | 366 | 35.7k | lpf_uv += halign >> ss_ver; | 367 | 35.7k | } | 368 | | | 369 | | // fix lpf strength at tile row boundaries | 370 | 156k | if (start_of_tile_row) { | 371 | 994 | const BlockContext *a; | 372 | 994 | for (x = 0, a = &f->a[f->sb128w * (start_of_tile_row - 1)]; | 373 | 4.51k | x < f->sb128w; x++, a++) | 374 | 3.51k | { | 375 | 3.51k | uint16_t (*const y_vmask)[2] = lflvl[x].filter_y[1][starty4]; | 376 | 3.51k | const unsigned w = imin(32, f->w4 - (x << 5)); | 377 | 107k | for (unsigned mask = 1, i = 0; i < w; mask <<= 1, i++) { | 378 | 103k | const int sidx = mask >= 0x10000U; | 379 | 
103k | const unsigned smask = mask >> (sidx << 4); | 380 | 103k | const int idx = 2 * !!(y_vmask[2][sidx] & smask) + | 381 | 103k | !!(y_vmask[1][sidx] & smask); | 382 | 103k | y_vmask[2][sidx] &= ~smask; | 383 | 103k | y_vmask[1][sidx] &= ~smask; | 384 | 103k | y_vmask[0][sidx] &= ~smask; | 385 | 103k | y_vmask[imin(idx, a->tx_lpf_y[i])][sidx] |= smask; | 386 | 103k | } | 387 | | | 388 | 3.51k | if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) { | 389 | 2.28k | const unsigned cw = (w + ss_hor) >> ss_hor; | 390 | 2.28k | uint16_t (*const uv_vmask)[2] = lflvl[x].filter_uv[1][starty4 >> ss_ver]; | 391 | 39.8k | for (unsigned uv_mask = 1, i = 0; i < cw; uv_mask <<= 1, i++) { | 392 | 37.5k | const int sidx = uv_mask >= hmax; | 393 | 37.5k | const unsigned smask = uv_mask >> (sidx << (4 - ss_hor)); | 394 | 37.5k | const int idx = !!(uv_vmask[1][sidx] & smask); | 395 | 37.5k | uv_vmask[1][sidx] &= ~smask; | 396 | 37.5k | uv_vmask[0][sidx] &= ~smask; | 397 | 37.5k | uv_vmask[imin(idx, a->tx_lpf_uv[i])][sidx] |= smask; | 398 | 37.5k | } | 399 | 2.28k | } | 400 | 3.51k | } | 401 | 994 | } | 402 | | | 403 | 156k | pixel *ptr; | 404 | 156k | uint8_t (*level_ptr)[4] = f->lf.level + f->b4_stride * sby * sbsz; | 405 | 354k | for (ptr = p[0], have_left = 0, x = 0; x < f->sb128w; | 406 | 198k | x++, have_left = 1, ptr += 128, level_ptr += 32) | 407 | 198k | { | 408 | 198k | filter_plane_cols_y(f, have_left, level_ptr, f->b4_stride, | 409 | 198k | lflvl[x].filter_y[0], ptr, f->cur.stride[0], | 410 | 198k | imin(32, f->w4 - x * 32), starty4, endy4); | 411 | 198k | } | 412 | | | 413 | 156k | if (!f->frame_hdr->loopfilter.level_u && !f->frame_hdr->loopfilter.level_v) | 414 | 108k | return; | 415 | | | 416 | 47.1k | ptrdiff_t uv_off; | 417 | 47.1k | level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver); | 418 | 122k | for (uv_off = 0, have_left = 0, x = 0; x < f->sb128w; | 419 | 75.6k | x++, have_left = 1, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor) | 420 | 75.6k | { | 
421 | 75.6k | filter_plane_cols_uv(f, have_left, level_ptr, f->b4_stride, | 422 | 75.6k | lflvl[x].filter_uv[0], | 423 | 75.6k | &p[1][uv_off], &p[2][uv_off], f->cur.stride[1], | 424 | 75.6k | (imin(32, f->w4 - x * 32) + ss_hor) >> ss_hor, | 425 | 75.6k | starty4 >> ss_ver, uv_endy4, ss_ver); | 426 | 75.6k | } | 427 | 47.1k | } |
dav1d_loopfilter_sbrow_cols_16bpc Line | Count | Source | 316 | 602k | { | 317 | 602k | int x, have_left; | 318 | | // Don't filter outside the frame | 319 | 602k | const int is_sb64 = !f->seq_hdr->sb128; | 320 | 602k | const int starty4 = (sby & is_sb64) << 4; | 321 | 602k | const int sbsz = 32 >> is_sb64; | 322 | 602k | const int sbl2 = 5 - is_sb64; | 323 | 602k | const int halign = (f->bh + 31) & ~31; | 324 | 602k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 325 | 602k | const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; | 326 | 602k | const int vmask = 16 >> ss_ver, hmask = 16 >> ss_hor; | 327 | 602k | const unsigned vmax = 1U << vmask, hmax = 1U << hmask; | 328 | 602k | const unsigned endy4 = starty4 + imin(f->h4 - sby * sbsz, sbsz); | 329 | 602k | const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver; | 330 | | | 331 | | // fix lpf strength at tile col boundaries | 332 | 602k | const uint8_t *lpf_y = &f->lf.tx_lpf_right_edge[0][sby << sbl2]; | 333 | 602k | const uint8_t *lpf_uv = &f->lf.tx_lpf_right_edge[1][sby << (sbl2 - ss_ver)]; | 334 | 606k | for (int tile_col = 1;; tile_col++) { | 335 | 606k | x = f->frame_hdr->tiling.col_start_sb[tile_col]; | 336 | 606k | if ((x << sbl2) >= f->bw) break; | 337 | 3.98k | const int bx4 = x & is_sb64 ? 
16 : 0, cbx4 = bx4 >> ss_hor; | 338 | 3.98k | x >>= is_sb64; | 339 | | | 340 | 3.98k | uint16_t (*const y_hmask)[2] = lflvl[x].filter_y[0][bx4]; | 341 | 89.6k | for (unsigned y = starty4, mask = 1 << y; y < endy4; y++, mask <<= 1) { | 342 | 85.6k | const int sidx = mask >= 0x10000U; | 343 | 85.6k | const unsigned smask = mask >> (sidx << 4); | 344 | 85.6k | const int idx = 2 * !!(y_hmask[2][sidx] & smask) + | 345 | 85.6k | !!(y_hmask[1][sidx] & smask); | 346 | 85.6k | y_hmask[2][sidx] &= ~smask; | 347 | 85.6k | y_hmask[1][sidx] &= ~smask; | 348 | 85.6k | y_hmask[0][sidx] &= ~smask; | 349 | 85.6k | y_hmask[imin(idx, lpf_y[y - starty4])][sidx] |= smask; | 350 | 85.6k | } | 351 | | | 352 | 3.98k | if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) { | 353 | 1.22k | uint16_t (*const uv_hmask)[2] = lflvl[x].filter_uv[0][cbx4]; | 354 | 21.3k | for (unsigned y = starty4 >> ss_ver, uv_mask = 1 << y; y < uv_endy4; | 355 | 20.0k | y++, uv_mask <<= 1) | 356 | 20.0k | { | 357 | 20.0k | const int sidx = uv_mask >= vmax; | 358 | 20.0k | const unsigned smask = uv_mask >> (sidx << (4 - ss_ver)); | 359 | 20.0k | const int idx = !!(uv_hmask[1][sidx] & smask); | 360 | 20.0k | uv_hmask[1][sidx] &= ~smask; | 361 | 20.0k | uv_hmask[0][sidx] &= ~smask; | 362 | 20.0k | uv_hmask[imin(idx, lpf_uv[y - (starty4 >> ss_ver)])][sidx] |= smask; | 363 | 20.0k | } | 364 | 1.22k | } | 365 | 3.98k | lpf_y += halign; | 366 | 3.98k | lpf_uv += halign >> ss_ver; | 367 | 3.98k | } | 368 | | | 369 | | // fix lpf strength at tile row boundaries | 370 | 602k | if (start_of_tile_row) { | 371 | 1.41k | const BlockContext *a; | 372 | 1.41k | for (x = 0, a = &f->a[f->sb128w * (start_of_tile_row - 1)]; | 373 | 4.93k | x < f->sb128w; x++, a++) | 374 | 3.52k | { | 375 | 3.52k | uint16_t (*const y_vmask)[2] = lflvl[x].filter_y[1][starty4]; | 376 | 3.52k | const unsigned w = imin(32, f->w4 - (x << 5)); | 377 | 106k | for (unsigned mask = 1, i = 0; i < w; mask <<= 1, i++) { | 378 | 102k | const int sidx = mask >= 
0x10000U; | 379 | 102k | const unsigned smask = mask >> (sidx << 4); | 380 | 102k | const int idx = 2 * !!(y_vmask[2][sidx] & smask) + | 381 | 102k | !!(y_vmask[1][sidx] & smask); | 382 | 102k | y_vmask[2][sidx] &= ~smask; | 383 | 102k | y_vmask[1][sidx] &= ~smask; | 384 | 102k | y_vmask[0][sidx] &= ~smask; | 385 | 102k | y_vmask[imin(idx, a->tx_lpf_y[i])][sidx] |= smask; | 386 | 102k | } | 387 | | | 388 | 3.52k | if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) { | 389 | 2.74k | const unsigned cw = (w + ss_hor) >> ss_hor; | 390 | 2.74k | uint16_t (*const uv_vmask)[2] = lflvl[x].filter_uv[1][starty4 >> ss_ver]; | 391 | 50.0k | for (unsigned uv_mask = 1, i = 0; i < cw; uv_mask <<= 1, i++) { | 392 | 47.2k | const int sidx = uv_mask >= hmax; | 393 | 47.2k | const unsigned smask = uv_mask >> (sidx << (4 - ss_hor)); | 394 | 47.2k | const int idx = !!(uv_vmask[1][sidx] & smask); | 395 | 47.2k | uv_vmask[1][sidx] &= ~smask; | 396 | 47.2k | uv_vmask[0][sidx] &= ~smask; | 397 | 47.2k | uv_vmask[imin(idx, a->tx_lpf_uv[i])][sidx] |= smask; | 398 | 47.2k | } | 399 | 2.74k | } | 400 | 3.52k | } | 401 | 1.41k | } | 402 | | | 403 | 602k | pixel *ptr; | 404 | 602k | uint8_t (*level_ptr)[4] = f->lf.level + f->b4_stride * sby * sbsz; | 405 | 1.23M | for (ptr = p[0], have_left = 0, x = 0; x < f->sb128w; | 406 | 636k | x++, have_left = 1, ptr += 128, level_ptr += 32) | 407 | 636k | { | 408 | 636k | filter_plane_cols_y(f, have_left, level_ptr, f->b4_stride, | 409 | 636k | lflvl[x].filter_y[0], ptr, f->cur.stride[0], | 410 | 636k | imin(32, f->w4 - x * 32), starty4, endy4); | 411 | 636k | } | 412 | | | 413 | 602k | if (!f->frame_hdr->loopfilter.level_u && !f->frame_hdr->loopfilter.level_v) | 414 | 535k | return; | 415 | | | 416 | 67.7k | ptrdiff_t uv_off; | 417 | 67.7k | level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver); | 418 | 153k | for (uv_off = 0, have_left = 0, x = 0; x < f->sb128w; | 419 | 85.6k | x++, have_left = 1, uv_off += 128 >> ss_hor, level_ptr += 32 >> 
ss_hor) | 420 | 85.6k | { | 421 | 85.6k | filter_plane_cols_uv(f, have_left, level_ptr, f->b4_stride, | 422 | 85.6k | lflvl[x].filter_uv[0], | 423 | 85.6k | &p[1][uv_off], &p[2][uv_off], f->cur.stride[1], | 424 | 85.6k | (imin(32, f->w4 - x * 32) + ss_hor) >> ss_hor, | 425 | 85.6k | starty4 >> ss_ver, uv_endy4, ss_ver); | 426 | 85.6k | } | 427 | 67.7k | } |
|
428 | | /* Row pass for the superblock row selected by sby: calls filter_plane_rows_y on p[0] and, unless both chroma filter levels are zero, filter_plane_rows_uv on p[1]/p[2], once per lflvl[] entry (f->sb128w entries). */ |
429 | | void bytefn(dav1d_loopfilter_sbrow_rows)(const Dav1dFrameContext *const f, |
430 | | pixel *const p[3], Av1Filter *const lflvl, |
431 | | int sby) |
432 | 758k | { |
433 | 758k | int x; |
434 | | // Don't filter outside the frame |
435 | 758k | const int have_top = sby > 0; /* false only for the first row (sby == 0) */ |
436 | 758k | const int is_sb64 = !f->seq_hdr->sb128; |
437 | 758k | const int starty4 = (sby & is_sb64) << 4; /* 16 when is_sb64 is set and sby is odd, otherwise 0 */ |
438 | 758k | const int sbsz = 32 >> is_sb64; /* 32, or 16 when is_sb64 is set */ |
439 | 758k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; /* 1 only for I420 */ |
440 | 758k | const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; /* 1 for every layout except I444 */ |
441 | 758k | const unsigned endy4 = starty4 + imin(f->h4 - sby * sbsz, sbsz); /* at most sbsz rows, fewer when f->h4 ends inside this row */ |
442 | 758k | const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver; /* endy4 shifted by ss_ver, rounding up */ |
443 | | |
444 | 758k | pixel *ptr; |
445 | 758k | uint8_t (*level_ptr)[4] = f->lf.level + f->b4_stride * sby * sbsz; /* skip sby * sbsz rows of b4_stride entries in f->lf.level */ |
446 | 1.59M | for (ptr = p[0], x = 0; x < f->sb128w; x++, ptr += 128, level_ptr += 32) { /* luma: each step advances 128 pixels and 32 level entries (presumably one 128-pixel column of 4-pixel units - confirm) */ |
447 | 834k | filter_plane_rows_y(f, have_top, level_ptr, f->b4_stride, |
448 | 834k | lflvl[x].filter_y[1], ptr, f->cur.stride[0], |
449 | 834k | imin(32, f->w4 - x * 32), starty4, endy4); /* width argument is capped at 32 and shrinks where f->w4 ends */ |
450 | 834k | } |
451 | | |
452 | 758k | if (!f->frame_hdr->loopfilter.level_u && !f->frame_hdr->loopfilter.level_v) /* skip the chroma planes when both U and V levels are zero */ |
453 | 643k | return; |
454 | | |
455 | 114k | ptrdiff_t uv_off; |
456 | 114k | level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver); /* same start computation as luma, with the row index shifted by ss_ver */ |
457 | 276k | for (uv_off = 0, x = 0; x < f->sb128w; |
458 | 161k | x++, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor) /* per-step advances are the luma ones shifted by ss_hor */ |
459 | 161k | { |
460 | 161k | filter_plane_rows_uv(f, have_top, level_ptr, f->b4_stride, |
461 | 161k | lflvl[x].filter_uv[1], |
462 | 161k | &p[1][uv_off], &p[2][uv_off], f->cur.stride[1], /* both chroma planes share uv_off and f->cur.stride[1] */ |
463 | 161k | (imin(32, f->w4 - x * 32) + ss_hor) >> ss_hor, /* luma width argument shifted by ss_hor, rounding up */ |
464 | 161k | starty4 >> ss_ver, uv_endy4, ss_hor); /* NOTE(review): last argument is ss_hor here, while the column pass hands ss_ver to filter_plane_cols_uv - callee not visible, confirm intended */ |
465 | 161k | } |
466 | 114k | } dav1d_loopfilter_sbrow_rows_8bpc Line | Count | Source | 432 | 156k | { | 433 | 156k | int x; | 434 | | // Don't filter outside the frame | 435 | 156k | const int have_top = sby > 0; | 436 | 156k | const int is_sb64 = !f->seq_hdr->sb128; | 437 | 156k | const int starty4 = (sby & is_sb64) << 4; | 438 | 156k | const int sbsz = 32 >> is_sb64; | 439 | 156k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 440 | 156k | const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; | 441 | 156k | const unsigned endy4 = starty4 + imin(f->h4 - sby * sbsz, sbsz); | 442 | 156k | const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver; | 443 | | | 444 | 156k | pixel *ptr; | 445 | 156k | uint8_t (*level_ptr)[4] = f->lf.level + f->b4_stride * sby * sbsz; | 446 | 354k | for (ptr = p[0], x = 0; x < f->sb128w; x++, ptr += 128, level_ptr += 32) { | 447 | 198k | filter_plane_rows_y(f, have_top, level_ptr, f->b4_stride, | 448 | 198k | lflvl[x].filter_y[1], ptr, f->cur.stride[0], | 449 | 198k | imin(32, f->w4 - x * 32), starty4, endy4); | 450 | 198k | } | 451 | | | 452 | 156k | if (!f->frame_hdr->loopfilter.level_u && !f->frame_hdr->loopfilter.level_v) | 453 | 108k | return; | 454 | | | 455 | 47.1k | ptrdiff_t uv_off; | 456 | 47.1k | level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver); | 457 | 122k | for (uv_off = 0, x = 0; x < f->sb128w; | 458 | 75.5k | x++, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor) | 459 | 75.5k | { | 460 | 75.5k | filter_plane_rows_uv(f, have_top, level_ptr, f->b4_stride, | 461 | 75.5k | lflvl[x].filter_uv[1], | 462 | 75.5k | &p[1][uv_off], &p[2][uv_off], f->cur.stride[1], | 463 | 75.5k | (imin(32, f->w4 - x * 32) + ss_hor) >> ss_hor, | 464 | 75.5k | starty4 >> ss_ver, uv_endy4, ss_hor); | 465 | 75.5k | } | 466 | 47.1k | } |
dav1d_loopfilter_sbrow_rows_16bpc Line | Count | Source | 432 | 602k | { | 433 | 602k | int x; | 434 | | // Don't filter outside the frame | 435 | 602k | const int have_top = sby > 0; | 436 | 602k | const int is_sb64 = !f->seq_hdr->sb128; | 437 | 602k | const int starty4 = (sby & is_sb64) << 4; | 438 | 602k | const int sbsz = 32 >> is_sb64; | 439 | 602k | const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; | 440 | 602k | const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; | 441 | 602k | const unsigned endy4 = starty4 + imin(f->h4 - sby * sbsz, sbsz); | 442 | 602k | const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver; | 443 | | | 444 | 602k | pixel *ptr; | 445 | 602k | uint8_t (*level_ptr)[4] = f->lf.level + f->b4_stride * sby * sbsz; | 446 | 1.23M | for (ptr = p[0], x = 0; x < f->sb128w; x++, ptr += 128, level_ptr += 32) { | 447 | 636k | filter_plane_rows_y(f, have_top, level_ptr, f->b4_stride, | 448 | 636k | lflvl[x].filter_y[1], ptr, f->cur.stride[0], | 449 | 636k | imin(32, f->w4 - x * 32), starty4, endy4); | 450 | 636k | } | 451 | | | 452 | 602k | if (!f->frame_hdr->loopfilter.level_u && !f->frame_hdr->loopfilter.level_v) | 453 | 534k | return; | 454 | | | 455 | 67.7k | ptrdiff_t uv_off; | 456 | 67.7k | level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver); | 457 | 153k | for (uv_off = 0, x = 0; x < f->sb128w; | 458 | 85.6k | x++, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor) | 459 | 85.6k | { | 460 | 85.6k | filter_plane_rows_uv(f, have_top, level_ptr, f->b4_stride, | 461 | 85.6k | lflvl[x].filter_uv[1], | 462 | 85.6k | &p[1][uv_off], &p[2][uv_off], f->cur.stride[1], | 463 | 85.6k | (imin(32, f->w4 - x * 32) + ss_hor) >> ss_hor, | 464 | 85.6k | starty4 >> ss_ver, uv_endy4, ss_hor); | 465 | 85.6k | } | 466 | 67.7k | } |
|