/work/svt-av1/Source/Lib/Codec/cdef.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2016, Alliance for Open Media. All rights reserved |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license. |
10 | | */ |
11 | | |
12 | | #include "cdef.h" |
13 | | #include "common_dsp_rtcd.h" |
14 | | #include "bitstream_unit.h" |
15 | | |
16 | 0 | static INLINE int32_t sign(int32_t i) { |
17 | 0 | return i < 0 ? -1 : 1; |
18 | 0 | } |
19 | | |
20 | 0 | static INLINE int32_t constrain(int32_t diff, int32_t threshold, int32_t damping) { |
21 | 0 | if (!threshold) { |
22 | 0 | return 0; |
23 | 0 | } |
24 | | |
25 | 0 | const int32_t shift = AOMMAX(0, damping - get_msb(threshold)); |
26 | 0 | return sign(diff) * AOMMIN(abs(diff), AOMMAX(0, threshold - (abs(diff) >> shift))); |
27 | 0 | } |
28 | | |
29 | | /* |
30 | | This is Cdef_Directions (section 7.15.3) with 2 padding entries at the |
31 | | beginning and end of the table. The cdef direction range is [0, 7] and the |
32 | | first index is offset +/-2. This removes the need to constrain the first |
33 | | index to the same range using e.g., & 7. |
34 | | */ |
35 | | DECLARE_ALIGNED(16, const int, eb_cdef_directions_padded[12][2]) = { |
36 | | /* Padding: svt_aom_eb_cdef_directions[6] */ |
37 | | {1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0}, |
38 | | /* Padding: svt_aom_eb_cdef_directions[7] */ |
39 | | {1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1}, |
40 | | |
41 | | /* Begin svt_aom_eb_cdef_directions */ |
42 | | {-1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2}, |
43 | | {0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2}, |
44 | | {0 * CDEF_BSTRIDE + 1, 0 * CDEF_BSTRIDE + 2}, |
45 | | {0 * CDEF_BSTRIDE + 1, 1 * CDEF_BSTRIDE + 2}, |
46 | | {1 * CDEF_BSTRIDE + 1, 2 * CDEF_BSTRIDE + 2}, |
47 | | {1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 1}, |
48 | | {1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0}, |
49 | | {1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1}, |
50 | | /* End svt_aom_eb_cdef_directions */ |
51 | | |
52 | | /* Padding: svt_aom_eb_cdef_directions[0] */ |
53 | | {-1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2}, |
54 | | /* Padding: svt_aom_eb_cdef_directions[1] */ |
55 | | {0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2}, |
56 | | }; |
57 | | |
58 | | const int (*const svt_aom_eb_cdef_directions)[2] = eb_cdef_directions_padded + 2; |
59 | | |
60 | | /* Compute the primary filter strength for an 8x8 block based on the |
61 | | directional variance difference. A high variance difference means |
62 | | that we have a highly directional pattern (e.g. a high contrast |
63 | | edge), so we can apply more deringing. A low variance means that we |
64 | | either have a low contrast edge, or a non-directional texture, so |
65 | | we want to be careful not to blur. */ |
66 | 0 | static INLINE int32_t adjust_strength(int32_t strength, int32_t var) { |
67 | 0 | const int32_t i = (var >> 6) ? AOMMIN(get_msb(var >> 6), 12) : 0; |
68 | | /* We use the variance of 8x8 blocks to adjust the strength. */ |
69 | 0 | return var ? (strength * (4 + i) + 8) >> 4 : 0; |
70 | 0 | } |
71 | | |
/*
 * Widen a rectangle of 8-bit samples into a 16-bit buffer.
 *
 * dst / dstride: destination buffer and its row pitch, in uint16_t elements.
 * src / sstride: source buffer and its row pitch, in bytes.
 * v, h:          number of rows and number of columns to copy.
 *
 * Nothing is written when v or h is not positive.
 */
void svt_aom_copy_rect8_8bit_to_16bit_c(uint16_t* dst, int32_t dstride, const uint8_t* src, int32_t sstride, int32_t v,
                                        int32_t h) {
    for (int32_t row = 0; row < v; row++) {
        const int32_t dst_base = row * dstride;
        const int32_t src_base = row * sstride;
        for (int32_t col = 0; col < h; col++) {
            dst[dst_base + col] = src[src_base + col];
        }
    }
}
80 | | |
/* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
   The search minimizes the weighted variance along all the lines in a
   particular direction, i.e. the squared error between the input and a
   "predicted" block where each pixel is replaced by the average along a line
   in a particular direction. Since each direction has the same sum(x^2) term,
   that term is never computed. See Section 2, step 2, of:
   http://jmvalin.ca/notes/intra_paint.pdf

   img:         top-left sample of the 8x8 block to analyse.
   stride:      row pitch of img, in uint16_t elements.
   var:         receives (best cost - cost of the orthogonal direction) >> 10.
   coeff_shift: right shift applied to every sample before 128 is subtracted.

   Returns the index (0..7) of the first direction with the largest cost;
   0 is returned when every cost is 0. */
uint8_t svt_aom_cdef_find_dir_c(const uint16_t* img, int32_t stride, int32_t* var, int32_t coeff_shift) {
    /* Instead of dividing by n between 2 and 8, we multiply by 3*5*7*8/n.
       The output is then 840 times larger, but we don't care for finding
       the max. */
    static const int32_t div_table[] = {0, 840, 420, 280, 210, 168, 140, 120, 105};
    int32_t              cost[8]        = {0};
    int32_t              partial[8][15] = {{0}};

    /* Accumulate, for each of the 8 directions, the sum of the samples lying on each line. */
    for (int32_t row = 0; row < 8; row++) {
        const uint16_t* line = img + row * stride;
        for (int32_t col = 0; col < 8; col++) {
            /* We subtract 128 here to reduce the maximum range of the squared
               partial sums. */
            const int32_t x = (line[col] >> coeff_shift) - 128;
            partial[0][row + col] += x;
            partial[1][row + col / 2] += x;
            partial[2][row] += x;
            partial[3][3 + row - col / 2] += x;
            partial[4][7 + row - col] += x;
            partial[5][3 - row / 2 + col] += x;
            partial[6][col] += x;
            partial[7][row / 2 + col] += x;
        }
    }

    /* Directions 2 and 6: eight lines of eight samples each. */
    for (int32_t n = 0; n < 8; n++) {
        const int32_t h_sum = partial[2][n];
        const int32_t v_sum = partial[6][n];
        cost[2] += h_sum * h_sum;
        cost[6] += v_sum * v_sum;
    }
    cost[2] *= div_table[8];
    cost[6] *= div_table[8];

    /* Directions 0 and 4: lines n and 14 - n both hold n + 1 samples, the middle line holds 8. */
    for (int32_t n = 0; n < 7; n++) {
        const int32_t weight = div_table[n + 1];
        const int32_t a0     = partial[0][n];
        const int32_t b0     = partial[0][14 - n];
        const int32_t a4     = partial[4][n];
        const int32_t b4     = partial[4][14 - n];
        cost[0] += (a0 * a0 + b0 * b0) * weight;
        cost[4] += (a4 * a4 + b4 * b4) * weight;
    }
    cost[0] += partial[0][7] * partial[0][7] * div_table[8];
    cost[4] += partial[4][7] * partial[4][7] * div_table[8];

    /* Odd directions: lines 3..7 hold 8 samples, lines n and 10 - n hold 2 * n + 2 samples. */
    for (int32_t d = 1; d < 8; d += 2) {
        for (int32_t n = 3; n <= 7; n++) {
            cost[d] += partial[d][n] * partial[d][n];
        }
        cost[d] *= div_table[8];
        for (int32_t n = 0; n < 3; n++) {
            const int32_t lo = partial[d][n];
            const int32_t hi = partial[d][10 - n];
            cost[d] += (lo * lo + hi * hi) * div_table[2 * n + 2];
        }
    }

    /* Keep the first direction that has the strictly largest cost. */
    int32_t best_cost = 0;
    uint8_t best_dir  = 0;
    for (int32_t d = 0; d < 8; d++) {
        if (cost[d] > best_cost) {
            best_cost = cost[d];
            best_dir  = (uint8_t)d;
        }
    }

    /* Difference between the optimal variance and the variance along the
       orthogonal direction. Again, the sum(x^2) terms cancel out.
       We'd normally divide by 840, but dividing by 1024 is close enough
       for what we're going to do with this. */
    *var = (best_cost - cost[(best_dir + 4) & 7]) >> 10;
    return best_dir;
}
151 | | |
/*
 * Run the direction search on two 8x8 blocks that share the same stride.
 *
 * The direction and variance of img1 go to out1 / var1, those of img2 to
 * out2 / var2. The first block is fully processed before the second.
 */
void svt_aom_cdef_find_dir_dual_c(const uint16_t* img1, const uint16_t* img2, int stride, int32_t* var1, int32_t* var2,
                                  int32_t coeff_shift, uint8_t* out1, uint8_t* out2) {
    const uint8_t first_dir = svt_aom_cdef_find_dir_c(img1, stride, var1, coeff_shift);
    *out1                   = first_dir;

    const uint8_t second_dir = svt_aom_cdef_find_dir_c(img2, stride, var2, coeff_shift);
    *out2                    = second_dir;
}
157 | | |
158 | | static AOM_INLINE void cdef_find_dir(uint16_t* in, CdefList* dlist, int32_t var[CDEF_NBLOCKS][CDEF_NBLOCKS], |
159 | 0 | int32_t cdef_count, int32_t coeff_shift, uint8_t dir[CDEF_NBLOCKS][CDEF_NBLOCKS]) { |
160 | 0 | int bi; |
161 | | |
162 | | // Find direction of two 8x8 blocks together. |
163 | 0 | for (bi = 0; bi < cdef_count - 1; bi += 2) { |
164 | 0 | const uint8_t by = dlist[bi].by; |
165 | 0 | const uint8_t bx = dlist[bi].bx; |
166 | 0 | const uint8_t by2 = dlist[bi + 1].by; |
167 | 0 | const uint8_t bx2 = dlist[bi + 1].bx; |
168 | 0 | const int pos1 = 8 * by * CDEF_BSTRIDE + 8 * bx; |
169 | 0 | const int pos2 = 8 * by2 * CDEF_BSTRIDE + 8 * bx2; |
170 | 0 | svt_aom_cdef_find_dir_dual(&in[pos1], |
171 | 0 | &in[pos2], |
172 | 0 | CDEF_BSTRIDE, |
173 | 0 | &var[by][bx], |
174 | 0 | &var[by2][bx2], |
175 | 0 | coeff_shift, |
176 | 0 | &dir[by][bx], |
177 | 0 | &dir[by2][bx2]); |
178 | 0 | } |
179 | | |
180 | | // Process remaining 8x8 blocks here. One 8x8 at a time. |
181 | 0 | if (cdef_count % 2) { |
182 | 0 | const uint8_t by = dlist[bi].by; |
183 | 0 | const uint8_t bx = dlist[bi].bx; |
184 | 0 | dir[by][bx] = svt_aom_cdef_find_dir( |
185 | 0 | &in[8 * by * CDEF_BSTRIDE + 8 * bx], CDEF_BSTRIDE, &var[by][bx], coeff_shift); |
186 | 0 | } |
187 | 0 | } |
188 | | |
/*
 * Tap weights for the primary and secondary CDEF filters.
 *
 * The first index is the tap set, selected in this file as
 * (pri_strength >> coeff_shift) & 1; the second index is the tap number k
 * (0 for the nearer sample, 1 for the farther one). Both secondary sets hold
 * the same weights.
 */
const int32_t svt_aom_eb_cdef_pri_taps[2][2] = {
    {4, 2},
    {3, 3},
};
const int32_t svt_aom_eb_cdef_sec_taps[2][2] = {
    {2, 1},
    {2, 1},
};
191 | | |
192 | | /* Smooth in the direction detected. */ |
193 | | void svt_cdef_filter_block_c(uint8_t* dst8, uint16_t* dst16, int32_t dstride, const uint16_t* in, int32_t pri_strength, |
194 | | int32_t sec_strength, int32_t dir, int32_t pri_damping, int32_t sec_damping, int32_t bsize, |
195 | 0 | int32_t coeff_shift, uint8_t subsampling_factor) { |
196 | 0 | int32_t i, j, k; |
197 | 0 | const int32_t s = CDEF_BSTRIDE; |
198 | 0 | const int32_t* pri_taps = svt_aom_eb_cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; |
199 | 0 | const int32_t* sec_taps = svt_aom_eb_cdef_sec_taps[(pri_strength >> coeff_shift) & 1]; |
200 | |
|
201 | 0 | for (i = 0; i < (4 << (int32_t)(bsize == BLOCK_8X8 || bsize == BLOCK_4X8)); i += subsampling_factor) { |
202 | 0 | for (j = 0; j < (4 << (int32_t)(bsize == BLOCK_8X8 || bsize == BLOCK_8X4)); j++) { |
203 | 0 | int16_t sum = 0; |
204 | 0 | int16_t y; |
205 | 0 | int16_t x = in[i * s + j]; |
206 | 0 | int32_t max = x; |
207 | 0 | int32_t min = x; |
208 | 0 | for (k = 0; k < 2; k++) { |
209 | 0 | int16_t p0 = in[i * s + j + svt_aom_eb_cdef_directions[dir][k]]; |
210 | 0 | int16_t p1 = in[i * s + j - svt_aom_eb_cdef_directions[dir][k]]; |
211 | 0 | sum += (int16_t)(pri_taps[k] * constrain(p0 - x, pri_strength, pri_damping)); |
212 | 0 | sum += (int16_t)(pri_taps[k] * constrain(p1 - x, pri_strength, pri_damping)); |
213 | 0 | if (p0 != CDEF_VERY_LARGE) { |
214 | 0 | max = AOMMAX(p0, max); |
215 | 0 | } |
216 | 0 | if (p1 != CDEF_VERY_LARGE) { |
217 | 0 | max = AOMMAX(p1, max); |
218 | 0 | } |
219 | 0 | min = AOMMIN(p0, min); |
220 | 0 | min = AOMMIN(p1, min); |
221 | 0 | int16_t s0 = in[i * s + j + svt_aom_eb_cdef_directions[(dir + 2)][k]]; |
222 | 0 | int16_t s1 = in[i * s + j - svt_aom_eb_cdef_directions[(dir + 2)][k]]; |
223 | 0 | int16_t s2 = in[i * s + j + svt_aom_eb_cdef_directions[(dir - 2)][k]]; |
224 | 0 | int16_t s3 = in[i * s + j - svt_aom_eb_cdef_directions[(dir - 2)][k]]; |
225 | 0 | if (s0 != CDEF_VERY_LARGE) { |
226 | 0 | max = AOMMAX(s0, max); |
227 | 0 | } |
228 | 0 | if (s1 != CDEF_VERY_LARGE) { |
229 | 0 | max = AOMMAX(s1, max); |
230 | 0 | } |
231 | 0 | if (s2 != CDEF_VERY_LARGE) { |
232 | 0 | max = AOMMAX(s2, max); |
233 | 0 | } |
234 | 0 | if (s3 != CDEF_VERY_LARGE) { |
235 | 0 | max = AOMMAX(s3, max); |
236 | 0 | } |
237 | 0 | min = AOMMIN(s0, min); |
238 | 0 | min = AOMMIN(s1, min); |
239 | 0 | min = AOMMIN(s2, min); |
240 | 0 | min = AOMMIN(s3, min); |
241 | 0 | sum += (int16_t)(sec_taps[k] * constrain(s0 - x, sec_strength, sec_damping)); |
242 | 0 | sum += (int16_t)(sec_taps[k] * constrain(s1 - x, sec_strength, sec_damping)); |
243 | 0 | sum += (int16_t)(sec_taps[k] * constrain(s2 - x, sec_strength, sec_damping)); |
244 | 0 | sum += (int16_t)(sec_taps[k] * constrain(s3 - x, sec_strength, sec_damping)); |
245 | 0 | } |
246 | 0 | y = (int16_t)clamp((int16_t)x + ((8 + sum - (sum < 0)) >> 4), min, max); |
247 | 0 | if (dst8) { |
248 | 0 | dst8[i * dstride + j] = (uint8_t)y; |
249 | 0 | } else { |
250 | 0 | dst16[i * dstride + j] = (uint16_t)y; |
251 | 0 | } |
252 | 0 | } |
253 | 0 | } |
254 | 0 | } |
255 | | |
/*
 * Copy a vsize x hsize rectangle of samples into the 16-bit buffer dst.
 *
 * The rectangle starts at row src_voffset, column src_hoffset of src.
 * When is_16bit is set, src is reinterpreted as uint16_t samples (offsets and
 * sstride are then counted in 16-bit samples) and rows are copied as-is.
 * Otherwise src holds 8-bit samples that are widened through
 * svt_aom_copy_rect8_8bit_to_16bit().
 */
void svt_aom_copy_sb8_16(uint16_t* dst, int32_t dstride, const uint8_t* src, int32_t src_voffset, int32_t src_hoffset,
                         int32_t sstride, int32_t vsize, int32_t hsize, bool is_16bit) {
    if (!is_16bit) {
        const uint8_t* first_px = &src[src_voffset * sstride + src_hoffset];
        svt_aom_copy_rect8_8bit_to_16bit(dst, dstride, first_px, sstride, vsize, hsize);
        return;
    }

    const uint16_t* src_row = ((uint16_t*)src) + (src_voffset * sstride + src_hoffset);
    for (int r = 0; r < vsize; r++, dst += dstride, src_row += sstride) {
        /* Two bytes per 16-bit sample. */
        svt_memcpy(dst, src_row, 2 * hsize);
    }
}
270 | | |
271 | | /* |
272 | | * Loop over the non-skip 8x8 blocks. For each block, find the CDEF direction, then apply the specified filter. |
273 | | */ |
274 | | void svt_cdef_filter_fb(uint8_t* dst8, uint16_t* dst16, int32_t dstride, uint16_t* in, int32_t xdec, int32_t ydec, |
275 | | uint8_t dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int32_t* dirinit, |
276 | | int32_t var[CDEF_NBLOCKS][CDEF_NBLOCKS], int32_t pli, CdefList* dlist, int32_t cdef_count, |
277 | | int32_t level, int32_t sec_strength, int32_t pri_damping, int32_t sec_damping, |
278 | 0 | int32_t coeff_shift, uint8_t subsampling_factor) { |
279 | 0 | int32_t bi; |
280 | 0 | int32_t pri_strength = level << coeff_shift; |
281 | 0 | sec_strength <<= coeff_shift; |
282 | 0 | sec_damping += coeff_shift - (pli != PLANE_Y); |
283 | 0 | pri_damping += coeff_shift - (pli != PLANE_Y); |
284 | |
|
285 | 0 | int32_t bsize = ydec ? (xdec ? BLOCK_4X4 : BLOCK_8X4) : (xdec ? BLOCK_4X8 : BLOCK_8X8); |
286 | 0 | int32_t bsizex = 3 - xdec; |
287 | 0 | int32_t bsizey = 3 - ydec; |
288 | |
|
289 | 0 | if (!dstride && pri_strength == 0 && sec_strength == 0) { |
290 | | // If we're here, both primary and secondary strengths are 0, and |
291 | | // we still haven't written anything to y[] yet, so we just copy |
292 | | // the input to y[]. This is necessary only for svt_av1_cdef_search() |
293 | | // and only svt_av1_cdef_search() sets dirinit. |
294 | 0 | for (bi = 0; bi < cdef_count; bi++) { |
295 | 0 | int32_t by = dlist[bi].by << bsizey; |
296 | 0 | int32_t bx = dlist[bi].bx << bsizex; |
297 | 0 | int32_t iy; |
298 | 0 | uint16_t* src_16 = in + (by * CDEF_BSTRIDE + bx); |
299 | 0 | if (dst8) { |
300 | 0 | uint8_t* dst_8 = dst8 + (bi << (bsizex + bsizey)); |
301 | | //size 2x2 and 3x3, no gain to use SIMD |
302 | 0 | for (iy = 0; iy < 1 << bsizey; iy += subsampling_factor) { |
303 | 0 | for (int32_t ix = 0; ix < 1 << bsizex; ix++) { |
304 | 0 | dst_8[(iy << bsizex) + ix] = (uint8_t)src_16[iy * CDEF_BSTRIDE + ix]; |
305 | 0 | } |
306 | 0 | } |
307 | 0 | } else { |
308 | 0 | uint16_t* dst_16 = dst16 + (bi << (bsizex + bsizey)); |
309 | 0 | for (iy = 0; iy < 1 << bsizey; iy += subsampling_factor) { |
310 | 0 | memcpy(dst_16 + (iy << bsizex), |
311 | 0 | src_16 + iy * CDEF_BSTRIDE, |
312 | 0 | (uint32_t)(1 << bsizex) * sizeof(uint16_t)); |
313 | 0 | } |
314 | 0 | } |
315 | 0 | } |
316 | 0 | return; |
317 | 0 | } |
318 | | |
319 | 0 | if (pli == 0) { |
320 | 0 | if (!dirinit || !*dirinit) { |
321 | 0 | cdef_find_dir(in, dlist, var, cdef_count, coeff_shift, dir); |
322 | 0 | if (dirinit) { |
323 | 0 | *dirinit = 1; |
324 | 0 | } |
325 | 0 | } |
326 | 0 | } else if (pli == 1 && xdec != ydec) { |
327 | 0 | for (bi = 0; bi < cdef_count; bi++) { |
328 | 0 | static const uint8_t conv422[8] = {7, 0, 2, 4, 5, 6, 6, 6}; |
329 | 0 | static const uint8_t conv440[8] = {1, 2, 2, 2, 3, 4, 6, 0}; |
330 | |
|
331 | 0 | int32_t by = dlist[bi].by; |
332 | 0 | int32_t bx = dlist[bi].bx; |
333 | 0 | dir[by][bx] = (xdec ? conv422 : conv440)[dir[by][bx]]; |
334 | 0 | } |
335 | 0 | } |
336 | |
|
337 | 0 | for (bi = 0; bi < cdef_count; bi++) { |
338 | 0 | int32_t by = dlist[bi].by; |
339 | 0 | int32_t bx = dlist[bi].bx; |
340 | 0 | int32_t t = pli ? pri_strength : adjust_strength(pri_strength, var[by][bx]); |
341 | 0 | int32_t k = dstride ? (by << bsizey) * dstride + (bx << bsizex) : bi << (bsizex + bsizey); |
342 | 0 | svt_cdef_filter_block(dst8 ? &dst8[k] : NULL, |
343 | 0 | dst8 ? NULL : &dst16[k], |
344 | 0 | dstride ? dstride : 1 << bsizex, |
345 | 0 | &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)], |
346 | 0 | t, |
347 | 0 | sec_strength, |
348 | 0 | pri_strength ? dir[by][bx] : 0, |
349 | 0 | pri_damping, |
350 | 0 | sec_damping, |
351 | 0 | bsize, |
352 | 0 | coeff_shift, |
353 | 0 | subsampling_factor); |
354 | 0 | } |
355 | 0 | } |