/src/vvenc/source/Lib/CommonLib/MCTF.cpp
Line | Count | Source |
1 | | /* ----------------------------------------------------------------------------- |
2 | | The copyright in this software is being made available under the Clear BSD |
3 | | License, included below. No patent rights, trademark rights and/or |
4 | | other Intellectual Property Rights other than the copyrights concerning |
5 | | the Software are granted under this license. |
6 | | |
7 | | The Clear BSD License |
8 | | |
9 | | Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. |
10 | | All rights reserved. |
11 | | |
12 | | Redistribution and use in source and binary forms, with or without modification, |
13 | | are permitted (subject to the limitations in the disclaimer below) provided that |
14 | | the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the copyright holder nor the names of its |
24 | | contributors may be used to endorse or promote products derived from this |
25 | | software without specific prior written permission. |
26 | | |
27 | | NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY |
28 | | THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND |
29 | | CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
30 | | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A |
31 | | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR |
32 | | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
33 | | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
34 | | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
35 | | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER |
36 | | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
37 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
38 | | POSSIBILITY OF SUCH DAMAGE. |
39 | | |
40 | | |
41 | | ------------------------------------------------------------------------------------------- */ |
42 | | |
43 | | |
44 | | /** \file MCTF.cpp |
45 | | \brief MCTF class |
46 | | */ |
47 | | |
48 | | #include "MCTF.h" |
49 | | #include <math.h> |
50 | | #include "CommonLib/Picture.h" |
51 | | #include "CommonLib/dtrace_buffer.h" |
52 | | #include "Utilities/NoMallocThreadPool.h" |
53 | | |
54 | | namespace vvenc { |
55 | | |
#ifdef TRACE_ENABLE_ITT
// ITT instrumentation objects, only compiled when TRACE_ENABLE_ITT is defined:
// one string handle plus one domain for each of the two labels "est" and "flt".
static __itt_string_handle* itt_handle_est      = __itt_string_handle_create( "MCTF_est" );
static __itt_domain*        itt_domain_MCTF_est = __itt_domain_create( "MCTFEst" );
static __itt_string_handle* itt_handle_flt      = __itt_string_handle_create( "MCTF_flt" );
static __itt_domain*        itt_domain_MCTF_flt = __itt_domain_create( "MCTFFlt" );
#endif
62 | | |
63 | | // ==================================================================================================================== |
64 | | // Constructor / destructor / initialization / destroy |
65 | | // ==================================================================================================================== |
66 | | |
67 | | const double MCTF::m_chromaFactor = 0.55; |
68 | | const double MCTF::m_sigmaMultiplier = 9.0; |
69 | | const int MCTF::m_range = VVENC_MCTF_RANGE-2; |
70 | | const int MCTF::m_motionVectorFactor = 16; |
71 | | const int MCTF::m_padding = MCTF_PADDING; |
72 | | const int16_t MCTF::m_interpolationFilter8[16][8] = |
73 | | { |
74 | | { 0, 0, 0, 64, 0, 0, 0, 0 }, //0 |
75 | | { 0, 1, -3, 64, 4, -2, 0, 0 }, //1 -->--> |
76 | | { 0, 1, -6, 62, 9, -3, 1, 0 }, //2 --> |
77 | | { 0, 2, -8, 60, 14, -5, 1, 0 }, //3 -->--> |
78 | | { 0, 2, -9, 57, 19, -7, 2, 0 }, //4 |
79 | | { 0, 3, -10, 53, 24, -8, 2, 0 }, //5 -->--> |
80 | | { 0, 3, -11, 50, 29, -9, 2, 0 }, //6 --> |
81 | | { 0, 3, -11, 44, 35, -10, 3, 0 }, //7 -->--> |
82 | | { 0, 1, -7, 38, 38, -7, 1, 0 }, //8 |
83 | | { 0, 3, -10, 35, 44, -11, 3, 0 }, //9 -->--> |
84 | | { 0, 2, -9, 29, 50, -11, 3, 0 }, //10--> |
85 | | { 0, 2, -8, 24, 53, -10, 3, 0 }, //11-->--> |
86 | | { 0, 2, -7, 19, 57, -9, 2, 0 }, //12 |
87 | | { 0, 1, -5, 14, 60, -8, 2, 0 }, //13-->--> |
88 | | { 0, 1, -3, 9, 62, -6, 1, 0 }, //14--> |
89 | | { 0, 0, -2, 4, 64, -3, 1, 0 } //15-->--> |
90 | | }; |
91 | | |
92 | | const int16_t MCTF::m_interpolationFilter4[16][4] = |
93 | | { |
94 | | { 0, 64, 0, 0 }, //0 |
95 | | { -2, 62, 4, 0 }, //1 -->--> |
96 | | { -2, 58, 10, -2 }, //2 --> |
97 | | { -4, 56, 14, -2 }, //3 -->--> |
98 | | { -4, 54, 16, -2 }, //4 |
99 | | { -6, 52, 20, -2 }, //5 -->--> |
100 | | { -6, 46, 28, -4 }, //6 --> |
101 | | { -4, 42, 30, -4 }, //7 -->--> |
102 | | { -4, 36, 36, -4 }, //8 |
103 | | { -4, 30, 42, -4 }, //9 -->--> |
104 | | { -4, 28, 46, -6 }, //10--> |
105 | | { -2, 20, 52, -6 }, //11-->--> |
106 | | { -2, 16, 54, -4 }, //12 |
107 | | { -2, 14, 56, -4 }, //13-->--> |
108 | | { -2, 10, 58, -2 }, //14--> |
109 | | { 0, 4, 62, -2 }, //15-->--> |
110 | | }; |
111 | | |
112 | | const double MCTF::m_refStrengths[2][6] = // min(..., {3 or 5} / (1 + 2 * |POC offset|)) |
113 | | { // abs(POC offset) |
114 | | // 1 2 3 4 5 6 |
115 | | { 0.84375, 0.6, 0.4286, 0.3333, 0.2727, 0.2308 }, // RA |
116 | | { 1.12500, 1.0, 0.7143, 0.5556, 0.4545, 0.3846 } // LD |
117 | | }; |
118 | | |
119 | | const int MCTF::m_cuTreeThresh[4] = { 75, 60, 30, 15 }; |
120 | | const double MCTF::m_cuTreeCenter = 45; |
121 | | |
122 | | int motionErrorLumaInt( const Pel* org, const ptrdiff_t origStride, const Pel* buf, const ptrdiff_t buffStride, const int w, const int h, const int besterror ) |
123 | 0 | { |
124 | 0 | int error = 0; |
125 | |
|
126 | 0 | for( int y1 = 0; y1 < h; y1++ ) |
127 | 0 | { |
128 | 0 | const Pel* origRowStart = org + y1 * origStride; |
129 | 0 | const Pel* bufferRowStart = buf + y1 * buffStride; |
130 | |
|
131 | 0 | for( int x1 = 0; x1 < w; x1 += 2 ) |
132 | 0 | { |
133 | 0 | int diff = origRowStart[x1] - bufferRowStart[x1]; |
134 | 0 | error += diff * diff; |
135 | 0 | diff = origRowStart[x1 + 1] - bufferRowStart[x1 + 1]; |
136 | 0 | error += diff * diff; |
137 | 0 | } |
138 | 0 | if( error > besterror ) |
139 | 0 | { |
140 | 0 | return error; |
141 | 0 | } |
142 | 0 | } |
143 | | |
144 | 0 | return error; |
145 | 0 | } |
146 | | |
147 | | int motionErrorLumaFrac6( const Pel *org, const ptrdiff_t origStride, const Pel *buf, const ptrdiff_t buffStride, const int w, const int h, const int16_t *xFilter, const int16_t *yFilter, const int bitDepth, const int besterror ) |
148 | 0 | { |
149 | 0 | int error = 0; |
150 | 0 | Pel tempArray[64 + 8][64]; |
151 | 0 | int sum, base; |
152 | 0 | const Pel maxSampleValue = ( 1 << bitDepth ) - 1; |
153 | |
|
154 | 0 | for( int y1 = 1; y1 < h + 7; y1++ ) |
155 | 0 | { |
156 | 0 | const int yOffset = y1 - 3; |
157 | 0 | const Pel *sourceRow = buf + yOffset * buffStride; |
158 | 0 | for( int x1 = 0; x1 < w; x1++ ) |
159 | 0 | { |
160 | 0 | sum = 0; |
161 | 0 | base = x1 - 3; |
162 | 0 | const Pel *rowStart = sourceRow + base; |
163 | |
|
164 | 0 | sum += xFilter[1] * rowStart[1]; |
165 | 0 | sum += xFilter[2] * rowStart[2]; |
166 | 0 | sum += xFilter[3] * rowStart[3]; |
167 | 0 | sum += xFilter[4] * rowStart[4]; |
168 | 0 | sum += xFilter[5] * rowStart[5]; |
169 | 0 | sum += xFilter[6] * rowStart[6]; |
170 | |
|
171 | 0 | sum = ( sum + ( 1 << 5 ) ) >> 6; |
172 | 0 | sum = sum < 0 ? 0 : ( sum > maxSampleValue ? maxSampleValue : sum ); |
173 | |
|
174 | 0 | tempArray[y1][x1] = sum; |
175 | 0 | } |
176 | 0 | } |
177 | |
|
178 | 0 | for( int y1 = 0; y1 < h; y1++ ) |
179 | 0 | { |
180 | 0 | const Pel *origRow = org + y1 * origStride; |
181 | 0 | for( int x1 = 0; x1 < w; x1++ ) |
182 | 0 | { |
183 | 0 | sum = 0; |
184 | 0 | sum += yFilter[1] * tempArray[y1 + 1][x1]; |
185 | 0 | sum += yFilter[2] * tempArray[y1 + 2][x1]; |
186 | 0 | sum += yFilter[3] * tempArray[y1 + 3][x1]; |
187 | 0 | sum += yFilter[4] * tempArray[y1 + 4][x1]; |
188 | 0 | sum += yFilter[5] * tempArray[y1 + 5][x1]; |
189 | 0 | sum += yFilter[6] * tempArray[y1 + 6][x1]; |
190 | |
|
191 | 0 | sum = ( sum + ( 1 << 5 ) ) >> 6; |
192 | 0 | sum = sum < 0 ? 0 : ( sum > maxSampleValue ? maxSampleValue : sum ); |
193 | |
|
194 | 0 | error += ( sum - origRow[x1] ) * ( sum - origRow[x1] ); |
195 | 0 | } |
196 | 0 | if( error > besterror ) |
197 | 0 | { |
198 | 0 | return error; |
199 | 0 | } |
200 | 0 | } |
201 | | |
202 | 0 | return error; |
203 | 0 | } |
204 | | |
205 | | int motionErrorLumaFrac4( const Pel* org, const ptrdiff_t origStride, const Pel* buf, const ptrdiff_t buffStride, const int w, const int h, const int16_t* xFilter, const int16_t* yFilter, const int bitDepth, const int besterror ) |
206 | 0 | { |
207 | 0 | int error = 0; |
208 | 0 | Pel tempArray[64 + 4][64]; |
209 | 0 | int sum, base; |
210 | 0 | const Pel maxSampleValue = ( 1 << bitDepth ) - 1; |
211 | |
|
212 | 0 | for( int y1 = 0; y1 < h + 3; y1++ ) |
213 | 0 | { |
214 | 0 | const int yOffset = y1 - 1; |
215 | 0 | const Pel* sourceRow = buf + yOffset * buffStride; |
216 | 0 | for( int x1 = 0; x1 < w; x1++ ) |
217 | 0 | { |
218 | 0 | sum = 0; |
219 | 0 | base = x1 - 1; |
220 | 0 | const Pel* rowStart = sourceRow + base; |
221 | |
|
222 | 0 | sum += xFilter[0] * rowStart[0]; |
223 | 0 | sum += xFilter[1] * rowStart[1]; |
224 | 0 | sum += xFilter[2] * rowStart[2]; |
225 | 0 | sum += xFilter[3] * rowStart[3]; |
226 | |
|
227 | 0 | sum = ( sum + ( 1 << 5 ) ) >> 6; |
228 | 0 | sum = sum < 0 ? 0 : ( sum > maxSampleValue ? maxSampleValue : sum ); |
229 | |
|
230 | 0 | tempArray[y1][x1] = sum; |
231 | 0 | } |
232 | 0 | } |
233 | |
|
234 | 0 | for( int y1 = 0; y1 < h; y1++ ) |
235 | 0 | { |
236 | 0 | const Pel* origRow = org + y1 * origStride; |
237 | 0 | for( int x1 = 0; x1 < w; x1++ ) |
238 | 0 | { |
239 | 0 | sum = 0; |
240 | 0 | sum += yFilter[0] * tempArray[y1 + 0][x1]; |
241 | 0 | sum += yFilter[1] * tempArray[y1 + 1][x1]; |
242 | 0 | sum += yFilter[2] * tempArray[y1 + 2][x1]; |
243 | 0 | sum += yFilter[3] * tempArray[y1 + 3][x1]; |
244 | |
|
245 | 0 | sum = ( sum + ( 1 << 5 ) ) >> 6; |
246 | 0 | sum = sum < 0 ? 0 : ( sum > maxSampleValue ? maxSampleValue : sum ); |
247 | |
|
248 | 0 | error += ( sum - origRow[x1] ) * ( sum - origRow[x1] ); |
249 | 0 | } |
250 | 0 | if( error > besterror ) |
251 | 0 | { |
252 | 0 | return error; |
253 | 0 | } |
254 | 0 | } |
255 | | |
256 | 0 | return error; |
257 | 0 | } |
258 | | |
259 | | void applyFrac8Core_6Tap( const Pel* org, const ptrdiff_t origStride, Pel* dst, const ptrdiff_t dstStride, const int w, const int h, const int16_t* xFilter, const int16_t* yFilter, const int bitDepth ) |
260 | 0 | { |
261 | 0 | const int numFilterTaps = 7; |
262 | 0 | const int centreTapOffset = 3; |
263 | 0 | const int maxValue = ( 1 << bitDepth ) - 1; |
264 | |
|
265 | 0 | Pel tempArray[64 + numFilterTaps][64]; |
266 | |
|
267 | 0 | for( int by = 1; by < h + numFilterTaps - 1; by++ ) |
268 | 0 | { |
269 | 0 | const int yOffset = by - centreTapOffset; |
270 | 0 | const Pel *sourceRow = org + yOffset * origStride; |
271 | 0 | for( int bx = 0; bx < w; bx++ ) |
272 | 0 | { |
273 | 0 | int base = bx - centreTapOffset; |
274 | 0 | const Pel *rowStart = sourceRow + base; |
275 | |
|
276 | 0 | int sum = 0; |
277 | 0 | sum += xFilter[1] * rowStart[1]; |
278 | 0 | sum += xFilter[2] * rowStart[2]; |
279 | 0 | sum += xFilter[3] * rowStart[3]; |
280 | 0 | sum += xFilter[4] * rowStart[4]; |
281 | 0 | sum += xFilter[5] * rowStart[5]; |
282 | 0 | sum += xFilter[6] * rowStart[6]; |
283 | |
|
284 | 0 | sum = ( sum + ( 1 << 5 ) ) >> 6; |
285 | 0 | tempArray[by][bx] = sum; |
286 | 0 | } |
287 | 0 | } |
288 | |
|
289 | 0 | Pel *dstRow = dst; |
290 | 0 | for( int by = 0; by < h; by++, dstRow += dstStride ) |
291 | 0 | { |
292 | 0 | Pel *dstPel = dstRow; |
293 | 0 | for( int bx = 0; bx < w; bx++, dstPel++ ) |
294 | 0 | { |
295 | 0 | int sum = 0; |
296 | |
|
297 | 0 | sum += yFilter[1] * tempArray[by + 1][bx]; |
298 | 0 | sum += yFilter[2] * tempArray[by + 2][bx]; |
299 | 0 | sum += yFilter[3] * tempArray[by + 3][bx]; |
300 | 0 | sum += yFilter[4] * tempArray[by + 4][bx]; |
301 | 0 | sum += yFilter[5] * tempArray[by + 5][bx]; |
302 | 0 | sum += yFilter[6] * tempArray[by + 6][bx]; |
303 | |
|
304 | 0 | sum = ( sum + ( 1 << 5 ) ) >> 6; |
305 | 0 | sum = sum < 0 ? 0 : ( sum > maxValue ? maxValue : sum ); |
306 | 0 | *dstPel = sum; |
307 | 0 | } |
308 | 0 | } |
309 | 0 | } |
310 | | |
311 | | void applyFrac8Core_4Tap( const Pel* org, const ptrdiff_t origStride, Pel* dst, const ptrdiff_t dstStride, const int w, const int h, const int16_t* xFilter, const int16_t* yFilter, const int bitDepth ) |
312 | 0 | { |
313 | 0 | const int numFilterTaps = 3; |
314 | 0 | const int centreTapOffset = 1; |
315 | 0 | const int maxValue = ( 1 << bitDepth ) - 1; |
316 | |
|
317 | 0 | Pel tempArray[64 + numFilterTaps][64]; |
318 | |
|
319 | 0 | for( int by = 0; by < h + numFilterTaps; by++ ) |
320 | 0 | { |
321 | 0 | const int yOffset = by - centreTapOffset; |
322 | 0 | const Pel* sourceRow = org + yOffset * origStride; |
323 | |
|
324 | 0 | for( int bx = 0; bx < w; bx++ ) |
325 | 0 | { |
326 | 0 | int base = bx - centreTapOffset; |
327 | 0 | const Pel* rowStart = sourceRow + base; |
328 | |
|
329 | 0 | int sum = 0; |
330 | 0 | sum += xFilter[0] * rowStart[0]; |
331 | 0 | sum += xFilter[1] * rowStart[1]; |
332 | 0 | sum += xFilter[2] * rowStart[2]; |
333 | 0 | sum += xFilter[3] * rowStart[3]; |
334 | |
|
335 | 0 | sum = ( sum + ( 1 << 5 ) ) >> 6; |
336 | 0 | tempArray[by][bx] = sum; |
337 | 0 | } |
338 | 0 | } |
339 | |
|
340 | 0 | Pel* dstRow = dst; |
341 | 0 | for( int by = 0; by < h; by++, dstRow += dstStride ) |
342 | 0 | { |
343 | 0 | Pel* dstPel = dstRow; |
344 | 0 | for( int bx = 0; bx < w; bx++, dstPel++ ) |
345 | 0 | { |
346 | 0 | int sum = 0; |
347 | 0 | sum += yFilter[0] * tempArray[by + 0][bx]; |
348 | 0 | sum += yFilter[1] * tempArray[by + 1][bx]; |
349 | 0 | sum += yFilter[2] * tempArray[by + 2][bx]; |
350 | 0 | sum += yFilter[3] * tempArray[by + 3][bx]; |
351 | |
|
352 | 0 | sum = ( sum + ( 1 << 5 ) ) >> 6; |
353 | 0 | sum = sum < 0 ? 0 : ( sum > maxValue ? maxValue : sum ); |
354 | 0 | *dstPel = sum; |
355 | 0 | } |
356 | 0 | } |
357 | 0 | } |
358 | | |
/** Cheap approximation of exp( n / d ) via e^x ~= ( 1 + x/N )^N with N = 1024.
 *
 *  The power is evaluated with ten successive squarings (2^10 = 1024).
 *
 *  \param n  numerator of the exponent
 *  \param d  denominator of the exponent
 *  \return approximately exp( n / d ); exactly 0 when n / d <= -1024
 */
inline static float fastExp( float n, float d )
{
  float x = 1.0f + n / ( d * 1024 );

  // For n/d <= -1024 the base is zero or negative. Squaring a negative base would yield a
  // positive value (e.g. ~0.36 for n/d = -2047, +inf for n/d = -4096) although the true
  // exponential is vanishingly small, so report 0 instead.
  if( x < 0.0f )
  {
    return 0.0f;
  }

  // x^(2^10)
  for( int i = 0; i < 10; i++ )
  {
    x *= x;
  }
  return x;
}
368 | | |
369 | | static const int32_t xSzm[6] = {0, 1, 20, 336, 5440, 87296}; |
370 | | |
371 | | // works for bit depths up to incl. 12 and power-of-2 block dimensions in both directions |
372 | | void applyPlanarCorrectionCore( const Pel* refPel, const ptrdiff_t refStride, Pel* dstPel, const ptrdiff_t dstStride, const int32_t w, const int32_t h, const ClpRng& clpRng, const uint16_t motionError ) |
373 | 0 | { |
374 | 0 | const int32_t blockSize = w * h; |
375 | 0 | const int32_t log2Width = floorLog2 (w); |
376 | 0 | const int32_t maxPelVal = clpRng.max(); |
377 | 0 | const int32_t mWeight = std::min (512u, (uint32_t) motionError * (uint32_t) motionError); |
378 | 0 | const int32_t xSum = (blockSize * (w - 1)) >> 1; |
379 | 0 | int32_t x1yzm = 0, x2yzm = 0, ySum = 0; |
380 | 0 | int32_t b0, b1, b2; |
381 | 0 | int64_t numer, denom; |
382 | |
|
383 | 0 | for (int32_t y = 0; y < h; y++) // sum up dot-products between indices and sample diffs |
384 | 0 | { |
385 | 0 | for (int32_t x = 0; x < w; x++) |
386 | 0 | { |
387 | 0 | const Pel* pDst = dstPel + y * dstStride + x; |
388 | 0 | const Pel* pRef = refPel + y * refStride + x; |
389 | 0 | const int32_t z = *pDst - *pRef; |
390 | |
|
391 | 0 | x1yzm += x * z; x2yzm += y * z; ySum += z; |
392 | 0 | } |
393 | 0 | } |
394 | |
|
395 | 0 | denom = blockSize * xSzm[log2Width]; // plane-fit parameters, in fixed-point arithmetic |
396 | 0 | numer = (int64_t) mWeight * ((int64_t) x1yzm * blockSize - xSum * ySum); |
397 | 0 | b1 = int32_t ((numer < 0 ? numer - (denom >> 1) : numer + (denom >> 1)) / denom); |
398 | 0 | b1 = (b1 < INT16_MIN ? INT16_MIN : (b1 > INT16_MAX ? INT16_MAX : b1)); |
399 | 0 | numer = (int64_t) mWeight * ((int64_t) x2yzm * blockSize - xSum * ySum); |
400 | 0 | b2 = int32_t ((numer < 0 ? numer - (denom >> 1) : numer + (denom >> 1)) / denom); |
401 | 0 | b2 = (b2 > INT16_MAX ? INT16_MAX : (b2 < INT16_MIN ? INT16_MIN : b2)); |
402 | 0 | b0 = (mWeight * ySum - (b1 + b2) * xSum + (blockSize >> 1)) >> (log2Width << 1); |
403 | |
|
404 | 0 | if (b0 == 0 && b1 == 0 && b2 == 0) return; |
405 | | |
406 | 0 | for (int32_t y = 0; y < h; y++) // perform deblocking by adding fitted correction plane |
407 | 0 | { |
408 | 0 | for (int32_t x = 0; x < w; x++) |
409 | 0 | { |
410 | 0 | Pel* const pDst = dstPel + y * dstStride + x; |
411 | 0 | const int32_t p = (b0 + b1 * x + b2 * y + 256) >> 9; // fixed-point plane corrector |
412 | 0 | const int32_t z = *pDst - p; |
413 | |
|
414 | 0 | *pDst = Pel (z < 0 ? 0 : (z > maxPelVal ? maxPelVal : z)); |
415 | 0 | } |
416 | 0 | } |
417 | 0 | } |
418 | | |
419 | | void applyBlockCore( const CPelBuf& src, PelBuf& dst, const CompArea& blk, const ClpRng& clpRng, const Pel** correctedPics, int numRefs, const int* verror, const double* refStrenghts, double weightScaling, double sigmaSq ) |
420 | 0 | { |
421 | 0 | const int w = blk.width; |
422 | 0 | const int h = blk.height; |
423 | 0 | const int bx = blk.x; |
424 | 0 | const int by = blk.y; |
425 | |
|
426 | 0 | const ptrdiff_t srcStride = src.stride; |
427 | 0 | const ptrdiff_t dstStride = dst.stride; |
428 | |
|
429 | 0 | const Pel *srcPel = src.bufAt( bx, by ); |
430 | 0 | Pel *dstPel = dst.bufAt( bx, by ); |
431 | |
|
432 | 0 | const Pel maxSampleValue = clpRng.max(); |
433 | |
|
434 | 0 | int vnoise[2 * VVENC_MCTF_RANGE] = { 0, }; |
435 | 0 | float vsw [2 * VVENC_MCTF_RANGE] = { 0.0f, }; |
436 | 0 | float vww [2 * VVENC_MCTF_RANGE] = { 0.0f, }; |
437 | |
|
438 | 0 | int minError = INT32_MAX; |
439 | |
|
440 | 0 | for( int i = 0; i < numRefs; i++ ) |
441 | 0 | { |
442 | 0 | int64_t variance = 0, diffsum = 0; |
443 | 0 | const ptrdiff_t refStride = w; |
444 | 0 | const Pel * refPel = correctedPics[i]; |
445 | 0 | for( int y1 = 0; y1 < h; y1++ ) |
446 | 0 | { |
447 | 0 | for( int x1 = 0; x1 < w; x1++ ) |
448 | 0 | { |
449 | 0 | const Pel pix = *( srcPel + srcStride * y1 + x1 ); |
450 | 0 | const Pel ref = *( refPel + refStride * y1 + x1 ); |
451 | |
|
452 | 0 | const int diff = pix - ref; |
453 | 0 | variance += diff * diff; |
454 | 0 | if( x1 != w - 1 ) |
455 | 0 | { |
456 | 0 | const Pel pixR = *( srcPel + srcStride * y1 + x1 + 1 ); |
457 | 0 | const Pel refR = *( refPel + refStride * y1 + x1 + 1 ); |
458 | 0 | const int diffR = pixR - refR; |
459 | 0 | diffsum += ( diffR - diff ) * ( diffR - diff ); |
460 | 0 | } |
461 | 0 | if( y1 != h - 1 ) |
462 | 0 | { |
463 | 0 | const Pel pixD = *( srcPel + srcStride * y1 + x1 + srcStride ); |
464 | 0 | const Pel refD = *( refPel + refStride * y1 + x1 + refStride ); |
465 | 0 | const int diffD = pixD - refD; |
466 | 0 | diffsum += ( diffD - diff ) * ( diffD - diff ); |
467 | 0 | } |
468 | 0 | } |
469 | 0 | } |
470 | 0 | variance *= (int64_t) 1 << (2*(10-clpRng.bd)); |
471 | 0 | diffsum *= (int64_t) 1 << (2*(10-clpRng.bd)); |
472 | 0 | const int cntV = w * h; |
473 | 0 | const int cntD = 2 * cntV - w - h; |
474 | 0 | vnoise[i] = ( int ) round( ( 15.0 * cntD / cntV * variance + 5.0 ) / ( diffsum + 5.0 ) ); |
475 | 0 | minError = std::min( minError, verror[i] ); |
476 | 0 | } |
477 | |
|
478 | 0 | for( int i = 0; i < numRefs; i++ ) |
479 | 0 | { |
480 | 0 | const int error = verror[i]; |
481 | 0 | const int noise = vnoise[i]; |
482 | 0 | float ww = 1, sw = 1; |
483 | 0 | ww *= ( noise < 25 ) ? 1.0 : 0.6; |
484 | 0 | sw *= ( noise < 25 ) ? 1.0 : 0.8; |
485 | 0 | ww *= ( error < 50 ) ? 1.2 : ( ( error > 100 ) ? 0.6 : 1.0 ); |
486 | 0 | sw *= ( error < 50 ) ? 1.0 : 0.8; |
487 | 0 | ww *= ( ( minError + 1.0 ) / ( error + 1.0 ) ); |
488 | |
|
489 | 0 | vww[i] = ww * weightScaling * refStrenghts[i]; |
490 | 0 | vsw[i] = sw * 2 * sigmaSq; |
491 | 0 | } |
492 | |
|
493 | 0 | for( int y = 0; y < h; y++ ) |
494 | 0 | { |
495 | 0 | for( int x = 0; x < w; x++ ) |
496 | 0 | { |
497 | 0 | const Pel orgVal = *( srcPel + srcStride * y + x ); |
498 | 0 | float temporalWeightSum = 1.0; |
499 | 0 | float newVal = ( float ) orgVal; |
500 | |
|
501 | 0 | for( int i = 0; i < numRefs; i++ ) |
502 | 0 | { |
503 | 0 | const Pel* pCorrectedPelPtr = correctedPics[i] + y * w + x; |
504 | 0 | const int refVal = *pCorrectedPelPtr; |
505 | 0 | const int diff = refVal - orgVal; |
506 | 0 | const float diffSq = diff * diff; |
507 | |
|
508 | 0 | float weight = vww[i] * fastExp( -diffSq, vsw[i] ); |
509 | 0 | newVal += weight * refVal; |
510 | 0 | temporalWeightSum += weight; |
511 | 0 | } |
512 | 0 | newVal /= temporalWeightSum; |
513 | 0 | Pel sampleVal = ( Pel ) ( newVal + 0.5 ); |
514 | 0 | sampleVal = ( sampleVal < 0 ? 0 : ( sampleVal > maxSampleValue ? maxSampleValue : sampleVal ) ); |
515 | 0 | *( dstPel + dstStride * y + x ) = sampleVal; |
516 | 0 | } |
517 | 0 | } |
518 | 0 | } |
519 | | |
520 | | double calcVarCore( const Pel* org, const ptrdiff_t origStride, const int w, const int h ) |
521 | 0 | { |
522 | | // calculate average |
523 | 0 | int avg = 0; |
524 | 0 | for( int y1 = 0; y1 < h; y1++ ) |
525 | 0 | { |
526 | 0 | for( int x1 = 0; x1 < w; x1++ ) |
527 | 0 | { |
528 | 0 | avg = avg + *( org + x1 + y1 * origStride ); |
529 | 0 | } |
530 | 0 | } |
531 | 0 | avg <<= 4; |
532 | 0 | avg = avg / ( w * h ); |
533 | | |
534 | | // calculate variance |
535 | 0 | int64_t variance = 0; |
536 | 0 | for( int y1 = 0; y1 < h; y1++ ) |
537 | 0 | { |
538 | 0 | for( int x1 = 0; x1 < w; x1++ ) |
539 | 0 | { |
540 | 0 | int pix = *( org + x1 + y1 * origStride ) << 4; |
541 | 0 | variance = variance + ( pix - avg ) * ( pix - avg ); |
542 | 0 | } |
543 | 0 | } |
544 | |
|
545 | 0 | return variance / 256.0; |
546 | 0 | } |
547 | | |
548 | | MCTF::MCTF( bool enableOpt ) |
549 | 0 | : m_encCfg ( nullptr ) |
550 | 0 | , m_threadPool ( nullptr ) |
551 | 0 | , m_isFinalPass( true ) |
552 | 0 | , m_filterPoc ( 0 ) |
553 | 0 | , m_lastPicIn ( nullptr ) |
554 | 0 | { |
555 | 0 | m_motionErrorLumaIntX = motionErrorLumaInt; |
556 | 0 | m_motionErrorLumaInt8 = motionErrorLumaInt; |
557 | 0 | m_motionErrorLumaFracX[0] = motionErrorLumaFrac6; |
558 | 0 | m_motionErrorLumaFrac8[0] = motionErrorLumaFrac6; |
559 | 0 | m_motionErrorLumaFracX[1] = motionErrorLumaFrac4; |
560 | 0 | m_motionErrorLumaFrac8[1] = motionErrorLumaFrac4; |
561 | 0 | m_applyFrac[0][0] = applyFrac8Core_6Tap; |
562 | 0 | m_applyFrac[0][1] = applyFrac8Core_4Tap; |
563 | 0 | m_applyFrac[1][0] = applyFrac8Core_6Tap; |
564 | 0 | m_applyFrac[1][1] = applyFrac8Core_4Tap; |
565 | 0 | m_applyPlanarCorrection = applyPlanarCorrectionCore; |
566 | 0 | m_applyBlock = applyBlockCore; |
567 | 0 | m_calcVar = calcVarCore; |
568 | |
|
569 | 0 | if( enableOpt ) |
570 | 0 | { |
571 | 0 | #if defined( TARGET_SIMD_X86 ) && ENABLE_SIMD_OPT_MCTF |
572 | 0 | initMCTF_X86(); |
573 | 0 | #endif |
574 | | #if defined( TARGET_SIMD_ARM ) && ENABLE_SIMD_OPT_MCTF |
575 | | initMCTF_ARM(); |
576 | | #endif |
577 | 0 | } |
578 | 0 | } |
579 | | |
580 | | MCTF::~MCTF() |
581 | 0 | { |
582 | 0 | } |
583 | | |
584 | | void MCTF::init( const VVEncCfg& encCfg, bool isFinalPass, NoMallocThreadPool* threadPool ) |
585 | 0 | { |
586 | 0 | CHECK( encCfg.m_vvencMCTF.numFrames != encCfg.m_vvencMCTF.numStrength, "should have been checked before" ); |
587 | |
|
588 | 0 | m_encCfg = &encCfg; |
589 | 0 | m_threadPool = threadPool; |
590 | 0 | m_isFinalPass = isFinalPass; |
591 | 0 | m_filterPoc = 0; |
592 | 0 | m_area = Area( 0, 0, m_encCfg->m_PadSourceWidth, m_encCfg->m_PadSourceHeight ); |
593 | | |
594 | | // TLayer (TL) dependent definition of drop frames: TL = 4, TL = 3, TL = 2, TL = 1, TL = 0 |
595 | 0 | const static int sMCTFSpeed[5] { 0, 0, ((3<<12) + (2<<9) + (2<<6) + (0<<3) + 0), ((3<<12) + (2<<9) + (2<<6) + (0<<3) + 0), ((3<<12) + (3<<9) + (3<<6) + (2<<3) + 2) }; |
596 | |
|
597 | 0 | m_MCTFSpeedVal = sMCTFSpeed[ m_encCfg->m_vvencMCTF.MCTFSpeed ]; |
598 | 0 | m_lowResFltSearch = m_encCfg->m_vvencMCTF.MCTFSpeed > 0; |
599 | 0 | m_searchPttrn = m_encCfg->m_vvencMCTF.MCTFSpeed > 0 ? ( m_encCfg->m_vvencMCTF.MCTFSpeed >= 3 ? 2 : 1 ) : 0; |
600 | 0 | m_mctfUnitSize = m_encCfg->m_vvencMCTF.MCTFUnitSize; |
601 | 0 | } |
602 | | |
603 | | // ==================================================================================================================== |
604 | | // Public member functions |
605 | | // ==================================================================================================================== |
606 | | |
607 | | |
608 | | void MCTF::initPicture( Picture* pic ) |
609 | 0 | { |
610 | 0 | pic->getOrigBuf().extendBorderPel( MCTF_PADDING, MCTF_PADDING ); |
611 | 0 | pic->setSccFlags( m_encCfg ); |
612 | 0 | } |
613 | | |
614 | | void MCTF::processPictures( const PicList& picList, AccessUnitList& auList, PicList& doneList, PicList& freeList ) |
615 | 0 | { |
616 | | // ensure this is only processed if necessary |
617 | 0 | if( picList.empty() || ( m_lastPicIn == picList.back() && ! picList.back()->isFlush )) |
618 | 0 | { |
619 | 0 | return; |
620 | 0 | } |
621 | 0 | m_lastPicIn = picList.back(); |
622 | | |
623 | | // filter one picture (either all or up to frames to be encoded) |
624 | 0 | if( picList.size() |
625 | 0 | && m_filterPoc <= picList.back()->poc |
626 | 0 | && ( m_encCfg->m_framesToBeEncoded <= 0 || m_filterPoc < m_encCfg->m_framesToBeEncoded ) ) |
627 | 0 | { |
628 | | // setup fifo of pictures to be filtered |
629 | 0 | std::deque<Picture*> picFifo; |
630 | 0 | int filterIdx = 0; |
631 | 0 | for( auto pic : picList ) |
632 | 0 | { |
633 | 0 | const int minPoc = m_filterPoc - VVENC_MCTF_RANGE; |
634 | 0 | const int maxPoc = m_encCfg->m_vvencMCTF.MCTFFutureReference ? m_filterPoc + VVENC_MCTF_RANGE : m_filterPoc; |
635 | 0 | if( pic->poc >= minPoc && pic->poc <= maxPoc ) |
636 | 0 | { |
637 | 0 | picFifo.push_back( pic ); |
638 | 0 | if( pic->poc < m_filterPoc ) |
639 | 0 | { |
640 | 0 | filterIdx += 1; |
641 | 0 | } |
642 | 0 | } |
643 | 0 | } |
644 | 0 | CHECK( picFifo.empty(), "MCTF: no pictures to be filtered found" ); |
645 | 0 | CHECK( filterIdx >= (int)picFifo.size(), "MCTF: picture filter error" ); |
646 | 0 | CHECK( picFifo[ filterIdx ]->poc != m_filterPoc, "MCTF: picture filter error" ); |
647 | | // filter picture (when more than 1 picture is available for processing) |
648 | 0 | if( picFifo.size() > 1 ) |
649 | 0 | { |
650 | 0 | filter( picFifo, filterIdx ); |
651 | 0 | } |
652 | | // set picture done |
653 | 0 | doneList.push_back( picFifo[ filterIdx ] ); |
654 | 0 | } |
655 | | |
656 | | // mark pictures not needed anymore |
657 | 0 | for( auto pic : picList ) |
658 | 0 | { |
659 | 0 | if( pic->poc > m_filterPoc - VVENC_MCTF_RANGE ) |
660 | 0 | break; |
661 | 0 | freeList.push_back( pic ); |
662 | 0 | } |
663 | 0 | m_filterPoc += 1; |
664 | 0 | } |
665 | | |
666 | | void MCTF::motionEstimationMCTF(Picture* curPic, std::deque<TemporalFilterSourcePicInfo> &srcFrameInfo, const PelStorage& origBuf, PelStorage& origSubsampled2, PelStorage& origSubsampled4, PelStorage& origSubsampled8, std::vector<double> &mvErr, double &minError, bool addLevel, bool calcErr) |
667 | 0 | { |
668 | 0 | srcFrameInfo.push_back(TemporalFilterSourcePicInfo()); |
669 | 0 | TemporalFilterSourcePicInfo& srcPic = srcFrameInfo.back(); |
670 | |
|
671 | 0 | const int wInBlks = (m_area.width + m_mctfUnitSize - 1) / m_mctfUnitSize; |
672 | 0 | const int hInBlks = (m_area.height + m_mctfUnitSize - 1) / m_mctfUnitSize; |
673 | |
|
674 | 0 | srcPic.picBuffer.createFromBuf(curPic->getOrigBuf()); |
675 | 0 | srcPic.mvs.allocate(wInBlks, hInBlks); |
676 | 0 | srcPic.index = std::min(5, std::abs(curPic->poc - m_filterPoc) - 1); |
677 | | |
678 | |
|
679 | 0 | { |
680 | 0 | const int width = m_area.width; |
681 | 0 | const int height = m_area.height; |
682 | 0 | Array2D<MotionVector> mv_0(width / (m_mctfUnitSize * 8) + 1, height / (m_mctfUnitSize * 8) + 1); |
683 | 0 | Array2D<MotionVector> mv_1(width / (m_mctfUnitSize * 4) + 1, height / (m_mctfUnitSize * 4) + 1); |
684 | 0 | Array2D<MotionVector> mv_2(width / (m_mctfUnitSize * 2) + 1, height / (m_mctfUnitSize * 2) + 1); |
685 | |
|
686 | 0 | PelStorage bufferSub2; |
687 | 0 | PelStorage bufferSub4; |
688 | |
|
689 | 0 | subsampleLuma(srcPic.picBuffer, bufferSub2); |
690 | 0 | subsampleLuma(bufferSub2, bufferSub4); |
691 | |
|
692 | 0 | if (addLevel) |
693 | 0 | { |
694 | 0 | Array2D<MotionVector> mv_m(width / (m_mctfUnitSize * 16) + 1, height / (m_mctfUnitSize * 16) + 1); |
695 | 0 | PelStorage bufferSub8; |
696 | 0 | subsampleLuma(bufferSub4, bufferSub8); |
697 | 0 | motionEstimationLuma(mv_m, origSubsampled8, bufferSub8, 2 * m_mctfUnitSize); |
698 | 0 | motionEstimationLuma(mv_0, origSubsampled4, bufferSub4, 2 * m_mctfUnitSize, &mv_m, 2); |
699 | 0 | } |
700 | 0 | else |
701 | 0 | { |
702 | 0 | motionEstimationLuma(mv_0, origSubsampled4, bufferSub4, 2 * m_mctfUnitSize); |
703 | 0 | } |
704 | 0 | motionEstimationLuma(mv_1, origSubsampled2, bufferSub2, 2 * m_mctfUnitSize, &mv_0, 2); |
705 | 0 | motionEstimationLuma(mv_2, origBuf, srcPic.picBuffer, 2 * m_mctfUnitSize, &mv_1, 2); |
706 | |
|
707 | 0 | motionEstimationLuma(srcPic.mvs, origBuf, srcPic.picBuffer, m_mctfUnitSize, &mv_2, 1, true); |
708 | |
|
709 | 0 | if (calcErr) |
710 | 0 | { |
711 | 0 | double sumErr = 0.0; |
712 | 0 | for (int y = 0; y < srcPic.mvs.h(); y++) // going over ref pic in block steps |
713 | 0 | { |
714 | 0 | for (int x = 0; x < srcPic.mvs.w(); x++) |
715 | 0 | { |
716 | 0 | sumErr += srcPic.mvs.get(x, y).error; |
717 | 0 | } |
718 | 0 | } |
719 | 0 | double S = 1.0 / (srcPic.mvs.w() * srcPic.mvs.h()); |
720 | 0 | mvErr.push_back(sumErr * S); |
721 | 0 | minError = std::min(minError, sumErr * S); |
722 | 0 | } |
723 | 0 | } |
724 | 0 | } |
725 | | |
726 | | void MCTF::filter( const std::deque<Picture*>& picFifo, int filterIdx ) |
727 | 0 | { |
728 | 0 | PROFILER_SCOPE_AND_STAGE( 1, g_timeProfiler, P_MCTF ); |
729 | |
|
730 | 0 | Picture* pic = picFifo[ filterIdx ]; |
731 | | |
732 | | // first-pass temporal downsampling |
733 | 0 | if( ! m_isFinalPass && pic->gopEntry->m_skipFirstPass ) |
734 | 0 | { |
735 | 0 | return; |
736 | 0 | } |
737 | | |
738 | 0 | const int mctfIdx = pic->gopEntry ? pic->gopEntry->m_mctfIndex : -1; |
739 | 0 | const double overallStrength = mctfIdx >= 0 ? m_encCfg->m_vvencMCTF.MCTFStrengths[ mctfIdx ] : -1.0; |
740 | 0 | double meanRmsAcrossPic = 0.0; |
741 | 0 | uint64_t sumSRmsAcrossPic = 0; |
742 | 0 | uint16_t nMax = 0, maxRmsCTU = 0; |
743 | 0 | bool isFilterThisFrame = mctfIdx >= 0; |
744 | |
|
745 | 0 | int dropFrames = ( m_encCfg->m_usePerceptQPA ? VVENC_MCTF_RANGE >> 1 : 0 ); |
746 | 0 | if( mctfIdx >= 0 ) |
747 | 0 | { |
748 | 0 | const int idxTLayer = m_encCfg->m_vvencMCTF.numFrames - (mctfIdx + 1); |
749 | 0 | const int threshold = (m_MCTFSpeedVal >> (idxTLayer * 3)) & 7; |
750 | |
|
751 | 0 | dropFrames = std::min(VVENC_MCTF_RANGE, threshold); |
752 | 0 | isFilterThisFrame = threshold < VVENC_MCTF_RANGE; |
753 | 0 | } |
754 | |
|
755 | 0 | const int filterFrames = VVENC_MCTF_RANGE - 2 - dropFrames; |
756 | |
|
757 | 0 | int dropFramesFront = std::min( std::max( filterIdx - filterFrames, 0 ), dropFrames + 2 ); |
758 | 0 | int dropFramesBack = std::min( std::max( static_cast<int>( picFifo.size() ) - 1 - filterIdx - filterFrames, 0 ), dropFrames + 2 ); |
759 | |
|
760 | 0 | if( !pic->useMCTF && !pic->gopEntry->m_isStartOfGop ) |
761 | 0 | { |
762 | 0 | isFilterThisFrame = false; |
763 | 0 | } |
764 | |
|
765 | 0 | if ( isFilterThisFrame ) |
766 | 0 | { |
767 | 0 | bool useMCTFadaptation = true; |
768 | 0 | const bool condAddLevel = useMCTFadaptation && m_area.width >= 1920; |
769 | 0 | std::vector<double> mvErr; |
770 | 0 | double minError = MAX_DOUBLE; |
771 | |
|
772 | 0 | const PelStorage& origBuf = pic->getOrigBuffer(); |
773 | 0 | PelStorage& fltrBuf = pic->getFilteredOrigBuffer(); |
774 | | |
775 | | // subsample original picture so it only needs to be done once |
776 | 0 | PelStorage origSubsampled2; |
777 | 0 | PelStorage origSubsampled4; |
778 | 0 | PelStorage origSubsampled8; |
779 | 0 | subsampleLuma( origBuf, origSubsampled2 ); |
780 | 0 | subsampleLuma( origSubsampled2, origSubsampled4 ); |
781 | 0 | if (condAddLevel) |
782 | 0 | { |
783 | 0 | subsampleLuma(origSubsampled4, origSubsampled8); |
784 | 0 | } |
785 | | |
786 | | // determine motion vectors |
787 | 0 | std::deque<TemporalFilterSourcePicInfo> srcFrameInfo; |
788 | 0 | for ( int i = dropFramesFront; i < picFifo.size() - dropFramesBack; i++ ) |
789 | 0 | { |
790 | 0 | Picture* curPic = picFifo[ i ]; |
791 | 0 | if ( curPic->poc == m_filterPoc ) |
792 | 0 | { |
793 | 0 | continue; |
794 | 0 | } |
795 | 0 | motionEstimationMCTF(curPic, srcFrameInfo, origBuf, origSubsampled2, origSubsampled4, origSubsampled8 ,mvErr, minError, condAddLevel, useMCTFadaptation); |
796 | 0 | } |
797 | |
|
798 | 0 | int lastIndexRefFr = -1; |
799 | 0 | if ((m_encCfg->m_vvencMCTF.MCTFSpeed < 4) && (minError > 80)) |
800 | 0 | { |
801 | 0 | useMCTFadaptation = false; |
802 | 0 | } |
803 | 0 | if (useMCTFadaptation && minError) |
804 | 0 | { |
805 | 0 | const double errThr = 0.75 * minError * srcFrameInfo.size(); |
806 | 0 | int avgErrCond = 0; |
807 | 0 | int minErrCond = 0; |
808 | 0 | double factErr = m_encCfg->m_vvencMCTF.MCTFSpeed < 4 ? 1.0 : 2.0 ; |
809 | 0 | double SizeThi = m_encCfg->m_vvencMCTF.MCTFSpeed < 4 ? filterFrames + 1 : 3.0; |
810 | |
|
811 | 0 | for (const double& framMvErr : mvErr) |
812 | 0 | { |
813 | 0 | if (factErr * framMvErr > errThr) |
814 | 0 | { |
815 | 0 | avgErrCond++; |
816 | 0 | } |
817 | 0 | if (framMvErr > SizeThi * minError) |
818 | 0 | { |
819 | 0 | minErrCond++; |
820 | 0 | } |
821 | 0 | } |
822 | 0 | int newFilterFrames = minErrCond ? filterFrames : (filterFrames + 2 - avgErrCond); |
823 | 0 | if (filterFrames <= 2 && newFilterFrames > 3) newFilterFrames = 3; |
824 | |
|
825 | 0 | for (int curIdx = filterFrames + 1; (curIdx < newFilterFrames + 1)&&((lastIndexRefFr == -1)); curIdx++) |
826 | 0 | { |
827 | 0 | for (int i = 0; i < picFifo.size(); i++) |
828 | 0 | { |
829 | 0 | Picture* curPic = picFifo[i]; |
830 | 0 | if (curIdx == std::abs(curPic->poc - m_filterPoc)) |
831 | 0 | { |
832 | 0 | motionEstimationMCTF(curPic, srcFrameInfo, origBuf, origSubsampled2, origSubsampled4, origSubsampled8, mvErr, minError, condAddLevel, m_encCfg->m_vvencMCTF.MCTFSpeed == 4); |
833 | 0 | if (m_encCfg->m_vvencMCTF.MCTFSpeed == 4) |
834 | 0 | { |
835 | 0 | int nSize = (int(srcFrameInfo.size()) & 1) + int(srcFrameInfo.size()); |
836 | 0 | const double errThrcur = 0.75 * minError * nSize; |
837 | 0 | if (mvErr.back() > errThrcur) |
838 | 0 | { |
839 | 0 | lastIndexRefFr = curIdx; |
840 | 0 | break; |
841 | 0 | } |
842 | 0 | } |
843 | 0 | } |
844 | 0 | } |
845 | 0 | } |
846 | 0 | if ((lastIndexRefFr != -1)) |
847 | 0 | { |
848 | 0 | for (auto it = srcFrameInfo.begin(); it != srcFrameInfo.end(); ) |
849 | 0 | { |
850 | 0 | if ((it->index + 1) >= lastIndexRefFr) |
851 | 0 | { |
852 | 0 | it = srcFrameInfo.erase(it); |
853 | 0 | } |
854 | 0 | else |
855 | 0 | { |
856 | 0 | ++it; |
857 | 0 | } |
858 | 0 | } |
859 | 0 | } |
860 | 0 | } |
861 | | |
862 | | // filter |
863 | 0 | if( pic->useMCTF ) |
864 | 0 | { |
865 | 0 | fltrBuf.create( m_encCfg->m_internChromaFormat, m_area, 0, m_padding ); |
866 | 0 | bilateralFilter( origBuf, srcFrameInfo, fltrBuf, overallStrength ); |
867 | 0 | } |
868 | |
|
869 | 0 | if( m_encCfg->m_blockImportanceMapping || m_encCfg->m_usePerceptQPA || pic->gopEntry->m_isStartOfGop ) |
870 | 0 | { |
871 | 0 | const int ctuSize = m_encCfg->m_bimCtuSize; |
872 | 0 | const int widthInCtus = ( m_area.width + ctuSize - 1 ) / ctuSize; |
873 | 0 | const int heightInCtus = ( m_area.height + ctuSize - 1 ) / ctuSize; |
874 | 0 | const int numCtu = widthInCtus * heightInCtus; |
875 | 0 | const int ctuBlocks = ctuSize / m_mctfUnitSize; |
876 | |
|
877 | 0 | std::vector<double> sumError( numCtu * 2, 0 ); |
878 | 0 | std::vector<uint32_t> sumRMS( numCtu * 2, 0 ); // RMS of motion estimation error |
879 | 0 | std::vector<uint16_t> maxRMS( numCtu * 2, 0 ); // maximum block estimation error |
880 | 0 | std::vector<double> blkCount( numCtu * 2, 0 ); |
881 | |
|
882 | 0 | int distFactor[2] = { 3,3 }; |
883 | |
|
884 | 0 | for( auto& srcPic : srcFrameInfo ) |
885 | 0 | { |
886 | 0 | if( srcPic.index >= 2 ) |
887 | 0 | { |
888 | 0 | continue; |
889 | 0 | } |
890 | | |
891 | 0 | int dist = srcPic.index; |
892 | 0 | distFactor[dist]--; |
893 | |
|
894 | 0 | for( int y = 0; y < srcPic.mvs.h(); y++ ) // going over ref pic in block steps |
895 | 0 | { |
896 | 0 | for( int x = 0; x < srcPic.mvs.w(); x++ ) |
897 | 0 | { |
898 | 0 | const int ctuX = x / ctuBlocks; |
899 | 0 | const int ctuY = y / ctuBlocks; |
900 | 0 | const int ctuId = ctuY * widthInCtus + ctuX; |
901 | 0 | const auto& mvBlk = srcPic.mvs.get( x, y ); |
902 | 0 | sumError[dist * numCtu + ctuId] += mvBlk.error; |
903 | 0 | sumRMS [dist * numCtu + ctuId] += mvBlk.rmsme; |
904 | 0 | maxRMS [dist * numCtu + ctuId] = std::max( maxRMS[dist * numCtu + ctuId], mvBlk.rmsme ); |
905 | 0 | blkCount[dist * numCtu + ctuId] += mvBlk.overlap; |
906 | 0 | } |
907 | 0 | } |
908 | 0 | } |
909 | |
|
910 | 0 | if( distFactor[0] < 3 && distFactor[1] < 3 && ( m_encCfg->m_usePerceptQPA || pic->gopEntry->m_isStartOfGop ) ) |
911 | 0 | { |
912 | 0 | const double bd12bScale = double (m_encCfg->m_internalBitDepth[CH_L] < 12 ? 4 : 1); |
913 | |
|
914 | 0 | for( int i = 0; i < numCtu; i++ ) // start noise estimation with motion errors |
915 | 0 | { |
916 | 0 | const Position pos ((i % widthInCtus) * ctuSize, (i / widthInCtus) * ctuSize); |
917 | 0 | const CompArea ctuArea = clipArea (CompArea (COMP_Y, pic->chromaFormat, Area (pos.x, pos.y, ctuSize, ctuSize)), pic->Y()); |
918 | 0 | const unsigned avgIndex = pic->getOrigBuf (ctuArea).getAvg() >> (m_encCfg->m_internalBitDepth[CH_L] - 3); // one of 8 mean level regions |
919 | 0 | double meanInCTU; |
920 | |
|
921 | 0 | sumRMS[i] = std::min (sumRMS[i], sumRMS[i + numCtu]); |
922 | 0 | meanInCTU = bd12bScale * sumRMS[i] / blkCount[i]; |
923 | 0 | meanRmsAcrossPic += meanInCTU; |
924 | 0 | if (meanInCTU < pic->m_picShared->m_minNoiseLevels[avgIndex]) |
925 | 0 | { |
926 | 0 | pic->m_picShared->m_minNoiseLevels[avgIndex] = uint8_t (0.5 + meanInCTU); // scaled to 12 bit, see filterAndCalculateAverageActivity() |
927 | 0 | } |
928 | |
|
929 | 0 | maxRMS[i] = std::min (maxRMS[i], maxRMS[i + numCtu]); |
930 | 0 | maxRmsCTU = std::max (maxRmsCTU, maxRMS[i]); |
931 | 0 | sumSRmsAcrossPic += (uint64_t) maxRMS[i] * maxRMS[i]; |
932 | 0 | if (maxRMS[i] > 0) |
933 | 0 | { |
934 | 0 | nMax++; // count all CTUs with non-zero motion error (excludes e.g. black borders). CTU with the motion error peak is subtracted below |
935 | 0 | } |
936 | 0 | } |
937 | 0 | pic->m_picShared->m_picMotEstError = uint16_t (0.5 + meanRmsAcrossPic / numCtu); |
938 | |
|
939 | 0 | if( pic->gopEntry->m_isStartOfGop && !pic->useMCTF && m_encCfg->m_vvencMCTF.MCTF > 0 && meanRmsAcrossPic > numCtu * 27.0 ) |
940 | 0 | { |
941 | | // check application (re-enabling) of MCTF filter for key pictures, in case MCTF has been disabled based on SCC detection |
942 | 0 | bool allNoiseZero = true; |
943 | 0 | for( int i = 0; i < QPA_MAX_NOISE_LEVELS; i++ ) |
944 | 0 | { |
945 | 0 | if( pic->m_picShared->m_minNoiseLevels[i] && pic->m_picShared->m_minNoiseLevels[i] < 255 ) |
946 | 0 | { |
947 | 0 | allNoiseZero = false; |
948 | 0 | break; |
949 | 0 | } |
950 | 0 | } |
951 | 0 | int numZeroRMSCtus = 0; |
952 | 0 | if( allNoiseZero ) |
953 | 0 | { |
954 | 0 | for( int i = 0; i < numCtu; i++ ) |
955 | 0 | { |
956 | 0 | if( sumRMS[i] == 0 ) |
957 | 0 | { |
958 | 0 | numZeroRMSCtus += 1; |
959 | 0 | } |
960 | 0 | } |
961 | 0 | } |
962 | 0 | const bool doFilter = ( numZeroRMSCtus * 100 <= numCtu * 6 ); |
963 | 0 | if( doFilter ) |
964 | 0 | { |
965 | 0 | fltrBuf.create( m_encCfg->m_internChromaFormat, m_area, 0, m_padding ); |
966 | 0 | bilateralFilter( origBuf, srcFrameInfo, fltrBuf, overallStrength ); |
967 | 0 | } |
968 | 0 | } |
969 | 0 | } |
970 | 0 | if (m_encCfg->m_forceScc <= 0) |
971 | 0 | { |
972 | 0 | bool forceSCC = false; |
973 | 0 | if (pic->gopEntry->m_isStartOfGop) |
974 | 0 | { |
975 | 0 | forceSCC = true; |
976 | 0 | for (int j = 0; j < QPA_MAX_NOISE_LEVELS; j++) |
977 | 0 | { |
978 | 0 | if (pic->m_picShared->m_minNoiseLevels[j] < 255 && pic->m_picShared->m_minNoiseLevels[j]) |
979 | 0 | { |
980 | 0 | forceSCC = false; |
981 | 0 | break; |
982 | 0 | } |
983 | 0 | } |
984 | 0 | if (forceSCC) |
985 | 0 | { |
986 | 0 | for (int s = 0; s < mvErr.size(); s++) |
987 | 0 | { |
988 | 0 | if (int(mvErr[s]) == 0) |
989 | 0 | { |
990 | 0 | forceSCC = false; |
991 | 0 | break; |
992 | 0 | } |
993 | 0 | } |
994 | 0 | } |
995 | 0 | } |
996 | 0 | pic->m_picShared->m_forceSCC = forceSCC; |
997 | 0 | } |
998 | |
|
999 | 0 | if( !m_encCfg->m_blockImportanceMapping || !pic->useMCTF ) |
1000 | 0 | { |
1001 | 0 | CHECKD( !pic->m_picShared->m_ctuBimQpOffset.empty(), "BIM disabled, but offset vector not empty!" ); |
1002 | 0 | return; |
1003 | 0 | } |
1004 | | |
1005 | 0 | pic->m_picShared->m_ctuBimQpOffset.resize( numCtu, 0 ); |
1006 | |
|
1007 | 0 | if( distFactor[0] < 3 && distFactor[1] < 3 ) |
1008 | 0 | { |
1009 | 0 | const double weight = std::min( 1.0, overallStrength ); |
1010 | 0 | const double factor = std::min( 1.0, sqrt((1920.0 * 1080.0) / double (m_encCfg->m_SourceWidth * m_encCfg->m_SourceHeight)) ) * ( (double) m_encCfg->m_QP / (MAX_QP + 1.0) ); |
1011 | 0 | int sumCtuQpOffsets = 0; |
1012 | |
|
1013 | 0 | meanRmsAcrossPic = (!m_encCfg->m_usePerceptQPA || !m_encCfg->m_salienceBasedOpt || maxRmsCTU == 0 || nMax < 2 ? 65535.0 : sqrt (double (sumSRmsAcrossPic - (uint64_t) maxRmsCTU * maxRmsCTU) / (nMax - 1.0))); |
1014 | |
|
1015 | 0 | for( int i = 0; i < numCtu; i++ ) |
1016 | 0 | { |
1017 | 0 | const int avgErrD1 = ( int ) ( ( sumError[i ] / blkCount[i ] ) * distFactor[0] ); |
1018 | 0 | const int avgErrD2 = ( int ) ( ( sumError[i + numCtu] / blkCount[i + numCtu] ) * distFactor[1] ); |
1019 | 0 | int weightedErr = std::max( avgErrD1, avgErrD2 ) + abs( avgErrD2 - avgErrD1 ) * 3; |
1020 | 0 | weightedErr = ( int ) ( weightedErr * weight + ( 1 - weight ) * m_cuTreeCenter ); |
1021 | |
|
1022 | 0 | int qpOffset = 0; |
1023 | |
|
1024 | 0 | if( weightedErr > m_cuTreeThresh[0] ) |
1025 | 0 | { |
1026 | 0 | qpOffset = 2; |
1027 | 0 | } |
1028 | 0 | else if( weightedErr > m_cuTreeThresh[1] ) |
1029 | 0 | { |
1030 | 0 | qpOffset = 1; |
1031 | 0 | } |
1032 | 0 | else if( weightedErr < m_cuTreeThresh[3] ) |
1033 | 0 | { |
1034 | 0 | qpOffset = -2; |
1035 | 0 | } |
1036 | 0 | else if( weightedErr < m_cuTreeThresh[2] ) |
1037 | 0 | { |
1038 | 0 | qpOffset = -1; |
1039 | 0 | } |
1040 | |
|
1041 | 0 | if (meanRmsAcrossPic < maxRMS[i] * factor) |
1042 | 0 | { |
1043 | 0 | qpOffset += int (6.0 * log (std::max ((ctuSize > 64 ? 0.625 : 0.5) * maxRMS[i] * factor, meanRmsAcrossPic) / (maxRMS[i] * factor)) / (sqrt (weight) * log (2.0)) - 0.5); |
1044 | 0 | } |
1045 | |
|
1046 | 0 | pic->m_picShared->m_ctuBimQpOffset[i] = qpOffset; |
1047 | 0 | sumCtuQpOffsets += qpOffset; |
1048 | 0 | } |
1049 | |
|
1050 | 0 | pic->m_picShared->m_picAuxQpOffset = ( sumCtuQpOffsets + ( sumCtuQpOffsets < 0 ? -(numCtu >> 1) : numCtu >> 1 ) ) / numCtu; // pic average |
1051 | 0 | for( int i = 0; i < numCtu; i++ ) |
1052 | 0 | { |
1053 | 0 | pic->m_picShared->m_ctuBimQpOffset[i] -= pic->m_picShared->m_picAuxQpOffset; // delta-QP relative to above average, see xGetQPForPicture |
1054 | 0 | } |
1055 | 0 | } |
1056 | 0 | else |
1057 | 0 | { |
1058 | 0 | std::fill( pic->m_picShared->m_ctuBimQpOffset.begin(), pic->m_picShared->m_ctuBimQpOffset.end(), 0 ); |
1059 | 0 | } |
1060 | 0 | } |
1061 | 0 | } |
1062 | 0 | else |
1063 | 0 | { |
1064 | 0 | pic->m_picShared->m_ctuBimQpOffset.resize( 0 ); |
1065 | 0 | } |
1066 | 0 | } |
1067 | | |
1068 | | // ==================================================================================================================== |
1069 | | // Private member functions |
1070 | | // ==================================================================================================================== |
1071 | | |
1072 | | void MCTF::subsampleLuma(const PelStorage &input, PelStorage &output, const int factor) const |
1073 | 0 | { |
1074 | 0 | const int newWidth = input.Y().width / factor; |
1075 | 0 | const int newHeight = input.Y().height / factor; |
1076 | 0 | output.create(CHROMA_400, Area(0, 0, newWidth, newHeight), 0, m_padding); |
1077 | |
|
1078 | 0 | const Pel* srcRow = input.Y().buf; |
1079 | 0 | const int srcStride = input.Y().stride; |
1080 | 0 | Pel* dstRow = output.Y().buf; |
1081 | 0 | const int dstStride = output.Y().stride; |
1082 | |
|
1083 | 0 | for (int y = 0; y < newHeight; y++, srcRow+=factor*srcStride, dstRow+=dstStride) |
1084 | 0 | { |
1085 | 0 | const Pel* inRow = srcRow; |
1086 | 0 | const Pel* inRowBelow = srcRow+srcStride; |
1087 | 0 | Pel* target = dstRow; |
1088 | |
|
1089 | 0 | for (int x = 0; x < newWidth; x++) |
1090 | 0 | { |
1091 | 0 | target[x] = (inRow[0] + inRowBelow[0] + inRow[1] + inRowBelow[1] + 2) >> 2; |
1092 | 0 | inRow += 2; |
1093 | 0 | inRowBelow += 2; |
1094 | 0 | } |
1095 | 0 | } |
1096 | 0 | output.extendBorderPel(m_padding, m_padding); |
1097 | 0 | } |
1098 | | |
1099 | | int MCTF::motionErrorLuma(const PelStorage &orig, |
1100 | | const PelStorage &buffer, |
1101 | | const int x, |
1102 | | const int y, |
1103 | | int dx, |
1104 | | int dy, |
1105 | | const int bs, |
1106 | | const int besterror = MAX_INT) const |
1107 | 0 | { |
1108 | 0 | int fx = dx & 0xf; |
1109 | 0 | int fy = dy & 0xf; |
1110 | |
|
1111 | 0 | int error = 0;// dx * 10 + dy * 10; |
1112 | |
|
1113 | 0 | CHECKD( bs & 7, "Blocksize has to be a multiple of 8!" ); |
1114 | |
|
1115 | 0 | const int w = std::min<int>( bs, orig.Y().width - x ) & ~7; |
1116 | 0 | const int h = std::min<int>( bs, orig.Y().height - y ) & ~7; |
1117 | |
|
1118 | 0 | CHECK( !w || !h, "Incompatible sizes!" ); |
1119 | |
|
1120 | 0 | if( ( fx | fy ) == 0 ) |
1121 | 0 | { |
1122 | 0 | dx /= m_motionVectorFactor; |
1123 | 0 | dy /= m_motionVectorFactor; |
1124 | |
|
1125 | 0 | const int origStride = orig.Y().stride; |
1126 | 0 | const Pel* org = orig.Y().buf + x + y * origStride; |
1127 | 0 | const int buffStride = buffer.Y().stride; |
1128 | 0 | const Pel* buf = buffer.Y().buf + x + dx + ( y + dy ) * buffStride; |
1129 | |
|
1130 | 0 | return m_motionErrorLumaInt8( org, origStride, buf, buffStride, w, h, besterror ); |
1131 | 0 | } |
1132 | 0 | else if( m_lowResFltSearch ) |
1133 | 0 | { |
1134 | 0 | dx >>= 4; |
1135 | 0 | dy >>= 4; |
1136 | |
|
1137 | 0 | const int origStride = orig.Y().stride; |
1138 | 0 | const Pel* org = orig.Y().buf + x + y * origStride; |
1139 | 0 | const int buffStride = buffer.Y().stride; |
1140 | 0 | const Pel* buf = buffer.Y().buf + x + dx + ( y + dy ) * buffStride; |
1141 | |
|
1142 | 0 | const int16_t *xFilter = m_interpolationFilter4[fx]; |
1143 | 0 | const int16_t *yFilter = m_interpolationFilter4[fy]; |
1144 | |
|
1145 | 0 | return m_motionErrorLumaFrac8[1]( org, origStride, buf, buffStride, w, h, xFilter, yFilter, m_encCfg->m_internalBitDepth[CH_L], besterror ); |
1146 | 0 | } |
1147 | 0 | else |
1148 | 0 | { |
1149 | 0 | dx >>= 4; |
1150 | 0 | dy >>= 4; |
1151 | |
|
1152 | 0 | const int origStride = orig.Y().stride; |
1153 | 0 | const Pel* org = orig.Y().buf + x + y * origStride; |
1154 | 0 | const int buffStride = buffer.Y().stride; |
1155 | 0 | const Pel* buf = buffer.Y().buf + x + dx + ( y + dy ) * buffStride; |
1156 | |
|
1157 | 0 | const int16_t *xFilter = m_interpolationFilter8[fx]; |
1158 | 0 | const int16_t *yFilter = m_interpolationFilter8[fy]; |
1159 | |
|
1160 | 0 | return m_motionErrorLumaFrac8[0]( org, origStride, buf, buffStride, w,h, xFilter, yFilter, m_encCfg->m_internalBitDepth[CH_L], besterror ); |
1161 | 0 | } |
1162 | | |
1163 | 0 | return error; |
1164 | 0 | } |
1165 | | |
1166 | | bool MCTF::estimateLumaLn( std::atomic_int& blockX_, std::atomic_int* prevLineX, Array2D<MotionVector> &mvs, const PelStorage &orig, const PelStorage &buffer, const int blockSize, |
1167 | | const Array2D<MotionVector> *previous, const int factor, const bool doubleRes, int blockY, int bitDepth ) const |
1168 | 0 | { |
1169 | 0 | PROFILER_SCOPE_AND_STAGE( 1, _TPROF, P_MCTF_SEARCH ); |
1170 | |
|
1171 | 0 | const int stepSize = blockSize; |
1172 | 0 | const int origWidth = orig.Y().width; |
1173 | |
|
1174 | 0 | for( int blockX = blockX_.load(); blockX + 8 <= origWidth; blockX += stepSize, blockX_.store( blockX) ) |
1175 | 0 | { |
1176 | 0 | if( prevLineX && blockX >= prevLineX->load() ) return false; |
1177 | | |
1178 | 0 | int range = doubleRes ? 0 : ( m_searchPttrn == 2 ? 3 : 5 ); |
1179 | 0 | const int stepSize = blockSize; |
1180 | |
|
1181 | 0 | MotionVector best; |
1182 | |
|
1183 | 0 | if (previous == NULL) |
1184 | 0 | { |
1185 | 0 | range = 8; |
1186 | 0 | } |
1187 | 0 | else |
1188 | 0 | { |
1189 | 0 | for( int py = -1; py <= 1; py++ ) |
1190 | 0 | { |
1191 | 0 | int testy = blockY / (2 * blockSize) + py; |
1192 | 0 | if( (testy >= 0) && (testy < previous->h()) ) |
1193 | 0 | { |
1194 | 0 | for (int px = -1; px <= 1; px++) |
1195 | 0 | { |
1196 | 0 | int testx = blockX / (2 * blockSize) + px; |
1197 | 0 | if ((testx >= 0) && (testx < previous->w()) ) |
1198 | 0 | { |
1199 | 0 | const MotionVector& old = previous->get(testx, testy); |
1200 | 0 | int error = motionErrorLuma(orig, buffer, blockX, blockY, old.x * factor, old.y * factor, blockSize, best.error); |
1201 | 0 | if (error < best.error) |
1202 | 0 | { |
1203 | 0 | best.set(old.x * factor, old.y * factor, error); |
1204 | 0 | } |
1205 | 0 | } |
1206 | 0 | } |
1207 | 0 | } |
1208 | 0 | } |
1209 | |
|
1210 | 0 | int error = motionErrorLuma( orig, buffer, blockX, blockY, 0, 0, blockSize, best.error ); |
1211 | 0 | if( error < best.error ) |
1212 | 0 | { |
1213 | 0 | best.set( 0, 0, error ); |
1214 | 0 | } |
1215 | 0 | } |
1216 | 0 | MotionVector prevBest = best; |
1217 | 0 | const int d = previous == NULL && m_searchPttrn == 2 ? 2 : 1; |
1218 | 0 | for( int y2 = prevBest.y / m_motionVectorFactor - range; y2 <= prevBest.y / m_motionVectorFactor + range; y2 += d ) |
1219 | 0 | { |
1220 | 0 | for( int x2 = prevBest.x / m_motionVectorFactor - range; x2 <= prevBest.x / m_motionVectorFactor + range; x2 += d ) |
1221 | 0 | { |
1222 | 0 | int error = motionErrorLuma( orig, buffer, blockX, blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize, best.error ); |
1223 | 0 | if( error < best.error ) |
1224 | 0 | { |
1225 | 0 | best.set( x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, error ); |
1226 | 0 | } |
1227 | 0 | } |
1228 | 0 | } |
1229 | 0 | if (doubleRes) |
1230 | 0 | { // merge into one loop, probably with precision array (here [12, 3] or maybe [4, 1]) with setable number of iterations |
1231 | 0 | PROFILER_SCOPE_AND_STAGE( 1, _TPROF, P_MCTF_SEARCH_SUBPEL ); |
1232 | |
|
1233 | 0 | prevBest = best; |
1234 | 0 | int doubleRange = m_searchPttrn ? 6 : 12; |
1235 | 0 | const int d1 = m_searchPttrn == 2 ? 6 : 4; |
1236 | | |
1237 | | // first iteration, 49 - 1 or 16 checks or 9 - 1 checks |
1238 | 0 | for( int y2 = -doubleRange; y2 <= doubleRange; y2 += d1 ) |
1239 | 0 | { |
1240 | 0 | for( int x2 = -doubleRange; x2 <= doubleRange; x2 += d1 ) |
1241 | 0 | { |
1242 | 0 | if( x2 || y2 ) |
1243 | 0 | { |
1244 | 0 | int error = motionErrorLuma( orig, buffer, blockX, blockY, prevBest.x + x2, prevBest.y + y2, blockSize, best.error ); |
1245 | 0 | if( error < best.error ) |
1246 | 0 | { |
1247 | 0 | best.set( prevBest.x + x2, prevBest.y + y2, error ); |
1248 | 0 | } |
1249 | 0 | } |
1250 | 0 | } |
1251 | 0 | } |
1252 | |
|
1253 | 0 | prevBest = best; |
1254 | 0 | doubleRange = 2; |
1255 | | // second iteration, 9 - 1 checks |
1256 | 0 | for( int y2 = -doubleRange; y2 <= doubleRange; y2 += 2 ) |
1257 | 0 | { |
1258 | 0 | for( int x2 = -doubleRange; x2 <= doubleRange; x2 += 2 ) |
1259 | 0 | { |
1260 | 0 | if( x2 || y2 ) |
1261 | 0 | { |
1262 | 0 | int error = motionErrorLuma( orig, buffer, blockX, blockY, prevBest.x + x2, prevBest.y + y2, blockSize, best.error ); |
1263 | 0 | if( error < best.error ) |
1264 | 0 | { |
1265 | 0 | best.set( prevBest.x + x2, prevBest.y + y2, error ); |
1266 | 0 | } |
1267 | 0 | } |
1268 | 0 | } |
1269 | 0 | } |
1270 | |
|
1271 | 0 | prevBest = best; |
1272 | 0 | doubleRange = 1; |
1273 | | // third iteration, 9 - 1 checks |
1274 | 0 | for (int y2 = -doubleRange; y2 <= doubleRange; y2++) |
1275 | 0 | { |
1276 | 0 | for (int x2 = -doubleRange; x2 <= doubleRange; x2++) |
1277 | 0 | { |
1278 | 0 | if( x2 || y2 ) |
1279 | 0 | { |
1280 | 0 | int error = motionErrorLuma( orig, buffer, blockX, blockY, prevBest.x + x2, prevBest.y + y2, blockSize, best.error ); |
1281 | 0 | if( error < best.error ) |
1282 | 0 | { |
1283 | 0 | best.set( prevBest.x + x2, prevBest.y + y2, error ); |
1284 | 0 | } |
1285 | 0 | } |
1286 | 0 | } |
1287 | 0 | } |
1288 | 0 | } |
1289 | 0 | if( blockY > 0 ) |
1290 | 0 | { |
1291 | 0 | MotionVector aboveMV = mvs.get( blockX / stepSize, ( blockY - stepSize ) / stepSize ); |
1292 | 0 | int error = motionErrorLuma( orig, buffer, blockX, blockY, aboveMV.x, aboveMV.y, blockSize, best.error ); |
1293 | 0 | if( error < best.error ) |
1294 | 0 | { |
1295 | 0 | best.set( aboveMV.x, aboveMV.y, error ); |
1296 | 0 | } |
1297 | 0 | } |
1298 | 0 | if( blockX > 0 ) |
1299 | 0 | { |
1300 | 0 | MotionVector leftMV = mvs.get( ( blockX - stepSize ) / stepSize, blockY / stepSize ); |
1301 | 0 | int error = motionErrorLuma( orig, buffer, blockX, blockY, leftMV.x, leftMV.y, blockSize, best.error ); |
1302 | 0 | if( error < best.error ) |
1303 | 0 | { |
1304 | 0 | best.set( leftMV.x, leftMV.y, error ); |
1305 | 0 | } |
1306 | 0 | } |
1307 | |
|
1308 | 0 | if( doubleRes ) |
1309 | 0 | { |
1310 | 0 | const int w = std::min<int>( blockSize, orig.Y().width - blockX ) & ~7; |
1311 | 0 | const int h = std::min<int>( blockSize, orig.Y().height - blockY ) & ~7; |
1312 | |
|
1313 | 0 | CHECKD(bitDepth>10, "unsupported internal bit depth (also in calcVar)" ); |
1314 | 0 | const double bdScale = double(1<<(2*(10-bitDepth))); |
1315 | 0 | const double dvar = m_calcVar( orig.Y().bufAt( blockX, blockY ), orig.Y().stride, w, h ) * bdScale; |
1316 | 0 | const double mse = best.error * bdScale / double( w * h ); |
1317 | |
|
1318 | 0 | best.error = ( int ) ( 20 * ( ( best.error*bdScale + 5.0 ) / ( dvar + 5.0 ) ) + mse / 50.0 ); |
1319 | 0 | best.rmsme = uint16_t( 0.5 + sqrt( mse ) ); |
1320 | 0 | best.overlap = ( ( double ) w * h ) / ( m_mctfUnitSize * m_mctfUnitSize ); |
1321 | 0 | } |
1322 | | |
1323 | 0 | mvs.get(blockX / stepSize, blockY / stepSize) = best; |
1324 | 0 | } |
1325 | | |
1326 | 0 | return true; |
1327 | 0 | } |
1328 | | |
1329 | | void MCTF::motionEstimationLuma(Array2D<MotionVector> &mvs, const PelStorage &orig, const PelStorage &buffer, const int blockSize, const Array2D<MotionVector> *previous, const int factor, const bool doubleRes) const |
1330 | 0 | { |
1331 | 0 | const int stepSize = blockSize; |
1332 | 0 | const int origHeight = orig.Y().height; |
1333 | 0 | const int bitDepth = m_encCfg->m_internalBitDepth[CH_L]; |
1334 | |
|
1335 | 0 | if( m_threadPool ) |
1336 | 0 | { |
1337 | 0 | struct EstParams |
1338 | 0 | { |
1339 | 0 | std::atomic_int blockX; |
1340 | 0 | std::atomic_int* prevLineX; |
1341 | 0 | Array2D<MotionVector> *mvs; |
1342 | 0 | const PelStorage* orig; |
1343 | 0 | const PelStorage* buffer; |
1344 | 0 | const Array2D<MotionVector> *previous; |
1345 | 0 | int blockSize; |
1346 | 0 | int factor; |
1347 | 0 | bool doubleRes; |
1348 | 0 | int blockY; |
1349 | 0 | int bitDepth; |
1350 | 0 | const MCTF* mctf; |
1351 | 0 | }; |
1352 | |
|
1353 | 0 | std::vector<EstParams> EstParamsArray( origHeight/stepSize + 1 ); |
1354 | |
|
1355 | 0 | WaitCounter taskCounter; |
1356 | |
|
1357 | 0 | for( int n = 0, blockY = 0; blockY + 8 <= origHeight; blockY += stepSize, n++ ) |
1358 | 0 | { |
1359 | 0 | static auto task = []( int tId, EstParams* params) |
1360 | 0 | { |
1361 | 0 | ITT_TASKSTART( itt_domain_MCTF_est, itt_handle_est ); |
1362 | |
|
1363 | 0 | bool ret = params->mctf->estimateLumaLn( params->blockX, params->prevLineX, *params->mvs, *params->orig, *params->buffer, params->blockSize, params->previous, params->factor, params->doubleRes, params->blockY, params->bitDepth ); |
1364 | |
|
1365 | 0 | ITT_TASKEND( itt_domain_MCTF_est, itt_handle_est ); |
1366 | 0 | return ret; |
1367 | 0 | }; |
1368 | |
|
1369 | 0 | EstParams& cEstParams = EstParamsArray[n]; |
1370 | 0 | cEstParams.blockX = 0; |
1371 | 0 | cEstParams.prevLineX = n == 0 ? nullptr : &EstParamsArray[n-1].blockX; |
1372 | 0 | cEstParams.mvs = &mvs; |
1373 | 0 | cEstParams.orig = &orig; |
1374 | 0 | cEstParams.buffer = &buffer; |
1375 | 0 | cEstParams.previous = previous; |
1376 | 0 | cEstParams.blockSize = blockSize; |
1377 | 0 | cEstParams.factor = factor; |
1378 | 0 | cEstParams.doubleRes = doubleRes; |
1379 | 0 | cEstParams.mctf = this; |
1380 | 0 | cEstParams.blockY = blockY; |
1381 | 0 | cEstParams.bitDepth = bitDepth; |
1382 | |
|
1383 | 0 | m_threadPool->addBarrierTask<EstParams>( task, &cEstParams, &taskCounter); |
1384 | 0 | } |
1385 | 0 | taskCounter.wait(); |
1386 | 0 | } |
1387 | 0 | else |
1388 | 0 | { |
1389 | 0 | for( int blockY = 0; blockY + 8 <= origHeight; blockY += stepSize ) |
1390 | 0 | { |
1391 | 0 | std::atomic_int blockX( 0 ), prevBlockX( orig.Y().width + stepSize ); |
1392 | 0 | estimateLumaLn( blockX, blockY ? &prevBlockX : nullptr, mvs, orig, buffer, blockSize, previous, factor, doubleRes, blockY, bitDepth ); |
1393 | 0 | } |
1394 | |
|
1395 | 0 | } |
1396 | 0 | } |
1397 | | |
1398 | | void MCTF::xFinalizeBlkLine( const PelStorage &orgPic, std::deque<TemporalFilterSourcePicInfo> &srcFrameInfo, PelStorage &newOrgPic, int yStart, const double sigmaSqCh[MAX_NUM_CH], double overallStrength ) const |
1399 | 0 | { |
1400 | 0 | PROFILER_SCOPE_AND_STAGE( 1, _TPROF, P_MCTF_APPLY ); |
1401 | |
|
1402 | 0 | const int numRefs = int(srcFrameInfo.size()); |
1403 | |
|
1404 | 0 | int refStrengthRow = m_encCfg->m_picReordering ? 0 : 1; |
1405 | | |
1406 | | // max 64*64*8*2 = 2^(6+6+3+1)=2^16=64kbps, usually 16*16*8*2=2^(4+4+3+1)=4kbps, and allow for overread of one line |
1407 | 0 | Pel* dstBufs = ( Pel* ) alloca( sizeof( Pel ) * ( numRefs * m_mctfUnitSize * m_mctfUnitSize + m_mctfUnitSize ) ); |
1408 | |
|
1409 | 0 | for( int c = 0; c < getNumberValidComponents( m_encCfg->m_internChromaFormat ); c++ ) |
1410 | 0 | { |
1411 | 0 | const ComponentID compID = ( ComponentID ) c; |
1412 | 0 | const int height = orgPic.bufs[c].height; |
1413 | 0 | const int width = orgPic.bufs[c].width; |
1414 | |
|
1415 | 0 | const double sigmaSq = sigmaSqCh[ toChannelType( compID) ]; |
1416 | 0 | const double weightScaling = overallStrength * ( isChroma( compID ) ? m_chromaFactor : 0.4 ); |
1417 | 0 | const ClpRng clpRng{ m_encCfg->m_internalBitDepth[toChannelType( compID )] }; |
1418 | |
|
1419 | 0 | const int blkSizeY = m_mctfUnitSize >> getComponentScaleY( compID, m_encCfg->m_internChromaFormat ); |
1420 | 0 | const int blkSizeX = m_mctfUnitSize >> getComponentScaleX( compID, m_encCfg->m_internChromaFormat ); |
1421 | 0 | const int yOut = yStart >> getComponentScaleY( compID, m_encCfg->m_internChromaFormat ); |
1422 | |
|
1423 | 0 | for( int by = yOut, yBlkAddr = yStart / m_mctfUnitSize; by < std::min( yOut + blkSizeY, height ); by += blkSizeY, yBlkAddr++ ) |
1424 | 0 | { |
1425 | 0 | const int h = std::min( blkSizeY, height - by ); |
1426 | |
|
1427 | 0 | for( int bx = 0, xBlkAddr = 0; bx < width; bx += blkSizeX, xBlkAddr++ ) |
1428 | 0 | { |
1429 | 0 | const int w = std::min( blkSizeX, width - bx ); |
1430 | |
|
1431 | 0 | const int csx = getComponentScaleX( compID, m_encCfg->m_internChromaFormat ); |
1432 | 0 | const int csy = getComponentScaleY( compID, m_encCfg->m_internChromaFormat ); |
1433 | |
|
1434 | 0 | const Pel* correctedPics[2 * VVENC_MCTF_RANGE] = { nullptr, }; |
1435 | 0 | Pel* currDst = dstBufs; |
1436 | 0 | int verror [2 * VVENC_MCTF_RANGE] = { 0, }; |
1437 | 0 | double refStr [2 * VVENC_MCTF_RANGE] = { 0.0, }; |
1438 | |
|
1439 | 0 | for( int i = 0; i < numRefs; i++, currDst += w * h ) |
1440 | 0 | { |
1441 | 0 | const Pel* srcImage = srcFrameInfo[i].picBuffer.bufs[compID].buf; |
1442 | 0 | const int srcStride = srcFrameInfo[i].picBuffer.bufs[compID].stride; |
1443 | |
|
1444 | 0 | Pel* dst = currDst; |
1445 | 0 | const int dstStride = w; |
1446 | 0 | correctedPics[i] = dst; |
1447 | |
|
1448 | 0 | const MotionVector& mv = srcFrameInfo[i].mvs.get( xBlkAddr, yBlkAddr); |
1449 | 0 | const int dx = mv.x >> csx; |
1450 | 0 | const int dy = mv.y >> csy; |
1451 | 0 | const int xInt = mv.x >> ( 4 + csx ); |
1452 | 0 | const int yInt = mv.y >> ( 4 + csy ); |
1453 | |
|
1454 | 0 | const int yOffset = by + yInt; |
1455 | 0 | const int xOffset = bx + xInt; |
1456 | 0 | const Pel* src = srcImage + yOffset * srcStride + xOffset; |
1457 | |
|
1458 | 0 | if( m_lowResFltApply ) // || isChroma( compID ) |
1459 | 0 | { |
1460 | 0 | const int16_t* xFilter = m_interpolationFilter4[dx & 0xf]; |
1461 | 0 | const int16_t* yFilter = m_interpolationFilter4[dy & 0xf]; // will add 6 bit. |
1462 | |
|
1463 | 0 | m_applyFrac[toChannelType( compID )][1]( src, srcStride, dst, dstStride, w, h, xFilter, yFilter, m_encCfg->m_internalBitDepth[toChannelType( compID )] ); |
1464 | 0 | } |
1465 | 0 | else |
1466 | 0 | { |
1467 | 0 | const int16_t* xFilter = m_interpolationFilter8[dx & 0xf]; |
1468 | 0 | const int16_t* yFilter = m_interpolationFilter8[dy & 0xf]; // will add 6 bit. |
1469 | |
|
1470 | 0 | m_applyFrac[toChannelType( compID )][0]( src, srcStride, dst, dstStride, w, h, xFilter, yFilter, m_encCfg->m_internalBitDepth[toChannelType( compID )] ); |
1471 | 0 | } |
1472 | |
|
1473 | 0 | if( mv.rmsme > 0 && m_encCfg->m_QP <= 32 && w == h && w <= 32 ) // "deblocking" |
1474 | 0 | { |
1475 | 0 | m_applyPlanarCorrection( orgPic.bufs[c].bufAt( bx, by ), orgPic.bufs[c].stride, dst, dstStride, w, h, clpRng, mv.rmsme ); |
1476 | 0 | } |
1477 | |
|
1478 | 0 | verror[i] = mv.error; |
1479 | 0 | refStr[i] = m_refStrengths[refStrengthRow][srcFrameInfo[i].index]; |
1480 | 0 | } |
1481 | |
|
1482 | 0 | m_applyBlock( orgPic.bufs[c], newOrgPic.bufs[c], CompArea( compID, orgPic.chromaFormat, Area( bx, by, w, h ) ), clpRng, correctedPics, numRefs, verror, refStr, weightScaling, sigmaSq ); |
1483 | 0 | } |
1484 | 0 | } |
1485 | 0 | } |
1486 | 0 | } |
1487 | | |
1488 | | void MCTF::bilateralFilter(const PelStorage& orgPic, std::deque<TemporalFilterSourcePicInfo>& srcFrameInfo, PelStorage& newOrgPic, double overallStrength) const |
1489 | 0 | { |
1490 | 0 | const double lumaSigmaSq = m_sigmaMultiplier * ( 128.0 + 3.0 / 256.0 * m_encCfg->m_QP * m_encCfg->m_QP * m_encCfg->m_QP ); |
1491 | 0 | const double chromaSigmaSq = 30 * 30; |
1492 | |
|
1493 | 0 | double sigmaSqCh[MAX_NUM_CH]; |
1494 | 0 | for(int c=0; c< getNumberValidChannels(m_encCfg->m_internChromaFormat); c++) |
1495 | 0 | { |
1496 | 0 | const ChannelType ch=(ChannelType)c; |
1497 | 0 | const Pel maxSampleValue = (1<<m_encCfg->m_internalBitDepth[ch])-1; |
1498 | 0 | const double bitDepthDiffWeighting=1024.0 / (maxSampleValue+1); |
1499 | 0 | sigmaSqCh[ch] = ( isChroma( ch ) ? chromaSigmaSq : lumaSigmaSq ) / ( bitDepthDiffWeighting * bitDepthDiffWeighting ); |
1500 | 0 | } |
1501 | |
|
1502 | 0 | if( m_threadPool ) |
1503 | 0 | { |
1504 | 0 | struct FltParams |
1505 | 0 | { |
1506 | 0 | const PelStorage *orgPic; |
1507 | 0 | std::deque<TemporalFilterSourcePicInfo> *srcFrameInfo; |
1508 | 0 | PelStorage *newOrgPic; |
1509 | 0 | const double *sigmaSqCh; |
1510 | 0 | double overallStrength; |
1511 | 0 | const MCTF* mctf; |
1512 | 0 | int yStart; |
1513 | 0 | }; |
1514 | |
|
1515 | 0 | std::vector<FltParams> FltParamsArray( orgPic.Y().height/ m_mctfUnitSize + 1 ); |
1516 | |
|
1517 | 0 | WaitCounter taskCounter; |
1518 | |
|
1519 | 0 | for (int n = 0, yStart = 0; yStart < orgPic.Y().height; yStart += m_mctfUnitSize, n++) |
1520 | 0 | { |
1521 | 0 | static auto task = []( int tId, FltParams* params) |
1522 | 0 | { |
1523 | 0 | ITT_TASKSTART( itt_domain_MCTF_flt, itt_handle_flt ); |
1524 | |
|
1525 | 0 | params->mctf->xFinalizeBlkLine( *params->orgPic, *params->srcFrameInfo, *params->newOrgPic, params->yStart, params->sigmaSqCh, params->overallStrength ); |
1526 | |
|
1527 | 0 | ITT_TASKEND( itt_domain_MCTF_flt, itt_handle_flt ); |
1528 | 0 | return true; |
1529 | 0 | }; |
1530 | |
|
1531 | 0 | FltParams& cFltParams = FltParamsArray[n]; |
1532 | 0 | cFltParams.orgPic = &orgPic; |
1533 | 0 | cFltParams.srcFrameInfo = &srcFrameInfo; |
1534 | 0 | cFltParams.newOrgPic = &newOrgPic; |
1535 | 0 | cFltParams.sigmaSqCh = sigmaSqCh; |
1536 | 0 | cFltParams.overallStrength = overallStrength; |
1537 | 0 | cFltParams.mctf = this; |
1538 | 0 | cFltParams.yStart = yStart; |
1539 | |
|
1540 | 0 | m_threadPool->addBarrierTask<FltParams>( task, &cFltParams, &taskCounter); |
1541 | 0 | } |
1542 | 0 | taskCounter.wait(); |
1543 | 0 | } |
1544 | 0 | else |
1545 | 0 | { |
1546 | 0 | for (int yStart = 0; yStart < orgPic.Y().height; yStart += m_mctfUnitSize ) |
1547 | 0 | { |
1548 | 0 | xFinalizeBlkLine( orgPic, srcFrameInfo, newOrgPic, yStart, sigmaSqCh, overallStrength ); |
1549 | 0 | } |
1550 | 0 | } |
1551 | 0 | } |
1552 | | |
1553 | | } // namespace vvenc |
1554 | | |
1555 | | //! \} |