/src/x265/source/encoder/sao.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /***************************************************************************** |
2 | | * Copyright (C) 2013-2020 MulticoreWare, Inc |
3 | | * |
4 | | * Authors: Steve Borho <steve@borho.org> |
5 | | * Min Chen <chenm003@163.com> |
6 | | * Praveen Kumar Tiwari <praveen@multicorewareinc.com> |
7 | | * |
8 | | * This program is free software; you can redistribute it and/or modify |
9 | | * it under the terms of the GNU General Public License as published by |
10 | | * the Free Software Foundation; either version 2 of the License, or |
11 | | * (at your option) any later version. |
12 | | * |
13 | | * This program is distributed in the hope that it will be useful, |
14 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | | * GNU General Public License for more details. |
17 | | * |
18 | | * You should have received a copy of the GNU General Public License |
19 | | * along with this program; if not, write to the Free Software |
20 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. |
21 | | * |
22 | | * This program is also available under a commercial proprietary license. |
23 | | * For more information, contact us at license @ x265.com. |
24 | | *****************************************************************************/ |
25 | | |
26 | | #include "common.h" |
27 | | #include "frame.h" |
28 | | #include "framedata.h" |
29 | | #include "picyuv.h" |
30 | | #include "sao.h" |
31 | | |
32 | | namespace { |
33 | | |
/* Divide num by den and round to the nearest integer.  Negative numerators
 * are negated, rounded the same way as positive ones, and negated back, so
 * the result is symmetric around zero. */
inline int32_t roundIBDI(int32_t num, int32_t den)
{
    const int32_t twiceDen = den * 2;

    if (num < 0)
        return -((-num * 2 + den) / twiceDen);

    return (num * 2 + den) / twiceDen;
}
38 | | |
/* Branch-free sign of x: -1, 0 or +1 (TODO: this is a dup, make common).
 * Relies on a 32-bit int whose right shift is arithmetic. */
inline int8_t signOf(int x)
{
    const int negativeMask = x >> 31;                      // all ones when x < 0, else 0
    const int positiveBit  = (int)(((uint32_t)-x) >> 31);  // 1 when x > 0, else 0
    return (int8_t)(negativeMask | positiveBit);
}
44 | | |
/* Three-way comparison of a and b: -1 when a < b, +1 when a > b, 0 when equal. */
inline int signOf2(const int a, const int b)
{
    // NOTE: keep the two comparisons in exactly this order; the code generated
    // by ICL, VC and GCC depends strongly on it.
    int sign = 0;

    if (a < b)
    {
        sign = -1;
    }
    if (a > b)
    {
        sign = 1;
    }

    return sign;
}
55 | | |
/* Evaluate (count * offset - 2 * offsetOrg) * offset, i.e.
 * count * offset^2 - 2 * offset * offsetOrg.
 * NOTE(review): presumably the change in squared error obtained by adding
 * `offset` to `count` samples whose accumulated difference is `offsetOrg`
 * -- confirm against the callers.
 *
 * All arithmetic is carried out in 64 bits: the operands are 32-bit, and the
 * intermediate products can exceed the int32_t range before the result is
 * widened to the int64_t return type. */
inline int64_t estSaoDist(int32_t count, int32_t offset, int32_t offsetOrg)
{
    const int64_t wideOffset = offset;
    return (count * wideOffset - 2 * (int64_t)offsetOrg) * wideOffset;
}
60 | | } // end anonymous namespace |
61 | | |
62 | | |
63 | | namespace X265_NS { |
64 | | |
65 | | const uint32_t SAO::s_eoTable[NUM_EDGETYPE] = |
66 | | { |
67 | | 1, // 0 |
68 | | 2, // 1 |
69 | | 0, // 2 |
70 | | 3, // 3 |
71 | | 4 // 4 |
72 | | }; |
73 | | |
74 | | SAO::SAO() |
75 | 14.8k | { |
76 | 14.8k | m_countPreDblk = NULL; |
77 | 14.8k | m_offsetOrgPreDblk = NULL; |
78 | 14.8k | m_refDepth = 0; |
79 | 14.8k | m_param = NULL; |
80 | 14.8k | m_clipTable = NULL; |
81 | 14.8k | m_clipTableBase = NULL; |
82 | 14.8k | m_tmpU[0] = NULL; |
83 | 14.8k | m_tmpU[1] = NULL; |
84 | 14.8k | m_tmpU[2] = NULL; |
85 | 14.8k | m_tmpL1[0] = NULL; |
86 | 14.8k | m_tmpL1[1] = NULL; |
87 | 14.8k | m_tmpL1[2] = NULL; |
88 | 14.8k | m_tmpL2[0] = NULL; |
89 | 14.8k | m_tmpL2[1] = NULL; |
90 | 14.8k | m_tmpL2[2] = NULL; |
91 | 14.8k | m_depthSaoRate = NULL; |
92 | 14.8k | } |
93 | | |
94 | | bool SAO::create(x265_param* param, int initCommon) |
95 | 14.8k | { |
96 | 14.8k | m_param = param; |
97 | 14.8k | m_chromaFormat = param->internalCsp; |
98 | 14.8k | m_hChromaShift = CHROMA_H_SHIFT(param->internalCsp); |
99 | 14.8k | m_vChromaShift = CHROMA_V_SHIFT(param->internalCsp); |
100 | | |
101 | 14.8k | m_numCuInWidth = (m_param->sourceWidth + m_param->maxCUSize - 1) / m_param->maxCUSize; |
102 | 14.8k | m_numCuInHeight = (m_param->sourceHeight + m_param->maxCUSize - 1) / m_param->maxCUSize; |
103 | | |
104 | 14.8k | const pixel maxY = (1 << X265_DEPTH) - 1; |
105 | 14.8k | const pixel rangeExt = maxY >> 1; |
106 | 14.8k | int numCtu = m_numCuInWidth * m_numCuInHeight; |
107 | | |
108 | 59.5k | for (int i = 0; i < (param->internalCsp != X265_CSP_I400 ? 3 : 1); i++) |
109 | 44.6k | { |
110 | 44.6k | CHECKED_MALLOC(m_tmpL1[i], pixel, m_param->maxCUSize + 1); |
111 | 44.6k | CHECKED_MALLOC(m_tmpL2[i], pixel, m_param->maxCUSize + 1); |
112 | | |
113 | | // SAO asm code will read 1 pixel before and after, so pad by 2 |
114 | | // NOTE: m_param->sourceWidth+2 enough, to avoid condition check in copySaoAboveRef(), I alloc more up to 63 bytes in here |
115 | 44.6k | CHECKED_MALLOC(m_tmpU[i], pixel, m_numCuInWidth * m_param->maxCUSize + 2 + 32); |
116 | 44.6k | m_tmpU[i] += 1; |
117 | 44.6k | } |
118 | | |
119 | 14.8k | if (initCommon) |
120 | 3.11k | { |
121 | 3.11k | if (m_param->bSaoNonDeblocked) |
122 | 0 | { |
123 | 0 | CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu); |
124 | 0 | CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu); |
125 | 0 | } |
126 | 3.11k | CHECKED_MALLOC(m_depthSaoRate, double, 2 * SAO_DEPTHRATE_SIZE); |
127 | | |
128 | 3.11k | m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 0] = 0; |
129 | 3.11k | m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 1] = 0; |
130 | 3.11k | m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 2] = 0; |
131 | 3.11k | m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 3] = 0; |
132 | 3.11k | m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 0] = 0; |
133 | 3.11k | m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 1] = 0; |
134 | 3.11k | m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 2] = 0; |
135 | 3.11k | m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 3] = 0; |
136 | | |
137 | 3.11k | CHECKED_MALLOC(m_clipTableBase, pixel, maxY + 2 * rangeExt); |
138 | 3.11k | m_clipTable = &(m_clipTableBase[rangeExt]); |
139 | | |
140 | | // Share with fast clip lookup table |
141 | | |
142 | 399k | for (int i = 0; i < rangeExt; i++) |
143 | 396k | m_clipTableBase[i] = 0; |
144 | | |
145 | 798k | for (int i = 0; i < maxY; i++) |
146 | 795k | m_clipTable[i] = (pixel)i; |
147 | | |
148 | 399k | for (int i = maxY; i < maxY + rangeExt; i++) |
149 | 396k | m_clipTable[i] = maxY; |
150 | | |
151 | 3.11k | } |
152 | 11.7k | else |
153 | 11.7k | { |
154 | | // must initialize these common pointer outside of function |
155 | 11.7k | m_countPreDblk = NULL; |
156 | 11.7k | m_offsetOrgPreDblk = NULL; |
157 | 11.7k | m_clipTableBase = NULL; |
158 | 11.7k | m_clipTable = NULL; |
159 | 11.7k | } |
160 | | |
161 | 14.8k | return true; |
162 | | |
163 | 0 | fail: |
164 | 0 | return false; |
165 | 14.8k | } |
166 | | |
167 | | void SAO::createFromRootNode(SAO* root) |
168 | 11.7k | { |
169 | 11.7k | X265_CHECK(m_countPreDblk == NULL, "duplicate initialize on m_countPreDblk"); |
170 | 11.7k | X265_CHECK(m_offsetOrgPreDblk == NULL, "duplicate initialize on m_offsetOrgPreDblk"); |
171 | 11.7k | X265_CHECK(m_depthSaoRate == NULL, "duplicate initialize on m_depthSaoRate"); |
172 | 11.7k | X265_CHECK(m_clipTableBase == NULL, "duplicate initialize on m_clipTableBase"); |
173 | 11.7k | X265_CHECK(m_clipTable == NULL, "duplicate initialize on m_clipTable"); |
174 | | |
175 | 11.7k | m_countPreDblk = root->m_countPreDblk; |
176 | 11.7k | m_offsetOrgPreDblk = root->m_offsetOrgPreDblk; |
177 | 11.7k | m_depthSaoRate = root->m_depthSaoRate; |
178 | 11.7k | m_clipTableBase = root->m_clipTableBase; // Unnecessary |
179 | 11.7k | m_clipTable = root->m_clipTable; |
180 | 11.7k | } |
181 | | |
182 | | void SAO::destroy(int destoryCommon) |
183 | 14.8k | { |
184 | 59.5k | for (int i = 0; i < 3; i++) |
185 | 44.6k | { |
186 | 44.6k | if (m_tmpL1[i]) |
187 | 44.6k | { |
188 | 44.6k | X265_FREE(m_tmpL1[i]); |
189 | 44.6k | m_tmpL1[i] = NULL; |
190 | 44.6k | } |
191 | | |
192 | 44.6k | if (m_tmpL2[i]) |
193 | 44.6k | { |
194 | 44.6k | X265_FREE(m_tmpL2[i]); |
195 | 44.6k | m_tmpL2[i] = NULL; |
196 | 44.6k | } |
197 | | |
198 | 44.6k | if (m_tmpU[i]) |
199 | 44.6k | { |
200 | 44.6k | X265_FREE(m_tmpU[i] - 1); |
201 | 44.6k | m_tmpU[i] = NULL; |
202 | 44.6k | } |
203 | 44.6k | } |
204 | | |
205 | 14.8k | if (destoryCommon) |
206 | 3.11k | { |
207 | 3.11k | if (m_param->bSaoNonDeblocked) |
208 | 0 | { |
209 | 0 | X265_FREE_ZERO(m_countPreDblk); |
210 | 0 | X265_FREE_ZERO(m_offsetOrgPreDblk); |
211 | 0 | } |
212 | 3.11k | X265_FREE_ZERO(m_depthSaoRate); |
213 | 3.11k | X265_FREE_ZERO(m_clipTableBase); |
214 | 3.11k | } |
215 | 14.8k | } |
216 | | |
217 | | /* allocate memory for SAO parameters */ |
218 | | void SAO::allocSaoParam(SAOParam* saoParam) const |
219 | 698 | { |
220 | 698 | int planes = (m_param->internalCsp != X265_CSP_I400) ? 3 : 1; |
221 | 698 | saoParam->numCuInWidth = m_numCuInWidth; |
222 | | |
223 | 2.79k | for (int i = 0; i < planes; i++) |
224 | 2.09k | saoParam->ctuParam[i] = new SaoCtuParam[m_numCuInHeight * m_numCuInWidth]; |
225 | 698 | } |
226 | | |
227 | | void SAO::startSlice(Frame* frame, Entropy& initState) |
228 | 3.17k | { |
229 | 3.17k | m_frame = frame; |
230 | 3.17k | Slice* slice = m_frame->m_encData->m_slice; |
231 | | |
232 | 3.17k | switch (slice->m_sliceType) |
233 | 3.17k | { |
234 | 3.17k | case I_SLICE: |
235 | 3.17k | m_refDepth = 0; |
236 | 3.17k | break; |
237 | 0 | case P_SLICE: |
238 | 0 | m_refDepth = 1; |
239 | 0 | break; |
240 | 0 | case B_SLICE: |
241 | 0 | m_refDepth = 2 + !IS_REFERENCED(frame); |
242 | 0 | break; |
243 | 3.17k | } |
244 | | |
245 | 3.17k | m_entropyCoder.load(initState); |
246 | 3.17k | m_rdContexts.next.load(initState); |
247 | 3.17k | m_rdContexts.cur.load(initState); |
248 | | |
249 | 3.17k | SAOParam* saoParam = frame->m_encData->m_saoParam; |
250 | 3.17k | if (!saoParam) |
251 | 698 | { |
252 | 698 | saoParam = new SAOParam; |
253 | 698 | allocSaoParam(saoParam); |
254 | 698 | frame->m_encData->m_saoParam = saoParam; |
255 | 698 | } |
256 | | |
257 | 3.17k | saoParam->bSaoFlag[0] = true; |
258 | 3.17k | saoParam->bSaoFlag[1] = m_param->internalCsp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400; |
259 | | |
260 | 3.17k | m_numNoSao[0] = 0; // Luma |
261 | 3.17k | m_numNoSao[1] = 0; // Chroma |
262 | | |
263 | | // NOTE: Allow SAO automatic turn-off only when frame parallelism is disabled. |
264 | 3.17k | if (m_param->frameNumThreads == 1) |
265 | 90 | { |
266 | 90 | if (m_refDepth > 0 && m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + m_refDepth - 1] > SAO_ENCODING_RATE) |
267 | 0 | saoParam->bSaoFlag[0] = false; |
268 | 90 | if (m_refDepth > 0 && m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + m_refDepth - 1] > SAO_ENCODING_RATE_CHROMA) |
269 | 0 | saoParam->bSaoFlag[1] = false; |
270 | 90 | } |
271 | 3.17k | } |
272 | | |
273 | | // CTU-based SAO process without slice granularity |
// Apply the SAO offsets of type `typeIdx` to the reconstructed samples of CTU
// `addr` in plane `plane` (0 selects the luma stride; any other value selects
// the chroma stride and applies the chroma shifts to all geometry).
//
// Inputs read from the object: m_offsetEo[plane] / m_offsetBo[plane] (the
// offsets themselves), m_tmpL1[plane] (used as the column to the left of the
// CTU), m_tmpU[plane] (used as the row above the CTU) and m_clipTable.
// Output: the samples of m_frame->m_reconPic are modified in place.
//
// Edge-offset modes skip the first/last column and/or row whenever the CTU
// touches the left/right picture edge, an unavailable row above, or the
// bottom edge (picture bottom, or slice bottom via the picHeight clamp).
// Each mode has a plain C path, taken when the clipped CTU width is not a
// multiple of 16, and a path built on the `primitives` function table.
// A typeIdx that matches no case leaves the picture untouched.
274 | | void SAO::applyPixelOffsets(int addr, int typeIdx, int plane) |
275 | 0 | { |
276 | 0 | PicYuv* reconPic = m_frame->m_reconPic; |
277 | 0 | pixel* rec = reconPic->getPlaneAddr(plane, addr); |
278 | 0 | intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride; |
279 | 0 | uint32_t picWidth = m_param->sourceWidth; |
280 | 0 | uint32_t picHeight = m_param->sourceHeight; |
281 | 0 | const CUData* cu = m_frame->m_encData->getPicCTU(addr); |
282 | 0 | int ctuWidth = m_param->maxCUSize; |
283 | 0 | int ctuHeight = m_param->maxCUSize; |
284 | 0 | uint32_t lpelx = cu->m_cuPelX; |
285 | 0 | uint32_t tpely = cu->m_cuPelY; |
286 | 0 | const uint32_t firstRowInSlice = cu->m_bFirstRowInSlice; |
287 | 0 | const uint32_t lastRowInSlice = cu->m_bLastRowInSlice; |
288 | 0 | const uint32_t bAboveUnavail = (!tpely) | firstRowInSlice; |
289 | | |
290 | | // NOTE: Careful! the picHeight for Equal operator only, so I may safe to hack it |
// On the last CTU row of a slice, picHeight is clamped to this CTU's bottom so
// that the (bpely == picHeight) tests below treat the slice boundary like the
// bottom of the picture.
291 | 0 | if (lastRowInSlice) |
292 | 0 | { |
293 | 0 | picHeight = x265_min(picHeight, (tpely + ctuHeight)); |
294 | 0 | } |
295 | 

// For a chroma plane, convert every size and position to chroma sample units.
296 | 0 | if (plane) |
297 | 0 | { |
298 | 0 | picWidth >>= m_hChromaShift; |
299 | 0 | picHeight >>= m_vChromaShift; |
300 | 0 | ctuWidth >>= m_hChromaShift; |
301 | 0 | ctuHeight >>= m_vChromaShift; |
302 | 0 | lpelx >>= m_hChromaShift; |
303 | 0 | tpely >>= m_vChromaShift; |
304 | 0 | } |
// Clip the CTU against the picture; ctuWidth/ctuHeight become the covered size.
305 | 0 | uint32_t rpelx = x265_min(lpelx + ctuWidth, picWidth); |
306 | 0 | uint32_t bpely = x265_min(tpely + ctuHeight, picHeight); |
307 | 0 | ctuWidth = rpelx - lpelx; |
308 | 0 | ctuHeight = bpely - tpely; |
309 | 

// Sign buffers carry one extra element on each side so that index -1 and
// index ctuWidth stay inside the arrays.
310 | 0 | int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1, signLeft1[2]; |
311 | 0 | int8_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1; |
312 | 

313 | 0 | memset(_upBuff1 + MAX_CU_SIZE, 0, 2 * sizeof(int8_t)); /* avoid valgrind uninit warnings */ |
314 | 

315 | 0 | pixel* tmpL = m_tmpL1[plane]; |
316 | 0 | pixel* tmpU = &(m_tmpU[plane][lpelx]); |
317 | 

318 | 0 | int8_t* offsetEo = m_offsetEo[plane]; |
319 | 

320 | 0 | switch (typeIdx) |
321 | 0 | { |
// Horizontal edge offset: each sample is classified by the signs of its
// differences to the left and right neighbours; signRight + signLeft + 2 is
// the index into offsetEo.
322 | 0 | case SAO_EO_0: // dir: - |
323 | 0 | { |
324 | 0 | pixel firstPxl = 0, lastPxl = 0, row1FirstPxl = 0, row1LastPxl = 0; |
325 | 0 | int startX = !lpelx; |
326 | 0 | int endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth; |
327 | 0 | if (ctuWidth & 15) |
328 | 0 | { |
329 | 0 | for (int y = 0; y < ctuHeight; y++, rec += stride) |
330 | 0 | { |
331 | 0 | int signLeft = signOf(rec[startX] - tmpL[y]); |
332 | 0 | for (int x = startX; x < endX; x++) |
333 | 0 | { |
334 | 0 | int signRight = signOf(rec[x] - rec[x + 1]); |
335 | 0 | int edgeType = signRight + signLeft + 2; |
336 | 0 | signLeft = -signRight; |
337 | 

338 | 0 | rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]]; |
339 | 0 | } |
340 | 0 | } |
341 | 0 | } |
342 | 0 | else |
343 | 0 | { |
// Two rows per iteration.  The primitive is called for the full CTU width, so
// the first/last column samples are saved beforehand and written back after
// the call when the CTU touches the left/right picture edge.
344 | 0 | for (int y = 0; y < ctuHeight; y += 2, rec += 2 * stride) |
345 | 0 | { |
346 | 0 | signLeft1[0] = signOf(rec[startX] - tmpL[y]); |
347 | 0 | signLeft1[1] = signOf(rec[stride + startX] - tmpL[y + 1]); |
348 | 

349 | 0 | if (!lpelx) |
350 | 0 | { |
351 | 0 | firstPxl = rec[0]; |
352 | 0 | row1FirstPxl = rec[stride]; |
353 | 0 | } |
354 | 

355 | 0 | if (rpelx == picWidth) |
356 | 0 | { |
357 | 0 | lastPxl = rec[ctuWidth - 1]; |
358 | 0 | row1LastPxl = rec[stride + ctuWidth - 1]; |
359 | 0 | } |
360 | 

361 | 0 | primitives.saoCuOrgE0(rec, offsetEo, ctuWidth, signLeft1, stride); |
362 | 

363 | 0 | if (!lpelx) |
364 | 0 | { |
365 | 0 | rec[0] = firstPxl; |
366 | 0 | rec[stride] = row1FirstPxl; |
367 | 0 | } |
368 | 

369 | 0 | if (rpelx == picWidth) |
370 | 0 | { |
371 | 0 | rec[ctuWidth - 1] = lastPxl; |
372 | 0 | rec[stride + ctuWidth - 1] = row1LastPxl; |
373 | 0 | } |
374 | 0 | } |
375 | 0 | } |
376 | 0 | break; |
377 | 0 | } |
// Vertical edge offset: upBuff1[x] holds the sign against the sample above
// (seeded from tmpU), signDown the sign against the sample below.
378 | 0 | case SAO_EO_1: // dir: | |
379 | 0 | { |
380 | 0 | int startY = bAboveUnavail; |
381 | 0 | int endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight; |
382 | 0 | if (startY) |
383 | 0 | rec += stride; |
384 | 

385 | 0 | if (ctuWidth & 15) |
386 | 0 | { |
387 | 0 | for (int x = 0; x < ctuWidth; x++) |
388 | 0 | upBuff1[x] = signOf(rec[x] - tmpU[x]); |
389 | 

390 | 0 | for (int y = startY; y < endY; y++, rec += stride) |
391 | 0 | { |
392 | 0 | for (int x = 0; x < ctuWidth; x++) |
393 | 0 | { |
394 | 0 | int8_t signDown = signOf(rec[x] - rec[x + stride]); |
395 | 0 | int edgeType = signDown + upBuff1[x] + 2; |
396 | 0 | upBuff1[x] = -signDown; |
397 | 

398 | 0 | rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]]; |
399 | 0 | } |
400 | 0 | } |
401 | 0 | } |
402 | 0 | else |
403 | 0 | { |
404 | 0 | primitives.sign(upBuff1, rec, tmpU, ctuWidth); |
405 | 

// Rows are handled in pairs; an odd leftover row uses the single-row primitive.
406 | 0 | int diff = (endY - startY) % 2; |
407 | 0 | for (int y = startY; y < endY - diff; y += 2, rec += 2 * stride) |
408 | 0 | primitives.saoCuOrgE1_2Rows(rec, upBuff1, offsetEo, stride, ctuWidth); |
409 | 

410 | 0 | if (diff & 1) |
411 | 0 | primitives.saoCuOrgE1(rec, upBuff1, offsetEo, stride, ctuWidth); |
412 | 0 | } |
413 | 

414 | 0 | break; |
415 | 0 | } |
// Diagonal edge offset comparing with the upper-left and lower-right
// neighbours.  upBuff1 holds the signs for the current row, upBufft collects
// those for the next row, and the two buffers are swapped after every row.
416 | 0 | case SAO_EO_2: // dir: 135 |
417 | 0 | { |
418 | 0 | int startX = !lpelx; |
419 | 0 | int endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth; |
420 | 

421 | 0 | int startY = bAboveUnavail; |
422 | 0 | int endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight; |
423 | 

424 | 0 | if (startY) |
425 | 0 | rec += stride; |
426 | 

427 | 0 | if (!(ctuWidth & 15)) |
428 | 0 | { |
429 | 0 | int8_t firstSign, lastSign; |
430 | 

// The sign primitive writes all ctuWidth entries; the entries of the skipped
// picture-edge columns are put back to their previous contents afterwards.
431 | 0 | if (!lpelx) |
432 | 0 | firstSign = upBuff1[0]; |
433 | 

434 | 0 | if (rpelx == picWidth) |
435 | 0 | lastSign = upBuff1[ctuWidth - 1]; |
436 | 

437 | 0 | primitives.sign(upBuff1, rec, &tmpU[- 1], ctuWidth); |
438 | 

439 | 0 | if (!lpelx) |
440 | 0 | upBuff1[0] = firstSign; |
441 | 

442 | 0 | if (rpelx == picWidth) |
443 | 0 | upBuff1[ctuWidth - 1] = lastSign; |
444 | 0 | } |
445 | 0 | else |
446 | 0 | { |
447 | 0 | for (int x = startX; x < endX; x++) |
448 | 0 | upBuff1[x] = signOf(rec[x] - tmpU[x - 1]); |
449 | 0 | } |
450 | 

451 | 0 | if (ctuWidth & 15) |
452 | 0 | { |
453 | 0 | for (int y = startY; y < endY; y++, rec += stride) |
454 | 0 | { |
455 | 0 | upBufft[startX] = signOf(rec[stride + startX] - tmpL[y]); |
456 | 0 | for (int x = startX; x < endX; x++) |
457 | 0 | { |
458 | 0 | int8_t signDown = signOf(rec[x] - rec[x + stride + 1]); |
459 | 0 | int edgeType = signDown + upBuff1[x] + 2; |
460 | 0 | upBufft[x + 1] = -signDown; |
461 | 0 | rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]]; |
462 | 0 | } |
463 | 

464 | 0 | std::swap(upBuff1, upBufft); |
465 | 0 | } |
466 | 0 | } |
467 | 0 | else |
468 | 0 | { |
469 | 0 | for (int y = startY; y < endY; y++, rec += stride) |
470 | 0 | { |
// Computed before the primitive call and stored into upBufft only after it.
471 | 0 | int8_t iSignDown2 = signOf(rec[stride + startX] - tmpL[y]); |
472 | 

473 | 0 | primitives.saoCuOrgE2[endX > 16](rec + startX, upBufft + startX, upBuff1 + startX, offsetEo, endX - startX, stride); |
474 | 

475 | 0 | upBufft[startX] = iSignDown2; |
476 | 

477 | 0 | std::swap(upBuff1, upBufft); |
478 | 0 | } |
479 | 0 | } |
480 | 0 | break; |
481 | 0 | } |
// Diagonal edge offset comparing with the upper-right and lower-left
// neighbours.  A single buffer is enough: the sign for column x of the next
// row is written to upBuff1[x - 1] after upBuff1[x] has been consumed.
482 | 0 | case SAO_EO_3: // dir: 45 |
483 | 0 | { |
484 | 0 | int startX = !lpelx; |
485 | 0 | int endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth; |
486 | 

487 | 0 | int startY = bAboveUnavail; |
488 | 0 | int endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight; |
489 | 

490 | 0 | if (startY) |
491 | 0 | rec += stride; |
492 | 

493 | 0 | if (ctuWidth & 15) |
494 | 0 | { |
495 | 0 | for (int x = startX - 1; x < endX; x++) |
496 | 0 | upBuff1[x] = signOf(rec[x] - tmpU[x + 1]); |
497 | 

498 | 0 | for (int y = startY; y < endY; y++, rec += stride) |
499 | 0 | { |
// First column of the row: its lower-left neighbour is taken from tmpL.
500 | 0 | int x = startX; |
501 | 0 | int8_t signDown = signOf(rec[x] - tmpL[y + 1]); |
502 | 0 | int edgeType = signDown + upBuff1[x] + 2; |
503 | 0 | upBuff1[x - 1] = -signDown; |
504 | 0 | rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]]; |
505 | 

506 | 0 | for (x = startX + 1; x < endX; x++) |
507 | 0 | { |
508 | 0 | signDown = signOf(rec[x] - rec[x + stride - 1]); |
509 | 0 | edgeType = signDown + upBuff1[x] + 2; |
510 | 0 | upBuff1[x - 1] = -signDown; |
511 | 0 | rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]]; |
512 | 0 | } |
513 | 

514 | 0 | upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]); |
515 | 0 | } |
516 | 0 | } |
517 | 0 | else |
518 | 0 | { |
519 | 0 | int8_t firstSign, lastSign; |
520 | 

521 | 0 | if (lpelx) |
522 | 0 | firstSign = signOf(rec[-1] - tmpU[0]); |
523 | 0 | if (rpelx == picWidth) |
524 | 0 | lastSign = upBuff1[ctuWidth - 1]; |
525 | 

526 | 0 | primitives.sign(upBuff1, rec, &tmpU[1], ctuWidth); |
527 | 

528 | 0 | if (lpelx) |
529 | 0 | upBuff1[-1] = firstSign; |
530 | 0 | if (rpelx == picWidth) |
531 | 0 | upBuff1[ctuWidth - 1] = lastSign; |
532 | 

533 | 0 | for (int y = startY; y < endY; y++, rec += stride) |
534 | 0 | { |
535 | 0 | int x = startX; |
536 | 0 | int8_t signDown = signOf(rec[x] - tmpL[y + 1]); |
537 | 0 | int edgeType = signDown + upBuff1[x] + 2; |
538 | 0 | upBuff1[x - 1] = -signDown; |
539 | 0 | rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]]; |
540 | 

541 | 0 | primitives.saoCuOrgE3[endX > 16](rec, upBuff1, offsetEo, stride - 1, startX, endX); |
542 | 

543 | 0 | upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]); |
544 | 0 | } |
545 | 0 | } |
546 | 

547 | 0 | break; |
548 | 0 | } |
// Band offset: the offset is looked up by the top SAO_BO_BITS bits of the
// sample value and applied to every sample of the clipped CTU.
549 | 0 | case SAO_BO: |
550 | 0 | { |
551 | 0 | const int8_t* offsetBo = m_offsetBo[plane]; |
552 | 

553 | 0 | if (ctuWidth & 15) |
554 | 0 | { |
555 | 41.9k | #define SAO_BO_BITS 5 |
556 | 0 | const int boShift = X265_DEPTH - SAO_BO_BITS; |
557 | 

558 | 0 | for (int y = 0; y < ctuHeight; y++, rec += stride) |
559 | 0 | for (int x = 0; x < ctuWidth; x++) |
560 | 0 | rec[x] = x265_clip(rec[x] + offsetBo[rec[x] >> boShift]); |
561 | 0 | } |
562 | 0 | else |
563 | 0 | primitives.saoCuOrgB0(rec, offsetBo, ctuWidth, ctuHeight, stride); |
564 | 

565 | 0 | break; |
566 | 0 | } |
567 | 0 | default: break; |
568 | 0 | } |
569 | 0 | } |
570 | | |
571 | | /* Process SAO unit */ |
572 | | void SAO::generateLumaOffsets(SaoCtuParam* ctuParam, int idxY, int idxX) |
573 | 13.9k | { |
574 | 13.9k | PicYuv* reconPic = m_frame->m_reconPic; |
575 | 13.9k | intptr_t stride = reconPic->m_stride; |
576 | 13.9k | int ctuWidth = m_param->maxCUSize; |
577 | 13.9k | int ctuHeight = m_param->maxCUSize; |
578 | | |
579 | 13.9k | int addr = idxY * m_numCuInWidth + idxX; |
580 | 13.9k | pixel* rec = reconPic->getLumaAddr(addr); |
581 | | |
582 | 13.9k | if (idxX == 0) |
583 | 3.17k | { |
584 | 140k | for (int i = 0; i < ctuHeight + 1; i++) |
585 | 137k | { |
586 | 137k | m_tmpL1[0][i] = rec[0]; |
587 | 137k | rec += stride; |
588 | 137k | } |
589 | 3.17k | } |
590 | | |
591 | 13.9k | bool mergeLeftFlag = (ctuParam[addr].mergeMode == SAO_MERGE_LEFT); |
592 | 13.9k | int typeIdx = ctuParam[addr].typeIdx; |
593 | | |
594 | 13.9k | if (idxX != (m_numCuInWidth - 1)) |
595 | 10.8k | { |
596 | 10.8k | rec = reconPic->getLumaAddr(addr); |
597 | 421k | for (int i = 0; i < ctuHeight + 1; i++) |
598 | 410k | { |
599 | 410k | m_tmpL2[0][i] = rec[ctuWidth - 1]; |
600 | 410k | rec += stride; |
601 | 410k | } |
602 | 10.8k | } |
603 | | |
604 | 13.9k | if (typeIdx >= 0) |
605 | 0 | { |
606 | 0 | if (!mergeLeftFlag) |
607 | 0 | { |
608 | 0 | if (typeIdx == SAO_BO) |
609 | 0 | { |
610 | 0 | memset(m_offsetBo[0], 0, sizeof(m_offsetBo[0])); |
611 | |
|
612 | 0 | for (int i = 0; i < SAO_NUM_OFFSET; i++) |
613 | 0 | m_offsetBo[0][((ctuParam[addr].bandPos + i) & (MAX_NUM_SAO_CLASS - 1))] = (int8_t)(ctuParam[addr].offset[i] << SAO_BIT_INC); |
614 | 0 | } |
615 | 0 | else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3) |
616 | 0 | { |
617 | 0 | int offset[NUM_EDGETYPE]; |
618 | 0 | offset[0] = 0; |
619 | 0 | for (int i = 0; i < SAO_NUM_OFFSET; i++) |
620 | 0 | offset[i + 1] = ctuParam[addr].offset[i] << SAO_BIT_INC; |
621 | |
|
622 | 0 | for (int edgeType = 0; edgeType < NUM_EDGETYPE; edgeType++) |
623 | 0 | m_offsetEo[0][edgeType] = (int8_t)offset[s_eoTable[edgeType]]; |
624 | 0 | } |
625 | 0 | } |
626 | 0 | applyPixelOffsets(addr, typeIdx, 0); |
627 | 0 | } |
628 | 13.9k | std::swap(m_tmpL1[0], m_tmpL2[0]); |
629 | 13.9k | } |
630 | | |
631 | | /* Process SAO unit (Chroma only) */ |
632 | | void SAO::generateChromaOffsets(SaoCtuParam* ctuParam[3], int idxY, int idxX) |
633 | 13.9k | { |
634 | 13.9k | PicYuv* reconPic = m_frame->m_reconPic; |
635 | 13.9k | intptr_t stride = reconPic->m_strideC; |
636 | 13.9k | int ctuWidth = m_param->maxCUSize; |
637 | 13.9k | int ctuHeight = m_param->maxCUSize; |
638 | | |
639 | 13.9k | { |
640 | 13.9k | ctuWidth >>= m_hChromaShift; |
641 | 13.9k | ctuHeight >>= m_vChromaShift; |
642 | 13.9k | } |
643 | | |
644 | 13.9k | int addr = idxY * m_numCuInWidth + idxX; |
645 | 13.9k | pixel* recCb = reconPic->getCbAddr(addr); |
646 | 13.9k | pixel* recCr = reconPic->getCrAddr(addr); |
647 | | |
648 | 13.9k | if (idxX == 0) |
649 | 3.17k | { |
650 | 73.4k | for (int i = 0; i < ctuHeight + 1; i++) |
651 | 70.2k | { |
652 | 70.2k | m_tmpL1[1][i] = recCb[0]; |
653 | 70.2k | m_tmpL1[2][i] = recCr[0]; |
654 | 70.2k | recCb += stride; |
655 | 70.2k | recCr += stride; |
656 | 70.2k | } |
657 | 3.17k | } |
658 | | |
659 | 13.9k | bool mergeLeftFlagCb = (ctuParam[1][addr].mergeMode == SAO_MERGE_LEFT); |
660 | 13.9k | int typeIdxCb = ctuParam[1][addr].typeIdx; |
661 | | |
662 | 13.9k | bool mergeLeftFlagCr = (ctuParam[2][addr].mergeMode == SAO_MERGE_LEFT); |
663 | 13.9k | int typeIdxCr = ctuParam[2][addr].typeIdx; |
664 | | |
665 | 13.9k | if (idxX != (m_numCuInWidth - 1)) |
666 | 10.8k | { |
667 | 10.8k | recCb = reconPic->getCbAddr(addr); |
668 | 10.8k | recCr = reconPic->getCrAddr(addr); |
669 | 221k | for (int i = 0; i < ctuHeight + 1; i++) |
670 | 210k | { |
671 | 210k | m_tmpL2[1][i] = recCb[ctuWidth - 1]; |
672 | 210k | m_tmpL2[2][i] = recCr[ctuWidth - 1]; |
673 | 210k | recCb += stride; |
674 | 210k | recCr += stride; |
675 | 210k | } |
676 | 10.8k | } |
677 | | |
678 | | // Process U |
679 | 13.9k | if (typeIdxCb >= 0) |
680 | 0 | { |
681 | 0 | if (!mergeLeftFlagCb) |
682 | 0 | { |
683 | 0 | if (typeIdxCb == SAO_BO) |
684 | 0 | { |
685 | 0 | memset(m_offsetBo[1], 0, sizeof(m_offsetBo[0])); |
686 | |
|
687 | 0 | for (int i = 0; i < SAO_NUM_OFFSET; i++) |
688 | 0 | m_offsetBo[1][((ctuParam[1][addr].bandPos + i) & (MAX_NUM_SAO_CLASS - 1))] = (int8_t)(ctuParam[1][addr].offset[i] << SAO_BIT_INC); |
689 | 0 | } |
690 | 0 | else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3) |
691 | 0 | { |
692 | 0 | int offset[NUM_EDGETYPE]; |
693 | 0 | offset[0] = 0; |
694 | 0 | for (int i = 0; i < SAO_NUM_OFFSET; i++) |
695 | 0 | offset[i + 1] = ctuParam[1][addr].offset[i] << SAO_BIT_INC; |
696 | |
|
697 | 0 | for (int edgeType = 0; edgeType < NUM_EDGETYPE; edgeType++) |
698 | 0 | m_offsetEo[1][edgeType] = (int8_t)offset[s_eoTable[edgeType]]; |
699 | 0 | } |
700 | 0 | } |
701 | 0 | applyPixelOffsets(addr, typeIdxCb, 1); |
702 | 0 | } |
703 | | |
704 | | // Process V |
705 | 13.9k | if (typeIdxCr >= 0) |
706 | 0 | { |
707 | 0 | if (!mergeLeftFlagCr) |
708 | 0 | { |
709 | 0 | if (typeIdxCr == SAO_BO) |
710 | 0 | { |
711 | 0 | memset(m_offsetBo[2], 0, sizeof(m_offsetBo[0])); |
712 | |
|
713 | 0 | for (int i = 0; i < SAO_NUM_OFFSET; i++) |
714 | 0 | m_offsetBo[2][((ctuParam[2][addr].bandPos + i) & (MAX_NUM_SAO_CLASS - 1))] = (int8_t)(ctuParam[2][addr].offset[i] << SAO_BIT_INC); |
715 | 0 | } |
716 | 0 | else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3) |
717 | 0 | { |
718 | 0 | int offset[NUM_EDGETYPE]; |
719 | 0 | offset[0] = 0; |
720 | 0 | for (int i = 0; i < SAO_NUM_OFFSET; i++) |
721 | 0 | offset[i + 1] = ctuParam[2][addr].offset[i] << SAO_BIT_INC; |
722 | |
|
723 | 0 | for (int edgeType = 0; edgeType < NUM_EDGETYPE; edgeType++) |
724 | 0 | m_offsetEo[2][edgeType] = (int8_t)offset[s_eoTable[edgeType]]; |
725 | 0 | } |
726 | 0 | } |
727 | 0 | applyPixelOffsets(addr, typeIdxCb, 2); |
728 | 0 | } |
729 | | |
730 | 13.9k | std::swap(m_tmpL1[1], m_tmpL2[1]); |
731 | 13.9k | std::swap(m_tmpL1[2], m_tmpL2[2]); |
732 | 13.9k | } |
733 | | |
734 | | /* Calculate SAO statistics for current CTU without non-crossing slice */ |
735 | | void SAO::calcSaoStatsCTU(int addr, int plane) |
736 | 41.9k | { |
737 | 41.9k | Slice* slice = m_frame->m_encData->m_slice; |
738 | 41.9k | const PicYuv* reconPic = m_frame->m_reconPic; |
739 | 41.9k | const CUData* cu = m_frame->m_encData->getPicCTU(addr); |
740 | 41.9k | const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr); |
741 | 41.9k | const pixel* rec0 = reconPic->getPlaneAddr(plane, addr); |
742 | 41.9k | const pixel* fenc; |
743 | 41.9k | const pixel* rec; |
744 | 41.9k | intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride; |
745 | 41.9k | uint32_t picWidth = m_param->sourceWidth; |
746 | 41.9k | uint32_t picHeight = m_param->sourceHeight; |
747 | 41.9k | int ctuWidth = m_param->maxCUSize; |
748 | 41.9k | int ctuHeight = m_param->maxCUSize; |
749 | 41.9k | uint32_t lpelx = cu->m_cuPelX; |
750 | 41.9k | uint32_t tpely = cu->m_cuPelY; |
751 | 41.9k | const uint32_t firstRowInSlice = cu->m_bFirstRowInSlice; |
752 | 41.9k | const uint32_t lastRowInSlice = cu->m_bLastRowInSlice; |
753 | 41.9k | const uint32_t bAboveUnavail = (!tpely) | firstRowInSlice; |
754 | | |
755 | 41.9k | if (plane) |
756 | 27.9k | { |
757 | 27.9k | picWidth >>= m_hChromaShift; |
758 | 27.9k | picHeight >>= m_vChromaShift; |
759 | 27.9k | ctuWidth >>= m_hChromaShift; |
760 | 27.9k | ctuHeight >>= m_vChromaShift; |
761 | 27.9k | lpelx >>= m_hChromaShift; |
762 | 27.9k | tpely >>= m_vChromaShift; |
763 | 27.9k | } |
764 | 41.9k | uint32_t rpelx = x265_min(lpelx + ctuWidth, picWidth); |
765 | 41.9k | uint32_t bpely = x265_min(tpely + ctuHeight, picHeight); |
766 | 41.9k | ctuWidth = rpelx - lpelx; |
767 | 41.9k | ctuHeight = bpely - tpely; |
768 | | |
769 | | // NOTE: Careful! the picHeight apply for Equal operator only in below, so I may safe to hack it |
770 | 41.9k | if (lastRowInSlice) |
771 | 9.30k | { |
772 | 9.30k | picHeight = bpely; |
773 | 9.30k | } |
774 | | |
775 | 41.9k | int startX; |
776 | 41.9k | int startY; |
777 | 41.9k | int endX; |
778 | 41.9k | int endY; |
779 | | |
780 | 41.9k | const int plane_offset = plane ? 2 : 0; |
781 | 41.9k | int skipB = 4; |
782 | 41.9k | int skipR = 5; |
783 | | |
784 | 41.9k | int8_t _upBuff[2 * (MAX_CU_SIZE + 16 + 16)], *upBuff1 = _upBuff + 16, *upBufft = upBuff1 + (MAX_CU_SIZE + 16 + 16); |
785 | | |
786 | 41.9k | ALIGN_VAR_32(int16_t, diff[MAX_CU_SIZE * MAX_CU_SIZE]); |
787 | | |
788 | | // Calculate (fenc - frec) and put into diff[] |
789 | 41.9k | if ((lpelx + ctuWidth < picWidth) & (tpely + ctuHeight < picHeight)) |
790 | 25.2k | { |
791 | | // WARNING: *) May read beyond bound on video than ctuWidth or ctuHeight is NOT multiple of cuSize |
792 | 25.2k | X265_CHECK((ctuWidth == ctuHeight) || (m_chromaFormat != X265_CSP_I420), "video size check failure\n"); |
793 | 25.2k | if (plane) |
794 | 16.8k | primitives.chroma[m_chromaFormat].cu[m_param->maxLog2CUSize - 2].sub_ps(diff, MAX_CU_SIZE, fenc0, rec0, stride, stride); |
795 | 8.41k | else |
796 | 8.41k | primitives.cu[m_param->maxLog2CUSize - 2].sub_ps(diff, MAX_CU_SIZE, fenc0, rec0, stride, stride); |
797 | 25.2k | } |
798 | 16.7k | else |
799 | 16.7k | { |
800 | | // path for non-square area (most in edge) |
801 | 376k | for(int y = 0; y < ctuHeight; y++) |
802 | 359k | { |
803 | 10.3M | for(int x = 0; x < ctuWidth; x++) |
804 | 10.0M | { |
805 | 10.0M | diff[y * MAX_CU_SIZE + x] = (fenc0[y * stride + x] - rec0[y * stride + x]); |
806 | 10.0M | } |
807 | 359k | } |
808 | 16.7k | } |
809 | | |
810 | | // SAO_BO: |
811 | 41.9k | { |
812 | 41.9k | if (m_param->bSaoNonDeblocked) |
813 | 0 | { |
814 | 0 | skipB = 3; |
815 | 0 | skipR = 4; |
816 | 0 | } |
817 | | |
818 | 41.9k | endX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR + plane_offset; |
819 | 41.9k | endY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB + plane_offset; |
820 | | |
821 | 41.9k | primitives.saoCuStatsBO(diff, rec0, stride, endX, endY, m_offsetOrg[plane][SAO_BO], m_count[plane][SAO_BO]); |
822 | 41.9k | } |
823 | | |
824 | 41.9k | { |
825 | | // SAO_EO_0: // dir: - |
826 | 41.9k | { |
827 | 41.9k | if (m_param->bSaoNonDeblocked) |
828 | 0 | { |
829 | 0 | skipB = 3; |
830 | 0 | skipR = 5; |
831 | 0 | } |
832 | | |
833 | 41.9k | startX = !lpelx; |
834 | 41.9k | endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR + plane_offset; |
835 | | |
836 | 41.9k | primitives.saoCuStatsE0(diff + startX, rec0 + startX, stride, endX - startX, ctuHeight - skipB + plane_offset, m_offsetOrg[plane][SAO_EO_0], m_count[plane][SAO_EO_0]); |
837 | 41.9k | } |
838 | | |
839 | | // SAO_EO_1: // dir: | |
840 | 41.9k | { |
841 | 41.9k | if (m_param->bSaoNonDeblocked) |
842 | 0 | { |
843 | 0 | skipB = 4; |
844 | 0 | skipR = 4; |
845 | 0 | } |
846 | | |
847 | 41.9k | rec = rec0; |
848 | | |
849 | 41.9k | startY = bAboveUnavail; |
850 | 41.9k | endX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR + plane_offset; |
851 | 41.9k | endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB + plane_offset; |
852 | 41.9k | if (startY) |
853 | 9.30k | { |
854 | 9.30k | rec += stride; |
855 | 9.30k | } |
856 | | |
857 | 41.9k | primitives.sign(upBuff1, rec, &rec[- stride], ctuWidth); |
858 | | |
859 | 41.9k | primitives.saoCuStatsE1(diff + startY * MAX_CU_SIZE, rec0 + startY * stride, stride, upBuff1, endX, endY - startY, m_offsetOrg[plane][SAO_EO_1], m_count[plane][SAO_EO_1]); |
860 | 41.9k | } |
861 | 41.9k | if (!m_param->bLimitSAO || ((slice->m_sliceType == P_SLICE && !cu->isSkipped(0)) || |
862 | 0 | (slice->m_sliceType != B_SLICE))) |
863 | 41.9k | { |
864 | | // SAO_EO_2: // dir: 135 |
865 | 41.9k | { |
866 | 41.9k | if (m_param->bSaoNonDeblocked) |
867 | 0 | { |
868 | 0 | skipB = 4; |
869 | 0 | skipR = 5; |
870 | 0 | } |
871 | | |
872 | 41.9k | fenc = fenc0; |
873 | 41.9k | rec = rec0; |
874 | | |
875 | 41.9k | startX = !lpelx; |
876 | 41.9k | endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR + plane_offset; |
877 | | |
878 | 41.9k | startY = bAboveUnavail; |
879 | 41.9k | endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB + plane_offset; |
880 | 41.9k | if (startY) |
881 | 9.30k | { |
882 | 9.30k | fenc += stride; |
883 | 9.30k | rec += stride; |
884 | 9.30k | } |
885 | | |
886 | 41.9k | primitives.sign(upBuff1, &rec[startX], &rec[startX - stride - 1], (endX - startX)); |
887 | | |
888 | 41.9k | primitives.saoCuStatsE2(diff + startX + startY * MAX_CU_SIZE, rec0 + startX + startY * stride, stride, upBuff1, upBufft, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_2], m_count[plane][SAO_EO_2]); |
889 | 41.9k | } |
890 | | // SAO_EO_3: // dir: 45 |
891 | 41.9k | { |
892 | 41.9k | if (m_param->bSaoNonDeblocked) |
893 | 0 | { |
894 | 0 | skipB = 4; |
895 | 0 | skipR = 5; |
896 | 0 | } |
897 | 41.9k | fenc = fenc0; |
898 | 41.9k | rec = rec0; |
899 | 41.9k | startX = !lpelx; |
900 | 41.9k | endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR + plane_offset; |
901 | | |
902 | 41.9k | startY = bAboveUnavail; |
903 | 41.9k | endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB + plane_offset; |
904 | | |
905 | 41.9k | if (startY) |
906 | 9.30k | { |
907 | 9.30k | fenc += stride; |
908 | 9.30k | rec += stride; |
909 | 9.30k | } |
910 | | |
911 | 41.9k | primitives.sign(upBuff1, &rec[startX - 1], &rec[startX - 1 - stride + 1], (endX - startX + 1)); |
912 | | |
913 | 41.9k | primitives.saoCuStatsE3(diff + startX + startY * MAX_CU_SIZE, rec0 + startX + startY * stride, stride, upBuff1 + 1, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_3], m_count[plane][SAO_EO_3]); |
914 | 41.9k | } |
915 | 41.9k | } |
916 | 41.9k | } |
917 | 41.9k | } |
918 | | // Accumulates SAO band- and edge-offset statistics of CTU (idxX, idxY) into m_offsetOrgPreDblk[addr] / m_countPreDblk[addr]; only the right-column / bottom-row strip selected by startX / startY is counted. NOTE(review): the CTU is read from the frame argument but the pictures from m_frame - confirm both refer to the same frame |
919 | | void SAO::calcSaoStatsCu_BeforeDblk(Frame* frame, int idxX, int idxY) |
920 | 0 | { |
921 | 0 | int addr = idxX + m_numCuInWidth * idxY; |
922 | 

923 | 0 | int x, y; |
924 | 0 | const CUData* cu = frame->m_encData->getPicCTU(addr); |
925 | 0 | const PicYuv* reconPic = m_frame->m_reconPic; |
926 | 0 | const pixel* fenc; |
927 | 0 | const pixel* rec; |
928 | 0 | intptr_t stride = reconPic->m_stride; |
929 | 0 | uint32_t picWidth = m_param->sourceWidth; |
930 | 0 | uint32_t picHeight = m_param->sourceHeight; |
931 | 0 | int ctuWidth = m_param->maxCUSize; |
932 | 0 | int ctuHeight = m_param->maxCUSize; |
933 | 0 | uint32_t lpelx = cu->m_cuPelX; |
934 | 0 | uint32_t tpely = cu->m_cuPelY; |
935 | 0 | const uint32_t firstRowInSlice = cu->m_bFirstRowInSlice; |
936 | 0 | const uint32_t lastRowInSlice = cu->m_bLastRowInSlice; |
937 | 0 | const uint32_t bAboveAvail = (!tpely) | firstRowInSlice; |
938 | | // NOTE(review): despite its name, bAboveAvail is nonzero at the picture top or in the first row of a slice; it is used below as firstY to skip row 0 |
939 | | // NOTE: for the last row of a slice picHeight is clamped to the CTU bottom, so the (bpely == picHeight) tests below treat the slice bottom like the picture bottom |
940 | 0 | if (lastRowInSlice) |
941 | 0 | { |
942 | 0 | picHeight = x265_min(picHeight, (tpely + ctuHeight)); |
943 | 0 | } |
944 | 

945 | 0 | uint32_t rpelx = x265_min(lpelx + ctuWidth, picWidth); |
946 | 0 | uint32_t bpely = x265_min(tpely + ctuHeight, picHeight); |
947 | 0 | ctuWidth = rpelx - lpelx; |
948 | 0 | ctuHeight = bpely - tpely; |
949 | 

950 | 0 | int startX; |
951 | 0 | int startY; |
952 | 0 | int endX; |
953 | 0 | int endY; |
954 | 0 | int firstX, firstY; |
955 | 0 | int32_t* stats; |
956 | 0 | int32_t* count; |
957 | 

958 | 0 | int skipB, skipR; |
959 | 

960 | 0 | int32_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1; |
961 | 0 | int32_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1; |
962 | 

963 | 0 | const int boShift = X265_DEPTH - SAO_BO_BITS; |
964 | 

965 | 0 | memset(m_countPreDblk[addr], 0, sizeof(PerPlane)); |
966 | 0 | memset(m_offsetOrgPreDblk[addr], 0, sizeof(PerPlane)); |
967 | 

968 | 0 | int plane_offset = 0; |
969 | 0 | for (int plane = 0; plane < (frame->m_param->internalCsp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400? NUM_PLANE : 1); plane++) |
970 | 0 | { |
971 | 0 | if (plane == 1) |
972 | 0 | { |
973 | 0 | stride = reconPic->m_strideC; |
974 | 0 | picWidth >>= m_hChromaShift; |
975 | 0 | picHeight >>= m_vChromaShift; |
976 | 0 | ctuWidth >>= m_hChromaShift; |
977 | 0 | ctuHeight >>= m_vChromaShift; |
978 | 0 | lpelx >>= m_hChromaShift; |
979 | 0 | tpely >>= m_vChromaShift; |
980 | 0 | rpelx >>= m_hChromaShift; |
981 | 0 | bpely >>= m_vChromaShift; |
982 | 0 | } |
983 | | // the chroma geometry is derived once at plane 1 and reused unchanged for plane 2 |
984 | | // SAO_BO: band statistics, classIdx = rec[x] >> boShift |
985 | 

986 | 0 | skipB = 3 - plane_offset; |
987 | 0 | skipR = 4 - plane_offset; |
988 | 

989 | 0 | stats = m_offsetOrgPreDblk[addr][plane][SAO_BO]; |
990 | 0 | count = m_countPreDblk[addr][plane][SAO_BO]; |
991 | 

992 | 0 | const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr); |
993 | 0 | const pixel* rec0 = reconPic->getPlaneAddr(plane, addr); |
994 | 0 | fenc = fenc0; |
995 | 0 | rec = rec0; |
996 | 

997 | 0 | startX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR; |
998 | 0 | startY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB; |
999 | 

1000 | 0 | for (y = 0; y < ctuHeight; y++) |
1001 | 0 | { |
1002 | 0 | for (x = (y < startY ? startX : 0); x < ctuWidth; x++) |
1003 | 0 | { |
1004 | 0 | int classIdx = rec[x] >> boShift; |
1005 | 0 | stats[classIdx] += (fenc[x] - rec[x]); |
1006 | 0 | count[classIdx]++; |
1007 | 0 | } |
1008 | 

1009 | 0 | fenc += stride; |
1010 | 0 | rec += stride; |
1011 | 0 | } |
1012 | | 
1013 | | // SAO_EO_0: // dir: - (left/right neighbours) |
1014 | 0 | { |
1015 | 0 | skipB = 3 - plane_offset; |
1016 | 0 | skipR = 5 - plane_offset; |
1017 | 

1018 | 0 | stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_0]; |
1019 | 0 | count = m_countPreDblk[addr][plane][SAO_EO_0]; |
1020 | 

1021 | 0 | fenc = fenc0; |
1022 | 0 | rec = rec0; |
1023 | 

1024 | 0 | startX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR; |
1025 | 0 | startY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB; |
1026 | 0 | firstX = !lpelx; |
1027 | | // endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth; |
1028 | 0 | endX = ctuWidth - 1; // do not reference the right CTU |
1029 | 

1030 | 0 | for (y = 0; y < ctuHeight; y++) |
1031 | 0 | { |
1032 | 0 | x = (y < startY ? startX : firstX); |
1033 | 0 | int signLeft = signOf(rec[x] - rec[x - 1]); |
1034 | 0 | for (; x < endX; x++) |
1035 | 0 | { |
1036 | 0 | int signRight = signOf(rec[x] - rec[x + 1]); |
1037 | 0 | int edgeType = signRight + signLeft + 2; |
1038 | 0 | signLeft = -signRight; |
1039 | 

1040 | 0 | stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]); |
1041 | 0 | count[s_eoTable[edgeType]]++; |
1042 | 0 | } |
1043 | 

1044 | 0 | fenc += stride; |
1045 | 0 | rec += stride; |
1046 | 0 | } |
1047 | 0 | } |
1048 | | 
1049 | | // SAO_EO_1: // dir: | (above/below neighbours) |
1050 | 0 | { |
1051 | 0 | skipB = 4 - plane_offset; |
1052 | 0 | skipR = 4 - plane_offset; |
1053 | 

1054 | 0 | stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_1]; |
1055 | 0 | count = m_countPreDblk[addr][plane][SAO_EO_1]; |
1056 | 

1057 | 0 | fenc = fenc0; |
1058 | 0 | rec = rec0; |
1059 | 

1060 | 0 | startX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR; |
1061 | 0 | startY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB; |
1062 | 0 | firstY = bAboveAvail; |
1063 | | // endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight; |
1064 | 0 | endY = ctuHeight - 1; // do not reference the CTU below |
1065 | 0 | if (firstY) |
1066 | 0 | { |
1067 | 0 | fenc += stride; |
1068 | 0 | rec += stride; |
1069 | 0 | } |
1070 | 

1071 | 0 | for (x = startX; x < ctuWidth; x++) |
1072 | 0 | upBuff1[x] = signOf(rec[x] - rec[x - stride]); |
1073 | 

1074 | 0 | for (y = firstY; y < endY; y++) |
1075 | 0 | { |
1076 | 0 | for (x = (y < startY - 1 ? startX : 0); x < ctuWidth; x++) |
1077 | 0 | { |
1078 | 0 | int signDown = signOf(rec[x] - rec[x + stride]); |
1079 | 0 | int edgeType = signDown + upBuff1[x] + 2; |
1080 | 0 | upBuff1[x] = -signDown; |
1081 | 

1082 | 0 | if (x < startX && y < startY) |
1083 | 0 | continue; |
1084 | | // samples outside the strip only refresh upBuff1; they are not counted |
1085 | 0 | stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]); |
1086 | 0 | count[s_eoTable[edgeType]]++; |
1087 | 0 | } |
1088 | 

1089 | 0 | fenc += stride; |
1090 | 0 | rec += stride; |
1091 | 0 | } |
1092 | 0 | } |
1093 | | 
1094 | | // SAO_EO_2: // dir: 135 (above-left/below-right neighbours) |
1095 | 0 | { |
1096 | 0 | skipB = 4 - plane_offset; |
1097 | 0 | skipR = 5 - plane_offset; |
1098 | 

1099 | 0 | stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_2]; |
1100 | 0 | count = m_countPreDblk[addr][plane][SAO_EO_2]; |
1101 | 

1102 | 0 | fenc = fenc0; |
1103 | 0 | rec = rec0; |
1104 | 

1105 | 0 | startX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR; |
1106 | 0 | startY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB; |
1107 | 0 | firstX = !lpelx; |
1108 | 0 | firstY = bAboveAvail; |
1109 | | // endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth; |
1110 | | // endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight; |
1111 | 0 | endX = ctuWidth - 1; // do not reference the right CTU |
1112 | 0 | endY = ctuHeight - 1; // do not reference the CTU below |
1113 | 0 | if (firstY) |
1114 | 0 | { |
1115 | 0 | fenc += stride; |
1116 | 0 | rec += stride; |
1117 | 0 | } |
1118 | 

1119 | 0 | for (x = startX; x < endX; x++) |
1120 | 0 | upBuff1[x] = signOf(rec[x] - rec[x - stride - 1]); |
1121 | 

1122 | 0 | for (y = firstY; y < endY; y++) |
1123 | 0 | { |
1124 | 0 | x = (y < startY - 1 ? startX : firstX); |
1125 | 0 | upBufft[x] = signOf(rec[x + stride] - rec[x - 1]); |
1126 | 0 | for (; x < endX; x++) |
1127 | 0 | { |
1128 | 0 | int signDown = signOf(rec[x] - rec[x + stride + 1]); |
1129 | 0 | int edgeType = signDown + upBuff1[x] + 2; |
1130 | 0 | upBufft[x + 1] = -signDown; |
1131 | 

1132 | 0 | if (x < startX && y < startY) |
1133 | 0 | continue; |
1134 | | // samples outside the strip only refresh upBufft; they are not counted |
1135 | 0 | stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]); |
1136 | 0 | count[s_eoTable[edgeType]]++; |
1137 | 0 | } |
1138 | 

1139 | 0 | std::swap(upBuff1, upBufft); |
1140 | 

1141 | 0 | rec += stride; |
1142 | 0 | fenc += stride; |
1143 | 0 | } |
1144 | 0 | } |
1145 | | 
1146 | | // SAO_EO_3: // dir: 45 (above-right/below-left neighbours) |
1147 | 0 | { |
1148 | 0 | skipB = 4 - plane_offset; |
1149 | 0 | skipR = 5 - plane_offset; |
1150 | 

1151 | 0 | stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_3]; |
1152 | 0 | count = m_countPreDblk[addr][plane][SAO_EO_3]; |
1153 | 

1154 | 0 | fenc = fenc0; |
1155 | 0 | rec = rec0; |
1156 | 

1157 | 0 | startX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR; |
1158 | 0 | startY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB; |
1159 | 0 | firstX = !lpelx; |
1160 | 0 | firstY = bAboveAvail; |
1161 | | // endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth; |
1162 | | // endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight; |
1163 | 0 | endX = ctuWidth - 1; // do not reference the right CTU |
1164 | 0 | endY = ctuHeight - 1; // do not reference the CTU below |
1165 | 0 | if (firstY) |
1166 | 0 | { |
1167 | 0 | fenc += stride; |
1168 | 0 | rec += stride; |
1169 | 0 | } |
1170 | 

1171 | 0 | for (x = startX - 1; x < endX; x++) |
1172 | 0 | upBuff1[x] = signOf(rec[x] - rec[x - stride + 1]); |
1173 | 

1174 | 0 | for (y = firstY; y < endY; y++) |
1175 | 0 | { |
1176 | 0 | for (x = (y < startY - 1 ? startX : firstX); x < endX; x++) |
1177 | 0 | { |
1178 | 0 | int signDown = signOf(rec[x] - rec[x + stride - 1]); |
1179 | 0 | int edgeType = signDown + upBuff1[x] + 2; |
1180 | 0 | upBuff1[x - 1] = -signDown; |
1181 | 

1182 | 0 | if (x < startX && y < startY) |
1183 | 0 | continue; |
1184 | | // samples outside the strip only refresh upBuff1; they are not counted |
1185 | 0 | stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]); |
1186 | 0 | count[s_eoTable[edgeType]]++; |
1187 | 0 | } |
1188 | 

1189 | 0 | upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]); |
1190 | 

1191 | 0 | rec += stride; |
1192 | 0 | fenc += stride; |
1193 | 0 | } |
1194 | 0 | } |
1195 | 0 | plane_offset = 2; |
1196 | 0 | } |
1197 | 0 | } |
1198 | | |
1199 | | /* Reset offset statistics: zero the whole m_count, m_offset and m_offsetOrg tables. */ |
1200 | | void SAO::resetStats() |
1201 | 698 | { |
1202 | 698 | memset(m_count, 0, sizeof(m_count)); |
1203 | 698 | memset(m_offset, 0, sizeof(m_offset)); |
1204 | 698 | memset(m_offsetOrg, 0, sizeof(m_offsetOrg)); |
1205 | 698 | } |
1206 | | // Stores in m_depthSaoRate (entry m_refDepth of block 0 and of block 1) the ratio m_numNoSao[i] / numctus, or 1.0 when saoParam->bSaoFlag[i] is not set. Assumes numctus > 0 - TODO confirm with callers. NOTE(review): only index 0 is guarded by an X265_CHECK |
1207 | | void SAO::rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus) |
1208 | 0 | { |
1209 | 0 | if (!saoParam->bSaoFlag[0]) |
1210 | 0 | m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + m_refDepth] = 1.0; |
1211 | 0 | else |
1212 | 0 | { |
1213 | 0 | X265_CHECK(m_numNoSao[0] <= numctus, "m_numNoSao check failure!"); |
1214 | 0 | m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + m_refDepth] = m_numNoSao[0] / ((double)numctus); |
1215 | 0 | } |
1216 | 

1217 | 0 | if (!saoParam->bSaoFlag[1]) |
1218 | 0 | { |
1219 | 0 | m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + m_refDepth] = 1.0; |
1220 | 0 | } |
1221 | 0 | else |
1222 | 0 | m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + m_refDepth] = m_numNoSao[1] / ((double)numctus); |
1223 | 0 | } |
1224 | | // Decides the SAO parameters of CTU addr: resets ctuParam[..][addr], evaluates new luma and chroma parameters, then compares them with the merge-left / merge-up candidates and keeps the cheapest; also counts CTUs left without SAO in m_numNoSao |
1225 | | void SAO::rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr) |
1226 | 13.9k | { |
1227 | 13.9k | Slice* slice = m_frame->m_encData->m_slice; |
1228 | 13.9k | const CUData* cu = m_frame->m_encData->getPicCTU(addr); |
1229 | 13.9k | int qp = cu->m_qp[0]; |
1230 | 13.9k | int64_t lambda[2] = { 0 }; |
1231 | | 
1232 | 13.9k | int qpCb = qp + slice->m_pps->chromaQpOffset[0] + slice->m_chromaQpOffset[0]; |
1233 | 13.9k | if (m_param->internalCsp == X265_CSP_I420) |
1234 | 13.9k | qpCb = x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int)g_chromaScale[x265_clip3(QP_MIN, QP_MAX_MAX, qpCb)]); |
1235 | 0 | else |
1236 | 0 | qpCb = x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, qpCb); |
1237 | 13.9k | lambda[0] = (int64_t)floor(256.0 * x265_lambda2_tab[qp]); |
1238 | 13.9k | lambda[1] = (int64_t)floor(256.0 * x265_lambda2_tab[qpCb]); // Use Cb QP for SAO chroma |
1239 | | 
1240 | 13.9k | const bool allowMerge[2] = {(idxX != 0), (rowBaseAddr != 0)}; // left, up |
1241 | | 
1242 | 13.9k | const int addrMerge[2] = {(idxX ? addr - 1 : -1), (rowBaseAddr ? addr - m_numCuInWidth : -1)};// left, up |
1243 | | 
1244 | 13.9k | bool chroma = m_param->internalCsp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400; |
1245 | 13.9k | int planes = chroma ? 3 : 1; |
1246 | | 
1247 | | // reset stats Y, Cb, Cr |
1248 | 13.9k | X265_CHECK(sizeof(PerPlane) == (sizeof(int32_t) * (NUM_PLANE * MAX_NUM_SAO_TYPE * MAX_NUM_SAO_CLASS)), "Found Padding space in struct PerPlane"); |
1249 | | 
1250 | | // TODO: Confirm the address space is contiguous |
1251 | 13.9k | if (m_param->bSaoNonDeblocked) |
1252 | 0 | { |
1253 | 0 | memcpy(m_count, m_countPreDblk[addr], sizeof(m_count)); |
1254 | 0 | memcpy(m_offsetOrg, m_offsetOrgPreDblk[addr], sizeof(m_offsetOrg)); |
1255 | 0 | } |
1256 | 13.9k | else |
1257 | 13.9k | { |
1258 | 13.9k | memset(m_count, 0, sizeof(m_count)); |
1259 | 13.9k | memset(m_offsetOrg, 0, sizeof(m_offsetOrg)); |
1260 | 13.9k | } |
1261 | | 
1262 | 55.9k | for (int i = 0; i < planes; i++) |
1263 | 41.9k | saoParam->ctuParam[i][addr].reset(); |
1264 | | // SAO distortion calculation |
1265 | 13.9k | m_entropyCoder.load(m_rdContexts.cur); |
1266 | 13.9k | m_entropyCoder.resetBits(); |
1267 | 13.9k | if (allowMerge[0]) |
1268 | 10.8k | m_entropyCoder.codeSaoMerge(0); |
1269 | 13.9k | if (allowMerge[1]) |
1270 | 10.8k | m_entropyCoder.codeSaoMerge(0); |
1271 | 13.9k | m_entropyCoder.store(m_rdContexts.temp); |
1272 | 13.9k | memset(m_offset, 0, sizeof(m_offset)); |
1273 | 13.9k | int64_t bestCost = 0; |
1274 | 13.9k | int64_t rateDist = 0; |
1275 | | // NOTE(review): despite its name, bAboveLeftAvail stays true only when every allowed merge neighbour has luma typeIdx == -1 (and also when no merge is allowed) |
1276 | 13.9k | bool bAboveLeftAvail = true; |
1277 | 41.9k | for (int mergeIdx = 0; mergeIdx < 2; ++mergeIdx) |
1278 | 27.9k | { |
1279 | 27.9k | if (!allowMerge[mergeIdx]) |
1280 | 6.27k | continue; |
1281 | | 
1282 | 21.7k | SaoCtuParam* mergeSrcParam = &(saoParam->ctuParam[0][addrMerge[mergeIdx]]); |
1283 | 21.7k | bAboveLeftAvail = bAboveLeftAvail && (mergeSrcParam->typeIdx == -1); |
1284 | 21.7k | } |
1285 | | // Don't apply SAO in B slices if the CTU is skipped or the adjacent CTUs have SAO off |
1286 | 13.9k | bool bSaoOff = (slice->m_sliceType == B_SLICE) && (cu->isSkipped(0) || bAboveLeftAvail); |
1287 | | 
1288 | | // Estimate distortion and cost of new SAO params |
1289 | 13.9k | if (saoParam->bSaoFlag[0]) |
1290 | 13.9k | { |
1291 | 13.9k | if (!m_param->bLimitSAO || !bSaoOff) |
1292 | 13.9k | { |
1293 | 13.9k | calcSaoStatsCTU(addr, 0); |
1294 | 13.9k | saoStatsInitialOffset(addr, 0); |
1295 | 13.9k | saoLumaComponentParamDist(saoParam, addr, rateDist, lambda, bestCost); |
1296 | 13.9k | } |
1297 | 13.9k | } |
1298 | | 
1299 | 13.9k | SaoCtuParam* lclCtuParam = &saoParam->ctuParam[0][addr]; |
1300 | 13.9k | if (saoParam->bSaoFlag[1]) |
1301 | 13.9k | { |
1302 | 13.9k | if (!m_param->bLimitSAO || ((lclCtuParam->typeIdx != -1) && !bSaoOff)) |
1303 | 13.9k | { |
1304 | 13.9k | calcSaoStatsCTU(addr, 1); |
1305 | 13.9k | calcSaoStatsCTU(addr, 2); |
1306 | 13.9k | saoStatsInitialOffset(addr, 1); |
1307 | 13.9k | saoChromaComponentParamDist(saoParam, addr, rateDist, lambda, bestCost); |
1308 | 13.9k | } |
1309 | 13.9k | } |
1310 | 13.9k | if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1]) |
1311 | 13.9k | { |
1312 | | // Cost of merge left or Up |
1313 | 41.9k | for (int mergeIdx = 0; mergeIdx < 2; ++mergeIdx) |
1314 | 27.9k | { |
1315 | 27.9k | if (!allowMerge[mergeIdx]) |
1316 | 6.27k | continue; |
1317 | | 
1318 | 21.7k | int64_t mergeDist = 0; |
1319 | 86.8k | for (int plane = 0; plane < planes; plane++) |
1320 | 65.1k | { |
1321 | 65.1k | int64_t estDist = 0; |
1322 | 65.1k | SaoCtuParam* mergeSrcParam = &(saoParam->ctuParam[plane][addrMerge[mergeIdx]]); |
1323 | 65.1k | int typeIdx = mergeSrcParam->typeIdx; |
1324 | 65.1k | if (typeIdx >= 0) |
1325 | 0 | { |
1326 | 0 | int bandPos = (typeIdx == SAO_BO) ? mergeSrcParam->bandPos : 1; |
1327 | 0 | for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++) |
1328 | 0 | { |
1329 | 0 | int mergeOffset = mergeSrcParam->offset[classIdx]; |
1330 | 0 | estDist += estSaoDist(m_count[plane][typeIdx][classIdx + bandPos], mergeOffset, m_offsetOrg[plane][typeIdx][classIdx + bandPos]); |
1331 | 0 | } |
1332 | 0 | } |
1333 | 65.1k | mergeDist += (estDist << 8) / lambda[!!plane]; |
1334 | 65.1k | } |
1335 | | 
1336 | 21.7k | m_entropyCoder.load(m_rdContexts.cur); |
1337 | 21.7k | m_entropyCoder.resetBits(); |
1338 | 21.7k | if (allowMerge[0]) |
1339 | 19.2k | m_entropyCoder.codeSaoMerge(1 - mergeIdx); |
1340 | 21.7k | if (allowMerge[1] && (mergeIdx == 1)) |
1341 | 10.8k | m_entropyCoder.codeSaoMerge(1); |
1342 | | 
1343 | 21.7k | uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits(); |
1344 | 21.7k | int64_t mergeCost = mergeDist + estRate; |
1345 | 21.7k | if (mergeCost < bestCost) |
1346 | 9.56k | { |
1347 | 9.56k | SaoMergeMode mergeMode = mergeIdx ? SAO_MERGE_UP : SAO_MERGE_LEFT; |
1348 | 9.56k | bestCost = mergeCost; |
1349 | 9.56k | m_entropyCoder.store(m_rdContexts.temp); |
1350 | 38.2k | for (int plane = 0; plane < planes; plane++) |
1351 | 28.6k | { |
1352 | 28.6k | if (saoParam->bSaoFlag[plane > 0]) |
1353 | 28.6k | { |
1354 | 28.6k | SaoCtuParam* dstCtuParam = &saoParam->ctuParam[plane][addr]; |
1355 | 28.6k | SaoCtuParam* mergeSrcParam = &(saoParam->ctuParam[plane][addrMerge[mergeIdx]]); |
1356 | 28.6k | dstCtuParam->mergeMode = mergeMode; |
1357 | 28.6k | dstCtuParam->typeIdx = mergeSrcParam->typeIdx; |
1358 | 28.6k | dstCtuParam->bandPos = mergeSrcParam->bandPos; |
1359 | | 
1360 | 143k | for (int i = 0; i < SAO_NUM_OFFSET; i++) |
1361 | 114k | dstCtuParam->offset[i] = mergeSrcParam->offset[i]; |
1362 | 28.6k | } |
1363 | 28.6k | } |
1364 | 9.56k | } |
1365 | 21.7k | } |
1366 | | // count CTUs that end up without SAO (typeIdx < 0), separately for luma and chroma |
1367 | 13.9k | if (saoParam->ctuParam[0][addr].typeIdx < 0) |
1368 | 13.9k | m_numNoSao[0]++; |
1369 | 13.9k | if (chroma && saoParam->ctuParam[1][addr].typeIdx < 0) |
1370 | 13.9k | m_numNoSao[1]++; |
1371 | 13.9k | m_entropyCoder.load(m_rdContexts.temp); |
1372 | 13.9k | m_entropyCoder.store(m_rdContexts.cur); |
1373 | 13.9k | } |
1374 | 13.9k | } |
1375 | | |
1376 | | // Computes the initial m_offset entries: for every statistics entry with a non-zero count, rounds |
1377 | | // offsetOrg / (count << SAO_BIT_INC) and clips the result. planes == 0 processes plane 0, planes == 1 processes planes 1 and 2. |
1378 | | void SAO::saoStatsInitialOffset(int addr, int planes) |
1379 | 27.9k | { |
1380 | 27.9k | Slice* slice = m_frame->m_encData->m_slice; |
1381 | 27.9k | const CUData* cu = m_frame->m_encData->getPicCTU(addr); |
1382 | | 
1383 | 27.9k | int maxSaoType; |
1384 | 27.9k | if (m_param->bLimitSAO && ((slice->m_sliceType == P_SLICE && cu->isSkipped(0)) || |
1385 | 0 | (slice->m_sliceType == B_SLICE))) |
1386 | 0 | { |
1387 | 0 | maxSaoType = MAX_NUM_SAO_TYPE - 3; |
1388 | 0 | } |
1389 | 27.9k | else |
1390 | 27.9k | { |
1391 | 27.9k | maxSaoType = MAX_NUM_SAO_TYPE - 1; |
1392 | 27.9k | } |
1393 | | // EO: classes 1..SAO_NUM_OFFSET of the first maxSaoType types |
1394 | 69.9k | for (int plane = planes; plane <= planes * 2; plane++) |
1395 | 41.9k | { |
1396 | 209k | for (int typeIdx = 0; typeIdx < maxSaoType; typeIdx++) |
1397 | 167k | { |
1398 | 839k | for (int classIdx = 1; classIdx < SAO_NUM_OFFSET + 1; classIdx++) |
1399 | 671k | { |
1400 | 671k | int32_t& count = m_count[plane][typeIdx][classIdx]; |
1401 | 671k | int32_t& offsetOrg = m_offsetOrg[plane][typeIdx][classIdx]; |
1402 | 671k | int32_t& offsetOut = m_offset[plane][typeIdx][classIdx]; |
1403 | | 
1404 | 671k | if (count) |
1405 | 1.62k | { |
1406 | 1.62k | offsetOut = roundIBDI(offsetOrg, count << SAO_BIT_INC); |
1407 | 1.62k | offsetOut = x265_clip3(-OFFSET_THRESH + 1, OFFSET_THRESH - 1, offsetOut); |
1408 | | // classes below 3 keep only non-negative offsets, the remaining classes only non-positive ones |
1409 | 1.62k | if (classIdx < 3) |
1410 | 1.48k | offsetOut = X265_MAX(offsetOut, 0); |
1411 | 140 | else |
1412 | 140 | offsetOut = X265_MIN(offsetOut, 0); |
1413 | 1.62k | } |
1414 | 671k | } |
1415 | 167k | } |
1416 | 41.9k | } |
1417 | | // BO: all MAX_NUM_SAO_CLASS bands, no sign restriction |
1418 | 69.9k | for (int plane = planes; plane <= planes * 2; plane++) |
1419 | 41.9k | { |
1420 | 1.38M | for (int classIdx = 0; classIdx < MAX_NUM_SAO_CLASS; classIdx++) |
1421 | 1.34M | { |
1422 | 1.34M | int32_t& count = m_count[plane][SAO_BO][classIdx]; |
1423 | 1.34M | int32_t& offsetOrg = m_offsetOrg[plane][SAO_BO][classIdx]; |
1424 | 1.34M | int32_t& offsetOut = m_offset[plane][SAO_BO][classIdx]; |
1425 | | 
1426 | 1.34M | if (count) |
1427 | 41.9k | { |
1428 | 41.9k | offsetOut = roundIBDI(offsetOrg, count << SAO_BIT_INC); |
1429 | 41.9k | offsetOut = x265_clip3(-OFFSET_THRESH + 1, OFFSET_THRESH - 1, offsetOut); |
1430 | 41.9k | } |
1431 | 1.34M | } |
1432 | 41.9k | } |
1433 | 27.9k | } |
1434 | | // Returns the RD cost distortion + ((bits * lambda + 128) >> 8), i.e. the rate term is rounded and scaled down by 256. NOTE(review): both X265_DEPTH branches below contain the same X265_CHECK, and the check divides by lambda - presumably lambda is never 0, confirm |
1435 | | inline int64_t SAO::calcSaoRdoCost(int64_t distortion, uint32_t bits, int64_t lambda) |
1436 | 2.18M | { |
1437 | 2.18M | #if X265_DEPTH < 10 |
1438 | 2.18M | X265_CHECK(bits <= (INT64_MAX - 128) / lambda, |
1439 | 2.18M | "calcRdCost wrap detected dist: " X265_LL ", bits %u, lambda: " X265_LL "\n", |
1440 | 2.18M | distortion, bits, lambda); |
1441 | | #else |
1442 | | X265_CHECK(bits <= (INT64_MAX - 128) / lambda, |
1443 | | "calcRdCost wrap detected dist: " X265_LL ", bits %u, lambda: " X265_LL "\n", |
1444 | | distortion, bits, lambda); |
1445 | | #endif |
1446 | 2.18M | return distortion + ((bits * lambda + 128) >> 8); |
1447 | 2.18M | } |
1448 | | // Walks offset one step at a time from its incoming value toward zero and keeps the candidate with the lowest RD cost; on return offset holds the best candidate (0 if none beats the zero-offset baseline), distClasses its distortion and costClasses its cost |
1449 | | void SAO::estIterOffset(int typeIdx, int64_t lambda, int32_t count, int32_t offsetOrg, int32_t& offset, int32_t& distClasses, int64_t& costClasses) |
1450 | 2.01M | { |
1451 | 2.01M | int bestOffset = 0; |
1452 | 2.01M | distClasses = 0; |
1453 | | 
1454 | | // Assuming sending quantized value 0 results in zero offset and sending the value zero needs 1 bit. |
1455 | | // entropy coder can be used to measure the exact rate here. |
1456 | 2.01M | int64_t bestCost = calcSaoRdoCost(0, 1, lambda); |
1457 | 2.01M | while (offset != 0) |
1458 | 3.02k | { |
1459 | | // Calculate the bits required for signalling the offset |
1460 | 3.02k | uint32_t rate = (typeIdx == SAO_BO) ? (abs(offset) + 2) : (abs(offset) + 1); |
1461 | 3.02k | if (abs(offset) == OFFSET_THRESH - 1) |
1462 | 0 | rate--; |
1463 | | 
1464 | | // Do the dequantization before distortion calculation |
1465 | 3.02k | int64_t dist = estSaoDist(count, offset << SAO_BIT_INC, offsetOrg); |
1466 | 3.02k | int64_t cost = calcSaoRdoCost(dist, rate, lambda); |
1467 | 3.02k | if (cost < bestCost) |
1468 | 74 | { |
1469 | 74 | bestCost = cost; |
1470 | 74 | bestOffset = offset; |
1471 | 74 | distClasses = (int)dist; |
1472 | 74 | } |
1473 | 3.02k | offset = (offset > 0) ? (offset - 1) : (offset + 1); |
1474 | 3.02k | } |
1475 | | 
1476 | 2.01M | costClasses = bestCost; |
1477 | 2.01M | offset = bestOffset; |
1478 | 2.01M | } |
1479 | | void SAO::saoLumaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& rateDist, int64_t* lambda, int64_t &bestCost) |
1480 | 13.9k | { |
1481 | 13.9k | Slice* slice = m_frame->m_encData->m_slice; |
1482 | 13.9k | const CUData* cu = m_frame->m_encData->getPicCTU(addr); |
1483 | 13.9k | int64_t bestDist = 0; |
1484 | 13.9k | int bestTypeIdx = -1; |
1485 | 13.9k | SaoCtuParam* lclCtuParam = &saoParam->ctuParam[0][addr]; |
1486 | | // Purpose: picks the luma SAO mode of CTU addr (none, one of the EO types, or BO) by RD cost and writes it to saoParam->ctuParam[0][addr]; rateDist receives (bestDist << 8) / lambda[0] |
1487 | 13.9k | int32_t distClasses[MAX_NUM_SAO_CLASS]; |
1488 | 13.9k | int64_t costClasses[MAX_NUM_SAO_CLASS]; |
1489 | | 
1490 | | // RDO SAO_NA: cost of signalling "no SAO" is the baseline every other mode must beat |
1491 | 13.9k | m_entropyCoder.load(m_rdContexts.temp); |
1492 | 13.9k | m_entropyCoder.resetBits(); |
1493 | 13.9k | m_entropyCoder.codeSaoType(0); |
1494 | 13.9k | int64_t costPartBest = calcSaoRdoCost(0, m_entropyCoder.getNumberOfWrittenBits(), lambda[0]); |
1495 | 13.9k | int maxSaoType; |
1496 | 13.9k | if (m_param->bLimitSAO && ((slice->m_sliceType == P_SLICE && cu->isSkipped(0)) || |
1497 | 0 | (slice->m_sliceType == B_SLICE))) |
1498 | 0 | { |
1499 | 0 | maxSaoType = MAX_NUM_SAO_TYPE - 3; |
1500 | 0 | } |
1501 | 13.9k | else |
1502 | 13.9k | { |
1503 | 13.9k | maxSaoType = MAX_NUM_SAO_TYPE - 1; |
1504 | 13.9k | } |
1505 | | 
1506 | | // EO distortion calculation: refine the offsets of each EO type and keep the cheapest type |
1507 | 69.9k | for (int typeIdx = 0; typeIdx < maxSaoType; typeIdx++) |
1508 | 55.9k | { |
1509 | 55.9k | int64_t estDist = 0; |
1510 | 279k | for (int classIdx = 1; classIdx < SAO_NUM_OFFSET + 1; classIdx++) |
1511 | 223k | { |
1512 | 223k | int32_t& count = m_count[0][typeIdx][classIdx]; |
1513 | 223k | int32_t& offsetOrg = m_offsetOrg[0][typeIdx][classIdx]; |
1514 | 223k | int32_t& offsetOut = m_offset[0][typeIdx][classIdx]; |
1515 | 223k | estIterOffset(typeIdx, lambda[0], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx]); |
1516 | | 
1517 | | // Accumulate the distortion of this class |
1518 | 223k | estDist += distClasses[classIdx]; |
1519 | 223k | } |
1520 | | 
1521 | 55.9k | m_entropyCoder.load(m_rdContexts.temp); |
1522 | 55.9k | m_entropyCoder.resetBits(); |
1523 | 55.9k | m_entropyCoder.codeSaoOffsetEO(m_offset[0][typeIdx] + 1, typeIdx, 0); |
1524 | | 
1525 | 55.9k | int64_t cost = calcSaoRdoCost(estDist, m_entropyCoder.getNumberOfWrittenBits(), lambda[0]); |
1526 | | 
1527 | 55.9k | if (cost < costPartBest) |
1528 | 0 | { |
1529 | 0 | costPartBest = cost; |
1530 | 0 | bestDist = estDist; |
1531 | 0 | bestTypeIdx = typeIdx; |
1532 | 0 | } |
1533 | 55.9k | } |
1534 | | 
1535 | 13.9k | if (bestTypeIdx != -1) |
1536 | 0 | { |
1537 | 0 | lclCtuParam->mergeMode = SAO_MERGE_NONE; |
1538 | 0 | lclCtuParam->typeIdx = bestTypeIdx; |
1539 | 0 | lclCtuParam->bandPos = 0; |
1540 | 0 | for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++) |
1541 | 0 | lclCtuParam->offset[classIdx] = m_offset[0][bestTypeIdx][classIdx + 1]; |
1542 | 0 | } |
1543 | | 
1544 | | // BO RDO: refine the offset of every band, then choose the best run of consecutive bands |
1545 | 13.9k | int64_t estDist = 0; |
1546 | 461k | for (int classIdx = 0; classIdx < MAX_NUM_SAO_CLASS; classIdx++) |
1547 | 447k | { |
1548 | 447k | int32_t& count = m_count[0][SAO_BO][classIdx]; |
1549 | 447k | int32_t& offsetOrg = m_offsetOrg[0][SAO_BO][classIdx]; |
1550 | 447k | int32_t& offsetOut = m_offset[0][SAO_BO][classIdx]; |
1551 | | 
1552 | 447k | estIterOffset(SAO_BO, lambda[0], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx]); |
1553 | 447k | } |
1554 | | 
1555 | | // Estimate the best band position: slide a window of four consecutive classes and keep the start index with the lowest summed cost |
1556 | 13.9k | int32_t bestClassBO = 0; |
1557 | 13.9k | int64_t currentRDCost = costClasses[0]; |
1558 | 13.9k | currentRDCost += costClasses[1]; |
1559 | 13.9k | currentRDCost += costClasses[2]; |
1560 | 13.9k | currentRDCost += costClasses[3]; |
1561 | 13.9k | int64_t bestRDCostBO = currentRDCost; |
1562 | | 
1563 | 405k | for (int i = 1; i < MAX_NUM_SAO_CLASS - SAO_NUM_OFFSET + 1; i++) |
1564 | 391k | { |
1565 | 391k | currentRDCost -= costClasses[i - 1]; |
1566 | 391k | currentRDCost += costClasses[i + 3]; |
1567 | | 
1568 | 391k | if (currentRDCost < bestRDCostBO) |
1569 | 0 | { |
1570 | 0 | bestRDCostBO = currentRDCost; |
1571 | 0 | bestClassBO = i; |
1572 | 0 | } |
1573 | 391k | } |
1574 | | 
1575 | 13.9k | estDist = 0; |
1576 | 69.9k | for (int classIdx = bestClassBO; classIdx < bestClassBO + SAO_NUM_OFFSET; classIdx++) |
1577 | 55.9k | estDist += distClasses[classIdx]; |
1578 | | 
1579 | 13.9k | m_entropyCoder.load(m_rdContexts.temp); |
1580 | 13.9k | m_entropyCoder.resetBits(); |
1581 | 13.9k | m_entropyCoder.codeSaoOffsetBO(m_offset[0][SAO_BO] + bestClassBO, bestClassBO, 0); |
1582 | | 
1583 | 13.9k | int64_t cost = calcSaoRdoCost(estDist, m_entropyCoder.getNumberOfWrittenBits(), lambda[0]); |
1584 | | 
1585 | 13.9k | if (cost < costPartBest) |
1586 | 0 | { |
1587 | 0 | costPartBest = cost; |
1588 | 0 | bestDist = estDist; |
1589 | 

1590 | 0 | lclCtuParam->mergeMode = SAO_MERGE_NONE; |
1591 | 0 | lclCtuParam->typeIdx = SAO_BO; |
1592 | 0 | lclCtuParam->bandPos = bestClassBO; |
1593 | 0 | for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++) |
1594 | 0 | lclCtuParam->offset[classIdx] = m_offset[0][SAO_BO][classIdx + bestClassBO]; |
1595 | 0 | } |
1596 | | // code the chosen parameters into the temp context so later estimates start from this state |
1597 | 13.9k | rateDist = (bestDist << 8) / lambda[0]; |
1598 | 13.9k | m_entropyCoder.load(m_rdContexts.temp); |
1599 | 13.9k | m_entropyCoder.codeSaoOffset(*lclCtuParam, 0); |
1600 | 13.9k | m_entropyCoder.store(m_rdContexts.temp); |
1601 | | // bestCost is written here only for 4:0:0 - presumably because no chroma pass follows in that case; confirm against the caller |
1602 | 13.9k | if (m_param->internalCsp == X265_CSP_I400) |
1603 | 0 | { |
1604 | 0 | bestCost = rateDist + m_entropyCoder.getNumberOfWrittenBits(); |
1605 | 0 | } |
1606 | 13.9k | } |
1607 | | void SAO::saoChromaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& rateDist, int64_t* lambda, int64_t &bestCost) |
1608 | 13.9k | { |
1609 | 13.9k | Slice* slice = m_frame->m_encData->m_slice; |
1610 | 13.9k | const CUData* cu = m_frame->m_encData->getPicCTU(addr); |
1611 | 13.9k | int64_t bestDist = 0; |
1612 | 13.9k | int bestTypeIdx = -1; |
1613 | 13.9k | SaoCtuParam* lclCtuParam[2] = { &saoParam->ctuParam[1][addr], &saoParam->ctuParam[2][addr] }; |
1614 | | |
1615 | 13.9k | int64_t costClasses[MAX_NUM_SAO_CLASS]; |
1616 | 13.9k | int32_t distClasses[MAX_NUM_SAO_CLASS]; |
1617 | 13.9k | int32_t bestClassBO[2] = { 0, 0 }; |
1618 | | |
1619 | 13.9k | m_entropyCoder.load(m_rdContexts.temp); |
1620 | 13.9k | m_entropyCoder.resetBits(); |
1621 | 13.9k | m_entropyCoder.codeSaoType(0); |
1622 | | |
1623 | 13.9k | uint32_t bits = m_entropyCoder.getNumberOfWrittenBits(); |
1624 | 13.9k | int64_t costPartBest = calcSaoRdoCost(0, bits, lambda[1]); |
1625 | 13.9k | int maxSaoType; |
1626 | 13.9k | if (m_param->bLimitSAO && ((slice->m_sliceType == P_SLICE && cu->isSkipped(0)) || |
1627 | 0 | (slice->m_sliceType == B_SLICE))) |
1628 | 0 | { |
1629 | 0 | maxSaoType = MAX_NUM_SAO_TYPE - 3; |
1630 | 0 | } |
1631 | 13.9k | else |
1632 | 13.9k | { |
1633 | 13.9k | maxSaoType = MAX_NUM_SAO_TYPE - 1; |
1634 | 13.9k | } |
1635 | | |
1636 | | //EO RDO |
1637 | 69.9k | for (int typeIdx = 0; typeIdx < maxSaoType; typeIdx++) |
1638 | 55.9k | { |
1639 | 55.9k | int64_t estDist[2] = {0, 0}; |
1640 | 167k | for (int compIdx = 1; compIdx < 3; compIdx++) |
1641 | 111k | { |
1642 | 559k | for (int classIdx = 1; classIdx < SAO_NUM_OFFSET + 1; classIdx++) |
1643 | 447k | { |
1644 | 447k | int32_t& count = m_count[compIdx][typeIdx][classIdx]; |
1645 | 447k | int32_t& offsetOrg = m_offsetOrg[compIdx][typeIdx][classIdx]; |
1646 | 447k | int32_t& offsetOut = m_offset[compIdx][typeIdx][classIdx]; |
1647 | | |
1648 | 447k | estIterOffset(typeIdx, lambda[1], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx]); |
1649 | | |
1650 | 447k | estDist[compIdx - 1] += distClasses[classIdx]; |
1651 | 447k | } |
1652 | 111k | } |
1653 | | |
1654 | 55.9k | m_entropyCoder.load(m_rdContexts.temp); |
1655 | 55.9k | m_entropyCoder.resetBits(); |
1656 | | |
1657 | 167k | for (int compIdx = 0; compIdx < 2; compIdx++) |
1658 | 111k | m_entropyCoder.codeSaoOffsetEO(m_offset[compIdx + 1][typeIdx] + 1, typeIdx, compIdx + 1); |
1659 | | |
1660 | 55.9k | uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits(); |
1661 | 55.9k | int64_t cost = calcSaoRdoCost((estDist[0] + estDist[1]), estRate, lambda[1]); |
1662 | | |
1663 | 55.9k | if (cost < costPartBest) |
1664 | 0 | { |
1665 | 0 | costPartBest = cost; |
1666 | 0 | bestDist = (estDist[0] + estDist[1]); |
1667 | 0 | bestTypeIdx = typeIdx; |
1668 | 0 | } |
1669 | 55.9k | } |
1670 | | |
1671 | 13.9k | if (bestTypeIdx != -1) |
1672 | 0 | { |
1673 | 0 | for (int compIdx = 0; compIdx < 2; compIdx++) |
1674 | 0 | { |
1675 | 0 | lclCtuParam[compIdx]->mergeMode = SAO_MERGE_NONE; |
1676 | 0 | lclCtuParam[compIdx]->typeIdx = bestTypeIdx; |
1677 | 0 | lclCtuParam[compIdx]->bandPos = 0; |
1678 | 0 | for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++) |
1679 | 0 | lclCtuParam[compIdx]->offset[classIdx] = m_offset[compIdx + 1][bestTypeIdx][classIdx + 1]; |
1680 | 0 | } |
1681 | 0 | } |
1682 | | |
1683 | | // BO RDO |
1684 | 13.9k | int64_t estDist[2]; |
1685 | | |
1686 | | // Estimate Best Position |
1687 | 41.9k | for (int compIdx = 1; compIdx < 3; compIdx++) |
1688 | 27.9k | { |
1689 | 27.9k | int64_t bestRDCostBO = MAX_INT64; |
1690 | | |
1691 | 923k | for (int classIdx = 0; classIdx < MAX_NUM_SAO_CLASS; classIdx++) |
1692 | 895k | { |
1693 | 895k | int32_t& count = m_count[compIdx][SAO_BO][classIdx]; |
1694 | 895k | int32_t& offsetOrg = m_offsetOrg[compIdx][SAO_BO][classIdx]; |
1695 | 895k | int32_t& offsetOut = m_offset[compIdx][SAO_BO][classIdx]; |
1696 | | |
1697 | 895k | estIterOffset(SAO_BO, lambda[1], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx]); |
1698 | 895k | } |
1699 | | |
1700 | 839k | for (int i = 0; i < MAX_NUM_SAO_CLASS - SAO_NUM_OFFSET + 1; i++) |
1701 | 811k | { |
1702 | 811k | int64_t currentRDCost = 0; |
1703 | 4.05M | for (int j = i; j < i + SAO_NUM_OFFSET; j++) |
1704 | 3.24M | currentRDCost += costClasses[j]; |
1705 | | |
1706 | 811k | if (currentRDCost < bestRDCostBO) |
1707 | 27.9k | { |
1708 | 27.9k | bestRDCostBO = currentRDCost; |
1709 | 27.9k | bestClassBO[compIdx - 1] = i; |
1710 | 27.9k | } |
1711 | 811k | } |
1712 | | |
1713 | 27.9k | estDist[compIdx - 1] = 0; |
1714 | 139k | for (int classIdx = bestClassBO[compIdx - 1]; classIdx < bestClassBO[compIdx - 1] + SAO_NUM_OFFSET; classIdx++) |
1715 | 111k | estDist[compIdx - 1] += distClasses[classIdx]; |
1716 | 27.9k | } |
1717 | | |
1718 | 13.9k | m_entropyCoder.load(m_rdContexts.temp); |
1719 | 13.9k | m_entropyCoder.resetBits(); |
1720 | | |
1721 | 41.9k | for (int compIdx = 0; compIdx < 2; compIdx++) |
1722 | 27.9k | m_entropyCoder.codeSaoOffsetBO(m_offset[compIdx + 1][SAO_BO] + bestClassBO[compIdx], bestClassBO[compIdx], compIdx + 1); |
1723 | | |
1724 | 13.9k | uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits(); |
1725 | 13.9k | int64_t cost = calcSaoRdoCost((estDist[0] + estDist[1]), estRate, lambda[1]); |
1726 | | |
1727 | 13.9k | if (cost < costPartBest) |
1728 | 0 | { |
1729 | 0 | costPartBest = cost; |
1730 | 0 | bestDist = (estDist[0] + estDist[1]); |
1731 | |
|
1732 | 0 | for (int compIdx = 0; compIdx < 2; compIdx++) |
1733 | 0 | { |
1734 | 0 | lclCtuParam[compIdx]->mergeMode = SAO_MERGE_NONE; |
1735 | 0 | lclCtuParam[compIdx]->typeIdx = SAO_BO; |
1736 | 0 | lclCtuParam[compIdx]->bandPos = bestClassBO[compIdx]; |
1737 | 0 | for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++) |
1738 | 0 | lclCtuParam[compIdx]->offset[classIdx] = m_offset[compIdx + 1][SAO_BO][classIdx + bestClassBO[compIdx]]; |
1739 | 0 | } |
1740 | 0 | } |
1741 | | |
1742 | 13.9k | rateDist += (bestDist << 8) / lambda[1]; |
1743 | 13.9k | m_entropyCoder.load(m_rdContexts.temp); |
1744 | | |
1745 | 13.9k | if (saoParam->bSaoFlag[1]) |
1746 | 13.9k | { |
1747 | 13.9k | m_entropyCoder.codeSaoOffset(*lclCtuParam[0], 1); |
1748 | 13.9k | m_entropyCoder.codeSaoOffset(*lclCtuParam[1], 2); |
1749 | 13.9k | m_entropyCoder.store(m_rdContexts.temp); |
1750 | | |
1751 | 13.9k | uint32_t rate = m_entropyCoder.getNumberOfWrittenBits(); |
1752 | 13.9k | bestCost = rateDist + rate; |
1753 | 13.9k | } |
1754 | 0 | else |
1755 | 0 | { |
1756 | 0 | uint32_t rate = m_entropyCoder.getNumberOfWrittenBits(); |
1757 | 0 | bestCost = rateDist + rate; |
1758 | 0 | } |
1759 | 13.9k | } |
1760 | | |
1761 | | // NOTE: must put in namespace X265_NS since we need class SAO |
1762 | | void saoCuStatsBO_c(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count) |
1763 | 41.9k | { |
1764 | 41.9k | const int boShift = X265_DEPTH - SAO_BO_BITS; |
1765 | | |
1766 | 907k | for (int y = 0; y < endY; y++) |
1767 | 866k | { |
1768 | 28.6M | for (int x = 0; x < endX; x++) |
1769 | 27.7M | { |
1770 | 27.7M | int classIdx = rec[x] >> boShift; |
1771 | 27.7M | stats[classIdx] += diff[x]; |
1772 | 27.7M | count[classIdx]++; |
1773 | 27.7M | } |
1774 | | |
1775 | 866k | diff += MAX_CU_SIZE; |
1776 | 866k | rec += stride; |
1777 | 866k | } |
1778 | 41.9k | } |
1779 | | |
1780 | | void saoCuStatsE0_c(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count) |
1781 | 41.9k | { |
1782 | 41.9k | int32_t tmp_stats[SAO::NUM_EDGETYPE]; |
1783 | 41.9k | int32_t tmp_count[SAO::NUM_EDGETYPE]; |
1784 | | |
1785 | 41.9k | X265_CHECK(endX <= MAX_CU_SIZE, "endX too big\n"); |
1786 | | |
1787 | 41.9k | memset(tmp_stats, 0, sizeof(tmp_stats)); |
1788 | 41.9k | memset(tmp_count, 0, sizeof(tmp_count)); |
1789 | | |
1790 | 883k | for (int y = 0; y < endY; y++) |
1791 | 841k | { |
1792 | 841k | int signLeft = signOf(rec[0] - rec[-1]); |
1793 | 27.4M | for (int x = 0; x < endX; x++) |
1794 | 26.6M | { |
1795 | 26.6M | int signRight = signOf2(rec[x], rec[x + 1]); |
1796 | 26.6M | X265_CHECK(signRight == signOf(rec[x] - rec[x + 1]), "signDown check failure\n"); |
1797 | 26.6M | uint32_t edgeType = signRight + signLeft + 2; |
1798 | 26.6M | signLeft = -signRight; |
1799 | | |
1800 | 26.6M | X265_CHECK(edgeType <= 4, "edgeType check failure\n"); |
1801 | 26.6M | tmp_stats[edgeType] += diff[x]; |
1802 | 26.6M | tmp_count[edgeType]++; |
1803 | 26.6M | } |
1804 | | |
1805 | 841k | diff += MAX_CU_SIZE; |
1806 | 841k | rec += stride; |
1807 | 841k | } |
1808 | | |
1809 | 251k | for (int x = 0; x < SAO::NUM_EDGETYPE; x++) |
1810 | 209k | { |
1811 | 209k | stats[SAO::s_eoTable[x]] += tmp_stats[x]; |
1812 | 209k | count[SAO::s_eoTable[x]] += tmp_count[x]; |
1813 | 209k | } |
1814 | 41.9k | } |
1815 | | |
1816 | | void saoCuStatsE1_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count) |
1817 | 41.9k | { |
1818 | 41.9k | X265_CHECK(endX <= MAX_CU_SIZE, "endX check failure\n"); |
1819 | 41.9k | X265_CHECK(endY <= MAX_CU_SIZE, "endY check failure\n"); |
1820 | | |
1821 | 41.9k | int32_t tmp_stats[SAO::NUM_EDGETYPE]; |
1822 | 41.9k | int32_t tmp_count[SAO::NUM_EDGETYPE]; |
1823 | | |
1824 | 41.9k | memset(tmp_stats, 0, sizeof(tmp_stats)); |
1825 | 41.9k | memset(tmp_count, 0, sizeof(tmp_count)); |
1826 | | |
1827 | 41.9k | X265_CHECK(endX * endY <= (4096 - 16), "Assembly of saoE1 may overflow with this block size\n"); |
1828 | 889k | for (int y = 0; y < endY; y++) |
1829 | 847k | { |
1830 | 28.1M | for (int x = 0; x < endX; x++) |
1831 | 27.3M | { |
1832 | 27.3M | int signDown = signOf2(rec[x], rec[x + stride]); |
1833 | 27.3M | X265_CHECK(signDown == signOf(rec[x] - rec[x + stride]), "signDown check failure\n"); |
1834 | 27.3M | uint32_t edgeType = signDown + upBuff1[x] + 2; |
1835 | 27.3M | upBuff1[x] = (int8_t)(-signDown); |
1836 | | |
1837 | 27.3M | X265_CHECK(edgeType <= 4, "edgeType check failure\n"); |
1838 | 27.3M | tmp_stats[edgeType] += diff[x]; |
1839 | 27.3M | tmp_count[edgeType]++; |
1840 | 27.3M | } |
1841 | 847k | diff += MAX_CU_SIZE; |
1842 | 847k | rec += stride; |
1843 | 847k | } |
1844 | | |
1845 | 251k | for (int x = 0; x < SAO::NUM_EDGETYPE; x++) |
1846 | 209k | { |
1847 | 209k | stats[SAO::s_eoTable[x]] += tmp_stats[x]; |
1848 | 209k | count[SAO::s_eoTable[x]] += tmp_count[x]; |
1849 | 209k | } |
1850 | 41.9k | } |
1851 | | |
1852 | | void saoCuStatsE2_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count) |
1853 | 41.9k | { |
1854 | 41.9k | X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n"); |
1855 | 41.9k | X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n"); |
1856 | | |
1857 | 41.9k | int32_t tmp_stats[SAO::NUM_EDGETYPE]; |
1858 | 41.9k | int32_t tmp_count[SAO::NUM_EDGETYPE]; |
1859 | | |
1860 | 41.9k | memset(tmp_stats, 0, sizeof(tmp_stats)); |
1861 | 41.9k | memset(tmp_count, 0, sizeof(tmp_count)); |
1862 | | |
1863 | 889k | for (int y = 0; y < endY; y++) |
1864 | 847k | { |
1865 | 847k | upBufft[0] = signOf(rec[stride] - rec[-1]); |
1866 | 27.7M | for (int x = 0; x < endX; x++) |
1867 | 26.8M | { |
1868 | 26.8M | int signDown = signOf2(rec[x], rec[x + stride + 1]); |
1869 | 26.8M | X265_CHECK(signDown == signOf(rec[x] - rec[x + stride + 1]), "signDown check failure\n"); |
1870 | 26.8M | uint32_t edgeType = signDown + upBuff1[x] + 2; |
1871 | 26.8M | upBufft[x + 1] = (int8_t)(-signDown); |
1872 | 26.8M | tmp_stats[edgeType] += diff[x]; |
1873 | 26.8M | tmp_count[edgeType]++; |
1874 | 26.8M | } |
1875 | | |
1876 | 847k | std::swap(upBuff1, upBufft); |
1877 | | |
1878 | 847k | rec += stride; |
1879 | 847k | diff += MAX_CU_SIZE; |
1880 | 847k | } |
1881 | | |
1882 | 251k | for (int x = 0; x < SAO::NUM_EDGETYPE; x++) |
1883 | 209k | { |
1884 | 209k | stats[SAO::s_eoTable[x]] += tmp_stats[x]; |
1885 | 209k | count[SAO::s_eoTable[x]] += tmp_count[x]; |
1886 | 209k | } |
1887 | 41.9k | } |
1888 | | |
1889 | | void saoCuStatsE3_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count) |
1890 | 41.9k | { |
1891 | 41.9k | X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n"); |
1892 | 41.9k | X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n"); |
1893 | | |
1894 | 41.9k | int32_t tmp_stats[SAO::NUM_EDGETYPE]; |
1895 | 41.9k | int32_t tmp_count[SAO::NUM_EDGETYPE]; |
1896 | | |
1897 | 41.9k | memset(tmp_stats, 0, sizeof(tmp_stats)); |
1898 | 41.9k | memset(tmp_count, 0, sizeof(tmp_count)); |
1899 | | |
1900 | 889k | for (int y = 0; y < endY; y++) |
1901 | 847k | { |
1902 | 27.7M | for (int x = 0; x < endX; x++) |
1903 | 26.8M | { |
1904 | 26.8M | int signDown = signOf2(rec[x], rec[x + stride - 1]); |
1905 | 26.8M | X265_CHECK(signDown == signOf(rec[x] - rec[x + stride - 1]), "signDown check failure\n"); |
1906 | 26.8M | X265_CHECK(abs(upBuff1[x]) <= 1, "upBuffer1 check failure\n"); |
1907 | | |
1908 | 26.8M | uint32_t edgeType = signDown + upBuff1[x] + 2; |
1909 | 26.8M | upBuff1[x - 1] = (int8_t)(-signDown); |
1910 | 26.8M | tmp_stats[edgeType] += diff[x]; |
1911 | 26.8M | tmp_count[edgeType]++; |
1912 | 26.8M | } |
1913 | | |
1914 | 847k | upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]); |
1915 | | |
1916 | 847k | rec += stride; |
1917 | 847k | diff += MAX_CU_SIZE; |
1918 | 847k | } |
1919 | | |
1920 | 251k | for (int x = 0; x < SAO::NUM_EDGETYPE; x++) |
1921 | 209k | { |
1922 | 209k | stats[SAO::s_eoTable[x]] += tmp_stats[x]; |
1923 | 209k | count[SAO::s_eoTable[x]] += tmp_count[x]; |
1924 | 209k | } |
1925 | 41.9k | } |
1926 | | |
1927 | | void setupSaoPrimitives_c(EncoderPrimitives &p) |
1928 | 1 | { |
1929 | | // TODO: move other sao functions to here |
1930 | 1 | p.saoCuStatsBO = saoCuStatsBO_c; |
1931 | 1 | p.saoCuStatsE0 = saoCuStatsE0_c; |
1932 | 1 | p.saoCuStatsE1 = saoCuStatsE1_c; |
1933 | 1 | p.saoCuStatsE2 = saoCuStatsE2_c; |
1934 | 1 | p.saoCuStatsE3 = saoCuStatsE3_c; |
1935 | 1 | } |
1936 | | } |
1937 | | |