Coverage Report

Created: 2022-08-24 06:17

/src/x265/source/encoder/sao.cpp
Line
Count
Source (jump to first uncovered line)
1
/*****************************************************************************
2
 * Copyright (C) 2013-2020 MulticoreWare, Inc
3
 *
4
 * Authors: Steve Borho <steve@borho.org>
5
 *          Min Chen <chenm003@163.com>
6
 *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
7
 *
8
 * This program is free software; you can redistribute it and/or modify
9
 * it under the terms of the GNU General Public License as published by
10
 * the Free Software Foundation; either version 2 of the License, or
11
 * (at your option) any later version.
12
 *
13
 * This program is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
 * GNU General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU General Public License
19
 * along with this program; if not, write to the Free Software
20
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
21
 *
22
 * This program is also available under a commercial proprietary license.
23
 * For more information, contact us at license @ x265.com.
24
 *****************************************************************************/
25
26
#include "common.h"
27
#include "frame.h"
28
#include "framedata.h"
29
#include "picyuv.h"
30
#include "sao.h"
31
32
namespace {
33
34
/* Divide num by den and round to the nearest integer. For a positive den,
 * ties are rounded away from zero, and a negative num gives the mirror image
 * of the result for -num.
 *
 * The doubling of num and den and the negation of a negative num are carried
 * out in 64 bits, so none of them can overflow int32_t. den must be non-zero.
 * The quotient is converted back to int32_t on return. */
inline int32_t roundIBDI(int32_t num, int32_t den)
{
    const int64_t num2 = (int64_t)num * 2;
    const int64_t den2 = (int64_t)den * 2;

    if (num >= 0)
        return (int32_t)((num2 + den) / den2);

    // negative numerator: round the magnitude, then restore the sign
    return (int32_t)(-((-num2 + den) / den2));
}
38
39
/* Return the sign of x: -1 for negative, 0 for zero, +1 for positive
 * (TODO: this is a dup, make common).
 *
 * Implemented with comparisons instead of shifting and negating x, so the
 * result does not depend on the width of int or on how negative values are
 * right-shifted, and x == INT_MIN is handled without negating it. */
inline int8_t signOf(int x)
{
    return (int8_t)((x > 0) - (x < 0));
}
44
45
/* Three-way comparison of a and b: returns -1 when a < b, 1 when a > b and
 * 0 when they are equal. */
inline int signOf2(const int a, const int b)
46
0
{
47
    // NOTE: do not reorder the two comparisons below; according to the original
    // author the code generated by ICL, VC and GCC depends strongly on their order.
48
0
    int r = 0;
49
0
    if (a < b)
50
0
        r = -1;
51
0
    if (a > b)
52
0
        r = 1;
53
0
    return r;
54
0
}
55
56
/* Evaluate (count * offset - 2 * offsetOrg) * offset.
 *
 * Every operand is widened to int64_t before multiplying, so the products are
 * formed in 64 bits and match the int64_t return type instead of being
 * computed (and possibly overflowing) in 32 bits first. */
inline int64_t estSaoDist(int32_t count, int32_t offset, int32_t offsetOrg)
{
    const int64_t inner = (int64_t)count * offset - (int64_t)offsetOrg * 2;
    return inner * offset;
}
60
} // end anonymous namespace
61
62
63
namespace X265_NS {
64
65
/* Lookup table with NUM_EDGETYPE entries, indexed by edge type.
 * generateLumaOffsets() and generateChromaOffsets() use it as
 * offset[s_eoTable[edgeType]], where offset[0] is 0 and offset[1..] hold the
 * CTU's SAO offsets; edge type 2 therefore selects the zero offset.
 * The trailing comment on each entry is its index (the edge type). */
const uint32_t SAO::s_eoTable[NUM_EDGETYPE] =
66
{
67
    1, // 0
68
    2, // 1
69
    0, // 2
70
    3, // 3
71
    4  // 4
72
};
73
74
/* Construct an SAO object with the reference depth at 0 and every buffer
 * pointer cleared, so that destroy() can recognise buffers that were never
 * allocated. */
SAO::SAO()
{
    m_param = NULL;
    m_refDepth = 0;

    // tables and statistics that create() allocates or createFromRootNode() borrows
    m_clipTableBase = NULL;
    m_clipTable = NULL;
    m_depthSaoRate = NULL;
    m_countPreDblk = NULL;
    m_offsetOrgPreDblk = NULL;

    // per-plane line buffers
    for (int plane = 0; plane < 3; plane++)
    {
        m_tmpU[plane] = NULL;
        m_tmpL1[plane] = NULL;
        m_tmpL2[plane] = NULL;
    }
}
93
94
/* Set up this SAO instance for the given encoder parameters.
 *
 * Always allocates the per-plane line buffers (m_tmpL1, m_tmpL2, m_tmpU) for
 * one plane (X265_CSP_I400) or three planes. When initCommon is non-zero it
 * also allocates and initialises the common tables (pre-deblock statistics,
 * m_depthSaoRate and the clip table); otherwise the common pointers are reset
 * to NULL and are expected to be filled in later (see createFromRootNode()).
 *
 * Returns true on success and false when an allocation fails (CHECKED_MALLOC
 * presumably jumps to the fail label; nothing is freed here, that is left to
 * destroy()). */
bool SAO::create(x265_param* param, int initCommon)
{
    m_param = param;
    m_chromaFormat = param->internalCsp;
    m_hChromaShift = CHROMA_H_SHIFT(param->internalCsp);
    m_vChromaShift = CHROMA_V_SHIFT(param->internalCsp);

    // picture size in CTUs, rounded up
    m_numCuInWidth =  (m_param->sourceWidth + m_param->maxCUSize - 1) / m_param->maxCUSize;
    m_numCuInHeight = (m_param->sourceHeight + m_param->maxCUSize - 1) / m_param->maxCUSize;

    // NOTE: all function-scope locals are declared before the first CHECKED_MALLOC
    const pixel maxY = (1 << X265_DEPTH) - 1;
    const pixel rangeExt = maxY >> 1;
    int numCtu = m_numCuInWidth * m_numCuInHeight;

    for (int i = 0; i < (param->internalCsp != X265_CSP_I400 ? 3 : 1); i++)
    {
        CHECKED_MALLOC(m_tmpL1[i], pixel, m_param->maxCUSize + 1);
        CHECKED_MALLOC(m_tmpL2[i], pixel, m_param->maxCUSize + 1);

        // SAO asm code will read 1 pixel before and after the row, so pad by 2.
        // NOTE: sourceWidth + 2 would be enough; the row is rounded up to whole CTUs and
        // given 32 extra pixels to avoid a condition check in copySaoAboveRef().
        CHECKED_MALLOC(m_tmpU[i], pixel, m_numCuInWidth * m_param->maxCUSize + 2 + 32);
        m_tmpU[i] += 1; // skip the leading pad pixel; destroy() frees m_tmpU[i] - 1
    }

    if (initCommon)
    {
        if (m_param->bSaoNonDeblocked)
        {
            CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
            CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
        }
        CHECKED_MALLOC(m_depthSaoRate, double, 2 * SAO_DEPTHRATE_SIZE);

        // zero the whole table (two rows of SAO_DEPTHRATE_SIZE entries)
        for (int i = 0; i < 2 * SAO_DEPTHRATE_SIZE; i++)
            m_depthSaoRate[i] = 0;

        CHECKED_MALLOC(m_clipTableBase,  pixel, maxY + 2 * rangeExt);
        m_clipTable = &(m_clipTableBase[rangeExt]);

        // Share with fast clip lookup table

        // entries below zero clip to 0
        for (int i = 0; i < rangeExt; i++)
            m_clipTableBase[i] = 0;

        // in-range entries map to themselves
        for (int i = 0; i < maxY; i++)
            m_clipTable[i] = (pixel)i;

        // entries at or above maxY clip to maxY
        for (int i = maxY; i < maxY + rangeExt; i++)
            m_clipTable[i] = maxY;

    }
    else
    {
        // must initialize these common pointer outside of function
        m_countPreDblk = NULL;
        m_offsetOrgPreDblk = NULL;
        m_clipTableBase = NULL;
        m_clipTable = NULL;
    }

    return true;

fail:
    return false;
}
166
167
/* Borrow the common tables from another SAO instance instead of allocating
 * them: the pointers are copied from root, so both objects refer to the same
 * memory afterwards. */
void SAO::createFromRootNode(SAO* root)
{
    // none of the common pointers may have been set on this instance yet
    X265_CHECK(m_countPreDblk == NULL, "duplicate initialize on m_countPreDblk");
    X265_CHECK(m_offsetOrgPreDblk == NULL, "duplicate initialize on m_offsetOrgPreDblk");
    X265_CHECK(m_depthSaoRate == NULL, "duplicate initialize on m_depthSaoRate");
    X265_CHECK(m_clipTableBase == NULL, "duplicate initialize on m_clipTableBase");
    X265_CHECK(m_clipTable == NULL, "duplicate initialize on m_clipTable");

    const SAO& shared = *root;

    m_clipTable        = shared.m_clipTable;
    m_clipTableBase    = shared.m_clipTableBase; // Unnecessary
    m_depthSaoRate     = shared.m_depthSaoRate;
    m_offsetOrgPreDblk = shared.m_offsetOrgPreDblk;
    m_countPreDblk     = shared.m_countPreDblk;
}
181
182
void SAO::destroy(int destoryCommon)
183
0
{
184
0
    for (int i = 0; i < 3; i++)
185
0
    {
186
0
        if (m_tmpL1[i])
187
0
        {
188
0
            X265_FREE(m_tmpL1[i]);
189
0
            m_tmpL1[i] = NULL;
190
0
        }
191
192
0
        if (m_tmpL2[i])
193
0
        {
194
0
            X265_FREE(m_tmpL2[i]);
195
0
            m_tmpL2[i] = NULL;
196
0
        }
197
198
0
        if (m_tmpU[i])
199
0
        {
200
0
            X265_FREE(m_tmpU[i] - 1);
201
0
            m_tmpU[i] = NULL;
202
0
        }
203
0
    }
204
205
0
    if (destoryCommon)
206
0
    {
207
0
        if (m_param->bSaoNonDeblocked)
208
0
        {
209
0
            X265_FREE_ZERO(m_countPreDblk);
210
0
            X265_FREE_ZERO(m_offsetOrgPreDblk);
211
0
        }
212
0
        X265_FREE_ZERO(m_depthSaoRate);
213
0
        X265_FREE_ZERO(m_clipTableBase);
214
0
    }
215
0
}
216
217
/* allocate memory for SAO parameters */
218
void SAO::allocSaoParam(SAOParam* saoParam) const
219
0
{
220
0
    int planes = (m_param->internalCsp != X265_CSP_I400) ? 3 : 1;
221
0
    saoParam->numCuInWidth  = m_numCuInWidth;
222
223
0
    for (int i = 0; i < planes; i++)
224
0
        saoParam->ctuParam[i] = new SaoCtuParam[m_numCuInHeight * m_numCuInWidth];
225
0
}
226
227
/* Prepare this SAO instance for a new frame/slice.
 *
 * Derives m_refDepth from the slice type, reloads the entropy coder and RD
 * contexts from initState, makes sure the frame has an SAOParam (allocating
 * one on first use) and decides whether luma/chroma SAO is enabled. */
void SAO::startSlice(Frame* frame, Entropy& initState)
{
    m_frame = frame;

    // reference depth: 0 for I, 1 for P, 2 or 3 for B (3 when the frame is not referenced)
    const int sliceType = m_frame->m_encData->m_slice->m_sliceType;
    if (sliceType == I_SLICE)
        m_refDepth = 0;
    else if (sliceType == P_SLICE)
        m_refDepth = 1;
    else if (sliceType == B_SLICE)
        m_refDepth = 2 + !IS_REFERENCED(frame);

    m_entropyCoder.load(initState);
    m_rdContexts.next.load(initState);
    m_rdContexts.cur.load(initState);

    // lazily create the per-frame SAO parameters
    SAOParam* saoParam = frame->m_encData->m_saoParam;
    if (saoParam == NULL)
    {
        saoParam = new SAOParam;
        allocSaoParam(saoParam);
        frame->m_encData->m_saoParam = saoParam;
    }

    // luma is on by default; chroma only when neither the encoder nor the source is 4:0:0
    saoParam->bSaoFlag[0] = true;
    saoParam->bSaoFlag[1] = m_param->internalCsp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400;

    m_numNoSao[0] = 0; // Luma
    m_numNoSao[1] = 0; // Chroma

    // NOTE: Allow SAO automatic turn-off only when frame parallelism is disabled.
    if (m_param->frameNumThreads == 1 && m_refDepth > 0)
    {
        // compare against the rate recorded for the previous depth
        if (m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + m_refDepth - 1] > SAO_ENCODING_RATE)
            saoParam->bSaoFlag[0] = false;
        if (m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + m_refDepth - 1] > SAO_ENCODING_RATE_CHROMA)
            saoParam->bSaoFlag[1] = false;
    }
}
272
273
// CTU-based SAO process without slice granularity
274
void SAO::applyPixelOffsets(int addr, int typeIdx, int plane)
275
0
{
276
0
    PicYuv* reconPic = m_frame->m_reconPic;
277
0
    pixel* rec = reconPic->getPlaneAddr(plane, addr);
278
0
    intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride;
279
0
    uint32_t picWidth  = m_param->sourceWidth;
280
0
    uint32_t picHeight = m_param->sourceHeight;
281
0
    const CUData* cu = m_frame->m_encData->getPicCTU(addr);
282
0
    int ctuWidth = m_param->maxCUSize;
283
0
    int ctuHeight = m_param->maxCUSize;
284
0
    uint32_t lpelx = cu->m_cuPelX;
285
0
    uint32_t tpely = cu->m_cuPelY;
286
0
    const uint32_t firstRowInSlice = cu->m_bFirstRowInSlice;
287
0
    const uint32_t lastRowInSlice = cu->m_bLastRowInSlice;
288
0
    const uint32_t bAboveUnavail = (!tpely) | firstRowInSlice;
289
290
    // NOTE: Careful! the picHeight for Equal operator only, so I may safe to hack it
291
0
    if (lastRowInSlice)
292
0
    {
293
0
        picHeight = x265_min(picHeight, (tpely + ctuHeight));
294
0
    }
295
296
0
    if (plane)
297
0
    {
298
0
        picWidth  >>= m_hChromaShift;
299
0
        picHeight >>= m_vChromaShift;
300
0
        ctuWidth  >>= m_hChromaShift;
301
0
        ctuHeight >>= m_vChromaShift;
302
0
        lpelx     >>= m_hChromaShift;
303
0
        tpely     >>= m_vChromaShift;
304
0
    }
305
0
    uint32_t rpelx = x265_min(lpelx + ctuWidth,  picWidth);
306
0
    uint32_t bpely = x265_min(tpely + ctuHeight, picHeight);
307
0
    ctuWidth  = rpelx - lpelx;
308
0
    ctuHeight = bpely - tpely;
309
310
0
    int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1, signLeft1[2];
311
0
    int8_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
312
313
0
    memset(_upBuff1 + MAX_CU_SIZE, 0, 2 * sizeof(int8_t)); /* avoid valgrind uninit warnings */
314
315
0
    pixel* tmpL = m_tmpL1[plane];
316
0
    pixel* tmpU = &(m_tmpU[plane][lpelx]);
317
318
0
    int8_t* offsetEo = m_offsetEo[plane];
319
320
0
    switch (typeIdx)
321
0
    {
322
0
    case SAO_EO_0: // dir: -
323
0
    {
324
0
        pixel firstPxl = 0, lastPxl = 0, row1FirstPxl = 0, row1LastPxl = 0;
325
0
        int startX = !lpelx;
326
0
        int endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
327
0
        if (ctuWidth & 15)
328
0
        {
329
0
            for (int y = 0; y < ctuHeight; y++, rec += stride)
330
0
            {
331
0
                int signLeft = signOf(rec[startX] - tmpL[y]);
332
0
                for (int x = startX; x < endX; x++)
333
0
                {
334
0
                    int signRight = signOf(rec[x] - rec[x + 1]);
335
0
                    int edgeType = signRight + signLeft + 2;
336
0
                    signLeft = -signRight;
337
338
0
                    rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
339
0
                }
340
0
            }
341
0
        }
342
0
        else
343
0
        {
344
0
            for (int y = 0; y < ctuHeight; y += 2, rec += 2 * stride)
345
0
            {
346
0
                signLeft1[0] = signOf(rec[startX] - tmpL[y]);
347
0
                signLeft1[1] = signOf(rec[stride + startX] - tmpL[y + 1]);
348
349
0
                if (!lpelx)
350
0
                {
351
0
                    firstPxl = rec[0];
352
0
                    row1FirstPxl = rec[stride];
353
0
                }
354
355
0
                if (rpelx == picWidth)
356
0
                {
357
0
                    lastPxl = rec[ctuWidth - 1];
358
0
                    row1LastPxl = rec[stride + ctuWidth - 1];
359
0
                }
360
361
0
                primitives.saoCuOrgE0(rec, offsetEo, ctuWidth, signLeft1, stride);
362
363
0
                if (!lpelx)
364
0
                {
365
0
                    rec[0] = firstPxl;
366
0
                    rec[stride] = row1FirstPxl;
367
0
                }
368
369
0
                if (rpelx == picWidth)
370
0
                {
371
0
                    rec[ctuWidth - 1] = lastPxl;
372
0
                    rec[stride + ctuWidth - 1] = row1LastPxl;
373
0
                }
374
0
            }
375
0
        }
376
0
        break;
377
0
    }
378
0
    case SAO_EO_1: // dir: |
379
0
    {
380
0
        int startY = bAboveUnavail;
381
0
        int endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
382
0
        if (startY)
383
0
            rec += stride;
384
385
0
        if (ctuWidth & 15)
386
0
        {
387
0
            for (int x = 0; x < ctuWidth; x++)
388
0
                upBuff1[x] = signOf(rec[x] - tmpU[x]);
389
390
0
            for (int y = startY; y < endY; y++, rec += stride)
391
0
            {
392
0
                for (int x = 0; x < ctuWidth; x++)
393
0
                {
394
0
                    int8_t signDown = signOf(rec[x] - rec[x + stride]);
395
0
                    int edgeType = signDown + upBuff1[x] + 2;
396
0
                    upBuff1[x] = -signDown;
397
398
0
                    rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
399
0
                }
400
0
            }
401
0
        }
402
0
        else
403
0
        {
404
0
            primitives.sign(upBuff1, rec, tmpU, ctuWidth);
405
406
0
            int diff = (endY - startY) % 2;
407
0
            for (int y = startY; y < endY - diff; y += 2, rec += 2 * stride)
408
0
                primitives.saoCuOrgE1_2Rows(rec, upBuff1, offsetEo, stride, ctuWidth);
409
410
0
            if (diff & 1)
411
0
                primitives.saoCuOrgE1(rec, upBuff1, offsetEo, stride, ctuWidth);
412
0
        }
413
414
0
        break;
415
0
    }
416
0
    case SAO_EO_2: // dir: 135
417
0
    {
418
0
        int startX = !lpelx;
419
0
        int endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
420
421
0
        int startY = bAboveUnavail;
422
0
        int endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
423
424
0
        if (startY)
425
0
            rec += stride;
426
427
0
        if (!(ctuWidth & 15))
428
0
        {
429
0
            int8_t firstSign, lastSign;
430
431
0
            if (!lpelx)
432
0
                firstSign = upBuff1[0];
433
434
0
            if (rpelx == picWidth)
435
0
                lastSign = upBuff1[ctuWidth - 1];
436
437
0
            primitives.sign(upBuff1, rec, &tmpU[- 1], ctuWidth);
438
439
0
            if (!lpelx)
440
0
                upBuff1[0] = firstSign;
441
442
0
            if (rpelx == picWidth)
443
0
                upBuff1[ctuWidth - 1] = lastSign;
444
0
        }
445
0
        else
446
0
        {
447
0
            for (int x = startX; x < endX; x++)
448
0
                upBuff1[x] = signOf(rec[x] - tmpU[x - 1]);
449
0
        }
450
451
0
        if (ctuWidth & 15)
452
0
        {
453
0
             for (int y = startY; y < endY; y++, rec += stride)
454
0
             {
455
0
                 upBufft[startX] = signOf(rec[stride + startX] - tmpL[y]);
456
0
                 for (int x = startX; x < endX; x++)
457
0
                 {
458
0
                     int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
459
0
                     int edgeType = signDown + upBuff1[x] + 2;
460
0
                     upBufft[x + 1] = -signDown;
461
0
                     rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
462
0
                 }
463
464
0
                 std::swap(upBuff1, upBufft);
465
0
             }
466
0
        }
467
0
        else
468
0
        {
469
0
            for (int y = startY; y < endY; y++, rec += stride)
470
0
            {
471
0
                int8_t iSignDown2 = signOf(rec[stride + startX] - tmpL[y]);
472
473
0
                primitives.saoCuOrgE2[endX > 16](rec + startX, upBufft + startX, upBuff1 + startX, offsetEo, endX - startX, stride);
474
475
0
                upBufft[startX] = iSignDown2;
476
477
0
                std::swap(upBuff1, upBufft);
478
0
            }
479
0
        }
480
0
        break;
481
0
    }
482
0
    case SAO_EO_3: // dir: 45
483
0
    {
484
0
        int startX = !lpelx;
485
0
        int endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
486
487
0
        int startY = bAboveUnavail;
488
0
        int endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
489
490
0
        if (startY)
491
0
            rec += stride;
492
493
0
        if (ctuWidth & 15)
494
0
        {
495
0
            for (int x = startX - 1; x < endX; x++)
496
0
                upBuff1[x] = signOf(rec[x] - tmpU[x + 1]);
497
498
0
            for (int y = startY; y < endY; y++, rec += stride)
499
0
            {
500
0
                int x = startX;
501
0
                int8_t signDown = signOf(rec[x] - tmpL[y + 1]);
502
0
                int edgeType = signDown + upBuff1[x] + 2;
503
0
                upBuff1[x - 1] = -signDown;
504
0
                rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
505
506
0
                for (x = startX + 1; x < endX; x++)
507
0
                {
508
0
                    signDown = signOf(rec[x] - rec[x + stride - 1]);
509
0
                    edgeType = signDown + upBuff1[x] + 2;
510
0
                    upBuff1[x - 1] = -signDown;
511
0
                    rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
512
0
                }
513
514
0
                upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
515
0
            }
516
0
        }
517
0
        else
518
0
        {
519
0
            int8_t firstSign, lastSign;
520
521
0
            if (lpelx)
522
0
                firstSign = signOf(rec[-1] - tmpU[0]);
523
0
            if (rpelx == picWidth)
524
0
                lastSign = upBuff1[ctuWidth - 1];
525
526
0
            primitives.sign(upBuff1, rec, &tmpU[1], ctuWidth);
527
528
0
            if (lpelx)
529
0
                upBuff1[-1] = firstSign;
530
0
            if (rpelx == picWidth)
531
0
                upBuff1[ctuWidth - 1] = lastSign;
532
533
0
            for (int y = startY; y < endY; y++, rec += stride)
534
0
            {
535
0
                int x = startX;
536
0
                int8_t signDown = signOf(rec[x] - tmpL[y + 1]);
537
0
                int edgeType = signDown + upBuff1[x] + 2;
538
0
                upBuff1[x - 1] = -signDown;
539
0
                rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
540
541
0
                primitives.saoCuOrgE3[endX > 16](rec, upBuff1, offsetEo, stride - 1, startX, endX);
542
543
0
                upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
544
0
            }
545
0
        }
546
547
0
        break;
548
0
    }
549
0
    case SAO_BO:
550
0
    {
551
0
        const int8_t* offsetBo = m_offsetBo[plane];
552
553
0
        if (ctuWidth & 15)
554
0
        {
555
0
            #define SAO_BO_BITS 5
556
0
            const int boShift = X265_DEPTH - SAO_BO_BITS;
557
558
0
            for (int y = 0; y < ctuHeight; y++, rec += stride)
559
0
                for (int x = 0; x < ctuWidth; x++)
560
0
                    rec[x] = x265_clip(rec[x] + offsetBo[rec[x] >> boShift]);
561
0
        }
562
0
        else
563
0
            primitives.saoCuOrgB0(rec, offsetBo, ctuWidth, ctuHeight, stride);
564
565
0
        break;
566
0
    }
567
0
    default: break;
568
0
    }
569
0
}
570
571
/* Process SAO unit */
572
void SAO::generateLumaOffsets(SaoCtuParam* ctuParam, int idxY, int idxX)
573
0
{
574
0
    PicYuv* reconPic = m_frame->m_reconPic;
575
0
    intptr_t stride = reconPic->m_stride;
576
0
    int ctuWidth = m_param->maxCUSize;
577
0
    int ctuHeight = m_param->maxCUSize;
578
579
0
    int addr = idxY * m_numCuInWidth + idxX;
580
0
    pixel* rec = reconPic->getLumaAddr(addr);
581
582
0
    if (idxX == 0)
583
0
    {
584
0
        for (int i = 0; i < ctuHeight + 1; i++)
585
0
        {
586
0
            m_tmpL1[0][i] = rec[0];
587
0
            rec += stride;
588
0
        }
589
0
    }
590
591
0
    bool mergeLeftFlag = (ctuParam[addr].mergeMode == SAO_MERGE_LEFT);
592
0
    int typeIdx = ctuParam[addr].typeIdx;
593
594
0
    if (idxX != (m_numCuInWidth - 1))
595
0
    {
596
0
        rec = reconPic->getLumaAddr(addr);
597
0
        for (int i = 0; i < ctuHeight + 1; i++)
598
0
        {
599
0
            m_tmpL2[0][i] = rec[ctuWidth - 1];
600
0
            rec += stride;
601
0
        }
602
0
    }
603
604
0
    if (typeIdx >= 0)
605
0
    {
606
0
        if (!mergeLeftFlag)
607
0
        {
608
0
            if (typeIdx == SAO_BO)
609
0
            {
610
0
                memset(m_offsetBo[0], 0, sizeof(m_offsetBo[0]));
611
612
0
                for (int i = 0; i < SAO_NUM_OFFSET; i++)
613
0
                    m_offsetBo[0][((ctuParam[addr].bandPos + i) & (MAX_NUM_SAO_CLASS - 1))] = (int8_t)(ctuParam[addr].offset[i] << SAO_BIT_INC);
614
0
            }
615
0
            else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3)
616
0
            {
617
0
                int offset[NUM_EDGETYPE];
618
0
                offset[0] = 0;
619
0
                for (int i = 0; i < SAO_NUM_OFFSET; i++)
620
0
                    offset[i + 1] = ctuParam[addr].offset[i] << SAO_BIT_INC;
621
622
0
                for (int edgeType = 0; edgeType < NUM_EDGETYPE; edgeType++)
623
0
                    m_offsetEo[0][edgeType] = (int8_t)offset[s_eoTable[edgeType]];
624
0
            }
625
0
        }
626
0
        applyPixelOffsets(addr, typeIdx, 0);
627
0
    }
628
0
    std::swap(m_tmpL1[0], m_tmpL2[0]);
629
0
}
630
631
/* Process SAO unit (Chroma only) */
632
void SAO::generateChromaOffsets(SaoCtuParam* ctuParam[3], int idxY, int idxX)
633
0
{
634
0
    PicYuv* reconPic = m_frame->m_reconPic;
635
0
    intptr_t stride = reconPic->m_strideC;
636
0
    int ctuWidth  = m_param->maxCUSize;
637
0
    int ctuHeight = m_param->maxCUSize;
638
639
0
    {
640
0
        ctuWidth  >>= m_hChromaShift;
641
0
        ctuHeight >>= m_vChromaShift;
642
0
    }
643
644
0
    int addr = idxY * m_numCuInWidth + idxX;
645
0
    pixel* recCb = reconPic->getCbAddr(addr);
646
0
    pixel* recCr = reconPic->getCrAddr(addr);
647
648
0
    if (idxX == 0)
649
0
    {
650
0
        for (int i = 0; i < ctuHeight + 1; i++)
651
0
        {
652
0
            m_tmpL1[1][i] = recCb[0];
653
0
            m_tmpL1[2][i] = recCr[0];
654
0
            recCb += stride;
655
0
            recCr += stride;
656
0
        }
657
0
    }
658
659
0
    bool mergeLeftFlagCb = (ctuParam[1][addr].mergeMode == SAO_MERGE_LEFT);
660
0
    int typeIdxCb = ctuParam[1][addr].typeIdx;
661
662
0
    bool mergeLeftFlagCr = (ctuParam[2][addr].mergeMode == SAO_MERGE_LEFT);
663
0
    int typeIdxCr = ctuParam[2][addr].typeIdx;
664
665
0
    if (idxX != (m_numCuInWidth - 1))
666
0
    {
667
0
        recCb = reconPic->getCbAddr(addr);
668
0
        recCr = reconPic->getCrAddr(addr);
669
0
        for (int i = 0; i < ctuHeight + 1; i++)
670
0
        {
671
0
            m_tmpL2[1][i] = recCb[ctuWidth - 1];
672
0
            m_tmpL2[2][i] = recCr[ctuWidth - 1];
673
0
            recCb += stride;
674
0
            recCr += stride;
675
0
        }
676
0
    }
677
678
    // Process U
679
0
    if (typeIdxCb >= 0)
680
0
    {
681
0
        if (!mergeLeftFlagCb)
682
0
        {
683
0
            if (typeIdxCb == SAO_BO)
684
0
            {
685
0
                memset(m_offsetBo[1], 0, sizeof(m_offsetBo[0]));
686
687
0
                for (int i = 0; i < SAO_NUM_OFFSET; i++)
688
0
                    m_offsetBo[1][((ctuParam[1][addr].bandPos + i) & (MAX_NUM_SAO_CLASS - 1))] = (int8_t)(ctuParam[1][addr].offset[i] << SAO_BIT_INC);
689
0
            }
690
0
            else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3)
691
0
            {
692
0
                int offset[NUM_EDGETYPE];
693
0
                offset[0] = 0;
694
0
                for (int i = 0; i < SAO_NUM_OFFSET; i++)
695
0
                    offset[i + 1] = ctuParam[1][addr].offset[i] << SAO_BIT_INC;
696
697
0
                for (int edgeType = 0; edgeType < NUM_EDGETYPE; edgeType++)
698
0
                    m_offsetEo[1][edgeType] = (int8_t)offset[s_eoTable[edgeType]];
699
0
            }
700
0
        }
701
0
        applyPixelOffsets(addr, typeIdxCb, 1);
702
0
    }
703
704
    // Process V
705
0
    if (typeIdxCr >= 0)
706
0
    {
707
0
        if (!mergeLeftFlagCr)
708
0
        {
709
0
            if (typeIdxCr == SAO_BO)
710
0
            {
711
0
                memset(m_offsetBo[2], 0, sizeof(m_offsetBo[0]));
712
713
0
                for (int i = 0; i < SAO_NUM_OFFSET; i++)
714
0
                    m_offsetBo[2][((ctuParam[2][addr].bandPos + i) & (MAX_NUM_SAO_CLASS - 1))] = (int8_t)(ctuParam[2][addr].offset[i] << SAO_BIT_INC);
715
0
            }
716
0
            else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3)
717
0
            {
718
0
                int offset[NUM_EDGETYPE];
719
0
                offset[0] = 0;
720
0
                for (int i = 0; i < SAO_NUM_OFFSET; i++)
721
0
                    offset[i + 1] = ctuParam[2][addr].offset[i] << SAO_BIT_INC;
722
723
0
                for (int edgeType = 0; edgeType < NUM_EDGETYPE; edgeType++)
724
0
                    m_offsetEo[2][edgeType] = (int8_t)offset[s_eoTable[edgeType]];
725
0
            }
726
0
        }
727
0
        applyPixelOffsets(addr, typeIdxCb, 2);
728
0
    }
729
730
0
    std::swap(m_tmpL1[1], m_tmpL2[1]);
731
0
    std::swap(m_tmpL1[2], m_tmpL2[2]);
732
0
}
733
734
/* Calculate SAO statistics for current CTU without non-crossing slice */
735
void SAO::calcSaoStatsCTU(int addr, int plane)
736
0
{
737
0
    Slice* slice = m_frame->m_encData->m_slice;
738
0
    const PicYuv* reconPic = m_frame->m_reconPic;
739
0
    const CUData* cu = m_frame->m_encData->getPicCTU(addr);
740
0
    const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
741
0
    const pixel* rec0  = reconPic->getPlaneAddr(plane, addr);
742
0
    const pixel* fenc;
743
0
    const pixel* rec;
744
0
    intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride;
745
0
    uint32_t picWidth  = m_param->sourceWidth;
746
0
    uint32_t picHeight = m_param->sourceHeight;
747
0
    int ctuWidth  = m_param->maxCUSize;
748
0
    int ctuHeight = m_param->maxCUSize;
749
0
    uint32_t lpelx = cu->m_cuPelX;
750
0
    uint32_t tpely = cu->m_cuPelY;
751
0
    const uint32_t firstRowInSlice = cu->m_bFirstRowInSlice;
752
0
    const uint32_t lastRowInSlice = cu->m_bLastRowInSlice;
753
0
    const uint32_t bAboveUnavail = (!tpely) | firstRowInSlice;
754
755
0
    if (plane)
756
0
    {
757
0
        picWidth  >>= m_hChromaShift;
758
0
        picHeight >>= m_vChromaShift;
759
0
        ctuWidth  >>= m_hChromaShift;
760
0
        ctuHeight >>= m_vChromaShift;
761
0
        lpelx     >>= m_hChromaShift;
762
0
        tpely     >>= m_vChromaShift;
763
0
    }
764
0
    uint32_t rpelx = x265_min(lpelx + ctuWidth,  picWidth);
765
0
    uint32_t bpely = x265_min(tpely + ctuHeight, picHeight);
766
0
    ctuWidth  = rpelx - lpelx;
767
0
    ctuHeight = bpely - tpely;
768
769
    // NOTE: Careful! the picHeight apply for Equal operator only in below, so I may safe to hack it
770
0
    if (lastRowInSlice)
771
0
    {
772
0
        picHeight = bpely;
773
0
    }
774
775
0
    int startX;
776
0
    int startY;
777
0
    int endX;
778
0
    int endY;
779
780
0
    const int plane_offset = plane ? 2 : 0;
781
0
    int skipB = 4;
782
0
    int skipR = 5;
783
784
0
    int8_t _upBuff[2 * (MAX_CU_SIZE + 16 + 16)], *upBuff1 = _upBuff + 16, *upBufft = upBuff1 + (MAX_CU_SIZE + 16 + 16);
785
786
0
    ALIGN_VAR_32(int16_t, diff[MAX_CU_SIZE * MAX_CU_SIZE]);
787
788
    // Calculate (fenc - frec) and put into diff[]
789
0
    if ((lpelx + ctuWidth <  picWidth) & (tpely + ctuHeight < picHeight))
790
0
    {
791
        // WARNING: *) May read beyond bound on video than ctuWidth or ctuHeight is NOT multiple of cuSize
792
0
        X265_CHECK((ctuWidth == ctuHeight) || (m_chromaFormat != X265_CSP_I420), "video size check failure\n");
793
0
        if (plane)
794
0
            primitives.chroma[m_chromaFormat].cu[m_param->maxLog2CUSize - 2].sub_ps(diff, MAX_CU_SIZE, fenc0, rec0, stride, stride);
795
0
        else
796
0
           primitives.cu[m_param->maxLog2CUSize - 2].sub_ps(diff, MAX_CU_SIZE, fenc0, rec0, stride, stride);
797
0
    }
798
0
    else
799
0
    {
800
        // path for non-square area (most in edge)
801
0
        for(int y = 0; y < ctuHeight; y++)
802
0
        {
803
0
            for(int x = 0; x < ctuWidth; x++)
804
0
            {
805
0
                diff[y * MAX_CU_SIZE + x] = (fenc0[y * stride + x] - rec0[y * stride + x]);
806
0
            }
807
0
        }
808
0
    }
809
810
    // SAO_BO:
811
0
    {
812
0
        if (m_param->bSaoNonDeblocked)
813
0
        {
814
0
            skipB = 3;
815
0
            skipR = 4;
816
0
        }
817
818
0
        endX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR + plane_offset;
819
0
        endY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB + plane_offset;
820
821
0
        primitives.saoCuStatsBO(diff, rec0, stride, endX, endY, m_offsetOrg[plane][SAO_BO], m_count[plane][SAO_BO]);
822
0
    }
823
824
0
    {
825
        // SAO_EO_0: // dir: -
826
0
        {
827
0
            if (m_param->bSaoNonDeblocked)
828
0
            {
829
0
                skipB = 3;
830
0
                skipR = 5;
831
0
            }
832
833
0
            startX = !lpelx;
834
0
            endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR + plane_offset;
835
836
0
            primitives.saoCuStatsE0(diff + startX, rec0 + startX, stride, endX - startX, ctuHeight - skipB + plane_offset, m_offsetOrg[plane][SAO_EO_0], m_count[plane][SAO_EO_0]);
837
0
        }
838
839
        // SAO_EO_1: // dir: |
840
0
        {
841
0
            if (m_param->bSaoNonDeblocked)
842
0
            {
843
0
                skipB = 4;
844
0
                skipR = 4;
845
0
            }
846
847
0
            rec  = rec0;
848
849
0
            startY = bAboveUnavail;
850
0
            endX   = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR + plane_offset;
851
0
            endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB + plane_offset;
852
0
            if (startY)
853
0
            {
854
0
                rec += stride;
855
0
            }
856
857
0
            primitives.sign(upBuff1, rec, &rec[- stride], ctuWidth);
858
859
0
            primitives.saoCuStatsE1(diff + startY * MAX_CU_SIZE, rec0 + startY * stride, stride, upBuff1, endX, endY - startY, m_offsetOrg[plane][SAO_EO_1], m_count[plane][SAO_EO_1]);
860
0
        }
861
0
        if (!m_param->bLimitSAO || ((slice->m_sliceType == P_SLICE && !cu->isSkipped(0)) ||
862
0
            (slice->m_sliceType != B_SLICE)))
863
0
        {
864
            // SAO_EO_2: // dir: 135
865
0
            {
866
0
                if (m_param->bSaoNonDeblocked)
867
0
                {
868
0
                    skipB = 4;
869
0
                    skipR = 5;
870
0
                }
871
872
0
                fenc = fenc0;
873
0
                rec  = rec0;
874
875
0
                startX = !lpelx;
876
0
                endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR + plane_offset;
877
878
0
                startY = bAboveUnavail;
879
0
                endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB + plane_offset;
880
0
                if (startY)
881
0
                {
882
0
                    fenc += stride;
883
0
                    rec += stride;
884
0
                }
885
886
0
                primitives.sign(upBuff1, &rec[startX], &rec[startX - stride - 1], (endX - startX));
887
888
0
                primitives.saoCuStatsE2(diff + startX + startY * MAX_CU_SIZE, rec0  + startX + startY * stride, stride, upBuff1, upBufft, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_2], m_count[plane][SAO_EO_2]);
889
0
            }
890
            // SAO_EO_3: // dir: 45
891
0
            {
892
0
                if (m_param->bSaoNonDeblocked)
893
0
                {
894
0
                    skipB = 4;
895
0
                    skipR = 5;
896
0
                }
897
0
                fenc = fenc0;
898
0
                rec  = rec0;
899
0
                startX = !lpelx;
900
0
                endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR + plane_offset;
901
902
0
                startY = bAboveUnavail;
903
0
                endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB + plane_offset;
904
905
0
                if (startY)
906
0
                {
907
0
                    fenc += stride;
908
0
                    rec += stride;
909
0
                }
910
911
0
                primitives.sign(upBuff1, &rec[startX - 1], &rec[startX - 1 - stride + 1], (endX - startX + 1));
912
913
0
                primitives.saoCuStatsE3(diff + startX + startY * MAX_CU_SIZE, rec0  + startX + startY * stride, stride, upBuff1 + 1, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_3], m_count[plane][SAO_EO_3]);
914
0
            }
915
0
        }
916
0
    }
917
0
}
918
919
/* Collect SAO statistics for one CTU into the per-CTU "pre-deblock" tables
 * (m_offsetOrgPreDblk[addr] / m_countPreDblk[addr]).
 *
 * frame       supplies the CTU data (position and slice-row flags) and the colour space
 * idxX, idxY  CTU column / row; the CTU address is idxX + m_numCuInWidth * idxY
 *
 * For every plane and every SAO type only an L-shaped border of the CTU is
 * visited: the right-most `skipRight` columns of the upper rows plus all columns
 * of the bottom `skipBottom` rows. When the CTU touches the right (bottom)
 * picture edge the corresponding strip shrinks to what the edge-offset
 * neighbourhood allows. Samples are read from m_frame->m_reconPic and compared
 * against m_frame->m_fencPic. */
void SAO::calcSaoStatsCu_BeforeDblk(Frame* frame, int idxX, int idxY)
{
    const int addr = idxX + m_numCuInWidth * idxY;

    const CUData* cu = frame->m_encData->getPicCTU(addr);
    const PicYuv* reconPic = m_frame->m_reconPic;
    intptr_t stride = reconPic->m_stride;

    uint32_t picWidth  = m_param->sourceWidth;
    uint32_t picHeight = m_param->sourceHeight;
    int ctuWidth  = m_param->maxCUSize;
    int ctuHeight = m_param->maxCUSize;
    uint32_t lpelx = cu->m_cuPelX;
    uint32_t tpely = cu->m_cuPelY;
    const uint32_t firstRowInSlice = cu->m_bFirstRowInSlice;
    const uint32_t lastRowInSlice  = cu->m_bLastRowInSlice;

    // Non-zero when the row above must not be referenced: top of the picture
    // or first CTU row of a slice.
    const uint32_t aboveUnavail = (!tpely) | firstRowInSlice;

    // picHeight is only ever compared for equality with bpely below, so it can
    // be clamped to make the last row of a slice look like the picture bottom.
    if (lastRowInSlice)
        picHeight = x265_min(picHeight, (tpely + ctuHeight));

    // Clip the CTU to the picture.
    uint32_t rpelx = x265_min(lpelx + ctuWidth,  picWidth);
    uint32_t bpely = x265_min(tpely + ctuHeight, picHeight);
    ctuWidth  = rpelx - lpelx;
    ctuHeight = bpely - tpely;

    // Per-column sign buffers; the pointers are biased by one so that index -1
    // is valid. They stay live (and may be swapped) across sections and planes.
    int32_t signStoreA[MAX_CU_SIZE + 2];
    int32_t signStoreB[MAX_CU_SIZE + 2];
    int32_t* prevSigns = signStoreA + 1;
    int32_t* nextSigns = signStoreB + 1;

    const int boShift = X265_DEPTH - SAO_BO_BITS;

    memset(m_countPreDblk[addr], 0, sizeof(PerPlane));
    memset(m_offsetOrgPreDblk[addr], 0, sizeof(PerPlane));

    const bool hasChroma = frame->m_param->internalCsp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400;
    const int numPlanes = hasChroma ? NUM_PLANE : 1;

    for (int plane = 0; plane < numPlanes; plane++)
    {
        // Entering the first chroma plane: switch the geometry to chroma
        // resolution once; the second chroma plane reuses it.
        if (plane == 1)
        {
            stride = reconPic->m_strideC;
            picWidth  >>= m_hChromaShift;
            picHeight >>= m_vChromaShift;
            ctuWidth  >>= m_hChromaShift;
            ctuHeight >>= m_vChromaShift;
            lpelx     >>= m_hChromaShift;
            tpely     >>= m_vChromaShift;
            rpelx     >>= m_hChromaShift;
            bpely     >>= m_vChromaShift;
        }

        // The skipped border is two samples narrower for chroma planes.
        const int planeOffset = plane ? 2 : 0;

        const bool atRightEdge  = (rpelx == picWidth);
        const bool atBottomEdge = (bpely == picHeight);
        const int firstCol = !lpelx;        // 1 at the left picture edge (no left neighbour)
        const int firstRow = aboveUnavail;  // 1 when the row above is unusable
        const int lastCol  = ctuWidth - 1;  // never reference the CTU to the right
        const int lastRow  = ctuHeight - 1; // never reference the CTU below

        const pixel* const fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
        const pixel* const rec0  = reconPic->getPlaneAddr(plane, addr);

        // SAO_BO: band offset
        {
            const int skipBottom = 3 - planeOffset;
            const int skipRight  = 4 - planeOffset;

            int32_t* sums   = m_offsetOrgPreDblk[addr][plane][SAO_BO];
            int32_t* counts = m_countPreDblk[addr][plane][SAO_BO];

            const pixel* org   = fenc0;
            const pixel* recon = rec0;

            const int colStart = atRightEdge  ? ctuWidth  : ctuWidth  - skipRight;
            const int rowStart = atBottomEdge ? ctuHeight : ctuHeight - skipBottom;

            for (int row = 0; row < ctuHeight; row++)
            {
                const int colBegin = (row < rowStart) ? colStart : 0;
                for (int col = colBegin; col < ctuWidth; col++)
                {
                    const int band = recon[col] >> boShift;
                    sums[band] += (org[col] - recon[col]);
                    counts[band]++;
                }

                org   += stride;
                recon += stride;
            }
        }

        // SAO_EO_0: horizontal neighbours (dir: -)
        {
            const int skipBottom = 3 - planeOffset;
            const int skipRight  = 5 - planeOffset;

            int32_t* sums   = m_offsetOrgPreDblk[addr][plane][SAO_EO_0];
            int32_t* counts = m_countPreDblk[addr][plane][SAO_EO_0];

            const pixel* org   = fenc0;
            const pixel* recon = rec0;

            const int colStart = atRightEdge  ? ctuWidth - 1 : ctuWidth  - skipRight;
            const int rowStart = atBottomEdge ? ctuHeight    : ctuHeight - skipBottom;

            for (int row = 0; row < ctuHeight; row++)
            {
                int col = (row < rowStart) ? colStart : firstCol;
                int signLeft = signOf(recon[col] - recon[col - 1]);
                while (col < lastCol)
                {
                    const int signRight = signOf(recon[col] - recon[col + 1]);
                    const int edgeType  = signRight + signLeft + 2;
                    signLeft = -signRight; // becomes the left sign of the next column

                    sums[s_eoTable[edgeType]] += (org[col] - recon[col]);
                    counts[s_eoTable[edgeType]]++;
                    col++;
                }

                org   += stride;
                recon += stride;
            }
        }

        // SAO_EO_1: vertical neighbours (dir: |)
        {
            const int skipBottom = 4 - planeOffset;
            const int skipRight  = 4 - planeOffset;

            int32_t* sums   = m_offsetOrgPreDblk[addr][plane][SAO_EO_1];
            int32_t* counts = m_countPreDblk[addr][plane][SAO_EO_1];

            const pixel* org   = fenc0;
            const pixel* recon = rec0;

            const int colStart = atRightEdge  ? ctuWidth      : ctuWidth  - skipRight;
            const int rowStart = atBottomEdge ? ctuHeight - 1 : ctuHeight - skipBottom;

            if (firstRow)
            {
                org   += stride;
                recon += stride;
            }

            // Signs against the row above, for the right-hand strip only.
            for (int col = colStart; col < ctuWidth; col++)
                prevSigns[col] = signOf(recon[col] - recon[col - stride]);

            for (int row = firstRow; row < lastRow; row++)
            {
                // One row before the bottom strip starts the whole width is
                // scanned so that prevSigns is primed for the next row.
                const int colBegin = (row < rowStart - 1) ? colStart : 0;
                for (int col = colBegin; col < ctuWidth; col++)
                {
                    const int signDown = signOf(recon[col] - recon[col + stride]);
                    const int edgeType = signDown + prevSigns[col] + 2;
                    prevSigns[col] = -signDown;

                    // Priming pass: sign updated above, sample not counted.
                    if (col >= colStart || row >= rowStart)
                    {
                        sums[s_eoTable[edgeType]] += (org[col] - recon[col]);
                        counts[s_eoTable[edgeType]]++;
                    }
                }

                org   += stride;
                recon += stride;
            }
        }

        // SAO_EO_2: diagonal neighbours (dir: 135)
        {
            const int skipBottom = 4 - planeOffset;
            const int skipRight  = 5 - planeOffset;

            int32_t* sums   = m_offsetOrgPreDblk[addr][plane][SAO_EO_2];
            int32_t* counts = m_countPreDblk[addr][plane][SAO_EO_2];

            const pixel* org   = fenc0;
            const pixel* recon = rec0;

            const int colStart = atRightEdge  ? ctuWidth - 1  : ctuWidth  - skipRight;
            const int rowStart = atBottomEdge ? ctuHeight - 1 : ctuHeight - skipBottom;

            if (firstRow)
            {
                org   += stride;
                recon += stride;
            }

            // Signs against the upper-left neighbour.
            for (int col = colStart; col < lastCol; col++)
                prevSigns[col] = signOf(recon[col] - recon[col - stride - 1]);

            for (int row = firstRow; row < lastRow; row++)
            {
                int col = (row < rowStart - 1) ? colStart : firstCol;

                // First entry of the next row's sign buffer has no producer in
                // the loop below, so compute it directly.
                nextSigns[col] = signOf(recon[col + stride] - recon[col - 1]);
                for (; col < lastCol; col++)
                {
                    const int signDown = signOf(recon[col] - recon[col + stride + 1]);
                    const int edgeType = signDown + prevSigns[col] + 2;
                    nextSigns[col + 1] = -signDown;

                    // Priming pass: sign stored above, sample not counted.
                    if (col >= colStart || row >= rowStart)
                    {
                        sums[s_eoTable[edgeType]] += (org[col] - recon[col]);
                        counts[s_eoTable[edgeType]]++;
                    }
                }

                std::swap(prevSigns, nextSigns);

                recon += stride;
                org   += stride;
            }
        }

        // SAO_EO_3: diagonal neighbours (dir: 45)
        {
            const int skipBottom = 4 - planeOffset;
            const int skipRight  = 5 - planeOffset;

            int32_t* sums   = m_offsetOrgPreDblk[addr][plane][SAO_EO_3];
            int32_t* counts = m_countPreDblk[addr][plane][SAO_EO_3];

            const pixel* org   = fenc0;
            const pixel* recon = rec0;

            const int colStart = atRightEdge  ? ctuWidth - 1  : ctuWidth  - skipRight;
            const int rowStart = atBottomEdge ? ctuHeight - 1 : ctuHeight - skipBottom;

            if (firstRow)
            {
                org   += stride;
                recon += stride;
            }

            // Signs against the upper-right neighbour (one extra column on the left).
            for (int col = colStart - 1; col < lastCol; col++)
                prevSigns[col] = signOf(recon[col] - recon[col - stride + 1]);

            for (int row = firstRow; row < lastRow; row++)
            {
                const int colBegin = (row < rowStart - 1) ? colStart : firstCol;
                for (int col = colBegin; col < lastCol; col++)
                {
                    const int signDown = signOf(recon[col] - recon[col + stride - 1]);
                    const int edgeType = signDown + prevSigns[col] + 2;
                    prevSigns[col - 1] = -signDown;

                    // Priming pass: sign stored above, sample not counted.
                    if (col >= colStart || row >= rowStart)
                    {
                        sums[s_eoTable[edgeType]] += (org[col] - recon[col]);
                        counts[s_eoTable[edgeType]]++;
                    }
                }

                // Last entry of the next row's signs has no producer in the loop.
                prevSigns[lastCol - 1] = signOf(recon[lastCol - 1 + stride] - recon[lastCol]);

                recon += stride;
                org   += stride;
            }
        }
    }
}
1198
1199
/* reset offset statistics */
1200
void SAO::resetStats()
1201
0
{
1202
0
    memset(m_count, 0, sizeof(m_count));
1203
0
    memset(m_offset, 0, sizeof(m_offset));
1204
0
    memset(m_offsetOrg, 0, sizeof(m_offsetOrg));
1205
0
}
1206
1207
void SAO::rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus)
1208
0
{
1209
0
    if (!saoParam->bSaoFlag[0])
1210
0
        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + m_refDepth] = 1.0;
1211
0
    else
1212
0
    {
1213
0
        X265_CHECK(m_numNoSao[0] <= numctus, "m_numNoSao check failure!");
1214
0
        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + m_refDepth] = m_numNoSao[0] / ((double)numctus);
1215
0
    }
1216
1217
0
    if (!saoParam->bSaoFlag[1])
1218
0
    {
1219
0
        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + m_refDepth] = 1.0;
1220
0
    }
1221
0
    else
1222
0
        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + m_refDepth] = m_numNoSao[1] / ((double)numctus);
1223
0
}
1224
1225
/* Rate-distortion decision of the SAO parameters for one CTU.
 *
 * saoParam     per-plane, per-CTU SAO parameters; entry [plane][addr] is written
 * rowBaseAddr  non-zero when a CTU above exists that may be merged from
 * idxX         CTU column; non-zero when a CTU to the left may be merged from
 * addr         CTU address
 *
 * Steps: seed the statistics tables, evaluate new luma and chroma parameters,
 * then test merge-left / merge-up against the best cost found so far. Finally
 * the "no SAO" counters and the entropy coder contexts are updated. */
void SAO::rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr)
{
    Slice* slice = m_frame->m_encData->m_slice;
    const CUData* cu = m_frame->m_encData->getPicCTU(addr);

    // Chroma lambda is derived from the Cb QP (CTU QP plus PPS and slice offsets).
    const int qp = cu->m_qp[0];
    int chromaQp = qp + slice->m_pps->chromaQpOffset[0] + slice->m_chromaQpOffset[0];
    if (m_param->internalCsp == X265_CSP_I420)
    {
        const int mappedQp = (int)g_chromaScale[x265_clip3(QP_MIN, QP_MAX_MAX, chromaQp)];
        chromaQp = x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, mappedQp);
    }
    else
        chromaQp = x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, chromaQp);

    // Lambdas carry a x256 scale; [0] luma, [1] chroma.
    int64_t lambda[2] = { 0 };
    lambda[0] = (int64_t)floor(256.0 * x265_lambda2_tab[qp]);
    lambda[1] = (int64_t)floor(256.0 * x265_lambda2_tab[chromaQp]);

    // Index 0: left neighbour, index 1: upper neighbour.
    const bool canMerge[2]  = { (idxX != 0), (rowBaseAddr != 0) };
    const int  mergeAddr[2] = { (idxX ? addr - 1 : -1), (rowBaseAddr ? addr - m_numCuInWidth : -1) };

    const bool hasChroma = m_param->internalCsp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400;
    const int numPlanes = hasChroma ? 3 : 1;

    // The block copies below treat PerPlane as a flat int32_t array.
    X265_CHECK(sizeof(PerPlane) == (sizeof(int32_t) * (NUM_PLANE * MAX_NUM_SAO_TYPE * MAX_NUM_SAO_CLASS)), "Found Padding space in struct PerPlane");

    // Seed the statistics for Y, Cb, Cr: either with the pre-deblock border
    // statistics of this CTU or with zeros.
    // TODO: Confirm the address space is continuous
    if (m_param->bSaoNonDeblocked)
    {
        memcpy(m_count, m_countPreDblk[addr], sizeof(m_count));
        memcpy(m_offsetOrg, m_offsetOrgPreDblk[addr], sizeof(m_offsetOrg));
    }
    else
    {
        memset(m_count, 0, sizeof(m_count));
        memset(m_offsetOrg, 0, sizeof(m_offsetOrg));
    }

    for (int plane = 0; plane < numPlanes; plane++)
        saoParam->ctuParam[plane][addr].reset();

    // Baseline context for "new parameters": every available merge flag coded as 0.
    m_entropyCoder.load(m_rdContexts.cur);
    m_entropyCoder.resetBits();
    for (int dir = 0; dir < 2; dir++)
    {
        if (canMerge[dir])
            m_entropyCoder.codeSaoMerge(0);
    }
    m_entropyCoder.store(m_rdContexts.temp);

    memset(m_offset, 0, sizeof(m_offset));
    int64_t bestCost = 0;
    int64_t rateDist = 0;

    // True when every mergeable neighbour has luma SAO switched off
    // (trivially true when there is no such neighbour).
    bool neighboursOff = true;
    for (int dir = 0; dir < 2; dir++)
    {
        if (canMerge[dir] && saoParam->ctuParam[0][mergeAddr[dir]].typeIdx != -1)
            neighboursOff = false;
    }

    // With limit-SAO, B-slice CTUs that are skipped or whose neighbours have
    // SAO off are not given new parameters.
    const bool bSaoOff = (slice->m_sliceType == B_SLICE) && (cu->isSkipped(0) || neighboursOff);

    // New luma parameters.
    if (saoParam->bSaoFlag[0] && (!m_param->bLimitSAO || !bSaoOff))
    {
        calcSaoStatsCTU(addr, 0);
        saoStatsInitialOffset(addr, 0);
        saoLumaComponentParamDist(saoParam, addr, rateDist, lambda, bestCost);
    }

    // New chroma parameters; with limit-SAO only when luma SAO was chosen.
    SaoCtuParam* lumaParam = &saoParam->ctuParam[0][addr];
    if (saoParam->bSaoFlag[1] &&
        (!m_param->bLimitSAO || ((lumaParam->typeIdx != -1) && !bSaoOff)))
    {
        calcSaoStatsCTU(addr, 1);
        calcSaoStatsCTU(addr, 2);
        saoStatsInitialOffset(addr, 1);
        saoChromaComponentParamDist(saoParam, addr, rateDist, lambda, bestCost);
    }

    if (!(saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1]))
        return;

    // Cost of merging from the left (0) or upper (1) neighbour.
    for (int dir = 0; dir < 2; dir++)
    {
        if (!canMerge[dir])
            continue;

        // Distortion of re-using the neighbour's offsets on this CTU's statistics.
        int64_t mergeDist = 0;
        for (int plane = 0; plane < numPlanes; plane++)
        {
            const SaoCtuParam* src = &(saoParam->ctuParam[plane][mergeAddr[dir]]);
            const int typeIdx = src->typeIdx;
            int64_t planeDist = 0;

            if (typeIdx >= 0)
            {
                // Edge-offset classes start at table index 1, band offsets at bandPos.
                const int firstClass = (typeIdx == SAO_BO) ? src->bandPos : 1;
                for (int k = 0; k < SAO_NUM_OFFSET; k++)
                {
                    int mergeOffset = src->offset[k];
                    planeDist += estSaoDist(m_count[plane][typeIdx][k + firstClass], mergeOffset, m_offsetOrg[plane][typeIdx][k + firstClass]);
                }
            }
            mergeDist += (planeDist << 8) / lambda[plane ? 1 : 0];
        }

        // Rate of the merge flags for this direction.
        m_entropyCoder.load(m_rdContexts.cur);
        m_entropyCoder.resetBits();
        if (canMerge[0])
            m_entropyCoder.codeSaoMerge(1 - dir);
        if (canMerge[1] && (dir == 1))
            m_entropyCoder.codeSaoMerge(1);

        uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits();
        const int64_t mergeCost = mergeDist + estRate;
        if (mergeCost >= bestCost)
            continue;

        // Merge wins: copy the neighbour's parameters for each enabled component.
        const SaoMergeMode mergeMode = dir ? SAO_MERGE_UP : SAO_MERGE_LEFT;
        bestCost = mergeCost;
        m_entropyCoder.store(m_rdContexts.temp);
        for (int plane = 0; plane < numPlanes; plane++)
        {
            if (!saoParam->bSaoFlag[plane > 0])
                continue;

            SaoCtuParam* dst = &saoParam->ctuParam[plane][addr];
            SaoCtuParam* src = &(saoParam->ctuParam[plane][mergeAddr[dir]]);
            dst->mergeMode = mergeMode;
            dst->typeIdx   = src->typeIdx;
            dst->bandPos   = src->bandPos;

            for (int k = 0; k < SAO_NUM_OFFSET; k++)
                dst->offset[k] = src->offset[k];
        }
    }

    // Count CTUs left without SAO (consumed by rdoSaoUnitRowEnd).
    if (saoParam->ctuParam[0][addr].typeIdx < 0)
        m_numNoSao[0]++;
    if (hasChroma && saoParam->ctuParam[1][addr].typeIdx < 0)
        m_numNoSao[1]++;

    // Promote the winning context to the current one.
    m_entropyCoder.load(m_rdContexts.temp);
    m_entropyCoder.store(m_rdContexts.cur);
}
1375
1376
// Rounds the division of initial offsets by the number of samples in
1377
// each of the statistics table entries.
1378
void SAO::saoStatsInitialOffset(int addr, int planes)
1379
0
{
1380
0
    Slice* slice = m_frame->m_encData->m_slice;
1381
0
    const CUData* cu = m_frame->m_encData->getPicCTU(addr);
1382
1383
0
    int maxSaoType;
1384
0
    if (m_param->bLimitSAO && ((slice->m_sliceType == P_SLICE && cu->isSkipped(0)) ||
1385
0
       (slice->m_sliceType == B_SLICE)))
1386
0
    {
1387
0
        maxSaoType = MAX_NUM_SAO_TYPE - 3;
1388
0
    }
1389
0
    else
1390
0
    {
1391
0
        maxSaoType = MAX_NUM_SAO_TYPE - 1;
1392
0
    }
1393
    // EO
1394
0
    for (int plane = planes; plane <= planes * 2; plane++)
1395
0
    {
1396
0
        for (int typeIdx = 0; typeIdx < maxSaoType; typeIdx++)
1397
0
        {
1398
0
            for (int classIdx = 1; classIdx < SAO_NUM_OFFSET + 1; classIdx++)
1399
0
            {
1400
0
                int32_t&  count     = m_count[plane][typeIdx][classIdx];
1401
0
                int32_t& offsetOrg = m_offsetOrg[plane][typeIdx][classIdx];
1402
0
                int32_t& offsetOut = m_offset[plane][typeIdx][classIdx];
1403
1404
0
                if (count)
1405
0
                {
1406
0
                    offsetOut = roundIBDI(offsetOrg, count << SAO_BIT_INC);
1407
0
                    offsetOut = x265_clip3(-OFFSET_THRESH + 1, OFFSET_THRESH - 1, offsetOut);
1408
1409
0
                    if (classIdx < 3) 
1410
0
                        offsetOut = X265_MAX(offsetOut, 0);
1411
0
                    else
1412
0
                        offsetOut = X265_MIN(offsetOut, 0);
1413
0
                }
1414
0
            }
1415
0
        }
1416
0
    }
1417
    // BO
1418
0
    for (int plane = planes; plane <= planes * 2; plane++)
1419
0
    {
1420
0
        for (int classIdx = 0; classIdx < MAX_NUM_SAO_CLASS; classIdx++)
1421
0
        {
1422
0
            int32_t&  count     = m_count[plane][SAO_BO][classIdx];
1423
0
            int32_t& offsetOrg = m_offsetOrg[plane][SAO_BO][classIdx];
1424
0
            int32_t& offsetOut = m_offset[plane][SAO_BO][classIdx];
1425
1426
0
            if (count)
1427
0
            {
1428
0
                offsetOut = roundIBDI(offsetOrg, count << SAO_BIT_INC);
1429
0
                offsetOut = x265_clip3(-OFFSET_THRESH + 1, OFFSET_THRESH - 1, offsetOut);
1430
0
            }
1431
0
        }
1432
0
    }
1433
0
}
1434
1435
inline int64_t SAO::calcSaoRdoCost(int64_t distortion, uint32_t bits, int64_t lambda)
1436
0
{
1437
0
#if X265_DEPTH < 10
1438
0
        X265_CHECK(bits <= (INT64_MAX - 128) / lambda,
1439
0
                   "calcRdCost wrap detected dist: " X265_LL ", bits %u, lambda: " X265_LL "\n",
1440
0
                   distortion, bits, lambda);
1441
#else
1442
        X265_CHECK(bits <= (INT64_MAX - 128) / lambda,
1443
                   "calcRdCost wrap detected dist: " X265_LL ", bits %u, lambda: " X265_LL "\n",
1444
                   distortion, bits, lambda);
1445
#endif
1446
0
        return distortion + ((bits * lambda + 128) >> 8);
1447
0
}
1448
1449
void SAO::estIterOffset(int typeIdx, int64_t lambda, int32_t count, int32_t offsetOrg, int32_t& offset, int32_t& distClasses, int64_t& costClasses)
1450
0
{
1451
0
    int bestOffset = 0;
1452
0
    distClasses    = 0;
1453
1454
    // Assuming sending quantized value 0 results in zero offset and sending the value zero needs 1 bit.
1455
    // entropy coder can be used to measure the exact rate here.
1456
0
    int64_t bestCost = calcSaoRdoCost(0, 1, lambda);
1457
0
    while (offset != 0)
1458
0
    {
1459
        // Calculate the bits required for signalling the offset
1460
0
        uint32_t rate = (typeIdx == SAO_BO) ? (abs(offset) + 2) : (abs(offset) + 1);
1461
0
        if (abs(offset) == OFFSET_THRESH - 1)
1462
0
            rate--;
1463
1464
        // Do the dequntization before distorion calculation
1465
0
        int64_t dist = estSaoDist(count, offset << SAO_BIT_INC, offsetOrg);
1466
0
        int64_t cost  = calcSaoRdoCost(dist, rate, lambda);
1467
0
        if (cost < bestCost)
1468
0
        {
1469
0
            bestCost = cost;
1470
0
            bestOffset = offset;
1471
0
            distClasses = (int)dist;
1472
0
        }
1473
0
        offset = (offset > 0) ? (offset - 1) : (offset + 1);
1474
0
    }
1475
1476
0
    costClasses = bestCost;
1477
0
    offset = bestOffset;
1478
0
}
1479
/* Choose the best luma SAO parameters for one CTU (none, an edge-offset type,
 * or band offset) by RD cost and store them in saoParam->ctuParam[0][addr].
 *
 * saoParam  SAO parameters; only the luma entry of `addr` is written
 * addr      CTU address
 * rateDist  out: distortion of the chosen mode, scaled as (dist << 8) / lambda[0]
 * lambda    lambda[0] is the luma lambda
 * bestCost  out: written only for 4:0:0 content, where no chroma pass follows
 *
 * On return m_rdContexts.temp holds the context after coding the chosen
 * luma parameters. */
void SAO::saoLumaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& rateDist, int64_t* lambda, int64_t &bestCost)
{
    Slice* slice = m_frame->m_encData->m_slice;
    const CUData* cu = m_frame->m_encData->getPicCTU(addr);
    int64_t bestDist = 0;
    int bestTypeIdx = -1;
    SaoCtuParam* lclCtuParam = &saoParam->ctuParam[0][addr];

    int32_t distClasses[MAX_NUM_SAO_CLASS];
    int64_t costClasses[MAX_NUM_SAO_CLASS];

    // RDO SAO_NA: the cost of signalling "no SAO" is the value to beat.
    m_entropyCoder.load(m_rdContexts.temp);
    m_entropyCoder.resetBits();
    m_entropyCoder.codeSaoType(0);
    int64_t costPartBest = calcSaoRdoCost(0, m_entropyCoder.getNumberOfWrittenBits(), lambda[0]);

    // With limit-SAO, skipped CTUs of P slices and all CTUs of B slices only
    // consider the first edge-offset types.
    int maxSaoType;
    if (m_param->bLimitSAO && ((slice->m_sliceType == P_SLICE && cu->isSkipped(0)) ||
        (slice->m_sliceType == B_SLICE)))
    {
        maxSaoType = MAX_NUM_SAO_TYPE - 3;
    }
    else
    {
        maxSaoType = MAX_NUM_SAO_TYPE - 1;
    }

    // EO distortion calculation
    for (int typeIdx = 0; typeIdx < maxSaoType; typeIdx++)
    {
        int64_t estDist = 0;
        for (int classIdx = 1; classIdx < SAO_NUM_OFFSET + 1; classIdx++)
        {
            int32_t& count     = m_count[0][typeIdx][classIdx];
            int32_t& offsetOrg = m_offsetOrg[0][typeIdx][classIdx];
            int32_t& offsetOut = m_offset[0][typeIdx][classIdx];
            estIterOffset(typeIdx, lambda[0], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx]);

            // Accumulate the distortion of the refined offsets.
            estDist += distClasses[classIdx];
        }

        m_entropyCoder.load(m_rdContexts.temp);
        m_entropyCoder.resetBits();
        m_entropyCoder.codeSaoOffsetEO(m_offset[0][typeIdx] + 1, typeIdx, 0);

        int64_t cost = calcSaoRdoCost(estDist, m_entropyCoder.getNumberOfWrittenBits(), lambda[0]);

        if (cost < costPartBest)
        {
            costPartBest = cost;
            bestDist = estDist;
            bestTypeIdx = typeIdx;
        }
    }

    if (bestTypeIdx != -1)
    {
        lclCtuParam->mergeMode = SAO_MERGE_NONE;
        lclCtuParam->typeIdx = bestTypeIdx;
        lclCtuParam->bandPos = 0;
        // Edge-offset classes live at table indices 1..SAO_NUM_OFFSET.
        for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
            lclCtuParam->offset[classIdx] = m_offset[0][bestTypeIdx][classIdx + 1];
    }

    // BO RDO: refine the offset of every band first.
    int64_t estDist = 0;
    for (int classIdx = 0; classIdx < MAX_NUM_SAO_CLASS; classIdx++)
    {
        int32_t& count     = m_count[0][SAO_BO][classIdx];
        int32_t& offsetOrg = m_offsetOrg[0][SAO_BO][classIdx];
        int32_t& offsetOut = m_offset[0][SAO_BO][classIdx];

        estIterOffset(SAO_BO, lambda[0], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx]);
    }

    // Estimate the best band position: slide a window of SAO_NUM_OFFSET
    // consecutive bands over the table and keep the cheapest start index.
    int32_t bestClassBO = 0;
    int64_t currentRDCost = 0;
    for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
        currentRDCost += costClasses[classIdx];
    int64_t bestRDCostBO = currentRDCost;

    for (int i = 1; i < MAX_NUM_SAO_CLASS - SAO_NUM_OFFSET + 1; i++)
    {
        // Drop the band that left the window, add the one that entered it.
        currentRDCost -= costClasses[i - 1];
        currentRDCost += costClasses[i + SAO_NUM_OFFSET - 1];

        if (currentRDCost < bestRDCostBO)
        {
            bestRDCostBO = currentRDCost;
            bestClassBO  = i;
        }
    }

    estDist = 0;
    for (int classIdx = bestClassBO; classIdx < bestClassBO + SAO_NUM_OFFSET; classIdx++)
        estDist += distClasses[classIdx];

    m_entropyCoder.load(m_rdContexts.temp);
    m_entropyCoder.resetBits();
    m_entropyCoder.codeSaoOffsetBO(m_offset[0][SAO_BO] + bestClassBO, bestClassBO, 0);

    int64_t cost = calcSaoRdoCost(estDist, m_entropyCoder.getNumberOfWrittenBits(), lambda[0]);

    if (cost < costPartBest)
    {
        costPartBest = cost;
        bestDist = estDist;

        lclCtuParam->mergeMode = SAO_MERGE_NONE;
        lclCtuParam->typeIdx = SAO_BO;
        lclCtuParam->bandPos = bestClassBO;
        for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
            lclCtuParam->offset[classIdx] = m_offset[0][SAO_BO][classIdx + bestClassBO];
    }

    // Report the distortion in the lambda-normalised domain and advance the
    // temp context past the chosen luma parameters.
    rateDist = (bestDist << 8) / lambda[0];
    m_entropyCoder.load(m_rdContexts.temp);
    m_entropyCoder.codeSaoOffset(*lclCtuParam, 0);
    m_entropyCoder.store(m_rdContexts.temp);

    // Monochrome: no chroma pass will run, so the total cost is set here.
    if (m_param->internalCsp == X265_CSP_I400)
    {
        bestCost = rateDist + m_entropyCoder.getNumberOfWrittenBits();
    }
}
1607
/* Choose the best chroma SAO parameters for one CTU and store them in
 * saoParam->ctuParam[1][addr] (plane 1) and ctuParam[2][addr] (plane 2).
 * Both planes share the SAO type; for band offset each plane gets its own
 * band position.
 *
 * saoParam  SAO parameters; the two chroma entries of `addr` are written
 * addr      CTU address
 * rateDist  in/out: the chroma distortion, scaled as (dist << 8) / lambda[1], is added
 * lambda    lambda[1] is the chroma lambda
 * bestCost  out: rateDist plus the bit count reported by the entropy coder */
void SAO::saoChromaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& rateDist, int64_t* lambda, int64_t &bestCost)
{
    Slice* slice = m_frame->m_encData->m_slice;
    const CUData* cu = m_frame->m_encData->getPicCTU(addr);

    SaoCtuParam* chromaParam[2] = { &saoParam->ctuParam[1][addr], &saoParam->ctuParam[2][addr] };

    int64_t costClasses[MAX_NUM_SAO_CLASS];
    int32_t distClasses[MAX_NUM_SAO_CLASS];
    int32_t bandStart[2] = { 0, 0 };

    int64_t chosenDist = 0;
    int chosenEoType = -1;

    // Cost of signalling "no SAO" is the value to beat.
    m_entropyCoder.load(m_rdContexts.temp);
    m_entropyCoder.resetBits();
    m_entropyCoder.codeSaoType(0);

    uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();
    int64_t lowestCost = calcSaoRdoCost(0, bits, lambda[1]);

    // With limit-SAO, skipped CTUs of P slices and all CTUs of B slices only
    // consider the first edge-offset types.
    const bool restrictTypes = m_param->bLimitSAO &&
        ((slice->m_sliceType == P_SLICE && cu->isSkipped(0)) || (slice->m_sliceType == B_SLICE));
    const int maxSaoType = restrictTypes ? MAX_NUM_SAO_TYPE - 3 : MAX_NUM_SAO_TYPE - 1;

    // EO RDO
    for (int typeIdx = 0; typeIdx < maxSaoType; typeIdx++)
    {
        int64_t planeDist[2] = { 0, 0 };
        for (int c = 0; c < 2; c++)
        {
            const int plane = c + 1;
            for (int classIdx = 1; classIdx < SAO_NUM_OFFSET + 1; classIdx++)
            {
                estIterOffset(typeIdx, lambda[1],
                              m_count[plane][typeIdx][classIdx],
                              m_offsetOrg[plane][typeIdx][classIdx],
                              m_offset[plane][typeIdx][classIdx],
                              distClasses[classIdx], costClasses[classIdx]);

                planeDist[c] += distClasses[classIdx];
            }
        }

        m_entropyCoder.load(m_rdContexts.temp);
        m_entropyCoder.resetBits();

        for (int c = 0; c < 2; c++)
            m_entropyCoder.codeSaoOffsetEO(m_offset[c + 1][typeIdx] + 1, typeIdx, c + 1);

        uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits();
        const int64_t cost = calcSaoRdoCost((planeDist[0] + planeDist[1]), estRate, lambda[1]);

        if (cost < lowestCost)
        {
            lowestCost   = cost;
            chosenDist   = (planeDist[0] + planeDist[1]);
            chosenEoType = typeIdx;
        }
    }

    if (chosenEoType != -1)
    {
        for (int c = 0; c < 2; c++)
        {
            SaoCtuParam* dst = chromaParam[c];
            dst->mergeMode = SAO_MERGE_NONE;
            dst->typeIdx = chosenEoType;
            dst->bandPos = 0;
            // Edge-offset classes live at table indices 1..SAO_NUM_OFFSET.
            for (int k = 0; k < SAO_NUM_OFFSET; k++)
                dst->offset[k] = m_offset[c + 1][chosenEoType][k + 1];
        }
    }

    // BO RDO
    int64_t bandDist[2];

    // Per plane: refine every band's offset, then pick the cheapest window
    // of SAO_NUM_OFFSET consecutive bands.
    for (int c = 0; c < 2; c++)
    {
        const int plane = c + 1;

        for (int classIdx = 0; classIdx < MAX_NUM_SAO_CLASS; classIdx++)
        {
            estIterOffset(SAO_BO, lambda[1],
                          m_count[plane][SAO_BO][classIdx],
                          m_offsetOrg[plane][SAO_BO][classIdx],
                          m_offset[plane][SAO_BO][classIdx],
                          distClasses[classIdx], costClasses[classIdx]);
        }

        int64_t lowestWindowCost = MAX_INT64;
        for (int first = 0; first < MAX_NUM_SAO_CLASS - SAO_NUM_OFFSET + 1; first++)
        {
            int64_t windowCost = 0;
            for (int band = first; band < first + SAO_NUM_OFFSET; band++)
                windowCost += costClasses[band];

            if (windowCost < lowestWindowCost)
            {
                lowestWindowCost = windowCost;
                bandStart[c] = first;
            }
        }

        bandDist[c] = 0;
        for (int band = bandStart[c]; band < bandStart[c] + SAO_NUM_OFFSET; band++)
            bandDist[c] += distClasses[band];
    }

    m_entropyCoder.load(m_rdContexts.temp);
    m_entropyCoder.resetBits();

    for (int c = 0; c < 2; c++)
        m_entropyCoder.codeSaoOffsetBO(m_offset[c + 1][SAO_BO] + bandStart[c], bandStart[c], c + 1);

    uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits();
    const int64_t boCost = calcSaoRdoCost((bandDist[0] + bandDist[1]), estRate, lambda[1]);

    if (boCost < lowestCost)
    {
        lowestCost = boCost;
        chosenDist = (bandDist[0] + bandDist[1]);

        for (int c = 0; c < 2; c++)
        {
            SaoCtuParam* dst = chromaParam[c];
            dst->mergeMode = SAO_MERGE_NONE;
            dst->typeIdx = SAO_BO;
            dst->bandPos = bandStart[c];
            for (int k = 0; k < SAO_NUM_OFFSET; k++)
                dst->offset[k] = m_offset[c + 1][SAO_BO][k + bandStart[c]];
        }
    }

    rateDist += (chosenDist << 8) / lambda[1];
    m_entropyCoder.load(m_rdContexts.temp);

    // When chroma SAO is enabled, code the chosen parameters and keep the
    // resulting context; either way the total cost is distortion plus bits.
    if (saoParam->bSaoFlag[1])
    {
        m_entropyCoder.codeSaoOffset(*chromaParam[0], 1);
        m_entropyCoder.codeSaoOffset(*chromaParam[1], 2);
        m_entropyCoder.store(m_rdContexts.temp);
    }

    const uint32_t rate = m_entropyCoder.getNumberOfWrittenBits();
    bestCost = rateDist + rate;
}
1760
1761
// NOTE: the functions below must be placed in namespace X265_NS because they need class SAO
1762
void saoCuStatsBO_c(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
1763
0
{
1764
0
    const int boShift = X265_DEPTH - SAO_BO_BITS;
1765
1766
0
    for (int y = 0; y < endY; y++)
1767
0
    {
1768
0
        for (int x = 0; x < endX; x++)
1769
0
        {
1770
0
            int classIdx = rec[x] >> boShift;
1771
0
            stats[classIdx] += diff[x];
1772
0
            count[classIdx]++;
1773
0
        }
1774
1775
0
        diff += MAX_CU_SIZE;
1776
0
        rec += stride;
1777
0
    }
1778
0
}
1779
1780
void saoCuStatsE0_c(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
1781
0
{
1782
0
    int32_t tmp_stats[SAO::NUM_EDGETYPE];
1783
0
    int32_t tmp_count[SAO::NUM_EDGETYPE];
1784
1785
0
    X265_CHECK(endX <= MAX_CU_SIZE, "endX too big\n");
1786
1787
0
    memset(tmp_stats, 0, sizeof(tmp_stats));
1788
0
    memset(tmp_count, 0, sizeof(tmp_count));
1789
1790
0
    for (int y = 0; y < endY; y++)
1791
0
    {
1792
0
        int signLeft = signOf(rec[0] - rec[-1]);
1793
0
        for (int x = 0; x < endX; x++)
1794
0
        {
1795
0
            int signRight = signOf2(rec[x], rec[x + 1]);
1796
0
            X265_CHECK(signRight == signOf(rec[x] - rec[x + 1]), "signDown check failure\n");
1797
0
            uint32_t edgeType = signRight + signLeft + 2;
1798
0
            signLeft = -signRight;
1799
1800
0
            X265_CHECK(edgeType <= 4, "edgeType check failure\n");
1801
0
            tmp_stats[edgeType] += diff[x];
1802
0
            tmp_count[edgeType]++;
1803
0
        }
1804
1805
0
        diff += MAX_CU_SIZE;
1806
0
        rec += stride;
1807
0
    }
1808
1809
0
    for (int x = 0; x < SAO::NUM_EDGETYPE; x++)
1810
0
    {
1811
0
        stats[SAO::s_eoTable[x]] += tmp_stats[x];
1812
0
        count[SAO::s_eoTable[x]] += tmp_count[x];
1813
0
    }
1814
0
}
1815
1816
void saoCuStatsE1_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
1817
0
{
1818
0
    X265_CHECK(endX <= MAX_CU_SIZE, "endX check failure\n");
1819
0
    X265_CHECK(endY <= MAX_CU_SIZE, "endY check failure\n");
1820
1821
0
    int32_t tmp_stats[SAO::NUM_EDGETYPE];
1822
0
    int32_t tmp_count[SAO::NUM_EDGETYPE];
1823
1824
0
    memset(tmp_stats, 0, sizeof(tmp_stats));
1825
0
    memset(tmp_count, 0, sizeof(tmp_count));
1826
1827
0
    X265_CHECK(endX * endY <= (4096 - 16), "Assembly of saoE1 may overflow with this block size\n");
1828
0
    for (int y = 0; y < endY; y++)
1829
0
    {
1830
0
        for (int x = 0; x < endX; x++)
1831
0
        {
1832
0
            int signDown = signOf2(rec[x], rec[x + stride]);
1833
0
            X265_CHECK(signDown == signOf(rec[x] - rec[x + stride]), "signDown check failure\n");
1834
0
            uint32_t edgeType = signDown + upBuff1[x] + 2;
1835
0
            upBuff1[x] = (int8_t)(-signDown);
1836
1837
0
            X265_CHECK(edgeType <= 4, "edgeType check failure\n");
1838
0
            tmp_stats[edgeType] += diff[x];
1839
0
            tmp_count[edgeType]++;
1840
0
        }
1841
0
        diff += MAX_CU_SIZE;
1842
0
        rec += stride;
1843
0
    }
1844
1845
0
    for (int x = 0; x < SAO::NUM_EDGETYPE; x++)
1846
0
    {
1847
0
        stats[SAO::s_eoTable[x]] += tmp_stats[x];
1848
0
        count[SAO::s_eoTable[x]] += tmp_count[x];
1849
0
    }
1850
0
}
1851
1852
void saoCuStatsE2_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count)
1853
0
{
1854
0
    X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
1855
0
    X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n");
1856
1857
0
    int32_t tmp_stats[SAO::NUM_EDGETYPE];
1858
0
    int32_t tmp_count[SAO::NUM_EDGETYPE];
1859
1860
0
    memset(tmp_stats, 0, sizeof(tmp_stats));
1861
0
    memset(tmp_count, 0, sizeof(tmp_count));
1862
1863
0
    for (int y = 0; y < endY; y++)
1864
0
    {
1865
0
        upBufft[0] = signOf(rec[stride] - rec[-1]);
1866
0
        for (int x = 0; x < endX; x++)
1867
0
        {
1868
0
            int signDown = signOf2(rec[x], rec[x + stride + 1]);
1869
0
            X265_CHECK(signDown == signOf(rec[x] - rec[x + stride + 1]), "signDown check failure\n");
1870
0
            uint32_t edgeType = signDown + upBuff1[x] + 2;
1871
0
            upBufft[x + 1] = (int8_t)(-signDown);
1872
0
            tmp_stats[edgeType] += diff[x];
1873
0
            tmp_count[edgeType]++;
1874
0
        }
1875
1876
0
        std::swap(upBuff1, upBufft);
1877
1878
0
        rec += stride;
1879
0
        diff += MAX_CU_SIZE;
1880
0
    }
1881
1882
0
    for (int x = 0; x < SAO::NUM_EDGETYPE; x++)
1883
0
    {
1884
0
        stats[SAO::s_eoTable[x]] += tmp_stats[x];
1885
0
        count[SAO::s_eoTable[x]] += tmp_count[x];
1886
0
    }
1887
0
}
1888
1889
void saoCuStatsE3_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
1890
0
{
1891
0
    X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
1892
0
    X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n");
1893
1894
0
    int32_t tmp_stats[SAO::NUM_EDGETYPE];
1895
0
    int32_t tmp_count[SAO::NUM_EDGETYPE];
1896
1897
0
    memset(tmp_stats, 0, sizeof(tmp_stats));
1898
0
    memset(tmp_count, 0, sizeof(tmp_count));
1899
1900
0
    for (int y = 0; y < endY; y++)
1901
0
    {
1902
0
        for (int x = 0; x < endX; x++)
1903
0
        {
1904
0
            int signDown = signOf2(rec[x], rec[x + stride - 1]);
1905
0
            X265_CHECK(signDown == signOf(rec[x] - rec[x + stride - 1]), "signDown check failure\n");
1906
0
            X265_CHECK(abs(upBuff1[x]) <= 1, "upBuffer1 check failure\n");
1907
1908
0
            uint32_t edgeType = signDown + upBuff1[x] + 2;
1909
0
            upBuff1[x - 1] = (int8_t)(-signDown);
1910
0
            tmp_stats[edgeType] += diff[x];
1911
0
            tmp_count[edgeType]++;
1912
0
        }
1913
1914
0
        upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
1915
1916
0
        rec += stride;
1917
0
        diff += MAX_CU_SIZE;
1918
0
    }
1919
1920
0
    for (int x = 0; x < SAO::NUM_EDGETYPE; x++)
1921
0
    {
1922
0
        stats[SAO::s_eoTable[x]] += tmp_stats[x];
1923
0
        count[SAO::s_eoTable[x]] += tmp_count[x];
1924
0
    }
1925
0
}
1926
1927
/* Store the C reference implementations of the SAO statistics primitives
 * defined in this file into the corresponding saoCuStats* members of p. */
void setupSaoPrimitives_c(EncoderPrimitives &p)
{
    // TODO: move other sao functions to here

    // edge-offset statistics, one primitive per edge class
    p.saoCuStatsE0 = saoCuStatsE0_c;
    p.saoCuStatsE1 = saoCuStatsE1_c;
    p.saoCuStatsE2 = saoCuStatsE2_c;
    p.saoCuStatsE3 = saoCuStatsE3_c;

    // band-offset statistics
    p.saoCuStatsBO = saoCuStatsBO_c;
}
1936
}
1937