Coverage Report

Created: 2025-07-23 08:18

/src/x265/source/encoder/sao.cpp
Line
Count
Source (jump to first uncovered line)
1
/*****************************************************************************
2
 * Copyright (C) 2013-2020 MulticoreWare, Inc
3
 *
4
 * Authors: Steve Borho <steve@borho.org>
5
 *          Min Chen <chenm003@163.com>
6
 *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
7
 *
8
 * This program is free software; you can redistribute it and/or modify
9
 * it under the terms of the GNU General Public License as published by
10
 * the Free Software Foundation; either version 2 of the License, or
11
 * (at your option) any later version.
12
 *
13
 * This program is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
 * GNU General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU General Public License
19
 * along with this program; if not, write to the Free Software
20
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
21
 *
22
 * This program is also available under a commercial proprietary license.
23
 * For more information, contact us at license @ x265.com.
24
 *****************************************************************************/
25
26
#include "common.h"
27
#include "frame.h"
28
#include "framedata.h"
29
#include "picyuv.h"
30
#include "sao.h"
31
32
namespace {
33
34
// Divide num by den and round to the nearest integer; for a positive den,
// exact halves round away from zero. The negative case is handled by
// rounding the magnitude and negating the result.
inline int32_t roundIBDI(int32_t num, int32_t den)
{
    const int32_t twiceDen = den * 2;

    if (num < 0)
        return -((-num * 2 + den) / twiceDen);

    return (num * 2 + den) / twiceDen;
}
38
39
// Three-way compare of a and b: returns -1 when a < b, 1 when a > b and 0
// when both are equal.
inline int signOf2(const int a, const int b)
40
0
{
41
    // NOTE: keep the two compares below in this order; according to the
    // original author the code generated by ICL, VC and GCC depends strongly on it.
42
0
    int r = 0;
43
0
    if (a < b)
44
0
        r = -1;
45
0
    if (a > b)
46
0
        r = 1;
47
0
    return r;
48
0
}
49
50
// Evaluates (count * offset - 2 * offsetOrg) * offset, i.e.
// count * offset^2 - 2 * offset * offsetOrg.
// Presumably the estimated distortion change from adding `offset` to `count`
// samples whose accumulated (original - reconstructed) difference is
// `offsetOrg` -- confirm against the callers.
//
// The operands are widened to 64 bits before multiplying. The return type is
// already int64_t, but multiplying the int32_t operands directly would
// overflow (undefined behaviour) before the result was widened.
inline int64_t estSaoDist(int32_t count, int32_t offset, int32_t offsetOrg)
{
    const int64_t wideOffset = offset;

    return (count * wideOffset - offsetOrg * 2LL) * wideOffset;
}
54
} // end anonymous namespace
55
56
57
namespace X265_NS {
58
59
// Maps an edge-type index to a slot of the temporary offset[] array built in
// generateLumaOffsets() / generateChromaOffsets(): slot 0 holds 0 and slots
// 1..4 hold the four CTU offsets. The edge-type index is the sum of the two
// neighbour signs plus 2 (see applyPixelOffsets()), so index 2 -- both signs
// cancel or are zero -- selects the zero offset.
const uint32_t SAO::s_eoTable[NUM_EDGETYPE] =
60
{
61
    1, // edge type 0
62
    2, // edge type 1
63
    0, // edge type 2 (no offset)
64
    3, // edge type 3
65
    4  // edge type 4
66
};
67
68
// Start with every pointer member cleared and the reference depth at zero;
// the buffers themselves are set up later by create() / createFromRootNode().
SAO::SAO()
{
    m_param = NULL;
    m_refDepth = 0;

    // tables that create() allocates only when initCommon is set
    m_countPreDblk = NULL;
    m_offsetOrgPreDblk = NULL;
    m_depthSaoRate = NULL;
    m_clipTableBase = NULL;
    m_clipTable = NULL;

    // per-plane scratch lines
    for (int plane = 0; plane < 3; plane++)
    {
        m_tmpU[plane] = NULL;
        m_tmpL1[plane] = NULL;
        m_tmpL2[plane] = NULL;
    }
}
87
88
/* Allocate the working buffers of this SAO instance.
 *
 *   param      - encoder parameters; the pointer is kept in m_param
 *   initCommon - non-zero: also allocate and fill the tables that other
 *                instances pick up through createFromRootNode();
 *                zero: leave those pointers NULL
 *
 * Returns true on success and false through the `fail` label (presumably the
 * target CHECKED_MALLOC jumps to when an allocation fails). Nothing is
 * released on failure. */
bool SAO::create(x265_param* param, int initCommon)
{
    m_param = param;
    m_chromaFormat = param->internalCsp;
    m_hChromaShift = CHROMA_H_SHIFT(param->internalCsp);
    m_vChromaShift = CHROMA_V_SHIFT(param->internalCsp);

    // picture size in CTUs, rounded up
    m_numCuInWidth  = (m_param->sourceWidth + m_param->maxCUSize - 1) / m_param->maxCUSize;
    m_numCuInHeight = (m_param->sourceHeight + m_param->maxCUSize - 1) / m_param->maxCUSize;

    // every initialised function-scope local is declared before the first CHECKED_MALLOC
    const pixel maxY = (1 << X265_DEPTH) - 1;
    const pixel rangeExt = maxY >> 1;
    const int numCtu = m_numCuInWidth * m_numCuInHeight;
    const int numPlanes = (param->internalCsp != X265_CSP_I400) ? 3 : 1;

    for (int plane = 0; plane < numPlanes; plane++)
    {
        CHECKED_MALLOC(m_tmpL1[plane], pixel, m_param->maxCUSize + 1);
        CHECKED_MALLOC(m_tmpL2[plane], pixel, m_param->maxCUSize + 1);

        // SAO asm code will read 1 pixel before and after, so pad by 2.
        // sourceWidth + 2 would be enough; the extra elements avoid a
        // condition check in copySaoAboveRef().
        CHECKED_MALLOC(m_tmpU[plane], pixel, m_numCuInWidth * m_param->maxCUSize + 2 + 32);

        // step past the leading pad element; destroy() frees m_tmpU[plane] - 1
        m_tmpU[plane] += 1;
    }

    if (!initCommon)
    {
        // must initialize these common pointer outside of function
        m_countPreDblk = NULL;
        m_offsetOrgPreDblk = NULL;
        m_clipTableBase = NULL;
        m_clipTable = NULL;
    }
    else
    {
        if (m_param->bSaoNonDeblocked)
        {
            CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
            CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
        }
        CHECKED_MALLOC(m_depthSaoRate, double, 2 * SAO_DEPTHRATE_SIZE);

        // clear the first four entries of each of the two rate groups
        for (int group = 0; group < 2; group++)
            for (int depth = 0; depth < 4; depth++)
                m_depthSaoRate[group * SAO_DEPTHRATE_SIZE + depth] = 0;

        CHECKED_MALLOC(m_clipTableBase,  pixel, maxY + 2 * rangeExt);
        m_clipTable = &(m_clipTableBase[rangeExt]);

        // Share with fast clip lookup table.
        // Layout: rangeExt zeros, the identity ramp 0..maxY-1, then
        // rangeExt copies of maxY; m_clipTable points at the ramp.
        pixel* entry = m_clipTableBase;

        for (int i = 0; i < rangeExt; i++)
            *entry++ = 0;

        for (int i = 0; i < maxY; i++)
            *entry++ = (pixel)i;

        for (int i = 0; i < rangeExt; i++)
            *entry++ = maxY;
    }

    return true;

fail:
    return false;
}
160
161
/* Borrow the common tables from `root` instead of allocating private copies.
 * Each pointer is expected to still be NULL on this instance; X265_CHECK
 * reports a duplicate initialisation otherwise. */
void SAO::createFromRootNode(SAO* root)
{
    X265_CHECK(m_countPreDblk == NULL, "duplicate initialize on m_countPreDblk");
    m_countPreDblk = root->m_countPreDblk;

    X265_CHECK(m_offsetOrgPreDblk == NULL, "duplicate initialize on m_offsetOrgPreDblk");
    m_offsetOrgPreDblk = root->m_offsetOrgPreDblk;

    X265_CHECK(m_depthSaoRate == NULL, "duplicate initialize on m_depthSaoRate");
    m_depthSaoRate = root->m_depthSaoRate;

    X265_CHECK(m_clipTableBase == NULL, "duplicate initialize on m_clipTableBase");
    m_clipTableBase = root->m_clipTableBase; // Unnecessary

    X265_CHECK(m_clipTable == NULL, "duplicate initialize on m_clipTable");
    m_clipTable = root->m_clipTable;
}
175
176
void SAO::destroy(int destoryCommon)
177
0
{
178
0
    for (int i = 0; i < 3; i++)
179
0
    {
180
0
        if (m_tmpL1[i])
181
0
        {
182
0
            X265_FREE(m_tmpL1[i]);
183
0
            m_tmpL1[i] = NULL;
184
0
        }
185
186
0
        if (m_tmpL2[i])
187
0
        {
188
0
            X265_FREE(m_tmpL2[i]);
189
0
            m_tmpL2[i] = NULL;
190
0
        }
191
192
0
        if (m_tmpU[i])
193
0
        {
194
0
            X265_FREE(m_tmpU[i] - 1);
195
0
            m_tmpU[i] = NULL;
196
0
        }
197
0
    }
198
199
0
    if (destoryCommon)
200
0
    {
201
0
        if (m_param->bSaoNonDeblocked)
202
0
        {
203
0
            X265_FREE_ZERO(m_countPreDblk);
204
0
            X265_FREE_ZERO(m_offsetOrgPreDblk);
205
0
        }
206
0
        X265_FREE_ZERO(m_depthSaoRate);
207
0
        X265_FREE_ZERO(m_clipTableBase);
208
0
    }
209
0
}
210
211
/* allocate memory for SAO parameters */
212
/* Record the CTU column count in saoParam and allocate one SaoCtuParam per
 * CTU for every coded plane (three planes, or one for X265_CSP_I400).
 * The arrays are created with new[]; this function does not release them. */
void SAO::allocSaoParam(SAOParam* saoParam) const
{
    saoParam->numCuInWidth = m_numCuInWidth;

    const bool hasChroma = (m_param->internalCsp != X265_CSP_I400);
    const int planeCount = hasChroma ? 3 : 1;

    for (int plane = 0; plane != planeCount; ++plane)
        saoParam->ctuParam[plane] = new SaoCtuParam[m_numCuInHeight * m_numCuInWidth];
}
220
221
/* Per-slice setup: remember the frame, derive m_refDepth from the slice type,
 * load the entropy contexts from initState, make sure the frame owns a
 * SAOParam, and decide whether SAO is enabled for luma and chroma. */
void SAO::startSlice(Frame* frame, Entropy& initState)
{
    m_frame = frame;
    Slice* slice = m_frame->m_encData->m_slice;

    // I -> 0, P -> 1, B -> 2 (referenced) or 3 (not referenced);
    // any other slice type leaves m_refDepth unchanged
    if (slice->m_sliceType == I_SLICE)
        m_refDepth = 0;
    else if (slice->m_sliceType == P_SLICE)
        m_refDepth = 1;
    else if (slice->m_sliceType == B_SLICE)
        m_refDepth = 2 + !IS_REFERENCED(frame);

    m_entropyCoder.load(initState);
    m_rdContexts.next.load(initState);
    m_rdContexts.cur.load(initState);

    // allocate the per-frame SAO parameters on first use
    if (!frame->m_encData->m_saoParam)
    {
        SAOParam* fresh = new SAOParam;
        allocSaoParam(fresh);
        frame->m_encData->m_saoParam = fresh;
    }
    SAOParam* saoParam = frame->m_encData->m_saoParam;

    bool lumaOn = true;
    bool chromaOn = m_param->internalCsp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400;

    m_numNoSao[0] = 0; // Luma
    m_numNoSao[1] = 0; // Chroma

    // NOTE: Allow SAO automatic turn-off only when frame parallelism is disabled.
    if (m_param->frameNumThreads == 1 && m_refDepth > 0)
    {
        // rate recorded for the previous depth, one group per component class
        if (m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + m_refDepth - 1] > SAO_ENCODING_RATE)
            lumaOn = false;
        if (m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + m_refDepth - 1] > SAO_ENCODING_RATE_CHROMA)
            chromaOn = false;
    }

    saoParam->bSaoFlag[0] = lumaOn;
    saoParam->bSaoFlag[1] = chromaOn;
}
266
267
// CTU-based SAO process without slice granularity
268
// Apply the SAO offsets of mode typeIdx, in place, to one CTU of the
// reconstructed picture m_frame->m_reconPic[0].
//   addr    - CTU address, used to fetch the CTU data and its plane address
//   typeIdx - SAO_EO_0..SAO_EO_3 or SAO_BO; any other value leaves the CTU untouched
//   plane   - 0 uses the luma stride; non-zero uses the chroma stride and
//             chroma-shifted geometry
// Reads m_offsetEo[plane] / m_offsetBo[plane]; in this file those tables are
// filled by generateLumaOffsets() / generateChromaOffsets() before the call.
void SAO::applyPixelOffsets(int addr, int typeIdx, int plane)
269
0
{
270
0
    PicYuv* reconPic = m_frame->m_reconPic[0];
271
0
    pixel* rec = reconPic->getPlaneAddr(plane, addr);
272
0
    intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride;
273
0
    uint32_t picWidth  = m_param->sourceWidth;
274
0
    uint32_t picHeight = m_param->sourceHeight;
275
0
    const CUData* cu = m_frame->m_encData->getPicCTU(addr);
276
0
    int ctuWidth = m_param->maxCUSize;
277
0
    int ctuHeight = m_param->maxCUSize;
278
0
    uint32_t lpelx = cu->m_cuPelX;
279
0
    uint32_t tpely = cu->m_cuPelY;
280
0
    const uint32_t firstRowInSlice = cu->m_bFirstRowInSlice;
281
0
    const uint32_t lastRowInSlice = cu->m_bLastRowInSlice;
    // non-zero when the CTU is in the top picture row or in the first row of its slice
282
0
    const uint32_t bAboveUnavail = (!tpely) | firstRowInSlice;
283
284
    // NOTE: from here on picHeight only bounds bpely and is compared with it for
    // equality, so for the last CTU row of a slice it is clamped to the bottom of
    // this CTU: the slice's last row is then handled like the picture's last row.
285
0
    if (lastRowInSlice)
286
0
    {
287
0
        picHeight = x265_min(picHeight, (tpely + ctuHeight));
288
0
    }
289
    // chroma: convert all luma-unit geometry to chroma samples
290
0
    if (plane)
291
0
    {
292
0
        picWidth  >>= m_hChromaShift;
293
0
        picHeight >>= m_vChromaShift;
294
0
        ctuWidth  >>= m_hChromaShift;
295
0
        ctuHeight >>= m_vChromaShift;
296
0
        lpelx     >>= m_hChromaShift;
297
0
        tpely     >>= m_vChromaShift;
298
0
    }
    // clip the CTU to the picture; ctuWidth / ctuHeight become the valid column / row counts
299
0
    uint32_t rpelx = x265_min(lpelx + ctuWidth,  picWidth);
300
0
    uint32_t bpely = x265_min(tpely + ctuHeight, picHeight);
301
0
    ctuWidth  = rpelx - lpelx;
302
0
    ctuHeight = bpely - tpely;
303
    // sign buffers; both pointers start one element into their array so that
    // index -1 is addressable (used by SAO_EO_3)
304
0
    int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1, signLeft1[2];
305
0
    int8_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
306
307
0
    memset(_upBuff1 + MAX_CU_SIZE, 0, 2 * sizeof(int8_t)); /* avoid valgrind uninit warnings */
308
    // tmpL: saved column for this plane (generate*Offsets() store the unfiltered
    //       left-neighbour samples in m_tmpL1)
    // tmpU: row taken from m_tmpU[plane], starting at this CTU's first column
    //       NOTE(review): m_tmpU is filled outside the code visible here; it is
    //       used below as the unfiltered row above the CTU -- confirm
309
0
    pixel* tmpL = m_tmpL1[plane];
310
0
    pixel* tmpU = &(m_tmpU[plane][lpelx]);
311
312
0
    int8_t* offsetEo = m_offsetEo[plane];
313
314
0
    switch (typeIdx)
315
0
    {
316
0
    case SAO_EO_0: // dir: -
317
0
    {
318
0
        pixel firstPxl = 0, lastPxl = 0, row1FirstPxl = 0, row1LastPxl = 0;
        // the first column is skipped at the left picture edge and the last
        // column at the right picture edge
319
0
        int startX = !lpelx;
320
0
        int endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
        // scalar path when the width is not a multiple of 16
321
0
        if (ctuWidth & 15)
322
0
        {
323
0
            for (int y = 0; y < ctuHeight; y++, rec += stride)
324
0
            {
325
0
                int signLeft = x265_signOf(rec[startX] - tmpL[y]);
326
0
                for (int x = startX; x < endX; x++)
327
0
                {
328
0
                    int signRight = x265_signOf(rec[x] - rec[x + 1]);
                    // 0..4, index into offsetEo
329
0
                    int edgeType = signRight + signLeft + 2;
                    // rec[x] is about to change; keep its sign relation for the next sample
330
0
                    signLeft = -signRight;
331
332
0
                    rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
333
0
                }
334
0
            }
335
0
        }
336
0
        else
337
0
        {
            // two rows per primitive call; the primitive is handed the full
            // width, so border samples that must stay unfiltered are saved
            // before the call and written back afterwards
338
0
            for (int y = 0; y < ctuHeight; y += 2, rec += 2 * stride)
339
0
            {
340
0
                signLeft1[0] = x265_signOf(rec[startX] - tmpL[y]);
341
0
                signLeft1[1] = x265_signOf(rec[stride + startX] - tmpL[y + 1]);
342
343
0
                if (!lpelx)
344
0
                {
345
0
                    firstPxl = rec[0];
346
0
                    row1FirstPxl = rec[stride];
347
0
                }
348
349
0
                if (rpelx == picWidth)
350
0
                {
351
0
                    lastPxl = rec[ctuWidth - 1];
352
0
                    row1LastPxl = rec[stride + ctuWidth - 1];
353
0
                }
354
355
0
                primitives.saoCuOrgE0(rec, offsetEo, ctuWidth, signLeft1, stride);
356
357
0
                if (!lpelx)
358
0
                {
359
0
                    rec[0] = firstPxl;
360
0
                    rec[stride] = row1FirstPxl;
361
0
                }
362
363
0
                if (rpelx == picWidth)
364
0
                {
365
0
                    rec[ctuWidth - 1] = lastPxl;
366
0
                    rec[stride + ctuWidth - 1] = row1LastPxl;
367
0
                }
368
0
            }
369
0
        }
370
0
        break;
371
0
    }
372
0
    case SAO_EO_1: // dir: |
373
0
    {
        // the top row is skipped when the row above is unavailable, the bottom
        // row when this CTU ends at picHeight (picture bottom, or slice bottom
        // after the clamp above)
374
0
        int startY = bAboveUnavail;
375
0
        int endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
376
0
        if (startY)
377
0
            rec += stride;
378
379
0
        if (ctuWidth & 15)
380
0
        {
            // upBuff1[x] = sign(first processed row - tmpU row)
381
0
            for (int x = 0; x < ctuWidth; x++)
382
0
                upBuff1[x] = x265_signOf(rec[x] - tmpU[x]);
383
384
0
            for (int y = startY; y < endY; y++, rec += stride)
385
0
            {
386
0
                for (int x = 0; x < ctuWidth; x++)
387
0
                {
388
0
                    int8_t signDown = x265_signOf(rec[x] - rec[x + stride]);
389
0
                    int edgeType = signDown + upBuff1[x] + 2;
                    // becomes the "up" sign of the next row
390
0
                    upBuff1[x] = -signDown;
391
392
0
                    rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
393
0
                }
394
0
            }
395
0
        }
396
0
        else
397
0
        {
398
0
            primitives.sign(upBuff1, rec, tmpU, ctuWidth);
399
            // rows go through the two-row primitive in pairs; an odd leftover
            // row goes through the single-row primitive
400
0
            int diff = (endY - startY) % 2;
401
0
            for (int y = startY; y < endY - diff; y += 2, rec += 2 * stride)
402
0
                primitives.saoCuOrgE1_2Rows(rec, upBuff1, offsetEo, stride, ctuWidth);
403
404
0
            if (diff & 1)
405
0
                primitives.saoCuOrgE1(rec, upBuff1, offsetEo, stride, ctuWidth);
406
0
        }
407
408
0
        break;
409
0
    }
410
0
    case SAO_EO_2: // dir: 135
411
0
    {
        // neighbours are the upper-left (tmpU[x - 1] for the first row) and the
        // lower-right (rec[x + stride + 1]) samples
412
0
        int startX = !lpelx;
413
0
        int endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
414
415
0
        int startY = bAboveUnavail;
416
0
        int endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
417
418
0
        if (startY)
419
0
            rec += stride;
420
421
0
        if (!(ctuWidth & 15))
422
0
        {
            // NOTE(review): upBuff1 has not been written yet in this function
            // (only the last two bytes of _upBuff1 were zeroed), so the values
            // saved and restored here are indeterminate. They sit at indices
            // outside [startX, endX), which the loops below do not appear to
            // read -- confirm against the saoCuOrgE2 primitives.
423
0
            int8_t firstSign, lastSign;
424
425
0
            if (!lpelx)
426
0
                firstSign = upBuff1[0];
427
428
0
            if (rpelx == picWidth)
429
0
                lastSign = upBuff1[ctuWidth - 1];
430
431
0
            primitives.sign(upBuff1, rec, &tmpU[- 1], ctuWidth);
432
433
0
            if (!lpelx)
434
0
                upBuff1[0] = firstSign;
435
436
0
            if (rpelx == picWidth)
437
0
                upBuff1[ctuWidth - 1] = lastSign;
438
0
        }
439
0
        else
440
0
        {
441
0
            for (int x = startX; x < endX; x++)
442
0
                upBuff1[x] = x265_signOf(rec[x] - tmpU[x - 1]);
443
0
        }
444
445
0
        if (ctuWidth & 15)
446
0
        {
447
0
             for (int y = startY; y < endY; y++, rec += stride)
448
0
             {
                 // sign for the first column of the next row comes from the saved left column
449
0
                 upBufft[startX] = x265_signOf(rec[stride + startX] - tmpL[y]);
450
0
                 for (int x = startX; x < endX; x++)
451
0
                 {
452
0
                     int8_t signDown = x265_signOf(rec[x] - rec[x + stride + 1]);
453
0
                     int edgeType = signDown + upBuff1[x] + 2;
                     // "up" sign of the sample one row down, one column right
454
0
                     upBufft[x + 1] = -signDown;
455
0
                     rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
456
0
                 }
457
                 // the signs gathered for the next row become the current ones
458
0
                 std::swap(upBuff1, upBufft);
459
0
             }
460
0
        }
461
0
        else
462
0
        {
463
0
            for (int y = startY; y < endY; y++, rec += stride)
464
0
            {
                // computed before the primitive modifies the current row, stored after it
465
0
                int8_t iSignDown2 = x265_signOf(rec[stride + startX] - tmpL[y]);
466
                // primitive variant selected by endX > 16
467
0
                primitives.saoCuOrgE2[endX > 16](rec + startX, upBufft + startX, upBuff1 + startX, offsetEo, endX - startX, stride);
468
469
0
                upBufft[startX] = iSignDown2;
470
471
0
                std::swap(upBuff1, upBufft);
472
0
            }
473
0
        }
474
0
        break;
475
0
    }
476
0
    case SAO_EO_3: // dir: 45
477
0
    {
        // neighbours are the upper-right (tmpU[x + 1] for the first row) and the
        // lower-left (rec[x + stride - 1], or tmpL[y + 1] in the first column) samples
478
0
        int startX = !lpelx;
479
0
        int endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
480
481
0
        int startY = bAboveUnavail;
482
0
        int endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
483
484
0
        if (startY)
485
0
            rec += stride;
486
487
0
        if (ctuWidth & 15)
488
0
        {
            // starts at startX - 1, which is index -1 (the guard element) when startX is 0
489
0
            for (int x = startX - 1; x < endX; x++)
490
0
                upBuff1[x] = x265_signOf(rec[x] - tmpU[x + 1]);
491
492
0
            for (int y = startY; y < endY; y++, rec += stride)
493
0
            {
                // first column: the lower-left neighbour comes from the saved left column
494
0
                int x = startX;
495
0
                int8_t signDown = x265_signOf(rec[x] - tmpL[y + 1]);
496
0
                int edgeType = signDown + upBuff1[x] + 2;
497
0
                upBuff1[x - 1] = -signDown;
498
0
                rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
499
500
0
                for (x = startX + 1; x < endX; x++)
501
0
                {
502
0
                    signDown = x265_signOf(rec[x] - rec[x + stride - 1]);
503
0
                    edgeType = signDown + upBuff1[x] + 2;
                    // "up" sign of the sample one row down, one column left
504
0
                    upBuff1[x - 1] = -signDown;
505
0
                    rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
506
0
                }
507
                // last column of the next row compares against rec[endX] of this row
508
0
                upBuff1[endX - 1] = x265_signOf(rec[endX - 1 + stride] - rec[endX]);
509
0
            }
510
0
        }
511
0
        else
512
0
        {
            // NOTE(review): as in SAO_EO_2, lastSign is read from upBuff1 before
            // anything has been stored there in this function.
513
0
            int8_t firstSign, lastSign;
514
515
0
            if (lpelx)
516
0
                firstSign = x265_signOf(rec[-1] - tmpU[0]);
517
0
            if (rpelx == picWidth)
518
0
                lastSign = upBuff1[ctuWidth - 1];
519
520
0
            primitives.sign(upBuff1, rec, &tmpU[1], ctuWidth);
521
522
0
            if (lpelx)
523
0
                upBuff1[-1] = firstSign;
524
0
            if (rpelx == picWidth)
525
0
                upBuff1[ctuWidth - 1] = lastSign;
526
527
0
            for (int y = startY; y < endY; y++, rec += stride)
528
0
            {
                // the first column is handled in scalar code, the primitive gets
                // the row with the range [startX, endX)
529
0
                int x = startX;
530
0
                int8_t signDown = x265_signOf(rec[x] - tmpL[y + 1]);
531
0
                int edgeType = signDown + upBuff1[x] + 2;
532
0
                upBuff1[x - 1] = -signDown;
533
0
                rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
534
535
0
                primitives.saoCuOrgE3[endX > 16](rec, upBuff1, offsetEo, stride - 1, startX, endX);
536
537
0
                upBuff1[endX - 1] = x265_signOf(rec[endX - 1 + stride] - rec[endX]);
538
0
            }
539
0
        }
540
541
0
        break;
542
0
    }
543
0
    case SAO_BO:
544
0
    {
545
0
        const int8_t* offsetBo = m_offsetBo[plane];
546
547
0
        if (ctuWidth & 15)
548
0
        {
            // NOTE: this macro stays defined for the rest of the file
            // (calcSaoStatsCu_BeforeDblk() uses it as well)
549
0
            #define SAO_BO_BITS 5
550
0
            const int boShift = X265_DEPTH - SAO_BO_BITS;
551
            // band index = the top SAO_BO_BITS bits of the sample; no border
            // columns or rows are skipped for band offset
552
0
            for (int y = 0; y < ctuHeight; y++, rec += stride)
553
0
                for (int x = 0; x < ctuWidth; x++)
554
0
                    rec[x] = x265_clip(rec[x] + offsetBo[rec[x] >> boShift]);
555
0
        }
556
0
        else
557
0
            primitives.saoCuOrgB0(rec, offsetBo, ctuWidth, ctuHeight, stride);
558
559
0
        break;
560
0
    }
561
0
    default: break;
562
0
    }
563
0
}
564
565
/* Process SAO unit */
566
/* Process one luma CTU at (idxX, idxY):
 *  - save the border columns that neighbouring CTUs need in unfiltered form
 *    (m_tmpL1[0]: left column, only for the first CTU of a row;
 *     m_tmpL2[0]: right-most column, unless this is the last CTU of the row),
 *  - rebuild the luma offset table unless the CTU merges with its left neighbour,
 *  - apply the offsets, then swap m_tmpL1[0] / m_tmpL2[0] for the next CTU. */
void SAO::generateLumaOffsets(SaoCtuParam* ctuParam, int idxY, int idxX)
{
    PicYuv* reconPic = m_frame->m_reconPic[0];
    const intptr_t stride = reconPic->m_stride;
    const int ctuWidth = m_param->maxCUSize;
    const int ctuHeight = m_param->maxCUSize;
    const int addr = idxY * m_numCuInWidth + idxX;

    // one row more than the CTU height is saved
    const int savedRows = ctuHeight + 1;

    pixel* rec = reconPic->getLumaAddr(addr);

    if (idxX == 0)
    {
        for (int row = 0; row < savedRows; row++, rec += stride)
            m_tmpL1[0][row] = rec[0];
    }

    if (idxX != (m_numCuInWidth - 1))
    {
        rec = reconPic->getLumaAddr(addr);
        for (int row = 0; row < savedRows; row++, rec += stride)
            m_tmpL2[0][row] = rec[ctuWidth - 1];
    }

    const int typeIdx = ctuParam[addr].typeIdx;

    if (typeIdx >= 0)
    {
        // a left-merged CTU reuses the table built for its left neighbour
        if (ctuParam[addr].mergeMode != SAO_MERGE_LEFT)
        {
            if (typeIdx == SAO_BO)
            {
                memset(m_offsetBo[0], 0, sizeof(m_offsetBo[0]));

                // the offsets cover consecutive bands starting at bandPos, wrapping around
                for (int i = 0; i < SAO_NUM_OFFSET; i++)
                {
                    const int band = (ctuParam[addr].bandPos + i) & (MAX_NUM_SAO_CLASS - 1);
                    m_offsetBo[0][band] = (int8_t)(ctuParam[addr].offset[i] << SAO_BIT_INC);
                }
            }
            else // SAO_EO_0 .. SAO_EO_3
            {
                // slot 0 is "no offset", slots 1.. hold the scaled CTU offsets
                int offset[NUM_EDGETYPE];
                offset[0] = 0;
                for (int i = 0; i < SAO_NUM_OFFSET; i++)
                    offset[i + 1] = ctuParam[addr].offset[i] << SAO_BIT_INC;

                for (int edgeType = 0; edgeType < NUM_EDGETYPE; edgeType++)
                    m_offsetEo[0][edgeType] = (int8_t)offset[s_eoTable[edgeType]];
            }
        }

        applyPixelOffsets(addr, typeIdx, 0);
    }

    std::swap(m_tmpL1[0], m_tmpL2[0]);
}
624
625
/* Process SAO unit (Chroma only) */
626
/* Process the two chroma planes of the CTU at (idxX, idxY):
 *  - save the border columns that neighbouring CTUs need in unfiltered form
 *    (m_tmpL1[1..2]: left column, only for the first CTU of a row;
 *     m_tmpL2[1..2]: right-most column, unless this is the last CTU of the row),
 *  - for Cb (plane 1) and Cr (plane 2): rebuild the plane's offset table unless
 *    the CTU merges with its left neighbour, then apply the offsets,
 *  - swap m_tmpL1 / m_tmpL2 of both planes for the next CTU.
 *
 * ctuParam[1] / ctuParam[2] hold the per-CTU parameters of Cb / Cr.
 *
 * Each plane is filtered with its own typeIdx. The Cr pass used to hand the
 * Cb typeIdx to applyPixelOffsets() although its guard and its offset table
 * were derived from the Cr parameters. */
void SAO::generateChromaOffsets(SaoCtuParam* ctuParam[3], int idxY, int idxX)
{
    PicYuv* reconPic = m_frame->m_reconPic[0];
    intptr_t stride = reconPic->m_strideC;
    int ctuWidth  = m_param->maxCUSize;
    int ctuHeight = m_param->maxCUSize;

    ctuWidth  >>= m_hChromaShift;
    ctuHeight >>= m_vChromaShift;

    int addr = idxY * m_numCuInWidth + idxX;
    pixel* recCb = reconPic->getCbAddr(addr);
    pixel* recCr = reconPic->getCrAddr(addr);

    if (idxX == 0)
    {
        // one row more than the CTU height is saved
        for (int i = 0; i < ctuHeight + 1; i++)
        {
            m_tmpL1[1][i] = recCb[0];
            m_tmpL1[2][i] = recCr[0];
            recCb += stride;
            recCr += stride;
        }
    }

    if (idxX != (m_numCuInWidth - 1))
    {
        recCb = reconPic->getCbAddr(addr);
        recCr = reconPic->getCrAddr(addr);
        for (int i = 0; i < ctuHeight + 1; i++)
        {
            m_tmpL2[1][i] = recCb[ctuWidth - 1];
            m_tmpL2[2][i] = recCr[ctuWidth - 1];
            recCb += stride;
            recCr += stride;
        }
    }

    // plane 1 = Cb (U), plane 2 = Cr (V)
    for (int plane = 1; plane <= 2; plane++)
    {
        const int typeIdx = ctuParam[plane][addr].typeIdx;
        if (typeIdx < 0)
            continue;

        // a left-merged CTU reuses the table built for its left neighbour
        if (ctuParam[plane][addr].mergeMode != SAO_MERGE_LEFT)
        {
            if (typeIdx == SAO_BO)
            {
                memset(m_offsetBo[plane], 0, sizeof(m_offsetBo[0]));

                // the offsets cover consecutive bands starting at bandPos, wrapping around
                for (int i = 0; i < SAO_NUM_OFFSET; i++)
                    m_offsetBo[plane][((ctuParam[plane][addr].bandPos + i) & (MAX_NUM_SAO_CLASS - 1))] = (int8_t)(ctuParam[plane][addr].offset[i] << SAO_BIT_INC);
            }
            else // SAO_EO_0 .. SAO_EO_3
            {
                // slot 0 is "no offset", slots 1.. hold the scaled CTU offsets
                int offset[NUM_EDGETYPE];
                offset[0] = 0;
                for (int i = 0; i < SAO_NUM_OFFSET; i++)
                    offset[i + 1] = ctuParam[plane][addr].offset[i] << SAO_BIT_INC;

                for (int edgeType = 0; edgeType < NUM_EDGETYPE; edgeType++)
                    m_offsetEo[plane][edgeType] = (int8_t)offset[s_eoTable[edgeType]];
            }
        }

        applyPixelOffsets(addr, typeIdx, plane);
    }

    std::swap(m_tmpL1[1], m_tmpL2[1]);
    std::swap(m_tmpL1[2], m_tmpL2[2]);
}
727
728
/* Calculate SAO statistics for current CTU without non-crossing slice */
729
// Gather the SAO statistics of one CTU for one plane.
//   addr  - CTU address
//   plane - 0 = luma; non-zero uses the chroma stride and chroma-shifted geometry
// The difference (source - reconstruction) of the CTU is written to a local
// buffer and handed, together with the reconstruction, to the saoCuStats*
// primitives, which receive m_offsetOrg[plane][type] and m_count[plane][type]
// for SAO_BO and SAO_EO_0..SAO_EO_3. Those arrays are not cleared here.
// NOTE(review): the primitives are presumably accumulating per-category sums
// and sample counts -- confirm against their definitions.
void SAO::calcSaoStatsCTU(int addr, int plane)
730
0
{
731
0
    Slice* slice = m_frame->m_encData->m_slice;
732
0
    const PicYuv* reconPic = m_frame->m_reconPic[0];
733
0
    const CUData* cu = m_frame->m_encData->getPicCTU(addr);
734
0
    const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
735
0
    const pixel* rec0  = reconPic->getPlaneAddr(plane, addr);
736
0
    const pixel* rec;
737
0
    intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride;
738
0
    uint32_t picWidth  = m_param->sourceWidth;
739
0
    uint32_t picHeight = m_param->sourceHeight;
740
0
    int ctuWidth  = m_param->maxCUSize;
741
0
    int ctuHeight = m_param->maxCUSize;
742
0
    uint32_t lpelx = cu->m_cuPelX;
743
0
    uint32_t tpely = cu->m_cuPelY;
744
0
    const uint32_t firstRowInSlice = cu->m_bFirstRowInSlice;
745
0
    const uint32_t lastRowInSlice = cu->m_bLastRowInSlice;
    // non-zero when the CTU is in the top picture row or in the first row of its slice
746
0
    const uint32_t bAboveUnavail = (!tpely) | firstRowInSlice;
747
    // chroma: convert all luma-unit geometry to chroma samples
748
0
    if (plane)
749
0
    {
750
0
        picWidth  >>= m_hChromaShift;
751
0
        picHeight >>= m_vChromaShift;
752
0
        ctuWidth  >>= m_hChromaShift;
753
0
        ctuHeight >>= m_vChromaShift;
754
0
        lpelx     >>= m_hChromaShift;
755
0
        tpely     >>= m_vChromaShift;
756
0
    }
    // clip the CTU to the picture; ctuWidth / ctuHeight become the valid column / row counts
757
0
    uint32_t rpelx = x265_min(lpelx + ctuWidth,  picWidth);
758
0
    uint32_t bpely = x265_min(tpely + ctuHeight, picHeight);
759
0
    ctuWidth  = rpelx - lpelx;
760
0
    ctuHeight = bpely - tpely;
761
762
    // NOTE: below, picHeight is only compared with bpely, so for the last CTU
    // row of a slice it is set to bpely: that row is then handled like the
    // last row of the picture.
763
0
    if (lastRowInSlice)
764
0
    {
765
0
        picHeight = bpely;
766
0
    }
767
768
0
    int startX;
769
0
    int startY;
770
0
    int endX;
771
0
    int endY;
772
    // skipB / skipR: number of rows at the bottom / columns at the right left
    // out of the statistics when the CTU does not end at the picture edge;
    // chroma leaves out plane_offset (2) fewer.
    // NOTE(review): presumably these samples are not final yet when the
    // statistics are gathered (deblocking of the neighbouring CTUs) -- confirm.
773
0
    const int plane_offset = plane ? 2 : 0;
774
0
    int skipB = 4;
775
0
    int skipR = 5;
776
    // one array holding two sign buffers of MAX_CU_SIZE + 32 elements each;
    // upBuff1 starts 16 elements into the array
777
0
    int8_t _upBuff[2 * (MAX_CU_SIZE + 16 + 16)], *upBuff1 = _upBuff + 16, *upBufft = upBuff1 + (MAX_CU_SIZE + 16 + 16);
778
    // difference buffer, row stride MAX_CU_SIZE
779
0
    ALIGN_VAR_32(int16_t, diff[MAX_CU_SIZE * MAX_CU_SIZE]);
780
781
    // Calculate (fenc - frec) and put into diff[]
    // The block primitive is used only when the CTU ends strictly inside the
    // right and bottom bounds (picHeight may be the slice bottom here).
782
0
    if ((lpelx + ctuWidth <  picWidth) & (tpely + ctuHeight < picHeight))
783
0
    {
784
        // WARNING: the block primitive works on the full CTU size and may read
        // beyond ctuWidth / ctuHeight when they are not a multiple of the CU size
785
0
        X265_CHECK((ctuWidth == ctuHeight) || (m_chromaFormat != X265_CSP_I420), "video size check failure\n");
786
0
        if (plane)
787
0
            primitives.chroma[m_chromaFormat].cu[m_param->maxLog2CUSize - 2].sub_ps(diff, MAX_CU_SIZE, fenc0, rec0, stride, stride);
788
0
        else
789
0
           primitives.cu[m_param->maxLog2CUSize - 2].sub_ps(diff, MAX_CU_SIZE, fenc0, rec0, stride, stride);
790
0
    }
791
0
    else
792
0
    {
793
        // path for non-square area (most in edge)
794
0
        for(int y = 0; y < ctuHeight; y++)
795
0
        {
796
0
            for(int x = 0; x < ctuWidth; x++)
797
0
            {
798
0
                diff[y * MAX_CU_SIZE + x] = (fenc0[y * stride + x] - rec0[y * stride + x]);
799
0
            }
800
0
        }
801
0
    }
802
803
    // SAO_BO:
804
0
    {
        // with bSaoNonDeblocked each mode below uses its own skipB / skipR pair;
        // otherwise the initial 4 / 5 stay in effect for all modes
805
0
        if (m_param->bSaoNonDeblocked)
806
0
        {
807
0
            skipB = 3;
808
0
            skipR = 4;
809
0
        }
810
        // band offset has no neighbour dependency: the full width / height is
        // used at the picture edges
811
0
        endX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR + plane_offset;
812
0
        endY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB + plane_offset;
813
814
0
        primitives.saoCuStatsBO(diff, rec0, stride, endX, endY, m_offsetOrg[plane][SAO_BO], m_count[plane][SAO_BO]);
815
0
    }
816
817
0
    {
818
        // SAO_EO_0: // dir: -
819
0
        {
820
0
            if (m_param->bSaoNonDeblocked)
821
0
            {
822
0
                skipB = 3;
823
0
                skipR = 5;
824
0
            }
825
            // the first column is skipped at the left picture edge and the
            // last column at the right picture edge
826
0
            startX = !lpelx;
827
0
            endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR + plane_offset;
828
            // NOTE(review): the row count passed here is always
            // ctuHeight - skipB + plane_offset, also for a CTU at the bottom edge
829
0
            primitives.saoCuStatsE0(diff + startX, rec0 + startX, stride, endX - startX, ctuHeight - skipB + plane_offset, m_offsetOrg[plane][SAO_EO_0], m_count[plane][SAO_EO_0]);
830
0
        }
831
832
        // SAO_EO_1: // dir: |
833
0
        {
834
0
            if (m_param->bSaoNonDeblocked)
835
0
            {
836
0
                skipB = 4;
837
0
                skipR = 4;
838
0
            }
839
840
0
            rec  = rec0;
841
            // the top row is skipped when the row above is unavailable, the
            // bottom row when the CTU ends at picHeight
842
0
            startY = bAboveUnavail;
843
0
            endX   = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR + plane_offset;
844
0
            endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB + plane_offset;
845
0
            if (startY)
846
0
            {
847
0
                rec += stride;
848
0
            }
849
            // signs of (first processed row - the row directly above it)
850
0
            primitives.sign(upBuff1, rec, &rec[- stride], ctuWidth);
851
852
0
            primitives.saoCuStatsE1(diff + startY * MAX_CU_SIZE, rec0 + startY * stride, stride, upBuff1, endX, endY - startY, m_offsetOrg[plane][SAO_EO_1], m_count[plane][SAO_EO_1]);
853
0
        }
        // The two diagonal modes are gathered only when this condition holds.
        // NOTE(review): the P_SLICE term is subsumed by (m_sliceType != B_SLICE),
        // so the condition reduces to !bLimitSAO || m_sliceType != B_SLICE and
        // cu->isSkipped(0) never changes the outcome -- confirm the intent.
854
0
        if (!m_param->bLimitSAO || ((slice->m_sliceType == P_SLICE && !cu->isSkipped(0)) ||
855
0
            (slice->m_sliceType != B_SLICE)))
856
0
        {
857
            // SAO_EO_2: // dir: 135
858
0
            {
859
0
                if (m_param->bSaoNonDeblocked)
860
0
                {
861
0
                    skipB = 4;
862
0
                    skipR = 5;
863
0
                }
864
865
0
                rec  = rec0;
866
867
0
                startX = !lpelx;
868
0
                endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR + plane_offset;
869
870
0
                startY = bAboveUnavail;
871
0
                endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB + plane_offset;
872
0
                if (startY)
873
0
                {
874
0
                    rec += stride;
875
0
                }
876
                // signs against the upper-left neighbours of the first processed row
877
0
                primitives.sign(upBuff1, &rec[startX], &rec[startX - stride - 1], (endX - startX));
878
879
0
                primitives.saoCuStatsE2(diff + startX + startY * MAX_CU_SIZE, rec0  + startX + startY * stride, stride, upBuff1, upBufft, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_2], m_count[plane][SAO_EO_2]);
880
0
            }
881
            // SAO_EO_3: // dir: 45
882
0
            {
883
0
                if (m_param->bSaoNonDeblocked)
884
0
                {
885
0
                    skipB = 4;
886
0
                    skipR = 5;
887
0
                }
888
0
                rec  = rec0;
889
0
                startX = !lpelx;
890
0
                endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR + plane_offset;
891
892
0
                startY = bAboveUnavail;
893
0
                endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB + plane_offset;
894
895
0
                if (startY)
896
0
                {
897
0
                    rec += stride;
898
0
                }
899
                // signs against the upper-right neighbours, starting one column
                // left of startX; upBuff1 + 1 passed below is the entry for startX
900
0
                primitives.sign(upBuff1, &rec[startX - 1], &rec[startX - 1 - stride + 1], (endX - startX + 1));
901
902
0
                primitives.saoCuStatsE3(diff + startX + startY * MAX_CU_SIZE, rec0  + startX + startY * stride, stride, upBuff1 + 1, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_3], m_count[plane][SAO_EO_3]);
903
0
            }
904
0
        }
905
0
    }
906
0
}
907
908
/* Gather SAO statistics for the part of one CTU that is measured on the
 * reconstructed picture (m_frame->m_reconPic[0]) before deblocking.
 *
 * For every plane and every SAO type (band offset plus the four edge-offset
 * directions) the function accumulates, per class, the sum of
 * (source - recon) into m_offsetOrgPreDblk[addr] and the number of samples
 * into m_countPreDblk[addr].  Only the samples in the last skipR columns and
 * the last skipB rows of the CTU are counted (rows below startY are scanned
 * in full, rows above it only from startX onwards); at the right/bottom
 * picture border that region shrinks to nothing for BO or is adjusted by one
 * sample for the EO directions.
 *
 * frame      - frame whose CTU data (position, slice flags) is consulted
 * idxX, idxY - CTU column / row; the CTU address is idxX + m_numCuInWidth * idxY
 */
void SAO::calcSaoStatsCu_BeforeDblk(Frame* frame, int idxX, int idxY)
{
    const int addr = idxX + m_numCuInWidth * idxY;

    const CUData* cu = frame->m_encData->getPicCTU(addr);
    const PicYuv* reconPic = m_frame->m_reconPic[0];
    intptr_t stride = reconPic->m_stride;
    uint32_t picWidth  = m_param->sourceWidth;
    uint32_t picHeight = m_param->sourceHeight;
    int ctuWidth  = m_param->maxCUSize;
    int ctuHeight = m_param->maxCUSize;
    uint32_t lpelx = cu->m_cuPelX;
    uint32_t tpely = cu->m_cuPelY;
    const uint32_t firstRowInSlice = cu->m_bFirstRowInSlice;
    const uint32_t lastRowInSlice = cu->m_bLastRowInSlice;
    // 1 when the row above this CTU must not be referenced (top of the picture
    // or first CTU row of a slice); used as the first row of the vertical and
    // diagonal edge classifiers.
    const uint32_t bAboveUnavail = (!tpely) | firstRowInSlice;

    // NOTE: picHeight is only ever compared for equality with bpely below, so
    // it is safe to clamp it to the bottom of this CTU on the last row of a slice.
    if (lastRowInSlice)
    {
        picHeight = x265_min(picHeight, (tpely + ctuHeight));
    }

    uint32_t rpelx = x265_min(lpelx + ctuWidth,  picWidth);
    uint32_t bpely = x265_min(tpely + ctuHeight, picHeight);
    ctuWidth  = rpelx - lpelx;
    ctuHeight = bpely - tpely;

    // Sign buffers with one guard entry on each side.  They are zero-filled
    // because the edge-offset loops below read entries left of startX on the
    // first row that is scanned from column 0, before anything was stored there
    // (the value read is discarded, but it must not be indeterminate).
    int32_t _upBuff1[MAX_CU_SIZE + 2] = { 0 }, *upBuff1 = _upBuff1 + 1;
    int32_t _upBufft[MAX_CU_SIZE + 2] = { 0 }, *upBufft = _upBufft + 1;

    const int boShift = X265_DEPTH - SAO_BO_BITS;

    memset(m_countPreDblk[addr], 0, sizeof(PerPlane));
    memset(m_offsetOrgPreDblk[addr], 0, sizeof(PerPlane));

    // Luma only when either the internal or the source picture format is 4:0:0.
    const int numPlanes = (frame->m_param->internalCsp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) ? NUM_PLANE : 1;

    int plane_offset = 0; // 0 for luma, 2 for both chroma planes
    for (int plane = 0; plane < numPlanes; plane++)
    {
        if (plane == 1)
        {
            // Switch all geometry to chroma resolution once; the second chroma
            // plane reuses the already scaled values.
            stride = reconPic->m_strideC;
            picWidth  >>= m_hChromaShift;
            picHeight >>= m_vChromaShift;
            ctuWidth  >>= m_hChromaShift;
            ctuHeight >>= m_vChromaShift;
            lpelx     >>= m_hChromaShift;
            tpely     >>= m_vChromaShift;
            rpelx     >>= m_hChromaShift;
            bpely     >>= m_vChromaShift;
        }

        const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
        const pixel* rec0 = reconPic->getPlaneAddr(plane, addr);

        // SAO_BO:
        {
            const int skipB = 3 - plane_offset;
            const int skipR = 4 - plane_offset;

            int32_t* stats = m_offsetOrgPreDblk[addr][plane][SAO_BO];
            int32_t* count = m_countPreDblk[addr][plane][SAO_BO];

            const pixel* fenc = fenc0;
            const pixel* rec  = rec0;

            const int startX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR;
            const int startY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB;

            for (int y = 0; y < ctuHeight; y++)
            {
                for (int x = (y < startY ? startX : 0); x < ctuWidth; x++)
                {
                    const int classIdx = rec[x] >> boShift;
                    stats[classIdx] += (fenc[x] - rec[x]);
                    count[classIdx]++;
                }

                fenc += stride;
                rec += stride;
            }
        }

        // SAO_EO_0: // dir: -
        {
            const int skipB = 3 - plane_offset;
            const int skipR = 5 - plane_offset;

            int32_t* stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_0];
            int32_t* count = m_countPreDblk[addr][plane][SAO_EO_0];

            const pixel* fenc = fenc0;
            const pixel* rec  = rec0;

            const int startX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
            const int startY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB;
            const int firstX = !lpelx;
            const int endX   = ctuWidth - 1;  // not refer right CTU

            for (int y = 0; y < ctuHeight; y++)
            {
                int x = (y < startY ? startX : firstX);
                int signLeft = x265_signOf(rec[x] - rec[x - 1]);
                for (; x < endX; x++)
                {
                    const int signRight = x265_signOf(rec[x] - rec[x + 1]);
                    const int edgeType = signRight + signLeft + 2;
                    signLeft = -signRight;

                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
                    count[s_eoTable[edgeType]]++;
                }

                fenc += stride;
                rec += stride;
            }
        }

        // SAO_EO_1: // dir: |
        {
            const int skipB = 4 - plane_offset;
            const int skipR = 4 - plane_offset;

            int32_t* stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_1];
            int32_t* count = m_countPreDblk[addr][plane][SAO_EO_1];

            const pixel* fenc = fenc0;
            const pixel* rec  = rec0;

            const int startX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR;
            const int startY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
            const int firstY = bAboveUnavail;
            const int endY   = ctuHeight - 1; // not refer below CTU
            if (firstY)
            {
                fenc += stride;
                rec += stride;
            }

            // Sign against the row above for the right-hand strip.
            for (int x = startX; x < ctuWidth; x++)
                upBuff1[x] = x265_signOf(rec[x] - rec[x - stride]);

            for (int y = firstY; y < endY; y++)
            {
                // One row before startY the scan already starts at column 0 so
                // that upBuff1 holds valid signs when counting begins there.
                for (int x = (y < startY - 1 ? startX : 0); x < ctuWidth; x++)
                {
                    const int signDown = x265_signOf(rec[x] - rec[x + stride]);
                    const int edgeType = signDown + upBuff1[x] + 2;
                    upBuff1[x] = -signDown;

                    if (x < startX && y < startY)
                        continue;

                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
                    count[s_eoTable[edgeType]]++;
                }

                fenc += stride;
                rec += stride;
            }
        }

        // SAO_EO_2: // dir: 135
        {
            const int skipB = 4 - plane_offset;
            const int skipR = 5 - plane_offset;

            int32_t* stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_2];
            int32_t* count = m_countPreDblk[addr][plane][SAO_EO_2];

            const pixel* fenc = fenc0;
            const pixel* rec  = rec0;

            const int startX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
            const int startY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
            const int firstX = !lpelx;
            const int firstY = bAboveUnavail;
            const int endX   = ctuWidth - 1;  // not refer right CTU
            const int endY   = ctuHeight - 1; // not refer below CTU
            if (firstY)
            {
                fenc += stride;
                rec += stride;
            }

            for (int x = startX; x < endX; x++)
                upBuff1[x] = x265_signOf(rec[x] - rec[x - stride - 1]);

            for (int y = firstY; y < endY; y++)
            {
                int x = (y < startY - 1 ? startX : firstX);
                upBufft[x] = x265_signOf(rec[x + stride] - rec[x - 1]);
                for (; x < endX; x++)
                {
                    const int signDown = x265_signOf(rec[x] - rec[x + stride + 1]);
                    const int edgeType = signDown + upBuff1[x] + 2;
                    upBufft[x + 1] = -signDown;

                    if (x < startX && y < startY)
                        continue;

                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
                    count[s_eoTable[edgeType]]++;
                }

                // The signs stored for the next row become the current ones.
                std::swap(upBuff1, upBufft);

                rec += stride;
                fenc += stride;
            }
        }

        // SAO_EO_3: // dir: 45
        {
            const int skipB = 4 - plane_offset;
            const int skipR = 5 - plane_offset;

            int32_t* stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_3];
            int32_t* count = m_countPreDblk[addr][plane][SAO_EO_3];

            const pixel* fenc = fenc0;
            const pixel* rec  = rec0;

            const int startX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
            const int startY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
            const int firstX = !lpelx;
            const int firstY = bAboveUnavail;
            const int endX   = ctuWidth - 1;  // not refer right CTU
            const int endY   = ctuHeight - 1; // not refer below CTU
            if (firstY)
            {
                fenc += stride;
                rec += stride;
            }

            for (int x = startX - 1; x < endX; x++)
                upBuff1[x] = x265_signOf(rec[x] - rec[x - stride + 1]);

            for (int y = firstY; y < endY; y++)
            {
                for (int x = (y < startY - 1 ? startX : firstX); x < endX; x++)
                {
                    const int signDown = x265_signOf(rec[x] - rec[x + stride - 1]);
                    const int edgeType = signDown + upBuff1[x] + 2;
                    upBuff1[x - 1] = -signDown;

                    if (x < startX && y < startY)
                        continue;

                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
                    count[s_eoTable[edgeType]]++;
                }

                upBuff1[endX - 1] = x265_signOf(rec[endX - 1 + stride] - rec[endX]);

                rec += stride;
                fenc += stride;
            }
        }
        plane_offset = 2;
    }
}
1187
1188
/* reset offset statistics */
1189
void SAO::resetStats()
1190
0
{
1191
0
    memset(m_count, 0, sizeof(m_count));
1192
0
    memset(m_offset, 0, sizeof(m_offset));
1193
0
    memset(m_offsetOrg, 0, sizeof(m_offsetOrg));
1194
0
}
1195
1196
void SAO::rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus)
1197
0
{
1198
0
    if (!saoParam->bSaoFlag[0])
1199
0
        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + m_refDepth] = 1.0;
1200
0
    else
1201
0
    {
1202
0
        X265_CHECK(m_numNoSao[0] <= numctus, "m_numNoSao check failure!");
1203
0
        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + m_refDepth] = m_numNoSao[0] / ((double)numctus);
1204
0
    }
1205
1206
0
    if (!saoParam->bSaoFlag[1])
1207
0
    {
1208
0
        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + m_refDepth] = 1.0;
1209
0
    }
1210
0
    else
1211
0
        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + m_refDepth] = m_numNoSao[1] / ((double)numctus);
1212
0
}
1213
1214
/* Rate-distortion decision of the SAO parameters of one CTU.
 *
 * saoParam    - per-plane, per-CTU SAO parameters; entry [plane][addr] is
 *               reset here and then filled with the chosen mode
 * rowBaseAddr - address of the first CTU of the current row; zero means there
 *               is no row above to merge with
 * idxX        - CTU column; zero means there is no left neighbour to merge with
 * addr        - address of the CTU being decided
 *
 * The function first evaluates new luma / chroma parameters (through
 * saoLumaComponentParamDist / saoChromaComponentParamDist), then compares
 * their cost against merging with the left and the upper CTU, and finally
 * commits the entropy coder state of the winner to m_rdContexts.cur. */
void SAO::rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr)
{
    Slice* slice = m_frame->m_encData->m_slice;
    const CUData* cu = m_frame->m_encData->getPicCTU(addr);
    int qp = cu->m_qp[0];
    int64_t lambda[2] = { 0 };

    int qpCb = qp + slice->m_pps->chromaQpOffset[0] + slice->m_chromaQpOffset[0];
    if (m_param->internalCsp == X265_CSP_I420)
        qpCb = x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int)g_chromaScale[x265_clip3(QP_MIN, QP_MAX_MAX, qpCb)]);
    else
        qpCb = x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, qpCb);
    lambda[0] = (int64_t)floor(256.0 * x265_lambda2_tab[qp]);
    lambda[1] = (int64_t)floor(256.0 * x265_lambda2_tab[qpCb]); // Use Cb QP for SAO chroma

    const bool allowMerge[2] = {(idxX != 0), (rowBaseAddr != 0)}; // left, up

    const int addrMerge[2] = {(idxX ? addr - 1 : -1), (rowBaseAddr ? addr - m_numCuInWidth : -1)};// left, up

    bool chroma = m_param->internalCsp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400;
    int planes = chroma ? 3 : 1;

    // reset stats Y, Cb, Cr
    X265_CHECK(sizeof(PerPlane) == (sizeof(int32_t) * (NUM_PLANE * MAX_NUM_SAO_TYPE * MAX_NUM_SAO_CLASS)), "Found Padding space in struct PerPlane");

    // TODO: Confirm the address space is continuous
    if (m_param->bSaoNonDeblocked)
    {
        // Start from the statistics gathered before deblocking.
        memcpy(m_count, m_countPreDblk[addr], sizeof(m_count));
        memcpy(m_offsetOrg, m_offsetOrgPreDblk[addr], sizeof(m_offsetOrg));
    }
    else
    {
        memset(m_count, 0, sizeof(m_count));
        memset(m_offsetOrg, 0, sizeof(m_offsetOrg));
    }

    for (int i = 0; i < planes; i++)
        saoParam->ctuParam[i][addr].reset();

    // SAO distortion calculation: start from the state after signalling
    // "no merge" for every available merge candidate.
    m_entropyCoder.load(m_rdContexts.cur);
    m_entropyCoder.resetBits();
    if (allowMerge[0])
        m_entropyCoder.codeSaoMerge(0);
    if (allowMerge[1])
        m_entropyCoder.codeSaoMerge(0);
    m_entropyCoder.store(m_rdContexts.temp);
    memset(m_offset, 0, sizeof(m_offset));
    int64_t bestCost = 0;
    int64_t rateDist = 0;

    // True when every available merge candidate (left / up) has luma SAO
    // switched off (typeIdx == -1); also true when there is no candidate.
    bool neighboursSaoOff = true;
    for (int mergeIdx = 0; mergeIdx < 2; ++mergeIdx)
    {
        if (!allowMerge[mergeIdx])
            continue;

        const SaoCtuParam* mergeSrcParam = &(saoParam->ctuParam[0][addrMerge[mergeIdx]]);
        neighboursSaoOff = neighboursSaoOff && (mergeSrcParam->typeIdx == -1);
    }
    // Don't apply sao if ctu is skipped or adjacent ctus are sao off
    bool bSaoOff = (slice->m_sliceType == B_SLICE) && (cu->isSkipped(0) || neighboursSaoOff);

    // Estimate distortion and cost of new SAO params
    if (saoParam->bSaoFlag[0] && (!m_param->bLimitSAO || !bSaoOff))
    {
        calcSaoStatsCTU(addr, 0);
        saoStatsInitialOffset(addr, 0);
        saoLumaComponentParamDist(saoParam, addr, rateDist, lambda, bestCost);
    }

    SaoCtuParam* lclCtuParam = &saoParam->ctuParam[0][addr];
    if (saoParam->bSaoFlag[1] &&
        (!m_param->bLimitSAO || ((lclCtuParam->typeIdx != -1) && !bSaoOff)))
    {
        calcSaoStatsCTU(addr, 1);
        calcSaoStatsCTU(addr, 2);
        saoStatsInitialOffset(addr, 1);
        saoChromaComponentParamDist(saoParam, addr, rateDist, lambda, bestCost);
    }

    if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1])
    {
        // Cost of merge left or Up
        for (int mergeIdx = 0; mergeIdx < 2; ++mergeIdx)
        {
            if (!allowMerge[mergeIdx])
                continue;

            int64_t mergeDist = 0;
            for (int plane = 0; plane < planes; plane++)
            {
                int64_t estDist = 0;
                const SaoCtuParam* mergeSrcParam = &(saoParam->ctuParam[plane][addrMerge[mergeIdx]]);
                const int typeIdx = mergeSrcParam->typeIdx;
                if (typeIdx >= 0)
                {
                    // EO classes start at index 1; BO classes start at the band position.
                    const int bandPos = (typeIdx == SAO_BO) ? mergeSrcParam->bandPos : 1;
                    for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
                    {
                        const int mergeOffset = mergeSrcParam->offset[classIdx];
                        estDist += estSaoDist(m_count[plane][typeIdx][classIdx + bandPos], mergeOffset, m_offsetOrg[plane][typeIdx][classIdx + bandPos]);
                    }
                }
                // estDist may be negative; multiply instead of left-shifting,
                // because shifting a negative signed value is undefined before C++20.
                mergeDist += (estDist * 256) / lambda[!!plane];
            }

            m_entropyCoder.load(m_rdContexts.cur);
            m_entropyCoder.resetBits();
            if (allowMerge[0])
                m_entropyCoder.codeSaoMerge(1 - mergeIdx);
            if (allowMerge[1] && (mergeIdx == 1))
                m_entropyCoder.codeSaoMerge(1);

            uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits();
            int64_t mergeCost = mergeDist + estRate;
            if (mergeCost < bestCost)
            {
                SaoMergeMode mergeMode = mergeIdx ? SAO_MERGE_UP : SAO_MERGE_LEFT;
                bestCost = mergeCost;
                m_entropyCoder.store(m_rdContexts.temp);
                for (int plane = 0; plane < planes; plane++)
                {
                    if (!saoParam->bSaoFlag[plane > 0])
                        continue;

                    // Copy the neighbour's parameters and mark the CTU as merged.
                    SaoCtuParam* dstCtuParam   = &saoParam->ctuParam[plane][addr];
                    const SaoCtuParam* mergeSrcParam = &(saoParam->ctuParam[plane][addrMerge[mergeIdx]]);
                    dstCtuParam->mergeMode = mergeMode;
                    dstCtuParam->typeIdx   = mergeSrcParam->typeIdx;
                    dstCtuParam->bandPos   = mergeSrcParam->bandPos;

                    for (int i = 0; i < SAO_NUM_OFFSET; i++)
                        dstCtuParam->offset[i] = mergeSrcParam->offset[i];
                }
            }
        }

        // Count CTUs that ended up without SAO (read back by rdoSaoUnitRowEnd).
        if (saoParam->ctuParam[0][addr].typeIdx < 0)
            m_numNoSao[0]++;
        if (chroma && saoParam->ctuParam[1][addr].typeIdx < 0)
            m_numNoSao[1]++;
        m_entropyCoder.load(m_rdContexts.temp);
        m_entropyCoder.store(m_rdContexts.cur);
    }
}
1364
1365
// Rounds the division of initial offsets by the number of samples in
1366
// each of the statistics table entries.
1367
void SAO::saoStatsInitialOffset(int addr, int planes)
1368
0
{
1369
0
    Slice* slice = m_frame->m_encData->m_slice;
1370
0
    const CUData* cu = m_frame->m_encData->getPicCTU(addr);
1371
1372
0
    int maxSaoType;
1373
0
    if (m_param->bLimitSAO && ((slice->m_sliceType == P_SLICE && cu->isSkipped(0)) ||
1374
0
       (slice->m_sliceType == B_SLICE)))
1375
0
    {
1376
0
        maxSaoType = MAX_NUM_SAO_TYPE - 3;
1377
0
    }
1378
0
    else
1379
0
    {
1380
0
        maxSaoType = MAX_NUM_SAO_TYPE - 1;
1381
0
    }
1382
    // EO
1383
0
    for (int plane = planes; plane <= planes * 2; plane++)
1384
0
    {
1385
0
        for (int typeIdx = 0; typeIdx < maxSaoType; typeIdx++)
1386
0
        {
1387
0
            for (int classIdx = 1; classIdx < SAO_NUM_OFFSET + 1; classIdx++)
1388
0
            {
1389
0
                int32_t&  count     = m_count[plane][typeIdx][classIdx];
1390
0
                int32_t& offsetOrg = m_offsetOrg[plane][typeIdx][classIdx];
1391
0
                int32_t& offsetOut = m_offset[plane][typeIdx][classIdx];
1392
1393
0
                if (count)
1394
0
                {
1395
0
                    offsetOut = roundIBDI(offsetOrg, count << SAO_BIT_INC);
1396
0
                    offsetOut = x265_clip3(-OFFSET_THRESH + 1, OFFSET_THRESH - 1, offsetOut);
1397
1398
0
                    if (classIdx < 3) 
1399
0
                        offsetOut = X265_MAX(offsetOut, 0);
1400
0
                    else
1401
0
                        offsetOut = X265_MIN(offsetOut, 0);
1402
0
                }
1403
0
            }
1404
0
        }
1405
0
    }
1406
    // BO
1407
0
    for (int plane = planes; plane <= planes * 2; plane++)
1408
0
    {
1409
0
        for (int classIdx = 0; classIdx < MAX_NUM_SAO_CLASS; classIdx++)
1410
0
        {
1411
0
            int32_t&  count     = m_count[plane][SAO_BO][classIdx];
1412
0
            int32_t& offsetOrg = m_offsetOrg[plane][SAO_BO][classIdx];
1413
0
            int32_t& offsetOut = m_offset[plane][SAO_BO][classIdx];
1414
1415
0
            if (count)
1416
0
            {
1417
0
                offsetOut = roundIBDI(offsetOrg, count << SAO_BIT_INC);
1418
0
                offsetOut = x265_clip3(-OFFSET_THRESH + 1, OFFSET_THRESH - 1, offsetOut);
1419
0
            }
1420
0
        }
1421
0
    }
1422
0
}
1423
1424
inline int64_t SAO::calcSaoRdoCost(int64_t distortion, uint32_t bits, int64_t lambda)
1425
0
{
1426
0
#if X265_DEPTH < 10
1427
0
        X265_CHECK(bits <= (INT64_MAX - 128) / lambda,
1428
0
                   "calcRdCost wrap detected dist: " X265_LL ", bits %u, lambda: " X265_LL "\n",
1429
0
                   distortion, bits, lambda);
1430
#else
1431
        X265_CHECK(bits <= (INT64_MAX - 128) / lambda,
1432
                   "calcRdCost wrap detected dist: " X265_LL ", bits %u, lambda: " X265_LL "\n",
1433
                   distortion, bits, lambda);
1434
#endif
1435
0
        return distortion + ((bits * lambda + 128) >> 8);
1436
0
}
1437
1438
void SAO::estIterOffset(int typeIdx, int64_t lambda, int32_t count, int32_t offsetOrg, int32_t& offset, int32_t& distClasses, int64_t& costClasses)
1439
0
{
1440
0
    int bestOffset = 0;
1441
0
    distClasses    = 0;
1442
1443
    // Assuming sending quantized value 0 results in zero offset and sending the value zero needs 1 bit.
1444
    // entropy coder can be used to measure the exact rate here.
1445
0
    int64_t bestCost = calcSaoRdoCost(0, 1, lambda);
1446
0
    while (offset != 0)
1447
0
    {
1448
        // Calculate the bits required for signalling the offset
1449
0
        uint32_t rate = (typeIdx == SAO_BO) ? (abs(offset) + 2) : (abs(offset) + 1);
1450
0
        if (abs(offset) == OFFSET_THRESH - 1)
1451
0
            rate--;
1452
1453
        // Do the dequntization before distorion calculation
1454
0
        int64_t dist = estSaoDist(count, offset << SAO_BIT_INC, offsetOrg);
1455
0
        int64_t cost  = calcSaoRdoCost(dist, rate, lambda);
1456
0
        if (cost < bestCost)
1457
0
        {
1458
0
            bestCost = cost;
1459
0
            bestOffset = offset;
1460
0
            distClasses = (int)dist;
1461
0
        }
1462
0
        offset = (offset > 0) ? (offset - 1) : (offset + 1);
1463
0
    }
1464
1465
0
    costClasses = bestCost;
1466
0
    offset = bestOffset;
1467
0
}
1468
/* Choose the luma SAO mode of one CTU (off, one of the edge-offset types, or
 * band offset) by RD cost and store it in saoParam->ctuParam[0][addr].
 *
 * rateDist - out: distortion of the chosen mode, scaled by 256 / lambda[0]
 * lambda   - lambda[0] is the luma rate multiplier
 * bestCost - out: rateDist plus the bits written, only updated when the
 *            internal colour space is 4:0:0 (otherwise left untouched here)
 *
 * On return m_rdContexts.temp holds the entropy coder state after coding the
 * chosen luma parameters. */
void SAO::saoLumaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& rateDist, int64_t* lambda, int64_t &bestCost)
{
    Slice* slice = m_frame->m_encData->m_slice;
    const CUData* cu = m_frame->m_encData->getPicCTU(addr);
    int64_t bestDist = 0;
    int bestTypeIdx = -1;
    SaoCtuParam* lclCtuParam = &saoParam->ctuParam[0][addr];

    int32_t distClasses[MAX_NUM_SAO_CLASS];
    int64_t costClasses[MAX_NUM_SAO_CLASS];

    // RDO SAO_NA
    m_entropyCoder.load(m_rdContexts.temp);
    m_entropyCoder.resetBits();
    m_entropyCoder.codeSaoType(0);
    int64_t costPartBest = calcSaoRdoCost(0, m_entropyCoder.getNumberOfWrittenBits(), lambda[0]);

    // With bLimitSAO, B slices and skipped CTUs of P slices evaluate fewer
    // edge-offset types.
    const bool limitTypes = m_param->bLimitSAO &&
                            ((slice->m_sliceType == P_SLICE && cu->isSkipped(0)) ||
                             (slice->m_sliceType == B_SLICE));
    const int maxSaoType = limitTypes ? MAX_NUM_SAO_TYPE - 3 : MAX_NUM_SAO_TYPE - 1;

    // EO distortion calculation
    for (int typeIdx = 0; typeIdx < maxSaoType; typeIdx++)
    {
        int64_t estDist = 0;
        for (int classIdx = 1; classIdx < SAO_NUM_OFFSET + 1; classIdx++)
        {
            estIterOffset(typeIdx, lambda[0],
                          m_count[0][typeIdx][classIdx],
                          m_offsetOrg[0][typeIdx][classIdx],
                          m_offset[0][typeIdx][classIdx],
                          distClasses[classIdx], costClasses[classIdx]);

            // Calculate distortion
            estDist += distClasses[classIdx];
        }

        m_entropyCoder.load(m_rdContexts.temp);
        m_entropyCoder.resetBits();
        m_entropyCoder.codeSaoOffsetEO(m_offset[0][typeIdx] + 1, typeIdx, 0);

        const int64_t cost = calcSaoRdoCost(estDist, m_entropyCoder.getNumberOfWrittenBits(), lambda[0]);

        if (cost < costPartBest)
        {
            costPartBest = cost;
            bestDist = estDist;
            bestTypeIdx = typeIdx;
        }
    }

    if (bestTypeIdx != -1)
    {
        lclCtuParam->mergeMode = SAO_MERGE_NONE;
        lclCtuParam->typeIdx = bestTypeIdx;
        lclCtuParam->bandPos = 0;
        for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
            lclCtuParam->offset[classIdx] = m_offset[0][bestTypeIdx][classIdx + 1];
    }

    // BO RDO
    for (int classIdx = 0; classIdx < MAX_NUM_SAO_CLASS; classIdx++)
    {
        estIterOffset(SAO_BO, lambda[0],
                      m_count[0][SAO_BO][classIdx],
                      m_offsetOrg[0][SAO_BO][classIdx],
                      m_offset[0][SAO_BO][classIdx],
                      distClasses[classIdx], costClasses[classIdx]);
    }

    // Estimate Best Position: slide a window of four consecutive bands over
    // the per-band costs and keep the cheapest start position.
    int32_t bestClassBO  = 0;
    int64_t currentRDCost = costClasses[0];
    currentRDCost += costClasses[1];
    currentRDCost += costClasses[2];
    currentRDCost += costClasses[3];
    int64_t bestRDCostBO = currentRDCost;

    for (int i = 1; i < MAX_NUM_SAO_CLASS - SAO_NUM_OFFSET + 1; i++)
    {
        currentRDCost -= costClasses[i - 1];
        currentRDCost += costClasses[i + 3];

        if (currentRDCost < bestRDCostBO)
        {
            bestRDCostBO = currentRDCost;
            bestClassBO  = i;
        }
    }

    int64_t estDist = 0;
    for (int classIdx = bestClassBO; classIdx < bestClassBO + SAO_NUM_OFFSET; classIdx++)
        estDist += distClasses[classIdx];

    m_entropyCoder.load(m_rdContexts.temp);
    m_entropyCoder.resetBits();
    m_entropyCoder.codeSaoOffsetBO(m_offset[0][SAO_BO] + bestClassBO, bestClassBO, 0);

    const int64_t cost = calcSaoRdoCost(estDist, m_entropyCoder.getNumberOfWrittenBits(), lambda[0]);

    if (cost < costPartBest)
    {
        costPartBest = cost;
        bestDist = estDist;

        lclCtuParam->mergeMode = SAO_MERGE_NONE;
        lclCtuParam->typeIdx = SAO_BO;
        lclCtuParam->bandPos = bestClassBO;
        for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
            lclCtuParam->offset[classIdx] = m_offset[0][SAO_BO][classIdx + bestClassBO];
    }

    // bestDist may be negative; multiply instead of left-shifting, because
    // shifting a negative signed value is undefined before C++20.
    rateDist = (bestDist * 256) / lambda[0];
    m_entropyCoder.load(m_rdContexts.temp);
    m_entropyCoder.codeSaoOffset(*lclCtuParam, 0);
    m_entropyCoder.store(m_rdContexts.temp);

    if (m_param->internalCsp == X265_CSP_I400)
    {
        bestCost = rateDist + m_entropyCoder.getNumberOfWrittenBits();
    }
}
1596
/* Choose the chroma SAO mode of one CTU by RD cost.  Both chroma planes share
 * the SAO type (off, an edge-offset type, or band offset) but have their own
 * offsets and band position; results go to saoParam->ctuParam[1][addr] and
 * saoParam->ctuParam[2][addr].
 *
 * rateDist - in/out: the chroma distortion, scaled by 256 / lambda[1], is added
 * lambda   - lambda[1] is the chroma rate multiplier
 * bestCost - out: rateDist plus the bits reported by the entropy coder
 */
void SAO::saoChromaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& rateDist, int64_t* lambda, int64_t &bestCost)
{
    Slice* slice = m_frame->m_encData->m_slice;
    const CUData* cu = m_frame->m_encData->getPicCTU(addr);
    int64_t bestDist = 0;
    int bestTypeIdx = -1;
    SaoCtuParam* lclCtuParam[2] = { &saoParam->ctuParam[1][addr], &saoParam->ctuParam[2][addr] };

    int64_t costClasses[MAX_NUM_SAO_CLASS];
    int32_t distClasses[MAX_NUM_SAO_CLASS];
    int32_t bestClassBO[2] = { 0, 0 };

    // RDO SAO_NA
    m_entropyCoder.load(m_rdContexts.temp);
    m_entropyCoder.resetBits();
    m_entropyCoder.codeSaoType(0);

    const uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();
    int64_t costPartBest = calcSaoRdoCost(0, bits, lambda[1]);

    // With bLimitSAO, B slices and skipped CTUs of P slices evaluate fewer
    // edge-offset types.
    const bool limitTypes = m_param->bLimitSAO &&
                            ((slice->m_sliceType == P_SLICE && cu->isSkipped(0)) ||
                             (slice->m_sliceType == B_SLICE));
    const int maxSaoType = limitTypes ? MAX_NUM_SAO_TYPE - 3 : MAX_NUM_SAO_TYPE - 1;

    // EO RDO
    for (int typeIdx = 0; typeIdx < maxSaoType; typeIdx++)
    {
        int64_t estDist[2] = {0, 0};
        for (int compIdx = 1; compIdx < 3; compIdx++)
        {
            for (int classIdx = 1; classIdx < SAO_NUM_OFFSET + 1; classIdx++)
            {
                estIterOffset(typeIdx, lambda[1],
                              m_count[compIdx][typeIdx][classIdx],
                              m_offsetOrg[compIdx][typeIdx][classIdx],
                              m_offset[compIdx][typeIdx][classIdx],
                              distClasses[classIdx], costClasses[classIdx]);

                estDist[compIdx - 1] += distClasses[classIdx];
            }
        }

        m_entropyCoder.load(m_rdContexts.temp);
        m_entropyCoder.resetBits();

        for (int compIdx = 0; compIdx < 2; compIdx++)
            m_entropyCoder.codeSaoOffsetEO(m_offset[compIdx + 1][typeIdx] + 1, typeIdx, compIdx + 1);

        const uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits();
        const int64_t cost = calcSaoRdoCost((estDist[0] + estDist[1]), estRate, lambda[1]);

        if (cost < costPartBest)
        {
            costPartBest = cost;
            bestDist = (estDist[0] + estDist[1]);
            bestTypeIdx = typeIdx;
        }
    }

    if (bestTypeIdx != -1)
    {
        for (int compIdx = 0; compIdx < 2; compIdx++)
        {
            lclCtuParam[compIdx]->mergeMode = SAO_MERGE_NONE;
            lclCtuParam[compIdx]->typeIdx = bestTypeIdx;
            lclCtuParam[compIdx]->bandPos = 0;
            for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
                lclCtuParam[compIdx]->offset[classIdx] = m_offset[compIdx + 1][bestTypeIdx][classIdx + 1];
        }
    }

    // BO RDO
    int64_t estDist[2];

    // Estimate Best Position, independently for each chroma plane
    for (int compIdx = 1; compIdx < 3; compIdx++)
    {
        int64_t bestRDCostBO = MAX_INT64;

        for (int classIdx = 0; classIdx < MAX_NUM_SAO_CLASS; classIdx++)
        {
            estIterOffset(SAO_BO, lambda[1],
                          m_count[compIdx][SAO_BO][classIdx],
                          m_offsetOrg[compIdx][SAO_BO][classIdx],
                          m_offset[compIdx][SAO_BO][classIdx],
                          distClasses[classIdx], costClasses[classIdx]);
        }

        // Cheapest run of SAO_NUM_OFFSET consecutive bands.
        for (int i = 0; i < MAX_NUM_SAO_CLASS - SAO_NUM_OFFSET + 1; i++)
        {
            int64_t currentRDCost = 0;
            for (int j = i; j < i + SAO_NUM_OFFSET; j++)
                currentRDCost += costClasses[j];

            if (currentRDCost < bestRDCostBO)
            {
                bestRDCostBO = currentRDCost;
                bestClassBO[compIdx - 1]  = i;
            }
        }

        estDist[compIdx - 1] = 0;
        for (int classIdx = bestClassBO[compIdx - 1]; classIdx < bestClassBO[compIdx - 1] + SAO_NUM_OFFSET; classIdx++)
            estDist[compIdx - 1] += distClasses[classIdx];
    }

    m_entropyCoder.load(m_rdContexts.temp);
    m_entropyCoder.resetBits();

    for (int compIdx = 0; compIdx < 2; compIdx++)
        m_entropyCoder.codeSaoOffsetBO(m_offset[compIdx + 1][SAO_BO] + bestClassBO[compIdx], bestClassBO[compIdx], compIdx + 1);

    const uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits();
    const int64_t cost = calcSaoRdoCost((estDist[0] + estDist[1]), estRate, lambda[1]);

    if (cost < costPartBest)
    {
        costPartBest = cost;
        bestDist = (estDist[0] + estDist[1]);

        for (int compIdx = 0; compIdx < 2; compIdx++)
        {
            lclCtuParam[compIdx]->mergeMode = SAO_MERGE_NONE;
            lclCtuParam[compIdx]->typeIdx = SAO_BO;
            lclCtuParam[compIdx]->bandPos = bestClassBO[compIdx];
            for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
                lclCtuParam[compIdx]->offset[classIdx] = m_offset[compIdx + 1][SAO_BO][classIdx + bestClassBO[compIdx]];
        }
    }

    // bestDist may be negative; multiply instead of left-shifting, because
    // shifting a negative signed value is undefined before C++20.
    rateDist += (bestDist * 256) / lambda[1];
    m_entropyCoder.load(m_rdContexts.temp);

    if (saoParam->bSaoFlag[1])
    {
        // Code the chosen parameters of both planes and keep the resulting state.
        m_entropyCoder.codeSaoOffset(*lclCtuParam[0], 1);
        m_entropyCoder.codeSaoOffset(*lclCtuParam[1], 2);
        m_entropyCoder.store(m_rdContexts.temp);
    }

    const uint32_t rate = m_entropyCoder.getNumberOfWrittenBits();
    bestCost = rateDist + rate;
}
1749
1750
// NOTE: the functions below must be placed in namespace X265_NS since they need class SAO
1751
void saoCuStatsBO_c(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
1752
0
{
1753
0
    const int boShift = X265_DEPTH - SAO_BO_BITS;
1754
1755
0
    for (int y = 0; y < endY; y++)
1756
0
    {
1757
0
        for (int x = 0; x < endX; x++)
1758
0
        {
1759
0
            int classIdx = rec[x] >> boShift;
1760
0
            stats[classIdx] += diff[x];
1761
0
            count[classIdx]++;
1762
0
        }
1763
1764
0
        diff += MAX_CU_SIZE;
1765
0
        rec += stride;
1766
0
    }
1767
0
}
1768
1769
void saoCuStatsE0_c(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
1770
0
{
1771
0
    int32_t tmp_stats[SAO::NUM_EDGETYPE];
1772
0
    int32_t tmp_count[SAO::NUM_EDGETYPE];
1773
1774
0
    X265_CHECK(endX <= MAX_CU_SIZE, "endX too big\n");
1775
1776
0
    memset(tmp_stats, 0, sizeof(tmp_stats));
1777
0
    memset(tmp_count, 0, sizeof(tmp_count));
1778
1779
0
    for (int y = 0; y < endY; y++)
1780
0
    {
1781
0
        int signLeft = x265_signOf(rec[0] - rec[-1]);
1782
0
        for (int x = 0; x < endX; x++)
1783
0
        {
1784
0
            int signRight = signOf2(rec[x], rec[x + 1]);
1785
0
            X265_CHECK(signRight == x265_signOf(rec[x] - rec[x + 1]), "signDown check failure\n");
1786
0
            uint32_t edgeType = signRight + signLeft + 2;
1787
0
            signLeft = -signRight;
1788
1789
0
            X265_CHECK(edgeType <= 4, "edgeType check failure\n");
1790
0
            tmp_stats[edgeType] += diff[x];
1791
0
            tmp_count[edgeType]++;
1792
0
        }
1793
1794
0
        diff += MAX_CU_SIZE;
1795
0
        rec += stride;
1796
0
    }
1797
1798
0
    for (int x = 0; x < SAO::NUM_EDGETYPE; x++)
1799
0
    {
1800
0
        stats[SAO::s_eoTable[x]] += tmp_stats[x];
1801
0
        count[SAO::s_eoTable[x]] += tmp_count[x];
1802
0
    }
1803
0
}
1804
1805
void saoCuStatsE1_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
1806
0
{
1807
0
    X265_CHECK(endX <= MAX_CU_SIZE, "endX check failure\n");
1808
0
    X265_CHECK(endY <= MAX_CU_SIZE, "endY check failure\n");
1809
1810
0
    int32_t tmp_stats[SAO::NUM_EDGETYPE];
1811
0
    int32_t tmp_count[SAO::NUM_EDGETYPE];
1812
1813
0
    memset(tmp_stats, 0, sizeof(tmp_stats));
1814
0
    memset(tmp_count, 0, sizeof(tmp_count));
1815
1816
0
    X265_CHECK(endX * endY <= (4096 - 16), "Assembly of saoE1 may overflow with this block size\n");
1817
0
    for (int y = 0; y < endY; y++)
1818
0
    {
1819
0
        for (int x = 0; x < endX; x++)
1820
0
        {
1821
0
            int signDown = signOf2(rec[x], rec[x + stride]);
1822
0
            X265_CHECK(signDown == x265_signOf(rec[x] - rec[x + stride]), "signDown check failure\n");
1823
0
            uint32_t edgeType = signDown + upBuff1[x] + 2;
1824
0
            upBuff1[x] = (int8_t)(-signDown);
1825
1826
0
            X265_CHECK(edgeType <= 4, "edgeType check failure\n");
1827
0
            tmp_stats[edgeType] += diff[x];
1828
0
            tmp_count[edgeType]++;
1829
0
        }
1830
0
        diff += MAX_CU_SIZE;
1831
0
        rec += stride;
1832
0
    }
1833
1834
0
    for (int x = 0; x < SAO::NUM_EDGETYPE; x++)
1835
0
    {
1836
0
        stats[SAO::s_eoTable[x]] += tmp_stats[x];
1837
0
        count[SAO::s_eoTable[x]] += tmp_count[x];
1838
0
    }
1839
0
}
1840
1841
void saoCuStatsE2_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count)
1842
0
{
1843
0
    X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
1844
0
    X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n");
1845
1846
0
    int32_t tmp_stats[SAO::NUM_EDGETYPE];
1847
0
    int32_t tmp_count[SAO::NUM_EDGETYPE];
1848
1849
0
    memset(tmp_stats, 0, sizeof(tmp_stats));
1850
0
    memset(tmp_count, 0, sizeof(tmp_count));
1851
1852
0
    for (int y = 0; y < endY; y++)
1853
0
    {
1854
0
        upBufft[0] = x265_signOf(rec[stride] - rec[-1]);
1855
0
        for (int x = 0; x < endX; x++)
1856
0
        {
1857
0
            int signDown = signOf2(rec[x], rec[x + stride + 1]);
1858
0
            X265_CHECK(signDown == x265_signOf(rec[x] - rec[x + stride + 1]), "signDown check failure\n");
1859
0
            uint32_t edgeType = signDown + upBuff1[x] + 2;
1860
0
            upBufft[x + 1] = (int8_t)(-signDown);
1861
0
            tmp_stats[edgeType] += diff[x];
1862
0
            tmp_count[edgeType]++;
1863
0
        }
1864
1865
0
        std::swap(upBuff1, upBufft);
1866
1867
0
        rec += stride;
1868
0
        diff += MAX_CU_SIZE;
1869
0
    }
1870
1871
0
    for (int x = 0; x < SAO::NUM_EDGETYPE; x++)
1872
0
    {
1873
0
        stats[SAO::s_eoTable[x]] += tmp_stats[x];
1874
0
        count[SAO::s_eoTable[x]] += tmp_count[x];
1875
0
    }
1876
0
}
1877
1878
void saoCuStatsE3_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
1879
0
{
1880
0
    X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
1881
0
    X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n");
1882
1883
0
    int32_t tmp_stats[SAO::NUM_EDGETYPE];
1884
0
    int32_t tmp_count[SAO::NUM_EDGETYPE];
1885
1886
0
    memset(tmp_stats, 0, sizeof(tmp_stats));
1887
0
    memset(tmp_count, 0, sizeof(tmp_count));
1888
1889
0
    for (int y = 0; y < endY; y++)
1890
0
    {
1891
0
        for (int x = 0; x < endX; x++)
1892
0
        {
1893
0
            int signDown = signOf2(rec[x], rec[x + stride - 1]);
1894
0
            X265_CHECK(signDown == x265_signOf(rec[x] - rec[x + stride - 1]), "signDown check failure\n");
1895
0
            X265_CHECK(abs(upBuff1[x]) <= 1, "upBuffer1 check failure\n");
1896
1897
0
            uint32_t edgeType = signDown + upBuff1[x] + 2;
1898
0
            upBuff1[x - 1] = (int8_t)(-signDown);
1899
0
            tmp_stats[edgeType] += diff[x];
1900
0
            tmp_count[edgeType]++;
1901
0
        }
1902
1903
0
        upBuff1[endX - 1] = x265_signOf(rec[endX - 1 + stride] - rec[endX]);
1904
1905
0
        rec += stride;
1906
0
        diff += MAX_CU_SIZE;
1907
0
    }
1908
1909
0
    for (int x = 0; x < SAO::NUM_EDGETYPE; x++)
1910
0
    {
1911
0
        stats[SAO::s_eoTable[x]] += tmp_stats[x];
1912
0
        count[SAO::s_eoTable[x]] += tmp_count[x];
1913
0
    }
1914
0
}
1915
1916
/* Point the SAO statistics entries of the primitive table at the C
 * implementations defined in this file. */
void setupSaoPrimitives_c(EncoderPrimitives &p)
{
    // TODO: move other sao functions to here

    // kernels that classify samples by neighbour comparison
    p.saoCuStatsE0 = saoCuStatsE0_c;
    p.saoCuStatsE1 = saoCuStatsE1_c;
    p.saoCuStatsE2 = saoCuStatsE2_c;
    p.saoCuStatsE3 = saoCuStatsE3_c;

    // kernel that classifies samples by value band
    p.saoCuStatsBO = saoCuStatsBO_c;
}
1925
}
1926