Coverage Report

Created: 2022-08-24 06:15

/src/x265/source/encoder/sao.cpp
Line
Count
Source (jump to first uncovered line)
1
/*****************************************************************************
2
 * Copyright (C) 2013-2020 MulticoreWare, Inc
3
 *
4
 * Authors: Steve Borho <steve@borho.org>
5
 *          Min Chen <chenm003@163.com>
6
 *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
7
 *
8
 * This program is free software; you can redistribute it and/or modify
9
 * it under the terms of the GNU General Public License as published by
10
 * the Free Software Foundation; either version 2 of the License, or
11
 * (at your option) any later version.
12
 *
13
 * This program is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
 * GNU General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU General Public License
19
 * along with this program; if not, write to the Free Software
20
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
21
 *
22
 * This program is also available under a commercial proprietary license.
23
 * For more information, contact us at license @ x265.com.
24
 *****************************************************************************/
25
26
#include "common.h"
27
#include "frame.h"
28
#include "framedata.h"
29
#include "picyuv.h"
30
#include "sao.h"
31
32
namespace {
33
34
/* Divide num by den and round the magnitude of the quotient to the nearest
 * integer; the sign of num is re-applied afterwards, so the result is
 * symmetric around zero (for a positive den). */
inline int32_t roundIBDI(int32_t num, int32_t den)
{
    const int32_t twiceDen = den * 2;

    if (num < 0)
        return -((-num * 2 + den) / twiceDen);

    return (num * 2 + den) / twiceDen;
}
38
39
/* Get the sign of the input variable: -1 for negative, 0 for zero, +1 for
 * positive (TODO: this is a dup, make common).
 *
 * Written with comparisons instead of shifts so that it does not depend on
 * the width of int, on arithmetic right shift of negative values, or on
 * negating INT_MIN. */
inline int8_t signOf(int x)
{
    return (int8_t)((x > 0) - (x < 0));
}
44
45
/* Three-way comparison of a against b: -1 when a < b, +1 when a > b, 0 when equal. */
inline int signOf2(const int a, const int b)
{
    // NOTE: don't reorder below compare, both ICL, VC, GCC optimize strong depends on order!
    int result = 0;

    if (a < b) { result = -1; }
    if (a > b) { result = 1; }

    return result;
}
55
56
/* Evaluate (count * offset - 2 * offsetOrg) * offset.
 *
 * All arithmetic is carried out in 64 bits: the function returns int64_t, but
 * multiplying the 32-bit operands first would overflow (undefined behaviour)
 * before the result is widened. */
inline int64_t estSaoDist(int32_t count, int32_t offset, int32_t offsetOrg)
{
    const int64_t wideOffset = offset;

    return (count * wideOffset - (int64_t)offsetOrg * 2) * wideOffset;
}
60
} // end anonymous namespace
61
62
63
namespace X265_NS {
64
65
/* Lookup used when building m_offsetEo: entry [edgeType] selects which slot of
 * the local offset[] array (slot 0 holds a zero offset, slots 1..4 the signalled
 * offsets) is applied to that edge type. Edge type 2 maps to slot 0. */
const uint32_t SAO::s_eoTable[NUM_EDGETYPE] =
{
    /* edge type 0 */ 1,
    /* edge type 1 */ 2,
    /* edge type 2 */ 0,
    /* edge type 3 */ 3,
    /* edge type 4 */ 4
};
73
74
/* Put every pointer member into a known NULL state so that destroy() and
 * createFromRootNode() can tell what has (not) been allocated yet. */
SAO::SAO()
{
    m_param = NULL;
    m_refDepth = 0;

    // buffers that may be shared between SAO instances
    m_countPreDblk = NULL;
    m_offsetOrgPreDblk = NULL;
    m_depthSaoRate = NULL;
    m_clipTableBase = NULL;
    m_clipTable = NULL;

    // per-plane line buffers
    for (int plane = 0; plane < 3; plane++)
    {
        m_tmpU[plane] = NULL;
        m_tmpL1[plane] = NULL;
        m_tmpL2[plane] = NULL;
    }
}
93
94
/* Allocate the buffers used by this SAO instance.
 *
 * param      - encoder parameters; the pointer is stored in m_param
 * initCommon - when non-zero, also allocate and initialise the buffers that
 *              other instances pick up through createFromRootNode()
 *              (pre-deblock statistics, depth SAO rates, clip table)
 *
 * Returns true on success and false when the 'fail' label is reached
 * (presumably via CHECKED_MALLOC on an allocation failure). */
bool SAO::create(x265_param* param, int initCommon)
{
    m_param = param;
    m_chromaFormat = param->internalCsp;
    m_hChromaShift = CHROMA_H_SHIFT(param->internalCsp);
    m_vChromaShift = CHROMA_V_SHIFT(param->internalCsp);

    // picture dimensions in CTUs, rounded up
    m_numCuInWidth  = (m_param->sourceWidth  + m_param->maxCUSize - 1) / m_param->maxCUSize;
    m_numCuInHeight = (m_param->sourceHeight + m_param->maxCUSize - 1) / m_param->maxCUSize;

    // All function-scope locals are declared ahead of the first CHECKED_MALLOC,
    // so a jump to 'fail' never crosses an initialisation.
    const pixel maxY = (1 << X265_DEPTH) - 1;
    const pixel rangeExt = maxY >> 1;
    int numCtu = m_numCuInWidth * m_numCuInHeight;
    const int numPlanes = (param->internalCsp != X265_CSP_I400) ? 3 : 1;

    for (int plane = 0; plane < numPlanes; plane++)
    {
        CHECKED_MALLOC(m_tmpL1[plane], pixel, m_param->maxCUSize + 1);
        CHECKED_MALLOC(m_tmpL2[plane], pixel, m_param->maxCUSize + 1);

        // SAO asm code will read 1 pixel before and after, so pad by 2
        // NOTE: m_param->sourceWidth+2 would be enough; more is allocated here
        // (up to 63 bytes) to avoid a condition check in copySaoAboveRef()
        CHECKED_MALLOC(m_tmpU[plane], pixel, m_numCuInWidth * m_param->maxCUSize + 2 + 32);
        m_tmpU[plane] += 1; // destroy() steps back by one before freeing
    }

    if (!initCommon)
    {
        // must initialize these common pointer outside of function
        m_countPreDblk = NULL;
        m_offsetOrgPreDblk = NULL;
        m_clipTableBase = NULL;
        m_clipTable = NULL;
        return true;
    }

    if (m_param->bSaoNonDeblocked)
    {
        CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
        CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
    }
    CHECKED_MALLOC(m_depthSaoRate, double, 2 * SAO_DEPTHRATE_SIZE);

    // clear the first four entries of both halves of the rate table
    for (int half = 0; half < 2; half++)
    {
        for (int depth = 0; depth < 4; depth++)
            m_depthSaoRate[half * SAO_DEPTHRATE_SIZE + depth] = 0;
    }

    CHECKED_MALLOC(m_clipTableBase, pixel, maxY + 2 * rangeExt);
    m_clipTable = &(m_clipTableBase[rangeExt]);

    // Share with fast clip lookup table

    // indices below zero clamp to 0
    for (int k = 0; k < rangeExt; k++)
        m_clipTableBase[k] = 0;

    // identity for in-range values
    for (int v = 0; v < maxY; v++)
        m_clipTable[v] = (pixel)v;

    // values at or above maxY clamp to maxY
    for (int v = maxY; v < maxY + rangeExt; v++)
        m_clipTable[v] = maxY;

    return true;

fail:
    return false;
}
166
167
/* Borrow the common buffers owned by 'root' instead of allocating new ones.
 * Only the pointers are copied; this instance must not already hold any of
 * them (checked below). */
void SAO::createFromRootNode(SAO* root)
{
    X265_CHECK(m_countPreDblk == NULL, "duplicate initialize on m_countPreDblk");
    X265_CHECK(m_offsetOrgPreDblk == NULL, "duplicate initialize on m_offsetOrgPreDblk");
    X265_CHECK(m_depthSaoRate == NULL, "duplicate initialize on m_depthSaoRate");
    X265_CHECK(m_clipTableBase == NULL, "duplicate initialize on m_clipTableBase");
    X265_CHECK(m_clipTable == NULL, "duplicate initialize on m_clipTable");

    // clip lookup table
    m_clipTable = root->m_clipTable;
    m_clipTableBase = root->m_clipTableBase; // Unnecessary

    // statistics and rate buffers
    m_depthSaoRate = root->m_depthSaoRate;
    m_offsetOrgPreDblk = root->m_offsetOrgPreDblk;
    m_countPreDblk = root->m_countPreDblk;
}
181
182
void SAO::destroy(int destoryCommon)
183
14.8k
{
184
59.5k
    for (int i = 0; i < 3; i++)
185
44.6k
    {
186
44.6k
        if (m_tmpL1[i])
187
44.6k
        {
188
44.6k
            X265_FREE(m_tmpL1[i]);
189
44.6k
            m_tmpL1[i] = NULL;
190
44.6k
        }
191
192
44.6k
        if (m_tmpL2[i])
193
44.6k
        {
194
44.6k
            X265_FREE(m_tmpL2[i]);
195
44.6k
            m_tmpL2[i] = NULL;
196
44.6k
        }
197
198
44.6k
        if (m_tmpU[i])
199
44.6k
        {
200
44.6k
            X265_FREE(m_tmpU[i] - 1);
201
44.6k
            m_tmpU[i] = NULL;
202
44.6k
        }
203
44.6k
    }
204
205
14.8k
    if (destoryCommon)
206
3.11k
    {
207
3.11k
        if (m_param->bSaoNonDeblocked)
208
0
        {
209
0
            X265_FREE_ZERO(m_countPreDblk);
210
0
            X265_FREE_ZERO(m_offsetOrgPreDblk);
211
0
        }
212
3.11k
        X265_FREE_ZERO(m_depthSaoRate);
213
3.11k
        X265_FREE_ZERO(m_clipTableBase);
214
3.11k
    }
215
14.8k
}
216
217
/* allocate memory for SAO parameters */
218
void SAO::allocSaoParam(SAOParam* saoParam) const
219
698
{
220
698
    int planes = (m_param->internalCsp != X265_CSP_I400) ? 3 : 1;
221
698
    saoParam->numCuInWidth  = m_numCuInWidth;
222
223
2.79k
    for (int i = 0; i < planes; i++)
224
2.09k
        saoParam->ctuParam[i] = new SaoCtuParam[m_numCuInHeight * m_numCuInWidth];
225
698
}
226
227
/* Prepare this SAO instance for a new frame/slice: remember the frame, derive
 * m_refDepth from the slice type, reload the entropy coder states from
 * initState, make sure the frame has an SAOParam, and reset the luma/chroma
 * enable flags and the "no SAO" counters. */
void SAO::startSlice(Frame* frame, Entropy& initState)
{
    m_frame = frame;
    const Slice* slice = m_frame->m_encData->m_slice;

    // m_refDepth is left untouched for any other slice type
    if (slice->m_sliceType == I_SLICE)
        m_refDepth = 0;
    else if (slice->m_sliceType == P_SLICE)
        m_refDepth = 1;
    else if (slice->m_sliceType == B_SLICE)
        m_refDepth = 2 + !IS_REFERENCED(frame);

    m_entropyCoder.load(initState);
    m_rdContexts.next.load(initState);
    m_rdContexts.cur.load(initState);

    // lazily create the per-frame SAO parameter storage
    SAOParam* saoParam = frame->m_encData->m_saoParam;
    if (saoParam == NULL)
    {
        saoParam = new SAOParam;
        allocSaoParam(saoParam);
        frame->m_encData->m_saoParam = saoParam;
    }

    // luma is always enabled; chroma only when both the internal format and
    // the source picture carry chroma
    saoParam->bSaoFlag[0] = true;
    saoParam->bSaoFlag[1] = m_param->internalCsp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400;

    m_numNoSao[0] = 0; // Luma
    m_numNoSao[1] = 0; // Chroma

    // NOTE: Allow SAO automatic turn-off only when frame parallelism is disabled.
    if (m_param->frameNumThreads == 1 && m_refDepth > 0)
    {
        const int prevDepth = m_refDepth - 1;

        if (m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + prevDepth] > SAO_ENCODING_RATE)
            saoParam->bSaoFlag[0] = false;

        if (m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + prevDepth] > SAO_ENCODING_RATE_CHROMA)
            saoParam->bSaoFlag[1] = false;
    }
}
272
273
// CTU-based SAO process without slice granularity
274
void SAO::applyPixelOffsets(int addr, int typeIdx, int plane)
275
0
{
276
0
    PicYuv* reconPic = m_frame->m_reconPic;
277
0
    pixel* rec = reconPic->getPlaneAddr(plane, addr);
278
0
    intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride;
279
0
    uint32_t picWidth  = m_param->sourceWidth;
280
0
    uint32_t picHeight = m_param->sourceHeight;
281
0
    const CUData* cu = m_frame->m_encData->getPicCTU(addr);
282
0
    int ctuWidth = m_param->maxCUSize;
283
0
    int ctuHeight = m_param->maxCUSize;
284
0
    uint32_t lpelx = cu->m_cuPelX;
285
0
    uint32_t tpely = cu->m_cuPelY;
286
0
    const uint32_t firstRowInSlice = cu->m_bFirstRowInSlice;
287
0
    const uint32_t lastRowInSlice = cu->m_bLastRowInSlice;
288
0
    const uint32_t bAboveUnavail = (!tpely) | firstRowInSlice;
289
290
    // NOTE: Careful! the picHeight for Equal operator only, so I may safe to hack it
291
0
    if (lastRowInSlice)
292
0
    {
293
0
        picHeight = x265_min(picHeight, (tpely + ctuHeight));
294
0
    }
295
296
0
    if (plane)
297
0
    {
298
0
        picWidth  >>= m_hChromaShift;
299
0
        picHeight >>= m_vChromaShift;
300
0
        ctuWidth  >>= m_hChromaShift;
301
0
        ctuHeight >>= m_vChromaShift;
302
0
        lpelx     >>= m_hChromaShift;
303
0
        tpely     >>= m_vChromaShift;
304
0
    }
305
0
    uint32_t rpelx = x265_min(lpelx + ctuWidth,  picWidth);
306
0
    uint32_t bpely = x265_min(tpely + ctuHeight, picHeight);
307
0
    ctuWidth  = rpelx - lpelx;
308
0
    ctuHeight = bpely - tpely;
309
310
0
    int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1, signLeft1[2];
311
0
    int8_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
312
313
0
    memset(_upBuff1 + MAX_CU_SIZE, 0, 2 * sizeof(int8_t)); /* avoid valgrind uninit warnings */
314
315
0
    pixel* tmpL = m_tmpL1[plane];
316
0
    pixel* tmpU = &(m_tmpU[plane][lpelx]);
317
318
0
    int8_t* offsetEo = m_offsetEo[plane];
319
320
0
    switch (typeIdx)
321
0
    {
322
0
    case SAO_EO_0: // dir: -
323
0
    {
324
0
        pixel firstPxl = 0, lastPxl = 0, row1FirstPxl = 0, row1LastPxl = 0;
325
0
        int startX = !lpelx;
326
0
        int endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
327
0
        if (ctuWidth & 15)
328
0
        {
329
0
            for (int y = 0; y < ctuHeight; y++, rec += stride)
330
0
            {
331
0
                int signLeft = signOf(rec[startX] - tmpL[y]);
332
0
                for (int x = startX; x < endX; x++)
333
0
                {
334
0
                    int signRight = signOf(rec[x] - rec[x + 1]);
335
0
                    int edgeType = signRight + signLeft + 2;
336
0
                    signLeft = -signRight;
337
338
0
                    rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
339
0
                }
340
0
            }
341
0
        }
342
0
        else
343
0
        {
344
0
            for (int y = 0; y < ctuHeight; y += 2, rec += 2 * stride)
345
0
            {
346
0
                signLeft1[0] = signOf(rec[startX] - tmpL[y]);
347
0
                signLeft1[1] = signOf(rec[stride + startX] - tmpL[y + 1]);
348
349
0
                if (!lpelx)
350
0
                {
351
0
                    firstPxl = rec[0];
352
0
                    row1FirstPxl = rec[stride];
353
0
                }
354
355
0
                if (rpelx == picWidth)
356
0
                {
357
0
                    lastPxl = rec[ctuWidth - 1];
358
0
                    row1LastPxl = rec[stride + ctuWidth - 1];
359
0
                }
360
361
0
                primitives.saoCuOrgE0(rec, offsetEo, ctuWidth, signLeft1, stride);
362
363
0
                if (!lpelx)
364
0
                {
365
0
                    rec[0] = firstPxl;
366
0
                    rec[stride] = row1FirstPxl;
367
0
                }
368
369
0
                if (rpelx == picWidth)
370
0
                {
371
0
                    rec[ctuWidth - 1] = lastPxl;
372
0
                    rec[stride + ctuWidth - 1] = row1LastPxl;
373
0
                }
374
0
            }
375
0
        }
376
0
        break;
377
0
    }
378
0
    case SAO_EO_1: // dir: |
379
0
    {
380
0
        int startY = bAboveUnavail;
381
0
        int endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
382
0
        if (startY)
383
0
            rec += stride;
384
385
0
        if (ctuWidth & 15)
386
0
        {
387
0
            for (int x = 0; x < ctuWidth; x++)
388
0
                upBuff1[x] = signOf(rec[x] - tmpU[x]);
389
390
0
            for (int y = startY; y < endY; y++, rec += stride)
391
0
            {
392
0
                for (int x = 0; x < ctuWidth; x++)
393
0
                {
394
0
                    int8_t signDown = signOf(rec[x] - rec[x + stride]);
395
0
                    int edgeType = signDown + upBuff1[x] + 2;
396
0
                    upBuff1[x] = -signDown;
397
398
0
                    rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
399
0
                }
400
0
            }
401
0
        }
402
0
        else
403
0
        {
404
0
            primitives.sign(upBuff1, rec, tmpU, ctuWidth);
405
406
0
            int diff = (endY - startY) % 2;
407
0
            for (int y = startY; y < endY - diff; y += 2, rec += 2 * stride)
408
0
                primitives.saoCuOrgE1_2Rows(rec, upBuff1, offsetEo, stride, ctuWidth);
409
410
0
            if (diff & 1)
411
0
                primitives.saoCuOrgE1(rec, upBuff1, offsetEo, stride, ctuWidth);
412
0
        }
413
414
0
        break;
415
0
    }
416
0
    case SAO_EO_2: // dir: 135
417
0
    {
418
0
        int startX = !lpelx;
419
0
        int endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
420
421
0
        int startY = bAboveUnavail;
422
0
        int endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
423
424
0
        if (startY)
425
0
            rec += stride;
426
427
0
        if (!(ctuWidth & 15))
428
0
        {
429
0
            int8_t firstSign, lastSign;
430
431
0
            if (!lpelx)
432
0
                firstSign = upBuff1[0];
433
434
0
            if (rpelx == picWidth)
435
0
                lastSign = upBuff1[ctuWidth - 1];
436
437
0
            primitives.sign(upBuff1, rec, &tmpU[- 1], ctuWidth);
438
439
0
            if (!lpelx)
440
0
                upBuff1[0] = firstSign;
441
442
0
            if (rpelx == picWidth)
443
0
                upBuff1[ctuWidth - 1] = lastSign;
444
0
        }
445
0
        else
446
0
        {
447
0
            for (int x = startX; x < endX; x++)
448
0
                upBuff1[x] = signOf(rec[x] - tmpU[x - 1]);
449
0
        }
450
451
0
        if (ctuWidth & 15)
452
0
        {
453
0
             for (int y = startY; y < endY; y++, rec += stride)
454
0
             {
455
0
                 upBufft[startX] = signOf(rec[stride + startX] - tmpL[y]);
456
0
                 for (int x = startX; x < endX; x++)
457
0
                 {
458
0
                     int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
459
0
                     int edgeType = signDown + upBuff1[x] + 2;
460
0
                     upBufft[x + 1] = -signDown;
461
0
                     rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
462
0
                 }
463
464
0
                 std::swap(upBuff1, upBufft);
465
0
             }
466
0
        }
467
0
        else
468
0
        {
469
0
            for (int y = startY; y < endY; y++, rec += stride)
470
0
            {
471
0
                int8_t iSignDown2 = signOf(rec[stride + startX] - tmpL[y]);
472
473
0
                primitives.saoCuOrgE2[endX > 16](rec + startX, upBufft + startX, upBuff1 + startX, offsetEo, endX - startX, stride);
474
475
0
                upBufft[startX] = iSignDown2;
476
477
0
                std::swap(upBuff1, upBufft);
478
0
            }
479
0
        }
480
0
        break;
481
0
    }
482
0
    case SAO_EO_3: // dir: 45
483
0
    {
484
0
        int startX = !lpelx;
485
0
        int endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
486
487
0
        int startY = bAboveUnavail;
488
0
        int endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
489
490
0
        if (startY)
491
0
            rec += stride;
492
493
0
        if (ctuWidth & 15)
494
0
        {
495
0
            for (int x = startX - 1; x < endX; x++)
496
0
                upBuff1[x] = signOf(rec[x] - tmpU[x + 1]);
497
498
0
            for (int y = startY; y < endY; y++, rec += stride)
499
0
            {
500
0
                int x = startX;
501
0
                int8_t signDown = signOf(rec[x] - tmpL[y + 1]);
502
0
                int edgeType = signDown + upBuff1[x] + 2;
503
0
                upBuff1[x - 1] = -signDown;
504
0
                rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
505
506
0
                for (x = startX + 1; x < endX; x++)
507
0
                {
508
0
                    signDown = signOf(rec[x] - rec[x + stride - 1]);
509
0
                    edgeType = signDown + upBuff1[x] + 2;
510
0
                    upBuff1[x - 1] = -signDown;
511
0
                    rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
512
0
                }
513
514
0
                upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
515
0
            }
516
0
        }
517
0
        else
518
0
        {
519
0
            int8_t firstSign, lastSign;
520
521
0
            if (lpelx)
522
0
                firstSign = signOf(rec[-1] - tmpU[0]);
523
0
            if (rpelx == picWidth)
524
0
                lastSign = upBuff1[ctuWidth - 1];
525
526
0
            primitives.sign(upBuff1, rec, &tmpU[1], ctuWidth);
527
528
0
            if (lpelx)
529
0
                upBuff1[-1] = firstSign;
530
0
            if (rpelx == picWidth)
531
0
                upBuff1[ctuWidth - 1] = lastSign;
532
533
0
            for (int y = startY; y < endY; y++, rec += stride)
534
0
            {
535
0
                int x = startX;
536
0
                int8_t signDown = signOf(rec[x] - tmpL[y + 1]);
537
0
                int edgeType = signDown + upBuff1[x] + 2;
538
0
                upBuff1[x - 1] = -signDown;
539
0
                rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
540
541
0
                primitives.saoCuOrgE3[endX > 16](rec, upBuff1, offsetEo, stride - 1, startX, endX);
542
543
0
                upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
544
0
            }
545
0
        }
546
547
0
        break;
548
0
    }
549
0
    case SAO_BO:
550
0
    {
551
0
        const int8_t* offsetBo = m_offsetBo[plane];
552
553
0
        if (ctuWidth & 15)
554
0
        {
555
41.9k
            #define SAO_BO_BITS 5
556
0
            const int boShift = X265_DEPTH - SAO_BO_BITS;
557
558
0
            for (int y = 0; y < ctuHeight; y++, rec += stride)
559
0
                for (int x = 0; x < ctuWidth; x++)
560
0
                    rec[x] = x265_clip(rec[x] + offsetBo[rec[x] >> boShift]);
561
0
        }
562
0
        else
563
0
            primitives.saoCuOrgB0(rec, offsetBo, ctuWidth, ctuHeight, stride);
564
565
0
        break;
566
0
    }
567
0
    default: break;
568
0
    }
569
0
}
570
571
/* Process SAO unit */
572
void SAO::generateLumaOffsets(SaoCtuParam* ctuParam, int idxY, int idxX)
573
13.9k
{
574
13.9k
    PicYuv* reconPic = m_frame->m_reconPic;
575
13.9k
    intptr_t stride = reconPic->m_stride;
576
13.9k
    int ctuWidth = m_param->maxCUSize;
577
13.9k
    int ctuHeight = m_param->maxCUSize;
578
579
13.9k
    int addr = idxY * m_numCuInWidth + idxX;
580
13.9k
    pixel* rec = reconPic->getLumaAddr(addr);
581
582
13.9k
    if (idxX == 0)
583
3.17k
    {
584
140k
        for (int i = 0; i < ctuHeight + 1; i++)
585
137k
        {
586
137k
            m_tmpL1[0][i] = rec[0];
587
137k
            rec += stride;
588
137k
        }
589
3.17k
    }
590
591
13.9k
    bool mergeLeftFlag = (ctuParam[addr].mergeMode == SAO_MERGE_LEFT);
592
13.9k
    int typeIdx = ctuParam[addr].typeIdx;
593
594
13.9k
    if (idxX != (m_numCuInWidth - 1))
595
10.8k
    {
596
10.8k
        rec = reconPic->getLumaAddr(addr);
597
421k
        for (int i = 0; i < ctuHeight + 1; i++)
598
410k
        {
599
410k
            m_tmpL2[0][i] = rec[ctuWidth - 1];
600
410k
            rec += stride;
601
410k
        }
602
10.8k
    }
603
604
13.9k
    if (typeIdx >= 0)
605
0
    {
606
0
        if (!mergeLeftFlag)
607
0
        {
608
0
            if (typeIdx == SAO_BO)
609
0
            {
610
0
                memset(m_offsetBo[0], 0, sizeof(m_offsetBo[0]));
611
612
0
                for (int i = 0; i < SAO_NUM_OFFSET; i++)
613
0
                    m_offsetBo[0][((ctuParam[addr].bandPos + i) & (MAX_NUM_SAO_CLASS - 1))] = (int8_t)(ctuParam[addr].offset[i] << SAO_BIT_INC);
614
0
            }
615
0
            else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3)
616
0
            {
617
0
                int offset[NUM_EDGETYPE];
618
0
                offset[0] = 0;
619
0
                for (int i = 0; i < SAO_NUM_OFFSET; i++)
620
0
                    offset[i + 1] = ctuParam[addr].offset[i] << SAO_BIT_INC;
621
622
0
                for (int edgeType = 0; edgeType < NUM_EDGETYPE; edgeType++)
623
0
                    m_offsetEo[0][edgeType] = (int8_t)offset[s_eoTable[edgeType]];
624
0
            }
625
0
        }
626
0
        applyPixelOffsets(addr, typeIdx, 0);
627
0
    }
628
13.9k
    std::swap(m_tmpL1[0], m_tmpL2[0]);
629
13.9k
}
630
631
/* Process SAO unit (Chroma only) */
632
void SAO::generateChromaOffsets(SaoCtuParam* ctuParam[3], int idxY, int idxX)
633
13.9k
{
634
13.9k
    PicYuv* reconPic = m_frame->m_reconPic;
635
13.9k
    intptr_t stride = reconPic->m_strideC;
636
13.9k
    int ctuWidth  = m_param->maxCUSize;
637
13.9k
    int ctuHeight = m_param->maxCUSize;
638
639
13.9k
    {
640
13.9k
        ctuWidth  >>= m_hChromaShift;
641
13.9k
        ctuHeight >>= m_vChromaShift;
642
13.9k
    }
643
644
13.9k
    int addr = idxY * m_numCuInWidth + idxX;
645
13.9k
    pixel* recCb = reconPic->getCbAddr(addr);
646
13.9k
    pixel* recCr = reconPic->getCrAddr(addr);
647
648
13.9k
    if (idxX == 0)
649
3.17k
    {
650
73.4k
        for (int i = 0; i < ctuHeight + 1; i++)
651
70.2k
        {
652
70.2k
            m_tmpL1[1][i] = recCb[0];
653
70.2k
            m_tmpL1[2][i] = recCr[0];
654
70.2k
            recCb += stride;
655
70.2k
            recCr += stride;
656
70.2k
        }
657
3.17k
    }
658
659
13.9k
    bool mergeLeftFlagCb = (ctuParam[1][addr].mergeMode == SAO_MERGE_LEFT);
660
13.9k
    int typeIdxCb = ctuParam[1][addr].typeIdx;
661
662
13.9k
    bool mergeLeftFlagCr = (ctuParam[2][addr].mergeMode == SAO_MERGE_LEFT);
663
13.9k
    int typeIdxCr = ctuParam[2][addr].typeIdx;
664
665
13.9k
    if (idxX != (m_numCuInWidth - 1))
666
10.8k
    {
667
10.8k
        recCb = reconPic->getCbAddr(addr);
668
10.8k
        recCr = reconPic->getCrAddr(addr);
669
221k
        for (int i = 0; i < ctuHeight + 1; i++)
670
210k
        {
671
210k
            m_tmpL2[1][i] = recCb[ctuWidth - 1];
672
210k
            m_tmpL2[2][i] = recCr[ctuWidth - 1];
673
210k
            recCb += stride;
674
210k
            recCr += stride;
675
210k
        }
676
10.8k
    }
677
678
    // Process U
679
13.9k
    if (typeIdxCb >= 0)
680
0
    {
681
0
        if (!mergeLeftFlagCb)
682
0
        {
683
0
            if (typeIdxCb == SAO_BO)
684
0
            {
685
0
                memset(m_offsetBo[1], 0, sizeof(m_offsetBo[0]));
686
687
0
                for (int i = 0; i < SAO_NUM_OFFSET; i++)
688
0
                    m_offsetBo[1][((ctuParam[1][addr].bandPos + i) & (MAX_NUM_SAO_CLASS - 1))] = (int8_t)(ctuParam[1][addr].offset[i] << SAO_BIT_INC);
689
0
            }
690
0
            else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3)
691
0
            {
692
0
                int offset[NUM_EDGETYPE];
693
0
                offset[0] = 0;
694
0
                for (int i = 0; i < SAO_NUM_OFFSET; i++)
695
0
                    offset[i + 1] = ctuParam[1][addr].offset[i] << SAO_BIT_INC;
696
697
0
                for (int edgeType = 0; edgeType < NUM_EDGETYPE; edgeType++)
698
0
                    m_offsetEo[1][edgeType] = (int8_t)offset[s_eoTable[edgeType]];
699
0
            }
700
0
        }
701
0
        applyPixelOffsets(addr, typeIdxCb, 1);
702
0
    }
703
704
    // Process V
705
13.9k
    if (typeIdxCr >= 0)
706
0
    {
707
0
        if (!mergeLeftFlagCr)
708
0
        {
709
0
            if (typeIdxCr == SAO_BO)
710
0
            {
711
0
                memset(m_offsetBo[2], 0, sizeof(m_offsetBo[0]));
712
713
0
                for (int i = 0; i < SAO_NUM_OFFSET; i++)
714
0
                    m_offsetBo[2][((ctuParam[2][addr].bandPos + i) & (MAX_NUM_SAO_CLASS - 1))] = (int8_t)(ctuParam[2][addr].offset[i] << SAO_BIT_INC);
715
0
            }
716
0
            else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3)
717
0
            {
718
0
                int offset[NUM_EDGETYPE];
719
0
                offset[0] = 0;
720
0
                for (int i = 0; i < SAO_NUM_OFFSET; i++)
721
0
                    offset[i + 1] = ctuParam[2][addr].offset[i] << SAO_BIT_INC;
722
723
0
                for (int edgeType = 0; edgeType < NUM_EDGETYPE; edgeType++)
724
0
                    m_offsetEo[2][edgeType] = (int8_t)offset[s_eoTable[edgeType]];
725
0
            }
726
0
        }
727
0
        applyPixelOffsets(addr, typeIdxCb, 2);
728
0
    }
729
730
13.9k
    std::swap(m_tmpL1[1], m_tmpL2[1]);
731
13.9k
    std::swap(m_tmpL1[2], m_tmpL2[2]);
732
13.9k
}
733
734
/* Calculate SAO statistics for current CTU without non-crossing slice */
735
void SAO::calcSaoStatsCTU(int addr, int plane)
736
41.9k
{
737
41.9k
    Slice* slice = m_frame->m_encData->m_slice;
738
41.9k
    const PicYuv* reconPic = m_frame->m_reconPic;
739
41.9k
    const CUData* cu = m_frame->m_encData->getPicCTU(addr);
740
41.9k
    const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
741
41.9k
    const pixel* rec0  = reconPic->getPlaneAddr(plane, addr);
742
41.9k
    const pixel* fenc;
743
41.9k
    const pixel* rec;
744
41.9k
    intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride;
745
41.9k
    uint32_t picWidth  = m_param->sourceWidth;
746
41.9k
    uint32_t picHeight = m_param->sourceHeight;
747
41.9k
    int ctuWidth  = m_param->maxCUSize;
748
41.9k
    int ctuHeight = m_param->maxCUSize;
749
41.9k
    uint32_t lpelx = cu->m_cuPelX;
750
41.9k
    uint32_t tpely = cu->m_cuPelY;
751
41.9k
    const uint32_t firstRowInSlice = cu->m_bFirstRowInSlice;
752
41.9k
    const uint32_t lastRowInSlice = cu->m_bLastRowInSlice;
753
41.9k
    const uint32_t bAboveUnavail = (!tpely) | firstRowInSlice;
754
755
41.9k
    if (plane)
756
27.9k
    {
757
27.9k
        picWidth  >>= m_hChromaShift;
758
27.9k
        picHeight >>= m_vChromaShift;
759
27.9k
        ctuWidth  >>= m_hChromaShift;
760
27.9k
        ctuHeight >>= m_vChromaShift;
761
27.9k
        lpelx     >>= m_hChromaShift;
762
27.9k
        tpely     >>= m_vChromaShift;
763
27.9k
    }
764
41.9k
    uint32_t rpelx = x265_min(lpelx + ctuWidth,  picWidth);
765
41.9k
    uint32_t bpely = x265_min(tpely + ctuHeight, picHeight);
766
41.9k
    ctuWidth  = rpelx - lpelx;
767
41.9k
    ctuHeight = bpely - tpely;
768
769
    // NOTE: Careful! the picHeight apply for Equal operator only in below, so I may safe to hack it
770
41.9k
    if (lastRowInSlice)
771
9.30k
    {
772
9.30k
        picHeight = bpely;
773
9.30k
    }
774
775
41.9k
    int startX;
776
41.9k
    int startY;
777
41.9k
    int endX;
778
41.9k
    int endY;
779
780
41.9k
    const int plane_offset = plane ? 2 : 0;
781
41.9k
    int skipB = 4;
782
41.9k
    int skipR = 5;
783
784
41.9k
    int8_t _upBuff[2 * (MAX_CU_SIZE + 16 + 16)], *upBuff1 = _upBuff + 16, *upBufft = upBuff1 + (MAX_CU_SIZE + 16 + 16);
785
786
41.9k
    ALIGN_VAR_32(int16_t, diff[MAX_CU_SIZE * MAX_CU_SIZE]);
787
788
    // Calculate (fenc - frec) and put into diff[]
789
41.9k
    if ((lpelx + ctuWidth <  picWidth) & (tpely + ctuHeight < picHeight))
790
25.2k
    {
791
        // WARNING: *) May read beyond bound on video than ctuWidth or ctuHeight is NOT multiple of cuSize
792
25.2k
        X265_CHECK((ctuWidth == ctuHeight) || (m_chromaFormat != X265_CSP_I420), "video size check failure\n");
793
25.2k
        if (plane)
794
16.8k
            primitives.chroma[m_chromaFormat].cu[m_param->maxLog2CUSize - 2].sub_ps(diff, MAX_CU_SIZE, fenc0, rec0, stride, stride);
795
8.41k
        else
796
8.41k
           primitives.cu[m_param->maxLog2CUSize - 2].sub_ps(diff, MAX_CU_SIZE, fenc0, rec0, stride, stride);
797
25.2k
    }
798
16.7k
    else
799
16.7k
    {
800
        // path for non-square area (most in edge)
801
376k
        for(int y = 0; y < ctuHeight; y++)
802
359k
        {
803
10.3M
            for(int x = 0; x < ctuWidth; x++)
804
10.0M
            {
805
10.0M
                diff[y * MAX_CU_SIZE + x] = (fenc0[y * stride + x] - rec0[y * stride + x]);
806
10.0M
            }
807
359k
        }
808
16.7k
    }
809
810
    // SAO_BO:
811
41.9k
    {
812
41.9k
        if (m_param->bSaoNonDeblocked)
813
0
        {
814
0
            skipB = 3;
815
0
            skipR = 4;
816
0
        }
817
818
41.9k
        endX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR + plane_offset;
819
41.9k
        endY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB + plane_offset;
820
821
41.9k
        primitives.saoCuStatsBO(diff, rec0, stride, endX, endY, m_offsetOrg[plane][SAO_BO], m_count[plane][SAO_BO]);
822
41.9k
    }
823
824
41.9k
    {
825
        // SAO_EO_0: // dir: -
826
41.9k
        {
827
41.9k
            if (m_param->bSaoNonDeblocked)
828
0
            {
829
0
                skipB = 3;
830
0
                skipR = 5;
831
0
            }
832
833
41.9k
            startX = !lpelx;
834
41.9k
            endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR + plane_offset;
835
836
41.9k
            primitives.saoCuStatsE0(diff + startX, rec0 + startX, stride, endX - startX, ctuHeight - skipB + plane_offset, m_offsetOrg[plane][SAO_EO_0], m_count[plane][SAO_EO_0]);
837
41.9k
        }
838
839
        // SAO_EO_1: // dir: |
840
41.9k
        {
841
41.9k
            if (m_param->bSaoNonDeblocked)
842
0
            {
843
0
                skipB = 4;
844
0
                skipR = 4;
845
0
            }
846
847
41.9k
            rec  = rec0;
848
849
41.9k
            startY = bAboveUnavail;
850
41.9k
            endX   = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR + plane_offset;
851
41.9k
            endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB + plane_offset;
852
41.9k
            if (startY)
853
9.30k
            {
854
9.30k
                rec += stride;
855
9.30k
            }
856
857
41.9k
            primitives.sign(upBuff1, rec, &rec[- stride], ctuWidth);
858
859
41.9k
            primitives.saoCuStatsE1(diff + startY * MAX_CU_SIZE, rec0 + startY * stride, stride, upBuff1, endX, endY - startY, m_offsetOrg[plane][SAO_EO_1], m_count[plane][SAO_EO_1]);
860
41.9k
        }
861
41.9k
        if (!m_param->bLimitSAO || ((slice->m_sliceType == P_SLICE && !cu->isSkipped(0)) ||
862
0
            (slice->m_sliceType != B_SLICE)))
863
41.9k
        {
864
            // SAO_EO_2: // dir: 135
865
41.9k
            {
866
41.9k
                if (m_param->bSaoNonDeblocked)
867
0
                {
868
0
                    skipB = 4;
869
0
                    skipR = 5;
870
0
                }
871
872
41.9k
                fenc = fenc0;
873
41.9k
                rec  = rec0;
874
875
41.9k
                startX = !lpelx;
876
41.9k
                endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR + plane_offset;
877
878
41.9k
                startY = bAboveUnavail;
879
41.9k
                endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB + plane_offset;
880
41.9k
                if (startY)
881
9.30k
                {
882
9.30k
                    fenc += stride;
883
9.30k
                    rec += stride;
884
9.30k
                }
885
886
41.9k
                primitives.sign(upBuff1, &rec[startX], &rec[startX - stride - 1], (endX - startX));
887
888
41.9k
                primitives.saoCuStatsE2(diff + startX + startY * MAX_CU_SIZE, rec0  + startX + startY * stride, stride, upBuff1, upBufft, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_2], m_count[plane][SAO_EO_2]);
889
41.9k
            }
890
            // SAO_EO_3: // dir: 45
891
41.9k
            {
892
41.9k
                if (m_param->bSaoNonDeblocked)
893
0
                {
894
0
                    skipB = 4;
895
0
                    skipR = 5;
896
0
                }
897
41.9k
                fenc = fenc0;
898
41.9k
                rec  = rec0;
899
41.9k
                startX = !lpelx;
900
41.9k
                endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR + plane_offset;
901
902
41.9k
                startY = bAboveUnavail;
903
41.9k
                endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB + plane_offset;
904
905
41.9k
                if (startY)
906
9.30k
                {
907
9.30k
                    fenc += stride;
908
9.30k
                    rec += stride;
909
9.30k
                }
910
911
41.9k
                primitives.sign(upBuff1, &rec[startX - 1], &rec[startX - 1 - stride + 1], (endX - startX + 1));
912
913
41.9k
                primitives.saoCuStatsE3(diff + startX + startY * MAX_CU_SIZE, rec0  + startX + startY * stride, stride, upBuff1 + 1, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_3], m_count[plane][SAO_EO_3]);
914
41.9k
            }
915
41.9k
        }
916
41.9k
    }
917
41.9k
}
918
919
/* Accumulate SAO statistics for CTU (idxX, idxY) into m_countPreDblk[addr] and
 * m_offsetOrgPreDblk[addr].
 *
 * For every plane the band-offset (BO) classes and the four edge-offset
 * directions (EO_0..EO_3) are gathered.  In each pass the upper rows
 * (y < startY) only contribute the columns from startX rightwards, while the
 * remaining bottom rows contribute their full width; startX/startY are derived
 * from the per-pass skipR/skipB margins.  The EO passes never read samples of
 * the CTU to the right or below (endX/endY stop one sample short).
 *
 * frame - frame whose CTU data is queried for position and slice-row flags
 * idxX  - CTU column index
 * idxY  - CTU row index */
void SAO::calcSaoStatsCu_BeforeDblk(Frame* frame, int idxX, int idxY)
{
    const int addr = idxX + m_numCuInWidth * idxY;

    const CUData* cu = frame->m_encData->getPicCTU(addr);
    const PicYuv* reconPic = m_frame->m_reconPic;
    intptr_t stride = reconPic->m_stride;
    uint32_t picWidth  = m_param->sourceWidth;
    uint32_t picHeight = m_param->sourceHeight;
    int ctuWidth  = m_param->maxCUSize;
    int ctuHeight = m_param->maxCUSize;
    uint32_t lpelx = cu->m_cuPelX;
    uint32_t tpely = cu->m_cuPelY;
    const uint32_t firstRowInSlice = cu->m_bFirstRowInSlice;
    const uint32_t lastRowInSlice = cu->m_bLastRowInSlice;
    // Non-zero when the row above must not be referenced (picture top or first row of the slice)
    const uint32_t bAboveUnavail = (!tpely) | firstRowInSlice;

    // picHeight is only used in equality tests against bpely below, so it can
    // be clamped to make the last slice row behave like the picture bottom.
    if (lastRowInSlice)
        picHeight = x265_min(picHeight, (tpely + ctuHeight));

    uint32_t rpelx = x265_min(lpelx + ctuWidth,  picWidth);
    uint32_t bpely = x265_min(tpely + ctuHeight, picHeight);
    ctuWidth  = rpelx - lpelx;
    ctuHeight = bpely - tpely;

    // Sign buffers carry one guard element on each side so index -1 is valid
    int32_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
    int32_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;

    const int boShift = X265_DEPTH - SAO_BO_BITS;

    memset(m_countPreDblk[addr], 0, sizeof(PerPlane));
    memset(m_offsetOrgPreDblk[addr], 0, sizeof(PerPlane));

    const int numPlanes = (frame->m_param->internalCsp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) ? NUM_PLANE : 1;

    for (int plane = 0; plane < numPlanes; plane++)
    {
        // The skip margins shrink by two for both chroma planes
        const int plane_offset = plane ? 2 : 0;

        if (plane == 1)
        {
            // Switch all geometry to chroma resolution once; plane 2 reuses it
            stride = reconPic->m_strideC;
            picWidth  >>= m_hChromaShift;
            picHeight >>= m_vChromaShift;
            ctuWidth  >>= m_hChromaShift;
            ctuHeight >>= m_vChromaShift;
            lpelx     >>= m_hChromaShift;
            tpely     >>= m_vChromaShift;
            rpelx     >>= m_hChromaShift;
            bpely     >>= m_vChromaShift;
        }

        const pixel* const fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
        const pixel* const rec0 = reconPic->getPlaneAddr(plane, addr);
        const bool atRightEdge  = (rpelx == picWidth);
        const bool atBottomEdge = (bpely == picHeight);

        // SAO_BO:
        {
            const int skipB = 3 - plane_offset;
            const int skipR = 4 - plane_offset;

            int32_t* diffSum   = m_offsetOrgPreDblk[addr][plane][SAO_BO];
            int32_t* sampleCnt = m_countPreDblk[addr][plane][SAO_BO];

            const pixel* fenc = fenc0;
            const pixel* rec  = rec0;

            const int startX = atRightEdge ? ctuWidth : ctuWidth - skipR;
            const int startY = atBottomEdge ? ctuHeight : ctuHeight - skipB;

            for (int y = 0; y < ctuHeight; y++)
            {
                int x = 0;
                if (y < startY)
                    x = startX;

                for (; x < ctuWidth; x++)
                {
                    const int band = rec[x] >> boShift;
                    diffSum[band] += (fenc[x] - rec[x]);
                    sampleCnt[band]++;
                }

                fenc += stride;
                rec += stride;
            }
        }

        // SAO_EO_0: // dir: -
        {
            const int skipB = 3 - plane_offset;
            const int skipR = 5 - plane_offset;

            int32_t* diffSum   = m_offsetOrgPreDblk[addr][plane][SAO_EO_0];
            int32_t* sampleCnt = m_countPreDblk[addr][plane][SAO_EO_0];

            const pixel* fenc = fenc0;
            const pixel* rec  = rec0;

            const int startX = atRightEdge ? ctuWidth - 1 : ctuWidth - skipR;
            const int startY = atBottomEdge ? ctuHeight : ctuHeight - skipB;
            const int firstX = !lpelx;
            const int endX   = ctuWidth - 1;  // not refer right CTU

            for (int y = 0; y < ctuHeight; y++)
            {
                int x = (y < startY) ? startX : firstX;
                int signLeft = signOf(rec[x] - rec[x - 1]);
                for (; x < endX; x++)
                {
                    const int signRight = signOf(rec[x] - rec[x + 1]);
                    const int edgeType = signRight + signLeft + 2;
                    signLeft = -signRight;

                    const uint32_t cls = s_eoTable[edgeType];
                    diffSum[cls] += (fenc[x] - rec[x]);
                    sampleCnt[cls]++;
                }

                fenc += stride;
                rec += stride;
            }
        }

        // SAO_EO_1: // dir: |
        {
            const int skipB = 4 - plane_offset;
            const int skipR = 4 - plane_offset;

            int32_t* diffSum   = m_offsetOrgPreDblk[addr][plane][SAO_EO_1];
            int32_t* sampleCnt = m_countPreDblk[addr][plane][SAO_EO_1];

            const pixel* fenc = fenc0;
            const pixel* rec  = rec0;

            const int startX = atRightEdge ? ctuWidth : ctuWidth - skipR;
            const int startY = atBottomEdge ? ctuHeight - 1 : ctuHeight - skipB;
            const int firstY = bAboveUnavail;
            const int endY   = ctuHeight - 1; // not refer below CTU

            if (firstY)
            {
                fenc += stride;
                rec += stride;
            }

            // Seed the "sign against the row above" buffer for the first processed row
            for (int x = startX; x < ctuWidth; x++)
                upBuff1[x] = signOf(rec[x] - rec[x - stride]);

            for (int y = firstY; y < endY; y++)
            {
                int x = 0;
                if (y < startY - 1)
                    x = startX;

                for (; x < ctuWidth; x++)
                {
                    const int signDown = signOf(rec[x] - rec[x + stride]);
                    const int edgeType = signDown + upBuff1[x] + 2;
                    upBuff1[x] = -signDown;

                    // The sign buffer is always updated; only samples inside the
                    // right/bottom margin are counted.
                    if (x >= startX || y >= startY)
                    {
                        const uint32_t cls = s_eoTable[edgeType];
                        diffSum[cls] += (fenc[x] - rec[x]);
                        sampleCnt[cls]++;
                    }
                }

                fenc += stride;
                rec += stride;
            }
        }

        // SAO_EO_2: // dir: 135
        {
            const int skipB = 4 - plane_offset;
            const int skipR = 5 - plane_offset;

            int32_t* diffSum   = m_offsetOrgPreDblk[addr][plane][SAO_EO_2];
            int32_t* sampleCnt = m_countPreDblk[addr][plane][SAO_EO_2];

            const pixel* fenc = fenc0;
            const pixel* rec  = rec0;

            const int startX = atRightEdge ? ctuWidth - 1 : ctuWidth - skipR;
            const int startY = atBottomEdge ? ctuHeight - 1 : ctuHeight - skipB;
            const int firstX = !lpelx;
            const int firstY = bAboveUnavail;
            const int endX   = ctuWidth - 1;  // not refer right CTU
            const int endY   = ctuHeight - 1; // not refer below CTU

            if (firstY)
            {
                fenc += stride;
                rec += stride;
            }

            for (int x = startX; x < endX; x++)
                upBuff1[x] = signOf(rec[x] - rec[x - stride - 1]);

            for (int y = firstY; y < endY; y++)
            {
                int x = (y < startY - 1) ? startX : firstX;
                upBufft[x] = signOf(rec[x + stride] - rec[x - 1]);
                for (; x < endX; x++)
                {
                    const int signDown = signOf(rec[x] - rec[x + stride + 1]);
                    const int edgeType = signDown + upBuff1[x] + 2;
                    // Sign for the next row is shifted one column to the right
                    upBufft[x + 1] = -signDown;

                    if (x >= startX || y >= startY)
                    {
                        const uint32_t cls = s_eoTable[edgeType];
                        diffSum[cls] += (fenc[x] - rec[x]);
                        sampleCnt[cls]++;
                    }
                }

                std::swap(upBuff1, upBufft);

                rec += stride;
                fenc += stride;
            }
        }

        // SAO_EO_3: // dir: 45
        {
            const int skipB = 4 - plane_offset;
            const int skipR = 5 - plane_offset;

            int32_t* diffSum   = m_offsetOrgPreDblk[addr][plane][SAO_EO_3];
            int32_t* sampleCnt = m_countPreDblk[addr][plane][SAO_EO_3];

            const pixel* fenc = fenc0;
            const pixel* rec  = rec0;

            const int startX = atRightEdge ? ctuWidth - 1 : ctuWidth - skipR;
            const int startY = atBottomEdge ? ctuHeight - 1 : ctuHeight - skipB;
            const int firstX = !lpelx;
            const int firstY = bAboveUnavail;
            const int endX   = ctuWidth - 1;  // not refer right CTU
            const int endY   = ctuHeight - 1; // not refer below CTU

            if (firstY)
            {
                fenc += stride;
                rec += stride;
            }

            for (int x = startX - 1; x < endX; x++)
                upBuff1[x] = signOf(rec[x] - rec[x - stride + 1]);

            for (int y = firstY; y < endY; y++)
            {
                int x = (y < startY - 1) ? startX : firstX;
                for (; x < endX; x++)
                {
                    const int signDown = signOf(rec[x] - rec[x + stride - 1]);
                    const int edgeType = signDown + upBuff1[x] + 2;
                    // Sign for the next row is shifted one column to the left
                    upBuff1[x - 1] = -signDown;

                    if (x >= startX || y >= startY)
                    {
                        const uint32_t cls = s_eoTable[edgeType];
                        diffSum[cls] += (fenc[x] - rec[x]);
                        sampleCnt[cls]++;
                    }
                }

                // The last column's entry is not produced by the loop above
                upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);

                rec += stride;
                fenc += stride;
            }
        }
    }
}
1198
1199
/* reset offset statistics */
1200
void SAO::resetStats()
1201
698
{
1202
698
    memset(m_count, 0, sizeof(m_count));
1203
698
    memset(m_offset, 0, sizeof(m_offset));
1204
698
    memset(m_offsetOrg, 0, sizeof(m_offsetOrg));
1205
698
}
1206
1207
void SAO::rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus)
1208
0
{
1209
0
    if (!saoParam->bSaoFlag[0])
1210
0
        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + m_refDepth] = 1.0;
1211
0
    else
1212
0
    {
1213
0
        X265_CHECK(m_numNoSao[0] <= numctus, "m_numNoSao check failure!");
1214
0
        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + m_refDepth] = m_numNoSao[0] / ((double)numctus);
1215
0
    }
1216
1217
0
    if (!saoParam->bSaoFlag[1])
1218
0
    {
1219
0
        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + m_refDepth] = 1.0;
1220
0
    }
1221
0
    else
1222
0
        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + m_refDepth] = m_numNoSao[1] / ((double)numctus);
1223
0
}
1224
1225
/* Rate-distortion decision of the SAO parameters for one CTU.
 *
 * The function first evaluates fresh luma/chroma parameters (through
 * saoLumaComponentParamDist / saoChromaComponentParamDist), then compares the
 * resulting cost against merging with the left and the upper CTU and keeps the
 * cheapest choice in saoParam->ctuParam[plane][addr].  m_numNoSao[] and the
 * entropy-coder context m_rdContexts.cur are updated at the end.
 *
 * saoParam    - SAO parameters of the picture; entry [plane][addr] is rewritten
 * rowBaseAddr - address of the first CTU of this row (0 disables merge-up)
 * idxX        - CTU column (0 disables merge-left)
 * addr        - CTU address */
void SAO::rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr)
{
    Slice* slice = m_frame->m_encData->m_slice;
    const CUData* cu = m_frame->m_encData->getPicCTU(addr);
    int qp = cu->m_qp[0];
    int64_t lambda[2] = { 0 };

    int qpCb = qp + slice->m_pps->chromaQpOffset[0] + slice->m_chromaQpOffset[0];
    if (m_param->internalCsp == X265_CSP_I420)
        qpCb = x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int)g_chromaScale[x265_clip3(QP_MIN, QP_MAX_MAX, qpCb)]);
    else
        qpCb = x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, qpCb);

    // Lambdas are kept scaled by 256; distortions are scaled the same way
    // before being divided by them below.
    lambda[0] = (int64_t)floor(256.0 * x265_lambda2_tab[qp]);
    lambda[1] = (int64_t)floor(256.0 * x265_lambda2_tab[qpCb]); // Use Cb QP for SAO chroma

    const bool allowMerge[2] = {(idxX != 0), (rowBaseAddr != 0)}; // left, up

    const int addrMerge[2] = {(idxX ? addr - 1 : -1), (rowBaseAddr ? addr - m_numCuInWidth : -1)};// left, up

    bool chroma = m_param->internalCsp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400;
    int planes = chroma ? 3 : 1;

    // reset stats Y, Cb, Cr
    X265_CHECK(sizeof(PerPlane) == (sizeof(int32_t) * (NUM_PLANE * MAX_NUM_SAO_TYPE * MAX_NUM_SAO_CLASS)), "Found Padding space in struct PerPlane");

    // TODO: Confirm the address space is continuous
    if (m_param->bSaoNonDeblocked)
    {
        // Start from the statistics gathered before deblocking for this CTU
        memcpy(m_count, m_countPreDblk[addr], sizeof(m_count));
        memcpy(m_offsetOrg, m_offsetOrgPreDblk[addr], sizeof(m_offsetOrg));
    }
    else
    {
        memset(m_count, 0, sizeof(m_count));
        memset(m_offsetOrg, 0, sizeof(m_offsetOrg));
    }

    for (int i = 0; i < planes; i++)
        saoParam->ctuParam[i][addr].reset();

    // SAO distortion calculation
    // Account for the "no merge" flags of every available merge candidate
    m_entropyCoder.load(m_rdContexts.cur);
    m_entropyCoder.resetBits();
    if (allowMerge[0])
        m_entropyCoder.codeSaoMerge(0);
    if (allowMerge[1])
        m_entropyCoder.codeSaoMerge(0);
    m_entropyCoder.store(m_rdContexts.temp);
    memset(m_offset, 0, sizeof(m_offset));
    int64_t bestCost = 0;
    int64_t rateDist = 0;

    // True while every available merge neighbour (left, up) has luma SAO
    // switched off (typeIdx == -1); stays true when there is no neighbour.
    bool bNeighboursSaoOff = true;
    for (int mergeIdx = 0; mergeIdx < 2; ++mergeIdx)
    {
        if (!allowMerge[mergeIdx])
            continue;

        SaoCtuParam* mergeSrcParam = &(saoParam->ctuParam[0][addrMerge[mergeIdx]]);
        bNeighboursSaoOff = bNeighboursSaoOff && (mergeSrcParam->typeIdx == -1);
    }
    // Don't apply sao if ctu is skipped or adjacent ctus are sao off
    bool bSaoOff = (slice->m_sliceType == B_SLICE) && (cu->isSkipped(0) || bNeighboursSaoOff);

    // Estimate distortion and cost of new SAO params
    if (saoParam->bSaoFlag[0])
    {
        if (!m_param->bLimitSAO || !bSaoOff)
        {
            calcSaoStatsCTU(addr, 0);
            saoStatsInitialOffset(addr, 0);
            saoLumaComponentParamDist(saoParam, addr, rateDist, lambda, bestCost);
        }
    }

    SaoCtuParam* lclCtuParam = &saoParam->ctuParam[0][addr];
    if (saoParam->bSaoFlag[1])
    {
        // With bLimitSAO, chroma is only searched when luma selected an SAO type
        if (!m_param->bLimitSAO || ((lclCtuParam->typeIdx != -1) && !bSaoOff))
        {
            calcSaoStatsCTU(addr, 1);
            calcSaoStatsCTU(addr, 2);
            saoStatsInitialOffset(addr, 1);
            saoChromaComponentParamDist(saoParam, addr, rateDist, lambda, bestCost);
        }
    }
    if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1])
    {
        // Cost of merge left or Up
        for (int mergeIdx = 0; mergeIdx < 2; ++mergeIdx)
        {
            if (!allowMerge[mergeIdx])
                continue;

            int64_t mergeDist = 0;
            for (int plane = 0; plane < planes; plane++)
            {
                int64_t estDist = 0;
                SaoCtuParam* mergeSrcParam = &(saoParam->ctuParam[plane][addrMerge[mergeIdx]]);
                int typeIdx = mergeSrcParam->typeIdx;
                if (typeIdx >= 0)
                {
                    // EO classes start at index 1; BO classes start at the band position
                    int bandPos = (typeIdx == SAO_BO) ? mergeSrcParam->bandPos : 1;
                    for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
                    {
                        int mergeOffset = mergeSrcParam->offset[classIdx];
                        estDist += estSaoDist(m_count[plane][typeIdx][classIdx + bandPos], mergeOffset, m_offsetOrg[plane][typeIdx][classIdx + bandPos]);
                    }
                }
                // estDist may be negative; multiply instead of '<< 8' because
                // left-shifting a negative value is undefined before C++20.
                mergeDist += (estDist * 256) / lambda[!!plane];
            }

            m_entropyCoder.load(m_rdContexts.cur);
            m_entropyCoder.resetBits();
            if (allowMerge[0])
                m_entropyCoder.codeSaoMerge(1 - mergeIdx);
            if (allowMerge[1] && (mergeIdx == 1))
                m_entropyCoder.codeSaoMerge(1);

            uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits();
            int64_t mergeCost = mergeDist + estRate;
            if (mergeCost < bestCost)
            {
                SaoMergeMode mergeMode = mergeIdx ? SAO_MERGE_UP : SAO_MERGE_LEFT;
                bestCost = mergeCost;
                m_entropyCoder.store(m_rdContexts.temp);
                for (int plane = 0; plane < planes; plane++)
                {
                    // bSaoFlag[0] covers luma, bSaoFlag[1] covers both chroma planes
                    if (saoParam->bSaoFlag[plane > 0])
                    {
                        SaoCtuParam* dstCtuParam   = &saoParam->ctuParam[plane][addr];
                        SaoCtuParam* mergeSrcParam = &(saoParam->ctuParam[plane][addrMerge[mergeIdx]]);
                        dstCtuParam->mergeMode = mergeMode;
                        dstCtuParam->typeIdx   = mergeSrcParam->typeIdx;
                        dstCtuParam->bandPos   = mergeSrcParam->bandPos;

                        for (int i = 0; i < SAO_NUM_OFFSET; i++)
                            dstCtuParam->offset[i] = mergeSrcParam->offset[i];
                    }
                }
            }
        }

        if (saoParam->ctuParam[0][addr].typeIdx < 0)
            m_numNoSao[0]++;
        if (chroma && saoParam->ctuParam[1][addr].typeIdx < 0)
            m_numNoSao[1]++;
        // Commit the context state of the winning choice
        m_entropyCoder.load(m_rdContexts.temp);
        m_entropyCoder.store(m_rdContexts.cur);
    }
}
1375
1376
// Rounds the division of initial offsets by the number of samples in
1377
// each of the statistics table entries.
1378
void SAO::saoStatsInitialOffset(int addr, int planes)
1379
27.9k
{
1380
27.9k
    Slice* slice = m_frame->m_encData->m_slice;
1381
27.9k
    const CUData* cu = m_frame->m_encData->getPicCTU(addr);
1382
1383
27.9k
    int maxSaoType;
1384
27.9k
    if (m_param->bLimitSAO && ((slice->m_sliceType == P_SLICE && cu->isSkipped(0)) ||
1385
0
       (slice->m_sliceType == B_SLICE)))
1386
0
    {
1387
0
        maxSaoType = MAX_NUM_SAO_TYPE - 3;
1388
0
    }
1389
27.9k
    else
1390
27.9k
    {
1391
27.9k
        maxSaoType = MAX_NUM_SAO_TYPE - 1;
1392
27.9k
    }
1393
    // EO
1394
69.9k
    for (int plane = planes; plane <= planes * 2; plane++)
1395
41.9k
    {
1396
209k
        for (int typeIdx = 0; typeIdx < maxSaoType; typeIdx++)
1397
167k
        {
1398
839k
            for (int classIdx = 1; classIdx < SAO_NUM_OFFSET + 1; classIdx++)
1399
671k
            {
1400
671k
                int32_t&  count     = m_count[plane][typeIdx][classIdx];
1401
671k
                int32_t& offsetOrg = m_offsetOrg[plane][typeIdx][classIdx];
1402
671k
                int32_t& offsetOut = m_offset[plane][typeIdx][classIdx];
1403
1404
671k
                if (count)
1405
1.62k
                {
1406
1.62k
                    offsetOut = roundIBDI(offsetOrg, count << SAO_BIT_INC);
1407
1.62k
                    offsetOut = x265_clip3(-OFFSET_THRESH + 1, OFFSET_THRESH - 1, offsetOut);
1408
1409
1.62k
                    if (classIdx < 3) 
1410
1.48k
                        offsetOut = X265_MAX(offsetOut, 0);
1411
140
                    else
1412
140
                        offsetOut = X265_MIN(offsetOut, 0);
1413
1.62k
                }
1414
671k
            }
1415
167k
        }
1416
41.9k
    }
1417
    // BO
1418
69.9k
    for (int plane = planes; plane <= planes * 2; plane++)
1419
41.9k
    {
1420
1.38M
        for (int classIdx = 0; classIdx < MAX_NUM_SAO_CLASS; classIdx++)
1421
1.34M
        {
1422
1.34M
            int32_t&  count     = m_count[plane][SAO_BO][classIdx];
1423
1.34M
            int32_t& offsetOrg = m_offsetOrg[plane][SAO_BO][classIdx];
1424
1.34M
            int32_t& offsetOut = m_offset[plane][SAO_BO][classIdx];
1425
1426
1.34M
            if (count)
1427
41.9k
            {
1428
41.9k
                offsetOut = roundIBDI(offsetOrg, count << SAO_BIT_INC);
1429
41.9k
                offsetOut = x265_clip3(-OFFSET_THRESH + 1, OFFSET_THRESH - 1, offsetOut);
1430
41.9k
            }
1431
1.34M
        }
1432
41.9k
    }
1433
27.9k
}
1434
1435
inline int64_t SAO::calcSaoRdoCost(int64_t distortion, uint32_t bits, int64_t lambda)
1436
2.18M
{
1437
2.18M
#if X265_DEPTH < 10
1438
2.18M
        X265_CHECK(bits <= (INT64_MAX - 128) / lambda,
1439
2.18M
                   "calcRdCost wrap detected dist: " X265_LL ", bits %u, lambda: " X265_LL "\n",
1440
2.18M
                   distortion, bits, lambda);
1441
#else
1442
        X265_CHECK(bits <= (INT64_MAX - 128) / lambda,
1443
                   "calcRdCost wrap detected dist: " X265_LL ", bits %u, lambda: " X265_LL "\n",
1444
                   distortion, bits, lambda);
1445
#endif
1446
2.18M
        return distortion + ((bits * lambda + 128) >> 8);
1447
2.18M
}
1448
1449
void SAO::estIterOffset(int typeIdx, int64_t lambda, int32_t count, int32_t offsetOrg, int32_t& offset, int32_t& distClasses, int64_t& costClasses)
1450
2.01M
{
1451
2.01M
    int bestOffset = 0;
1452
2.01M
    distClasses    = 0;
1453
1454
    // Assuming sending quantized value 0 results in zero offset and sending the value zero needs 1 bit.
1455
    // entropy coder can be used to measure the exact rate here.
1456
2.01M
    int64_t bestCost = calcSaoRdoCost(0, 1, lambda);
1457
2.01M
    while (offset != 0)
1458
3.02k
    {
1459
        // Calculate the bits required for signalling the offset
1460
3.02k
        uint32_t rate = (typeIdx == SAO_BO) ? (abs(offset) + 2) : (abs(offset) + 1);
1461
3.02k
        if (abs(offset) == OFFSET_THRESH - 1)
1462
0
            rate--;
1463
1464
        // Do the dequntization before distorion calculation
1465
3.02k
        int64_t dist = estSaoDist(count, offset << SAO_BIT_INC, offsetOrg);
1466
3.02k
        int64_t cost  = calcSaoRdoCost(dist, rate, lambda);
1467
3.02k
        if (cost < bestCost)
1468
74
        {
1469
74
            bestCost = cost;
1470
74
            bestOffset = offset;
1471
74
            distClasses = (int)dist;
1472
74
        }
1473
3.02k
        offset = (offset > 0) ? (offset - 1) : (offset + 1);
1474
3.02k
    }
1475
1476
2.01M
    costClasses = bestCost;
1477
2.01M
    offset = bestOffset;
1478
2.01M
}
1479
/* Choose the luma SAO parameters of CTU 'addr' by rate-distortion cost.
 *
 * Candidates are "SAO off", each allowed edge-offset type and band offset with
 * its best run of SAO_NUM_OFFSET consecutive bands.  The winner is written to
 * saoParam->ctuParam[0][addr] and coded into m_rdContexts.temp.
 *
 * saoParam - SAO parameters; entry [0][addr] is updated when a type wins
 * addr     - CTU address
 * rateDist - out: distortion of the winner scaled by 256 / lambda[0]
 * lambda   - lambda[0] is the luma lambda
 * bestCost - out: written only when internalCsp is X265_CSP_I400 */
void SAO::saoLumaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& rateDist, int64_t* lambda, int64_t &bestCost)
{
    Slice* slice = m_frame->m_encData->m_slice;
    const CUData* cu = m_frame->m_encData->getPicCTU(addr);
    int64_t bestDist = 0;
    int bestTypeIdx = -1;
    SaoCtuParam* lclCtuParam = &saoParam->ctuParam[0][addr];

    int32_t distClasses[MAX_NUM_SAO_CLASS];
    int64_t costClasses[MAX_NUM_SAO_CLASS];

    // RDO SAO_NA
    m_entropyCoder.load(m_rdContexts.temp);
    m_entropyCoder.resetBits();
    m_entropyCoder.codeSaoType(0);
    int64_t costPartBest = calcSaoRdoCost(0, m_entropyCoder.getNumberOfWrittenBits(), lambda[0]);

    // With bLimitSAO, B slices and skipped CTUs of P slices test fewer EO types
    int maxSaoType;
    if (m_param->bLimitSAO && ((slice->m_sliceType == P_SLICE && cu->isSkipped(0)) ||
        (slice->m_sliceType == B_SLICE)))
    {
        maxSaoType = MAX_NUM_SAO_TYPE - 3;
    }
    else
    {
        maxSaoType = MAX_NUM_SAO_TYPE - 1;
    }

    //EO distortion calculation
    for (int typeIdx = 0; typeIdx < maxSaoType; typeIdx++)
    {
        int64_t estDist = 0;
        for (int classIdx = 1; classIdx < SAO_NUM_OFFSET + 1; classIdx++)
        {
            int32_t&  count    = m_count[0][typeIdx][classIdx];
            int32_t& offsetOrg = m_offsetOrg[0][typeIdx][classIdx];
            int32_t& offsetOut = m_offset[0][typeIdx][classIdx];
            estIterOffset(typeIdx, lambda[0], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx]);

            //Calculate distortion
            estDist += distClasses[classIdx];
        }

        m_entropyCoder.load(m_rdContexts.temp);
        m_entropyCoder.resetBits();
        m_entropyCoder.codeSaoOffsetEO(m_offset[0][typeIdx] + 1, typeIdx, 0);

        int64_t cost = calcSaoRdoCost(estDist, m_entropyCoder.getNumberOfWrittenBits(), lambda[0]);

        if (cost < costPartBest)
        {
            costPartBest = cost;
            bestDist = estDist;
            bestTypeIdx = typeIdx;
        }
    }

    if (bestTypeIdx != -1)
    {
        lclCtuParam->mergeMode = SAO_MERGE_NONE;
        lclCtuParam->typeIdx = bestTypeIdx;
        lclCtuParam->bandPos = 0;
        for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
            lclCtuParam->offset[classIdx] = m_offset[0][bestTypeIdx][classIdx + 1];
    }

    //BO RDO
    for (int classIdx = 0; classIdx < MAX_NUM_SAO_CLASS; classIdx++)
    {
        int32_t&  count    = m_count[0][SAO_BO][classIdx];
        int32_t& offsetOrg = m_offsetOrg[0][SAO_BO][classIdx];
        int32_t& offsetOut = m_offset[0][SAO_BO][classIdx];

        estIterOffset(SAO_BO, lambda[0], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx]);
    }

    // Estimate Best Position
    // Sliding window of four consecutive bands: start with bands 0..3, then
    // move one band at a time by dropping the oldest cost and adding the next.
    int32_t bestClassBO  = 0;
    int64_t currentRDCost = costClasses[0];
    currentRDCost += costClasses[1];
    currentRDCost += costClasses[2];
    currentRDCost += costClasses[3];
    int64_t bestRDCostBO = currentRDCost;

    for (int i = 1; i < MAX_NUM_SAO_CLASS - SAO_NUM_OFFSET + 1; i++)
    {
        currentRDCost -= costClasses[i - 1];
        currentRDCost += costClasses[i + 3];

        if (currentRDCost < bestRDCostBO)
        {
            bestRDCostBO = currentRDCost;
            bestClassBO  = i;
        }
    }

    int64_t estDist = 0;
    for (int classIdx = bestClassBO; classIdx < bestClassBO + SAO_NUM_OFFSET; classIdx++)
        estDist += distClasses[classIdx];

    m_entropyCoder.load(m_rdContexts.temp);
    m_entropyCoder.resetBits();
    m_entropyCoder.codeSaoOffsetBO(m_offset[0][SAO_BO] + bestClassBO, bestClassBO, 0);

    int64_t cost = calcSaoRdoCost(estDist, m_entropyCoder.getNumberOfWrittenBits(), lambda[0]);

    if (cost < costPartBest)
    {
        costPartBest = cost;
        bestDist = estDist;

        lclCtuParam->mergeMode = SAO_MERGE_NONE;
        lclCtuParam->typeIdx = SAO_BO;
        lclCtuParam->bandPos = bestClassBO;
        for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
            lclCtuParam->offset[classIdx] = m_offset[0][SAO_BO][classIdx + bestClassBO];
    }

    // bestDist may be negative; multiply instead of '<< 8' because
    // left-shifting a negative value is undefined before C++20.
    rateDist = (bestDist * 256) / lambda[0];
    m_entropyCoder.load(m_rdContexts.temp);
    m_entropyCoder.codeSaoOffset(*lclCtuParam, 0);
    m_entropyCoder.store(m_rdContexts.temp);

    if (m_param->internalCsp == X265_CSP_I400)
    {
        bestCost = rateDist + m_entropyCoder.getNumberOfWrittenBits();
    }
}
1607
/* Rate-distortion search over the SAO modes for the two chroma planes
 * (components 1 and 2) of CTU 'addr'.
 *
 * The baseline candidate is the cost of coding SAO type 0 with zero distortion.
 * Each edge-offset type below maxSaoType is then tried, followed by band offset
 * with the cheapest run of SAO_NUM_OFFSET consecutive classes per plane. Whenever
 * a candidate beats the running best cost, the per-CTU parameters of both planes
 * are overwritten with it.
 *
 * saoParam  supplies ctuParam[1][addr] / ctuParam[2][addr] (updated in place)
 *           and bSaoFlag[1]
 * rateDist  incremented by (best distortion << 8) / lambda[1]
 * lambda    only lambda[1] is read here
 * bestCost  set to rateDist plus the bit count reported by the entropy coder */
void SAO::saoChromaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& rateDist, int64_t* lambda, int64_t &bestCost)
{
    Slice* curSlice = m_frame->m_encData->m_slice;
    const CUData* ctu = m_frame->m_encData->getPicCTU(addr);
    SaoCtuParam* chromaParam[2] = { &saoParam->ctuParam[1][addr], &saoParam->ctuParam[2][addr] };

    int64_t classCost[MAX_NUM_SAO_CLASS];
    int32_t classDist[MAX_NUM_SAO_CLASS];
    int32_t bandStart[2] = { 0, 0 };
    int64_t winningDist = 0;
    int winningEoType = -1;

    // Baseline candidate: bits needed to code SAO type 0, no distortion change
    m_entropyCoder.load(m_rdContexts.temp);
    m_entropyCoder.resetBits();
    m_entropyCoder.codeSaoType(0);

    uint32_t baseBits = m_entropyCoder.getNumberOfWrittenBits();
    int64_t lowestCost = calcSaoRdoCost(0, baseBits, lambda[1]);

    // With bLimitSAO, skipped CTUs of P slices and all B-slice CTUs get a
    // shorter list of edge-offset types to evaluate
    const bool restrictTypes = m_param->bLimitSAO &&
        ((curSlice->m_sliceType == P_SLICE && ctu->isSkipped(0)) || (curSlice->m_sliceType == B_SLICE));
    const int maxSaoType = restrictTypes ? MAX_NUM_SAO_TYPE - 3 : MAX_NUM_SAO_TYPE - 1;

    // ---- edge offset candidates ----
    for (int eoType = 0; eoType < maxSaoType; eoType++)
    {
        int64_t planeDist[2] = { 0, 0 };

        for (int plane = 0; plane < 2; plane++)
        {
            const int comp = plane + 1;

            // classes 1..SAO_NUM_OFFSET carry the edge offsets
            for (int cls = 1; cls <= SAO_NUM_OFFSET; cls++)
            {
                estIterOffset(eoType, lambda[1], m_count[comp][eoType][cls], m_offsetOrg[comp][eoType][cls],
                              m_offset[comp][eoType][cls], classDist[cls], classCost[cls]);

                planeDist[plane] += classDist[cls];
            }
        }

        m_entropyCoder.load(m_rdContexts.temp);
        m_entropyCoder.resetBits();

        for (int plane = 0; plane < 2; plane++)
            m_entropyCoder.codeSaoOffsetEO(m_offset[plane + 1][eoType] + 1, eoType, plane + 1);

        uint32_t eoBits = m_entropyCoder.getNumberOfWrittenBits();
        int64_t eoCost = calcSaoRdoCost((planeDist[0] + planeDist[1]), eoBits, lambda[1]);

        if (eoCost < lowestCost)
        {
            lowestCost = eoCost;
            winningDist = (planeDist[0] + planeDist[1]);
            winningEoType = eoType;
        }
    }

    // Commit the winning edge-offset type (if any) to both chroma planes
    if (winningEoType != -1)
    {
        for (int plane = 0; plane < 2; plane++)
        {
            SaoCtuParam* dst = chromaParam[plane];

            dst->mergeMode = SAO_MERGE_NONE;
            dst->typeIdx = winningEoType;
            dst->bandPos = 0;
            for (int k = 0; k < SAO_NUM_OFFSET; k++)
                dst->offset[k] = m_offset[plane + 1][winningEoType][k + 1];
        }
    }

    // ---- band offset candidate ----
    int64_t boDist[2];

    for (int plane = 0; plane < 2; plane++)
    {
        const int comp = plane + 1;

        // per-class offset, distortion and cost for this plane
        for (int cls = 0; cls < MAX_NUM_SAO_CLASS; cls++)
        {
            estIterOffset(SAO_BO, lambda[1], m_count[comp][SAO_BO][cls], m_offsetOrg[comp][SAO_BO][cls],
                          m_offset[comp][SAO_BO][cls], classDist[cls], classCost[cls]);
        }

        // cheapest window of SAO_NUM_OFFSET consecutive classes; earliest wins ties
        int64_t lowestWindowCost = MAX_INT64;
        const int lastStart = MAX_NUM_SAO_CLASS - SAO_NUM_OFFSET;

        for (int start = 0; start <= lastStart; start++)
        {
            int64_t windowCost = 0;
            for (int k = 0; k < SAO_NUM_OFFSET; k++)
                windowCost += classCost[start + k];

            if (windowCost < lowestWindowCost)
            {
                lowestWindowCost = windowCost;
                bandStart[plane] = start;
            }
        }

        // distortion of the chosen window
        boDist[plane] = 0;
        for (int k = 0; k < SAO_NUM_OFFSET; k++)
            boDist[plane] += classDist[bandStart[plane] + k];
    }

    m_entropyCoder.load(m_rdContexts.temp);
    m_entropyCoder.resetBits();

    for (int plane = 0; plane < 2; plane++)
        m_entropyCoder.codeSaoOffsetBO(m_offset[plane + 1][SAO_BO] + bandStart[plane], bandStart[plane], plane + 1);

    uint32_t boBits = m_entropyCoder.getNumberOfWrittenBits();
    int64_t boCost = calcSaoRdoCost((boDist[0] + boDist[1]), boBits, lambda[1]);

    if (boCost < lowestCost)
    {
        lowestCost = boCost;
        winningDist = (boDist[0] + boDist[1]);

        for (int plane = 0; plane < 2; plane++)
        {
            SaoCtuParam* dst = chromaParam[plane];

            dst->mergeMode = SAO_MERGE_NONE;
            dst->typeIdx = SAO_BO;
            dst->bandPos = bandStart[plane];
            for (int k = 0; k < SAO_NUM_OFFSET; k++)
                dst->offset[k] = m_offset[plane + 1][SAO_BO][k + bandStart[plane]];
        }
    }

    // ---- final accounting ----
    rateDist += (winningDist << 8) / lambda[1];
    m_entropyCoder.load(m_rdContexts.temp);

    // The chosen parameters are coded (and the context saved back) only when
    // chroma SAO is flagged on; the bit count is read back either way.
    if (saoParam->bSaoFlag[1])
    {
        m_entropyCoder.codeSaoOffset(*chromaParam[0], 1);
        m_entropyCoder.codeSaoOffset(*chromaParam[1], 2);
        m_entropyCoder.store(m_rdContexts.temp);
    }

    uint32_t rate = m_entropyCoder.getNumberOfWrittenBits();
    bestCost = rateDist + rate;
}
1760
1761
// NOTE: these primitives must be defined inside namespace X265_NS because they reference class SAO
1762
void saoCuStatsBO_c(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
1763
41.9k
{
1764
41.9k
    const int boShift = X265_DEPTH - SAO_BO_BITS;
1765
1766
907k
    for (int y = 0; y < endY; y++)
1767
866k
    {
1768
28.6M
        for (int x = 0; x < endX; x++)
1769
27.7M
        {
1770
27.7M
            int classIdx = rec[x] >> boShift;
1771
27.7M
            stats[classIdx] += diff[x];
1772
27.7M
            count[classIdx]++;
1773
27.7M
        }
1774
1775
866k
        diff += MAX_CU_SIZE;
1776
866k
        rec += stride;
1777
866k
    }
1778
41.9k
}
1779
1780
void saoCuStatsE0_c(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
1781
41.9k
{
1782
41.9k
    int32_t tmp_stats[SAO::NUM_EDGETYPE];
1783
41.9k
    int32_t tmp_count[SAO::NUM_EDGETYPE];
1784
1785
41.9k
    X265_CHECK(endX <= MAX_CU_SIZE, "endX too big\n");
1786
1787
41.9k
    memset(tmp_stats, 0, sizeof(tmp_stats));
1788
41.9k
    memset(tmp_count, 0, sizeof(tmp_count));
1789
1790
883k
    for (int y = 0; y < endY; y++)
1791
841k
    {
1792
841k
        int signLeft = signOf(rec[0] - rec[-1]);
1793
27.4M
        for (int x = 0; x < endX; x++)
1794
26.6M
        {
1795
26.6M
            int signRight = signOf2(rec[x], rec[x + 1]);
1796
26.6M
            X265_CHECK(signRight == signOf(rec[x] - rec[x + 1]), "signDown check failure\n");
1797
26.6M
            uint32_t edgeType = signRight + signLeft + 2;
1798
26.6M
            signLeft = -signRight;
1799
1800
26.6M
            X265_CHECK(edgeType <= 4, "edgeType check failure\n");
1801
26.6M
            tmp_stats[edgeType] += diff[x];
1802
26.6M
            tmp_count[edgeType]++;
1803
26.6M
        }
1804
1805
841k
        diff += MAX_CU_SIZE;
1806
841k
        rec += stride;
1807
841k
    }
1808
1809
251k
    for (int x = 0; x < SAO::NUM_EDGETYPE; x++)
1810
209k
    {
1811
209k
        stats[SAO::s_eoTable[x]] += tmp_stats[x];
1812
209k
        count[SAO::s_eoTable[x]] += tmp_count[x];
1813
209k
    }
1814
41.9k
}
1815
1816
void saoCuStatsE1_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
1817
41.9k
{
1818
41.9k
    X265_CHECK(endX <= MAX_CU_SIZE, "endX check failure\n");
1819
41.9k
    X265_CHECK(endY <= MAX_CU_SIZE, "endY check failure\n");
1820
1821
41.9k
    int32_t tmp_stats[SAO::NUM_EDGETYPE];
1822
41.9k
    int32_t tmp_count[SAO::NUM_EDGETYPE];
1823
1824
41.9k
    memset(tmp_stats, 0, sizeof(tmp_stats));
1825
41.9k
    memset(tmp_count, 0, sizeof(tmp_count));
1826
1827
41.9k
    X265_CHECK(endX * endY <= (4096 - 16), "Assembly of saoE1 may overflow with this block size\n");
1828
889k
    for (int y = 0; y < endY; y++)
1829
847k
    {
1830
28.1M
        for (int x = 0; x < endX; x++)
1831
27.3M
        {
1832
27.3M
            int signDown = signOf2(rec[x], rec[x + stride]);
1833
27.3M
            X265_CHECK(signDown == signOf(rec[x] - rec[x + stride]), "signDown check failure\n");
1834
27.3M
            uint32_t edgeType = signDown + upBuff1[x] + 2;
1835
27.3M
            upBuff1[x] = (int8_t)(-signDown);
1836
1837
27.3M
            X265_CHECK(edgeType <= 4, "edgeType check failure\n");
1838
27.3M
            tmp_stats[edgeType] += diff[x];
1839
27.3M
            tmp_count[edgeType]++;
1840
27.3M
        }
1841
847k
        diff += MAX_CU_SIZE;
1842
847k
        rec += stride;
1843
847k
    }
1844
1845
251k
    for (int x = 0; x < SAO::NUM_EDGETYPE; x++)
1846
209k
    {
1847
209k
        stats[SAO::s_eoTable[x]] += tmp_stats[x];
1848
209k
        count[SAO::s_eoTable[x]] += tmp_count[x];
1849
209k
    }
1850
41.9k
}
1851
1852
void saoCuStatsE2_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count)
1853
41.9k
{
1854
41.9k
    X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
1855
41.9k
    X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n");
1856
1857
41.9k
    int32_t tmp_stats[SAO::NUM_EDGETYPE];
1858
41.9k
    int32_t tmp_count[SAO::NUM_EDGETYPE];
1859
1860
41.9k
    memset(tmp_stats, 0, sizeof(tmp_stats));
1861
41.9k
    memset(tmp_count, 0, sizeof(tmp_count));
1862
1863
889k
    for (int y = 0; y < endY; y++)
1864
847k
    {
1865
847k
        upBufft[0] = signOf(rec[stride] - rec[-1]);
1866
27.7M
        for (int x = 0; x < endX; x++)
1867
26.8M
        {
1868
26.8M
            int signDown = signOf2(rec[x], rec[x + stride + 1]);
1869
26.8M
            X265_CHECK(signDown == signOf(rec[x] - rec[x + stride + 1]), "signDown check failure\n");
1870
26.8M
            uint32_t edgeType = signDown + upBuff1[x] + 2;
1871
26.8M
            upBufft[x + 1] = (int8_t)(-signDown);
1872
26.8M
            tmp_stats[edgeType] += diff[x];
1873
26.8M
            tmp_count[edgeType]++;
1874
26.8M
        }
1875
1876
847k
        std::swap(upBuff1, upBufft);
1877
1878
847k
        rec += stride;
1879
847k
        diff += MAX_CU_SIZE;
1880
847k
    }
1881
1882
251k
    for (int x = 0; x < SAO::NUM_EDGETYPE; x++)
1883
209k
    {
1884
209k
        stats[SAO::s_eoTable[x]] += tmp_stats[x];
1885
209k
        count[SAO::s_eoTable[x]] += tmp_count[x];
1886
209k
    }
1887
41.9k
}
1888
1889
void saoCuStatsE3_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
1890
41.9k
{
1891
41.9k
    X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
1892
41.9k
    X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n");
1893
1894
41.9k
    int32_t tmp_stats[SAO::NUM_EDGETYPE];
1895
41.9k
    int32_t tmp_count[SAO::NUM_EDGETYPE];
1896
1897
41.9k
    memset(tmp_stats, 0, sizeof(tmp_stats));
1898
41.9k
    memset(tmp_count, 0, sizeof(tmp_count));
1899
1900
889k
    for (int y = 0; y < endY; y++)
1901
847k
    {
1902
27.7M
        for (int x = 0; x < endX; x++)
1903
26.8M
        {
1904
26.8M
            int signDown = signOf2(rec[x], rec[x + stride - 1]);
1905
26.8M
            X265_CHECK(signDown == signOf(rec[x] - rec[x + stride - 1]), "signDown check failure\n");
1906
26.8M
            X265_CHECK(abs(upBuff1[x]) <= 1, "upBuffer1 check failure\n");
1907
1908
26.8M
            uint32_t edgeType = signDown + upBuff1[x] + 2;
1909
26.8M
            upBuff1[x - 1] = (int8_t)(-signDown);
1910
26.8M
            tmp_stats[edgeType] += diff[x];
1911
26.8M
            tmp_count[edgeType]++;
1912
26.8M
        }
1913
1914
847k
        upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
1915
1916
847k
        rec += stride;
1917
847k
        diff += MAX_CU_SIZE;
1918
847k
    }
1919
1920
251k
    for (int x = 0; x < SAO::NUM_EDGETYPE; x++)
1921
209k
    {
1922
209k
        stats[SAO::s_eoTable[x]] += tmp_stats[x];
1923
209k
        count[SAO::s_eoTable[x]] += tmp_count[x];
1924
209k
    }
1925
41.9k
}
1926
1927
/* Point the SAO statistics entries of the primitive table 'p' at the C
 * implementations defined in this file. */
void setupSaoPrimitives_c(EncoderPrimitives &p)
{
    // TODO: move other sao functions to here

    // edge statistics kernels
    p.saoCuStatsE0 = saoCuStatsE0_c;
    p.saoCuStatsE1 = saoCuStatsE1_c;
    p.saoCuStatsE2 = saoCuStatsE2_c;
    p.saoCuStatsE3 = saoCuStatsE3_c;

    // band statistics kernel
    p.saoCuStatsBO = saoCuStatsBO_c;
}
1936
}
1937