Coverage Report

Created: 2022-08-24 06:17

/src/libde265/libde265/sao.cc
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * H.265 video codec.
3
 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
4
 *
5
 * This file is part of libde265.
6
 *
7
 * libde265 is free software: you can redistribute it and/or modify
8
 * it under the terms of the GNU Lesser General Public License as
9
 * published by the Free Software Foundation, either version 3 of
10
 * the License, or (at your option) any later version.
11
 *
12
 * libde265 is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 * GNU Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public License
18
 * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
19
 */
20
21
#include "sao.h"
22
#include "util.h"
23
24
#include <stdlib.h>
25
#include <string.h>
26
27
28
template <class pixel_t>
29
void apply_sao_internal(de265_image* img, int xCtb,int yCtb,
30
                        const slice_segment_header* shdr, int cIdx, int nSW,int nSH,
31
                        const pixel_t* in_img,  int in_stride,
32
                        /* */ pixel_t* out_img, int out_stride)
33
3.11M
{
34
3.11M
  const sao_info* saoinfo = img->get_sao_info(xCtb,yCtb);
35
36
3.11M
  int SaoTypeIdx = (saoinfo->SaoTypeIdx >> (2*cIdx)) & 0x3;
37
38
3.11M
  logtrace(LogSAO,"apply_sao CTB %d;%d cIdx:%d type=%d (%dx%d)\n",xCtb,yCtb,cIdx, SaoTypeIdx, nSW,nSH);
39
40
3.11M
  if (SaoTypeIdx==0) {
41
1.49M
    return;
42
1.49M
  }
43
44
1.61M
  const seq_parameter_set* sps = &img->get_sps();
45
1.61M
  const pic_parameter_set* pps = &img->get_pps();
46
1.61M
  const int bitDepth = (cIdx==0 ? sps->BitDepth_Y : sps->BitDepth_C);
47
1.61M
  const int maxPixelValue = (1<<bitDepth)-1;
48
49
  // top left position of CTB in pixels
50
1.61M
  const int xC = xCtb*nSW;
51
1.61M
  const int yC = yCtb*nSH;
52
53
1.61M
  const int width  = img->get_width(cIdx);
54
1.61M
  const int height = img->get_height(cIdx);
55
56
1.61M
  const int ctbSliceAddrRS = img->get_SliceHeader(xC,yC)->SliceAddrRS;
57
58
1.61M
  const int picWidthInCtbs = sps->PicWidthInCtbsY;
59
1.61M
  const int chromashiftW = sps->get_chroma_shift_W(cIdx);
60
1.61M
  const int chromashiftH = sps->get_chroma_shift_H(cIdx);
61
1.61M
  const int ctbshiftW = sps->Log2CtbSizeY - chromashiftW;
62
1.61M
  const int ctbshiftH = sps->Log2CtbSizeY - chromashiftH;
63
64
65
9.65M
  for (int i=0;i<5;i++)
66
8.04M
    {
67
8.04M
      logtrace(LogSAO,"offset[%d] = %d\n", i, i==0 ? 0 : saoinfo->saoOffsetVal[cIdx][i-1]);
68
8.04M
    }
69
70
71
  // actual size of CTB to be processed (can be smaller when partially outside of image)
72
1.61M
  const int ctbW = (xC+nSW>width)  ? width -xC : nSW;
73
1.61M
  const int ctbH = (yC+nSH>height) ? height-yC : nSH;
74
75
76
1.61M
  const bool extendedTests = img->get_CTB_has_pcm_or_cu_transquant_bypass(xCtb,yCtb);
77
78
1.61M
  if (SaoTypeIdx==2) {
79
79.5k
    int hPos[2], vPos[2];
80
79.5k
    int vPosStride[2]; // vPos[] multiplied by image stride
81
79.5k
    int SaoEoClass = (saoinfo->SaoEoClass >> (2*cIdx)) & 0x3;
82
83
79.5k
    switch (SaoEoClass) {
84
5.18k
    case 0: hPos[0]=-1; hPos[1]= 1; vPos[0]= 0; vPos[1]=0; break;
85
10.8k
    case 1: hPos[0]= 0; hPos[1]= 0; vPos[0]=-1; vPos[1]=1; break;
86
24.5k
    case 2: hPos[0]=-1; hPos[1]= 1; vPos[0]=-1; vPos[1]=1; break;
87
38.9k
    case 3: hPos[0]= 1; hPos[1]=-1; vPos[0]=-1; vPos[1]=1; break;
88
79.5k
    }
89
90
79.5k
    vPosStride[0] = vPos[0] * in_stride;
91
79.5k
    vPosStride[1] = vPos[1] * in_stride;
92
93
    /* Reorder sao_info.saoOffsetVal[] array, so that we can index it
94
       directly with the sum of the two pixel-difference signs. */
95
79.5k
    int8_t  saoOffsetVal[5]; // [2] unused
96
79.5k
    saoOffsetVal[0] = saoinfo->saoOffsetVal[cIdx][1-1];
97
79.5k
    saoOffsetVal[1] = saoinfo->saoOffsetVal[cIdx][2-1];
98
79.5k
    saoOffsetVal[2] = 0;
99
79.5k
    saoOffsetVal[3] = saoinfo->saoOffsetVal[cIdx][3-1];
100
79.5k
    saoOffsetVal[4] = saoinfo->saoOffsetVal[cIdx][4-1];
101
102
103
2.04M
    for (int j=0;j<ctbH;j++) {
104
1.96M
      const pixel_t* in_ptr  = &in_img [xC+(yC+j)*in_stride];
105
1.96M
      /* */ pixel_t* out_ptr = &out_img[xC+(yC+j)*out_stride];
106
107
52.6M
      for (int i=0;i<ctbW;i++) {
108
50.6M
        int edgeIdx = -1;
109
110
50.6M
        logtrace(LogSAO, "pos %d,%d\n",xC+i,yC+j);
111
112
50.6M
        if ((extendedTests &&
113
50.6M
             (sps->pcm_loop_filter_disable_flag &&
114
899k
              img->get_pcm_flag((xC+i)<<chromashiftW,(yC+j)<<chromashiftH))) ||
115
50.6M
            img->get_cu_transquant_bypass((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) {
116
727k
          continue;
117
727k
        }
118
119
        // do the expensive test for boundaries only at the boundaries
120
49.9M
        bool testBoundary = (i==0 || j==0 || i==ctbW-1 || j==ctbH-1);
121
122
49.9M
        if (testBoundary)
123
21.4M
          for (int k=0;k<2;k++) {
124
14.3M
            int xS = xC+i+hPos[k];
125
14.3M
            int yS = yC+j+vPos[k];
126
127
14.3M
            if (xS<0 || yS<0 || xS>=width || yS>=height) {
128
318k
              edgeIdx=0;
129
318k
              break;
130
318k
            }
131
132
133
            // This part seems inefficient with all the get_SliceHeaderIndex() calls,
134
            // but removing this part (because the input was known to have only a single
135
            // slice anyway) reduced computation time only by 1.3%.
136
            // TODO: however, this may still be a big part of SAO itself.
137
138
14.0M
            slice_segment_header* sliceHeader = img->get_SliceHeader(xS<<chromashiftW,
139
14.0M
                                                                     yS<<chromashiftH);
140
14.0M
            if (sliceHeader==NULL) { return; }
141
142
14.0M
            int sliceAddrRS = sliceHeader->SliceAddrRS;
143
14.0M
            if (sliceAddrRS <  ctbSliceAddrRS &&
144
14.0M
                img->get_SliceHeader((xC+i)<<chromashiftW,
145
0
                                     (yC+j)<<chromashiftH)->slice_loop_filter_across_slices_enabled_flag==0) {
146
0
              edgeIdx=0;
147
0
              break;
148
0
            }
149
150
14.0M
            if (sliceAddrRS >  ctbSliceAddrRS &&
151
14.0M
                img->get_SliceHeader(xS<<chromashiftW,
152
0
                                     yS<<chromashiftH)->slice_loop_filter_across_slices_enabled_flag==0) {
153
0
              edgeIdx=0;
154
0
              break;
155
0
            }
156
157
158
14.0M
            if (pps->loop_filter_across_tiles_enabled_flag==0 &&
159
14.0M
                pps->TileIdRS[(xS>>ctbshiftW) + (yS>>ctbshiftH)*picWidthInCtbs] !=
160
14.0M
                pps->TileIdRS[(xC>>ctbshiftW) + (yC>>ctbshiftH)*picWidthInCtbs]) {
161
0
              edgeIdx=0;
162
0
              break;
163
0
            }
164
14.0M
          }
165
166
49.9M
        if (edgeIdx != 0) {
167
168
49.6M
          edgeIdx = ( Sign(in_ptr[i] - in_ptr[i+hPos[0]+vPosStride[0]]) +
169
49.6M
                      Sign(in_ptr[i] - in_ptr[i+hPos[1]+vPosStride[1]])   );
170
171
49.6M
          if (1) { // edgeIdx != 0) {   // seems to be faster without this check (zero in offset table)
172
49.6M
            int offset = saoOffsetVal[edgeIdx+2];
173
174
49.6M
            out_ptr[i] = Clip3(0,maxPixelValue,
175
49.6M
                               in_ptr[i] + offset);
176
49.6M
          }
177
49.6M
        }
178
49.9M
      }
179
1.96M
    }
180
79.5k
  }
181
1.53M
  else {
182
1.53M
    int bandShift = bitDepth-5;
183
1.53M
    int saoLeftClass = saoinfo->sao_band_position[cIdx];
184
1.53M
    logtrace(LogSAO,"saoLeftClass: %d\n",saoLeftClass);
185
186
1.53M
    int bandTable[32];
187
1.53M
    memset(bandTable, 0, sizeof(int)*32);
188
189
7.63M
    for (int k=0;k<4;k++) {
190
6.10M
      bandTable[ (k+saoLeftClass)&31 ] = k+1;
191
6.10M
    }
192
193
194
    /* If PCM or transquant_bypass is used in this CTB, we have to
195
       run all checks (A).
196
       Otherwise, we run a simplified version of the code (B).
197
198
       NOTE: this whole part of SAO does not seem to be a significant part of the time spent
199
    */
200
201
1.53M
    if (extendedTests) {
202
203
      // (A) full version with all checks
204
205
3.88M
      for (int j=0;j<ctbH;j++)
206
87.0M
        for (int i=0;i<ctbW;i++) {
207
208
83.3M
          if ((sps->pcm_loop_filter_disable_flag &&
209
83.3M
               img->get_pcm_flag((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) ||
210
83.3M
              img->get_cu_transquant_bypass((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) {
211
83.2M
            continue;
212
83.2M
          }
213
214
          // Shifts are a strange thing. On x86, >>x actually computes >>(x%64).
215
          // So we have to take care of large bandShifts.
216
88.7k
          int bandIdx;
217
88.7k
          if (bandShift >= 8) {
218
0
            bandIdx = 0;
219
88.7k
          } else {
220
88.7k
            bandIdx = bandTable[ in_img[xC+i+(yC+j)*in_stride]>>bandShift ];
221
88.7k
          }
222
223
88.7k
          if (bandIdx>0) {
224
8.80k
            int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1];
225
226
8.80k
            logtrace(LogSAO,"%d %d (%d) offset %d  %x -> %x\n",xC+i,yC+j,bandIdx,
227
8.80k
                     offset,
228
8.80k
                     in_img[xC+i+(yC+j)*in_stride],
229
8.80k
                     in_img[xC+i+(yC+j)*in_stride]+offset);
230
231
8.80k
            out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue,
232
8.80k
                                                    in_img[xC+i+(yC+j)*in_stride] + offset);
233
8.80k
          }
234
88.7k
        }
235
183k
    }
236
1.35M
    else
237
1.35M
      {
238
        // (B) simplified version (only works if no PCM and transquant_bypass is active)
239
240
29.4M
        for (int j=0;j<ctbH;j++)
241
697M
          for (int i=0;i<ctbW;i++) {
242
243
            // see above
244
669M
            int bandIdx;
245
669M
            if (bandShift >= 8) {
246
29.6k
              bandIdx = 0;
247
669M
            } else {
248
669M
              bandIdx = bandTable[ in_img[xC+i+(yC+j)*in_stride]>>bandShift ];
249
669M
            }
250
251
669M
            if (bandIdx>0) {
252
9.32M
              int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1];
253
254
9.32M
              out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue,
255
9.32M
                                                      in_img[xC+i+(yC+j)*in_stride] + offset);
256
9.32M
            }
257
669M
          }
258
1.35M
      }
259
1.53M
  }
260
1.61M
}
void apply_sao_internal<unsigned short>(de265_image*, int, int, slice_segment_header const*, int, int, int, unsigned short const*, int, unsigned short*, int)
Line
Count
Source
33
44.2k
{
34
44.2k
  const sao_info* saoinfo = img->get_sao_info(xCtb,yCtb);
35
36
44.2k
  int SaoTypeIdx = (saoinfo->SaoTypeIdx >> (2*cIdx)) & 0x3;
37
38
44.2k
  logtrace(LogSAO,"apply_sao CTB %d;%d cIdx:%d type=%d (%dx%d)\n",xCtb,yCtb,cIdx, SaoTypeIdx, nSW,nSH);
39
40
44.2k
  if (SaoTypeIdx==0) {
41
37.4k
    return;
42
37.4k
  }
43
44
6.82k
  const seq_parameter_set* sps = &img->get_sps();
45
6.82k
  const pic_parameter_set* pps = &img->get_pps();
46
6.82k
  const int bitDepth = (cIdx==0 ? sps->BitDepth_Y : sps->BitDepth_C);
47
6.82k
  const int maxPixelValue = (1<<bitDepth)-1;
48
49
  // top left position of CTB in pixels
50
6.82k
  const int xC = xCtb*nSW;
51
6.82k
  const int yC = yCtb*nSH;
52
53
6.82k
  const int width  = img->get_width(cIdx);
54
6.82k
  const int height = img->get_height(cIdx);
55
56
6.82k
  const int ctbSliceAddrRS = img->get_SliceHeader(xC,yC)->SliceAddrRS;
57
58
6.82k
  const int picWidthInCtbs = sps->PicWidthInCtbsY;
59
6.82k
  const int chromashiftW = sps->get_chroma_shift_W(cIdx);
60
6.82k
  const int chromashiftH = sps->get_chroma_shift_H(cIdx);
61
6.82k
  const int ctbshiftW = sps->Log2CtbSizeY - chromashiftW;
62
6.82k
  const int ctbshiftH = sps->Log2CtbSizeY - chromashiftH;
63
64
65
40.9k
  for (int i=0;i<5;i++)
66
34.1k
    {
67
34.1k
      logtrace(LogSAO,"offset[%d] = %d\n", i, i==0 ? 0 : saoinfo->saoOffsetVal[cIdx][i-1]);
68
34.1k
    }
69
70
71
  // actual size of CTB to be processed (can be smaller when partially outside of image)
72
6.82k
  const int ctbW = (xC+nSW>width)  ? width -xC : nSW;
73
6.82k
  const int ctbH = (yC+nSH>height) ? height-yC : nSH;
74
75
76
6.82k
  const bool extendedTests = img->get_CTB_has_pcm_or_cu_transquant_bypass(xCtb,yCtb);
77
78
6.82k
  if (SaoTypeIdx==2) {
79
1.05k
    int hPos[2], vPos[2];
80
1.05k
    int vPosStride[2]; // vPos[] multiplied by image stride
81
1.05k
    int SaoEoClass = (saoinfo->SaoEoClass >> (2*cIdx)) & 0x3;
82
83
1.05k
    switch (SaoEoClass) {
84
451
    case 0: hPos[0]=-1; hPos[1]= 1; vPos[0]= 0; vPos[1]=0; break;
85
114
    case 1: hPos[0]= 0; hPos[1]= 0; vPos[0]=-1; vPos[1]=1; break;
86
197
    case 2: hPos[0]=-1; hPos[1]= 1; vPos[0]=-1; vPos[1]=1; break;
87
297
    case 3: hPos[0]= 1; hPos[1]=-1; vPos[0]=-1; vPos[1]=1; break;
88
1.05k
    }
89
90
1.05k
    vPosStride[0] = vPos[0] * in_stride;
91
1.05k
    vPosStride[1] = vPos[1] * in_stride;
92
93
    /* Reorder sao_info.saoOffsetVal[] array, so that we can index it
94
       directly with the sum of the two pixel-difference signs. */
95
1.05k
    int8_t  saoOffsetVal[5]; // [2] unused
96
1.05k
    saoOffsetVal[0] = saoinfo->saoOffsetVal[cIdx][1-1];
97
1.05k
    saoOffsetVal[1] = saoinfo->saoOffsetVal[cIdx][2-1];
98
1.05k
    saoOffsetVal[2] = 0;
99
1.05k
    saoOffsetVal[3] = saoinfo->saoOffsetVal[cIdx][3-1];
100
1.05k
    saoOffsetVal[4] = saoinfo->saoOffsetVal[cIdx][4-1];
101
102
103
19.7k
    for (int j=0;j<ctbH;j++) {
104
18.7k
      const pixel_t* in_ptr  = &in_img [xC+(yC+j)*in_stride];
105
18.7k
      /* */ pixel_t* out_ptr = &out_img[xC+(yC+j)*out_stride];
106
107
496k
      for (int i=0;i<ctbW;i++) {
108
478k
        int edgeIdx = -1;
109
110
478k
        logtrace(LogSAO, "pos %d,%d\n",xC+i,yC+j);
111
112
478k
        if ((extendedTests &&
113
478k
             (sps->pcm_loop_filter_disable_flag &&
114
191k
              img->get_pcm_flag((xC+i)<<chromashiftW,(yC+j)<<chromashiftH))) ||
115
478k
            img->get_cu_transquant_bypass((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) {
116
151k
          continue;
117
151k
        }
118
119
        // do the expensive test for boundaries only at the boundaries
120
326k
        bool testBoundary = (i==0 || j==0 || i==ctbW-1 || j==ctbH-1);
121
122
326k
        if (testBoundary)
123
165k
          for (int k=0;k<2;k++) {
124
112k
            int xS = xC+i+hPos[k];
125
112k
            int yS = yC+j+vPos[k];
126
127
112k
            if (xS<0 || yS<0 || xS>=width || yS>=height) {
128
4.97k
              edgeIdx=0;
129
4.97k
              break;
130
4.97k
            }
131
132
133
            // This part seems inefficient with all the get_SliceHeaderIndex() calls,
134
            // but removing this part (because the input was known to have only a single
135
            // slice anyway) reduced computation time only by 1.3%.
136
            // TODO: however, this may still be a big part of SAO itself.
137
138
107k
            slice_segment_header* sliceHeader = img->get_SliceHeader(xS<<chromashiftW,
139
107k
                                                                     yS<<chromashiftH);
140
107k
            if (sliceHeader==NULL) { return; }
141
142
107k
            int sliceAddrRS = sliceHeader->SliceAddrRS;
143
107k
            if (sliceAddrRS <  ctbSliceAddrRS &&
144
107k
                img->get_SliceHeader((xC+i)<<chromashiftW,
145
0
                                     (yC+j)<<chromashiftH)->slice_loop_filter_across_slices_enabled_flag==0) {
146
0
              edgeIdx=0;
147
0
              break;
148
0
            }
149
150
107k
            if (sliceAddrRS >  ctbSliceAddrRS &&
151
107k
                img->get_SliceHeader(xS<<chromashiftW,
152
0
                                     yS<<chromashiftH)->slice_loop_filter_across_slices_enabled_flag==0) {
153
0
              edgeIdx=0;
154
0
              break;
155
0
            }
156
157
158
107k
            if (pps->loop_filter_across_tiles_enabled_flag==0 &&
159
107k
                pps->TileIdRS[(xS>>ctbshiftW) + (yS>>ctbshiftH)*picWidthInCtbs] !=
160
61.8k
                pps->TileIdRS[(xC>>ctbshiftW) + (yC>>ctbshiftH)*picWidthInCtbs]) {
161
0
              edgeIdx=0;
162
0
              break;
163
0
            }
164
107k
          }
165
166
326k
        if (edgeIdx != 0) {
167
168
321k
          edgeIdx = ( Sign(in_ptr[i] - in_ptr[i+hPos[0]+vPosStride[0]]) +
169
321k
                      Sign(in_ptr[i] - in_ptr[i+hPos[1]+vPosStride[1]])   );
170
171
321k
          if (1) { // edgeIdx != 0) {   // seems to be faster without this check (zero in offset table)
172
321k
            int offset = saoOffsetVal[edgeIdx+2];
173
174
321k
            out_ptr[i] = Clip3(0,maxPixelValue,
175
321k
                               in_ptr[i] + offset);
176
321k
          }
177
321k
        }
178
326k
      }
179
18.7k
    }
180
1.05k
  }
181
5.76k
  else {
182
5.76k
    int bandShift = bitDepth-5;
183
5.76k
    int saoLeftClass = saoinfo->sao_band_position[cIdx];
184
5.76k
    logtrace(LogSAO,"saoLeftClass: %d\n",saoLeftClass);
185
186
5.76k
    int bandTable[32];
187
5.76k
    memset(bandTable, 0, sizeof(int)*32);
188
189
28.8k
    for (int k=0;k<4;k++) {
190
23.0k
      bandTable[ (k+saoLeftClass)&31 ] = k+1;
191
23.0k
    }
192
193
194
    /* If PCM or transquant_bypass is used in this CTB, we have to
195
       run all checks (A).
196
       Otherwise, we run a simplified version of the code (B).
197
198
       NOTE: this whole part of SAO does not seem to be a significant part of the time spent
199
    */
200
201
5.76k
    if (extendedTests) {
202
203
      // (A) full version with all checks
204
205
199k
      for (int j=0;j<ctbH;j++)
206
9.50M
        for (int i=0;i<ctbW;i++) {
207
208
9.30M
          if ((sps->pcm_loop_filter_disable_flag &&
209
9.30M
               img->get_pcm_flag((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) ||
210
9.30M
              img->get_cu_transquant_bypass((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) {
211
9.30M
            continue;
212
9.30M
          }
213
214
          // Shifts are a strange thing. On x86, >>x actually computes >>(x%64).
215
          // So we have to take care of large bandShifts.
216
4.89k
          int bandIdx;
217
4.89k
          if (bandShift >= 8) {
218
0
            bandIdx = 0;
219
4.89k
          } else {
220
4.89k
            bandIdx = bandTable[ in_img[xC+i+(yC+j)*in_stride]>>bandShift ];
221
4.89k
          }
222
223
4.89k
          if (bandIdx>0) {
224
132
            int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1];
225
226
132
            logtrace(LogSAO,"%d %d (%d) offset %d  %x -> %x\n",xC+i,yC+j,bandIdx,
227
132
                     offset,
228
132
                     in_img[xC+i+(yC+j)*in_stride],
229
132
                     in_img[xC+i+(yC+j)*in_stride]+offset);
230
231
132
            out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue,
232
132
                                                    in_img[xC+i+(yC+j)*in_stride] + offset);
233
132
          }
234
4.89k
        }
235
4.68k
    }
236
1.08k
    else
237
1.08k
      {
238
        // (B) simplified version (only works if no PCM and transquant_bypass is active)
239
240
38.4k
        for (int j=0;j<ctbH;j++)
241
1.69M
          for (int i=0;i<ctbW;i++) {
242
243
            // see above
244
1.65M
            int bandIdx;
245
1.65M
            if (bandShift >= 8) {
246
29.6k
              bandIdx = 0;
247
1.62M
            } else {
248
1.62M
              bandIdx = bandTable[ in_img[xC+i+(yC+j)*in_stride]>>bandShift ];
249
1.62M
            }
250
251
1.65M
            if (bandIdx>0) {
252
7.42k
              int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1];
253
254
7.42k
              out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue,
255
7.42k
                                                      in_img[xC+i+(yC+j)*in_stride] + offset);
256
7.42k
            }
257
1.65M
          }
258
1.08k
      }
259
5.76k
  }
260
6.82k
}
void apply_sao_internal<unsigned char>(de265_image*, int, int, slice_segment_header const*, int, int, int, unsigned char const*, int, unsigned char*, int)
Line
Count
Source
33
3.07M
{
34
3.07M
  const sao_info* saoinfo = img->get_sao_info(xCtb,yCtb);
35
36
3.07M
  int SaoTypeIdx = (saoinfo->SaoTypeIdx >> (2*cIdx)) & 0x3;
37
38
3.07M
  logtrace(LogSAO,"apply_sao CTB %d;%d cIdx:%d type=%d (%dx%d)\n",xCtb,yCtb,cIdx, SaoTypeIdx, nSW,nSH);
39
40
3.07M
  if (SaoTypeIdx==0) {
41
1.46M
    return;
42
1.46M
  }
43
44
1.60M
  const seq_parameter_set* sps = &img->get_sps();
45
1.60M
  const pic_parameter_set* pps = &img->get_pps();
46
1.60M
  const int bitDepth = (cIdx==0 ? sps->BitDepth_Y : sps->BitDepth_C);
47
1.60M
  const int maxPixelValue = (1<<bitDepth)-1;
48
49
  // top left position of CTB in pixels
50
1.60M
  const int xC = xCtb*nSW;
51
1.60M
  const int yC = yCtb*nSH;
52
53
1.60M
  const int width  = img->get_width(cIdx);
54
1.60M
  const int height = img->get_height(cIdx);
55
56
1.60M
  const int ctbSliceAddrRS = img->get_SliceHeader(xC,yC)->SliceAddrRS;
57
58
1.60M
  const int picWidthInCtbs = sps->PicWidthInCtbsY;
59
1.60M
  const int chromashiftW = sps->get_chroma_shift_W(cIdx);
60
1.60M
  const int chromashiftH = sps->get_chroma_shift_H(cIdx);
61
1.60M
  const int ctbshiftW = sps->Log2CtbSizeY - chromashiftW;
62
1.60M
  const int ctbshiftH = sps->Log2CtbSizeY - chromashiftH;
63
64
65
9.61M
  for (int i=0;i<5;i++)
66
8.00M
    {
67
8.00M
      logtrace(LogSAO,"offset[%d] = %d\n", i, i==0 ? 0 : saoinfo->saoOffsetVal[cIdx][i-1]);
68
8.00M
    }
69
70
71
  // actual size of CTB to be processed (can be smaller when partially outside of image)
72
1.60M
  const int ctbW = (xC+nSW>width)  ? width -xC : nSW;
73
1.60M
  const int ctbH = (yC+nSH>height) ? height-yC : nSH;
74
75
76
1.60M
  const bool extendedTests = img->get_CTB_has_pcm_or_cu_transquant_bypass(xCtb,yCtb);
77
78
1.60M
  if (SaoTypeIdx==2) {
79
78.4k
    int hPos[2], vPos[2];
80
78.4k
    int vPosStride[2]; // vPos[] multiplied by image stride
81
78.4k
    int SaoEoClass = (saoinfo->SaoEoClass >> (2*cIdx)) & 0x3;
82
83
78.4k
    switch (SaoEoClass) {
84
4.73k
    case 0: hPos[0]=-1; hPos[1]= 1; vPos[0]= 0; vPos[1]=0; break;
85
10.7k
    case 1: hPos[0]= 0; hPos[1]= 0; vPos[0]=-1; vPos[1]=1; break;
86
24.3k
    case 2: hPos[0]=-1; hPos[1]= 1; vPos[0]=-1; vPos[1]=1; break;
87
38.6k
    case 3: hPos[0]= 1; hPos[1]=-1; vPos[0]=-1; vPos[1]=1; break;
88
78.4k
    }
89
90
78.4k
    vPosStride[0] = vPos[0] * in_stride;
91
78.4k
    vPosStride[1] = vPos[1] * in_stride;
92
93
    /* Reorder sao_info.saoOffsetVal[] array, so that we can index it
94
       directly with the sum of the two pixel-difference signs. */
95
78.4k
    int8_t  saoOffsetVal[5]; // [2] unused
96
78.4k
    saoOffsetVal[0] = saoinfo->saoOffsetVal[cIdx][1-1];
97
78.4k
    saoOffsetVal[1] = saoinfo->saoOffsetVal[cIdx][2-1];
98
78.4k
    saoOffsetVal[2] = 0;
99
78.4k
    saoOffsetVal[3] = saoinfo->saoOffsetVal[cIdx][3-1];
100
78.4k
    saoOffsetVal[4] = saoinfo->saoOffsetVal[cIdx][4-1];
101
102
103
2.02M
    for (int j=0;j<ctbH;j++) {
104
1.94M
      const pixel_t* in_ptr  = &in_img [xC+(yC+j)*in_stride];
105
1.94M
      /* */ pixel_t* out_ptr = &out_img[xC+(yC+j)*out_stride];
106
107
52.1M
      for (int i=0;i<ctbW;i++) {
108
50.1M
        int edgeIdx = -1;
109
110
50.1M
        logtrace(LogSAO, "pos %d,%d\n",xC+i,yC+j);
111
112
50.1M
        if ((extendedTests &&
113
50.1M
             (sps->pcm_loop_filter_disable_flag &&
114
708k
              img->get_pcm_flag((xC+i)<<chromashiftW,(yC+j)<<chromashiftH))) ||
115
50.2M
            img->get_cu_transquant_bypass((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) {
116
576k
          continue;
117
576k
        }
118
119
        // do the expensive test for boundaries only at the boundaries
120
49.6M
        bool testBoundary = (i==0 || j==0 || i==ctbW-1 || j==ctbH-1);
121
122
49.6M
        if (testBoundary)
123
21.2M
          for (int k=0;k<2;k++) {
124
14.2M
            int xS = xC+i+hPos[k];
125
14.2M
            int yS = yC+j+vPos[k];
126
127
14.2M
            if (xS<0 || yS<0 || xS>=width || yS>=height) {
128
313k
              edgeIdx=0;
129
313k
              break;
130
313k
            }
131
132
133
            // This part seems inefficient with all the get_SliceHeaderIndex() calls,
134
            // but removing this part (because the input was known to have only a single
135
            // slice anyway) reduced computation time only by 1.3%.
136
            // TODO: however, this may still be a big part of SAO itself.
137
138
13.9M
            slice_segment_header* sliceHeader = img->get_SliceHeader(xS<<chromashiftW,
139
13.9M
                                                                     yS<<chromashiftH);
140
13.9M
            if (sliceHeader==NULL) { return; }
141
142
13.9M
            int sliceAddrRS = sliceHeader->SliceAddrRS;
143
13.9M
            if (sliceAddrRS <  ctbSliceAddrRS &&
144
13.9M
                img->get_SliceHeader((xC+i)<<chromashiftW,
145
0
                                     (yC+j)<<chromashiftH)->slice_loop_filter_across_slices_enabled_flag==0) {
146
0
              edgeIdx=0;
147
0
              break;
148
0
            }
149
150
13.9M
            if (sliceAddrRS >  ctbSliceAddrRS &&
151
13.9M
                img->get_SliceHeader(xS<<chromashiftW,
152
0
                                     yS<<chromashiftH)->slice_loop_filter_across_slices_enabled_flag==0) {
153
0
              edgeIdx=0;
154
0
              break;
155
0
            }
156
157
158
13.9M
            if (pps->loop_filter_across_tiles_enabled_flag==0 &&
159
13.9M
                pps->TileIdRS[(xS>>ctbshiftW) + (yS>>ctbshiftH)*picWidthInCtbs] !=
160
13.9M
                pps->TileIdRS[(xC>>ctbshiftW) + (yC>>ctbshiftH)*picWidthInCtbs]) {
161
0
              edgeIdx=0;
162
0
              break;
163
0
            }
164
13.9M
          }
165
166
49.6M
        if (edgeIdx != 0) {
167
168
49.2M
          edgeIdx = ( Sign(in_ptr[i] - in_ptr[i+hPos[0]+vPosStride[0]]) +
169
49.2M
                      Sign(in_ptr[i] - in_ptr[i+hPos[1]+vPosStride[1]])   );
170
171
49.3M
          if (1) { // edgeIdx != 0) {   // seems to be faster without this check (zero in offset table)
172
49.3M
            int offset = saoOffsetVal[edgeIdx+2];
173
174
49.3M
            out_ptr[i] = Clip3(0,maxPixelValue,
175
49.3M
                               in_ptr[i] + offset);
176
49.3M
          }
177
49.2M
        }
178
49.6M
      }
179
1.94M
    }
180
78.4k
  }
181
1.53M
  else {
182
1.53M
    int bandShift = bitDepth-5;
183
1.53M
    int saoLeftClass = saoinfo->sao_band_position[cIdx];
184
1.53M
    logtrace(LogSAO,"saoLeftClass: %d\n",saoLeftClass);
185
186
1.53M
    int bandTable[32];
187
1.53M
    memset(bandTable, 0, sizeof(int)*32);
188
189
7.61M
    for (int k=0;k<4;k++) {
190
6.07M
      bandTable[ (k+saoLeftClass)&31 ] = k+1;
191
6.07M
    }
192
193
194
    /* If PCM or transquant_bypass is used in this CTB, we have to
195
       run all checks (A).
196
       Otherwise, we run a simplified version of the code (B).
197
198
       NOTE: this whole part of SAO does not seem to be a significant part of the time spent
199
    */
200
201
1.53M
    if (extendedTests) {
202
203
      // (A) full version with all checks
204
205
3.68M
      for (int j=0;j<ctbH;j++)
206
77.5M
        for (int i=0;i<ctbW;i++) {
207
208
74.0M
          if ((sps->pcm_loop_filter_disable_flag &&
209
74.0M
               img->get_pcm_flag((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) ||
210
74.0M
              img->get_cu_transquant_bypass((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) {
211
73.9M
            continue;
212
73.9M
          }
213
214
          // Shifts are a strange thing. On x86, >>x actually computes >>(x%64).
215
          // So we have to take care of large bandShifts.
216
83.8k
          int bandIdx;
217
83.8k
          if (bandShift >= 8) {
218
0
            bandIdx = 0;
219
83.8k
          } else {
220
83.8k
            bandIdx = bandTable[ in_img[xC+i+(yC+j)*in_stride]>>bandShift ];
221
83.8k
          }
222
223
83.8k
          if (bandIdx>0) {
224
8.66k
            int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1];
225
226
8.66k
            logtrace(LogSAO,"%d %d (%d) offset %d  %x -> %x\n",xC+i,yC+j,bandIdx,
227
8.66k
                     offset,
228
8.66k
                     in_img[xC+i+(yC+j)*in_stride],
229
8.66k
                     in_img[xC+i+(yC+j)*in_stride]+offset);
230
231
8.66k
            out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue,
232
8.66k
                                                    in_img[xC+i+(yC+j)*in_stride] + offset);
233
8.66k
          }
234
83.8k
        }
235
178k
    }
236
1.35M
    else
237
1.35M
      {
238
        // (B) simplified version (only works if no PCM and transquant_bypass is active)
239
240
29.4M
        for (int j=0;j<ctbH;j++)
241
695M
          for (int i=0;i<ctbW;i++) {
242
243
            // see above
244
667M
            int bandIdx;
245
667M
            if (bandShift >= 8) {
246
0
              bandIdx = 0;
247
667M
            } else {
248
667M
              bandIdx = bandTable[ in_img[xC+i+(yC+j)*in_stride]>>bandShift ];
249
667M
            }
250
251
667M
            if (bandIdx>0) {
252
9.32M
              int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1];
253
254
9.32M
              out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue,
255
9.32M
                                                      in_img[xC+i+(yC+j)*in_stride] + offset);
256
9.32M
            }
257
667M
          }
258
1.35M
      }
259
1.53M
  }
260
1.60M
}
261
262
263
template <class pixel_t>
264
void apply_sao(de265_image* img, int xCtb,int yCtb,
265
               const slice_segment_header* shdr, int cIdx, int nSW,int nSH,
266
               const pixel_t* in_img,  int in_stride,
267
               /* */ pixel_t* out_img, int out_stride)
268
3.11M
{
269
3.11M
  if (img->high_bit_depth(cIdx)) {
270
44.2k
    apply_sao_internal<uint16_t>(img,xCtb,yCtb, shdr,cIdx,nSW,nSH,
271
44.2k
                                 (uint16_t*)in_img, in_stride,
272
44.2k
                                 (uint16_t*)out_img,out_stride);
273
44.2k
  }
274
3.07M
  else {
275
3.07M
    apply_sao_internal<uint8_t>(img,xCtb,yCtb, shdr,cIdx,nSW,nSH,
276
3.07M
                                in_img, in_stride,
277
3.07M
                                out_img,out_stride);
278
3.07M
  }
279
3.11M
}
280
281
282
void apply_sample_adaptive_offset(de265_image* img)
283
0
{
284
0
  const seq_parameter_set& sps = img->get_sps();
285
286
0
  if (sps.sample_adaptive_offset_enabled_flag==0) {
287
0
    return;
288
0
  }
289
290
0
  de265_image inputCopy;
291
0
  de265_error err = inputCopy.copy_image(img);
292
0
  if (err != DE265_OK) {
293
0
    img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false);
294
0
    return;
295
0
  }
296
297
0
  for (int yCtb=0; yCtb<sps.PicHeightInCtbsY; yCtb++)
298
0
    for (int xCtb=0; xCtb<sps.PicWidthInCtbsY; xCtb++)
299
0
      {
300
0
        const slice_segment_header* shdr = img->get_SliceHeaderCtb(xCtb,yCtb);
301
302
0
        if (shdr->slice_sao_luma_flag) {
303
0
          apply_sao(img, xCtb,yCtb, shdr, 0, 1<<sps.Log2CtbSizeY, 1<<sps.Log2CtbSizeY,
304
0
                    inputCopy.get_image_plane(0), inputCopy.get_image_stride(0),
305
0
                    img->get_image_plane(0), img->get_image_stride(0));
306
0
        }
307
308
0
        if (shdr->slice_sao_chroma_flag) {
309
0
          int nSW = (1<<sps.Log2CtbSizeY) / sps.SubWidthC;
310
0
          int nSH = (1<<sps.Log2CtbSizeY) / sps.SubHeightC;
311
312
0
          apply_sao(img, xCtb,yCtb, shdr, 1, nSW,nSH,
313
0
                    inputCopy.get_image_plane(1), inputCopy.get_image_stride(1),
314
0
                    img->get_image_plane(1), img->get_image_stride(1));
315
316
0
          apply_sao(img, xCtb,yCtb, shdr, 2, nSW,nSH,
317
0
                    inputCopy.get_image_plane(2), inputCopy.get_image_stride(2),
318
0
                    img->get_image_plane(2), img->get_image_stride(2));
319
0
        }
320
0
      }
321
0
}
322
323
324
void apply_sample_adaptive_offset_sequential(de265_image* img)
325
0
{
326
0
  const seq_parameter_set& sps = img->get_sps();
327
328
0
  if (sps.sample_adaptive_offset_enabled_flag==0) {
329
0
    return;
330
0
  }
331
332
0
  int lumaImageSize   = img->get_image_stride(0) * img->get_height(0) * img->get_bytes_per_pixel(0);
333
0
  int chromaImageSize = img->get_image_stride(1) * img->get_height(1) * img->get_bytes_per_pixel(1);
334
335
0
  uint8_t* inputCopy = new uint8_t[ libde265_max(lumaImageSize, chromaImageSize) ];
336
0
  if (inputCopy == NULL) {
337
0
    img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false);
338
0
    return;
339
0
  }
340
341
342
0
  int nChannels = 3;
343
0
  if (sps.ChromaArrayType == CHROMA_MONO) { nChannels=1; }
344
345
0
  for (int cIdx=0;cIdx<nChannels;cIdx++) {
346
347
0
    int stride = img->get_image_stride(cIdx);
348
0
    int height = img->get_height(cIdx);
349
350
0
    memcpy(inputCopy, img->get_image_plane(cIdx), stride * height * img->get_bytes_per_pixel(cIdx));
351
352
0
    for (int yCtb=0; yCtb<sps.PicHeightInCtbsY; yCtb++)
353
0
      for (int xCtb=0; xCtb<sps.PicWidthInCtbsY; xCtb++)
354
0
        {
355
0
          const slice_segment_header* shdr = img->get_SliceHeaderCtb(xCtb,yCtb);
356
0
          if (shdr==NULL) {
357
0
      delete[] inputCopy;
358
0
      return;
359
0
    }
360
361
0
          if (cIdx==0 && shdr->slice_sao_luma_flag) {
362
0
            apply_sao(img, xCtb,yCtb, shdr, 0, 1<<sps.Log2CtbSizeY, 1<<sps.Log2CtbSizeY,
363
0
                      inputCopy, stride,
364
0
                      img->get_image_plane(0), img->get_image_stride(0));
365
0
          }
366
367
0
          if (cIdx!=0 && shdr->slice_sao_chroma_flag) {
368
0
            int nSW = (1<<sps.Log2CtbSizeY) / sps.SubWidthC;
369
0
            int nSH = (1<<sps.Log2CtbSizeY) / sps.SubHeightC;
370
371
0
            apply_sao(img, xCtb,yCtb, shdr, cIdx, nSW,nSH,
372
0
                      inputCopy, stride,
373
0
                      img->get_image_plane(cIdx), img->get_image_stride(cIdx));
374
0
          }
375
0
        }
376
0
  }
377
378
0
  delete[] inputCopy;
379
0
}
380
381
382
383
384
class thread_task_sao : public thread_task
385
{
386
public:
387
  int  ctb_y;
388
  de265_image* img; /* this is where we get the SPS from
389
                       (either inputImg or outputImg can be a dummy image)
390
                    */
391
392
  de265_image* inputImg;
393
  de265_image* outputImg;
394
  int inputProgress;
395
396
  virtual void work();
397
0
  virtual std::string name() const {
398
0
    char buf[100];
399
0
    sprintf(buf,"sao-%d",ctb_y);
400
0
    return buf;
401
0
  }
402
};
403
404
405
void thread_task_sao::work()
406
56.0k
{
407
56.0k
  state = Running;
408
56.0k
  img->thread_run(this);
409
410
56.0k
  const seq_parameter_set& sps = img->get_sps();
411
412
56.0k
  const int rightCtb = sps.PicWidthInCtbsY-1;
413
56.0k
  const int ctbSize  = (1<<sps.Log2CtbSizeY);
414
415
416
  // wait until also the CTB-rows below and above are ready
417
418
56.0k
  img->wait_for_progress(this, rightCtb,ctb_y,  inputProgress);
419
420
56.0k
  if (ctb_y>0) {
421
52.5k
    img->wait_for_progress(this, rightCtb,ctb_y-1, inputProgress);
422
52.5k
  }
423
424
56.0k
  if (ctb_y+1<sps.PicHeightInCtbsY) {
425
52.5k
    img->wait_for_progress(this, rightCtb,ctb_y+1, inputProgress);
426
52.5k
  }
427
428
429
  // copy input image to output for this CTB-row
430
431
56.0k
  outputImg->copy_lines_from(inputImg, ctb_y * ctbSize, (ctb_y+1) * ctbSize);
432
433
434
  // process SAO in the CTB-row
435
436
1.10M
  for (int xCtb=0; xCtb<sps.PicWidthInCtbsY; xCtb++)
437
1.04M
    {
438
1.04M
      const slice_segment_header* shdr = img->get_SliceHeaderCtb(xCtb,ctb_y);
439
1.04M
      if (shdr==NULL) {
440
0
        break;
441
0
      }
442
443
1.04M
      if (shdr->slice_sao_luma_flag) {
444
1.04M
        apply_sao(img, xCtb,ctb_y, shdr, 0, ctbSize, ctbSize,
445
1.04M
                  inputImg ->get_image_plane(0), inputImg ->get_image_stride(0),
446
1.04M
                  outputImg->get_image_plane(0), outputImg->get_image_stride(0));
447
1.04M
      }
448
449
1.04M
      if (shdr->slice_sao_chroma_flag) {
450
1.03M
        int nSW = ctbSize / sps.SubWidthC;
451
1.03M
        int nSH = ctbSize / sps.SubHeightC;
452
453
1.03M
        apply_sao(img, xCtb,ctb_y, shdr, 1, nSW,nSH,
454
1.03M
                  inputImg ->get_image_plane(1), inputImg ->get_image_stride(1),
455
1.03M
                  outputImg->get_image_plane(1), outputImg->get_image_stride(1));
456
457
1.03M
        apply_sao(img, xCtb,ctb_y, shdr, 2, nSW,nSH,
458
1.03M
                  inputImg ->get_image_plane(2), inputImg ->get_image_stride(2),
459
1.03M
                  outputImg->get_image_plane(2), outputImg->get_image_stride(2));
460
1.03M
      }
461
1.04M
    }
462
463
464
  // mark SAO progress
465
466
1.10M
  for (int x=0;x<=rightCtb;x++) {
467
1.04M
    const int CtbWidth = sps.PicWidthInCtbsY;
468
1.04M
    img->ctb_progress[x+ctb_y*CtbWidth].set_progress(CTB_PROGRESS_SAO);
469
1.04M
  }
470
471
472
56.0k
  state = Finished;
473
56.0k
  img->thread_finishes(this);
474
56.0k
}
475
476
477
/* Queue one thread_task_sao per CTB row of imgunit->img, wait until all of
   them have completed and then exchange the pixel data of the image with
   imgunit->sao_output.

   saoInputProgress is handed to every task as its 'inputProgress'.

   Returns false if SAO is disabled in the SPS or if the output image could
   not be allocated (a DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY warning is
   added in the latter case); returns true otherwise. */
bool add_sao_tasks(image_unit* imgunit, int saoInputProgress)
{
  de265_image* const img = imgunit->img;
  const seq_parameter_set& sps = img->get_sps();

  if (!sps.sample_adaptive_offset_enabled_flag) {
    return false;
  }


  decoder_context* const ctx = img->decctx;

  const de265_error allocResult =
    imgunit->sao_output.alloc_image(img->get_width(), img->get_height(),
                                    img->get_chroma_format(),
                                    img->get_shared_sps(),
                                    false,
                                    ctx,
                                    img->pts, img->user_data, true);
  if (allocResult != DE265_OK) {
    ctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false);
    return false;
  }

  const int rowCount = sps.PicHeightInCtbsY;

  img->thread_start(rowCount);

  for (int row=0; row<rowCount; row++)
    {
      thread_task_sao* saoTask = new thread_task_sao;

      saoTask->inputImg  = img;
      saoTask->outputImg = &imgunit->sao_output;
      saoTask->img = img;
      saoTask->ctb_y = row;
      saoTask->inputProgress = saoInputProgress;

      imgunit->tasks.push_back(saoTask);
      add_task(&ctx->thread_pool_, saoTask);
    }

  /* A barrier is currently needed here: once all tasks have finished, the
     pixel data has to be swapped back into the main image. */
  img->wait_for_completion();

  img->exchange_pixel_data_with(imgunit->sao_output);

  return true;
}