/src/libde265/libde265/sao.cc
Line | Count | Source |
1 | | /* |
2 | | * H.265 video codec. |
3 | | * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de> |
4 | | * |
5 | | * This file is part of libde265. |
6 | | * |
7 | | * libde265 is free software: you can redistribute it and/or modify |
8 | | * it under the terms of the GNU Lesser General Public License as |
9 | | * published by the Free Software Foundation, either version 3 of |
10 | | * the License, or (at your option) any later version. |
11 | | * |
12 | | * libde265 is distributed in the hope that it will be useful, |
13 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | * GNU Lesser General Public License for more details. |
16 | | * |
17 | | * You should have received a copy of the GNU Lesser General Public License |
18 | | * along with libde265. If not, see <http://www.gnu.org/licenses/>. |
19 | | */ |
20 | | |
21 | | #include "sao.h" |
22 | | #include "util.h" |
23 | | |
24 | | #include <stdlib.h> |
25 | | #include <string.h> |
26 | | |
27 | | |
28 | | template <class pixel_t> |
29 | | void apply_sao_internal(de265_image* img, int xCtb,int yCtb, |
30 | | const slice_segment_header* shdr, int cIdx, int nSW,int nSH, |
31 | | const pixel_t* in_img, int in_stride, |
32 | | /* */ pixel_t* out_img, int out_stride) |
33 | 1.64M | { |
34 | 1.64M | const sao_info* saoinfo = img->get_sao_info(xCtb,yCtb); |
35 | | |
36 | 1.64M | int SaoTypeIdx = (saoinfo->SaoTypeIdx >> (2*cIdx)) & 0x3; |
37 | | |
38 | 1.64M | logtrace(LogSAO,"apply_sao CTB %d;%d cIdx:%d type=%d (%dx%d)\n",xCtb,yCtb,cIdx, SaoTypeIdx, nSW,nSH); |
39 | | |
40 | 1.64M | if (SaoTypeIdx==0) { |
41 | 970k | return; |
42 | 970k | } |
43 | | |
44 | 670k | const seq_parameter_set* sps = &img->get_sps(); |
45 | 670k | const pic_parameter_set* pps = &img->get_pps(); |
46 | 670k | const int bitDepth = (cIdx==0 ? sps->BitDepth_Y : sps->BitDepth_C); |
47 | 670k | const int maxPixelValue = (1<<bitDepth)-1; |
48 | | |
49 | | // top left position of CTB in pixels |
50 | 670k | const int xC = xCtb*nSW; |
51 | 670k | const int yC = yCtb*nSH; |
52 | | |
53 | 670k | const int width = img->get_width(cIdx); |
54 | 670k | const int height = img->get_height(cIdx); |
55 | | |
56 | 670k | const int ctbSliceAddrRS = img->get_SliceHeader(xC,yC)->SliceAddrRS; |
57 | | |
58 | 670k | const int picWidthInCtbs = sps->PicWidthInCtbsY; |
59 | 670k | const int chromashiftW = sps->get_chroma_shift_W(cIdx); |
60 | 670k | const int chromashiftH = sps->get_chroma_shift_H(cIdx); |
61 | 670k | const int ctbshiftW = sps->Log2CtbSizeY - chromashiftW; |
62 | 670k | const int ctbshiftH = sps->Log2CtbSizeY - chromashiftH; |
63 | | |
64 | | |
65 | 4.02M | for (int i=0;i<5;i++) |
66 | 3.35M | { |
67 | 3.35M | logtrace(LogSAO,"offset[%d] = %d\n", i, i==0 ? 0 : saoinfo->saoOffsetVal[cIdx][i-1]); |
68 | 3.35M | } |
69 | | |
70 | | |
71 | | // actual size of CTB to be processed (can be smaller when partially outside of image) |
72 | 670k | const int ctbW = (xC+nSW>width) ? width -xC : nSW; |
73 | 670k | const int ctbH = (yC+nSH>height) ? height-yC : nSH; |
74 | | |
75 | | |
76 | 670k | const bool extendedTests = img->get_CTB_has_pcm_or_cu_transquant_bypass(xCtb,yCtb); |
77 | | |
78 | 670k | if (SaoTypeIdx==2) { |
79 | 289k | int hPos[2], vPos[2]; |
80 | 289k | int vPosStride[2]; // vPos[] multiplied by image stride |
81 | 289k | int SaoEoClass = (saoinfo->SaoEoClass >> (2*cIdx)) & 0x3; |
82 | | |
83 | 289k | switch (SaoEoClass) { |
84 | 67.4k | case 0: hPos[0]=-1; hPos[1]= 1; vPos[0]= 0; vPos[1]=0; break; |
85 | 82.4k | case 1: hPos[0]= 0; hPos[1]= 0; vPos[0]=-1; vPos[1]=1; break; |
86 | 66.7k | case 2: hPos[0]=-1; hPos[1]= 1; vPos[0]=-1; vPos[1]=1; break; |
87 | 73.2k | case 3: hPos[0]= 1; hPos[1]=-1; vPos[0]=-1; vPos[1]=1; break; |
88 | 289k | } |
89 | | |
90 | 289k | vPosStride[0] = vPos[0] * in_stride; |
91 | 289k | vPosStride[1] = vPos[1] * in_stride; |
92 | | |
93 | | /* Reorder sao_info.saoOffsetVal[] array, so that we can index it |
94 | | directly with the sum of the two pixel-difference signs. */ |
95 | 289k | int8_t saoOffsetVal[5]; // [2] unused |
96 | 289k | saoOffsetVal[0] = saoinfo->saoOffsetVal[cIdx][1-1]; |
97 | 289k | saoOffsetVal[1] = saoinfo->saoOffsetVal[cIdx][2-1]; |
98 | 289k | saoOffsetVal[2] = 0; |
99 | 289k | saoOffsetVal[3] = saoinfo->saoOffsetVal[cIdx][3-1]; |
100 | 289k | saoOffsetVal[4] = saoinfo->saoOffsetVal[cIdx][4-1]; |
101 | | |
102 | | |
103 | 12.7M | for (int j=0;j<ctbH;j++) { |
104 | 12.4M | const pixel_t* in_ptr = &in_img [xC+(yC+j)*in_stride]; |
105 | 12.4M | /* */ pixel_t* out_ptr = &out_img[xC+(yC+j)*out_stride]; |
106 | | |
107 | 633M | for (int i=0;i<ctbW;i++) { |
108 | 620M | int edgeIdx = -1; |
109 | | |
110 | 620M | logtrace(LogSAO, "pos %d,%d\n",xC+i,yC+j); |
111 | | |
112 | 620M | if ((extendedTests && |
113 | 37.3M | (sps->pcm_loop_filter_disable_flag && |
114 | 193k | img->get_pcm_flag((xC+i)<<chromashiftW,(yC+j)<<chromashiftH))) || |
115 | 621M | img->get_cu_transquant_bypass((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) { |
116 | 20.7M | continue; |
117 | 20.7M | } |
118 | | |
119 | | // do the expensive test for boundaries only at the boundaries |
120 | 600M | bool testBoundary = (i==0 || j==0 || i==ctbW-1 || j==ctbH-1); |
121 | | |
122 | 600M | if (testBoundary) |
123 | 138M | for (int k=0;k<2;k++) { |
124 | 93.5M | int xS = xC+i+hPos[k]; |
125 | 93.5M | int yS = yC+j+vPos[k]; |
126 | | |
127 | 93.5M | if (xS<0 || yS<0 || xS>=width || yS>=height) { |
128 | 3.29M | edgeIdx=0; |
129 | 3.29M | break; |
130 | 3.29M | } |
131 | | |
132 | | |
133 | | // This part seems inefficient with all the get_SliceHeaderIndex() calls, |
134 | | // but removing this part (because the input was known to have only a single |
135 | | // slice anyway) reduced computation time only by 1.3%. |
136 | | // TODO: however, this may still be a big part of SAO itself. |
137 | | |
138 | 90.2M | slice_segment_header* sliceHeader = img->get_SliceHeader(xS<<chromashiftW, |
139 | 90.2M | yS<<chromashiftH); |
140 | 90.2M | if (sliceHeader==nullptr) { return; } |
141 | | |
142 | 90.2M | int sliceAddrRS = sliceHeader->SliceAddrRS; |
143 | 90.2M | if (sliceAddrRS < ctbSliceAddrRS && |
144 | 10.6k | img->get_SliceHeader((xC+i)<<chromashiftW, |
145 | 10.6k | (yC+j)<<chromashiftH)->slice_loop_filter_across_slices_enabled_flag==0) { |
146 | 7.70k | edgeIdx=0; |
147 | 7.70k | break; |
148 | 7.70k | } |
149 | | |
150 | 90.2M | if (sliceAddrRS > ctbSliceAddrRS && |
151 | 5.82k | img->get_SliceHeader(xS<<chromashiftW, |
152 | 5.82k | yS<<chromashiftH)->slice_loop_filter_across_slices_enabled_flag==0) { |
153 | 5.23k | edgeIdx=0; |
154 | 5.23k | break; |
155 | 5.23k | } |
156 | | |
157 | | |
158 | 90.2M | if (pps->loop_filter_across_tiles_enabled_flag==0 && |
159 | 90.2M | pps->TileIdRS[(xS>>ctbshiftW) + (yS>>ctbshiftH)*picWidthInCtbs] != |
160 | 90.2M | pps->TileIdRS[(xC>>ctbshiftW) + (yC>>ctbshiftH)*picWidthInCtbs]) { |
161 | 4.92k | edgeIdx=0; |
162 | 4.92k | break; |
163 | 4.92k | } |
164 | 90.2M | } |
165 | | |
166 | 600M | if (edgeIdx != 0) { |
167 | | |
168 | 597M | edgeIdx = ( Sign(in_ptr[i] - in_ptr[i+hPos[0]+vPosStride[0]]) + |
169 | 597M | Sign(in_ptr[i] - in_ptr[i+hPos[1]+vPosStride[1]]) ); |
170 | | |
171 | 597M | if (1) { // edgeIdx != 0) { // seems to be faster without this check (zero in offset table) |
172 | 597M | int offset = saoOffsetVal[edgeIdx+2]; |
173 | | |
174 | 597M | out_ptr[i] = Clip3(0,maxPixelValue, |
175 | 597M | in_ptr[i] + offset); |
176 | 597M | } |
177 | 597M | } |
178 | 600M | } |
179 | 12.4M | } |
180 | 289k | } |
181 | 380k | else { |
182 | 380k | int bandShift = bitDepth-5; |
183 | 380k | int saoLeftClass = saoinfo->sao_band_position[cIdx]; |
184 | 380k | logtrace(LogSAO,"saoLeftClass: %d\n",saoLeftClass); |
185 | | |
186 | 380k | int bandTable[32]; |
187 | 380k | memset(bandTable, 0, sizeof(int)*32); |
188 | | |
189 | 1.90M | for (int k=0;k<4;k++) { |
190 | 1.52M | bandTable[ (k+saoLeftClass)&31 ] = k+1; |
191 | 1.52M | } |
192 | | |
193 | | |
194 | | /* If PCM or transquant_bypass is used in this CTB, we have to |
195 | | run all checks (A). |
196 | | Otherwise, we run a simplified version of the code (B). |
197 | | |
198 | | NOTE: this whole part of SAO does not seem to be a significant part of the time spent |
199 | | */ |
200 | | |
201 | 380k | if (extendedTests) { |
202 | | |
203 | | // (A) full version with all checks |
204 | | |
205 | 1.13M | for (int j=0;j<ctbH;j++) |
206 | 54.4M | for (int i=0;i<ctbW;i++) { |
207 | | |
208 | 53.2M | if ((sps->pcm_loop_filter_disable_flag && |
209 | 229k | img->get_pcm_flag((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) || |
210 | 53.2M | img->get_cu_transquant_bypass((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) { |
211 | 39.7M | continue; |
212 | 39.7M | } |
213 | | |
214 | | // Shifts are a strange thing. On x86, >>x actually computes >>(x%64). |
215 | | // But this should never happen, because the maximum bit-depth is 16. |
216 | 13.5M | int pixel = in_img[xC + i + (yC + j) * in_stride]; |
217 | | |
218 | | // Note: the input pixel value should never exceed the valid range, but it seems that it still does, |
219 | | // maybe when there was a decoding error and the pixels have not been filled in correctly. |
220 | | // Thus, we have to limit the pixel range to ensure that we have no illegal table access. |
221 | 13.5M | pixel = Clip3(0, maxPixelValue, pixel); |
222 | | |
223 | 13.5M | int bandIdx = bandTable[pixel >> bandShift]; |
224 | | |
225 | 13.5M | if (bandIdx>0) { |
226 | 1.56M | int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1]; |
227 | | |
228 | 1.56M | logtrace(LogSAO,"%d %d (%d) offset %d %x -> %x\n",xC+i,yC+j,bandIdx, |
229 | 1.56M | offset, |
230 | 1.56M | in_img[xC+i+(yC+j)*in_stride], |
231 | 1.56M | in_img[xC+i+(yC+j)*in_stride]+offset); |
232 | | |
233 | 1.56M | out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue, |
234 | 1.56M | in_img[xC+i+(yC+j)*in_stride] + offset); |
235 | 1.56M | } |
236 | 13.5M | } |
237 | 35.6k | } |
238 | 344k | else |
239 | 344k | { |
240 | | // (B) simplified version (only works if no PCM and transquant_bypass is active) |
241 | | |
242 | 13.6M | for (int j=0;j<ctbH;j++) |
243 | 615M | for (int i=0;i<ctbW;i++) { |
244 | | |
245 | 602M | int pixel = in_img[xC + i + (yC + j) * in_stride]; |
246 | | |
247 | | // Note: the input pixel value should never exceed the valid range, but it seems that it still does, |
248 | | // maybe when there was a decoding error and the pixels have not been filled in correctly. |
249 | | // Thus, we have to limit the pixel range to ensure that we have no illegal table access. |
250 | 602M | pixel = Clip3(0, maxPixelValue, pixel); |
251 | | |
252 | 602M | int bandIdx = bandTable[pixel >> bandShift]; |
253 | | |
254 | 602M | if (bandIdx>0) { |
255 | 77.5M | int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1]; |
256 | | |
257 | 77.5M | out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue, |
258 | 77.5M | in_img[xC+i+(yC+j)*in_stride] + offset); |
259 | 77.5M | } |
260 | 602M | } |
261 | 344k | } |
262 | 380k | } |
263 | 670k | } void apply_sao_internal<unsigned short>(de265_image*, int, int, slice_segment_header const*, int, int, int, unsigned short const*, int, unsigned short*, int) Line | Count | Source | 33 | 259k | { | 34 | 259k | const sao_info* saoinfo = img->get_sao_info(xCtb,yCtb); | 35 | | | 36 | 259k | int SaoTypeIdx = (saoinfo->SaoTypeIdx >> (2*cIdx)) & 0x3; | 37 | | | 38 | 259k | logtrace(LogSAO,"apply_sao CTB %d;%d cIdx:%d type=%d (%dx%d)\n",xCtb,yCtb,cIdx, SaoTypeIdx, nSW,nSH); | 39 | | | 40 | 259k | if (SaoTypeIdx==0) { | 41 | 159k | return; | 42 | 159k | } | 43 | | | 44 | 99.9k | const seq_parameter_set* sps = &img->get_sps(); | 45 | 99.9k | const pic_parameter_set* pps = &img->get_pps(); | 46 | 99.9k | const int bitDepth = (cIdx==0 ? sps->BitDepth_Y : sps->BitDepth_C); | 47 | 99.9k | const int maxPixelValue = (1<<bitDepth)-1; | 48 | | | 49 | | // top left position of CTB in pixels | 50 | 99.9k | const int xC = xCtb*nSW; | 51 | 99.9k | const int yC = yCtb*nSH; | 52 | | | 53 | 99.9k | const int width = img->get_width(cIdx); | 54 | 99.9k | const int height = img->get_height(cIdx); | 55 | | | 56 | 99.9k | const int ctbSliceAddrRS = img->get_SliceHeader(xC,yC)->SliceAddrRS; | 57 | | | 58 | 99.9k | const int picWidthInCtbs = sps->PicWidthInCtbsY; | 59 | 99.9k | const int chromashiftW = sps->get_chroma_shift_W(cIdx); | 60 | 99.9k | const int chromashiftH = sps->get_chroma_shift_H(cIdx); | 61 | 99.9k | const int ctbshiftW = sps->Log2CtbSizeY - chromashiftW; | 62 | 99.9k | const int ctbshiftH = sps->Log2CtbSizeY - chromashiftH; | 63 | | | 64 | | | 65 | 599k | for (int i=0;i<5;i++) | 66 | 499k | { | 67 | 499k | logtrace(LogSAO,"offset[%d] = %d\n", i, i==0 ? 0 : saoinfo->saoOffsetVal[cIdx][i-1]); | 68 | 499k | } | 69 | | | 70 | | | 71 | | // actual size of CTB to be processed (can be smaller when partially outside of image) | 72 | 99.9k | const int ctbW = (xC+nSW>width) ? width -xC : nSW; | 73 | 99.9k | const int ctbH = (yC+nSH>height) ? height-yC : nSH; | 74 | | | 75 | | | 76 | 99.9k | const bool extendedTests = img->get_CTB_has_pcm_or_cu_transquant_bypass(xCtb,yCtb); | 77 | | | 78 | 99.9k | if (SaoTypeIdx==2) { | 79 | 14.7k | int hPos[2], vPos[2]; | 80 | 14.7k | int vPosStride[2]; // vPos[] multiplied by image stride | 81 | 14.7k | int SaoEoClass = (saoinfo->SaoEoClass >> (2*cIdx)) & 0x3; | 82 | | | 83 | 14.7k | switch (SaoEoClass) { | 84 | 3.11k | case 0: hPos[0]=-1; hPos[1]= 1; vPos[0]= 0; vPos[1]=0; break; | 85 | 2.29k | case 1: hPos[0]= 0; hPos[1]= 0; vPos[0]=-1; vPos[1]=1; break; | 86 | 4.99k | case 2: hPos[0]=-1; hPos[1]= 1; vPos[0]=-1; vPos[1]=1; break; | 87 | 4.34k | case 3: hPos[0]= 1; hPos[1]=-1; vPos[0]=-1; vPos[1]=1; break; | 88 | 14.7k | } | 89 | | | 90 | 14.7k | vPosStride[0] = vPos[0] * in_stride; | 91 | 14.7k | vPosStride[1] = vPos[1] * in_stride; | 92 | | | 93 | | /* Reorder sao_info.saoOffsetVal[] array, so that we can index it | 94 | | directly with the sum of the two pixel-difference signs. */ | 95 | 14.7k | int8_t saoOffsetVal[5]; // [2] unused | 96 | 14.7k | saoOffsetVal[0] = saoinfo->saoOffsetVal[cIdx][1-1]; | 97 | 14.7k | saoOffsetVal[1] = saoinfo->saoOffsetVal[cIdx][2-1]; | 98 | 14.7k | saoOffsetVal[2] = 0; | 99 | 14.7k | saoOffsetVal[3] = saoinfo->saoOffsetVal[cIdx][3-1]; | 100 | 14.7k | saoOffsetVal[4] = saoinfo->saoOffsetVal[cIdx][4-1]; | 101 | | | 102 | | | 103 | 410k | for (int j=0;j<ctbH;j++) { | 104 | 395k | const pixel_t* in_ptr = &in_img [xC+(yC+j)*in_stride]; | 105 | 395k | /* */ pixel_t* out_ptr = &out_img[xC+(yC+j)*out_stride]; | 106 | | | 107 | 18.3M | for (int i=0;i<ctbW;i++) { | 108 | 17.9M | int edgeIdx = -1; | 109 | | | 110 | 17.9M | logtrace(LogSAO, "pos %d,%d\n",xC+i,yC+j); | 111 | | | 112 | 17.9M | if ((extendedTests && | 113 | 10.7M | (sps->pcm_loop_filter_disable_flag && | 114 | 151k | img->get_pcm_flag((xC+i)<<chromashiftW,(yC+j)<<chromashiftH))) || | 115 | 17.9M | img->get_cu_transquant_bypass((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) { | 116 | 5.92M | continue; | 117 | 5.92M | } | 118 | | | 119 | | // do the expensive test for boundaries only at the boundaries | 120 | 12.0M | bool testBoundary = (i==0 || j==0 || i==ctbW-1 || j==ctbH-1); | 121 | | | 122 | 12.0M | if (testBoundary) | 123 | 2.60M | for (int k=0;k<2;k++) { | 124 | 1.89M | int xS = xC+i+hPos[k]; | 125 | 1.89M | int yS = yC+j+vPos[k]; | 126 | | | 127 | 1.89M | if (xS<0 || yS<0 || xS>=width || yS>=height) { | 128 | 318k | edgeIdx=0; | 129 | 318k | break; | 130 | 318k | } | 131 | | | 132 | | | 133 | | // This part seems inefficient with all the get_SliceHeaderIndex() calls, | 134 | | // but removing this part (because the input was known to have only a single | 135 | | // slice anyway) reduced computation time only by 1.3%. | 136 | | // TODO: however, this may still be a big part of SAO itself. | 137 | | | 138 | 1.57M | slice_segment_header* sliceHeader = img->get_SliceHeader(xS<<chromashiftW, | 139 | 1.57M | yS<<chromashiftH); | 140 | 1.57M | if (sliceHeader==nullptr) { return; } | 141 | | | 142 | 1.57M | int sliceAddrRS = sliceHeader->SliceAddrRS; | 143 | 1.57M | if (sliceAddrRS < ctbSliceAddrRS && | 144 | 3.01k | img->get_SliceHeader((xC+i)<<chromashiftW, | 145 | 3.01k | (yC+j)<<chromashiftH)->slice_loop_filter_across_slices_enabled_flag==0) { | 146 | 2.23k | edgeIdx=0; | 147 | 2.23k | break; | 148 | 2.23k | } | 149 | | | 150 | 1.57M | if (sliceAddrRS > ctbSliceAddrRS && | 151 | 1.55k | img->get_SliceHeader(xS<<chromashiftW, | 152 | 1.55k | yS<<chromashiftH)->slice_loop_filter_across_slices_enabled_flag==0) { | 153 | 1.44k | edgeIdx=0; | 154 | 1.44k | break; | 155 | 1.44k | } | 156 | | | 157 | | | 158 | 1.57M | if (pps->loop_filter_across_tiles_enabled_flag==0 && | 159 | 1.56M | pps->TileIdRS[(xS>>ctbshiftW) + (yS>>ctbshiftH)*picWidthInCtbs] != | 160 | 1.56M | pps->TileIdRS[(xC>>ctbshiftW) + (yC>>ctbshiftH)*picWidthInCtbs]) { | 161 | 1.03k | edgeIdx=0; | 162 | 1.03k | break; | 163 | 1.03k | } | 164 | 1.57M | } | 165 | | | 166 | 12.0M | if (edgeIdx != 0) { | 167 | | | 168 | 11.7M | edgeIdx = ( Sign(in_ptr[i] - in_ptr[i+hPos[0]+vPosStride[0]]) + | 169 | 11.7M | Sign(in_ptr[i] - in_ptr[i+hPos[1]+vPosStride[1]]) ); | 170 | | | 171 | 11.7M | if (1) { // edgeIdx != 0) { // seems to be faster without this check (zero in offset table) | 172 | 11.7M | int offset = saoOffsetVal[edgeIdx+2]; | 173 | | | 174 | 11.7M | out_ptr[i] = Clip3(0,maxPixelValue, | 175 | 11.7M | in_ptr[i] + offset); | 176 | 11.7M | } | 177 | 11.7M | } | 178 | 12.0M | } | 179 | 395k | } | 180 | 14.7k | } | 181 | 85.1k | else { | 182 | 85.1k | int bandShift = bitDepth-5; | 183 | 85.1k | int saoLeftClass = saoinfo->sao_band_position[cIdx]; | 184 | 85.1k | logtrace(LogSAO,"saoLeftClass: %d\n",saoLeftClass); | 185 | | | 186 | 85.1k | int bandTable[32]; | 187 | 85.1k | memset(bandTable, 0, sizeof(int)*32); | 188 | | | 189 | 425k | for (int k=0;k<4;k++) { | 190 | 340k | bandTable[ (k+saoLeftClass)&31 ] = k+1; | 191 | 340k | } | 192 | | | 193 | | | 194 | | /* If PCM or transquant_bypass is used in this CTB, we have to | 195 | | run all checks (A). | 196 | | Otherwise, we run a simplified version of the code (B). | 197 | | | 198 | | NOTE: this whole part of SAO does not seem to be a significant part of the time spent | 199 | | */ | 200 | | | 201 | 85.1k | if (extendedTests) { | 202 | | | 203 | | // (A) full version with all checks | 204 | | | 205 | 468k | for (int j=0;j<ctbH;j++) | 206 | 21.9M | for (int i=0;i<ctbW;i++) { | 207 | | | 208 | 21.4M | if ((sps->pcm_loop_filter_disable_flag && | 209 | 184k | img->get_pcm_flag((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) || | 210 | 21.4M | img->get_cu_transquant_bypass((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) { | 211 | 16.4M | continue; | 212 | 16.4M | } | 213 | | | 214 | | // Shifts are a strange thing. On x86, >>x actually computes >>(x%64). | 215 | | // But this should never happen, because the maximum bit-depth is 16. | 216 | 5.07M | int pixel = in_img[xC + i + (yC + j) * in_stride]; | 217 | | | 218 | | // Note: the input pixel value should never exceed the valid range, but it seems that it still does, | 219 | | // maybe when there was a decoding error and the pixels have not been filled in correctly. | 220 | | // Thus, we have to limit the pixel range to ensure that we have no illegal table access. | 221 | 5.07M | pixel = Clip3(0, maxPixelValue, pixel); | 222 | | | 223 | 5.07M | int bandIdx = bandTable[pixel >> bandShift]; | 224 | | | 225 | 5.07M | if (bandIdx>0) { | 226 | 539k | int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1]; | 227 | | | 228 | 539k | logtrace(LogSAO,"%d %d (%d) offset %d %x -> %x\n",xC+i,yC+j,bandIdx, | 229 | 539k | offset, | 230 | 539k | in_img[xC+i+(yC+j)*in_stride], | 231 | 539k | in_img[xC+i+(yC+j)*in_stride]+offset); | 232 | | | 233 | 539k | out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue, | 234 | 539k | in_img[xC+i+(yC+j)*in_stride] + offset); | 235 | 539k | } | 236 | 5.07M | } | 237 | 15.3k | } | 238 | 69.7k | else | 239 | 69.7k | { | 240 | | // (B) simplified version (only works if no PCM and transquant_bypass is active) | 241 | | | 242 | 2.39M | for (int j=0;j<ctbH;j++) | 243 | 92.6M | for (int i=0;i<ctbW;i++) { | 244 | | | 245 | 90.3M | int pixel = in_img[xC + i + (yC + j) * in_stride]; | 246 | | | 247 | | // Note: the input pixel value should never exceed the valid range, but it seems that it still does, | 248 | | // maybe when there was a decoding error and the pixels have not been filled in correctly. | 249 | | // Thus, we have to limit the pixel range to ensure that we have no illegal table access. | 250 | 90.3M | pixel = Clip3(0, maxPixelValue, pixel); | 251 | | | 252 | 90.3M | int bandIdx = bandTable[pixel >> bandShift]; | 253 | | | 254 | 90.3M | if (bandIdx>0) { | 255 | 15.2M | int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1]; | 256 | | | 257 | 15.2M | out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue, | 258 | 15.2M | in_img[xC+i+(yC+j)*in_stride] + offset); | 259 | 15.2M | } | 260 | 90.3M | } | 261 | 69.7k | } | 262 | 85.1k | } | 263 | 99.9k | } |
void apply_sao_internal<unsigned char>(de265_image*, int, int, slice_segment_header const*, int, int, int, unsigned char const*, int, unsigned char*, int) Line | Count | Source | 33 | 1.38M | { | 34 | 1.38M | const sao_info* saoinfo = img->get_sao_info(xCtb,yCtb); | 35 | | | 36 | 1.38M | int SaoTypeIdx = (saoinfo->SaoTypeIdx >> (2*cIdx)) & 0x3; | 37 | | | 38 | 1.38M | logtrace(LogSAO,"apply_sao CTB %d;%d cIdx:%d type=%d (%dx%d)\n",xCtb,yCtb,cIdx, SaoTypeIdx, nSW,nSH); | 39 | | | 40 | 1.38M | if (SaoTypeIdx==0) { | 41 | 810k | return; | 42 | 810k | } | 43 | | | 44 | 570k | const seq_parameter_set* sps = &img->get_sps(); | 45 | 570k | const pic_parameter_set* pps = &img->get_pps(); | 46 | 570k | const int bitDepth = (cIdx==0 ? sps->BitDepth_Y : sps->BitDepth_C); | 47 | 570k | const int maxPixelValue = (1<<bitDepth)-1; | 48 | | | 49 | | // top left position of CTB in pixels | 50 | 570k | const int xC = xCtb*nSW; | 51 | 570k | const int yC = yCtb*nSH; | 52 | | | 53 | 570k | const int width = img->get_width(cIdx); | 54 | 570k | const int height = img->get_height(cIdx); | 55 | | | 56 | 570k | const int ctbSliceAddrRS = img->get_SliceHeader(xC,yC)->SliceAddrRS; | 57 | | | 58 | 570k | const int picWidthInCtbs = sps->PicWidthInCtbsY; | 59 | 570k | const int chromashiftW = sps->get_chroma_shift_W(cIdx); | 60 | 570k | const int chromashiftH = sps->get_chroma_shift_H(cIdx); | 61 | 570k | const int ctbshiftW = sps->Log2CtbSizeY - chromashiftW; | 62 | 570k | const int ctbshiftH = sps->Log2CtbSizeY - chromashiftH; | 63 | | | 64 | | | 65 | 3.42M | for (int i=0;i<5;i++) | 66 | 2.85M | { | 67 | 2.85M | logtrace(LogSAO,"offset[%d] = %d\n", i, i==0 ? 0 : saoinfo->saoOffsetVal[cIdx][i-1]); | 68 | 2.85M | } | 69 | | | 70 | | | 71 | | // actual size of CTB to be processed (can be smaller when partially outside of image) | 72 | 570k | const int ctbW = (xC+nSW>width) ? width -xC : nSW; | 73 | 570k | const int ctbH = (yC+nSH>height) ? height-yC : nSH; | 74 | | | 75 | | | 76 | 570k | const bool extendedTests = img->get_CTB_has_pcm_or_cu_transquant_bypass(xCtb,yCtb); | 77 | | | 78 | 570k | if (SaoTypeIdx==2) { | 79 | 274k | int hPos[2], vPos[2]; | 80 | 274k | int vPosStride[2]; // vPos[] multiplied by image stride | 81 | 274k | int SaoEoClass = (saoinfo->SaoEoClass >> (2*cIdx)) & 0x3; | 82 | | | 83 | 274k | switch (SaoEoClass) { | 84 | 64.2k | case 0: hPos[0]=-1; hPos[1]= 1; vPos[0]= 0; vPos[1]=0; break; | 85 | 80.1k | case 1: hPos[0]= 0; hPos[1]= 0; vPos[0]=-1; vPos[1]=1; break; | 86 | 61.7k | case 2: hPos[0]=-1; hPos[1]= 1; vPos[0]=-1; vPos[1]=1; break; | 87 | 68.8k | case 3: hPos[0]= 1; hPos[1]=-1; vPos[0]=-1; vPos[1]=1; break; | 88 | 274k | } | 89 | | | 90 | 274k | vPosStride[0] = vPos[0] * in_stride; | 91 | 274k | vPosStride[1] = vPos[1] * in_stride; | 92 | | | 93 | | /* Reorder sao_info.saoOffsetVal[] array, so that we can index it | 94 | | directly with the sum of the two pixel-difference signs. */ | 95 | 274k | int8_t saoOffsetVal[5]; // [2] unused | 96 | 274k | saoOffsetVal[0] = saoinfo->saoOffsetVal[cIdx][1-1]; | 97 | 274k | saoOffsetVal[1] = saoinfo->saoOffsetVal[cIdx][2-1]; | 98 | 274k | saoOffsetVal[2] = 0; | 99 | 274k | saoOffsetVal[3] = saoinfo->saoOffsetVal[cIdx][3-1]; | 100 | 274k | saoOffsetVal[4] = saoinfo->saoOffsetVal[cIdx][4-1]; | 101 | | | 102 | | | 103 | 12.3M | for (int j=0;j<ctbH;j++) { | 104 | 12.0M | const pixel_t* in_ptr = &in_img [xC+(yC+j)*in_stride]; | 105 | 12.0M | /* */ pixel_t* out_ptr = &out_img[xC+(yC+j)*out_stride]; | 106 | | | 107 | 615M | for (int i=0;i<ctbW;i++) { | 108 | 602M | int edgeIdx = -1; | 109 | | | 110 | 602M | logtrace(LogSAO, "pos %d,%d\n",xC+i,yC+j); | 111 | | | 112 | 602M | if ((extendedTests && | 113 | 26.6M | (sps->pcm_loop_filter_disable_flag && | 114 | 41.9k | img->get_pcm_flag((xC+i)<<chromashiftW,(yC+j)<<chromashiftH))) || | 115 | 603M | img->get_cu_transquant_bypass((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) { | 116 | 14.7M | continue; | 117 | 14.7M | } | 118 | | | 119 | | // do the expensive test for boundaries only at the boundaries | 120 | 588M | bool testBoundary = (i==0 || j==0 || i==ctbW-1 || j==ctbH-1); | 121 | | | 122 | 588M | if (testBoundary) | 123 | 135M | for (int k=0;k<2;k++) { | 124 | 91.6M | int xS = xC+i+hPos[k]; | 125 | 91.6M | int yS = yC+j+vPos[k]; | 126 | | | 127 | 91.6M | if (xS<0 || yS<0 || xS>=width || yS>=height) { | 128 | 2.98M | edgeIdx=0; | 129 | 2.98M | break; | 130 | 2.98M | } | 131 | | | 132 | | | 133 | | // This part seems inefficient with all the get_SliceHeaderIndex() calls, | 134 | | // but removing this part (because the input was known to have only a single | 135 | | // slice anyway) reduced computation time only by 1.3%. | 136 | | // TODO: however, this may still be a big part of SAO itself. | 137 | | | 138 | 88.6M | slice_segment_header* sliceHeader = img->get_SliceHeader(xS<<chromashiftW, | 139 | 88.6M | yS<<chromashiftH); | 140 | 88.6M | if (sliceHeader==nullptr) { return; } | 141 | | | 142 | 88.6M | int sliceAddrRS = sliceHeader->SliceAddrRS; | 143 | 88.6M | if (sliceAddrRS < ctbSliceAddrRS && | 144 | 7.63k | img->get_SliceHeader((xC+i)<<chromashiftW, | 145 | 7.63k | (yC+j)<<chromashiftH)->slice_loop_filter_across_slices_enabled_flag==0) { | 146 | 5.47k | edgeIdx=0; | 147 | 5.47k | break; | 148 | 5.47k | } | 149 | | | 150 | 88.6M | if (sliceAddrRS > ctbSliceAddrRS && | 151 | 4.27k | img->get_SliceHeader(xS<<chromashiftW, | 152 | 4.27k | yS<<chromashiftH)->slice_loop_filter_across_slices_enabled_flag==0) { | 153 | 3.79k | edgeIdx=0; | 154 | 3.79k | break; | 155 | 3.79k | } | 156 | | | 157 | | | 158 | 88.6M | if (pps->loop_filter_across_tiles_enabled_flag==0 && | 159 | 88.6M | pps->TileIdRS[(xS>>ctbshiftW) + (yS>>ctbshiftH)*picWidthInCtbs] != | 160 | 88.6M | pps->TileIdRS[(xC>>ctbshiftW) + (yC>>ctbshiftH)*picWidthInCtbs]) { | 161 | 3.89k | edgeIdx=0; | 162 | 3.89k | break; | 163 | 3.89k | } | 164 | 88.6M | } | 165 | | | 166 | 588M | if (edgeIdx != 0) { | 167 | | | 168 | 585M | edgeIdx = ( Sign(in_ptr[i] - in_ptr[i+hPos[0]+vPosStride[0]]) + | 169 | 585M | Sign(in_ptr[i] - in_ptr[i+hPos[1]+vPosStride[1]]) ); | 170 | | | 171 | 585M | if (1) { // edgeIdx != 0) { // seems to be faster without this check (zero in offset table) | 172 | 585M | int offset = saoOffsetVal[edgeIdx+2]; | 173 | | | 174 | 585M | out_ptr[i] = Clip3(0,maxPixelValue, | 175 | 585M | in_ptr[i] + offset); | 176 | 585M | } | 177 | 585M | } | 178 | 588M | } | 179 | 12.0M | } | 180 | 274k | } | 181 | 295k | else { | 182 | 295k | int bandShift = bitDepth-5; | 183 | 295k | int saoLeftClass = saoinfo->sao_band_position[cIdx]; | 184 | 295k | logtrace(LogSAO,"saoLeftClass: %d\n",saoLeftClass); | 185 | | | 186 | 295k | int bandTable[32]; | 187 | 295k | memset(bandTable, 0, sizeof(int)*32); | 188 | | | 189 | 1.47M | for (int k=0;k<4;k++) { | 190 | 1.18M | bandTable[ (k+saoLeftClass)&31 ] = k+1; | 191 | 1.18M | } | 192 | | | 193 | | | 194 | | /* If PCM or transquant_bypass is used in this CTB, we have to | 195 | | run all checks (A). | 196 | | Otherwise, we run a simplified version of the code (B). | 197 | | | 198 | | NOTE: this whole part of SAO does not seem to be a significant part of the time spent | 199 | | */ | 200 | | | 201 | 295k | if (extendedTests) { | 202 | | | 203 | | // (A) full version with all checks | 204 | | | 205 | 671k | for (int j=0;j<ctbH;j++) | 206 | 32.4M | for (int i=0;i<ctbW;i++) { | 207 | | | 208 | 31.8M | if ((sps->pcm_loop_filter_disable_flag && | 209 | 45.7k | img->get_pcm_flag((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) || | 210 | 31.7M | img->get_cu_transquant_bypass((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) { | 211 | 23.3M | continue; | 212 | 23.3M | } | 213 | | | 214 | | // Shifts are a strange thing. On x86, >>x actually computes >>(x%64). | 215 | | // But this should never happen, because the maximum bit-depth is 16. | 216 | 8.49M | int pixel = in_img[xC + i + (yC + j) * in_stride]; | 217 | | | 218 | | // Note: the input pixel value should never exceed the valid range, but it seems that it still does, | 219 | | // maybe when there was a decoding error and the pixels have not been filled in correctly. | 220 | | // Thus, we have to limit the pixel range to ensure that we have no illegal table access. | 221 | 8.49M | pixel = Clip3(0, maxPixelValue, pixel); | 222 | | | 223 | 8.49M | int bandIdx = bandTable[pixel >> bandShift]; | 224 | | | 225 | 8.49M | if (bandIdx>0) { | 226 | 1.02M | int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1]; | 227 | | | 228 | 1.02M | logtrace(LogSAO,"%d %d (%d) offset %d %x -> %x\n",xC+i,yC+j,bandIdx, | 229 | 1.02M | offset, | 230 | 1.02M | in_img[xC+i+(yC+j)*in_stride], | 231 | 1.02M | in_img[xC+i+(yC+j)*in_stride]+offset); | 232 | | | 233 | 1.02M | out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue, | 234 | 1.02M | in_img[xC+i+(yC+j)*in_stride] + offset); | 235 | 1.02M | } | 236 | 8.49M | } | 237 | 20.2k | } | 238 | 275k | else | 239 | 275k | { | 240 | | // (B) simplified version (only works if no PCM and transquant_bypass is active) | 241 | | | 242 | 11.2M | for (int j=0;j<ctbH;j++) | 243 | 523M | for (int i=0;i<ctbW;i++) { | 244 | | | 245 | 512M | int pixel = in_img[xC + i + (yC + j) * in_stride]; | 246 | | | 247 | | // Note: the input pixel value should never exceed the valid range, but it seems that it still does, | 248 | | // maybe when there was a decoding error and the pixels have not been filled in correctly. | 249 | | // Thus, we have to limit the pixel range to ensure that we have no illegal table access. | 250 | 512M | pixel = Clip3(0, maxPixelValue, pixel); | 251 | | | 252 | 512M | int bandIdx = bandTable[pixel >> bandShift]; | 253 | | | 254 | 512M | if (bandIdx>0) { | 255 | 62.2M | int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1]; | 256 | | | 257 | 62.2M | out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue, | 258 | 62.2M | in_img[xC+i+(yC+j)*in_stride] + offset); | 259 | 62.2M | } | 260 | 512M | } | 261 | 275k | } | 262 | 295k | } | 263 | 570k | } |
|
264 | | |
265 | | |
266 | | template <class pixel_t> |
267 | | void apply_sao(de265_image* img, int xCtb,int yCtb, |
268 | | const slice_segment_header* shdr, int cIdx, int nSW,int nSH, |
269 | | const pixel_t* in_img, int in_stride, |
270 | | /* */ pixel_t* out_img, int out_stride) |
271 | 1.64M | { |
272 | 1.64M | if (img->high_bit_depth(cIdx)) { |
273 | 259k | apply_sao_internal<uint16_t>(img,xCtb,yCtb, shdr,cIdx,nSW,nSH, |
274 | 259k | reinterpret_cast<const uint16_t*>(in_img), in_stride, |
275 | 259k | reinterpret_cast<uint16_t*>(out_img),out_stride); |
276 | 259k | } |
277 | 1.38M | else { |
278 | 1.38M | apply_sao_internal<uint8_t>(img,xCtb,yCtb, shdr,cIdx,nSW,nSH, |
279 | 1.38M | in_img, in_stride, |
280 | 1.38M | out_img,out_stride); |
281 | 1.38M | } |
282 | 1.64M | } |
283 | | |
284 | | |
285 | | void apply_sample_adaptive_offset(de265_image* img) |
286 | 0 | { |
287 | 0 | const seq_parameter_set& sps = img->get_sps(); |
288 | |
|
289 | 0 | if (sps.sample_adaptive_offset_enabled_flag==0) { |
290 | 0 | return; |
291 | 0 | } |
292 | | |
293 | 0 | de265_image inputCopy; |
294 | 0 | de265_error err = inputCopy.copy_image(img); |
295 | 0 | if (err != DE265_OK) { |
296 | 0 | img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false); |
297 | 0 | return; |
298 | 0 | } |
299 | | |
300 | 0 | for (int yCtb=0; yCtb<sps.PicHeightInCtbsY; yCtb++) |
301 | 0 | for (int xCtb=0; xCtb<sps.PicWidthInCtbsY; xCtb++) |
302 | 0 | { |
303 | 0 | const slice_segment_header* shdr = img->get_SliceHeaderCtb(xCtb,yCtb); |
304 | |
|
305 | 0 | if (shdr->slice_sao_luma_flag) { |
306 | 0 | apply_sao(img, xCtb,yCtb, shdr, 0, 1<<sps.Log2CtbSizeY, 1<<sps.Log2CtbSizeY, |
307 | 0 | inputCopy.get_image_plane(0), inputCopy.get_image_stride(0), |
308 | 0 | img->get_image_plane(0), img->get_image_stride(0)); |
309 | 0 | } |
310 | |
|
311 | 0 | if (shdr->slice_sao_chroma_flag) { |
312 | 0 | int nSW = (1<<sps.Log2CtbSizeY) / sps.SubWidthC; |
313 | 0 | int nSH = (1<<sps.Log2CtbSizeY) / sps.SubHeightC; |
314 | |
|
315 | 0 | apply_sao(img, xCtb,yCtb, shdr, 1, nSW,nSH, |
316 | 0 | inputCopy.get_image_plane(1), inputCopy.get_image_stride(1), |
317 | 0 | img->get_image_plane(1), img->get_image_stride(1)); |
318 | |
|
319 | 0 | apply_sao(img, xCtb,yCtb, shdr, 2, nSW,nSH, |
320 | 0 | inputCopy.get_image_plane(2), inputCopy.get_image_stride(2), |
321 | 0 | img->get_image_plane(2), img->get_image_stride(2)); |
322 | 0 | } |
323 | 0 | } |
324 | 0 | } |
325 | | |
326 | | |
327 | | void apply_sample_adaptive_offset_sequential(de265_image* img) |
328 | 0 | { |
329 | 0 | const seq_parameter_set& sps = img->get_sps(); |
330 | |
|
331 | 0 | if (sps.sample_adaptive_offset_enabled_flag==0) { |
332 | 0 | return; |
333 | 0 | } |
334 | | |
335 | 0 | int lumaImageSize = img->get_image_stride(0) * img->get_height(0) * img->get_bytes_per_pixel(0); |
336 | 0 | int chromaImageSize = img->get_image_stride(1) * img->get_height(1) * img->get_bytes_per_pixel(1); |
337 | |
|
338 | 0 | uint8_t* inputCopy = new uint8_t[ libde265_max(lumaImageSize, chromaImageSize) ]; |
339 | 0 | if (inputCopy == nullptr) { |
340 | 0 | img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false); |
341 | 0 | return; |
342 | 0 | } |
343 | | |
344 | | |
345 | 0 | int nChannels = 3; |
346 | 0 | if (sps.ChromaArrayType == CHROMA_MONO) { nChannels=1; } |
347 | |
|
348 | 0 | for (int cIdx=0;cIdx<nChannels;cIdx++) { |
349 | |
|
350 | 0 | int stride = img->get_image_stride(cIdx); |
351 | 0 | int height = img->get_height(cIdx); |
352 | |
|
353 | 0 | memcpy(inputCopy, img->get_image_plane(cIdx), stride * height * img->get_bytes_per_pixel(cIdx)); |
354 | |
|
355 | 0 | for (int yCtb=0; yCtb<sps.PicHeightInCtbsY; yCtb++) |
356 | 0 | for (int xCtb=0; xCtb<sps.PicWidthInCtbsY; xCtb++) |
357 | 0 | { |
358 | 0 | const slice_segment_header* shdr = img->get_SliceHeaderCtb(xCtb,yCtb); |
359 | 0 | if (shdr==nullptr) { |
360 | 0 | delete[] inputCopy; |
361 | 0 | return; |
362 | 0 | } |
363 | | |
364 | 0 | if (cIdx==0 && shdr->slice_sao_luma_flag) { |
365 | 0 | apply_sao(img, xCtb,yCtb, shdr, 0, 1<<sps.Log2CtbSizeY, 1<<sps.Log2CtbSizeY, |
366 | 0 | inputCopy, stride, |
367 | 0 | img->get_image_plane(0), img->get_image_stride(0)); |
368 | 0 | } |
369 | |
|
370 | 0 | if (cIdx!=0 && shdr->slice_sao_chroma_flag) { |
371 | 0 | int nSW = (1<<sps.Log2CtbSizeY) / sps.SubWidthC; |
372 | 0 | int nSH = (1<<sps.Log2CtbSizeY) / sps.SubHeightC; |
373 | |
|
374 | 0 | apply_sao(img, xCtb,yCtb, shdr, cIdx, nSW,nSH, |
375 | 0 | inputCopy, stride, |
376 | 0 | img->get_image_plane(cIdx), img->get_image_stride(cIdx)); |
377 | 0 | } |
378 | 0 | } |
379 | 0 | } |
380 | | |
381 | 0 | delete[] inputCopy; |
382 | 0 | } |
383 | | |
384 | | |
385 | | |
386 | | |
387 | | class thread_task_sao : public thread_task |
388 | | { |
389 | | public: |
390 | | int ctb_y; |
391 | | de265_image* img; /* this is where we get the SPS from |
392 | | (either inputImg or outputImg can be a dummy image) |
393 | | */ |
394 | | |
395 | | de265_image* inputImg; |
396 | | de265_image* outputImg; |
397 | | int inputProgress; |
398 | | |
399 | | virtual void work(); |
400 | 0 | virtual std::string name() const { |
401 | 0 | char buf[100]; |
402 | 0 | sprintf(buf,"sao-%d",ctb_y); |
403 | 0 | return buf; |
404 | 0 | } |
405 | | }; |
406 | | |
407 | | |
408 | | void thread_task_sao::work() |
409 | 49.2k | { |
410 | 49.2k | state = Running; |
411 | 49.2k | img->thread_run(this); |
412 | | |
413 | 49.2k | const seq_parameter_set& sps = img->get_sps(); |
414 | | |
415 | 49.2k | const int rightCtb = sps.PicWidthInCtbsY-1; |
416 | 49.2k | const int ctbSize = (1<<sps.Log2CtbSizeY); |
417 | | |
418 | | |
419 | | // wait until also the CTB-rows below and above are ready |
420 | | |
421 | 49.2k | img->wait_for_progress(this, rightCtb,ctb_y, inputProgress); |
422 | | |
423 | 49.2k | if (ctb_y>0) { |
424 | 37.5k | img->wait_for_progress(this, rightCtb,ctb_y-1, inputProgress); |
425 | 37.5k | } |
426 | | |
427 | 49.2k | if (ctb_y+1<sps.PicHeightInCtbsY) { |
428 | 37.5k | img->wait_for_progress(this, rightCtb,ctb_y+1, inputProgress); |
429 | 37.5k | } |
430 | | |
431 | | |
432 | | // copy input image to output for this CTB-row |
433 | | |
434 | 49.2k | outputImg->copy_lines_from(inputImg, ctb_y * ctbSize, (ctb_y+1) * ctbSize); |
435 | | |
436 | | |
437 | | // process SAO in the CTB-row |
438 | | |
439 | 656k | for (int xCtb=0; xCtb<sps.PicWidthInCtbsY; xCtb++) |
440 | 607k | { |
441 | 607k | const slice_segment_header* shdr = img->get_SliceHeaderCtb(xCtb,ctb_y); |
442 | 607k | if (shdr==nullptr) { |
443 | 2 | break; |
444 | 2 | } |
445 | | |
446 | 607k | if (shdr->slice_sao_luma_flag) { |
447 | 585k | apply_sao(img, xCtb,ctb_y, shdr, 0, ctbSize, ctbSize, |
448 | 585k | inputImg ->get_image_plane(0), inputImg ->get_image_stride(0), |
449 | 585k | outputImg->get_image_plane(0), outputImg->get_image_stride(0)); |
450 | 585k | } |
451 | | |
452 | 607k | if (shdr->slice_sao_chroma_flag) { |
453 | 527k | int nSW = ctbSize / sps.SubWidthC; |
454 | 527k | int nSH = ctbSize / sps.SubHeightC; |
455 | | |
456 | 527k | apply_sao(img, xCtb,ctb_y, shdr, 1, nSW,nSH, |
457 | 527k | inputImg ->get_image_plane(1), inputImg ->get_image_stride(1), |
458 | 527k | outputImg->get_image_plane(1), outputImg->get_image_stride(1)); |
459 | | |
460 | 527k | apply_sao(img, xCtb,ctb_y, shdr, 2, nSW,nSH, |
461 | 527k | inputImg ->get_image_plane(2), inputImg ->get_image_stride(2), |
462 | 527k | outputImg->get_image_plane(2), outputImg->get_image_stride(2)); |
463 | 527k | } |
464 | 607k | } |
465 | | |
466 | | |
467 | | // mark SAO progress |
468 | | |
469 | 656k | for (int x=0;x<=rightCtb;x++) { |
470 | 607k | const int CtbWidth = sps.PicWidthInCtbsY; |
471 | 607k | img->ctb_progress[x+ctb_y*CtbWidth].set_progress(CTB_PROGRESS_SAO); |
472 | 607k | } |
473 | | |
474 | | |
475 | 49.2k | state = Finished; |
476 | 49.2k | img->thread_finishes(this); |
477 | 49.2k | } |
478 | | |
479 | | |
480 | | bool add_sao_tasks(image_unit* imgunit, int saoInputProgress) |
481 | 13.0k | { |
482 | 13.0k | de265_image* img = imgunit->img; |
483 | 13.0k | const seq_parameter_set& sps = img->get_sps(); |
484 | | |
485 | 13.0k | if (sps.sample_adaptive_offset_enabled_flag==0) { |
486 | 1.27k | return false; |
487 | 1.27k | } |
488 | | |
489 | | |
490 | 11.7k | decoder_context* ctx = img->decctx; |
491 | | |
492 | 11.7k | de265_error err = imgunit->sao_output.alloc_image(img->get_width(), img->get_height(), |
493 | 11.7k | img->get_chroma_format(), |
494 | 11.7k | img->get_shared_sps(), |
495 | 11.7k | false, |
496 | 11.7k | img->decctx, //img->encctx, |
497 | 11.7k | img->pts, img->user_data, true); |
498 | 11.7k | if (err != DE265_OK) { |
499 | 0 | img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false); |
500 | 0 | return false; |
501 | 0 | } |
502 | | |
503 | 11.7k | int nRows = sps.PicHeightInCtbsY; |
504 | | |
505 | 11.7k | img->thread_start(nRows); |
506 | | |
507 | 61.0k | for (int y=0;y<nRows;y++) |
508 | 49.2k | { |
509 | 49.2k | thread_task_sao* task = new thread_task_sao; |
510 | | |
511 | 49.2k | task->inputImg = img; |
512 | 49.2k | task->outputImg = &imgunit->sao_output; |
513 | 49.2k | task->img = img; |
514 | 49.2k | task->ctb_y = y; |
515 | 49.2k | task->inputProgress = saoInputProgress; |
516 | | |
517 | 49.2k | imgunit->tasks.push_back(task); |
518 | 49.2k | ctx->thread_pool_.add_task(task); |
519 | 49.2k | } |
520 | | |
521 | | /* Currently need barrier here because when are finished, we have to swap the pixel |
522 | | data back into the main image. */ |
523 | 11.7k | img->wait_for_completion(); |
524 | | |
525 | 11.7k | img->exchange_pixel_data_with(imgunit->sao_output); |
526 | | |
527 | 11.7k | return true; |
528 | 11.7k | } |