/src/libde265/libde265/transform.cc
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * H.265 video codec. |
3 | | * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de> |
4 | | * |
5 | | * This file is part of libde265. |
6 | | * |
7 | | * libde265 is free software: you can redistribute it and/or modify |
8 | | * it under the terms of the GNU Lesser General Public License as |
9 | | * published by the Free Software Foundation, either version 3 of |
10 | | * the License, or (at your option) any later version. |
11 | | * |
12 | | * libde265 is distributed in the hope that it will be useful, |
13 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | * GNU Lesser General Public License for more details. |
16 | | * |
17 | | * You should have received a copy of the GNU Lesser General Public License |
18 | | * along with libde265. If not, see <http://www.gnu.org/licenses/>. |
19 | | */ |
20 | | |
21 | | #include "transform.h" |
22 | | #include "util.h" |
23 | | |
24 | | #include <assert.h> |
25 | | |
26 | | |
27 | | const int tab8_22[] = { 29,30,31,32,33,33,34,34,35,35,36,36,37 /*,37*/ }; |
28 | | |
29 | | |
30 | | // (8.6.1) |
31 | | void decode_quantization_parameters(thread_context* tctx, int xC,int yC, |
32 | | int xCUBase, int yCUBase) |
33 | 5.08M | { |
34 | 5.08M | logtrace(LogTransform,">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> decode_quantization_parameters(int xC,int yC)=(%d,%d)\n", xC,yC); |
35 | | |
36 | 5.08M | const pic_parameter_set& pps = tctx->img->get_pps(); |
37 | 5.08M | const seq_parameter_set& sps = tctx->img->get_sps(); |
38 | 5.08M | slice_segment_header* shdr = tctx->shdr; |
39 | | |
40 | | // top left pixel position of current quantization group |
41 | 5.08M | int xQG = xCUBase - (xCUBase & ((1<<pps.Log2MinCuQpDeltaSize)-1)); |
42 | 5.08M | int yQG = yCUBase - (yCUBase & ((1<<pps.Log2MinCuQpDeltaSize)-1)); |
43 | | |
44 | 5.08M | logtrace(LogTransform,"QG: %d,%d\n",xQG,yQG); |
45 | | |
46 | | |
47 | | // we only have to set QP in the first call in a quantization-group |
48 | | |
49 | | /* TODO: check why this does not work with HoneyBee stream |
50 | | |
51 | | if (xQG == tctx->currentQG_x && |
52 | | yQG == tctx->currentQG_y) |
53 | | { |
54 | | return; |
55 | | } |
56 | | */ |
57 | | |
58 | | // if this is the first call within a new quantization group, remember the QPY of the previous QG |
59 | | |
60 | 5.08M | if (xQG != tctx->currentQG_x || |
61 | 5.08M | yQG != tctx->currentQG_y) |
62 | 1.45M | { |
63 | 1.45M | tctx->lastQPYinPreviousQG = tctx->currentQPY; |
64 | 1.45M | tctx->currentQG_x = xQG; |
65 | 1.45M | tctx->currentQG_y = yQG; |
66 | 1.45M | } |
67 | | |
68 | 5.08M | int qPY_PRED; |
69 | | |
70 | | // first QG in CTB row ? |
71 | | |
72 | 5.08M | int ctbLSBMask = ((1<<sps.Log2CtbSizeY)-1); |
73 | 5.08M | bool firstInCTBRow = (xQG == 0 && ((yQG & ctbLSBMask)==0)); |
74 | | |
75 | | // first QG in slice ? TODO: a "firstQG" flag in the thread context would be faster |
76 | | |
77 | 5.08M | int first_ctb_in_slice_RS = tctx->shdr->SliceAddrRS; |
78 | | |
79 | 5.08M | int SliceStartX = (first_ctb_in_slice_RS % sps.PicWidthInCtbsY) * sps.CtbSizeY; |
80 | 5.08M | int SliceStartY = (first_ctb_in_slice_RS / sps.PicWidthInCtbsY) * sps.CtbSizeY; |
81 | | |
82 | 5.08M | bool firstQGInSlice = (SliceStartX == xQG && SliceStartY == yQG); |
83 | | |
84 | | // first QG in tile ? |
85 | | |
86 | 5.08M | bool firstQGInTile = false; |
87 | 5.08M | if (pps.tiles_enabled_flag) { |
88 | 914k | if ((xQG & ((1 << sps.Log2CtbSizeY)-1)) == 0 && |
89 | 914k | (yQG & ((1 << sps.Log2CtbSizeY)-1)) == 0) |
90 | 878k | { |
91 | 878k | int ctbX = xQG >> sps.Log2CtbSizeY; |
92 | 878k | int ctbY = yQG >> sps.Log2CtbSizeY; |
93 | | |
94 | 878k | firstQGInTile = pps.is_tile_start_CTB(ctbX,ctbY); // TODO: this is slow |
95 | 878k | } |
96 | 914k | } |
97 | | |
98 | | |
99 | 5.08M | if (firstQGInSlice || firstQGInTile || |
100 | 5.08M | (firstInCTBRow && pps.entropy_coding_sync_enabled_flag)) { |
101 | 1.39M | qPY_PRED = tctx->shdr->SliceQPY; |
102 | 1.39M | } |
103 | 3.68M | else { |
104 | 3.68M | qPY_PRED = tctx->lastQPYinPreviousQG; |
105 | 3.68M | } |
106 | | |
107 | | |
108 | 5.08M | int qPYA,qPYB; |
109 | | |
110 | 5.08M | if (tctx->img->available_zscan(xQG,yQG, xQG-1,yQG)) { |
111 | 3.24M | int xTmp = (xQG-1) >> sps.Log2MinTrafoSize; |
112 | 3.24M | int yTmp = (yQG ) >> sps.Log2MinTrafoSize; |
113 | 3.24M | int minTbAddrA = pps.MinTbAddrZS[xTmp + yTmp*sps.PicWidthInTbsY]; |
114 | 3.24M | int ctbAddrA = minTbAddrA >> (2 * (sps.Log2CtbSizeY-sps.Log2MinTrafoSize)); |
115 | 3.24M | if (ctbAddrA == tctx->CtbAddrInTS) { |
116 | 1.05M | qPYA = tctx->img->get_QPY(xQG-1,yQG); |
117 | 1.05M | } |
118 | 2.18M | else { |
119 | 2.18M | qPYA = qPY_PRED; |
120 | 2.18M | } |
121 | 3.24M | } |
122 | 1.83M | else { |
123 | 1.83M | qPYA = qPY_PRED; |
124 | 1.83M | } |
125 | | |
126 | 5.08M | if (tctx->img->available_zscan(xQG,yQG, xQG,yQG-1)) { |
127 | 2.97M | int xTmp = (xQG ) >> sps.Log2MinTrafoSize; |
128 | 2.97M | int yTmp = (yQG-1) >> sps.Log2MinTrafoSize; |
129 | 2.97M | int minTbAddrB = pps.MinTbAddrZS[xTmp + yTmp*sps.PicWidthInTbsY]; |
130 | 2.97M | int ctbAddrB = minTbAddrB >> (2 * (sps.Log2CtbSizeY-sps.Log2MinTrafoSize)); |
131 | 2.97M | if (ctbAddrB == tctx->CtbAddrInTS) { |
132 | 1.04M | qPYB = tctx->img->get_QPY(xQG,yQG-1); |
133 | 1.04M | } |
134 | 1.93M | else { |
135 | 1.93M | qPYB = qPY_PRED; |
136 | 1.93M | } |
137 | 2.97M | } |
138 | 2.10M | else { |
139 | 2.10M | qPYB = qPY_PRED; |
140 | 2.10M | } |
141 | | |
142 | 5.08M | qPY_PRED = (qPYA + qPYB + 1)>>1; |
143 | | |
144 | 5.08M | logtrace(LogTransform,"qPY_PRED = %d (%d, %d)\n",qPY_PRED, qPYA, qPYB); |
145 | | |
146 | 5.08M | int QPY = ((qPY_PRED + tctx->CuQpDelta + 52+2*sps.QpBdOffset_Y) % |
147 | 5.08M | (52 + sps.QpBdOffset_Y)) - sps.QpBdOffset_Y; |
148 | | |
149 | 5.08M | tctx->qPYPrime = QPY + sps.QpBdOffset_Y; |
150 | 5.08M | if (tctx->qPYPrime<0) { |
151 | 46.5k | tctx->qPYPrime=0; |
152 | 46.5k | } |
153 | | |
154 | 5.08M | int qPiCb = Clip3(-sps.QpBdOffset_C,57, QPY+pps.pic_cb_qp_offset + shdr->slice_cb_qp_offset + tctx->CuQpOffsetCb); |
155 | 5.08M | int qPiCr = Clip3(-sps.QpBdOffset_C,57, QPY+pps.pic_cr_qp_offset + shdr->slice_cr_qp_offset + tctx->CuQpOffsetCr); |
156 | | |
157 | 5.08M | logtrace(LogTransform,"qPiCb:%d (%d %d), qPiCr:%d (%d %d)\n", |
158 | 5.08M | qPiCb, pps.pic_cb_qp_offset, shdr->slice_cb_qp_offset, |
159 | 5.08M | qPiCr, pps.pic_cr_qp_offset, shdr->slice_cr_qp_offset); |
160 | | |
161 | 5.08M | int qPCb,qPCr; |
162 | | |
163 | 5.08M | if (sps.ChromaArrayType == CHROMA_420) { |
164 | 2.49M | qPCb = table8_22(qPiCb); |
165 | 2.49M | qPCr = table8_22(qPiCr); |
166 | 2.49M | } |
167 | 2.58M | else { |
168 | 2.58M | qPCb = qPiCb; |
169 | 2.58M | qPCr = qPiCr; |
170 | 2.58M | } |
171 | | |
172 | | //printf("q: %d %d\n",qPiCb, qPCb); |
173 | | |
174 | 5.08M | tctx->qPCbPrime = qPCb + sps.QpBdOffset_C; |
175 | 5.08M | if (tctx->qPCbPrime<0) { |
176 | 0 | tctx->qPCbPrime = 0; |
177 | 0 | } |
178 | | |
179 | 5.08M | tctx->qPCrPrime = qPCr + sps.QpBdOffset_C; |
180 | 5.08M | if (tctx->qPCrPrime<0) { |
181 | 0 | tctx->qPCrPrime = 0; |
182 | 0 | } |
183 | | |
184 | | /* |
185 | | printf("Q: %d (%d %d %d / %d %d) %d %d %d\n",QPY, |
186 | | sps->QpBdOffset_Y, |
187 | | pps->pic_cb_qp_offset + shdr->slice_cb_qp_offset, |
188 | | pps->pic_cr_qp_offset + shdr->slice_cr_qp_offset, |
189 | | sps->QpBdOffset_C, sps->QpBdOffset_C, |
190 | | tctx->qPYPrime, tctx->qPCbPrime, tctx->qPCrPrime); |
191 | | */ |
192 | | |
193 | 5.08M | int log2CbSize = tctx->img->get_log2CbSize(xCUBase, yCUBase); |
194 | | |
195 | | // TODO: On broken input, log2CbSize may be zero (multithreaded only). Not sure yet why. |
196 | | // Maybe another decoding thread is overwriting the value set in slice.cc:read_coding_unit. |
197 | | // id:000163,sig:06,src:002041,op:havoc,rep:16.bin |
198 | 5.08M | if (log2CbSize<3) { log2CbSize=3; } |
199 | | |
200 | 5.08M | tctx->img->set_QPY(xCUBase, yCUBase, log2CbSize, QPY); |
201 | 5.08M | tctx->currentQPY = QPY; |
202 | | |
203 | | /* |
204 | | printf("SET QPY POC=%d %d;%d-%d;%d = %d\n",ctx->img->PicOrderCntVal,xCUBase,yCUBase, |
205 | | xCUBase+(1<<log2CbSize),yCUBase+(1<<log2CbSize), QPY); |
206 | | */ |
207 | | |
208 | 5.08M | logtrace(LogTransform,"qPY(%d,%d,%d)= %d, qPYPrime=%d\n", |
209 | 5.08M | xCUBase,yCUBase,1<<log2CbSize,QPY,tctx->qPYPrime); |
210 | 5.08M | } |
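The derivation above (H.265 section 8.6.1) averages the left and above QP predictors, adds the signalled CuQpDelta with a wrap-around into the valid range, and finally maps the luma QP to chroma through tab8_22[]. A minimal standalone sketch of that arithmetic follows, assuming 8-bit video (QpBdOffset_Y == QpBdOffset_C == 0) and 4:2:0 chroma; chroma_qp_from_qpi() is a hypothetical restatement of table8_22(), which is declared outside this file, and derive_qpy() is for illustration only.

    // Sketch only: the 8-bit case of the QP derivation implemented above.
    static const int tab8_22_sketch[] = { 29,30,31,32,33,33,34,34,35,35,36,36,37 };

    static int chroma_qp_from_qpi(int qPi)     // qPi = QPY + chroma QP offsets, clipped
    {
      if (qPi < 30) return qPi;                // identity below 30
      if (qPi > 42) return qPi - 6;            // linear again above the table range
      return tab8_22_sketch[qPi - 30];         // Table 8-10 mapping for qPi in [30,42]
    }

    static int derive_qpy(int qPY_A, int qPY_B, int CuQpDelta)
    {
      int qPY_PRED = (qPY_A + qPY_B + 1) >> 1; // average of the left/above predictors
      return (qPY_PRED + CuQpDelta + 52) % 52; // wrap-around; QpBdOffset_Y assumed to be 0
    }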
211 | | |
212 | | |
213 | | |
214 | | template <class pixel_t> |
215 | | void transform_coefficients(acceleration_functions* acceleration, |
216 | | int16_t* coeff, int coeffStride, int nT, int trType, |
217 | | pixel_t* dst, int dstStride, int bit_depth) |
218 | 6.52M | { |
219 | 6.52M | logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,nT); |
220 | | |
221 | | |
222 | 6.52M | if (trType==1) { |
223 | | |
224 | 3.25M | acceleration->transform_4x4_dst_add<pixel_t>(dst, coeff, dstStride, bit_depth); |
225 | | |
226 | 3.27M | } else { |
227 | | |
228 | 3.27M | /**/ if (nT==4) { acceleration->transform_add<pixel_t>(0,dst,coeff,dstStride, bit_depth); } |
229 | 627k | else if (nT==8) { acceleration->transform_add<pixel_t>(1,dst,coeff,dstStride, bit_depth); } |
230 | 68.3k | else if (nT==16) { acceleration->transform_add<pixel_t>(2,dst,coeff,dstStride, bit_depth); } |
231 | 19.8k | else { acceleration->transform_add<pixel_t>(3,dst,coeff,dstStride, bit_depth); } |
232 | 3.27M | } |
233 | | |
234 | | #if 0 |
235 | | printf("decoded pixels:\n"); |
236 | | for (int y=0;y<nT;y++,printf("\n")) |
237 | | for (int x=0;x<nT;x++) { |
238 | | printf("%02x ",dst[y*dstStride+x]); |
239 | | } |
240 | | #endif |
241 | 6.52M | }

Instantiations of transform_coefficients (the merged listing above aggregates both):
  void transform_coefficients<unsigned short>(acceleration_functions*, short*, int, int, int, unsigned short*, int, int): entered 2.79M times
  void transform_coefficients<unsigned char>(acceleration_functions*, short*, int, int, int, unsigned char*, int, int): entered 3.72M times
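The if/else ladder above picks one of four IDCT kernels by block size; inv_transform() and fwd_transform() below express the same mapping as transform_add_8[log2TbSize-2]. The helper below is a hypothetical illustration of that index mapping, not part of the library:

    static int dct_kernel_index(int nT)        // hypothetical helper, illustration only
    {
      assert(nT == 4 || nT == 8 || nT == 16 || nT == 32);
      int log2TbSize = 2;
      while ((1 << log2TbSize) < nT) { log2TbSize++; }
      return log2TbSize - 2;                   // 4 -> 0, 8 -> 1, 16 -> 2, 32 -> 3
    }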
242 | | |
243 | | |
244 | | // TODO: make this an accelerated function |
245 | | void cross_comp_pred(const thread_context* tctx, int32_t* residual, int nT) |
246 | 53.4k | { |
247 | 53.4k | const int BitDepthC = tctx->img->get_sps().BitDepth_C; |
248 | 53.4k | const int BitDepthY = tctx->img->get_sps().BitDepth_Y; |
249 | | |
250 | 376k | for (int y=0;y<nT;y++) |
251 | 2.99M | for (int x=0;x<nT;x++) { |
252 | | /* TODO: the most usual case is definitely BitDepthY == BitDepthC, in which case |
253 | | we could just omit two shifts. The second most common case is probably |
254 | | BitDepthY>BitDepthC, for which we could also eliminate one shift. The remaining |
255 | | case is also one shift only. |
256 | | */ |
257 | | |
258 | 2.66M | residual[y*nT+x] += (tctx->ResScaleVal * |
259 | 2.66M | ((tctx->residual_luma[y*nT+x] << BitDepthC ) >> BitDepthY ) ) >> 3; |
260 | 2.66M | } |
261 | 53.4k | } |
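As the TODO above notes, in the common BitDepthY == BitDepthC case the two shifts cancel and the update reduces to residual += (ResScaleVal * residual_luma) >> 3. A minimal sketch of that specialization, written as a hypothetical free function rather than anything in libde265's API:

    #include <stdint.h>

    static void cross_comp_pred_equal_depth(int32_t* residual, const int32_t* residual_luma,
                                            int ResScaleVal, int nT)
    {
      for (int i = 0; i < nT * nT; i++) {
        residual[i] += (ResScaleVal * residual_luma[i]) >> 3;  // shifts cancel when depths match
      }
    }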
262 | | |
263 | | |
264 | | template <class pixel_t> |
265 | | void transform_coefficients_explicit(thread_context* tctx, |
266 | | int16_t* coeff, int coeffStride, int nT, int trType, |
267 | | pixel_t* dst, int dstStride, int bit_depth, int cIdx) |
268 | 163k | { |
269 | 163k | logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,nT); |
270 | | |
271 | 163k | const acceleration_functions* acceleration = &tctx->decctx->acceleration; |
272 | | |
273 | 163k | int32_t residual_buffer[32*32]; |
274 | 163k | int32_t* residual; |
275 | 163k | if (cIdx==0) { |
276 | 81.3k | residual = tctx->residual_luma; |
277 | 81.3k | } |
278 | 82.4k | else { |
279 | 82.4k | residual = residual_buffer; |
280 | 82.4k | } |
281 | | |
282 | | |
283 | | // TODO |
284 | 163k | int bdShift = 20 - bit_depth; |
285 | 163k | int max_coeff_bits = 15; |
286 | | |
287 | 163k | if (trType==1) { |
288 | | |
289 | 56.3k | acceleration->transform_idst_4x4(residual, coeff, bdShift, max_coeff_bits); |
290 | | |
291 | 107k | } else { |
292 | | |
293 | 107k | /**/ if (nT==4) { acceleration->transform_idct_4x4(residual,coeff,bdShift,max_coeff_bits); } |
294 | 54.8k | else if (nT==8) { acceleration->transform_idct_8x8(residual,coeff,bdShift,max_coeff_bits); } |
295 | 12.5k | else if (nT==16) { acceleration->transform_idct_16x16(residual,coeff,bdShift,max_coeff_bits); } |
296 | 457 | else { acceleration->transform_idct_32x32(residual,coeff,bdShift,max_coeff_bits); } |
297 | 107k | } |
298 | | |
299 | | |
300 | | //printBlk("prediction",(uint8_t*)dst,nT,dstStride); |
301 | | //printBlk("residual",residual,nT,nT); |
302 | | |
303 | 163k | if (cIdx != 0) { |
304 | 82.4k | if (tctx->ResScaleVal != 0) { |
305 | 53.0k | cross_comp_pred(tctx, residual, nT); |
306 | 53.0k | } |
307 | | |
308 | | //printBlk("cross-comp-pred modified residual",residual,nT,nT); |
309 | 82.4k | } |
310 | | |
311 | 163k | acceleration->add_residual(dst,dstStride, residual,nT, bit_depth); |
312 | 163k | }

Instantiations of transform_coefficients_explicit (the merged listing above aggregates both):
  void transform_coefficients_explicit<unsigned short>(thread_context*, short*, int, int, int, unsigned short*, int, int, int): entered 24.7k times
  void transform_coefficients_explicit<unsigned char>(thread_context*, short*, int, int, int, unsigned char*, int, int, int): entered 139k times
313 | | |
314 | | |
315 | | void inv_transform(acceleration_functions* acceleration, |
316 | | uint8_t* dst, int dstStride, int16_t* coeff, |
317 | | int log2TbSize, int trType) |
318 | 0 | { |
319 | 0 | if (trType==1) { |
320 | 0 | assert(log2TbSize==2); |
321 | | |
322 | 0 | acceleration->transform_4x4_dst_add_8(dst, coeff, dstStride); |
323 | |
324 | 0 | } else { |
325 | 0 | acceleration->transform_add_8[log2TbSize-2](dst,coeff,dstStride); |
326 | 0 | } |
327 | | |
328 | |
329 | | #if 0 |
330 | | int nT = 1<<log2TbSize; |
331 | | printf("decoded pixels:\n"); |
332 | | for (int y=0;y<nT;y++,printf("\n")) |
333 | | for (int x=0;x<nT;x++) { |
334 | | printf("%02x ",dst[y*dstStride+x]); |
335 | | } |
336 | | #endif |
337 | 0 | } |
338 | | |
339 | | |
340 | | void fwd_transform(acceleration_functions* acceleration, |
341 | | int16_t* coeff, int coeffStride, int log2TbSize, int trType, |
342 | | const int16_t* src, int srcStride) |
343 | 0 | { |
344 | 0 | logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,1<<log2TbSize); |
345 | |
346 | 0 | if (trType==1) { |
347 | | // DST 4x4 |
348 | |
349 | 0 | acceleration->fwd_transform_4x4_dst_8(coeff, src, srcStride); |
350 | 0 | } else { |
351 | | // DCT 4x4, 8x8, 16x16, 32x32 |
352 | |
353 | 0 | acceleration->fwd_transform_8[log2TbSize-2](coeff,src,srcStride); |
354 | 0 | } |
355 | 0 | } |
356 | | |
357 | | |
358 | | |
359 | | static const int levelScale[] = { 40,45,51,57,64,72 }; |
360 | | |
361 | | // (8.6.2) and (8.6.3) |
362 | | template <class pixel_t> |
363 | | void scale_coefficients_internal(thread_context* tctx, |
364 | | int xT,int yT, // position of TU in frame (chroma adapted) |
365 | | int x0,int y0, // position of CU in frame (chroma adapted) |
366 | | int nT, int cIdx, |
367 | | bool transform_skip_flag, bool intra, int rdpcmMode) |
368 | 9.71M | { |
369 | 9.71M | const seq_parameter_set& sps = tctx->img->get_sps(); |
370 | 9.71M | const pic_parameter_set& pps = tctx->img->get_pps(); |
371 | | |
372 | 9.71M | int qP; |
373 | 9.71M | switch (cIdx) { |
374 | 6.48M | case 0: qP = tctx->qPYPrime; break; |
375 | 1.61M | case 1: qP = tctx->qPCbPrime; break; |
376 | 1.61M | case 2: qP = tctx->qPCrPrime; break; |
377 | 0 | default: qP = 0; assert(0); break; // should never happen |
378 | 9.71M | } |
379 | | |
380 | 9.71M | logtrace(LogTransform,"qP: %d\n",qP); |
381 | | |
382 | | |
383 | 9.71M | int16_t* coeff; |
384 | 9.71M | int coeffStride; |
385 | | |
386 | 9.71M | coeff = tctx->coeffBuf; |
387 | 9.71M | coeffStride = nT; |
388 | | |
389 | | |
390 | | |
391 | | |
392 | | |
393 | 9.71M | pixel_t* pred; |
394 | 9.71M | int stride; |
395 | 9.71M | pred = tctx->img->get_image_plane_at_pos_NEW<pixel_t>(cIdx, xT,yT); |
396 | 9.71M | stride = tctx->img->get_image_stride(cIdx); |
397 | | |
398 | | // We explicitly include the case for sizeof(pixel_t)==1 so that the compiler |
399 | | // can optimize away a lot of code for 8-bit pixels. |
400 | 9.71M | const int bit_depth = ((sizeof(pixel_t)==1) ? 8 : sps.get_bit_depth(cIdx)); |
401 | | |
402 | | //assert(intra == (tctx->img->get_pred_mode(xT,yT)==MODE_INTRA)); |
403 | 9.71M | int cuPredModeIntra = (tctx->img->get_pred_mode(xT,yT)==MODE_INTRA); |
404 | | |
405 | 9.71M | bool rotateCoeffs = (sps.range_extension.transform_skip_rotation_enabled_flag && |
406 | 9.71M | nT == 4 && |
407 | 9.71M | cuPredModeIntra); |
408 | | |
409 | 9.71M | if (tctx->cu_transquant_bypass_flag) { |
410 | | |
411 | 1.82M | int32_t residual_buffer[32*32]; |
412 | | |
413 | 1.82M | int32_t* residual; |
414 | 1.82M | if (cIdx==0) residual = tctx->residual_luma; |
415 | 478k | else residual = residual_buffer; |
416 | | |
417 | | |
418 | | // TODO: we could fold the coefficient rotation into the coefficient expansion here: |
419 | 17.6M | for (int i=0;i<tctx->nCoeff[cIdx];i++) { |
420 | 15.8M | int32_t currCoeff = tctx->coeffList[cIdx][i]; |
421 | 15.8M | tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff; |
422 | 15.8M | } |
423 | | |
424 | 1.82M | if (rotateCoeffs) { |
425 | 388k | tctx->decctx->acceleration.rotate_coefficients(coeff, nT); |
426 | 388k | } |
427 | | |
428 | 1.82M | if (rdpcmMode) { |
429 | 149k | if (rdpcmMode==2) |
430 | 63.6k | tctx->decctx->acceleration.transform_bypass_rdpcm_v(residual, coeff, nT); |
431 | 85.7k | else |
432 | 85.7k | tctx->decctx->acceleration.transform_bypass_rdpcm_h(residual, coeff, nT); |
433 | 149k | } |
434 | 1.67M | else { |
435 | 1.67M | tctx->decctx->acceleration.transform_bypass(residual, coeff, nT); |
436 | 1.67M | } |
437 | | |
438 | 1.82M | if (cIdx != 0) { |
439 | 478k | if (tctx->ResScaleVal != 0) { |
440 | 467 | cross_comp_pred(tctx, residual, nT); |
441 | 467 | } |
442 | 478k | } |
443 | | |
444 | 1.82M | tctx->decctx->acceleration.add_residual(pred,stride, residual,nT, bit_depth); |
445 | | |
446 | 1.82M | if (rotateCoeffs) { |
447 | 388k | memset(coeff, 0, nT*nT*sizeof(int16_t)); // delete all, because we moved the coeffs around |
448 | 388k | } |
449 | 1.82M | } |
450 | 7.89M | else { |
451 | | // (8.6.3) |
452 | | |
453 | 7.89M | int bdShift = (cIdx==0 ? sps.BitDepth_Y : sps.BitDepth_C) + Log2(nT) - 5; |
454 | | |
455 | 7.89M | logtrace(LogTransform,"bdShift=%d\n",bdShift); |
456 | | |
457 | 7.89M | logtrace(LogTransform,"dequant %d;%d cIdx=%d qp=%d\n",xT*(cIdx?2:1),yT*(cIdx?2:1),cIdx,qP); |
458 | | |
459 | | |
460 | | // --- inverse quantization --- |
461 | | |
462 | 7.89M | if (sps.scaling_list_enable_flag==0) { |
463 | | |
464 | | //const int m_x_y = 16; |
465 | 5.72M | const int m_x_y = 1; |
466 | 5.72M | bdShift -= 4; // this is equivalent to having a m_x_y of 16 and we can use 32bit integers |
467 | | |
468 | 5.72M | const int offset = (1<<(bdShift-1)); |
469 | 5.72M | const int fact = m_x_y * levelScale[qP%6] << (qP/6); |
470 | | |
471 | 49.6M | for (int i=0;i<tctx->nCoeff[cIdx];i++) { |
472 | | |
473 | | // usually this would need 64-bit arithmetic, but because we reduced the shift above, 32-bit intermediates are sufficient |
474 | 43.9M | int32_t currCoeff = tctx->coeffList[cIdx][i]; |
475 | | |
476 | | //logtrace(LogTransform,"coefficient[%d] = %d\n",tctx->coeffPos[cIdx][i], |
477 | | //tctx->coeffList[cIdx][i]); |
478 | | |
479 | 43.9M | currCoeff = Clip3(-32768,32767, |
480 | 43.9M | ( (currCoeff * fact + offset ) >> bdShift)); |
481 | | |
482 | | //logtrace(LogTransform," -> %d\n",currCoeff); |
483 | | |
484 | 43.9M | tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff; |
485 | 43.9M | } |
486 | 5.72M | } |
487 | 2.16M | else { |
488 | 2.16M | const int offset = (1<<(bdShift-1)); |
489 | | |
490 | 2.16M | const uint8_t* sclist; |
491 | 2.16M | int matrixID = cIdx; |
492 | | |
493 | 2.16M | if (nT==32) { |
494 | 16.4k | matrixID=0; |
495 | 16.4k | } |
496 | | |
497 | 2.16M | if (!intra) { |
498 | 96.5k | if (nT<32) { matrixID += 3; } |
499 | 3.06k | else { matrixID++; } |
500 | 96.5k | } |
501 | | |
502 | 2.16M | switch (nT) { |
503 | 1.85M | case 4: sclist = &pps.scaling_list.ScalingFactor_Size0[matrixID][0][0]; break; |
504 | 259k | case 8: sclist = &pps.scaling_list.ScalingFactor_Size1[matrixID][0][0]; break; |
505 | 32.0k | case 16: sclist = &pps.scaling_list.ScalingFactor_Size2[matrixID][0][0]; break; |
506 | 16.4k | case 32: sclist = &pps.scaling_list.ScalingFactor_Size3[matrixID][0][0]; break; |
507 | 0 | default: assert(0); |
508 | 2.16M | } |
509 | | |
510 | 22.9M | for (int i=0;i<tctx->nCoeff[cIdx];i++) { |
511 | 20.7M | int pos = tctx->coeffPos[cIdx][i]; |
512 | | |
513 | 20.7M | const int m_x_y = sclist[pos]; |
514 | 20.7M | const int fact = m_x_y * levelScale[qP%6] << (qP/6); |
515 | | |
516 | 20.7M | int64_t currCoeff = tctx->coeffList[cIdx][i]; |
517 | | |
518 | 20.7M | currCoeff = Clip3(-32768,32767, |
519 | 20.7M | ( (currCoeff * fact + offset ) >> bdShift)); |
520 | | |
521 | 20.7M | tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff; |
522 | 20.7M | } |
523 | 2.16M | } |
524 | | |
525 | | |
526 | | // --- do transform or skip --- |
527 | | |
528 | 7.89M | logtrace(LogTransform,"coefficients OUT:\n"); |
529 | 43.1M | for (int y=0;y<nT;y++) { |
530 | 35.2M | logtrace(LogTransform," "); |
531 | 225M | for (int x=0;x<nT;x++) { |
532 | 190M | logtrace(LogTransform,"*%3d ", coeff[x+y*coeffStride]); |
533 | 190M | } |
534 | 35.2M | logtrace(LogTransform,"*\n"); |
535 | 35.2M | } |
536 | | |
537 | 7.89M | int bdShift2 = (cIdx==0) ? 20-sps.BitDepth_Y : 20-sps.BitDepth_C; |
538 | | |
539 | 7.89M | logtrace(LogTransform,"bdShift2=%d\n",bdShift2); |
540 | | |
541 | 7.89M | logtrace(LogSlice,"get_transform_skip_flag(%d,%d, cIdx=%d)=%d\n",xT,yT,cIdx, |
542 | 7.89M | transform_skip_flag); |
543 | | |
544 | 7.89M | if (transform_skip_flag) { |
545 | | |
546 | 1.20M | int extended_precision_processing_flag = 0; |
547 | 1.20M | int Log2nTbS = Log2(nT); |
548 | 1.20M | int bdShift = libde265_max( 20 - bit_depth, extended_precision_processing_flag ? 11 : 0 ); |
549 | 1.20M | int tsShift = (extended_precision_processing_flag ? libde265_min( 5, bdShift - 2 ) : 5 ) |
550 | 1.20M | + Log2nTbS; |
551 | | |
552 | 1.20M | if (rotateCoeffs) { |
553 | 410k | tctx->decctx->acceleration.rotate_coefficients(coeff, nT); |
554 | 410k | } |
555 | | |
556 | 1.20M | int32_t residual_buffer[32*32]; |
557 | | |
558 | 1.20M | int32_t* residual; |
559 | 1.20M | if (cIdx==0) residual = tctx->residual_luma; |
560 | 433k | else residual = residual_buffer; |
561 | | |
562 | 1.20M | if (rdpcmMode) { |
563 | | /* |
564 | | if (rdpcmMode==2) |
565 | | tctx->decctx->acceleration.transform_skip_rdpcm_v(pred,coeff, Log2(nT), stride, bit_depth); |
566 | | else |
567 | | tctx->decctx->acceleration.transform_skip_rdpcm_h(pred,coeff, Log2(nT), stride, bit_depth); |
568 | | */ |
569 | | |
570 | 86.0k | if (rdpcmMode==2) |
571 | 51.5k | tctx->decctx->acceleration.rdpcm_v(residual, coeff,nT, tsShift,bdShift); |
572 | 34.4k | else |
573 | 34.4k | tctx->decctx->acceleration.rdpcm_h(residual, coeff,nT, tsShift,bdShift); |
574 | 86.0k | } |
575 | 1.11M | else { |
576 | | //tctx->decctx->acceleration.transform_skip(pred, coeff, stride, bit_depth); |
577 | | |
578 | 1.11M | tctx->decctx->acceleration.transform_skip_residual(residual, coeff, nT, tsShift, bdShift); |
579 | 1.11M | } |
580 | | |
581 | 1.20M | if (cIdx != 0) { |
582 | 433k | if (tctx->ResScaleVal != 0) { |
583 | 0 | cross_comp_pred(tctx, residual, nT); |
584 | 0 | } |
585 | 433k | } |
586 | | |
587 | 1.20M | tctx->decctx->acceleration.add_residual(pred,stride, residual,nT, bit_depth); |
588 | | |
589 | 1.20M | if (rotateCoeffs) { |
590 | 410k | memset(coeff, 0, nT*nT*sizeof(int16_t)); // delete all, because we moved the coeffs around |
591 | 410k | } |
592 | 1.20M | } |
593 | 6.69M | else { |
594 | 6.69M | int trType; |
595 | | |
596 | | //if (nT==4 && cIdx==0 && tctx->img->get_pred_mode(xT,yT)==MODE_INTRA) { |
597 | 6.69M | if (nT==4 && cIdx==0 && cuPredModeIntra) { |
598 | 3.31M | trType=1; |
599 | 3.31M | } |
600 | 3.37M | else { |
601 | 3.37M | trType=0; |
602 | 3.37M | } |
603 | | |
604 | 6.69M | assert(rdpcmMode==0); |
605 | | |
606 | | |
607 | 6.69M | if (tctx->img->get_pps().range_extension.cross_component_prediction_enabled_flag) { |
608 | | // cross-component-prediction: transform to residual buffer and add in a separate step |
609 | | |
610 | 163k | transform_coefficients_explicit(tctx, coeff, coeffStride, nT, trType, |
611 | 163k | pred, stride, bit_depth, cIdx); |
612 | 163k | } |
613 | 6.52M | else { |
614 | 6.52M | transform_coefficients(&tctx->decctx->acceleration, coeff, coeffStride, nT, trType, |
615 | 6.52M | pred, stride, bit_depth); |
616 | 6.52M | } |
617 | 6.69M | } |
618 | 7.89M | } |
619 | | |
620 | | |
621 | 9.71M | logtrace(LogTransform,"pixels (cIdx:%d), position %d %d:\n",cIdx, xT,yT); |
622 | | |
623 | 53.2M | for (int y=0;y<nT;y++) { |
624 | 43.5M | logtrace(LogTransform,"RECO-%3d-%3d-%d ",xT,yT+y,cIdx); |
625 | | |
626 | 280M | for (int x=0;x<nT;x++) { |
627 | 236M | logtrace(LogTransform,"*%03x ", pred[x+y*stride]); |
628 | 236M | } |
629 | | |
630 | 43.5M | logtrace(LogTransform,"*\n"); |
631 | 43.5M | } |
632 | | |
633 | | // zero out scrap coefficient buffer again |
634 | | |
635 | 90.2M | for (int i=0;i<tctx->nCoeff[cIdx];i++) { |
636 | 80.5M | tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = 0; |
637 | 80.5M | } |
638 | 9.71M | }

Instantiations of scale_coefficients_internal (the merged listing above aggregates both):
  void scale_coefficients_internal<unsigned short>(thread_context*, int, int, int, int, int, int, bool, bool, int): entered 4.48M times
  void scale_coefficients_internal<unsigned char>(thread_context*, int, int, int, int, int, int, bool, bool, int): entered 5.23M times
639 | | |
640 | | |
641 | | void scale_coefficients(thread_context* tctx, |
642 | | int xT,int yT, // position of TU in frame (chroma adapted) |
643 | | int x0,int y0, // position of CU in frame (chroma adapted) |
644 | | int nT, int cIdx, |
645 | | bool transform_skip_flag, bool intra, |
646 | | int rdpcmMode // 0 - off, 1 - Horizontal, 2 - Vertical |
647 | | ) |
648 | 9.71M | { |
649 | 9.71M | if (tctx->img->high_bit_depth(cIdx)) { |
650 | 4.48M | scale_coefficients_internal<uint16_t>(tctx, xT,yT, x0,y0, nT,cIdx, transform_skip_flag, intra, |
651 | 4.48M | rdpcmMode); |
652 | 5.23M | } else { |
653 | 5.23M | scale_coefficients_internal<uint8_t> (tctx, xT,yT, x0,y0, nT,cIdx, transform_skip_flag, intra, |
654 | 5.23M | rdpcmMode); |
655 | 5.23M | } |
656 | 9.71M | } |
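To make the no-scaling-list dequantization path inside scale_coefficients_internal() concrete, the sketch below walks a single coefficient through it with illustrative values (8-bit video, nT = 8, qP = 30); the numbers are examples, not taken from a real stream:

    static void dequant_worked_example()
    {
      static const int levelScale_[6] = { 40,45,51,57,64,72 };
      const int qP = 30, bitDepth = 8, log2nT = 3;
      int bdShift = bitDepth + log2nT - 5 - 4;          // == 2 (the -4 folds m_x_y = 16 away)
      int offset  = 1 << (bdShift - 1);                 // == 2
      int fact    = levelScale_[qP % 6] << (qP / 6);    // == 40 << 5 == 1280
      assert(((3 * fact + offset) >> bdShift) == 960);  // a coefficient level of 3 dequantizes to 960
    }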
657 | | |
658 | | |
659 | | //#define QUANT_IQUANT_SHIFT 20 // Q(QP%6) * IQ(QP%6) = 2^20 |
660 | 0 | #define QUANT_SHIFT 14 // Q(4) = 2^14 |
661 | | //#define SCALE_BITS 15 // Inherited from TMuC, presumably for fractional bit estimates in RDOQ |
662 | 0 | #define MAX_TR_DYNAMIC_RANGE 15 // Maximum transform dynamic range (excluding sign bit) |
663 | | |
664 | | |
665 | | const static uint16_t g_quantScales[6] = { |
666 | | 26214,23302,20560,18396,16384,14564 |
667 | | }; |
668 | | |
669 | | void quant_coefficients(//encoder_context* ectx, |
670 | | int16_t* out_coeff, |
671 | | const int16_t* in_coeff, |
672 | | int log2TrSize, int qp, |
673 | | bool intra) |
674 | 0 | { |
675 | 0 | const int qpDiv6 = qp / 6; |
676 | 0 | const int qpMod6 = qp % 6; |
677 | | |
678 | | //int uiLog2TrSize = xLog2( iWidth - 1); |
679 | |
680 | 0 | int uiQ = g_quantScales[qpMod6]; |
681 | 0 | int bitDepth = 8; |
682 | 0 | int transformShift = MAX_TR_DYNAMIC_RANGE - bitDepth - log2TrSize; // Represents scaling through forward transform |
683 | 0 | int qBits = QUANT_SHIFT + qpDiv6 + transformShift; |
684 | | |
685 | | /* TODO: originally, this was checking for intra slices, why not for intra mode ? |
686 | | */ |
687 | 0 | int rnd = (intra ? 171 : 85) << (qBits-9); |
688 | |
689 | 0 | int x, y; |
690 | 0 | int uiAcSum = 0; |
691 | |
692 | 0 | int nStride = (1<<log2TrSize); |
693 | |
694 | 0 | for (y=0; y < (1<<log2TrSize) ; y++) { |
695 | 0 | for (x=0; x < (1<<log2TrSize) ; x++) { |
696 | 0 | int level; |
697 | 0 | int sign; |
698 | 0 | int blockPos = y * nStride + x; |
699 | 0 | level = in_coeff[blockPos]; |
700 | | //logtrace(LogTransform,"(%d,%d) %d -> ", x,y,level); |
701 | 0 | sign = (level < 0 ? -1: 1); |
702 | |
703 | 0 | level = (abs_value(level) * uiQ + rnd ) >> qBits; |
704 | 0 | uiAcSum += level; |
705 | 0 | level *= sign; |
706 | 0 | out_coeff[blockPos] = Clip3(-32768, 32767, level); |
707 | | //logtrace(LogTransform,"%d\n", out_coeff[blockPos]); |
708 | 0 | } |
709 | 0 | } |
710 | 0 | } |
711 | | |
712 | | |
713 | | void dequant_coefficients(int16_t* out_coeff, |
714 | | const int16_t* in_coeff, |
715 | | int log2TrSize, int qP) |
716 | 0 | { |
717 | 0 | const int m_x_y = 1; |
718 | 0 | int bitDepth = 8; |
719 | 0 | int bdShift = bitDepth + log2TrSize - 5; |
720 | 0 | bdShift -= 4; // this is equivalent to having a m_x_y of 16 and we can use 32bit integers |
721 | |
722 | 0 | const int offset = (1<<(bdShift-1)); |
723 | 0 | const int fact = m_x_y * levelScale[qP%6] << (qP/6); |
724 | |
725 | 0 | int blkSize = (1<<log2TrSize); |
726 | 0 | int nCoeff = (1<<(log2TrSize<<1)); |
727 | |
728 | 0 | for (int i=0;i<nCoeff;i++) { |
729 | | |
730 | | // usually, this needs to be 64bit, but because we modify the shift above, we can use 16 bit |
731 | 0 | int32_t currCoeff = in_coeff[i]; |
732 | | |
733 | | //logtrace(LogTransform,"coefficient[%d] = %d\n",i,currCoeff); |
734 | |
735 | 0 | currCoeff = Clip3(-32768,32767, |
736 | 0 | ( (currCoeff * fact + offset ) >> bdShift)); |
737 | | |
738 | | //logtrace(LogTransform," -> %d\n",currCoeff); |
739 | |
740 | 0 | out_coeff[i] = currCoeff; |
741 | 0 | } |
742 | 0 | } |
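quant_coefficients() and dequant_coefficients() are currently uncovered (all-zero counts); the sketch below shows one possible call pattern, with illustrative buffer contents and QP:

    #include <stdint.h>

    static void quant_roundtrip_example()
    {
      int16_t src[4*4] = { 0 };                // forward-transformed 4x4 block (log2TrSize = 2)
      int16_t q[4*4], deq[4*4];
      src[0] = 512;                            // a single DC coefficient

      quant_coefficients  (q,   src, /*log2TrSize=*/2, /*qp=*/26, /*intra=*/true);
      dequant_coefficients(deq, q,   /*log2TrSize=*/2, /*qP=*/26);
      // deq[0] now approximates src[0] up to quantization error.
    }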