/work/libde265/libde265/transform.cc
Line | Count | Source |
1 | | /* |
2 | | * H.265 video codec. |
3 | | * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de> |
4 | | * |
5 | | * This file is part of libde265. |
6 | | * |
7 | | * libde265 is free software: you can redistribute it and/or modify |
8 | | * it under the terms of the GNU Lesser General Public License as |
9 | | * published by the Free Software Foundation, either version 3 of |
10 | | * the License, or (at your option) any later version. |
11 | | * |
12 | | * libde265 is distributed in the hope that it will be useful, |
13 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | * GNU Lesser General Public License for more details. |
16 | | * |
17 | | * You should have received a copy of the GNU Lesser General Public License |
18 | | * along with libde265. If not, see <http://www.gnu.org/licenses/>. |
19 | | */ |
20 | | |
21 | | #include "transform.h" |
22 | | #include "util.h" |
23 | | |
24 | | #include <assert.h> |
25 | | |
26 | | |
// 13-entry lookup table. Presumably the backing data for table8_22(), which
// is used below to map chroma QP indices for 4:2:0 content -- confirm against
// the declaration in transform.h. A 14th entry (37) is intentionally omitted.
const int tab8_22[] = {
  29, 30, 31, 32,
  33, 33, 34, 34,
  35, 35, 36, 36,
  37
};
28 | | |
29 | | |
30 | | // (8.6.1) |
31 | | void decode_quantization_parameters(thread_context* tctx, int xC,int yC, |
32 | | int xCUBase, int yCUBase) |
33 | 0 | { |
34 | 0 | logtrace(LogTransform,">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> decode_quantization_parameters(int xC,int yC)=(%d,%d)\n", xC,yC); |
35 | |
|
36 | 0 | const pic_parameter_set& pps = tctx->img->get_pps(); |
37 | 0 | const seq_parameter_set& sps = tctx->img->get_sps(); |
38 | 0 | slice_segment_header* shdr = tctx->shdr; |
39 | | |
40 | | // top left pixel position of current quantization group |
41 | 0 | int xQG = xCUBase - (xCUBase & ((1<<pps.Log2MinCuQpDeltaSize)-1)); |
42 | 0 | int yQG = yCUBase - (yCUBase & ((1<<pps.Log2MinCuQpDeltaSize)-1)); |
43 | |
|
44 | 0 | logtrace(LogTransform,"QG: %d,%d\n",xQG,yQG); |
45 | | |
46 | | |
47 | | // we only have to set QP in the first call in a quantization-group |
48 | | |
49 | | /* TODO: check why this does not work with HoneyBee stream |
50 | | |
51 | | if (xQG == tctx->currentQG_x && |
52 | | yQG == tctx->currentQG_y) |
53 | | { |
54 | | return; |
55 | | } |
56 | | */ |
57 | | |
58 | | // if first QG in CU, remember last QPY of last CU previous QG |
59 | |
|
60 | 0 | if (xQG != tctx->currentQG_x || |
61 | 0 | yQG != tctx->currentQG_y) |
62 | 0 | { |
63 | 0 | tctx->lastQPYinPreviousQG = tctx->currentQPY; |
64 | 0 | tctx->currentQG_x = xQG; |
65 | 0 | tctx->currentQG_y = yQG; |
66 | 0 | } |
67 | |
|
68 | 0 | int qPY_PRED; |
69 | | |
70 | | // first QG in CTB row ? |
71 | |
|
72 | 0 | int ctbLSBMask = ((1<<sps.Log2CtbSizeY)-1); |
73 | 0 | bool firstInCTBRow = (xQG == 0 && ((yQG & ctbLSBMask)==0)); |
74 | | |
75 | | // first QG in slice ? TODO: a "firstQG" flag in the thread context would be faster |
76 | |
|
77 | 0 | int first_ctb_in_slice_RS = tctx->shdr->SliceAddrRS; |
78 | |
|
79 | 0 | int SliceStartX = (first_ctb_in_slice_RS % sps.PicWidthInCtbsY) * sps.CtbSizeY; |
80 | 0 | int SliceStartY = (first_ctb_in_slice_RS / sps.PicWidthInCtbsY) * sps.CtbSizeY; |
81 | |
|
82 | 0 | bool firstQGInSlice = (SliceStartX == xQG && SliceStartY == yQG); |
83 | | |
84 | | // first QG in tile ? |
85 | |
|
86 | 0 | bool firstQGInTile = false; |
87 | 0 | if (pps.tiles_enabled_flag) { |
88 | 0 | if ((xQG & ((1 << sps.Log2CtbSizeY)-1)) == 0 && |
89 | 0 | (yQG & ((1 << sps.Log2CtbSizeY)-1)) == 0) |
90 | 0 | { |
91 | 0 | int ctbX = xQG >> sps.Log2CtbSizeY; |
92 | 0 | int ctbY = yQG >> sps.Log2CtbSizeY; |
93 | |
|
94 | 0 | firstQGInTile = pps.is_tile_start_CTB(ctbX,ctbY); // TODO: this is slow |
95 | 0 | } |
96 | 0 | } |
97 | | |
98 | |
|
99 | 0 | if (firstQGInSlice || firstQGInTile || |
100 | 0 | (firstInCTBRow && pps.entropy_coding_sync_enabled_flag)) { |
101 | 0 | qPY_PRED = tctx->shdr->SliceQPY; |
102 | 0 | } |
103 | 0 | else { |
104 | 0 | qPY_PRED = tctx->lastQPYinPreviousQG; |
105 | 0 | } |
106 | | |
107 | |
|
108 | 0 | int qPYA,qPYB; |
109 | |
|
110 | 0 | if (tctx->img->available_zscan(xQG,yQG, xQG-1,yQG)) { |
111 | 0 | int xTmp = (xQG-1) >> sps.Log2MinTrafoSize; |
112 | 0 | int yTmp = (yQG ) >> sps.Log2MinTrafoSize; |
113 | 0 | int minTbAddrA = pps.MinTbAddrZS[xTmp + yTmp*sps.PicWidthInTbsY]; |
114 | 0 | uint32_t ctbAddrA = minTbAddrA >> (2 * (sps.Log2CtbSizeY-sps.Log2MinTrafoSize)); |
115 | 0 | if (ctbAddrA == tctx->CtbAddrInTS) { |
116 | 0 | qPYA = tctx->img->get_QPY(xQG-1,yQG); |
117 | 0 | } |
118 | 0 | else { |
119 | 0 | qPYA = qPY_PRED; |
120 | 0 | } |
121 | 0 | } |
122 | 0 | else { |
123 | 0 | qPYA = qPY_PRED; |
124 | 0 | } |
125 | |
|
126 | 0 | if (tctx->img->available_zscan(xQG,yQG, xQG,yQG-1)) { |
127 | 0 | int xTmp = (xQG ) >> sps.Log2MinTrafoSize; |
128 | 0 | int yTmp = (yQG-1) >> sps.Log2MinTrafoSize; |
129 | 0 | uint32_t minTbAddrB = pps.MinTbAddrZS[xTmp + yTmp*sps.PicWidthInTbsY]; |
130 | 0 | uint32_t ctbAddrB = minTbAddrB >> (2 * (sps.Log2CtbSizeY-sps.Log2MinTrafoSize)); |
131 | 0 | if (ctbAddrB == tctx->CtbAddrInTS) { |
132 | 0 | qPYB = tctx->img->get_QPY(xQG,yQG-1); |
133 | 0 | } |
134 | 0 | else { |
135 | 0 | qPYB = qPY_PRED; |
136 | 0 | } |
137 | 0 | } |
138 | 0 | else { |
139 | 0 | qPYB = qPY_PRED; |
140 | 0 | } |
141 | |
|
142 | 0 | qPY_PRED = (qPYA + qPYB + 1)>>1; |
143 | |
|
144 | 0 | logtrace(LogTransform,"qPY_PRED = %d (%d, %d)\n",qPY_PRED, qPYA, qPYB); |
145 | |
|
146 | 0 | int QPY = ((qPY_PRED + tctx->CuQpDelta + 52+2*sps.QpBdOffset_Y) % |
147 | 0 | (52 + sps.QpBdOffset_Y)) - sps.QpBdOffset_Y; |
148 | |
|
149 | 0 | assert(QPY >= -sps.QpBdOffset_Y && QPY <= 51); |
150 | | |
151 | 0 | tctx->qPYPrime = QPY + sps.QpBdOffset_Y; |
152 | |
|
153 | 0 | int qPiCb = Clip3(-sps.QpBdOffset_C,57, QPY+pps.pic_cb_qp_offset + shdr->slice_cb_qp_offset + tctx->CuQpOffsetCb); |
154 | 0 | int qPiCr = Clip3(-sps.QpBdOffset_C,57, QPY+pps.pic_cr_qp_offset + shdr->slice_cr_qp_offset + tctx->CuQpOffsetCr); |
155 | |
|
156 | 0 | logtrace(LogTransform,"qPiCb:%d (%d %d), qPiCr:%d (%d %d)\n", |
157 | 0 | qPiCb, pps.pic_cb_qp_offset, shdr->slice_cb_qp_offset, |
158 | 0 | qPiCr, pps.pic_cr_qp_offset, shdr->slice_cr_qp_offset); |
159 | |
|
160 | 0 | int qPCb,qPCr; |
161 | |
|
162 | 0 | if (sps.ChromaArrayType == CHROMA_420) { |
163 | 0 | qPCb = table8_22(qPiCb); |
164 | 0 | qPCr = table8_22(qPiCr); |
165 | 0 | } |
166 | 0 | else { |
167 | 0 | qPCb = qPiCb; |
168 | 0 | qPCr = qPiCr; |
169 | 0 | } |
170 | | |
171 | | //printf("q: %d %d\n",qPiCb, qPCb); |
172 | |
|
173 | 0 | tctx->qPCbPrime = qPCb + sps.QpBdOffset_C; |
174 | 0 | if (tctx->qPCbPrime<0) { |
175 | 0 | tctx->qPCbPrime = 0; |
176 | 0 | } |
177 | |
|
178 | 0 | tctx->qPCrPrime = qPCr + sps.QpBdOffset_C; |
179 | 0 | if (tctx->qPCrPrime<0) { |
180 | 0 | tctx->qPCrPrime = 0; |
181 | 0 | } |
182 | | |
183 | | /* |
184 | | printf("Q: %d (%d %d %d / %d %d) %d %d %d\n",QPY, |
185 | | sps->QpBdOffset_Y, |
186 | | pps->pic_cb_qp_offset + shdr->slice_cb_qp_offset, |
187 | | pps->pic_cr_qp_offset + shdr->slice_cr_qp_offset, |
188 | | sps->QpBdOffset_C, sps->QpBdOffset_C, |
189 | | tctx->qPYPrime, tctx->qPCbPrime, tctx->qPCrPrime); |
190 | | */ |
191 | |
|
192 | 0 | int log2CbSize = tctx->img->get_log2CbSize(xCUBase, yCUBase); |
193 | | |
194 | | // TODO: On broken input, log2CbSize may be zero (multithreaded only). Not sure yet why. |
195 | | // Maybe another decoding thread is overwriting the value set in slice.cc:read_coding_unit. |
196 | | // id:000163,sig:06,src:002041,op:havoc,rep:16.bin |
197 | 0 | if (log2CbSize<3) { log2CbSize=3; } |
198 | |
|
199 | 0 | tctx->img->set_QPY(xCUBase, yCUBase, log2CbSize, QPY); |
200 | 0 | tctx->currentQPY = QPY; |
201 | | |
202 | | /* |
203 | | printf("SET QPY POC=%d %d;%d-%d;%d = %d\n",ctx->img->PicOrderCntVal,xCUBase,yCUBase, |
204 | | xCUBase+(1<<log2CbSize),yCUBase+(1<<log2CbSize), QPY); |
205 | | */ |
206 | |
|
207 | 0 | logtrace(LogTransform,"qPY(%d,%d,%d)= %d, qPYPrime=%d\n", |
208 | 0 | xCUBase,yCUBase,1<<log2CbSize,QPY,tctx->qPYPrime); |
209 | 0 | } |
210 | | |
211 | | |
212 | | |
213 | | template <class pixel_t> |
214 | | void transform_coefficients(acceleration_functions* acceleration, |
215 | | int16_t* coeff, int coeffStride, int nT, int trType, |
216 | | pixel_t* dst, int dstStride, int bit_depth) |
217 | 0 | { |
218 | 0 | logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,nT); |
219 | | |
220 | |
|
221 | 0 | if (trType==1) { |
222 | |
|
223 | 0 | acceleration->transform_4x4_dst_add<pixel_t>(dst, coeff, dstStride, bit_depth); |
224 | |
|
225 | 0 | } else { |
226 | | |
227 | 0 | /**/ if (nT==4) { acceleration->transform_add<pixel_t>(0,dst,coeff,dstStride, bit_depth); } |
228 | 0 | else if (nT==8) { acceleration->transform_add<pixel_t>(1,dst,coeff,dstStride, bit_depth); } |
229 | 0 | else if (nT==16) { acceleration->transform_add<pixel_t>(2,dst,coeff,dstStride, bit_depth); } |
230 | 0 | else { acceleration->transform_add<pixel_t>(3,dst,coeff,dstStride, bit_depth); } |
231 | 0 | } |
232 | |
|
233 | | #if 0 |
234 | | printf("decoded pixels:\n"); |
235 | | for (int y=0;y<nT;y++,printf("\n")) |
236 | | for (int x=0;x<nT;x++) { |
237 | | printf("%02x ",dst[y*dstStride+x]); |
238 | | } |
239 | | #endif |
240 | 0 | } Unexecuted instantiation: void transform_coefficients<unsigned short>(acceleration_functions*, short*, int, int, int, unsigned short*, int, int) Unexecuted instantiation: void transform_coefficients<unsigned char>(acceleration_functions*, short*, int, int, int, unsigned char*, int, int) |
241 | | |
242 | | |
243 | | // TODO: make this an accelerated function |
244 | | void cross_comp_pred(const thread_context* tctx, int32_t* residual, int nT) |
245 | 0 | { |
246 | 0 | const int BitDepthC = tctx->img->get_sps().BitDepth_C; |
247 | 0 | const int BitDepthY = tctx->img->get_sps().BitDepth_Y; |
248 | |
|
249 | 0 | for (int y=0;y<nT;y++) |
250 | 0 | for (int x=0;x<nT;x++) { |
251 | | /* TODO: the most usual case is definitely BitDepthY == BitDepthC, in which case |
252 | | we could just omit two shifts. The second most common case is probably |
253 | | BitDepthY>BitDepthC, for which we could also eliminate one shift. The remaining |
254 | | case is also one shift only. |
255 | | */ |
256 | |
|
257 | 0 | residual[y*nT+x] += (tctx->ResScaleVal * |
258 | 0 | static_cast<int32_t>((static_cast<uint32_t>(tctx->residual_luma[y*nT+x]) << BitDepthC ) >> BitDepthY ) ) >> 3; |
259 | 0 | } |
260 | 0 | } |
261 | | |
262 | | |
263 | | template <class pixel_t> |
264 | | void transform_coefficients_explicit(thread_context* tctx, |
265 | | int16_t* coeff, int coeffStride, int nT, int trType, |
266 | | pixel_t* dst, int dstStride, int bit_depth, int cIdx) |
267 | 0 | { |
268 | 0 | logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,nT); |
269 | |
|
270 | 0 | const acceleration_functions* acceleration = &tctx->decctx->acceleration; |
271 | |
|
272 | 0 | int32_t residual_buffer[32*32]; |
273 | 0 | int32_t* residual; |
274 | 0 | if (cIdx==0) { |
275 | 0 | residual = tctx->residual_luma; |
276 | 0 | } |
277 | 0 | else { |
278 | 0 | residual = residual_buffer; |
279 | 0 | } |
280 | | |
281 | | |
282 | | // TODO |
283 | 0 | int bdShift = 20 - bit_depth; |
284 | 0 | int max_coeff_bits = 15; |
285 | |
|
286 | 0 | if (trType==1) { |
287 | |
|
288 | 0 | acceleration->transform_idst_4x4(residual, coeff, bdShift, max_coeff_bits); |
289 | |
|
290 | 0 | } else { |
291 | | |
292 | 0 | /**/ if (nT==4) { acceleration->transform_idct_4x4(residual,coeff,bdShift,max_coeff_bits); } |
293 | 0 | else if (nT==8) { acceleration->transform_idct_8x8(residual,coeff,bdShift,max_coeff_bits); } |
294 | 0 | else if (nT==16) { acceleration->transform_idct_16x16(residual,coeff,bdShift,max_coeff_bits); } |
295 | 0 | else { acceleration->transform_idct_32x32(residual,coeff,bdShift,max_coeff_bits); } |
296 | 0 | } |
297 | | |
298 | | |
299 | | //printBlk("prediction",(uint8_t*)dst,nT,dstStride); |
300 | | //printBlk("residual",residual,nT,nT); |
301 | |
|
302 | 0 | if (cIdx != 0) { |
303 | 0 | if (tctx->ResScaleVal != 0) { |
304 | 0 | cross_comp_pred(tctx, residual, nT); |
305 | 0 | } |
306 | | |
307 | | //printBlk("cross-comp-pred modified residual",residual,nT,nT); |
308 | 0 | } |
309 | |
|
310 | 0 | acceleration->add_residual(dst,dstStride, residual,nT, bit_depth); |
311 | 0 | } Unexecuted instantiation: void transform_coefficients_explicit<unsigned short>(thread_context*, short*, int, int, int, unsigned short*, int, int, int) Unexecuted instantiation: void transform_coefficients_explicit<unsigned char>(thread_context*, short*, int, int, int, unsigned char*, int, int, int) |
312 | | |
313 | | |
314 | | void inv_transform(acceleration_functions* acceleration, |
315 | | uint8_t* dst, int dstStride, int16_t* coeff, |
316 | | int log2TbSize, int trType) |
317 | 0 | { |
318 | 0 | if (trType==1) { |
319 | 0 | assert(log2TbSize==2); |
320 | | |
321 | 0 | acceleration->transform_4x4_dst_add_8(dst, coeff, dstStride); |
322 | |
|
323 | 0 | } else { |
324 | 0 | acceleration->transform_add_8[log2TbSize-2](dst,coeff,dstStride); |
325 | 0 | } |
326 | | |
327 | |
|
328 | | #if 0 |
329 | | int nT = 1<<log2TbSize; |
330 | | printf("decoded pixels:\n"); |
331 | | for (int y=0;y<nT;y++,printf("\n")) |
332 | | for (int x=0;x<nT;x++) { |
333 | | printf("%02x ",dst[y*dstStride+x]); |
334 | | } |
335 | | #endif |
336 | 0 | } |
337 | | |
338 | | |
339 | | void fwd_transform(acceleration_functions* acceleration, |
340 | | int16_t* coeff, int coeffStride, int log2TbSize, int trType, |
341 | | const int16_t* src, int srcStride) |
342 | 0 | { |
343 | 0 | logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,1<<log2TbSize); |
344 | |
|
345 | 0 | if (trType==1) { |
346 | | // DST 4x4 |
347 | |
|
348 | 0 | acceleration->fwd_transform_4x4_dst_8(coeff, src, srcStride); |
349 | 0 | } else { |
350 | | // DCT 4x4, 8x8, 16x16, 32x32 |
351 | |
|
352 | 0 | acceleration->fwd_transform_8[log2TbSize-2](coeff,src,srcStride); |
353 | 0 | } |
354 | 0 | } |
355 | | |
356 | | |
357 | | |
// Dequantization scale factors, indexed by (qP % 6).
static const int levelScale[] = {
  40, 45, 51,
  57, 64, 72
};
359 | | |
360 | | // (8.6.2) and (8.6.3) |
361 | | template <class pixel_t> |
362 | | void scale_coefficients_internal(thread_context* tctx, |
363 | | int xT,int yT, // position of TU in frame (chroma adapted) |
364 | | int x0,int y0, // position of CU in frame (chroma adapted) |
365 | | int nT, int cIdx, |
366 | | bool transform_skip_flag, bool intra, int rdpcmMode) |
367 | 0 | { |
368 | 0 | const seq_parameter_set& sps = tctx->img->get_sps(); |
369 | 0 | const pic_parameter_set& pps = tctx->img->get_pps(); |
370 | |
|
371 | 0 | int qP; |
372 | 0 | switch (cIdx) { |
373 | 0 | case 0: qP = tctx->qPYPrime; break; |
374 | 0 | case 1: qP = tctx->qPCbPrime; break; |
375 | 0 | case 2: qP = tctx->qPCrPrime; break; |
376 | 0 | default: qP = 0; assert(0); break; // should never happen |
377 | 0 | } |
378 | | |
379 | 0 | logtrace(LogTransform,"qP: %d\n",qP); |
380 | | |
381 | |
|
382 | 0 | int16_t* coeff; |
383 | 0 | int coeffStride; |
384 | |
|
385 | 0 | coeff = tctx->coeffBuf; |
386 | 0 | coeffStride = nT; |
387 | | |
388 | | |
389 | | |
390 | | |
391 | |
|
392 | 0 | pixel_t* pred; |
393 | 0 | int stride; |
394 | 0 | pred = tctx->img->get_image_plane_at_pos_NEW<pixel_t>(cIdx, xT,yT); |
395 | 0 | stride = tctx->img->get_image_stride(cIdx); |
396 | | |
397 | | // We explicitly include the case for sizeof(pixel_t)==1 so that the compiler |
398 | | // can optimize away a lot of code for 8-bit pixels. |
399 | 0 | const int bit_depth = ((sizeof(pixel_t)==1) ? 8 : sps.get_bit_depth(cIdx)); |
400 | | |
401 | | //assert(intra == (tctx->img->get_pred_mode(xT,yT)==MODE_INTRA)); |
402 | 0 | int cuPredModeIntra = (tctx->img->get_pred_mode(xT,yT)==MODE_INTRA); |
403 | |
|
404 | 0 | bool rotateCoeffs = (sps.range_extension.transform_skip_rotation_enabled_flag && |
405 | 0 | nT == 4 && |
406 | 0 | cuPredModeIntra); |
407 | |
|
408 | 0 | if (tctx->cu_transquant_bypass_flag) { |
409 | |
|
410 | 0 | int32_t residual_buffer[32*32]; |
411 | |
|
412 | 0 | int32_t* residual; |
413 | 0 | if (cIdx==0) residual = tctx->residual_luma; |
414 | 0 | else residual = residual_buffer; |
415 | | |
416 | | |
417 | | // TODO: we could fold the coefficient rotation into the coefficient expansion here: |
418 | 0 | for (int i=0;i<tctx->nCoeff[cIdx];i++) { |
419 | 0 | int32_t currCoeff = tctx->coeffList[cIdx][i]; |
420 | 0 | tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff; |
421 | 0 | } |
422 | |
|
423 | 0 | if (rotateCoeffs) { |
424 | 0 | tctx->decctx->acceleration.rotate_coefficients(coeff, nT); |
425 | 0 | } |
426 | |
|
427 | 0 | if (rdpcmMode) { |
428 | 0 | if (rdpcmMode==2) |
429 | 0 | tctx->decctx->acceleration.transform_bypass_rdpcm_v(residual, coeff, nT); |
430 | 0 | else |
431 | 0 | tctx->decctx->acceleration.transform_bypass_rdpcm_h(residual, coeff, nT); |
432 | 0 | } |
433 | 0 | else { |
434 | 0 | tctx->decctx->acceleration.transform_bypass(residual, coeff, nT); |
435 | 0 | } |
436 | |
|
437 | 0 | if (cIdx != 0) { |
438 | 0 | if (tctx->ResScaleVal != 0) { |
439 | 0 | cross_comp_pred(tctx, residual, nT); |
440 | 0 | } |
441 | 0 | } |
442 | |
|
443 | 0 | tctx->decctx->acceleration.add_residual(pred,stride, residual,nT, bit_depth); |
444 | |
|
445 | 0 | if (rotateCoeffs) { |
446 | 0 | memset(coeff, 0, nT*nT*sizeof(int16_t)); // delete all, because we moved the coeffs around |
447 | 0 | } |
448 | 0 | } |
449 | 0 | else { |
450 | | // (8.6.3) |
451 | |
|
452 | 0 | int bdShift = (cIdx==0 ? sps.BitDepth_Y : sps.BitDepth_C) + Log2(nT) - 5; |
453 | |
|
454 | 0 | logtrace(LogTransform,"bdShift=%d\n",bdShift); |
455 | |
|
456 | 0 | logtrace(LogTransform,"dequant %d;%d cIdx=%d qp=%d\n",xT*(cIdx?2:1),yT*(cIdx?2:1),cIdx,qP); |
457 | | |
458 | | |
459 | | // --- inverse quantization --- |
460 | |
|
461 | 0 | if (sps.scaling_list_enable_flag==0) { |
462 | | |
463 | | //const int m_x_y = 16; |
464 | 0 | const int m_x_y = 1; |
465 | 0 | bdShift -= 4; // this is equivalent to having a m_x_y of 16 and we can use 32bit integers |
466 | |
|
467 | 0 | const int offset = (1<<(bdShift-1)); |
468 | 0 | const int fact = m_x_y * levelScale[qP%6] << (qP/6); |
469 | |
|
470 | 0 | for (int i=0;i<tctx->nCoeff[cIdx];i++) { |
471 | |
|
472 | 0 | int64_t currCoeff = tctx->coeffList[cIdx][i]; |
473 | | |
474 | | //logtrace(LogTransform,"coefficient[%d] = %d\n",tctx->coeffPos[cIdx][i], |
475 | | //tctx->coeffList[cIdx][i]); |
476 | |
|
477 | 0 | currCoeff = Clip3(-32768,32767, |
478 | 0 | ( (currCoeff * fact + offset ) >> bdShift)); |
479 | | |
480 | | //logtrace(LogTransform," -> %d\n",currCoeff); |
481 | |
|
482 | 0 | tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff; |
483 | 0 | } |
484 | 0 | } |
485 | 0 | else { |
486 | 0 | const int offset = (1<<(bdShift-1)); |
487 | |
|
488 | 0 | const uint8_t* sclist; |
489 | 0 | int matrixID = cIdx; |
490 | |
|
491 | 0 | if (nT==32) { |
492 | 0 | matrixID=0; |
493 | 0 | } |
494 | |
|
495 | 0 | if (!intra) { |
496 | 0 | if (nT<32) { matrixID += 3; } |
497 | 0 | else { matrixID++; } |
498 | 0 | } |
499 | |
|
500 | 0 | switch (nT) { |
501 | 0 | case 4: sclist = &pps.scaling_list.ScalingFactor_Size0[matrixID][0][0]; break; |
502 | 0 | case 8: sclist = &pps.scaling_list.ScalingFactor_Size1[matrixID][0][0]; break; |
503 | 0 | case 16: sclist = &pps.scaling_list.ScalingFactor_Size2[matrixID][0][0]; break; |
504 | 0 | case 32: sclist = &pps.scaling_list.ScalingFactor_Size3[matrixID][0][0]; break; |
505 | 0 | default: assert(0); sclist = nullptr; |
506 | 0 | } |
507 | | |
508 | 0 | for (int i=0;i<tctx->nCoeff[cIdx];i++) { |
509 | 0 | int pos = tctx->coeffPos[cIdx][i]; |
510 | |
|
511 | 0 | const int m_x_y = sclist[pos]; |
512 | 0 | const int fact = m_x_y * levelScale[qP%6] << (qP/6); |
513 | |
|
514 | 0 | int64_t currCoeff = tctx->coeffList[cIdx][i]; |
515 | |
|
516 | 0 | currCoeff = Clip3(-32768,32767, |
517 | 0 | ( (currCoeff * fact + offset ) >> bdShift)); |
518 | |
|
519 | 0 | tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff; |
520 | 0 | } |
521 | 0 | } |
522 | | |
523 | | |
524 | | // --- do transform or skip --- |
525 | | |
526 | 0 | logtrace(LogTransform,"coefficients OUT:\n"); |
527 | 0 | for (int y=0;y<nT;y++) { |
528 | 0 | logtrace(LogTransform," "); |
529 | 0 | for (int x=0;x<nT;x++) { |
530 | 0 | logtrace(LogTransform,"*%3d ", coeff[x+y*coeffStride]); |
531 | 0 | } |
532 | 0 | logtrace(LogTransform,"*\n"); |
533 | 0 | } |
534 | |
|
535 | | #ifdef DE265_LOG_TRACE |
536 | | int bdShift2 = (cIdx==0) ? 20-sps.BitDepth_Y : 20-sps.BitDepth_C; |
537 | | #endif |
538 | |
|
539 | 0 | logtrace(LogTransform,"bdShift2=%d\n",bdShift2); |
540 | |
|
541 | 0 | logtrace(LogSlice,"get_transform_skip_flag(%d,%d, cIdx=%d)=%d\n",xT,yT,cIdx, |
542 | 0 | transform_skip_flag); |
543 | |
|
544 | 0 | if (transform_skip_flag) { |
545 | |
|
546 | 0 | int extended_precision_processing_flag = 0; |
547 | 0 | int Log2nTbS = Log2(nT); |
548 | 0 | int bdShift = libde265_max( 20 - bit_depth, extended_precision_processing_flag ? 11 : 0 ); |
549 | 0 | int tsShift = (extended_precision_processing_flag ? libde265_min( 5, bdShift - 2 ) : 5 ) |
550 | 0 | + Log2nTbS; |
551 | |
|
552 | 0 | if (rotateCoeffs) { |
553 | 0 | tctx->decctx->acceleration.rotate_coefficients(coeff, nT); |
554 | 0 | } |
555 | |
|
556 | 0 | int32_t residual_buffer[32*32]; |
557 | |
|
558 | 0 | int32_t* residual; |
559 | 0 | if (cIdx==0) residual = tctx->residual_luma; |
560 | 0 | else residual = residual_buffer; |
561 | |
|
562 | 0 | if (rdpcmMode) { |
563 | | /* |
564 | | if (rdpcmMode==2) |
565 | | tctx->decctx->acceleration.transform_skip_rdpcm_v(pred,coeff, Log2(nT), stride, bit_depth); |
566 | | else |
567 | | tctx->decctx->acceleration.transform_skip_rdpcm_h(pred,coeff, Log2(nT), stride, bit_depth); |
568 | | */ |
569 | |
|
570 | 0 | if (rdpcmMode==2) |
571 | 0 | tctx->decctx->acceleration.rdpcm_v(residual, coeff,nT, tsShift,bdShift); |
572 | 0 | else |
573 | 0 | tctx->decctx->acceleration.rdpcm_h(residual, coeff,nT, tsShift,bdShift); |
574 | 0 | } |
575 | 0 | else { |
576 | | //tctx->decctx->acceleration.transform_skip(pred, coeff, stride, bit_depth); |
577 | |
|
578 | 0 | tctx->decctx->acceleration.transform_skip_residual(residual, coeff, nT, tsShift, bdShift); |
579 | 0 | } |
580 | |
|
581 | 0 | if (cIdx != 0) { |
582 | 0 | if (tctx->ResScaleVal != 0) { |
583 | 0 | cross_comp_pred(tctx, residual, nT); |
584 | 0 | } |
585 | 0 | } |
586 | |
|
587 | 0 | tctx->decctx->acceleration.add_residual(pred,stride, residual,nT, bit_depth); |
588 | |
|
589 | 0 | if (rotateCoeffs) { |
590 | 0 | memset(coeff, 0, nT*nT*sizeof(int16_t)); // delete all, because we moved the coeffs around |
591 | 0 | } |
592 | 0 | } |
593 | 0 | else { |
594 | 0 | int trType; |
595 | | |
596 | | //if (nT==4 && cIdx==0 && tctx->img->get_pred_mode(xT,yT)==MODE_INTRA) { |
597 | 0 | if (nT==4 && cIdx==0 && cuPredModeIntra) { |
598 | 0 | trType=1; |
599 | 0 | } |
600 | 0 | else { |
601 | 0 | trType=0; |
602 | 0 | } |
603 | |
|
604 | 0 | assert(rdpcmMode==0); |
605 | | |
606 | | |
607 | 0 | if (tctx->img->get_pps().range_extension.cross_component_prediction_enabled_flag) { |
608 | | // cross-component-prediction: transform to residual buffer and add in a separate step |
609 | |
|
610 | 0 | transform_coefficients_explicit(tctx, coeff, coeffStride, nT, trType, |
611 | 0 | pred, stride, bit_depth, cIdx); |
612 | 0 | } |
613 | 0 | else { |
614 | 0 | transform_coefficients(&tctx->decctx->acceleration, coeff, coeffStride, nT, trType, |
615 | 0 | pred, stride, bit_depth); |
616 | 0 | } |
617 | 0 | } |
618 | 0 | } |
619 | | |
620 | | |
621 | 0 | logtrace(LogTransform,"pixels (cIdx:%d), position %d %d:\n",cIdx, xT,yT); |
622 | |
|
623 | 0 | for (int y=0;y<nT;y++) { |
624 | 0 | logtrace(LogTransform,"RECO-%3d-%3d-%d ",xT,yT+y,cIdx); |
625 | |
|
626 | 0 | for (int x=0;x<nT;x++) { |
627 | 0 | logtrace(LogTransform,"*%03x ", pred[x+y*stride]); |
628 | 0 | } |
629 | |
|
630 | 0 | logtrace(LogTransform,"*\n"); |
631 | 0 | } |
632 | | |
633 | | // zero out scrap coefficient buffer again |
634 | |
|
635 | 0 | for (int i=0;i<tctx->nCoeff[cIdx];i++) { |
636 | 0 | tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = 0; |
637 | 0 | } |
638 | 0 | } Unexecuted instantiation: void scale_coefficients_internal<unsigned short>(thread_context*, int, int, int, int, int, int, bool, bool, int) Unexecuted instantiation: void scale_coefficients_internal<unsigned char>(thread_context*, int, int, int, int, int, int, bool, bool, int) |
639 | | |
640 | | |
641 | | void scale_coefficients(thread_context* tctx, |
642 | | int xT,int yT, // position of TU in frame (chroma adapted) |
643 | | int x0,int y0, // position of CU in frame (chroma adapted) |
644 | | int nT, int cIdx, |
645 | | bool transform_skip_flag, bool intra, |
646 | | int rdpcmMode // 0 - off, 1 - Horizontal, 2 - Vertical |
647 | | ) |
648 | 0 | { |
649 | 0 | if (tctx->img->high_bit_depth(cIdx)) { |
650 | 0 | scale_coefficients_internal<uint16_t>(tctx, xT,yT, x0,y0, nT,cIdx, transform_skip_flag, intra, |
651 | 0 | rdpcmMode); |
652 | 0 | } else { |
653 | 0 | scale_coefficients_internal<uint8_t> (tctx, xT,yT, x0,y0, nT,cIdx, transform_skip_flag, intra, |
654 | 0 | rdpcmMode); |
655 | 0 | } |
656 | 0 | } |
657 | | |
658 | | |
//#define QUANT_IQUANT_SHIFT    20 // Q(QP%6) * IQ(QP%6) = 2^20
#define QUANT_SHIFT             14 // Q(4) = 2^14
//#define SCALE_BITS            15 // Inherited from TMuC, presumably for fractional bit estimates in RDOQ
#define MAX_TR_DYNAMIC_RANGE    15 // Maximum transform dynamic range (excluding sign bit)


// Forward quantization scale factors, indexed by (qp % 6).
static const uint16_t g_quantScales[6] = {
  26214, 23302, 20560,
  18396, 16384, 14564
};
668 | | |
669 | | void quant_coefficients(//encoder_context* ectx, |
670 | | int16_t* out_coeff, |
671 | | const int16_t* in_coeff, |
672 | | int log2TrSize, int qp, |
673 | | bool intra) |
674 | 0 | { |
675 | 0 | const int qpDiv6 = qp / 6; |
676 | 0 | const int qpMod6 = qp % 6; |
677 | | |
678 | | //int uiLog2TrSize = xLog2( iWidth - 1); |
679 | |
|
680 | 0 | int uiQ = g_quantScales[qpMod6]; |
681 | 0 | int bitDepth = 8; |
682 | 0 | int transformShift = MAX_TR_DYNAMIC_RANGE - bitDepth - log2TrSize; // Represents scaling through forward transform |
683 | 0 | int qBits = QUANT_SHIFT + qpDiv6 + transformShift; |
684 | | |
685 | | /* TODO: originally, this was checking for intra slices, why not for intra mode ? |
686 | | */ |
687 | 0 | int rnd = (intra ? 171 : 85) << (qBits-9); |
688 | |
|
689 | 0 | int x, y; |
690 | |
|
691 | 0 | int nStride = (1<<log2TrSize); |
692 | |
|
693 | 0 | for (y=0; y < (1<<log2TrSize) ; y++) { |
694 | 0 | for (x=0; x < (1<<log2TrSize) ; x++) { |
695 | 0 | int level; |
696 | 0 | int sign; |
697 | 0 | int blockPos = y * nStride + x; |
698 | 0 | level = in_coeff[blockPos]; |
699 | | //logtrace(LogTransform,"(%d,%d) %d -> ", x,y,level); |
700 | 0 | sign = (level < 0 ? -1: 1); |
701 | |
|
702 | 0 | level = (abs_value(level) * uiQ + rnd ) >> qBits; |
703 | 0 | level *= sign; |
704 | 0 | out_coeff[blockPos] = Clip3(-32768, 32767, level); |
705 | | //logtrace(LogTransform,"%d\n", out_coeff[blockPos]); |
706 | 0 | } |
707 | 0 | } |
708 | 0 | } |
709 | | |
710 | | |
711 | | void dequant_coefficients(int16_t* out_coeff, |
712 | | const int16_t* in_coeff, |
713 | | int log2TrSize, int qP) |
714 | 0 | { |
715 | 0 | const int m_x_y = 1; |
716 | 0 | int bitDepth = 8; |
717 | 0 | int bdShift = bitDepth + log2TrSize - 5; |
718 | 0 | bdShift -= 4; // this is equivalent to having a m_x_y of 16 and we can use 32bit integers |
719 | |
|
720 | 0 | const int offset = (1<<(bdShift-1)); |
721 | 0 | const int fact = m_x_y * levelScale[qP%6] << (qP/6); |
722 | | |
723 | | //int blkSize = (1<<log2TrSize); |
724 | 0 | int nCoeff = (1<<(log2TrSize<<1)); |
725 | |
|
726 | 0 | for (int i=0;i<nCoeff;i++) { |
727 | | |
728 | | // usually, this needs to be 64bit, but because we modify the shift above, we can use 16 bit |
729 | 0 | int32_t currCoeff = in_coeff[i]; |
730 | | |
731 | | //logtrace(LogTransform,"coefficient[%d] = %d\n",i,currCoeff); |
732 | |
|
733 | 0 | currCoeff = Clip3(-32768,32767, |
734 | 0 | ( (currCoeff * fact + offset ) >> bdShift)); |
735 | | |
736 | | //logtrace(LogTransform," -> %d\n",currCoeff); |
737 | |
|
738 | 0 | out_coeff[i] = currCoeff; |
739 | 0 | } |
740 | 0 | } |