Coverage Report

Created: 2025-07-23 08:18

/src/libde265/libde265/transform.cc
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * H.265 video codec.
3
 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
4
 *
5
 * This file is part of libde265.
6
 *
7
 * libde265 is free software: you can redistribute it and/or modify
8
 * it under the terms of the GNU Lesser General Public License as
9
 * published by the Free Software Foundation, either version 3 of
10
 * the License, or (at your option) any later version.
11
 *
12
 * libde265 is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 * GNU Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public License
18
 * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
19
 */
20
21
#include "transform.h"
22
#include "util.h"
23
24
#include <assert.h>
25
26
27
// 13-entry lookup table of QP values; a 14th entry (37) is deliberately commented out.
// NOTE(review): presumably the chroma QP mapping for qPi = 30..42 that backs
// table8_22() (H.265 chroma QP table) -- the lookup itself is not visible here; confirm.
const int tab8_22[] = { 29,30,31,32,33,33,34,34,35,35,36,36,37 /*,37*/ };
28
29
30
// (8.6.1)
31
void decode_quantization_parameters(thread_context* tctx, int xC,int yC,
32
                                    int xCUBase, int yCUBase)
33
5.08M
{
34
5.08M
  logtrace(LogTransform,">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> decode_quantization_parameters(int xC,int yC)=(%d,%d)\n", xC,yC);
35
36
5.08M
  const pic_parameter_set& pps = tctx->img->get_pps();
37
5.08M
  const seq_parameter_set& sps = tctx->img->get_sps();
38
5.08M
  slice_segment_header* shdr = tctx->shdr;
39
40
  // top left pixel position of current quantization group
41
5.08M
  int xQG = xCUBase - (xCUBase & ((1<<pps.Log2MinCuQpDeltaSize)-1));
42
5.08M
  int yQG = yCUBase - (yCUBase & ((1<<pps.Log2MinCuQpDeltaSize)-1));
43
44
5.08M
  logtrace(LogTransform,"QG: %d,%d\n",xQG,yQG);
45
46
47
  // we only have to set QP in the first call in a quantization-group
48
49
  /* TODO: check why this does not work with HoneyBee stream
50
51
  if (xQG == tctx->currentQG_x &&
52
      yQG == tctx->currentQG_y)
53
    {
54
      return;
55
    }
56
  */
57
58
  // if first QG in CU, remember last QPY of last CU previous QG
59
60
5.08M
  if (xQG != tctx->currentQG_x ||
61
5.08M
      yQG != tctx->currentQG_y)
62
1.45M
    {
63
1.45M
      tctx->lastQPYinPreviousQG = tctx->currentQPY;
64
1.45M
      tctx->currentQG_x = xQG;
65
1.45M
      tctx->currentQG_y = yQG;
66
1.45M
    }
67
68
5.08M
  int qPY_PRED;
69
70
  // first QG in CTB row ?
71
72
5.08M
  int ctbLSBMask = ((1<<sps.Log2CtbSizeY)-1);
73
5.08M
  bool firstInCTBRow = (xQG == 0 && ((yQG & ctbLSBMask)==0));
74
75
  // first QG in slice ?    TODO: a "firstQG" flag in the thread context would be faster
76
77
5.08M
  int first_ctb_in_slice_RS = tctx->shdr->SliceAddrRS;
78
79
5.08M
  int SliceStartX = (first_ctb_in_slice_RS % sps.PicWidthInCtbsY) * sps.CtbSizeY;
80
5.08M
  int SliceStartY = (first_ctb_in_slice_RS / sps.PicWidthInCtbsY) * sps.CtbSizeY;
81
82
5.08M
  bool firstQGInSlice = (SliceStartX == xQG && SliceStartY == yQG);
83
84
  // first QG in tile ?
85
86
5.08M
  bool firstQGInTile = false;
87
5.08M
  if (pps.tiles_enabled_flag) {
88
914k
    if ((xQG & ((1 << sps.Log2CtbSizeY)-1)) == 0 &&
89
914k
        (yQG & ((1 << sps.Log2CtbSizeY)-1)) == 0)
90
878k
      {
91
878k
        int ctbX = xQG >> sps.Log2CtbSizeY;
92
878k
        int ctbY = yQG >> sps.Log2CtbSizeY;
93
94
878k
        firstQGInTile = pps.is_tile_start_CTB(ctbX,ctbY); // TODO: this is slow
95
878k
      }
96
914k
  }
97
98
99
5.08M
  if (firstQGInSlice || firstQGInTile ||
100
5.08M
      (firstInCTBRow && pps.entropy_coding_sync_enabled_flag)) {
101
1.39M
    qPY_PRED = tctx->shdr->SliceQPY;
102
1.39M
  }
103
3.68M
  else {
104
3.68M
    qPY_PRED = tctx->lastQPYinPreviousQG;
105
3.68M
  }
106
107
108
5.08M
  int qPYA,qPYB;
109
110
5.08M
  if (tctx->img->available_zscan(xQG,yQG, xQG-1,yQG)) {
111
3.24M
    int xTmp = (xQG-1) >> sps.Log2MinTrafoSize;
112
3.24M
    int yTmp = (yQG  ) >> sps.Log2MinTrafoSize;
113
3.24M
    int minTbAddrA = pps.MinTbAddrZS[xTmp + yTmp*sps.PicWidthInTbsY];
114
3.24M
    int ctbAddrA = minTbAddrA >> (2 * (sps.Log2CtbSizeY-sps.Log2MinTrafoSize));
115
3.24M
    if (ctbAddrA == tctx->CtbAddrInTS) {
116
1.05M
      qPYA = tctx->img->get_QPY(xQG-1,yQG);
117
1.05M
    }
118
2.18M
    else {
119
2.18M
      qPYA = qPY_PRED;
120
2.18M
    }
121
3.24M
  }
122
1.83M
  else {
123
1.83M
    qPYA = qPY_PRED;
124
1.83M
  }
125
126
5.08M
  if (tctx->img->available_zscan(xQG,yQG, xQG,yQG-1)) {
127
2.97M
    int xTmp = (xQG  ) >> sps.Log2MinTrafoSize;
128
2.97M
    int yTmp = (yQG-1) >> sps.Log2MinTrafoSize;
129
2.97M
    int minTbAddrB = pps.MinTbAddrZS[xTmp + yTmp*sps.PicWidthInTbsY];
130
2.97M
    int ctbAddrB = minTbAddrB >> (2 * (sps.Log2CtbSizeY-sps.Log2MinTrafoSize));
131
2.97M
    if (ctbAddrB == tctx->CtbAddrInTS) {
132
1.04M
      qPYB = tctx->img->get_QPY(xQG,yQG-1);
133
1.04M
    }
134
1.93M
    else {
135
1.93M
      qPYB = qPY_PRED;
136
1.93M
    }
137
2.97M
  }
138
2.10M
  else {
139
2.10M
    qPYB = qPY_PRED;
140
2.10M
  }
141
142
5.08M
  qPY_PRED = (qPYA + qPYB + 1)>>1;
143
144
5.08M
  logtrace(LogTransform,"qPY_PRED = %d  (%d, %d)\n",qPY_PRED, qPYA, qPYB);
145
146
5.08M
  int QPY = ((qPY_PRED + tctx->CuQpDelta + 52+2*sps.QpBdOffset_Y) %
147
5.08M
             (52 + sps.QpBdOffset_Y)) - sps.QpBdOffset_Y;
148
149
5.08M
  tctx->qPYPrime = QPY + sps.QpBdOffset_Y;
150
5.08M
  if (tctx->qPYPrime<0) {
151
46.5k
    tctx->qPYPrime=0;
152
46.5k
  }
153
154
5.08M
  int qPiCb = Clip3(-sps.QpBdOffset_C,57, QPY+pps.pic_cb_qp_offset + shdr->slice_cb_qp_offset + tctx->CuQpOffsetCb);
155
5.08M
  int qPiCr = Clip3(-sps.QpBdOffset_C,57, QPY+pps.pic_cr_qp_offset + shdr->slice_cr_qp_offset + tctx->CuQpOffsetCr);
156
157
5.08M
  logtrace(LogTransform,"qPiCb:%d (%d %d), qPiCr:%d (%d %d)\n",
158
5.08M
           qPiCb, pps.pic_cb_qp_offset, shdr->slice_cb_qp_offset,
159
5.08M
           qPiCr, pps.pic_cr_qp_offset, shdr->slice_cr_qp_offset);
160
161
5.08M
  int qPCb,qPCr;
162
163
5.08M
  if (sps.ChromaArrayType == CHROMA_420) {
164
2.49M
    qPCb = table8_22(qPiCb);
165
2.49M
    qPCr = table8_22(qPiCr);
166
2.49M
  }
167
2.58M
  else {
168
2.58M
    qPCb = qPiCb;
169
2.58M
    qPCr = qPiCr;
170
2.58M
  }
171
172
  //printf("q: %d %d\n",qPiCb, qPCb);
173
174
5.08M
  tctx->qPCbPrime = qPCb + sps.QpBdOffset_C;
175
5.08M
  if (tctx->qPCbPrime<0) {
176
0
    tctx->qPCbPrime = 0;
177
0
  }
178
179
5.08M
  tctx->qPCrPrime = qPCr + sps.QpBdOffset_C;
180
5.08M
  if (tctx->qPCrPrime<0) {
181
0
    tctx->qPCrPrime = 0;
182
0
  }
183
184
  /*
185
  printf("Q: %d (%d %d %d / %d %d) %d %d %d\n",QPY,
186
         sps->QpBdOffset_Y,
187
         pps->pic_cb_qp_offset + shdr->slice_cb_qp_offset,
188
         pps->pic_cr_qp_offset + shdr->slice_cr_qp_offset,
189
         sps->QpBdOffset_C, sps->QpBdOffset_C,
190
         tctx->qPYPrime, tctx->qPCbPrime, tctx->qPCrPrime);
191
  */
192
193
5.08M
  int log2CbSize = tctx->img->get_log2CbSize(xCUBase, yCUBase);
194
195
  // TODO: On broken input, log2CbSize may be zero (multithreaded only). Not sure yet why.
196
  // Maybe another decoding thread is overwriting the value set in slice.cc:read_coding_unit.
197
  // id:000163,sig:06,src:002041,op:havoc,rep:16.bin
198
5.08M
  if (log2CbSize<3) { log2CbSize=3; }
199
200
5.08M
  tctx->img->set_QPY(xCUBase, yCUBase, log2CbSize, QPY);
201
5.08M
  tctx->currentQPY = QPY;
202
203
  /*
204
  printf("SET QPY POC=%d %d;%d-%d;%d = %d\n",ctx->img->PicOrderCntVal,xCUBase,yCUBase,
205
         xCUBase+(1<<log2CbSize),yCUBase+(1<<log2CbSize), QPY);
206
  */
207
208
5.08M
  logtrace(LogTransform,"qPY(%d,%d,%d)= %d, qPYPrime=%d\n",
209
5.08M
           xCUBase,yCUBase,1<<log2CbSize,QPY,tctx->qPYPrime);
210
5.08M
}
211
212
213
214
template <class pixel_t>
215
void transform_coefficients(acceleration_functions* acceleration,
216
                            int16_t* coeff, int coeffStride, int nT, int trType,
217
                            pixel_t* dst, int dstStride, int bit_depth)
218
6.52M
{
219
6.52M
  logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,nT);
220
221
222
6.52M
  if (trType==1) {
223
224
3.25M
    acceleration->transform_4x4_dst_add<pixel_t>(dst, coeff, dstStride, bit_depth);
225
226
3.27M
  } else {
227
228
3.27M
    /**/ if (nT==4)  { acceleration->transform_add<pixel_t>(0,dst,coeff,dstStride, bit_depth); }
229
627k
    else if (nT==8)  { acceleration->transform_add<pixel_t>(1,dst,coeff,dstStride, bit_depth); }
230
68.3k
    else if (nT==16) { acceleration->transform_add<pixel_t>(2,dst,coeff,dstStride, bit_depth); }
231
19.8k
    else             { acceleration->transform_add<pixel_t>(3,dst,coeff,dstStride, bit_depth); }
232
3.27M
  }
233
234
#if 0
235
  printf("decoded pixels:\n");
236
  for (int y=0;y<nT;y++,printf("\n"))
237
    for (int x=0;x<nT;x++) {
238
      printf("%02x ",dst[y*dstStride+x]);
239
    }
240
#endif
241
6.52M
}
void transform_coefficients<unsigned short>(acceleration_functions*, short*, int, int, int, unsigned short*, int, int)
Line
Count
Source
218
2.79M
{
219
2.79M
  logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,nT);
220
221
222
2.79M
  if (trType==1) {
223
224
1.47M
    acceleration->transform_4x4_dst_add<pixel_t>(dst, coeff, dstStride, bit_depth);
225
226
1.47M
  } else {
227
228
1.32M
    /**/ if (nT==4)  { acceleration->transform_add<pixel_t>(0,dst,coeff,dstStride, bit_depth); }
229
228k
    else if (nT==8)  { acceleration->transform_add<pixel_t>(1,dst,coeff,dstStride, bit_depth); }
230
24.7k
    else if (nT==16) { acceleration->transform_add<pixel_t>(2,dst,coeff,dstStride, bit_depth); }
231
8.18k
    else             { acceleration->transform_add<pixel_t>(3,dst,coeff,dstStride, bit_depth); }
232
1.32M
  }
233
234
#if 0
235
  printf("decoded pixels:\n");
236
  for (int y=0;y<nT;y++,printf("\n"))
237
    for (int x=0;x<nT;x++) {
238
      printf("%02x ",dst[y*dstStride+x]);
239
    }
240
#endif
241
2.79M
}
void transform_coefficients<unsigned char>(acceleration_functions*, short*, int, int, int, unsigned char*, int, int)
Line
Count
Source
218
3.72M
{
219
3.72M
  logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,nT);
220
221
222
3.72M
  if (trType==1) {
223
224
1.78M
    acceleration->transform_4x4_dst_add<pixel_t>(dst, coeff, dstStride, bit_depth);
225
226
1.94M
  } else {
227
228
1.94M
    /**/ if (nT==4)  { acceleration->transform_add<pixel_t>(0,dst,coeff,dstStride, bit_depth); }
229
399k
    else if (nT==8)  { acceleration->transform_add<pixel_t>(1,dst,coeff,dstStride, bit_depth); }
230
43.5k
    else if (nT==16) { acceleration->transform_add<pixel_t>(2,dst,coeff,dstStride, bit_depth); }
231
11.6k
    else             { acceleration->transform_add<pixel_t>(3,dst,coeff,dstStride, bit_depth); }
232
1.94M
  }
233
234
#if 0
235
  printf("decoded pixels:\n");
236
  for (int y=0;y<nT;y++,printf("\n"))
237
    for (int x=0;x<nT;x++) {
238
      printf("%02x ",dst[y*dstStride+x]);
239
    }
240
#endif
241
3.72M
}
242
243
244
// TODO: make this an accelerated function
245
void cross_comp_pred(const thread_context* tctx, int32_t* residual, int nT)
246
53.4k
{
247
53.4k
  const int BitDepthC = tctx->img->get_sps().BitDepth_C;
248
53.4k
  const int BitDepthY = tctx->img->get_sps().BitDepth_Y;
249
250
376k
  for (int y=0;y<nT;y++)
251
2.99M
    for (int x=0;x<nT;x++) {
252
      /* TODO: the most usual case is definitely BitDepthY == BitDepthC, in which case
253
         we could just omit two shifts. The second most common case is probably
254
         BitDepthY>BitDepthC, for which we could also eliminate one shift. The remaining
255
         case is also one shift only.
256
      */
257
258
2.66M
      residual[y*nT+x] += (tctx->ResScaleVal *
259
2.66M
                           ((tctx->residual_luma[y*nT+x] << BitDepthC ) >> BitDepthY ) ) >> 3;
260
2.66M
    }
261
53.4k
}
262
263
264
template <class pixel_t>
265
void transform_coefficients_explicit(thread_context* tctx,
266
                                     int16_t* coeff, int coeffStride, int nT, int trType,
267
                                     pixel_t* dst, int dstStride, int bit_depth, int cIdx)
268
163k
{
269
163k
  logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,nT);
270
271
163k
  const acceleration_functions* acceleration = &tctx->decctx->acceleration;
272
273
163k
  int32_t residual_buffer[32*32];
274
163k
  int32_t* residual;
275
163k
  if (cIdx==0) {
276
81.3k
    residual = tctx->residual_luma;
277
81.3k
  }
278
82.4k
  else {
279
82.4k
    residual = residual_buffer;
280
82.4k
  }
281
282
283
  // TODO
284
163k
  int bdShift = 20 - bit_depth;
285
163k
  int max_coeff_bits = 15;
286
287
163k
  if (trType==1) {
288
289
56.3k
    acceleration->transform_idst_4x4(residual, coeff, bdShift, max_coeff_bits);
290
291
107k
  } else {
292
293
107k
    /**/ if (nT==4)  { acceleration->transform_idct_4x4(residual,coeff,bdShift,max_coeff_bits); }
294
54.8k
    else if (nT==8)  { acceleration->transform_idct_8x8(residual,coeff,bdShift,max_coeff_bits); }
295
12.5k
    else if (nT==16) { acceleration->transform_idct_16x16(residual,coeff,bdShift,max_coeff_bits); }
296
457
    else             { acceleration->transform_idct_32x32(residual,coeff,bdShift,max_coeff_bits); }
297
107k
  }
298
299
300
  //printBlk("prediction",(uint8_t*)dst,nT,dstStride);
301
  //printBlk("residual",residual,nT,nT);
302
303
163k
  if (cIdx != 0) {
304
82.4k
    if (tctx->ResScaleVal != 0) {
305
53.0k
      cross_comp_pred(tctx, residual, nT);
306
53.0k
    }
307
308
    //printBlk("cross-comp-pred modified residual",residual,nT,nT);
309
82.4k
  }
310
311
163k
  acceleration->add_residual(dst,dstStride, residual,nT, bit_depth);
312
163k
}
void transform_coefficients_explicit<unsigned short>(thread_context*, short*, int, int, int, unsigned short*, int, int, int)
Line
Count
Source
268
24.7k
{
269
24.7k
  logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,nT);
270
271
24.7k
  const acceleration_functions* acceleration = &tctx->decctx->acceleration;
272
273
24.7k
  int32_t residual_buffer[32*32];
274
24.7k
  int32_t* residual;
275
24.7k
  if (cIdx==0) {
276
0
    residual = tctx->residual_luma;
277
0
  }
278
24.7k
  else {
279
24.7k
    residual = residual_buffer;
280
24.7k
  }
281
282
283
  // TODO
284
24.7k
  int bdShift = 20 - bit_depth;
285
24.7k
  int max_coeff_bits = 15;
286
287
24.7k
  if (trType==1) {
288
289
0
    acceleration->transform_idst_4x4(residual, coeff, bdShift, max_coeff_bits);
290
291
24.7k
  } else {
292
293
24.7k
    /**/ if (nT==4)  { acceleration->transform_idct_4x4(residual,coeff,bdShift,max_coeff_bits); }
294
12.0k
    else if (nT==8)  { acceleration->transform_idct_8x8(residual,coeff,bdShift,max_coeff_bits); }
295
1.79k
    else if (nT==16) { acceleration->transform_idct_16x16(residual,coeff,bdShift,max_coeff_bits); }
296
0
    else             { acceleration->transform_idct_32x32(residual,coeff,bdShift,max_coeff_bits); }
297
24.7k
  }
298
299
300
  //printBlk("prediction",(uint8_t*)dst,nT,dstStride);
301
  //printBlk("residual",residual,nT,nT);
302
303
24.7k
  if (cIdx != 0) {
304
24.7k
    if (tctx->ResScaleVal != 0) {
305
17.6k
      cross_comp_pred(tctx, residual, nT);
306
17.6k
    }
307
308
    //printBlk("cross-comp-pred modified residual",residual,nT,nT);
309
24.7k
  }
310
311
24.7k
  acceleration->add_residual(dst,dstStride, residual,nT, bit_depth);
312
24.7k
}
void transform_coefficients_explicit<unsigned char>(thread_context*, short*, int, int, int, unsigned char*, int, int, int)
Line
Count
Source
268
139k
{
269
139k
  logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,nT);
270
271
139k
  const acceleration_functions* acceleration = &tctx->decctx->acceleration;
272
273
139k
  int32_t residual_buffer[32*32];
274
139k
  int32_t* residual;
275
139k
  if (cIdx==0) {
276
81.3k
    residual = tctx->residual_luma;
277
81.3k
  }
278
57.6k
  else {
279
57.6k
    residual = residual_buffer;
280
57.6k
  }
281
282
283
  // TODO
284
139k
  int bdShift = 20 - bit_depth;
285
139k
  int max_coeff_bits = 15;
286
287
139k
  if (trType==1) {
288
289
56.3k
    acceleration->transform_idst_4x4(residual, coeff, bdShift, max_coeff_bits);
290
291
82.7k
  } else {
292
293
82.7k
    /**/ if (nT==4)  { acceleration->transform_idct_4x4(residual,coeff,bdShift,max_coeff_bits); }
294
42.7k
    else if (nT==8)  { acceleration->transform_idct_8x8(residual,coeff,bdShift,max_coeff_bits); }
295
10.7k
    else if (nT==16) { acceleration->transform_idct_16x16(residual,coeff,bdShift,max_coeff_bits); }
296
457
    else             { acceleration->transform_idct_32x32(residual,coeff,bdShift,max_coeff_bits); }
297
82.7k
  }
298
299
300
  //printBlk("prediction",(uint8_t*)dst,nT,dstStride);
301
  //printBlk("residual",residual,nT,nT);
302
303
139k
  if (cIdx != 0) {
304
57.6k
    if (tctx->ResScaleVal != 0) {
305
35.3k
      cross_comp_pred(tctx, residual, nT);
306
35.3k
    }
307
308
    //printBlk("cross-comp-pred modified residual",residual,nT,nT);
309
57.6k
  }
310
311
139k
  acceleration->add_residual(dst,dstStride, residual,nT, bit_depth);
312
139k
}
313
314
315
void inv_transform(acceleration_functions* acceleration,
316
                   uint8_t* dst, int dstStride, int16_t* coeff,
317
                   int log2TbSize, int trType)
318
0
{
319
0
  if (trType==1) {
320
0
    assert(log2TbSize==2);
321
322
0
    acceleration->transform_4x4_dst_add_8(dst, coeff, dstStride);
323
324
0
  } else {
325
0
    acceleration->transform_add_8[log2TbSize-2](dst,coeff,dstStride);
326
0
  }
327
328
329
#if 0
330
  int nT = 1<<log2TbSize;
331
  printf("decoded pixels:\n");
332
  for (int y=0;y<nT;y++,printf("\n"))
333
    for (int x=0;x<nT;x++) {
334
  printf("%02x ",dst[y*dstStride+x]);
335
}
336
#endif
337
0
}
338
339
340
void fwd_transform(acceleration_functions* acceleration,
341
                   int16_t* coeff, int coeffStride, int log2TbSize, int trType,
342
                   const int16_t* src, int srcStride)
343
0
{
344
0
  logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,1<<log2TbSize);
345
346
0
  if (trType==1) {
347
    // DST 4x4
348
349
0
    acceleration->fwd_transform_4x4_dst_8(coeff, src, srcStride);
350
0
  } else {
351
    // DCT 4x4, 8x8, 16x16, 32x32
352
353
0
    acceleration->fwd_transform_8[log2TbSize-2](coeff,src,srcStride);
354
0
  }
355
0
}
356
357
358
359
// Dequantization scale factors, indexed by (qP % 6). In scale_coefficients_internal()
// the selected entry is additionally shifted left by (qP / 6).
static const int levelScale[] = { 40,45,51,57,64,72 };
360
361
// (8.6.2) and (8.6.3)
362
template <class pixel_t>
363
void scale_coefficients_internal(thread_context* tctx,
364
                                 int xT,int yT, // position of TU in frame (chroma adapted)
365
                                 int x0,int y0, // position of CU in frame (chroma adapted)
366
                                 int nT, int cIdx,
367
                                 bool transform_skip_flag, bool intra, int rdpcmMode)
368
9.71M
{
369
9.71M
  const seq_parameter_set& sps = tctx->img->get_sps();
370
9.71M
  const pic_parameter_set& pps = tctx->img->get_pps();
371
372
9.71M
  int qP;
373
9.71M
  switch (cIdx) {
374
6.48M
  case 0: qP = tctx->qPYPrime;  break;
375
1.61M
  case 1: qP = tctx->qPCbPrime; break;
376
1.61M
  case 2: qP = tctx->qPCrPrime; break;
377
0
  default: qP = 0; assert(0); break; // should never happen
378
9.71M
  }
379
380
9.71M
  logtrace(LogTransform,"qP: %d\n",qP);
381
382
383
9.71M
  int16_t* coeff;
384
9.71M
  int      coeffStride;
385
386
9.71M
  coeff = tctx->coeffBuf;
387
9.71M
  coeffStride = nT;
388
389
390
391
392
393
9.71M
  pixel_t* pred;
394
9.71M
  int      stride;
395
9.71M
  pred = tctx->img->get_image_plane_at_pos_NEW<pixel_t>(cIdx, xT,yT);
396
9.71M
  stride = tctx->img->get_image_stride(cIdx);
397
398
  // We explicitly include the case for sizeof(pixel_t)==1 so that the compiler
399
  // can optimize away a lot of code for 8-bit pixels.
400
9.71M
  const int bit_depth = ((sizeof(pixel_t)==1) ? 8 : sps.get_bit_depth(cIdx));
401
402
  //assert(intra == (tctx->img->get_pred_mode(xT,yT)==MODE_INTRA));
403
9.71M
  int cuPredModeIntra = (tctx->img->get_pred_mode(xT,yT)==MODE_INTRA);
404
405
9.71M
  bool rotateCoeffs = (sps.range_extension.transform_skip_rotation_enabled_flag &&
406
9.71M
                       nT == 4 &&
407
9.71M
                       cuPredModeIntra);
408
409
9.71M
  if (tctx->cu_transquant_bypass_flag) {
410
411
1.82M
    int32_t residual_buffer[32*32];
412
413
1.82M
    int32_t* residual;
414
1.82M
    if (cIdx==0) residual = tctx->residual_luma;
415
478k
    else         residual = residual_buffer;
416
417
418
    // TODO: we could fold the coefficient rotation into the coefficient expansion here:
419
17.6M
    for (int i=0;i<tctx->nCoeff[cIdx];i++) {
420
15.8M
      int32_t currCoeff = tctx->coeffList[cIdx][i];
421
15.8M
      tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff;
422
15.8M
    }
423
424
1.82M
    if (rotateCoeffs) {
425
388k
      tctx->decctx->acceleration.rotate_coefficients(coeff, nT);
426
388k
    }
427
428
1.82M
    if (rdpcmMode) {
429
149k
      if (rdpcmMode==2)
430
63.6k
        tctx->decctx->acceleration.transform_bypass_rdpcm_v(residual, coeff, nT);
431
85.7k
      else
432
85.7k
        tctx->decctx->acceleration.transform_bypass_rdpcm_h(residual, coeff, nT);
433
149k
    }
434
1.67M
    else {
435
1.67M
      tctx->decctx->acceleration.transform_bypass(residual, coeff, nT);
436
1.67M
    }
437
438
1.82M
    if (cIdx != 0) {
439
478k
      if (tctx->ResScaleVal != 0) {
440
467
        cross_comp_pred(tctx, residual, nT);
441
467
      }
442
478k
    }
443
444
1.82M
    tctx->decctx->acceleration.add_residual(pred,stride, residual,nT, bit_depth);
445
446
1.82M
    if (rotateCoeffs) {
447
388k
      memset(coeff, 0, nT*nT*sizeof(int16_t)); // delete all, because we moved the coeffs around
448
388k
    }
449
1.82M
  }
450
7.89M
  else {
451
    // (8.6.3)
452
453
7.89M
    int bdShift = (cIdx==0 ? sps.BitDepth_Y : sps.BitDepth_C) + Log2(nT) - 5;
454
455
7.89M
    logtrace(LogTransform,"bdShift=%d\n",bdShift);
456
457
7.89M
    logtrace(LogTransform,"dequant %d;%d cIdx=%d qp=%d\n",xT*(cIdx?2:1),yT*(cIdx?2:1),cIdx,qP);
458
459
460
    // --- inverse quantization ---
461
462
7.89M
    if (sps.scaling_list_enable_flag==0) {
463
464
      //const int m_x_y = 16;
465
5.72M
      const int m_x_y = 1;
466
5.72M
      bdShift -= 4;  // this is equivalent to having a m_x_y of 16 and we can use 32bit integers
467
468
5.72M
      const int offset = (1<<(bdShift-1));
469
5.72M
      const int fact = m_x_y * levelScale[qP%6] << (qP/6);
470
471
49.6M
      for (int i=0;i<tctx->nCoeff[cIdx];i++) {
472
473
        // usually, this needs to be 64bit, but because we modify the shift above, we can use 16 bit
474
43.9M
        int32_t currCoeff  = tctx->coeffList[cIdx][i];
475
476
        //logtrace(LogTransform,"coefficient[%d] = %d\n",tctx->coeffPos[cIdx][i],
477
        //tctx->coeffList[cIdx][i]);
478
479
43.9M
        currCoeff = Clip3(-32768,32767,
480
43.9M
                          ( (currCoeff * fact + offset ) >> bdShift));
481
482
        //logtrace(LogTransform," -> %d\n",currCoeff);
483
484
43.9M
        tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff;
485
43.9M
      }
486
5.72M
    }
487
2.16M
    else {
488
2.16M
      const int offset = (1<<(bdShift-1));
489
490
2.16M
      const uint8_t* sclist;
491
2.16M
      int matrixID = cIdx;
492
493
2.16M
      if (nT==32) {
494
16.4k
        matrixID=0;
495
16.4k
      }
496
497
2.16M
      if (!intra) {
498
96.5k
        if (nT<32) { matrixID += 3; }
499
3.06k
        else { matrixID++; }
500
96.5k
      }
501
502
2.16M
      switch (nT) {
503
1.85M
      case  4: sclist = &pps.scaling_list.ScalingFactor_Size0[matrixID][0][0]; break;
504
259k
      case  8: sclist = &pps.scaling_list.ScalingFactor_Size1[matrixID][0][0]; break;
505
32.0k
      case 16: sclist = &pps.scaling_list.ScalingFactor_Size2[matrixID][0][0]; break;
506
16.4k
      case 32: sclist = &pps.scaling_list.ScalingFactor_Size3[matrixID][0][0]; break;
507
0
      default: assert(0);
508
2.16M
      }
509
510
22.9M
      for (int i=0;i<tctx->nCoeff[cIdx];i++) {
511
20.7M
        int pos = tctx->coeffPos[cIdx][i];
512
513
20.7M
        const int m_x_y = sclist[pos];
514
20.7M
        const int fact = m_x_y * levelScale[qP%6] << (qP/6);
515
516
20.7M
        int64_t currCoeff  = tctx->coeffList[cIdx][i];
517
518
20.7M
        currCoeff = Clip3(-32768,32767,
519
20.7M
                          ( (currCoeff * fact + offset ) >> bdShift));
520
521
20.7M
        tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff;
522
20.7M
      }
523
2.16M
    }
524
525
526
    // --- do transform or skip ---
527
528
7.89M
    logtrace(LogTransform,"coefficients OUT:\n");
529
43.1M
    for (int y=0;y<nT;y++) {
530
35.2M
      logtrace(LogTransform,"  ");
531
225M
      for (int x=0;x<nT;x++) {
532
190M
        logtrace(LogTransform,"*%3d ", coeff[x+y*coeffStride]);
533
190M
      }
534
35.2M
      logtrace(LogTransform,"*\n");
535
35.2M
    }
536
537
7.89M
    int bdShift2 = (cIdx==0) ? 20-sps.BitDepth_Y : 20-sps.BitDepth_C;
538
539
7.89M
    logtrace(LogTransform,"bdShift2=%d\n",bdShift2);
540
541
7.89M
    logtrace(LogSlice,"get_transform_skip_flag(%d,%d, cIdx=%d)=%d\n",xT,yT,cIdx,
542
7.89M
             transform_skip_flag);
543
544
7.89M
    if (transform_skip_flag) {
545
546
1.20M
      int extended_precision_processing_flag = 0;
547
1.20M
      int Log2nTbS = Log2(nT);
548
1.20M
      int bdShift = libde265_max( 20 - bit_depth, extended_precision_processing_flag ? 11 : 0 );
549
1.20M
      int tsShift = (extended_precision_processing_flag ? libde265_min( 5, bdShift - 2 ) : 5 )
550
1.20M
        + Log2nTbS;
551
552
1.20M
      if (rotateCoeffs) {
553
410k
        tctx->decctx->acceleration.rotate_coefficients(coeff, nT);
554
410k
      }
555
556
1.20M
      int32_t residual_buffer[32*32];
557
558
1.20M
      int32_t* residual;
559
1.20M
      if (cIdx==0) residual = tctx->residual_luma;
560
433k
      else         residual = residual_buffer;
561
562
1.20M
      if (rdpcmMode) {
563
        /*
564
        if (rdpcmMode==2)
565
          tctx->decctx->acceleration.transform_skip_rdpcm_v(pred,coeff, Log2(nT), stride, bit_depth);
566
        else
567
          tctx->decctx->acceleration.transform_skip_rdpcm_h(pred,coeff, Log2(nT), stride, bit_depth);
568
        */
569
570
86.0k
        if (rdpcmMode==2)
571
51.5k
          tctx->decctx->acceleration.rdpcm_v(residual, coeff,nT, tsShift,bdShift);
572
34.4k
        else
573
34.4k
          tctx->decctx->acceleration.rdpcm_h(residual, coeff,nT, tsShift,bdShift);
574
86.0k
      }
575
1.11M
      else {
576
        //tctx->decctx->acceleration.transform_skip(pred, coeff, stride, bit_depth);
577
578
1.11M
        tctx->decctx->acceleration.transform_skip_residual(residual, coeff, nT, tsShift, bdShift);
579
1.11M
      }
580
581
1.20M
      if (cIdx != 0) {
582
433k
        if (tctx->ResScaleVal != 0) {
583
0
          cross_comp_pred(tctx, residual, nT);
584
0
        }
585
433k
      }
586
587
1.20M
      tctx->decctx->acceleration.add_residual(pred,stride, residual,nT, bit_depth);
588
589
1.20M
      if (rotateCoeffs) {
590
410k
        memset(coeff, 0, nT*nT*sizeof(int16_t)); // delete all, because we moved the coeffs around
591
410k
      }
592
1.20M
    }
593
6.69M
    else {
594
6.69M
      int trType;
595
596
      //if (nT==4 && cIdx==0 && tctx->img->get_pred_mode(xT,yT)==MODE_INTRA) {
597
6.69M
      if (nT==4 && cIdx==0 && cuPredModeIntra) {
598
3.31M
        trType=1;
599
3.31M
      }
600
3.37M
      else {
601
3.37M
        trType=0;
602
3.37M
      }
603
604
6.69M
      assert(rdpcmMode==0);
605
606
607
6.69M
      if (tctx->img->get_pps().range_extension.cross_component_prediction_enabled_flag) {
608
        // cross-component-prediction: transform to residual buffer and add in a separate step
609
610
163k
        transform_coefficients_explicit(tctx, coeff, coeffStride, nT, trType,
611
163k
                                        pred, stride, bit_depth, cIdx);
612
163k
      }
613
6.52M
      else {
614
6.52M
        transform_coefficients(&tctx->decctx->acceleration, coeff, coeffStride, nT, trType,
615
6.52M
                               pred, stride, bit_depth);
616
6.52M
      }
617
6.69M
    }
618
7.89M
  }
619
620
621
9.71M
  logtrace(LogTransform,"pixels (cIdx:%d), position %d %d:\n",cIdx, xT,yT);
622
623
53.2M
  for (int y=0;y<nT;y++) {
624
43.5M
    logtrace(LogTransform,"RECO-%3d-%3d-%d ",xT,yT+y,cIdx);
625
626
280M
    for (int x=0;x<nT;x++) {
627
236M
      logtrace(LogTransform,"*%03x ", pred[x+y*stride]);
628
236M
    }
629
630
43.5M
    logtrace(LogTransform,"*\n");
631
43.5M
  }
632
633
  // zero out scrap coefficient buffer again
634
635
90.2M
  for (int i=0;i<tctx->nCoeff[cIdx];i++) {
636
80.5M
    tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = 0;
637
80.5M
  }
638
9.71M
}
void scale_coefficients_internal<unsigned short>(thread_context*, int, int, int, int, int, int, bool, bool, int)
Line
Count
Source
368
4.48M
{
369
4.48M
  const seq_parameter_set& sps = tctx->img->get_sps();
370
4.48M
  const pic_parameter_set& pps = tctx->img->get_pps();
371
372
4.48M
  int qP;
373
4.48M
  switch (cIdx) {
374
3.15M
  case 0: qP = tctx->qPYPrime;  break;
375
662k
  case 1: qP = tctx->qPCbPrime; break;
376
661k
  case 2: qP = tctx->qPCrPrime; break;
377
0
  default: qP = 0; assert(0); break; // should never happen
378
4.48M
  }
379
380
4.48M
  logtrace(LogTransform,"qP: %d\n",qP);
381
382
383
4.48M
  int16_t* coeff;
384
4.48M
  int      coeffStride;
385
386
4.48M
  coeff = tctx->coeffBuf;
387
4.48M
  coeffStride = nT;
388
389
390
391
392
393
4.48M
  pixel_t* pred;
394
4.48M
  int      stride;
395
4.48M
  pred = tctx->img->get_image_plane_at_pos_NEW<pixel_t>(cIdx, xT,yT);
396
4.48M
  stride = tctx->img->get_image_stride(cIdx);
397
398
  // We explicitly include the case for sizeof(pixel_t)==1 so that the compiler
399
  // can optimize away a lot of code for 8-bit pixels.
400
4.48M
  const int bit_depth = ((sizeof(pixel_t)==1) ? 8 : sps.get_bit_depth(cIdx));
401
402
  //assert(intra == (tctx->img->get_pred_mode(xT,yT)==MODE_INTRA));
403
4.48M
  int cuPredModeIntra = (tctx->img->get_pred_mode(xT,yT)==MODE_INTRA);
404
405
4.48M
  bool rotateCoeffs = (sps.range_extension.transform_skip_rotation_enabled_flag &&
406
4.48M
                       nT == 4 &&
407
4.48M
                       cuPredModeIntra);
408
409
4.48M
  if (tctx->cu_transquant_bypass_flag) {
410
411
890k
    int32_t residual_buffer[32*32];
412
413
890k
    int32_t* residual;
414
890k
    if (cIdx==0) residual = tctx->residual_luma;
415
181k
    else         residual = residual_buffer;
416
417
418
    // TODO: we could fold the coefficient rotation into the coefficient expansion here:
419
7.44M
    for (int i=0;i<tctx->nCoeff[cIdx];i++) {
420
6.55M
      int32_t currCoeff = tctx->coeffList[cIdx][i];
421
6.55M
      tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff;
422
6.55M
    }
423
424
890k
    if (rotateCoeffs) {
425
134k
      tctx->decctx->acceleration.rotate_coefficients(coeff, nT);
426
134k
    }
427
428
890k
    if (rdpcmMode) {
429
92.0k
      if (rdpcmMode==2)
430
24.1k
        tctx->decctx->acceleration.transform_bypass_rdpcm_v(residual, coeff, nT);
431
67.9k
      else
432
67.9k
        tctx->decctx->acceleration.transform_bypass_rdpcm_h(residual, coeff, nT);
433
92.0k
    }
434
798k
    else {
435
798k
      tctx->decctx->acceleration.transform_bypass(residual, coeff, nT);
436
798k
    }
437
438
890k
    if (cIdx != 0) {
439
181k
      if (tctx->ResScaleVal != 0) {
440
125
        cross_comp_pred(tctx, residual, nT);
441
125
      }
442
181k
    }
443
444
890k
    tctx->decctx->acceleration.add_residual(pred,stride, residual,nT, bit_depth);
445
446
890k
    if (rotateCoeffs) {
447
134k
      memset(coeff, 0, nT*nT*sizeof(int16_t)); // delete all, because we moved the coeffs around
448
134k
    }
449
890k
  }
450
3.59M
  else {
451
    // (8.6.3)
452
453
3.59M
    int bdShift = (cIdx==0 ? sps.BitDepth_Y : sps.BitDepth_C) + Log2(nT) - 5;
454
455
3.59M
    logtrace(LogTransform,"bdShift=%d\n",bdShift);
456
457
3.59M
    logtrace(LogTransform,"dequant %d;%d cIdx=%d qp=%d\n",xT*(cIdx?2:1),yT*(cIdx?2:1),cIdx,qP);
458
459
460
    // --- inverse quantization ---
461
462
3.59M
    if (sps.scaling_list_enable_flag==0) {
463
464
      //const int m_x_y = 16;
465
2.59M
      const int m_x_y = 1;
466
2.59M
      bdShift -= 4;  // this is equivalent to having a m_x_y of 16 and we can use 32bit integers
467
468
2.59M
      const int offset = (1<<(bdShift-1));
469
2.59M
      const int fact = m_x_y * levelScale[qP%6] << (qP/6);
470
471
22.3M
      for (int i=0;i<tctx->nCoeff[cIdx];i++) {
472
473
        // usually, this needs to be 64bit, but because we modify the shift above, we can use 16 bit
474
19.7M
        int32_t currCoeff  = tctx->coeffList[cIdx][i];
475
476
        //logtrace(LogTransform,"coefficient[%d] = %d\n",tctx->coeffPos[cIdx][i],
477
        //tctx->coeffList[cIdx][i]);
478
479
19.7M
        currCoeff = Clip3(-32768,32767,
480
19.7M
                          ( (currCoeff * fact + offset ) >> bdShift));
481
482
        //logtrace(LogTransform," -> %d\n",currCoeff);
483
484
19.7M
        tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff;
485
19.7M
      }
486
2.59M
    }
487
997k
    else {
488
997k
      const int offset = (1<<(bdShift-1));
489
490
997k
      const uint8_t* sclist;
491
997k
      int matrixID = cIdx;
492
493
997k
      if (nT==32) {
494
6.47k
        matrixID=0;
495
6.47k
      }
496
497
997k
      if (!intra) {
498
49.1k
        if (nT<32) { matrixID += 3; }
499
1.59k
        else { matrixID++; }
500
49.1k
      }
501
502
997k
      switch (nT) {
503
912k
      case  4: sclist = &pps.scaling_list.ScalingFactor_Size0[matrixID][0][0]; break;
504
66.5k
      case  8: sclist = &pps.scaling_list.ScalingFactor_Size1[matrixID][0][0]; break;
505
12.3k
      case 16: sclist = &pps.scaling_list.ScalingFactor_Size2[matrixID][0][0]; break;
506
6.47k
      case 32: sclist = &pps.scaling_list.ScalingFactor_Size3[matrixID][0][0]; break;
507
0
      default: assert(0);
508
997k
      }
509
510
10.6M
      for (int i=0;i<tctx->nCoeff[cIdx];i++) {
511
9.69M
        int pos = tctx->coeffPos[cIdx][i];
512
513
9.69M
        const int m_x_y = sclist[pos];
514
9.69M
        const int fact = m_x_y * levelScale[qP%6] << (qP/6);
515
516
9.69M
        int64_t currCoeff  = tctx->coeffList[cIdx][i];
517
518
9.69M
        currCoeff = Clip3(-32768,32767,
519
9.69M
                          ( (currCoeff * fact + offset ) >> bdShift));
520
521
9.69M
        tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff;
522
9.69M
      }
523
997k
    }
524
525
526
    // --- do transform or skip ---
527
528
3.59M
    logtrace(LogTransform,"coefficients OUT:\n");
529
19.2M
    for (int y=0;y<nT;y++) {
530
15.6M
      logtrace(LogTransform,"  ");
531
96.0M
      for (int x=0;x<nT;x++) {
532
80.4M
        logtrace(LogTransform,"*%3d ", coeff[x+y*coeffStride]);
533
80.4M
      }
534
15.6M
      logtrace(LogTransform,"*\n");
535
15.6M
    }
536
537
3.59M
    int bdShift2 = (cIdx==0) ? 20-sps.BitDepth_Y : 20-sps.BitDepth_C;
538
539
3.59M
    logtrace(LogTransform,"bdShift2=%d\n",bdShift2);
540
541
3.59M
    logtrace(LogSlice,"get_transform_skip_flag(%d,%d, cIdx=%d)=%d\n",xT,yT,cIdx,
542
3.59M
             transform_skip_flag);
543
544
3.59M
    if (transform_skip_flag) {
545
546
766k
      int extended_precision_processing_flag = 0;
547
766k
      int Log2nTbS = Log2(nT);
548
766k
      int bdShift = libde265_max( 20 - bit_depth, extended_precision_processing_flag ? 11 : 0 );
549
766k
      int tsShift = (extended_precision_processing_flag ? libde265_min( 5, bdShift - 2 ) : 5 )
550
766k
        + Log2nTbS;
551
552
766k
      if (rotateCoeffs) {
553
281k
        tctx->decctx->acceleration.rotate_coefficients(coeff, nT);
554
281k
      }
555
556
766k
      int32_t residual_buffer[32*32];
557
558
766k
      int32_t* residual;
559
766k
      if (cIdx==0) residual = tctx->residual_luma;
560
273k
      else         residual = residual_buffer;
561
562
766k
      if (rdpcmMode) {
563
        /*
564
        if (rdpcmMode==2)
565
          tctx->decctx->acceleration.transform_skip_rdpcm_v(pred,coeff, Log2(nT), stride, bit_depth);
566
        else
567
          tctx->decctx->acceleration.transform_skip_rdpcm_h(pred,coeff, Log2(nT), stride, bit_depth);
568
        */
569
570
56.0k
        if (rdpcmMode==2)
571
33.1k
          tctx->decctx->acceleration.rdpcm_v(residual, coeff,nT, tsShift,bdShift);
572
22.9k
        else
573
22.9k
          tctx->decctx->acceleration.rdpcm_h(residual, coeff,nT, tsShift,bdShift);
574
56.0k
      }
575
710k
      else {
576
        //tctx->decctx->acceleration.transform_skip(pred, coeff, stride, bit_depth);
577
578
710k
        tctx->decctx->acceleration.transform_skip_residual(residual, coeff, nT, tsShift, bdShift);
579
710k
      }
580
581
766k
      if (cIdx != 0) {
582
273k
        if (tctx->ResScaleVal != 0) {
583
0
          cross_comp_pred(tctx, residual, nT);
584
0
        }
585
273k
      }
586
587
766k
      tctx->decctx->acceleration.add_residual(pred,stride, residual,nT, bit_depth);
588
589
766k
      if (rotateCoeffs) {
590
281k
        memset(coeff, 0, nT*nT*sizeof(int16_t)); // delete all, because we moved the coeffs around
591
281k
      }
592
766k
    }
593
2.82M
    else {
594
2.82M
      int trType;
595
596
      //if (nT==4 && cIdx==0 && tctx->img->get_pred_mode(xT,yT)==MODE_INTRA) {
597
2.82M
      if (nT==4 && cIdx==0 && cuPredModeIntra) {
598
1.47M
        trType=1;
599
1.47M
      }
600
1.34M
      else {
601
1.34M
        trType=0;
602
1.34M
      }
603
604
2.82M
      assert(rdpcmMode==0);
605
606
607
2.82M
      if (tctx->img->get_pps().range_extension.cross_component_prediction_enabled_flag) {
608
        // cross-component-prediction: transform to residual buffer and add in a separate step
609
610
24.7k
        transform_coefficients_explicit(tctx, coeff, coeffStride, nT, trType,
611
24.7k
                                        pred, stride, bit_depth, cIdx);
612
24.7k
      }
613
2.79M
      else {
614
2.79M
        transform_coefficients(&tctx->decctx->acceleration, coeff, coeffStride, nT, trType,
615
2.79M
                               pred, stride, bit_depth);
616
2.79M
      }
617
2.82M
    }
618
3.59M
  }
619
620
621
4.48M
  logtrace(LogTransform,"pixels (cIdx:%d), position %d %d:\n",cIdx, xT,yT);
622
623
24.0M
  for (int y=0;y<nT;y++) {
624
19.5M
    logtrace(LogTransform,"RECO-%3d-%3d-%d ",xT,yT+y,cIdx);
625
626
120M
    for (int x=0;x<nT;x++) {
627
100M
      logtrace(LogTransform,"*%03x ", pred[x+y*stride]);
628
100M
    }
629
630
19.5M
    logtrace(LogTransform,"*\n");
631
19.5M
  }
632
633
  // zero out scrap coefficient buffer again
634
635
40.4M
  for (int i=0;i<tctx->nCoeff[cIdx];i++) {
636
35.9M
    tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = 0;
637
35.9M
  }
638
4.48M
}
void scale_coefficients_internal<unsigned char>(thread_context*, int, int, int, int, int, int, bool, bool, int)
Line
Count
Source
368
5.23M
{
369
5.23M
  const seq_parameter_set& sps = tctx->img->get_sps();
370
5.23M
  const pic_parameter_set& pps = tctx->img->get_pps();
371
372
5.23M
  int qP;
373
5.23M
  switch (cIdx) {
374
3.32M
  case 0: qP = tctx->qPYPrime;  break;
375
957k
  case 1: qP = tctx->qPCbPrime; break;
376
953k
  case 2: qP = tctx->qPCrPrime; break;
377
0
  default: qP = 0; assert(0); break; // should never happen
378
5.23M
  }
379
380
5.23M
  logtrace(LogTransform,"qP: %d\n",qP);
381
382
383
5.23M
  int16_t* coeff;
384
5.23M
  int      coeffStride;
385
386
5.23M
  coeff = tctx->coeffBuf;
387
5.23M
  coeffStride = nT;
388
389
390
391
392
393
5.23M
  pixel_t* pred;
394
5.23M
  int      stride;
395
5.23M
  pred = tctx->img->get_image_plane_at_pos_NEW<pixel_t>(cIdx, xT,yT);
396
5.23M
  stride = tctx->img->get_image_stride(cIdx);
397
398
  // We explicitly include the case for sizeof(pixel_t)==1 so that the compiler
399
  // can optimize away a lot of code for 8-bit pixels.
400
5.23M
  const int bit_depth = ((sizeof(pixel_t)==1) ? 8 : sps.get_bit_depth(cIdx));
401
402
  //assert(intra == (tctx->img->get_pred_mode(xT,yT)==MODE_INTRA));
403
5.23M
  int cuPredModeIntra = (tctx->img->get_pred_mode(xT,yT)==MODE_INTRA);
404
405
5.23M
  bool rotateCoeffs = (sps.range_extension.transform_skip_rotation_enabled_flag &&
406
5.23M
                       nT == 4 &&
407
5.23M
                       cuPredModeIntra);
408
409
5.23M
  if (tctx->cu_transquant_bypass_flag) {
410
411
937k
    int32_t residual_buffer[32*32];
412
413
937k
    int32_t* residual;
414
937k
    if (cIdx==0) residual = tctx->residual_luma;
415
296k
    else         residual = residual_buffer;
416
417
418
    // TODO: we could fold the coefficient rotation into the coefficient expansion here:
419
10.1M
    for (int i=0;i<tctx->nCoeff[cIdx];i++) {
420
9.25M
      int32_t currCoeff = tctx->coeffList[cIdx][i];
421
9.25M
      tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff;
422
9.25M
    }
423
424
937k
    if (rotateCoeffs) {
425
254k
      tctx->decctx->acceleration.rotate_coefficients(coeff, nT);
426
254k
    }
427
428
937k
    if (rdpcmMode) {
429
57.3k
      if (rdpcmMode==2)
430
39.5k
        tctx->decctx->acceleration.transform_bypass_rdpcm_v(residual, coeff, nT);
431
17.7k
      else
432
17.7k
        tctx->decctx->acceleration.transform_bypass_rdpcm_h(residual, coeff, nT);
433
57.3k
    }
434
880k
    else {
435
880k
      tctx->decctx->acceleration.transform_bypass(residual, coeff, nT);
436
880k
    }
437
438
937k
    if (cIdx != 0) {
439
296k
      if (tctx->ResScaleVal != 0) {
440
342
        cross_comp_pred(tctx, residual, nT);
441
342
      }
442
296k
    }
443
444
937k
    tctx->decctx->acceleration.add_residual(pred,stride, residual,nT, bit_depth);
445
446
937k
    if (rotateCoeffs) {
447
254k
      memset(coeff, 0, nT*nT*sizeof(int16_t)); // delete all, because we moved the coeffs around
448
254k
    }
449
937k
  }
450
4.30M
  else {
451
    // (8.6.3)
452
453
4.30M
    int bdShift = (cIdx==0 ? sps.BitDepth_Y : sps.BitDepth_C) + Log2(nT) - 5;
454
455
4.30M
    logtrace(LogTransform,"bdShift=%d\n",bdShift);
456
457
4.30M
    logtrace(LogTransform,"dequant %d;%d cIdx=%d qp=%d\n",xT*(cIdx?2:1),yT*(cIdx?2:1),cIdx,qP);
458
459
460
    // --- inverse quantization ---
461
462
4.30M
    if (sps.scaling_list_enable_flag==0) {
463
464
      //const int m_x_y = 16;
465
3.13M
      const int m_x_y = 1;
466
3.13M
      bdShift -= 4;  // this is equivalent to having a m_x_y of 16 and we can use 32bit integers
467
468
3.13M
      const int offset = (1<<(bdShift-1));
469
3.13M
      const int fact = m_x_y * levelScale[qP%6] << (qP/6);
470
471
27.3M
      for (int i=0;i<tctx->nCoeff[cIdx];i++) {
472
473
        // usually, this needs to be 64bit, but because we modify the shift above, we can use 16 bit
474
24.2M
        int32_t currCoeff  = tctx->coeffList[cIdx][i];
475
476
        //logtrace(LogTransform,"coefficient[%d] = %d\n",tctx->coeffPos[cIdx][i],
477
        //tctx->coeffList[cIdx][i]);
478
479
24.2M
        currCoeff = Clip3(-32768,32767,
480
24.2M
                          ( (currCoeff * fact + offset ) >> bdShift));
481
482
        //logtrace(LogTransform," -> %d\n",currCoeff);
483
484
24.2M
        tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff;
485
24.2M
      }
486
3.13M
    }
487
1.16M
    else {
488
1.16M
      const int offset = (1<<(bdShift-1));
489
490
1.16M
      const uint8_t* sclist;
491
1.16M
      int matrixID = cIdx;
492
493
1.16M
      if (nT==32) {
494
9.94k
        matrixID=0;
495
9.94k
      }
496
497
1.16M
      if (!intra) {
498
47.3k
        if (nT<32) { matrixID += 3; }
499
1.46k
        else { matrixID++; }
500
47.3k
      }
501
502
1.16M
      switch (nT) {
503
944k
      case  4: sclist = &pps.scaling_list.ScalingFactor_Size0[matrixID][0][0]; break;
504
192k
      case  8: sclist = &pps.scaling_list.ScalingFactor_Size1[matrixID][0][0]; break;
505
19.6k
      case 16: sclist = &pps.scaling_list.ScalingFactor_Size2[matrixID][0][0]; break;
506
9.94k
      case 32: sclist = &pps.scaling_list.ScalingFactor_Size3[matrixID][0][0]; break;
507
0
      default: assert(0);
508
1.16M
      }
509
510
12.2M
      for (int i=0;i<tctx->nCoeff[cIdx];i++) {
511
11.0M
        int pos = tctx->coeffPos[cIdx][i];
512
513
11.0M
        const int m_x_y = sclist[pos];
514
11.0M
        const int fact = m_x_y * levelScale[qP%6] << (qP/6);
515
516
11.0M
        int64_t currCoeff  = tctx->coeffList[cIdx][i];
517
518
11.0M
        currCoeff = Clip3(-32768,32767,
519
11.0M
                          ( (currCoeff * fact + offset ) >> bdShift));
520
521
11.0M
        tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff;
522
11.0M
      }
523
1.16M
    }
524
525
526
    // --- do transform or skip ---
527
528
4.30M
    logtrace(LogTransform,"coefficients OUT:\n");
529
23.8M
    for (int y=0;y<nT;y++) {
530
19.5M
      logtrace(LogTransform,"  ");
531
129M
      for (int x=0;x<nT;x++) {
532
109M
        logtrace(LogTransform,"*%3d ", coeff[x+y*coeffStride]);
533
109M
      }
534
19.5M
      logtrace(LogTransform,"*\n");
535
19.5M
    }
536
537
4.30M
    int bdShift2 = (cIdx==0) ? 20-sps.BitDepth_Y : 20-sps.BitDepth_C;
538
539
4.30M
    logtrace(LogTransform,"bdShift2=%d\n",bdShift2);
540
541
4.30M
    logtrace(LogSlice,"get_transform_skip_flag(%d,%d, cIdx=%d)=%d\n",xT,yT,cIdx,
542
4.30M
             transform_skip_flag);
543
544
4.30M
    if (transform_skip_flag) {
545
546
433k
      int extended_precision_processing_flag = 0;
547
433k
      int Log2nTbS = Log2(nT);
548
433k
      int bdShift = libde265_max( 20 - bit_depth, extended_precision_processing_flag ? 11 : 0 );
549
433k
      int tsShift = (extended_precision_processing_flag ? libde265_min( 5, bdShift - 2 ) : 5 )
550
433k
        + Log2nTbS;
551
552
433k
      if (rotateCoeffs) {
553
129k
        tctx->decctx->acceleration.rotate_coefficients(coeff, nT);
554
129k
      }
555
556
433k
      int32_t residual_buffer[32*32];
557
558
433k
      int32_t* residual;
559
433k
      if (cIdx==0) residual = tctx->residual_luma;
560
159k
      else         residual = residual_buffer;
561
562
433k
      if (rdpcmMode) {
563
        /*
564
        if (rdpcmMode==2)
565
          tctx->decctx->acceleration.transform_skip_rdpcm_v(pred,coeff, Log2(nT), stride, bit_depth);
566
        else
567
          tctx->decctx->acceleration.transform_skip_rdpcm_h(pred,coeff, Log2(nT), stride, bit_depth);
568
        */
569
570
30.0k
        if (rdpcmMode==2)
571
18.4k
          tctx->decctx->acceleration.rdpcm_v(residual, coeff,nT, tsShift,bdShift);
572
11.5k
        else
573
11.5k
          tctx->decctx->acceleration.rdpcm_h(residual, coeff,nT, tsShift,bdShift);
574
30.0k
      }
575
403k
      else {
576
        //tctx->decctx->acceleration.transform_skip(pred, coeff, stride, bit_depth);
577
578
403k
        tctx->decctx->acceleration.transform_skip_residual(residual, coeff, nT, tsShift, bdShift);
579
403k
      }
580
581
433k
      if (cIdx != 0) {
582
159k
        if (tctx->ResScaleVal != 0) {
583
0
          cross_comp_pred(tctx, residual, nT);
584
0
        }
585
159k
      }
586
587
433k
      tctx->decctx->acceleration.add_residual(pred,stride, residual,nT, bit_depth);
588
589
433k
      if (rotateCoeffs) {
590
129k
        memset(coeff, 0, nT*nT*sizeof(int16_t)); // delete all, because we moved the coeffs around
591
129k
      }
592
433k
    }
593
3.86M
    else {
594
3.86M
      int trType;
595
596
      //if (nT==4 && cIdx==0 && tctx->img->get_pred_mode(xT,yT)==MODE_INTRA) {
597
3.86M
      if (nT==4 && cIdx==0 && cuPredModeIntra) {
598
1.83M
        trType=1;
599
1.83M
      }
600
2.03M
      else {
601
2.03M
        trType=0;
602
2.03M
      }
603
604
3.86M
      assert(rdpcmMode==0);
605
606
607
3.86M
      if (tctx->img->get_pps().range_extension.cross_component_prediction_enabled_flag) {
608
        // cross-component-prediction: transform to residual buffer and add in a separate step
609
610
139k
        transform_coefficients_explicit(tctx, coeff, coeffStride, nT, trType,
611
139k
                                        pred, stride, bit_depth, cIdx);
612
139k
      }
613
3.72M
      else {
614
3.72M
        transform_coefficients(&tctx->decctx->acceleration, coeff, coeffStride, nT, trType,
615
3.72M
                               pred, stride, bit_depth);
616
3.72M
      }
617
3.86M
    }
618
4.30M
  }
619
620
621
5.23M
  logtrace(LogTransform,"pixels (cIdx:%d), position %d %d:\n",cIdx, xT,yT);
622
623
29.2M
  for (int y=0;y<nT;y++) {
624
24.0M
    logtrace(LogTransform,"RECO-%3d-%3d-%d ",xT,yT+y,cIdx);
625
626
159M
    for (int x=0;x<nT;x++) {
627
135M
      logtrace(LogTransform,"*%03x ", pred[x+y*stride]);
628
135M
    }
629
630
24.0M
    logtrace(LogTransform,"*\n");
631
24.0M
  }
632
633
  // zero out scrap coefficient buffer again
634
635
49.7M
  for (int i=0;i<tctx->nCoeff[cIdx];i++) {
636
44.5M
    tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = 0;
637
44.5M
  }
638
5.23M
}
639
640
641
// Entry point for residual decoding of one transform block: selects the
// pixel-type specialization of scale_coefficients_internal() for the plane
// 'cIdx' and forwards every argument unchanged.
//
//   xT,yT               position of TU in frame (chroma adapted)
//   x0,y0               position of CU in frame (chroma adapted)
//   nT                  transform block size, passed through unchanged
//   cIdx                colour component index, also used to query the image
//   transform_skip_flag passed through unchanged
//   intra               passed through unchanged
//   rdpcmMode           0 - off, 1 - Horizontal, 2 - Vertical
void scale_coefficients(thread_context* tctx,
                        int xT,int yT,
                        int x0,int y0,
                        int nT, int cIdx,
                        bool transform_skip_flag, bool intra,
                        int rdpcmMode)
{
  // The image reports per component whether the uint16_t code path is needed.
  const bool needs_16bit_pixels = tctx->img->high_bit_depth(cIdx);

  if (!needs_16bit_pixels) {
    scale_coefficients_internal<uint8_t>(tctx, xT,yT, x0,y0, nT,cIdx,
                                         transform_skip_flag, intra, rdpcmMode);
    return;
  }

  scale_coefficients_internal<uint16_t>(tctx, xT,yT, x0,y0, nT,cIdx,
                                        transform_skip_flag, intra, rdpcmMode);
}
657
658
659
// --- constants used by quant_coefficients() below ---

//#define QUANT_IQUANT_SHIFT    20 // Q(QP%6) * IQ(QP%6) = 2^20
660
0
// Base right-shift of the forward quantizer; quant_coefficients() adds qp/6
// and the transform shift on top of it to obtain the total shift (qBits).
#define QUANT_SHIFT           14 // Q(4) = 2^14
661
//#define SCALE_BITS            15 // Inherited from TMuC, presumably for fractional bit estimates in RDOQ
662
0
// quant_coefficients() subtracts the bit depth and log2 of the block size
// from this value to get the shift attributed to the forward transform.
#define MAX_TR_DYNAMIC_RANGE  15 // Maximum transform dynamic range (excluding sign bit)
663
664
665
// Forward quantization multipliers, indexed by (qp % 6).
// Entry 4 is 16384 == 1<<QUANT_SHIFT, matching the "Q(4) = 2^14" note above.
const static uint16_t g_quantScales[6] = {
666
  26214,23302,20560,18396,16384,14564
667
};
668
669
void quant_coefficients(//encoder_context* ectx,
670
                        int16_t* out_coeff,
671
                        const int16_t* in_coeff,
672
                        int log2TrSize, int qp,
673
                        bool intra)
674
0
{
675
0
  const int qpDiv6 = qp / 6;
676
0
  const int qpMod6 = qp % 6;
677
678
  //int uiLog2TrSize = xLog2( iWidth - 1);
679
680
0
  int uiQ = g_quantScales[qpMod6];
681
0
  int bitDepth = 8;
682
0
  int transformShift = MAX_TR_DYNAMIC_RANGE - bitDepth - log2TrSize;  // Represents scaling through forward transform
683
0
  int qBits = QUANT_SHIFT + qpDiv6 + transformShift;
684
685
  /* TODO: originally, this was checking for intra slices, why not for intra mode ?
686
   */
687
0
  int rnd = (intra ? 171 : 85) << (qBits-9);
688
689
0
  int x, y;
690
0
  int uiAcSum = 0;
691
692
0
  int nStride = (1<<log2TrSize);
693
694
0
  for (y=0; y < (1<<log2TrSize) ; y++) {
695
0
    for (x=0; x < (1<<log2TrSize) ; x++) {
696
0
      int level;
697
0
      int sign;
698
0
      int blockPos = y * nStride + x;
699
0
      level  = in_coeff[blockPos];
700
      //logtrace(LogTransform,"(%d,%d) %d -> ", x,y,level);
701
0
      sign   = (level < 0 ? -1: 1);
702
703
0
      level = (abs_value(level) * uiQ + rnd ) >> qBits;
704
0
      uiAcSum += level;
705
0
      level *= sign;
706
0
      out_coeff[blockPos] = Clip3(-32768, 32767, level);
707
      //logtrace(LogTransform,"%d\n", out_coeff[blockPos]);
708
0
    }
709
0
  }
710
0
}
711
712
713
void dequant_coefficients(int16_t* out_coeff,
714
                          const int16_t* in_coeff,
715
                          int log2TrSize, int qP)
716
0
{
717
0
  const int m_x_y = 1;
718
0
  int bitDepth = 8;
719
0
  int bdShift = bitDepth + log2TrSize - 5;
720
0
  bdShift -= 4;  // this is equivalent to having a m_x_y of 16 and we can use 32bit integers
721
722
0
  const int offset = (1<<(bdShift-1));
723
0
  const int fact = m_x_y * levelScale[qP%6] << (qP/6);
724
725
0
  int blkSize = (1<<log2TrSize);
726
0
  int nCoeff  = (1<<(log2TrSize<<1));
727
728
0
  for (int i=0;i<nCoeff;i++) {
729
730
    // usually, this needs to be 64bit, but because we modify the shift above, we can use 16 bit
731
0
    int32_t currCoeff  = in_coeff[i];
732
733
    //logtrace(LogTransform,"coefficient[%d] = %d\n",i,currCoeff);
734
735
0
    currCoeff = Clip3(-32768,32767,
736
0
                      ( (currCoeff * fact + offset ) >> bdShift));
737
738
    //logtrace(LogTransform," -> %d\n",currCoeff);
739
740
0
    out_coeff[i] = currCoeff;
741
0
  }
742
0
}