Coverage Report

Created: 2026-04-01 07:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/vvenc/source/Lib/CommonLib/TrQuant_EMT.cpp
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
43
44
/** \file     TrQuant_EMT.cpp
45
    \brief    transform and quantization class
46
*/
47
48
#include "TrQuant_EMT.h"
49
#include "Rom.h"
50
51
#include <stdlib.h>
52
#include <math.h>
53
#include <memory.h>
54
55
//! \ingroup CommonLib
56
//! \{
57
58
namespace vvenc {
59
60
// ********************************** DCT-II **********************************
61
  
62
#if ENABLE_SIMD_TRAFO
63
template<int uiTrSize>
64
inline void _fastInverseMM( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum, const TMatrixCoeff* iT );
65
66
template<>
67
inline void _fastInverseMM<2>( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum, const TMatrixCoeff* iT )
68
0
{
69
0
  const int rnd_factor  = 1 << (shift - 1);
70
0
  const int reducedLine = line - iSkipLine;
71
0
  const int cutoff      = 2 - iSkipLine2;
72
0
73
0
  memset( dst, 0, reducedLine * 2 * sizeof( TCoeff ) );
74
0
75
0
  for( int k = 0; k < cutoff; k++ )
76
0
  {
77
0
    const TCoeff* srcPtr = &src[k * line];
78
0
    for( int i = 0; i < reducedLine; i++ )
79
0
    {
80
0
            TCoeff*       dstPtr = &dst[i << 1];
81
0
      const TMatrixCoeff*  itPtr =  &iT[k << 1];
82
0
      const TCoeff        srcVal = *srcPtr;
83
0
      for( int j = 0; j < 2; j++ )
84
0
      {
85
0
        *dstPtr++ += srcVal * *itPtr++;
86
0
      }
87
0
      srcPtr++;
88
0
    }
89
0
  }
90
0
91
0
  for( int i = 0; i < reducedLine; i++ )
92
0
  {
93
0
    TCoeff* dstPtr = &dst[i << 1];
94
0
    for( int j = 0; j < 2; j++, dstPtr++ )
95
0
    {
96
0
      *dstPtr = Clip3( outputMinimum, outputMaximum, ( int ) ( *dstPtr + rnd_factor ) >> shift );
97
0
    }
98
0
  }
99
0
100
0
  if( iSkipLine )
101
0
  {
102
0
    memset( dst + ( reducedLine << 1 ), 0, ( iSkipLine << 1 ) * sizeof( TCoeff ) );
103
0
  }
104
0
}
105
106
template<>
107
inline void _fastInverseMM<4>( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum, const TMatrixCoeff* iT )
108
0
{
109
0
  const int rnd_factor  = 1 << ( shift - 1 );
110
0
  const int reducedLine = line - iSkipLine;
111
0
  const int cutoff      = 4 - iSkipLine2;
112
113
0
  memset( dst, 0, reducedLine * 4 * sizeof( TCoeff ) );
114
115
0
#if ENABLE_SIMD_TRAFO
116
0
  g_tCoeffOps.fastInvCore[0]( iT, src, dst, line, reducedLine, cutoff );
117
0
  g_tCoeffOps.roundClip4( dst, 4, reducedLine, 4, outputMinimum, outputMaximum, rnd_factor, shift );
118
#else
119
  for( int k = 0; k < cutoff; k++ )
120
  {
121
    const TCoeff* srcPtr = &src[k * line];
122
    for( int i = 0; i < reducedLine; i++ )
123
    {
124
            TCoeff*       dstPtr = &dst[i << 2];
125
      const TMatrixCoeff*  itPtr =  &iT[k << 2];
126
      for( int j = 0; j < 4; j++ )
127
      {
128
        *dstPtr++ += *srcPtr * *itPtr++;
129
      }
130
      srcPtr++;
131
    }
132
  }
133
134
  for( int i = 0; i < reducedLine; i++ )
135
  {
136
    TCoeff* dstPtr = &dst[i << 2];
137
    for( int j = 0; j < 4; j++, dstPtr++ )
138
    {
139
      *dstPtr = Clip3( outputMinimum, outputMaximum, ( int ) ( *dstPtr + rnd_factor ) >> shift );
140
    }
141
  }
142
#endif
143
144
0
  if( iSkipLine )
145
0
  {
146
0
    memset( dst + ( reducedLine << 2 ), 0, ( iSkipLine << 2 ) * sizeof( TCoeff ) );
147
0
  }
148
0
}
149
150
#endif
151
152
template< int uiTrSize >
153
inline void _fastInverseMM( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum, const TMatrixCoeff* iT )
154
0
{
155
0
  const int  rnd_factor  = 1 << (shift - 1);
156
0
  const int  reducedLine = line - iSkipLine;
157
0
  const int  cutoff      = uiTrSize - iSkipLine2;
158
159
0
  memset( dst, 0, reducedLine * uiTrSize * sizeof( TCoeff ) );
160
161
0
#if ENABLE_SIMD_TRAFO
162
0
  g_tCoeffOps.fastInvCore[Log2( uiTrSize ) - 2]( iT, src, dst, line, reducedLine, cutoff );
163
0
  g_tCoeffOps.roundClip8( dst, uiTrSize, reducedLine, uiTrSize, outputMinimum, outputMaximum, rnd_factor, shift );
164
#else
165
  for( int k = 0; k < cutoff; k++ )
166
  {
167
    const TCoeff* srcPtr = &src[k * line];
168
    for( int i = 0; i < reducedLine; i++ )
169
    {
170
            TCoeff*       dstPtr = &dst[i * uiTrSize];
171
      const TMatrixCoeff*  itPtr =  &iT[k * uiTrSize];
172
      for( int j = 0; j < uiTrSize; j++ )
173
      {
174
        *dstPtr++ += *srcPtr * *itPtr++;
175
      }
176
      srcPtr++;
177
    }
178
  }
179
180
  for( int i = 0; i < reducedLine; i++ )
181
  {
182
    TCoeff* dstPtr = &dst[i * uiTrSize];
183
    for( int j = 0; j < uiTrSize; j++, dstPtr++ )
184
    {
185
      *dstPtr = Clip3( outputMinimum, outputMaximum, ( int ) ( *dstPtr + rnd_factor ) >> shift );
186
    }
187
  }
188
#endif
189
190
0
  if( iSkipLine )
191
0
  {
192
0
    memset( dst + ( reducedLine*uiTrSize ), 0, ( iSkipLine*uiTrSize ) * sizeof( TCoeff ) );
193
0
  }
194
0
}
Unexecuted instantiation: void vvenc::_fastInverseMM<16>(int const*, int*, int, int, int, int, int, int, short const*)
Unexecuted instantiation: void vvenc::_fastInverseMM<32>(int const*, int*, int, int, int, int, int, int, short const*)
Unexecuted instantiation: void vvenc::_fastInverseMM<64>(int const*, int*, int, int, int, int, int, int, short const*)
Unexecuted instantiation: void vvenc::_fastInverseMM<8>(int const*, int*, int, int, int, int, int, int, short const*)
195
196
//Fast DCT-II transforms
197
void fastForwardDCT2_B2(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
198
0
{
199
0
  int j;
200
0
  int E, O;
201
0
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
202
203
0
  const TMatrixCoeff *iT = g_trCoreDCT2P2[TRANSFORM_FORWARD][0];
204
205
0
  TCoeff *pCoef = dst;
206
0
  const int  reducedLine = line - iSkipLine;
207
0
  for (j = 0; j<reducedLine; j++)
208
0
  {
209
    /* E and O */
210
0
    E = src[0] + src[1];
211
0
    O = src[0] - src[1];
212
213
0
    dst[0] = (iT[0] * E + add) >> shift;
214
0
    dst[line] = (iT[2] * O + add) >> shift;
215
216
217
0
    src += 2;
218
0
    dst++;
219
0
  }
220
0
  if (iSkipLine)
221
0
  {
222
0
    dst = pCoef + reducedLine;
223
0
    for (j = 0; j<2; j++)
224
0
    {
225
0
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
226
0
      dst += line;
227
0
    }
228
0
  }
229
0
}
230
231
void fastInverseDCT2_B2(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
232
0
{
233
0
  int j;
234
0
  int E, O;
235
0
  int add = 1 << (shift - 1);
236
237
0
  const TMatrixCoeff *iT = g_trCoreDCT2P2[TRANSFORM_INVERSE][0];
238
239
0
  const int  reducedLine = line - iSkipLine;
240
0
  for (j = 0; j<reducedLine; j++)
241
0
  {
242
    /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
243
0
    E = iT[0] * (src[0] + src[line]);
244
0
    O = iT[2] * (src[0] - src[line]);
245
246
    /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
247
0
    dst[0] = Clip3(outputMinimum, outputMaximum, (E + add) >> shift);
248
0
    dst[1] = Clip3(outputMinimum, outputMaximum, (O + add) >> shift);
249
250
0
    src++;
251
0
    dst += 2;
252
0
  }
253
0
  if (iSkipLine)
254
0
  {
255
0
    memset(dst, 0, (iSkipLine << 1) * sizeof(TCoeff));
256
0
  }
257
0
}
258
259
/** 4x4 forward transform implemented using partial butterfly structure (1D)
260
*  \param src   input data (residual)
261
*  \param dst   output data (transform coefficients)
262
*  \param shift specifies right shift after 1D transform
263
*  \param line
264
*/
265
void fastForwardDCT2_B4(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
266
0
{
267
0
  int j;
268
0
  TCoeff E[2], O[2];
269
0
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
270
271
0
  const TMatrixCoeff *iT = g_trCoreDCT2P4[TRANSFORM_FORWARD][0];
272
273
0
  TCoeff *pCoef = dst;
274
0
  const int  reducedLine = line - iSkipLine;
275
0
  for (j = 0; j<reducedLine; j++)
276
0
  {
277
    /* E and O */
278
0
    E[0] = src[0] + src[3];
279
0
    O[0] = src[0] - src[3];
280
0
    E[1] = src[1] + src[2];
281
0
    O[1] = src[1] - src[2];
282
283
0
    dst[0] = (iT[0] * E[0] + iT[1] * E[1] + add) >> shift;
284
0
    dst[2 * line] = (iT[8] * E[0] + iT[9] * E[1] + add) >> shift;
285
0
    dst[line] = (iT[4] * O[0] + iT[5] * O[1] + add) >> shift;
286
0
    dst[3 * line] = (iT[12] * O[0] + iT[13] * O[1] + add) >> shift;
287
288
0
    src += 4;
289
0
    dst++;
290
0
  }
291
0
  if (iSkipLine)
292
0
  {
293
0
    dst = pCoef + reducedLine;
294
0
    for (j = 0; j<4; j++)
295
0
    {
296
0
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
297
0
      dst += line;
298
0
    }
299
0
  }
300
0
}
301
302
/** 4x4 inverse transform implemented using partial butterfly structure (1D)
303
*  \param src   input data (transform coefficients)
304
*  \param dst   output data (residual)
305
*  \param shift specifies right shift after 1D transform
306
*  \param line
307
*  \param outputMinimum  minimum for clipping
308
*  \param outputMaximum  maximum for clipping
309
*/
310
void fastInverseDCT2_B4( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum )
311
0
{
312
#if 0
313
  const TMatrixCoeff *iT = g_trCoreDCT2P4[0];
314
315
  _fastInverseMM<4>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, iT );
316
#else
317
0
  int j;
318
0
  int E[2], O[2];
319
0
  int add = 1 << ( shift - 1 );
320
321
0
  const TMatrixCoeff *iT = g_trCoreDCT2P4[TRANSFORM_INVERSE][0];
322
323
0
#if ENABLE_SIMD_TRAFO
324
0
  TCoeff* orgDst = dst;
325
326
0
#endif
327
0
  const int  reducedLine = line - iSkipLine;
328
0
  for( j = 0; j < reducedLine; j++ )
329
0
  {
330
    /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
331
0
    O[0] = iT[1 * 4 + 0] * src[line] + iT[3 * 4 + 0] * src[3 * line];
332
0
    O[1] = iT[1 * 4 + 1] * src[line] + iT[3 * 4 + 1] * src[3 * line];
333
0
    E[0] = iT[0 * 4 + 0] * src[   0] + iT[2 * 4 + 0] * src[2 * line];
334
0
    E[1] = iT[0 * 4 + 1] * src[   0] + iT[2 * 4 + 1] * src[2 * line];
335
336
    /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
337
0
#if ENABLE_SIMD_TRAFO
338
0
    dst[0] = E[0] + O[0];
339
0
    dst[1] = E[1] + O[1];
340
0
    dst[2] = E[1] - O[1];
341
0
    dst[3] = E[0] - O[0];
342
#else
343
    dst[0] = Clip3( outputMinimum, outputMaximum, ( E[0] + O[0] + add ) >> shift );
344
    dst[1] = Clip3( outputMinimum, outputMaximum, ( E[1] + O[1] + add ) >> shift );
345
    dst[2] = Clip3( outputMinimum, outputMaximum, ( E[1] - O[1] + add ) >> shift );
346
    dst[3] = Clip3( outputMinimum, outputMaximum, ( E[0] - O[0] + add ) >> shift );
347
#endif
348
349
0
    src++;
350
0
    dst += 4;
351
0
  }
352
353
0
#if ENABLE_SIMD_TRAFO
354
0
  g_tCoeffOps.roundClip4( orgDst, 4, reducedLine, 4, outputMinimum, outputMaximum, add, shift );
355
356
0
#endif
357
0
  if( iSkipLine )
358
0
  {
359
0
    memset( dst, 0, ( iSkipLine << 2 ) * sizeof( TCoeff ) );
360
0
  }
361
0
#endif
362
0
}
363
364
365
366
template< int uiTrSize >
367
inline void _fastForwardMM( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TMatrixCoeff* tc )
368
0
{
369
#if !ENABLE_SIMD_TRAFO
370
  const int  rnd_factor  = 1 << (shift - 1);
371
#endif
372
0
  const int  reducedLine = line - iSkipLine;
373
0
  const int  cutoff      = uiTrSize - iSkipLine2;
374
0
  TCoeff *pCoef;
375
376
0
#if ENABLE_SIMD_TRAFO
377
0
  if( line == 1 )
378
0
  {
379
0
    g_tCoeffOps.fastFwdCore_1D[Log2( uiTrSize ) - 2]( tc, src, dst, line, reducedLine, cutoff, shift );
380
0
  }
381
0
  else
382
0
  {
383
0
    g_tCoeffOps.fastFwdCore_2D[Log2( uiTrSize ) - 2]( tc, src, dst, line, reducedLine, cutoff, shift );
384
0
  }
385
#else
386
  for( int i = 0; i<reducedLine; i++ )
387
  {
388
    pCoef = dst;
389
    const TMatrixCoeff* iT = tc;
390
    for( int j = 0; j<cutoff; j++ )
391
    {
392
      int iSum = 0;
393
      for( int k = 0; k<uiTrSize; k++ )
394
      {
395
        // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]
396
        iSum += src[k] * iT[k];
397
      }
398
      pCoef[i] = (iSum + rnd_factor) >> shift;
399
      pCoef += line;
400
      iT += uiTrSize;
401
    }
402
    src += uiTrSize;
403
  }
404
#endif
405
406
0
  if( iSkipLine )
407
0
  {
408
0
    pCoef = dst + reducedLine;
409
0
    for( int j = 0; j<cutoff; j++ )
410
0
    {
411
0
      memset(pCoef, 0, sizeof(TCoeff) * iSkipLine);
412
0
      pCoef += line;
413
0
    }
414
0
  }
415
416
0
  if( iSkipLine2 )
417
0
  {
418
0
    pCoef = dst + line*cutoff;
419
0
    memset(pCoef, 0, sizeof(TCoeff) * line * iSkipLine2);
420
0
  }
421
0
}
Unexecuted instantiation: void vvenc::_fastForwardMM<8>(int const*, int*, int, int, int, int, short const*)
Unexecuted instantiation: void vvenc::_fastForwardMM<16>(int const*, int*, int, int, int, int, short const*)
Unexecuted instantiation: void vvenc::_fastForwardMM<32>(int const*, int*, int, int, int, int, short const*)
Unexecuted instantiation: void vvenc::_fastForwardMM<64>(int const*, int*, int, int, int, int, short const*)
422
423
424
425
/** 8x8 forward transform implemented using partial butterfly structure (1D)
426
*  \param src   input data (residual)
427
*  \param dst   output data (transform coefficients)
428
*  \param shift specifies right shift after 1D transform
429
*  \param line
430
*/
431
void fastForwardDCT2_B8( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2 )
432
0
{
433
#if !JVET_M0497_MATRIX_MULT
434
  int j, k;
435
  TCoeff E[4], O[4];
436
  TCoeff EE[2], EO[2];
437
  TCoeff add = ( shift > 0 ) ? ( 1 << ( shift - 1 ) ) : 0;
438
439
  const TMatrixCoeff *iT = g_trCoreDCT2P8[TRANSFORM_FORWARD][0];
440
441
  TCoeff *pCoef = dst;
442
  const int  reducedLine = line - iSkipLine;
443
  for( j = 0; j < reducedLine; j++ )
444
  {
445
    /* E and O*/
446
    for( k = 0; k < 4; k++ )
447
    {
448
      E[k] = src[k] + src[7 - k];
449
      O[k] = src[k] - src[7 - k];
450
    }
451
    /* EE and EO */
452
    EE[0] = E[0] + E[3];
453
    EO[0] = E[0] - E[3];
454
    EE[1] = E[1] + E[2];
455
    EO[1] = E[1] - E[2];
456
457
    dst[0       ] = (iT[ 0] * EE[0] + iT[ 1] * EE[1] + add) >> shift;
458
    dst[4 * line] = (iT[32] * EE[0] + iT[33] * EE[1] + add) >> shift;
459
    dst[2 * line] = (iT[16] * EO[0] + iT[17] * EO[1] + add) >> shift;
460
    dst[6 * line] = (iT[48] * EO[0] + iT[49] * EO[1] + add) >> shift;
461
462
    dst[    line] = (iT[ 8] * O[0] + iT[ 9] * O[1] + iT[10] * O[2] + iT[11] * O[3] + add) >> shift;
463
    dst[3 * line] = (iT[24] * O[0] + iT[25] * O[1] + iT[26] * O[2] + iT[27] * O[3] + add) >> shift;
464
    dst[5 * line] = (iT[40] * O[0] + iT[41] * O[1] + iT[42] * O[2] + iT[43] * O[3] + add) >> shift;
465
    dst[7 * line] = (iT[56] * O[0] + iT[57] * O[1] + iT[58] * O[2] + iT[59] * O[3] + add) >> shift;
466
467
    src += 8;
468
    dst++;
469
  }
470
  if( iSkipLine )
471
  {
472
    dst = pCoef + reducedLine;
473
    for( j = 0; j < 8; j++ )
474
    {
475
      memset( dst, 0, sizeof( TCoeff )*iSkipLine );
476
      dst += line;
477
    }
478
  }
479
#else
480
0
  _fastForwardMM< 8 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT2P8[TRANSFORM_FORWARD][0] );
481
0
#endif
482
0
}
483
484
/** 8x8 inverse transform implemented using partial butterfly structure (1D)
485
*  \param src   input data (transform coefficients)
486
*  \param dst   output data (residual)
487
*  \param shift specifies right shift after 1D transform
488
*  \param line
489
*  \param outputMinimum  minimum for clipping
490
*  \param outputMaximum  maximum for clipping
491
*/
492
void fastInverseDCT2_B8(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
493
0
{
494
#if 0
495
  const TMatrixCoeff *iT = g_trCoreDCT2P8[0];
496
497
  _fastInverseMM<8>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, iT );
498
#else
499
0
  int j, k;
500
0
  int E[4], O[4];
501
0
  int EE[2], EO[2];
502
0
  int add = 1 << (shift - 1);
503
504
0
  const TMatrixCoeff *iT = g_trCoreDCT2P8[TRANSFORM_INVERSE][0];
505
506
0
#if ENABLE_SIMD_TRAFO
507
0
  TCoeff *orgDst = dst;
508
509
0
#endif
510
0
  const int  reducedLine = line - iSkipLine;
511
0
  for( j = 0; j < reducedLine; j++ )
512
0
  {
513
    /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
514
0
    for( k = 0; k < 4; k++ )
515
0
    {
516
0
      O[k] = iT[1 * 8 + k] * src[line] + iT[3 * 8 + k] * src[3 * line] + iT[5 * 8 + k] * src[5 * line] + iT[7 * 8 + k] * src[7 * line];
517
0
    }
518
519
0
    EO[0] = iT[2 * 8 + 0] * src[2 * line] + iT[6 * 8 + 0] * src[6 * line];
520
0
    EO[1] = iT[2 * 8 + 1] * src[2 * line] + iT[6 * 8 + 1] * src[6 * line];
521
0
    EE[0] = iT[0 * 8 + 0] * src[0       ] + iT[4 * 8 + 0] * src[4 * line];
522
0
    EE[1] = iT[0 * 8 + 1] * src[0       ] + iT[4 * 8 + 1] * src[4 * line];
523
524
    /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
525
0
    E[0] = EE[0] + EO[0];
526
0
    E[3] = EE[0] - EO[0];
527
0
    E[1] = EE[1] + EO[1];
528
0
    E[2] = EE[1] - EO[1];
529
530
0
    for( k = 0; k < 4; k++ )
531
0
    {
532
0
#if ENABLE_SIMD_TRAFO
533
0
      dst[k    ] = E[    k] + O[    k];
534
0
      dst[k + 4] = E[3 - k] - O[3 - k];
535
#else
536
      dst[k    ] = Clip3( outputMinimum, outputMaximum, ( E[    k] + O[    k] + add ) >> shift );
537
      dst[k + 4] = Clip3( outputMinimum, outputMaximum, ( E[3 - k] - O[3 - k] + add ) >> shift );
538
#endif
539
0
    }
540
0
    src++;
541
0
    dst += 8;
542
0
  }
543
544
0
#if ENABLE_SIMD_TRAFO
545
0
  g_tCoeffOps.roundClip8( orgDst, 8, reducedLine, 8, outputMinimum, outputMaximum, add, shift );
546
547
0
#endif
548
0
  if( iSkipLine )
549
0
  {
550
0
    memset( dst, 0, ( iSkipLine << 3 ) * sizeof( TCoeff ) );
551
0
  }
552
0
#endif
553
0
}
554
555
556
/** 16x16 forward transform implemented using partial butterfly structure (1D)
557
*  \param src   input data (residual)
558
*  \param dst   output data (transform coefficients)
559
*  \param shift specifies right shift after 1D transform
560
*  \param line
561
*/
562
void fastForwardDCT2_B16(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
563
0
{
564
#if !JVET_M0497_MATRIX_MULT
565
  int j, k;
566
  TCoeff E  [8], O  [8];
567
  TCoeff EE [4], EO [4];
568
  TCoeff EEE[2], EEO[2];
569
  TCoeff add = ( shift > 0 ) ? ( 1 << ( shift - 1 ) ) : 0;
570
571
  const TMatrixCoeff *iT = g_trCoreDCT2P16[TRANSFORM_FORWARD][0];
572
573
  TCoeff *pCoef = dst;
574
  const int  reducedLine = line - iSkipLine;
575
  for( j = 0; j < reducedLine; j++ )
576
  {
577
    /* E and O*/
578
    for( k = 0; k < 8; k++ )
579
    {
580
      E[k] = src[k] + src[15 - k];
581
      O[k] = src[k] - src[15 - k];
582
    }
583
    /* EE and EO */
584
    for( k = 0; k < 4; k++ )
585
    {
586
      EE[k] = E[k] + E[7 - k];
587
      EO[k] = E[k] - E[7 - k];
588
    }
589
    /* EEE and EEO */
590
    EEE[0] = EE[0] + EE[3];
591
    EEO[0] = EE[0] - EE[3];
592
    EEE[1] = EE[1] + EE[2];
593
    EEO[1] = EE[1] - EE[2];
594
595
    dst[ 0       ] = ( iT[ 0     ] * EEE[0] + iT[          1] * EEE[1] + add ) >> shift;
596
    dst[ 8 * line] = ( iT[ 8 * 16] * EEE[0] + iT[ 8 * 16 + 1] * EEE[1] + add ) >> shift;
597
    dst[ 4 * line] = ( iT[ 4 * 16] * EEO[0] + iT[ 4 * 16 + 1] * EEO[1] + add ) >> shift;
598
    dst[12 * line] = ( iT[12 * 16] * EEO[0] + iT[12 * 16 + 1] * EEO[1] + add ) >> shift;
599
600
    for( k = 2; k < 16; k += 4 )
601
    {
602
      dst[k*line] = ( iT[k * 16] * EO[0] + iT[k * 16 + 1] * EO[1] + iT[k * 16 + 2] * EO[2] + iT[k * 16 + 3] * EO[3] + add ) >> shift;
603
    }
604
605
    for( k = 1; k < 16; k += 2 )
606
    {
607
      dst[k*line] = ( iT[k * 16    ] * O[0] + iT[k * 16 + 1] * O[1] + iT[k * 16 + 2] * O[2] + iT[k * 16 + 3] * O[3] +
608
                      iT[k * 16 + 4] * O[4] + iT[k * 16 + 5] * O[5] + iT[k * 16 + 6] * O[6] + iT[k * 16 + 7] * O[7] + add ) >> shift;
609
    }
610
611
    src += 16;
612
    dst++;
613
614
  }
615
  if( iSkipLine )
616
  {
617
    dst = pCoef + reducedLine;
618
    for( j = 0; j < 16; j++ )
619
    {
620
      memset( dst, 0, sizeof( TCoeff )*iSkipLine );
621
      dst += line;
622
    }
623
  }
624
#else
625
0
  _fastForwardMM< 16 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT2P16[TRANSFORM_FORWARD][0] );
626
0
#endif
627
0
}
628
629
/** 16x16 inverse transform implemented using partial butterfly structure (1D)
630
*  \param src            input data (transform coefficients)
631
*  \param dst            output data (residual)
632
*  \param shift          specifies right shift after 1D transform
633
*  \param line
634
*  \param outputMinimum  minimum for clipping
635
*  \param outputMaximum  maximum for clipping
636
*/
637
void fastInverseDCT2_B16( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum )
638
0
{
639
0
#if ENABLE_SIMD_TRAFO
640
0
  const TMatrixCoeff *iT = g_trCoreDCT2P16[TRANSFORM_INVERSE][0];
641
642
0
  _fastInverseMM<16>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, iT );
643
#else
644
  int j, k;
645
  int E  [8], O  [8];
646
  int EE [4], EO [4];
647
  int EEE[2], EEO[2];
648
  int add = 1 << ( shift - 1 );
649
650
  const TMatrixCoeff *iT = g_trCoreDCT2P16[TRANSFORM_INVERSE][0];
651
652
#if ENABLE_SIMD_TRAFO
653
  TCoeff *orgDst = dst;
654
655
#endif
656
  const int  reducedLine = line - iSkipLine;
657
658
  for( j = 0; j < reducedLine; j++ )
659
  {
660
    /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
661
    for( k = 0; k < 8; k++ )
662
    {
663
      O[k] = iT[1 * 16 + k] * src[    line] + iT[ 3 * 16 + k] * src[ 3 * line] + iT[ 5 * 16 + k] * src[ 5 * line] + iT[ 7 * 16 + k] * src[ 7 * line] +
664
        iT[9 * 16 + k] * src[9 * line] + iT[11 * 16 + k] * src[11 * line] + iT[13 * 16 + k] * src[13 * line] + iT[15 * 16 + k] * src[15 * line];
665
    }
666
    for( k = 0; k < 4; k++ )
667
    {
668
      EO[k] = iT[2 * 16 + k] * src[2 * line] + iT[6 * 16 + k] * src[6 * line] + iT[10 * 16 + k] * src[10 * line] + iT[14 * 16 + k] * src[14 * line];
669
    }
670
    EEO[0] = iT[4 * 16    ] * src[4 * line] + iT[12 * 16    ] * src[12 * line];
671
    EEE[0] = iT[0         ] * src[0       ] + iT[ 8 * 16    ] * src[ 8 * line];
672
    EEO[1] = iT[4 * 16 + 1] * src[4 * line] + iT[12 * 16 + 1] * src[12 * line];
673
    EEE[1] = iT[0 * 16 + 1] * src[0       ] + iT[ 8 * 16 + 1] * src[ 8 * line];
674
675
    /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
676
    for( k = 0; k < 2; k++ )
677
    {
678
      EE[k    ] = EEE[    k] + EEO[    k];
679
      EE[k + 2] = EEE[1 - k] - EEO[1 - k];
680
    }
681
    for( k = 0; k < 4; k++ )
682
    {
683
      E[k    ] = EE[    k] + EO[    k];
684
      E[k + 4] = EE[3 - k] - EO[3 - k];
685
    }
686
    for( k = 0; k < 8; k++ )
687
    {
688
#if ENABLE_SIMD_TRAFO
689
      dst[k    ] = E[    k] + O[    k];
690
      dst[k + 8] = E[7 - k] - O[7 - k];
691
#else
692
      dst[k    ] = Clip3( outputMinimum, outputMaximum, ( E[    k] + O[    k] + add ) >> shift );
693
      dst[k + 8] = Clip3( outputMinimum, outputMaximum, ( E[7 - k] - O[7 - k] + add ) >> shift );
694
#endif
695
    }
696
    src++;
697
    dst += 16;
698
  }
699
700
#if ENABLE_SIMD_TRAFO
701
  g_tCoeffOps.roundClip8( orgDst, 16, reducedLine, 16, outputMinimum, outputMaximum, add, shift );
702
703
#endif
704
  if( iSkipLine )
705
  {
706
    memset( dst, 0, ( iSkipLine << 4 ) * sizeof( TCoeff ) );
707
  }
708
#endif
709
0
}
710
711
712
713
/** 32x32 forward transform implemented using partial butterfly structure (1D)
714
*  \param src   input data (residual)
715
*  \param dst   output data (transform coefficients)
716
*  \param shift specifies right shift after 1D transform
717
*  \param line
718
*/
719
void fastForwardDCT2_B32( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2 )
720
0
{
721
#if !JVET_M0497_MATRIX_MULT
722
  int j, k;
723
  TCoeff E   [16], O   [16];
724
  TCoeff EE  [ 8], EO  [ 8];
725
  TCoeff EEE [ 4], EEO [ 4];
726
  TCoeff EEEE[ 2], EEEO[ 2];
727
  TCoeff add = ( shift > 0 ) ? ( 1 << ( shift - 1 ) ) : 0;
728
729
  const TMatrixCoeff *iT = g_trCoreDCT2P32[TRANSFORM_FORWARD][0];
730
731
  TCoeff *pCoef = dst;
732
  const int  reducedLine = line - iSkipLine;
733
  for (j = 0; j<reducedLine; j++)
734
  {
735
    /* E and O*/
736
    for (k = 0;k<16;k++)
737
    {
738
      E[k] = src[k] + src[31 - k];
739
      O[k] = src[k] - src[31 - k];
740
    }
741
    /* EE and EO */
742
    for (k = 0;k<8;k++)
743
    {
744
      EE[k] = E[k] + E[15 - k];
745
      EO[k] = E[k] - E[15 - k];
746
    }
747
    /* EEE and EEO */
748
    for (k = 0;k<4;k++)
749
    {
750
      EEE[k] = EE[k] + EE[7 - k];
751
      EEO[k] = EE[k] - EE[7 - k];
752
    }
753
    /* EEEE and EEEO */
754
    EEEE[0] = EEE[0] + EEE[3];
755
    EEEO[0] = EEE[0] - EEE[3];
756
    EEEE[1] = EEE[1] + EEE[2];
757
    EEEO[1] = EEE[1] - EEE[2];
758
759
    dst[0] = (iT[0 * 32 + 0] * EEEE[0] + iT[0 * 32 + 1] * EEEE[1] + add) >> shift;
760
    dst[16 * line] = (iT[16 * 32 + 0] * EEEE[0] + iT[16 * 32 + 1] * EEEE[1] + add) >> shift;
761
    dst[8 * line] = (iT[8 * 32 + 0] * EEEO[0] + iT[8 * 32 + 1] * EEEO[1] + add) >> shift;
762
    dst[24 * line] = (iT[24 * 32 + 0] * EEEO[0] + iT[24 * 32 + 1] * EEEO[1] + add) >> shift;
763
    for (k = 4;k<32;k += 8)
764
    {
765
      dst[k*line] = (iT[k * 32 + 0] * EEO[0] + iT[k * 32 + 1] * EEO[1] + iT[k * 32 + 2] * EEO[2] + iT[k * 32 + 3] * EEO[3] + add) >> shift;
766
    }
767
    for (k = 2;k<32;k += 4)
768
    {
769
      dst[k*line] = (iT[k * 32 + 0] * EO[0] + iT[k * 32 + 1] * EO[1] + iT[k * 32 + 2] * EO[2] + iT[k * 32 + 3] * EO[3] +
770
                      iT[k * 32 + 4] * EO[4] + iT[k * 32 + 5] * EO[5] + iT[k * 32 + 6] * EO[6] + iT[k * 32 + 7] * EO[7] + add) >> shift;
771
    }
772
    for (k = 1;k<32;k += 2)
773
    {
774
      dst[k*line] = (iT[k * 32 + 0] * O[0] + iT[k * 32 + 1] * O[1] + iT[k * 32 + 2] * O[2] + iT[k * 32 + 3] * O[3] +
775
                      iT[k * 32 + 4] * O[4] + iT[k * 32 + 5] * O[5] + iT[k * 32 + 6] * O[6] + iT[k * 32 + 7] * O[7] +
776
                      iT[k * 32 + 8] * O[8] + iT[k * 32 + 9] * O[9] + iT[k * 32 + 10] * O[10] + iT[k * 32 + 11] * O[11] +
777
                      iT[k * 32 + 12] * O[12] + iT[k * 32 + 13] * O[13] + iT[k * 32 + 14] * O[14] + iT[k * 32 + 15] * O[15] + add) >> shift;
778
    }
779
    src += 32;
780
    dst++;
781
  }
782
  if (iSkipLine)
783
  {
784
    dst = pCoef + reducedLine;
785
    for (j = 0; j<32; j++)
786
    {
787
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
788
      dst += line;
789
    }
790
  }
791
#else
792
0
  _fastForwardMM< 32 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT2P32[TRANSFORM_FORWARD][0] );
793
0
#endif
794
0
}
795
796
/** 32x32 inverse transform implemented using partial butterfly structure (1D)
797
*  \param src   input data (transform coefficients)
798
*  \param dst   output data (residual)
799
*  \param shift specifies right shift after 1D transform
800
*  \param line
801
*  \param outputMinimum  minimum for clipping
802
*  \param outputMaximum  maximum for clipping
803
*/
804
void fastInverseDCT2_B32(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
805
0
{
806
0
#if ENABLE_SIMD_TRAFO
807
0
  const TMatrixCoeff *iT = g_trCoreDCT2P32[TRANSFORM_INVERSE][0];
808
809
0
  _fastInverseMM<32>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, iT );
810
#else
811
  int j, k;
812
  int E[16], O[16];
813
  int EE[8], EO[8];
814
  int EEE[4], EEO[4];
815
  int EEEE[2], EEEO[2];
816
  int add = 1 << (shift - 1);
817
818
  const TMatrixCoeff *iT = g_trCoreDCT2P32[TRANSFORM_INVERSE][0];
819
820
#if ENABLE_SIMD_TRAFO
821
  TCoeff *orgDst = dst;
822
823
#endif
824
  const int  reducedLine = line - iSkipLine;
825
  for (j = 0; j<reducedLine; j++)
826
  {
827
    /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
828
    for (k = 0;k<16;k++)
829
    {
830
      O[k] = iT[1 * 32 + k] * src[line] + iT[3 * 32 + k] * src[3 * line] + iT[5 * 32 + k] * src[5 * line] + iT[7 * 32 + k] * src[7 * line] +
831
        iT[9 * 32 + k] * src[9 * line] + iT[11 * 32 + k] * src[11 * line] + iT[13 * 32 + k] * src[13 * line] + iT[15 * 32 + k] * src[15 * line] +
832
        iT[17 * 32 + k] * src[17 * line] + iT[19 * 32 + k] * src[19 * line] + iT[21 * 32 + k] * src[21 * line] + iT[23 * 32 + k] * src[23 * line] +
833
        iT[25 * 32 + k] * src[25 * line] + iT[27 * 32 + k] * src[27 * line] + iT[29 * 32 + k] * src[29 * line] + iT[31 * 32 + k] * src[31 * line];
834
    }
835
    for (k = 0;k<8;k++)
836
    {
837
      EO[k] = iT[2 * 32 + k] * src[2 * line] + iT[6 * 32 + k] * src[6 * line] + iT[10 * 32 + k] * src[10 * line] + iT[14 * 32 + k] * src[14 * line] +
838
        iT[18 * 32 + k] * src[18 * line] + iT[22 * 32 + k] * src[22 * line] + iT[26 * 32 + k] * src[26 * line] + iT[30 * 32 + k] * src[30 * line];
839
    }
840
    for (k = 0;k<4;k++)
841
    {
842
      EEO[k] = iT[4 * 32 + k] * src[4 * line] + iT[12 * 32 + k] * src[12 * line] + iT[20 * 32 + k] * src[20 * line] + iT[28 * 32 + k] * src[28 * line];
843
    }
844
    EEEO[0] = iT[8 * 32 + 0] * src[8 * line] + iT[24 * 32 + 0] * src[24 * line];
845
    EEEO[1] = iT[8 * 32 + 1] * src[8 * line] + iT[24 * 32 + 1] * src[24 * line];
846
    EEEE[0] = iT[0 * 32 + 0] * src[0] + iT[16 * 32 + 0] * src[16 * line];
847
    EEEE[1] = iT[0 * 32 + 1] * src[0] + iT[16 * 32 + 1] * src[16 * line];
848
849
    /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
850
    EEE[0] = EEEE[0] + EEEO[0];
851
    EEE[3] = EEEE[0] - EEEO[0];
852
    EEE[1] = EEEE[1] + EEEO[1];
853
    EEE[2] = EEEE[1] - EEEO[1];
854
    for (k = 0;k<4;k++)
855
    {
856
      EE[k] = EEE[k] + EEO[k];
857
      EE[k + 4] = EEE[3 - k] - EEO[3 - k];
858
    }
859
    for (k = 0;k<8;k++)
860
    {
861
      E[k] = EE[k] + EO[k];
862
      E[k + 8] = EE[7 - k] - EO[7 - k];
863
    }
864
    for (k = 0;k<16;k++)
865
    {
866
#if ENABLE_SIMD_TRAFO
867
      dst[k     ] = E[k     ] + O[k     ];
868
      dst[k + 16] = E[15 - k] - O[15 - k];
869
#else
870
      dst[k] = Clip3(outputMinimum, outputMaximum, (E[k] + O[k] + add) >> shift);
871
      dst[k + 16] = Clip3(outputMinimum, outputMaximum, (E[15 - k] - O[15 - k] + add) >> shift);
872
#endif
873
    }
874
    src++;
875
    dst += 32;
876
  }
877
878
#if ENABLE_SIMD_TRAFO
879
  g_tCoeffOps.roundClip8( orgDst, 32, reducedLine, 32, outputMinimum, outputMaximum, add, shift );
880
881
#endif
882
  if (iSkipLine)
883
  {
884
    memset(dst, 0, (iSkipLine << 5) * sizeof(TCoeff));
885
  }
886
#endif
887
0
}
888
889
void fastForwardDCT2_B64(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
890
0
{
891
#if !JVET_M0497_MATRIX_MULT
892
  int rnd_factor = 1 << (shift - 1);
893
894
  const int uiTrSize = 64;
895
  const TMatrixCoeff *iT = g_trCoreDCT2P64[TRANSFORM_FORWARD][0];
896
897
  int   j, k;
898
  TCoeff E[32], O[32];
899
  TCoeff EE[16], EO[16];
900
  TCoeff EEE[8], EEO[8];
901
  TCoeff EEEE[4], EEEO[4];
902
  TCoeff EEEEE[2], EEEEO[2];
903
  TCoeff *tmp = dst;
904
905
  //bool zo = iSkipLine2 >= 32;
906
  bool zo = iSkipLine2 != 0;
907
  for (j = 0; j<line - iSkipLine; j++)
908
  {
909
    /* E and O*/
910
    for (k = 0;k<32;k++)
911
    {
912
      E[k] = src[k] + src[63 - k];
913
      O[k] = src[k] - src[63 - k];
914
    }
915
    /* EE and EO */
916
    for (k = 0;k<16;k++)
917
    {
918
      EE[k] = E[k] + E[31 - k];
919
      EO[k] = E[k] - E[31 - k];
920
    }
921
    /* EEE and EEO */
922
    for (k = 0;k<8;k++)
923
    {
924
      EEE[k] = EE[k] + EE[15 - k];
925
      EEO[k] = EE[k] - EE[15 - k];
926
    }
927
    /* EEEE and EEEO */
928
    for (k = 0;k<4;k++)
929
    {
930
      EEEE[k] = EEE[k] + EEE[7 - k];
931
      EEEO[k] = EEE[k] - EEE[7 - k];
932
    }
933
    /* EEEEE and EEEEO */
934
    EEEEE[0] = EEEE[0] + EEEE[3];
935
    EEEEO[0] = EEEE[0] - EEEE[3];
936
    EEEEE[1] = EEEE[1] + EEEE[2];
937
    EEEEO[1] = EEEE[1] - EEEE[2];
938
939
    dst[0] = (iT[0 * 64 + 0] * EEEEE[0] + iT[0 * 64 + 1] * EEEEE[1] + rnd_factor) >> shift;
940
    dst[16 * line] = (iT[16 * 64 + 0] * EEEEO[0] + iT[16 * 64 + 1] * EEEEO[1] + rnd_factor) >> shift;
941
942
    if (!zo)
943
    {
944
      dst[32 * line] = (iT[32 * 64 + 0] * EEEEE[0] + iT[32 * 64 + 1] * EEEEE[1] + rnd_factor) >> shift;
945
      dst[48 * line] = (iT[48 * 64 + 0] * EEEEO[0] + iT[48 * 64 + 1] * EEEEO[1] + rnd_factor) >> shift;
946
    }
947
    for (k = 8;k<(zo ? 32 : 64);k += 16)
948
    {
949
      dst[k*line] = (iT[k * 64 + 0] * EEEO[0] + iT[k * 64 + 1] * EEEO[1] + iT[k * 64 + 2] * EEEO[2] + iT[k * 64 + 3] * EEEO[3] + rnd_factor) >> shift;
950
    }
951
    for (k = 4;k<(zo ? 32 : 64);k += 8)
952
    {
953
      dst[k*line] = (iT[k * 64 + 0] * EEO[0] + iT[k * 64 + 1] * EEO[1] + iT[k * 64 + 2] * EEO[2] + iT[k * 64 + 3] * EEO[3] +
954
                      iT[k * 64 + 4] * EEO[4] + iT[k * 64 + 5] * EEO[5] + iT[k * 64 + 6] * EEO[6] + iT[k * 64 + 7] * EEO[7] + rnd_factor) >> shift;
955
    }
956
    for (k = 2;k<(zo ? 32 : 64);k += 4)
957
    {
958
      dst[k*line] = (iT[k * 64 + 0] * EO[0] + iT[k * 64 + 1] * EO[1] + iT[k * 64 + 2] * EO[2] + iT[k * 64 + 3] * EO[3] +
959
                      iT[k * 64 + 4] * EO[4] + iT[k * 64 + 5] * EO[5] + iT[k * 64 + 6] * EO[6] + iT[k * 64 + 7] * EO[7] +
960
                      iT[k * 64 + 8] * EO[8] + iT[k * 64 + 9] * EO[9] + iT[k * 64 + 10] * EO[10] + iT[k * 64 + 11] * EO[11] +
961
                      iT[k * 64 + 12] * EO[12] + iT[k * 64 + 13] * EO[13] + iT[k * 64 + 14] * EO[14] + iT[k * 64 + 15] * EO[15] + rnd_factor) >> shift;
962
    }
963
    for (k = 1;k<(zo ? 32 : 64);k += 2)
964
    {
965
      dst[k*line] = (iT[k * 64 + 0] * O[0] + iT[k * 64 + 1] * O[1] + iT[k * 64 + 2] * O[2] + iT[k * 64 + 3] * O[3] +
966
                      iT[k * 64 + 4] * O[4] + iT[k * 64 + 5] * O[5] + iT[k * 64 + 6] * O[6] + iT[k * 64 + 7] * O[7] +
967
                      iT[k * 64 + 8] * O[8] + iT[k * 64 + 9] * O[9] + iT[k * 64 + 10] * O[10] + iT[k * 64 + 11] * O[11] +
968
                      iT[k * 64 + 12] * O[12] + iT[k * 64 + 13] * O[13] + iT[k * 64 + 14] * O[14] + iT[k * 64 + 15] * O[15] +
969
                      iT[k * 64 + 16] * O[16] + iT[k * 64 + 17] * O[17] + iT[k * 64 + 18] * O[18] + iT[k * 64 + 19] * O[19] +
970
                      iT[k * 64 + 20] * O[20] + iT[k * 64 + 21] * O[21] + iT[k * 64 + 22] * O[22] + iT[k * 64 + 23] * O[23] +
971
                      iT[k * 64 + 24] * O[24] + iT[k * 64 + 25] * O[25] + iT[k * 64 + 26] * O[26] + iT[k * 64 + 27] * O[27] +
972
                      iT[k * 64 + 28] * O[28] + iT[k * 64 + 29] * O[29] + iT[k * 64 + 30] * O[30] + iT[k * 64 + 31] * O[31] + rnd_factor) >> shift;
973
    }
974
    src += uiTrSize;
975
    dst++;
976
  }
977
978
  const int  reducedLine = line - iSkipLine;
979
  const int  cutoff = uiTrSize - iSkipLine2;
980
  if (iSkipLine)
981
  {
982
    dst = tmp + reducedLine;
983
    for (j = 0; j<cutoff; j++)
984
    {
985
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
986
      dst += line;
987
    }
988
  }
989
  if (iSkipLine2)
990
  {
991
    dst = tmp + line*cutoff;
992
    memset(dst, 0, sizeof(TCoeff)*line*iSkipLine2);
993
  }
994
#else
995
0
  _fastForwardMM< 64 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT2P64[TRANSFORM_FORWARD][0] );
996
0
#endif
997
0
}
998
999
void fastInverseDCT2_B64(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1000
0
{
1001
0
#if ENABLE_SIMD_TRAFO
1002
0
  const TMatrixCoeff *iT = g_trCoreDCT2P64[TRANSFORM_INVERSE][0];
1003
1004
0
  _fastInverseMM<64>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, iT );
1005
#else
1006
  int rnd_factor = 1 << (shift - 1);
1007
  const int uiTrSize = 64;
1008
  const TMatrixCoeff *iT = g_trCoreDCT2P64[TRANSFORM_INVERSE][0];
1009
1010
#if ENABLE_SIMD_TRAFO
1011
  TCoeff *orgDst = dst;
1012
1013
#endif
1014
  int    j, k;
1015
  TCoeff E[32], O[32];
1016
  TCoeff EE[16], EO[16];
1017
  TCoeff EEE[8], EEO[8];
1018
  TCoeff EEEE[4], EEEO[4];
1019
  TCoeff EEEEE[2], EEEEO[2];
1020
  bool zo = iSkipLine2 >= 32;
1021
  for (j = 0; j<line - iSkipLine; j++)
1022
  {
1023
    /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
1024
    for (k = 0;k<32;k++)
1025
    {
1026
      O[k] = iT[1 * 64 + k] * src[line] + iT[3 * 64 + k] * src[3 * line] + iT[5 * 64 + k] * src[5 * line] + iT[7 * 64 + k] * src[7 * line] +
1027
        iT[9 * 64 + k] * src[9 * line] + iT[11 * 64 + k] * src[11 * line] + iT[13 * 64 + k] * src[13 * line] + iT[15 * 64 + k] * src[15 * line] +
1028
        iT[17 * 64 + k] * src[17 * line] + iT[19 * 64 + k] * src[19 * line] + iT[21 * 64 + k] * src[21 * line] + iT[23 * 64 + k] * src[23 * line] +
1029
        iT[25 * 64 + k] * src[25 * line] + iT[27 * 64 + k] * src[27 * line] + iT[29 * 64 + k] * src[29 * line] + iT[31 * 64 + k] * src[31 * line] +
1030
        (zo ? 0 : (
1031
        iT[33 * 64 + k] * src[33 * line] + iT[35 * 64 + k] * src[35 * line] + iT[37 * 64 + k] * src[37 * line] + iT[39 * 64 + k] * src[39 * line] +
1032
        iT[41 * 64 + k] * src[41 * line] + iT[43 * 64 + k] * src[43 * line] + iT[45 * 64 + k] * src[45 * line] + iT[47 * 64 + k] * src[47 * line] +
1033
        iT[49 * 64 + k] * src[49 * line] + iT[51 * 64 + k] * src[51 * line] + iT[53 * 64 + k] * src[53 * line] + iT[55 * 64 + k] * src[55 * line] +
1034
        iT[57 * 64 + k] * src[57 * line] + iT[59 * 64 + k] * src[59 * line] + iT[61 * 64 + k] * src[61 * line] + iT[63 * 64 + k] * src[63 * line]));
1035
    }
1036
    for (k = 0;k<16;k++)
1037
    {
1038
      EO[k] = iT[2 * 64 + k] * src[2 * line] + iT[6 * 64 + k] * src[6 * line] + iT[10 * 64 + k] * src[10 * line] + iT[14 * 64 + k] * src[14 * line] +
1039
        iT[18 * 64 + k] * src[18 * line] + iT[22 * 64 + k] * src[22 * line] + iT[26 * 64 + k] * src[26 * line] + iT[30 * 64 + k] * src[30 * line] +
1040
        (zo ? 0 : (
1041
        iT[34 * 64 + k] * src[34 * line] + iT[38 * 64 + k] * src[38 * line] + iT[42 * 64 + k] * src[42 * line] + iT[46 * 64 + k] * src[46 * line] +
1042
        iT[50 * 64 + k] * src[50 * line] + iT[54 * 64 + k] * src[54 * line] + iT[58 * 64 + k] * src[58 * line] + iT[62 * 64 + k] * src[62 * line]));
1043
    }
1044
    for (k = 0;k<8;k++)
1045
    {
1046
      EEO[k] = iT[4 * 64 + k] * src[4 * line] + iT[12 * 64 + k] * src[12 * line] + iT[20 * 64 + k] * src[20 * line] + iT[28 * 64 + k] * src[28 * line] +
1047
        (zo ? 0 : (
1048
        iT[36 * 64 + k] * src[36 * line] + iT[44 * 64 + k] * src[44 * line] + iT[52 * 64 + k] * src[52 * line] + iT[60 * 64 + k] * src[60 * line]));
1049
    }
1050
    for (k = 0;k<4;k++)
1051
    {
1052
      EEEO[k] = iT[8 * 64 + k] * src[8 * line] + iT[24 * 64 + k] * src[24 * line] + (zo ? 0 : (iT[40 * 64 + k] * src[40 * line] + iT[56 * 64 + k] * src[56 * line]));
1053
    }
1054
    EEEEO[0] = iT[16 * 64 + 0] * src[16 * line] + (zo ? 0 : iT[48 * 64 + 0] * src[48 * line]);
1055
    EEEEO[1] = iT[16 * 64 + 1] * src[16 * line] + (zo ? 0 : iT[48 * 64 + 1] * src[48 * line]);
1056
    EEEEE[0] = iT[0 * 64 + 0] * src[0] + (zo ? 0 : iT[32 * 64 + 0] * src[32 * line]);
1057
    EEEEE[1] = iT[0 * 64 + 1] * src[0] + (zo ? 0 : iT[32 * 64 + 1] * src[32 * line]);
1058
1059
    /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
1060
    for (k = 0;k<2;k++)
1061
    {
1062
      EEEE[k] = EEEEE[k] + EEEEO[k];
1063
      EEEE[k + 2] = EEEEE[1 - k] - EEEEO[1 - k];
1064
    }
1065
    for (k = 0;k<4;k++)
1066
    {
1067
      EEE[k] = EEEE[k] + EEEO[k];
1068
      EEE[k + 4] = EEEE[3 - k] - EEEO[3 - k];
1069
    }
1070
    for (k = 0;k<8;k++)
1071
    {
1072
      EE[k] = EEE[k] + EEO[k];
1073
      EE[k + 8] = EEE[7 - k] - EEO[7 - k];
1074
    }
1075
    for (k = 0;k<16;k++)
1076
    {
1077
      E[k] = EE[k] + EO[k];
1078
      E[k + 16] = EE[15 - k] - EO[15 - k];
1079
    }
1080
    for (k = 0;k<32;k++)
1081
    {
1082
#if ENABLE_SIMD_TRAFO
1083
      dst[k]      = E[k] + O[k];
1084
      dst[k + 32] = E[31 - k] - O[31 - k];
1085
#else
1086
      dst[k]      = Clip3( outputMinimum, outputMaximum, ( E[k] + O[k] + rnd_factor ) >> shift );
1087
      dst[k + 32] = Clip3( outputMinimum, outputMaximum, ( E[31 - k] - O[31 - k] + rnd_factor ) >> shift );
1088
#endif
1089
    }
1090
    src++;
1091
    dst += uiTrSize;
1092
  }
1093
1094
#if ENABLE_SIMD_TRAFO
1095
  g_tCoeffOps.roundClip8( orgDst, 32, line - iSkipLine, 32, outputMinimum, outputMaximum, rnd_factor, shift );
1096
1097
1098
#endif
1099
  memset( dst, 0, uiTrSize*iSkipLine * sizeof( TCoeff ) );
1100
#endif
1101
0
}
1102
1103
1104
1105
// ********************************** DST-VII **********************************
1106
void fastForwardDST7_B4(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
1107
0
{
1108
0
  int i;
1109
0
  TCoeff rnd_factor = (shift > 0) ? (1 << (shift - 1)) : 0;
1110
1111
0
  const TMatrixCoeff *iT = g_trCoreDST7P4[TRANSFORM_FORWARD][0];
1112
1113
0
  int c[4];
1114
0
  TCoeff *pCoeff = dst;
1115
0
  const int  reducedLine = line - iSkipLine;
1116
0
  for (i = 0; i<reducedLine; i++)
1117
0
  {
1118
    // Intermediate Variables
1119
0
    c[0] = src[0] + src[3];
1120
0
    c[1] = src[1] + src[3];
1121
0
    c[2] = src[0] - src[1];
1122
0
    c[3] = iT[2] * src[2];
1123
1124
0
    dst[0 * line] = (iT[0] * c[0] + iT[1] * c[1] + c[3] + rnd_factor) >> shift;
1125
0
    dst[1 * line] = (iT[2] * (src[0] + src[1] - src[3]) + rnd_factor) >> shift;
1126
0
    dst[2 * line] = (iT[0] * c[2] + iT[1] * c[0] - c[3] + rnd_factor) >> shift;
1127
0
    dst[3 * line] = (iT[1] * c[2] - iT[0] * c[1] + c[3] + rnd_factor) >> shift;
1128
1129
0
    src += 4;
1130
0
    dst++;
1131
0
  }
1132
0
  if (iSkipLine)
1133
0
  {
1134
0
    dst = pCoeff + reducedLine;
1135
0
    for (i = 0; i<4; i++)
1136
0
    {
1137
0
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
1138
0
      dst += line;
1139
0
    }
1140
0
  }
1141
0
}
1142
1143
void fastInverseDST7_B4(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1144
0
{
1145
0
#if ENABLE_SIMD_TRAFO
1146
0
  _fastInverseMM<4>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDST7P4[TRANSFORM_INVERSE][0] );
1147
#else
1148
  int i;
1149
  TCoeff c[4];
1150
  TCoeff rnd_factor = (shift > 0) ? (1 << (shift - 1)) : 0;
1151
1152
  const TMatrixCoeff *iT = g_trCoreDST7P4[TRANSFORM_INVERSE][0];
1153
1154
  const int  reducedLine = line - iSkipLine;
1155
  for (i = 0; i<reducedLine; i++)
1156
  {
1157
    // Intermediate Variables
1158
    c[0] = src[0 * line] + src[2 * line];
1159
    c[1] = src[2 * line] + src[3 * line];
1160
    c[2] = src[0 * line] - src[3 * line];
1161
    c[3] = iT[2] * src[1 * line];
1162
1163
    dst[0] = Clip3(outputMinimum, outputMaximum, (iT[0] * c[0] + iT[1] * c[1] + c[3] + rnd_factor) >> shift);
1164
    dst[1] = Clip3(outputMinimum, outputMaximum, (iT[1] * c[2] - iT[0] * c[1] + c[3] + rnd_factor) >> shift);
1165
    dst[2] = Clip3(outputMinimum, outputMaximum, (iT[2] * (src[0 * line] - src[2 * line] + src[3 * line]) + rnd_factor) >> shift);
1166
    dst[3] = Clip3(outputMinimum, outputMaximum, (iT[1] * c[0] + iT[0] * c[2] - c[3] + rnd_factor) >> shift);
1167
1168
    dst += 4;
1169
    src++;
1170
  }
1171
  if (iSkipLine)
1172
  {
1173
    memset(dst, 0, (iSkipLine << 2) * sizeof(TCoeff));
1174
  }
1175
#endif
1176
0
}
1177
1178
1179
void fastForwardDST7_B8(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
1180
0
{
1181
0
  _fastForwardMM< 8 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDST7P8[TRANSFORM_FORWARD][0] );
1182
0
}
1183
1184
1185
void fastInverseDST7_B8(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1186
0
{
1187
0
  _fastInverseMM< 8 >( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDST7P8[TRANSFORM_INVERSE][0]);
1188
0
}
1189
1190
1191
void fastForwardDST7_B16(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
1192
0
{
1193
#if !JVET_M0497_MATRIX_MULT
1194
  int j, k;
1195
  TCoeff a[5], b[5], c[5], d[5], t;
1196
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
1197
1198
  const TMatrixCoeff *iT = g_trCoreDST7P16[TRANSFORM_FORWARD][0];
1199
1200
  TCoeff *pCoef = dst;
1201
  const int  reducedLine = line - iSkipLine;
1202
  const int  cutoff = 16 - iSkipLine2;
1203
1204
  for (j = 0; j < reducedLine; j++)
1205
  {
1206
    for (k = 0; k < 5; k++)
1207
    {
1208
      a[k] = src[    k] + src[11 + k];
1209
      b[k] = src[9 - k] + src[11 + k];
1210
      c[k] = src[    k] - src[ 9 - k];
1211
      d[k] = src[    k] + src[ 9 - k] - src[11 + k];
1212
    }
1213
1214
    t = iT[10] * src[10];
1215
1216
    dst[ 1 * line] = ( iT[ 2]*d[0] + iT[ 5]*d[1] + iT[ 8]*d[2] + iT[11]*d[3] + iT[14]*d[4] + add) >> shift;
1217
    dst[ 4 * line] = ( iT[ 8]*d[0] + iT[14]*d[1] + iT[ 5]*d[2] - iT[ 2]*d[3] - iT[11]*d[4] + add) >> shift;
1218
    dst[ 7 * line] = ( iT[14]*d[0] + iT[ 2]*d[1] - iT[11]*d[2] - iT[ 5]*d[3] + iT[ 8]*d[4] + add) >> shift;
1219
    dst[10 * line] = ( iT[11]*d[0] - iT[ 8]*d[1] - iT[ 2]*d[2] + iT[14]*d[3] - iT[ 5]*d[4] + add) >> shift;
1220
    dst[13 * line] = ( iT[ 5]*d[0] - iT[11]*d[1] + iT[14]*d[2] - iT[ 8]*d[3] + iT[ 2]*d[4] + add) >> shift;
1221
1222
    dst[5 * line] = ( iT[10] * (src[0] + src[1] - src[3] - src[4] + src[6] + src[7] - src[9] - src[10] + src[12] + src[13] - src[15]) + add) >> shift;
1223
1224
    dst[ 0 * line] = ( iT[0]*a[0] + iT[9]*b[0] + iT[1]*a[1] + iT[8]*b[1] + iT[2]*a[2] + iT[7]*b[2] + iT[3]*a[3] + iT[6]*b[3] + iT[4]*a[4] + iT[5]*b[4] + t + add ) >> shift;
1225
    dst[ 2 * line] = ( iT[4]*c[0] - iT[5]*b[0] + iT[9]*c[1] - iT[0]*b[1] + iT[6]*c[2] + iT[3]*a[2] + iT[1]*c[3] + iT[8]*a[3] + iT[7]*a[4] + iT[2]*b[4] - t + add ) >> shift;
1226
    dst[ 3 * line] = ( iT[6]*a[0] + iT[3]*b[0] + iT[2]*c[1] + iT[7]*a[1] + iT[9]*c[2] + iT[0]*a[2] + iT[4]*c[3] - iT[5]*b[3] - iT[1]*a[4] - iT[8]*b[4] + t + add ) >> shift;
1227
    dst[ 6 * line] = ( iT[8]*a[0] + iT[1]*c[0] + iT[6]*c[1] - iT[3]*b[1] - iT[5]*a[2] - iT[4]*b[2] - iT[7]*c[3] - iT[2]*a[3] - iT[0]*c[4] + iT[9]*b[4] + t + add ) >> shift;
1228
    dst[ 8 * line] = ( iT[4]*c[0] + iT[5]*a[0] - iT[0]*c[1] + iT[9]*b[1] - iT[3]*c[2] - iT[6]*a[2] + iT[1]*c[3] - iT[8]*b[3] + iT[2]*c[4] + iT[7]*a[4] - t + add ) >> shift;
1229
    dst[ 9 * line] = ( iT[7]*c[0] + iT[2]*a[0] - iT[4]*a[1] - iT[5]*b[1] - iT[8]*c[2] + iT[1]*b[2] + iT[9]*a[3] + iT[0]*b[3] + iT[3]*c[4] - iT[6]*b[4] + t + add ) >> shift;
1230
    dst[11 * line] = ( iT[9]*a[0] + iT[0]*b[0] - iT[8]*c[1] - iT[1]*a[1] + iT[2]*c[2] - iT[7]*b[2] + iT[6]*a[3] + iT[3]*b[3] - iT[5]*c[4] - iT[4]*a[4] - t + add ) >> shift;
1231
    dst[12 * line] = ( iT[7]*c[0] - iT[2]*b[0] - iT[5]*c[1] - iT[4]*a[1] + iT[8]*a[2] + iT[1]*b[2] - iT[0]*a[3] - iT[9]*b[3] - iT[6]*c[4] + iT[3]*b[4] + t + add ) >> shift;
1232
    dst[14 * line] = ( iT[3]*a[0] + iT[6]*b[0] - iT[7]*a[1] - iT[2]*b[1] + iT[0]*c[2] + iT[9]*a[2] - iT[4]*c[3] - iT[5]*a[3] + iT[8]*c[4] + iT[1]*a[4] - t + add ) >> shift;
1233
    dst[15 * line] = ( iT[1]*c[0] - iT[8]*b[0] - iT[3]*c[1] + iT[6]*b[1] + iT[5]*c[2] - iT[4]*b[2] - iT[7]*c[3] + iT[2]*b[3] + iT[9]*c[4] - iT[0]*b[4] + t + add ) >> shift;
1234
1235
    src += 16;
1236
    dst++;
1237
  }
1238
1239
  if (iSkipLine)
1240
  {
1241
    dst = pCoef + reducedLine;
1242
    for (j = 0; j < cutoff; j++)
1243
    {
1244
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
1245
      dst += line;
1246
    }
1247
  }
1248
1249
  if (iSkipLine2)
1250
  {
1251
    dst = pCoef + line * cutoff;
1252
    memset(dst, 0, sizeof(TCoeff) * line * iSkipLine2);
1253
  }
1254
#else
1255
0
  _fastForwardMM< 16 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDST7P16[TRANSFORM_FORWARD][0] );
1256
0
#endif
1257
0
}
1258
1259
1260
void fastInverseDST7_B16(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1261
0
{
1262
#if !JVET_M0497_MATRIX_MULT
1263
  int j, k;
1264
  TCoeff a[5], b[5], c[5], d[5], t;
1265
1266
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
1267
1268
  const TMatrixCoeff *iT = g_trCoreDST7P16[TRANSFORM_INVERSE][0];
1269
1270
  const int  reducedLine = line - iSkipLine;
1271
1272
  for (j = 0; j < reducedLine; j++)
1273
  {
1274
    for (k = 0; k < 5; k++)
1275
    {
1276
      a[k] = src[       k * line] + src[(10 - k) * line];
1277
      b[k] = src[(11 + k) * line] + src[(10 - k) * line];
1278
      c[k] = src[       k * line] - src[(11 + k) * line];
1279
      d[k] = src[       k * line] + src[(11 + k) * line] - src[(10 - k)*line];
1280
    }
1281
1282
    t = iT[10] * src[5 * line];
1283
1284
    dst[ 2] = Clip3(outputMinimum, outputMaximum, (int)( iT[ 2]*d[0] + iT[ 8]*d[1] + iT[14]*d[2] + iT[11]*d[3] + iT[ 5]*d[4] + add ) >> shift);
1285
    dst[ 5] = Clip3(outputMinimum, outputMaximum, (int)( iT[ 5]*d[0] + iT[14]*d[1] + iT[ 2]*d[2] - iT[ 8]*d[3] - iT[11]*d[4] + add ) >> shift);
1286
    dst[ 8] = Clip3(outputMinimum, outputMaximum, (int)( iT[ 8]*d[0] + iT[ 5]*d[1] - iT[11]*d[2] - iT[ 2]*d[3] + iT[14]*d[4] + add ) >> shift);
1287
    dst[11] = Clip3(outputMinimum, outputMaximum, (int)( iT[11]*d[0] - iT[ 2]*d[1] - iT[ 5]*d[2] + iT[14]*d[3] - iT[ 8]*d[4] + add ) >> shift);
1288
    dst[14] = Clip3(outputMinimum, outputMaximum, (int)( iT[14]*d[0] - iT[11]*d[1] + iT[ 8]*d[2] - iT[ 5]*d[3] + iT[ 2]*d[4] + add ) >> shift);
1289
1290
    dst[10] = Clip3(outputMinimum, outputMaximum, (int)( iT[10]*(src[ 0*line]-src[ 2*line]+src[ 3*line]-src[5*line]
1291
                                                                +src[ 6*line]-src[ 8*line]+src[ 9*line]-src[11*line]
1292
                                                                +src[12*line]-src[14*line]+src[15*line]) + add ) >> shift);
1293
1294
    dst[ 0] = Clip3(outputMinimum, outputMaximum, (int)( iT[0]*a[0] + iT[9]*b[0] + iT[2]*a[1] + iT[7]*b[1] + iT[4]*a[2] + iT[5]*b[2] + iT[6]*a[3] + iT[3]*b[3] + iT[8]*a[4] + iT[1]*b[4] + t + add ) >> shift);
1295
    dst[ 1] = Clip3(outputMinimum, outputMaximum, (int)( iT[1]*c[0] - iT[8]*b[0] + iT[5]*c[1] - iT[4]*b[1] + iT[9]*c[2] - iT[0]*b[2] + iT[2]*a[3] + iT[7]*c[3] + iT[6]*a[4] + iT[3]*c[4] + t + add ) >> shift);
1296
    dst[ 3] = Clip3(outputMinimum, outputMaximum, (int)( iT[3]*a[0] + iT[6]*b[0] + iT[0]*c[1] + iT[9]*a[1] + iT[1]*a[2] + iT[8]*c[2] + iT[4]*c[3] - iT[5]*b[3] - iT[2]*a[4] - iT[7]*b[4] - t + add ) >> shift);
1297
    dst[ 4] = Clip3(outputMinimum, outputMaximum, (int)( iT[4]*c[0] - iT[5]*b[0] + iT[6]*c[1] + iT[3]*a[1] + iT[7]*a[2] + iT[2]*b[2] - iT[1]*c[3] + iT[8]*b[3] - iT[9]*c[4] - iT[0]*a[4] - t + add ) >> shift);
1298
    dst[ 6] = Clip3(outputMinimum, outputMaximum, (int)( iT[6]*a[0] + iT[3]*b[0] + iT[9]*c[1] + iT[0]*a[1] - iT[1]*a[2] - iT[8]*b[2] - iT[4]*c[3] - iT[5]*a[3] - iT[2]*c[4] + iT[7]*b[4] + t + add ) >> shift);
1299
    dst[ 7] = Clip3(outputMinimum, outputMaximum, (int)( iT[7]*c[0] - iT[2]*b[0] + iT[8]*a[1] + iT[1]*b[1] - iT[6]*c[2] + iT[3]*b[2] - iT[9]*a[3] - iT[0]*b[3] + iT[5]*c[4] - iT[4]*b[4] + t + add ) >> shift);
1300
    dst[ 9] = Clip3(outputMinimum, outputMaximum, (int)( iT[9]*a[0] + iT[0]*b[0] + iT[2]*c[1] - iT[7]*b[1] - iT[5]*c[2] - iT[4]*a[2] + iT[3]*a[3] + iT[6]*b[3] + iT[8]*c[4] - iT[1]*b[4] - t + add ) >> shift);
1301
    dst[12] = Clip3(outputMinimum, outputMaximum, (int)( iT[1]*c[0] + iT[8]*a[0] - iT[5]*a[1] - iT[4]*b[1] - iT[0]*c[2] + iT[9]*b[2] + iT[7]*c[3] - iT[2]*b[3] - iT[6]*c[4] - iT[3]*a[4] + t + add ) >> shift);
1302
    dst[13] = Clip3(outputMinimum, outputMaximum, (int)( iT[7]*c[0] + iT[2]*a[0] - iT[8]*c[1] + iT[1]*b[1] + iT[3]*c[2] - iT[6]*b[2] + iT[0]*a[3] + iT[9]*b[3] - iT[5]*a[4] - iT[4]*b[4] + t + add ) >> shift);
1303
    dst[15] = Clip3(outputMinimum, outputMaximum, (int)( iT[4]*c[0] + iT[5]*a[0] - iT[3]*c[1] - iT[6]*a[1] + iT[2]*c[2] + iT[7]*a[2] - iT[1]*c[3] - iT[8]*a[3] + iT[0]*c[4] + iT[9]*a[4] - t + add ) >> shift);
1304
1305
    src++;
1306
    dst += 16;
1307
  }
1308
1309
  if (iSkipLine)
1310
  {
1311
    memset(dst, 0, (iSkipLine * 16) * sizeof(TCoeff));
1312
  }
1313
#else
1314
0
  _fastInverseMM< 16 >( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDST7P16[TRANSFORM_INVERSE][0]);
1315
0
#endif
1316
0
}
1317
1318
1319
void fastForwardDST7_B32(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
1320
0
{
1321
#if !JVET_M0497_MATRIX_MULT
1322
  int j, k;
1323
  TCoeff a[10][6];
1324
  TCoeff t[2];
1325
  TCoeff b[6];
1326
  TCoeff c[2];
1327
1328
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
1329
  const TMatrixCoeff *iT = g_trCoreDST7P32[TRANSFORM_FORWARD][0];
1330
  TCoeff *pCoef = dst;
1331
  const int  reducedLine = line - iSkipLine;
1332
  const int  cutoff = 32 - iSkipLine2;
1333
1334
  for (j = 0; j < reducedLine; j++)
1335
  {
1336
    for (k = 0; k < 6; k++)
1337
    {
1338
      a[0][k] = src[     k] - src[11 - k];
1339
      a[1][k] = src[     k] + src[13 + k];
1340
      a[2][k] = src[     k] + src[24 - k];
1341
      a[3][k] = src[     k] - src[26 + k];
1342
      a[4][k] = src[ 6 + k] + src[18 - k];
1343
      a[5][k] = src[ 6 + k] + src[19 + k];
1344
      a[6][k] = src[ 6 + k] - src[31 - k];
1345
      a[7][k] = src[13 + k] - src[24 - k];
1346
      a[8][k] = src[13 + k] + src[26 + k];
1347
      a[9][k] = src[19 + k] + src[31 - k];
1348
1349
      b[k] = src[k] + src[11 - k] - src[13 + k] - src[24 - k] + src[26 + k];
1350
    }
1351
    for (k = 0; k < 2; k++)
1352
    {
1353
      c[k] = src[k] + src[3 - k] - src[5 + k] - src[8 - k] + src[10 + k] + src[13 - k] - src[15 + k] - src[18 - k] + src[20 + k] + src[23 - k] - src[25 + k] - src[28 - k] + src[30 + k];
1354
    }
1355
1356
    t[0] = iT[12] * src[12] + iT[25] * src[25];
1357
    t[1] = iT[12] * src[25] - iT[25] * src[12];
1358
1359
    dst[ 0 * line] = ( iT[0] * a[3][0] + iT[11] * a[6][5] + iT[13] * a[8][0] + iT[24] * a[9][5] + iT[1] * a[3][1] + iT[10] * a[6][4] + iT[14] * a[8][1] + iT[23] * a[9][4] + iT[2] * a[3][2] + iT[9] * a[6][3] + iT[15] * a[8][2] + iT[22] * a[9][3] + iT[3] * a[3][3] + iT[8] * a[6][2] + iT[16] * a[8][3] + iT[21] * a[9][2] + iT[4] * a[3][4] + iT[7] * a[6][1] + iT[17] * a[8][4] + iT[20] * a[9][1] + iT[5] * a[3][5] + iT[6] * a[6][0] + iT[18] * a[8][5] + iT[19] * a[9][0] + t[0] + add) >> shift;
1360
    dst[ 1 * line] = (-iT[0] * a[5][2] + iT[11] * a[0][3] + iT[13] * a[4][2] + iT[24] * a[6][2] + iT[1] * a[9][1] + iT[10] * a[8][4] + iT[14] * a[3][4] + iT[23] * a[6][1] + iT[2] * a[0][0] - iT[9] * a[5][5] + iT[15] * a[6][5] + iT[22] * a[4][5] - iT[3] * a[5][3] + iT[8] * a[0][2] + iT[16] * a[4][3] + iT[21] * a[6][3] + iT[4] * a[9][0] + iT[7] * a[8][5] + iT[17] * a[3][5] + iT[20] * a[6][0] + iT[5] * a[0][1] - iT[6] * a[5][4] + iT[18] * a[6][4] + iT[19] * a[4][4] - t[1] + add) >> shift;
1361
    dst[ 3 * line] = (-iT[0] * a[9][4] - iT[11] * a[5][4] + iT[13] * a[2][1] - iT[24] * a[7][1] - iT[1] * a[0][3] - iT[10] * a[1][3] + iT[14] * a[3][3] + iT[23] * a[2][3] + iT[2] * a[8][5] + iT[9] * a[9][0] + iT[15] * a[6][0] + iT[22] * a[3][5] - iT[3] * a[1][4] - iT[8] * a[0][4] + iT[16] * a[2][4] + iT[21] * a[3][4] - iT[4] * a[5][3] - iT[7] * a[9][3] - iT[17] * a[7][2] + iT[20] * a[2][2] + iT[5] * a[8][0] + iT[6] * a[1][0] - iT[18] * a[4][5] - iT[19] * a[7][0] + t[1] + add) >> shift;
1362
    dst[ 4 * line] = (-iT[0] * a[3][2] - iT[11] * a[2][2] + iT[13] * a[1][2] + iT[24] * a[0][2] + iT[1] * a[6][0] + iT[10] * a[3][5] + iT[14] * a[9][0] + iT[23] * a[8][5] - iT[2] * a[2][3] - iT[9] * a[3][3] + iT[15] * a[0][3] + iT[22] * a[1][3] - iT[3] * a[7][0] + iT[8] * a[2][0] - iT[16] * a[9][5] - iT[21] * a[5][5] + iT[4] * a[4][4] + iT[7] * a[6][4] + iT[17] * a[0][1] - iT[20] * a[5][4] - iT[5] * a[7][4] - iT[6] * a[4][1] + iT[18] * a[8][4] + iT[19] * a[1][4] - t[0] + add) >> shift;
1363
    dst[ 5 * line] = (-iT[0] * a[3][5] - iT[11] * a[6][0] - iT[13] * a[8][5] - iT[24] * a[9][0] + iT[1] * a[6][5] + iT[10] * a[3][0] + iT[14] * a[9][5] + iT[23] * a[8][0] - iT[2] * a[7][4] + iT[9] * a[2][4] - iT[15] * a[9][1] - iT[22] * a[5][1] - iT[3] * a[7][1] - iT[8] * a[4][4] + iT[16] * a[8][1] + iT[21] * a[1][1] + iT[4] * a[6][2] + iT[7] * a[4][2] - iT[17] * a[5][2] + iT[20] * a[0][3] - iT[5] * a[3][2] - iT[6] * a[2][2] + iT[18] * a[1][2] + iT[19] * a[0][2] + t[0] + add) >> shift;
1364
    dst[ 8 * line] = ( iT[0] * a[9][3] + iT[11] * a[8][2] + iT[13] * a[3][2] + iT[24] * a[6][3] + iT[1] * a[1][5] + iT[10] * a[0][5] - iT[14] * a[2][5] - iT[23] * a[3][5] - iT[2] * a[1][3] - iT[9] * a[8][3] + iT[15] * a[7][3] + iT[22] * a[4][2] - iT[3] * a[9][5] - iT[8] * a[5][5] + iT[16] * a[2][0] - iT[21] * a[7][0] - iT[4] * a[1][1] - iT[7] * a[0][1] + iT[17] * a[2][1] + iT[20] * a[3][1] + iT[5] * a[5][1] + iT[6] * a[9][1] + iT[18] * a[7][4] - iT[19] * a[2][4] + t[1] + add) >> shift;
1365
    dst[ 9 * line] = (-iT[0] * a[2][1] - iT[11] * a[3][1] + iT[13] * a[0][1] + iT[24] * a[1][1] + iT[1] * a[7][3] - iT[10] * a[2][3] + iT[14] * a[9][2] + iT[23] * a[5][2] + iT[2] * a[4][0] + iT[9] * a[7][5] - iT[15] * a[1][5] - iT[22] * a[8][5] + iT[3] * a[3][4] + iT[8] * a[2][4] - iT[16] * a[1][4] - iT[21] * a[0][4] + iT[4] * a[6][3] + iT[7] * a[3][2] + iT[17] * a[9][3] + iT[20] * a[8][2] + iT[5] * a[4][5] + iT[6] * a[6][5] + iT[18] * a[0][0] - iT[19] * a[5][5] - t[0] + add) >> shift;
1366
    dst[10 * line] = (-iT[0] * a[6][1] - iT[11] * a[4][1] + iT[13] * a[5][1] - iT[24] * a[0][4] + iT[1] * a[2][2] - iT[10] * a[7][2] - iT[14] * a[5][3] - iT[23] * a[9][3] + iT[2] * a[6][4] + iT[9] * a[4][4] - iT[15] * a[5][4] + iT[22] * a[0][1] - iT[3] * a[2][5] + iT[8] * a[7][5] + iT[16] * a[5][0] + iT[21] * a[9][0] - iT[4] * a[7][0] - iT[7] * a[4][5] + iT[17] * a[8][0] + iT[20] * a[1][0] + iT[5] * a[4][2] + iT[6] * a[7][3] - iT[18] * a[1][3] - iT[19] * a[8][3] + t[0] + add) >> shift;
1367
    dst[11 * line] = ( iT[0] * a[1][3] + iT[11] * a[0][3] - iT[13] * a[2][3] - iT[24] * a[3][3] + iT[1] * a[9][1] + iT[10] * a[5][1] - iT[14] * a[2][4] + iT[23] * a[7][4] + iT[2] * a[8][0] + iT[9] * a[9][5] + iT[15] * a[6][5] + iT[22] * a[3][0] - iT[3] * a[0][2] + iT[8] * a[5][3] - iT[16] * a[6][3] - iT[21] * a[4][3] - iT[4] * a[5][0] + iT[7] * a[0][5] + iT[17] * a[4][0] + iT[20] * a[6][0] - iT[5] * a[9][4] - iT[6] * a[5][4] + iT[18] * a[2][1] - iT[19] * a[7][1] - t[1] + add) >> shift;
1368
    dst[13 * line] = (-iT[0] * a[0][0] - iT[11] * a[1][0] + iT[13] * a[3][0] + iT[24] * a[2][0] - iT[1] * a[5][4] + iT[10] * a[0][1] + iT[14] * a[4][4] + iT[23] * a[6][4] + iT[2] * a[9][3] + iT[9] * a[5][3] - iT[15] * a[2][2] + iT[22] * a[7][2] - iT[3] * a[8][3] - iT[8] * a[9][2] - iT[16] * a[6][2] - iT[21] * a[3][3] + iT[4] * a[1][4] + iT[7] * a[8][4] - iT[17] * a[7][4] - iT[20] * a[4][1] - iT[5] * a[0][5] - iT[6] * a[1][5] + iT[18] * a[3][5] + iT[19] * a[2][5] + t[1] + add) >> shift;
1369
    dst[14 * line] = ( iT[0] * a[4][2] + iT[11] * a[7][3] - iT[13] * a[1][3] - iT[24] * a[8][3] + iT[1] * a[4][1] + iT[10] * a[6][1] + iT[14] * a[0][4] - iT[23] * a[5][1] - iT[2] * a[3][0] - iT[9] * a[2][0] + iT[15] * a[1][0] + iT[22] * a[0][0] - iT[3] * a[6][3] - iT[8] * a[4][3] + iT[16] * a[5][3] - iT[21] * a[0][2] - iT[4] * a[7][5] - iT[7] * a[4][0] + iT[17] * a[8][5] + iT[20] * a[1][5] + iT[5] * a[6][4] + iT[6] * a[3][1] + iT[18] * a[9][4] + iT[19] * a[8][1] - t[0] + add) >> shift;
1370
    dst[15 * line] = (-iT[0] * a[7][4] - iT[11] * a[4][1] + iT[13] * a[8][4] + iT[24] * a[1][4] + iT[1] * a[2][2] + iT[10] * a[3][2] - iT[14] * a[0][2] - iT[23] * a[1][2] + iT[2] * a[2][1] - iT[9] * a[7][1] - iT[15] * a[5][4] - iT[22] * a[9][4] - iT[3] * a[7][5] + iT[8] * a[2][5] - iT[16] * a[9][0] - iT[21] * a[5][0] - iT[4] * a[2][0] - iT[7] * a[3][0] + iT[17] * a[0][0] + iT[20] * a[1][0] - iT[5] * a[2][3] + iT[6] * a[7][3] + iT[18] * a[5][2] + iT[19] * a[9][2] + t[0] + add) >> shift;
1371
    dst[16 * line] = (-iT[0] * a[0][1] + iT[11] * a[5][4] - iT[13] * a[6][4] - iT[24] * a[4][4] + iT[1] * a[0][3] - iT[10] * a[5][2] + iT[14] * a[6][2] + iT[23] * a[4][2] - iT[2] * a[0][5] + iT[9] * a[5][0] - iT[15] * a[6][0] - iT[22] * a[4][0] - iT[3] * a[0][4] - iT[8] * a[1][4] + iT[16] * a[3][4] + iT[21] * a[2][4] + iT[4] * a[0][2] + iT[7] * a[1][2] - iT[17] * a[3][2] - iT[20] * a[2][2] - iT[5] * a[0][0] - iT[6] * a[1][0] + iT[18] * a[3][0] + iT[19] * a[2][0] - t[1] + add) >> shift;
1372
    dst[18 * line] = ( iT[0] * a[0][5] + iT[11] * a[1][5] - iT[13] * a[3][5] - iT[24] * a[2][5] - iT[1] * a[1][0] - iT[10] * a[0][0] + iT[14] * a[2][0] + iT[23] * a[3][0] - iT[2] * a[5][1] + iT[9] * a[0][4] + iT[15] * a[4][1] + iT[22] * a[6][1] - iT[3] * a[8][1] - iT[8] * a[1][1] + iT[16] * a[4][4] + iT[21] * a[7][1] - iT[4] * a[9][2] - iT[7] * a[5][2] + iT[17] * a[2][3] - iT[20] * a[7][3] - iT[5] * a[9][3] - iT[6] * a[8][2] - iT[18] * a[3][2] - iT[19] * a[6][3] + t[1] + add) >> shift;
1373
    dst[20 * line] = (-iT[0] * a[4][0] - iT[11] * a[6][0] - iT[13] * a[0][5] + iT[24] * a[5][0] + iT[1] * a[6][5] + iT[10] * a[4][5] - iT[14] * a[5][5] + iT[23] * a[0][0] - iT[2] * a[6][1] - iT[9] * a[3][4] - iT[15] * a[9][1] - iT[22] * a[8][4] + iT[3] * a[4][4] + iT[8] * a[7][1] - iT[16] * a[1][1] - iT[21] * a[8][1] - iT[4] * a[3][3] - iT[7] * a[2][3] + iT[17] * a[1][3] + iT[20] * a[0][3] + iT[5] * a[7][2] - iT[6] * a[2][2] + iT[18] * a[9][3] + iT[19] * a[5][3] + t[0] + add) >> shift;
1374
    dst[21 * line] = (-iT[0] * a[1][2] - iT[11] * a[8][2] + iT[13] * a[7][2] + iT[24] * a[4][3] - iT[1] * a[1][5] - iT[10] * a[8][5] + iT[14] * a[7][5] + iT[23] * a[4][0] - iT[2] * a[5][2] - iT[9] * a[9][2] - iT[15] * a[7][3] + iT[22] * a[2][3] - iT[3] * a[5][5] - iT[8] * a[9][5] - iT[16] * a[7][0] + iT[21] * a[2][0] - iT[4] * a[8][1] - iT[7] * a[9][4] - iT[17] * a[6][4] - iT[20] * a[3][1] - iT[5] * a[8][4] - iT[6] * a[9][1] - iT[18] * a[6][1] - iT[19] * a[3][4] - t[1] + add) >> shift;
1375
    dst[23 * line] = (-iT[0] * a[8][4] - iT[11] * a[9][1] - iT[13] * a[6][1] - iT[24] * a[3][4] + iT[1] * a[8][2] + iT[10] * a[1][2] - iT[14] * a[4][3] - iT[23] * a[7][2] + iT[2] * a[0][1] + iT[9] * a[1][1] - iT[15] * a[3][1] - iT[22] * a[2][1] - iT[3] * a[5][0] - iT[8] * a[9][0] - iT[16] * a[7][5] + iT[21] * a[2][5] + iT[4] * a[9][5] + iT[7] * a[8][0] + iT[17] * a[3][0] + iT[20] * a[6][5] - iT[5] * a[5][2] + iT[6] * a[0][3] + iT[18] * a[4][2] + iT[19] * a[6][2] + t[1] + add) >> shift;
1376
    dst[24 * line] = (-iT[0] * a[2][3] + iT[11] * a[7][3] + iT[13] * a[5][2] + iT[24] * a[9][2] + iT[1] * a[4][1] + iT[10] * a[7][4] - iT[14] * a[1][4] - iT[23] * a[8][4] - iT[2] * a[4][5] - iT[9] * a[7][0] + iT[15] * a[1][0] + iT[22] * a[8][0] + iT[3] * a[4][3] + iT[8] * a[6][3] + iT[16] * a[0][2] - iT[21] * a[5][3] - iT[4] * a[2][5] - iT[7] * a[3][5] + iT[17] * a[0][5] + iT[20] * a[1][5] + iT[5] * a[2][1] + iT[6] * a[3][1] - iT[18] * a[0][1] - iT[19] * a[1][1] - t[0] + add) >> shift;
1377
    dst[25 * line] = ( iT[0] * a[4][5] + iT[11] * a[6][5] + iT[13] * a[0][0] - iT[24] * a[5][5] + iT[1] * a[3][1] + iT[10] * a[2][1] - iT[14] * a[1][1] - iT[23] * a[0][1] - iT[2] * a[7][2] - iT[9] * a[4][3] + iT[15] * a[8][2] + iT[22] * a[1][2] - iT[3] * a[6][2] - iT[8] * a[3][3] - iT[16] * a[9][2] - iT[21] * a[8][3] - iT[4] * a[2][4] + iT[7] * a[7][4] + iT[17] * a[5][1] + iT[20] * a[9][1] + iT[5] * a[4][0] + iT[6] * a[6][0] + iT[18] * a[0][5] - iT[19] * a[5][0] + t[0] + add) >> shift;
1378
    dst[26 * line] = ( iT[0] * a[8][0] + iT[11] * a[1][0] - iT[13] * a[4][5] - iT[24] * a[7][0] + iT[1] * a[5][4] + iT[10] * a[9][4] + iT[14] * a[7][1] - iT[23] * a[2][1] - iT[2] * a[1][2] - iT[9] * a[0][2] + iT[15] * a[2][2] + iT[22] * a[3][2] - iT[3] * a[9][2] - iT[8] * a[8][3] - iT[16] * a[3][3] - iT[21] * a[6][2] + iT[4] * a[0][4] - iT[7] * a[5][1] + iT[17] * a[6][1] + iT[20] * a[4][1] + iT[5] * a[8][5] + iT[6] * a[1][5] - iT[18] * a[4][0] - iT[19] * a[7][5] - t[1] + add) >> shift;
1379
    dst[28 * line] = (-iT[0] * a[5][1] - iT[11] * a[9][1] - iT[13] * a[7][4] + iT[24] * a[2][4] + iT[1] * a[8][2] + iT[10] * a[9][3] + iT[14] * a[6][3] + iT[23] * a[3][2] - iT[2] * a[9][4] - iT[9] * a[8][1] - iT[15] * a[3][1] - iT[22] * a[6][4] + iT[3] * a[9][0] + iT[8] * a[5][0] - iT[16] * a[2][5] + iT[21] * a[7][5] - iT[4] * a[5][5] + iT[7] * a[0][0] + iT[17] * a[4][5] + iT[20] * a[6][5] + iT[5] * a[1][3] + iT[6] * a[0][3] - iT[18] * a[2][3] - iT[19] * a[3][3] + t[1] + add) >> shift;
1380
    dst[29 * line] = (-iT[0] * a[6][4] - iT[11] * a[3][1] - iT[13] * a[9][4] - iT[24] * a[8][1] + iT[1] * a[7][3] + iT[10] * a[4][2] - iT[14] * a[8][3] - iT[23] * a[1][3] + iT[2] * a[3][5] + iT[9] * a[2][5] - iT[15] * a[1][5] - iT[22] * a[0][5] - iT[3] * a[2][4] - iT[8] * a[3][4] + iT[16] * a[0][4] + iT[21] * a[1][4] - iT[4] * a[4][3] - iT[7] * a[7][2] + iT[17] * a[1][2] + iT[20] * a[8][2] + iT[5] * a[3][0] + iT[6] * a[6][5] + iT[18] * a[8][0] + iT[19] * a[9][5] - t[0] + add) >> shift;
1381
    dst[30 * line] = (-iT[0] * a[7][2] + iT[11] * a[2][2] - iT[13] * a[9][3] - iT[24] * a[5][3] - iT[1] * a[6][0] - iT[10] * a[4][0] + iT[14] * a[5][0] - iT[23] * a[0][5] - iT[2] * a[4][2] - iT[9] * a[6][2] - iT[15] * a[0][3] + iT[22] * a[5][2] + iT[3] * a[2][0] - iT[8] * a[7][0] - iT[16] * a[5][5] - iT[21] * a[9][5] + iT[4] * a[7][1] - iT[7] * a[2][1] + iT[17] * a[9][4] + iT[20] * a[5][4] + iT[5] * a[6][1] + iT[6] * a[4][1] - iT[18] * a[5][1] + iT[19] * a[0][4] + t[0] + add) >> shift;
1382
    dst[31 * line] = (-iT[0] * a[8][5] - iT[11] * a[1][5] + iT[13] * a[4][0] + iT[24] * a[7][5] + iT[1] * a[1][0] + iT[10] * a[8][0] - iT[14] * a[7][0] - iT[23] * a[4][5] + iT[2] * a[8][4] + iT[9] * a[1][4] - iT[15] * a[4][1] - iT[22] * a[7][4] - iT[3] * a[1][1] - iT[8] * a[8][1] + iT[16] * a[7][1] + iT[21] * a[4][4] - iT[4] * a[8][3] - iT[7] * a[1][3] + iT[17] * a[4][2] + iT[20] * a[7][3] + iT[5] * a[1][2] + iT[6] * a[8][2] - iT[18] * a[7][2] - iT[19] * a[4][3] - t[1] + add) >> shift;
1383
1384
    dst[ 2 * line] = (iT[ 4]*b[0] + iT[ 9]*b[1] + iT[14]*b[2] + iT[19]*b[3] + iT[24]*b[4] + iT[29]*b[5] + add) >> shift;
1385
    dst[ 7 * line] = (iT[14]*b[0] + iT[29]*b[1] + iT[19]*b[2] + iT[ 4]*b[3] - iT[ 9]*b[4] - iT[24]*b[5] + add) >> shift;
1386
    dst[12 * line] = (iT[24]*b[0] + iT[14]*b[1] - iT[ 9]*b[2] - iT[29]*b[3] - iT[ 4]*b[4] + iT[19]*b[5] + add) >> shift;
1387
    dst[17 * line] = (iT[29]*b[0] - iT[ 4]*b[1] - iT[24]*b[2] + iT[ 9]*b[3] + iT[19]*b[4] - iT[14]*b[5] + add) >> shift;
1388
    dst[22 * line] = (iT[19]*b[0] - iT[24]*b[1] + iT[ 4]*b[2] + iT[14]*b[3] - iT[29]*b[4] + iT[ 9]*b[5] + add) >> shift;
1389
    dst[27 * line] = (iT[ 9]*b[0] - iT[19]*b[1] + iT[29]*b[2] - iT[24]*b[3] + iT[14]*b[4] - iT[ 4]*b[5] + add) >> shift;
1390
1391
    dst[ 6 * line] = (iT[12]*c[0] + iT[25]*c[1] + add) >> shift;
1392
    dst[19 * line] = (iT[25]*c[0] - iT[12]*c[1] + add) >> shift;
1393
1394
    src += 32;
1395
    dst++;
1396
  }
1397
1398
  if (iSkipLine)
1399
  {
1400
    dst = pCoef + reducedLine;
1401
    for (j = 0; j < cutoff; j++)
1402
    {
1403
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
1404
      dst += line;
1405
    }
1406
  }
1407
1408
  if (iSkipLine2)
1409
  {
1410
    dst = pCoef + line * cutoff;
1411
    memset(dst, 0, sizeof(TCoeff) * line * iSkipLine2);
1412
  }
1413
#else
1414
0
  _fastForwardMM< 32 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDST7P32[TRANSFORM_FORWARD][0] );
1415
0
#endif
1416
0
}
1417
1418
1419
// Inverse 32-point transform driven by the g_trCoreDST7P32[TRANSFORM_INVERSE] coefficient table.
//
// Two compile-time variants exist:
//  - !JVET_M0497_MATRIX_MULT: a hand-factored kernel that first folds the inputs into
//    partial sums (a, b, c, t) and then evaluates each of the 32 outputs explicitly.
//  - otherwise: the work is delegated to _fastInverseMM< 32 >.
//
// src            input coefficients; in the factored path element k of the current line is
//                read as src[k * line] and src advances by one per processed line.
// dst            output; 32 consecutive values are written per processed line.
// shift          right shift applied to every output after adding the rounding offset.
// line           number of lines and stride between consecutive input elements.
// iSkipLine      number of trailing lines that are not computed; their outputs are zeroed.
// iSkipLine2     only forwarded to _fastInverseMM; not referenced in the factored path.
// outputMinimum, outputMaximum   bounds passed to Clip3 for every output value.
void fastInverseDST7_B32(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1420
0
{
1421
#if !JVET_M0497_MATRIX_MULT
1422
  int j, k;
1423
  TCoeff a[10][6];
1424
  TCoeff t[2];
1425
  TCoeff b[6];
1426
  TCoeff c[2];
1427
1428
  // Rounding offset: half of the divisor implied by 'shift' (0 when no shift is applied).
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
1429
  const TMatrixCoeff *iT = g_trCoreDST7P32[TRANSFORM_INVERSE][0];
1430
  const int  reducedLine = line - iSkipLine;
1431
1432
  for (j = 0; j < reducedLine; j++)
1433
  {
1434
    // a[0..9][k]: pairwise sums/differences of inputs, shared by the 24 outputs computed first.
    // b[k]:       alternating-sign five-term sums, used by outputs 4, 9, 14, 19, 24 and 29.
    for (k = 0; k < 6; k++)
1435
    {
1436
      a[0][k] = src[      k  * line] + src[(12 - k) * line];
1437
      a[1][k] = src[      k  * line] - src[(13 + k) * line];
1438
      a[2][k] = src[      k  * line] + src[(25 - k) * line];
1439
      a[3][k] = src[      k  * line] - src[(26 + k) * line];
1440
      a[4][k] = src[( 7 + k) * line] + src[(18 - k) * line];
1441
      a[5][k] = src[( 7 + k) * line] - src[(20 + k) * line];
1442
      a[6][k] = src[( 7 + k) * line] + src[(31 - k) * line];
1443
      a[7][k] = src[(13 + k) * line] + src[(25 - k) * line];
1444
      a[8][k] = src[(13 + k) * line] - src[(26 + k) * line];
1445
      a[9][k] = src[(20 + k) * line] + src[(31 - k) * line];
1446
1447
      b[k] = src[k * line] - src[(12-k) * line] + src[(13+k) * line] - src[(25-k) * line] + src[(26+k) * line];
1448
    }
1449
    // c[k]: alternating-sign thirteen-term sums, used only by outputs 12 and 25.
    for (k = 0; k < 2; k++)
1450
    {
1451
      c[k] = src[k * line] - src[(4-k) * line] + src[(5+k) * line] - src[(9-k) * line] + src[(10+k) * line] - src[(14-k) * line] + src[(15+k)*line] - src[(19-k)*line] + src[(20+k)*line] - src[(24-k)*line] + src[(25+k)*line] - src[(29-k)*line] + src[(30+k)*line];
1452
    }
1453
1454
    // t[0], t[1]: contribution of inputs 6 and 19, which are not part of any a[][] pair.
    t[0] = iT[12] * src[6*line] + iT[25] * src[19*line];
1455
    t[1] = iT[25] * src[6*line] - iT[12] * src[19*line];
1456
1457
    // Outputs built from the a[][] partial sums plus +/- t[0] or t[1].
    dst[ 0] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[1][0] - iT[11] * a[8][0] + iT[13] * a[7][0] + iT[24] * a[4][5] - iT[1] * a[8][5] + iT[10] * a[1][5] + iT[14] * a[4][0] + iT[23] * a[7][5] + iT[2] * a[1][1] - iT[9] * a[8][1] + iT[15] * a[7][1] + iT[22] * a[4][4] - iT[3] * a[8][4] + iT[8] * a[1][4] + iT[16] * a[4][1] + iT[21] * a[7][4] + iT[4] * a[1][2] - iT[7] * a[8][2] + iT[17] * a[7][2] + iT[20] * a[4][3] - iT[5] * a[8][3] + iT[6] * a[1][3] + iT[18] * a[4][2] + iT[19] * a[7][3] + t[0] + add) >> shift);
1458
    dst[ 1] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[4][2] - iT[11] * a[6][2] + iT[13] * a[0][3] + iT[24] * a[5][2] + iT[1] * a[2][0] + iT[10] * a[7][0] + iT[14] * a[5][5] - iT[23] * a[9][5] + iT[2] * a[7][2] + iT[9] * a[2][2] - iT[15] * a[9][3] + iT[22] * a[5][3] - iT[3] * a[6][0] - iT[8] * a[4][0] + iT[16] * a[5][0] + iT[21] * a[0][5] - iT[4] * a[4][1] - iT[7] * a[6][1] + iT[17] * a[0][4] + iT[20] * a[5][1] + iT[5] * a[2][1] + iT[6] * a[7][1] + iT[18] * a[5][4] - iT[19] * a[9][4] + t[1] + add) >> shift);
1459
    dst[ 2] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[2][4] - iT[11] * a[3][4] + iT[13] * a[0][4] + iT[24] * a[1][4] + iT[1] * a[4][3] + iT[10] * a[7][2] + iT[14] * a[1][2] - iT[23] * a[8][2] + iT[2] * a[3][0] - iT[9] * a[6][5] - iT[15] * a[8][0] + iT[22] * a[9][5] - iT[3] * a[6][4] + iT[8] * a[3][1] + iT[16] * a[9][4] - iT[21] * a[8][1] + iT[4] * a[7][3] + iT[7] * a[4][2] - iT[17] * a[8][3] + iT[20] * a[1][3] - iT[5] * a[3][5] - iT[6] * a[2][5] + iT[18] * a[1][5] + iT[19] * a[0][5] + t[1] + add) >> shift);
1460
    dst[ 3] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[5][4] + iT[11] * a[0][1] - iT[13] * a[4][4] - iT[24] * a[6][4] - iT[1] * a[1][3] - iT[10] * a[0][3] + iT[14] * a[2][3] + iT[23] * a[3][3] - iT[2] * a[0][4] - iT[9] * a[1][4] + iT[15] * a[3][4] + iT[22] * a[2][4] + iT[3] * a[0][0] + iT[8] * a[5][5] - iT[16] * a[6][5] - iT[21] * a[4][5] + iT[4] * a[5][0] - iT[7] * a[9][0] + iT[17] * a[7][5] + iT[20] * a[2][5] - iT[5] * a[8][2] + iT[6] * a[9][3] - iT[18] * a[6][3] + iT[19] * a[3][2] + t[0] + add) >> shift);
1461
    dst[ 5] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[1][5] + iT[11] * a[8][5] - iT[13] * a[7][5] - iT[24] * a[4][0] + iT[1] * a[5][1] + iT[10] * a[0][4] - iT[14] * a[4][1] - iT[23] * a[6][1] - iT[2] * a[8][3] + iT[9] * a[9][2] - iT[15] * a[6][2] + iT[22] * a[3][3] - iT[3] * a[0][2] - iT[8] * a[1][2] + iT[16] * a[3][2] + iT[21] * a[2][2] - iT[4] * a[9][4] + iT[7] * a[5][4] + iT[17] * a[2][1] + iT[20] * a[7][1] + iT[5] * a[1][0] - iT[6] * a[8][0] + iT[18] * a[7][0] + iT[19] * a[4][5] - t[0] + add) >> shift);
1462
    dst[ 6] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[7][5] - iT[11] * a[2][5] + iT[13] * a[9][0] - iT[24] * a[5][0] + iT[1] * a[3][4] - iT[10] * a[6][1] - iT[14] * a[8][4] + iT[23] * a[9][1] + iT[2] * a[4][2] + iT[9] * a[7][3] + iT[15] * a[1][3] - iT[22] * a[8][3] - iT[3] * a[2][2] - iT[8] * a[3][2] + iT[16] * a[0][2] + iT[21] * a[1][2] - iT[4] * a[6][4] - iT[7] * a[4][4] + iT[17] * a[5][4] + iT[20] * a[0][1] + iT[5] * a[7][0] + iT[6] * a[2][0] - iT[18] * a[9][5] + iT[19] * a[5][5] - t[1] + add) >> shift);
1463
    dst[ 7] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[6][3] - iT[11] * a[4][3] + iT[13] * a[5][3] + iT[24] * a[0][2] + iT[1] * a[7][1] + iT[10] * a[4][4] - iT[14] * a[8][1] + iT[23] * a[1][1] - iT[2] * a[7][5] - iT[9] * a[4][0] + iT[15] * a[8][5] - iT[22] * a[1][5] + iT[3] * a[7][3] + iT[8] * a[2][3] - iT[16] * a[9][2] + iT[21] * a[5][2] - iT[4] * a[6][5] + iT[7] * a[3][0] + iT[17] * a[9][5] - iT[20] * a[8][0] + iT[5] * a[6][1] - iT[6] * a[3][4] - iT[18] * a[9][1] + iT[19] * a[8][4] - t[1] + add) >> shift);
1464
    dst[ 8] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[1][1] - iT[11] * a[0][1] + iT[13] * a[2][1] + iT[24] * a[3][1] + iT[1] * a[1][3] - iT[10] * a[8][3] + iT[14] * a[7][3] + iT[23] * a[4][2] - iT[2] * a[9][1] + iT[9] * a[8][4] - iT[15] * a[3][4] + iT[22] * a[6][1] + iT[3] * a[5][5] + iT[8] * a[0][0] - iT[16] * a[4][5] - iT[21] * a[6][5] + iT[4] * a[0][5] + iT[7] * a[1][5] - iT[17] * a[3][5] - iT[20] * a[2][5] + iT[5] * a[5][3] - iT[6] * a[9][3] + iT[18] * a[7][2] + iT[19] * a[2][2] - t[0] + add) >> shift);
1465
    dst[10] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[8][3] - iT[11] * a[1][3] - iT[13] * a[4][2] - iT[24] * a[7][3] - iT[1] * a[8][0] + iT[10] * a[1][0] + iT[14] * a[4][5] + iT[23] * a[7][0] + iT[2] * a[5][3] + iT[9] * a[0][2] - iT[15] * a[4][3] - iT[22] * a[6][3] - iT[3] * a[5][0] - iT[8] * a[0][5] + iT[16] * a[4][0] + iT[21] * a[6][0] + iT[4] * a[1][4] + iT[7] * a[0][4] - iT[17] * a[2][4] - iT[20] * a[3][4] - iT[5] * a[1][1] - iT[6] * a[0][1] + iT[18] * a[2][1] + iT[19] * a[3][1] + t[0] + add) >> shift);
1466
    dst[11] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[7][0] + iT[11] * a[2][0] - iT[13] * a[9][5] + iT[24] * a[5][5] + iT[1] * a[2][5] + iT[10] * a[7][5] + iT[14] * a[5][0] - iT[23] * a[9][0] - iT[2] * a[2][1] - iT[9] * a[3][1] + iT[15] * a[0][1] + iT[22] * a[1][1] - iT[3] * a[7][4] - iT[8] * a[4][1] + iT[16] * a[8][4] - iT[21] * a[1][4] + iT[4] * a[3][2] - iT[7] * a[6][3] - iT[17] * a[8][2] + iT[20] * a[9][3] + iT[5] * a[4][2] + iT[6] * a[6][2] - iT[18] * a[0][3] - iT[19] * a[5][2] + t[1] + add) >> shift);
1467
    dst[13] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[9][5] - iT[11] * a[8][0] + iT[13] * a[3][0] - iT[24] * a[6][5] - iT[1] * a[8][5] + iT[10] * a[9][0] - iT[14] * a[6][0] + iT[23] * a[3][5] + iT[2] * a[5][4] - iT[9] * a[9][4] + iT[15] * a[7][1] + iT[22] * a[2][1] - iT[3] * a[1][4] + iT[8] * a[8][4] - iT[16] * a[7][4] - iT[21] * a[4][1] - iT[4] * a[0][2] - iT[7] * a[5][3] + iT[17] * a[6][3] + iT[20] * a[4][3] + iT[5] * a[0][3] + iT[6] * a[1][3] - iT[18] * a[3][3] - iT[19] * a[2][3] + t[0] + add) >> shift);
1468
    dst[15] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[9][1] + iT[11] * a[5][1] + iT[13] * a[2][4] + iT[24] * a[7][4] + iT[1] * a[9][3] - iT[10] * a[5][3] - iT[14] * a[2][2] - iT[23] * a[7][2] - iT[2] * a[9][5] + iT[9] * a[5][5] + iT[15] * a[2][0] + iT[22] * a[7][0] + iT[3] * a[9][4] - iT[8] * a[8][1] + iT[16] * a[3][1] - iT[21] * a[6][4] - iT[4] * a[9][2] + iT[7] * a[8][3] - iT[17] * a[3][3] + iT[20] * a[6][2] + iT[5] * a[9][0] - iT[6] * a[8][5] + iT[18] * a[3][5] - iT[19] * a[6][0] - t[0] + add) >> shift);
1469
    dst[16] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[4][4] + iT[11] * a[7][1] + iT[13] * a[1][1] - iT[24] * a[8][1] + iT[1] * a[6][2] - iT[10] * a[3][3] - iT[14] * a[9][2] + iT[23] * a[8][3] - iT[2] * a[6][1] - iT[9] * a[4][1] + iT[15] * a[5][1] + iT[22] * a[0][4] - iT[3] * a[4][5] - iT[8] * a[6][5] + iT[16] * a[0][0] + iT[21] * a[5][5] - iT[4] * a[6][0] + iT[7] * a[3][5] + iT[17] * a[9][0] - iT[20] * a[8][5] + iT[5] * a[6][3] + iT[6] * a[4][3] - iT[18] * a[5][3] - iT[19] * a[0][2] - t[1] + add) >> shift);
1470
    dst[17] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[7][2] - iT[11] * a[4][3] + iT[13] * a[8][2] - iT[24] * a[1][2] + iT[1] * a[7][1] + iT[10] * a[2][1] - iT[14] * a[9][4] + iT[23] * a[5][4] - iT[2] * a[3][5] + iT[9] * a[6][0] + iT[15] * a[8][5] - iT[22] * a[9][0] - iT[3] * a[2][3] - iT[8] * a[7][3] - iT[16] * a[5][2] + iT[21] * a[9][2] + iT[4] * a[4][5] + iT[7] * a[7][0] + iT[17] * a[1][0] - iT[20] * a[8][0] - iT[5] * a[2][4] - iT[6] * a[3][4] + iT[18] * a[0][4] + iT[19] * a[1][4] - t[1] + add) >> shift);
1471
    dst[18] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[9][0] + iT[11] * a[8][5] - iT[13] * a[3][5] + iT[24] * a[6][0] + iT[1] * a[5][1] - iT[10] * a[9][1] + iT[14] * a[7][4] + iT[23] * a[2][4] + iT[2] * a[0][3] + iT[9] * a[5][2] - iT[15] * a[6][2] - iT[22] * a[4][2] + iT[3] * a[1][2] + iT[8] * a[0][2] - iT[16] * a[2][2] - iT[21] * a[3][2] - iT[4] * a[8][1] + iT[7] * a[1][1] + iT[17] * a[4][4] + iT[20] * a[7][1] + iT[5] * a[9][5] - iT[6] * a[8][0] + iT[18] * a[3][0] - iT[19] * a[6][5] - t[0] + add) >> shift);
1472
    dst[20] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[8][2] - iT[11] * a[9][3] + iT[13] * a[6][3] - iT[24] * a[3][2] + iT[1] * a[0][1] + iT[10] * a[5][4] - iT[14] * a[6][4] - iT[23] * a[4][4] + iT[2] * a[1][5] + iT[9] * a[0][5] - iT[15] * a[2][5] - iT[22] * a[3][5] - iT[3] * a[9][2] + iT[8] * a[5][2] + iT[16] * a[2][3] + iT[21] * a[7][3] + iT[4] * a[5][5] - iT[7] * a[9][5] + iT[17] * a[7][0] + iT[20] * a[2][0] + iT[5] * a[0][4] + iT[6] * a[5][1] - iT[18] * a[6][1] - iT[19] * a[4][1] + t[0] + add) >> shift);
1473
    dst[21] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[2][1] - iT[11] * a[7][1] - iT[13] * a[5][4] + iT[24] * a[9][4] - iT[1] * a[6][2] - iT[10] * a[4][2] + iT[14] * a[5][2] + iT[23] * a[0][3] - iT[2] * a[2][4] - iT[9] * a[7][4] - iT[15] * a[5][1] + iT[22] * a[9][1] - iT[3] * a[6][5] - iT[8] * a[4][5] + iT[16] * a[5][5] + iT[21] * a[0][0] - iT[4] * a[4][0] - iT[7] * a[7][5] - iT[17] * a[1][5] + iT[20] * a[8][5] - iT[5] * a[7][2] - iT[6] * a[4][3] + iT[18] * a[8][2] - iT[19] * a[1][2] + t[1] + add) >> shift);
1474
    dst[22] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[6][1] - iT[11] * a[3][4] - iT[13] * a[9][1] + iT[24] * a[8][4] + iT[1] * a[4][3] + iT[10] * a[6][3] - iT[14] * a[0][2] - iT[23] * a[5][3] + iT[2] * a[7][0] + iT[9] * a[4][5] - iT[15] * a[8][0] + iT[22] * a[1][0] - iT[3] * a[3][1] + iT[8] * a[6][4] + iT[16] * a[8][1] - iT[21] * a[9][4] - iT[4] * a[2][3] - iT[7] * a[3][3] + iT[17] * a[0][3] + iT[20] * a[1][3] - iT[5] * a[7][5] - iT[6] * a[2][5] + iT[18] * a[9][0] - iT[19] * a[5][0] + t[1] + add) >> shift);
1475
    dst[23] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[0][3] - iT[11] * a[1][3] + iT[13] * a[3][3] + iT[24] * a[2][3] - iT[1] * a[8][0] + iT[10] * a[9][5] - iT[14] * a[6][5] + iT[23] * a[3][0] + iT[2] * a[8][2] - iT[9] * a[1][2] - iT[15] * a[4][3] - iT[22] * a[7][2] + iT[3] * a[0][5] + iT[8] * a[5][0] - iT[16] * a[6][0] - iT[21] * a[4][0] + iT[4] * a[8][4] - iT[7] * a[9][1] + iT[17] * a[6][1] - iT[20] * a[3][4] - iT[5] * a[5][4] - iT[6] * a[0][1] + iT[18] * a[4][4] + iT[19] * a[6][4] + t[0] + add) >> shift);
1476
    dst[26] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[3][0] - iT[11] * a[2][0] + iT[13] * a[1][0] + iT[24] * a[0][0] - iT[1] * a[2][5] - iT[10] * a[3][5] + iT[14] * a[0][5] + iT[23] * a[1][5] + iT[2] * a[4][4] + iT[9] * a[6][4] - iT[15] * a[0][1] - iT[22] * a[5][4] - iT[3] * a[4][1] - iT[8] * a[7][4] - iT[16] * a[1][4] + iT[21] * a[8][4] + iT[4] * a[2][2] + iT[7] * a[7][2] + iT[17] * a[5][3] - iT[20] * a[9][3] + iT[5] * a[3][3] - iT[6] * a[6][2] - iT[18] * a[8][3] + iT[19] * a[9][2] - t[1] + add) >> shift);
1477
    dst[27] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[3][3] + iT[11] * a[6][2] + iT[13] * a[8][3] - iT[24] * a[9][2] - iT[1] * a[2][0] - iT[10] * a[3][0] + iT[14] * a[0][0] + iT[23] * a[1][0] - iT[2] * a[6][3] + iT[9] * a[3][2] + iT[15] * a[9][3] - iT[22] * a[8][2] - iT[3] * a[4][0] - iT[8] * a[6][0] + iT[16] * a[0][5] + iT[21] * a[5][0] - iT[4] * a[7][4] - iT[7] * a[2][4] + iT[17] * a[9][1] - iT[20] * a[5][1] - iT[5] * a[4][4] - iT[6] * a[7][1] - iT[18] * a[1][1] + iT[19] * a[8][1] - t[1] + add) >> shift);
1478
    dst[28] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[0][4] + iT[11] * a[5][1] - iT[13] * a[6][1] - iT[24] * a[4][1] + iT[1] * a[9][3] - iT[10] * a[8][2] + iT[14] * a[3][2] - iT[23] * a[6][3] - iT[2] * a[1][0] - iT[9] * a[0][0] + iT[15] * a[2][0] + iT[22] * a[3][0] + iT[3] * a[8][1] - iT[8] * a[9][4] + iT[16] * a[6][4] - iT[21] * a[3][1] - iT[4] * a[5][2] - iT[7] * a[0][3] + iT[17] * a[4][2] + iT[20] * a[6][2] + iT[5] * a[1][5] - iT[6] * a[8][5] + iT[18] * a[7][5] + iT[19] * a[4][0] - t[0] + add) >> shift);
1479
    dst[30] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[5][3] - iT[11] * a[9][3] + iT[13] * a[7][2] + iT[24] * a[2][2] + iT[1] * a[0][1] + iT[10] * a[1][1] - iT[14] * a[3][1] - iT[23] * a[2][1] + iT[2] * a[9][0] - iT[9] * a[5][0] - iT[15] * a[2][5] - iT[22] * a[7][5] - iT[3] * a[5][2] + iT[8] * a[9][2] - iT[16] * a[7][3] - iT[21] * a[2][3] - iT[4] * a[0][0] - iT[7] * a[1][0] + iT[17] * a[3][0] + iT[20] * a[2][0] - iT[5] * a[9][1] + iT[6] * a[5][1] + iT[18] * a[2][4] + iT[19] * a[7][4] + t[0] + add) >> shift);
1480
    dst[31] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[3][5] + iT[11] * a[2][5] - iT[13] * a[1][5] - iT[24] * a[0][5] - iT[1] * a[3][4] - iT[10] * a[2][4] + iT[14] * a[1][4] + iT[23] * a[0][4] + iT[2] * a[3][3] + iT[9] * a[2][3] - iT[15] * a[1][3] - iT[22] * a[0][3] - iT[3] * a[3][2] - iT[8] * a[2][2] + iT[16] * a[1][2] + iT[21] * a[0][2] + iT[4] * a[3][1] + iT[7] * a[2][1] - iT[17] * a[1][1] - iT[20] * a[0][1] - iT[5] * a[3][0] - iT[6] * a[2][0] + iT[18] * a[1][0] + iT[19] * a[0][0] + t[1] + add) >> shift);
1481
1482
    // Outputs that depend only on the six b[] sums.
    dst[ 4] = Clip3(outputMinimum, outputMaximum, (int)(iT[ 4] * b[0] + iT[14] * b[1] + iT[24] * b[2] + iT[29] * b[3] + iT[19] * b[4] + iT[ 9] * b[5] + add) >> shift);
1483
    dst[ 9] = Clip3(outputMinimum, outputMaximum, (int)(iT[ 9] * b[0] + iT[29] * b[1] + iT[14] * b[2] - iT[ 4] * b[3] - iT[24] * b[4] - iT[19] * b[5] + add) >> shift);
1484
    dst[14] = Clip3(outputMinimum, outputMaximum, (int)(iT[14] * b[0] + iT[19] * b[1] - iT[ 9] * b[2] - iT[24] * b[3] + iT[ 4] * b[4] + iT[29] * b[5] + add) >> shift);
1485
    dst[19] = Clip3(outputMinimum, outputMaximum, (int)(iT[19] * b[0] + iT[ 4] * b[1] - iT[29] * b[2] + iT[ 9] * b[3] + iT[14] * b[4] - iT[24] * b[5] + add) >> shift);
1486
    dst[24] = Clip3(outputMinimum, outputMaximum, (int)(iT[24] * b[0] - iT[ 9] * b[1] - iT[ 4] * b[2] + iT[19] * b[3] - iT[29] * b[4] + iT[14] * b[5] + add) >> shift);
1487
    dst[29] = Clip3(outputMinimum, outputMaximum, (int)(iT[29] * b[0] - iT[24] * b[1] + iT[19] * b[2] - iT[14] * b[3] + iT[ 9] * b[4] - iT[ 4] * b[5] + add) >> shift);
1488
1489
    // Outputs that depend only on the two c[] sums.
    dst[12] = Clip3(outputMinimum, outputMaximum, (int)(iT[12]*c[0] + iT[25]*c[1] + add) >> shift);
1490
    dst[25] = Clip3(outputMinimum, outputMaximum, (int)(iT[25]*c[0] - iT[12]*c[1] + add) >> shift);
1491
1492
    // Next input line is one element further; next output line is 32 elements further.
    src++;
1493
    dst += 32;
1494
  }
1495
1496
  // dst now points just past the last computed line: clear the iSkipLine skipped lines (32 values each).
  if (iSkipLine)
1497
  {
1498
    memset(dst, 0, (iSkipLine * 32) * sizeof(TCoeff));
1499
  }
1500
#else
1501
0
  _fastInverseMM< 32 >( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDST7P32[TRANSFORM_INVERSE][0] );
1502
0
#endif
1503
0
}
1504
1505
1506
// ********************************** DCT-VIII **********************************
1507
void fastForwardDCT8_B4(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
1508
0
{
1509
0
  int i;
1510
0
  int rnd_factor = 1 << (shift - 1);
1511
0
  const TMatrixCoeff *iT = g_trCoreDCT8P4[TRANSFORM_FORWARD][0];
1512
1513
0
  int c[4];
1514
0
  TCoeff *pCoeff = dst;
1515
0
  const int  reducedLine = line - iSkipLine;
1516
0
  for (i = 0; i<reducedLine; i++)
1517
0
  {
1518
    // Intermediate Variables
1519
0
    c[0] = src[0] + src[3];
1520
0
    c[1] = src[2] + src[0];
1521
0
    c[2] = src[3] - src[2];
1522
0
    c[3] = iT[1] * src[1];
1523
1524
0
    dst[0 * line] = (iT[3] * c[0] + iT[2] * c[1] + c[3] + rnd_factor) >> shift;
1525
0
    dst[1 * line] = (iT[1] * (src[0] - src[2] - src[3]) + rnd_factor) >> shift;
1526
0
    dst[2 * line] = (iT[3] * c[2] + iT[2] * c[0] - c[3] + rnd_factor) >> shift;
1527
0
    dst[3 * line] = (iT[3] * c[1] - iT[2] * c[2] - c[3] + rnd_factor) >> shift;
1528
1529
0
    src += 4;
1530
0
    dst++;
1531
0
  }
1532
0
  if (iSkipLine)
1533
0
  {
1534
0
    dst = pCoeff + reducedLine;
1535
0
    for (i = 0; i<4; i++)
1536
0
    {
1537
0
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
1538
0
      dst += line;
1539
0
    }
1540
0
  }
1541
0
}
1542
1543
1544
void fastInverseDCT8_B4(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1545
0
{
1546
0
#if ENABLE_SIMD_TRAFO
1547
0
  _fastInverseMM<4>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDCT8P4[TRANSFORM_INVERSE][0] );
1548
#else
1549
  int i;
1550
  int rnd_factor = 1 << (shift - 1);
1551
1552
  const TMatrixCoeff *iT = g_trCoreDCT8P4[TRANSFORM_INVERSE][0];
1553
1554
  int c[4];
1555
  const int  reducedLine = line - iSkipLine;
1556
  for (i = 0; i<reducedLine; i++)
1557
  {
1558
    // Intermediate Variables
1559
    c[0] = src[0 * line] + src[3 * line];
1560
    c[1] = src[2 * line] + src[0 * line];
1561
    c[2] = src[3 * line] - src[2 * line];
1562
    c[3] = iT[1] * src[1 * line];
1563
1564
    dst[0] = Clip3(outputMinimum, outputMaximum, (iT[3] * c[0] + iT[2] * c[1] + c[3] + rnd_factor) >> shift);
1565
    dst[1] = Clip3(outputMinimum, outputMaximum, (iT[1] * (src[0 * line] - src[2 * line] - src[3 * line]) + rnd_factor) >> shift);
1566
    dst[2] = Clip3(outputMinimum, outputMaximum, (iT[3] * c[2] + iT[2] * c[0] - c[3] + rnd_factor) >> shift);
1567
    dst[3] = Clip3(outputMinimum, outputMaximum, (iT[3] * c[1] - iT[2] * c[2] - c[3] + rnd_factor) >> shift);
1568
1569
    dst += 4;
1570
    src++;
1571
  }
1572
  if (iSkipLine)
1573
  {
1574
    memset(dst, 0, (iSkipLine << 2) * sizeof(TCoeff));
1575
  }
1576
#endif
1577
0
}
1578
1579
1580
void fastForwardDCT8_B8(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
1581
0
{
1582
0
  _fastForwardMM< 8 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT8P8[TRANSFORM_FORWARD][0] );
1583
0
}
1584
1585
1586
void fastInverseDCT8_B8(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1587
0
{
1588
0
  _fastInverseMM< 8 >( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDCT8P8[TRANSFORM_INVERSE][0] );
1589
0
}
1590
1591
1592
// Forward 16-point DCT-VIII.
//
// Two compile-time variants exist:
//  - !JVET_M0497_MATRIX_MULT: a hand-factored kernel that folds the 16 inputs of a line into
//    partial sums (a, b, c, d, t) and evaluates each of the 16 outputs explicitly.
//  - otherwise: the work is delegated to _fastForwardMM< 16 > with g_trCoreDCT8P16.
//
// src         input samples, 16 per line, read contiguously (factored path).
// dst         output coefficients; output k of the current line is stored at dst[k * line].
// shift       right shift applied to every output after adding the rounding offset.
// line        number of lines and output stride.
// iSkipLine   number of trailing lines that are not computed; the matching trailing entries
//             of the first (16 - iSkipLine2) output rows are zeroed.
// iSkipLine2  number of trailing output rows that are zeroed as a whole.
void fastForwardDCT8_B16(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
1593
0
{
1594
#if !JVET_M0497_MATRIX_MULT
1595
  int j, k;
1596
  TCoeff a[5], b[5], c[5], d[5], t;
1597
  // Rounding offset: half of the divisor implied by 'shift' (0 when no shift is applied).
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
1598
1599
  // NOTE(review): this path reads the DST-VII table (g_trCoreDST7P16) and indexes the input
  // mirrored (src[15 - k]); presumably it relies on DCT-VIII being a flipped/sign-alternated
  // DST-VII - confirm against the g_trCoreDCT8P16 table used by the #else branch.
  const TMatrixCoeff *iT = g_trCoreDST7P16[TRANSFORM_FORWARD][0];
1600
1601
  TCoeff *pCoef = dst;
1602
  const int  reducedLine = line - iSkipLine;
1603
  const int  cutoff = 16 - iSkipLine2;
1604
1605
  for (j = 0; j < reducedLine; j++)
1606
  {
1607
    // Partial sums shared between outputs; d[k] feeds outputs 1, 4, 7, 10 and 13 only.
    for (k = 0; k < 5; k++)
1608
    {
1609
      a[k] = src[15 - k] + src[ 4 - k];
1610
      b[k] = src[ 6 + k] + src[ 4 - k];
1611
      c[k] = src[15 - k] - src[ 6 + k];
1612
      d[k] = src[15 - k] + src[ 6 + k] - src[ 4 - k];
1613
    }
1614
1615
    // src[5] is not covered by a/b/c/d; its contribution is added or subtracted as 't'.
    t = iT[10] * src[5];
1616
1617
    dst[ 1 * line] = ( - iT[ 2]*d[0] - iT[ 5]*d[1] - iT[ 8]*d[2] - iT[11]*d[3] - iT[14]*d[4] + add) >> shift;
1618
    dst[ 4 * line] = (   iT[ 8]*d[0] + iT[14]*d[1] + iT[ 5]*d[2] - iT[ 2]*d[3] - iT[11]*d[4] + add) >> shift;
1619
    dst[ 7 * line] = ( - iT[14]*d[0] - iT[ 2]*d[1] + iT[11]*d[2] + iT[ 5]*d[3] - iT[ 8]*d[4] + add) >> shift;
1620
    dst[10 * line] = (   iT[11]*d[0] - iT[ 8]*d[1] - iT[ 2]*d[2] + iT[14]*d[3] - iT[ 5]*d[4] + add) >> shift;
1621
    dst[13 * line] = ( - iT[ 5]*d[0] + iT[11]*d[1] - iT[14]*d[2] + iT[ 8]*d[3] - iT[ 2]*d[4] + add) >> shift;
1622
1623
    // Output 5 uses a single coefficient applied to a signed sum of eleven inputs.
    dst[ 5 * line] = ( - iT[10] * (src[15] + src[14] - src[12] - src[11] + src[9] + src[8] - src[6] - src[5] + src[3] + src[2] - src[0]) + add) >> shift;
1624
1625
    dst[ 0 * line] = (   iT[0]*a[0] + iT[9]*b[0] + iT[1]*a[1] + iT[8]*b[1] + iT[2]*a[2] + iT[7]*b[2] + iT[3]*a[3] + iT[6]*b[3] + iT[4]*a[4] + iT[5]*b[4] + t + add ) >> shift;
1626
    dst[ 2 * line] = (   iT[4]*c[0] - iT[5]*b[0] + iT[9]*c[1] - iT[0]*b[1] + iT[6]*c[2] + iT[3]*a[2] + iT[1]*c[3] + iT[8]*a[3] + iT[7]*a[4] + iT[2]*b[4] - t + add ) >> shift;
1627
    dst[ 3 * line] = ( - iT[6]*a[0] - iT[3]*b[0] - iT[2]*c[1] - iT[7]*a[1] - iT[9]*c[2] - iT[0]*a[2] - iT[4]*c[3] + iT[5]*b[3] + iT[1]*a[4] + iT[8]*b[4] - t + add ) >> shift;
1628
    dst[ 6 * line] = (   iT[8]*a[0] + iT[1]*c[0] + iT[6]*c[1] - iT[3]*b[1] - iT[5]*a[2] - iT[4]*b[2] - iT[7]*c[3] - iT[2]*a[3] - iT[0]*c[4] + iT[9]*b[4] + t + add ) >> shift;
1629
    dst[ 8 * line] = (   iT[4]*c[0] + iT[5]*a[0] - iT[0]*c[1] + iT[9]*b[1] - iT[3]*c[2] - iT[6]*a[2] + iT[1]*c[3] - iT[8]*b[3] + iT[2]*c[4] + iT[7]*a[4] - t + add ) >> shift;
1630
    dst[ 9 * line] = ( - iT[7]*c[0] - iT[2]*a[0] + iT[4]*a[1] + iT[5]*b[1] + iT[8]*c[2] - iT[1]*b[2] - iT[9]*a[3] - iT[0]*b[3] - iT[3]*c[4] + iT[6]*b[4] - t + add ) >> shift;
1631
    dst[11 * line] = ( - iT[9]*a[0] - iT[0]*b[0] + iT[8]*c[1] + iT[1]*a[1] - iT[2]*c[2] + iT[7]*b[2] - iT[6]*a[3] - iT[3]*b[3] + iT[5]*c[4] + iT[4]*a[4] + t + add ) >> shift;
1632
    dst[12 * line] = (   iT[7]*c[0] - iT[2]*b[0] - iT[5]*c[1] - iT[4]*a[1] + iT[8]*a[2] + iT[1]*b[2] - iT[0]*a[3] - iT[9]*b[3] - iT[6]*c[4] + iT[3]*b[4] + t + add ) >> shift;
1633
    dst[14 * line] = (   iT[3]*a[0] + iT[6]*b[0] - iT[7]*a[1] - iT[2]*b[1] + iT[0]*c[2] + iT[9]*a[2] - iT[4]*c[3] - iT[5]*a[3] + iT[8]*c[4] + iT[1]*a[4] - t + add ) >> shift;
1634
    dst[15 * line] = ( - iT[1]*c[0] + iT[8]*b[0] + iT[3]*c[1] - iT[6]*b[1] - iT[5]*c[2] + iT[4]*b[2] + iT[7]*c[3] - iT[2]*b[3] - iT[9]*c[4] + iT[0]*b[4] - t + add ) >> shift;
1635
1636
    // Next input line is 16 samples further; next output column is one element further.
    src += 16;
1637
    dst++;
1638
  }
1639
1640
  // Clear the columns of the skipped lines in the first 'cutoff' output rows.
  if (iSkipLine)
1641
  {
1642
    dst = pCoef + reducedLine;
1643
    for (j = 0; j < cutoff; j++)
1644
    {
1645
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
1646
      dst += line;
1647
    }
1648
  }
1649
1650
  // Clear the last iSkipLine2 output rows entirely; the loop above always writes all 16 rows,
  // so any values it stored there are overwritten here.
  if (iSkipLine2)
1651
  {
1652
    dst = pCoef + line * cutoff;
1653
    memset(dst, 0, sizeof(TCoeff) * line * iSkipLine2);
1654
  }
1655
#else
1656
0
  _fastForwardMM< 16 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT8P16[TRANSFORM_FORWARD][0] );
1657
0
#endif
1658
0
}
1659
1660
1661
// 16-point inverse DCT-8.
//
// Two compile-time variants:
//  - JVET_M0497_MATRIX_MULT disabled: hand-unrolled butterfly implementation below.
//  - JVET_M0497_MATRIX_MULT enabled:  forwards to _fastInverseMM< 16 > with the g_trCoreDCT8P16 inverse table.
//
// Hand-unrolled variant:
//  src            input coefficients; element i of the current column is read as src[i * line]
//  dst            output; 16 consecutive values are written per processed column
//  shift          right shift applied to every result; rounding offset is 1 << (shift - 1) when shift > 0, else 0
//  line           number of columns, also the stride between consecutive input elements of one column
//  iSkipLine      number of trailing columns that are not transformed; their outputs are zero-filled
//  iSkipLine2     not referenced by the hand-unrolled variant (only passed on to _fastInverseMM)
//  outputMinimum,
//  outputMaximum  every output value is clipped to this range via Clip3
//
// NOTE(review): the hand-unrolled variant reads g_trCoreDST7P16 whereas the matrix variant reads
// g_trCoreDCT8P16 - presumably the unrolled code derives DCT-8 from the DST-7 coefficients; confirm.
void fastInverseDCT8_B16(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1662
0
{
1663
#if !JVET_M0497_MATRIX_MULT
1664
  int j, k;
1665
  TCoeff a[5], b[5], c[5], d[5], t;
1666
1667
  // rounding offset added before the right shift
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
1668
1669
  const TMatrixCoeff *iT = g_trCoreDST7P16[TRANSFORM_INVERSE][0];
1670
1671
  // only the first reducedLine columns are transformed
  const int reducedLine = line - iSkipLine;
1672
1673
  for (j = 0; j < reducedLine; j++)
1674
  {
1675
    // shared sums/differences of input elements that are reused by several outputs below
    for (k = 0; k < 5; k++)
1676
    {
1677
      a[k] = src[(15 - k ) * line] + src[( 4 - k) * line];
1678
      b[k] = src[( 6 + k ) * line] + src[( 4 - k) * line];
1679
      c[k] = src[(15 - k ) * line] - src[( 6 + k) * line];
1680
      d[k] = src[(15 - k ) * line] + src[( 6 + k) * line] - src[(4 - k) * line];
1681
    }
1682
1683
    // contribution of input element 5, added or subtracted in the a/b/c based outputs
    t = iT[10] * src[5*line];
1684
1685
    // outputs 1, 4, 7, 10, 13 depend only on the d[] combinations
    dst[ 1] = Clip3(outputMinimum, outputMaximum, (int)( - iT[ 2]*d[0] - iT[ 5]*d[1] - iT[ 8]*d[2] - iT[11]*d[3] - iT[14]*d[4] + add) >> shift);
1686
    dst[ 4] = Clip3(outputMinimum, outputMaximum, (int)(   iT[ 8]*d[0] + iT[14]*d[1] + iT[ 5]*d[2] - iT[ 2]*d[3] - iT[11]*d[4] + add) >> shift);
1687
    dst[ 7] = Clip3(outputMinimum, outputMaximum, (int)( - iT[14]*d[0] - iT[ 2]*d[1] + iT[11]*d[2] + iT[ 5]*d[3] - iT[ 8]*d[4] + add) >> shift);
1688
    dst[10] = Clip3(outputMinimum, outputMaximum, (int)(   iT[11]*d[0] - iT[ 8]*d[1] - iT[ 2]*d[2] + iT[14]*d[3] - iT[ 5]*d[4] + add) >> shift);
1689
    dst[13] = Clip3(outputMinimum, outputMaximum, (int)( - iT[ 5]*d[0] + iT[11]*d[1] - iT[14]*d[2] + iT[ 8]*d[3] - iT[ 2]*d[4] + add) >> shift);
1690
1691
    // output 5 is a single coefficient (iT[10]) times a signed sum of input elements
    dst[ 5] = Clip3(outputMinimum, outputMaximum, (int)( - iT[10] * (src[15 * line] + src[14 * line] - src[12 * line] - src[11 * line] + src[9 * line] + src[8 * line] - src[6 * line] - src[5 * line] + src[3 * line] + src[2 * line] - src[0 * line]) + add) >> shift);
1692
1693
    // remaining outputs combine a[], b[], c[] with iT[0..9] plus/minus t
    dst[ 0] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0]*a[0] + iT[9]*b[0] + iT[1]*a[1] + iT[8]*b[1] + iT[2]*a[2] + iT[7]*b[2] + iT[3]*a[3] + iT[6]*b[3] + iT[4]*a[4] + iT[5]*b[4] + t + add ) >> shift );
1694
    dst[ 2] = Clip3(outputMinimum, outputMaximum, (int)(   iT[4]*c[0] - iT[5]*b[0] + iT[9]*c[1] - iT[0]*b[1] + iT[6]*c[2] + iT[3]*a[2] + iT[1]*c[3] + iT[8]*a[3] + iT[7]*a[4] + iT[2]*b[4] - t + add ) >> shift );
1695
    dst[ 3] = Clip3(outputMinimum, outputMaximum, (int)( - iT[6]*a[0] - iT[3]*b[0] - iT[2]*c[1] - iT[7]*a[1] - iT[9]*c[2] - iT[0]*a[2] - iT[4]*c[3] + iT[5]*b[3] + iT[1]*a[4] + iT[8]*b[4] - t + add ) >> shift );
1696
    dst[ 6] = Clip3(outputMinimum, outputMaximum, (int)(   iT[8]*a[0] + iT[1]*c[0] + iT[6]*c[1] - iT[3]*b[1] - iT[5]*a[2] - iT[4]*b[2] - iT[7]*c[3] - iT[2]*a[3] - iT[0]*c[4] + iT[9]*b[4] + t + add ) >> shift );
1697
    dst[ 8] = Clip3(outputMinimum, outputMaximum, (int)(   iT[4]*c[0] + iT[5]*a[0] - iT[0]*c[1] + iT[9]*b[1] - iT[3]*c[2] - iT[6]*a[2] + iT[1]*c[3] - iT[8]*b[3] + iT[2]*c[4] + iT[7]*a[4] - t + add ) >> shift );
1698
    dst[ 9] = Clip3(outputMinimum, outputMaximum, (int)( - iT[7]*c[0] - iT[2]*a[0] + iT[4]*a[1] + iT[5]*b[1] + iT[8]*c[2] - iT[1]*b[2] - iT[9]*a[3] - iT[0]*b[3] - iT[3]*c[4] + iT[6]*b[4] - t + add ) >> shift );
1699
    dst[11] = Clip3(outputMinimum, outputMaximum, (int)( - iT[9]*a[0] - iT[0]*b[0] + iT[8]*c[1] + iT[1]*a[1] - iT[2]*c[2] + iT[7]*b[2] - iT[6]*a[3] - iT[3]*b[3] + iT[5]*c[4] + iT[4]*a[4] + t + add ) >> shift );
1700
    dst[12] = Clip3(outputMinimum, outputMaximum, (int)(   iT[7]*c[0] - iT[2]*b[0] - iT[5]*c[1] - iT[4]*a[1] + iT[8]*a[2] + iT[1]*b[2] - iT[0]*a[3] - iT[9]*b[3] - iT[6]*c[4] + iT[3]*b[4] + t + add ) >> shift );
1701
    dst[14] = Clip3(outputMinimum, outputMaximum, (int)(   iT[3]*a[0] + iT[6]*b[0] - iT[7]*a[1] - iT[2]*b[1] + iT[0]*c[2] + iT[9]*a[2] - iT[4]*c[3] - iT[5]*a[3] + iT[8]*c[4] + iT[1]*a[4] - t + add ) >> shift );
1702
    dst[15] = Clip3(outputMinimum, outputMaximum, (int)( - iT[1]*c[0] + iT[8]*b[0] + iT[3]*c[1] - iT[6]*b[1] - iT[5]*c[2] + iT[4]*b[2] + iT[7]*c[3] - iT[2]*b[3] - iT[9]*c[4] + iT[0]*b[4] - t + add ) >> shift );
1703
1704
    // next input column, next group of 16 outputs
    src++;
1705
    dst += 16;
1706
  }
1707
1708
  if (iSkipLine)
1709
  {
1710
    // dst now points just past the last written group; zero the outputs of the skipped columns
    memset(dst, 0, (iSkipLine * 16) * sizeof(TCoeff));
1711
  }
1712
#else
1713
0
  _fastInverseMM< 16 >( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDCT8P16[TRANSFORM_INVERSE][0] );
1714
0
#endif
1715
0
}
1716
1717
1718
// 32-point forward DCT-8.
//
// Two compile-time variants:
//  - JVET_M0497_MATRIX_MULT disabled: hand-unrolled butterfly implementation below.
//  - JVET_M0497_MATRIX_MULT enabled:  forwards to _fastForwardMM< 32 > with the g_trCoreDCT8P32 forward table.
//
// Hand-unrolled variant:
//  src         input; 32 consecutive samples are consumed per processed line
//  dst         output; coefficient i of the current line is written to dst[i * line]
//  shift       right shift applied to every result; rounding offset is 1 << (shift - 1) when shift > 0, else 0
//  line        number of input lines, also the stride between consecutive output coefficients of one line
//  iSkipLine   number of trailing lines that are not transformed; their output entries are zero-filled
//  iSkipLine2  number of trailing output coefficient rows that are zero-filled after the transform
//
// No clipping is applied to the results.
//
// NOTE(review): the hand-unrolled variant reads g_trCoreDST7P32 whereas the matrix variant reads
// g_trCoreDCT8P32 - presumably the unrolled code derives DCT-8 from the DST-7 coefficients; confirm.
void fastForwardDCT8_B32(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
1719
0
{
1720
#if !JVET_M0497_MATRIX_MULT
1721
  int j, k;
1722
  TCoeff a[10][6];
1723
  TCoeff t[2];
1724
  TCoeff b[6];
1725
  TCoeff c[2];
1726
1727
  // rounding offset added before the right shift
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
1728
  const TMatrixCoeff *iT = g_trCoreDST7P32[TRANSFORM_FORWARD][0];
1729
  // remember the start of the output buffer for the zero-fill passes at the end
  TCoeff *pCoef = dst;
1730
  const int  reducedLine = line - iSkipLine;
1731
  // index of the first output coefficient row that is zeroed when iSkipLine2 != 0
  const int  cutoff = 32 - iSkipLine2;
1732
1733
  for (j = 0; j < reducedLine; j++)
1734
  {
1735
    // shared sums/differences of input samples that are reused by several outputs below
    for (k = 0; k < 6; k++)
1736
    {
1737
      a[0][k] = src[31-k] - src[20+k];
1738
      a[1][k] = src[31-k] + src[18-k];
1739
      a[2][k] = src[31-k] + src[ 7+k];
1740
      a[3][k] = src[31-k] - src[ 5-k];
1741
      a[4][k] = src[25-k] + src[13+k];
1742
      a[5][k] = src[25-k] + src[12-k];
1743
      a[6][k] = src[25-k] - src[   k];
1744
      a[7][k] = src[18-k] - src[ 7+k];
1745
      a[8][k] = src[18-k] + src[ 5-k];
1746
      a[9][k] = src[12-k] + src[   k];
1747
1748
      b[k] = src[31-k] + src[20+k] - src[18-k] - src[7+k] + src[5-k];
1749
    }
1750
1751
    for (k = 0; k < 2; k++)
1752
    {
1753
      c[k] = src[31-k] + src[28+k] - src[26-k] - src[23+k] + src[21-k] + src[18+k] - src[16-k] - src[13+k] + src[11-k] + src[8+k] - src[6-k] - src[3+k] + src[1-k];
1754
    }
1755
1756
    // contributions of input samples 6 and 19, added or subtracted in the a[][] based outputs
    t[0] = iT[12] * src[19] + iT[25] * src[6];
1757
    t[1] = iT[12] * src[6] - iT[25] * src[19];
1758
1759
    // outputs built from the a[][] combinations plus/minus t[0] or t[1]
    dst[ 0 * line] = (   iT[0] * a[3][0] + iT[11] * a[6][5] + iT[13] * a[8][0] + iT[24] * a[9][5] + iT[1] * a[3][1] + iT[10] * a[6][4] + iT[14] * a[8][1] + iT[23] * a[9][4] + iT[2] * a[3][2] + iT[9] * a[6][3] + iT[15] * a[8][2] + iT[22] * a[9][3] + iT[3] * a[3][3] + iT[8] * a[6][2] + iT[16] * a[8][3] + iT[21] * a[9][2] + iT[4] * a[3][4] + iT[7] * a[6][1] + iT[17] * a[8][4] + iT[20] * a[9][1] + iT[5] * a[3][5] + iT[6] * a[6][0] + iT[18] * a[8][5] + iT[19] * a[9][0] + t[0] + add) >> shift;
1760
    dst[ 1 * line] = (   iT[0] * a[5][2] - iT[11] * a[0][3] - iT[13] * a[4][2] - iT[24] * a[6][2] - iT[1] * a[9][1] - iT[10] * a[8][4] - iT[14] * a[3][4] - iT[23] * a[6][1] - iT[2] * a[0][0] + iT[9] * a[5][5] - iT[15] * a[6][5] - iT[22] * a[4][5] + iT[3] * a[5][3] - iT[8] * a[0][2] - iT[16] * a[4][3] - iT[21] * a[6][3] - iT[4] * a[9][0] - iT[7] * a[8][5] - iT[17] * a[3][5] - iT[20] * a[6][0] - iT[5] * a[0][1] + iT[6] * a[5][4] - iT[18] * a[6][4] - iT[19] * a[4][4] + t[1] + add) >> shift;
1761
    dst[ 3 * line] = (   iT[0] * a[9][4] + iT[11] * a[5][4] - iT[13] * a[2][1] + iT[24] * a[7][1] + iT[1] * a[0][3] + iT[10] * a[1][3] - iT[14] * a[3][3] - iT[23] * a[2][3] - iT[2] * a[8][5] - iT[9] * a[9][0] - iT[15] * a[6][0] - iT[22] * a[3][5] + iT[3] * a[1][4] + iT[8] * a[0][4] - iT[16] * a[2][4] - iT[21] * a[3][4] + iT[4] * a[5][3] + iT[7] * a[9][3] + iT[17] * a[7][2] - iT[20] * a[2][2] - iT[5] * a[8][0] - iT[6] * a[1][0] + iT[18] * a[4][5] + iT[19] * a[7][0] - t[1] + add) >> shift;
1762
    dst[ 4 * line] = ( - iT[0] * a[3][2] - iT[11] * a[2][2] + iT[13] * a[1][2] + iT[24] * a[0][2] + iT[1] * a[6][0] + iT[10] * a[3][5] + iT[14] * a[9][0] + iT[23] * a[8][5] - iT[2] * a[2][3] - iT[9] * a[3][3] + iT[15] * a[0][3] + iT[22] * a[1][3] - iT[3] * a[7][0] + iT[8] * a[2][0] - iT[16] * a[9][5] - iT[21] * a[5][5] + iT[4] * a[4][4] + iT[7] * a[6][4] + iT[17] * a[0][1] - iT[20] * a[5][4] - iT[5] * a[7][4] - iT[6] * a[4][1] + iT[18] * a[8][4] + iT[19] * a[1][4] - t[0] + add) >> shift;
1763
    dst[ 5 * line] = (   iT[0] * a[3][5] + iT[11] * a[6][0] + iT[13] * a[8][5] + iT[24] * a[9][0] - iT[1] * a[6][5] - iT[10] * a[3][0] - iT[14] * a[9][5] - iT[23] * a[8][0] + iT[2] * a[7][4] - iT[9] * a[2][4] + iT[15] * a[9][1] + iT[22] * a[5][1] + iT[3] * a[7][1] + iT[8] * a[4][4] - iT[16] * a[8][1] - iT[21] * a[1][1] - iT[4] * a[6][2] - iT[7] * a[4][2] + iT[17] * a[5][2] - iT[20] * a[0][3] + iT[5] * a[3][2] + iT[6] * a[2][2] - iT[18] * a[1][2] - iT[19] * a[0][2] - t[0] + add) >> shift;
1764
    dst[ 8 * line] = (   iT[0] * a[9][3] + iT[11] * a[8][2] + iT[13] * a[3][2] + iT[24] * a[6][3] + iT[1] * a[1][5] + iT[10] * a[0][5] - iT[14] * a[2][5] - iT[23] * a[3][5] - iT[2] * a[1][3] - iT[9] * a[8][3] + iT[15] * a[7][3] + iT[22] * a[4][2] - iT[3] * a[9][5] - iT[8] * a[5][5] + iT[16] * a[2][0] - iT[21] * a[7][0] - iT[4] * a[1][1] - iT[7] * a[0][1] + iT[17] * a[2][1] + iT[20] * a[3][1] + iT[5] * a[5][1] + iT[6] * a[9][1] + iT[18] * a[7][4] - iT[19] * a[2][4] + t[1] + add) >> shift;
1765
    dst[ 9 * line] = (   iT[0] * a[2][1] + iT[11] * a[3][1] - iT[13] * a[0][1] - iT[24] * a[1][1] - iT[1] * a[7][3] + iT[10] * a[2][3] - iT[14] * a[9][2] - iT[23] * a[5][2] - iT[2] * a[4][0] - iT[9] * a[7][5] + iT[15] * a[1][5] + iT[22] * a[8][5] - iT[3] * a[3][4] - iT[8] * a[2][4] + iT[16] * a[1][4] + iT[21] * a[0][4] - iT[4] * a[6][3] - iT[7] * a[3][2] - iT[17] * a[9][3] - iT[20] * a[8][2] - iT[5] * a[4][5] - iT[6] * a[6][5] - iT[18] * a[0][0] + iT[19] * a[5][5] + t[0] + add) >> shift;
1766
    dst[10 * line] = ( - iT[0] * a[6][1] - iT[11] * a[4][1] + iT[13] * a[5][1] - iT[24] * a[0][4] + iT[1] * a[2][2] - iT[10] * a[7][2] - iT[14] * a[5][3] - iT[23] * a[9][3] + iT[2] * a[6][4] + iT[9] * a[4][4] - iT[15] * a[5][4] + iT[22] * a[0][1] - iT[3] * a[2][5] + iT[8] * a[7][5] + iT[16] * a[5][0] + iT[21] * a[9][0] - iT[4] * a[7][0] - iT[7] * a[4][5] + iT[17] * a[8][0] + iT[20] * a[1][0] + iT[5] * a[4][2] + iT[6] * a[7][3] - iT[18] * a[1][3] - iT[19] * a[8][3] + t[0] + add) >> shift;
1767
    dst[11 * line] = ( - iT[0] * a[1][3] - iT[11] * a[0][3] + iT[13] * a[2][3] + iT[24] * a[3][3] - iT[1] * a[9][1] - iT[10] * a[5][1] + iT[14] * a[2][4] - iT[23] * a[7][4] - iT[2] * a[8][0] - iT[9] * a[9][5] - iT[15] * a[6][5] - iT[22] * a[3][0] + iT[3] * a[0][2] - iT[8] * a[5][3] + iT[16] * a[6][3] + iT[21] * a[4][3] + iT[4] * a[5][0] - iT[7] * a[0][5] - iT[17] * a[4][0] - iT[20] * a[6][0] + iT[5] * a[9][4] + iT[6] * a[5][4] - iT[18] * a[2][1] + iT[19] * a[7][1] + t[1] + add) >> shift;
1768
    dst[13 * line] = (   iT[0] * a[0][0] + iT[11] * a[1][0] - iT[13] * a[3][0] - iT[24] * a[2][0] + iT[1] * a[5][4] - iT[10] * a[0][1] - iT[14] * a[4][4] - iT[23] * a[6][4] - iT[2] * a[9][3] - iT[9] * a[5][3] + iT[15] * a[2][2] - iT[22] * a[7][2] + iT[3] * a[8][3] + iT[8] * a[9][2] + iT[16] * a[6][2] + iT[21] * a[3][3] - iT[4] * a[1][4] - iT[7] * a[8][4] + iT[17] * a[7][4] + iT[20] * a[4][1] + iT[5] * a[0][5] + iT[6] * a[1][5] - iT[18] * a[3][5] - iT[19] * a[2][5] - t[1] + add) >> shift;
1769
    dst[14 * line] = (   iT[0] * a[4][2] + iT[11] * a[7][3] - iT[13] * a[1][3] - iT[24] * a[8][3] + iT[1] * a[4][1] + iT[10] * a[6][1] + iT[14] * a[0][4] - iT[23] * a[5][1] - iT[2] * a[3][0] - iT[9] * a[2][0] + iT[15] * a[1][0] + iT[22] * a[0][0] - iT[3] * a[6][3] - iT[8] * a[4][3] + iT[16] * a[5][3] - iT[21] * a[0][2] - iT[4] * a[7][5] - iT[7] * a[4][0] + iT[17] * a[8][5] + iT[20] * a[1][5] + iT[5] * a[6][4] + iT[6] * a[3][1] + iT[18] * a[9][4] + iT[19] * a[8][1] - t[0] + add) >> shift;
1770
    dst[15 * line] = (   iT[0] * a[7][4] + iT[11] * a[4][1] - iT[13] * a[8][4] - iT[24] * a[1][4] - iT[1] * a[2][2] - iT[10] * a[3][2] + iT[14] * a[0][2] + iT[23] * a[1][2] - iT[2] * a[2][1] + iT[9] * a[7][1] + iT[15] * a[5][4] + iT[22] * a[9][4] + iT[3] * a[7][5] - iT[8] * a[2][5] + iT[16] * a[9][0] + iT[21] * a[5][0] + iT[4] * a[2][0] + iT[7] * a[3][0] - iT[17] * a[0][0] - iT[20] * a[1][0] + iT[5] * a[2][3] - iT[6] * a[7][3] - iT[18] * a[5][2] - iT[19] * a[9][2] - t[0] + add) >> shift;
1771
    dst[16 * line] = ( - iT[0] * a[0][1] + iT[11] * a[5][4] - iT[13] * a[6][4] - iT[24] * a[4][4] + iT[1] * a[0][3] - iT[10] * a[5][2] + iT[14] * a[6][2] + iT[23] * a[4][2] - iT[2] * a[0][5] + iT[9] * a[5][0] - iT[15] * a[6][0] - iT[22] * a[4][0] - iT[3] * a[0][4] - iT[8] * a[1][4] + iT[16] * a[3][4] + iT[21] * a[2][4] + iT[4] * a[0][2] + iT[7] * a[1][2] - iT[17] * a[3][2] - iT[20] * a[2][2] - iT[5] * a[0][0] - iT[6] * a[1][0] + iT[18] * a[3][0] + iT[19] * a[2][0] - t[1] + add) >> shift;
1772
    dst[18 * line] = (   iT[0] * a[0][5] + iT[11] * a[1][5] - iT[13] * a[3][5] - iT[24] * a[2][5] - iT[1] * a[1][0] - iT[10] * a[0][0] + iT[14] * a[2][0] + iT[23] * a[3][0] - iT[2] * a[5][1] + iT[9] * a[0][4] + iT[15] * a[4][1] + iT[22] * a[6][1] - iT[3] * a[8][1] - iT[8] * a[1][1] + iT[16] * a[4][4] + iT[21] * a[7][1] - iT[4] * a[9][2] - iT[7] * a[5][2] + iT[17] * a[2][3] - iT[20] * a[7][3] - iT[5] * a[9][3] - iT[6] * a[8][2] - iT[18] * a[3][2] - iT[19] * a[6][3] + t[1] + add) >> shift;
1773
    dst[20 * line] = ( - iT[0] * a[4][0] - iT[11] * a[6][0] - iT[13] * a[0][5] + iT[24] * a[5][0] + iT[1] * a[6][5] + iT[10] * a[4][5] - iT[14] * a[5][5] + iT[23] * a[0][0] - iT[2] * a[6][1] - iT[9] * a[3][4] - iT[15] * a[9][1] - iT[22] * a[8][4] + iT[3] * a[4][4] + iT[8] * a[7][1] - iT[16] * a[1][1] - iT[21] * a[8][1] - iT[4] * a[3][3] - iT[7] * a[2][3] + iT[17] * a[1][3] + iT[20] * a[0][3] + iT[5] * a[7][2] - iT[6] * a[2][2] + iT[18] * a[9][3] + iT[19] * a[5][3] + t[0] + add) >> shift;
1774
    dst[21 * line] = (   iT[0] * a[1][2] + iT[11] * a[8][2] - iT[13] * a[7][2] - iT[24] * a[4][3] + iT[1] * a[1][5] + iT[10] * a[8][5] - iT[14] * a[7][5] - iT[23] * a[4][0] + iT[2] * a[5][2] + iT[9] * a[9][2] + iT[15] * a[7][3] - iT[22] * a[2][3] + iT[3] * a[5][5] + iT[8] * a[9][5] + iT[16] * a[7][0] - iT[21] * a[2][0] + iT[4] * a[8][1] + iT[7] * a[9][4] + iT[17] * a[6][4] + iT[20] * a[3][1] + iT[5] * a[8][4] + iT[6] * a[9][1] + iT[18] * a[6][1] + iT[19] * a[3][4] + t[1] + add) >> shift;
1775
    dst[23 * line] = (   iT[0] * a[8][4] + iT[11] * a[9][1] + iT[13] * a[6][1] + iT[24] * a[3][4] - iT[1] * a[8][2] - iT[10] * a[1][2] + iT[14] * a[4][3] + iT[23] * a[7][2] - iT[2] * a[0][1] - iT[9] * a[1][1] + iT[15] * a[3][1] + iT[22] * a[2][1] + iT[3] * a[5][0] + iT[8] * a[9][0] + iT[16] * a[7][5] - iT[21] * a[2][5] - iT[4] * a[9][5] - iT[7] * a[8][0] - iT[17] * a[3][0] - iT[20] * a[6][5] + iT[5] * a[5][2] - iT[6] * a[0][3] - iT[18] * a[4][2] - iT[19] * a[6][2] - t[1] + add) >> shift;
1776
    dst[24 * line] = ( - iT[0] * a[2][3] + iT[11] * a[7][3] + iT[13] * a[5][2] + iT[24] * a[9][2] + iT[1] * a[4][1] + iT[10] * a[7][4] - iT[14] * a[1][4] - iT[23] * a[8][4] - iT[2] * a[4][5] - iT[9] * a[7][0] + iT[15] * a[1][0] + iT[22] * a[8][0] + iT[3] * a[4][3] + iT[8] * a[6][3] + iT[16] * a[0][2] - iT[21] * a[5][3] - iT[4] * a[2][5] - iT[7] * a[3][5] + iT[17] * a[0][5] + iT[20] * a[1][5] + iT[5] * a[2][1] + iT[6] * a[3][1] - iT[18] * a[0][1] - iT[19] * a[1][1] - t[0] + add) >> shift;
1777
    dst[25 * line] = ( - iT[0] * a[4][5] - iT[11] * a[6][5] - iT[13] * a[0][0] + iT[24] * a[5][5] - iT[1] * a[3][1] - iT[10] * a[2][1] + iT[14] * a[1][1] + iT[23] * a[0][1] + iT[2] * a[7][2] + iT[9] * a[4][3] - iT[15] * a[8][2] - iT[22] * a[1][2] + iT[3] * a[6][2] + iT[8] * a[3][3] + iT[16] * a[9][2] + iT[21] * a[8][3] + iT[4] * a[2][4] - iT[7] * a[7][4] - iT[17] * a[5][1] - iT[20] * a[9][1] - iT[5] * a[4][0] - iT[6] * a[6][0] - iT[18] * a[0][5] + iT[19] * a[5][0] - t[0] + add) >> shift;
1778
    dst[26 * line] = (   iT[0] * a[8][0] + iT[11] * a[1][0] - iT[13] * a[4][5] - iT[24] * a[7][0] + iT[1] * a[5][4] + iT[10] * a[9][4] + iT[14] * a[7][1] - iT[23] * a[2][1] - iT[2] * a[1][2] - iT[9] * a[0][2] + iT[15] * a[2][2] + iT[22] * a[3][2] - iT[3] * a[9][2] - iT[8] * a[8][3] - iT[16] * a[3][3] - iT[21] * a[6][2] + iT[4] * a[0][4] - iT[7] * a[5][1] + iT[17] * a[6][1] + iT[20] * a[4][1] + iT[5] * a[8][5] + iT[6] * a[1][5] - iT[18] * a[4][0] - iT[19] * a[7][5] - t[1] + add) >> shift;
1779
    dst[28 * line] = ( - iT[0] * a[5][1] - iT[11] * a[9][1] - iT[13] * a[7][4] + iT[24] * a[2][4] + iT[1] * a[8][2] + iT[10] * a[9][3] + iT[14] * a[6][3] + iT[23] * a[3][2] - iT[2] * a[9][4] - iT[9] * a[8][1] - iT[15] * a[3][1] - iT[22] * a[6][4] + iT[3] * a[9][0] + iT[8] * a[5][0] - iT[16] * a[2][5] + iT[21] * a[7][5] - iT[4] * a[5][5] + iT[7] * a[0][0] + iT[17] * a[4][5] + iT[20] * a[6][5] + iT[5] * a[1][3] + iT[6] * a[0][3] - iT[18] * a[2][3] - iT[19] * a[3][3] + t[1] + add) >> shift;
1780
    dst[29 * line] = (   iT[0] * a[6][4] + iT[11] * a[3][1] + iT[13] * a[9][4] + iT[24] * a[8][1] - iT[1] * a[7][3] - iT[10] * a[4][2] + iT[14] * a[8][3] + iT[23] * a[1][3] - iT[2] * a[3][5] - iT[9] * a[2][5] + iT[15] * a[1][5] + iT[22] * a[0][5] + iT[3] * a[2][4] + iT[8] * a[3][4] - iT[16] * a[0][4] - iT[21] * a[1][4] + iT[4] * a[4][3] + iT[7] * a[7][2] - iT[17] * a[1][2] - iT[20] * a[8][2] - iT[5] * a[3][0] - iT[6] * a[6][5] - iT[18] * a[8][0] - iT[19] * a[9][5] + t[0] + add) >> shift;
1781
    dst[30 * line] = ( - iT[0] * a[7][2] + iT[11] * a[2][2] - iT[13] * a[9][3] - iT[24] * a[5][3] - iT[1] * a[6][0] - iT[10] * a[4][0] + iT[14] * a[5][0] - iT[23] * a[0][5] - iT[2] * a[4][2] - iT[9] * a[6][2] - iT[15] * a[0][3] + iT[22] * a[5][2] + iT[3] * a[2][0] - iT[8] * a[7][0] - iT[16] * a[5][5] - iT[21] * a[9][5] + iT[4] * a[7][1] - iT[7] * a[2][1] + iT[17] * a[9][4] + iT[20] * a[5][4] + iT[5] * a[6][1] + iT[6] * a[4][1] - iT[18] * a[5][1] + iT[19] * a[0][4] + t[0] + add) >> shift;
1782
    dst[31 * line] = (   iT[0] * a[8][5] + iT[11] * a[1][5] - iT[13] * a[4][0] - iT[24] * a[7][5] - iT[1] * a[1][0] - iT[10] * a[8][0] + iT[14] * a[7][0] + iT[23] * a[4][5] - iT[2] * a[8][4] - iT[9] * a[1][4] + iT[15] * a[4][1] + iT[22] * a[7][4] + iT[3] * a[1][1] + iT[8] * a[8][1] - iT[16] * a[7][1] - iT[21] * a[4][4] + iT[4] * a[8][3] + iT[7] * a[1][3] - iT[17] * a[4][2] - iT[20] * a[7][3] - iT[5] * a[1][2] - iT[6] * a[8][2] + iT[18] * a[7][2] + iT[19] * a[4][3] + t[1] + add) >> shift;
1783
1784
    // outputs 2, 7, 12, 17, 22, 27 depend only on the b[] combinations
    dst[ 2 * line] = (   iT[ 4] * b[0] + iT[ 9] * b[1] + iT[14] * b[2] + iT[19] * b[3] + iT[24] * b[4] + iT[29] * b[5] + add) >> shift;
1785
    dst[ 7 * line] = ( - iT[14] * b[0] - iT[29] * b[1] - iT[19] * b[2] - iT[ 4] * b[3] + iT[ 9] * b[4] + iT[24] * b[5] + add) >> shift;
1786
    dst[12 * line] = (   iT[24] * b[0] + iT[14] * b[1] - iT[ 9] * b[2] - iT[29] * b[3] - iT[ 4] * b[4] + iT[19] * b[5] + add) >> shift;
1787
    dst[17 * line] = ( - iT[29] * b[0] + iT[ 4] * b[1] + iT[24] * b[2] - iT[ 9] * b[3] - iT[19] * b[4] + iT[14] * b[5] + add) >> shift;
1788
    dst[22 * line] = (   iT[19] * b[0] - iT[24] * b[1] + iT[ 4] * b[2] + iT[14] * b[3] - iT[29] * b[4] + iT[ 9] * b[5] + add) >> shift;
1789
    dst[27 * line] = ( - iT[ 9] * b[0] + iT[19] * b[1] - iT[29] * b[2] + iT[24] * b[3] - iT[14] * b[4] + iT[ 4] * b[5] + add) >> shift;
1790
1791
    // outputs 6 and 19 depend only on the c[] combinations
    dst[ 6 * line] = (   iT[12] * c[0] + iT[25] * c[1] + add) >> shift;
1792
    dst[19 * line] = ( - iT[25] * c[0] + iT[12] * c[1] + add) >> shift;
1793
1794
    // next input line, next output column
    src += 32;
1795
    dst++;
1796
  }
1797
1798
  if (iSkipLine)
1799
  {
1800
    // in each of the first `cutoff` output rows, zero the iSkipLine entries belonging to the skipped lines
    dst = pCoef + reducedLine;
1801
    for (j = 0; j < cutoff; j++)
1802
    {
1803
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
1804
      dst += line;
1805
    }
1806
  }
1807
1808
  if (iSkipLine2)
1809
  {
1810
    // zero the last iSkipLine2 output rows completely (this overwrites values stored by the loop above)
    dst = pCoef + line * cutoff;
1811
    memset(dst, 0, sizeof(TCoeff) * line * iSkipLine2);
1812
  }
1813
#else
1814
0
  _fastForwardMM< 32 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT8P32[TRANSFORM_FORWARD][0] );
1815
0
#endif
1816
0
}
1817
1818
1819
// 32-point inverse DCT-8.
//
// Two compile-time variants:
//  - JVET_M0497_MATRIX_MULT disabled: hand-unrolled butterfly implementation below.
//  - JVET_M0497_MATRIX_MULT enabled:  forwards to _fastInverseMM< 32 > with the g_trCoreDCT8P32 inverse table.
//
// Hand-unrolled variant:
//  src            input coefficients; element i of the current column is read as src[i * line]
//  dst            output; 32 consecutive values are written per processed column
//  shift          right shift applied to every result; rounding offset is 1 << (shift - 1) when shift > 0, else 0
//  line           number of columns, also the stride between consecutive input elements of one column
//  iSkipLine      number of trailing columns that are not transformed; their outputs are zero-filled
//  iSkipLine2     not referenced by the hand-unrolled variant (only passed on to _fastInverseMM)
//  outputMinimum,
//  outputMaximum  every output value is clipped to this range via Clip3
//
// NOTE(review): the hand-unrolled variant reads g_trCoreDST7P32 whereas the matrix variant reads
// g_trCoreDCT8P32 - presumably the unrolled code derives DCT-8 from the DST-7 coefficients; confirm.
void fastInverseDCT8_B32(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1820
0
{
1821
#if !JVET_M0497_MATRIX_MULT
1822
  int j, k;
1823
  TCoeff a[10][6];
1824
  TCoeff t[2];
1825
  TCoeff b[6];
1826
  TCoeff c[2];
1827
  // rounding offset added before the right shift
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
1828
1829
  const TMatrixCoeff *iT = g_trCoreDST7P32[TRANSFORM_INVERSE][0];
1830
1831
  // only the first reducedLine columns are transformed
  const int  reducedLine = line - iSkipLine;
1832
1833
  for (j = 0; j < reducedLine; j++)
1834
  {
1835
    // shared sums/differences of input elements that are reused by several outputs below
    for (k = 0; k < 6; k++)
1836
    {
1837
      a[0][k] = src[(31 - k)*line] - src[(20 + k)*line];
1838
      a[1][k] = src[(31 - k)*line] + src[(18 - k)*line];
1839
      a[2][k] = src[(31 - k)*line] + src[( 7 + k)*line];
1840
      a[3][k] = src[(31 - k)*line] - src[( 5 - k)*line];
1841
      a[4][k] = src[(25 - k)*line] + src[(13 + k)*line];
1842
      a[5][k] = src[(25 - k)*line] + src[(12 - k)*line];
1843
      a[6][k] = src[(25 - k)*line] - src[      k *line];
1844
      a[7][k] = src[(18 - k)*line] - src[( 7 + k)*line];
1845
      a[8][k] = src[(18 - k)*line] + src[( 5 - k)*line];
1846
      a[9][k] = src[(12 - k)*line] + src[      k *line];
1847
1848
      b[k] = src[(31 - k)*line] + src[(20 + k)*line] - src[(18 - k)*line] - src[(7 + k)*line] + src[(5 - k)*line];
1849
    }
1850
1851
    for (k = 0; k < 2; k++)
1852
    {
1853
      c[k] = src[(31 - k)*line] + src[(28 + k)*line] - src[(26 - k)*line] - src[(23 + k)*line] + src[(21 - k)*line] + src[(18 + k)*line] - src[(16 - k)*line] - src[(13 + k)*line] + src[(11 - k)*line] + src[(8 + k)*line] - src[(6 - k)*line] - src[(3 + k)*line] + src[(1 - k)*line];
1854
    }
1855
1856
    // contributions of input elements 6 and 19, added or subtracted in the a[][] based outputs
    t[0] = iT[12] * src[19 * line] + iT[25] * src[ 6 * line];
1857
    t[1] = iT[12] * src[ 6 * line] - iT[25] * src[19 * line];
1858
1859
    // outputs built from the a[][] combinations plus/minus t[0] or t[1]
    dst[ 0] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[3][0] + iT[11] * a[6][5] + iT[13] * a[8][0] + iT[24] * a[9][5] + iT[1] * a[3][1] + iT[10] * a[6][4] + iT[14] * a[8][1] + iT[23] * a[9][4] + iT[2] * a[3][2] + iT[9] * a[6][3] + iT[15] * a[8][2] + iT[22] * a[9][3] + iT[3] * a[3][3] + iT[8] * a[6][2] + iT[16] * a[8][3] + iT[21] * a[9][2] + iT[4] * a[3][4] + iT[7] * a[6][1] + iT[17] * a[8][4] + iT[20] * a[9][1] + iT[5] * a[3][5] + iT[6] * a[6][0] + iT[18] * a[8][5] + iT[19] * a[9][0] + t[0] + add) >> shift);
1860
    dst[ 1] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[5][2] - iT[11] * a[0][3] - iT[13] * a[4][2] - iT[24] * a[6][2] - iT[1] * a[9][1] - iT[10] * a[8][4] - iT[14] * a[3][4] - iT[23] * a[6][1] - iT[2] * a[0][0] + iT[9] * a[5][5] - iT[15] * a[6][5] - iT[22] * a[4][5] + iT[3] * a[5][3] - iT[8] * a[0][2] - iT[16] * a[4][3] - iT[21] * a[6][3] - iT[4] * a[9][0] - iT[7] * a[8][5] - iT[17] * a[3][5] - iT[20] * a[6][0] - iT[5] * a[0][1] + iT[6] * a[5][4] - iT[18] * a[6][4] - iT[19] * a[4][4] + t[1] + add) >> shift);
1861
    dst[ 3] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[9][4] + iT[11] * a[5][4] - iT[13] * a[2][1] + iT[24] * a[7][1] + iT[1] * a[0][3] + iT[10] * a[1][3] - iT[14] * a[3][3] - iT[23] * a[2][3] - iT[2] * a[8][5] - iT[9] * a[9][0] - iT[15] * a[6][0] - iT[22] * a[3][5] + iT[3] * a[1][4] + iT[8] * a[0][4] - iT[16] * a[2][4] - iT[21] * a[3][4] + iT[4] * a[5][3] + iT[7] * a[9][3] + iT[17] * a[7][2] - iT[20] * a[2][2] - iT[5] * a[8][0] - iT[6] * a[1][0] + iT[18] * a[4][5] + iT[19] * a[7][0] - t[1] + add) >> shift);
1862
    dst[ 4] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[3][2] - iT[11] * a[2][2] + iT[13] * a[1][2] + iT[24] * a[0][2] + iT[1] * a[6][0] + iT[10] * a[3][5] + iT[14] * a[9][0] + iT[23] * a[8][5] - iT[2] * a[2][3] - iT[9] * a[3][3] + iT[15] * a[0][3] + iT[22] * a[1][3] - iT[3] * a[7][0] + iT[8] * a[2][0] - iT[16] * a[9][5] - iT[21] * a[5][5] + iT[4] * a[4][4] + iT[7] * a[6][4] + iT[17] * a[0][1] - iT[20] * a[5][4] - iT[5] * a[7][4] - iT[6] * a[4][1] + iT[18] * a[8][4] + iT[19] * a[1][4] - t[0] + add) >> shift);
1863
    dst[ 5] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[3][5] + iT[11] * a[6][0] + iT[13] * a[8][5] + iT[24] * a[9][0] - iT[1] * a[6][5] - iT[10] * a[3][0] - iT[14] * a[9][5] - iT[23] * a[8][0] + iT[2] * a[7][4] - iT[9] * a[2][4] + iT[15] * a[9][1] + iT[22] * a[5][1] + iT[3] * a[7][1] + iT[8] * a[4][4] - iT[16] * a[8][1] - iT[21] * a[1][1] - iT[4] * a[6][2] - iT[7] * a[4][2] + iT[17] * a[5][2] - iT[20] * a[0][3] + iT[5] * a[3][2] + iT[6] * a[2][2] - iT[18] * a[1][2] - iT[19] * a[0][2] - t[0] + add) >> shift);
1864
    dst[ 8] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[9][3] + iT[11] * a[8][2] + iT[13] * a[3][2] + iT[24] * a[6][3] + iT[1] * a[1][5] + iT[10] * a[0][5] - iT[14] * a[2][5] - iT[23] * a[3][5] - iT[2] * a[1][3] - iT[9] * a[8][3] + iT[15] * a[7][3] + iT[22] * a[4][2] - iT[3] * a[9][5] - iT[8] * a[5][5] + iT[16] * a[2][0] - iT[21] * a[7][0] - iT[4] * a[1][1] - iT[7] * a[0][1] + iT[17] * a[2][1] + iT[20] * a[3][1] + iT[5] * a[5][1] + iT[6] * a[9][1] + iT[18] * a[7][4] - iT[19] * a[2][4] + t[1] + add) >> shift);
1865
    dst[ 9] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[2][1] + iT[11] * a[3][1] - iT[13] * a[0][1] - iT[24] * a[1][1] - iT[1] * a[7][3] + iT[10] * a[2][3] - iT[14] * a[9][2] - iT[23] * a[5][2] - iT[2] * a[4][0] - iT[9] * a[7][5] + iT[15] * a[1][5] + iT[22] * a[8][5] - iT[3] * a[3][4] - iT[8] * a[2][4] + iT[16] * a[1][4] + iT[21] * a[0][4] - iT[4] * a[6][3] - iT[7] * a[3][2] - iT[17] * a[9][3] - iT[20] * a[8][2] - iT[5] * a[4][5] - iT[6] * a[6][5] - iT[18] * a[0][0] + iT[19] * a[5][5] + t[0] + add) >> shift);
1866
    dst[10] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[6][1] - iT[11] * a[4][1] + iT[13] * a[5][1] - iT[24] * a[0][4] + iT[1] * a[2][2] - iT[10] * a[7][2] - iT[14] * a[5][3] - iT[23] * a[9][3] + iT[2] * a[6][4] + iT[9] * a[4][4] - iT[15] * a[5][4] + iT[22] * a[0][1] - iT[3] * a[2][5] + iT[8] * a[7][5] + iT[16] * a[5][0] + iT[21] * a[9][0] - iT[4] * a[7][0] - iT[7] * a[4][5] + iT[17] * a[8][0] + iT[20] * a[1][0] + iT[5] * a[4][2] + iT[6] * a[7][3] - iT[18] * a[1][3] - iT[19] * a[8][3] + t[0] + add) >> shift);
1867
    dst[11] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[1][3] - iT[11] * a[0][3] + iT[13] * a[2][3] + iT[24] * a[3][3] - iT[1] * a[9][1] - iT[10] * a[5][1] + iT[14] * a[2][4] - iT[23] * a[7][4] - iT[2] * a[8][0] - iT[9] * a[9][5] - iT[15] * a[6][5] - iT[22] * a[3][0] + iT[3] * a[0][2] - iT[8] * a[5][3] + iT[16] * a[6][3] + iT[21] * a[4][3] + iT[4] * a[5][0] - iT[7] * a[0][5] - iT[17] * a[4][0] - iT[20] * a[6][0] + iT[5] * a[9][4] + iT[6] * a[5][4] - iT[18] * a[2][1] + iT[19] * a[7][1] + t[1] + add) >> shift);
1868
    dst[13] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[0][0] + iT[11] * a[1][0] - iT[13] * a[3][0] - iT[24] * a[2][0] + iT[1] * a[5][4] - iT[10] * a[0][1] - iT[14] * a[4][4] - iT[23] * a[6][4] - iT[2] * a[9][3] - iT[9] * a[5][3] + iT[15] * a[2][2] - iT[22] * a[7][2] + iT[3] * a[8][3] + iT[8] * a[9][2] + iT[16] * a[6][2] + iT[21] * a[3][3] - iT[4] * a[1][4] - iT[7] * a[8][4] + iT[17] * a[7][4] + iT[20] * a[4][1] + iT[5] * a[0][5] + iT[6] * a[1][5] - iT[18] * a[3][5] - iT[19] * a[2][5] - t[1] + add) >> shift);
1869
    dst[14] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[4][2] + iT[11] * a[7][3] - iT[13] * a[1][3] - iT[24] * a[8][3] + iT[1] * a[4][1] + iT[10] * a[6][1] + iT[14] * a[0][4] - iT[23] * a[5][1] - iT[2] * a[3][0] - iT[9] * a[2][0] + iT[15] * a[1][0] + iT[22] * a[0][0] - iT[3] * a[6][3] - iT[8] * a[4][3] + iT[16] * a[5][3] - iT[21] * a[0][2] - iT[4] * a[7][5] - iT[7] * a[4][0] + iT[17] * a[8][5] + iT[20] * a[1][5] + iT[5] * a[6][4] + iT[6] * a[3][1] + iT[18] * a[9][4] + iT[19] * a[8][1] - t[0] + add) >> shift);
1870
    dst[15] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[7][4] + iT[11] * a[4][1] - iT[13] * a[8][4] - iT[24] * a[1][4] - iT[1] * a[2][2] - iT[10] * a[3][2] + iT[14] * a[0][2] + iT[23] * a[1][2] - iT[2] * a[2][1] + iT[9] * a[7][1] + iT[15] * a[5][4] + iT[22] * a[9][4] + iT[3] * a[7][5] - iT[8] * a[2][5] + iT[16] * a[9][0] + iT[21] * a[5][0] + iT[4] * a[2][0] + iT[7] * a[3][0] - iT[17] * a[0][0] - iT[20] * a[1][0] + iT[5] * a[2][3] - iT[6] * a[7][3] - iT[18] * a[5][2] - iT[19] * a[9][2] - t[0] + add) >> shift);
1871
    dst[16] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[0][1] + iT[11] * a[5][4] - iT[13] * a[6][4] - iT[24] * a[4][4] + iT[1] * a[0][3] - iT[10] * a[5][2] + iT[14] * a[6][2] + iT[23] * a[4][2] - iT[2] * a[0][5] + iT[9] * a[5][0] - iT[15] * a[6][0] - iT[22] * a[4][0] - iT[3] * a[0][4] - iT[8] * a[1][4] + iT[16] * a[3][4] + iT[21] * a[2][4] + iT[4] * a[0][2] + iT[7] * a[1][2] - iT[17] * a[3][2] - iT[20] * a[2][2] - iT[5] * a[0][0] - iT[6] * a[1][0] + iT[18] * a[3][0] + iT[19] * a[2][0] - t[1] + add) >> shift);
1872
    dst[18] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[0][5] + iT[11] * a[1][5] - iT[13] * a[3][5] - iT[24] * a[2][5] - iT[1] * a[1][0] - iT[10] * a[0][0] + iT[14] * a[2][0] + iT[23] * a[3][0] - iT[2] * a[5][1] + iT[9] * a[0][4] + iT[15] * a[4][1] + iT[22] * a[6][1] - iT[3] * a[8][1] - iT[8] * a[1][1] + iT[16] * a[4][4] + iT[21] * a[7][1] - iT[4] * a[9][2] - iT[7] * a[5][2] + iT[17] * a[2][3] - iT[20] * a[7][3] - iT[5] * a[9][3] - iT[6] * a[8][2] - iT[18] * a[3][2] - iT[19] * a[6][3] + t[1] + add) >> shift);
1873
    dst[20] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[4][0] - iT[11] * a[6][0] - iT[13] * a[0][5] + iT[24] * a[5][0] + iT[1] * a[6][5] + iT[10] * a[4][5] - iT[14] * a[5][5] + iT[23] * a[0][0] - iT[2] * a[6][1] - iT[9] * a[3][4] - iT[15] * a[9][1] - iT[22] * a[8][4] + iT[3] * a[4][4] + iT[8] * a[7][1] - iT[16] * a[1][1] - iT[21] * a[8][1] - iT[4] * a[3][3] - iT[7] * a[2][3] + iT[17] * a[1][3] + iT[20] * a[0][3] + iT[5] * a[7][2] - iT[6] * a[2][2] + iT[18] * a[9][3] + iT[19] * a[5][3] + t[0] + add) >> shift);
1874
    dst[21] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[1][2] + iT[11] * a[8][2] - iT[13] * a[7][2] - iT[24] * a[4][3] + iT[1] * a[1][5] + iT[10] * a[8][5] - iT[14] * a[7][5] - iT[23] * a[4][0] + iT[2] * a[5][2] + iT[9] * a[9][2] + iT[15] * a[7][3] - iT[22] * a[2][3] + iT[3] * a[5][5] + iT[8] * a[9][5] + iT[16] * a[7][0] - iT[21] * a[2][0] + iT[4] * a[8][1] + iT[7] * a[9][4] + iT[17] * a[6][4] + iT[20] * a[3][1] + iT[5] * a[8][4] + iT[6] * a[9][1] + iT[18] * a[6][1] + iT[19] * a[3][4] + t[1] + add) >> shift);
1875
    dst[23] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[8][4] + iT[11] * a[9][1] + iT[13] * a[6][1] + iT[24] * a[3][4] - iT[1] * a[8][2] - iT[10] * a[1][2] + iT[14] * a[4][3] + iT[23] * a[7][2] - iT[2] * a[0][1] - iT[9] * a[1][1] + iT[15] * a[3][1] + iT[22] * a[2][1] + iT[3] * a[5][0] + iT[8] * a[9][0] + iT[16] * a[7][5] - iT[21] * a[2][5] - iT[4] * a[9][5] - iT[7] * a[8][0] - iT[17] * a[3][0] - iT[20] * a[6][5] + iT[5] * a[5][2] - iT[6] * a[0][3] - iT[18] * a[4][2] - iT[19] * a[6][2] - t[1] + add) >> shift);
1876
    dst[24] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[2][3] + iT[11] * a[7][3] + iT[13] * a[5][2] + iT[24] * a[9][2] + iT[1] * a[4][1] + iT[10] * a[7][4] - iT[14] * a[1][4] - iT[23] * a[8][4] - iT[2] * a[4][5] - iT[9] * a[7][0] + iT[15] * a[1][0] + iT[22] * a[8][0] + iT[3] * a[4][3] + iT[8] * a[6][3] + iT[16] * a[0][2] - iT[21] * a[5][3] - iT[4] * a[2][5] - iT[7] * a[3][5] + iT[17] * a[0][5] + iT[20] * a[1][5] + iT[5] * a[2][1] + iT[6] * a[3][1] - iT[18] * a[0][1] - iT[19] * a[1][1] - t[0] + add) >> shift);
1877
    dst[25] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[4][5] - iT[11] * a[6][5] - iT[13] * a[0][0] + iT[24] * a[5][5] - iT[1] * a[3][1] - iT[10] * a[2][1] + iT[14] * a[1][1] + iT[23] * a[0][1] + iT[2] * a[7][2] + iT[9] * a[4][3] - iT[15] * a[8][2] - iT[22] * a[1][2] + iT[3] * a[6][2] + iT[8] * a[3][3] + iT[16] * a[9][2] + iT[21] * a[8][3] + iT[4] * a[2][4] - iT[7] * a[7][4] - iT[17] * a[5][1] - iT[20] * a[9][1] - iT[5] * a[4][0] - iT[6] * a[6][0] - iT[18] * a[0][5] + iT[19] * a[5][0] - t[0] + add) >> shift);
1878
    dst[26] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[8][0] + iT[11] * a[1][0] - iT[13] * a[4][5] - iT[24] * a[7][0] + iT[1] * a[5][4] + iT[10] * a[9][4] + iT[14] * a[7][1] - iT[23] * a[2][1] - iT[2] * a[1][2] - iT[9] * a[0][2] + iT[15] * a[2][2] + iT[22] * a[3][2] - iT[3] * a[9][2] - iT[8] * a[8][3] - iT[16] * a[3][3] - iT[21] * a[6][2] + iT[4] * a[0][4] - iT[7] * a[5][1] + iT[17] * a[6][1] + iT[20] * a[4][1] + iT[5] * a[8][5] + iT[6] * a[1][5] - iT[18] * a[4][0] - iT[19] * a[7][5] - t[1] + add) >> shift);
1879
    dst[28] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[5][1] - iT[11] * a[9][1] - iT[13] * a[7][4] + iT[24] * a[2][4] + iT[1] * a[8][2] + iT[10] * a[9][3] + iT[14] * a[6][3] + iT[23] * a[3][2] - iT[2] * a[9][4] - iT[9] * a[8][1] - iT[15] * a[3][1] - iT[22] * a[6][4] + iT[3] * a[9][0] + iT[8] * a[5][0] - iT[16] * a[2][5] + iT[21] * a[7][5] - iT[4] * a[5][5] + iT[7] * a[0][0] + iT[17] * a[4][5] + iT[20] * a[6][5] + iT[5] * a[1][3] + iT[6] * a[0][3] - iT[18] * a[2][3] - iT[19] * a[3][3] + t[1] + add) >> shift);
1880
    dst[29] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[6][4] + iT[11] * a[3][1] + iT[13] * a[9][4] + iT[24] * a[8][1] - iT[1] * a[7][3] - iT[10] * a[4][2] + iT[14] * a[8][3] + iT[23] * a[1][3] - iT[2] * a[3][5] - iT[9] * a[2][5] + iT[15] * a[1][5] + iT[22] * a[0][5] + iT[3] * a[2][4] + iT[8] * a[3][4] - iT[16] * a[0][4] - iT[21] * a[1][4] + iT[4] * a[4][3] + iT[7] * a[7][2] - iT[17] * a[1][2] - iT[20] * a[8][2] - iT[5] * a[3][0] - iT[6] * a[6][5] - iT[18] * a[8][0] - iT[19] * a[9][5] + t[0] + add) >> shift);
1881
    dst[30] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[7][2] + iT[11] * a[2][2] - iT[13] * a[9][3] - iT[24] * a[5][3] - iT[1] * a[6][0] - iT[10] * a[4][0] + iT[14] * a[5][0] - iT[23] * a[0][5] - iT[2] * a[4][2] - iT[9] * a[6][2] - iT[15] * a[0][3] + iT[22] * a[5][2] + iT[3] * a[2][0] - iT[8] * a[7][0] - iT[16] * a[5][5] - iT[21] * a[9][5] + iT[4] * a[7][1] - iT[7] * a[2][1] + iT[17] * a[9][4] + iT[20] * a[5][4] + iT[5] * a[6][1] + iT[6] * a[4][1] - iT[18] * a[5][1] + iT[19] * a[0][4] + t[0] + add) >> shift);
1882
    dst[31] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[8][5] + iT[11] * a[1][5] - iT[13] * a[4][0] - iT[24] * a[7][5] - iT[1] * a[1][0] - iT[10] * a[8][0] + iT[14] * a[7][0] + iT[23] * a[4][5] - iT[2] * a[8][4] - iT[9] * a[1][4] + iT[15] * a[4][1] + iT[22] * a[7][4] + iT[3] * a[1][1] + iT[8] * a[8][1] - iT[16] * a[7][1] - iT[21] * a[4][4] + iT[4] * a[8][3] + iT[7] * a[1][3] - iT[17] * a[4][2] - iT[20] * a[7][3] - iT[5] * a[1][2] - iT[6] * a[8][2] + iT[18] * a[7][2] + iT[19] * a[4][3] + t[1] + add) >> shift);
1883
1884
    // outputs 2, 7, 12, 17, 22, 27 depend only on the b[] combinations
    dst[ 2] = Clip3(outputMinimum, outputMaximum, (int)(   iT[ 4] * b[0] + iT[ 9] * b[1] + iT[14] * b[2] + iT[19] * b[3] + iT[24] * b[4] + iT[29] * b[5] + add) >> shift);
1885
    dst[ 7] = Clip3(outputMinimum, outputMaximum, (int)( - iT[14] * b[0] - iT[29] * b[1] - iT[19] * b[2] - iT[ 4] * b[3] + iT[ 9] * b[4] + iT[24] * b[5] + add) >> shift);
1886
    dst[12] = Clip3(outputMinimum, outputMaximum, (int)(   iT[24] * b[0] + iT[14] * b[1] - iT[ 9] * b[2] - iT[29] * b[3] - iT[ 4] * b[4] + iT[19] * b[5] + add) >> shift);
1887
    dst[17] = Clip3(outputMinimum, outputMaximum, (int)( - iT[29] * b[0] + iT[ 4] * b[1] + iT[24] * b[2] - iT[ 9] * b[3] - iT[19] * b[4] + iT[14] * b[5] + add) >> shift);
1888
    dst[22] = Clip3(outputMinimum, outputMaximum, (int)(   iT[19] * b[0] - iT[24] * b[1] + iT[ 4] * b[2] + iT[14] * b[3] - iT[29] * b[4] + iT[ 9] * b[5] + add) >> shift);
1889
    dst[27] = Clip3(outputMinimum, outputMaximum, (int)( - iT[ 9] * b[0] + iT[19] * b[1] - iT[29] * b[2] + iT[24] * b[3] - iT[14] * b[4] + iT[ 4] * b[5] + add) >> shift);
1890
1891
    // outputs 6 and 19 depend only on the c[] combinations
    dst[ 6] = Clip3(outputMinimum, outputMaximum, (int)(   iT[12] * c[0] + iT[25] * c[1] + add) >> shift);
1892
    dst[19] = Clip3(outputMinimum, outputMaximum, (int)( - iT[25] * c[0] + iT[12] * c[1] + add) >> shift);
1893
1894
    // next input column, next group of 32 outputs
    src++;
1895
    dst += 32;
1896
  }
1897
1898
  if (iSkipLine)
1899
  {
1900
    // dst now points just past the last written group; zero the outputs of the skipped columns
    memset(dst, 0, (iSkipLine * 32) * sizeof(TCoeff));
1901
  }
1902
#else
1903
0
  _fastInverseMM< 32 >( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDCT8P32[TRANSFORM_INVERSE][0] );
1904
0
#endif
1905
0
}
1906
1907
#if ENABLE_SIMD_TRAFO
1908
1909
#define DONT_UNDEF_SIZE_AWARE_PER_EL_OP 1
1910
1911
}   // namespace vvenc
1912
1913
#include "Unit.h"
1914
1915
namespace vvenc {
1916
  
1917
void cpyCoeffCore( const Pel* src, ptrdiff_t stride, TCoeff* dst, unsigned width, unsigned height )
1918
0
{
1919
0
#define CPYCOEFF_OP( ADDR ) dst[ADDR] = src[ADDR];
1920
0
#define CPYCOEFF_INC src += stride; dst += width;
1921
1922
0
  SIZE_AWARE_PER_EL_OP( CPYCOEFF_OP, CPYCOEFF_INC );
1923
1924
0
#undef CPYCOEFF_INC
1925
0
#undef CPYCOEFF_OP
1926
0
}
1927
1928
1929
void cpyResiCore( const TCoeff* src, Pel* dst, ptrdiff_t stride, unsigned width, unsigned height )
1930
0
{
1931
0
#define CPYRESI_OP( ADDR ) dst[ADDR] = Pel( src[ADDR] );
1932
0
#define CPYRESI_INC dst += stride; src += width;
1933
1934
0
  SIZE_AWARE_PER_EL_OP( CPYRESI_OP, CPYRESI_INC );
1935
1936
0
#undef CPYRESI_INC
1937
0
#undef CPYRESI_OP
1938
0
}
1939
1940
1941
// In-place scalar round / shift / clip of a TCoeff block:
//   dst[x] = Clip3( outputMin, outputMax, ( dst[x] + round ) >> shift )
// Rows are 'stride' elements apart. The iteration is generated by SIZE_AWARE_PER_EL_OP,
// defined outside this file section; it presumably reads 'width' and 'height' and applies
// the first macro per element and the second once per row - confirm against its definition.
void clipCore( TCoeff *dst, unsigned width, unsigned height, unsigned stride, const TCoeff outputMin, const TCoeff outputMax, const TCoeff round, const TCoeff shift )
{
  // element step: add the rounding offset, shift down, then clamp to [outputMin, outputMax]
#define ROUND_CLIP_AT( IDX ) dst[IDX] = Clip3( outputMin, outputMax, ( dst[IDX] + round ) >> shift )
  // row step: advance to the next row of the block
#define ROUND_CLIP_NEXT_ROW  dst += stride

  SIZE_AWARE_PER_EL_OP( ROUND_CLIP_AT, ROUND_CLIP_NEXT_ROW );

  // keep the helper macros local to this function
#undef ROUND_CLIP_NEXT_ROW
#undef ROUND_CLIP_AT
}
1951
1952
1953
template<unsigned trSize>
1954
void fastInvCore_( const TMatrixCoeff* it, const TCoeff* src, TCoeff* dst, unsigned lines, unsigned reducedLines, unsigned rows )
1955
0
{
1956
0
  for( int k = 0; k < rows; k++ )
1957
0
  {
1958
0
    const TCoeff* srcPtr = &src[k * lines];
1959
0
    for( int i = 0; i < reducedLines; i++ )
1960
0
    {
1961
0
            TCoeff*       dstPtr = &dst[i * trSize];
1962
0
      const TMatrixCoeff*  itPtr =  &it[k * trSize];
1963
0
      for( int j = 0; j < trSize; j++ )
1964
0
      {
1965
0
        *dstPtr++ += *srcPtr * *itPtr++;
1966
0
      }
1967
0
      srcPtr++;
1968
0
    }
1969
0
  }
1970
0
}
Unexecuted instantiation: void vvenc::fastInvCore_<4u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvenc::fastInvCore_<8u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvenc::fastInvCore_<16u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvenc::fastInvCore_<32u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Unexecuted instantiation: void vvenc::fastInvCore_<64u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
1971
1972
1973
template<unsigned trSize>
1974
void fastFwdCore( const TMatrixCoeff* tc, const TCoeff* src, TCoeff* dst, unsigned line, unsigned reducedLine, unsigned cutoff, int shift )
1975
0
{
1976
0
  const int rnd_factor = 1 << ( shift - 1 );
1977
1978
0
  for( int i = 0; i < reducedLine; i++ )
1979
0
  {
1980
0
          TCoeff*       dstPtr = dst;
1981
0
    const TMatrixCoeff* iT     = tc;
1982
1983
0
    for( int j = 0; j < cutoff; j++ )
1984
0
    {
1985
0
      int sum = 0;
1986
1987
0
      for( int k = 0; k < trSize; k++ )
1988
0
      {
1989
        // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]
1990
0
        sum += src[k] * iT[k];
1991
0
      }
1992
1993
0
      dstPtr[i] = ( sum + rnd_factor ) >> shift;
1994
0
      dstPtr   += line;
1995
0
      iT       += trSize;
1996
0
    }
1997
1998
0
    src += trSize;
1999
0
  }
2000
0
}
Unexecuted instantiation: void vvenc::fastFwdCore<4u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int)
Unexecuted instantiation: void vvenc::fastFwdCore<8u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int)
Unexecuted instantiation: void vvenc::fastFwdCore<16u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int)
Unexecuted instantiation: void vvenc::fastFwdCore<32u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int)
Unexecuted instantiation: void vvenc::fastFwdCore<64u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int)
2001
2002
2003
// Points every TCoeffOps entry at the scalar routines defined earlier in this file.
// The "4" and "8" entries receive the same routine, and the 1D and 2D forward tables
// receive the same instantiations; table slot n holds the instantiation for
// transform size 4 << n (4, 8, 16, 32, 64).
TCoeffOps::TCoeffOps()
{
  // block copies between Pel and TCoeff buffers
  cpyCoeff4       = cpyCoeffCore;
  cpyCoeff8       = cpyCoeffCore;
  cpyResi4        = cpyResiCore;
  cpyResi8        = cpyResiCore;

  // in-place round / shift / clip
  roundClip4      = clipCore;
  roundClip8      = clipCore;

  // inverse transform kernels
  fastInvCore[0]  = fastInvCore_< 4>;
  fastInvCore[1]  = fastInvCore_< 8>;
  fastInvCore[2]  = fastInvCore_<16>;
  fastInvCore[3]  = fastInvCore_<32>;
  fastInvCore[4]  = fastInvCore_<64>;

  // forward transform kernels, 1D and 2D slot filled pairwise per size
  fastFwdCore_1D[0] = fastFwdCore< 4>;
  fastFwdCore_2D[0] = fastFwdCore< 4>;

  fastFwdCore_1D[1] = fastFwdCore< 8>;
  fastFwdCore_2D[1] = fastFwdCore< 8>;

  fastFwdCore_1D[2] = fastFwdCore<16>;
  fastFwdCore_2D[2] = fastFwdCore<16>;

  fastFwdCore_1D[3] = fastFwdCore<32>;
  fastFwdCore_2D[3] = fastFwdCore<32>;

  fastFwdCore_1D[4] = fastFwdCore<64>;
  fastFwdCore_2D[4] = fastFwdCore<64>;
}
2027
2028
// Global TCoeffOps instance; constructing it runs TCoeffOps::TCoeffOps(), which points
// every entry at the scalar routines defined in this file.
TCoeffOps g_tCoeffOps;
2029
2030
#endif
2031
2032
2033
} // namespace vvenc
2034
2035
//! \}
2036