Coverage Report

Created: 2026-05-30 06:10

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/vvenc/source/Lib/CommonLib/TrQuant_EMT.cpp
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
43
44
/** \file     TrQuant_EMT.cpp
45
    \brief    transform and quantization class
46
*/
47
48
#include "TrQuant_EMT.h"
49
#include "Rom.h"
50
51
#include <stdlib.h>
52
#include <math.h>
53
#include <memory.h>
54
55
//! \ingroup CommonLib
56
//! \{
57
58
namespace vvenc {
59
60
// ********************************** DCT-II **********************************
61
  
62
#if ENABLE_SIMD_TRAFO
63
/** Forward declaration of the primary template (only present when ENABLE_SIMD_TRAFO is set),
 *  so that the explicit specializations for sizes 2 and 4 can be declared before the
 *  primary definition that follows them.
 */
template<int uiTrSize>
64
inline void _fastInverseMM( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum, const TMatrixCoeff* iT );
65
66
/** Inverse transform by plain matrix multiplication, specialization for transform size 2.
 *  Accumulates dst[i*2 + j] += src[k*line + i] * iT[k*2 + j], then rounds, right-shifts and
 *  clips every accumulated value to [outputMinimum, outputMaximum].
 *  \param src            input coefficients; row k starts at src[k * line]
 *  \param dst            output buffer, 2 values per processed line
 *  \param shift          right shift applied after the accumulation
 *  \param line           number of lines; only line - iSkipLine of them are computed
 *  \param iSkipLine      number of trailing lines whose output is only zero-filled
 *  \param iSkipLine2     number of trailing input rows k that are left out of the accumulation
 *  \param outputMinimum  minimum for clipping
 *  \param outputMaximum  maximum for clipping
 *  \param iT             transform matrix, indexed as iT[k*2 + j]
 */
template<>
67
inline void _fastInverseMM<2>( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum, const TMatrixCoeff* iT )
68
0
{
69
0
  // NOTE(review): shift - 1 is negative for shift == 0; presumably callers always pass shift >= 1 - confirm.
  const int rnd_factor  = 1 << (shift - 1);
70
0
  const int reducedLine = line - iSkipLine;
71
0
  const int cutoff      = 2 - iSkipLine2;
72
0
73
0
  // dst is used as the accumulator below, so it has to start at zero.
  memset( dst, 0, reducedLine * 2 * sizeof( TCoeff ) );
74
0
75
0
  // Accumulation pass: one input row k at a time, spread over all computed lines.
  for( int k = 0; k < cutoff; k++ )
76
0
  {
77
0
    const TCoeff* srcPtr = &src[k * line];
78
0
    for( int i = 0; i < reducedLine; i++ )
79
0
    {
80
0
            TCoeff*       dstPtr = &dst[i << 1];
81
0
      const TMatrixCoeff*  itPtr =  &iT[k << 1];
82
0
      const TCoeff        srcVal = *srcPtr;
83
0
      for( int j = 0; j < 2; j++ )
84
0
      {
85
0
        *dstPtr++ += srcVal * *itPtr++;
86
0
      }
87
0
      srcPtr++;
88
0
    }
89
0
  }
90
0
91
0
  // Rounding pass: add the rounding offset, shift right and clip, in place.
  for( int i = 0; i < reducedLine; i++ )
92
0
  {
93
0
    TCoeff* dstPtr = &dst[i << 1];
94
0
    for( int j = 0; j < 2; j++, dstPtr++ )
95
0
    {
96
0
      *dstPtr = Clip3( outputMinimum, outputMaximum, ( int ) ( *dstPtr + rnd_factor ) >> shift );
97
0
    }
98
0
  }
99
0
100
0
  if( iSkipLine )
101
0
  {
102
0
    // Zero the output of the skipped lines, which follow the computed ones in dst.
    memset( dst + ( reducedLine << 1 ), 0, ( iSkipLine << 1 ) * sizeof( TCoeff ) );
103
0
  }
104
0
}
105
106
/** Inverse transform by matrix multiplication, specialization for transform size 4.
 *  The accumulation is delegated to g_tCoeffOps.fastInvCore[0] and the rounding/clipping to
 *  g_tCoeffOps.roundClip4 (presumably equivalent to the scalar loops in the #else branch -
 *  confirm against their implementations).
 *  \param src            input coefficients; row k starts at src[k * line] in the scalar path
 *  \param dst            output buffer, 4 values per processed line
 *  \param shift          right shift applied after the accumulation
 *  \param line           number of lines; only line - iSkipLine of them are computed
 *  \param iSkipLine      number of trailing lines whose output is only zero-filled
 *  \param iSkipLine2     number of trailing input rows k that are left out of the accumulation
 *  \param outputMinimum  minimum for clipping
 *  \param outputMaximum  maximum for clipping
 *  \param iT             transform matrix, indexed as iT[k*4 + j] in the scalar path
 */
template<>
107
inline void _fastInverseMM<4>( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum, const TMatrixCoeff* iT )
108
1.86k
{
109
1.86k
  // NOTE(review): shift - 1 is negative for shift == 0; presumably callers always pass shift >= 1 - confirm.
  const int rnd_factor  = 1 << ( shift - 1 );
110
1.86k
  const int reducedLine = line - iSkipLine;
111
1.86k
  const int cutoff      = 4 - iSkipLine2;
112
113
1.86k
  // dst is zeroed before the core runs (the scalar path uses it as the accumulator).
  memset( dst, 0, reducedLine * 4 * sizeof( TCoeff ) );
114
115
1.86k
  // This specialization is itself enclosed in an outer #if ENABLE_SIMD_TRAFO, so the
  // #else branch below can never be compiled from here.
#if ENABLE_SIMD_TRAFO
116
1.86k
  g_tCoeffOps.fastInvCore[0]( iT, src, dst, line, reducedLine, cutoff );
117
1.86k
  g_tCoeffOps.roundClip4( dst, 4, reducedLine, 4, outputMinimum, outputMaximum, rnd_factor, shift );
118
#else
119
  for( int k = 0; k < cutoff; k++ )
120
  {
121
    const TCoeff* srcPtr = &src[k * line];
122
    for( int i = 0; i < reducedLine; i++ )
123
    {
124
            TCoeff*       dstPtr = &dst[i << 2];
125
      const TMatrixCoeff*  itPtr =  &iT[k << 2];
126
      for( int j = 0; j < 4; j++ )
127
      {
128
        *dstPtr++ += *srcPtr * *itPtr++;
129
      }
130
      srcPtr++;
131
    }
132
  }
133
134
  for( int i = 0; i < reducedLine; i++ )
135
  {
136
    TCoeff* dstPtr = &dst[i << 2];
137
    for( int j = 0; j < 4; j++, dstPtr++ )
138
    {
139
      *dstPtr = Clip3( outputMinimum, outputMaximum, ( int ) ( *dstPtr + rnd_factor ) >> shift );
140
    }
141
  }
142
#endif
143
144
1.86k
  if( iSkipLine )
145
0
  {
146
0
    // Zero the output of the skipped lines, which follow the computed ones in dst.
    memset( dst + ( reducedLine << 2 ), 0, ( iSkipLine << 2 ) * sizeof( TCoeff ) );
147
0
  }
148
1.86k
}
149
150
#endif
151
152
/** Inverse transform by matrix multiplication, primary template for transform size uiTrSize.
 *  With ENABLE_SIMD_TRAFO the work is delegated to g_tCoeffOps.fastInvCore / roundClip8;
 *  otherwise dst[i*uiTrSize + j] += src[k*line + i] * iT[k*uiTrSize + j] is accumulated and
 *  each value is then rounded, right-shifted and clipped to [outputMinimum, outputMaximum].
 *  \param src            input coefficients; row k starts at src[k * line] in the scalar path
 *  \param dst            output buffer, uiTrSize values per processed line
 *  \param shift          right shift applied after the accumulation
 *  \param line           number of lines; only line - iSkipLine of them are computed
 *  \param iSkipLine      number of trailing lines whose output is only zero-filled
 *  \param iSkipLine2     number of trailing input rows k that are left out of the accumulation
 *  \param outputMinimum  minimum for clipping
 *  \param outputMaximum  maximum for clipping
 *  \param iT             transform matrix, indexed as iT[k*uiTrSize + j] in the scalar path
 */
template< int uiTrSize >
153
inline void _fastInverseMM( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum, const TMatrixCoeff* iT )
154
743k
{
155
743k
  // NOTE(review): shift - 1 is negative for shift == 0; presumably callers always pass shift >= 1 - confirm.
  const int  rnd_factor  = 1 << (shift - 1);
156
743k
  const int  reducedLine = line - iSkipLine;
157
743k
  const int  cutoff      = uiTrSize - iSkipLine2;
158
159
743k
  // dst is zeroed before the core runs (the scalar path uses it as the accumulator).
  memset( dst, 0, reducedLine * uiTrSize * sizeof( TCoeff ) );
160
161
743k
#if ENABLE_SIMD_TRAFO
162
743k
  // Table index is Log2( uiTrSize ) - 2, i.e. entry 0 belongs to size 4.
  g_tCoeffOps.fastInvCore[Log2( uiTrSize ) - 2]( iT, src, dst, line, reducedLine, cutoff );
163
743k
  g_tCoeffOps.roundClip8( dst, uiTrSize, reducedLine, uiTrSize, outputMinimum, outputMaximum, rnd_factor, shift );
164
#else
165
  // Accumulation pass: one input row k at a time, spread over all computed lines.
  for( int k = 0; k < cutoff; k++ )
166
  {
167
    const TCoeff* srcPtr = &src[k * line];
168
    for( int i = 0; i < reducedLine; i++ )
169
    {
170
            TCoeff*       dstPtr = &dst[i * uiTrSize];
171
      const TMatrixCoeff*  itPtr =  &iT[k * uiTrSize];
172
      for( int j = 0; j < uiTrSize; j++ )
173
      {
174
        *dstPtr++ += *srcPtr * *itPtr++;
175
      }
176
      srcPtr++;
177
    }
178
  }
179
180
  // Rounding pass: add the rounding offset, shift right and clip, in place.
  for( int i = 0; i < reducedLine; i++ )
181
  {
182
    TCoeff* dstPtr = &dst[i * uiTrSize];
183
    for( int j = 0; j < uiTrSize; j++, dstPtr++ )
184
    {
185
      *dstPtr = Clip3( outputMinimum, outputMaximum, ( int ) ( *dstPtr + rnd_factor ) >> shift );
186
    }
187
  }
188
#endif
189
190
743k
  if( iSkipLine )
191
139k
  {
192
139k
    // Zero the output of the skipped lines, which follow the computed ones in dst.
    memset( dst + ( reducedLine*uiTrSize ), 0, ( iSkipLine*uiTrSize ) * sizeof( TCoeff ) );
193
139k
  }
194
743k
}
void vvenc::_fastInverseMM<16>(int const*, int*, int, int, int, int, int, int, short const*)
Line
Count
Source
154
488k
{
155
488k
  const int  rnd_factor  = 1 << (shift - 1);
156
488k
  const int  reducedLine = line - iSkipLine;
157
488k
  const int  cutoff      = uiTrSize - iSkipLine2;
158
159
488k
  memset( dst, 0, reducedLine * uiTrSize * sizeof( TCoeff ) );
160
161
488k
#if ENABLE_SIMD_TRAFO
162
488k
  g_tCoeffOps.fastInvCore[Log2( uiTrSize ) - 2]( iT, src, dst, line, reducedLine, cutoff );
163
488k
  g_tCoeffOps.roundClip8( dst, uiTrSize, reducedLine, uiTrSize, outputMinimum, outputMaximum, rnd_factor, shift );
164
#else
165
  for( int k = 0; k < cutoff; k++ )
166
  {
167
    const TCoeff* srcPtr = &src[k * line];
168
    for( int i = 0; i < reducedLine; i++ )
169
    {
170
            TCoeff*       dstPtr = &dst[i * uiTrSize];
171
      const TMatrixCoeff*  itPtr =  &iT[k * uiTrSize];
172
      for( int j = 0; j < uiTrSize; j++ )
173
      {
174
        *dstPtr++ += *srcPtr * *itPtr++;
175
      }
176
      srcPtr++;
177
    }
178
  }
179
180
  for( int i = 0; i < reducedLine; i++ )
181
  {
182
    TCoeff* dstPtr = &dst[i * uiTrSize];
183
    for( int j = 0; j < uiTrSize; j++, dstPtr++ )
184
    {
185
      *dstPtr = Clip3( outputMinimum, outputMaximum, ( int ) ( *dstPtr + rnd_factor ) >> shift );
186
    }
187
  }
188
#endif
189
190
488k
  if( iSkipLine )
191
86.4k
  {
192
86.4k
    memset( dst + ( reducedLine*uiTrSize ), 0, ( iSkipLine*uiTrSize ) * sizeof( TCoeff ) );
193
86.4k
  }
194
488k
}
void vvenc::_fastInverseMM<32>(int const*, int*, int, int, int, int, int, int, short const*)
Line
Count
Source
154
231k
{
155
231k
  const int  rnd_factor  = 1 << (shift - 1);
156
231k
  const int  reducedLine = line - iSkipLine;
157
231k
  const int  cutoff      = uiTrSize - iSkipLine2;
158
159
231k
  memset( dst, 0, reducedLine * uiTrSize * sizeof( TCoeff ) );
160
161
231k
#if ENABLE_SIMD_TRAFO
162
231k
  g_tCoeffOps.fastInvCore[Log2( uiTrSize ) - 2]( iT, src, dst, line, reducedLine, cutoff );
163
231k
  g_tCoeffOps.roundClip8( dst, uiTrSize, reducedLine, uiTrSize, outputMinimum, outputMaximum, rnd_factor, shift );
164
#else
165
  for( int k = 0; k < cutoff; k++ )
166
  {
167
    const TCoeff* srcPtr = &src[k * line];
168
    for( int i = 0; i < reducedLine; i++ )
169
    {
170
            TCoeff*       dstPtr = &dst[i * uiTrSize];
171
      const TMatrixCoeff*  itPtr =  &iT[k * uiTrSize];
172
      for( int j = 0; j < uiTrSize; j++ )
173
      {
174
        *dstPtr++ += *srcPtr * *itPtr++;
175
      }
176
      srcPtr++;
177
    }
178
  }
179
180
  for( int i = 0; i < reducedLine; i++ )
181
  {
182
    TCoeff* dstPtr = &dst[i * uiTrSize];
183
    for( int j = 0; j < uiTrSize; j++, dstPtr++ )
184
    {
185
      *dstPtr = Clip3( outputMinimum, outputMaximum, ( int ) ( *dstPtr + rnd_factor ) >> shift );
186
    }
187
  }
188
#endif
189
190
231k
  if( iSkipLine )
191
42.9k
  {
192
42.9k
    memset( dst + ( reducedLine*uiTrSize ), 0, ( iSkipLine*uiTrSize ) * sizeof( TCoeff ) );
193
42.9k
  }
194
231k
}
void vvenc::_fastInverseMM<64>(int const*, int*, int, int, int, int, int, int, short const*)
Line
Count
Source
154
22.1k
{
155
22.1k
  const int  rnd_factor  = 1 << (shift - 1);
156
22.1k
  const int  reducedLine = line - iSkipLine;
157
22.1k
  const int  cutoff      = uiTrSize - iSkipLine2;
158
159
22.1k
  memset( dst, 0, reducedLine * uiTrSize * sizeof( TCoeff ) );
160
161
22.1k
#if ENABLE_SIMD_TRAFO
162
22.1k
  g_tCoeffOps.fastInvCore[Log2( uiTrSize ) - 2]( iT, src, dst, line, reducedLine, cutoff );
163
22.1k
  g_tCoeffOps.roundClip8( dst, uiTrSize, reducedLine, uiTrSize, outputMinimum, outputMaximum, rnd_factor, shift );
164
#else
165
  for( int k = 0; k < cutoff; k++ )
166
  {
167
    const TCoeff* srcPtr = &src[k * line];
168
    for( int i = 0; i < reducedLine; i++ )
169
    {
170
            TCoeff*       dstPtr = &dst[i * uiTrSize];
171
      const TMatrixCoeff*  itPtr =  &iT[k * uiTrSize];
172
      for( int j = 0; j < uiTrSize; j++ )
173
      {
174
        *dstPtr++ += *srcPtr * *itPtr++;
175
      }
176
      srcPtr++;
177
    }
178
  }
179
180
  for( int i = 0; i < reducedLine; i++ )
181
  {
182
    TCoeff* dstPtr = &dst[i * uiTrSize];
183
    for( int j = 0; j < uiTrSize; j++, dstPtr++ )
184
    {
185
      *dstPtr = Clip3( outputMinimum, outputMaximum, ( int ) ( *dstPtr + rnd_factor ) >> shift );
186
    }
187
  }
188
#endif
189
190
22.1k
  if( iSkipLine )
191
10.3k
  {
192
10.3k
    memset( dst + ( reducedLine*uiTrSize ), 0, ( iSkipLine*uiTrSize ) * sizeof( TCoeff ) );
193
10.3k
  }
194
22.1k
}
void vvenc::_fastInverseMM<8>(int const*, int*, int, int, int, int, int, int, short const*)
Line
Count
Source
154
1.19k
{
155
1.19k
  const int  rnd_factor  = 1 << (shift - 1);
156
1.19k
  const int  reducedLine = line - iSkipLine;
157
1.19k
  const int  cutoff      = uiTrSize - iSkipLine2;
158
159
1.19k
  memset( dst, 0, reducedLine * uiTrSize * sizeof( TCoeff ) );
160
161
1.19k
#if ENABLE_SIMD_TRAFO
162
1.19k
  g_tCoeffOps.fastInvCore[Log2( uiTrSize ) - 2]( iT, src, dst, line, reducedLine, cutoff );
163
1.19k
  g_tCoeffOps.roundClip8( dst, uiTrSize, reducedLine, uiTrSize, outputMinimum, outputMaximum, rnd_factor, shift );
164
#else
165
  for( int k = 0; k < cutoff; k++ )
166
  {
167
    const TCoeff* srcPtr = &src[k * line];
168
    for( int i = 0; i < reducedLine; i++ )
169
    {
170
            TCoeff*       dstPtr = &dst[i * uiTrSize];
171
      const TMatrixCoeff*  itPtr =  &iT[k * uiTrSize];
172
      for( int j = 0; j < uiTrSize; j++ )
173
      {
174
        *dstPtr++ += *srcPtr * *itPtr++;
175
      }
176
      srcPtr++;
177
    }
178
  }
179
180
  for( int i = 0; i < reducedLine; i++ )
181
  {
182
    TCoeff* dstPtr = &dst[i * uiTrSize];
183
    for( int j = 0; j < uiTrSize; j++, dstPtr++ )
184
    {
185
      *dstPtr = Clip3( outputMinimum, outputMaximum, ( int ) ( *dstPtr + rnd_factor ) >> shift );
186
    }
187
  }
188
#endif
189
190
1.19k
  if( iSkipLine )
191
0
  {
192
0
    memset( dst + ( reducedLine*uiTrSize ), 0, ( iSkipLine*uiTrSize ) * sizeof( TCoeff ) );
193
0
  }
194
1.19k
}
195
196
//Fast DCT-II transforms
197
/** 2-point forward DCT-II (1D), applied to line - iSkipLine input lines of 2 samples each.
 *  The output is written transposed: coefficient c of input line j goes to dst[c * line + j].
 *  \param src        input data, 2 consecutive samples per line
 *  \param dst        output data (transform coefficients)
 *  \param shift      right shift after the 1D transform; the rounding offset is 0 when shift <= 0
 *  \param line       number of input lines; also the stride between the two output rows in dst
 *  \param iSkipLine  number of trailing input lines that are not transformed; their output
 *                    positions are zero-filled
 *  \param iSkipLine2 not used by this function
 */
void fastForwardDCT2_B2(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
198
27.8k
{
199
27.8k
  int j;
200
27.8k
  int E, O;
201
27.8k
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
202
203
27.8k
  const TMatrixCoeff *iT = g_trCoreDCT2P2[TRANSFORM_FORWARD][0];
204
205
27.8k
  // Remember the start of dst; the loop below advances dst itself.
  TCoeff *pCoef = dst;
206
27.8k
  const int  reducedLine = line - iSkipLine;
207
371k
  for (j = 0; j<reducedLine; j++)
208
343k
  {
209
    /* E and O */
210
343k
    E = src[0] + src[1];
211
343k
    O = src[0] - src[1];
212
213
343k
    dst[0] = (iT[0] * E + add) >> shift;
214
343k
    dst[line] = (iT[2] * O + add) >> shift;
215
216
217
343k
    src += 2;
218
343k
    dst++;
219
343k
  }
220
27.8k
  if (iSkipLine)
221
0
  {
222
0
    // Zero the last iSkipLine entries of each of the 2 output rows.
    dst = pCoef + reducedLine;
223
0
    for (j = 0; j<2; j++)
224
0
    {
225
0
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
226
0
      dst += line;
227
0
    }
228
0
  }
229
27.8k
}
230
231
/** 2-point inverse DCT-II (1D), applied to line - iSkipLine lines.
 *  For line j the two coefficients are read from src[j] and src[line + j]; the two results
 *  are written consecutively to dst and clipped to [outputMinimum, outputMaximum].
 *  \param src            input data (transform coefficients)
 *  \param dst            output data, 2 values per processed line
 *  \param shift          right shift after the 1D transform
 *  \param line           number of lines; also the stride between the two coefficient rows in src
 *  \param iSkipLine      number of trailing lines that are not transformed; their output is zero-filled
 *  \param iSkipLine2     not used by this function
 *  \param outputMinimum  minimum for clipping
 *  \param outputMaximum  maximum for clipping
 */
void fastInverseDCT2_B2(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
232
16.7k
{
233
16.7k
  int j;
234
16.7k
  int E, O;
235
16.7k
  // NOTE(review): unlike the forward transform there is no shift > 0 guard here;
  // presumably shift >= 1 always holds for the inverse - confirm.
  int add = 1 << (shift - 1);
236
237
16.7k
  const TMatrixCoeff *iT = g_trCoreDCT2P2[TRANSFORM_INVERSE][0];
238
239
16.7k
  const int  reducedLine = line - iSkipLine;
240
221k
  for (j = 0; j<reducedLine; j++)
241
204k
  {
242
    /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
243
204k
    E = iT[0] * (src[0] + src[line]);
244
204k
    O = iT[2] * (src[0] - src[line]);
245
246
    /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
247
204k
    dst[0] = Clip3(outputMinimum, outputMaximum, (E + add) >> shift);
248
204k
    dst[1] = Clip3(outputMinimum, outputMaximum, (O + add) >> shift);
249
250
204k
    src++;
251
204k
    dst += 2;
252
204k
  }
253
16.7k
  if (iSkipLine)
254
0
  {
255
0
    // dst now points just past the computed lines; zero the output of the skipped ones.
    memset(dst, 0, (iSkipLine << 1) * sizeof(TCoeff));
256
0
  }
257
16.7k
}
258
259
/** 4x4 forward transform implemented using partial butterfly structure (1D)
260
*  \param src   input data (residual)
261
*  \param dst   output data (transform coefficients)
262
*  \param shift specifies right shift after 1D transform
263
*  \param line  number of input lines (4 samples each); also the stride between the output coefficient rows in dst
*  \param iSkipLine  number of trailing input lines that are not transformed; their output positions are zero-filled
*  \param iSkipLine2 not used by this function
264
*/
265
void fastForwardDCT2_B4(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
266
751k
{
267
751k
  int j;
268
751k
  TCoeff E[2], O[2];
269
751k
  // Rounding offset; 0 when shift is not positive.
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
270
271
751k
  const TMatrixCoeff *iT = g_trCoreDCT2P4[TRANSFORM_FORWARD][0];
272
273
751k
  // Remember the start of dst; the loop below advances dst itself.
  TCoeff *pCoef = dst;
274
751k
  const int  reducedLine = line - iSkipLine;
275
8.18M
  for (j = 0; j<reducedLine; j++)
276
7.43M
  {
277
    /* E and O */
278
7.43M
    E[0] = src[0] + src[3];
279
7.43M
    O[0] = src[0] - src[3];
280
7.43M
    E[1] = src[1] + src[2];
281
7.43M
    O[1] = src[1] - src[2];
282
283
7.43M
    // Even output rows (0, 2) are built from E, odd output rows (1, 3) from O.
    dst[0] = (iT[0] * E[0] + iT[1] * E[1] + add) >> shift;
284
7.43M
    dst[2 * line] = (iT[8] * E[0] + iT[9] * E[1] + add) >> shift;
285
7.43M
    dst[line] = (iT[4] * O[0] + iT[5] * O[1] + add) >> shift;
286
7.43M
    dst[3 * line] = (iT[12] * O[0] + iT[13] * O[1] + add) >> shift;
287
288
7.43M
    src += 4;
289
7.43M
    dst++;
290
7.43M
  }
291
751k
  if (iSkipLine)
292
176k
  {
293
176k
    // Zero the last iSkipLine entries of each of the 4 output rows.
    dst = pCoef + reducedLine;
294
880k
    for (j = 0; j<4; j++)
295
704k
    {
296
704k
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
297
704k
      dst += line;
298
704k
    }
299
176k
  }
300
751k
}
301
302
/** 4x4 inverse transform implemented using partial butterfly structure (1D)
303
*  \param src   input data (transform coefficients)
304
*  \param dst   output data (residual)
305
*  \param shift specifies right shift after 1D transform
306
*  \param line  number of lines; also the stride between the coefficient rows in src
*  \param iSkipLine  number of trailing lines that are not transformed; their output is zero-filled
*  \param iSkipLine2 not used by the active (#else) implementation
307
*  \param outputMinimum  minimum for clipping
308
*  \param outputMaximum  maximum for clipping
309
*/
310
void fastInverseDCT2_B4( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum )
311
349k
{
312
// Disabled alternative: plain matrix multiplication instead of the butterfly below.
#if 0
313
  const TMatrixCoeff *iT = g_trCoreDCT2P4[0];
314
315
  _fastInverseMM<4>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, iT );
316
#else
317
349k
  int j;
318
349k
  int E[2], O[2];
319
349k
  // NOTE(review): no shift > 0 guard; presumably shift >= 1 always holds for the inverse - confirm.
  int add = 1 << ( shift - 1 );
320
321
349k
  const TMatrixCoeff *iT = g_trCoreDCT2P4[TRANSFORM_INVERSE][0];
322
323
349k
#if ENABLE_SIMD_TRAFO
324
349k
  // Start of the output, needed for the roundClip4 pass after the loop has advanced dst.
  TCoeff* orgDst = dst;
325
326
349k
#endif
327
349k
  const int  reducedLine = line - iSkipLine;
328
3.23M
  for( j = 0; j < reducedLine; j++ )
329
2.89M
  {
330
    /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
331
2.89M
    O[0] = iT[1 * 4 + 0] * src[line] + iT[3 * 4 + 0] * src[3 * line];
332
2.89M
    O[1] = iT[1 * 4 + 1] * src[line] + iT[3 * 4 + 1] * src[3 * line];
333
2.89M
    E[0] = iT[0 * 4 + 0] * src[   0] + iT[2 * 4 + 0] * src[2 * line];
334
2.89M
    E[1] = iT[0 * 4 + 1] * src[   0] + iT[2 * 4 + 1] * src[2 * line];
335
336
    /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
337
2.89M
#if ENABLE_SIMD_TRAFO
338
2.89M
    // SIMD build: store the raw sums here; rounding, shift and clipping happen in roundClip4 below.
    dst[0] = E[0] + O[0];
339
2.89M
    dst[1] = E[1] + O[1];
340
2.89M
    dst[2] = E[1] - O[1];
341
2.89M
    dst[3] = E[0] - O[0];
342
#else
343
    dst[0] = Clip3( outputMinimum, outputMaximum, ( E[0] + O[0] + add ) >> shift );
344
    dst[1] = Clip3( outputMinimum, outputMaximum, ( E[1] + O[1] + add ) >> shift );
345
    dst[2] = Clip3( outputMinimum, outputMaximum, ( E[1] - O[1] + add ) >> shift );
346
    dst[3] = Clip3( outputMinimum, outputMaximum, ( E[0] - O[0] + add ) >> shift );
347
#endif
348
349
2.89M
    src++;
350
2.89M
    dst += 4;
351
2.89M
  }
352
353
349k
#if ENABLE_SIMD_TRAFO
354
349k
  // Presumably applies ( x + add ) >> shift and the clip to all computed values, matching the
  // scalar branch above - confirm against the roundClip4 implementation.
  g_tCoeffOps.roundClip4( orgDst, 4, reducedLine, 4, outputMinimum, outputMaximum, add, shift );
355
356
349k
#endif
357
349k
  if( iSkipLine )
358
75.6k
  {
359
75.6k
    // dst now points just past the computed lines; zero the output of the skipped ones.
    memset( dst, 0, ( iSkipLine << 2 ) * sizeof( TCoeff ) );
360
75.6k
  }
361
349k
#endif
362
349k
}
363
364
365
366
/** Forward transform by matrix multiplication for transform size uiTrSize.
 *  Scalar path: dst[j*line + i] = ( sum_k src[i*uiTrSize + k] * tc[j*uiTrSize + k] + rnd ) >> shift
 *  for i < line - iSkipLine and j < uiTrSize - iSkipLine2. With ENABLE_SIMD_TRAFO this is
 *  delegated to g_tCoeffOps.fastFwdCore_1D (line == 1) or fastFwdCore_2D (otherwise).
 *  All output positions that are not computed are zero-filled afterwards.
 *  \param src        input data, uiTrSize consecutive samples per line
 *  \param dst        output data (transform coefficients), row stride line
 *  \param shift      right shift after the 1D transform
 *  \param line       number of input lines; also the stride between output rows in dst
 *  \param iSkipLine  number of trailing input lines that are not transformed
 *  \param iSkipLine2 number of trailing output rows that are not computed
 *  \param tc         transform matrix, indexed as tc[j*uiTrSize + k] in the scalar path
 */
template< int uiTrSize >
367
inline void _fastForwardMM( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TMatrixCoeff* tc )
368
3.24M
{
369
// The rounding offset is only needed by the scalar path; the SIMD cores receive shift directly.
#if !ENABLE_SIMD_TRAFO
370
  const int  rnd_factor  = 1 << (shift - 1);
371
#endif
372
3.24M
  const int  reducedLine = line - iSkipLine;
373
3.24M
  const int  cutoff      = uiTrSize - iSkipLine2;
374
3.24M
  TCoeff *pCoef;
375
376
3.24M
#if ENABLE_SIMD_TRAFO
377
3.24M
  if( line == 1 )
378
288
  {
379
288
    g_tCoeffOps.fastFwdCore_1D[Log2( uiTrSize ) - 2]( tc, src, dst, line, reducedLine, cutoff, shift );
380
288
  }
381
3.24M
  else
382
3.24M
  {
383
3.24M
    g_tCoeffOps.fastFwdCore_2D[Log2( uiTrSize ) - 2]( tc, src, dst, line, reducedLine, cutoff, shift );
384
3.24M
  }
385
#else
386
  for( int i = 0; i<reducedLine; i++ )
387
  {
388
    pCoef = dst;
389
    const TMatrixCoeff* iT = tc;
390
    for( int j = 0; j<cutoff; j++ )
391
    {
392
      int iSum = 0;
393
      for( int k = 0; k<uiTrSize; k++ )
394
      {
395
        // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]
396
        iSum += src[k] * iT[k];
397
      }
398
      pCoef[i] = (iSum + rnd_factor) >> shift;
399
      pCoef += line;
400
      iT += uiTrSize;
401
    }
402
    src += uiTrSize;
403
  }
404
#endif
405
406
3.24M
  if( iSkipLine )
407
551k
  {
408
551k
    // Zero the last iSkipLine entries of each of the cutoff computed output rows.
    pCoef = dst + reducedLine;
409
5.62M
    for( int j = 0; j<cutoff; j++ )
410
5.07M
    {
411
5.07M
      memset(pCoef, 0, sizeof(TCoeff) * iSkipLine);
412
5.07M
      pCoef += line;
413
5.07M
    }
414
551k
  }
415
416
3.24M
  if( iSkipLine2 )
417
1.45M
  {
418
1.45M
    // Zero the iSkipLine2 output rows that follow the computed ones, in full.
    pCoef = dst + line*cutoff;
419
1.45M
    memset(pCoef, 0, sizeof(TCoeff) * line * iSkipLine2);
420
1.45M
  }
421
3.24M
}
void vvenc::_fastForwardMM<8>(int const*, int*, int, int, int, int, short const*)
Line
Count
Source
368
1.15M
{
369
#if !ENABLE_SIMD_TRAFO
370
  const int  rnd_factor  = 1 << (shift - 1);
371
#endif
372
1.15M
  const int  reducedLine = line - iSkipLine;
373
1.15M
  const int  cutoff      = uiTrSize - iSkipLine2;
374
1.15M
  TCoeff *pCoef;
375
376
1.15M
#if ENABLE_SIMD_TRAFO
377
1.15M
  if( line == 1 )
378
0
  {
379
0
    g_tCoeffOps.fastFwdCore_1D[Log2( uiTrSize ) - 2]( tc, src, dst, line, reducedLine, cutoff, shift );
380
0
  }
381
1.15M
  else
382
1.15M
  {
383
1.15M
    g_tCoeffOps.fastFwdCore_2D[Log2( uiTrSize ) - 2]( tc, src, dst, line, reducedLine, cutoff, shift );
384
1.15M
  }
385
#else
386
  for( int i = 0; i<reducedLine; i++ )
387
  {
388
    pCoef = dst;
389
    const TMatrixCoeff* iT = tc;
390
    for( int j = 0; j<cutoff; j++ )
391
    {
392
      int iSum = 0;
393
      for( int k = 0; k<uiTrSize; k++ )
394
      {
395
        // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]
396
        iSum += src[k] * iT[k];
397
      }
398
      pCoef[i] = (iSum + rnd_factor) >> shift;
399
      pCoef += line;
400
      iT += uiTrSize;
401
    }
402
    src += uiTrSize;
403
  }
404
#endif
405
406
1.15M
  if( iSkipLine )
407
148k
  {
408
148k
    pCoef = dst + reducedLine;
409
1.34M
    for( int j = 0; j<cutoff; j++ )
410
1.19M
    {
411
1.19M
      memset(pCoef, 0, sizeof(TCoeff) * iSkipLine);
412
1.19M
      pCoef += line;
413
1.19M
    }
414
148k
  }
415
416
1.15M
  if( iSkipLine2 )
417
134k
  {
418
134k
    pCoef = dst + line*cutoff;
419
134k
    memset(pCoef, 0, sizeof(TCoeff) * line * iSkipLine2);
420
134k
  }
421
1.15M
}
void vvenc::_fastForwardMM<16>(int const*, int*, int, int, int, int, short const*)
Line
Count
Source
368
1.05M
{
369
#if !ENABLE_SIMD_TRAFO
370
  const int  rnd_factor  = 1 << (shift - 1);
371
#endif
372
1.05M
  const int  reducedLine = line - iSkipLine;
373
1.05M
  const int  cutoff      = uiTrSize - iSkipLine2;
374
1.05M
  TCoeff *pCoef;
375
376
1.05M
#if ENABLE_SIMD_TRAFO
377
1.05M
  if( line == 1 )
378
236
  {
379
236
    g_tCoeffOps.fastFwdCore_1D[Log2( uiTrSize ) - 2]( tc, src, dst, line, reducedLine, cutoff, shift );
380
236
  }
381
1.05M
  else
382
1.05M
  {
383
1.05M
    g_tCoeffOps.fastFwdCore_2D[Log2( uiTrSize ) - 2]( tc, src, dst, line, reducedLine, cutoff, shift );
384
1.05M
  }
385
#else
386
  for( int i = 0; i<reducedLine; i++ )
387
  {
388
    pCoef = dst;
389
    const TMatrixCoeff* iT = tc;
390
    for( int j = 0; j<cutoff; j++ )
391
    {
392
      int iSum = 0;
393
      for( int k = 0; k<uiTrSize; k++ )
394
      {
395
        // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]
396
        iSum += src[k] * iT[k];
397
      }
398
      pCoef[i] = (iSum + rnd_factor) >> shift;
399
      pCoef += line;
400
      iT += uiTrSize;
401
    }
402
    src += uiTrSize;
403
  }
404
#endif
405
406
1.05M
  if( iSkipLine )
407
189k
  {
408
189k
    pCoef = dst + reducedLine;
409
1.71M
    for( int j = 0; j<cutoff; j++ )
410
1.52M
    {
411
1.52M
      memset(pCoef, 0, sizeof(TCoeff) * iSkipLine);
412
1.52M
      pCoef += line;
413
1.52M
    }
414
189k
  }
415
416
1.05M
  if( iSkipLine2 )
417
664k
  {
418
664k
    pCoef = dst + line*cutoff;
419
664k
    memset(pCoef, 0, sizeof(TCoeff) * line * iSkipLine2);
420
664k
  }
421
1.05M
}
void vvenc::_fastForwardMM<32>(int const*, int*, int, int, int, int, short const*)
Line
Count
Source
368
938k
{
369
#if !ENABLE_SIMD_TRAFO
370
  const int  rnd_factor  = 1 << (shift - 1);
371
#endif
372
938k
  const int  reducedLine = line - iSkipLine;
373
938k
  const int  cutoff      = uiTrSize - iSkipLine2;
374
938k
  TCoeff *pCoef;
375
376
938k
#if ENABLE_SIMD_TRAFO
377
938k
  if( line == 1 )
378
52
  {
379
52
    g_tCoeffOps.fastFwdCore_1D[Log2( uiTrSize ) - 2]( tc, src, dst, line, reducedLine, cutoff, shift );
380
52
  }
381
938k
  else
382
938k
  {
383
938k
    g_tCoeffOps.fastFwdCore_2D[Log2( uiTrSize ) - 2]( tc, src, dst, line, reducedLine, cutoff, shift );
384
938k
  }
385
#else
386
  for( int i = 0; i<reducedLine; i++ )
387
  {
388
    pCoef = dst;
389
    const TMatrixCoeff* iT = tc;
390
    for( int j = 0; j<cutoff; j++ )
391
    {
392
      int iSum = 0;
393
      for( int k = 0; k<uiTrSize; k++ )
394
      {
395
        // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]
396
        iSum += src[k] * iT[k];
397
      }
398
      pCoef[i] = (iSum + rnd_factor) >> shift;
399
      pCoef += line;
400
      iT += uiTrSize;
401
    }
402
    src += uiTrSize;
403
  }
404
#endif
405
406
938k
  if( iSkipLine )
407
165k
  {
408
165k
    pCoef = dst + reducedLine;
409
1.49M
    for( int j = 0; j<cutoff; j++ )
410
1.32M
    {
411
1.32M
      memset(pCoef, 0, sizeof(TCoeff) * iSkipLine);
412
1.32M
      pCoef += line;
413
1.32M
    }
414
165k
  }
415
416
938k
  if( iSkipLine2 )
417
564k
  {
418
564k
    pCoef = dst + line*cutoff;
419
564k
    memset(pCoef, 0, sizeof(TCoeff) * line * iSkipLine2);
420
564k
  }
421
938k
}
void vvenc::_fastForwardMM<64>(int const*, int*, int, int, int, int, short const*)
Line
Count
Source
368
95.5k
{
369
#if !ENABLE_SIMD_TRAFO
370
  const int  rnd_factor  = 1 << (shift - 1);
371
#endif
372
95.5k
  const int  reducedLine = line - iSkipLine;
373
95.5k
  const int  cutoff      = uiTrSize - iSkipLine2;
374
95.5k
  TCoeff *pCoef;
375
376
95.5k
#if ENABLE_SIMD_TRAFO
377
95.5k
  if( line == 1 )
378
0
  {
379
0
    g_tCoeffOps.fastFwdCore_1D[Log2( uiTrSize ) - 2]( tc, src, dst, line, reducedLine, cutoff, shift );
380
0
  }
381
95.5k
  else
382
95.5k
  {
383
95.5k
    g_tCoeffOps.fastFwdCore_2D[Log2( uiTrSize ) - 2]( tc, src, dst, line, reducedLine, cutoff, shift );
384
95.5k
  }
385
#else
386
  for( int i = 0; i<reducedLine; i++ )
387
  {
388
    pCoef = dst;
389
    const TMatrixCoeff* iT = tc;
390
    for( int j = 0; j<cutoff; j++ )
391
    {
392
      int iSum = 0;
393
      for( int k = 0; k<uiTrSize; k++ )
394
      {
395
        // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]
396
        iSum += src[k] * iT[k];
397
      }
398
      pCoef[i] = (iSum + rnd_factor) >> shift;
399
      pCoef += line;
400
      iT += uiTrSize;
401
    }
402
    src += uiTrSize;
403
  }
404
#endif
405
406
95.5k
  if( iSkipLine )
407
46.6k
  {
408
46.6k
    pCoef = dst + reducedLine;
409
1.07M
    for( int j = 0; j<cutoff; j++ )
410
1.03M
    {
411
1.03M
      memset(pCoef, 0, sizeof(TCoeff) * iSkipLine);
412
1.03M
      pCoef += line;
413
1.03M
    }
414
46.6k
  }
415
416
95.5k
  if( iSkipLine2 )
417
95.5k
  {
418
95.5k
    pCoef = dst + line*cutoff;
419
95.5k
    memset(pCoef, 0, sizeof(TCoeff) * line * iSkipLine2);
420
95.5k
  }
421
95.5k
}
422
423
424
425
/** 8x8 forward transform (1D): partial butterfly when JVET_M0497_MATRIX_MULT is 0,
*   otherwise a call to the matrix-multiplication helper _fastForwardMM< 8 >
426
*  \param src   input data (residual)
427
*  \param dst   output data (transform coefficients)
428
*  \param shift specifies right shift after 1D transform
429
*  \param line  number of input lines (8 samples each); also the stride between the output coefficient rows in dst
*  \param iSkipLine  number of trailing input lines that are not transformed; their output positions are zero-filled
*  \param iSkipLine2 only forwarded to _fastForwardMM< 8 >; the butterfly branch does not reference it
430
*/
431
void fastForwardDCT2_B8( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2 )
432
1.12M
{
433
#if !JVET_M0497_MATRIX_MULT
434
  int j, k;
435
  TCoeff E[4], O[4];
436
  TCoeff EE[2], EO[2];
437
  TCoeff add = ( shift > 0 ) ? ( 1 << ( shift - 1 ) ) : 0;
438
439
  const TMatrixCoeff *iT = g_trCoreDCT2P8[TRANSFORM_FORWARD][0];
440
441
  // Remember the start of dst; the loop below advances dst itself.
  TCoeff *pCoef = dst;
442
  const int  reducedLine = line - iSkipLine;
443
  for( j = 0; j < reducedLine; j++ )
444
  {
445
    /* E and O*/
446
    for( k = 0; k < 4; k++ )
447
    {
448
      E[k] = src[k] + src[7 - k];
449
      O[k] = src[k] - src[7 - k];
450
    }
451
    /* EE and EO */
452
    EE[0] = E[0] + E[3];
453
    EO[0] = E[0] - E[3];
454
    EE[1] = E[1] + E[2];
455
    EO[1] = E[1] - E[2];
456
457
    // Output rows 0 and 4 come from EE, rows 2 and 6 from EO, the odd rows from O.
    dst[0       ] = (iT[ 0] * EE[0] + iT[ 1] * EE[1] + add) >> shift;
458
    dst[4 * line] = (iT[32] * EE[0] + iT[33] * EE[1] + add) >> shift;
459
    dst[2 * line] = (iT[16] * EO[0] + iT[17] * EO[1] + add) >> shift;
460
    dst[6 * line] = (iT[48] * EO[0] + iT[49] * EO[1] + add) >> shift;
461
462
    dst[    line] = (iT[ 8] * O[0] + iT[ 9] * O[1] + iT[10] * O[2] + iT[11] * O[3] + add) >> shift;
463
    dst[3 * line] = (iT[24] * O[0] + iT[25] * O[1] + iT[26] * O[2] + iT[27] * O[3] + add) >> shift;
464
    dst[5 * line] = (iT[40] * O[0] + iT[41] * O[1] + iT[42] * O[2] + iT[43] * O[3] + add) >> shift;
465
    dst[7 * line] = (iT[56] * O[0] + iT[57] * O[1] + iT[58] * O[2] + iT[59] * O[3] + add) >> shift;
466
467
    src += 8;
468
    dst++;
469
  }
470
  if( iSkipLine )
471
  {
472
    // Zero the last iSkipLine entries of each of the 8 output rows.
    dst = pCoef + reducedLine;
473
    for( j = 0; j < 8; j++ )
474
    {
475
      memset( dst, 0, sizeof( TCoeff )*iSkipLine );
476
      dst += line;
477
    }
478
  }
479
#else
480
1.12M
  _fastForwardMM< 8 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT2P8[TRANSFORM_FORWARD][0] );
481
1.12M
#endif
482
1.12M
}
483
484
/** 8x8 inverse transform implemented using partial butterfly structure (1D)
485
*  \param src   input data (transform coefficients)
486
*  \param dst   output data (residual)
487
*  \param shift specifies right shift after 1D transform
488
*  \param line  number of lines; also the stride between the coefficient rows in src
*  \param iSkipLine  number of trailing lines that are not transformed; their output is zero-filled
*  \param iSkipLine2 not used by the active (#else) implementation
489
*  \param outputMinimum  minimum for clipping
490
*  \param outputMaximum  maximum for clipping
491
*/
492
void fastInverseDCT2_B8(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
493
537k
{
494
// Disabled alternative: plain matrix multiplication instead of the butterfly below.
#if 0
495
  const TMatrixCoeff *iT = g_trCoreDCT2P8[0];
496
497
  _fastInverseMM<8>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, iT );
498
#else
499
537k
  int j, k;
500
537k
  int E[4], O[4];
501
537k
  int EE[2], EO[2];
502
537k
  // NOTE(review): no shift > 0 guard; presumably shift >= 1 always holds for the inverse - confirm.
  int add = 1 << (shift - 1);
503
504
537k
  const TMatrixCoeff *iT = g_trCoreDCT2P8[TRANSFORM_INVERSE][0];
505
506
537k
#if ENABLE_SIMD_TRAFO
507
537k
  // Start of the output, needed for the roundClip8 pass after the loop has advanced dst.
  TCoeff *orgDst = dst;
508
509
537k
#endif
510
537k
  const int  reducedLine = line - iSkipLine;
511
6.33M
  for( j = 0; j < reducedLine; j++ )
512
5.79M
  {
513
    /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
514
28.9M
    // O is built from the odd coefficient rows (1, 3, 5, 7).
    for( k = 0; k < 4; k++ )
515
23.1M
    {
516
23.1M
      O[k] = iT[1 * 8 + k] * src[line] + iT[3 * 8 + k] * src[3 * line] + iT[5 * 8 + k] * src[5 * line] + iT[7 * 8 + k] * src[7 * line];
517
23.1M
    }
518
519
5.79M
    // EO is built from coefficient rows 2 and 6, EE from rows 0 and 4.
    EO[0] = iT[2 * 8 + 0] * src[2 * line] + iT[6 * 8 + 0] * src[6 * line];
520
5.79M
    EO[1] = iT[2 * 8 + 1] * src[2 * line] + iT[6 * 8 + 1] * src[6 * line];
521
5.79M
    EE[0] = iT[0 * 8 + 0] * src[0       ] + iT[4 * 8 + 0] * src[4 * line];
522
5.79M
    EE[1] = iT[0 * 8 + 1] * src[0       ] + iT[4 * 8 + 1] * src[4 * line];
523
524
    /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
525
5.79M
    E[0] = EE[0] + EO[0];
526
5.79M
    E[3] = EE[0] - EO[0];
527
5.79M
    E[1] = EE[1] + EO[1];
528
5.79M
    E[2] = EE[1] - EO[1];
529
530
28.9M
    for( k = 0; k < 4; k++ )
531
23.1M
    {
532
23.1M
#if ENABLE_SIMD_TRAFO
533
23.1M
      // SIMD build: store the raw sums here; rounding, shift and clipping happen in roundClip8 below.
      dst[k    ] = E[    k] + O[    k];
534
23.1M
      dst[k + 4] = E[3 - k] - O[3 - k];
535
#else
536
      dst[k    ] = Clip3( outputMinimum, outputMaximum, ( E[    k] + O[    k] + add ) >> shift );
537
      dst[k + 4] = Clip3( outputMinimum, outputMaximum, ( E[3 - k] - O[3 - k] + add ) >> shift );
538
#endif
539
23.1M
    }
540
5.79M
    src++;
541
5.79M
    dst += 8;
542
5.79M
  }
543
544
537k
#if ENABLE_SIMD_TRAFO
545
537k
  // Presumably applies ( x + add ) >> shift and the clip to all computed values, matching the
  // scalar branch above - confirm against the roundClip8 implementation.
  g_tCoeffOps.roundClip8( orgDst, 8, reducedLine, 8, outputMinimum, outputMaximum, add, shift );
546
547
537k
#endif
548
537k
  if( iSkipLine )
549
63.8k
  {
550
63.8k
    // dst now points just past the computed lines; zero the output of the skipped ones.
    memset( dst, 0, ( iSkipLine << 3 ) * sizeof( TCoeff ) );
551
63.8k
  }
552
537k
#endif
553
537k
}
554
555
556
/** 16x16 forward transform implemented using partial butterfly structure (1D)
557
*  \param src   input data (residual)
558
*  \param dst   output data (transform coefficients)
559
*  \param shift specifies right shift after 1D transform
560
*  \param line
561
*/
562
void fastForwardDCT2_B16(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
563
1.01M
{
564
#if !JVET_M0497_MATRIX_MULT
565
  int j, k;
566
  TCoeff E  [8], O  [8];
567
  TCoeff EE [4], EO [4];
568
  TCoeff EEE[2], EEO[2];
569
  TCoeff add = ( shift > 0 ) ? ( 1 << ( shift - 1 ) ) : 0;
570
571
  const TMatrixCoeff *iT = g_trCoreDCT2P16[TRANSFORM_FORWARD][0];
572
573
  TCoeff *pCoef = dst;
574
  const int  reducedLine = line - iSkipLine;
575
  for( j = 0; j < reducedLine; j++ )
576
  {
577
    /* E and O*/
578
    for( k = 0; k < 8; k++ )
579
    {
580
      E[k] = src[k] + src[15 - k];
581
      O[k] = src[k] - src[15 - k];
582
    }
583
    /* EE and EO */
584
    for( k = 0; k < 4; k++ )
585
    {
586
      EE[k] = E[k] + E[7 - k];
587
      EO[k] = E[k] - E[7 - k];
588
    }
589
    /* EEE and EEO */
590
    EEE[0] = EE[0] + EE[3];
591
    EEO[0] = EE[0] - EE[3];
592
    EEE[1] = EE[1] + EE[2];
593
    EEO[1] = EE[1] - EE[2];
594
595
    dst[ 0       ] = ( iT[ 0     ] * EEE[0] + iT[          1] * EEE[1] + add ) >> shift;
596
    dst[ 8 * line] = ( iT[ 8 * 16] * EEE[0] + iT[ 8 * 16 + 1] * EEE[1] + add ) >> shift;
597
    dst[ 4 * line] = ( iT[ 4 * 16] * EEO[0] + iT[ 4 * 16 + 1] * EEO[1] + add ) >> shift;
598
    dst[12 * line] = ( iT[12 * 16] * EEO[0] + iT[12 * 16 + 1] * EEO[1] + add ) >> shift;
599
600
    for( k = 2; k < 16; k += 4 )
601
    {
602
      dst[k*line] = ( iT[k * 16] * EO[0] + iT[k * 16 + 1] * EO[1] + iT[k * 16 + 2] * EO[2] + iT[k * 16 + 3] * EO[3] + add ) >> shift;
603
    }
604
605
    for( k = 1; k < 16; k += 2 )
606
    {
607
      dst[k*line] = ( iT[k * 16    ] * O[0] + iT[k * 16 + 1] * O[1] + iT[k * 16 + 2] * O[2] + iT[k * 16 + 3] * O[3] +
608
                      iT[k * 16 + 4] * O[4] + iT[k * 16 + 5] * O[5] + iT[k * 16 + 6] * O[6] + iT[k * 16 + 7] * O[7] + add ) >> shift;
609
    }
610
611
    src += 16;
612
    dst++;
613
614
  }
615
  if( iSkipLine )
616
  {
617
    dst = pCoef + reducedLine;
618
    for( j = 0; j < 16; j++ )
619
    {
620
      memset( dst, 0, sizeof( TCoeff )*iSkipLine );
621
      dst += line;
622
    }
623
  }
624
#else
625
1.01M
  _fastForwardMM< 16 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT2P16[TRANSFORM_FORWARD][0] );
626
1.01M
#endif
627
1.01M
}
628
629
/** 16x16 inverse transform implemented using partial butterfly structure (1D)
630
*  \param src            input data (transform coefficients)
631
*  \param dst            output data (residual)
632
*  \param shift          specifies right shift after 1D transform
633
*  \param line
634
*  \param outputMinimum  minimum for clipping
635
*  \param outputMaximum  maximum for clipping
636
*/
637
void fastInverseDCT2_B16( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum )
638
483k
{
639
483k
#if ENABLE_SIMD_TRAFO
640
483k
  const TMatrixCoeff *iT = g_trCoreDCT2P16[TRANSFORM_INVERSE][0];
641
642
483k
  _fastInverseMM<16>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, iT );
643
#else
644
  int j, k;
645
  int E  [8], O  [8];
646
  int EE [4], EO [4];
647
  int EEE[2], EEO[2];
648
  int add = 1 << ( shift - 1 );
649
650
  const TMatrixCoeff *iT = g_trCoreDCT2P16[TRANSFORM_INVERSE][0];
651
652
#if ENABLE_SIMD_TRAFO
653
  TCoeff *orgDst = dst;
654
655
#endif
656
  const int  reducedLine = line - iSkipLine;
657
658
  for( j = 0; j < reducedLine; j++ )
659
  {
660
    /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
661
    for( k = 0; k < 8; k++ )
662
    {
663
      O[k] = iT[1 * 16 + k] * src[    line] + iT[ 3 * 16 + k] * src[ 3 * line] + iT[ 5 * 16 + k] * src[ 5 * line] + iT[ 7 * 16 + k] * src[ 7 * line] +
664
        iT[9 * 16 + k] * src[9 * line] + iT[11 * 16 + k] * src[11 * line] + iT[13 * 16 + k] * src[13 * line] + iT[15 * 16 + k] * src[15 * line];
665
    }
666
    for( k = 0; k < 4; k++ )
667
    {
668
      EO[k] = iT[2 * 16 + k] * src[2 * line] + iT[6 * 16 + k] * src[6 * line] + iT[10 * 16 + k] * src[10 * line] + iT[14 * 16 + k] * src[14 * line];
669
    }
670
    EEO[0] = iT[4 * 16    ] * src[4 * line] + iT[12 * 16    ] * src[12 * line];
671
    EEE[0] = iT[0         ] * src[0       ] + iT[ 8 * 16    ] * src[ 8 * line];
672
    EEO[1] = iT[4 * 16 + 1] * src[4 * line] + iT[12 * 16 + 1] * src[12 * line];
673
    EEE[1] = iT[0 * 16 + 1] * src[0       ] + iT[ 8 * 16 + 1] * src[ 8 * line];
674
675
    /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
676
    for( k = 0; k < 2; k++ )
677
    {
678
      EE[k    ] = EEE[    k] + EEO[    k];
679
      EE[k + 2] = EEE[1 - k] - EEO[1 - k];
680
    }
681
    for( k = 0; k < 4; k++ )
682
    {
683
      E[k    ] = EE[    k] + EO[    k];
684
      E[k + 4] = EE[3 - k] - EO[3 - k];
685
    }
686
    for( k = 0; k < 8; k++ )
687
    {
688
#if ENABLE_SIMD_TRAFO
689
      dst[k    ] = E[    k] + O[    k];
690
      dst[k + 8] = E[7 - k] - O[7 - k];
691
#else
692
      dst[k    ] = Clip3( outputMinimum, outputMaximum, ( E[    k] + O[    k] + add ) >> shift );
693
      dst[k + 8] = Clip3( outputMinimum, outputMaximum, ( E[7 - k] - O[7 - k] + add ) >> shift );
694
#endif
695
    }
696
    src++;
697
    dst += 16;
698
  }
699
700
#if ENABLE_SIMD_TRAFO
701
  g_tCoeffOps.roundClip8( orgDst, 16, reducedLine, 16, outputMinimum, outputMaximum, add, shift );
702
703
#endif
704
  if( iSkipLine )
705
  {
706
    memset( dst, 0, ( iSkipLine << 4 ) * sizeof( TCoeff ) );
707
  }
708
#endif
709
483k
}
710
711
712
713
/** 32x32 forward transform implemented using partial butterfly structure (1D)
714
*  \param src   input data (residual)
715
*  \param dst   output data (transform coefficients)
716
*  \param shift specifies right shift after 1D transform
717
*  \param line
718
*/
719
void fastForwardDCT2_B32( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2 )
720
938k
{
721
#if !JVET_M0497_MATRIX_MULT
722
  int j, k;
723
  TCoeff E   [16], O   [16];
724
  TCoeff EE  [ 8], EO  [ 8];
725
  TCoeff EEE [ 4], EEO [ 4];
726
  TCoeff EEEE[ 2], EEEO[ 2];
727
  TCoeff add = ( shift > 0 ) ? ( 1 << ( shift - 1 ) ) : 0;
728
729
  const TMatrixCoeff *iT = g_trCoreDCT2P32[TRANSFORM_FORWARD][0];
730
731
  TCoeff *pCoef = dst;
732
  const int  reducedLine = line - iSkipLine;
733
  for (j = 0; j<reducedLine; j++)
734
  {
735
    /* E and O*/
736
    for (k = 0;k<16;k++)
737
    {
738
      E[k] = src[k] + src[31 - k];
739
      O[k] = src[k] - src[31 - k];
740
    }
741
    /* EE and EO */
742
    for (k = 0;k<8;k++)
743
    {
744
      EE[k] = E[k] + E[15 - k];
745
      EO[k] = E[k] - E[15 - k];
746
    }
747
    /* EEE and EEO */
748
    for (k = 0;k<4;k++)
749
    {
750
      EEE[k] = EE[k] + EE[7 - k];
751
      EEO[k] = EE[k] - EE[7 - k];
752
    }
753
    /* EEEE and EEEO */
754
    EEEE[0] = EEE[0] + EEE[3];
755
    EEEO[0] = EEE[0] - EEE[3];
756
    EEEE[1] = EEE[1] + EEE[2];
757
    EEEO[1] = EEE[1] - EEE[2];
758
759
    dst[0] = (iT[0 * 32 + 0] * EEEE[0] + iT[0 * 32 + 1] * EEEE[1] + add) >> shift;
760
    dst[16 * line] = (iT[16 * 32 + 0] * EEEE[0] + iT[16 * 32 + 1] * EEEE[1] + add) >> shift;
761
    dst[8 * line] = (iT[8 * 32 + 0] * EEEO[0] + iT[8 * 32 + 1] * EEEO[1] + add) >> shift;
762
    dst[24 * line] = (iT[24 * 32 + 0] * EEEO[0] + iT[24 * 32 + 1] * EEEO[1] + add) >> shift;
763
    for (k = 4;k<32;k += 8)
764
    {
765
      dst[k*line] = (iT[k * 32 + 0] * EEO[0] + iT[k * 32 + 1] * EEO[1] + iT[k * 32 + 2] * EEO[2] + iT[k * 32 + 3] * EEO[3] + add) >> shift;
766
    }
767
    for (k = 2;k<32;k += 4)
768
    {
769
      dst[k*line] = (iT[k * 32 + 0] * EO[0] + iT[k * 32 + 1] * EO[1] + iT[k * 32 + 2] * EO[2] + iT[k * 32 + 3] * EO[3] +
770
                      iT[k * 32 + 4] * EO[4] + iT[k * 32 + 5] * EO[5] + iT[k * 32 + 6] * EO[6] + iT[k * 32 + 7] * EO[7] + add) >> shift;
771
    }
772
    for (k = 1;k<32;k += 2)
773
    {
774
      dst[k*line] = (iT[k * 32 + 0] * O[0] + iT[k * 32 + 1] * O[1] + iT[k * 32 + 2] * O[2] + iT[k * 32 + 3] * O[3] +
775
                      iT[k * 32 + 4] * O[4] + iT[k * 32 + 5] * O[5] + iT[k * 32 + 6] * O[6] + iT[k * 32 + 7] * O[7] +
776
                      iT[k * 32 + 8] * O[8] + iT[k * 32 + 9] * O[9] + iT[k * 32 + 10] * O[10] + iT[k * 32 + 11] * O[11] +
777
                      iT[k * 32 + 12] * O[12] + iT[k * 32 + 13] * O[13] + iT[k * 32 + 14] * O[14] + iT[k * 32 + 15] * O[15] + add) >> shift;
778
    }
779
    src += 32;
780
    dst++;
781
  }
782
  if (iSkipLine)
783
  {
784
    dst = pCoef + reducedLine;
785
    for (j = 0; j<32; j++)
786
    {
787
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
788
      dst += line;
789
    }
790
  }
791
#else
792
938k
  _fastForwardMM< 32 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT2P32[TRANSFORM_FORWARD][0] );
793
938k
#endif
794
938k
}
795
796
/** 32x32 inverse transform implemented using partial butterfly structure (1D)
797
*  \param src   input data (transform coefficients)
798
*  \param dst   output data (residual)
799
*  \param shift specifies right shift after 1D transform
800
*  \param line
801
*  \param outputMinimum  minimum for clipping
802
*  \param outputMaximum  maximum for clipping
803
*/
804
void fastInverseDCT2_B32(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
805
231k
{
806
231k
#if ENABLE_SIMD_TRAFO
807
231k
  const TMatrixCoeff *iT = g_trCoreDCT2P32[TRANSFORM_INVERSE][0];
808
809
231k
  _fastInverseMM<32>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, iT );
810
#else
811
  int j, k;
812
  int E[16], O[16];
813
  int EE[8], EO[8];
814
  int EEE[4], EEO[4];
815
  int EEEE[2], EEEO[2];
816
  int add = 1 << (shift - 1);
817
818
  const TMatrixCoeff *iT = g_trCoreDCT2P32[TRANSFORM_INVERSE][0];
819
820
#if ENABLE_SIMD_TRAFO
821
  TCoeff *orgDst = dst;
822
823
#endif
824
  const int  reducedLine = line - iSkipLine;
825
  for (j = 0; j<reducedLine; j++)
826
  {
827
    /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
828
    for (k = 0;k<16;k++)
829
    {
830
      O[k] = iT[1 * 32 + k] * src[line] + iT[3 * 32 + k] * src[3 * line] + iT[5 * 32 + k] * src[5 * line] + iT[7 * 32 + k] * src[7 * line] +
831
        iT[9 * 32 + k] * src[9 * line] + iT[11 * 32 + k] * src[11 * line] + iT[13 * 32 + k] * src[13 * line] + iT[15 * 32 + k] * src[15 * line] +
832
        iT[17 * 32 + k] * src[17 * line] + iT[19 * 32 + k] * src[19 * line] + iT[21 * 32 + k] * src[21 * line] + iT[23 * 32 + k] * src[23 * line] +
833
        iT[25 * 32 + k] * src[25 * line] + iT[27 * 32 + k] * src[27 * line] + iT[29 * 32 + k] * src[29 * line] + iT[31 * 32 + k] * src[31 * line];
834
    }
835
    for (k = 0;k<8;k++)
836
    {
837
      EO[k] = iT[2 * 32 + k] * src[2 * line] + iT[6 * 32 + k] * src[6 * line] + iT[10 * 32 + k] * src[10 * line] + iT[14 * 32 + k] * src[14 * line] +
838
        iT[18 * 32 + k] * src[18 * line] + iT[22 * 32 + k] * src[22 * line] + iT[26 * 32 + k] * src[26 * line] + iT[30 * 32 + k] * src[30 * line];
839
    }
840
    for (k = 0;k<4;k++)
841
    {
842
      EEO[k] = iT[4 * 32 + k] * src[4 * line] + iT[12 * 32 + k] * src[12 * line] + iT[20 * 32 + k] * src[20 * line] + iT[28 * 32 + k] * src[28 * line];
843
    }
844
    EEEO[0] = iT[8 * 32 + 0] * src[8 * line] + iT[24 * 32 + 0] * src[24 * line];
845
    EEEO[1] = iT[8 * 32 + 1] * src[8 * line] + iT[24 * 32 + 1] * src[24 * line];
846
    EEEE[0] = iT[0 * 32 + 0] * src[0] + iT[16 * 32 + 0] * src[16 * line];
847
    EEEE[1] = iT[0 * 32 + 1] * src[0] + iT[16 * 32 + 1] * src[16 * line];
848
849
    /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
850
    EEE[0] = EEEE[0] + EEEO[0];
851
    EEE[3] = EEEE[0] - EEEO[0];
852
    EEE[1] = EEEE[1] + EEEO[1];
853
    EEE[2] = EEEE[1] - EEEO[1];
854
    for (k = 0;k<4;k++)
855
    {
856
      EE[k] = EEE[k] + EEO[k];
857
      EE[k + 4] = EEE[3 - k] - EEO[3 - k];
858
    }
859
    for (k = 0;k<8;k++)
860
    {
861
      E[k] = EE[k] + EO[k];
862
      E[k + 8] = EE[7 - k] - EO[7 - k];
863
    }
864
    for (k = 0;k<16;k++)
865
    {
866
#if ENABLE_SIMD_TRAFO
867
      dst[k     ] = E[k     ] + O[k     ];
868
      dst[k + 16] = E[15 - k] - O[15 - k];
869
#else
870
      dst[k] = Clip3(outputMinimum, outputMaximum, (E[k] + O[k] + add) >> shift);
871
      dst[k + 16] = Clip3(outputMinimum, outputMaximum, (E[15 - k] - O[15 - k] + add) >> shift);
872
#endif
873
    }
874
    src++;
875
    dst += 32;
876
  }
877
878
#if ENABLE_SIMD_TRAFO
879
  g_tCoeffOps.roundClip8( orgDst, 32, reducedLine, 32, outputMinimum, outputMaximum, add, shift );
880
881
#endif
882
  if (iSkipLine)
883
  {
884
    memset(dst, 0, (iSkipLine << 5) * sizeof(TCoeff));
885
  }
886
#endif
887
231k
}
888
889
void fastForwardDCT2_B64(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
890
95.5k
{
891
#if !JVET_M0497_MATRIX_MULT
892
  int rnd_factor = 1 << (shift - 1);
893
894
  const int uiTrSize = 64;
895
  const TMatrixCoeff *iT = g_trCoreDCT2P64[TRANSFORM_FORWARD][0];
896
897
  int   j, k;
898
  TCoeff E[32], O[32];
899
  TCoeff EE[16], EO[16];
900
  TCoeff EEE[8], EEO[8];
901
  TCoeff EEEE[4], EEEO[4];
902
  TCoeff EEEEE[2], EEEEO[2];
903
  TCoeff *tmp = dst;
904
905
  //bool zo = iSkipLine2 >= 32;
906
  bool zo = iSkipLine2 != 0;
907
  for (j = 0; j<line - iSkipLine; j++)
908
  {
909
    /* E and O*/
910
    for (k = 0;k<32;k++)
911
    {
912
      E[k] = src[k] + src[63 - k];
913
      O[k] = src[k] - src[63 - k];
914
    }
915
    /* EE and EO */
916
    for (k = 0;k<16;k++)
917
    {
918
      EE[k] = E[k] + E[31 - k];
919
      EO[k] = E[k] - E[31 - k];
920
    }
921
    /* EEE and EEO */
922
    for (k = 0;k<8;k++)
923
    {
924
      EEE[k] = EE[k] + EE[15 - k];
925
      EEO[k] = EE[k] - EE[15 - k];
926
    }
927
    /* EEEE and EEEO */
928
    for (k = 0;k<4;k++)
929
    {
930
      EEEE[k] = EEE[k] + EEE[7 - k];
931
      EEEO[k] = EEE[k] - EEE[7 - k];
932
    }
933
    /* EEEEE and EEEEO */
934
    EEEEE[0] = EEEE[0] + EEEE[3];
935
    EEEEO[0] = EEEE[0] - EEEE[3];
936
    EEEEE[1] = EEEE[1] + EEEE[2];
937
    EEEEO[1] = EEEE[1] - EEEE[2];
938
939
    dst[0] = (iT[0 * 64 + 0] * EEEEE[0] + iT[0 * 64 + 1] * EEEEE[1] + rnd_factor) >> shift;
940
    dst[16 * line] = (iT[16 * 64 + 0] * EEEEO[0] + iT[16 * 64 + 1] * EEEEO[1] + rnd_factor) >> shift;
941
942
    if (!zo)
943
    {
944
      dst[32 * line] = (iT[32 * 64 + 0] * EEEEE[0] + iT[32 * 64 + 1] * EEEEE[1] + rnd_factor) >> shift;
945
      dst[48 * line] = (iT[48 * 64 + 0] * EEEEO[0] + iT[48 * 64 + 1] * EEEEO[1] + rnd_factor) >> shift;
946
    }
947
    for (k = 8;k<(zo ? 32 : 64);k += 16)
948
    {
949
      dst[k*line] = (iT[k * 64 + 0] * EEEO[0] + iT[k * 64 + 1] * EEEO[1] + iT[k * 64 + 2] * EEEO[2] + iT[k * 64 + 3] * EEEO[3] + rnd_factor) >> shift;
950
    }
951
    for (k = 4;k<(zo ? 32 : 64);k += 8)
952
    {
953
      dst[k*line] = (iT[k * 64 + 0] * EEO[0] + iT[k * 64 + 1] * EEO[1] + iT[k * 64 + 2] * EEO[2] + iT[k * 64 + 3] * EEO[3] +
954
                      iT[k * 64 + 4] * EEO[4] + iT[k * 64 + 5] * EEO[5] + iT[k * 64 + 6] * EEO[6] + iT[k * 64 + 7] * EEO[7] + rnd_factor) >> shift;
955
    }
956
    for (k = 2;k<(zo ? 32 : 64);k += 4)
957
    {
958
      dst[k*line] = (iT[k * 64 + 0] * EO[0] + iT[k * 64 + 1] * EO[1] + iT[k * 64 + 2] * EO[2] + iT[k * 64 + 3] * EO[3] +
959
                      iT[k * 64 + 4] * EO[4] + iT[k * 64 + 5] * EO[5] + iT[k * 64 + 6] * EO[6] + iT[k * 64 + 7] * EO[7] +
960
                      iT[k * 64 + 8] * EO[8] + iT[k * 64 + 9] * EO[9] + iT[k * 64 + 10] * EO[10] + iT[k * 64 + 11] * EO[11] +
961
                      iT[k * 64 + 12] * EO[12] + iT[k * 64 + 13] * EO[13] + iT[k * 64 + 14] * EO[14] + iT[k * 64 + 15] * EO[15] + rnd_factor) >> shift;
962
    }
963
    for (k = 1;k<(zo ? 32 : 64);k += 2)
964
    {
965
      dst[k*line] = (iT[k * 64 + 0] * O[0] + iT[k * 64 + 1] * O[1] + iT[k * 64 + 2] * O[2] + iT[k * 64 + 3] * O[3] +
966
                      iT[k * 64 + 4] * O[4] + iT[k * 64 + 5] * O[5] + iT[k * 64 + 6] * O[6] + iT[k * 64 + 7] * O[7] +
967
                      iT[k * 64 + 8] * O[8] + iT[k * 64 + 9] * O[9] + iT[k * 64 + 10] * O[10] + iT[k * 64 + 11] * O[11] +
968
                      iT[k * 64 + 12] * O[12] + iT[k * 64 + 13] * O[13] + iT[k * 64 + 14] * O[14] + iT[k * 64 + 15] * O[15] +
969
                      iT[k * 64 + 16] * O[16] + iT[k * 64 + 17] * O[17] + iT[k * 64 + 18] * O[18] + iT[k * 64 + 19] * O[19] +
970
                      iT[k * 64 + 20] * O[20] + iT[k * 64 + 21] * O[21] + iT[k * 64 + 22] * O[22] + iT[k * 64 + 23] * O[23] +
971
                      iT[k * 64 + 24] * O[24] + iT[k * 64 + 25] * O[25] + iT[k * 64 + 26] * O[26] + iT[k * 64 + 27] * O[27] +
972
                      iT[k * 64 + 28] * O[28] + iT[k * 64 + 29] * O[29] + iT[k * 64 + 30] * O[30] + iT[k * 64 + 31] * O[31] + rnd_factor) >> shift;
973
    }
974
    src += uiTrSize;
975
    dst++;
976
  }
977
978
  const int  reducedLine = line - iSkipLine;
979
  const int  cutoff = uiTrSize - iSkipLine2;
980
  if (iSkipLine)
981
  {
982
    dst = tmp + reducedLine;
983
    for (j = 0; j<cutoff; j++)
984
    {
985
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
986
      dst += line;
987
    }
988
  }
989
  if (iSkipLine2)
990
  {
991
    dst = tmp + line*cutoff;
992
    memset(dst, 0, sizeof(TCoeff)*line*iSkipLine2);
993
  }
994
#else
995
95.5k
  _fastForwardMM< 64 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT2P64[TRANSFORM_FORWARD][0] );
996
95.5k
#endif
997
95.5k
}
998
999
void fastInverseDCT2_B64(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1000
22.1k
{
1001
22.1k
#if ENABLE_SIMD_TRAFO
1002
22.1k
  const TMatrixCoeff *iT = g_trCoreDCT2P64[TRANSFORM_INVERSE][0];
1003
1004
22.1k
  _fastInverseMM<64>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, iT );
1005
#else
1006
  int rnd_factor = 1 << (shift - 1);
1007
  const int uiTrSize = 64;
1008
  const TMatrixCoeff *iT = g_trCoreDCT2P64[TRANSFORM_INVERSE][0];
1009
1010
#if ENABLE_SIMD_TRAFO
1011
  TCoeff *orgDst = dst;
1012
1013
#endif
1014
  int    j, k;
1015
  TCoeff E[32], O[32];
1016
  TCoeff EE[16], EO[16];
1017
  TCoeff EEE[8], EEO[8];
1018
  TCoeff EEEE[4], EEEO[4];
1019
  TCoeff EEEEE[2], EEEEO[2];
1020
  bool zo = iSkipLine2 >= 32;
1021
  for (j = 0; j<line - iSkipLine; j++)
1022
  {
1023
    /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
1024
    for (k = 0;k<32;k++)
1025
    {
1026
      O[k] = iT[1 * 64 + k] * src[line] + iT[3 * 64 + k] * src[3 * line] + iT[5 * 64 + k] * src[5 * line] + iT[7 * 64 + k] * src[7 * line] +
1027
        iT[9 * 64 + k] * src[9 * line] + iT[11 * 64 + k] * src[11 * line] + iT[13 * 64 + k] * src[13 * line] + iT[15 * 64 + k] * src[15 * line] +
1028
        iT[17 * 64 + k] * src[17 * line] + iT[19 * 64 + k] * src[19 * line] + iT[21 * 64 + k] * src[21 * line] + iT[23 * 64 + k] * src[23 * line] +
1029
        iT[25 * 64 + k] * src[25 * line] + iT[27 * 64 + k] * src[27 * line] + iT[29 * 64 + k] * src[29 * line] + iT[31 * 64 + k] * src[31 * line] +
1030
        (zo ? 0 : (
1031
        iT[33 * 64 + k] * src[33 * line] + iT[35 * 64 + k] * src[35 * line] + iT[37 * 64 + k] * src[37 * line] + iT[39 * 64 + k] * src[39 * line] +
1032
        iT[41 * 64 + k] * src[41 * line] + iT[43 * 64 + k] * src[43 * line] + iT[45 * 64 + k] * src[45 * line] + iT[47 * 64 + k] * src[47 * line] +
1033
        iT[49 * 64 + k] * src[49 * line] + iT[51 * 64 + k] * src[51 * line] + iT[53 * 64 + k] * src[53 * line] + iT[55 * 64 + k] * src[55 * line] +
1034
        iT[57 * 64 + k] * src[57 * line] + iT[59 * 64 + k] * src[59 * line] + iT[61 * 64 + k] * src[61 * line] + iT[63 * 64 + k] * src[63 * line]));
1035
    }
1036
    for (k = 0;k<16;k++)
1037
    {
1038
      EO[k] = iT[2 * 64 + k] * src[2 * line] + iT[6 * 64 + k] * src[6 * line] + iT[10 * 64 + k] * src[10 * line] + iT[14 * 64 + k] * src[14 * line] +
1039
        iT[18 * 64 + k] * src[18 * line] + iT[22 * 64 + k] * src[22 * line] + iT[26 * 64 + k] * src[26 * line] + iT[30 * 64 + k] * src[30 * line] +
1040
        (zo ? 0 : (
1041
        iT[34 * 64 + k] * src[34 * line] + iT[38 * 64 + k] * src[38 * line] + iT[42 * 64 + k] * src[42 * line] + iT[46 * 64 + k] * src[46 * line] +
1042
        iT[50 * 64 + k] * src[50 * line] + iT[54 * 64 + k] * src[54 * line] + iT[58 * 64 + k] * src[58 * line] + iT[62 * 64 + k] * src[62 * line]));
1043
    }
1044
    for (k = 0;k<8;k++)
1045
    {
1046
      EEO[k] = iT[4 * 64 + k] * src[4 * line] + iT[12 * 64 + k] * src[12 * line] + iT[20 * 64 + k] * src[20 * line] + iT[28 * 64 + k] * src[28 * line] +
1047
        (zo ? 0 : (
1048
        iT[36 * 64 + k] * src[36 * line] + iT[44 * 64 + k] * src[44 * line] + iT[52 * 64 + k] * src[52 * line] + iT[60 * 64 + k] * src[60 * line]));
1049
    }
1050
    for (k = 0;k<4;k++)
1051
    {
1052
      EEEO[k] = iT[8 * 64 + k] * src[8 * line] + iT[24 * 64 + k] * src[24 * line] + (zo ? 0 : (iT[40 * 64 + k] * src[40 * line] + iT[56 * 64 + k] * src[56 * line]));
1053
    }
1054
    EEEEO[0] = iT[16 * 64 + 0] * src[16 * line] + (zo ? 0 : iT[48 * 64 + 0] * src[48 * line]);
1055
    EEEEO[1] = iT[16 * 64 + 1] * src[16 * line] + (zo ? 0 : iT[48 * 64 + 1] * src[48 * line]);
1056
    EEEEE[0] = iT[0 * 64 + 0] * src[0] + (zo ? 0 : iT[32 * 64 + 0] * src[32 * line]);
1057
    EEEEE[1] = iT[0 * 64 + 1] * src[0] + (zo ? 0 : iT[32 * 64 + 1] * src[32 * line]);
1058
1059
    /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
1060
    for (k = 0;k<2;k++)
1061
    {
1062
      EEEE[k] = EEEEE[k] + EEEEO[k];
1063
      EEEE[k + 2] = EEEEE[1 - k] - EEEEO[1 - k];
1064
    }
1065
    for (k = 0;k<4;k++)
1066
    {
1067
      EEE[k] = EEEE[k] + EEEO[k];
1068
      EEE[k + 4] = EEEE[3 - k] - EEEO[3 - k];
1069
    }
1070
    for (k = 0;k<8;k++)
1071
    {
1072
      EE[k] = EEE[k] + EEO[k];
1073
      EE[k + 8] = EEE[7 - k] - EEO[7 - k];
1074
    }
1075
    for (k = 0;k<16;k++)
1076
    {
1077
      E[k] = EE[k] + EO[k];
1078
      E[k + 16] = EE[15 - k] - EO[15 - k];
1079
    }
1080
    for (k = 0;k<32;k++)
1081
    {
1082
#if ENABLE_SIMD_TRAFO
1083
      dst[k]      = E[k] + O[k];
1084
      dst[k + 32] = E[31 - k] - O[31 - k];
1085
#else
1086
      dst[k]      = Clip3( outputMinimum, outputMaximum, ( E[k] + O[k] + rnd_factor ) >> shift );
1087
      dst[k + 32] = Clip3( outputMinimum, outputMaximum, ( E[31 - k] - O[31 - k] + rnd_factor ) >> shift );
1088
#endif
1089
    }
1090
    src++;
1091
    dst += uiTrSize;
1092
  }
1093
1094
#if ENABLE_SIMD_TRAFO
1095
  g_tCoeffOps.roundClip8( orgDst, 32, line - iSkipLine, 32, outputMinimum, outputMaximum, rnd_factor, shift );
1096
1097
1098
#endif
1099
  memset( dst, 0, uiTrSize*iSkipLine * sizeof( TCoeff ) );
1100
#endif
1101
22.1k
}
1102
1103
1104
1105
// ********************************** DST-VII **********************************
1106
void fastForwardDST7_B4(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
1107
4.71k
{
1108
4.71k
  int i;
1109
4.71k
  TCoeff rnd_factor = (shift > 0) ? (1 << (shift - 1)) : 0;
1110
1111
4.71k
  const TMatrixCoeff *iT = g_trCoreDST7P4[TRANSFORM_FORWARD][0];
1112
1113
4.71k
  int c[4];
1114
4.71k
  TCoeff *pCoeff = dst;
1115
4.71k
  const int  reducedLine = line - iSkipLine;
1116
102k
  for (i = 0; i<reducedLine; i++)
1117
97.3k
  {
1118
    // Intermediate Variables
1119
97.3k
    c[0] = src[0] + src[3];
1120
97.3k
    c[1] = src[1] + src[3];
1121
97.3k
    c[2] = src[0] - src[1];
1122
97.3k
    c[3] = iT[2] * src[2];
1123
1124
97.3k
    dst[0 * line] = (iT[0] * c[0] + iT[1] * c[1] + c[3] + rnd_factor) >> shift;
1125
97.3k
    dst[1 * line] = (iT[2] * (src[0] + src[1] - src[3]) + rnd_factor) >> shift;
1126
97.3k
    dst[2 * line] = (iT[0] * c[2] + iT[1] * c[0] - c[3] + rnd_factor) >> shift;
1127
97.3k
    dst[3 * line] = (iT[1] * c[2] - iT[0] * c[1] + c[3] + rnd_factor) >> shift;
1128
1129
97.3k
    src += 4;
1130
97.3k
    dst++;
1131
97.3k
  }
1132
4.71k
  if (iSkipLine)
1133
0
  {
1134
0
    dst = pCoeff + reducedLine;
1135
0
    for (i = 0; i<4; i++)
1136
0
    {
1137
0
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
1138
0
      dst += line;
1139
0
    }
1140
0
  }
1141
4.71k
}
1142
1143
void fastInverseDST7_B4(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1144
1.86k
{
1145
1.86k
#if ENABLE_SIMD_TRAFO
1146
1.86k
  _fastInverseMM<4>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDST7P4[TRANSFORM_INVERSE][0] );
1147
#else
1148
  int i;
1149
  TCoeff c[4];
1150
  TCoeff rnd_factor = (shift > 0) ? (1 << (shift - 1)) : 0;
1151
1152
  const TMatrixCoeff *iT = g_trCoreDST7P4[TRANSFORM_INVERSE][0];
1153
1154
  const int  reducedLine = line - iSkipLine;
1155
  for (i = 0; i<reducedLine; i++)
1156
  {
1157
    // Intermediate Variables
1158
    c[0] = src[0 * line] + src[2 * line];
1159
    c[1] = src[2 * line] + src[3 * line];
1160
    c[2] = src[0 * line] - src[3 * line];
1161
    c[3] = iT[2] * src[1 * line];
1162
1163
    dst[0] = Clip3(outputMinimum, outputMaximum, (iT[0] * c[0] + iT[1] * c[1] + c[3] + rnd_factor) >> shift);
1164
    dst[1] = Clip3(outputMinimum, outputMaximum, (iT[1] * c[2] - iT[0] * c[1] + c[3] + rnd_factor) >> shift);
1165
    dst[2] = Clip3(outputMinimum, outputMaximum, (iT[2] * (src[0 * line] - src[2 * line] + src[3 * line]) + rnd_factor) >> shift);
1166
    dst[3] = Clip3(outputMinimum, outputMaximum, (iT[1] * c[0] + iT[0] * c[2] - c[3] + rnd_factor) >> shift);
1167
1168
    dst += 4;
1169
    src++;
1170
  }
1171
  if (iSkipLine)
1172
  {
1173
    memset(dst, 0, (iSkipLine << 2) * sizeof(TCoeff));
1174
  }
1175
#endif
1176
1.86k
}
1177
1178
1179
void fastForwardDST7_B8(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
1180
27.5k
{
1181
27.5k
  _fastForwardMM< 8 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDST7P8[TRANSFORM_FORWARD][0] );
1182
27.5k
}
1183
1184
1185
void fastInverseDST7_B8(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1186
1.19k
{
1187
1.19k
  _fastInverseMM< 8 >( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDST7P8[TRANSFORM_INVERSE][0]);
1188
1.19k
}
1189
1190
1191
void fastForwardDST7_B16(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
1192
36.7k
{
1193
#if !JVET_M0497_MATRIX_MULT
1194
  int j, k;
1195
  TCoeff a[5], b[5], c[5], d[5], t;
1196
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
1197
1198
  const TMatrixCoeff *iT = g_trCoreDST7P16[TRANSFORM_FORWARD][0];
1199
1200
  TCoeff *pCoef = dst;
1201
  const int  reducedLine = line - iSkipLine;
1202
  const int  cutoff = 16 - iSkipLine2;
1203
1204
  for (j = 0; j < reducedLine; j++)
1205
  {
1206
    for (k = 0; k < 5; k++)
1207
    {
1208
      a[k] = src[    k] + src[11 + k];
1209
      b[k] = src[9 - k] + src[11 + k];
1210
      c[k] = src[    k] - src[ 9 - k];
1211
      d[k] = src[    k] + src[ 9 - k] - src[11 + k];
1212
    }
1213
1214
    t = iT[10] * src[10];
1215
1216
    dst[ 1 * line] = ( iT[ 2]*d[0] + iT[ 5]*d[1] + iT[ 8]*d[2] + iT[11]*d[3] + iT[14]*d[4] + add) >> shift;
1217
    dst[ 4 * line] = ( iT[ 8]*d[0] + iT[14]*d[1] + iT[ 5]*d[2] - iT[ 2]*d[3] - iT[11]*d[4] + add) >> shift;
1218
    dst[ 7 * line] = ( iT[14]*d[0] + iT[ 2]*d[1] - iT[11]*d[2] - iT[ 5]*d[3] + iT[ 8]*d[4] + add) >> shift;
1219
    dst[10 * line] = ( iT[11]*d[0] - iT[ 8]*d[1] - iT[ 2]*d[2] + iT[14]*d[3] - iT[ 5]*d[4] + add) >> shift;
1220
    dst[13 * line] = ( iT[ 5]*d[0] - iT[11]*d[1] + iT[14]*d[2] - iT[ 8]*d[3] + iT[ 2]*d[4] + add) >> shift;
1221
1222
    dst[5 * line] = ( iT[10] * (src[0] + src[1] - src[3] - src[4] + src[6] + src[7] - src[9] - src[10] + src[12] + src[13] - src[15]) + add) >> shift;
1223
1224
    dst[ 0 * line] = ( iT[0]*a[0] + iT[9]*b[0] + iT[1]*a[1] + iT[8]*b[1] + iT[2]*a[2] + iT[7]*b[2] + iT[3]*a[3] + iT[6]*b[3] + iT[4]*a[4] + iT[5]*b[4] + t + add ) >> shift;
1225
    dst[ 2 * line] = ( iT[4]*c[0] - iT[5]*b[0] + iT[9]*c[1] - iT[0]*b[1] + iT[6]*c[2] + iT[3]*a[2] + iT[1]*c[3] + iT[8]*a[3] + iT[7]*a[4] + iT[2]*b[4] - t + add ) >> shift;
1226
    dst[ 3 * line] = ( iT[6]*a[0] + iT[3]*b[0] + iT[2]*c[1] + iT[7]*a[1] + iT[9]*c[2] + iT[0]*a[2] + iT[4]*c[3] - iT[5]*b[3] - iT[1]*a[4] - iT[8]*b[4] + t + add ) >> shift;
1227
    dst[ 6 * line] = ( iT[8]*a[0] + iT[1]*c[0] + iT[6]*c[1] - iT[3]*b[1] - iT[5]*a[2] - iT[4]*b[2] - iT[7]*c[3] - iT[2]*a[3] - iT[0]*c[4] + iT[9]*b[4] + t + add ) >> shift;
1228
    dst[ 8 * line] = ( iT[4]*c[0] + iT[5]*a[0] - iT[0]*c[1] + iT[9]*b[1] - iT[3]*c[2] - iT[6]*a[2] + iT[1]*c[3] - iT[8]*b[3] + iT[2]*c[4] + iT[7]*a[4] - t + add ) >> shift;
1229
    dst[ 9 * line] = ( iT[7]*c[0] + iT[2]*a[0] - iT[4]*a[1] - iT[5]*b[1] - iT[8]*c[2] + iT[1]*b[2] + iT[9]*a[3] + iT[0]*b[3] + iT[3]*c[4] - iT[6]*b[4] + t + add ) >> shift;
1230
    dst[11 * line] = ( iT[9]*a[0] + iT[0]*b[0] - iT[8]*c[1] - iT[1]*a[1] + iT[2]*c[2] - iT[7]*b[2] + iT[6]*a[3] + iT[3]*b[3] - iT[5]*c[4] - iT[4]*a[4] - t + add ) >> shift;
1231
    dst[12 * line] = ( iT[7]*c[0] - iT[2]*b[0] - iT[5]*c[1] - iT[4]*a[1] + iT[8]*a[2] + iT[1]*b[2] - iT[0]*a[3] - iT[9]*b[3] - iT[6]*c[4] + iT[3]*b[4] + t + add ) >> shift;
1232
    dst[14 * line] = ( iT[3]*a[0] + iT[6]*b[0] - iT[7]*a[1] - iT[2]*b[1] + iT[0]*c[2] + iT[9]*a[2] - iT[4]*c[3] - iT[5]*a[3] + iT[8]*c[4] + iT[1]*a[4] - t + add ) >> shift;
1233
    dst[15 * line] = ( iT[1]*c[0] - iT[8]*b[0] - iT[3]*c[1] + iT[6]*b[1] + iT[5]*c[2] - iT[4]*b[2] - iT[7]*c[3] + iT[2]*b[3] + iT[9]*c[4] - iT[0]*b[4] + t + add ) >> shift;
1234
1235
    src += 16;
1236
    dst++;
1237
  }
1238
1239
  if (iSkipLine)
1240
  {
1241
    dst = pCoef + reducedLine;
1242
    for (j = 0; j < cutoff; j++)
1243
    {
1244
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
1245
      dst += line;
1246
    }
1247
  }
1248
1249
  if (iSkipLine2)
1250
  {
1251
    dst = pCoef + line * cutoff;
1252
    memset(dst, 0, sizeof(TCoeff) * line * iSkipLine2);
1253
  }
1254
#else
1255
36.7k
  _fastForwardMM< 16 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDST7P16[TRANSFORM_FORWARD][0] );
1256
36.7k
#endif
1257
36.7k
}
1258
1259
1260
// 16-point inverse transform using the g_trCoreDST7P16[TRANSFORM_INVERSE] table.
//
// src            input coefficients; the 16 values of one transform are read with a
//                stride of `line` (src[n * line]), and src advances by one per transform.
// dst            output; 16 consecutive values are written per transform.
// shift          right shift applied to every result; `add` supplies the rounding offset.
// line           number of transforms and, at the same time, the input stride.
// iSkipLine      number of trailing transforms that are not computed; their output is zeroed.
// iSkipLine2     only forwarded to _fastInverseMM; not referenced by the explicit path.
// outputMinimum, outputMaximum
//                clipping range applied to every output value (explicit path: via Clip3).
void fastInverseDST7_B16(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1261
5.79k
{
1262
#if !JVET_M0497_MATRIX_MULT
1263
  int j, k;
1264
  TCoeff a[5], b[5], c[5], d[5], t;
1265
1266
  // Rounding offset: half of the divisor implied by `shift`; 0 when no shift is applied.
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
1267
1268
  const TMatrixCoeff *iT = g_trCoreDST7P16[TRANSFORM_INVERSE][0];
1269
1270
  const int  reducedLine = line - iSkipLine;
1271
1272
  for (j = 0; j < reducedLine; j++)
1273
  {
1274
    // Pairwise combinations of input rows 0..4, 6..10 and 11..15 that are shared
    // by several outputs below. Row 5 is not part of them; it enters through `t`.
    for (k = 0; k < 5; k++)
1275
    {
1276
      a[k] = src[       k * line] + src[(10 - k) * line];
1277
      b[k] = src[(11 + k) * line] + src[(10 - k) * line];
1278
      c[k] = src[       k * line] - src[(11 + k) * line];
1279
      d[k] = src[       k * line] + src[(11 + k) * line] - src[(10 - k)*line];
1280
    }
1281
1282
    // Contribution of input row 5; added to or subtracted from the a/b/c based outputs.
    t = iT[10] * src[5 * line];
1283
1284
    // Outputs 2, 5, 8, 11, 14 depend on d[] only.
    // Note: the (int) cast binds tighter than >>, so the sum is cast first and then shifted.
    dst[ 2] = Clip3(outputMinimum, outputMaximum, (int)( iT[ 2]*d[0] + iT[ 8]*d[1] + iT[14]*d[2] + iT[11]*d[3] + iT[ 5]*d[4] + add ) >> shift);
1285
    dst[ 5] = Clip3(outputMinimum, outputMaximum, (int)( iT[ 5]*d[0] + iT[14]*d[1] + iT[ 2]*d[2] - iT[ 8]*d[3] - iT[11]*d[4] + add ) >> shift);
1286
    dst[ 8] = Clip3(outputMinimum, outputMaximum, (int)( iT[ 8]*d[0] + iT[ 5]*d[1] - iT[11]*d[2] - iT[ 2]*d[3] + iT[14]*d[4] + add ) >> shift);
1287
    dst[11] = Clip3(outputMinimum, outputMaximum, (int)( iT[11]*d[0] - iT[ 2]*d[1] - iT[ 5]*d[2] + iT[14]*d[3] - iT[ 8]*d[4] + add ) >> shift);
1288
    dst[14] = Clip3(outputMinimum, outputMaximum, (int)( iT[14]*d[0] - iT[11]*d[1] + iT[ 8]*d[2] - iT[ 5]*d[3] + iT[ 2]*d[4] + add ) >> shift);
1289
1290
    // Output 10 needs a single multiplication: iT[10] times a signed sum of input rows.
    dst[10] = Clip3(outputMinimum, outputMaximum, (int)( iT[10]*(src[ 0*line]-src[ 2*line]+src[ 3*line]-src[5*line]
1291
                                                                +src[ 6*line]-src[ 8*line]+src[ 9*line]-src[11*line]
1292
                                                                +src[12*line]-src[14*line]+src[15*line]) + add ) >> shift);
1293
1294
    // Remaining outputs: ten products on a/b/c terms plus or minus the row-5 term `t`.
    dst[ 0] = Clip3(outputMinimum, outputMaximum, (int)( iT[0]*a[0] + iT[9]*b[0] + iT[2]*a[1] + iT[7]*b[1] + iT[4]*a[2] + iT[5]*b[2] + iT[6]*a[3] + iT[3]*b[3] + iT[8]*a[4] + iT[1]*b[4] + t + add ) >> shift);
1295
    dst[ 1] = Clip3(outputMinimum, outputMaximum, (int)( iT[1]*c[0] - iT[8]*b[0] + iT[5]*c[1] - iT[4]*b[1] + iT[9]*c[2] - iT[0]*b[2] + iT[2]*a[3] + iT[7]*c[3] + iT[6]*a[4] + iT[3]*c[4] + t + add ) >> shift);
1296
    dst[ 3] = Clip3(outputMinimum, outputMaximum, (int)( iT[3]*a[0] + iT[6]*b[0] + iT[0]*c[1] + iT[9]*a[1] + iT[1]*a[2] + iT[8]*c[2] + iT[4]*c[3] - iT[5]*b[3] - iT[2]*a[4] - iT[7]*b[4] - t + add ) >> shift);
1297
    dst[ 4] = Clip3(outputMinimum, outputMaximum, (int)( iT[4]*c[0] - iT[5]*b[0] + iT[6]*c[1] + iT[3]*a[1] + iT[7]*a[2] + iT[2]*b[2] - iT[1]*c[3] + iT[8]*b[3] - iT[9]*c[4] - iT[0]*a[4] - t + add ) >> shift);
1298
    dst[ 6] = Clip3(outputMinimum, outputMaximum, (int)( iT[6]*a[0] + iT[3]*b[0] + iT[9]*c[1] + iT[0]*a[1] - iT[1]*a[2] - iT[8]*b[2] - iT[4]*c[3] - iT[5]*a[3] - iT[2]*c[4] + iT[7]*b[4] + t + add ) >> shift);
1299
    dst[ 7] = Clip3(outputMinimum, outputMaximum, (int)( iT[7]*c[0] - iT[2]*b[0] + iT[8]*a[1] + iT[1]*b[1] - iT[6]*c[2] + iT[3]*b[2] - iT[9]*a[3] - iT[0]*b[3] + iT[5]*c[4] - iT[4]*b[4] + t + add ) >> shift);
1300
    dst[ 9] = Clip3(outputMinimum, outputMaximum, (int)( iT[9]*a[0] + iT[0]*b[0] + iT[2]*c[1] - iT[7]*b[1] - iT[5]*c[2] - iT[4]*a[2] + iT[3]*a[3] + iT[6]*b[3] + iT[8]*c[4] - iT[1]*b[4] - t + add ) >> shift);
1301
    dst[12] = Clip3(outputMinimum, outputMaximum, (int)( iT[1]*c[0] + iT[8]*a[0] - iT[5]*a[1] - iT[4]*b[1] - iT[0]*c[2] + iT[9]*b[2] + iT[7]*c[3] - iT[2]*b[3] - iT[6]*c[4] - iT[3]*a[4] + t + add ) >> shift);
1302
    dst[13] = Clip3(outputMinimum, outputMaximum, (int)( iT[7]*c[0] + iT[2]*a[0] - iT[8]*c[1] + iT[1]*b[1] + iT[3]*c[2] - iT[6]*b[2] + iT[0]*a[3] + iT[9]*b[3] - iT[5]*a[4] - iT[4]*b[4] + t + add ) >> shift);
1303
    dst[15] = Clip3(outputMinimum, outputMaximum, (int)( iT[4]*c[0] + iT[5]*a[0] - iT[3]*c[1] - iT[6]*a[1] + iT[2]*c[2] + iT[7]*a[2] - iT[1]*c[3] - iT[8]*a[3] + iT[0]*c[4] + iT[9]*a[4] - t + add ) >> shift);
1304
1305
    // Advance to the next transform: next input column, next group of 16 outputs.
    src++;
1306
    dst += 16;
1307
  }
1308
1309
  // dst now points just past the computed outputs; zero the skipped transforms.
  if (iSkipLine)
1310
  {
1311
    memset(dst, 0, (iSkipLine * 16) * sizeof(TCoeff));
1312
  }
1313
#else
1314
5.79k
  // Generic matrix-multiplication implementation with the same coefficient table.
  _fastInverseMM< 16 >( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDST7P16[TRANSFORM_INVERSE][0]);
1315
5.79k
#endif
1316
5.79k
}
1317
1318
1319
// 32-point forward transform using the g_trCoreDST7P32[TRANSFORM_FORWARD] table.
//
// src         input samples; 32 consecutive values per transform, src advances by 32.
// dst         output; coefficient n of a transform is written to dst[n * line], and dst
//             advances by one per transform.
// shift       right shift applied to every result; `add` supplies the rounding offset.
// line        number of transforms and, at the same time, the output stride.
// iSkipLine   number of trailing transforms that are not computed; their output
//             positions are zeroed in the first `cutoff` coefficient rows.
// iSkipLine2  number of trailing coefficient rows that are zeroed (cutoff = 32 - iSkipLine2).
//             The explicit path still computes all 32 rows and overwrites them afterwards.
void fastForwardDST7_B32(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
1320
0
{
1321
#if !JVET_M0497_MATRIX_MULT
1322
  int j, k;
1323
  TCoeff a[10][6];
1324
  TCoeff t[2];
1325
  TCoeff b[6];
1326
  TCoeff c[2];
1327
1328
  // Rounding offset: half of the divisor implied by `shift`; 0 when no shift is applied.
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
1329
  const TMatrixCoeff *iT = g_trCoreDST7P32[TRANSFORM_FORWARD][0];
1330
  // Remember the start of the output so the zero-fill steps can address it again.
  TCoeff *pCoef = dst;
1331
  const int  reducedLine = line - iSkipLine;
1332
  const int  cutoff = 32 - iSkipLine2;
1333
1334
  for (j = 0; j < reducedLine; j++)
1335
  {
1336
    // Pairwise sums/differences of input samples that are reused by many outputs.
    // Samples 12 and 25 are not part of a[][] or b[]; they enter through t[] below.
    for (k = 0; k < 6; k++)
1337
    {
1338
      a[0][k] = src[     k] - src[11 - k];
1339
      a[1][k] = src[     k] + src[13 + k];
1340
      a[2][k] = src[     k] + src[24 - k];
1341
      a[3][k] = src[     k] - src[26 + k];
1342
      a[4][k] = src[ 6 + k] + src[18 - k];
1343
      a[5][k] = src[ 6 + k] + src[19 + k];
1344
      a[6][k] = src[ 6 + k] - src[31 - k];
1345
      a[7][k] = src[13 + k] - src[24 - k];
1346
      a[8][k] = src[13 + k] + src[26 + k];
1347
      a[9][k] = src[19 + k] + src[31 - k];
1348
1349
      // Five-term combination feeding output rows 2, 7, 12, 17, 22 and 27.
      b[k] = src[k] + src[11 - k] - src[13 + k] - src[24 - k] + src[26 + k];
1350
    }
1351
    // Thirteen-term signed sums feeding output rows 6 and 19.
    for (k = 0; k < 2; k++)
1352
    {
1353
      c[k] = src[k] + src[3 - k] - src[5 + k] - src[8 - k] + src[10 + k] + src[13 - k] - src[15 + k] - src[18 - k] + src[20 + k] + src[23 - k] - src[25 + k] - src[28 - k] + src[30 + k];
1354
    }
1355
1356
    // Contribution of samples 12 and 25, added to or subtracted from the a[][] based outputs.
    t[0] = iT[12] * src[12] + iT[25] * src[25];
1357
    t[1] = iT[12] * src[25] - iT[25] * src[12];
1358
1359
    // Output rows built from 24 products on a[][] terms plus or minus t[0] / t[1].
    dst[ 0 * line] = ( iT[0] * a[3][0] + iT[11] * a[6][5] + iT[13] * a[8][0] + iT[24] * a[9][5] + iT[1] * a[3][1] + iT[10] * a[6][4] + iT[14] * a[8][1] + iT[23] * a[9][4] + iT[2] * a[3][2] + iT[9] * a[6][3] + iT[15] * a[8][2] + iT[22] * a[9][3] + iT[3] * a[3][3] + iT[8] * a[6][2] + iT[16] * a[8][3] + iT[21] * a[9][2] + iT[4] * a[3][4] + iT[7] * a[6][1] + iT[17] * a[8][4] + iT[20] * a[9][1] + iT[5] * a[3][5] + iT[6] * a[6][0] + iT[18] * a[8][5] + iT[19] * a[9][0] + t[0] + add) >> shift;
1360
    dst[ 1 * line] = (-iT[0] * a[5][2] + iT[11] * a[0][3] + iT[13] * a[4][2] + iT[24] * a[6][2] + iT[1] * a[9][1] + iT[10] * a[8][4] + iT[14] * a[3][4] + iT[23] * a[6][1] + iT[2] * a[0][0] - iT[9] * a[5][5] + iT[15] * a[6][5] + iT[22] * a[4][5] - iT[3] * a[5][3] + iT[8] * a[0][2] + iT[16] * a[4][3] + iT[21] * a[6][3] + iT[4] * a[9][0] + iT[7] * a[8][5] + iT[17] * a[3][5] + iT[20] * a[6][0] + iT[5] * a[0][1] - iT[6] * a[5][4] + iT[18] * a[6][4] + iT[19] * a[4][4] - t[1] + add) >> shift;
1361
    dst[ 3 * line] = (-iT[0] * a[9][4] - iT[11] * a[5][4] + iT[13] * a[2][1] - iT[24] * a[7][1] - iT[1] * a[0][3] - iT[10] * a[1][3] + iT[14] * a[3][3] + iT[23] * a[2][3] + iT[2] * a[8][5] + iT[9] * a[9][0] + iT[15] * a[6][0] + iT[22] * a[3][5] - iT[3] * a[1][4] - iT[8] * a[0][4] + iT[16] * a[2][4] + iT[21] * a[3][4] - iT[4] * a[5][3] - iT[7] * a[9][3] - iT[17] * a[7][2] + iT[20] * a[2][2] + iT[5] * a[8][0] + iT[6] * a[1][0] - iT[18] * a[4][5] - iT[19] * a[7][0] + t[1] + add) >> shift;
1362
    dst[ 4 * line] = (-iT[0] * a[3][2] - iT[11] * a[2][2] + iT[13] * a[1][2] + iT[24] * a[0][2] + iT[1] * a[6][0] + iT[10] * a[3][5] + iT[14] * a[9][0] + iT[23] * a[8][5] - iT[2] * a[2][3] - iT[9] * a[3][3] + iT[15] * a[0][3] + iT[22] * a[1][3] - iT[3] * a[7][0] + iT[8] * a[2][0] - iT[16] * a[9][5] - iT[21] * a[5][5] + iT[4] * a[4][4] + iT[7] * a[6][4] + iT[17] * a[0][1] - iT[20] * a[5][4] - iT[5] * a[7][4] - iT[6] * a[4][1] + iT[18] * a[8][4] + iT[19] * a[1][4] - t[0] + add) >> shift;
1363
    dst[ 5 * line] = (-iT[0] * a[3][5] - iT[11] * a[6][0] - iT[13] * a[8][5] - iT[24] * a[9][0] + iT[1] * a[6][5] + iT[10] * a[3][0] + iT[14] * a[9][5] + iT[23] * a[8][0] - iT[2] * a[7][4] + iT[9] * a[2][4] - iT[15] * a[9][1] - iT[22] * a[5][1] - iT[3] * a[7][1] - iT[8] * a[4][4] + iT[16] * a[8][1] + iT[21] * a[1][1] + iT[4] * a[6][2] + iT[7] * a[4][2] - iT[17] * a[5][2] + iT[20] * a[0][3] - iT[5] * a[3][2] - iT[6] * a[2][2] + iT[18] * a[1][2] + iT[19] * a[0][2] + t[0] + add) >> shift;
1364
    dst[ 8 * line] = ( iT[0] * a[9][3] + iT[11] * a[8][2] + iT[13] * a[3][2] + iT[24] * a[6][3] + iT[1] * a[1][5] + iT[10] * a[0][5] - iT[14] * a[2][5] - iT[23] * a[3][5] - iT[2] * a[1][3] - iT[9] * a[8][3] + iT[15] * a[7][3] + iT[22] * a[4][2] - iT[3] * a[9][5] - iT[8] * a[5][5] + iT[16] * a[2][0] - iT[21] * a[7][0] - iT[4] * a[1][1] - iT[7] * a[0][1] + iT[17] * a[2][1] + iT[20] * a[3][1] + iT[5] * a[5][1] + iT[6] * a[9][1] + iT[18] * a[7][4] - iT[19] * a[2][4] + t[1] + add) >> shift;
1365
    dst[ 9 * line] = (-iT[0] * a[2][1] - iT[11] * a[3][1] + iT[13] * a[0][1] + iT[24] * a[1][1] + iT[1] * a[7][3] - iT[10] * a[2][3] + iT[14] * a[9][2] + iT[23] * a[5][2] + iT[2] * a[4][0] + iT[9] * a[7][5] - iT[15] * a[1][5] - iT[22] * a[8][5] + iT[3] * a[3][4] + iT[8] * a[2][4] - iT[16] * a[1][4] - iT[21] * a[0][4] + iT[4] * a[6][3] + iT[7] * a[3][2] + iT[17] * a[9][3] + iT[20] * a[8][2] + iT[5] * a[4][5] + iT[6] * a[6][5] + iT[18] * a[0][0] - iT[19] * a[5][5] - t[0] + add) >> shift;
1366
    dst[10 * line] = (-iT[0] * a[6][1] - iT[11] * a[4][1] + iT[13] * a[5][1] - iT[24] * a[0][4] + iT[1] * a[2][2] - iT[10] * a[7][2] - iT[14] * a[5][3] - iT[23] * a[9][3] + iT[2] * a[6][4] + iT[9] * a[4][4] - iT[15] * a[5][4] + iT[22] * a[0][1] - iT[3] * a[2][5] + iT[8] * a[7][5] + iT[16] * a[5][0] + iT[21] * a[9][0] - iT[4] * a[7][0] - iT[7] * a[4][5] + iT[17] * a[8][0] + iT[20] * a[1][0] + iT[5] * a[4][2] + iT[6] * a[7][3] - iT[18] * a[1][3] - iT[19] * a[8][3] + t[0] + add) >> shift;
1367
    dst[11 * line] = ( iT[0] * a[1][3] + iT[11] * a[0][3] - iT[13] * a[2][3] - iT[24] * a[3][3] + iT[1] * a[9][1] + iT[10] * a[5][1] - iT[14] * a[2][4] + iT[23] * a[7][4] + iT[2] * a[8][0] + iT[9] * a[9][5] + iT[15] * a[6][5] + iT[22] * a[3][0] - iT[3] * a[0][2] + iT[8] * a[5][3] - iT[16] * a[6][3] - iT[21] * a[4][3] - iT[4] * a[5][0] + iT[7] * a[0][5] + iT[17] * a[4][0] + iT[20] * a[6][0] - iT[5] * a[9][4] - iT[6] * a[5][4] + iT[18] * a[2][1] - iT[19] * a[7][1] - t[1] + add) >> shift;
1368
    dst[13 * line] = (-iT[0] * a[0][0] - iT[11] * a[1][0] + iT[13] * a[3][0] + iT[24] * a[2][0] - iT[1] * a[5][4] + iT[10] * a[0][1] + iT[14] * a[4][4] + iT[23] * a[6][4] + iT[2] * a[9][3] + iT[9] * a[5][3] - iT[15] * a[2][2] + iT[22] * a[7][2] - iT[3] * a[8][3] - iT[8] * a[9][2] - iT[16] * a[6][2] - iT[21] * a[3][3] + iT[4] * a[1][4] + iT[7] * a[8][4] - iT[17] * a[7][4] - iT[20] * a[4][1] - iT[5] * a[0][5] - iT[6] * a[1][5] + iT[18] * a[3][5] + iT[19] * a[2][5] + t[1] + add) >> shift;
1369
    dst[14 * line] = ( iT[0] * a[4][2] + iT[11] * a[7][3] - iT[13] * a[1][3] - iT[24] * a[8][3] + iT[1] * a[4][1] + iT[10] * a[6][1] + iT[14] * a[0][4] - iT[23] * a[5][1] - iT[2] * a[3][0] - iT[9] * a[2][0] + iT[15] * a[1][0] + iT[22] * a[0][0] - iT[3] * a[6][3] - iT[8] * a[4][3] + iT[16] * a[5][3] - iT[21] * a[0][2] - iT[4] * a[7][5] - iT[7] * a[4][0] + iT[17] * a[8][5] + iT[20] * a[1][5] + iT[5] * a[6][4] + iT[6] * a[3][1] + iT[18] * a[9][4] + iT[19] * a[8][1] - t[0] + add) >> shift;
1370
    dst[15 * line] = (-iT[0] * a[7][4] - iT[11] * a[4][1] + iT[13] * a[8][4] + iT[24] * a[1][4] + iT[1] * a[2][2] + iT[10] * a[3][2] - iT[14] * a[0][2] - iT[23] * a[1][2] + iT[2] * a[2][1] - iT[9] * a[7][1] - iT[15] * a[5][4] - iT[22] * a[9][4] - iT[3] * a[7][5] + iT[8] * a[2][5] - iT[16] * a[9][0] - iT[21] * a[5][0] - iT[4] * a[2][0] - iT[7] * a[3][0] + iT[17] * a[0][0] + iT[20] * a[1][0] - iT[5] * a[2][3] + iT[6] * a[7][3] + iT[18] * a[5][2] + iT[19] * a[9][2] + t[0] + add) >> shift;
1371
    dst[16 * line] = (-iT[0] * a[0][1] + iT[11] * a[5][4] - iT[13] * a[6][4] - iT[24] * a[4][4] + iT[1] * a[0][3] - iT[10] * a[5][2] + iT[14] * a[6][2] + iT[23] * a[4][2] - iT[2] * a[0][5] + iT[9] * a[5][0] - iT[15] * a[6][0] - iT[22] * a[4][0] - iT[3] * a[0][4] - iT[8] * a[1][4] + iT[16] * a[3][4] + iT[21] * a[2][4] + iT[4] * a[0][2] + iT[7] * a[1][2] - iT[17] * a[3][2] - iT[20] * a[2][2] - iT[5] * a[0][0] - iT[6] * a[1][0] + iT[18] * a[3][0] + iT[19] * a[2][0] - t[1] + add) >> shift;
1372
    dst[18 * line] = ( iT[0] * a[0][5] + iT[11] * a[1][5] - iT[13] * a[3][5] - iT[24] * a[2][5] - iT[1] * a[1][0] - iT[10] * a[0][0] + iT[14] * a[2][0] + iT[23] * a[3][0] - iT[2] * a[5][1] + iT[9] * a[0][4] + iT[15] * a[4][1] + iT[22] * a[6][1] - iT[3] * a[8][1] - iT[8] * a[1][1] + iT[16] * a[4][4] + iT[21] * a[7][1] - iT[4] * a[9][2] - iT[7] * a[5][2] + iT[17] * a[2][3] - iT[20] * a[7][3] - iT[5] * a[9][3] - iT[6] * a[8][2] - iT[18] * a[3][2] - iT[19] * a[6][3] + t[1] + add) >> shift;
1373
    dst[20 * line] = (-iT[0] * a[4][0] - iT[11] * a[6][0] - iT[13] * a[0][5] + iT[24] * a[5][0] + iT[1] * a[6][5] + iT[10] * a[4][5] - iT[14] * a[5][5] + iT[23] * a[0][0] - iT[2] * a[6][1] - iT[9] * a[3][4] - iT[15] * a[9][1] - iT[22] * a[8][4] + iT[3] * a[4][4] + iT[8] * a[7][1] - iT[16] * a[1][1] - iT[21] * a[8][1] - iT[4] * a[3][3] - iT[7] * a[2][3] + iT[17] * a[1][3] + iT[20] * a[0][3] + iT[5] * a[7][2] - iT[6] * a[2][2] + iT[18] * a[9][3] + iT[19] * a[5][3] + t[0] + add) >> shift;
1374
    dst[21 * line] = (-iT[0] * a[1][2] - iT[11] * a[8][2] + iT[13] * a[7][2] + iT[24] * a[4][3] - iT[1] * a[1][5] - iT[10] * a[8][5] + iT[14] * a[7][5] + iT[23] * a[4][0] - iT[2] * a[5][2] - iT[9] * a[9][2] - iT[15] * a[7][3] + iT[22] * a[2][3] - iT[3] * a[5][5] - iT[8] * a[9][5] - iT[16] * a[7][0] + iT[21] * a[2][0] - iT[4] * a[8][1] - iT[7] * a[9][4] - iT[17] * a[6][4] - iT[20] * a[3][1] - iT[5] * a[8][4] - iT[6] * a[9][1] - iT[18] * a[6][1] - iT[19] * a[3][4] - t[1] + add) >> shift;
1375
    dst[23 * line] = (-iT[0] * a[8][4] - iT[11] * a[9][1] - iT[13] * a[6][1] - iT[24] * a[3][4] + iT[1] * a[8][2] + iT[10] * a[1][2] - iT[14] * a[4][3] - iT[23] * a[7][2] + iT[2] * a[0][1] + iT[9] * a[1][1] - iT[15] * a[3][1] - iT[22] * a[2][1] - iT[3] * a[5][0] - iT[8] * a[9][0] - iT[16] * a[7][5] + iT[21] * a[2][5] + iT[4] * a[9][5] + iT[7] * a[8][0] + iT[17] * a[3][0] + iT[20] * a[6][5] - iT[5] * a[5][2] + iT[6] * a[0][3] + iT[18] * a[4][2] + iT[19] * a[6][2] + t[1] + add) >> shift;
1376
    dst[24 * line] = (-iT[0] * a[2][3] + iT[11] * a[7][3] + iT[13] * a[5][2] + iT[24] * a[9][2] + iT[1] * a[4][1] + iT[10] * a[7][4] - iT[14] * a[1][4] - iT[23] * a[8][4] - iT[2] * a[4][5] - iT[9] * a[7][0] + iT[15] * a[1][0] + iT[22] * a[8][0] + iT[3] * a[4][3] + iT[8] * a[6][3] + iT[16] * a[0][2] - iT[21] * a[5][3] - iT[4] * a[2][5] - iT[7] * a[3][5] + iT[17] * a[0][5] + iT[20] * a[1][5] + iT[5] * a[2][1] + iT[6] * a[3][1] - iT[18] * a[0][1] - iT[19] * a[1][1] - t[0] + add) >> shift;
1377
    dst[25 * line] = ( iT[0] * a[4][5] + iT[11] * a[6][5] + iT[13] * a[0][0] - iT[24] * a[5][5] + iT[1] * a[3][1] + iT[10] * a[2][1] - iT[14] * a[1][1] - iT[23] * a[0][1] - iT[2] * a[7][2] - iT[9] * a[4][3] + iT[15] * a[8][2] + iT[22] * a[1][2] - iT[3] * a[6][2] - iT[8] * a[3][3] - iT[16] * a[9][2] - iT[21] * a[8][3] - iT[4] * a[2][4] + iT[7] * a[7][4] + iT[17] * a[5][1] + iT[20] * a[9][1] + iT[5] * a[4][0] + iT[6] * a[6][0] + iT[18] * a[0][5] - iT[19] * a[5][0] + t[0] + add) >> shift;
1378
    dst[26 * line] = ( iT[0] * a[8][0] + iT[11] * a[1][0] - iT[13] * a[4][5] - iT[24] * a[7][0] + iT[1] * a[5][4] + iT[10] * a[9][4] + iT[14] * a[7][1] - iT[23] * a[2][1] - iT[2] * a[1][2] - iT[9] * a[0][2] + iT[15] * a[2][2] + iT[22] * a[3][2] - iT[3] * a[9][2] - iT[8] * a[8][3] - iT[16] * a[3][3] - iT[21] * a[6][2] + iT[4] * a[0][4] - iT[7] * a[5][1] + iT[17] * a[6][1] + iT[20] * a[4][1] + iT[5] * a[8][5] + iT[6] * a[1][5] - iT[18] * a[4][0] - iT[19] * a[7][5] - t[1] + add) >> shift;
1379
    dst[28 * line] = (-iT[0] * a[5][1] - iT[11] * a[9][1] - iT[13] * a[7][4] + iT[24] * a[2][4] + iT[1] * a[8][2] + iT[10] * a[9][3] + iT[14] * a[6][3] + iT[23] * a[3][2] - iT[2] * a[9][4] - iT[9] * a[8][1] - iT[15] * a[3][1] - iT[22] * a[6][4] + iT[3] * a[9][0] + iT[8] * a[5][0] - iT[16] * a[2][5] + iT[21] * a[7][5] - iT[4] * a[5][5] + iT[7] * a[0][0] + iT[17] * a[4][5] + iT[20] * a[6][5] + iT[5] * a[1][3] + iT[6] * a[0][3] - iT[18] * a[2][3] - iT[19] * a[3][3] + t[1] + add) >> shift;
1380
    dst[29 * line] = (-iT[0] * a[6][4] - iT[11] * a[3][1] - iT[13] * a[9][4] - iT[24] * a[8][1] + iT[1] * a[7][3] + iT[10] * a[4][2] - iT[14] * a[8][3] - iT[23] * a[1][3] + iT[2] * a[3][5] + iT[9] * a[2][5] - iT[15] * a[1][5] - iT[22] * a[0][5] - iT[3] * a[2][4] - iT[8] * a[3][4] + iT[16] * a[0][4] + iT[21] * a[1][4] - iT[4] * a[4][3] - iT[7] * a[7][2] + iT[17] * a[1][2] + iT[20] * a[8][2] + iT[5] * a[3][0] + iT[6] * a[6][5] + iT[18] * a[8][0] + iT[19] * a[9][5] - t[0] + add) >> shift;
1381
    dst[30 * line] = (-iT[0] * a[7][2] + iT[11] * a[2][2] - iT[13] * a[9][3] - iT[24] * a[5][3] - iT[1] * a[6][0] - iT[10] * a[4][0] + iT[14] * a[5][0] - iT[23] * a[0][5] - iT[2] * a[4][2] - iT[9] * a[6][2] - iT[15] * a[0][3] + iT[22] * a[5][2] + iT[3] * a[2][0] - iT[8] * a[7][0] - iT[16] * a[5][5] - iT[21] * a[9][5] + iT[4] * a[7][1] - iT[7] * a[2][1] + iT[17] * a[9][4] + iT[20] * a[5][4] + iT[5] * a[6][1] + iT[6] * a[4][1] - iT[18] * a[5][1] + iT[19] * a[0][4] + t[0] + add) >> shift;
1382
    dst[31 * line] = (-iT[0] * a[8][5] - iT[11] * a[1][5] + iT[13] * a[4][0] + iT[24] * a[7][5] + iT[1] * a[1][0] + iT[10] * a[8][0] - iT[14] * a[7][0] - iT[23] * a[4][5] + iT[2] * a[8][4] + iT[9] * a[1][4] - iT[15] * a[4][1] - iT[22] * a[7][4] - iT[3] * a[1][1] - iT[8] * a[8][1] + iT[16] * a[7][1] + iT[21] * a[4][4] - iT[4] * a[8][3] - iT[7] * a[1][3] + iT[17] * a[4][2] + iT[20] * a[7][3] + iT[5] * a[1][2] + iT[6] * a[8][2] - iT[18] * a[7][2] - iT[19] * a[4][3] - t[1] + add) >> shift;
1383
1384
    // Output rows 2, 7, 12, 17, 22, 27 depend on b[] only (six products each).
    dst[ 2 * line] = (iT[ 4]*b[0] + iT[ 9]*b[1] + iT[14]*b[2] + iT[19]*b[3] + iT[24]*b[4] + iT[29]*b[5] + add) >> shift;
1385
    dst[ 7 * line] = (iT[14]*b[0] + iT[29]*b[1] + iT[19]*b[2] + iT[ 4]*b[3] - iT[ 9]*b[4] - iT[24]*b[5] + add) >> shift;
1386
    dst[12 * line] = (iT[24]*b[0] + iT[14]*b[1] - iT[ 9]*b[2] - iT[29]*b[3] - iT[ 4]*b[4] + iT[19]*b[5] + add) >> shift;
1387
    dst[17 * line] = (iT[29]*b[0] - iT[ 4]*b[1] - iT[24]*b[2] + iT[ 9]*b[3] + iT[19]*b[4] - iT[14]*b[5] + add) >> shift;
1388
    dst[22 * line] = (iT[19]*b[0] - iT[24]*b[1] + iT[ 4]*b[2] + iT[14]*b[3] - iT[29]*b[4] + iT[ 9]*b[5] + add) >> shift;
1389
    dst[27 * line] = (iT[ 9]*b[0] - iT[19]*b[1] + iT[29]*b[2] - iT[24]*b[3] + iT[14]*b[4] - iT[ 4]*b[5] + add) >> shift;
1390
1391
    // Output rows 6 and 19 depend on c[] only (two products each).
    dst[ 6 * line] = (iT[12]*c[0] + iT[25]*c[1] + add) >> shift;
1392
    dst[19 * line] = (iT[25]*c[0] - iT[12]*c[1] + add) >> shift;
1393
1394
    // Advance to the next transform: next 32 input samples, next output column.
    src += 32;
1395
    dst++;
1396
  }
1397
1398
  // Zero the skipped output columns in each of the first `cutoff` coefficient rows.
  if (iSkipLine)
1399
  {
1400
    dst = pCoef + reducedLine;
1401
    for (j = 0; j < cutoff; j++)
1402
    {
1403
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
1404
      dst += line;
1405
    }
1406
  }
1407
1408
  // Zero the last iSkipLine2 coefficient rows completely (all `line` columns).
  if (iSkipLine2)
1409
  {
1410
    dst = pCoef + line * cutoff;
1411
    memset(dst, 0, sizeof(TCoeff) * line * iSkipLine2);
1412
  }
1413
#else
1414
0
  // Generic matrix-multiplication implementation with the same coefficient table.
  _fastForwardMM< 32 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDST7P32[TRANSFORM_FORWARD][0] );
1415
0
#endif
1416
0
}
1417
1418
1419
// 32-point inverse transform using the g_trCoreDST7P32[TRANSFORM_INVERSE] table.
//
// src            input coefficients; the 32 values of one transform are read with a
//                stride of `line` (src[n * line]), and src advances by one per transform.
// dst            output; 32 consecutive values are written per transform.
// shift          right shift applied to every result; `add` supplies the rounding offset.
// line           number of transforms and, at the same time, the input stride.
// iSkipLine      number of trailing transforms that are not computed; their output is zeroed.
// iSkipLine2     only forwarded to _fastInverseMM; not referenced by the explicit path.
// outputMinimum, outputMaximum
//                clipping range applied to every output value (explicit path: via Clip3).
void fastInverseDST7_B32(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1420
0
{
1421
#if !JVET_M0497_MATRIX_MULT
1422
  int j, k;
1423
  TCoeff a[10][6];
1424
  TCoeff t[2];
1425
  TCoeff b[6];
1426
  TCoeff c[2];
1427
1428
  // Rounding offset: half of the divisor implied by `shift`; 0 when no shift is applied.
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
1429
  const TMatrixCoeff *iT = g_trCoreDST7P32[TRANSFORM_INVERSE][0];
1430
  const int  reducedLine = line - iSkipLine;
1431
1432
  for (j = 0; j < reducedLine; j++)
1433
  {
1434
    // Pairwise sums/differences of input rows that are reused by many outputs.
    // Rows 6 and 19 are not part of a[][] or b[]; they enter through t[] below.
    for (k = 0; k < 6; k++)
1435
    {
1436
      a[0][k] = src[      k  * line] + src[(12 - k) * line];
1437
      a[1][k] = src[      k  * line] - src[(13 + k) * line];
1438
      a[2][k] = src[      k  * line] + src[(25 - k) * line];
1439
      a[3][k] = src[      k  * line] - src[(26 + k) * line];
1440
      a[4][k] = src[( 7 + k) * line] + src[(18 - k) * line];
1441
      a[5][k] = src[( 7 + k) * line] - src[(20 + k) * line];
1442
      a[6][k] = src[( 7 + k) * line] + src[(31 - k) * line];
1443
      a[7][k] = src[(13 + k) * line] + src[(25 - k) * line];
1444
      a[8][k] = src[(13 + k) * line] - src[(26 + k) * line];
1445
      a[9][k] = src[(20 + k) * line] + src[(31 - k) * line];
1446
1447
      // Five-term combination feeding outputs 4, 9, 14, 19, 24 and 29.
      b[k] = src[k * line] - src[(12-k) * line] + src[(13+k) * line] - src[(25-k) * line] + src[(26+k) * line];
1448
    }
1449
    // Thirteen-term signed sums feeding outputs 12 and 25.
    for (k = 0; k < 2; k++)
1450
    {
1451
      c[k] = src[k * line] - src[(4-k) * line] + src[(5+k) * line] - src[(9-k) * line] + src[(10+k) * line] - src[(14-k) * line] + src[(15+k)*line] - src[(19-k)*line] + src[(20+k)*line] - src[(24-k)*line] + src[(25+k)*line] - src[(29-k)*line] + src[(30+k)*line];
1452
    }
1453
1454
    // Contribution of input rows 6 and 19, added to or subtracted from the a[][] based outputs.
    t[0] = iT[12] * src[6*line] + iT[25] * src[19*line];
1455
    t[1] = iT[25] * src[6*line] - iT[12] * src[19*line];
1456
1457
    // Outputs built from 24 products on a[][] terms plus or minus t[0] / t[1].
    // Note: the (int) cast binds tighter than >>, so the sum is cast first and then shifted.
    dst[ 0] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[1][0] - iT[11] * a[8][0] + iT[13] * a[7][0] + iT[24] * a[4][5] - iT[1] * a[8][5] + iT[10] * a[1][5] + iT[14] * a[4][0] + iT[23] * a[7][5] + iT[2] * a[1][1] - iT[9] * a[8][1] + iT[15] * a[7][1] + iT[22] * a[4][4] - iT[3] * a[8][4] + iT[8] * a[1][4] + iT[16] * a[4][1] + iT[21] * a[7][4] + iT[4] * a[1][2] - iT[7] * a[8][2] + iT[17] * a[7][2] + iT[20] * a[4][3] - iT[5] * a[8][3] + iT[6] * a[1][3] + iT[18] * a[4][2] + iT[19] * a[7][3] + t[0] + add) >> shift);
1458
    dst[ 1] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[4][2] - iT[11] * a[6][2] + iT[13] * a[0][3] + iT[24] * a[5][2] + iT[1] * a[2][0] + iT[10] * a[7][0] + iT[14] * a[5][5] - iT[23] * a[9][5] + iT[2] * a[7][2] + iT[9] * a[2][2] - iT[15] * a[9][3] + iT[22] * a[5][3] - iT[3] * a[6][0] - iT[8] * a[4][0] + iT[16] * a[5][0] + iT[21] * a[0][5] - iT[4] * a[4][1] - iT[7] * a[6][1] + iT[17] * a[0][4] + iT[20] * a[5][1] + iT[5] * a[2][1] + iT[6] * a[7][1] + iT[18] * a[5][4] - iT[19] * a[9][4] + t[1] + add) >> shift);
1459
    dst[ 2] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[2][4] - iT[11] * a[3][4] + iT[13] * a[0][4] + iT[24] * a[1][4] + iT[1] * a[4][3] + iT[10] * a[7][2] + iT[14] * a[1][2] - iT[23] * a[8][2] + iT[2] * a[3][0] - iT[9] * a[6][5] - iT[15] * a[8][0] + iT[22] * a[9][5] - iT[3] * a[6][4] + iT[8] * a[3][1] + iT[16] * a[9][4] - iT[21] * a[8][1] + iT[4] * a[7][3] + iT[7] * a[4][2] - iT[17] * a[8][3] + iT[20] * a[1][3] - iT[5] * a[3][5] - iT[6] * a[2][5] + iT[18] * a[1][5] + iT[19] * a[0][5] + t[1] + add) >> shift);
1460
    dst[ 3] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[5][4] + iT[11] * a[0][1] - iT[13] * a[4][4] - iT[24] * a[6][4] - iT[1] * a[1][3] - iT[10] * a[0][3] + iT[14] * a[2][3] + iT[23] * a[3][3] - iT[2] * a[0][4] - iT[9] * a[1][4] + iT[15] * a[3][4] + iT[22] * a[2][4] + iT[3] * a[0][0] + iT[8] * a[5][5] - iT[16] * a[6][5] - iT[21] * a[4][5] + iT[4] * a[5][0] - iT[7] * a[9][0] + iT[17] * a[7][5] + iT[20] * a[2][5] - iT[5] * a[8][2] + iT[6] * a[9][3] - iT[18] * a[6][3] + iT[19] * a[3][2] + t[0] + add) >> shift);
1461
    dst[ 5] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[1][5] + iT[11] * a[8][5] - iT[13] * a[7][5] - iT[24] * a[4][0] + iT[1] * a[5][1] + iT[10] * a[0][4] - iT[14] * a[4][1] - iT[23] * a[6][1] - iT[2] * a[8][3] + iT[9] * a[9][2] - iT[15] * a[6][2] + iT[22] * a[3][3] - iT[3] * a[0][2] - iT[8] * a[1][2] + iT[16] * a[3][2] + iT[21] * a[2][2] - iT[4] * a[9][4] + iT[7] * a[5][4] + iT[17] * a[2][1] + iT[20] * a[7][1] + iT[5] * a[1][0] - iT[6] * a[8][0] + iT[18] * a[7][0] + iT[19] * a[4][5] - t[0] + add) >> shift);
1462
    dst[ 6] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[7][5] - iT[11] * a[2][5] + iT[13] * a[9][0] - iT[24] * a[5][0] + iT[1] * a[3][4] - iT[10] * a[6][1] - iT[14] * a[8][4] + iT[23] * a[9][1] + iT[2] * a[4][2] + iT[9] * a[7][3] + iT[15] * a[1][3] - iT[22] * a[8][3] - iT[3] * a[2][2] - iT[8] * a[3][2] + iT[16] * a[0][2] + iT[21] * a[1][2] - iT[4] * a[6][4] - iT[7] * a[4][4] + iT[17] * a[5][4] + iT[20] * a[0][1] + iT[5] * a[7][0] + iT[6] * a[2][0] - iT[18] * a[9][5] + iT[19] * a[5][5] - t[1] + add) >> shift);
1463
    dst[ 7] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[6][3] - iT[11] * a[4][3] + iT[13] * a[5][3] + iT[24] * a[0][2] + iT[1] * a[7][1] + iT[10] * a[4][4] - iT[14] * a[8][1] + iT[23] * a[1][1] - iT[2] * a[7][5] - iT[9] * a[4][0] + iT[15] * a[8][5] - iT[22] * a[1][5] + iT[3] * a[7][3] + iT[8] * a[2][3] - iT[16] * a[9][2] + iT[21] * a[5][2] - iT[4] * a[6][5] + iT[7] * a[3][0] + iT[17] * a[9][5] - iT[20] * a[8][0] + iT[5] * a[6][1] - iT[6] * a[3][4] - iT[18] * a[9][1] + iT[19] * a[8][4] - t[1] + add) >> shift);
1464
    dst[ 8] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[1][1] - iT[11] * a[0][1] + iT[13] * a[2][1] + iT[24] * a[3][1] + iT[1] * a[1][3] - iT[10] * a[8][3] + iT[14] * a[7][3] + iT[23] * a[4][2] - iT[2] * a[9][1] + iT[9] * a[8][4] - iT[15] * a[3][4] + iT[22] * a[6][1] + iT[3] * a[5][5] + iT[8] * a[0][0] - iT[16] * a[4][5] - iT[21] * a[6][5] + iT[4] * a[0][5] + iT[7] * a[1][5] - iT[17] * a[3][5] - iT[20] * a[2][5] + iT[5] * a[5][3] - iT[6] * a[9][3] + iT[18] * a[7][2] + iT[19] * a[2][2] - t[0] + add) >> shift);
1465
    dst[10] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[8][3] - iT[11] * a[1][3] - iT[13] * a[4][2] - iT[24] * a[7][3] - iT[1] * a[8][0] + iT[10] * a[1][0] + iT[14] * a[4][5] + iT[23] * a[7][0] + iT[2] * a[5][3] + iT[9] * a[0][2] - iT[15] * a[4][3] - iT[22] * a[6][3] - iT[3] * a[5][0] - iT[8] * a[0][5] + iT[16] * a[4][0] + iT[21] * a[6][0] + iT[4] * a[1][4] + iT[7] * a[0][4] - iT[17] * a[2][4] - iT[20] * a[3][4] - iT[5] * a[1][1] - iT[6] * a[0][1] + iT[18] * a[2][1] + iT[19] * a[3][1] + t[0] + add) >> shift);
1466
    dst[11] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[7][0] + iT[11] * a[2][0] - iT[13] * a[9][5] + iT[24] * a[5][5] + iT[1] * a[2][5] + iT[10] * a[7][5] + iT[14] * a[5][0] - iT[23] * a[9][0] - iT[2] * a[2][1] - iT[9] * a[3][1] + iT[15] * a[0][1] + iT[22] * a[1][1] - iT[3] * a[7][4] - iT[8] * a[4][1] + iT[16] * a[8][4] - iT[21] * a[1][4] + iT[4] * a[3][2] - iT[7] * a[6][3] - iT[17] * a[8][2] + iT[20] * a[9][3] + iT[5] * a[4][2] + iT[6] * a[6][2] - iT[18] * a[0][3] - iT[19] * a[5][2] + t[1] + add) >> shift);
1467
    dst[13] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[9][5] - iT[11] * a[8][0] + iT[13] * a[3][0] - iT[24] * a[6][5] - iT[1] * a[8][5] + iT[10] * a[9][0] - iT[14] * a[6][0] + iT[23] * a[3][5] + iT[2] * a[5][4] - iT[9] * a[9][4] + iT[15] * a[7][1] + iT[22] * a[2][1] - iT[3] * a[1][4] + iT[8] * a[8][4] - iT[16] * a[7][4] - iT[21] * a[4][1] - iT[4] * a[0][2] - iT[7] * a[5][3] + iT[17] * a[6][3] + iT[20] * a[4][3] + iT[5] * a[0][3] + iT[6] * a[1][3] - iT[18] * a[3][3] - iT[19] * a[2][3] + t[0] + add) >> shift);
1468
    dst[15] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[9][1] + iT[11] * a[5][1] + iT[13] * a[2][4] + iT[24] * a[7][4] + iT[1] * a[9][3] - iT[10] * a[5][3] - iT[14] * a[2][2] - iT[23] * a[7][2] - iT[2] * a[9][5] + iT[9] * a[5][5] + iT[15] * a[2][0] + iT[22] * a[7][0] + iT[3] * a[9][4] - iT[8] * a[8][1] + iT[16] * a[3][1] - iT[21] * a[6][4] - iT[4] * a[9][2] + iT[7] * a[8][3] - iT[17] * a[3][3] + iT[20] * a[6][2] + iT[5] * a[9][0] - iT[6] * a[8][5] + iT[18] * a[3][5] - iT[19] * a[6][0] - t[0] + add) >> shift);
1469
    dst[16] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[4][4] + iT[11] * a[7][1] + iT[13] * a[1][1] - iT[24] * a[8][1] + iT[1] * a[6][2] - iT[10] * a[3][3] - iT[14] * a[9][2] + iT[23] * a[8][3] - iT[2] * a[6][1] - iT[9] * a[4][1] + iT[15] * a[5][1] + iT[22] * a[0][4] - iT[3] * a[4][5] - iT[8] * a[6][5] + iT[16] * a[0][0] + iT[21] * a[5][5] - iT[4] * a[6][0] + iT[7] * a[3][5] + iT[17] * a[9][0] - iT[20] * a[8][5] + iT[5] * a[6][3] + iT[6] * a[4][3] - iT[18] * a[5][3] - iT[19] * a[0][2] - t[1] + add) >> shift);
1470
    dst[17] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[7][2] - iT[11] * a[4][3] + iT[13] * a[8][2] - iT[24] * a[1][2] + iT[1] * a[7][1] + iT[10] * a[2][1] - iT[14] * a[9][4] + iT[23] * a[5][4] - iT[2] * a[3][5] + iT[9] * a[6][0] + iT[15] * a[8][5] - iT[22] * a[9][0] - iT[3] * a[2][3] - iT[8] * a[7][3] - iT[16] * a[5][2] + iT[21] * a[9][2] + iT[4] * a[4][5] + iT[7] * a[7][0] + iT[17] * a[1][0] - iT[20] * a[8][0] - iT[5] * a[2][4] - iT[6] * a[3][4] + iT[18] * a[0][4] + iT[19] * a[1][4] - t[1] + add) >> shift);
1471
    dst[18] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[9][0] + iT[11] * a[8][5] - iT[13] * a[3][5] + iT[24] * a[6][0] + iT[1] * a[5][1] - iT[10] * a[9][1] + iT[14] * a[7][4] + iT[23] * a[2][4] + iT[2] * a[0][3] + iT[9] * a[5][2] - iT[15] * a[6][2] - iT[22] * a[4][2] + iT[3] * a[1][2] + iT[8] * a[0][2] - iT[16] * a[2][2] - iT[21] * a[3][2] - iT[4] * a[8][1] + iT[7] * a[1][1] + iT[17] * a[4][4] + iT[20] * a[7][1] + iT[5] * a[9][5] - iT[6] * a[8][0] + iT[18] * a[3][0] - iT[19] * a[6][5] - t[0] + add) >> shift);
1472
    dst[20] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[8][2] - iT[11] * a[9][3] + iT[13] * a[6][3] - iT[24] * a[3][2] + iT[1] * a[0][1] + iT[10] * a[5][4] - iT[14] * a[6][4] - iT[23] * a[4][4] + iT[2] * a[1][5] + iT[9] * a[0][5] - iT[15] * a[2][5] - iT[22] * a[3][5] - iT[3] * a[9][2] + iT[8] * a[5][2] + iT[16] * a[2][3] + iT[21] * a[7][3] + iT[4] * a[5][5] - iT[7] * a[9][5] + iT[17] * a[7][0] + iT[20] * a[2][0] + iT[5] * a[0][4] + iT[6] * a[5][1] - iT[18] * a[6][1] - iT[19] * a[4][1] + t[0] + add) >> shift);
1473
    dst[21] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[2][1] - iT[11] * a[7][1] - iT[13] * a[5][4] + iT[24] * a[9][4] - iT[1] * a[6][2] - iT[10] * a[4][2] + iT[14] * a[5][2] + iT[23] * a[0][3] - iT[2] * a[2][4] - iT[9] * a[7][4] - iT[15] * a[5][1] + iT[22] * a[9][1] - iT[3] * a[6][5] - iT[8] * a[4][5] + iT[16] * a[5][5] + iT[21] * a[0][0] - iT[4] * a[4][0] - iT[7] * a[7][5] - iT[17] * a[1][5] + iT[20] * a[8][5] - iT[5] * a[7][2] - iT[6] * a[4][3] + iT[18] * a[8][2] - iT[19] * a[1][2] + t[1] + add) >> shift);
1474
    dst[22] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[6][1] - iT[11] * a[3][4] - iT[13] * a[9][1] + iT[24] * a[8][4] + iT[1] * a[4][3] + iT[10] * a[6][3] - iT[14] * a[0][2] - iT[23] * a[5][3] + iT[2] * a[7][0] + iT[9] * a[4][5] - iT[15] * a[8][0] + iT[22] * a[1][0] - iT[3] * a[3][1] + iT[8] * a[6][4] + iT[16] * a[8][1] - iT[21] * a[9][4] - iT[4] * a[2][3] - iT[7] * a[3][3] + iT[17] * a[0][3] + iT[20] * a[1][3] - iT[5] * a[7][5] - iT[6] * a[2][5] + iT[18] * a[9][0] - iT[19] * a[5][0] + t[1] + add) >> shift);
1475
    dst[23] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[0][3] - iT[11] * a[1][3] + iT[13] * a[3][3] + iT[24] * a[2][3] - iT[1] * a[8][0] + iT[10] * a[9][5] - iT[14] * a[6][5] + iT[23] * a[3][0] + iT[2] * a[8][2] - iT[9] * a[1][2] - iT[15] * a[4][3] - iT[22] * a[7][2] + iT[3] * a[0][5] + iT[8] * a[5][0] - iT[16] * a[6][0] - iT[21] * a[4][0] + iT[4] * a[8][4] - iT[7] * a[9][1] + iT[17] * a[6][1] - iT[20] * a[3][4] - iT[5] * a[5][4] - iT[6] * a[0][1] + iT[18] * a[4][4] + iT[19] * a[6][4] + t[0] + add) >> shift);
1476
    dst[26] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[3][0] - iT[11] * a[2][0] + iT[13] * a[1][0] + iT[24] * a[0][0] - iT[1] * a[2][5] - iT[10] * a[3][5] + iT[14] * a[0][5] + iT[23] * a[1][5] + iT[2] * a[4][4] + iT[9] * a[6][4] - iT[15] * a[0][1] - iT[22] * a[5][4] - iT[3] * a[4][1] - iT[8] * a[7][4] - iT[16] * a[1][4] + iT[21] * a[8][4] + iT[4] * a[2][2] + iT[7] * a[7][2] + iT[17] * a[5][3] - iT[20] * a[9][3] + iT[5] * a[3][3] - iT[6] * a[6][2] - iT[18] * a[8][3] + iT[19] * a[9][2] - t[1] + add) >> shift);
1477
    dst[27] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[3][3] + iT[11] * a[6][2] + iT[13] * a[8][3] - iT[24] * a[9][2] - iT[1] * a[2][0] - iT[10] * a[3][0] + iT[14] * a[0][0] + iT[23] * a[1][0] - iT[2] * a[6][3] + iT[9] * a[3][2] + iT[15] * a[9][3] - iT[22] * a[8][2] - iT[3] * a[4][0] - iT[8] * a[6][0] + iT[16] * a[0][5] + iT[21] * a[5][0] - iT[4] * a[7][4] - iT[7] * a[2][4] + iT[17] * a[9][1] - iT[20] * a[5][1] - iT[5] * a[4][4] - iT[6] * a[7][1] - iT[18] * a[1][1] + iT[19] * a[8][1] - t[1] + add) >> shift);
1478
    dst[28] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[0][4] + iT[11] * a[5][1] - iT[13] * a[6][1] - iT[24] * a[4][1] + iT[1] * a[9][3] - iT[10] * a[8][2] + iT[14] * a[3][2] - iT[23] * a[6][3] - iT[2] * a[1][0] - iT[9] * a[0][0] + iT[15] * a[2][0] + iT[22] * a[3][0] + iT[3] * a[8][1] - iT[8] * a[9][4] + iT[16] * a[6][4] - iT[21] * a[3][1] - iT[4] * a[5][2] - iT[7] * a[0][3] + iT[17] * a[4][2] + iT[20] * a[6][2] + iT[5] * a[1][5] - iT[6] * a[8][5] + iT[18] * a[7][5] + iT[19] * a[4][0] - t[0] + add) >> shift);
1479
    dst[30] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[5][3] - iT[11] * a[9][3] + iT[13] * a[7][2] + iT[24] * a[2][2] + iT[1] * a[0][1] + iT[10] * a[1][1] - iT[14] * a[3][1] - iT[23] * a[2][1] + iT[2] * a[9][0] - iT[9] * a[5][0] - iT[15] * a[2][5] - iT[22] * a[7][5] - iT[3] * a[5][2] + iT[8] * a[9][2] - iT[16] * a[7][3] - iT[21] * a[2][3] - iT[4] * a[0][0] - iT[7] * a[1][0] + iT[17] * a[3][0] + iT[20] * a[2][0] - iT[5] * a[9][1] + iT[6] * a[5][1] + iT[18] * a[2][4] + iT[19] * a[7][4] + t[0] + add) >> shift);
1480
    dst[31] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[3][5] + iT[11] * a[2][5] - iT[13] * a[1][5] - iT[24] * a[0][5] - iT[1] * a[3][4] - iT[10] * a[2][4] + iT[14] * a[1][4] + iT[23] * a[0][4] + iT[2] * a[3][3] + iT[9] * a[2][3] - iT[15] * a[1][3] - iT[22] * a[0][3] - iT[3] * a[3][2] - iT[8] * a[2][2] + iT[16] * a[1][2] + iT[21] * a[0][2] + iT[4] * a[3][1] + iT[7] * a[2][1] - iT[17] * a[1][1] - iT[20] * a[0][1] - iT[5] * a[3][0] - iT[6] * a[2][0] + iT[18] * a[1][0] + iT[19] * a[0][0] + t[1] + add) >> shift);
1481
1482
    // Outputs 4, 9, 14, 19, 24, 29 depend on b[] only (six products each).
    dst[ 4] = Clip3(outputMinimum, outputMaximum, (int)(iT[ 4] * b[0] + iT[14] * b[1] + iT[24] * b[2] + iT[29] * b[3] + iT[19] * b[4] + iT[ 9] * b[5] + add) >> shift);
1483
    dst[ 9] = Clip3(outputMinimum, outputMaximum, (int)(iT[ 9] * b[0] + iT[29] * b[1] + iT[14] * b[2] - iT[ 4] * b[3] - iT[24] * b[4] - iT[19] * b[5] + add) >> shift);
1484
    dst[14] = Clip3(outputMinimum, outputMaximum, (int)(iT[14] * b[0] + iT[19] * b[1] - iT[ 9] * b[2] - iT[24] * b[3] + iT[ 4] * b[4] + iT[29] * b[5] + add) >> shift);
1485
    dst[19] = Clip3(outputMinimum, outputMaximum, (int)(iT[19] * b[0] + iT[ 4] * b[1] - iT[29] * b[2] + iT[ 9] * b[3] + iT[14] * b[4] - iT[24] * b[5] + add) >> shift);
1486
    dst[24] = Clip3(outputMinimum, outputMaximum, (int)(iT[24] * b[0] - iT[ 9] * b[1] - iT[ 4] * b[2] + iT[19] * b[3] - iT[29] * b[4] + iT[14] * b[5] + add) >> shift);
1487
    dst[29] = Clip3(outputMinimum, outputMaximum, (int)(iT[29] * b[0] - iT[24] * b[1] + iT[19] * b[2] - iT[14] * b[3] + iT[ 9] * b[4] - iT[ 4] * b[5] + add) >> shift);
1488
1489
    // Outputs 12 and 25 depend on c[] only (two products each).
    dst[12] = Clip3(outputMinimum, outputMaximum, (int)(iT[12]*c[0] + iT[25]*c[1] + add) >> shift);
1490
    dst[25] = Clip3(outputMinimum, outputMaximum, (int)(iT[25]*c[0] - iT[12]*c[1] + add) >> shift);
1491
1492
    // Advance to the next transform: next input column, next group of 32 outputs.
    src++;
1493
    dst += 32;
1494
  }
1495
1496
  // dst now points just past the computed outputs; zero the skipped transforms.
  if (iSkipLine)
1497
  {
1498
    memset(dst, 0, (iSkipLine * 32) * sizeof(TCoeff));
1499
  }
1500
#else
1501
0
  // Generic matrix-multiplication implementation with the same coefficient table.
  _fastInverseMM< 32 >( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDST7P32[TRANSFORM_INVERSE][0] );
1502
0
#endif
1503
0
}
1504
1505
1506
// ********************************** DCT-VIII **********************************
1507
void fastForwardDCT8_B4(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
1508
0
{
1509
0
  int i;
1510
0
  int rnd_factor = 1 << (shift - 1);
1511
0
  const TMatrixCoeff *iT = g_trCoreDCT8P4[TRANSFORM_FORWARD][0];
1512
1513
0
  int c[4];
1514
0
  TCoeff *pCoeff = dst;
1515
0
  const int  reducedLine = line - iSkipLine;
1516
0
  for (i = 0; i<reducedLine; i++)
1517
0
  {
1518
    // Intermediate Variables
1519
0
    c[0] = src[0] + src[3];
1520
0
    c[1] = src[2] + src[0];
1521
0
    c[2] = src[3] - src[2];
1522
0
    c[3] = iT[1] * src[1];
1523
1524
0
    dst[0 * line] = (iT[3] * c[0] + iT[2] * c[1] + c[3] + rnd_factor) >> shift;
1525
0
    dst[1 * line] = (iT[1] * (src[0] - src[2] - src[3]) + rnd_factor) >> shift;
1526
0
    dst[2 * line] = (iT[3] * c[2] + iT[2] * c[0] - c[3] + rnd_factor) >> shift;
1527
0
    dst[3 * line] = (iT[3] * c[1] - iT[2] * c[2] - c[3] + rnd_factor) >> shift;
1528
1529
0
    src += 4;
1530
0
    dst++;
1531
0
  }
1532
0
  if (iSkipLine)
1533
0
  {
1534
0
    dst = pCoeff + reducedLine;
1535
0
    for (i = 0; i<4; i++)
1536
0
    {
1537
0
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
1538
0
      dst += line;
1539
0
    }
1540
0
  }
1541
0
}
1542
1543
1544
void fastInverseDCT8_B4(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1545
0
{
1546
0
#if ENABLE_SIMD_TRAFO
1547
0
  _fastInverseMM<4>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDCT8P4[TRANSFORM_INVERSE][0] );
1548
#else
1549
  int i;
1550
  int rnd_factor = 1 << (shift - 1);
1551
1552
  const TMatrixCoeff *iT = g_trCoreDCT8P4[TRANSFORM_INVERSE][0];
1553
1554
  int c[4];
1555
  const int  reducedLine = line - iSkipLine;
1556
  for (i = 0; i<reducedLine; i++)
1557
  {
1558
    // Intermediate Variables
1559
    c[0] = src[0 * line] + src[3 * line];
1560
    c[1] = src[2 * line] + src[0 * line];
1561
    c[2] = src[3 * line] - src[2 * line];
1562
    c[3] = iT[1] * src[1 * line];
1563
1564
    dst[0] = Clip3(outputMinimum, outputMaximum, (iT[3] * c[0] + iT[2] * c[1] + c[3] + rnd_factor) >> shift);
1565
    dst[1] = Clip3(outputMinimum, outputMaximum, (iT[1] * (src[0 * line] - src[2 * line] - src[3 * line]) + rnd_factor) >> shift);
1566
    dst[2] = Clip3(outputMinimum, outputMaximum, (iT[3] * c[2] + iT[2] * c[0] - c[3] + rnd_factor) >> shift);
1567
    dst[3] = Clip3(outputMinimum, outputMaximum, (iT[3] * c[1] - iT[2] * c[2] - c[3] + rnd_factor) >> shift);
1568
1569
    dst += 4;
1570
    src++;
1571
  }
1572
  if (iSkipLine)
1573
  {
1574
    memset(dst, 0, (iSkipLine << 2) * sizeof(TCoeff));
1575
  }
1576
#endif
1577
0
}
1578
1579
1580
void fastForwardDCT8_B8(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
1581
0
{
1582
0
  _fastForwardMM< 8 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT8P8[TRANSFORM_FORWARD][0] );
1583
0
}
1584
1585
1586
void fastInverseDCT8_B8(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1587
0
{
1588
0
  _fastInverseMM< 8 >( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDCT8P8[TRANSFORM_INVERSE][0] );
1589
0
}
1590
1591
1592
// Forward 16-point DCT-VIII.
//
// If JVET_M0497_MATRIX_MULT is non-zero, all arguments are forwarded to
// _fastForwardMM<16> together with g_trCoreDCT8P16[TRANSFORM_FORWARD][0].
// Otherwise the hand-unrolled version below is compiled: each of the
// (line - iSkipLine) processed rows reads 16 consecutive values of src and writes
// its 16 results to dst[k * line], k = 0..15 (output stride `line`). Afterwards the
// positions belonging to iSkipLine / iSkipLine2 are set to zero.
void fastForwardDCT8_B16(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
1593
0
{
1594
#if !JVET_M0497_MATRIX_MULT
1595
  int j, k;
1596
  TCoeff a[5], b[5], c[5], d[5], t;
1597
  // Rounding offset added before the right shift; zero when no shift is applied
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
1598
1599
  // NOTE(review): this branch reads the DST-VII table rather than g_trCoreDCT8P16;
  // presumably the index order and signs below account for that - confirm.
  const TMatrixCoeff *iT = g_trCoreDST7P16[TRANSFORM_FORWARD][0];
1600
1601
  // Remember the start of the output so the zero-fill below can address it again
  TCoeff *pCoef = dst;
1602
  const int  reducedLine = line - iSkipLine;
1603
  // Number of output rows that are kept; the remaining iSkipLine2 rows are zeroed
  const int  cutoff = 16 - iSkipLine2;
1604
1605
  for (j = 0; j < reducedLine; j++)
1606
  {
1607
    // Sums and differences of input samples that are shared by several outputs
    for (k = 0; k < 5; k++)
1608
    {
1609
      a[k] = src[15 - k] + src[ 4 - k];
1610
      b[k] = src[ 6 + k] + src[ 4 - k];
1611
      c[k] = src[15 - k] - src[ 6 + k];
1612
      d[k] = src[15 - k] + src[ 6 + k] - src[ 4 - k];
1613
    }
1614
1615
    // Contribution of src[5], which enters the a/b/c based outputs with either sign
    t = iT[10] * src[5];
1616
1617
    // Outputs that depend on d[] only
    dst[ 1 * line] = ( - iT[ 2]*d[0] - iT[ 5]*d[1] - iT[ 8]*d[2] - iT[11]*d[3] - iT[14]*d[4] + add) >> shift;
1618
    dst[ 4 * line] = (   iT[ 8]*d[0] + iT[14]*d[1] + iT[ 5]*d[2] - iT[ 2]*d[3] - iT[11]*d[4] + add) >> shift;
1619
    dst[ 7 * line] = ( - iT[14]*d[0] - iT[ 2]*d[1] + iT[11]*d[2] + iT[ 5]*d[3] - iT[ 8]*d[4] + add) >> shift;
1620
    dst[10 * line] = (   iT[11]*d[0] - iT[ 8]*d[1] - iT[ 2]*d[2] + iT[14]*d[3] - iT[ 5]*d[4] + add) >> shift;
1621
    dst[13 * line] = ( - iT[ 5]*d[0] + iT[11]*d[1] - iT[14]*d[2] + iT[ 8]*d[3] - iT[ 2]*d[4] + add) >> shift;
1622
1623
    // Output that needs a single multiplication by iT[10]
    dst[ 5 * line] = ( - iT[10] * (src[15] + src[14] - src[12] - src[11] + src[9] + src[8] - src[6] - src[5] + src[3] + src[2] - src[0]) + add) >> shift;
1624
1625
    // Remaining outputs, built from a[], b[], c[] and +/- t
    dst[ 0 * line] = (   iT[0]*a[0] + iT[9]*b[0] + iT[1]*a[1] + iT[8]*b[1] + iT[2]*a[2] + iT[7]*b[2] + iT[3]*a[3] + iT[6]*b[3] + iT[4]*a[4] + iT[5]*b[4] + t + add ) >> shift;
1626
    dst[ 2 * line] = (   iT[4]*c[0] - iT[5]*b[0] + iT[9]*c[1] - iT[0]*b[1] + iT[6]*c[2] + iT[3]*a[2] + iT[1]*c[3] + iT[8]*a[3] + iT[7]*a[4] + iT[2]*b[4] - t + add ) >> shift;
1627
    dst[ 3 * line] = ( - iT[6]*a[0] - iT[3]*b[0] - iT[2]*c[1] - iT[7]*a[1] - iT[9]*c[2] - iT[0]*a[2] - iT[4]*c[3] + iT[5]*b[3] + iT[1]*a[4] + iT[8]*b[4] - t + add ) >> shift;
1628
    dst[ 6 * line] = (   iT[8]*a[0] + iT[1]*c[0] + iT[6]*c[1] - iT[3]*b[1] - iT[5]*a[2] - iT[4]*b[2] - iT[7]*c[3] - iT[2]*a[3] - iT[0]*c[4] + iT[9]*b[4] + t + add ) >> shift;
1629
    dst[ 8 * line] = (   iT[4]*c[0] + iT[5]*a[0] - iT[0]*c[1] + iT[9]*b[1] - iT[3]*c[2] - iT[6]*a[2] + iT[1]*c[3] - iT[8]*b[3] + iT[2]*c[4] + iT[7]*a[4] - t + add ) >> shift;
1630
    dst[ 9 * line] = ( - iT[7]*c[0] - iT[2]*a[0] + iT[4]*a[1] + iT[5]*b[1] + iT[8]*c[2] - iT[1]*b[2] - iT[9]*a[3] - iT[0]*b[3] - iT[3]*c[4] + iT[6]*b[4] - t + add ) >> shift;
1631
    dst[11 * line] = ( - iT[9]*a[0] - iT[0]*b[0] + iT[8]*c[1] + iT[1]*a[1] - iT[2]*c[2] + iT[7]*b[2] - iT[6]*a[3] - iT[3]*b[3] + iT[5]*c[4] + iT[4]*a[4] + t + add ) >> shift;
1632
    dst[12 * line] = (   iT[7]*c[0] - iT[2]*b[0] - iT[5]*c[1] - iT[4]*a[1] + iT[8]*a[2] + iT[1]*b[2] - iT[0]*a[3] - iT[9]*b[3] - iT[6]*c[4] + iT[3]*b[4] + t + add ) >> shift;
1633
    dst[14 * line] = (   iT[3]*a[0] + iT[6]*b[0] - iT[7]*a[1] - iT[2]*b[1] + iT[0]*c[2] + iT[9]*a[2] - iT[4]*c[3] - iT[5]*a[3] + iT[8]*c[4] + iT[1]*a[4] - t + add ) >> shift;
1634
    dst[15 * line] = ( - iT[1]*c[0] + iT[8]*b[0] + iT[3]*c[1] - iT[6]*b[1] - iT[5]*c[2] + iT[4]*b[2] + iT[7]*c[3] - iT[2]*b[3] - iT[9]*c[4] + iT[0]*b[4] - t + add ) >> shift;
1635
1636
    // Next input row, next output column
    src += 16;
1637
    dst++;
1638
  }
1639
1640
  // Zero the last iSkipLine entries of each of the first `cutoff` output rows
  if (iSkipLine)
1641
  {
1642
    dst = pCoef + reducedLine;
1643
    for (j = 0; j < cutoff; j++)
1644
    {
1645
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
1646
      dst += line;
1647
    }
1648
  }
1649
1650
  // Zero the last iSkipLine2 output rows completely
  if (iSkipLine2)
1651
  {
1652
    dst = pCoef + line * cutoff;
1653
    memset(dst, 0, sizeof(TCoeff) * line * iSkipLine2);
1654
  }
1655
#else
1656
0
  _fastForwardMM< 16 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT8P16[TRANSFORM_FORWARD][0] );
1657
0
#endif
1658
0
}
1659
1660
1661
// Inverse 16-point DCT-VIII.
//
// If JVET_M0497_MATRIX_MULT is non-zero, all arguments are forwarded to
// _fastInverseMM<16> together with g_trCoreDCT8P16[TRANSFORM_INVERSE][0].
// Otherwise the hand-unrolled version below is compiled: for each of the
// (line - iSkipLine) processed columns it reads src[k * line], k = 0..15, writes 16
// consecutive values to dst and clips each one to [outputMinimum, outputMaximum].
// The unrolled version does not use iSkipLine2.
void fastInverseDCT8_B16(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1662
0
{
1663
#if !JVET_M0497_MATRIX_MULT
1664
  int j, k;
1665
  TCoeff a[5], b[5], c[5], d[5], t;
1666
1667
  // Rounding offset added before the right shift; zero when no shift is applied
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
1668
1669
  // NOTE(review): this branch reads the DST-VII table rather than g_trCoreDCT8P16;
  // presumably the index order and signs below account for that - confirm.
  const TMatrixCoeff *iT = g_trCoreDST7P16[TRANSFORM_INVERSE][0];
1670
1671
  const int reducedLine = line - iSkipLine;
1672
1673
  for (j = 0; j < reducedLine; j++)
1674
  {
1675
    // Sums and differences of input coefficients that are shared by several outputs
    for (k = 0; k < 5; k++)
1676
    {
1677
      a[k] = src[(15 - k ) * line] + src[( 4 - k) * line];
1678
      b[k] = src[( 6 + k ) * line] + src[( 4 - k) * line];
1679
      c[k] = src[(15 - k ) * line] - src[( 6 + k) * line];
1680
      d[k] = src[(15 - k ) * line] + src[( 6 + k) * line] - src[(4 - k) * line];
1681
    }
1682
1683
    // Contribution of src[5 * line], which enters the a/b/c based outputs with either sign
    t = iT[10] * src[5*line];
1684
1685
    // Outputs that depend on d[] only
    dst[ 1] = Clip3(outputMinimum, outputMaximum, (int)( - iT[ 2]*d[0] - iT[ 5]*d[1] - iT[ 8]*d[2] - iT[11]*d[3] - iT[14]*d[4] + add) >> shift);
1686
    dst[ 4] = Clip3(outputMinimum, outputMaximum, (int)(   iT[ 8]*d[0] + iT[14]*d[1] + iT[ 5]*d[2] - iT[ 2]*d[3] - iT[11]*d[4] + add) >> shift);
1687
    dst[ 7] = Clip3(outputMinimum, outputMaximum, (int)( - iT[14]*d[0] - iT[ 2]*d[1] + iT[11]*d[2] + iT[ 5]*d[3] - iT[ 8]*d[4] + add) >> shift);
1688
    dst[10] = Clip3(outputMinimum, outputMaximum, (int)(   iT[11]*d[0] - iT[ 8]*d[1] - iT[ 2]*d[2] + iT[14]*d[3] - iT[ 5]*d[4] + add) >> shift);
1689
    dst[13] = Clip3(outputMinimum, outputMaximum, (int)( - iT[ 5]*d[0] + iT[11]*d[1] - iT[14]*d[2] + iT[ 8]*d[3] - iT[ 2]*d[4] + add) >> shift);
1690
1691
    // Output that needs a single multiplication by iT[10]
    dst[ 5] = Clip3(outputMinimum, outputMaximum, (int)( - iT[10] * (src[15 * line] + src[14 * line] - src[12 * line] - src[11 * line] + src[9 * line] + src[8 * line] - src[6 * line] - src[5 * line] + src[3 * line] + src[2 * line] - src[0 * line]) + add) >> shift);
1692
1693
    // Remaining outputs, built from a[], b[], c[] and +/- t
    dst[ 0] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0]*a[0] + iT[9]*b[0] + iT[1]*a[1] + iT[8]*b[1] + iT[2]*a[2] + iT[7]*b[2] + iT[3]*a[3] + iT[6]*b[3] + iT[4]*a[4] + iT[5]*b[4] + t + add ) >> shift );
1694
    dst[ 2] = Clip3(outputMinimum, outputMaximum, (int)(   iT[4]*c[0] - iT[5]*b[0] + iT[9]*c[1] - iT[0]*b[1] + iT[6]*c[2] + iT[3]*a[2] + iT[1]*c[3] + iT[8]*a[3] + iT[7]*a[4] + iT[2]*b[4] - t + add ) >> shift );
1695
    dst[ 3] = Clip3(outputMinimum, outputMaximum, (int)( - iT[6]*a[0] - iT[3]*b[0] - iT[2]*c[1] - iT[7]*a[1] - iT[9]*c[2] - iT[0]*a[2] - iT[4]*c[3] + iT[5]*b[3] + iT[1]*a[4] + iT[8]*b[4] - t + add ) >> shift );
1696
    dst[ 6] = Clip3(outputMinimum, outputMaximum, (int)(   iT[8]*a[0] + iT[1]*c[0] + iT[6]*c[1] - iT[3]*b[1] - iT[5]*a[2] - iT[4]*b[2] - iT[7]*c[3] - iT[2]*a[3] - iT[0]*c[4] + iT[9]*b[4] + t + add ) >> shift );
1697
    dst[ 8] = Clip3(outputMinimum, outputMaximum, (int)(   iT[4]*c[0] + iT[5]*a[0] - iT[0]*c[1] + iT[9]*b[1] - iT[3]*c[2] - iT[6]*a[2] + iT[1]*c[3] - iT[8]*b[3] + iT[2]*c[4] + iT[7]*a[4] - t + add ) >> shift );
1698
    dst[ 9] = Clip3(outputMinimum, outputMaximum, (int)( - iT[7]*c[0] - iT[2]*a[0] + iT[4]*a[1] + iT[5]*b[1] + iT[8]*c[2] - iT[1]*b[2] - iT[9]*a[3] - iT[0]*b[3] - iT[3]*c[4] + iT[6]*b[4] - t + add ) >> shift );
1699
    dst[11] = Clip3(outputMinimum, outputMaximum, (int)( - iT[9]*a[0] - iT[0]*b[0] + iT[8]*c[1] + iT[1]*a[1] - iT[2]*c[2] + iT[7]*b[2] - iT[6]*a[3] - iT[3]*b[3] + iT[5]*c[4] + iT[4]*a[4] + t + add ) >> shift );
1700
    dst[12] = Clip3(outputMinimum, outputMaximum, (int)(   iT[7]*c[0] - iT[2]*b[0] - iT[5]*c[1] - iT[4]*a[1] + iT[8]*a[2] + iT[1]*b[2] - iT[0]*a[3] - iT[9]*b[3] - iT[6]*c[4] + iT[3]*b[4] + t + add ) >> shift );
1701
    dst[14] = Clip3(outputMinimum, outputMaximum, (int)(   iT[3]*a[0] + iT[6]*b[0] - iT[7]*a[1] - iT[2]*b[1] + iT[0]*c[2] + iT[9]*a[2] - iT[4]*c[3] - iT[5]*a[3] + iT[8]*c[4] + iT[1]*a[4] - t + add ) >> shift );
1702
    dst[15] = Clip3(outputMinimum, outputMaximum, (int)( - iT[1]*c[0] + iT[8]*b[0] + iT[3]*c[1] - iT[6]*b[1] - iT[5]*c[2] + iT[4]*b[2] + iT[7]*c[3] - iT[2]*b[3] - iT[9]*c[4] + iT[0]*b[4] - t + add ) >> shift );
1703
1704
    // Next input column, next group of 16 outputs
    src++;
1705
    dst += 16;
1706
  }
1707
1708
  // dst now points behind the last computed value: zero the skipped remainder
  if (iSkipLine)
1709
  {
1710
    memset(dst, 0, (iSkipLine * 16) * sizeof(TCoeff));
1711
  }
1712
#else
1713
0
  _fastInverseMM< 16 >( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDCT8P16[TRANSFORM_INVERSE][0] );
1714
0
#endif
1715
0
}
1716
1717
1718
// Forward 32-point DCT-VIII.
//
// If JVET_M0497_MATRIX_MULT is non-zero, all arguments are forwarded to
// _fastForwardMM<32> together with g_trCoreDCT8P32[TRANSFORM_FORWARD][0].
// Otherwise the hand-unrolled version below is compiled: each of the
// (line - iSkipLine) processed rows reads 32 consecutive values of src and writes
// its 32 results to dst[k * line], k = 0..31 (output stride `line`). Afterwards the
// positions belonging to iSkipLine / iSkipLine2 are set to zero.
void fastForwardDCT8_B32(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
1719
0
{
1720
#if !JVET_M0497_MATRIX_MULT
1721
  int j, k;
1722
  TCoeff a[10][6];
1723
  TCoeff t[2];
1724
  TCoeff b[6];
1725
  TCoeff c[2];
1726
1727
  // Rounding offset added before the right shift; zero when no shift is applied
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
1728
  // NOTE(review): this branch reads the DST-VII table rather than g_trCoreDCT8P32;
  // presumably the index order and signs below account for that - confirm.
  const TMatrixCoeff *iT = g_trCoreDST7P32[TRANSFORM_FORWARD][0];
1729
  // Remember the start of the output so the zero-fill below can address it again
  TCoeff *pCoef = dst;
1730
  const int  reducedLine = line - iSkipLine;
1731
  // Number of output rows that are kept; the remaining iSkipLine2 rows are zeroed
  const int  cutoff = 32 - iSkipLine2;
1732
1733
  for (j = 0; j < reducedLine; j++)
1734
  {
1735
    // a[][]: pairwise sums/differences of input samples, b[]: five-sample combinations;
    // both are reused by several outputs below
    for (k = 0; k < 6; k++)
1736
    {
1737
      a[0][k] = src[31-k] - src[20+k];
1738
      a[1][k] = src[31-k] + src[18-k];
1739
      a[2][k] = src[31-k] + src[ 7+k];
1740
      a[3][k] = src[31-k] - src[ 5-k];
1741
      a[4][k] = src[25-k] + src[13+k];
1742
      a[5][k] = src[25-k] + src[12-k];
1743
      a[6][k] = src[25-k] - src[   k];
1744
      a[7][k] = src[18-k] - src[ 7+k];
1745
      a[8][k] = src[18-k] + src[ 5-k];
1746
      a[9][k] = src[12-k] + src[   k];
1747
1748
      b[k] = src[31-k] + src[20+k] - src[18-k] - src[7+k] + src[5-k];
1749
    }
1750
1751
    // c[]: thirteen-sample combinations used by outputs 6 and 19
    for (k = 0; k < 2; k++)
1752
    {
1753
      c[k] = src[31-k] + src[28+k] - src[26-k] - src[23+k] + src[21-k] + src[18+k] - src[16-k] - src[13+k] + src[11-k] + src[8+k] - src[6-k] - src[3+k] + src[1-k];
1754
    }
1755
1756
    // Contributions of src[19] and src[6], which enter the a[][] based outputs with either sign
    t[0] = iT[12] * src[19] + iT[25] * src[6];
1757
    t[1] = iT[12] * src[6] - iT[25] * src[19];
1758
1759
    // Outputs built from a[][] and +/- t[0] or t[1]
    dst[ 0 * line] = (   iT[0] * a[3][0] + iT[11] * a[6][5] + iT[13] * a[8][0] + iT[24] * a[9][5] + iT[1] * a[3][1] + iT[10] * a[6][4] + iT[14] * a[8][1] + iT[23] * a[9][4] + iT[2] * a[3][2] + iT[9] * a[6][3] + iT[15] * a[8][2] + iT[22] * a[9][3] + iT[3] * a[3][3] + iT[8] * a[6][2] + iT[16] * a[8][3] + iT[21] * a[9][2] + iT[4] * a[3][4] + iT[7] * a[6][1] + iT[17] * a[8][4] + iT[20] * a[9][1] + iT[5] * a[3][5] + iT[6] * a[6][0] + iT[18] * a[8][5] + iT[19] * a[9][0] + t[0] + add) >> shift;
1760
    dst[ 1 * line] = (   iT[0] * a[5][2] - iT[11] * a[0][3] - iT[13] * a[4][2] - iT[24] * a[6][2] - iT[1] * a[9][1] - iT[10] * a[8][4] - iT[14] * a[3][4] - iT[23] * a[6][1] - iT[2] * a[0][0] + iT[9] * a[5][5] - iT[15] * a[6][5] - iT[22] * a[4][5] + iT[3] * a[5][3] - iT[8] * a[0][2] - iT[16] * a[4][3] - iT[21] * a[6][3] - iT[4] * a[9][0] - iT[7] * a[8][5] - iT[17] * a[3][5] - iT[20] * a[6][0] - iT[5] * a[0][1] + iT[6] * a[5][4] - iT[18] * a[6][4] - iT[19] * a[4][4] + t[1] + add) >> shift;
1761
    dst[ 3 * line] = (   iT[0] * a[9][4] + iT[11] * a[5][4] - iT[13] * a[2][1] + iT[24] * a[7][1] + iT[1] * a[0][3] + iT[10] * a[1][3] - iT[14] * a[3][3] - iT[23] * a[2][3] - iT[2] * a[8][5] - iT[9] * a[9][0] - iT[15] * a[6][0] - iT[22] * a[3][5] + iT[3] * a[1][4] + iT[8] * a[0][4] - iT[16] * a[2][4] - iT[21] * a[3][4] + iT[4] * a[5][3] + iT[7] * a[9][3] + iT[17] * a[7][2] - iT[20] * a[2][2] - iT[5] * a[8][0] - iT[6] * a[1][0] + iT[18] * a[4][5] + iT[19] * a[7][0] - t[1] + add) >> shift;
1762
    dst[ 4 * line] = ( - iT[0] * a[3][2] - iT[11] * a[2][2] + iT[13] * a[1][2] + iT[24] * a[0][2] + iT[1] * a[6][0] + iT[10] * a[3][5] + iT[14] * a[9][0] + iT[23] * a[8][5] - iT[2] * a[2][3] - iT[9] * a[3][3] + iT[15] * a[0][3] + iT[22] * a[1][3] - iT[3] * a[7][0] + iT[8] * a[2][0] - iT[16] * a[9][5] - iT[21] * a[5][5] + iT[4] * a[4][4] + iT[7] * a[6][4] + iT[17] * a[0][1] - iT[20] * a[5][4] - iT[5] * a[7][4] - iT[6] * a[4][1] + iT[18] * a[8][4] + iT[19] * a[1][4] - t[0] + add) >> shift;
1763
    dst[ 5 * line] = (   iT[0] * a[3][5] + iT[11] * a[6][0] + iT[13] * a[8][5] + iT[24] * a[9][0] - iT[1] * a[6][5] - iT[10] * a[3][0] - iT[14] * a[9][5] - iT[23] * a[8][0] + iT[2] * a[7][4] - iT[9] * a[2][4] + iT[15] * a[9][1] + iT[22] * a[5][1] + iT[3] * a[7][1] + iT[8] * a[4][4] - iT[16] * a[8][1] - iT[21] * a[1][1] - iT[4] * a[6][2] - iT[7] * a[4][2] + iT[17] * a[5][2] - iT[20] * a[0][3] + iT[5] * a[3][2] + iT[6] * a[2][2] - iT[18] * a[1][2] - iT[19] * a[0][2] - t[0] + add) >> shift;
1764
    dst[ 8 * line] = (   iT[0] * a[9][3] + iT[11] * a[8][2] + iT[13] * a[3][2] + iT[24] * a[6][3] + iT[1] * a[1][5] + iT[10] * a[0][5] - iT[14] * a[2][5] - iT[23] * a[3][5] - iT[2] * a[1][3] - iT[9] * a[8][3] + iT[15] * a[7][3] + iT[22] * a[4][2] - iT[3] * a[9][5] - iT[8] * a[5][5] + iT[16] * a[2][0] - iT[21] * a[7][0] - iT[4] * a[1][1] - iT[7] * a[0][1] + iT[17] * a[2][1] + iT[20] * a[3][1] + iT[5] * a[5][1] + iT[6] * a[9][1] + iT[18] * a[7][4] - iT[19] * a[2][4] + t[1] + add) >> shift;
1765
    dst[ 9 * line] = (   iT[0] * a[2][1] + iT[11] * a[3][1] - iT[13] * a[0][1] - iT[24] * a[1][1] - iT[1] * a[7][3] + iT[10] * a[2][3] - iT[14] * a[9][2] - iT[23] * a[5][2] - iT[2] * a[4][0] - iT[9] * a[7][5] + iT[15] * a[1][5] + iT[22] * a[8][5] - iT[3] * a[3][4] - iT[8] * a[2][4] + iT[16] * a[1][4] + iT[21] * a[0][4] - iT[4] * a[6][3] - iT[7] * a[3][2] - iT[17] * a[9][3] - iT[20] * a[8][2] - iT[5] * a[4][5] - iT[6] * a[6][5] - iT[18] * a[0][0] + iT[19] * a[5][5] + t[0] + add) >> shift;
1766
    dst[10 * line] = ( - iT[0] * a[6][1] - iT[11] * a[4][1] + iT[13] * a[5][1] - iT[24] * a[0][4] + iT[1] * a[2][2] - iT[10] * a[7][2] - iT[14] * a[5][3] - iT[23] * a[9][3] + iT[2] * a[6][4] + iT[9] * a[4][4] - iT[15] * a[5][4] + iT[22] * a[0][1] - iT[3] * a[2][5] + iT[8] * a[7][5] + iT[16] * a[5][0] + iT[21] * a[9][0] - iT[4] * a[7][0] - iT[7] * a[4][5] + iT[17] * a[8][0] + iT[20] * a[1][0] + iT[5] * a[4][2] + iT[6] * a[7][3] - iT[18] * a[1][3] - iT[19] * a[8][3] + t[0] + add) >> shift;
1767
    dst[11 * line] = ( - iT[0] * a[1][3] - iT[11] * a[0][3] + iT[13] * a[2][3] + iT[24] * a[3][3] - iT[1] * a[9][1] - iT[10] * a[5][1] + iT[14] * a[2][4] - iT[23] * a[7][4] - iT[2] * a[8][0] - iT[9] * a[9][5] - iT[15] * a[6][5] - iT[22] * a[3][0] + iT[3] * a[0][2] - iT[8] * a[5][3] + iT[16] * a[6][3] + iT[21] * a[4][3] + iT[4] * a[5][0] - iT[7] * a[0][5] - iT[17] * a[4][0] - iT[20] * a[6][0] + iT[5] * a[9][4] + iT[6] * a[5][4] - iT[18] * a[2][1] + iT[19] * a[7][1] + t[1] + add) >> shift;
1768
    dst[13 * line] = (   iT[0] * a[0][0] + iT[11] * a[1][0] - iT[13] * a[3][0] - iT[24] * a[2][0] + iT[1] * a[5][4] - iT[10] * a[0][1] - iT[14] * a[4][4] - iT[23] * a[6][4] - iT[2] * a[9][3] - iT[9] * a[5][3] + iT[15] * a[2][2] - iT[22] * a[7][2] + iT[3] * a[8][3] + iT[8] * a[9][2] + iT[16] * a[6][2] + iT[21] * a[3][3] - iT[4] * a[1][4] - iT[7] * a[8][4] + iT[17] * a[7][4] + iT[20] * a[4][1] + iT[5] * a[0][5] + iT[6] * a[1][5] - iT[18] * a[3][5] - iT[19] * a[2][5] - t[1] + add) >> shift;
1769
    dst[14 * line] = (   iT[0] * a[4][2] + iT[11] * a[7][3] - iT[13] * a[1][3] - iT[24] * a[8][3] + iT[1] * a[4][1] + iT[10] * a[6][1] + iT[14] * a[0][4] - iT[23] * a[5][1] - iT[2] * a[3][0] - iT[9] * a[2][0] + iT[15] * a[1][0] + iT[22] * a[0][0] - iT[3] * a[6][3] - iT[8] * a[4][3] + iT[16] * a[5][3] - iT[21] * a[0][2] - iT[4] * a[7][5] - iT[7] * a[4][0] + iT[17] * a[8][5] + iT[20] * a[1][5] + iT[5] * a[6][4] + iT[6] * a[3][1] + iT[18] * a[9][4] + iT[19] * a[8][1] - t[0] + add) >> shift;
1770
    dst[15 * line] = (   iT[0] * a[7][4] + iT[11] * a[4][1] - iT[13] * a[8][4] - iT[24] * a[1][4] - iT[1] * a[2][2] - iT[10] * a[3][2] + iT[14] * a[0][2] + iT[23] * a[1][2] - iT[2] * a[2][1] + iT[9] * a[7][1] + iT[15] * a[5][4] + iT[22] * a[9][4] + iT[3] * a[7][5] - iT[8] * a[2][5] + iT[16] * a[9][0] + iT[21] * a[5][0] + iT[4] * a[2][0] + iT[7] * a[3][0] - iT[17] * a[0][0] - iT[20] * a[1][0] + iT[5] * a[2][3] - iT[6] * a[7][3] - iT[18] * a[5][2] - iT[19] * a[9][2] - t[0] + add) >> shift;
1771
    dst[16 * line] = ( - iT[0] * a[0][1] + iT[11] * a[5][4] - iT[13] * a[6][4] - iT[24] * a[4][4] + iT[1] * a[0][3] - iT[10] * a[5][2] + iT[14] * a[6][2] + iT[23] * a[4][2] - iT[2] * a[0][5] + iT[9] * a[5][0] - iT[15] * a[6][0] - iT[22] * a[4][0] - iT[3] * a[0][4] - iT[8] * a[1][4] + iT[16] * a[3][4] + iT[21] * a[2][4] + iT[4] * a[0][2] + iT[7] * a[1][2] - iT[17] * a[3][2] - iT[20] * a[2][2] - iT[5] * a[0][0] - iT[6] * a[1][0] + iT[18] * a[3][0] + iT[19] * a[2][0] - t[1] + add) >> shift;
1772
    dst[18 * line] = (   iT[0] * a[0][5] + iT[11] * a[1][5] - iT[13] * a[3][5] - iT[24] * a[2][5] - iT[1] * a[1][0] - iT[10] * a[0][0] + iT[14] * a[2][0] + iT[23] * a[3][0] - iT[2] * a[5][1] + iT[9] * a[0][4] + iT[15] * a[4][1] + iT[22] * a[6][1] - iT[3] * a[8][1] - iT[8] * a[1][1] + iT[16] * a[4][4] + iT[21] * a[7][1] - iT[4] * a[9][2] - iT[7] * a[5][2] + iT[17] * a[2][3] - iT[20] * a[7][3] - iT[5] * a[9][3] - iT[6] * a[8][2] - iT[18] * a[3][2] - iT[19] * a[6][3] + t[1] + add) >> shift;
1773
    dst[20 * line] = ( - iT[0] * a[4][0] - iT[11] * a[6][0] - iT[13] * a[0][5] + iT[24] * a[5][0] + iT[1] * a[6][5] + iT[10] * a[4][5] - iT[14] * a[5][5] + iT[23] * a[0][0] - iT[2] * a[6][1] - iT[9] * a[3][4] - iT[15] * a[9][1] - iT[22] * a[8][4] + iT[3] * a[4][4] + iT[8] * a[7][1] - iT[16] * a[1][1] - iT[21] * a[8][1] - iT[4] * a[3][3] - iT[7] * a[2][3] + iT[17] * a[1][3] + iT[20] * a[0][3] + iT[5] * a[7][2] - iT[6] * a[2][2] + iT[18] * a[9][3] + iT[19] * a[5][3] + t[0] + add) >> shift;
1774
    dst[21 * line] = (   iT[0] * a[1][2] + iT[11] * a[8][2] - iT[13] * a[7][2] - iT[24] * a[4][3] + iT[1] * a[1][5] + iT[10] * a[8][5] - iT[14] * a[7][5] - iT[23] * a[4][0] + iT[2] * a[5][2] + iT[9] * a[9][2] + iT[15] * a[7][3] - iT[22] * a[2][3] + iT[3] * a[5][5] + iT[8] * a[9][5] + iT[16] * a[7][0] - iT[21] * a[2][0] + iT[4] * a[8][1] + iT[7] * a[9][4] + iT[17] * a[6][4] + iT[20] * a[3][1] + iT[5] * a[8][4] + iT[6] * a[9][1] + iT[18] * a[6][1] + iT[19] * a[3][4] + t[1] + add) >> shift;
1775
    dst[23 * line] = (   iT[0] * a[8][4] + iT[11] * a[9][1] + iT[13] * a[6][1] + iT[24] * a[3][4] - iT[1] * a[8][2] - iT[10] * a[1][2] + iT[14] * a[4][3] + iT[23] * a[7][2] - iT[2] * a[0][1] - iT[9] * a[1][1] + iT[15] * a[3][1] + iT[22] * a[2][1] + iT[3] * a[5][0] + iT[8] * a[9][0] + iT[16] * a[7][5] - iT[21] * a[2][5] - iT[4] * a[9][5] - iT[7] * a[8][0] - iT[17] * a[3][0] - iT[20] * a[6][5] + iT[5] * a[5][2] - iT[6] * a[0][3] - iT[18] * a[4][2] - iT[19] * a[6][2] - t[1] + add) >> shift;
1776
    dst[24 * line] = ( - iT[0] * a[2][3] + iT[11] * a[7][3] + iT[13] * a[5][2] + iT[24] * a[9][2] + iT[1] * a[4][1] + iT[10] * a[7][4] - iT[14] * a[1][4] - iT[23] * a[8][4] - iT[2] * a[4][5] - iT[9] * a[7][0] + iT[15] * a[1][0] + iT[22] * a[8][0] + iT[3] * a[4][3] + iT[8] * a[6][3] + iT[16] * a[0][2] - iT[21] * a[5][3] - iT[4] * a[2][5] - iT[7] * a[3][5] + iT[17] * a[0][5] + iT[20] * a[1][5] + iT[5] * a[2][1] + iT[6] * a[3][1] - iT[18] * a[0][1] - iT[19] * a[1][1] - t[0] + add) >> shift;
1777
    dst[25 * line] = ( - iT[0] * a[4][5] - iT[11] * a[6][5] - iT[13] * a[0][0] + iT[24] * a[5][5] - iT[1] * a[3][1] - iT[10] * a[2][1] + iT[14] * a[1][1] + iT[23] * a[0][1] + iT[2] * a[7][2] + iT[9] * a[4][3] - iT[15] * a[8][2] - iT[22] * a[1][2] + iT[3] * a[6][2] + iT[8] * a[3][3] + iT[16] * a[9][2] + iT[21] * a[8][3] + iT[4] * a[2][4] - iT[7] * a[7][4] - iT[17] * a[5][1] - iT[20] * a[9][1] - iT[5] * a[4][0] - iT[6] * a[6][0] - iT[18] * a[0][5] + iT[19] * a[5][0] - t[0] + add) >> shift;
1778
    dst[26 * line] = (   iT[0] * a[8][0] + iT[11] * a[1][0] - iT[13] * a[4][5] - iT[24] * a[7][0] + iT[1] * a[5][4] + iT[10] * a[9][4] + iT[14] * a[7][1] - iT[23] * a[2][1] - iT[2] * a[1][2] - iT[9] * a[0][2] + iT[15] * a[2][2] + iT[22] * a[3][2] - iT[3] * a[9][2] - iT[8] * a[8][3] - iT[16] * a[3][3] - iT[21] * a[6][2] + iT[4] * a[0][4] - iT[7] * a[5][1] + iT[17] * a[6][1] + iT[20] * a[4][1] + iT[5] * a[8][5] + iT[6] * a[1][5] - iT[18] * a[4][0] - iT[19] * a[7][5] - t[1] + add) >> shift;
1779
    dst[28 * line] = ( - iT[0] * a[5][1] - iT[11] * a[9][1] - iT[13] * a[7][4] + iT[24] * a[2][4] + iT[1] * a[8][2] + iT[10] * a[9][3] + iT[14] * a[6][3] + iT[23] * a[3][2] - iT[2] * a[9][4] - iT[9] * a[8][1] - iT[15] * a[3][1] - iT[22] * a[6][4] + iT[3] * a[9][0] + iT[8] * a[5][0] - iT[16] * a[2][5] + iT[21] * a[7][5] - iT[4] * a[5][5] + iT[7] * a[0][0] + iT[17] * a[4][5] + iT[20] * a[6][5] + iT[5] * a[1][3] + iT[6] * a[0][3] - iT[18] * a[2][3] - iT[19] * a[3][3] + t[1] + add) >> shift;
1780
    dst[29 * line] = (   iT[0] * a[6][4] + iT[11] * a[3][1] + iT[13] * a[9][4] + iT[24] * a[8][1] - iT[1] * a[7][3] - iT[10] * a[4][2] + iT[14] * a[8][3] + iT[23] * a[1][3] - iT[2] * a[3][5] - iT[9] * a[2][5] + iT[15] * a[1][5] + iT[22] * a[0][5] + iT[3] * a[2][4] + iT[8] * a[3][4] - iT[16] * a[0][4] - iT[21] * a[1][4] + iT[4] * a[4][3] + iT[7] * a[7][2] - iT[17] * a[1][2] - iT[20] * a[8][2] - iT[5] * a[3][0] - iT[6] * a[6][5] - iT[18] * a[8][0] - iT[19] * a[9][5] + t[0] + add) >> shift;
1781
    dst[30 * line] = ( - iT[0] * a[7][2] + iT[11] * a[2][2] - iT[13] * a[9][3] - iT[24] * a[5][3] - iT[1] * a[6][0] - iT[10] * a[4][0] + iT[14] * a[5][0] - iT[23] * a[0][5] - iT[2] * a[4][2] - iT[9] * a[6][2] - iT[15] * a[0][3] + iT[22] * a[5][2] + iT[3] * a[2][0] - iT[8] * a[7][0] - iT[16] * a[5][5] - iT[21] * a[9][5] + iT[4] * a[7][1] - iT[7] * a[2][1] + iT[17] * a[9][4] + iT[20] * a[5][4] + iT[5] * a[6][1] + iT[6] * a[4][1] - iT[18] * a[5][1] + iT[19] * a[0][4] + t[0] + add) >> shift;
1782
    dst[31 * line] = (   iT[0] * a[8][5] + iT[11] * a[1][5] - iT[13] * a[4][0] - iT[24] * a[7][5] - iT[1] * a[1][0] - iT[10] * a[8][0] + iT[14] * a[7][0] + iT[23] * a[4][5] - iT[2] * a[8][4] - iT[9] * a[1][4] + iT[15] * a[4][1] + iT[22] * a[7][4] + iT[3] * a[1][1] + iT[8] * a[8][1] - iT[16] * a[7][1] - iT[21] * a[4][4] + iT[4] * a[8][3] + iT[7] * a[1][3] - iT[17] * a[4][2] - iT[20] * a[7][3] - iT[5] * a[1][2] - iT[6] * a[8][2] + iT[18] * a[7][2] + iT[19] * a[4][3] + t[1] + add) >> shift;
1783
1784
    // Outputs that depend on b[] only
    dst[ 2 * line] = (   iT[ 4] * b[0] + iT[ 9] * b[1] + iT[14] * b[2] + iT[19] * b[3] + iT[24] * b[4] + iT[29] * b[5] + add) >> shift;
1785
    dst[ 7 * line] = ( - iT[14] * b[0] - iT[29] * b[1] - iT[19] * b[2] - iT[ 4] * b[3] + iT[ 9] * b[4] + iT[24] * b[5] + add) >> shift;
1786
    dst[12 * line] = (   iT[24] * b[0] + iT[14] * b[1] - iT[ 9] * b[2] - iT[29] * b[3] - iT[ 4] * b[4] + iT[19] * b[5] + add) >> shift;
1787
    dst[17 * line] = ( - iT[29] * b[0] + iT[ 4] * b[1] + iT[24] * b[2] - iT[ 9] * b[3] - iT[19] * b[4] + iT[14] * b[5] + add) >> shift;
1788
    dst[22 * line] = (   iT[19] * b[0] - iT[24] * b[1] + iT[ 4] * b[2] + iT[14] * b[3] - iT[29] * b[4] + iT[ 9] * b[5] + add) >> shift;
1789
    dst[27 * line] = ( - iT[ 9] * b[0] + iT[19] * b[1] - iT[29] * b[2] + iT[24] * b[3] - iT[14] * b[4] + iT[ 4] * b[5] + add) >> shift;
1790
1791
    // Outputs that depend on c[] only
    dst[ 6 * line] = (   iT[12] * c[0] + iT[25] * c[1] + add) >> shift;
1792
    dst[19 * line] = ( - iT[25] * c[0] + iT[12] * c[1] + add) >> shift;
1793
1794
    // Next input row, next output column
    src += 32;
1795
    dst++;
1796
  }
1797
1798
  // Zero the last iSkipLine entries of each of the first `cutoff` output rows
  if (iSkipLine)
1799
  {
1800
    dst = pCoef + reducedLine;
1801
    for (j = 0; j < cutoff; j++)
1802
    {
1803
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
1804
      dst += line;
1805
    }
1806
  }
1807
1808
  // Zero the last iSkipLine2 output rows completely
  if (iSkipLine2)
1809
  {
1810
    dst = pCoef + line * cutoff;
1811
    memset(dst, 0, sizeof(TCoeff) * line * iSkipLine2);
1812
  }
1813
#else
1814
0
  _fastForwardMM< 32 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT8P32[TRANSFORM_FORWARD][0] );
1815
0
#endif
1816
0
}
1817
1818
1819
void fastInverseDCT8_B32(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1820
0
{
1821
#if !JVET_M0497_MATRIX_MULT
1822
  int j, k;
1823
  TCoeff a[10][6];
1824
  TCoeff t[2];
1825
  TCoeff b[6];
1826
  TCoeff c[2];
1827
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
1828
1829
  const TMatrixCoeff *iT = g_trCoreDST7P32[TRANSFORM_INVERSE][0];
1830
1831
  const int  reducedLine = line - iSkipLine;
1832
1833
  for (j = 0; j < reducedLine; j++)
1834
  {
1835
    for (k = 0; k < 6; k++)
1836
    {
1837
      a[0][k] = src[(31 - k)*line] - src[(20 + k)*line];
1838
      a[1][k] = src[(31 - k)*line] + src[(18 - k)*line];
1839
      a[2][k] = src[(31 - k)*line] + src[( 7 + k)*line];
1840
      a[3][k] = src[(31 - k)*line] - src[( 5 - k)*line];
1841
      a[4][k] = src[(25 - k)*line] + src[(13 + k)*line];
1842
      a[5][k] = src[(25 - k)*line] + src[(12 - k)*line];
1843
      a[6][k] = src[(25 - k)*line] - src[      k *line];
1844
      a[7][k] = src[(18 - k)*line] - src[( 7 + k)*line];
1845
      a[8][k] = src[(18 - k)*line] + src[( 5 - k)*line];
1846
      a[9][k] = src[(12 - k)*line] + src[      k *line];
1847
1848
      b[k] = src[(31 - k)*line] + src[(20 + k)*line] - src[(18 - k)*line] - src[(7 + k)*line] + src[(5 - k)*line];
1849
    }
1850
1851
    for (k = 0; k < 2; k++)
1852
    {
1853
      c[k] = src[(31 - k)*line] + src[(28 + k)*line] - src[(26 - k)*line] - src[(23 + k)*line] + src[(21 - k)*line] + src[(18 + k)*line] - src[(16 - k)*line] - src[(13 + k)*line] + src[(11 - k)*line] + src[(8 + k)*line] - src[(6 - k)*line] - src[(3 + k)*line] + src[(1 - k)*line];
1854
    }
1855
1856
    t[0] = iT[12] * src[19 * line] + iT[25] * src[ 6 * line];
1857
    t[1] = iT[12] * src[ 6 * line] - iT[25] * src[19 * line];
1858
1859
    dst[ 0] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[3][0] + iT[11] * a[6][5] + iT[13] * a[8][0] + iT[24] * a[9][5] + iT[1] * a[3][1] + iT[10] * a[6][4] + iT[14] * a[8][1] + iT[23] * a[9][4] + iT[2] * a[3][2] + iT[9] * a[6][3] + iT[15] * a[8][2] + iT[22] * a[9][3] + iT[3] * a[3][3] + iT[8] * a[6][2] + iT[16] * a[8][3] + iT[21] * a[9][2] + iT[4] * a[3][4] + iT[7] * a[6][1] + iT[17] * a[8][4] + iT[20] * a[9][1] + iT[5] * a[3][5] + iT[6] * a[6][0] + iT[18] * a[8][5] + iT[19] * a[9][0] + t[0] + add) >> shift);
1860
    dst[ 1] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[5][2] - iT[11] * a[0][3] - iT[13] * a[4][2] - iT[24] * a[6][2] - iT[1] * a[9][1] - iT[10] * a[8][4] - iT[14] * a[3][4] - iT[23] * a[6][1] - iT[2] * a[0][0] + iT[9] * a[5][5] - iT[15] * a[6][5] - iT[22] * a[4][5] + iT[3] * a[5][3] - iT[8] * a[0][2] - iT[16] * a[4][3] - iT[21] * a[6][3] - iT[4] * a[9][0] - iT[7] * a[8][5] - iT[17] * a[3][5] - iT[20] * a[6][0] - iT[5] * a[0][1] + iT[6] * a[5][4] - iT[18] * a[6][4] - iT[19] * a[4][4] + t[1] + add) >> shift);
1861
    dst[ 3] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[9][4] + iT[11] * a[5][4] - iT[13] * a[2][1] + iT[24] * a[7][1] + iT[1] * a[0][3] + iT[10] * a[1][3] - iT[14] * a[3][3] - iT[23] * a[2][3] - iT[2] * a[8][5] - iT[9] * a[9][0] - iT[15] * a[6][0] - iT[22] * a[3][5] + iT[3] * a[1][4] + iT[8] * a[0][4] - iT[16] * a[2][4] - iT[21] * a[3][4] + iT[4] * a[5][3] + iT[7] * a[9][3] + iT[17] * a[7][2] - iT[20] * a[2][2] - iT[5] * a[8][0] - iT[6] * a[1][0] + iT[18] * a[4][5] + iT[19] * a[7][0] - t[1] + add) >> shift);
1862
    dst[ 4] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[3][2] - iT[11] * a[2][2] + iT[13] * a[1][2] + iT[24] * a[0][2] + iT[1] * a[6][0] + iT[10] * a[3][5] + iT[14] * a[9][0] + iT[23] * a[8][5] - iT[2] * a[2][3] - iT[9] * a[3][3] + iT[15] * a[0][3] + iT[22] * a[1][3] - iT[3] * a[7][0] + iT[8] * a[2][0] - iT[16] * a[9][5] - iT[21] * a[5][5] + iT[4] * a[4][4] + iT[7] * a[6][4] + iT[17] * a[0][1] - iT[20] * a[5][4] - iT[5] * a[7][4] - iT[6] * a[4][1] + iT[18] * a[8][4] + iT[19] * a[1][4] - t[0] + add) >> shift);
1863
    dst[ 5] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[3][5] + iT[11] * a[6][0] + iT[13] * a[8][5] + iT[24] * a[9][0] - iT[1] * a[6][5] - iT[10] * a[3][0] - iT[14] * a[9][5] - iT[23] * a[8][0] + iT[2] * a[7][4] - iT[9] * a[2][4] + iT[15] * a[9][1] + iT[22] * a[5][1] + iT[3] * a[7][1] + iT[8] * a[4][4] - iT[16] * a[8][1] - iT[21] * a[1][1] - iT[4] * a[6][2] - iT[7] * a[4][2] + iT[17] * a[5][2] - iT[20] * a[0][3] + iT[5] * a[3][2] + iT[6] * a[2][2] - iT[18] * a[1][2] - iT[19] * a[0][2] - t[0] + add) >> shift);
1864
    dst[ 8] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[9][3] + iT[11] * a[8][2] + iT[13] * a[3][2] + iT[24] * a[6][3] + iT[1] * a[1][5] + iT[10] * a[0][5] - iT[14] * a[2][5] - iT[23] * a[3][5] - iT[2] * a[1][3] - iT[9] * a[8][3] + iT[15] * a[7][3] + iT[22] * a[4][2] - iT[3] * a[9][5] - iT[8] * a[5][5] + iT[16] * a[2][0] - iT[21] * a[7][0] - iT[4] * a[1][1] - iT[7] * a[0][1] + iT[17] * a[2][1] + iT[20] * a[3][1] + iT[5] * a[5][1] + iT[6] * a[9][1] + iT[18] * a[7][4] - iT[19] * a[2][4] + t[1] + add) >> shift);
1865
    dst[ 9] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[2][1] + iT[11] * a[3][1] - iT[13] * a[0][1] - iT[24] * a[1][1] - iT[1] * a[7][3] + iT[10] * a[2][3] - iT[14] * a[9][2] - iT[23] * a[5][2] - iT[2] * a[4][0] - iT[9] * a[7][5] + iT[15] * a[1][5] + iT[22] * a[8][5] - iT[3] * a[3][4] - iT[8] * a[2][4] + iT[16] * a[1][4] + iT[21] * a[0][4] - iT[4] * a[6][3] - iT[7] * a[3][2] - iT[17] * a[9][3] - iT[20] * a[8][2] - iT[5] * a[4][5] - iT[6] * a[6][5] - iT[18] * a[0][0] + iT[19] * a[5][5] + t[0] + add) >> shift);
1866
    dst[10] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[6][1] - iT[11] * a[4][1] + iT[13] * a[5][1] - iT[24] * a[0][4] + iT[1] * a[2][2] - iT[10] * a[7][2] - iT[14] * a[5][3] - iT[23] * a[9][3] + iT[2] * a[6][4] + iT[9] * a[4][4] - iT[15] * a[5][4] + iT[22] * a[0][1] - iT[3] * a[2][5] + iT[8] * a[7][5] + iT[16] * a[5][0] + iT[21] * a[9][0] - iT[4] * a[7][0] - iT[7] * a[4][5] + iT[17] * a[8][0] + iT[20] * a[1][0] + iT[5] * a[4][2] + iT[6] * a[7][3] - iT[18] * a[1][3] - iT[19] * a[8][3] + t[0] + add) >> shift);
1867
    dst[11] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[1][3] - iT[11] * a[0][3] + iT[13] * a[2][3] + iT[24] * a[3][3] - iT[1] * a[9][1] - iT[10] * a[5][1] + iT[14] * a[2][4] - iT[23] * a[7][4] - iT[2] * a[8][0] - iT[9] * a[9][5] - iT[15] * a[6][5] - iT[22] * a[3][0] + iT[3] * a[0][2] - iT[8] * a[5][3] + iT[16] * a[6][3] + iT[21] * a[4][3] + iT[4] * a[5][0] - iT[7] * a[0][5] - iT[17] * a[4][0] - iT[20] * a[6][0] + iT[5] * a[9][4] + iT[6] * a[5][4] - iT[18] * a[2][1] + iT[19] * a[7][1] + t[1] + add) >> shift);
1868
    dst[13] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[0][0] + iT[11] * a[1][0] - iT[13] * a[3][0] - iT[24] * a[2][0] + iT[1] * a[5][4] - iT[10] * a[0][1] - iT[14] * a[4][4] - iT[23] * a[6][4] - iT[2] * a[9][3] - iT[9] * a[5][3] + iT[15] * a[2][2] - iT[22] * a[7][2] + iT[3] * a[8][3] + iT[8] * a[9][2] + iT[16] * a[6][2] + iT[21] * a[3][3] - iT[4] * a[1][4] - iT[7] * a[8][4] + iT[17] * a[7][4] + iT[20] * a[4][1] + iT[5] * a[0][5] + iT[6] * a[1][5] - iT[18] * a[3][5] - iT[19] * a[2][5] - t[1] + add) >> shift);
1869
    dst[14] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[4][2] + iT[11] * a[7][3] - iT[13] * a[1][3] - iT[24] * a[8][3] + iT[1] * a[4][1] + iT[10] * a[6][1] + iT[14] * a[0][4] - iT[23] * a[5][1] - iT[2] * a[3][0] - iT[9] * a[2][0] + iT[15] * a[1][0] + iT[22] * a[0][0] - iT[3] * a[6][3] - iT[8] * a[4][3] + iT[16] * a[5][3] - iT[21] * a[0][2] - iT[4] * a[7][5] - iT[7] * a[4][0] + iT[17] * a[8][5] + iT[20] * a[1][5] + iT[5] * a[6][4] + iT[6] * a[3][1] + iT[18] * a[9][4] + iT[19] * a[8][1] - t[0] + add) >> shift);
1870
    dst[15] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[7][4] + iT[11] * a[4][1] - iT[13] * a[8][4] - iT[24] * a[1][4] - iT[1] * a[2][2] - iT[10] * a[3][2] + iT[14] * a[0][2] + iT[23] * a[1][2] - iT[2] * a[2][1] + iT[9] * a[7][1] + iT[15] * a[5][4] + iT[22] * a[9][4] + iT[3] * a[7][5] - iT[8] * a[2][5] + iT[16] * a[9][0] + iT[21] * a[5][0] + iT[4] * a[2][0] + iT[7] * a[3][0] - iT[17] * a[0][0] - iT[20] * a[1][0] + iT[5] * a[2][3] - iT[6] * a[7][3] - iT[18] * a[5][2] - iT[19] * a[9][2] - t[0] + add) >> shift);
1871
    dst[16] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[0][1] + iT[11] * a[5][4] - iT[13] * a[6][4] - iT[24] * a[4][4] + iT[1] * a[0][3] - iT[10] * a[5][2] + iT[14] * a[6][2] + iT[23] * a[4][2] - iT[2] * a[0][5] + iT[9] * a[5][0] - iT[15] * a[6][0] - iT[22] * a[4][0] - iT[3] * a[0][4] - iT[8] * a[1][4] + iT[16] * a[3][4] + iT[21] * a[2][4] + iT[4] * a[0][2] + iT[7] * a[1][2] - iT[17] * a[3][2] - iT[20] * a[2][2] - iT[5] * a[0][0] - iT[6] * a[1][0] + iT[18] * a[3][0] + iT[19] * a[2][0] - t[1] + add) >> shift);
1872
    dst[18] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[0][5] + iT[11] * a[1][5] - iT[13] * a[3][5] - iT[24] * a[2][5] - iT[1] * a[1][0] - iT[10] * a[0][0] + iT[14] * a[2][0] + iT[23] * a[3][0] - iT[2] * a[5][1] + iT[9] * a[0][4] + iT[15] * a[4][1] + iT[22] * a[6][1] - iT[3] * a[8][1] - iT[8] * a[1][1] + iT[16] * a[4][4] + iT[21] * a[7][1] - iT[4] * a[9][2] - iT[7] * a[5][2] + iT[17] * a[2][3] - iT[20] * a[7][3] - iT[5] * a[9][3] - iT[6] * a[8][2] - iT[18] * a[3][2] - iT[19] * a[6][3] + t[1] + add) >> shift);
1873
    dst[20] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[4][0] - iT[11] * a[6][0] - iT[13] * a[0][5] + iT[24] * a[5][0] + iT[1] * a[6][5] + iT[10] * a[4][5] - iT[14] * a[5][5] + iT[23] * a[0][0] - iT[2] * a[6][1] - iT[9] * a[3][4] - iT[15] * a[9][1] - iT[22] * a[8][4] + iT[3] * a[4][4] + iT[8] * a[7][1] - iT[16] * a[1][1] - iT[21] * a[8][1] - iT[4] * a[3][3] - iT[7] * a[2][3] + iT[17] * a[1][3] + iT[20] * a[0][3] + iT[5] * a[7][2] - iT[6] * a[2][2] + iT[18] * a[9][3] + iT[19] * a[5][3] + t[0] + add) >> shift);
1874
    dst[21] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[1][2] + iT[11] * a[8][2] - iT[13] * a[7][2] - iT[24] * a[4][3] + iT[1] * a[1][5] + iT[10] * a[8][5] - iT[14] * a[7][5] - iT[23] * a[4][0] + iT[2] * a[5][2] + iT[9] * a[9][2] + iT[15] * a[7][3] - iT[22] * a[2][3] + iT[3] * a[5][5] + iT[8] * a[9][5] + iT[16] * a[7][0] - iT[21] * a[2][0] + iT[4] * a[8][1] + iT[7] * a[9][4] + iT[17] * a[6][4] + iT[20] * a[3][1] + iT[5] * a[8][4] + iT[6] * a[9][1] + iT[18] * a[6][1] + iT[19] * a[3][4] + t[1] + add) >> shift);
1875
    dst[23] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[8][4] + iT[11] * a[9][1] + iT[13] * a[6][1] + iT[24] * a[3][4] - iT[1] * a[8][2] - iT[10] * a[1][2] + iT[14] * a[4][3] + iT[23] * a[7][2] - iT[2] * a[0][1] - iT[9] * a[1][1] + iT[15] * a[3][1] + iT[22] * a[2][1] + iT[3] * a[5][0] + iT[8] * a[9][0] + iT[16] * a[7][5] - iT[21] * a[2][5] - iT[4] * a[9][5] - iT[7] * a[8][0] - iT[17] * a[3][0] - iT[20] * a[6][5] + iT[5] * a[5][2] - iT[6] * a[0][3] - iT[18] * a[4][2] - iT[19] * a[6][2] - t[1] + add) >> shift);
1876
    dst[24] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[2][3] + iT[11] * a[7][3] + iT[13] * a[5][2] + iT[24] * a[9][2] + iT[1] * a[4][1] + iT[10] * a[7][4] - iT[14] * a[1][4] - iT[23] * a[8][4] - iT[2] * a[4][5] - iT[9] * a[7][0] + iT[15] * a[1][0] + iT[22] * a[8][0] + iT[3] * a[4][3] + iT[8] * a[6][3] + iT[16] * a[0][2] - iT[21] * a[5][3] - iT[4] * a[2][5] - iT[7] * a[3][5] + iT[17] * a[0][5] + iT[20] * a[1][5] + iT[5] * a[2][1] + iT[6] * a[3][1] - iT[18] * a[0][1] - iT[19] * a[1][1] - t[0] + add) >> shift);
1877
    dst[25] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[4][5] - iT[11] * a[6][5] - iT[13] * a[0][0] + iT[24] * a[5][5] - iT[1] * a[3][1] - iT[10] * a[2][1] + iT[14] * a[1][1] + iT[23] * a[0][1] + iT[2] * a[7][2] + iT[9] * a[4][3] - iT[15] * a[8][2] - iT[22] * a[1][2] + iT[3] * a[6][2] + iT[8] * a[3][3] + iT[16] * a[9][2] + iT[21] * a[8][3] + iT[4] * a[2][4] - iT[7] * a[7][4] - iT[17] * a[5][1] - iT[20] * a[9][1] - iT[5] * a[4][0] - iT[6] * a[6][0] - iT[18] * a[0][5] + iT[19] * a[5][0] - t[0] + add) >> shift);
1878
    dst[26] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[8][0] + iT[11] * a[1][0] - iT[13] * a[4][5] - iT[24] * a[7][0] + iT[1] * a[5][4] + iT[10] * a[9][4] + iT[14] * a[7][1] - iT[23] * a[2][1] - iT[2] * a[1][2] - iT[9] * a[0][2] + iT[15] * a[2][2] + iT[22] * a[3][2] - iT[3] * a[9][2] - iT[8] * a[8][3] - iT[16] * a[3][3] - iT[21] * a[6][2] + iT[4] * a[0][4] - iT[7] * a[5][1] + iT[17] * a[6][1] + iT[20] * a[4][1] + iT[5] * a[8][5] + iT[6] * a[1][5] - iT[18] * a[4][0] - iT[19] * a[7][5] - t[1] + add) >> shift);
1879
    dst[28] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[5][1] - iT[11] * a[9][1] - iT[13] * a[7][4] + iT[24] * a[2][4] + iT[1] * a[8][2] + iT[10] * a[9][3] + iT[14] * a[6][3] + iT[23] * a[3][2] - iT[2] * a[9][4] - iT[9] * a[8][1] - iT[15] * a[3][1] - iT[22] * a[6][4] + iT[3] * a[9][0] + iT[8] * a[5][0] - iT[16] * a[2][5] + iT[21] * a[7][5] - iT[4] * a[5][5] + iT[7] * a[0][0] + iT[17] * a[4][5] + iT[20] * a[6][5] + iT[5] * a[1][3] + iT[6] * a[0][3] - iT[18] * a[2][3] - iT[19] * a[3][3] + t[1] + add) >> shift);
1880
    dst[29] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[6][4] + iT[11] * a[3][1] + iT[13] * a[9][4] + iT[24] * a[8][1] - iT[1] * a[7][3] - iT[10] * a[4][2] + iT[14] * a[8][3] + iT[23] * a[1][3] - iT[2] * a[3][5] - iT[9] * a[2][5] + iT[15] * a[1][5] + iT[22] * a[0][5] + iT[3] * a[2][4] + iT[8] * a[3][4] - iT[16] * a[0][4] - iT[21] * a[1][4] + iT[4] * a[4][3] + iT[7] * a[7][2] - iT[17] * a[1][2] - iT[20] * a[8][2] - iT[5] * a[3][0] - iT[6] * a[6][5] - iT[18] * a[8][0] - iT[19] * a[9][5] + t[0] + add) >> shift);
1881
    dst[30] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[7][2] + iT[11] * a[2][2] - iT[13] * a[9][3] - iT[24] * a[5][3] - iT[1] * a[6][0] - iT[10] * a[4][0] + iT[14] * a[5][0] - iT[23] * a[0][5] - iT[2] * a[4][2] - iT[9] * a[6][2] - iT[15] * a[0][3] + iT[22] * a[5][2] + iT[3] * a[2][0] - iT[8] * a[7][0] - iT[16] * a[5][5] - iT[21] * a[9][5] + iT[4] * a[7][1] - iT[7] * a[2][1] + iT[17] * a[9][4] + iT[20] * a[5][4] + iT[5] * a[6][1] + iT[6] * a[4][1] - iT[18] * a[5][1] + iT[19] * a[0][4] + t[0] + add) >> shift);
1882
    dst[31] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[8][5] + iT[11] * a[1][5] - iT[13] * a[4][0] - iT[24] * a[7][5] - iT[1] * a[1][0] - iT[10] * a[8][0] + iT[14] * a[7][0] + iT[23] * a[4][5] - iT[2] * a[8][4] - iT[9] * a[1][4] + iT[15] * a[4][1] + iT[22] * a[7][4] + iT[3] * a[1][1] + iT[8] * a[8][1] - iT[16] * a[7][1] - iT[21] * a[4][4] + iT[4] * a[8][3] + iT[7] * a[1][3] - iT[17] * a[4][2] - iT[20] * a[7][3] - iT[5] * a[1][2] - iT[6] * a[8][2] + iT[18] * a[7][2] + iT[19] * a[4][3] + t[1] + add) >> shift);
1883
1884
    dst[ 2] = Clip3(outputMinimum, outputMaximum, (int)(   iT[ 4] * b[0] + iT[ 9] * b[1] + iT[14] * b[2] + iT[19] * b[3] + iT[24] * b[4] + iT[29] * b[5] + add) >> shift);
1885
    dst[ 7] = Clip3(outputMinimum, outputMaximum, (int)( - iT[14] * b[0] - iT[29] * b[1] - iT[19] * b[2] - iT[ 4] * b[3] + iT[ 9] * b[4] + iT[24] * b[5] + add) >> shift);
1886
    dst[12] = Clip3(outputMinimum, outputMaximum, (int)(   iT[24] * b[0] + iT[14] * b[1] - iT[ 9] * b[2] - iT[29] * b[3] - iT[ 4] * b[4] + iT[19] * b[5] + add) >> shift);
1887
    dst[17] = Clip3(outputMinimum, outputMaximum, (int)( - iT[29] * b[0] + iT[ 4] * b[1] + iT[24] * b[2] - iT[ 9] * b[3] - iT[19] * b[4] + iT[14] * b[5] + add) >> shift);
1888
    dst[22] = Clip3(outputMinimum, outputMaximum, (int)(   iT[19] * b[0] - iT[24] * b[1] + iT[ 4] * b[2] + iT[14] * b[3] - iT[29] * b[4] + iT[ 9] * b[5] + add) >> shift);
1889
    dst[27] = Clip3(outputMinimum, outputMaximum, (int)( - iT[ 9] * b[0] + iT[19] * b[1] - iT[29] * b[2] + iT[24] * b[3] - iT[14] * b[4] + iT[ 4] * b[5] + add) >> shift);
1890
1891
    dst[ 6] = Clip3(outputMinimum, outputMaximum, (int)(   iT[12] * c[0] + iT[25] * c[1] + add) >> shift);
1892
    dst[19] = Clip3(outputMinimum, outputMaximum, (int)( - iT[25] * c[0] + iT[12] * c[1] + add) >> shift);
1893
1894
    src++;
1895
    dst += 32;
1896
  }
1897
1898
  if (iSkipLine)
1899
  {
1900
    memset(dst, 0, (iSkipLine * 32) * sizeof(TCoeff));
1901
  }
1902
#else
1903
0
  _fastInverseMM< 32 >( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDCT8P32[TRANSFORM_INVERSE][0] );
1904
0
#endif
1905
0
}
1906
1907
#if ENABLE_SIMD_TRAFO
1908
1909
#define DONT_UNDEF_SIZE_AWARE_PER_EL_OP 1
1910
1911
}   // namespace vvenc
1912
1913
#include "Unit.h"
1914
1915
namespace vvenc {
1916
  
1917
void cpyCoeffCore( const Pel* src, ptrdiff_t stride, TCoeff* dst, unsigned width, unsigned height )
1918
2.01M
{
1919
670M
#define CPYCOEFF_OP( ADDR ) dst[ADDR] = src[ADDR];
1920
2.01M
#define CPYCOEFF_INC src += stride; dst += width;
1921
1922
670M
  SIZE_AWARE_PER_EL_OP( CPYCOEFF_OP, CPYCOEFF_INC );
1923
1924
2.01M
#undef CPYCOEFF_INC
1925
2.01M
#undef CPYCOEFF_OP
1926
2.01M
}
1927
1928
1929
void cpyResiCore( const TCoeff* src, Pel* dst, ptrdiff_t stride, unsigned width, unsigned height )
1930
824k
{
1931
186M
#define CPYRESI_OP( ADDR ) dst[ADDR] = Pel( src[ADDR] );
1932
824k
#define CPYRESI_INC dst += stride; src += width;
1933
1934
186M
  SIZE_AWARE_PER_EL_OP( CPYRESI_OP, CPYRESI_INC );
1935
1936
824k
#undef CPYRESI_INC
1937
824k
#undef CPYRESI_OP
1938
824k
}
1939
1940
1941
// Scalar in-place rounding, down-shift and clip of a coefficient block.
//
// The per-element step rewrites an entry as
//   Clip3( outputMin, outputMax, ( dst[ADDR] + round ) >> shift )
// and the increment step moves `dst` forward by `stride`.
//
// NOTE(review): the traversal itself comes from SIZE_AWARE_PER_EL_OP, which is
// defined outside this function. It is assumed to apply the element step across
// `width` entries and the increment step once per row for `height` rows -
// confirm against the macro definition.
//
//   dst       - buffer that is read and overwritten
//   width     - block width; not referenced here directly, presumably read by the macro
//   height    - block height; not referenced here directly, presumably read by the macro
//   stride    - amount added to `dst` by the increment step
//   outputMin - lower bound handed to Clip3
//   outputMax - upper bound handed to Clip3
//   round     - offset added before the shift
//   shift     - right-shift amount applied after adding `round`
void clipCore( TCoeff *dst, unsigned width, unsigned height, unsigned stride, const TCoeff outputMin, const TCoeff outputMax, const TCoeff round, const TCoeff shift )
{
  // The helper macro bodies are kept exactly as before (no trailing semicolon)
  // so the expansion inside SIZE_AWARE_PER_EL_OP is unchanged.
#define ROUND_SHIFT_CLIP( ADDR ) dst[ADDR] = Clip3( outputMin, outputMax, ( dst[ADDR] + round ) >> shift )
#define ROUND_SHIFT_CLIP_NEXT_ROW dst += stride

  SIZE_AWARE_PER_EL_OP( ROUND_SHIFT_CLIP, ROUND_SHIFT_CLIP_NEXT_ROW );

#undef ROUND_SHIFT_CLIP
#undef ROUND_SHIFT_CLIP_NEXT_ROW
}
1951
1952
1953
template<unsigned trSize>
1954
void fastInvCore_( const TMatrixCoeff* it, const TCoeff* src, TCoeff* dst, unsigned lines, unsigned reducedLines, unsigned rows )
1955
745k
{
1956
9.83M
  for( int k = 0; k < rows; k++ )
1957
9.09M
  {
1958
9.09M
    const TCoeff* srcPtr = &src[k * lines];
1959
141M
    for( int i = 0; i < reducedLines; i++ )
1960
132M
    {
1961
132M
            TCoeff*       dstPtr = &dst[i * trSize];
1962
132M
      const TMatrixCoeff*  itPtr =  &it[k * trSize];
1963
3.79G
      for( int j = 0; j < trSize; j++ )
1964
3.66G
      {
1965
3.66G
        *dstPtr++ += *srcPtr * *itPtr++;
1966
3.66G
      }
1967
132M
      srcPtr++;
1968
132M
    }
1969
9.09M
  }
1970
745k
}
void vvenc::fastInvCore_<4u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Line
Count
Source
1955
1.86k
{
1956
9.32k
  for( int k = 0; k < rows; k++ )
1957
7.45k
  {
1958
7.45k
    const TCoeff* srcPtr = &src[k * lines];
1959
145k
    for( int i = 0; i < reducedLines; i++ )
1960
138k
    {
1961
138k
            TCoeff*       dstPtr = &dst[i * trSize];
1962
138k
      const TMatrixCoeff*  itPtr =  &it[k * trSize];
1963
692k
      for( int j = 0; j < trSize; j++ )
1964
553k
      {
1965
553k
        *dstPtr++ += *srcPtr * *itPtr++;
1966
553k
      }
1967
138k
      srcPtr++;
1968
138k
    }
1969
7.45k
  }
1970
1.86k
}
void vvenc::fastInvCore_<8u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Line
Count
Source
1955
1.19k
{
1956
10.7k
  for( int k = 0; k < rows; k++ )
1957
9.57k
  {
1958
9.57k
    const TCoeff* srcPtr = &src[k * lines];
1959
220k
    for( int i = 0; i < reducedLines; i++ )
1960
211k
    {
1961
211k
            TCoeff*       dstPtr = &dst[i * trSize];
1962
211k
      const TMatrixCoeff*  itPtr =  &it[k * trSize];
1963
1.89M
      for( int j = 0; j < trSize; j++ )
1964
1.68M
      {
1965
1.68M
        *dstPtr++ += *srcPtr * *itPtr++;
1966
1.68M
      }
1967
211k
      srcPtr++;
1968
211k
    }
1969
9.57k
  }
1970
1.19k
}
void vvenc::fastInvCore_<16u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Line
Count
Source
1955
488k
{
1956
5.47M
  for( int k = 0; k < rows; k++ )
1957
4.98M
  {
1958
4.98M
    const TCoeff* srcPtr = &src[k * lines];
1959
69.1M
    for( int i = 0; i < reducedLines; i++ )
1960
64.1M
    {
1961
64.1M
            TCoeff*       dstPtr = &dst[i * trSize];
1962
64.1M
      const TMatrixCoeff*  itPtr =  &it[k * trSize];
1963
1.09G
      for( int j = 0; j < trSize; j++ )
1964
1.02G
      {
1965
1.02G
        *dstPtr++ += *srcPtr * *itPtr++;
1966
1.02G
      }
1967
64.1M
      srcPtr++;
1968
64.1M
    }
1969
4.98M
  }
1970
488k
}
void vvenc::fastInvCore_<32u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Line
Count
Source
1955
231k
{
1956
3.95M
  for( int k = 0; k < rows; k++ )
1957
3.71M
  {
1958
3.71M
    const TCoeff* srcPtr = &src[k * lines];
1959
57.2M
    for( int i = 0; i < reducedLines; i++ )
1960
53.4M
    {
1961
53.4M
            TCoeff*       dstPtr = &dst[i * trSize];
1962
53.4M
      const TMatrixCoeff*  itPtr =  &it[k * trSize];
1963
1.76G
      for( int j = 0; j < trSize; j++ )
1964
1.71G
      {
1965
1.71G
        *dstPtr++ += *srcPtr * *itPtr++;
1966
1.71G
      }
1967
53.4M
      srcPtr++;
1968
53.4M
    }
1969
3.71M
  }
1970
231k
}
void vvenc::fastInvCore_<64u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Line
Count
Source
1955
22.1k
{
1956
391k
  for( int k = 0; k < rows; k++ )
1957
369k
  {
1958
369k
    const TCoeff* srcPtr = &src[k * lines];
1959
14.8M
    for( int i = 0; i < reducedLines; i++ )
1960
14.4M
    {
1961
14.4M
            TCoeff*       dstPtr = &dst[i * trSize];
1962
14.4M
      const TMatrixCoeff*  itPtr =  &it[k * trSize];
1963
939M
      for( int j = 0; j < trSize; j++ )
1964
925M
      {
1965
925M
        *dstPtr++ += *srcPtr * *itPtr++;
1966
925M
      }
1967
14.4M
      srcPtr++;
1968
14.4M
    }
1969
369k
  }
1970
22.1k
}
1971
1972
1973
template<unsigned trSize>
1974
void fastFwdCore( const TMatrixCoeff* tc, const TCoeff* src, TCoeff* dst, unsigned line, unsigned reducedLine, unsigned cutoff, int shift )
1975
3.24M
{
1976
3.24M
  const int rnd_factor = 1 << ( shift - 1 );
1977
1978
48.7M
  for( int i = 0; i < reducedLine; i++ )
1979
45.5M
  {
1980
45.5M
          TCoeff*       dstPtr = dst;
1981
45.5M
    const TMatrixCoeff* iT     = tc;
1982
1983
682M
    for( int j = 0; j < cutoff; j++ )
1984
637M
    {
1985
637M
      int sum = 0;
1986
1987
18.9G
      for( int k = 0; k < trSize; k++ )
1988
18.3G
      {
1989
        // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]
1990
18.3G
        sum += src[k] * iT[k];
1991
18.3G
      }
1992
1993
637M
      dstPtr[i] = ( sum + rnd_factor ) >> shift;
1994
637M
      dstPtr   += line;
1995
637M
      iT       += trSize;
1996
637M
    }
1997
1998
45.5M
    src += trSize;
1999
45.5M
  }
2000
3.24M
}
Unexecuted instantiation: void vvenc::fastFwdCore<4u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int)
void vvenc::fastFwdCore<8u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int)
Line
Count
Source
1975
1.15M
{
1976
1.15M
  const int rnd_factor = 1 << ( shift - 1 );
1977
1978
15.0M
  for( int i = 0; i < reducedLine; i++ )
1979
13.8M
  {
1980
13.8M
          TCoeff*       dstPtr = dst;
1981
13.8M
    const TMatrixCoeff* iT     = tc;
1982
1983
122M
    for( int j = 0; j < cutoff; j++ )
1984
108M
    {
1985
108M
      int sum = 0;
1986
1987
980M
      for( int k = 0; k < trSize; k++ )
1988
871M
      {
1989
        // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]
1990
871M
        sum += src[k] * iT[k];
1991
871M
      }
1992
1993
108M
      dstPtr[i] = ( sum + rnd_factor ) >> shift;
1994
108M
      dstPtr   += line;
1995
108M
      iT       += trSize;
1996
108M
    }
1997
1998
13.8M
    src += trSize;
1999
13.8M
  }
2000
1.15M
}
void vvenc::fastFwdCore<16u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int)
Line
Count
Source
1975
1.05M
{
1976
1.05M
  const int rnd_factor = 1 << ( shift - 1 );
1977
1978
14.7M
  for( int i = 0; i < reducedLine; i++ )
1979
13.7M
  {
1980
13.7M
          TCoeff*       dstPtr = dst;
1981
13.7M
    const TMatrixCoeff* iT     = tc;
1982
1983
171M
    for( int j = 0; j < cutoff; j++ )
1984
157M
    {
1985
157M
      int sum = 0;
1986
1987
2.67G
      for( int k = 0; k < trSize; k++ )
1988
2.51G
      {
1989
        // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]
1990
2.51G
        sum += src[k] * iT[k];
1991
2.51G
      }
1992
1993
157M
      dstPtr[i] = ( sum + rnd_factor ) >> shift;
1994
157M
      dstPtr   += line;
1995
157M
      iT       += trSize;
1996
157M
    }
1997
1998
13.7M
    src += trSize;
1999
13.7M
  }
2000
1.05M
}
void vvenc::fastFwdCore<32u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int)
Line
Count
Source
1975
938k
{
1976
938k
  const int rnd_factor = 1 << ( shift - 1 );
1977
1978
14.8M
  for( int i = 0; i < reducedLine; i++ )
1979
13.9M
  {
1980
13.9M
          TCoeff*       dstPtr = dst;
1981
13.9M
    const TMatrixCoeff* iT     = tc;
1982
1983
289M
    for( int j = 0; j < cutoff; j++ )
1984
275M
    {
1985
275M
      int sum = 0;
1986
1987
9.07G
      for( int k = 0; k < trSize; k++ )
1988
8.80G
      {
1989
        // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]
1990
8.80G
        sum += src[k] * iT[k];
1991
8.80G
      }
1992
1993
275M
      dstPtr[i] = ( sum + rnd_factor ) >> shift;
1994
275M
      dstPtr   += line;
1995
275M
      iT       += trSize;
1996
275M
    }
1997
1998
13.9M
    src += trSize;
1999
13.9M
  }
2000
938k
}
void vvenc::fastFwdCore<64u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int)
Line
Count
Source
1975
95.5k
{
1976
95.5k
  const int rnd_factor = 1 << ( shift - 1 );
1977
1978
4.04M
  for( int i = 0; i < reducedLine; i++ )
1979
3.94M
  {
1980
3.94M
          TCoeff*       dstPtr = dst;
1981
3.94M
    const TMatrixCoeff* iT     = tc;
1982
1983
99.4M
    for( int j = 0; j < cutoff; j++ )
1984
95.5M
    {
1985
95.5M
      int sum = 0;
1986
1987
6.21G
      for( int k = 0; k < trSize; k++ )
1988
6.11G
      {
1989
        // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]
1990
6.11G
        sum += src[k] * iT[k];
1991
6.11G
      }
1992
1993
95.5M
      dstPtr[i] = ( sum + rnd_factor ) >> shift;
1994
95.5M
      dstPtr   += line;
1995
95.5M
      iT       += trSize;
1996
95.5M
    }
1997
1998
3.94M
    src += trSize;
1999
3.94M
  }
2000
95.5k
}
2001
2002
2003
// Fills the function-pointer table with the scalar routines defined in this file.
TCoeffOps::TCoeffOps()
{
  // Copy / clip helpers: the "4" and "8" slots both receive the same routine here.
  cpyCoeff4  = cpyCoeffCore;
  cpyCoeff8  = cpyCoeffCore;
  cpyResi4   = cpyResiCore;
  cpyResi8   = cpyResiCore;
  roundClip4 = clipCore;
  roundClip8 = clipCore;

  // Inverse kernels: slots 0..4 hold the instantiations for sizes 4, 8, 16, 32, 64.
  fastInvCore[0] = fastInvCore_< 4>;
  fastInvCore[1] = fastInvCore_< 8>;
  fastInvCore[2] = fastInvCore_<16>;
  fastInvCore[3] = fastInvCore_<32>;
  fastInvCore[4] = fastInvCore_<64>;

  // Forward kernels: the 1D and 2D tables use the same slot layout and are
  // populated with the same instantiations.
  fastFwdCore_1D[0] = fastFwdCore< 4>;
  fastFwdCore_2D[0] = fastFwdCore< 4>;

  fastFwdCore_1D[1] = fastFwdCore< 8>;
  fastFwdCore_2D[1] = fastFwdCore< 8>;

  fastFwdCore_1D[2] = fastFwdCore<16>;
  fastFwdCore_2D[2] = fastFwdCore<16>;

  fastFwdCore_1D[3] = fastFwdCore<32>;
  fastFwdCore_2D[3] = fastFwdCore<32>;

  fastFwdCore_1D[4] = fastFwdCore<64>;
  fastFwdCore_2D[4] = fastFwdCore<64>;
}
2027
2028
// Global coefficient-operation table; default construction runs
// TCoeffOps::TCoeffOps(), which installs the scalar routines from this file.
TCoeffOps g_tCoeffOps;
2029
2030
#endif
2031
2032
2033
} // namespace vvenc
2034
2035
//! \}
2036