Coverage Report

Created: 2026-06-15 06:25

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/vvenc/source/Lib/CommonLib/TrQuant_EMT.cpp
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
43
44
/** \file     TrQuant_EMT.cpp
45
    \brief    transform and quantization class
46
*/
47
48
#include "TrQuant_EMT.h"
49
#include "Rom.h"
50
51
#include <stdlib.h>
52
#include <math.h>
53
#include <memory.h>
54
55
//! \ingroup CommonLib
56
//! \{
57
58
namespace vvenc {
59
60
// ********************************** DCT-II **********************************
61
  
62
#if ENABLE_SIMD_TRAFO
63
template<int uiTrSize>
64
inline void _fastInverseMM( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum, const TMatrixCoeff* iT );
65
66
template<>
67
inline void _fastInverseMM<2>( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum, const TMatrixCoeff* iT )
68
0
{
69
0
  const int rnd_factor  = 1 << (shift - 1);
70
0
  const int reducedLine = line - iSkipLine;
71
0
  const int cutoff      = 2 - iSkipLine2;
72
0
73
0
  memset( dst, 0, reducedLine * 2 * sizeof( TCoeff ) );
74
0
75
0
  for( int k = 0; k < cutoff; k++ )
76
0
  {
77
0
    const TCoeff* srcPtr = &src[k * line];
78
0
    for( int i = 0; i < reducedLine; i++ )
79
0
    {
80
0
            TCoeff*       dstPtr = &dst[i << 1];
81
0
      const TMatrixCoeff*  itPtr =  &iT[k << 1];
82
0
      const TCoeff        srcVal = *srcPtr;
83
0
      for( int j = 0; j < 2; j++ )
84
0
      {
85
0
        *dstPtr++ += srcVal * *itPtr++;
86
0
      }
87
0
      srcPtr++;
88
0
    }
89
0
  }
90
0
91
0
  for( int i = 0; i < reducedLine; i++ )
92
0
  {
93
0
    TCoeff* dstPtr = &dst[i << 1];
94
0
    for( int j = 0; j < 2; j++, dstPtr++ )
95
0
    {
96
0
      *dstPtr = Clip3( outputMinimum, outputMaximum, ( int ) ( *dstPtr + rnd_factor ) >> shift );
97
0
    }
98
0
  }
99
0
100
0
  if( iSkipLine )
101
0
  {
102
0
    memset( dst + ( reducedLine << 1 ), 0, ( iSkipLine << 1 ) * sizeof( TCoeff ) );
103
0
  }
104
0
}
105
106
template<>
107
inline void _fastInverseMM<4>( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum, const TMatrixCoeff* iT )
108
1.78k
{
109
1.78k
  const int rnd_factor  = 1 << ( shift - 1 );
110
1.78k
  const int reducedLine = line - iSkipLine;
111
1.78k
  const int cutoff      = 4 - iSkipLine2;
112
113
1.78k
  memset( dst, 0, reducedLine * 4 * sizeof( TCoeff ) );
114
115
1.78k
#if ENABLE_SIMD_TRAFO
116
1.78k
  g_tCoeffOps.fastInvCore[0]( iT, src, dst, line, reducedLine, cutoff );
117
1.78k
  g_tCoeffOps.roundClip4( dst, 4, reducedLine, 4, outputMinimum, outputMaximum, rnd_factor, shift );
118
#else
119
  for( int k = 0; k < cutoff; k++ )
120
  {
121
    const TCoeff* srcPtr = &src[k * line];
122
    for( int i = 0; i < reducedLine; i++ )
123
    {
124
            TCoeff*       dstPtr = &dst[i << 2];
125
      const TMatrixCoeff*  itPtr =  &iT[k << 2];
126
      for( int j = 0; j < 4; j++ )
127
      {
128
        *dstPtr++ += *srcPtr * *itPtr++;
129
      }
130
      srcPtr++;
131
    }
132
  }
133
134
  for( int i = 0; i < reducedLine; i++ )
135
  {
136
    TCoeff* dstPtr = &dst[i << 2];
137
    for( int j = 0; j < 4; j++, dstPtr++ )
138
    {
139
      *dstPtr = Clip3( outputMinimum, outputMaximum, ( int ) ( *dstPtr + rnd_factor ) >> shift );
140
    }
141
  }
142
#endif
143
144
1.78k
  if( iSkipLine )
145
0
  {
146
0
    memset( dst + ( reducedLine << 2 ), 0, ( iSkipLine << 2 ) * sizeof( TCoeff ) );
147
0
  }
148
1.78k
}
149
150
#endif
151
152
template< int uiTrSize >
153
inline void _fastInverseMM( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum, const TMatrixCoeff* iT )
154
608k
{
155
608k
  const int  rnd_factor  = 1 << (shift - 1);
156
608k
  const int  reducedLine = line - iSkipLine;
157
608k
  const int  cutoff      = uiTrSize - iSkipLine2;
158
159
608k
  memset( dst, 0, reducedLine * uiTrSize * sizeof( TCoeff ) );
160
161
608k
#if ENABLE_SIMD_TRAFO
162
608k
  g_tCoeffOps.fastInvCore[Log2( uiTrSize ) - 2]( iT, src, dst, line, reducedLine, cutoff );
163
608k
  g_tCoeffOps.roundClip8( dst, uiTrSize, reducedLine, uiTrSize, outputMinimum, outputMaximum, rnd_factor, shift );
164
#else
165
  for( int k = 0; k < cutoff; k++ )
166
  {
167
    const TCoeff* srcPtr = &src[k * line];
168
    for( int i = 0; i < reducedLine; i++ )
169
    {
170
            TCoeff*       dstPtr = &dst[i * uiTrSize];
171
      const TMatrixCoeff*  itPtr =  &iT[k * uiTrSize];
172
      for( int j = 0; j < uiTrSize; j++ )
173
      {
174
        *dstPtr++ += *srcPtr * *itPtr++;
175
      }
176
      srcPtr++;
177
    }
178
  }
179
180
  for( int i = 0; i < reducedLine; i++ )
181
  {
182
    TCoeff* dstPtr = &dst[i * uiTrSize];
183
    for( int j = 0; j < uiTrSize; j++, dstPtr++ )
184
    {
185
      *dstPtr = Clip3( outputMinimum, outputMaximum, ( int ) ( *dstPtr + rnd_factor ) >> shift );
186
    }
187
  }
188
#endif
189
190
608k
  if( iSkipLine )
191
110k
  {
192
110k
    memset( dst + ( reducedLine*uiTrSize ), 0, ( iSkipLine*uiTrSize ) * sizeof( TCoeff ) );
193
110k
  }
194
608k
}
void vvenc::_fastInverseMM<16>(int const*, int*, int, int, int, int, int, int, short const*)
Line
Count
Source
154
402k
{
155
402k
  const int  rnd_factor  = 1 << (shift - 1);
156
402k
  const int  reducedLine = line - iSkipLine;
157
402k
  const int  cutoff      = uiTrSize - iSkipLine2;
158
159
402k
  memset( dst, 0, reducedLine * uiTrSize * sizeof( TCoeff ) );
160
161
402k
#if ENABLE_SIMD_TRAFO
162
402k
  g_tCoeffOps.fastInvCore[Log2( uiTrSize ) - 2]( iT, src, dst, line, reducedLine, cutoff );
163
402k
  g_tCoeffOps.roundClip8( dst, uiTrSize, reducedLine, uiTrSize, outputMinimum, outputMaximum, rnd_factor, shift );
164
#else
165
  for( int k = 0; k < cutoff; k++ )
166
  {
167
    const TCoeff* srcPtr = &src[k * line];
168
    for( int i = 0; i < reducedLine; i++ )
169
    {
170
            TCoeff*       dstPtr = &dst[i * uiTrSize];
171
      const TMatrixCoeff*  itPtr =  &iT[k * uiTrSize];
172
      for( int j = 0; j < uiTrSize; j++ )
173
      {
174
        *dstPtr++ += *srcPtr * *itPtr++;
175
      }
176
      srcPtr++;
177
    }
178
  }
179
180
  for( int i = 0; i < reducedLine; i++ )
181
  {
182
    TCoeff* dstPtr = &dst[i * uiTrSize];
183
    for( int j = 0; j < uiTrSize; j++, dstPtr++ )
184
    {
185
      *dstPtr = Clip3( outputMinimum, outputMaximum, ( int ) ( *dstPtr + rnd_factor ) >> shift );
186
    }
187
  }
188
#endif
189
190
402k
  if( iSkipLine )
191
68.9k
  {
192
68.9k
    memset( dst + ( reducedLine*uiTrSize ), 0, ( iSkipLine*uiTrSize ) * sizeof( TCoeff ) );
193
68.9k
  }
194
402k
}
void vvenc::_fastInverseMM<32>(int const*, int*, int, int, int, int, int, int, short const*)
Line
Count
Source
154
187k
{
155
187k
  const int  rnd_factor  = 1 << (shift - 1);
156
187k
  const int  reducedLine = line - iSkipLine;
157
187k
  const int  cutoff      = uiTrSize - iSkipLine2;
158
159
187k
  memset( dst, 0, reducedLine * uiTrSize * sizeof( TCoeff ) );
160
161
187k
#if ENABLE_SIMD_TRAFO
162
187k
  g_tCoeffOps.fastInvCore[Log2( uiTrSize ) - 2]( iT, src, dst, line, reducedLine, cutoff );
163
187k
  g_tCoeffOps.roundClip8( dst, uiTrSize, reducedLine, uiTrSize, outputMinimum, outputMaximum, rnd_factor, shift );
164
#else
165
  for( int k = 0; k < cutoff; k++ )
166
  {
167
    const TCoeff* srcPtr = &src[k * line];
168
    for( int i = 0; i < reducedLine; i++ )
169
    {
170
            TCoeff*       dstPtr = &dst[i * uiTrSize];
171
      const TMatrixCoeff*  itPtr =  &iT[k * uiTrSize];
172
      for( int j = 0; j < uiTrSize; j++ )
173
      {
174
        *dstPtr++ += *srcPtr * *itPtr++;
175
      }
176
      srcPtr++;
177
    }
178
  }
179
180
  for( int i = 0; i < reducedLine; i++ )
181
  {
182
    TCoeff* dstPtr = &dst[i * uiTrSize];
183
    for( int j = 0; j < uiTrSize; j++, dstPtr++ )
184
    {
185
      *dstPtr = Clip3( outputMinimum, outputMaximum, ( int ) ( *dstPtr + rnd_factor ) >> shift );
186
    }
187
  }
188
#endif
189
190
187k
  if( iSkipLine )
191
33.9k
  {
192
33.9k
    memset( dst + ( reducedLine*uiTrSize ), 0, ( iSkipLine*uiTrSize ) * sizeof( TCoeff ) );
193
33.9k
  }
194
187k
}
void vvenc::_fastInverseMM<64>(int const*, int*, int, int, int, int, int, int, short const*)
Line
Count
Source
154
17.2k
{
155
17.2k
  const int  rnd_factor  = 1 << (shift - 1);
156
17.2k
  const int  reducedLine = line - iSkipLine;
157
17.2k
  const int  cutoff      = uiTrSize - iSkipLine2;
158
159
17.2k
  memset( dst, 0, reducedLine * uiTrSize * sizeof( TCoeff ) );
160
161
17.2k
#if ENABLE_SIMD_TRAFO
162
17.2k
  g_tCoeffOps.fastInvCore[Log2( uiTrSize ) - 2]( iT, src, dst, line, reducedLine, cutoff );
163
17.2k
  g_tCoeffOps.roundClip8( dst, uiTrSize, reducedLine, uiTrSize, outputMinimum, outputMaximum, rnd_factor, shift );
164
#else
165
  for( int k = 0; k < cutoff; k++ )
166
  {
167
    const TCoeff* srcPtr = &src[k * line];
168
    for( int i = 0; i < reducedLine; i++ )
169
    {
170
            TCoeff*       dstPtr = &dst[i * uiTrSize];
171
      const TMatrixCoeff*  itPtr =  &iT[k * uiTrSize];
172
      for( int j = 0; j < uiTrSize; j++ )
173
      {
174
        *dstPtr++ += *srcPtr * *itPtr++;
175
      }
176
      srcPtr++;
177
    }
178
  }
179
180
  for( int i = 0; i < reducedLine; i++ )
181
  {
182
    TCoeff* dstPtr = &dst[i * uiTrSize];
183
    for( int j = 0; j < uiTrSize; j++, dstPtr++ )
184
    {
185
      *dstPtr = Clip3( outputMinimum, outputMaximum, ( int ) ( *dstPtr + rnd_factor ) >> shift );
186
    }
187
  }
188
#endif
189
190
17.2k
  if( iSkipLine )
191
8.09k
  {
192
8.09k
    memset( dst + ( reducedLine*uiTrSize ), 0, ( iSkipLine*uiTrSize ) * sizeof( TCoeff ) );
193
8.09k
  }
194
17.2k
}
void vvenc::_fastInverseMM<8>(int const*, int*, int, int, int, int, int, int, short const*)
Line
Count
Source
154
1.10k
{
155
1.10k
  const int  rnd_factor  = 1 << (shift - 1);
156
1.10k
  const int  reducedLine = line - iSkipLine;
157
1.10k
  const int  cutoff      = uiTrSize - iSkipLine2;
158
159
1.10k
  memset( dst, 0, reducedLine * uiTrSize * sizeof( TCoeff ) );
160
161
1.10k
#if ENABLE_SIMD_TRAFO
162
1.10k
  g_tCoeffOps.fastInvCore[Log2( uiTrSize ) - 2]( iT, src, dst, line, reducedLine, cutoff );
163
1.10k
  g_tCoeffOps.roundClip8( dst, uiTrSize, reducedLine, uiTrSize, outputMinimum, outputMaximum, rnd_factor, shift );
164
#else
165
  for( int k = 0; k < cutoff; k++ )
166
  {
167
    const TCoeff* srcPtr = &src[k * line];
168
    for( int i = 0; i < reducedLine; i++ )
169
    {
170
            TCoeff*       dstPtr = &dst[i * uiTrSize];
171
      const TMatrixCoeff*  itPtr =  &iT[k * uiTrSize];
172
      for( int j = 0; j < uiTrSize; j++ )
173
      {
174
        *dstPtr++ += *srcPtr * *itPtr++;
175
      }
176
      srcPtr++;
177
    }
178
  }
179
180
  for( int i = 0; i < reducedLine; i++ )
181
  {
182
    TCoeff* dstPtr = &dst[i * uiTrSize];
183
    for( int j = 0; j < uiTrSize; j++, dstPtr++ )
184
    {
185
      *dstPtr = Clip3( outputMinimum, outputMaximum, ( int ) ( *dstPtr + rnd_factor ) >> shift );
186
    }
187
  }
188
#endif
189
190
1.10k
  if( iSkipLine )
191
0
  {
192
0
    memset( dst + ( reducedLine*uiTrSize ), 0, ( iSkipLine*uiTrSize ) * sizeof( TCoeff ) );
193
0
  }
194
1.10k
}
195
196
//Fast DCT-II transforms
197
void fastForwardDCT2_B2(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
198
24.8k
{
199
24.8k
  int j;
200
24.8k
  int E, O;
201
24.8k
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
202
203
24.8k
  const TMatrixCoeff *iT = g_trCoreDCT2P2[TRANSFORM_FORWARD][0];
204
205
24.8k
  TCoeff *pCoef = dst;
206
24.8k
  const int  reducedLine = line - iSkipLine;
207
337k
  for (j = 0; j<reducedLine; j++)
208
312k
  {
209
    /* E and O */
210
312k
    E = src[0] + src[1];
211
312k
    O = src[0] - src[1];
212
213
312k
    dst[0] = (iT[0] * E + add) >> shift;
214
312k
    dst[line] = (iT[2] * O + add) >> shift;
215
216
217
312k
    src += 2;
218
312k
    dst++;
219
312k
  }
220
24.8k
  if (iSkipLine)
221
0
  {
222
0
    dst = pCoef + reducedLine;
223
0
    for (j = 0; j<2; j++)
224
0
    {
225
0
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
226
0
      dst += line;
227
0
    }
228
0
  }
229
24.8k
}
230
231
void fastInverseDCT2_B2(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
232
15.0k
{
233
15.0k
  int j;
234
15.0k
  int E, O;
235
15.0k
  int add = 1 << (shift - 1);
236
237
15.0k
  const TMatrixCoeff *iT = g_trCoreDCT2P2[TRANSFORM_INVERSE][0];
238
239
15.0k
  const int  reducedLine = line - iSkipLine;
240
201k
  for (j = 0; j<reducedLine; j++)
241
186k
  {
242
    /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
243
186k
    E = iT[0] * (src[0] + src[line]);
244
186k
    O = iT[2] * (src[0] - src[line]);
245
246
    /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
247
186k
    dst[0] = Clip3(outputMinimum, outputMaximum, (E + add) >> shift);
248
186k
    dst[1] = Clip3(outputMinimum, outputMaximum, (O + add) >> shift);
249
250
186k
    src++;
251
186k
    dst += 2;
252
186k
  }
253
15.0k
  if (iSkipLine)
254
0
  {
255
0
    memset(dst, 0, (iSkipLine << 1) * sizeof(TCoeff));
256
0
  }
257
15.0k
}
258
259
/** 4x4 forward transform implemented using partial butterfly structure (1D)
260
*  \param src   input data (residual)
261
*  \param dst   output data (transform coefficients)
262
*  \param shift specifies right shift after 1D transform
263
*  \param line
264
*/
265
void fastForwardDCT2_B4(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
266
659k
{
267
659k
  int j;
268
659k
  TCoeff E[2], O[2];
269
659k
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
270
271
659k
  const TMatrixCoeff *iT = g_trCoreDCT2P4[TRANSFORM_FORWARD][0];
272
273
659k
  TCoeff *pCoef = dst;
274
659k
  const int  reducedLine = line - iSkipLine;
275
7.06M
  for (j = 0; j<reducedLine; j++)
276
6.40M
  {
277
    /* E and O */
278
6.40M
    E[0] = src[0] + src[3];
279
6.40M
    O[0] = src[0] - src[3];
280
6.40M
    E[1] = src[1] + src[2];
281
6.40M
    O[1] = src[1] - src[2];
282
283
6.40M
    dst[0] = (iT[0] * E[0] + iT[1] * E[1] + add) >> shift;
284
6.40M
    dst[2 * line] = (iT[8] * E[0] + iT[9] * E[1] + add) >> shift;
285
6.40M
    dst[line] = (iT[4] * O[0] + iT[5] * O[1] + add) >> shift;
286
6.40M
    dst[3 * line] = (iT[12] * O[0] + iT[13] * O[1] + add) >> shift;
287
288
6.40M
    src += 4;
289
6.40M
    dst++;
290
6.40M
  }
291
659k
  if (iSkipLine)
292
154k
  {
293
154k
    dst = pCoef + reducedLine;
294
771k
    for (j = 0; j<4; j++)
295
617k
    {
296
617k
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
297
617k
      dst += line;
298
617k
    }
299
154k
  }
300
659k
}
301
302
/** 4x4 inverse transform implemented using partial butterfly structure (1D)
303
*  \param src   input data (transform coefficients)
304
*  \param dst   output data (residual)
305
*  \param shift specifies right shift after 1D transform
306
*  \param line
307
*  \param outputMinimum  minimum for clipping
308
*  \param outputMaximum  maximum for clipping
309
*/
310
void fastInverseDCT2_B4( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum )
311
308k
{
312
#if 0
313
  const TMatrixCoeff *iT = g_trCoreDCT2P4[0];
314
315
  _fastInverseMM<4>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, iT );
316
#else
317
308k
  int j;
318
308k
  int E[2], O[2];
319
308k
  int add = 1 << ( shift - 1 );
320
321
308k
  const TMatrixCoeff *iT = g_trCoreDCT2P4[TRANSFORM_INVERSE][0];
322
323
308k
#if ENABLE_SIMD_TRAFO
324
308k
  TCoeff* orgDst = dst;
325
326
308k
#endif
327
308k
  const int  reducedLine = line - iSkipLine;
328
2.83M
  for( j = 0; j < reducedLine; j++ )
329
2.52M
  {
330
    /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
331
2.52M
    O[0] = iT[1 * 4 + 0] * src[line] + iT[3 * 4 + 0] * src[3 * line];
332
2.52M
    O[1] = iT[1 * 4 + 1] * src[line] + iT[3 * 4 + 1] * src[3 * line];
333
2.52M
    E[0] = iT[0 * 4 + 0] * src[   0] + iT[2 * 4 + 0] * src[2 * line];
334
2.52M
    E[1] = iT[0 * 4 + 1] * src[   0] + iT[2 * 4 + 1] * src[2 * line];
335
336
    /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
337
2.52M
#if ENABLE_SIMD_TRAFO
338
2.52M
    dst[0] = E[0] + O[0];
339
2.52M
    dst[1] = E[1] + O[1];
340
2.52M
    dst[2] = E[1] - O[1];
341
2.52M
    dst[3] = E[0] - O[0];
342
#else
343
    dst[0] = Clip3( outputMinimum, outputMaximum, ( E[0] + O[0] + add ) >> shift );
344
    dst[1] = Clip3( outputMinimum, outputMaximum, ( E[1] + O[1] + add ) >> shift );
345
    dst[2] = Clip3( outputMinimum, outputMaximum, ( E[1] - O[1] + add ) >> shift );
346
    dst[3] = Clip3( outputMinimum, outputMaximum, ( E[0] - O[0] + add ) >> shift );
347
#endif
348
349
2.52M
    src++;
350
2.52M
    dst += 4;
351
2.52M
  }
352
353
308k
#if ENABLE_SIMD_TRAFO
354
308k
  g_tCoeffOps.roundClip4( orgDst, 4, reducedLine, 4, outputMinimum, outputMaximum, add, shift );
355
356
308k
#endif
357
308k
  if( iSkipLine )
358
67.0k
  {
359
67.0k
    memset( dst, 0, ( iSkipLine << 2 ) * sizeof( TCoeff ) );
360
67.0k
  }
361
308k
#endif
362
308k
}
363
364
365
366
template< int uiTrSize >
367
inline void _fastForwardMM( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TMatrixCoeff* tc )
368
2.70M
{
369
#if !ENABLE_SIMD_TRAFO
370
  const int  rnd_factor  = 1 << (shift - 1);
371
#endif
372
2.70M
  const int  reducedLine = line - iSkipLine;
373
2.70M
  const int  cutoff      = uiTrSize - iSkipLine2;
374
2.70M
  TCoeff *pCoef;
375
376
2.70M
#if ENABLE_SIMD_TRAFO
377
2.70M
  if( line == 1 )
378
288
  {
379
288
    g_tCoeffOps.fastFwdCore_1D[Log2( uiTrSize ) - 2]( tc, src, dst, line, reducedLine, cutoff, shift );
380
288
  }
381
2.70M
  else
382
2.70M
  {
383
2.70M
    g_tCoeffOps.fastFwdCore_2D[Log2( uiTrSize ) - 2]( tc, src, dst, line, reducedLine, cutoff, shift );
384
2.70M
  }
385
#else
386
  for( int i = 0; i<reducedLine; i++ )
387
  {
388
    pCoef = dst;
389
    const TMatrixCoeff* iT = tc;
390
    for( int j = 0; j<cutoff; j++ )
391
    {
392
      int iSum = 0;
393
      for( int k = 0; k<uiTrSize; k++ )
394
      {
395
        // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]
396
        iSum += src[k] * iT[k];
397
      }
398
      pCoef[i] = (iSum + rnd_factor) >> shift;
399
      pCoef += line;
400
      iT += uiTrSize;
401
    }
402
    src += uiTrSize;
403
  }
404
#endif
405
406
2.70M
  if( iSkipLine )
407
447k
  {
408
447k
    pCoef = dst + reducedLine;
409
4.56M
    for( int j = 0; j<cutoff; j++ )
410
4.11M
    {
411
4.11M
      memset(pCoef, 0, sizeof(TCoeff) * iSkipLine);
412
4.11M
      pCoef += line;
413
4.11M
    }
414
447k
  }
415
416
2.70M
  if( iSkipLine2 )
417
1.20M
  {
418
1.20M
    pCoef = dst + line*cutoff;
419
1.20M
    memset(pCoef, 0, sizeof(TCoeff) * line * iSkipLine2);
420
1.20M
  }
421
2.70M
}
void vvenc::_fastForwardMM<8>(int const*, int*, int, int, int, int, short const*)
Line
Count
Source
368
988k
{
369
#if !ENABLE_SIMD_TRAFO
370
  const int  rnd_factor  = 1 << (shift - 1);
371
#endif
372
988k
  const int  reducedLine = line - iSkipLine;
373
988k
  const int  cutoff      = uiTrSize - iSkipLine2;
374
988k
  TCoeff *pCoef;
375
376
988k
#if ENABLE_SIMD_TRAFO
377
988k
  if( line == 1 )
378
0
  {
379
0
    g_tCoeffOps.fastFwdCore_1D[Log2( uiTrSize ) - 2]( tc, src, dst, line, reducedLine, cutoff, shift );
380
0
  }
381
988k
  else
382
988k
  {
383
988k
    g_tCoeffOps.fastFwdCore_2D[Log2( uiTrSize ) - 2]( tc, src, dst, line, reducedLine, cutoff, shift );
384
988k
  }
385
#else
386
  for( int i = 0; i<reducedLine; i++ )
387
  {
388
    pCoef = dst;
389
    const TMatrixCoeff* iT = tc;
390
    for( int j = 0; j<cutoff; j++ )
391
    {
392
      int iSum = 0;
393
      for( int k = 0; k<uiTrSize; k++ )
394
      {
395
        // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]
396
        iSum += src[k] * iT[k];
397
      }
398
      pCoef[i] = (iSum + rnd_factor) >> shift;
399
      pCoef += line;
400
      iT += uiTrSize;
401
    }
402
    src += uiTrSize;
403
  }
404
#endif
405
406
988k
  if( iSkipLine )
407
124k
  {
408
124k
    pCoef = dst + reducedLine;
409
1.12M
    for( int j = 0; j<cutoff; j++ )
410
997k
    {
411
997k
      memset(pCoef, 0, sizeof(TCoeff) * iSkipLine);
412
997k
      pCoef += line;
413
997k
    }
414
124k
  }
415
416
988k
  if( iSkipLine2 )
417
120k
  {
418
120k
    pCoef = dst + line*cutoff;
419
120k
    memset(pCoef, 0, sizeof(TCoeff) * line * iSkipLine2);
420
120k
  }
421
988k
}
void vvenc::_fastForwardMM<16>(int const*, int*, int, int, int, int, short const*)
Line
Count
Source
368
871k
{
369
#if !ENABLE_SIMD_TRAFO
370
  const int  rnd_factor  = 1 << (shift - 1);
371
#endif
372
871k
  const int  reducedLine = line - iSkipLine;
373
871k
  const int  cutoff      = uiTrSize - iSkipLine2;
374
871k
  TCoeff *pCoef;
375
376
871k
#if ENABLE_SIMD_TRAFO
377
871k
  if( line == 1 )
378
236
  {
379
236
    g_tCoeffOps.fastFwdCore_1D[Log2( uiTrSize ) - 2]( tc, src, dst, line, reducedLine, cutoff, shift );
380
236
  }
381
871k
  else
382
871k
  {
383
871k
    g_tCoeffOps.fastFwdCore_2D[Log2( uiTrSize ) - 2]( tc, src, dst, line, reducedLine, cutoff, shift );
384
871k
  }
385
#else
386
  for( int i = 0; i<reducedLine; i++ )
387
  {
388
    pCoef = dst;
389
    const TMatrixCoeff* iT = tc;
390
    for( int j = 0; j<cutoff; j++ )
391
    {
392
      int iSum = 0;
393
      for( int k = 0; k<uiTrSize; k++ )
394
      {
395
        // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]
396
        iSum += src[k] * iT[k];
397
      }
398
      pCoef[i] = (iSum + rnd_factor) >> shift;
399
      pCoef += line;
400
      iT += uiTrSize;
401
    }
402
    src += uiTrSize;
403
  }
404
#endif
405
406
871k
  if( iSkipLine )
407
150k
  {
408
150k
    pCoef = dst + reducedLine;
409
1.36M
    for( int j = 0; j<cutoff; j++ )
410
1.21M
    {
411
1.21M
      memset(pCoef, 0, sizeof(TCoeff) * iSkipLine);
412
1.21M
      pCoef += line;
413
1.21M
    }
414
150k
  }
415
416
871k
  if( iSkipLine2 )
417
548k
  {
418
548k
    pCoef = dst + line*cutoff;
419
548k
    memset(pCoef, 0, sizeof(TCoeff) * line * iSkipLine2);
420
548k
  }
421
871k
}
void vvenc::_fastForwardMM<32>(int const*, int*, int, int, int, int, short const*)
Line
Count
Source
368
764k
{
369
#if !ENABLE_SIMD_TRAFO
370
  const int  rnd_factor  = 1 << (shift - 1);
371
#endif
372
764k
  const int  reducedLine = line - iSkipLine;
373
764k
  const int  cutoff      = uiTrSize - iSkipLine2;
374
764k
  TCoeff *pCoef;
375
376
764k
#if ENABLE_SIMD_TRAFO
377
764k
  if( line == 1 )
378
52
  {
379
52
    g_tCoeffOps.fastFwdCore_1D[Log2( uiTrSize ) - 2]( tc, src, dst, line, reducedLine, cutoff, shift );
380
52
  }
381
764k
  else
382
764k
  {
383
764k
    g_tCoeffOps.fastFwdCore_2D[Log2( uiTrSize ) - 2]( tc, src, dst, line, reducedLine, cutoff, shift );
384
764k
  }
385
#else
386
  for( int i = 0; i<reducedLine; i++ )
387
  {
388
    pCoef = dst;
389
    const TMatrixCoeff* iT = tc;
390
    for( int j = 0; j<cutoff; j++ )
391
    {
392
      int iSum = 0;
393
      for( int k = 0; k<uiTrSize; k++ )
394
      {
395
        // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]
396
        iSum += src[k] * iT[k];
397
      }
398
      pCoef[i] = (iSum + rnd_factor) >> shift;
399
      pCoef += line;
400
      iT += uiTrSize;
401
    }
402
    src += uiTrSize;
403
  }
404
#endif
405
406
764k
  if( iSkipLine )
407
134k
  {
408
134k
    pCoef = dst + reducedLine;
409
1.21M
    for( int j = 0; j<cutoff; j++ )
410
1.07M
    {
411
1.07M
      memset(pCoef, 0, sizeof(TCoeff) * iSkipLine);
412
1.07M
      pCoef += line;
413
1.07M
    }
414
134k
  }
415
416
764k
  if( iSkipLine2 )
417
458k
  {
418
458k
    pCoef = dst + line*cutoff;
419
458k
    memset(pCoef, 0, sizeof(TCoeff) * line * iSkipLine2);
420
458k
  }
421
764k
}
void vvenc::_fastForwardMM<64>(int const*, int*, int, int, int, int, short const*)
Line
Count
Source
368
76.8k
{
369
#if !ENABLE_SIMD_TRAFO
370
  const int  rnd_factor  = 1 << (shift - 1);
371
#endif
372
76.8k
  const int  reducedLine = line - iSkipLine;
373
76.8k
  const int  cutoff      = uiTrSize - iSkipLine2;
374
76.8k
  TCoeff *pCoef;
375
376
76.8k
#if ENABLE_SIMD_TRAFO
377
76.8k
  if( line == 1 )
378
0
  {
379
0
    g_tCoeffOps.fastFwdCore_1D[Log2( uiTrSize ) - 2]( tc, src, dst, line, reducedLine, cutoff, shift );
380
0
  }
381
76.8k
  else
382
76.8k
  {
383
76.8k
    g_tCoeffOps.fastFwdCore_2D[Log2( uiTrSize ) - 2]( tc, src, dst, line, reducedLine, cutoff, shift );
384
76.8k
  }
385
#else
386
  for( int i = 0; i<reducedLine; i++ )
387
  {
388
    pCoef = dst;
389
    const TMatrixCoeff* iT = tc;
390
    for( int j = 0; j<cutoff; j++ )
391
    {
392
      int iSum = 0;
393
      for( int k = 0; k<uiTrSize; k++ )
394
      {
395
        // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]
396
        iSum += src[k] * iT[k];
397
      }
398
      pCoef[i] = (iSum + rnd_factor) >> shift;
399
      pCoef += line;
400
      iT += uiTrSize;
401
    }
402
    src += uiTrSize;
403
  }
404
#endif
405
406
76.8k
  if( iSkipLine )
407
37.5k
  {
408
37.5k
    pCoef = dst + reducedLine;
409
866k
    for( int j = 0; j<cutoff; j++ )
410
829k
    {
411
829k
      memset(pCoef, 0, sizeof(TCoeff) * iSkipLine);
412
829k
      pCoef += line;
413
829k
    }
414
37.5k
  }
415
416
76.8k
  if( iSkipLine2 )
417
76.8k
  {
418
76.8k
    pCoef = dst + line*cutoff;
419
76.8k
    memset(pCoef, 0, sizeof(TCoeff) * line * iSkipLine2);
420
76.8k
  }
421
76.8k
}
422
423
424
425
/** 8x8 forward transform implemented using partial butterfly structure (1D)
426
*  \param src   input data (residual)
427
*  \param dst   output data (transform coefficients)
428
*  \param shift specifies right shift after 1D transform
429
*  \param line
430
*/
431
void fastForwardDCT2_B8( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2 )
432
965k
{
433
#if !JVET_M0497_MATRIX_MULT
434
  int j, k;
435
  TCoeff E[4], O[4];
436
  TCoeff EE[2], EO[2];
437
  TCoeff add = ( shift > 0 ) ? ( 1 << ( shift - 1 ) ) : 0;
438
439
  const TMatrixCoeff *iT = g_trCoreDCT2P8[TRANSFORM_FORWARD][0];
440
441
  TCoeff *pCoef = dst;
442
  const int  reducedLine = line - iSkipLine;
443
  for( j = 0; j < reducedLine; j++ )
444
  {
445
    /* E and O*/
446
    for( k = 0; k < 4; k++ )
447
    {
448
      E[k] = src[k] + src[7 - k];
449
      O[k] = src[k] - src[7 - k];
450
    }
451
    /* EE and EO */
452
    EE[0] = E[0] + E[3];
453
    EO[0] = E[0] - E[3];
454
    EE[1] = E[1] + E[2];
455
    EO[1] = E[1] - E[2];
456
457
    dst[0       ] = (iT[ 0] * EE[0] + iT[ 1] * EE[1] + add) >> shift;
458
    dst[4 * line] = (iT[32] * EE[0] + iT[33] * EE[1] + add) >> shift;
459
    dst[2 * line] = (iT[16] * EO[0] + iT[17] * EO[1] + add) >> shift;
460
    dst[6 * line] = (iT[48] * EO[0] + iT[49] * EO[1] + add) >> shift;
461
462
    dst[    line] = (iT[ 8] * O[0] + iT[ 9] * O[1] + iT[10] * O[2] + iT[11] * O[3] + add) >> shift;
463
    dst[3 * line] = (iT[24] * O[0] + iT[25] * O[1] + iT[26] * O[2] + iT[27] * O[3] + add) >> shift;
464
    dst[5 * line] = (iT[40] * O[0] + iT[41] * O[1] + iT[42] * O[2] + iT[43] * O[3] + add) >> shift;
465
    dst[7 * line] = (iT[56] * O[0] + iT[57] * O[1] + iT[58] * O[2] + iT[59] * O[3] + add) >> shift;
466
467
    src += 8;
468
    dst++;
469
  }
470
  if( iSkipLine )
471
  {
472
    dst = pCoef + reducedLine;
473
    for( j = 0; j < 8; j++ )
474
    {
475
      memset( dst, 0, sizeof( TCoeff )*iSkipLine );
476
      dst += line;
477
    }
478
  }
479
#else
480
965k
  _fastForwardMM< 8 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT2P8[TRANSFORM_FORWARD][0] );
481
965k
#endif
482
965k
}
483
484
/** 8x8 inverse transform implemented using partial butterfly structure (1D)
485
*  \param src   input data (transform coefficients)
486
*  \param dst   output data (residual)
487
*  \param shift specifies right shift after 1D transform
488
*  \param line
489
*  \param outputMinimum  minimum for clipping
490
*  \param outputMaximum  maximum for clipping
491
*/
492
void fastInverseDCT2_B8(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
493
460k
{
494
#if 0
495
  const TMatrixCoeff *iT = g_trCoreDCT2P8[0];
496
497
  _fastInverseMM<8>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, iT );
498
#else
499
460k
  int j, k;
500
460k
  int E[4], O[4];
501
460k
  int EE[2], EO[2];
502
460k
  int add = 1 << (shift - 1);
503
504
460k
  const TMatrixCoeff *iT = g_trCoreDCT2P8[TRANSFORM_INVERSE][0];
505
506
460k
#if ENABLE_SIMD_TRAFO
507
460k
  TCoeff *orgDst = dst;
508
509
460k
#endif
510
460k
  const int  reducedLine = line - iSkipLine;
511
5.32M
  for( j = 0; j < reducedLine; j++ )
512
4.86M
  {
513
    /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
514
24.3M
    for( k = 0; k < 4; k++ )
515
19.4M
    {
516
19.4M
      O[k] = iT[1 * 8 + k] * src[line] + iT[3 * 8 + k] * src[3 * line] + iT[5 * 8 + k] * src[5 * line] + iT[7 * 8 + k] * src[7 * line];
517
19.4M
    }
518
519
4.86M
    EO[0] = iT[2 * 8 + 0] * src[2 * line] + iT[6 * 8 + 0] * src[6 * line];
520
4.86M
    EO[1] = iT[2 * 8 + 1] * src[2 * line] + iT[6 * 8 + 1] * src[6 * line];
521
4.86M
    EE[0] = iT[0 * 8 + 0] * src[0       ] + iT[4 * 8 + 0] * src[4 * line];
522
4.86M
    EE[1] = iT[0 * 8 + 1] * src[0       ] + iT[4 * 8 + 1] * src[4 * line];
523
524
    /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
525
4.86M
    E[0] = EE[0] + EO[0];
526
4.86M
    E[3] = EE[0] - EO[0];
527
4.86M
    E[1] = EE[1] + EO[1];
528
4.86M
    E[2] = EE[1] - EO[1];
529
530
24.3M
    for( k = 0; k < 4; k++ )
531
19.4M
    {
532
19.4M
#if ENABLE_SIMD_TRAFO
533
19.4M
      dst[k    ] = E[    k] + O[    k];
534
19.4M
      dst[k + 4] = E[3 - k] - O[3 - k];
535
#else
536
      dst[k    ] = Clip3( outputMinimum, outputMaximum, ( E[    k] + O[    k] + add ) >> shift );
537
      dst[k + 4] = Clip3( outputMinimum, outputMaximum, ( E[3 - k] - O[3 - k] + add ) >> shift );
538
#endif
539
19.4M
    }
540
4.86M
    src++;
541
4.86M
    dst += 8;
542
4.86M
  }
543
544
460k
#if ENABLE_SIMD_TRAFO
545
460k
  g_tCoeffOps.roundClip8( orgDst, 8, reducedLine, 8, outputMinimum, outputMaximum, add, shift );
546
547
460k
#endif
548
460k
  if( iSkipLine )
549
53.7k
  {
550
53.7k
    memset( dst, 0, ( iSkipLine << 3 ) * sizeof( TCoeff ) );
551
53.7k
  }
552
460k
#endif
553
460k
}
554
555
556
/** 16x16 forward transform implemented using partial butterfly structure (1D)
557
*  \param src   input data (residual)
558
*  \param dst   output data (transform coefficients)
559
*  \param shift specifies right shift after 1D transform
560
*  \param line
561
*/
562
void fastForwardDCT2_B16(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
563
839k
{
564
#if !JVET_M0497_MATRIX_MULT
565
  int j, k;
566
  TCoeff E  [8], O  [8];
567
  TCoeff EE [4], EO [4];
568
  TCoeff EEE[2], EEO[2];
569
  TCoeff add = ( shift > 0 ) ? ( 1 << ( shift - 1 ) ) : 0;
570
571
  const TMatrixCoeff *iT = g_trCoreDCT2P16[TRANSFORM_FORWARD][0];
572
573
  TCoeff *pCoef = dst;
574
  const int  reducedLine = line - iSkipLine;
575
  for( j = 0; j < reducedLine; j++ )
576
  {
577
    /* E and O*/
578
    for( k = 0; k < 8; k++ )
579
    {
580
      E[k] = src[k] + src[15 - k];
581
      O[k] = src[k] - src[15 - k];
582
    }
583
    /* EE and EO */
584
    for( k = 0; k < 4; k++ )
585
    {
586
      EE[k] = E[k] + E[7 - k];
587
      EO[k] = E[k] - E[7 - k];
588
    }
589
    /* EEE and EEO */
590
    EEE[0] = EE[0] + EE[3];
591
    EEO[0] = EE[0] - EE[3];
592
    EEE[1] = EE[1] + EE[2];
593
    EEO[1] = EE[1] - EE[2];
594
595
    dst[ 0       ] = ( iT[ 0     ] * EEE[0] + iT[          1] * EEE[1] + add ) >> shift;
596
    dst[ 8 * line] = ( iT[ 8 * 16] * EEE[0] + iT[ 8 * 16 + 1] * EEE[1] + add ) >> shift;
597
    dst[ 4 * line] = ( iT[ 4 * 16] * EEO[0] + iT[ 4 * 16 + 1] * EEO[1] + add ) >> shift;
598
    dst[12 * line] = ( iT[12 * 16] * EEO[0] + iT[12 * 16 + 1] * EEO[1] + add ) >> shift;
599
600
    for( k = 2; k < 16; k += 4 )
601
    {
602
      dst[k*line] = ( iT[k * 16] * EO[0] + iT[k * 16 + 1] * EO[1] + iT[k * 16 + 2] * EO[2] + iT[k * 16 + 3] * EO[3] + add ) >> shift;
603
    }
604
605
    for( k = 1; k < 16; k += 2 )
606
    {
607
      dst[k*line] = ( iT[k * 16    ] * O[0] + iT[k * 16 + 1] * O[1] + iT[k * 16 + 2] * O[2] + iT[k * 16 + 3] * O[3] +
608
                      iT[k * 16 + 4] * O[4] + iT[k * 16 + 5] * O[5] + iT[k * 16 + 6] * O[6] + iT[k * 16 + 7] * O[7] + add ) >> shift;
609
    }
610
611
    src += 16;
612
    dst++;
613
614
  }
615
  if( iSkipLine )
616
  {
617
    dst = pCoef + reducedLine;
618
    for( j = 0; j < 16; j++ )
619
    {
620
      memset( dst, 0, sizeof( TCoeff )*iSkipLine );
621
      dst += line;
622
    }
623
  }
624
#else
625
839k
  _fastForwardMM< 16 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT2P16[TRANSFORM_FORWARD][0] );
626
839k
#endif
627
839k
}
628
629
/** 16x16 inverse transform implemented using partial butterfly structure (1D)
630
*  \param src            input data (transform coefficients)
631
*  \param dst            output data (residual)
632
*  \param shift          specifies right shift after 1D transform
633
*  \param line
634
*  \param outputMinimum  minimum for clipping
635
*  \param outputMaximum  maximum for clipping
636
*/
637
void fastInverseDCT2_B16( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum )
638
397k
{
639
397k
#if ENABLE_SIMD_TRAFO
640
397k
  const TMatrixCoeff *iT = g_trCoreDCT2P16[TRANSFORM_INVERSE][0];
641
642
397k
  _fastInverseMM<16>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, iT );
643
#else
644
  int j, k;
645
  int E  [8], O  [8];
646
  int EE [4], EO [4];
647
  int EEE[2], EEO[2];
648
  int add = 1 << ( shift - 1 );
649
650
  const TMatrixCoeff *iT = g_trCoreDCT2P16[TRANSFORM_INVERSE][0];
651
652
#if ENABLE_SIMD_TRAFO
653
  TCoeff *orgDst = dst;
654
655
#endif
656
  const int  reducedLine = line - iSkipLine;
657
658
  for( j = 0; j < reducedLine; j++ )
659
  {
660
    /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
661
    for( k = 0; k < 8; k++ )
662
    {
663
      O[k] = iT[1 * 16 + k] * src[    line] + iT[ 3 * 16 + k] * src[ 3 * line] + iT[ 5 * 16 + k] * src[ 5 * line] + iT[ 7 * 16 + k] * src[ 7 * line] +
664
        iT[9 * 16 + k] * src[9 * line] + iT[11 * 16 + k] * src[11 * line] + iT[13 * 16 + k] * src[13 * line] + iT[15 * 16 + k] * src[15 * line];
665
    }
666
    for( k = 0; k < 4; k++ )
667
    {
668
      EO[k] = iT[2 * 16 + k] * src[2 * line] + iT[6 * 16 + k] * src[6 * line] + iT[10 * 16 + k] * src[10 * line] + iT[14 * 16 + k] * src[14 * line];
669
    }
670
    EEO[0] = iT[4 * 16    ] * src[4 * line] + iT[12 * 16    ] * src[12 * line];
671
    EEE[0] = iT[0         ] * src[0       ] + iT[ 8 * 16    ] * src[ 8 * line];
672
    EEO[1] = iT[4 * 16 + 1] * src[4 * line] + iT[12 * 16 + 1] * src[12 * line];
673
    EEE[1] = iT[0 * 16 + 1] * src[0       ] + iT[ 8 * 16 + 1] * src[ 8 * line];
674
675
    /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
676
    for( k = 0; k < 2; k++ )
677
    {
678
      EE[k    ] = EEE[    k] + EEO[    k];
679
      EE[k + 2] = EEE[1 - k] - EEO[1 - k];
680
    }
681
    for( k = 0; k < 4; k++ )
682
    {
683
      E[k    ] = EE[    k] + EO[    k];
684
      E[k + 4] = EE[3 - k] - EO[3 - k];
685
    }
686
    for( k = 0; k < 8; k++ )
687
    {
688
#if ENABLE_SIMD_TRAFO
689
      dst[k    ] = E[    k] + O[    k];
690
      dst[k + 8] = E[7 - k] - O[7 - k];
691
#else
692
      dst[k    ] = Clip3( outputMinimum, outputMaximum, ( E[    k] + O[    k] + add ) >> shift );
693
      dst[k + 8] = Clip3( outputMinimum, outputMaximum, ( E[7 - k] - O[7 - k] + add ) >> shift );
694
#endif
695
    }
696
    src++;
697
    dst += 16;
698
  }
699
700
#if ENABLE_SIMD_TRAFO
701
  g_tCoeffOps.roundClip8( orgDst, 16, reducedLine, 16, outputMinimum, outputMaximum, add, shift );
702
703
#endif
704
  if( iSkipLine )
705
  {
706
    memset( dst, 0, ( iSkipLine << 4 ) * sizeof( TCoeff ) );
707
  }
708
#endif
709
397k
}
710
711
712
713
/** 32x32 forward transform implemented using partial butterfly structure (1D)
714
*  \param src   input data (residual)
715
*  \param dst   output data (transform coefficients)
716
*  \param shift specifies right shift after 1D transform
717
*  \param line
718
*/
719
void fastForwardDCT2_B32( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2 )
720
764k
{
721
#if !JVET_M0497_MATRIX_MULT
722
  int j, k;
723
  TCoeff E   [16], O   [16];
724
  TCoeff EE  [ 8], EO  [ 8];
725
  TCoeff EEE [ 4], EEO [ 4];
726
  TCoeff EEEE[ 2], EEEO[ 2];
727
  TCoeff add = ( shift > 0 ) ? ( 1 << ( shift - 1 ) ) : 0;
728
729
  const TMatrixCoeff *iT = g_trCoreDCT2P32[TRANSFORM_FORWARD][0];
730
731
  TCoeff *pCoef = dst;
732
  const int  reducedLine = line - iSkipLine;
733
  for (j = 0; j<reducedLine; j++)
734
  {
735
    /* E and O*/
736
    for (k = 0;k<16;k++)
737
    {
738
      E[k] = src[k] + src[31 - k];
739
      O[k] = src[k] - src[31 - k];
740
    }
741
    /* EE and EO */
742
    for (k = 0;k<8;k++)
743
    {
744
      EE[k] = E[k] + E[15 - k];
745
      EO[k] = E[k] - E[15 - k];
746
    }
747
    /* EEE and EEO */
748
    for (k = 0;k<4;k++)
749
    {
750
      EEE[k] = EE[k] + EE[7 - k];
751
      EEO[k] = EE[k] - EE[7 - k];
752
    }
753
    /* EEEE and EEEO */
754
    EEEE[0] = EEE[0] + EEE[3];
755
    EEEO[0] = EEE[0] - EEE[3];
756
    EEEE[1] = EEE[1] + EEE[2];
757
    EEEO[1] = EEE[1] - EEE[2];
758
759
    dst[0] = (iT[0 * 32 + 0] * EEEE[0] + iT[0 * 32 + 1] * EEEE[1] + add) >> shift;
760
    dst[16 * line] = (iT[16 * 32 + 0] * EEEE[0] + iT[16 * 32 + 1] * EEEE[1] + add) >> shift;
761
    dst[8 * line] = (iT[8 * 32 + 0] * EEEO[0] + iT[8 * 32 + 1] * EEEO[1] + add) >> shift;
762
    dst[24 * line] = (iT[24 * 32 + 0] * EEEO[0] + iT[24 * 32 + 1] * EEEO[1] + add) >> shift;
763
    for (k = 4;k<32;k += 8)
764
    {
765
      dst[k*line] = (iT[k * 32 + 0] * EEO[0] + iT[k * 32 + 1] * EEO[1] + iT[k * 32 + 2] * EEO[2] + iT[k * 32 + 3] * EEO[3] + add) >> shift;
766
    }
767
    for (k = 2;k<32;k += 4)
768
    {
769
      dst[k*line] = (iT[k * 32 + 0] * EO[0] + iT[k * 32 + 1] * EO[1] + iT[k * 32 + 2] * EO[2] + iT[k * 32 + 3] * EO[3] +
770
                      iT[k * 32 + 4] * EO[4] + iT[k * 32 + 5] * EO[5] + iT[k * 32 + 6] * EO[6] + iT[k * 32 + 7] * EO[7] + add) >> shift;
771
    }
772
    for (k = 1;k<32;k += 2)
773
    {
774
      dst[k*line] = (iT[k * 32 + 0] * O[0] + iT[k * 32 + 1] * O[1] + iT[k * 32 + 2] * O[2] + iT[k * 32 + 3] * O[3] +
775
                      iT[k * 32 + 4] * O[4] + iT[k * 32 + 5] * O[5] + iT[k * 32 + 6] * O[6] + iT[k * 32 + 7] * O[7] +
776
                      iT[k * 32 + 8] * O[8] + iT[k * 32 + 9] * O[9] + iT[k * 32 + 10] * O[10] + iT[k * 32 + 11] * O[11] +
777
                      iT[k * 32 + 12] * O[12] + iT[k * 32 + 13] * O[13] + iT[k * 32 + 14] * O[14] + iT[k * 32 + 15] * O[15] + add) >> shift;
778
    }
779
    src += 32;
780
    dst++;
781
  }
782
  if (iSkipLine)
783
  {
784
    dst = pCoef + reducedLine;
785
    for (j = 0; j<32; j++)
786
    {
787
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
788
      dst += line;
789
    }
790
  }
791
#else
792
764k
  _fastForwardMM< 32 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT2P32[TRANSFORM_FORWARD][0] );
793
764k
#endif
794
764k
}
795
796
/** 32x32 inverse transform implemented using partial butterfly structure (1D)
797
*  \param src   input data (transform coefficients)
798
*  \param dst   output data (residual)
799
*  \param shift specifies right shift after 1D transform
800
*  \param line
801
*  \param outputMinimum  minimum for clipping
802
*  \param outputMaximum  maximum for clipping
803
*/
804
void fastInverseDCT2_B32(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
805
187k
{
806
187k
#if ENABLE_SIMD_TRAFO
807
187k
  const TMatrixCoeff *iT = g_trCoreDCT2P32[TRANSFORM_INVERSE][0];
808
809
187k
  _fastInverseMM<32>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, iT );
810
#else
811
  int j, k;
812
  int E[16], O[16];
813
  int EE[8], EO[8];
814
  int EEE[4], EEO[4];
815
  int EEEE[2], EEEO[2];
816
  int add = 1 << (shift - 1);
817
818
  const TMatrixCoeff *iT = g_trCoreDCT2P32[TRANSFORM_INVERSE][0];
819
820
#if ENABLE_SIMD_TRAFO
821
  TCoeff *orgDst = dst;
822
823
#endif
824
  const int  reducedLine = line - iSkipLine;
825
  for (j = 0; j<reducedLine; j++)
826
  {
827
    /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
828
    for (k = 0;k<16;k++)
829
    {
830
      O[k] = iT[1 * 32 + k] * src[line] + iT[3 * 32 + k] * src[3 * line] + iT[5 * 32 + k] * src[5 * line] + iT[7 * 32 + k] * src[7 * line] +
831
        iT[9 * 32 + k] * src[9 * line] + iT[11 * 32 + k] * src[11 * line] + iT[13 * 32 + k] * src[13 * line] + iT[15 * 32 + k] * src[15 * line] +
832
        iT[17 * 32 + k] * src[17 * line] + iT[19 * 32 + k] * src[19 * line] + iT[21 * 32 + k] * src[21 * line] + iT[23 * 32 + k] * src[23 * line] +
833
        iT[25 * 32 + k] * src[25 * line] + iT[27 * 32 + k] * src[27 * line] + iT[29 * 32 + k] * src[29 * line] + iT[31 * 32 + k] * src[31 * line];
834
    }
835
    for (k = 0;k<8;k++)
836
    {
837
      EO[k] = iT[2 * 32 + k] * src[2 * line] + iT[6 * 32 + k] * src[6 * line] + iT[10 * 32 + k] * src[10 * line] + iT[14 * 32 + k] * src[14 * line] +
838
        iT[18 * 32 + k] * src[18 * line] + iT[22 * 32 + k] * src[22 * line] + iT[26 * 32 + k] * src[26 * line] + iT[30 * 32 + k] * src[30 * line];
839
    }
840
    for (k = 0;k<4;k++)
841
    {
842
      EEO[k] = iT[4 * 32 + k] * src[4 * line] + iT[12 * 32 + k] * src[12 * line] + iT[20 * 32 + k] * src[20 * line] + iT[28 * 32 + k] * src[28 * line];
843
    }
844
    EEEO[0] = iT[8 * 32 + 0] * src[8 * line] + iT[24 * 32 + 0] * src[24 * line];
845
    EEEO[1] = iT[8 * 32 + 1] * src[8 * line] + iT[24 * 32 + 1] * src[24 * line];
846
    EEEE[0] = iT[0 * 32 + 0] * src[0] + iT[16 * 32 + 0] * src[16 * line];
847
    EEEE[1] = iT[0 * 32 + 1] * src[0] + iT[16 * 32 + 1] * src[16 * line];
848
849
    /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
850
    EEE[0] = EEEE[0] + EEEO[0];
851
    EEE[3] = EEEE[0] - EEEO[0];
852
    EEE[1] = EEEE[1] + EEEO[1];
853
    EEE[2] = EEEE[1] - EEEO[1];
854
    for (k = 0;k<4;k++)
855
    {
856
      EE[k] = EEE[k] + EEO[k];
857
      EE[k + 4] = EEE[3 - k] - EEO[3 - k];
858
    }
859
    for (k = 0;k<8;k++)
860
    {
861
      E[k] = EE[k] + EO[k];
862
      E[k + 8] = EE[7 - k] - EO[7 - k];
863
    }
864
    for (k = 0;k<16;k++)
865
    {
866
#if ENABLE_SIMD_TRAFO
867
      dst[k     ] = E[k     ] + O[k     ];
868
      dst[k + 16] = E[15 - k] - O[15 - k];
869
#else
870
      dst[k] = Clip3(outputMinimum, outputMaximum, (E[k] + O[k] + add) >> shift);
871
      dst[k + 16] = Clip3(outputMinimum, outputMaximum, (E[15 - k] - O[15 - k] + add) >> shift);
872
#endif
873
    }
874
    src++;
875
    dst += 32;
876
  }
877
878
#if ENABLE_SIMD_TRAFO
879
  g_tCoeffOps.roundClip8( orgDst, 32, reducedLine, 32, outputMinimum, outputMaximum, add, shift );
880
881
#endif
882
  if (iSkipLine)
883
  {
884
    memset(dst, 0, (iSkipLine << 5) * sizeof(TCoeff));
885
  }
886
#endif
887
187k
}
888
889
void fastForwardDCT2_B64(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
890
76.8k
{
891
#if !JVET_M0497_MATRIX_MULT
892
  int rnd_factor = 1 << (shift - 1);
893
894
  const int uiTrSize = 64;
895
  const TMatrixCoeff *iT = g_trCoreDCT2P64[TRANSFORM_FORWARD][0];
896
897
  int   j, k;
898
  TCoeff E[32], O[32];
899
  TCoeff EE[16], EO[16];
900
  TCoeff EEE[8], EEO[8];
901
  TCoeff EEEE[4], EEEO[4];
902
  TCoeff EEEEE[2], EEEEO[2];
903
  TCoeff *tmp = dst;
904
905
  //bool zo = iSkipLine2 >= 32;
906
  bool zo = iSkipLine2 != 0;
907
  for (j = 0; j<line - iSkipLine; j++)
908
  {
909
    /* E and O*/
910
    for (k = 0;k<32;k++)
911
    {
912
      E[k] = src[k] + src[63 - k];
913
      O[k] = src[k] - src[63 - k];
914
    }
915
    /* EE and EO */
916
    for (k = 0;k<16;k++)
917
    {
918
      EE[k] = E[k] + E[31 - k];
919
      EO[k] = E[k] - E[31 - k];
920
    }
921
    /* EEE and EEO */
922
    for (k = 0;k<8;k++)
923
    {
924
      EEE[k] = EE[k] + EE[15 - k];
925
      EEO[k] = EE[k] - EE[15 - k];
926
    }
927
    /* EEEE and EEEO */
928
    for (k = 0;k<4;k++)
929
    {
930
      EEEE[k] = EEE[k] + EEE[7 - k];
931
      EEEO[k] = EEE[k] - EEE[7 - k];
932
    }
933
    /* EEEEE and EEEEO */
934
    EEEEE[0] = EEEE[0] + EEEE[3];
935
    EEEEO[0] = EEEE[0] - EEEE[3];
936
    EEEEE[1] = EEEE[1] + EEEE[2];
937
    EEEEO[1] = EEEE[1] - EEEE[2];
938
939
    dst[0] = (iT[0 * 64 + 0] * EEEEE[0] + iT[0 * 64 + 1] * EEEEE[1] + rnd_factor) >> shift;
940
    dst[16 * line] = (iT[16 * 64 + 0] * EEEEO[0] + iT[16 * 64 + 1] * EEEEO[1] + rnd_factor) >> shift;
941
942
    if (!zo)
943
    {
944
      dst[32 * line] = (iT[32 * 64 + 0] * EEEEE[0] + iT[32 * 64 + 1] * EEEEE[1] + rnd_factor) >> shift;
945
      dst[48 * line] = (iT[48 * 64 + 0] * EEEEO[0] + iT[48 * 64 + 1] * EEEEO[1] + rnd_factor) >> shift;
946
    }
947
    for (k = 8;k<(zo ? 32 : 64);k += 16)
948
    {
949
      dst[k*line] = (iT[k * 64 + 0] * EEEO[0] + iT[k * 64 + 1] * EEEO[1] + iT[k * 64 + 2] * EEEO[2] + iT[k * 64 + 3] * EEEO[3] + rnd_factor) >> shift;
950
    }
951
    for (k = 4;k<(zo ? 32 : 64);k += 8)
952
    {
953
      dst[k*line] = (iT[k * 64 + 0] * EEO[0] + iT[k * 64 + 1] * EEO[1] + iT[k * 64 + 2] * EEO[2] + iT[k * 64 + 3] * EEO[3] +
954
                      iT[k * 64 + 4] * EEO[4] + iT[k * 64 + 5] * EEO[5] + iT[k * 64 + 6] * EEO[6] + iT[k * 64 + 7] * EEO[7] + rnd_factor) >> shift;
955
    }
956
    for (k = 2;k<(zo ? 32 : 64);k += 4)
957
    {
958
      dst[k*line] = (iT[k * 64 + 0] * EO[0] + iT[k * 64 + 1] * EO[1] + iT[k * 64 + 2] * EO[2] + iT[k * 64 + 3] * EO[3] +
959
                      iT[k * 64 + 4] * EO[4] + iT[k * 64 + 5] * EO[5] + iT[k * 64 + 6] * EO[6] + iT[k * 64 + 7] * EO[7] +
960
                      iT[k * 64 + 8] * EO[8] + iT[k * 64 + 9] * EO[9] + iT[k * 64 + 10] * EO[10] + iT[k * 64 + 11] * EO[11] +
961
                      iT[k * 64 + 12] * EO[12] + iT[k * 64 + 13] * EO[13] + iT[k * 64 + 14] * EO[14] + iT[k * 64 + 15] * EO[15] + rnd_factor) >> shift;
962
    }
963
    for (k = 1;k<(zo ? 32 : 64);k += 2)
964
    {
965
      dst[k*line] = (iT[k * 64 + 0] * O[0] + iT[k * 64 + 1] * O[1] + iT[k * 64 + 2] * O[2] + iT[k * 64 + 3] * O[3] +
966
                      iT[k * 64 + 4] * O[4] + iT[k * 64 + 5] * O[5] + iT[k * 64 + 6] * O[6] + iT[k * 64 + 7] * O[7] +
967
                      iT[k * 64 + 8] * O[8] + iT[k * 64 + 9] * O[9] + iT[k * 64 + 10] * O[10] + iT[k * 64 + 11] * O[11] +
968
                      iT[k * 64 + 12] * O[12] + iT[k * 64 + 13] * O[13] + iT[k * 64 + 14] * O[14] + iT[k * 64 + 15] * O[15] +
969
                      iT[k * 64 + 16] * O[16] + iT[k * 64 + 17] * O[17] + iT[k * 64 + 18] * O[18] + iT[k * 64 + 19] * O[19] +
970
                      iT[k * 64 + 20] * O[20] + iT[k * 64 + 21] * O[21] + iT[k * 64 + 22] * O[22] + iT[k * 64 + 23] * O[23] +
971
                      iT[k * 64 + 24] * O[24] + iT[k * 64 + 25] * O[25] + iT[k * 64 + 26] * O[26] + iT[k * 64 + 27] * O[27] +
972
                      iT[k * 64 + 28] * O[28] + iT[k * 64 + 29] * O[29] + iT[k * 64 + 30] * O[30] + iT[k * 64 + 31] * O[31] + rnd_factor) >> shift;
973
    }
974
    src += uiTrSize;
975
    dst++;
976
  }
977
978
  const int  reducedLine = line - iSkipLine;
979
  const int  cutoff = uiTrSize - iSkipLine2;
980
  if (iSkipLine)
981
  {
982
    dst = tmp + reducedLine;
983
    for (j = 0; j<cutoff; j++)
984
    {
985
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
986
      dst += line;
987
    }
988
  }
989
  if (iSkipLine2)
990
  {
991
    dst = tmp + line*cutoff;
992
    memset(dst, 0, sizeof(TCoeff)*line*iSkipLine2);
993
  }
994
#else
995
76.8k
  _fastForwardMM< 64 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT2P64[TRANSFORM_FORWARD][0] );
996
76.8k
#endif
997
76.8k
}
998
999
void fastInverseDCT2_B64(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1000
17.2k
{
1001
17.2k
#if ENABLE_SIMD_TRAFO
1002
17.2k
  const TMatrixCoeff *iT = g_trCoreDCT2P64[TRANSFORM_INVERSE][0];
1003
1004
17.2k
  _fastInverseMM<64>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, iT );
1005
#else
1006
  int rnd_factor = 1 << (shift - 1);
1007
  const int uiTrSize = 64;
1008
  const TMatrixCoeff *iT = g_trCoreDCT2P64[TRANSFORM_INVERSE][0];
1009
1010
#if ENABLE_SIMD_TRAFO
1011
  TCoeff *orgDst = dst;
1012
1013
#endif
1014
  int    j, k;
1015
  TCoeff E[32], O[32];
1016
  TCoeff EE[16], EO[16];
1017
  TCoeff EEE[8], EEO[8];
1018
  TCoeff EEEE[4], EEEO[4];
1019
  TCoeff EEEEE[2], EEEEO[2];
1020
  bool zo = iSkipLine2 >= 32;
1021
  for (j = 0; j<line - iSkipLine; j++)
1022
  {
1023
    /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
1024
    for (k = 0;k<32;k++)
1025
    {
1026
      O[k] = iT[1 * 64 + k] * src[line] + iT[3 * 64 + k] * src[3 * line] + iT[5 * 64 + k] * src[5 * line] + iT[7 * 64 + k] * src[7 * line] +
1027
        iT[9 * 64 + k] * src[9 * line] + iT[11 * 64 + k] * src[11 * line] + iT[13 * 64 + k] * src[13 * line] + iT[15 * 64 + k] * src[15 * line] +
1028
        iT[17 * 64 + k] * src[17 * line] + iT[19 * 64 + k] * src[19 * line] + iT[21 * 64 + k] * src[21 * line] + iT[23 * 64 + k] * src[23 * line] +
1029
        iT[25 * 64 + k] * src[25 * line] + iT[27 * 64 + k] * src[27 * line] + iT[29 * 64 + k] * src[29 * line] + iT[31 * 64 + k] * src[31 * line] +
1030
        (zo ? 0 : (
1031
        iT[33 * 64 + k] * src[33 * line] + iT[35 * 64 + k] * src[35 * line] + iT[37 * 64 + k] * src[37 * line] + iT[39 * 64 + k] * src[39 * line] +
1032
        iT[41 * 64 + k] * src[41 * line] + iT[43 * 64 + k] * src[43 * line] + iT[45 * 64 + k] * src[45 * line] + iT[47 * 64 + k] * src[47 * line] +
1033
        iT[49 * 64 + k] * src[49 * line] + iT[51 * 64 + k] * src[51 * line] + iT[53 * 64 + k] * src[53 * line] + iT[55 * 64 + k] * src[55 * line] +
1034
        iT[57 * 64 + k] * src[57 * line] + iT[59 * 64 + k] * src[59 * line] + iT[61 * 64 + k] * src[61 * line] + iT[63 * 64 + k] * src[63 * line]));
1035
    }
1036
    for (k = 0;k<16;k++)
1037
    {
1038
      EO[k] = iT[2 * 64 + k] * src[2 * line] + iT[6 * 64 + k] * src[6 * line] + iT[10 * 64 + k] * src[10 * line] + iT[14 * 64 + k] * src[14 * line] +
1039
        iT[18 * 64 + k] * src[18 * line] + iT[22 * 64 + k] * src[22 * line] + iT[26 * 64 + k] * src[26 * line] + iT[30 * 64 + k] * src[30 * line] +
1040
        (zo ? 0 : (
1041
        iT[34 * 64 + k] * src[34 * line] + iT[38 * 64 + k] * src[38 * line] + iT[42 * 64 + k] * src[42 * line] + iT[46 * 64 + k] * src[46 * line] +
1042
        iT[50 * 64 + k] * src[50 * line] + iT[54 * 64 + k] * src[54 * line] + iT[58 * 64 + k] * src[58 * line] + iT[62 * 64 + k] * src[62 * line]));
1043
    }
1044
    for (k = 0;k<8;k++)
1045
    {
1046
      EEO[k] = iT[4 * 64 + k] * src[4 * line] + iT[12 * 64 + k] * src[12 * line] + iT[20 * 64 + k] * src[20 * line] + iT[28 * 64 + k] * src[28 * line] +
1047
        (zo ? 0 : (
1048
        iT[36 * 64 + k] * src[36 * line] + iT[44 * 64 + k] * src[44 * line] + iT[52 * 64 + k] * src[52 * line] + iT[60 * 64 + k] * src[60 * line]));
1049
    }
1050
    for (k = 0;k<4;k++)
1051
    {
1052
      EEEO[k] = iT[8 * 64 + k] * src[8 * line] + iT[24 * 64 + k] * src[24 * line] + (zo ? 0 : (iT[40 * 64 + k] * src[40 * line] + iT[56 * 64 + k] * src[56 * line]));
1053
    }
1054
    EEEEO[0] = iT[16 * 64 + 0] * src[16 * line] + (zo ? 0 : iT[48 * 64 + 0] * src[48 * line]);
1055
    EEEEO[1] = iT[16 * 64 + 1] * src[16 * line] + (zo ? 0 : iT[48 * 64 + 1] * src[48 * line]);
1056
    EEEEE[0] = iT[0 * 64 + 0] * src[0] + (zo ? 0 : iT[32 * 64 + 0] * src[32 * line]);
1057
    EEEEE[1] = iT[0 * 64 + 1] * src[0] + (zo ? 0 : iT[32 * 64 + 1] * src[32 * line]);
1058
1059
    /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
1060
    for (k = 0;k<2;k++)
1061
    {
1062
      EEEE[k] = EEEEE[k] + EEEEO[k];
1063
      EEEE[k + 2] = EEEEE[1 - k] - EEEEO[1 - k];
1064
    }
1065
    for (k = 0;k<4;k++)
1066
    {
1067
      EEE[k] = EEEE[k] + EEEO[k];
1068
      EEE[k + 4] = EEEE[3 - k] - EEEO[3 - k];
1069
    }
1070
    for (k = 0;k<8;k++)
1071
    {
1072
      EE[k] = EEE[k] + EEO[k];
1073
      EE[k + 8] = EEE[7 - k] - EEO[7 - k];
1074
    }
1075
    for (k = 0;k<16;k++)
1076
    {
1077
      E[k] = EE[k] + EO[k];
1078
      E[k + 16] = EE[15 - k] - EO[15 - k];
1079
    }
1080
    for (k = 0;k<32;k++)
1081
    {
1082
#if ENABLE_SIMD_TRAFO
1083
      dst[k]      = E[k] + O[k];
1084
      dst[k + 32] = E[31 - k] - O[31 - k];
1085
#else
1086
      dst[k]      = Clip3( outputMinimum, outputMaximum, ( E[k] + O[k] + rnd_factor ) >> shift );
1087
      dst[k + 32] = Clip3( outputMinimum, outputMaximum, ( E[31 - k] - O[31 - k] + rnd_factor ) >> shift );
1088
#endif
1089
    }
1090
    src++;
1091
    dst += uiTrSize;
1092
  }
1093
1094
#if ENABLE_SIMD_TRAFO
1095
  g_tCoeffOps.roundClip8( orgDst, 32, line - iSkipLine, 32, outputMinimum, outputMaximum, rnd_factor, shift );
1096
1097
1098
#endif
1099
  memset( dst, 0, uiTrSize*iSkipLine * sizeof( TCoeff ) );
1100
#endif
1101
17.2k
}
1102
1103
1104
1105
// ********************************** DST-VII **********************************
1106
void fastForwardDST7_B4(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
1107
4.41k
{
1108
4.41k
  int i;
1109
4.41k
  TCoeff rnd_factor = (shift > 0) ? (1 << (shift - 1)) : 0;
1110
1111
4.41k
  const TMatrixCoeff *iT = g_trCoreDST7P4[TRANSFORM_FORWARD][0];
1112
1113
4.41k
  int c[4];
1114
4.41k
  TCoeff *pCoeff = dst;
1115
4.41k
  const int  reducedLine = line - iSkipLine;
1116
94.4k
  for (i = 0; i<reducedLine; i++)
1117
90.0k
  {
1118
    // Intermediate Variables
1119
90.0k
    c[0] = src[0] + src[3];
1120
90.0k
    c[1] = src[1] + src[3];
1121
90.0k
    c[2] = src[0] - src[1];
1122
90.0k
    c[3] = iT[2] * src[2];
1123
1124
90.0k
    dst[0 * line] = (iT[0] * c[0] + iT[1] * c[1] + c[3] + rnd_factor) >> shift;
1125
90.0k
    dst[1 * line] = (iT[2] * (src[0] + src[1] - src[3]) + rnd_factor) >> shift;
1126
90.0k
    dst[2 * line] = (iT[0] * c[2] + iT[1] * c[0] - c[3] + rnd_factor) >> shift;
1127
90.0k
    dst[3 * line] = (iT[1] * c[2] - iT[0] * c[1] + c[3] + rnd_factor) >> shift;
1128
1129
90.0k
    src += 4;
1130
90.0k
    dst++;
1131
90.0k
  }
1132
4.41k
  if (iSkipLine)
1133
0
  {
1134
0
    dst = pCoeff + reducedLine;
1135
0
    for (i = 0; i<4; i++)
1136
0
    {
1137
0
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
1138
0
      dst += line;
1139
0
    }
1140
0
  }
1141
4.41k
}
1142
1143
void fastInverseDST7_B4(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1144
1.78k
{
1145
1.78k
#if ENABLE_SIMD_TRAFO
1146
1.78k
  _fastInverseMM<4>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDST7P4[TRANSFORM_INVERSE][0] );
1147
#else
1148
  int i;
1149
  TCoeff c[4];
1150
  TCoeff rnd_factor = (shift > 0) ? (1 << (shift - 1)) : 0;
1151
1152
  const TMatrixCoeff *iT = g_trCoreDST7P4[TRANSFORM_INVERSE][0];
1153
1154
  const int  reducedLine = line - iSkipLine;
1155
  for (i = 0; i<reducedLine; i++)
1156
  {
1157
    // Intermediate Variables
1158
    c[0] = src[0 * line] + src[2 * line];
1159
    c[1] = src[2 * line] + src[3 * line];
1160
    c[2] = src[0 * line] - src[3 * line];
1161
    c[3] = iT[2] * src[1 * line];
1162
1163
    dst[0] = Clip3(outputMinimum, outputMaximum, (iT[0] * c[0] + iT[1] * c[1] + c[3] + rnd_factor) >> shift);
1164
    dst[1] = Clip3(outputMinimum, outputMaximum, (iT[1] * c[2] - iT[0] * c[1] + c[3] + rnd_factor) >> shift);
1165
    dst[2] = Clip3(outputMinimum, outputMaximum, (iT[2] * (src[0 * line] - src[2 * line] + src[3 * line]) + rnd_factor) >> shift);
1166
    dst[3] = Clip3(outputMinimum, outputMaximum, (iT[1] * c[0] + iT[0] * c[2] - c[3] + rnd_factor) >> shift);
1167
1168
    dst += 4;
1169
    src++;
1170
  }
1171
  if (iSkipLine)
1172
  {
1173
    memset(dst, 0, (iSkipLine << 2) * sizeof(TCoeff));
1174
  }
1175
#endif
1176
1.78k
}
1177
1178
1179
void fastForwardDST7_B8(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
1180
23.3k
{
1181
23.3k
  _fastForwardMM< 8 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDST7P8[TRANSFORM_FORWARD][0] );
1182
23.3k
}
1183
1184
1185
void fastInverseDST7_B8(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1186
1.10k
{
1187
1.10k
  _fastInverseMM< 8 >( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDST7P8[TRANSFORM_INVERSE][0]);
1188
1.10k
}
1189
1190
1191
void fastForwardDST7_B16(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
1192
32.0k
{
1193
#if !JVET_M0497_MATRIX_MULT
1194
  int j, k;
1195
  TCoeff a[5], b[5], c[5], d[5], t;
1196
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
1197
1198
  const TMatrixCoeff *iT = g_trCoreDST7P16[TRANSFORM_FORWARD][0];
1199
1200
  TCoeff *pCoef = dst;
1201
  const int  reducedLine = line - iSkipLine;
1202
  const int  cutoff = 16 - iSkipLine2;
1203
1204
  for (j = 0; j < reducedLine; j++)
1205
  {
1206
    for (k = 0; k < 5; k++)
1207
    {
1208
      a[k] = src[    k] + src[11 + k];
1209
      b[k] = src[9 - k] + src[11 + k];
1210
      c[k] = src[    k] - src[ 9 - k];
1211
      d[k] = src[    k] + src[ 9 - k] - src[11 + k];
1212
    }
1213
1214
    t = iT[10] * src[10];
1215
1216
    dst[ 1 * line] = ( iT[ 2]*d[0] + iT[ 5]*d[1] + iT[ 8]*d[2] + iT[11]*d[3] + iT[14]*d[4] + add) >> shift;
1217
    dst[ 4 * line] = ( iT[ 8]*d[0] + iT[14]*d[1] + iT[ 5]*d[2] - iT[ 2]*d[3] - iT[11]*d[4] + add) >> shift;
1218
    dst[ 7 * line] = ( iT[14]*d[0] + iT[ 2]*d[1] - iT[11]*d[2] - iT[ 5]*d[3] + iT[ 8]*d[4] + add) >> shift;
1219
    dst[10 * line] = ( iT[11]*d[0] - iT[ 8]*d[1] - iT[ 2]*d[2] + iT[14]*d[3] - iT[ 5]*d[4] + add) >> shift;
1220
    dst[13 * line] = ( iT[ 5]*d[0] - iT[11]*d[1] + iT[14]*d[2] - iT[ 8]*d[3] + iT[ 2]*d[4] + add) >> shift;
1221
1222
    dst[5 * line] = ( iT[10] * (src[0] + src[1] - src[3] - src[4] + src[6] + src[7] - src[9] - src[10] + src[12] + src[13] - src[15]) + add) >> shift;
1223
1224
    dst[ 0 * line] = ( iT[0]*a[0] + iT[9]*b[0] + iT[1]*a[1] + iT[8]*b[1] + iT[2]*a[2] + iT[7]*b[2] + iT[3]*a[3] + iT[6]*b[3] + iT[4]*a[4] + iT[5]*b[4] + t + add ) >> shift;
1225
    dst[ 2 * line] = ( iT[4]*c[0] - iT[5]*b[0] + iT[9]*c[1] - iT[0]*b[1] + iT[6]*c[2] + iT[3]*a[2] + iT[1]*c[3] + iT[8]*a[3] + iT[7]*a[4] + iT[2]*b[4] - t + add ) >> shift;
1226
    dst[ 3 * line] = ( iT[6]*a[0] + iT[3]*b[0] + iT[2]*c[1] + iT[7]*a[1] + iT[9]*c[2] + iT[0]*a[2] + iT[4]*c[3] - iT[5]*b[3] - iT[1]*a[4] - iT[8]*b[4] + t + add ) >> shift;
1227
    dst[ 6 * line] = ( iT[8]*a[0] + iT[1]*c[0] + iT[6]*c[1] - iT[3]*b[1] - iT[5]*a[2] - iT[4]*b[2] - iT[7]*c[3] - iT[2]*a[3] - iT[0]*c[4] + iT[9]*b[4] + t + add ) >> shift;
1228
    dst[ 8 * line] = ( iT[4]*c[0] + iT[5]*a[0] - iT[0]*c[1] + iT[9]*b[1] - iT[3]*c[2] - iT[6]*a[2] + iT[1]*c[3] - iT[8]*b[3] + iT[2]*c[4] + iT[7]*a[4] - t + add ) >> shift;
1229
    dst[ 9 * line] = ( iT[7]*c[0] + iT[2]*a[0] - iT[4]*a[1] - iT[5]*b[1] - iT[8]*c[2] + iT[1]*b[2] + iT[9]*a[3] + iT[0]*b[3] + iT[3]*c[4] - iT[6]*b[4] + t + add ) >> shift;
1230
    dst[11 * line] = ( iT[9]*a[0] + iT[0]*b[0] - iT[8]*c[1] - iT[1]*a[1] + iT[2]*c[2] - iT[7]*b[2] + iT[6]*a[3] + iT[3]*b[3] - iT[5]*c[4] - iT[4]*a[4] - t + add ) >> shift;
1231
    dst[12 * line] = ( iT[7]*c[0] - iT[2]*b[0] - iT[5]*c[1] - iT[4]*a[1] + iT[8]*a[2] + iT[1]*b[2] - iT[0]*a[3] - iT[9]*b[3] - iT[6]*c[4] + iT[3]*b[4] + t + add ) >> shift;
1232
    dst[14 * line] = ( iT[3]*a[0] + iT[6]*b[0] - iT[7]*a[1] - iT[2]*b[1] + iT[0]*c[2] + iT[9]*a[2] - iT[4]*c[3] - iT[5]*a[3] + iT[8]*c[4] + iT[1]*a[4] - t + add ) >> shift;
1233
    dst[15 * line] = ( iT[1]*c[0] - iT[8]*b[0] - iT[3]*c[1] + iT[6]*b[1] + iT[5]*c[2] - iT[4]*b[2] - iT[7]*c[3] + iT[2]*b[3] + iT[9]*c[4] - iT[0]*b[4] + t + add ) >> shift;
1234
1235
    src += 16;
1236
    dst++;
1237
  }
1238
1239
  if (iSkipLine)
1240
  {
1241
    dst = pCoef + reducedLine;
1242
    for (j = 0; j < cutoff; j++)
1243
    {
1244
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
1245
      dst += line;
1246
    }
1247
  }
1248
1249
  if (iSkipLine2)
1250
  {
1251
    dst = pCoef + line * cutoff;
1252
    memset(dst, 0, sizeof(TCoeff) * line * iSkipLine2);
1253
  }
1254
#else
1255
32.0k
  _fastForwardMM< 16 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDST7P16[TRANSFORM_FORWARD][0] );
1256
32.0k
#endif
1257
32.0k
}
1258
1259
1260
// Inverse 16-point DST7.
//
// src              input coefficients; coefficient k of line j is read from src[k * line + j]
// dst              output samples; the 16 outputs of line j are written to dst[16 * j .. 16 * j + 15]
// shift            right shift applied to every result after adding the rounding offset
// line             number of lines, also used as the stride between input coefficients
// iSkipLine        number of trailing lines that are not transformed; their output is zero-filled
// iSkipLine2       not used by the hand-factored path, only forwarded to _fastInverseMM
// outputMinimum/outputMaximum  every output is clipped to this range via Clip3
//
// With JVET_M0497_MATRIX_MULT disabled the hand-factored implementation below is compiled,
// otherwise the call is delegated to _fastInverseMM<16> with the inverse DST7 16-point table.
void fastInverseDST7_B16(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1261
5.17k
{
1262
#if !JVET_M0497_MATRIX_MULT
1263
  int j, k;
1264
  TCoeff a[5], b[5], c[5], d[5], t;
1265
1266
  // Rounding offset: half of the divisor implied by the right shift, zero when no shift is applied.
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
1267
1268
  const TMatrixCoeff *iT = g_trCoreDST7P16[TRANSFORM_INVERSE][0];
1269
1270
  // Only the first reducedLine lines are transformed; the rest is zero-filled after the loop.
  const int  reducedLine = line - iSkipLine;
1271
1272
  for (j = 0; j < reducedLine; j++)
1273
  {
1274
    // Sums/differences of input pairs (and one triple, d) that are shared by several outputs.
    for (k = 0; k < 5; k++)
1275
    {
1276
      a[k] = src[       k * line] + src[(10 - k) * line];
1277
      b[k] = src[(11 + k) * line] + src[(10 - k) * line];
1278
      c[k] = src[       k * line] - src[(11 + k) * line];
1279
      d[k] = src[       k * line] + src[(11 + k) * line] - src[(10 - k)*line];
1280
    }
1281
1282
    // Input 5 is not part of any a/b/c/d combination; its weighted value enters the outputs as +t or -t.
    t = iT[10] * src[5 * line];
1283
1284
    // Outputs 2, 5, 8, 11 and 14 depend on the d[] combinations only.
    dst[ 2] = Clip3(outputMinimum, outputMaximum, (int)( iT[ 2]*d[0] + iT[ 8]*d[1] + iT[14]*d[2] + iT[11]*d[3] + iT[ 5]*d[4] + add ) >> shift);
1285
    dst[ 5] = Clip3(outputMinimum, outputMaximum, (int)( iT[ 5]*d[0] + iT[14]*d[1] + iT[ 2]*d[2] - iT[ 8]*d[3] - iT[11]*d[4] + add ) >> shift);
1286
    dst[ 8] = Clip3(outputMinimum, outputMaximum, (int)( iT[ 8]*d[0] + iT[ 5]*d[1] - iT[11]*d[2] - iT[ 2]*d[3] + iT[14]*d[4] + add ) >> shift);
1287
    dst[11] = Clip3(outputMinimum, outputMaximum, (int)( iT[11]*d[0] - iT[ 2]*d[1] - iT[ 5]*d[2] + iT[14]*d[3] - iT[ 8]*d[4] + add ) >> shift);
1288
    dst[14] = Clip3(outputMinimum, outputMaximum, (int)( iT[14]*d[0] - iT[11]*d[1] + iT[ 8]*d[2] - iT[ 5]*d[3] + iT[ 2]*d[4] + add ) >> shift);
1289
1290
    // Output 10 is a single coefficient times a signed sum of eleven inputs.
    dst[10] = Clip3(outputMinimum, outputMaximum, (int)( iT[10]*(src[ 0*line]-src[ 2*line]+src[ 3*line]-src[5*line]
1291
                                                                +src[ 6*line]-src[ 8*line]+src[ 9*line]-src[11*line]
1292
                                                                +src[12*line]-src[14*line]+src[15*line]) + add ) >> shift);
1293
1294
    // The remaining outputs combine a[], b[] and c[] terms with +t or -t.
    dst[ 0] = Clip3(outputMinimum, outputMaximum, (int)( iT[0]*a[0] + iT[9]*b[0] + iT[2]*a[1] + iT[7]*b[1] + iT[4]*a[2] + iT[5]*b[2] + iT[6]*a[3] + iT[3]*b[3] + iT[8]*a[4] + iT[1]*b[4] + t + add ) >> shift);
1295
    dst[ 1] = Clip3(outputMinimum, outputMaximum, (int)( iT[1]*c[0] - iT[8]*b[0] + iT[5]*c[1] - iT[4]*b[1] + iT[9]*c[2] - iT[0]*b[2] + iT[2]*a[3] + iT[7]*c[3] + iT[6]*a[4] + iT[3]*c[4] + t + add ) >> shift);
1296
    dst[ 3] = Clip3(outputMinimum, outputMaximum, (int)( iT[3]*a[0] + iT[6]*b[0] + iT[0]*c[1] + iT[9]*a[1] + iT[1]*a[2] + iT[8]*c[2] + iT[4]*c[3] - iT[5]*b[3] - iT[2]*a[4] - iT[7]*b[4] - t + add ) >> shift);
1297
    dst[ 4] = Clip3(outputMinimum, outputMaximum, (int)( iT[4]*c[0] - iT[5]*b[0] + iT[6]*c[1] + iT[3]*a[1] + iT[7]*a[2] + iT[2]*b[2] - iT[1]*c[3] + iT[8]*b[3] - iT[9]*c[4] - iT[0]*a[4] - t + add ) >> shift);
1298
    dst[ 6] = Clip3(outputMinimum, outputMaximum, (int)( iT[6]*a[0] + iT[3]*b[0] + iT[9]*c[1] + iT[0]*a[1] - iT[1]*a[2] - iT[8]*b[2] - iT[4]*c[3] - iT[5]*a[3] - iT[2]*c[4] + iT[7]*b[4] + t + add ) >> shift);
1299
    dst[ 7] = Clip3(outputMinimum, outputMaximum, (int)( iT[7]*c[0] - iT[2]*b[0] + iT[8]*a[1] + iT[1]*b[1] - iT[6]*c[2] + iT[3]*b[2] - iT[9]*a[3] - iT[0]*b[3] + iT[5]*c[4] - iT[4]*b[4] + t + add ) >> shift);
1300
    dst[ 9] = Clip3(outputMinimum, outputMaximum, (int)( iT[9]*a[0] + iT[0]*b[0] + iT[2]*c[1] - iT[7]*b[1] - iT[5]*c[2] - iT[4]*a[2] + iT[3]*a[3] + iT[6]*b[3] + iT[8]*c[4] - iT[1]*b[4] - t + add ) >> shift);
1301
    dst[12] = Clip3(outputMinimum, outputMaximum, (int)( iT[1]*c[0] + iT[8]*a[0] - iT[5]*a[1] - iT[4]*b[1] - iT[0]*c[2] + iT[9]*b[2] + iT[7]*c[3] - iT[2]*b[3] - iT[6]*c[4] - iT[3]*a[4] + t + add ) >> shift);
1302
    dst[13] = Clip3(outputMinimum, outputMaximum, (int)( iT[7]*c[0] + iT[2]*a[0] - iT[8]*c[1] + iT[1]*b[1] + iT[3]*c[2] - iT[6]*b[2] + iT[0]*a[3] + iT[9]*b[3] - iT[5]*a[4] - iT[4]*b[4] + t + add ) >> shift);
1303
    dst[15] = Clip3(outputMinimum, outputMaximum, (int)( iT[4]*c[0] + iT[5]*a[0] - iT[3]*c[1] - iT[6]*a[1] + iT[2]*c[2] + iT[7]*a[2] - iT[1]*c[3] - iT[8]*a[3] + iT[0]*c[4] + iT[9]*a[4] - t + add ) >> shift);
1304
1305
    // Advance to the next input line (adjacent in src) and to the next block of 16 outputs.
    src++;
1306
    dst += 16;
1307
  }
1308
1309
  // dst now points just past the transformed lines: clear the 16 outputs of every skipped line.
  if (iSkipLine)
1310
  {
1311
    memset(dst, 0, (iSkipLine * 16) * sizeof(TCoeff));
1312
  }
1313
#else
1314
5.17k
  // Delegate to the templated _fastInverseMM implementation with the inverse DST7 16-point table.
  _fastInverseMM< 16 >( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDST7P16[TRANSFORM_INVERSE][0]);
1315
5.17k
#endif
1316
5.17k
}
1317
1318
1319
// Forward 32-point DST7.
//
// src         input samples; every input row consists of 32 consecutive values
// dst         output coefficients; coefficient k of input row j is written to dst[k * line + j]
// shift       right shift applied to every result after adding the rounding offset (no clipping)
// line        number of input rows, also used as the stride between output coefficient rows
// iSkipLine   number of trailing input rows that are not transformed; their output columns are zeroed
// iSkipLine2  number of trailing output coefficient rows that are zeroed
//
// With JVET_M0497_MATRIX_MULT disabled the hand-factored implementation below is compiled,
// otherwise the call is delegated to _fastForwardMM<32> with the forward DST7 32-point table.
void fastForwardDST7_B32(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
1320
0
{
1321
#if !JVET_M0497_MATRIX_MULT
1322
  int j, k;
1323
  TCoeff a[10][6];
1324
  TCoeff t[2];
1325
  TCoeff b[6];
1326
  TCoeff c[2];
1327
1328
  // Rounding offset: half of the divisor implied by the right shift, zero when no shift is applied.
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
1329
  const TMatrixCoeff *iT = g_trCoreDST7P32[TRANSFORM_FORWARD][0];
1330
  // Remember the start of the output buffer; dst is advanced in the loop and re-based for zeroing.
  TCoeff *pCoef = dst;
1331
  const int  reducedLine = line - iSkipLine;
1332
  // First output coefficient row that is zeroed because of iSkipLine2.
  const int  cutoff = 32 - iSkipLine2;
1333
1334
  for (j = 0; j < reducedLine; j++)
1335
  {
1336
    // Ten families of pairwise input sums/differences (a) and six five-term combinations (b)
    // that are reused by the output expressions below.
    for (k = 0; k < 6; k++)
1337
    {
1338
      a[0][k] = src[     k] - src[11 - k];
1339
      a[1][k] = src[     k] + src[13 + k];
1340
      a[2][k] = src[     k] + src[24 - k];
1341
      a[3][k] = src[     k] - src[26 + k];
1342
      a[4][k] = src[ 6 + k] + src[18 - k];
1343
      a[5][k] = src[ 6 + k] + src[19 + k];
1344
      a[6][k] = src[ 6 + k] - src[31 - k];
1345
      a[7][k] = src[13 + k] - src[24 - k];
1346
      a[8][k] = src[13 + k] + src[26 + k];
1347
      a[9][k] = src[19 + k] + src[31 - k];
1348
1349
      b[k] = src[k] + src[11 - k] - src[13 + k] - src[24 - k] + src[26 + k];
1350
    }
1351
    // Two 13-term signed combinations, used only by outputs 6 and 19.
    for (k = 0; k < 2; k++)
1352
    {
1353
      c[k] = src[k] + src[3 - k] - src[5 + k] - src[8 - k] + src[10 + k] + src[13 - k] - src[15 + k] - src[18 - k] + src[20 + k] + src[23 - k] - src[25 + k] - src[28 - k] + src[30 + k];
1354
    }
1355
1356
    // Inputs 12 and 25 are not part of any a[][] pair; their weighted contribution
    // enters the outputs below as +/- t[0] or +/- t[1].
    t[0] = iT[12] * src[12] + iT[25] * src[25];
1357
    t[1] = iT[12] * src[25] - iT[25] * src[12];
1358
1359
    // Outputs built from the a[][] pairs plus t[0] or t[1].
    dst[ 0 * line] = ( iT[0] * a[3][0] + iT[11] * a[6][5] + iT[13] * a[8][0] + iT[24] * a[9][5] + iT[1] * a[3][1] + iT[10] * a[6][4] + iT[14] * a[8][1] + iT[23] * a[9][4] + iT[2] * a[3][2] + iT[9] * a[6][3] + iT[15] * a[8][2] + iT[22] * a[9][3] + iT[3] * a[3][3] + iT[8] * a[6][2] + iT[16] * a[8][3] + iT[21] * a[9][2] + iT[4] * a[3][4] + iT[7] * a[6][1] + iT[17] * a[8][4] + iT[20] * a[9][1] + iT[5] * a[3][5] + iT[6] * a[6][0] + iT[18] * a[8][5] + iT[19] * a[9][0] + t[0] + add) >> shift;
1360
    dst[ 1 * line] = (-iT[0] * a[5][2] + iT[11] * a[0][3] + iT[13] * a[4][2] + iT[24] * a[6][2] + iT[1] * a[9][1] + iT[10] * a[8][4] + iT[14] * a[3][4] + iT[23] * a[6][1] + iT[2] * a[0][0] - iT[9] * a[5][5] + iT[15] * a[6][5] + iT[22] * a[4][5] - iT[3] * a[5][3] + iT[8] * a[0][2] + iT[16] * a[4][3] + iT[21] * a[6][3] + iT[4] * a[9][0] + iT[7] * a[8][5] + iT[17] * a[3][5] + iT[20] * a[6][0] + iT[5] * a[0][1] - iT[6] * a[5][4] + iT[18] * a[6][4] + iT[19] * a[4][4] - t[1] + add) >> shift;
1361
    dst[ 3 * line] = (-iT[0] * a[9][4] - iT[11] * a[5][4] + iT[13] * a[2][1] - iT[24] * a[7][1] - iT[1] * a[0][3] - iT[10] * a[1][3] + iT[14] * a[3][3] + iT[23] * a[2][3] + iT[2] * a[8][5] + iT[9] * a[9][0] + iT[15] * a[6][0] + iT[22] * a[3][5] - iT[3] * a[1][4] - iT[8] * a[0][4] + iT[16] * a[2][4] + iT[21] * a[3][4] - iT[4] * a[5][3] - iT[7] * a[9][3] - iT[17] * a[7][2] + iT[20] * a[2][2] + iT[5] * a[8][0] + iT[6] * a[1][0] - iT[18] * a[4][5] - iT[19] * a[7][0] + t[1] + add) >> shift;
1362
    dst[ 4 * line] = (-iT[0] * a[3][2] - iT[11] * a[2][2] + iT[13] * a[1][2] + iT[24] * a[0][2] + iT[1] * a[6][0] + iT[10] * a[3][5] + iT[14] * a[9][0] + iT[23] * a[8][5] - iT[2] * a[2][3] - iT[9] * a[3][3] + iT[15] * a[0][3] + iT[22] * a[1][3] - iT[3] * a[7][0] + iT[8] * a[2][0] - iT[16] * a[9][5] - iT[21] * a[5][5] + iT[4] * a[4][4] + iT[7] * a[6][4] + iT[17] * a[0][1] - iT[20] * a[5][4] - iT[5] * a[7][4] - iT[6] * a[4][1] + iT[18] * a[8][4] + iT[19] * a[1][4] - t[0] + add) >> shift;
1363
    dst[ 5 * line] = (-iT[0] * a[3][5] - iT[11] * a[6][0] - iT[13] * a[8][5] - iT[24] * a[9][0] + iT[1] * a[6][5] + iT[10] * a[3][0] + iT[14] * a[9][5] + iT[23] * a[8][0] - iT[2] * a[7][4] + iT[9] * a[2][4] - iT[15] * a[9][1] - iT[22] * a[5][1] - iT[3] * a[7][1] - iT[8] * a[4][4] + iT[16] * a[8][1] + iT[21] * a[1][1] + iT[4] * a[6][2] + iT[7] * a[4][2] - iT[17] * a[5][2] + iT[20] * a[0][3] - iT[5] * a[3][2] - iT[6] * a[2][2] + iT[18] * a[1][2] + iT[19] * a[0][2] + t[0] + add) >> shift;
1364
    dst[ 8 * line] = ( iT[0] * a[9][3] + iT[11] * a[8][2] + iT[13] * a[3][2] + iT[24] * a[6][3] + iT[1] * a[1][5] + iT[10] * a[0][5] - iT[14] * a[2][5] - iT[23] * a[3][5] - iT[2] * a[1][3] - iT[9] * a[8][3] + iT[15] * a[7][3] + iT[22] * a[4][2] - iT[3] * a[9][5] - iT[8] * a[5][5] + iT[16] * a[2][0] - iT[21] * a[7][0] - iT[4] * a[1][1] - iT[7] * a[0][1] + iT[17] * a[2][1] + iT[20] * a[3][1] + iT[5] * a[5][1] + iT[6] * a[9][1] + iT[18] * a[7][4] - iT[19] * a[2][4] + t[1] + add) >> shift;
1365
    dst[ 9 * line] = (-iT[0] * a[2][1] - iT[11] * a[3][1] + iT[13] * a[0][1] + iT[24] * a[1][1] + iT[1] * a[7][3] - iT[10] * a[2][3] + iT[14] * a[9][2] + iT[23] * a[5][2] + iT[2] * a[4][0] + iT[9] * a[7][5] - iT[15] * a[1][5] - iT[22] * a[8][5] + iT[3] * a[3][4] + iT[8] * a[2][4] - iT[16] * a[1][4] - iT[21] * a[0][4] + iT[4] * a[6][3] + iT[7] * a[3][2] + iT[17] * a[9][3] + iT[20] * a[8][2] + iT[5] * a[4][5] + iT[6] * a[6][5] + iT[18] * a[0][0] - iT[19] * a[5][5] - t[0] + add) >> shift;
1366
    dst[10 * line] = (-iT[0] * a[6][1] - iT[11] * a[4][1] + iT[13] * a[5][1] - iT[24] * a[0][4] + iT[1] * a[2][2] - iT[10] * a[7][2] - iT[14] * a[5][3] - iT[23] * a[9][3] + iT[2] * a[6][4] + iT[9] * a[4][4] - iT[15] * a[5][4] + iT[22] * a[0][1] - iT[3] * a[2][5] + iT[8] * a[7][5] + iT[16] * a[5][0] + iT[21] * a[9][0] - iT[4] * a[7][0] - iT[7] * a[4][5] + iT[17] * a[8][0] + iT[20] * a[1][0] + iT[5] * a[4][2] + iT[6] * a[7][3] - iT[18] * a[1][3] - iT[19] * a[8][3] + t[0] + add) >> shift;
1367
    dst[11 * line] = ( iT[0] * a[1][3] + iT[11] * a[0][3] - iT[13] * a[2][3] - iT[24] * a[3][3] + iT[1] * a[9][1] + iT[10] * a[5][1] - iT[14] * a[2][4] + iT[23] * a[7][4] + iT[2] * a[8][0] + iT[9] * a[9][5] + iT[15] * a[6][5] + iT[22] * a[3][0] - iT[3] * a[0][2] + iT[8] * a[5][3] - iT[16] * a[6][3] - iT[21] * a[4][3] - iT[4] * a[5][0] + iT[7] * a[0][5] + iT[17] * a[4][0] + iT[20] * a[6][0] - iT[5] * a[9][4] - iT[6] * a[5][4] + iT[18] * a[2][1] - iT[19] * a[7][1] - t[1] + add) >> shift;
1368
    dst[13 * line] = (-iT[0] * a[0][0] - iT[11] * a[1][0] + iT[13] * a[3][0] + iT[24] * a[2][0] - iT[1] * a[5][4] + iT[10] * a[0][1] + iT[14] * a[4][4] + iT[23] * a[6][4] + iT[2] * a[9][3] + iT[9] * a[5][3] - iT[15] * a[2][2] + iT[22] * a[7][2] - iT[3] * a[8][3] - iT[8] * a[9][2] - iT[16] * a[6][2] - iT[21] * a[3][3] + iT[4] * a[1][4] + iT[7] * a[8][4] - iT[17] * a[7][4] - iT[20] * a[4][1] - iT[5] * a[0][5] - iT[6] * a[1][5] + iT[18] * a[3][5] + iT[19] * a[2][5] + t[1] + add) >> shift;
1369
    dst[14 * line] = ( iT[0] * a[4][2] + iT[11] * a[7][3] - iT[13] * a[1][3] - iT[24] * a[8][3] + iT[1] * a[4][1] + iT[10] * a[6][1] + iT[14] * a[0][4] - iT[23] * a[5][1] - iT[2] * a[3][0] - iT[9] * a[2][0] + iT[15] * a[1][0] + iT[22] * a[0][0] - iT[3] * a[6][3] - iT[8] * a[4][3] + iT[16] * a[5][3] - iT[21] * a[0][2] - iT[4] * a[7][5] - iT[7] * a[4][0] + iT[17] * a[8][5] + iT[20] * a[1][5] + iT[5] * a[6][4] + iT[6] * a[3][1] + iT[18] * a[9][4] + iT[19] * a[8][1] - t[0] + add) >> shift;
1370
    dst[15 * line] = (-iT[0] * a[7][4] - iT[11] * a[4][1] + iT[13] * a[8][4] + iT[24] * a[1][4] + iT[1] * a[2][2] + iT[10] * a[3][2] - iT[14] * a[0][2] - iT[23] * a[1][2] + iT[2] * a[2][1] - iT[9] * a[7][1] - iT[15] * a[5][4] - iT[22] * a[9][4] - iT[3] * a[7][5] + iT[8] * a[2][5] - iT[16] * a[9][0] - iT[21] * a[5][0] - iT[4] * a[2][0] - iT[7] * a[3][0] + iT[17] * a[0][0] + iT[20] * a[1][0] - iT[5] * a[2][3] + iT[6] * a[7][3] + iT[18] * a[5][2] + iT[19] * a[9][2] + t[0] + add) >> shift;
1371
    dst[16 * line] = (-iT[0] * a[0][1] + iT[11] * a[5][4] - iT[13] * a[6][4] - iT[24] * a[4][4] + iT[1] * a[0][3] - iT[10] * a[5][2] + iT[14] * a[6][2] + iT[23] * a[4][2] - iT[2] * a[0][5] + iT[9] * a[5][0] - iT[15] * a[6][0] - iT[22] * a[4][0] - iT[3] * a[0][4] - iT[8] * a[1][4] + iT[16] * a[3][4] + iT[21] * a[2][4] + iT[4] * a[0][2] + iT[7] * a[1][2] - iT[17] * a[3][2] - iT[20] * a[2][2] - iT[5] * a[0][0] - iT[6] * a[1][0] + iT[18] * a[3][0] + iT[19] * a[2][0] - t[1] + add) >> shift;
1372
    dst[18 * line] = ( iT[0] * a[0][5] + iT[11] * a[1][5] - iT[13] * a[3][5] - iT[24] * a[2][5] - iT[1] * a[1][0] - iT[10] * a[0][0] + iT[14] * a[2][0] + iT[23] * a[3][0] - iT[2] * a[5][1] + iT[9] * a[0][4] + iT[15] * a[4][1] + iT[22] * a[6][1] - iT[3] * a[8][1] - iT[8] * a[1][1] + iT[16] * a[4][4] + iT[21] * a[7][1] - iT[4] * a[9][2] - iT[7] * a[5][2] + iT[17] * a[2][3] - iT[20] * a[7][3] - iT[5] * a[9][3] - iT[6] * a[8][2] - iT[18] * a[3][2] - iT[19] * a[6][3] + t[1] + add) >> shift;
1373
    dst[20 * line] = (-iT[0] * a[4][0] - iT[11] * a[6][0] - iT[13] * a[0][5] + iT[24] * a[5][0] + iT[1] * a[6][5] + iT[10] * a[4][5] - iT[14] * a[5][5] + iT[23] * a[0][0] - iT[2] * a[6][1] - iT[9] * a[3][4] - iT[15] * a[9][1] - iT[22] * a[8][4] + iT[3] * a[4][4] + iT[8] * a[7][1] - iT[16] * a[1][1] - iT[21] * a[8][1] - iT[4] * a[3][3] - iT[7] * a[2][3] + iT[17] * a[1][3] + iT[20] * a[0][3] + iT[5] * a[7][2] - iT[6] * a[2][2] + iT[18] * a[9][3] + iT[19] * a[5][3] + t[0] + add) >> shift;
1374
    dst[21 * line] = (-iT[0] * a[1][2] - iT[11] * a[8][2] + iT[13] * a[7][2] + iT[24] * a[4][3] - iT[1] * a[1][5] - iT[10] * a[8][5] + iT[14] * a[7][5] + iT[23] * a[4][0] - iT[2] * a[5][2] - iT[9] * a[9][2] - iT[15] * a[7][3] + iT[22] * a[2][3] - iT[3] * a[5][5] - iT[8] * a[9][5] - iT[16] * a[7][0] + iT[21] * a[2][0] - iT[4] * a[8][1] - iT[7] * a[9][4] - iT[17] * a[6][4] - iT[20] * a[3][1] - iT[5] * a[8][4] - iT[6] * a[9][1] - iT[18] * a[6][1] - iT[19] * a[3][4] - t[1] + add) >> shift;
1375
    dst[23 * line] = (-iT[0] * a[8][4] - iT[11] * a[9][1] - iT[13] * a[6][1] - iT[24] * a[3][4] + iT[1] * a[8][2] + iT[10] * a[1][2] - iT[14] * a[4][3] - iT[23] * a[7][2] + iT[2] * a[0][1] + iT[9] * a[1][1] - iT[15] * a[3][1] - iT[22] * a[2][1] - iT[3] * a[5][0] - iT[8] * a[9][0] - iT[16] * a[7][5] + iT[21] * a[2][5] + iT[4] * a[9][5] + iT[7] * a[8][0] + iT[17] * a[3][0] + iT[20] * a[6][5] - iT[5] * a[5][2] + iT[6] * a[0][3] + iT[18] * a[4][2] + iT[19] * a[6][2] + t[1] + add) >> shift;
1376
    dst[24 * line] = (-iT[0] * a[2][3] + iT[11] * a[7][3] + iT[13] * a[5][2] + iT[24] * a[9][2] + iT[1] * a[4][1] + iT[10] * a[7][4] - iT[14] * a[1][4] - iT[23] * a[8][4] - iT[2] * a[4][5] - iT[9] * a[7][0] + iT[15] * a[1][0] + iT[22] * a[8][0] + iT[3] * a[4][3] + iT[8] * a[6][3] + iT[16] * a[0][2] - iT[21] * a[5][3] - iT[4] * a[2][5] - iT[7] * a[3][5] + iT[17] * a[0][5] + iT[20] * a[1][5] + iT[5] * a[2][1] + iT[6] * a[3][1] - iT[18] * a[0][1] - iT[19] * a[1][1] - t[0] + add) >> shift;
1377
    dst[25 * line] = ( iT[0] * a[4][5] + iT[11] * a[6][5] + iT[13] * a[0][0] - iT[24] * a[5][5] + iT[1] * a[3][1] + iT[10] * a[2][1] - iT[14] * a[1][1] - iT[23] * a[0][1] - iT[2] * a[7][2] - iT[9] * a[4][3] + iT[15] * a[8][2] + iT[22] * a[1][2] - iT[3] * a[6][2] - iT[8] * a[3][3] - iT[16] * a[9][2] - iT[21] * a[8][3] - iT[4] * a[2][4] + iT[7] * a[7][4] + iT[17] * a[5][1] + iT[20] * a[9][1] + iT[5] * a[4][0] + iT[6] * a[6][0] + iT[18] * a[0][5] - iT[19] * a[5][0] + t[0] + add) >> shift;
1378
    dst[26 * line] = ( iT[0] * a[8][0] + iT[11] * a[1][0] - iT[13] * a[4][5] - iT[24] * a[7][0] + iT[1] * a[5][4] + iT[10] * a[9][4] + iT[14] * a[7][1] - iT[23] * a[2][1] - iT[2] * a[1][2] - iT[9] * a[0][2] + iT[15] * a[2][2] + iT[22] * a[3][2] - iT[3] * a[9][2] - iT[8] * a[8][3] - iT[16] * a[3][3] - iT[21] * a[6][2] + iT[4] * a[0][4] - iT[7] * a[5][1] + iT[17] * a[6][1] + iT[20] * a[4][1] + iT[5] * a[8][5] + iT[6] * a[1][5] - iT[18] * a[4][0] - iT[19] * a[7][5] - t[1] + add) >> shift;
1379
    dst[28 * line] = (-iT[0] * a[5][1] - iT[11] * a[9][1] - iT[13] * a[7][4] + iT[24] * a[2][4] + iT[1] * a[8][2] + iT[10] * a[9][3] + iT[14] * a[6][3] + iT[23] * a[3][2] - iT[2] * a[9][4] - iT[9] * a[8][1] - iT[15] * a[3][1] - iT[22] * a[6][4] + iT[3] * a[9][0] + iT[8] * a[5][0] - iT[16] * a[2][5] + iT[21] * a[7][5] - iT[4] * a[5][5] + iT[7] * a[0][0] + iT[17] * a[4][5] + iT[20] * a[6][5] + iT[5] * a[1][3] + iT[6] * a[0][3] - iT[18] * a[2][3] - iT[19] * a[3][3] + t[1] + add) >> shift;
1380
    dst[29 * line] = (-iT[0] * a[6][4] - iT[11] * a[3][1] - iT[13] * a[9][4] - iT[24] * a[8][1] + iT[1] * a[7][3] + iT[10] * a[4][2] - iT[14] * a[8][3] - iT[23] * a[1][3] + iT[2] * a[3][5] + iT[9] * a[2][5] - iT[15] * a[1][5] - iT[22] * a[0][5] - iT[3] * a[2][4] - iT[8] * a[3][4] + iT[16] * a[0][4] + iT[21] * a[1][4] - iT[4] * a[4][3] - iT[7] * a[7][2] + iT[17] * a[1][2] + iT[20] * a[8][2] + iT[5] * a[3][0] + iT[6] * a[6][5] + iT[18] * a[8][0] + iT[19] * a[9][5] - t[0] + add) >> shift;
1381
    dst[30 * line] = (-iT[0] * a[7][2] + iT[11] * a[2][2] - iT[13] * a[9][3] - iT[24] * a[5][3] - iT[1] * a[6][0] - iT[10] * a[4][0] + iT[14] * a[5][0] - iT[23] * a[0][5] - iT[2] * a[4][2] - iT[9] * a[6][2] - iT[15] * a[0][3] + iT[22] * a[5][2] + iT[3] * a[2][0] - iT[8] * a[7][0] - iT[16] * a[5][5] - iT[21] * a[9][5] + iT[4] * a[7][1] - iT[7] * a[2][1] + iT[17] * a[9][4] + iT[20] * a[5][4] + iT[5] * a[6][1] + iT[6] * a[4][1] - iT[18] * a[5][1] + iT[19] * a[0][4] + t[0] + add) >> shift;
1382
    dst[31 * line] = (-iT[0] * a[8][5] - iT[11] * a[1][5] + iT[13] * a[4][0] + iT[24] * a[7][5] + iT[1] * a[1][0] + iT[10] * a[8][0] - iT[14] * a[7][0] - iT[23] * a[4][5] + iT[2] * a[8][4] + iT[9] * a[1][4] - iT[15] * a[4][1] - iT[22] * a[7][4] - iT[3] * a[1][1] - iT[8] * a[8][1] + iT[16] * a[7][1] + iT[21] * a[4][4] - iT[4] * a[8][3] - iT[7] * a[1][3] + iT[17] * a[4][2] + iT[20] * a[7][3] + iT[5] * a[1][2] + iT[6] * a[8][2] - iT[18] * a[7][2] - iT[19] * a[4][3] - t[1] + add) >> shift;
1383
1384
    // Outputs 2, 7, 12, 17, 22 and 27 are built from the b[] combinations only.
    dst[ 2 * line] = (iT[ 4]*b[0] + iT[ 9]*b[1] + iT[14]*b[2] + iT[19]*b[3] + iT[24]*b[4] + iT[29]*b[5] + add) >> shift;
1385
    dst[ 7 * line] = (iT[14]*b[0] + iT[29]*b[1] + iT[19]*b[2] + iT[ 4]*b[3] - iT[ 9]*b[4] - iT[24]*b[5] + add) >> shift;
1386
    dst[12 * line] = (iT[24]*b[0] + iT[14]*b[1] - iT[ 9]*b[2] - iT[29]*b[3] - iT[ 4]*b[4] + iT[19]*b[5] + add) >> shift;
1387
    dst[17 * line] = (iT[29]*b[0] - iT[ 4]*b[1] - iT[24]*b[2] + iT[ 9]*b[3] + iT[19]*b[4] - iT[14]*b[5] + add) >> shift;
1388
    dst[22 * line] = (iT[19]*b[0] - iT[24]*b[1] + iT[ 4]*b[2] + iT[14]*b[3] - iT[29]*b[4] + iT[ 9]*b[5] + add) >> shift;
1389
    dst[27 * line] = (iT[ 9]*b[0] - iT[19]*b[1] + iT[29]*b[2] - iT[24]*b[3] + iT[14]*b[4] - iT[ 4]*b[5] + add) >> shift;
1390
1391
    // Outputs 6 and 19 are built from the c[] combinations only.
    dst[ 6 * line] = (iT[12]*c[0] + iT[25]*c[1] + add) >> shift;
1392
    dst[19 * line] = (iT[25]*c[0] - iT[12]*c[1] + add) >> shift;
1393
1394
    // Advance to the next input row and to the next output column.
    src += 32;
1395
    dst++;
1396
  }
1397
1398
  // Zero the last iSkipLine entries of each of the first `cutoff` output coefficient rows.
  if (iSkipLine)
1399
  {
1400
    dst = pCoef + reducedLine;
1401
    for (j = 0; j < cutoff; j++)
1402
    {
1403
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
1404
      dst += line;
1405
    }
1406
  }
1407
1408
  // Zero the last iSkipLine2 output coefficient rows completely; this overwrites
  // the values the loop above stored for rows cutoff..31.
  if (iSkipLine2)
1409
  {
1410
    dst = pCoef + line * cutoff;
1411
    memset(dst, 0, sizeof(TCoeff) * line * iSkipLine2);
1412
  }
1413
#else
1414
0
  // Delegate to the templated _fastForwardMM implementation with the forward DST7 32-point table.
  _fastForwardMM< 32 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDST7P32[TRANSFORM_FORWARD][0] );
1415
0
#endif
1416
0
}
1417
1418
1419
// Inverse 32-point DST7.
//
// src              input coefficients; coefficient k of line j is read from src[k * line + j]
// dst              output samples; the 32 outputs of line j are written to dst[32 * j .. 32 * j + 31]
// shift            right shift applied to every result after adding the rounding offset
// line             number of lines, also used as the stride between input coefficients
// iSkipLine        number of trailing lines that are not transformed; their output is zero-filled
// iSkipLine2       not used by the hand-factored path, only forwarded to _fastInverseMM
// outputMinimum/outputMaximum  every output is clipped to this range via Clip3
//
// With JVET_M0497_MATRIX_MULT disabled the hand-factored implementation below is compiled,
// otherwise the call is delegated to _fastInverseMM<32> with the inverse DST7 32-point table.
void fastInverseDST7_B32(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1420
0
{
1421
#if !JVET_M0497_MATRIX_MULT
1422
  int j, k;
1423
  TCoeff a[10][6];
1424
  TCoeff t[2];
1425
  TCoeff b[6];
1426
  TCoeff c[2];
1427
1428
  // Rounding offset: half of the divisor implied by the right shift, zero when no shift is applied.
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
1429
  const TMatrixCoeff *iT = g_trCoreDST7P32[TRANSFORM_INVERSE][0];
1430
  // Only the first reducedLine lines are transformed; the rest is zero-filled after the loop.
  const int  reducedLine = line - iSkipLine;
1431
1432
  for (j = 0; j < reducedLine; j++)
1433
  {
1434
    // Ten families of pairwise input sums/differences (a) and six five-term combinations (b)
    // that are reused by the output expressions below; inputs are read with stride `line`.
    for (k = 0; k < 6; k++)
1435
    {
1436
      a[0][k] = src[      k  * line] + src[(12 - k) * line];
1437
      a[1][k] = src[      k  * line] - src[(13 + k) * line];
1438
      a[2][k] = src[      k  * line] + src[(25 - k) * line];
1439
      a[3][k] = src[      k  * line] - src[(26 + k) * line];
1440
      a[4][k] = src[( 7 + k) * line] + src[(18 - k) * line];
1441
      a[5][k] = src[( 7 + k) * line] - src[(20 + k) * line];
1442
      a[6][k] = src[( 7 + k) * line] + src[(31 - k) * line];
1443
      a[7][k] = src[(13 + k) * line] + src[(25 - k) * line];
1444
      a[8][k] = src[(13 + k) * line] - src[(26 + k) * line];
1445
      a[9][k] = src[(20 + k) * line] + src[(31 - k) * line];
1446
1447
      b[k] = src[k * line] - src[(12-k) * line] + src[(13+k) * line] - src[(25-k) * line] + src[(26+k) * line];
1448
    }
1449
    // Two 13-term signed combinations, used only by outputs 12 and 25.
    for (k = 0; k < 2; k++)
1450
    {
1451
      c[k] = src[k * line] - src[(4-k) * line] + src[(5+k) * line] - src[(9-k) * line] + src[(10+k) * line] - src[(14-k) * line] + src[(15+k)*line] - src[(19-k)*line] + src[(20+k)*line] - src[(24-k)*line] + src[(25+k)*line] - src[(29-k)*line] + src[(30+k)*line];
1452
    }
1453
1454
    // Inputs 6 and 19 are not part of any a[][] pair; their weighted contribution
    // enters the outputs below as +/- t[0] or +/- t[1].
    t[0] = iT[12] * src[6*line] + iT[25] * src[19*line];
1455
    t[1] = iT[25] * src[6*line] - iT[12] * src[19*line];
1456
1457
    // Outputs built from the a[][] pairs plus t[0] or t[1].
    dst[ 0] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[1][0] - iT[11] * a[8][0] + iT[13] * a[7][0] + iT[24] * a[4][5] - iT[1] * a[8][5] + iT[10] * a[1][5] + iT[14] * a[4][0] + iT[23] * a[7][5] + iT[2] * a[1][1] - iT[9] * a[8][1] + iT[15] * a[7][1] + iT[22] * a[4][4] - iT[3] * a[8][4] + iT[8] * a[1][4] + iT[16] * a[4][1] + iT[21] * a[7][4] + iT[4] * a[1][2] - iT[7] * a[8][2] + iT[17] * a[7][2] + iT[20] * a[4][3] - iT[5] * a[8][3] + iT[6] * a[1][3] + iT[18] * a[4][2] + iT[19] * a[7][3] + t[0] + add) >> shift);
1458
    dst[ 1] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[4][2] - iT[11] * a[6][2] + iT[13] * a[0][3] + iT[24] * a[5][2] + iT[1] * a[2][0] + iT[10] * a[7][0] + iT[14] * a[5][5] - iT[23] * a[9][5] + iT[2] * a[7][2] + iT[9] * a[2][2] - iT[15] * a[9][3] + iT[22] * a[5][3] - iT[3] * a[6][0] - iT[8] * a[4][0] + iT[16] * a[5][0] + iT[21] * a[0][5] - iT[4] * a[4][1] - iT[7] * a[6][1] + iT[17] * a[0][4] + iT[20] * a[5][1] + iT[5] * a[2][1] + iT[6] * a[7][1] + iT[18] * a[5][4] - iT[19] * a[9][4] + t[1] + add) >> shift);
1459
    dst[ 2] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[2][4] - iT[11] * a[3][4] + iT[13] * a[0][4] + iT[24] * a[1][4] + iT[1] * a[4][3] + iT[10] * a[7][2] + iT[14] * a[1][2] - iT[23] * a[8][2] + iT[2] * a[3][0] - iT[9] * a[6][5] - iT[15] * a[8][0] + iT[22] * a[9][5] - iT[3] * a[6][4] + iT[8] * a[3][1] + iT[16] * a[9][4] - iT[21] * a[8][1] + iT[4] * a[7][3] + iT[7] * a[4][2] - iT[17] * a[8][3] + iT[20] * a[1][3] - iT[5] * a[3][5] - iT[6] * a[2][5] + iT[18] * a[1][5] + iT[19] * a[0][5] + t[1] + add) >> shift);
1460
    dst[ 3] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[5][4] + iT[11] * a[0][1] - iT[13] * a[4][4] - iT[24] * a[6][4] - iT[1] * a[1][3] - iT[10] * a[0][3] + iT[14] * a[2][3] + iT[23] * a[3][3] - iT[2] * a[0][4] - iT[9] * a[1][4] + iT[15] * a[3][4] + iT[22] * a[2][4] + iT[3] * a[0][0] + iT[8] * a[5][5] - iT[16] * a[6][5] - iT[21] * a[4][5] + iT[4] * a[5][0] - iT[7] * a[9][0] + iT[17] * a[7][5] + iT[20] * a[2][5] - iT[5] * a[8][2] + iT[6] * a[9][3] - iT[18] * a[6][3] + iT[19] * a[3][2] + t[0] + add) >> shift);
1461
    dst[ 5] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[1][5] + iT[11] * a[8][5] - iT[13] * a[7][5] - iT[24] * a[4][0] + iT[1] * a[5][1] + iT[10] * a[0][4] - iT[14] * a[4][1] - iT[23] * a[6][1] - iT[2] * a[8][3] + iT[9] * a[9][2] - iT[15] * a[6][2] + iT[22] * a[3][3] - iT[3] * a[0][2] - iT[8] * a[1][2] + iT[16] * a[3][2] + iT[21] * a[2][2] - iT[4] * a[9][4] + iT[7] * a[5][4] + iT[17] * a[2][1] + iT[20] * a[7][1] + iT[5] * a[1][0] - iT[6] * a[8][0] + iT[18] * a[7][0] + iT[19] * a[4][5] - t[0] + add) >> shift);
1462
    dst[ 6] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[7][5] - iT[11] * a[2][5] + iT[13] * a[9][0] - iT[24] * a[5][0] + iT[1] * a[3][4] - iT[10] * a[6][1] - iT[14] * a[8][4] + iT[23] * a[9][1] + iT[2] * a[4][2] + iT[9] * a[7][3] + iT[15] * a[1][3] - iT[22] * a[8][3] - iT[3] * a[2][2] - iT[8] * a[3][2] + iT[16] * a[0][2] + iT[21] * a[1][2] - iT[4] * a[6][4] - iT[7] * a[4][4] + iT[17] * a[5][4] + iT[20] * a[0][1] + iT[5] * a[7][0] + iT[6] * a[2][0] - iT[18] * a[9][5] + iT[19] * a[5][5] - t[1] + add) >> shift);
1463
    dst[ 7] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[6][3] - iT[11] * a[4][3] + iT[13] * a[5][3] + iT[24] * a[0][2] + iT[1] * a[7][1] + iT[10] * a[4][4] - iT[14] * a[8][1] + iT[23] * a[1][1] - iT[2] * a[7][5] - iT[9] * a[4][0] + iT[15] * a[8][5] - iT[22] * a[1][5] + iT[3] * a[7][3] + iT[8] * a[2][3] - iT[16] * a[9][2] + iT[21] * a[5][2] - iT[4] * a[6][5] + iT[7] * a[3][0] + iT[17] * a[9][5] - iT[20] * a[8][0] + iT[5] * a[6][1] - iT[6] * a[3][4] - iT[18] * a[9][1] + iT[19] * a[8][4] - t[1] + add) >> shift);
1464
    dst[ 8] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[1][1] - iT[11] * a[0][1] + iT[13] * a[2][1] + iT[24] * a[3][1] + iT[1] * a[1][3] - iT[10] * a[8][3] + iT[14] * a[7][3] + iT[23] * a[4][2] - iT[2] * a[9][1] + iT[9] * a[8][4] - iT[15] * a[3][4] + iT[22] * a[6][1] + iT[3] * a[5][5] + iT[8] * a[0][0] - iT[16] * a[4][5] - iT[21] * a[6][5] + iT[4] * a[0][5] + iT[7] * a[1][5] - iT[17] * a[3][5] - iT[20] * a[2][5] + iT[5] * a[5][3] - iT[6] * a[9][3] + iT[18] * a[7][2] + iT[19] * a[2][2] - t[0] + add) >> shift);
1465
    dst[10] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[8][3] - iT[11] * a[1][3] - iT[13] * a[4][2] - iT[24] * a[7][3] - iT[1] * a[8][0] + iT[10] * a[1][0] + iT[14] * a[4][5] + iT[23] * a[7][0] + iT[2] * a[5][3] + iT[9] * a[0][2] - iT[15] * a[4][3] - iT[22] * a[6][3] - iT[3] * a[5][0] - iT[8] * a[0][5] + iT[16] * a[4][0] + iT[21] * a[6][0] + iT[4] * a[1][4] + iT[7] * a[0][4] - iT[17] * a[2][4] - iT[20] * a[3][4] - iT[5] * a[1][1] - iT[6] * a[0][1] + iT[18] * a[2][1] + iT[19] * a[3][1] + t[0] + add) >> shift);
1466
    dst[11] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[7][0] + iT[11] * a[2][0] - iT[13] * a[9][5] + iT[24] * a[5][5] + iT[1] * a[2][5] + iT[10] * a[7][5] + iT[14] * a[5][0] - iT[23] * a[9][0] - iT[2] * a[2][1] - iT[9] * a[3][1] + iT[15] * a[0][1] + iT[22] * a[1][1] - iT[3] * a[7][4] - iT[8] * a[4][1] + iT[16] * a[8][4] - iT[21] * a[1][4] + iT[4] * a[3][2] - iT[7] * a[6][3] - iT[17] * a[8][2] + iT[20] * a[9][3] + iT[5] * a[4][2] + iT[6] * a[6][2] - iT[18] * a[0][3] - iT[19] * a[5][2] + t[1] + add) >> shift);
1467
    dst[13] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[9][5] - iT[11] * a[8][0] + iT[13] * a[3][0] - iT[24] * a[6][5] - iT[1] * a[8][5] + iT[10] * a[9][0] - iT[14] * a[6][0] + iT[23] * a[3][5] + iT[2] * a[5][4] - iT[9] * a[9][4] + iT[15] * a[7][1] + iT[22] * a[2][1] - iT[3] * a[1][4] + iT[8] * a[8][4] - iT[16] * a[7][4] - iT[21] * a[4][1] - iT[4] * a[0][2] - iT[7] * a[5][3] + iT[17] * a[6][3] + iT[20] * a[4][3] + iT[5] * a[0][3] + iT[6] * a[1][3] - iT[18] * a[3][3] - iT[19] * a[2][3] + t[0] + add) >> shift);
1468
    dst[15] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[9][1] + iT[11] * a[5][1] + iT[13] * a[2][4] + iT[24] * a[7][4] + iT[1] * a[9][3] - iT[10] * a[5][3] - iT[14] * a[2][2] - iT[23] * a[7][2] - iT[2] * a[9][5] + iT[9] * a[5][5] + iT[15] * a[2][0] + iT[22] * a[7][0] + iT[3] * a[9][4] - iT[8] * a[8][1] + iT[16] * a[3][1] - iT[21] * a[6][4] - iT[4] * a[9][2] + iT[7] * a[8][3] - iT[17] * a[3][3] + iT[20] * a[6][2] + iT[5] * a[9][0] - iT[6] * a[8][5] + iT[18] * a[3][5] - iT[19] * a[6][0] - t[0] + add) >> shift);
1469
    dst[16] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[4][4] + iT[11] * a[7][1] + iT[13] * a[1][1] - iT[24] * a[8][1] + iT[1] * a[6][2] - iT[10] * a[3][3] - iT[14] * a[9][2] + iT[23] * a[8][3] - iT[2] * a[6][1] - iT[9] * a[4][1] + iT[15] * a[5][1] + iT[22] * a[0][4] - iT[3] * a[4][5] - iT[8] * a[6][5] + iT[16] * a[0][0] + iT[21] * a[5][5] - iT[4] * a[6][0] + iT[7] * a[3][5] + iT[17] * a[9][0] - iT[20] * a[8][5] + iT[5] * a[6][3] + iT[6] * a[4][3] - iT[18] * a[5][3] - iT[19] * a[0][2] - t[1] + add) >> shift);
1470
    dst[17] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[7][2] - iT[11] * a[4][3] + iT[13] * a[8][2] - iT[24] * a[1][2] + iT[1] * a[7][1] + iT[10] * a[2][1] - iT[14] * a[9][4] + iT[23] * a[5][4] - iT[2] * a[3][5] + iT[9] * a[6][0] + iT[15] * a[8][5] - iT[22] * a[9][0] - iT[3] * a[2][3] - iT[8] * a[7][3] - iT[16] * a[5][2] + iT[21] * a[9][2] + iT[4] * a[4][5] + iT[7] * a[7][0] + iT[17] * a[1][0] - iT[20] * a[8][0] - iT[5] * a[2][4] - iT[6] * a[3][4] + iT[18] * a[0][4] + iT[19] * a[1][4] - t[1] + add) >> shift);
1471
    dst[18] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[9][0] + iT[11] * a[8][5] - iT[13] * a[3][5] + iT[24] * a[6][0] + iT[1] * a[5][1] - iT[10] * a[9][1] + iT[14] * a[7][4] + iT[23] * a[2][4] + iT[2] * a[0][3] + iT[9] * a[5][2] - iT[15] * a[6][2] - iT[22] * a[4][2] + iT[3] * a[1][2] + iT[8] * a[0][2] - iT[16] * a[2][2] - iT[21] * a[3][2] - iT[4] * a[8][1] + iT[7] * a[1][1] + iT[17] * a[4][4] + iT[20] * a[7][1] + iT[5] * a[9][5] - iT[6] * a[8][0] + iT[18] * a[3][0] - iT[19] * a[6][5] - t[0] + add) >> shift);
1472
    dst[20] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[8][2] - iT[11] * a[9][3] + iT[13] * a[6][3] - iT[24] * a[3][2] + iT[1] * a[0][1] + iT[10] * a[5][4] - iT[14] * a[6][4] - iT[23] * a[4][4] + iT[2] * a[1][5] + iT[9] * a[0][5] - iT[15] * a[2][5] - iT[22] * a[3][5] - iT[3] * a[9][2] + iT[8] * a[5][2] + iT[16] * a[2][3] + iT[21] * a[7][3] + iT[4] * a[5][5] - iT[7] * a[9][5] + iT[17] * a[7][0] + iT[20] * a[2][0] + iT[5] * a[0][4] + iT[6] * a[5][1] - iT[18] * a[6][1] - iT[19] * a[4][1] + t[0] + add) >> shift);
1473
    dst[21] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[2][1] - iT[11] * a[7][1] - iT[13] * a[5][4] + iT[24] * a[9][4] - iT[1] * a[6][2] - iT[10] * a[4][2] + iT[14] * a[5][2] + iT[23] * a[0][3] - iT[2] * a[2][4] - iT[9] * a[7][4] - iT[15] * a[5][1] + iT[22] * a[9][1] - iT[3] * a[6][5] - iT[8] * a[4][5] + iT[16] * a[5][5] + iT[21] * a[0][0] - iT[4] * a[4][0] - iT[7] * a[7][5] - iT[17] * a[1][5] + iT[20] * a[8][5] - iT[5] * a[7][2] - iT[6] * a[4][3] + iT[18] * a[8][2] - iT[19] * a[1][2] + t[1] + add) >> shift);
1474
    dst[22] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[6][1] - iT[11] * a[3][4] - iT[13] * a[9][1] + iT[24] * a[8][4] + iT[1] * a[4][3] + iT[10] * a[6][3] - iT[14] * a[0][2] - iT[23] * a[5][3] + iT[2] * a[7][0] + iT[9] * a[4][5] - iT[15] * a[8][0] + iT[22] * a[1][0] - iT[3] * a[3][1] + iT[8] * a[6][4] + iT[16] * a[8][1] - iT[21] * a[9][4] - iT[4] * a[2][3] - iT[7] * a[3][3] + iT[17] * a[0][3] + iT[20] * a[1][3] - iT[5] * a[7][5] - iT[6] * a[2][5] + iT[18] * a[9][0] - iT[19] * a[5][0] + t[1] + add) >> shift);
1475
    dst[23] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[0][3] - iT[11] * a[1][3] + iT[13] * a[3][3] + iT[24] * a[2][3] - iT[1] * a[8][0] + iT[10] * a[9][5] - iT[14] * a[6][5] + iT[23] * a[3][0] + iT[2] * a[8][2] - iT[9] * a[1][2] - iT[15] * a[4][3] - iT[22] * a[7][2] + iT[3] * a[0][5] + iT[8] * a[5][0] - iT[16] * a[6][0] - iT[21] * a[4][0] + iT[4] * a[8][4] - iT[7] * a[9][1] + iT[17] * a[6][1] - iT[20] * a[3][4] - iT[5] * a[5][4] - iT[6] * a[0][1] + iT[18] * a[4][4] + iT[19] * a[6][4] + t[0] + add) >> shift);
1476
    dst[26] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[3][0] - iT[11] * a[2][0] + iT[13] * a[1][0] + iT[24] * a[0][0] - iT[1] * a[2][5] - iT[10] * a[3][5] + iT[14] * a[0][5] + iT[23] * a[1][5] + iT[2] * a[4][4] + iT[9] * a[6][4] - iT[15] * a[0][1] - iT[22] * a[5][4] - iT[3] * a[4][1] - iT[8] * a[7][4] - iT[16] * a[1][4] + iT[21] * a[8][4] + iT[4] * a[2][2] + iT[7] * a[7][2] + iT[17] * a[5][3] - iT[20] * a[9][3] + iT[5] * a[3][3] - iT[6] * a[6][2] - iT[18] * a[8][3] + iT[19] * a[9][2] - t[1] + add) >> shift);
1477
    dst[27] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[3][3] + iT[11] * a[6][2] + iT[13] * a[8][3] - iT[24] * a[9][2] - iT[1] * a[2][0] - iT[10] * a[3][0] + iT[14] * a[0][0] + iT[23] * a[1][0] - iT[2] * a[6][3] + iT[9] * a[3][2] + iT[15] * a[9][3] - iT[22] * a[8][2] - iT[3] * a[4][0] - iT[8] * a[6][0] + iT[16] * a[0][5] + iT[21] * a[5][0] - iT[4] * a[7][4] - iT[7] * a[2][4] + iT[17] * a[9][1] - iT[20] * a[5][1] - iT[5] * a[4][4] - iT[6] * a[7][1] - iT[18] * a[1][1] + iT[19] * a[8][1] - t[1] + add) >> shift);
1478
    dst[28] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[0][4] + iT[11] * a[5][1] - iT[13] * a[6][1] - iT[24] * a[4][1] + iT[1] * a[9][3] - iT[10] * a[8][2] + iT[14] * a[3][2] - iT[23] * a[6][3] - iT[2] * a[1][0] - iT[9] * a[0][0] + iT[15] * a[2][0] + iT[22] * a[3][0] + iT[3] * a[8][1] - iT[8] * a[9][4] + iT[16] * a[6][4] - iT[21] * a[3][1] - iT[4] * a[5][2] - iT[7] * a[0][3] + iT[17] * a[4][2] + iT[20] * a[6][2] + iT[5] * a[1][5] - iT[6] * a[8][5] + iT[18] * a[7][5] + iT[19] * a[4][0] - t[0] + add) >> shift);
1479
    dst[30] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[5][3] - iT[11] * a[9][3] + iT[13] * a[7][2] + iT[24] * a[2][2] + iT[1] * a[0][1] + iT[10] * a[1][1] - iT[14] * a[3][1] - iT[23] * a[2][1] + iT[2] * a[9][0] - iT[9] * a[5][0] - iT[15] * a[2][5] - iT[22] * a[7][5] - iT[3] * a[5][2] + iT[8] * a[9][2] - iT[16] * a[7][3] - iT[21] * a[2][3] - iT[4] * a[0][0] - iT[7] * a[1][0] + iT[17] * a[3][0] + iT[20] * a[2][0] - iT[5] * a[9][1] + iT[6] * a[5][1] + iT[18] * a[2][4] + iT[19] * a[7][4] + t[0] + add) >> shift);
1480
    dst[31] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[3][5] + iT[11] * a[2][5] - iT[13] * a[1][5] - iT[24] * a[0][5] - iT[1] * a[3][4] - iT[10] * a[2][4] + iT[14] * a[1][4] + iT[23] * a[0][4] + iT[2] * a[3][3] + iT[9] * a[2][3] - iT[15] * a[1][3] - iT[22] * a[0][3] - iT[3] * a[3][2] - iT[8] * a[2][2] + iT[16] * a[1][2] + iT[21] * a[0][2] + iT[4] * a[3][1] + iT[7] * a[2][1] - iT[17] * a[1][1] - iT[20] * a[0][1] - iT[5] * a[3][0] - iT[6] * a[2][0] + iT[18] * a[1][0] + iT[19] * a[0][0] + t[1] + add) >> shift);
1481
1482
    // Outputs 4, 9, 14, 19, 24 and 29 are built from the b[] combinations only.
    dst[ 4] = Clip3(outputMinimum, outputMaximum, (int)(iT[ 4] * b[0] + iT[14] * b[1] + iT[24] * b[2] + iT[29] * b[3] + iT[19] * b[4] + iT[ 9] * b[5] + add) >> shift);
1483
    dst[ 9] = Clip3(outputMinimum, outputMaximum, (int)(iT[ 9] * b[0] + iT[29] * b[1] + iT[14] * b[2] - iT[ 4] * b[3] - iT[24] * b[4] - iT[19] * b[5] + add) >> shift);
1484
    dst[14] = Clip3(outputMinimum, outputMaximum, (int)(iT[14] * b[0] + iT[19] * b[1] - iT[ 9] * b[2] - iT[24] * b[3] + iT[ 4] * b[4] + iT[29] * b[5] + add) >> shift);
1485
    dst[19] = Clip3(outputMinimum, outputMaximum, (int)(iT[19] * b[0] + iT[ 4] * b[1] - iT[29] * b[2] + iT[ 9] * b[3] + iT[14] * b[4] - iT[24] * b[5] + add) >> shift);
1486
    dst[24] = Clip3(outputMinimum, outputMaximum, (int)(iT[24] * b[0] - iT[ 9] * b[1] - iT[ 4] * b[2] + iT[19] * b[3] - iT[29] * b[4] + iT[14] * b[5] + add) >> shift);
1487
    dst[29] = Clip3(outputMinimum, outputMaximum, (int)(iT[29] * b[0] - iT[24] * b[1] + iT[19] * b[2] - iT[14] * b[3] + iT[ 9] * b[4] - iT[ 4] * b[5] + add) >> shift);
1488
1489
    // Outputs 12 and 25 are built from the c[] combinations only.
    dst[12] = Clip3(outputMinimum, outputMaximum, (int)(iT[12]*c[0] + iT[25]*c[1] + add) >> shift);
1490
    dst[25] = Clip3(outputMinimum, outputMaximum, (int)(iT[25]*c[0] - iT[12]*c[1] + add) >> shift);
1491
1492
    // Advance to the next input line (adjacent in src) and to the next block of 32 outputs.
    src++;
1493
    dst += 32;
1494
  }
1495
1496
  // dst now points just past the transformed lines: clear the 32 outputs of every skipped line.
  if (iSkipLine)
1497
  {
1498
    memset(dst, 0, (iSkipLine * 32) * sizeof(TCoeff));
1499
  }
1500
#else
1501
0
  // Delegate to the templated _fastInverseMM implementation with the inverse DST7 32-point table.
  _fastInverseMM< 32 >( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDST7P32[TRANSFORM_INVERSE][0] );
1502
0
#endif
1503
0
}
1504
1505
1506
// ********************************** DCT-VIII **********************************
1507
void fastForwardDCT8_B4(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
1508
0
{
1509
0
  int i;
1510
0
  int rnd_factor = 1 << (shift - 1);
1511
0
  const TMatrixCoeff *iT = g_trCoreDCT8P4[TRANSFORM_FORWARD][0];
1512
1513
0
  int c[4];
1514
0
  TCoeff *pCoeff = dst;
1515
0
  const int  reducedLine = line - iSkipLine;
1516
0
  for (i = 0; i<reducedLine; i++)
1517
0
  {
1518
    // Intermediate Variables
1519
0
    c[0] = src[0] + src[3];
1520
0
    c[1] = src[2] + src[0];
1521
0
    c[2] = src[3] - src[2];
1522
0
    c[3] = iT[1] * src[1];
1523
1524
0
    dst[0 * line] = (iT[3] * c[0] + iT[2] * c[1] + c[3] + rnd_factor) >> shift;
1525
0
    dst[1 * line] = (iT[1] * (src[0] - src[2] - src[3]) + rnd_factor) >> shift;
1526
0
    dst[2 * line] = (iT[3] * c[2] + iT[2] * c[0] - c[3] + rnd_factor) >> shift;
1527
0
    dst[3 * line] = (iT[3] * c[1] - iT[2] * c[2] - c[3] + rnd_factor) >> shift;
1528
1529
0
    src += 4;
1530
0
    dst++;
1531
0
  }
1532
0
  if (iSkipLine)
1533
0
  {
1534
0
    dst = pCoeff + reducedLine;
1535
0
    for (i = 0; i<4; i++)
1536
0
    {
1537
0
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
1538
0
      dst += line;
1539
0
    }
1540
0
  }
1541
0
}
1542
1543
1544
void fastInverseDCT8_B4(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1545
0
{
1546
0
#if ENABLE_SIMD_TRAFO
1547
0
  _fastInverseMM<4>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDCT8P4[TRANSFORM_INVERSE][0] );
1548
#else
1549
  int i;
1550
  int rnd_factor = 1 << (shift - 1);
1551
1552
  const TMatrixCoeff *iT = g_trCoreDCT8P4[TRANSFORM_INVERSE][0];
1553
1554
  int c[4];
1555
  const int  reducedLine = line - iSkipLine;
1556
  for (i = 0; i<reducedLine; i++)
1557
  {
1558
    // Intermediate Variables
1559
    c[0] = src[0 * line] + src[3 * line];
1560
    c[1] = src[2 * line] + src[0 * line];
1561
    c[2] = src[3 * line] - src[2 * line];
1562
    c[3] = iT[1] * src[1 * line];
1563
1564
    dst[0] = Clip3(outputMinimum, outputMaximum, (iT[3] * c[0] + iT[2] * c[1] + c[3] + rnd_factor) >> shift);
1565
    dst[1] = Clip3(outputMinimum, outputMaximum, (iT[1] * (src[0 * line] - src[2 * line] - src[3 * line]) + rnd_factor) >> shift);
1566
    dst[2] = Clip3(outputMinimum, outputMaximum, (iT[3] * c[2] + iT[2] * c[0] - c[3] + rnd_factor) >> shift);
1567
    dst[3] = Clip3(outputMinimum, outputMaximum, (iT[3] * c[1] - iT[2] * c[2] - c[3] + rnd_factor) >> shift);
1568
1569
    dst += 4;
1570
    src++;
1571
  }
1572
  if (iSkipLine)
1573
  {
1574
    memset(dst, 0, (iSkipLine << 2) * sizeof(TCoeff));
1575
  }
1576
#endif
1577
0
}
1578
1579
1580
// Forward 8-point DCT-VIII: forwards all arguments unchanged to _fastForwardMM< 8 >, together
// with the coefficient table g_trCoreDCT8P8[TRANSFORM_FORWARD][0].
void fastForwardDCT8_B8(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
1581
0
{
1582
0
  _fastForwardMM< 8 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT8P8[TRANSFORM_FORWARD][0] );
1583
0
}
1584
1585
1586
// Inverse 8-point DCT-VIII: forwards all arguments (including the clipping bounds
// outputMinimum / outputMaximum) unchanged to _fastInverseMM< 8 >, together with the
// coefficient table g_trCoreDCT8P8[TRANSFORM_INVERSE][0].
void fastInverseDCT8_B8(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1587
0
{
1588
0
  _fastInverseMM< 8 >( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDCT8P8[TRANSFORM_INVERSE][0] );
1589
0
}
1590
1591
1592
// Forward 16-point DCT-VIII.
//
// Two implementations are selected at compile time:
//  - JVET_M0497_MATRIX_MULT set:   all arguments are forwarded to _fastForwardMM< 16 > together
//                                  with the table g_trCoreDCT8P16[TRANSFORM_FORWARD][0].
//  - JVET_M0497_MATRIX_MULT unset: hand-unrolled kernel below. For each of the first
//                                  (line - iSkipLine) rows, 16 consecutive samples are read from
//                                  src and output k of row j is written to dst[k * line + j].
//                                  Afterwards the last iSkipLine entries of the first
//                                  (16 - iSkipLine2) output rows and the whole last iSkipLine2
//                                  output rows are set to zero.
// NOTE(review): the unrolled branch reads its coefficients from the DST-VII table
// g_trCoreDST7P16, not from a DCT-VIII table - presumably intentional; confirm before enabling
// that branch.
void fastForwardDCT8_B16(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
1593
0
{
1594
#if !JVET_M0497_MATRIX_MULT
1595
  int j, k;
1596
  TCoeff a[5], b[5], c[5], d[5], t;
1597
  // Rounding offset; zero when no shift is applied
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
1598
1599
  const TMatrixCoeff *iT = g_trCoreDST7P16[TRANSFORM_FORWARD][0];
1600
1601
  TCoeff *pCoef = dst;
1602
  const int  reducedLine = line - iSkipLine;
1603
  const int  cutoff = 16 - iSkipLine2;
1604
1605
  for (j = 0; j < reducedLine; j++)
1606
  {
1607
    // Sums / differences of input samples that are shared between several outputs
    for (k = 0; k < 5; k++)
1608
    {
1609
      a[k] = src[15 - k] + src[ 4 - k];
1610
      b[k] = src[ 6 + k] + src[ 4 - k];
1611
      c[k] = src[15 - k] - src[ 6 + k];
1612
      d[k] = src[15 - k] + src[ 6 + k] - src[ 4 - k];
1613
    }
1614
1615
    // Contribution of src[5], added to or subtracted from the a/b/c-based outputs below
    t = iT[10] * src[5];
1616
1617
    // Outputs 1, 4, 7, 10, 13 depend only on d[]
    dst[ 1 * line] = ( - iT[ 2]*d[0] - iT[ 5]*d[1] - iT[ 8]*d[2] - iT[11]*d[3] - iT[14]*d[4] + add) >> shift;
1618
    dst[ 4 * line] = (   iT[ 8]*d[0] + iT[14]*d[1] + iT[ 5]*d[2] - iT[ 2]*d[3] - iT[11]*d[4] + add) >> shift;
1619
    dst[ 7 * line] = ( - iT[14]*d[0] - iT[ 2]*d[1] + iT[11]*d[2] + iT[ 5]*d[3] - iT[ 8]*d[4] + add) >> shift;
1620
    dst[10 * line] = (   iT[11]*d[0] - iT[ 8]*d[1] - iT[ 2]*d[2] + iT[14]*d[3] - iT[ 5]*d[4] + add) >> shift;
1621
    dst[13 * line] = ( - iT[ 5]*d[0] + iT[11]*d[1] - iT[14]*d[2] + iT[ 8]*d[3] - iT[ 2]*d[4] + add) >> shift;
1622
1623
    // Output 5 uses the single coefficient iT[10] on a signed sum of 11 input samples
    dst[ 5 * line] = ( - iT[10] * (src[15] + src[14] - src[12] - src[11] + src[9] + src[8] - src[6] - src[5] + src[3] + src[2] - src[0]) + add) >> shift;
1624
1625
    // Remaining ten outputs combine a[], b[], c[] and +/- t
    dst[ 0 * line] = (   iT[0]*a[0] + iT[9]*b[0] + iT[1]*a[1] + iT[8]*b[1] + iT[2]*a[2] + iT[7]*b[2] + iT[3]*a[3] + iT[6]*b[3] + iT[4]*a[4] + iT[5]*b[4] + t + add ) >> shift;
1626
    dst[ 2 * line] = (   iT[4]*c[0] - iT[5]*b[0] + iT[9]*c[1] - iT[0]*b[1] + iT[6]*c[2] + iT[3]*a[2] + iT[1]*c[3] + iT[8]*a[3] + iT[7]*a[4] + iT[2]*b[4] - t + add ) >> shift;
1627
    dst[ 3 * line] = ( - iT[6]*a[0] - iT[3]*b[0] - iT[2]*c[1] - iT[7]*a[1] - iT[9]*c[2] - iT[0]*a[2] - iT[4]*c[3] + iT[5]*b[3] + iT[1]*a[4] + iT[8]*b[4] - t + add ) >> shift;
1628
    dst[ 6 * line] = (   iT[8]*a[0] + iT[1]*c[0] + iT[6]*c[1] - iT[3]*b[1] - iT[5]*a[2] - iT[4]*b[2] - iT[7]*c[3] - iT[2]*a[3] - iT[0]*c[4] + iT[9]*b[4] + t + add ) >> shift;
1629
    dst[ 8 * line] = (   iT[4]*c[0] + iT[5]*a[0] - iT[0]*c[1] + iT[9]*b[1] - iT[3]*c[2] - iT[6]*a[2] + iT[1]*c[3] - iT[8]*b[3] + iT[2]*c[4] + iT[7]*a[4] - t + add ) >> shift;
1630
    dst[ 9 * line] = ( - iT[7]*c[0] - iT[2]*a[0] + iT[4]*a[1] + iT[5]*b[1] + iT[8]*c[2] - iT[1]*b[2] - iT[9]*a[3] - iT[0]*b[3] - iT[3]*c[4] + iT[6]*b[4] - t + add ) >> shift;
1631
    dst[11 * line] = ( - iT[9]*a[0] - iT[0]*b[0] + iT[8]*c[1] + iT[1]*a[1] - iT[2]*c[2] + iT[7]*b[2] - iT[6]*a[3] - iT[3]*b[3] + iT[5]*c[4] + iT[4]*a[4] + t + add ) >> shift;
1632
    dst[12 * line] = (   iT[7]*c[0] - iT[2]*b[0] - iT[5]*c[1] - iT[4]*a[1] + iT[8]*a[2] + iT[1]*b[2] - iT[0]*a[3] - iT[9]*b[3] - iT[6]*c[4] + iT[3]*b[4] + t + add ) >> shift;
1633
    dst[14 * line] = (   iT[3]*a[0] + iT[6]*b[0] - iT[7]*a[1] - iT[2]*b[1] + iT[0]*c[2] + iT[9]*a[2] - iT[4]*c[3] - iT[5]*a[3] + iT[8]*c[4] + iT[1]*a[4] - t + add ) >> shift;
1634
    dst[15 * line] = ( - iT[1]*c[0] + iT[8]*b[0] + iT[3]*c[1] - iT[6]*b[1] - iT[5]*c[2] + iT[4]*b[2] + iT[7]*c[3] - iT[2]*b[3] - iT[9]*c[4] + iT[0]*b[4] - t + add ) >> shift;
1635
1636
    src += 16;
1637
    dst++;
1638
  }
1639
1640
  // Zero the last iSkipLine entries of each of the first `cutoff` output rows
  if (iSkipLine)
1641
  {
1642
    dst = pCoef + reducedLine;
1643
    for (j = 0; j < cutoff; j++)
1644
    {
1645
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
1646
      dst += line;
1647
    }
1648
  }
1649
1650
  // Zero the last iSkipLine2 output rows completely
  if (iSkipLine2)
1651
  {
1652
    dst = pCoef + line * cutoff;
1653
    memset(dst, 0, sizeof(TCoeff) * line * iSkipLine2);
1654
  }
1655
#else
1656
0
  _fastForwardMM< 16 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT8P16[TRANSFORM_FORWARD][0] );
1657
0
#endif
1658
0
}
1659
1660
1661
// Inverse 16-point DCT-VIII.
//
// Two implementations are selected at compile time:
//  - JVET_M0497_MATRIX_MULT set:   all arguments are forwarded to _fastInverseMM< 16 > together
//                                  with the table g_trCoreDCT8P16[TRANSFORM_INVERSE][0].
//  - JVET_M0497_MATRIX_MULT unset: hand-unrolled kernel below. For each of the first
//                                  (line - iSkipLine) columns, 16 coefficients are read from src
//                                  with a stride of `line` and 16 results are written
//                                  contiguously to dst, each clipped to
//                                  [outputMinimum, outputMaximum]. The following iSkipLine * 16
//                                  output entries are set to zero. iSkipLine2 is not used in
//                                  this branch.
// NOTE(review): the unrolled branch reads its coefficients from the DST-VII table
// g_trCoreDST7P16, not from a DCT-VIII table - presumably intentional; confirm before enabling
// that branch.
void fastInverseDCT8_B16(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1662
0
{
1663
#if !JVET_M0497_MATRIX_MULT
1664
  int j, k;
1665
  TCoeff a[5], b[5], c[5], d[5], t;
1666
1667
  // Rounding offset; zero when no shift is applied
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
1668
1669
  const TMatrixCoeff *iT = g_trCoreDST7P16[TRANSFORM_INVERSE][0];
1670
1671
  const int reducedLine = line - iSkipLine;
1672
1673
  for (j = 0; j < reducedLine; j++)
1674
  {
1675
    // Sums / differences of (strided) input coefficients that are shared between several outputs
    for (k = 0; k < 5; k++)
1676
    {
1677
      a[k] = src[(15 - k ) * line] + src[( 4 - k) * line];
1678
      b[k] = src[( 6 + k ) * line] + src[( 4 - k) * line];
1679
      c[k] = src[(15 - k ) * line] - src[( 6 + k) * line];
1680
      d[k] = src[(15 - k ) * line] + src[( 6 + k) * line] - src[(4 - k) * line];
1681
    }
1682
1683
    // Contribution of input 5, added to or subtracted from the a/b/c-based outputs below
    t = iT[10] * src[5*line];
1684
1685
    // Outputs 1, 4, 7, 10, 13 depend only on d[]
    dst[ 1] = Clip3(outputMinimum, outputMaximum, (int)( - iT[ 2]*d[0] - iT[ 5]*d[1] - iT[ 8]*d[2] - iT[11]*d[3] - iT[14]*d[4] + add) >> shift);
1686
    dst[ 4] = Clip3(outputMinimum, outputMaximum, (int)(   iT[ 8]*d[0] + iT[14]*d[1] + iT[ 5]*d[2] - iT[ 2]*d[3] - iT[11]*d[4] + add) >> shift);
1687
    dst[ 7] = Clip3(outputMinimum, outputMaximum, (int)( - iT[14]*d[0] - iT[ 2]*d[1] + iT[11]*d[2] + iT[ 5]*d[3] - iT[ 8]*d[4] + add) >> shift);
1688
    dst[10] = Clip3(outputMinimum, outputMaximum, (int)(   iT[11]*d[0] - iT[ 8]*d[1] - iT[ 2]*d[2] + iT[14]*d[3] - iT[ 5]*d[4] + add) >> shift);
1689
    dst[13] = Clip3(outputMinimum, outputMaximum, (int)( - iT[ 5]*d[0] + iT[11]*d[1] - iT[14]*d[2] + iT[ 8]*d[3] - iT[ 2]*d[4] + add) >> shift);
1690
1691
    // Output 5 uses the single coefficient iT[10] on a signed sum of 11 input coefficients
    dst[ 5] = Clip3(outputMinimum, outputMaximum, (int)( - iT[10] * (src[15 * line] + src[14 * line] - src[12 * line] - src[11 * line] + src[9 * line] + src[8 * line] - src[6 * line] - src[5 * line] + src[3 * line] + src[2 * line] - src[0 * line]) + add) >> shift);
1692
1693
    // Remaining ten outputs combine a[], b[], c[] and +/- t
    dst[ 0] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0]*a[0] + iT[9]*b[0] + iT[1]*a[1] + iT[8]*b[1] + iT[2]*a[2] + iT[7]*b[2] + iT[3]*a[3] + iT[6]*b[3] + iT[4]*a[4] + iT[5]*b[4] + t + add ) >> shift );
1694
    dst[ 2] = Clip3(outputMinimum, outputMaximum, (int)(   iT[4]*c[0] - iT[5]*b[0] + iT[9]*c[1] - iT[0]*b[1] + iT[6]*c[2] + iT[3]*a[2] + iT[1]*c[3] + iT[8]*a[3] + iT[7]*a[4] + iT[2]*b[4] - t + add ) >> shift );
1695
    dst[ 3] = Clip3(outputMinimum, outputMaximum, (int)( - iT[6]*a[0] - iT[3]*b[0] - iT[2]*c[1] - iT[7]*a[1] - iT[9]*c[2] - iT[0]*a[2] - iT[4]*c[3] + iT[5]*b[3] + iT[1]*a[4] + iT[8]*b[4] - t + add ) >> shift );
1696
    dst[ 6] = Clip3(outputMinimum, outputMaximum, (int)(   iT[8]*a[0] + iT[1]*c[0] + iT[6]*c[1] - iT[3]*b[1] - iT[5]*a[2] - iT[4]*b[2] - iT[7]*c[3] - iT[2]*a[3] - iT[0]*c[4] + iT[9]*b[4] + t + add ) >> shift );
1697
    dst[ 8] = Clip3(outputMinimum, outputMaximum, (int)(   iT[4]*c[0] + iT[5]*a[0] - iT[0]*c[1] + iT[9]*b[1] - iT[3]*c[2] - iT[6]*a[2] + iT[1]*c[3] - iT[8]*b[3] + iT[2]*c[4] + iT[7]*a[4] - t + add ) >> shift );
1698
    dst[ 9] = Clip3(outputMinimum, outputMaximum, (int)( - iT[7]*c[0] - iT[2]*a[0] + iT[4]*a[1] + iT[5]*b[1] + iT[8]*c[2] - iT[1]*b[2] - iT[9]*a[3] - iT[0]*b[3] - iT[3]*c[4] + iT[6]*b[4] - t + add ) >> shift );
1699
    dst[11] = Clip3(outputMinimum, outputMaximum, (int)( - iT[9]*a[0] - iT[0]*b[0] + iT[8]*c[1] + iT[1]*a[1] - iT[2]*c[2] + iT[7]*b[2] - iT[6]*a[3] - iT[3]*b[3] + iT[5]*c[4] + iT[4]*a[4] + t + add ) >> shift );
1700
    dst[12] = Clip3(outputMinimum, outputMaximum, (int)(   iT[7]*c[0] - iT[2]*b[0] - iT[5]*c[1] - iT[4]*a[1] + iT[8]*a[2] + iT[1]*b[2] - iT[0]*a[3] - iT[9]*b[3] - iT[6]*c[4] + iT[3]*b[4] + t + add ) >> shift );
1701
    dst[14] = Clip3(outputMinimum, outputMaximum, (int)(   iT[3]*a[0] + iT[6]*b[0] - iT[7]*a[1] - iT[2]*b[1] + iT[0]*c[2] + iT[9]*a[2] - iT[4]*c[3] - iT[5]*a[3] + iT[8]*c[4] + iT[1]*a[4] - t + add ) >> shift );
1702
    dst[15] = Clip3(outputMinimum, outputMaximum, (int)( - iT[1]*c[0] + iT[8]*b[0] + iT[3]*c[1] - iT[6]*b[1] - iT[5]*c[2] + iT[4]*b[2] + iT[7]*c[3] - iT[2]*b[3] - iT[9]*c[4] + iT[0]*b[4] - t + add ) >> shift );
1703
1704
    src++;
1705
    dst += 16;
1706
  }
1707
1708
  // dst now points just past the last computed column: zero the skipped columns
  if (iSkipLine)
1709
  {
1710
    memset(dst, 0, (iSkipLine * 16) * sizeof(TCoeff));
1711
  }
1712
#else
1713
0
  _fastInverseMM< 16 >( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDCT8P16[TRANSFORM_INVERSE][0] );
1714
0
#endif
1715
0
}
1716
1717
1718
// Forward 32-point DCT-VIII.
//
// Two implementations are selected at compile time:
//  - JVET_M0497_MATRIX_MULT set:   all arguments are forwarded to _fastForwardMM< 32 > together
//                                  with the table g_trCoreDCT8P32[TRANSFORM_FORWARD][0].
//  - JVET_M0497_MATRIX_MULT unset: hand-unrolled kernel below. For each of the first
//                                  (line - iSkipLine) rows, 32 consecutive samples are read from
//                                  src and output k of row j is written to dst[k * line + j].
//                                  Afterwards the last iSkipLine entries of the first
//                                  (32 - iSkipLine2) output rows and the whole last iSkipLine2
//                                  output rows are set to zero.
// NOTE(review): the unrolled branch reads its coefficients from the DST-VII table
// g_trCoreDST7P32, not from a DCT-VIII table - presumably intentional; confirm before enabling
// that branch.
void fastForwardDCT8_B32(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2)
1719
0
{
1720
#if !JVET_M0497_MATRIX_MULT
1721
  int j, k;
1722
  TCoeff a[10][6];
1723
  TCoeff t[2];
1724
  TCoeff b[6];
1725
  TCoeff c[2];
1726
1727
  // Rounding offset; zero when no shift is applied
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
1728
  const TMatrixCoeff *iT = g_trCoreDST7P32[TRANSFORM_FORWARD][0];
1729
  TCoeff *pCoef = dst;
1730
  const int  reducedLine = line - iSkipLine;
1731
  const int  cutoff = 32 - iSkipLine2;
1732
1733
  for (j = 0; j < reducedLine; j++)
1734
  {
1735
    // a[0..9][k]: pairwise sums / differences of input samples shared between several outputs;
    // b[k]: five-term signed sums used by outputs 2, 7, 12, 17, 22, 27
    for (k = 0; k < 6; k++)
1736
    {
1737
      a[0][k] = src[31-k] - src[20+k];
1738
      a[1][k] = src[31-k] + src[18-k];
1739
      a[2][k] = src[31-k] + src[ 7+k];
1740
      a[3][k] = src[31-k] - src[ 5-k];
1741
      a[4][k] = src[25-k] + src[13+k];
1742
      a[5][k] = src[25-k] + src[12-k];
1743
      a[6][k] = src[25-k] - src[   k];
1744
      a[7][k] = src[18-k] - src[ 7+k];
1745
      a[8][k] = src[18-k] + src[ 5-k];
1746
      a[9][k] = src[12-k] + src[   k];
1747
1748
      b[k] = src[31-k] + src[20+k] - src[18-k] - src[7+k] + src[5-k];
1749
    }
1750
1751
    // c[k]: thirteen-term signed sums used by outputs 6 and 19
    for (k = 0; k < 2; k++)
1752
    {
1753
      c[k] = src[31-k] + src[28+k] - src[26-k] - src[23+k] + src[21-k] + src[18+k] - src[16-k] - src[13+k] + src[11-k] + src[8+k] - src[6-k] - src[3+k] + src[1-k];
1754
    }
1755
1756
    // Contributions of src[19] and src[6], added to or subtracted from the a[][]-based outputs
    t[0] = iT[12] * src[19] + iT[25] * src[6];
1757
    t[1] = iT[12] * src[6] - iT[25] * src[19];
1758
1759
    // 24 outputs built from a[][] and +/- t[0] or t[1]
    dst[ 0 * line] = (   iT[0] * a[3][0] + iT[11] * a[6][5] + iT[13] * a[8][0] + iT[24] * a[9][5] + iT[1] * a[3][1] + iT[10] * a[6][4] + iT[14] * a[8][1] + iT[23] * a[9][4] + iT[2] * a[3][2] + iT[9] * a[6][3] + iT[15] * a[8][2] + iT[22] * a[9][3] + iT[3] * a[3][3] + iT[8] * a[6][2] + iT[16] * a[8][3] + iT[21] * a[9][2] + iT[4] * a[3][4] + iT[7] * a[6][1] + iT[17] * a[8][4] + iT[20] * a[9][1] + iT[5] * a[3][5] + iT[6] * a[6][0] + iT[18] * a[8][5] + iT[19] * a[9][0] + t[0] + add) >> shift;
1760
    dst[ 1 * line] = (   iT[0] * a[5][2] - iT[11] * a[0][3] - iT[13] * a[4][2] - iT[24] * a[6][2] - iT[1] * a[9][1] - iT[10] * a[8][4] - iT[14] * a[3][4] - iT[23] * a[6][1] - iT[2] * a[0][0] + iT[9] * a[5][5] - iT[15] * a[6][5] - iT[22] * a[4][5] + iT[3] * a[5][3] - iT[8] * a[0][2] - iT[16] * a[4][3] - iT[21] * a[6][3] - iT[4] * a[9][0] - iT[7] * a[8][5] - iT[17] * a[3][5] - iT[20] * a[6][0] - iT[5] * a[0][1] + iT[6] * a[5][4] - iT[18] * a[6][4] - iT[19] * a[4][4] + t[1] + add) >> shift;
1761
    dst[ 3 * line] = (   iT[0] * a[9][4] + iT[11] * a[5][4] - iT[13] * a[2][1] + iT[24] * a[7][1] + iT[1] * a[0][3] + iT[10] * a[1][3] - iT[14] * a[3][3] - iT[23] * a[2][3] - iT[2] * a[8][5] - iT[9] * a[9][0] - iT[15] * a[6][0] - iT[22] * a[3][5] + iT[3] * a[1][4] + iT[8] * a[0][4] - iT[16] * a[2][4] - iT[21] * a[3][4] + iT[4] * a[5][3] + iT[7] * a[9][3] + iT[17] * a[7][2] - iT[20] * a[2][2] - iT[5] * a[8][0] - iT[6] * a[1][0] + iT[18] * a[4][5] + iT[19] * a[7][0] - t[1] + add) >> shift;
1762
    dst[ 4 * line] = ( - iT[0] * a[3][2] - iT[11] * a[2][2] + iT[13] * a[1][2] + iT[24] * a[0][2] + iT[1] * a[6][0] + iT[10] * a[3][5] + iT[14] * a[9][0] + iT[23] * a[8][5] - iT[2] * a[2][3] - iT[9] * a[3][3] + iT[15] * a[0][3] + iT[22] * a[1][3] - iT[3] * a[7][0] + iT[8] * a[2][0] - iT[16] * a[9][5] - iT[21] * a[5][5] + iT[4] * a[4][4] + iT[7] * a[6][4] + iT[17] * a[0][1] - iT[20] * a[5][4] - iT[5] * a[7][4] - iT[6] * a[4][1] + iT[18] * a[8][4] + iT[19] * a[1][4] - t[0] + add) >> shift;
1763
    dst[ 5 * line] = (   iT[0] * a[3][5] + iT[11] * a[6][0] + iT[13] * a[8][5] + iT[24] * a[9][0] - iT[1] * a[6][5] - iT[10] * a[3][0] - iT[14] * a[9][5] - iT[23] * a[8][0] + iT[2] * a[7][4] - iT[9] * a[2][4] + iT[15] * a[9][1] + iT[22] * a[5][1] + iT[3] * a[7][1] + iT[8] * a[4][4] - iT[16] * a[8][1] - iT[21] * a[1][1] - iT[4] * a[6][2] - iT[7] * a[4][2] + iT[17] * a[5][2] - iT[20] * a[0][3] + iT[5] * a[3][2] + iT[6] * a[2][2] - iT[18] * a[1][2] - iT[19] * a[0][2] - t[0] + add) >> shift;
1764
    dst[ 8 * line] = (   iT[0] * a[9][3] + iT[11] * a[8][2] + iT[13] * a[3][2] + iT[24] * a[6][3] + iT[1] * a[1][5] + iT[10] * a[0][5] - iT[14] * a[2][5] - iT[23] * a[3][5] - iT[2] * a[1][3] - iT[9] * a[8][3] + iT[15] * a[7][3] + iT[22] * a[4][2] - iT[3] * a[9][5] - iT[8] * a[5][5] + iT[16] * a[2][0] - iT[21] * a[7][0] - iT[4] * a[1][1] - iT[7] * a[0][1] + iT[17] * a[2][1] + iT[20] * a[3][1] + iT[5] * a[5][1] + iT[6] * a[9][1] + iT[18] * a[7][4] - iT[19] * a[2][4] + t[1] + add) >> shift;
1765
    dst[ 9 * line] = (   iT[0] * a[2][1] + iT[11] * a[3][1] - iT[13] * a[0][1] - iT[24] * a[1][1] - iT[1] * a[7][3] + iT[10] * a[2][3] - iT[14] * a[9][2] - iT[23] * a[5][2] - iT[2] * a[4][0] - iT[9] * a[7][5] + iT[15] * a[1][5] + iT[22] * a[8][5] - iT[3] * a[3][4] - iT[8] * a[2][4] + iT[16] * a[1][4] + iT[21] * a[0][4] - iT[4] * a[6][3] - iT[7] * a[3][2] - iT[17] * a[9][3] - iT[20] * a[8][2] - iT[5] * a[4][5] - iT[6] * a[6][5] - iT[18] * a[0][0] + iT[19] * a[5][5] + t[0] + add) >> shift;
1766
    dst[10 * line] = ( - iT[0] * a[6][1] - iT[11] * a[4][1] + iT[13] * a[5][1] - iT[24] * a[0][4] + iT[1] * a[2][2] - iT[10] * a[7][2] - iT[14] * a[5][3] - iT[23] * a[9][3] + iT[2] * a[6][4] + iT[9] * a[4][4] - iT[15] * a[5][4] + iT[22] * a[0][1] - iT[3] * a[2][5] + iT[8] * a[7][5] + iT[16] * a[5][0] + iT[21] * a[9][0] - iT[4] * a[7][0] - iT[7] * a[4][5] + iT[17] * a[8][0] + iT[20] * a[1][0] + iT[5] * a[4][2] + iT[6] * a[7][3] - iT[18] * a[1][3] - iT[19] * a[8][3] + t[0] + add) >> shift;
1767
    dst[11 * line] = ( - iT[0] * a[1][3] - iT[11] * a[0][3] + iT[13] * a[2][3] + iT[24] * a[3][3] - iT[1] * a[9][1] - iT[10] * a[5][1] + iT[14] * a[2][4] - iT[23] * a[7][4] - iT[2] * a[8][0] - iT[9] * a[9][5] - iT[15] * a[6][5] - iT[22] * a[3][0] + iT[3] * a[0][2] - iT[8] * a[5][3] + iT[16] * a[6][3] + iT[21] * a[4][3] + iT[4] * a[5][0] - iT[7] * a[0][5] - iT[17] * a[4][0] - iT[20] * a[6][0] + iT[5] * a[9][4] + iT[6] * a[5][4] - iT[18] * a[2][1] + iT[19] * a[7][1] + t[1] + add) >> shift;
1768
    dst[13 * line] = (   iT[0] * a[0][0] + iT[11] * a[1][0] - iT[13] * a[3][0] - iT[24] * a[2][0] + iT[1] * a[5][4] - iT[10] * a[0][1] - iT[14] * a[4][4] - iT[23] * a[6][4] - iT[2] * a[9][3] - iT[9] * a[5][3] + iT[15] * a[2][2] - iT[22] * a[7][2] + iT[3] * a[8][3] + iT[8] * a[9][2] + iT[16] * a[6][2] + iT[21] * a[3][3] - iT[4] * a[1][4] - iT[7] * a[8][4] + iT[17] * a[7][4] + iT[20] * a[4][1] + iT[5] * a[0][5] + iT[6] * a[1][5] - iT[18] * a[3][5] - iT[19] * a[2][5] - t[1] + add) >> shift;
1769
    dst[14 * line] = (   iT[0] * a[4][2] + iT[11] * a[7][3] - iT[13] * a[1][3] - iT[24] * a[8][3] + iT[1] * a[4][1] + iT[10] * a[6][1] + iT[14] * a[0][4] - iT[23] * a[5][1] - iT[2] * a[3][0] - iT[9] * a[2][0] + iT[15] * a[1][0] + iT[22] * a[0][0] - iT[3] * a[6][3] - iT[8] * a[4][3] + iT[16] * a[5][3] - iT[21] * a[0][2] - iT[4] * a[7][5] - iT[7] * a[4][0] + iT[17] * a[8][5] + iT[20] * a[1][5] + iT[5] * a[6][4] + iT[6] * a[3][1] + iT[18] * a[9][4] + iT[19] * a[8][1] - t[0] + add) >> shift;
1770
    dst[15 * line] = (   iT[0] * a[7][4] + iT[11] * a[4][1] - iT[13] * a[8][4] - iT[24] * a[1][4] - iT[1] * a[2][2] - iT[10] * a[3][2] + iT[14] * a[0][2] + iT[23] * a[1][2] - iT[2] * a[2][1] + iT[9] * a[7][1] + iT[15] * a[5][4] + iT[22] * a[9][4] + iT[3] * a[7][5] - iT[8] * a[2][5] + iT[16] * a[9][0] + iT[21] * a[5][0] + iT[4] * a[2][0] + iT[7] * a[3][0] - iT[17] * a[0][0] - iT[20] * a[1][0] + iT[5] * a[2][3] - iT[6] * a[7][3] - iT[18] * a[5][2] - iT[19] * a[9][2] - t[0] + add) >> shift;
1771
    dst[16 * line] = ( - iT[0] * a[0][1] + iT[11] * a[5][4] - iT[13] * a[6][4] - iT[24] * a[4][4] + iT[1] * a[0][3] - iT[10] * a[5][2] + iT[14] * a[6][2] + iT[23] * a[4][2] - iT[2] * a[0][5] + iT[9] * a[5][0] - iT[15] * a[6][0] - iT[22] * a[4][0] - iT[3] * a[0][4] - iT[8] * a[1][4] + iT[16] * a[3][4] + iT[21] * a[2][4] + iT[4] * a[0][2] + iT[7] * a[1][2] - iT[17] * a[3][2] - iT[20] * a[2][2] - iT[5] * a[0][0] - iT[6] * a[1][0] + iT[18] * a[3][0] + iT[19] * a[2][0] - t[1] + add) >> shift;
1772
    dst[18 * line] = (   iT[0] * a[0][5] + iT[11] * a[1][5] - iT[13] * a[3][5] - iT[24] * a[2][5] - iT[1] * a[1][0] - iT[10] * a[0][0] + iT[14] * a[2][0] + iT[23] * a[3][0] - iT[2] * a[5][1] + iT[9] * a[0][4] + iT[15] * a[4][1] + iT[22] * a[6][1] - iT[3] * a[8][1] - iT[8] * a[1][1] + iT[16] * a[4][4] + iT[21] * a[7][1] - iT[4] * a[9][2] - iT[7] * a[5][2] + iT[17] * a[2][3] - iT[20] * a[7][3] - iT[5] * a[9][3] - iT[6] * a[8][2] - iT[18] * a[3][2] - iT[19] * a[6][3] + t[1] + add) >> shift;
1773
    dst[20 * line] = ( - iT[0] * a[4][0] - iT[11] * a[6][0] - iT[13] * a[0][5] + iT[24] * a[5][0] + iT[1] * a[6][5] + iT[10] * a[4][5] - iT[14] * a[5][5] + iT[23] * a[0][0] - iT[2] * a[6][1] - iT[9] * a[3][4] - iT[15] * a[9][1] - iT[22] * a[8][4] + iT[3] * a[4][4] + iT[8] * a[7][1] - iT[16] * a[1][1] - iT[21] * a[8][1] - iT[4] * a[3][3] - iT[7] * a[2][3] + iT[17] * a[1][3] + iT[20] * a[0][3] + iT[5] * a[7][2] - iT[6] * a[2][2] + iT[18] * a[9][3] + iT[19] * a[5][3] + t[0] + add) >> shift;
1774
    dst[21 * line] = (   iT[0] * a[1][2] + iT[11] * a[8][2] - iT[13] * a[7][2] - iT[24] * a[4][3] + iT[1] * a[1][5] + iT[10] * a[8][5] - iT[14] * a[7][5] - iT[23] * a[4][0] + iT[2] * a[5][2] + iT[9] * a[9][2] + iT[15] * a[7][3] - iT[22] * a[2][3] + iT[3] * a[5][5] + iT[8] * a[9][5] + iT[16] * a[7][0] - iT[21] * a[2][0] + iT[4] * a[8][1] + iT[7] * a[9][4] + iT[17] * a[6][4] + iT[20] * a[3][1] + iT[5] * a[8][4] + iT[6] * a[9][1] + iT[18] * a[6][1] + iT[19] * a[3][4] + t[1] + add) >> shift;
1775
    dst[23 * line] = (   iT[0] * a[8][4] + iT[11] * a[9][1] + iT[13] * a[6][1] + iT[24] * a[3][4] - iT[1] * a[8][2] - iT[10] * a[1][2] + iT[14] * a[4][3] + iT[23] * a[7][2] - iT[2] * a[0][1] - iT[9] * a[1][1] + iT[15] * a[3][1] + iT[22] * a[2][1] + iT[3] * a[5][0] + iT[8] * a[9][0] + iT[16] * a[7][5] - iT[21] * a[2][5] - iT[4] * a[9][5] - iT[7] * a[8][0] - iT[17] * a[3][0] - iT[20] * a[6][5] + iT[5] * a[5][2] - iT[6] * a[0][3] - iT[18] * a[4][2] - iT[19] * a[6][2] - t[1] + add) >> shift;
1776
    dst[24 * line] = ( - iT[0] * a[2][3] + iT[11] * a[7][3] + iT[13] * a[5][2] + iT[24] * a[9][2] + iT[1] * a[4][1] + iT[10] * a[7][4] - iT[14] * a[1][4] - iT[23] * a[8][4] - iT[2] * a[4][5] - iT[9] * a[7][0] + iT[15] * a[1][0] + iT[22] * a[8][0] + iT[3] * a[4][3] + iT[8] * a[6][3] + iT[16] * a[0][2] - iT[21] * a[5][3] - iT[4] * a[2][5] - iT[7] * a[3][5] + iT[17] * a[0][5] + iT[20] * a[1][5] + iT[5] * a[2][1] + iT[6] * a[3][1] - iT[18] * a[0][1] - iT[19] * a[1][1] - t[0] + add) >> shift;
1777
    dst[25 * line] = ( - iT[0] * a[4][5] - iT[11] * a[6][5] - iT[13] * a[0][0] + iT[24] * a[5][5] - iT[1] * a[3][1] - iT[10] * a[2][1] + iT[14] * a[1][1] + iT[23] * a[0][1] + iT[2] * a[7][2] + iT[9] * a[4][3] - iT[15] * a[8][2] - iT[22] * a[1][2] + iT[3] * a[6][2] + iT[8] * a[3][3] + iT[16] * a[9][2] + iT[21] * a[8][3] + iT[4] * a[2][4] - iT[7] * a[7][4] - iT[17] * a[5][1] - iT[20] * a[9][1] - iT[5] * a[4][0] - iT[6] * a[6][0] - iT[18] * a[0][5] + iT[19] * a[5][0] - t[0] + add) >> shift;
1778
    dst[26 * line] = (   iT[0] * a[8][0] + iT[11] * a[1][0] - iT[13] * a[4][5] - iT[24] * a[7][0] + iT[1] * a[5][4] + iT[10] * a[9][4] + iT[14] * a[7][1] - iT[23] * a[2][1] - iT[2] * a[1][2] - iT[9] * a[0][2] + iT[15] * a[2][2] + iT[22] * a[3][2] - iT[3] * a[9][2] - iT[8] * a[8][3] - iT[16] * a[3][3] - iT[21] * a[6][2] + iT[4] * a[0][4] - iT[7] * a[5][1] + iT[17] * a[6][1] + iT[20] * a[4][1] + iT[5] * a[8][5] + iT[6] * a[1][5] - iT[18] * a[4][0] - iT[19] * a[7][5] - t[1] + add) >> shift;
1779
    dst[28 * line] = ( - iT[0] * a[5][1] - iT[11] * a[9][1] - iT[13] * a[7][4] + iT[24] * a[2][4] + iT[1] * a[8][2] + iT[10] * a[9][3] + iT[14] * a[6][3] + iT[23] * a[3][2] - iT[2] * a[9][4] - iT[9] * a[8][1] - iT[15] * a[3][1] - iT[22] * a[6][4] + iT[3] * a[9][0] + iT[8] * a[5][0] - iT[16] * a[2][5] + iT[21] * a[7][5] - iT[4] * a[5][5] + iT[7] * a[0][0] + iT[17] * a[4][5] + iT[20] * a[6][5] + iT[5] * a[1][3] + iT[6] * a[0][3] - iT[18] * a[2][3] - iT[19] * a[3][3] + t[1] + add) >> shift;
1780
    dst[29 * line] = (   iT[0] * a[6][4] + iT[11] * a[3][1] + iT[13] * a[9][4] + iT[24] * a[8][1] - iT[1] * a[7][3] - iT[10] * a[4][2] + iT[14] * a[8][3] + iT[23] * a[1][3] - iT[2] * a[3][5] - iT[9] * a[2][5] + iT[15] * a[1][5] + iT[22] * a[0][5] + iT[3] * a[2][4] + iT[8] * a[3][4] - iT[16] * a[0][4] - iT[21] * a[1][4] + iT[4] * a[4][3] + iT[7] * a[7][2] - iT[17] * a[1][2] - iT[20] * a[8][2] - iT[5] * a[3][0] - iT[6] * a[6][5] - iT[18] * a[8][0] - iT[19] * a[9][5] + t[0] + add) >> shift;
1781
    dst[30 * line] = ( - iT[0] * a[7][2] + iT[11] * a[2][2] - iT[13] * a[9][3] - iT[24] * a[5][3] - iT[1] * a[6][0] - iT[10] * a[4][0] + iT[14] * a[5][0] - iT[23] * a[0][5] - iT[2] * a[4][2] - iT[9] * a[6][2] - iT[15] * a[0][3] + iT[22] * a[5][2] + iT[3] * a[2][0] - iT[8] * a[7][0] - iT[16] * a[5][5] - iT[21] * a[9][5] + iT[4] * a[7][1] - iT[7] * a[2][1] + iT[17] * a[9][4] + iT[20] * a[5][4] + iT[5] * a[6][1] + iT[6] * a[4][1] - iT[18] * a[5][1] + iT[19] * a[0][4] + t[0] + add) >> shift;
1782
    dst[31 * line] = (   iT[0] * a[8][5] + iT[11] * a[1][5] - iT[13] * a[4][0] - iT[24] * a[7][5] - iT[1] * a[1][0] - iT[10] * a[8][0] + iT[14] * a[7][0] + iT[23] * a[4][5] - iT[2] * a[8][4] - iT[9] * a[1][4] + iT[15] * a[4][1] + iT[22] * a[7][4] + iT[3] * a[1][1] + iT[8] * a[8][1] - iT[16] * a[7][1] - iT[21] * a[4][4] + iT[4] * a[8][3] + iT[7] * a[1][3] - iT[17] * a[4][2] - iT[20] * a[7][3] - iT[5] * a[1][2] - iT[6] * a[8][2] + iT[18] * a[7][2] + iT[19] * a[4][3] + t[1] + add) >> shift;
1783
1784
    // 6 outputs built from b[] only
    dst[ 2 * line] = (   iT[ 4] * b[0] + iT[ 9] * b[1] + iT[14] * b[2] + iT[19] * b[3] + iT[24] * b[4] + iT[29] * b[5] + add) >> shift;
1785
    dst[ 7 * line] = ( - iT[14] * b[0] - iT[29] * b[1] - iT[19] * b[2] - iT[ 4] * b[3] + iT[ 9] * b[4] + iT[24] * b[5] + add) >> shift;
1786
    dst[12 * line] = (   iT[24] * b[0] + iT[14] * b[1] - iT[ 9] * b[2] - iT[29] * b[3] - iT[ 4] * b[4] + iT[19] * b[5] + add) >> shift;
1787
    dst[17 * line] = ( - iT[29] * b[0] + iT[ 4] * b[1] + iT[24] * b[2] - iT[ 9] * b[3] - iT[19] * b[4] + iT[14] * b[5] + add) >> shift;
1788
    dst[22 * line] = (   iT[19] * b[0] - iT[24] * b[1] + iT[ 4] * b[2] + iT[14] * b[3] - iT[29] * b[4] + iT[ 9] * b[5] + add) >> shift;
1789
    dst[27 * line] = ( - iT[ 9] * b[0] + iT[19] * b[1] - iT[29] * b[2] + iT[24] * b[3] - iT[14] * b[4] + iT[ 4] * b[5] + add) >> shift;
1790
1791
    // 2 outputs built from c[] only
    dst[ 6 * line] = (   iT[12] * c[0] + iT[25] * c[1] + add) >> shift;
1792
    dst[19 * line] = ( - iT[25] * c[0] + iT[12] * c[1] + add) >> shift;
1793
1794
    src += 32;
1795
    dst++;
1796
  }
1797
1798
  // Zero the last iSkipLine entries of each of the first `cutoff` output rows
  if (iSkipLine)
1799
  {
1800
    dst = pCoef + reducedLine;
1801
    for (j = 0; j < cutoff; j++)
1802
    {
1803
      memset(dst, 0, sizeof(TCoeff)*iSkipLine);
1804
      dst += line;
1805
    }
1806
  }
1807
1808
  // Zero the last iSkipLine2 output rows completely
  if (iSkipLine2)
1809
  {
1810
    dst = pCoef + line * cutoff;
1811
    memset(dst, 0, sizeof(TCoeff) * line * iSkipLine2);
1812
  }
1813
#else
1814
0
  _fastForwardMM< 32 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT8P32[TRANSFORM_FORWARD][0] );
1815
0
#endif
1816
0
}
1817
1818
1819
void fastInverseDCT8_B32(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum)
1820
0
{
1821
#if !JVET_M0497_MATRIX_MULT
1822
  int j, k;
1823
  TCoeff a[10][6];
1824
  TCoeff t[2];
1825
  TCoeff b[6];
1826
  TCoeff c[2];
1827
  TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0;
1828
1829
  const TMatrixCoeff *iT = g_trCoreDST7P32[TRANSFORM_INVERSE][0];
1830
1831
  const int  reducedLine = line - iSkipLine;
1832
1833
  for (j = 0; j < reducedLine; j++)
1834
  {
1835
    for (k = 0; k < 6; k++)
1836
    {
1837
      a[0][k] = src[(31 - k)*line] - src[(20 + k)*line];
1838
      a[1][k] = src[(31 - k)*line] + src[(18 - k)*line];
1839
      a[2][k] = src[(31 - k)*line] + src[( 7 + k)*line];
1840
      a[3][k] = src[(31 - k)*line] - src[( 5 - k)*line];
1841
      a[4][k] = src[(25 - k)*line] + src[(13 + k)*line];
1842
      a[5][k] = src[(25 - k)*line] + src[(12 - k)*line];
1843
      a[6][k] = src[(25 - k)*line] - src[      k *line];
1844
      a[7][k] = src[(18 - k)*line] - src[( 7 + k)*line];
1845
      a[8][k] = src[(18 - k)*line] + src[( 5 - k)*line];
1846
      a[9][k] = src[(12 - k)*line] + src[      k *line];
1847
1848
      b[k] = src[(31 - k)*line] + src[(20 + k)*line] - src[(18 - k)*line] - src[(7 + k)*line] + src[(5 - k)*line];
1849
    }
1850
1851
    for (k = 0; k < 2; k++)
1852
    {
1853
      c[k] = src[(31 - k)*line] + src[(28 + k)*line] - src[(26 - k)*line] - src[(23 + k)*line] + src[(21 - k)*line] + src[(18 + k)*line] - src[(16 - k)*line] - src[(13 + k)*line] + src[(11 - k)*line] + src[(8 + k)*line] - src[(6 - k)*line] - src[(3 + k)*line] + src[(1 - k)*line];
1854
    }
1855
1856
    t[0] = iT[12] * src[19 * line] + iT[25] * src[ 6 * line];
1857
    t[1] = iT[12] * src[ 6 * line] - iT[25] * src[19 * line];
1858
1859
    dst[ 0] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[3][0] + iT[11] * a[6][5] + iT[13] * a[8][0] + iT[24] * a[9][5] + iT[1] * a[3][1] + iT[10] * a[6][4] + iT[14] * a[8][1] + iT[23] * a[9][4] + iT[2] * a[3][2] + iT[9] * a[6][3] + iT[15] * a[8][2] + iT[22] * a[9][3] + iT[3] * a[3][3] + iT[8] * a[6][2] + iT[16] * a[8][3] + iT[21] * a[9][2] + iT[4] * a[3][4] + iT[7] * a[6][1] + iT[17] * a[8][4] + iT[20] * a[9][1] + iT[5] * a[3][5] + iT[6] * a[6][0] + iT[18] * a[8][5] + iT[19] * a[9][0] + t[0] + add) >> shift);
1860
    dst[ 1] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[5][2] - iT[11] * a[0][3] - iT[13] * a[4][2] - iT[24] * a[6][2] - iT[1] * a[9][1] - iT[10] * a[8][4] - iT[14] * a[3][4] - iT[23] * a[6][1] - iT[2] * a[0][0] + iT[9] * a[5][5] - iT[15] * a[6][5] - iT[22] * a[4][5] + iT[3] * a[5][3] - iT[8] * a[0][2] - iT[16] * a[4][3] - iT[21] * a[6][3] - iT[4] * a[9][0] - iT[7] * a[8][5] - iT[17] * a[3][5] - iT[20] * a[6][0] - iT[5] * a[0][1] + iT[6] * a[5][4] - iT[18] * a[6][4] - iT[19] * a[4][4] + t[1] + add) >> shift);
1861
    dst[ 3] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[9][4] + iT[11] * a[5][4] - iT[13] * a[2][1] + iT[24] * a[7][1] + iT[1] * a[0][3] + iT[10] * a[1][3] - iT[14] * a[3][3] - iT[23] * a[2][3] - iT[2] * a[8][5] - iT[9] * a[9][0] - iT[15] * a[6][0] - iT[22] * a[3][5] + iT[3] * a[1][4] + iT[8] * a[0][4] - iT[16] * a[2][4] - iT[21] * a[3][4] + iT[4] * a[5][3] + iT[7] * a[9][3] + iT[17] * a[7][2] - iT[20] * a[2][2] - iT[5] * a[8][0] - iT[6] * a[1][0] + iT[18] * a[4][5] + iT[19] * a[7][0] - t[1] + add) >> shift);
1862
    dst[ 4] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[3][2] - iT[11] * a[2][2] + iT[13] * a[1][2] + iT[24] * a[0][2] + iT[1] * a[6][0] + iT[10] * a[3][5] + iT[14] * a[9][0] + iT[23] * a[8][5] - iT[2] * a[2][3] - iT[9] * a[3][3] + iT[15] * a[0][3] + iT[22] * a[1][3] - iT[3] * a[7][0] + iT[8] * a[2][0] - iT[16] * a[9][5] - iT[21] * a[5][5] + iT[4] * a[4][4] + iT[7] * a[6][4] + iT[17] * a[0][1] - iT[20] * a[5][4] - iT[5] * a[7][4] - iT[6] * a[4][1] + iT[18] * a[8][4] + iT[19] * a[1][4] - t[0] + add) >> shift);
1863
    dst[ 5] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[3][5] + iT[11] * a[6][0] + iT[13] * a[8][5] + iT[24] * a[9][0] - iT[1] * a[6][5] - iT[10] * a[3][0] - iT[14] * a[9][5] - iT[23] * a[8][0] + iT[2] * a[7][4] - iT[9] * a[2][4] + iT[15] * a[9][1] + iT[22] * a[5][1] + iT[3] * a[7][1] + iT[8] * a[4][4] - iT[16] * a[8][1] - iT[21] * a[1][1] - iT[4] * a[6][2] - iT[7] * a[4][2] + iT[17] * a[5][2] - iT[20] * a[0][3] + iT[5] * a[3][2] + iT[6] * a[2][2] - iT[18] * a[1][2] - iT[19] * a[0][2] - t[0] + add) >> shift);
1864
    dst[ 8] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[9][3] + iT[11] * a[8][2] + iT[13] * a[3][2] + iT[24] * a[6][3] + iT[1] * a[1][5] + iT[10] * a[0][5] - iT[14] * a[2][5] - iT[23] * a[3][5] - iT[2] * a[1][3] - iT[9] * a[8][3] + iT[15] * a[7][3] + iT[22] * a[4][2] - iT[3] * a[9][5] - iT[8] * a[5][5] + iT[16] * a[2][0] - iT[21] * a[7][0] - iT[4] * a[1][1] - iT[7] * a[0][1] + iT[17] * a[2][1] + iT[20] * a[3][1] + iT[5] * a[5][1] + iT[6] * a[9][1] + iT[18] * a[7][4] - iT[19] * a[2][4] + t[1] + add) >> shift);
1865
    dst[ 9] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[2][1] + iT[11] * a[3][1] - iT[13] * a[0][1] - iT[24] * a[1][1] - iT[1] * a[7][3] + iT[10] * a[2][3] - iT[14] * a[9][2] - iT[23] * a[5][2] - iT[2] * a[4][0] - iT[9] * a[7][5] + iT[15] * a[1][5] + iT[22] * a[8][5] - iT[3] * a[3][4] - iT[8] * a[2][4] + iT[16] * a[1][4] + iT[21] * a[0][4] - iT[4] * a[6][3] - iT[7] * a[3][2] - iT[17] * a[9][3] - iT[20] * a[8][2] - iT[5] * a[4][5] - iT[6] * a[6][5] - iT[18] * a[0][0] + iT[19] * a[5][5] + t[0] + add) >> shift);
1866
    dst[10] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[6][1] - iT[11] * a[4][1] + iT[13] * a[5][1] - iT[24] * a[0][4] + iT[1] * a[2][2] - iT[10] * a[7][2] - iT[14] * a[5][3] - iT[23] * a[9][3] + iT[2] * a[6][4] + iT[9] * a[4][4] - iT[15] * a[5][4] + iT[22] * a[0][1] - iT[3] * a[2][5] + iT[8] * a[7][5] + iT[16] * a[5][0] + iT[21] * a[9][0] - iT[4] * a[7][0] - iT[7] * a[4][5] + iT[17] * a[8][0] + iT[20] * a[1][0] + iT[5] * a[4][2] + iT[6] * a[7][3] - iT[18] * a[1][3] - iT[19] * a[8][3] + t[0] + add) >> shift);
1867
    dst[11] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[1][3] - iT[11] * a[0][3] + iT[13] * a[2][3] + iT[24] * a[3][3] - iT[1] * a[9][1] - iT[10] * a[5][1] + iT[14] * a[2][4] - iT[23] * a[7][4] - iT[2] * a[8][0] - iT[9] * a[9][5] - iT[15] * a[6][5] - iT[22] * a[3][0] + iT[3] * a[0][2] - iT[8] * a[5][3] + iT[16] * a[6][3] + iT[21] * a[4][3] + iT[4] * a[5][0] - iT[7] * a[0][5] - iT[17] * a[4][0] - iT[20] * a[6][0] + iT[5] * a[9][4] + iT[6] * a[5][4] - iT[18] * a[2][1] + iT[19] * a[7][1] + t[1] + add) >> shift);
1868
    dst[13] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[0][0] + iT[11] * a[1][0] - iT[13] * a[3][0] - iT[24] * a[2][0] + iT[1] * a[5][4] - iT[10] * a[0][1] - iT[14] * a[4][4] - iT[23] * a[6][4] - iT[2] * a[9][3] - iT[9] * a[5][3] + iT[15] * a[2][2] - iT[22] * a[7][2] + iT[3] * a[8][3] + iT[8] * a[9][2] + iT[16] * a[6][2] + iT[21] * a[3][3] - iT[4] * a[1][4] - iT[7] * a[8][4] + iT[17] * a[7][4] + iT[20] * a[4][1] + iT[5] * a[0][5] + iT[6] * a[1][5] - iT[18] * a[3][5] - iT[19] * a[2][5] - t[1] + add) >> shift);
1869
    dst[14] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[4][2] + iT[11] * a[7][3] - iT[13] * a[1][3] - iT[24] * a[8][3] + iT[1] * a[4][1] + iT[10] * a[6][1] + iT[14] * a[0][4] - iT[23] * a[5][1] - iT[2] * a[3][0] - iT[9] * a[2][0] + iT[15] * a[1][0] + iT[22] * a[0][0] - iT[3] * a[6][3] - iT[8] * a[4][3] + iT[16] * a[5][3] - iT[21] * a[0][2] - iT[4] * a[7][5] - iT[7] * a[4][0] + iT[17] * a[8][5] + iT[20] * a[1][5] + iT[5] * a[6][4] + iT[6] * a[3][1] + iT[18] * a[9][4] + iT[19] * a[8][1] - t[0] + add) >> shift);
1870
    dst[15] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[7][4] + iT[11] * a[4][1] - iT[13] * a[8][4] - iT[24] * a[1][4] - iT[1] * a[2][2] - iT[10] * a[3][2] + iT[14] * a[0][2] + iT[23] * a[1][2] - iT[2] * a[2][1] + iT[9] * a[7][1] + iT[15] * a[5][4] + iT[22] * a[9][4] + iT[3] * a[7][5] - iT[8] * a[2][5] + iT[16] * a[9][0] + iT[21] * a[5][0] + iT[4] * a[2][0] + iT[7] * a[3][0] - iT[17] * a[0][0] - iT[20] * a[1][0] + iT[5] * a[2][3] - iT[6] * a[7][3] - iT[18] * a[5][2] - iT[19] * a[9][2] - t[0] + add) >> shift);
1871
    dst[16] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[0][1] + iT[11] * a[5][4] - iT[13] * a[6][4] - iT[24] * a[4][4] + iT[1] * a[0][3] - iT[10] * a[5][2] + iT[14] * a[6][2] + iT[23] * a[4][2] - iT[2] * a[0][5] + iT[9] * a[5][0] - iT[15] * a[6][0] - iT[22] * a[4][0] - iT[3] * a[0][4] - iT[8] * a[1][4] + iT[16] * a[3][4] + iT[21] * a[2][4] + iT[4] * a[0][2] + iT[7] * a[1][2] - iT[17] * a[3][2] - iT[20] * a[2][2] - iT[5] * a[0][0] - iT[6] * a[1][0] + iT[18] * a[3][0] + iT[19] * a[2][0] - t[1] + add) >> shift);
1872
    dst[18] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[0][5] + iT[11] * a[1][5] - iT[13] * a[3][5] - iT[24] * a[2][5] - iT[1] * a[1][0] - iT[10] * a[0][0] + iT[14] * a[2][0] + iT[23] * a[3][0] - iT[2] * a[5][1] + iT[9] * a[0][4] + iT[15] * a[4][1] + iT[22] * a[6][1] - iT[3] * a[8][1] - iT[8] * a[1][1] + iT[16] * a[4][4] + iT[21] * a[7][1] - iT[4] * a[9][2] - iT[7] * a[5][2] + iT[17] * a[2][3] - iT[20] * a[7][3] - iT[5] * a[9][3] - iT[6] * a[8][2] - iT[18] * a[3][2] - iT[19] * a[6][3] + t[1] + add) >> shift);
1873
    dst[20] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[4][0] - iT[11] * a[6][0] - iT[13] * a[0][5] + iT[24] * a[5][0] + iT[1] * a[6][5] + iT[10] * a[4][5] - iT[14] * a[5][5] + iT[23] * a[0][0] - iT[2] * a[6][1] - iT[9] * a[3][4] - iT[15] * a[9][1] - iT[22] * a[8][4] + iT[3] * a[4][4] + iT[8] * a[7][1] - iT[16] * a[1][1] - iT[21] * a[8][1] - iT[4] * a[3][3] - iT[7] * a[2][3] + iT[17] * a[1][3] + iT[20] * a[0][3] + iT[5] * a[7][2] - iT[6] * a[2][2] + iT[18] * a[9][3] + iT[19] * a[5][3] + t[0] + add) >> shift);
1874
    dst[21] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[1][2] + iT[11] * a[8][2] - iT[13] * a[7][2] - iT[24] * a[4][3] + iT[1] * a[1][5] + iT[10] * a[8][5] - iT[14] * a[7][5] - iT[23] * a[4][0] + iT[2] * a[5][2] + iT[9] * a[9][2] + iT[15] * a[7][3] - iT[22] * a[2][3] + iT[3] * a[5][5] + iT[8] * a[9][5] + iT[16] * a[7][0] - iT[21] * a[2][0] + iT[4] * a[8][1] + iT[7] * a[9][4] + iT[17] * a[6][4] + iT[20] * a[3][1] + iT[5] * a[8][4] + iT[6] * a[9][1] + iT[18] * a[6][1] + iT[19] * a[3][4] + t[1] + add) >> shift);
1875
    dst[23] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[8][4] + iT[11] * a[9][1] + iT[13] * a[6][1] + iT[24] * a[3][4] - iT[1] * a[8][2] - iT[10] * a[1][2] + iT[14] * a[4][3] + iT[23] * a[7][2] - iT[2] * a[0][1] - iT[9] * a[1][1] + iT[15] * a[3][1] + iT[22] * a[2][1] + iT[3] * a[5][0] + iT[8] * a[9][0] + iT[16] * a[7][5] - iT[21] * a[2][5] - iT[4] * a[9][5] - iT[7] * a[8][0] - iT[17] * a[3][0] - iT[20] * a[6][5] + iT[5] * a[5][2] - iT[6] * a[0][3] - iT[18] * a[4][2] - iT[19] * a[6][2] - t[1] + add) >> shift);
1876
    dst[24] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[2][3] + iT[11] * a[7][3] + iT[13] * a[5][2] + iT[24] * a[9][2] + iT[1] * a[4][1] + iT[10] * a[7][4] - iT[14] * a[1][4] - iT[23] * a[8][4] - iT[2] * a[4][5] - iT[9] * a[7][0] + iT[15] * a[1][0] + iT[22] * a[8][0] + iT[3] * a[4][3] + iT[8] * a[6][3] + iT[16] * a[0][2] - iT[21] * a[5][3] - iT[4] * a[2][5] - iT[7] * a[3][5] + iT[17] * a[0][5] + iT[20] * a[1][5] + iT[5] * a[2][1] + iT[6] * a[3][1] - iT[18] * a[0][1] - iT[19] * a[1][1] - t[0] + add) >> shift);
1877
    dst[25] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[4][5] - iT[11] * a[6][5] - iT[13] * a[0][0] + iT[24] * a[5][5] - iT[1] * a[3][1] - iT[10] * a[2][1] + iT[14] * a[1][1] + iT[23] * a[0][1] + iT[2] * a[7][2] + iT[9] * a[4][3] - iT[15] * a[8][2] - iT[22] * a[1][2] + iT[3] * a[6][2] + iT[8] * a[3][3] + iT[16] * a[9][2] + iT[21] * a[8][3] + iT[4] * a[2][4] - iT[7] * a[7][4] - iT[17] * a[5][1] - iT[20] * a[9][1] - iT[5] * a[4][0] - iT[6] * a[6][0] - iT[18] * a[0][5] + iT[19] * a[5][0] - t[0] + add) >> shift);
1878
    dst[26] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[8][0] + iT[11] * a[1][0] - iT[13] * a[4][5] - iT[24] * a[7][0] + iT[1] * a[5][4] + iT[10] * a[9][4] + iT[14] * a[7][1] - iT[23] * a[2][1] - iT[2] * a[1][2] - iT[9] * a[0][2] + iT[15] * a[2][2] + iT[22] * a[3][2] - iT[3] * a[9][2] - iT[8] * a[8][3] - iT[16] * a[3][3] - iT[21] * a[6][2] + iT[4] * a[0][4] - iT[7] * a[5][1] + iT[17] * a[6][1] + iT[20] * a[4][1] + iT[5] * a[8][5] + iT[6] * a[1][5] - iT[18] * a[4][0] - iT[19] * a[7][5] - t[1] + add) >> shift);
1879
    dst[28] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[5][1] - iT[11] * a[9][1] - iT[13] * a[7][4] + iT[24] * a[2][4] + iT[1] * a[8][2] + iT[10] * a[9][3] + iT[14] * a[6][3] + iT[23] * a[3][2] - iT[2] * a[9][4] - iT[9] * a[8][1] - iT[15] * a[3][1] - iT[22] * a[6][4] + iT[3] * a[9][0] + iT[8] * a[5][0] - iT[16] * a[2][5] + iT[21] * a[7][5] - iT[4] * a[5][5] + iT[7] * a[0][0] + iT[17] * a[4][5] + iT[20] * a[6][5] + iT[5] * a[1][3] + iT[6] * a[0][3] - iT[18] * a[2][3] - iT[19] * a[3][3] + t[1] + add) >> shift);
1880
    dst[29] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[6][4] + iT[11] * a[3][1] + iT[13] * a[9][4] + iT[24] * a[8][1] - iT[1] * a[7][3] - iT[10] * a[4][2] + iT[14] * a[8][3] + iT[23] * a[1][3] - iT[2] * a[3][5] - iT[9] * a[2][5] + iT[15] * a[1][5] + iT[22] * a[0][5] + iT[3] * a[2][4] + iT[8] * a[3][4] - iT[16] * a[0][4] - iT[21] * a[1][4] + iT[4] * a[4][3] + iT[7] * a[7][2] - iT[17] * a[1][2] - iT[20] * a[8][2] - iT[5] * a[3][0] - iT[6] * a[6][5] - iT[18] * a[8][0] - iT[19] * a[9][5] + t[0] + add) >> shift);
1881
    dst[30] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[7][2] + iT[11] * a[2][2] - iT[13] * a[9][3] - iT[24] * a[5][3] - iT[1] * a[6][0] - iT[10] * a[4][0] + iT[14] * a[5][0] - iT[23] * a[0][5] - iT[2] * a[4][2] - iT[9] * a[6][2] - iT[15] * a[0][3] + iT[22] * a[5][2] + iT[3] * a[2][0] - iT[8] * a[7][0] - iT[16] * a[5][5] - iT[21] * a[9][5] + iT[4] * a[7][1] - iT[7] * a[2][1] + iT[17] * a[9][4] + iT[20] * a[5][4] + iT[5] * a[6][1] + iT[6] * a[4][1] - iT[18] * a[5][1] + iT[19] * a[0][4] + t[0] + add) >> shift);
1882
    dst[31] = Clip3(outputMinimum, outputMaximum, (int)(   iT[0] * a[8][5] + iT[11] * a[1][5] - iT[13] * a[4][0] - iT[24] * a[7][5] - iT[1] * a[1][0] - iT[10] * a[8][0] + iT[14] * a[7][0] + iT[23] * a[4][5] - iT[2] * a[8][4] - iT[9] * a[1][4] + iT[15] * a[4][1] + iT[22] * a[7][4] + iT[3] * a[1][1] + iT[8] * a[8][1] - iT[16] * a[7][1] - iT[21] * a[4][4] + iT[4] * a[8][3] + iT[7] * a[1][3] - iT[17] * a[4][2] - iT[20] * a[7][3] - iT[5] * a[1][2] - iT[6] * a[8][2] + iT[18] * a[7][2] + iT[19] * a[4][3] + t[1] + add) >> shift);
1883
1884
    dst[ 2] = Clip3(outputMinimum, outputMaximum, (int)(   iT[ 4] * b[0] + iT[ 9] * b[1] + iT[14] * b[2] + iT[19] * b[3] + iT[24] * b[4] + iT[29] * b[5] + add) >> shift);
1885
    dst[ 7] = Clip3(outputMinimum, outputMaximum, (int)( - iT[14] * b[0] - iT[29] * b[1] - iT[19] * b[2] - iT[ 4] * b[3] + iT[ 9] * b[4] + iT[24] * b[5] + add) >> shift);
1886
    dst[12] = Clip3(outputMinimum, outputMaximum, (int)(   iT[24] * b[0] + iT[14] * b[1] - iT[ 9] * b[2] - iT[29] * b[3] - iT[ 4] * b[4] + iT[19] * b[5] + add) >> shift);
1887
    dst[17] = Clip3(outputMinimum, outputMaximum, (int)( - iT[29] * b[0] + iT[ 4] * b[1] + iT[24] * b[2] - iT[ 9] * b[3] - iT[19] * b[4] + iT[14] * b[5] + add) >> shift);
1888
    dst[22] = Clip3(outputMinimum, outputMaximum, (int)(   iT[19] * b[0] - iT[24] * b[1] + iT[ 4] * b[2] + iT[14] * b[3] - iT[29] * b[4] + iT[ 9] * b[5] + add) >> shift);
1889
    dst[27] = Clip3(outputMinimum, outputMaximum, (int)( - iT[ 9] * b[0] + iT[19] * b[1] - iT[29] * b[2] + iT[24] * b[3] - iT[14] * b[4] + iT[ 4] * b[5] + add) >> shift);
1890
1891
    dst[ 6] = Clip3(outputMinimum, outputMaximum, (int)(   iT[12] * c[0] + iT[25] * c[1] + add) >> shift);
1892
    dst[19] = Clip3(outputMinimum, outputMaximum, (int)( - iT[25] * c[0] + iT[12] * c[1] + add) >> shift);
1893
1894
    src++;
1895
    dst += 32;
1896
  }
1897
1898
  if (iSkipLine)
1899
  {
1900
    memset(dst, 0, (iSkipLine * 32) * sizeof(TCoeff));
1901
  }
1902
#else
1903
0
  _fastInverseMM< 32 >( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDCT8P32[TRANSFORM_INVERSE][0] );
1904
0
#endif
1905
0
}
1906
1907
#if ENABLE_SIMD_TRAFO
1908
1909
#define DONT_UNDEF_SIZE_AWARE_PER_EL_OP 1
1910
1911
}   // namespace vvenc
1912
1913
#include "Unit.h"
1914
1915
namespace vvenc {
1916
  
1917
void cpyCoeffCore( const Pel* src, ptrdiff_t stride, TCoeff* dst, unsigned width, unsigned height )
1918
1.69M
{
1919
545M
#define CPYCOEFF_OP( ADDR ) dst[ADDR] = src[ADDR];
1920
1.69M
#define CPYCOEFF_INC src += stride; dst += width;
1921
1922
545M
  SIZE_AWARE_PER_EL_OP( CPYCOEFF_OP, CPYCOEFF_INC );
1923
1924
1.69M
#undef CPYCOEFF_INC
1925
1.69M
#undef CPYCOEFF_OP
1926
1.69M
}
1927
1928
1929
void cpyResiCore( const TCoeff* src, Pel* dst, ptrdiff_t stride, unsigned width, unsigned height )
1930
696k
{
1931
150M
#define CPYRESI_OP( ADDR ) dst[ADDR] = Pel( src[ADDR] );
1932
696k
#define CPYRESI_INC dst += stride; src += width;
1933
1934
150M
  SIZE_AWARE_PER_EL_OP( CPYRESI_OP, CPYRESI_INC );
1935
1936
696k
#undef CPYRESI_INC
1937
696k
#undef CPYRESI_OP
1938
696k
}
1939
1940
1941
// Rounds, right-shifts and clips a coefficient buffer in place.
//
// Every visited element x is replaced by
//   Clip3( outputMin, outputMax, ( x + round ) >> shift )
// where Clip3 is defined elsewhere (by its use here it is expected to limit
// the value to the range given by its first two arguments).
//
//   dst       - buffer that is read and overwritten; advanced by `stride`
//               elements at every increment step
//   width     - not referenced directly in this body; presumably consumed by
//   height      SIZE_AWARE_PER_EL_OP as the block dimensions (macro is
//               defined outside this part of the file - confirm there)
//   stride    - element distance between consecutive increment steps
//   outputMin - lower bound handed to Clip3
//   outputMax - upper bound handed to Clip3
//   round     - offset added before the shift
//   shift     - right-shift amount applied after adding `round`
void clipCore( TCoeff *dst, unsigned width, unsigned height, unsigned stride, const TCoeff outputMin, const TCoeff outputMax, const TCoeff round, const TCoeff shift )
{
  // Element action: round, shift, then clip, storing back to the same slot.
#define ROUND_CLIP_EL( IDX )   dst[IDX] = Clip3( outputMin, outputMax, ( dst[IDX] + round ) >> shift )
  // Increment action: step to the next line of the buffer.
#define ROUND_CLIP_NEXT        dst += stride

  SIZE_AWARE_PER_EL_OP( ROUND_CLIP_EL, ROUND_CLIP_NEXT );

  // Keep the helper macros local to this function.
#undef ROUND_CLIP_NEXT
#undef ROUND_CLIP_EL
}
1951
1952
1953
template<unsigned trSize>
1954
void fastInvCore_( const TMatrixCoeff* it, const TCoeff* src, TCoeff* dst, unsigned lines, unsigned reducedLines, unsigned rows )
1955
609k
{
1956
8.02M
  for( int k = 0; k < rows; k++ )
1957
7.41M
  {
1958
7.41M
    const TCoeff* srcPtr = &src[k * lines];
1959
113M
    for( int i = 0; i < reducedLines; i++ )
1960
106M
    {
1961
106M
            TCoeff*       dstPtr = &dst[i * trSize];
1962
106M
      const TMatrixCoeff*  itPtr =  &it[k * trSize];
1963
3.02G
      for( int j = 0; j < trSize; j++ )
1964
2.92G
      {
1965
2.92G
        *dstPtr++ += *srcPtr * *itPtr++;
1966
2.92G
      }
1967
106M
      srcPtr++;
1968
106M
    }
1969
7.41M
  }
1970
609k
}
void vvenc::fastInvCore_<4u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Line
Count
Source
1955
1.78k
{
1956
8.90k
  for( int k = 0; k < rows; k++ )
1957
7.12k
  {
1958
7.12k
    const TCoeff* srcPtr = &src[k * lines];
1959
138k
    for( int i = 0; i < reducedLines; i++ )
1960
131k
    {
1961
131k
            TCoeff*       dstPtr = &dst[i * trSize];
1962
131k
      const TMatrixCoeff*  itPtr =  &it[k * trSize];
1963
657k
      for( int j = 0; j < trSize; j++ )
1964
525k
      {
1965
525k
        *dstPtr++ += *srcPtr * *itPtr++;
1966
525k
      }
1967
131k
      srcPtr++;
1968
131k
    }
1969
7.12k
  }
1970
1.78k
}
void vvenc::fastInvCore_<8u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Line
Count
Source
1955
1.10k
{
1956
9.97k
  for( int k = 0; k < rows; k++ )
1957
8.86k
  {
1958
8.86k
    const TCoeff* srcPtr = &src[k * lines];
1959
200k
    for( int i = 0; i < reducedLines; i++ )
1960
192k
    {
1961
192k
            TCoeff*       dstPtr = &dst[i * trSize];
1962
192k
      const TMatrixCoeff*  itPtr =  &it[k * trSize];
1963
1.72M
      for( int j = 0; j < trSize; j++ )
1964
1.53M
      {
1965
1.53M
        *dstPtr++ += *srcPtr * *itPtr++;
1966
1.53M
      }
1967
192k
      srcPtr++;
1968
192k
    }
1969
8.86k
  }
1970
1.10k
}
void vvenc::fastInvCore_<16u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Line
Count
Source
1955
402k
{
1956
4.49M
  for( int k = 0; k < rows; k++ )
1957
4.09M
  {
1958
4.09M
    const TCoeff* srcPtr = &src[k * lines];
1959
55.9M
    for( int i = 0; i < reducedLines; i++ )
1960
51.8M
    {
1961
51.8M
            TCoeff*       dstPtr = &dst[i * trSize];
1962
51.8M
      const TMatrixCoeff*  itPtr =  &it[k * trSize];
1963
881M
      for( int j = 0; j < trSize; j++ )
1964
829M
      {
1965
829M
        *dstPtr++ += *srcPtr * *itPtr++;
1966
829M
      }
1967
51.8M
      srcPtr++;
1968
51.8M
    }
1969
4.09M
  }
1970
402k
}
void vvenc::fastInvCore_<32u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Line
Count
Source
1955
187k
{
1956
3.19M
  for( int k = 0; k < rows; k++ )
1957
3.01M
  {
1958
3.01M
    const TCoeff* srcPtr = &src[k * lines];
1959
45.7M
    for( int i = 0; i < reducedLines; i++ )
1960
42.7M
    {
1961
42.7M
            TCoeff*       dstPtr = &dst[i * trSize];
1962
42.7M
      const TMatrixCoeff*  itPtr =  &it[k * trSize];
1963
1.40G
      for( int j = 0; j < trSize; j++ )
1964
1.36G
      {
1965
1.36G
        *dstPtr++ += *srcPtr * *itPtr++;
1966
1.36G
      }
1967
42.7M
      srcPtr++;
1968
42.7M
    }
1969
3.01M
  }
1970
187k
}
void vvenc::fastInvCore_<64u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int)
Line
Count
Source
1955
17.2k
{
1956
305k
  for( int k = 0; k < rows; k++ )
1957
288k
  {
1958
288k
    const TCoeff* srcPtr = &src[k * lines];
1959
11.5M
    for( int i = 0; i < reducedLines; i++ )
1960
11.2M
    {
1961
11.2M
            TCoeff*       dstPtr = &dst[i * trSize];
1962
11.2M
      const TMatrixCoeff*  itPtr =  &it[k * trSize];
1963
733M
      for( int j = 0; j < trSize; j++ )
1964
722M
      {
1965
722M
        *dstPtr++ += *srcPtr * *itPtr++;
1966
722M
      }
1967
11.2M
      srcPtr++;
1968
11.2M
    }
1969
288k
  }
1970
17.2k
}
1971
1972
1973
template<unsigned trSize>
1974
void fastFwdCore( const TMatrixCoeff* tc, const TCoeff* src, TCoeff* dst, unsigned line, unsigned reducedLine, unsigned cutoff, int shift )
1975
2.70M
{
1976
2.70M
  const int rnd_factor = 1 << ( shift - 1 );
1977
1978
40.0M
  for( int i = 0; i < reducedLine; i++ )
1979
37.3M
  {
1980
37.3M
          TCoeff*       dstPtr = dst;
1981
37.3M
    const TMatrixCoeff* iT     = tc;
1982
1983
557M
    for( int j = 0; j < cutoff; j++ )
1984
520M
    {
1985
520M
      int sum = 0;
1986
1987
15.3G
      for( int k = 0; k < trSize; k++ )
1988
14.8G
      {
1989
        // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]
1990
14.8G
        sum += src[k] * iT[k];
1991
14.8G
      }
1992
1993
520M
      dstPtr[i] = ( sum + rnd_factor ) >> shift;
1994
520M
      dstPtr   += line;
1995
520M
      iT       += trSize;
1996
520M
    }
1997
1998
37.3M
    src += trSize;
1999
37.3M
  }
2000
2.70M
}
Unexecuted instantiation: void vvenc::fastFwdCore<4u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int)
void vvenc::fastFwdCore<8u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int)
Line
Count
Source
1975
988k
{
1976
988k
  const int rnd_factor = 1 << ( shift - 1 );
1977
1978
12.6M
  for( int i = 0; i < reducedLine; i++ )
1979
11.6M
  {
1980
11.6M
          TCoeff*       dstPtr = dst;
1981
11.6M
    const TMatrixCoeff* iT     = tc;
1982
1983
102M
    for( int j = 0; j < cutoff; j++ )
1984
91.2M
    {
1985
91.2M
      int sum = 0;
1986
1987
821M
      for( int k = 0; k < trSize; k++ )
1988
730M
      {
1989
        // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]
1990
730M
        sum += src[k] * iT[k];
1991
730M
      }
1992
1993
91.2M
      dstPtr[i] = ( sum + rnd_factor ) >> shift;
1994
91.2M
      dstPtr   += line;
1995
91.2M
      iT       += trSize;
1996
91.2M
    }
1997
1998
11.6M
    src += trSize;
1999
11.6M
  }
2000
988k
}
void vvenc::fastFwdCore<16u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int)
Line
Count
Source
1975
871k
{
1976
871k
  const int rnd_factor = 1 << ( shift - 1 );
1977
1978
12.1M
  for( int i = 0; i < reducedLine; i++ )
1979
11.2M
  {
1980
11.2M
          TCoeff*       dstPtr = dst;
1981
11.2M
    const TMatrixCoeff* iT     = tc;
1982
1983
140M
    for( int j = 0; j < cutoff; j++ )
1984
128M
    {
1985
128M
      int sum = 0;
1986
1987
2.19G
      for( int k = 0; k < trSize; k++ )
1988
2.06G
      {
1989
        // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]
1990
2.06G
        sum += src[k] * iT[k];
1991
2.06G
      }
1992
1993
128M
      dstPtr[i] = ( sum + rnd_factor ) >> shift;
1994
128M
      dstPtr   += line;
1995
128M
      iT       += trSize;
1996
128M
    }
1997
1998
11.2M
    src += trSize;
1999
11.2M
  }
2000
871k
}
void vvenc::fastFwdCore<32u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int)
Line
Count
Source
1975
764k
{
1976
764k
  const int rnd_factor = 1 << ( shift - 1 );
1977
1978
12.0M
  for( int i = 0; i < reducedLine; i++ )
1979
11.2M
  {
1980
11.2M
          TCoeff*       dstPtr = dst;
1981
11.2M
    const TMatrixCoeff* iT     = tc;
1982
1983
234M
    for( int j = 0; j < cutoff; j++ )
1984
223M
    {
1985
223M
      int sum = 0;
1986
1987
7.36G
      for( int k = 0; k < trSize; k++ )
1988
7.14G
      {
1989
        // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]
1990
7.14G
        sum += src[k] * iT[k];
1991
7.14G
      }
1992
1993
223M
      dstPtr[i] = ( sum + rnd_factor ) >> shift;
1994
223M
      dstPtr   += line;
1995
223M
      iT       += trSize;
1996
223M
    }
1997
1998
11.2M
    src += trSize;
1999
11.2M
  }
2000
764k
}
void vvenc::fastFwdCore<64u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int)
Line
Count
Source
1975
76.8k
{
1976
76.8k
  const int rnd_factor = 1 << ( shift - 1 );
1977
1978
3.25M
  for( int i = 0; i < reducedLine; i++ )
1979
3.17M
  {
1980
3.17M
          TCoeff*       dstPtr = dst;
1981
3.17M
    const TMatrixCoeff* iT     = tc;
1982
1983
80.0M
    for( int j = 0; j < cutoff; j++ )
1984
76.8M
    {
1985
76.8M
      int sum = 0;
1986
1987
4.99G
      for( int k = 0; k < trSize; k++ )
1988
4.91G
      {
1989
        // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k]
1990
4.91G
        sum += src[k] * iT[k];
1991
4.91G
      }
1992
1993
76.8M
      dstPtr[i] = ( sum + rnd_factor ) >> shift;
1994
76.8M
      dstPtr   += line;
1995
76.8M
      iT       += trSize;
1996
76.8M
    }
1997
1998
3.17M
    src += trSize;
1999
3.17M
  }
2000
76.8k
}
2001
2002
2003
// Fills every function-pointer slot with the plain C++ implementation defined
// in this file. The "4" and "8" variants of the copy/clip slots all point to
// the same generic routine. Table index 0..4 selects the transform size
// 4, 8, 16, 32, 64.
// NOTE(review): presumably SIMD initialisation code replaces some of these
// slots later - confirm where TCoeffOps is set up.
TCoeffOps::TCoeffOps()
{
  // Sample/coefficient copy helpers.
  cpyCoeff4  = cpyCoeffCore;
  cpyCoeff8  = cpyCoeffCore;
  cpyResi4   = cpyResiCore;
  cpyResi8   = cpyResiCore;

  // Round-and-clip helper.
  roundClip4 = clipCore;
  roundClip8 = clipCore;

  // Inverse transform kernels, one per transform size.
  fastInvCore[0] = fastInvCore_< 4>;
  fastInvCore[1] = fastInvCore_< 8>;
  fastInvCore[2] = fastInvCore_<16>;
  fastInvCore[3] = fastInvCore_<32>;
  fastInvCore[4] = fastInvCore_<64>;

  // Forward transform kernels; the 1D and 2D tables share the same kernel
  // for each transform size.
  fastFwdCore_1D[0] = fastFwdCore< 4>;
  fastFwdCore_2D[0] = fastFwdCore< 4>;
  fastFwdCore_1D[1] = fastFwdCore< 8>;
  fastFwdCore_2D[1] = fastFwdCore< 8>;
  fastFwdCore_1D[2] = fastFwdCore<16>;
  fastFwdCore_2D[2] = fastFwdCore<16>;
  fastFwdCore_1D[3] = fastFwdCore<32>;
  fastFwdCore_2D[3] = fastFwdCore<32>;
  fastFwdCore_1D[4] = fastFwdCore<64>;
  fastFwdCore_2D[4] = fastFwdCore<64>;
}
2027
2028
// Global function-pointer table for coefficient operations; default-constructed,
// so TCoeffOps::TCoeffOps() fills it with the plain C++ implementations of this file.
TCoeffOps g_tCoeffOps;
2029
2030
#endif
2031
2032
2033
} // namespace vvenc
2034
2035
//! \}
2036