/src/vvenc/source/Lib/CommonLib/TrQuant_EMT.cpp
Line | Count | Source |
1 | | /* ----------------------------------------------------------------------------- |
2 | | The copyright in this software is being made available under the Clear BSD |
3 | | License, included below. No patent rights, trademark rights and/or |
4 | | other Intellectual Property Rights other than the copyrights concerning |
5 | | the Software are granted under this license. |
6 | | |
7 | | The Clear BSD License |
8 | | |
9 | | Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. |
10 | | All rights reserved. |
11 | | |
12 | | Redistribution and use in source and binary forms, with or without modification, |
13 | | are permitted (subject to the limitations in the disclaimer below) provided that |
14 | | the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the copyright holder nor the names of its |
24 | | contributors may be used to endorse or promote products derived from this |
25 | | software without specific prior written permission. |
26 | | |
27 | | NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY |
28 | | THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND |
29 | | CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
30 | | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A |
31 | | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR |
32 | | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
33 | | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
34 | | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
35 | | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER |
36 | | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
37 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
38 | | POSSIBILITY OF SUCH DAMAGE. |
39 | | |
40 | | |
41 | | ------------------------------------------------------------------------------------------- */ |
42 | | |
43 | | |
44 | | /** \file TrQuant_EMT.cpp |
45 | | \brief transform and quantization class |
46 | | */ |
47 | | |
48 | | #include "TrQuant_EMT.h" |
49 | | #include "Rom.h" |
50 | | |
51 | | #include <stdlib.h> |
52 | | #include <math.h> |
53 | | #include <memory.h> |
54 | | |
55 | | //! \ingroup CommonLib |
56 | | //! \{ |
57 | | |
58 | | namespace vvenc { |
59 | | |
60 | | // ********************************** DCT-II ********************************** |
61 | | |
62 | | #if ENABLE_SIMD_TRAFO |
63 | | template<int uiTrSize> |
64 | | inline void _fastInverseMM( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum, const TMatrixCoeff* iT ); |
65 | | |
66 | | template<> |
67 | | inline void _fastInverseMM<2>( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum, const TMatrixCoeff* iT ) |
68 | 0 | { |
69 | 0 | const int rnd_factor = 1 << (shift - 1); |
70 | 0 | const int reducedLine = line - iSkipLine; |
71 | 0 | const int cutoff = 2 - iSkipLine2; |
72 | 0 |
|
73 | 0 | memset( dst, 0, reducedLine * 2 * sizeof( TCoeff ) ); |
74 | 0 |
|
75 | 0 | for( int k = 0; k < cutoff; k++ ) |
76 | 0 | { |
77 | 0 | const TCoeff* srcPtr = &src[k * line]; |
78 | 0 | for( int i = 0; i < reducedLine; i++ ) |
79 | 0 | { |
80 | 0 | TCoeff* dstPtr = &dst[i << 1]; |
81 | 0 | const TMatrixCoeff* itPtr = &iT[k << 1]; |
82 | 0 | const TCoeff srcVal = *srcPtr; |
83 | 0 | for( int j = 0; j < 2; j++ ) |
84 | 0 | { |
85 | 0 | *dstPtr++ += srcVal * *itPtr++; |
86 | 0 | } |
87 | 0 | srcPtr++; |
88 | 0 | } |
89 | 0 | } |
90 | 0 |
|
91 | 0 | for( int i = 0; i < reducedLine; i++ ) |
92 | 0 | { |
93 | 0 | TCoeff* dstPtr = &dst[i << 1]; |
94 | 0 | for( int j = 0; j < 2; j++, dstPtr++ ) |
95 | 0 | { |
96 | 0 | *dstPtr = Clip3( outputMinimum, outputMaximum, ( int ) ( *dstPtr + rnd_factor ) >> shift ); |
97 | 0 | } |
98 | 0 | } |
99 | 0 |
|
100 | 0 | if( iSkipLine ) |
101 | 0 | { |
102 | 0 | memset( dst + ( reducedLine << 1 ), 0, ( iSkipLine << 1 ) * sizeof( TCoeff ) ); |
103 | 0 | } |
104 | 0 | } |
105 | | |
106 | | template<> |
107 | | inline void _fastInverseMM<4>( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum, const TMatrixCoeff* iT ) |
108 | 0 | { |
109 | 0 | const int rnd_factor = 1 << ( shift - 1 ); |
110 | 0 | const int reducedLine = line - iSkipLine; |
111 | 0 | const int cutoff = 4 - iSkipLine2; |
112 | |
|
113 | 0 | memset( dst, 0, reducedLine * 4 * sizeof( TCoeff ) ); |
114 | |
|
115 | 0 | #if ENABLE_SIMD_TRAFO |
116 | 0 | g_tCoeffOps.fastInvCore[0]( iT, src, dst, line, reducedLine, cutoff ); |
117 | 0 | g_tCoeffOps.roundClip4( dst, 4, reducedLine, 4, outputMinimum, outputMaximum, rnd_factor, shift ); |
118 | | #else |
119 | | for( int k = 0; k < cutoff; k++ ) |
120 | | { |
121 | | const TCoeff* srcPtr = &src[k * line]; |
122 | | for( int i = 0; i < reducedLine; i++ ) |
123 | | { |
124 | | TCoeff* dstPtr = &dst[i << 2]; |
125 | | const TMatrixCoeff* itPtr = &iT[k << 2]; |
126 | | for( int j = 0; j < 4; j++ ) |
127 | | { |
128 | | *dstPtr++ += *srcPtr * *itPtr++; |
129 | | } |
130 | | srcPtr++; |
131 | | } |
132 | | } |
133 | | |
134 | | for( int i = 0; i < reducedLine; i++ ) |
135 | | { |
136 | | TCoeff* dstPtr = &dst[i << 2]; |
137 | | for( int j = 0; j < 4; j++, dstPtr++ ) |
138 | | { |
139 | | *dstPtr = Clip3( outputMinimum, outputMaximum, ( int ) ( *dstPtr + rnd_factor ) >> shift ); |
140 | | } |
141 | | } |
142 | | #endif |
143 | |
|
144 | 0 | if( iSkipLine ) |
145 | 0 | { |
146 | 0 | memset( dst + ( reducedLine << 2 ), 0, ( iSkipLine << 2 ) * sizeof( TCoeff ) ); |
147 | 0 | } |
148 | 0 | } |
149 | | |
150 | | #endif |
151 | | |
152 | | template< int uiTrSize > |
153 | | inline void _fastInverseMM( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum, const TMatrixCoeff* iT ) |
154 | 0 | { |
155 | 0 | const int rnd_factor = 1 << (shift - 1); |
156 | 0 | const int reducedLine = line - iSkipLine; |
157 | 0 | const int cutoff = uiTrSize - iSkipLine2; |
158 | |
|
159 | 0 | memset( dst, 0, reducedLine * uiTrSize * sizeof( TCoeff ) ); |
160 | |
|
161 | 0 | #if ENABLE_SIMD_TRAFO |
162 | 0 | g_tCoeffOps.fastInvCore[Log2( uiTrSize ) - 2]( iT, src, dst, line, reducedLine, cutoff ); |
163 | 0 | g_tCoeffOps.roundClip8( dst, uiTrSize, reducedLine, uiTrSize, outputMinimum, outputMaximum, rnd_factor, shift ); |
164 | | #else |
165 | | for( int k = 0; k < cutoff; k++ ) |
166 | | { |
167 | | const TCoeff* srcPtr = &src[k * line]; |
168 | | for( int i = 0; i < reducedLine; i++ ) |
169 | | { |
170 | | TCoeff* dstPtr = &dst[i * uiTrSize]; |
171 | | const TMatrixCoeff* itPtr = &iT[k * uiTrSize]; |
172 | | for( int j = 0; j < uiTrSize; j++ ) |
173 | | { |
174 | | *dstPtr++ += *srcPtr * *itPtr++; |
175 | | } |
176 | | srcPtr++; |
177 | | } |
178 | | } |
179 | | |
180 | | for( int i = 0; i < reducedLine; i++ ) |
181 | | { |
182 | | TCoeff* dstPtr = &dst[i * uiTrSize]; |
183 | | for( int j = 0; j < uiTrSize; j++, dstPtr++ ) |
184 | | { |
185 | | *dstPtr = Clip3( outputMinimum, outputMaximum, ( int ) ( *dstPtr + rnd_factor ) >> shift ); |
186 | | } |
187 | | } |
188 | | #endif |
189 | |
|
190 | 0 | if( iSkipLine ) |
191 | 0 | { |
192 | 0 | memset( dst + ( reducedLine*uiTrSize ), 0, ( iSkipLine*uiTrSize ) * sizeof( TCoeff ) ); |
193 | 0 | } |
194 | 0 | } Unexecuted instantiation: void vvenc::_fastInverseMM<16>(int const*, int*, int, int, int, int, int, int, short const*) Unexecuted instantiation: void vvenc::_fastInverseMM<32>(int const*, int*, int, int, int, int, int, int, short const*) Unexecuted instantiation: void vvenc::_fastInverseMM<64>(int const*, int*, int, int, int, int, int, int, short const*) Unexecuted instantiation: void vvenc::_fastInverseMM<8>(int const*, int*, int, int, int, int, int, int, short const*) |
195 | | |
196 | | //Fast DCT-II transforms |
197 | | void fastForwardDCT2_B2(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2) |
198 | 0 | { |
199 | 0 | int j; |
200 | 0 | int E, O; |
201 | 0 | TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0; |
202 | |
|
203 | 0 | const TMatrixCoeff *iT = g_trCoreDCT2P2[TRANSFORM_FORWARD][0]; |
204 | |
|
205 | 0 | TCoeff *pCoef = dst; |
206 | 0 | const int reducedLine = line - iSkipLine; |
207 | 0 | for (j = 0; j<reducedLine; j++) |
208 | 0 | { |
209 | | /* E and O */ |
210 | 0 | E = src[0] + src[1]; |
211 | 0 | O = src[0] - src[1]; |
212 | |
|
213 | 0 | dst[0] = (iT[0] * E + add) >> shift; |
214 | 0 | dst[line] = (iT[2] * O + add) >> shift; |
215 | | |
216 | |
|
217 | 0 | src += 2; |
218 | 0 | dst++; |
219 | 0 | } |
220 | 0 | if (iSkipLine) |
221 | 0 | { |
222 | 0 | dst = pCoef + reducedLine; |
223 | 0 | for (j = 0; j<2; j++) |
224 | 0 | { |
225 | 0 | memset(dst, 0, sizeof(TCoeff)*iSkipLine); |
226 | 0 | dst += line; |
227 | 0 | } |
228 | 0 | } |
229 | 0 | } |
230 | | |
231 | | void fastInverseDCT2_B2(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum) |
232 | 0 | { |
233 | 0 | int j; |
234 | 0 | int E, O; |
235 | 0 | int add = 1 << (shift - 1); |
236 | |
|
237 | 0 | const TMatrixCoeff *iT = g_trCoreDCT2P2[TRANSFORM_INVERSE][0]; |
238 | |
|
239 | 0 | const int reducedLine = line - iSkipLine; |
240 | 0 | for (j = 0; j<reducedLine; j++) |
241 | 0 | { |
242 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
243 | 0 | E = iT[0] * (src[0] + src[line]); |
244 | 0 | O = iT[2] * (src[0] - src[line]); |
245 | | |
246 | | /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */ |
247 | 0 | dst[0] = Clip3(outputMinimum, outputMaximum, (E + add) >> shift); |
248 | 0 | dst[1] = Clip3(outputMinimum, outputMaximum, (O + add) >> shift); |
249 | |
|
250 | 0 | src++; |
251 | 0 | dst += 2; |
252 | 0 | } |
253 | 0 | if (iSkipLine) |
254 | 0 | { |
255 | 0 | memset(dst, 0, (iSkipLine << 1) * sizeof(TCoeff)); |
256 | 0 | } |
257 | 0 | } |
258 | | |
259 | | /** 4x4 forward transform implemented using partial butterfly structure (1D) |
260 | | * \param src input data (residual) |
261 | | * \param dst output data (transform coefficients) |
262 | | * \param shift specifies right shift after 1D transform |
263 | | * \param line |
264 | | */ |
265 | | void fastForwardDCT2_B4(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2) |
266 | 0 | { |
267 | 0 | int j; |
268 | 0 | TCoeff E[2], O[2]; |
269 | 0 | TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0; |
270 | |
|
271 | 0 | const TMatrixCoeff *iT = g_trCoreDCT2P4[TRANSFORM_FORWARD][0]; |
272 | |
|
273 | 0 | TCoeff *pCoef = dst; |
274 | 0 | const int reducedLine = line - iSkipLine; |
275 | 0 | for (j = 0; j<reducedLine; j++) |
276 | 0 | { |
277 | | /* E and O */ |
278 | 0 | E[0] = src[0] + src[3]; |
279 | 0 | O[0] = src[0] - src[3]; |
280 | 0 | E[1] = src[1] + src[2]; |
281 | 0 | O[1] = src[1] - src[2]; |
282 | |
|
283 | 0 | dst[0] = (iT[0] * E[0] + iT[1] * E[1] + add) >> shift; |
284 | 0 | dst[2 * line] = (iT[8] * E[0] + iT[9] * E[1] + add) >> shift; |
285 | 0 | dst[line] = (iT[4] * O[0] + iT[5] * O[1] + add) >> shift; |
286 | 0 | dst[3 * line] = (iT[12] * O[0] + iT[13] * O[1] + add) >> shift; |
287 | |
|
288 | 0 | src += 4; |
289 | 0 | dst++; |
290 | 0 | } |
291 | 0 | if (iSkipLine) |
292 | 0 | { |
293 | 0 | dst = pCoef + reducedLine; |
294 | 0 | for (j = 0; j<4; j++) |
295 | 0 | { |
296 | 0 | memset(dst, 0, sizeof(TCoeff)*iSkipLine); |
297 | 0 | dst += line; |
298 | 0 | } |
299 | 0 | } |
300 | 0 | } |
301 | | |
302 | | /** 4x4 inverse transform implemented using partial butterfly structure (1D) |
303 | | * \param src input data (transform coefficients) |
304 | | * \param dst output data (residual) |
305 | | * \param shift specifies right shift after 1D transform |
306 | | * \param line |
307 | | * \param outputMinimum minimum for clipping |
308 | | * \param outputMaximum maximum for clipping |
309 | | */ |
310 | | void fastInverseDCT2_B4( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum ) |
311 | 0 | { |
312 | | #if 0 |
313 | | const TMatrixCoeff *iT = g_trCoreDCT2P4[0]; |
314 | | |
315 | | _fastInverseMM<4>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, iT ); |
316 | | #else |
317 | 0 | int j; |
318 | 0 | int E[2], O[2]; |
319 | 0 | int add = 1 << ( shift - 1 ); |
320 | |
|
321 | 0 | const TMatrixCoeff *iT = g_trCoreDCT2P4[TRANSFORM_INVERSE][0]; |
322 | |
|
323 | 0 | #if ENABLE_SIMD_TRAFO |
324 | 0 | TCoeff* orgDst = dst; |
325 | |
|
326 | 0 | #endif |
327 | 0 | const int reducedLine = line - iSkipLine; |
328 | 0 | for( j = 0; j < reducedLine; j++ ) |
329 | 0 | { |
330 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
331 | 0 | O[0] = iT[1 * 4 + 0] * src[line] + iT[3 * 4 + 0] * src[3 * line]; |
332 | 0 | O[1] = iT[1 * 4 + 1] * src[line] + iT[3 * 4 + 1] * src[3 * line]; |
333 | 0 | E[0] = iT[0 * 4 + 0] * src[ 0] + iT[2 * 4 + 0] * src[2 * line]; |
334 | 0 | E[1] = iT[0 * 4 + 1] * src[ 0] + iT[2 * 4 + 1] * src[2 * line]; |
335 | | |
336 | | /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */ |
337 | 0 | #if ENABLE_SIMD_TRAFO |
338 | 0 | dst[0] = E[0] + O[0]; |
339 | 0 | dst[1] = E[1] + O[1]; |
340 | 0 | dst[2] = E[1] - O[1]; |
341 | 0 | dst[3] = E[0] - O[0]; |
342 | | #else |
343 | | dst[0] = Clip3( outputMinimum, outputMaximum, ( E[0] + O[0] + add ) >> shift ); |
344 | | dst[1] = Clip3( outputMinimum, outputMaximum, ( E[1] + O[1] + add ) >> shift ); |
345 | | dst[2] = Clip3( outputMinimum, outputMaximum, ( E[1] - O[1] + add ) >> shift ); |
346 | | dst[3] = Clip3( outputMinimum, outputMaximum, ( E[0] - O[0] + add ) >> shift ); |
347 | | #endif |
348 | |
|
349 | 0 | src++; |
350 | 0 | dst += 4; |
351 | 0 | } |
352 | |
|
353 | 0 | #if ENABLE_SIMD_TRAFO |
354 | 0 | g_tCoeffOps.roundClip4( orgDst, 4, reducedLine, 4, outputMinimum, outputMaximum, add, shift ); |
355 | |
|
356 | 0 | #endif |
357 | 0 | if( iSkipLine ) |
358 | 0 | { |
359 | 0 | memset( dst, 0, ( iSkipLine << 2 ) * sizeof( TCoeff ) ); |
360 | 0 | } |
361 | 0 | #endif |
362 | 0 | } |
363 | | |
364 | | |
365 | | |
366 | | template< int uiTrSize > |
367 | | inline void _fastForwardMM( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TMatrixCoeff* tc ) |
368 | 0 | { |
369 | | #if !ENABLE_SIMD_TRAFO |
370 | | const int rnd_factor = 1 << (shift - 1); |
371 | | #endif |
372 | 0 | const int reducedLine = line - iSkipLine; |
373 | 0 | const int cutoff = uiTrSize - iSkipLine2; |
374 | 0 | TCoeff *pCoef; |
375 | |
|
376 | 0 | #if ENABLE_SIMD_TRAFO |
377 | 0 | if( line == 1 ) |
378 | 0 | { |
379 | 0 | g_tCoeffOps.fastFwdCore_1D[Log2( uiTrSize ) - 2]( tc, src, dst, line, reducedLine, cutoff, shift ); |
380 | 0 | } |
381 | 0 | else |
382 | 0 | { |
383 | 0 | g_tCoeffOps.fastFwdCore_2D[Log2( uiTrSize ) - 2]( tc, src, dst, line, reducedLine, cutoff, shift ); |
384 | 0 | } |
385 | | #else |
386 | | for( int i = 0; i<reducedLine; i++ ) |
387 | | { |
388 | | pCoef = dst; |
389 | | const TMatrixCoeff* iT = tc; |
390 | | for( int j = 0; j<cutoff; j++ ) |
391 | | { |
392 | | int iSum = 0; |
393 | | for( int k = 0; k<uiTrSize; k++ ) |
394 | | { |
395 | | // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k] |
396 | | iSum += src[k] * iT[k]; |
397 | | } |
398 | | pCoef[i] = (iSum + rnd_factor) >> shift; |
399 | | pCoef += line; |
400 | | iT += uiTrSize; |
401 | | } |
402 | | src += uiTrSize; |
403 | | } |
404 | | #endif |
405 | |
|
406 | 0 | if( iSkipLine ) |
407 | 0 | { |
408 | 0 | pCoef = dst + reducedLine; |
409 | 0 | for( int j = 0; j<cutoff; j++ ) |
410 | 0 | { |
411 | 0 | memset(pCoef, 0, sizeof(TCoeff) * iSkipLine); |
412 | 0 | pCoef += line; |
413 | 0 | } |
414 | 0 | } |
415 | |
|
416 | 0 | if( iSkipLine2 ) |
417 | 0 | { |
418 | 0 | pCoef = dst + line*cutoff; |
419 | 0 | memset(pCoef, 0, sizeof(TCoeff) * line * iSkipLine2); |
420 | 0 | } |
421 | 0 | } Unexecuted instantiation: void vvenc::_fastForwardMM<8>(int const*, int*, int, int, int, int, short const*) Unexecuted instantiation: void vvenc::_fastForwardMM<16>(int const*, int*, int, int, int, int, short const*) Unexecuted instantiation: void vvenc::_fastForwardMM<32>(int const*, int*, int, int, int, int, short const*) Unexecuted instantiation: void vvenc::_fastForwardMM<64>(int const*, int*, int, int, int, int, short const*) |
422 | | |
423 | | |
424 | | |
425 | | /** 8x8 forward transform implemented using partial butterfly structure (1D) |
426 | | * \param src input data (residual) |
427 | | * \param dst output data (transform coefficients) |
428 | | * \param shift specifies right shift after 1D transform |
429 | | * \param line |
430 | | */ |
431 | | void fastForwardDCT2_B8( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2 ) |
432 | 0 | { |
433 | | #if !JVET_M0497_MATRIX_MULT |
434 | | int j, k; |
435 | | TCoeff E[4], O[4]; |
436 | | TCoeff EE[2], EO[2]; |
437 | | TCoeff add = ( shift > 0 ) ? ( 1 << ( shift - 1 ) ) : 0; |
438 | | |
439 | | const TMatrixCoeff *iT = g_trCoreDCT2P8[TRANSFORM_FORWARD][0]; |
440 | | |
441 | | TCoeff *pCoef = dst; |
442 | | const int reducedLine = line - iSkipLine; |
443 | | for( j = 0; j < reducedLine; j++ ) |
444 | | { |
445 | | /* E and O*/ |
446 | | for( k = 0; k < 4; k++ ) |
447 | | { |
448 | | E[k] = src[k] + src[7 - k]; |
449 | | O[k] = src[k] - src[7 - k]; |
450 | | } |
451 | | /* EE and EO */ |
452 | | EE[0] = E[0] + E[3]; |
453 | | EO[0] = E[0] - E[3]; |
454 | | EE[1] = E[1] + E[2]; |
455 | | EO[1] = E[1] - E[2]; |
456 | | |
457 | | dst[0 ] = (iT[ 0] * EE[0] + iT[ 1] * EE[1] + add) >> shift; |
458 | | dst[4 * line] = (iT[32] * EE[0] + iT[33] * EE[1] + add) >> shift; |
459 | | dst[2 * line] = (iT[16] * EO[0] + iT[17] * EO[1] + add) >> shift; |
460 | | dst[6 * line] = (iT[48] * EO[0] + iT[49] * EO[1] + add) >> shift; |
461 | | |
462 | | dst[ line] = (iT[ 8] * O[0] + iT[ 9] * O[1] + iT[10] * O[2] + iT[11] * O[3] + add) >> shift; |
463 | | dst[3 * line] = (iT[24] * O[0] + iT[25] * O[1] + iT[26] * O[2] + iT[27] * O[3] + add) >> shift; |
464 | | dst[5 * line] = (iT[40] * O[0] + iT[41] * O[1] + iT[42] * O[2] + iT[43] * O[3] + add) >> shift; |
465 | | dst[7 * line] = (iT[56] * O[0] + iT[57] * O[1] + iT[58] * O[2] + iT[59] * O[3] + add) >> shift; |
466 | | |
467 | | src += 8; |
468 | | dst++; |
469 | | } |
470 | | if( iSkipLine ) |
471 | | { |
472 | | dst = pCoef + reducedLine; |
473 | | for( j = 0; j < 8; j++ ) |
474 | | { |
475 | | memset( dst, 0, sizeof( TCoeff )*iSkipLine ); |
476 | | dst += line; |
477 | | } |
478 | | } |
479 | | #else |
480 | 0 | _fastForwardMM< 8 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT2P8[TRANSFORM_FORWARD][0] ); |
481 | 0 | #endif |
482 | 0 | } |
483 | | |
484 | | /** 8x8 inverse transform implemented using partial butterfly structure (1D) |
485 | | * \param src input data (transform coefficients) |
486 | | * \param dst output data (residual) |
487 | | * \param shift specifies right shift after 1D transform |
488 | | * \param line |
489 | | * \param outputMinimum minimum for clipping |
490 | | * \param outputMaximum maximum for clipping |
491 | | */ |
492 | | void fastInverseDCT2_B8(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum) |
493 | 0 | { |
494 | | #if 0 |
495 | | const TMatrixCoeff *iT = g_trCoreDCT2P8[0]; |
496 | | |
497 | | _fastInverseMM<8>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, iT ); |
498 | | #else |
499 | 0 | int j, k; |
500 | 0 | int E[4], O[4]; |
501 | 0 | int EE[2], EO[2]; |
502 | 0 | int add = 1 << (shift - 1); |
503 | |
|
504 | 0 | const TMatrixCoeff *iT = g_trCoreDCT2P8[TRANSFORM_INVERSE][0]; |
505 | |
|
506 | 0 | #if ENABLE_SIMD_TRAFO |
507 | 0 | TCoeff *orgDst = dst; |
508 | |
|
509 | 0 | #endif |
510 | 0 | const int reducedLine = line - iSkipLine; |
511 | 0 | for( j = 0; j < reducedLine; j++ ) |
512 | 0 | { |
513 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
514 | 0 | for( k = 0; k < 4; k++ ) |
515 | 0 | { |
516 | 0 | O[k] = iT[1 * 8 + k] * src[line] + iT[3 * 8 + k] * src[3 * line] + iT[5 * 8 + k] * src[5 * line] + iT[7 * 8 + k] * src[7 * line]; |
517 | 0 | } |
518 | |
|
519 | 0 | EO[0] = iT[2 * 8 + 0] * src[2 * line] + iT[6 * 8 + 0] * src[6 * line]; |
520 | 0 | EO[1] = iT[2 * 8 + 1] * src[2 * line] + iT[6 * 8 + 1] * src[6 * line]; |
521 | 0 | EE[0] = iT[0 * 8 + 0] * src[0 ] + iT[4 * 8 + 0] * src[4 * line]; |
522 | 0 | EE[1] = iT[0 * 8 + 1] * src[0 ] + iT[4 * 8 + 1] * src[4 * line]; |
523 | | |
524 | | /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */ |
525 | 0 | E[0] = EE[0] + EO[0]; |
526 | 0 | E[3] = EE[0] - EO[0]; |
527 | 0 | E[1] = EE[1] + EO[1]; |
528 | 0 | E[2] = EE[1] - EO[1]; |
529 | |
|
530 | 0 | for( k = 0; k < 4; k++ ) |
531 | 0 | { |
532 | 0 | #if ENABLE_SIMD_TRAFO |
533 | 0 | dst[k ] = E[ k] + O[ k]; |
534 | 0 | dst[k + 4] = E[3 - k] - O[3 - k]; |
535 | | #else |
536 | | dst[k ] = Clip3( outputMinimum, outputMaximum, ( E[ k] + O[ k] + add ) >> shift ); |
537 | | dst[k + 4] = Clip3( outputMinimum, outputMaximum, ( E[3 - k] - O[3 - k] + add ) >> shift ); |
538 | | #endif |
539 | 0 | } |
540 | 0 | src++; |
541 | 0 | dst += 8; |
542 | 0 | } |
543 | |
|
544 | 0 | #if ENABLE_SIMD_TRAFO |
545 | 0 | g_tCoeffOps.roundClip8( orgDst, 8, reducedLine, 8, outputMinimum, outputMaximum, add, shift ); |
546 | |
|
547 | 0 | #endif |
548 | 0 | if( iSkipLine ) |
549 | 0 | { |
550 | 0 | memset( dst, 0, ( iSkipLine << 3 ) * sizeof( TCoeff ) ); |
551 | 0 | } |
552 | 0 | #endif |
553 | 0 | } |
554 | | |
555 | | |
556 | | /** 16x16 forward transform implemented using partial butterfly structure (1D) |
557 | | * \param src input data (residual) |
558 | | * \param dst output data (transform coefficients) |
559 | | * \param shift specifies right shift after 1D transform |
560 | | * \param line |
561 | | */ |
562 | | void fastForwardDCT2_B16(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2) |
563 | 0 | { |
564 | | #if !JVET_M0497_MATRIX_MULT |
565 | | int j, k; |
566 | | TCoeff E [8], O [8]; |
567 | | TCoeff EE [4], EO [4]; |
568 | | TCoeff EEE[2], EEO[2]; |
569 | | TCoeff add = ( shift > 0 ) ? ( 1 << ( shift - 1 ) ) : 0; |
570 | | |
571 | | const TMatrixCoeff *iT = g_trCoreDCT2P16[TRANSFORM_FORWARD][0]; |
572 | | |
573 | | TCoeff *pCoef = dst; |
574 | | const int reducedLine = line - iSkipLine; |
575 | | for( j = 0; j < reducedLine; j++ ) |
576 | | { |
577 | | /* E and O*/ |
578 | | for( k = 0; k < 8; k++ ) |
579 | | { |
580 | | E[k] = src[k] + src[15 - k]; |
581 | | O[k] = src[k] - src[15 - k]; |
582 | | } |
583 | | /* EE and EO */ |
584 | | for( k = 0; k < 4; k++ ) |
585 | | { |
586 | | EE[k] = E[k] + E[7 - k]; |
587 | | EO[k] = E[k] - E[7 - k]; |
588 | | } |
589 | | /* EEE and EEO */ |
590 | | EEE[0] = EE[0] + EE[3]; |
591 | | EEO[0] = EE[0] - EE[3]; |
592 | | EEE[1] = EE[1] + EE[2]; |
593 | | EEO[1] = EE[1] - EE[2]; |
594 | | |
595 | | dst[ 0 ] = ( iT[ 0 ] * EEE[0] + iT[ 1] * EEE[1] + add ) >> shift; |
596 | | dst[ 8 * line] = ( iT[ 8 * 16] * EEE[0] + iT[ 8 * 16 + 1] * EEE[1] + add ) >> shift; |
597 | | dst[ 4 * line] = ( iT[ 4 * 16] * EEO[0] + iT[ 4 * 16 + 1] * EEO[1] + add ) >> shift; |
598 | | dst[12 * line] = ( iT[12 * 16] * EEO[0] + iT[12 * 16 + 1] * EEO[1] + add ) >> shift; |
599 | | |
600 | | for( k = 2; k < 16; k += 4 ) |
601 | | { |
602 | | dst[k*line] = ( iT[k * 16] * EO[0] + iT[k * 16 + 1] * EO[1] + iT[k * 16 + 2] * EO[2] + iT[k * 16 + 3] * EO[3] + add ) >> shift; |
603 | | } |
604 | | |
605 | | for( k = 1; k < 16; k += 2 ) |
606 | | { |
607 | | dst[k*line] = ( iT[k * 16 ] * O[0] + iT[k * 16 + 1] * O[1] + iT[k * 16 + 2] * O[2] + iT[k * 16 + 3] * O[3] + |
608 | | iT[k * 16 + 4] * O[4] + iT[k * 16 + 5] * O[5] + iT[k * 16 + 6] * O[6] + iT[k * 16 + 7] * O[7] + add ) >> shift; |
609 | | } |
610 | | |
611 | | src += 16; |
612 | | dst++; |
613 | | |
614 | | } |
615 | | if( iSkipLine ) |
616 | | { |
617 | | dst = pCoef + reducedLine; |
618 | | for( j = 0; j < 16; j++ ) |
619 | | { |
620 | | memset( dst, 0, sizeof( TCoeff )*iSkipLine ); |
621 | | dst += line; |
622 | | } |
623 | | } |
624 | | #else |
625 | 0 | _fastForwardMM< 16 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT2P16[TRANSFORM_FORWARD][0] ); |
626 | 0 | #endif |
627 | 0 | } |
628 | | |
629 | | /** 16x16 inverse transform implemented using partial butterfly structure (1D) |
630 | | * \param src input data (transform coefficients) |
631 | | * \param dst output data (residual) |
632 | | * \param shift specifies right shift after 1D transform |
633 | | * \param line |
634 | | * \param outputMinimum minimum for clipping |
635 | | * \param outputMaximum maximum for clipping |
636 | | */ |
637 | | void fastInverseDCT2_B16( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum ) |
638 | 0 | { |
639 | 0 | #if ENABLE_SIMD_TRAFO |
640 | 0 | const TMatrixCoeff *iT = g_trCoreDCT2P16[TRANSFORM_INVERSE][0]; |
641 | |
|
642 | 0 | _fastInverseMM<16>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, iT ); |
643 | | #else |
644 | | int j, k; |
645 | | int E [8], O [8]; |
646 | | int EE [4], EO [4]; |
647 | | int EEE[2], EEO[2]; |
648 | | int add = 1 << ( shift - 1 ); |
649 | | |
650 | | const TMatrixCoeff *iT = g_trCoreDCT2P16[TRANSFORM_INVERSE][0]; |
651 | | |
652 | | #if ENABLE_SIMD_TRAFO |
653 | | TCoeff *orgDst = dst; |
654 | | |
655 | | #endif |
656 | | const int reducedLine = line - iSkipLine; |
657 | | |
658 | | for( j = 0; j < reducedLine; j++ ) |
659 | | { |
660 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
661 | | for( k = 0; k < 8; k++ ) |
662 | | { |
663 | | O[k] = iT[1 * 16 + k] * src[ line] + iT[ 3 * 16 + k] * src[ 3 * line] + iT[ 5 * 16 + k] * src[ 5 * line] + iT[ 7 * 16 + k] * src[ 7 * line] + |
664 | | iT[9 * 16 + k] * src[9 * line] + iT[11 * 16 + k] * src[11 * line] + iT[13 * 16 + k] * src[13 * line] + iT[15 * 16 + k] * src[15 * line]; |
665 | | } |
666 | | for( k = 0; k < 4; k++ ) |
667 | | { |
668 | | EO[k] = iT[2 * 16 + k] * src[2 * line] + iT[6 * 16 + k] * src[6 * line] + iT[10 * 16 + k] * src[10 * line] + iT[14 * 16 + k] * src[14 * line]; |
669 | | } |
670 | | EEO[0] = iT[4 * 16 ] * src[4 * line] + iT[12 * 16 ] * src[12 * line]; |
671 | | EEE[0] = iT[0 ] * src[0 ] + iT[ 8 * 16 ] * src[ 8 * line]; |
672 | | EEO[1] = iT[4 * 16 + 1] * src[4 * line] + iT[12 * 16 + 1] * src[12 * line]; |
673 | | EEE[1] = iT[0 * 16 + 1] * src[0 ] + iT[ 8 * 16 + 1] * src[ 8 * line]; |
674 | | |
675 | | /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */ |
676 | | for( k = 0; k < 2; k++ ) |
677 | | { |
678 | | EE[k ] = EEE[ k] + EEO[ k]; |
679 | | EE[k + 2] = EEE[1 - k] - EEO[1 - k]; |
680 | | } |
681 | | for( k = 0; k < 4; k++ ) |
682 | | { |
683 | | E[k ] = EE[ k] + EO[ k]; |
684 | | E[k + 4] = EE[3 - k] - EO[3 - k]; |
685 | | } |
686 | | for( k = 0; k < 8; k++ ) |
687 | | { |
688 | | #if ENABLE_SIMD_TRAFO |
689 | | dst[k ] = E[ k] + O[ k]; |
690 | | dst[k + 8] = E[7 - k] - O[7 - k]; |
691 | | #else |
692 | | dst[k ] = Clip3( outputMinimum, outputMaximum, ( E[ k] + O[ k] + add ) >> shift ); |
693 | | dst[k + 8] = Clip3( outputMinimum, outputMaximum, ( E[7 - k] - O[7 - k] + add ) >> shift ); |
694 | | #endif |
695 | | } |
696 | | src++; |
697 | | dst += 16; |
698 | | } |
699 | | |
700 | | #if ENABLE_SIMD_TRAFO |
701 | | g_tCoeffOps.roundClip8( orgDst, 16, reducedLine, 16, outputMinimum, outputMaximum, add, shift ); |
702 | | |
703 | | #endif |
704 | | if( iSkipLine ) |
705 | | { |
706 | | memset( dst, 0, ( iSkipLine << 4 ) * sizeof( TCoeff ) ); |
707 | | } |
708 | | #endif |
709 | 0 | } |
710 | | |
711 | | |
712 | | |
713 | | /** 32x32 forward transform implemented using partial butterfly structure (1D) |
714 | | * \param src input data (residual) |
715 | | * \param dst output data (transform coefficients) |
716 | | * \param shift specifies right shift after 1D transform |
717 | | * \param line |
718 | | */ |
719 | | void fastForwardDCT2_B32( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2 ) |
720 | 0 | { |
721 | | #if !JVET_M0497_MATRIX_MULT |
722 | | int j, k; |
723 | | TCoeff E [16], O [16]; |
724 | | TCoeff EE [ 8], EO [ 8]; |
725 | | TCoeff EEE [ 4], EEO [ 4]; |
726 | | TCoeff EEEE[ 2], EEEO[ 2]; |
727 | | TCoeff add = ( shift > 0 ) ? ( 1 << ( shift - 1 ) ) : 0; |
728 | | |
729 | | const TMatrixCoeff *iT = g_trCoreDCT2P32[TRANSFORM_FORWARD][0]; |
730 | | |
731 | | TCoeff *pCoef = dst; |
732 | | const int reducedLine = line - iSkipLine; |
733 | | for (j = 0; j<reducedLine; j++) |
734 | | { |
735 | | /* E and O*/ |
736 | | for (k = 0;k<16;k++) |
737 | | { |
738 | | E[k] = src[k] + src[31 - k]; |
739 | | O[k] = src[k] - src[31 - k]; |
740 | | } |
741 | | /* EE and EO */ |
742 | | for (k = 0;k<8;k++) |
743 | | { |
744 | | EE[k] = E[k] + E[15 - k]; |
745 | | EO[k] = E[k] - E[15 - k]; |
746 | | } |
747 | | /* EEE and EEO */ |
748 | | for (k = 0;k<4;k++) |
749 | | { |
750 | | EEE[k] = EE[k] + EE[7 - k]; |
751 | | EEO[k] = EE[k] - EE[7 - k]; |
752 | | } |
753 | | /* EEEE and EEEO */ |
754 | | EEEE[0] = EEE[0] + EEE[3]; |
755 | | EEEO[0] = EEE[0] - EEE[3]; |
756 | | EEEE[1] = EEE[1] + EEE[2]; |
757 | | EEEO[1] = EEE[1] - EEE[2]; |
758 | | |
759 | | dst[0] = (iT[0 * 32 + 0] * EEEE[0] + iT[0 * 32 + 1] * EEEE[1] + add) >> shift; |
760 | | dst[16 * line] = (iT[16 * 32 + 0] * EEEE[0] + iT[16 * 32 + 1] * EEEE[1] + add) >> shift; |
761 | | dst[8 * line] = (iT[8 * 32 + 0] * EEEO[0] + iT[8 * 32 + 1] * EEEO[1] + add) >> shift; |
762 | | dst[24 * line] = (iT[24 * 32 + 0] * EEEO[0] + iT[24 * 32 + 1] * EEEO[1] + add) >> shift; |
763 | | for (k = 4;k<32;k += 8) |
764 | | { |
765 | | dst[k*line] = (iT[k * 32 + 0] * EEO[0] + iT[k * 32 + 1] * EEO[1] + iT[k * 32 + 2] * EEO[2] + iT[k * 32 + 3] * EEO[3] + add) >> shift; |
766 | | } |
767 | | for (k = 2;k<32;k += 4) |
768 | | { |
769 | | dst[k*line] = (iT[k * 32 + 0] * EO[0] + iT[k * 32 + 1] * EO[1] + iT[k * 32 + 2] * EO[2] + iT[k * 32 + 3] * EO[3] + |
770 | | iT[k * 32 + 4] * EO[4] + iT[k * 32 + 5] * EO[5] + iT[k * 32 + 6] * EO[6] + iT[k * 32 + 7] * EO[7] + add) >> shift; |
771 | | } |
772 | | for (k = 1;k<32;k += 2) |
773 | | { |
774 | | dst[k*line] = (iT[k * 32 + 0] * O[0] + iT[k * 32 + 1] * O[1] + iT[k * 32 + 2] * O[2] + iT[k * 32 + 3] * O[3] + |
775 | | iT[k * 32 + 4] * O[4] + iT[k * 32 + 5] * O[5] + iT[k * 32 + 6] * O[6] + iT[k * 32 + 7] * O[7] + |
776 | | iT[k * 32 + 8] * O[8] + iT[k * 32 + 9] * O[9] + iT[k * 32 + 10] * O[10] + iT[k * 32 + 11] * O[11] + |
777 | | iT[k * 32 + 12] * O[12] + iT[k * 32 + 13] * O[13] + iT[k * 32 + 14] * O[14] + iT[k * 32 + 15] * O[15] + add) >> shift; |
778 | | } |
779 | | src += 32; |
780 | | dst++; |
781 | | } |
782 | | if (iSkipLine) |
783 | | { |
784 | | dst = pCoef + reducedLine; |
785 | | for (j = 0; j<32; j++) |
786 | | { |
787 | | memset(dst, 0, sizeof(TCoeff)*iSkipLine); |
788 | | dst += line; |
789 | | } |
790 | | } |
791 | | #else |
792 | 0 | _fastForwardMM< 32 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT2P32[TRANSFORM_FORWARD][0] ); |
793 | 0 | #endif |
794 | 0 | } |
795 | | |
796 | | /** 32x32 inverse transform implemented using partial butterfly structure (1D) |
797 | | * \param src input data (transform coefficients) |
798 | | * \param dst output data (residual) |
799 | | * \param shift specifies right shift after 1D transform |
800 | | * \param line |
801 | | * \param outputMinimum minimum for clipping |
802 | | * \param outputMaximum maximum for clipping |
803 | | */ |
804 | | void fastInverseDCT2_B32(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum) |
805 | 0 | { |
806 | 0 | #if ENABLE_SIMD_TRAFO |
807 | 0 | const TMatrixCoeff *iT = g_trCoreDCT2P32[TRANSFORM_INVERSE][0]; |
808 | |
|
809 | 0 | _fastInverseMM<32>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, iT ); |
810 | | #else |
811 | | int j, k; |
812 | | int E[16], O[16]; |
813 | | int EE[8], EO[8]; |
814 | | int EEE[4], EEO[4]; |
815 | | int EEEE[2], EEEO[2]; |
816 | | int add = 1 << (shift - 1); |
817 | | |
818 | | const TMatrixCoeff *iT = g_trCoreDCT2P32[TRANSFORM_INVERSE][0]; |
819 | | |
820 | | #if ENABLE_SIMD_TRAFO |
821 | | TCoeff *orgDst = dst; |
822 | | |
823 | | #endif |
824 | | const int reducedLine = line - iSkipLine; |
825 | | for (j = 0; j<reducedLine; j++) |
826 | | { |
827 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
828 | | for (k = 0;k<16;k++) |
829 | | { |
830 | | O[k] = iT[1 * 32 + k] * src[line] + iT[3 * 32 + k] * src[3 * line] + iT[5 * 32 + k] * src[5 * line] + iT[7 * 32 + k] * src[7 * line] + |
831 | | iT[9 * 32 + k] * src[9 * line] + iT[11 * 32 + k] * src[11 * line] + iT[13 * 32 + k] * src[13 * line] + iT[15 * 32 + k] * src[15 * line] + |
832 | | iT[17 * 32 + k] * src[17 * line] + iT[19 * 32 + k] * src[19 * line] + iT[21 * 32 + k] * src[21 * line] + iT[23 * 32 + k] * src[23 * line] + |
833 | | iT[25 * 32 + k] * src[25 * line] + iT[27 * 32 + k] * src[27 * line] + iT[29 * 32 + k] * src[29 * line] + iT[31 * 32 + k] * src[31 * line]; |
834 | | } |
835 | | for (k = 0;k<8;k++) |
836 | | { |
837 | | EO[k] = iT[2 * 32 + k] * src[2 * line] + iT[6 * 32 + k] * src[6 * line] + iT[10 * 32 + k] * src[10 * line] + iT[14 * 32 + k] * src[14 * line] + |
838 | | iT[18 * 32 + k] * src[18 * line] + iT[22 * 32 + k] * src[22 * line] + iT[26 * 32 + k] * src[26 * line] + iT[30 * 32 + k] * src[30 * line]; |
839 | | } |
840 | | for (k = 0;k<4;k++) |
841 | | { |
842 | | EEO[k] = iT[4 * 32 + k] * src[4 * line] + iT[12 * 32 + k] * src[12 * line] + iT[20 * 32 + k] * src[20 * line] + iT[28 * 32 + k] * src[28 * line]; |
843 | | } |
844 | | EEEO[0] = iT[8 * 32 + 0] * src[8 * line] + iT[24 * 32 + 0] * src[24 * line]; |
845 | | EEEO[1] = iT[8 * 32 + 1] * src[8 * line] + iT[24 * 32 + 1] * src[24 * line]; |
846 | | EEEE[0] = iT[0 * 32 + 0] * src[0] + iT[16 * 32 + 0] * src[16 * line]; |
847 | | EEEE[1] = iT[0 * 32 + 1] * src[0] + iT[16 * 32 + 1] * src[16 * line]; |
848 | | |
849 | | /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */ |
850 | | EEE[0] = EEEE[0] + EEEO[0]; |
851 | | EEE[3] = EEEE[0] - EEEO[0]; |
852 | | EEE[1] = EEEE[1] + EEEO[1]; |
853 | | EEE[2] = EEEE[1] - EEEO[1]; |
854 | | for (k = 0;k<4;k++) |
855 | | { |
856 | | EE[k] = EEE[k] + EEO[k]; |
857 | | EE[k + 4] = EEE[3 - k] - EEO[3 - k]; |
858 | | } |
859 | | for (k = 0;k<8;k++) |
860 | | { |
861 | | E[k] = EE[k] + EO[k]; |
862 | | E[k + 8] = EE[7 - k] - EO[7 - k]; |
863 | | } |
864 | | for (k = 0;k<16;k++) |
865 | | { |
866 | | #if ENABLE_SIMD_TRAFO |
867 | | dst[k ] = E[k ] + O[k ]; |
868 | | dst[k + 16] = E[15 - k] - O[15 - k]; |
869 | | #else |
870 | | dst[k] = Clip3(outputMinimum, outputMaximum, (E[k] + O[k] + add) >> shift); |
871 | | dst[k + 16] = Clip3(outputMinimum, outputMaximum, (E[15 - k] - O[15 - k] + add) >> shift); |
872 | | #endif |
873 | | } |
874 | | src++; |
875 | | dst += 32; |
876 | | } |
877 | | |
878 | | #if ENABLE_SIMD_TRAFO |
879 | | g_tCoeffOps.roundClip8( orgDst, 32, reducedLine, 32, outputMinimum, outputMaximum, add, shift ); |
880 | | |
881 | | #endif |
882 | | if (iSkipLine) |
883 | | { |
884 | | memset(dst, 0, (iSkipLine << 5) * sizeof(TCoeff)); |
885 | | } |
886 | | #endif |
887 | 0 | } |
888 | | |
889 | | void fastForwardDCT2_B64(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2) |
890 | 0 | { |
891 | | #if !JVET_M0497_MATRIX_MULT |
892 | | int rnd_factor = 1 << (shift - 1); |
893 | | |
894 | | const int uiTrSize = 64; |
895 | | const TMatrixCoeff *iT = g_trCoreDCT2P64[TRANSFORM_FORWARD][0]; |
896 | | |
897 | | int j, k; |
898 | | TCoeff E[32], O[32]; |
899 | | TCoeff EE[16], EO[16]; |
900 | | TCoeff EEE[8], EEO[8]; |
901 | | TCoeff EEEE[4], EEEO[4]; |
902 | | TCoeff EEEEE[2], EEEEO[2]; |
903 | | TCoeff *tmp = dst; |
904 | | |
905 | | //bool zo = iSkipLine2 >= 32; |
906 | | bool zo = iSkipLine2 != 0; |
907 | | for (j = 0; j<line - iSkipLine; j++) |
908 | | { |
909 | | /* E and O*/ |
910 | | for (k = 0;k<32;k++) |
911 | | { |
912 | | E[k] = src[k] + src[63 - k]; |
913 | | O[k] = src[k] - src[63 - k]; |
914 | | } |
915 | | /* EE and EO */ |
916 | | for (k = 0;k<16;k++) |
917 | | { |
918 | | EE[k] = E[k] + E[31 - k]; |
919 | | EO[k] = E[k] - E[31 - k]; |
920 | | } |
921 | | /* EEE and EEO */ |
922 | | for (k = 0;k<8;k++) |
923 | | { |
924 | | EEE[k] = EE[k] + EE[15 - k]; |
925 | | EEO[k] = EE[k] - EE[15 - k]; |
926 | | } |
927 | | /* EEEE and EEEO */ |
928 | | for (k = 0;k<4;k++) |
929 | | { |
930 | | EEEE[k] = EEE[k] + EEE[7 - k]; |
931 | | EEEO[k] = EEE[k] - EEE[7 - k]; |
932 | | } |
933 | | /* EEEEE and EEEEO */ |
934 | | EEEEE[0] = EEEE[0] + EEEE[3]; |
935 | | EEEEO[0] = EEEE[0] - EEEE[3]; |
936 | | EEEEE[1] = EEEE[1] + EEEE[2]; |
937 | | EEEEO[1] = EEEE[1] - EEEE[2]; |
938 | | |
939 | | dst[0] = (iT[0 * 64 + 0] * EEEEE[0] + iT[0 * 64 + 1] * EEEEE[1] + rnd_factor) >> shift; |
940 | | dst[16 * line] = (iT[16 * 64 + 0] * EEEEO[0] + iT[16 * 64 + 1] * EEEEO[1] + rnd_factor) >> shift; |
941 | | |
942 | | if (!zo) |
943 | | { |
944 | | dst[32 * line] = (iT[32 * 64 + 0] * EEEEE[0] + iT[32 * 64 + 1] * EEEEE[1] + rnd_factor) >> shift; |
945 | | dst[48 * line] = (iT[48 * 64 + 0] * EEEEO[0] + iT[48 * 64 + 1] * EEEEO[1] + rnd_factor) >> shift; |
946 | | } |
947 | | for (k = 8;k<(zo ? 32 : 64);k += 16) |
948 | | { |
949 | | dst[k*line] = (iT[k * 64 + 0] * EEEO[0] + iT[k * 64 + 1] * EEEO[1] + iT[k * 64 + 2] * EEEO[2] + iT[k * 64 + 3] * EEEO[3] + rnd_factor) >> shift; |
950 | | } |
951 | | for (k = 4;k<(zo ? 32 : 64);k += 8) |
952 | | { |
953 | | dst[k*line] = (iT[k * 64 + 0] * EEO[0] + iT[k * 64 + 1] * EEO[1] + iT[k * 64 + 2] * EEO[2] + iT[k * 64 + 3] * EEO[3] + |
954 | | iT[k * 64 + 4] * EEO[4] + iT[k * 64 + 5] * EEO[5] + iT[k * 64 + 6] * EEO[6] + iT[k * 64 + 7] * EEO[7] + rnd_factor) >> shift; |
955 | | } |
956 | | for (k = 2;k<(zo ? 32 : 64);k += 4) |
957 | | { |
958 | | dst[k*line] = (iT[k * 64 + 0] * EO[0] + iT[k * 64 + 1] * EO[1] + iT[k * 64 + 2] * EO[2] + iT[k * 64 + 3] * EO[3] + |
959 | | iT[k * 64 + 4] * EO[4] + iT[k * 64 + 5] * EO[5] + iT[k * 64 + 6] * EO[6] + iT[k * 64 + 7] * EO[7] + |
960 | | iT[k * 64 + 8] * EO[8] + iT[k * 64 + 9] * EO[9] + iT[k * 64 + 10] * EO[10] + iT[k * 64 + 11] * EO[11] + |
961 | | iT[k * 64 + 12] * EO[12] + iT[k * 64 + 13] * EO[13] + iT[k * 64 + 14] * EO[14] + iT[k * 64 + 15] * EO[15] + rnd_factor) >> shift; |
962 | | } |
963 | | for (k = 1;k<(zo ? 32 : 64);k += 2) |
964 | | { |
965 | | dst[k*line] = (iT[k * 64 + 0] * O[0] + iT[k * 64 + 1] * O[1] + iT[k * 64 + 2] * O[2] + iT[k * 64 + 3] * O[3] + |
966 | | iT[k * 64 + 4] * O[4] + iT[k * 64 + 5] * O[5] + iT[k * 64 + 6] * O[6] + iT[k * 64 + 7] * O[7] + |
967 | | iT[k * 64 + 8] * O[8] + iT[k * 64 + 9] * O[9] + iT[k * 64 + 10] * O[10] + iT[k * 64 + 11] * O[11] + |
968 | | iT[k * 64 + 12] * O[12] + iT[k * 64 + 13] * O[13] + iT[k * 64 + 14] * O[14] + iT[k * 64 + 15] * O[15] + |
969 | | iT[k * 64 + 16] * O[16] + iT[k * 64 + 17] * O[17] + iT[k * 64 + 18] * O[18] + iT[k * 64 + 19] * O[19] + |
970 | | iT[k * 64 + 20] * O[20] + iT[k * 64 + 21] * O[21] + iT[k * 64 + 22] * O[22] + iT[k * 64 + 23] * O[23] + |
971 | | iT[k * 64 + 24] * O[24] + iT[k * 64 + 25] * O[25] + iT[k * 64 + 26] * O[26] + iT[k * 64 + 27] * O[27] + |
972 | | iT[k * 64 + 28] * O[28] + iT[k * 64 + 29] * O[29] + iT[k * 64 + 30] * O[30] + iT[k * 64 + 31] * O[31] + rnd_factor) >> shift; |
973 | | } |
974 | | src += uiTrSize; |
975 | | dst++; |
976 | | } |
977 | | |
978 | | const int reducedLine = line - iSkipLine; |
979 | | const int cutoff = uiTrSize - iSkipLine2; |
980 | | if (iSkipLine) |
981 | | { |
982 | | dst = tmp + reducedLine; |
983 | | for (j = 0; j<cutoff; j++) |
984 | | { |
985 | | memset(dst, 0, sizeof(TCoeff)*iSkipLine); |
986 | | dst += line; |
987 | | } |
988 | | } |
989 | | if (iSkipLine2) |
990 | | { |
991 | | dst = tmp + line*cutoff; |
992 | | memset(dst, 0, sizeof(TCoeff)*line*iSkipLine2); |
993 | | } |
994 | | #else |
995 | 0 | _fastForwardMM< 64 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT2P64[TRANSFORM_FORWARD][0] ); |
996 | 0 | #endif |
997 | 0 | } |
998 | | |
999 | | void fastInverseDCT2_B64(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum) |
1000 | 0 | { |
1001 | 0 | #if ENABLE_SIMD_TRAFO |
1002 | 0 | const TMatrixCoeff *iT = g_trCoreDCT2P64[TRANSFORM_INVERSE][0]; |
1003 | |
|
1004 | 0 | _fastInverseMM<64>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, iT ); |
1005 | | #else |
1006 | | int rnd_factor = 1 << (shift - 1); |
1007 | | const int uiTrSize = 64; |
1008 | | const TMatrixCoeff *iT = g_trCoreDCT2P64[TRANSFORM_INVERSE][0]; |
1009 | | |
1010 | | #if ENABLE_SIMD_TRAFO |
1011 | | TCoeff *orgDst = dst; |
1012 | | |
1013 | | #endif |
1014 | | int j, k; |
1015 | | TCoeff E[32], O[32]; |
1016 | | TCoeff EE[16], EO[16]; |
1017 | | TCoeff EEE[8], EEO[8]; |
1018 | | TCoeff EEEE[4], EEEO[4]; |
1019 | | TCoeff EEEEE[2], EEEEO[2]; |
1020 | | bool zo = iSkipLine2 >= 32; |
1021 | | for (j = 0; j<line - iSkipLine; j++) |
1022 | | { |
1023 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
1024 | | for (k = 0;k<32;k++) |
1025 | | { |
1026 | | O[k] = iT[1 * 64 + k] * src[line] + iT[3 * 64 + k] * src[3 * line] + iT[5 * 64 + k] * src[5 * line] + iT[7 * 64 + k] * src[7 * line] + |
1027 | | iT[9 * 64 + k] * src[9 * line] + iT[11 * 64 + k] * src[11 * line] + iT[13 * 64 + k] * src[13 * line] + iT[15 * 64 + k] * src[15 * line] + |
1028 | | iT[17 * 64 + k] * src[17 * line] + iT[19 * 64 + k] * src[19 * line] + iT[21 * 64 + k] * src[21 * line] + iT[23 * 64 + k] * src[23 * line] + |
1029 | | iT[25 * 64 + k] * src[25 * line] + iT[27 * 64 + k] * src[27 * line] + iT[29 * 64 + k] * src[29 * line] + iT[31 * 64 + k] * src[31 * line] + |
1030 | | (zo ? 0 : ( |
1031 | | iT[33 * 64 + k] * src[33 * line] + iT[35 * 64 + k] * src[35 * line] + iT[37 * 64 + k] * src[37 * line] + iT[39 * 64 + k] * src[39 * line] + |
1032 | | iT[41 * 64 + k] * src[41 * line] + iT[43 * 64 + k] * src[43 * line] + iT[45 * 64 + k] * src[45 * line] + iT[47 * 64 + k] * src[47 * line] + |
1033 | | iT[49 * 64 + k] * src[49 * line] + iT[51 * 64 + k] * src[51 * line] + iT[53 * 64 + k] * src[53 * line] + iT[55 * 64 + k] * src[55 * line] + |
1034 | | iT[57 * 64 + k] * src[57 * line] + iT[59 * 64 + k] * src[59 * line] + iT[61 * 64 + k] * src[61 * line] + iT[63 * 64 + k] * src[63 * line])); |
1035 | | } |
1036 | | for (k = 0;k<16;k++) |
1037 | | { |
1038 | | EO[k] = iT[2 * 64 + k] * src[2 * line] + iT[6 * 64 + k] * src[6 * line] + iT[10 * 64 + k] * src[10 * line] + iT[14 * 64 + k] * src[14 * line] + |
1039 | | iT[18 * 64 + k] * src[18 * line] + iT[22 * 64 + k] * src[22 * line] + iT[26 * 64 + k] * src[26 * line] + iT[30 * 64 + k] * src[30 * line] + |
1040 | | (zo ? 0 : ( |
1041 | | iT[34 * 64 + k] * src[34 * line] + iT[38 * 64 + k] * src[38 * line] + iT[42 * 64 + k] * src[42 * line] + iT[46 * 64 + k] * src[46 * line] + |
1042 | | iT[50 * 64 + k] * src[50 * line] + iT[54 * 64 + k] * src[54 * line] + iT[58 * 64 + k] * src[58 * line] + iT[62 * 64 + k] * src[62 * line])); |
1043 | | } |
1044 | | for (k = 0;k<8;k++) |
1045 | | { |
1046 | | EEO[k] = iT[4 * 64 + k] * src[4 * line] + iT[12 * 64 + k] * src[12 * line] + iT[20 * 64 + k] * src[20 * line] + iT[28 * 64 + k] * src[28 * line] + |
1047 | | (zo ? 0 : ( |
1048 | | iT[36 * 64 + k] * src[36 * line] + iT[44 * 64 + k] * src[44 * line] + iT[52 * 64 + k] * src[52 * line] + iT[60 * 64 + k] * src[60 * line])); |
1049 | | } |
1050 | | for (k = 0;k<4;k++) |
1051 | | { |
1052 | | EEEO[k] = iT[8 * 64 + k] * src[8 * line] + iT[24 * 64 + k] * src[24 * line] + (zo ? 0 : (iT[40 * 64 + k] * src[40 * line] + iT[56 * 64 + k] * src[56 * line])); |
1053 | | } |
1054 | | EEEEO[0] = iT[16 * 64 + 0] * src[16 * line] + (zo ? 0 : iT[48 * 64 + 0] * src[48 * line]); |
1055 | | EEEEO[1] = iT[16 * 64 + 1] * src[16 * line] + (zo ? 0 : iT[48 * 64 + 1] * src[48 * line]); |
1056 | | EEEEE[0] = iT[0 * 64 + 0] * src[0] + (zo ? 0 : iT[32 * 64 + 0] * src[32 * line]); |
1057 | | EEEEE[1] = iT[0 * 64 + 1] * src[0] + (zo ? 0 : iT[32 * 64 + 1] * src[32 * line]); |
1058 | | |
1059 | | /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */ |
1060 | | for (k = 0;k<2;k++) |
1061 | | { |
1062 | | EEEE[k] = EEEEE[k] + EEEEO[k]; |
1063 | | EEEE[k + 2] = EEEEE[1 - k] - EEEEO[1 - k]; |
1064 | | } |
1065 | | for (k = 0;k<4;k++) |
1066 | | { |
1067 | | EEE[k] = EEEE[k] + EEEO[k]; |
1068 | | EEE[k + 4] = EEEE[3 - k] - EEEO[3 - k]; |
1069 | | } |
1070 | | for (k = 0;k<8;k++) |
1071 | | { |
1072 | | EE[k] = EEE[k] + EEO[k]; |
1073 | | EE[k + 8] = EEE[7 - k] - EEO[7 - k]; |
1074 | | } |
1075 | | for (k = 0;k<16;k++) |
1076 | | { |
1077 | | E[k] = EE[k] + EO[k]; |
1078 | | E[k + 16] = EE[15 - k] - EO[15 - k]; |
1079 | | } |
1080 | | for (k = 0;k<32;k++) |
1081 | | { |
1082 | | #if ENABLE_SIMD_TRAFO |
1083 | | dst[k] = E[k] + O[k]; |
1084 | | dst[k + 32] = E[31 - k] - O[31 - k]; |
1085 | | #else |
1086 | | dst[k] = Clip3( outputMinimum, outputMaximum, ( E[k] + O[k] + rnd_factor ) >> shift ); |
1087 | | dst[k + 32] = Clip3( outputMinimum, outputMaximum, ( E[31 - k] - O[31 - k] + rnd_factor ) >> shift ); |
1088 | | #endif |
1089 | | } |
1090 | | src++; |
1091 | | dst += uiTrSize; |
1092 | | } |
1093 | | |
1094 | | #if ENABLE_SIMD_TRAFO |
1095 | | g_tCoeffOps.roundClip8( orgDst, 32, line - iSkipLine, 32, outputMinimum, outputMaximum, rnd_factor, shift ); |
1096 | | |
1097 | | |
1098 | | #endif |
1099 | | memset( dst, 0, uiTrSize*iSkipLine * sizeof( TCoeff ) ); |
1100 | | #endif |
1101 | 0 | } |
1102 | | |
1103 | | |
1104 | | |
1105 | | // ********************************** DST-VII ********************************** |
1106 | | void fastForwardDST7_B4(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2) |
1107 | 0 | { |
1108 | 0 | int i; |
1109 | 0 | TCoeff rnd_factor = (shift > 0) ? (1 << (shift - 1)) : 0; |
1110 | |
|
1111 | 0 | const TMatrixCoeff *iT = g_trCoreDST7P4[TRANSFORM_FORWARD][0]; |
1112 | |
|
1113 | 0 | int c[4]; |
1114 | 0 | TCoeff *pCoeff = dst; |
1115 | 0 | const int reducedLine = line - iSkipLine; |
1116 | 0 | for (i = 0; i<reducedLine; i++) |
1117 | 0 | { |
1118 | | // Intermediate Variables |
1119 | 0 | c[0] = src[0] + src[3]; |
1120 | 0 | c[1] = src[1] + src[3]; |
1121 | 0 | c[2] = src[0] - src[1]; |
1122 | 0 | c[3] = iT[2] * src[2]; |
1123 | |
|
1124 | 0 | dst[0 * line] = (iT[0] * c[0] + iT[1] * c[1] + c[3] + rnd_factor) >> shift; |
1125 | 0 | dst[1 * line] = (iT[2] * (src[0] + src[1] - src[3]) + rnd_factor) >> shift; |
1126 | 0 | dst[2 * line] = (iT[0] * c[2] + iT[1] * c[0] - c[3] + rnd_factor) >> shift; |
1127 | 0 | dst[3 * line] = (iT[1] * c[2] - iT[0] * c[1] + c[3] + rnd_factor) >> shift; |
1128 | |
|
1129 | 0 | src += 4; |
1130 | 0 | dst++; |
1131 | 0 | } |
1132 | 0 | if (iSkipLine) |
1133 | 0 | { |
1134 | 0 | dst = pCoeff + reducedLine; |
1135 | 0 | for (i = 0; i<4; i++) |
1136 | 0 | { |
1137 | 0 | memset(dst, 0, sizeof(TCoeff)*iSkipLine); |
1138 | 0 | dst += line; |
1139 | 0 | } |
1140 | 0 | } |
1141 | 0 | } |
1142 | | |
1143 | | void fastInverseDST7_B4(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum) |
1144 | 0 | { |
1145 | 0 | #if ENABLE_SIMD_TRAFO |
1146 | 0 | _fastInverseMM<4>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDST7P4[TRANSFORM_INVERSE][0] ); |
1147 | | #else |
1148 | | int i; |
1149 | | TCoeff c[4]; |
1150 | | TCoeff rnd_factor = (shift > 0) ? (1 << (shift - 1)) : 0; |
1151 | | |
1152 | | const TMatrixCoeff *iT = g_trCoreDST7P4[TRANSFORM_INVERSE][0]; |
1153 | | |
1154 | | const int reducedLine = line - iSkipLine; |
1155 | | for (i = 0; i<reducedLine; i++) |
1156 | | { |
1157 | | // Intermediate Variables |
1158 | | c[0] = src[0 * line] + src[2 * line]; |
1159 | | c[1] = src[2 * line] + src[3 * line]; |
1160 | | c[2] = src[0 * line] - src[3 * line]; |
1161 | | c[3] = iT[2] * src[1 * line]; |
1162 | | |
1163 | | dst[0] = Clip3(outputMinimum, outputMaximum, (iT[0] * c[0] + iT[1] * c[1] + c[3] + rnd_factor) >> shift); |
1164 | | dst[1] = Clip3(outputMinimum, outputMaximum, (iT[1] * c[2] - iT[0] * c[1] + c[3] + rnd_factor) >> shift); |
1165 | | dst[2] = Clip3(outputMinimum, outputMaximum, (iT[2] * (src[0 * line] - src[2 * line] + src[3 * line]) + rnd_factor) >> shift); |
1166 | | dst[3] = Clip3(outputMinimum, outputMaximum, (iT[1] * c[0] + iT[0] * c[2] - c[3] + rnd_factor) >> shift); |
1167 | | |
1168 | | dst += 4; |
1169 | | src++; |
1170 | | } |
1171 | | if (iSkipLine) |
1172 | | { |
1173 | | memset(dst, 0, (iSkipLine << 2) * sizeof(TCoeff)); |
1174 | | } |
1175 | | #endif |
1176 | 0 | } |
1177 | | |
1178 | | |
1179 | | void fastForwardDST7_B8(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2) |
1180 | 0 | { |
1181 | 0 | _fastForwardMM< 8 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDST7P8[TRANSFORM_FORWARD][0] ); |
1182 | 0 | } |
1183 | | |
1184 | | |
1185 | | void fastInverseDST7_B8(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum) |
1186 | 0 | { |
1187 | 0 | _fastInverseMM< 8 >( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDST7P8[TRANSFORM_INVERSE][0]); |
1188 | 0 | } |
1189 | | |
1190 | | |
1191 | | void fastForwardDST7_B16(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2) |
1192 | 0 | { |
1193 | | #if !JVET_M0497_MATRIX_MULT |
1194 | | int j, k; |
1195 | | TCoeff a[5], b[5], c[5], d[5], t; |
1196 | | TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0; |
1197 | | |
1198 | | const TMatrixCoeff *iT = g_trCoreDST7P16[TRANSFORM_FORWARD][0]; |
1199 | | |
1200 | | TCoeff *pCoef = dst; |
1201 | | const int reducedLine = line - iSkipLine; |
1202 | | const int cutoff = 16 - iSkipLine2; |
1203 | | |
1204 | | for (j = 0; j < reducedLine; j++) |
1205 | | { |
1206 | | for (k = 0; k < 5; k++) |
1207 | | { |
1208 | | a[k] = src[ k] + src[11 + k]; |
1209 | | b[k] = src[9 - k] + src[11 + k]; |
1210 | | c[k] = src[ k] - src[ 9 - k]; |
1211 | | d[k] = src[ k] + src[ 9 - k] - src[11 + k]; |
1212 | | } |
1213 | | |
1214 | | t = iT[10] * src[10]; |
1215 | | |
1216 | | dst[ 1 * line] = ( iT[ 2]*d[0] + iT[ 5]*d[1] + iT[ 8]*d[2] + iT[11]*d[3] + iT[14]*d[4] + add) >> shift; |
1217 | | dst[ 4 * line] = ( iT[ 8]*d[0] + iT[14]*d[1] + iT[ 5]*d[2] - iT[ 2]*d[3] - iT[11]*d[4] + add) >> shift; |
1218 | | dst[ 7 * line] = ( iT[14]*d[0] + iT[ 2]*d[1] - iT[11]*d[2] - iT[ 5]*d[3] + iT[ 8]*d[4] + add) >> shift; |
1219 | | dst[10 * line] = ( iT[11]*d[0] - iT[ 8]*d[1] - iT[ 2]*d[2] + iT[14]*d[3] - iT[ 5]*d[4] + add) >> shift; |
1220 | | dst[13 * line] = ( iT[ 5]*d[0] - iT[11]*d[1] + iT[14]*d[2] - iT[ 8]*d[3] + iT[ 2]*d[4] + add) >> shift; |
1221 | | |
1222 | | dst[5 * line] = ( iT[10] * (src[0] + src[1] - src[3] - src[4] + src[6] + src[7] - src[9] - src[10] + src[12] + src[13] - src[15]) + add) >> shift; |
1223 | | |
1224 | | dst[ 0 * line] = ( iT[0]*a[0] + iT[9]*b[0] + iT[1]*a[1] + iT[8]*b[1] + iT[2]*a[2] + iT[7]*b[2] + iT[3]*a[3] + iT[6]*b[3] + iT[4]*a[4] + iT[5]*b[4] + t + add ) >> shift; |
1225 | | dst[ 2 * line] = ( iT[4]*c[0] - iT[5]*b[0] + iT[9]*c[1] - iT[0]*b[1] + iT[6]*c[2] + iT[3]*a[2] + iT[1]*c[3] + iT[8]*a[3] + iT[7]*a[4] + iT[2]*b[4] - t + add ) >> shift; |
1226 | | dst[ 3 * line] = ( iT[6]*a[0] + iT[3]*b[0] + iT[2]*c[1] + iT[7]*a[1] + iT[9]*c[2] + iT[0]*a[2] + iT[4]*c[3] - iT[5]*b[3] - iT[1]*a[4] - iT[8]*b[4] + t + add ) >> shift; |
1227 | | dst[ 6 * line] = ( iT[8]*a[0] + iT[1]*c[0] + iT[6]*c[1] - iT[3]*b[1] - iT[5]*a[2] - iT[4]*b[2] - iT[7]*c[3] - iT[2]*a[3] - iT[0]*c[4] + iT[9]*b[4] + t + add ) >> shift; |
1228 | | dst[ 8 * line] = ( iT[4]*c[0] + iT[5]*a[0] - iT[0]*c[1] + iT[9]*b[1] - iT[3]*c[2] - iT[6]*a[2] + iT[1]*c[3] - iT[8]*b[3] + iT[2]*c[4] + iT[7]*a[4] - t + add ) >> shift; |
1229 | | dst[ 9 * line] = ( iT[7]*c[0] + iT[2]*a[0] - iT[4]*a[1] - iT[5]*b[1] - iT[8]*c[2] + iT[1]*b[2] + iT[9]*a[3] + iT[0]*b[3] + iT[3]*c[4] - iT[6]*b[4] + t + add ) >> shift; |
1230 | | dst[11 * line] = ( iT[9]*a[0] + iT[0]*b[0] - iT[8]*c[1] - iT[1]*a[1] + iT[2]*c[2] - iT[7]*b[2] + iT[6]*a[3] + iT[3]*b[3] - iT[5]*c[4] - iT[4]*a[4] - t + add ) >> shift; |
1231 | | dst[12 * line] = ( iT[7]*c[0] - iT[2]*b[0] - iT[5]*c[1] - iT[4]*a[1] + iT[8]*a[2] + iT[1]*b[2] - iT[0]*a[3] - iT[9]*b[3] - iT[6]*c[4] + iT[3]*b[4] + t + add ) >> shift; |
1232 | | dst[14 * line] = ( iT[3]*a[0] + iT[6]*b[0] - iT[7]*a[1] - iT[2]*b[1] + iT[0]*c[2] + iT[9]*a[2] - iT[4]*c[3] - iT[5]*a[3] + iT[8]*c[4] + iT[1]*a[4] - t + add ) >> shift; |
1233 | | dst[15 * line] = ( iT[1]*c[0] - iT[8]*b[0] - iT[3]*c[1] + iT[6]*b[1] + iT[5]*c[2] - iT[4]*b[2] - iT[7]*c[3] + iT[2]*b[3] + iT[9]*c[4] - iT[0]*b[4] + t + add ) >> shift; |
1234 | | |
1235 | | src += 16; |
1236 | | dst++; |
1237 | | } |
1238 | | |
1239 | | if (iSkipLine) |
1240 | | { |
1241 | | dst = pCoef + reducedLine; |
1242 | | for (j = 0; j < cutoff; j++) |
1243 | | { |
1244 | | memset(dst, 0, sizeof(TCoeff)*iSkipLine); |
1245 | | dst += line; |
1246 | | } |
1247 | | } |
1248 | | |
1249 | | if (iSkipLine2) |
1250 | | { |
1251 | | dst = pCoef + line * cutoff; |
1252 | | memset(dst, 0, sizeof(TCoeff) * line * iSkipLine2); |
1253 | | } |
1254 | | #else |
1255 | 0 | _fastForwardMM< 16 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDST7P16[TRANSFORM_FORWARD][0] ); |
1256 | 0 | #endif |
1257 | 0 | } |
1258 | | |
1259 | | |
1260 | | void fastInverseDST7_B16(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum) |
1261 | 0 | { |
1262 | | #if !JVET_M0497_MATRIX_MULT |
1263 | | int j, k; |
1264 | | TCoeff a[5], b[5], c[5], d[5], t; |
1265 | | |
1266 | | TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0; |
1267 | | |
1268 | | const TMatrixCoeff *iT = g_trCoreDST7P16[TRANSFORM_INVERSE][0]; |
1269 | | |
1270 | | const int reducedLine = line - iSkipLine; |
1271 | | |
1272 | | for (j = 0; j < reducedLine; j++) |
1273 | | { |
1274 | | for (k = 0; k < 5; k++) |
1275 | | { |
1276 | | a[k] = src[ k * line] + src[(10 - k) * line]; |
1277 | | b[k] = src[(11 + k) * line] + src[(10 - k) * line]; |
1278 | | c[k] = src[ k * line] - src[(11 + k) * line]; |
1279 | | d[k] = src[ k * line] + src[(11 + k) * line] - src[(10 - k)*line]; |
1280 | | } |
1281 | | |
1282 | | t = iT[10] * src[5 * line]; |
1283 | | |
1284 | | dst[ 2] = Clip3(outputMinimum, outputMaximum, (int)( iT[ 2]*d[0] + iT[ 8]*d[1] + iT[14]*d[2] + iT[11]*d[3] + iT[ 5]*d[4] + add ) >> shift); |
1285 | | dst[ 5] = Clip3(outputMinimum, outputMaximum, (int)( iT[ 5]*d[0] + iT[14]*d[1] + iT[ 2]*d[2] - iT[ 8]*d[3] - iT[11]*d[4] + add ) >> shift); |
1286 | | dst[ 8] = Clip3(outputMinimum, outputMaximum, (int)( iT[ 8]*d[0] + iT[ 5]*d[1] - iT[11]*d[2] - iT[ 2]*d[3] + iT[14]*d[4] + add ) >> shift); |
1287 | | dst[11] = Clip3(outputMinimum, outputMaximum, (int)( iT[11]*d[0] - iT[ 2]*d[1] - iT[ 5]*d[2] + iT[14]*d[3] - iT[ 8]*d[4] + add ) >> shift); |
1288 | | dst[14] = Clip3(outputMinimum, outputMaximum, (int)( iT[14]*d[0] - iT[11]*d[1] + iT[ 8]*d[2] - iT[ 5]*d[3] + iT[ 2]*d[4] + add ) >> shift); |
1289 | | |
1290 | | dst[10] = Clip3(outputMinimum, outputMaximum, (int)( iT[10]*(src[ 0*line]-src[ 2*line]+src[ 3*line]-src[5*line] |
1291 | | +src[ 6*line]-src[ 8*line]+src[ 9*line]-src[11*line] |
1292 | | +src[12*line]-src[14*line]+src[15*line]) + add ) >> shift); |
1293 | | |
1294 | | dst[ 0] = Clip3(outputMinimum, outputMaximum, (int)( iT[0]*a[0] + iT[9]*b[0] + iT[2]*a[1] + iT[7]*b[1] + iT[4]*a[2] + iT[5]*b[2] + iT[6]*a[3] + iT[3]*b[3] + iT[8]*a[4] + iT[1]*b[4] + t + add ) >> shift); |
1295 | | dst[ 1] = Clip3(outputMinimum, outputMaximum, (int)( iT[1]*c[0] - iT[8]*b[0] + iT[5]*c[1] - iT[4]*b[1] + iT[9]*c[2] - iT[0]*b[2] + iT[2]*a[3] + iT[7]*c[3] + iT[6]*a[4] + iT[3]*c[4] + t + add ) >> shift); |
1296 | | dst[ 3] = Clip3(outputMinimum, outputMaximum, (int)( iT[3]*a[0] + iT[6]*b[0] + iT[0]*c[1] + iT[9]*a[1] + iT[1]*a[2] + iT[8]*c[2] + iT[4]*c[3] - iT[5]*b[3] - iT[2]*a[4] - iT[7]*b[4] - t + add ) >> shift); |
1297 | | dst[ 4] = Clip3(outputMinimum, outputMaximum, (int)( iT[4]*c[0] - iT[5]*b[0] + iT[6]*c[1] + iT[3]*a[1] + iT[7]*a[2] + iT[2]*b[2] - iT[1]*c[3] + iT[8]*b[3] - iT[9]*c[4] - iT[0]*a[4] - t + add ) >> shift); |
1298 | | dst[ 6] = Clip3(outputMinimum, outputMaximum, (int)( iT[6]*a[0] + iT[3]*b[0] + iT[9]*c[1] + iT[0]*a[1] - iT[1]*a[2] - iT[8]*b[2] - iT[4]*c[3] - iT[5]*a[3] - iT[2]*c[4] + iT[7]*b[4] + t + add ) >> shift); |
1299 | | dst[ 7] = Clip3(outputMinimum, outputMaximum, (int)( iT[7]*c[0] - iT[2]*b[0] + iT[8]*a[1] + iT[1]*b[1] - iT[6]*c[2] + iT[3]*b[2] - iT[9]*a[3] - iT[0]*b[3] + iT[5]*c[4] - iT[4]*b[4] + t + add ) >> shift); |
1300 | | dst[ 9] = Clip3(outputMinimum, outputMaximum, (int)( iT[9]*a[0] + iT[0]*b[0] + iT[2]*c[1] - iT[7]*b[1] - iT[5]*c[2] - iT[4]*a[2] + iT[3]*a[3] + iT[6]*b[3] + iT[8]*c[4] - iT[1]*b[4] - t + add ) >> shift); |
1301 | | dst[12] = Clip3(outputMinimum, outputMaximum, (int)( iT[1]*c[0] + iT[8]*a[0] - iT[5]*a[1] - iT[4]*b[1] - iT[0]*c[2] + iT[9]*b[2] + iT[7]*c[3] - iT[2]*b[3] - iT[6]*c[4] - iT[3]*a[4] + t + add ) >> shift); |
1302 | | dst[13] = Clip3(outputMinimum, outputMaximum, (int)( iT[7]*c[0] + iT[2]*a[0] - iT[8]*c[1] + iT[1]*b[1] + iT[3]*c[2] - iT[6]*b[2] + iT[0]*a[3] + iT[9]*b[3] - iT[5]*a[4] - iT[4]*b[4] + t + add ) >> shift); |
1303 | | dst[15] = Clip3(outputMinimum, outputMaximum, (int)( iT[4]*c[0] + iT[5]*a[0] - iT[3]*c[1] - iT[6]*a[1] + iT[2]*c[2] + iT[7]*a[2] - iT[1]*c[3] - iT[8]*a[3] + iT[0]*c[4] + iT[9]*a[4] - t + add ) >> shift); |
1304 | | |
1305 | | src++; |
1306 | | dst += 16; |
1307 | | } |
1308 | | |
1309 | | if (iSkipLine) |
1310 | | { |
1311 | | memset(dst, 0, (iSkipLine * 16) * sizeof(TCoeff)); |
1312 | | } |
1313 | | #else |
1314 | 0 | _fastInverseMM< 16 >( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDST7P16[TRANSFORM_INVERSE][0]); |
1315 | 0 | #endif |
1316 | 0 | } |
1317 | | |
1318 | | |
1319 | | void fastForwardDST7_B32(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2) |
1320 | 0 | { |
1321 | | #if !JVET_M0497_MATRIX_MULT |
1322 | | int j, k; |
1323 | | TCoeff a[10][6]; |
1324 | | TCoeff t[2]; |
1325 | | TCoeff b[6]; |
1326 | | TCoeff c[2]; |
1327 | | |
1328 | | TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0; |
1329 | | const TMatrixCoeff *iT = g_trCoreDST7P32[TRANSFORM_FORWARD][0]; |
1330 | | TCoeff *pCoef = dst; |
1331 | | const int reducedLine = line - iSkipLine; |
1332 | | const int cutoff = 32 - iSkipLine2; |
1333 | | |
1334 | | for (j = 0; j < reducedLine; j++) |
1335 | | { |
1336 | | for (k = 0; k < 6; k++) |
1337 | | { |
1338 | | a[0][k] = src[ k] - src[11 - k]; |
1339 | | a[1][k] = src[ k] + src[13 + k]; |
1340 | | a[2][k] = src[ k] + src[24 - k]; |
1341 | | a[3][k] = src[ k] - src[26 + k]; |
1342 | | a[4][k] = src[ 6 + k] + src[18 - k]; |
1343 | | a[5][k] = src[ 6 + k] + src[19 + k]; |
1344 | | a[6][k] = src[ 6 + k] - src[31 - k]; |
1345 | | a[7][k] = src[13 + k] - src[24 - k]; |
1346 | | a[8][k] = src[13 + k] + src[26 + k]; |
1347 | | a[9][k] = src[19 + k] + src[31 - k]; |
1348 | | |
1349 | | b[k] = src[k] + src[11 - k] - src[13 + k] - src[24 - k] + src[26 + k]; |
1350 | | } |
1351 | | for (k = 0; k < 2; k++) |
1352 | | { |
1353 | | c[k] = src[k] + src[3 - k] - src[5 + k] - src[8 - k] + src[10 + k] + src[13 - k] - src[15 + k] - src[18 - k] + src[20 + k] + src[23 - k] - src[25 + k] - src[28 - k] + src[30 + k]; |
1354 | | } |
1355 | | |
1356 | | t[0] = iT[12] * src[12] + iT[25] * src[25]; |
1357 | | t[1] = iT[12] * src[25] - iT[25] * src[12]; |
1358 | | |
1359 | | dst[ 0 * line] = ( iT[0] * a[3][0] + iT[11] * a[6][5] + iT[13] * a[8][0] + iT[24] * a[9][5] + iT[1] * a[3][1] + iT[10] * a[6][4] + iT[14] * a[8][1] + iT[23] * a[9][4] + iT[2] * a[3][2] + iT[9] * a[6][3] + iT[15] * a[8][2] + iT[22] * a[9][3] + iT[3] * a[3][3] + iT[8] * a[6][2] + iT[16] * a[8][3] + iT[21] * a[9][2] + iT[4] * a[3][4] + iT[7] * a[6][1] + iT[17] * a[8][4] + iT[20] * a[9][1] + iT[5] * a[3][5] + iT[6] * a[6][0] + iT[18] * a[8][5] + iT[19] * a[9][0] + t[0] + add) >> shift; |
1360 | | dst[ 1 * line] = (-iT[0] * a[5][2] + iT[11] * a[0][3] + iT[13] * a[4][2] + iT[24] * a[6][2] + iT[1] * a[9][1] + iT[10] * a[8][4] + iT[14] * a[3][4] + iT[23] * a[6][1] + iT[2] * a[0][0] - iT[9] * a[5][5] + iT[15] * a[6][5] + iT[22] * a[4][5] - iT[3] * a[5][3] + iT[8] * a[0][2] + iT[16] * a[4][3] + iT[21] * a[6][3] + iT[4] * a[9][0] + iT[7] * a[8][5] + iT[17] * a[3][5] + iT[20] * a[6][0] + iT[5] * a[0][1] - iT[6] * a[5][4] + iT[18] * a[6][4] + iT[19] * a[4][4] - t[1] + add) >> shift; |
1361 | | dst[ 3 * line] = (-iT[0] * a[9][4] - iT[11] * a[5][4] + iT[13] * a[2][1] - iT[24] * a[7][1] - iT[1] * a[0][3] - iT[10] * a[1][3] + iT[14] * a[3][3] + iT[23] * a[2][3] + iT[2] * a[8][5] + iT[9] * a[9][0] + iT[15] * a[6][0] + iT[22] * a[3][5] - iT[3] * a[1][4] - iT[8] * a[0][4] + iT[16] * a[2][4] + iT[21] * a[3][4] - iT[4] * a[5][3] - iT[7] * a[9][3] - iT[17] * a[7][2] + iT[20] * a[2][2] + iT[5] * a[8][0] + iT[6] * a[1][0] - iT[18] * a[4][5] - iT[19] * a[7][0] + t[1] + add) >> shift; |
1362 | | dst[ 4 * line] = (-iT[0] * a[3][2] - iT[11] * a[2][2] + iT[13] * a[1][2] + iT[24] * a[0][2] + iT[1] * a[6][0] + iT[10] * a[3][5] + iT[14] * a[9][0] + iT[23] * a[8][5] - iT[2] * a[2][3] - iT[9] * a[3][3] + iT[15] * a[0][3] + iT[22] * a[1][3] - iT[3] * a[7][0] + iT[8] * a[2][0] - iT[16] * a[9][5] - iT[21] * a[5][5] + iT[4] * a[4][4] + iT[7] * a[6][4] + iT[17] * a[0][1] - iT[20] * a[5][4] - iT[5] * a[7][4] - iT[6] * a[4][1] + iT[18] * a[8][4] + iT[19] * a[1][4] - t[0] + add) >> shift; |
1363 | | dst[ 5 * line] = (-iT[0] * a[3][5] - iT[11] * a[6][0] - iT[13] * a[8][5] - iT[24] * a[9][0] + iT[1] * a[6][5] + iT[10] * a[3][0] + iT[14] * a[9][5] + iT[23] * a[8][0] - iT[2] * a[7][4] + iT[9] * a[2][4] - iT[15] * a[9][1] - iT[22] * a[5][1] - iT[3] * a[7][1] - iT[8] * a[4][4] + iT[16] * a[8][1] + iT[21] * a[1][1] + iT[4] * a[6][2] + iT[7] * a[4][2] - iT[17] * a[5][2] + iT[20] * a[0][3] - iT[5] * a[3][2] - iT[6] * a[2][2] + iT[18] * a[1][2] + iT[19] * a[0][2] + t[0] + add) >> shift; |
1364 | | dst[ 8 * line] = ( iT[0] * a[9][3] + iT[11] * a[8][2] + iT[13] * a[3][2] + iT[24] * a[6][3] + iT[1] * a[1][5] + iT[10] * a[0][5] - iT[14] * a[2][5] - iT[23] * a[3][5] - iT[2] * a[1][3] - iT[9] * a[8][3] + iT[15] * a[7][3] + iT[22] * a[4][2] - iT[3] * a[9][5] - iT[8] * a[5][5] + iT[16] * a[2][0] - iT[21] * a[7][0] - iT[4] * a[1][1] - iT[7] * a[0][1] + iT[17] * a[2][1] + iT[20] * a[3][1] + iT[5] * a[5][1] + iT[6] * a[9][1] + iT[18] * a[7][4] - iT[19] * a[2][4] + t[1] + add) >> shift; |
1365 | | dst[ 9 * line] = (-iT[0] * a[2][1] - iT[11] * a[3][1] + iT[13] * a[0][1] + iT[24] * a[1][1] + iT[1] * a[7][3] - iT[10] * a[2][3] + iT[14] * a[9][2] + iT[23] * a[5][2] + iT[2] * a[4][0] + iT[9] * a[7][5] - iT[15] * a[1][5] - iT[22] * a[8][5] + iT[3] * a[3][4] + iT[8] * a[2][4] - iT[16] * a[1][4] - iT[21] * a[0][4] + iT[4] * a[6][3] + iT[7] * a[3][2] + iT[17] * a[9][3] + iT[20] * a[8][2] + iT[5] * a[4][5] + iT[6] * a[6][5] + iT[18] * a[0][0] - iT[19] * a[5][5] - t[0] + add) >> shift; |
1366 | | dst[10 * line] = (-iT[0] * a[6][1] - iT[11] * a[4][1] + iT[13] * a[5][1] - iT[24] * a[0][4] + iT[1] * a[2][2] - iT[10] * a[7][2] - iT[14] * a[5][3] - iT[23] * a[9][3] + iT[2] * a[6][4] + iT[9] * a[4][4] - iT[15] * a[5][4] + iT[22] * a[0][1] - iT[3] * a[2][5] + iT[8] * a[7][5] + iT[16] * a[5][0] + iT[21] * a[9][0] - iT[4] * a[7][0] - iT[7] * a[4][5] + iT[17] * a[8][0] + iT[20] * a[1][0] + iT[5] * a[4][2] + iT[6] * a[7][3] - iT[18] * a[1][3] - iT[19] * a[8][3] + t[0] + add) >> shift; |
1367 | | dst[11 * line] = ( iT[0] * a[1][3] + iT[11] * a[0][3] - iT[13] * a[2][3] - iT[24] * a[3][3] + iT[1] * a[9][1] + iT[10] * a[5][1] - iT[14] * a[2][4] + iT[23] * a[7][4] + iT[2] * a[8][0] + iT[9] * a[9][5] + iT[15] * a[6][5] + iT[22] * a[3][0] - iT[3] * a[0][2] + iT[8] * a[5][3] - iT[16] * a[6][3] - iT[21] * a[4][3] - iT[4] * a[5][0] + iT[7] * a[0][5] + iT[17] * a[4][0] + iT[20] * a[6][0] - iT[5] * a[9][4] - iT[6] * a[5][4] + iT[18] * a[2][1] - iT[19] * a[7][1] - t[1] + add) >> shift; |
1368 | | dst[13 * line] = (-iT[0] * a[0][0] - iT[11] * a[1][0] + iT[13] * a[3][0] + iT[24] * a[2][0] - iT[1] * a[5][4] + iT[10] * a[0][1] + iT[14] * a[4][4] + iT[23] * a[6][4] + iT[2] * a[9][3] + iT[9] * a[5][3] - iT[15] * a[2][2] + iT[22] * a[7][2] - iT[3] * a[8][3] - iT[8] * a[9][2] - iT[16] * a[6][2] - iT[21] * a[3][3] + iT[4] * a[1][4] + iT[7] * a[8][4] - iT[17] * a[7][4] - iT[20] * a[4][1] - iT[5] * a[0][5] - iT[6] * a[1][5] + iT[18] * a[3][5] + iT[19] * a[2][5] + t[1] + add) >> shift; |
1369 | | dst[14 * line] = ( iT[0] * a[4][2] + iT[11] * a[7][3] - iT[13] * a[1][3] - iT[24] * a[8][3] + iT[1] * a[4][1] + iT[10] * a[6][1] + iT[14] * a[0][4] - iT[23] * a[5][1] - iT[2] * a[3][0] - iT[9] * a[2][0] + iT[15] * a[1][0] + iT[22] * a[0][0] - iT[3] * a[6][3] - iT[8] * a[4][3] + iT[16] * a[5][3] - iT[21] * a[0][2] - iT[4] * a[7][5] - iT[7] * a[4][0] + iT[17] * a[8][5] + iT[20] * a[1][5] + iT[5] * a[6][4] + iT[6] * a[3][1] + iT[18] * a[9][4] + iT[19] * a[8][1] - t[0] + add) >> shift; |
1370 | | dst[15 * line] = (-iT[0] * a[7][4] - iT[11] * a[4][1] + iT[13] * a[8][4] + iT[24] * a[1][4] + iT[1] * a[2][2] + iT[10] * a[3][2] - iT[14] * a[0][2] - iT[23] * a[1][2] + iT[2] * a[2][1] - iT[9] * a[7][1] - iT[15] * a[5][4] - iT[22] * a[9][4] - iT[3] * a[7][5] + iT[8] * a[2][5] - iT[16] * a[9][0] - iT[21] * a[5][0] - iT[4] * a[2][0] - iT[7] * a[3][0] + iT[17] * a[0][0] + iT[20] * a[1][0] - iT[5] * a[2][3] + iT[6] * a[7][3] + iT[18] * a[5][2] + iT[19] * a[9][2] + t[0] + add) >> shift; |
1371 | | dst[16 * line] = (-iT[0] * a[0][1] + iT[11] * a[5][4] - iT[13] * a[6][4] - iT[24] * a[4][4] + iT[1] * a[0][3] - iT[10] * a[5][2] + iT[14] * a[6][2] + iT[23] * a[4][2] - iT[2] * a[0][5] + iT[9] * a[5][0] - iT[15] * a[6][0] - iT[22] * a[4][0] - iT[3] * a[0][4] - iT[8] * a[1][4] + iT[16] * a[3][4] + iT[21] * a[2][4] + iT[4] * a[0][2] + iT[7] * a[1][2] - iT[17] * a[3][2] - iT[20] * a[2][2] - iT[5] * a[0][0] - iT[6] * a[1][0] + iT[18] * a[3][0] + iT[19] * a[2][0] - t[1] + add) >> shift; |
1372 | | dst[18 * line] = ( iT[0] * a[0][5] + iT[11] * a[1][5] - iT[13] * a[3][5] - iT[24] * a[2][5] - iT[1] * a[1][0] - iT[10] * a[0][0] + iT[14] * a[2][0] + iT[23] * a[3][0] - iT[2] * a[5][1] + iT[9] * a[0][4] + iT[15] * a[4][1] + iT[22] * a[6][1] - iT[3] * a[8][1] - iT[8] * a[1][1] + iT[16] * a[4][4] + iT[21] * a[7][1] - iT[4] * a[9][2] - iT[7] * a[5][2] + iT[17] * a[2][3] - iT[20] * a[7][3] - iT[5] * a[9][3] - iT[6] * a[8][2] - iT[18] * a[3][2] - iT[19] * a[6][3] + t[1] + add) >> shift; |
1373 | | dst[20 * line] = (-iT[0] * a[4][0] - iT[11] * a[6][0] - iT[13] * a[0][5] + iT[24] * a[5][0] + iT[1] * a[6][5] + iT[10] * a[4][5] - iT[14] * a[5][5] + iT[23] * a[0][0] - iT[2] * a[6][1] - iT[9] * a[3][4] - iT[15] * a[9][1] - iT[22] * a[8][4] + iT[3] * a[4][4] + iT[8] * a[7][1] - iT[16] * a[1][1] - iT[21] * a[8][1] - iT[4] * a[3][3] - iT[7] * a[2][3] + iT[17] * a[1][3] + iT[20] * a[0][3] + iT[5] * a[7][2] - iT[6] * a[2][2] + iT[18] * a[9][3] + iT[19] * a[5][3] + t[0] + add) >> shift; |
1374 | | dst[21 * line] = (-iT[0] * a[1][2] - iT[11] * a[8][2] + iT[13] * a[7][2] + iT[24] * a[4][3] - iT[1] * a[1][5] - iT[10] * a[8][5] + iT[14] * a[7][5] + iT[23] * a[4][0] - iT[2] * a[5][2] - iT[9] * a[9][2] - iT[15] * a[7][3] + iT[22] * a[2][3] - iT[3] * a[5][5] - iT[8] * a[9][5] - iT[16] * a[7][0] + iT[21] * a[2][0] - iT[4] * a[8][1] - iT[7] * a[9][4] - iT[17] * a[6][4] - iT[20] * a[3][1] - iT[5] * a[8][4] - iT[6] * a[9][1] - iT[18] * a[6][1] - iT[19] * a[3][4] - t[1] + add) >> shift; |
1375 | | dst[23 * line] = (-iT[0] * a[8][4] - iT[11] * a[9][1] - iT[13] * a[6][1] - iT[24] * a[3][4] + iT[1] * a[8][2] + iT[10] * a[1][2] - iT[14] * a[4][3] - iT[23] * a[7][2] + iT[2] * a[0][1] + iT[9] * a[1][1] - iT[15] * a[3][1] - iT[22] * a[2][1] - iT[3] * a[5][0] - iT[8] * a[9][0] - iT[16] * a[7][5] + iT[21] * a[2][5] + iT[4] * a[9][5] + iT[7] * a[8][0] + iT[17] * a[3][0] + iT[20] * a[6][5] - iT[5] * a[5][2] + iT[6] * a[0][3] + iT[18] * a[4][2] + iT[19] * a[6][2] + t[1] + add) >> shift; |
1376 | | dst[24 * line] = (-iT[0] * a[2][3] + iT[11] * a[7][3] + iT[13] * a[5][2] + iT[24] * a[9][2] + iT[1] * a[4][1] + iT[10] * a[7][4] - iT[14] * a[1][4] - iT[23] * a[8][4] - iT[2] * a[4][5] - iT[9] * a[7][0] + iT[15] * a[1][0] + iT[22] * a[8][0] + iT[3] * a[4][3] + iT[8] * a[6][3] + iT[16] * a[0][2] - iT[21] * a[5][3] - iT[4] * a[2][5] - iT[7] * a[3][5] + iT[17] * a[0][5] + iT[20] * a[1][5] + iT[5] * a[2][1] + iT[6] * a[3][1] - iT[18] * a[0][1] - iT[19] * a[1][1] - t[0] + add) >> shift; |
1377 | | dst[25 * line] = ( iT[0] * a[4][5] + iT[11] * a[6][5] + iT[13] * a[0][0] - iT[24] * a[5][5] + iT[1] * a[3][1] + iT[10] * a[2][1] - iT[14] * a[1][1] - iT[23] * a[0][1] - iT[2] * a[7][2] - iT[9] * a[4][3] + iT[15] * a[8][2] + iT[22] * a[1][2] - iT[3] * a[6][2] - iT[8] * a[3][3] - iT[16] * a[9][2] - iT[21] * a[8][3] - iT[4] * a[2][4] + iT[7] * a[7][4] + iT[17] * a[5][1] + iT[20] * a[9][1] + iT[5] * a[4][0] + iT[6] * a[6][0] + iT[18] * a[0][5] - iT[19] * a[5][0] + t[0] + add) >> shift; |
1378 | | dst[26 * line] = ( iT[0] * a[8][0] + iT[11] * a[1][0] - iT[13] * a[4][5] - iT[24] * a[7][0] + iT[1] * a[5][4] + iT[10] * a[9][4] + iT[14] * a[7][1] - iT[23] * a[2][1] - iT[2] * a[1][2] - iT[9] * a[0][2] + iT[15] * a[2][2] + iT[22] * a[3][2] - iT[3] * a[9][2] - iT[8] * a[8][3] - iT[16] * a[3][3] - iT[21] * a[6][2] + iT[4] * a[0][4] - iT[7] * a[5][1] + iT[17] * a[6][1] + iT[20] * a[4][1] + iT[5] * a[8][5] + iT[6] * a[1][5] - iT[18] * a[4][0] - iT[19] * a[7][5] - t[1] + add) >> shift; |
1379 | | dst[28 * line] = (-iT[0] * a[5][1] - iT[11] * a[9][1] - iT[13] * a[7][4] + iT[24] * a[2][4] + iT[1] * a[8][2] + iT[10] * a[9][3] + iT[14] * a[6][3] + iT[23] * a[3][2] - iT[2] * a[9][4] - iT[9] * a[8][1] - iT[15] * a[3][1] - iT[22] * a[6][4] + iT[3] * a[9][0] + iT[8] * a[5][0] - iT[16] * a[2][5] + iT[21] * a[7][5] - iT[4] * a[5][5] + iT[7] * a[0][0] + iT[17] * a[4][5] + iT[20] * a[6][5] + iT[5] * a[1][3] + iT[6] * a[0][3] - iT[18] * a[2][3] - iT[19] * a[3][3] + t[1] + add) >> shift; |
1380 | | dst[29 * line] = (-iT[0] * a[6][4] - iT[11] * a[3][1] - iT[13] * a[9][4] - iT[24] * a[8][1] + iT[1] * a[7][3] + iT[10] * a[4][2] - iT[14] * a[8][3] - iT[23] * a[1][3] + iT[2] * a[3][5] + iT[9] * a[2][5] - iT[15] * a[1][5] - iT[22] * a[0][5] - iT[3] * a[2][4] - iT[8] * a[3][4] + iT[16] * a[0][4] + iT[21] * a[1][4] - iT[4] * a[4][3] - iT[7] * a[7][2] + iT[17] * a[1][2] + iT[20] * a[8][2] + iT[5] * a[3][0] + iT[6] * a[6][5] + iT[18] * a[8][0] + iT[19] * a[9][5] - t[0] + add) >> shift; |
1381 | | dst[30 * line] = (-iT[0] * a[7][2] + iT[11] * a[2][2] - iT[13] * a[9][3] - iT[24] * a[5][3] - iT[1] * a[6][0] - iT[10] * a[4][0] + iT[14] * a[5][0] - iT[23] * a[0][5] - iT[2] * a[4][2] - iT[9] * a[6][2] - iT[15] * a[0][3] + iT[22] * a[5][2] + iT[3] * a[2][0] - iT[8] * a[7][0] - iT[16] * a[5][5] - iT[21] * a[9][5] + iT[4] * a[7][1] - iT[7] * a[2][1] + iT[17] * a[9][4] + iT[20] * a[5][4] + iT[5] * a[6][1] + iT[6] * a[4][1] - iT[18] * a[5][1] + iT[19] * a[0][4] + t[0] + add) >> shift; |
1382 | | dst[31 * line] = (-iT[0] * a[8][5] - iT[11] * a[1][5] + iT[13] * a[4][0] + iT[24] * a[7][5] + iT[1] * a[1][0] + iT[10] * a[8][0] - iT[14] * a[7][0] - iT[23] * a[4][5] + iT[2] * a[8][4] + iT[9] * a[1][4] - iT[15] * a[4][1] - iT[22] * a[7][4] - iT[3] * a[1][1] - iT[8] * a[8][1] + iT[16] * a[7][1] + iT[21] * a[4][4] - iT[4] * a[8][3] - iT[7] * a[1][3] + iT[17] * a[4][2] + iT[20] * a[7][3] + iT[5] * a[1][2] + iT[6] * a[8][2] - iT[18] * a[7][2] - iT[19] * a[4][3] - t[1] + add) >> shift; |
1383 | | |
1384 | | dst[ 2 * line] = (iT[ 4]*b[0] + iT[ 9]*b[1] + iT[14]*b[2] + iT[19]*b[3] + iT[24]*b[4] + iT[29]*b[5] + add) >> shift; |
1385 | | dst[ 7 * line] = (iT[14]*b[0] + iT[29]*b[1] + iT[19]*b[2] + iT[ 4]*b[3] - iT[ 9]*b[4] - iT[24]*b[5] + add) >> shift; |
1386 | | dst[12 * line] = (iT[24]*b[0] + iT[14]*b[1] - iT[ 9]*b[2] - iT[29]*b[3] - iT[ 4]*b[4] + iT[19]*b[5] + add) >> shift; |
1387 | | dst[17 * line] = (iT[29]*b[0] - iT[ 4]*b[1] - iT[24]*b[2] + iT[ 9]*b[3] + iT[19]*b[4] - iT[14]*b[5] + add) >> shift; |
1388 | | dst[22 * line] = (iT[19]*b[0] - iT[24]*b[1] + iT[ 4]*b[2] + iT[14]*b[3] - iT[29]*b[4] + iT[ 9]*b[5] + add) >> shift; |
1389 | | dst[27 * line] = (iT[ 9]*b[0] - iT[19]*b[1] + iT[29]*b[2] - iT[24]*b[3] + iT[14]*b[4] - iT[ 4]*b[5] + add) >> shift; |
1390 | | |
1391 | | dst[ 6 * line] = (iT[12]*c[0] + iT[25]*c[1] + add) >> shift; |
1392 | | dst[19 * line] = (iT[25]*c[0] - iT[12]*c[1] + add) >> shift; |
1393 | | |
1394 | | src += 32; |
1395 | | dst++; |
1396 | | } |
1397 | | |
1398 | | if (iSkipLine) |
1399 | | { |
1400 | | dst = pCoef + reducedLine; |
1401 | | for (j = 0; j < cutoff; j++) |
1402 | | { |
1403 | | memset(dst, 0, sizeof(TCoeff)*iSkipLine); |
1404 | | dst += line; |
1405 | | } |
1406 | | } |
1407 | | |
1408 | | if (iSkipLine2) |
1409 | | { |
1410 | | dst = pCoef + line * cutoff; |
1411 | | memset(dst, 0, sizeof(TCoeff) * line * iSkipLine2); |
1412 | | } |
1413 | | #else |
1414 | 0 | _fastForwardMM< 32 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDST7P32[TRANSFORM_FORWARD][0] ); |
1415 | 0 | #endif |
1416 | 0 | } |
1417 | | |
1418 | | |
// Inverse 32-point transform driven by the g_trCoreDST7P32[TRANSFORM_INVERSE] table
// (DST-VII, going by the function and table names).
//
// When JVET_M0497_MATRIX_MULT is enabled the call is forwarded unchanged to
// _fastInverseMM<32>. Otherwise the hand-unrolled path below is used:
//   - reducedLine = line - iSkipLine input columns are processed;
//   - for each column the 32 inputs src[n * line] (n = 0..31) are read and the
//     32 consecutive outputs dst[0..31] are written, then src advances by 1 and
//     dst by 32;
//   - every output is (sum + add) >> shift, clipped to [outputMinimum, outputMaximum];
//   - afterwards the next iSkipLine * 32 entries at dst are zero-filled.
// iSkipLine2 is only passed on to _fastInverseMM; the unrolled path does not read it.
1419 | | void fastInverseDST7_B32(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum) |
1420 | 0 | { |
1421 | | #if !JVET_M0497_MATRIX_MULT |
1422 | | int j, k; |
1423 | | TCoeff a[10][6]; |
1424 | | TCoeff t[2]; |
1425 | | TCoeff b[6]; |
1426 | | TCoeff c[2]; |
1427 | | |
  // Rounding offset: half of the divisor implied by the right shift; 0 when shift <= 0.
1428 | | TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0; |
1429 | | const TMatrixCoeff *iT = g_trCoreDST7P32[TRANSFORM_INVERSE][0]; |
1430 | | const int reducedLine = line - iSkipLine; |
1431 | | |
1432 | | for (j = 0; j < reducedLine; j++) |
1433 | | { |
  // Pre-addition stage: a[i][k] are pairwise sums/differences of two inputs of
  // the current column, b[k] is an alternating sum of five inputs.
1434 | | for (k = 0; k < 6; k++) |
1435 | | { |
1436 | | a[0][k] = src[ k * line] + src[(12 - k) * line]; |
1437 | | a[1][k] = src[ k * line] - src[(13 + k) * line]; |
1438 | | a[2][k] = src[ k * line] + src[(25 - k) * line]; |
1439 | | a[3][k] = src[ k * line] - src[(26 + k) * line]; |
1440 | | a[4][k] = src[( 7 + k) * line] + src[(18 - k) * line]; |
1441 | | a[5][k] = src[( 7 + k) * line] - src[(20 + k) * line]; |
1442 | | a[6][k] = src[( 7 + k) * line] + src[(31 - k) * line]; |
1443 | | a[7][k] = src[(13 + k) * line] + src[(25 - k) * line]; |
1444 | | a[8][k] = src[(13 + k) * line] - src[(26 + k) * line]; |
1445 | | a[9][k] = src[(20 + k) * line] + src[(31 - k) * line]; |
1446 | | |
1447 | | b[k] = src[k * line] - src[(12-k) * line] + src[(13+k) * line] - src[(25-k) * line] + src[(26+k) * line]; |
1448 | | } |
  // c[k]: alternating sum of 13 inputs of the current column.
1449 | | for (k = 0; k < 2; k++) |
1450 | | { |
1451 | | c[k] = src[k * line] - src[(4-k) * line] + src[(5+k) * line] - src[(9-k) * line] + src[(10+k) * line] - src[(14-k) * line] + src[(15+k)*line] - src[(19-k)*line] + src[(20+k)*line] - src[(24-k)*line] + src[(25+k)*line] - src[(29-k)*line] + src[(30+k)*line]; |
1452 | | } |
1453 | | |
  // Contribution of inputs 6 and 19 (the two inputs not covered by a[][]);
  // added or subtracted in every a[][]-based output below.
1454 | | t[0] = iT[12] * src[6*line] + iT[25] * src[19*line]; |
1455 | | t[1] = iT[25] * src[6*line] - iT[12] * src[19*line]; |
1456 | | |
  // Outputs built from iT[*] * a[*][*] products plus +/- t[0] or t[1].
  // The (int) cast binds to the parenthesised sum; the shift is applied afterwards.
1457 | | dst[ 0] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[1][0] - iT[11] * a[8][0] + iT[13] * a[7][0] + iT[24] * a[4][5] - iT[1] * a[8][5] + iT[10] * a[1][5] + iT[14] * a[4][0] + iT[23] * a[7][5] + iT[2] * a[1][1] - iT[9] * a[8][1] + iT[15] * a[7][1] + iT[22] * a[4][4] - iT[3] * a[8][4] + iT[8] * a[1][4] + iT[16] * a[4][1] + iT[21] * a[7][4] + iT[4] * a[1][2] - iT[7] * a[8][2] + iT[17] * a[7][2] + iT[20] * a[4][3] - iT[5] * a[8][3] + iT[6] * a[1][3] + iT[18] * a[4][2] + iT[19] * a[7][3] + t[0] + add) >> shift); |
1458 | | dst[ 1] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[4][2] - iT[11] * a[6][2] + iT[13] * a[0][3] + iT[24] * a[5][2] + iT[1] * a[2][0] + iT[10] * a[7][0] + iT[14] * a[5][5] - iT[23] * a[9][5] + iT[2] * a[7][2] + iT[9] * a[2][2] - iT[15] * a[9][3] + iT[22] * a[5][3] - iT[3] * a[6][0] - iT[8] * a[4][0] + iT[16] * a[5][0] + iT[21] * a[0][5] - iT[4] * a[4][1] - iT[7] * a[6][1] + iT[17] * a[0][4] + iT[20] * a[5][1] + iT[5] * a[2][1] + iT[6] * a[7][1] + iT[18] * a[5][4] - iT[19] * a[9][4] + t[1] + add) >> shift); |
1459 | | dst[ 2] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[2][4] - iT[11] * a[3][4] + iT[13] * a[0][4] + iT[24] * a[1][4] + iT[1] * a[4][3] + iT[10] * a[7][2] + iT[14] * a[1][2] - iT[23] * a[8][2] + iT[2] * a[3][0] - iT[9] * a[6][5] - iT[15] * a[8][0] + iT[22] * a[9][5] - iT[3] * a[6][4] + iT[8] * a[3][1] + iT[16] * a[9][4] - iT[21] * a[8][1] + iT[4] * a[7][3] + iT[7] * a[4][2] - iT[17] * a[8][3] + iT[20] * a[1][3] - iT[5] * a[3][5] - iT[6] * a[2][5] + iT[18] * a[1][5] + iT[19] * a[0][5] + t[1] + add) >> shift); |
1460 | | dst[ 3] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[5][4] + iT[11] * a[0][1] - iT[13] * a[4][4] - iT[24] * a[6][4] - iT[1] * a[1][3] - iT[10] * a[0][3] + iT[14] * a[2][3] + iT[23] * a[3][3] - iT[2] * a[0][4] - iT[9] * a[1][4] + iT[15] * a[3][4] + iT[22] * a[2][4] + iT[3] * a[0][0] + iT[8] * a[5][5] - iT[16] * a[6][5] - iT[21] * a[4][5] + iT[4] * a[5][0] - iT[7] * a[9][0] + iT[17] * a[7][5] + iT[20] * a[2][5] - iT[5] * a[8][2] + iT[6] * a[9][3] - iT[18] * a[6][3] + iT[19] * a[3][2] + t[0] + add) >> shift); |
1461 | | dst[ 5] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[1][5] + iT[11] * a[8][5] - iT[13] * a[7][5] - iT[24] * a[4][0] + iT[1] * a[5][1] + iT[10] * a[0][4] - iT[14] * a[4][1] - iT[23] * a[6][1] - iT[2] * a[8][3] + iT[9] * a[9][2] - iT[15] * a[6][2] + iT[22] * a[3][3] - iT[3] * a[0][2] - iT[8] * a[1][2] + iT[16] * a[3][2] + iT[21] * a[2][2] - iT[4] * a[9][4] + iT[7] * a[5][4] + iT[17] * a[2][1] + iT[20] * a[7][1] + iT[5] * a[1][0] - iT[6] * a[8][0] + iT[18] * a[7][0] + iT[19] * a[4][5] - t[0] + add) >> shift); |
1462 | | dst[ 6] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[7][5] - iT[11] * a[2][5] + iT[13] * a[9][0] - iT[24] * a[5][0] + iT[1] * a[3][4] - iT[10] * a[6][1] - iT[14] * a[8][4] + iT[23] * a[9][1] + iT[2] * a[4][2] + iT[9] * a[7][3] + iT[15] * a[1][3] - iT[22] * a[8][3] - iT[3] * a[2][2] - iT[8] * a[3][2] + iT[16] * a[0][2] + iT[21] * a[1][2] - iT[4] * a[6][4] - iT[7] * a[4][4] + iT[17] * a[5][4] + iT[20] * a[0][1] + iT[5] * a[7][0] + iT[6] * a[2][0] - iT[18] * a[9][5] + iT[19] * a[5][5] - t[1] + add) >> shift); |
1463 | | dst[ 7] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[6][3] - iT[11] * a[4][3] + iT[13] * a[5][3] + iT[24] * a[0][2] + iT[1] * a[7][1] + iT[10] * a[4][4] - iT[14] * a[8][1] + iT[23] * a[1][1] - iT[2] * a[7][5] - iT[9] * a[4][0] + iT[15] * a[8][5] - iT[22] * a[1][5] + iT[3] * a[7][3] + iT[8] * a[2][3] - iT[16] * a[9][2] + iT[21] * a[5][2] - iT[4] * a[6][5] + iT[7] * a[3][0] + iT[17] * a[9][5] - iT[20] * a[8][0] + iT[5] * a[6][1] - iT[6] * a[3][4] - iT[18] * a[9][1] + iT[19] * a[8][4] - t[1] + add) >> shift); |
1464 | | dst[ 8] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[1][1] - iT[11] * a[0][1] + iT[13] * a[2][1] + iT[24] * a[3][1] + iT[1] * a[1][3] - iT[10] * a[8][3] + iT[14] * a[7][3] + iT[23] * a[4][2] - iT[2] * a[9][1] + iT[9] * a[8][4] - iT[15] * a[3][4] + iT[22] * a[6][1] + iT[3] * a[5][5] + iT[8] * a[0][0] - iT[16] * a[4][5] - iT[21] * a[6][5] + iT[4] * a[0][5] + iT[7] * a[1][5] - iT[17] * a[3][5] - iT[20] * a[2][5] + iT[5] * a[5][3] - iT[6] * a[9][3] + iT[18] * a[7][2] + iT[19] * a[2][2] - t[0] + add) >> shift); |
1465 | | dst[10] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[8][3] - iT[11] * a[1][3] - iT[13] * a[4][2] - iT[24] * a[7][3] - iT[1] * a[8][0] + iT[10] * a[1][0] + iT[14] * a[4][5] + iT[23] * a[7][0] + iT[2] * a[5][3] + iT[9] * a[0][2] - iT[15] * a[4][3] - iT[22] * a[6][3] - iT[3] * a[5][0] - iT[8] * a[0][5] + iT[16] * a[4][0] + iT[21] * a[6][0] + iT[4] * a[1][4] + iT[7] * a[0][4] - iT[17] * a[2][4] - iT[20] * a[3][4] - iT[5] * a[1][1] - iT[6] * a[0][1] + iT[18] * a[2][1] + iT[19] * a[3][1] + t[0] + add) >> shift); |
1466 | | dst[11] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[7][0] + iT[11] * a[2][0] - iT[13] * a[9][5] + iT[24] * a[5][5] + iT[1] * a[2][5] + iT[10] * a[7][5] + iT[14] * a[5][0] - iT[23] * a[9][0] - iT[2] * a[2][1] - iT[9] * a[3][1] + iT[15] * a[0][1] + iT[22] * a[1][1] - iT[3] * a[7][4] - iT[8] * a[4][1] + iT[16] * a[8][4] - iT[21] * a[1][4] + iT[4] * a[3][2] - iT[7] * a[6][3] - iT[17] * a[8][2] + iT[20] * a[9][3] + iT[5] * a[4][2] + iT[6] * a[6][2] - iT[18] * a[0][3] - iT[19] * a[5][2] + t[1] + add) >> shift); |
1467 | | dst[13] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[9][5] - iT[11] * a[8][0] + iT[13] * a[3][0] - iT[24] * a[6][5] - iT[1] * a[8][5] + iT[10] * a[9][0] - iT[14] * a[6][0] + iT[23] * a[3][5] + iT[2] * a[5][4] - iT[9] * a[9][4] + iT[15] * a[7][1] + iT[22] * a[2][1] - iT[3] * a[1][4] + iT[8] * a[8][4] - iT[16] * a[7][4] - iT[21] * a[4][1] - iT[4] * a[0][2] - iT[7] * a[5][3] + iT[17] * a[6][3] + iT[20] * a[4][3] + iT[5] * a[0][3] + iT[6] * a[1][3] - iT[18] * a[3][3] - iT[19] * a[2][3] + t[0] + add) >> shift); |
1468 | | dst[15] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[9][1] + iT[11] * a[5][1] + iT[13] * a[2][4] + iT[24] * a[7][4] + iT[1] * a[9][3] - iT[10] * a[5][3] - iT[14] * a[2][2] - iT[23] * a[7][2] - iT[2] * a[9][5] + iT[9] * a[5][5] + iT[15] * a[2][0] + iT[22] * a[7][0] + iT[3] * a[9][4] - iT[8] * a[8][1] + iT[16] * a[3][1] - iT[21] * a[6][4] - iT[4] * a[9][2] + iT[7] * a[8][3] - iT[17] * a[3][3] + iT[20] * a[6][2] + iT[5] * a[9][0] - iT[6] * a[8][5] + iT[18] * a[3][5] - iT[19] * a[6][0] - t[0] + add) >> shift); |
1469 | | dst[16] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[4][4] + iT[11] * a[7][1] + iT[13] * a[1][1] - iT[24] * a[8][1] + iT[1] * a[6][2] - iT[10] * a[3][3] - iT[14] * a[9][2] + iT[23] * a[8][3] - iT[2] * a[6][1] - iT[9] * a[4][1] + iT[15] * a[5][1] + iT[22] * a[0][4] - iT[3] * a[4][5] - iT[8] * a[6][5] + iT[16] * a[0][0] + iT[21] * a[5][5] - iT[4] * a[6][0] + iT[7] * a[3][5] + iT[17] * a[9][0] - iT[20] * a[8][5] + iT[5] * a[6][3] + iT[6] * a[4][3] - iT[18] * a[5][3] - iT[19] * a[0][2] - t[1] + add) >> shift); |
1470 | | dst[17] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[7][2] - iT[11] * a[4][3] + iT[13] * a[8][2] - iT[24] * a[1][2] + iT[1] * a[7][1] + iT[10] * a[2][1] - iT[14] * a[9][4] + iT[23] * a[5][4] - iT[2] * a[3][5] + iT[9] * a[6][0] + iT[15] * a[8][5] - iT[22] * a[9][0] - iT[3] * a[2][3] - iT[8] * a[7][3] - iT[16] * a[5][2] + iT[21] * a[9][2] + iT[4] * a[4][5] + iT[7] * a[7][0] + iT[17] * a[1][0] - iT[20] * a[8][0] - iT[5] * a[2][4] - iT[6] * a[3][4] + iT[18] * a[0][4] + iT[19] * a[1][4] - t[1] + add) >> shift); |
1471 | | dst[18] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[9][0] + iT[11] * a[8][5] - iT[13] * a[3][5] + iT[24] * a[6][0] + iT[1] * a[5][1] - iT[10] * a[9][1] + iT[14] * a[7][4] + iT[23] * a[2][4] + iT[2] * a[0][3] + iT[9] * a[5][2] - iT[15] * a[6][2] - iT[22] * a[4][2] + iT[3] * a[1][2] + iT[8] * a[0][2] - iT[16] * a[2][2] - iT[21] * a[3][2] - iT[4] * a[8][1] + iT[7] * a[1][1] + iT[17] * a[4][4] + iT[20] * a[7][1] + iT[5] * a[9][5] - iT[6] * a[8][0] + iT[18] * a[3][0] - iT[19] * a[6][5] - t[0] + add) >> shift); |
1472 | | dst[20] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[8][2] - iT[11] * a[9][3] + iT[13] * a[6][3] - iT[24] * a[3][2] + iT[1] * a[0][1] + iT[10] * a[5][4] - iT[14] * a[6][4] - iT[23] * a[4][4] + iT[2] * a[1][5] + iT[9] * a[0][5] - iT[15] * a[2][5] - iT[22] * a[3][5] - iT[3] * a[9][2] + iT[8] * a[5][2] + iT[16] * a[2][3] + iT[21] * a[7][3] + iT[4] * a[5][5] - iT[7] * a[9][5] + iT[17] * a[7][0] + iT[20] * a[2][0] + iT[5] * a[0][4] + iT[6] * a[5][1] - iT[18] * a[6][1] - iT[19] * a[4][1] + t[0] + add) >> shift); |
1473 | | dst[21] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[2][1] - iT[11] * a[7][1] - iT[13] * a[5][4] + iT[24] * a[9][4] - iT[1] * a[6][2] - iT[10] * a[4][2] + iT[14] * a[5][2] + iT[23] * a[0][3] - iT[2] * a[2][4] - iT[9] * a[7][4] - iT[15] * a[5][1] + iT[22] * a[9][1] - iT[3] * a[6][5] - iT[8] * a[4][5] + iT[16] * a[5][5] + iT[21] * a[0][0] - iT[4] * a[4][0] - iT[7] * a[7][5] - iT[17] * a[1][5] + iT[20] * a[8][5] - iT[5] * a[7][2] - iT[6] * a[4][3] + iT[18] * a[8][2] - iT[19] * a[1][2] + t[1] + add) >> shift); |
1474 | | dst[22] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[6][1] - iT[11] * a[3][4] - iT[13] * a[9][1] + iT[24] * a[8][4] + iT[1] * a[4][3] + iT[10] * a[6][3] - iT[14] * a[0][2] - iT[23] * a[5][3] + iT[2] * a[7][0] + iT[9] * a[4][5] - iT[15] * a[8][0] + iT[22] * a[1][0] - iT[3] * a[3][1] + iT[8] * a[6][4] + iT[16] * a[8][1] - iT[21] * a[9][4] - iT[4] * a[2][3] - iT[7] * a[3][3] + iT[17] * a[0][3] + iT[20] * a[1][3] - iT[5] * a[7][5] - iT[6] * a[2][5] + iT[18] * a[9][0] - iT[19] * a[5][0] + t[1] + add) >> shift); |
1475 | | dst[23] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[0][3] - iT[11] * a[1][3] + iT[13] * a[3][3] + iT[24] * a[2][3] - iT[1] * a[8][0] + iT[10] * a[9][5] - iT[14] * a[6][5] + iT[23] * a[3][0] + iT[2] * a[8][2] - iT[9] * a[1][2] - iT[15] * a[4][3] - iT[22] * a[7][2] + iT[3] * a[0][5] + iT[8] * a[5][0] - iT[16] * a[6][0] - iT[21] * a[4][0] + iT[4] * a[8][4] - iT[7] * a[9][1] + iT[17] * a[6][1] - iT[20] * a[3][4] - iT[5] * a[5][4] - iT[6] * a[0][1] + iT[18] * a[4][4] + iT[19] * a[6][4] + t[0] + add) >> shift); |
1476 | | dst[26] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[3][0] - iT[11] * a[2][0] + iT[13] * a[1][0] + iT[24] * a[0][0] - iT[1] * a[2][5] - iT[10] * a[3][5] + iT[14] * a[0][5] + iT[23] * a[1][5] + iT[2] * a[4][4] + iT[9] * a[6][4] - iT[15] * a[0][1] - iT[22] * a[5][4] - iT[3] * a[4][1] - iT[8] * a[7][4] - iT[16] * a[1][4] + iT[21] * a[8][4] + iT[4] * a[2][2] + iT[7] * a[7][2] + iT[17] * a[5][3] - iT[20] * a[9][3] + iT[5] * a[3][3] - iT[6] * a[6][2] - iT[18] * a[8][3] + iT[19] * a[9][2] - t[1] + add) >> shift); |
1477 | | dst[27] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[3][3] + iT[11] * a[6][2] + iT[13] * a[8][3] - iT[24] * a[9][2] - iT[1] * a[2][0] - iT[10] * a[3][0] + iT[14] * a[0][0] + iT[23] * a[1][0] - iT[2] * a[6][3] + iT[9] * a[3][2] + iT[15] * a[9][3] - iT[22] * a[8][2] - iT[3] * a[4][0] - iT[8] * a[6][0] + iT[16] * a[0][5] + iT[21] * a[5][0] - iT[4] * a[7][4] - iT[7] * a[2][4] + iT[17] * a[9][1] - iT[20] * a[5][1] - iT[5] * a[4][4] - iT[6] * a[7][1] - iT[18] * a[1][1] + iT[19] * a[8][1] - t[1] + add) >> shift); |
1478 | | dst[28] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[0][4] + iT[11] * a[5][1] - iT[13] * a[6][1] - iT[24] * a[4][1] + iT[1] * a[9][3] - iT[10] * a[8][2] + iT[14] * a[3][2] - iT[23] * a[6][3] - iT[2] * a[1][0] - iT[9] * a[0][0] + iT[15] * a[2][0] + iT[22] * a[3][0] + iT[3] * a[8][1] - iT[8] * a[9][4] + iT[16] * a[6][4] - iT[21] * a[3][1] - iT[4] * a[5][2] - iT[7] * a[0][3] + iT[17] * a[4][2] + iT[20] * a[6][2] + iT[5] * a[1][5] - iT[6] * a[8][5] + iT[18] * a[7][5] + iT[19] * a[4][0] - t[0] + add) >> shift); |
1479 | | dst[30] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[5][3] - iT[11] * a[9][3] + iT[13] * a[7][2] + iT[24] * a[2][2] + iT[1] * a[0][1] + iT[10] * a[1][1] - iT[14] * a[3][1] - iT[23] * a[2][1] + iT[2] * a[9][0] - iT[9] * a[5][0] - iT[15] * a[2][5] - iT[22] * a[7][5] - iT[3] * a[5][2] + iT[8] * a[9][2] - iT[16] * a[7][3] - iT[21] * a[2][3] - iT[4] * a[0][0] - iT[7] * a[1][0] + iT[17] * a[3][0] + iT[20] * a[2][0] - iT[5] * a[9][1] + iT[6] * a[5][1] + iT[18] * a[2][4] + iT[19] * a[7][4] + t[0] + add) >> shift); |
1480 | | dst[31] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[3][5] + iT[11] * a[2][5] - iT[13] * a[1][5] - iT[24] * a[0][5] - iT[1] * a[3][4] - iT[10] * a[2][4] + iT[14] * a[1][4] + iT[23] * a[0][4] + iT[2] * a[3][3] + iT[9] * a[2][3] - iT[15] * a[1][3] - iT[22] * a[0][3] - iT[3] * a[3][2] - iT[8] * a[2][2] + iT[16] * a[1][2] + iT[21] * a[0][2] + iT[4] * a[3][1] + iT[7] * a[2][1] - iT[17] * a[1][1] - iT[20] * a[0][1] - iT[5] * a[3][0] - iT[6] * a[2][0] + iT[18] * a[1][0] + iT[19] * a[0][0] + t[1] + add) >> shift); |
1481 | | |
  // Outputs 4, 9, 14, 19, 24 and 29 depend only on b[].
1482 | | dst[ 4] = Clip3(outputMinimum, outputMaximum, (int)(iT[ 4] * b[0] + iT[14] * b[1] + iT[24] * b[2] + iT[29] * b[3] + iT[19] * b[4] + iT[ 9] * b[5] + add) >> shift); |
1483 | | dst[ 9] = Clip3(outputMinimum, outputMaximum, (int)(iT[ 9] * b[0] + iT[29] * b[1] + iT[14] * b[2] - iT[ 4] * b[3] - iT[24] * b[4] - iT[19] * b[5] + add) >> shift); |
1484 | | dst[14] = Clip3(outputMinimum, outputMaximum, (int)(iT[14] * b[0] + iT[19] * b[1] - iT[ 9] * b[2] - iT[24] * b[3] + iT[ 4] * b[4] + iT[29] * b[5] + add) >> shift); |
1485 | | dst[19] = Clip3(outputMinimum, outputMaximum, (int)(iT[19] * b[0] + iT[ 4] * b[1] - iT[29] * b[2] + iT[ 9] * b[3] + iT[14] * b[4] - iT[24] * b[5] + add) >> shift); |
1486 | | dst[24] = Clip3(outputMinimum, outputMaximum, (int)(iT[24] * b[0] - iT[ 9] * b[1] - iT[ 4] * b[2] + iT[19] * b[3] - iT[29] * b[4] + iT[14] * b[5] + add) >> shift); |
1487 | | dst[29] = Clip3(outputMinimum, outputMaximum, (int)(iT[29] * b[0] - iT[24] * b[1] + iT[19] * b[2] - iT[14] * b[3] + iT[ 9] * b[4] - iT[ 4] * b[5] + add) >> shift); |
1488 | | |
  // Outputs 12 and 25 depend only on c[].
1489 | | dst[12] = Clip3(outputMinimum, outputMaximum, (int)(iT[12]*c[0] + iT[25]*c[1] + add) >> shift); |
1490 | | dst[25] = Clip3(outputMinimum, outputMaximum, (int)(iT[25]*c[0] - iT[12]*c[1] + add) >> shift); |
1491 | | |
  // Next input column (stride 1 in src), next block of 32 outputs in dst.
1492 | | src++; |
1493 | | dst += 32; |
1494 | | } |
1495 | | |
  // dst now points just past the last block written; zero the iSkipLine skipped blocks.
1496 | | if (iSkipLine) |
1497 | | { |
1498 | | memset(dst, 0, (iSkipLine * 32) * sizeof(TCoeff)); |
1499 | | } |
1500 | | #else |
1501 | 0 | _fastInverseMM< 32 >( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDST7P32[TRANSFORM_INVERSE][0] ); |
1502 | 0 | #endif |
1503 | 0 | } |
1504 | | |
1505 | | |
1506 | | // ********************************** DCT-VIII ********************************** |
1507 | | void fastForwardDCT8_B4(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2) |
1508 | 0 | { |
1509 | 0 | int i; |
1510 | 0 | int rnd_factor = 1 << (shift - 1); |
1511 | 0 | const TMatrixCoeff *iT = g_trCoreDCT8P4[TRANSFORM_FORWARD][0]; |
1512 | |
|
1513 | 0 | int c[4]; |
1514 | 0 | TCoeff *pCoeff = dst; |
1515 | 0 | const int reducedLine = line - iSkipLine; |
1516 | 0 | for (i = 0; i<reducedLine; i++) |
1517 | 0 | { |
1518 | | // Intermediate Variables |
1519 | 0 | c[0] = src[0] + src[3]; |
1520 | 0 | c[1] = src[2] + src[0]; |
1521 | 0 | c[2] = src[3] - src[2]; |
1522 | 0 | c[3] = iT[1] * src[1]; |
1523 | |
|
1524 | 0 | dst[0 * line] = (iT[3] * c[0] + iT[2] * c[1] + c[3] + rnd_factor) >> shift; |
1525 | 0 | dst[1 * line] = (iT[1] * (src[0] - src[2] - src[3]) + rnd_factor) >> shift; |
1526 | 0 | dst[2 * line] = (iT[3] * c[2] + iT[2] * c[0] - c[3] + rnd_factor) >> shift; |
1527 | 0 | dst[3 * line] = (iT[3] * c[1] - iT[2] * c[2] - c[3] + rnd_factor) >> shift; |
1528 | |
|
1529 | 0 | src += 4; |
1530 | 0 | dst++; |
1531 | 0 | } |
1532 | 0 | if (iSkipLine) |
1533 | 0 | { |
1534 | 0 | dst = pCoeff + reducedLine; |
1535 | 0 | for (i = 0; i<4; i++) |
1536 | 0 | { |
1537 | 0 | memset(dst, 0, sizeof(TCoeff)*iSkipLine); |
1538 | 0 | dst += line; |
1539 | 0 | } |
1540 | 0 | } |
1541 | 0 | } |
1542 | | |
1543 | | |
1544 | | void fastInverseDCT8_B4(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum) |
1545 | 0 | { |
1546 | 0 | #if ENABLE_SIMD_TRAFO |
1547 | 0 | _fastInverseMM<4>( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDCT8P4[TRANSFORM_INVERSE][0] ); |
1548 | | #else |
1549 | | int i; |
1550 | | int rnd_factor = 1 << (shift - 1); |
1551 | | |
1552 | | const TMatrixCoeff *iT = g_trCoreDCT8P4[TRANSFORM_INVERSE][0]; |
1553 | | |
1554 | | int c[4]; |
1555 | | const int reducedLine = line - iSkipLine; |
1556 | | for (i = 0; i<reducedLine; i++) |
1557 | | { |
1558 | | // Intermediate Variables |
1559 | | c[0] = src[0 * line] + src[3 * line]; |
1560 | | c[1] = src[2 * line] + src[0 * line]; |
1561 | | c[2] = src[3 * line] - src[2 * line]; |
1562 | | c[3] = iT[1] * src[1 * line]; |
1563 | | |
1564 | | dst[0] = Clip3(outputMinimum, outputMaximum, (iT[3] * c[0] + iT[2] * c[1] + c[3] + rnd_factor) >> shift); |
1565 | | dst[1] = Clip3(outputMinimum, outputMaximum, (iT[1] * (src[0 * line] - src[2 * line] - src[3 * line]) + rnd_factor) >> shift); |
1566 | | dst[2] = Clip3(outputMinimum, outputMaximum, (iT[3] * c[2] + iT[2] * c[0] - c[3] + rnd_factor) >> shift); |
1567 | | dst[3] = Clip3(outputMinimum, outputMaximum, (iT[3] * c[1] - iT[2] * c[2] - c[3] + rnd_factor) >> shift); |
1568 | | |
1569 | | dst += 4; |
1570 | | src++; |
1571 | | } |
1572 | | if (iSkipLine) |
1573 | | { |
1574 | | memset(dst, 0, (iSkipLine << 2) * sizeof(TCoeff)); |
1575 | | } |
1576 | | #endif |
1577 | 0 | } |
1578 | | |
1579 | | |
1580 | | void fastForwardDCT8_B8(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2) |
1581 | 0 | { |
1582 | 0 | _fastForwardMM< 8 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT8P8[TRANSFORM_FORWARD][0] ); |
1583 | 0 | } |
1584 | | |
1585 | | |
1586 | | void fastInverseDCT8_B8(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum) |
1587 | 0 | { |
1588 | 0 | _fastInverseMM< 8 >( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDCT8P8[TRANSFORM_INVERSE][0] ); |
1589 | 0 | } |
1590 | | |
1591 | | |
// Forward DCT8 of size 16 (per the function name), with two compile-time variants:
//  - JVET_M0497_MATRIX_MULT evaluates to zero: the hand-unrolled version below.
//  - otherwise: all arguments are forwarded to _fastForwardMM< 16 > together with
//    g_trCoreDCT8P16[TRANSFORM_FORWARD][0].
// NOTE(review): in this listing only the #else branch carries hit counts, so it
// appears to be the variant that was compiled - confirm against the build flags.
//
// Parameters as used by the unrolled branch:
//   src        - input, read as (line - iSkipLine) consecutive groups of 16 samples
//   dst        - output; coefficient n of group j is stored at dst[n * line + j]
//   shift      - right shift applied to every sum; when shift > 0 the offset
//                1 << (shift - 1) is added before shifting
//   line       - distance in dst between coefficient n and coefficient n + 1
//   iSkipLine  - number of trailing groups that are not transformed; their
//                output positions are zero-filled after the loop
//   iSkipLine2 - number of highest-index coefficients zero-filled after the loop
1592 | | void fastForwardDCT8_B16(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2) |
1593 | 0 | { |
1594 | | #if !JVET_M0497_MATRIX_MULT |
1595 | | int j, k; |
1596 | | TCoeff a[5], b[5], c[5], d[5], t; |
          // Rounding offset added before the right shift (zero when shift <= 0).
1597 | | TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0; |
1598 | | |
          // NOTE(review): the coefficients come from the DST7 table although this is the
          // DCT8 entry point - cannot tell from here whether that is intentional; confirm.
1599 | | const TMatrixCoeff *iT = g_trCoreDST7P16[TRANSFORM_FORWARD][0]; |
1600 | | |
          // Keep the start of the output buffer for the zero-fill passes after the loop.
1601 | | TCoeff *pCoef = dst; |
1602 | | const int reducedLine = line - iSkipLine; |
1603 | | const int cutoff = 16 - iSkipLine2; |
1604 | | |
1605 | | for (j = 0; j < reducedLine; j++) |
1606 | | { |
          // Partial sums/differences of input samples that are shared by several outputs.
1607 | | for (k = 0; k < 5; k++) |
1608 | | { |
1609 | | a[k] = src[15 - k] + src[ 4 - k]; |
1610 | | b[k] = src[ 6 + k] + src[ 4 - k]; |
1611 | | c[k] = src[15 - k] - src[ 6 + k]; |
1612 | | d[k] = src[15 - k] + src[ 6 + k] - src[ 4 - k]; |
1613 | | } |
1614 | | |
          // src[5] is not part of a/b/c; its contribution enters the a/b/c outputs as +t or -t.
1615 | | t = iT[10] * src[5]; |
1616 | | |
          // Coefficients 1, 4, 7, 10 and 13 are built from the d[] terms only.
1617 | | dst[ 1 * line] = ( - iT[ 2]*d[0] - iT[ 5]*d[1] - iT[ 8]*d[2] - iT[11]*d[3] - iT[14]*d[4] + add) >> shift; |
1618 | | dst[ 4 * line] = ( iT[ 8]*d[0] + iT[14]*d[1] + iT[ 5]*d[2] - iT[ 2]*d[3] - iT[11]*d[4] + add) >> shift; |
1619 | | dst[ 7 * line] = ( - iT[14]*d[0] - iT[ 2]*d[1] + iT[11]*d[2] + iT[ 5]*d[3] - iT[ 8]*d[4] + add) >> shift; |
1620 | | dst[10 * line] = ( iT[11]*d[0] - iT[ 8]*d[1] - iT[ 2]*d[2] + iT[14]*d[3] - iT[ 5]*d[4] + add) >> shift; |
1621 | | dst[13 * line] = ( - iT[ 5]*d[0] + iT[11]*d[1] - iT[14]*d[2] + iT[ 8]*d[3] - iT[ 2]*d[4] + add) >> shift; |
1622 | | |
          // Coefficient 5 needs a single multiplication: iT[10] times a signed sum of inputs.
1623 | | dst[ 5 * line] = ( - iT[10] * (src[15] + src[14] - src[12] - src[11] + src[9] + src[8] - src[6] - src[5] + src[3] + src[2] - src[0]) + add) >> shift; |
1624 | | |
          // The remaining ten coefficients combine the a/b/c terms with +t or -t.
1625 | | dst[ 0 * line] = ( iT[0]*a[0] + iT[9]*b[0] + iT[1]*a[1] + iT[8]*b[1] + iT[2]*a[2] + iT[7]*b[2] + iT[3]*a[3] + iT[6]*b[3] + iT[4]*a[4] + iT[5]*b[4] + t + add ) >> shift; |
1626 | | dst[ 2 * line] = ( iT[4]*c[0] - iT[5]*b[0] + iT[9]*c[1] - iT[0]*b[1] + iT[6]*c[2] + iT[3]*a[2] + iT[1]*c[3] + iT[8]*a[3] + iT[7]*a[4] + iT[2]*b[4] - t + add ) >> shift; |
1627 | | dst[ 3 * line] = ( - iT[6]*a[0] - iT[3]*b[0] - iT[2]*c[1] - iT[7]*a[1] - iT[9]*c[2] - iT[0]*a[2] - iT[4]*c[3] + iT[5]*b[3] + iT[1]*a[4] + iT[8]*b[4] - t + add ) >> shift; |
1628 | | dst[ 6 * line] = ( iT[8]*a[0] + iT[1]*c[0] + iT[6]*c[1] - iT[3]*b[1] - iT[5]*a[2] - iT[4]*b[2] - iT[7]*c[3] - iT[2]*a[3] - iT[0]*c[4] + iT[9]*b[4] + t + add ) >> shift; |
1629 | | dst[ 8 * line] = ( iT[4]*c[0] + iT[5]*a[0] - iT[0]*c[1] + iT[9]*b[1] - iT[3]*c[2] - iT[6]*a[2] + iT[1]*c[3] - iT[8]*b[3] + iT[2]*c[4] + iT[7]*a[4] - t + add ) >> shift; |
1630 | | dst[ 9 * line] = ( - iT[7]*c[0] - iT[2]*a[0] + iT[4]*a[1] + iT[5]*b[1] + iT[8]*c[2] - iT[1]*b[2] - iT[9]*a[3] - iT[0]*b[3] - iT[3]*c[4] + iT[6]*b[4] - t + add ) >> shift; |
1631 | | dst[11 * line] = ( - iT[9]*a[0] - iT[0]*b[0] + iT[8]*c[1] + iT[1]*a[1] - iT[2]*c[2] + iT[7]*b[2] - iT[6]*a[3] - iT[3]*b[3] + iT[5]*c[4] + iT[4]*a[4] + t + add ) >> shift; |
1632 | | dst[12 * line] = ( iT[7]*c[0] - iT[2]*b[0] - iT[5]*c[1] - iT[4]*a[1] + iT[8]*a[2] + iT[1]*b[2] - iT[0]*a[3] - iT[9]*b[3] - iT[6]*c[4] + iT[3]*b[4] + t + add ) >> shift; |
1633 | | dst[14 * line] = ( iT[3]*a[0] + iT[6]*b[0] - iT[7]*a[1] - iT[2]*b[1] + iT[0]*c[2] + iT[9]*a[2] - iT[4]*c[3] - iT[5]*a[3] + iT[8]*c[4] + iT[1]*a[4] - t + add ) >> shift; |
1634 | | dst[15 * line] = ( - iT[1]*c[0] + iT[8]*b[0] + iT[3]*c[1] - iT[6]*b[1] - iT[5]*c[2] + iT[4]*b[2] + iT[7]*c[3] - iT[2]*b[3] - iT[9]*c[4] + iT[0]*b[4] - t + add ) >> shift; |
1635 | | |
          // Advance to the next group of 16 inputs and to the next output position.
1636 | | src += 16; |
1637 | | dst++; |
1638 | | } |
1639 | | |
          // Zero the last iSkipLine entries in each of the first `cutoff` coefficient rows.
1640 | | if (iSkipLine) |
1641 | | { |
1642 | | dst = pCoef + reducedLine; |
1643 | | for (j = 0; j < cutoff; j++) |
1644 | | { |
1645 | | memset(dst, 0, sizeof(TCoeff)*iSkipLine); |
1646 | | dst += line; |
1647 | | } |
1648 | | } |
1649 | | |
          // Zero the last iSkipLine2 coefficient rows completely; this overwrites the
          // values the loop above stored for those rows.
1650 | | if (iSkipLine2) |
1651 | | { |
1652 | | dst = pCoef + line * cutoff; |
1653 | | memset(dst, 0, sizeof(TCoeff) * line * iSkipLine2); |
1654 | | } |
1655 | | #else |
1656 | 0 | _fastForwardMM< 16 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT8P16[TRANSFORM_FORWARD][0] ); |
1657 | 0 | #endif |
1658 | 0 | } |
1659 | | |
1660 | | |
// Inverse DCT8 of size 16 (per the function name), with two compile-time variants:
//  - JVET_M0497_MATRIX_MULT evaluates to zero: the hand-unrolled version below.
//  - otherwise: all arguments are forwarded to _fastInverseMM< 16 > together with
//    g_trCoreDCT8P16[TRANSFORM_INVERSE][0].
// NOTE(review): in this listing only the #else branch carries hit counts, so it
// appears to be the variant that was compiled - confirm against the build flags.
//
// Parameters as used by the unrolled branch:
//   src           - input; coefficient n of column j is read from src[n * line + j]
//   dst           - output; 16 consecutive samples are written per processed column
//   shift         - right shift applied to every sum; when shift > 0 the offset
//                   1 << (shift - 1) is added before shifting
//   line          - distance in src between coefficient n and coefficient n + 1
//   iSkipLine     - number of trailing columns that are not processed; iSkipLine * 16
//                   outputs following the last written one are zero-filled
//   iSkipLine2    - not referenced by the unrolled branch (only forwarded in the #else branch)
//   outputMinimum,
//   outputMaximum - passed to Clip3 together with every shifted result
//                   (presumably a clamp to that range - Clip3 is not visible here)
1661 | | void fastInverseDCT8_B16(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum) |
1662 | 0 | { |
1663 | | #if !JVET_M0497_MATRIX_MULT |
1664 | | int j, k; |
1665 | | TCoeff a[5], b[5], c[5], d[5], t; |
1666 | | |
          // Rounding offset added before the right shift (zero when shift <= 0).
1667 | | TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0; |
1668 | | |
          // NOTE(review): the coefficients come from the DST7 table although this is the
          // DCT8 entry point - cannot tell from here whether that is intentional; confirm.
1669 | | const TMatrixCoeff *iT = g_trCoreDST7P16[TRANSFORM_INVERSE][0]; |
1670 | | |
1671 | | const int reducedLine = line - iSkipLine; |
1672 | | |
1673 | | for (j = 0; j < reducedLine; j++) |
1674 | | { |
          // Partial sums/differences of the strided inputs that are shared by several outputs.
1675 | | for (k = 0; k < 5; k++) |
1676 | | { |
1677 | | a[k] = src[(15 - k ) * line] + src[( 4 - k) * line]; |
1678 | | b[k] = src[( 6 + k ) * line] + src[( 4 - k) * line]; |
1679 | | c[k] = src[(15 - k ) * line] - src[( 6 + k) * line]; |
1680 | | d[k] = src[(15 - k ) * line] + src[( 6 + k) * line] - src[(4 - k) * line]; |
1681 | | } |
1682 | | |
          // Coefficient 5 is not part of a/b/c; its contribution enters the a/b/c outputs as +t or -t.
1683 | | t = iT[10] * src[5*line]; |
1684 | | |
          // In every statement below the (int) cast applies to the rounded sum, the shift follows it.
          // Samples 1, 4, 7, 10 and 13 are built from the d[] terms only.
1685 | | dst[ 1] = Clip3(outputMinimum, outputMaximum, (int)( - iT[ 2]*d[0] - iT[ 5]*d[1] - iT[ 8]*d[2] - iT[11]*d[3] - iT[14]*d[4] + add) >> shift); |
1686 | | dst[ 4] = Clip3(outputMinimum, outputMaximum, (int)( iT[ 8]*d[0] + iT[14]*d[1] + iT[ 5]*d[2] - iT[ 2]*d[3] - iT[11]*d[4] + add) >> shift); |
1687 | | dst[ 7] = Clip3(outputMinimum, outputMaximum, (int)( - iT[14]*d[0] - iT[ 2]*d[1] + iT[11]*d[2] + iT[ 5]*d[3] - iT[ 8]*d[4] + add) >> shift); |
1688 | | dst[10] = Clip3(outputMinimum, outputMaximum, (int)( iT[11]*d[0] - iT[ 8]*d[1] - iT[ 2]*d[2] + iT[14]*d[3] - iT[ 5]*d[4] + add) >> shift); |
1689 | | dst[13] = Clip3(outputMinimum, outputMaximum, (int)( - iT[ 5]*d[0] + iT[11]*d[1] - iT[14]*d[2] + iT[ 8]*d[3] - iT[ 2]*d[4] + add) >> shift); |
1690 | | |
          // Sample 5 needs a single multiplication: iT[10] times a signed sum of inputs.
1691 | | dst[ 5] = Clip3(outputMinimum, outputMaximum, (int)( - iT[10] * (src[15 * line] + src[14 * line] - src[12 * line] - src[11 * line] + src[9 * line] + src[8 * line] - src[6 * line] - src[5 * line] + src[3 * line] + src[2 * line] - src[0 * line]) + add) >> shift); |
1692 | | |
          // The remaining ten samples combine the a/b/c terms with +t or -t.
1693 | | dst[ 0] = Clip3(outputMinimum, outputMaximum, (int)( iT[0]*a[0] + iT[9]*b[0] + iT[1]*a[1] + iT[8]*b[1] + iT[2]*a[2] + iT[7]*b[2] + iT[3]*a[3] + iT[6]*b[3] + iT[4]*a[4] + iT[5]*b[4] + t + add ) >> shift ); |
1694 | | dst[ 2] = Clip3(outputMinimum, outputMaximum, (int)( iT[4]*c[0] - iT[5]*b[0] + iT[9]*c[1] - iT[0]*b[1] + iT[6]*c[2] + iT[3]*a[2] + iT[1]*c[3] + iT[8]*a[3] + iT[7]*a[4] + iT[2]*b[4] - t + add ) >> shift ); |
1695 | | dst[ 3] = Clip3(outputMinimum, outputMaximum, (int)( - iT[6]*a[0] - iT[3]*b[0] - iT[2]*c[1] - iT[7]*a[1] - iT[9]*c[2] - iT[0]*a[2] - iT[4]*c[3] + iT[5]*b[3] + iT[1]*a[4] + iT[8]*b[4] - t + add ) >> shift ); |
1696 | | dst[ 6] = Clip3(outputMinimum, outputMaximum, (int)( iT[8]*a[0] + iT[1]*c[0] + iT[6]*c[1] - iT[3]*b[1] - iT[5]*a[2] - iT[4]*b[2] - iT[7]*c[3] - iT[2]*a[3] - iT[0]*c[4] + iT[9]*b[4] + t + add ) >> shift ); |
1697 | | dst[ 8] = Clip3(outputMinimum, outputMaximum, (int)( iT[4]*c[0] + iT[5]*a[0] - iT[0]*c[1] + iT[9]*b[1] - iT[3]*c[2] - iT[6]*a[2] + iT[1]*c[3] - iT[8]*b[3] + iT[2]*c[4] + iT[7]*a[4] - t + add ) >> shift ); |
1698 | | dst[ 9] = Clip3(outputMinimum, outputMaximum, (int)( - iT[7]*c[0] - iT[2]*a[0] + iT[4]*a[1] + iT[5]*b[1] + iT[8]*c[2] - iT[1]*b[2] - iT[9]*a[3] - iT[0]*b[3] - iT[3]*c[4] + iT[6]*b[4] - t + add ) >> shift ); |
1699 | | dst[11] = Clip3(outputMinimum, outputMaximum, (int)( - iT[9]*a[0] - iT[0]*b[0] + iT[8]*c[1] + iT[1]*a[1] - iT[2]*c[2] + iT[7]*b[2] - iT[6]*a[3] - iT[3]*b[3] + iT[5]*c[4] + iT[4]*a[4] + t + add ) >> shift ); |
1700 | | dst[12] = Clip3(outputMinimum, outputMaximum, (int)( iT[7]*c[0] - iT[2]*b[0] - iT[5]*c[1] - iT[4]*a[1] + iT[8]*a[2] + iT[1]*b[2] - iT[0]*a[3] - iT[9]*b[3] - iT[6]*c[4] + iT[3]*b[4] + t + add ) >> shift ); |
1701 | | dst[14] = Clip3(outputMinimum, outputMaximum, (int)( iT[3]*a[0] + iT[6]*b[0] - iT[7]*a[1] - iT[2]*b[1] + iT[0]*c[2] + iT[9]*a[2] - iT[4]*c[3] - iT[5]*a[3] + iT[8]*c[4] + iT[1]*a[4] - t + add ) >> shift ); |
1702 | | dst[15] = Clip3(outputMinimum, outputMaximum, (int)( - iT[1]*c[0] + iT[8]*b[0] + iT[3]*c[1] - iT[6]*b[1] - iT[5]*c[2] + iT[4]*b[2] + iT[7]*c[3] - iT[2]*b[3] - iT[9]*c[4] + iT[0]*b[4] - t + add ) >> shift ); |
1703 | | |
          // Advance to the next input column and to the next block of 16 outputs.
1704 | | src++; |
1705 | | dst += 16; |
1706 | | } |
1707 | | |
          // dst now points just past the last written sample; zero the outputs of the skipped columns.
1708 | | if (iSkipLine) |
1709 | | { |
1710 | | memset(dst, 0, (iSkipLine * 16) * sizeof(TCoeff)); |
1711 | | } |
1712 | | #else |
1713 | 0 | _fastInverseMM< 16 >( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDCT8P16[TRANSFORM_INVERSE][0] ); |
1714 | 0 | #endif |
1715 | 0 | } |
1716 | | |
1717 | | |
// Forward DCT8 of size 32 (per the function name), with two compile-time variants:
//  - JVET_M0497_MATRIX_MULT evaluates to zero: the hand-unrolled version below.
//  - otherwise: all arguments are forwarded to _fastForwardMM< 32 > together with
//    g_trCoreDCT8P32[TRANSFORM_FORWARD][0].
// NOTE(review): in this listing only the #else branch carries hit counts, so it
// appears to be the variant that was compiled - confirm against the build flags.
//
// Parameters as used by the unrolled branch:
//   src        - input, read as (line - iSkipLine) consecutive groups of 32 samples
//   dst        - output; coefficient n of group j is stored at dst[n * line + j]
//   shift      - right shift applied to every sum; when shift > 0 the offset
//                1 << (shift - 1) is added before shifting
//   line       - distance in dst between coefficient n and coefficient n + 1
//   iSkipLine  - number of trailing groups that are not transformed; their
//                output positions are zero-filled after the loop
//   iSkipLine2 - number of highest-index coefficients zero-filled after the loop
1718 | | void fastForwardDCT8_B32(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2) |
1719 | 0 | { |
1720 | | #if !JVET_M0497_MATRIX_MULT |
1721 | | int j, k; |
          // a[][]: ten families of pairwise input sums/differences, six entries each.
          // t[]  : contribution of src[19] and src[6], added or subtracted in the a[][] outputs.
          // b[]  : five-term signed sums used by coefficients 2, 7, 12, 17, 22 and 27.
          // c[]  : thirteen-term signed sums used by coefficients 6 and 19.
1722 | | TCoeff a[10][6]; |
1723 | | TCoeff t[2]; |
1724 | | TCoeff b[6]; |
1725 | | TCoeff c[2]; |
1726 | | |
          // Rounding offset added before the right shift (zero when shift <= 0).
1727 | | TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0; |
          // NOTE(review): the coefficients come from the DST7 table although this is the
          // DCT8 entry point - cannot tell from here whether that is intentional; confirm.
1728 | | const TMatrixCoeff *iT = g_trCoreDST7P32[TRANSFORM_FORWARD][0]; |
          // Keep the start of the output buffer for the zero-fill passes after the loop.
1729 | | TCoeff *pCoef = dst; |
1730 | | const int reducedLine = line - iSkipLine; |
1731 | | const int cutoff = 32 - iSkipLine2; |
1732 | | |
1733 | | for (j = 0; j < reducedLine; j++) |
1734 | | { |
1735 | | for (k = 0; k < 6; k++) |
1736 | | { |
1737 | | a[0][k] = src[31-k] - src[20+k]; |
1738 | | a[1][k] = src[31-k] + src[18-k]; |
1739 | | a[2][k] = src[31-k] + src[ 7+k]; |
1740 | | a[3][k] = src[31-k] - src[ 5-k]; |
1741 | | a[4][k] = src[25-k] + src[13+k]; |
1742 | | a[5][k] = src[25-k] + src[12-k]; |
1743 | | a[6][k] = src[25-k] - src[ k]; |
1744 | | a[7][k] = src[18-k] - src[ 7+k]; |
1745 | | a[8][k] = src[18-k] + src[ 5-k]; |
1746 | | a[9][k] = src[12-k] + src[ k]; |
1747 | | |
1748 | | b[k] = src[31-k] + src[20+k] - src[18-k] - src[7+k] + src[5-k]; |
1749 | | } |
1750 | | |
1751 | | for (k = 0; k < 2; k++) |
1752 | | { |
1753 | | c[k] = src[31-k] + src[28+k] - src[26-k] - src[23+k] + src[21-k] + src[18+k] - src[16-k] - src[13+k] + src[11-k] + src[8+k] - src[6-k] - src[3+k] + src[1-k]; |
1754 | | } |
1755 | | |
1756 | | t[0] = iT[12] * src[19] + iT[25] * src[6]; |
1757 | | t[1] = iT[12] * src[6] - iT[25] * src[19]; |
1758 | | |
          // 24 coefficients built from the a[][] terms plus or minus t[0] / t[1].
1759 | | dst[ 0 * line] = ( iT[0] * a[3][0] + iT[11] * a[6][5] + iT[13] * a[8][0] + iT[24] * a[9][5] + iT[1] * a[3][1] + iT[10] * a[6][4] + iT[14] * a[8][1] + iT[23] * a[9][4] + iT[2] * a[3][2] + iT[9] * a[6][3] + iT[15] * a[8][2] + iT[22] * a[9][3] + iT[3] * a[3][3] + iT[8] * a[6][2] + iT[16] * a[8][3] + iT[21] * a[9][2] + iT[4] * a[3][4] + iT[7] * a[6][1] + iT[17] * a[8][4] + iT[20] * a[9][1] + iT[5] * a[3][5] + iT[6] * a[6][0] + iT[18] * a[8][5] + iT[19] * a[9][0] + t[0] + add) >> shift; |
1760 | | dst[ 1 * line] = ( iT[0] * a[5][2] - iT[11] * a[0][3] - iT[13] * a[4][2] - iT[24] * a[6][2] - iT[1] * a[9][1] - iT[10] * a[8][4] - iT[14] * a[3][4] - iT[23] * a[6][1] - iT[2] * a[0][0] + iT[9] * a[5][5] - iT[15] * a[6][5] - iT[22] * a[4][5] + iT[3] * a[5][3] - iT[8] * a[0][2] - iT[16] * a[4][3] - iT[21] * a[6][3] - iT[4] * a[9][0] - iT[7] * a[8][5] - iT[17] * a[3][5] - iT[20] * a[6][0] - iT[5] * a[0][1] + iT[6] * a[5][4] - iT[18] * a[6][4] - iT[19] * a[4][4] + t[1] + add) >> shift; |
1761 | | dst[ 3 * line] = ( iT[0] * a[9][4] + iT[11] * a[5][4] - iT[13] * a[2][1] + iT[24] * a[7][1] + iT[1] * a[0][3] + iT[10] * a[1][3] - iT[14] * a[3][3] - iT[23] * a[2][3] - iT[2] * a[8][5] - iT[9] * a[9][0] - iT[15] * a[6][0] - iT[22] * a[3][5] + iT[3] * a[1][4] + iT[8] * a[0][4] - iT[16] * a[2][4] - iT[21] * a[3][4] + iT[4] * a[5][3] + iT[7] * a[9][3] + iT[17] * a[7][2] - iT[20] * a[2][2] - iT[5] * a[8][0] - iT[6] * a[1][0] + iT[18] * a[4][5] + iT[19] * a[7][0] - t[1] + add) >> shift; |
1762 | | dst[ 4 * line] = ( - iT[0] * a[3][2] - iT[11] * a[2][2] + iT[13] * a[1][2] + iT[24] * a[0][2] + iT[1] * a[6][0] + iT[10] * a[3][5] + iT[14] * a[9][0] + iT[23] * a[8][5] - iT[2] * a[2][3] - iT[9] * a[3][3] + iT[15] * a[0][3] + iT[22] * a[1][3] - iT[3] * a[7][0] + iT[8] * a[2][0] - iT[16] * a[9][5] - iT[21] * a[5][5] + iT[4] * a[4][4] + iT[7] * a[6][4] + iT[17] * a[0][1] - iT[20] * a[5][4] - iT[5] * a[7][4] - iT[6] * a[4][1] + iT[18] * a[8][4] + iT[19] * a[1][4] - t[0] + add) >> shift; |
1763 | | dst[ 5 * line] = ( iT[0] * a[3][5] + iT[11] * a[6][0] + iT[13] * a[8][5] + iT[24] * a[9][0] - iT[1] * a[6][5] - iT[10] * a[3][0] - iT[14] * a[9][5] - iT[23] * a[8][0] + iT[2] * a[7][4] - iT[9] * a[2][4] + iT[15] * a[9][1] + iT[22] * a[5][1] + iT[3] * a[7][1] + iT[8] * a[4][4] - iT[16] * a[8][1] - iT[21] * a[1][1] - iT[4] * a[6][2] - iT[7] * a[4][2] + iT[17] * a[5][2] - iT[20] * a[0][3] + iT[5] * a[3][2] + iT[6] * a[2][2] - iT[18] * a[1][2] - iT[19] * a[0][2] - t[0] + add) >> shift; |
1764 | | dst[ 8 * line] = ( iT[0] * a[9][3] + iT[11] * a[8][2] + iT[13] * a[3][2] + iT[24] * a[6][3] + iT[1] * a[1][5] + iT[10] * a[0][5] - iT[14] * a[2][5] - iT[23] * a[3][5] - iT[2] * a[1][3] - iT[9] * a[8][3] + iT[15] * a[7][3] + iT[22] * a[4][2] - iT[3] * a[9][5] - iT[8] * a[5][5] + iT[16] * a[2][0] - iT[21] * a[7][0] - iT[4] * a[1][1] - iT[7] * a[0][1] + iT[17] * a[2][1] + iT[20] * a[3][1] + iT[5] * a[5][1] + iT[6] * a[9][1] + iT[18] * a[7][4] - iT[19] * a[2][4] + t[1] + add) >> shift; |
1765 | | dst[ 9 * line] = ( iT[0] * a[2][1] + iT[11] * a[3][1] - iT[13] * a[0][1] - iT[24] * a[1][1] - iT[1] * a[7][3] + iT[10] * a[2][3] - iT[14] * a[9][2] - iT[23] * a[5][2] - iT[2] * a[4][0] - iT[9] * a[7][5] + iT[15] * a[1][5] + iT[22] * a[8][5] - iT[3] * a[3][4] - iT[8] * a[2][4] + iT[16] * a[1][4] + iT[21] * a[0][4] - iT[4] * a[6][3] - iT[7] * a[3][2] - iT[17] * a[9][3] - iT[20] * a[8][2] - iT[5] * a[4][5] - iT[6] * a[6][5] - iT[18] * a[0][0] + iT[19] * a[5][5] + t[0] + add) >> shift; |
1766 | | dst[10 * line] = ( - iT[0] * a[6][1] - iT[11] * a[4][1] + iT[13] * a[5][1] - iT[24] * a[0][4] + iT[1] * a[2][2] - iT[10] * a[7][2] - iT[14] * a[5][3] - iT[23] * a[9][3] + iT[2] * a[6][4] + iT[9] * a[4][4] - iT[15] * a[5][4] + iT[22] * a[0][1] - iT[3] * a[2][5] + iT[8] * a[7][5] + iT[16] * a[5][0] + iT[21] * a[9][0] - iT[4] * a[7][0] - iT[7] * a[4][5] + iT[17] * a[8][0] + iT[20] * a[1][0] + iT[5] * a[4][2] + iT[6] * a[7][3] - iT[18] * a[1][3] - iT[19] * a[8][3] + t[0] + add) >> shift; |
1767 | | dst[11 * line] = ( - iT[0] * a[1][3] - iT[11] * a[0][3] + iT[13] * a[2][3] + iT[24] * a[3][3] - iT[1] * a[9][1] - iT[10] * a[5][1] + iT[14] * a[2][4] - iT[23] * a[7][4] - iT[2] * a[8][0] - iT[9] * a[9][5] - iT[15] * a[6][5] - iT[22] * a[3][0] + iT[3] * a[0][2] - iT[8] * a[5][3] + iT[16] * a[6][3] + iT[21] * a[4][3] + iT[4] * a[5][0] - iT[7] * a[0][5] - iT[17] * a[4][0] - iT[20] * a[6][0] + iT[5] * a[9][4] + iT[6] * a[5][4] - iT[18] * a[2][1] + iT[19] * a[7][1] + t[1] + add) >> shift; |
1768 | | dst[13 * line] = ( iT[0] * a[0][0] + iT[11] * a[1][0] - iT[13] * a[3][0] - iT[24] * a[2][0] + iT[1] * a[5][4] - iT[10] * a[0][1] - iT[14] * a[4][4] - iT[23] * a[6][4] - iT[2] * a[9][3] - iT[9] * a[5][3] + iT[15] * a[2][2] - iT[22] * a[7][2] + iT[3] * a[8][3] + iT[8] * a[9][2] + iT[16] * a[6][2] + iT[21] * a[3][3] - iT[4] * a[1][4] - iT[7] * a[8][4] + iT[17] * a[7][4] + iT[20] * a[4][1] + iT[5] * a[0][5] + iT[6] * a[1][5] - iT[18] * a[3][5] - iT[19] * a[2][5] - t[1] + add) >> shift; |
1769 | | dst[14 * line] = ( iT[0] * a[4][2] + iT[11] * a[7][3] - iT[13] * a[1][3] - iT[24] * a[8][3] + iT[1] * a[4][1] + iT[10] * a[6][1] + iT[14] * a[0][4] - iT[23] * a[5][1] - iT[2] * a[3][0] - iT[9] * a[2][0] + iT[15] * a[1][0] + iT[22] * a[0][0] - iT[3] * a[6][3] - iT[8] * a[4][3] + iT[16] * a[5][3] - iT[21] * a[0][2] - iT[4] * a[7][5] - iT[7] * a[4][0] + iT[17] * a[8][5] + iT[20] * a[1][5] + iT[5] * a[6][4] + iT[6] * a[3][1] + iT[18] * a[9][4] + iT[19] * a[8][1] - t[0] + add) >> shift; |
1770 | | dst[15 * line] = ( iT[0] * a[7][4] + iT[11] * a[4][1] - iT[13] * a[8][4] - iT[24] * a[1][4] - iT[1] * a[2][2] - iT[10] * a[3][2] + iT[14] * a[0][2] + iT[23] * a[1][2] - iT[2] * a[2][1] + iT[9] * a[7][1] + iT[15] * a[5][4] + iT[22] * a[9][4] + iT[3] * a[7][5] - iT[8] * a[2][5] + iT[16] * a[9][0] + iT[21] * a[5][0] + iT[4] * a[2][0] + iT[7] * a[3][0] - iT[17] * a[0][0] - iT[20] * a[1][0] + iT[5] * a[2][3] - iT[6] * a[7][3] - iT[18] * a[5][2] - iT[19] * a[9][2] - t[0] + add) >> shift; |
1771 | | dst[16 * line] = ( - iT[0] * a[0][1] + iT[11] * a[5][4] - iT[13] * a[6][4] - iT[24] * a[4][4] + iT[1] * a[0][3] - iT[10] * a[5][2] + iT[14] * a[6][2] + iT[23] * a[4][2] - iT[2] * a[0][5] + iT[9] * a[5][0] - iT[15] * a[6][0] - iT[22] * a[4][0] - iT[3] * a[0][4] - iT[8] * a[1][4] + iT[16] * a[3][4] + iT[21] * a[2][4] + iT[4] * a[0][2] + iT[7] * a[1][2] - iT[17] * a[3][2] - iT[20] * a[2][2] - iT[5] * a[0][0] - iT[6] * a[1][0] + iT[18] * a[3][0] + iT[19] * a[2][0] - t[1] + add) >> shift; |
1772 | | dst[18 * line] = ( iT[0] * a[0][5] + iT[11] * a[1][5] - iT[13] * a[3][5] - iT[24] * a[2][5] - iT[1] * a[1][0] - iT[10] * a[0][0] + iT[14] * a[2][0] + iT[23] * a[3][0] - iT[2] * a[5][1] + iT[9] * a[0][4] + iT[15] * a[4][1] + iT[22] * a[6][1] - iT[3] * a[8][1] - iT[8] * a[1][1] + iT[16] * a[4][4] + iT[21] * a[7][1] - iT[4] * a[9][2] - iT[7] * a[5][2] + iT[17] * a[2][3] - iT[20] * a[7][3] - iT[5] * a[9][3] - iT[6] * a[8][2] - iT[18] * a[3][2] - iT[19] * a[6][3] + t[1] + add) >> shift; |
1773 | | dst[20 * line] = ( - iT[0] * a[4][0] - iT[11] * a[6][0] - iT[13] * a[0][5] + iT[24] * a[5][0] + iT[1] * a[6][5] + iT[10] * a[4][5] - iT[14] * a[5][5] + iT[23] * a[0][0] - iT[2] * a[6][1] - iT[9] * a[3][4] - iT[15] * a[9][1] - iT[22] * a[8][4] + iT[3] * a[4][4] + iT[8] * a[7][1] - iT[16] * a[1][1] - iT[21] * a[8][1] - iT[4] * a[3][3] - iT[7] * a[2][3] + iT[17] * a[1][3] + iT[20] * a[0][3] + iT[5] * a[7][2] - iT[6] * a[2][2] + iT[18] * a[9][3] + iT[19] * a[5][3] + t[0] + add) >> shift; |
1774 | | dst[21 * line] = ( iT[0] * a[1][2] + iT[11] * a[8][2] - iT[13] * a[7][2] - iT[24] * a[4][3] + iT[1] * a[1][5] + iT[10] * a[8][5] - iT[14] * a[7][5] - iT[23] * a[4][0] + iT[2] * a[5][2] + iT[9] * a[9][2] + iT[15] * a[7][3] - iT[22] * a[2][3] + iT[3] * a[5][5] + iT[8] * a[9][5] + iT[16] * a[7][0] - iT[21] * a[2][0] + iT[4] * a[8][1] + iT[7] * a[9][4] + iT[17] * a[6][4] + iT[20] * a[3][1] + iT[5] * a[8][4] + iT[6] * a[9][1] + iT[18] * a[6][1] + iT[19] * a[3][4] + t[1] + add) >> shift; |
1775 | | dst[23 * line] = ( iT[0] * a[8][4] + iT[11] * a[9][1] + iT[13] * a[6][1] + iT[24] * a[3][4] - iT[1] * a[8][2] - iT[10] * a[1][2] + iT[14] * a[4][3] + iT[23] * a[7][2] - iT[2] * a[0][1] - iT[9] * a[1][1] + iT[15] * a[3][1] + iT[22] * a[2][1] + iT[3] * a[5][0] + iT[8] * a[9][0] + iT[16] * a[7][5] - iT[21] * a[2][5] - iT[4] * a[9][5] - iT[7] * a[8][0] - iT[17] * a[3][0] - iT[20] * a[6][5] + iT[5] * a[5][2] - iT[6] * a[0][3] - iT[18] * a[4][2] - iT[19] * a[6][2] - t[1] + add) >> shift; |
1776 | | dst[24 * line] = ( - iT[0] * a[2][3] + iT[11] * a[7][3] + iT[13] * a[5][2] + iT[24] * a[9][2] + iT[1] * a[4][1] + iT[10] * a[7][4] - iT[14] * a[1][4] - iT[23] * a[8][4] - iT[2] * a[4][5] - iT[9] * a[7][0] + iT[15] * a[1][0] + iT[22] * a[8][0] + iT[3] * a[4][3] + iT[8] * a[6][3] + iT[16] * a[0][2] - iT[21] * a[5][3] - iT[4] * a[2][5] - iT[7] * a[3][5] + iT[17] * a[0][5] + iT[20] * a[1][5] + iT[5] * a[2][1] + iT[6] * a[3][1] - iT[18] * a[0][1] - iT[19] * a[1][1] - t[0] + add) >> shift; |
1777 | | dst[25 * line] = ( - iT[0] * a[4][5] - iT[11] * a[6][5] - iT[13] * a[0][0] + iT[24] * a[5][5] - iT[1] * a[3][1] - iT[10] * a[2][1] + iT[14] * a[1][1] + iT[23] * a[0][1] + iT[2] * a[7][2] + iT[9] * a[4][3] - iT[15] * a[8][2] - iT[22] * a[1][2] + iT[3] * a[6][2] + iT[8] * a[3][3] + iT[16] * a[9][2] + iT[21] * a[8][3] + iT[4] * a[2][4] - iT[7] * a[7][4] - iT[17] * a[5][1] - iT[20] * a[9][1] - iT[5] * a[4][0] - iT[6] * a[6][0] - iT[18] * a[0][5] + iT[19] * a[5][0] - t[0] + add) >> shift; |
1778 | | dst[26 * line] = ( iT[0] * a[8][0] + iT[11] * a[1][0] - iT[13] * a[4][5] - iT[24] * a[7][0] + iT[1] * a[5][4] + iT[10] * a[9][4] + iT[14] * a[7][1] - iT[23] * a[2][1] - iT[2] * a[1][2] - iT[9] * a[0][2] + iT[15] * a[2][2] + iT[22] * a[3][2] - iT[3] * a[9][2] - iT[8] * a[8][3] - iT[16] * a[3][3] - iT[21] * a[6][2] + iT[4] * a[0][4] - iT[7] * a[5][1] + iT[17] * a[6][1] + iT[20] * a[4][1] + iT[5] * a[8][5] + iT[6] * a[1][5] - iT[18] * a[4][0] - iT[19] * a[7][5] - t[1] + add) >> shift; |
1779 | | dst[28 * line] = ( - iT[0] * a[5][1] - iT[11] * a[9][1] - iT[13] * a[7][4] + iT[24] * a[2][4] + iT[1] * a[8][2] + iT[10] * a[9][3] + iT[14] * a[6][3] + iT[23] * a[3][2] - iT[2] * a[9][4] - iT[9] * a[8][1] - iT[15] * a[3][1] - iT[22] * a[6][4] + iT[3] * a[9][0] + iT[8] * a[5][0] - iT[16] * a[2][5] + iT[21] * a[7][5] - iT[4] * a[5][5] + iT[7] * a[0][0] + iT[17] * a[4][5] + iT[20] * a[6][5] + iT[5] * a[1][3] + iT[6] * a[0][3] - iT[18] * a[2][3] - iT[19] * a[3][3] + t[1] + add) >> shift; |
1780 | | dst[29 * line] = ( iT[0] * a[6][4] + iT[11] * a[3][1] + iT[13] * a[9][4] + iT[24] * a[8][1] - iT[1] * a[7][3] - iT[10] * a[4][2] + iT[14] * a[8][3] + iT[23] * a[1][3] - iT[2] * a[3][5] - iT[9] * a[2][5] + iT[15] * a[1][5] + iT[22] * a[0][5] + iT[3] * a[2][4] + iT[8] * a[3][4] - iT[16] * a[0][4] - iT[21] * a[1][4] + iT[4] * a[4][3] + iT[7] * a[7][2] - iT[17] * a[1][2] - iT[20] * a[8][2] - iT[5] * a[3][0] - iT[6] * a[6][5] - iT[18] * a[8][0] - iT[19] * a[9][5] + t[0] + add) >> shift; |
1781 | | dst[30 * line] = ( - iT[0] * a[7][2] + iT[11] * a[2][2] - iT[13] * a[9][3] - iT[24] * a[5][3] - iT[1] * a[6][0] - iT[10] * a[4][0] + iT[14] * a[5][0] - iT[23] * a[0][5] - iT[2] * a[4][2] - iT[9] * a[6][2] - iT[15] * a[0][3] + iT[22] * a[5][2] + iT[3] * a[2][0] - iT[8] * a[7][0] - iT[16] * a[5][5] - iT[21] * a[9][5] + iT[4] * a[7][1] - iT[7] * a[2][1] + iT[17] * a[9][4] + iT[20] * a[5][4] + iT[5] * a[6][1] + iT[6] * a[4][1] - iT[18] * a[5][1] + iT[19] * a[0][4] + t[0] + add) >> shift; |
1782 | | dst[31 * line] = ( iT[0] * a[8][5] + iT[11] * a[1][5] - iT[13] * a[4][0] - iT[24] * a[7][5] - iT[1] * a[1][0] - iT[10] * a[8][0] + iT[14] * a[7][0] + iT[23] * a[4][5] - iT[2] * a[8][4] - iT[9] * a[1][4] + iT[15] * a[4][1] + iT[22] * a[7][4] + iT[3] * a[1][1] + iT[8] * a[8][1] - iT[16] * a[7][1] - iT[21] * a[4][4] + iT[4] * a[8][3] + iT[7] * a[1][3] - iT[17] * a[4][2] - iT[20] * a[7][3] - iT[5] * a[1][2] - iT[6] * a[8][2] + iT[18] * a[7][2] + iT[19] * a[4][3] + t[1] + add) >> shift; |
1783 | | |
          // Six coefficients built from the b[] terms only.
1784 | | dst[ 2 * line] = ( iT[ 4] * b[0] + iT[ 9] * b[1] + iT[14] * b[2] + iT[19] * b[3] + iT[24] * b[4] + iT[29] * b[5] + add) >> shift; |
1785 | | dst[ 7 * line] = ( - iT[14] * b[0] - iT[29] * b[1] - iT[19] * b[2] - iT[ 4] * b[3] + iT[ 9] * b[4] + iT[24] * b[5] + add) >> shift; |
1786 | | dst[12 * line] = ( iT[24] * b[0] + iT[14] * b[1] - iT[ 9] * b[2] - iT[29] * b[3] - iT[ 4] * b[4] + iT[19] * b[5] + add) >> shift; |
1787 | | dst[17 * line] = ( - iT[29] * b[0] + iT[ 4] * b[1] + iT[24] * b[2] - iT[ 9] * b[3] - iT[19] * b[4] + iT[14] * b[5] + add) >> shift; |
1788 | | dst[22 * line] = ( iT[19] * b[0] - iT[24] * b[1] + iT[ 4] * b[2] + iT[14] * b[3] - iT[29] * b[4] + iT[ 9] * b[5] + add) >> shift; |
1789 | | dst[27 * line] = ( - iT[ 9] * b[0] + iT[19] * b[1] - iT[29] * b[2] + iT[24] * b[3] - iT[14] * b[4] + iT[ 4] * b[5] + add) >> shift; |
1790 | | |
          // Two coefficients built from the c[] terms only.
1791 | | dst[ 6 * line] = ( iT[12] * c[0] + iT[25] * c[1] + add) >> shift; |
1792 | | dst[19 * line] = ( - iT[25] * c[0] + iT[12] * c[1] + add) >> shift; |
1793 | | |
          // Advance to the next group of 32 inputs and to the next output position.
1794 | | src += 32; |
1795 | | dst++; |
1796 | | } |
1797 | | |
          // Zero the last iSkipLine entries in each of the first `cutoff` coefficient rows.
1798 | | if (iSkipLine) |
1799 | | { |
1800 | | dst = pCoef + reducedLine; |
1801 | | for (j = 0; j < cutoff; j++) |
1802 | | { |
1803 | | memset(dst, 0, sizeof(TCoeff)*iSkipLine); |
1804 | | dst += line; |
1805 | | } |
1806 | | } |
1807 | | |
          // Zero the last iSkipLine2 coefficient rows completely; this overwrites the
          // values the loop above stored for those rows.
1808 | | if (iSkipLine2) |
1809 | | { |
1810 | | dst = pCoef + line * cutoff; |
1811 | | memset(dst, 0, sizeof(TCoeff) * line * iSkipLine2); |
1812 | | } |
1813 | | #else |
1814 | 0 | _fastForwardMM< 32 >( src, dst, shift, line, iSkipLine, iSkipLine2, g_trCoreDCT8P32[TRANSFORM_FORWARD][0] ); |
1815 | 0 | #endif |
1816 | 0 | } |
1817 | | |
1818 | | |
1819 | | void fastInverseDCT8_B32(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum) |
1820 | 0 | { |
1821 | | #if !JVET_M0497_MATRIX_MULT |
1822 | | int j, k; |
1823 | | TCoeff a[10][6]; |
1824 | | TCoeff t[2]; |
1825 | | TCoeff b[6]; |
1826 | | TCoeff c[2]; |
1827 | | TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0; |
1828 | | |
1829 | | const TMatrixCoeff *iT = g_trCoreDST7P32[TRANSFORM_INVERSE][0]; |
1830 | | |
1831 | | const int reducedLine = line - iSkipLine; |
1832 | | |
1833 | | for (j = 0; j < reducedLine; j++) |
1834 | | { |
1835 | | for (k = 0; k < 6; k++) |
1836 | | { |
1837 | | a[0][k] = src[(31 - k)*line] - src[(20 + k)*line]; |
1838 | | a[1][k] = src[(31 - k)*line] + src[(18 - k)*line]; |
1839 | | a[2][k] = src[(31 - k)*line] + src[( 7 + k)*line]; |
1840 | | a[3][k] = src[(31 - k)*line] - src[( 5 - k)*line]; |
1841 | | a[4][k] = src[(25 - k)*line] + src[(13 + k)*line]; |
1842 | | a[5][k] = src[(25 - k)*line] + src[(12 - k)*line]; |
1843 | | a[6][k] = src[(25 - k)*line] - src[ k *line]; |
1844 | | a[7][k] = src[(18 - k)*line] - src[( 7 + k)*line]; |
1845 | | a[8][k] = src[(18 - k)*line] + src[( 5 - k)*line]; |
1846 | | a[9][k] = src[(12 - k)*line] + src[ k *line]; |
1847 | | |
1848 | | b[k] = src[(31 - k)*line] + src[(20 + k)*line] - src[(18 - k)*line] - src[(7 + k)*line] + src[(5 - k)*line]; |
1849 | | } |
1850 | | |
1851 | | for (k = 0; k < 2; k++) |
1852 | | { |
1853 | | c[k] = src[(31 - k)*line] + src[(28 + k)*line] - src[(26 - k)*line] - src[(23 + k)*line] + src[(21 - k)*line] + src[(18 + k)*line] - src[(16 - k)*line] - src[(13 + k)*line] + src[(11 - k)*line] + src[(8 + k)*line] - src[(6 - k)*line] - src[(3 + k)*line] + src[(1 - k)*line]; |
1854 | | } |
1855 | | |
1856 | | t[0] = iT[12] * src[19 * line] + iT[25] * src[ 6 * line]; |
1857 | | t[1] = iT[12] * src[ 6 * line] - iT[25] * src[19 * line]; |
1858 | | |
1859 | | dst[ 0] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[3][0] + iT[11] * a[6][5] + iT[13] * a[8][0] + iT[24] * a[9][5] + iT[1] * a[3][1] + iT[10] * a[6][4] + iT[14] * a[8][1] + iT[23] * a[9][4] + iT[2] * a[3][2] + iT[9] * a[6][3] + iT[15] * a[8][2] + iT[22] * a[9][3] + iT[3] * a[3][3] + iT[8] * a[6][2] + iT[16] * a[8][3] + iT[21] * a[9][2] + iT[4] * a[3][4] + iT[7] * a[6][1] + iT[17] * a[8][4] + iT[20] * a[9][1] + iT[5] * a[3][5] + iT[6] * a[6][0] + iT[18] * a[8][5] + iT[19] * a[9][0] + t[0] + add) >> shift); |
1860 | | dst[ 1] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[5][2] - iT[11] * a[0][3] - iT[13] * a[4][2] - iT[24] * a[6][2] - iT[1] * a[9][1] - iT[10] * a[8][4] - iT[14] * a[3][4] - iT[23] * a[6][1] - iT[2] * a[0][0] + iT[9] * a[5][5] - iT[15] * a[6][5] - iT[22] * a[4][5] + iT[3] * a[5][3] - iT[8] * a[0][2] - iT[16] * a[4][3] - iT[21] * a[6][3] - iT[4] * a[9][0] - iT[7] * a[8][5] - iT[17] * a[3][5] - iT[20] * a[6][0] - iT[5] * a[0][1] + iT[6] * a[5][4] - iT[18] * a[6][4] - iT[19] * a[4][4] + t[1] + add) >> shift); |
1861 | | dst[ 3] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[9][4] + iT[11] * a[5][4] - iT[13] * a[2][1] + iT[24] * a[7][1] + iT[1] * a[0][3] + iT[10] * a[1][3] - iT[14] * a[3][3] - iT[23] * a[2][3] - iT[2] * a[8][5] - iT[9] * a[9][0] - iT[15] * a[6][0] - iT[22] * a[3][5] + iT[3] * a[1][4] + iT[8] * a[0][4] - iT[16] * a[2][4] - iT[21] * a[3][4] + iT[4] * a[5][3] + iT[7] * a[9][3] + iT[17] * a[7][2] - iT[20] * a[2][2] - iT[5] * a[8][0] - iT[6] * a[1][0] + iT[18] * a[4][5] + iT[19] * a[7][0] - t[1] + add) >> shift); |
1862 | | dst[ 4] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[3][2] - iT[11] * a[2][2] + iT[13] * a[1][2] + iT[24] * a[0][2] + iT[1] * a[6][0] + iT[10] * a[3][5] + iT[14] * a[9][0] + iT[23] * a[8][5] - iT[2] * a[2][3] - iT[9] * a[3][3] + iT[15] * a[0][3] + iT[22] * a[1][3] - iT[3] * a[7][0] + iT[8] * a[2][0] - iT[16] * a[9][5] - iT[21] * a[5][5] + iT[4] * a[4][4] + iT[7] * a[6][4] + iT[17] * a[0][1] - iT[20] * a[5][4] - iT[5] * a[7][4] - iT[6] * a[4][1] + iT[18] * a[8][4] + iT[19] * a[1][4] - t[0] + add) >> shift); |
1863 | | dst[ 5] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[3][5] + iT[11] * a[6][0] + iT[13] * a[8][5] + iT[24] * a[9][0] - iT[1] * a[6][5] - iT[10] * a[3][0] - iT[14] * a[9][5] - iT[23] * a[8][0] + iT[2] * a[7][4] - iT[9] * a[2][4] + iT[15] * a[9][1] + iT[22] * a[5][1] + iT[3] * a[7][1] + iT[8] * a[4][4] - iT[16] * a[8][1] - iT[21] * a[1][1] - iT[4] * a[6][2] - iT[7] * a[4][2] + iT[17] * a[5][2] - iT[20] * a[0][3] + iT[5] * a[3][2] + iT[6] * a[2][2] - iT[18] * a[1][2] - iT[19] * a[0][2] - t[0] + add) >> shift); |
1864 | | dst[ 8] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[9][3] + iT[11] * a[8][2] + iT[13] * a[3][2] + iT[24] * a[6][3] + iT[1] * a[1][5] + iT[10] * a[0][5] - iT[14] * a[2][5] - iT[23] * a[3][5] - iT[2] * a[1][3] - iT[9] * a[8][3] + iT[15] * a[7][3] + iT[22] * a[4][2] - iT[3] * a[9][5] - iT[8] * a[5][5] + iT[16] * a[2][0] - iT[21] * a[7][0] - iT[4] * a[1][1] - iT[7] * a[0][1] + iT[17] * a[2][1] + iT[20] * a[3][1] + iT[5] * a[5][1] + iT[6] * a[9][1] + iT[18] * a[7][4] - iT[19] * a[2][4] + t[1] + add) >> shift); |
1865 | | dst[ 9] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[2][1] + iT[11] * a[3][1] - iT[13] * a[0][1] - iT[24] * a[1][1] - iT[1] * a[7][3] + iT[10] * a[2][3] - iT[14] * a[9][2] - iT[23] * a[5][2] - iT[2] * a[4][0] - iT[9] * a[7][5] + iT[15] * a[1][5] + iT[22] * a[8][5] - iT[3] * a[3][4] - iT[8] * a[2][4] + iT[16] * a[1][4] + iT[21] * a[0][4] - iT[4] * a[6][3] - iT[7] * a[3][2] - iT[17] * a[9][3] - iT[20] * a[8][2] - iT[5] * a[4][5] - iT[6] * a[6][5] - iT[18] * a[0][0] + iT[19] * a[5][5] + t[0] + add) >> shift); |
1866 | | dst[10] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[6][1] - iT[11] * a[4][1] + iT[13] * a[5][1] - iT[24] * a[0][4] + iT[1] * a[2][2] - iT[10] * a[7][2] - iT[14] * a[5][3] - iT[23] * a[9][3] + iT[2] * a[6][4] + iT[9] * a[4][4] - iT[15] * a[5][4] + iT[22] * a[0][1] - iT[3] * a[2][5] + iT[8] * a[7][5] + iT[16] * a[5][0] + iT[21] * a[9][0] - iT[4] * a[7][0] - iT[7] * a[4][5] + iT[17] * a[8][0] + iT[20] * a[1][0] + iT[5] * a[4][2] + iT[6] * a[7][3] - iT[18] * a[1][3] - iT[19] * a[8][3] + t[0] + add) >> shift); |
1867 | | dst[11] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[1][3] - iT[11] * a[0][3] + iT[13] * a[2][3] + iT[24] * a[3][3] - iT[1] * a[9][1] - iT[10] * a[5][1] + iT[14] * a[2][4] - iT[23] * a[7][4] - iT[2] * a[8][0] - iT[9] * a[9][5] - iT[15] * a[6][5] - iT[22] * a[3][0] + iT[3] * a[0][2] - iT[8] * a[5][3] + iT[16] * a[6][3] + iT[21] * a[4][3] + iT[4] * a[5][0] - iT[7] * a[0][5] - iT[17] * a[4][0] - iT[20] * a[6][0] + iT[5] * a[9][4] + iT[6] * a[5][4] - iT[18] * a[2][1] + iT[19] * a[7][1] + t[1] + add) >> shift); |
1868 | | dst[13] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[0][0] + iT[11] * a[1][0] - iT[13] * a[3][0] - iT[24] * a[2][0] + iT[1] * a[5][4] - iT[10] * a[0][1] - iT[14] * a[4][4] - iT[23] * a[6][4] - iT[2] * a[9][3] - iT[9] * a[5][3] + iT[15] * a[2][2] - iT[22] * a[7][2] + iT[3] * a[8][3] + iT[8] * a[9][2] + iT[16] * a[6][2] + iT[21] * a[3][3] - iT[4] * a[1][4] - iT[7] * a[8][4] + iT[17] * a[7][4] + iT[20] * a[4][1] + iT[5] * a[0][5] + iT[6] * a[1][5] - iT[18] * a[3][5] - iT[19] * a[2][5] - t[1] + add) >> shift); |
1869 | | dst[14] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[4][2] + iT[11] * a[7][3] - iT[13] * a[1][3] - iT[24] * a[8][3] + iT[1] * a[4][1] + iT[10] * a[6][1] + iT[14] * a[0][4] - iT[23] * a[5][1] - iT[2] * a[3][0] - iT[9] * a[2][0] + iT[15] * a[1][0] + iT[22] * a[0][0] - iT[3] * a[6][3] - iT[8] * a[4][3] + iT[16] * a[5][3] - iT[21] * a[0][2] - iT[4] * a[7][5] - iT[7] * a[4][0] + iT[17] * a[8][5] + iT[20] * a[1][5] + iT[5] * a[6][4] + iT[6] * a[3][1] + iT[18] * a[9][4] + iT[19] * a[8][1] - t[0] + add) >> shift); |
1870 | | dst[15] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[7][4] + iT[11] * a[4][1] - iT[13] * a[8][4] - iT[24] * a[1][4] - iT[1] * a[2][2] - iT[10] * a[3][2] + iT[14] * a[0][2] + iT[23] * a[1][2] - iT[2] * a[2][1] + iT[9] * a[7][1] + iT[15] * a[5][4] + iT[22] * a[9][4] + iT[3] * a[7][5] - iT[8] * a[2][5] + iT[16] * a[9][0] + iT[21] * a[5][0] + iT[4] * a[2][0] + iT[7] * a[3][0] - iT[17] * a[0][0] - iT[20] * a[1][0] + iT[5] * a[2][3] - iT[6] * a[7][3] - iT[18] * a[5][2] - iT[19] * a[9][2] - t[0] + add) >> shift); |
1871 | | dst[16] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[0][1] + iT[11] * a[5][4] - iT[13] * a[6][4] - iT[24] * a[4][4] + iT[1] * a[0][3] - iT[10] * a[5][2] + iT[14] * a[6][2] + iT[23] * a[4][2] - iT[2] * a[0][5] + iT[9] * a[5][0] - iT[15] * a[6][0] - iT[22] * a[4][0] - iT[3] * a[0][4] - iT[8] * a[1][4] + iT[16] * a[3][4] + iT[21] * a[2][4] + iT[4] * a[0][2] + iT[7] * a[1][2] - iT[17] * a[3][2] - iT[20] * a[2][2] - iT[5] * a[0][0] - iT[6] * a[1][0] + iT[18] * a[3][0] + iT[19] * a[2][0] - t[1] + add) >> shift); |
1872 | | dst[18] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[0][5] + iT[11] * a[1][5] - iT[13] * a[3][5] - iT[24] * a[2][5] - iT[1] * a[1][0] - iT[10] * a[0][0] + iT[14] * a[2][0] + iT[23] * a[3][0] - iT[2] * a[5][1] + iT[9] * a[0][4] + iT[15] * a[4][1] + iT[22] * a[6][1] - iT[3] * a[8][1] - iT[8] * a[1][1] + iT[16] * a[4][4] + iT[21] * a[7][1] - iT[4] * a[9][2] - iT[7] * a[5][2] + iT[17] * a[2][3] - iT[20] * a[7][3] - iT[5] * a[9][3] - iT[6] * a[8][2] - iT[18] * a[3][2] - iT[19] * a[6][3] + t[1] + add) >> shift); |
1873 | | dst[20] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[4][0] - iT[11] * a[6][0] - iT[13] * a[0][5] + iT[24] * a[5][0] + iT[1] * a[6][5] + iT[10] * a[4][5] - iT[14] * a[5][5] + iT[23] * a[0][0] - iT[2] * a[6][1] - iT[9] * a[3][4] - iT[15] * a[9][1] - iT[22] * a[8][4] + iT[3] * a[4][4] + iT[8] * a[7][1] - iT[16] * a[1][1] - iT[21] * a[8][1] - iT[4] * a[3][3] - iT[7] * a[2][3] + iT[17] * a[1][3] + iT[20] * a[0][3] + iT[5] * a[7][2] - iT[6] * a[2][2] + iT[18] * a[9][3] + iT[19] * a[5][3] + t[0] + add) >> shift); |
1874 | | dst[21] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[1][2] + iT[11] * a[8][2] - iT[13] * a[7][2] - iT[24] * a[4][3] + iT[1] * a[1][5] + iT[10] * a[8][5] - iT[14] * a[7][5] - iT[23] * a[4][0] + iT[2] * a[5][2] + iT[9] * a[9][2] + iT[15] * a[7][3] - iT[22] * a[2][3] + iT[3] * a[5][5] + iT[8] * a[9][5] + iT[16] * a[7][0] - iT[21] * a[2][0] + iT[4] * a[8][1] + iT[7] * a[9][4] + iT[17] * a[6][4] + iT[20] * a[3][1] + iT[5] * a[8][4] + iT[6] * a[9][1] + iT[18] * a[6][1] + iT[19] * a[3][4] + t[1] + add) >> shift); |
1875 | | dst[23] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[8][4] + iT[11] * a[9][1] + iT[13] * a[6][1] + iT[24] * a[3][4] - iT[1] * a[8][2] - iT[10] * a[1][2] + iT[14] * a[4][3] + iT[23] * a[7][2] - iT[2] * a[0][1] - iT[9] * a[1][1] + iT[15] * a[3][1] + iT[22] * a[2][1] + iT[3] * a[5][0] + iT[8] * a[9][0] + iT[16] * a[7][5] - iT[21] * a[2][5] - iT[4] * a[9][5] - iT[7] * a[8][0] - iT[17] * a[3][0] - iT[20] * a[6][5] + iT[5] * a[5][2] - iT[6] * a[0][3] - iT[18] * a[4][2] - iT[19] * a[6][2] - t[1] + add) >> shift); |
1876 | | dst[24] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[2][3] + iT[11] * a[7][3] + iT[13] * a[5][2] + iT[24] * a[9][2] + iT[1] * a[4][1] + iT[10] * a[7][4] - iT[14] * a[1][4] - iT[23] * a[8][4] - iT[2] * a[4][5] - iT[9] * a[7][0] + iT[15] * a[1][0] + iT[22] * a[8][0] + iT[3] * a[4][3] + iT[8] * a[6][3] + iT[16] * a[0][2] - iT[21] * a[5][3] - iT[4] * a[2][5] - iT[7] * a[3][5] + iT[17] * a[0][5] + iT[20] * a[1][5] + iT[5] * a[2][1] + iT[6] * a[3][1] - iT[18] * a[0][1] - iT[19] * a[1][1] - t[0] + add) >> shift); |
1877 | | dst[25] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[4][5] - iT[11] * a[6][5] - iT[13] * a[0][0] + iT[24] * a[5][5] - iT[1] * a[3][1] - iT[10] * a[2][1] + iT[14] * a[1][1] + iT[23] * a[0][1] + iT[2] * a[7][2] + iT[9] * a[4][3] - iT[15] * a[8][2] - iT[22] * a[1][2] + iT[3] * a[6][2] + iT[8] * a[3][3] + iT[16] * a[9][2] + iT[21] * a[8][3] + iT[4] * a[2][4] - iT[7] * a[7][4] - iT[17] * a[5][1] - iT[20] * a[9][1] - iT[5] * a[4][0] - iT[6] * a[6][0] - iT[18] * a[0][5] + iT[19] * a[5][0] - t[0] + add) >> shift); |
1878 | | dst[26] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[8][0] + iT[11] * a[1][0] - iT[13] * a[4][5] - iT[24] * a[7][0] + iT[1] * a[5][4] + iT[10] * a[9][4] + iT[14] * a[7][1] - iT[23] * a[2][1] - iT[2] * a[1][2] - iT[9] * a[0][2] + iT[15] * a[2][2] + iT[22] * a[3][2] - iT[3] * a[9][2] - iT[8] * a[8][3] - iT[16] * a[3][3] - iT[21] * a[6][2] + iT[4] * a[0][4] - iT[7] * a[5][1] + iT[17] * a[6][1] + iT[20] * a[4][1] + iT[5] * a[8][5] + iT[6] * a[1][5] - iT[18] * a[4][0] - iT[19] * a[7][5] - t[1] + add) >> shift); |
1879 | | dst[28] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[5][1] - iT[11] * a[9][1] - iT[13] * a[7][4] + iT[24] * a[2][4] + iT[1] * a[8][2] + iT[10] * a[9][3] + iT[14] * a[6][3] + iT[23] * a[3][2] - iT[2] * a[9][4] - iT[9] * a[8][1] - iT[15] * a[3][1] - iT[22] * a[6][4] + iT[3] * a[9][0] + iT[8] * a[5][0] - iT[16] * a[2][5] + iT[21] * a[7][5] - iT[4] * a[5][5] + iT[7] * a[0][0] + iT[17] * a[4][5] + iT[20] * a[6][5] + iT[5] * a[1][3] + iT[6] * a[0][3] - iT[18] * a[2][3] - iT[19] * a[3][3] + t[1] + add) >> shift); |
1880 | | dst[29] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[6][4] + iT[11] * a[3][1] + iT[13] * a[9][4] + iT[24] * a[8][1] - iT[1] * a[7][3] - iT[10] * a[4][2] + iT[14] * a[8][3] + iT[23] * a[1][3] - iT[2] * a[3][5] - iT[9] * a[2][5] + iT[15] * a[1][5] + iT[22] * a[0][5] + iT[3] * a[2][4] + iT[8] * a[3][4] - iT[16] * a[0][4] - iT[21] * a[1][4] + iT[4] * a[4][3] + iT[7] * a[7][2] - iT[17] * a[1][2] - iT[20] * a[8][2] - iT[5] * a[3][0] - iT[6] * a[6][5] - iT[18] * a[8][0] - iT[19] * a[9][5] + t[0] + add) >> shift); |
1881 | | dst[30] = Clip3(outputMinimum, outputMaximum, (int)( - iT[0] * a[7][2] + iT[11] * a[2][2] - iT[13] * a[9][3] - iT[24] * a[5][3] - iT[1] * a[6][0] - iT[10] * a[4][0] + iT[14] * a[5][0] - iT[23] * a[0][5] - iT[2] * a[4][2] - iT[9] * a[6][2] - iT[15] * a[0][3] + iT[22] * a[5][2] + iT[3] * a[2][0] - iT[8] * a[7][0] - iT[16] * a[5][5] - iT[21] * a[9][5] + iT[4] * a[7][1] - iT[7] * a[2][1] + iT[17] * a[9][4] + iT[20] * a[5][4] + iT[5] * a[6][1] + iT[6] * a[4][1] - iT[18] * a[5][1] + iT[19] * a[0][4] + t[0] + add) >> shift); |
1882 | | dst[31] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[8][5] + iT[11] * a[1][5] - iT[13] * a[4][0] - iT[24] * a[7][5] - iT[1] * a[1][0] - iT[10] * a[8][0] + iT[14] * a[7][0] + iT[23] * a[4][5] - iT[2] * a[8][4] - iT[9] * a[1][4] + iT[15] * a[4][1] + iT[22] * a[7][4] + iT[3] * a[1][1] + iT[8] * a[8][1] - iT[16] * a[7][1] - iT[21] * a[4][4] + iT[4] * a[8][3] + iT[7] * a[1][3] - iT[17] * a[4][2] - iT[20] * a[7][3] - iT[5] * a[1][2] - iT[6] * a[8][2] + iT[18] * a[7][2] + iT[19] * a[4][3] + t[1] + add) >> shift); |
1883 | | |
1884 | | dst[ 2] = Clip3(outputMinimum, outputMaximum, (int)( iT[ 4] * b[0] + iT[ 9] * b[1] + iT[14] * b[2] + iT[19] * b[3] + iT[24] * b[4] + iT[29] * b[5] + add) >> shift); |
1885 | | dst[ 7] = Clip3(outputMinimum, outputMaximum, (int)( - iT[14] * b[0] - iT[29] * b[1] - iT[19] * b[2] - iT[ 4] * b[3] + iT[ 9] * b[4] + iT[24] * b[5] + add) >> shift); |
1886 | | dst[12] = Clip3(outputMinimum, outputMaximum, (int)( iT[24] * b[0] + iT[14] * b[1] - iT[ 9] * b[2] - iT[29] * b[3] - iT[ 4] * b[4] + iT[19] * b[5] + add) >> shift); |
1887 | | dst[17] = Clip3(outputMinimum, outputMaximum, (int)( - iT[29] * b[0] + iT[ 4] * b[1] + iT[24] * b[2] - iT[ 9] * b[3] - iT[19] * b[4] + iT[14] * b[5] + add) >> shift); |
1888 | | dst[22] = Clip3(outputMinimum, outputMaximum, (int)( iT[19] * b[0] - iT[24] * b[1] + iT[ 4] * b[2] + iT[14] * b[3] - iT[29] * b[4] + iT[ 9] * b[5] + add) >> shift); |
1889 | | dst[27] = Clip3(outputMinimum, outputMaximum, (int)( - iT[ 9] * b[0] + iT[19] * b[1] - iT[29] * b[2] + iT[24] * b[3] - iT[14] * b[4] + iT[ 4] * b[5] + add) >> shift); |
1890 | | |
1891 | | dst[ 6] = Clip3(outputMinimum, outputMaximum, (int)( iT[12] * c[0] + iT[25] * c[1] + add) >> shift); |
1892 | | dst[19] = Clip3(outputMinimum, outputMaximum, (int)( - iT[25] * c[0] + iT[12] * c[1] + add) >> shift); |
1893 | | |
1894 | | src++; |
1895 | | dst += 32; |
1896 | | } |
1897 | | |
1898 | | if (iSkipLine) |
1899 | | { |
1900 | | memset(dst, 0, (iSkipLine * 32) * sizeof(TCoeff)); |
1901 | | } |
1902 | | #else |
1903 | 0 | _fastInverseMM< 32 >( src, dst, shift, line, iSkipLine, iSkipLine2, outputMinimum, outputMaximum, g_trCoreDCT8P32[TRANSFORM_INVERSE][0] ); |
1904 | 0 | #endif |
1905 | 0 | } |
1906 | | |
1907 | | #if ENABLE_SIMD_TRAFO |
1908 | | |
1909 | | #define DONT_UNDEF_SIZE_AWARE_PER_EL_OP 1 |
1910 | | |
1911 | | } // namespace vvenc |
1912 | | |
1913 | | #include "Unit.h" |
1914 | | |
1915 | | namespace vvenc { |
1916 | | |
1917 | | void cpyCoeffCore( const Pel* src, ptrdiff_t stride, TCoeff* dst, unsigned width, unsigned height ) |
1918 | 0 | { |
1919 | 0 | #define CPYCOEFF_OP( ADDR ) dst[ADDR] = src[ADDR]; |
1920 | 0 | #define CPYCOEFF_INC src += stride; dst += width; |
1921 | |
|
1922 | 0 | SIZE_AWARE_PER_EL_OP( CPYCOEFF_OP, CPYCOEFF_INC ); |
1923 | |
|
1924 | 0 | #undef CPYCOEFF_INC |
1925 | 0 | #undef CPYCOEFF_OP |
1926 | 0 | } |
1927 | | |
1928 | | |
1929 | | void cpyResiCore( const TCoeff* src, Pel* dst, ptrdiff_t stride, unsigned width, unsigned height ) |
1930 | 0 | { |
1931 | 0 | #define CPYRESI_OP( ADDR ) dst[ADDR] = Pel( src[ADDR] ); |
1932 | 0 | #define CPYRESI_INC dst += stride; src += width; |
1933 | |
|
1934 | 0 | SIZE_AWARE_PER_EL_OP( CPYRESI_OP, CPYRESI_INC ); |
1935 | |
|
1936 | 0 | #undef CPYRESI_INC |
1937 | 0 | #undef CPYRESI_OP |
1938 | 0 | } |
1939 | | |
1940 | | |
1941 | | void clipCore( TCoeff *dst, unsigned width, unsigned height, unsigned stride, const TCoeff outputMin, const TCoeff outputMax, const TCoeff round, const TCoeff shift ) |
1942 | 0 | { |
1943 | 0 | #define CLIP_OP( ADDR ) dst[ADDR] = Clip3( outputMin, outputMax, ( dst[ADDR] + round ) >> shift ) |
1944 | 0 | #define CLIP_INC dst += stride |
1945 | |
|
1946 | 0 | SIZE_AWARE_PER_EL_OP( CLIP_OP, CLIP_INC ); |
1947 | |
|
1948 | 0 | #undef CLIP_INC |
1949 | 0 | #undef CLIP_OP |
1950 | 0 | } |
1951 | | |
1952 | | |
1953 | | template<unsigned trSize> |
1954 | | void fastInvCore_( const TMatrixCoeff* it, const TCoeff* src, TCoeff* dst, unsigned lines, unsigned reducedLines, unsigned rows ) |
1955 | 0 | { |
1956 | 0 | for( int k = 0; k < rows; k++ ) |
1957 | 0 | { |
1958 | 0 | const TCoeff* srcPtr = &src[k * lines]; |
1959 | 0 | for( int i = 0; i < reducedLines; i++ ) |
1960 | 0 | { |
1961 | 0 | TCoeff* dstPtr = &dst[i * trSize]; |
1962 | 0 | const TMatrixCoeff* itPtr = &it[k * trSize]; |
1963 | 0 | for( int j = 0; j < trSize; j++ ) |
1964 | 0 | { |
1965 | 0 | *dstPtr++ += *srcPtr * *itPtr++; |
1966 | 0 | } |
1967 | 0 | srcPtr++; |
1968 | 0 | } |
1969 | 0 | } |
1970 | 0 | } Unexecuted instantiation: void vvenc::fastInvCore_<4u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int) Unexecuted instantiation: void vvenc::fastInvCore_<8u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int) Unexecuted instantiation: void vvenc::fastInvCore_<16u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int) Unexecuted instantiation: void vvenc::fastInvCore_<32u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int) Unexecuted instantiation: void vvenc::fastInvCore_<64u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int) |
1971 | | |
1972 | | |
1973 | | template<unsigned trSize> |
1974 | | void fastFwdCore( const TMatrixCoeff* tc, const TCoeff* src, TCoeff* dst, unsigned line, unsigned reducedLine, unsigned cutoff, int shift ) |
1975 | 0 | { |
1976 | 0 | const int rnd_factor = 1 << ( shift - 1 ); |
1977 | |
|
1978 | 0 | for( int i = 0; i < reducedLine; i++ ) |
1979 | 0 | { |
1980 | 0 | TCoeff* dstPtr = dst; |
1981 | 0 | const TMatrixCoeff* iT = tc; |
1982 | |
|
1983 | 0 | for( int j = 0; j < cutoff; j++ ) |
1984 | 0 | { |
1985 | 0 | int sum = 0; |
1986 | |
|
1987 | 0 | for( int k = 0; k < trSize; k++ ) |
1988 | 0 | { |
1989 | | // dst[j * line + i] += src[i * trSize + k] * t[j * trSize + k] |
1990 | 0 | sum += src[k] * iT[k]; |
1991 | 0 | } |
1992 | |
|
1993 | 0 | dstPtr[i] = ( sum + rnd_factor ) >> shift; |
1994 | 0 | dstPtr += line; |
1995 | 0 | iT += trSize; |
1996 | 0 | } |
1997 | |
|
1998 | 0 | src += trSize; |
1999 | 0 | } |
2000 | 0 | } Unexecuted instantiation: void vvenc::fastFwdCore<4u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int) Unexecuted instantiation: void vvenc::fastFwdCore<8u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int) Unexecuted instantiation: void vvenc::fastFwdCore<16u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int) Unexecuted instantiation: void vvenc::fastFwdCore<32u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int) Unexecuted instantiation: void vvenc::fastFwdCore<64u>(short const*, int const*, int*, unsigned int, unsigned int, unsigned int, int) |
2001 | | |
2002 | | |
2003 | | TCoeffOps::TCoeffOps() |
2004 | 256 | { |
2005 | 256 | cpyResi4 = cpyResiCore; |
2006 | 256 | cpyResi8 = cpyResiCore; |
2007 | 256 | cpyCoeff4 = cpyCoeffCore; |
2008 | 256 | cpyCoeff8 = cpyCoeffCore; |
2009 | 256 | roundClip4 = clipCore; |
2010 | 256 | roundClip8 = clipCore; |
2011 | 256 | fastInvCore[0] = fastInvCore_< 4>; |
2012 | 256 | fastInvCore[1] = fastInvCore_< 8>; |
2013 | 256 | fastInvCore[2] = fastInvCore_<16>; |
2014 | 256 | fastInvCore[3] = fastInvCore_<32>; |
2015 | 256 | fastInvCore[4] = fastInvCore_<64>; |
2016 | 256 | fastFwdCore_1D[0] = fastFwdCore< 4>; |
2017 | 256 | fastFwdCore_1D[1] = fastFwdCore< 8>; |
2018 | 256 | fastFwdCore_1D[2] = fastFwdCore<16>; |
2019 | 256 | fastFwdCore_1D[3] = fastFwdCore<32>; |
2020 | 256 | fastFwdCore_1D[4] = fastFwdCore<64>; |
2021 | 256 | fastFwdCore_2D[0] = fastFwdCore< 4>; |
2022 | 256 | fastFwdCore_2D[1] = fastFwdCore< 8>; |
2023 | 256 | fastFwdCore_2D[2] = fastFwdCore<16>; |
2024 | 256 | fastFwdCore_2D[3] = fastFwdCore<32>; |
2025 | 256 | fastFwdCore_2D[4] = fastFwdCore<64>; |
2026 | 256 | } |
2027 | | |
2028 | | TCoeffOps g_tCoeffOps; |
2029 | | |
2030 | | #endif |
2031 | | |
2032 | | |
2033 | | } // namespace vvenc |
2034 | | |
2035 | | //! \} |
2036 | | |