Line | Count | Source |
1 | | /***************************************************************************** |
2 | | * dct.c: transform and zigzag |
3 | | ***************************************************************************** |
4 | | * Copyright (C) 2003-2025 x264 project |
5 | | * |
6 | | * Authors: Loren Merritt <lorenm@u.washington.edu> |
7 | | * Laurent Aimar <fenrir@via.ecp.fr> |
8 | | * Henrik Gramner <henrik@gramner.com> |
9 | | * |
10 | | * This program is free software; you can redistribute it and/or modify |
11 | | * it under the terms of the GNU General Public License as published by |
12 | | * the Free Software Foundation; either version 2 of the License, or |
13 | | * (at your option) any later version. |
14 | | * |
15 | | * This program is distributed in the hope that it will be useful, |
16 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
18 | | * GNU General Public License for more details. |
19 | | * |
20 | | * You should have received a copy of the GNU General Public License |
21 | | * along with this program; if not, write to the Free Software |
22 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. |
23 | | * |
24 | | * This program is also available under a commercial proprietary license. |
25 | | * For more information, contact us at licensing@x264.com. |
26 | | *****************************************************************************/ |
27 | | |
28 | | #include "common.h" |
29 | | #if HAVE_MMX |
30 | | # include "x86/dct.h" |
31 | | #endif |
32 | | #if HAVE_ALTIVEC |
33 | | # include "ppc/dct.h" |
34 | | #endif |
35 | | #if HAVE_ARMV6 |
36 | | # include "arm/dct.h" |
37 | | #endif |
38 | | #if HAVE_AARCH64 |
39 | | # include "aarch64/dct.h" |
40 | | #endif |
41 | | #if HAVE_MSA |
42 | | # include "mips/dct.h" |
43 | | #endif |
44 | | #if HAVE_LSX |
45 | | # include "loongarch/dct.h" |
46 | | #endif |
/* 4x4 butterfly transform of the luma DC coefficients, in place.
 * Two 1D passes (sum/difference butterflies); the second pass applies
 * a +1 rounding and a >>1 downscale. */
static void dct4x4dc( dctcoef d[16] )
{
    dctcoef m[16];

    /* First pass: butterflies over each row, written transposed into m. */
    for( int r = 0; r < 4; r++ )
    {
        int sum01  = d[r*4+0] + d[r*4+1];
        int diff01 = d[r*4+0] - d[r*4+1];
        int sum23  = d[r*4+2] + d[r*4+3];
        int diff23 = d[r*4+2] - d[r*4+3];

        m[0*4+r] = sum01 + sum23;
        m[1*4+r] = sum01 - sum23;
        m[2*4+r] = diff01 - diff23;
        m[3*4+r] = diff01 + diff23;
    }

    /* Second pass: same butterflies, plus rounding and halving. */
    for( int r = 0; r < 4; r++ )
    {
        int sum01  = m[r*4+0] + m[r*4+1];
        int diff01 = m[r*4+0] - m[r*4+1];
        int sum23  = m[r*4+2] + m[r*4+3];
        int diff23 = m[r*4+2] - m[r*4+3];

        d[r*4+0] = ( sum01 + sum23 + 1 ) >> 1;
        d[r*4+1] = ( sum01 - sum23 + 1 ) >> 1;
        d[r*4+2] = ( diff01 - diff23 + 1 ) >> 1;
        d[r*4+3] = ( diff01 + diff23 + 1 ) >> 1;
    }
}
77 | | |
/* Inverse 4x4 DC transform, in place: same butterfly network as
 * dct4x4dc but without the final rounding/downscale shift. */
static void idct4x4dc( dctcoef d[16] )
{
    dctcoef m[16];

    /* First pass: butterflies over each row, written transposed into m. */
    for( int r = 0; r < 4; r++ )
    {
        int sum01  = d[r*4+0] + d[r*4+1];
        int diff01 = d[r*4+0] - d[r*4+1];
        int sum23  = d[r*4+2] + d[r*4+3];
        int diff23 = d[r*4+2] - d[r*4+3];

        m[0*4+r] = sum01 + sum23;
        m[1*4+r] = sum01 - sum23;
        m[2*4+r] = diff01 - diff23;
        m[3*4+r] = diff01 + diff23;
    }

    /* Second pass: butterflies again, no scaling. */
    for( int r = 0; r < 4; r++ )
    {
        int sum01  = m[r*4+0] + m[r*4+1];
        int diff01 = m[r*4+0] - m[r*4+1];
        int sum23  = m[r*4+2] + m[r*4+3];
        int diff23 = m[r*4+2] - m[r*4+3];

        d[r*4+0] = sum01 + sum23;
        d[r*4+1] = sum01 - sum23;
        d[r*4+2] = diff01 - diff23;
        d[r*4+3] = diff01 + diff23;
    }
}
108 | | |
/* 2x4 transform of the DC terms of eight 4x4 blocks.  The DC of every
 * source block is moved into dct[] and zeroed in dct4x4[]. */
static void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
{
    int a[8], b[8];

    /* Stage 1: sums/differences of adjacent DC pairs. */
    for( int i = 0; i < 4; i++ )
    {
        a[i]   = dct4x4[2*i][0] + dct4x4[2*i+1][0];
        a[i+4] = dct4x4[2*i][0] - dct4x4[2*i+1][0];
    }

    /* Stage 2: same butterfly pattern on the stage-1 results. */
    for( int i = 0; i < 4; i++ )
    {
        b[i]   = a[2*i] + a[2*i+1];
        b[i+4] = a[2*i] - a[2*i+1];
    }

    /* Final combination into the 2x4 output layout. */
    dct[0] = b[0] + b[1];
    dct[1] = b[2] + b[3];
    dct[2] = b[0] - b[1];
    dct[3] = b[2] - b[3];
    dct[4] = b[4] - b[5];
    dct[5] = b[6] - b[7];
    dct[6] = b[4] + b[5];
    dct[7] = b[6] + b[7];

    /* The DC terms now live in dct[]; clear them in the source blocks. */
    for( int i = 0; i < 8; i++ )
        dct4x4[i][0] = 0;
}
144 | | |
/* Write the element-wise difference of two i_size x i_size pixel blocks
 * (with independent strides) into a densely-packed coefficient array. */
static inline void pixel_sub_wxh( dctcoef *diff, int i_size,
                                  pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
    for( int row = 0; row < i_size; row++ )
        for( int col = 0; col < i_size; col++ )
            diff[row*i_size + col] = pix1[row*i_pix1 + col] - pix2[row*i_pix2 + col];
}
156 | | |
157 | | static void sub4x4_dct( dctcoef dct[16], pixel *pix1, pixel *pix2 ) |
158 | 0 | { |
159 | 0 | dctcoef d[16]; |
160 | 0 | dctcoef tmp[16]; |
161 | |
|
162 | 0 | pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE ); |
163 | |
|
164 | 0 | for( int i = 0; i < 4; i++ ) |
165 | 0 | { |
166 | 0 | int s03 = d[i*4+0] + d[i*4+3]; |
167 | 0 | int s12 = d[i*4+1] + d[i*4+2]; |
168 | 0 | int d03 = d[i*4+0] - d[i*4+3]; |
169 | 0 | int d12 = d[i*4+1] - d[i*4+2]; |
170 | |
|
171 | 0 | tmp[0*4+i] = s03 + s12; |
172 | 0 | tmp[1*4+i] = 2*d03 + d12; |
173 | 0 | tmp[2*4+i] = s03 - s12; |
174 | 0 | tmp[3*4+i] = d03 - 2*d12; |
175 | 0 | } |
176 | |
|
177 | 0 | for( int i = 0; i < 4; i++ ) |
178 | 0 | { |
179 | 0 | int s03 = tmp[i*4+0] + tmp[i*4+3]; |
180 | 0 | int s12 = tmp[i*4+1] + tmp[i*4+2]; |
181 | 0 | int d03 = tmp[i*4+0] - tmp[i*4+3]; |
182 | 0 | int d12 = tmp[i*4+1] - tmp[i*4+2]; |
183 | |
|
184 | 0 | dct[i*4+0] = s03 + s12; |
185 | 0 | dct[i*4+1] = 2*d03 + d12; |
186 | 0 | dct[i*4+2] = s03 - s12; |
187 | 0 | dct[i*4+3] = d03 - 2*d12; |
188 | 0 | } |
189 | 0 | } |
190 | | |
191 | | static void sub8x8_dct( dctcoef dct[4][16], pixel *pix1, pixel *pix2 ) |
192 | 0 | { |
193 | 0 | sub4x4_dct( dct[0], &pix1[0], &pix2[0] ); |
194 | 0 | sub4x4_dct( dct[1], &pix1[4], &pix2[4] ); |
195 | 0 | sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] ); |
196 | 0 | sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] ); |
197 | 0 | } |
198 | | |
199 | | static void sub16x16_dct( dctcoef dct[16][16], pixel *pix1, pixel *pix2 ) |
200 | 0 | { |
201 | 0 | sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] ); |
202 | 0 | sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] ); |
203 | 0 | sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] ); |
204 | 0 | sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] ); |
205 | 0 | } |
206 | | |
207 | | static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 ) |
208 | 0 | { |
209 | 0 | int sum = 0; |
210 | 0 | for( int i=0; i<4; i++, pix1 += FENC_STRIDE, pix2 += FDEC_STRIDE ) |
211 | 0 | sum += pix1[0] + pix1[1] + pix1[2] + pix1[3] |
212 | 0 | - pix2[0] - pix2[1] - pix2[2] - pix2[3]; |
213 | 0 | return sum; |
214 | 0 | } |
215 | | |
216 | | static void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 ) |
217 | 0 | { |
218 | 0 | dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] ); |
219 | 0 | dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] ); |
220 | 0 | dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] ); |
221 | 0 | dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] ); |
222 | | |
223 | | /* 2x2 DC transform */ |
224 | 0 | int d0 = dct[0] + dct[1]; |
225 | 0 | int d1 = dct[2] + dct[3]; |
226 | 0 | int d2 = dct[0] - dct[1]; |
227 | 0 | int d3 = dct[2] - dct[3]; |
228 | 0 | dct[0] = d0 + d1; |
229 | 0 | dct[1] = d0 - d1; |
230 | 0 | dct[2] = d2 + d3; |
231 | 0 | dct[3] = d2 - d3; |
232 | 0 | } |
233 | | |
234 | | static void sub8x16_dct_dc( dctcoef dct[8], pixel *pix1, pixel *pix2 ) |
235 | 0 | { |
236 | 0 | int a0 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+0], &pix2[ 0*FDEC_STRIDE+0] ); |
237 | 0 | int a1 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+4], &pix2[ 0*FDEC_STRIDE+4] ); |
238 | 0 | int a2 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+0], &pix2[ 4*FDEC_STRIDE+0] ); |
239 | 0 | int a3 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+4], &pix2[ 4*FDEC_STRIDE+4] ); |
240 | 0 | int a4 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+0], &pix2[ 8*FDEC_STRIDE+0] ); |
241 | 0 | int a5 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+4], &pix2[ 8*FDEC_STRIDE+4] ); |
242 | 0 | int a6 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+0], &pix2[12*FDEC_STRIDE+0] ); |
243 | 0 | int a7 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+4], &pix2[12*FDEC_STRIDE+4] ); |
244 | | |
245 | | /* 2x4 DC transform */ |
246 | 0 | int b0 = a0 + a1; |
247 | 0 | int b1 = a2 + a3; |
248 | 0 | int b2 = a4 + a5; |
249 | 0 | int b3 = a6 + a7; |
250 | 0 | int b4 = a0 - a1; |
251 | 0 | int b5 = a2 - a3; |
252 | 0 | int b6 = a4 - a5; |
253 | 0 | int b7 = a6 - a7; |
254 | 0 | a0 = b0 + b1; |
255 | 0 | a1 = b2 + b3; |
256 | 0 | a2 = b4 + b5; |
257 | 0 | a3 = b6 + b7; |
258 | 0 | a4 = b0 - b1; |
259 | 0 | a5 = b2 - b3; |
260 | 0 | a6 = b4 - b5; |
261 | 0 | a7 = b6 - b7; |
262 | 0 | dct[0] = a0 + a1; |
263 | 0 | dct[1] = a2 + a3; |
264 | 0 | dct[2] = a0 - a1; |
265 | 0 | dct[3] = a2 - a3; |
266 | 0 | dct[4] = a4 - a5; |
267 | 0 | dct[5] = a6 - a7; |
268 | 0 | dct[6] = a4 + a5; |
269 | 0 | dct[7] = a6 + a7; |
270 | 0 | } |
271 | | |
272 | | static void add4x4_idct( pixel *p_dst, dctcoef dct[16] ) |
273 | 0 | { |
274 | 0 | dctcoef d[16]; |
275 | 0 | dctcoef tmp[16]; |
276 | |
|
277 | 0 | for( int i = 0; i < 4; i++ ) |
278 | 0 | { |
279 | 0 | int s02 = dct[0*4+i] + dct[2*4+i]; |
280 | 0 | int d02 = dct[0*4+i] - dct[2*4+i]; |
281 | 0 | int s13 = dct[1*4+i] + (dct[3*4+i]>>1); |
282 | 0 | int d13 = (dct[1*4+i]>>1) - dct[3*4+i]; |
283 | |
|
284 | 0 | tmp[i*4+0] = s02 + s13; |
285 | 0 | tmp[i*4+1] = d02 + d13; |
286 | 0 | tmp[i*4+2] = d02 - d13; |
287 | 0 | tmp[i*4+3] = s02 - s13; |
288 | 0 | } |
289 | |
|
290 | 0 | for( int i = 0; i < 4; i++ ) |
291 | 0 | { |
292 | 0 | int s02 = tmp[0*4+i] + tmp[2*4+i]; |
293 | 0 | int d02 = tmp[0*4+i] - tmp[2*4+i]; |
294 | 0 | int s13 = tmp[1*4+i] + (tmp[3*4+i]>>1); |
295 | 0 | int d13 = (tmp[1*4+i]>>1) - tmp[3*4+i]; |
296 | |
|
297 | 0 | d[0*4+i] = ( s02 + s13 + 32 ) >> 6; |
298 | 0 | d[1*4+i] = ( d02 + d13 + 32 ) >> 6; |
299 | 0 | d[2*4+i] = ( d02 - d13 + 32 ) >> 6; |
300 | 0 | d[3*4+i] = ( s02 - s13 + 32 ) >> 6; |
301 | 0 | } |
302 | | |
303 | |
|
304 | 0 | for( int y = 0; y < 4; y++ ) |
305 | 0 | { |
306 | 0 | for( int x = 0; x < 4; x++ ) |
307 | 0 | p_dst[x] = x264_clip_pixel( p_dst[x] + d[y*4+x] ); |
308 | 0 | p_dst += FDEC_STRIDE; |
309 | 0 | } |
310 | 0 | } |
311 | | |
312 | | static void add8x8_idct( pixel *p_dst, dctcoef dct[4][16] ) |
313 | 0 | { |
314 | 0 | add4x4_idct( &p_dst[0], dct[0] ); |
315 | 0 | add4x4_idct( &p_dst[4], dct[1] ); |
316 | 0 | add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] ); |
317 | 0 | add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] ); |
318 | 0 | } |
319 | | |
320 | | static void add16x16_idct( pixel *p_dst, dctcoef dct[16][16] ) |
321 | 0 | { |
322 | 0 | add8x8_idct( &p_dst[0], &dct[0] ); |
323 | 0 | add8x8_idct( &p_dst[8], &dct[4] ); |
324 | 0 | add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] ); |
325 | 0 | add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] ); |
326 | 0 | } |
327 | | |
328 | | /**************************************************************************** |
329 | | * 8x8 transform: |
330 | | ****************************************************************************/ |
331 | | |
/* One 1D pass of the 8x8 forward DCT butterfly network.  The caller
 * defines SRC()/DST() index macros before expanding this, so the same
 * code serves both the vertical and the horizontal pass (see
 * sub8x8_dct8).  Declares its temporaries in a block scope. */
#define DCT8_1D {\
    int s07 = SRC(0) + SRC(7);\
    int s16 = SRC(1) + SRC(6);\
    int s25 = SRC(2) + SRC(5);\
    int s34 = SRC(3) + SRC(4);\
    int a0 = s07 + s34;\
    int a1 = s16 + s25;\
    int a2 = s07 - s34;\
    int a3 = s16 - s25;\
    int d07 = SRC(0) - SRC(7);\
    int d16 = SRC(1) - SRC(6);\
    int d25 = SRC(2) - SRC(5);\
    int d34 = SRC(3) - SRC(4);\
    int a4 = d16 + d25 + (d07 + (d07>>1));\
    int a5 = d07 - d34 - (d25 + (d25>>1));\
    int a6 = d07 + d34 - (d16 + (d16>>1));\
    int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0) = a0 + a1     ;\
    DST(1) = a4 + (a7>>2);\
    DST(2) = a2 + (a3>>1);\
    DST(3) = a5 + (a6>>2);\
    DST(4) = a0 - a1     ;\
    DST(5) = a6 - (a5>>2);\
    DST(6) = (a2>>1) - a3 ;\
    DST(7) = (a4>>2) - a7 ;\
}
358 | | |
/* 8x8 forward DCT of the residual between an FENC and an FDEC block.
 * Two expansions of DCT8_1D: the first runs down the columns of tmp in
 * place, the second runs across the rows, transposing into dct[]. */
static void sub8x8_dct8( dctcoef dct[64], pixel *pix1, pixel *pix2 )
{
    dctcoef tmp[64];

    pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );

/* Pass 1: vertical (SRC/DST both index column i of tmp). */
#define SRC(x) tmp[x*8+i]
#define DST(x) tmp[x*8+i]
    for( int i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

/* Pass 2: horizontal, output written transposed into dct. */
#define SRC(x) tmp[i*8+x]
#define DST(x) dct[x*8+i]
    for( int i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
}
379 | | |
380 | | static void sub16x16_dct8( dctcoef dct[4][64], pixel *pix1, pixel *pix2 ) |
381 | 0 | { |
382 | 0 | sub8x8_dct8( dct[0], &pix1[0], &pix2[0] ); |
383 | 0 | sub8x8_dct8( dct[1], &pix1[8], &pix2[8] ); |
384 | 0 | sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] ); |
385 | 0 | sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] ); |
386 | 0 | } |
387 | | |
/* One 1D pass of the 8x8 inverse DCT.  SRC(x) is an expression macro
 * and DST(x,rhs) a statement macro, both supplied by the caller; the
 * second form lets the final pass fold clipping into the store (see
 * add8x8_idct8). */
#define IDCT8_1D {\
    int a0 =  SRC(0) + SRC(4);\
    int a2 =  SRC(0) - SRC(4);\
    int a4 = (SRC(2)>>1) - SRC(6);\
    int a6 = (SRC(6)>>1) + SRC(2);\
    int b0 = a0 + a6;\
    int b2 = a2 + a4;\
    int b4 = a2 - a4;\
    int b6 = a0 - a6;\
    int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
    int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
    int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
    int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
    int b1 = (a7>>2) + a1;\
    int b3 =  a3 + (a5>>2);\
    int b5 = (a3>>2) - a5;\
    int b7 =  a7 - (a1>>2);\
    DST(0, b0 + b7);\
    DST(1, b2 + b5);\
    DST(2, b4 + b3);\
    DST(3, b6 + b1);\
    DST(4, b6 - b1);\
    DST(5, b4 - b3);\
    DST(6, b2 - b5);\
    DST(7, b0 - b7);\
}
414 | | |
/* 8x8 inverse DCT of dct[], added (with clipping) onto the prediction
 * in dst.  Note dct[] is modified: the rounding bias is folded into
 * dct[0] and the first pass overwrites the array in place. */
static void add8x8_idct8( pixel *dst, dctcoef dct[64] )
{
    dct[0] += 32; // rounding for the >>6 at the end

/* Pass 1: vertical, in place. */
#define SRC(x)     dct[x*8+i]
#define DST(x,rhs) dct[x*8+i] = (rhs)
    for( int i = 0; i < 8; i++ )
        IDCT8_1D
#undef SRC
#undef DST

/* Pass 2: horizontal; normalize (>>6), add to the prediction, clip. */
#define SRC(x)     dct[i*8+x]
#define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_pixel( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
    for( int i = 0; i < 8; i++ )
        IDCT8_1D
#undef SRC
#undef DST
}
433 | | |
434 | | static void add16x16_idct8( pixel *dst, dctcoef dct[4][64] ) |
435 | 0 | { |
436 | 0 | add8x8_idct8( &dst[0], dct[0] ); |
437 | 0 | add8x8_idct8( &dst[8], dct[1] ); |
438 | 0 | add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] ); |
439 | 0 | add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] ); |
440 | 0 | } |
441 | | |
442 | | static inline void add4x4_idct_dc( pixel *p_dst, dctcoef dc ) |
443 | 0 | { |
444 | 0 | dc = (dc + 32) >> 6; |
445 | 0 | for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE ) |
446 | 0 | { |
447 | 0 | p_dst[0] = x264_clip_pixel( p_dst[0] + dc ); |
448 | 0 | p_dst[1] = x264_clip_pixel( p_dst[1] + dc ); |
449 | 0 | p_dst[2] = x264_clip_pixel( p_dst[2] + dc ); |
450 | 0 | p_dst[3] = x264_clip_pixel( p_dst[3] + dc ); |
451 | 0 | } |
452 | 0 | } |
453 | | |
454 | | static void add8x8_idct_dc( pixel *p_dst, dctcoef dct[4] ) |
455 | 0 | { |
456 | 0 | add4x4_idct_dc( &p_dst[0], dct[0] ); |
457 | 0 | add4x4_idct_dc( &p_dst[4], dct[1] ); |
458 | 0 | add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] ); |
459 | 0 | add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] ); |
460 | 0 | } |
461 | | |
462 | | static void add16x16_idct_dc( pixel *p_dst, dctcoef dct[16] ) |
463 | 0 | { |
464 | 0 | for( int i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE ) |
465 | 0 | { |
466 | 0 | add4x4_idct_dc( &p_dst[ 0], dct[0] ); |
467 | 0 | add4x4_idct_dc( &p_dst[ 4], dct[1] ); |
468 | 0 | add4x4_idct_dc( &p_dst[ 8], dct[2] ); |
469 | 0 | add4x4_idct_dc( &p_dst[12], dct[3] ); |
470 | 0 | } |
471 | 0 | } |
472 | | |
473 | | |
/****************************************************************************
 * x264_dct_init:
 ****************************************************************************/
/* Fill dctf with the transform implementations for this build/CPU.
 * The portable C versions above are installed first; SIMD versions
 * enabled by bits in `cpu` then overwrite them, so later (more capable)
 * instruction sets take priority.  The available overrides differ
 * between 8-bit and high-bit-depth builds. */
void x264_dct_init( uint32_t cpu, x264_dct_function_t *dctf )
{
    /* Baseline: plain C implementations. */
    dctf->sub4x4_dct = sub4x4_dct;
    dctf->add4x4_idct = add4x4_idct;

    dctf->sub8x8_dct = sub8x8_dct;
    dctf->sub8x8_dct_dc = sub8x8_dct_dc;
    dctf->add8x8_idct = add8x8_idct;
    dctf->add8x8_idct_dc = add8x8_idct_dc;

    dctf->sub8x16_dct_dc = sub8x16_dct_dc;

    dctf->sub16x16_dct = sub16x16_dct;
    dctf->add16x16_idct = add16x16_idct;
    dctf->add16x16_idct_dc = add16x16_idct_dc;

    dctf->sub8x8_dct8 = sub8x8_dct8;
    dctf->add8x8_idct8 = add8x8_idct8;

    dctf->sub16x16_dct8 = sub16x16_dct8;
    dctf->add16x16_idct8 = add16x16_idct8;

    dctf->dct4x4dc = dct4x4dc;
    dctf->idct4x4dc = idct4x4dc;

    dctf->dct2x4dc = dct2x4dc;

#if HIGH_BIT_DEPTH
#if HAVE_MMX
    if( cpu&X264_CPU_MMX )
    {
        dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
        dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
        dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
    }
    if( cpu&X264_CPU_SSE2 )
    {
        dctf->add4x4_idct = x264_add4x4_idct_sse2;
        dctf->dct4x4dc = x264_dct4x4dc_sse2;
        dctf->idct4x4dc = x264_idct4x4dc_sse2;
        dctf->dct2x4dc = x264_dct2x4dc_sse2;
        dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
        dctf->add8x8_idct = x264_add8x8_idct_sse2;
        dctf->add16x16_idct = x264_add16x16_idct_sse2;
        dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
        dctf->add16x16_idct8 = x264_add16x16_idct8_sse2;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_sse2;
        dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_sse2;
        dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2;
    }
    if( cpu&X264_CPU_SSE4 )
    {
        dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse4;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse4;
    }
    if( cpu&X264_CPU_AVX )
    {
        dctf->add4x4_idct = x264_add4x4_idct_avx;
        dctf->dct4x4dc = x264_dct4x4dc_avx;
        dctf->idct4x4dc = x264_idct4x4dc_avx;
        dctf->dct2x4dc = x264_dct2x4dc_avx;
        dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
        dctf->add8x8_idct = x264_add8x8_idct_avx;
        dctf->add16x16_idct = x264_add16x16_idct_avx;
        dctf->add8x8_idct8 = x264_add8x8_idct8_avx;
        dctf->add16x16_idct8 = x264_add16x16_idct8_avx;
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_avx;
        dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx;
        dctf->add16x16_idct_dc= x264_add16x16_idct_dc_avx;
    }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
    if( cpu&X264_CPU_MMX )
    {
        dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
        dctf->add4x4_idct = x264_add4x4_idct_mmx;
        dctf->idct4x4dc = x264_idct4x4dc_mmx;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmx2;

#if !ARCH_X86_64
        dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
        dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
        dctf->add8x8_idct = x264_add8x8_idct_mmx;
        dctf->add16x16_idct = x264_add16x16_idct_mmx;

        dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmx;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
        dctf->add8x8_idct8 = x264_add8x8_idct8_mmx;
        dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
#endif
    }

    if( cpu&X264_CPU_MMX2 )
    {
        dctf->dct4x4dc = x264_dct4x4dc_mmx2;
        dctf->dct2x4dc = x264_dct2x4dc_mmx2;
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx2;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx2;
    }

    if( cpu&X264_CPU_SSE2 )
    {
        dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
        dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_sse2;
        dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
        dctf->add16x16_idct8= x264_add16x16_idct8_sse2;

        if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
        {
            dctf->sub8x8_dct = x264_sub8x8_dct_sse2;
            dctf->sub16x16_dct = x264_sub16x16_dct_sse2;
            dctf->add8x8_idct = x264_add8x8_idct_sse2;
            dctf->add16x16_idct = x264_add16x16_idct_sse2;
            dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
        }
    }

    if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
    {
        dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3;
        if( !(cpu&X264_CPU_SLOW_ATOM) )
        {
            dctf->sub4x4_dct = x264_sub4x4_dct_ssse3;
            dctf->sub8x8_dct = x264_sub8x8_dct_ssse3;
            dctf->sub16x16_dct = x264_sub16x16_dct_ssse3;
            dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3;
            dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
            if( !(cpu&X264_CPU_SLOW_PSHUFB) )
            {
                dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
                dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
            }
        }
    }

    if( cpu&X264_CPU_SSE4 )
        dctf->add4x4_idct = x264_add4x4_idct_sse4;

    if( cpu&X264_CPU_AVX )
    {
        dctf->add4x4_idct = x264_add4x4_idct_avx;
        dctf->add8x8_idct = x264_add8x8_idct_avx;
        dctf->add16x16_idct = x264_add16x16_idct_avx;
        dctf->add8x8_idct8 = x264_add8x8_idct8_avx;
        dctf->add16x16_idct8 = x264_add16x16_idct8_avx;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx;
        dctf->sub8x8_dct = x264_sub8x8_dct_avx;
        dctf->sub16x16_dct = x264_sub16x16_dct_avx;
        dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
    }

    if( cpu&X264_CPU_XOP )
    {
        dctf->sub8x8_dct = x264_sub8x8_dct_xop;
        dctf->sub16x16_dct = x264_sub16x16_dct_xop;
    }

    if( cpu&X264_CPU_AVX2 )
    {
        dctf->add8x8_idct = x264_add8x8_idct_avx2;
        dctf->add16x16_idct = x264_add16x16_idct_avx2;
        dctf->sub8x8_dct = x264_sub8x8_dct_avx2;
        dctf->sub16x16_dct = x264_sub16x16_dct_avx2;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx2;
#if ARCH_X86_64
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx2;
#endif
    }

    if( cpu&X264_CPU_AVX512 )
    {
        dctf->sub4x4_dct = x264_sub4x4_dct_avx512;
        dctf->sub8x8_dct = x264_sub8x8_dct_avx512;
        dctf->sub16x16_dct = x264_sub16x16_dct_avx512;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_avx512;
        dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx512;
        dctf->add8x8_idct = x264_add8x8_idct_avx512;
    }
#endif //HAVE_MMX

#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
    {
        dctf->sub4x4_dct = x264_sub4x4_dct_altivec;
        dctf->sub8x8_dct = x264_sub8x8_dct_altivec;
        dctf->sub16x16_dct = x264_sub16x16_dct_altivec;

        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_altivec;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_altivec;

        dctf->add4x4_idct = x264_add4x4_idct_altivec;
        dctf->add8x8_idct = x264_add8x8_idct_altivec;
        dctf->add16x16_idct = x264_add16x16_idct_altivec;

        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_altivec;
        dctf->sub8x8_dct8 = x264_sub8x8_dct8_altivec;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;

        dctf->add8x8_idct8 = x264_add8x8_idct8_altivec;
        dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
    }
#endif

#if HAVE_ARMV6 || HAVE_AARCH64
    if( cpu&X264_CPU_NEON )
    {
        dctf->sub4x4_dct = x264_sub4x4_dct_neon;
        dctf->sub8x8_dct = x264_sub8x8_dct_neon;
        dctf->sub16x16_dct = x264_sub16x16_dct_neon;
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
        dctf->dct4x4dc = x264_dct4x4dc_neon;
        dctf->idct4x4dc = x264_idct4x4dc_neon;

        dctf->add4x4_idct = x264_add4x4_idct_neon;
        dctf->add8x8_idct = x264_add8x8_idct_neon;
        dctf->add16x16_idct = x264_add16x16_idct_neon;

        dctf->sub8x8_dct8 = x264_sub8x8_dct8_neon;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;

        dctf->add8x8_idct8 = x264_add8x8_idct8_neon;
        dctf->add16x16_idct8= x264_add16x16_idct8_neon;
        dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_neon;
    }
#if HAVE_SVE
    if ( cpu&X264_CPU_SVE )
    {
        dctf->sub4x4_dct = x264_sub4x4_dct_sve;
    }
#endif
#if HAVE_SVE2
    if ( cpu&X264_CPU_SVE2 )
    {
        dctf->add4x4_idct = x264_add4x4_idct_sve2;
    }
#endif
#endif

#if HAVE_MSA
    if( cpu&X264_CPU_MSA )
    {
        dctf->sub4x4_dct = x264_sub4x4_dct_msa;
        dctf->sub8x8_dct = x264_sub8x8_dct_msa;
        dctf->sub16x16_dct = x264_sub16x16_dct_msa;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_msa;
        dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_msa;
        dctf->dct4x4dc = x264_dct4x4dc_msa;
        dctf->idct4x4dc = x264_idct4x4dc_msa;
        dctf->add4x4_idct = x264_add4x4_idct_msa;
        dctf->add8x8_idct = x264_add8x8_idct_msa;
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_msa;
        dctf->add16x16_idct = x264_add16x16_idct_msa;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_msa;
        dctf->add8x8_idct8 = x264_add8x8_idct8_msa;
        dctf->add16x16_idct8 = x264_add16x16_idct8_msa;
    }
#endif

#if HAVE_LSX
    if( cpu&X264_CPU_LSX )
    {
        dctf->sub4x4_dct = x264_sub4x4_dct_lsx;
        dctf->add4x4_idct = x264_add4x4_idct_lsx;
        dctf->dct4x4dc = x264_dct4x4dc_lsx;
        dctf->idct4x4dc = x264_idct4x4dc_lsx;
        dctf->sub8x8_dct8 = x264_sub8x8_dct8_lsx;
        dctf->sub8x8_dct = x264_sub8x8_dct_lsx;
        dctf->add8x8_idct = x264_add8x8_idct_lsx;
        dctf->add8x8_idct8 = x264_add8x8_idct8_lsx;
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_lsx;
        dctf->add16x16_idct = x264_add16x16_idct_lsx;
        dctf->sub16x16_dct = x264_sub16x16_dct_lsx;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_lsx;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_lsx;
    }
    if( cpu&X264_CPU_LASX )
    {
        dctf->sub8x8_dct = x264_sub8x8_dct_lasx;
        dctf->sub16x16_dct = x264_sub16x16_dct_lasx;
        dctf->add8x8_idct = x264_add8x8_idct_lasx;
        dctf->add8x8_idct8 = x264_add8x8_idct8_lasx;
        dctf->add16x16_idct = x264_add16x16_idct_lasx;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_lasx;
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_lasx;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_lasx;
        dctf->dct4x4dc = x264_dct4x4dc_lasx;
        dctf->idct4x4dc = x264_idct4x4dc_lasx;
    }
#endif

#endif // HIGH_BIT_DEPTH
}
778 | | |
779 | | |
/* ZIG(i,y,x): one scan step -- stores dct[x*8+y] (note the swapped y/x
 * indexing) at scan position i of level[].  The tables below spell out
 * the full scan orders used by the zigzag_scan_* functions. */
#define ZIG(i,y,x) level[i] = dct[x*8+y];
/* 8x8 progressive (frame) scan order. */
#define ZIGZAG8_FRAME\
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
    ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
    ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
    ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
    ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
    ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
    ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
    ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
    ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
    ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
    ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
    ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
    ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
    ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)

/* 8x8 interlaced (field) scan order. */
#define ZIGZAG8_FIELD\
    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
    ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
    ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
    ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
    ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
    ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
    ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
    ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
    ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
    ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
    ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
    ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
    ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
    ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
    ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
    ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)

/* 4x4 frame scan; position 0 goes through ZIGDC (see redefinition below). */
#define ZIGZAG4_FRAME\
    ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
    ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)

/* 4x4 field scan. */
#define ZIGZAG4_FIELD\
    ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
    ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
    ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
    ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
828 | | |
/* Reorder an 8x8 block of DCT coefficients into frame (progressive) zigzag
 * scan order.  Expands the ZIGZAG8_FRAME table with the scan-flavored ZIG
 * macro defined earlier in this file. */
static void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[64] )
{
    ZIGZAG8_FRAME
}
833 | | |
/* Reorder an 8x8 block of DCT coefficients into field (interlaced) zigzag
 * scan order, using the same scan-flavored ZIG macro as the frame variant. */
static void zigzag_scan_8x8_field( dctcoef level[64], dctcoef dct[64] )
{
    ZIGZAG8_FIELD
}
838 | | |
/* Redefine ZIG for the 4x4 scan functions below: copy coefficient
 * dct[x*4+y] into scan position i (4-wide layout instead of the 8-wide
 * layout used by the 8x8 scans above).  DC needs no special handling for a
 * plain scan, so ZIGDC is just ZIG. */
#undef ZIG
#define ZIG(i,y,x) level[i] = dct[x*4+y];
#define ZIGDC(i,y,x) ZIG(i,y,x)
842 | | |
/* Reorder a 4x4 block of DCT coefficients into frame zigzag scan order. */
static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] )
{
    ZIGZAG4_FRAME
}
847 | | |
/* Reorder a 4x4 block of DCT coefficients into field zigzag scan order.
 * Per the ZIGZAG4_FIELD table, scan positions 0-1 and 6-15 are identical to
 * the input order, so only positions 2-5 need explicit permutation; the
 * rest is bulk-copied with memcpy. */
static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
{
    memcpy( level, dct, 2 * sizeof(dctcoef) );
    ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
    memcpy( level+6, dct+6, 10 * sizeof(dctcoef) );
}
854 | | |
/* Redefine ZIG for the fused subtract+scan functions below: compute the
 * residual p_src - p_dst for pixel (y,x) directly into scan position i, and
 * accumulate it into 'nz' so the caller can report whether the block has any
 * nonzero coefficient. */
#undef ZIG
#define ZIG(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    level[i] = p_src[oe] - p_dst[od];\
    nz |= level[i];\
}
/* Copy a 4x4 pixel block from the encode buffer into the decode
 * (reconstruction) buffer, one row per CPPIXEL_X4 call. */
#define COPY4x4\
    CPPIXEL_X4( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
/* 8-pixel row copy built from two 4-pixel copies. */
#define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) )
/* Copy an 8x8 pixel block, one row per CPPIXEL_X8 call. */
#define COPY8x8\
    CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
877 | | |
/* Compute the 4x4 residual p_src - p_dst directly in frame zigzag order,
 * copy the source pixels into the reconstruction buffer (skip-style
 * reconstruction), and return 1 if any residual coefficient is nonzero. */
static int zigzag_sub_4x4_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG4_FRAME
    COPY4x4
    return !!nz;
}
885 | | |
/* Field-scan counterpart of zigzag_sub_4x4_frame: residual in field zigzag
 * order, source copied to the reconstruction buffer, returns !!nonzero. */
static int zigzag_sub_4x4_field( dctcoef level[16], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG4_FIELD
    COPY4x4
    return !!nz;
}
893 | | |
/* Redefine ZIGDC for the *_4x4ac variants: route the DC residual to the
 * separate *dc output and force level[0] to zero.  The DC value is not
 * OR-ed into 'nz', so the returned flag reflects AC coefficients only. */
#undef ZIGDC
#define ZIGDC(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    *dc = p_src[oe] - p_dst[od];\
    level[0] = 0;\
}
901 | | |
/* Like zigzag_sub_4x4_frame, but the DC residual is written to *dc and
 * level[0] is zeroed (via the ZIGDC redefinition above); the return value
 * reflects only the AC coefficients. */
static int zigzag_sub_4x4ac_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
{
    int nz = 0;
    ZIGZAG4_FRAME
    COPY4x4
    return !!nz;
}
909 | | |
/* Field-scan counterpart of zigzag_sub_4x4ac_frame: DC residual to *dc,
 * level[0] zeroed, return flag covers AC coefficients only. */
static int zigzag_sub_4x4ac_field( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
{
    int nz = 0;
    ZIGZAG4_FIELD
    COPY4x4
    return !!nz;
}
917 | | |
/* 8x8 fused subtract+scan (frame order): residual p_src - p_dst in zigzag
 * order, source copied into the reconstruction buffer, returns !!nonzero. */
static int zigzag_sub_8x8_frame( dctcoef level[64], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG8_FRAME
    COPY8x8
    return !!nz;
}
/* 8x8 fused subtract+scan, field order. */
static int zigzag_sub_8x8_field( dctcoef level[64], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG8_FIELD
    COPY8x8
    return !!nz;
}
932 | | |
/* NOTE(review): only ZIG and COPY4x4 are undefined here; COPY8x8,
 * CPPIXEL_X8 and ZIGDC remain in scope for the rest of the file.  Harmless
 * as nothing below this point expands them -- confirm before reusing the
 * names in new code. */
#undef ZIG
#undef COPY4x4
935 | | |
936 | | static void zigzag_interleave_8x8_cavlc( dctcoef *dst, dctcoef *src, uint8_t *nnz ) |
937 | 0 | { |
938 | 0 | for( int i = 0; i < 4; i++ ) |
939 | 0 | { |
940 | 0 | int nz = 0; |
941 | 0 | for( int j = 0; j < 16; j++ ) |
942 | 0 | { |
943 | 0 | nz |= src[i+j*4]; |
944 | 0 | dst[i*16+j] = src[i+j*4]; |
945 | 0 | } |
946 | 0 | nnz[(i&1) + (i>>1)*8] = !!nz; |
947 | 0 | } |
948 | 0 | } |
949 | | |
/****************************************************************************
 * x264_zigzag_init:
 * Populate the progressive (frame) and interlaced (field) zigzag function
 * tables.  Portable C implementations are installed first, then individual
 * entries are overridden by platform-specific assembly versions selected by
 * the capability bits in 'cpu'.  Later checks override earlier ones, so the
 * blocks are ordered from least to most preferred implementation.
 ****************************************************************************/
void x264_zigzag_init( uint32_t cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced )
{
    /* C fallbacks -- always valid, possibly replaced below. */
    pf_interlaced->scan_8x8 = zigzag_scan_8x8_field;
    pf_progressive->scan_8x8 = zigzag_scan_8x8_frame;
    pf_interlaced->scan_4x4 = zigzag_scan_4x4_field;
    pf_progressive->scan_4x4 = zigzag_scan_4x4_frame;
    pf_interlaced->sub_8x8 = zigzag_sub_8x8_field;
    pf_progressive->sub_8x8 = zigzag_sub_8x8_frame;
    pf_interlaced->sub_4x4 = zigzag_sub_4x4_field;
    pf_progressive->sub_4x4 = zigzag_sub_4x4_frame;
    pf_interlaced->sub_4x4ac = zigzag_sub_4x4ac_field;
    pf_progressive->sub_4x4ac = zigzag_sub_4x4ac_frame;

#if HIGH_BIT_DEPTH
    /* High-bit-depth scan overrides (x86 only). */
#if HAVE_MMX
    if( cpu&X264_CPU_SSE2 )
    {
        pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_sse2;
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
    }
    if( cpu&X264_CPU_SSE4 )
        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_sse4;
    if( cpu&X264_CPU_AVX )
        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx;
#if ARCH_X86_64
    if( cpu&X264_CPU_AVX )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx;
    }
#endif // ARCH_X86_64
    if( cpu&X264_CPU_AVX512 )
    {
        pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_avx512;
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512;
        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx512;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512;
    }
#endif // HAVE_MMX
#else
    /* 8-bit-depth scan/sub overrides. */
#if HAVE_MMX
    if( cpu&X264_CPU_MMX )
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
    if( cpu&X264_CPU_MMX2 )
    {
        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_mmx2;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmx2;
    }
    if( cpu&X264_CPU_SSE )
        pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_sse;
    if( cpu&X264_CPU_SSE2_IS_FAST )
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
    if( cpu&X264_CPU_SSSE3 )
    {
        pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3;
        pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
        pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_ssse3;
        pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
        if( !(cpu&X264_CPU_SLOW_SHUFFLE) )
            pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
    }
    if( cpu&X264_CPU_AVX )
    {
        pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_avx;
        pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_avx;
#if ARCH_X86_64
        pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_avx;
        pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx;
#endif
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
    }
    if( cpu&X264_CPU_XOP )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_xop;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_xop;
        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_xop;
    }
    if( cpu&X264_CPU_AVX512 )
    {
        pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_avx512;
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512;
        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx512;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512;
    }
#endif // HAVE_MMX
#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
    {
        pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_altivec;
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_altivec;
    }
#endif
#if HAVE_ARMV6 || HAVE_AARCH64
    if( cpu&X264_CPU_NEON )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
        /* Only the 4x4 frame scan exists for 32-bit ARM NEON; the rest are
         * AArch64-only. */
#if HAVE_AARCH64
        pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_neon;
        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_neon;
        pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_neon;
        pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_neon;
        pf_interlaced->sub_8x8 = x264_zigzag_sub_8x8_field_neon;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_neon;
        pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_neon;
        pf_progressive->sub_4x4ac = x264_zigzag_sub_4x4ac_frame_neon;
        pf_progressive->sub_8x8 = x264_zigzag_sub_8x8_frame_neon;
#endif // HAVE_AARCH64
    }
#endif // HAVE_ARMV6 || HAVE_AARCH64
#endif // HIGH_BIT_DEPTH

    /* interleave_8x8_cavlc: same C fallback for both scan modes, then
     * platform overrides. */
    pf_interlaced->interleave_8x8_cavlc =
    pf_progressive->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
#if HAVE_MMX
#if HIGH_BIT_DEPTH
    if( cpu&X264_CPU_SSE2 )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
    }
    if( cpu&X264_CPU_AVX )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
    }
    if( cpu&X264_CPU_AVX512 )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx512;
    }
#else
    if( cpu&X264_CPU_MMX )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
    }
    /* SSE2 version is only a win when shuffles are fast on this CPU. */
    if( (cpu&X264_CPU_SSE2) && !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SSE2_IS_SLOW)) )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
    }

    if( cpu&X264_CPU_AVX )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
    }

    if( cpu&X264_CPU_AVX2 )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2;
    }
    if( cpu&X264_CPU_AVX512 )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx512;
    }
#endif // HIGH_BIT_DEPTH
#endif
#if !HIGH_BIT_DEPTH
#if HAVE_AARCH64
    if( cpu&X264_CPU_NEON )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_neon;
    }
#if HAVE_SVE
    if( cpu&X264_CPU_SVE )
    {
        /* NOTE: SVE override installed for progressive only. */
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sve;
    }
#endif
#endif // HAVE_AARCH64

#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_altivec;
    }
#endif // HAVE_ALTIVEC

#if HAVE_MSA
    if( cpu&X264_CPU_MSA )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_msa;
    }
#endif

#if HAVE_LSX
    if( cpu&X264_CPU_LASX )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_lasx;
    }
#endif
#endif // !HIGH_BIT_DEPTH
}