Line | Count | Source |
1 | | /***************************************************************************** |
2 | | * dct.c: transform and zigzag |
3 | | ***************************************************************************** |
4 | | * Copyright (C) 2003-2025 x264 project |
5 | | * |
6 | | * Authors: Loren Merritt <lorenm@u.washington.edu> |
7 | | * Laurent Aimar <fenrir@via.ecp.fr> |
8 | | * Henrik Gramner <henrik@gramner.com> |
9 | | * |
10 | | * This program is free software; you can redistribute it and/or modify |
11 | | * it under the terms of the GNU General Public License as published by |
12 | | * the Free Software Foundation; either version 2 of the License, or |
13 | | * (at your option) any later version. |
14 | | * |
15 | | * This program is distributed in the hope that it will be useful, |
16 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
18 | | * GNU General Public License for more details. |
19 | | * |
20 | | * You should have received a copy of the GNU General Public License |
21 | | * along with this program; if not, write to the Free Software |
22 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. |
23 | | * |
24 | | * This program is also available under a commercial proprietary license. |
25 | | * For more information, contact us at licensing@x264.com. |
26 | | *****************************************************************************/ |
27 | | |
28 | | #include "common.h" |
29 | | #if HAVE_MMX |
30 | | # include "x86/dct.h" |
31 | | #endif |
32 | | #if HAVE_ALTIVEC |
33 | | # include "ppc/dct.h" |
34 | | #endif |
35 | | #if HAVE_ARMV6 |
36 | | # include "arm/dct.h" |
37 | | #endif |
38 | | #if HAVE_AARCH64 |
39 | | # include "aarch64/dct.h" |
40 | | #endif |
41 | | #if HAVE_MSA |
42 | | # include "mips/dct.h" |
43 | | #endif |
44 | | #if HAVE_LSX |
45 | | # include "loongarch/dct.h" |
46 | | #endif |
47 | | static void dct4x4dc( dctcoef d[16] ) |
48 | 30.2k | { |
49 | 30.2k | dctcoef tmp[16]; |
50 | | |
51 | 151k | for( int i = 0; i < 4; i++ ) |
52 | 121k | { |
53 | 121k | int s01 = d[i*4+0] + d[i*4+1]; |
54 | 121k | int d01 = d[i*4+0] - d[i*4+1]; |
55 | 121k | int s23 = d[i*4+2] + d[i*4+3]; |
56 | 121k | int d23 = d[i*4+2] - d[i*4+3]; |
57 | | |
58 | 121k | tmp[0*4+i] = s01 + s23; |
59 | 121k | tmp[1*4+i] = s01 - s23; |
60 | 121k | tmp[2*4+i] = d01 - d23; |
61 | 121k | tmp[3*4+i] = d01 + d23; |
62 | 121k | } |
63 | | |
64 | 151k | for( int i = 0; i < 4; i++ ) |
65 | 121k | { |
66 | 121k | int s01 = tmp[i*4+0] + tmp[i*4+1]; |
67 | 121k | int d01 = tmp[i*4+0] - tmp[i*4+1]; |
68 | 121k | int s23 = tmp[i*4+2] + tmp[i*4+3]; |
69 | 121k | int d23 = tmp[i*4+2] - tmp[i*4+3]; |
70 | | |
71 | 121k | d[i*4+0] = ( s01 + s23 + 1 ) >> 1; |
72 | 121k | d[i*4+1] = ( s01 - s23 + 1 ) >> 1; |
73 | 121k | d[i*4+2] = ( d01 - d23 + 1 ) >> 1; |
74 | 121k | d[i*4+3] = ( d01 + d23 + 1 ) >> 1; |
75 | 121k | } |
76 | 30.2k | } |
77 | | |
78 | | static void idct4x4dc( dctcoef d[16] ) |
79 | 232 | { |
80 | 232 | dctcoef tmp[16]; |
81 | | |
82 | 1.16k | for( int i = 0; i < 4; i++ ) |
83 | 928 | { |
84 | 928 | int s01 = d[i*4+0] + d[i*4+1]; |
85 | 928 | int d01 = d[i*4+0] - d[i*4+1]; |
86 | 928 | int s23 = d[i*4+2] + d[i*4+3]; |
87 | 928 | int d23 = d[i*4+2] - d[i*4+3]; |
88 | | |
89 | 928 | tmp[0*4+i] = s01 + s23; |
90 | 928 | tmp[1*4+i] = s01 - s23; |
91 | 928 | tmp[2*4+i] = d01 - d23; |
92 | 928 | tmp[3*4+i] = d01 + d23; |
93 | 928 | } |
94 | | |
95 | 1.16k | for( int i = 0; i < 4; i++ ) |
96 | 928 | { |
97 | 928 | int s01 = tmp[i*4+0] + tmp[i*4+1]; |
98 | 928 | int d01 = tmp[i*4+0] - tmp[i*4+1]; |
99 | 928 | int s23 = tmp[i*4+2] + tmp[i*4+3]; |
100 | 928 | int d23 = tmp[i*4+2] - tmp[i*4+3]; |
101 | | |
102 | 928 | d[i*4+0] = s01 + s23; |
103 | 928 | d[i*4+1] = s01 - s23; |
104 | 928 | d[i*4+2] = d01 - d23; |
105 | 928 | d[i*4+3] = d01 + d23; |
106 | 928 | } |
107 | 232 | } |
108 | | |
109 | | static void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] ) |
110 | 0 | { |
111 | 0 | int a0 = dct4x4[0][0] + dct4x4[1][0]; |
112 | 0 | int a1 = dct4x4[2][0] + dct4x4[3][0]; |
113 | 0 | int a2 = dct4x4[4][0] + dct4x4[5][0]; |
114 | 0 | int a3 = dct4x4[6][0] + dct4x4[7][0]; |
115 | 0 | int a4 = dct4x4[0][0] - dct4x4[1][0]; |
116 | 0 | int a5 = dct4x4[2][0] - dct4x4[3][0]; |
117 | 0 | int a6 = dct4x4[4][0] - dct4x4[5][0]; |
118 | 0 | int a7 = dct4x4[6][0] - dct4x4[7][0]; |
119 | 0 | int b0 = a0 + a1; |
120 | 0 | int b1 = a2 + a3; |
121 | 0 | int b2 = a4 + a5; |
122 | 0 | int b3 = a6 + a7; |
123 | 0 | int b4 = a0 - a1; |
124 | 0 | int b5 = a2 - a3; |
125 | 0 | int b6 = a4 - a5; |
126 | 0 | int b7 = a6 - a7; |
127 | 0 | dct[0] = b0 + b1; |
128 | 0 | dct[1] = b2 + b3; |
129 | 0 | dct[2] = b0 - b1; |
130 | 0 | dct[3] = b2 - b3; |
131 | 0 | dct[4] = b4 - b5; |
132 | 0 | dct[5] = b6 - b7; |
133 | 0 | dct[6] = b4 + b5; |
134 | 0 | dct[7] = b6 + b7; |
135 | 0 | dct4x4[0][0] = 0; |
136 | 0 | dct4x4[1][0] = 0; |
137 | 0 | dct4x4[2][0] = 0; |
138 | 0 | dct4x4[3][0] = 0; |
139 | 0 | dct4x4[4][0] = 0; |
140 | 0 | dct4x4[5][0] = 0; |
141 | 0 | dct4x4[6][0] = 0; |
142 | 0 | dct4x4[7][0] = 0; |
143 | 0 | } |
144 | | |
145 | | static inline void pixel_sub_wxh( dctcoef *diff, int i_size, |
146 | | pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 ) |
147 | 729k | { |
148 | 3.64M | for( int y = 0; y < i_size; y++ ) |
149 | 2.91M | { |
150 | 14.5M | for( int x = 0; x < i_size; x++ ) |
151 | 11.6M | diff[x + y*i_size] = pix1[x] - pix2[x]; |
152 | 2.91M | pix1 += i_pix1; |
153 | 2.91M | pix2 += i_pix2; |
154 | 2.91M | } |
155 | 729k | } |
156 | | |
157 | | static void sub4x4_dct( dctcoef dct[16], pixel *pix1, pixel *pix2 ) |
158 | 729k | { |
159 | 729k | dctcoef d[16]; |
160 | 729k | dctcoef tmp[16]; |
161 | | |
162 | 729k | pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE ); |
163 | | |
164 | 3.64M | for( int i = 0; i < 4; i++ ) |
165 | 2.91M | { |
166 | 2.91M | int s03 = d[i*4+0] + d[i*4+3]; |
167 | 2.91M | int s12 = d[i*4+1] + d[i*4+2]; |
168 | 2.91M | int d03 = d[i*4+0] - d[i*4+3]; |
169 | 2.91M | int d12 = d[i*4+1] - d[i*4+2]; |
170 | | |
171 | 2.91M | tmp[0*4+i] = s03 + s12; |
172 | 2.91M | tmp[1*4+i] = 2*d03 + d12; |
173 | 2.91M | tmp[2*4+i] = s03 - s12; |
174 | 2.91M | tmp[3*4+i] = d03 - 2*d12; |
175 | 2.91M | } |
176 | | |
177 | 3.64M | for( int i = 0; i < 4; i++ ) |
178 | 2.91M | { |
179 | 2.91M | int s03 = tmp[i*4+0] + tmp[i*4+3]; |
180 | 2.91M | int s12 = tmp[i*4+1] + tmp[i*4+2]; |
181 | 2.91M | int d03 = tmp[i*4+0] - tmp[i*4+3]; |
182 | 2.91M | int d12 = tmp[i*4+1] - tmp[i*4+2]; |
183 | | |
184 | 2.91M | dct[i*4+0] = s03 + s12; |
185 | 2.91M | dct[i*4+1] = 2*d03 + d12; |
186 | 2.91M | dct[i*4+2] = s03 - s12; |
187 | 2.91M | dct[i*4+3] = d03 - 2*d12; |
188 | 2.91M | } |
189 | 729k | } |
190 | | |
191 | | static void sub8x8_dct( dctcoef dct[4][16], pixel *pix1, pixel *pix2 ) |
192 | 181k | { |
193 | 181k | sub4x4_dct( dct[0], &pix1[0], &pix2[0] ); |
194 | 181k | sub4x4_dct( dct[1], &pix1[4], &pix2[4] ); |
195 | 181k | sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] ); |
196 | 181k | sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] ); |
197 | 181k | } |
198 | | |
199 | | static void sub16x16_dct( dctcoef dct[16][16], pixel *pix1, pixel *pix2 ) |
200 | 30.2k | { |
201 | 30.2k | sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] ); |
202 | 30.2k | sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] ); |
203 | 30.2k | sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] ); |
204 | 30.2k | sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] ); |
205 | 30.2k | } |
206 | | |
207 | | static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 ) |
208 | 0 | { |
209 | 0 | int sum = 0; |
210 | 0 | for( int i=0; i<4; i++, pix1 += FENC_STRIDE, pix2 += FDEC_STRIDE ) |
211 | 0 | sum += pix1[0] + pix1[1] + pix1[2] + pix1[3] |
212 | 0 | - pix2[0] - pix2[1] - pix2[2] - pix2[3]; |
213 | 0 | return sum; |
214 | 0 | } |
215 | | |
216 | | static void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 ) |
217 | 0 | { |
218 | 0 | dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] ); |
219 | 0 | dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] ); |
220 | 0 | dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] ); |
221 | 0 | dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] ); |
222 | | |
223 | | /* 2x2 DC transform */ |
224 | 0 | int d0 = dct[0] + dct[1]; |
225 | 0 | int d1 = dct[2] + dct[3]; |
226 | 0 | int d2 = dct[0] - dct[1]; |
227 | 0 | int d3 = dct[2] - dct[3]; |
228 | 0 | dct[0] = d0 + d1; |
229 | 0 | dct[1] = d0 - d1; |
230 | 0 | dct[2] = d2 + d3; |
231 | 0 | dct[3] = d2 - d3; |
232 | 0 | } |
233 | | |
234 | | static void sub8x16_dct_dc( dctcoef dct[8], pixel *pix1, pixel *pix2 ) |
235 | 0 | { |
236 | 0 | int a0 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+0], &pix2[ 0*FDEC_STRIDE+0] ); |
237 | 0 | int a1 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+4], &pix2[ 0*FDEC_STRIDE+4] ); |
238 | 0 | int a2 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+0], &pix2[ 4*FDEC_STRIDE+0] ); |
239 | 0 | int a3 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+4], &pix2[ 4*FDEC_STRIDE+4] ); |
240 | 0 | int a4 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+0], &pix2[ 8*FDEC_STRIDE+0] ); |
241 | 0 | int a5 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+4], &pix2[ 8*FDEC_STRIDE+4] ); |
242 | 0 | int a6 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+0], &pix2[12*FDEC_STRIDE+0] ); |
243 | 0 | int a7 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+4], &pix2[12*FDEC_STRIDE+4] ); |
244 | | |
245 | | /* 2x4 DC transform */ |
246 | 0 | int b0 = a0 + a1; |
247 | 0 | int b1 = a2 + a3; |
248 | 0 | int b2 = a4 + a5; |
249 | 0 | int b3 = a6 + a7; |
250 | 0 | int b4 = a0 - a1; |
251 | 0 | int b5 = a2 - a3; |
252 | 0 | int b6 = a4 - a5; |
253 | 0 | int b7 = a6 - a7; |
254 | 0 | a0 = b0 + b1; |
255 | 0 | a1 = b2 + b3; |
256 | 0 | a2 = b4 + b5; |
257 | 0 | a3 = b6 + b7; |
258 | 0 | a4 = b0 - b1; |
259 | 0 | a5 = b2 - b3; |
260 | 0 | a6 = b4 - b5; |
261 | 0 | a7 = b6 - b7; |
262 | 0 | dct[0] = a0 + a1; |
263 | 0 | dct[1] = a2 + a3; |
264 | 0 | dct[2] = a0 - a1; |
265 | 0 | dct[3] = a2 - a3; |
266 | 0 | dct[4] = a4 - a5; |
267 | 0 | dct[5] = a6 - a7; |
268 | 0 | dct[6] = a4 + a5; |
269 | 0 | dct[7] = a6 + a7; |
270 | 0 | } |
271 | | |
272 | | static void add4x4_idct( pixel *p_dst, dctcoef dct[16] ) |
273 | 122 | { |
274 | 122 | dctcoef d[16]; |
275 | 122 | dctcoef tmp[16]; |
276 | | |
277 | 610 | for( int i = 0; i < 4; i++ ) |
278 | 488 | { |
279 | 488 | int s02 = dct[0*4+i] + dct[2*4+i]; |
280 | 488 | int d02 = dct[0*4+i] - dct[2*4+i]; |
281 | 488 | int s13 = dct[1*4+i] + (dct[3*4+i]>>1); |
282 | 488 | int d13 = (dct[1*4+i]>>1) - dct[3*4+i]; |
283 | | |
284 | 488 | tmp[i*4+0] = s02 + s13; |
285 | 488 | tmp[i*4+1] = d02 + d13; |
286 | 488 | tmp[i*4+2] = d02 - d13; |
287 | 488 | tmp[i*4+3] = s02 - s13; |
288 | 488 | } |
289 | | |
290 | 610 | for( int i = 0; i < 4; i++ ) |
291 | 488 | { |
292 | 488 | int s02 = tmp[0*4+i] + tmp[2*4+i]; |
293 | 488 | int d02 = tmp[0*4+i] - tmp[2*4+i]; |
294 | 488 | int s13 = tmp[1*4+i] + (tmp[3*4+i]>>1); |
295 | 488 | int d13 = (tmp[1*4+i]>>1) - tmp[3*4+i]; |
296 | | |
297 | 488 | d[0*4+i] = ( s02 + s13 + 32 ) >> 6; |
298 | 488 | d[1*4+i] = ( d02 + d13 + 32 ) >> 6; |
299 | 488 | d[2*4+i] = ( d02 - d13 + 32 ) >> 6; |
300 | 488 | d[3*4+i] = ( s02 - s13 + 32 ) >> 6; |
301 | 488 | } |
302 | | |
303 | | |
304 | 610 | for( int y = 0; y < 4; y++ ) |
305 | 488 | { |
306 | 2.44k | for( int x = 0; x < 4; x++ ) |
307 | 1.95k | p_dst[x] = x264_clip_pixel( p_dst[x] + d[y*4+x] ); |
308 | 488 | p_dst += FDEC_STRIDE; |
309 | 488 | } |
310 | 122 | } |
311 | | |
312 | | static void add8x8_idct( pixel *p_dst, dctcoef dct[4][16] ) |
313 | 0 | { |
314 | 0 | add4x4_idct( &p_dst[0], dct[0] ); |
315 | 0 | add4x4_idct( &p_dst[4], dct[1] ); |
316 | 0 | add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] ); |
317 | 0 | add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] ); |
318 | 0 | } |
319 | | |
320 | | static void add16x16_idct( pixel *p_dst, dctcoef dct[16][16] ) |
321 | 0 | { |
322 | 0 | add8x8_idct( &p_dst[0], &dct[0] ); |
323 | 0 | add8x8_idct( &p_dst[8], &dct[4] ); |
324 | 0 | add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] ); |
325 | 0 | add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] ); |
326 | 0 | } |
327 | | |
328 | | /**************************************************************************** |
329 | | * 8x8 transform: |
330 | | ****************************************************************************/ |
331 | | |
/* One 8-point forward transform pass. The user must define SRC(n) to read
 * input sample n and DST(n) as an lvalue for output coefficient n. All eight
 * inputs are fetched before any output is stored, so SRC and DST may refer
 * to the same storage. Expands to a braced block (no trailing semicolon). */
#define DCT8_1D {\
    const int in0 = SRC(0), in1 = SRC(1), in2 = SRC(2), in3 = SRC(3);\
    const int in4 = SRC(4), in5 = SRC(5), in6 = SRC(6), in7 = SRC(7);\
    const int sum07 = in0 + in7;\
    const int sum16 = in1 + in6;\
    const int sum25 = in2 + in5;\
    const int sum34 = in3 + in4;\
    const int dif07 = in0 - in7;\
    const int dif16 = in1 - in6;\
    const int dif25 = in2 - in5;\
    const int dif34 = in3 - in4;\
    const int ev0 = sum07 + sum34;\
    const int ev1 = sum16 + sum25;\
    const int ev2 = sum07 - sum34;\
    const int ev3 = sum16 - sum25;\
    const int od0 = dif16 + dif25 + (dif07 + (dif07>>1));\
    const int od1 = dif07 - dif34 - (dif25 + (dif25>>1));\
    const int od2 = dif07 + dif34 - (dif16 + (dif16>>1));\
    const int od3 = dif16 - dif25 + (dif34 + (dif34>>1));\
    DST(0) =  ev0 + ev1      ;\
    DST(1) =  od0 + (od3>>2) ;\
    DST(2) =  ev2 + (ev3>>1) ;\
    DST(3) =  od1 + (od2>>2) ;\
    DST(4) =  ev0 - ev1      ;\
    DST(5) =  od2 - (od1>>2) ;\
    DST(6) = (ev2>>1) - ev3  ;\
    DST(7) = (od0>>2) - od3  ;\
}
358 | | |
359 | | static void sub8x8_dct8( dctcoef dct[64], pixel *pix1, pixel *pix2 ) |
360 | 0 | { |
361 | 0 | dctcoef tmp[64]; |
362 | |
|
363 | 0 | pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE ); |
364 | |
|
365 | 0 | #define SRC(x) tmp[x*8+i] |
366 | 0 | #define DST(x) tmp[x*8+i] |
367 | 0 | for( int i = 0; i < 8; i++ ) |
368 | 0 | DCT8_1D |
369 | 0 | #undef SRC |
370 | 0 | #undef DST |
371 | |
|
372 | 0 | #define SRC(x) tmp[i*8+x] |
373 | 0 | #define DST(x) dct[x*8+i] |
374 | 0 | for( int i = 0; i < 8; i++ ) |
375 | 0 | DCT8_1D |
376 | 0 | #undef SRC |
377 | 0 | #undef DST |
378 | 0 | } |
379 | | |
380 | | static void sub16x16_dct8( dctcoef dct[4][64], pixel *pix1, pixel *pix2 ) |
381 | 0 | { |
382 | 0 | sub8x8_dct8( dct[0], &pix1[0], &pix2[0] ); |
383 | 0 | sub8x8_dct8( dct[1], &pix1[8], &pix2[8] ); |
384 | 0 | sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] ); |
385 | 0 | sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] ); |
386 | 0 | } |
387 | | |
/* One 8-point inverse transform pass. The user must define SRC(n) to read
 * input coefficient n and DST(n, value) to store output sample n. All eight
 * inputs are fetched before the first DST, so SRC and DST may refer to the
 * same storage. Expands to a braced block (no trailing semicolon). */
#define IDCT8_1D {\
    const int in0 = SRC(0), in1 = SRC(1), in2 = SRC(2), in3 = SRC(3);\
    const int in4 = SRC(4), in5 = SRC(5), in6 = SRC(6), in7 = SRC(7);\
    const int ev0 =  in0 + in4;\
    const int ev2 =  in0 - in4;\
    const int ev4 = (in2>>1) - in6;\
    const int ev6 = (in6>>1) + in2;\
    const int od1 = -in3 + in5 - in7 - (in7>>1);\
    const int od3 =  in1 + in7 - in3 - (in3>>1);\
    const int od5 = -in1 + in7 + in5 + (in5>>1);\
    const int od7 =  in3 + in5 + in1 + (in1>>1);\
    const int mix0 = ev0 + ev6;\
    const int mix2 = ev2 + ev4;\
    const int mix4 = ev2 - ev4;\
    const int mix6 = ev0 - ev6;\
    const int mix1 = (od7>>2) + od1;\
    const int mix3 =  od3 + (od5>>2);\
    const int mix5 = (od3>>2) - od5;\
    const int mix7 =  od7 - (od1>>2);\
    DST(0, mix0 + mix7);\
    DST(1, mix2 + mix5);\
    DST(2, mix4 + mix3);\
    DST(3, mix6 + mix1);\
    DST(4, mix6 - mix1);\
    DST(5, mix4 - mix3);\
    DST(6, mix2 - mix5);\
    DST(7, mix0 - mix7);\
}
414 | | |
415 | | static void add8x8_idct8( pixel *dst, dctcoef dct[64] ) |
416 | 0 | { |
417 | 0 | dct[0] += 32; // rounding for the >>6 at the end |
418 | |
|
419 | 0 | #define SRC(x) dct[x*8+i] |
420 | 0 | #define DST(x,rhs) dct[x*8+i] = (rhs) |
421 | 0 | for( int i = 0; i < 8; i++ ) |
422 | 0 | IDCT8_1D |
423 | 0 | #undef SRC |
424 | 0 | #undef DST |
425 | |
|
426 | 0 | #define SRC(x) dct[i*8+x] |
427 | 0 | #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_pixel( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) ); |
428 | 0 | for( int i = 0; i < 8; i++ ) |
429 | 0 | IDCT8_1D |
430 | 0 | #undef SRC |
431 | 0 | #undef DST |
432 | 0 | } |
433 | | |
434 | | static void add16x16_idct8( pixel *dst, dctcoef dct[4][64] ) |
435 | 0 | { |
436 | 0 | add8x8_idct8( &dst[0], dct[0] ); |
437 | 0 | add8x8_idct8( &dst[8], dct[1] ); |
438 | 0 | add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] ); |
439 | 0 | add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] ); |
440 | 0 | } |
441 | | |
442 | | static inline void add4x4_idct_dc( pixel *p_dst, dctcoef dc ) |
443 | 7.95k | { |
444 | 7.95k | dc = (dc + 32) >> 6; |
445 | 39.7k | for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE ) |
446 | 31.8k | { |
447 | 31.8k | p_dst[0] = x264_clip_pixel( p_dst[0] + dc ); |
448 | 31.8k | p_dst[1] = x264_clip_pixel( p_dst[1] + dc ); |
449 | 31.8k | p_dst[2] = x264_clip_pixel( p_dst[2] + dc ); |
450 | 31.8k | p_dst[3] = x264_clip_pixel( p_dst[3] + dc ); |
451 | 31.8k | } |
452 | 7.95k | } |
453 | | |
454 | | static void add8x8_idct_dc( pixel *p_dst, dctcoef dct[4] ) |
455 | 1.06k | { |
456 | 1.06k | add4x4_idct_dc( &p_dst[0], dct[0] ); |
457 | 1.06k | add4x4_idct_dc( &p_dst[4], dct[1] ); |
458 | 1.06k | add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] ); |
459 | 1.06k | add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] ); |
460 | 1.06k | } |
461 | | |
462 | | static void add16x16_idct_dc( pixel *p_dst, dctcoef dct[16] ) |
463 | 232 | { |
464 | 1.16k | for( int i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE ) |
465 | 928 | { |
466 | 928 | add4x4_idct_dc( &p_dst[ 0], dct[0] ); |
467 | 928 | add4x4_idct_dc( &p_dst[ 4], dct[1] ); |
468 | 928 | add4x4_idct_dc( &p_dst[ 8], dct[2] ); |
469 | 928 | add4x4_idct_dc( &p_dst[12], dct[3] ); |
470 | 928 | } |
471 | 232 | } |
472 | | |
473 | | |
474 | | /**************************************************************************** |
475 | | * x264_dct_init: |
476 | | ****************************************************************************/ |
477 | | void x264_dct_init( uint32_t cpu, x264_dct_function_t *dctf ) |
478 | 169 | { |
479 | 169 | dctf->sub4x4_dct = sub4x4_dct; |
480 | 169 | dctf->add4x4_idct = add4x4_idct; |
481 | | |
482 | 169 | dctf->sub8x8_dct = sub8x8_dct; |
483 | 169 | dctf->sub8x8_dct_dc = sub8x8_dct_dc; |
484 | 169 | dctf->add8x8_idct = add8x8_idct; |
485 | 169 | dctf->add8x8_idct_dc = add8x8_idct_dc; |
486 | | |
487 | 169 | dctf->sub8x16_dct_dc = sub8x16_dct_dc; |
488 | | |
489 | 169 | dctf->sub16x16_dct = sub16x16_dct; |
490 | 169 | dctf->add16x16_idct = add16x16_idct; |
491 | 169 | dctf->add16x16_idct_dc = add16x16_idct_dc; |
492 | | |
493 | 169 | dctf->sub8x8_dct8 = sub8x8_dct8; |
494 | 169 | dctf->add8x8_idct8 = add8x8_idct8; |
495 | | |
496 | 169 | dctf->sub16x16_dct8 = sub16x16_dct8; |
497 | 169 | dctf->add16x16_idct8 = add16x16_idct8; |
498 | | |
499 | 169 | dctf->dct4x4dc = dct4x4dc; |
500 | 169 | dctf->idct4x4dc = idct4x4dc; |
501 | | |
502 | 169 | dctf->dct2x4dc = dct2x4dc; |
503 | | |
504 | | #if HIGH_BIT_DEPTH |
505 | | #if HAVE_MMX |
506 | | if( cpu&X264_CPU_MMX ) |
507 | | { |
508 | | dctf->sub4x4_dct = x264_sub4x4_dct_mmx; |
509 | | dctf->sub8x8_dct = x264_sub8x8_dct_mmx; |
510 | | dctf->sub16x16_dct = x264_sub16x16_dct_mmx; |
511 | | } |
512 | | if( cpu&X264_CPU_SSE2 ) |
513 | | { |
514 | | dctf->add4x4_idct = x264_add4x4_idct_sse2; |
515 | | dctf->dct4x4dc = x264_dct4x4dc_sse2; |
516 | | dctf->idct4x4dc = x264_idct4x4dc_sse2; |
517 | | dctf->dct2x4dc = x264_dct2x4dc_sse2; |
518 | | dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2; |
519 | | dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2; |
520 | | dctf->add8x8_idct = x264_add8x8_idct_sse2; |
521 | | dctf->add16x16_idct = x264_add16x16_idct_sse2; |
522 | | dctf->add8x8_idct8 = x264_add8x8_idct8_sse2; |
523 | | dctf->add16x16_idct8 = x264_add16x16_idct8_sse2; |
524 | | dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2; |
525 | | dctf->add8x8_idct_dc = x264_add8x8_idct_dc_sse2; |
526 | | dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_sse2; |
527 | | dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2; |
528 | | } |
529 | | if( cpu&X264_CPU_SSE4 ) |
530 | | { |
531 | | dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse4; |
532 | | dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse4; |
533 | | } |
534 | | if( cpu&X264_CPU_AVX ) |
535 | | { |
536 | | dctf->add4x4_idct = x264_add4x4_idct_avx; |
537 | | dctf->dct4x4dc = x264_dct4x4dc_avx; |
538 | | dctf->idct4x4dc = x264_idct4x4dc_avx; |
539 | | dctf->dct2x4dc = x264_dct2x4dc_avx; |
540 | | dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx; |
541 | | dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx; |
542 | | dctf->add8x8_idct = x264_add8x8_idct_avx; |
543 | | dctf->add16x16_idct = x264_add16x16_idct_avx; |
544 | | dctf->add8x8_idct8 = x264_add8x8_idct8_avx; |
545 | | dctf->add16x16_idct8 = x264_add16x16_idct8_avx; |
546 | | dctf->add8x8_idct_dc = x264_add8x8_idct_dc_avx; |
547 | | dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx; |
548 | | dctf->add16x16_idct_dc= x264_add16x16_idct_dc_avx; |
549 | | } |
550 | | #endif // HAVE_MMX |
551 | | #else // !HIGH_BIT_DEPTH |
552 | | #if HAVE_MMX |
553 | | if( cpu&X264_CPU_MMX ) |
554 | | { |
555 | | dctf->sub4x4_dct = x264_sub4x4_dct_mmx; |
556 | | dctf->add4x4_idct = x264_add4x4_idct_mmx; |
557 | | dctf->idct4x4dc = x264_idct4x4dc_mmx; |
558 | | dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmx2; |
559 | | |
560 | | #if !ARCH_X86_64 |
561 | | dctf->sub8x8_dct = x264_sub8x8_dct_mmx; |
562 | | dctf->sub16x16_dct = x264_sub16x16_dct_mmx; |
563 | | dctf->add8x8_idct = x264_add8x8_idct_mmx; |
564 | | dctf->add16x16_idct = x264_add16x16_idct_mmx; |
565 | | |
566 | | dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmx; |
567 | | dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx; |
568 | | dctf->add8x8_idct8 = x264_add8x8_idct8_mmx; |
569 | | dctf->add16x16_idct8= x264_add16x16_idct8_mmx; |
570 | | #endif |
571 | | } |
572 | | |
573 | | if( cpu&X264_CPU_MMX2 ) |
574 | | { |
575 | | dctf->dct4x4dc = x264_dct4x4dc_mmx2; |
576 | | dctf->dct2x4dc = x264_dct2x4dc_mmx2; |
577 | | dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx2; |
578 | | dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx2; |
579 | | } |
580 | | |
581 | | if( cpu&X264_CPU_SSE2 ) |
582 | | { |
583 | | dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2; |
584 | | dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2; |
585 | | dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2; |
586 | | dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_sse2; |
587 | | dctf->add8x8_idct8 = x264_add8x8_idct8_sse2; |
588 | | dctf->add16x16_idct8= x264_add16x16_idct8_sse2; |
589 | | |
590 | | if( !(cpu&X264_CPU_SSE2_IS_SLOW) ) |
591 | | { |
592 | | dctf->sub8x8_dct = x264_sub8x8_dct_sse2; |
593 | | dctf->sub16x16_dct = x264_sub16x16_dct_sse2; |
594 | | dctf->add8x8_idct = x264_add8x8_idct_sse2; |
595 | | dctf->add16x16_idct = x264_add16x16_idct_sse2; |
596 | | dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2; |
597 | | } |
598 | | } |
599 | | |
600 | | if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE2_IS_SLOW) ) |
601 | | { |
602 | | dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3; |
603 | | if( !(cpu&X264_CPU_SLOW_ATOM) ) |
604 | | { |
605 | | dctf->sub4x4_dct = x264_sub4x4_dct_ssse3; |
606 | | dctf->sub8x8_dct = x264_sub8x8_dct_ssse3; |
607 | | dctf->sub16x16_dct = x264_sub16x16_dct_ssse3; |
608 | | dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3; |
609 | | dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3; |
610 | | if( !(cpu&X264_CPU_SLOW_PSHUFB) ) |
611 | | { |
612 | | dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3; |
613 | | dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3; |
614 | | } |
615 | | } |
616 | | } |
617 | | |
618 | | if( cpu&X264_CPU_SSE4 ) |
619 | | dctf->add4x4_idct = x264_add4x4_idct_sse4; |
620 | | |
621 | | if( cpu&X264_CPU_AVX ) |
622 | | { |
623 | | dctf->add4x4_idct = x264_add4x4_idct_avx; |
624 | | dctf->add8x8_idct = x264_add8x8_idct_avx; |
625 | | dctf->add16x16_idct = x264_add16x16_idct_avx; |
626 | | dctf->add8x8_idct8 = x264_add8x8_idct8_avx; |
627 | | dctf->add16x16_idct8 = x264_add16x16_idct8_avx; |
628 | | dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx; |
629 | | dctf->sub8x8_dct = x264_sub8x8_dct_avx; |
630 | | dctf->sub16x16_dct = x264_sub16x16_dct_avx; |
631 | | dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx; |
632 | | dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx; |
633 | | } |
634 | | |
635 | | if( cpu&X264_CPU_XOP ) |
636 | | { |
637 | | dctf->sub8x8_dct = x264_sub8x8_dct_xop; |
638 | | dctf->sub16x16_dct = x264_sub16x16_dct_xop; |
639 | | } |
640 | | |
641 | | if( cpu&X264_CPU_AVX2 ) |
642 | | { |
643 | | dctf->add8x8_idct = x264_add8x8_idct_avx2; |
644 | | dctf->add16x16_idct = x264_add16x16_idct_avx2; |
645 | | dctf->sub8x8_dct = x264_sub8x8_dct_avx2; |
646 | | dctf->sub16x16_dct = x264_sub16x16_dct_avx2; |
647 | | dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx2; |
648 | | #if ARCH_X86_64 |
649 | | dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx2; |
650 | | #endif |
651 | | } |
652 | | |
653 | | if( cpu&X264_CPU_AVX512 ) |
654 | | { |
655 | | dctf->sub4x4_dct = x264_sub4x4_dct_avx512; |
656 | | dctf->sub8x8_dct = x264_sub8x8_dct_avx512; |
657 | | dctf->sub16x16_dct = x264_sub16x16_dct_avx512; |
658 | | dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_avx512; |
659 | | dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx512; |
660 | | dctf->add8x8_idct = x264_add8x8_idct_avx512; |
661 | | } |
662 | | #endif //HAVE_MMX |
663 | | |
664 | | #if HAVE_ALTIVEC |
665 | | if( cpu&X264_CPU_ALTIVEC ) |
666 | | { |
667 | | dctf->sub4x4_dct = x264_sub4x4_dct_altivec; |
668 | | dctf->sub8x8_dct = x264_sub8x8_dct_altivec; |
669 | | dctf->sub16x16_dct = x264_sub16x16_dct_altivec; |
670 | | |
671 | | dctf->add8x8_idct_dc = x264_add8x8_idct_dc_altivec; |
672 | | dctf->add16x16_idct_dc = x264_add16x16_idct_dc_altivec; |
673 | | |
674 | | dctf->add4x4_idct = x264_add4x4_idct_altivec; |
675 | | dctf->add8x8_idct = x264_add8x8_idct_altivec; |
676 | | dctf->add16x16_idct = x264_add16x16_idct_altivec; |
677 | | |
678 | | dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_altivec; |
679 | | dctf->sub8x8_dct8 = x264_sub8x8_dct8_altivec; |
680 | | dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec; |
681 | | |
682 | | dctf->add8x8_idct8 = x264_add8x8_idct8_altivec; |
683 | | dctf->add16x16_idct8= x264_add16x16_idct8_altivec; |
684 | | } |
685 | | #endif |
686 | | |
687 | | #if HAVE_ARMV6 || HAVE_AARCH64 |
688 | | if( cpu&X264_CPU_NEON ) |
689 | | { |
690 | | dctf->sub4x4_dct = x264_sub4x4_dct_neon; |
691 | | dctf->sub8x8_dct = x264_sub8x8_dct_neon; |
692 | | dctf->sub16x16_dct = x264_sub16x16_dct_neon; |
693 | | dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon; |
694 | | dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon; |
695 | | dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon; |
696 | | dctf->dct4x4dc = x264_dct4x4dc_neon; |
697 | | dctf->idct4x4dc = x264_idct4x4dc_neon; |
698 | | |
699 | | dctf->add4x4_idct = x264_add4x4_idct_neon; |
700 | | dctf->add8x8_idct = x264_add8x8_idct_neon; |
701 | | dctf->add16x16_idct = x264_add16x16_idct_neon; |
702 | | |
703 | | dctf->sub8x8_dct8 = x264_sub8x8_dct8_neon; |
704 | | dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon; |
705 | | |
706 | | dctf->add8x8_idct8 = x264_add8x8_idct8_neon; |
707 | | dctf->add16x16_idct8= x264_add16x16_idct8_neon; |
708 | | dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_neon; |
709 | | } |
710 | | #if HAVE_SVE |
711 | | if ( cpu&X264_CPU_SVE ) |
712 | | { |
713 | | dctf->sub4x4_dct = x264_sub4x4_dct_sve; |
714 | | } |
715 | | #endif |
716 | | #if HAVE_SVE2 |
717 | | if ( cpu&X264_CPU_SVE2 ) |
718 | | { |
719 | | dctf->add4x4_idct = x264_add4x4_idct_sve2; |
720 | | } |
721 | | #endif |
722 | | #endif |
723 | | |
724 | | #if HAVE_MSA |
725 | | if( cpu&X264_CPU_MSA ) |
726 | | { |
727 | | dctf->sub4x4_dct = x264_sub4x4_dct_msa; |
728 | | dctf->sub8x8_dct = x264_sub8x8_dct_msa; |
729 | | dctf->sub16x16_dct = x264_sub16x16_dct_msa; |
730 | | dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_msa; |
731 | | dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_msa; |
732 | | dctf->dct4x4dc = x264_dct4x4dc_msa; |
733 | | dctf->idct4x4dc = x264_idct4x4dc_msa; |
734 | | dctf->add4x4_idct = x264_add4x4_idct_msa; |
735 | | dctf->add8x8_idct = x264_add8x8_idct_msa; |
736 | | dctf->add8x8_idct_dc = x264_add8x8_idct_dc_msa; |
737 | | dctf->add16x16_idct = x264_add16x16_idct_msa; |
738 | | dctf->add16x16_idct_dc = x264_add16x16_idct_dc_msa; |
739 | | dctf->add8x8_idct8 = x264_add8x8_idct8_msa; |
740 | | dctf->add16x16_idct8 = x264_add16x16_idct8_msa; |
741 | | } |
742 | | #endif |
743 | | |
744 | | #if HAVE_LSX |
745 | | if( cpu&X264_CPU_LSX ) |
746 | | { |
747 | | dctf->sub4x4_dct = x264_sub4x4_dct_lsx; |
748 | | dctf->add4x4_idct = x264_add4x4_idct_lsx; |
749 | | dctf->dct4x4dc = x264_dct4x4dc_lsx; |
750 | | dctf->idct4x4dc = x264_idct4x4dc_lsx; |
751 | | dctf->sub8x8_dct8 = x264_sub8x8_dct8_lsx; |
752 | | dctf->sub8x8_dct = x264_sub8x8_dct_lsx; |
753 | | dctf->add8x8_idct = x264_add8x8_idct_lsx; |
754 | | dctf->add8x8_idct8 = x264_add8x8_idct8_lsx; |
755 | | dctf->add8x8_idct_dc = x264_add8x8_idct_dc_lsx; |
756 | | dctf->add16x16_idct = x264_add16x16_idct_lsx; |
757 | | dctf->sub16x16_dct = x264_sub16x16_dct_lsx; |
758 | | dctf->add16x16_idct_dc = x264_add16x16_idct_dc_lsx; |
759 | | dctf->sub16x16_dct8 = x264_sub16x16_dct8_lsx; |
760 | | } |
761 | | if( cpu&X264_CPU_LASX ) |
762 | | { |
763 | | dctf->sub8x8_dct = x264_sub8x8_dct_lasx; |
764 | | dctf->sub16x16_dct = x264_sub16x16_dct_lasx; |
765 | | dctf->add8x8_idct = x264_add8x8_idct_lasx; |
766 | | dctf->add8x8_idct8 = x264_add8x8_idct8_lasx; |
767 | | dctf->add16x16_idct = x264_add16x16_idct_lasx; |
768 | | dctf->sub16x16_dct8 = x264_sub16x16_dct8_lasx; |
769 | | dctf->add8x8_idct_dc = x264_add8x8_idct_dc_lasx; |
770 | | dctf->add16x16_idct_dc = x264_add16x16_idct_dc_lasx; |
771 | | dctf->dct4x4dc = x264_dct4x4dc_lasx; |
772 | | dctf->idct4x4dc = x264_idct4x4dc_lasx; |
773 | | } |
774 | | #endif |
775 | | |
776 | | #endif // HIGH_BIT_DEPTH |
777 | 169 | } Line | Count | Source | 478 | 169 | { | 479 | 169 | dctf->sub4x4_dct = sub4x4_dct; | 480 | 169 | dctf->add4x4_idct = add4x4_idct; | 481 | | | 482 | 169 | dctf->sub8x8_dct = sub8x8_dct; | 483 | 169 | dctf->sub8x8_dct_dc = sub8x8_dct_dc; | 484 | 169 | dctf->add8x8_idct = add8x8_idct; | 485 | 169 | dctf->add8x8_idct_dc = add8x8_idct_dc; | 486 | | | 487 | 169 | dctf->sub8x16_dct_dc = sub8x16_dct_dc; | 488 | | | 489 | 169 | dctf->sub16x16_dct = sub16x16_dct; | 490 | 169 | dctf->add16x16_idct = add16x16_idct; | 491 | 169 | dctf->add16x16_idct_dc = add16x16_idct_dc; | 492 | | | 493 | 169 | dctf->sub8x8_dct8 = sub8x8_dct8; | 494 | 169 | dctf->add8x8_idct8 = add8x8_idct8; | 495 | | | 496 | 169 | dctf->sub16x16_dct8 = sub16x16_dct8; | 497 | 169 | dctf->add16x16_idct8 = add16x16_idct8; | 498 | | | 499 | 169 | dctf->dct4x4dc = dct4x4dc; | 500 | 169 | dctf->idct4x4dc = idct4x4dc; | 501 | | | 502 | 169 | dctf->dct2x4dc = dct2x4dc; | 503 | | | 504 | | #if HIGH_BIT_DEPTH | 505 | | #if HAVE_MMX | 506 | | if( cpu&X264_CPU_MMX ) | 507 | | { | 508 | | dctf->sub4x4_dct = x264_sub4x4_dct_mmx; | 509 | | dctf->sub8x8_dct = x264_sub8x8_dct_mmx; | 510 | | dctf->sub16x16_dct = x264_sub16x16_dct_mmx; | 511 | | } | 512 | | if( cpu&X264_CPU_SSE2 ) | 513 | | { | 514 | | dctf->add4x4_idct = x264_add4x4_idct_sse2; | 515 | | dctf->dct4x4dc = x264_dct4x4dc_sse2; | 516 | | dctf->idct4x4dc = x264_idct4x4dc_sse2; | 517 | | dctf->dct2x4dc = x264_dct2x4dc_sse2; | 518 | | dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2; | 519 | | dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2; | 520 | | dctf->add8x8_idct = x264_add8x8_idct_sse2; | 521 | | dctf->add16x16_idct = x264_add16x16_idct_sse2; | 522 | | dctf->add8x8_idct8 = x264_add8x8_idct8_sse2; | 523 | | dctf->add16x16_idct8 = x264_add16x16_idct8_sse2; | 524 | | dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2; | 525 | | dctf->add8x8_idct_dc = x264_add8x8_idct_dc_sse2; | 526 | | dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_sse2; | 527 | | 
dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2; | 528 | | } | 529 | | if( cpu&X264_CPU_SSE4 ) | 530 | | { | 531 | | dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse4; | 532 | | dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse4; | 533 | | } | 534 | | if( cpu&X264_CPU_AVX ) | 535 | | { | 536 | | dctf->add4x4_idct = x264_add4x4_idct_avx; | 537 | | dctf->dct4x4dc = x264_dct4x4dc_avx; | 538 | | dctf->idct4x4dc = x264_idct4x4dc_avx; | 539 | | dctf->dct2x4dc = x264_dct2x4dc_avx; | 540 | | dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx; | 541 | | dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx; | 542 | | dctf->add8x8_idct = x264_add8x8_idct_avx; | 543 | | dctf->add16x16_idct = x264_add16x16_idct_avx; | 544 | | dctf->add8x8_idct8 = x264_add8x8_idct8_avx; | 545 | | dctf->add16x16_idct8 = x264_add16x16_idct8_avx; | 546 | | dctf->add8x8_idct_dc = x264_add8x8_idct_dc_avx; | 547 | | dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx; | 548 | | dctf->add16x16_idct_dc= x264_add16x16_idct_dc_avx; | 549 | | } | 550 | | #endif // HAVE_MMX | 551 | | #else // !HIGH_BIT_DEPTH | 552 | | #if HAVE_MMX | 553 | | if( cpu&X264_CPU_MMX ) | 554 | | { | 555 | | dctf->sub4x4_dct = x264_sub4x4_dct_mmx; | 556 | | dctf->add4x4_idct = x264_add4x4_idct_mmx; | 557 | | dctf->idct4x4dc = x264_idct4x4dc_mmx; | 558 | | dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmx2; | 559 | | | 560 | | #if !ARCH_X86_64 | 561 | | dctf->sub8x8_dct = x264_sub8x8_dct_mmx; | 562 | | dctf->sub16x16_dct = x264_sub16x16_dct_mmx; | 563 | | dctf->add8x8_idct = x264_add8x8_idct_mmx; | 564 | | dctf->add16x16_idct = x264_add16x16_idct_mmx; | 565 | | | 566 | | dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmx; | 567 | | dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx; | 568 | | dctf->add8x8_idct8 = x264_add8x8_idct8_mmx; | 569 | | dctf->add16x16_idct8= x264_add16x16_idct8_mmx; | 570 | | #endif | 571 | | } | 572 | | | 573 | | if( cpu&X264_CPU_MMX2 ) | 574 | | { | 575 | | dctf->dct4x4dc = x264_dct4x4dc_mmx2; | 576 | | dctf->dct2x4dc = x264_dct2x4dc_mmx2; | 577 | | 
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx2; | 578 | | dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx2; | 579 | | } | 580 | | | 581 | | if( cpu&X264_CPU_SSE2 ) | 582 | | { | 583 | | dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2; | 584 | | dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2; | 585 | | dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2; | 586 | | dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_sse2; | 587 | | dctf->add8x8_idct8 = x264_add8x8_idct8_sse2; | 588 | | dctf->add16x16_idct8= x264_add16x16_idct8_sse2; | 589 | | | 590 | | if( !(cpu&X264_CPU_SSE2_IS_SLOW) ) | 591 | | { | 592 | | dctf->sub8x8_dct = x264_sub8x8_dct_sse2; | 593 | | dctf->sub16x16_dct = x264_sub16x16_dct_sse2; | 594 | | dctf->add8x8_idct = x264_add8x8_idct_sse2; | 595 | | dctf->add16x16_idct = x264_add16x16_idct_sse2; | 596 | | dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2; | 597 | | } | 598 | | } | 599 | | | 600 | | if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE2_IS_SLOW) ) | 601 | | { | 602 | | dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3; | 603 | | if( !(cpu&X264_CPU_SLOW_ATOM) ) | 604 | | { | 605 | | dctf->sub4x4_dct = x264_sub4x4_dct_ssse3; | 606 | | dctf->sub8x8_dct = x264_sub8x8_dct_ssse3; | 607 | | dctf->sub16x16_dct = x264_sub16x16_dct_ssse3; | 608 | | dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3; | 609 | | dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3; | 610 | | if( !(cpu&X264_CPU_SLOW_PSHUFB) ) | 611 | | { | 612 | | dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3; | 613 | | dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3; | 614 | | } | 615 | | } | 616 | | } | 617 | | | 618 | | if( cpu&X264_CPU_SSE4 ) | 619 | | dctf->add4x4_idct = x264_add4x4_idct_sse4; | 620 | | | 621 | | if( cpu&X264_CPU_AVX ) | 622 | | { | 623 | | dctf->add4x4_idct = x264_add4x4_idct_avx; | 624 | | dctf->add8x8_idct = x264_add8x8_idct_avx; | 625 | | dctf->add16x16_idct = x264_add16x16_idct_avx; | 626 | | dctf->add8x8_idct8 = x264_add8x8_idct8_avx; | 627 | | dctf->add16x16_idct8 = 
x264_add16x16_idct8_avx; | 628 | | dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx; | 629 | | dctf->sub8x8_dct = x264_sub8x8_dct_avx; | 630 | | dctf->sub16x16_dct = x264_sub16x16_dct_avx; | 631 | | dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx; | 632 | | dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx; | 633 | | } | 634 | | | 635 | | if( cpu&X264_CPU_XOP ) | 636 | | { | 637 | | dctf->sub8x8_dct = x264_sub8x8_dct_xop; | 638 | | dctf->sub16x16_dct = x264_sub16x16_dct_xop; | 639 | | } | 640 | | | 641 | | if( cpu&X264_CPU_AVX2 ) | 642 | | { | 643 | | dctf->add8x8_idct = x264_add8x8_idct_avx2; | 644 | | dctf->add16x16_idct = x264_add16x16_idct_avx2; | 645 | | dctf->sub8x8_dct = x264_sub8x8_dct_avx2; | 646 | | dctf->sub16x16_dct = x264_sub16x16_dct_avx2; | 647 | | dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx2; | 648 | | #if ARCH_X86_64 | 649 | | dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx2; | 650 | | #endif | 651 | | } | 652 | | | 653 | | if( cpu&X264_CPU_AVX512 ) | 654 | | { | 655 | | dctf->sub4x4_dct = x264_sub4x4_dct_avx512; | 656 | | dctf->sub8x8_dct = x264_sub8x8_dct_avx512; | 657 | | dctf->sub16x16_dct = x264_sub16x16_dct_avx512; | 658 | | dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_avx512; | 659 | | dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx512; | 660 | | dctf->add8x8_idct = x264_add8x8_idct_avx512; | 661 | | } | 662 | | #endif //HAVE_MMX | 663 | | | 664 | | #if HAVE_ALTIVEC | 665 | | if( cpu&X264_CPU_ALTIVEC ) | 666 | | { | 667 | | dctf->sub4x4_dct = x264_sub4x4_dct_altivec; | 668 | | dctf->sub8x8_dct = x264_sub8x8_dct_altivec; | 669 | | dctf->sub16x16_dct = x264_sub16x16_dct_altivec; | 670 | | | 671 | | dctf->add8x8_idct_dc = x264_add8x8_idct_dc_altivec; | 672 | | dctf->add16x16_idct_dc = x264_add16x16_idct_dc_altivec; | 673 | | | 674 | | dctf->add4x4_idct = x264_add4x4_idct_altivec; | 675 | | dctf->add8x8_idct = x264_add8x8_idct_altivec; | 676 | | dctf->add16x16_idct = x264_add16x16_idct_altivec; | 677 | | | 678 | | dctf->sub8x8_dct_dc = 
x264_sub8x8_dct_dc_altivec; | 679 | | dctf->sub8x8_dct8 = x264_sub8x8_dct8_altivec; | 680 | | dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec; | 681 | | | 682 | | dctf->add8x8_idct8 = x264_add8x8_idct8_altivec; | 683 | | dctf->add16x16_idct8= x264_add16x16_idct8_altivec; | 684 | | } | 685 | | #endif | 686 | | | 687 | | #if HAVE_ARMV6 || HAVE_AARCH64 | 688 | | if( cpu&X264_CPU_NEON ) | 689 | | { | 690 | | dctf->sub4x4_dct = x264_sub4x4_dct_neon; | 691 | | dctf->sub8x8_dct = x264_sub8x8_dct_neon; | 692 | | dctf->sub16x16_dct = x264_sub16x16_dct_neon; | 693 | | dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon; | 694 | | dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon; | 695 | | dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon; | 696 | | dctf->dct4x4dc = x264_dct4x4dc_neon; | 697 | | dctf->idct4x4dc = x264_idct4x4dc_neon; | 698 | | | 699 | | dctf->add4x4_idct = x264_add4x4_idct_neon; | 700 | | dctf->add8x8_idct = x264_add8x8_idct_neon; | 701 | | dctf->add16x16_idct = x264_add16x16_idct_neon; | 702 | | | 703 | | dctf->sub8x8_dct8 = x264_sub8x8_dct8_neon; | 704 | | dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon; | 705 | | | 706 | | dctf->add8x8_idct8 = x264_add8x8_idct8_neon; | 707 | | dctf->add16x16_idct8= x264_add16x16_idct8_neon; | 708 | | dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_neon; | 709 | | } | 710 | | #if HAVE_SVE | 711 | | if ( cpu&X264_CPU_SVE ) | 712 | | { | 713 | | dctf->sub4x4_dct = x264_sub4x4_dct_sve; | 714 | | } | 715 | | #endif | 716 | | #if HAVE_SVE2 | 717 | | if ( cpu&X264_CPU_SVE2 ) | 718 | | { | 719 | | dctf->add4x4_idct = x264_add4x4_idct_sve2; | 720 | | } | 721 | | #endif | 722 | | #endif | 723 | | | 724 | | #if HAVE_MSA | 725 | | if( cpu&X264_CPU_MSA ) | 726 | | { | 727 | | dctf->sub4x4_dct = x264_sub4x4_dct_msa; | 728 | | dctf->sub8x8_dct = x264_sub8x8_dct_msa; | 729 | | dctf->sub16x16_dct = x264_sub16x16_dct_msa; | 730 | | dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_msa; | 731 | | dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_msa; | 732 | | 
dctf->dct4x4dc = x264_dct4x4dc_msa; | 733 | | dctf->idct4x4dc = x264_idct4x4dc_msa; | 734 | | dctf->add4x4_idct = x264_add4x4_idct_msa; | 735 | | dctf->add8x8_idct = x264_add8x8_idct_msa; | 736 | | dctf->add8x8_idct_dc = x264_add8x8_idct_dc_msa; | 737 | | dctf->add16x16_idct = x264_add16x16_idct_msa; | 738 | | dctf->add16x16_idct_dc = x264_add16x16_idct_dc_msa; | 739 | | dctf->add8x8_idct8 = x264_add8x8_idct8_msa; | 740 | | dctf->add16x16_idct8 = x264_add16x16_idct8_msa; | 741 | | } | 742 | | #endif | 743 | | | 744 | | #if HAVE_LSX | 745 | | if( cpu&X264_CPU_LSX ) | 746 | | { | 747 | | dctf->sub4x4_dct = x264_sub4x4_dct_lsx; | 748 | | dctf->add4x4_idct = x264_add4x4_idct_lsx; | 749 | | dctf->dct4x4dc = x264_dct4x4dc_lsx; | 750 | | dctf->idct4x4dc = x264_idct4x4dc_lsx; | 751 | | dctf->sub8x8_dct8 = x264_sub8x8_dct8_lsx; | 752 | | dctf->sub8x8_dct = x264_sub8x8_dct_lsx; | 753 | | dctf->add8x8_idct = x264_add8x8_idct_lsx; | 754 | | dctf->add8x8_idct8 = x264_add8x8_idct8_lsx; | 755 | | dctf->add8x8_idct_dc = x264_add8x8_idct_dc_lsx; | 756 | | dctf->add16x16_idct = x264_add16x16_idct_lsx; | 757 | | dctf->sub16x16_dct = x264_sub16x16_dct_lsx; | 758 | | dctf->add16x16_idct_dc = x264_add16x16_idct_dc_lsx; | 759 | | dctf->sub16x16_dct8 = x264_sub16x16_dct8_lsx; | 760 | | } | 761 | | if( cpu&X264_CPU_LASX ) | 762 | | { | 763 | | dctf->sub8x8_dct = x264_sub8x8_dct_lasx; | 764 | | dctf->sub16x16_dct = x264_sub16x16_dct_lasx; | 765 | | dctf->add8x8_idct = x264_add8x8_idct_lasx; | 766 | | dctf->add8x8_idct8 = x264_add8x8_idct8_lasx; | 767 | | dctf->add16x16_idct = x264_add16x16_idct_lasx; | 768 | | dctf->sub16x16_dct8 = x264_sub16x16_dct8_lasx; | 769 | | dctf->add8x8_idct_dc = x264_add8x8_idct_dc_lasx; | 770 | | dctf->add16x16_idct_dc = x264_add16x16_idct_dc_lasx; | 771 | | dctf->dct4x4dc = x264_dct4x4dc_lasx; | 772 | | dctf->idct4x4dc = x264_idct4x4dc_lasx; | 773 | | } | 774 | | #endif | 775 | | | 776 | 169 | #endif // HIGH_BIT_DEPTH | 777 | 169 | } |
Unexecuted instantiation: x264_10_dct_init |
778 | | |
779 | | |
/* Scan-table element for the plain 8x8 scans: output slot i takes dct[x*8+y].
 * ZIG is #undef'd and redefined further down, so every ZIGZAG table expands
 * with whichever ZIG definition is active at the point where it is used. */
780 | 0 | #define ZIG(i,y,x) level[i] = dct[x*8+y]; |
/* 8x8 scan order used by zigzag_scan_8x8_frame and zigzag_sub_8x8_frame:
 * one ZIG entry per output index 0..63.
 * The last row also ends in a continuation backslash, so the line following
 * the table is spliced into the macro and has to stay empty. */
781 | | #define ZIGZAG8_FRAME\ |
782 | 0 | ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\ |
783 | 0 | ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\ |
784 | 0 | ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\ |
785 | 0 | ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\ |
786 | 0 | ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\ |
787 | 0 | ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\ |
788 | 0 | ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\ |
789 | 0 | ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\ |
790 | 0 | ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\ |
791 | 0 | ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\ |
792 | 0 | ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\ |
793 | 0 | ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\ |
794 | 0 | ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\ |
795 | 0 | ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\ |
796 | 0 | ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\ |
797 | 0 | ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)\ |
798 | | |
/* 8x8 scan order used by zigzag_scan_8x8_field and zigzag_sub_8x8_field
 * (the variants x264_zigzag_init installs into the interlaced table):
 * one ZIG entry per output index 0..63. What each entry does depends on the
 * ZIG definition that is active where the table is expanded. */
799 | | #define ZIGZAG8_FIELD\ |
800 | 0 | ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\ |
801 | 0 | ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\ |
802 | 0 | ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\ |
803 | 0 | ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\ |
804 | 0 | ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\ |
805 | 0 | ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\ |
806 | 0 | ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\ |
807 | 0 | ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\ |
808 | 0 | ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\ |
809 | 0 | ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\ |
810 | 0 | ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\ |
811 | 0 | ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\ |
812 | 0 | ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\ |
813 | 0 | ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\ |
814 | 0 | ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\ |
815 | 0 | ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7) |
816 | | |
/* 4x4 scan order for the frame (progressive) 4x4 functions, output indices
 * 0..15. Index 0 goes through ZIGDC instead of ZIG so that the 4x4ac
 * functions can treat the first coefficient differently; where ZIGDC is
 * defined as a plain forward to ZIG the table behaves like 16 ZIG entries. */
817 | | #define ZIGZAG4_FRAME\ |
818 | 362k | ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\ |
819 | 362k | ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\ |
820 | 362k | ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\ |
821 | 362k | ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3) |
822 | | |
/* 4x4 scan order for the field (interlaced) 4x4 functions, output indices
 * 0..15, with index 0 routed through ZIGDC like ZIGZAG4_FRAME.
 * Expanded by zigzag_sub_4x4_field and zigzag_sub_4x4ac_field;
 * zigzag_scan_4x4_field produces the same ordering without this table. */
823 | | #define ZIGZAG4_FIELD\ |
824 | 0 | ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\ |
825 | 0 | ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\ |
826 | 0 | ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\ |
827 | 0 | ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3) |
828 | | |
/* Reorders the 64 coefficients of dct into level following ZIGZAG8_FRAME.
 * With the ZIG definition active here each table entry expands to
 * level[i] = dct[x*8+y]; dct is only read. */
829 | | static void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[64] ) |
830 | 0 | { |
831 | 0 | ZIGZAG8_FRAME |
832 | 0 | } |
833 | | |
/* Reorders the 64 coefficients of dct into level following ZIGZAG8_FIELD.
 * With the ZIG definition active here each table entry expands to
 * level[i] = dct[x*8+y]; dct is only read. */
834 | | static void zigzag_scan_8x8_field( dctcoef level[64], dctcoef dct[64] ) |
835 | 0 | { |
836 | 0 | ZIGZAG8_FIELD |
837 | 0 | } |
838 | | |
/* Switch ZIG from 8x8 to 4x4 indexing: output slot i now takes dct[x*4+y].
 * ZIGDC simply forwards to whatever ZIG is defined as at expansion time, so
 * index 0 of the 4x4 tables is handled like any other entry until ZIGDC is
 * redefined further down for the 4x4ac functions. */
839 | | #undef ZIG |
840 | 237k | #define ZIG(i,y,x) level[i] = dct[x*4+y]; |
841 | 22.7k | #define ZIGDC(i,y,x) ZIG(i,y,x) |
842 | | |
/* Reorders the 16 coefficients of dct into level following ZIGZAG4_FRAME.
 * Both ZIG and ZIGDC expand to level[i] = dct[x*4+y] at this point, so all
 * 16 entries are plain copies; dct is only read. */
843 | | static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] ) |
844 | 14.8k | { |
845 | 14.8k | ZIGZAG4_FRAME |
846 | 14.8k | } |
847 | | |
/* Field-order scan of a 4x4 block, written without the ZIGZAG4_FIELD table.
 * In that ordering only output positions 2..4 differ from the input order
 * (level[2]=dct[4], level[3]=dct[2], level[4]=dct[3]; level[5]=dct[5]), so
 * the first 2 and the last 10 coefficients are copied with memcpy and just
 * the four entries in between are placed individually. */
848 | | static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] ) |
849 | 0 | { |
850 | 0 | memcpy( level, dct, 2 * sizeof(dctcoef) ); |
851 | 0 | ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1) |
852 | 0 | memcpy( level+6, dct+6, 10 * sizeof(dctcoef) ); |
853 | 0 | } |
854 | | |
/* Redefine ZIG for the zigzag_sub_* functions. Instead of copying a
 * coefficient, each entry stores the difference between the source sample at
 * x+y*FENC_STRIDE and the destination sample at x+y*FDEC_STRIDE into
 * level[i], and ORs that value into nz.
 * Expands to a braced block and expects level, p_src, p_dst and nz to be
 * in scope at the point of use. */
855 | | #undef ZIG |
856 | 5.21M | #define ZIG(i,y,x) {\ |
857 | 5.21M | int oe = x+y*FENC_STRIDE;\ |
858 | 5.21M | int od = x+y*FDEC_STRIDE;\ |
859 | 5.21M | level[i] = p_src[oe] - p_dst[od];\ |
860 | 5.21M | nz |= level[i];\ |
861 | 5.21M | } |
/* Applies CPPIXEL_X4 to rows 0..3, from p_src (row pitch FENC_STRIDE) to
 * p_dst (row pitch FDEC_STRIDE). CPPIXEL_X4 is defined outside this section;
 * presumably it copies 4 pixels from its second argument to its first -
 * TODO confirm against its definition. */
862 | | #define COPY4x4\ |
863 | 347k | CPPIXEL_X4( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\ |
864 | 347k | CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\ |
865 | 347k | CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\ |
866 | 347k | CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE ); |
/* CPPIXEL_X8 is two CPPIXEL_X4 operations, at offsets 0 and 4, joined with
 * the comma operator so the macro stays a single expression. */
867 | 0 | #define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) ) |
/* Applies CPPIXEL_X8 to rows 0..7, from p_src (row pitch FENC_STRIDE) to
 * p_dst (row pitch FDEC_STRIDE). */
868 | | #define COPY8x8\ |
869 | 0 | CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\ |
870 | 0 | CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\ |
871 | 0 | CPPIXEL_X8( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\ |
872 | 0 | CPPIXEL_X8( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\ |
873 | 0 | CPPIXEL_X8( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\ |
874 | 0 | CPPIXEL_X8( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\ |
875 | 0 | CPPIXEL_X8( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\ |
876 | 0 | CPPIXEL_X8( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE ); |
877 | | |
/* Stores the differences p_src - p_dst of a 4x4 block into level in
 * ZIGZAG4_FRAME order (ZIGDC forwards to ZIG here, so index 0 is included),
 * then applies COPY4x4 from p_src to p_dst.
 * Returns 1 if any stored difference is non-zero, 0 otherwise. */
878 | | static int zigzag_sub_4x4_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst ) |
879 | 7.95k | { |
880 | 7.95k | int nz = 0; |
881 | 7.95k | ZIGZAG4_FRAME |
882 | 7.95k | COPY4x4 |
883 | 7.95k | return !!nz; |
884 | 7.95k | } |
885 | | |
/* Stores the differences p_src - p_dst of a 4x4 block into level in
 * ZIGZAG4_FIELD order (ZIGDC forwards to ZIG here, so index 0 is included),
 * then applies COPY4x4 from p_src to p_dst.
 * Returns 1 if any stored difference is non-zero, 0 otherwise. */
886 | | static int zigzag_sub_4x4_field( dctcoef level[16], const pixel *p_src, pixel *p_dst ) |
887 | 0 | { |
888 | 0 | int nz = 0; |
889 | 0 | ZIGZAG4_FIELD |
890 | 0 | COPY4x4 |
891 | 0 | return !!nz; |
892 | 0 | } |
893 | | |
/* Redefine ZIGDC for the 4x4ac functions: the difference for the first table
 * entry is written to *dc instead of level, level[0] is forced to 0, and nz
 * is left untouched, so that entry does not affect the caller's non-zero
 * result. Expects level, p_src, p_dst and dc to be in scope. */
894 | | #undef ZIGDC |
895 | 339k | #define ZIGDC(i,y,x) {\ |
896 | 339k | int oe = x+y*FENC_STRIDE;\ |
897 | 339k | int od = x+y*FDEC_STRIDE;\ |
898 | 339k | *dc = p_src[oe] - p_dst[od];\ |
899 | 339k | level[0] = 0;\ |
900 | 339k | } |
901 | | |
/* Like zigzag_sub_4x4_frame, except for the first table entry: with the
 * ZIGDC definition active here that difference goes to *dc, level[0] is set
 * to 0, and it is not counted in the return value.
 * Entries 1..15 are stored in ZIGZAG4_FRAME order, then COPY4x4 is applied
 * from p_src to p_dst. Returns 1 if any of entries 1..15 is non-zero. */
902 | | static int zigzag_sub_4x4ac_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc ) |
903 | 339k | { |
904 | 339k | int nz = 0; |
905 | 339k | ZIGZAG4_FRAME |
906 | 339k | COPY4x4 |
907 | 339k | return !!nz; |
908 | 339k | } |
909 | | |
/* Like zigzag_sub_4x4_field, except for the first table entry: with the
 * ZIGDC definition active here that difference goes to *dc, level[0] is set
 * to 0, and it is not counted in the return value.
 * Entries 1..15 are stored in ZIGZAG4_FIELD order, then COPY4x4 is applied
 * from p_src to p_dst. Returns 1 if any of entries 1..15 is non-zero. */
910 | | static int zigzag_sub_4x4ac_field( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc ) |
911 | 0 | { |
912 | 0 | int nz = 0; |
913 | 0 | ZIGZAG4_FIELD |
914 | 0 | COPY4x4 |
915 | 0 | return !!nz; |
916 | 0 | } |
917 | | |
/* Stores the differences p_src - p_dst of an 8x8 block into level in
 * ZIGZAG8_FRAME order (using the subtracting ZIG definition), then applies
 * COPY8x8 from p_src to p_dst.
 * Returns 1 if any stored difference is non-zero, 0 otherwise. */
918 | | static int zigzag_sub_8x8_frame( dctcoef level[64], const pixel *p_src, pixel *p_dst ) |
919 | 0 | { |
920 | 0 | int nz = 0; |
921 | 0 | ZIGZAG8_FRAME |
922 | 0 | COPY8x8 |
923 | 0 | return !!nz; |
924 | 0 | } |
/* Stores the differences p_src - p_dst of an 8x8 block into level in
 * ZIGZAG8_FIELD order (using the subtracting ZIG definition), then applies
 * COPY8x8 from p_src to p_dst.
 * Returns 1 if any stored difference is non-zero, 0 otherwise. */
925 | | static int zigzag_sub_8x8_field( dctcoef level[64], const pixel *p_src, pixel *p_dst ) |
926 | 0 | { |
927 | 0 | int nz = 0; |
928 | 0 | ZIGZAG8_FIELD |
929 | 0 | COPY8x8 |
930 | 0 | return !!nz; |
931 | 0 | } |
932 | | |
/* The scan helpers are finished: drop ZIG and COPY4x4. ZIGDC, CPPIXEL_X8 and
 * COPY8x8 are not #undef'd in this part of the file. */
933 | | #undef ZIG |
934 | | #undef COPY4x4 |
935 | | |
/* De-interleaves 64 coefficients into four groups of 16.
 * Group i (0..3) collects src[i], src[i+4], ..., src[i+60] and stores them
 * contiguously at dst[i*16 .. i*16+15].
 * For each group one flag is written to nnz: 1 if any coefficient of the
 * group is non-zero, else 0. The flags land at nnz offsets 0, 1, 8 and 9
 * for groups 0, 1, 2 and 3. */
936 | | static void zigzag_interleave_8x8_cavlc( dctcoef *dst, dctcoef *src, uint8_t *nnz ) |
937 | 0 | { |
938 | 0 | for( int i = 0; i < 4; i++ ) |
939 | 0 | { |
940 | 0 | int nz = 0; |
941 | 0 | for( int j = 0; j < 16; j++ ) |
942 | 0 | { |
943 | 0 | nz |= src[i+j*4]; |
944 | 0 | dst[i*16+j] = src[i+j*4]; |
945 | 0 | } |
/* (i&1) + (i>>1)*8 maps i = 0,1,2,3 to 0,1,8,9. */
946 | 0 | nnz[(i&1) + (i>>1)*8] = !!nz; |
947 | 0 | } |
948 | 0 | } |
949 | | |
/* Fills the two zigzag function tables.
 *
 * cpu            - bit mask tested against the X264_CPU_* flags below.
 * pf_progressive - table that receives the *_frame implementations.
 * pf_interlaced  - table that receives the *_field implementations.
 *
 * Both tables start out with the C functions defined above. Entries are then
 * overwritten by architecture-specific versions, selected by the compile-time
 * HAVE_* / ARCH_* / HIGH_BIT_DEPTH switches and the runtime cpu bits. The
 * if-blocks run in sequence and later assignments replace earlier ones, so
 * their order determines which implementation ends up installed. */
950 | | void x264_zigzag_init( uint32_t cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced ) |
951 | 169 | { |
/* C fallbacks for every scan/sub entry. */
952 | 169 | pf_interlaced->scan_8x8 = zigzag_scan_8x8_field; |
953 | 169 | pf_progressive->scan_8x8 = zigzag_scan_8x8_frame; |
954 | 169 | pf_interlaced->scan_4x4 = zigzag_scan_4x4_field; |
955 | 169 | pf_progressive->scan_4x4 = zigzag_scan_4x4_frame; |
956 | 169 | pf_interlaced->sub_8x8 = zigzag_sub_8x8_field; |
957 | 169 | pf_progressive->sub_8x8 = zigzag_sub_8x8_frame; |
958 | 169 | pf_interlaced->sub_4x4 = zigzag_sub_4x4_field; |
959 | 169 | pf_progressive->sub_4x4 = zigzag_sub_4x4_frame; |
960 | 169 | pf_interlaced->sub_4x4ac = zigzag_sub_4x4ac_field; |
961 | 169 | pf_progressive->sub_4x4ac = zigzag_sub_4x4ac_frame; |
962 | | |
/* Scan/sub overrides. The HIGH_BIT_DEPTH branch only has x86 overrides, and
 * only for the scan_* entries. */
963 | | #if HIGH_BIT_DEPTH |
964 | | #if HAVE_MMX |
965 | | if( cpu&X264_CPU_SSE2 ) |
966 | | { |
967 | | pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_sse2; |
968 | | pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2; |
969 | | pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2; |
970 | | } |
971 | | if( cpu&X264_CPU_SSE4 ) |
972 | | pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_sse4; |
973 | | if( cpu&X264_CPU_AVX ) |
974 | | pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx; |
975 | | #if ARCH_X86_64 |
976 | | if( cpu&X264_CPU_AVX ) |
977 | | { |
978 | | pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx; |
979 | | pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx; |
980 | | } |
981 | | #endif // ARCH_X86_64 |
982 | | if( cpu&X264_CPU_AVX512 ) |
983 | | { |
984 | | pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_avx512; |
985 | | pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512; |
986 | | pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx512; |
987 | | pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512; |
988 | | } |
989 | | #endif // HAVE_MMX |
990 | | #else |
991 | | #if HAVE_MMX |
992 | | if( cpu&X264_CPU_MMX ) |
993 | | pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx; |
994 | | if( cpu&X264_CPU_MMX2 ) |
995 | | { |
996 | | pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_mmx2; |
997 | | pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmx2; |
998 | | } |
999 | | if( cpu&X264_CPU_SSE ) |
1000 | | pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_sse; |
1001 | | if( cpu&X264_CPU_SSE2_IS_FAST ) |
1002 | | pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2; |
1003 | | if( cpu&X264_CPU_SSSE3 ) |
1004 | | { |
1005 | | pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3; |
1006 | | pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3; |
1007 | | pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_ssse3; |
1008 | | pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3; |
1009 | | pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3; |
1010 | | if( !(cpu&X264_CPU_SLOW_SHUFFLE) ) |
1011 | | pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3; |
1012 | | } |
1013 | | if( cpu&X264_CPU_AVX ) |
1014 | | { |
1015 | | pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_avx; |
1016 | | pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_avx; |
1017 | | #if ARCH_X86_64 |
1018 | | pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_avx; |
1019 | | pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx; |
1020 | | #endif |
1021 | | pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx; |
1022 | | } |
1023 | | if( cpu&X264_CPU_XOP ) |
1024 | | { |
1025 | | pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_xop; |
1026 | | pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_xop; |
1027 | | pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_xop; |
1028 | | } |
1029 | | if( cpu&X264_CPU_AVX512 ) |
1030 | | { |
1031 | | pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_avx512; |
1032 | | pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512; |
1033 | | pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx512; |
1034 | | pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512; |
1035 | | } |
1036 | | #endif // HAVE_MMX |
1037 | | #if HAVE_ALTIVEC |
1038 | | if( cpu&X264_CPU_ALTIVEC ) |
1039 | | { |
1040 | | pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_altivec; |
1041 | | pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec; |
1042 | | pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_altivec; |
1043 | | } |
1044 | | #endif |
1045 | | #if HAVE_ARMV6 || HAVE_AARCH64 |
1046 | | if( cpu&X264_CPU_NEON ) |
1047 | | { |
1048 | | pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon; |
1049 | | #if HAVE_AARCH64 |
1050 | | pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_neon; |
1051 | | pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_neon; |
1052 | | pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_neon; |
1053 | | pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_neon; |
1054 | | pf_interlaced->sub_8x8 = x264_zigzag_sub_8x8_field_neon; |
1055 | | pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_neon; |
1056 | | pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_neon; |
1057 | | pf_progressive->sub_4x4ac = x264_zigzag_sub_4x4ac_frame_neon; |
1058 | | pf_progressive->sub_8x8 = x264_zigzag_sub_8x8_frame_neon; |
1059 | | #endif // HAVE_AARCH64 |
1060 | | } |
1061 | | #endif // HAVE_ARMV6 || HAVE_AARCH64 |
1062 | | #endif // HIGH_BIT_DEPTH |
1063 | | |
/* interleave_8x8_cavlc: the same C fallback goes into both tables. */
1064 | 169 | pf_interlaced->interleave_8x8_cavlc = |
1065 | 169 | pf_progressive->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc; |
1066 | | #if HAVE_MMX |
1067 | | #if HIGH_BIT_DEPTH |
1068 | | if( cpu&X264_CPU_SSE2 ) |
1069 | | { |
1070 | | pf_interlaced->interleave_8x8_cavlc = |
1071 | | pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2; |
1072 | | } |
1073 | | if( cpu&X264_CPU_AVX ) |
1074 | | { |
1075 | | pf_interlaced->interleave_8x8_cavlc = |
1076 | | pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx; |
1077 | | } |
1078 | | if( cpu&X264_CPU_AVX512 ) |
1079 | | { |
1080 | | pf_interlaced->interleave_8x8_cavlc = |
1081 | | pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx512; |
1082 | | } |
1083 | | #else |
1084 | | if( cpu&X264_CPU_MMX ) |
1085 | | { |
1086 | | pf_interlaced->interleave_8x8_cavlc = |
1087 | | pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx; |
1088 | | } |
1089 | | if( (cpu&X264_CPU_SSE2) && !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SSE2_IS_SLOW)) ) |
1090 | | { |
1091 | | pf_interlaced->interleave_8x8_cavlc = |
1092 | | pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2; |
1093 | | } |
1094 | | |
1095 | | if( cpu&X264_CPU_AVX ) |
1096 | | { |
1097 | | pf_interlaced->interleave_8x8_cavlc = |
1098 | | pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx; |
1099 | | } |
1100 | | |
1101 | | if( cpu&X264_CPU_AVX2 ) |
1102 | | { |
1103 | | pf_interlaced->interleave_8x8_cavlc = |
1104 | | pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2; |
1105 | | } |
1106 | | if( cpu&X264_CPU_AVX512 ) |
1107 | | { |
1108 | | pf_interlaced->interleave_8x8_cavlc = |
1109 | | pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx512; |
1110 | | } |
1111 | | #endif // HIGH_BIT_DEPTH |
1112 | | #endif |
1113 | | #if !HIGH_BIT_DEPTH |
1114 | | #if HAVE_AARCH64 |
1115 | | if( cpu&X264_CPU_NEON ) |
1116 | | { |
1117 | | pf_interlaced->interleave_8x8_cavlc = |
1118 | | pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_neon; |
1119 | | } |
1120 | | #if HAVE_SVE |
/* NOTE(review): every other interleave_8x8_cavlc assignment in this function
 * updates both tables; this branch updates only pf_progressive, so
 * pf_interlaced keeps whatever was installed before it. Confirm this is
 * intentional. */
1121 | | if( cpu&X264_CPU_SVE ) |
1122 | | { |
1123 | | pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sve; |
1124 | | } |
1125 | | #endif |
1126 | | #endif // HAVE_AARCH64 |
1127 | | |
1128 | | #if HAVE_ALTIVEC |
1129 | | if( cpu&X264_CPU_ALTIVEC ) |
1130 | | { |
1131 | | pf_interlaced->interleave_8x8_cavlc = |
1132 | | pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_altivec; |
1133 | | } |
1134 | | #endif // HAVE_ALTIVEC |
1135 | | |
/* The MSA and LASX branches below override the progressive scan_4x4 entry,
 * not interleave_8x8_cavlc. The LASX flag test is compiled under HAVE_LSX. */
1136 | | #if HAVE_MSA |
1137 | | if( cpu&X264_CPU_MSA ) |
1138 | | { |
1139 | | pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_msa; |
1140 | | } |
1141 | | #endif |
1142 | | |
1143 | | #if HAVE_LSX |
1144 | | if( cpu&X264_CPU_LASX ) |
1145 | | { |
1146 | | pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_lasx; |
1147 | | } |
1148 | | #endif |
1149 | | #endif // !HIGH_BIT_DEPTH |
1150 | 169 | } Line | Count | Source | 951 | 169 | { | 952 | 169 | pf_interlaced->scan_8x8 = zigzag_scan_8x8_field; | 953 | 169 | pf_progressive->scan_8x8 = zigzag_scan_8x8_frame; | 954 | 169 | pf_interlaced->scan_4x4 = zigzag_scan_4x4_field; | 955 | 169 | pf_progressive->scan_4x4 = zigzag_scan_4x4_frame; | 956 | 169 | pf_interlaced->sub_8x8 = zigzag_sub_8x8_field; | 957 | 169 | pf_progressive->sub_8x8 = zigzag_sub_8x8_frame; | 958 | 169 | pf_interlaced->sub_4x4 = zigzag_sub_4x4_field; | 959 | 169 | pf_progressive->sub_4x4 = zigzag_sub_4x4_frame; | 960 | 169 | pf_interlaced->sub_4x4ac = zigzag_sub_4x4ac_field; | 961 | 169 | pf_progressive->sub_4x4ac = zigzag_sub_4x4ac_frame; | 962 | | | 963 | | #if HIGH_BIT_DEPTH | 964 | | #if HAVE_MMX | 965 | | if( cpu&X264_CPU_SSE2 ) | 966 | | { | 967 | | pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_sse2; | 968 | | pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2; | 969 | | pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2; | 970 | | } | 971 | | if( cpu&X264_CPU_SSE4 ) | 972 | | pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_sse4; | 973 | | if( cpu&X264_CPU_AVX ) | 974 | | pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx; | 975 | | #if ARCH_X86_64 | 976 | | if( cpu&X264_CPU_AVX ) | 977 | | { | 978 | | pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx; | 979 | | pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx; | 980 | | } | 981 | | #endif // ARCH_X86_64 | 982 | | if( cpu&X264_CPU_AVX512 ) | 983 | | { | 984 | | pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_avx512; | 985 | | pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512; | 986 | | pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx512; | 987 | | pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512; | 988 | | } | 989 | | #endif // HAVE_MMX | 990 | | #else | 991 | | #if HAVE_MMX | 992 | | if( cpu&X264_CPU_MMX ) | 993 | | pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx; | 994 | | if( 
cpu&X264_CPU_MMX2 ) | 995 | | { | 996 | | pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_mmx2; | 997 | | pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmx2; | 998 | | } | 999 | | if( cpu&X264_CPU_SSE ) | 1000 | | pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_sse; | 1001 | | if( cpu&X264_CPU_SSE2_IS_FAST ) | 1002 | | pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2; | 1003 | | if( cpu&X264_CPU_SSSE3 ) | 1004 | | { | 1005 | | pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3; | 1006 | | pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3; | 1007 | | pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_ssse3; | 1008 | | pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3; | 1009 | | pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3; | 1010 | | if( !(cpu&X264_CPU_SLOW_SHUFFLE) ) | 1011 | | pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3; | 1012 | | } | 1013 | | if( cpu&X264_CPU_AVX ) | 1014 | | { | 1015 | | pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_avx; | 1016 | | pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_avx; | 1017 | | #if ARCH_X86_64 | 1018 | | pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_avx; | 1019 | | pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx; | 1020 | | #endif | 1021 | | pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx; | 1022 | | } | 1023 | | if( cpu&X264_CPU_XOP ) | 1024 | | { | 1025 | | pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_xop; | 1026 | | pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_xop; | 1027 | | pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_xop; | 1028 | | } | 1029 | | if( cpu&X264_CPU_AVX512 ) | 1030 | | { | 1031 | | pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_avx512; | 1032 | | pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512; | 1033 | | pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx512; | 1034 | | pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512; | 
1035 | | } | 1036 | | #endif // HAVE_MMX | 1037 | | #if HAVE_ALTIVEC | 1038 | | if( cpu&X264_CPU_ALTIVEC ) | 1039 | | { | 1040 | | pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_altivec; | 1041 | | pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec; | 1042 | | pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_altivec; | 1043 | | } | 1044 | | #endif | 1045 | | #if HAVE_ARMV6 || HAVE_AARCH64 | 1046 | | if( cpu&X264_CPU_NEON ) | 1047 | | { | 1048 | | pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon; | 1049 | | #if HAVE_AARCH64 | 1050 | | pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_neon; | 1051 | | pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_neon; | 1052 | | pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_neon; | 1053 | | pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_neon; | 1054 | | pf_interlaced->sub_8x8 = x264_zigzag_sub_8x8_field_neon; | 1055 | | pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_neon; | 1056 | | pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_neon; | 1057 | | pf_progressive->sub_4x4ac = x264_zigzag_sub_4x4ac_frame_neon; | 1058 | | pf_progressive->sub_8x8 = x264_zigzag_sub_8x8_frame_neon; | 1059 | | #endif // HAVE_AARCH64 | 1060 | | } | 1061 | | #endif // HAVE_ARMV6 || HAVE_AARCH64 | 1062 | 169 | #endif // HIGH_BIT_DEPTH | 1063 | | | 1064 | 169 | pf_interlaced->interleave_8x8_cavlc = | 1065 | 169 | pf_progressive->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc; | 1066 | | #if HAVE_MMX | 1067 | | #if HIGH_BIT_DEPTH | 1068 | | if( cpu&X264_CPU_SSE2 ) | 1069 | | { | 1070 | | pf_interlaced->interleave_8x8_cavlc = | 1071 | | pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2; | 1072 | | } | 1073 | | if( cpu&X264_CPU_AVX ) | 1074 | | { | 1075 | | pf_interlaced->interleave_8x8_cavlc = | 1076 | | pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx; | 1077 | | } | 1078 | | if( cpu&X264_CPU_AVX512 ) | 1079 | | { | 1080 | | 
pf_interlaced->interleave_8x8_cavlc = | 1081 | | pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx512; | 1082 | | } | 1083 | | #else | 1084 | | if( cpu&X264_CPU_MMX ) | 1085 | | { | 1086 | | pf_interlaced->interleave_8x8_cavlc = | 1087 | | pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx; | 1088 | | } | 1089 | | if( (cpu&X264_CPU_SSE2) && !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SSE2_IS_SLOW)) ) | 1090 | | { | 1091 | | pf_interlaced->interleave_8x8_cavlc = | 1092 | | pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2; | 1093 | | } | 1094 | | | 1095 | | if( cpu&X264_CPU_AVX ) | 1096 | | { | 1097 | | pf_interlaced->interleave_8x8_cavlc = | 1098 | | pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx; | 1099 | | } | 1100 | | | 1101 | | if( cpu&X264_CPU_AVX2 ) | 1102 | | { | 1103 | | pf_interlaced->interleave_8x8_cavlc = | 1104 | | pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2; | 1105 | | } | 1106 | | if( cpu&X264_CPU_AVX512 ) | 1107 | | { | 1108 | | pf_interlaced->interleave_8x8_cavlc = | 1109 | | pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx512; | 1110 | | } | 1111 | | #endif // HIGH_BIT_DEPTH | 1112 | | #endif | 1113 | 169 | #if !HIGH_BIT_DEPTH | 1114 | | #if HAVE_AARCH64 | 1115 | | if( cpu&X264_CPU_NEON ) | 1116 | | { | 1117 | | pf_interlaced->interleave_8x8_cavlc = | 1118 | | pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_neon; | 1119 | | } | 1120 | | #if HAVE_SVE | 1121 | | if( cpu&X264_CPU_SVE ) | 1122 | | { | 1123 | | pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sve; | 1124 | | } | 1125 | | #endif | 1126 | | #endif // HAVE_AARCH64 | 1127 | | | 1128 | | #if HAVE_ALTIVEC | 1129 | | if( cpu&X264_CPU_ALTIVEC ) | 1130 | | { | 1131 | | pf_interlaced->interleave_8x8_cavlc = | 1132 | | pf_progressive->interleave_8x8_cavlc = 
x264_zigzag_interleave_8x8_cavlc_altivec; | 1133 | | } | 1134 | | #endif // HAVE_ALTIVEC | 1135 | | | 1136 | | #if HAVE_MSA | 1137 | | if( cpu&X264_CPU_MSA ) | 1138 | | { | 1139 | | pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_msa; | 1140 | | } | 1141 | | #endif | 1142 | | | 1143 | | #if HAVE_LSX | 1144 | | if( cpu&X264_CPU_LASX ) | 1145 | | { | 1146 | | pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_lasx; | 1147 | | } | 1148 | | #endif | 1149 | 169 | #endif // !HIGH_BIT_DEPTH | 1150 | 169 | } |
Unexecuted instantiation: x264_10_zigzag_init |