Line | Count | Source |
1 | | /***************************************************************************** |
2 | | * pixel.c: pixel metrics |
3 | | ***************************************************************************** |
4 | | * Copyright (C) 2003-2025 x264 project |
5 | | * |
6 | | * Authors: Loren Merritt <lorenm@u.washington.edu> |
7 | | * Laurent Aimar <fenrir@via.ecp.fr> |
8 | | * Fiona Glaser <fiona@x264.com> |
9 | | * |
10 | | * This program is free software; you can redistribute it and/or modify |
11 | | * it under the terms of the GNU General Public License as published by |
12 | | * the Free Software Foundation; either version 2 of the License, or |
13 | | * (at your option) any later version. |
14 | | * |
15 | | * This program is distributed in the hope that it will be useful, |
16 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
18 | | * GNU General Public License for more details. |
19 | | * |
20 | | * You should have received a copy of the GNU General Public License |
21 | | * along with this program; if not, write to the Free Software |
22 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. |
23 | | * |
24 | | * This program is also available under a commercial proprietary license. |
25 | | * For more information, contact us at licensing@x264.com. |
26 | | *****************************************************************************/ |
27 | | |
28 | | #include "common.h" |
29 | | |
30 | | #if HAVE_MMX |
31 | | # include "x86/pixel.h" |
32 | | # include "x86/predict.h" |
33 | | #endif |
34 | | #if HAVE_ALTIVEC |
35 | | # include "ppc/pixel.h" |
36 | | #endif |
37 | | #if HAVE_ARMV6 |
38 | | # include "arm/pixel.h" |
39 | | # include "arm/predict.h" |
40 | | #endif |
41 | | #if HAVE_AARCH64 |
42 | | # include "aarch64/pixel.h" |
43 | | # include "aarch64/predict.h" |
44 | | #endif |
45 | | #if HAVE_MSA |
46 | | # include "mips/pixel.h" |
47 | | #endif |
48 | | #if HAVE_LSX |
49 | | # include "loongarch/pixel.h" |
50 | | #endif |
51 | | |
52 | | /**************************************************************************** |
53 | | * pixel_sad_WxH |
54 | | ****************************************************************************/ |
55 | | #define PIXEL_SAD_C( name, lx, ly ) \ |
56 | | static int name( pixel *pix1, intptr_t i_stride_pix1, \ |
57 | 0 | pixel *pix2, intptr_t i_stride_pix2 ) \ |
58 | 0 | { \ |
59 | 0 | int i_sum = 0; \ |
60 | 0 | for( int y = 0; y < ly; y++ ) \ |
61 | 0 | { \ |
62 | 0 | for( int x = 0; x < lx; x++ ) \ |
63 | 0 | { \ |
64 | 0 | i_sum += abs( pix1[x] - pix2[x] ); \ |
65 | 0 | } \ |
66 | 0 | pix1 += i_stride_pix1; \ |
67 | 0 | pix2 += i_stride_pix2; \ |
68 | 0 | } \ |
69 | 0 | return i_sum; \ |
70 | 0 | } Unexecuted instantiation: pixel.c:x264_pixel_sad_16x16 Unexecuted instantiation: pixel.c:x264_pixel_sad_16x8 Unexecuted instantiation: pixel.c:x264_pixel_sad_8x16 Unexecuted instantiation: pixel.c:x264_pixel_sad_8x8 Unexecuted instantiation: pixel.c:x264_pixel_sad_8x4 Unexecuted instantiation: pixel.c:x264_pixel_sad_4x8 Unexecuted instantiation: pixel.c:x264_pixel_sad_4x4 Unexecuted instantiation: pixel.c:x264_pixel_sad_4x16 |
71 | | |
72 | | |
73 | | PIXEL_SAD_C( x264_pixel_sad_16x16, 16, 16 ) |
74 | | PIXEL_SAD_C( x264_pixel_sad_16x8, 16, 8 ) |
75 | | PIXEL_SAD_C( x264_pixel_sad_8x16, 8, 16 ) |
76 | | PIXEL_SAD_C( x264_pixel_sad_8x8, 8, 8 ) |
77 | | PIXEL_SAD_C( x264_pixel_sad_8x4, 8, 4 ) |
78 | | PIXEL_SAD_C( x264_pixel_sad_4x16, 4, 16 ) |
79 | | PIXEL_SAD_C( x264_pixel_sad_4x8, 4, 8 ) |
80 | | PIXEL_SAD_C( x264_pixel_sad_4x4, 4, 4 ) |
81 | | |
82 | | /**************************************************************************** |
83 | | * pixel_ssd_WxH |
84 | | ****************************************************************************/ |
85 | | #define PIXEL_SSD_C( name, lx, ly ) \ |
86 | | static int name( pixel *pix1, intptr_t i_stride_pix1, \ |
87 | 0 | pixel *pix2, intptr_t i_stride_pix2 ) \ |
88 | 0 | { \ |
89 | 0 | int i_sum = 0; \ |
90 | 0 | for( int y = 0; y < ly; y++ ) \ |
91 | 0 | { \ |
92 | 0 | for( int x = 0; x < lx; x++ ) \ |
93 | 0 | { \ |
94 | 0 | int d = pix1[x] - pix2[x]; \ |
95 | 0 | i_sum += d*d; \ |
96 | 0 | } \ |
97 | 0 | pix1 += i_stride_pix1; \ |
98 | 0 | pix2 += i_stride_pix2; \ |
99 | 0 | } \ |
100 | 0 | return i_sum; \ |
101 | 0 | } Unexecuted instantiation: pixel.c:x264_pixel_ssd_16x16 Unexecuted instantiation: pixel.c:x264_pixel_ssd_16x8 Unexecuted instantiation: pixel.c:x264_pixel_ssd_8x16 Unexecuted instantiation: pixel.c:x264_pixel_ssd_8x8 Unexecuted instantiation: pixel.c:x264_pixel_ssd_8x4 Unexecuted instantiation: pixel.c:x264_pixel_ssd_4x8 Unexecuted instantiation: pixel.c:x264_pixel_ssd_4x4 Unexecuted instantiation: pixel.c:x264_pixel_ssd_4x16 |
102 | | |
103 | | PIXEL_SSD_C( x264_pixel_ssd_16x16, 16, 16 ) |
104 | | PIXEL_SSD_C( x264_pixel_ssd_16x8, 16, 8 ) |
105 | | PIXEL_SSD_C( x264_pixel_ssd_8x16, 8, 16 ) |
106 | | PIXEL_SSD_C( x264_pixel_ssd_8x8, 8, 8 ) |
107 | | PIXEL_SSD_C( x264_pixel_ssd_8x4, 8, 4 ) |
108 | | PIXEL_SSD_C( x264_pixel_ssd_4x16, 4, 16 ) |
109 | | PIXEL_SSD_C( x264_pixel_ssd_4x8, 4, 8 ) |
110 | | PIXEL_SSD_C( x264_pixel_ssd_4x4, 4, 4 ) |
111 | | |
/* Sum of squared differences over an arbitrary width x height region.
 * Uses the (possibly asm-accelerated) fixed-size SSD kernels from pf for
 * the bulk of the area, then finishes the ragged right/bottom edges in C. */
uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, intptr_t i_pix1,
                             pixel *pix2, intptr_t i_pix2, int i_width, int i_height )
{
    uint64_t i_ssd = 0;
    int y;
    /* 16x16 kernels are only used when both pointers and both strides are
     * 16-byte aligned; otherwise fall back to 8-wide blocks. */
    int align = !(((intptr_t)pix1 | (intptr_t)pix2 | i_pix1 | i_pix2) & 15);

#define SSD(size) i_ssd += pf->ssd[size]( pix1 + y*i_pix1 + x, i_pix1, \
                                          pix2 + y*i_pix2 + x, i_pix2 );
    /* Tile the multiple-of-8 area with 16x16 (if aligned), 8x16 and 8x8 blocks. */
    for( y = 0; y < i_height-15; y += 16 )
    {
        int x = 0;
        if( align )
            for( ; x < i_width-15; x += 16 )
                SSD(PIXEL_16x16);
        for( ; x < i_width-7; x += 8 )
            SSD(PIXEL_8x16);
    }
    if( y < i_height-7 )
        for( int x = 0; x < i_width-7; x += 8 )
            SSD(PIXEL_8x8);
#undef SSD

    /* Per-pixel cleanup for the right edge (width % 8 columns) and the
     * bottom edge (height % 8 rows) not covered by the block kernels. */
#define SSD1 { int d = pix1[y*i_pix1+x] - pix2[y*i_pix2+x]; i_ssd += d*d; }
    if( i_width & 7 )
    {
        for( y = 0; y < (i_height & ~7); y++ )
            for( int x = i_width & ~7; x < i_width; x++ )
                SSD1;
    }
    if( i_height & 7 )
    {
        for( y = i_height & ~7; y < i_height; y++ )
            for( int x = 0; x < i_width; x++ )
                SSD1;
    }
#undef SSD1

    return i_ssd;
}
152 | | |
153 | | static void pixel_ssd_nv12_core( pixel *pixuv1, intptr_t stride1, pixel *pixuv2, intptr_t stride2, |
154 | | int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ) |
155 | 0 | { |
156 | 0 | *ssd_u = 0, *ssd_v = 0; |
157 | 0 | for( int y = 0; y < height; y++, pixuv1+=stride1, pixuv2+=stride2 ) |
158 | 0 | for( int x = 0; x < width; x++ ) |
159 | 0 | { |
160 | 0 | int du = pixuv1[2*x] - pixuv2[2*x]; |
161 | 0 | int dv = pixuv1[2*x+1] - pixuv2[2*x+1]; |
162 | 0 | *ssd_u += du*du; |
163 | 0 | *ssd_v += dv*dv; |
164 | 0 | } |
165 | 0 | } |
166 | | |
167 | | void x264_pixel_ssd_nv12( x264_pixel_function_t *pf, pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2, |
168 | | int i_width, int i_height, uint64_t *ssd_u, uint64_t *ssd_v ) |
169 | 0 | { |
170 | 0 | pf->ssd_nv12_core( pix1, i_pix1, pix2, i_pix2, i_width&~7, i_height, ssd_u, ssd_v ); |
171 | 0 | if( i_width&7 ) |
172 | 0 | { |
173 | 0 | uint64_t tmp[2]; |
174 | 0 | pixel_ssd_nv12_core( pix1+(i_width&~7), i_pix1, pix2+(i_width&~7), i_pix2, i_width&7, i_height, &tmp[0], &tmp[1] ); |
175 | 0 | *ssd_u += tmp[0]; |
176 | 0 | *ssd_v += tmp[1]; |
177 | 0 | } |
178 | 0 | } Unexecuted instantiation: x264_8_pixel_ssd_nv12 Unexecuted instantiation: x264_10_pixel_ssd_nv12 |
179 | | |
180 | | /**************************************************************************** |
181 | | * pixel_var_wxh |
182 | | ****************************************************************************/ |
183 | | #define PIXEL_VAR_C( name, w, h ) \ |
184 | 0 | static uint64_t name( pixel *pix, intptr_t i_stride ) \ |
185 | 0 | { \ |
186 | 0 | uint32_t sum = 0, sqr = 0; \ |
187 | 0 | for( int y = 0; y < h; y++ ) \ |
188 | 0 | { \ |
189 | 0 | for( int x = 0; x < w; x++ ) \ |
190 | 0 | { \ |
191 | 0 | sum += pix[x]; \ |
192 | 0 | sqr += pix[x] * pix[x]; \ |
193 | 0 | } \ |
194 | 0 | pix += i_stride; \ |
195 | 0 | } \ |
196 | 0 | return sum + ((uint64_t)sqr << 32); \ |
197 | 0 | } Unexecuted instantiation: pixel.c:pixel_var_16x16 Unexecuted instantiation: pixel.c:pixel_var_8x16 Unexecuted instantiation: pixel.c:pixel_var_8x8 |
198 | | |
199 | | PIXEL_VAR_C( pixel_var_16x16, 16, 16 ) |
200 | | PIXEL_VAR_C( pixel_var_8x16, 8, 16 ) |
201 | | PIXEL_VAR_C( pixel_var_8x8, 8, 8 ) |
202 | | |
203 | | /**************************************************************************** |
204 | | * pixel_var2_wxh |
205 | | ****************************************************************************/ |
/* Generate an 8-wide, h-tall variance-of-difference helper over two chroma
 * planes stored half a stride apart (U at offset 0, V at +FENC_STRIDE/2 /
 * +FDEC_STRIDE/2 — presumably; confirm against the fenc/fdec layout).
 * Stores the raw SSD of each plane in ssd[0]/ssd[1] and returns the summed
 * variance:  sqr - sum^2/(8*h)  per plane, with shift == log2(8*h). */
#define PIXEL_VAR2_C( name, h, shift ) \
static int name( pixel *fenc, pixel *fdec, int ssd[2] ) \
{ \
    int sum_u = 0, sum_v = 0, sqr_u = 0, sqr_v = 0; \
    for( int y = 0; y < h; y++ ) \
    { \
        for( int x = 0; x < 8; x++ ) \
        { \
            int diff_u = fenc[x] - fdec[x]; \
            int diff_v = fenc[x+FENC_STRIDE/2] - fdec[x+FDEC_STRIDE/2]; \
            sum_u += diff_u; \
            sum_v += diff_v; \
            sqr_u += diff_u * diff_u; \
            sqr_v += diff_v * diff_v; \
        } \
        fenc += FENC_STRIDE; \
        fdec += FDEC_STRIDE; \
    } \
    ssd[0] = sqr_u; \
    ssd[1] = sqr_v; \
    return sqr_u - ((int64_t)sum_u * sum_u >> shift) + \
           sqr_v - ((int64_t)sum_v * sum_v >> shift); \
}

PIXEL_VAR2_C( pixel_var2_8x16, 16, 7 )
PIXEL_VAR2_C( pixel_var2_8x8, 8, 6 )
232 | | |
/* sum_t holds one partial sum; sum2_t packs two sum_t lanes (low and high
 * halves) into one integer so the Hadamard code below can transform two
 * columns per scalar operation (pseudo-SIMD).  High bit depth needs 32-bit
 * lanes, hence the wider pair of types. */
#if BIT_DEPTH > 8
typedef uint32_t sum_t;
typedef uint64_t sum2_t;
#else
typedef uint16_t sum_t;
typedef uint32_t sum2_t;
#endif
#define BITS_PER_SUM (8 * sizeof(sum_t))

/* 4-point Hadamard butterfly: (d0,d1,d2,d3) = H4 * (s0,s1,s2,s3).
 * Operates lane-wise on packed sum2_t values as long as no lane overflows. */
#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\
    sum2_t t0 = s0 + s1;\
    sum2_t t1 = s0 - s1;\
    sum2_t t2 = s2 + s3;\
    sum2_t t3 = s2 - s3;\
    d0 = t0 + t2;\
    d2 = t0 - t2;\
    d1 = t1 + t3;\
    d3 = t1 - t3;\
}
252 | | |
// in: a pseudo-simd number of the form x+(y<<16)
// return: abs(x)+(abs(y)<<16)
static ALWAYS_INLINE sum2_t abs2( sum2_t a )
{
    /* Build a mask `s` that is all-ones in each BITS_PER_SUM-bit lane whose
     * value is negative (sign bit replicated via the *(sum_t)-1 multiply)
     * and zero elsewhere; then (a+s)^s negates exactly the negative lanes,
     * using the two's-complement identity -x == (x + ~0) ^ ~0. */
    sum2_t s = ((a>>(BITS_PER_SUM-1))&(((sum2_t)1<<BITS_PER_SUM)+1))*((sum_t)-1);
    return (a+s)^s;
}
260 | | |
261 | | /**************************************************************************** |
262 | | * pixel_satd_WxH: sum of 4x4 Hadamard transformed differences |
263 | | ****************************************************************************/ |
264 | | |
/* SATD of a 4x4 block: sum of absolute values of the 2-D 4x4 Hadamard
 * transform of the pixel difference, halved.  Two transform columns are
 * carried per sum2_t using the packed lane layout (see abs2). */
static NOINLINE int x264_pixel_satd_4x4( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
{
    sum2_t tmp[4][2];
    sum2_t a0, a1, a2, a3, b0, b1;
    sum2_t sum = 0;
    /* Horizontal transform: pack (c0+c1, c0-c1) and (c2+c3, c2-c3) into the
     * low/high lanes, then combine into the row's two packed outputs. */
    for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )
    {
        a0 = (sum2_t)(pix1[0] - pix2[0]);
        a1 = (sum2_t)(pix1[1] - pix2[1]);
        b0 = (a0+a1) + ((a0-a1)<<BITS_PER_SUM);
        a2 = (sum2_t)(pix1[2] - pix2[2]);
        a3 = (sum2_t)(pix1[3] - pix2[3]);
        b1 = (a2+a3) + ((a2-a3)<<BITS_PER_SUM);
        tmp[i][0] = b0 + b1;
        tmp[i][1] = b0 - b1;
    }
    /* Vertical transform, absolute sum, and unpacking of the two lanes. */
    for( int i = 0; i < 2; i++ )
    {
        HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );
        a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
        sum += ((sum_t)a0) + (a0>>BITS_PER_SUM);
    }
    return sum >> 1;
}
289 | | |
/* SATD of an 8x4 block: two 4x4 Hadamard transforms computed side by side,
 * with the left half (cols 0-3) in the low lanes and the right half
 * (cols 4-7) in the high lanes of each sum2_t. */
static NOINLINE int x264_pixel_satd_8x4( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
{
    sum2_t tmp[4][4];
    sum2_t a0, a1, a2, a3;
    sum2_t sum = 0;
    /* Horizontal transform, one row at a time. */
    for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )
    {
        a0 = (sum2_t)(pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);
        a1 = (sum2_t)(pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);
        a2 = (sum2_t)(pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);
        a3 = (sum2_t)(pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);
        HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0,a1,a2,a3 );
    }
    /* Vertical transform plus lane-wise absolute sum. */
    for( int i = 0; i < 4; i++ )
    {
        HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );
        sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
    }
    /* Fold the two lanes together and halve (transform gain). */
    return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
}
310 | | |
/* Build a WxH SATD from the 4x4/8x4 `sub` kernel: the base call plus the
 * +4-row call cover rows 0-7 of the left 8 (or 4) columns; the w==16 and
 * h==16 branches extend coverage to the right half and lower half. */
#define PIXEL_SATD_C( w, h, sub )\
static int x264_pixel_satd_##w##x##h( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )\
{\
    int sum = sub( pix1, i_pix1, pix2, i_pix2 )\
            + sub( pix1+4*i_pix1, i_pix1, pix2+4*i_pix2, i_pix2 );\
    if( w==16 )\
        sum+= sub( pix1+8, i_pix1, pix2+8, i_pix2 )\
            + sub( pix1+8+4*i_pix1, i_pix1, pix2+8+4*i_pix2, i_pix2 );\
    if( h==16 )\
        sum+= sub( pix1+8*i_pix1, i_pix1, pix2+8*i_pix2, i_pix2 )\
            + sub( pix1+12*i_pix1, i_pix1, pix2+12*i_pix2, i_pix2 );\
    if( w==16 && h==16 )\
        sum+= sub( pix1+8+8*i_pix1, i_pix1, pix2+8+8*i_pix2, i_pix2 )\
            + sub( pix1+8+12*i_pix1, i_pix1, pix2+8+12*i_pix2, i_pix2 );\
    return sum;\
}
PIXEL_SATD_C( 16, 16, x264_pixel_satd_8x4 )
PIXEL_SATD_C( 16, 8, x264_pixel_satd_8x4 )
PIXEL_SATD_C( 8, 16, x264_pixel_satd_8x4 )
PIXEL_SATD_C( 8, 8, x264_pixel_satd_8x4 )
PIXEL_SATD_C( 4, 16, x264_pixel_satd_4x4 )
PIXEL_SATD_C( 4, 8, x264_pixel_satd_4x4 )
333 | | |
/* SA8D core: sum of absolute values of the 8x8 Hadamard transform of the
 * difference block, unnormalized (callers add 2 and shift right by 2).
 * Two columns are packed per sum2_t lane pair, as in the SATD kernels. */
static NOINLINE int sa8d_8x8( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
{
    sum2_t tmp[8][4];
    sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
    sum2_t sum = 0;
    /* Horizontal pass: first butterfly stage packs column pairs into
     * b0..b3, HADAMARD4 completes the 8-point row transform. */
    for( int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2 )
    {
        a0 = (sum2_t)(pix1[0] - pix2[0]);
        a1 = (sum2_t)(pix1[1] - pix2[1]);
        b0 = (a0+a1) + ((a0-a1)<<BITS_PER_SUM);
        a2 = (sum2_t)(pix1[2] - pix2[2]);
        a3 = (sum2_t)(pix1[3] - pix2[3]);
        b1 = (a2+a3) + ((a2-a3)<<BITS_PER_SUM);
        a4 = (sum2_t)(pix1[4] - pix2[4]);
        a5 = (sum2_t)(pix1[5] - pix2[5]);
        b2 = (a4+a5) + ((a4-a5)<<BITS_PER_SUM);
        a6 = (sum2_t)(pix1[6] - pix2[6]);
        a7 = (sum2_t)(pix1[7] - pix2[7]);
        b3 = (a6+a7) + ((a6-a7)<<BITS_PER_SUM);
        HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0,b1,b2,b3 );
    }
    /* Vertical pass: 4-point transforms on each half of the rows plus a
     * final butterfly between halves, accumulating absolute values. */
    for( int i = 0; i < 4; i++ )
    {
        HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );
        HADAMARD4( a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i] );
        b0  = abs2(a0+a4) + abs2(a0-a4);
        b0 += abs2(a1+a5) + abs2(a1-a5);
        b0 += abs2(a2+a6) + abs2(a2-a6);
        b0 += abs2(a3+a7) + abs2(a3-a7);
        sum += (sum_t)b0 + (b0>>BITS_PER_SUM);
    }
    return sum;
}
367 | | |
368 | | static int x264_pixel_sa8d_8x8( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 ) |
369 | 0 | { |
370 | 0 | int sum = sa8d_8x8( pix1, i_pix1, pix2, i_pix2 ); |
371 | 0 | return (sum+2)>>2; |
372 | 0 | } |
373 | | |
374 | | static int x264_pixel_sa8d_16x16( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 ) |
375 | 0 | { |
376 | 0 | int sum = sa8d_8x8( pix1, i_pix1, pix2, i_pix2 ) |
377 | 0 | + sa8d_8x8( pix1+8, i_pix1, pix2+8, i_pix2 ) |
378 | 0 | + sa8d_8x8( pix1+8*i_pix1, i_pix1, pix2+8*i_pix2, i_pix2 ) |
379 | 0 | + sa8d_8x8( pix1+8+8*i_pix1, i_pix1, pix2+8+8*i_pix2, i_pix2 ); |
380 | 0 | return (sum+2)>>2; |
381 | 0 | } |
382 | | |
/* Hadamard AC energy of one 8x8 block of raw pixels (no difference):
 * returns (8x8 transform score << 32) + (sum of the four 4x4 transform
 * scores), both with the DC coefficient subtracted.  Final halving and
 * repacking happen in the HADAMARD_AC wrapper below. */
static NOINLINE uint64_t pixel_hadamard_ac( pixel *pix, intptr_t stride )
{
    sum2_t tmp[32];
    sum2_t a0, a1, a2, a3, dc;
    sum2_t sum4 = 0, sum8 = 0;
    /* Horizontal butterflies; (i&3) + (i&4)*4 interleaves rows so that the
     * upper and lower 4-row halves land in separate 16-element groups. */
    for( int i = 0; i < 8; i++, pix+=stride )
    {
        sum2_t *t = tmp + (i&3) + (i&4)*4;
        a0 = (pix[0]+pix[1]) + ((sum2_t)(pix[0]-pix[1])<<BITS_PER_SUM);
        a1 = (pix[2]+pix[3]) + ((sum2_t)(pix[2]-pix[3])<<BITS_PER_SUM);
        t[0] = a0 + a1;
        t[4] = a0 - a1;
        a2 = (pix[4]+pix[5]) + ((sum2_t)(pix[4]-pix[5])<<BITS_PER_SUM);
        a3 = (pix[6]+pix[7]) + ((sum2_t)(pix[6]-pix[7])<<BITS_PER_SUM);
        t[8] = a2 + a3;
        t[12] = a2 - a3;
    }
    /* Vertical 4-point transforms complete the four 4x4 transforms; their
     * absolute coefficients accumulate into sum4, results kept in tmp. */
    for( int i = 0; i < 8; i++ )
    {
        HADAMARD4( a0, a1, a2, a3, tmp[i*4+0], tmp[i*4+1], tmp[i*4+2], tmp[i*4+3] );
        tmp[i*4+0] = a0;
        tmp[i*4+1] = a1;
        tmp[i*4+2] = a2;
        tmp[i*4+3] = a3;
        sum4 += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
    }
    /* A second vertical pass extends the 4x4 results to the full 8x8
     * transform, accumulating into sum8. */
    for( int i = 0; i < 8; i++ )
    {
        HADAMARD4( a0,a1,a2,a3, tmp[i], tmp[8+i], tmp[16+i], tmp[24+i] );
        sum8 += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
    }
    /* Remove DC from both totals, then fold the packed lanes together. */
    dc = (sum_t)(tmp[0] + tmp[8] + tmp[16] + tmp[24]);
    sum4 = (sum_t)sum4 + (sum4>>BITS_PER_SUM) - dc;
    sum8 = (sum_t)sum8 + (sum8>>BITS_PER_SUM) - dc;
    return ((uint64_t)sum8<<32) + sum4;
}
419 | | |
/* Generate WxH hadamard_ac by summing 8x8 tiles, then normalizing: the low
 * 32 bits hold the halved 4x4 score, the high 32 bits the 8x8 score shifted
 * down by 2 (the >>34 then <<32 repacking). */
#define HADAMARD_AC(w,h) \
static uint64_t x264_pixel_hadamard_ac_##w##x##h( pixel *pix, intptr_t stride )\
{\
    uint64_t sum = pixel_hadamard_ac( pix, stride );\
    if( w==16 )\
        sum += pixel_hadamard_ac( pix+8, stride );\
    if( h==16 )\
        sum += pixel_hadamard_ac( pix+8*stride, stride );\
    if( w==16 && h==16 )\
        sum += pixel_hadamard_ac( pix+8*stride+8, stride );\
    return ((sum>>34)<<32) + ((uint32_t)sum>>1);\
}
HADAMARD_AC( 16, 16 )
HADAMARD_AC( 16, 8 )
HADAMARD_AC( 8, 16 )
HADAMARD_AC( 8, 8 )
436 | | |
437 | | |
438 | | /**************************************************************************** |
439 | | * pixel_sad_x4 |
440 | | ****************************************************************************/ |
/* Generate sad_x3/sad_x4 helpers: SAD of one encoded block (fenc, stride
 * FENC_STRIDE) against 3 or 4 candidate blocks sharing stride i_stride,
 * writing one score per candidate. */
#define SAD_X( size ) \
static void x264_pixel_sad_x3_##size( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2,\
                                      intptr_t i_stride, int scores[3] )\
{\
    scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\
}\
static void x264_pixel_sad_x4_##size( pixel *fenc, pixel *pix0, pixel *pix1,pixel *pix2, pixel *pix3,\
                                      intptr_t i_stride, int scores[4] )\
{\
    scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\
    scores[3] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix3, i_stride );\
}

SAD_X( 16x16 )
SAD_X( 16x8 )
SAD_X( 8x16 )
SAD_X( 8x8 )
SAD_X( 8x4 )
SAD_X( 4x8 )
SAD_X( 4x4 )
465 | | |
466 | | /**************************************************************************** |
467 | | * pixel_satd_x4 |
468 | | * no faster than single satd, but needed for satd to be a drop-in replacement for sad |
469 | | ****************************************************************************/ |
470 | | |
/* Generate satd_x3/satd_x4 wrappers per block size and cpu suffix; no
 * faster than repeated satd calls, but gives satd the same multi-candidate
 * interface as sad_x3/sad_x4 so they are interchangeable. */
#define SATD_X( size, cpu ) \
static void x264_pixel_satd_x3_##size##cpu( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2,\
                                            intptr_t i_stride, int scores[3] )\
{\
    scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\
}\
static void x264_pixel_satd_x4_##size##cpu( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, pixel *pix3,\
                                            intptr_t i_stride, int scores[4] )\
{\
    scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\
    scores[3] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix3, i_stride );\
}
#define SATD_X_DECL6( cpu )\
SATD_X( 16x16, cpu )\
SATD_X( 16x8, cpu )\
SATD_X( 8x16, cpu )\
SATD_X( 8x8, cpu )\
SATD_X( 8x4, cpu )\
SATD_X( 4x8, cpu )
#define SATD_X_DECL7( cpu )\
SATD_X_DECL6( cpu )\
SATD_X( 4x4, cpu )

/* C versions (empty suffix), then per-ISA variants guarded by build flags. */
SATD_X_DECL7()
#if HAVE_MMX
SATD_X_DECL7( _mmx2 )
#if !HIGH_BIT_DEPTH
SATD_X_DECL6( _sse2 )
SATD_X_DECL7( _ssse3 )
SATD_X_DECL6( _ssse3_atom )
SATD_X_DECL7( _sse4 )
SATD_X_DECL7( _avx )
SATD_X_DECL7( _xop )
SATD_X_DECL7( _avx512 )
#endif // !HIGH_BIT_DEPTH
#endif

#if !HIGH_BIT_DEPTH
#if HAVE_ARMV6 || HAVE_AARCH64
SATD_X_DECL7( _neon )
#endif
#endif // !HIGH_BIT_DEPTH
517 | | |
/* Generate intra_<mbcmp>_x3_8x8: score the V, H and DC 8x8 intra modes by
 * predicting from `edge` into a local aligned buffer and comparing against
 * fenc with the chosen metric (sad or sa8d).  cpu selects the compare
 * implementation, cpu2 the predict implementation. */
#define INTRA_MBCMP_8x8( mbcmp, cpu, cpu2 )\
static void intra_##mbcmp##_x3_8x8##cpu( pixel *fenc, pixel edge[36], int res[3] )\
{\
    ALIGNED_ARRAY_16( pixel, pix, [8*FDEC_STRIDE] );\
    x264_predict_8x8_v##cpu2( pix, edge );\
    res[0] = x264_pixel_##mbcmp##_8x8##cpu( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_8x8_h##cpu2( pix, edge );\
    res[1] = x264_pixel_##mbcmp##_8x8##cpu( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_8x8_dc##cpu2( pix, edge );\
    res[2] = x264_pixel_##mbcmp##_8x8##cpu( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
}

INTRA_MBCMP_8x8( sad,, _c )
INTRA_MBCMP_8x8(sa8d,, _c )
#if HIGH_BIT_DEPTH && HAVE_MMX
#define x264_predict_8x8_v_sse2 x264_predict_8x8_v_sse
INTRA_MBCMP_8x8( sad, _mmx2, _c )
INTRA_MBCMP_8x8(sa8d, _sse2, _sse2 )
#endif
#if !HIGH_BIT_DEPTH && (HAVE_ARMV6 || HAVE_AARCH64)
INTRA_MBCMP_8x8( sad, _neon, _neon )
INTRA_MBCMP_8x8(sa8d, _neon, _neon )
#endif
541 | | |
/* Generate intra_<mbcmp>_x3_<size><chroma>: score three intra modes (in
 * the order pred1, pred2, pred3) by predicting in place into fdec and
 * comparing against fenc.  cpu selects the compare implementation, cpu2
 * the predict implementation; the #define aliases below map variants that
 * lack a dedicated implementation onto an equivalent one. */
#define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma, cpu, cpu2 )\
static void intra_##mbcmp##_x3_##size##chroma##cpu( pixel *fenc, pixel *fdec, int res[3] )\
{\
    x264_predict_##size##chroma##_##pred1##cpu2( fdec );\
    res[0] = x264_pixel_##mbcmp##_##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_##size##chroma##_##pred2##cpu2( fdec );\
    res[1] = x264_pixel_##mbcmp##_##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_##size##chroma##_##pred3##cpu2( fdec );\
    res[2] = x264_pixel_##mbcmp##_##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
}

INTRA_MBCMP( sad, 4x4, v, h, dc, ,, _c )
INTRA_MBCMP(satd, 4x4, v, h, dc, ,, _c )
INTRA_MBCMP( sad, 8x8, dc, h, v, c,, _c )
INTRA_MBCMP(satd, 8x8, dc, h, v, c,, _c )
INTRA_MBCMP( sad, 8x16, dc, h, v, c,, _c )
INTRA_MBCMP(satd, 8x16, dc, h, v, c,, _c )
INTRA_MBCMP( sad, 16x16, v, h, dc, ,, _c )
INTRA_MBCMP(satd, 16x16, v, h, dc, ,, _c )

#if HAVE_MMX
#if HIGH_BIT_DEPTH
#define x264_predict_8x8c_v_mmx2 x264_predict_8x8c_v_mmx
#define x264_predict_8x16c_v_mmx2 x264_predict_8x16c_v_c
#define x264_predict_16x16_dc_mmx2 x264_predict_16x16_dc_c
#define x264_predict_8x8c_v_sse2 x264_predict_8x8c_v_sse
#define x264_predict_8x16c_v_sse2 x264_predict_8x16c_v_sse
#define x264_predict_16x16_v_sse2 x264_predict_16x16_v_sse
INTRA_MBCMP( sad, 4x4, v, h, dc, , _mmx2, _c )
INTRA_MBCMP( sad, 8x8, dc, h, v, c, _mmx2, _mmx2 )
INTRA_MBCMP( sad, 8x16, dc, h, v, c, _mmx2, _mmx2 )
INTRA_MBCMP(satd, 8x16, dc, h, v, c, _mmx2, _mmx2 )
INTRA_MBCMP( sad, 16x16, v, h, dc, , _mmx2, _mmx2 )
INTRA_MBCMP( sad, 8x8, dc, h, v, c, _sse2, _sse2 )
INTRA_MBCMP( sad, 8x16, dc, h, v, c, _sse2, _sse2 )
INTRA_MBCMP(satd, 8x16, dc, h, v, c, _sse2, _sse2 )
INTRA_MBCMP( sad, 16x16, v, h, dc, , _sse2, _sse2 )
INTRA_MBCMP( sad, 8x8, dc, h, v, c, _ssse3, _sse2 )
INTRA_MBCMP( sad, 8x16, dc, h, v, c, _ssse3, _sse2 )
INTRA_MBCMP(satd, 8x16, dc, h, v, c, _ssse3, _sse2 )
INTRA_MBCMP( sad, 16x16, v, h, dc, , _ssse3, _sse2 )
INTRA_MBCMP(satd, 8x16, dc, h, v, c, _sse4, _sse2 )
INTRA_MBCMP(satd, 8x16, dc, h, v, c, _avx, _sse2 )
#else
#define x264_predict_8x16c_v_mmx2 x264_predict_8x16c_v_mmx
INTRA_MBCMP( sad, 8x16, dc, h, v, c, _mmx2, _mmx2 )
INTRA_MBCMP(satd, 8x16, dc, h, v, c, _mmx2, _mmx2 )
INTRA_MBCMP( sad, 8x16, dc, h, v, c, _sse2, _mmx2 )
INTRA_MBCMP(satd, 8x16, dc, h, v, c, _sse2, _mmx2 )
INTRA_MBCMP(satd, 8x16, dc, h, v, c, _ssse3, _mmx2 )
INTRA_MBCMP(satd, 8x16, dc, h, v, c, _sse4, _mmx2 )
INTRA_MBCMP(satd, 8x16, dc, h, v, c, _avx, _mmx2 )
INTRA_MBCMP(satd, 8x16, dc, h, v, c, _xop, _mmx2 )
#endif
#endif
#if !HIGH_BIT_DEPTH && HAVE_ARMV6
INTRA_MBCMP( sad, 4x4, v, h, dc, , _neon, _armv6 )
INTRA_MBCMP(satd, 4x4, v, h, dc, , _neon, _armv6 )
INTRA_MBCMP( sad, 8x8, dc, h, v, c, _neon, _neon )
INTRA_MBCMP(satd, 8x8, dc, h, v, c, _neon, _neon )
INTRA_MBCMP( sad, 8x16, dc, h, v, c, _neon, _c )
INTRA_MBCMP(satd, 8x16, dc, h, v, c, _neon, _c )
INTRA_MBCMP( sad, 16x16, v, h, dc, , _neon, _neon )
INTRA_MBCMP(satd, 16x16, v, h, dc, , _neon, _neon )
#endif
#if !HIGH_BIT_DEPTH && HAVE_AARCH64
INTRA_MBCMP( sad, 4x4, v, h, dc, , _neon, _neon )
INTRA_MBCMP(satd, 4x4, v, h, dc, , _neon, _neon )
INTRA_MBCMP( sad, 8x8, dc, h, v, c, _neon, _neon )
INTRA_MBCMP(satd, 8x8, dc, h, v, c, _neon, _neon )
INTRA_MBCMP( sad, 8x16, dc, h, v, c, _neon, _neon )
INTRA_MBCMP(satd, 8x16, dc, h, v, c, _neon, _neon )
INTRA_MBCMP( sad, 16x16, v, h, dc, , _neon, _neon )
INTRA_MBCMP(satd, 16x16, v, h, dc, , _neon, _neon )
#endif
617 | | |
618 | | // No C implementation of intra_satd_x9. See checkasm for its behavior, |
619 | | // or see mb_analyse_intra for the entirely different algorithm we |
620 | | // use when lacking an asm implementation of it. |
621 | | |
622 | | |
623 | | |
624 | | /**************************************************************************** |
625 | | * structural similarity metric |
626 | | ****************************************************************************/ |
627 | | static void ssim_4x4x2_core( const pixel *pix1, intptr_t stride1, |
628 | | const pixel *pix2, intptr_t stride2, |
629 | | int sums[2][4] ) |
630 | 0 | { |
631 | 0 | for( int z = 0; z < 2; z++ ) |
632 | 0 | { |
633 | 0 | uint32_t s1 = 0, s2 = 0, ss = 0, s12 = 0; |
634 | 0 | for( int y = 0; y < 4; y++ ) |
635 | 0 | for( int x = 0; x < 4; x++ ) |
636 | 0 | { |
637 | 0 | int a = pix1[x+y*stride1]; |
638 | 0 | int b = pix2[x+y*stride2]; |
639 | 0 | s1 += a; |
640 | 0 | s2 += b; |
641 | 0 | ss += a*a; |
642 | 0 | ss += b*b; |
643 | 0 | s12 += a*b; |
644 | 0 | } |
645 | 0 | sums[z][0] = s1; |
646 | 0 | sums[z][1] = s2; |
647 | 0 | sums[z][2] = ss; |
648 | 0 | sums[z][3] = s12; |
649 | 0 | pix1 += 4; |
650 | 0 | pix2 += 4; |
651 | 0 | } |
652 | 0 | } |
653 | | |
/* SSIM of one 8x8 window from the pooled statistics of its four 4x4
 * quadrants: s1/s2 are the pixel sums of the two images, ss the combined
 * sum of squares, s12 the cross-product sum (64 samples total).  The
 * `type` macro selects float math at high bit depth to avoid the integer
 * overflow documented below. */
static float ssim_end1( int s1, int s2, int ss, int s12 )
{
    /* Maximum value for 10-bit is: ss*64 = (2^10-1)^2*16*4*64 = 4286582784, which will overflow in some cases.
     * s1*s1, s2*s2, and s1*s2 also obtain this value for edge cases: ((2^10-1)*16*4)^2 = 4286582784.
     * Maximum value for 9-bit is: ss*64 = (2^9-1)^2*16*4*64 = 1069551616, which will not overflow. */
#if BIT_DEPTH > 9
#define type float
    static const float ssim_c1 = .01*.01*PIXEL_MAX*PIXEL_MAX*64;
    static const float ssim_c2 = .03*.03*PIXEL_MAX*PIXEL_MAX*64*63;
#else
#define type int
    static const int ssim_c1 = (int)(.01*.01*PIXEL_MAX*PIXEL_MAX*64 + .5);
    static const int ssim_c2 = (int)(.03*.03*PIXEL_MAX*PIXEL_MAX*64*63 + .5);
#endif
    type fs1 = s1;
    type fs2 = s2;
    type fss = ss;
    type fs12 = s12;
    /* Scaled variance and covariance of the window (factor 64 = sample count). */
    type vars = fss*64 - fs1*fs1 - fs2*fs2;
    type covar = fs12*64 - fs1*fs2;
    return (float)(2*fs1*fs2 + ssim_c1) * (float)(2*covar + ssim_c2)
         / ((float)(fs1*fs1 + fs2*fs2 + ssim_c1) * (float)(vars + ssim_c2));
#undef type
}
678 | | |
/* Sum the SSIM of up to 4 overlapping 8x8 windows along a block row.
 * Each window combines the per-4x4 sums of four adjacent blocks
 * (two from sum0, two from sum1). */
static float ssim_end4( int sum0[5][4], int sum1[5][4], int width )
{
    float total = 0.0f;
    for( int i = 0; i < width; i++ )
    {
        int acc[4];
        for( int k = 0; k < 4; k++ )
            acc[k] = sum0[i][k] + sum0[i+1][k] + sum1[i][k] + sum1[i+1][k];
        total += ssim_end1( acc[0], acc[1], acc[2], acc[3] );
    }
    return total;
}
689 | | |
/* Compute the total SSIM over a width x height plane using the function
 * pointers in pf. buf is caller-provided scratch for two rows of
 * per-4x4-block sums; *cnt receives (height/4-1)*(width/4-1), the number
 * of 8x8 windows accumulated, so the caller can normalize the result. */
float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,
                           pixel *pix1, intptr_t stride1,
                           pixel *pix2, intptr_t stride2,
                           int width, int height, void *buf, int *cnt )
{
    int z = 0;
    float ssim = 0.0;
    /* Double-buffered rows of per-4x4-block sums: sum1 holds the previous
     * block row, sum0 the current one. */
    int (*sum0)[4] = buf;
    int (*sum1)[4] = sum0 + (width >> 2) + 3;
    width >>= 2;   /* pixel dimensions -> 4x4-block dimensions */
    height >>= 2;
    for( int y = 1; y < height; y++ )
    {
        /* Fill in block-sum rows up to row y; the swap retires the old
         * current row into sum1 before computing the next into sum0. */
        for( ; z <= y; z++ )
        {
            XCHG( void*, sum0, sum1 );
            for( int x = 0; x < width; x+=2 )
                pf->ssim_4x4x2_core( &pix1[4*(x+z*stride1)], stride1, &pix2[4*(x+z*stride2)], stride2, &sum0[x] );
        }
        /* Each ssim_end4 call evaluates up to 4 overlapping 8x8 windows. */
        for( int x = 0; x < width-1; x += 4 )
            ssim += pf->ssim_end4( sum0+x, sum1+x, X264_MIN(4,width-x-1) );
    }
    *cnt = (height-1) * (width-1);
    return ssim;
}
715 | | |
716 | | static int pixel_vsad( pixel *src, intptr_t stride, int height ) |
717 | 0 | { |
718 | 0 | int score = 0; |
719 | 0 | for( int i = 1; i < height; i++, src += stride ) |
720 | 0 | for( int j = 0; j < 16; j++ ) |
721 | 0 | score += abs(src[j] - src[j+stride]); |
722 | 0 | return score; |
723 | 0 | } |
724 | | |
/* Decide whether a vertical macroblock pair looks more like field or
 * frame content: returns 1 when the field (interlaced) vertical SAD,
 * biased towards the field/frame choice of the left and top neighbours,
 * is lower than the frame (progressive) vertical SAD. */
int x264_field_vsad( x264_t *h, int mb_x, int mb_y )
{
    int score_field, score_frame;
    int stride = h->fenc->i_stride[0];
    int mb_stride = h->mb.i_mb_stride;
    pixel *fenc = h->fenc->plane[0] + 16 * (mb_x + mb_y * stride);
    int mb_xy = mb_x + mb_y*mb_stride;

    /* We don't want to analyze pixels outside the frame, as it gives inaccurate results. */
    int mbpair_height = X264_MIN( h->param.i_height - mb_y * 16, 32 );
    /* Frame score: vsad over the whole MB pair. Field score: each field
     * measured separately at doubled stride. */
    score_frame = h->pixf.vsad( fenc, stride, mbpair_height );
    score_field = h->pixf.vsad( fenc, stride*2, mbpair_height >> 1 );
    score_field += h->pixf.vsad( fenc+stride, stride*2, mbpair_height >> 1 );

    /* Bias (+/-512) towards agreeing with the neighbours' decisions. */
    if( mb_x > 0 )
        score_field += 512 - h->mb.field[mb_xy -1]*1024;
    if( mb_y > 0 )
        score_field += 512 - h->mb.field[mb_xy-mb_stride]*1024;

    return (score_field < score_frame);
}
746 | | |
747 | | static int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ) |
748 | 0 | { |
749 | 0 | int sum = 0; |
750 | 0 | for( int y = 0; y < height; y++, pix1 += stride1, pix2 += stride2 ) |
751 | 0 | for( int x = 0; x < 8; x++ ) |
752 | 0 | sum += pix1[x] - pix2[x]; |
753 | 0 | return abs( sum ); |
754 | 0 | } |
755 | | |
756 | | /**************************************************************************** |
757 | | * successive elimination |
758 | | ****************************************************************************/ |
/* Successive elimination for 16x16 blocks: for each of `width` candidate
 * positions, compute a DC-based lower bound on SAD (four quadrant DC
 * differences) plus the MV cost; keep the candidate index in mvs[] when
 * the bound is below `thresh`. Returns the number of survivors. */
static int x264_pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
                            uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
    int count = 0;
    for( int i = 0; i < width; i++ )
    {
        int bound = abs( enc_dc[0] - sums[i] )
                  + abs( enc_dc[1] - sums[i+8] )
                  + abs( enc_dc[2] - sums[i+delta] )
                  + abs( enc_dc[3] - sums[i+delta+8] )
                  + cost_mvx[i];
        if( bound < thresh )
            mvs[count++] = i;
    }
    return count;
}
775 | | |
/* Successive elimination for 16x8 blocks: two-term DC lower bound on SAD
 * plus MV cost; candidates below `thresh` are kept in mvs[]. */
static int x264_pixel_ads2( int enc_dc[2], uint16_t *sums, int delta,
                            uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
    int count = 0;
    for( int i = 0; i < width; i++ )
    {
        int bound = abs( enc_dc[0] - sums[i] )
                  + abs( enc_dc[1] - sums[i+delta] )
                  + cost_mvx[i];
        if( bound < thresh )
            mvs[count++] = i;
    }
    return count;
}
790 | | |
/* Successive elimination for 8x8 blocks: single DC lower bound plus MV
 * cost. `delta` is unused at this block size but kept for a uniform
 * function-pointer signature. */
static int x264_pixel_ads1( int enc_dc[1], uint16_t *sums, int delta,
                            uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
    int count = 0;
    for( int i = 0; i < width; i++ )
    {
        if( abs( enc_dc[0] - sums[i] ) + cost_mvx[i] < thresh )
            mvs[count++] = i;
    }
    return count;
}
804 | | |
805 | | |
806 | | /**************************************************************************** |
807 | | * x264_pixel_init: |
808 | | ****************************************************************************/ |
809 | | void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf ) |
810 | 0 | { |
811 | 0 | memset( pixf, 0, sizeof(*pixf) ); |
812 | |
|
813 | 0 | #define INIT2_NAME( name1, name2, cpu ) \ |
814 | 0 | pixf->name1[PIXEL_16x16] = x264_pixel_##name2##_16x16##cpu;\ |
815 | 0 | pixf->name1[PIXEL_16x8] = x264_pixel_##name2##_16x8##cpu; |
816 | 0 | #define INIT4_NAME( name1, name2, cpu ) \ |
817 | 0 | INIT2_NAME( name1, name2, cpu ) \ |
818 | 0 | pixf->name1[PIXEL_8x16] = x264_pixel_##name2##_8x16##cpu;\ |
819 | 0 | pixf->name1[PIXEL_8x8] = x264_pixel_##name2##_8x8##cpu; |
820 | 0 | #define INIT5_NAME( name1, name2, cpu ) \ |
821 | 0 | INIT4_NAME( name1, name2, cpu ) \ |
822 | 0 | pixf->name1[PIXEL_8x4] = x264_pixel_##name2##_8x4##cpu; |
823 | 0 | #define INIT6_NAME( name1, name2, cpu ) \ |
824 | 0 | INIT5_NAME( name1, name2, cpu ) \ |
825 | 0 | pixf->name1[PIXEL_4x8] = x264_pixel_##name2##_4x8##cpu; |
826 | 0 | #define INIT7_NAME( name1, name2, cpu ) \ |
827 | 0 | INIT6_NAME( name1, name2, cpu ) \ |
828 | 0 | pixf->name1[PIXEL_4x4] = x264_pixel_##name2##_4x4##cpu; |
829 | 0 | #define INIT8_NAME( name1, name2, cpu ) \ |
830 | 0 | INIT7_NAME( name1, name2, cpu ) \ |
831 | 0 | pixf->name1[PIXEL_4x16] = x264_pixel_##name2##_4x16##cpu; |
832 | | #if HAVE_SVE |
833 | | #define INIT7_NAME_SVE_SSD_10BIT( ) \ |
834 | | pixf->ssd[PIXEL_4x4] = x264_pixel_ssd_4x4_sve; \ |
835 | | pixf->ssd[PIXEL_4x8] = x264_pixel_ssd_4x8_sve; |
836 | | #endif |
837 | | #if HAVE_SVE |
838 | | #define INIT8_NAME_SVE_SSD( ) \ |
839 | | pixf->ssd[PIXEL_8x8] = x264_pixel_ssd_8x8_sve; \ |
840 | | pixf->ssd[PIXEL_8x4] = x264_pixel_ssd_8x4_sve; \ |
841 | | pixf->ssd[PIXEL_4x8] = x264_pixel_ssd_4x8_sve; \ |
842 | | pixf->ssd[PIXEL_4x4] = x264_pixel_ssd_4x4_sve; \ |
843 | | pixf->ssd[PIXEL_4x16] = x264_pixel_ssd_4x16_sve; |
844 | | #define INIT8_NAME_SVE_SSD_10BIT() \ |
845 | | INIT7_NAME_SVE_SSD_10BIT() \ |
846 | | pixf->ssd[PIXEL_4x16] = x264_pixel_ssd_4x16_sve; |
847 | | #endif |
848 | 0 | #define INIT2( name, cpu ) INIT2_NAME( name, name, cpu ) |
849 | 0 | #define INIT4( name, cpu ) INIT4_NAME( name, name, cpu ) |
850 | 0 | #define INIT5( name, cpu ) INIT5_NAME( name, name, cpu ) |
851 | 0 | #define INIT6( name, cpu ) INIT6_NAME( name, name, cpu ) |
852 | 0 | #define INIT7( name, cpu ) INIT7_NAME( name, name, cpu ) |
853 | 0 | #define INIT8( name, cpu ) INIT8_NAME( name, name, cpu ) |
854 | | #if HAVE_SVE |
855 | | #define INIT8_SVE_SSD( ) INIT8_NAME_SVE_SSD( ) |
856 | | #define INIT8_SVE_SSD_10BIT( ) INIT8_NAME_SVE_SSD_10BIT( ) |
857 | | #endif |
858 | |
|
859 | 0 | #define INIT_ADS( cpu ) \ |
860 | 0 | pixf->ads[PIXEL_16x16] = x264_pixel_ads4##cpu;\ |
861 | 0 | pixf->ads[PIXEL_16x8] = x264_pixel_ads2##cpu;\ |
862 | 0 | pixf->ads[PIXEL_8x8] = x264_pixel_ads1##cpu; |
863 | |
|
864 | 0 | INIT8( sad, ); |
865 | 0 | INIT8_NAME( sad_aligned, sad, ); |
866 | 0 | INIT7( sad_x3, ); |
867 | 0 | INIT7( sad_x4, ); |
868 | 0 | INIT8( ssd, ); |
869 | 0 | INIT8( satd, ); |
870 | 0 | INIT7( satd_x3, ); |
871 | 0 | INIT7( satd_x4, ); |
872 | 0 | INIT4( hadamard_ac, ); |
873 | 0 | INIT_ADS( ); |
874 | |
|
875 | 0 | pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16; |
876 | 0 | pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8; |
877 | 0 | pixf->var[PIXEL_16x16] = pixel_var_16x16; |
878 | 0 | pixf->var[PIXEL_8x16] = pixel_var_8x16; |
879 | 0 | pixf->var[PIXEL_8x8] = pixel_var_8x8; |
880 | 0 | pixf->var2[PIXEL_8x16] = pixel_var2_8x16; |
881 | 0 | pixf->var2[PIXEL_8x8] = pixel_var2_8x8; |
882 | |
|
883 | 0 | pixf->ssd_nv12_core = pixel_ssd_nv12_core; |
884 | 0 | pixf->ssim_4x4x2_core = ssim_4x4x2_core; |
885 | 0 | pixf->ssim_end4 = ssim_end4; |
886 | 0 | pixf->vsad = pixel_vsad; |
887 | 0 | pixf->asd8 = pixel_asd8; |
888 | |
|
889 | 0 | pixf->intra_sad_x3_4x4 = intra_sad_x3_4x4; |
890 | 0 | pixf->intra_satd_x3_4x4 = intra_satd_x3_4x4; |
891 | 0 | pixf->intra_sad_x3_8x8 = intra_sad_x3_8x8; |
892 | 0 | pixf->intra_sa8d_x3_8x8 = intra_sa8d_x3_8x8; |
893 | 0 | pixf->intra_sad_x3_8x8c = intra_sad_x3_8x8c; |
894 | 0 | pixf->intra_satd_x3_8x8c = intra_satd_x3_8x8c; |
895 | 0 | pixf->intra_sad_x3_8x16c = intra_sad_x3_8x16c; |
896 | 0 | pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c; |
897 | 0 | pixf->intra_sad_x3_16x16 = intra_sad_x3_16x16; |
898 | 0 | pixf->intra_satd_x3_16x16 = intra_satd_x3_16x16; |
899 | |
|
900 | | #if HIGH_BIT_DEPTH |
901 | | #if HAVE_MMX |
902 | | if( cpu&X264_CPU_MMX2 ) |
903 | | { |
904 | | INIT7( sad, _mmx2 ); |
905 | | INIT7_NAME( sad_aligned, sad, _mmx2 ); |
906 | | INIT7( sad_x3, _mmx2 ); |
907 | | INIT7( sad_x4, _mmx2 ); |
908 | | INIT8( satd, _mmx2 ); |
909 | | INIT7( satd_x3, _mmx2 ); |
910 | | INIT7( satd_x4, _mmx2 ); |
911 | | INIT4( hadamard_ac, _mmx2 ); |
912 | | INIT8( ssd, _mmx2 ); |
913 | | |
914 | | pixf->intra_sad_x3_4x4 = intra_sad_x3_4x4_mmx2; |
915 | | pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmx2; |
916 | | pixf->intra_sad_x3_8x8 = intra_sad_x3_8x8_mmx2; |
917 | | pixf->intra_sad_x3_8x8c = intra_sad_x3_8x8c_mmx2; |
918 | | pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmx2; |
919 | | pixf->intra_sad_x3_8x16c = intra_sad_x3_8x16c_mmx2; |
920 | | pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c_mmx2; |
921 | | pixf->intra_sad_x3_16x16 = intra_sad_x3_16x16_mmx2; |
922 | | pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmx2; |
923 | | } |
924 | | if( cpu&X264_CPU_SSE2 ) |
925 | | { |
926 | | INIT4_NAME( sad_aligned, sad, _sse2_aligned ); |
927 | | INIT5( ssd, _sse2 ); |
928 | | INIT6( satd, _sse2 ); |
929 | | pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse2; |
930 | | |
931 | | pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_sse2; |
932 | | pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2; |
933 | | pixf->ssim_end4 = x264_pixel_ssim_end4_sse2; |
934 | | pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2; |
935 | | pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2; |
936 | | pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2; |
937 | | pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2; |
938 | | |
939 | | pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2; |
940 | | pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2; |
941 | | #if ARCH_X86_64 |
942 | | pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse2; |
943 | | #endif |
944 | | pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_sse2; |
945 | | pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_sse2; |
946 | | pixf->intra_sa8d_x3_8x8 = intra_sa8d_x3_8x8_sse2; |
947 | | } |
948 | | if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) ) |
949 | | { |
950 | | INIT5( sad, _sse2 ); |
951 | | INIT2( sad_x3, _sse2 ); |
952 | | INIT2( sad_x4, _sse2 ); |
953 | | INIT_ADS( _sse2 ); |
954 | | |
955 | | if( !(cpu&X264_CPU_STACK_MOD4) ) |
956 | | { |
957 | | INIT4( hadamard_ac, _sse2 ); |
958 | | } |
959 | | pixf->vsad = x264_pixel_vsad_sse2; |
960 | | pixf->asd8 = x264_pixel_asd8_sse2; |
961 | | pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_sse2; |
962 | | pixf->intra_sad_x3_8x8c = intra_sad_x3_8x8c_sse2; |
963 | | pixf->intra_sad_x3_8x16c = intra_sad_x3_8x16c_sse2; |
964 | | pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c_sse2; |
965 | | pixf->intra_sad_x3_16x16 = intra_sad_x3_16x16_sse2; |
966 | | } |
967 | | if( cpu&X264_CPU_SSE2_IS_FAST ) |
968 | | { |
969 | | pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_sse2; |
970 | | pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_sse2; |
971 | | pixf->sad_x3[PIXEL_8x8] = x264_pixel_sad_x3_8x8_sse2; |
972 | | pixf->sad_x3[PIXEL_8x4] = x264_pixel_sad_x3_8x4_sse2; |
973 | | pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_sse2; |
974 | | pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_sse2; |
975 | | pixf->sad_x4[PIXEL_8x4] = x264_pixel_sad_x4_8x4_sse2; |
976 | | } |
977 | | if( cpu&X264_CPU_SSSE3 ) |
978 | | { |
979 | | INIT4_NAME( sad_aligned, sad, _ssse3_aligned ); |
980 | | pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_4x4_ssse3; |
981 | | pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_4x8_ssse3; |
982 | | INIT7( sad, _ssse3 ); |
983 | | INIT7( sad_x3, _ssse3 ); |
984 | | INIT7( sad_x4, _ssse3 ); |
985 | | INIT_ADS( _ssse3 ); |
986 | | INIT6( satd, _ssse3 ); |
987 | | pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3; |
988 | | |
989 | | if( !(cpu&X264_CPU_STACK_MOD4) ) |
990 | | { |
991 | | INIT4( hadamard_ac, _ssse3 ); |
992 | | } |
993 | | pixf->vsad = x264_pixel_vsad_ssse3; |
994 | | pixf->asd8 = x264_pixel_asd8_ssse3; |
995 | | pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3; |
996 | | pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3; |
997 | | #if ARCH_X86_64 |
998 | | pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3; |
999 | | #endif |
1000 | | pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3; |
1001 | | pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3; |
1002 | | pixf->intra_sad_x3_8x8c = intra_sad_x3_8x8c_ssse3; |
1003 | | pixf->intra_sad_x3_8x16c = intra_sad_x3_8x16c_ssse3; |
1004 | | pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c_ssse3; |
1005 | | pixf->intra_sad_x3_16x16 = intra_sad_x3_16x16_ssse3; |
1006 | | } |
1007 | | if( cpu&X264_CPU_SSE4 ) |
1008 | | { |
1009 | | INIT6( satd, _sse4 ); |
1010 | | pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse4; |
1011 | | if( !(cpu&X264_CPU_STACK_MOD4) ) |
1012 | | { |
1013 | | INIT4( hadamard_ac, _sse4 ); |
1014 | | } |
1015 | | pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4; |
1016 | | pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4; |
1017 | | #if ARCH_X86_64 |
1018 | | pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse4; |
1019 | | #endif |
1020 | | pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c_sse4; |
1021 | | } |
1022 | | if( cpu&X264_CPU_AVX ) |
1023 | | { |
1024 | | INIT5_NAME( sad_aligned, sad, _ssse3 ); /* AVX-capable CPUs doesn't benefit from an aligned version */ |
1025 | | INIT_ADS( _avx ); |
1026 | | INIT6( satd, _avx ); |
1027 | | pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_avx; |
1028 | | if( !(cpu&X264_CPU_STACK_MOD4) ) |
1029 | | { |
1030 | | INIT4( hadamard_ac, _avx ); |
1031 | | } |
1032 | | pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_avx; |
1033 | | pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx; |
1034 | | pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx; |
1035 | | pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx; |
1036 | | pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx; |
1037 | | pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx; |
1038 | | pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx; |
1039 | | pixf->ssim_end4 = x264_pixel_ssim_end4_avx; |
1040 | | #if ARCH_X86_64 |
1041 | | pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx; |
1042 | | #endif |
1043 | | pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c_avx; |
1044 | | } |
1045 | | if( cpu&X264_CPU_XOP ) |
1046 | | { |
1047 | | INIT5( sad_x3, _xop ); |
1048 | | INIT5( sad_x4, _xop ); |
1049 | | pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop; |
1050 | | pixf->vsad = x264_pixel_vsad_xop; |
1051 | | pixf->asd8 = x264_pixel_asd8_xop; |
1052 | | #if ARCH_X86_64 |
1053 | | pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_xop; |
1054 | | #endif |
1055 | | } |
1056 | | if( cpu&X264_CPU_AVX2 ) |
1057 | | { |
1058 | | INIT2( ssd, _avx2 ); |
1059 | | INIT2( sad, _avx2 ); |
1060 | | INIT2_NAME( sad_aligned, sad, _avx2 ); |
1061 | | INIT2( sad_x3, _avx2 ); |
1062 | | INIT2( sad_x4, _avx2 ); |
1063 | | INIT_ADS( _avx2 ); |
1064 | | pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2; |
1065 | | pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx2; |
1066 | | pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx2; |
1067 | | pixf->vsad = x264_pixel_vsad_avx2; |
1068 | | pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2; |
1069 | | pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx2; |
1070 | | } |
1071 | | if( cpu&X264_CPU_AVX512 ) |
1072 | | { |
1073 | | pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_avx512; |
1074 | | pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx512; |
1075 | | pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx512; |
1076 | | pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx512; |
1077 | | } |
1078 | | #endif // HAVE_MMX |
1079 | | #if HAVE_AARCH64 |
1080 | | if( cpu&X264_CPU_NEON ) |
1081 | | { |
1082 | | INIT8( sad, _neon ); |
1083 | | INIT7( sad_x3, _neon); |
1084 | | pixf->vsad = x264_pixel_vsad_neon; |
1085 | | pixf->asd8 = x264_pixel_asd8_neon; |
1086 | | INIT8(ssd, _neon); |
1087 | | pixf->satd[PIXEL_8x4] = x264_pixel_satd_8x4_neon; |
1088 | | pixf->satd[PIXEL_4x8] = x264_pixel_satd_4x8_neon; |
1089 | | pixf->satd[PIXEL_4x4] = x264_pixel_satd_4x4_neon; |
1090 | | pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_neon; |
1091 | | pixf->satd[PIXEL_8x8] = x264_pixel_satd_8x8_neon; |
1092 | | pixf->satd[PIXEL_8x16] = x264_pixel_satd_8x16_neon; |
1093 | | pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_neon; |
1094 | | pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon; |
1095 | | pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_neon; |
1096 | | pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon; |
1097 | | pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon; |
1098 | | pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon; |
1099 | | pixf->satd[PIXEL_16x8] = x264_pixel_satd_16x8_neon; |
1100 | | pixf->satd[PIXEL_16x16] = x264_pixel_satd_16x16_neon; |
1101 | | INIT7(sad_x4, _neon); |
1102 | | pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_neon; |
1103 | | pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon; |
1104 | | pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_neon; |
1105 | | INIT4(hadamard_ac, _neon); |
1106 | | pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon; |
1107 | | pixf->ssim_end4 = x264_pixel_ssim_end4_neon; |
1108 | | } |
1109 | | #if HAVE_SVE |
1110 | | if( cpu&X264_CPU_SVE ) |
1111 | | { |
1112 | | INIT8_SVE_SSD_10BIT(); |
1113 | | } |
1114 | | #endif |
1115 | | #endif // HAVE_AARCH64 |
1116 | | |
1117 | | #else // !HIGH_BIT_DEPTH |
1118 | | #if HAVE_MMX |
1119 | | if( cpu&X264_CPU_MMX ) |
1120 | | { |
1121 | | INIT8( ssd, _mmx ); |
1122 | | } |
1123 | | |
1124 | | if( cpu&X264_CPU_MMX2 ) |
1125 | | { |
1126 | | INIT8( sad, _mmx2 ); |
1127 | | INIT8_NAME( sad_aligned, sad, _mmx2 ); |
1128 | | INIT7( sad_x3, _mmx2 ); |
1129 | | INIT7( sad_x4, _mmx2 ); |
1130 | | INIT8( satd, _mmx2 ); |
1131 | | INIT7( satd_x3, _mmx2 ); |
1132 | | INIT7( satd_x4, _mmx2 ); |
1133 | | INIT4( hadamard_ac, _mmx2 ); |
1134 | | INIT_ADS( _mmx2 ); |
1135 | | #if ARCH_X86 |
1136 | | pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmx2; |
1137 | | pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmx2; |
1138 | | pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmx2; |
1139 | | pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmx2; |
1140 | | pixf->vsad = x264_pixel_vsad_mmx2; |
1141 | | |
1142 | | if( cpu&X264_CPU_CACHELINE_32 ) |
1143 | | { |
1144 | | INIT5( sad, _cache32_mmx2 ); |
1145 | | INIT4( sad_x3, _cache32_mmx2 ); |
1146 | | INIT4( sad_x4, _cache32_mmx2 ); |
1147 | | } |
1148 | | else if( cpu&X264_CPU_CACHELINE_64 && !(cpu&X264_CPU_SLOW_ATOM) ) |
1149 | | { |
1150 | | INIT5( sad, _cache64_mmx2 ); |
1151 | | INIT4( sad_x3, _cache64_mmx2 ); |
1152 | | INIT4( sad_x4, _cache64_mmx2 ); |
1153 | | } |
1154 | | #else |
1155 | | if( cpu&X264_CPU_CACHELINE_64 && !(cpu&X264_CPU_SLOW_ATOM) ) |
1156 | | { |
1157 | | pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_cache64_mmx2; |
1158 | | pixf->sad[PIXEL_8x8] = x264_pixel_sad_8x8_cache64_mmx2; |
1159 | | pixf->sad[PIXEL_8x4] = x264_pixel_sad_8x4_cache64_mmx2; |
1160 | | pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_cache64_mmx2; |
1161 | | pixf->sad_x3[PIXEL_8x8] = x264_pixel_sad_x3_8x8_cache64_mmx2; |
1162 | | pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_mmx2; |
1163 | | pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_cache64_mmx2; |
1164 | | } |
1165 | | #endif |
1166 | | pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmx2; |
1167 | | pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmx2; |
1168 | | pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c_mmx2; |
1169 | | pixf->intra_sad_x3_8x16c = intra_sad_x3_8x16c_mmx2; |
1170 | | pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmx2; |
1171 | | pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_mmx2; |
1172 | | pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_mmx2; |
1173 | | pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmx2; |
1174 | | pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmx2; |
1175 | | } |
1176 | | |
1177 | | if( cpu&X264_CPU_SSE2 ) |
1178 | | { |
1179 | | INIT5( ssd, _sse2slow ); |
1180 | | INIT2_NAME( sad_aligned, sad, _sse2_aligned ); |
1181 | | pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2; |
1182 | | pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_sse2; |
1183 | | pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2; |
1184 | | pixf->ssim_end4 = x264_pixel_ssim_end4_sse2; |
1185 | | pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2; |
1186 | | pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2; |
1187 | | #if ARCH_X86_64 |
1188 | | pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2; |
1189 | | pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse2; |
1190 | | #endif |
1191 | | pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2; |
1192 | | pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2; |
1193 | | pixf->vsad = x264_pixel_vsad_sse2; |
1194 | | pixf->asd8 = x264_pixel_asd8_sse2; |
1195 | | } |
1196 | | |
1197 | | if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) ) |
1198 | | { |
1199 | | INIT2( sad, _sse2 ); |
1200 | | INIT2( sad_x3, _sse2 ); |
1201 | | INIT2( sad_x4, _sse2 ); |
1202 | | INIT6( satd, _sse2 ); |
1203 | | pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse2; |
1204 | | INIT6( satd_x3, _sse2 ); |
1205 | | INIT6( satd_x4, _sse2 ); |
1206 | | INIT4( hadamard_ac, _sse2 ); |
1207 | | INIT_ADS( _sse2 ); |
1208 | | pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2; |
1209 | | pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_sse2; |
1210 | | pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2; |
1211 | | pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c_sse2; |
1212 | | pixf->intra_sad_x3_8x16c = intra_sad_x3_8x16c_sse2; |
1213 | | if( cpu&X264_CPU_CACHELINE_64 ) |
1214 | | { |
1215 | | INIT2( ssd, _sse2); /* faster for width 16 on p4 */ |
1216 | | #if ARCH_X86 |
1217 | | INIT2( sad, _cache64_sse2 ); |
1218 | | INIT2( sad_x3, _cache64_sse2 ); |
1219 | | INIT2( sad_x4, _cache64_sse2 ); |
1220 | | #endif |
1221 | | if( cpu&X264_CPU_SSE2_IS_FAST ) |
1222 | | { |
1223 | | pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_cache64_sse2; |
1224 | | pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_sse2; |
1225 | | } |
1226 | | } |
1227 | | } |
1228 | | |
1229 | | if( cpu&X264_CPU_SSE2_IS_FAST && !(cpu&X264_CPU_CACHELINE_64) ) |
1230 | | { |
1231 | | pixf->sad_aligned[PIXEL_8x16] = x264_pixel_sad_8x16_sse2; |
1232 | | pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_sse2; |
1233 | | pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_sse2; |
1234 | | pixf->sad_x3[PIXEL_8x8] = x264_pixel_sad_x3_8x8_sse2; |
1235 | | pixf->sad_x3[PIXEL_8x4] = x264_pixel_sad_x3_8x4_sse2; |
1236 | | pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_sse2; |
1237 | | pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_sse2; |
1238 | | pixf->sad_x4[PIXEL_8x4] = x264_pixel_sad_x4_8x4_sse2; |
1239 | | } |
1240 | | |
1241 | | if( (cpu&X264_CPU_SSE3) && (cpu&X264_CPU_CACHELINE_64) ) |
1242 | | { |
1243 | | INIT2( sad, _sse3 ); |
1244 | | INIT2( sad_x3, _sse3 ); |
1245 | | INIT2( sad_x4, _sse3 ); |
1246 | | } |
1247 | | |
1248 | | if( cpu&X264_CPU_SSSE3 ) |
1249 | | { |
1250 | | INIT4( hadamard_ac, _ssse3 ); |
1251 | | if( !(cpu&X264_CPU_STACK_MOD4) ) |
1252 | | { |
1253 | | pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_ssse3; |
1254 | | pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_ssse3; |
1255 | | pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_ssse3; |
1256 | | #if ARCH_X86_64 |
1257 | | pixf->intra_sa8d_x9_8x8 = x264_intra_sa8d_x9_8x8_ssse3; |
1258 | | #endif |
1259 | | } |
1260 | | INIT_ADS( _ssse3 ); |
1261 | | if( cpu&X264_CPU_SLOW_ATOM ) |
1262 | | { |
1263 | | pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3_atom; |
1264 | | pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3_atom; |
1265 | | INIT6( satd, _ssse3_atom ); |
1266 | | pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3_atom; |
1267 | | INIT6( satd_x3, _ssse3_atom ); |
1268 | | INIT6( satd_x4, _ssse3_atom ); |
1269 | | INIT4( hadamard_ac, _ssse3_atom ); |
1270 | | #if ARCH_X86_64 |
1271 | | pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3_atom; |
1272 | | #endif |
1273 | | } |
1274 | | else |
1275 | | { |
1276 | | INIT8( ssd, _ssse3 ); |
1277 | | pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3; |
1278 | | pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3; |
1279 | | INIT8( satd, _ssse3 ); |
1280 | | INIT7( satd_x3, _ssse3 ); |
1281 | | INIT7( satd_x4, _ssse3 ); |
1282 | | #if ARCH_X86_64 |
1283 | | pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3; |
1284 | | #endif |
1285 | | } |
1286 | | pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3; |
1287 | | if( !(cpu&X264_CPU_SLOW_PSHUFB) ) |
1288 | | pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3; |
1289 | | pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c_ssse3; |
1290 | | pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3; |
1291 | | pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3; |
1292 | | pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_ssse3; |
1293 | | pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_ssse3; |
1294 | | pixf->asd8 = x264_pixel_asd8_ssse3; |
1295 | | if( cpu&X264_CPU_CACHELINE_64 ) |
1296 | | { |
1297 | | INIT2( sad, _cache64_ssse3 ); |
1298 | | INIT2( sad_x3, _cache64_ssse3 ); |
1299 | | INIT2( sad_x4, _cache64_ssse3 ); |
1300 | | } |
1301 | | else |
1302 | | { |
1303 | | INIT2( sad_x3, _ssse3 ); |
1304 | | INIT5( sad_x4, _ssse3 ); |
1305 | | } |
1306 | | if( (cpu&X264_CPU_SLOW_ATOM) || (cpu&X264_CPU_SLOW_SHUFFLE) ) |
1307 | | { |
1308 | | INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */ |
1309 | | } |
1310 | | } |
1311 | | |
1312 | | if( cpu&X264_CPU_SSE4 ) |
1313 | | { |
1314 | | INIT8( satd, _sse4 ); |
1315 | | INIT7( satd_x3, _sse4 ); |
1316 | | INIT7( satd_x4, _sse4 ); |
1317 | | INIT4( hadamard_ac, _sse4 ); |
1318 | | if( !(cpu&X264_CPU_STACK_MOD4) ) |
1319 | | { |
1320 | | pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_sse4; |
1321 | | pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_sse4; |
1322 | | pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_sse4; |
1323 | | #if ARCH_X86_64 |
1324 | | pixf->intra_sa8d_x9_8x8 = x264_intra_sa8d_x9_8x8_sse4; |
1325 | | #endif |
1326 | | } |
1327 | | pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4; |
1328 | | pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4; |
1329 | | pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c_sse4; |
1330 | | #if ARCH_X86_64 |
1331 | | pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse4; |
1332 | | #endif |
1333 | | } |
1334 | | |
1335 | | if( cpu&X264_CPU_AVX ) |
1336 | | { |
1337 | | INIT2_NAME( sad_aligned, sad, _sse2 ); /* AVX-capable CPUs doesn't benefit from an aligned version */ |
1338 | | INIT2( sad_x3, _avx ); |
1339 | | INIT2( sad_x4, _avx ); |
1340 | | INIT8( satd, _avx ); |
1341 | | INIT7( satd_x3, _avx ); |
1342 | | INIT7( satd_x4, _avx ); |
1343 | | INIT_ADS( _avx ); |
1344 | | INIT4( hadamard_ac, _avx ); |
1345 | | if( !(cpu&X264_CPU_STACK_MOD4) ) |
1346 | | { |
1347 | | pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_avx; |
1348 | | pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_avx; |
1349 | | pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_avx; |
1350 | | #if ARCH_X86_64 |
1351 | | pixf->intra_sa8d_x9_8x8 = x264_intra_sa8d_x9_8x8_avx; |
1352 | | #endif |
1353 | | } |
1354 | | INIT5( ssd, _avx ); |
1355 | | pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx; |
1356 | | pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx; |
1357 | | pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c_avx; |
1358 | | pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx; |
1359 | | pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx; |
1360 | | pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_avx; |
1361 | | pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx; |
1362 | | pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx; |
1363 | | pixf->ssim_end4 = x264_pixel_ssim_end4_avx; |
1364 | | #if ARCH_X86_64 |
1365 | | pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx; |
1366 | | #endif |
1367 | | } |
1368 | | |
1369 | | if( cpu&X264_CPU_XOP ) |
1370 | | { |
1371 | | INIT7( satd, _xop ); |
1372 | | INIT7( satd_x3, _xop ); |
1373 | | INIT7( satd_x4, _xop ); |
1374 | | INIT4( hadamard_ac, _xop ); |
1375 | | if( !(cpu&X264_CPU_STACK_MOD4) ) |
1376 | | { |
1377 | | pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_xop; |
1378 | | } |
1379 | | INIT5( ssd, _xop ); |
1380 | | pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_xop; |
1381 | | pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_xop; |
1382 | | pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c_xop; |
1383 | | pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop; |
1384 | | #if ARCH_X86_64 |
1385 | | pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_xop; |
1386 | | #endif |
1387 | | } |
1388 | | |
1389 | | if( cpu&X264_CPU_AVX2 ) |
1390 | | { |
1391 | | INIT2( ssd, _avx2 ); |
1392 | | INIT2( sad_x3, _avx2 ); |
1393 | | INIT2( sad_x4, _avx2 ); |
1394 | | INIT4( satd, _avx2 ); |
1395 | | INIT2( hadamard_ac, _avx2 ); |
1396 | | INIT_ADS( _avx2 ); |
1397 | | pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx2; |
1398 | | pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2; |
1399 | | pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx2; |
1400 | | pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx2; |
1401 | | pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_avx2; |
1402 | | pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_avx2; |
1403 | | pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_avx2; |
1404 | | pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2; |
1405 | | #if ARCH_X86_64 |
1406 | | pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx2; |
1407 | | #endif |
1408 | | } |
1409 | | |
1410 | | if( cpu&X264_CPU_AVX512 ) |
1411 | | { |
1412 | | INIT8( sad, _avx512 ); |
1413 | | INIT8_NAME( sad_aligned, sad, _avx512 ); |
1414 | | INIT7( sad_x3, _avx512 ); |
1415 | | INIT7( sad_x4, _avx512 ); |
1416 | | INIT8( satd, _avx512 ); |
1417 | | INIT7( satd_x3, _avx512 ); |
1418 | | INIT7( satd_x4, _avx512 ); |
1419 | | pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx512; |
1420 | | pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx512; |
1421 | | pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_avx512; |
1422 | | pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx512; |
1423 | | pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx512; |
1424 | | pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx512; |
1425 | | } |
1426 | | #endif //HAVE_MMX |
1427 | | |
1428 | | #if HAVE_ARMV6 |
1429 | | if( cpu&X264_CPU_ARMV6 ) |
1430 | | { |
1431 | | pixf->sad[PIXEL_4x8] = x264_pixel_sad_4x8_armv6; |
1432 | | pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_armv6; |
1433 | | pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_4x8_armv6; |
1434 | | pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_4x4_armv6; |
1435 | | } |
1436 | | if( cpu&X264_CPU_NEON ) |
1437 | | { |
1438 | | INIT5( sad, _neon ); |
1439 | | INIT5( sad_aligned, _neon ); |
1440 | | INIT7( sad_x3, _neon ); |
1441 | | INIT7( sad_x4, _neon ); |
1442 | | INIT7( ssd, _neon ); |
1443 | | INIT7( satd, _neon ); |
1444 | | INIT7( satd_x3, _neon ); |
1445 | | INIT7( satd_x4, _neon ); |
1446 | | INIT4( hadamard_ac, _neon ); |
1447 | | pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_neon; |
1448 | | pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon; |
1449 | | pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_neon; |
1450 | | pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon; |
1451 | | pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_neon; |
1452 | | pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon; |
1453 | | pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon; |
1454 | | pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon; |
1455 | | pixf->vsad = x264_pixel_vsad_neon; |
1456 | | pixf->asd8 = x264_pixel_asd8_neon; |
1457 | | |
1458 | | pixf->intra_sad_x3_4x4 = intra_sad_x3_4x4_neon; |
1459 | | pixf->intra_satd_x3_4x4 = intra_satd_x3_4x4_neon; |
1460 | | pixf->intra_sad_x3_8x8 = intra_sad_x3_8x8_neon; |
1461 | | pixf->intra_sa8d_x3_8x8 = intra_sa8d_x3_8x8_neon; |
1462 | | pixf->intra_sad_x3_8x8c = intra_sad_x3_8x8c_neon; |
1463 | | pixf->intra_satd_x3_8x8c = intra_satd_x3_8x8c_neon; |
1464 | | pixf->intra_sad_x3_8x16c = intra_sad_x3_8x16c_neon; |
1465 | | pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c_neon; |
1466 | | pixf->intra_sad_x3_16x16 = intra_sad_x3_16x16_neon; |
1467 | | pixf->intra_satd_x3_16x16 = intra_satd_x3_16x16_neon; |
1468 | | |
1469 | | pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_neon; |
1470 | | pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon; |
1471 | | pixf->ssim_end4 = x264_pixel_ssim_end4_neon; |
1472 | | |
1473 | | if( cpu&X264_CPU_FAST_NEON_MRC ) |
1474 | | { |
1475 | | pixf->sad[PIXEL_4x8] = x264_pixel_sad_4x8_neon; |
1476 | | pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_neon; |
1477 | | pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_aligned_4x8_neon; |
1478 | | pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_aligned_4x4_neon; |
1479 | | } |
1480 | | else // really just scheduled for dual issue / A8 |
1481 | | { |
1482 | | INIT5( sad_aligned, _neon_dual ); |
1483 | | } |
1484 | | } |
1485 | | #endif |
1486 | | |
1487 | | #if HAVE_AARCH64 |
1488 | | if( cpu&X264_CPU_NEON ) |
1489 | | { |
1490 | | INIT8( sad, _neon ); |
1491 | | // AArch64 has no distinct instructions for aligned load/store |
1492 | | INIT8_NAME( sad_aligned, sad, _neon ); |
1493 | | INIT7( sad_x3, _neon ); |
1494 | | INIT7( sad_x4, _neon ); |
1495 | | INIT8( ssd, _neon ); |
1496 | | INIT8( satd, _neon ); |
1497 | | INIT7( satd_x3, _neon ); |
1498 | | INIT7( satd_x4, _neon ); |
1499 | | INIT4( hadamard_ac, _neon ); |
1500 | | |
1501 | | pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_neon; |
1502 | | pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon; |
1503 | | pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_neon; |
1504 | | |
1505 | | pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon; |
1506 | | pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_neon; |
1507 | | pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon; |
1508 | | pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon; |
1509 | | pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon; |
1510 | | pixf->vsad = x264_pixel_vsad_neon; |
1511 | | pixf->asd8 = x264_pixel_asd8_neon; |
1512 | | |
1513 | | pixf->intra_sad_x3_4x4 = intra_sad_x3_4x4_neon; |
1514 | | pixf->intra_satd_x3_4x4 = intra_satd_x3_4x4_neon; |
1515 | | pixf->intra_sad_x3_8x8 = intra_sad_x3_8x8_neon; |
1516 | | pixf->intra_sa8d_x3_8x8 = intra_sa8d_x3_8x8_neon; |
1517 | | pixf->intra_sad_x3_8x8c = intra_sad_x3_8x8c_neon; |
1518 | | pixf->intra_satd_x3_8x8c = intra_satd_x3_8x8c_neon; |
1519 | | pixf->intra_sad_x3_8x16c = intra_sad_x3_8x16c_neon; |
1520 | | pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c_neon; |
1521 | | pixf->intra_sad_x3_16x16 = intra_sad_x3_16x16_neon; |
1522 | | pixf->intra_satd_x3_16x16 = intra_satd_x3_16x16_neon; |
1523 | | |
1524 | | pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_neon; |
1525 | | pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon; |
1526 | | pixf->ssim_end4 = x264_pixel_ssim_end4_neon; |
1527 | | } |
1528 | | #if HAVE_DOTPROD |
1529 | | if( cpu&X264_CPU_DOTPROD ) { |
1530 | | pixf->sad[PIXEL_16x8] = x264_pixel_sad_16x8_neon_dotprod; |
1531 | | pixf->sad[PIXEL_16x16] = x264_pixel_sad_16x16_neon_dotprod; |
1532 | | pixf->sad_x3[PIXEL_16x8] = x264_pixel_sad_x3_16x8_neon_dotprod; |
1533 | | pixf->sad_x3[PIXEL_16x16] = x264_pixel_sad_x3_16x16_neon_dotprod; |
1534 | | pixf->sad_x4[PIXEL_16x8] = x264_pixel_sad_x4_16x8_neon_dotprod; |
1535 | | pixf->sad_x4[PIXEL_16x16] = x264_pixel_sad_x4_16x16_neon_dotprod; |
1536 | | pixf->ssd[PIXEL_8x4] = x264_pixel_ssd_8x4_neon_dotprod; |
1537 | | pixf->ssd[PIXEL_8x8] = x264_pixel_ssd_8x8_neon_dotprod; |
1538 | | pixf->ssd[PIXEL_8x16] = x264_pixel_ssd_8x16_neon_dotprod; |
1539 | | pixf->ssd[PIXEL_16x16] = x264_pixel_ssd_16x16_neon_dotprod; |
1540 | | pixf->ssd[PIXEL_16x8] = x264_pixel_ssd_16x8_neon_dotprod; |
1541 | | pixf->vsad = x264_pixel_vsad_neon_dotprod; |
1542 | | } |
1543 | | #endif // HAVE_DOTPROD |
1544 | | |
1545 | | #if HAVE_SVE |
1546 | | if( cpu&X264_CPU_SVE ) |
1547 | | { |
1548 | | INIT8_SVE_SSD( ); |
1549 | | INIT4( hadamard_ac, _sve ); |
1550 | | |
1551 | | pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sve; |
1552 | | |
1553 | | pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sve; |
1554 | | pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_sve; |
1555 | | } |
1556 | | #endif |
1557 | | #endif // HAVE_AARCH64 |
1558 | | |
1559 | | #if HAVE_MSA |
1560 | | if( cpu&X264_CPU_MSA ) |
1561 | | { |
1562 | | INIT8( sad, _msa ); |
1563 | | INIT8_NAME( sad_aligned, sad, _msa ); |
1564 | | INIT8( ssd, _msa ); |
1565 | | INIT7( sad_x3, _msa ); |
1566 | | INIT7( sad_x4, _msa ); |
1567 | | INIT8( satd, _msa ); |
1568 | | INIT4( hadamard_ac, _msa ); |
1569 | | |
1570 | | pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_msa; |
1571 | | pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_msa; |
1572 | | pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_msa; |
1573 | | pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_msa; |
1574 | | pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_msa; |
1575 | | pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_msa; |
1576 | | pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_msa; |
1577 | | pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_msa; |
1578 | | |
1579 | | pixf->ssim_4x4x2_core = x264_ssim_4x4x2_core_msa; |
1580 | | |
1581 | | pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_msa; |
1582 | | pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_msa; |
1583 | | pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_msa; |
1584 | | //pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_msa; |
1585 | | //pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_msa; |
1586 | | pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_msa; |
1587 | | pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_msa; |
1588 | | } |
1589 | | #endif // HAVE_MSA |
1590 | | |
1591 | | #if HAVE_LSX |
1592 | | if( cpu&X264_CPU_LSX ) |
1593 | | { |
1594 | | INIT8( sad, _lsx ); |
1595 | | INIT8_NAME( sad_aligned, sad, _lsx ); |
1596 | | INIT8( ssd, _lsx ); |
1597 | | INIT7( sad_x3, _lsx ); |
1598 | | INIT7( sad_x4, _lsx ); |
1599 | | INIT8( satd, _lsx ); |
1600 | | INIT4( hadamard_ac, _lsx ); |
1601 | | |
1602 | | pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_lsx; |
1603 | | pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_lsx; |
1604 | | |
1605 | | pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_lsx; |
1606 | | pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_lsx; |
1607 | | pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_lsx; |
1608 | | pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_lsx; |
1609 | | pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_lsx; |
1610 | | pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_lsx; |
1611 | | |
1612 | | pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_lsx; |
1613 | | pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_lsx; |
1614 | | pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_lsx; |
1615 | | pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_lsx; |
1616 | | pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_lsx; |
1617 | | pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_lsx; |
1618 | | pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_lsx; |
1619 | | } |
1620 | | |
1621 | | if( cpu&X264_CPU_LASX ) |
1622 | | { |
1623 | | INIT4( ssd, _lasx ); |
1624 | | INIT4( hadamard_ac, _lasx ); |
1625 | | |
1626 | | pixf->satd[PIXEL_16x16] = x264_pixel_satd_16x16_lasx; |
1627 | | pixf->satd[PIXEL_16x8] = x264_pixel_satd_16x8_lasx; |
1628 | | pixf->satd[PIXEL_8x16] = x264_pixel_satd_8x16_lasx; |
1629 | | pixf->satd[PIXEL_8x8] = x264_pixel_satd_8x8_lasx; |
1630 | | pixf->satd[PIXEL_8x4] = x264_pixel_satd_8x4_lasx; |
1631 | | pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_lasx; |
1632 | | pixf->satd[PIXEL_4x8] = x264_pixel_satd_4x8_lasx; |
1633 | | |
1634 | | pixf->sad[PIXEL_8x4] = x264_pixel_sad_8x4_lasx; |
1635 | | |
1636 | | pixf->sad_x4[PIXEL_16x16] = x264_pixel_sad_x4_16x16_lasx; |
1637 | | pixf->sad_x4[PIXEL_16x8] = x264_pixel_sad_x4_16x8_lasx; |
1638 | | pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_lasx; |
1639 | | pixf->sad_x4[PIXEL_8x4] = x264_pixel_sad_x4_8x4_lasx; |
1640 | | pixf->sad_x3[PIXEL_16x16] = x264_pixel_sad_x3_16x16_lasx; |
1641 | | pixf->sad_x3[PIXEL_16x8] = x264_pixel_sad_x3_16x8_lasx; |
1642 | | pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_lasx; |
1643 | | pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_lasx; |
1644 | | |
1645 | | pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_lasx; |
1646 | | pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_lasx; |
1647 | | pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_lasx; |
1648 | | pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_lasx; |
1649 | | } |
1650 | | #endif /* HAVE_LSX */ |
1651 | | |
1652 | | #endif // HIGH_BIT_DEPTH |
1653 | | #if HAVE_ALTIVEC |
1654 | | if( cpu&X264_CPU_ALTIVEC ) |
1655 | | { |
1656 | | x264_pixel_init_altivec( pixf ); |
1657 | | } |
1658 | | #endif |
1659 | |
|
1660 | 0 | pixf->ads[PIXEL_8x16] = |
1661 | 0 | pixf->ads[PIXEL_8x4] = |
1662 | 0 | pixf->ads[PIXEL_4x8] = pixf->ads[PIXEL_16x8]; |
1663 | 0 | pixf->ads[PIXEL_4x4] = pixf->ads[PIXEL_8x8]; |
1664 | 0 | } Unexecuted instantiation: x264_8_pixel_init Unexecuted instantiation: x264_10_pixel_init |
1665 | | |