/work/libde265/libde265/x86/sse.cc
Line | Count | Source |
1 | | /* |
2 | | * H.265 video codec. |
3 | | * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de> |
4 | | * |
5 | | * This file is part of libde265. |
6 | | * |
7 | | * libde265 is free software: you can redistribute it and/or modify |
8 | | * it under the terms of the GNU Lesser General Public License as |
9 | | * published by the Free Software Foundation, either version 3 of |
10 | | * the License, or (at your option) any later version. |
11 | | * |
12 | | * libde265 is distributed in the hope that it will be useful, |
13 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | * GNU Lesser General Public License for more details. |
16 | | * |
17 | | * You should have received a copy of the GNU Lesser General Public License |
18 | | * along with libde265. If not, see <http://www.gnu.org/licenses/>. |
19 | | */ |
20 | | |
21 | | #ifdef _MSC_VER |
22 | | #include <intrin.h> |
23 | | #endif |
24 | | |
25 | | #include "x86/sse.h" |
26 | | #include "x86/sse-motion.h" |
27 | | #include "x86/sse-dct.h" |
28 | | #include "x86/sse-intrapred.h" |
29 | | #include "x86/sse-deblk.h" |
30 | | |
31 | | #ifdef HAVE_CONFIG_H |
32 | | #include "config.h" |
33 | | #endif |
34 | | |
35 | | #if HAVE_AVX2 |
36 | | #include "x86/transform-avx2.h" |
37 | | #endif |
38 | | #if HAVE_AVX512 |
39 | | #include "x86/transform-avx512.h" |
40 | | #endif |
41 | | |
42 | | #if defined(__GNUC__) && !defined(__EMSCRIPTEN__) |
43 | | #include <cpuid.h> |
44 | | #endif |
45 | | /* Detect SSE / SSE4.1 (CPUID leaf 1, or compiler macros under Emscripten) and, when built with HAVE_SSE4_1 and SSE4.1 is reported, store the SSE kernels into *accel. Otherwise *accel is left untouched. */ |
46 | | void init_acceleration_functions_sse(struct acceleration_functions* accel) |
47 | 2.05k | { |
48 | 2.05k | uint32_t ecx=0,edx=0; |
49 | | /* ecx/edx start at 0, so the feature bits read as absent unless a CPUID path below overwrites them */ |
50 | | #ifdef _MSC_VER |
51 | | uint32_t regs[4]; |
52 | | int a = 1; |
53 | | |
54 | | __cpuid((int *)regs, (int)a); |
55 | | /* regs[2] and regs[3] are taken as the ECX and EDX outputs of the query */ |
56 | | ecx = regs[2]; |
57 | | edx = regs[3]; |
58 | | #elif !defined(__EMSCRIPTEN__) |
59 | | uint32_t eax,ebx; |
60 | 2.05k | __get_cpuid(1, &eax,&ebx,&ecx,&edx); |
61 | 2.05k | #endif |
62 | | /* Emscripten builds skip CPUID entirely: the flags come from the compiler's __SSE__ / __SSE4_1__ macros */ |
63 | | #ifdef __EMSCRIPTEN__ |
64 | | int have_SSE = 0; |
65 | | int have_SSE4_1 = 0; |
66 | | #ifdef __SSE__ |
67 | | have_SSE = 1; |
68 | | #endif |
69 | | #ifdef __SSE4_1__ |
70 | | have_SSE4_1 = 1; |
71 | | #endif |
72 | | #else |
73 | | // printf("CPUID EAX=1 -> ECX=%x EDX=%x\n", regs[2], regs[3]); |
74 | | /* feature bits tested below: EDX bit 25 -> SSE, ECX bit 19 -> SSE4.1 */ |
75 | | //int have_MMX = !!(edx & (1<<23)); |
76 | 2.05k | int have_SSE = !!(edx & (1<<25)); |
77 | 2.05k | int have_SSE4_1 = !!(ecx & (1<<19)); |
78 | | |
79 | | // printf("MMX:%d SSE:%d SSE4_1:%d\n",have_MMX,have_SSE,have_SSE4_1); |
80 | | /* intentionally empty: nothing is installed for plain SSE, and have_SSE is not used anywhere else in this function */ |
81 | 2.05k | if (have_SSE) { |
82 | 2.05k | } |
83 | 2.05k | #endif |
84 | | /* the SSE4.1 kernels need both build-time support (HAVE_SSE4_1) and the runtime flag */ |
85 | 2.05k | #if HAVE_SSE4_1 |
86 | 2.05k | if (have_SSE4_1) { |
87 | 2.05k | accel->put_unweighted_pred_8 = ff_hevc_put_unweighted_pred_8_sse; |
88 | 2.05k | accel->put_weighted_pred_avg_8 = ff_hevc_put_weighted_pred_avg_8_sse; |
89 | | |
90 | 2.05k | accel->put_hevc_epel_8 = ff_hevc_put_hevc_epel_pixels_8_sse; |
91 | 2.05k | accel->put_hevc_epel_h_8 = ff_hevc_put_hevc_epel_h_8_sse; |
92 | 2.05k | accel->put_hevc_epel_v_8 = ff_hevc_put_hevc_epel_v_8_sse; |
93 | 2.05k | accel->put_hevc_epel_hv_8 = ff_hevc_put_hevc_epel_hv_8_sse; |
94 | | /* qpel table: judging by the kernel names, the first index is the horizontal variant and the second the vertical one, with 0 meaning none (TODO confirm against the struct declaration) */ |
95 | 2.05k | accel->put_hevc_qpel_8[0][0] = ff_hevc_put_hevc_qpel_pixels_8_sse; |
96 | 2.05k | accel->put_hevc_qpel_8[0][1] = ff_hevc_put_hevc_qpel_v_1_8_sse; |
97 | 2.05k | accel->put_hevc_qpel_8[0][2] = ff_hevc_put_hevc_qpel_v_2_8_sse; |
98 | 2.05k | accel->put_hevc_qpel_8[0][3] = ff_hevc_put_hevc_qpel_v_3_8_sse; |
99 | 2.05k | accel->put_hevc_qpel_8[1][0] = ff_hevc_put_hevc_qpel_h_1_8_sse; |
100 | 2.05k | accel->put_hevc_qpel_8[1][1] = ff_hevc_put_hevc_qpel_h_1_v_1_sse; |
101 | 2.05k | accel->put_hevc_qpel_8[1][2] = ff_hevc_put_hevc_qpel_h_1_v_2_sse; |
102 | 2.05k | accel->put_hevc_qpel_8[1][3] = ff_hevc_put_hevc_qpel_h_1_v_3_sse; |
103 | 2.05k | accel->put_hevc_qpel_8[2][0] = ff_hevc_put_hevc_qpel_h_2_8_sse; |
104 | 2.05k | accel->put_hevc_qpel_8[2][1] = ff_hevc_put_hevc_qpel_h_2_v_1_sse; |
105 | 2.05k | accel->put_hevc_qpel_8[2][2] = ff_hevc_put_hevc_qpel_h_2_v_2_sse; |
106 | 2.05k | accel->put_hevc_qpel_8[2][3] = ff_hevc_put_hevc_qpel_h_2_v_3_sse; |
107 | 2.05k | accel->put_hevc_qpel_8[3][0] = ff_hevc_put_hevc_qpel_h_3_8_sse; |
108 | 2.05k | accel->put_hevc_qpel_8[3][1] = ff_hevc_put_hevc_qpel_h_3_v_1_sse; |
109 | 2.05k | accel->put_hevc_qpel_8[3][2] = ff_hevc_put_hevc_qpel_h_3_v_2_sse; |
110 | 2.05k | accel->put_hevc_qpel_8[3][3] = ff_hevc_put_hevc_qpel_h_3_v_3_sse; |
111 | | |
112 | 2.05k | accel->transform_skip_8 = ff_hevc_transform_skip_8_sse; |
113 | | |
114 | | // actually, for these two functions, the scalar fallback seems to be faster than the SSE code |
115 | | //accel->transform_4x4_luma_add_8 = ff_hevc_transform_4x4_luma_add_8_sse4; // SSE-4 only TODO |
116 | | //accel->transform_4x4_add_8 = ff_hevc_transform_4x4_add_8_sse4; |
117 | | /* transform_add_8 slots 1..3 receive the 8x8, 16x16 and 32x32 kernels; slot 0 is not assigned here */ |
118 | 2.05k | accel->transform_add_8[1] = ff_hevc_transform_8x8_add_8_sse4; |
119 | 2.05k | accel->transform_add_8[2] = ff_hevc_transform_16x16_add_8_sse4; |
120 | 2.05k | accel->transform_add_8[3] = ff_hevc_transform_32x32_add_8_sse4; |
121 | | |
122 | 2.05k | accel->add_residual_8 = add_residual_8_sse4; |
123 | 2.05k | accel->add_residual_16 = add_residual_16_sse4; |
124 | 2.05k | accel->dequant_coeff_block = dequant_coeff_block_sse4; |
125 | | |
126 | 2.05k | accel->intra_pred_dc_8 = intra_pred_dc_8_sse4; |
127 | 2.05k | accel->intra_pred_planar_8 = intra_pred_planar_8_sse4; |
128 | 2.05k | accel->intra_pred_angular_8 = intra_pred_angular_8_sse4; |
129 | | |
130 | 2.05k | accel->deblock_luma_8 = deblock_luma_8_sse4; |
131 | | // chroma deblock stays on the scalar fallback: the filter is too cheap to |
132 | | // amortize the SIMD load/transpose/scatter overhead (SSE measured slower). |
133 | 2.05k | } |
134 | 2.05k | #endif |
135 | 2.05k | } |
136 | | |
137 | | /* Replace the 16x16 and 32x32 transform_add_8 slots of *accel with AVX2 kernels. Only active when built with HAVE_AVX2 on a GCC-compatible, non-Emscripten compiler and the running CPU reports "avx2"; otherwise this is a no-op. */ |
138 | | void init_acceleration_functions_avx2(struct acceleration_functions* accel) |
139 | 2.05k | { |
140 | 2.05k | #if HAVE_AVX2 |
141 | | // __builtin_cpu_supports("avx2") handles the OSXSAVE / XGETBV (YMM-enabled) |
142 | | // checks internally, so this is safe to call on any CPU. This TU is *not* |
143 | | // compiled with -mavx2, so reaching here never executes an AVX2 instruction. |
144 | 2.05k | #if defined(__GNUC__) && !defined(__EMSCRIPTEN__) |
145 | 2.05k | __builtin_cpu_init(); |
146 | 2.05k | if (__builtin_cpu_supports("avx2")) { |
147 | 2.05k | accel->transform_add_8[2] = transform_16x16_add_8_avx2; |
148 | 2.05k | accel->transform_add_8[3] = transform_32x32_add_8_avx2; |
149 | | // NB: dequant intentionally stays on the SSE version. An AVX2 variant was |
150 | | // implemented and benchmarked, but inverse quantization is scatter-bound, so |
151 | | // the wider arithmetic gave no benefit and AVX2 actually measured slightly |
152 | | // slower than SSE. (AVX-512 would be no better, for the same reason.) |
153 | 2.05k | } |
154 | 2.05k | #endif |
155 | 2.05k | #endif |
156 | 2.05k | } |
157 | | |
158 | | /* Replace the 32x32 transform_add_8 slot of *accel with the AVX-512 kernel. Only active when built with HAVE_AVX512 on a GCC-compatible, non-Emscripten compiler and the running CPU reports both "avx512f" and "avx512bw"; otherwise this is a no-op. */ |
159 | | void init_acceleration_functions_avx512(struct acceleration_functions* accel) |
160 | 2.05k | { |
161 | 2.05k | #if HAVE_AVX512 |
162 | 2.05k | #if defined(__GNUC__) && !defined(__EMSCRIPTEN__) |
163 | 2.05k | __builtin_cpu_init(); |
164 | | // need AVX-512F + AVX-512BW (16-bit ops). __builtin_cpu_supports handles the |
165 | | // OS (XCR0/ZMM-enabled) check. This TU is not compiled with -mavx512*. |
166 | 2.05k | if (__builtin_cpu_supports("avx512f") && __builtin_cpu_supports("avx512bw")) { |
167 | 0 | accel->transform_add_8[3] = transform_32x32_add_8_avx512; |
168 | 0 | } |
169 | 2.05k | #endif |
170 | 2.05k | #endif |
171 | 2.05k | } |
172 | | |