/src/libde265/libde265/x86/sse.cc
Line | Count | Source |
1 | | /* |
2 | | * H.265 video codec. |
3 | | * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de> |
4 | | * |
5 | | * This file is part of libde265. |
6 | | * |
7 | | * libde265 is free software: you can redistribute it and/or modify |
8 | | * it under the terms of the GNU Lesser General Public License as |
9 | | * published by the Free Software Foundation, either version 3 of |
10 | | * the License, or (at your option) any later version. |
11 | | * |
12 | | * libde265 is distributed in the hope that it will be useful, |
13 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | * GNU Lesser General Public License for more details. |
16 | | * |
17 | | * You should have received a copy of the GNU Lesser General Public License |
18 | | * along with libde265. If not, see <http://www.gnu.org/licenses/>. |
19 | | */ |
20 | | |
21 | | #ifdef _MSC_VER |
22 | | #include <intrin.h> |
23 | | #endif |
24 | | |
25 | | #include "x86/sse.h" |
26 | | #include "x86/sse-motion.h" |
27 | | #include "x86/sse-dct.h" |
28 | | |
29 | | #ifdef HAVE_CONFIG_H |
30 | | #include "config.h" |
31 | | #endif |
32 | | |
33 | | #if defined(__GNUC__) && !defined(__EMSCRIPTEN__) |
34 | | #include <cpuid.h> |
35 | | #endif |
36 | | |
/*
 * Install the SSE implementations of the 8-bit prediction and transform
 * routines into `accel`.
 *
 * Nothing is written to `accel` unless both of the following hold:
 *   - the build enables the SSE4.1 code (HAVE_SSE4_1 is non-zero;
 *     presumably provided by config.h -- confirm against the build system),
 *   - SSE4.1 is reported as available: through CPUID leaf 1 on native
 *     builds, or through the __SSE4_1__ compile-time macro on Emscripten,
 *     where CPUID is not queried.
 *
 * Entries of `accel` that are not assigned below keep whatever value the
 * caller stored there beforehand.
 */
void init_acceleration_functions_sse(struct acceleration_functions* accel)
{
  int have_SSE4_1 = 0;

#ifdef __EMSCRIPTEN__
  // No CPUID query here: trust the instruction set selected at compile time.
#ifdef __SSE4_1__
  have_SSE4_1 = 1;
#endif
#else
  // CPUID leaf 1 reports SSE4.1 support in bit 19 of ECX.
  const uint32_t cpuid_ecx_sse4_1 = 1u << 19;
  uint32_t ecx = 0;

#ifdef _MSC_VER
  // The MSVC intrinsic fills { EAX, EBX, ECX, EDX } in this order.
  int regs[4] = { 0, 0, 0, 0 };
  __cpuid(regs, 1);
  ecx = (uint32_t)regs[2];
#else
  uint32_t eax = 0, ebx = 0, edx = 0;

  // __get_cpuid() returns 0 when the requested leaf is not supported; in
  // that case make sure no feature bit is treated as set.
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
    ecx = 0;
  }
#endif

  have_SSE4_1 = (ecx & cpuid_ecx_sse4_1) != 0;
#endif

#if HAVE_SSE4_1
  if (have_SSE4_1) {
    accel->put_unweighted_pred_8   = ff_hevc_put_unweighted_pred_8_sse;
    accel->put_weighted_pred_avg_8 = ff_hevc_put_weighted_pred_avg_8_sse;

    accel->put_hevc_epel_8    = ff_hevc_put_hevc_epel_pixels_8_sse;
    accel->put_hevc_epel_h_8  = ff_hevc_put_hevc_epel_h_8_sse;
    accel->put_hevc_epel_v_8  = ff_hevc_put_hevc_epel_v_8_sse;
    accel->put_hevc_epel_hv_8 = ff_hevc_put_hevc_epel_hv_8_sse;

    // The first index matches the h_N part and the second index the v_N
    // part of the assigned routine's name; [0][0] is the plain pixel copy.
    accel->put_hevc_qpel_8[0][0] = ff_hevc_put_hevc_qpel_pixels_8_sse;
    accel->put_hevc_qpel_8[0][1] = ff_hevc_put_hevc_qpel_v_1_8_sse;
    accel->put_hevc_qpel_8[0][2] = ff_hevc_put_hevc_qpel_v_2_8_sse;
    accel->put_hevc_qpel_8[0][3] = ff_hevc_put_hevc_qpel_v_3_8_sse;
    accel->put_hevc_qpel_8[1][0] = ff_hevc_put_hevc_qpel_h_1_8_sse;
    accel->put_hevc_qpel_8[1][1] = ff_hevc_put_hevc_qpel_h_1_v_1_sse;
    accel->put_hevc_qpel_8[1][2] = ff_hevc_put_hevc_qpel_h_1_v_2_sse;
    accel->put_hevc_qpel_8[1][3] = ff_hevc_put_hevc_qpel_h_1_v_3_sse;
    accel->put_hevc_qpel_8[2][0] = ff_hevc_put_hevc_qpel_h_2_8_sse;
    accel->put_hevc_qpel_8[2][1] = ff_hevc_put_hevc_qpel_h_2_v_1_sse;
    accel->put_hevc_qpel_8[2][2] = ff_hevc_put_hevc_qpel_h_2_v_2_sse;
    accel->put_hevc_qpel_8[2][3] = ff_hevc_put_hevc_qpel_h_2_v_3_sse;
    accel->put_hevc_qpel_8[3][0] = ff_hevc_put_hevc_qpel_h_3_8_sse;
    accel->put_hevc_qpel_8[3][1] = ff_hevc_put_hevc_qpel_h_3_v_1_sse;
    accel->put_hevc_qpel_8[3][2] = ff_hevc_put_hevc_qpel_h_3_v_2_sse;
    accel->put_hevc_qpel_8[3][3] = ff_hevc_put_hevc_qpel_h_3_v_3_sse;

    accel->transform_skip_8 = ff_hevc_transform_skip_8_sse;

    // transform_4x4_luma_add_8 and transform_4x4_add_8 are deliberately
    // left untouched: for these two functions the scalar fallback seems to
    // be faster than the SSE4 code
    // (ff_hevc_transform_4x4_luma_add_8_sse4 / ff_hevc_transform_4x4_add_8_sse4).

    accel->transform_add_8[1] = ff_hevc_transform_8x8_add_8_sse4;
    accel->transform_add_8[2] = ff_hevc_transform_16x16_add_8_sse4;
    accel->transform_add_8[3] = ff_hevc_transform_32x32_add_8_sse4;
  }
#else
  // SSE4.1 code is not part of this build: nothing to install.
  (void)have_SSE4_1;
  (void)accel;
#endif
}
115 | | |