Coverage Report

Created: 2026-06-15 06:25

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/libde265/libde265/x86/sse.cc
Line
Count
Source
1
/*
2
 * H.265 video codec.
3
 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
4
 *
5
 * This file is part of libde265.
6
 *
7
 * libde265 is free software: you can redistribute it and/or modify
8
 * it under the terms of the GNU Lesser General Public License as
9
 * published by the Free Software Foundation, either version 3 of
10
 * the License, or (at your option) any later version.
11
 *
12
 * libde265 is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 * GNU Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public License
18
 * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
19
 */
20
21
#ifdef _MSC_VER
22
#include <intrin.h>
23
#endif
24
25
#include "x86/sse.h"
26
#include "x86/sse-motion.h"
27
#include "x86/sse-dct.h"
28
#include "x86/sse-intrapred.h"
29
#include "x86/sse-deblk.h"
30
31
#ifdef HAVE_CONFIG_H
32
#include "config.h"
33
#endif
34
35
#if HAVE_AVX2
36
#include "x86/transform-avx2.h"
37
#endif
38
#if HAVE_AVX512
39
#include "x86/transform-avx512.h"
40
#endif
41
42
#if defined(__GNUC__) && !defined(__EMSCRIPTEN__)
43
#include <cpuid.h>
44
#endif
45
46
// Install the SSE4.1 implementations into the acceleration function table.
//
// The CPU capability is read at runtime from CPUID leaf 1 (ECX bit 19 is
// tested as the SSE4.1 flag). MSVC uses the __cpuid intrinsic, GCC-compatible
// compilers use __get_cpuid from <cpuid.h>. Emscripten builds do not query
// CPUID and rely on the compile-time __SSE4_1__ macro instead.
//
// 'accel' is only modified when the library was built with HAVE_SSE4_1 and
// SSE4.1 was detected. Otherwise all entries keep their previous values.
void init_acceleration_functions_sse(struct acceleration_functions* accel)
{
  int have_SSE4_1 = 0;

#if defined(__EMSCRIPTEN__)
  // The feature set is fixed at compile time.
#ifdef __SSE4_1__
  have_SSE4_1 = 1;
#endif
#else
  // Stays 0 ("no features") if no CPUID query is available below.
  unsigned int ecx = 0;

#if defined(_MSC_VER)
  int regs[4] = { 0, 0, 0, 0 };
  __cpuid(regs, 1);
  ecx = (unsigned int)regs[2];
#elif defined(__GNUC__)
  // Same condition under which <cpuid.h> is included at the top of the file.
  unsigned int eax = 0, ebx = 0, edx = 0;
  __get_cpuid(1, &eax, &ebx, &ecx, &edx);
#endif

  have_SSE4_1 = !!(ecx & (1u << 19));
#endif

#if HAVE_SSE4_1
  if (have_SSE4_1) {
    // --- motion compensation: unweighted / averaged prediction ---
    accel->put_unweighted_pred_8   = ff_hevc_put_unweighted_pred_8_sse;
    accel->put_weighted_pred_avg_8 = ff_hevc_put_weighted_pred_avg_8_sse;

    // --- motion compensation: chroma (epel) interpolation ---
    accel->put_hevc_epel_8    = ff_hevc_put_hevc_epel_pixels_8_sse;
    accel->put_hevc_epel_h_8  = ff_hevc_put_hevc_epel_h_8_sse;
    accel->put_hevc_epel_v_8  = ff_hevc_put_hevc_epel_v_8_sse;
    accel->put_hevc_epel_hv_8 = ff_hevc_put_hevc_epel_hv_8_sse;

    // --- motion compensation: luma (qpel) interpolation, table indexed [h][v] ---
    accel->put_hevc_qpel_8[0][0] = ff_hevc_put_hevc_qpel_pixels_8_sse;
    accel->put_hevc_qpel_8[0][1] = ff_hevc_put_hevc_qpel_v_1_8_sse;
    accel->put_hevc_qpel_8[0][2] = ff_hevc_put_hevc_qpel_v_2_8_sse;
    accel->put_hevc_qpel_8[0][3] = ff_hevc_put_hevc_qpel_v_3_8_sse;
    accel->put_hevc_qpel_8[1][0] = ff_hevc_put_hevc_qpel_h_1_8_sse;
    accel->put_hevc_qpel_8[1][1] = ff_hevc_put_hevc_qpel_h_1_v_1_sse;
    accel->put_hevc_qpel_8[1][2] = ff_hevc_put_hevc_qpel_h_1_v_2_sse;
    accel->put_hevc_qpel_8[1][3] = ff_hevc_put_hevc_qpel_h_1_v_3_sse;
    accel->put_hevc_qpel_8[2][0] = ff_hevc_put_hevc_qpel_h_2_8_sse;
    accel->put_hevc_qpel_8[2][1] = ff_hevc_put_hevc_qpel_h_2_v_1_sse;
    accel->put_hevc_qpel_8[2][2] = ff_hevc_put_hevc_qpel_h_2_v_2_sse;
    accel->put_hevc_qpel_8[2][3] = ff_hevc_put_hevc_qpel_h_2_v_3_sse;
    accel->put_hevc_qpel_8[3][0] = ff_hevc_put_hevc_qpel_h_3_8_sse;
    accel->put_hevc_qpel_8[3][1] = ff_hevc_put_hevc_qpel_h_3_v_1_sse;
    accel->put_hevc_qpel_8[3][2] = ff_hevc_put_hevc_qpel_h_3_v_2_sse;
    accel->put_hevc_qpel_8[3][3] = ff_hevc_put_hevc_qpel_h_3_v_3_sse;

    // --- residual / transform ---
    accel->transform_skip_8 = ff_hevc_transform_skip_8_sse;

    // actually, for these two functions, the scalar fallback seems to be faster than the SSE code
    //accel->transform_4x4_luma_add_8 = ff_hevc_transform_4x4_luma_add_8_sse4; // SSE-4 only TODO
    //accel->transform_4x4_add_8   = ff_hevc_transform_4x4_add_8_sse4;

    accel->transform_add_8[1] = ff_hevc_transform_8x8_add_8_sse4;
    accel->transform_add_8[2] = ff_hevc_transform_16x16_add_8_sse4;
    accel->transform_add_8[3] = ff_hevc_transform_32x32_add_8_sse4;

    accel->add_residual_8  = add_residual_8_sse4;
    accel->add_residual_16 = add_residual_16_sse4;
    accel->dequant_coeff_block = dequant_coeff_block_sse4;

    // --- intra prediction ---
    accel->intra_pred_dc_8      = intra_pred_dc_8_sse4;
    accel->intra_pred_planar_8  = intra_pred_planar_8_sse4;
    accel->intra_pred_angular_8 = intra_pred_angular_8_sse4;

    // --- deblocking ---
    accel->deblock_luma_8   = deblock_luma_8_sse4;
    // chroma deblock stays on the scalar fallback: the filter is too cheap to
    // amortize the SIMD load/transpose/scatter overhead (SSE measured slower).
  }
#else
  // Built without SSE4.1 code paths: nothing to install.
  (void)accel;
  (void)have_SSE4_1;
#endif
}
136
137
138
// Switch the 16x16 and 32x32 inverse-transform entries of 'accel' to their
// AVX2 versions. This only happens when the library was built with HAVE_AVX2,
// the compiler is GCC-compatible (and not Emscripten), and the running CPU
// reports AVX2 support. In every other case 'accel' is left untouched.
void init_acceleration_functions_avx2(struct acceleration_functions* accel)
{
#if HAVE_AVX2 && defined(__GNUC__) && !defined(__EMSCRIPTEN__)
  // __builtin_cpu_supports("avx2") handles the OSXSAVE / XGETBV (YMM-enabled)
  // checks internally, so this is safe to call on any CPU. This TU is *not*
  // compiled with -mavx2, so reaching here never executes an AVX2 instruction.
  __builtin_cpu_init();

  if (!__builtin_cpu_supports("avx2")) {
    return;
  }

  accel->transform_add_8[2] = transform_16x16_add_8_avx2;
  accel->transform_add_8[3] = transform_32x32_add_8_avx2;

  // NB: dequant intentionally stays on the SSE version. An AVX2 variant was
  // implemented and benchmarked, but inverse quantization is scatter-bound, so
  // the wider arithmetic gave no benefit and AVX2 measured actually slightly
  // slower than SSE. (AVX-512 would be no better, for the same reason.)
#else
  // No AVX2 code paths in this build.
  (void)accel;
#endif
}
157
158
159
// Switch the 32x32 inverse-transform entry of 'accel' to the AVX-512 version.
// This only happens when the library was built with HAVE_AVX512, the compiler
// is GCC-compatible (and not Emscripten), and the running CPU reports both
// AVX-512F and AVX-512BW. In every other case 'accel' is left untouched.
void init_acceleration_functions_avx512(struct acceleration_functions* accel)
{
#if HAVE_AVX512 && defined(__GNUC__) && !defined(__EMSCRIPTEN__)
  __builtin_cpu_init();

  // need AVX-512F + AVX-512BW (16-bit ops). __builtin_cpu_supports handles the
  // OS (XCR0/ZMM-enabled) check. This TU is not compiled with -mavx512*.
  if (!__builtin_cpu_supports("avx512f")) {
    return;
  }
  if (!__builtin_cpu_supports("avx512bw")) {
    return;
  }

  accel->transform_add_8[3] = transform_32x32_add_8_avx512;
#else
  // No AVX-512 code paths in this build.
  (void)accel;
#endif
}
172