Coverage Report

Created: 2025-11-14 07:32

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libde265/libde265/x86/sse.cc
Line
Count
Source
1
/*
2
 * H.265 video codec.
3
 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
4
 *
5
 * This file is part of libde265.
6
 *
7
 * libde265 is free software: you can redistribute it and/or modify
8
 * it under the terms of the GNU Lesser General Public License as
9
 * published by the Free Software Foundation, either version 3 of
10
 * the License, or (at your option) any later version.
11
 *
12
 * libde265 is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 * GNU Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public License
18
 * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
19
 */
20
21
#ifdef _MSC_VER
22
#include <intrin.h>
23
#endif
24
25
#include "x86/sse.h"
26
#include "x86/sse-motion.h"
27
#include "x86/sse-dct.h"
28
29
#ifdef HAVE_CONFIG_H
30
#include "config.h"
31
#endif
32
33
#if defined(__GNUC__) && !defined(__EMSCRIPTEN__)
34
#include <cpuid.h>
35
#endif
36
37
#if HAVE_SSE4_1
/*
 * Runtime check for SSE4.1 support.
 *
 * Returns 1 if the SSE4.1 code paths may be used, 0 otherwise.
 *
 *  - Emscripten: there is no CPUID query in this build, so the decision is
 *    made at compile time from the __SSE4_1__ predefined macro.
 *  - MSVC: uses the __cpuid intrinsic with function id 1.
 *  - everything else: uses __get_cpuid() from <cpuid.h>. If that call
 *    reports failure, no feature bits are available and 0 is returned.
 *
 * The flag that is tested is bit 19 of ECX as returned for CPUID function 1.
 */
static int cpu_supports_sse4_1(void)
{
#if defined(__EMSCRIPTEN__)
#  ifdef __SSE4_1__
  return 1;
#  else
  return 0;
#  endif
#elif defined(_MSC_VER)
  // __cpuid fills the array in the order EAX, EBX, ECX, EDX.
  int regs[4] = { 0, 0, 0, 0 };
  __cpuid(regs, 1);

  return (((unsigned int)regs[2]) >> 19) & 1u;
#else
  unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;

  // __get_cpuid returns 0 when the requested function is not supported;
  // treat that as "feature not present" instead of inspecting the registers.
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
    return 0;
  }

  return (ecx >> 19) & 1u;
#endif
}
#endif


/*
 * Installs the SSE-accelerated routines into the function table 'accel'.
 *
 * accel: function table to patch. Only the entries listed below are
 *        written; all other entries keep whatever value the caller put
 *        there before.
 *
 * Entries are only replaced when the build has HAVE_SSE4_1 enabled and the
 * CPU (or, for Emscripten, the compiler flags) reports SSE4.1. Otherwise
 * this function leaves 'accel' untouched.
 */
void init_acceleration_functions_sse(struct acceleration_functions* accel)
{
#if HAVE_SSE4_1
  if (!cpu_supports_sse4_1()) {
    return;
  }

  // --- prediction output ---

  accel->put_unweighted_pred_8   = ff_hevc_put_unweighted_pred_8_sse;
  accel->put_weighted_pred_avg_8 = ff_hevc_put_weighted_pred_avg_8_sse;

  // --- chroma (epel) interpolation ---

  accel->put_hevc_epel_8    = ff_hevc_put_hevc_epel_pixels_8_sse;
  accel->put_hevc_epel_h_8  = ff_hevc_put_hevc_epel_h_8_sse;
  accel->put_hevc_epel_v_8  = ff_hevc_put_hevc_epel_v_8_sse;
  accel->put_hevc_epel_hv_8 = ff_hevc_put_hevc_epel_hv_8_sse;

  // --- luma (qpel) interpolation ---
  // Judging by the names of the assigned functions, the first index selects
  // the horizontal and the second index the vertical fractional position.

  accel->put_hevc_qpel_8[0][0] = ff_hevc_put_hevc_qpel_pixels_8_sse;
  accel->put_hevc_qpel_8[0][1] = ff_hevc_put_hevc_qpel_v_1_8_sse;
  accel->put_hevc_qpel_8[0][2] = ff_hevc_put_hevc_qpel_v_2_8_sse;
  accel->put_hevc_qpel_8[0][3] = ff_hevc_put_hevc_qpel_v_3_8_sse;

  accel->put_hevc_qpel_8[1][0] = ff_hevc_put_hevc_qpel_h_1_8_sse;
  accel->put_hevc_qpel_8[1][1] = ff_hevc_put_hevc_qpel_h_1_v_1_sse;
  accel->put_hevc_qpel_8[1][2] = ff_hevc_put_hevc_qpel_h_1_v_2_sse;
  accel->put_hevc_qpel_8[1][3] = ff_hevc_put_hevc_qpel_h_1_v_3_sse;

  accel->put_hevc_qpel_8[2][0] = ff_hevc_put_hevc_qpel_h_2_8_sse;
  accel->put_hevc_qpel_8[2][1] = ff_hevc_put_hevc_qpel_h_2_v_1_sse;
  accel->put_hevc_qpel_8[2][2] = ff_hevc_put_hevc_qpel_h_2_v_2_sse;
  accel->put_hevc_qpel_8[2][3] = ff_hevc_put_hevc_qpel_h_2_v_3_sse;

  accel->put_hevc_qpel_8[3][0] = ff_hevc_put_hevc_qpel_h_3_8_sse;
  accel->put_hevc_qpel_8[3][1] = ff_hevc_put_hevc_qpel_h_3_v_1_sse;
  accel->put_hevc_qpel_8[3][2] = ff_hevc_put_hevc_qpel_h_3_v_2_sse;
  accel->put_hevc_qpel_8[3][3] = ff_hevc_put_hevc_qpel_h_3_v_3_sse;

  // --- residual / inverse transforms ---

  accel->transform_skip_8 = ff_hevc_transform_skip_8_sse;

  // transform_4x4_luma_add_8 and transform_4x4_add_8 are intentionally not
  // replaced: for these two functions, the scalar fallback seems to be
  // faster than the SSE code (ff_hevc_transform_4x4_luma_add_8_sse4 /
  // ff_hevc_transform_4x4_add_8_sse4).

  // Index 0 of transform_add_8 is not written here; only 8x8, 16x16 and
  // 32x32 get SSE4 versions.
  accel->transform_add_8[1] = ff_hevc_transform_8x8_add_8_sse4;
  accel->transform_add_8[2] = ff_hevc_transform_16x16_add_8_sse4;
  accel->transform_add_8[3] = ff_hevc_transform_32x32_add_8_sse4;
#else
  // Built without SSE4.1 support: nothing to install.
  (void)accel;
#endif
}
115