/src/libde265/libde265/x86/sse.cc

Source
/*
 * H.265 video codec.
 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
 *
 * This file is part of libde265.
 *
 * libde265 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * libde265 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifdef _MSC_VER
#include <intrin.h>
#endif

#include "x86/sse.h"
#include "x86/sse-motion.h"
#include "x86/sse-dct.h"

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#if defined(__GNUC__) && !defined(__EMSCRIPTEN__)
#include <cpuid.h>
#endif

/*
 * Install the SSE implementations into the acceleration function table.
 *
 * The table entries are only overwritten when both of these hold:
 *   - the build enables the SSE4.1 code (HAVE_SSE4_1 evaluates to non-zero), and
 *   - SSE4.1 is available at runtime (CPUID leaf 1, ECX bit 19), or, when
 *     compiling with Emscripten, the __SSE4_1__ compile-time macro is defined.
 *
 * Otherwise 'accel' is left completely untouched, so whatever entries the
 * caller stored there before remain in effect.
 *
 * accel: function table to update in place.
 */
void init_acceleration_functions_sse(struct acceleration_functions* accel)
{
  int have_SSE4_1 = 0;

#ifdef __EMSCRIPTEN__
  // CPUID is not queried for Emscripten builds; the decision is made purely
  // from the instruction set the compiler was told to target.
#ifdef __SSE4_1__
  have_SSE4_1 = 1;
#endif
#else
  uint32_t ecx = 0;

#ifdef _MSC_VER
  int regs[4] = { 0, 0, 0, 0 };  // EAX, EBX, ECX, EDX

  __cpuid(regs, 1);
  ecx = (uint32_t)regs[2];
#else
  uint32_t eax = 0, ebx = 0, edx = 0;

  // __get_cpuid() returns 0 when the requested leaf is not supported. Treat
  // that explicitly as "no extensions" instead of relying on initial values.
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
    ecx = 0;
  }
#endif

  // CPUID leaf 1: ECX bit 19 signals SSE4.1.
  have_SSE4_1 = (ecx & (1u << 19)) != 0;
#endif

#if HAVE_SSE4_1
  if (have_SSE4_1) {
    // weighted / unweighted prediction output
    accel->put_unweighted_pred_8   = ff_hevc_put_unweighted_pred_8_sse;
    accel->put_weighted_pred_avg_8 = ff_hevc_put_weighted_pred_avg_8_sse;

    // epel interpolation
    accel->put_hevc_epel_8    = ff_hevc_put_hevc_epel_pixels_8_sse;
    accel->put_hevc_epel_h_8  = ff_hevc_put_hevc_epel_h_8_sse;
    accel->put_hevc_epel_v_8  = ff_hevc_put_hevc_epel_v_8_sse;
    accel->put_hevc_epel_hv_8 = ff_hevc_put_hevc_epel_hv_8_sse;

    // qpel interpolation: first index selects the h_<n> variant, second index
    // the v_<n> variant; index 0 means no filtering in that direction.
    accel->put_hevc_qpel_8[0][0] = ff_hevc_put_hevc_qpel_pixels_8_sse;
    accel->put_hevc_qpel_8[0][1] = ff_hevc_put_hevc_qpel_v_1_8_sse;
    accel->put_hevc_qpel_8[0][2] = ff_hevc_put_hevc_qpel_v_2_8_sse;
    accel->put_hevc_qpel_8[0][3] = ff_hevc_put_hevc_qpel_v_3_8_sse;
    accel->put_hevc_qpel_8[1][0] = ff_hevc_put_hevc_qpel_h_1_8_sse;
    accel->put_hevc_qpel_8[1][1] = ff_hevc_put_hevc_qpel_h_1_v_1_sse;
    accel->put_hevc_qpel_8[1][2] = ff_hevc_put_hevc_qpel_h_1_v_2_sse;
    accel->put_hevc_qpel_8[1][3] = ff_hevc_put_hevc_qpel_h_1_v_3_sse;
    accel->put_hevc_qpel_8[2][0] = ff_hevc_put_hevc_qpel_h_2_8_sse;
    accel->put_hevc_qpel_8[2][1] = ff_hevc_put_hevc_qpel_h_2_v_1_sse;
    accel->put_hevc_qpel_8[2][2] = ff_hevc_put_hevc_qpel_h_2_v_2_sse;
    accel->put_hevc_qpel_8[2][3] = ff_hevc_put_hevc_qpel_h_2_v_3_sse;
    accel->put_hevc_qpel_8[3][0] = ff_hevc_put_hevc_qpel_h_3_8_sse;
    accel->put_hevc_qpel_8[3][1] = ff_hevc_put_hevc_qpel_h_3_v_1_sse;
    accel->put_hevc_qpel_8[3][2] = ff_hevc_put_hevc_qpel_h_3_v_2_sse;
    accel->put_hevc_qpel_8[3][3] = ff_hevc_put_hevc_qpel_h_3_v_3_sse;

    accel->transform_skip_8 = ff_hevc_transform_skip_8_sse;

    // transform_4x4_luma_add_8 and transform_4x4_add_8 are intentionally not
    // replaced: for these two functions, the scalar fallback seems to be
    // faster than the SSE code (ff_hevc_transform_4x4_luma_add_8_sse4 /
    // ff_hevc_transform_4x4_add_8_sse4).

    // transform_add_8[0] is likewise left alone; only the 8x8, 16x16 and
    // 32x32 entries are replaced.
    accel->transform_add_8[1] = ff_hevc_transform_8x8_add_8_sse4;
    accel->transform_add_8[2] = ff_hevc_transform_16x16_add_8_sse4;
    accel->transform_add_8[3] = ff_hevc_transform_32x32_add_8_sse4;
  }
#else
  // SSE4.1 code is not part of this build; nothing to install.
  (void)have_SSE4_1;
  (void)accel;
#endif
}


Line	Count	Source
1		/*
2		* H.265 video codec.
3		* Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
4		*
5		* This file is part of libde265.
6		*
7		* libde265 is free software: you can redistribute it and/or modify
8		* it under the terms of the GNU Lesser General Public License as
9		* published by the Free Software Foundation, either version 3 of
10		* the License, or (at your option) any later version.
11		*
12		* libde265 is distributed in the hope that it will be useful,
13		* but WITHOUT ANY WARRANTY; without even the implied warranty of
14		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15		* GNU Lesser General Public License for more details.
16		*
17		* You should have received a copy of the GNU Lesser General Public License
18		* along with libde265. If not, see <http://www.gnu.org/licenses/>.
19		*/
20
21		#ifdef _MSC_VER
22		#include <intrin.h>
23		#endif
24
25		#include "x86/sse.h"
26		#include "x86/sse-motion.h"
27		#include "x86/sse-dct.h"
28
29		#ifdef HAVE_CONFIG_H
30		#include "config.h"
31		#endif
32
33		#if defined(__GNUC__) && !defined(__EMSCRIPTEN__)
34		#include <cpuid.h>
35		#endif
36
37		void init_acceleration_functions_sse(struct acceleration_functions* accel)
38	4.47k	{
39	4.47k	uint32_t ecx=0,edx=0;
40
41		#ifdef _MSC_VER
42		uint32_t regs[4];
43		int a = 1;
44
45		__cpuid((int *)regs, (int)a);
46
47		ecx = regs[2];
48		edx = regs[3];
49		#elif !defined(__EMSCRIPTEN__)
50		uint32_t eax,ebx;
51	4.47k	__get_cpuid(1, &eax,&ebx,&ecx,&edx);
52	4.47k	#endif
53
54		#ifdef __EMSCRIPTEN__
55		int have_SSE = 0;
56		int have_SSE4_1 = 0;
57		#ifdef __SSE__
58		have_SSE = 1;
59		#endif
60		#ifdef __SSE4_1__
61		have_SSE4_1 = 1;
62		#endif
63		#else
64		// printf("CPUID EAX=1 -> ECX=%x EDX=%x\n", regs[2], regs[3]);
65
66		//int have_MMX = !!(edx & (1<<23));
67	4.47k	int have_SSE = !!(edx & (1<<25));
68	4.47k	int have_SSE4_1 = !!(ecx & (1<<19));
69
70		// printf("MMX:%d SSE:%d SSE4_1:%d\n",have_MMX,have_SSE,have_SSE4_1);
71
72	4.47k	if (have_SSE) {
73	4.47k	}
74	4.47k	#endif
75
76	4.47k	#if HAVE_SSE4_1
77	4.47k	if (have_SSE4_1) {
78	4.47k	accel->put_unweighted_pred_8 = ff_hevc_put_unweighted_pred_8_sse;
79	4.47k	accel->put_weighted_pred_avg_8 = ff_hevc_put_weighted_pred_avg_8_sse;
80
81	4.47k	accel->put_hevc_epel_8 = ff_hevc_put_hevc_epel_pixels_8_sse;
82	4.47k	accel->put_hevc_epel_h_8 = ff_hevc_put_hevc_epel_h_8_sse;
83	4.47k	accel->put_hevc_epel_v_8 = ff_hevc_put_hevc_epel_v_8_sse;
84	4.47k	accel->put_hevc_epel_hv_8 = ff_hevc_put_hevc_epel_hv_8_sse;
85
86	4.47k	accel->put_hevc_qpel_8[0][0] = ff_hevc_put_hevc_qpel_pixels_8_sse;
87	4.47k	accel->put_hevc_qpel_8[0][1] = ff_hevc_put_hevc_qpel_v_1_8_sse;
88	4.47k	accel->put_hevc_qpel_8[0][2] = ff_hevc_put_hevc_qpel_v_2_8_sse;
89	4.47k	accel->put_hevc_qpel_8[0][3] = ff_hevc_put_hevc_qpel_v_3_8_sse;
90	4.47k	accel->put_hevc_qpel_8[1][0] = ff_hevc_put_hevc_qpel_h_1_8_sse;
91	4.47k	accel->put_hevc_qpel_8[1][1] = ff_hevc_put_hevc_qpel_h_1_v_1_sse;
92	4.47k	accel->put_hevc_qpel_8[1][2] = ff_hevc_put_hevc_qpel_h_1_v_2_sse;
93	4.47k	accel->put_hevc_qpel_8[1][3] = ff_hevc_put_hevc_qpel_h_1_v_3_sse;
94	4.47k	accel->put_hevc_qpel_8[2][0] = ff_hevc_put_hevc_qpel_h_2_8_sse;
95	4.47k	accel->put_hevc_qpel_8[2][1] = ff_hevc_put_hevc_qpel_h_2_v_1_sse;
96	4.47k	accel->put_hevc_qpel_8[2][2] = ff_hevc_put_hevc_qpel_h_2_v_2_sse;
97	4.47k	accel->put_hevc_qpel_8[2][3] = ff_hevc_put_hevc_qpel_h_2_v_3_sse;
98	4.47k	accel->put_hevc_qpel_8[3][0] = ff_hevc_put_hevc_qpel_h_3_8_sse;
99	4.47k	accel->put_hevc_qpel_8[3][1] = ff_hevc_put_hevc_qpel_h_3_v_1_sse;
100	4.47k	accel->put_hevc_qpel_8[3][2] = ff_hevc_put_hevc_qpel_h_3_v_2_sse;
101	4.47k	accel->put_hevc_qpel_8[3][3] = ff_hevc_put_hevc_qpel_h_3_v_3_sse;
102
103	4.47k	accel->transform_skip_8 = ff_hevc_transform_skip_8_sse;
104
105		// actually, for these two functions, the scalar fallback seems to be faster than the SSE code
106		//accel->transform_4x4_luma_add_8 = ff_hevc_transform_4x4_luma_add_8_sse4; // SSE-4 only TODO
107		//accel->transform_4x4_add_8 = ff_hevc_transform_4x4_add_8_sse4;
108
109	4.47k	accel->transform_add_8[1] = ff_hevc_transform_8x8_add_8_sse4;
110	4.47k	accel->transform_add_8[2] = ff_hevc_transform_16x16_add_8_sse4;
111	4.47k	accel->transform_add_8[3] = ff_hevc_transform_32x32_add_8_sse4;
112	4.47k	}
113	4.47k	#endif
114	4.47k	}
115

Coverage Report

Created: 2025-11-14 07:32