/work/libde265/libde265/x86/sse.cc

Source
/*
 * H.265 video codec.
 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
 *
 * This file is part of libde265.
 *
 * libde265 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * libde265 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifdef _MSC_VER
#include <intrin.h>
#endif

#include "x86/sse.h"
#include "x86/sse-motion.h"
#include "x86/sse-dct.h"
#include "x86/sse-intrapred.h"
#include "x86/sse-deblk.h"

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#if HAVE_AVX2
#include "x86/transform-avx2.h"
#endif
#if HAVE_AVX512
#include "x86/transform-avx512.h"
#endif

#if defined(__GNUC__) && !defined(__EMSCRIPTEN__)
#include <cpuid.h>
#endif

/*
 * Install the SSE / SSE4.1 routines into the acceleration function table.
 *
 * accel: table whose entries are overwritten. Entries are only modified when
 *        the library was built with HAVE_SSE4_1 and SSE4.1 is available;
 *        otherwise the table is left exactly as the caller passed it in.
 *
 * Availability is taken from CPUID leaf 1 (ECX bit 19) at run time, except
 * under Emscripten, where no CPUID query is made and the compile-time
 * __SSE4_1__ macro decides instead.
 */
void init_acceleration_functions_sse(struct acceleration_functions* accel)
{
  int have_SSE4_1 = 0;

#ifdef __EMSCRIPTEN__
  // No CPUID query here: the decision is made from the compiler's target flags.
#ifdef __SSE4_1__
  have_SSE4_1 = 1;
#endif
#else
  // CPUID leaf 1: SSE4.1 is reported in bit 19 of ECX.
  const uint32_t sse4_1_mask = (uint32_t)1 << 19;
  uint32_t ecx = 0;

#ifdef _MSC_VER
  int regs[4] = { 0, 0, 0, 0 };   // EAX, EBX, ECX, EDX (in this order)
  __cpuid(regs, 1);
  ecx = (uint32_t)regs[2];
#else
  uint32_t eax = 0, ebx = 0, edx = 0;
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
    // Leaf 1 could not be queried: treat every feature as absent.
    ecx = 0;
  }
#endif

  have_SSE4_1 = (ecx & sse4_1_mask) != 0;
#endif

#if HAVE_SSE4_1
  if (!have_SSE4_1) {
    return;
  }

  accel->put_unweighted_pred_8   = ff_hevc_put_unweighted_pred_8_sse;
  accel->put_weighted_pred_avg_8 = ff_hevc_put_weighted_pred_avg_8_sse;

  accel->put_hevc_epel_8    = ff_hevc_put_hevc_epel_pixels_8_sse;
  accel->put_hevc_epel_h_8  = ff_hevc_put_hevc_epel_h_8_sse;
  accel->put_hevc_epel_v_8  = ff_hevc_put_hevc_epel_v_8_sse;
  accel->put_hevc_epel_hv_8 = ff_hevc_put_hevc_epel_hv_8_sse;

  // qpel table: the first index matches the h_N suffix and the second index
  // the v_N suffix of the installed function; [0][0] is the plain pixel variant.
  accel->put_hevc_qpel_8[0][0] = ff_hevc_put_hevc_qpel_pixels_8_sse;
  accel->put_hevc_qpel_8[0][1] = ff_hevc_put_hevc_qpel_v_1_8_sse;
  accel->put_hevc_qpel_8[0][2] = ff_hevc_put_hevc_qpel_v_2_8_sse;
  accel->put_hevc_qpel_8[0][3] = ff_hevc_put_hevc_qpel_v_3_8_sse;
  accel->put_hevc_qpel_8[1][0] = ff_hevc_put_hevc_qpel_h_1_8_sse;
  accel->put_hevc_qpel_8[1][1] = ff_hevc_put_hevc_qpel_h_1_v_1_sse;
  accel->put_hevc_qpel_8[1][2] = ff_hevc_put_hevc_qpel_h_1_v_2_sse;
  accel->put_hevc_qpel_8[1][3] = ff_hevc_put_hevc_qpel_h_1_v_3_sse;
  accel->put_hevc_qpel_8[2][0] = ff_hevc_put_hevc_qpel_h_2_8_sse;
  accel->put_hevc_qpel_8[2][1] = ff_hevc_put_hevc_qpel_h_2_v_1_sse;
  accel->put_hevc_qpel_8[2][2] = ff_hevc_put_hevc_qpel_h_2_v_2_sse;
  accel->put_hevc_qpel_8[2][3] = ff_hevc_put_hevc_qpel_h_2_v_3_sse;
  accel->put_hevc_qpel_8[3][0] = ff_hevc_put_hevc_qpel_h_3_8_sse;
  accel->put_hevc_qpel_8[3][1] = ff_hevc_put_hevc_qpel_h_3_v_1_sse;
  accel->put_hevc_qpel_8[3][2] = ff_hevc_put_hevc_qpel_h_3_v_2_sse;
  accel->put_hevc_qpel_8[3][3] = ff_hevc_put_hevc_qpel_h_3_v_3_sse;

  accel->transform_skip_8 = ff_hevc_transform_skip_8_sse;

  // transform_4x4_luma_add_8 and transform_4x4_add_8 are deliberately left
  // untouched: for these two functions the scalar fallback seems to be faster
  // than the SSE code (ff_hevc_transform_4x4_luma_add_8_sse4, which is
  // SSE-4 only, and ff_hevc_transform_4x4_add_8_sse4). TODO: revisit.

  accel->transform_add_8[1] = ff_hevc_transform_8x8_add_8_sse4;
  accel->transform_add_8[2] = ff_hevc_transform_16x16_add_8_sse4;
  accel->transform_add_8[3] = ff_hevc_transform_32x32_add_8_sse4;

  accel->add_residual_8  = add_residual_8_sse4;
  accel->add_residual_16 = add_residual_16_sse4;
  accel->dequant_coeff_block = dequant_coeff_block_sse4;

  accel->intra_pred_dc_8      = intra_pred_dc_8_sse4;
  accel->intra_pred_planar_8  = intra_pred_planar_8_sse4;
  accel->intra_pred_angular_8 = intra_pred_angular_8_sse4;

  accel->deblock_luma_8   = deblock_luma_8_sse4;
  // chroma deblock stays on the scalar fallback: the filter is too cheap to
  // amortize the SIMD load/transpose/scatter overhead (SSE measured slower).
#else
  // Built without SSE4.1 support: nothing to install.
  (void)accel;
  (void)have_SSE4_1;
#endif
}


/*
 * Override the 16x16 and 32x32 transform_add_8 entries with their AVX2
 * versions when the running CPU reports AVX2 support.
 *
 * Does nothing unless built with HAVE_AVX2 on a GNU-compatible compiler
 * (and not for Emscripten).
 */
void init_acceleration_functions_avx2(struct acceleration_functions* accel)
{
#if HAVE_AVX2 && defined(__GNUC__) && !defined(__EMSCRIPTEN__)
  // Per the original author's note: __builtin_cpu_supports("avx2") performs the
  // OSXSAVE / XGETBV (YMM-enabled) checks internally, so it is safe to call on
  // any CPU. This translation unit is *not* compiled with -mavx2, so getting
  // here never executes an AVX2 instruction.
  __builtin_cpu_init();

  if (!__builtin_cpu_supports("avx2")) {
    return;
  }

  accel->transform_add_8[2] = transform_16x16_add_8_avx2;
  accel->transform_add_8[3] = transform_32x32_add_8_avx2;

  // NB: dequant intentionally stays on the SSE version. An AVX2 variant was
  // implemented and benchmarked, but inverse quantization is scatter-bound, so
  // the wider arithmetic gave no benefit and AVX2 actually measured slightly
  // slower than SSE. (AVX-512 would be no better, for the same reason.)
#endif
}


/*
 * Override the 32x32 transform_add_8 entry with its AVX-512 version when the
 * running CPU reports both AVX-512F and AVX-512BW.
 *
 * Does nothing unless built with HAVE_AVX512 on a GNU-compatible compiler
 * (and not for Emscripten).
 */
void init_acceleration_functions_avx512(struct acceleration_functions* accel)
{
#if HAVE_AVX512 && defined(__GNUC__) && !defined(__EMSCRIPTEN__)
  __builtin_cpu_init();

  // Both AVX-512F and AVX-512BW (16-bit ops) are required. Per the original
  // author's note, __builtin_cpu_supports also covers the OS (XCR0/ZMM-enabled)
  // check, and this translation unit is not compiled with -mavx512*.
  if (!__builtin_cpu_supports("avx512f") || !__builtin_cpu_supports("avx512bw")) {
    return;
  }

  accel->transform_add_8[3] = transform_32x32_add_8_avx512;
#endif
}


Line	Count	Source
1		/*
2		* H.265 video codec.
3		* Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
4		*
5		* This file is part of libde265.
6		*
7		* libde265 is free software: you can redistribute it and/or modify
8		* it under the terms of the GNU Lesser General Public License as
9		* published by the Free Software Foundation, either version 3 of
10		* the License, or (at your option) any later version.
11		*
12		* libde265 is distributed in the hope that it will be useful,
13		* but WITHOUT ANY WARRANTY; without even the implied warranty of
14		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15		* GNU Lesser General Public License for more details.
16		*
17		* You should have received a copy of the GNU Lesser General Public License
18		* along with libde265. If not, see <http://www.gnu.org/licenses/>.
19		*/
20
21		#ifdef _MSC_VER
22		#include <intrin.h>
23		#endif
24
25		#include "x86/sse.h"
26		#include "x86/sse-motion.h"
27		#include "x86/sse-dct.h"
28		#include "x86/sse-intrapred.h"
29		#include "x86/sse-deblk.h"
30
31		#ifdef HAVE_CONFIG_H
32		#include "config.h"
33		#endif
34
35		#if HAVE_AVX2
36		#include "x86/transform-avx2.h"
37		#endif
38		#if HAVE_AVX512
39		#include "x86/transform-avx512.h"
40		#endif
41
42		#if defined(__GNUC__) && !defined(__EMSCRIPTEN__)
43		#include <cpuid.h>
44		#endif
45
46		void init_acceleration_functions_sse(struct acceleration_functions* accel)
47	2.05k	{
48	2.05k	uint32_t ecx=0,edx=0;
49
50		#ifdef _MSC_VER
51		uint32_t regs[4];
52		int a = 1;
53
54		__cpuid((int *)regs, (int)a);
55
56		ecx = regs[2];
57		edx = regs[3];
58		#elif !defined(__EMSCRIPTEN__)
59		uint32_t eax,ebx;
60	2.05k	__get_cpuid(1, &eax,&ebx,&ecx,&edx);
61	2.05k	#endif
62
63		#ifdef __EMSCRIPTEN__
64		int have_SSE = 0;
65		int have_SSE4_1 = 0;
66		#ifdef __SSE__
67		have_SSE = 1;
68		#endif
69		#ifdef __SSE4_1__
70		have_SSE4_1 = 1;
71		#endif
72		#else
73		// printf("CPUID EAX=1 -> ECX=%x EDX=%x\n", regs[2], regs[3]);
74
75		//int have_MMX = !!(edx & (1<<23));
76	2.05k	int have_SSE = !!(edx & (1<<25));
77	2.05k	int have_SSE4_1 = !!(ecx & (1<<19));
78
79		// printf("MMX:%d SSE:%d SSE4_1:%d\n",have_MMX,have_SSE,have_SSE4_1);
80
81	2.05k	if (have_SSE) {
82	2.05k	}
83	2.05k	#endif
84
85	2.05k	#if HAVE_SSE4_1
86	2.05k	if (have_SSE4_1) {
87	2.05k	accel->put_unweighted_pred_8 = ff_hevc_put_unweighted_pred_8_sse;
88	2.05k	accel->put_weighted_pred_avg_8 = ff_hevc_put_weighted_pred_avg_8_sse;
89
90	2.05k	accel->put_hevc_epel_8 = ff_hevc_put_hevc_epel_pixels_8_sse;
91	2.05k	accel->put_hevc_epel_h_8 = ff_hevc_put_hevc_epel_h_8_sse;
92	2.05k	accel->put_hevc_epel_v_8 = ff_hevc_put_hevc_epel_v_8_sse;
93	2.05k	accel->put_hevc_epel_hv_8 = ff_hevc_put_hevc_epel_hv_8_sse;
94
95	2.05k	accel->put_hevc_qpel_8[0][0] = ff_hevc_put_hevc_qpel_pixels_8_sse;
96	2.05k	accel->put_hevc_qpel_8[0][1] = ff_hevc_put_hevc_qpel_v_1_8_sse;
97	2.05k	accel->put_hevc_qpel_8[0][2] = ff_hevc_put_hevc_qpel_v_2_8_sse;
98	2.05k	accel->put_hevc_qpel_8[0][3] = ff_hevc_put_hevc_qpel_v_3_8_sse;
99	2.05k	accel->put_hevc_qpel_8[1][0] = ff_hevc_put_hevc_qpel_h_1_8_sse;
100	2.05k	accel->put_hevc_qpel_8[1][1] = ff_hevc_put_hevc_qpel_h_1_v_1_sse;
101	2.05k	accel->put_hevc_qpel_8[1][2] = ff_hevc_put_hevc_qpel_h_1_v_2_sse;
102	2.05k	accel->put_hevc_qpel_8[1][3] = ff_hevc_put_hevc_qpel_h_1_v_3_sse;
103	2.05k	accel->put_hevc_qpel_8[2][0] = ff_hevc_put_hevc_qpel_h_2_8_sse;
104	2.05k	accel->put_hevc_qpel_8[2][1] = ff_hevc_put_hevc_qpel_h_2_v_1_sse;
105	2.05k	accel->put_hevc_qpel_8[2][2] = ff_hevc_put_hevc_qpel_h_2_v_2_sse;
106	2.05k	accel->put_hevc_qpel_8[2][3] = ff_hevc_put_hevc_qpel_h_2_v_3_sse;
107	2.05k	accel->put_hevc_qpel_8[3][0] = ff_hevc_put_hevc_qpel_h_3_8_sse;
108	2.05k	accel->put_hevc_qpel_8[3][1] = ff_hevc_put_hevc_qpel_h_3_v_1_sse;
109	2.05k	accel->put_hevc_qpel_8[3][2] = ff_hevc_put_hevc_qpel_h_3_v_2_sse;
110	2.05k	accel->put_hevc_qpel_8[3][3] = ff_hevc_put_hevc_qpel_h_3_v_3_sse;
111
112	2.05k	accel->transform_skip_8 = ff_hevc_transform_skip_8_sse;
113
114		// actually, for these two functions, the scalar fallback seems to be faster than the SSE code
115		//accel->transform_4x4_luma_add_8 = ff_hevc_transform_4x4_luma_add_8_sse4; // SSE-4 only TODO
116		//accel->transform_4x4_add_8 = ff_hevc_transform_4x4_add_8_sse4;
117
118	2.05k	accel->transform_add_8[1] = ff_hevc_transform_8x8_add_8_sse4;
119	2.05k	accel->transform_add_8[2] = ff_hevc_transform_16x16_add_8_sse4;
120	2.05k	accel->transform_add_8[3] = ff_hevc_transform_32x32_add_8_sse4;
121
122	2.05k	accel->add_residual_8 = add_residual_8_sse4;
123	2.05k	accel->add_residual_16 = add_residual_16_sse4;
124	2.05k	accel->dequant_coeff_block = dequant_coeff_block_sse4;
125
126	2.05k	accel->intra_pred_dc_8 = intra_pred_dc_8_sse4;
127	2.05k	accel->intra_pred_planar_8 = intra_pred_planar_8_sse4;
128	2.05k	accel->intra_pred_angular_8 = intra_pred_angular_8_sse4;
129
130	2.05k	accel->deblock_luma_8 = deblock_luma_8_sse4;
131		// chroma deblock stays on the scalar fallback: the filter is too cheap to
132		// amortize the SIMD load/transpose/scatter overhead (SSE measured slower).
133	2.05k	}
134	2.05k	#endif
135	2.05k	}
136
137
138		void init_acceleration_functions_avx2(struct acceleration_functions* accel)
139	2.05k	{
140	2.05k	#if HAVE_AVX2
141		// __builtin_cpu_supports("avx2") handles the OSXSAVE / XGETBV (YMM-enabled)
142		// checks internally, so this is safe to call on any CPU. This TU is not
143		// compiled with -mavx2, so reaching here never executes an AVX2 instruction.
144	2.05k	#if defined(__GNUC__) && !defined(__EMSCRIPTEN__)
145	2.05k	__builtin_cpu_init();
146	2.05k	if (__builtin_cpu_supports("avx2")) {
147	2.05k	accel->transform_add_8[2] = transform_16x16_add_8_avx2;
148	2.05k	accel->transform_add_8[3] = transform_32x32_add_8_avx2;
149		// NB: dequant intentionally stays on the SSE version. An AVX2 variant was
150		// implemented and benchmarked, but inverse quantization is scatter-bound, so
151		// the wider arithmetic gave no benefit and AVX2 measured actually slightly
152		// slower than SSE. (AVX-512 would be no better, for the same reason.)
153	2.05k	}
154	2.05k	#endif
155	2.05k	#endif
156	2.05k	}
157
158
159		void init_acceleration_functions_avx512(struct acceleration_functions* accel)
160	2.05k	{
161	2.05k	#if HAVE_AVX512
162	2.05k	#if defined(__GNUC__) && !defined(__EMSCRIPTEN__)
163	2.05k	__builtin_cpu_init();
164		// need AVX-512F + AVX-512BW (16-bit ops). __builtin_cpu_supports handles the
165		// OS (XCR0/ZMM-enabled) check. This TU is not compiled with -mavx512*.
166	2.05k	if (__builtin_cpu_supports("avx512f") && __builtin_cpu_supports("avx512bw")) {
167	0	accel->transform_add_8[3] = transform_32x32_add_8_avx512;
168	0	}
169	2.05k	#endif
170	2.05k	#endif
171	2.05k	}
172

Coverage Report

Created: 2026-06-15 06:25