Coverage Report

Created: 2025-06-13 07:07

/src/aom/aom_dsp/x86/synonyms_avx2.h
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#ifndef AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
13
#define AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
14
15
#include <immintrin.h>
16
17
#include "config/aom_config.h"
18
19
#include "aom/aom_integer.h"
20
21
/**
22
 * Various reusable shorthands for x86 SIMD intrinsics.
23
 *
24
 * Intrinsics prefixed with xx_ operate on or return 128-bit XMM registers.
25
 * Intrinsics prefixed with yy_ operate on or return 256-bit YMM registers.
26
 */
27
28
// Loads and stores to do away with the tedium of casting the address
29
// to the right type.
30
422M
static inline __m256i yy_load_256(const void *a) {
31
422M
  return _mm256_load_si256((const __m256i *)a);
32
422M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:yy_load_256
Unexecuted instantiation: blend_a64_mask_avx2.c:yy_load_256
Unexecuted instantiation: jnt_convolve_avx2.c:yy_load_256
Unexecuted instantiation: reconinter_avx2.c:yy_load_256
selfguided_avx2.c:yy_load_256
Line
Count
Source
30
422M
static inline __m256i yy_load_256(const void *a) {
31
422M
  return _mm256_load_si256((const __m256i *)a);
32
422M
}
Unexecuted instantiation: wiener_convolve_avx2.c:yy_load_256
Unexecuted instantiation: highbd_wiener_convolve_avx2.c:yy_load_256
33
34
3.00G
static inline __m256i yy_loadu_256(const void *a) {
35
3.00G
  return _mm256_loadu_si256((const __m256i *)a);
36
3.00G
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:yy_loadu_256
blend_a64_mask_avx2.c:yy_loadu_256
Line
Count
Source
34
22.5M
static inline __m256i yy_loadu_256(const void *a) {
35
22.5M
  return _mm256_loadu_si256((const __m256i *)a);
36
22.5M
}
Unexecuted instantiation: jnt_convolve_avx2.c:yy_loadu_256
reconinter_avx2.c:yy_loadu_256
Line
Count
Source
34
10.7M
static inline __m256i yy_loadu_256(const void *a) {
35
10.7M
  return _mm256_loadu_si256((const __m256i *)a);
36
10.7M
}
selfguided_avx2.c:yy_loadu_256
Line
Count
Source
34
2.69G
static inline __m256i yy_loadu_256(const void *a) {
35
2.69G
  return _mm256_loadu_si256((const __m256i *)a);
36
2.69G
}
Unexecuted instantiation: wiener_convolve_avx2.c:yy_loadu_256
highbd_wiener_convolve_avx2.c:yy_loadu_256
Line
Count
Source
34
274M
static inline __m256i yy_loadu_256(const void *a) {
35
274M
  return _mm256_loadu_si256((const __m256i *)a);
36
274M
}
37
38
404M
static inline void yy_store_256(void *const a, const __m256i v) {
39
404M
  _mm256_store_si256((__m256i *)a, v);
40
404M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:yy_store_256
Unexecuted instantiation: blend_a64_mask_avx2.c:yy_store_256
Unexecuted instantiation: jnt_convolve_avx2.c:yy_store_256
Unexecuted instantiation: reconinter_avx2.c:yy_store_256
selfguided_avx2.c:yy_store_256
Line
Count
Source
38
404M
static inline void yy_store_256(void *const a, const __m256i v) {
39
404M
  _mm256_store_si256((__m256i *)a, v);
40
404M
}
Unexecuted instantiation: wiener_convolve_avx2.c:yy_store_256
Unexecuted instantiation: highbd_wiener_convolve_avx2.c:yy_store_256
41
42
456M
static inline void yy_storeu_256(void *const a, const __m256i v) {
43
456M
  _mm256_storeu_si256((__m256i *)a, v);
44
456M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:yy_storeu_256
blend_a64_mask_avx2.c:yy_storeu_256
Line
Count
Source
42
5.84M
static inline void yy_storeu_256(void *const a, const __m256i v) {
43
5.84M
  _mm256_storeu_si256((__m256i *)a, v);
44
5.84M
}
Unexecuted instantiation: jnt_convolve_avx2.c:yy_storeu_256
reconinter_avx2.c:yy_storeu_256
Line
Count
Source
42
2.77M
static inline void yy_storeu_256(void *const a, const __m256i v) {
43
2.77M
  _mm256_storeu_si256((__m256i *)a, v);
44
2.77M
}
selfguided_avx2.c:yy_storeu_256
Line
Count
Source
42
411M
static inline void yy_storeu_256(void *const a, const __m256i v) {
43
411M
  _mm256_storeu_si256((__m256i *)a, v);
44
411M
}
Unexecuted instantiation: wiener_convolve_avx2.c:yy_storeu_256
highbd_wiener_convolve_avx2.c:yy_storeu_256
Line
Count
Source
42
36.3M
static inline void yy_storeu_256(void *const a, const __m256i v) {
43
36.3M
  _mm256_storeu_si256((__m256i *)a, v);
44
36.3M
}
45
46
// Fill an AVX register using an interleaved pair of values, i.e. set the
47
// 16 channels to {a, b} repeated 8 times, using the same channel ordering
48
// as when a register is stored to / loaded from memory.
49
//
50
// This is useful for rearranging filter kernels for use with the
51
// _mm256_madd_epi16 instruction.
52
0
static inline __m256i yy_set2_epi16(int16_t a, int16_t b) {
53
0
  return _mm256_setr_epi16(a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b);
54
0
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:yy_set2_epi16
Unexecuted instantiation: blend_a64_mask_avx2.c:yy_set2_epi16
Unexecuted instantiation: jnt_convolve_avx2.c:yy_set2_epi16
Unexecuted instantiation: reconinter_avx2.c:yy_set2_epi16
Unexecuted instantiation: selfguided_avx2.c:yy_set2_epi16
Unexecuted instantiation: wiener_convolve_avx2.c:yy_set2_epi16
Unexecuted instantiation: highbd_wiener_convolve_avx2.c:yy_set2_epi16
55
56
// Some compilers don't have _mm256_set_m128i defined in immintrin.h. We
57
// therefore define an equivalent function using a different intrinsic.
58
// ([ hi ], [ lo ]) -> [ hi ][ lo ]
59
4.13M
static inline __m256i yy_set_m128i(__m128i hi, __m128i lo) {
60
4.13M
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
61
4.13M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:yy_set_m128i
blend_a64_mask_avx2.c:yy_set_m128i
Line
Count
Source
59
1.91M
static inline __m256i yy_set_m128i(__m128i hi, __m128i lo) {
60
1.91M
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
61
1.91M
}
Unexecuted instantiation: jnt_convolve_avx2.c:yy_set_m128i
reconinter_avx2.c:yy_set_m128i
Line
Count
Source
59
289k
static inline __m256i yy_set_m128i(__m128i hi, __m128i lo) {
60
289k
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
61
289k
}
Unexecuted instantiation: selfguided_avx2.c:yy_set_m128i
Unexecuted instantiation: wiener_convolve_avx2.c:yy_set_m128i
highbd_wiener_convolve_avx2.c:yy_set_m128i
Line
Count
Source
59
1.92M
static inline __m256i yy_set_m128i(__m128i hi, __m128i lo) {
60
1.92M
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
61
1.92M
}
62
63
// This behaves similarly to _mm256_set_epi64x(), but avoids undefined
64
// behavior sanitizer warnings when loading values from unaligned buffers using
65
// `*(int64_t *)val`.
66
static inline __m256i yy_loadu_4x64(const void *e3, const void *e2,
67
228k
                                    const void *e1, const void *e0) {
68
228k
  __m128d v0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i *)e0));
69
228k
  __m128d v01 = _mm_loadh_pd(v0, (const double *)e1);
70
228k
  __m128d v2 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i *)e2));
71
228k
  __m128d v23 = _mm_loadh_pd(v2, (const double *)e3);
72
  // Note this can be replaced with
73
  // `_mm256_castpd_si256(_mm256_set_m128d(v23, v01))` if immintrin.h contains
74
  // _mm256_set_m128d() with all supported compilers. This version is used to
75
  // match the behavior with yy_set_m128i().
76
228k
  return yy_set_m128i(_mm_castpd_si128(v23), _mm_castpd_si128(v01));
77
228k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:yy_loadu_4x64
blend_a64_mask_avx2.c:yy_loadu_4x64
Line
Count
Source
67
228k
                                    const void *e1, const void *e0) {
68
228k
  __m128d v0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i *)e0));
69
228k
  __m128d v01 = _mm_loadh_pd(v0, (const double *)e1);
70
228k
  __m128d v2 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i *)e2));
71
228k
  __m128d v23 = _mm_loadh_pd(v2, (const double *)e3);
72
  // Note this can be replaced with
73
  // `_mm256_castpd_si256(_mm256_set_m128d(v23, v01))` if immintrin.h contains
74
  // _mm256_set_m128d() with all supported compilers. This version is used to
75
  // match the behavior with yy_set_m128i().
76
228k
  return yy_set_m128i(_mm_castpd_si128(v23), _mm_castpd_si128(v01));
77
228k
}
Unexecuted instantiation: jnt_convolve_avx2.c:yy_loadu_4x64
Unexecuted instantiation: reconinter_avx2.c:yy_loadu_4x64
Unexecuted instantiation: selfguided_avx2.c:yy_loadu_4x64
Unexecuted instantiation: wiener_convolve_avx2.c:yy_loadu_4x64
Unexecuted instantiation: highbd_wiener_convolve_avx2.c:yy_loadu_4x64
78
79
1.97M
static inline __m256i yy_loadu2_128(const void *hi, const void *lo) {
80
1.97M
  __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
81
1.97M
  __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
82
1.97M
  return yy_set_m128i(mhi, mlo);
83
1.97M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:yy_loadu2_128
blend_a64_mask_avx2.c:yy_loadu2_128
Line
Count
Source
79
1.68M
static inline __m256i yy_loadu2_128(const void *hi, const void *lo) {
80
1.68M
  __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
81
1.68M
  __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
82
1.68M
  return yy_set_m128i(mhi, mlo);
83
1.68M
}
Unexecuted instantiation: jnt_convolve_avx2.c:yy_loadu2_128
reconinter_avx2.c:yy_loadu2_128
Line
Count
Source
79
289k
static inline __m256i yy_loadu2_128(const void *hi, const void *lo) {
80
289k
  __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
81
289k
  __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
82
289k
  return yy_set_m128i(mhi, mlo);
83
289k
}
Unexecuted instantiation: selfguided_avx2.c:yy_loadu2_128
Unexecuted instantiation: wiener_convolve_avx2.c:yy_loadu2_128
Unexecuted instantiation: highbd_wiener_convolve_avx2.c:yy_loadu2_128
84
85
538k
static inline void yy_storeu2_128(void *hi, void *lo, const __m256i a) {
86
538k
  _mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1));
87
538k
  _mm_storeu_si128((__m128i *)lo, _mm256_castsi256_si128(a));
88
538k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:yy_storeu2_128
blend_a64_mask_avx2.c:yy_storeu2_128
Line
Count
Source
85
538k
static inline void yy_storeu2_128(void *hi, void *lo, const __m256i a) {
86
538k
  _mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1));
87
538k
  _mm_storeu_si128((__m128i *)lo, _mm256_castsi256_si128(a));
88
538k
}
Unexecuted instantiation: jnt_convolve_avx2.c:yy_storeu2_128
Unexecuted instantiation: reconinter_avx2.c:yy_storeu2_128
Unexecuted instantiation: selfguided_avx2.c:yy_storeu2_128
Unexecuted instantiation: wiener_convolve_avx2.c:yy_storeu2_128
Unexecuted instantiation: highbd_wiener_convolve_avx2.c:yy_storeu2_128
89
90
775k
static inline __m256i yy_roundn_epu16(__m256i v_val_w, int bits) {
91
775k
  const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1);
92
775k
  return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256());
93
775k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:yy_roundn_epu16
blend_a64_mask_avx2.c:yy_roundn_epu16
Line
Count
Source
90
775k
static inline __m256i yy_roundn_epu16(__m256i v_val_w, int bits) {
91
775k
  const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1);
92
775k
  return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256());
93
775k
}
Unexecuted instantiation: jnt_convolve_avx2.c:yy_roundn_epu16
Unexecuted instantiation: reconinter_avx2.c:yy_roundn_epu16
Unexecuted instantiation: selfguided_avx2.c:yy_roundn_epu16
Unexecuted instantiation: wiener_convolve_avx2.c:yy_roundn_epu16
Unexecuted instantiation: highbd_wiener_convolve_avx2.c:yy_roundn_epu16
94
#endif  // AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_