Coverage Report

Created: 2026-02-14 07:00

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aom/aom_dsp/x86/synonyms_avx2.h
Line
Count
Source
1
/*
2
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#ifndef AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
13
#define AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
14
15
#include <immintrin.h>
16
17
#include "config/aom_config.h"
18
19
#include "aom/aom_integer.h"
20
21
/**
22
 * Various reusable shorthands for x86 SIMD intrinsics.
23
 *
24
 * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers.
25
 * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers.
26
 */
27
28
// Loads and stores to do away with the tedium of casting the address
29
// to the right type.
30
436M
static inline __m256i yy_load_256(const void *a) {
31
436M
  return _mm256_load_si256((const __m256i *)a);
32
436M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:yy_load_256
Unexecuted instantiation: blend_a64_mask_avx2.c:yy_load_256
Unexecuted instantiation: jnt_convolve_avx2.c:yy_load_256
Unexecuted instantiation: reconinter_avx2.c:yy_load_256
selfguided_avx2.c:yy_load_256
Line
Count
Source
30
436M
static inline __m256i yy_load_256(const void *a) {
31
436M
  return _mm256_load_si256((const __m256i *)a);
32
436M
}
Unexecuted instantiation: wiener_convolve_avx2.c:yy_load_256
Unexecuted instantiation: highbd_wiener_convolve_avx2.c:yy_load_256
33
34
2.55G
static inline __m256i yy_loadu_256(const void *a) {
35
2.55G
  return _mm256_loadu_si256((const __m256i *)a);
36
2.55G
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:yy_loadu_256
blend_a64_mask_avx2.c:yy_loadu_256
Line
Count
Source
34
16.4M
static inline __m256i yy_loadu_256(const void *a) {
35
16.4M
  return _mm256_loadu_si256((const __m256i *)a);
36
16.4M
}
Unexecuted instantiation: jnt_convolve_avx2.c:yy_loadu_256
reconinter_avx2.c:yy_loadu_256
Line
Count
Source
34
7.78M
static inline __m256i yy_loadu_256(const void *a) {
35
7.78M
  return _mm256_loadu_si256((const __m256i *)a);
36
7.78M
}
selfguided_avx2.c:yy_loadu_256
Line
Count
Source
34
2.29G
static inline __m256i yy_loadu_256(const void *a) {
35
2.29G
  return _mm256_loadu_si256((const __m256i *)a);
36
2.29G
}
Unexecuted instantiation: wiener_convolve_avx2.c:yy_loadu_256
highbd_wiener_convolve_avx2.c:yy_loadu_256
Line
Count
Source
34
240M
static inline __m256i yy_loadu_256(const void *a) {
35
240M
  return _mm256_loadu_si256((const __m256i *)a);
36
240M
}
37
38
429M
static inline void yy_store_256(void *const a, const __m256i v) {
39
429M
  _mm256_store_si256((__m256i *)a, v);
40
429M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:yy_store_256
Unexecuted instantiation: blend_a64_mask_avx2.c:yy_store_256
Unexecuted instantiation: jnt_convolve_avx2.c:yy_store_256
Unexecuted instantiation: reconinter_avx2.c:yy_store_256
selfguided_avx2.c:yy_store_256
Line
Count
Source
38
429M
static inline void yy_store_256(void *const a, const __m256i v) {
39
429M
  _mm256_store_si256((__m256i *)a, v);
40
429M
}
Unexecuted instantiation: wiener_convolve_avx2.c:yy_store_256
Unexecuted instantiation: highbd_wiener_convolve_avx2.c:yy_store_256
41
42
372M
static inline void yy_storeu_256(void *const a, const __m256i v) {
43
372M
  _mm256_storeu_si256((__m256i *)a, v);
44
372M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:yy_storeu_256
blend_a64_mask_avx2.c:yy_storeu_256
Line
Count
Source
42
4.41M
static inline void yy_storeu_256(void *const a, const __m256i v) {
43
4.41M
  _mm256_storeu_si256((__m256i *)a, v);
44
4.41M
}
Unexecuted instantiation: jnt_convolve_avx2.c:yy_storeu_256
reconinter_avx2.c:yy_storeu_256
Line
Count
Source
42
2.00M
static inline void yy_storeu_256(void *const a, const __m256i v) {
43
2.00M
  _mm256_storeu_si256((__m256i *)a, v);
44
2.00M
}
selfguided_avx2.c:yy_storeu_256
Line
Count
Source
42
335M
static inline void yy_storeu_256(void *const a, const __m256i v) {
43
335M
  _mm256_storeu_si256((__m256i *)a, v);
44
335M
}
Unexecuted instantiation: wiener_convolve_avx2.c:yy_storeu_256
highbd_wiener_convolve_avx2.c:yy_storeu_256
Line
Count
Source
42
30.1M
static inline void yy_storeu_256(void *const a, const __m256i v) {
43
30.1M
  _mm256_storeu_si256((__m256i *)a, v);
44
30.1M
}
45
46
// Fill an AVX register using an interleaved pair of values, i.e. set the
47
// 16 channels to {a, b} repeated 8 times, using the same channel ordering
48
// as when a register is stored to / loaded from memory.
49
//
50
// This is useful for rearranging filter kernels for use with the _mm256_madd_epi16
51
// instruction
52
0
static inline __m256i yy_set2_epi16(int16_t a, int16_t b) {
53
0
  return _mm256_setr_epi16(a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b);
54
0
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:yy_set2_epi16
Unexecuted instantiation: blend_a64_mask_avx2.c:yy_set2_epi16
Unexecuted instantiation: jnt_convolve_avx2.c:yy_set2_epi16
Unexecuted instantiation: reconinter_avx2.c:yy_set2_epi16
Unexecuted instantiation: selfguided_avx2.c:yy_set2_epi16
Unexecuted instantiation: wiener_convolve_avx2.c:yy_set2_epi16
Unexecuted instantiation: highbd_wiener_convolve_avx2.c:yy_set2_epi16
55
56
// Some compilers don't have _mm256_set_m128i defined in immintrin.h. We
57
// therefore define an equivalent function using a different intrinsic.
58
// ([ hi ], [ lo ]) -> [ hi ][ lo ]
59
3.45M
static inline __m256i yy_set_m128i(__m128i hi, __m128i lo) {
60
3.45M
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
61
3.45M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:yy_set_m128i
blend_a64_mask_avx2.c:yy_set_m128i
Line
Count
Source
59
1.32M
static inline __m256i yy_set_m128i(__m128i hi, __m128i lo) {
60
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
61
1.32M
}
Unexecuted instantiation: jnt_convolve_avx2.c:yy_set_m128i
reconinter_avx2.c:yy_set_m128i
Line
Count
Source
59
246k
static inline __m256i yy_set_m128i(__m128i hi, __m128i lo) {
60
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
61
246k
}
Unexecuted instantiation: selfguided_avx2.c:yy_set_m128i
Unexecuted instantiation: wiener_convolve_avx2.c:yy_set_m128i
highbd_wiener_convolve_avx2.c:yy_set_m128i
Line
Count
Source
59
1.88M
static inline __m256i yy_set_m128i(__m128i hi, __m128i lo) {
60
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
61
1.88M
}
62
63
// This behaves similarly to _mm256_set_epi64x(), but avoids undefined-behavior
64
// sanitizer warnings when loading values from unaligned buffers using
65
// `*(int64_t *)val`.
66
static inline __m256i yy_loadu_4x64(const void *e3, const void *e2,
67
183k
                                    const void *e1, const void *e0) {
68
183k
  __m128d v0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i *)e0));
69
183k
  __m128d v01 = _mm_loadh_pd(v0, (const double *)e1);
70
183k
  __m128d v2 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i *)e2));
71
183k
  __m128d v23 = _mm_loadh_pd(v2, (const double *)e3);
72
  // Note this can be replaced with
73
  // `_mm256_castpd_si256(_mm256_set_m128d(v23, v01))` if immintrin.h contains
74
  // _mm256_set_m128d() with all supported compilers. This version is used to
75
  // match the behavior with yy_set_m128i().
76
183k
  return yy_set_m128i(_mm_castpd_si128(v23), _mm_castpd_si128(v01));
77
183k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:yy_loadu_4x64
blend_a64_mask_avx2.c:yy_loadu_4x64
Line
Count
Source
67
183k
                                    const void *e1, const void *e0) {
68
183k
  __m128d v0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i *)e0));
69
183k
  __m128d v01 = _mm_loadh_pd(v0, (const double *)e1);
70
183k
  __m128d v2 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i *)e2));
71
183k
  __m128d v23 = _mm_loadh_pd(v2, (const double *)e3);
72
  // Note this can be replaced with
73
  // `_mm256_castpd_si256(_mm256_set_m128d(v23, v01))` if immintrin.h contains
74
  // _mm256_set_m128d() with all supported compilers. This version is used to
75
  // match the behavior with yy_set_m128i().
76
183k
  return yy_set_m128i(_mm_castpd_si128(v23), _mm_castpd_si128(v01));
77
183k
}
Unexecuted instantiation: jnt_convolve_avx2.c:yy_loadu_4x64
Unexecuted instantiation: reconinter_avx2.c:yy_loadu_4x64
Unexecuted instantiation: selfguided_avx2.c:yy_loadu_4x64
Unexecuted instantiation: wiener_convolve_avx2.c:yy_loadu_4x64
Unexecuted instantiation: highbd_wiener_convolve_avx2.c:yy_loadu_4x64
78
79
1.39M
static inline __m256i yy_loadu2_128(const void *hi, const void *lo) {
80
1.39M
  __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
81
1.39M
  __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
82
1.39M
  return yy_set_m128i(mhi, mlo);
83
1.39M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:yy_loadu2_128
blend_a64_mask_avx2.c:yy_loadu2_128
Line
Count
Source
79
1.14M
static inline __m256i yy_loadu2_128(const void *hi, const void *lo) {
80
1.14M
  __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
81
1.14M
  __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
82
1.14M
  return yy_set_m128i(mhi, mlo);
83
1.14M
}
Unexecuted instantiation: jnt_convolve_avx2.c:yy_loadu2_128
reconinter_avx2.c:yy_loadu2_128
Line
Count
Source
79
246k
static inline __m256i yy_loadu2_128(const void *hi, const void *lo) {
80
246k
  __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
81
246k
  __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
82
246k
  return yy_set_m128i(mhi, mlo);
83
246k
}
Unexecuted instantiation: selfguided_avx2.c:yy_loadu2_128
Unexecuted instantiation: wiener_convolve_avx2.c:yy_loadu2_128
Unexecuted instantiation: highbd_wiener_convolve_avx2.c:yy_loadu2_128
84
85
378k
static inline void yy_storeu2_128(void *hi, void *lo, const __m256i a) {
86
378k
  _mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1));
87
378k
  _mm_storeu_si128((__m128i *)lo, _mm256_castsi256_si128(a));
88
378k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:yy_storeu2_128
blend_a64_mask_avx2.c:yy_storeu2_128
Line
Count
Source
85
378k
static inline void yy_storeu2_128(void *hi, void *lo, const __m256i a) {
86
  _mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1));
87
378k
  _mm_storeu_si128((__m128i *)lo, _mm256_castsi256_si128(a));
88
378k
}
Unexecuted instantiation: jnt_convolve_avx2.c:yy_storeu2_128
Unexecuted instantiation: reconinter_avx2.c:yy_storeu2_128
Unexecuted instantiation: selfguided_avx2.c:yy_storeu2_128
Unexecuted instantiation: wiener_convolve_avx2.c:yy_storeu2_128
Unexecuted instantiation: highbd_wiener_convolve_avx2.c:yy_storeu2_128
89
90
765k
static inline __m256i yy_roundn_epu16(__m256i v_val_w, int bits) {
91
765k
  const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1);
92
765k
  return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256());
93
765k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:yy_roundn_epu16
blend_a64_mask_avx2.c:yy_roundn_epu16
Line
Count
Source
90
765k
static inline __m256i yy_roundn_epu16(__m256i v_val_w, int bits) {
91
765k
  const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1);
92
765k
  return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256());
93
765k
}
Unexecuted instantiation: jnt_convolve_avx2.c:yy_roundn_epu16
Unexecuted instantiation: reconinter_avx2.c:yy_roundn_epu16
Unexecuted instantiation: selfguided_avx2.c:yy_roundn_epu16
Unexecuted instantiation: wiener_convolve_avx2.c:yy_roundn_epu16
Unexecuted instantiation: highbd_wiener_convolve_avx2.c:yy_roundn_epu16
94
#endif  // AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_