/src/aom/third_party/SVT-AV1/EbMemory_AVX2.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright(c) 2019 Intel Corporation |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at https://www.aomedia.org/license/software-license. If the |
8 | | * Alliance for Open Media Patent License 1.0 was not distributed with this |
9 | | * source code in the PATENTS file, you can obtain it at |
10 | | * https://www.aomedia.org/license/patent-license. |
11 | | */ |
12 | | |
13 | | #ifndef AOM_THIRD_PARTY_SVT_AV1_EBMEMORY_AVX2_H_ |
14 | | #define AOM_THIRD_PARTY_SVT_AV1_EBMEMORY_AVX2_H_ |
15 | | |
#include <immintrin.h>
#include <string.h>

#include "config/aom_config.h"

#include "aom/aom_integer.h"
21 | | |
// Some toolchains' immintrin.h does not define the _mm256_set_m128i /
// _mm256_setr_m128i helpers; provide equivalent fallbacks guarded by #ifndef.
#ifndef _mm256_set_m128i
// Build a 256-bit vector whose HIGH 128-bit lane is 'hi' and LOW lane is 'lo'.
#define _mm256_set_m128i(/* __m128i */ hi, /* __m128i */ lo) \
  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
#endif

#ifndef _mm256_setr_m128i
// Same as _mm256_set_m128i, but arguments are in low-to-high (memory) order.
#define _mm256_setr_m128i(/* __m128i */ lo, /* __m128i */ hi) \
  _mm256_set_m128i((hi), (lo))
#endif
31 | | |
32 | | static inline __m256i load_u8_4x2_avx2(const uint8_t *const src, |
33 | 0 | const ptrdiff_t stride) { |
34 | 0 | __m128i src01; |
35 | 0 | src01 = _mm_cvtsi32_si128(*(int32_t *)(src + 0 * stride)); |
36 | 0 | src01 = _mm_insert_epi32(src01, *(int32_t *)(src + 1 * stride), 1); |
37 | 0 | return _mm256_setr_m128i(src01, _mm_setzero_si128()); |
38 | 0 | } Unexecuted instantiation: convolve_2d_avx2.c:load_u8_4x2_avx2 Unexecuted instantiation: convolve_avx2.c:load_u8_4x2_avx2 |
39 | | |
40 | | static inline __m256i load_u8_4x4_avx2(const uint8_t *const src, |
41 | 0 | const ptrdiff_t stride) { |
42 | 0 | __m128i src01, src23; |
43 | 0 | src01 = _mm_cvtsi32_si128(*(int32_t *)(src + 0 * stride)); |
44 | 0 | src01 = _mm_insert_epi32(src01, *(int32_t *)(src + 1 * stride), 1); |
45 | 0 | src23 = _mm_cvtsi32_si128(*(int32_t *)(src + 2 * stride)); |
46 | 0 | src23 = _mm_insert_epi32(src23, *(int32_t *)(src + 3 * stride), 1); |
47 | 0 | return _mm256_setr_m128i(src01, src23); |
48 | 0 | } Unexecuted instantiation: convolve_2d_avx2.c:load_u8_4x4_avx2 Unexecuted instantiation: convolve_avx2.c:load_u8_4x4_avx2 |
49 | | |
50 | | static inline __m256i load_u8_8x2_avx2(const uint8_t *const src, |
51 | 0 | const ptrdiff_t stride) { |
52 | 0 | const __m128i src0 = _mm_loadl_epi64((__m128i *)(src + 0 * stride)); |
53 | 0 | const __m128i src1 = _mm_loadl_epi64((__m128i *)(src + 1 * stride)); |
54 | 0 | return _mm256_setr_m128i(src0, src1); |
55 | 0 | } Unexecuted instantiation: convolve_2d_avx2.c:load_u8_8x2_avx2 Unexecuted instantiation: convolve_avx2.c:load_u8_8x2_avx2 |
56 | | |
57 | | static inline __m256i load_u8_8x4_avx2(const uint8_t *const src, |
58 | 0 | const ptrdiff_t stride) { |
59 | 0 | __m128i src01, src23; |
60 | 0 | src01 = _mm_loadl_epi64((__m128i *)(src + 0 * stride)); |
61 | 0 | src01 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(src01), |
62 | 0 | (double *)(void *)(src + 1 * stride))); |
63 | 0 | src23 = _mm_loadl_epi64((__m128i *)(src + 2 * stride)); |
64 | 0 | src23 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(src23), |
65 | 0 | (double *)(void *)(src + 3 * stride))); |
66 | 0 | return _mm256_setr_m128i(src01, src23); |
67 | 0 | } Unexecuted instantiation: convolve_2d_avx2.c:load_u8_8x4_avx2 Unexecuted instantiation: convolve_avx2.c:load_u8_8x4_avx2 |
68 | | |
69 | | static inline __m256i loadu_8bit_16x2_avx2(const void *const src, |
70 | 11.4M | const ptrdiff_t strideInByte) { |
71 | 11.4M | const __m128i src0 = _mm_loadu_si128((__m128i *)src); |
72 | 11.4M | const __m128i src1 = |
73 | 11.4M | _mm_loadu_si128((__m128i *)((uint8_t *)src + strideInByte)); |
74 | 11.4M | return _mm256_setr_m128i(src0, src1); |
75 | 11.4M | } convolve_2d_avx2.c:loadu_8bit_16x2_avx2 Line | Count | Source | 70 | 8.77M | const ptrdiff_t strideInByte) { | 71 | 8.77M | const __m128i src0 = _mm_loadu_si128((__m128i *)src); | 72 | 8.77M | const __m128i src1 = | 73 | 8.77M | _mm_loadu_si128((__m128i *)((uint8_t *)src + strideInByte)); | 74 | 8.77M | return _mm256_setr_m128i(src0, src1); | 75 | 8.77M | } |
convolve_avx2.c:loadu_8bit_16x2_avx2 Line | Count | Source | 70 | 2.68M | const ptrdiff_t strideInByte) { | 71 | 2.68M | const __m128i src0 = _mm_loadu_si128((__m128i *)src); | 72 | 2.68M | const __m128i src1 = | 73 | 2.68M | _mm_loadu_si128((__m128i *)((uint8_t *)src + strideInByte)); | 74 | 2.68M | return _mm256_setr_m128i(src0, src1); | 75 | 2.68M | } |
|
76 | | |
77 | | static inline __m256i loadu_u8_16x2_avx2(const uint8_t *const src, |
78 | 0 | const ptrdiff_t stride) { |
79 | 0 | return loadu_8bit_16x2_avx2(src, sizeof(*src) * stride); |
80 | 0 | } Unexecuted instantiation: convolve_2d_avx2.c:loadu_u8_16x2_avx2 Unexecuted instantiation: convolve_avx2.c:loadu_u8_16x2_avx2 |
81 | | |
82 | | static inline __m256i loadu_u16_8x2_avx2(const uint16_t *const src, |
83 | 0 | const ptrdiff_t stride) { |
84 | 0 | return loadu_8bit_16x2_avx2(src, sizeof(*src) * stride); |
85 | 0 | } Unexecuted instantiation: convolve_2d_avx2.c:loadu_u16_8x2_avx2 Unexecuted instantiation: convolve_avx2.c:loadu_u16_8x2_avx2 |
86 | | |
87 | | static inline void storeu_8bit_16x2_avx2(const __m256i src, void *const dst, |
88 | 3.35M | const ptrdiff_t strideInByte) { |
89 | 3.35M | const __m128i d0 = _mm256_castsi256_si128(src); |
90 | 3.35M | const __m128i d1 = _mm256_extracti128_si256(src, 1); |
91 | 3.35M | _mm_storeu_si128((__m128i *)dst, d0); |
92 | 3.35M | _mm_storeu_si128((__m128i *)((uint8_t *)dst + strideInByte), d1); |
93 | 3.35M | } convolve_2d_avx2.c:storeu_8bit_16x2_avx2 Line | Count | Source | 88 | 1.71M | const ptrdiff_t strideInByte) { | 89 | 1.71M | const __m128i d0 = _mm256_castsi256_si128(src); | 90 | 1.71M | const __m128i d1 = _mm256_extracti128_si256(src, 1); | 91 | 1.71M | _mm_storeu_si128((__m128i *)dst, d0); | 92 | 1.71M | _mm_storeu_si128((__m128i *)((uint8_t *)dst + strideInByte), d1); | 93 | 1.71M | } |
convolve_avx2.c:storeu_8bit_16x2_avx2 Line | Count | Source | 88 | 1.64M | const ptrdiff_t strideInByte) { | 89 | 1.64M | const __m128i d0 = _mm256_castsi256_si128(src); | 90 | 1.64M | const __m128i d1 = _mm256_extracti128_si256(src, 1); | 91 | 1.64M | _mm_storeu_si128((__m128i *)dst, d0); | 92 | 1.64M | _mm_storeu_si128((__m128i *)((uint8_t *)dst + strideInByte), d1); | 93 | 1.64M | } |
|
94 | | |
// Store the two 128-bit lanes of 'src' as two unaligned rows of 16
// uint8_t pixels, 'stride' elements apart.
static inline void storeu_u8_16x2_avx2(const __m256i src, uint8_t *const dst,
                                       const ptrdiff_t stride) {
  const ptrdiff_t stride_in_bytes = (ptrdiff_t)sizeof(*dst) * stride;
  storeu_8bit_16x2_avx2(src, dst, stride_in_bytes);
}
99 | | |
// Store the two 128-bit lanes of 'src' as two unaligned rows of 8
// int16_t values, 'stride' elements apart.
static inline void storeu_s16_8x2_avx2(const __m256i src, int16_t *const dst,
                                       const ptrdiff_t stride) {
  const ptrdiff_t stride_in_bytes = (ptrdiff_t)sizeof(*dst) * stride;
  storeu_8bit_16x2_avx2(src, dst, stride_in_bytes);
}
104 | | |
// Store the two 128-bit lanes of 'src' as two unaligned rows of 8
// uint16_t values, 'stride' elements apart.
static inline void storeu_u16_8x2_avx2(const __m256i src, uint16_t *const dst,
                                       const ptrdiff_t stride) {
  const ptrdiff_t stride_in_bytes = (ptrdiff_t)sizeof(*dst) * stride;
  storeu_8bit_16x2_avx2(src, dst, stride_in_bytes);
}
109 | | |
110 | | #endif // AOM_THIRD_PARTY_SVT_AV1_EBMEMORY_AVX2_H_ |