/src/aom/aom_dsp/x86/mem_sse2.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2017, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #ifndef AOM_AOM_DSP_X86_MEM_SSE2_H_ |
13 | | #define AOM_AOM_DSP_X86_MEM_SSE2_H_ |
14 | | |
15 | | #include <emmintrin.h> // SSE2 |
16 | | #include <string.h> |
17 | | |
18 | | #include "config/aom_config.h" |
19 | | |
20 | | #include "aom/aom_integer.h" |
21 | | |
22 | 462k | static inline int16_t loadu_int16(const void *src) { |
23 | 462k | int16_t v; |
24 | 462k | memcpy(&v, src, sizeof(v)); |
25 | 462k | return v; |
26 | 462k | } Unexecuted instantiation: aom_subpixel_8t_intrin_ssse3.c:loadu_int16 Unexecuted instantiation: resize_ssse3.c:loadu_int16 Unexecuted instantiation: convolve_2d_avx2.c:loadu_int16 convolve_avx2.c:loadu_int16 Line | Count | Source | 22 | 462k | static inline int16_t loadu_int16(const void *src) { | 23 | 462k | int16_t v; | 24 | 462k | memcpy(&v, src, sizeof(v)); | 25 | 462k | return v; | 26 | 462k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:loadu_int16 |
27 | | |
28 | 3.90M | static inline int32_t loadu_int32(const void *src) { |
29 | 3.90M | int32_t v; |
30 | 3.90M | memcpy(&v, src, sizeof(v)); |
31 | 3.90M | return v; |
32 | 3.90M | } Unexecuted instantiation: aom_subpixel_8t_intrin_ssse3.c:loadu_int32 Unexecuted instantiation: resize_ssse3.c:loadu_int32 convolve_2d_avx2.c:loadu_int32 Line | Count | Source | 28 | 1.00M | static inline int32_t loadu_int32(const void *src) { | 29 | 1.00M | int32_t v; | 30 | 1.00M | memcpy(&v, src, sizeof(v)); | 31 | 1.00M | return v; | 32 | 1.00M | } |
convolve_avx2.c:loadu_int32 Line | Count | Source | 28 | 2.69M | static inline int32_t loadu_int32(const void *src) { | 29 | 2.69M | int32_t v; | 30 | 2.69M | memcpy(&v, src, sizeof(v)); | 31 | 2.69M | return v; | 32 | 2.69M | } |
jnt_convolve_avx2.c:loadu_int32 Line | Count | Source | 28 | 210k | static inline int32_t loadu_int32(const void *src) { | 29 | 210k | int32_t v; | 30 | 210k | memcpy(&v, src, sizeof(v)); | 31 | 210k | return v; | 32 | 210k | } |
|
33 | | |
34 | 0 | static inline int64_t loadu_int64(const void *src) { |
35 | 0 | int64_t v; |
36 | 0 | memcpy(&v, src, sizeof(v)); |
37 | 0 | return v; |
38 | 0 | } Unexecuted instantiation: aom_subpixel_8t_intrin_ssse3.c:loadu_int64 Unexecuted instantiation: resize_ssse3.c:loadu_int64 Unexecuted instantiation: convolve_2d_avx2.c:loadu_int64 Unexecuted instantiation: convolve_avx2.c:loadu_int64 Unexecuted instantiation: jnt_convolve_avx2.c:loadu_int64 |
39 | | |
40 | 336k | static inline void _mm_storeh_epi64(__m128i *const d, const __m128i s) { |
41 | 336k | _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s)); |
42 | 336k | } Unexecuted instantiation: aom_subpixel_8t_intrin_ssse3.c:_mm_storeh_epi64 Unexecuted instantiation: resize_ssse3.c:_mm_storeh_epi64 Unexecuted instantiation: convolve_2d_avx2.c:_mm_storeh_epi64 convolve_avx2.c:_mm_storeh_epi64 Line | Count | Source | 40 | 51.3k | static inline void _mm_storeh_epi64(__m128i *const d, const __m128i s) { | 41 | 51.3k | _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s)); | 42 | 51.3k | } |
jnt_convolve_avx2.c:_mm_storeh_epi64 Line | Count | Source | 40 | 285k | static inline void _mm_storeh_epi64(__m128i *const d, const __m128i s) { | 41 | 285k | _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s)); | 42 | 285k | } |
|
43 | | |
44 | 5.40M | static inline __m128i loadh_epi64(const void *const src, const __m128i s) { |
45 | 5.40M | return _mm_castps_si128( |
46 | 5.40M | _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src)); |
47 | 5.40M | } Unexecuted instantiation: aom_subpixel_8t_intrin_ssse3.c:loadh_epi64 Unexecuted instantiation: resize_ssse3.c:loadh_epi64 convolve_2d_avx2.c:loadh_epi64 Line | Count | Source | 44 | 4.28M | static inline __m128i loadh_epi64(const void *const src, const __m128i s) { | 45 | 4.28M | return _mm_castps_si128( | 46 | 4.28M | _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src)); | 47 | 4.28M | } |
convolve_avx2.c:loadh_epi64 Line | Count | Source | 44 | 1.11M | static inline __m128i loadh_epi64(const void *const src, const __m128i s) { | 45 | 1.11M | return _mm_castps_si128( | 46 | 1.11M | _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src)); | 47 | 1.11M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:loadh_epi64 |
48 | | |
49 | | static inline __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src, |
50 | 0 | const int byte_stride) { |
51 | 0 | return _mm_setr_epi32(loadu_int32((int8_t *)src + 0 * byte_stride), |
52 | 0 | loadu_int32((int8_t *)src + 1 * byte_stride), |
53 | 0 | loadu_int32((int8_t *)src + 2 * byte_stride), |
54 | 0 | loadu_int32((int8_t *)src + 3 * byte_stride)); |
55 | 0 | } Unexecuted instantiation: aom_subpixel_8t_intrin_ssse3.c:load_8bit_4x4_to_1_reg_sse2 Unexecuted instantiation: resize_ssse3.c:load_8bit_4x4_to_1_reg_sse2 Unexecuted instantiation: convolve_2d_avx2.c:load_8bit_4x4_to_1_reg_sse2 Unexecuted instantiation: convolve_avx2.c:load_8bit_4x4_to_1_reg_sse2 Unexecuted instantiation: jnt_convolve_avx2.c:load_8bit_4x4_to_1_reg_sse2 |
56 | | |
57 | | static inline __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src, |
58 | 5.40M | const int byte_stride) { |
59 | 5.40M | __m128i dst; |
60 | 5.40M | dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride)); |
61 | 5.40M | dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst); |
62 | 5.40M | return dst; |
63 | 5.40M | } Unexecuted instantiation: aom_subpixel_8t_intrin_ssse3.c:load_8bit_8x2_to_1_reg_sse2 Unexecuted instantiation: resize_ssse3.c:load_8bit_8x2_to_1_reg_sse2 convolve_2d_avx2.c:load_8bit_8x2_to_1_reg_sse2 Line | Count | Source | 58 | 4.28M | const int byte_stride) { | 59 | 4.28M | __m128i dst; | 60 | 4.28M | dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride)); | 61 | 4.28M | dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst); | 62 | 4.28M | return dst; | 63 | 4.28M | } |
convolve_avx2.c:load_8bit_8x2_to_1_reg_sse2 Line | Count | Source | 58 | 1.11M | const int byte_stride) { | 59 | 1.11M | __m128i dst; | 60 | 1.11M | dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride)); | 61 | 1.11M | dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst); | 62 | 1.11M | return dst; | 63 | 1.11M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_8bit_8x2_to_1_reg_sse2 |
64 | | |
65 | | static inline void store_8bit_8x4_from_16x2(const __m128i *const s, |
66 | | uint8_t *const d, |
67 | 0 | const ptrdiff_t stride) { |
68 | 0 | _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]); |
69 | 0 | _mm_storeh_epi64((__m128i *)(d + 1 * stride), s[0]); |
70 | 0 | _mm_storel_epi64((__m128i *)(d + 2 * stride), s[1]); |
71 | 0 | _mm_storeh_epi64((__m128i *)(d + 3 * stride), s[1]); |
72 | 0 | } Unexecuted instantiation: aom_subpixel_8t_intrin_ssse3.c:store_8bit_8x4_from_16x2 Unexecuted instantiation: resize_ssse3.c:store_8bit_8x4_from_16x2 Unexecuted instantiation: convolve_2d_avx2.c:store_8bit_8x4_from_16x2 Unexecuted instantiation: convolve_avx2.c:store_8bit_8x4_from_16x2 Unexecuted instantiation: jnt_convolve_avx2.c:store_8bit_8x4_from_16x2 |
73 | | |
74 | | static inline void store_8bit_4x4(const __m128i *const s, uint8_t *const d, |
75 | 0 | const ptrdiff_t stride) { |
76 | 0 | *(int *)(d + 0 * stride) = _mm_cvtsi128_si32(s[0]); |
77 | 0 | *(int *)(d + 1 * stride) = _mm_cvtsi128_si32(s[1]); |
78 | 0 | *(int *)(d + 2 * stride) = _mm_cvtsi128_si32(s[2]); |
79 | 0 | *(int *)(d + 3 * stride) = _mm_cvtsi128_si32(s[3]); |
80 | 0 | } Unexecuted instantiation: aom_subpixel_8t_intrin_ssse3.c:store_8bit_4x4 Unexecuted instantiation: resize_ssse3.c:store_8bit_4x4 Unexecuted instantiation: convolve_2d_avx2.c:store_8bit_4x4 Unexecuted instantiation: convolve_avx2.c:store_8bit_4x4 Unexecuted instantiation: jnt_convolve_avx2.c:store_8bit_4x4 |
81 | | |
82 | | static inline void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d, |
83 | 0 | const ptrdiff_t stride) { |
84 | 0 | __m128i ss[4]; |
85 | |
|
86 | 0 | ss[0] = s; |
87 | 0 | ss[1] = _mm_srli_si128(s, 4); |
88 | 0 | ss[2] = _mm_srli_si128(s, 8); |
89 | 0 | ss[3] = _mm_srli_si128(s, 12); |
90 | 0 | store_8bit_4x4(ss, d, stride); |
91 | 0 | } Unexecuted instantiation: aom_subpixel_8t_intrin_ssse3.c:store_8bit_4x4_sse2 Unexecuted instantiation: resize_ssse3.c:store_8bit_4x4_sse2 Unexecuted instantiation: convolve_2d_avx2.c:store_8bit_4x4_sse2 Unexecuted instantiation: convolve_avx2.c:store_8bit_4x4_sse2 Unexecuted instantiation: jnt_convolve_avx2.c:store_8bit_4x4_sse2 |
92 | | |
93 | | static inline void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride, |
94 | 0 | __m128i *const d) { |
95 | 0 | d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride)); |
96 | 0 | d[1] = _mm_cvtsi32_si128(*(const int *)(s + 1 * stride)); |
97 | 0 | d[2] = _mm_cvtsi32_si128(*(const int *)(s + 2 * stride)); |
98 | 0 | d[3] = _mm_cvtsi32_si128(*(const int *)(s + 3 * stride)); |
99 | 0 | } Unexecuted instantiation: aom_subpixel_8t_intrin_ssse3.c:load_8bit_4x4 Unexecuted instantiation: resize_ssse3.c:load_8bit_4x4 Unexecuted instantiation: convolve_2d_avx2.c:load_8bit_4x4 Unexecuted instantiation: convolve_avx2.c:load_8bit_4x4 Unexecuted instantiation: jnt_convolve_avx2.c:load_8bit_4x4 |
100 | | |
101 | | static inline void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride, |
102 | 0 | __m128i *const d) { |
103 | 0 | load_8bit_4x4(s + 0 * stride, stride, &d[0]); |
104 | 0 | load_8bit_4x4(s + 4 * stride, stride, &d[4]); |
105 | 0 | } Unexecuted instantiation: aom_subpixel_8t_intrin_ssse3.c:load_8bit_4x8 Unexecuted instantiation: resize_ssse3.c:load_8bit_4x8 Unexecuted instantiation: convolve_2d_avx2.c:load_8bit_4x8 Unexecuted instantiation: convolve_avx2.c:load_8bit_4x8 Unexecuted instantiation: jnt_convolve_avx2.c:load_8bit_4x8 |
106 | | |
107 | | static inline void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride, |
108 | 0 | __m128i *const d) { |
109 | 0 | d[0] = _mm_loadl_epi64((const __m128i *)(s + 0 * stride)); |
110 | 0 | d[1] = _mm_loadl_epi64((const __m128i *)(s + 1 * stride)); |
111 | 0 | d[2] = _mm_loadl_epi64((const __m128i *)(s + 2 * stride)); |
112 | 0 | d[3] = _mm_loadl_epi64((const __m128i *)(s + 3 * stride)); |
113 | 0 | } Unexecuted instantiation: aom_subpixel_8t_intrin_ssse3.c:load_8bit_8x4 Unexecuted instantiation: resize_ssse3.c:load_8bit_8x4 Unexecuted instantiation: convolve_2d_avx2.c:load_8bit_8x4 Unexecuted instantiation: convolve_avx2.c:load_8bit_8x4 Unexecuted instantiation: jnt_convolve_avx2.c:load_8bit_8x4 |
114 | | |
115 | | static inline void loadu_8bit_16x4(const uint8_t *const s, |
116 | 0 | const ptrdiff_t stride, __m128i *const d) { |
117 | 0 | d[0] = _mm_loadu_si128((const __m128i *)(s + 0 * stride)); |
118 | 0 | d[1] = _mm_loadu_si128((const __m128i *)(s + 1 * stride)); |
119 | 0 | d[2] = _mm_loadu_si128((const __m128i *)(s + 2 * stride)); |
120 | 0 | d[3] = _mm_loadu_si128((const __m128i *)(s + 3 * stride)); |
121 | 0 | } Unexecuted instantiation: aom_subpixel_8t_intrin_ssse3.c:loadu_8bit_16x4 Unexecuted instantiation: resize_ssse3.c:loadu_8bit_16x4 Unexecuted instantiation: convolve_2d_avx2.c:loadu_8bit_16x4 Unexecuted instantiation: convolve_avx2.c:loadu_8bit_16x4 Unexecuted instantiation: jnt_convolve_avx2.c:loadu_8bit_16x4 |
122 | | |
123 | | static inline void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride, |
124 | 0 | __m128i *const d) { |
125 | 0 | load_8bit_8x4(s + 0 * stride, stride, &d[0]); |
126 | 0 | load_8bit_8x4(s + 4 * stride, stride, &d[4]); |
127 | 0 | } Unexecuted instantiation: aom_subpixel_8t_intrin_ssse3.c:load_8bit_8x8 Unexecuted instantiation: resize_ssse3.c:load_8bit_8x8 Unexecuted instantiation: convolve_2d_avx2.c:load_8bit_8x8 Unexecuted instantiation: convolve_avx2.c:load_8bit_8x8 Unexecuted instantiation: jnt_convolve_avx2.c:load_8bit_8x8 |
128 | | |
129 | | static inline void load_8bit_16x8(const uint8_t *const s, |
130 | 0 | const ptrdiff_t stride, __m128i *const d) { |
131 | 0 | d[0] = _mm_load_si128((const __m128i *)(s + 0 * stride)); |
132 | 0 | d[1] = _mm_load_si128((const __m128i *)(s + 1 * stride)); |
133 | 0 | d[2] = _mm_load_si128((const __m128i *)(s + 2 * stride)); |
134 | 0 | d[3] = _mm_load_si128((const __m128i *)(s + 3 * stride)); |
135 | 0 | d[4] = _mm_load_si128((const __m128i *)(s + 4 * stride)); |
136 | 0 | d[5] = _mm_load_si128((const __m128i *)(s + 5 * stride)); |
137 | 0 | d[6] = _mm_load_si128((const __m128i *)(s + 6 * stride)); |
138 | 0 | d[7] = _mm_load_si128((const __m128i *)(s + 7 * stride)); |
139 | 0 | } Unexecuted instantiation: aom_subpixel_8t_intrin_ssse3.c:load_8bit_16x8 Unexecuted instantiation: resize_ssse3.c:load_8bit_16x8 Unexecuted instantiation: convolve_2d_avx2.c:load_8bit_16x8 Unexecuted instantiation: convolve_avx2.c:load_8bit_16x8 Unexecuted instantiation: jnt_convolve_avx2.c:load_8bit_16x8 |
140 | | |
141 | | static inline void loadu_8bit_16x8(const uint8_t *const s, |
142 | 0 | const ptrdiff_t stride, __m128i *const d) { |
143 | 0 | loadu_8bit_16x4(s + 0 * stride, stride, &d[0]); |
144 | 0 | loadu_8bit_16x4(s + 4 * stride, stride, &d[4]); |
145 | 0 | } Unexecuted instantiation: aom_subpixel_8t_intrin_ssse3.c:loadu_8bit_16x8 Unexecuted instantiation: resize_ssse3.c:loadu_8bit_16x8 Unexecuted instantiation: convolve_2d_avx2.c:loadu_8bit_16x8 Unexecuted instantiation: convolve_avx2.c:loadu_8bit_16x8 Unexecuted instantiation: jnt_convolve_avx2.c:loadu_8bit_16x8 |
146 | | |
147 | | static inline void store_8bit_8x8(const __m128i *const s, uint8_t *const d, |
148 | 0 | const ptrdiff_t stride) { |
149 | 0 | _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]); |
150 | 0 | _mm_storel_epi64((__m128i *)(d + 1 * stride), s[1]); |
151 | 0 | _mm_storel_epi64((__m128i *)(d + 2 * stride), s[2]); |
152 | 0 | _mm_storel_epi64((__m128i *)(d + 3 * stride), s[3]); |
153 | 0 | _mm_storel_epi64((__m128i *)(d + 4 * stride), s[4]); |
154 | 0 | _mm_storel_epi64((__m128i *)(d + 5 * stride), s[5]); |
155 | 0 | _mm_storel_epi64((__m128i *)(d + 6 * stride), s[6]); |
156 | 0 | _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]); |
157 | 0 | } Unexecuted instantiation: aom_subpixel_8t_intrin_ssse3.c:store_8bit_8x8 Unexecuted instantiation: resize_ssse3.c:store_8bit_8x8 Unexecuted instantiation: convolve_2d_avx2.c:store_8bit_8x8 Unexecuted instantiation: convolve_avx2.c:store_8bit_8x8 Unexecuted instantiation: jnt_convolve_avx2.c:store_8bit_8x8 |
158 | | |
159 | | static inline void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d, |
160 | 0 | const ptrdiff_t stride) { |
161 | 0 | _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]); |
162 | 0 | _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]); |
163 | 0 | _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]); |
164 | 0 | _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]); |
165 | 0 | } Unexecuted instantiation: aom_subpixel_8t_intrin_ssse3.c:storeu_8bit_16x4 Unexecuted instantiation: resize_ssse3.c:storeu_8bit_16x4 Unexecuted instantiation: convolve_2d_avx2.c:storeu_8bit_16x4 Unexecuted instantiation: convolve_avx2.c:storeu_8bit_16x4 Unexecuted instantiation: jnt_convolve_avx2.c:storeu_8bit_16x4 |
166 | | |
167 | | #endif // AOM_AOM_DSP_X86_MEM_SSE2_H_ |