/src/libvpx/vpx_dsp/x86/convolve_sse2.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2018 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | |
11 | | #ifndef VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_ |
12 | | #define VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_ |
13 | | |
14 | | #include <emmintrin.h> // SSE2 |
15 | | |
16 | | #include "./vpx_config.h" |
17 | | |
18 | | // Interprets the input register as 16-bit words 7 6 5 4 3 2 1 0, then returns
19 | | // the values at index 2 and 3 repeated as 3 2 3 2 3 2 3 2 (16-bit words).
20 | 0 | static INLINE __m128i extract_quarter_2_epi16_sse2(const __m128i *const reg) {
21 | 0 | __m128i tmp = _mm_unpacklo_epi32(*reg, *reg);  // 16-bit words become 3 2 3 2 1 0 1 0
22 | 0 | return _mm_unpackhi_epi64(tmp, tmp);  // duplicate the upper 64 bits: 3 2 3 2 3 2 3 2
23 | 0 | } Unexecuted instantiation: vpx_subpixel_4t_intrin_sse2.c:extract_quarter_2_epi16_sse2 Unexecuted instantiation: vpx_subpixel_8t_intrin_avx2.c:extract_quarter_2_epi16_sse2 Unexecuted instantiation: vpx_subpixel_8t_intrin_ssse3.c:extract_quarter_2_epi16_sse2 |
24 | | |
25 | | // Interprets the input register as 16-bit words 7 6 5 4 3 2 1 0, then returns
26 | | // the values at index 4 and 5 repeated as 5 4 5 4 5 4 5 4 (16-bit words). |
27 | 0 | static INLINE __m128i extract_quarter_3_epi16_sse2(const __m128i *const reg) {
28 | 0 | __m128i tmp = _mm_unpackhi_epi32(*reg, *reg);  // 16-bit words become 7 6 7 6 5 4 5 4
29 | 0 | return _mm_unpacklo_epi64(tmp, tmp);  // duplicate the lower 64 bits: 5 4 5 4 5 4 5 4
30 | 0 | } Unexecuted instantiation: vpx_subpixel_4t_intrin_sse2.c:extract_quarter_3_epi16_sse2 Unexecuted instantiation: vpx_subpixel_8t_intrin_avx2.c:extract_quarter_3_epi16_sse2 Unexecuted instantiation: vpx_subpixel_8t_intrin_ssse3.c:extract_quarter_3_epi16_sse2 |
31 | | |
32 | | // Interprets the low 8 bytes of each src as 8-bit values, zero extends them to
33 | | // form 16-bit words, then multiplies with ker and adds the adjacent results to
34 | | // form 32-bit words. Finally adds the results from 1 and 2 together. |
35 | | static INLINE __m128i mm_madd_add_epi8_sse2(const __m128i *const src_1, |
36 | | const __m128i *const src_2, |
37 | | const __m128i *const ker_1, |
38 | 0 | const __m128i *const ker_2) {
39 | 0 | const __m128i src_1_half = _mm_unpacklo_epi8(*src_1, _mm_setzero_si128());  // interleave with zero: low 8 bytes -> 8 words
40 | 0 | const __m128i src_2_half = _mm_unpacklo_epi8(*src_2, _mm_setzero_si128());  // the upper 8 bytes of src are not used
41 | 0 | const __m128i madd_1 = _mm_madd_epi16(src_1_half, *ker_1);  // pairwise multiply-add -> 4 x 32-bit
42 | 0 | const __m128i madd_2 = _mm_madd_epi16(src_2_half, *ker_2);
43 | 0 | return _mm_add_epi32(madd_1, madd_2);
44 | 0 | } Unexecuted instantiation: vpx_subpixel_4t_intrin_sse2.c:mm_madd_add_epi8_sse2 Unexecuted instantiation: vpx_subpixel_8t_intrin_avx2.c:mm_madd_add_epi8_sse2 Unexecuted instantiation: vpx_subpixel_8t_intrin_ssse3.c:mm_madd_add_epi8_sse2 |
45 | | |
46 | | // Interprets src as signed 16-bit words, then multiplies with ker and adds the
47 | | // adjacent results to form 32-bit words. Finally adds the results from 1 and 2
48 | | // together. |
49 | | static INLINE __m128i mm_madd_add_epi16_sse2(const __m128i *const src_1, |
50 | | const __m128i *const src_2, |
51 | | const __m128i *const ker_1, |
52 | 0 | const __m128i *const ker_2) {
53 | 0 | const __m128i madd_1 = _mm_madd_epi16(*src_1, *ker_1);  // pairwise multiply-add -> 4 x 32-bit
54 | 0 | const __m128i madd_2 = _mm_madd_epi16(*src_2, *ker_2);
55 | 0 | return _mm_add_epi32(madd_1, madd_2);  // lane-wise 32-bit sum of both products
56 | 0 | } Unexecuted instantiation: vpx_subpixel_4t_intrin_sse2.c:mm_madd_add_epi16_sse2 Unexecuted instantiation: vpx_subpixel_8t_intrin_avx2.c:mm_madd_add_epi16_sse2 Unexecuted instantiation: vpx_subpixel_8t_intrin_ssse3.c:mm_madd_add_epi16_sse2 |
57 | | |
// Interprets src_0 and src_1 as 16-bit words, multiplies each with the same
// ker and adds the adjacent results to form 32-bit words. The two sets of
// 32-bit results are then packed into 16-bit words with signed saturation:
// the results from src_0 fill the low half of the return value and the
// results from src_1 fill the high half.
58 | | static INLINE __m128i mm_madd_packs_epi16_sse2(const __m128i *const src_0, |
59 | | const __m128i *const src_1, |
60 | 0 | const __m128i *const ker) {
61 | 0 | const __m128i madd_1 = _mm_madd_epi16(*src_0, *ker);
62 | 0 | const __m128i madd_2 = _mm_madd_epi16(*src_1, *ker);
63 | 0 | return _mm_packs_epi32(madd_1, madd_2);  // saturating 32-bit -> 16-bit narrowing
64 | 0 | } Unexecuted instantiation: vpx_subpixel_4t_intrin_sse2.c:mm_madd_packs_epi16_sse2 Unexecuted instantiation: vpx_subpixel_8t_intrin_avx2.c:mm_madd_packs_epi16_sse2 Unexecuted instantiation: vpx_subpixel_8t_intrin_ssse3.c:mm_madd_packs_epi16_sse2 |
65 | | |
66 | | // Interleaves the 32-bit words of src_1 and src_2 (src_1[0], src_2[0], src_1[1], ...) and packs them to 16-bit words with signed saturation. |
67 | | static INLINE __m128i mm_zip_epi32_sse2(const __m128i *const src_1, |
68 | 0 | const __m128i *const src_2) {
69 | 0 | const __m128i tmp_1 = _mm_unpacklo_epi32(*src_1, *src_2);  // interleave the lower two 32-bit words of each input
70 | 0 | const __m128i tmp_2 = _mm_unpackhi_epi32(*src_1, *src_2);  // interleave the upper two 32-bit words of each input
71 | 0 | return _mm_packs_epi32(tmp_1, tmp_2);  // saturating 32-bit -> 16-bit narrowing, tmp_1 in the low half
72 | 0 | } Unexecuted instantiation: vpx_subpixel_4t_intrin_sse2.c:mm_zip_epi32_sse2 Unexecuted instantiation: vpx_subpixel_8t_intrin_avx2.c:mm_zip_epi32_sse2 Unexecuted instantiation: vpx_subpixel_8t_intrin_ssse3.c:mm_zip_epi32_sse2 |
73 | | |
// Adds half_depth to src as 32-bit words, then arithmetically shifts each
// 32-bit word right by depth bits.
// NOTE(review): presumably every lane of half_depth holds 1 << (depth - 1) so
// that the shift rounds to nearest -- confirm against the callers.
74 | | static INLINE __m128i mm_round_epi32_sse2(const __m128i *const src, |
75 | | const __m128i *const half_depth, |
76 | 0 | const int depth) {
77 | 0 | const __m128i nearest_src = _mm_add_epi32(*src, *half_depth);  // wrapping 32-bit add
78 | 0 | return _mm_srai_epi32(nearest_src, depth);  // arithmetic shift keeps the sign
79 | 0 | } Unexecuted instantiation: vpx_subpixel_4t_intrin_sse2.c:mm_round_epi32_sse2 Unexecuted instantiation: vpx_subpixel_8t_intrin_avx2.c:mm_round_epi32_sse2 Unexecuted instantiation: vpx_subpixel_8t_intrin_ssse3.c:mm_round_epi32_sse2 |
80 | | |
// Adds half_depth to src as 16-bit words using signed saturation, then
// arithmetically shifts each 16-bit word right by depth bits. Unlike
// mm_round_epi32_sse2, the addition here saturates instead of wrapping.
// NOTE(review): presumably every lane of half_depth holds 1 << (depth - 1) so
// that the shift rounds to nearest -- confirm against the callers.
81 | | static INLINE __m128i mm_round_epi16_sse2(const __m128i *const src, |
82 | | const __m128i *const half_depth, |
83 | 42.6M | const int depth) {
84 | 42.6M | const __m128i nearest_src = _mm_adds_epi16(*src, *half_depth);  // saturating signed 16-bit add
85 | 42.6M | return _mm_srai_epi16(nearest_src, depth);  // arithmetic shift keeps the sign
86 | 42.6M | } Unexecuted instantiation: vpx_subpixel_4t_intrin_sse2.c:mm_round_epi16_sse2 vpx_subpixel_8t_intrin_avx2.c:mm_round_epi16_sse2 Line | Count | Source | 83 | 42.6M | const int depth) { | 84 | 42.6M | const __m128i nearest_src = _mm_adds_epi16(*src, *half_depth); | 85 | 42.6M | return _mm_srai_epi16(nearest_src, depth); | 86 | 42.6M | } |
Unexecuted instantiation: vpx_subpixel_8t_intrin_ssse3.c:mm_round_epi16_sse2 |
87 | | |
88 | | #endif // VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_ |