/src/aom/aom_dsp/x86/blend_sse4.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2016, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #ifndef AOM_AOM_DSP_X86_BLEND_SSE4_H_ |
13 | | #define AOM_AOM_DSP_X86_BLEND_SSE4_H_ |
14 | | |
15 | | #include "aom_dsp/blend.h" |
16 | | #include "aom_dsp/x86/synonyms.h" |
// Byte-index table made of two identical 16-entry halves. Each half lists the
// even indices 0..14 first and the odd indices 1..15 second.
// NOTE(review): no user of this table is visible in this header; it is
// presumably a byte-shuffle control for the mask kernels - confirm at the
// call sites.
static const uint8_t g_blend_a64_mask_shuffle[32] = {
  // First half: even indices, then odd indices.
  0, 2, 4, 6, 8, 10, 12, 14,
  1, 3, 5, 7, 9, 11, 13, 15,
  // Second half: same pattern.
  0, 2, 4, 6, 8, 10, 12, 14,
  1, 3, 5, 7, 9, 11, 13, 15,
};
21 | | |
22 | | ////////////////////////////////////////////////////////////////////////////// |
23 | | // Common kernels |
24 | | ////////////////////////////////////////////////////////////////////////////// |
25 | | |
26 | | static inline __m128i blend_4(const uint8_t *src0, const uint8_t *src1, |
27 | 324k | const __m128i *v_m0_w, const __m128i *v_m1_w) { |
28 | 324k | const __m128i v_s0_b = xx_loadl_32(src0); |
29 | 324k | const __m128i v_s1_b = xx_loadl_32(src1); |
30 | 324k | const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); |
31 | 324k | const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); |
32 | | |
33 | 324k | const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w); |
34 | 324k | const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w); |
35 | 324k | const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); |
36 | 324k | const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); |
37 | | |
38 | 324k | return v_res_w; |
39 | 324k | } Unexecuted instantiation: blend_a64_mask_sse4.c:blend_4 blend_a64_vmask_sse4.c:blend_4 Line | Count | Source | 27 | 324k | const __m128i *v_m0_w, const __m128i *v_m1_w) { | 28 | 324k | const __m128i v_s0_b = xx_loadl_32(src0); | 29 | 324k | const __m128i v_s1_b = xx_loadl_32(src1); | 30 | 324k | const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); | 31 | 324k | const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); | 32 | | | 33 | 324k | const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w); | 34 | 324k | const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w); | 35 | 324k | const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); | 36 | 324k | const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); | 37 | | | 38 | 324k | return v_res_w; | 39 | 324k | } |
Unexecuted instantiation: blend_a64_mask_avx2.c:blend_4 |
40 | | |
41 | | static inline __m128i blend_8(const uint8_t *src0, const uint8_t *src1, |
42 | 5.31M | const __m128i *v_m0_w, const __m128i *v_m1_w) { |
43 | 5.31M | const __m128i v_s0_b = xx_loadl_64(src0); |
44 | 5.31M | const __m128i v_s1_b = xx_loadl_64(src1); |
45 | 5.31M | const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); |
46 | 5.31M | const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); |
47 | | |
48 | 5.31M | const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w); |
49 | 5.31M | const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w); |
50 | | |
51 | 5.31M | const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); |
52 | | |
53 | 5.31M | const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); |
54 | | |
55 | 5.31M | return v_res_w; |
56 | 5.31M | } Unexecuted instantiation: blend_a64_mask_sse4.c:blend_8 blend_a64_vmask_sse4.c:blend_8 Line | Count | Source | 42 | 5.31M | const __m128i *v_m0_w, const __m128i *v_m1_w) { | 43 | 5.31M | const __m128i v_s0_b = xx_loadl_64(src0); | 44 | 5.31M | const __m128i v_s1_b = xx_loadl_64(src1); | 45 | 5.31M | const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); | 46 | 5.31M | const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); | 47 | | | 48 | 5.31M | const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w); | 49 | 5.31M | const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w); | 50 | | | 51 | 5.31M | const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); | 52 | | | 53 | 5.31M | const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); | 54 | | | 55 | 5.31M | return v_res_w; | 56 | 5.31M | } |
Unexecuted instantiation: blend_a64_mask_avx2.c:blend_8 |
57 | | |
58 | | static inline __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1, |
59 | | const __m128i *v_m0_b, const __m128i *v_m1_b, |
60 | 3.95M | const __m128i *rounding) { |
61 | 3.95M | const __m128i v_s0_b = xx_loadl_32(src0); |
62 | 3.95M | const __m128i v_s1_b = xx_loadl_32(src1); |
63 | | |
64 | 3.95M | const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), |
65 | 3.95M | _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); |
66 | | |
67 | 3.95M | const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); |
68 | 3.95M | const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); |
69 | 3.95M | return v_res; |
70 | 3.95M | } blend_a64_mask_sse4.c:blend_4_u8 Line | Count | Source | 60 | 3.41M | const __m128i *rounding) { | 61 | 3.41M | const __m128i v_s0_b = xx_loadl_32(src0); | 62 | 3.41M | const __m128i v_s1_b = xx_loadl_32(src1); | 63 | | | 64 | 3.41M | const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), | 65 | 3.41M | _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); | 66 | | | 67 | 3.41M | const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); | 68 | 3.41M | const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); | 69 | 3.41M | return v_res; | 70 | 3.41M | } |
Unexecuted instantiation: blend_a64_vmask_sse4.c:blend_4_u8 blend_a64_mask_avx2.c:blend_4_u8 Line | Count | Source | 60 | 540k | const __m128i *rounding) { | 61 | 540k | const __m128i v_s0_b = xx_loadl_32(src0); | 62 | 540k | const __m128i v_s1_b = xx_loadl_32(src1); | 63 | | | 64 | 540k | const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), | 65 | 540k | _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); | 66 | | | 67 | 540k | const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); | 68 | 540k | const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); | 69 | 540k | return v_res; | 70 | 540k | } |
|
71 | | |
72 | | static inline __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1, |
73 | | const __m128i *v_m0_b, const __m128i *v_m1_b, |
74 | 3.94M | const __m128i *rounding) { |
75 | 3.94M | const __m128i v_s0_b = xx_loadl_64(src0); |
76 | 3.94M | const __m128i v_s1_b = xx_loadl_64(src1); |
77 | | |
78 | 3.94M | const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), |
79 | 3.94M | _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); |
80 | | |
81 | 3.94M | const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); |
82 | 3.94M | const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); |
83 | 3.94M | return v_res; |
84 | 3.94M | } blend_a64_mask_sse4.c:blend_8_u8 Line | Count | Source | 74 | 2.22M | const __m128i *rounding) { | 75 | 2.22M | const __m128i v_s0_b = xx_loadl_64(src0); | 76 | 2.22M | const __m128i v_s1_b = xx_loadl_64(src1); | 77 | | | 78 | 2.22M | const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), | 79 | 2.22M | _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); | 80 | | | 81 | 2.22M | const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); | 82 | 2.22M | const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); | 83 | 2.22M | return v_res; | 84 | 2.22M | } |
Unexecuted instantiation: blend_a64_vmask_sse4.c:blend_8_u8 blend_a64_mask_avx2.c:blend_8_u8 Line | Count | Source | 74 | 1.71M | const __m128i *rounding) { | 75 | 1.71M | const __m128i v_s0_b = xx_loadl_64(src0); | 76 | 1.71M | const __m128i v_s1_b = xx_loadl_64(src1); | 77 | | | 78 | 1.71M | const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), | 79 | 1.71M | _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); | 80 | | | 81 | 1.71M | const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); | 82 | 1.71M | const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); | 83 | 1.71M | return v_res; | 84 | 1.71M | } |
|
85 | | |
86 | | static inline __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1, |
87 | | const __m128i *v_m0_b, const __m128i *v_m1_b, |
88 | 2.49M | const __m128i *rounding) { |
89 | 2.49M | const __m128i v_s0_b = xx_loadu_128(src0); |
90 | 2.49M | const __m128i v_s1_b = xx_loadu_128(src1); |
91 | | |
92 | 2.49M | const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), |
93 | 2.49M | _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); |
94 | 2.49M | const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b), |
95 | 2.49M | _mm_unpackhi_epi8(*v_m0_b, *v_m1_b)); |
96 | | |
97 | 2.49M | const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding); |
98 | 2.49M | const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding); |
99 | 2.49M | const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w); |
100 | 2.49M | return v_res; |
101 | 2.49M | } blend_a64_mask_sse4.c:blend_16_u8 Line | Count | Source | 88 | 1.05M | const __m128i *rounding) { | 89 | 1.05M | const __m128i v_s0_b = xx_loadu_128(src0); | 90 | 1.05M | const __m128i v_s1_b = xx_loadu_128(src1); | 91 | | | 92 | 1.05M | const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), | 93 | 1.05M | _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); | 94 | 1.05M | const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b), | 95 | 1.05M | _mm_unpackhi_epi8(*v_m0_b, *v_m1_b)); | 96 | | | 97 | 1.05M | const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding); | 98 | 1.05M | const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding); | 99 | 1.05M | const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w); | 100 | 1.05M | return v_res; | 101 | 1.05M | } |
Unexecuted instantiation: blend_a64_vmask_sse4.c:blend_16_u8 blend_a64_mask_avx2.c:blend_16_u8 Line | Count | Source | 88 | 1.44M | const __m128i *rounding) { | 89 | 1.44M | const __m128i v_s0_b = xx_loadu_128(src0); | 90 | 1.44M | const __m128i v_s1_b = xx_loadu_128(src1); | 91 | | | 92 | 1.44M | const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), | 93 | 1.44M | _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); | 94 | 1.44M | const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b), | 95 | 1.44M | _mm_unpackhi_epi8(*v_m0_b, *v_m1_b)); | 96 | | | 97 | 1.44M | const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding); | 98 | 1.44M | const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding); | 99 | 1.44M | const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w); | 100 | 1.44M | return v_res; | 101 | 1.44M | } |
|
102 | | |
// Pointer to a blend kernel for 16-bit samples: takes two sample pointers and
// two weight vectors (passed by value) and returns the blended lanes. The
// blend_*_b10 and blend_*_b12 helpers in this header have this signature.
typedef __m128i (*blend_unit_fn)(
    const uint16_t *src0,  // first source
    const uint16_t *src1,  // second source
    const __m128i v_m0_w,  // weights for src0
    const __m128i v_m1_w   // weights for src1
);
105 | | |
106 | | static inline __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1, |
107 | 3.42M | const __m128i v_m0_w, const __m128i v_m1_w) { |
108 | 3.42M | const __m128i v_s0_w = xx_loadl_64(src0); |
109 | 3.42M | const __m128i v_s1_w = xx_loadl_64(src1); |
110 | | |
111 | 3.42M | const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); |
112 | 3.42M | const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); |
113 | | |
114 | 3.42M | const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); |
115 | | |
116 | 3.42M | const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); |
117 | | |
118 | 3.42M | return v_res_w; |
119 | 3.42M | } blend_a64_mask_sse4.c:blend_4_b10 Line | Count | Source | 107 | 3.20M | const __m128i v_m0_w, const __m128i v_m1_w) { | 108 | 3.20M | const __m128i v_s0_w = xx_loadl_64(src0); | 109 | 3.20M | const __m128i v_s1_w = xx_loadl_64(src1); | 110 | | | 111 | 3.20M | const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); | 112 | 3.20M | const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); | 113 | | | 114 | 3.20M | const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); | 115 | | | 116 | 3.20M | const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); | 117 | | | 118 | 3.20M | return v_res_w; | 119 | 3.20M | } |
blend_a64_vmask_sse4.c:blend_4_b10 Line | Count | Source | 107 | 222k | const __m128i v_m0_w, const __m128i v_m1_w) { | 108 | 222k | const __m128i v_s0_w = xx_loadl_64(src0); | 109 | 222k | const __m128i v_s1_w = xx_loadl_64(src1); | 110 | | | 111 | 222k | const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); | 112 | 222k | const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); | 113 | | | 114 | 222k | const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); | 115 | | | 116 | 222k | const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); | 117 | | | 118 | 222k | return v_res_w; | 119 | 222k | } |
Unexecuted instantiation: blend_a64_mask_avx2.c:blend_4_b10 |
120 | | |
121 | | static inline __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1, |
122 | 13.9M | const __m128i v_m0_w, const __m128i v_m1_w) { |
123 | 13.9M | const __m128i v_s0_w = xx_loadu_128(src0); |
124 | 13.9M | const __m128i v_s1_w = xx_loadu_128(src1); |
125 | | |
126 | 13.9M | const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); |
127 | 13.9M | const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); |
128 | | |
129 | 13.9M | const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); |
130 | | |
131 | 13.9M | const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); |
132 | | |
133 | 13.9M | return v_res_w; |
134 | 13.9M | } blend_a64_mask_sse4.c:blend_8_b10 Line | Count | Source | 122 | 10.0M | const __m128i v_m0_w, const __m128i v_m1_w) { | 123 | 10.0M | const __m128i v_s0_w = xx_loadu_128(src0); | 124 | 10.0M | const __m128i v_s1_w = xx_loadu_128(src1); | 125 | | | 126 | 10.0M | const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); | 127 | 10.0M | const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); | 128 | | | 129 | 10.0M | const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); | 130 | | | 131 | 10.0M | const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); | 132 | | | 133 | 10.0M | return v_res_w; | 134 | 10.0M | } |
blend_a64_vmask_sse4.c:blend_8_b10 Line | Count | Source | 122 | 3.98M | const __m128i v_m0_w, const __m128i v_m1_w) { | 123 | 3.98M | const __m128i v_s0_w = xx_loadu_128(src0); | 124 | 3.98M | const __m128i v_s1_w = xx_loadu_128(src1); | 125 | | | 126 | 3.98M | const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); | 127 | 3.98M | const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); | 128 | | | 129 | 3.98M | const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); | 130 | | | 131 | 3.98M | const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); | 132 | | | 133 | 3.98M | return v_res_w; | 134 | 3.98M | } |
Unexecuted instantiation: blend_a64_mask_avx2.c:blend_8_b10 |
135 | | |
136 | | static inline __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1, |
137 | 159k | const __m128i v_m0_w, const __m128i v_m1_w) { |
138 | 159k | const __m128i v_s0_w = xx_loadl_64(src0); |
139 | 159k | const __m128i v_s1_w = xx_loadl_64(src1); |
140 | | |
141 | | // Interleave |
142 | 159k | const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); |
143 | 159k | const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); |
144 | | |
145 | | // Multiply-Add |
146 | 159k | const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w); |
147 | | |
148 | | // Scale |
149 | 159k | const __m128i v_ssum_d = |
150 | 159k | _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1); |
151 | | |
152 | | // Pack |
153 | 159k | const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d); |
154 | | |
155 | | // Round |
156 | 159k | const __m128i v_res_w = xx_round_epu16(v_pssum_d); |
157 | | |
158 | 159k | return v_res_w; |
159 | 159k | } blend_a64_mask_sse4.c:blend_4_b12 Line | Count | Source | 137 | 151k | const __m128i v_m0_w, const __m128i v_m1_w) { | 138 | 151k | const __m128i v_s0_w = xx_loadl_64(src0); | 139 | 151k | const __m128i v_s1_w = xx_loadl_64(src1); | 140 | | | 141 | | // Interleave | 142 | 151k | const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); | 143 | 151k | const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); | 144 | | | 145 | | // Multiply-Add | 146 | 151k | const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w); | 147 | | | 148 | | // Scale | 149 | 151k | const __m128i v_ssum_d = | 150 | 151k | _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1); | 151 | | | 152 | | // Pack | 153 | 151k | const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d); | 154 | | | 155 | | // Round | 156 | 151k | const __m128i v_res_w = xx_round_epu16(v_pssum_d); | 157 | | | 158 | 151k | return v_res_w; | 159 | 151k | } |
blend_a64_vmask_sse4.c:blend_4_b12 Line | Count | Source | 137 | 8.02k | const __m128i v_m0_w, const __m128i v_m1_w) { | 138 | 8.02k | const __m128i v_s0_w = xx_loadl_64(src0); | 139 | 8.02k | const __m128i v_s1_w = xx_loadl_64(src1); | 140 | | | 141 | | // Interleave | 142 | 8.02k | const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); | 143 | 8.02k | const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); | 144 | | | 145 | | // Multiply-Add | 146 | 8.02k | const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w); | 147 | | | 148 | | // Scale | 149 | 8.02k | const __m128i v_ssum_d = | 150 | 8.02k | _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1); | 151 | | | 152 | | // Pack | 153 | 8.02k | const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d); | 154 | | | 155 | | // Round | 156 | 8.02k | const __m128i v_res_w = xx_round_epu16(v_pssum_d); | 157 | | | 158 | 8.02k | return v_res_w; | 159 | 8.02k | } |
Unexecuted instantiation: blend_a64_mask_avx2.c:blend_4_b12 |
160 | | |
161 | | static inline __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1, |
162 | 1.01M | const __m128i v_m0_w, const __m128i v_m1_w) { |
163 | 1.01M | const __m128i v_s0_w = xx_loadu_128(src0); |
164 | 1.01M | const __m128i v_s1_w = xx_loadu_128(src1); |
165 | | |
166 | | // Interleave |
167 | 1.01M | const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); |
168 | 1.01M | const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w); |
169 | 1.01M | const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); |
170 | 1.01M | const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w); |
171 | | |
172 | | // Multiply-Add |
173 | 1.01M | const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w); |
174 | 1.01M | const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w); |
175 | | |
176 | | // Scale |
177 | 1.01M | const __m128i v_ssuml_d = |
178 | 1.01M | _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1); |
179 | 1.01M | const __m128i v_ssumh_d = |
180 | 1.01M | _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1); |
181 | | |
182 | | // Pack |
183 | 1.01M | const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d); |
184 | | |
185 | | // Round |
186 | 1.01M | const __m128i v_res_w = xx_round_epu16(v_pssum_d); |
187 | | |
188 | 1.01M | return v_res_w; |
189 | 1.01M | } blend_a64_mask_sse4.c:blend_8_b12 Line | Count | Source | 162 | 659k | const __m128i v_m0_w, const __m128i v_m1_w) { | 163 | 659k | const __m128i v_s0_w = xx_loadu_128(src0); | 164 | 659k | const __m128i v_s1_w = xx_loadu_128(src1); | 165 | | | 166 | | // Interleave | 167 | 659k | const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); | 168 | 659k | const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w); | 169 | 659k | const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); | 170 | 659k | const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w); | 171 | | | 172 | | // Multiply-Add | 173 | 659k | const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w); | 174 | 659k | const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w); | 175 | | | 176 | | // Scale | 177 | 659k | const __m128i v_ssuml_d = | 178 | 659k | _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1); | 179 | 659k | const __m128i v_ssumh_d = | 180 | 659k | _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1); | 181 | | | 182 | | // Pack | 183 | 659k | const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d); | 184 | | | 185 | | // Round | 186 | 659k | const __m128i v_res_w = xx_round_epu16(v_pssum_d); | 187 | | | 188 | 659k | return v_res_w; | 189 | 659k | } |
blend_a64_vmask_sse4.c:blend_8_b12 Line | Count | Source | 162 | 351k | const __m128i v_m0_w, const __m128i v_m1_w) { | 163 | 351k | const __m128i v_s0_w = xx_loadu_128(src0); | 164 | 351k | const __m128i v_s1_w = xx_loadu_128(src1); | 165 | | | 166 | | // Interleave | 167 | 351k | const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); | 168 | 351k | const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w); | 169 | 351k | const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); | 170 | 351k | const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w); | 171 | | | 172 | | // Multiply-Add | 173 | 351k | const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w); | 174 | 351k | const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w); | 175 | | | 176 | | // Scale | 177 | 351k | const __m128i v_ssuml_d = | 178 | 351k | _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1); | 179 | 351k | const __m128i v_ssumh_d = | 180 | 351k | _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1); | 181 | | | 182 | | // Pack | 183 | 351k | const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d); | 184 | | | 185 | | // Round | 186 | 351k | const __m128i v_res_w = xx_round_epu16(v_pssum_d); | 187 | | | 188 | 351k | return v_res_w; | 189 | 351k | } |
Unexecuted instantiation: blend_a64_mask_avx2.c:blend_8_b12 |
190 | | |
191 | | #endif // AOM_AOM_DSP_X86_BLEND_SSE4_H_ |