Coverage Report

Created: 2023-06-07 06:31

/src/aom/aom_dsp/x86/blend_sse4.h
Line
Count
Source
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#ifndef AOM_AOM_DSP_X86_BLEND_SSE4_H_
13
#define AOM_AOM_DSP_X86_BLEND_SSE4_H_
14
15
#include "aom_dsp/blend.h"
16
#include "aom_dsp/x86/synonyms.h"
17
// 32-entry byte index table made of two identical 16-entry runs. Each run
// lists the even indices 0..14 first and then the odd indices 1..15.
// NOTE(review): the name suggests this is a byte-shuffle control that
// de-interleaves even/odd mask bytes; it is not referenced in this header,
// so confirm against the including .c files.
static const uint8_t g_blend_a64_mask_shuffle[32] = {
18
  0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
19
  0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
20
};
21
22
//////////////////////////////////////////////////////////////////////////////
23
// Common kernels
24
//////////////////////////////////////////////////////////////////////////////
25
26
// Blends 8-bit samples from src0 and src1 using 16-bit weights.
//
// src0, src1: source sample pointers, read through xx_loadl_32 (presumably a
//             4-byte load into the low lanes; verify in synonyms.h).
// v_m0_w, v_m1_w: per-lane 16-bit weights applied to src0 and src1.
//
// Returns the per-lane value of
//   xx_roundn_epu16(src0 * m0 + src1 * m1, AOM_BLEND_A64_ROUND_BITS)
// as 16-bit lanes; no narrowing back to bytes is done here.
// NOTE(review): xx_roundn_epu16 is presumably a rounding right shift; confirm.
static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
27
1.12M
                              const __m128i *v_m0_w, const __m128i *v_m1_w) {
28
1.12M
  const __m128i v_s0_b = xx_loadl_32(src0);
29
1.12M
  const __m128i v_s1_b = xx_loadl_32(src1);
30
1.12M
  // Zero-extend the loaded bytes to 16-bit lanes so they can be weighted.
  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
31
1.12M
  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
32
33
1.12M
  // Only the low 16 bits of each product are kept.
  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
34
1.12M
  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
35
1.12M
  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
36
1.12M
  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
37
38
1.12M
  return v_res_w;
39
1.12M
}
Unexecuted instantiation: blend_a64_mask_sse4.c:blend_4
blend_a64_vmask_sse4.c:blend_4
Line
Count
Source
27
1.12M
                              const __m128i *v_m0_w, const __m128i *v_m1_w) {
28
1.12M
  const __m128i v_s0_b = xx_loadl_32(src0);
29
1.12M
  const __m128i v_s1_b = xx_loadl_32(src1);
30
1.12M
  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
31
1.12M
  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
32
33
1.12M
  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
34
1.12M
  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
35
1.12M
  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
36
1.12M
  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
37
38
1.12M
  return v_res_w;
39
1.12M
}
Unexecuted instantiation: blend_a64_mask_avx2.c:blend_4
40
41
// Same computation as blend_4, but the sources are read through xx_loadl_64
// (presumably an 8-byte load; verify in synonyms.h), so all eight 16-bit
// lanes carry samples.
//
// src0, src1: source sample pointers.
// v_m0_w, v_m1_w: per-lane 16-bit weights applied to src0 and src1.
//
// Returns xx_roundn_epu16(src0 * m0 + src1 * m1, AOM_BLEND_A64_ROUND_BITS)
// per 16-bit lane; the result is not packed back to bytes here.
static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
42
13.6M
                              const __m128i *v_m0_w, const __m128i *v_m1_w) {
43
13.6M
  const __m128i v_s0_b = xx_loadl_64(src0);
44
13.6M
  const __m128i v_s1_b = xx_loadl_64(src1);
45
13.6M
  // Zero-extend the low 8 bytes to 16-bit lanes.
  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
46
13.6M
  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
47
48
13.6M
  // Only the low 16 bits of each product are kept.
  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
49
13.6M
  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
50
51
13.6M
  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
52
53
13.6M
  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
54
55
13.6M
  return v_res_w;
56
13.6M
}
Unexecuted instantiation: blend_a64_mask_sse4.c:blend_8
blend_a64_vmask_sse4.c:blend_8
Line
Count
Source
42
13.6M
                              const __m128i *v_m0_w, const __m128i *v_m1_w) {
43
13.6M
  const __m128i v_s0_b = xx_loadl_64(src0);
44
13.6M
  const __m128i v_s1_b = xx_loadl_64(src1);
45
13.6M
  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
46
13.6M
  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
47
48
13.6M
  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
49
13.6M
  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
50
51
13.6M
  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
52
53
13.6M
  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
54
55
13.6M
  return v_res_w;
56
13.6M
}
Unexecuted instantiation: blend_a64_mask_avx2.c:blend_8
57
58
// Blends 8-bit samples from src0 and src1 using byte-sized weights and
// returns the result packed to unsigned bytes.
//
// src0, src1: source sample pointers, read through xx_loadl_32 (presumably a
//             4-byte load; verify in synonyms.h).
// v_m0_b, v_m1_b: byte weights for src0 and src1. _mm_maddubs_epi16 treats
//             its second operand as signed bytes, so this assumes the weights
//             fit in the signed 8-bit range; confirm at the call sites.
// rounding:   multiplier handed to _mm_mulhrs_epi16, which computes
//             ((x * r >> 14) + 1) >> 1 per 16-bit lane.
//             NOTE(review): presumably 1 << (15 - AOM_BLEND_A64_ROUND_BITS),
//             making this a rounding shift; confirm.
//
// The packed bytes appear in both the low and the high 8 bytes of the return
// value, because the same vector is passed to both inputs of the pack.
static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1,
59
                                 const __m128i *v_m0_b, const __m128i *v_m1_b,
60
11.1M
                                 const __m128i *rounding) {
61
11.1M
  const __m128i v_s0_b = xx_loadl_32(src0);
62
11.1M
  const __m128i v_s1_b = xx_loadl_32(src1);
63
64
11.1M
  // Interleave sample bytes (s0, s1, s0, s1, ...) and weight bytes the same
  // way, so each 16-bit lane of the multiply-add is s0 * m0 + s1 * m1.
  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
65
11.1M
                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
66
67
11.1M
  const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
68
11.1M
  // Narrow to unsigned bytes with saturation.
  const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
69
11.1M
  return v_res;
70
11.1M
}
blend_a64_mask_sse4.c:blend_4_u8
Line
Count
Source
60
9.69M
                                 const __m128i *rounding) {
61
9.69M
  const __m128i v_s0_b = xx_loadl_32(src0);
62
9.69M
  const __m128i v_s1_b = xx_loadl_32(src1);
63
64
9.69M
  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
65
9.69M
                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
66
67
9.69M
  const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
68
9.69M
  const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
69
9.69M
  return v_res;
70
9.69M
}
Unexecuted instantiation: blend_a64_vmask_sse4.c:blend_4_u8
blend_a64_mask_avx2.c:blend_4_u8
Line
Count
Source
60
1.41M
                                 const __m128i *rounding) {
61
1.41M
  const __m128i v_s0_b = xx_loadl_32(src0);
62
1.41M
  const __m128i v_s1_b = xx_loadl_32(src1);
63
64
1.41M
  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
65
1.41M
                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
66
67
1.41M
  const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
68
1.41M
  const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
69
1.41M
  return v_res;
70
1.41M
}
71
72
// Same computation as blend_4_u8, but the sources are read through
// xx_loadl_64 (presumably an 8-byte load; verify in synonyms.h), so all eight
// 16-bit lanes of the multiply-add carry samples.
//
// src0, src1: source sample pointers.
// v_m0_b, v_m1_b: byte weights for src0 and src1; the low 8 bytes of each are
//             used. Assumes they fit in the signed 8-bit range expected by
//             the second operand of _mm_maddubs_epi16; confirm at call sites.
// rounding:   multiplier handed to _mm_mulhrs_epi16, which computes
//             ((x * r >> 14) + 1) >> 1 per 16-bit lane.
//
// The packed result bytes appear in both the low and the high 8 bytes of the
// return value.
static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1,
73
                                 const __m128i *v_m0_b, const __m128i *v_m1_b,
74
8.94M
                                 const __m128i *rounding) {
75
8.94M
  const __m128i v_s0_b = xx_loadl_64(src0);
76
8.94M
  const __m128i v_s1_b = xx_loadl_64(src1);
77
78
8.94M
  // Interleaved samples times interleaved weights: each 16-bit lane becomes
  // s0 * m0 + s1 * m1.
  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
79
8.94M
                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
80
81
8.94M
  const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
82
8.94M
  // Narrow to unsigned bytes with saturation.
  const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
83
8.94M
  return v_res;
84
8.94M
}
blend_a64_mask_sse4.c:blend_8_u8
Line
Count
Source
74
5.77M
                                 const __m128i *rounding) {
75
5.77M
  const __m128i v_s0_b = xx_loadl_64(src0);
76
5.77M
  const __m128i v_s1_b = xx_loadl_64(src1);
77
78
5.77M
  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
79
5.77M
                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
80
81
5.77M
  const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
82
5.77M
  const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
83
5.77M
  return v_res;
84
5.77M
}
Unexecuted instantiation: blend_a64_vmask_sse4.c:blend_8_u8
blend_a64_mask_avx2.c:blend_8_u8
Line
Count
Source
74
3.17M
                                 const __m128i *rounding) {
75
3.17M
  const __m128i v_s0_b = xx_loadl_64(src0);
76
3.17M
  const __m128i v_s1_b = xx_loadl_64(src1);
77
78
3.17M
  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
79
3.17M
                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
80
81
3.17M
  const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
82
3.17M
  const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
83
3.17M
  return v_res;
84
3.17M
}
85
86
// 16-sample variant of blend_8_u8: the low and high 8 bytes of the inputs are
// processed separately, and the two results are packed into one vector of 16
// unsigned bytes.
//
// src0, src1: source sample pointers, read through xx_loadu_128 (presumably
//             an unaligned 16-byte load; verify in synonyms.h).
// v_m0_b, v_m1_b: 16 byte weights each for src0 and src1. Assumes they fit in
//             the signed 8-bit range expected by the second operand of
//             _mm_maddubs_epi16; confirm at call sites.
// rounding:   multiplier handed to _mm_mulhrs_epi16, which computes
//             ((x * r >> 14) + 1) >> 1 per 16-bit lane.
static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1,
87
                                  const __m128i *v_m0_b, const __m128i *v_m1_b,
88
4.42M
                                  const __m128i *rounding) {
89
4.42M
  const __m128i v_s0_b = xx_loadu_128(src0);
90
4.42M
  const __m128i v_s1_b = xx_loadu_128(src1);
91
92
4.42M
  // Low 8 samples: each 16-bit lane is s0 * m0 + s1 * m1.
  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
93
4.42M
                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
94
4.42M
  // High 8 samples, same computation.
  const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b),
95
4.42M
                                           _mm_unpackhi_epi8(*v_m0_b, *v_m1_b));
96
97
4.42M
  const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
98
4.42M
  const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding);
99
4.42M
  // Low-half results land in bytes 0..7 and high-half results in bytes 8..15.
  const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w);
100
4.42M
  return v_res;
101
4.42M
}
blend_a64_mask_sse4.c:blend_16_u8
Line
Count
Source
88
2.25M
                                  const __m128i *rounding) {
89
2.25M
  const __m128i v_s0_b = xx_loadu_128(src0);
90
2.25M
  const __m128i v_s1_b = xx_loadu_128(src1);
91
92
2.25M
  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
93
2.25M
                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
94
2.25M
  const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b),
95
2.25M
                                           _mm_unpackhi_epi8(*v_m0_b, *v_m1_b));
96
97
2.25M
  const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
98
2.25M
  const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding);
99
2.25M
  const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w);
100
2.25M
  return v_res;
101
2.25M
}
Unexecuted instantiation: blend_a64_vmask_sse4.c:blend_16_u8
blend_a64_mask_avx2.c:blend_16_u8
Line
Count
Source
88
2.17M
                                  const __m128i *rounding) {
89
2.17M
  const __m128i v_s0_b = xx_loadu_128(src0);
90
2.17M
  const __m128i v_s1_b = xx_loadu_128(src1);
91
92
2.17M
  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
93
2.17M
                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
94
2.17M
  const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b),
95
2.17M
                                           _mm_unpackhi_epi8(*v_m0_b, *v_m1_b));
96
97
2.17M
  const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
98
2.17M
  const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding);
99
2.17M
  const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w);
100
2.17M
  return v_res;
101
2.17M
}
102
103
// Pointer to a 16-bit-sample blend kernel. The blend_4_b10, blend_8_b10,
// blend_4_b12 and blend_8_b12 functions in this header all have this
// signature: two source pointers plus two weight vectors passed by value,
// returning the blended 16-bit lanes.
typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
104
                                 const __m128i v_m0_w, const __m128i v_m1_w);
105
106
// Blends 16-bit samples from src0 and src1 using 16-bit arithmetic only.
//
// src0, src1: source sample pointers, read through xx_loadl_64 (presumably an
//             8-byte load, i.e. four 16-bit samples; verify in synonyms.h).
// v_m0_w, v_m1_w: per-lane 16-bit weights applied to src0 and src1.
//
// Returns xx_roundn_epu16(src0 * m0 + src1 * m1, AOM_BLEND_A64_ROUND_BITS)
// per 16-bit lane.
// NOTE(review): products and their sum are kept in 16 bits, so this relies on
// sample * weight sums not exceeding 16 bits. The "b10" suffix suggests
// samples of at most 10 bits; confirm against the callers.
static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1,
107
2.37M
                                  const __m128i v_m0_w, const __m128i v_m1_w) {
108
2.37M
  const __m128i v_s0_w = xx_loadl_64(src0);
109
2.37M
  const __m128i v_s1_w = xx_loadl_64(src1);
110
111
2.37M
  // Only the low 16 bits of each product are kept.
  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
112
2.37M
  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
113
114
2.37M
  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
115
116
2.37M
  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
117
118
2.37M
  return v_res_w;
119
2.37M
}
blend_a64_mask_sse4.c:blend_4_b10
Line
Count
Source
107
2.22M
                                  const __m128i v_m0_w, const __m128i v_m1_w) {
108
2.22M
  const __m128i v_s0_w = xx_loadl_64(src0);
109
2.22M
  const __m128i v_s1_w = xx_loadl_64(src1);
110
111
2.22M
  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
112
2.22M
  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
113
114
2.22M
  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
115
116
2.22M
  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
117
118
2.22M
  return v_res_w;
119
2.22M
}
blend_a64_vmask_sse4.c:blend_4_b10
Line
Count
Source
107
146k
                                  const __m128i v_m0_w, const __m128i v_m1_w) {
108
146k
  const __m128i v_s0_w = xx_loadl_64(src0);
109
146k
  const __m128i v_s1_w = xx_loadl_64(src1);
110
111
146k
  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
112
146k
  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
113
114
146k
  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
115
116
146k
  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
117
118
146k
  return v_res_w;
119
146k
}
Unexecuted instantiation: blend_a64_mask_avx2.c:blend_4_b10
120
121
// Same computation as blend_4_b10, but the sources are read through
// xx_loadu_128 (presumably an unaligned 16-byte load, i.e. eight 16-bit
// samples; verify in synonyms.h).
//
// src0, src1: source sample pointers.
// v_m0_w, v_m1_w: per-lane 16-bit weights applied to src0 and src1.
//
// Returns xx_roundn_epu16(src0 * m0 + src1 * m1, AOM_BLEND_A64_ROUND_BITS)
// per 16-bit lane.
// NOTE(review): all arithmetic stays in 16 bits, so the weighted sum must fit
// in 16 bits; presumably guaranteed by 10-bit samples. Confirm with callers.
static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1,
122
8.57M
                                  const __m128i v_m0_w, const __m128i v_m1_w) {
123
8.57M
  const __m128i v_s0_w = xx_loadu_128(src0);
124
8.57M
  const __m128i v_s1_w = xx_loadu_128(src1);
125
126
8.57M
  // Only the low 16 bits of each product are kept.
  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
127
8.57M
  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
128
129
8.57M
  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
130
131
8.57M
  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
132
133
8.57M
  return v_res_w;
134
8.57M
}
blend_a64_mask_sse4.c:blend_8_b10
Line
Count
Source
122
5.95M
                                  const __m128i v_m0_w, const __m128i v_m1_w) {
123
5.95M
  const __m128i v_s0_w = xx_loadu_128(src0);
124
5.95M
  const __m128i v_s1_w = xx_loadu_128(src1);
125
126
5.95M
  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
127
5.95M
  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
128
129
5.95M
  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
130
131
5.95M
  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
132
133
5.95M
  return v_res_w;
134
5.95M
}
blend_a64_vmask_sse4.c:blend_8_b10
Line
Count
Source
122
2.61M
                                  const __m128i v_m0_w, const __m128i v_m1_w) {
123
2.61M
  const __m128i v_s0_w = xx_loadu_128(src0);
124
2.61M
  const __m128i v_s1_w = xx_loadu_128(src1);
125
126
2.61M
  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
127
2.61M
  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
128
129
2.61M
  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
130
131
2.61M
  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
132
133
2.61M
  return v_res_w;
134
2.61M
}
Unexecuted instantiation: blend_a64_mask_avx2.c:blend_8_b10
135
136
// Blends 16-bit samples from src0 and src1, accumulating the weighted sum in
// 32-bit lanes (unlike the b10 kernels, which stay in 16 bits).
//
// src0, src1: source sample pointers, read through xx_loadl_64 (presumably an
//             8-byte load, i.e. four 16-bit samples; verify in synonyms.h).
// v_m0_w, v_m1_w: per-lane 16-bit weights; only the low four lanes are used.
//
// The 32-bit sum is shifted right by AOM_BLEND_A64_ROUND_BITS - 1, narrowed
// to 16 bits, and passed to xx_round_epu16.
// NOTE(review): xx_round_epu16 presumably performs the final one-bit rounding
// shift; confirm in synonyms.h.
// The narrowed lanes appear in both halves of the return value, because the
// same vector is passed to both inputs of the pack.
static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1,
137
116k
                                  const __m128i v_m0_w, const __m128i v_m1_w) {
138
116k
  const __m128i v_s0_w = xx_loadl_64(src0);
139
116k
  const __m128i v_s1_w = xx_loadl_64(src1);
140
141
  // Interleave
142
116k
  const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
143
116k
  const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
144
145
  // Multiply-Add
146
116k
  // Each 32-bit lane becomes s0 * m0 + s1 * m1 for one sample position.
  const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
147
148
  // Scale
149
116k
  const __m128i v_ssum_d =
150
116k
      _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1);
151
152
  // Pack
153
116k
  // Signed-saturating narrow from 32-bit to 16-bit lanes.
  const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
154
155
  // Round
156
116k
  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
157
158
116k
  return v_res_w;
159
116k
}
blend_a64_mask_sse4.c:blend_4_b12
Line
Count
Source
137
109k
                                  const __m128i v_m0_w, const __m128i v_m1_w) {
138
109k
  const __m128i v_s0_w = xx_loadl_64(src0);
139
109k
  const __m128i v_s1_w = xx_loadl_64(src1);
140
141
  // Interleave
142
109k
  const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
143
109k
  const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
144
145
  // Multiply-Add
146
109k
  const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
147
148
  // Scale
149
109k
  const __m128i v_ssum_d =
150
109k
      _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1);
151
152
  // Pack
153
109k
  const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
154
155
  // Round
156
109k
  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
157
158
109k
  return v_res_w;
159
109k
}
blend_a64_vmask_sse4.c:blend_4_b12
Line
Count
Source
137
7.46k
                                  const __m128i v_m0_w, const __m128i v_m1_w) {
138
7.46k
  const __m128i v_s0_w = xx_loadl_64(src0);
139
7.46k
  const __m128i v_s1_w = xx_loadl_64(src1);
140
141
  // Interleave
142
7.46k
  const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
143
7.46k
  const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
144
145
  // Multiply-Add
146
7.46k
  const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
147
148
  // Scale
149
7.46k
  const __m128i v_ssum_d =
150
7.46k
      _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1);
151
152
  // Pack
153
7.46k
  const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
154
155
  // Round
156
7.46k
  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
157
158
7.46k
  return v_res_w;
159
7.46k
}
Unexecuted instantiation: blend_a64_mask_avx2.c:blend_4_b12
160
161
// Eight-sample variant of blend_4_b12: the low and high four lanes are
// processed separately in 32-bit precision and then packed into one vector of
// eight 16-bit results.
//
// src0, src1: source sample pointers, read through xx_loadu_128 (presumably
//             an unaligned 16-byte load, i.e. eight 16-bit samples; verify in
//             synonyms.h).
// v_m0_w, v_m1_w: per-lane 16-bit weights applied to src0 and src1.
//
// Each 32-bit sum is shifted right by AOM_BLEND_A64_ROUND_BITS - 1, narrowed
// to 16 bits, and passed to xx_round_epu16.
// NOTE(review): xx_round_epu16 presumably performs the final one-bit rounding
// shift; confirm in synonyms.h.
static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
162
848k
                                  const __m128i v_m0_w, const __m128i v_m1_w) {
163
848k
  const __m128i v_s0_w = xx_loadu_128(src0);
164
848k
  const __m128i v_s1_w = xx_loadu_128(src1);
165
166
  // Interleave
167
848k
  const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
168
848k
  const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
169
848k
  const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
170
848k
  const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
171
172
  // Multiply-Add
173
848k
  // Each 32-bit lane becomes s0 * m0 + s1 * m1 for one sample position.
  const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
174
848k
  const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
175
176
  // Scale
177
848k
  const __m128i v_ssuml_d =
178
848k
      _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1);
179
848k
  const __m128i v_ssumh_d =
180
848k
      _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1);
181
182
  // Pack
183
848k
  // Signed-saturating narrow; low-half results fill lanes 0..3 and high-half
  // results fill lanes 4..7.
  const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
184
185
  // Round
186
848k
  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
187
188
848k
  return v_res_w;
189
848k
}
blend_a64_mask_sse4.c:blend_8_b12
Line
Count
Source
162
587k
                                  const __m128i v_m0_w, const __m128i v_m1_w) {
163
587k
  const __m128i v_s0_w = xx_loadu_128(src0);
164
587k
  const __m128i v_s1_w = xx_loadu_128(src1);
165
166
  // Interleave
167
587k
  const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
168
587k
  const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
169
587k
  const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
170
587k
  const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
171
172
  // Multiply-Add
173
587k
  const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
174
587k
  const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
175
176
  // Scale
177
587k
  const __m128i v_ssuml_d =
178
587k
      _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1);
179
587k
  const __m128i v_ssumh_d =
180
587k
      _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1);
181
182
  // Pack
183
587k
  const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
184
185
  // Round
186
587k
  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
187
188
587k
  return v_res_w;
189
587k
}
blend_a64_vmask_sse4.c:blend_8_b12
Line
Count
Source
162
261k
                                  const __m128i v_m0_w, const __m128i v_m1_w) {
163
261k
  const __m128i v_s0_w = xx_loadu_128(src0);
164
261k
  const __m128i v_s1_w = xx_loadu_128(src1);
165
166
  // Interleave
167
261k
  const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
168
261k
  const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
169
261k
  const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
170
261k
  const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
171
172
  // Multiply-Add
173
261k
  const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
174
261k
  const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
175
176
  // Scale
177
261k
  const __m128i v_ssuml_d =
178
261k
      _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1);
179
261k
  const __m128i v_ssumh_d =
180
261k
      _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1);
181
182
  // Pack
183
261k
  const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
184
185
  // Round
186
261k
  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
187
188
261k
  return v_res_w;
189
261k
}
Unexecuted instantiation: blend_a64_mask_avx2.c:blend_8_b12
190
191
#endif  // AOM_AOM_DSP_X86_BLEND_SSE4_H_