/src/aom/aom_dsp/x86/blend_mask_sse4.h
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /*  | 
2  |  |  * Copyright (c) 2018, Alliance for Open Media. All rights reserved.  | 
3  |  |  *  | 
4  |  |  * This source code is subject to the terms of the BSD 2 Clause License and  | 
5  |  |  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License  | 
6  |  |  * was not distributed with this source code in the LICENSE file, you can  | 
7  |  |  * obtain it at www.aomedia.org/license/software. If the Alliance for Open  | 
8  |  |  * Media Patent License 1.0 was not distributed with this source code in the  | 
9  |  |  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.  | 
10  |  |  */  | 
11  |  |  | 
12  |  | #ifndef AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_  | 
13  |  | #define AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_  | 
14  |  | #include <smmintrin.h>  // SSE4.1  | 
15  |  |  | 
16  |  | #include <assert.h>  | 
17  |  |  | 
18  |  | #include "aom/aom_integer.h"  | 
19  |  | #include "aom_ports/mem.h"  | 
20  |  | #include "aom_dsp/aom_dsp_common.h"  | 
21  |  | #include "aom_dsp/blend.h"  | 
22  |  |  | 
23  |  | #include "aom_dsp/x86/synonyms.h"  | 
24  |  |  | 
25  |  | #include "config/aom_dsp_rtcd.h"  | 
26  |  |  | 
27  |  | static inline void blend_a64_d16_mask_w4_sse41(  | 
28  |  |     uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,  | 
29  |  |     const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,  | 
30  | 193k  |     int shift) { | 
31  | 193k  |   const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);  | 
32  | 193k  |   const __m128i s0 = xx_loadl_64(src0);  | 
33  | 193k  |   const __m128i s1 = xx_loadl_64(src1);  | 
34  | 193k  |   const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1);  | 
35  | 193k  |   const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m);  | 
36  | 193k  |   const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m);  | 
37  | 193k  |   const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset);  | 
38  | 193k  |   const __m128i res_d = _mm_srai_epi32(res_c, shift);  | 
39  | 193k  |   const __m128i res_e = _mm_packs_epi32(res_d, res_d);  | 
40  | 193k  |   const __m128i res = _mm_packus_epi16(res_e, res_e);  | 
41  |  |  | 
42  | 193k  |   xx_storel_32(dst, res);  | 
43  | 193k  | } Unexecuted instantiation: blend_a64_mask_sse4.c:blend_a64_d16_mask_w4_sse41 blend_a64_mask_avx2.c:blend_a64_d16_mask_w4_sse41 Line  | Count  | Source  |  30  | 193k  |     int shift) { |  31  | 193k  |   const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);  |  32  | 193k  |   const __m128i s0 = xx_loadl_64(src0);  |  33  | 193k  |   const __m128i s1 = xx_loadl_64(src1);  |  34  | 193k  |   const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1);  |  35  | 193k  |   const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m);  |  36  | 193k  |   const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m);  |  37  | 193k  |   const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset);  |  38  | 193k  |   const __m128i res_d = _mm_srai_epi32(res_c, shift);  |  39  | 193k  |   const __m128i res_e = _mm_packs_epi32(res_d, res_d);  |  40  | 193k  |   const __m128i res = _mm_packus_epi16(res_e, res_e);  |  41  |  |  |  42  | 193k  |   xx_storel_32(dst, res);  |  43  | 193k  | }  |  
  | 
44  |  |  | 
45  |  | static inline void blend_a64_d16_mask_w8_sse41(  | 
46  |  |     uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,  | 
47  |  |     const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,  | 
48  | 615k  |     int shift) { | 
49  | 615k  |   const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);  | 
50  | 615k  |   const __m128i s0 = xx_loadu_128(src0);  | 
51  | 615k  |   const __m128i s1 = xx_loadu_128(src1);  | 
52  | 615k  |   __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1),  | 
53  | 615k  |                                   _mm_unpacklo_epi16(*m, max_minus_m));  | 
54  | 615k  |   __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1),  | 
55  | 615k  |                                   _mm_unpackhi_epi16(*m, max_minus_m));  | 
56  | 615k  |   res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift);  | 
57  | 615k  |   res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift);  | 
58  | 615k  |   const __m128i res_e = _mm_packs_epi32(res_lo, res_hi);  | 
59  | 615k  |   const __m128i res = _mm_packus_epi16(res_e, res_e);  | 
60  |  |  | 
61  | 615k  |   _mm_storel_epi64((__m128i *)(dst), res);  | 
62  | 615k  | } Unexecuted instantiation: blend_a64_mask_sse4.c:blend_a64_d16_mask_w8_sse41 blend_a64_mask_avx2.c:blend_a64_d16_mask_w8_sse41 Line  | Count  | Source  |  48  | 615k  |     int shift) { |  49  | 615k  |   const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);  |  50  | 615k  |   const __m128i s0 = xx_loadu_128(src0);  |  51  | 615k  |   const __m128i s1 = xx_loadu_128(src1);  |  52  | 615k  |   __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1),  |  53  | 615k  |                                   _mm_unpacklo_epi16(*m, max_minus_m));  |  54  | 615k  |   __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1),  |  55  | 615k  |                                   _mm_unpackhi_epi16(*m, max_minus_m));  |  56  | 615k  |   res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift);  |  57  | 615k  |   res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift);  |  58  | 615k  |   const __m128i res_e = _mm_packs_epi32(res_lo, res_hi);  |  59  | 615k  |   const __m128i res = _mm_packus_epi16(res_e, res_e);  |  60  |  |  |  61  | 615k  |   _mm_storel_epi64((__m128i *)(dst), res);  |  62  | 615k  | }  |  
  | 
63  |  |  | 
64  |  | static inline void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(  | 
65  |  |     uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,  | 
66  |  |     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,  | 
67  |  |     const uint8_t *mask, uint32_t mask_stride, int h,  | 
68  | 0  |     const __m128i *round_offset, int shift) { | 
69  | 0  |   const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);  | 
70  | 0  |   for (int i = 0; i < h; ++i) { | 
71  | 0  |     const __m128i m0 = xx_loadl_32(mask);  | 
72  | 0  |     const __m128i m = _mm_cvtepu8_epi16(m0);  | 
73  |  | 
  | 
74  | 0  |     blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,  | 
75  | 0  |                                 shift);  | 
76  | 0  |     mask += mask_stride;  | 
77  | 0  |     dst += dst_stride;  | 
78  | 0  |     src0 += src0_stride;  | 
79  | 0  |     src1 += src1_stride;  | 
80  | 0  |   }  | 
81  | 0  | } Unexecuted instantiation: blend_a64_mask_sse4.c:aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1 Unexecuted instantiation: blend_a64_mask_avx2.c:aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1  | 
82  |  |  | 
83  |  | static inline void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(  | 
84  |  |     uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,  | 
85  |  |     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,  | 
86  |  |     const uint8_t *mask, uint32_t mask_stride, int h,  | 
87  | 16.4k  |     const __m128i *round_offset, int shift) { | 
88  | 16.4k  |   const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);  | 
89  | 218k  |   for (int i = 0; i < h; ++i) { | 
90  | 201k  |     const __m128i m0 = xx_loadl_64(mask);  | 
91  | 201k  |     const __m128i m = _mm_cvtepu8_epi16(m0);  | 
92  | 201k  |     blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,  | 
93  | 201k  |                                 shift);  | 
94  | 201k  |     mask += mask_stride;  | 
95  | 201k  |     dst += dst_stride;  | 
96  | 201k  |     src0 += src0_stride;  | 
97  | 201k  |     src1 += src1_stride;  | 
98  | 201k  |   }  | 
99  | 16.4k  | } Unexecuted instantiation: blend_a64_mask_sse4.c:aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1 blend_a64_mask_avx2.c:aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1 Line  | Count  | Source  |  87  | 16.4k  |     const __m128i *round_offset, int shift) { |  88  | 16.4k  |   const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);  |  89  | 218k  |   for (int i = 0; i < h; ++i) { |  90  | 201k  |     const __m128i m0 = xx_loadl_64(mask);  |  91  | 201k  |     const __m128i m = _mm_cvtepu8_epi16(m0);  |  92  | 201k  |     blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,  |  93  | 201k  |                                 shift);  |  94  | 201k  |     mask += mask_stride;  |  95  | 201k  |     dst += dst_stride;  |  96  | 201k  |     src0 += src0_stride;  |  97  | 201k  |     src1 += src1_stride;  |  98  | 201k  |   }  |  99  | 16.4k  | }  |  
  | 
100  |  |  | 
101  |  | static inline void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(  | 
102  |  |     uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,  | 
103  |  |     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,  | 
104  |  |     const uint8_t *mask, uint32_t mask_stride, int h,  | 
105  | 31.5k  |     const __m128i *round_offset, int shift) { | 
106  | 31.5k  |   const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);  | 
107  | 31.5k  |   const __m128i one_b = _mm_set1_epi8(1);  | 
108  | 31.5k  |   const __m128i two_w = _mm_set1_epi16(2);  | 
109  | 224k  |   for (int i = 0; i < h; ++i) { | 
110  | 193k  |     const __m128i m_i0 = xx_loadl_64(mask);  | 
111  | 193k  |     const __m128i m_i1 = xx_loadl_64(mask + mask_stride);  | 
112  | 193k  |     const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);  | 
113  | 193k  |     const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);  | 
114  | 193k  |     const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);  | 
115  | 193k  |     const __m128i m = _mm_srli_epi16(m_acbd_2, 2);  | 
116  |  |  | 
117  | 193k  |     blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,  | 
118  | 193k  |                                 shift);  | 
119  | 193k  |     mask += mask_stride << 1;  | 
120  | 193k  |     dst += dst_stride;  | 
121  | 193k  |     src0 += src0_stride;  | 
122  | 193k  |     src1 += src1_stride;  | 
123  | 193k  |   }  | 
124  | 31.5k  | } Unexecuted instantiation: blend_a64_mask_sse4.c:aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1 blend_a64_mask_avx2.c:aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1 Line  | Count  | Source  |  105  | 31.5k  |     const __m128i *round_offset, int shift) { |  106  | 31.5k  |   const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);  |  107  | 31.5k  |   const __m128i one_b = _mm_set1_epi8(1);  |  108  | 31.5k  |   const __m128i two_w = _mm_set1_epi16(2);  |  109  | 224k  |   for (int i = 0; i < h; ++i) { |  110  | 193k  |     const __m128i m_i0 = xx_loadl_64(mask);  |  111  | 193k  |     const __m128i m_i1 = xx_loadl_64(mask + mask_stride);  |  112  | 193k  |     const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);  |  113  | 193k  |     const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);  |  114  | 193k  |     const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);  |  115  | 193k  |     const __m128i m = _mm_srli_epi16(m_acbd_2, 2);  |  116  |  |  |  117  | 193k  |     blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,  |  118  | 193k  |                                 shift);  |  119  | 193k  |     mask += mask_stride << 1;  |  120  | 193k  |     dst += dst_stride;  |  121  | 193k  |     src0 += src0_stride;  |  122  | 193k  |     src1 += src1_stride;  |  123  | 193k  |   }  |  124  | 31.5k  | }  |  
  | 
125  |  |  | 
126  |  | static inline void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(  | 
127  |  |     uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,  | 
128  |  |     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,  | 
129  |  |     const uint8_t *mask, uint32_t mask_stride, int h,  | 
130  | 30.9k  |     const __m128i *round_offset, int shift) { | 
131  | 30.9k  |   const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);  | 
132  | 30.9k  |   const __m128i one_b = _mm_set1_epi8(1);  | 
133  | 30.9k  |   const __m128i two_w = _mm_set1_epi16(2);  | 
134  | 443k  |   for (int i = 0; i < h; ++i) { | 
135  | 412k  |     const __m128i m_i0 = xx_loadu_128(mask);  | 
136  | 412k  |     const __m128i m_i1 = xx_loadu_128(mask + mask_stride);  | 
137  | 412k  |     const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);  | 
138  | 412k  |     const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);  | 
139  | 412k  |     const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);  | 
140  | 412k  |     const __m128i m = _mm_srli_epi16(m_acbd_2, 2);  | 
141  |  |  | 
142  | 412k  |     blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,  | 
143  | 412k  |                                 shift);  | 
144  | 412k  |     mask += mask_stride << 1;  | 
145  | 412k  |     dst += dst_stride;  | 
146  | 412k  |     src0 += src0_stride;  | 
147  | 412k  |     src1 += src1_stride;  | 
148  | 412k  |   }  | 
149  | 30.9k  | } Unexecuted instantiation: blend_a64_mask_sse4.c:aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1 blend_a64_mask_avx2.c:aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1 Line  | Count  | Source  |  130  | 30.9k  |     const __m128i *round_offset, int shift) { |  131  | 30.9k  |   const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);  |  132  | 30.9k  |   const __m128i one_b = _mm_set1_epi8(1);  |  133  | 30.9k  |   const __m128i two_w = _mm_set1_epi16(2);  |  134  | 443k  |   for (int i = 0; i < h; ++i) { |  135  | 412k  |     const __m128i m_i0 = xx_loadu_128(mask);  |  136  | 412k  |     const __m128i m_i1 = xx_loadu_128(mask + mask_stride);  |  137  | 412k  |     const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);  |  138  | 412k  |     const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);  |  139  | 412k  |     const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);  |  140  | 412k  |     const __m128i m = _mm_srli_epi16(m_acbd_2, 2);  |  141  |  |  |  142  | 412k  |     blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,  |  143  | 412k  |                                 shift);  |  144  | 412k  |     mask += mask_stride << 1;  |  145  | 412k  |     dst += dst_stride;  |  146  | 412k  |     src0 += src0_stride;  |  147  | 412k  |     src1 += src1_stride;  |  148  | 412k  |   }  |  149  | 30.9k  | }  |  
  | 
150  |  |  | 
151  |  | static inline void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(  | 
152  |  |     uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,  | 
153  |  |     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,  | 
154  |  |     const uint8_t *mask, uint32_t mask_stride, int h,  | 
155  | 36  |     const __m128i *round_offset, int shift) { | 
156  | 36  |   const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);  | 
157  | 36  |   const __m128i one_b = _mm_set1_epi8(1);  | 
158  | 36  |   const __m128i zeros = _mm_setzero_si128();  | 
159  | 324  |   for (int i = 0; i < h; ++i) { | 
160  | 288  |     const __m128i m_i0 = xx_loadl_64(mask);  | 
161  | 288  |     const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);  | 
162  | 288  |     const __m128i m = _mm_avg_epu16(m_ac, zeros);  | 
163  |  |  | 
164  | 288  |     blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,  | 
165  | 288  |                                 shift);  | 
166  | 288  |     mask += mask_stride;  | 
167  | 288  |     dst += dst_stride;  | 
168  | 288  |     src0 += src0_stride;  | 
169  | 288  |     src1 += src1_stride;  | 
170  | 288  |   }  | 
171  | 36  | } Unexecuted instantiation: blend_a64_mask_sse4.c:aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1 blend_a64_mask_avx2.c:aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1 Line  | Count  | Source  |  155  | 36  |     const __m128i *round_offset, int shift) { |  156  | 36  |   const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);  |  157  | 36  |   const __m128i one_b = _mm_set1_epi8(1);  |  158  | 36  |   const __m128i zeros = _mm_setzero_si128();  |  159  | 324  |   for (int i = 0; i < h; ++i) { |  160  | 288  |     const __m128i m_i0 = xx_loadl_64(mask);  |  161  | 288  |     const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);  |  162  | 288  |     const __m128i m = _mm_avg_epu16(m_ac, zeros);  |  163  |  |  |  164  | 288  |     blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,  |  165  | 288  |                                 shift);  |  166  | 288  |     mask += mask_stride;  |  167  | 288  |     dst += dst_stride;  |  168  | 288  |     src0 += src0_stride;  |  169  | 288  |     src1 += src1_stride;  |  170  | 288  |   }  |  171  | 36  | }  |  
  | 
172  |  |  | 
173  |  | static inline void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(  | 
174  |  |     uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,  | 
175  |  |     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,  | 
176  |  |     const uint8_t *mask, uint32_t mask_stride, int h,  | 
177  | 50  |     const __m128i *round_offset, int shift) { | 
178  | 50  |   const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);  | 
179  | 50  |   const __m128i one_b = _mm_set1_epi8(1);  | 
180  | 50  |   const __m128i zeros = _mm_setzero_si128();  | 
181  | 690  |   for (int i = 0; i < h; ++i) { | 
182  | 640  |     const __m128i m_i0 = xx_loadu_128(mask);  | 
183  | 640  |     const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);  | 
184  | 640  |     const __m128i m = _mm_avg_epu16(m_ac, zeros);  | 
185  |  |  | 
186  | 640  |     blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,  | 
187  | 640  |                                 shift);  | 
188  | 640  |     mask += mask_stride;  | 
189  | 640  |     dst += dst_stride;  | 
190  | 640  |     src0 += src0_stride;  | 
191  | 640  |     src1 += src1_stride;  | 
192  | 640  |   }  | 
193  | 50  | } Unexecuted instantiation: blend_a64_mask_sse4.c:aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1 blend_a64_mask_avx2.c:aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1 Line  | Count  | Source  |  177  | 50  |     const __m128i *round_offset, int shift) { |  178  | 50  |   const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);  |  179  | 50  |   const __m128i one_b = _mm_set1_epi8(1);  |  180  | 50  |   const __m128i zeros = _mm_setzero_si128();  |  181  | 690  |   for (int i = 0; i < h; ++i) { |  182  | 640  |     const __m128i m_i0 = xx_loadu_128(mask);  |  183  | 640  |     const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);  |  184  | 640  |     const __m128i m = _mm_avg_epu16(m_ac, zeros);  |  185  |  |  |  186  | 640  |     blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,  |  187  | 640  |                                 shift);  |  188  | 640  |     mask += mask_stride;  |  189  | 640  |     dst += dst_stride;  |  190  | 640  |     src0 += src0_stride;  |  191  | 640  |     src1 += src1_stride;  |  192  | 640  |   }  |  193  | 50  | }  |  
  | 
194  |  | static inline void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(  | 
195  |  |     uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,  | 
196  |  |     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,  | 
197  |  |     const uint8_t *mask, uint32_t mask_stride, int h,  | 
198  | 0  |     const __m128i *round_offset, int shift) { | 
199  | 0  |   const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);  | 
200  | 0  |   const __m128i zeros = _mm_setzero_si128();  | 
201  | 0  |   for (int i = 0; i < h; ++i) { | 
202  | 0  |     const __m128i m_i0 = xx_loadl_64(mask);  | 
203  | 0  |     const __m128i m_i1 = xx_loadl_64(mask + mask_stride);  | 
204  | 0  |     const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);  | 
205  | 0  |     const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros));  | 
206  |  | 
  | 
207  | 0  |     blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,  | 
208  | 0  |                                 shift);  | 
209  | 0  |     mask += mask_stride << 1;  | 
210  | 0  |     dst += dst_stride;  | 
211  | 0  |     src0 += src0_stride;  | 
212  | 0  |     src1 += src1_stride;  | 
213  | 0  |   }  | 
214  | 0  | } Unexecuted instantiation: blend_a64_mask_sse4.c:aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1 Unexecuted instantiation: blend_a64_mask_avx2.c:aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1  | 
215  |  |  | 
216  |  | static inline void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(  | 
217  |  |     uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,  | 
218  |  |     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,  | 
219  |  |     const uint8_t *mask, uint32_t mask_stride, int h,  | 
220  | 0  |     const __m128i *round_offset, int shift) { | 
221  | 0  |   const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);  | 
222  | 0  |   const __m128i zeros = _mm_setzero_si128();  | 
223  | 0  |   for (int i = 0; i < h; ++i) { | 
224  | 0  |     const __m128i m_i0 = xx_loadl_64(mask);  | 
225  | 0  |     const __m128i m_i1 = xx_loadl_64(mask + mask_stride);  | 
226  | 0  |     const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);  | 
227  | 0  |     const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros));  | 
228  |  | 
  | 
229  | 0  |     blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,  | 
230  | 0  |                                 shift);  | 
231  | 0  |     mask += mask_stride << 1;  | 
232  | 0  |     dst += dst_stride;  | 
233  | 0  |     src0 += src0_stride;  | 
234  | 0  |     src1 += src1_stride;  | 
235  | 0  |   }  | 
236  | 0  | } Unexecuted instantiation: blend_a64_mask_sse4.c:aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1 Unexecuted instantiation: blend_a64_mask_avx2.c:aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1  | 
237  |  | #endif  // AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_  |