/src/aom/av1/common/x86/highbd_convolve_2d_avx2.c
Line | Count | Source
1 | | /*
2 | | * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3 | | *
4 | | * This source code is subject to the terms of the BSD 2 Clause License and
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 | | * was not distributed with this source code in the LICENSE file, you can
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 | | * Media Patent License 1.0 was not distributed with this source code in the
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 | | */
11 | | |
12 | | #include <immintrin.h> |
13 | | #include <assert.h> |
14 | | |
15 | | #include "config/av1_rtcd.h" |
16 | | |
17 | | #include "aom_dsp/x86/convolve_avx2.h" |
18 | | #include "aom_dsp/x86/synonyms.h" |
19 | | #include "aom_dsp/aom_dsp_common.h" |
20 | | #include "aom_dsp/aom_filter.h" |
21 | | #include "av1/common/convolve.h" |
22 | | |
23 | | void av1_highbd_convolve_2d_sr_ssse3( |
24 | | const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, |
25 | | int h, const InterpFilterParams *filter_params_x, |
26 | | const InterpFilterParams *filter_params_y, const int subpel_x_qn, |
27 | | const int subpel_y_qn, ConvolveParams *conv_params, int bd); |
28 | | |
29 | | void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, |
30 | | uint16_t *dst, int dst_stride, int w, int h, |
31 | | const InterpFilterParams *filter_params_x, |
32 | | const InterpFilterParams *filter_params_y, |
33 | | const int subpel_x_qn, |
34 | | const int subpel_y_qn, |
35 | 2.92M | ConvolveParams *conv_params, int bd) { |
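 | |   // 12-tap filters are handled by the SSSE3 implementation declared above;
 | |   // the AVX2 path below assumes at most 8 taps.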
36 | 2.92M | if (filter_params_x->taps == 12) { |
37 | 0 | av1_highbd_convolve_2d_sr_ssse3(src, src_stride, dst, dst_stride, w, h, |
38 | 0 | filter_params_x, filter_params_y, |
39 | 0 | subpel_x_qn, subpel_y_qn, conv_params, bd); |
40 | 0 | return; |
41 | 0 | } |
42 | | |
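 | |   // Two-pass 2D filter over 8-pixel-wide column strips: the horizontal pass
 | |   // writes rounded intermediates into im_block, and the vertical pass
 | |   // filters im_block down to the final output rows.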
43 | 2.92M | DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); |
44 | 2.92M | int im_h = h + filter_params_y->taps - 1; |
45 | 2.92M | int im_stride = 8; |
46 | 2.92M | int i, j; |
47 | 2.92M | const int fo_vert = filter_params_y->taps / 2 - 1; |
48 | 2.92M | const int fo_horiz = filter_params_x->taps / 2 - 1; |
49 | 2.92M | const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; |
50 | | |
51 | | // Check that, even with 12-bit input, the intermediate values will fit |
52 | | // into an unsigned 16-bit intermediate array. |
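 | |   // With FILTER_BITS == 7 this reduces to bd + 9 - round_0 <= 16, i.e.
 | |   // round_0 >= 5 for 12-bit input and round_0 >= 3 for 10-bit input.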
53 | 2.92M | assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); |
54 | | |
55 | 2.92M | __m256i s[8], coeffs_y[4], coeffs_x[4]; |
56 | | |
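 | |   // Horizontal rounding constant; the (1 << (bd + FILTER_BITS - 1)) term
 | |   // biases the filter output so the stored intermediates stay nonnegative.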
57 | 2.92M | const __m256i round_const_x = _mm256_set1_epi32( |
58 | 2.92M | ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); |
59 | 2.92M | const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); |
60 | | |
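 | |   // Vertical rounding constant; the subtracted term cancels the horizontal
 | |   // bias after it has passed through the vertical filter's 2^FILTER_BITS gain.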
61 | 2.92M | const __m256i round_const_y = _mm256_set1_epi32( |
62 | 2.92M | ((1 << conv_params->round_1) >> 1) - |
63 | 2.92M | (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); |
64 | 2.92M | const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); |
65 | | |
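 | |   // Final rounding shift back to bd precision: the two passes apply
 | |   // 2 * FILTER_BITS of filter gain, of which round_0 + round_1 has already
 | |   // been shifted out.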
66 | 2.92M | const int bits = |
67 | 2.92M | FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; |
68 | 2.92M | const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); |
69 | 2.92M | const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); |
70 | 2.92M | const __m256i clip_pixel = |
71 | 2.92M | _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
72 | 2.92M | const __m256i zero = _mm256_setzero_si256(); |
73 | | |
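 | |   // prepare_coeffs() broadcasts the taps as (2k, 2k + 1) pairs to match the
 | |   // pair-wise multiply-add layout consumed by convolve().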
74 | 2.92M | prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); |
75 | 2.92M | prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); |
76 | | |
77 | 6.84M | for (j = 0; j < w; j += 8) { |
78 | | /* Horizontal filter */ |
79 | 3.91M | { |
80 | 43.6M | for (i = 0; i < im_h; i += 2) { |
81 | 39.7M | const __m256i row0 = |
82 | 39.7M | _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); |
83 | 39.7M | __m256i row1 = _mm256_setzero_si256(); |
84 | 39.7M | if (i + 1 < im_h) |
85 | 35.8M | row1 = |
86 | 35.8M | _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); |
87 | | |
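 | |         // r0/r1 regroup the two rows so that, per 128-bit lane, the alignr
 | |         // shifts below see row i (lane 0) and row i + 1 (lane 1) as runs of
 | |         // consecutive pixels.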
88 | 39.7M | const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); |
89 | 39.7M | const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); |
90 | | |
91 | |         // even pixels: byte shifts 0/4/8/12 are pixel offsets 0/2/4/6; each
 | |         // s[k] pairs pixels for the multiply-add against tap pair (2k, 2k+1).
92 | 39.7M | s[0] = _mm256_alignr_epi8(r1, r0, 0); |
93 | 39.7M | s[1] = _mm256_alignr_epi8(r1, r0, 4); |
94 | 39.7M | s[2] = _mm256_alignr_epi8(r1, r0, 8); |
95 | 39.7M | s[3] = _mm256_alignr_epi8(r1, r0, 12); |
96 | | |
97 | 39.7M | __m256i res_even = convolve(s, coeffs_x); |
98 | 39.7M | res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), |
99 | 39.7M | round_shift_x); |
100 | | |
101 | |         // odd pixels: byte shifts 2/6/10/14 are pixel offsets 1/3/5/7.
102 | 39.7M | s[0] = _mm256_alignr_epi8(r1, r0, 2); |
103 | 39.7M | s[1] = _mm256_alignr_epi8(r1, r0, 6); |
104 | 39.7M | s[2] = _mm256_alignr_epi8(r1, r0, 10); |
105 | 39.7M | s[3] = _mm256_alignr_epi8(r1, r0, 14); |
106 | | |
107 | 39.7M | __m256i res_odd = convolve(s, coeffs_x); |
108 | 39.7M | res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), |
109 | 39.7M | round_shift_x); |
110 | | |
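 | |         // Pack the 32-bit sums to 16 bits and re-interleave even/odd results
 | |         // so im_block stores the pixels in left-to-right order.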
111 | 39.7M | __m256i res_even1 = _mm256_packs_epi32(res_even, res_even); |
112 | 39.7M | __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd); |
113 | 39.7M | __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1); |
114 | | |
115 | 39.7M | _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); |
116 | 39.7M | } |
117 | 3.91M | } |
118 | | |
119 | | /* Vertical filter */ |
120 | 3.91M | { |
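 | |       // Prime the 8-row sliding window with the first six intermediate rows;
 | |       // each s[] entry interleaves two adjacent rows for the pair-wise
 | |       // multiply-add in convolve().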
121 | 3.91M | __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); |
122 | 3.91M | __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); |
123 | 3.91M | __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); |
124 | 3.91M | __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); |
125 | 3.91M | __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); |
126 | 3.91M | __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); |
127 | | |
128 | 3.91M | s[0] = _mm256_unpacklo_epi16(s0, s1); |
129 | 3.91M | s[1] = _mm256_unpacklo_epi16(s2, s3); |
130 | 3.91M | s[2] = _mm256_unpacklo_epi16(s4, s5); |
131 | | |
132 | 3.91M | s[4] = _mm256_unpackhi_epi16(s0, s1); |
133 | 3.91M | s[5] = _mm256_unpackhi_epi16(s2, s3); |
134 | 3.91M | s[6] = _mm256_unpackhi_epi16(s4, s5); |
135 | | |
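 | |       // Filter two output rows per iteration, fetching rows 6 and 7 of the
 | |       // window from im_block.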
136 | 27.6M | for (i = 0; i < h; i += 2) { |
137 | 23.7M | const int16_t *data = &im_block[i * im_stride]; |
138 | | |
139 | 23.7M | const __m256i s6 = |
140 | 23.7M | _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); |
141 | 23.7M | const __m256i s7 = |
142 | 23.7M | _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); |
143 | | |
144 | 23.7M | s[3] = _mm256_unpacklo_epi16(s6, s7); |
145 | 23.7M | s[7] = _mm256_unpackhi_epi16(s6, s7); |
146 | | |
147 | 23.7M | const __m256i res_a = convolve(s, coeffs_y); |
148 | 23.7M | __m256i res_a_round = _mm256_sra_epi32( |
149 | 23.7M | _mm256_add_epi32(res_a, round_const_y), round_shift_y); |
150 | | |
151 | 23.7M | res_a_round = _mm256_sra_epi32( |
152 | 23.7M | _mm256_add_epi32(res_a_round, round_const_bits), round_shift_bits); |
153 | | |
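 | |         // Store width depends on the remaining columns: full 8-pixel rows,
 | |         // 64 bits per row for w == 4, or 32 bits for the w == 2 tail.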
154 | 23.7M | if (w - j > 4) { |
155 | 19.5M | const __m256i res_b = convolve(s + 4, coeffs_y); |
156 | 19.5M | __m256i res_b_round = _mm256_sra_epi32( |
157 | 19.5M | _mm256_add_epi32(res_b, round_const_y), round_shift_y); |
158 | 19.5M | res_b_round = |
159 | 19.5M | _mm256_sra_epi32(_mm256_add_epi32(res_b_round, round_const_bits), |
160 | 19.5M | round_shift_bits); |
161 | | |
162 | 19.5M | __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); |
163 | 19.5M | res_16bit = _mm256_min_epi16(res_16bit, clip_pixel); |
164 | 19.5M | res_16bit = _mm256_max_epi16(res_16bit, zero); |
165 | | |
166 | 19.5M | _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], |
167 | 19.5M | _mm256_castsi256_si128(res_16bit)); |
168 | 19.5M | _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], |
169 | 19.5M | _mm256_extracti128_si256(res_16bit, 1)); |
170 | 19.5M | } else if (w == 4) { |
171 | 3.24M | res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); |
172 | 3.24M | res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); |
173 | 3.24M | res_a_round = _mm256_max_epi16(res_a_round, zero); |
174 | | |
175 | 3.24M | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], |
176 | 3.24M | _mm256_castsi256_si128(res_a_round)); |
177 | 3.24M | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], |
178 | 3.24M | _mm256_extracti128_si256(res_a_round, 1)); |
179 | 3.24M | } else { |
180 | 930k | res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); |
181 | 930k | res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); |
182 | 930k | res_a_round = _mm256_max_epi16(res_a_round, zero); |
183 | | |
184 | 930k | xx_storel_32(&dst[i * dst_stride + j], |
185 | 930k | _mm256_castsi256_si128(res_a_round)); |
186 | 930k | xx_storel_32(&dst[i * dst_stride + j + dst_stride], |
187 | 930k | _mm256_extracti128_si256(res_a_round, 1)); |
188 | 930k | } |
189 | | |
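 | |         // Slide the window down two rows for the next pair of output rows.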
190 | 23.7M | s[0] = s[1]; |
191 | 23.7M | s[1] = s[2]; |
192 | 23.7M | s[2] = s[3]; |
193 | | |
194 | 23.7M | s[4] = s[5]; |
195 | 23.7M | s[5] = s[6]; |
196 | 23.7M | s[6] = s[7]; |
197 | 23.7M | } |
198 | 3.91M | } |
199 | 3.91M | } |
200 | 2.92M | } |