/src/aom/av1/common/x86/cdef_block_avx2.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2016, Alliance for Open Media. All rights reserved |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
#include <string.h>

#include "aom_dsp/aom_simd.h"
#define SIMD_FUNC(name) name##_avx2
#include "av1/common/cdef_block_simd.h"
15 | | |
/* Byte-shuffle control for _mm256_shuffle_epi8(). Within each 128-bit lane it
   reverses the order of the seven lowest 16-bit elements and leaves the
   highest 16-bit element where it is. The same pattern is repeated for both
   lanes. */
const int shuffle_reg_256bit[8] = {
  /* Low 128-bit lane. */
  0x0b0a0d0c, 0x07060908, 0x03020504, 0x0f0e0100,
  /* High 128-bit lane. */
  0x0b0a0d0c, 0x07060908, 0x03020504, 0x0f0e0100
};
20 | | |
21 | | /* partial A is a 16-bit vector of the form: |
22 | | [x8 - - x1 | x16 - - x9] and partial B has the form: |
23 | | [0 y1 - y7 | 0 y9 - y15]. |
24 | | This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ... |
25 | | (x7^2+y2^7)*C7 + (x8^2+0^2)*C8 on each 128-bit lane. Here the C1..C8 constants |
26 | | are in const1 and const2. */ |
27 | | static INLINE __m256i fold_mul_and_sum_avx2(__m256i *partiala, |
28 | | __m256i *partialb, |
29 | | const __m256i *const1, |
30 | 78.7M | const __m256i *const2) { |
31 | 78.7M | __m256i tmp; |
32 | | /* Reverse partial B. */ |
33 | 78.7M | *partialb = _mm256_shuffle_epi8( |
34 | 78.7M | *partialb, _mm256_loadu_si256((const __m256i *)shuffle_reg_256bit)); |
35 | | |
36 | | /* Interleave the x and y values of identical indices and pair x8 with 0. */ |
37 | 78.7M | tmp = *partiala; |
38 | 78.7M | *partiala = _mm256_unpacklo_epi16(*partiala, *partialb); |
39 | 78.7M | *partialb = _mm256_unpackhi_epi16(tmp, *partialb); |
40 | | |
41 | | /* Square and add the corresponding x and y values. */ |
42 | 78.7M | *partiala = _mm256_madd_epi16(*partiala, *partiala); |
43 | 78.7M | *partialb = _mm256_madd_epi16(*partialb, *partialb); |
44 | | /* Multiply by constant. */ |
45 | 78.7M | *partiala = _mm256_mullo_epi32(*partiala, *const1); |
46 | 78.7M | *partialb = _mm256_mullo_epi32(*partialb, *const2); |
47 | | /* Sum all results. */ |
48 | 78.7M | *partiala = _mm256_add_epi32(*partiala, *partialb); |
49 | 78.7M | return *partiala; |
50 | 78.7M | } |
51 | | |
52 | | static INLINE __m256i hsum4_avx2(__m256i *x0, __m256i *x1, __m256i *x2, |
53 | 26.2M | __m256i *x3) { |
54 | 26.2M | const __m256i t0 = _mm256_unpacklo_epi32(*x0, *x1); |
55 | 26.2M | const __m256i t1 = _mm256_unpacklo_epi32(*x2, *x3); |
56 | 26.2M | const __m256i t2 = _mm256_unpackhi_epi32(*x0, *x1); |
57 | 26.2M | const __m256i t3 = _mm256_unpackhi_epi32(*x2, *x3); |
58 | | |
59 | 26.2M | *x0 = _mm256_unpacklo_epi64(t0, t1); |
60 | 26.2M | *x1 = _mm256_unpackhi_epi64(t0, t1); |
61 | 26.2M | *x2 = _mm256_unpacklo_epi64(t2, t3); |
62 | 26.2M | *x3 = _mm256_unpackhi_epi64(t2, t3); |
63 | 26.2M | return _mm256_add_epi32(_mm256_add_epi32(*x0, *x1), |
64 | 26.2M | _mm256_add_epi32(*x2, *x3)); |
65 | 26.2M | } |
66 | | |
67 | | /* Computes cost for directions 0, 5, 6 and 7. We can call this function again |
68 | | to compute the remaining directions. */ |
69 | | static INLINE __m256i compute_directions_avx2(__m256i *lines, |
70 | | int32_t cost_frist_8x8[4], |
71 | 26.2M | int32_t cost_second_8x8[4]) { |
72 | 26.2M | __m256i partial4a, partial4b, partial5a, partial5b, partial7a, partial7b; |
73 | 26.2M | __m256i partial6; |
74 | 26.2M | __m256i tmp; |
75 | | /* Partial sums for lines 0 and 1. */ |
76 | 26.2M | partial4a = _mm256_slli_si256(lines[0], 14); |
77 | 26.2M | partial4b = _mm256_srli_si256(lines[0], 2); |
78 | 26.2M | partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[1], 12)); |
79 | 26.2M | partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[1], 4)); |
80 | 26.2M | tmp = _mm256_add_epi16(lines[0], lines[1]); |
81 | 26.2M | partial5a = _mm256_slli_si256(tmp, 10); |
82 | 26.2M | partial5b = _mm256_srli_si256(tmp, 6); |
83 | 26.2M | partial7a = _mm256_slli_si256(tmp, 4); |
84 | 26.2M | partial7b = _mm256_srli_si256(tmp, 12); |
85 | 26.2M | partial6 = tmp; |
86 | | |
87 | | /* Partial sums for lines 2 and 3. */ |
88 | 26.2M | partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[2], 10)); |
89 | 26.2M | partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[2], 6)); |
90 | 26.2M | partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[3], 8)); |
91 | 26.2M | partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[3], 8)); |
92 | 26.2M | tmp = _mm256_add_epi16(lines[2], lines[3]); |
93 | 26.2M | partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 8)); |
94 | 26.2M | partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 8)); |
95 | 26.2M | partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 6)); |
96 | 26.2M | partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 10)); |
97 | 26.2M | partial6 = _mm256_add_epi16(partial6, tmp); |
98 | | |
99 | | /* Partial sums for lines 4 and 5. */ |
100 | 26.2M | partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[4], 6)); |
101 | 26.2M | partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[4], 10)); |
102 | 26.2M | partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[5], 4)); |
103 | 26.2M | partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[5], 12)); |
104 | 26.2M | tmp = _mm256_add_epi16(lines[4], lines[5]); |
105 | 26.2M | partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 6)); |
106 | 26.2M | partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 10)); |
107 | 26.2M | partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 8)); |
108 | 26.2M | partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 8)); |
109 | 26.2M | partial6 = _mm256_add_epi16(partial6, tmp); |
110 | | |
111 | | /* Partial sums for lines 6 and 7. */ |
112 | 26.2M | partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[6], 2)); |
113 | 26.2M | partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[6], 14)); |
114 | 26.2M | partial4a = _mm256_add_epi16(partial4a, lines[7]); |
115 | 26.2M | tmp = _mm256_add_epi16(lines[6], lines[7]); |
116 | 26.2M | partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 4)); |
117 | 26.2M | partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 12)); |
118 | 26.2M | partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 10)); |
119 | 26.2M | partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 6)); |
120 | 26.2M | partial6 = _mm256_add_epi16(partial6, tmp); |
121 | | |
122 | 26.2M | const __m256i const_reg_1 = |
123 | 26.2M | _mm256_set_epi32(210, 280, 420, 840, 210, 280, 420, 840); |
124 | 26.2M | const __m256i const_reg_2 = |
125 | 26.2M | _mm256_set_epi32(105, 120, 140, 168, 105, 120, 140, 168); |
126 | 26.2M | const __m256i const_reg_3 = _mm256_set_epi32(210, 420, 0, 0, 210, 420, 0, 0); |
127 | 26.2M | const __m256i const_reg_4 = |
128 | 26.2M | _mm256_set_epi32(105, 105, 105, 140, 105, 105, 105, 140); |
129 | | |
130 | | /* Compute costs in terms of partial sums. */ |
131 | 26.2M | partial4a = |
132 | 26.2M | fold_mul_and_sum_avx2(&partial4a, &partial4b, &const_reg_1, &const_reg_2); |
133 | 26.2M | partial7a = |
134 | 26.2M | fold_mul_and_sum_avx2(&partial7a, &partial7b, &const_reg_3, &const_reg_4); |
135 | 26.2M | partial5a = |
136 | 26.2M | fold_mul_and_sum_avx2(&partial5a, &partial5b, &const_reg_3, &const_reg_4); |
137 | 26.2M | partial6 = _mm256_madd_epi16(partial6, partial6); |
138 | 26.2M | partial6 = _mm256_mullo_epi32(partial6, _mm256_set1_epi32(105)); |
139 | | |
140 | 26.2M | partial4a = hsum4_avx2(&partial4a, &partial5a, &partial6, &partial7a); |
141 | 26.2M | _mm_storeu_si128((__m128i *)cost_frist_8x8, |
142 | 26.2M | _mm256_castsi256_si128(partial4a)); |
143 | 26.2M | _mm_storeu_si128((__m128i *)cost_second_8x8, |
144 | 26.2M | _mm256_extractf128_si256(partial4a, 1)); |
145 | | |
146 | 26.2M | return partial4a; |
147 | 26.2M | } |
148 | | |
149 | | /* transpose and reverse the order of the lines -- equivalent to a 90-degree |
150 | | counter-clockwise rotation of the pixels. */ |
151 | 13.1M | static INLINE void array_reverse_transpose_8x8_avx2(__m256i *in, __m256i *res) { |
152 | 13.1M | const __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]); |
153 | 13.1M | const __m256i tr0_1 = _mm256_unpacklo_epi16(in[2], in[3]); |
154 | 13.1M | const __m256i tr0_2 = _mm256_unpackhi_epi16(in[0], in[1]); |
155 | 13.1M | const __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]); |
156 | 13.1M | const __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]); |
157 | 13.1M | const __m256i tr0_5 = _mm256_unpacklo_epi16(in[6], in[7]); |
158 | 13.1M | const __m256i tr0_6 = _mm256_unpackhi_epi16(in[4], in[5]); |
159 | 13.1M | const __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]); |
160 | | |
161 | 13.1M | const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1); |
162 | 13.1M | const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_4, tr0_5); |
163 | 13.1M | const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1); |
164 | 13.1M | const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_4, tr0_5); |
165 | 13.1M | const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_2, tr0_3); |
166 | 13.1M | const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7); |
167 | 13.1M | const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_2, tr0_3); |
168 | 13.1M | const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7); |
169 | | |
170 | 13.1M | res[7] = _mm256_unpacklo_epi64(tr1_0, tr1_1); |
171 | 13.1M | res[6] = _mm256_unpackhi_epi64(tr1_0, tr1_1); |
172 | 13.1M | res[5] = _mm256_unpacklo_epi64(tr1_2, tr1_3); |
173 | 13.1M | res[4] = _mm256_unpackhi_epi64(tr1_2, tr1_3); |
174 | 13.1M | res[3] = _mm256_unpacklo_epi64(tr1_4, tr1_5); |
175 | 13.1M | res[2] = _mm256_unpackhi_epi64(tr1_4, tr1_5); |
176 | 13.1M | res[1] = _mm256_unpacklo_epi64(tr1_6, tr1_7); |
177 | 13.1M | res[0] = _mm256_unpackhi_epi64(tr1_6, tr1_7); |
178 | 13.1M | } |
179 | | |
180 | | void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2, |
181 | | int stride, int32_t *var_out_1st, |
182 | | int32_t *var_out_2nd, int coeff_shift, |
183 | 13.0M | int *out_dir_1st_8x8, int *out_dir_2nd_8x8) { |
184 | 13.0M | int32_t cost_first_8x8[8]; |
185 | 13.0M | int32_t cost_second_8x8[8]; |
186 | | // Used to store the best cost for 2 8x8's. |
187 | 13.0M | int32_t best_cost[2] = { 0 }; |
188 | | // Best direction for 2 8x8's. |
189 | 13.0M | int best_dir[2] = { 0 }; |
190 | | |
191 | 13.0M | const __m128i const_coeff_shift_reg = _mm_cvtsi32_si128(coeff_shift); |
192 | 13.0M | const __m256i const_128_reg = _mm256_set1_epi16(128); |
193 | 13.0M | __m256i lines[8]; |
194 | 117M | for (int i = 0; i < 8; i++) { |
195 | 104M | const __m128i src_1 = _mm_loadu_si128((const __m128i *)&img1[i * stride]); |
196 | 104M | const __m128i src_2 = _mm_loadu_si128((const __m128i *)&img2[i * stride]); |
197 | | |
198 | 104M | lines[i] = _mm256_insertf128_si256(_mm256_castsi128_si256(src_1), src_2, 1); |
199 | 104M | lines[i] = _mm256_sub_epi16( |
200 | 104M | _mm256_sra_epi16(lines[i], const_coeff_shift_reg), const_128_reg); |
201 | 104M | } |
202 | | |
203 | | /* Compute "mostly vertical" directions. */ |
204 | 13.0M | const __m256i dir47 = |
205 | 13.0M | compute_directions_avx2(lines, cost_first_8x8 + 4, cost_second_8x8 + 4); |
206 | | |
207 | | /* Transpose and reverse the order of the lines. */ |
208 | 13.0M | array_reverse_transpose_8x8_avx2(lines, lines); |
209 | | |
210 | | /* Compute "mostly horizontal" directions. */ |
211 | 13.0M | const __m256i dir03 = |
212 | 13.0M | compute_directions_avx2(lines, cost_first_8x8, cost_second_8x8); |
213 | | |
214 | 13.0M | __m256i max = _mm256_max_epi32(dir03, dir47); |
215 | 13.0M | max = |
216 | 13.0M | _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 8), |
217 | 13.0M | _mm256_slli_si256(max, 16 - (8)))); |
218 | 13.0M | max = |
219 | 13.0M | _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 4), |
220 | 13.0M | _mm256_slli_si256(max, 16 - (4)))); |
221 | | |
222 | 13.0M | const __m128i first_8x8_output = _mm256_castsi256_si128(max); |
223 | 13.0M | const __m128i second_8x8_output = _mm256_extractf128_si256(max, 1); |
224 | 13.0M | const __m128i cmpeg_res_00 = |
225 | 13.0M | _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir47)); |
226 | 13.0M | const __m128i cmpeg_res_01 = |
227 | 13.0M | _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir03)); |
228 | 13.0M | const __m128i cmpeg_res_10 = |
229 | 13.0M | _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir47, 1)); |
230 | 13.0M | const __m128i cmpeg_res_11 = |
231 | 13.0M | _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir03, 1)); |
232 | 13.0M | const __m128i t_first_8x8 = _mm_packs_epi32(cmpeg_res_01, cmpeg_res_00); |
233 | 13.0M | const __m128i t_second_8x8 = _mm_packs_epi32(cmpeg_res_11, cmpeg_res_10); |
234 | | |
235 | 13.0M | best_cost[0] = _mm_cvtsi128_si32(_mm256_castsi256_si128(max)); |
236 | 13.0M | best_cost[1] = _mm_cvtsi128_si32(second_8x8_output); |
237 | 13.0M | best_dir[0] = _mm_movemask_epi8(_mm_packs_epi16(t_first_8x8, t_first_8x8)); |
238 | 13.0M | best_dir[0] = |
239 | 13.0M | get_msb(best_dir[0] ^ (best_dir[0] - 1)); // Count trailing zeros |
240 | 13.0M | best_dir[1] = _mm_movemask_epi8(_mm_packs_epi16(t_second_8x8, t_second_8x8)); |
241 | 13.0M | best_dir[1] = |
242 | 13.0M | get_msb(best_dir[1] ^ (best_dir[1] - 1)); // Count trailing zeros |
243 | | |
244 | | /* Difference between the optimal variance and the variance along the |
245 | | orthogonal direction. Again, the sum(x^2) terms cancel out. */ |
246 | 13.0M | *var_out_1st = best_cost[0] - cost_first_8x8[(best_dir[0] + 4) & 7]; |
247 | 13.0M | *var_out_2nd = best_cost[1] - cost_second_8x8[(best_dir[1] + 4) & 7]; |
248 | | |
249 | | /* We'd normally divide by 840, but dividing by 1024 is close enough |
250 | | for what we're going to do with this. */ |
251 | 13.0M | *var_out_1st >>= 10; |
252 | 13.0M | *var_out_2nd >>= 10; |
253 | 13.0M | *out_dir_1st_8x8 = best_dir[0]; |
254 | 13.0M | *out_dir_2nd_8x8 = best_dir[1]; |
255 | 13.0M | } |
256 | | |
/* Copies a width x height rectangle of 8-bit samples from src into dst,
   zero-extending every sample to 16 bits.

   dst, dstride - destination buffer and its row stride, in 16-bit samples.
   src, sstride - source buffer and its row stride, in bytes.
   width        - samples per row; must be > 0.
   height       - number of rows; must be > 0 and even, because the vector
                  paths handle two rows per iteration.

   The width is consumed in column bands of decreasing size (multiples of 32,
   then 16, 8, 4 and finally single samples); j tracks the first column that
   has not been copied yet. */
void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride,
                                        const uint8_t *src, int sstride,
                                        int width, int height) {
  int j = 0;
  int remaining_width = width;
  assert(height % 2 == 0);
  assert(height > 0);
  assert(width > 0);

  // Process multiple 32 pixels at a time.
  if (remaining_width > 31) {
    int i = 0;
    do {
      j = 0;
      do {
        __m128i row00 =
            _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + (j + 0)]);
        __m128i row01 = _mm_loadu_si128(
            (const __m128i *)&src[(i + 0) * sstride + (j + 16)]);
        __m128i row10 =
            _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + (j + 0)]);
        __m128i row11 = _mm_loadu_si128(
            (const __m128i *)&src[(i + 1) * sstride + (j + 16)]);
        _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 0)],
                            _mm256_cvtepu8_epi16(row00));
        _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 16)],
                            _mm256_cvtepu8_epi16(row01));
        _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 0)],
                            _mm256_cvtepu8_epi16(row10));
        _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 16)],
                            _mm256_cvtepu8_epi16(row11));
        j += 32;
      } while (j <= width - 32);
      i += 2;
    } while (i < height);
    // j now equals the largest multiple of 32 that fits in width.
    remaining_width = width & 31;
  }

  // Process 16 pixels at a time.
  if (remaining_width > 15) {
    int i = 0;
    do {
      __m128i row0 =
          _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + j]);
      __m128i row1 =
          _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + j]);
      _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + j],
                          _mm256_cvtepu8_epi16(row0));
      _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + j],
                          _mm256_cvtepu8_epi16(row1));
      i += 2;
    } while (i < height);
    remaining_width = width & 15;
    j += 16;
  }

  // Process 8 pixels at a time.
  if (remaining_width > 7) {
    int i = 0;
    do {
      __m128i row0 =
          _mm_loadl_epi64((const __m128i *)&src[(i + 0) * sstride + j]);
      __m128i row1 =
          _mm_loadl_epi64((const __m128i *)&src[(i + 1) * sstride + j]);
      _mm_storeu_si128((__m128i *)&dst[(i + 0) * dstride + j],
                       _mm_unpacklo_epi8(row0, _mm_setzero_si128()));
      _mm_storeu_si128((__m128i *)&dst[(i + 1) * dstride + j],
                       _mm_unpacklo_epi8(row1, _mm_setzero_si128()));
      i += 2;
    } while (i < height);
    remaining_width = width & 7;
    j += 8;
  }

  // Process 4 pixels at a time.
  if (remaining_width > 3) {
    int i = 0;
    do {
      // Fetch the four source bytes with memcpy: src has no 32-bit alignment
      // guarantee, and reading uint8_t storage through an int32_t pointer
      // would also violate the aliasing rules.
      int32_t packed0;
      int32_t packed1;
      memcpy(&packed0, &src[(i + 0) * sstride + j], sizeof(packed0));
      memcpy(&packed1, &src[(i + 1) * sstride + j], sizeof(packed1));
      const __m128i row0 = _mm_cvtsi32_si128(packed0);
      const __m128i row1 = _mm_cvtsi32_si128(packed1);
      _mm_storel_epi64((__m128i *)&dst[(i + 0) * dstride + j],
                       _mm_unpacklo_epi8(row0, _mm_setzero_si128()));
      _mm_storel_epi64((__m128i *)&dst[(i + 1) * dstride + j],
                       _mm_unpacklo_epi8(row1, _mm_setzero_si128()));
      i += 2;
    } while (i < height);
    remaining_width = width & 3;
    j += 4;
  }

  // Process the remaining pixels.
  if (remaining_width) {
    for (int i = 0; i < height; i++) {
      for (int k = j; k < width; k++) {
        dst[i * dstride + k] = src[i * sstride + k];
      }
    }
  }
}