/src/libavif/ext/libyuv/source/scale_gcc.cc
Line | Count | Source |
1 | | /* |
2 | | * Copyright 2013 The LibYuv Project Authors. All rights reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | |
11 | | #include "libyuv/row.h" |
12 | | #include "libyuv/scale_row.h" |
13 | | |
14 | | #ifdef __cplusplus |
15 | | namespace libyuv { |
16 | | extern "C" { |
17 | | #endif |
18 | | |
19 | | // This module is for GCC x86 and x64. |
20 | | #if !defined(LIBYUV_DISABLE_X86) && \ |
21 | | (defined(__x86_64__) || defined(__i386__)) && \ |
22 | | !defined(LIBYUV_ENABLE_ROWWIN) |
23 | | |
24 | | // Offsets for source bytes 0 to 9 |
25 | | static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, |
26 | | 128, 128, 128, 128, 128, 128, 128, 128}; |
27 | | |
28 | | // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. |
29 | | static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, |
30 | | 128, 128, 128, 128, 128, 128, 128, 128}; |
31 | | |
32 | | // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15. |
33 | | static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, |
34 | | 128, 128, 128, 128, 128, 128, 128, 128}; |
35 | | |
36 | | // Offsets for source bytes 0 to 10 |
37 | | static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; |
38 | | |
39 | | // Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13. |
40 | | static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, |
41 | | 8, 9, 9, 10, 10, 11, 12, 13}; |
42 | | |
43 | | // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15. |
44 | | static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, |
45 | | 10, 11, 12, 13, 13, 14, 14, 15}; |
46 | | |
47 | | // Coefficients for source bytes 0 to 10 |
48 | | static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; |
49 | | |
50 | | // Coefficients for source bytes 10 to 21 |
51 | | static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; |
52 | | |
53 | | // Coefficients for source bytes 21 to 31 |
54 | | static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; |
55 | | |
56 | | // Rounding constant for the 3/4 box filters |
57 | | static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; |
58 | | |
59 | | static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, |
60 | | 128, 128, 128, 128, 128, 128, 128, 128}; |
61 | | |
62 | | static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, |
63 | | 6, 8, 11, 14, 128, 128, 128, 128}; |
64 | | |
65 | | // Arrange words 0,3,6 into 0,1,2 |
66 | | static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, |
67 | | 128, 128, 128, 128, 128, 128, 128, 128}; |
68 | | |
69 | | // Arrange words 0,3,6 into 3,4,5 |
70 | | static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, |
71 | | 6, 7, 12, 13, 128, 128, 128, 128}; |
72 | | |
73 | | // Scaling values for boxes of 3x3 and 2x3 |
74 | | static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, |
75 | | 65536 / 9, 65536 / 6, 0, 0}; |
76 | | |
77 | | // Arrange first value for pixels 0,1,2,3,4,5 |
78 | | static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, |
79 | | 11, 128, 14, 128, 128, 128, 128, 128}; |
80 | | |
81 | | // Arrange second value for pixels 0,1,2,3,4,5 |
82 | | static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, |
83 | | 12, 128, 15, 128, 128, 128, 128, 128}; |
84 | | |
85 | | // Arrange third value for pixels 0,1,2,3,4,5 |
86 | | static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, |
87 | | 13, 128, 128, 128, 128, 128, 128, 128}; |
88 | | |
89 | | // Scaling values for boxes of 3x2 and 2x2 |
90 | | static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, |
91 | | 65536 / 3, 65536 / 2, 0, 0}; |
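The 65536/N entries in kScaleAc33 and kScaleAb2 are Q16 fixed-point reciprocals: the row functions below multiply a box sum by them with pmulhuw, which keeps only the high 16 bits of the product, so the result approximates sum/N. A minimal scalar check of that identity (editor's sketch; mulhuw is an illustrative helper, not part of this file):

#include <assert.h>
#include <stdint.h>

// Scalar model of pmulhuw: high 16 bits of an unsigned 16x16 multiply.
static uint16_t mulhuw(uint16_t a, uint16_t b) {
  return (uint16_t)(((uint32_t)a * b) >> 16);
}

int main(void) {
  // A 3x3 box of pixels all equal to 200 sums to 1800; 1800 / 9 == 200.
  // 65536 / 9 truncates to 7281, so the Q16 product can come out 1 low.
  assert(mulhuw(1800, 65536 / 9) >= 199 && mulhuw(1800, 65536 / 9) <= 200);
  return 0;
}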
92 | | |
93 | | // GCC versions of row functions are verbatim conversions from Visual C. |
94 | | // Generated using gcc disassembly on Visual C object file: |
95 | | // objdump -D yuvscaler.obj >yuvscaler.txt |
96 | | |
97 | | void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, |
98 | | ptrdiff_t src_stride, |
99 | | uint8_t* dst_ptr, |
100 | 0 | int dst_width) {
101 | 0 | (void)src_stride; |
102 | 0 | asm volatile( |
103 | | // 16 pixel loop. |
104 | 0 | LABELALIGN |
105 | 0 | "1: \n" |
106 | 0 | "movdqu (%0),%%xmm0 \n" |
107 | 0 | "movdqu 0x10(%0),%%xmm1 \n" |
108 | 0 | "lea 0x20(%0),%0 \n" |
109 | 0 | "psrlw $0x8,%%xmm0 \n" |
110 | 0 | "psrlw $0x8,%%xmm1 \n" |
111 | 0 | "packuswb %%xmm1,%%xmm0 \n" |
112 | 0 | "movdqu %%xmm0,(%1) \n" |
113 | 0 | "lea 0x10(%1),%1 \n" |
114 | 0 | "sub $0x10,%2 \n" |
115 | 0 | "jg 1b \n" |
116 | 0 | : "+r"(src_ptr), // %0 |
117 | 0 | "+r"(dst_ptr), // %1 |
118 | 0 | "+r"(dst_width) // %2 |
119 | 0 | : |
120 | 0 | : "memory", "cc", "xmm0", "xmm1"); |
121 | 0 | } |
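For reference, the psrlw $8 / packuswb pair above keeps the high byte of every 16-bit word, i.e. every odd-indexed source byte. A scalar sketch of the same row operation (editor's illustration with a hypothetical name; dst_width is the number of output pixels and edge cases are the caller's job, as for all row functions in this file):

#include <stdint.h>

// Point-sample 2:1 horizontal downscale: keep every second source byte.
static void ScaleRowDown2_C_sketch(const uint8_t* src_ptr, uint8_t* dst_ptr,
                                   int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[2 * x + 1];
  }
}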
122 | | |
123 | | void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, |
124 | | ptrdiff_t src_stride, |
125 | | uint8_t* dst_ptr, |
126 | 0 | int dst_width) { |
127 | 0 | (void)src_stride; |
128 | 0 | asm volatile( |
129 | 0 | "pcmpeqb %%xmm4,%%xmm4 \n" // 0x0101 |
130 | 0 | "pabsb %%xmm4,%%xmm4 \n" |
131 | |
132 | 0 | "pxor %%xmm5,%%xmm5 \n" |
133 | |
134 | 0 | LABELALIGN |
135 | 0 | "1: \n" |
136 | 0 | "movdqu (%0),%%xmm0 \n" |
137 | 0 | "movdqu 0x10(%0),%%xmm1 \n" |
138 | 0 | "lea 0x20(%0),%0 \n" |
139 | 0 | "pmaddubsw %%xmm4,%%xmm0 \n" |
140 | 0 | "pmaddubsw %%xmm4,%%xmm1 \n" |
141 | 0 | "pavgw %%xmm5,%%xmm0 \n" |
142 | 0 | "pavgw %%xmm5,%%xmm1 \n" |
143 | 0 | "packuswb %%xmm1,%%xmm0 \n" |
144 | 0 | "movdqu %%xmm0,(%1) \n" |
145 | 0 | "lea 0x10(%1),%1 \n" |
146 | 0 | "sub $0x10,%2 \n" |
147 | 0 | "jg 1b \n" |
148 | 0 | : "+r"(src_ptr), // %0 |
149 | 0 | "+r"(dst_ptr), // %1 |
150 | 0 | "+r"(dst_width) // %2 |
151 | 0 | : |
152 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); |
153 | 0 | } |
154 | | |
155 | | void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, |
156 | | ptrdiff_t src_stride, |
157 | | uint8_t* dst_ptr, |
158 | 0 | int dst_width) { |
159 | 0 | asm volatile( |
160 | 0 | "pcmpeqb %%xmm4,%%xmm4 \n" // 0x0101 |
161 | 0 | "pabsb %%xmm4,%%xmm4 \n" |
162 | 0 | "pxor %%xmm5,%%xmm5 \n" |
163 | |
164 | 0 | LABELALIGN |
165 | 0 | "1: \n" |
166 | 0 | "movdqu (%0),%%xmm0 \n" |
167 | 0 | "movdqu 0x10(%0),%%xmm1 \n" |
168 | 0 | "movdqu 0x00(%0,%3,1),%%xmm2 \n" |
169 | 0 | "movdqu 0x10(%0,%3,1),%%xmm3 \n" |
170 | 0 | "lea 0x20(%0),%0 \n" |
171 | 0 | "pmaddubsw %%xmm4,%%xmm0 \n" |
172 | 0 | "pmaddubsw %%xmm4,%%xmm1 \n" |
173 | 0 | "pmaddubsw %%xmm4,%%xmm2 \n" |
174 | 0 | "pmaddubsw %%xmm4,%%xmm3 \n" |
175 | 0 | "paddw %%xmm2,%%xmm0 \n" |
176 | 0 | "paddw %%xmm3,%%xmm1 \n" |
177 | 0 | "psrlw $0x1,%%xmm0 \n" |
178 | 0 | "psrlw $0x1,%%xmm1 \n" |
179 | 0 | "pavgw %%xmm5,%%xmm0 \n" |
180 | 0 | "pavgw %%xmm5,%%xmm1 \n" |
181 | 0 | "packuswb %%xmm1,%%xmm0 \n" |
182 | 0 | "movdqu %%xmm0,(%1) \n" |
183 | 0 | "lea 0x10(%1),%1 \n" |
184 | 0 | "sub $0x10,%2 \n" |
185 | 0 | "jg 1b \n" |
186 | 0 | : "+r"(src_ptr), // %0 |
187 | 0 | "+r"(dst_ptr), // %1 |
188 | 0 | "+r"(dst_width) // %2 |
189 | 0 | : "r"((intptr_t)(src_stride)) // %3 |
190 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); |
191 | 0 | } |
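The pmaddubsw with the 0x0101 constant sums each pair of adjacent bytes, the paddw adds the two rows, and the psrlw $1 / pavgw pair halves twice with rounding, so each output is the rounded mean of a 2x2 block. A scalar sketch (editor's illustration, hypothetical name):

#include <stddef.h>
#include <stdint.h>

// 2x2 box downscale: dst[x] = round((a + b + c + d) / 4).
static void ScaleRowDown2Box_C_sketch(const uint8_t* src_ptr,
                                      ptrdiff_t src_stride, uint8_t* dst_ptr,
                                      int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] =
        (uint8_t)((s[2 * x] + s[2 * x + 1] + t[2 * x] + t[2 * x + 1] + 2) >> 2);
  }
}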
192 | | |
193 | | #ifdef HAS_SCALEROWDOWN2_AVX2 |
194 | | void ScaleRowDown2_AVX2(const uint8_t* src_ptr, |
195 | | ptrdiff_t src_stride, |
196 | | uint8_t* dst_ptr, |
197 | 0 | int dst_width) { |
198 | 0 | (void)src_stride; |
199 | 0 | asm volatile( |
200 | 0 | "1: \n" |
201 | 0 | "vmovdqu (%0),%%ymm0 \n" |
202 | 0 | "vmovdqu 0x20(%0),%%ymm1 \n" |
203 | 0 | "lea 0x40(%0),%0 \n" |
204 | 0 | "vpsrlw $0x8,%%ymm0,%%ymm0 \n" |
205 | 0 | "vpsrlw $0x8,%%ymm1,%%ymm1 \n" |
206 | 0 | "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" |
207 | 0 | "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
208 | 0 | "vmovdqu %%ymm0,(%1) \n" |
209 | 0 | "lea 0x20(%1),%1 \n" |
210 | 0 | "sub $0x20,%2 \n" |
211 | 0 | "jg 1b \n" |
212 | 0 | "vzeroupper \n" |
213 | 0 | : "+r"(src_ptr), // %0 |
214 | 0 | "+r"(dst_ptr), // %1 |
215 | 0 | "+r"(dst_width) // %2 |
216 | 0 | : |
217 | 0 | : "memory", "cc", "xmm0", "xmm1"); |
218 | 0 | } |
219 | | |
220 | | void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, |
221 | | ptrdiff_t src_stride, |
222 | | uint8_t* dst_ptr, |
223 | 0 | int dst_width) { |
224 | 0 | (void)src_stride; |
225 | 0 | asm volatile( |
226 | 0 | "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" |
227 | 0 | "vpabsb %%ymm4,%%ymm4 \n" |
228 | 0 | "vpxor %%ymm5,%%ymm5,%%ymm5 \n" |
229 | |
230 | 0 | LABELALIGN |
231 | 0 | "1: \n" |
232 | 0 | "vmovdqu (%0),%%ymm0 \n" |
233 | 0 | "vmovdqu 0x20(%0),%%ymm1 \n" |
234 | 0 | "lea 0x40(%0),%0 \n" |
235 | 0 | "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" |
236 | 0 | "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" |
237 | 0 | "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" |
238 | 0 | "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" |
239 | 0 | "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" |
240 | 0 | "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
241 | 0 | "vmovdqu %%ymm0,(%1) \n" |
242 | 0 | "lea 0x20(%1),%1 \n" |
243 | 0 | "sub $0x20,%2 \n" |
244 | 0 | "jg 1b \n" |
245 | 0 | "vzeroupper \n" |
246 | 0 | : "+r"(src_ptr), // %0 |
247 | 0 | "+r"(dst_ptr), // %1 |
248 | 0 | "+r"(dst_width) // %2 |
249 | 0 | : |
250 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); |
251 | 0 | } |
252 | | |
253 | | void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, |
254 | | ptrdiff_t src_stride, |
255 | | uint8_t* dst_ptr, |
256 | 404 | int dst_width) { |
257 | 404 | asm volatile( |
258 | 404 | "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" |
259 | 404 | "vpabsb %%ymm4,%%ymm4 \n" |
260 | 404 | "vpxor %%ymm5,%%ymm5,%%ymm5 \n" |
261 | | |
262 | 404 | LABELALIGN |
263 | 404 | "1: \n" |
264 | 404 | "vmovdqu (%0),%%ymm0 \n" |
265 | 404 | "vmovdqu 0x20(%0),%%ymm1 \n" |
266 | 404 | "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" |
267 | 404 | "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" |
268 | 404 | "lea 0x40(%0),%0 \n" |
269 | 404 | "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" |
270 | 404 | "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" |
271 | 404 | "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" |
272 | 404 | "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" |
273 | 404 | "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" |
274 | 404 | "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" |
275 | 404 | "vpsrlw $0x1,%%ymm0,%%ymm0 \n" |
276 | 404 | "vpsrlw $0x1,%%ymm1,%%ymm1 \n" |
277 | 404 | "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" |
278 | 404 | "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" |
279 | 404 | "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" |
280 | 404 | "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
281 | 404 | "vmovdqu %%ymm0,(%1) \n" |
282 | 404 | "lea 0x20(%1),%1 \n" |
283 | 404 | "sub $0x20,%2 \n" |
284 | 404 | "jg 1b \n" |
285 | 404 | "vzeroupper \n" |
286 | 404 | : "+r"(src_ptr), // %0 |
287 | 404 | "+r"(dst_ptr), // %1 |
288 | 404 | "+r"(dst_width) // %2 |
289 | 404 | : "r"((intptr_t)(src_stride)) // %3 |
290 | 404 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); |
291 | 404 | } |
292 | | #endif // HAS_SCALEROWDOWN2_AVX2 |
293 | | |
294 | | void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, |
295 | | ptrdiff_t src_stride, |
296 | | uint8_t* dst_ptr, |
297 | 0 | int dst_width) { |
298 | 0 | (void)src_stride; |
299 | 0 | asm volatile( |
300 | 0 | "pcmpeqb %%xmm5,%%xmm5 \n" |
301 | 0 | "psrld $0x18,%%xmm5 \n" |
302 | 0 | "pslld $0x10,%%xmm5 \n" |
303 | |
304 | 0 | LABELALIGN |
305 | 0 | "1: \n" |
306 | 0 | "movdqu (%0),%%xmm0 \n" |
307 | 0 | "movdqu 0x10(%0),%%xmm1 \n" |
308 | 0 | "lea 0x20(%0),%0 \n" |
309 | 0 | "pand %%xmm5,%%xmm0 \n" |
310 | 0 | "pand %%xmm5,%%xmm1 \n" |
311 | 0 | "packuswb %%xmm1,%%xmm0 \n" |
312 | 0 | "psrlw $0x8,%%xmm0 \n" |
313 | 0 | "packuswb %%xmm0,%%xmm0 \n" |
314 | 0 | "movq %%xmm0,(%1) \n" |
315 | 0 | "lea 0x8(%1),%1 \n" |
316 | 0 | "sub $0x8,%2 \n" |
317 | 0 | "jg 1b \n" |
318 | 0 | : "+r"(src_ptr), // %0 |
319 | 0 | "+r"(dst_ptr), // %1 |
320 | 0 | "+r"(dst_width) // %2 |
321 | 0 | : |
322 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm5"); |
323 | 0 | } |
324 | | |
325 | | void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, |
326 | | ptrdiff_t src_stride, |
327 | | uint8_t* dst_ptr, |
328 | 0 | int dst_width) { |
329 | 0 | intptr_t stridex3; |
330 | 0 | asm volatile( |
331 | 0 | "pcmpeqb %%xmm4,%%xmm4 \n" |
332 | 0 | "pabsw %%xmm4,%%xmm5 \n" |
333 | 0 | "pabsb %%xmm4,%%xmm4 \n" // 0x0101 |
334 | 0 | "psllw $0x3,%%xmm5 \n" // 0x0008 |
335 | 0 | "lea 0x00(%4,%4,2),%3 \n" |
336 | |
337 | 0 | LABELALIGN |
338 | 0 | "1: \n" |
339 | 0 | "movdqu (%0),%%xmm0 \n" |
340 | 0 | "movdqu 0x10(%0),%%xmm1 \n" |
341 | 0 | "movdqu 0x00(%0,%4,1),%%xmm2 \n" |
342 | 0 | "movdqu 0x10(%0,%4,1),%%xmm3 \n" |
343 | 0 | "pmaddubsw %%xmm4,%%xmm0 \n" |
344 | 0 | "pmaddubsw %%xmm4,%%xmm1 \n" |
345 | 0 | "pmaddubsw %%xmm4,%%xmm2 \n" |
346 | 0 | "pmaddubsw %%xmm4,%%xmm3 \n" |
347 | 0 | "paddw %%xmm2,%%xmm0 \n" |
348 | 0 | "paddw %%xmm3,%%xmm1 \n" |
349 | 0 | "movdqu 0x00(%0,%4,2),%%xmm2 \n" |
350 | 0 | "movdqu 0x10(%0,%4,2),%%xmm3 \n" |
351 | 0 | "pmaddubsw %%xmm4,%%xmm2 \n" |
352 | 0 | "pmaddubsw %%xmm4,%%xmm3 \n" |
353 | 0 | "paddw %%xmm2,%%xmm0 \n" |
354 | 0 | "paddw %%xmm3,%%xmm1 \n" |
355 | 0 | "movdqu 0x00(%0,%3,1),%%xmm2 \n" |
356 | 0 | "movdqu 0x10(%0,%3,1),%%xmm3 \n" |
357 | 0 | "lea 0x20(%0),%0 \n" |
358 | 0 | "pmaddubsw %%xmm4,%%xmm2 \n" |
359 | 0 | "pmaddubsw %%xmm4,%%xmm3 \n" |
360 | 0 | "paddw %%xmm2,%%xmm0 \n" |
361 | 0 | "paddw %%xmm3,%%xmm1 \n" |
362 | 0 | "phaddw %%xmm1,%%xmm0 \n" |
363 | 0 | "paddw %%xmm5,%%xmm0 \n" |
364 | 0 | "psrlw $0x4,%%xmm0 \n" |
365 | 0 | "packuswb %%xmm0,%%xmm0 \n" |
366 | 0 | "movq %%xmm0,(%1) \n" |
367 | 0 | "lea 0x8(%1),%1 \n" |
368 | 0 | "sub $0x8,%2 \n" |
369 | 0 | "jg 1b \n" |
370 | 0 | : "+r"(src_ptr), // %0 |
371 | 0 | "+r"(dst_ptr), // %1 |
372 | 0 | "+r"(dst_width), // %2 |
373 | 0 | "=&r"(stridex3) // %3 |
374 | 0 | : "r"((intptr_t)(src_stride)) // %4 |
375 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); |
376 | 0 | } |
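The four pmaddubsw/paddw passes above accumulate all four source rows, phaddw folds the horizontal pairs, and the +8 bias (xmm5) followed by psrlw $4 divides the 16-pixel sum by 16 with rounding. A scalar sketch (editor's illustration, hypothetical name):

#include <stddef.h>
#include <stdint.h>

// 4x4 box downscale: each output pixel is the rounded mean of 16 source pixels.
static void ScaleRowDown4Box_C_sketch(const uint8_t* src_ptr,
                                      ptrdiff_t src_stride, uint8_t* dst_ptr,
                                      int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    uint32_t sum = 0;
    for (int r = 0; r < 4; ++r) {
      for (int c = 0; c < 4; ++c) {
        sum += src_ptr[r * src_stride + 4 * x + c];
      }
    }
    dst_ptr[x] = (uint8_t)((sum + 8) >> 4);
  }
}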
377 | | |
378 | | #ifdef HAS_SCALEROWDOWN4_AVX2 |
379 | | void ScaleRowDown4_AVX2(const uint8_t* src_ptr, |
380 | | ptrdiff_t src_stride, |
381 | | uint8_t* dst_ptr, |
382 | 0 | int dst_width) { |
383 | 0 | (void)src_stride; |
384 | 0 | asm volatile( |
385 | 0 | "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
386 | 0 | "vpsrld $0x18,%%ymm5,%%ymm5 \n" |
387 | 0 | "vpslld $0x10,%%ymm5,%%ymm5 \n" |
388 | |
389 | 0 | LABELALIGN |
390 | 0 | "1: \n" |
391 | 0 | "vmovdqu (%0),%%ymm0 \n" |
392 | 0 | "vmovdqu 0x20(%0),%%ymm1 \n" |
393 | 0 | "lea 0x40(%0),%0 \n" |
394 | 0 | "vpand %%ymm5,%%ymm0,%%ymm0 \n" |
395 | 0 | "vpand %%ymm5,%%ymm1,%%ymm1 \n" |
396 | 0 | "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" |
397 | 0 | "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
398 | 0 | "vpsrlw $0x8,%%ymm0,%%ymm0 \n" |
399 | 0 | "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" |
400 | 0 | "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
401 | 0 | "vmovdqu %%xmm0,(%1) \n" |
402 | 0 | "lea 0x10(%1),%1 \n" |
403 | 0 | "sub $0x10,%2 \n" |
404 | 0 | "jg 1b \n" |
405 | 0 | "vzeroupper \n" |
406 | 0 | : "+r"(src_ptr), // %0 |
407 | 0 | "+r"(dst_ptr), // %1 |
408 | 0 | "+r"(dst_width) // %2 |
409 | 0 | : |
410 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm5"); |
411 | 0 | } |
412 | | |
413 | | void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, |
414 | | ptrdiff_t src_stride, |
415 | | uint8_t* dst_ptr, |
416 | 52 | int dst_width) { |
417 | 52 | asm volatile( |
418 | 52 | "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" |
419 | 52 | "vpabsw %%ymm4,%%ymm5 \n" |
420 | 52 | "vpabsb %%ymm4,%%ymm4 \n" // 0x0101 |
421 | 52 | "vpsllw $0x3,%%ymm5,%%ymm5 \n" // 0x0008 |
422 | | |
423 | 52 | LABELALIGN |
424 | 52 | "1: \n" |
425 | 52 | "vmovdqu (%0),%%ymm0 \n" |
426 | 52 | "vmovdqu 0x20(%0),%%ymm1 \n" |
427 | 52 | "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" |
428 | 52 | "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" |
429 | 52 | "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" |
430 | 52 | "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" |
431 | 52 | "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" |
432 | 52 | "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" |
433 | 52 | "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" |
434 | 52 | "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" |
435 | 52 | "vmovdqu 0x00(%0,%3,2),%%ymm2 \n" |
436 | 52 | "vmovdqu 0x20(%0,%3,2),%%ymm3 \n" |
437 | 52 | "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" |
438 | 52 | "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" |
439 | 52 | "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" |
440 | 52 | "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" |
441 | 52 | "vmovdqu 0x00(%0,%4,1),%%ymm2 \n" |
442 | 52 | "vmovdqu 0x20(%0,%4,1),%%ymm3 \n" |
443 | 52 | "lea 0x40(%0),%0 \n" |
444 | 52 | "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" |
445 | 52 | "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" |
446 | 52 | "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" |
447 | 52 | "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" |
448 | 52 | "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" |
449 | 52 | "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
450 | 52 | "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" |
451 | 52 | "vpsrlw $0x4,%%ymm0,%%ymm0 \n" |
452 | 52 | "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" |
453 | 52 | "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
454 | 52 | "vmovdqu %%xmm0,(%1) \n" |
455 | 52 | "lea 0x10(%1),%1 \n" |
456 | 52 | "sub $0x10,%2 \n" |
457 | 52 | "jg 1b \n" |
458 | 52 | "vzeroupper \n" |
459 | 52 | : "+r"(src_ptr), // %0 |
460 | 52 | "+r"(dst_ptr), // %1 |
461 | 52 | "+r"(dst_width) // %2 |
462 | 52 | : "r"((intptr_t)(src_stride)), // %3 |
463 | 52 | "r"((intptr_t)(src_stride * 3)) // %4 |
464 | 52 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); |
465 | 52 | } |
466 | | #endif // HAS_SCALEROWDOWN4_AVX2 |
467 | | |
468 | | void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, |
469 | | ptrdiff_t src_stride, |
470 | | uint8_t* dst_ptr, |
471 | 0 | int dst_width) { |
472 | 0 | (void)src_stride; |
473 | 0 | asm volatile( |
474 | 0 | "movdqa %0,%%xmm3 \n" |
475 | 0 | "movdqa %1,%%xmm4 \n" |
476 | 0 | "movdqa %2,%%xmm5 \n" |
477 | 0 | : |
478 | 0 | : "m"(kShuf0), // %0 |
479 | 0 | "m"(kShuf1), // %1 |
480 | 0 | "m"(kShuf2) // %2 |
481 | 0 | ); |
482 | 0 | asm volatile( |
483 | 0 | "1: \n" |
484 | 0 | "movdqu (%0),%%xmm0 \n" |
485 | 0 | "movdqu 0x10(%0),%%xmm2 \n" |
486 | 0 | "lea 0x20(%0),%0 \n" |
487 | 0 | "movdqa %%xmm2,%%xmm1 \n" |
488 | 0 | "palignr $0x8,%%xmm0,%%xmm1 \n" |
489 | 0 | "pshufb %%xmm3,%%xmm0 \n" |
490 | 0 | "pshufb %%xmm4,%%xmm1 \n" |
491 | 0 | "pshufb %%xmm5,%%xmm2 \n" |
492 | 0 | "movq %%xmm0,(%1) \n" |
493 | 0 | "movq %%xmm1,0x8(%1) \n" |
494 | 0 | "movq %%xmm2,0x10(%1) \n" |
495 | 0 | "lea 0x18(%1),%1 \n" |
496 | 0 | "sub $0x18,%2 \n" |
497 | 0 | "jg 1b \n" |
498 | 0 | : "+r"(src_ptr), // %0 |
499 | 0 | "+r"(dst_ptr), // %1 |
500 | 0 | "+r"(dst_width) // %2 |
501 | 0 | : |
502 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); |
503 | 0 | } |
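kShuf0, kShuf1 and kShuf2 together select bytes 0, 1 and 3 from every group of 4, so this unfiltered variant is a point-sampled 3/4-width downscale. A scalar sketch (editor's illustration, hypothetical name; dst_width is a multiple of 3):

#include <stdint.h>

// Point-sample 4:3 horizontal downscale: keep pixels 0, 1 and 3 of each 4.
static void ScaleRowDown34_C_sketch(const uint8_t* src_ptr, uint8_t* dst_ptr,
                                    int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[1];
    dst_ptr[x + 2] = src_ptr[3];
    src_ptr += 4;
  }
}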
504 | | |
505 | | void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, |
506 | | ptrdiff_t src_stride, |
507 | | uint8_t* dst_ptr, |
508 | 0 | int dst_width) { |
509 | 0 | asm volatile( |
510 | 0 | "movdqa %0,%%xmm2 \n" // kShuf01 |
511 | 0 | "movdqa %1,%%xmm3 \n" // kShuf11 |
512 | 0 | "movdqa %2,%%xmm4 \n" // kShuf21 |
513 | 0 | : |
514 | 0 | : "m"(kShuf01), // %0 |
515 | 0 | "m"(kShuf11), // %1 |
516 | 0 | "m"(kShuf21) // %2 |
517 | 0 | ); |
518 | 0 | asm volatile( |
519 | 0 | "movdqa %0,%%xmm5 \n" // kMadd01 |
520 | 0 | "movdqa %1,%%xmm0 \n" // kMadd11 |
521 | 0 | "movdqa %2,%%xmm1 \n" // kRound34 |
522 | 0 | : |
523 | 0 | : "m"(kMadd01), // %0 |
524 | 0 | "m"(kMadd11), // %1 |
525 | 0 | "m"(kRound34) // %2 |
526 | 0 | ); |
527 | 0 | asm volatile( |
528 | 0 | "1: \n" |
529 | 0 | "movdqu (%0),%%xmm6 \n" |
530 | 0 | "movdqu 0x00(%0,%3,1),%%xmm7 \n" |
531 | 0 | "pavgb %%xmm7,%%xmm6 \n" |
532 | 0 | "pshufb %%xmm2,%%xmm6 \n" |
533 | 0 | "pmaddubsw %%xmm5,%%xmm6 \n" |
534 | 0 | "paddsw %%xmm1,%%xmm6 \n" |
535 | 0 | "psrlw $0x2,%%xmm6 \n" |
536 | 0 | "packuswb %%xmm6,%%xmm6 \n" |
537 | 0 | "movq %%xmm6,(%1) \n" |
538 | 0 | "movdqu 0x8(%0),%%xmm6 \n" |
539 | 0 | "movdqu 0x8(%0,%3,1),%%xmm7 \n" |
540 | 0 | "pavgb %%xmm7,%%xmm6 \n" |
541 | 0 | "pshufb %%xmm3,%%xmm6 \n" |
542 | 0 | "pmaddubsw %%xmm0,%%xmm6 \n" |
543 | 0 | "paddsw %%xmm1,%%xmm6 \n" |
544 | 0 | "psrlw $0x2,%%xmm6 \n" |
545 | 0 | "packuswb %%xmm6,%%xmm6 \n" |
546 | 0 | "movq %%xmm6,0x8(%1) \n" |
547 | 0 | "movdqu 0x10(%0),%%xmm6 \n" |
548 | 0 | "movdqu 0x10(%0,%3,1),%%xmm7 \n" |
549 | 0 | "lea 0x20(%0),%0 \n" |
550 | 0 | "pavgb %%xmm7,%%xmm6 \n" |
551 | 0 | "pshufb %%xmm4,%%xmm6 \n" |
552 | 0 | "pmaddubsw %4,%%xmm6 \n" |
553 | 0 | "paddsw %%xmm1,%%xmm6 \n" |
554 | 0 | "psrlw $0x2,%%xmm6 \n" |
555 | 0 | "packuswb %%xmm6,%%xmm6 \n" |
556 | 0 | "movq %%xmm6,0x10(%1) \n" |
557 | 0 | "lea 0x18(%1),%1 \n" |
558 | 0 | "sub $0x18,%2 \n" |
559 | 0 | "jg 1b \n" |
560 | 0 | : "+r"(src_ptr), // %0 |
561 | 0 | "+r"(dst_ptr), // %1 |
562 | 0 | "+r"(dst_width) // %2 |
563 | 0 | : "r"((intptr_t)(src_stride)), // %3 |
564 | 0 | "m"(kMadd21) // %4 |
565 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", |
566 | 0 | "xmm7"); |
567 | 0 | } |
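After the vertical pavgb blend of the two source rows, the kShuf*/kMadd* pairs implement the horizontal 3/4 filter: each output pixel is a 3:1, 2:2 or 1:3 weighted mix of two adjacent pixels, with kRound34 (+2) and psrlw $2 doing the rounded divide by 4. A scalar sketch of just that horizontal step (editor's illustration, hypothetical name; the row blending is omitted):

#include <stdint.h>

// Filtered 4:3 horizontal downscale: weights (3,1), (2,2), (1,3) per group of 4.
static void ScaleRowDown34_Filter_C_sketch(const uint8_t* src_ptr,
                                           uint8_t* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = (uint8_t)((src_ptr[0] * 3 + src_ptr[1] * 1 + 2) >> 2);
    dst_ptr[x + 1] = (uint8_t)((src_ptr[1] * 2 + src_ptr[2] * 2 + 2) >> 2);
    dst_ptr[x + 2] = (uint8_t)((src_ptr[2] * 1 + src_ptr[3] * 3 + 2) >> 2);
    src_ptr += 4;
  }
}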
568 | | |
569 | | void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, |
570 | | ptrdiff_t src_stride, |
571 | | uint8_t* dst_ptr, |
572 | 0 | int dst_width) { |
573 | 0 | asm volatile( |
574 | 0 | "movdqa %0,%%xmm2 \n" // kShuf01 |
575 | 0 | "movdqa %1,%%xmm3 \n" // kShuf11 |
576 | 0 | "movdqa %2,%%xmm4 \n" // kShuf21 |
577 | 0 | : |
578 | 0 | : "m"(kShuf01), // %0 |
579 | 0 | "m"(kShuf11), // %1 |
580 | 0 | "m"(kShuf21) // %2 |
581 | 0 | ); |
582 | 0 | asm volatile( |
583 | 0 | "movdqa %0,%%xmm5 \n" // kMadd01 |
584 | 0 | "movdqa %1,%%xmm0 \n" // kMadd11 |
585 | 0 | "movdqa %2,%%xmm1 \n" // kRound34 |
586 | 0 | : |
587 | 0 | : "m"(kMadd01), // %0 |
588 | 0 | "m"(kMadd11), // %1 |
589 | 0 | "m"(kRound34) // %2 |
590 | 0 | ); |
591 | |
592 | 0 | asm volatile( |
593 | 0 | "1: \n" |
594 | 0 | "movdqu (%0),%%xmm6 \n" |
595 | 0 | "movdqu 0x00(%0,%3,1),%%xmm7 \n" |
596 | 0 | "pavgb %%xmm6,%%xmm7 \n" |
597 | 0 | "pavgb %%xmm7,%%xmm6 \n" |
598 | 0 | "pshufb %%xmm2,%%xmm6 \n" |
599 | 0 | "pmaddubsw %%xmm5,%%xmm6 \n" |
600 | 0 | "paddsw %%xmm1,%%xmm6 \n" |
601 | 0 | "psrlw $0x2,%%xmm6 \n" |
602 | 0 | "packuswb %%xmm6,%%xmm6 \n" |
603 | 0 | "movq %%xmm6,(%1) \n" |
604 | 0 | "movdqu 0x8(%0),%%xmm6 \n" |
605 | 0 | "movdqu 0x8(%0,%3,1),%%xmm7 \n" |
606 | 0 | "pavgb %%xmm6,%%xmm7 \n" |
607 | 0 | "pavgb %%xmm7,%%xmm6 \n" |
608 | 0 | "pshufb %%xmm3,%%xmm6 \n" |
609 | 0 | "pmaddubsw %%xmm0,%%xmm6 \n" |
610 | 0 | "paddsw %%xmm1,%%xmm6 \n" |
611 | 0 | "psrlw $0x2,%%xmm6 \n" |
612 | 0 | "packuswb %%xmm6,%%xmm6 \n" |
613 | 0 | "movq %%xmm6,0x8(%1) \n" |
614 | 0 | "movdqu 0x10(%0),%%xmm6 \n" |
615 | 0 | "movdqu 0x10(%0,%3,1),%%xmm7 \n" |
616 | 0 | "lea 0x20(%0),%0 \n" |
617 | 0 | "pavgb %%xmm6,%%xmm7 \n" |
618 | 0 | "pavgb %%xmm7,%%xmm6 \n" |
619 | 0 | "pshufb %%xmm4,%%xmm6 \n" |
620 | 0 | "pmaddubsw %4,%%xmm6 \n" |
621 | 0 | "paddsw %%xmm1,%%xmm6 \n" |
622 | 0 | "psrlw $0x2,%%xmm6 \n" |
623 | 0 | "packuswb %%xmm6,%%xmm6 \n" |
624 | 0 | "movq %%xmm6,0x10(%1) \n" |
625 | 0 | "lea 0x18(%1),%1 \n" |
626 | 0 | "sub $0x18,%2 \n" |
627 | 0 | "jg 1b \n" |
628 | 0 | : "+r"(src_ptr), // %0 |
629 | 0 | "+r"(dst_ptr), // %1 |
630 | 0 | "+r"(dst_width) // %2 |
631 | 0 | : "r"((intptr_t)(src_stride)), // %3 |
632 | 0 | "m"(kMadd21) // %4 |
633 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", |
634 | 0 | "xmm7"); |
635 | 0 | } |
636 | | |
637 | | void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, |
638 | | ptrdiff_t src_stride, |
639 | | uint8_t* dst_ptr, |
640 | 0 | int dst_width) { |
641 | 0 | (void)src_stride; |
642 | 0 | asm volatile( |
643 | 0 | "movdqa %3,%%xmm4 \n" |
644 | 0 | "movdqa %4,%%xmm5 \n" |
645 | |
646 | 0 | LABELALIGN |
647 | 0 | "1: \n" |
648 | 0 | "movdqu (%0),%%xmm0 \n" |
649 | 0 | "movdqu 0x10(%0),%%xmm1 \n" |
650 | 0 | "lea 0x20(%0),%0 \n" |
651 | 0 | "pshufb %%xmm4,%%xmm0 \n" |
652 | 0 | "pshufb %%xmm5,%%xmm1 \n" |
653 | 0 | "paddusb %%xmm1,%%xmm0 \n" |
654 | 0 | "movq %%xmm0,(%1) \n" |
655 | 0 | "movhlps %%xmm0,%%xmm1 \n" |
656 | 0 | "movd %%xmm1,0x8(%1) \n" |
657 | 0 | "lea 0xc(%1),%1 \n" |
658 | 0 | "sub $0xc,%2 \n" |
659 | 0 | "jg 1b \n" |
660 | 0 | : "+r"(src_ptr), // %0 |
661 | 0 | "+r"(dst_ptr), // %1 |
662 | 0 | "+r"(dst_width) // %2 |
663 | 0 | : "m"(kShuf38a), // %3 |
664 | 0 | "m"(kShuf38b) // %4 |
665 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); |
666 | 0 | } |
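kShuf38a and kShuf38b pick bytes 0, 3 and 6 from each group of 8 (0, 3, 6, 8, 11, 14 per 16 loaded bytes), so this variant is a point-sampled 3/8-width downscale. A scalar sketch (editor's illustration, hypothetical name):

#include <stdint.h>

// Point-sample 8:3 horizontal downscale: keep pixels 0, 3 and 6 of each 8.
static void ScaleRowDown38_C_sketch(const uint8_t* src_ptr, uint8_t* dst_ptr,
                                    int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[3];
    dst_ptr[x + 2] = src_ptr[6];
    src_ptr += 8;
  }
}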
667 | | |
668 | | void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, |
669 | | ptrdiff_t src_stride, |
670 | | uint8_t* dst_ptr, |
671 | 0 | int dst_width) { |
672 | 0 | asm volatile( |
673 | 0 | "movdqa %0,%%xmm2 \n" |
674 | 0 | "movdqa %1,%%xmm3 \n" |
675 | 0 | "movdqa %2,%%xmm4 \n" |
676 | 0 | "movdqa %3,%%xmm5 \n" |
677 | 0 | : |
678 | 0 | : "m"(kShufAb0), // %0 |
679 | 0 | "m"(kShufAb1), // %1 |
680 | 0 | "m"(kShufAb2), // %2 |
681 | 0 | "m"(kScaleAb2) // %3 |
682 | 0 | ); |
683 | 0 | asm volatile( |
684 | 0 | "1: \n" |
685 | 0 | "movdqu (%0),%%xmm0 \n" |
686 | 0 | "movdqu 0x00(%0,%3,1),%%xmm1 \n" |
687 | 0 | "lea 0x10(%0),%0 \n" |
688 | 0 | "pavgb %%xmm1,%%xmm0 \n" |
689 | 0 | "movdqa %%xmm0,%%xmm1 \n" |
690 | 0 | "pshufb %%xmm2,%%xmm1 \n" |
691 | 0 | "movdqa %%xmm0,%%xmm6 \n" |
692 | 0 | "pshufb %%xmm3,%%xmm6 \n" |
693 | 0 | "paddusw %%xmm6,%%xmm1 \n" |
694 | 0 | "pshufb %%xmm4,%%xmm0 \n" |
695 | 0 | "paddusw %%xmm0,%%xmm1 \n" |
696 | 0 | "pmulhuw %%xmm5,%%xmm1 \n" |
697 | 0 | "packuswb %%xmm1,%%xmm1 \n" |
698 | 0 | "movd %%xmm1,(%1) \n" |
699 | 0 | "psrlq $0x10,%%xmm1 \n" |
700 | 0 | "movd %%xmm1,0x2(%1) \n" |
701 | 0 | "lea 0x6(%1),%1 \n" |
702 | 0 | "sub $0x6,%2 \n" |
703 | 0 | "jg 1b \n" |
704 | 0 | : "+r"(src_ptr), // %0 |
705 | 0 | "+r"(dst_ptr), // %1 |
706 | 0 | "+r"(dst_width) // %2 |
707 | 0 | : "r"((intptr_t)(src_stride)) // %3 |
708 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); |
709 | 0 | } |
710 | | |
711 | | void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, |
712 | | ptrdiff_t src_stride, |
713 | | uint8_t* dst_ptr, |
714 | 0 | int dst_width) { |
715 | 0 | asm volatile( |
716 | 0 | "movdqa %0,%%xmm2 \n" |
717 | 0 | "movdqa %1,%%xmm3 \n" |
718 | 0 | "movdqa %2,%%xmm4 \n" |
719 | 0 | "pxor %%xmm5,%%xmm5 \n" |
720 | 0 | : |
721 | 0 | : "m"(kShufAc), // %0 |
722 | 0 | "m"(kShufAc3), // %1 |
723 | 0 | "m"(kScaleAc33) // %2 |
724 | 0 | ); |
725 | 0 | asm volatile( |
726 | 0 | "1: \n" |
727 | 0 | "movdqu (%0),%%xmm0 \n" |
728 | 0 | "movdqu 0x00(%0,%3,1),%%xmm6 \n" |
729 | 0 | "movhlps %%xmm0,%%xmm1 \n" |
730 | 0 | "movhlps %%xmm6,%%xmm7 \n" |
731 | 0 | "punpcklbw %%xmm5,%%xmm0 \n" |
732 | 0 | "punpcklbw %%xmm5,%%xmm1 \n" |
733 | 0 | "punpcklbw %%xmm5,%%xmm6 \n" |
734 | 0 | "punpcklbw %%xmm5,%%xmm7 \n" |
735 | 0 | "paddusw %%xmm6,%%xmm0 \n" |
736 | 0 | "paddusw %%xmm7,%%xmm1 \n" |
737 | 0 | "movdqu 0x00(%0,%3,2),%%xmm6 \n" |
738 | 0 | "lea 0x10(%0),%0 \n" |
739 | 0 | "movhlps %%xmm6,%%xmm7 \n" |
740 | 0 | "punpcklbw %%xmm5,%%xmm6 \n" |
741 | 0 | "punpcklbw %%xmm5,%%xmm7 \n" |
742 | 0 | "paddusw %%xmm6,%%xmm0 \n" |
743 | 0 | "paddusw %%xmm7,%%xmm1 \n" |
744 | 0 | "movdqa %%xmm0,%%xmm6 \n" |
745 | 0 | "psrldq $0x2,%%xmm0 \n" |
746 | 0 | "paddusw %%xmm0,%%xmm6 \n" |
747 | 0 | "psrldq $0x2,%%xmm0 \n" |
748 | 0 | "paddusw %%xmm0,%%xmm6 \n" |
749 | 0 | "pshufb %%xmm2,%%xmm6 \n" |
750 | 0 | "movdqa %%xmm1,%%xmm7 \n" |
751 | 0 | "psrldq $0x2,%%xmm1 \n" |
752 | 0 | "paddusw %%xmm1,%%xmm7 \n" |
753 | 0 | "psrldq $0x2,%%xmm1 \n" |
754 | 0 | "paddusw %%xmm1,%%xmm7 \n" |
755 | 0 | "pshufb %%xmm3,%%xmm7 \n" |
756 | 0 | "paddusw %%xmm7,%%xmm6 \n" |
757 | 0 | "pmulhuw %%xmm4,%%xmm6 \n" |
758 | 0 | "packuswb %%xmm6,%%xmm6 \n" |
759 | 0 | "movd %%xmm6,(%1) \n" |
760 | 0 | "psrlq $0x10,%%xmm6 \n" |
761 | 0 | "movd %%xmm6,0x2(%1) \n" |
762 | 0 | "lea 0x6(%1),%1 \n" |
763 | 0 | "sub $0x6,%2 \n" |
764 | 0 | "jg 1b \n" |
765 | 0 | : "+r"(src_ptr), // %0 |
766 | 0 | "+r"(dst_ptr), // %1 |
767 | 0 | "+r"(dst_width) // %2 |
768 | 0 | : "r"((intptr_t)(src_stride)) // %3 |
769 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", |
770 | 0 | "xmm7"); |
771 | 0 | } |
772 | | |
773 | | static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5, |
774 | | 10, 11, 8, 9, 14, 15, 12, 13}; |
775 | | |
776 | | static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3, |
777 | | 3, 1, 1, 3, 3, 1, 1, 3}; |
778 | | |
779 | | #ifdef HAS_SCALEROWUP2_LINEAR_SSE2 |
780 | | void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr, |
781 | | uint8_t* dst_ptr, |
782 | 0 | int dst_width) { |
783 | 0 | asm volatile( |
784 | 0 | "pxor %%xmm0,%%xmm0 \n" // 0 |
785 | 0 | "pcmpeqw %%xmm6,%%xmm6 \n" |
786 | 0 | "psrlw $15,%%xmm6 \n" |
787 | 0 | "psllw $1,%%xmm6 \n" // all 2 |
788 | |
789 | 0 | LABELALIGN |
790 | 0 | "1: \n" |
791 | 0 | "movq (%0),%%xmm1 \n" // 01234567 |
792 | 0 | "movq 1(%0),%%xmm2 \n" // 12345678 |
793 | 0 | "movdqa %%xmm1,%%xmm3 \n" |
794 | 0 | "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 |
795 | 0 | "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677 |
796 | 0 | "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 |
797 | 0 | "movdqa %%xmm1,%%xmm4 \n" |
798 | 0 | "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16) |
799 | 0 | "movdqa %%xmm2,%%xmm5 \n" |
800 | 0 | "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16) |
801 | 0 | "paddw %%xmm5,%%xmm4 \n" |
802 | 0 | "movdqa %%xmm3,%%xmm5 \n" |
803 | 0 | "paddw %%xmm6,%%xmm4 \n" |
804 | 0 | "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16) |
805 | 0 | "paddw %%xmm5,%%xmm5 \n" |
806 | 0 | "paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo) |
807 | 0 | "psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo) |
808 | |
809 | 0 | "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16) |
810 | 0 | "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) |
811 | 0 | "paddw %%xmm2,%%xmm1 \n" |
812 | 0 | "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) |
813 | 0 | "paddw %%xmm6,%%xmm1 \n" |
814 | 0 | "paddw %%xmm3,%%xmm3 \n" |
815 | 0 | "paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) |
816 | 0 | "psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) |
817 | |
818 | 0 | "packuswb %%xmm1,%%xmm5 \n" |
819 | 0 | "movdqu %%xmm5,(%1) \n" |
820 | |
821 | 0 | "lea 0x8(%0),%0 \n" |
822 | 0 | "lea 0x10(%1),%1 \n" // 8 sample to 16 sample |
823 | 0 | "sub $0x10,%2 \n" |
824 | 0 | "jg 1b \n" |
825 | 0 | : "+r"(src_ptr), // %0 |
826 | 0 | "+r"(dst_ptr), // %1 |
827 | 0 | "+r"(dst_width) // %2 |
828 | 0 | : |
829 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); |
830 | 0 | } |
831 | | #endif |
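As the comments note, every output sample of this 2x horizontal (linear) upscale is (3*near + far + 2) >> 2: three quarters of the closest source pixel plus one quarter of its neighbour on the far side. A scalar sketch of the interior of a row (editor's illustration, hypothetical name; the first and last output pixels are handled by the caller):

#include <stdint.h>

// 1:2 horizontal upscale with bilinear taps 3/4 and 1/4.
static void ScaleRowUp2_Linear_C_sketch(const uint8_t* src_ptr,
                                        uint8_t* dst_ptr, int dst_width) {
  int src_width = dst_width / 2;
  for (int x = 0; x < src_width; ++x) {
    dst_ptr[2 * x + 0] = (uint8_t)((src_ptr[x] * 3 + src_ptr[x + 1] + 2) >> 2);
    dst_ptr[2 * x + 1] = (uint8_t)((src_ptr[x] + src_ptr[x + 1] * 3 + 2) >> 2);
  }
}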
832 | | |
833 | | #ifdef HAS_SCALEROWUP2_BILINEAR_SSE2 |
834 | | void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, |
835 | | ptrdiff_t src_stride, |
836 | | uint8_t* dst_ptr, |
837 | | ptrdiff_t dst_stride, |
838 | 0 | int dst_width) { |
839 | 0 | asm volatile( |
840 | 0 | "1: \n" |
841 | 0 | "pxor %%xmm0,%%xmm0 \n" // 0 |
842 | | // above line |
843 | 0 | "movq (%0),%%xmm1 \n" // 01234567 |
844 | 0 | "movq 1(%0),%%xmm2 \n" // 12345678 |
845 | 0 | "movdqa %%xmm1,%%xmm3 \n" |
846 | 0 | "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 |
847 | 0 | "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677 |
848 | 0 | "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 |
849 | |
850 | 0 | "movdqa %%xmm1,%%xmm4 \n" |
851 | 0 | "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16) |
852 | 0 | "movdqa %%xmm2,%%xmm5 \n" |
853 | 0 | "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16) |
854 | 0 | "paddw %%xmm5,%%xmm4 \n" // near+far |
855 | 0 | "movdqa %%xmm3,%%xmm5 \n" |
856 | 0 | "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16) |
857 | 0 | "paddw %%xmm5,%%xmm5 \n" // 2*near |
858 | 0 | "paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo) |
859 | |
860 | 0 | "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16) |
861 | 0 | "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) |
862 | 0 | "paddw %%xmm2,%%xmm1 \n" |
863 | 0 | "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) |
864 | 0 | "paddw %%xmm3,%%xmm3 \n" // 2*near |
865 | 0 | "paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) |
866 | | |
867 | | // below line |
868 | 0 | "movq (%0,%3),%%xmm6 \n" // 01234567 |
869 | 0 | "movq 1(%0,%3),%%xmm2 \n" // 12345678 |
870 | 0 | "movdqa %%xmm6,%%xmm3 \n" |
871 | 0 | "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 |
872 | 0 | "punpcklbw %%xmm6,%%xmm6 \n" // 0011223344556677 |
873 | 0 | "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 |
874 | |
875 | 0 | "movdqa %%xmm6,%%xmm5 \n" |
876 | 0 | "punpcklbw %%xmm0,%%xmm5 \n" // 00112233 (16) |
877 | 0 | "movdqa %%xmm2,%%xmm7 \n" |
878 | 0 | "punpcklbw %%xmm0,%%xmm7 \n" // 11223344 (16) |
879 | 0 | "paddw %%xmm7,%%xmm5 \n" // near+far |
880 | 0 | "movdqa %%xmm3,%%xmm7 \n" |
881 | 0 | "punpcklbw %%xmm0,%%xmm7 \n" // 01122334 (16) |
882 | 0 | "paddw %%xmm7,%%xmm7 \n" // 2*near |
883 | 0 | "paddw %%xmm7,%%xmm5 \n" // 3*near+far (2, lo) |
884 | |
885 | 0 | "punpckhbw %%xmm0,%%xmm6 \n" // 44556677 (16) |
886 | 0 | "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) |
887 | 0 | "paddw %%xmm6,%%xmm2 \n" // near+far |
888 | 0 | "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) |
889 | 0 | "paddw %%xmm3,%%xmm3 \n" // 2*near |
890 | 0 | "paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi) |
891 | | |
892 | | // xmm4 xmm1 |
893 | | // xmm5 xmm2 |
894 | 0 | "pcmpeqw %%xmm0,%%xmm0 \n" |
895 | 0 | "psrlw $15,%%xmm0 \n" |
896 | 0 | "psllw $3,%%xmm0 \n" // all 8 |
897 | |
898 | 0 | "movdqa %%xmm4,%%xmm3 \n" |
899 | 0 | "movdqa %%xmm5,%%xmm6 \n" |
900 | 0 | "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (1, lo) |
901 | 0 | "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, lo) |
902 | 0 | "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo) |
903 | 0 | "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo) |
904 | 0 | "psrlw $4,%%xmm3 \n" // ^ div by 16 |
905 | |
906 | 0 | "movdqa %%xmm1,%%xmm7 \n" |
907 | 0 | "movdqa %%xmm2,%%xmm6 \n" |
908 | 0 | "paddw %%xmm7,%%xmm7 \n" // 6*near+2*far (1, hi) |
909 | 0 | "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, hi) |
910 | 0 | "paddw %%xmm1,%%xmm7 \n" // 9*near+3*far (1, hi) |
911 | 0 | "paddw %%xmm6,%%xmm7 \n" // 9 3 3 1 + 8 (1, hi) |
912 | 0 | "psrlw $4,%%xmm7 \n" // ^ div by 16 |
913 | |
914 | 0 | "packuswb %%xmm7,%%xmm3 \n" |
915 | 0 | "movdqu %%xmm3,(%1) \n" // save above line |
916 | |
917 | 0 | "movdqa %%xmm5,%%xmm3 \n" |
918 | 0 | "paddw %%xmm0,%%xmm4 \n" // 3*near+far+8 (1, lo) |
919 | 0 | "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, lo) |
920 | 0 | "paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo) |
921 | 0 | "paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (lo) |
922 | 0 | "psrlw $4,%%xmm5 \n" // ^ div by 16 |
923 | |
924 | 0 | "movdqa %%xmm2,%%xmm3 \n" |
925 | 0 | "paddw %%xmm0,%%xmm1 \n" // 3*near+far+8 (1, hi) |
926 | 0 | "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, hi) |
927 | 0 | "paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) |
928 | 0 | "paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (hi) |
929 | 0 | "psrlw $4,%%xmm2 \n" // ^ div by 16 |
930 | |
931 | 0 | "packuswb %%xmm2,%%xmm5 \n" |
932 | 0 | "movdqu %%xmm5,(%1,%4) \n" // save below line |
933 | |
934 | 0 | "lea 0x8(%0),%0 \n" |
935 | 0 | "lea 0x10(%1),%1 \n" // 8 sample to 16 sample |
936 | 0 | "sub $0x10,%2 \n" |
937 | 0 | "jg 1b \n" |
938 | 0 | : "+r"(src_ptr), // %0 |
939 | 0 | "+r"(dst_ptr), // %1 |
940 | 0 | "+r"(dst_width) // %2 |
941 | 0 | : "r"((intptr_t)(src_stride)), // %3 |
942 | 0 | "r"((intptr_t)(dst_stride)) // %4 |
943 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", |
944 | 0 | "xmm7"); |
945 | 0 | } |
946 | | #endif |
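The bilinear (two-row) variant combines the same horizontal 3:1 blend with a vertical 3:1 blend, which multiplies out to the 9/3/3/1 kernel the comments track, plus 8 for rounding before the shift by 4. A scalar sketch (editor's illustration, hypothetical name; s/t are the two source rows, d/e the two output rows):

#include <stddef.h>
#include <stdint.h>

// 1:2 upscale in both directions: 9:3:3:1 blend of the surrounding 2x2 pixels.
static void ScaleRowUp2_Bilinear_C_sketch(const uint8_t* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8_t* dst_ptr,
                                          ptrdiff_t dst_stride, int dst_width) {
  const uint8_t* s = src_ptr;               // upper source row
  const uint8_t* t = src_ptr + src_stride;  // lower source row
  uint8_t* d = dst_ptr;                     // upper output row
  uint8_t* e = dst_ptr + dst_stride;        // lower output row
  int src_width = dst_width / 2;
  for (int x = 0; x < src_width; ++x) {
    d[2 * x + 0] = (uint8_t)((9 * s[x] + 3 * s[x + 1] + 3 * t[x] + t[x + 1] + 8) >> 4);
    d[2 * x + 1] = (uint8_t)((9 * s[x + 1] + 3 * s[x] + 3 * t[x + 1] + t[x] + 8) >> 4);
    e[2 * x + 0] = (uint8_t)((9 * t[x] + 3 * t[x + 1] + 3 * s[x] + s[x + 1] + 8) >> 4);
    e[2 * x + 1] = (uint8_t)((9 * t[x + 1] + 3 * t[x] + 3 * s[x + 1] + s[x] + 8) >> 4);
  }
}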
947 | | |
948 | | #ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3 |
949 | | void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr, |
950 | | uint16_t* dst_ptr, |
951 | 0 | int dst_width) { |
952 | 0 | asm volatile( |
953 | 0 | "movdqa %3,%%xmm5 \n" |
954 | 0 | "pcmpeqw %%xmm4,%%xmm4 \n" |
955 | 0 | "psrlw $15,%%xmm4 \n" |
956 | 0 | "psllw $1,%%xmm4 \n" // all 2 |
957 | |
958 | 0 | LABELALIGN |
959 | 0 | "1: \n" |
960 | 0 | "movdqu (%0),%%xmm0 \n" // 01234567 (16) |
961 | 0 | "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) |
962 | |
963 | 0 | "movdqa %%xmm0,%%xmm2 \n" |
964 | 0 | "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16) |
965 | 0 | "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16) |
966 | |
967 | 0 | "movdqa %%xmm2,%%xmm3 \n" |
968 | 0 | "movdqa %%xmm0,%%xmm1 \n" |
969 | 0 | "pshufb %%xmm5,%%xmm3 \n" // 54657687 (far) |
970 | 0 | "pshufb %%xmm5,%%xmm1 \n" // 10213243 (far) |
971 | |
972 | 0 | "paddw %%xmm4,%%xmm1 \n" // far+2 |
973 | 0 | "paddw %%xmm4,%%xmm3 \n" // far+2 |
974 | 0 | "paddw %%xmm0,%%xmm1 \n" // near+far+2 |
975 | 0 | "paddw %%xmm2,%%xmm3 \n" // near+far+2 |
976 | 0 | "paddw %%xmm0,%%xmm0 \n" // 2*near |
977 | 0 | "paddw %%xmm2,%%xmm2 \n" // 2*near |
978 | 0 | "paddw %%xmm1,%%xmm0 \n" // 3*near+far+2 (lo) |
979 | 0 | "paddw %%xmm3,%%xmm2 \n" // 3*near+far+2 (hi) |
980 | |
981 | 0 | "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far |
982 | 0 | "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far |
983 | 0 | "movdqu %%xmm0,(%1) \n" |
984 | 0 | "movdqu %%xmm2,16(%1) \n" |
985 | |
986 | 0 | "lea 0x10(%0),%0 \n" |
987 | 0 | "lea 0x20(%1),%1 \n" // 8 sample to 16 sample |
988 | 0 | "sub $0x10,%2 \n" |
989 | 0 | "jg 1b \n" |
990 | 0 | : "+r"(src_ptr), // %0 |
991 | 0 | "+r"(dst_ptr), // %1 |
992 | 0 | "+r"(dst_width) // %2 |
993 | 0 | : "m"(kLinearShuffleFar) // %3 |
994 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); |
995 | 0 | } |
996 | | #endif |
997 | | |
998 | | #ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3 |
999 | | void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, |
1000 | | ptrdiff_t src_stride, |
1001 | | uint16_t* dst_ptr, |
1002 | | ptrdiff_t dst_stride, |
1003 | 0 | int dst_width) { |
1004 | 0 | asm volatile( |
1005 | 0 | "pcmpeqw %%xmm7,%%xmm7 \n" |
1006 | 0 | "psrlw $15,%%xmm7 \n" |
1007 | 0 | "psllw $3,%%xmm7 \n" // all 8 |
1008 | 0 | "movdqa %5,%%xmm6 \n" |
1009 | |
1010 | 0 | LABELALIGN |
1011 | 0 | "1: \n" |
1012 | | // above line |
1013 | 0 | "movdqu (%0),%%xmm0 \n" // 01234567 (16) |
1014 | 0 | "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) |
1015 | 0 | "movdqa %%xmm0,%%xmm2 \n" |
1016 | 0 | "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16) |
1017 | 0 | "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16) |
1018 | 0 | "movdqa %%xmm2,%%xmm3 \n" |
1019 | 0 | "movdqa %%xmm0,%%xmm1 \n" |
1020 | 0 | "pshufb %%xmm6,%%xmm3 \n" // 54657687 (far) |
1021 | 0 | "pshufb %%xmm6,%%xmm1 \n" // 10213243 (far) |
1022 | 0 | "paddw %%xmm0,%%xmm1 \n" // near+far |
1023 | 0 | "paddw %%xmm2,%%xmm3 \n" // near+far |
1024 | 0 | "paddw %%xmm0,%%xmm0 \n" // 2*near |
1025 | 0 | "paddw %%xmm2,%%xmm2 \n" // 2*near |
1026 | 0 | "paddw %%xmm1,%%xmm0 \n" // 3*near+far (1, lo) |
1027 | 0 | "paddw %%xmm3,%%xmm2 \n" // 3*near+far (1, hi) |
1028 | | |
1029 | | // below line |
1030 | 0 | "movdqu (%0,%3,2),%%xmm1 \n" // 01234567 (16) |
1031 | 0 | "movdqu 2(%0,%3,2),%%xmm4 \n" // 12345678 (16) |
1032 | 0 | "movdqa %%xmm1,%%xmm3 \n" |
1033 | 0 | "punpckhwd %%xmm4,%%xmm3 \n" // 45566778 (16) |
1034 | 0 | "punpcklwd %%xmm4,%%xmm1 \n" // 01122334 (16) |
1035 | 0 | "movdqa %%xmm3,%%xmm5 \n" |
1036 | 0 | "movdqa %%xmm1,%%xmm4 \n" |
1037 | 0 | "pshufb %%xmm6,%%xmm5 \n" // 54657687 (far) |
1038 | 0 | "pshufb %%xmm6,%%xmm4 \n" // 10213243 (far) |
1039 | 0 | "paddw %%xmm1,%%xmm4 \n" // near+far |
1040 | 0 | "paddw %%xmm3,%%xmm5 \n" // near+far |
1041 | 0 | "paddw %%xmm1,%%xmm1 \n" // 2*near |
1042 | 0 | "paddw %%xmm3,%%xmm3 \n" // 2*near |
1043 | 0 | "paddw %%xmm4,%%xmm1 \n" // 3*near+far (2, lo) |
1044 | 0 | "paddw %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) |
1045 | | |
1046 | | // xmm0 xmm2 |
1047 | | // xmm1 xmm3 |
1048 | |
1049 | 0 | "movdqa %%xmm0,%%xmm4 \n" |
1050 | 0 | "movdqa %%xmm1,%%xmm5 \n" |
1051 | 0 | "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, lo) |
1052 | 0 | "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, lo) |
1053 | 0 | "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) |
1054 | 0 | "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) |
1055 | 0 | "psrlw $4,%%xmm4 \n" // ^ div by 16 |
1056 | 0 | "movdqu %%xmm4,(%1) \n" |
1057 | |
1058 | 0 | "movdqa %%xmm2,%%xmm4 \n" |
1059 | 0 | "movdqa %%xmm3,%%xmm5 \n" |
1060 | 0 | "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, hi) |
1061 | 0 | "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, hi) |
1062 | 0 | "paddw %%xmm2,%%xmm4 \n" // 9*near+3*far (1, hi) |
1063 | 0 | "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, hi) |
1064 | 0 | "psrlw $4,%%xmm4 \n" // ^ div by 16 |
1065 | 0 | "movdqu %%xmm4,0x10(%1) \n" |
1066 | |
1067 | 0 | "movdqa %%xmm1,%%xmm4 \n" |
1068 | 0 | "paddw %%xmm7,%%xmm0 \n" // 3*near+far+8 (1, lo) |
1069 | 0 | "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, lo) |
1070 | 0 | "paddw %%xmm4,%%xmm1 \n" // 9*near+3*far (2, lo) |
1071 | 0 | "paddw %%xmm0,%%xmm1 \n" // 9 3 3 1 + 8 (2, lo) |
1072 | 0 | "psrlw $4,%%xmm1 \n" // ^ div by 16 |
1073 | 0 | "movdqu %%xmm1,(%1,%4,2) \n" |
1074 | |
1075 | 0 | "movdqa %%xmm3,%%xmm4 \n" |
1076 | 0 | "paddw %%xmm7,%%xmm2 \n" // 3*near+far+8 (1, hi) |
1077 | 0 | "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, hi) |
1078 | 0 | "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (2, hi) |
1079 | 0 | "paddw %%xmm2,%%xmm3 \n" // 9 3 3 1 + 8 (2, hi) |
1080 | 0 | "psrlw $4,%%xmm3 \n" // ^ div by 16 |
1081 | 0 | "movdqu %%xmm3,0x10(%1,%4,2) \n" |
1082 | |
1083 | 0 | "lea 0x10(%0),%0 \n" |
1084 | 0 | "lea 0x20(%1),%1 \n" // 8 sample to 16 sample |
1085 | 0 | "sub $0x10,%2 \n" |
1086 | 0 | "jg 1b \n" |
1087 | 0 | : "+r"(src_ptr), // %0 |
1088 | 0 | "+r"(dst_ptr), // %1 |
1089 | 0 | "+r"(dst_width) // %2 |
1090 | 0 | : "r"((intptr_t)(src_stride)), // %3 |
1091 | 0 | "r"((intptr_t)(dst_stride)), // %4 |
1092 | 0 | "m"(kLinearShuffleFar) // %5 |
1093 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", |
1094 | 0 | "xmm7"); |
1095 | 0 | } |
1096 | | #endif |
1097 | | |
1098 | | #ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2 |
1099 | | void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, |
1100 | | uint16_t* dst_ptr, |
1101 | 0 | int dst_width) { |
1102 | 0 | asm volatile( |
1103 | 0 | "pxor %%xmm5,%%xmm5 \n" |
1104 | 0 | "pcmpeqd %%xmm4,%%xmm4 \n" |
1105 | 0 | "psrld $31,%%xmm4 \n" |
1106 | 0 | "pslld $1,%%xmm4 \n" // all 2 |
1107 | |
1108 | 0 | LABELALIGN |
1109 | 0 | "1: \n" |
1110 | 0 | "movq (%0),%%xmm0 \n" // 0123 (16b) |
1111 | 0 | "movq 2(%0),%%xmm1 \n" // 1234 (16b) |
1112 | |
1113 | 0 | "punpcklwd %%xmm5,%%xmm0 \n" // 0123 (32b) |
1114 | 0 | "punpcklwd %%xmm5,%%xmm1 \n" // 1234 (32b) |
1115 | |
1116 | 0 | "movdqa %%xmm0,%%xmm2 \n" |
1117 | 0 | "movdqa %%xmm1,%%xmm3 \n" |
1118 | |
1119 | 0 | "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) |
1120 | 0 | "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) |
1121 | |
1122 | 0 | "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) |
1123 | 0 | "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) |
1124 | 0 | "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) |
1125 | 0 | "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) |
1126 | 0 | "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) |
1127 | 0 | "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) |
1128 | 0 | "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) |
1129 | 0 | "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) |
1130 | |
1131 | 0 | "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) |
1132 | 0 | "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) |
1133 | 0 | "packssdw %%xmm1,%%xmm0 \n" |
1134 | 0 | "pshufd $0b11011000,%%xmm0,%%xmm0 \n" |
1135 | 0 | "movdqu %%xmm0,(%1) \n" |
1136 | |
1137 | 0 | "lea 0x8(%0),%0 \n" |
1138 | 0 | "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel |
1139 | 0 | "sub $0x8,%2 \n" |
1140 | 0 | "jg 1b \n" |
1141 | 0 | : "+r"(src_ptr), // %0 |
1142 | 0 | "+r"(dst_ptr), // %1 |
1143 | 0 | "+r"(dst_width) // %2 |
1144 | 0 | : |
1145 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); |
1146 | 0 | } |
1147 | | #endif |
1148 | | |
1149 | | #ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2 |
1150 | | void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, |
1151 | | ptrdiff_t src_stride, |
1152 | | uint16_t* dst_ptr, |
1153 | | ptrdiff_t dst_stride, |
1154 | 0 | int dst_width) { |
1155 | 0 | asm volatile( |
1156 | 0 | "pxor %%xmm7,%%xmm7 \n" |
1157 | 0 | "pcmpeqd %%xmm6,%%xmm6 \n" |
1158 | 0 | "psrld $31,%%xmm6 \n" |
1159 | 0 | "pslld $3,%%xmm6 \n" // all 8 |
1160 | |
1161 | 0 | LABELALIGN |
1162 | 0 | "1: \n" |
1163 | 0 | "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) |
1164 | 0 | "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) |
1165 | 0 | "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) |
1166 | 0 | "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) |
1167 | 0 | "movdqa %%xmm0,%%xmm2 \n" |
1168 | 0 | "movdqa %%xmm1,%%xmm3 \n" |
1169 | 0 | "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) |
1170 | 0 | "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) |
1171 | 0 | "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) |
1172 | 0 | "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) |
1173 | 0 | "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) |
1174 | 0 | "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) |
1175 | 0 | "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) |
1176 | 0 | "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) |
1177 | |
1178 | 0 | "movq (%0),%%xmm0 \n" // 0123 (16b) |
1179 | 0 | "movq 2(%0),%%xmm1 \n" // 1234 (16b) |
1180 | 0 | "punpcklwd %%xmm7,%%xmm0 \n" // 0123 (32b) |
1181 | 0 | "punpcklwd %%xmm7,%%xmm1 \n" // 1234 (32b) |
1182 | 0 | "movdqa %%xmm0,%%xmm2 \n" |
1183 | 0 | "movdqa %%xmm1,%%xmm3 \n" |
1184 | 0 | "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) |
1185 | 0 | "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) |
1186 | 0 | "paddd %%xmm0,%%xmm2 \n" // near+far (lo) |
1187 | 0 | "paddd %%xmm1,%%xmm3 \n" // near+far (hi) |
1188 | 0 | "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) |
1189 | 0 | "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) |
1190 | 0 | "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) |
1191 | 0 | "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) |
1192 | |
1193 | 0 | "movq (%0,%3,2),%%xmm2 \n" |
1194 | 0 | "movq 2(%0,%3,2),%%xmm3 \n" |
1195 | 0 | "punpcklwd %%xmm7,%%xmm2 \n" // 0123 (32b) |
1196 | 0 | "punpcklwd %%xmm7,%%xmm3 \n" // 1234 (32b) |
1197 | 0 | "movdqa %%xmm2,%%xmm4 \n" |
1198 | 0 | "movdqa %%xmm3,%%xmm5 \n" |
1199 | 0 | "pshufd $0b10110001,%%xmm4,%%xmm4 \n" // 1032 (even, far) |
1200 | 0 | "pshufd $0b10110001,%%xmm5,%%xmm5 \n" // 2143 (odd, far) |
1201 | 0 | "paddd %%xmm2,%%xmm4 \n" // near+far (lo) |
1202 | 0 | "paddd %%xmm3,%%xmm5 \n" // near+far (hi) |
1203 | 0 | "paddd %%xmm2,%%xmm2 \n" // 2*near (lo) |
1204 | 0 | "paddd %%xmm3,%%xmm3 \n" // 2*near (hi) |
1205 | 0 | "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) |
1206 | 0 | "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) |
1207 | |
1208 | 0 | "movdqa %%xmm0,%%xmm4 \n" |
1209 | 0 | "movdqa %%xmm2,%%xmm5 \n" |
1210 | 0 | "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) |
1211 | 0 | "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) |
1212 | 0 | "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) |
1213 | 0 | "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) |
1214 | 0 | "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo) |
1215 | |
1216 | 0 | "movdqa %%xmm2,%%xmm5 \n" |
1217 | 0 | "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) |
1218 | 0 | "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) |
1219 | 0 | "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) |
1220 | 0 | "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) |
1221 | 0 | "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) |
1222 | |
1223 | 0 | "movdqa %%xmm1,%%xmm0 \n" |
1224 | 0 | "movdqa %%xmm3,%%xmm2 \n" |
1225 | 0 | "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) |
1226 | 0 | "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) |
1227 | 0 | "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) |
1228 | 0 | "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) |
1229 | 0 | "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) |
1230 | |
1231 | 0 | "movdqa %%xmm3,%%xmm2 \n" |
1232 | 0 | "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) |
1233 | 0 | "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) |
1234 | 0 | "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) |
1235 | 0 | "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) |
1236 | 0 | "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) |
1237 | |
1238 | 0 | "packssdw %%xmm0,%%xmm4 \n" |
1239 | 0 | "pshufd $0b11011000,%%xmm4,%%xmm4 \n" |
1240 | 0 | "movdqu %%xmm4,(%1) \n" // store above |
1241 | 0 | "packssdw %%xmm2,%%xmm5 \n" |
1242 | 0 | "pshufd $0b11011000,%%xmm5,%%xmm5 \n" |
1243 | 0 | "movdqu %%xmm5,(%1,%4,2) \n" // store below |
1244 | |
1245 | 0 | "lea 0x8(%0),%0 \n" |
1246 | 0 | "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel |
1247 | 0 | "sub $0x8,%2 \n" |
1248 | 0 | "jg 1b \n" |
1249 | 0 | : "+r"(src_ptr), // %0 |
1250 | 0 | "+r"(dst_ptr), // %1 |
1251 | 0 | "+r"(dst_width) // %2 |
1252 | 0 | : "r"((intptr_t)(src_stride)), // %3 |
1253 | 0 | "r"((intptr_t)(dst_stride)) // %4 |
1254 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", |
1255 | 0 | "xmm7"); |
1256 | 0 | } |
1257 | | #endif |
1258 | | |
1259 | | #ifdef HAS_SCALEROWUP2_LINEAR_SSSE3 |
1260 | | void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, |
1261 | | uint8_t* dst_ptr, |
1262 | 0 | int dst_width) { |
1263 | 0 | asm volatile( |
1264 | 0 | "pcmpeqw %%xmm4,%%xmm4 \n" |
1265 | 0 | "psrlw $15,%%xmm4 \n" |
1266 | 0 | "psllw $1,%%xmm4 \n" // all 2 |
1267 | 0 | "movdqa %3,%%xmm3 \n" |
1268 | |
1269 | 0 | LABELALIGN |
1270 | 0 | "1: \n" |
1271 | 0 | "movq (%0),%%xmm0 \n" // 01234567 |
1272 | 0 | "movq 1(%0),%%xmm1 \n" // 12345678 |
1273 | 0 | "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 |
1274 | 0 | "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878 |
1275 | 0 | "movdqa %%xmm0,%%xmm2 \n" |
1276 | 0 | "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878 |
1277 | 0 | "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434 |
1278 | 0 | "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi) |
1279 | 0 | "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo) |
1280 | 0 | "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo) |
1281 | 0 | "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) |
1282 | 0 | "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) |
1283 | 0 | "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) |
1284 | 0 | "packuswb %%xmm2,%%xmm0 \n" |
1285 | 0 | "movdqu %%xmm0,(%1) \n" |
1286 | 0 | "lea 0x8(%0),%0 \n" |
1287 | 0 | "lea 0x10(%1),%1 \n" // 8 sample to 16 sample |
1288 | 0 | "sub $0x10,%2 \n" |
1289 | 0 | "jg 1b \n" |
1290 | 0 | : "+r"(src_ptr), // %0 |
1291 | 0 | "+r"(dst_ptr), // %1 |
1292 | 0 | "+r"(dst_width) // %2 |
1293 | 0 | : "m"(kLinearMadd31) // %3 |
1294 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); |
1295 | 0 | } |
1296 | | #endif |
1297 | | |
1298 | | #ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3 |
1299 | | void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, |
1300 | | ptrdiff_t src_stride, |
1301 | | uint8_t* dst_ptr, |
1302 | | ptrdiff_t dst_stride, |
1303 | 0 | int dst_width) { |
1304 | 0 | asm volatile( |
1305 | 0 | "pcmpeqw %%xmm6,%%xmm6 \n" |
1306 | 0 | "psrlw $15,%%xmm6 \n" |
1307 | 0 | "psllw $3,%%xmm6 \n" // all 8 |
1308 | 0 | "movdqa %5,%%xmm7 \n" |
1309 | |
1310 | 0 | LABELALIGN |
1311 | 0 | "1: \n" |
1312 | 0 | "movq (%0),%%xmm0 \n" // 01234567 |
1313 | 0 | "movq 1(%0),%%xmm1 \n" // 12345678 |
1314 | 0 | "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 |
1315 | 0 | "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878 |
1316 | 0 | "movdqa %%xmm0,%%xmm2 \n" |
1317 | 0 | "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878 |
1318 | 0 | "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434 |
1319 | 0 | "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi) |
1320 | 0 | "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo) |
1321 | |
1322 | 0 | "movq (%0,%3),%%xmm1 \n" |
1323 | 0 | "movq 1(%0,%3),%%xmm4 \n" |
1324 | 0 | "punpcklwd %%xmm1,%%xmm1 \n" |
1325 | 0 | "punpcklwd %%xmm4,%%xmm4 \n" |
1326 | 0 | "movdqa %%xmm1,%%xmm3 \n" |
1327 | 0 | "punpckhdq %%xmm4,%%xmm3 \n" |
1328 | 0 | "punpckldq %%xmm4,%%xmm1 \n" |
1329 | 0 | "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi) |
1330 | 0 | "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo) |
1331 | | |
1332 | | // xmm0 xmm2 |
1333 | | // xmm1 xmm3 |
1334 | |
1335 | 0 | "movdqa %%xmm0,%%xmm4 \n" |
1336 | 0 | "movdqa %%xmm1,%%xmm5 \n" |
1337 | 0 | "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) |
1338 | 0 | "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) |
1339 | 0 | "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) |
1340 | 0 | "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) |
1341 | 0 | "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo) |
1342 | |
1343 | 0 | "movdqa %%xmm1,%%xmm5 \n" |
1344 | 0 | "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo) |
1345 | 0 | "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) |
1346 | 0 | "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo) |
1347 | 0 | "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) |
1348 | 0 | "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo) |
1349 | |
1350 | 0 | "movdqa %%xmm2,%%xmm0 \n" |
1351 | 0 | "movdqa %%xmm3,%%xmm1 \n" |
1352 | 0 | "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi) |
1353 | 0 | "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi) |
1354 | 0 | "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi) |
1355 | 0 | "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) |
1356 | 0 | "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi) |
1357 | |
1358 | 0 | "movdqa %%xmm3,%%xmm1 \n" |
1359 | 0 | "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi) |
1360 | 0 | "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi) |
1361 | 0 | "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi) |
1362 | 0 | "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi) |
1363 | 0 | "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi) |
1364 | |
1365 | 0 | "packuswb %%xmm0,%%xmm4 \n" |
1366 | 0 | "movdqu %%xmm4,(%1) \n" // store above |
1367 | 0 | "packuswb %%xmm1,%%xmm5 \n" |
1368 | 0 | "movdqu %%xmm5,(%1,%4) \n" // store below |
1369 | |
1370 | 0 | "lea 0x8(%0),%0 \n" |
1371 | 0 | "lea 0x10(%1),%1 \n" // 8 sample to 16 sample |
1372 | 0 | "sub $0x10,%2 \n" |
1373 | 0 | "jg 1b \n" |
1374 | 0 | : "+r"(src_ptr), // %0 |
1375 | 0 | "+r"(dst_ptr), // %1 |
1376 | 0 | "+r"(dst_width) // %2 |
1377 | 0 | : "r"((intptr_t)(src_stride)), // %3 |
1378 | 0 | "r"((intptr_t)(dst_stride)), // %4 |
1379 | 0 | "m"(kLinearMadd31) // %5 |
1380 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", |
1381 | 0 | "xmm7"); |
1382 | 0 | } |
1383 | | #endif |
1384 | | |
1385 | | #ifdef HAS_SCALEROWUP2_LINEAR_AVX2 |
1386 | | void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, |
1387 | | uint8_t* dst_ptr, |
1388 | 13.9k | int dst_width) { |
1389 | 13.9k | asm volatile( |
1390 | 13.9k | "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" |
1391 | 13.9k | "vpsrlw $15,%%ymm4,%%ymm4 \n" |
1392 | 13.9k | "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 |
1393 | 13.9k | "vbroadcastf128 %3,%%ymm3 \n" |
1394 | | |
1395 | 13.9k | LABELALIGN |
1396 | 13.9k | "1: \n" |
1397 | 13.9k | "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF |
1398 | 13.9k | "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0 |
1399 | 13.9k | "vpermq $0b11011000,%%ymm0,%%ymm0 \n" |
1400 | 13.9k | "vpermq $0b11011000,%%ymm1,%%ymm1 \n" |
1401 | 13.9k | "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" |
1402 | 13.9k | "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n" |
1403 | 13.9k | "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" |
1404 | 13.9k | "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" |
1405 | 13.9k | "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi) |
1406 | 13.9k | "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo) |
1407 | 13.9k | "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo) |
1408 | 13.9k | "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi) |
1409 | 13.9k | "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) |
1410 | 13.9k | "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) |
1411 | 13.9k | "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" |
1412 | 13.9k | "vmovdqu %%ymm0,(%1) \n" |
1413 | | |
1414 | 13.9k | "lea 0x10(%0),%0 \n" |
1415 | 13.9k | "lea 0x20(%1),%1 \n" // 16 sample to 32 sample |
1416 | 13.9k | "sub $0x20,%2 \n" |
1417 | 13.9k | "jg 1b \n" |
1418 | 13.9k | "vzeroupper \n" |
1419 | 13.9k | : "+r"(src_ptr), // %0 |
1420 | 13.9k | "+r"(dst_ptr), // %1 |
1421 | 13.9k | "+r"(dst_width) // %2 |
1422 | 13.9k | : "m"(kLinearMadd31) // %3 |
1423 | 13.9k | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); |
1424 | 13.9k | } |
1425 | | #endif |
1426 | | |
1427 | | #ifdef HAS_SCALEROWUP2_BILINEAR_AVX2 |
1428 | | void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, |
1429 | | ptrdiff_t src_stride, |
1430 | | uint8_t* dst_ptr, |
1431 | | ptrdiff_t dst_stride, |
1432 | 16.4k | int dst_width) { |
1433 | 16.4k | asm volatile( |
1434 | 16.4k | "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" |
1435 | 16.4k | "vpsrlw $15,%%ymm6,%%ymm6 \n" |
1436 | 16.4k | "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 |
1437 | 16.4k | "vbroadcastf128 %5,%%ymm7 \n" |
1438 | | |
1439 | 16.4k | LABELALIGN |
1440 | 16.4k | "1: \n" |
1441 | 16.4k | "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF |
1442 | 16.4k | "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0 |
1443 | 16.4k | "vpermq $0b11011000,%%ymm0,%%ymm0 \n" |
1444 | 16.4k | "vpermq $0b11011000,%%ymm1,%%ymm1 \n" |
1445 | 16.4k | "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" |
1446 | 16.4k | "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n" |
1447 | 16.4k | "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" |
1448 | 16.4k | "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" |
1449 | 16.4k | "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi) |
1450 | 16.4k | "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo) |
1451 | | |
1452 | 16.4k | "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF |
1453 | 16.4k | "vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0 |
1454 | 16.4k | "vpermq $0b11011000,%%ymm2,%%ymm2 \n" |
1455 | 16.4k | "vpermq $0b11011000,%%ymm3,%%ymm3 \n" |
1456 | 16.4k | "vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n" |
1457 | 16.4k | "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" |
1458 | 16.4k | "vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n" |
1459 | 16.4k | "vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n" |
1460 | 16.4k | "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi) |
1461 | 16.4k | "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo) |
1462 | | |
1463 | | // ymm0 ymm1 |
1464 | | // ymm2 ymm3 |
1465 | | |
1466 | 16.4k | "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) |
1467 | 16.4k | "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) |
1468 | 16.4k | "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) |
1469 | 16.4k | "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) |
1470 | 16.4k | "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) |
1471 | | |
1472 | 16.4k | "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) |
1473 | 16.4k | "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) |
1474 | 16.4k | "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) |
1475 | 16.4k | "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) |
1476 | 16.4k | "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) |
1477 | | |
1478 | 16.4k | "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) |
1479 | 16.4k | "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) |
1480 | 16.4k | "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) |
1481 | 16.4k | "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) |
1482 | 16.4k | "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) |
1483 | | |
1484 | 16.4k | "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) |
1485 | 16.4k | "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) |
1486 | 16.4k | "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) |
1487 | 16.4k | "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) |
1488 | 16.4k | "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) |
1489 | | |
1490 | 16.4k | "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n" |
1491 | 16.4k | "vmovdqu %%ymm4,(%1) \n" // store above |
1492 | 16.4k | "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n" |
1493 | 16.4k | "vmovdqu %%ymm5,(%1,%4) \n" // store below |
1494 | | |
1495 | 16.4k | "lea 0x10(%0),%0 \n" |
1496 | 16.4k | "lea 0x20(%1),%1 \n" // 16 sample to 32 sample |
1497 | 16.4k | "sub $0x20,%2 \n" |
1498 | 16.4k | "jg 1b \n" |
1499 | 16.4k | "vzeroupper \n" |
1500 | 16.4k | : "+r"(src_ptr), // %0 |
1501 | 16.4k | "+r"(dst_ptr), // %1 |
1502 | 16.4k | "+r"(dst_width) // %2 |
1503 | 16.4k | : "r"((intptr_t)(src_stride)), // %3 |
1504 | 16.4k | "r"((intptr_t)(dst_stride)), // %4 |
1505 | 16.4k | "m"(kLinearMadd31) // %5 |
1506 | 16.4k | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", |
1507 | 16.4k | "xmm7"); |
1508 | 16.4k | } |
1509 | | #endif |
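// Editor's note: a scalar sketch (not libyuv code; name hypothetical) of the
// 2x2 bilinear upsample performed by the kernels above. Each output pixel is
// (9*nearest + 3*horizontal + 3*vertical + 1*diagonal + 8) >> 4, which is the
// "9 3 3 1 + 8" noted in the asm comments; one filtered row is stored
// "above" and one "below".
static void ScaleRowUp2_Bilinear_C_Sketch(const uint8_t* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8_t* dst_ptr,
                                          ptrdiff_t dst_stride,
                                          int dst_width) {
  const uint8_t* r0 = src_ptr;               // nearer source row
  const uint8_t* r1 = src_ptr + src_stride;  // farther source row
  uint8_t* d0 = dst_ptr;                     // output row stored above
  uint8_t* d1 = dst_ptr + dst_stride;        // output row stored below
  int src_width = dst_width >> 1;
  int x;
  for (x = 0; x < src_width; ++x) {
    int a = r0[x], b = r0[x + 1], c = r1[x], d = r1[x + 1];
    d0[2 * x + 0] = (uint8_t)((9 * a + 3 * b + 3 * c + d + 8) >> 4);
    d0[2 * x + 1] = (uint8_t)((3 * a + 9 * b + c + 3 * d + 8) >> 4);
    d1[2 * x + 0] = (uint8_t)((3 * a + b + 9 * c + 3 * d + 8) >> 4);
    d1[2 * x + 1] = (uint8_t)((a + 3 * b + 3 * c + 9 * d + 8) >> 4);
  }
}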
1510 | | |
1511 | | #ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2 |
1512 | | void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr, |
1513 | | uint16_t* dst_ptr, |
1514 | 76.0k | int dst_width) { |
1515 | 76.0k | asm volatile( |
1516 | 76.0k | "vbroadcastf128 %3,%%ymm5 \n" |
1517 | 76.0k | "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" |
1518 | 76.0k | "vpsrlw $15,%%ymm4,%%ymm4 \n" |
1519 | 76.0k | "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 |
1520 | | |
1521 | 76.0k | LABELALIGN |
1522 | 76.0k | "1: \n" |
1523 | 76.0k | "vmovdqu (%0),%%ymm0 \n" // 0123456789ABCDEF (16b) |
1524 | 76.0k | "vmovdqu 2(%0),%%ymm1 \n" // 123456789ABCDEF0 (16b) |
1525 | | |
1526 | 76.0k | "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 012389AB4567CDEF |
1527 | 76.0k | "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 12349ABC5678DEF0 |
1528 | | |
1529 | 76.0k | "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" // 899AABBCCDDEEFF0 (near) |
1530 | 76.0k | "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) |
1531 | 76.0k | "vpshufb %%ymm5,%%ymm2,%%ymm3 \n" // 98A9BACBDCEDFE0F (far) |
1532 | 76.0k | "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) |
1533 | | |
1534 | 76.0k | "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // far+2 |
1535 | 76.0k | "vpaddw %%ymm4,%%ymm3,%%ymm3 \n" // far+2 |
1536 | 76.0k | "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far+2 |
1537 | 76.0k | "vpaddw %%ymm2,%%ymm3,%%ymm3 \n" // near+far+2 |
1538 | 76.0k | "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near |
1539 | 76.0k | "vpaddw %%ymm2,%%ymm2,%%ymm2 \n" // 2*near |
1540 | 76.0k | "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 3*near+far+2 |
1541 | 76.0k | "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 3*near+far+2 |
1542 | | |
1543 | 76.0k | "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far |
1544 | 76.0k | "vpsrlw $2,%%ymm2,%%ymm2 \n" // 3/4*near+1/4*far |
1545 | 76.0k | "vmovdqu %%ymm0,(%1) \n" |
1546 | 76.0k | "vmovdqu %%ymm2,32(%1) \n" |
1547 | | |
1548 | 76.0k | "lea 0x20(%0),%0 \n" |
1549 | 76.0k | "lea 0x40(%1),%1 \n" // 16 sample to 32 sample |
1550 | 76.0k | "sub $0x20,%2 \n" |
1551 | 76.0k | "jg 1b \n" |
1552 | 76.0k | "vzeroupper \n" |
1553 | 76.0k | : "+r"(src_ptr), // %0 |
1554 | 76.0k | "+r"(dst_ptr), // %1 |
1555 | 76.0k | "+r"(dst_width) // %2 |
1556 | 76.0k | : "m"(kLinearShuffleFar) // %3 |
1557 | 76.0k | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); |
1558 | 76.0k | } |
1559 | | #endif |
1560 | | |
1561 | | #ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2 |
1562 | | void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, |
1563 | | ptrdiff_t src_stride, |
1564 | | uint16_t* dst_ptr, |
1565 | | ptrdiff_t dst_stride, |
1566 | 17.1k | int dst_width) { |
1567 | 17.1k | asm volatile( |
1568 | 17.1k | "vbroadcastf128 %5,%%ymm5 \n" |
1569 | 17.1k | "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" |
1570 | 17.1k | "vpsrlw $15,%%ymm4,%%ymm4 \n" |
1571 | 17.1k | "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8 |
1572 | | |
1573 | 17.1k | LABELALIGN |
1574 | 17.1k | "1: \n" |
1575 | | |
1576 | 17.1k | "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b) |
1577 | 17.1k | "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b) |
1578 | 17.1k | "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 |
1579 | 17.1k | "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 |
1580 | 17.1k | "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) |
1581 | 17.1k | "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) |
1582 | 17.1k | "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far |
1583 | 17.1k | "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near |
1584 | 17.1k | "vpaddw %%ymm0,%%ymm1,%%ymm2 \n" // 3*near+far (1) |
1585 | | |
1586 | 17.1k | "vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b) |
1587 | 17.1k | "vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b) |
1588 | 17.1k | "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 |
1589 | 17.1k | "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 |
1590 | 17.1k | "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) |
1591 | 17.1k | "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) |
1592 | 17.1k | "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far |
1593 | 17.1k | "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near |
1594 | 17.1k | "vpaddw %%ymm0,%%ymm1,%%ymm3 \n" // 3*near+far (2) |
1595 | | |
1596 | 17.1k | "vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1) |
1597 | 17.1k | "vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2) |
1598 | 17.1k | "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1) |
1599 | 17.1k | "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1) |
1600 | 17.1k | "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 |
1601 | 17.1k | "vmovdqu %%ymm0,(%1) \n" // store above |
1602 | | |
1603 | 17.1k | "vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2) |
1604 | 17.1k | "vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1) |
1605 | 17.1k | "vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2) |
1606 | 17.1k | "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2) |
1607 | 17.1k | "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 |
1608 | 17.1k | "vmovdqu %%ymm0,(%1,%4,2) \n" // store below |
1609 | | |
1610 | 17.1k | "lea 0x10(%0),%0 \n" |
1611 | 17.1k | "lea 0x20(%1),%1 \n" // 8 sample to 16 sample |
1612 | 17.1k | "sub $0x10,%2 \n" |
1613 | 17.1k | "jg 1b \n" |
1614 | 17.1k | "vzeroupper \n" |
1615 | 17.1k | : "+r"(src_ptr), // %0 |
1616 | 17.1k | "+r"(dst_ptr), // %1 |
1617 | 17.1k | "+r"(dst_width) // %2 |
1618 | 17.1k | : "r"((intptr_t)(src_stride)), // %3 |
1619 | 17.1k | "r"((intptr_t)(dst_stride)), // %4 |
1620 | 17.1k | "m"(kLinearShuffleFar) // %5 |
1621 | 17.1k | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); |
1622 | 17.1k | } |
1623 | | #endif |
1624 | | |
1625 | | #ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2 |
1626 | | void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, |
1627 | | uint16_t* dst_ptr, |
1628 | 0 | int dst_width) { |
1629 | 0 | asm volatile( |
1630 | 0 | "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" |
1631 | 0 | "vpsrld $31,%%ymm4,%%ymm4 \n" |
1632 | 0 | "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 |
1633 | |
1634 | 0 | LABELALIGN |
1635 | 0 | "1: \n" |
1636 | 0 | "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) |
1637 | 0 | "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) |
1638 | |
1639 | 0 | "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) |
1640 | 0 | "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) |
1641 | |
1642 | 0 | "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far) |
1643 | 0 | "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far) |
1644 | |
1645 | 0 | "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo) |
1646 | 0 | "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi) |
1647 | 0 | "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo) |
1648 | 0 | "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi) |
1649 | 0 | "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) |
1650 | 0 | "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) |
1651 | 0 | "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo) |
1652 | 0 | "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi) |
1653 | |
1654 | 0 | "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) |
1655 | 0 | "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) |
1656 | 0 | "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" |
1657 | 0 | "vpshufd $0b11011000,%%ymm0,%%ymm0 \n" |
1658 | 0 | "vmovdqu %%ymm0,(%1) \n" |
1659 | |
1660 | 0 | "lea 0x10(%0),%0 \n" |
1661 | 0 | "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel |
1662 | 0 | "sub $0x10,%2 \n" |
1663 | 0 | "jg 1b \n" |
1664 | 0 | "vzeroupper \n" |
1665 | 0 | : "+r"(src_ptr), // %0 |
1666 | 0 | "+r"(dst_ptr), // %1 |
1667 | 0 | "+r"(dst_width) // %2 |
1668 | 0 | : |
1669 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); |
1670 | 0 | } |
1671 | | #endif |
1672 | | |
1673 | | #ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2 |
1674 | | void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, |
1675 | | ptrdiff_t src_stride, |
1676 | | uint16_t* dst_ptr, |
1677 | | ptrdiff_t dst_stride, |
1678 | 0 | int dst_width) { |
1679 | 0 | asm volatile( |
1680 | 0 | "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" |
1681 | 0 | "vpsrld $31,%%ymm6,%%ymm6 \n" |
1682 | 0 | "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 |
1683 | |
1684 | 0 | LABELALIGN |
1685 | 0 | "1: \n" |
1686 | |
1687 | 0 | "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) |
1688 | 0 | "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) |
1689 | 0 | "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) |
1690 | 0 | "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) |
1691 | 0 | "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far) |
1692 | 0 | "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far) |
1693 | 0 | "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) |
1694 | 0 | "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi) |
1695 | 0 | "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) |
1696 | 0 | "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) |
1697 | 0 | "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (1, lo) |
1698 | 0 | "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (1, hi) |
1699 | |
1700 | 0 | "vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b, 1u1v) |
1701 | 0 | "vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b, 1u1v) |
1702 | 0 | "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v) |
1703 | 0 | "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v) |
1704 | 0 | "vpshufd $0b10110001,%%ymm2,%%ymm4 \n" // 10325476 (lo, far) |
1705 | 0 | "vpshufd $0b10110001,%%ymm3,%%ymm5 \n" // 21436587 (hi, far) |
1706 | 0 | "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) |
1707 | 0 | "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi) |
1708 | 0 | "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo) |
1709 | 0 | "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi) |
1710 | 0 | "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (2, lo) |
1711 | 0 | "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (2, hi) |
1712 | |
1713 | 0 | "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) |
1714 | 0 | "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) |
1715 | 0 | "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) |
1716 | 0 | "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) |
1717 | 0 | "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) |
1718 | |
1719 | 0 | "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) |
1720 | 0 | "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) |
1721 | 0 | "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) |
1722 | 0 | "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) |
1723 | 0 | "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) |
1724 | |
1725 | 0 | "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) |
1726 | 0 | "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) |
1727 | 0 | "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) |
1728 | 0 | "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) |
1729 | 0 | "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) |
1730 | |
1731 | 0 | "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) |
1732 | 0 | "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) |
1733 | 0 | "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) |
1734 | 0 | "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) |
1735 | 0 | "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) |
1736 | |
1737 | 0 | "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n" |
1738 | 0 | "vpshufd $0b11011000,%%ymm4,%%ymm4 \n" |
1739 | 0 | "vmovdqu %%ymm4,(%1) \n" // store above |
1740 | 0 | "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n" |
1741 | 0 | "vpshufd $0b11011000,%%ymm5,%%ymm5 \n" |
1742 | 0 | "vmovdqu %%ymm5,(%1,%4,2) \n" // store below |
1743 | |
1744 | 0 | "lea 0x10(%0),%0 \n" |
1745 | 0 | "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel |
1746 | 0 | "sub $0x10,%2 \n" |
1747 | 0 | "jg 1b \n" |
1748 | 0 | "vzeroupper \n" |
1749 | 0 | : "+r"(src_ptr), // %0 |
1750 | 0 | "+r"(dst_ptr), // %1 |
1751 | 0 | "+r"(dst_width) // %2 |
1752 | 0 | : "r"((intptr_t)(src_stride)), // %3 |
1753 | 0 | "r"((intptr_t)(dst_stride)) // %4 |
1754 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); |
1755 | 0 | } |
1756 | | #endif |
1757 | | |
1758 | | // Reads 16 bytes at a time and accumulates into 16 shorts.
1759 | | void ScaleAddRow_SSE2(const uint8_t* src_ptr, |
1760 | | uint16_t* dst_ptr, |
1761 | 0 | int src_width) { |
1762 | 0 | asm volatile("pxor %%xmm5,%%xmm5 \n" |
1763 | | |
1764 | | // 16 pixel loop. |
1765 | 0 | LABELALIGN |
1766 | 0 | "1: \n" |
1767 | 0 | "movdqu (%0),%%xmm3 \n" |
1768 | 0 | "lea 0x10(%0),%0 \n" // src_ptr += 16 |
1769 | 0 | "movdqu (%1),%%xmm0 \n" |
1770 | 0 | "movdqu 0x10(%1),%%xmm1 \n" |
1771 | 0 | "movdqa %%xmm3,%%xmm2 \n" |
1772 | 0 | "punpcklbw %%xmm5,%%xmm2 \n" |
1773 | 0 | "punpckhbw %%xmm5,%%xmm3 \n" |
1774 | 0 | "paddusw %%xmm2,%%xmm0 \n" |
1775 | 0 | "paddusw %%xmm3,%%xmm1 \n" |
1776 | 0 | "movdqu %%xmm0,(%1) \n" |
1777 | 0 | "movdqu %%xmm1,0x10(%1) \n" |
1778 | 0 | "lea 0x20(%1),%1 \n" |
1779 | 0 | "sub $0x10,%2 \n" |
1780 | 0 | "jg 1b \n" |
1781 | 0 | : "+r"(src_ptr), // %0 |
1782 | 0 | "+r"(dst_ptr), // %1 |
1783 | 0 | "+r"(src_width) // %2 |
1784 | 0 | : |
1785 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); |
1786 | 0 | } |
1787 | | |
1788 | | #ifdef HAS_SCALEADDROW_AVX2 |
1789 | | // Reads 32 bytes and accumulates to 32 shorts at a time. |
1790 | | void ScaleAddRow_AVX2(const uint8_t* src_ptr, |
1791 | | uint16_t* dst_ptr, |
1792 | 559k | int src_width) { |
1793 | 559k | asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" |
1794 | | |
1795 | 559k | LABELALIGN |
1796 | 559k | "1: \n" |
1797 | 559k | "vmovdqu (%0),%%ymm3 \n" |
1798 | 559k | "lea 0x20(%0),%0 \n" // src_ptr += 32 |
1799 | 559k | "vpermq $0xd8,%%ymm3,%%ymm3 \n" |
1800 | 559k | "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" |
1801 | 559k | "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" |
1802 | 559k | "vpaddusw (%1),%%ymm2,%%ymm0 \n" |
1803 | 559k | "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" |
1804 | 559k | "vmovdqu %%ymm0,(%1) \n" |
1805 | 559k | "vmovdqu %%ymm1,0x20(%1) \n" |
1806 | 559k | "lea 0x40(%1),%1 \n" |
1807 | 559k | "sub $0x20,%2 \n" |
1808 | 559k | "jg 1b \n" |
1809 | 559k | "vzeroupper \n" |
1810 | 559k | : "+r"(src_ptr), // %0 |
1811 | 559k | "+r"(dst_ptr), // %1 |
1812 | 559k | "+r"(src_width) // %2 |
1813 | 559k | : |
1814 | 559k | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); |
1815 | 559k | } |
1816 | | #endif // HAS_SCALEADDROW_AVX2 |
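// Editor's note: a scalar sketch (not libyuv code) of what the two
// ScaleAddRow kernels above do: widen one row of 8-bit samples and add it
// into a row of 16-bit accumulators used by the box downscaler. The SSE2 and
// AVX2 versions use paddusw, so the addition saturates at 65535.
static void ScaleAddRow_C_Sketch(const uint8_t* src_ptr,
                                 uint16_t* dst_ptr,
                                 int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    uint32_t sum = (uint32_t)dst_ptr[x] + src_ptr[x];
    dst_ptr[x] = (uint16_t)(sum > 65535 ? 65535 : sum);  // unsigned saturation
  }
}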
1817 | | |
1818 | | // Constant for making pixels signed to avoid pmaddubsw |
1819 | | // saturation. |
1820 | | static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, |
1821 | | 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; |
1822 | | |
1823 | | // Constant for making pixels unsigned and adding .5 for rounding. |
1824 | | static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, |
1825 | | 0x4040, 0x4040, 0x4040, 0x4040}; |
1826 | | |
1827 | | // Bilinear column filtering. SSSE3 version. |
1828 | | void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, |
1829 | | const uint8_t* src_ptr, |
1830 | | int dst_width, |
1831 | | int x, |
1832 | 570k | int dx) { |
1833 | 570k | intptr_t x0, x1, temp_pixel; |
1834 | 570k | asm volatile( |
1835 | 570k | "movd %6,%%xmm2 \n" |
1836 | 570k | "movd %7,%%xmm3 \n" |
1837 | 570k | "movl $0x04040000,%k2 \n" |
1838 | 570k | "movd %k2,%%xmm5 \n" |
1839 | 570k | "pcmpeqb %%xmm6,%%xmm6 \n" |
1840 | 570k | "psrlw $0x9,%%xmm6 \n" // 0x007f007f |
1841 | 570k | "pcmpeqb %%xmm7,%%xmm7 \n" |
1842 | 570k | "psrlw $15,%%xmm7 \n" // 0x00010001 |
1843 | | |
1844 | 570k | "pextrw $0x1,%%xmm2,%k3 \n" |
1845 | 570k | "subl $0x2,%5 \n" |
1846 | 570k | "jl 29f \n" |
1847 | 570k | "movdqa %%xmm2,%%xmm0 \n" |
1848 | 570k | "paddd %%xmm3,%%xmm0 \n" |
1849 | 570k | "punpckldq %%xmm0,%%xmm2 \n" |
1850 | 570k | "punpckldq %%xmm3,%%xmm3 \n" |
1851 | 570k | "paddd %%xmm3,%%xmm3 \n" |
1852 | 570k | "pextrw $0x3,%%xmm2,%k4 \n" |
1853 | | |
1854 | 570k | LABELALIGN |
1855 | 570k | "2: \n" |
1856 | 570k | "movdqa %%xmm2,%%xmm1 \n" |
1857 | 570k | "paddd %%xmm3,%%xmm2 \n" |
1858 | 570k | "movzwl 0x00(%1,%3,1),%k2 \n" |
1859 | 570k | "movd %k2,%%xmm0 \n" |
1860 | 570k | "psrlw $0x9,%%xmm1 \n" |
1861 | 570k | "movzwl 0x00(%1,%4,1),%k2 \n" |
1862 | 570k | "movd %k2,%%xmm4 \n" |
1863 | 570k | "pshufb %%xmm5,%%xmm1 \n" |
1864 | 570k | "punpcklwd %%xmm4,%%xmm0 \n" |
1865 | 570k | "psubb %8,%%xmm0 \n" // make pixels signed. |
1866 | 570k | "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127) + 1
1867 | |
1868 | 570k | "paddusb %%xmm7,%%xmm1 \n" |
1869 | 570k | "pmaddubsw %%xmm0,%%xmm1 \n" |
1870 | 570k | "pextrw $0x1,%%xmm2,%k3 \n" |
1871 | 570k | "pextrw $0x3,%%xmm2,%k4 \n" |
1872 | 570k | "paddw %9,%%xmm1 \n" // make pixels unsigned. |
1873 | 570k | "psrlw $0x7,%%xmm1 \n" |
1874 | 570k | "packuswb %%xmm1,%%xmm1 \n" |
1875 | 570k | "movd %%xmm1,%k2 \n" |
1876 | 570k | "mov %w2,(%0) \n" |
1877 | 570k | "lea 0x2(%0),%0 \n" |
1878 | 570k | "subl $0x2,%5 \n" |
1879 | 570k | "jge 2b \n" |
1880 | | |
1881 | 570k | LABELALIGN |
1882 | 570k | "29: \n" |
1883 | 570k | "addl $0x1,%5 \n" |
1884 | 570k | "jl 99f \n" |
1885 | 570k | "movzwl 0x00(%1,%3,1),%k2 \n" |
1886 | 570k | "movd %k2,%%xmm0 \n" |
1887 | 570k | "psrlw $0x9,%%xmm2 \n" |
1888 | 570k | "pshufb %%xmm5,%%xmm2 \n" |
1889 | 570k | "psubb %8,%%xmm0 \n" // make pixels signed. |
1890 | 570k | "pxor %%xmm6,%%xmm2 \n" |
1891 | 570k | "paddusb %%xmm7,%%xmm2 \n" |
1892 | 570k | "pmaddubsw %%xmm0,%%xmm2 \n" |
1893 | 570k | "paddw %9,%%xmm2 \n" // make pixels unsigned. |
1894 | 570k | "psrlw $0x7,%%xmm2 \n" |
1895 | 570k | "packuswb %%xmm2,%%xmm2 \n" |
1896 | 570k | "movd %%xmm2,%k2 \n" |
1897 | 570k | "mov %b2,(%0) \n" |
1898 | 570k | "99: \n" |
1899 | 570k | : "+r"(dst_ptr), // %0 |
1900 | 570k | "+r"(src_ptr), // %1 |
1901 | 570k | "=&a"(temp_pixel), // %2 |
1902 | 570k | "=&r"(x0), // %3 |
1903 | 570k | "=&r"(x1), // %4 |
1904 | 570k | #if defined(__x86_64__) |
1905 | 570k | "+rm"(dst_width) // %5 |
1906 | | #else |
1907 | | "+m"(dst_width) // %5 |
1908 | | #endif |
1909 | 570k | : "rm"(x), // %6 |
1910 | 570k | "rm"(dx), // %7 |
1911 | 570k | #if defined(__x86_64__) |
1912 | 570k | "x"(kFsub80), // %8 |
1913 | 570k | "x"(kFadd40) // %9 |
1914 | | #else |
1915 | | "m"(kFsub80), // %8 |
1916 | | "m"(kFadd40) // %9 |
1917 | | #endif |
1918 | 570k | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", |
1919 | 570k | "xmm7"); |
1920 | 570k | } |
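// Editor's note: a scalar sketch (not libyuv code; name hypothetical) of the
// column filtering above. x is a 16.16 fixed-point source position stepped by
// dx; the top 7 fraction bits blend the two neighbouring source pixels, and
// the kFsub80/kFadd40 bias plus the final shift amount to rounding by +64
// before dividing by 128.
static void ScaleFilterCols_C_Sketch(uint8_t* dst_ptr,
                                     const uint8_t* src_ptr,
                                     int dst_width,
                                     int x,
                                     int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;         // integer source index
    int f = (x >> 9) & 0x7f;  // 7-bit fraction, as produced by psrlw $0x9
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8_t)((a * (128 - f) + b * f + 64) >> 7);
    x += dx;
  }
}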
1921 | | |
1922 | | // Reads 16 pixels, duplicates them and writes 32 pixels per loop.
1923 | | // Loads and stores are unaligned, so no alignment is required.
1924 | | void ScaleColsUp2_SSE2(uint8_t* dst_ptr, |
1925 | | const uint8_t* src_ptr, |
1926 | | int dst_width, |
1927 | | int x, |
1928 | 0 | int dx) { |
1929 | 0 | (void)x; |
1930 | 0 | (void)dx; |
1931 | 0 | asm volatile( |
1932 | 0 | "1: \n" |
1933 | 0 | "movdqu (%1),%%xmm0 \n" |
1934 | 0 | "lea 0x10(%1),%1 \n" |
1935 | 0 | "movdqa %%xmm0,%%xmm1 \n" |
1936 | 0 | "punpcklbw %%xmm0,%%xmm0 \n" |
1937 | 0 | "punpckhbw %%xmm1,%%xmm1 \n" |
1938 | 0 | "movdqu %%xmm0,(%0) \n" |
1939 | 0 | "movdqu %%xmm1,0x10(%0) \n" |
1940 | 0 | "lea 0x20(%0),%0 \n" |
1941 | 0 | "sub $0x20,%2 \n" |
1942 | 0 | "jg 1b \n" |
1943 | |
1944 | 0 | : "+r"(dst_ptr), // %0 |
1945 | 0 | "+r"(src_ptr), // %1 |
1946 | 0 | "+r"(dst_width) // %2 |
1947 | 0 | : |
1948 | 0 | : "memory", "cc", "xmm0", "xmm1"); |
1949 | 0 | } |
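// Editor's note: the scalar equivalent (editor's sketch, not libyuv code) of
// the pixel duplication above: every source byte is written twice.
static void ScaleColsUp2_C_Sketch(uint8_t* dst_ptr,
                                  const uint8_t* src_ptr,
                                  int dst_width) {
  int x;
  for (x = 0; x < dst_width / 2; ++x) {
    dst_ptr[2 * x + 0] = src_ptr[x];
    dst_ptr[2 * x + 1] = src_ptr[x];
  }
}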
1950 | | |
1951 | | void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, |
1952 | | ptrdiff_t src_stride, |
1953 | | uint8_t* dst_argb, |
1954 | 0 | int dst_width) { |
1955 | 0 | (void)src_stride; |
1956 | 0 | asm volatile( |
1957 | 0 | "1: \n" |
1958 | 0 | "movdqu (%0),%%xmm0 \n" |
1959 | 0 | "movdqu 0x10(%0),%%xmm1 \n" |
1960 | 0 | "lea 0x20(%0),%0 \n" |
1961 | 0 | "shufps $0xdd,%%xmm1,%%xmm0 \n" |
1962 | 0 | "movdqu %%xmm0,(%1) \n" |
1963 | 0 | "lea 0x10(%1),%1 \n" |
1964 | 0 | "sub $0x4,%2 \n" |
1965 | 0 | "jg 1b \n" |
1966 | 0 | : "+r"(src_argb), // %0 |
1967 | 0 | "+r"(dst_argb), // %1 |
1968 | 0 | "+r"(dst_width) // %2 |
1969 | 0 | : |
1970 | 0 | : "memory", "cc", "xmm0", "xmm1"); |
1971 | 0 | } |
1972 | | |
1973 | | void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, |
1974 | | ptrdiff_t src_stride, |
1975 | | uint8_t* dst_argb, |
1976 | 0 | int dst_width) { |
1977 | 0 | (void)src_stride; |
1978 | 0 | asm volatile( |
1979 | 0 | "1: \n" |
1980 | 0 | "movdqu (%0),%%xmm0 \n" |
1981 | 0 | "movdqu 0x10(%0),%%xmm1 \n" |
1982 | 0 | "lea 0x20(%0),%0 \n" |
1983 | 0 | "movdqa %%xmm0,%%xmm2 \n" |
1984 | 0 | "shufps $0x88,%%xmm1,%%xmm0 \n" |
1985 | 0 | "shufps $0xdd,%%xmm1,%%xmm2 \n" |
1986 | 0 | "pavgb %%xmm2,%%xmm0 \n" |
1987 | 0 | "movdqu %%xmm0,(%1) \n" |
1988 | 0 | "lea 0x10(%1),%1 \n" |
1989 | 0 | "sub $0x4,%2 \n" |
1990 | 0 | "jg 1b \n" |
1991 | 0 | : "+r"(src_argb), // %0 |
1992 | 0 | "+r"(dst_argb), // %1 |
1993 | 0 | "+r"(dst_width) // %2 |
1994 | 0 | : |
1995 | 0 | : "memory", "cc", "xmm0", "xmm1"); |
1996 | 0 | } |
1997 | | |
1998 | | void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, |
1999 | | ptrdiff_t src_stride, |
2000 | | uint8_t* dst_argb, |
2001 | 0 | int dst_width) { |
2002 | 0 | asm volatile( |
2003 | 0 | "1: \n" |
2004 | 0 | "movdqu (%0),%%xmm0 \n" |
2005 | 0 | "movdqu 0x10(%0),%%xmm1 \n" |
2006 | 0 | "movdqu 0x00(%0,%3,1),%%xmm2 \n" |
2007 | 0 | "movdqu 0x10(%0,%3,1),%%xmm3 \n" |
2008 | 0 | "lea 0x20(%0),%0 \n" |
2009 | 0 | "pavgb %%xmm2,%%xmm0 \n" |
2010 | 0 | "pavgb %%xmm3,%%xmm1 \n" |
2011 | 0 | "movdqa %%xmm0,%%xmm2 \n" |
2012 | 0 | "shufps $0x88,%%xmm1,%%xmm0 \n" |
2013 | 0 | "shufps $0xdd,%%xmm1,%%xmm2 \n" |
2014 | 0 | "pavgb %%xmm2,%%xmm0 \n" |
2015 | 0 | "movdqu %%xmm0,(%1) \n" |
2016 | 0 | "lea 0x10(%1),%1 \n" |
2017 | 0 | "sub $0x4,%2 \n" |
2018 | 0 | "jg 1b \n" |
2019 | 0 | : "+r"(src_argb), // %0 |
2020 | 0 | "+r"(dst_argb), // %1 |
2021 | 0 | "+r"(dst_width) // %2 |
2022 | 0 | : "r"((intptr_t)(src_stride)) // %3 |
2023 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); |
2024 | 0 | } |
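// Editor's note: a scalar sketch (not libyuv code) of the 2x2 ARGB box blend
// above. The asm averages the two rows with pavgb, splits even/odd pixels
// with shufps, then averages again, i.e. two rounds of (a + b + 1) >> 1 per
// channel rather than a single (sum + 2) >> 2.
static void ScaleARGBRowDown2Box_C_Sketch(const uint8_t* src_argb,
                                          ptrdiff_t src_stride,
                                          uint8_t* dst_argb,
                                          int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    for (c = 0; c < 4; ++c) {  // B, G, R, A
      int p00 = src_argb[8 * x + c];
      int p01 = src_argb[8 * x + 4 + c];
      int p10 = src_argb[8 * x + src_stride + c];
      int p11 = src_argb[8 * x + src_stride + 4 + c];
      int v0 = (p00 + p10 + 1) >> 1;  // vertical pavgb
      int v1 = (p01 + p11 + 1) >> 1;
      dst_argb[4 * x + c] = (uint8_t)((v0 + v1 + 1) >> 1);  // horizontal pavgb
    }
  }
}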
2025 | | |
2026 | | // Reads 4 pixels at a time. |
2027 | | // Alignment requirement: dst_argb 16 byte aligned. |
2028 | | void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, |
2029 | | ptrdiff_t src_stride, |
2030 | | int src_stepx, |
2031 | | uint8_t* dst_argb, |
2032 | 0 | int dst_width) { |
2033 | 0 | intptr_t src_stepx_x4 = (intptr_t)(src_stepx); |
2034 | 0 | intptr_t src_stepx_x12; |
2035 | 0 | (void)src_stride; |
2036 | 0 | asm volatile( |
2037 | 0 | "lea 0x00(,%1,4),%1 \n" |
2038 | 0 | "lea 0x00(%1,%1,2),%4 \n" |
2039 | |
2040 | 0 | LABELALIGN |
2041 | 0 | "1: \n" |
2042 | 0 | "movd (%0),%%xmm0 \n" |
2043 | 0 | "movd 0x00(%0,%1,1),%%xmm1 \n" |
2044 | 0 | "punpckldq %%xmm1,%%xmm0 \n" |
2045 | 0 | "movd 0x00(%0,%1,2),%%xmm2 \n" |
2046 | 0 | "movd 0x00(%0,%4,1),%%xmm3 \n" |
2047 | 0 | "lea 0x00(%0,%1,4),%0 \n" |
2048 | 0 | "punpckldq %%xmm3,%%xmm2 \n" |
2049 | 0 | "punpcklqdq %%xmm2,%%xmm0 \n" |
2050 | 0 | "movdqu %%xmm0,(%2) \n" |
2051 | 0 | "lea 0x10(%2),%2 \n" |
2052 | 0 | "sub $0x4,%3 \n" |
2053 | 0 | "jg 1b \n" |
2054 | 0 | : "+r"(src_argb), // %0 |
2055 | 0 | "+r"(src_stepx_x4), // %1 |
2056 | 0 | "+r"(dst_argb), // %2 |
2057 | 0 | "+r"(dst_width), // %3 |
2058 | 0 | "=&r"(src_stepx_x12) // %4 |
2059 | 0 | : |
2060 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); |
2061 | 0 | } |
2062 | | |
2063 | | // Blends four 2x2 to 4x1. |
2064 | | // Alignment requirement: dst_argb 16 byte aligned. |
2065 | | void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, |
2066 | | ptrdiff_t src_stride, |
2067 | | int src_stepx, |
2068 | | uint8_t* dst_argb, |
2069 | 0 | int dst_width) { |
2070 | 0 | intptr_t src_stepx_x4 = (intptr_t)(src_stepx); |
2071 | 0 | intptr_t src_stepx_x12; |
2072 | 0 | intptr_t row1 = (intptr_t)(src_stride); |
2073 | 0 | asm volatile( |
2074 | 0 | "lea 0x00(,%1,4),%1 \n" |
2075 | 0 | "lea 0x00(%1,%1,2),%4 \n" |
2076 | 0 | "lea 0x00(%0,%5,1),%5 \n" |
2077 | |
2078 | 0 | LABELALIGN |
2079 | 0 | "1: \n" |
2080 | 0 | "movq (%0),%%xmm0 \n" |
2081 | 0 | "movhps 0x00(%0,%1,1),%%xmm0 \n" |
2082 | 0 | "movq 0x00(%0,%1,2),%%xmm1 \n" |
2083 | 0 | "movhps 0x00(%0,%4,1),%%xmm1 \n" |
2084 | 0 | "lea 0x00(%0,%1,4),%0 \n" |
2085 | 0 | "movq (%5),%%xmm2 \n" |
2086 | 0 | "movhps 0x00(%5,%1,1),%%xmm2 \n" |
2087 | 0 | "movq 0x00(%5,%1,2),%%xmm3 \n" |
2088 | 0 | "movhps 0x00(%5,%4,1),%%xmm3 \n" |
2089 | 0 | "lea 0x00(%5,%1,4),%5 \n" |
2090 | 0 | "pavgb %%xmm2,%%xmm0 \n" |
2091 | 0 | "pavgb %%xmm3,%%xmm1 \n" |
2092 | 0 | "movdqa %%xmm0,%%xmm2 \n" |
2093 | 0 | "shufps $0x88,%%xmm1,%%xmm0 \n" |
2094 | 0 | "shufps $0xdd,%%xmm1,%%xmm2 \n" |
2095 | 0 | "pavgb %%xmm2,%%xmm0 \n" |
2096 | 0 | "movdqu %%xmm0,(%2) \n" |
2097 | 0 | "lea 0x10(%2),%2 \n" |
2098 | 0 | "sub $0x4,%3 \n" |
2099 | 0 | "jg 1b \n" |
2100 | 0 | : "+r"(src_argb), // %0 |
2101 | 0 | "+r"(src_stepx_x4), // %1 |
2102 | 0 | "+r"(dst_argb), // %2 |
2103 | 0 | "+rm"(dst_width), // %3 |
2104 | 0 | "=&r"(src_stepx_x12), // %4 |
2105 | 0 | "+r"(row1) // %5 |
2106 | 0 | : |
2107 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); |
2108 | 0 | } |
2109 | | |
2110 | | void ScaleARGBCols_SSE2(uint8_t* dst_argb, |
2111 | | const uint8_t* src_argb, |
2112 | | int dst_width, |
2113 | | int x, |
2114 | 0 | int dx) { |
2115 | 0 | intptr_t x0, x1; |
2116 | 0 | asm volatile( |
2117 | 0 | "movd %5,%%xmm2 \n" |
2118 | 0 | "movd %6,%%xmm3 \n" |
2119 | 0 | "pshufd $0x0,%%xmm2,%%xmm2 \n" |
2120 | 0 | "pshufd $0x11,%%xmm3,%%xmm0 \n" |
2121 | 0 | "paddd %%xmm0,%%xmm2 \n" |
2122 | 0 | "paddd %%xmm3,%%xmm3 \n" |
2123 | 0 | "pshufd $0x5,%%xmm3,%%xmm0 \n" |
2124 | 0 | "paddd %%xmm0,%%xmm2 \n" |
2125 | 0 | "paddd %%xmm3,%%xmm3 \n" |
2126 | 0 | "pshufd $0x0,%%xmm3,%%xmm3 \n" |
2127 | 0 | "pextrw $0x1,%%xmm2,%k0 \n" |
2128 | 0 | "pextrw $0x3,%%xmm2,%k1 \n" |
2129 | 0 | "cmp $0x0,%4 \n" |
2130 | 0 | "jl 99f \n" |
2131 | 0 | "sub $0x4,%4 \n" |
2132 | 0 | "jl 49f \n" |
2133 | |
2134 | 0 | LABELALIGN |
2135 | 0 | "40: \n" |
2136 | 0 | "movd 0x00(%3,%0,4),%%xmm0 \n" |
2137 | 0 | "movd 0x00(%3,%1,4),%%xmm1 \n" |
2138 | 0 | "pextrw $0x5,%%xmm2,%k0 \n" |
2139 | 0 | "pextrw $0x7,%%xmm2,%k1 \n" |
2140 | 0 | "paddd %%xmm3,%%xmm2 \n" |
2141 | 0 | "punpckldq %%xmm1,%%xmm0 \n" |
2142 | 0 | "movd 0x00(%3,%0,4),%%xmm1 \n" |
2143 | 0 | "movd 0x00(%3,%1,4),%%xmm4 \n" |
2144 | 0 | "pextrw $0x1,%%xmm2,%k0 \n" |
2145 | 0 | "pextrw $0x3,%%xmm2,%k1 \n" |
2146 | 0 | "punpckldq %%xmm4,%%xmm1 \n" |
2147 | 0 | "punpcklqdq %%xmm1,%%xmm0 \n" |
2148 | 0 | "movdqu %%xmm0,(%2) \n" |
2149 | 0 | "lea 0x10(%2),%2 \n" |
2150 | 0 | "sub $0x4,%4 \n" |
2151 | 0 | "jge 40b \n" |
2152 | |
2153 | 0 | "49: \n" |
2154 | 0 | "test $0x2,%4 \n" |
2155 | 0 | "je 29f \n" |
2156 | 0 | "movd 0x00(%3,%0,4),%%xmm0 \n" |
2157 | 0 | "movd 0x00(%3,%1,4),%%xmm1 \n" |
2158 | 0 | "pextrw $0x5,%%xmm2,%k0 \n" |
2159 | 0 | "punpckldq %%xmm1,%%xmm0 \n" |
2160 | 0 | "movq %%xmm0,(%2) \n" |
2161 | 0 | "lea 0x8(%2),%2 \n" |
2162 | 0 | "29: \n" |
2163 | 0 | "test $0x1,%4 \n" |
2164 | 0 | "je 99f \n" |
2165 | 0 | "movd 0x00(%3,%0,4),%%xmm0 \n" |
2166 | 0 | "movd %%xmm0,(%2) \n" |
2167 | 0 | "99: \n" |
2168 | 0 | : "=&a"(x0), // %0 |
2169 | 0 | "=&d"(x1), // %1 |
2170 | 0 | "+r"(dst_argb), // %2 |
2171 | 0 | "+r"(src_argb), // %3 |
2172 | 0 | "+r"(dst_width) // %4 |
2173 | 0 | : "rm"(x), // %5 |
2174 | 0 | "rm"(dx) // %6 |
2175 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); |
2176 | 0 | } |
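// Editor's note: a scalar sketch (not libyuv code) of the nearest-neighbour
// ARGB column sampling above: x is a 16.16 fixed-point source position
// stepped by dx, and whole 4-byte pixels are copied.
static void ScaleARGBCols_C_Sketch(uint8_t* dst_argb,
                                   const uint8_t* src_argb,
                                   int dst_width,
                                   int x,
                                   int dx) {
  const uint32_t* src = (const uint32_t*)src_argb;
  uint32_t* dst = (uint32_t*)dst_argb;
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];  // pick the nearest source pixel
    x += dx;
  }
}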
2177 | | |
2178 | | // Reads 4 pixels, duplicates them and writes 8 pixels. |
2179 | | // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. |
2180 | | void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, |
2181 | | const uint8_t* src_argb, |
2182 | | int dst_width, |
2183 | | int x, |
2184 | 0 | int dx) { |
2185 | 0 | (void)x; |
2186 | 0 | (void)dx; |
2187 | 0 | asm volatile( |
2188 | 0 | "1: \n" |
2189 | 0 | "movdqu (%1),%%xmm0 \n" |
2190 | 0 | "lea 0x10(%1),%1 \n" |
2191 | 0 | "movdqa %%xmm0,%%xmm1 \n" |
2192 | 0 | "punpckldq %%xmm0,%%xmm0 \n" |
2193 | 0 | "punpckhdq %%xmm1,%%xmm1 \n" |
2194 | 0 | "movdqu %%xmm0,(%0) \n" |
2195 | 0 | "movdqu %%xmm1,0x10(%0) \n" |
2196 | 0 | "lea 0x20(%0),%0 \n" |
2197 | 0 | "sub $0x8,%2 \n" |
2198 | 0 | "jg 1b \n" |
2199 | |
2200 | 0 | : "+r"(dst_argb), // %0 |
2201 | 0 | "+r"(src_argb), // %1 |
2202 | 0 | "+r"(dst_width) // %2 |
2203 | 0 | : |
2204 | 0 | : "memory", "cc", "xmm0", "xmm1"); |
2205 | 0 | } |
2206 | | |
2207 | | // Shuffle table for arranging 2 pixels into pairs for pmaddubsw |
2208 | | static const uvec8 kShuffleColARGB = { |
2209 | | 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel |
2210 | | 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel |
2211 | | }; |
2212 | | |
2213 | | // Shuffle table for duplicating 2 fractions into 8 bytes each |
2214 | | static const uvec8 kShuffleFractions = { |
2215 | | 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, |
2216 | | }; |
2217 | | |
2218 | | // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version |
2219 | | void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, |
2220 | | const uint8_t* src_argb, |
2221 | | int dst_width, |
2222 | | int x, |
2223 | 0 | int dx) { |
2224 | 0 | intptr_t x0, x1; |
2225 | 0 | asm volatile( |
2226 | 0 | "movdqa %0,%%xmm4 \n" |
2227 | 0 | "movdqa %1,%%xmm5 \n" |
2228 | 0 | : |
2229 | 0 | : "m"(kShuffleColARGB), // %0 |
2230 | 0 | "m"(kShuffleFractions) // %1 |
2231 | 0 | ); |
2232 | |
2233 | 0 | asm volatile( |
2234 | 0 | "movd %5,%%xmm2 \n" |
2235 | 0 | "movd %6,%%xmm3 \n" |
2236 | 0 | "pcmpeqb %%xmm6,%%xmm6 \n" |
2237 | 0 | "psrlw $0x9,%%xmm6 \n" |
2238 | 0 | "pextrw $0x1,%%xmm2,%k3 \n" |
2239 | 0 | "sub $0x2,%2 \n" |
2240 | 0 | "jl 29f \n" |
2241 | 0 | "movdqa %%xmm2,%%xmm0 \n" |
2242 | 0 | "paddd %%xmm3,%%xmm0 \n" |
2243 | 0 | "punpckldq %%xmm0,%%xmm2 \n" |
2244 | 0 | "punpckldq %%xmm3,%%xmm3 \n" |
2245 | 0 | "paddd %%xmm3,%%xmm3 \n" |
2246 | 0 | "pextrw $0x3,%%xmm2,%k4 \n" |
2247 | |
2248 | 0 | LABELALIGN |
2249 | 0 | "2: \n" |
2250 | 0 | "movdqa %%xmm2,%%xmm1 \n" |
2251 | 0 | "paddd %%xmm3,%%xmm2 \n" |
2252 | 0 | "movq 0x00(%1,%3,4),%%xmm0 \n" |
2253 | 0 | "psrlw $0x9,%%xmm1 \n" |
2254 | 0 | "movhps 0x00(%1,%4,4),%%xmm0 \n" |
2255 | 0 | "pshufb %%xmm5,%%xmm1 \n" |
2256 | 0 | "pshufb %%xmm4,%%xmm0 \n" |
2257 | 0 | "pxor %%xmm6,%%xmm1 \n" |
2258 | 0 | "pmaddubsw %%xmm1,%%xmm0 \n" |
2259 | 0 | "psrlw $0x7,%%xmm0 \n" |
2260 | 0 | "pextrw $0x1,%%xmm2,%k3 \n" |
2261 | 0 | "pextrw $0x3,%%xmm2,%k4 \n" |
2262 | 0 | "packuswb %%xmm0,%%xmm0 \n" |
2263 | 0 | "movq %%xmm0,(%0) \n" |
2264 | 0 | "lea 0x8(%0),%0 \n" |
2265 | 0 | "sub $0x2,%2 \n" |
2266 | 0 | "jge 2b \n" |
2267 | |
2268 | 0 | LABELALIGN |
2269 | 0 | "29: \n" |
2270 | 0 | "add $0x1,%2 \n" |
2271 | 0 | "jl 99f \n" |
2272 | 0 | "psrlw $0x9,%%xmm2 \n" |
2273 | 0 | "movq 0x00(%1,%3,4),%%xmm0 \n" |
2274 | 0 | "pshufb %%xmm5,%%xmm2 \n" |
2275 | 0 | "pshufb %%xmm4,%%xmm0 \n" |
2276 | 0 | "pxor %%xmm6,%%xmm2 \n" |
2277 | 0 | "pmaddubsw %%xmm2,%%xmm0 \n" |
2278 | 0 | "psrlw $0x7,%%xmm0 \n" |
2279 | 0 | "packuswb %%xmm0,%%xmm0 \n" |
2280 | 0 | "movd %%xmm0,(%0) \n" |
2281 | |
2282 | 0 | LABELALIGN "99: \n" |
2283 | |
2284 | 0 | : "+r"(dst_argb), // %0 |
2285 | 0 | "+r"(src_argb), // %1 |
2286 | 0 | "+rm"(dst_width), // %2 |
2287 | 0 | "=&r"(x0), // %3 |
2288 | 0 | "=&r"(x1) // %4 |
2289 | 0 | : "rm"(x), // %5 |
2290 | 0 | "rm"(dx) // %6 |
2291 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); |
2292 | 0 | } |
2293 | | |
2294 | | // Divide num by div and return as 16.16 fixed point result. |
2295 | 28.7k | int FixedDiv_X86(int num, int div) { |
2296 | 28.7k | asm volatile( |
2297 | 28.7k | "cdq \n" |
2298 | 28.7k | "shld $0x10,%%eax,%%edx \n" |
2299 | 28.7k | "shl $0x10,%%eax \n" |
2300 | 28.7k | "idiv %1 \n" |
2301 | 28.7k | "mov %0, %%eax \n" |
2302 | 28.7k | : "+a"(num) // %0 |
2303 | 28.7k | : "c"(div) // %1 |
2304 | 28.7k | : "memory", "cc", "edx"); |
2305 | 28.7k | return num; |
2306 | 28.7k | } |
2307 | | |
2308 | | // Divide num - 1 by div - 1 and return as 16.16 fixed point result. |
2309 | 24.5k | int FixedDiv1_X86(int num, int div) { |
2310 | 24.5k | asm volatile( |
2311 | 24.5k | "cdq \n" |
2312 | 24.5k | "shld $0x10,%%eax,%%edx \n" |
2313 | 24.5k | "shl $0x10,%%eax \n" |
2314 | 24.5k | "sub $0x10001,%%eax \n" |
2315 | 24.5k | "sbb $0x0,%%edx \n" |
2316 | 24.5k | "sub $0x1,%1 \n" |
2317 | 24.5k | "idiv %1 \n" |
2318 | 24.5k | "mov %0, %%eax \n" |
2319 | 24.5k | : "+a"(num) // %0 |
2320 | 24.5k | : "c"(div) // %1 |
2321 | 24.5k | : "memory", "cc", "edx"); |
2322 | 24.5k | return num; |
2323 | 24.5k | } |
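// Editor's note: portable equivalents (editor's sketch, not libyuv code) of
// the two helpers above. FixedDiv_X86 returns num / div as a 16.16 fixed-point
// value; FixedDiv1_X86 computes ((num << 16) - 0x10001) / (div - 1), i.e.
// roughly (num - 1) / (div - 1) in 16.16. int64_t is assumed to be available
// from <stdint.h> via the existing includes.
static int FixedDiv_C_Sketch(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}
static int FixedDiv1_C_Sketch(int num, int div) {
  return (int)((((int64_t)num << 16) - 0x00010001) / (div - 1));
}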
2324 | | |
2325 | | #if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) || \ |
2326 | | defined(HAS_SCALEUVROWDOWN2BOX_AVX2) |
2327 | | |
2328 | | // Shuffle table for splitting UV into upper and lower part of register. |
2329 | | static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u, |
2330 | | 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u}; |
2331 | | static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u, |
2332 | | 6u, 14u, 0x80, 0x80, 0x80, 0x80, |
2333 | | 0x80, 0x80, 0x80, 0x80}; |
2334 | | #endif |
2335 | | |
2336 | | #ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3 |
2337 | | |
2338 | | void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, |
2339 | | ptrdiff_t src_stride, |
2340 | | uint8_t* dst_ptr, |
2341 | 0 | int dst_width) { |
2342 | 0 | asm volatile( |
2343 | 0 | "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 |
2344 | 0 | "psrlw $0xf,%%xmm4 \n" |
2345 | 0 | "packuswb %%xmm4,%%xmm4 \n" |
2346 | 0 | "pxor %%xmm5, %%xmm5 \n" // zero |
2347 | 0 | "movdqa %4,%%xmm1 \n" // split shuffler |
2348 | 0 | "movdqa %5,%%xmm3 \n" // merge shuffler |
2349 | |
2350 | 0 | LABELALIGN |
2351 | 0 | "1: \n" |
2352 | 0 | "movdqu (%0),%%xmm0 \n" // 8 UV row 0 |
2353 | 0 | "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1 |
2354 | 0 | "lea 0x10(%0),%0 \n" |
2355 | 0 | "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv |
2356 | 0 | "pshufb %%xmm1,%%xmm2 \n" |
2357 | 0 | "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add |
2358 | 0 | "pmaddubsw %%xmm4,%%xmm2 \n" |
2359 | 0 | "paddw %%xmm2,%%xmm0 \n" // vertical add |
2360 | 0 | "psrlw $0x1,%%xmm0 \n" // round |
2361 | 0 | "pavgw %%xmm5,%%xmm0 \n" |
2362 | 0 | "pshufb %%xmm3,%%xmm0 \n" // merge uv |
2363 | 0 | "movq %%xmm0,(%1) \n" |
2364 | 0 | "lea 0x8(%1),%1 \n" // 4 UV |
2365 | 0 | "sub $0x4,%2 \n" |
2366 | 0 | "jg 1b \n" |
2367 | 0 | : "+r"(src_ptr), // %0 |
2368 | 0 | "+r"(dst_ptr), // %1 |
2369 | 0 | "+r"(dst_width) // %2 |
2370 | 0 | : "r"((intptr_t)(src_stride)), // %3 |
2371 | 0 | "m"(kShuffleSplitUV), // %4 |
2372 | 0 | "m"(kShuffleMergeUV) // %5 |
2373 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); |
2374 | 0 | } |
2375 | | #endif // HAS_SCALEUVROWDOWN2BOX_SSSE3 |
2376 | | |
2377 | | #ifdef HAS_SCALEUVROWDOWN2BOX_AVX2 |
2378 | | void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, |
2379 | | ptrdiff_t src_stride, |
2380 | | uint8_t* dst_ptr, |
2381 | 0 | int dst_width) { |
2382 | 0 | asm volatile( |
2383 | 0 | "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101 |
2384 | 0 | "vpabsb %%ymm4,%%ymm4 \n" |
2385 | 0 | "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero |
2386 | 0 | "vbroadcastf128 %4,%%ymm1 \n" // split shuffler |
2387 | 0 | "vbroadcastf128 %5,%%ymm3 \n" // merge shuffler |
2388 | |
2389 | 0 | LABELALIGN |
2390 | 0 | "1: \n" |
2391 | 0 | "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0 |
2392 | 0 | "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1 |
2393 | 0 | "lea 0x20(%0),%0 \n" |
2394 | 0 | "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv |
2395 | 0 | "vpshufb %%ymm1,%%ymm2,%%ymm2 \n" |
2396 | 0 | "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add |
2397 | 0 | "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" |
2398 | 0 | "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add |
2399 | 0 | "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round |
2400 | 0 | "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" |
2401 | 0 | "vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv |
2402 | 0 | "vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords |
2403 | 0 | "vmovdqu %%xmm0,(%1) \n" |
2404 | 0 | "lea 0x10(%1),%1 \n" // 8 UV |
2405 | 0 | "sub $0x8,%2 \n" |
2406 | 0 | "jg 1b \n" |
2407 | 0 | "vzeroupper \n" |
2408 | 0 | : "+r"(src_ptr), // %0 |
2409 | 0 | "+r"(dst_ptr), // %1 |
2410 | 0 | "+r"(dst_width) // %2 |
2411 | 0 | : "r"((intptr_t)(src_stride)), // %3 |
2412 | 0 | "m"(kShuffleSplitUV), // %4 |
2413 | 0 | "m"(kShuffleMergeUV) // %5 |
2414 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); |
2415 | 0 | } |
2416 | | #endif // HAS_SCALEUVROWDOWN2BOX_AVX2 |
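// Editor's note: a scalar sketch (not libyuv code) of the 2x2 box average
// that the two ScaleUVRowDown2Box kernels above compute on interleaved UV
// data; the psrlw/pavgw pair in the asm yields the same (sum + 2) >> 2
// rounding used here.
static void ScaleUVRowDown2Box_C_Sketch(const uint8_t* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8_t* dst_ptr,
                                        int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    const uint8_t* s = src_ptr + 4 * x;  // two UV pairs of row 0
    const uint8_t* t = s + src_stride;   // two UV pairs of row 1
    dst_ptr[2 * x + 0] = (uint8_t)((s[0] + s[2] + t[0] + t[2] + 2) >> 2);  // U
    dst_ptr[2 * x + 1] = (uint8_t)((s[1] + s[3] + t[1] + t[3] + 2) >> 2);  // V
  }
}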
2417 | | |
2418 | | static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3, |
2419 | | 3, 1, 3, 1, 1, 3, 1, 3}; |
2420 | | |
2421 | | #ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3 |
2422 | | void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr, |
2423 | | uint8_t* dst_ptr, |
2424 | 0 | int dst_width) { |
2425 | 0 | asm volatile( |
2426 | 0 | "pcmpeqw %%xmm4,%%xmm4 \n" |
2427 | 0 | "psrlw $15,%%xmm4 \n" |
2428 | 0 | "psllw $1,%%xmm4 \n" // all 2 |
2429 | 0 | "movdqa %3,%%xmm3 \n" |
2430 | |
|
2431 | 0 | LABELALIGN |
2432 | 0 | "1: \n" |
2433 | 0 | "movq (%0),%%xmm0 \n" // 00112233 (1u1v) |
2434 | 0 | "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) |
2435 | 0 | "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) |
2436 | 0 | "movdqa %%xmm0,%%xmm2 \n" |
2437 | 0 | "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v) |
2438 | 0 | "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v) |
2439 | 0 | "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (1u1v16, hi) |
2440 | 0 | "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (1u1v16, lo) |
2441 | 0 | "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo) |
2442 | 0 | "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) |
2443 | 0 | "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) |
2444 | 0 | "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) |
2445 | 0 | "packuswb %%xmm2,%%xmm0 \n" |
2446 | 0 | "movdqu %%xmm0,(%1) \n" |
2447 | |
2448 | 0 | "lea 0x8(%0),%0 \n" |
2449 | 0 | "lea 0x10(%1),%1 \n" // 4 uv to 8 uv |
2450 | 0 | "sub $0x8,%2 \n" |
2451 | 0 | "jg 1b \n" |
2452 | 0 | : "+r"(src_ptr), // %0 |
2453 | 0 | "+r"(dst_ptr), // %1 |
2454 | 0 | "+r"(dst_width) // %2 |
2455 | 0 | : "m"(kUVLinearMadd31) // %3 |
2456 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); |
2457 | 0 | } |
2458 | | #endif |
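// Editor's note: a scalar sketch (not libyuv code) of the interleaved-UV 2x
// horizontal upsample above. It is the same (3*near + far + 2) >> 2 filter as
// the luma version, but "near" and "far" are whole UV pairs, hence the 3/1
// weights interleaved per channel in kUVLinearMadd31.
static void ScaleUVRowUp2_Linear_C_Sketch(const uint8_t* src_ptr,
                                          uint8_t* dst_ptr,
                                          int dst_width) {
  int src_width = dst_width >> 1;
  int x, c;
  for (x = 0; x < src_width; ++x) {
    for (c = 0; c < 2; ++c) {  // c = 0 for U, c = 1 for V
      uint8_t s0 = src_ptr[2 * x + c];      // near UV pair
      uint8_t s1 = src_ptr[2 * x + 2 + c];  // far UV pair
      dst_ptr[4 * x + c] = (uint8_t)((3 * s0 + s1 + 2) >> 2);
      dst_ptr[4 * x + 2 + c] = (uint8_t)((s0 + 3 * s1 + 2) >> 2);
    }
  }
}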
2459 | | |
2460 | | #ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3 |
2461 | | void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, |
2462 | | ptrdiff_t src_stride, |
2463 | | uint8_t* dst_ptr, |
2464 | | ptrdiff_t dst_stride, |
2465 | 0 | int dst_width) { |
2466 | 0 | asm volatile( |
2467 | 0 | "pcmpeqw %%xmm6,%%xmm6 \n" |
2468 | 0 | "psrlw $15,%%xmm6 \n" |
2469 | 0 | "psllw $3,%%xmm6 \n" // all 8 |
2470 | 0 | "movdqa %5,%%xmm7 \n" |
2471 | |
|
2472 | 0 | LABELALIGN |
2473 | 0 | "1: \n" |
2474 | 0 | "movq (%0),%%xmm0 \n" // 00112233 (1u1v) |
2475 | 0 | "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) |
2476 | 0 | "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) |
2477 | 0 | "movdqa %%xmm0,%%xmm2 \n" |
2478 | 0 | "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v) |
2479 | 0 | "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v) |
2480 | 0 | "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1u1v16, hi) |
2481 | 0 | "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1u1v16, lo) |
2482 | |
2483 | 0 | "movq (%0,%3),%%xmm1 \n" |
2484 | 0 | "movq 2(%0,%3),%%xmm4 \n" |
2485 | 0 | "punpcklbw %%xmm4,%%xmm1 \n" |
2486 | 0 | "movdqa %%xmm1,%%xmm3 \n" |
2487 | 0 | "punpckhdq %%xmm1,%%xmm3 \n" |
2488 | 0 | "punpckldq %%xmm1,%%xmm1 \n" |
2489 | 0 | "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi) |
2490 | 0 | "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo) |
2491 | | |
2492 | | // xmm0 xmm2 |
2493 | | // xmm1 xmm3 |
2494 | |
2495 | 0 | "movdqa %%xmm0,%%xmm4 \n" |
2496 | 0 | "movdqa %%xmm1,%%xmm5 \n" |
2497 | 0 | "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) |
2498 | 0 | "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) |
2499 | 0 | "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) |
2500 | 0 | "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) |
2501 | 0 | "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo) |
2502 | |
2503 | 0 | "movdqa %%xmm1,%%xmm5 \n" |
2504 | 0 | "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo) |
2505 | 0 | "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) |
2506 | 0 | "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo) |
2507 | 0 | "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) |
2508 | 0 | "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo) |
2509 | |
2510 | 0 | "movdqa %%xmm2,%%xmm0 \n" |
2511 | 0 | "movdqa %%xmm3,%%xmm1 \n" |
2512 | 0 | "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi) |
2513 | 0 | "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi) |
2514 | 0 | "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi) |
2515 | 0 | "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) |
2516 | 0 | "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi) |
2517 | |
2518 | 0 | "movdqa %%xmm3,%%xmm1 \n" |
2519 | 0 | "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi) |
2520 | 0 | "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi) |
2521 | 0 | "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi) |
2522 | 0 | "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi) |
2523 | 0 | "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi) |
2524 | |
2525 | 0 | "packuswb %%xmm0,%%xmm4 \n" |
2526 | 0 | "movdqu %%xmm4,(%1) \n" // store above |
2527 | 0 | "packuswb %%xmm1,%%xmm5 \n" |
2528 | 0 | "movdqu %%xmm5,(%1,%4) \n" // store below |
2529 | |
2530 | 0 | "lea 0x8(%0),%0 \n" |
2531 | 0 | "lea 0x10(%1),%1 \n" // 4 uv to 8 uv |
2532 | 0 | "sub $0x8,%2 \n" |
2533 | 0 | "jg 1b \n" |
2534 | 0 | : "+r"(src_ptr), // %0 |
2535 | 0 | "+r"(dst_ptr), // %1 |
2536 | 0 | "+r"(dst_width) // %2 |
2537 | 0 | : "r"((intptr_t)(src_stride)), // %3 |
2538 | 0 | "r"((intptr_t)(dst_stride)), // %4 |
2539 | 0 | "m"(kUVLinearMadd31) // %5 |
2540 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", |
2541 | 0 | "xmm7"); |
2542 | 0 | } |
2543 | | #endif |
2544 | | |
2545 | | #ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2 |
2546 | | |
2547 | | void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr, |
2548 | | uint8_t* dst_ptr, |
2549 | 0 | int dst_width) { |
2550 | 0 | asm volatile( |
2551 | 0 | "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" |
2552 | 0 | "vpsrlw $15,%%ymm4,%%ymm4 \n" |
2553 | 0 | "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 |
2554 | 0 | "vbroadcastf128 %3,%%ymm3 \n" |
2555 | |
2556 | 0 | LABELALIGN |
2557 | 0 | "1: \n" |
2558 | 0 | "vmovdqu (%0),%%xmm0 \n" |
2559 | 0 | "vmovdqu 2(%0),%%xmm1 \n" |
2560 | 0 | "vpermq $0b11011000,%%ymm0,%%ymm0 \n" |
2561 | 0 | "vpermq $0b11011000,%%ymm1,%%ymm1 \n" |
2562 | 0 | "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" |
2563 | 0 | "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n" |
2564 | 0 | "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" |
2565 | 0 | "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi) |
2566 | 0 | "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo) |
2567 | 0 | "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo) |
2568 | 0 | "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi) |
2569 | 0 | "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) |
2570 | 0 | "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) |
2571 | 0 | "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" |
2572 | 0 | "vmovdqu %%ymm0,(%1) \n" |
2573 | |
2574 | 0 | "lea 0x10(%0),%0 \n" |
2575 | 0 | "lea 0x20(%1),%1 \n" // 8 uv to 16 uv |
2576 | 0 | "sub $0x10,%2 \n" |
2577 | 0 | "jg 1b \n" |
2578 | 0 | "vzeroupper \n" |
2579 | 0 | : "+r"(src_ptr), // %0 |
2580 | 0 | "+r"(dst_ptr), // %1 |
2581 | 0 | "+r"(dst_width) // %2 |
2582 | 0 | : "m"(kUVLinearMadd31) // %3 |
2583 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); |
2584 | 0 | } |
2585 | | #endif |
2586 | | |
2587 | | #ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2 |
2588 | | void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, |
2589 | | ptrdiff_t src_stride, |
2590 | | uint8_t* dst_ptr, |
2591 | | ptrdiff_t dst_stride, |
2592 | 0 | int dst_width) { |
2593 | 0 | asm volatile( |
2594 | 0 | "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" |
2595 | 0 | "vpsrlw $15,%%ymm6,%%ymm6 \n" |
2596 | 0 | "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 |
2597 | 0 | "vbroadcastf128 %5,%%ymm7 \n" |
2598 | |
2599 | 0 | LABELALIGN |
2600 | 0 | "1: \n" |
2601 | 0 | "vmovdqu (%0),%%xmm0 \n" |
2602 | 0 | "vmovdqu 2(%0),%%xmm1 \n" |
2603 | 0 | "vpermq $0b11011000,%%ymm0,%%ymm0 \n" |
2604 | 0 | "vpermq $0b11011000,%%ymm1,%%ymm1 \n" |
2605 | 0 | "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" |
2606 | 0 | "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n" |
2607 | 0 | "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" |
2608 | 0 | "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi) |
2609 | 0 | "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo) |
2610 | |
2611 | 0 | "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF |
2612 | 0 | "vmovdqu 2(%0,%3),%%xmm3 \n" // 123456789ABCDEF0 |
2613 | 0 | "vpermq $0b11011000,%%ymm2,%%ymm2 \n" |
2614 | 0 | "vpermq $0b11011000,%%ymm3,%%ymm3 \n" |
2615 | 0 | "vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n" |
2616 | 0 | "vpunpckhdq %%ymm2,%%ymm2,%%ymm4 \n" |
2617 | 0 | "vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n" |
2618 | 0 | "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi) |
2619 | 0 | "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo) |
2620 | | |
2621 | | // ymm0 ymm1 |
2622 | | // ymm2 ymm3 |
2623 | |
2624 | 0 | "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) |
2625 | 0 | "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) |
2626 | 0 | "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) |
2627 | 0 | "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) |
2628 | 0 | "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) |
2629 | |
2630 | 0 | "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) |
2631 | 0 | "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) |
2632 | 0 | "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) |
2633 | 0 | "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) |
2634 | 0 | "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) |
2635 | |
2636 | 0 | "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) |
2637 | 0 | "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) |
2638 | 0 | "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) |
2639 | 0 | "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) |
2640 | 0 | "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) |
2641 | |
2642 | 0 | "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) |
2643 | 0 | "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) |
2644 | 0 | "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) |
2645 | 0 | "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) |
2646 | 0 | "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) |
2647 | |
2648 | 0 | "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n" |
2649 | 0 | "vmovdqu %%ymm4,(%1) \n" // store above |
2650 | 0 | "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n" |
2651 | 0 | "vmovdqu %%ymm5,(%1,%4) \n" // store below |
2652 | |
|
2653 | 0 | "lea 0x10(%0),%0 \n" |
2654 | 0 | "lea 0x20(%1),%1 \n" // 8 uv to 16 uv |
2655 | 0 | "sub $0x10,%2 \n" |
2656 | 0 | "jg 1b \n" |
2657 | 0 | "vzeroupper \n" |
2658 | 0 | : "+r"(src_ptr), // %0 |
2659 | 0 | "+r"(dst_ptr), // %1 |
2660 | 0 | "+r"(dst_width) // %2 |
2661 | 0 | : "r"((intptr_t)(src_stride)), // %3 |
2662 | 0 | "r"((intptr_t)(dst_stride)), // %4 |
2663 | 0 | "m"(kUVLinearMadd31) // %5 |
2664 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", |
2665 | 0 | "xmm7"); |
2666 | 0 | } |
2667 | | #endif |
2668 | | |
2669 | | #ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 |
2670 | | void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr, |
2671 | | uint16_t* dst_ptr, |
2672 | 0 | int dst_width) { |
2673 | 0 | asm volatile( |
2674 | 0 | "pxor %%xmm5,%%xmm5 \n" |
2675 | 0 | "pcmpeqd %%xmm4,%%xmm4 \n" |
2676 | 0 | "psrld $31,%%xmm4 \n" |
2677 | 0 | "pslld $1,%%xmm4 \n" // all 2 |
2678 | |
2679 | 0 | LABELALIGN |
2680 | 0 | "1: \n" |
2681 | 0 | "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) |
2682 | 0 | "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) |
2683 | |
2684 | 0 | "punpcklwd %%xmm5,%%xmm0 \n" // 0011 (32b, 1u1v) |
2685 | 0 | "punpcklwd %%xmm5,%%xmm1 \n" // 1122 (32b, 1u1v) |
2686 | |
2687 | 0 | "movdqa %%xmm0,%%xmm2 \n" |
2688 | 0 | "movdqa %%xmm1,%%xmm3 \n" |
2689 | |
2690 | 0 | "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (lo, far) |
2691 | 0 | "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (hi, far) |
2692 | |
2693 | 0 | "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) |
2694 | 0 | "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) |
2695 | 0 | "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) |
2696 | 0 | "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) |
2697 | 0 | "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) |
2698 | 0 | "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) |
2699 | 0 | "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) |
2700 | 0 | "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) |
2701 | |
2702 | 0 | "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) |
2703 | 0 | "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) |
2704 | 0 | "packusdw %%xmm1,%%xmm0 \n" |
2705 | 0 | "movdqu %%xmm0,(%1) \n" |
2706 | |
2707 | 0 | "lea 0x8(%0),%0 \n" |
2708 | 0 | "lea 0x10(%1),%1 \n" // 2 uv to 4 uv |
2709 | 0 | "sub $0x4,%2 \n" |
2710 | 0 | "jg 1b \n" |
2711 | 0 | : "+r"(src_ptr), // %0 |
2712 | 0 | "+r"(dst_ptr), // %1 |
2713 | 0 | "+r"(dst_width) // %2 |
2714 | 0 | : |
2715 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); |
2716 | 0 | } |
2717 | | #endif |
2718 | | |
2719 | | #ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 |
2720 | | void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr, |
2721 | | ptrdiff_t src_stride, |
2722 | | uint16_t* dst_ptr, |
2723 | | ptrdiff_t dst_stride, |
2724 | 0 | int dst_width) { |
2725 | 0 | asm volatile( |
2726 | 0 | "pxor %%xmm7,%%xmm7 \n" |
2727 | 0 | "pcmpeqd %%xmm6,%%xmm6 \n" |
2728 | 0 | "psrld $31,%%xmm6 \n" |
2729 | 0 | "pslld $3,%%xmm6 \n" // all 8 |
2730 | |
2731 | 0 | LABELALIGN |
2732 | 0 | "1: \n" |
2733 | 0 | "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) |
2734 | 0 | "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) |
2735 | 0 | "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) |
2736 | 0 | "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) |
2737 | 0 | "movdqa %%xmm0,%%xmm2 \n" |
2738 | 0 | "movdqa %%xmm1,%%xmm3 \n" |
2739 | 0 | "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) |
2740 | 0 | "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) |
2741 | 0 | "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) |
2742 | 0 | "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) |
2743 | 0 | "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) |
2744 | 0 | "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) |
2745 | 0 | "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) |
2746 | 0 | "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) |
2747 | |
2748 | 0 | "movq (%0,%3,2),%%xmm2 \n" |
2749 | 0 | "movq 4(%0,%3,2),%%xmm3 \n" |
2750 | 0 | "punpcklwd %%xmm7,%%xmm2 \n" |
2751 | 0 | "punpcklwd %%xmm7,%%xmm3 \n" |
2752 | 0 | "movdqa %%xmm2,%%xmm4 \n" |
2753 | 0 | "movdqa %%xmm3,%%xmm5 \n" |
2754 | 0 | "pshufd $0b01001110,%%xmm4,%%xmm4 \n" // 1100 (far) (2, lo) |
2755 | 0 | "pshufd $0b01001110,%%xmm5,%%xmm5 \n" // 2211 (far) (2, hi) |
2756 | 0 | "paddd %%xmm2,%%xmm4 \n" // near+far (2, lo) |
2757 | 0 | "paddd %%xmm3,%%xmm5 \n" // near+far (2, hi) |
2758 | 0 | "paddd %%xmm2,%%xmm2 \n" // 2*near (2, lo) |
2759 | 0 | "paddd %%xmm3,%%xmm3 \n" // 2*near (2, hi) |
2760 | 0 | "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) |
2761 | 0 | "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) |
2762 | |
2763 | 0 | "movdqa %%xmm0,%%xmm4 \n" |
2764 | 0 | "movdqa %%xmm2,%%xmm5 \n" |
2765 | 0 | "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) |
2766 | 0 | "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) |
2767 | 0 | "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) |
2768 | 0 | "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) |
2769 | 0 | "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo) |
2770 | |
2771 | 0 | "movdqa %%xmm2,%%xmm5 \n" |
2772 | 0 | "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) |
2773 | 0 | "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) |
2774 | 0 | "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) |
2775 | 0 | "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) |
2776 | 0 | "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) |
2777 | |
|
2778 | 0 | "movdqa %%xmm1,%%xmm0 \n" |
2779 | 0 | "movdqa %%xmm3,%%xmm2 \n" |
2780 | 0 | "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) |
2781 | 0 | "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) |
2782 | 0 | "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) |
2783 | 0 | "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) |
2784 | 0 | "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) |
2785 | |
2786 | 0 | "movdqa %%xmm3,%%xmm2 \n" |
2787 | 0 | "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) |
2788 | 0 | "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) |
2789 | 0 | "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) |
2790 | 0 | "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) |
2791 | 0 | "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) |
2792 | |
2793 | 0 | "packusdw %%xmm0,%%xmm4 \n" |
2794 | 0 | "movdqu %%xmm4,(%1) \n" // store above |
2795 | 0 | "packusdw %%xmm2,%%xmm5 \n" |
2796 | 0 | "movdqu %%xmm5,(%1,%4,2) \n" // store below |
2797 | |
2798 | 0 | "lea 0x8(%0),%0 \n" |
2799 | 0 | "lea 0x10(%1),%1 \n" // 2 uv to 4 uv |
2800 | 0 | "sub $0x4,%2 \n" |
2801 | 0 | "jg 1b \n" |
2802 | 0 | : "+r"(src_ptr), // %0 |
2803 | 0 | "+r"(dst_ptr), // %1 |
2804 | 0 | "+r"(dst_width) // %2 |
2805 | 0 | : "r"((intptr_t)(src_stride)), // %3 |
2806 | 0 | "r"((intptr_t)(dst_stride)) // %4 |
2807 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", |
2808 | 0 | "xmm7"); |
2809 | 0 | } |
2810 | | #endif |
2811 | | |
2812 | | #ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 |
2813 | | void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, |
2814 | | uint16_t* dst_ptr, |
2815 | 0 | int dst_width) { |
2816 | 0 | asm volatile( |
2817 | 0 | "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" |
2818 | 0 | "vpsrld $31,%%ymm4,%%ymm4 \n" |
2819 | 0 | "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 |
2820 | |
2821 | 0 | LABELALIGN |
2822 | 0 | "1: \n" |
2823 | 0 | "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) |
2824 | 0 | "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) |
2825 | |
2826 | 0 | "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) |
2827 | 0 | "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) |
2828 | |
2829 | 0 | "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) |
2830 | 0 | "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) |
2831 | |
2832 | 0 | "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo) |
2833 | 0 | "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi) |
2834 | 0 | "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo) |
2835 | 0 | "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi) |
2836 | 0 | "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) |
2837 | 0 | "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) |
2838 | 0 | "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo) |
2839 | 0 | "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi) |
2840 | |
2841 | 0 | "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) |
2842 | 0 | "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) |
2843 | 0 | "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" |
2844 | 0 | "vmovdqu %%ymm0,(%1) \n" |
2845 | |
2846 | 0 | "lea 0x10(%0),%0 \n" |
2847 | 0 | "lea 0x20(%1),%1 \n" // 4 uv to 8 uv |
2848 | 0 | "sub $0x8,%2 \n" |
2849 | 0 | "jg 1b \n" |
2850 | 0 | "vzeroupper \n" |
2851 | 0 | : "+r"(src_ptr), // %0 |
2852 | 0 | "+r"(dst_ptr), // %1 |
2853 | 0 | "+r"(dst_width) // %2 |
2854 | 0 | : |
2855 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); |
2856 | 0 | } |
2857 | | #endif |
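/*
 * A minimal scalar sketch (not part of libyuv) of the arithmetic in the
 * 16-bit 2x linear kernel above: each output sample blends the nearest
 * source sample with its horizontal neighbour in a 3:1 ratio, with +2
 * rounding before the divide by 4.  One source sample yields two outputs,
 * one blended with the left neighbour and one with the right; the names
 * below are illustrative only.
 */
#include <stdint.h>

static inline void LinearUp2_16(uint16_t left, uint16_t mid, uint16_t right,
                                uint16_t* out_lo, uint16_t* out_hi) {
  *out_lo = (uint16_t)((3 * mid + left + 2) >> 2);   /* (3*near + far + 2) / 4 */
  *out_hi = (uint16_t)((3 * mid + right + 2) >> 2);  /* (3*near + far + 2) / 4 */
}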
2858 | | |
2859 | | #ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 |
2860 | | void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, |
2861 | | ptrdiff_t src_stride, |
2862 | | uint16_t* dst_ptr, |
2863 | | ptrdiff_t dst_stride, |
2864 | 0 | int dst_width) { |
2865 | 0 | asm volatile( |
2866 | 0 | "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" |
2867 | 0 | "vpsrld $31,%%ymm6,%%ymm6 \n" |
2868 | 0 | "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 |
2869 | |
2870 | 0 | LABELALIGN |
2871 | 0 | "1: \n" |
2872 | |
2873 | 0 | "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) |
2874 | 0 | "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) |
2875 | 0 | "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) |
2876 | 0 | "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) |
2877 | 0 | "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) |
2878 | 0 | "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) |
2879 | 0 | "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) |
2880 | 0 | "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi) |
2881 | 0 | "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) |
2882 | 0 | "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) |
2883 | 0 | "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (lo) |
2884 | 0 | "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (hi) |
2885 | |
2886 | 0 | "vmovdqu (%0,%3,2),%%xmm2 \n" // 00112233 (16b, 1u1v) |
2887 | 0 | "vmovdqu 4(%0,%3,2),%%xmm3 \n" // 11223344 (16b, 1u1v) |
2888 | 0 | "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v) |
2889 | 0 | "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v) |
2890 | 0 | "vpshufd $0b01001110,%%ymm2,%%ymm4 \n" // 11003322 (lo, far) |
2891 | 0 | "vpshufd $0b01001110,%%ymm3,%%ymm5 \n" // 22114433 (hi, far) |
2892 | 0 | "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) |
2893 | 0 | "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi) |
2894 | 0 | "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo) |
2895 | 0 | "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi) |
2896 | 0 | "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (lo) |
2897 | 0 | "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (hi) |
2898 | |
2899 | 0 | "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) |
2900 | 0 | "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) |
2901 | 0 | "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) |
2902 | 0 | "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) |
2903 | 0 | "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) |
2904 | |
2905 | 0 | "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) |
2906 | 0 | "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) |
2907 | 0 | "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) |
2908 | 0 | "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) |
2909 | 0 | "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) |
2910 | |
2911 | 0 | "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) |
2912 | 0 | "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) |
2913 | 0 | "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) |
2914 | 0 | "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) |
2915 | 0 | "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) |
2916 | |
2917 | 0 | "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) |
2918 | 0 | "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) |
2919 | 0 | "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) |
2920 | 0 | "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) |
2921 | 0 | "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) |
2922 | |
2923 | 0 | "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n" |
2924 | 0 | "vmovdqu %%ymm4,(%1) \n" // store above |
2925 | 0 | "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n" |
2926 | 0 | "vmovdqu %%ymm5,(%1,%4,2) \n" // store below |
2927 | |
2928 | 0 | "lea 0x10(%0),%0 \n" |
2929 | 0 | "lea 0x20(%1),%1 \n" // 4 uv to 8 uv |
2930 | 0 | "sub $0x8,%2 \n" |
2931 | 0 | "jg 1b \n" |
2932 | 0 | "vzeroupper \n" |
2933 | 0 | : "+r"(src_ptr), // %0 |
2934 | 0 | "+r"(dst_ptr), // %1 |
2935 | 0 | "+r"(dst_width) // %2 |
2936 | 0 | : "r"((intptr_t)(src_stride)), // %3 |
2937 | 0 | "r"((intptr_t)(dst_stride)) // %4 |
2938 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); |
2939 | 0 | } |
2940 | | #endif |
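/*
 * Note (not from the source): the vpcmpeqd / vpsrld / vpslld sequences at the
 * top of the AVX2 kernels above materialize the rounding constants (2 for the
 * linear kernel, 8 for the bilinear kernel) in a register without a memory
 * load.  An equivalent expressed with AVX2 intrinsics, for illustration only:
 */
#include <immintrin.h>

static inline __m256i MakeAllEights(void) {
  __m256i ones = _mm256_cmpeq_epi32(_mm256_setzero_si256(),
                                    _mm256_setzero_si256()); /* all bits set */
  __m256i one = _mm256_srli_epi32(ones, 31);                 /* each dword = 1 */
  return _mm256_slli_epi32(one, 3);                          /* each dword = 8 */
}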
2941 | | |
2942 | | #endif // defined(__x86_64__) || defined(__i386__) |
2943 | | |
2944 | | #ifdef __cplusplus |
2945 | | } // extern "C" |
2946 | | } // namespace libyuv |
2947 | | #endif |