/src/libavif/ext/libyuv/source/rotate_gcc.cc
Line | Count | Source |
1 | | /* |
2 | | * Copyright 2015 The LibYuv Project Authors. All rights reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | |
11 | | #include "libyuv/rotate_row.h" |
12 | | #include "libyuv/row.h" |
13 | | |
14 | | #ifdef __cplusplus |
15 | | namespace libyuv { |
16 | | extern "C" { |
17 | | #endif |
18 | | |
19 | | // This module is for GCC x86 and x64. |
20 | | #if !defined(LIBYUV_DISABLE_X86) && \ |
21 | | (defined(__x86_64__) || defined(__i386__)) && \ |
22 | | !defined(LIBYUV_ENABLE_ROWWIN) |
23 | | |
24 | | // Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit. |
25 | | #if defined(HAS_TRANSPOSEWX8_SSSE3) |
26 | | void TransposeWx8_SSSE3(const uint8_t* src, |
27 | | int src_stride, |
28 | | uint8_t* dst, |
29 | | int dst_stride, |
30 | 0 | int width) { |
31 | 0 | asm volatile( |
32 | | // Read in the data from the source pointer. |
33 | | // First round of bit swap. |
34 | 0 | LABELALIGN |
35 | 0 | "1: \n" |
36 | 0 | "movq (%0),%%xmm0 \n" |
37 | 0 | "movq (%0,%3),%%xmm1 \n" |
38 | 0 | "lea (%0,%3,2),%0 \n" |
39 | 0 | "punpcklbw %%xmm1,%%xmm0 \n" |
40 | 0 | "movq (%0),%%xmm2 \n" |
41 | 0 | "movdqa %%xmm0,%%xmm1 \n" |
42 | 0 | "palignr $0x8,%%xmm1,%%xmm1 \n" |
43 | 0 | "movq (%0,%3),%%xmm3 \n" |
44 | 0 | "lea (%0,%3,2),%0 \n" |
45 | 0 | "punpcklbw %%xmm3,%%xmm2 \n" |
46 | 0 | "movdqa %%xmm2,%%xmm3 \n" |
47 | 0 | "movq (%0),%%xmm4 \n" |
48 | 0 | "palignr $0x8,%%xmm3,%%xmm3 \n" |
49 | 0 | "movq (%0,%3),%%xmm5 \n" |
50 | 0 | "lea (%0,%3,2),%0 \n" |
51 | 0 | "punpcklbw %%xmm5,%%xmm4 \n" |
52 | 0 | "movdqa %%xmm4,%%xmm5 \n" |
53 | 0 | "movq (%0),%%xmm6 \n" |
54 | 0 | "palignr $0x8,%%xmm5,%%xmm5 \n" |
55 | 0 | "movq (%0,%3),%%xmm7 \n" |
56 | 0 | "lea (%0,%3,2),%0 \n" |
57 | 0 | "punpcklbw %%xmm7,%%xmm6 \n" |
58 | 0 | "neg %3 \n" |
59 | 0 | "movdqa %%xmm6,%%xmm7 \n" |
60 | 0 | "lea 0x8(%0,%3,8),%0 \n" |
61 | 0 | "palignr $0x8,%%xmm7,%%xmm7 \n" |
62 | 0 | "neg %3 \n" |
63 | | // Second round of bit swap. |
64 | 0 | "punpcklwd %%xmm2,%%xmm0 \n" |
65 | 0 | "punpcklwd %%xmm3,%%xmm1 \n" |
66 | 0 | "movdqa %%xmm0,%%xmm2 \n" |
67 | 0 | "movdqa %%xmm1,%%xmm3 \n" |
68 | 0 | "palignr $0x8,%%xmm2,%%xmm2 \n" |
69 | 0 | "palignr $0x8,%%xmm3,%%xmm3 \n" |
70 | 0 | "punpcklwd %%xmm6,%%xmm4 \n" |
71 | 0 | "punpcklwd %%xmm7,%%xmm5 \n" |
72 | 0 | "movdqa %%xmm4,%%xmm6 \n" |
73 | 0 | "movdqa %%xmm5,%%xmm7 \n" |
74 | 0 | "palignr $0x8,%%xmm6,%%xmm6 \n" |
75 | 0 | "palignr $0x8,%%xmm7,%%xmm7 \n" |
76 | | // Third round of bit swap. |
77 | | // Write to the destination pointer. |
78 | 0 | "punpckldq %%xmm4,%%xmm0 \n" |
79 | 0 | "movq %%xmm0,(%1) \n" |
80 | 0 | "movdqa %%xmm0,%%xmm4 \n" |
81 | 0 | "palignr $0x8,%%xmm4,%%xmm4 \n" |
82 | 0 | "movq %%xmm4,(%1,%4) \n" |
83 | 0 | "lea (%1,%4,2),%1 \n" |
84 | 0 | "punpckldq %%xmm6,%%xmm2 \n" |
85 | 0 | "movdqa %%xmm2,%%xmm6 \n" |
86 | 0 | "movq %%xmm2,(%1) \n" |
87 | 0 | "palignr $0x8,%%xmm6,%%xmm6 \n" |
88 | 0 | "punpckldq %%xmm5,%%xmm1 \n" |
89 | 0 | "movq %%xmm6,(%1,%4) \n" |
90 | 0 | "lea (%1,%4,2),%1 \n" |
91 | 0 | "movdqa %%xmm1,%%xmm5 \n" |
92 | 0 | "movq %%xmm1,(%1) \n" |
93 | 0 | "palignr $0x8,%%xmm5,%%xmm5 \n" |
94 | 0 | "movq %%xmm5,(%1,%4) \n" |
95 | 0 | "lea (%1,%4,2),%1 \n" |
96 | 0 | "punpckldq %%xmm7,%%xmm3 \n" |
97 | 0 | "movq %%xmm3,(%1) \n" |
98 | 0 | "movdqa %%xmm3,%%xmm7 \n" |
99 | 0 | "palignr $0x8,%%xmm7,%%xmm7 \n" |
100 | 0 | "sub $0x8,%2 \n" |
101 | 0 | "movq %%xmm7,(%1,%4) \n" |
102 | 0 | "lea (%1,%4,2),%1 \n" |
103 | 0 | "jg 1b \n" |
104 | 0 | : "+r"(src), // %0 |
105 | 0 | "+r"(dst), // %1 |
106 | 0 | "+r"(width) // %2 |
107 | 0 | : "r"((intptr_t)(src_stride)), // %3 |
108 | 0 | "r"((intptr_t)(dst_stride)) // %4 |
109 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", |
110 | 0 | "xmm7"); |
111 | 0 | } |
112 | | #endif // defined(HAS_TRANSPOSEWX8_SSSE3) |
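
The three rounds of interleaves above (punpcklbw on bytes, punpcklwd on words, punpckldq on dwords, with palignr peeling off the upper halves) implement a standard 8x8 byte transpose. The scalar semantics the kernel has to reproduce are just an index swap; the sketch below is a hypothetical standalone helper mirroring libyuv's C fallback TransposeWx8_C, assuming the caller guarantees width is a multiple of 8.

#include <stdint.h>

// Column i of the 8 source rows becomes row i of the destination.
static void TransposeWx8_Sketch(const uint8_t* src, int src_stride,
                                uint8_t* dst, int dst_stride, int width) {
  for (int i = 0; i < width; ++i) {
    for (int j = 0; j < 8; ++j) {
      dst[i * dst_stride + j] = src[j * src_stride + i];
    }
  }
}
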
113 | | |
114 | | // Transpose 16x8. 64 bit.
115 | | #if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) |
116 | | void TransposeWx8_Fast_SSSE3(const uint8_t* src, |
117 | | int src_stride, |
118 | | uint8_t* dst, |
119 | | int dst_stride, |
120 | 0 | int width) { |
121 | 0 | asm volatile( |
122 | | // Read in the data from the source pointer. |
123 | | // First round of bit swap. |
124 | 0 | LABELALIGN |
125 | 0 | "1: \n" |
126 | 0 | "movdqu (%0),%%xmm0 \n" |
127 | 0 | "movdqu (%0,%3),%%xmm1 \n" |
128 | 0 | "lea (%0,%3,2),%0 \n" |
129 | 0 | "movdqa %%xmm0,%%xmm8 \n" |
130 | 0 | "punpcklbw %%xmm1,%%xmm0 \n" |
131 | 0 | "punpckhbw %%xmm1,%%xmm8 \n" |
132 | 0 | "movdqu (%0),%%xmm2 \n" |
133 | 0 | "movdqa %%xmm0,%%xmm1 \n" |
134 | 0 | "movdqa %%xmm8,%%xmm9 \n" |
135 | 0 | "palignr $0x8,%%xmm1,%%xmm1 \n" |
136 | 0 | "palignr $0x8,%%xmm9,%%xmm9 \n" |
137 | 0 | "movdqu (%0,%3),%%xmm3 \n" |
138 | 0 | "lea (%0,%3,2),%0 \n" |
139 | 0 | "movdqa %%xmm2,%%xmm10 \n" |
140 | 0 | "punpcklbw %%xmm3,%%xmm2 \n" |
141 | 0 | "punpckhbw %%xmm3,%%xmm10 \n" |
142 | 0 | "movdqa %%xmm2,%%xmm3 \n" |
143 | 0 | "movdqa %%xmm10,%%xmm11 \n" |
144 | 0 | "movdqu (%0),%%xmm4 \n" |
145 | 0 | "palignr $0x8,%%xmm3,%%xmm3 \n" |
146 | 0 | "palignr $0x8,%%xmm11,%%xmm11 \n" |
147 | 0 | "movdqu (%0,%3),%%xmm5 \n" |
148 | 0 | "lea (%0,%3,2),%0 \n" |
149 | 0 | "movdqa %%xmm4,%%xmm12 \n" |
150 | 0 | "punpcklbw %%xmm5,%%xmm4 \n" |
151 | 0 | "punpckhbw %%xmm5,%%xmm12 \n" |
152 | 0 | "movdqa %%xmm4,%%xmm5 \n" |
153 | 0 | "movdqa %%xmm12,%%xmm13 \n" |
154 | 0 | "movdqu (%0),%%xmm6 \n" |
155 | 0 | "palignr $0x8,%%xmm5,%%xmm5 \n" |
156 | 0 | "palignr $0x8,%%xmm13,%%xmm13 \n" |
157 | 0 | "movdqu (%0,%3),%%xmm7 \n" |
158 | 0 | "lea (%0,%3,2),%0 \n" |
159 | 0 | "movdqa %%xmm6,%%xmm14 \n" |
160 | 0 | "punpcklbw %%xmm7,%%xmm6 \n" |
161 | 0 | "punpckhbw %%xmm7,%%xmm14 \n" |
162 | 0 | "neg %3 \n" |
163 | 0 | "movdqa %%xmm6,%%xmm7 \n" |
164 | 0 | "movdqa %%xmm14,%%xmm15 \n" |
165 | 0 | "lea 0x10(%0,%3,8),%0 \n" |
166 | 0 | "palignr $0x8,%%xmm7,%%xmm7 \n" |
167 | 0 | "palignr $0x8,%%xmm15,%%xmm15 \n" |
168 | 0 | "neg %3 \n" |
169 | | // Second round of bit swap. |
170 | 0 | "punpcklwd %%xmm2,%%xmm0 \n" |
171 | 0 | "punpcklwd %%xmm3,%%xmm1 \n" |
172 | 0 | "movdqa %%xmm0,%%xmm2 \n" |
173 | 0 | "movdqa %%xmm1,%%xmm3 \n" |
174 | 0 | "palignr $0x8,%%xmm2,%%xmm2 \n" |
175 | 0 | "palignr $0x8,%%xmm3,%%xmm3 \n" |
176 | 0 | "punpcklwd %%xmm6,%%xmm4 \n" |
177 | 0 | "punpcklwd %%xmm7,%%xmm5 \n" |
178 | 0 | "movdqa %%xmm4,%%xmm6 \n" |
179 | 0 | "movdqa %%xmm5,%%xmm7 \n" |
180 | 0 | "palignr $0x8,%%xmm6,%%xmm6 \n" |
181 | 0 | "palignr $0x8,%%xmm7,%%xmm7 \n" |
182 | 0 | "punpcklwd %%xmm10,%%xmm8 \n" |
183 | 0 | "punpcklwd %%xmm11,%%xmm9 \n" |
184 | 0 | "movdqa %%xmm8,%%xmm10 \n" |
185 | 0 | "movdqa %%xmm9,%%xmm11 \n" |
186 | 0 | "palignr $0x8,%%xmm10,%%xmm10 \n" |
187 | 0 | "palignr $0x8,%%xmm11,%%xmm11 \n" |
188 | 0 | "punpcklwd %%xmm14,%%xmm12 \n" |
189 | 0 | "punpcklwd %%xmm15,%%xmm13 \n" |
190 | 0 | "movdqa %%xmm12,%%xmm14 \n" |
191 | 0 | "movdqa %%xmm13,%%xmm15 \n" |
192 | 0 | "palignr $0x8,%%xmm14,%%xmm14 \n" |
193 | 0 | "palignr $0x8,%%xmm15,%%xmm15 \n" |
194 | | // Third round of bit swap. |
195 | | // Write to the destination pointer. |
196 | 0 | "punpckldq %%xmm4,%%xmm0 \n" |
197 | 0 | "movq %%xmm0,(%1) \n" |
198 | 0 | "movdqa %%xmm0,%%xmm4 \n" |
199 | 0 | "palignr $0x8,%%xmm4,%%xmm4 \n" |
200 | 0 | "movq %%xmm4,(%1,%4) \n" |
201 | 0 | "lea (%1,%4,2),%1 \n" |
202 | 0 | "punpckldq %%xmm6,%%xmm2 \n" |
203 | 0 | "movdqa %%xmm2,%%xmm6 \n" |
204 | 0 | "movq %%xmm2,(%1) \n" |
205 | 0 | "palignr $0x8,%%xmm6,%%xmm6 \n" |
206 | 0 | "punpckldq %%xmm5,%%xmm1 \n" |
207 | 0 | "movq %%xmm6,(%1,%4) \n" |
208 | 0 | "lea (%1,%4,2),%1 \n" |
209 | 0 | "movdqa %%xmm1,%%xmm5 \n" |
210 | 0 | "movq %%xmm1,(%1) \n" |
211 | 0 | "palignr $0x8,%%xmm5,%%xmm5 \n" |
212 | 0 | "movq %%xmm5,(%1,%4) \n" |
213 | 0 | "lea (%1,%4,2),%1 \n" |
214 | 0 | "punpckldq %%xmm7,%%xmm3 \n" |
215 | 0 | "movq %%xmm3,(%1) \n" |
216 | 0 | "movdqa %%xmm3,%%xmm7 \n" |
217 | 0 | "palignr $0x8,%%xmm7,%%xmm7 \n" |
218 | 0 | "movq %%xmm7,(%1,%4) \n" |
219 | 0 | "lea (%1,%4,2),%1 \n" |
220 | 0 | "punpckldq %%xmm12,%%xmm8 \n" |
221 | 0 | "movq %%xmm8,(%1) \n" |
222 | 0 | "movdqa %%xmm8,%%xmm12 \n" |
223 | 0 | "palignr $0x8,%%xmm12,%%xmm12 \n" |
224 | 0 | "movq %%xmm12,(%1,%4) \n" |
225 | 0 | "lea (%1,%4,2),%1 \n" |
226 | 0 | "punpckldq %%xmm14,%%xmm10 \n" |
227 | 0 | "movdqa %%xmm10,%%xmm14 \n" |
228 | 0 | "movq %%xmm10,(%1) \n" |
229 | 0 | "palignr $0x8,%%xmm14,%%xmm14 \n" |
230 | 0 | "punpckldq %%xmm13,%%xmm9 \n" |
231 | 0 | "movq %%xmm14,(%1,%4) \n" |
232 | 0 | "lea (%1,%4,2),%1 \n" |
233 | 0 | "movdqa %%xmm9,%%xmm13 \n" |
234 | 0 | "movq %%xmm9,(%1) \n" |
235 | 0 | "palignr $0x8,%%xmm13,%%xmm13 \n" |
236 | 0 | "movq %%xmm13,(%1,%4) \n" |
237 | 0 | "lea (%1,%4,2),%1 \n" |
238 | 0 | "punpckldq %%xmm15,%%xmm11 \n" |
239 | 0 | "movq %%xmm11,(%1) \n" |
240 | 0 | "movdqa %%xmm11,%%xmm15 \n" |
241 | 0 | "palignr $0x8,%%xmm15,%%xmm15 \n" |
242 | 0 | "sub $0x10,%2 \n" |
243 | 0 | "movq %%xmm15,(%1,%4) \n" |
244 | 0 | "lea (%1,%4,2),%1 \n" |
245 | 0 | "jg 1b \n" |
246 | 0 | : "+r"(src), // %0 |
247 | 0 | "+r"(dst), // %1 |
248 | 0 | "+r"(width) // %2 |
249 | 0 | : "r"((intptr_t)(src_stride)), // %3 |
250 | 0 | "r"((intptr_t)(dst_stride)) // %4 |
251 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", |
252 | 0 | "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", |
253 | 0 | "xmm15"); |
254 | 0 | } |
255 | | #endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3) |
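
The Fast variant consumes 16 source columns per pass (the movdqu loads and the sub $0x10 loop counter) and needs xmm8-xmm15, so it is only usable on x86-64. A whole plane is rotated by tiling it into 8-row strips and handing each strip to such a Wx8 kernel; the sketch below shows that driver pattern with a hypothetical function-pointer type (libyuv's TransposePlane works this way, with a scalar fallback for the leftover rows, omitted here).

#include <stdint.h>

typedef void (*TransposeWx8Func)(const uint8_t* src, int src_stride,
                                 uint8_t* dst, int dst_stride, int width);

static void TransposePlane_Sketch(const uint8_t* src, int src_stride,
                                  uint8_t* dst, int dst_stride,
                                  int width, int height,
                                  TransposeWx8Func transpose_wx8) {
  int i = height;
  while (i >= 8) {
    transpose_wx8(src, src_stride, dst, dst_stride, width);
    src += 8 * src_stride;  // advance 8 source rows...
    dst += 8;               // ...which become 8 destination columns
    i -= 8;
  }
  // Remaining i < 8 rows would go to a scalar WxH transpose.
}
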
256 | | |
257 | | // Transpose UV 8x8. 64 bit. |
258 | | #if defined(HAS_TRANSPOSEUVWX8_SSE2) |
259 | | void TransposeUVWx8_SSE2(const uint8_t* src, |
260 | | int src_stride, |
261 | | uint8_t* dst_a, |
262 | | int dst_stride_a, |
263 | | uint8_t* dst_b, |
264 | | int dst_stride_b, |
265 | 0 | int width) { |
266 | 0 | asm volatile( |
267 | | // Read in the data from the source pointer. |
268 | | // First round of bit swap. |
269 | 0 | LABELALIGN |
270 | 0 | "1: \n" |
271 | 0 | "movdqu (%0),%%xmm0 \n" |
272 | 0 | "movdqu (%0,%4),%%xmm1 \n" |
273 | 0 | "lea (%0,%4,2),%0 \n" |
274 | 0 | "movdqa %%xmm0,%%xmm8 \n" |
275 | 0 | "punpcklbw %%xmm1,%%xmm0 \n" |
276 | 0 | "punpckhbw %%xmm1,%%xmm8 \n" |
277 | 0 | "movdqa %%xmm8,%%xmm1 \n" |
278 | 0 | "movdqu (%0),%%xmm2 \n" |
279 | 0 | "movdqu (%0,%4),%%xmm3 \n" |
280 | 0 | "lea (%0,%4,2),%0 \n" |
281 | 0 | "movdqa %%xmm2,%%xmm8 \n" |
282 | 0 | "punpcklbw %%xmm3,%%xmm2 \n" |
283 | 0 | "punpckhbw %%xmm3,%%xmm8 \n" |
284 | 0 | "movdqa %%xmm8,%%xmm3 \n" |
285 | 0 | "movdqu (%0),%%xmm4 \n" |
286 | 0 | "movdqu (%0,%4),%%xmm5 \n" |
287 | 0 | "lea (%0,%4,2),%0 \n" |
288 | 0 | "movdqa %%xmm4,%%xmm8 \n" |
289 | 0 | "punpcklbw %%xmm5,%%xmm4 \n" |
290 | 0 | "punpckhbw %%xmm5,%%xmm8 \n" |
291 | 0 | "movdqa %%xmm8,%%xmm5 \n" |
292 | 0 | "movdqu (%0),%%xmm6 \n" |
293 | 0 | "movdqu (%0,%4),%%xmm7 \n" |
294 | 0 | "lea (%0,%4,2),%0 \n" |
295 | 0 | "movdqa %%xmm6,%%xmm8 \n" |
296 | 0 | "punpcklbw %%xmm7,%%xmm6 \n" |
297 | 0 | "neg %4 \n" |
298 | 0 | "lea 0x10(%0,%4,8),%0 \n" |
299 | 0 | "punpckhbw %%xmm7,%%xmm8 \n" |
300 | 0 | "movdqa %%xmm8,%%xmm7 \n" |
301 | 0 | "neg %4 \n" |
302 | | // Second round of bit swap. |
303 | 0 | "movdqa %%xmm0,%%xmm8 \n" |
304 | 0 | "movdqa %%xmm1,%%xmm9 \n" |
305 | 0 | "punpckhwd %%xmm2,%%xmm8 \n" |
306 | 0 | "punpckhwd %%xmm3,%%xmm9 \n" |
307 | 0 | "punpcklwd %%xmm2,%%xmm0 \n" |
308 | 0 | "punpcklwd %%xmm3,%%xmm1 \n" |
309 | 0 | "movdqa %%xmm8,%%xmm2 \n" |
310 | 0 | "movdqa %%xmm9,%%xmm3 \n" |
311 | 0 | "movdqa %%xmm4,%%xmm8 \n" |
312 | 0 | "movdqa %%xmm5,%%xmm9 \n" |
313 | 0 | "punpckhwd %%xmm6,%%xmm8 \n" |
314 | 0 | "punpckhwd %%xmm7,%%xmm9 \n" |
315 | 0 | "punpcklwd %%xmm6,%%xmm4 \n" |
316 | 0 | "punpcklwd %%xmm7,%%xmm5 \n" |
317 | 0 | "movdqa %%xmm8,%%xmm6 \n" |
318 | 0 | "movdqa %%xmm9,%%xmm7 \n" |
319 | | // Third round of bit swap. |
320 | | // Write to the destination pointer. |
321 | 0 | "movdqa %%xmm0,%%xmm8 \n" |
322 | 0 | "punpckldq %%xmm4,%%xmm0 \n" |
323 | 0 | "movlpd %%xmm0,(%1) \n" // Write back U channel |
324 | 0 | "movhpd %%xmm0,(%2) \n" // Write back V channel |
325 | 0 | "punpckhdq %%xmm4,%%xmm8 \n" |
326 | 0 | "movlpd %%xmm8,(%1,%5) \n" |
327 | 0 | "lea (%1,%5,2),%1 \n" |
328 | 0 | "movhpd %%xmm8,(%2,%6) \n" |
329 | 0 | "lea (%2,%6,2),%2 \n" |
330 | 0 | "movdqa %%xmm2,%%xmm8 \n" |
331 | 0 | "punpckldq %%xmm6,%%xmm2 \n" |
332 | 0 | "movlpd %%xmm2,(%1) \n" |
333 | 0 | "movhpd %%xmm2,(%2) \n" |
334 | 0 | "punpckhdq %%xmm6,%%xmm8 \n" |
335 | 0 | "movlpd %%xmm8,(%1,%5) \n" |
336 | 0 | "lea (%1,%5,2),%1 \n" |
337 | 0 | "movhpd %%xmm8,(%2,%6) \n" |
338 | 0 | "lea (%2,%6,2),%2 \n" |
339 | 0 | "movdqa %%xmm1,%%xmm8 \n" |
340 | 0 | "punpckldq %%xmm5,%%xmm1 \n" |
341 | 0 | "movlpd %%xmm1,(%1) \n" |
342 | 0 | "movhpd %%xmm1,(%2) \n" |
343 | 0 | "punpckhdq %%xmm5,%%xmm8 \n" |
344 | 0 | "movlpd %%xmm8,(%1,%5) \n" |
345 | 0 | "lea (%1,%5,2),%1 \n" |
346 | 0 | "movhpd %%xmm8,(%2,%6) \n" |
347 | 0 | "lea (%2,%6,2),%2 \n" |
348 | 0 | "movdqa %%xmm3,%%xmm8 \n" |
349 | 0 | "punpckldq %%xmm7,%%xmm3 \n" |
350 | 0 | "movlpd %%xmm3,(%1) \n" |
351 | 0 | "movhpd %%xmm3,(%2) \n" |
352 | 0 | "punpckhdq %%xmm7,%%xmm8 \n" |
353 | 0 | "sub $0x8,%3 \n" |
354 | 0 | "movlpd %%xmm8,(%1,%5) \n" |
355 | 0 | "lea (%1,%5,2),%1 \n" |
356 | 0 | "movhpd %%xmm8,(%2,%6) \n" |
357 | 0 | "lea (%2,%6,2),%2 \n" |
358 | 0 | "jg 1b \n" |
359 | 0 | : "+r"(src), // %0 |
360 | 0 | "+r"(dst_a), // %1 |
361 | 0 | "+r"(dst_b), // %2 |
362 | 0 | "+r"(width) // %3 |
363 | 0 | : "r"((intptr_t)(src_stride)), // %4 |
364 | 0 | "r"((intptr_t)(dst_stride_a)), // %5 |
365 | 0 | "r"((intptr_t)(dst_stride_b)) // %6 |
366 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", |
367 | 0 | "xmm7", "xmm8", "xmm9"); |
368 | 0 | } |
369 | | #endif // defined(HAS_TRANSPOSEUVWX8_SSE2) |
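
The UV variant reads 16 bytes (8 interleaved U,V pairs) per row and uses movlpd/movhpd to route the low and high 8-byte halves of each result to the separate U and V planes, so it deinterleaves while it transposes. A scalar sketch of those semantics, as a hypothetical helper mirroring libyuv's TransposeUVWx8_C:

#include <stdint.h>

static void TransposeUVWx8_Sketch(const uint8_t* src, int src_stride,
                                  uint8_t* dst_a, int dst_stride_a,
                                  uint8_t* dst_b, int dst_stride_b,
                                  int width) {
  for (int i = 0; i < width; ++i) {
    for (int j = 0; j < 8; ++j) {
      dst_a[i * dst_stride_a + j] = src[j * src_stride + i * 2 + 0];  // U
      dst_b[i * dst_stride_b + j] = src[j * src_stride + i * 2 + 1];  // V
    }
  }
}
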
370 | | |
371 | | #if defined(HAS_TRANSPOSE4X4_32_SSE2) |
372 | | // 4 values, little endian view |
373 | | // a b c d |
374 | | // e f g h |
375 | | // i j k l |
376 | | // m n o p |
377 | | |
378 | | // transpose 2x2 |
379 | | // a e b f from row 0, 1 |
380 | | // i m j n from row 2, 3 |
381 | | // c g d h from row 0, 1 |
382 | | // k o l p from row 2, 3 |
383 | | |
384 | | // transpose 4x4 |
385 | | // a e i m from row 0, 1 |
386 | | // b f j n from row 0, 1 |
387 | | // c g k o from row 2, 3 |
388 | | // d h l p from row 2, 3 |
389 | | |
390 | | // Transpose 32 bit values (ARGB) |
391 | | void Transpose4x4_32_SSE2(const uint8_t* src, |
392 | | int src_stride, |
393 | | uint8_t* dst, |
394 | | int dst_stride, |
395 | 0 | int width) { |
396 | 0 | asm volatile( |
397 | | // Main loop transpose 4x4. Read a column, write a row. |
398 | 0 | "1: \n" |
399 | 0 | "movdqu (%0),%%xmm0 \n" // a b c d |
400 | 0 | "movdqu (%0,%3),%%xmm1 \n" // e f g h |
401 | 0 | "lea (%0,%3,2),%0 \n" // src += stride * 2 |
402 | 0 | "movdqu (%0),%%xmm2 \n" // i j k l |
403 | 0 | "movdqu (%0,%3),%%xmm3 \n" // m n o p |
404 | 0 | "lea (%0,%3,2),%0 \n" // src += stride * 2 |
405 | | |
406 | | // Transpose 2x2 |
407 | 0 | "movdqa %%xmm0,%%xmm4 \n" |
408 | 0 | "movdqa %%xmm2,%%xmm5 \n" |
409 | 0 | "movdqa %%xmm0,%%xmm6 \n" |
410 | 0 | "movdqa %%xmm2,%%xmm7 \n" |
411 | 0 | "punpckldq %%xmm1,%%xmm4 \n" // a e b f from row 0, 1 |
412 | 0 | "punpckldq %%xmm3,%%xmm5 \n" // i m j n from row 2, 3 |
413 | 0 | "punpckhdq %%xmm1,%%xmm6 \n" // c g d h from row 0, 1 |
414 | 0 | "punpckhdq %%xmm3,%%xmm7 \n" // k o l p from row 2, 3 |
415 | | |
416 | | // Transpose 4x4 |
417 | 0 | "movdqa %%xmm4,%%xmm0 \n" |
418 | 0 | "movdqa %%xmm4,%%xmm1 \n" |
419 | 0 | "movdqa %%xmm6,%%xmm2 \n" |
420 | 0 | "movdqa %%xmm6,%%xmm3 \n" |
421 | 0 | "punpcklqdq %%xmm5,%%xmm0 \n" // a e i m from row 0, 1 |
422 | 0 | "punpckhqdq %%xmm5,%%xmm1 \n" // b f j n from row 0, 1 |
423 | 0 | "punpcklqdq %%xmm7,%%xmm2 \n" // c g k o from row 2, 3 |
424 | 0 | "punpckhqdq %%xmm7,%%xmm3 \n" // d h l p from row 2, 3 |
425 | |
426 | 0 | "movdqu %%xmm0,(%1) \n" |
427 | 0 | "lea 16(%1,%4),%1 \n" // dst += stride + 16 |
428 | 0 | "movdqu %%xmm1,-16(%1) \n" |
429 | 0 | "movdqu %%xmm2,-16(%1,%4) \n" |
430 | 0 | "movdqu %%xmm3,-16(%1,%4,2) \n" |
431 | 0 | "sub %4,%1 \n" |
432 | 0 | "sub $0x4,%2 \n" |
433 | 0 | "jg 1b \n" |
434 | 0 | : "+r"(src), // %0 |
435 | 0 | "+r"(dst), // %1 |
436 | 0 | "+rm"(width) // %2 |
437 | 0 | : "r"((ptrdiff_t)(src_stride)), // %3 |
438 | 0 | "r"((ptrdiff_t)(dst_stride)) // %4 |
439 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", |
440 | 0 | "xmm7"); |
441 | 0 | } |
442 | | #endif // defined(HAS_TRANSPOSE4X4_32_SSE2) |
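
The unpack pattern documented in the comments above maps one-to-one onto SSE2 intrinsics. The sketch below is the same single 4x4 transpose of 32-bit values written that way, as a readability aid only; the real function additionally loops over width and advances src and dst.

#include <emmintrin.h>
#include <stdint.h>

static void Transpose4x4_32_Sketch(const uint8_t* src, int src_stride,
                                   uint8_t* dst, int dst_stride) {
  __m128i r0 = _mm_loadu_si128((const __m128i*)(src));                   // a b c d
  __m128i r1 = _mm_loadu_si128((const __m128i*)(src + src_stride));      // e f g h
  __m128i r2 = _mm_loadu_si128((const __m128i*)(src + 2 * src_stride));  // i j k l
  __m128i r3 = _mm_loadu_si128((const __m128i*)(src + 3 * src_stride));  // m n o p
  __m128i t0 = _mm_unpacklo_epi32(r0, r1);  // a e b f
  __m128i t1 = _mm_unpacklo_epi32(r2, r3);  // i m j n
  __m128i t2 = _mm_unpackhi_epi32(r0, r1);  // c g d h
  __m128i t3 = _mm_unpackhi_epi32(r2, r3);  // k o l p
  _mm_storeu_si128((__m128i*)(dst), _mm_unpacklo_epi64(t0, t1));                   // a e i m
  _mm_storeu_si128((__m128i*)(dst + dst_stride), _mm_unpackhi_epi64(t0, t1));      // b f j n
  _mm_storeu_si128((__m128i*)(dst + 2 * dst_stride), _mm_unpacklo_epi64(t2, t3));  // c g k o
  _mm_storeu_si128((__m128i*)(dst + 3 * dst_stride), _mm_unpackhi_epi64(t2, t3));  // d h l p
}
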
443 | | |
444 | | #if defined(HAS_TRANSPOSE4X4_32_AVX2) |
445 | | |
446 | | // Transpose 32 bit values (ARGB) |
447 | | void Transpose4x4_32_AVX2(const uint8_t* src, |
448 | | int src_stride, |
449 | | uint8_t* dst, |
450 | | int dst_stride, |
451 | 0 | int width) { |
452 | 0 | asm volatile( |
453 | | // Main loop transpose 2 blocks of 4x4. Read a column, write a row. |
454 | 0 | "1: \n" |
455 | 0 | "vmovdqu (%0),%%xmm0 \n" // a b c d |
456 | 0 | "vmovdqu (%0,%3),%%xmm1 \n" // e f g h |
457 | 0 | "lea (%0,%3,2),%0 \n" // src += stride * 2 |
458 | 0 | "vmovdqu (%0),%%xmm2 \n" // i j k l |
459 | 0 | "vmovdqu (%0,%3),%%xmm3 \n" // m n o p |
460 | 0 | "lea (%0,%3,2),%0 \n" // src += stride * 2 |
461 | |
462 | 0 | "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // a b c d |
463 | 0 | "vinserti128 $1,(%0,%3),%%ymm1,%%ymm1 \n" // e f g h |
464 | 0 | "lea (%0,%3,2),%0 \n" // src += stride * 2 |
465 | 0 | "vinserti128 $1,(%0),%%ymm2,%%ymm2 \n" // i j k l |
466 | 0 | "vinserti128 $1,(%0,%3),%%ymm3,%%ymm3 \n" // m n o p |
467 | 0 | "lea (%0,%3,2),%0 \n" // src += stride * 2 |
468 | | |
469 | | // Transpose 2x2 |
470 | 0 | "vpunpckldq %%ymm1,%%ymm0,%%ymm4 \n" // a e b f from row 0, 1 |
471 | 0 | "vpunpckldq %%ymm3,%%ymm2,%%ymm5 \n" // i m j n from row 2, 3 |
472 | 0 | "vpunpckhdq %%ymm1,%%ymm0,%%ymm6 \n" // c g d h from row 0, 1 |
473 | 0 | "vpunpckhdq %%ymm3,%%ymm2,%%ymm7 \n" // k o l p from row 2, 3 |
474 | | |
475 | | // Transpose 4x4 |
476 | 0 | "vpunpcklqdq %%ymm5,%%ymm4,%%ymm0 \n" // a e i m from row 0, 1 |
477 | 0 | "vpunpckhqdq %%ymm5,%%ymm4,%%ymm1 \n" // b f j n from row 0, 1 |
478 | 0 | "vpunpcklqdq %%ymm7,%%ymm6,%%ymm2 \n" // c g k o from row 2, 3 |
479 | 0 | "vpunpckhqdq %%ymm7,%%ymm6,%%ymm3 \n" // d h l p from row 2, 3 |
480 | |
481 | 0 | "vmovdqu %%ymm0,(%1) \n" |
482 | 0 | "lea 32(%1,%4),%1 \n" // dst += stride + 32 |
483 | 0 | "vmovdqu %%ymm1,-32(%1) \n" |
484 | 0 | "vmovdqu %%ymm2,-32(%1,%4) \n" |
485 | 0 | "vmovdqu %%ymm3,-32(%1,%4,2) \n" |
486 | 0 | "sub %4,%1 \n" |
487 | 0 | "sub $0x8,%2 \n" |
488 | 0 | "jg 1b \n" |
489 | 0 | "vzeroupper \n" |
490 | 0 | : "+r"(src), // %0 |
491 | 0 | "+r"(dst), // %1 |
492 | 0 | "+rm"(width) // %2 |
493 | 0 | : "r"((ptrdiff_t)(src_stride)), // %3 |
494 | 0 | "r"((ptrdiff_t)(dst_stride)) // %4 |
495 | 0 | : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", |
496 | 0 | "xmm7"); |
497 | 0 | } |
498 | | #endif // defined(HAS_TRANSPOSE4X4_32_AVX2) |
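
The AVX2 version exploits the fact that vpunpckldq/vpunpckhdq and their qdq variants operate independently within each 128-bit lane: vinserti128 stacks a second 4x4 block (source rows 4-7) into the high lane, one pass of the same unpack sequence then transposes both blocks at once, and the 32-byte stores drop them side by side, which is exactly where rows 4-7 belong as destination columns 4-7. An intrinsics sketch of the two-block body, again as a hypothetical helper with the loop omitted:

#include <immintrin.h>
#include <stdint.h>

static void Transpose4x4_32_TwoBlocks_Sketch(const uint8_t* src, int src_stride,
                                             uint8_t* dst, int dst_stride) {
  __m256i r[4];
  for (int j = 0; j < 4; ++j) {  // low lane: row j, high lane: row j + 4
    __m128i lo = _mm_loadu_si128((const __m128i*)(src + j * src_stride));
    __m128i hi = _mm_loadu_si128((const __m128i*)(src + (j + 4) * src_stride));
    r[j] = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
  }
  __m256i t0 = _mm256_unpacklo_epi32(r[0], r[1]);  // a e b f (per lane)
  __m256i t1 = _mm256_unpacklo_epi32(r[2], r[3]);  // i m j n
  __m256i t2 = _mm256_unpackhi_epi32(r[0], r[1]);  // c g d h
  __m256i t3 = _mm256_unpackhi_epi32(r[2], r[3]);  // k o l p
  _mm256_storeu_si256((__m256i*)(dst), _mm256_unpacklo_epi64(t0, t1));
  _mm256_storeu_si256((__m256i*)(dst + dst_stride), _mm256_unpackhi_epi64(t0, t1));
  _mm256_storeu_si256((__m256i*)(dst + 2 * dst_stride), _mm256_unpacklo_epi64(t2, t3));
  _mm256_storeu_si256((__m256i*)(dst + 3 * dst_stride), _mm256_unpackhi_epi64(t2, t3));
}
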
499 | | |
500 | | #endif // !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) && !defined(LIBYUV_ENABLE_ROWWIN)
501 | | |
502 | | #ifdef __cplusplus |
503 | | } // extern "C" |
504 | | } // namespace libyuv |
505 | | #endif |