Coverage Report

Created: 2026-01-09 06:51

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libavif/ext/libyuv/source/row_gcc.cc
Line
Count
Source
1
/*
2
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3
 *
4
 *  Use of this source code is governed by a BSD-style license
5
 *  that can be found in the LICENSE file in the root of the source
6
 *  tree. An additional intellectual property rights grant can be found
7
 *  in the file PATENTS. All contributing project authors may
8
 *  be found in the AUTHORS file in the root of the source tree.
9
 */
10
11
#include "libyuv/row.h"
12
#ifdef __cplusplus
13
namespace libyuv {
14
extern "C" {
15
#endif
16
17
// This module is for GCC x86 and x64.
18
#if !defined(LIBYUV_DISABLE_X86) &&               \
19
    (defined(__x86_64__) || defined(__i386__)) && \
20
    !defined(LIBYUV_ENABLE_ROWWIN)
21
22
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
23
24
// Constants for ARGB
// Per-pixel luma weights, replicated 4x so one 16-byte vector covers
// 4 pixels.  Element order within each group of 4 matches the pixel
// format's in-memory byte order (ARGB little-endian = B,G,R,A), with 0
// in the alpha slot so alpha never contributes to Y.
static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u,
                               25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u};

// JPeg full range.
static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u,
                                29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u};

// Same full-range weights with the R and B positions swapped for ABGR.
static const uvec8 kABGRToYJ = {77u, 150u, 29u, 0u, 77u, 150u, 29u, 0u,
                                77u, 150u, 29u, 0u, 77u, 150u, 29u, 0u};

// Same full-range weights rotated one byte for RGBA (alpha-first) order.
static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u,
                                0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u};
37
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
38
39
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
40
// Constants for BGRA
// Limited-range weights 25/129/66 (same values as kARGBToY), permuted
// to match each format's in-memory byte order.
static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u,
                               0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u};

// Constants for ABGR
static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u,
                               66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u};

// Constants for RGBA.
static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u,
                               0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u};
// 126 (7e) - (-109..110) = 16..235
// Word bias producing the limited-range (16..235) Y output.
static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
                               0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u};
// Word bias for full-range (0..255) Y output; same value as kSub128.
static const uvec16 kAddY0 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
                              0x8080u, 0x8080u, 0x8080u, 0x8080u};

// 0x8080 words used as a subtract-128 constant; kept as a separately
// named table because it plays a different role than kAddY0.
static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
                               0x8080u, 0x8080u, 0x8080u, 0x8080u};
59
60
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
61
62
#ifdef HAS_RGB24TOARGBROW_SSSE3
63
64
// Byte-shuffle control tables for (v)pshufb.  An index value with the
// high bit set (128 = 0x80) makes pshufb write zero to that output byte,
// which is how padding/alpha slots are cleared before ORing in 0xff.

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {
    2u, 1u, 0u, 128u, 5u, 4u, 3u, 128u, 8u, 7u, 6u, 128u, 11u, 10u, 9u, 128u};
// Shuffle table for converting RAW to ARGB.  Last 12
// Indices are shifted up by 4: the caller loads this vector 4 bytes
// early (offset 32 of a 48-byte group) to avoid reading past the input.
static const uvec8 kShuffleMaskRAWToARGB_0 = {6u,  5u,   4u,  128u, 9u,  8u,
                                              7u,  128u, 12u, 11u,  10u, 128u,
                                              15u, 14u,  13u, 128u};

// Shuffle table for converting RAW to RGBA.
static const uvec8 kShuffleMaskRAWToRGBA = {
    128u, 2u, 1u, 0u, 128u, 5u, 4u, 3u, 128u, 8u, 7u, 6u, 128u, 11u, 10u, 9u};

// Shuffle table for converting RAW to RGB24.  First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
    2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
    2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
    8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};

// YUY2 shuf 16 Y to 32 Y.
static const vec8 kShuffleYUY2Y = {0, 0, 2,  2,  4,  4,  6,  6,
                                   8, 8, 10, 10, 12, 12, 14, 14};

// YUY2 shuf 8 UV to 16 UV.
static const vec8 kShuffleYUY2UV = {1, 3,  1, 3,  5,  7,  5,  7,
                                    9, 11, 9, 11, 13, 15, 13, 15};

// UYVY shuf 16 Y to 32 Y.
static const vec8 kShuffleUYVYY = {1, 1, 3,  3,  5,  5,  7,  7,
                                   9, 9, 11, 11, 13, 13, 15, 15};

// UYVY shuf 8 UV to 16 UV.
static const vec8 kShuffleUYVYUV = {0, 2,  0, 2,  4,  6,  4,  6,
                                    8, 10, 8, 10, 12, 14, 12, 14};

// NV21 shuf 8 VU to 16 UV.  Same pattern in both 16-byte lanes so it can
// be used with 256-bit vpshufb, which shuffles each lane independently.
static const lvec8 kShuffleNV21 = {
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
128
#endif  // HAS_RGB24TOARGBROW_SSSE3
129
130
#ifdef HAS_J400TOARGBROW_SSE2
131
0
// Expands grayscale (J400) to ARGB: 8 Y bytes in, 8 ARGB pixels (32
// bytes) out per loop iteration.  Each Y value is replicated into the
// B, G and R bytes via unpacks; alpha is forced to 0xff.
// NOTE(review): the loop decrements width by 8, so width is presumably
// expected to be a multiple of 8 — confirm against callers.
void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile(
      // xmm5 = 0xff000000 in each dword (all-ones shifted left 24): the
      // alpha mask ORed into every output pixel.
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "pslld       $0x18,%%xmm5                  \n"

      LABELALIGN
      "1:          \n"
      "movq        (%0),%%xmm0                   \n"  // load 8 Y bytes
      "lea         0x8(%0),%0                    \n"
      "punpcklbw   %%xmm0,%%xmm0                 \n"  // YY pairs
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklwd   %%xmm0,%%xmm0                 \n"  // YYYY low 4 pixels
      "punpckhwd   %%xmm1,%%xmm1                 \n"  // YYYY high 4 pixels
      "por         %%xmm5,%%xmm0                 \n"  // set alpha = 0xff
      "por         %%xmm5,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_y),     // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm5");
}
157
#endif  // HAS_J400TOARGBROW_SSE2
158
159
#ifdef HAS_RGB24TOARGBROW_SSSE3
160
// Converts 16 RGB24 pixels (48 bytes) per loop iteration into 16 ARGB
// pixels (64 bytes).  Three 16-byte loads are realigned with palignr so
// each vector holds four whole 3-byte pixels, then kShuffleMaskRGB24ToARGB
// spreads them to 4-byte slots and 0xff alpha is ORed in.
// Stores are interleaved with the shuffles of the next vector to hide
// latency, so the instruction order is deliberate.
void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // 0xff000000
      "pslld       $0x18,%%xmm5                  \n"
      "movdqa      %3,%%xmm4                     \n"  // RGB24->ARGB shuffle

      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm3               \n"
      "lea         0x30(%0),%0                   \n"
      "movdqa      %%xmm3,%%xmm2                 \n"
      "palignr     $0x8,%%xmm1,%%xmm2            \n"  // pixels 8..11
      "pshufb      %%xmm4,%%xmm2                 \n"
      "por         %%xmm5,%%xmm2                 \n"
      "palignr     $0xc,%%xmm0,%%xmm1            \n"  // pixels 4..7
      "pshufb      %%xmm4,%%xmm0                 \n"  // pixels 0..3
      "movdqu      %%xmm2,0x20(%1)               \n"
      "por         %%xmm5,%%xmm0                 \n"
      "pshufb      %%xmm4,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "por         %%xmm5,%%xmm1                 \n"
      "palignr     $0x4,%%xmm3,%%xmm3            \n"  // pixels 12..15
      "pshufb      %%xmm4,%%xmm3                 \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "por         %%xmm5,%%xmm3                 \n"
      "movdqu      %%xmm3,0x30(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_rgb24),              // %0
        "+r"(dst_argb),               // %1
        "+r"(width)                   // %2
      : "m"(kShuffleMaskRGB24ToARGB)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
199
200
0
// Converts 16 RAW (R,G,B byte order) pixels (48 bytes) per loop
// iteration into 16 ARGB pixels (64 bytes).  The first three vectors use
// kShuffleMaskRAWToARGB on loads at 0/12/24; the last vector is loaded 4
// bytes early (offset 32, not 36) so the 16-byte read stays inside the
// 48-byte group, and kShuffleMaskRAWToARGB_0 compensates with indices
// offset by 4.
void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  asm volatile(
      "pcmpeqb     %%xmm6,%%xmm6                 \n"  // 0xff000000
      "pslld       $0x18,%%xmm6                  \n"
      "movdqa      %3,%%xmm4                     \n"  // normal shuffle
      "movdqa      %4,%%xmm5                     \n"  // offset-by-4 shuffle

      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      12(%0),%%xmm1                 \n"
      "movdqu      24(%0),%%xmm2                 \n"
      "movdqu      32(%0),%%xmm3                 \n"  // 4 early; see above
      "lea         0x30(%0),%0                   \n"
      "pshufb      %%xmm4,%%xmm0                 \n"
      "pshufb      %%xmm4,%%xmm1                 \n"
      "pshufb      %%xmm4,%%xmm2                 \n"
      "pshufb      %%xmm5,%%xmm3                 \n"
      "por         %%xmm6,%%xmm0                 \n"  // alpha = 0xff
      "por         %%xmm6,%%xmm1                 \n"
      "por         %%xmm6,%%xmm2                 \n"
      "por         %%xmm6,%%xmm3                 \n"
      "movdqu      %%xmm0,0x00(%1)               \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "movdqu      %%xmm2,0x20(%1)               \n"
      "movdqu      %%xmm3,0x30(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_raw),                // %0
        "+r"(dst_argb),               // %1
        "+r"(width)                   // %2
      : "m"(kShuffleMaskRAWToARGB),   // %3
        "m"(kShuffleMaskRAWToARGB_0)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
236
237
455k
// AVX2 variant: converts 32 RAW pixels (96 bytes) per loop iteration to
// 32 ARGB pixels (128 bytes).  Each ymm holds two groups of 12 source
// bytes (one per 128-bit lane) built with vmovdqu + vinserti128.
// ymm0..ymm2 use the normal shuffle (ymm4).  ymm3 uses the offset-by-4
// shuffle (ymm5, kShuffleMaskRAWToARGB_0) in BOTH lanes, so its lanes
// are loaded 4 bytes early — at 68 and 80 instead of 72 and 84 — which
// also keeps the final 16-byte read inside the 96-byte input group.
void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  asm volatile(
      "vpcmpeqb    %%ymm6,%%ymm6,%%ymm6          \n"  // 0xff000000
      "vpslld      $0x18,%%ymm6,%%ymm6           \n"
      "vbroadcastf128 %3,%%ymm4                  \n"  // normal shuffle
      "vbroadcastf128 %4,%%ymm5                  \n"  // offset-by-4 shuffle

      LABELALIGN  //
      "1:          \n"
      "vmovdqu     (%0),%%xmm0                   \n"  // first 12
      "vinserti128 $1,12(%0),%%ymm0,%%ymm0       \n"  // second 12
      "vmovdqu     24(%0),%%xmm1                 \n"  // third 12
      "vinserti128 $1,36(%0),%%ymm1,%%ymm1       \n"  // fourth 12
      "vmovdqu     48(%0),%%xmm2                 \n"  // fifth 12
      "vinserti128 $1,60(%0),%%ymm2,%%ymm2       \n"  // sixth 12
      "vmovdqu     68(%0),%%xmm3                 \n"  // seventh 12 (4 early)
      "vinserti128 $1,80(%0),%%ymm3,%%ymm3       \n"  // eighth 12 (4 early)
      "lea         96(%0),%0                     \n"
      "vpshufb     %%ymm4,%%ymm0,%%ymm0          \n"
      "vpshufb     %%ymm4,%%ymm1,%%ymm1          \n"
      "vpshufb     %%ymm4,%%ymm2,%%ymm2          \n"
      "vpshufb     %%ymm5,%%ymm3,%%ymm3          \n"
      "vpor        %%ymm6,%%ymm0,%%ymm0          \n"  // alpha = 0xff
      "vpor        %%ymm6,%%ymm1,%%ymm1          \n"
      "vpor        %%ymm6,%%ymm2,%%ymm2          \n"
      "vpor        %%ymm6,%%ymm3,%%ymm3          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "vmovdqu     %%ymm2,0x40(%1)               \n"
      "vmovdqu     %%ymm3,0x60(%1)               \n"
      "lea         0x80(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_raw),                // %0
        "+r"(dst_argb),               // %1
        "+r"(width)                   // %2
      : "m"(kShuffleMaskRAWToARGB),   // %3
        "m"(kShuffleMaskRAWToARGB_0)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
278
279
// Same code as RAWToARGB with different shuffler and A in low bits.
// Converts 16 RAW pixels (48 bytes) per loop iteration to 16 RGBA
// pixels: xmm5 holds 0x000000ff per dword (all-ones shifted right 24),
// ORed into the low (alpha-first) byte of each output pixel.
void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // 0x000000ff
      "psrld       $24,%%xmm5                    \n"
      "movdqa      %3,%%xmm4                     \n"  // RAW->RGBA shuffle

      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm3               \n"
      "lea         0x30(%0),%0                   \n"
      "movdqa      %%xmm3,%%xmm2                 \n"
      "palignr     $0x8,%%xmm1,%%xmm2            \n"  // pixels 8..11
      "pshufb      %%xmm4,%%xmm2                 \n"
      "por         %%xmm5,%%xmm2                 \n"
      "palignr     $0xc,%%xmm0,%%xmm1            \n"  // pixels 4..7
      "pshufb      %%xmm4,%%xmm0                 \n"  // pixels 0..3
      "movdqu      %%xmm2,0x20(%1)               \n"
      "por         %%xmm5,%%xmm0                 \n"
      "pshufb      %%xmm4,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "por         %%xmm5,%%xmm1                 \n"
      "palignr     $0x4,%%xmm3,%%xmm3            \n"  // pixels 12..15
      "pshufb      %%xmm4,%%xmm3                 \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "por         %%xmm5,%%xmm3                 \n"
      "movdqu      %%xmm3,0x30(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_raw),              // %0
        "+r"(dst_rgba),             // %1
        "+r"(width)                 // %2
      : "m"(kShuffleMaskRAWToRGBA)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
317
318
// Swaps R and B: converts 8 RAW pixels (24 bytes) per loop iteration to
// 8 RGB24 pixels.  Three overlapping 16-byte loads at offsets 0/4/8 are
// each shuffled down to 8 output bytes (high 8 bytes of each mask are
// 128 = zeroed) and stored with 8-byte movq.
void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
                         uint8_t* dst_rgb24,
                         int width) {
  asm volatile(
      "movdqa      %3,%%xmm3                     \n"  // first 8 bytes
      "movdqa      %4,%%xmm4                     \n"  // middle 8 bytes
      "movdqa      %5,%%xmm5                     \n"  // last 8 bytes

      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x4(%0),%%xmm1                \n"
      "movdqu      0x8(%0),%%xmm2                \n"
      "lea         0x18(%0),%0                   \n"
      "pshufb      %%xmm3,%%xmm0                 \n"
      "pshufb      %%xmm4,%%xmm1                 \n"
      "pshufb      %%xmm5,%%xmm2                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movq        %%xmm1,0x8(%1)                \n"
      "movq        %%xmm2,0x10(%1)               \n"
      "lea         0x18(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_raw),                  // %0
        "+r"(dst_rgb24),                // %1
        "+r"(width)                     // %2
      : "m"(kShuffleMaskRAWToRGB24_0),  // %3
        "m"(kShuffleMaskRAWToRGB24_1),  // %4
        "m"(kShuffleMaskRAWToRGB24_2)   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
349
350
0
// Converts 8 RGB565 pixels (16 bytes) per loop iteration to 8 ARGB
// pixels (32 bytes).  The 5-bit R/B and 6-bit G fields are expanded to
// 8 bits with pmulhuw scale constants (0x0108 for 5-bit, 0x2080 for
// 6-bit fields); alpha is forced to 0xff via xmm7.
// dst is rebased by subtracting src TWICE so that output can be
// addressed as (%1,%0,2) — dst advances two bytes per source byte while
// only the src pointer is incremented.
void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov         $0x1080108,%%eax              \n"  // 5->8 bit scale
      "movd        %%eax,%%xmm5                  \n"
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "mov         $0x20802080,%%eax             \n"  // 6->8 bit scale
      "movd        %%eax,%%xmm6                  \n"
      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
      "pcmpeqb     %%xmm3,%%xmm3                 \n"  // 0xf800: R mask
      "psllw       $0xb,%%xmm3                   \n"
      "pcmpeqb     %%xmm4,%%xmm4                 \n"  // 0x07e0: G mask
      "psllw       $10,%%xmm4                    \n"
      "psrlw       $5,%%xmm4                     \n"
      "pcmpeqb     %%xmm7,%%xmm7                 \n"  // 0xff00: alpha
      "psllw       $0x8,%%xmm7                   \n"
      "sub         %0,%1                         \n"  // dst -= 2 * src
      "sub         %0,%1                         \n"

      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "pand        %%xmm3,%%xmm1                 \n"  // isolate R
      "psllw       $0xb,%%xmm2                   \n"  // isolate B (top)
      "pmulhuw     %%xmm5,%%xmm1                 \n"  // expand R to 8 bits
      "pmulhuw     %%xmm5,%%xmm2                 \n"  // expand B to 8 bits
      "psllw       $0x8,%%xmm1                   \n"
      "por         %%xmm2,%%xmm1                 \n"  // BR words
      "pand        %%xmm4,%%xmm0                 \n"  // isolate G
      "pmulhuw     %%xmm6,%%xmm0                 \n"  // expand G to 8 bits
      "por         %%xmm7,%%xmm0                 \n"  // GA words, A=0xff
      "movdqa      %%xmm1,%%xmm2                 \n"
      "punpcklbw   %%xmm0,%%xmm1                 \n"  // interleave BGRA
      "punpckhbw   %%xmm0,%%xmm2                 \n"
      "movdqu      %%xmm1,0x00(%1,%0,2)          \n"
      "movdqu      %%xmm2,0x10(%1,%0,2)          \n"
      "lea         0x10(%0),%0                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
397
398
0
// Converts 8 ARGB1555 pixels (16 bytes) per loop iteration to 8 ARGB
// pixels (32 bytes).  Each 5-bit channel is expanded to 8 bits with the
// pmulhuw constants; the single alpha bit is replicated to 0x00/0xff by
// arithmetic-shifting the top bit across the high byte (psraw $8).
// Like RGB565ToARGBRow_SSE2, dst is rebased by 2*src so stores use
// (%1,%0,2) addressing.
void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov         $0x1080108,%%eax              \n"  // 5->8 bit scale
      "movd        %%eax,%%xmm5                  \n"
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "mov         $0x42004200,%%eax             \n"  // scale for G field
      "movd        %%eax,%%xmm6                  \n"
      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
      "pcmpeqb     %%xmm3,%%xmm3                 \n"  // 0xf800
      "psllw       $0xb,%%xmm3                   \n"
      "movdqa      %%xmm3,%%xmm4                 \n"  // 0x03e0: G mask
      "psrlw       $0x6,%%xmm4                   \n"
      "pcmpeqb     %%xmm7,%%xmm7                 \n"  // 0xff00
      "psllw       $0x8,%%xmm7                   \n"
      "sub         %0,%1                         \n"  // dst -= 2 * src
      "sub         %0,%1                         \n"

      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "psllw       $0x1,%%xmm1                   \n"  // R into 0xf800 slot
      "psllw       $0xb,%%xmm2                   \n"  // B to top bits
      "pand        %%xmm3,%%xmm1                 \n"
      "pmulhuw     %%xmm5,%%xmm2                 \n"  // expand B
      "pmulhuw     %%xmm5,%%xmm1                 \n"  // expand R
      "psllw       $0x8,%%xmm1                   \n"
      "por         %%xmm2,%%xmm1                 \n"  // BR words
      "movdqa      %%xmm0,%%xmm2                 \n"
      "pand        %%xmm4,%%xmm0                 \n"  // isolate G
      "psraw       $0x8,%%xmm2                   \n"  // replicate alpha bit
      "pmulhuw     %%xmm6,%%xmm0                 \n"  // expand G
      "pand        %%xmm7,%%xmm2                 \n"  // keep alpha byte
      "por         %%xmm2,%%xmm0                 \n"  // GA words
      "movdqa      %%xmm1,%%xmm2                 \n"
      "punpcklbw   %%xmm0,%%xmm1                 \n"  // interleave BGRA
      "punpckhbw   %%xmm0,%%xmm2                 \n"
      "movdqu      %%xmm1,0x00(%1,%0,2)          \n"
      "movdqu      %%xmm2,0x10(%1,%0,2)          \n"
      "lea         0x10(%0),%0                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
448
449
0
// Converts 8 ARGB4444 pixels (16 bytes) per loop iteration to 8 ARGB
// pixels (32 bytes).  Each 4-bit field is expanded to 8 bits by
// duplicating the nibble (low nibbles: v | v<<4; high nibbles:
// v | v>>4), then the low- and high-nibble bytes are interleaved.
// dst is rebased by 2*src so stores use (%1,%0,2) addressing.
void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov         $0xf0f0f0f,%%eax              \n"  // low-nibble mask
      "movd        %%eax,%%xmm4                  \n"
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
      "movdqa      %%xmm4,%%xmm5                 \n"  // 0xf0: high-nibble
      "pslld       $0x4,%%xmm5                   \n"
      "sub         %0,%1                         \n"  // dst -= 2 * src
      "sub         %0,%1                         \n"

      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "pand        %%xmm4,%%xmm0                 \n"  // low nibbles
      "pand        %%xmm5,%%xmm2                 \n"  // high nibbles
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm2,%%xmm3                 \n"
      "psllw       $0x4,%%xmm1                   \n"
      "psrlw       $0x4,%%xmm3                   \n"
      "por         %%xmm1,%%xmm0                 \n"  // v | v<<4
      "por         %%xmm3,%%xmm2                 \n"  // v | v>>4
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklbw   %%xmm2,%%xmm0                 \n"  // interleave channels
      "punpckhbw   %%xmm2,%%xmm1                 \n"
      "movdqu      %%xmm0,0x00(%1,%0,2)          \n"
      "movdqu      %%xmm1,0x10(%1,%0,2)          \n"
      "lea         0x10(%0),%0                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
485
486
0
// Converts 16 ARGB pixels (64 bytes) per loop iteration to 16 RGB24
// pixels (48 bytes).  Each 16-byte vector is shuffled down to 12 valid
// bytes (alpha dropped, top 4 bytes zeroed), then byte-shift/OR pairs
// pack four 12-byte results into three contiguous 16-byte stores.
void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa      %3,%%xmm6                     \n"  // ARGB->RGB24 shuffle

      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x30(%0),%%xmm3               \n"
      "lea         0x40(%0),%0                   \n"
      "pshufb      %%xmm6,%%xmm0                 \n"  // 12 bytes each
      "pshufb      %%xmm6,%%xmm1                 \n"
      "pshufb      %%xmm6,%%xmm2                 \n"
      "pshufb      %%xmm6,%%xmm3                 \n"
      "movdqa      %%xmm1,%%xmm4                 \n"
      "psrldq      $0x4,%%xmm1                   \n"
      "pslldq      $0xc,%%xmm4                   \n"  // 12 + 4 -> store 0
      "movdqa      %%xmm2,%%xmm5                 \n"
      "por         %%xmm4,%%xmm0                 \n"
      "pslldq      $0x8,%%xmm5                   \n"  // 8 + 8 -> store 1
      "movdqu      %%xmm0,(%1)                   \n"
      "por         %%xmm5,%%xmm1                 \n"
      "psrldq      $0x8,%%xmm2                   \n"
      "pslldq      $0x4,%%xmm3                   \n"  // 4 + 12 -> store 2
      "por         %%xmm3,%%xmm2                 \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "movdqu      %%xmm2,0x20(%1)               \n"
      "lea         0x30(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src),                    // %0
        "+r"(dst),                    // %1
        "+r"(width)                   // %2
      : "m"(kShuffleMaskARGBToRGB24)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6");
}
523
524
0
// Identical packing scheme to ARGBToRGB24Row_SSSE3, but using the
// kShuffleMaskARGBToRAW shuffle so output channel order is R,G,B (RAW)
// instead of B,G,R: 16 ARGB pixels (64 bytes) in, 48 bytes out per
// iteration.
void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa      %3,%%xmm6                     \n"  // ARGB->RAW shuffle

      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x30(%0),%%xmm3               \n"
      "lea         0x40(%0),%0                   \n"
      "pshufb      %%xmm6,%%xmm0                 \n"  // 12 bytes each
      "pshufb      %%xmm6,%%xmm1                 \n"
      "pshufb      %%xmm6,%%xmm2                 \n"
      "pshufb      %%xmm6,%%xmm3                 \n"
      "movdqa      %%xmm1,%%xmm4                 \n"
      "psrldq      $0x4,%%xmm1                   \n"
      "pslldq      $0xc,%%xmm4                   \n"  // 12 + 4 -> store 0
      "movdqa      %%xmm2,%%xmm5                 \n"
      "por         %%xmm4,%%xmm0                 \n"
      "pslldq      $0x8,%%xmm5                   \n"  // 8 + 8 -> store 1
      "movdqu      %%xmm0,(%1)                   \n"
      "por         %%xmm5,%%xmm1                 \n"
      "psrldq      $0x8,%%xmm2                   \n"
      "pslldq      $0x4,%%xmm3                   \n"  // 4 + 12 -> store 2
      "por         %%xmm3,%%xmm2                 \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "movdqu      %%xmm2,0x20(%1)               \n"
      "lea         0x30(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src),                  // %0
        "+r"(dst),                  // %1
        "+r"(width)                 // %2
      : "m"(kShuffleMaskARGBToRAW)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6");
}
561
562
#ifdef HAS_ARGBTORGB24ROW_AVX2
563
// vpermd for 12+12 to 24: after the per-lane vpshufb leaves 12 valid
// bytes (3 dwords) at the bottom of each 128-bit lane, this dword
// permute moves lane 1's dwords 4,5,6 down next to lane 0's 0,1,2,
// making the 24 valid bytes contiguous.
static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};
565
566
12.2k
// AVX2: converts 32 ARGB pixels (128 bytes) per loop iteration to 32
// RGB24 pixels (96 bytes).  Per-lane vpshufb drops alpha (12 valid
// bytes per lane), vpermd packs each register's 24 valid bytes
// contiguously, then vpermq/vpor splice four 24-byte results into three
// 32-byte stores.
void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm6                  \n"  // shuffle, both lanes
      "vmovdqa     %4,%%ymm7                     \n"  // dword permute

      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm3               \n"
      "lea         0x80(%0),%0                   \n"
      "vpshufb     %%ymm6,%%ymm0,%%ymm0          \n"  // xxx0yyy0
      "vpshufb     %%ymm6,%%ymm1,%%ymm1          \n"
      "vpshufb     %%ymm6,%%ymm2,%%ymm2          \n"
      "vpshufb     %%ymm6,%%ymm3,%%ymm3          \n"
      "vpermd      %%ymm0,%%ymm7,%%ymm0          \n"  // pack to 24 bytes
      "vpermd      %%ymm1,%%ymm7,%%ymm1          \n"
      "vpermd      %%ymm2,%%ymm7,%%ymm2          \n"
      "vpermd      %%ymm3,%%ymm7,%%ymm3          \n"
      "vpermq      $0x3f,%%ymm1,%%ymm4           \n"  // combine 24 + 8
      "vpor        %%ymm4,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vpermq      $0xf9,%%ymm1,%%ymm1           \n"  // combine 16 + 16
      "vpermq      $0x4f,%%ymm2,%%ymm4           \n"
      "vpor        %%ymm4,%%ymm1,%%ymm1          \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "vpermq      $0xfe,%%ymm2,%%ymm2           \n"  // combine 8 + 24
      "vpermq      $0x93,%%ymm3,%%ymm3           \n"
      "vpor        %%ymm3,%%ymm2,%%ymm2          \n"
      "vmovdqu     %%ymm2,0x40(%1)               \n"
      "lea         0x60(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src),                     // %0
        "+r"(dst),                     // %1
        "+r"(width)                    // %2
      : "m"(kShuffleMaskARGBToRGB24),  // %3
        "m"(kPermdRGB24_AVX)           // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
609
#endif
610
611
#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI
612
// Shuffle table for converting ARGBToRGB24
// Index tables for vpermt2b, which selects bytes from a 64-byte space
// spanning two source registers (0..31 = destination register,
// 32..63 = second operand).  Each table picks the 32 R/G/B bytes
// (skipping every 4th, alpha, byte) for one third of the packed output;
// the three tables pick up where the previous one left off.
static const ulvec8 kPermARGBToRGB24_0 = {
    0u,  1u,  2u,  4u,  5u,  6u,  8u,  9u,  10u, 12u, 13u,
    14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u,
    29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u};
static const ulvec8 kPermARGBToRGB24_1 = {
    10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u,
    25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u,
    40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u};
static const ulvec8 kPermARGBToRGB24_2 = {
    21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u,
    36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u,
    50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u};
625
626
0
void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
627
0
  asm volatile(
628
0
      "vmovdqa     %3,%%ymm5                     \n"
629
0
      "vmovdqa     %4,%%ymm6                     \n"
630
0
      "vmovdqa     %5,%%ymm7                     \n"
631
632
0
      LABELALIGN
633
0
      "1:          \n"
634
0
      "vmovdqu     (%0),%%ymm0                   \n"
635
0
      "vmovdqu     0x20(%0),%%ymm1               \n"
636
0
      "vmovdqu     0x40(%0),%%ymm2               \n"
637
0
      "vmovdqu     0x60(%0),%%ymm3               \n"
638
0
      "lea         0x80(%0),%0                   \n"
639
0
      "vpermt2b    %%ymm1,%%ymm5,%%ymm0          \n"
640
0
      "vpermt2b    %%ymm2,%%ymm6,%%ymm1          \n"
641
0
      "vpermt2b    %%ymm3,%%ymm7,%%ymm2          \n"
642
0
      "vmovdqu     %%ymm0,(%1)                   \n"
643
0
      "vmovdqu     %%ymm1,0x20(%1)               \n"
644
0
      "vmovdqu     %%ymm2,0x40(%1)               \n"
645
0
      "lea         0x60(%1),%1                   \n"
646
0
      "sub         $0x20,%2                      \n"
647
0
      "jg          1b                            \n"
648
0
      "vzeroupper  \n"
649
0
      : "+r"(src),                // %0
650
0
        "+r"(dst),                // %1
651
0
        "+r"(width)               // %2
652
0
      : "m"(kPermARGBToRGB24_0),  // %3
653
0
        "m"(kPermARGBToRGB24_1),  // %4
654
0
        "m"(kPermARGBToRGB24_2)   // %5
655
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7");
656
0
}
657
#endif
658
659
#ifdef HAS_ARGBTORAWROW_AVX2
660
0
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
661
0
  asm volatile(
662
0
      "vbroadcastf128 %3,%%ymm6                  \n"
663
0
      "vmovdqa     %4,%%ymm7                     \n"
664
665
0
      LABELALIGN
666
0
      "1:          \n"
667
0
      "vmovdqu     (%0),%%ymm0                   \n"
668
0
      "vmovdqu     0x20(%0),%%ymm1               \n"
669
0
      "vmovdqu     0x40(%0),%%ymm2               \n"
670
0
      "vmovdqu     0x60(%0),%%ymm3               \n"
671
0
      "lea         0x80(%0),%0                   \n"
672
0
      "vpshufb     %%ymm6,%%ymm0,%%ymm0          \n"  // xxx0yyy0
673
0
      "vpshufb     %%ymm6,%%ymm1,%%ymm1          \n"
674
0
      "vpshufb     %%ymm6,%%ymm2,%%ymm2          \n"
675
0
      "vpshufb     %%ymm6,%%ymm3,%%ymm3          \n"
676
0
      "vpermd      %%ymm0,%%ymm7,%%ymm0          \n"  // pack to 24 bytes
677
0
      "vpermd      %%ymm1,%%ymm7,%%ymm1          \n"
678
0
      "vpermd      %%ymm2,%%ymm7,%%ymm2          \n"
679
0
      "vpermd      %%ymm3,%%ymm7,%%ymm3          \n"
680
0
      "vpermq      $0x3f,%%ymm1,%%ymm4           \n"  // combine 24 + 8
681
0
      "vpor        %%ymm4,%%ymm0,%%ymm0          \n"
682
0
      "vmovdqu     %%ymm0,(%1)                   \n"
683
0
      "vpermq      $0xf9,%%ymm1,%%ymm1           \n"  // combine 16 + 16
684
0
      "vpermq      $0x4f,%%ymm2,%%ymm4           \n"
685
0
      "vpor        %%ymm4,%%ymm1,%%ymm1          \n"
686
0
      "vmovdqu     %%ymm1,0x20(%1)               \n"
687
0
      "vpermq      $0xfe,%%ymm2,%%ymm2           \n"  // combine 8 + 24
688
0
      "vpermq      $0x93,%%ymm3,%%ymm3           \n"
689
0
      "vpor        %%ymm3,%%ymm2,%%ymm2          \n"
690
0
      "vmovdqu     %%ymm2,0x40(%1)               \n"
691
0
      "lea         0x60(%1),%1                   \n"
692
0
      "sub         $0x20,%2                      \n"
693
0
      "jg          1b                            \n"
694
0
      "vzeroupper  \n"
695
0
      : "+r"(src),                   // %0
696
0
        "+r"(dst),                   // %1
697
0
        "+r"(width)                  // %2
698
0
      : "m"(kShuffleMaskARGBToRAW),  // %3
699
0
        "m"(kPermdRGB24_AVX)         // %4
700
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
701
0
        "xmm7");
702
0
}
703
#endif
704
705
4.45k
// Convert ARGB to RGB565 (5-bit B, 6-bit G, 5-bit R packed into 16 bits,
// B in the low bits).  Processes 4 pixels (16 bytes in, 8 bytes out) per
// iteration.
void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb     %%xmm3,%%xmm3                 \n"  // xmm3 = 0x0000001f (B mask)
      "psrld       $0x1b,%%xmm3                  \n"
      "pcmpeqb     %%xmm4,%%xmm4                 \n"  // xmm4 = 0x000007e0 (G mask)
      "psrld       $0x1a,%%xmm4                  \n"
      "pslld       $0x5,%%xmm4                   \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // xmm5 = 0xfffff800 (R mask)
      "pslld       $0xb,%%xmm5                   \n"

      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"  // fetch 4 ARGB pixels
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "pslld       $0x8,%%xmm0                   \n"  // position R high
      "psrld       $0x3,%%xmm1                   \n"  // B >> 3
      "psrld       $0x5,%%xmm2                   \n"  // G >> 5 (into bits 5..10)
      "psrad       $0x10,%%xmm0                  \n"  // arithmetic shift keeps
      "pand        %%xmm3,%%xmm1                 \n"  // sign for packssdw below
      "pand        %%xmm4,%%xmm2                 \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "por         %%xmm2,%%xmm1                 \n"  // merge B | G | R fields
      "por         %%xmm1,%%xmm0                 \n"
      "packssdw    %%xmm0,%%xmm0                 \n"  // 4 dwords -> 4 words
      "lea         0x10(%0),%0                   \n"
      "movq        %%xmm0,(%1)                   \n"  // store 4 RGB565 pixels
      "lea         0x8(%1),%1                    \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
741
742
// Convert ARGB to RGB565 with ordered dithering.  dither4 packs 4 dither
// bytes (one per pixel column); the punpck sequence replicates each byte
// across all 4 channels of its pixel, and paddusb adds them with unsigned
// saturation before the 565 truncation.  Processes 4 pixels per iteration.
void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
                                uint8_t* dst,
                                uint32_t dither4,
                                int width) {
  asm volatile(
      "movd        %3,%%xmm6                     \n"  // splat dither4:
      "punpcklbw   %%xmm6,%%xmm6                 \n"  // d0d0 d1d1 d2d2 d3d3
      "movdqa      %%xmm6,%%xmm7                 \n"
      "punpcklwd   %%xmm6,%%xmm6                 \n"  // xmm6: d0 x4, d1 x4
      "punpckhwd   %%xmm7,%%xmm7                 \n"  // xmm7: d2 x4, d3 x4 (unused below; kept for parity)
      "pcmpeqb     %%xmm3,%%xmm3                 \n"  // xmm3 = 0x0000001f (B mask)
      "psrld       $0x1b,%%xmm3                  \n"
      "pcmpeqb     %%xmm4,%%xmm4                 \n"  // xmm4 = 0x000007e0 (G mask)
      "psrld       $0x1a,%%xmm4                  \n"
      "pslld       $0x5,%%xmm4                   \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // xmm5 = 0xfffff800 (R mask)
      "pslld       $0xb,%%xmm5                   \n"

      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"  // fetch 4 ARGB pixels
      "paddusb     %%xmm6,%%xmm0                 \n"  // add dither, saturating
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "pslld       $0x8,%%xmm0                   \n"
      "psrld       $0x3,%%xmm1                   \n"  // B >> 3
      "psrld       $0x5,%%xmm2                   \n"  // G >> 5
      "psrad       $0x10,%%xmm0                  \n"  // R, sign-preserving
      "pand        %%xmm3,%%xmm1                 \n"
      "pand        %%xmm4,%%xmm2                 \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "por         %%xmm2,%%xmm1                 \n"  // merge B | G | R fields
      "por         %%xmm1,%%xmm0                 \n"
      "packssdw    %%xmm0,%%xmm0                 \n"
      "lea         0x10(%0),%0                   \n"
      "movq        %%xmm0,(%1)                   \n"  // store 4 RGB565 pixels
      "lea         0x8(%1),%1                    \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(dither4)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
788
789
#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
790
void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
791
                                uint8_t* dst,
792
                                uint32_t dither4,
793
0
                                int width) {
794
0
  asm volatile(
795
0
      "vbroadcastss %3,%%xmm6                    \n"
796
0
      "vpunpcklbw  %%xmm6,%%xmm6,%%xmm6          \n"
797
0
      "vpermq      $0xd8,%%ymm6,%%ymm6           \n"
798
0
      "vpunpcklwd  %%ymm6,%%ymm6,%%ymm6          \n"
799
0
      "vpcmpeqb    %%ymm3,%%ymm3,%%ymm3          \n"
800
0
      "vpsrld      $0x1b,%%ymm3,%%ymm3           \n"
801
0
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
802
0
      "vpsrld      $0x1a,%%ymm4,%%ymm4           \n"
803
0
      "vpslld      $0x5,%%ymm4,%%ymm4            \n"
804
0
      "vpslld      $0xb,%%ymm3,%%ymm5            \n"
805
806
0
      LABELALIGN
807
0
      "1:          \n"
808
0
      "vmovdqu     (%0),%%ymm0                   \n"
809
0
      "vpaddusb    %%ymm6,%%ymm0,%%ymm0          \n"
810
0
      "vpsrld      $0x5,%%ymm0,%%ymm2            \n"
811
0
      "vpsrld      $0x3,%%ymm0,%%ymm1            \n"
812
0
      "vpsrld      $0x8,%%ymm0,%%ymm0            \n"
813
0
      "vpand       %%ymm4,%%ymm2,%%ymm2          \n"
814
0
      "vpand       %%ymm3,%%ymm1,%%ymm1          \n"
815
0
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
816
0
      "vpor        %%ymm2,%%ymm1,%%ymm1          \n"
817
0
      "vpor        %%ymm1,%%ymm0,%%ymm0          \n"
818
0
      "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
819
0
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
820
0
      "lea         0x20(%0),%0                   \n"
821
0
      "vmovdqu     %%xmm0,(%1)                   \n"
822
0
      "lea         0x10(%1),%1                   \n"
823
0
      "sub         $0x8,%2                       \n"
824
0
      "jg          1b                            \n"
825
0
      "vzeroupper  \n"
826
0
      : "+r"(src),    // %0
827
0
        "+r"(dst),    // %1
828
0
        "+r"(width)   // %2
829
0
      : "m"(dither4)  // %3
830
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
831
0
        "xmm7");
832
0
}
833
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2
834
835
0
// Convert ARGB to ARGB1555 (1-bit A in the msb, then 5-bit R, G, B).
// Processes 4 pixels (16 bytes in, 8 bytes out) per iteration.
void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"  // xmm4 = 0x0000001f (B mask)
      "psrld       $0x1b,%%xmm4                  \n"
      "movdqa      %%xmm4,%%xmm5                 \n"  // xmm5 = 0x000003e0 (G mask)
      "pslld       $0x5,%%xmm5                   \n"
      "movdqa      %%xmm4,%%xmm6                 \n"  // xmm6 = 0x00007c00 (R mask)
      "pslld       $0xa,%%xmm6                   \n"
      "pcmpeqb     %%xmm7,%%xmm7                 \n"  // xmm7 = 0xffff8000 (A mask)
      "pslld       $0xf,%%xmm7                   \n"

      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"  // fetch 4 ARGB pixels
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm3                 \n"
      "psrad       $0x10,%%xmm0                  \n"  // A (sign-preserving for
      "psrld       $0x3,%%xmm1                   \n"  // packssdw) / B >> 3
      "psrld       $0x6,%%xmm2                   \n"  // G >> 6
      "psrld       $0x9,%%xmm3                   \n"  // R >> 9
      "pand        %%xmm7,%%xmm0                 \n"
      "pand        %%xmm4,%%xmm1                 \n"
      "pand        %%xmm5,%%xmm2                 \n"
      "pand        %%xmm6,%%xmm3                 \n"
      "por         %%xmm1,%%xmm0                 \n"  // merge A | B | G | R
      "por         %%xmm3,%%xmm2                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "packssdw    %%xmm0,%%xmm0                 \n"  // 4 dwords -> 4 words
      "lea         0x10(%0),%0                   \n"
      "movq        %%xmm0,(%1)                   \n"  // store 4 ARGB1555 pixels
      "lea         0x8(%1),%1                    \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
}
875
876
0
// Convert ARGB to ARGB4444 (4 bits per channel) by keeping the high nibble
// of each byte.  Processes 4 pixels (16 bytes in, 8 bytes out) per
// iteration.
void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"  // xmm4 = 0xf000 per word
      "psllw       $0xc,%%xmm4                   \n"
      "movdqa      %%xmm4,%%xmm3                 \n"  // xmm3 = 0x00f0 per word
      "psrlw       $0x8,%%xmm3                   \n"

      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"  // fetch 4 ARGB pixels
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pand        %%xmm3,%%xmm0                 \n"  // keep high nibbles of
      "pand        %%xmm4,%%xmm1                 \n"  // alternating bytes
      "psrlq       $0x4,%%xmm0                   \n"  // shift nibbles into
      "psrlq       $0x8,%%xmm1                   \n"  // packed position
      "por         %%xmm1,%%xmm0                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"  // words -> bytes
      "lea         0x10(%0),%0                   \n"
      "movq        %%xmm0,(%1)                   \n"  // store 4 ARGB4444 pixels
      "lea         0x8(%1),%1                    \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
904
#endif  // HAS_RGB24TOARGBROW_SSSE3
905
906
/*

ARGBToAR30Row:

Red Blue
With the 8-bit value in the upper bits of a short, vpmulhuw by (1024+4) will
produce a 10-bit value in the low 10 bits of each 16-bit value. This is what's
wanted for the blue channel. The red needs to be shifted 4 left, so multiply by
(1024+4)*16 for red.

Alpha Green
Alpha and Green are already in the high bits, so vpand can zero out the other
bits, keeping just the 2 upper bits of alpha and the 8-bit green. The same
multiplier could be used for Green - (1024+4) - putting the 10-bit green in the
lsb. Alpha would be a simple multiplier to shift it into position. It wants a
gap of 10 above the green. Green is 10 bits, so there are 6 bits in the low
short. 4 more are needed, so a multiplier of 4 gets the 2 bits into the upper
16 bits, and then a shift of 4 is a multiply of 16, so (4*16) = 64. Then shift
the result left 10 to position the A and G channels.
*/
926
927
// Shuffle tables and constants for the ARGB/ABGR -> AR30 conversions below
// (see the derivation in the comment block above).  kShuffleRB30 gathers the
// B and R bytes of each ARGB pixel into the high byte of alternating 16-bit
// lanes; kShuffleBR30 is the R/B-swapped variant for ABGR input.
static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u,  128u, 4u,  128u, 6u,
                                   128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};

static const uvec8 kShuffleBR30 = {128u, 2u,  128u, 0u, 128u, 6u,  128u, 4u,
                                   128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u};

static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;  // (1024+4)*16 : (1024+4)
static const uint32_t kMaskRB10 = 0x3ff003ff;  // 10-bit R and B fields
static const uint32_t kMaskAG10 = 0xc000ff00;  // top 2 bits of A, 8 bits of G
static const uint32_t kMulAG10 = 64 * 65536 + 1028;  // 64 : (1024+4)
938
939
0
// Convert ARGB (8 bits/channel) to AR30 (2:10:10:10, alpha in the top 2
// bits), 4 pixels per iteration.  dst is made src-relative ("sub %0,%1")
// so one pointer increment advances both streams.
void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa      %3,%%xmm2                     \n"  // shuffler for RB
      "movd        %4,%%xmm3                     \n"  // multiplier for RB
      "movd        %5,%%xmm4                     \n"  // mask for R10 B10
      "movd        %6,%%xmm5                     \n"  // mask for AG
      "movd        %7,%%xmm6                     \n"  // multiplier for AG
      "pshufd      $0x0,%%xmm3,%%xmm3            \n"  // splat the scalars
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"  // across all 4 lanes
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
      "sub         %0,%1                         \n"

      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"  // fetch 4 ARGB pixels
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pshufb      %%xmm2,%%xmm1                 \n"  // R0B0
      "pand        %%xmm5,%%xmm0                 \n"  // A0G0
      "pmulhuw     %%xmm3,%%xmm1                 \n"  // X2 R16 X4  B10
      "pmulhuw     %%xmm6,%%xmm0                 \n"  // X10 A2 X10 G10
      "pand        %%xmm4,%%xmm1                 \n"  // X2 R10 X10 B10
      "pslld       $10,%%xmm0                    \n"  // A2 x10 G10 x10
      "por         %%xmm1,%%xmm0                 \n"  // A2 R10 G10 B10
      "movdqu      %%xmm0,(%1,%0)                \n"  // store 4 AR30 pixels
      "add         $0x10,%0                      \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleRB30),  // %3
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
977
978
0
// Convert ABGR (8 bits/channel) to AR30, 4 pixels per iteration.
// Identical to ARGBToAR30Row_SSSE3 except the R/B-swapped shuffle table
// (kShuffleBR30) is used to account for the ABGR channel order.
void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa      %3,%%xmm2                     \n"  // shuffler for RB
      "movd        %4,%%xmm3                     \n"  // multiplier for RB
      "movd        %5,%%xmm4                     \n"  // mask for R10 B10
      "movd        %6,%%xmm5                     \n"  // mask for AG
      "movd        %7,%%xmm6                     \n"  // multiplier for AG
      "pshufd      $0x0,%%xmm3,%%xmm3            \n"  // splat the scalars
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"  // across all 4 lanes
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
      "sub         %0,%1                         \n"

      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"  // fetch 4 ABGR pixels
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pshufb      %%xmm2,%%xmm1                 \n"  // R0B0
      "pand        %%xmm5,%%xmm0                 \n"  // A0G0
      "pmulhuw     %%xmm3,%%xmm1                 \n"  // X2 R16 X4  B10
      "pmulhuw     %%xmm6,%%xmm0                 \n"  // X10 A2 X10 G10
      "pand        %%xmm4,%%xmm1                 \n"  // X2 R10 X10 B10
      "pslld       $10,%%xmm0                    \n"  // A2 x10 G10 x10
      "por         %%xmm1,%%xmm0                 \n"  // A2 R10 G10 B10
      "movdqu      %%xmm0,(%1,%0)                \n"  // store 4 AR30 pixels
      "add         $0x10,%0                      \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleBR30),  // %3  reversed shuffler
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
1016
1017
#ifdef HAS_ARGBTOAR30ROW_AVX2
1018
0
void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
1019
0
  asm volatile(
1020
0
      "vbroadcastf128 %3,%%ymm2                  \n"  // shuffler for RB
1021
0
      "vbroadcastss %4,%%ymm3                    \n"  // multipler for RB
1022
0
      "vbroadcastss %5,%%ymm4                    \n"  // mask for R10 B10
1023
0
      "vbroadcastss %6,%%ymm5                    \n"  // mask for AG
1024
0
      "vbroadcastss %7,%%ymm6                    \n"  // multipler for AG
1025
0
      "sub         %0,%1                         \n"
1026
1027
0
      "1:          \n"
1028
0
      "vmovdqu     (%0),%%ymm0                   \n"  // fetch 8 ARGB pixels
1029
0
      "vpshufb     %%ymm2,%%ymm0,%%ymm1          \n"  // R0B0
1030
0
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"  // A0G0
1031
0
      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"  // X2 R16 X4  B10
1032
0
      "vpmulhuw    %%ymm6,%%ymm0,%%ymm0          \n"  // X10 A2 X10 G10
1033
0
      "vpand       %%ymm4,%%ymm1,%%ymm1          \n"  // X2 R10 X10 B10
1034
0
      "vpslld      $10,%%ymm0,%%ymm0             \n"  // A2 x10 G10 x10
1035
0
      "vpor        %%ymm1,%%ymm0,%%ymm0          \n"  // A2 R10 G10 B10
1036
0
      "vmovdqu     %%ymm0,(%1,%0)                \n"  // store 8 AR30 pixels
1037
0
      "add         $0x20,%0                      \n"
1038
0
      "sub         $0x8,%2                       \n"
1039
0
      "jg          1b                            \n"
1040
0
      "vzeroupper  \n"
1041
1042
0
      : "+r"(src),          // %0
1043
0
        "+r"(dst),          // %1
1044
0
        "+r"(width)         // %2
1045
0
      : "m"(kShuffleRB30),  // %3
1046
0
        "m"(kMulRB10),      // %4
1047
0
        "m"(kMaskRB10),     // %5
1048
0
        "m"(kMaskAG10),     // %6
1049
0
        "m"(kMulAG10)       // %7
1050
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1051
0
}
1052
#endif
1053
1054
#ifdef HAS_ABGRTOAR30ROW_AVX2
1055
0
void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
1056
0
  asm volatile(
1057
0
      "vbroadcastf128 %3,%%ymm2                  \n"  // shuffler for RB
1058
0
      "vbroadcastss %4,%%ymm3                    \n"  // multipler for RB
1059
0
      "vbroadcastss %5,%%ymm4                    \n"  // mask for R10 B10
1060
0
      "vbroadcastss %6,%%ymm5                    \n"  // mask for AG
1061
0
      "vbroadcastss %7,%%ymm6                    \n"  // multipler for AG
1062
0
      "sub         %0,%1                         \n"
1063
1064
0
      "1:          \n"
1065
0
      "vmovdqu     (%0),%%ymm0                   \n"  // fetch 8 ABGR pixels
1066
0
      "vpshufb     %%ymm2,%%ymm0,%%ymm1          \n"  // R0B0
1067
0
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"  // A0G0
1068
0
      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"  // X2 R16 X4  B10
1069
0
      "vpmulhuw    %%ymm6,%%ymm0,%%ymm0          \n"  // X10 A2 X10 G10
1070
0
      "vpand       %%ymm4,%%ymm1,%%ymm1          \n"  // X2 R10 X10 B10
1071
0
      "vpslld      $10,%%ymm0,%%ymm0             \n"  // A2 x10 G10 x10
1072
0
      "vpor        %%ymm1,%%ymm0,%%ymm0          \n"  // A2 R10 G10 B10
1073
0
      "vmovdqu     %%ymm0,(%1,%0)                \n"  // store 8 AR30 pixels
1074
0
      "add         $0x20,%0                      \n"
1075
0
      "sub         $0x8,%2                       \n"
1076
0
      "jg          1b                            \n"
1077
0
      "vzeroupper  \n"
1078
1079
0
      : "+r"(src),          // %0
1080
0
        "+r"(dst),          // %1
1081
0
        "+r"(width)         // %2
1082
0
      : "m"(kShuffleBR30),  // %3  reversed shuffler
1083
0
        "m"(kMulRB10),      // %4
1084
0
        "m"(kMaskRB10),     // %5
1085
0
        "m"(kMaskAG10),     // %6
1086
0
        "m"(kMulAG10)       // %7
1087
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1088
0
}
1089
#endif
1090
1091
// Shuffle that swaps the R and B bytes within each 4-byte pixel (alpha
// untouched); converts ARGB <-> ABGR byte order.
static const uvec8 kShuffleARGBToABGR = {2,  1, 0, 3,  6,  5,  4,  7,
                                         10, 9, 8, 11, 14, 13, 12, 15};

// Shuffles for ARGB -> AB64: duplicate each 8-bit channel into both bytes
// of a 16-bit lane while swapping R and B.  Lo expands the low 8 input
// bytes (2 pixels), Hi the high 8 bytes.
static const uvec8 kShuffleARGBToAB64Lo = {2, 2, 1, 1, 0, 0, 3, 3,
                                           6, 6, 5, 5, 4, 4, 7, 7};
static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9,  9,  8,  8,  11, 11,
                                           14, 14, 13, 13, 12, 12, 15, 15};
1098
1099
// Convert ARGB (8 bits/channel) to AR64 (16 bits/channel) by replicating
// each byte into both halves of its 16-bit output channel (punpck of a
// register with itself).  Processes 4 pixels (16 bytes in, 32 bytes out)
// per iteration.
void ARGBToAR64Row_SSSE3(const uint8_t* src_argb,
                         uint16_t* dst_ar64,
                         int width) {
  asm volatile(
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"  // fetch 4 ARGB pixels
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklbw   %%xmm0,%%xmm0                 \n"  // widen pixels 0-1
      "punpckhbw   %%xmm1,%%xmm1                 \n"  // widen pixels 2-3
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x10(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_ar64),  // %1
        "+r"(width)      // %2
        ::"memory",
        "cc", "xmm0", "xmm1");
}
1120
1121
void ARGBToAB64Row_SSSE3(const uint8_t* src_argb,
1122
                         uint16_t* dst_ab64,
1123
0
                         int width) {
1124
0
  asm volatile(
1125
0
      "movdqa      %3,%%xmm2                     \n"
1126
0
      "movdqa      %4,%%xmm3                     \n" LABELALIGN
1127
0
      "1:          \n"
1128
0
      "movdqu      (%0),%%xmm0                   \n"
1129
0
      "movdqa      %%xmm0,%%xmm1                 \n"
1130
0
      "pshufb      %%xmm2,%%xmm0                 \n"
1131
0
      "pshufb      %%xmm3,%%xmm1                 \n"
1132
0
      "movdqu      %%xmm0,(%1)                   \n"
1133
0
      "movdqu      %%xmm1,0x10(%1)               \n"
1134
0
      "lea         0x10(%0),%0                   \n"
1135
0
      "lea         0x20(%1),%1                   \n"
1136
0
      "sub         $0x4,%2                       \n"
1137
0
      "jg          1b                            \n"
1138
0
      : "+r"(src_argb),             // %0
1139
0
        "+r"(dst_ab64),             // %1
1140
0
        "+r"(width)                 // %2
1141
0
      : "m"(kShuffleARGBToAB64Lo),  // %3
1142
0
        "m"(kShuffleARGBToAB64Hi)   // %4
1143
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
1144
0
}
1145
1146
// Convert AR64 (16 bits/channel) to ARGB (8 bits/channel) by keeping the
// high byte of each 16-bit channel (psrlw 8 + packuswb).  Processes 4
// pixels (32 bytes in, 16 bytes out) per iteration.
void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64,
                         uint8_t* dst_argb,
                         int width) {
  asm volatile(
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"  // fetch 4 AR64 pixels
      "movdqu      0x10(%0),%%xmm1               \n"
      "psrlw       $8,%%xmm0                     \n"  // keep high bytes
      "psrlw       $8,%%xmm1                     \n"
      "packuswb    %%xmm1,%%xmm0                 \n"  // narrow to 8-bit
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x20(%0),%0                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ar64),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
        ::"memory",
        "cc", "xmm0", "xmm1");
}
1167
1168
// Convert AB64 (16 bits/channel, R/B swapped) to ARGB: keep the high byte
// of each channel, then swap R and B back via kShuffleARGBToABGR.
// Processes 4 pixels (32 bytes in, 16 bytes out) per iteration.
void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64,
                         uint8_t* dst_argb,
                         int width) {
      asm volatile("movdqa      %3,%%xmm2                     \n"

               LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"  // fetch 4 AB64 pixels
      "movdqu      0x10(%0),%%xmm1               \n"
      "psrlw       $8,%%xmm0                     \n"  // keep high bytes
      "psrlw       $8,%%xmm1                     \n"
      "packuswb    %%xmm1,%%xmm0                 \n"  // narrow to 8-bit
      "pshufb      %%xmm2,%%xmm0                 \n"  // swap R and B back
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x20(%0),%0                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
               : "+r"(src_ab64),          // %0
                 "+r"(dst_argb),          // %1
                 "+r"(width)              // %2
               : "m"(kShuffleARGBToABGR)  // %3
               : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
1192
1193
#ifdef HAS_ARGBTOAR64ROW_AVX2
1194
void ARGBToAR64Row_AVX2(const uint8_t* src_argb,
1195
                        uint16_t* dst_ar64,
1196
0
                        int width) {
1197
0
  asm volatile(
1198
0
      "1:          \n"
1199
0
      "vmovdqu     (%0),%%ymm0                   \n"
1200
0
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
1201
0
      "vpunpckhbw  %%ymm0,%%ymm0,%%ymm1          \n"
1202
0
      "vpunpcklbw  %%ymm0,%%ymm0,%%ymm0          \n"
1203
0
      "vmovdqu     %%ymm0,(%1)                   \n"
1204
0
      "vmovdqu     %%ymm1,0x20(%1)               \n"
1205
0
      "lea         0x20(%0),%0                   \n"
1206
0
      "lea         0x40(%1),%1                   \n"
1207
0
      "sub         $0x8,%2                       \n"
1208
0
      "jg          1b                            \n"
1209
0
      "vzeroupper  \n"
1210
0
      : "+r"(src_argb),  // %0
1211
0
        "+r"(dst_ar64),  // %1
1212
0
        "+r"(width)      // %2
1213
0
        ::"memory",
1214
0
        "cc", "xmm0", "xmm1");
1215
0
}
1216
#endif
1217
1218
#ifdef HAS_ARGBTOAB64ROW_AVX2
// AVX2 version of ARGBToAB64Row: widen each 8-bit channel into a 16-bit
// lane by byte duplication while swapping R and B.  Processes 8 pixels
// (32 bytes in, 64 bytes out) per iteration.
// Fixes: ymm3 is written by the asm ("vbroadcastf128 %4,%%ymm3") so xmm3
// must appear in the clobber list (it was missing); the operand comment
// for %4 was also mislabelled "// %3".
void ARGBToAB64Row_AVX2(const uint8_t* src_argb,
                        uint16_t* dst_ab64,
                        int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm2                  \n"  // low-half shuffler
      "vbroadcastf128 %4,%%ymm3                  \n" LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // fetch 8 ARGB pixels
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // reorder 128-bit lanes
      "vpshufb     %%ymm3,%%ymm0,%%ymm1          \n"  // widen+swap pixels 4-7
      "vpshufb     %%ymm2,%%ymm0,%%ymm0          \n"  // widen+swap pixels 0-3
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "lea         0x20(%0),%0                   \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_argb),             // %0
        "+r"(dst_ab64),             // %1
        "+r"(width)                 // %2
      : "m"(kShuffleARGBToAB64Lo),  // %3
        "m"(kShuffleARGBToAB64Hi)   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
#endif
1245
1246
#ifdef HAS_AR64TOARGBROW_AVX2
1247
// Convert 8 AR64 pixels (64 bytes, 16 bits per channel) to 8 ARGB pixels
// (32 bytes) per loop iteration by keeping the high byte of each 16-bit
// channel (vpsrlw $8 + vpackuswb).
void AR64ToARGBRow_AVX2(const uint16_t* src_ar64,
                        uint8_t* dst_argb,
                        int width) {
  asm volatile(
      "1:          \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // first 16 channels
      "vmovdqu     0x20(%0),%%ymm1               \n"  // next 16 channels
      "vpsrlw      $8,%%ymm0,%%ymm0              \n"  // keep high byte
      "vpsrlw      $8,%%ymm1,%%ymm1              \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"  // 16 -> 8 bit, mutates
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // undo lane interleave
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x40(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"  // 8 pixels per loop
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_ar64),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
        ::"memory",
        "cc", "xmm0", "xmm1");
}
1270
#endif
1271
1272
#ifdef HAS_AB64TOARGBROW_AVX2
1273
// Convert 8 AB64 pixels (64 bytes, 16 bits per channel) to 8 ARGB pixels
// (32 bytes) per loop: keep the high byte of each 16-bit channel, then
// reorder channels with kShuffleARGBToABGR (defined earlier in this file) —
// presumably swapping R and B; confirm against the table definition.
void AB64ToARGBRow_AVX2(const uint16_t* src_ab64,
                        uint8_t* dst_argb,
                        int width) {
      asm volatile("vbroadcastf128 %3,%%ymm2                  \n" LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vpsrlw      $8,%%ymm0,%%ymm0              \n"  // keep high byte
      "vpsrlw      $8,%%ymm1,%%ymm1              \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"  // 16 -> 8 bit, mutates
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // undo lane interleave
      "vpshufb     %%ymm2,%%ymm0,%%ymm0          \n"  // channel reorder
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x40(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"  // 8 pixels per loop
      "jg          1b                            \n"
      "vzeroupper  \n"
               : "+r"(src_ab64),          // %0
                 "+r"(dst_argb),          // %1
                 "+r"(width)              // %2
               : "m"(kShuffleARGBToABGR)  // %3
               : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
1297
#endif
1298
1299
// clang-format off
1300
1301
// TODO(mraptis): Consider passing R, G, B multipliers as parameter.
1302
// round parameter is register containing value to add before shift.
1303
// Convert 16 four-byte pixels (64 bytes) to 16 Y bytes per loop iteration.
// Register contract (callers set these up before expanding the macro):
//   xmm4  = per-channel Y coefficients for pmaddubsw (e.g. kARGBToY)
//   xmm5  = per-byte bias subtracted first (kSub128 in all callers, making
//           the pixel bytes signed for pmaddubsw)
//   round = register with the value added before the final >>8
// Operands: %0 = src pixels, %1 = dst_y, %2 = width. Clobbers xmm0-3, xmm6
// and the register passed as `round`; callers must list them.
#define RGBTOY(round)                            \
  "1:                                        \n" \
  "movdqu    (%0),%%xmm0                     \n" \
  "movdqu    0x10(%0),%%xmm1                 \n" \
  "movdqu    0x20(%0),%%xmm2                 \n" \
  "movdqu    0x30(%0),%%xmm3                 \n" \
  "psubb     %%xmm5,%%xmm0                   \n" \
  "psubb     %%xmm5,%%xmm1                   \n" \
  "psubb     %%xmm5,%%xmm2                   \n" \
  "psubb     %%xmm5,%%xmm3                   \n" \
  "movdqu    %%xmm4,%%xmm6                   \n" \
  "pmaddubsw %%xmm0,%%xmm6                   \n" \
  "movdqu    %%xmm4,%%xmm0                   \n" \
  "pmaddubsw %%xmm1,%%xmm0                   \n" \
  "movdqu    %%xmm4,%%xmm1                   \n" \
  "pmaddubsw %%xmm2,%%xmm1                   \n" \
  "movdqu    %%xmm4,%%xmm2                   \n" \
  "pmaddubsw %%xmm3,%%xmm2                   \n" \
  "lea       0x40(%0),%0                     \n" \
  "phaddw    %%xmm0,%%xmm6                   \n" \
  "phaddw    %%xmm2,%%xmm1                   \n" \
  "prefetcht0 1280(%0)                       \n" \
  "paddw     %%" #round ",%%xmm6             \n" \
  "paddw     %%" #round ",%%xmm1             \n" \
  "psrlw     $0x8,%%xmm6                     \n" \
  "psrlw     $0x8,%%xmm1                     \n" \
  "packuswb  %%xmm1,%%xmm6                   \n" \
  "movdqu    %%xmm6,(%1)                     \n" \
  "lea       0x10(%1),%1                     \n" \
  "sub       $0x10,%2                        \n" \
  "jg        1b                              \n"
1335
// AVX2 variant of RGBTOY: convert 32 four-byte pixels (128 bytes) to 32 Y
// bytes per loop iteration. Register contract:
//   ymm4  = per-channel Y coefficients, ymm5 = per-byte bias (kSub128)
//   ymm6  = vpermd table (kPermdARGBToY_AVX) that undoes the lane
//           interleave of vphaddw/vpackuswb
//   round = register with the value added before the final >>8
// Operands: %0 = src pixels, %1 = dst_y, %2 = width.
#define RGBTOY_AVX2(round)                                       \
  "1:                                        \n"                 \
  "vmovdqu    (%0),%%ymm0                    \n"                 \
  "vmovdqu    0x20(%0),%%ymm1                \n"                 \
  "vmovdqu    0x40(%0),%%ymm2                \n"                 \
  "vmovdqu    0x60(%0),%%ymm3                \n"                 \
  "vpsubb     %%ymm5, %%ymm0, %%ymm0         \n"                 \
  "vpsubb     %%ymm5, %%ymm1, %%ymm1         \n"                 \
  "vpsubb     %%ymm5, %%ymm2, %%ymm2         \n"                 \
  "vpsubb     %%ymm5, %%ymm3, %%ymm3         \n"                 \
  "vpmaddubsw %%ymm0,%%ymm4,%%ymm0           \n"                 \
  "vpmaddubsw %%ymm1,%%ymm4,%%ymm1           \n"                 \
  "vpmaddubsw %%ymm2,%%ymm4,%%ymm2           \n"                 \
  "vpmaddubsw %%ymm3,%%ymm4,%%ymm3           \n"                 \
  "lea       0x80(%0),%0                     \n"                 \
  "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n" /* mutates. */  \
  "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"                 \
  "prefetcht0 1280(%0)                       \n"                 \
  "vpaddw     %%" #round ",%%ymm0,%%ymm0     \n" /* Add 16 */    \
  "vpaddw     %%" #round ",%%ymm2,%%ymm2     \n"                 \
  "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"                 \
  "vpsrlw     $0x8,%%ymm2,%%ymm2             \n"                 \
  "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n" /* mutates. */  \
  "vpermd     %%ymm0,%%ymm6,%%ymm0           \n" /* unmutate. */ \
  "vmovdqu    %%ymm0,(%1)                    \n"                 \
  "lea       0x20(%1),%1                     \n"                 \
  "sub       $0x20,%2                        \n"                 \
  "jg        1b                              \n"
1363
1364
// clang-format on
1365
1366
#ifdef HAS_ARGBTOYROW_SSSE3
1367
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
// Uses kARGBToY coefficients; kAddY16 is added before the shift
// (presumably the limited-range +16 luma offset — see constant definition).
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"  // Y coefficients
      "movdqa      %4,%%xmm5                     \n"  // 128 bias
      "movdqa      %5,%%xmm7                     \n"  // round value

      LABELALIGN ""      //
      RGBTOY(xmm7)       //
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kARGBToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
1385
#endif  // HAS_ARGBTOYROW_SSSE3
1386
1387
#ifdef HAS_ARGBTOYJROW_SSSE3
1388
// Convert 16 ARGB pixels (64 bytes) to 16 YJ (full-range JPEG) values.
// Same as ARGBToYRow but different coefficients (kARGBToYJ), no add 16.
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"  // YJ coefficients
      "movdqa      %4,%%xmm5                     \n"  // 128 bias
      "movdqa      %5,%%xmm7                     \n"  // round value (0)

      LABELALIGN ""      //
      RGBTOY(xmm7)       //
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kARGBToYJ),  // %3
        "m"(kSub128),    // %4
        "m"(kAddY0)      // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
1407
#endif  // HAS_ARGBTOYJROW_SSSE3
1408
1409
#ifdef HAS_ABGRTOYJROW_SSSE3
1410
// Convert 16 ABGR pixels (64 bytes) to 16 YJ (full-range JPEG) values.
// Same as ABGRToYRow but different coefficients (kABGRToYJ), no add 16.
void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"  // YJ coefficients
      "movdqa      %4,%%xmm5                     \n"  // 128 bias
      "movdqa      %5,%%xmm7                     \n"  // round value (0)

      LABELALIGN ""      //
      RGBTOY(xmm7)       //
      : "+r"(src_abgr),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kABGRToYJ),  // %3
        "m"(kSub128),    // %4
        "m"(kAddY0)      // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
1429
#endif  // HAS_ABGRTOYJROW_SSSE3
1430
1431
#ifdef HAS_RGBATOYJROW_SSSE3
1432
// Convert 16 RGBA pixels (64 bytes) to 16 YJ (full-range JPEG) values.
// Same as ARGBToYRow but different coefficients (kRGBAToYJ), no add 16.
void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"  // YJ coefficients
      "movdqa      %4,%%xmm5                     \n"  // 128 bias
      "movdqa      %5,%%xmm7                     \n"  // round value (0)

      LABELALIGN ""      //
      RGBTOY(xmm7)       //
      : "+r"(src_rgba),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kRGBAToYJ),  // %3
        "m"(kSub128),    // %4
        "m"(kAddY0)      // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
1451
#endif  // HAS_RGBATOYJROW_SSSE3
1452
1453
#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ABGRTOYROW_AVX2) || \
1454
    defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
1455
// vpermd table that restores linear dword order after vphaddw + vpackuswb,
// which both interleave the two 128-bit lanes ("unmutate" in RGBTOY_AVX2).
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
1457
#endif
1458
1459
#ifdef HAS_ARGBTOYROW_AVX2
1460
1461
// Convert 32 ARGB pixels (128 bytes) to 32 Y values (kARGBToY coefficients,
// kAddY16 added before the shift).
void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4                  \n"  // Y coefficients
      "vbroadcastf128 %4,%%ymm5                  \n"  // 128 bias
      "vbroadcastf128 %5,%%ymm7                  \n"  // round value
      "vmovdqa     %6,%%ymm6                     \n"  // lane-fix permute

      LABELALIGN ""      //
      RGBTOY_AVX2(ymm7)  //
      "vzeroupper  \n"
      : "+r"(src_argb),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kARGBToY),          // %3
        "m"(kSub128),           // %4
        "m"(kAddY16),           // %5
        "m"(kPermdARGBToY_AVX)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
1482
#endif  // HAS_ARGBTOYROW_AVX2
1483
1484
#ifdef HAS_ABGRTOYROW_AVX2
1485
// Convert 32 ABGR pixels (128 bytes) to 32 Y values (kABGRToY coefficients,
// kAddY16 added before the shift).
void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4                  \n"  // Y coefficients
      "vbroadcastf128 %4,%%ymm5                  \n"  // 128 bias
      "vbroadcastf128 %5,%%ymm7                  \n"  // round value
      "vmovdqa     %6,%%ymm6                     \n"  // lane-fix permute

      LABELALIGN ""      //
      RGBTOY_AVX2(ymm7)  //
      "vzeroupper  \n"
      : "+r"(src_abgr),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kABGRToY),          // %3
        "m"(kSub128),           // %4
        "m"(kAddY16),           // %5
        "m"(kPermdARGBToY_AVX)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
1506
#endif  // HAS_ABGRTOYROW_AVX2
1507
1508
#ifdef HAS_ARGBTOYJROW_AVX2
1509
// Convert 32 ARGB pixels (128 bytes) to 32 YJ (full-range JPEG) values:
// kARGBToYJ coefficients, no +16 offset (kAddY0).
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4                  \n"  // YJ coefficients
      "vbroadcastf128 %4,%%ymm5                  \n"  // 128 bias
      "vbroadcastf128 %5,%%ymm7                  \n"  // round value (0)
      "vmovdqa     %6,%%ymm6                     \n"  // lane-fix permute

      LABELALIGN ""      //
      RGBTOY_AVX2(ymm7)  //
      "vzeroupper  \n"
      : "+r"(src_argb),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kARGBToYJ),         // %3
        "m"(kSub128),           // %4
        "m"(kAddY0),            // %5
        "m"(kPermdARGBToY_AVX)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
1530
1531
#endif  // HAS_ARGBTOYJROW_AVX2
1532
1533
#ifdef HAS_ABGRTOYJROW_AVX2
1534
// Convert 32 ABGR pixels (128 bytes) to 32 YJ (full-range JPEG) values:
// kABGRToYJ coefficients, no +16 offset (kAddY0).
void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4                  \n"  // YJ coefficients
      "vbroadcastf128 %4,%%ymm5                  \n"  // 128 bias
      "vbroadcastf128 %5,%%ymm7                  \n"  // round value (0)
      "vmovdqa     %6,%%ymm6                     \n"  // lane-fix permute

      LABELALIGN ""      //
      RGBTOY_AVX2(ymm7)  //
      "vzeroupper  \n"
      : "+r"(src_abgr),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kABGRToYJ),         // %3
        "m"(kSub128),           // %4
        "m"(kAddY0),            // %5
        "m"(kPermdARGBToY_AVX)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
1555
#endif  // HAS_ABGRTOYJROW_AVX2
1556
1557
#ifdef HAS_RGBATOYJROW_AVX2
1558
// Convert 32 RGBA pixels (128 bytes) to 32 YJ (full-range JPEG) values:
// kRGBAToYJ coefficients, no +16 offset (kAddY0).
void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4                  \n"  // YJ coefficients
      "vbroadcastf128 %4,%%ymm5                  \n"  // 128 bias
      "vbroadcastf128 %5,%%ymm7                  \n"  // round value (0)
      "vmovdqa     %6,%%ymm6                     \n"  // lane-fix permute

      LABELALIGN ""      //
      RGBTOY_AVX2(ymm7)  //
      "vzeroupper  \n"
      : "+r"(src_rgba),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kRGBAToYJ),         // %3
        "m"(kSub128),           // %4
        "m"(kAddY0),            // %5
        "m"(kPermdARGBToY_AVX)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
1579
#endif  // HAS_RGBATOYJROW_AVX2
1580
1581
#ifdef HAS_ARGBTOUV444ROW_SSSE3
1582
1583
// Coefficients expressed as negatives to allow 128
1584
// Pair of 16-byte coefficient tables for the U and V planes, laid out for
// pmaddubsw (4 coefficients per pixel, repeated 4x). Loaded by the Matrix
// row functions via the byte offsets below.
struct RgbUVConstants {
  vec8 kRGBToU;  // offset 0x00: per-channel U coefficients
  vec8 kRGBToV;  // offset 0x10: per-channel V coefficients
};

// Byte offsets of the tables within RgbUVConstants (used by the asm).
#define KRGBTOU 0
#define KRGBTOV 16
1592
1593
// Convert 16 four-byte pixels per iteration to 16 U and 16 V values
// (4:4:4 — one U/V pair per pixel) using caller-supplied coefficients.
// Each value is computed as (0x8000 - maddubsw(coeff, pixel)) >> 8; the
// subtraction from 0x8000 works with the negated coefficients (see the
// RgbUVConstants note above). dst_v is addressed relative to dst_u.
void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb,
                                uint8_t* dst_u,
                                uint8_t* dst_v,
                                int width,
                                const struct RgbUVConstants* rgbuvconstants) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // 0x8000
      "psllw       $15,%%xmm5                    \n"
      "movdqa      0x0(%4),%%xmm3                \n"  // kRGBToU
      "movdqa      0x10(%4),%%xmm4               \n"  // kRGBToV
      "sub         %1,%2                         \n"  // %2 = dst_v - dst_u

      LABELALIGN
      "1:          \n"
      // U pass over 16 pixels.
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x30(%0),%%xmm6               \n"
      "pmaddubsw   %%xmm3,%%xmm0                 \n"
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "pmaddubsw   %%xmm3,%%xmm2                 \n"
      "pmaddubsw   %%xmm3,%%xmm6                 \n"
      "phaddw      %%xmm1,%%xmm0                 \n"
      "phaddw      %%xmm6,%%xmm2                 \n"
      "movdqa      %%xmm5,%%xmm1                 \n"
      "movdqa      %%xmm5,%%xmm6                 \n"
      "psubw       %%xmm0,%%xmm1                 \n"  // 0x8000 - sum
      "psubw       %%xmm2,%%xmm6                 \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "psrlw       $0x8,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm1                 \n"
      "movdqu      %%xmm1,(%1)                   \n"  // 16 U's

      // V pass: reload the same 16 pixels, apply V coefficients.
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x30(%0),%%xmm6               \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm6                 \n"
      "phaddw      %%xmm1,%%xmm0                 \n"
      "phaddw      %%xmm6,%%xmm2                 \n"
      "movdqa      %%xmm5,%%xmm1                 \n"
      "movdqa      %%xmm5,%%xmm6                 \n"
      "psubw       %%xmm0,%%xmm1                 \n"
      "psubw       %%xmm2,%%xmm6                 \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "psrlw       $0x8,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm1                 \n"
      "movdqu      %%xmm1,0x00(%1,%2,1)          \n"  // 16 V's

      "lea         0x40(%0),%0                   \n"
      "lea         0x10(%1),%1                   \n"
      "subl        $0x10,%3                      \n"  // 16 pixels per loop
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
#if defined(__i386__)
        "+m"(width)  // %3
#else
        "+rm"(width)  // %3
#endif
      : "r"(rgbuvconstants)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
1660
#endif  // HAS_ARGBTOUV444ROW_SSSE3
1661
1662
#ifdef HAS_ARGBTOUV444ROW_AVX2
1663
1664
// AVX2 variant of ARGBToUV444MatrixRow_SSSE3: 32 pixels per iteration to
// 32 U and 32 V values (4:4:4). Same (0x8000 - maddubsw) >> 8 formula with
// negated coefficients; kPermdARGBToY_AVX undoes the vphaddw/vpackuswb
// lane interleave. dst_v is addressed relative to dst_u.
void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
                               uint8_t* dst_u,
                               uint8_t* dst_v,
                               int width,
                               const struct RgbUVConstants* rgbuvconstants) {
  asm volatile(
      "vbroadcastf128 0x0(%4),%%ymm3             \n"  // kRGBToU
      "vbroadcastf128 0x10(%4),%%ymm4            \n"  // kRGBToV
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // 0x8000
      "vpsllw      $15,%%ymm5,%%ymm5             \n"
      "vmovdqa     %5,%%ymm7                     \n"  // lane-fix permute
      "sub         %1,%2                         \n"  // %2 = dst_v - dst_u

      LABELALIGN
      "1:          \n"
      // U pass over 32 pixels.
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm6               \n"
      "vpmaddubsw  %%ymm3,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm3,%%ymm1,%%ymm1          \n"
      "vpmaddubsw  %%ymm3,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm3,%%ymm6,%%ymm6          \n"
      "vphaddw     %%ymm1,%%ymm0,%%ymm0          \n"  // mutates
      "vphaddw     %%ymm6,%%ymm2,%%ymm2          \n"
      "vpsubw      %%ymm0,%%ymm5,%%ymm0          \n"  // 0x8000 - sum
      "vpsubw      %%ymm2,%%ymm5,%%ymm2          \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x8,%%ymm2,%%ymm2            \n"
      "vpackuswb   %%ymm2,%%ymm0,%%ymm0          \n"  // mutates
      "vpermd      %%ymm0,%%ymm7,%%ymm0          \n"  // unmutate.
      "vmovdqu     %%ymm0,(%1)                   \n"  // 32 U's

      // V pass: reload the same 32 pixels, apply V coefficients.
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm6               \n"
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm6,%%ymm6          \n"
      "vphaddw     %%ymm1,%%ymm0,%%ymm0          \n"  // mutates
      "vphaddw     %%ymm6,%%ymm2,%%ymm2          \n"
      "vpsubw      %%ymm0,%%ymm5,%%ymm0          \n"
      "vpsubw      %%ymm2,%%ymm5,%%ymm2          \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x8,%%ymm2,%%ymm2            \n"
      "vpackuswb   %%ymm2,%%ymm0,%%ymm0          \n"  // mutates
      "vpermd      %%ymm0,%%ymm7,%%ymm0          \n"  // unmutate.
      "vmovdqu     %%ymm0,(%1,%2,1)              \n"  // 32 V's
      "lea         0x80(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"
      "subl        $0x20,%3                      \n"  // 32 pixels per loop
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
#if defined(__i386__)
        "+m"(width)  // %3
#else
        "+rm"(width)  // %3
#endif
      : "r"(rgbuvconstants),    // %4
        "m"(kPermdARGBToY_AVX)  // %5
      : "memory", "cc", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
        "ymm7");
}
1732
#endif  // HAS_ARGBTOUV444ROW_AVX2
1733
1734
#ifdef HAS_ARGBTOUVROW_SSSE3
1735
1736
// ARGBARGB to AARRGGBB shuffle: places byte k of two adjacent 4-byte pixels
// side by side (0,4  1,5  2,6  3,7 ...), so a following pmaddubsw against
// 0x0101 sums each channel across the pixel pair (horizontal 2x reduction).
static const lvec8 kShuffleAARRGGBB = {
    0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15,
    0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15,
};
1741
1742
// 8x2 -> 4x1 ARGB pixels converted to 4 U and 4 V per loop iteration.
// Averages each 2x2 block of pixels (rows src and src + src_stride_argb)
// with rounding, then applies the caller-supplied U/V coefficients as
// (0x8000 - maddubsw(coeff, avg)) >> 8. dst_v is addressed via dst_u.
void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
                             int src_stride_argb,
                             uint8_t* dst_u,
                             uint8_t* dst_v,
                             int width,
                             const struct RgbUVConstants* rgbuvconstants) {
  asm volatile(
      "movdqa      0x0(%5),%%xmm4                \n"  // RGBToU
      "movdqa      0x10(%5),%%xmm5               \n"  // RGBToV
      "pcmpeqb     %%xmm6,%%xmm6                 \n"  // 0x0101
      "pabsb       %%xmm6,%%xmm6                 \n"
      "movdqa      %6,%%xmm7                     \n"  // kShuffleAARRGGBB
      "sub         %1,%2                         \n"  // %2 = dst_v - dst_u

      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"  // Read 8x2 ARGB Pixels
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x00(%0,%4,1),%%xmm2          \n"  // second row
      "movdqu      0x10(%0,%4,1),%%xmm3          \n"
      "pshufb      %%xmm7,%%xmm0                 \n"  // aarrggbb
      "pshufb      %%xmm7,%%xmm1                 \n"
      "pshufb      %%xmm7,%%xmm2                 \n"
      "pshufb      %%xmm7,%%xmm3                 \n"
      "pmaddubsw   %%xmm6,%%xmm0                 \n"  // 8x2 -> 4x2
      "pmaddubsw   %%xmm6,%%xmm1                 \n"
      "pmaddubsw   %%xmm6,%%xmm2                 \n"
      "pmaddubsw   %%xmm6,%%xmm3                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"  // 4x2 -> 4x1
      "paddw       %%xmm3,%%xmm1                 \n"
      "pxor        %%xmm2,%%xmm2                 \n"  // 0 for vpavgw
      "psrlw       $1,%%xmm0                     \n"
      "psrlw       $1,%%xmm1                     \n"
      "pavgw       %%xmm2,%%xmm0                 \n"  // /2 with rounding
      "pavgw       %%xmm2,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"  // mutates

      "movdqa      %%xmm6,%%xmm2                 \n"
      "psllw       $15,%%xmm2                    \n"  // 0x8000
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pmaddubsw   %%xmm5,%%xmm1                 \n"  // 4 V
      "pmaddubsw   %%xmm4,%%xmm0                 \n"  // 4 U
      "phaddw      %%xmm1,%%xmm0                 \n"  // uuuuvvvv
      "psubw       %%xmm0,%%xmm2                 \n"  // 0x8000 - sum
      "psrlw       $0x8,%%xmm2                   \n"
      "packuswb    %%xmm2,%%xmm2                 \n"
      "movd        %%xmm2,(%1)                   \n"  // Write 4 U's
      "pshufd      $0x55,%%xmm2,%%xmm2           \n"  // Copy V to low 4 bytes
      "movd        %%xmm2,0x00(%1,%2,1)          \n"  // Write 4 V's

      "lea         0x20(%0),%0                  \n"
      "lea         0x4(%1),%1                   \n"
      "subl        $0x8,%3                      \n"  // 8 src pixels per loop
      "jg          1b                           \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
#if defined(__i386__)
        "+m"(width)  // %3
#else
        "+rm"(width)  // %3
#endif
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "r"(rgbuvconstants),               // %5
        "m"(kShuffleAARRGGBB)              // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
1811
1812
#endif  // HAS_ARGBTOUVROW_SSSE3
1813
1814
#ifdef HAS_ARGBTOUVROW_AVX2
1815
1816
// 16x2 -> 8x1 ARGB pixels converted to 8 U and 8 V per loop iteration.
// AVX2 variant of ARGBToUVMatrixRow_SSSE3: rounding-average of each 2x2
// pixel block (rows src and src + src_stride_argb), then
// (0x8000 - maddubsw(coeff, avg)) >> 8. dst_v is addressed via dst_u.
void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
                            int src_stride_argb,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width,
                            const struct RgbUVConstants* rgbuvconstants) {
  asm volatile(
      "vbroadcastf128 0(%5),%%ymm4               \n"  // RGBToU
      "vbroadcastf128 0x10(%5),%%ymm5            \n"  // RGBToV
      "vpcmpeqb    %%ymm6,%%ymm6,%%ymm6          \n"  // 0x0101
      "vpabsb      %%ymm6,%%ymm6                 \n"
      "vmovdqa     %6,%%ymm7                     \n"  // kShuffleAARRGGBB
      "sub         %1,%2                         \n"  // %2 = dst_v - dst_u

      "1:          \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // Read 16x2 ARGB Pixels
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x00(%0,%4,1),%%ymm2          \n"  // second row
      "vmovdqu     0x20(%0,%4,1),%%ymm3          \n"
      "vpshufb     %%ymm7,%%ymm0,%%ymm0          \n"  // aarrggbb
      "vpshufb     %%ymm7,%%ymm1,%%ymm1          \n"
      "vpshufb     %%ymm7,%%ymm2,%%ymm2          \n"
      "vpshufb     %%ymm7,%%ymm3,%%ymm3          \n"
      "vpmaddubsw  %%ymm6,%%ymm0,%%ymm0          \n"  // 16x2 -> 8x2
      "vpmaddubsw  %%ymm6,%%ymm1,%%ymm1          \n"
      "vpmaddubsw  %%ymm6,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm6,%%ymm3,%%ymm3          \n"
      "vpaddw      %%ymm0,%%ymm2,%%ymm0          \n"  // 8x2 -> 8x1
      "vpaddw      %%ymm1,%%ymm3,%%ymm1          \n"
      "vpxor       %%ymm2,%%ymm2,%%ymm2          \n"  // 0 for vpavgw
      "vpsrlw      $1,%%ymm0,%%ymm0              \n"
      "vpsrlw      $1,%%ymm1,%%ymm1              \n"
      "vpavgw      %%ymm2,%%ymm0,%%ymm0          \n"  // /2 with rounding
      "vpavgw      %%ymm2,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"  // mutates
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // 8 ARGB Pixels

      "vpsllw      $15,%%ymm6,%%ymm2             \n"  // 0x8000
      "vpmaddubsw  %%ymm5,%%ymm0,%%ymm1          \n"  // 8 V
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"  // 8 U
      "vphaddw     %%ymm1,%%ymm0,%%ymm0          \n"  // uuuuvvvv uuuuvvvv
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // uuuuuuuu vvvvvvvv
      "vpsubw      %%ymm0,%%ymm2,%%ymm0          \n"  // 0x8000 - sum
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"  // mutates 8U8u- 8V8v
      "vmovq       %%xmm0,(%1)                   \n"  // Write 8 U's
      "vextractf128 $0x1,%%ymm0,%%xmm0           \n"  // Copy V to low 8 bytes
      "vmovq       %%xmm0,0x00(%1,%2,1)          \n"  // Write 8 V's

      "lea         0x40(%0),%0                   \n"
      "lea         0x8(%1),%1                    \n"
      "subl        $0x10,%3                      \n"  // 16 src pixels per loop
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
#if defined(__i386__)
        "+m"(width)  // %3
#else
        "+rm"(width)  // %3
#endif
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "r"(rgbuvconstants),               // %5
        "m"(kShuffleAARRGGBB)              // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
1886
#endif  // HAS_ARGBTOUVROW_AVX2
1887
1888
#if defined(HAS_ARGBTOUV444ROW_SSSE3) || defined(HAS_ARGBTOUVROW_AVX2)
1889
1890
// RGB to BT601 coefficients
1891
// UB   0.875 coefficient = 112
1892
// UG -0.5781 coefficient = -74
1893
// UR -0.2969 coefficient = -38
1894
// VB -0.1406 coefficient = -18
1895
// VG -0.7344 coefficient = -94
1896
// VR   0.875 coefficient = 112
1897
1898
// BT.601 U/V coefficient tables (values listed above), negated per the
// RgbUVConstants note, one table per pixel byte order.
static const struct RgbUVConstants kARGBI601UVConstants = {
    {-112, 74, 38, 0, -112, 74, 38, 0, -112, 74, 38, 0, -112, 74, 38, 0},
    {18, 94, -112, 0, 18, 94, -112, 0, 18, 94, -112, 0, 18, 94, -112, 0}};

// Same coefficients with first/third channel swapped for ABGR order.
static const struct RgbUVConstants kABGRI601UVConstants = {
    {38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112, 0},
    {-112, 94, 18, 0, -112, 94, 18, 0, -112, 94, 18, 0, -112, 94, 18, 0}};

// Alpha-leading variant for BGRA order.
static const struct RgbUVConstants kBGRAI601UVConstants = {
    {0, 38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112},
    {0, -112, 94, 18, 0, -112, 94, 18, 0, -112, 94, 18, 0, -112, 94, 18}};

// Alpha-leading variant for RGBA order.
static const struct RgbUVConstants kRGBAI601UVConstants = {
    {0, -112, 74, 38, 0, -112, 74, 38, 0, -112, 74, 38, 0, -112, 74, 38},
    {0, 18, 94, -112, 0, 18, 94, -112, 0, 18, 94, -112, 0, 18, 94, -112}};
1913
#endif
1914
1915
#ifdef HAS_ARGBTOUV444ROW_SSSE3
1916
void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
1917
                          uint8_t* dst_u,
1918
                          uint8_t* dst_v,
1919
0
                          int width) {
1920
0
  ARGBToUV444MatrixRow_SSSE3(src_argb, dst_u, dst_v, width,
1921
0
                             &kARGBI601UVConstants);
1922
0
}
1923
#endif  // HAS_ARGBTOUV444ROW_SSSE3
1924
1925
#ifdef HAS_ARGBTOUV444ROW_AVX2
1926
void ARGBToUV444Row_AVX2(const uint8_t* src_argb,
1927
                         uint8_t* dst_u,
1928
                         uint8_t* dst_v,
1929
0
                         int width) {
1930
0
  ARGBToUV444MatrixRow_AVX2(src_argb, dst_u, dst_v, width,
1931
0
                            &kARGBI601UVConstants);
1932
0
}
1933
#endif  // HAS_ARGBTOUV444ROW_AVX2
1934
1935
#ifdef HAS_ARGBTOUVROW_SSSE3
1936
void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
1937
                       int src_stride_argb,
1938
                       uint8_t* dst_u,
1939
                       uint8_t* dst_v,
1940
0
                       int width) {
1941
0
  ARGBToUVMatrixRow_SSSE3(src_argb, src_stride_argb, dst_u, dst_v, width,
1942
0
                          &kARGBI601UVConstants);
1943
0
}
1944
1945
void ABGRToUVRow_SSSE3(const uint8_t* src_abgr,
1946
                       int src_stride_abgr,
1947
                       uint8_t* dst_u,
1948
                       uint8_t* dst_v,
1949
0
                       int width) {
1950
0
  ARGBToUVMatrixRow_SSSE3(src_abgr, src_stride_abgr, dst_u, dst_v, width,
1951
0
                          &kABGRI601UVConstants);
1952
0
}
1953
1954
void BGRAToUVRow_SSSE3(const uint8_t* src_bgra,
1955
                       int src_stride_bgra,
1956
                       uint8_t* dst_u,
1957
                       uint8_t* dst_v,
1958
0
                       int width) {
1959
0
  ARGBToUVMatrixRow_SSSE3(src_bgra, src_stride_bgra, dst_u, dst_v, width,
1960
0
                          &kBGRAI601UVConstants);
1961
0
}
1962
1963
void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
1964
                       int src_stride_rgba,
1965
                       uint8_t* dst_u,
1966
                       uint8_t* dst_v,
1967
0
                       int width) {
1968
0
  ARGBToUVMatrixRow_SSSE3(src_rgba, src_stride_rgba, dst_u, dst_v, width,
1969
0
                          &kRGBAI601UVConstants);
1970
0
}
1971
#endif  // HAS_ARGBTOUVROW_SSSE3
1972
1973
#ifdef HAS_ARGBTOUVROW_AVX2
1974
void ARGBToUVRow_AVX2(const uint8_t* src_argb,
1975
                      int src_stride_argb,
1976
                      uint8_t* dst_u,
1977
                      uint8_t* dst_v,
1978
0
                      int width) {
1979
0
  ARGBToUVMatrixRow_AVX2(src_argb, src_stride_argb, dst_u, dst_v, width,
1980
0
                         &kARGBI601UVConstants);
1981
0
}
1982
1983
void ABGRToUVRow_AVX2(const uint8_t* src_abgr,
1984
                      int src_stride_abgr,
1985
                      uint8_t* dst_u,
1986
                      uint8_t* dst_v,
1987
0
                      int width) {
1988
0
  ARGBToUVMatrixRow_AVX2(src_abgr, src_stride_abgr, dst_u, dst_v, width,
1989
0
                         &kABGRI601UVConstants);
1990
0
}
1991
#endif  // HAS_ARGBTOUVROW_AVX2
1992
1993
#ifdef HAS_ARGBTOUVJ444ROW_SSSE3
1994
// RGB to JPEG coefficients
1995
// UB  0.500    coefficient = 128
1996
// UG -0.33126  coefficient = -85
1997
// UR -0.16874  coefficient = -43
1998
// VB -0.08131  coefficient = -21
1999
// VG -0.41869  coefficient = -107
2000
// VR 0.500     coefficient = 128
2001
2002
static const struct RgbUVConstants kARGBJPEGUVConstants = {
2003
    {-128, 85, 43, 0, -128, 85, 43, 0, -128, 85, 43, 0, -128, 85, 43, 0},
2004
    {21, 107, -128, 0, 21, 107, -128, 0, 21, 107, -128, 0, 21, 107, -128, 0}};
2005
2006
void ARGBToUVJ444Row_SSSE3(const uint8_t* src_argb,
2007
                           uint8_t* dst_u,
2008
                           uint8_t* dst_v,
2009
0
                           int width) {
2010
0
  ARGBToUV444MatrixRow_SSSE3(src_argb, dst_u, dst_v, width,
2011
0
                             &kARGBJPEGUVConstants);
2012
0
}
2013
2014
#endif  // HAS_ARGBTOUVJ444ROW_SSSE3
2015
2016
#ifdef HAS_ARGBTOUVJ444ROW_AVX2
2017
void ARGBToUVJ444Row_AVX2(const uint8_t* src_argb,
2018
                          uint8_t* dst_u,
2019
                          uint8_t* dst_v,
2020
258k
                          int width) {
2021
258k
  ARGBToUV444MatrixRow_AVX2(src_argb, dst_u, dst_v, width,
2022
258k
                            &kARGBJPEGUVConstants);
2023
258k
}
2024
#endif  // HAS_ARGBTOUVJ444ROW_AVX2
2025
2026
// JPEG full-range U/V coefficients reordered for ABGR memory layout
// (R,G,B,A); same values as kARGBJPEGUVConstants with B/R columns swapped.
static const struct RgbUVConstants kABGRJPEGUVConstants = {
    {43, 85, -128, 0, 43, 85, -128, 0, 43, 85, -128, 0, 43, 85, -128, 0},
    {-128, 107, 21, 0, -128, 107, 21, 0, -128, 107, 21, 0, -128, 107, 21, 0}};
2029
2030
#ifdef HAS_ARGBTOUVJROW_SSSE3
2031
void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
2032
                        int src_stride_argb,
2033
                        uint8_t* dst_u,
2034
                        uint8_t* dst_v,
2035
0
                        int width) {
2036
0
  ARGBToUVMatrixRow_SSSE3(src_argb, src_stride_argb, dst_u, dst_v, width,
2037
0
                          &kARGBJPEGUVConstants);
2038
0
}
2039
#endif  // HAS_ARGBTOUVJROW_SSSE3
2040
2041
#ifdef HAS_ABGRTOUVJROW_SSSE3
2042
void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr,
2043
                        int src_stride_abgr,
2044
                        uint8_t* dst_u,
2045
                        uint8_t* dst_v,
2046
0
                        int width) {
2047
0
  ARGBToUVMatrixRow_SSSE3(src_abgr, src_stride_abgr, dst_u, dst_v, width,
2048
0
                          &kABGRJPEGUVConstants);
2049
0
}
2050
#endif  // HAS_ABGRTOUVJROW_SSSE3
2051
2052
#ifdef HAS_ARGBTOUVJROW_AVX2
2053
void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
2054
                       int src_stride_argb,
2055
                       uint8_t* dst_u,
2056
                       uint8_t* dst_v,
2057
507k
                       int width) {
2058
507k
  ARGBToUVMatrixRow_AVX2(src_argb, src_stride_argb, dst_u, dst_v, width,
2059
507k
                         &kARGBJPEGUVConstants);
2060
507k
}
2061
#endif  // HAS_ARGBTOUVJROW_AVX2
2062
2063
#ifdef HAS_ABGRTOUVJROW_AVX2
2064
void ABGRToUVJRow_AVX2(const uint8_t* src_abgr,
2065
                       int src_stride_abgr,
2066
                       uint8_t* dst_u,
2067
                       uint8_t* dst_v,
2068
537
                       int width) {
2069
537
  ARGBToUVMatrixRow_AVX2(src_abgr, src_stride_abgr, dst_u, dst_v, width,
2070
537
                         &kABGRJPEGUVConstants);
2071
537
}
2072
#endif  // HAS_ABGRTOUVJROW_AVX2
2073
2074
0
// Convert a row of BGRA pixels to luma (Y) with SSSE3.
// Loads the kBGRAToY coefficient table into xmm4 and the kSub128/kAddY16
// bias constants into xmm5/xmm7, then runs the shared RGBTOY asm macro.
// NOTE(review): kAddY16 presumably produces limited-range Y (16..235) —
// confirm against the RGBTOY macro definition.
void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"
      "movdqa      %5,%%xmm7                     \n"

      LABELALIGN ""  //
      RGBTOY(xmm7)
      : "+r"(src_bgra),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kBGRAToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
2091
2092
0
// Convert a row of ABGR pixels to luma (Y) with SSSE3.
// Identical structure to BGRAToYRow_SSSE3; only the coefficient table
// (kABGRToY) differs to match the ABGR channel order.
void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"
      "movdqa      %5,%%xmm7                     \n"

      LABELALIGN ""  //
      RGBTOY(xmm7)
      : "+r"(src_abgr),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kABGRToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
2109
2110
0
// Convert a row of RGBA pixels to luma (Y) with SSSE3.
// Identical structure to BGRAToYRow_SSSE3; only the coefficient table
// (kRGBAToY) differs to match the RGBA channel order.
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"
      "movdqa      %5,%%xmm7                     \n"

      LABELALIGN ""  //
      RGBTOY(xmm7)
      : "+r"(src_rgba),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kRGBAToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
2127
2128
#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)

// In all READ* macros below, callers have pre-subtracted u_buf from v_buf
// ("sub %[u_buf],%[v_buf]" in the function bodies), so
// 0x00(%[u_buf],%[v_buf],1) addresses the V plane while only u_buf advances.
// Outputs: xmm3 = interleaved UV bytes, xmm4 = Y duplicated into 16-bit
// lanes (via punpcklbw with itself) for the YUVTORGB stage.

// Read 8 UV from 444
#define READYUV444                                                \
  "movq       (%[u_buf]),%%xmm3                               \n" \
  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
  "punpcklbw  %%xmm1,%%xmm3                                   \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n"

// Read 4 UV from 422, upsample to 8 UV
// (punpcklwd duplicates each UV pair horizontally).
#define READYUV422                                                \
  "movd       (%[u_buf]),%%xmm3                               \n" \
  "movd       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x4(%[u_buf]),%[u_buf]                          \n" \
  "punpcklbw  %%xmm1,%%xmm3                                   \n" \
  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n"
2150
2151
// Read 4 UV from 422 10 bit, upsample to 8 UV
// 10-bit UV: arithmetic shift right by 2 then packuswb narrows to 8 bits.
// 10-bit Y: (y << 6) + (y >> 4) replicates the 10-bit value into a full
// 16-bit range for the pmulhuw in YUVTORGB.
#define READYUV210                                                \
  "movq       (%[u_buf]),%%xmm3                               \n" \
  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
  "punpcklwd  %%xmm1,%%xmm3                                   \n" \
  "psraw      $2,%%xmm3                                       \n" \
  "packuswb   %%xmm3,%%xmm3                                   \n" \
  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "movdqa     %%xmm4,%%xmm2                                   \n" \
  "psllw      $6,%%xmm4                                       \n" \
  "psrlw      $4,%%xmm2                                       \n" \
  "paddw      %%xmm2,%%xmm4                                   \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n"

// Same as READYUV210 plus 8 alpha values: 10-bit A is narrowed to 8 bits
// into xmm5 (psraw $2 + packuswb).
#define READYUVA210                                               \
  "movq       (%[u_buf]),%%xmm3                               \n" \
  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
  "punpcklwd  %%xmm1,%%xmm3                                   \n" \
  "psraw      $2,%%xmm3                                       \n" \
  "packuswb   %%xmm3,%%xmm3                                   \n" \
  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "movdqa     %%xmm4,%%xmm2                                   \n" \
  "psllw      $6,%%xmm4                                       \n" \
  "psrlw      $4,%%xmm2                                       \n" \
  "paddw      %%xmm2,%%xmm4                                   \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n" \
  "movdqu     (%[a_buf]),%%xmm5                               \n" \
  "psraw      $2,%%xmm5                                       \n" \
  "packuswb   %%xmm5,%%xmm5                                   \n" \
  "lea        0x10(%[a_buf]),%[a_buf]                         \n"
2185
2186
// Read 8 UV from 444 10 bit
// 8 full-resolution 10-bit UV pairs are narrowed to 8 bits and interleaved;
// 10-bit Y is expanded to 16-bit range as in READYUV210.
#define READYUV410                                                \
  "movdqu     (%[u_buf]),%%xmm3                               \n" \
  "movdqu     0x00(%[u_buf],%[v_buf],1),%%xmm2                \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                         \n" \
  "psraw      $2,%%xmm3                                       \n" \
  "psraw      $2,%%xmm2                                       \n" \
  "movdqa     %%xmm3,%%xmm1                                   \n" \
  "punpcklwd  %%xmm2,%%xmm3                                   \n" \
  "punpckhwd  %%xmm2,%%xmm1                                   \n" \
  "packuswb   %%xmm1,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "movdqa     %%xmm4,%%xmm2                                   \n" \
  "psllw      $6,%%xmm4                                       \n" \
  "psrlw      $4,%%xmm2                                       \n" \
  "paddw      %%xmm2,%%xmm4                                   \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n"

// Read 8 UV from 444 10 bit.  With 8 Alpha.
// Same as READYUV410 plus 10-bit alpha narrowed to 8 bits in xmm5.
#define READYUVA410                                               \
  "movdqu     (%[u_buf]),%%xmm3                               \n" \
  "movdqu     0x00(%[u_buf],%[v_buf],1),%%xmm2                \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                         \n" \
  "psraw      $2,%%xmm3                                       \n" \
  "psraw      $2,%%xmm2                                       \n" \
  "movdqa     %%xmm3,%%xmm1                                   \n" \
  "punpcklwd  %%xmm2,%%xmm3                                   \n" \
  "punpckhwd  %%xmm2,%%xmm1                                   \n" \
  "packuswb   %%xmm1,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "movdqa     %%xmm4,%%xmm2                                   \n" \
  "psllw      $6,%%xmm4                                       \n" \
  "psrlw      $4,%%xmm2                                       \n" \
  "paddw      %%xmm2,%%xmm4                                   \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n" \
  "movdqu     (%[a_buf]),%%xmm5                               \n" \
  "psraw      $2,%%xmm5                                       \n" \
  "packuswb   %%xmm5,%%xmm5                                   \n" \
  "lea        0x10(%[a_buf]),%[a_buf]                         \n"
2225
2226
// Read 4 UV from 422 12 bit, upsample to 8 UV
// 12-bit UV: shift right by 4 to 8 bits; 12-bit Y: (y << 4) + (y >> 8)
// expands to full 16-bit range.
#define READYUV212                                                \
  "movq       (%[u_buf]),%%xmm3                               \n" \
  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
  "punpcklwd  %%xmm1,%%xmm3                                   \n" \
  "psraw      $0x4,%%xmm3                                     \n" \
  "packuswb   %%xmm3,%%xmm3                                   \n" \
  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "movdqa     %%xmm4,%%xmm2                                   \n" \
  "psllw      $4,%%xmm4                                       \n" \
  "psrlw      $8,%%xmm2                                       \n" \
  "paddw      %%xmm2,%%xmm4                                   \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n"

// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
// READYUV422 plus 8 alpha bytes loaded into xmm5.
#define READYUVA422                                               \
  "movd       (%[u_buf]),%%xmm3                               \n" \
  "movd       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x4(%[u_buf]),%[u_buf]                          \n" \
  "punpcklbw  %%xmm1,%%xmm3                                   \n" \
  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n" \
  "movq       (%[a_buf]),%%xmm5                               \n" \
  "lea        0x8(%[a_buf]),%[a_buf]                          \n"

// Read 8 UV from 444.  With 8 Alpha.
// READYUV444 plus 8 alpha bytes loaded into xmm5.
#define READYUVA444                                               \
  "movq       (%[u_buf]),%%xmm3                               \n" \
  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
  "punpcklbw  %%xmm1,%%xmm3                                   \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n" \
  "movq       (%[a_buf]),%%xmm5                               \n" \
  "lea        0x8(%[a_buf]),%[a_buf]                          \n"
2266
2267
// Read 4 UV from NV12, upsample to 8 UV
// NV12 stores UV already interleaved, so only horizontal duplication
// (punpcklwd) is needed.
#define READNV12                                                  \
  "movq       (%[uv_buf]),%%xmm3                              \n" \
  "lea        0x8(%[uv_buf]),%[uv_buf]                        \n" \
  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n"

// Read 4 VU from NV21, upsample to 8 UV
// kShuffleNV21 swaps the V/U byte order while duplicating pairs.
#define READNV21                                                  \
  "movq       (%[vu_buf]),%%xmm3                              \n" \
  "lea        0x8(%[vu_buf]),%[vu_buf]                        \n" \
  "pshufb     %[kShuffleNV21], %%xmm3                         \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n"

// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
// xmm6 kShuffleYUY2Y,
// xmm7 kShuffleYUY2UV
#define READYUY2                                                  \
  "movdqu     (%[yuy2_buf]),%%xmm4                            \n" \
  "lea        0x10(%[yuy2_buf]),%[yuy2_buf]                   \n" \
  "movdqa     %%xmm4,%%xmm3                                   \n" \
  "pshufb     %%xmm6,%%xmm4                                   \n" \
  "pshufb     %%xmm7,%%xmm3                                   \n"

// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
// xmm6 kShuffleUYVYY,
// xmm7 kShuffleUYVYUV
#define READUYVY                                                  \
  "movdqu     (%[uyvy_buf]),%%xmm4                            \n" \
  "lea        0x10(%[uyvy_buf]),%[uyvy_buf]                   \n" \
  "movdqa     %%xmm4,%%xmm3                                   \n" \
  "pshufb     %%xmm6,%%xmm4                                   \n" \
  "pshufb     %%xmm7,%%xmm3                                   \n"

// Read 4 UV from P210, upsample to 8 UV
// P210/P410 store 10-bit values in the high bits of 16-bit words, so the
// UV bytes come from psrlw $8; Y words are used as-is (16-bit biased).
#define READP210                                                  \
  "movdqu     (%[uv_buf]),%%xmm3                              \n" \
  "lea        0x10(%[uv_buf]),%[uv_buf]                       \n" \
  "psrlw      $0x8,%%xmm3                                     \n" \
  "packuswb   %%xmm3,%%xmm3                                   \n" \
  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n"

// Read 8 UV from P410
#define READP410                                                  \
  "movdqu     (%[uv_buf]),%%xmm3                              \n" \
  "movdqu     0x10(%[uv_buf]),%%xmm1                          \n" \
  "lea        0x20(%[uv_buf]),%[uv_buf]                       \n" \
  "psrlw      $0x8,%%xmm3                                     \n" \
  "psrlw      $0x8,%%xmm1                                     \n" \
  "packuswb   %%xmm1,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n"
2325
2326
#if defined(__x86_64__)
// 64-bit: enough XMM registers to cache the YuvConstants table.
// xmm8/xmm9/xmm10 = UV-to-B/G/R coefficient rows (offsets 0/32/64),
// xmm11 = Y coefficient (offset 96), xmm12 = Y bias (offset 128),
// xmm13 = 0x80 broadcast (built via pcmpeqb/psllw $7/pshufb) used to
// recenter UV bytes.
#define YUVTORGB_SETUP(yuvconstants)                              \
  "pcmpeqb    %%xmm13,%%xmm13                                 \n" \
  "movdqa     (%[yuvconstants]),%%xmm8                        \n" \
  "pxor       %%xmm12,%%xmm12                                 \n" \
  "movdqa     32(%[yuvconstants]),%%xmm9                      \n" \
  "psllw      $7,%%xmm13                                      \n" \
  "movdqa     64(%[yuvconstants]),%%xmm10                     \n" \
  "pshufb     %%xmm12,%%xmm13                                 \n" \
  "movdqa     96(%[yuvconstants]),%%xmm11                     \n" \
  "movdqa     128(%[yuvconstants]),%%xmm12                    \n"

// Convert 8 pixels: 8 UV and 8 Y
// In: xmm3 = UV bytes, xmm4 = Y words (from a READ* macro).
// Out: xmm0/xmm1/xmm2 = 16-bit B/G/R (still scaled; callers shift >> 6).
#define YUVTORGB16(yuvconstants)                                  \
  "psubb      %%xmm13,%%xmm3                                  \n" \
  "pmulhuw    %%xmm11,%%xmm4                                  \n" \
  "movdqa     %%xmm8,%%xmm0                                   \n" \
  "movdqa     %%xmm9,%%xmm1                                   \n" \
  "movdqa     %%xmm10,%%xmm2                                  \n" \
  "paddw      %%xmm12,%%xmm4                                  \n" \
  "pmaddubsw  %%xmm3,%%xmm0                                   \n" \
  "pmaddubsw  %%xmm3,%%xmm1                                   \n" \
  "pmaddubsw  %%xmm3,%%xmm2                                   \n" \
  "paddsw     %%xmm4,%%xmm0                                   \n" \
  "paddsw     %%xmm4,%%xmm2                                   \n" \
  "psubsw     %%xmm1,%%xmm4                                   \n" \
  "movdqa     %%xmm4,%%xmm1                                   \n"

#define YUVTORGB_REGS "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",

#else
// 32-bit: only 8 XMM registers, so constants are re-read from memory
// every iteration and there is no setup step.
#define YUVTORGB_SETUP(yuvconstants)

// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB16(yuvconstants)                                  \
  "pcmpeqb    %%xmm0,%%xmm0                                   \n" \
  "pxor       %%xmm1,%%xmm1                                   \n" \
  "psllw      $7,%%xmm0                                       \n" \
  "pshufb     %%xmm1,%%xmm0                                   \n" \
  "psubb      %%xmm0,%%xmm3                                   \n" \
  "pmulhuw    96(%[yuvconstants]),%%xmm4                      \n" \
  "movdqa     (%[yuvconstants]),%%xmm0                        \n" \
  "movdqa     32(%[yuvconstants]),%%xmm1                      \n" \
  "movdqa     64(%[yuvconstants]),%%xmm2                      \n" \
  "pmaddubsw  %%xmm3,%%xmm0                                   \n" \
  "pmaddubsw  %%xmm3,%%xmm1                                   \n" \
  "pmaddubsw  %%xmm3,%%xmm2                                   \n" \
  "movdqa     128(%[yuvconstants]),%%xmm3                     \n" \
  "paddw      %%xmm3,%%xmm4                                   \n" \
  "paddsw     %%xmm4,%%xmm0                                   \n" \
  "paddsw     %%xmm4,%%xmm2                                   \n" \
  "psubsw     %%xmm1,%%xmm4                                   \n" \
  "movdqa     %%xmm4,%%xmm1                                   \n"

#define YUVTORGB_REGS
#endif
2382
2383
// YUVTORGB16 followed by >>6 descale and saturation to 8-bit B/G/R
// (each duplicated across the low half of xmm0/xmm1/xmm2 by packuswb).
#define YUVTORGB(yuvconstants)                                    \
  YUVTORGB16(yuvconstants)                                        \
  "psraw      $0x6,%%xmm0                                     \n" \
  "psraw      $0x6,%%xmm1                                     \n" \
  "psraw      $0x6,%%xmm2                                     \n" \
  "packuswb   %%xmm0,%%xmm0                                   \n" \
  "packuswb   %%xmm1,%%xmm1                                   \n" \
  "packuswb   %%xmm2,%%xmm2                                   \n"

// Store 8 ARGB values.
// In: xmm0/xmm1/xmm2 = B/G/R bytes, xmm5 = alpha bytes.
// Interleaves B,G then R,A and writes 32 bytes to dst_argb.
#define STOREARGB                                                  \
  "punpcklbw  %%xmm1,%%xmm0                                    \n" \
  "punpcklbw  %%xmm5,%%xmm2                                    \n" \
  "movdqa     %%xmm0,%%xmm1                                    \n" \
  "punpcklwd  %%xmm2,%%xmm0                                    \n" \
  "punpckhwd  %%xmm2,%%xmm1                                    \n" \
  "movdqu     %%xmm0,(%[dst_argb])                             \n" \
  "movdqu     %%xmm1,0x10(%[dst_argb])                         \n" \
  "lea        0x20(%[dst_argb]), %[dst_argb]                   \n"

// Store 8 RGBA values.
// Alpha is generated as 0xff in xmm5 (pcmpeqb), then A,B and G,R pairs are
// interleaved and 32 bytes written to dst_rgba.
#define STORERGBA                                                  \
  "pcmpeqb   %%xmm5,%%xmm5                                     \n" \
  "punpcklbw %%xmm2,%%xmm1                                     \n" \
  "punpcklbw %%xmm0,%%xmm5                                     \n" \
  "movdqa    %%xmm5,%%xmm0                                     \n" \
  "punpcklwd %%xmm1,%%xmm5                                     \n" \
  "punpckhwd %%xmm1,%%xmm0                                     \n" \
  "movdqu    %%xmm5,(%[dst_rgba])                              \n" \
  "movdqu    %%xmm0,0x10(%[dst_rgba])                          \n" \
  "lea       0x20(%[dst_rgba]),%[dst_rgba]                     \n"

// Store 8 RGB24 values.
// Builds BGRX quads, then pshufb (masks in xmm5/xmm6) + palignr compact
// them to 24 bytes of packed RGB24.
#define STORERGB24                                                      \
  "punpcklbw   %%xmm1,%%xmm0                                        \n" \
  "punpcklbw   %%xmm2,%%xmm2                                        \n" \
  "movdqa      %%xmm0,%%xmm1                                        \n" \
  "punpcklwd   %%xmm2,%%xmm0                                        \n" \
  "punpckhwd   %%xmm2,%%xmm1                                        \n" \
  "pshufb      %%xmm5,%%xmm0                                        \n" \
  "pshufb      %%xmm6,%%xmm1                                        \n" \
  "palignr     $0xc,%%xmm0,%%xmm1                                   \n" \
  "movq        %%xmm0,(%[dst_rgb24])                                \n" \
  "movdqu      %%xmm1,0x8(%[dst_rgb24])                             \n" \
  "lea         0x18(%[dst_rgb24]),%[dst_rgb24]                      \n"

// Store 8 AR30 values.
// Consumes the 16-bit YUVTORGB16 output directly: descale by 4, clamp to
// [xmm6, xmm7] (caller-provided min/max), then pack 10-bit B/G/R plus a
// 2-bit alpha field (xmm5) into 32-bit AR30 words.
#define STOREAR30                                                  \
  "psraw      $0x4,%%xmm0                                      \n" \
  "psraw      $0x4,%%xmm1                                      \n" \
  "psraw      $0x4,%%xmm2                                      \n" \
  "pminsw     %%xmm7,%%xmm0                                    \n" \
  "pminsw     %%xmm7,%%xmm1                                    \n" \
  "pminsw     %%xmm7,%%xmm2                                    \n" \
  "pmaxsw     %%xmm6,%%xmm0                                    \n" \
  "pmaxsw     %%xmm6,%%xmm1                                    \n" \
  "pmaxsw     %%xmm6,%%xmm2                                    \n" \
  "psllw      $0x4,%%xmm2                                      \n" \
  "movdqa     %%xmm0,%%xmm3                                    \n" \
  "punpcklwd  %%xmm2,%%xmm0                                    \n" \
  "punpckhwd  %%xmm2,%%xmm3                                    \n" \
  "movdqa     %%xmm1,%%xmm2                                    \n" \
  "punpcklwd  %%xmm5,%%xmm1                                    \n" \
  "punpckhwd  %%xmm5,%%xmm2                                    \n" \
  "pslld      $0xa,%%xmm1                                      \n" \
  "pslld      $0xa,%%xmm2                                      \n" \
  "por        %%xmm1,%%xmm0                                    \n" \
  "por        %%xmm2,%%xmm3                                    \n" \
  "movdqu     %%xmm0,(%[dst_ar30])                             \n" \
  "movdqu     %%xmm3,0x10(%[dst_ar30])                         \n" \
  "lea        0x20(%[dst_ar30]), %[dst_ar30]                   \n"
2454
2455
// Convert one row of planar I444 (full-resolution Y, U, V) to ARGB.
// v_buf is rebased relative to u_buf ("sub %[u_buf],%[v_buf]") so the READ
// macros can address both planes while advancing only u_buf. xmm5 is set to
// all-ones (pcmpeqb) to supply 0xff alpha for STOREARGB.
// Processes 8 pixels per loop iteration; width is assumed a multiple of 8
// by the loop structure (sub $0x8 / jg) — caller contract.
void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:          \n"
    READYUV444
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
2483
2484
#ifdef HAS_I444ALPHATOARGBROW_SSSE3
2485
void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
2486
                                     const uint8_t* u_buf,
2487
                                     const uint8_t* v_buf,
2488
                                     const uint8_t* a_buf,
2489
                                     uint8_t* dst_argb,
2490
                                     const struct YuvConstants* yuvconstants,
2491
0
                                     int width) {
2492
0
  asm volatile(YUVTORGB_SETUP(
2493
0
      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
2494
2495
0
      LABELALIGN "1:          \n" READYUVA444 YUVTORGB(yuvconstants)
2496
0
                   STOREARGB
2497
0
      "subl        $0x8,%[width]                 \n"
2498
0
      "jg          1b                            \n"
2499
0
               : [y_buf] "+r"(y_buf),        // %[y_buf]
2500
0
                 [u_buf] "+r"(u_buf),        // %[u_buf]
2501
0
                 [v_buf] "+r"(v_buf),        // %[v_buf]
2502
0
                 [a_buf] "+r"(a_buf),        // %[a_buf]
2503
0
                 [dst_argb] "+r"(dst_argb),  // %[dst_argb]
2504
#if defined(__i386__)
2505
                 [width] "+m"(width)  // %[width]
2506
#else
2507
0
                 [width] "+rm"(width)  // %[width]
2508
0
#endif
2509
0
               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
2510
0
               : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3",
2511
0
                 "xmm4", "xmm5");
2512
0
}
2513
#endif  // HAS_I444ALPHATOARGBROW_SSSE3
2514
2515
// Convert one row of I422 (2x1 subsampled U/V) to 24-bit RGB24,
// 8 pixels per loop iteration. xmm5/xmm6 hold the ARGB->RGB24 pack
// shuffle masks used by STORERGB24.
void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
                                 const uint8_t* u_buf,
                                 const uint8_t* v_buf,
                                 uint8_t* dst_rgb24,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "movdqa      %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
      "movdqa      %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
      // v_buf becomes an offset relative to u_buf for the read macros.
      "sub         %[u_buf],%[v_buf]             \n"

    LABELALIGN
      "1:          \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STORERGB24
      // "subl": width may be a memory operand on i386 ("+m" below).
      "subl        $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
2550
2551
// Convert one row of I444 (full-resolution U/V) to 24-bit RGB24,
// 8 pixels per loop iteration. Identical to I422ToRGB24Row_SSSE3 except
// it reads chroma via READYUV444.
void OMITFP I444ToRGB24Row_SSSE3(const uint8_t* y_buf,
                                 const uint8_t* u_buf,
                                 const uint8_t* v_buf,
                                 uint8_t* dst_rgb24,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "movdqa      %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
      "movdqa      %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
      // v_buf becomes an offset relative to u_buf for the read macros.
      "sub         %[u_buf],%[v_buf]             \n"

    LABELALIGN
      "1:          \n"
    READYUV444
    YUVTORGB(yuvconstants)
    STORERGB24
      // "subl": width may be a memory operand on i386 ("+m" below).
      "subl        $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
2586
2587
// Convert one row of I422 (2x1 subsampled 8-bit U/V) to ARGB,
// 8 pixels per loop iteration.
void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      // v_buf becomes an offset relative to u_buf for the read macros.
      "sub         %[u_buf],%[v_buf]             \n"
      // xmm5 = all ones: 0xFF alpha bytes for STOREARGB.
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:          \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
2615
2616
// Convert one row of I422 to AR30 (2-bit alpha, 10-bit RGB),
// 8 pixels per loop iteration. xmm5/xmm6/xmm7 hold the alpha, min and
// max constants consumed by STOREAR30; YUVTORGB16 keeps 16-bit precision.
void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // AR30 constants
      "psrlw       $14,%%xmm5                    \n"
      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
      "pxor        %%xmm6,%%xmm6                 \n"  // 0 for min
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $6,%%xmm7                     \n"  // 1023 for max
    LABELALIGN
      "1:          \n"
    READYUV422
    YUVTORGB16(yuvconstants)
    STOREAR30
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
2649
2650
// 10 bit YUV to ARGB
// Convert one row of I210 (10-bit samples in uint16_t, 2x1 subsampled U/V)
// to 8-bit ARGB, 8 pixels per loop iteration.
void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      // v_buf becomes an offset relative to u_buf for the read macros.
      "sub         %[u_buf],%[v_buf]             \n"
      // xmm5 = all ones: 0xFF alpha bytes for STOREARGB.
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:          \n"
    READYUV210
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
2679
2680
// 12 bit YUV to ARGB
// Convert one row of I212 (12-bit samples in uint16_t) to 8-bit ARGB,
// 8 pixels per loop iteration.
void OMITFP I212ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      // v_buf becomes an offset relative to u_buf for the read macros.
      "sub         %[u_buf],%[v_buf]             \n"
      // xmm5 = all ones: 0xFF alpha bytes for STOREARGB.
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:          \n"
    READYUV212
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
2709
2710
// 10 bit YUV to AR30
// Convert one row of I210 to AR30, 8 pixels per loop iteration.
// xmm5/xmm6/xmm7 hold alpha/min/max constants for STOREAR30.
void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $14,%%xmm5                    \n"
      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
      "pxor        %%xmm6,%%xmm6                 \n"  // 0 for min
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $6,%%xmm7                     \n"  // 1023 for max
    LABELALIGN
      "1:          \n"
    READYUV210
    YUVTORGB16(yuvconstants)
    STOREAR30
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
2744
2745
// 12 bit YUV to AR30
// Convert one row of I212 to AR30, 8 pixels per loop iteration.
// Same setup as I210ToAR30Row_SSSE3 but reads via READYUV212.
void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $14,%%xmm5                    \n"
      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
      "pxor        %%xmm6,%%xmm6                 \n"  // 0 for min
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $6,%%xmm7                     \n"  // 1023 for max
    LABELALIGN
      "1:          \n"
    READYUV212
    YUVTORGB16(yuvconstants)
    STOREAR30
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
2779
2780
// 10 bit YUV to ARGB
// Convert one row of I410 (10-bit, full-resolution U/V) to 8-bit ARGB,
// 8 pixels per loop iteration.
void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      // v_buf becomes an offset relative to u_buf for the read macros.
      "sub         %[u_buf],%[v_buf]             \n"
      // xmm5 = all ones: 0xFF alpha bytes for STOREARGB.
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:          \n"
    READYUV410
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
2808
2809
#ifdef HAS_I210ALPHATOARGBROW_SSSE3
2810
// 10 bit YUVA to ARGB
2811
void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
2812
                                     const uint16_t* u_buf,
2813
                                     const uint16_t* v_buf,
2814
                                     const uint16_t* a_buf,
2815
                                     uint8_t* dst_argb,
2816
                                     const struct YuvConstants* yuvconstants,
2817
0
                                     int width) {
2818
0
  asm volatile(YUVTORGB_SETUP(
2819
0
      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
2820
2821
0
      LABELALIGN "1:          \n" READYUVA210 YUVTORGB(yuvconstants)
2822
0
                   STOREARGB
2823
0
      "subl        $0x8,%[width]                 \n"
2824
0
      "jg          1b                            \n"
2825
0
               : [y_buf] "+r"(y_buf),  // %[y_buf]
2826
0
                 [u_buf] "+r"(u_buf),  // %[u_buf]
2827
0
                 [v_buf] "+r"(v_buf),  // %[v_buf]
2828
0
                 [a_buf] "+r"(a_buf),
2829
0
                 [dst_argb] "+r"(dst_argb),  // %[dst_argb]
2830
#if defined(__i386__)
2831
                 [width] "+m"(width)  // %[width]
2832
#else
2833
0
                 [width] "+rm"(width)  // %[width]
2834
0
#endif
2835
0
               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
2836
0
               : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3",
2837
0
                 "xmm4", "xmm5");
2838
0
}
2839
#endif
2840
2841
#ifdef HAS_I410ALPHATOARGBROW_SSSE3
2842
// 10 bit YUVA to ARGB
2843
void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
2844
                                     const uint16_t* u_buf,
2845
                                     const uint16_t* v_buf,
2846
                                     const uint16_t* a_buf,
2847
                                     uint8_t* dst_argb,
2848
                                     const struct YuvConstants* yuvconstants,
2849
0
                                     int width) {
2850
0
  asm volatile(YUVTORGB_SETUP(
2851
0
      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
2852
2853
0
      LABELALIGN "1:          \n" READYUVA410 YUVTORGB(yuvconstants)
2854
0
                   STOREARGB
2855
0
      "subl        $0x8,%[width]                 \n"
2856
0
      "jg          1b                            \n"
2857
0
               : [y_buf] "+r"(y_buf),  // %[y_buf]
2858
0
                 [u_buf] "+r"(u_buf),  // %[u_buf]
2859
0
                 [v_buf] "+r"(v_buf),  // %[v_buf]
2860
0
                 [a_buf] "+r"(a_buf),
2861
0
                 [dst_argb] "+r"(dst_argb),  // %[dst_argb]
2862
#if defined(__i386__)
2863
                 [width] "+m"(width)  // %[width]
2864
#else
2865
0
                 [width] "+rm"(width)  // %[width]
2866
0
#endif
2867
0
               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
2868
0
               : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3",
2869
0
                 "xmm4", "xmm5");
2870
0
}
2871
#endif
2872
2873
// 10 bit YUV to AR30
// Convert one row of I410 (full-resolution U/V) to AR30,
// 8 pixels per loop iteration; xmm5/xmm6/xmm7 feed STOREAR30.
void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $14,%%xmm5                    \n"
      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
      "pxor        %%xmm6,%%xmm6                 \n"  // 0 for min
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $6,%%xmm7                     \n"  // 1023 for max
    LABELALIGN
      "1:          \n"
    READYUV410
    YUVTORGB16(yuvconstants)
    STOREAR30
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)   // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
2907
2908
#ifdef HAS_I422ALPHATOARGBROW_SSSE3
2909
void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
2910
                                     const uint8_t* u_buf,
2911
                                     const uint8_t* v_buf,
2912
                                     const uint8_t* a_buf,
2913
                                     uint8_t* dst_argb,
2914
                                     const struct YuvConstants* yuvconstants,
2915
0
                                     int width) {
2916
0
  asm volatile(YUVTORGB_SETUP(
2917
0
      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
2918
2919
0
      LABELALIGN "1:          \n" READYUVA422 YUVTORGB(yuvconstants)
2920
0
                   STOREARGB
2921
0
      "subl        $0x8,%[width]                 \n"
2922
0
      "jg          1b                            \n"
2923
0
               : [y_buf] "+r"(y_buf),        // %[y_buf]
2924
0
                 [u_buf] "+r"(u_buf),        // %[u_buf]
2925
0
                 [v_buf] "+r"(v_buf),        // %[v_buf]
2926
0
                 [a_buf] "+r"(a_buf),        // %[a_buf]
2927
0
                 [dst_argb] "+r"(dst_argb),  // %[dst_argb]
2928
#if defined(__i386__)
2929
                 [width] "+m"(width)  // %[width]
2930
#else
2931
0
                 [width] "+rm"(width)  // %[width]
2932
0
#endif
2933
0
               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
2934
0
               : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3",
2935
0
                 "xmm4", "xmm5");
2936
0
}
2937
#endif  // HAS_I422ALPHATOARGBROW_SSSE3
2938
2939
// Convert one row of NV12 (Y plane + interleaved UV plane) to ARGB,
// 8 pixels per loop iteration.
void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* uv_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile(YUVTORGB_SETUP(
      // xmm5 = all ones: 0xFF alpha bytes for STOREARGB.
      yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"

      LABELALIGN "1:          \n" READNV12 YUVTORGB(yuvconstants)
                   STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
               : [y_buf] "+r"(y_buf),              // %[y_buf]
                 [uv_buf] "+r"(uv_buf),            // %[uv_buf]
                 [dst_argb] "+r"(dst_argb),        // %[dst_argb]
                 [width] "+rm"(width)              // %[width]
               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
               : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3",
                 "xmm4", "xmm5");
}
2959
2960
// Convert one row of NV21 (Y plane + interleaved VU plane) to ARGB,
// 8 pixels per loop iteration. kShuffleNV21 is passed for READNV21 to
// swap V/U into U/V order.
void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* vu_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile(YUVTORGB_SETUP(
      // xmm5 = all ones: 0xFF alpha bytes for STOREARGB.
      yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"

      LABELALIGN "1:          \n" READNV21 YUVTORGB(yuvconstants)
                   STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
               : [y_buf] "+r"(y_buf),               // %[y_buf]
                 [vu_buf] "+r"(vu_buf),             // %[vu_buf]
                 [dst_argb] "+r"(dst_argb),         // %[dst_argb]
                 [width] "+rm"(width)               // %[width]
               : [yuvconstants] "r"(yuvconstants),  // %[yuvconstants]
                 [kShuffleNV21] "m"(kShuffleNV21)
               : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3",
                 "xmm4", "xmm5");
}
2981
2982
// Convert one row of packed YUY2 (Y0 U Y1 V) to ARGB, 8 pixels per loop.
// xmm6/xmm7 hold the Y and UV extraction shuffle masks used by READYUY2,
// hence they appear in the clobber list.
void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile(
      "movdqa      %[kShuffleYUY2Y],%%xmm6       \n"
      "movdqa      %[kShuffleYUY2UV],%%xmm7      \n" YUVTORGB_SETUP(
      yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"

      LABELALIGN "1:          \n" READYUY2 YUVTORGB(yuvconstants) STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
      : [yuy2_buf] "+r"(yuy2_buf),         // %[yuy2_buf]
        [dst_argb] "+r"(dst_argb),         // %[dst_argb]
        [width] "+rm"(width)               // %[width]
      : [yuvconstants] "r"(yuvconstants),  // %[yuvconstants]
        [kShuffleYUY2Y] "m"(kShuffleYUY2Y), [kShuffleYUY2UV] "m"(kShuffleYUY2UV)
      : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
        "xmm5", "xmm6", "xmm7");
}
3002
3003
// Convert one row of packed UYVY (U Y0 V Y1) to ARGB, 8 pixels per loop.
// xmm6/xmm7 hold the Y and UV extraction shuffle masks used by READUYVY.
// FIX: xmm6 and xmm7 are written by the movdqa loads below, so they must
// be declared as clobbers (the sibling YUY2ToARGBRow_SSSE3 already does);
// omitting them lets the compiler assume those registers survive the asm.
void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile(
      "movdqa      %[kShuffleUYVYY],%%xmm6       \n"
      "movdqa      %[kShuffleUYVYUV],%%xmm7      \n" YUVTORGB_SETUP(
      yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"

      LABELALIGN "1:          \n" READUYVY YUVTORGB(yuvconstants) STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
      : [uyvy_buf] "+r"(uyvy_buf),         // %[uyvy_buf]
        [dst_argb] "+r"(dst_argb),         // %[dst_argb]
        [width] "+rm"(width)               // %[width]
      : [yuvconstants] "r"(yuvconstants),  // %[yuvconstants]
        [kShuffleUYVYY] "m"(kShuffleUYVYY), [kShuffleUYVYUV] "m"(kShuffleUYVYUV)
      : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
        "xmm5", "xmm6", "xmm7");
}
3023
3024
// Convert one row of P210 (10-bit-in-16 Y plane + interleaved UV plane)
// to 8-bit ARGB, 8 pixels per loop iteration.
void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* uv_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile(YUVTORGB_SETUP(
      // xmm5 = all ones: 0xFF alpha bytes for STOREARGB.
      yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"

      LABELALIGN "1:          \n" READP210 YUVTORGB(yuvconstants)
                   STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
               : [y_buf] "+r"(y_buf),              // %[y_buf]
                 [uv_buf] "+r"(uv_buf),            // %[uv_buf]
                 [dst_argb] "+r"(dst_argb),        // %[dst_argb]
                 [width] "+rm"(width)              // %[width]
               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
               : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3",
                 "xmm4", "xmm5");
}
3044
3045
// Convert one row of P410 (10-bit-in-16 Y plane + full-resolution
// interleaved UV plane) to 8-bit ARGB, 8 pixels per loop iteration.
void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* uv_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile(YUVTORGB_SETUP(
      // xmm5 = all ones: 0xFF alpha bytes for STOREARGB.
      yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"

      LABELALIGN "1:          \n" READP410 YUVTORGB(yuvconstants)
                   STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
               : [y_buf] "+r"(y_buf),              // %[y_buf]
                 [uv_buf] "+r"(uv_buf),            // %[uv_buf]
                 [dst_argb] "+r"(dst_argb),        // %[dst_argb]
                 [width] "+rm"(width)              // %[width]
               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
               : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3",
                 "xmm4", "xmm5");
}
3065
3066
// Convert one row of P210 to AR30, 8 pixels per loop iteration.
// xmm5/xmm6/xmm7 hold alpha/min/max constants for STOREAR30.
void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* uv_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $14,%%xmm5                    \n"
      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
      "pxor        %%xmm6,%%xmm6                 \n"  // 0 for min
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $6,%%xmm7                     \n"  // 1023 for max
    LABELALIGN
      "1:          \n"
    READP210
    YUVTORGB16(yuvconstants)
    STOREAR30
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),              // %[y_buf]
    [uv_buf]"+r"(uv_buf),            // %[uv_buf]
    [dst_ar30]"+r"(dst_ar30),        // %[dst_ar30]
    [width]"+rm"(width)              // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
3096
3097
// Convert one row of P410 (full-resolution UV) to AR30, 8 pixels per
// loop iteration. xmm5/xmm6/xmm7 hold alpha/min/max for STOREAR30.
void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* uv_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $14,%%xmm5                    \n"
      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
      "pxor        %%xmm6,%%xmm6                 \n"  // 0 for min
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $6,%%xmm7                     \n"  // 1023 for max
    LABELALIGN
      "1:          \n"
    READP410
    YUVTORGB16(yuvconstants)
    STOREAR30
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),              // %[y_buf]
    [uv_buf]"+r"(uv_buf),            // %[uv_buf]
    [dst_ar30]"+r"(dst_ar30),        // %[dst_ar30]
    [width]"+rm"(width)              // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
3127
3128
// Convert one row of I422 to RGBA (alpha in the low byte, see STORERGBA),
// 8 pixels per loop iteration.
void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_rgba,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      // v_buf becomes an offset relative to u_buf for the read macros.
      "sub         %[u_buf],%[v_buf]             \n"
      // xmm5 = all ones: 0xFF alpha bytes for STORERGBA.
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:          \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STORERGBA
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
3156
3157
#endif  // HAS_I422TOARGBROW_SSSE3
3158
3159
// Read 16 UV from 444
3160
#define READYUV444_AVX2                                               \
3161
  "vmovdqu    (%[u_buf]),%%xmm3                                   \n" \
3162
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
3163
  "lea        0x10(%[u_buf]),%[u_buf]                             \n" \
3164
  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
3165
  "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n" \
3166
  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3                                \n" \
3167
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
3168
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
3169
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
3170
  "lea        0x10(%[y_buf]),%[y_buf]                             \n"
3171
3172
// Read 8 UV from 422, upsample to 16 UV.
3173
#define READYUV422_AVX2                                               \
3174
  "vmovq      (%[u_buf]),%%xmm3                                   \n" \
3175
  "vmovq      0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
3176
  "lea        0x8(%[u_buf]),%[u_buf]                              \n" \
3177
  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3                                \n" \
3178
  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
3179
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                                \n" \
3180
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
3181
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
3182
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
3183
  "lea        0x10(%[y_buf]),%[y_buf]                             \n"
3184
3185
#define READYUV422_AVX512BW                                           \
3186
  "vmovdqu    (%[u_buf]),%%xmm3                                   \n" \
3187
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
3188
  "vpermq     %%zmm3,%%zmm16,%%zmm3                               \n" \
3189
  "vpermq     %%zmm1,%%zmm16,%%zmm1                               \n" \
3190
  "lea        0x10(%[u_buf]),%[u_buf]                             \n" \
3191
  "vpunpcklbw %%zmm1,%%zmm3,%%zmm3                                \n" \
3192
  "vpermq     $0xd8,%%zmm3,%%zmm3                                 \n" \
3193
  "vpunpcklwd %%zmm3,%%zmm3,%%zmm3                                \n" \
3194
  "vmovups    (%[y_buf]),%%ymm4                                   \n" \
3195
  "vpermq     %%zmm4,%%zmm17,%%zmm4                               \n" \
3196
  "vpermq     $0xd8,%%zmm4,%%zmm4                                 \n" \
3197
  "vpunpcklbw %%zmm4,%%zmm4,%%zmm4                                \n" \
3198
  "lea        0x20(%[y_buf]),%[y_buf]                             \n"
3199
3200
// Read 8 UV from 210, upsample to 16 UV
3201
// TODO(fbarchard): Consider vpshufb to replace pack/unpack
3202
// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
3203
#define READYUV210_AVX2                                            \
3204
  "vmovdqu    (%[u_buf]),%%xmm3                                \n" \
3205
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                 \n" \
3206
  "lea        0x10(%[u_buf]),%[u_buf]                          \n" \
3207
  "vpermq     $0xd8,%%ymm3,%%ymm3                              \n" \
3208
  "vpermq     $0xd8,%%ymm1,%%ymm1                              \n" \
3209
  "vpunpcklwd %%ymm1,%%ymm3,%%ymm3                             \n" \
3210
  "vpsraw     $2,%%ymm3,%%ymm3                                 \n" \
3211
  "vpackuswb  %%ymm3,%%ymm3,%%ymm3                             \n" \
3212
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                             \n" \
3213
  "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
3214
  "vpsllw     $6,%%ymm4,%%ymm2                                 \n" \
3215
  "vpsrlw     $4,%%ymm4,%%ymm4                                 \n" \
3216
  "vpaddw     %%ymm2,%%ymm4,%%ymm4                             \n" \
3217
  "lea        0x20(%[y_buf]),%[y_buf]                          \n"
3218
3219
// Read 8 UV from 210, upsample to 16 UV. With 16 Alpha.
3220
#define READYUVA210_AVX2                                           \
3221
  "vmovdqu    (%[u_buf]),%%xmm3                                \n" \
3222
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                 \n" \
3223
  "lea        0x10(%[u_buf]),%[u_buf]                          \n" \
3224
  "vpermq     $0xd8,%%ymm3,%%ymm3                              \n" \
3225
  "vpermq     $0xd8,%%ymm1,%%ymm1                              \n" \
3226
  "vpunpcklwd %%ymm1,%%ymm3,%%ymm3                             \n" \
3227
  "vpsraw     $2,%%ymm3,%%ymm3                                 \n" \
3228
  "vpackuswb  %%ymm3,%%ymm3,%%ymm3                             \n" \
3229
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                             \n" \
3230
  "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
3231
  "vpsllw     $6,%%ymm4,%%ymm2                                 \n" \
3232
  "vpsrlw     $4,%%ymm4,%%ymm4                                 \n" \
3233
  "vpaddw     %%ymm2,%%ymm4,%%ymm4                             \n" \
3234
  "lea        0x20(%[y_buf]),%[y_buf]                          \n" \
3235
  "vmovdqu    (%[a_buf]),%%ymm5                                \n" \
3236
  "vpsraw     $2,%%ymm5,%%ymm5                                 \n" \
3237
  "vpackuswb  %%ymm5,%%ymm5,%%ymm5                             \n" \
3238
  "lea        0x20(%[a_buf]),%[a_buf]                          \n"
3239
3240
// Read 16 UV from 410
3241
#define READYUV410_AVX2                                            \
3242
  "vmovdqu    (%[u_buf]),%%ymm3                                \n" \
3243
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%ymm2                 \n" \
3244
  "lea        0x20(%[u_buf]),%[u_buf]                          \n" \
3245
  "vpsraw     $2,%%ymm3,%%ymm3                                 \n" \
3246
  "vpsraw     $2,%%ymm2,%%ymm2                                 \n" \
3247
  "vpunpckhwd %%ymm2,%%ymm3,%%ymm1                             \n" \
3248
  "vpunpcklwd %%ymm2,%%ymm3,%%ymm3                             \n" \
3249
  "vpackuswb  %%ymm1,%%ymm3,%%ymm3                             \n" \
3250
  "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
3251
  "vpsllw     $6,%%ymm4,%%ymm2                                 \n" \
3252
  "vpsrlw     $4,%%ymm4,%%ymm4                                 \n" \
3253
  "vpaddw     %%ymm2,%%ymm4,%%ymm4                             \n" \
3254
  "lea        0x20(%[y_buf]),%[y_buf]                          \n"
3255
3256
// Read 8 UV from 212 12 bit, upsample to 16 UV
3257
#define READYUV212_AVX2                                            \
3258
  "vmovdqu    (%[u_buf]),%%xmm3                                \n" \
3259
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                 \n" \
3260
  "lea        0x10(%[u_buf]),%[u_buf]                          \n" \
3261
  "vpermq     $0xd8,%%ymm3,%%ymm3                              \n" \
3262
  "vpermq     $0xd8,%%ymm1,%%ymm1                              \n" \
3263
  "vpunpcklwd %%ymm1,%%ymm3,%%ymm3                             \n" \
3264
  "vpsraw     $0x4,%%ymm3,%%ymm3                               \n" \
3265
  "vpackuswb  %%ymm3,%%ymm3,%%ymm3                             \n" \
3266
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                             \n" \
3267
  "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
3268
  "vpsllw     $4,%%ymm4,%%ymm2                                 \n" \
3269
  "vpsrlw     $8,%%ymm4,%%ymm4                                 \n" \
3270
  "vpaddw     %%ymm2,%%ymm4,%%ymm4                             \n" \
3271
  "lea        0x20(%[y_buf]),%[y_buf]                          \n"
3272
3273
// Read 16 UV from 410. With 16 Alpha.
3274
#define READYUVA410_AVX2                                           \
3275
  "vmovdqu    (%[u_buf]),%%ymm3                                \n" \
3276
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%ymm2                 \n" \
3277
  "lea        0x20(%[u_buf]),%[u_buf]                          \n" \
3278
  "vpsraw     $2,%%ymm3,%%ymm3                                 \n" \
3279
  "vpsraw     $2,%%ymm2,%%ymm2                                 \n" \
3280
  "vpunpckhwd %%ymm2,%%ymm3,%%ymm1                             \n" \
3281
  "vpunpcklwd %%ymm2,%%ymm3,%%ymm3                             \n" \
3282
  "vpackuswb  %%ymm1,%%ymm3,%%ymm3                             \n" \
3283
  "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
3284
  "vpsllw     $6,%%ymm4,%%ymm2                                 \n" \
3285
  "vpsrlw     $4,%%ymm4,%%ymm4                                 \n" \
3286
  "vpaddw     %%ymm2,%%ymm4,%%ymm4                             \n" \
3287
  "lea        0x20(%[y_buf]),%[y_buf]                          \n" \
3288
  "vmovdqu    (%[a_buf]),%%ymm5                                \n" \
3289
  "vpsraw     $2,%%ymm5,%%ymm5                                 \n" \
3290
  "vpackuswb  %%ymm5,%%ymm5,%%ymm5                             \n" \
3291
  "lea        0x20(%[a_buf]),%[a_buf]                          \n"
3292
3293
// Read 16 UV from 444.  With 16 Alpha.
3294
#define READYUVA444_AVX2                                              \
3295
  "vmovdqu    (%[u_buf]),%%xmm3                                   \n" \
3296
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
3297
  "lea        0x10(%[u_buf]),%[u_buf]                             \n" \
3298
  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
3299
  "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n" \
3300
  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3                                \n" \
3301
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
3302
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
3303
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
3304
  "lea        0x10(%[y_buf]),%[y_buf]                             \n" \
3305
  "vmovdqu    (%[a_buf]),%%xmm5                                   \n" \
3306
  "vpermq     $0xd8,%%ymm5,%%ymm5                                 \n" \
3307
  "lea        0x10(%[a_buf]),%[a_buf]                             \n"
3308
3309
// Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
3310
#define READYUVA422_AVX2                                              \
3311
  "vmovq      (%[u_buf]),%%xmm3                                   \n" \
3312
  "vmovq      0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
3313
  "lea        0x8(%[u_buf]),%[u_buf]                              \n" \
3314
  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3                                \n" \
3315
  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
3316
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                                \n" \
3317
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
3318
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
3319
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
3320
  "lea        0x10(%[y_buf]),%[y_buf]                             \n" \
3321
  "vmovdqu    (%[a_buf]),%%xmm5                                   \n" \
3322
  "vpermq     $0xd8,%%ymm5,%%ymm5                                 \n" \
3323
  "lea        0x10(%[a_buf]),%[a_buf]                             \n"
3324
3325
// Read 8 UV from NV12, upsample to 16 UV.
3326
#define READNV12_AVX2                                                 \
3327
  "vmovdqu    (%[uv_buf]),%%xmm3                                  \n" \
3328
  "lea        0x10(%[uv_buf]),%[uv_buf]                           \n" \
3329
  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
3330
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                                \n" \
3331
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
3332
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
3333
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
3334
  "lea        0x10(%[y_buf]),%[y_buf]                             \n"
3335
3336
// Read 8 VU from NV21, upsample to 16 UV.
3337
#define READNV21_AVX2                                                 \
3338
  "vmovdqu    (%[vu_buf]),%%xmm3                                  \n" \
3339
  "lea        0x10(%[vu_buf]),%[vu_buf]                           \n" \
3340
  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
3341
  "vpshufb     %[kShuffleNV21], %%ymm3, %%ymm3                    \n" \
3342
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
3343
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
3344
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
3345
  "lea        0x10(%[y_buf]),%[y_buf]                             \n"
3346
3347
// Read 4 UV from P210, upsample to 8 UV
3348
#define READP210_AVX2                                                 \
3349
  "vmovdqu    (%[uv_buf]),%%ymm3                                  \n" \
3350
  "lea        0x20(%[uv_buf]),%[uv_buf]                           \n" \
3351
  "vpsrlw     $0x8,%%ymm3,%%ymm3                                  \n" \
3352
  "vpackuswb  %%ymm3,%%ymm3,%%ymm3                                \n" \
3353
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                                \n" \
3354
  "vmovdqu    (%[y_buf]),%%ymm4                                   \n" \
3355
  "lea        0x20(%[y_buf]),%[y_buf]                             \n"
3356
3357
// Read 8 UV from P410
3358
#define READP410_AVX2                                                 \
3359
  "vmovdqu    (%[uv_buf]),%%ymm3                                  \n" \
3360
  "vmovdqu    0x20(%[uv_buf]),%%ymm1                              \n" \
3361
  "lea        0x40(%[uv_buf]),%[uv_buf]                           \n" \
3362
  "vpsrlw     $0x8,%%ymm3,%%ymm3                                  \n" \
3363
  "vpsrlw     $0x8,%%ymm1,%%ymm1                                  \n" \
3364
  "vpackuswb  %%ymm1,%%ymm3,%%ymm3                                \n" \
3365
  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
3366
  "vmovdqu    (%[y_buf]),%%ymm4                                   \n" \
3367
  "lea        0x20(%[y_buf]),%[y_buf]                             \n"
3368
3369
// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
3370
// ymm6 kShuffleYUY2Y,
3371
// ymm7 kShuffleYUY2UV
3372
#define READYUY2_AVX2                                                 \
3373
  "vmovdqu    (%[yuy2_buf]),%%ymm1                                \n" \
3374
  "vpshufb    %%ymm6,%%ymm1,%%ymm4                                \n" \
3375
  "vpshufb    %%ymm7,%%ymm1,%%ymm3                                \n" \
3376
  "lea        0x20(%[yuy2_buf]),%[yuy2_buf]                       \n"
3377
3378
// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
3379
// ymm6 kShuffleUYVYY,
3380
// ymm7 kShuffleUYVYUV
3381
#define READUYVY_AVX2                                                 \
3382
  "vmovdqu    (%[uyvy_buf]),%%ymm1                                \n" \
3383
  "vpshufb    %%ymm6,%%ymm1,%%ymm4                                \n" \
3384
  "vpshufb    %%ymm7,%%ymm1,%%ymm3                                \n" \
3385
  "lea        0x20(%[uyvy_buf]),%[uyvy_buf]                       \n"
3386
3387
// TODO(fbarchard): Remove broadcastb
3388
#if defined(__x86_64__)
3389
#define YUVTORGB_SETUP_AVX2(yuvconstants)                             \
3390
  "vpcmpeqb    %%xmm13,%%xmm13,%%xmm13                            \n" \
3391
  "vmovdqa     (%[yuvconstants]),%%ymm8                           \n" \
3392
  "vpsllw      $7,%%xmm13,%%xmm13                                 \n" \
3393
  "vmovdqa     32(%[yuvconstants]),%%ymm9                         \n" \
3394
  "vpbroadcastb %%xmm13,%%ymm13                                   \n" \
3395
  "vmovdqa     64(%[yuvconstants]),%%ymm10                        \n" \
3396
  "vmovdqa     96(%[yuvconstants]),%%ymm11                        \n" \
3397
  "vmovdqa     128(%[yuvconstants]),%%ymm12                       \n"
3398
3399
#define YUVTORGB_SETUP_AVX512BW(yuvconstants)                         \
3400
  "vpcmpeqb   %%xmm13,%%xmm13,%%xmm13                             \n" \
3401
  "movdqa     (%[yuvconstants]),%%xmm8                            \n" \
3402
  "vpbroadcastq %%xmm8, %%zmm8                                    \n" \
3403
  "vpsllw     $7,%%xmm13,%%xmm13                                  \n" \
3404
  "vpbroadcastb %%xmm13,%%zmm13                                   \n" \
3405
  "movq       32(%[yuvconstants]),%%xmm9                          \n" \
3406
  "vpbroadcastq %%xmm9,%%zmm9                                     \n" \
3407
  "movq       64(%[yuvconstants]),%%xmm10                         \n" \
3408
  "vpbroadcastq %%xmm10,%%zmm10                                   \n" \
3409
  "movq       96(%[yuvconstants]),%%xmm11                         \n" \
3410
  "vpbroadcastq %%xmm11,%%zmm11                                   \n" \
3411
  "movq       128(%[yuvconstants]),%%xmm12                        \n" \
3412
  "vpbroadcastq %%xmm12,%%zmm12                                   \n" \
3413
  "vmovups    (%[quadsplitperm]),%%zmm16                          \n" \
3414
  "vmovups    (%[dquadsplitperm]),%%zmm17                         \n" \
3415
  "vmovups    (%[unperm]),%%zmm18                                 \n"
3416
3417
#define YUVTORGB16_AVX2(yuvconstants)                                 \
3418
  "vpsubb      %%ymm13,%%ymm3,%%ymm3                              \n" \
3419
  "vpmulhuw    %%ymm11,%%ymm4,%%ymm4                              \n" \
3420
  "vpmaddubsw  %%ymm3,%%ymm8,%%ymm0                               \n" \
3421
  "vpmaddubsw  %%ymm3,%%ymm9,%%ymm1                               \n" \
3422
  "vpmaddubsw  %%ymm3,%%ymm10,%%ymm2                              \n" \
3423
  "vpaddw      %%ymm4,%%ymm12,%%ymm4                              \n" \
3424
  "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n" \
3425
  "vpsubsw     %%ymm1,%%ymm4,%%ymm1                               \n" \
3426
  "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"
3427
3428
#define YUVTORGB16_AVX512BW(yuvconstants)                             \
3429
  "vpsubb      %%zmm13,%%zmm3,%%zmm3                              \n" \
3430
  "vpmulhuw    %%zmm11,%%zmm4,%%zmm4                              \n" \
3431
  "vpmaddubsw  %%zmm3,%%zmm8,%%zmm0                               \n" \
3432
  "vpmaddubsw  %%zmm3,%%zmm9,%%zmm1                               \n" \
3433
  "vpmaddubsw  %%zmm3,%%zmm10,%%zmm2                              \n" \
3434
  "vpaddw      %%zmm4,%%zmm12,%%zmm4                              \n" \
3435
  "vpaddsw     %%zmm4,%%zmm0,%%zmm0                               \n" \
3436
  "vpsubsw     %%zmm1,%%zmm4,%%zmm1                               \n" \
3437
  "vpaddsw     %%zmm4,%%zmm2,%%zmm2                               \n"
3438
3439
#define YUVTORGB_REGS_AVX2 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
3440
#define YUVTORGB_REGS_AVX512BW \
3441
  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm16", "xmm17", "xmm18",
3442
3443
#else  // Convert 16 pixels: 16 UV and 16 Y.
3444
3445
#define YUVTORGB_SETUP_AVX2(yuvconstants)
3446
#define YUVTORGB16_AVX2(yuvconstants)                                 \
3447
  "vpcmpeqb    %%xmm0,%%xmm0,%%xmm0                               \n" \
3448
  "vpsllw      $7,%%xmm0,%%xmm0                                   \n" \
3449
  "vpbroadcastb %%xmm0,%%ymm0                                     \n" \
3450
  "vpsubb      %%ymm0,%%ymm3,%%ymm3                               \n" \
3451
  "vpmulhuw    96(%[yuvconstants]),%%ymm4,%%ymm4                  \n" \
3452
  "vmovdqa     (%[yuvconstants]),%%ymm0                           \n" \
3453
  "vmovdqa     32(%[yuvconstants]),%%ymm1                         \n" \
3454
  "vmovdqa     64(%[yuvconstants]),%%ymm2                         \n" \
3455
  "vpmaddubsw  %%ymm3,%%ymm0,%%ymm0                               \n" \
3456
  "vpmaddubsw  %%ymm3,%%ymm1,%%ymm1                               \n" \
3457
  "vpmaddubsw  %%ymm3,%%ymm2,%%ymm2                               \n" \
3458
  "vmovdqa     128(%[yuvconstants]),%%ymm3                        \n" \
3459
  "vpaddw      %%ymm4,%%ymm3,%%ymm4                               \n" \
3460
  "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n" \
3461
  "vpsubsw     %%ymm1,%%ymm4,%%ymm1                               \n" \
3462
  "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"
3463
3464
#define YUVTORGB_REGS_AVX2
3465
#endif
3466
3467
#define YUVTORGB_AVX2(yuvconstants)                                   \
3468
  YUVTORGB16_AVX2(yuvconstants)                                       \
3469
  "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n" \
3470
  "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n" \
3471
  "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n" \
3472
  "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n" \
3473
  "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n" \
3474
  "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
3475
3476
#define YUVTORGB_AVX512BW(yuvconstants)                               \
3477
  YUVTORGB16_AVX512BW(yuvconstants)                                   \
3478
  "vpsraw     $0x6,%%zmm0,%%zmm0                                  \n" \
3479
  "vpsraw     $0x6,%%zmm1,%%zmm1                                  \n" \
3480
  "vpsraw     $0x6,%%zmm2,%%zmm2                                  \n" \
3481
  "vpackuswb  %%zmm0,%%zmm0,%%zmm0                                \n" \
3482
  "vpackuswb  %%zmm1,%%zmm1,%%zmm1                                \n" \
3483
  "vpackuswb  %%zmm2,%%zmm2,%%zmm2                                \n"
3484
3485
// Store 16 ARGB values.
3486
#define STOREARGB_AVX2                                                \
3487
  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
3488
  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
3489
  "vpunpcklbw %%ymm5,%%ymm2,%%ymm2                                \n" \
3490
  "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n" \
3491
  "vpunpcklwd %%ymm2,%%ymm0,%%ymm1                                \n" \
3492
  "vpunpckhwd %%ymm2,%%ymm0,%%ymm0                                \n" \
3493
  "vmovdqu    %%ymm1,(%[dst_argb])                                \n" \
3494
  "vmovdqu    %%ymm0,0x20(%[dst_argb])                            \n" \
3495
  "lea        0x40(%[dst_argb]), %[dst_argb]                      \n"
3496
3497
// Store 32 ARGB values.
3498
#define STOREARGB_AVX512BW                                            \
3499
  "vpunpcklbw %%zmm1,%%zmm0,%%zmm0                                \n" \
3500
  "vpermq     %%zmm0,%%zmm18,%%zmm0                               \n" \
3501
  "vpunpcklbw %%zmm5,%%zmm2,%%zmm2                                \n" \
3502
  "vpermq     %%zmm2,%%zmm18,%%zmm2                               \n" \
3503
  "vpunpcklwd %%zmm2,%%zmm0,%%zmm1                                \n" \
3504
  "vpunpckhwd %%zmm2,%%zmm0,%%zmm0                                \n" \
3505
  "vmovups    %%zmm1,(%[dst_argb])                                \n" \
3506
  "vmovups    %%zmm0,0x40(%[dst_argb])                            \n" \
3507
  "lea        0x80(%[dst_argb]), %[dst_argb]                      \n"
3508
3509
// Store 16 AR30 values.
3510
#define STOREAR30_AVX2                                                \
3511
  "vpsraw     $0x4,%%ymm0,%%ymm0                                  \n" \
3512
  "vpsraw     $0x4,%%ymm1,%%ymm1                                  \n" \
3513
  "vpsraw     $0x4,%%ymm2,%%ymm2                                  \n" \
3514
  "vpminsw    %%ymm7,%%ymm0,%%ymm0                                \n" \
3515
  "vpminsw    %%ymm7,%%ymm1,%%ymm1                                \n" \
3516
  "vpminsw    %%ymm7,%%ymm2,%%ymm2                                \n" \
3517
  "vpmaxsw    %%ymm6,%%ymm0,%%ymm0                                \n" \
3518
  "vpmaxsw    %%ymm6,%%ymm1,%%ymm1                                \n" \
3519
  "vpmaxsw    %%ymm6,%%ymm2,%%ymm2                                \n" \
3520
  "vpsllw     $0x4,%%ymm2,%%ymm2                                  \n" \
3521
  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
3522
  "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n" \
3523
  "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n" \
3524
  "vpunpckhwd %%ymm2,%%ymm0,%%ymm3                                \n" \
3525
  "vpunpcklwd %%ymm2,%%ymm0,%%ymm0                                \n" \
3526
  "vpunpckhwd %%ymm5,%%ymm1,%%ymm2                                \n" \
3527
  "vpunpcklwd %%ymm5,%%ymm1,%%ymm1                                \n" \
3528
  "vpslld     $0xa,%%ymm1,%%ymm1                                  \n" \
3529
  "vpslld     $0xa,%%ymm2,%%ymm2                                  \n" \
3530
  "vpor       %%ymm1,%%ymm0,%%ymm0                                \n" \
3531
  "vpor       %%ymm2,%%ymm3,%%ymm3                                \n" \
3532
  "vmovdqu    %%ymm0,(%[dst_ar30])                                \n" \
3533
  "vmovdqu    %%ymm3,0x20(%[dst_ar30])                            \n" \
3534
  "lea        0x40(%[dst_ar30]), %[dst_ar30]                      \n"
3535
3536
#ifdef HAS_I444TOARGBROW_AVX2
3537
// 16 pixels
3538
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
3539
void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
3540
                               const uint8_t* u_buf,
3541
                               const uint8_t* v_buf,
3542
                               uint8_t* dst_argb,
3543
                               const struct YuvConstants* yuvconstants,
3544
21.5k
                               int width) {
3545
21.5k
  asm volatile (
3546
21.5k
    YUVTORGB_SETUP_AVX2(yuvconstants)
3547
21.5k
      "sub         %[u_buf],%[v_buf]             \n"
3548
21.5k
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
3549
3550
21.5k
    LABELALIGN
3551
21.5k
      "1:          \n"
3552
21.5k
    READYUV444_AVX2
3553
21.5k
    YUVTORGB_AVX2(yuvconstants)
3554
21.5k
    STOREARGB_AVX2
3555
21.5k
      "sub         $0x10,%[width]                \n"
3556
21.5k
      "jg          1b                            \n"
3557
21.5k
      "vzeroupper  \n"
3558
21.5k
  : [y_buf]"+r"(y_buf),    // %[y_buf]
3559
21.5k
    [u_buf]"+r"(u_buf),    // %[u_buf]
3560
21.5k
    [v_buf]"+r"(v_buf),    // %[v_buf]
3561
21.5k
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
3562
21.5k
    [width]"+rm"(width)    // %[width]
3563
21.5k
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
3564
21.5k
  : "memory", "cc", YUVTORGB_REGS_AVX2
3565
21.5k
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3566
21.5k
  );
3567
21.5k
}
3568
#endif  // HAS_I444TOARGBROW_AVX2
3569
3570
#if defined(HAS_I422TOARGBROW_AVX2)
3571
// 16 pixels
3572
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
3573
void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
3574
                               const uint8_t* u_buf,
3575
                               const uint8_t* v_buf,
3576
                               uint8_t* dst_argb,
3577
                               const struct YuvConstants* yuvconstants,
3578
13.3k
                               int width) {
3579
13.3k
  asm volatile (
3580
13.3k
    YUVTORGB_SETUP_AVX2(yuvconstants)
3581
13.3k
      "sub         %[u_buf],%[v_buf]             \n"
3582
13.3k
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
3583
3584
13.3k
    LABELALIGN
3585
13.3k
      "1:          \n"
3586
13.3k
    READYUV422_AVX2
3587
13.3k
    YUVTORGB_AVX2(yuvconstants)
3588
13.3k
    STOREARGB_AVX2
3589
13.3k
      "sub         $0x10,%[width]                \n"
3590
13.3k
      "jg          1b                            \n"
3591
3592
13.3k
      "vzeroupper  \n"
3593
13.3k
  : [y_buf]"+r"(y_buf),    // %[y_buf]
3594
13.3k
    [u_buf]"+r"(u_buf),    // %[u_buf]
3595
13.3k
    [v_buf]"+r"(v_buf),    // %[v_buf]
3596
13.3k
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
3597
13.3k
    [width]"+rm"(width)    // %[width]
3598
13.3k
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
3599
13.3k
  : "memory", "cc", YUVTORGB_REGS_AVX2
3600
13.3k
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3601
13.3k
  );
3602
13.3k
}
3603
#endif  // HAS_I422TOARGBROW_AVX2
3604
3605
#if defined(HAS_I422TOARGBROW_AVX512BW)
3606
static const uint64_t kSplitQuadWords[8] = {0, 2, 2, 2, 1, 2, 2, 2};
3607
static const uint64_t kSplitDoubleQuadWords[8] = {0, 1, 4, 4, 2, 3, 4, 4};
3608
static const uint64_t kUnpermuteAVX512[8] = {0, 4, 1, 5, 2, 6, 3, 7};
3609
3610
// 32 pixels
3611
// 16 UV values upsampled to 32 UV, mixed with 32 Y producing 32 ARGB (128
3612
// bytes).
3613
void OMITFP I422ToARGBRow_AVX512BW(const uint8_t* y_buf,
3614
                                   const uint8_t* u_buf,
3615
                                   const uint8_t* v_buf,
3616
                                   uint8_t* dst_argb,
3617
                                   const struct YuvConstants* yuvconstants,
3618
0
                                   int width) {
3619
0
  asm volatile (
3620
0
    YUVTORGB_SETUP_AVX512BW(yuvconstants)
3621
0
      "sub         %[u_buf],%[v_buf]             \n"
3622
0
      "vpcmpeqb    %%xmm5,%%xmm5,%%xmm5          \n"
3623
0
      "vpbroadcastq %%xmm5,%%zmm5                \n"
3624
3625
0
    LABELALIGN
3626
0
      "1:          \n"
3627
0
    READYUV422_AVX512BW
3628
0
    YUVTORGB_AVX512BW(yuvconstants)
3629
0
    STOREARGB_AVX512BW
3630
0
      "sub         $0x20,%[width]                \n"
3631
0
      "jg          1b                            \n"
3632
3633
0
      "vzeroupper  \n"
3634
0
  : [y_buf]"+r"(y_buf),                         // %[y_buf]
3635
0
    [u_buf]"+r"(u_buf),                         // %[u_buf]
3636
0
    [v_buf]"+r"(v_buf),                         // %[v_buf]
3637
0
    [dst_argb]"+r"(dst_argb),                   // %[dst_argb]
3638
0
    [width]"+rm"(width)                         // %[width]
3639
0
  : [yuvconstants]"r"(yuvconstants),            // %[yuvconstants]
3640
0
    [quadsplitperm]"r"(kSplitQuadWords),        // %[quadsplitperm]
3641
0
    [dquadsplitperm]"r"(kSplitDoubleQuadWords), // %[dquadsplitperm]
3642
0
    [unperm]"r"(kUnpermuteAVX512)               // %[unperm]
3643
0
  : "memory", "cc", YUVTORGB_REGS_AVX512BW
3644
0
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3645
0
  );
3646
0
}
3647
#endif  // HAS_I422TOARGBROW_AVX512BW
3648
3649
#if defined(HAS_I422TOAR30ROW_AVX2)
3650
// 16 pixels
3651
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
3652
void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
3653
                               const uint8_t* u_buf,
3654
                               const uint8_t* v_buf,
3655
                               uint8_t* dst_ar30,
3656
                               const struct YuvConstants* yuvconstants,
3657
0
                               int width) {
3658
0
  asm volatile (
3659
0
    YUVTORGB_SETUP_AVX2(yuvconstants)
3660
0
      "sub         %[u_buf],%[v_buf]             \n"
3661
0
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
3662
0
      "vpsrlw      $14,%%ymm5,%%ymm5             \n"
3663
0
      "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
3664
0
      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
3665
0
      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
3666
0
      "vpsrlw      $6,%%ymm7,%%ymm7              \n"
3667
3668
0
    LABELALIGN
3669
0
      "1:          \n"
3670
0
    READYUV422_AVX2
3671
0
    YUVTORGB16_AVX2(yuvconstants)
3672
0
    STOREAR30_AVX2
3673
0
      "sub         $0x10,%[width]                \n"
3674
0
      "jg          1b                            \n"
3675
3676
0
      "vzeroupper  \n"
3677
0
  : [y_buf]"+r"(y_buf),    // %[y_buf]
3678
0
    [u_buf]"+r"(u_buf),    // %[u_buf]
3679
0
    [v_buf]"+r"(v_buf),    // %[v_buf]
3680
0
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
3681
0
    [width]"+rm"(width)    // %[width]
3682
0
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
3683
0
  : "memory", "cc", YUVTORGB_REGS_AVX2
3684
0
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3685
0
  );
3686
0
}
3687
#endif  // HAS_I422TOAR30ROW_AVX2
3688
3689
#if defined(HAS_I210TOARGBROW_AVX2)
3690
// 16 pixels
3691
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
3692
void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
3693
                               const uint16_t* u_buf,
3694
                               const uint16_t* v_buf,
3695
                               uint8_t* dst_argb,
3696
                               const struct YuvConstants* yuvconstants,
3697
4.39k
                               int width) {
3698
4.39k
  asm volatile (
3699
4.39k
    YUVTORGB_SETUP_AVX2(yuvconstants)
3700
4.39k
      "sub         %[u_buf],%[v_buf]             \n"
3701
4.39k
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
3702
3703
4.39k
    LABELALIGN
3704
4.39k
      "1:          \n"
3705
4.39k
    READYUV210_AVX2
3706
4.39k
    YUVTORGB_AVX2(yuvconstants)
3707
4.39k
    STOREARGB_AVX2
3708
4.39k
      "sub         $0x10,%[width]                \n"
3709
4.39k
      "jg          1b                            \n"
3710
3711
4.39k
      "vzeroupper  \n"
3712
4.39k
  : [y_buf]"+r"(y_buf),    // %[y_buf]
3713
4.39k
    [u_buf]"+r"(u_buf),    // %[u_buf]
3714
4.39k
    [v_buf]"+r"(v_buf),    // %[v_buf]
3715
4.39k
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
3716
4.39k
    [width]"+rm"(width)    // %[width]
3717
4.39k
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
3718
4.39k
  : "memory", "cc", YUVTORGB_REGS_AVX2
3719
4.39k
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3720
4.39k
  );
3721
4.39k
}
3722
#endif  // HAS_I210TOARGBROW_AVX2
3723
3724
#if defined(HAS_I212TOARGBROW_AVX2)
3725
// 16 pixels
3726
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
3727
void OMITFP I212ToARGBRow_AVX2(const uint16_t* y_buf,
3728
                               const uint16_t* u_buf,
3729
                               const uint16_t* v_buf,
3730
                               uint8_t* dst_argb,
3731
                               const struct YuvConstants* yuvconstants,
3732
5.16k
                               int width) {
3733
5.16k
  asm volatile (
3734
5.16k
    YUVTORGB_SETUP_AVX2(yuvconstants)
3735
5.16k
      "sub         %[u_buf],%[v_buf]             \n"
3736
5.16k
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
3737
3738
5.16k
    LABELALIGN
3739
5.16k
      "1:          \n"
3740
5.16k
    READYUV212_AVX2
3741
5.16k
    YUVTORGB_AVX2(yuvconstants)
3742
5.16k
    STOREARGB_AVX2
3743
5.16k
      "sub         $0x10,%[width]                \n"
3744
5.16k
      "jg          1b                            \n"
3745
3746
5.16k
      "vzeroupper  \n"
3747
5.16k
  : [y_buf]"+r"(y_buf),    // %[y_buf]
3748
5.16k
    [u_buf]"+r"(u_buf),    // %[u_buf]
3749
5.16k
    [v_buf]"+r"(v_buf),    // %[v_buf]
3750
5.16k
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
3751
5.16k
    [width]"+rm"(width)    // %[width]
3752
5.16k
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
3753
5.16k
  : "memory", "cc", YUVTORGB_REGS_AVX2
3754
5.16k
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3755
5.16k
  );
3756
5.16k
}
3757
#endif  // HAS_I212TOARGBROW_AVX2
3758
3759
#if defined(HAS_I210TOAR30ROW_AVX2)
3760
// 16 pixels
3761
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
3762
void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
3763
                               const uint16_t* u_buf,
3764
                               const uint16_t* v_buf,
3765
                               uint8_t* dst_ar30,
3766
                               const struct YuvConstants* yuvconstants,
3767
0
                               int width) {
3768
0
  asm volatile (
3769
0
    YUVTORGB_SETUP_AVX2(yuvconstants)
3770
0
      "sub         %[u_buf],%[v_buf]             \n"
3771
0
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
3772
0
      "vpsrlw      $14,%%ymm5,%%ymm5             \n"
3773
0
      "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
3774
0
      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
3775
0
      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
3776
0
      "vpsrlw      $6,%%ymm7,%%ymm7              \n"
3777
3778
0
    LABELALIGN
3779
0
      "1:          \n"
3780
0
    READYUV210_AVX2
3781
0
    YUVTORGB16_AVX2(yuvconstants)
3782
0
    STOREAR30_AVX2
3783
0
      "sub         $0x10,%[width]                \n"
3784
0
      "jg          1b                            \n"
3785
3786
0
      "vzeroupper  \n"
3787
0
  : [y_buf]"+r"(y_buf),    // %[y_buf]
3788
0
    [u_buf]"+r"(u_buf),    // %[u_buf]
3789
0
    [v_buf]"+r"(v_buf),    // %[v_buf]
3790
0
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
3791
0
    [width]"+rm"(width)    // %[width]
3792
0
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
3793
0
  : "memory", "cc", YUVTORGB_REGS_AVX2
3794
0
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3795
0
  );
3796
0
}
3797
#endif  // HAS_I210TOAR30ROW_AVX2
3798
3799
#if defined(HAS_I212TOAR30ROW_AVX2)
3800
// 16 pixels
3801
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
3802
void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf,
3803
                               const uint16_t* u_buf,
3804
                               const uint16_t* v_buf,
3805
                               uint8_t* dst_ar30,
3806
                               const struct YuvConstants* yuvconstants,
3807
0
                               int width) {
3808
0
  asm volatile (
3809
0
    YUVTORGB_SETUP_AVX2(yuvconstants)
3810
0
      "sub         %[u_buf],%[v_buf]             \n"
3811
0
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
3812
0
      "vpsrlw      $14,%%ymm5,%%ymm5             \n"
3813
0
      "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
3814
0
      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
3815
0
      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
3816
0
      "vpsrlw      $6,%%ymm7,%%ymm7              \n"
3817
3818
0
    LABELALIGN
3819
0
      "1:          \n"
3820
0
    READYUV212_AVX2
3821
0
    YUVTORGB16_AVX2(yuvconstants)
3822
0
    STOREAR30_AVX2
3823
0
      "sub         $0x10,%[width]                \n"
3824
0
      "jg          1b                            \n"
3825
3826
0
      "vzeroupper  \n"
3827
0
  : [y_buf]"+r"(y_buf),    // %[y_buf]
3828
0
    [u_buf]"+r"(u_buf),    // %[u_buf]
3829
0
    [v_buf]"+r"(v_buf),    // %[v_buf]
3830
0
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
3831
0
    [width]"+rm"(width)    // %[width]
3832
0
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
3833
0
  : "memory", "cc", YUVTORGB_REGS_AVX2
3834
0
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3835
0
  );
3836
0
}
3837
#endif  // HAS_I212TOAR30ROW_AVX2
3838
3839
#if defined(HAS_I410TOARGBROW_AVX2)
3840
// 16 pixels
3841
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
3842
void OMITFP I410ToARGBRow_AVX2(const uint16_t* y_buf,
3843
                               const uint16_t* u_buf,
3844
                               const uint16_t* v_buf,
3845
                               uint8_t* dst_argb,
3846
                               const struct YuvConstants* yuvconstants,
3847
7.69k
                               int width) {
3848
7.69k
  asm volatile (
3849
7.69k
    YUVTORGB_SETUP_AVX2(yuvconstants)
3850
7.69k
      "sub         %[u_buf],%[v_buf]             \n"
3851
7.69k
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
3852
3853
7.69k
    LABELALIGN
3854
7.69k
      "1:          \n"
3855
7.69k
    READYUV410_AVX2
3856
7.69k
    YUVTORGB_AVX2(yuvconstants)
3857
7.69k
    STOREARGB_AVX2
3858
7.69k
      "sub         $0x10,%[width]                \n"
3859
7.69k
      "jg          1b                            \n"
3860
7.69k
      "vzeroupper  \n"
3861
3862
7.69k
  : [y_buf]"+r"(y_buf),    // %[y_buf]
3863
7.69k
    [u_buf]"+r"(u_buf),    // %[u_buf]
3864
7.69k
    [v_buf]"+r"(v_buf),    // %[v_buf]
3865
7.69k
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
3866
7.69k
    [width]"+rm"(width)    // %[width]
3867
7.69k
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
3868
7.69k
  : "memory", "cc", YUVTORGB_REGS_AVX2
3869
7.69k
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3870
7.69k
  );
3871
7.69k
}
3872
#endif  // HAS_I410TOARGBROW_AVX2
3873
3874
#if defined(HAS_I210ALPHATOARGBROW_AVX2)
3875
// 16 pixels
3876
// 8 UV, 16 Y and 16 A producing 16 ARGB (64 bytes).
3877
void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf,
3878
                                    const uint16_t* u_buf,
3879
                                    const uint16_t* v_buf,
3880
                                    const uint16_t* a_buf,
3881
                                    uint8_t* dst_argb,
3882
                                    const struct YuvConstants* yuvconstants,
3883
3.89k
                                    int width) {
3884
3.89k
  asm volatile(YUVTORGB_SETUP_AVX2(
3885
3.89k
      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
3886
3887
3.89k
      LABELALIGN "1:          \n" READYUVA210_AVX2 YUVTORGB_AVX2(
3888
3.89k
                   yuvconstants) STOREARGB_AVX2
3889
3.89k
      "subl        $0x10,%[width]                \n"
3890
3.89k
      "jg          1b                            \n"
3891
3.89k
      "vzeroupper  \n"
3892
3893
3.89k
               : [y_buf] "+r"(y_buf),        // %[y_buf]
3894
3.89k
                 [u_buf] "+r"(u_buf),        // %[u_buf]
3895
3.89k
                 [v_buf] "+r"(v_buf),        // %[v_buf]
3896
3.89k
                 [a_buf] "+r"(a_buf),        // %[a_buf]
3897
3.89k
                 [dst_argb] "+r"(dst_argb),  // %[dst_argb]
3898
#if defined(__i386__)
3899
                 [width] "+m"(width)  // %[width]
3900
#else
3901
3.89k
                 [width] "+rm"(width)  // %[width]
3902
3.89k
#endif
3903
3.89k
               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
3904
3.89k
               : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2",
3905
3.89k
                 "xmm3", "xmm4", "xmm5");
3906
3.89k
}
3907
#endif  // HAS_I210TOARGBROW_AVX2
3908
3909
#if defined(HAS_I410ALPHATOARGBROW_AVX2)
3910
// 16 pixels
3911
// 16 UV, 16 Y and 16 A producing 16 ARGB (64 bytes).
3912
void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf,
3913
                                    const uint16_t* u_buf,
3914
                                    const uint16_t* v_buf,
3915
                                    const uint16_t* a_buf,
3916
                                    uint8_t* dst_argb,
3917
                                    const struct YuvConstants* yuvconstants,
3918
7.35k
                                    int width) {
3919
7.35k
  asm volatile(YUVTORGB_SETUP_AVX2(
3920
7.35k
      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
3921
3922
7.35k
      LABELALIGN "1:          \n" READYUVA410_AVX2 YUVTORGB_AVX2(
3923
7.35k
                   yuvconstants) STOREARGB_AVX2
3924
7.35k
      "subl        $0x10,%[width]                \n"
3925
7.35k
      "jg          1b                            \n"
3926
7.35k
      "vzeroupper  \n"
3927
3928
7.35k
               : [y_buf] "+r"(y_buf),        // %[y_buf]
3929
7.35k
                 [u_buf] "+r"(u_buf),        // %[u_buf]
3930
7.35k
                 [v_buf] "+r"(v_buf),        // %[v_buf]
3931
7.35k
                 [a_buf] "+r"(a_buf),        // %[a_buf]
3932
7.35k
                 [dst_argb] "+r"(dst_argb),  // %[dst_argb]
3933
#if defined(__i386__)
3934
                 [width] "+m"(width)  // %[width]
3935
#else
3936
7.35k
                 [width] "+rm"(width)  // %[width]
3937
7.35k
#endif
3938
7.35k
               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
3939
7.35k
               : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2",
3940
7.35k
                 "xmm3", "xmm4", "xmm5");
3941
7.35k
}
3942
#endif  // HAS_I410TOARGBROW_AVX2
3943
3944
#if defined(HAS_I410TOAR30ROW_AVX2)
3945
// 16 pixels
3946
// 16 UV values with 16 Y producing 16 AR30 (64 bytes).
3947
void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf,
3948
                               const uint16_t* u_buf,
3949
                               const uint16_t* v_buf,
3950
                               uint8_t* dst_ar30,
3951
                               const struct YuvConstants* yuvconstants,
3952
0
                               int width) {
3953
0
  asm volatile (
3954
0
    YUVTORGB_SETUP_AVX2(yuvconstants)
3955
0
      "sub         %[u_buf],%[v_buf]             \n"
3956
0
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
3957
0
      "vpsrlw      $14,%%ymm5,%%ymm5             \n"
3958
0
      "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
3959
0
      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
3960
0
      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
3961
0
      "vpsrlw      $6,%%ymm7,%%ymm7              \n"
3962
3963
0
    LABELALIGN
3964
0
      "1:          \n"
3965
0
    READYUV410_AVX2
3966
0
    YUVTORGB16_AVX2(yuvconstants)
3967
0
    STOREAR30_AVX2
3968
0
      "sub         $0x10,%[width]                \n"
3969
0
      "jg          1b                            \n"
3970
3971
0
      "vzeroupper  \n"
3972
0
  : [y_buf]"+r"(y_buf),    // %[y_buf]
3973
0
    [u_buf]"+r"(u_buf),    // %[u_buf]
3974
0
    [v_buf]"+r"(v_buf),    // %[v_buf]
3975
0
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
3976
0
    [width]"+rm"(width)    // %[width]
3977
0
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
3978
0
  : "memory", "cc", YUVTORGB_REGS_AVX2
3979
0
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3980
0
  );
3981
0
}
3982
#endif  // HAS_I410TOAR30ROW_AVX2
3983
3984
#if defined(HAS_I444ALPHATOARGBROW_AVX2)
3985
// 16 pixels
3986
// 16 UV values with 16 Y and 16 A producing 16 ARGB.
3987
void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf,
3988
                                    const uint8_t* u_buf,
3989
                                    const uint8_t* v_buf,
3990
                                    const uint8_t* a_buf,
3991
                                    uint8_t* dst_argb,
3992
                                    const struct YuvConstants* yuvconstants,
3993
7.54k
                                    int width) {
3994
7.54k
  asm volatile(YUVTORGB_SETUP_AVX2(
3995
7.54k
      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
3996
3997
7.54k
      LABELALIGN "1:          \n" READYUVA444_AVX2 YUVTORGB_AVX2(
3998
7.54k
                   yuvconstants) STOREARGB_AVX2
3999
7.54k
      "subl        $0x10,%[width]                \n"
4000
7.54k
      "jg          1b                            \n"
4001
7.54k
      "vzeroupper  \n"
4002
7.54k
               : [y_buf] "+r"(y_buf),        // %[y_buf]
4003
7.54k
                 [u_buf] "+r"(u_buf),        // %[u_buf]
4004
7.54k
                 [v_buf] "+r"(v_buf),        // %[v_buf]
4005
7.54k
                 [a_buf] "+r"(a_buf),        // %[a_buf]
4006
7.54k
                 [dst_argb] "+r"(dst_argb),  // %[dst_argb]
4007
#if defined(__i386__)
4008
                 [width] "+m"(width)  // %[width]
4009
#else
4010
7.54k
                 [width] "+rm"(width)  // %[width]
4011
7.54k
#endif
4012
7.54k
               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
4013
7.54k
               : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2",
4014
7.54k
                 "xmm3", "xmm4", "xmm5");
4015
7.54k
}
4016
#endif  // HAS_I444ALPHATOARGBROW_AVX2
4017
4018
#if defined(HAS_I422ALPHATOARGBROW_AVX2)
4019
// 16 pixels
4020
// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
4021
void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
4022
                                    const uint8_t* u_buf,
4023
                                    const uint8_t* v_buf,
4024
                                    const uint8_t* a_buf,
4025
                                    uint8_t* dst_argb,
4026
                                    const struct YuvConstants* yuvconstants,
4027
3.54k
                                    int width) {
4028
3.54k
  asm volatile(YUVTORGB_SETUP_AVX2(
4029
3.54k
      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
4030
4031
3.54k
      LABELALIGN "1:          \n" READYUVA422_AVX2 YUVTORGB_AVX2(
4032
3.54k
                   yuvconstants) STOREARGB_AVX2
4033
3.54k
      "subl        $0x10,%[width]                \n"
4034
3.54k
      "jg          1b                            \n"
4035
3.54k
      "vzeroupper  \n"
4036
3.54k
               : [y_buf] "+r"(y_buf),        // %[y_buf]
4037
3.54k
                 [u_buf] "+r"(u_buf),        // %[u_buf]
4038
3.54k
                 [v_buf] "+r"(v_buf),        // %[v_buf]
4039
3.54k
                 [a_buf] "+r"(a_buf),        // %[a_buf]
4040
3.54k
                 [dst_argb] "+r"(dst_argb),  // %[dst_argb]
4041
#if defined(__i386__)
4042
                 [width] "+m"(width)  // %[width]
4043
#else
4044
3.54k
                 [width] "+rm"(width)  // %[width]
4045
3.54k
#endif
4046
3.54k
               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
4047
3.54k
               : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2",
4048
3.54k
                 "xmm3", "xmm4", "xmm5");
4049
3.54k
}
4050
#endif  // HAS_I422ALPHATOARGBROW_AVX2
4051
4052
#if defined(HAS_I422TORGBAROW_AVX2)
4053
// 16 pixels
4054
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
4055
void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
4056
                               const uint8_t* u_buf,
4057
                               const uint8_t* v_buf,
4058
                               uint8_t* dst_argb,
4059
                               const struct YuvConstants* yuvconstants,
4060
3.98k
                               int width) {
4061
3.98k
  asm volatile (
4062
3.98k
    YUVTORGB_SETUP_AVX2(yuvconstants)
4063
3.98k
      "sub         %[u_buf],%[v_buf]             \n"
4064
3.98k
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
4065
4066
3.98k
    LABELALIGN
4067
3.98k
      "1:          \n"
4068
3.98k
    READYUV422_AVX2
4069
3.98k
    YUVTORGB_AVX2(yuvconstants)
4070
4071
    // Step 3: Weave into RGBA
4072
3.98k
    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
4073
3.98k
    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
4074
3.98k
    "vpunpcklbw %%ymm0,%%ymm5,%%ymm2           \n"
4075
3.98k
    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
4076
3.98k
    "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"
4077
3.98k
    "vpunpckhwd %%ymm1,%%ymm2,%%ymm1           \n"
4078
3.98k
    "vmovdqu    %%ymm0,(%[dst_argb])           \n"
4079
3.98k
    "vmovdqu    %%ymm1,0x20(%[dst_argb])       \n"
4080
3.98k
    "lea        0x40(%[dst_argb]),%[dst_argb]  \n"
4081
3.98k
    "sub        $0x10,%[width]                 \n"
4082
3.98k
    "jg         1b                             \n"
4083
3.98k
    "vzeroupper                                \n"
4084
3.98k
  : [y_buf]"+r"(y_buf),    // %[y_buf]
4085
3.98k
    [u_buf]"+r"(u_buf),    // %[u_buf]
4086
3.98k
    [v_buf]"+r"(v_buf),    // %[v_buf]
4087
3.98k
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
4088
3.98k
    [width]"+rm"(width)    // %[width]
4089
3.98k
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
4090
3.98k
  : "memory", "cc", YUVTORGB_REGS_AVX2
4091
3.98k
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4092
3.98k
  );
4093
3.98k
}
4094
#endif  // HAS_I422TORGBAROW_AVX2
4095
4096
#if defined(HAS_NV12TOARGBROW_AVX2)
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
                               const uint8_t* uv_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // Semi-planar input: Y plane plus interleaved UV plane.
  // ymm5 = all-ones = opaque alpha for every pixel.
  // Fix: clobber list previously named "xmm0" twice; the duplicate is removed.
  asm volatile(YUVTORGB_SETUP_AVX2(
      yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"

      LABELALIGN "1:          \n" READNV12_AVX2 YUVTORGB_AVX2(
                   yuvconstants) STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
               : [y_buf] "+r"(y_buf),              // %[y_buf]
                 [uv_buf] "+r"(uv_buf),            // %[uv_buf]
                 [dst_argb] "+r"(dst_argb),        // %[dst_argb]
                 [width] "+rm"(width)              // %[width]
               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
               : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1",
                 "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_NV12TOARGBROW_AVX2
4121
4122
#if defined(HAS_NV21TOARGBROW_AVX2)
// 16 pixels.
// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
                               const uint8_t* vu_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // Semi-planar input with V before U; READNV21_AVX2 uses kShuffleNV21 to
  // swap the chroma order. ymm5 = all-ones = opaque alpha.
  // Fix: clobber list previously named "xmm0" twice; the duplicate is removed.
  asm volatile(YUVTORGB_SETUP_AVX2(
      yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"

      LABELALIGN "1:          \n" READNV21_AVX2 YUVTORGB_AVX2(
                   yuvconstants) STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
               : [y_buf] "+r"(y_buf),               // %[y_buf]
                 [vu_buf] "+r"(vu_buf),             // %[vu_buf]
                 [dst_argb] "+r"(dst_argb),         // %[dst_argb]
                 [width] "+rm"(width)               // %[width]
               : [yuvconstants] "r"(yuvconstants),  // %[yuvconstants]
                 [kShuffleNV21] "m"(kShuffleNV21)
               : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1",
                 "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_NV21TOARGBROW_AVX2
4148
4149
#if defined(HAS_YUY2TOARGBROW_AVX2)
// 16 pixels.
// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // Packed YUY2 input; ymm6/ymm7 hold the Y and UV deinterleave shuffles for
  // the whole loop, ymm5 = all-ones = opaque alpha.
  // Fix: clobber list previously named "xmm0" twice; the duplicate is removed.
  asm volatile(
      "vbroadcastf128 %[kShuffleYUY2Y],%%ymm6    \n"
      "vbroadcastf128 %[kShuffleYUY2UV],%%ymm7   \n" YUVTORGB_SETUP_AVX2(
      yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"

      LABELALIGN "1:          \n" READYUY2_AVX2 YUVTORGB_AVX2(yuvconstants)
          STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : [yuy2_buf] "+r"(yuy2_buf),         // %[yuy2_buf]
        [dst_argb] "+r"(dst_argb),         // %[dst_argb]
        [width] "+rm"(width)               // %[width]
      : [yuvconstants] "r"(yuvconstants),  // %[yuvconstants]
        [kShuffleYUY2Y] "m"(kShuffleYUY2Y), [kShuffleYUY2UV] "m"(kShuffleYUY2UV)
      : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2",
        "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
}
#endif  // HAS_YUY2TOARGBROW_AVX2
4175
4176
#if defined(HAS_UYVYTOARGBROW_AVX2)
4177
// 16 pixels.
4178
// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
4179
void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
4180
                               uint8_t* dst_argb,
4181
                               const struct YuvConstants* yuvconstants,
4182
0
                               int width) {
4183
0
  asm volatile(
4184
0
      "vbroadcastf128 %[kShuffleUYVYY],%%ymm6    \n"
4185
0
      "vbroadcastf128 %[kShuffleUYVYUV],%%ymm7   \n" YUVTORGB_SETUP_AVX2(
4186
0
      yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
4187
4188
0
      LABELALIGN "1:          \n" READUYVY_AVX2 YUVTORGB_AVX2(yuvconstants)
4189
0
          STOREARGB_AVX2
4190
0
      "sub         $0x10,%[width]                \n"
4191
0
      "jg          1b                            \n"
4192
0
      "vzeroupper  \n"
4193
0
      : [uyvy_buf] "+r"(uyvy_buf),         // %[uyvy_buf]
4194
0
        [dst_argb] "+r"(dst_argb),         // %[dst_argb]
4195
0
        [width] "+rm"(width)               // %[width]
4196
0
      : [yuvconstants] "r"(yuvconstants),  // %[yuvconstants]
4197
0
        [kShuffleUYVYY] "m"(kShuffleUYVYY), [kShuffleUYVYUV] "m"(kShuffleUYVYUV)
4198
0
      : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3",
4199
0
        "xmm4", "xmm5", "xmm6", "xmm7");
4200
0
}
4201
#endif  // HAS_UYVYTOARGBROW_AVX2
4202
4203
#if defined(HAS_P210TOARGBROW_AVX2)
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf,
                               const uint16_t* uv_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // Semi-planar 10-bit 4:2:2 (16-bit samples, interleaved UV) to 8-bit ARGB.
  // ymm5 = all-ones = opaque alpha.
  // Fix: clobber list previously named "xmm0" twice; the duplicate is removed.
  asm volatile(YUVTORGB_SETUP_AVX2(
      yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"

      LABELALIGN "1:          \n" READP210_AVX2 YUVTORGB_AVX2(
                   yuvconstants) STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
               : [y_buf] "+r"(y_buf),              // %[y_buf]
                 [uv_buf] "+r"(uv_buf),            // %[uv_buf]
                 [dst_argb] "+r"(dst_argb),        // %[dst_argb]
                 [width] "+rm"(width)              // %[width]
               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
               : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1",
                 "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_P210TOARGBROW_AVX2
4228
4229
#if defined(HAS_P410TOARGBROW_AVX2)
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf,
                               const uint16_t* uv_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // Semi-planar 10-bit 4:4:4 (16-bit samples, interleaved UV) to 8-bit ARGB.
  // ymm5 = all-ones = opaque alpha.
  // Fix: clobber list previously named "xmm0" twice; the duplicate is removed.
  asm volatile(YUVTORGB_SETUP_AVX2(
      yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"

      LABELALIGN "1:          \n" READP410_AVX2 YUVTORGB_AVX2(
                   yuvconstants) STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
               : [y_buf] "+r"(y_buf),              // %[y_buf]
                 [uv_buf] "+r"(uv_buf),            // %[uv_buf]
                 [dst_argb] "+r"(dst_argb),        // %[dst_argb]
                 [width] "+rm"(width)              // %[width]
               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
               : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1",
                 "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_P410TOARGBROW_AVX2
4254
4255
#if defined(HAS_P210TOAR30ROW_AVX2)
4256
// 16 pixels
4257
// 16 UV values with 16 Y producing 16 AR30 (64 bytes).
4258
void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf,
4259
                               const uint16_t* uv_buf,
4260
                               uint8_t* dst_ar30,
4261
                               const struct YuvConstants* yuvconstants,
4262
0
                               int width) {
4263
0
  asm volatile (
4264
0
    YUVTORGB_SETUP_AVX2(yuvconstants)
4265
0
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
4266
0
      "vpsrlw      $14,%%ymm5,%%ymm5             \n"
4267
0
      "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
4268
0
      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
4269
0
      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
4270
0
      "vpsrlw      $6,%%ymm7,%%ymm7              \n"
4271
4272
0
    LABELALIGN
4273
0
      "1:          \n"
4274
0
    READP210_AVX2
4275
0
    YUVTORGB16_AVX2(yuvconstants)
4276
0
    STOREAR30_AVX2
4277
0
      "sub         $0x10,%[width]                \n"
4278
0
      "jg          1b                            \n"
4279
4280
0
      "vzeroupper  \n"
4281
0
  : [y_buf]"+r"(y_buf),    // %[y_buf]
4282
0
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
4283
0
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
4284
0
    [width]"+rm"(width)    // %[width]
4285
0
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
4286
0
  : "memory", "cc", YUVTORGB_REGS_AVX2
4287
0
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4288
0
  );
4289
0
}
4290
#endif  // HAS_P210TOAR30ROW_AVX2
4291
4292
#if defined(HAS_P410TOAR30ROW_AVX2)
4293
// 16 pixels
4294
// 16 UV values with 16 Y producing 16 AR30 (64 bytes).
4295
void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf,
4296
                               const uint16_t* uv_buf,
4297
                               uint8_t* dst_ar30,
4298
                               const struct YuvConstants* yuvconstants,
4299
0
                               int width) {
4300
0
  asm volatile (
4301
0
    YUVTORGB_SETUP_AVX2(yuvconstants)
4302
0
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
4303
0
      "vpsrlw      $14,%%ymm5,%%ymm5             \n"
4304
0
      "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
4305
0
      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
4306
0
      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
4307
0
      "vpsrlw      $6,%%ymm7,%%ymm7              \n"
4308
4309
0
    LABELALIGN
4310
0
      "1:          \n"
4311
0
    READP410_AVX2
4312
0
    YUVTORGB16_AVX2(yuvconstants)
4313
0
    STOREAR30_AVX2
4314
0
      "sub         $0x10,%[width]                \n"
4315
0
      "jg          1b                            \n"
4316
4317
0
      "vzeroupper  \n"
4318
0
  : [y_buf]"+r"(y_buf),    // %[y_buf]
4319
0
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
4320
0
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
4321
0
    [width]"+rm"(width)    // %[width]
4322
0
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
4323
0
  : "memory", "cc", YUVTORGB_REGS_AVX2
4324
0
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4325
0
  );
4326
0
}
4327
#endif  // HAS_P410TOAR30ROW_AVX2
4328
4329
#ifdef HAS_I400TOARGBROW_SSE2
4330
void I400ToARGBRow_SSE2(const uint8_t* y_buf,
4331
                        uint8_t* dst_argb,
4332
                        const struct YuvConstants* yuvconstants,
4333
0
                        int width) {
4334
0
  asm volatile(
4335
0
      "movdqa      96(%3),%%xmm2                 \n"  // yg = 18997 = 1.164
4336
0
      "movdqa      128(%3),%%xmm3                \n"  // ygb = 1160 = 1.164 * 16
4337
0
      "pcmpeqb     %%xmm4,%%xmm4                 \n"  // 0xff000000
4338
0
      "pslld       $0x18,%%xmm4                  \n"
4339
4340
0
      LABELALIGN
4341
0
      "1:          \n"
4342
      // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
4343
0
      "movq      (%0),%%xmm0                     \n"
4344
0
      "lea       0x8(%0),%0                      \n"
4345
0
      "punpcklbw %%xmm0,%%xmm0                   \n"
4346
0
      "pmulhuw   %%xmm2,%%xmm0                   \n"
4347
0
      "paddsw    %%xmm3,%%xmm0                   \n"
4348
0
      "psraw     $6, %%xmm0                      \n"
4349
0
      "packuswb  %%xmm0,%%xmm0                   \n"
4350
4351
      // Step 2: Weave into ARGB
4352
0
      "punpcklbw %%xmm0,%%xmm0                   \n"
4353
0
      "movdqa    %%xmm0,%%xmm1                   \n"
4354
0
      "punpcklwd %%xmm0,%%xmm0                   \n"
4355
0
      "punpckhwd %%xmm1,%%xmm1                   \n"
4356
0
      "por       %%xmm4,%%xmm0                   \n"
4357
0
      "por       %%xmm4,%%xmm1                   \n"
4358
0
      "movdqu    %%xmm0,(%1)                     \n"
4359
0
      "movdqu    %%xmm1,0x10(%1)                 \n"
4360
0
      "lea       0x20(%1),%1                     \n"
4361
4362
0
      "sub       $0x8,%2                         \n"
4363
0
      "jg        1b                              \n"
4364
0
      : "+r"(y_buf),       // %0
4365
0
        "+r"(dst_argb),    // %1
4366
0
        "+rm"(width)       // %2
4367
0
      : "r"(yuvconstants)  // %3
4368
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
4369
0
}
4370
#endif  // HAS_I400TOARGBROW_SSE2
4371
4372
#ifdef HAS_I400TOARGBROW_AVX2
4373
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
4374
// note: vpunpcklbw mutates and vpackuswb unmutates.
4375
void I400ToARGBRow_AVX2(const uint8_t* y_buf,
4376
                        uint8_t* dst_argb,
4377
                        const struct YuvConstants* yuvconstants,
4378
363
                        int width) {
4379
363
  asm volatile(
4380
363
      "vmovdqa     96(%3),%%ymm2                 \n"  // yg = 18997 = 1.164
4381
363
      "vmovdqa     128(%3),%%ymm3                \n"  // ygb = -1160 = 1.164*16
4382
363
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"  // 0xff000000
4383
363
      "vpslld      $0x18,%%ymm4,%%ymm4           \n"
4384
4385
363
      LABELALIGN
4386
363
      "1:          \n"
4387
      // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
4388
363
      "vmovdqu    (%0),%%xmm0                    \n"
4389
363
      "lea        0x10(%0),%0                    \n"
4390
363
      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
4391
363
      "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
4392
363
      "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
4393
363
      "vpaddsw    %%ymm3,%%ymm0,%%ymm0           \n"
4394
363
      "vpsraw     $0x6,%%ymm0,%%ymm0             \n"
4395
363
      "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
4396
363
      "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"
4397
363
      "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
4398
363
      "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"
4399
363
      "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
4400
363
      "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
4401
363
      "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
4402
363
      "vmovdqu    %%ymm0,(%1)                    \n"
4403
363
      "vmovdqu    %%ymm1,0x20(%1)                \n"
4404
363
      "lea        0x40(%1),%1                     \n"
4405
363
      "sub        $0x10,%2                       \n"
4406
363
      "jg        1b                              \n"
4407
363
      "vzeroupper                                \n"
4408
363
      : "+r"(y_buf),       // %0
4409
363
        "+r"(dst_argb),    // %1
4410
363
        "+rm"(width)       // %2
4411
363
      : "r"(yuvconstants)  // %3
4412
363
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
4413
363
}
4414
#endif  // HAS_I400TOARGBROW_AVX2
4415
4416
#ifdef HAS_MIRRORROW_SSSE3
4417
// Shuffle table for reversing the bytes.
4418
static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
4419
                                     7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};
4420
4421
0
void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
4422
0
  intptr_t temp_width = (intptr_t)(width);
4423
0
      asm volatile("movdqa      %3,%%xmm5                     \n"
4424
4425
0
               LABELALIGN
4426
0
      "1:          \n"
4427
0
      "movdqu      -0x10(%0,%2,1),%%xmm0         \n"
4428
0
      "pshufb      %%xmm5,%%xmm0                 \n"
4429
0
      "movdqu      %%xmm0,(%1)                   \n"
4430
0
      "lea         0x10(%1),%1                   \n"
4431
0
      "sub         $0x10,%2                      \n"
4432
0
      "jg          1b                            \n"
4433
0
               : "+r"(src),           // %0
4434
0
                 "+r"(dst),           // %1
4435
0
                 "+r"(temp_width)     // %2
4436
0
               : "m"(kShuffleMirror)  // %3
4437
0
               : "memory", "cc", "xmm0", "xmm5");
4438
0
}
4439
#endif  // HAS_MIRRORROW_SSSE3
4440
4441
#ifdef HAS_MIRRORROW_AVX2
4442
0
void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
4443
0
  intptr_t temp_width = (intptr_t)(width);
4444
0
      asm volatile("vbroadcastf128 %3,%%ymm5                  \n"
4445
4446
0
               LABELALIGN
4447
0
      "1:          \n"
4448
0
      "vmovdqu     -0x20(%0,%2,1),%%ymm0         \n"
4449
0
      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
4450
0
      "vpermq      $0x4e,%%ymm0,%%ymm0           \n"
4451
0
      "vmovdqu     %%ymm0,(%1)                   \n"
4452
0
      "lea         0x20(%1),%1                   \n"
4453
0
      "sub         $0x20,%2                      \n"
4454
0
      "jg          1b                            \n"
4455
0
      "vzeroupper  \n"
4456
0
               : "+r"(src),           // %0
4457
0
                 "+r"(dst),           // %1
4458
0
                 "+r"(temp_width)     // %2
4459
0
               : "m"(kShuffleMirror)  // %3
4460
0
               : "memory", "cc", "xmm0", "xmm5");
4461
0
}
4462
#endif  // HAS_MIRRORROW_AVX2
4463
4464
#ifdef HAS_MIRRORUVROW_SSSE3
4465
// Shuffle table for reversing the UV.
4466
static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
4467
                                       6u,  7u,  4u,  5u,  2u,  3u,  0u, 1u};
4468
4469
0
void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
4470
0
  intptr_t temp_width = (intptr_t)(width);
4471
0
      asm volatile("movdqa      %3,%%xmm5                     \n"
4472
4473
0
               LABELALIGN
4474
0
      "1:          \n"
4475
0
      "movdqu      -0x10(%0,%2,2),%%xmm0         \n"
4476
0
      "pshufb      %%xmm5,%%xmm0                 \n"
4477
0
      "movdqu      %%xmm0,(%1)                   \n"
4478
0
      "lea         0x10(%1),%1                   \n"
4479
0
      "sub         $0x8,%2                       \n"
4480
0
      "jg          1b                            \n"
4481
0
               : "+r"(src_uv),          // %0
4482
0
                 "+r"(dst_uv),          // %1
4483
0
                 "+r"(temp_width)       // %2
4484
0
               : "m"(kShuffleMirrorUV)  // %3
4485
0
               : "memory", "cc", "xmm0", "xmm5");
4486
0
}
4487
#endif  // HAS_MIRRORUVROW_SSSE3
4488
4489
#ifdef HAS_MIRRORUVROW_AVX2
4490
0
void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
4491
0
  intptr_t temp_width = (intptr_t)(width);
4492
0
      asm volatile("vbroadcastf128 %3,%%ymm5                  \n"
4493
4494
0
               LABELALIGN
4495
0
      "1:          \n"
4496
0
      "vmovdqu     -0x20(%0,%2,2),%%ymm0         \n"
4497
0
      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
4498
0
      "vpermq      $0x4e,%%ymm0,%%ymm0           \n"
4499
0
      "vmovdqu     %%ymm0,(%1)                   \n"
4500
0
      "lea         0x20(%1),%1                   \n"
4501
0
      "sub         $0x10,%2                      \n"
4502
0
      "jg          1b                            \n"
4503
0
      "vzeroupper  \n"
4504
0
               : "+r"(src_uv),          // %0
4505
0
                 "+r"(dst_uv),          // %1
4506
0
                 "+r"(temp_width)       // %2
4507
0
               : "m"(kShuffleMirrorUV)  // %3
4508
0
               : "memory", "cc", "xmm0", "xmm5");
4509
0
}
4510
#endif  // HAS_MIRRORUVROW_AVX2
4511
4512
#ifdef HAS_MIRRORSPLITUVROW_SSSE3
4513
// Shuffle table for reversing the bytes of UV channels.
4514
static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
4515
                                            15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
4516
void MirrorSplitUVRow_SSSE3(const uint8_t* src,
4517
                            uint8_t* dst_u,
4518
                            uint8_t* dst_v,
4519
0
                            int width) {
4520
0
  intptr_t temp_width = (intptr_t)(width);
4521
0
  asm volatile(
4522
0
      "movdqa      %4,%%xmm1                     \n"
4523
0
      "lea         -0x10(%0,%3,2),%0             \n"
4524
0
      "sub         %1,%2                         \n"
4525
4526
0
      LABELALIGN
4527
0
      "1:          \n"
4528
0
      "movdqu      (%0),%%xmm0                   \n"
4529
0
      "lea         -0x10(%0),%0                  \n"
4530
0
      "pshufb      %%xmm1,%%xmm0                 \n"
4531
0
      "movlpd      %%xmm0,(%1)                   \n"
4532
0
      "movhpd      %%xmm0,0x00(%1,%2,1)          \n"
4533
0
      "lea         0x8(%1),%1                    \n"
4534
0
      "sub         $8,%3                         \n"
4535
0
      "jg          1b                            \n"
4536
0
      : "+r"(src),                  // %0
4537
0
        "+r"(dst_u),                // %1
4538
0
        "+r"(dst_v),                // %2
4539
0
        "+r"(temp_width)            // %3
4540
0
      : "m"(kShuffleMirrorSplitUV)  // %4
4541
0
      : "memory", "cc", "xmm0", "xmm1");
4542
0
}
4543
#endif  // HAS_MIRRORSPLITUVROW_SSSE3
4544
4545
#ifdef HAS_RGB24MIRRORROW_SSSE3
4546
4547
// Shuffle first 5 pixels to last 5 mirrored.  first byte zero
4548
static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u,
4549
                                         7u,   8u,  3u,  4u,  5u, 0u,  1u,  2u};
4550
4551
// Shuffle last 5 pixels to first 5 mirrored.  last byte zero
4552
static const uvec8 kShuffleMirrorRGB1 = {
4553
    13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u};
4554
4555
// Shuffle 5 pixels at a time (15 bytes)
4556
void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
4557
                          uint8_t* dst_rgb24,
4558
0
                          int width) {
4559
0
  intptr_t temp_width = (intptr_t)(width);
4560
0
  src_rgb24 += width * 3 - 48;
4561
0
  asm volatile(
4562
0
      "movdqa      %3,%%xmm4                     \n"
4563
0
      "movdqa      %4,%%xmm5                     \n"
4564
4565
0
      LABELALIGN
4566
0
      "1:          \n"
4567
0
      "movdqu      (%0),%%xmm0                   \n"  // first 5
4568
0
      "movdqu      15(%0),%%xmm1                 \n"  // next 5
4569
0
      "movdqu      30(%0),%%xmm2                 \n"  // next 5
4570
0
      "movdqu      32(%0),%%xmm3                 \n"  // last 1 special
4571
0
      "pshufb      %%xmm4,%%xmm0                 \n"
4572
0
      "pshufb      %%xmm4,%%xmm1                 \n"
4573
0
      "pshufb      %%xmm4,%%xmm2                 \n"
4574
0
      "pshufb      %%xmm5,%%xmm3                 \n"
4575
0
      "lea         -0x30(%0),%0                  \n"
4576
0
      "movdqu      %%xmm0,32(%1)                 \n"  // last 5
4577
0
      "movdqu      %%xmm1,17(%1)                 \n"  // next 5
4578
0
      "movdqu      %%xmm2,2(%1)                  \n"  // next 5
4579
0
      "movlpd      %%xmm3,0(%1)                  \n"  // first 1
4580
0
      "lea         0x30(%1),%1                   \n"
4581
0
      "sub         $0x10,%2                      \n"
4582
0
      "jg          1b                            \n"
4583
0
      : "+r"(src_rgb24),          // %0
4584
0
        "+r"(dst_rgb24),          // %1
4585
0
        "+r"(temp_width)          // %2
4586
0
      : "m"(kShuffleMirrorRGB0),  // %3
4587
0
        "m"(kShuffleMirrorRGB1)   // %4
4588
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
4589
0
}
4590
#endif  // HAS_RGB24MIRRORROW_SSSE3
4591
4592
#ifdef HAS_ARGBMIRRORROW_SSE2
4593
4594
0
void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
4595
0
  intptr_t temp_width = (intptr_t)(width);
4596
0
      asm volatile("lea         -0x10(%0,%2,4),%0             \n"
4597
4598
0
               LABELALIGN
4599
0
      "1:          \n"
4600
0
      "movdqu      (%0),%%xmm0                   \n"
4601
0
      "pshufd      $0x1b,%%xmm0,%%xmm0           \n"
4602
0
      "lea         -0x10(%0),%0                  \n"
4603
0
      "movdqu      %%xmm0,(%1)                   \n"
4604
0
      "lea         0x10(%1),%1                   \n"
4605
0
      "sub         $0x4,%2                       \n"
4606
0
      "jg          1b                            \n"
4607
0
               : "+r"(src),        // %0
4608
0
                 "+r"(dst),        // %1
4609
0
                 "+r"(temp_width)  // %2
4610
0
               :
4611
0
               : "memory", "cc", "xmm0");
4612
0
}
4613
#endif  // HAS_ARGBMIRRORROW_SSE2
4614
4615
#ifdef HAS_ARGBMIRRORROW_AVX2
4616
// Shuffle table for reversing the bytes.
4617
static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
4618
0
void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
4619
0
  intptr_t temp_width = (intptr_t)(width);
4620
0
      asm volatile("vmovdqu     %3,%%ymm5                     \n"
4621
4622
0
               LABELALIGN
4623
0
      "1:          \n"
4624
0
      "vpermd      -0x20(%0,%2,4),%%ymm5,%%ymm0  \n"
4625
0
      "vmovdqu     %%ymm0,(%1)                   \n"
4626
0
      "lea         0x20(%1),%1                   \n"
4627
0
      "sub         $0x8,%2                       \n"
4628
0
      "jg          1b                            \n"
4629
0
      "vzeroupper  \n"
4630
0
               : "+r"(src),                    // %0
4631
0
                 "+r"(dst),                    // %1
4632
0
                 "+r"(temp_width)              // %2
4633
0
               : "m"(kARGBShuffleMirror_AVX2)  // %3
4634
0
               : "memory", "cc", "xmm0", "xmm5");
4635
0
}
4636
#endif  // HAS_ARGBMIRRORROW_AVX2
4637
4638
#ifdef HAS_SPLITUVROW_AVX2
4639
void SplitUVRow_AVX2(const uint8_t* src_uv,
4640
                     uint8_t* dst_u,
4641
                     uint8_t* dst_v,
4642
0
                     int width) {
4643
0
  asm volatile(
4644
0
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
4645
0
      "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
4646
0
      "sub         %1,%2                         \n"
4647
4648
0
      LABELALIGN
4649
0
      "1:          \n"
4650
0
      "vmovdqu     (%0),%%ymm0                   \n"
4651
0
      "vmovdqu     0x20(%0),%%ymm1               \n"
4652
0
      "lea         0x40(%0),%0                   \n"
4653
0
      "vpsrlw      $0x8,%%ymm0,%%ymm2            \n"
4654
0
      "vpsrlw      $0x8,%%ymm1,%%ymm3            \n"
4655
0
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
4656
0
      "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
4657
0
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
4658
0
      "vpackuswb   %%ymm3,%%ymm2,%%ymm2          \n"
4659
0
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
4660
0
      "vpermq      $0xd8,%%ymm2,%%ymm2           \n"
4661
0
      "vmovdqu     %%ymm0,(%1)                   \n"
4662
0
      "vmovdqu     %%ymm2,0x00(%1,%2,1)          \n"
4663
0
      "lea         0x20(%1),%1                   \n"
4664
0
      "sub         $0x20,%3                      \n"
4665
0
      "jg          1b                            \n"
4666
0
      "vzeroupper  \n"
4667
0
      : "+r"(src_uv),  // %0
4668
0
        "+r"(dst_u),   // %1
4669
0
        "+r"(dst_v),   // %2
4670
0
        "+r"(width)    // %3
4671
0
      :
4672
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
4673
0
}
4674
#endif  // HAS_SPLITUVROW_AVX2
4675
4676
#ifdef HAS_SPLITUVROW_SSE2
4677
void SplitUVRow_SSE2(const uint8_t* src_uv,
4678
                     uint8_t* dst_u,
4679
                     uint8_t* dst_v,
4680
0
                     int width) {
4681
0
  asm volatile(
4682
0
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
4683
0
      "psrlw       $0x8,%%xmm5                   \n"
4684
0
      "sub         %1,%2                         \n"
4685
4686
0
      LABELALIGN
4687
0
      "1:          \n"
4688
0
      "movdqu      (%0),%%xmm0                   \n"
4689
0
      "movdqu      0x10(%0),%%xmm1               \n"
4690
0
      "lea         0x20(%0),%0                   \n"
4691
0
      "movdqa      %%xmm0,%%xmm2                 \n"
4692
0
      "movdqa      %%xmm1,%%xmm3                 \n"
4693
0
      "pand        %%xmm5,%%xmm0                 \n"
4694
0
      "pand        %%xmm5,%%xmm1                 \n"
4695
0
      "packuswb    %%xmm1,%%xmm0                 \n"
4696
0
      "psrlw       $0x8,%%xmm2                   \n"
4697
0
      "psrlw       $0x8,%%xmm3                   \n"
4698
0
      "packuswb    %%xmm3,%%xmm2                 \n"
4699
0
      "movdqu      %%xmm0,(%1)                   \n"
4700
0
      "movdqu      %%xmm2,0x00(%1,%2,1)          \n"
4701
0
      "lea         0x10(%1),%1                   \n"
4702
0
      "sub         $0x10,%3                      \n"
4703
0
      "jg          1b                            \n"
4704
0
      : "+r"(src_uv),  // %0
4705
0
        "+r"(dst_u),   // %1
4706
0
        "+r"(dst_v),   // %2
4707
0
        "+r"(width)    // %3
4708
0
      :
4709
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
4710
0
}
4711
#endif  // HAS_SPLITUVROW_SSE2
4712
4713
#ifdef HAS_DETILEROW_SSE2
4714
void DetileRow_SSE2(const uint8_t* src,
4715
                    ptrdiff_t src_tile_stride,
4716
                    uint8_t* dst,
4717
0
                    int width) {
4718
0
  asm volatile(
4719
0
      "1:          \n"
4720
0
      "movdqu      (%0),%%xmm0                   \n"
4721
0
      "sub         $0x10,%2                      \n"
4722
0
      "lea         (%0,%3),%0                    \n"
4723
0
      "movdqu      %%xmm0,(%1)                   \n"
4724
0
      "lea         0x10(%1),%1                   \n"
4725
0
      "jg          1b                            \n"
4726
0
      : "+r"(src),            // %0
4727
0
        "+r"(dst),            // %1
4728
0
        "+r"(width)           // %2
4729
0
      : "r"(src_tile_stride)  // %3
4730
0
      : "cc", "memory", "xmm0");
4731
0
}
4732
#endif  // HAS_DETILEROW_SSE2
4733
4734
#ifdef HAS_DETILEROW_16_SSE2
4735
void DetileRow_16_SSE2(const uint16_t* src,
4736
                       ptrdiff_t src_tile_stride,
4737
                       uint16_t* dst,
4738
0
                       int width) {
4739
0
  asm volatile(
4740
0
      "1:          \n"
4741
0
      "movdqu      (%0),%%xmm0                   \n"
4742
0
      "movdqu      0x10(%0),%%xmm1               \n"
4743
0
      "lea         (%0,%3,2),%0                  \n"
4744
0
      "movdqu      %%xmm0,(%1)                   \n"
4745
0
      "movdqu      %%xmm1,0x10(%1)               \n"
4746
0
      "lea         0x20(%1),%1                   \n"
4747
0
      "sub         $0x10,%2                      \n"
4748
0
      "jg          1b                            \n"
4749
0
      : "+r"(src),            // %0
4750
0
        "+r"(dst),            // %1
4751
0
        "+r"(width)           // %2
4752
0
      : "r"(src_tile_stride)  // %3
4753
0
      : "cc", "memory", "xmm0", "xmm1");
4754
0
}
4755
#endif  // HAS_DETILEROW_SSE2
4756
4757
#ifdef HAS_DETILEROW_16_AVX
4758
void DetileRow_16_AVX(const uint16_t* src,
4759
                      ptrdiff_t src_tile_stride,
4760
                      uint16_t* dst,
4761
0
                      int width) {
4762
0
  asm volatile(
4763
0
      "1:          \n"
4764
0
      "vmovdqu     (%0),%%ymm0                   \n"
4765
0
      "lea         (%0,%3,2),%0                  \n"
4766
0
      "vmovdqu     %%ymm0,(%1)                   \n"
4767
0
      "lea         0x20(%1),%1                   \n"
4768
0
      "sub         $0x10,%2                      \n"
4769
0
      "jg          1b                            \n"
4770
0
      "vzeroupper  \n"
4771
0
      : "+r"(src),            // %0
4772
0
        "+r"(dst),            // %1
4773
0
        "+r"(width)           // %2
4774
0
      : "r"(src_tile_stride)  // %3
4775
0
      : "cc", "memory", "xmm0");
4776
0
}
4777
#endif  // HAS_DETILEROW_AVX
4778
4779
#ifdef HAS_DETILETOYUY2_SSE2
4780
// Read 16 Y, 8 UV, and write 8 YUYV.
4781
void DetileToYUY2_SSE2(const uint8_t* src_y,
4782
                       ptrdiff_t src_y_tile_stride,
4783
                       const uint8_t* src_uv,
4784
                       ptrdiff_t src_uv_tile_stride,
4785
                       uint8_t* dst_yuy2,
4786
0
                       int width) {
4787
0
  asm volatile(
4788
0
      "1:          \n"
4789
0
      "movdqu      (%0),%%xmm0                   \n"  // Load 16 Y
4790
0
      "sub         $0x10,%3                      \n"
4791
0
      "lea         (%0,%4),%0                    \n"
4792
0
      "movdqu      (%1),%%xmm1                   \n"  // Load 8 UV
4793
0
      "lea         (%1,%5),%1                    \n"
4794
0
      "movdqu      %%xmm0,%%xmm2                 \n"
4795
0
      "punpcklbw   %%xmm1,%%xmm0                 \n"
4796
0
      "punpckhbw   %%xmm1,%%xmm2                 \n"
4797
0
      "movdqu      %%xmm0,(%2)                   \n"
4798
0
      "movdqu      %%xmm2,0x10(%2)               \n"
4799
0
      "lea         0x20(%2),%2                   \n"
4800
0
      "jg          1b                            \n"
4801
0
      : "+r"(src_y),                            // %0
4802
0
        "+r"(src_uv),                           // %1
4803
0
        "+r"(dst_yuy2),                         // %2
4804
0
        "+r"(width)                             // %3
4805
0
      : "r"(src_y_tile_stride),                 // %4
4806
0
        "r"(src_uv_tile_stride)                 // %5
4807
0
      : "cc", "memory", "xmm0", "xmm1", "xmm2"  // Clobber list
4808
0
  );
4809
0
}
4810
#endif
4811
4812
#ifdef HAS_DETILESPLITUVROW_SSSE3
4813
// TODO(greenjustin): Look into generating these constants instead of loading
4814
// them since this can cause branch mispredicts for fPIC code on 32-bit
4815
// machines.
4816
static const uvec8 kDeinterlaceUV = {0, 2, 4, 6, 8, 10, 12, 14,
4817
                                     1, 3, 5, 7, 9, 11, 13, 15};
4818
4819
// TODO(greenjustin): Research alternatives to pshufb, since pshufb can be very
4820
// slow on older SSE2 processors.
4821
void DetileSplitUVRow_SSSE3(const uint8_t* src_uv,
4822
                            ptrdiff_t src_tile_stride,
4823
                            uint8_t* dst_u,
4824
                            uint8_t* dst_v,
4825
0
                            int width) {
4826
0
  asm volatile(
4827
0
      "movdqu      %4,%%xmm1                     \n"
4828
0
      "1:          \n"
4829
0
      "movdqu      (%0),%%xmm0                   \n"
4830
0
      "lea         (%0, %5),%0                   \n"
4831
0
      "pshufb      %%xmm1,%%xmm0                 \n"
4832
0
      "movq        %%xmm0,(%1)                   \n"
4833
0
      "lea         0x8(%1),%1                    \n"
4834
0
      "movhps      %%xmm0,(%2)                   \n"
4835
0
      "lea         0x8(%2),%2                    \n"
4836
0
      "sub         $0x10,%3                      \n"
4837
0
      "jg          1b                            \n"
4838
0
      : "+r"(src_uv),         // %0
4839
0
        "+r"(dst_u),          // %1
4840
0
        "+r"(dst_v),          // %2
4841
0
        "+r"(width)           // %3
4842
0
      : "m"(kDeinterlaceUV),  // %4
4843
0
        "r"(src_tile_stride)  // %5
4844
0
      : "cc", "memory", "xmm0", "xmm1");
4845
0
}
4846
#endif  // HAS_DETILESPLITUVROW_SSSE3
4847
4848
#ifdef HAS_MERGEUVROW_AVX512BW
4849
void MergeUVRow_AVX512BW(const uint8_t* src_u,
4850
                         const uint8_t* src_v,
4851
                         uint8_t* dst_uv,
4852
0
                         int width) {
4853
0
      asm volatile("sub         %0,%1                         \n"
4854
4855
0
               LABELALIGN
4856
0
      "1:          \n"
4857
0
      "vpmovzxbw   (%0),%%zmm0                   \n"
4858
0
      "vpmovzxbw   0x00(%0,%1,1),%%zmm1          \n"
4859
0
      "lea         0x20(%0),%0                   \n"
4860
0
      "vpsllw      $0x8,%%zmm1,%%zmm1            \n"
4861
0
      "vporq       %%zmm0,%%zmm1,%%zmm2          \n"
4862
0
      "vmovdqu64   %%zmm2,(%2)                   \n"
4863
0
      "lea         0x40(%2),%2                   \n"
4864
0
      "sub         $0x20,%3                      \n"
4865
0
      "jg          1b                            \n"
4866
0
      "vzeroupper  \n"
4867
0
               : "+r"(src_u),   // %0
4868
0
                 "+r"(src_v),   // %1
4869
0
                 "+r"(dst_uv),  // %2
4870
0
                 "+r"(width)    // %3
4871
0
               :
4872
0
               : "memory", "cc", "xmm0", "xmm1", "xmm2");
4873
0
}
4874
#endif  // HAS_MERGEUVROW_AVX512BW
4875
4876
#ifdef HAS_MERGEUVROW_AVX2
4877
void MergeUVRow_AVX2(const uint8_t* src_u,
4878
                     const uint8_t* src_v,
4879
                     uint8_t* dst_uv,
4880
0
                     int width) {
4881
0
      asm volatile("sub         %0,%1                         \n"
4882
4883
0
               LABELALIGN
4884
0
      "1:          \n"
4885
0
      "vpmovzxbw   (%0),%%ymm0                   \n"
4886
0
      "vpmovzxbw   0x00(%0,%1,1),%%ymm1          \n"
4887
0
      "lea         0x10(%0),%0                   \n"
4888
0
      "vpsllw      $0x8,%%ymm1,%%ymm1            \n"
4889
0
      "vpor        %%ymm0,%%ymm1,%%ymm2          \n"
4890
0
      "vmovdqu     %%ymm2,(%2)                   \n"
4891
0
      "lea         0x20(%2),%2                   \n"
4892
0
      "sub         $0x10,%3                      \n"
4893
0
      "jg          1b                            \n"
4894
0
      "vzeroupper  \n"
4895
0
               : "+r"(src_u),   // %0
4896
0
                 "+r"(src_v),   // %1
4897
0
                 "+r"(dst_uv),  // %2
4898
0
                 "+r"(width)    // %3
4899
0
               :
4900
0
               : "memory", "cc", "xmm0", "xmm1", "xmm2");
4901
0
}
4902
#endif  // HAS_MERGEUVROW_AVX2
4903
4904
#ifdef HAS_MERGEUVROW_SSE2
4905
void MergeUVRow_SSE2(const uint8_t* src_u,
4906
                     const uint8_t* src_v,
4907
                     uint8_t* dst_uv,
4908
0
                     int width) {
4909
0
      asm volatile("sub         %0,%1                         \n"
4910
4911
0
               LABELALIGN
4912
0
      "1:          \n"
4913
0
      "movdqu      (%0),%%xmm0                   \n"
4914
0
      "movdqu      0x00(%0,%1,1),%%xmm1          \n"
4915
0
      "lea         0x10(%0),%0                   \n"
4916
0
      "movdqa      %%xmm0,%%xmm2                 \n"
4917
0
      "punpcklbw   %%xmm1,%%xmm0                 \n"
4918
0
      "punpckhbw   %%xmm1,%%xmm2                 \n"
4919
0
      "movdqu      %%xmm0,(%2)                   \n"
4920
0
      "movdqu      %%xmm2,0x10(%2)               \n"
4921
0
      "lea         0x20(%2),%2                   \n"
4922
0
      "sub         $0x10,%3                      \n"
4923
0
      "jg          1b                            \n"
4924
0
               : "+r"(src_u),   // %0
4925
0
                 "+r"(src_v),   // %1
4926
0
                 "+r"(dst_uv),  // %2
4927
0
                 "+r"(width)    // %3
4928
0
               :
4929
0
               : "memory", "cc", "xmm0", "xmm1", "xmm2");
4930
0
}
4931
#endif  // HAS_MERGEUVROW_SSE2
4932
4933
#ifdef HAS_MERGEUVROW_16_AVX2
4934
void MergeUVRow_16_AVX2(const uint16_t* src_u,
4935
                        const uint16_t* src_v,
4936
                        uint16_t* dst_uv,
4937
                        int depth,
4938
0
                        int width) {
4939
0
  asm volatile(
4940
0
      "vmovd       %4,%%xmm3                     \n"
4941
0
      "vmovd       %5,%%xmm4                     \n"
4942
4943
0
      "sub         %0,%1                         \n"
4944
      // 8 pixels per loop.
4945
4946
0
      LABELALIGN
4947
0
      "1:          \n"
4948
0
      "vpmovzxwd   (%0),%%ymm0                   \n"
4949
0
      "vpmovzxwd   0x00(%0,%1,1),%%ymm1          \n"
4950
0
      "lea         0x10(%0),%0                   \n"
4951
0
      "vpsllw      %%xmm3,%%ymm0,%%ymm0          \n"
4952
0
      "vpslld      %%xmm4,%%ymm1,%%ymm1          \n"
4953
0
      "vpor        %%ymm0,%%ymm1,%%ymm2          \n"
4954
0
      "vmovdqu     %%ymm2,(%2)                   \n"
4955
0
      "lea         0x20(%2),%2                   \n"
4956
0
      "sub         $0x8,%3                       \n"
4957
0
      "jg          1b                            \n"
4958
0
      "vzeroupper  \n"
4959
0
      : "+r"(src_u),      // %0
4960
0
        "+r"(src_v),      // %1
4961
0
        "+r"(dst_uv),     // %2
4962
0
        "+r"(width)       // %3
4963
0
      : "r"(16 - depth),  // %4
4964
0
        "r"(32 - depth)   // %5
4965
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
4966
0
}
4967
#endif  // HAS_MERGEUVROW_AVX2
4968
4969
#ifdef HAS_SPLITUVROW_16_AVX2
4970
const uvec8 kSplitUVShuffle16 = {0, 1, 4, 5, 8,  9,  12, 13,
4971
                                 2, 3, 6, 7, 10, 11, 14, 15};
4972
void SplitUVRow_16_AVX2(const uint16_t* src_uv,
4973
                        uint16_t* dst_u,
4974
                        uint16_t* dst_v,
4975
                        int depth,
4976
0
                        int width) {
4977
0
  depth = 16 - depth;
4978
0
  asm volatile(
4979
0
      "vmovd       %4,%%xmm3                     \n"
4980
0
      "vbroadcastf128 %5,%%ymm4                  \n"
4981
0
      "sub         %1,%2                         \n"
4982
4983
      // 16 pixels per loop.
4984
0
      LABELALIGN
4985
0
      "1:          \n"
4986
0
      "vmovdqu     (%0),%%ymm0                   \n"
4987
0
      "vmovdqu     0x20(%0),%%ymm1               \n"
4988
0
      "add         $0x40,%0                      \n"
4989
4990
0
      "vpsrlw      %%xmm3,%%ymm0,%%ymm0          \n"
4991
0
      "vpsrlw      %%xmm3,%%ymm1,%%ymm1          \n"
4992
0
      "vpshufb     %%ymm4,%%ymm0,%%ymm0          \n"
4993
0
      "vpshufb     %%ymm4,%%ymm1,%%ymm1          \n"
4994
0
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
4995
0
      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
4996
0
      "vextractf128 $0x0,%%ymm0,(%1)             \n"
4997
0
      "vextractf128 $0x0,%%ymm1,0x10(%1)         \n"
4998
0
      "vextractf128 $0x1,%%ymm0,(%1,%2)          \n"
4999
0
      "vextractf128 $0x1,%%ymm1,0x10(%1,%2)      \n"
5000
0
      "add         $0x20,%1                      \n"
5001
0
      "sub         $0x10,%3                      \n"
5002
0
      "jg          1b                            \n"
5003
0
      "vzeroupper  \n"
5004
0
      : "+r"(src_uv),           // %0
5005
0
        "+r"(dst_u),            // %1
5006
0
        "+r"(dst_v),            // %2
5007
0
        "+r"(width)             // %3
5008
0
      : "r"(depth),             // %4
5009
0
        "m"(kSplitUVShuffle16)  // %5
5010
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
5011
0
}
5012
#endif  // HAS_SPLITUVROW_16_AVX2
5013
5014
// Use scale to convert lsb formats to msb, depending how many bits there are:
5015
// 128 = 9 bits
5016
// 64 = 10 bits
5017
// 16 = 12 bits
5018
// 1 = 16 bits
5019
#ifdef HAS_MULTIPLYROW_16_AVX2
5020
// Multiplies each 16-bit sample by 'scale', keeping the low 16 bits of the
// product (vpmullw); with the power-of-two scales listed above this shifts
// lsb-justified samples up to msb-justified. The destination is addressed as
// src + (dst - src) so only one pointer is advanced in the loop.
void MultiplyRow_16_AVX2(const uint16_t* src_y,
                         uint16_t* dst_y,
                         int scale,
                         int width) {
  asm volatile(
      "vmovd       %3,%%xmm3                     \n"  // broadcast scale to
      "vpbroadcastw %%xmm3,%%ymm3                \n"  // every word of ymm3
      "sub         %0,%1                         \n"  // %1 = dst_y - src_y

      // 32 pixels per loop.
      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vpmullw     %%ymm3,%%ymm0,%%ymm0          \n"
      "vpmullw     %%ymm3,%%ymm1,%%ymm1          \n"
      "vmovdqu     %%ymm0,(%0,%1)                \n"
      "vmovdqu     %%ymm1,0x20(%0,%1)            \n"
      "add         $0x40,%0                      \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_y),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)   // %2
      : "r"(scale)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm3");
}
5048
#endif  // HAS_MULTIPLYROW_16_AVX2
5049
5050
// Use scale to convert msb formats to lsb, depending how many bits there are:
5051
// 512 = 9 bits
5052
// 1024 = 10 bits
5053
// 4096 = 12 bits
5054
// 65536 = 16 bits
5055
#ifdef HAS_DIVIDEROW_16_AVX2
5056
// Computes dst = (src * scale) >> 16 for each 16-bit sample (vpmulhuw keeps
// the high half of the unsigned product); with the scales listed above this
// shifts msb-justified samples down to lsb-justified.
// NOTE(review): 'scale' is declared "+r" (read-write) although the asm only
// reads it — harmless but inconsistent with MultiplyRow_16_AVX2 above.
void DivideRow_16_AVX2(const uint16_t* src_y,
                       uint16_t* dst_y,
                       int scale,
                       int width) {
  asm volatile(
      "vmovd       %3,%%xmm3                     \n"  // broadcast scale to
      "vpbroadcastw %%xmm3,%%ymm3                \n"  // every word of ymm3
      "sub         %0,%1                         \n"  // %1 = dst_y - src_y

      // 32 pixels per loop.
      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vpmulhuw    %%ymm3,%%ymm0,%%ymm0          \n"
      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"
      "vmovdqu     %%ymm0,(%0,%1)                \n"
      "vmovdqu     %%ymm1,0x20(%0,%1)            \n"
      "add         $0x40,%0                      \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_y),  // %0
        "+r"(dst_y),  // %1
        "+r"(width),  // %2
        "+r"(scale)   // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm3");
}
5085
#endif  // HAS_DIVIDEROW_16_AVX2
5086
5087
// Use scale to convert lsb formats to msb, depending how many bits there are:
5088
// 32768 = 9 bits
5089
// 16384 = 10 bits
5090
// 4096 = 12 bits
5091
// 256 = 16 bits
5092
// Narrows 16-bit samples to 8 bits: dst = saturate((src * scale) >> 16).
// pmulhuw performs the scaled right shift; packuswb saturates to unsigned
// bytes. See the scale table above for per-bit-depth values.
void Convert16To8Row_SSSE3(const uint16_t* src_y,
                           uint8_t* dst_y,
                           int scale,
                           int width) {
  asm volatile(
      "movd        %3,%%xmm2                     \n"  // broadcast scale to
      "punpcklwd   %%xmm2,%%xmm2                 \n"  // every word of xmm2
      "pshufd      $0x0,%%xmm2,%%xmm2            \n"

      // 16 pixels per loop.
      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "add         $0x20,%0                      \n"
      "pmulhuw     %%xmm2,%%xmm0                 \n"
      "pmulhuw     %%xmm2,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "add         $0x10,%1                      \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_y),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)   // %2
      : "r"(scale)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
5120
5121
#ifdef HAS_CONVERT16TO8ROW_AVX2
5122
// AVX2 version of Convert16To8Row: dst = saturate((src * scale) >> 16).
// vpackuswb interleaves 128-bit lanes, so vpermq restores linear order
// before the store.
void Convert16To8Row_AVX2(const uint16_t* src_y,
                          uint8_t* dst_y,
                          int scale,
                          int width) {
  asm volatile(
      "vmovd       %3,%%xmm2                     \n"  // broadcast scale to
      "vpbroadcastw %%xmm2,%%ymm2                \n"  // every word of ymm2

      // 32 pixels per loop.
      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "add         $0x40,%0                      \n"
      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
      "vpmulhuw    %%ymm2,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"  // mutates
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // undo lane interleave
      "vmovdqu     %%ymm0,(%1)                   \n"
      "add         $0x20,%1                      \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_y),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)   // %2
      : "r"(scale)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
5151
#endif  // HAS_CONVERT16TO8ROW_AVX2
5152
5153
#ifdef HAS_CONVERT16TO8ROW_AVX512BW
5154
// AVX-512BW version of Convert16To8Row: dst = saturate((src * scale) >> 16).
// vpmovuswb performs an in-order saturating word-to-byte narrowing, so no
// permute fixup is required as in the AVX2 version.
void Convert16To8Row_AVX512BW(const uint16_t* src_y,
                              uint8_t* dst_y,
                              int scale,
                              int width) {
      asm volatile("vpbroadcastw %3,%%zmm2                    \n"

               // 64 pixels per loop.
               LABELALIGN
      "1:          \n"
      "vmovups     (%0),%%zmm0                   \n"
      "vmovups     0x40(%0),%%zmm1               \n"
      "add         $0x80,%0                      \n"
      "vpmulhuw    %%zmm2,%%zmm0,%%zmm0          \n"
      "vpmulhuw    %%zmm2,%%zmm1,%%zmm1          \n"
      "vpmovuswb   %%zmm0,%%ymm0                 \n"  // saturating narrow
      "vpmovuswb   %%zmm1,%%ymm1                 \n"
      "vmovups     %%ymm0,(%1)                   \n"
      "vmovups     %%ymm1,0x20(%1)               \n"
      "add         $0x40,%1                      \n"
      "sub         $0x40,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
               : "+r"(src_y),  // %0
                 "+r"(dst_y),  // %1
                 "+r"(width)   // %2
               : "r"(scale)    // %3
               : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
5182
#endif  // HAS_CONVERT16TO8ROW_AVX512BW
5183
5184
// Use scale to convert to lsb formats depending how many bits there are:
5185
// 512 = 9 bits
5186
// 1024 = 10 bits
5187
// 4096 = 12 bits
5188
// Widens 8-bit samples to 16 bits: each byte is unpacked against itself so a
// byte b becomes the word b * 0x0101, then pmulhuw by 'scale' positions the
// result for the target bit depth (see the scale table above).
void Convert8To16Row_SSE2(const uint8_t* src_y,
                          uint16_t* dst_y,
                          int scale,
                          int width) {
  asm volatile(
      "movd        %3,%%xmm2                     \n"  // broadcast scale to
      "punpcklwd   %%xmm2,%%xmm2                 \n"  // every word of xmm2
      "pshufd      $0x0,%%xmm2,%%xmm2            \n"

      // 16 pixels per loop.
      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklbw   %%xmm0,%%xmm0                 \n"  // low 8 bytes -> words
      "punpckhbw   %%xmm1,%%xmm1                 \n"  // high 8 bytes -> words
      "add         $0x10,%0                      \n"
      "pmulhuw     %%xmm2,%%xmm0                 \n"
      "pmulhuw     %%xmm2,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "add         $0x20,%1                      \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_y),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)   // %2
      : "r"(scale)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
5218
5219
#ifdef HAS_CONVERT8TO16ROW_AVX2
5220
// AVX2 version of Convert8To16Row. Instead of multiplying, it converts the
// power-of-two 'scale' into an equivalent right-shift count via
// __builtin_clz, duplicates each byte into both halves of a word, and shifts
// down with vpsrlw. vpermq pre-arranges lanes so the two unpacks produce
// in-order output.
void Convert8To16Row_AVX2(const uint8_t* src_y,
                          uint16_t* dst_y,
                          int scale,
                          int width) {
  const int shift = __builtin_clz(scale) - 15;  // e.g. scale 1024 -> shift 6
      asm volatile("vmovd       %3,%%xmm2                     \n"

               // 32 pixels per loop.
               LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "add         $0x20,%0                      \n"
      "vpunpckhbw  %%ymm0,%%ymm0,%%ymm1          \n"  // duplicate bytes
      "vpunpcklbw  %%ymm0,%%ymm0,%%ymm0          \n"  // into words
      "vpsrlw      %%xmm2,%%ymm0,%%ymm0          \n"
      "vpsrlw      %%xmm2,%%ymm1,%%ymm1          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "add         $0x40,%1                      \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
               : "+r"(src_y),  // %0
                 "+r"(dst_y),  // %1
                 "+r"(width)   // %2
               : "r"(shift)    // %3
               : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
5249
#endif  // HAS_CONVERT8TO16ROW_AVX2
5250
5251
#ifdef HAS_SPLITRGBROW_SSSE3
5252
// Shuffle table for converting RGB to Planar.
5253
// Shuffle table for converting RGB to Planar.
// Nine gather vectors for SplitRGBRow_SSSE3. Each group of three rows
// extracts one channel (bytes at stride 3) from the three 16-byte input
// chunks: rows 0-2 start at byte offset 0, rows 3-5 at offset 1, rows 6-8 at
// offset 2. Index 128 makes pshufb emit a zero byte, so the three partial
// results of a group can be OR-combined into one full 16-byte plane output.
static const uvec8 kSplitRGBShuffle[9] = {
    {0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
     128u, 128u},
    {128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u,
     128u, 128u},
    {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 1u, 4u,
     7u, 10u, 13u},
    {1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
     128u, 128u},
    {128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u,
     128u, 128u},
    {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u,
     8u, 11u, 14u},
    {2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
     128u, 128u},
    {128u, 128u, 128u, 128u, 128u, 1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u,
     128u, 128u},
    {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u,
     12u, 15u}};
5272
5273
// De-interleaves packed 24-bit RGB into three planes, 16 pixels (48 input
// bytes) per iteration. The same 48 bytes are loaded three times; for each
// destination plane, one channel's bytes are gathered from each 16-byte
// chunk with pshufb (zeros elsewhere, see kSplitRGBShuffle) and the partial
// results are ORed into one 16-byte store.
void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       int width) {
  asm volatile(
      "1:          \n"
      // First channel -> dst_r.
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "pshufb      0(%5), %%xmm0                 \n"
      "pshufb      16(%5), %%xmm1                \n"
      "pshufb      32(%5), %%xmm2                \n"
      "por         %%xmm1,%%xmm0                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"

      // Second channel -> dst_g.
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "pshufb      48(%5),%%xmm0                 \n"
      "pshufb      64(%5),%%xmm1                 \n"
      "pshufb      80(%5), %%xmm2                \n"
      "por         %%xmm1,%%xmm0                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"

      // Third channel -> dst_b.
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "pshufb      96(%5), %%xmm0                \n"
      "pshufb      112(%5), %%xmm1               \n"
      "pshufb      128(%5), %%xmm2               \n"
      "por         %%xmm1,%%xmm0                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%3)                   \n"
      "lea         0x10(%3),%3                   \n"
      "lea         0x30(%0),%0                   \n"
      "sub         $0x10,%4                      \n"
      "jg          1b                            \n"
      : "+r"(src_rgb),             // %0
        "+r"(dst_r),               // %1
        "+r"(dst_g),               // %2
        "+r"(dst_b),               // %3
        "+r"(width)                // %4
      : "r"(&kSplitRGBShuffle[0])  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
5323
#endif  // HAS_SPLITRGBROW_SSSE3
5324
5325
#ifdef HAS_SPLITRGBROW_SSE41
5326
// Shuffle table for converting RGB to Planar, SSE4.1. Note: these are used for
5327
// the AVX2 implementation as well.
5328
// Rows 0-2: per-channel byte orderings applied with pshufb after the blend
// rounds in SplitRGBRow_SSE41. Rows 3-4: pblendvb masks (high bit set on
// every third byte) that select which source register contributes each byte;
// the mask is rotated with palignr between blend rounds.
static const uvec8 kSplitRGBShuffleSSE41[5] = {
    {0u, 3u, 6u, 9u, 12u, 15u, 2u, 5u, 8u, 11u, 14u, 1u, 4u, 7u, 10u, 13u},
    {1u, 4u, 7u, 10u, 13u, 0u, 3u, 6u, 9u, 12u, 15u, 2u, 5u, 8u, 11u, 14u},
    {2u, 5u, 8u, 11u, 14u, 1u, 4u, 7u, 10u, 13u, 0u, 3u, 6u, 9u, 12u, 15u},
    {0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u},
    {0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u},
};
5335
5336
// De-interleaves packed RGB into three planes, 16 pixels per iteration.
// Two rounds of pblendvb (implicit mask in xmm0, rotated between rounds with
// palignr) regroup the three 16-byte chunks so each register holds a single
// channel's bytes; pshufb with kSplitRGBShuffleSSE41 rows 0-2 then restores
// sequential order before the stores.
void SplitRGBRow_SSE41(const uint8_t* src_rgb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       int width) {
  asm volatile(
      "movdqa      48(%5), %%xmm0                \n"  // pblendvb mask
      "1:          \n"
      "movdqu      (%0),%%xmm1                   \n"
      "movdqu      0x10(%0),%%xmm2               \n"
      "movdqu      0x20(%0),%%xmm3               \n"
      "lea         0x30(%0),%0                   \n"
      "movdqa      %%xmm1, %%xmm4                \n"
      "pblendvb    %%xmm3, %%xmm1                \n"
      "pblendvb    %%xmm2, %%xmm3                \n"
      "pblendvb    %%xmm4, %%xmm2                \n"
      "palignr     $0xF, %%xmm0, %%xmm0          \n"  // rotate mask
      "pblendvb    %%xmm2, %%xmm1                \n"
      "pblendvb    %%xmm3, %%xmm2                \n"
      "pblendvb    %%xmm4, %%xmm3                \n"
      "palignr     $0x1, %%xmm0, %%xmm0          \n"  // restore mask
      "pshufb      0(%5), %%xmm1                 \n"
      "pshufb      16(%5), %%xmm2                \n"
      "pshufb      32(%5), %%xmm3                \n"
      "movdqu      %%xmm1,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "movdqu      %%xmm2,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "movdqu      %%xmm3,(%3)                   \n"
      "lea         0x10(%3),%3                   \n"
      "sub         $0x10,%4                      \n"
      "jg          1b                            \n"
      : "+r"(src_rgb),                  // %0
        "+r"(dst_r),                    // %1
        "+r"(dst_g),                    // %2
        "+r"(dst_b),                    // %3
        "+r"(width)                     // %4
      : "r"(&kSplitRGBShuffleSSE41[0])  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
5376
#endif  // HAS_SPLITRGBROW_SSE41
5377
5378
#ifdef HAS_SPLITRGBROW_AVX2
5379
// AVX2 SplitRGB: de-interleaves packed RGB into three planes, 32 pixels
// (96 input bytes) per iteration. vpblendd/vperm2i128 regroup the three
// 32-byte chunks into lane-aligned triples, two vpblendvb rounds separate
// the channels, and vpshufb orders the bytes. On x86-64 the shuffle tables
// stay resident in ymm8-10; on 32-bit x86 they are reloaded each iteration
// because only 8 ymm registers exist.
void SplitRGBRow_AVX2(const uint8_t* src_rgb,
                      uint8_t* dst_r,
                      uint8_t* dst_g,
                      uint8_t* dst_b,
                      int width) {
  asm volatile(
      "vbroadcasti128 48(%5), %%ymm0             \n"  // blend mask 1
      "vbroadcasti128 64(%5), %%ymm7             \n"  // blend mask 2
#if defined(__x86_64__)
      "vbroadcasti128 0(%5), %%ymm8              \n"
      "vbroadcasti128 16(%5), %%ymm9             \n"
      "vbroadcasti128 32(%5), %%ymm10            \n"
#endif
      "1:          \n"
      "vmovdqu     (%0),%%ymm4                   \n"
      "vmovdqu     0x20(%0),%%ymm5               \n"
      "vmovdqu     0x40(%0),%%ymm6               \n"
      "lea         0x60(%0),%0                   \n"
      "vpblendd    $240, %%ymm5, %%ymm4, %%ymm1  \n"
      "vperm2i128  $33, %%ymm6, %%ymm4, %%ymm2   \n"
      "vpblendd    $240, %%ymm6, %%ymm5, %%ymm3  \n"
      "vpblendvb   %%ymm0, %%ymm3, %%ymm1, %%ymm4 \n"
      "vpblendvb   %%ymm0, %%ymm1, %%ymm2, %%ymm5 \n"
      "vpblendvb   %%ymm0, %%ymm2, %%ymm3, %%ymm6 \n"
      "vpblendvb   %%ymm7, %%ymm5, %%ymm4, %%ymm1 \n"
      "vpblendvb   %%ymm7, %%ymm6, %%ymm5, %%ymm2 \n"
      "vpblendvb   %%ymm7, %%ymm4, %%ymm6, %%ymm3 \n"
#if defined(__x86_64__)
      "vpshufb     %%ymm8, %%ymm1, %%ymm1        \n"
      "vpshufb     %%ymm9, %%ymm2, %%ymm2        \n"
      "vpshufb     %%ymm10, %%ymm3, %%ymm3       \n"
#else
      "vbroadcasti128 0(%5), %%ymm4              \n"
      "vbroadcasti128 16(%5), %%ymm5             \n"
      "vbroadcasti128 32(%5), %%ymm6             \n"
      "vpshufb     %%ymm4, %%ymm1, %%ymm1        \n"
      "vpshufb     %%ymm5, %%ymm2, %%ymm2        \n"
      "vpshufb     %%ymm6, %%ymm3, %%ymm3        \n"
#endif
      "vmovdqu     %%ymm1,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "vmovdqu     %%ymm2,(%2)                   \n"
      "lea         0x20(%2),%2                   \n"
      "vmovdqu     %%ymm3,(%3)                   \n"
      "lea         0x20(%3),%3                   \n"
      "sub         $0x20,%4                      \n"
      "jg          1b                            \n"
      : "+r"(src_rgb),                  // %0
        "+r"(dst_r),                    // %1
        "+r"(dst_g),                    // %2
        "+r"(dst_b),                    // %3
        "+r"(width)                     // %4
      : "r"(&kSplitRGBShuffleSSE41[0])  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7"
#if defined(__x86_64__)
        ,
        "xmm8", "xmm9", "xmm10"
#endif
  );
}
5440
#endif  // HAS_SPLITRGBROW_AVX2
5441
5442
#ifdef HAS_MERGERGBROW_SSSE3
5443
// Shuffle table for converting Planar to RGB.
5444
// Shuffle table for converting Planar to RGB.
// Nine scatter vectors for MergeRGBRow_SSSE3. Each group of three rows
// distributes one plane's bytes into every third position of a 16-byte
// output chunk (rows 0-2 build output bytes 0-15, rows 3-5 bytes 16-31,
// rows 6-8 bytes 32-47). Index 128 yields zero so the three shuffled
// registers can be OR-combined per chunk.
static const uvec8 kMergeRGBShuffle[9] = {
    {0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u, 128u,
     128u, 5u},
    {128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u,
     128u, 128u},
    {128u, 128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u,
     4u, 128u},
    {128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u, 128u,
     10u, 128u},
    {5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u,
     128u, 10u},
    {128u, 5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u,
     128u, 128u},
    {128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u, 128u,
     15u, 128u, 128u},
    {128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u,
     128u, 15u, 128u},
    {10u, 128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u,
     128u, 128u, 15u}};
5463
5464
// Interleaves three planes into packed 24-bit RGB, 16 pixels (48 output
// bytes) per iteration. For each 16-byte output chunk, the three plane
// registers are scattered with pshufb (zeros elsewhere, see kMergeRGBShuffle)
// and ORed together.
void MergeRGBRow_SSSE3(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       uint8_t* dst_rgb,
                       int width) {
  asm volatile(
      "1:          \n"
      // Output bytes 0-15.
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      (%1),%%xmm1                   \n"
      "movdqu      (%2),%%xmm2                   \n"
      "pshufb      (%5), %%xmm0                  \n"
      "pshufb      16(%5), %%xmm1                \n"
      "pshufb      32(%5), %%xmm2                \n"
      "por         %%xmm1,%%xmm0                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%3)                   \n"

      // Output bytes 16-31.
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      (%1),%%xmm1                   \n"
      "movdqu      (%2),%%xmm2                   \n"
      "pshufb      48(%5), %%xmm0                \n"
      "pshufb      64(%5), %%xmm1                \n"
      "pshufb      80(%5), %%xmm2                \n"
      "por         %%xmm1,%%xmm0                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,16(%3)                 \n"

      // Output bytes 32-47.
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      (%1),%%xmm1                   \n"
      "movdqu      (%2),%%xmm2                   \n"
      "pshufb      96(%5), %%xmm0                \n"
      "pshufb      112(%5), %%xmm1               \n"
      "pshufb      128(%5), %%xmm2               \n"
      "por         %%xmm1,%%xmm0                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,32(%3)                 \n"

      "lea         0x10(%0),%0                   \n"
      "lea         0x10(%1),%1                   \n"
      "lea         0x10(%2),%2                   \n"
      "lea         0x30(%3),%3                   \n"
      "sub         $0x10,%4                      \n"
      "jg          1b                            \n"
      : "+r"(src_r),               // %0
        "+r"(src_g),               // %1
        "+r"(src_b),               // %2
        "+r"(dst_rgb),             // %3
        "+r"(width)                // %4
      : "r"(&kMergeRGBShuffle[0])  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
5515
#endif  // HAS_MERGERGBROW_SSSE3
5516
5517
#ifdef HAS_MERGEARGBROW_SSE2
5518
// Interleaves four planes into packed 32-bit pixels, 8 pixels per iteration.
// The g/b/a pointers are converted to offsets from src_r so only one source
// register advances. Channels are combined with two levels of byte
// interleave (punpcklbw then punpckl/hbw on the pairs).
void MergeARGBRow_SSE2(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       const uint8_t* src_a,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(
      "sub         %0,%1                         \n"  // %1 = src_g - src_r
      "sub         %0,%2                         \n"  // %2 = src_b - src_r
      "sub         %0,%3                         \n"  // %3 = src_a - src_r

      LABELALIGN
      "1:          \n"

      "movq        (%0,%2),%%xmm0                \n"  // B
      "movq        (%0),%%xmm1                   \n"  // R
      "movq        (%0,%1),%%xmm2                \n"  // G
      "punpcklbw   %%xmm1,%%xmm0                 \n"  // BR
      "movq        (%0,%3),%%xmm1                \n"  // A
      "punpcklbw   %%xmm1,%%xmm2                 \n"  // GA
      "movdqa      %%xmm0,%%xmm1                 \n"  // BR
      "punpckhbw   %%xmm2,%%xmm1                 \n"  // BGRA (hi)
      "punpcklbw   %%xmm2,%%xmm0                 \n"  // BGRA (lo)
      "movdqu      %%xmm0,(%4)                   \n"
      "movdqu      %%xmm1,16(%4)                 \n"

      "lea         8(%0),%0                      \n"
      "lea         32(%4),%4                     \n"
      "sub         $0x8,%5                       \n"
      "jg          1b                            \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_argb),  // %4
        "+r"(width)      // %5
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
5557
#endif  // HAS_MERGEARGBROW_SSE2
5558
5559
#ifdef HAS_MERGEXRGBROW_SSE2
5560
// Interleaves three planes into packed 32-bit pixels with a constant opaque
// alpha channel (pcmpeqd produces all-ones, i.e. 0xFF bytes), 8 pixels per
// iteration. Same interleave structure as MergeARGBRow_SSE2.
void MergeXRGBRow_SSE2(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(
      "1:          \n"

      "movq        (%2),%%xmm0                   \n"  // B
      "movq        (%0),%%xmm1                   \n"  // R
      "movq        (%1),%%xmm2                   \n"  // G
      "punpcklbw   %%xmm1,%%xmm0                 \n"  // BR
      "pcmpeqd     %%xmm1,%%xmm1                 \n"  // A(255)
      "punpcklbw   %%xmm1,%%xmm2                 \n"  // GA
      "movdqa      %%xmm0,%%xmm1                 \n"  // BR
      "punpckhbw   %%xmm2,%%xmm1                 \n"  // BGRA (hi)
      "punpcklbw   %%xmm2,%%xmm0                 \n"  // BGRA (lo)
      "movdqu      %%xmm0,(%3)                   \n"
      "movdqu      %%xmm1,16(%3)                 \n"

      "lea         8(%0),%0                      \n"
      "lea         8(%1),%1                      \n"
      "lea         8(%2),%2                      \n"
      "lea         32(%3),%3                     \n"
      "sub         $0x8,%4                       \n"
      "jg          1b                            \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_argb),  // %3
        "+r"(width)      // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
5594
#endif  // HAS_MERGEXRGBROW_SSE2
5595
5596
#ifdef HAS_MERGEARGBROW_AVX2
5597
// AVX2 merge of four planes into packed 32-bit pixels, 16 per iteration.
// Two planes are packed into each ymm via vinserti128, then byte and word
// interleaves build the pixels, with vperm2i128 fixing up the 128-bit lane
// ordering after each interleave stage.
void MergeARGBRow_AVX2(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       const uint8_t* src_a,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(
      "sub         %0,%1                         \n"  // %1 = src_g - src_r
      "sub         %0,%2                         \n"  // %2 = src_b - src_r
      "sub         %0,%3                         \n"  // %3 = src_a - src_r

      LABELALIGN
      "1:          \n"

      "vmovdqu     (%0,%2),%%xmm0                \n"  // B
      "vmovdqu     (%0,%1),%%xmm1                \n"  // R
      "vinserti128 $1,(%0),%%ymm0,%%ymm0         \n"  // G
      "vinserti128 $1,(%0,%3),%%ymm1,%%ymm1      \n"  // A
      "vpunpckhbw  %%ymm1,%%ymm0,%%ymm2          \n"
      "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"
      "vperm2i128  $0x31,%%ymm2,%%ymm0,%%ymm1    \n"
      "vperm2i128  $0x20,%%ymm2,%%ymm0,%%ymm0    \n"
      "vpunpckhwd  %%ymm1,%%ymm0,%%ymm2          \n"
      "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"
      "vperm2i128  $0x31,%%ymm2,%%ymm0,%%ymm1    \n"
      "vperm2i128  $0x20,%%ymm2,%%ymm0,%%ymm0    \n"
      "vmovdqu     %%ymm0,(%4)                   \n"  // First 8
      "vmovdqu     %%ymm1,32(%4)                 \n"  // Next 8

      "lea         16(%0),%0                     \n"
      "lea         64(%4),%4                     \n"
      "sub         $0x10,%5                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_argb),  // %4
        "+r"(width)      // %5
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
5640
#endif  // HAS_MERGEARGBROW_AVX2
5641
5642
#ifdef HAS_MERGEXRGBROW_AVX2
5643
// AVX2 merge of three planes into packed 32-bit pixels with constant opaque
// alpha, 16 pixels per iteration. vpcmpeqb fills the alpha lane with 0xFF;
// otherwise the interleave/lane-fixup structure matches MergeARGBRow_AVX2.
void MergeXRGBRow_AVX2(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(
      "1:          \n"

      "vmovdqu     (%2),%%xmm0                   \n"  // B
      "vpcmpeqb    %%ymm1,%%ymm1,%%ymm1          \n"  // A(255)
      "vinserti128 $0,(%1),%%ymm1,%%ymm1         \n"  // R
      "vinserti128 $1,(%0),%%ymm0,%%ymm0         \n"  // G
      "vpunpckhbw  %%ymm1,%%ymm0,%%ymm2          \n"
      "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"
      "vperm2i128  $0x31,%%ymm2,%%ymm0,%%ymm1    \n"
      "vperm2i128  $0x20,%%ymm2,%%ymm0,%%ymm0    \n"
      "vpunpckhwd  %%ymm1,%%ymm0,%%ymm2          \n"
      "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"
      "vperm2i128  $0x31,%%ymm2,%%ymm0,%%ymm1    \n"
      "vperm2i128  $0x20,%%ymm2,%%ymm0,%%ymm0    \n"
      "vmovdqu     %%ymm0,(%3)                   \n"  // First 8
      "vmovdqu     %%ymm1,32(%3)                 \n"  // Next 8

      "lea         16(%0),%0                     \n"
      "lea         16(%1),%1                     \n"
      "lea         16(%2),%2                     \n"
      "lea         64(%3),%3                     \n"
      "sub         $0x10,%4                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_argb),  // %3
        "+rm"(width)     // %4
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2");
}
5681
#endif  // HAS_MERGEXRGBROW_AVX2
5682
5683
#ifdef HAS_SPLITARGBROW_SSE2
5684
void SplitARGBRow_SSE2(const uint8_t* src_argb,
5685
                       uint8_t* dst_r,
5686
                       uint8_t* dst_g,
5687
                       uint8_t* dst_b,
5688
                       uint8_t* dst_a,
5689
0
                       int width) {
5690
0
  asm volatile(
5691
0
      "sub         %1,%2                         \n"
5692
0
      "sub         %1,%3                         \n"
5693
0
      "sub         %1,%4                         \n"
5694
5695
0
      LABELALIGN
5696
0
      "1:          \n"
5697
5698
0
      "movdqu      (%0),%%xmm0                   \n"  // 00-0F
5699
0
      "movdqu      16(%0),%%xmm1                 \n"  // 10-1F
5700
0
      "movdqa      %%xmm0,%%xmm2                 \n"
5701
0
      "punpcklqdq  %%xmm1,%%xmm0                 \n"  // 00-07 10-17
5702
0
      "punpckhqdq  %%xmm1,%%xmm2                 \n"  // 08-0F 18-1F
5703
0
      "movdqa      %%xmm0,%%xmm1                 \n"
5704
0
      "punpcklbw   %%xmm2,%%xmm0                 \n"  // 08192A3B4C5D6E7F (lo)
5705
0
      "punpckhbw   %%xmm2,%%xmm1                 \n"  // 08192A3B4C5D6E7F (hi)
5706
0
      "movdqa      %%xmm0,%%xmm2                 \n"
5707
0
      "punpcklqdq  %%xmm1,%%xmm0                 \n"  // 08192A3B08192A3B
5708
0
      "punpckhqdq  %%xmm1,%%xmm2                 \n"  // 4C5D6E7F4C5D6E7F
5709
0
      "movdqa      %%xmm0,%%xmm1                 \n"
5710
0
      "punpcklbw   %%xmm2,%%xmm0                 \n"  // 048C159D26AE37BF (lo)
5711
0
      "punpckhbw   %%xmm2,%%xmm1                 \n"  // 048C159D26AE37BF (hi)
5712
0
      "movdqa      %%xmm0,%%xmm2                 \n"
5713
0
      "punpckldq   %%xmm1,%%xmm0                 \n"  // 048C048C159D159D (BG)
5714
0
      "punpckhdq   %%xmm1,%%xmm2                 \n"  // 26AE26AE37BF37BF (RA)
5715
0
      "movlps      %%xmm0,(%1,%3)                \n"  // B
5716
0
      "movhps      %%xmm0,(%1,%2)                \n"  // G
5717
0
      "movlps      %%xmm2,(%1)                   \n"  // R
5718
0
      "movhps      %%xmm2,(%1,%4)                \n"  // A
5719
5720
0
      "lea         32(%0),%0                     \n"
5721
0
      "lea         8(%1),%1                      \n"
5722
0
      "sub         $0x8,%5                       \n"
5723
0
      "jg          1b                            \n"
5724
0
      : "+r"(src_argb),  // %0
5725
0
        "+r"(dst_r),     // %1
5726
0
        "+r"(dst_g),     // %2
5727
0
        "+r"(dst_b),     // %3
5728
0
        "+r"(dst_a),     // %4
5729
0
        "+rm"(width)     // %5
5730
0
      :
5731
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
5732
0
}
5733
#endif
5734
5735
#ifdef HAS_SPLITXRGBROW_SSE2
5736
void SplitXRGBRow_SSE2(const uint8_t* src_argb,
5737
                       uint8_t* dst_r,
5738
                       uint8_t* dst_g,
5739
                       uint8_t* dst_b,
5740
0
                       int width) {
5741
0
  asm volatile(
5742
0
      "1:          \n"
5743
5744
0
      "movdqu      (%0),%%xmm0                   \n"  // 00-0F
5745
0
      "movdqu      16(%0),%%xmm1                 \n"  // 10-1F
5746
0
      "movdqa      %%xmm0,%%xmm2                 \n"
5747
0
      "punpcklqdq  %%xmm1,%%xmm0                 \n"  // 00-07 10-17
5748
0
      "punpckhqdq  %%xmm1,%%xmm2                 \n"  // 08-0F 18-1F
5749
0
      "movdqa      %%xmm0,%%xmm1                 \n"
5750
0
      "punpcklbw   %%xmm2,%%xmm0                 \n"  // 08192A3B4C5D6E7F (lo)
5751
0
      "punpckhbw   %%xmm2,%%xmm1                 \n"  // 08192A3B4C5D6E7F (hi)
5752
0
      "movdqa      %%xmm0,%%xmm2                 \n"
5753
0
      "punpcklqdq  %%xmm1,%%xmm0                 \n"  // 08192A3B08192A3B
5754
0
      "punpckhqdq  %%xmm1,%%xmm2                 \n"  // 4C5D6E7F4C5D6E7F
5755
0
      "movdqa      %%xmm0,%%xmm1                 \n"
5756
0
      "punpcklbw   %%xmm2,%%xmm0                 \n"  // 048C159D26AE37BF (lo)
5757
0
      "punpckhbw   %%xmm2,%%xmm1                 \n"  // 048C159D26AE37BF (hi)
5758
0
      "movdqa      %%xmm0,%%xmm2                 \n"
5759
0
      "punpckldq   %%xmm1,%%xmm0                 \n"  // 048C048C159D159D (BG)
5760
0
      "punpckhdq   %%xmm1,%%xmm2                 \n"  // 26AE26AE37BF37BF (RA)
5761
0
      "movlps      %%xmm0,(%3)                   \n"  // B
5762
0
      "movhps      %%xmm0,(%2)                   \n"  // G
5763
0
      "movlps      %%xmm2,(%1)                   \n"  // R
5764
5765
0
      "lea         32(%0),%0                     \n"
5766
0
      "lea         8(%1),%1                      \n"
5767
0
      "lea         8(%2),%2                      \n"
5768
0
      "lea         8(%3),%3                      \n"
5769
0
      "sub         $0x8,%4                       \n"
5770
0
      "jg          1b                            \n"
5771
0
      : "+r"(src_argb),  // %0
5772
0
        "+r"(dst_r),     // %1
5773
0
        "+r"(dst_g),     // %2
5774
0
        "+r"(dst_b),     // %3
5775
0
        "+rm"(width)     // %4
5776
0
      :
5777
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
5778
0
}
5779
#endif
5780
5781
// pshufb mask that gathers every 4th byte of 4 ARGB pixels, converting
// pixel-interleaved bytes to channel-planar order within one 16-byte vector.
static const uvec8 kShuffleMaskARGBSplit = {0, 4, 8,  12, 1, 5, 9,  13,
                                            2, 6, 10, 14, 3, 7, 11, 15};
5783
#ifdef HAS_SPLITARGBROW_SSSE3
5784
void SplitARGBRow_SSSE3(const uint8_t* src_argb,
5785
                        uint8_t* dst_r,
5786
                        uint8_t* dst_g,
5787
                        uint8_t* dst_b,
5788
                        uint8_t* dst_a,
5789
0
                        int width) {
5790
0
  asm volatile(
5791
0
      "movdqa      %6,%%xmm3                     \n"
5792
0
      "sub         %1,%2                         \n"
5793
0
      "sub         %1,%3                         \n"
5794
0
      "sub         %1,%4                         \n"
5795
5796
0
      LABELALIGN
5797
0
      "1:          \n"
5798
5799
0
      "movdqu      (%0),%%xmm0                   \n"  // 00-0F
5800
0
      "movdqu      16(%0),%%xmm1                 \n"  // 10-1F
5801
0
      "pshufb      %%xmm3,%%xmm0                 \n"  // 048C159D26AE37BF (lo)
5802
0
      "pshufb      %%xmm3,%%xmm1                 \n"  // 048C159D26AE37BF (hi)
5803
0
      "movdqa      %%xmm0,%%xmm2                 \n"
5804
0
      "punpckldq   %%xmm1,%%xmm0                 \n"  // 048C048C159D159D (BG)
5805
0
      "punpckhdq   %%xmm1,%%xmm2                 \n"  // 26AE26AE37BF37BF (RA)
5806
0
      "movlps      %%xmm0,(%1,%3)                \n"  // B
5807
0
      "movhps      %%xmm0,(%1,%2)                \n"  // G
5808
0
      "movlps      %%xmm2,(%1)                   \n"  // R
5809
0
      "movhps      %%xmm2,(%1,%4)                \n"  // A
5810
5811
0
      "lea         32(%0),%0                     \n"
5812
0
      "lea         8(%1),%1                      \n"
5813
0
      "subl        $0x8,%5                       \n"
5814
0
      "jg          1b                            \n"
5815
0
      : "+r"(src_argb),  // %0
5816
0
        "+r"(dst_r),     // %1
5817
0
        "+r"(dst_g),     // %2
5818
0
        "+r"(dst_b),     // %3
5819
0
        "+r"(dst_a),     // %4
5820
#if defined(__i386__)
5821
        "+m"(width)  // %5
5822
#else
5823
0
        "+rm"(width)  // %5
5824
0
#endif
5825
0
      : "m"(kShuffleMaskARGBSplit)  // %6
5826
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
5827
0
}
5828
#endif
5829
5830
#ifdef HAS_SPLITXRGBROW_SSSE3
5831
void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
5832
                        uint8_t* dst_r,
5833
                        uint8_t* dst_g,
5834
                        uint8_t* dst_b,
5835
0
                        int width) {
5836
0
  asm volatile(
5837
0
      "movdqa      %5,%%xmm3                     \n"
5838
5839
0
      LABELALIGN
5840
0
      "1:          \n"
5841
5842
0
      "movdqu      (%0),%%xmm0                   \n"  // 00-0F
5843
0
      "movdqu      16(%0),%%xmm1                 \n"  // 10-1F
5844
0
      "pshufb      %%xmm3,%%xmm0                 \n"  // 048C159D26AE37BF (lo)
5845
0
      "pshufb      %%xmm3,%%xmm1                 \n"  // 048C159D26AE37BF (hi)
5846
0
      "movdqa      %%xmm0,%%xmm2                 \n"
5847
0
      "punpckldq   %%xmm1,%%xmm0                 \n"  // 048C048C159D159D (BG)
5848
0
      "punpckhdq   %%xmm1,%%xmm2                 \n"  // 26AE26AE37BF37BF (RA)
5849
0
      "movlps      %%xmm0,(%3)                   \n"  // B
5850
0
      "movhps      %%xmm0,(%2)                   \n"  // G
5851
0
      "movlps      %%xmm2,(%1)                   \n"  // R
5852
5853
0
      "lea         32(%0),%0                     \n"
5854
0
      "lea         8(%1),%1                      \n"
5855
0
      "lea         8(%2),%2                      \n"
5856
0
      "lea         8(%3),%3                      \n"
5857
0
      "sub         $0x8,%4                       \n"
5858
0
      "jg          1b                            \n"
5859
0
      : "+r"(src_argb),             // %0
5860
0
        "+r"(dst_r),                // %1
5861
0
        "+r"(dst_g),                // %2
5862
0
        "+r"(dst_b),                // %3
5863
0
        "+r"(width)                 // %4
5864
0
      : "m"(kShuffleMaskARGBSplit)  // %5
5865
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
5866
0
}
5867
#endif
5868
5869
#ifdef HAS_SPLITARGBROW_AVX2
5870
static const ulvec32 kShuffleMaskARGBPermute = {0, 4, 1, 5, 2, 6, 3, 7};
5871
void SplitARGBRow_AVX2(const uint8_t* src_argb,
5872
                       uint8_t* dst_r,
5873
                       uint8_t* dst_g,
5874
                       uint8_t* dst_b,
5875
                       uint8_t* dst_a,
5876
0
                       int width) {
5877
0
  asm volatile(
5878
0
      "sub         %1,%2                         \n"
5879
0
      "sub         %1,%3                         \n"
5880
0
      "sub         %1,%4                         \n"
5881
0
      "vmovdqa     %7,%%ymm3                     \n"
5882
0
      "vbroadcastf128 %6,%%ymm4                  \n"
5883
5884
0
      LABELALIGN
5885
0
      "1:          \n"
5886
5887
0
      "vmovdqu     (%0),%%xmm0                   \n"  // 00-0F
5888
0
      "vmovdqu     16(%0),%%xmm1                 \n"  // 10-1F
5889
0
      "vinserti128 $1,32(%0),%%ymm0,%%ymm0       \n"  // 00-0F 20-2F
5890
0
      "vinserti128 $1,48(%0),%%ymm1,%%ymm1       \n"  // 10-1F 30-3F
5891
0
      "vpshufb     %%ymm4,%%ymm0,%%ymm0          \n"
5892
0
      "vpshufb     %%ymm4,%%ymm1,%%ymm1          \n"
5893
0
      "vpermd      %%ymm0,%%ymm3,%%ymm0          \n"
5894
0
      "vpermd      %%ymm1,%%ymm3,%%ymm1          \n"
5895
0
      "vpunpckhdq  %%ymm1,%%ymm0,%%ymm2          \n"  // GA
5896
0
      "vpunpckldq  %%ymm1,%%ymm0,%%ymm0          \n"  // BR
5897
0
      "vmovdqu     %%xmm0,(%1,%3)                \n"  // B
5898
0
      "vextracti128 $1,%%ymm0,(%1)               \n"  // R
5899
0
      "vmovdqu     %%xmm2,(%1,%2)                \n"  // G
5900
0
      "vextracti128 $1,%%ymm2,(%1,%4)            \n"  // A
5901
0
      "lea         64(%0),%0                     \n"
5902
0
      "lea         16(%1),%1                     \n"
5903
0
      "subl        $0x10,%5                      \n"
5904
0
      "jg          1b                            \n"
5905
0
      "vzeroupper  \n"
5906
0
      : "+r"(src_argb),  // %0
5907
0
        "+r"(dst_r),     // %1
5908
0
        "+r"(dst_g),     // %2
5909
0
        "+r"(dst_b),     // %3
5910
0
        "+r"(dst_a),     // %4
5911
#if defined(__i386__)
5912
        "+m"(width)  // %5
5913
#else
5914
0
        "+rm"(width)  // %5
5915
0
#endif
5916
0
      : "m"(kShuffleMaskARGBSplit),   // %6
5917
0
        "m"(kShuffleMaskARGBPermute)  // %7
5918
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
5919
0
}
5920
#endif
5921
5922
#ifdef HAS_SPLITXRGBROW_AVX2
5923
void SplitXRGBRow_AVX2(const uint8_t* src_argb,
5924
                       uint8_t* dst_r,
5925
                       uint8_t* dst_g,
5926
                       uint8_t* dst_b,
5927
0
                       int width) {
5928
0
  asm volatile(
5929
0
      "vmovdqa     %6,%%ymm3                     \n"
5930
0
      "vbroadcastf128 %5,%%ymm4                  \n"
5931
5932
0
      LABELALIGN
5933
0
      "1:          \n"
5934
5935
0
      "vmovdqu     (%0),%%xmm0                   \n"  // 00-0F
5936
0
      "vmovdqu     16(%0),%%xmm1                 \n"  // 10-1F
5937
0
      "vinserti128 $1,32(%0),%%ymm0,%%ymm0       \n"  // 00-0F 20-2F
5938
0
      "vinserti128 $1,48(%0),%%ymm1,%%ymm1       \n"  // 10-1F 30-3F
5939
0
      "vpshufb     %%ymm4,%%ymm0,%%ymm0          \n"
5940
0
      "vpshufb     %%ymm4,%%ymm1,%%ymm1          \n"
5941
0
      "vpermd      %%ymm0,%%ymm3,%%ymm0          \n"
5942
0
      "vpermd      %%ymm1,%%ymm3,%%ymm1          \n"
5943
0
      "vpunpckhdq  %%ymm1,%%ymm0,%%ymm2          \n"  // GA
5944
0
      "vpunpckldq  %%ymm1,%%ymm0,%%ymm0          \n"  // BR
5945
0
      "vmovdqu     %%xmm0,(%3)                   \n"  // B
5946
0
      "vextracti128 $1,%%ymm0,(%1)               \n"  // R
5947
0
      "vmovdqu     %%xmm2,(%2)                   \n"  // G
5948
5949
0
      "lea         64(%0),%0                     \n"
5950
0
      "lea         16(%1),%1                     \n"
5951
0
      "lea         16(%2),%2                     \n"
5952
0
      "lea         16(%3),%3                     \n"
5953
0
      "sub         $0x10,%4                      \n"
5954
0
      "jg          1b                            \n"
5955
0
      "vzeroupper  \n"
5956
0
      : "+r"(src_argb),               // %0
5957
0
        "+r"(dst_r),                  // %1
5958
0
        "+r"(dst_g),                  // %2
5959
0
        "+r"(dst_b),                  // %3
5960
0
        "+r"(width)                   // %4
5961
0
      : "m"(kShuffleMaskARGBSplit),   // %5
5962
0
        "m"(kShuffleMaskARGBPermute)  // %6
5963
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
5964
0
}
5965
#endif
5966
5967
#ifdef HAS_MERGEXR30ROW_AVX2
// Merges planar 10-to-16-bit R, G and B rows into packed AR30
// (2-bit alpha forced opaque, 10:10:10 color), 16 pixels per iteration.
// depth is the source bit depth (>= 10); samples are shifted down to
// 10 bits and clamped to 0x3ff before packing.
void MergeXR30Row_AVX2(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       uint8_t* dst_ar30,
                       int depth,
                       int width) {
  int shift = depth - 10;  // down-shift to reach 10 significant bits
  asm volatile(
      // Address src_g/src_b relative to src_r so a single pointer advances.
      "sub         %0,%1                         \n"
      "sub         %0,%2                         \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
      "vpsrlw      $14,%%ymm5,%%ymm5             \n"
      "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
      "vpcmpeqb    %%ymm6,%%ymm6,%%ymm6          \n"
      "vpsrlw      $6,%%ymm6,%%ymm6              \n"  // 0x03ff clamp value
      "vmovd       %5,%%xmm4                     \n"  // variable shift count

      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     (%0,%1),%%ymm1                \n"
      "vmovdqu     (%0,%2),%%ymm2                \n"
      "vpsrlw      %%xmm4,%%ymm0,%%ymm0          \n"
      "vpsrlw      %%xmm4,%%ymm1,%%ymm1          \n"
      "vpsrlw      %%xmm4,%%ymm2,%%ymm2          \n"
      "vpminuw     %%ymm0,%%ymm6,%%ymm0          \n"  // clamp to 10 bits
      "vpminuw     %%ymm1,%%ymm6,%%ymm1          \n"
      "vpminuw     %%ymm2,%%ymm6,%%ymm2          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
      "vpermq      $0xd8,%%ymm2,%%ymm2           \n"
      "vpsllw      $0x4,%%ymm0,%%ymm0            \n"  // Shift R to target bit
      "vpunpckhwd  %%ymm0,%%ymm2,%%ymm3          \n"  // RB
      "vpunpcklwd  %%ymm0,%%ymm2,%%ymm0          \n"
      "vpunpckhwd  %%ymm5,%%ymm1,%%ymm2          \n"  // AG
      "vpunpcklwd  %%ymm5,%%ymm1,%%ymm1          \n"
      "vpslld      $0xa,%%ymm1,%%ymm1            \n"  // Shift AG to target bit
      "vpslld      $0xa,%%ymm2,%%ymm2            \n"
      "vpor        %%ymm1,%%ymm0,%%ymm0          \n"  // Combine
      "vpor        %%ymm2,%%ymm3,%%ymm3          \n"
      "vmovdqu     %%ymm0,(%3)                   \n"
      "vmovdqu     %%ymm3,0x20(%3)               \n"
      "lea         0x20(%0),%0                   \n"
      "lea         0x40(%3),%3                   \n"
      "sub         $0x10,%4                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_ar30),  // %3
        "+r"(width)      // %4
#if defined(__i386__)
      : "m"(shift)  // %5
#else
      : "rm"(shift)   // %5
#endif
      // Fix: ymm6 (the 10-bit clamp constant) was written by the asm but
      // missing from the clobber list, so the compiler could wrongly assume
      // xmm6/ymm6 survives this statement.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6");
}
#endif
6028
6029
#ifdef HAS_MERGEAR64ROW_AVX2
6030
static const lvec32 MergeAR64Permute = {0, 4, 2, 6, 1, 5, 3, 7};
6031
void MergeAR64Row_AVX2(const uint16_t* src_r,
6032
                       const uint16_t* src_g,
6033
                       const uint16_t* src_b,
6034
                       const uint16_t* src_a,
6035
                       uint16_t* dst_ar64,
6036
                       int depth,
6037
0
                       int width) {
6038
0
  int shift = 16 - depth;
6039
0
  int mask = (1 << depth) - 1;
6040
0
  mask = (mask << 16) + mask;
6041
0
  asm volatile(
6042
0
      "sub         %0,%1                         \n"
6043
0
      "sub         %0,%2                         \n"
6044
0
      "sub         %0,%3                         \n"
6045
0
      "vmovdqa     %8,%%ymm5                     \n"
6046
0
      "vmovd       %6,%%xmm6                     \n"
6047
0
      "vbroadcastss %7,%%ymm7                    \n"
6048
6049
0
      LABELALIGN
6050
0
      "1:          \n"
6051
0
      "vmovdqu     (%0),%%ymm0                   \n"  // R
6052
0
      "vmovdqu     (%0,%1),%%ymm1                \n"  // G
6053
0
      "vmovdqu     (%0,%2),%%ymm2                \n"  // B
6054
0
      "vmovdqu     (%0,%3),%%ymm3                \n"  // A
6055
0
      "vpminuw     %%ymm0,%%ymm7,%%ymm0          \n"
6056
0
      "vpminuw     %%ymm1,%%ymm7,%%ymm1          \n"
6057
0
      "vpminuw     %%ymm2,%%ymm7,%%ymm2          \n"
6058
0
      "vpminuw     %%ymm3,%%ymm7,%%ymm3          \n"
6059
0
      "vpsllw      %%xmm6,%%ymm0,%%ymm0          \n"
6060
0
      "vpsllw      %%xmm6,%%ymm1,%%ymm1          \n"
6061
0
      "vpsllw      %%xmm6,%%ymm2,%%ymm2          \n"
6062
0
      "vpsllw      %%xmm6,%%ymm3,%%ymm3          \n"
6063
0
      "vpermd      %%ymm0,%%ymm5,%%ymm0          \n"
6064
0
      "vpermd      %%ymm1,%%ymm5,%%ymm1          \n"
6065
0
      "vpermd      %%ymm2,%%ymm5,%%ymm2          \n"
6066
0
      "vpermd      %%ymm3,%%ymm5,%%ymm3          \n"
6067
0
      "vpunpcklwd  %%ymm1,%%ymm2,%%ymm4          \n"  // BG(low)
6068
0
      "vpunpckhwd  %%ymm1,%%ymm2,%%ymm1          \n"  // BG(hi)
6069
0
      "vpunpcklwd  %%ymm3,%%ymm0,%%ymm2          \n"  // RA(low)
6070
0
      "vpunpckhwd  %%ymm3,%%ymm0,%%ymm0          \n"  // RA(hi)
6071
0
      "vpunpckldq  %%ymm2,%%ymm4,%%ymm3          \n"  // BGRA(1)
6072
0
      "vpunpckhdq  %%ymm2,%%ymm4,%%ymm4          \n"  // BGRA(3)
6073
0
      "vpunpckldq  %%ymm0,%%ymm1,%%ymm2          \n"  // BGRA(2)
6074
0
      "vpunpckhdq  %%ymm0,%%ymm1,%%ymm1          \n"  // BGRA(4)
6075
0
      "vmovdqu     %%ymm3,(%4)                   \n"
6076
0
      "vmovdqu     %%ymm2,0x20(%4)               \n"
6077
0
      "vmovdqu     %%ymm4,0x40(%4)               \n"
6078
0
      "vmovdqu     %%ymm1,0x60(%4)               \n"
6079
0
      "lea         0x20(%0),%0                   \n"
6080
0
      "lea         0x80(%4),%4                   \n"
6081
0
      "subl        $0x10,%5                      \n"
6082
0
      "jg          1b                            \n"
6083
0
      "vzeroupper  \n"
6084
0
      : "+r"(src_r),     // %0
6085
0
        "+r"(src_g),     // %1
6086
0
        "+r"(src_b),     // %2
6087
0
        "+r"(src_a),     // %3
6088
0
        "+r"(dst_ar64),  // %4
6089
#if defined(__i386__)
6090
        "+m"(width)  // %5
6091
#else
6092
0
        "+rm"(width)  // %5
6093
0
#endif
6094
0
      : "m"(shift),            // %6
6095
0
        "m"(mask),             // %7
6096
0
        "m"(MergeAR64Permute)  // %8
6097
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
6098
0
        "xmm7");
6099
0
}
6100
#endif
6101
6102
#ifdef HAS_MERGEXR64ROW_AVX2
6103
void MergeXR64Row_AVX2(const uint16_t* src_r,
6104
                       const uint16_t* src_g,
6105
                       const uint16_t* src_b,
6106
                       uint16_t* dst_ar64,
6107
                       int depth,
6108
0
                       int width) {
6109
0
  int shift = 16 - depth;
6110
0
  int mask = (1 << depth) - 1;
6111
0
  mask = (mask << 16) + mask;
6112
0
  asm volatile(
6113
0
      "sub         %0,%1                         \n"
6114
0
      "sub         %0,%2                         \n"
6115
0
      "vmovdqa     %7,%%ymm5                     \n"
6116
0
      "vmovd       %5,%%xmm6                     \n"
6117
0
      "vbroadcastss %6,%%ymm7                    \n"
6118
6119
0
      LABELALIGN
6120
0
      "1:          \n"
6121
0
      "vmovdqu     (%0),%%ymm0                   \n"  // R
6122
0
      "vmovdqu     (%0,%1),%%ymm1                \n"  // G
6123
0
      "vmovdqu     (%0,%2),%%ymm2                \n"  // B
6124
0
      "vpminuw     %%ymm0,%%ymm7,%%ymm0          \n"
6125
0
      "vpminuw     %%ymm1,%%ymm7,%%ymm1          \n"
6126
0
      "vpminuw     %%ymm2,%%ymm7,%%ymm2          \n"
6127
0
      "vpsllw      %%xmm6,%%ymm0,%%ymm0          \n"
6128
0
      "vpsllw      %%xmm6,%%ymm1,%%ymm1          \n"
6129
0
      "vpsllw      %%xmm6,%%ymm2,%%ymm2          \n"
6130
0
      "vpermd      %%ymm0,%%ymm5,%%ymm0          \n"
6131
0
      "vpermd      %%ymm1,%%ymm5,%%ymm1          \n"
6132
0
      "vpermd      %%ymm2,%%ymm5,%%ymm2          \n"
6133
0
      "vpcmpeqb    %%ymm3,%%ymm3,%%ymm3          \n"  // A (0xffff)
6134
0
      "vpunpcklwd  %%ymm1,%%ymm2,%%ymm4          \n"  // BG(low)
6135
0
      "vpunpckhwd  %%ymm1,%%ymm2,%%ymm1          \n"  // BG(hi)
6136
0
      "vpunpcklwd  %%ymm3,%%ymm0,%%ymm2          \n"  // RA(low)
6137
0
      "vpunpckhwd  %%ymm3,%%ymm0,%%ymm0          \n"  // RA(hi)
6138
0
      "vpunpckldq  %%ymm2,%%ymm4,%%ymm3          \n"  // BGRA(1)
6139
0
      "vpunpckhdq  %%ymm2,%%ymm4,%%ymm4          \n"  // BGRA(3)
6140
0
      "vpunpckldq  %%ymm0,%%ymm1,%%ymm2          \n"  // BGRA(2)
6141
0
      "vpunpckhdq  %%ymm0,%%ymm1,%%ymm1          \n"  // BGRA(4)
6142
0
      "vmovdqu     %%ymm3,(%3)                   \n"
6143
0
      "vmovdqu     %%ymm2,0x20(%3)               \n"
6144
0
      "vmovdqu     %%ymm4,0x40(%3)               \n"
6145
0
      "vmovdqu     %%ymm1,0x60(%3)               \n"
6146
0
      "lea         0x20(%0),%0                   \n"
6147
0
      "lea         0x80(%3),%3                   \n"
6148
0
      "subl        $0x10,%4                      \n"
6149
0
      "jg          1b                            \n"
6150
0
      "vzeroupper  \n"
6151
0
      : "+r"(src_r),           // %0
6152
0
        "+r"(src_g),           // %1
6153
0
        "+r"(src_b),           // %2
6154
0
        "+r"(dst_ar64),        // %3
6155
0
        "+r"(width)            // %4
6156
0
      : "m"(shift),            // %5
6157
0
        "m"(mask),             // %6
6158
0
        "m"(MergeAR64Permute)  // %7
6159
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
6160
0
        "xmm7");
6161
0
}
6162
#endif
6163
6164
#ifdef HAS_MERGEARGB16TO8ROW_AVX2
6165
static const uvec8 MergeARGB16To8Shuffle = {0, 8,  1, 9,  2, 10, 3, 11,
6166
                                            4, 12, 5, 13, 6, 14, 7, 15};
6167
void MergeARGB16To8Row_AVX2(const uint16_t* src_r,
6168
                            const uint16_t* src_g,
6169
                            const uint16_t* src_b,
6170
                            const uint16_t* src_a,
6171
                            uint8_t* dst_argb,
6172
                            int depth,
6173
0
                            int width) {
6174
0
  int shift = depth - 8;
6175
0
  asm volatile(
6176
0
      "sub         %0,%1                         \n"
6177
0
      "sub         %0,%2                         \n"
6178
0
      "sub         %0,%3                         \n"
6179
0
      "vbroadcastf128 %7,%%ymm5                  \n"
6180
0
      "vmovd       %6,%%xmm6                     \n"
6181
6182
0
      LABELALIGN
6183
0
      "1:          \n"
6184
0
      "vmovdqu     (%0),%%ymm0                   \n"  // R
6185
0
      "vmovdqu     (%0,%1),%%ymm1                \n"  // G
6186
0
      "vmovdqu     (%0,%2),%%ymm2                \n"  // B
6187
0
      "vmovdqu     (%0,%3),%%ymm3                \n"  // A
6188
0
      "vpsrlw      %%xmm6,%%ymm0,%%ymm0          \n"
6189
0
      "vpsrlw      %%xmm6,%%ymm1,%%ymm1          \n"
6190
0
      "vpsrlw      %%xmm6,%%ymm2,%%ymm2          \n"
6191
0
      "vpsrlw      %%xmm6,%%ymm3,%%ymm3          \n"
6192
0
      "vpackuswb   %%ymm1,%%ymm2,%%ymm1          \n"  // BG (planar)
6193
0
      "vpackuswb   %%ymm3,%%ymm0,%%ymm0          \n"  // RA (planar)
6194
0
      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"  // BG (interleave)
6195
0
      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"  // RA (interleave)
6196
0
      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
6197
0
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
6198
0
      "vpunpcklwd  %%ymm0,%%ymm1,%%ymm2          \n"  // BGRA (low)
6199
0
      "vpunpckhwd  %%ymm0,%%ymm1,%%ymm0          \n"  // BGRA (hi)
6200
0
      "vmovdqu     %%ymm2,(%4)                   \n"
6201
0
      "vmovdqu     %%ymm0,0x20(%4)               \n"
6202
0
      "lea         0x20(%0),%0                   \n"
6203
0
      "lea         0x40(%4),%4                   \n"
6204
0
      "subl        $0x10,%5                      \n"
6205
0
      "jg          1b                            \n"
6206
0
      "vzeroupper  \n"
6207
0
      : "+r"(src_r),     // %0
6208
0
        "+r"(src_g),     // %1
6209
0
        "+r"(src_b),     // %2
6210
0
        "+r"(src_a),     // %3
6211
0
        "+r"(dst_argb),  // %4
6212
#if defined(__i386__)
6213
        "+m"(width)  // %5
6214
#else
6215
0
        "+rm"(width)  // %5
6216
0
#endif
6217
0
      : "m"(shift),                 // %6
6218
0
        "m"(MergeARGB16To8Shuffle)  // %7
6219
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
6220
0
}
6221
#endif
6222
6223
#ifdef HAS_MERGEXRGB16TO8ROW_AVX2
6224
void MergeXRGB16To8Row_AVX2(const uint16_t* src_r,
6225
                            const uint16_t* src_g,
6226
                            const uint16_t* src_b,
6227
                            uint8_t* dst_argb,
6228
                            int depth,
6229
0
                            int width) {
6230
0
  int shift = depth - 8;
6231
0
  asm volatile(
6232
0
      "sub         %0,%1                         \n"
6233
0
      "sub         %0,%2                         \n"
6234
0
      "vbroadcastf128 %6,%%ymm5                  \n"
6235
0
      "vmovd       %5,%%xmm6                     \n"
6236
0
      "vpcmpeqb    %%ymm3,%%ymm3,%%ymm3          \n"
6237
0
      "vpsrlw      $8,%%ymm3,%%ymm3              \n"  // A (0xff)
6238
6239
0
      LABELALIGN
6240
0
      "1:          \n"
6241
0
      "vmovdqu     (%0),%%ymm0                   \n"  // R
6242
0
      "vmovdqu     (%0,%1),%%ymm1                \n"  // G
6243
0
      "vmovdqu     (%0,%2),%%ymm2                \n"  // B
6244
0
      "vpsrlw      %%xmm6,%%ymm0,%%ymm0          \n"
6245
0
      "vpsrlw      %%xmm6,%%ymm1,%%ymm1          \n"
6246
0
      "vpsrlw      %%xmm6,%%ymm2,%%ymm2          \n"
6247
0
      "vpackuswb   %%ymm1,%%ymm2,%%ymm1          \n"  // BG (planar)
6248
0
      "vpackuswb   %%ymm3,%%ymm0,%%ymm0          \n"  // RA (planar)
6249
0
      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"  // BG (interleave)
6250
0
      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"  // RA (interleave)
6251
0
      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
6252
0
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
6253
0
      "vpunpcklwd  %%ymm0,%%ymm1,%%ymm2          \n"  // BGRA (low)
6254
0
      "vpunpckhwd  %%ymm0,%%ymm1,%%ymm0          \n"  // BGRA (hi)
6255
0
      "vmovdqu     %%ymm2,(%3)                   \n"
6256
0
      "vmovdqu     %%ymm0,0x20(%3)               \n"
6257
0
      "lea         0x20(%0),%0                   \n"
6258
0
      "lea         0x40(%3),%3                   \n"
6259
0
      "subl        $0x10,%4                      \n"
6260
0
      "jg          1b                            \n"
6261
0
      "vzeroupper  \n"
6262
0
      : "+r"(src_r),                // %0
6263
0
        "+r"(src_g),                // %1
6264
0
        "+r"(src_b),                // %2
6265
0
        "+r"(dst_argb),             // %3
6266
0
        "+r"(width)                 // %4
6267
0
      : "m"(shift),                 // %5
6268
0
        "m"(MergeARGB16To8Shuffle)  // %6
6269
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
6270
0
}
6271
#endif
6272
6273
#ifdef HAS_COPYROW_SSE2
6274
0
void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
6275
0
  asm volatile(
6276
0
      "test        $0xf,%0                       \n"
6277
0
      "jne         2f                            \n"
6278
0
      "test        $0xf,%1                       \n"
6279
0
      "jne         2f                            \n"
6280
6281
0
      LABELALIGN
6282
0
      "1:          \n"
6283
0
      "movdqa      (%0),%%xmm0                   \n"
6284
0
      "movdqa      0x10(%0),%%xmm1               \n"
6285
0
      "lea         0x20(%0),%0                   \n"
6286
0
      "movdqa      %%xmm0,(%1)                   \n"
6287
0
      "movdqa      %%xmm1,0x10(%1)               \n"
6288
0
      "lea         0x20(%1),%1                   \n"
6289
0
      "sub         $0x20,%2                      \n"
6290
0
      "jg          1b                            \n"
6291
0
      "jmp         9f                            \n"
6292
6293
0
      LABELALIGN
6294
0
      "2:          \n"
6295
0
      "movdqu      (%0),%%xmm0                   \n"
6296
0
      "movdqu      0x10(%0),%%xmm1               \n"
6297
0
      "lea         0x20(%0),%0                   \n"
6298
0
      "movdqu      %%xmm0,(%1)                   \n"
6299
0
      "movdqu      %%xmm1,0x10(%1)               \n"
6300
0
      "lea         0x20(%1),%1                   \n"
6301
0
      "sub         $0x20,%2                      \n"
6302
0
      "jg          2b                            \n"
6303
6304
0
      LABELALIGN "9:          \n"
6305
0
      : "+r"(src),   // %0
6306
0
        "+r"(dst),   // %1
6307
0
        "+r"(width)  // %2
6308
0
      :
6309
0
      : "memory", "cc", "xmm0", "xmm1");
6310
0
}
6311
#endif  // HAS_COPYROW_SSE2
6312
6313
#ifdef HAS_COPYROW_AVX
6314
0
void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
6315
0
  asm volatile(
6316
0
      "1:          \n"
6317
0
      "vmovdqu     (%0),%%ymm0                   \n"
6318
0
      "vmovdqu     0x20(%0),%%ymm1               \n"
6319
0
      "lea         0x40(%0),%0                   \n"
6320
0
      "vmovdqu     %%ymm0,(%1)                   \n"
6321
0
      "vmovdqu     %%ymm1,0x20(%1)               \n"
6322
0
      "lea         0x40(%1),%1                   \n"
6323
0
      "sub         $0x40,%2                      \n"
6324
0
      "jg          1b                            \n"
6325
0
      "vzeroupper  \n"
6326
0
      : "+r"(src),   // %0
6327
0
        "+r"(dst),   // %1
6328
0
        "+r"(width)  // %2
6329
0
      :
6330
0
      : "memory", "cc", "xmm0", "xmm1");
6331
0
}
6332
#endif  // HAS_COPYROW_AVX
6333
6334
#ifdef HAS_COPYROW_AVX512BW
6335
0
// Copies a row of bytes 128 at a time using unaligned 512-bit AVX-512
// loads/stores. width is in bytes (assumes a multiple of 128 — TODO confirm
// caller contract).
void CopyRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "1:          \n"
      "vmovups     (%0),%%zmm0                   \n"
      "vmovups     0x40(%0),%%zmm1               \n"
      "lea         0x80(%0),%0                   \n"
      "vmovups     %%zmm0,(%1)                   \n"
      "vmovups     %%zmm1,0x40(%1)               \n"
      "lea         0x80(%1),%1                   \n"
      "sub         $0x80,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"  // avoid AVX->SSE transition penalty in caller
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
6353
#endif  // HAS_COPYROW_AVX512BW
6354
6355
#ifdef HAS_COPYROW_ERMS
6356
// Multiple of 1.
6357
24.3k
// Copies width bytes with `rep movsb`, relying on Enhanced REP MOVSB
// microcode. Works for any width (multiple of 1). rep movsb requires the
// source in rsi (+S), destination in rdi (+D) and count in rcx (+c).
void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
  size_t width_tmp = (size_t)(width);
      asm volatile("rep         movsb                         \n"
               : "+S"(src),       // %0
                 "+D"(dst),       // %1
                 "+c"(width_tmp)  // %2
               :
               : "memory", "cc");
}
6366
#endif  // HAS_COPYROW_ERMS
6367
6368
#ifdef HAS_ARGBCOPYALPHAROW_SSE2
6369
// width in pixels
6370
0
// Copies only the alpha channel from src to dst, preserving dst's RGB.
// width is in pixels, processed 8 ARGB pixels (32 bytes) per iteration.
void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      // xmm0 = per-pixel alpha mask 0xFF000000, xmm1 = RGB mask 0x00FFFFFF.
      "pcmpeqb     %%xmm0,%%xmm0                 \n"
      "pslld       $0x18,%%xmm0                  \n"
      "pcmpeqb     %%xmm1,%%xmm1                 \n"
      "psrld       $0x8,%%xmm1                   \n"

      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm2                   \n"
      "movdqu      0x10(%0),%%xmm3               \n"
      "lea         0x20(%0),%0                   \n"
      "movdqu      (%1),%%xmm4                   \n"
      "movdqu      0x10(%1),%%xmm5               \n"
      // Keep alpha from src (xmm2/xmm3), RGB from dst (xmm4/xmm5), then merge.
      "pand        %%xmm0,%%xmm2                 \n"
      "pand        %%xmm0,%%xmm3                 \n"
      "pand        %%xmm1,%%xmm4                 \n"
      "pand        %%xmm1,%%xmm5                 \n"
      "por         %%xmm4,%%xmm2                 \n"
      "por         %%xmm5,%%xmm3                 \n"
      "movdqu      %%xmm2,(%1)                   \n"
      "movdqu      %%xmm3,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
6401
#endif  // HAS_ARGBCOPYALPHAROW_SSE2
6402
6403
#ifdef HAS_ARGBCOPYALPHAROW_AVX2
6404
// width in pixels
6405
0
// Copies only the alpha channel from src to dst, preserving dst's RGB.
// width is in pixels, processed 16 ARGB pixels (64 bytes) per iteration.
void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      // ymm0 = 0x00FFFFFF per pixel: vpblendvb selects dst bytes where the
      // mask's high bit is set (RGB) and src bytes elsewhere (alpha).
      "vpcmpeqb    %%ymm0,%%ymm0,%%ymm0          \n"
      "vpsrld      $0x8,%%ymm0,%%ymm0            \n"

      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm1                   \n"
      "vmovdqu     0x20(%0),%%ymm2               \n"
      "lea         0x40(%0),%0                   \n"
      "vpblendvb   %%ymm0,(%1),%%ymm1,%%ymm1     \n"
      "vpblendvb   %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
      "vmovdqu     %%ymm1,(%1)                   \n"
      "vmovdqu     %%ymm2,0x20(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
6429
#endif  // HAS_ARGBCOPYALPHAROW_AVX2
6430
6431
#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
6432
// width in pixels
6433
// Extracts the alpha byte of each ARGB pixel into a packed plane.
// width is in pixels, processed 8 pixels (32 bytes in, 8 bytes out) per
// iteration.
void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
                              uint8_t* dst_a,
                              int width) {
  asm volatile(
      "1:          \n"
      "movdqu      (%0), %%xmm0                  \n"
      "movdqu      0x10(%0), %%xmm1              \n"
      "lea         0x20(%0), %0                  \n"
      // Shift alpha (top byte of each dword) down, then pack dwords->words
      // ->bytes. Values are <= 255 so the signed pack cannot saturate.
      "psrld       $0x18, %%xmm0                 \n"
      "psrld       $0x18, %%xmm1                 \n"
      "packssdw    %%xmm1, %%xmm0                \n"
      "packuswb    %%xmm0, %%xmm0                \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1), %1                   \n"
      "sub         $0x8, %2                      \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_a),     // %1
        "+rm"(width)     // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
6455
#endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
6456
6457
#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
6458
// pshufb control that moves the alpha byte (offset 3) of each ARGB pixel to
// the low byte of its 32-bit lane; index 128 (high bit set) zeroes the rest.
static const uvec8 kShuffleAlphaShort_AVX2 = {
    3u,  128u, 128u, 128u, 7u,  128u, 128u, 128u,
    11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
6461
6462
// Extracts the alpha byte of each ARGB pixel into a packed plane.
// width is in pixels, processed 32 pixels (128 bytes in, 32 bytes out) per
// iteration. kPermdARGBToY_AVX (declared elsewhere) reorders the 64-bit
// lanes scrambled by the in-lane pack instructions.
void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
                              uint8_t* dst_a,
                              int width) {
  asm volatile(
      "vmovdqa     %3,%%ymm4                     \n"
      "vbroadcastf128 %4,%%ymm5                  \n"

      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0), %%ymm0                  \n"
      "vmovdqu     0x20(%0), %%ymm1              \n"
      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"  // vpsrld $0x18, %%ymm0
      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
      "vmovdqu     0x40(%0), %%ymm2              \n"
      "vmovdqu     0x60(%0), %%ymm3              \n"
      "lea         0x80(%0), %0                  \n"
      "vpackssdw   %%ymm1, %%ymm0, %%ymm0        \n"  // mutates
      "vpshufb     %%ymm5,%%ymm2,%%ymm2          \n"
      "vpshufb     %%ymm5,%%ymm3,%%ymm3          \n"
      "vpackssdw   %%ymm3, %%ymm2, %%ymm2        \n"  // mutates
      "vpackuswb   %%ymm2,%%ymm0,%%ymm0          \n"  // mutates.
      "vpermd      %%ymm0,%%ymm4,%%ymm0          \n"  // unmutate.
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20, %2                     \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_argb),               // %0
        "+r"(dst_a),                  // %1
        "+rm"(width)                  // %2
      : "m"(kPermdARGBToY_AVX),       // %3
        "m"(kShuffleAlphaShort_AVX2)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
6496
#endif  // HAS_ARGBEXTRACTALPHAROW_AVX2
6497
6498
#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
6499
// width in pixels
6500
0
// Copies a row of Y (one byte per pixel) into the alpha channel of a row of
// ARGB pixels, preserving the RGB bytes already in dst.
// width is in pixels, processed 8 pixels per iteration.
void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      // xmm0 = per-pixel alpha mask 0xFF000000, xmm1 = RGB mask 0x00FFFFFF.
      "pcmpeqb     %%xmm0,%%xmm0                 \n"
      "pslld       $0x18,%%xmm0                  \n"
      "pcmpeqb     %%xmm1,%%xmm1                 \n"
      "psrld       $0x8,%%xmm1                   \n"

      LABELALIGN
      "1:          \n"
      "movq        (%0),%%xmm2                   \n"
      "lea         0x8(%0),%0                    \n"
      // Widen each Y byte to the top byte of a dword (YY pairs -> dwords).
      // NOTE(review): punpckhwd reads the uninitialized high half of xmm3;
      // the resulting garbage bytes are discarded by the alpha-mask pand
      // below, which keeps only the bytes sourced from xmm2.
      "punpcklbw   %%xmm2,%%xmm2                 \n"
      "punpckhwd   %%xmm2,%%xmm3                 \n"
      "punpcklwd   %%xmm2,%%xmm2                 \n"
      "movdqu      (%1),%%xmm4                   \n"
      "movdqu      0x10(%1),%%xmm5               \n"
      "pand        %%xmm0,%%xmm2                 \n"
      "pand        %%xmm0,%%xmm3                 \n"
      "pand        %%xmm1,%%xmm4                 \n"
      "pand        %%xmm1,%%xmm5                 \n"
      "por         %%xmm4,%%xmm2                 \n"
      "por         %%xmm5,%%xmm3                 \n"
      "movdqu      %%xmm2,(%1)                   \n"
      "movdqu      %%xmm3,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
6533
#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
6534
6535
#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
6536
// width in pixels
6537
0
// Copies a row of Y (one byte per pixel) into the alpha channel of a row of
// ARGB pixels, preserving the RGB bytes already in dst.
// width is in pixels, processed 16 pixels per iteration.
void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      // ymm0 = 0x00FFFFFF per pixel: vpblendvb keeps dst's RGB bytes and
      // takes the (shifted) Y byte as alpha.
      "vpcmpeqb    %%ymm0,%%ymm0,%%ymm0          \n"
      "vpsrld      $0x8,%%ymm0,%%ymm0            \n"

      LABELALIGN
      "1:          \n"
      "vpmovzxbd   (%0),%%ymm1                   \n"
      "vpmovzxbd   0x8(%0),%%ymm2                \n"
      "lea         0x10(%0),%0                   \n"
      // Move each zero-extended Y byte to the alpha position (bits 31:24).
      "vpslld      $0x18,%%ymm1,%%ymm1           \n"
      "vpslld      $0x18,%%ymm2,%%ymm2           \n"
      "vpblendvb   %%ymm0,(%1),%%ymm1,%%ymm1     \n"
      "vpblendvb   %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
      "vmovdqu     %%ymm1,(%1)                   \n"
      "vmovdqu     %%ymm2,0x20(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
6563
#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
6564
6565
#ifdef HAS_SETROW_X86
6566
0
// Fills a row with byte v8 using `rep stosl` (4 bytes per store).
// Only width/4 dwords are written; any remaining 1-3 bytes are NOT set
// (callers handle the tail). rep stosl needs dst in rdi (+D), count in
// rcx (+c) and the value in eax ("a").
void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
  size_t width_tmp = (size_t)(width >> 2);
  const uint32_t v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
      asm volatile("rep         stosl                         \n"
               : "+D"(dst),       // %0
                 "+c"(width_tmp)  // %1
               : "a"(v32)         // %2
               : "memory", "cc");
}
6575
6576
0
// Fills width bytes with v8 using `rep stosb` (Enhanced REP STOSB).
// Works for any width. dst in rdi (+D), count in rcx (+c), value in al ("a").
void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
  size_t width_tmp = (size_t)(width);
      asm volatile("rep         stosb                         \n"
               : "+D"(dst),       // %0
                 "+c"(width_tmp)  // %1
               : "a"(v8)          // %2
               : "memory", "cc");
}
6584
6585
0
// Fills a row of ARGB pixels with the 32-bit value v32 using `rep stosl`.
// width is in pixels (one dword store per pixel).
void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
  size_t width_tmp = (size_t)(width);
      asm volatile("rep         stosl                         \n"
               : "+D"(dst_argb),  // %0
                 "+c"(width_tmp)  // %1
               : "a"(v32)         // %2
               : "memory", "cc");
}
6593
#endif  // HAS_SETROW_X86
6594
6595
#ifdef HAS_YUY2TOYROW_SSE2
6596
0
// Extracts the Y channel (even bytes) from a row of YUY2 into a packed
// Y plane. width is in pixels, processed 16 pixels (32 bytes) per iteration.
void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  asm volatile(
      // xmm5 = 0x00FF per word: keeps the even (Y) byte of each word.
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $0x8,%%xmm5                   \n"

      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
6619
6620
// Converts the chroma of two YUY2 rows to one interleaved UV row (NV12-style)
// by averaging the current row with the row at stride_yuy2, then keeping the
// odd (U/V) bytes. width is in pixels, 16 per iteration.
void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2,
                        int stride_yuy2,
                        uint8_t* dst_uv,
                        int width) {
  asm volatile(
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x00(%0,%3,1),%%xmm2          \n"  // next row
      "movdqu      0x10(%0,%3,1),%%xmm3          \n"
      "lea         0x20(%0),%0                   \n"
      "pavgb       %%xmm2,%%xmm0                 \n"  // vertical average
      "pavgb       %%xmm3,%%xmm1                 \n"
      // Shift the odd (chroma) byte of each word down, then pack.
      "psrlw       $0x8,%%xmm0                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_yuy2),               // %0
        "+r"(dst_uv),                 // %1
        "+r"(width)                   // %2
      : "r"((intptr_t)(stride_yuy2))  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
6646
6647
// Converts the chroma of two YUY2 rows to planar U and V, averaging the
// current row with the row at stride_yuy2. width is in pixels, 16 per
// iteration (8 U and 8 V bytes out).
void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
                      int stride_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      // xmm5 = 0x00FF per word (even-byte mask).
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $0x8,%%xmm5                   \n"
      // %2 becomes (dst_v - dst_u) so one register indexes both planes.
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x00(%0,%4,1),%%xmm2          \n"  // next row
      "movdqu      0x10(%0,%4,1),%%xmm3          \n"
      "lea         0x20(%0),%0                   \n"
      "pavgb       %%xmm2,%%xmm0                 \n"  // vertical average
      "pavgb       %%xmm3,%%xmm1                 \n"
      // Keep chroma bytes: U in even positions, V in odd, then split.
      "psrlw       $0x8,%%xmm0                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pand        %%xmm5,%%xmm0                 \n"  // U
      "packuswb    %%xmm0,%%xmm0                 \n"
      "psrlw       $0x8,%%xmm1                   \n"  // V
      "packuswb    %%xmm1,%%xmm1                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movq        %%xmm1,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_yuy2),               // %0
        "+r"(dst_u),                  // %1
        "+r"(dst_v),                  // %2
        "+r"(width)                   // %3
      : "r"((intptr_t)(stride_yuy2))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
6686
6687
// Extracts the chroma of a single YUY2 row to planar U and V (no vertical
// averaging, i.e. 4:2:2 subsampling). width is in pixels, 16 per iteration.
void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      // xmm5 = 0x00FF per word (even-byte mask).
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $0x8,%%xmm5                   \n"
      // %2 becomes (dst_v - dst_u) so one register indexes both planes.
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      // Keep chroma bytes, then split U (even) from V (odd).
      "psrlw       $0x8,%%xmm0                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pand        %%xmm5,%%xmm0                 \n"  // U
      "packuswb    %%xmm0,%%xmm0                 \n"
      "psrlw       $0x8,%%xmm1                   \n"  // V
      "packuswb    %%xmm1,%%xmm1                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movq        %%xmm1,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
6721
6722
0
// Extracts the Y channel (odd bytes) from a row of UYVY into a packed
// Y plane. width is in pixels, processed 16 pixels (32 bytes) per iteration.
void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  asm volatile(
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      // Shift the odd (Y) byte of each word down, then pack.
      "psrlw       $0x8,%%xmm0                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
6741
6742
// Converts the chroma of two UYVY rows to planar U and V, averaging the
// current row with the row at stride_uyvy. width is in pixels, 16 per
// iteration (8 U and 8 V bytes out).
void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
                      int stride_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      // xmm5 = 0x00FF per word (even-byte mask; chroma is in even bytes).
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $0x8,%%xmm5                   \n"
      // %2 becomes (dst_v - dst_u) so one register indexes both planes.
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x00(%0,%4,1),%%xmm2          \n"  // next row
      "movdqu      0x10(%0,%4,1),%%xmm3          \n"
      "lea         0x20(%0),%0                   \n"
      "pavgb       %%xmm2,%%xmm0                 \n"  // vertical average
      "pavgb       %%xmm3,%%xmm1                 \n"
      // Keep chroma bytes, then split U (even) from V (odd).
      "pand        %%xmm5,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pand        %%xmm5,%%xmm0                 \n"  // U
      "packuswb    %%xmm0,%%xmm0                 \n"
      "psrlw       $0x8,%%xmm1                   \n"  // V
      "packuswb    %%xmm1,%%xmm1                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movq        %%xmm1,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_uyvy),               // %0
        "+r"(dst_u),                  // %1
        "+r"(dst_v),                  // %2
        "+r"(width)                   // %3
      : "r"((intptr_t)(stride_uyvy))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
6781
6782
// Extracts the chroma of a single UYVY row to planar U and V (no vertical
// averaging, i.e. 4:2:2 subsampling). width is in pixels, 16 per iteration.
void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      // xmm5 = 0x00FF per word (even-byte mask; chroma is in even bytes).
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $0x8,%%xmm5                   \n"
      // %2 becomes (dst_v - dst_u) so one register indexes both planes.
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      // Keep chroma bytes, then split U (even) from V (odd).
      "pand        %%xmm5,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pand        %%xmm5,%%xmm0                 \n"  // U
      "packuswb    %%xmm0,%%xmm0                 \n"
      "psrlw       $0x8,%%xmm1                   \n"  // V
      "packuswb    %%xmm1,%%xmm1                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movq        %%xmm1,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
6816
#endif  // HAS_YUY2TOYROW_SSE2
6817
6818
#ifdef HAS_YUY2TOYROW_AVX2
6819
0
// Extracts the Y channel (even bytes) from a row of YUY2 into a packed
// Y plane. width is in pixels, processed 32 pixels (64 bytes) per iteration.
void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  asm volatile(
      // ymm5 = 0x00FF per word (even-byte mask).
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"

      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
      "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // undo in-lane pack order
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
6844
6845
// Converts the chroma of two YUY2 rows to one interleaved UV row (NV12-style)
// by averaging the current row with the row at stride_yuy2, then keeping the
// odd (U/V) bytes. width is in pixels, 32 per iteration.
void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2,
                        int stride_yuy2,
                        uint8_t* dst_uv,
                        int width) {
  asm volatile(
      "1:          \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vpavgb      0x00(%0,%3,1),%%ymm0,%%ymm0   \n"  // average with next row
      "vpavgb      0x20(%0,%3,1),%%ymm1,%%ymm1   \n"
      "lea         0x40(%0),%0                   \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"  // keep chroma bytes
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // undo in-lane pack order
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_yuy2),               // %0
        "+r"(dst_uv),                 // %1
        "+r"(width)                   // %2
      : "r"((intptr_t)(stride_yuy2))  // %3
      : "memory", "cc", "xmm0", "xmm1");
}
6871
6872
// Converts the chroma of two YUY2 rows to planar U and V, averaging the
// current row with the row at stride_yuy2. width is in pixels, 32 per
// iteration (16 U and 16 V bytes out).
void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
                      int stride_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      // ymm5 = 0x00FF per word (even-byte mask).
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
      // %2 becomes (dst_v - dst_u) so one register indexes both planes.
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"  // average with next row
      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
      "lea         0x40(%0),%0                   \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"  // keep chroma bytes
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // undo in-lane pack order
      "vpand       %%ymm5,%%ymm0,%%ymm1          \n"  // U (even bytes)
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"  // V (odd bytes)
      "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vextractf128 $0x0,%%ymm1,(%1)             \n"
      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_yuy2),               // %0
        "+r"(dst_u),                  // %1
        "+r"(dst_v),                  // %2
        "+r"(width)                   // %3
      : "r"((intptr_t)(stride_yuy2))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
6912
6913
// Extracts the chroma of a single YUY2 row to planar U and V (no vertical
// averaging, i.e. 4:2:2 subsampling). width is in pixels, 32 per iteration.
void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      // ymm5 = 0x00FF per word (even-byte mask).
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
      // %2 becomes (dst_v - dst_u) so one register indexes both planes.
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"  // keep chroma bytes
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // undo in-lane pack order
      "vpand       %%ymm5,%%ymm0,%%ymm1          \n"  // U (even bytes)
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"  // V (odd bytes)
      "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vextractf128 $0x0,%%ymm1,(%1)             \n"
      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
6950
6951
0
// Extracts the Y channel (odd bytes) from a row of UYVY into a packed
// Y plane. width is in pixels, processed 32 pixels (64 bytes) per iteration.
void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  asm volatile(
      "1:          \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"  // keep Y (odd bytes)
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // undo in-lane pack order
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      // NOTE(review): xmm5 is listed as clobbered but not used in this body.
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
6972
// Converts the chroma of two UYVY rows to planar U and V, averaging the
// current row with the row at stride_uyvy. width is in pixels, 32 per
// iteration (16 U and 16 V bytes out).
void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
                      int stride_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      // ymm5 = 0x00FF per word (even-byte mask; chroma is in even bytes).
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
      // %2 becomes (dst_v - dst_u) so one register indexes both planes.
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"  // average with next row
      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
      "lea         0x40(%0),%0                   \n"
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"  // keep chroma bytes
      "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // undo in-lane pack order
      "vpand       %%ymm5,%%ymm0,%%ymm1          \n"  // U (even bytes)
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"  // V (odd bytes)
      "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vextractf128 $0x0,%%ymm1,(%1)             \n"
      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_uyvy),               // %0
        "+r"(dst_u),                  // %1
        "+r"(dst_v),                  // %2
        "+r"(width)                   // %3
      : "r"((intptr_t)(stride_uyvy))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
7012
7013
// Extract U and V from a single row of UYVY (no vertical averaging);
// otherwise identical to UYVYToUVRow_AVX2.  Each iteration consumes 64
// bytes (32 pixels) and writes 16 U + 16 V bytes.
void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      // ymm5 = 0x00ff per word: keeps even-index (chroma) bytes.
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
      // Turn dst_v into (dst_v - dst_u) so %2 serves as an offset from %1.
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      // Keep chroma bytes, pack to interleaved UV, fix vpackuswb lane order.
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
      "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      // Split interleaved UV into U (even bytes) and V (odd bytes).
      "vpand       %%ymm5,%%ymm0,%%ymm1          \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vextractf128 $0x0,%%ymm1,(%1)             \n"
      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
7050
#endif  // HAS_YUY2TOYROW_AVX2
7051
7052
#ifdef HAS_ARGBBLENDROW_SSSE3
7053
// Shuffle table for isolating alpha: broadcasts each pixel's alpha byte
// (source offsets 3/7/11/15) into the low byte of its four 16-bit channel
// lanes; the 0x80 entries zero the high bytes.
static const uvec8 kShuffleAlpha = {3u,  0x80, 3u,  0x80, 7u,  0x80, 7u,  0x80,
                                    11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
7056
7057
// Blend 8 pixels at a time
7058
// Alpha-blend src_argb over src_argb1 into dst_argb:
//   dst = src + back * (256 - src_alpha) / 256
// with the destination alpha forced to 255.  Main loop handles 4 pixels;
// a 1-pixel tail loop handles width % 4.
void ARGBBlendRow_SSSE3(const uint8_t* src_argb,
                        const uint8_t* src_argb1,
                        uint8_t* dst_argb,
                        int width) {
  asm volatile(
      // xmm7 = 0x0001 words (turns 255-a into 256-a), xmm6 = 0x00ff word
      // mask (even bytes), xmm5 = 0xff00 word mask (odd bytes),
      // xmm4 = 0xff000000 (the alpha byte of each pixel).
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $0xf,%%xmm7                   \n"
      "pcmpeqb     %%xmm6,%%xmm6                 \n"
      "psrlw       $0x8,%%xmm6                   \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psllw       $0x8,%%xmm5                   \n"
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "pslld       $0x18,%%xmm4                  \n"
      "sub         $0x4,%3                       \n"
      "jl          49f                           \n"

      // 4 pixel loop.
      LABELALIGN
      "40:         \n"
      "movdqu      (%0),%%xmm3                   \n"
      "lea         0x10(%0),%0                   \n"
      "movdqa      %%xmm3,%%xmm0                 \n"
      // Flip the alpha byte (255 - a), broadcast it over the channel words
      // via kShuffleAlpha, then +1 -> (256 - a).
      "pxor        %%xmm4,%%xmm3                 \n"
      "movdqu      (%1),%%xmm2                   \n"
      "pshufb      %4,%%xmm3                     \n"
      "pand        %%xmm6,%%xmm2                 \n"
      "paddw       %%xmm7,%%xmm3                 \n"
      // Scale the background's even and odd bytes by (256-a), >> 8, and add
      // to the source with unsigned saturation.
      "pmullw      %%xmm3,%%xmm2                 \n"
      "movdqu      (%1),%%xmm1                   \n"
      "lea         0x10(%1),%1                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "por         %%xmm4,%%xmm0                 \n"  // force opaque alpha
      "pmullw      %%xmm3,%%xmm1                 \n"
      "psrlw       $0x8,%%xmm2                   \n"
      "paddusb     %%xmm2,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm1                 \n"
      "paddusb     %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jge         40b                           \n"

      "49:         \n"
      // Adjust the counter so the 1-pixel tail runs width % 4 times.
      "add         $0x3,%3                       \n"
      "jl          99f                           \n"

      // 1 pixel loop (same math as above on a single pixel).
      "91:         \n"
      "movd        (%0),%%xmm3                   \n"
      "lea         0x4(%0),%0                    \n"
      "movdqa      %%xmm3,%%xmm0                 \n"
      "pxor        %%xmm4,%%xmm3                 \n"
      "movd        (%1),%%xmm2                   \n"
      "pshufb      %4,%%xmm3                     \n"
      "pand        %%xmm6,%%xmm2                 \n"
      "paddw       %%xmm7,%%xmm3                 \n"
      "pmullw      %%xmm3,%%xmm2                 \n"
      "movd        (%1),%%xmm1                   \n"
      "lea         0x4(%1),%1                    \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "por         %%xmm4,%%xmm0                 \n"
      "pmullw      %%xmm3,%%xmm1                 \n"
      "psrlw       $0x8,%%xmm2                   \n"
      "paddusb     %%xmm2,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm1                 \n"
      "paddusb     %%xmm1,%%xmm0                 \n"
      "movd        %%xmm0,(%2)                   \n"
      "lea         0x4(%2),%2                    \n"
      "sub         $0x1,%3                       \n"
      "jge         91b                           \n"
      "99:         \n"
      : "+r"(src_argb),     // %0
        "+r"(src_argb1),    // %1
        "+r"(dst_argb),     // %2
        "+r"(width)         // %3
      : "m"(kShuffleAlpha)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
7137
#endif  // HAS_ARGBBLENDROW_SSSE3
7138
7139
#ifdef HAS_BLENDPLANEROW_SSSE3
7140
// Blend 8 pixels at a time.
7141
// unsigned version of math
7142
// =((A2*C2)+(B2*(255-C2))+255)/256
7143
// signed version of math
7144
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
7145
// Blend two planes by an 8-bit alpha plane, 8 pixels per iteration:
//   dst = (src0 * a + src1 * (255 - a) + 255) / 256
// computed in the signed form shown above the function so pmaddubsw can do
// both multiplies in one instruction.
void BlendPlaneRow_SSSE3(const uint8_t* src0,
                         const uint8_t* src1,
                         const uint8_t* alpha,
                         uint8_t* dst,
                         int width) {
  asm volatile(
      // xmm5 = 0xff00 per word (flips the high copy of the duplicated
      // alpha), xmm6 = 0x80 per byte (signed bias), xmm7 = 0x807f per word
      // (32768 + 127: bias correction plus rounding).
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psllw       $0x8,%%xmm5                   \n"
      "mov         $0x80808080,%%eax             \n"
      "movd        %%eax,%%xmm6                  \n"
      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
      "mov         $0x807f807f,%%eax             \n"
      "movd        %%eax,%%xmm7                  \n"
      "pshufd      $0x0,%%xmm7,%%xmm7            \n"
      // Rebase src0/src1/dst relative to the alpha pointer so %2 indexes all.
      "sub         %2,%0                         \n"
      "sub         %2,%1                         \n"
      "sub         %2,%3                         \n"

      // 8 pixel loop.
      LABELALIGN
      "1:          \n"
      // Duplicate each alpha byte and flip the odd copy: word = (a, 255-a).
      "movq        (%2),%%xmm0                   \n"
      "punpcklbw   %%xmm0,%%xmm0                 \n"
      "pxor        %%xmm5,%%xmm0                 \n"
      // Interleave src0/src1 bytes and bias them into signed range.
      "movq        (%0,%2,1),%%xmm1              \n"
      "movq        (%1,%2,1),%%xmm2              \n"
      "punpcklbw   %%xmm2,%%xmm1                 \n"
      "psubb       %%xmm6,%%xmm1                 \n"
      // word = a*(s0-128) + (255-a)*(s1-128); then un-bias, round, >> 8.
      "pmaddubsw   %%xmm1,%%xmm0                 \n"
      "paddw       %%xmm7,%%xmm0                 \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movq        %%xmm0,(%3,%2,1)              \n"
      "lea         0x8(%2),%2                    \n"
      "sub         $0x8,%4                       \n"
      "jg          1b                            \n"
      : "+r"(src0),   // %0
        "+r"(src1),   // %1
        "+r"(alpha),  // %2
        "+r"(dst),    // %3
        "+rm"(width)  // %4
        ::"memory",
        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
}
7189
#endif  // HAS_BLENDPLANEROW_SSSE3
7190
7191
#ifdef HAS_BLENDPLANEROW_AVX2
7192
// Blend 32 pixels at a time.
7193
// unsigned version of math
7194
// =((A2*C2)+(B2*(255-C2))+255)/256
7195
// signed version of math
7196
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
7197
// Blend two planes by an 8-bit alpha plane, 32 pixels per iteration.
// AVX2 version of BlendPlaneRow_SSSE3; same signed pmaddubsw formulation:
//   dst = (src0 * a + src1 * (255 - a) + 255) / 256
void BlendPlaneRow_AVX2(const uint8_t* src0,
                        const uint8_t* src1,
                        const uint8_t* alpha,
                        uint8_t* dst,
                        int width) {
  asm volatile(
      // ymm5 = 0xff00 per word, ymm6 = 0x80 bias bytes, ymm7 = 0x807f words
      // (32768 + 127: bias correction plus rounding).
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsllw      $0x8,%%ymm5,%%ymm5            \n"
      "mov         $0x80808080,%%eax             \n"
      "vmovd       %%eax,%%xmm6                  \n"
      "vbroadcastss %%xmm6,%%ymm6                \n"
      "mov         $0x807f807f,%%eax             \n"
      "vmovd       %%eax,%%xmm7                  \n"
      "vbroadcastss %%xmm7,%%ymm7                \n"
      // Rebase src0/src1/dst relative to the alpha pointer so %2 indexes all.
      "sub         %2,%0                         \n"
      "sub         %2,%1                         \n"
      "sub         %2,%3                         \n"

      // 32 pixel loop.
      LABELALIGN
      "1:          \n"
      // Duplicate alpha bytes (low/high unpack halves) and flip the odd
      // copies: each word becomes (a, 255-a).
      "vmovdqu     (%2),%%ymm0                   \n"
      "vpunpckhbw  %%ymm0,%%ymm0,%%ymm3          \n"
      "vpunpcklbw  %%ymm0,%%ymm0,%%ymm0          \n"
      "vpxor       %%ymm5,%%ymm3,%%ymm3          \n"
      "vpxor       %%ymm5,%%ymm0,%%ymm0          \n"
      // Interleave src0/src1 bytes and bias them into signed range.
      "vmovdqu     (%0,%2,1),%%ymm1              \n"
      "vmovdqu     (%1,%2,1),%%ymm2              \n"
      "vpunpckhbw  %%ymm2,%%ymm1,%%ymm4          \n"
      "vpunpcklbw  %%ymm2,%%ymm1,%%ymm1          \n"
      "vpsubb      %%ymm6,%%ymm4,%%ymm4          \n"
      "vpsubb      %%ymm6,%%ymm1,%%ymm1          \n"
      // word = a*(s0-128) + (255-a)*(s1-128); then un-bias, round, >> 8.
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "vpmaddubsw  %%ymm1,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm7,%%ymm3,%%ymm3          \n"
      "vpaddw      %%ymm7,%%ymm0,%%ymm0          \n"
      "vpsrlw      $0x8,%%ymm3,%%ymm3            \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpackuswb   %%ymm3,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%3,%2,1)              \n"
      "lea         0x20(%2),%2                   \n"
      "sub         $0x20,%4                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src0),   // %0
        "+r"(src1),   // %1
        "+r"(alpha),  // %2
        "+r"(dst),    // %3
        "+rm"(width)  // %4
        ::"memory",
        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
7250
#endif  // HAS_BLENDPLANEROW_AVX2
7251
7252
#ifdef HAS_ARGBATTENUATEROW_SSSE3
7253
// Shuffle table duplicating alpha: after the pixel bytes are widened to
// words, byte 6 (resp. 14) holds a pixel's alpha value; the table copies
// it into the low byte of the B, G and R word lanes.  -128 entries zero
// their destination bytes, so the alpha word lane becomes 0 (and is not
// attenuated when multiplied).
static const vec8 kAttenuateShuffle = {6,    -128, 6,    -128, 6,  -128,
                                       -128, -128, 14,   -128, 14, -128,
                                       14,   -128, -128, -128};
7257
7258
// Attenuate 4 pixels at a time.
7259
// Attenuate (premultiply) 4 ARGB pixels per iteration:
//   B/G/R become (c * a + 255) >> 8; the alpha byte is copied unchanged.
void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"  // alpha-dup shuffle
      // xmm5 = 0xff000000 per pixel (alpha byte mask).
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "pslld       $0x18,%%xmm5                  \n"
      // xmm7 = 0x00ff per word: the "+255" rounding term.
      "pxor        %%xmm6,%%xmm6                 \n"
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "punpcklbw   %%xmm6,%%xmm7                 \n"
      // Keep dst as an offset from src so only one pointer is advanced.
      "sub         %0,%1                         \n"

      // 4 pixel loop.
      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm6                   \n"
      "movdqa      %%xmm6,%%xmm0                 \n"
      "movdqa      %%xmm6,%%xmm1                 \n"
      // Widen bytes to words.  The xmm5 mask bytes land in the high byte of
      // the alpha lane, which is harmless: that lane is multiplied by zero.
      "punpcklbw   %%xmm5,%%xmm0                 \n"
      "punpckhbw   %%xmm5,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "movdqa      %%xmm1,%%xmm3                 \n"
      "pshufb      %%xmm4,%%xmm2                 \n"  // a,a,a,0
      "pshufb      %%xmm4,%%xmm3                 \n"
      "pmullw      %%xmm2,%%xmm0                 \n"  // rgb * alpha
      "pmullw      %%xmm3,%%xmm1                 \n"
      "paddw       %%xmm7,%%xmm0                 \n"  // + 255
      "paddw       %%xmm7,%%xmm1                 \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      // Re-insert the original alpha bytes.
      "pand        %%xmm5,%%xmm6                 \n"
      "por         %%xmm6,%%xmm0                 \n"
      "movdqu      %%xmm0,(%0,%1)                \n"
      "lea         0x10(%0),%0                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),         // %0
        "+r"(dst_argb),         // %1
        "+r"(width)             // %2
      : "m"(kAttenuateShuffle)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
7303
#endif  // HAS_ARGBATTENUATEROW_SSSE3
7304
7305
#ifdef HAS_ARGBATTENUATEROW_AVX2
7306
7307
// Shuffle table duplicating alpha (AVX2, per 128-bit lane): copies the
// alpha value (byte 6/14 of the word-widened pixels, and 22/30 in the high
// half) into the B, G and R word lanes; -128 entries zero their bytes so
// the alpha word lane becomes 0.
static const lvec8 kAttenuateShuffle_AVX2 = {
    6,    -128, 6,    -128, 6,    -128, -128, -128, 14,   -128, 14,
    -128, 14,   -128, -128, -128, 22,   -128, 22,   -128, 22,   -128,
    -128, -128, 30,   -128, 30,   -128, 30,   -128, -128, -128};
7312
7313
// Attenuate 8 pixels at a time.
7314
// Attenuate (premultiply) 8 ARGB pixels per iteration:
//   B/G/R become (c * a + 255) >> 8; alpha is passed through unchanged.
// AVX2 version of ARGBAttenuateRow_SSSE3.
void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           int width) {
  asm volatile(
      "vmovdqa     %3,%%ymm4                     \n"  // alpha-dup shuffle
      // ymm5 = 0xff000000 per pixel (alpha byte mask).
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpslld      $0x18,%%ymm5,%%ymm5           \n"
      // ymm7 = 0x00ff per word: the "+255" rounding term.
      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"
      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"
      "vpunpcklbw  %%ymm6,%%ymm7,%%ymm7          \n"
      // Keep dst as an offset from src so only one pointer is advanced.
      "sub         %0,%1                         \n"

      // 8 pixel loop.
      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm6                   \n"
      // Widen bytes to words (alpha lanes gain a 0xff high byte; harmless,
      // they multiply by zero), duplicate alpha over B/G/R, multiply,
      // round, shift and repack.
      "vpunpcklbw  %%ymm5,%%ymm6,%%ymm0          \n"
      "vpunpckhbw  %%ymm5,%%ymm6,%%ymm1          \n"
      "vpshufb     %%ymm4,%%ymm0,%%ymm2          \n"
      "vpshufb     %%ymm4,%%ymm1,%%ymm3          \n"
      "vpmullw     %%ymm2,%%ymm0,%%ymm0          \n"
      "vpmullw     %%ymm3,%%ymm1,%%ymm1          \n"
      "vpaddw      %%ymm7,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm7,%%ymm1,%%ymm1          \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      // Restore the original alpha bytes.
      "vpand       %%ymm5,%%ymm6,%%ymm1          \n"
      "vpor        %%ymm1,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,0x00(%0,%1,1)          \n"
      "lea         0x20(%0),%0                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_argb),              // %0
        "+r"(dst_argb),              // %1
        "+r"(width)                  // %2
      : "m"(kAttenuateShuffle_AVX2)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
7355
#endif  // HAS_ARGBATTENUATEROW_AVX2
7356
7357
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
7358
// Unattenuate 4 pixels at a time.
7359
// Unattenuate 4 ARGB pixels per iteration: scale B/G/R back up using a
// per-alpha fixed-point reciprocal looked up from fixed_invtbl8 (one
// 4-byte entry per alpha value, addressed as (%4,%3,4)).  Pixel bytes are
// duplicated into both halves of each word so pmulhuw by the reciprocal
// yields the rescaled channel directly.
void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             int width) {
  uintptr_t alpha;  // scratch register: alpha byte used as table index
  asm volatile(
      // 4 pixel loop.
      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movzb       0x03(%0),%3                   \n"  // alpha of pixel 0
      "punpcklbw   %%xmm0,%%xmm0                 \n"  // dup bytes to words
      "movd        0x00(%4,%3,4),%%xmm2          \n"  // reciprocal lookup
      "movzb       0x07(%0),%3                   \n"  // alpha of pixel 1
      "movd        0x00(%4,%3,4),%%xmm3          \n"
      // Spread each reciprocal over its pixel's four channel words.
      "pshuflw     $0x40,%%xmm2,%%xmm2           \n"
      "pshuflw     $0x40,%%xmm3,%%xmm3           \n"
      "movlhps     %%xmm3,%%xmm2                 \n"
      "pmulhuw     %%xmm2,%%xmm0                 \n"  // pixels 0-1
      "movdqu      (%0),%%xmm1                   \n"
      "movzb       0x0b(%0),%3                   \n"  // alpha of pixel 2
      "punpckhbw   %%xmm1,%%xmm1                 \n"
      "movd        0x00(%4,%3,4),%%xmm2          \n"
      "movzb       0x0f(%0),%3                   \n"  // alpha of pixel 3
      "movd        0x00(%4,%3,4),%%xmm3          \n"
      "pshuflw     $0x40,%%xmm2,%%xmm2           \n"
      "pshuflw     $0x40,%%xmm3,%%xmm3           \n"
      "movlhps     %%xmm3,%%xmm2                 \n"
      "pmulhuw     %%xmm2,%%xmm1                 \n"  // pixels 2-3
      "lea         0x10(%0),%0                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),     // %0
        "+r"(dst_argb),     // %1
        "+r"(width),        // %2
        "=&r"(alpha)        // %3
      : "r"(fixed_invtbl8)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
7400
#endif  // HAS_ARGBUNATTENUATEROW_SSE2
7401
7402
#ifdef HAS_ARGBUNATTENUATEROW_AVX2
7403
// Shuffle table duplicating the per-pixel reciprocal word (bytes 0-1 /
// 8-9) across the B, G and R word lanes; the alpha lane keeps its own word
// (bytes 6-7 / 14-15).
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
    0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
7406
// Unattenuate 8 pixels at a time.
7407
// Unattenuate 8 ARGB pixels per iteration using the fixed_invtbl8
// reciprocal-of-alpha table.  The 8 table entries are gathered with scalar
// loads (the "replace VPGATHER" section) instead of vpgatherdd, then each
// reciprocal is broadcast over its pixel's channel words via
// kUnattenShuffleAlpha_AVX2 and applied with vpmulhuw.
void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             int width) {
  uintptr_t alpha;  // scratch register: alpha byte used as table index
  asm volatile(
      // Keep dst as an offset from src; preload the dup-shuffle mask.
      "sub         %0,%1                         \n"
      "vbroadcastf128 %5,%%ymm5                  \n"

      // 8 pixel loop.
      LABELALIGN
      "1:          \n"
      // replace VPGATHER: load the 8 reciprocals one at a time (indexed by
      // each pixel's alpha byte at offsets 3,7,...,0x1f) and assemble them
      // into ymm3 in pixel order.
      "movzb       0x03(%0),%3                   \n"
      "vmovd       0x00(%4,%3,4),%%xmm0          \n"
      "movzb       0x07(%0),%3                   \n"
      "vmovd       0x00(%4,%3,4),%%xmm1          \n"
      "movzb       0x0b(%0),%3                   \n"
      "vpunpckldq  %%xmm1,%%xmm0,%%xmm6          \n"
      "vmovd       0x00(%4,%3,4),%%xmm2          \n"
      "movzb       0x0f(%0),%3                   \n"
      "vmovd       0x00(%4,%3,4),%%xmm3          \n"
      "movzb       0x13(%0),%3                   \n"
      "vpunpckldq  %%xmm3,%%xmm2,%%xmm7          \n"
      "vmovd       0x00(%4,%3,4),%%xmm0          \n"
      "movzb       0x17(%0),%3                   \n"
      "vmovd       0x00(%4,%3,4),%%xmm1          \n"
      "movzb       0x1b(%0),%3                   \n"
      "vpunpckldq  %%xmm1,%%xmm0,%%xmm0          \n"
      "vmovd       0x00(%4,%3,4),%%xmm2          \n"
      "movzb       0x1f(%0),%3                   \n"
      "vmovd       0x00(%4,%3,4),%%xmm3          \n"
      "vpunpckldq  %%xmm3,%%xmm2,%%xmm2          \n"
      "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"
      "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"
      "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"
      // end of VPGATHER

      // Widen pixel bytes into both halves of each word, spread each
      // reciprocal across its pixel's channel words, and rescale with a
      // high multiply.
      "vmovdqu     (%0),%%ymm6                   \n"
      "vpunpcklbw  %%ymm6,%%ymm6,%%ymm0          \n"
      "vpunpckhbw  %%ymm6,%%ymm6,%%ymm1          \n"
      "vpunpcklwd  %%ymm3,%%ymm3,%%ymm2          \n"
      "vpunpckhwd  %%ymm3,%%ymm3,%%ymm3          \n"
      "vpshufb     %%ymm5,%%ymm2,%%ymm2          \n"
      "vpshufb     %%ymm5,%%ymm3,%%ymm3          \n"
      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,0x00(%0,%1,1)          \n"
      "lea         0x20(%0),%0                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_argb),                 // %0
        "+r"(dst_argb),                 // %1
        "+r"(width),                    // %2
        "=&r"(alpha)                    // %3
      : "r"(fixed_invtbl8),             // %4
        "m"(kUnattenShuffleAlpha_AVX2)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
7468
#endif  // HAS_ARGBUNATTENUATEROW_AVX2
7469
7470
#ifdef HAS_ARGBGRAYROW_SSSE3
7471
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
7472
0
// Convert 8 ARGB pixels (32 bytes) to gray ARGB: one luma value computed
// with the full-range kARGBToYJ weights replaces B, G and R; the original
// alpha byte is preserved.
void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"  // luma coefficients
      "movdqa      %4,%%xmm5                     \n"  // 0x80 bias (kSub128)

      // 8 pixel loop.
      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      // Bias pixel bytes to signed range so pmaddubsw (unsigned coeffs x
      // signed pixels) can be used; the bias is added back below.
      "psubb       %%xmm5,%%xmm0                 \n"
      "psubb       %%xmm5,%%xmm1                 \n"
      "movdqu      %%xmm4,%%xmm6                 \n"
      "pmaddubsw   %%xmm0,%%xmm6                 \n"
      "movdqu      %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm1,%%xmm0                 \n"
      "phaddw      %%xmm0,%%xmm6                 \n"  // per-pixel luma words
      // Coefficients sum to 256, so adding kSub128 as words (0x8080)
      // restores the 128*256 bias and adds 128 for rounding before >> 8.
      "paddw       %%xmm5,%%xmm6                 \n"
      "psrlw       $0x8,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"  // 8 gray bytes
      // Extract the 8 original alpha bytes.
      "movdqu      (%0),%%xmm2                   \n"
      "movdqu      0x10(%0),%%xmm3               \n"
      "lea         0x20(%0),%0                   \n"
      "psrld       $0x18,%%xmm2                  \n"
      "psrld       $0x18,%%xmm3                  \n"
      "packuswb    %%xmm3,%%xmm2                 \n"
      "packuswb    %%xmm2,%%xmm2                 \n"
      // Rebuild pixels as gray,gray,gray,alpha and store.
      "movdqa      %%xmm6,%%xmm3                 \n"
      "punpcklbw   %%xmm6,%%xmm6                 \n"
      "punpcklbw   %%xmm2,%%xmm3                 \n"
      "movdqa      %%xmm6,%%xmm1                 \n"
      "punpcklwd   %%xmm3,%%xmm6                 \n"
      "punpckhwd   %%xmm3,%%xmm1                 \n"
      "movdqu      %%xmm6,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "m"(kARGBToYJ),  // %3
        "m"(kSub128)     // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
7517
#endif  // HAS_ARGBGRAYROW_SSSE3
7518
7519
#ifdef HAS_ARGBSEPIAROW_SSSE3
7520
//    b = (r * 35 + g * 68 + b * 17) >> 7
7521
//    g = (r * 45 + g * 88 + b * 22) >> 7
7522
//    r = (r * 50 + g * 98 + b * 24) >> 7
7523
// Constant for ARGB color to sepia tone
7524
// Sepia coefficient rows, laid out {B, G, R, 0} per pixel so pmaddubsw can
// dot them against interleaved ARGB bytes; results are shifted right by 7
// (see the formulas above).
static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
                                   17, 68, 35, 0, 17, 68, 35, 0};

static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
                                   22, 88, 45, 0, 22, 88, 45, 0};

static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
                                   24, 98, 50, 0, 24, 98, 50, 0};
7532
7533
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
7534
0
// Apply a sepia tone to 8 ARGB pixels (32 bytes) in place.  Each output
// channel is the pmaddubsw/phaddw dot product of the pixel with one of the
// kARGBToSepia* coefficient rows, shifted right by 7; alpha is preserved.
void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
  asm volatile(
      "movdqa      %2,%%xmm2                     \n"  // B coefficients
      "movdqa      %3,%%xmm3                     \n"  // G coefficients
      "movdqa      %4,%%xmm4                     \n"  // R coefficients

      // 8 pixel loop.
      LABELALIGN
      "1:          \n"
      // New blue channel -> low bytes of xmm0.
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm6               \n"
      "pmaddubsw   %%xmm2,%%xmm0                 \n"
      "pmaddubsw   %%xmm2,%%xmm6                 \n"
      "phaddw      %%xmm6,%%xmm0                 \n"
      "psrlw       $0x7,%%xmm0                   \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      // New green channel, interleaved with blue.
      "movdqu      (%0),%%xmm5                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "pmaddubsw   %%xmm3,%%xmm5                 \n"
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "phaddw      %%xmm1,%%xmm5                 \n"
      "psrlw       $0x7,%%xmm5                   \n"
      "packuswb    %%xmm5,%%xmm5                 \n"
      "punpcklbw   %%xmm5,%%xmm0                 \n"
      // New red channel.
      "movdqu      (%0),%%xmm5                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "pmaddubsw   %%xmm4,%%xmm5                 \n"
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "phaddw      %%xmm1,%%xmm5                 \n"
      "psrlw       $0x7,%%xmm5                   \n"
      "packuswb    %%xmm5,%%xmm5                 \n"
      // Original alpha bytes, interleaved with red.
      "movdqu      (%0),%%xmm6                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "psrld       $0x18,%%xmm6                  \n"
      "psrld       $0x18,%%xmm1                  \n"
      "packuswb    %%xmm1,%%xmm6                 \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "punpcklbw   %%xmm6,%%xmm5                 \n"
      // Recombine B,G pairs with R,A pairs and store back in place.
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklwd   %%xmm5,%%xmm0                 \n"
      "punpckhwd   %%xmm5,%%xmm1                 \n"
      "movdqu      %%xmm0,(%0)                   \n"
      "movdqu      %%xmm1,0x10(%0)               \n"
      "lea         0x20(%0),%0                   \n"
      "sub         $0x8,%1                       \n"
      "jg          1b                            \n"
      : "+r"(dst_argb),      // %0
        "+r"(width)          // %1
      : "m"(kARGBToSepiaB),  // %2
        "m"(kARGBToSepiaG),  // %3
        "m"(kARGBToSepiaR)   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
7587
#endif  // HAS_ARGBSEPIAROW_SSSE3
7588
7589
#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
7590
// Tranform 8 ARGB pixels (32 bytes) with color matrix.
7591
// Same as Sepia except matrix is provided.
7592
// Transform 8 ARGB pixels (32 bytes) with a caller-supplied signed-byte
// color matrix (same scheme as Sepia, but the matrix is runtime data).
// matrix_argb holds 16 signed bytes: four 4-entry rows, one per output
// channel.  Each output channel is the pmaddubsw/phaddsw dot product of
// the input pixel with one row, arithmetically shifted right by 6.
void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
                              uint8_t* dst_argb,
                              const int8_t* matrix_argb,
                              int width) {
  asm volatile(
      // Broadcast each 4-byte matrix row into its own register.
      "movdqu      (%3),%%xmm5                   \n"
      "pshufd      $0x00,%%xmm5,%%xmm2           \n"  // row 0
      "pshufd      $0x55,%%xmm5,%%xmm3           \n"  // row 1
      "pshufd      $0xaa,%%xmm5,%%xmm4           \n"  // row 2
      "pshufd      $0xff,%%xmm5,%%xmm5           \n"  // row 3

      // 8 pixel loop.
      LABELALIGN
      "1:          \n"
      // First two output channels (rows 0 and 1).
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm7               \n"
      "pmaddubsw   %%xmm2,%%xmm0                 \n"
      "pmaddubsw   %%xmm2,%%xmm7                 \n"
      "movdqu      (%0),%%xmm6                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "pmaddubsw   %%xmm3,%%xmm6                 \n"
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "phaddsw     %%xmm7,%%xmm0                 \n"
      "phaddsw     %%xmm1,%%xmm6                 \n"
      "psraw       $0x6,%%xmm0                   \n"
      "psraw       $0x6,%%xmm6                   \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "punpcklbw   %%xmm6,%%xmm0                 \n"
      // Last two output channels (rows 2 and 3).
      "movdqu      (%0),%%xmm1                   \n"
      "movdqu      0x10(%0),%%xmm7               \n"
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pmaddubsw   %%xmm4,%%xmm7                 \n"
      "phaddsw     %%xmm7,%%xmm1                 \n"
      "movdqu      (%0),%%xmm6                   \n"
      "movdqu      0x10(%0),%%xmm7               \n"
      "pmaddubsw   %%xmm5,%%xmm6                 \n"
      "pmaddubsw   %%xmm5,%%xmm7                 \n"
      "phaddsw     %%xmm7,%%xmm6                 \n"
      "psraw       $0x6,%%xmm1                   \n"
      "psraw       $0x6,%%xmm6                   \n"
      "packuswb    %%xmm1,%%xmm1                 \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "punpcklbw   %%xmm6,%%xmm1                 \n"
      // Recombine the channel pairs into ARGB pixels and store.
      "movdqa      %%xmm0,%%xmm6                 \n"
      "punpcklwd   %%xmm1,%%xmm0                 \n"
      "punpckhwd   %%xmm1,%%xmm6                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm6,0x10(%1)               \n"
      "lea         0x20(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),   // %0
        "+r"(dst_argb),   // %1
        "+r"(width)       // %2
      : "r"(matrix_argb)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
7652
#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
7653
7654
#ifdef HAS_ARGBQUANTIZEROW_SSE2
7655
// Quantize 4 ARGB pixels (16 bytes).
7656
void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
                          int scale,
                          int interval_size,
                          int interval_offset,
                          int width) {
  // In-place quantization.  Each B/G/R channel becomes
  //   ((v * scale) >> 16) * interval_size + interval_offset
  // while the alpha byte is passed through unchanged (masked and OR'd back).
  // NOTE(review): assumes width is a multiple of 4 (libyuv row convention) —
  // confirm against callers.
  asm volatile(
      // Broadcast scale / interval_size / interval_offset to word lanes.
      // pshuflw $0x40 replicates word 0 into lanes 0-2 and puts word 1 (the
      // high half of the dword, normally 0) into lane 3, so the alpha lane
      // multiplier is 0; pshufd $0x44 copies the low qword to the high qword.
      "movd        %2,%%xmm2                     \n"
      "movd        %3,%%xmm3                     \n"
      "movd        %4,%%xmm4                     \n"
      "pshuflw     $0x40,%%xmm2,%%xmm2           \n"
      "pshufd      $0x44,%%xmm2,%%xmm2           \n"
      "pshuflw     $0x40,%%xmm3,%%xmm3           \n"
      "pshufd      $0x44,%%xmm3,%%xmm3           \n"
      "pshuflw     $0x40,%%xmm4,%%xmm4           \n"
      "pshufd      $0x44,%%xmm4,%%xmm4           \n"
      "pxor        %%xmm5,%%xmm5                 \n"  // zero, for unpacking
      "pcmpeqb     %%xmm6,%%xmm6                 \n"
      "pslld       $0x18,%%xmm6                  \n"  // 0xff000000 alpha mask

      // 4 pixel loop.
      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"  // low 2 pixels as words
      "punpcklbw   %%xmm5,%%xmm0                 \n"
      "pmulhuw     %%xmm2,%%xmm0                 \n"  // (v * scale) >> 16
      "movdqu      (%0),%%xmm1                   \n"  // high 2 pixels as words
      "punpckhbw   %%xmm5,%%xmm1                 \n"
      "pmulhuw     %%xmm2,%%xmm1                 \n"
      "pmullw      %%xmm3,%%xmm0                 \n"  // * interval_size
      "movdqu      (%0),%%xmm7                   \n"  // reload original pixels
      "pmullw      %%xmm3,%%xmm1                 \n"
      "pand        %%xmm6,%%xmm7                 \n"  // keep alpha bytes only
      "paddw       %%xmm4,%%xmm0                 \n"  // + interval_offset
      "paddw       %%xmm4,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "por         %%xmm7,%%xmm0                 \n"  // merge original alpha
      "movdqu      %%xmm0,(%0)                   \n"
      "lea         0x10(%0),%0                   \n"
      "sub         $0x4,%1                       \n"
      "jg          1b                            \n"
      : "+r"(dst_argb),       // %0
        "+r"(width)           // %1
      : "r"(scale),           // %2
        "r"(interval_size),   // %3
        "r"(interval_offset)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
7704
#endif  // HAS_ARGBQUANTIZEROW_SSE2
7705
7706
#ifdef HAS_ARGBSHADEROW_SSE2
7707
// Shade 4 pixels at a time by specified value.
7708
void ARGBShadeRow_SSE2(const uint8_t* src_argb,
                       uint8_t* dst_argb,
                       int width,
                       uint32_t value) {
  // Scales each channel by the matching byte of 'value' in 8.8 fixed point:
  // both operands are duplicated bytes (b*257 as a word), pmulhuw keeps the
  // high 16 bits, and >>8 yields approximately (pixel * value) / 255.
  asm volatile(
      "movd        %3,%%xmm2                     \n"
      "punpcklbw   %%xmm2,%%xmm2                 \n"  // byte b -> word b*257
      "punpcklqdq  %%xmm2,%%xmm2                 \n"  // replicate to 4 pixels

      // 4 pixel loop.
      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"
      "lea         0x10(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklbw   %%xmm0,%%xmm0                 \n"  // low pixels: p*257
      "punpckhbw   %%xmm1,%%xmm1                 \n"  // high pixels: p*257
      "pmulhuw     %%xmm2,%%xmm0                 \n"
      "pmulhuw     %%xmm2,%%xmm1                 \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(value)       // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
7740
#endif  // HAS_ARGBSHADEROW_SSE2
7741
7742
#ifdef HAS_ARGBMULTIPLYROW_SSE2
7743
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
7744
void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  // Per-channel product of two ARGB rows, approximately (a * b) / 255:
  // the first operand is widened to a*257 words, the second zero-extended,
  // and pmulhuw keeps the high 16 bits of the 8.8 fixed-point product.
      asm volatile("pxor        %%xmm5,%%xmm5                 \n"  // zero

               // 4 pixel loop.
               LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"
      "lea         0x10(%0),%0                   \n"
      "movdqu      (%1),%%xmm2                   \n"
      "lea         0x10(%1),%1                   \n"
      "movdqu      %%xmm0,%%xmm1                 \n"  // register copy
      "movdqu      %%xmm2,%%xmm3                 \n"
      "punpcklbw   %%xmm0,%%xmm0                 \n"  // src0: bytes -> b*257
      "punpckhbw   %%xmm1,%%xmm1                 \n"
      "punpcklbw   %%xmm5,%%xmm2                 \n"  // src1: zero-extend
      "punpckhbw   %%xmm5,%%xmm3                 \n"
      "pmulhuw     %%xmm2,%%xmm0                 \n"  // (a*257*b) >> 16
      "pmulhuw     %%xmm3,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jg          1b                            \n"
               : "+r"(src_argb),   // %0
                 "+r"(src_argb1),  // %1
                 "+r"(dst_argb),   // %2
                 "+r"(width)       // %3
               :
               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
7777
#endif  // HAS_ARGBMULTIPLYROW_SSE2
7778
7779
#ifdef HAS_ARGBMULTIPLYROW_AVX2
7780
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
7781
void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  // AVX2 variant of ARGBMultiplyRow: per-channel ~ (a * b) / 255 using the
  // same a*257 / zero-extend 8.8 fixed-point trick, 8 pixels per iteration.
      asm volatile("vpxor       %%ymm5,%%ymm5,%%ymm5          \n"  // zero

               // 8 pixel loop.
               LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm1                   \n"
      "lea         0x20(%0),%0                   \n"
      "vmovdqu     (%1),%%ymm3                   \n"
      "lea         0x20(%1),%1                   \n"
      "vpunpcklbw  %%ymm1,%%ymm1,%%ymm0          \n"  // src0 bytes -> b*257
      "vpunpckhbw  %%ymm1,%%ymm1,%%ymm1          \n"
      "vpunpcklbw  %%ymm5,%%ymm3,%%ymm2          \n"  // src1 zero-extended
      "vpunpckhbw  %%ymm5,%%ymm3,%%ymm3          \n"
      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"  // high 16 bits of product
      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%2)                   \n"
      "lea         0x20(%2),%2                   \n"
      "sub         $0x8,%3                       \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
               : "+r"(src_argb),   // %0
                 "+r"(src_argb1),  // %1
                 "+r"(dst_argb),   // %2
                 "+r"(width)       // %3
               :
               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
7813
#endif  // HAS_ARGBMULTIPLYROW_AVX2
7814
7815
#ifdef HAS_ARGBADDROW_SSE2
7816
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
7817
void ARGBAddRow_SSE2(const uint8_t* src_argb,
                     const uint8_t* src_argb1,
                     uint8_t* dst_argb,
                     int width) {
  // Per-byte saturating add of two ARGB rows: dst = min(a + b, 255).
  asm volatile(
      // 4 pixel loop.
      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"
      "lea         0x10(%0),%0                   \n"
      "movdqu      (%1),%%xmm1                   \n"
      "lea         0x10(%1),%1                   \n"
      "paddusb     %%xmm1,%%xmm0                 \n"  // saturating byte add
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0", "xmm1");
}
7841
#endif  // HAS_ARGBADDROW_SSE2
7842
7843
#ifdef HAS_ARGBADDROW_AVX2
7844
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
7845
void ARGBAddRow_AVX2(const uint8_t* src_argb,
                     const uint8_t* src_argb1,
                     uint8_t* dst_argb,
                     int width) {
  // Per-byte saturating add of two ARGB rows, 8 pixels (32 bytes) per
  // iteration: dst = min(a + b, 255).
  asm volatile(
      // 8 pixel loop.
      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "lea         0x20(%0),%0                   \n"
      "vpaddusb    (%1),%%ymm0,%%ymm0            \n"  // saturating byte add
      "lea         0x20(%1),%1                   \n"
      "vmovdqu     %%ymm0,(%2)                   \n"
      "lea         0x20(%2),%2                   \n"
      "sub         $0x8,%3                       \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0");
}
7869
#endif  // HAS_ARGBADDROW_AVX2
7870
7871
#ifdef HAS_ARGBSUBTRACTROW_SSE2
7872
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
7873
void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  // Per-byte saturating subtract of two ARGB rows: dst = max(a - b, 0).
  asm volatile(
      // 4 pixel loop.
      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"
      "lea         0x10(%0),%0                   \n"
      "movdqu      (%1),%%xmm1                   \n"
      "lea         0x10(%1),%1                   \n"
      "psubusb     %%xmm1,%%xmm0                 \n"  // saturating byte sub
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0", "xmm1");
}
7897
#endif  // HAS_ARGBSUBTRACTROW_SSE2
7898
7899
#ifdef HAS_ARGBSUBTRACTROW_AVX2
7900
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
7901
void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  // Per-byte saturating subtract of two ARGB rows, 8 pixels (32 bytes) per
  // iteration: dst = max(a - b, 0).
  asm volatile(
      // 8 pixel loop.
      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "lea         0x20(%0),%0                   \n"
      "vpsubusb    (%1),%%ymm0,%%ymm0            \n"  // saturating byte sub
      "lea         0x20(%1),%1                   \n"
      "vmovdqu     %%ymm0,(%2)                   \n"
      "lea         0x20(%2),%2                   \n"
      "sub         $0x8,%3                       \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0");
}
7925
#endif  // HAS_ARGBSUBTRACTROW_AVX2
7926
7927
#ifdef HAS_SOBELXROW_SSE2
7928
// SobelX as a matrix is
7929
// -1  0  1
7930
// -2  0  2
7931
// -1  0  1
7932
void SobelXRow_SSE2(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    const uint8_t* src_y2,
                    uint8_t* dst_sobelx,
                    int width) {
  // Horizontal Sobel over three consecutive rows r0..r2, 8 pixels per
  // iteration:
  //   dst[x] = |(r0[x]-r0[x+2]) + 2*(r1[x]-r1[x+2]) + (r2[x]-r2[x+2])|
  // clamped to 255.  Rows 1, 2 and dst are turned into offsets from row 0 so
  // one pointer increment advances everything.
  asm volatile(
      "sub         %0,%1                         \n"
      "sub         %0,%2                         \n"
      "sub         %0,%3                         \n"
      "pxor        %%xmm5,%%xmm5                 \n"  // zero, for unpacking

      // 8 pixel loop.
      LABELALIGN
      "1:          \n"
      "movq        (%0),%%xmm0                   \n"  // row 0: x and x+2
      "movq        0x2(%0),%%xmm1                \n"
      "punpcklbw   %%xmm5,%%xmm0                 \n"
      "punpcklbw   %%xmm5,%%xmm1                 \n"
      "psubw       %%xmm1,%%xmm0                 \n"  // xmm0 = r0 diff
      "movq        0x00(%0,%1,1),%%xmm1          \n"  // row 1
      "movq        0x02(%0,%1,1),%%xmm2          \n"
      "punpcklbw   %%xmm5,%%xmm1                 \n"
      "punpcklbw   %%xmm5,%%xmm2                 \n"
      "psubw       %%xmm2,%%xmm1                 \n"  // xmm1 = r1 diff
      "movq        0x00(%0,%2,1),%%xmm2          \n"  // row 2
      "movq        0x02(%0,%2,1),%%xmm3          \n"
      "punpcklbw   %%xmm5,%%xmm2                 \n"
      "punpcklbw   %%xmm5,%%xmm3                 \n"
      "psubw       %%xmm3,%%xmm2                 \n"  // xmm2 = r2 diff
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm1,%%xmm0                 \n"  // r1 added twice:
      "paddw       %%xmm1,%%xmm0                 \n"  // Sobel weight 2
      "pxor        %%xmm1,%%xmm1                 \n"
      "psubw       %%xmm0,%%xmm1                 \n"
      "pmaxsw      %%xmm1,%%xmm0                 \n"  // abs = max(v, -v)
      "packuswb    %%xmm0,%%xmm0                 \n"  // clamp to 0..255
      "movq        %%xmm0,0x00(%0,%3,1)          \n"
      "lea         0x8(%0),%0                    \n"
      "sub         $0x8,%4                       \n"
      "jg          1b                            \n"
      : "+r"(src_y0),      // %0
        "+r"(src_y1),      // %1
        "+r"(src_y2),      // %2
        "+r"(dst_sobelx),  // %3
        "+r"(width)        // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
7980
#endif  // HAS_SOBELXROW_SSE2
7981
7982
#ifdef HAS_SOBELYROW_SSE2
7983
// SobelY as a matrix is
7984
// -1 -2 -1
7985
//  0  0  0
7986
//  1  2  1
7987
void SobelYRow_SSE2(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    uint8_t* dst_sobely,
                    int width) {
  // Vertical Sobel over two rows r0, r1, 8 pixels per iteration:
  //   dst[x] = |(r0[x]-r1[x]) + 2*(r0[x+1]-r1[x+1]) + (r0[x+2]-r1[x+2])|
  // clamped to 255.  Row 1 and dst are offsets from row 0.
  asm volatile(
      "sub         %0,%1                         \n"
      "sub         %0,%2                         \n"
      "pxor        %%xmm5,%%xmm5                 \n"  // zero, for unpacking

      // 8 pixel loop.
      LABELALIGN
      "1:          \n"
      "movq        (%0),%%xmm0                   \n"  // column x
      "movq        0x00(%0,%1,1),%%xmm1          \n"
      "punpcklbw   %%xmm5,%%xmm0                 \n"
      "punpcklbw   %%xmm5,%%xmm1                 \n"
      "psubw       %%xmm1,%%xmm0                 \n"  // xmm0 = col x diff
      "movq        0x1(%0),%%xmm1                \n"  // column x+1
      "movq        0x01(%0,%1,1),%%xmm2          \n"
      "punpcklbw   %%xmm5,%%xmm1                 \n"
      "punpcklbw   %%xmm5,%%xmm2                 \n"
      "psubw       %%xmm2,%%xmm1                 \n"  // xmm1 = col x+1 diff
      "movq        0x2(%0),%%xmm2                \n"  // column x+2
      "movq        0x02(%0,%1,1),%%xmm3          \n"
      "punpcklbw   %%xmm5,%%xmm2                 \n"
      "punpcklbw   %%xmm5,%%xmm3                 \n"
      "psubw       %%xmm3,%%xmm2                 \n"  // xmm2 = col x+2 diff
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm1,%%xmm0                 \n"  // x+1 added twice:
      "paddw       %%xmm1,%%xmm0                 \n"  // Sobel weight 2
      "pxor        %%xmm1,%%xmm1                 \n"
      "psubw       %%xmm0,%%xmm1                 \n"
      "pmaxsw      %%xmm1,%%xmm0                 \n"  // abs = max(v, -v)
      "packuswb    %%xmm0,%%xmm0                 \n"  // clamp to 0..255
      "movq        %%xmm0,0x00(%0,%2,1)          \n"
      "lea         0x8(%0),%0                    \n"
      "sub         $0x8,%3                       \n"
      "jg          1b                            \n"
      : "+r"(src_y0),      // %0
        "+r"(src_y1),      // %1
        "+r"(dst_sobely),  // %2
        "+r"(width)        // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
8032
#endif  // HAS_SOBELYROW_SSE2
8033
8034
#ifdef HAS_SOBELROW_SSE2
8035
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
8036
// A = 255
8037
// R = Sobel
8038
// G = Sobel
8039
// B = Sobel
8040
void SobelRow_SSE2(const uint8_t* src_sobelx,
                   const uint8_t* src_sobely,
                   uint8_t* dst_argb,
                   int width) {
  // s = saturate(sobelx + sobely), expanded to grey ARGB (A=255, R=G=B=s).
  // 16 source pixels per iteration producing 64 output bytes.  src_sobely is
  // addressed relative to src_sobelx.
  asm volatile(
      "sub         %0,%1                         \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "pslld       $0x18,%%xmm5                  \n"  // 0xff000000 alpha mask

      // 16 pixel loop.
      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%1,1),%%xmm1          \n"
      "lea         0x10(%0),%0                   \n"
      "paddusb     %%xmm1,%%xmm0                 \n"  // s, saturated
      // Duplicate each s byte out to 4 bytes (B,G,R lanes) via two rounds of
      // unpacking, then OR in the alpha mask.
      "movdqa      %%xmm0,%%xmm2                 \n"
      "punpcklbw   %%xmm0,%%xmm2                 \n"
      "punpckhbw   %%xmm0,%%xmm0                 \n"
      "movdqa      %%xmm2,%%xmm1                 \n"
      "punpcklwd   %%xmm2,%%xmm1                 \n"
      "punpckhwd   %%xmm2,%%xmm2                 \n"
      "por         %%xmm5,%%xmm1                 \n"  // set A = 255
      "por         %%xmm5,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm3                 \n"
      "punpcklwd   %%xmm0,%%xmm3                 \n"
      "punpckhwd   %%xmm0,%%xmm0                 \n"
      "por         %%xmm5,%%xmm3                 \n"
      "por         %%xmm5,%%xmm0                 \n"
      "movdqu      %%xmm1,(%2)                   \n"
      "movdqu      %%xmm2,0x10(%2)               \n"
      "movdqu      %%xmm3,0x20(%2)               \n"
      "movdqu      %%xmm0,0x30(%2)               \n"
      "lea         0x40(%2),%2                   \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
8083
#endif  // HAS_SOBELROW_SSE2
8084
8085
#ifdef HAS_SOBELTOPLANEROW_SSE2
8086
// Adds Sobel X and Sobel Y and stores Sobel into a plane.
8087
void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
8088
                          const uint8_t* src_sobely,
8089
                          uint8_t* dst_y,
8090
0
                          int width) {
8091
0
  asm volatile(
8092
0
      "sub         %0,%1                         \n"
8093
0
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
8094
0
      "pslld       $0x18,%%xmm5                  \n"
8095
8096
      // 8 pixel loop.
8097
0
      LABELALIGN
8098
0
      "1:          \n"
8099
0
      "movdqu      (%0),%%xmm0                   \n"
8100
0
      "movdqu      0x00(%0,%1,1),%%xmm1          \n"
8101
0
      "lea         0x10(%0),%0                   \n"
8102
0
      "paddusb     %%xmm1,%%xmm0                 \n"
8103
0
      "movdqu      %%xmm0,(%2)                   \n"
8104
0
      "lea         0x10(%2),%2                   \n"
8105
0
      "sub         $0x10,%3                      \n"
8106
0
      "jg          1b                            \n"
8107
0
      : "+r"(src_sobelx),  // %0
8108
0
        "+r"(src_sobely),  // %1
8109
0
        "+r"(dst_y),       // %2
8110
0
        "+r"(width)        // %3
8111
0
      :
8112
0
      : "memory", "cc", "xmm0", "xmm1");
8113
0
}
8114
#endif  // HAS_SOBELTOPLANEROW_SSE2
8115
8116
#ifdef HAS_SOBELXYROW_SSE2
8117
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
8118
// A = 255
8119
// R = Sobel X
8120
// G = Sobel
8121
// B = Sobel Y
8122
void SobelXYRow_SSE2(const uint8_t* src_sobelx,
                     const uint8_t* src_sobely,
                     uint8_t* dst_argb,
                     int width) {
  // Packs the gradients into ARGB: A=255, R=sobelx, G=saturate(sobelx+sobely),
  // B=sobely.  16 source pixels per iteration producing 64 output bytes.
  // src_sobely is addressed relative to src_sobelx.
  asm volatile(
      "sub         %0,%1                         \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // 0xff bytes (alpha)

      // 16 pixel loop.
      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"  // xmm0 = sobelx
      "movdqu      0x00(%0,%1,1),%%xmm1          \n"  // xmm1 = sobely
      "lea         0x10(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "paddusb     %%xmm1,%%xmm2                 \n"  // xmm2 = sum (green)
      // Interleave byte planes into ARGB: (R,A) pairs and (B,G) pairs, then
      // word-interleave the pairs into B,G,R,A dwords.
      "movdqa      %%xmm0,%%xmm3                 \n"
      "punpcklbw   %%xmm5,%%xmm3                 \n"  // R,A low
      "punpckhbw   %%xmm5,%%xmm0                 \n"  // R,A high
      "movdqa      %%xmm1,%%xmm4                 \n"
      "punpcklbw   %%xmm2,%%xmm4                 \n"  // B,G low
      "punpckhbw   %%xmm2,%%xmm1                 \n"  // B,G high
      "movdqa      %%xmm4,%%xmm6                 \n"
      "punpcklwd   %%xmm3,%%xmm6                 \n"  // B,G,R,A pixels 0-3
      "punpckhwd   %%xmm3,%%xmm4                 \n"  // pixels 4-7
      "movdqa      %%xmm1,%%xmm7                 \n"
      "punpcklwd   %%xmm0,%%xmm7                 \n"  // pixels 8-11
      "punpckhwd   %%xmm0,%%xmm1                 \n"  // pixels 12-15
      "movdqu      %%xmm6,(%2)                   \n"
      "movdqu      %%xmm4,0x10(%2)               \n"
      "movdqu      %%xmm7,0x20(%2)               \n"
      "movdqu      %%xmm1,0x30(%2)               \n"
      "lea         0x40(%2),%2                   \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
8165
#endif  // HAS_SOBELXYROW_SSE2
8166
8167
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
8168
// Creates a table of cumulative sums where each value is a sum of all values
8169
// above and to the left of the value, inclusive of the value.
8170
void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
                                  int32_t* cumsum,
                                  const int32_t* previous_cumsum,
                                  int width) {
  // Integral-image row: for each ARGB pixel, cumsum[x] (4 int32 channels) =
  // previous_cumsum[x] + running sum of this row's channels up to and
  // including x.  xmm0 holds the running per-channel sum across iterations.
  asm volatile(
      "pxor        %%xmm0,%%xmm0                 \n"  // running sum = 0
      "pxor        %%xmm1,%%xmm1                 \n"  // zero, for unpacking
      "sub         $0x4,%3                       \n"
      "jl          49f                           \n"
      "test        $0xf,%1                       \n"  // take 4-pixel path only
      "jne         49f                           \n"  // if cumsum 16B aligned

      // 4 pixel loop.
      LABELALIGN
      "40:         \n"
      "movdqu      (%0),%%xmm2                   \n"  // 4 ARGB pixels
      "lea         0x10(%0),%0                   \n"
      // Widen the 16 bytes into 4 vectors of 4 int32 channels each
      // (xmm2, xmm3, xmm4, xmm5 = pixels 0..3).
      "movdqa      %%xmm2,%%xmm4                 \n"
      "punpcklbw   %%xmm1,%%xmm2                 \n"
      "movdqa      %%xmm2,%%xmm3                 \n"
      "punpcklwd   %%xmm1,%%xmm2                 \n"
      "punpckhwd   %%xmm1,%%xmm3                 \n"
      "punpckhbw   %%xmm1,%%xmm4                 \n"
      "movdqa      %%xmm4,%%xmm5                 \n"
      "punpcklwd   %%xmm1,%%xmm4                 \n"
      "punpckhwd   %%xmm1,%%xmm5                 \n"
      // Accumulate each pixel into the running sum and add the row above.
      "paddd       %%xmm2,%%xmm0                 \n"
      "movdqu      (%2),%%xmm2                   \n"
      "paddd       %%xmm0,%%xmm2                 \n"
      "paddd       %%xmm3,%%xmm0                 \n"
      "movdqu      0x10(%2),%%xmm3               \n"
      "paddd       %%xmm0,%%xmm3                 \n"
      "paddd       %%xmm4,%%xmm0                 \n"
      "movdqu      0x20(%2),%%xmm4               \n"
      "paddd       %%xmm0,%%xmm4                 \n"
      "paddd       %%xmm5,%%xmm0                 \n"
      "movdqu      0x30(%2),%%xmm5               \n"
      "lea         0x40(%2),%2                   \n"
      "paddd       %%xmm0,%%xmm5                 \n"
      "movdqu      %%xmm2,(%1)                   \n"
      "movdqu      %%xmm3,0x10(%1)               \n"
      "movdqu      %%xmm4,0x20(%1)               \n"
      "movdqu      %%xmm5,0x30(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x4,%3                       \n"
      "jge         40b                           \n"

      "49:         \n"
      "add         $0x3,%3                       \n"  // restore remainder count
      "jl          19f                           \n"

      // 1 pixel loop.
      LABELALIGN
      "10:         \n"
      "movd        (%0),%%xmm2                   \n"  // 1 ARGB pixel
      "lea         0x4(%0),%0                    \n"
      "punpcklbw   %%xmm1,%%xmm2                 \n"
      "punpcklwd   %%xmm1,%%xmm2                 \n"  // widen to 4 x int32
      "paddd       %%xmm2,%%xmm0                 \n"  // running sum
      "movdqu      (%2),%%xmm2                   \n"
      "lea         0x10(%2),%2                   \n"
      "paddd       %%xmm0,%%xmm2                 \n"  // + row above
      "movdqu      %%xmm2,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x1,%3                       \n"
      "jge         10b                           \n"

      "19:         \n"
      : "+r"(row),              // %0
        "+r"(cumsum),           // %1
        "+r"(previous_cumsum),  // %2
        "+r"(width)             // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
8245
#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
8246
8247
#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
8248
void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
                                    const int32_t* botleft,
                                    int width,
                                    int area,
                                    uint8_t* dst,
                                    int count) {
  // Box-filter average from an integral image.  For each output pixel the
  // per-channel box sum is TL - TR - BL + BR (TR/BR are 'width' int32x4
  // pixels past TL/BL), then dst = sum / area packed back to bytes.
  // Two paths: area <= 0x80 uses a rounded 16-bit fixed-point reciprocal
  // with pmulhuw; larger areas use float multiply by rcpss(1/area).
  asm volatile(
      "movd        %5,%%xmm5                     \n"
      "cvtdq2ps    %%xmm5,%%xmm5                 \n"  // (float)area
      "rcpss       %%xmm5,%%xmm4                 \n"  // xmm4 ~ 1/area
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
      "sub         $0x4,%3                       \n"
      "jl          49f                           \n"
      "cmpl        $0x80,%5                      \n"  // big area -> float path
      "ja          40f                           \n"

      // Small-area multiplier: (area + 65535) / area ~= 65536/area with
      // rounding, broadcast as 16-bit words for pmulhuw.
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "pcmpeqb     %%xmm6,%%xmm6                 \n"
      "psrld       $0x10,%%xmm6                  \n"  // 0x0000ffff
      "cvtdq2ps    %%xmm6,%%xmm6                 \n"  // 65535.0f
      "addps       %%xmm6,%%xmm5                 \n"
      "mulps       %%xmm4,%%xmm5                 \n"
      "cvtps2dq    %%xmm5,%%xmm5                 \n"
      "packssdw    %%xmm5,%%xmm5                 \n"

      // 4 pixel small loop.
      LABELALIGN
      "4:          \n"
      "movdqu      (%0),%%xmm0                   \n"  // TL for 4 pixels
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x30(%0),%%xmm3               \n"
      "psubd       0x00(%0,%4,4),%%xmm0          \n"  // - TR
      "psubd       0x10(%0,%4,4),%%xmm1          \n"
      "psubd       0x20(%0,%4,4),%%xmm2          \n"
      "psubd       0x30(%0,%4,4),%%xmm3          \n"
      "lea         0x40(%0),%0                   \n"
      "psubd       (%1),%%xmm0                   \n"  // - BL
      "psubd       0x10(%1),%%xmm1               \n"
      "psubd       0x20(%1),%%xmm2               \n"
      "psubd       0x30(%1),%%xmm3               \n"
      "paddd       0x00(%1,%4,4),%%xmm0          \n"  // + BR
      "paddd       0x10(%1,%4,4),%%xmm1          \n"
      "paddd       0x20(%1,%4,4),%%xmm2          \n"
      "paddd       0x30(%1,%4,4),%%xmm3          \n"
      "lea         0x40(%1),%1                   \n"
      "packssdw    %%xmm1,%%xmm0                 \n"
      "packssdw    %%xmm3,%%xmm2                 \n"
      "pmulhuw     %%xmm5,%%xmm0                 \n"  // sum * (65536/area) >>16
      "pmulhuw     %%xmm5,%%xmm2                 \n"
      "packuswb    %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jge         4b                            \n"
      "jmp         49f                           \n"

      // 4 pixel loop (float path for large areas).
      LABELALIGN
      "40:         \n"
      "movdqu      (%0),%%xmm0                   \n"  // TL for 4 pixels
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x30(%0),%%xmm3               \n"
      "psubd       0x00(%0,%4,4),%%xmm0          \n"  // - TR
      "psubd       0x10(%0,%4,4),%%xmm1          \n"
      "psubd       0x20(%0,%4,4),%%xmm2          \n"
      "psubd       0x30(%0,%4,4),%%xmm3          \n"
      "lea         0x40(%0),%0                   \n"
      "psubd       (%1),%%xmm0                   \n"  // - BL
      "psubd       0x10(%1),%%xmm1               \n"
      "psubd       0x20(%1),%%xmm2               \n"
      "psubd       0x30(%1),%%xmm3               \n"
      "paddd       0x00(%1,%4,4),%%xmm0          \n"  // + BR
      "paddd       0x10(%1,%4,4),%%xmm1          \n"
      "paddd       0x20(%1,%4,4),%%xmm2          \n"
      "paddd       0x30(%1,%4,4),%%xmm3          \n"
      "lea         0x40(%1),%1                   \n"
      "cvtdq2ps    %%xmm0,%%xmm0                 \n"  // sum * ~(1/area)
      "cvtdq2ps    %%xmm1,%%xmm1                 \n"
      "mulps       %%xmm4,%%xmm0                 \n"
      "mulps       %%xmm4,%%xmm1                 \n"
      "cvtdq2ps    %%xmm2,%%xmm2                 \n"
      "cvtdq2ps    %%xmm3,%%xmm3                 \n"
      "mulps       %%xmm4,%%xmm2                 \n"
      "mulps       %%xmm4,%%xmm3                 \n"
      "cvtps2dq    %%xmm0,%%xmm0                 \n"
      "cvtps2dq    %%xmm1,%%xmm1                 \n"
      "cvtps2dq    %%xmm2,%%xmm2                 \n"
      "cvtps2dq    %%xmm3,%%xmm3                 \n"
      "packssdw    %%xmm1,%%xmm0                 \n"
      "packssdw    %%xmm3,%%xmm2                 \n"
      "packuswb    %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jge         40b                           \n"

      "49:         \n"
      "add         $0x3,%3                       \n"  // restore remainder count
      "jl          19f                           \n"

      // 1 pixel loop (always float path).
      LABELALIGN
      "10:         \n"
      "movdqu      (%0),%%xmm0                   \n"
      "psubd       0x00(%0,%4,4),%%xmm0          \n"
      "lea         0x10(%0),%0                   \n"
      "psubd       (%1),%%xmm0                   \n"
      "paddd       0x00(%1,%4,4),%%xmm0          \n"
      "lea         0x10(%1),%1                   \n"
      "cvtdq2ps    %%xmm0,%%xmm0                 \n"
      "mulps       %%xmm4,%%xmm0                 \n"
      "cvtps2dq    %%xmm0,%%xmm0                 \n"
      "packssdw    %%xmm0,%%xmm0                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movd        %%xmm0,(%2)                   \n"
      "lea         0x4(%2),%2                    \n"
      "sub         $0x1,%3                       \n"
      "jge         10b                           \n"
      "19:         \n"
      : "+r"(topleft),           // %0
        "+r"(botleft),           // %1
        "+r"(dst),               // %2
        "+rm"(count)             // %3
      : "r"((intptr_t)(width)),  // %4
        "rm"(area)               // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
8377
#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
8378
8379
#ifdef HAS_ARGBAFFINEROW_SSE2
8380
// Copy ARGB pixels from source image with slope to a row of destination.
8381
LIBYUV_API
8382
void ARGBAffineRow_SSE2(const uint8_t* src_argb,
8383
                        int src_argb_stride,
8384
                        uint8_t* dst_argb,
8385
                        const float* src_dudv,
8386
0
                        int width) {
8387
0
  intptr_t src_argb_stride_temp = src_argb_stride;
8388
0
  intptr_t temp;
8389
0
  asm volatile(
8390
0
      "movq        (%3),%%xmm2                   \n"
8391
0
      "movq        0x08(%3),%%xmm7               \n"
8392
0
      "shl         $0x10,%1                      \n"
8393
0
      "add         $0x4,%1                       \n"
8394
0
      "movd        %1,%%xmm5                     \n"
8395
0
      "sub         $0x4,%4                       \n"
8396
0
      "jl          49f                           \n"
8397
8398
0
      "pshufd      $0x44,%%xmm7,%%xmm7           \n"
8399
0
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
8400
0
      "movdqa      %%xmm2,%%xmm0                 \n"
8401
0
      "addps       %%xmm7,%%xmm0                 \n"
8402
0
      "movlhps     %%xmm0,%%xmm2                 \n"
8403
0
      "movdqa      %%xmm7,%%xmm4                 \n"
8404
0
      "addps       %%xmm4,%%xmm4                 \n"
8405
0
      "movdqa      %%xmm2,%%xmm3                 \n"
8406
0
      "addps       %%xmm4,%%xmm3                 \n"
8407
0
      "addps       %%xmm4,%%xmm4                 \n"
8408
8409
      // 4 pixel loop
8410
0
      LABELALIGN
8411
0
      "40:         \n"
8412
0
      "cvttps2dq   %%xmm2,%%xmm0                 \n"  // x,y float->int first 2
8413
0
      "cvttps2dq   %%xmm3,%%xmm1                 \n"  // x,y float->int next 2
8414
0
      "packssdw    %%xmm1,%%xmm0                 \n"  // x, y as 8 shorts
8415
0
      "pmaddwd     %%xmm5,%%xmm0                 \n"  // off = x*4 + y*stride
8416
0
      "movd        %%xmm0,%k1                    \n"
8417
0
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"
8418
0
      "movd        %%xmm0,%k5                    \n"
8419
0
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"
8420
0
      "movd        0x00(%0,%1,1),%%xmm1          \n"
8421
0
      "movd        0x00(%0,%5,1),%%xmm6          \n"
8422
0
      "punpckldq   %%xmm6,%%xmm1                 \n"
8423
0
      "addps       %%xmm4,%%xmm2                 \n"
8424
0
      "movq        %%xmm1,(%2)                   \n"
8425
0
      "movd        %%xmm0,%k1                    \n"
8426
0
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"
8427
0
      "movd        %%xmm0,%k5                    \n"
8428
0
      "movd        0x00(%0,%1,1),%%xmm0          \n"
8429
0
      "movd        0x00(%0,%5,1),%%xmm6          \n"
8430
0
      "punpckldq   %%xmm6,%%xmm0                 \n"
8431
0
      "addps       %%xmm4,%%xmm3                 \n"
8432
0
      "movq        %%xmm0,0x08(%2)               \n"
8433
0
      "lea         0x10(%2),%2                   \n"
8434
0
      "sub         $0x4,%4                       \n"
8435
0
      "jge         40b                           \n"
8436
8437
0
      "49:         \n"
8438
0
      "add         $0x3,%4                       \n"
8439
0
      "jl          19f                           \n"
8440
8441
      // 1 pixel loop
8442
0
      LABELALIGN
8443
0
      "10:         \n"
8444
0
      "cvttps2dq   %%xmm2,%%xmm0                 \n"
8445
0
      "packssdw    %%xmm0,%%xmm0                 \n"
8446
0
      "pmaddwd     %%xmm5,%%xmm0                 \n"
8447
0
      "addps       %%xmm7,%%xmm2                 \n"
8448
0
      "movd        %%xmm0,%k1                    \n"
8449
0
      "movd        0x00(%0,%1,1),%%xmm0          \n"
8450
0
      "movd        %%xmm0,(%2)                   \n"
8451
0
      "lea         0x04(%2),%2                   \n"
8452
0
      "sub         $0x1,%4                       \n"
8453
0
      "jge         10b                           \n"
8454
0
      "19:         \n"
8455
0
      : "+r"(src_argb),              // %0
8456
0
        "+r"(src_argb_stride_temp),  // %1
8457
0
        "+r"(dst_argb),              // %2
8458
0
        "+r"(src_dudv),              // %3
8459
0
        "+rm"(width),                // %4
8460
0
        "=&r"(temp)                  // %5
8461
0
      :
8462
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
8463
0
        "xmm7");
8464
0
}
8465
#endif  // HAS_ARGBAFFINEROW_SSE2
8466
8467
#ifdef HAS_INTERPOLATEROW_SSSE3
8468
// Bilinear filter 16x2 -> 16x1
8469
void InterpolateRow_SSSE3(uint8_t* dst_ptr,
8470
                          const uint8_t* src_ptr,
8471
                          ptrdiff_t src_stride,
8472
                          int width,
8473
0
                          int source_y_fraction) {
8474
0
  asm volatile(
8475
0
      "sub         %1,%0                         \n"
8476
0
      "cmp         $0x0,%3                       \n"
8477
0
      "je          100f                          \n"
8478
0
      "cmp         $0x80,%3                      \n"
8479
0
      "je          50f                           \n"
8480
8481
0
      "movd        %3,%%xmm0                     \n"
8482
0
      "neg         %3                            \n"
8483
0
      "add         $0x100,%3                     \n"
8484
0
      "movd        %3,%%xmm5                     \n"
8485
0
      "punpcklbw   %%xmm0,%%xmm5                 \n"
8486
0
      "punpcklwd   %%xmm5,%%xmm5                 \n"
8487
0
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
8488
0
      "mov         $0x80808080,%%eax             \n"
8489
0
      "movd        %%eax,%%xmm4                  \n"
8490
0
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
8491
8492
      // General purpose row blend.
8493
0
      LABELALIGN
8494
0
      "1:          \n"
8495
0
      "movdqu      (%1),%%xmm0                   \n"
8496
0
      "movdqu      0x00(%1,%4,1),%%xmm2          \n"
8497
0
      "movdqa      %%xmm0,%%xmm1                 \n"
8498
0
      "punpcklbw   %%xmm2,%%xmm0                 \n"
8499
0
      "punpckhbw   %%xmm2,%%xmm1                 \n"
8500
0
      "psubb       %%xmm4,%%xmm0                 \n"
8501
0
      "psubb       %%xmm4,%%xmm1                 \n"
8502
0
      "movdqa      %%xmm5,%%xmm2                 \n"
8503
0
      "movdqa      %%xmm5,%%xmm3                 \n"
8504
0
      "pmaddubsw   %%xmm0,%%xmm2                 \n"
8505
0
      "pmaddubsw   %%xmm1,%%xmm3                 \n"
8506
0
      "paddw       %%xmm4,%%xmm2                 \n"
8507
0
      "paddw       %%xmm4,%%xmm3                 \n"
8508
0
      "psrlw       $0x8,%%xmm2                   \n"
8509
0
      "psrlw       $0x8,%%xmm3                   \n"
8510
0
      "packuswb    %%xmm3,%%xmm2                 \n"
8511
0
      "movdqu      %%xmm2,0x00(%1,%0,1)          \n"
8512
0
      "lea         0x10(%1),%1                   \n"
8513
0
      "sub         $0x10,%2                      \n"
8514
0
      "jg          1b                            \n"
8515
0
      "jmp         99f                           \n"
8516
8517
      // Blend 50 / 50.
8518
0
      LABELALIGN
8519
0
      "50:         \n"
8520
0
      "movdqu      (%1),%%xmm0                   \n"
8521
0
      "movdqu      0x00(%1,%4,1),%%xmm1          \n"
8522
0
      "pavgb       %%xmm1,%%xmm0                 \n"
8523
0
      "movdqu      %%xmm0,0x00(%1,%0,1)          \n"
8524
0
      "lea         0x10(%1),%1                   \n"
8525
0
      "sub         $0x10,%2                      \n"
8526
0
      "jg          50b                           \n"
8527
0
      "jmp         99f                           \n"
8528
8529
      // Blend 100 / 0 - Copy row unchanged.
8530
0
      LABELALIGN
8531
0
      "100:        \n"
8532
0
      "movdqu      (%1),%%xmm0                   \n"
8533
0
      "movdqu      %%xmm0,0x00(%1,%0,1)          \n"
8534
0
      "lea         0x10(%1),%1                   \n"
8535
0
      "sub         $0x10,%2                      \n"
8536
0
      "jg          100b                          \n"
8537
8538
0
      "99:         \n"
8539
0
      : "+r"(dst_ptr),               // %0
8540
0
        "+r"(src_ptr),               // %1
8541
0
        "+rm"(width),                // %2
8542
0
        "+r"(source_y_fraction)      // %3
8543
0
      : "r"((intptr_t)(src_stride))  // %4
8544
0
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
8545
0
}
8546
#endif  // HAS_INTERPOLATEROW_SSSE3
8547
8548
#ifdef HAS_INTERPOLATEROW_AVX2
8549
// Bilinear filter 32x2 -> 32x1
8550
void InterpolateRow_AVX2(uint8_t* dst_ptr,
8551
                         const uint8_t* src_ptr,
8552
                         ptrdiff_t src_stride,
8553
                         int width,
8554
6.11M
                         int source_y_fraction) {
8555
6.11M
  asm volatile(
8556
6.11M
      "sub         %1,%0                         \n"
8557
6.11M
      "cmp         $0x0,%3                       \n"
8558
6.11M
      "je          100f                          \n"
8559
6.11M
      "cmp         $0x80,%3                      \n"
8560
6.11M
      "je          50f                           \n"
8561
8562
6.11M
      "vmovd       %3,%%xmm0                     \n"
8563
6.11M
      "neg         %3                            \n"
8564
6.11M
      "add         $0x100,%3                     \n"
8565
6.11M
      "vmovd       %3,%%xmm5                     \n"
8566
6.11M
      "vpunpcklbw  %%xmm0,%%xmm5,%%xmm5          \n"
8567
6.11M
      "vpunpcklwd  %%xmm5,%%xmm5,%%xmm5          \n"
8568
6.11M
      "vbroadcastss %%xmm5,%%ymm5                \n"
8569
6.11M
      "mov         $0x80808080,%%eax             \n"
8570
6.11M
      "vmovd       %%eax,%%xmm4                  \n"
8571
6.11M
      "vbroadcastss %%xmm4,%%ymm4                \n"
8572
8573
      // General purpose row blend.
8574
6.11M
      LABELALIGN
8575
6.11M
      "1:          \n"
8576
6.11M
      "vmovdqu     (%1),%%ymm0                   \n"
8577
6.11M
      "vmovdqu     0x00(%1,%4,1),%%ymm2          \n"
8578
6.11M
      "vpunpckhbw  %%ymm2,%%ymm0,%%ymm1          \n"
8579
6.11M
      "vpunpcklbw  %%ymm2,%%ymm0,%%ymm0          \n"
8580
6.11M
      "vpsubb      %%ymm4,%%ymm1,%%ymm1          \n"
8581
6.11M
      "vpsubb      %%ymm4,%%ymm0,%%ymm0          \n"
8582
6.11M
      "vpmaddubsw  %%ymm1,%%ymm5,%%ymm1          \n"
8583
6.11M
      "vpmaddubsw  %%ymm0,%%ymm5,%%ymm0          \n"
8584
6.11M
      "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"
8585
6.11M
      "vpaddw      %%ymm4,%%ymm0,%%ymm0          \n"
8586
6.11M
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
8587
6.11M
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
8588
6.11M
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
8589
6.11M
      "vmovdqu     %%ymm0,0x00(%1,%0,1)          \n"
8590
6.11M
      "lea         0x20(%1),%1                   \n"
8591
6.11M
      "sub         $0x20,%2                      \n"
8592
6.11M
      "jg          1b                            \n"
8593
6.11M
      "jmp         99f                           \n"
8594
8595
      // Blend 50 / 50.
8596
6.11M
      LABELALIGN
8597
6.11M
      "50:         \n"
8598
6.11M
      "vmovdqu     (%1),%%ymm0                   \n"
8599
6.11M
      "vpavgb      0x00(%1,%4,1),%%ymm0,%%ymm0   \n"
8600
6.11M
      "vmovdqu     %%ymm0,0x00(%1,%0,1)          \n"
8601
6.11M
      "lea         0x20(%1),%1                   \n"
8602
6.11M
      "sub         $0x20,%2                      \n"
8603
6.11M
      "jg          50b                           \n"
8604
6.11M
      "jmp         99f                           \n"
8605
8606
      // Blend 100 / 0 - Copy row unchanged.
8607
6.11M
      LABELALIGN
8608
6.11M
      "100:        \n"
8609
6.11M
      "vmovdqu     (%1),%%ymm0                   \n"
8610
6.11M
      "vmovdqu     %%ymm0,0x00(%1,%0,1)          \n"
8611
6.11M
      "lea         0x20(%1),%1                   \n"
8612
6.11M
      "sub         $0x20,%2                      \n"
8613
6.11M
      "jg          100b                          \n"
8614
8615
6.11M
      "99:         \n"
8616
6.11M
      "vzeroupper  \n"
8617
6.11M
      : "+r"(dst_ptr),               // %0
8618
6.11M
        "+r"(src_ptr),               // %1
8619
6.11M
        "+r"(width),                 // %2
8620
6.11M
        "+r"(source_y_fraction)      // %3
8621
6.11M
      : "r"((intptr_t)(src_stride))  // %4
8622
6.11M
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
8623
6.11M
}
8624
#endif  // HAS_INTERPOLATEROW_AVX2
8625
8626
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
8627
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
8628
void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
8629
                          uint8_t* dst_argb,
8630
                          const uint8_t* shuffler,
8631
0
                          int width) {
8632
0
      asm volatile("movdqu      (%3),%%xmm5                   \n"
8633
8634
0
               LABELALIGN
8635
0
      "1:          \n"
8636
0
      "movdqu      (%0),%%xmm0                   \n"
8637
0
      "movdqu      0x10(%0),%%xmm1               \n"
8638
0
      "lea         0x20(%0),%0                   \n"
8639
0
      "pshufb      %%xmm5,%%xmm0                 \n"
8640
0
      "pshufb      %%xmm5,%%xmm1                 \n"
8641
0
      "movdqu      %%xmm0,(%1)                   \n"
8642
0
      "movdqu      %%xmm1,0x10(%1)               \n"
8643
0
      "lea         0x20(%1),%1                   \n"
8644
0
      "sub         $0x8,%2                       \n"
8645
0
      "jg          1b                            \n"
8646
0
               : "+r"(src_argb),  // %0
8647
0
                 "+r"(dst_argb),  // %1
8648
0
                 "+r"(width)      // %2
8649
0
               : "r"(shuffler)    // %3
8650
0
               : "memory", "cc", "xmm0", "xmm1", "xmm5");
8651
0
}
8652
#endif  // HAS_ARGBSHUFFLEROW_SSSE3
8653
8654
#ifdef HAS_ARGBSHUFFLEROW_AVX2
8655
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
8656
void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
8657
                         uint8_t* dst_argb,
8658
                         const uint8_t* shuffler,
8659
0
                         int width) {
8660
0
      asm volatile("vbroadcastf128 (%3),%%ymm5                \n"
8661
8662
0
               LABELALIGN
8663
0
      "1:          \n"
8664
0
      "vmovdqu     (%0),%%ymm0                   \n"
8665
0
      "vmovdqu     0x20(%0),%%ymm1               \n"
8666
0
      "lea         0x40(%0),%0                   \n"
8667
0
      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
8668
0
      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
8669
0
      "vmovdqu     %%ymm0,(%1)                   \n"
8670
0
      "vmovdqu     %%ymm1,0x20(%1)               \n"
8671
0
      "lea         0x40(%1),%1                   \n"
8672
0
      "sub         $0x10,%2                      \n"
8673
0
      "jg          1b                            \n"
8674
0
      "vzeroupper  \n"
8675
0
               : "+r"(src_argb),  // %0
8676
0
                 "+r"(dst_argb),  // %1
8677
0
                 "+r"(width)      // %2
8678
0
               : "r"(shuffler)    // %3
8679
0
               : "memory", "cc", "xmm0", "xmm1", "xmm5");
8680
0
}
8681
#endif  // HAS_ARGBSHUFFLEROW_AVX2
8682
8683
#ifdef HAS_I422TOYUY2ROW_SSE2
8684
void I422ToYUY2Row_SSE2(const uint8_t* src_y,
8685
                        const uint8_t* src_u,
8686
                        const uint8_t* src_v,
8687
                        uint8_t* dst_yuy2,
8688
0
                        int width) {
8689
0
      asm volatile("sub         %1,%2                         \n"
8690
8691
0
               LABELALIGN
8692
0
      "1:          \n"
8693
0
      "movq        (%1),%%xmm2                   \n"
8694
0
      "movq        0x00(%1,%2,1),%%xmm1          \n"
8695
0
      "add         $0x8,%1                       \n"
8696
0
      "punpcklbw   %%xmm1,%%xmm2                 \n"
8697
0
      "movdqu      (%0),%%xmm0                   \n"
8698
0
      "add         $0x10,%0                      \n"
8699
0
      "movdqa      %%xmm0,%%xmm1                 \n"
8700
0
      "punpcklbw   %%xmm2,%%xmm0                 \n"
8701
0
      "punpckhbw   %%xmm2,%%xmm1                 \n"
8702
0
      "movdqu      %%xmm0,(%3)                   \n"
8703
0
      "movdqu      %%xmm1,0x10(%3)               \n"
8704
0
      "lea         0x20(%3),%3                   \n"
8705
0
      "sub         $0x10,%4                      \n"
8706
0
      "jg          1b                            \n"
8707
0
               : "+r"(src_y),     // %0
8708
0
                 "+r"(src_u),     // %1
8709
0
                 "+r"(src_v),     // %2
8710
0
                 "+r"(dst_yuy2),  // %3
8711
0
                 "+rm"(width)     // %4
8712
0
               :
8713
0
               : "memory", "cc", "xmm0", "xmm1", "xmm2");
8714
0
}
8715
#endif  // HAS_I422TOYUY2ROW_SSE2
8716
8717
#ifdef HAS_I422TOUYVYROW_SSE2
8718
void I422ToUYVYRow_SSE2(const uint8_t* src_y,
8719
                        const uint8_t* src_u,
8720
                        const uint8_t* src_v,
8721
                        uint8_t* dst_uyvy,
8722
0
                        int width) {
8723
0
      asm volatile("sub         %1,%2                         \n"
8724
8725
0
               LABELALIGN
8726
0
      "1:          \n"
8727
0
      "movq        (%1),%%xmm2                   \n"
8728
0
      "movq        0x00(%1,%2,1),%%xmm1          \n"
8729
0
      "add         $0x8,%1                       \n"
8730
0
      "punpcklbw   %%xmm1,%%xmm2                 \n"
8731
0
      "movdqu      (%0),%%xmm0                   \n"
8732
0
      "movdqa      %%xmm2,%%xmm1                 \n"
8733
0
      "add         $0x10,%0                      \n"
8734
0
      "punpcklbw   %%xmm0,%%xmm1                 \n"
8735
0
      "punpckhbw   %%xmm0,%%xmm2                 \n"
8736
0
      "movdqu      %%xmm1,(%3)                   \n"
8737
0
      "movdqu      %%xmm2,0x10(%3)               \n"
8738
0
      "lea         0x20(%3),%3                   \n"
8739
0
      "sub         $0x10,%4                      \n"
8740
0
      "jg          1b                            \n"
8741
0
               : "+r"(src_y),     // %0
8742
0
                 "+r"(src_u),     // %1
8743
0
                 "+r"(src_v),     // %2
8744
0
                 "+r"(dst_uyvy),  // %3
8745
0
                 "+rm"(width)     // %4
8746
0
               :
8747
0
               : "memory", "cc", "xmm0", "xmm1", "xmm2");
8748
0
}
8749
#endif  // HAS_I422TOUYVYROW_SSE2
8750
8751
#ifdef HAS_I422TOYUY2ROW_AVX2
8752
void I422ToYUY2Row_AVX2(const uint8_t* src_y,
8753
                        const uint8_t* src_u,
8754
                        const uint8_t* src_v,
8755
                        uint8_t* dst_yuy2,
8756
0
                        int width) {
8757
0
      asm volatile("sub         %1,%2                         \n"
8758
8759
0
               LABELALIGN
8760
0
      "1:          \n"
8761
0
      "vpmovzxbw   (%1),%%ymm1                   \n"
8762
0
      "vpmovzxbw   0x00(%1,%2,1),%%ymm2          \n"
8763
0
      "add         $0x10,%1                      \n"
8764
0
      "vpsllw      $0x8,%%ymm2,%%ymm2            \n"
8765
0
      "vpor        %%ymm1,%%ymm2,%%ymm2          \n"
8766
0
      "vmovdqu     (%0),%%ymm0                   \n"
8767
0
      "add         $0x20,%0                      \n"
8768
0
      "vpunpcklbw  %%ymm2,%%ymm0,%%ymm1          \n"
8769
0
      "vpunpckhbw  %%ymm2,%%ymm0,%%ymm2          \n"
8770
0
      "vextractf128 $0x0,%%ymm1,(%3)             \n"
8771
0
      "vextractf128 $0x0,%%ymm2,0x10(%3)         \n"
8772
0
      "vextractf128 $0x1,%%ymm1,0x20(%3)         \n"
8773
0
      "vextractf128 $0x1,%%ymm2,0x30(%3)         \n"
8774
0
      "lea         0x40(%3),%3                   \n"
8775
0
      "sub         $0x20,%4                      \n"
8776
0
      "jg          1b                            \n"
8777
0
      "vzeroupper  \n"
8778
0
               : "+r"(src_y),     // %0
8779
0
                 "+r"(src_u),     // %1
8780
0
                 "+r"(src_v),     // %2
8781
0
                 "+r"(dst_yuy2),  // %3
8782
0
                 "+rm"(width)     // %4
8783
0
               :
8784
0
               : "memory", "cc", "xmm0", "xmm1", "xmm2");
8785
0
}
8786
#endif  // HAS_I422TOYUY2ROW_AVX2
8787
8788
#ifdef HAS_I422TOUYVYROW_AVX2
8789
void I422ToUYVYRow_AVX2(const uint8_t* src_y,
8790
                        const uint8_t* src_u,
8791
                        const uint8_t* src_v,
8792
                        uint8_t* dst_uyvy,
8793
0
                        int width) {
8794
0
      asm volatile("sub         %1,%2                         \n"
8795
8796
0
               LABELALIGN
8797
0
      "1:          \n"
8798
0
      "vpmovzxbw   (%1),%%ymm1                   \n"
8799
0
      "vpmovzxbw   0x00(%1,%2,1),%%ymm2          \n"
8800
0
      "add         $0x10,%1                      \n"
8801
0
      "vpsllw      $0x8,%%ymm2,%%ymm2            \n"
8802
0
      "vpor        %%ymm1,%%ymm2,%%ymm2          \n"
8803
0
      "vmovdqu     (%0),%%ymm0                   \n"
8804
0
      "add         $0x20,%0                      \n"
8805
0
      "vpunpcklbw  %%ymm0,%%ymm2,%%ymm1          \n"
8806
0
      "vpunpckhbw  %%ymm0,%%ymm2,%%ymm2          \n"
8807
0
      "vextractf128 $0x0,%%ymm1,(%3)             \n"
8808
0
      "vextractf128 $0x0,%%ymm2,0x10(%3)         \n"
8809
0
      "vextractf128 $0x1,%%ymm1,0x20(%3)         \n"
8810
0
      "vextractf128 $0x1,%%ymm2,0x30(%3)         \n"
8811
0
      "lea         0x40(%3),%3                   \n"
8812
0
      "sub         $0x20,%4                      \n"
8813
0
      "jg          1b                            \n"
8814
0
      "vzeroupper  \n"
8815
0
               : "+r"(src_y),     // %0
8816
0
                 "+r"(src_u),     // %1
8817
0
                 "+r"(src_v),     // %2
8818
0
                 "+r"(dst_uyvy),  // %3
8819
0
                 "+rm"(width)     // %4
8820
0
               :
8821
0
               : "memory", "cc", "xmm0", "xmm1", "xmm2");
8822
0
}
8823
#endif  // HAS_I422TOUYVYROW_AVX2
8824
8825
#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
8826
void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
8827
                            uint8_t* dst_argb,
8828
                            const float* poly,
8829
0
                            int width) {
8830
0
      asm volatile("pxor        %%xmm3,%%xmm3                 \n"
8831
8832
               // 2 pixel loop.
8833
0
               LABELALIGN
8834
0
      "1:          \n"
8835
0
      "movq        (%0),%%xmm0                   \n"
8836
0
      "lea         0x8(%0),%0                    \n"
8837
0
      "punpcklbw   %%xmm3,%%xmm0                 \n"
8838
0
      "movdqa      %%xmm0,%%xmm4                 \n"
8839
0
      "punpcklwd   %%xmm3,%%xmm0                 \n"
8840
0
      "punpckhwd   %%xmm3,%%xmm4                 \n"
8841
0
      "cvtdq2ps    %%xmm0,%%xmm0                 \n"
8842
0
      "cvtdq2ps    %%xmm4,%%xmm4                 \n"
8843
0
      "movdqa      %%xmm0,%%xmm1                 \n"
8844
0
      "movdqa      %%xmm4,%%xmm5                 \n"
8845
0
      "mulps       0x10(%3),%%xmm0               \n"
8846
0
      "mulps       0x10(%3),%%xmm4               \n"
8847
0
      "addps       (%3),%%xmm0                   \n"
8848
0
      "addps       (%3),%%xmm4                   \n"
8849
0
      "movdqa      %%xmm1,%%xmm2                 \n"
8850
0
      "movdqa      %%xmm5,%%xmm6                 \n"
8851
0
      "mulps       %%xmm1,%%xmm2                 \n"
8852
0
      "mulps       %%xmm5,%%xmm6                 \n"
8853
0
      "mulps       %%xmm2,%%xmm1                 \n"
8854
0
      "mulps       %%xmm6,%%xmm5                 \n"
8855
0
      "mulps       0x20(%3),%%xmm2               \n"
8856
0
      "mulps       0x20(%3),%%xmm6               \n"
8857
0
      "mulps       0x30(%3),%%xmm1               \n"
8858
0
      "mulps       0x30(%3),%%xmm5               \n"
8859
0
      "addps       %%xmm2,%%xmm0                 \n"
8860
0
      "addps       %%xmm6,%%xmm4                 \n"
8861
0
      "addps       %%xmm1,%%xmm0                 \n"
8862
0
      "addps       %%xmm5,%%xmm4                 \n"
8863
0
      "cvttps2dq   %%xmm0,%%xmm0                 \n"
8864
0
      "cvttps2dq   %%xmm4,%%xmm4                 \n"
8865
0
      "packuswb    %%xmm4,%%xmm0                 \n"
8866
0
      "packuswb    %%xmm0,%%xmm0                 \n"
8867
0
      "movq        %%xmm0,(%1)                   \n"
8868
0
      "lea         0x8(%1),%1                    \n"
8869
0
      "sub         $0x2,%2                       \n"
8870
0
      "jg          1b                            \n"
8871
0
               : "+r"(src_argb),  // %0
8872
0
                 "+r"(dst_argb),  // %1
8873
0
                 "+r"(width)      // %2
8874
0
               : "r"(poly)        // %3
8875
0
               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
8876
0
                 "xmm6");
8877
0
}
8878
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
8879
8880
#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
8881
void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
8882
                            uint8_t* dst_argb,
8883
                            const float* poly,
8884
0
                            int width) {
8885
0
  asm volatile(
8886
0
      "vbroadcastf128 (%3),%%ymm4                \n"
8887
0
      "vbroadcastf128 0x10(%3),%%ymm5            \n"
8888
0
      "vbroadcastf128 0x20(%3),%%ymm6            \n"
8889
0
      "vbroadcastf128 0x30(%3),%%ymm7            \n"
8890
8891
      // 2 pixel loop.
8892
0
      LABELALIGN
8893
0
      "1:          \n"
8894
0
      "vpmovzxbd   (%0),%%ymm0                   \n"  // 2 ARGB pixels
8895
0
      "lea         0x8(%0),%0                    \n"
8896
0
      "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
8897
0
      "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
8898
0
      "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
8899
0
      "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
8900
0
      "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
8901
0
      "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X *
8902
                                                      // X
8903
0
      "vcvttps2dq  %%ymm0,%%ymm0                 \n"
8904
0
      "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
8905
0
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
8906
0
      "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
8907
0
      "vmovq       %%xmm0,(%1)                   \n"
8908
0
      "lea         0x8(%1),%1                    \n"
8909
0
      "sub         $0x2,%2                       \n"
8910
0
      "jg          1b                            \n"
8911
0
      "vzeroupper  \n"
8912
0
      : "+r"(src_argb),  // %0
8913
0
        "+r"(dst_argb),  // %1
8914
0
        "+r"(width)      // %2
8915
0
      : "r"(poly)        // %3
8916
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
8917
0
        "xmm7");
8918
0
}
8919
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
8920
8921
#ifdef HAS_HALFFLOATROW_SSE2
8922
static float kScaleBias = 1.9259299444e-34f;
8923
void HalfFloatRow_SSE2(const uint16_t* src,
8924
                       uint16_t* dst,
8925
                       float scale,
8926
0
                       int width) {
8927
0
  scale *= kScaleBias;
8928
0
  asm volatile(
8929
0
      "movd        %3,%%xmm4                     \n"
8930
0
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
8931
0
      "pxor        %%xmm5,%%xmm5                 \n"
8932
0
      "sub         %0,%1                         \n"
8933
8934
      // 16 pixel loop.
8935
0
      LABELALIGN
8936
0
      "1:          \n"
8937
0
      "movdqu      (%0),%%xmm2                   \n"  // 8 shorts
8938
0
      "add         $0x10,%0                      \n"
8939
0
      "movdqa      %%xmm2,%%xmm3                 \n"
8940
0
      "punpcklwd   %%xmm5,%%xmm2                 \n"  // 8 ints in xmm2/1
8941
0
      "cvtdq2ps    %%xmm2,%%xmm2                 \n"  // 8 floats
8942
0
      "punpckhwd   %%xmm5,%%xmm3                 \n"
8943
0
      "cvtdq2ps    %%xmm3,%%xmm3                 \n"
8944
0
      "mulps       %%xmm4,%%xmm2                 \n"
8945
0
      "mulps       %%xmm4,%%xmm3                 \n"
8946
0
      "psrld       $0xd,%%xmm2                   \n"
8947
0
      "psrld       $0xd,%%xmm3                   \n"
8948
0
      "packssdw    %%xmm3,%%xmm2                 \n"
8949
0
      "movdqu      %%xmm2,-0x10(%0,%1,1)         \n"
8950
0
      "sub         $0x8,%2                       \n"
8951
0
      "jg          1b                            \n"
8952
0
      : "+r"(src),   // %0
8953
0
        "+r"(dst),   // %1
8954
0
        "+r"(width)  // %2
8955
0
      : "m"(scale)   // %3
8956
0
      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
8957
0
}
8958
#endif  // HAS_HALFFLOATROW_SSE2
8959
8960
#ifdef HAS_HALFFLOATROW_AVX2
8961
void HalfFloatRow_AVX2(const uint16_t* src,
8962
                       uint16_t* dst,
8963
                       float scale,
8964
468
                       int width) {
8965
468
  scale *= kScaleBias;
8966
468
  asm volatile(
8967
468
      "vbroadcastss %3, %%ymm4                   \n"
8968
468
      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
8969
468
      "sub         %0,%1                         \n"
8970
8971
      // 16 pixel loop.
8972
468
      LABELALIGN
8973
468
      "1:          \n"
8974
468
      "vmovdqu     (%0),%%ymm2                   \n"  // 16 shorts
8975
468
      "add         $0x20,%0                      \n"
8976
468
      "vpunpckhwd  %%ymm5,%%ymm2,%%ymm3          \n"  // mutates
8977
468
      "vpunpcklwd  %%ymm5,%%ymm2,%%ymm2          \n"
8978
468
      "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
8979
468
      "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
8980
468
      "vmulps      %%ymm3,%%ymm4,%%ymm3          \n"
8981
468
      "vmulps      %%ymm2,%%ymm4,%%ymm2          \n"
8982
468
      "vpsrld      $0xd,%%ymm3,%%ymm3            \n"
8983
468
      "vpsrld      $0xd,%%ymm2,%%ymm2            \n"
8984
468
      "vpackssdw   %%ymm3, %%ymm2, %%ymm2        \n"  // unmutates
8985
468
      "vmovdqu     %%ymm2,-0x20(%0,%1,1)         \n"
8986
468
      "sub         $0x10,%2                      \n"
8987
468
      "jg          1b                            \n"
8988
8989
468
      "vzeroupper  \n"
8990
468
      : "+r"(src),   // %0
8991
468
        "+r"(dst),   // %1
8992
468
        "+r"(width)  // %2
8993
468
#if defined(__x86_64__)
8994
468
      : "x"(scale)  // %3
8995
#else
8996
      : "m"(scale)    // %3
8997
#endif
8998
468
      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
8999
468
}
9000
#endif  // HAS_HALFFLOATROW_AVX2
9001
9002
#ifdef HAS_HALFFLOATROW_F16C
9003
// Convert a row of 16-bit samples to scaled half-floats (FP16) using the
// F16C vcvtps2ph instruction: dst[i] = (uint16 half)(src[i] * scale).
// Processes 16 pixels per iteration; width is assumed to be handled in
// multiples of 16 by the caller (any-width wrappers) -- TODO confirm.
void HalfFloatRow_F16C(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  asm volatile(
      // Splat 'scale' across all 8 float lanes of ymm4.
      "vbroadcastss %3, %%ymm4                   \n"
      // %1 becomes (dst - src) so dst can be addressed as (%0,%1,1)
      // while only the src pointer is advanced in the loop.
      "sub         %0,%1                         \n"

      // 16 pixel loop.
      LABELALIGN
      "1:          \n"
      "vpmovzxwd   (%0),%%ymm2                   \n"  // 16 shorts -> 16 ints
      "vpmovzxwd   0x10(%0),%%ymm3               \n"
      "vcvtdq2ps   %%ymm2,%%ymm2                 \n"  // ints -> floats
      "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
      "vmulps      %%ymm2,%%ymm4,%%ymm2          \n"  // apply scale
      "vmulps      %%ymm3,%%ymm4,%%ymm3          \n"
      // imm $3 selects round-toward-zero (truncate) conversion to FP16.
      "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"
      "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
      "vmovdqu     %%xmm2,0x00(%0,%1,1)          \n"  // store 16 halfs to dst
      "vmovdqu     %%xmm3,0x10(%0,%1,1)          \n"
      "add         $0x20,%0                      \n"  // advance 16 pixels
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
#if defined(__x86_64__)
      : "x"(scale)  // %3
#else
      : "m"(scale)    // %3
#endif
      : "memory", "cc", "xmm2", "xmm3", "xmm4");
}
9038
#endif  // HAS_HALFFLOATROW_F16C
9039
9040
#ifdef HAS_HALFFLOATROW_F16C
9041
// Convert a row of 16-bit samples to half-floats with an implicit scale of
// 1.0 (the float parameter is intentionally unnamed/ignored); otherwise the
// same structure as HalfFloatRow_F16C. 16 pixels per iteration.
void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
  asm volatile(
      // %1 becomes (dst - src); dst is addressed as (%0,%1,1) below.
      "sub         %0,%1                         \n"
      // 16 pixel loop.
      LABELALIGN
      "1:          \n"
      "vpmovzxwd   (%0),%%ymm2                   \n"  // 16 shorts -> 16 ints
      "vpmovzxwd   0x10(%0),%%ymm3               \n"
      "vcvtdq2ps   %%ymm2,%%ymm2                 \n"  // ints -> floats
      "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
      // imm $3 selects round-toward-zero (truncate) conversion to FP16.
      "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"
      "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
      "vmovdqu     %%xmm2,0x00(%0,%1,1)          \n"  // store 16 halfs to dst
      "vmovdqu     %%xmm3,0x10(%0,%1,1)          \n"
      "add         $0x20,%0                      \n"  // advance 16 pixels
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm2", "xmm3");
}
9065
#endif  // HAS_HALFFLOATROW_F16C
9066
9067
#ifdef HAS_ARGBCOLORTABLEROW_X86
9068
// Transform ARGB pixels with a color table, in place.
// Each of the 4 channels of every pixel is replaced by a lookup in
// table_argb; the table is laid out as 4 bytes per index so channel c of
// value v maps to table_argb[v * 4 + c]. 1 pixel per loop iteration.
void ARGBColorTableRow_X86(uint8_t* dst_argb,
                           const uint8_t* table_argb,
                           int width) {
  // Scratch register (constrained to rdx/edx so %b1 names its low byte).
  uintptr_t pixel_temp;
  asm volatile(
      // 1 pixel loop.
      LABELALIGN
      "1:          \n"
      // Blue channel: load byte, index table, store back.
      "movzb       (%0),%1                       \n"
      "lea         0x4(%0),%0                    \n"  // advance to next pixel
      "movzb       0x00(%3,%1,4),%1              \n"
      "mov         %b1,-0x4(%0)                  \n"
      // Green channel.
      "movzb       -0x3(%0),%1                   \n"
      "movzb       0x01(%3,%1,4),%1              \n"
      "mov         %b1,-0x3(%0)                  \n"
      // Red channel.
      "movzb       -0x2(%0),%1                   \n"
      "movzb       0x02(%3,%1,4),%1              \n"
      "mov         %b1,-0x2(%0)                  \n"
      // Alpha channel.
      "movzb       -0x1(%0),%1                   \n"
      "movzb       0x03(%3,%1,4),%1              \n"
      "mov         %b1,-0x1(%0)                  \n"
      "dec         %2                            \n"
      "jg          1b                            \n"
      : "+r"(dst_argb),     // %0
        "=&d"(pixel_temp),  // %1
        "+r"(width)         // %2
      : "r"(table_argb)     // %3
      : "memory", "cc");
}
9098
#endif  // HAS_ARGBCOLORTABLEROW_X86
9099
9100
#ifdef HAS_RGBCOLORTABLEROW_X86
9101
// Transform RGB pixels with a color table, in place.
// Same as ARGBColorTableRow_X86 but only the B, G and R channels are
// remapped; the alpha byte of each 4-byte pixel is left untouched.
void RGBColorTableRow_X86(uint8_t* dst_argb,
                          const uint8_t* table_argb,
                          int width) {
  // Scratch register (constrained to rdx/edx so %b1 names its low byte).
  uintptr_t pixel_temp;
  asm volatile(
      // 1 pixel loop.
      LABELALIGN
      "1:          \n"
      // Blue channel.
      "movzb       (%0),%1                       \n"
      "lea         0x4(%0),%0                    \n"  // advance to next pixel
      "movzb       0x00(%3,%1,4),%1              \n"
      "mov         %b1,-0x4(%0)                  \n"
      // Green channel.
      "movzb       -0x3(%0),%1                   \n"
      "movzb       0x01(%3,%1,4),%1              \n"
      "mov         %b1,-0x3(%0)                  \n"
      // Red channel (alpha at -0x1 is intentionally skipped).
      "movzb       -0x2(%0),%1                   \n"
      "movzb       0x02(%3,%1,4),%1              \n"
      "mov         %b1,-0x2(%0)                  \n"
      "dec         %2                            \n"
      "jg          1b                            \n"
      : "+r"(dst_argb),     // %0
        "=&d"(pixel_temp),  // %1
        "+r"(width)         // %2
      : "r"(table_argb)     // %3
      : "memory", "cc");
}
9128
#endif  // HAS_RGBCOLORTABLEROW_X86
9129
9130
#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
9131
// Transform ARGB pixels with a luma-dependent color table.
// For each pixel a luma value is computed from its B/G/R channels with the
// coefficients packed in 'lumacoeff' (pmaddubsw + phaddw), masked to a
// multiple of 256 (pand with 0xff00) and used to select a 256-byte table
// within 'luma' (table base = luma + luma_value). B, G and R are remapped
// through that table; alpha is copied through unchanged. 4 pixels per loop.
void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
                                 uint8_t* dst_argb,
                                 int width,
                                 const uint8_t* luma,
                                 uint32_t lumacoeff) {
  // Byte-addressable scratch (edx/eax fixed so %b0 / %k1 are valid).
  uintptr_t pixel_temp;
  uintptr_t table_temp;
  asm volatile(
      "movd        %6,%%xmm3                     \n"  // xmm3 = luma coefficients
      "pshufd      $0x0,%%xmm3,%%xmm3            \n"  // splat to all 4 pixels
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psllw       $0x8,%%xmm4                   \n"  // xmm4 = 0xff00 word mask
      "pxor        %%xmm5,%%xmm5                 \n"  // xmm5 = zero

      // 4 pixel loop.
      LABELALIGN
      "1:          \n"
      "movdqu      (%2),%%xmm0                   \n"  // load 4 ARGB pixels
      "pmaddubsw   %%xmm3,%%xmm0                 \n"  // weight channels
      "phaddw      %%xmm0,%%xmm0                 \n"  // sum to 4 luma words
      "pand        %%xmm4,%%xmm0                 \n"  // round down to 256 step
      "punpcklwd   %%xmm5,%%xmm0                 \n"  // widen to 4 dwords
      "movd        %%xmm0,%k1                    \n"  // 32 bit offset
      "add         %5,%1                         \n"  // table = luma + offset
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"  // rotate next offset down

      // Pixel 0: remap B, G, R through table; copy A.
      "movzb       (%2),%0                       \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,(%3)                      \n"
      "movzb       0x1(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x1(%3)                   \n"
      "movzb       0x2(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x2(%3)                   \n"
      "movzb       0x3(%2),%0                    \n"
      "mov         %b0,0x3(%3)                   \n"

      "movd        %%xmm0,%k1                    \n"  // 32 bit offset
      "add         %5,%1                         \n"
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"

      // Pixel 1.
      "movzb       0x4(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x4(%3)                   \n"
      "movzb       0x5(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x5(%3)                   \n"
      "movzb       0x6(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x6(%3)                   \n"
      "movzb       0x7(%2),%0                    \n"
      "mov         %b0,0x7(%3)                   \n"

      "movd        %%xmm0,%k1                    \n"  // 32 bit offset
      "add         %5,%1                         \n"
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"

      // Pixel 2.
      "movzb       0x8(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x8(%3)                   \n"
      "movzb       0x9(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x9(%3)                   \n"
      "movzb       0xa(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0xa(%3)                   \n"
      "movzb       0xb(%2),%0                    \n"
      "mov         %b0,0xb(%3)                   \n"

      "movd        %%xmm0,%k1                    \n"  // 32 bit offset
      "add         %5,%1                         \n"

      // Pixel 3.
      "movzb       0xc(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0xc(%3)                   \n"
      "movzb       0xd(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0xd(%3)                   \n"
      "movzb       0xe(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0xe(%3)                   \n"
      "movzb       0xf(%2),%0                    \n"
      "mov         %b0,0xf(%3)                   \n"
      "lea         0x10(%2),%2                   \n"  // advance 4 src pixels
      "lea         0x10(%3),%3                   \n"  // advance 4 dst pixels
      "sub         $0x4,%4                       \n"
      "jg          1b                            \n"
      : "=&d"(pixel_temp),  // %0
        "=&a"(table_temp),  // %1
        "+r"(src_argb),     // %2
        "+r"(dst_argb),     // %3
        "+rm"(width)        // %4
      : "r"(luma),          // %5
        "rm"(lumacoeff)     // %6
      : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5");
}
9229
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
9230
9231
// pshufb tables used by NV21ToYUV24Row_SSSE3/AVX2 to interleave 16 Y bytes
// with 8 VU byte pairs (pre-combined by shufps) into 48 bytes of packed
// VUY triplets. Indices 0..7 select Y bytes, 8..15 select VU bytes of the
// shuffled source register.
static const uvec8 kYUV24Shuffle[3] = {
    {8, 9, 0, 8, 9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12},
    {9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15},
    {2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15, 6, 14, 15, 7}};
9235
9236
// Convert biplanar NV21 to packed YUV24.
// NV21 has VU in memory for chroma.
// YUV24 is VUY in memory.
// Each VU pair is duplicated across the two Y samples it covers.
// 16 pixels (48 output bytes) per loop iteration.
void NV21ToYUV24Row_SSSE3(const uint8_t* src_y,
                          const uint8_t* src_vu,
                          uint8_t* dst_yuv24,
                          int width) {
  asm volatile(
      // %1 becomes (src_vu - src_y) so the VU row can be loaded relative
      // to the Y pointer while only %0 is advanced.
      "sub         %0,%1                         \n"
      "movdqa      (%4),%%xmm4                   \n"  // 3 shuffler constants
      "movdqa      16(%4),%%xmm5                 \n"
      "movdqa      32(%4),%%xmm6                 \n"
      "1:          \n"
      "movdqu      (%0),%%xmm2                   \n"  // load 16 Y values
      "movdqu      (%0,%1),%%xmm3                \n"  // load 8 VU values
      "lea         16(%0),%0                     \n"
      "movdqa      %%xmm2,%%xmm0                 \n"
      "movdqa      %%xmm2,%%xmm1                 \n"
      // Combine Y and VU windows so each register holds the Y bytes and
      // the VU bytes needed for one third of the output row.
      "shufps      $0x44,%%xmm3,%%xmm0           \n"  // Y 0..7,  UV 0..3
      "shufps      $0x99,%%xmm3,%%xmm1           \n"  // Y 4..11, UV 2..5
      "shufps      $0xee,%%xmm3,%%xmm2           \n"  // Y 8..15, UV 4..7
      "pshufb      %%xmm4, %%xmm0                \n"  // weave into YUV24
      "pshufb      %%xmm5, %%xmm1                \n"
      "pshufb      %%xmm6, %%xmm2                \n"
      "movdqu      %%xmm0,(%2)                   \n"  // store 48 bytes
      "movdqu      %%xmm1,16(%2)                 \n"
      "movdqu      %%xmm2,32(%2)                 \n"
      "lea         48(%2),%2                     \n"
      "sub         $16,%3                        \n"  // 16 pixels per loop
      "jg          1b                            \n"
      : "+r"(src_y),            // %0
        "+r"(src_vu),           // %1
        "+r"(dst_yuv24),        // %2
        "+r"(width)             // %3
      : "r"(&kYUV24Shuffle[0])  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
9273
9274
// Convert biplanar NV21 to packed YUV24 (AVX2 version).
// NV21 has VU in memory for chroma.
// YUV24 is VUY in memory.
// Same scheme as the SSSE3 version run on both 128-bit lanes, followed by
// vperm2i128 fixups to restore linear output order. 32 pixels per loop.
void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_yuv24,
                         int width) {
  asm volatile(
      // %1 becomes (src_vu - src_y); VU loads are relative to the Y pointer.
      "sub         %0,%1                         \n"
      "vbroadcastf128 (%4),%%ymm4                \n"  // 3 shuffler constants
      "vbroadcastf128 16(%4),%%ymm5              \n"
      "vbroadcastf128 32(%4),%%ymm6              \n"

      "1:          \n"
      "vmovdqu     (%0),%%ymm2                   \n"  // load 32 Y values
      "vmovdqu     (%0,%1),%%ymm3                \n"  // load 16 VU values
      "lea         32(%0),%0                     \n"
      // Per-lane combine of Y and VU (comments describe the low lane).
      "vshufps     $0x44,%%ymm3,%%ymm2,%%ymm0    \n"  // Y 0..7,  UV 0..3
      "vshufps     $0x99,%%ymm3,%%ymm2,%%ymm1    \n"  // Y 4..11, UV 2..5
      "vshufps     $0xee,%%ymm3,%%ymm2,%%ymm2    \n"  // Y 8..15, UV 4..7
      "vpshufb     %%ymm4,%%ymm0,%%ymm0          \n"  // weave into YUV24
      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
      "vpshufb     %%ymm6,%%ymm2,%%ymm2          \n"
      // Re-order 128-bit lanes so the 96 output bytes come out linear.
      "vperm2i128  $0x20,%%ymm1,%%ymm0,%%ymm3    \n"
      "vperm2i128  $0x30,%%ymm0,%%ymm2,%%ymm0    \n"
      "vperm2i128  $0x31,%%ymm2,%%ymm1,%%ymm1    \n"
      "vmovdqu     %%ymm3,(%2)                   \n"
      "vmovdqu     %%ymm0,32(%2)                 \n"
      "vmovdqu     %%ymm1,64(%2)                 \n"
      "lea         96(%2),%2                     \n"
      "sub         $32,%3                        \n"  // 32 pixels per loop
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_y),            // %0
        "+r"(src_vu),           // %1
        "+r"(dst_yuv24),        // %2
        "+r"(width)             // %3
      : "r"(&kYUV24Shuffle[0])  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
9314
9315
#ifdef HAS_NV21ToYUV24ROW_AVX512
9316
// The following VBMI VEX256 code tests okay with the intelsde emulator.
// vpermt2b tables for NV21ToYUV24Row_AVX512: indices 0..31 select bytes of
// the Y register, 32..63 select bytes of the VU register, producing the
// 96-byte packed VUY output directly without lane fixups.
static const lvec8 kYUV24Perm[3] = {
    {32, 33, 0,  32, 33, 1,  34, 35, 2,  34, 35, 3,  36, 37, 4,  36,
     37, 5,  38, 39, 6,  38, 39, 7,  40, 41, 8,  40, 41, 9,  42, 43},
    {10, 42, 43, 11, 44, 45, 12, 44, 45, 13, 46, 47, 14, 46, 47, 15,
     48, 49, 16, 48, 49, 17, 50, 51, 18, 50, 51, 19, 52, 53, 20, 52},
    {53, 21, 54, 55, 22, 54, 55, 23, 56, 57, 24, 56, 57, 25, 58, 59,
     26, 58, 59, 27, 60, 61, 28, 60, 61, 29, 62, 63, 30, 62, 63, 31}};
9324
9325
// Convert biplanar NV21 to packed YUV24 using AVX512-VBMI vpermt2b, which
// can pick bytes from two source registers in one instruction (see
// kYUV24Perm). 32 pixels (96 output bytes) per loop iteration.
void NV21ToYUV24Row_AVX512(const uint8_t* src_y,
                           const uint8_t* src_vu,
                           uint8_t* dst_yuv24,
                           int width) {
  asm volatile(
      // %1 becomes (src_vu - src_y); VU loads are relative to the Y pointer.
      "sub         %0,%1                         \n"
      "vmovdqa     (%4),%%ymm4                   \n"  // 3 shuffler constants
      "vmovdqa     32(%4),%%ymm5                 \n"
      "vmovdqa     64(%4),%%ymm6                 \n" LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm2                   \n"  // load 32 Y values
      "vmovdqu     (%0,%1),%%ymm3                \n"  // load 16 VU values
      "lea         32(%0),%0                     \n"
      // vpermt2b overwrites its first operand, so copy Y twice.
      "vmovdqa     %%ymm2, %%ymm0                \n"
      "vmovdqa     %%ymm2, %%ymm1                \n"
      "vpermt2b    %%ymm3,%%ymm4,%%ymm0          \n"  // bytes 0..31 of output
      "vpermt2b    %%ymm3,%%ymm5,%%ymm1          \n"  // bytes 32..63
      "vpermt2b    %%ymm3,%%ymm6,%%ymm2          \n"  // bytes 64..95
      "vmovdqu     %%ymm0,(%2)                   \n"
      "vmovdqu     %%ymm1,32(%2)                 \n"
      "vmovdqu     %%ymm2,64(%2)                 \n"
      "lea         96(%2),%2                     \n"
      "sub         $32,%3                        \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_y),         // %0
        "+r"(src_vu),        // %1
        "+r"(dst_yuv24),     // %2
        "+r"(width)          // %3
      : "r"(&kYUV24Perm[0])  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
9357
9358
#endif  // HAS_NV21ToYUV24ROW_AVX512
9359
9360
#ifdef HAS_SWAPUVROW_SSSE3
9361
9362
// Shuffle table for swapping each pair of bytes (UV -> VU) with pshufb.
static const uvec8 kShuffleUVToVU = {1u, 0u, 3u,  2u,  5u,  4u,  7u,  6u,
                                     9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
9365
9366
// Convert UV plane of NV12 to VU of NV21: swap the bytes of every UV pair
// via pshufb with kShuffleUVToVU. 16 pixels (32 bytes) per loop iteration.
void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
      asm volatile("movdqu      %3,%%xmm5                     \n"  // byte-swap shuffle mask

               LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"  // load 16 UV pairs
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pshufb      %%xmm5,%%xmm0                 \n"  // UV -> VU per pair
      "pshufb      %%xmm5,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
               : "+r"(src_uv),        // %0
                 "+r"(dst_vu),        // %1
                 "+r"(width)          // %2
               : "m"(kShuffleUVToVU)  // %3
               : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
9388
#endif  // HAS_SWAPUVROW_SSSE3
9389
9390
#ifdef HAS_SWAPUVROW_AVX2
9391
0
// AVX2 version of SwapUVRow: swap the bytes of every UV pair using vpshufb
// with kShuffleUVToVU broadcast to both lanes. 32 pixels (64 bytes) per loop.
void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
      asm volatile("vbroadcastf128 %3,%%ymm5                  \n"  // byte-swap mask in both lanes

               LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // load 32 UV pairs
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"  // UV -> VU per pair
      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
               : "+r"(src_uv),        // %0
                 "+r"(dst_vu),        // %1
                 "+r"(width)          // %2
               : "m"(kShuffleUVToVU)  // %3
               : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
9413
#endif  // HAS_SWAPUVROW_AVX2
9414
9415
// 2x2 box-filter two rows of planar U and V down to one row of half-width
// interleaved UV. For each output sample: horizontal pairs are summed with
// pmaddubsw (coefficients 1,1), the two rows are added, then psrlw+pavgw
// compute (sum + 2) >> 2 with rounding. 16 source pixels -> 8 UV pairs per
// loop iteration.
void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
                          int src_stride_u,
                          const uint8_t* src_v,
                          int src_stride_v,
                          uint8_t* dst_uv,
                          int width) {
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"  // 0x0101
      "pabsb       %%xmm4,%%xmm4                 \n"
      "pxor        %%xmm5,%%xmm5                 \n"  // zero, for rounding pavgw

      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0                   \n"  // load 16 U values
      "movdqu      (%1),%%xmm1                   \n"  // load 16 V values
      "movdqu      0(%0,%4,1),%%xmm2             \n"  // 16 from next row
      "movdqu      0(%1,%5,1),%%xmm3             \n"
      "lea         0x10(%0),%0                   \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"  // half size
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
      "lea         0x10(%1),%1                   \n"
      "paddw       %%xmm2,%%xmm0                 \n"  // add rows: sum of 4
      "paddw       %%xmm3,%%xmm1                 \n"
      "psrlw       $0x1,%%xmm0                   \n"  // sum/2
      "psrlw       $0x1,%%xmm1                   \n"
      "pavgw       %%xmm5,%%xmm0                 \n"  // (sum/2 + 1) / 2, rounded
      "pavgw       %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "packuswb    %%xmm1,%%xmm1                 \n"
      "punpcklbw   %%xmm1,%%xmm0                 \n"  // interleave U and V
      "movdqu      %%xmm0,(%2)                   \n"  // store 8 UV pixels
      "lea         0x10(%2),%2                   \n"
      "sub         $0x10,%3                      \n"  // 16 src pixels per loop
      "jg          1b                            \n"
      : "+r"(src_u),                    // %0
        "+r"(src_v),                    // %1
        "+r"(dst_uv),                   // %2
        "+r"(width)                     // %3
      : "r"((intptr_t)(src_stride_u)),  // %4
        "r"((intptr_t)(src_stride_v))   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
9459
9460
// AVX2 version of HalfMergeUVRow: 2x2 box-filter two rows of planar U and V
// into one row of half-width interleaved UV with rounding ((sum + 2) >> 2).
// 32 source pixels -> 16 UV pairs per loop iteration.
// NOTE(review): vpackuswb/vpunpcklbw operate per 128-bit lane, so output
// ordering relies on the pack/unpack pair canceling each other's lane
// mutation -- keep these three instructions together.
void HalfMergeUVRow_AVX2(const uint8_t* src_u,
                         int src_stride_u,
                         const uint8_t* src_v,
                         int src_stride_v,
                         uint8_t* dst_uv,
                         int width) {
  asm volatile(
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpabsb      %%ymm4,%%ymm4                 \n"  // 0x0101 pair coefficients
      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"  // zero, for rounding vpavgw

      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // load 32 U values
      "vmovdqu     (%1),%%ymm1                   \n"  // load 32 V values
      "vmovdqu     0(%0,%4,1),%%ymm2             \n"  // 32 from next row
      "vmovdqu     0(%1,%5,1),%%ymm3             \n"
      "lea         0x20(%0),%0                   \n"
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"  // half size
      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "lea         0x20(%1),%1                   \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"  // add rows: sum of 4
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
      "vpsrlw      $0x1,%%ymm0,%%ymm0            \n"  // sum/2
      "vpsrlw      $0x1,%%ymm1,%%ymm1            \n"
      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"  // (sum/2 + 1) / 2, rounded
      "vpavgw      %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
      "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"  // interleave U and V
      "vmovdqu     %%ymm0,(%2)                   \n"  // store 16 UV pixels
      "lea         0x20(%2),%2                   \n"
      "sub         $0x20,%3                      \n"  // 32 src pixels per loop
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_u),                    // %0
        "+r"(src_v),                    // %1
        "+r"(dst_uv),                   // %2
        "+r"(width)                     // %3
      : "r"((intptr_t)(src_stride_u)),  // %4
        "r"((intptr_t)(src_stride_v))   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
9505
9506
0
void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) {
9507
0
  asm volatile(
9508
0
      "pxor        %%xmm1,%%xmm1                 \n"
9509
9510
0
      LABELALIGN
9511
0
      "1:          \n"
9512
0
      "movd        (%0),%%xmm0                   \n"  // load float
9513
0
      "maxss       %%xmm1, %%xmm0                \n"  // clamp to zero
9514
0
      "add         4, %0                         \n"
9515
0
      "movd        %%xmm0, (%1)                  \n"  // store float
9516
0
      "add         4, %1                         \n"
9517
0
      "sub         $0x4,%2                       \n"  // 1 float per loop
9518
0
      "jg          1b                            \n"
9519
0
      : "+r"(src_x),  // %0
9520
0
        "+r"(dst_y),  // %1
9521
0
        "+r"(width)   // %2
9522
0
      :
9523
0
      : "memory", "cc", "xmm0", "xmm1");
9524
0
}
9525
9526
#ifdef HAS_CONVERT16TO8ROW_AVX2
9527
// Scale and bias a row of 8-bit samples: each pixel is widened to 16 bits,
// multiplied by 'scale' in fixed point (scale is shifted left 8 so vpmulhuw
// yields (pixel * scale) >> 8), narrowed back to 8 bits with unsigned
// saturation, then 'bias' is added per byte (vpaddb, wraps modulo 256).
// 32 pixels per loop iteration.
// NOTE(review): the enclosing guard is HAS_CONVERT16TO8ROW_AVX2, which does
// not match this function's name -- confirm against row.h.
void Convert8To8Row_AVX2(const uint8_t* src_y,
                         uint8_t* dst_y,
                         int scale,
                         int bias,
                         int width) {
  asm volatile(
      // %1 becomes (dst_y - src_y); stores go to (%0,%1) as %0 advances.
      "sub         %0,%1                         \n"
      "vmovd       %3,%%xmm2                     \n"
      "vmovd       %4,%%xmm3                     \n"
      "vpbroadcastw %%xmm2,%%ymm2                \n"  // scale in every word
      "vpbroadcastb %%xmm3,%%ymm3                \n"  // bias in every byte
      "vpxor       %%ymm4,%%ymm4,%%ymm4          \n"  // zero for widening
      "vpsllw      $8,%%ymm2,%%ymm2              \n"  // scale << 8 for vpmulhuw

      // 32 pixels per loop.
      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vpunpckhbw  %%ymm4,%%ymm0,%%ymm1          \n"  // mutates
      "vpunpcklbw  %%ymm4,%%ymm0,%%ymm0          \n"
      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"  // (pixel * scale) >> 8
      "vpmulhuw    %%ymm2,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"  // unmutates
      "vpaddb      %%ymm3,%%ymm0,%%ymm0          \n"  // add bias
      "vmovdqu     %%ymm0,(%0,%1)                \n"
      "add         $0x20,%0                      \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_y),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)   // %2
      : "r"(scale),   // %3
        "r"(bias)     // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
9563
#endif  // HAS_CONVERT16TO8ROW_AVX2
9564
9565
#endif  // defined(__x86_64__) || defined(__i386__)
9566
9567
#ifdef __cplusplus
9568
}  // extern "C"
9569
}  // namespace libyuv
9570
#endif