Coverage Report

Created: 2025-07-11 06:43

/src/libavif/ext/libyuv/source/scale_gcc.cc
Line
Count
Source (jump to first uncovered line)
1
/*
2
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
3
 *
4
 *  Use of this source code is governed by a BSD-style license
5
 *  that can be found in the LICENSE file in the root of the source
6
 *  tree. An additional intellectual property rights grant can be found
7
 *  in the file PATENTS. All contributing project authors may
8
 *  be found in the AUTHORS file in the root of the source tree.
9
 */
10
11
#include "libyuv/row.h"
12
#include "libyuv/scale_row.h"
13
14
#ifdef __cplusplus
15
namespace libyuv {
16
extern "C" {
17
#endif
18
19
// This module is for GCC x86 and x64.
20
#if !defined(LIBYUV_DISABLE_X86) &&               \
21
    (defined(__x86_64__) || defined(__i386__)) && \
22
    !defined(LIBYUV_ENABLE_ROWWIN)
23
24
// Offsets for source bytes 0 to 9
25
static const uvec8 kShuf0 = {0,   1,   3,   4,   5,   7,   8,   9,
26
                             128, 128, 128, 128, 128, 128, 128, 128};
27
28
// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
29
static const uvec8 kShuf1 = {3,   4,   5,   7,   8,   9,   11,  12,
30
                             128, 128, 128, 128, 128, 128, 128, 128};
31
32
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
33
static const uvec8 kShuf2 = {5,   7,   8,   9,   11,  12,  13,  15,
34
                             128, 128, 128, 128, 128, 128, 128, 128};
35
36
// Offsets for source bytes 0 to 10
37
static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
38
39
// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
40
static const uvec8 kShuf11 = {2, 3, 4, 5,  5,  6,  6,  7,
41
                              8, 9, 9, 10, 10, 11, 12, 13};
42
43
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
44
static const uvec8 kShuf21 = {5,  6,  6,  7,  8,  9,  9,  10,
45
                              10, 11, 12, 13, 13, 14, 14, 15};
46
47
// Coefficients for source bytes 0 to 10
48
static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
49
50
// Coefficients for source bytes 10 to 21
51
static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
52
53
// Coefficients for source bytes 21 to 31
54
static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
55
56
// Coefficients for source bytes 21 to 31
57
static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
58
59
static const uvec8 kShuf38a = {0,   3,   6,   8,   11,  14,  128, 128,
60
                               128, 128, 128, 128, 128, 128, 128, 128};
61
62
static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0,   3,
63
                               6,   8,   11,  14,  128, 128, 128, 128};
64
65
// Arrange words 0,3,6 into 0,1,2
66
static const uvec8 kShufAc = {0,   1,   6,   7,   12,  13,  128, 128,
67
                              128, 128, 128, 128, 128, 128, 128, 128};
68
69
// Arrange words 0,3,6 into 3,4,5
70
static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0,   1,
71
                               6,   7,   12,  13,  128, 128, 128, 128};
72
73
// Scaling values for boxes of 3x3 and 2x3
74
static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
75
                                  65536 / 9, 65536 / 6, 0,         0};
76
77
// Arrange first value for pixels 0,1,2,3,4,5
78
static const uvec8 kShufAb0 = {0,  128, 3,  128, 6,   128, 8,   128,
79
                               11, 128, 14, 128, 128, 128, 128, 128};
80
81
// Arrange second value for pixels 0,1,2,3,4,5
82
static const uvec8 kShufAb1 = {1,  128, 4,  128, 7,   128, 9,   128,
83
                               12, 128, 15, 128, 128, 128, 128, 128};
84
85
// Arrange third value for pixels 0,1,2,3,4,5
86
static const uvec8 kShufAb2 = {2,  128, 5,   128, 128, 128, 10,  128,
87
                               13, 128, 128, 128, 128, 128, 128, 128};
88
89
// Scaling values for boxes of 3x2 and 2x2
90
static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
91
                                 65536 / 3, 65536 / 2, 0,         0};
92
93
// GCC versions of row functions are verbatim conversions from Visual C.
94
// Generated using gcc disassembly on Visual C object file:
95
// objdump -D yuvscaler.obj >yuvscaler.txt
96
97
void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
98
                         ptrdiff_t src_stride,
99
                         uint8_t* dst_ptr,
100
0
                         int dst_width) {
101
0
  (void)src_stride;
102
0
  asm volatile(
103
      // 16 pixel loop.
104
0
      LABELALIGN
105
0
      "1:          \n"
106
0
      "movdqu      (%0),%%xmm0                   \n"
107
0
      "movdqu      0x10(%0),%%xmm1               \n"
108
0
      "lea         0x20(%0),%0                   \n"
109
0
      "psrlw       $0x8,%%xmm0                   \n"
110
0
      "psrlw       $0x8,%%xmm1                   \n"
111
0
      "packuswb    %%xmm1,%%xmm0                 \n"
112
0
      "movdqu      %%xmm0,(%1)                   \n"
113
0
      "lea         0x10(%1),%1                   \n"
114
0
      "sub         $0x10,%2                      \n"
115
0
      "jg          1b                            \n"
116
0
      : "+r"(src_ptr),   // %0
117
0
        "+r"(dst_ptr),   // %1
118
0
        "+r"(dst_width)  // %2
119
0
      :
120
0
      : "memory", "cc", "xmm0", "xmm1");
121
0
}
122
123
void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
124
                               ptrdiff_t src_stride,
125
                               uint8_t* dst_ptr,
126
0
                               int dst_width) {
127
0
  (void)src_stride;
128
0
  asm volatile(
129
0
      "pcmpeqb     %%xmm4,%%xmm4                 \n"  // 0x0101
130
0
      "pabsb       %%xmm4,%%xmm4                 \n"
131
132
0
      "pxor        %%xmm5,%%xmm5                 \n"
133
134
0
      LABELALIGN
135
0
      "1:          \n"
136
0
      "movdqu      (%0),%%xmm0                   \n"
137
0
      "movdqu      0x10(%0),%%xmm1               \n"
138
0
      "lea         0x20(%0),%0                   \n"
139
0
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
140
0
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
141
0
      "pavgw       %%xmm5,%%xmm0                 \n"
142
0
      "pavgw       %%xmm5,%%xmm1                 \n"
143
0
      "packuswb    %%xmm1,%%xmm0                 \n"
144
0
      "movdqu      %%xmm0,(%1)                   \n"
145
0
      "lea         0x10(%1),%1                   \n"
146
0
      "sub         $0x10,%2                      \n"
147
0
      "jg          1b                            \n"
148
0
      : "+r"(src_ptr),   // %0
149
0
        "+r"(dst_ptr),   // %1
150
0
        "+r"(dst_width)  // %2
151
0
      :
152
0
      : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
153
0
}
154
155
void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
156
                            ptrdiff_t src_stride,
157
                            uint8_t* dst_ptr,
158
0
                            int dst_width) {
159
0
  asm volatile(
160
0
      "pcmpeqb     %%xmm4,%%xmm4                 \n"  // 0x0101
161
0
      "pabsb       %%xmm4,%%xmm4                 \n"
162
0
      "pxor        %%xmm5,%%xmm5                 \n"
163
164
0
      LABELALIGN
165
0
      "1:          \n"
166
0
      "movdqu      (%0),%%xmm0                   \n"
167
0
      "movdqu      0x10(%0),%%xmm1               \n"
168
0
      "movdqu      0x00(%0,%3,1),%%xmm2          \n"
169
0
      "movdqu      0x10(%0,%3,1),%%xmm3          \n"
170
0
      "lea         0x20(%0),%0                   \n"
171
0
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
172
0
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
173
0
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
174
0
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
175
0
      "paddw       %%xmm2,%%xmm0                 \n"
176
0
      "paddw       %%xmm3,%%xmm1                 \n"
177
0
      "psrlw       $0x1,%%xmm0                   \n"
178
0
      "psrlw       $0x1,%%xmm1                   \n"
179
0
      "pavgw       %%xmm5,%%xmm0                 \n"
180
0
      "pavgw       %%xmm5,%%xmm1                 \n"
181
0
      "packuswb    %%xmm1,%%xmm0                 \n"
182
0
      "movdqu      %%xmm0,(%1)                   \n"
183
0
      "lea         0x10(%1),%1                   \n"
184
0
      "sub         $0x10,%2                      \n"
185
0
      "jg          1b                            \n"
186
0
      : "+r"(src_ptr),               // %0
187
0
        "+r"(dst_ptr),               // %1
188
0
        "+r"(dst_width)              // %2
189
0
      : "r"((intptr_t)(src_stride))  // %3
190
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
191
0
}
192
193
#ifdef HAS_SCALEROWDOWN2_AVX2
194
void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
195
                        ptrdiff_t src_stride,
196
                        uint8_t* dst_ptr,
197
0
                        int dst_width) {
198
0
  (void)src_stride;
199
0
  asm volatile(
200
0
      "1:          \n"
201
0
      "vmovdqu     (%0),%%ymm0                   \n"
202
0
      "vmovdqu     0x20(%0),%%ymm1               \n"
203
0
      "lea         0x40(%0),%0                   \n"
204
0
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
205
0
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
206
0
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
207
0
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
208
0
      "vmovdqu     %%ymm0,(%1)                   \n"
209
0
      "lea         0x20(%1),%1                   \n"
210
0
      "sub         $0x20,%2                      \n"
211
0
      "jg          1b                            \n"
212
0
      "vzeroupper  \n"
213
0
      : "+r"(src_ptr),   // %0
214
0
        "+r"(dst_ptr),   // %1
215
0
        "+r"(dst_width)  // %2
216
0
      :
217
0
      : "memory", "cc", "xmm0", "xmm1");
218
0
}
219
220
void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
221
                              ptrdiff_t src_stride,
222
                              uint8_t* dst_ptr,
223
0
                              int dst_width) {
224
0
  (void)src_stride;
225
0
  asm volatile(
226
0
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
227
0
      "vpabsb      %%ymm4,%%ymm4                 \n"
228
0
      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
229
230
0
      LABELALIGN
231
0
      "1:          \n"
232
0
      "vmovdqu     (%0),%%ymm0                   \n"
233
0
      "vmovdqu     0x20(%0),%%ymm1               \n"
234
0
      "lea         0x40(%0),%0                   \n"
235
0
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"
236
0
      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
237
0
      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
238
0
      "vpavgw      %%ymm5,%%ymm1,%%ymm1          \n"
239
0
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
240
0
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
241
0
      "vmovdqu     %%ymm0,(%1)                   \n"
242
0
      "lea         0x20(%1),%1                   \n"
243
0
      "sub         $0x20,%2                      \n"
244
0
      "jg          1b                            \n"
245
0
      "vzeroupper  \n"
246
0
      : "+r"(src_ptr),   // %0
247
0
        "+r"(dst_ptr),   // %1
248
0
        "+r"(dst_width)  // %2
249
0
      :
250
0
      : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
251
0
}
252
253
void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
254
                           ptrdiff_t src_stride,
255
                           uint8_t* dst_ptr,
256
404
                           int dst_width) {
257
404
  asm volatile(
258
404
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
259
404
      "vpabsb      %%ymm4,%%ymm4                 \n"
260
404
      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
261
262
404
      LABELALIGN
263
404
      "1:          \n"
264
404
      "vmovdqu     (%0),%%ymm0                   \n"
265
404
      "vmovdqu     0x20(%0),%%ymm1               \n"
266
404
      "vmovdqu     0x00(%0,%3,1),%%ymm2          \n"
267
404
      "vmovdqu     0x20(%0,%3,1),%%ymm3          \n"
268
404
      "lea         0x40(%0),%0                   \n"
269
404
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"
270
404
      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
271
404
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
272
404
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
273
404
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
274
404
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
275
404
      "vpsrlw      $0x1,%%ymm0,%%ymm0            \n"
276
404
      "vpsrlw      $0x1,%%ymm1,%%ymm1            \n"
277
404
      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
278
404
      "vpavgw      %%ymm5,%%ymm1,%%ymm1          \n"
279
404
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
280
404
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
281
404
      "vmovdqu     %%ymm0,(%1)                   \n"
282
404
      "lea         0x20(%1),%1                   \n"
283
404
      "sub         $0x20,%2                      \n"
284
404
      "jg          1b                            \n"
285
404
      "vzeroupper  \n"
286
404
      : "+r"(src_ptr),               // %0
287
404
        "+r"(dst_ptr),               // %1
288
404
        "+r"(dst_width)              // %2
289
404
      : "r"((intptr_t)(src_stride))  // %3
290
404
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
291
404
}
292
#endif  // HAS_SCALEROWDOWN2_AVX2
293
294
void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
295
                         ptrdiff_t src_stride,
296
                         uint8_t* dst_ptr,
297
0
                         int dst_width) {
298
0
  (void)src_stride;
299
0
  asm volatile(
300
0
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
301
0
      "psrld       $0x18,%%xmm5                  \n"
302
0
      "pslld       $0x10,%%xmm5                  \n"
303
304
0
      LABELALIGN
305
0
      "1:          \n"
306
0
      "movdqu      (%0),%%xmm0                   \n"
307
0
      "movdqu      0x10(%0),%%xmm1               \n"
308
0
      "lea         0x20(%0),%0                   \n"
309
0
      "pand        %%xmm5,%%xmm0                 \n"
310
0
      "pand        %%xmm5,%%xmm1                 \n"
311
0
      "packuswb    %%xmm1,%%xmm0                 \n"
312
0
      "psrlw       $0x8,%%xmm0                   \n"
313
0
      "packuswb    %%xmm0,%%xmm0                 \n"
314
0
      "movq        %%xmm0,(%1)                   \n"
315
0
      "lea         0x8(%1),%1                    \n"
316
0
      "sub         $0x8,%2                       \n"
317
0
      "jg          1b                            \n"
318
0
      : "+r"(src_ptr),   // %0
319
0
        "+r"(dst_ptr),   // %1
320
0
        "+r"(dst_width)  // %2
321
0
      :
322
0
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
323
0
}
324
325
void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
326
                            ptrdiff_t src_stride,
327
                            uint8_t* dst_ptr,
328
0
                            int dst_width) {
329
0
  intptr_t stridex3;
330
0
  asm volatile(
331
0
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
332
0
      "pabsw       %%xmm4,%%xmm5                 \n"
333
0
      "pabsb       %%xmm4,%%xmm4                 \n"  // 0x0101
334
0
      "psllw       $0x3,%%xmm5                   \n"  // 0x0008
335
0
      "lea         0x00(%4,%4,2),%3              \n"
336
337
0
      LABELALIGN
338
0
      "1:          \n"
339
0
      "movdqu      (%0),%%xmm0                   \n"
340
0
      "movdqu      0x10(%0),%%xmm1               \n"
341
0
      "movdqu      0x00(%0,%4,1),%%xmm2          \n"
342
0
      "movdqu      0x10(%0,%4,1),%%xmm3          \n"
343
0
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
344
0
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
345
0
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
346
0
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
347
0
      "paddw       %%xmm2,%%xmm0                 \n"
348
0
      "paddw       %%xmm3,%%xmm1                 \n"
349
0
      "movdqu      0x00(%0,%4,2),%%xmm2          \n"
350
0
      "movdqu      0x10(%0,%4,2),%%xmm3          \n"
351
0
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
352
0
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
353
0
      "paddw       %%xmm2,%%xmm0                 \n"
354
0
      "paddw       %%xmm3,%%xmm1                 \n"
355
0
      "movdqu      0x00(%0,%3,1),%%xmm2          \n"
356
0
      "movdqu      0x10(%0,%3,1),%%xmm3          \n"
357
0
      "lea         0x20(%0),%0                   \n"
358
0
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
359
0
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
360
0
      "paddw       %%xmm2,%%xmm0                 \n"
361
0
      "paddw       %%xmm3,%%xmm1                 \n"
362
0
      "phaddw      %%xmm1,%%xmm0                 \n"
363
0
      "paddw       %%xmm5,%%xmm0                 \n"
364
0
      "psrlw       $0x4,%%xmm0                   \n"
365
0
      "packuswb    %%xmm0,%%xmm0                 \n"
366
0
      "movq        %%xmm0,(%1)                   \n"
367
0
      "lea         0x8(%1),%1                    \n"
368
0
      "sub         $0x8,%2                       \n"
369
0
      "jg          1b                            \n"
370
0
      : "+r"(src_ptr),               // %0
371
0
        "+r"(dst_ptr),               // %1
372
0
        "+r"(dst_width),             // %2
373
0
        "=&r"(stridex3)              // %3
374
0
      : "r"((intptr_t)(src_stride))  // %4
375
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
376
0
}
377
378
#ifdef HAS_SCALEROWDOWN4_AVX2
379
void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
380
                        ptrdiff_t src_stride,
381
                        uint8_t* dst_ptr,
382
0
                        int dst_width) {
383
0
  (void)src_stride;
384
0
  asm volatile(
385
0
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
386
0
      "vpsrld      $0x18,%%ymm5,%%ymm5           \n"
387
0
      "vpslld      $0x10,%%ymm5,%%ymm5           \n"
388
389
0
      LABELALIGN
390
0
      "1:          \n"
391
0
      "vmovdqu     (%0),%%ymm0                   \n"
392
0
      "vmovdqu     0x20(%0),%%ymm1               \n"
393
0
      "lea         0x40(%0),%0                   \n"
394
0
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
395
0
      "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
396
0
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
397
0
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
398
0
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
399
0
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
400
0
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
401
0
      "vmovdqu     %%xmm0,(%1)                   \n"
402
0
      "lea         0x10(%1),%1                   \n"
403
0
      "sub         $0x10,%2                      \n"
404
0
      "jg          1b                            \n"
405
0
      "vzeroupper  \n"
406
0
      : "+r"(src_ptr),   // %0
407
0
        "+r"(dst_ptr),   // %1
408
0
        "+r"(dst_width)  // %2
409
0
      :
410
0
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
411
0
}
412
413
void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
414
                           ptrdiff_t src_stride,
415
                           uint8_t* dst_ptr,
416
52
                           int dst_width) {
417
52
  asm volatile(
418
52
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
419
52
      "vpabsw      %%ymm4,%%ymm5                 \n"
420
52
      "vpabsb      %%ymm4,%%ymm4                 \n"  // 0x0101
421
52
      "vpsllw      $0x3,%%ymm5,%%ymm5            \n"  // 0x0008
422
423
52
      LABELALIGN
424
52
      "1:          \n"
425
52
      "vmovdqu     (%0),%%ymm0                   \n"
426
52
      "vmovdqu     0x20(%0),%%ymm1               \n"
427
52
      "vmovdqu     0x00(%0,%3,1),%%ymm2          \n"
428
52
      "vmovdqu     0x20(%0,%3,1),%%ymm3          \n"
429
52
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"
430
52
      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
431
52
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
432
52
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
433
52
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
434
52
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
435
52
      "vmovdqu     0x00(%0,%3,2),%%ymm2          \n"
436
52
      "vmovdqu     0x20(%0,%3,2),%%ymm3          \n"
437
52
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
438
52
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
439
52
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
440
52
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
441
52
      "vmovdqu     0x00(%0,%4,1),%%ymm2          \n"
442
52
      "vmovdqu     0x20(%0,%4,1),%%ymm3          \n"
443
52
      "lea         0x40(%0),%0                   \n"
444
52
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
445
52
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
446
52
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
447
52
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
448
52
      "vphaddw     %%ymm1,%%ymm0,%%ymm0          \n"
449
52
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
450
52
      "vpaddw      %%ymm5,%%ymm0,%%ymm0          \n"
451
52
      "vpsrlw      $0x4,%%ymm0,%%ymm0            \n"
452
52
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
453
52
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
454
52
      "vmovdqu     %%xmm0,(%1)                   \n"
455
52
      "lea         0x10(%1),%1                   \n"
456
52
      "sub         $0x10,%2                      \n"
457
52
      "jg          1b                            \n"
458
52
      "vzeroupper  \n"
459
52
      : "+r"(src_ptr),                   // %0
460
52
        "+r"(dst_ptr),                   // %1
461
52
        "+r"(dst_width)                  // %2
462
52
      : "r"((intptr_t)(src_stride)),     // %3
463
52
        "r"((intptr_t)(src_stride * 3))  // %4
464
52
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
465
52
}
466
#endif  // HAS_SCALEROWDOWN4_AVX2
467
468
void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
469
                          ptrdiff_t src_stride,
470
                          uint8_t* dst_ptr,
471
0
                          int dst_width) {
472
0
  (void)src_stride;
473
0
  asm volatile(
474
0
      "movdqa      %0,%%xmm3                     \n"
475
0
      "movdqa      %1,%%xmm4                     \n"
476
0
      "movdqa      %2,%%xmm5                     \n"
477
0
      :
478
0
      : "m"(kShuf0),  // %0
479
0
        "m"(kShuf1),  // %1
480
0
        "m"(kShuf2)   // %2
481
0
  );
482
0
  asm volatile(
483
0
      "1:          \n"
484
0
      "movdqu      (%0),%%xmm0                   \n"
485
0
      "movdqu      0x10(%0),%%xmm2               \n"
486
0
      "lea         0x20(%0),%0                   \n"
487
0
      "movdqa      %%xmm2,%%xmm1                 \n"
488
0
      "palignr     $0x8,%%xmm0,%%xmm1            \n"
489
0
      "pshufb      %%xmm3,%%xmm0                 \n"
490
0
      "pshufb      %%xmm4,%%xmm1                 \n"
491
0
      "pshufb      %%xmm5,%%xmm2                 \n"
492
0
      "movq        %%xmm0,(%1)                   \n"
493
0
      "movq        %%xmm1,0x8(%1)                \n"
494
0
      "movq        %%xmm2,0x10(%1)               \n"
495
0
      "lea         0x18(%1),%1                   \n"
496
0
      "sub         $0x18,%2                      \n"
497
0
      "jg          1b                            \n"
498
0
      : "+r"(src_ptr),   // %0
499
0
        "+r"(dst_ptr),   // %1
500
0
        "+r"(dst_width)  // %2
501
0
      :
502
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
503
0
}
504
505
void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
506
                                ptrdiff_t src_stride,
507
                                uint8_t* dst_ptr,
508
0
                                int dst_width) {
509
0
  asm volatile(
510
0
      "movdqa      %0,%%xmm2                     \n"  // kShuf01
511
0
      "movdqa      %1,%%xmm3                     \n"  // kShuf11
512
0
      "movdqa      %2,%%xmm4                     \n"  // kShuf21
513
0
      :
514
0
      : "m"(kShuf01),  // %0
515
0
        "m"(kShuf11),  // %1
516
0
        "m"(kShuf21)   // %2
517
0
  );
518
0
  asm volatile(
519
0
      "movdqa      %0,%%xmm5                     \n"  // kMadd01
520
0
      "movdqa      %1,%%xmm0                     \n"  // kMadd11
521
0
      "movdqa      %2,%%xmm1                     \n"  // kRound34
522
0
      :
523
0
      : "m"(kMadd01),  // %0
524
0
        "m"(kMadd11),  // %1
525
0
        "m"(kRound34)  // %2
526
0
  );
527
0
  asm volatile(
528
0
      "1:          \n"
529
0
      "movdqu      (%0),%%xmm6                   \n"
530
0
      "movdqu      0x00(%0,%3,1),%%xmm7          \n"
531
0
      "pavgb       %%xmm7,%%xmm6                 \n"
532
0
      "pshufb      %%xmm2,%%xmm6                 \n"
533
0
      "pmaddubsw   %%xmm5,%%xmm6                 \n"
534
0
      "paddsw      %%xmm1,%%xmm6                 \n"
535
0
      "psrlw       $0x2,%%xmm6                   \n"
536
0
      "packuswb    %%xmm6,%%xmm6                 \n"
537
0
      "movq        %%xmm6,(%1)                   \n"
538
0
      "movdqu      0x8(%0),%%xmm6                \n"
539
0
      "movdqu      0x8(%0,%3,1),%%xmm7           \n"
540
0
      "pavgb       %%xmm7,%%xmm6                 \n"
541
0
      "pshufb      %%xmm3,%%xmm6                 \n"
542
0
      "pmaddubsw   %%xmm0,%%xmm6                 \n"
543
0
      "paddsw      %%xmm1,%%xmm6                 \n"
544
0
      "psrlw       $0x2,%%xmm6                   \n"
545
0
      "packuswb    %%xmm6,%%xmm6                 \n"
546
0
      "movq        %%xmm6,0x8(%1)                \n"
547
0
      "movdqu      0x10(%0),%%xmm6               \n"
548
0
      "movdqu      0x10(%0,%3,1),%%xmm7          \n"
549
0
      "lea         0x20(%0),%0                   \n"
550
0
      "pavgb       %%xmm7,%%xmm6                 \n"
551
0
      "pshufb      %%xmm4,%%xmm6                 \n"
552
0
      "pmaddubsw   %4,%%xmm6                     \n"
553
0
      "paddsw      %%xmm1,%%xmm6                 \n"
554
0
      "psrlw       $0x2,%%xmm6                   \n"
555
0
      "packuswb    %%xmm6,%%xmm6                 \n"
556
0
      "movq        %%xmm6,0x10(%1)               \n"
557
0
      "lea         0x18(%1),%1                   \n"
558
0
      "sub         $0x18,%2                      \n"
559
0
      "jg          1b                            \n"
560
0
      : "+r"(src_ptr),                // %0
561
0
        "+r"(dst_ptr),                // %1
562
0
        "+r"(dst_width)               // %2
563
0
      : "r"((intptr_t)(src_stride)),  // %3
564
0
        "m"(kMadd21)                  // %4
565
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
566
0
        "xmm7");
567
0
}
568
569
void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
570
                                ptrdiff_t src_stride,
571
                                uint8_t* dst_ptr,
572
0
                                int dst_width) {
573
0
  asm volatile(
574
0
      "movdqa      %0,%%xmm2                     \n"  // kShuf01
575
0
      "movdqa      %1,%%xmm3                     \n"  // kShuf11
576
0
      "movdqa      %2,%%xmm4                     \n"  // kShuf21
577
0
      :
578
0
      : "m"(kShuf01),  // %0
579
0
        "m"(kShuf11),  // %1
580
0
        "m"(kShuf21)   // %2
581
0
  );
582
0
  asm volatile(
583
0
      "movdqa      %0,%%xmm5                     \n"  // kMadd01
584
0
      "movdqa      %1,%%xmm0                     \n"  // kMadd11
585
0
      "movdqa      %2,%%xmm1                     \n"  // kRound34
586
0
      :
587
0
      : "m"(kMadd01),  // %0
588
0
        "m"(kMadd11),  // %1
589
0
        "m"(kRound34)  // %2
590
0
  );
591
592
0
  asm volatile(
593
0
      "1:          \n"
594
0
      "movdqu      (%0),%%xmm6                   \n"
595
0
      "movdqu      0x00(%0,%3,1),%%xmm7          \n"
596
0
      "pavgb       %%xmm6,%%xmm7                 \n"
597
0
      "pavgb       %%xmm7,%%xmm6                 \n"
598
0
      "pshufb      %%xmm2,%%xmm6                 \n"
599
0
      "pmaddubsw   %%xmm5,%%xmm6                 \n"
600
0
      "paddsw      %%xmm1,%%xmm6                 \n"
601
0
      "psrlw       $0x2,%%xmm6                   \n"
602
0
      "packuswb    %%xmm6,%%xmm6                 \n"
603
0
      "movq        %%xmm6,(%1)                   \n"
604
0
      "movdqu      0x8(%0),%%xmm6                \n"
605
0
      "movdqu      0x8(%0,%3,1),%%xmm7           \n"
606
0
      "pavgb       %%xmm6,%%xmm7                 \n"
607
0
      "pavgb       %%xmm7,%%xmm6                 \n"
608
0
      "pshufb      %%xmm3,%%xmm6                 \n"
609
0
      "pmaddubsw   %%xmm0,%%xmm6                 \n"
610
0
      "paddsw      %%xmm1,%%xmm6                 \n"
611
0
      "psrlw       $0x2,%%xmm6                   \n"
612
0
      "packuswb    %%xmm6,%%xmm6                 \n"
613
0
      "movq        %%xmm6,0x8(%1)                \n"
614
0
      "movdqu      0x10(%0),%%xmm6               \n"
615
0
      "movdqu      0x10(%0,%3,1),%%xmm7          \n"
616
0
      "lea         0x20(%0),%0                   \n"
617
0
      "pavgb       %%xmm6,%%xmm7                 \n"
618
0
      "pavgb       %%xmm7,%%xmm6                 \n"
619
0
      "pshufb      %%xmm4,%%xmm6                 \n"
620
0
      "pmaddubsw   %4,%%xmm6                     \n"
621
0
      "paddsw      %%xmm1,%%xmm6                 \n"
622
0
      "psrlw       $0x2,%%xmm6                   \n"
623
0
      "packuswb    %%xmm6,%%xmm6                 \n"
624
0
      "movq        %%xmm6,0x10(%1)               \n"
625
0
      "lea         0x18(%1),%1                   \n"
626
0
      "sub         $0x18,%2                      \n"
627
0
      "jg          1b                            \n"
628
0
      : "+r"(src_ptr),                // %0
629
0
        "+r"(dst_ptr),                // %1
630
0
        "+r"(dst_width)               // %2
631
0
      : "r"((intptr_t)(src_stride)),  // %3
632
0
        "m"(kMadd21)                  // %4
633
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
634
0
        "xmm7");
635
0
}
636
637
void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
638
                          ptrdiff_t src_stride,
639
                          uint8_t* dst_ptr,
640
0
                          int dst_width) {
641
0
  (void)src_stride;
642
0
  asm volatile(
643
0
      "movdqa      %3,%%xmm4                     \n"
644
0
      "movdqa      %4,%%xmm5                     \n"
645
646
0
      LABELALIGN
647
0
      "1:          \n"
648
0
      "movdqu      (%0),%%xmm0                   \n"
649
0
      "movdqu      0x10(%0),%%xmm1               \n"
650
0
      "lea         0x20(%0),%0                   \n"
651
0
      "pshufb      %%xmm4,%%xmm0                 \n"
652
0
      "pshufb      %%xmm5,%%xmm1                 \n"
653
0
      "paddusb     %%xmm1,%%xmm0                 \n"
654
0
      "movq        %%xmm0,(%1)                   \n"
655
0
      "movhlps     %%xmm0,%%xmm1                 \n"
656
0
      "movd        %%xmm1,0x8(%1)                \n"
657
0
      "lea         0xc(%1),%1                    \n"
658
0
      "sub         $0xc,%2                       \n"
659
0
      "jg          1b                            \n"
660
0
      : "+r"(src_ptr),   // %0
661
0
        "+r"(dst_ptr),   // %1
662
0
        "+r"(dst_width)  // %2
663
0
      : "m"(kShuf38a),   // %3
664
0
        "m"(kShuf38b)    // %4
665
0
      : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
666
0
}
667
668
void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
669
                                ptrdiff_t src_stride,
670
                                uint8_t* dst_ptr,
671
0
                                int dst_width) {
672
0
  asm volatile(
673
0
      "movdqa      %0,%%xmm2                     \n"
674
0
      "movdqa      %1,%%xmm3                     \n"
675
0
      "movdqa      %2,%%xmm4                     \n"
676
0
      "movdqa      %3,%%xmm5                     \n"
677
0
      :
678
0
      : "m"(kShufAb0),  // %0
679
0
        "m"(kShufAb1),  // %1
680
0
        "m"(kShufAb2),  // %2
681
0
        "m"(kScaleAb2)  // %3
682
0
  );
683
0
  asm volatile(
684
0
      "1:          \n"
685
0
      "movdqu      (%0),%%xmm0                   \n"
686
0
      "movdqu      0x00(%0,%3,1),%%xmm1          \n"
687
0
      "lea         0x10(%0),%0                   \n"
688
0
      "pavgb       %%xmm1,%%xmm0                 \n"
689
0
      "movdqa      %%xmm0,%%xmm1                 \n"
690
0
      "pshufb      %%xmm2,%%xmm1                 \n"
691
0
      "movdqa      %%xmm0,%%xmm6                 \n"
692
0
      "pshufb      %%xmm3,%%xmm6                 \n"
693
0
      "paddusw     %%xmm6,%%xmm1                 \n"
694
0
      "pshufb      %%xmm4,%%xmm0                 \n"
695
0
      "paddusw     %%xmm0,%%xmm1                 \n"
696
0
      "pmulhuw     %%xmm5,%%xmm1                 \n"
697
0
      "packuswb    %%xmm1,%%xmm1                 \n"
698
0
      "movd        %%xmm1,(%1)                   \n"
699
0
      "psrlq       $0x10,%%xmm1                  \n"
700
0
      "movd        %%xmm1,0x2(%1)                \n"
701
0
      "lea         0x6(%1),%1                    \n"
702
0
      "sub         $0x6,%2                       \n"
703
0
      "jg          1b                            \n"
704
0
      : "+r"(src_ptr),               // %0
705
0
        "+r"(dst_ptr),               // %1
706
0
        "+r"(dst_width)              // %2
707
0
      : "r"((intptr_t)(src_stride))  // %3
708
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
709
0
}
710
711
void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
712
                                ptrdiff_t src_stride,
713
                                uint8_t* dst_ptr,
714
0
                                int dst_width) {
715
0
  asm volatile(
716
0
      "movdqa      %0,%%xmm2                     \n"
717
0
      "movdqa      %1,%%xmm3                     \n"
718
0
      "movdqa      %2,%%xmm4                     \n"
719
0
      "pxor        %%xmm5,%%xmm5                 \n"
720
0
      :
721
0
      : "m"(kShufAc),    // %0
722
0
        "m"(kShufAc3),   // %1
723
0
        "m"(kScaleAc33)  // %2
724
0
  );
725
0
  asm volatile(
726
0
      "1:          \n"
727
0
      "movdqu      (%0),%%xmm0                   \n"
728
0
      "movdqu      0x00(%0,%3,1),%%xmm6          \n"
729
0
      "movhlps     %%xmm0,%%xmm1                 \n"
730
0
      "movhlps     %%xmm6,%%xmm7                 \n"
731
0
      "punpcklbw   %%xmm5,%%xmm0                 \n"
732
0
      "punpcklbw   %%xmm5,%%xmm1                 \n"
733
0
      "punpcklbw   %%xmm5,%%xmm6                 \n"
734
0
      "punpcklbw   %%xmm5,%%xmm7                 \n"
735
0
      "paddusw     %%xmm6,%%xmm0                 \n"
736
0
      "paddusw     %%xmm7,%%xmm1                 \n"
737
0
      "movdqu      0x00(%0,%3,2),%%xmm6          \n"
738
0
      "lea         0x10(%0),%0                   \n"
739
0
      "movhlps     %%xmm6,%%xmm7                 \n"
740
0
      "punpcklbw   %%xmm5,%%xmm6                 \n"
741
0
      "punpcklbw   %%xmm5,%%xmm7                 \n"
742
0
      "paddusw     %%xmm6,%%xmm0                 \n"
743
0
      "paddusw     %%xmm7,%%xmm1                 \n"
744
0
      "movdqa      %%xmm0,%%xmm6                 \n"
745
0
      "psrldq      $0x2,%%xmm0                   \n"
746
0
      "paddusw     %%xmm0,%%xmm6                 \n"
747
0
      "psrldq      $0x2,%%xmm0                   \n"
748
0
      "paddusw     %%xmm0,%%xmm6                 \n"
749
0
      "pshufb      %%xmm2,%%xmm6                 \n"
750
0
      "movdqa      %%xmm1,%%xmm7                 \n"
751
0
      "psrldq      $0x2,%%xmm1                   \n"
752
0
      "paddusw     %%xmm1,%%xmm7                 \n"
753
0
      "psrldq      $0x2,%%xmm1                   \n"
754
0
      "paddusw     %%xmm1,%%xmm7                 \n"
755
0
      "pshufb      %%xmm3,%%xmm7                 \n"
756
0
      "paddusw     %%xmm7,%%xmm6                 \n"
757
0
      "pmulhuw     %%xmm4,%%xmm6                 \n"
758
0
      "packuswb    %%xmm6,%%xmm6                 \n"
759
0
      "movd        %%xmm6,(%1)                   \n"
760
0
      "psrlq       $0x10,%%xmm6                  \n"
761
0
      "movd        %%xmm6,0x2(%1)                \n"
762
0
      "lea         0x6(%1),%1                    \n"
763
0
      "sub         $0x6,%2                       \n"
764
0
      "jg          1b                            \n"
765
0
      : "+r"(src_ptr),               // %0
766
0
        "+r"(dst_ptr),               // %1
767
0
        "+r"(dst_width)              // %2
768
0
      : "r"((intptr_t)(src_stride))  // %3
769
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
770
0
        "xmm7");
771
0
}
772
773
static const uvec8 kLinearShuffleFar = {2,  3,  0, 1, 6,  7,  4,  5,
774
                                        10, 11, 8, 9, 14, 15, 12, 13};
775
776
static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3,
777
                                    3, 1, 1, 3, 3, 1, 1, 3};
778
779
#ifdef HAS_SCALEROWUP2_LINEAR_SSE2
780
void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
781
                             uint8_t* dst_ptr,
782
0
                             int dst_width) {
783
0
  asm volatile(
784
0
      "pxor        %%xmm0,%%xmm0                 \n"  // 0
785
0
      "pcmpeqw     %%xmm6,%%xmm6                 \n"
786
0
      "psrlw       $15,%%xmm6                    \n"
787
0
      "psllw       $1,%%xmm6                     \n"  // all 2
788
789
0
      LABELALIGN
790
0
      "1:          \n"
791
0
      "movq        (%0),%%xmm1                   \n"  // 01234567
792
0
      "movq        1(%0),%%xmm2                  \n"  // 12345678
793
0
      "movdqa      %%xmm1,%%xmm3                 \n"
794
0
      "punpcklbw   %%xmm2,%%xmm3                 \n"  // 0112233445566778
795
0
      "punpcklbw   %%xmm1,%%xmm1                 \n"  // 0011223344556677
796
0
      "punpcklbw   %%xmm2,%%xmm2                 \n"  // 1122334455667788
797
0
      "movdqa      %%xmm1,%%xmm4                 \n"
798
0
      "punpcklbw   %%xmm0,%%xmm4                 \n"  // 00112233 (16)
799
0
      "movdqa      %%xmm2,%%xmm5                 \n"
800
0
      "punpcklbw   %%xmm0,%%xmm5                 \n"  // 11223344 (16)
801
0
      "paddw       %%xmm5,%%xmm4                 \n"
802
0
      "movdqa      %%xmm3,%%xmm5                 \n"
803
0
      "paddw       %%xmm6,%%xmm4                 \n"
804
0
      "punpcklbw   %%xmm0,%%xmm5                 \n"  // 01122334 (16)
805
0
      "paddw       %%xmm5,%%xmm5                 \n"
806
0
      "paddw       %%xmm4,%%xmm5                 \n"  // 3*near+far+2 (lo)
807
0
      "psrlw       $2,%%xmm5                     \n"  // 3/4*near+1/4*far (lo)
808
809
0
      "punpckhbw   %%xmm0,%%xmm1                 \n"  // 44556677 (16)
810
0
      "punpckhbw   %%xmm0,%%xmm2                 \n"  // 55667788 (16)
811
0
      "paddw       %%xmm2,%%xmm1                 \n"
812
0
      "punpckhbw   %%xmm0,%%xmm3                 \n"  // 45566778 (16)
813
0
      "paddw       %%xmm6,%%xmm1                 \n"
814
0
      "paddw       %%xmm3,%%xmm3                 \n"
815
0
      "paddw       %%xmm3,%%xmm1                 \n"  // 3*near+far+2 (hi)
816
0
      "psrlw       $2,%%xmm1                     \n"  // 3/4*near+1/4*far (hi)
817
818
0
      "packuswb    %%xmm1,%%xmm5                 \n"
819
0
      "movdqu      %%xmm5,(%1)                   \n"
820
821
0
      "lea         0x8(%0),%0                    \n"
822
0
      "lea         0x10(%1),%1                   \n"  // 8 sample to 16 sample
823
0
      "sub         $0x10,%2                      \n"
824
0
      "jg          1b                            \n"
825
0
      : "+r"(src_ptr),   // %0
826
0
        "+r"(dst_ptr),   // %1
827
0
        "+r"(dst_width)  // %2
828
0
      :
829
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
830
0
}
831
#endif
832
833
#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2
834
void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
835
                               ptrdiff_t src_stride,
836
                               uint8_t* dst_ptr,
837
                               ptrdiff_t dst_stride,
838
0
                               int dst_width) {
839
0
  asm volatile(
840
0
      "1:          \n"
841
0
      "pxor        %%xmm0,%%xmm0                 \n"  // 0
842
      // above line
843
0
      "movq        (%0),%%xmm1                   \n"  // 01234567
844
0
      "movq        1(%0),%%xmm2                  \n"  // 12345678
845
0
      "movdqa      %%xmm1,%%xmm3                 \n"
846
0
      "punpcklbw   %%xmm2,%%xmm3                 \n"  // 0112233445566778
847
0
      "punpcklbw   %%xmm1,%%xmm1                 \n"  // 0011223344556677
848
0
      "punpcklbw   %%xmm2,%%xmm2                 \n"  // 1122334455667788
849
850
0
      "movdqa      %%xmm1,%%xmm4                 \n"
851
0
      "punpcklbw   %%xmm0,%%xmm4                 \n"  // 00112233 (16)
852
0
      "movdqa      %%xmm2,%%xmm5                 \n"
853
0
      "punpcklbw   %%xmm0,%%xmm5                 \n"  // 11223344 (16)
854
0
      "paddw       %%xmm5,%%xmm4                 \n"  // near+far
855
0
      "movdqa      %%xmm3,%%xmm5                 \n"
856
0
      "punpcklbw   %%xmm0,%%xmm5                 \n"  // 01122334 (16)
857
0
      "paddw       %%xmm5,%%xmm5                 \n"  // 2*near
858
0
      "paddw       %%xmm5,%%xmm4                 \n"  // 3*near+far (1, lo)
859
860
0
      "punpckhbw   %%xmm0,%%xmm1                 \n"  // 44556677 (16)
861
0
      "punpckhbw   %%xmm0,%%xmm2                 \n"  // 55667788 (16)
862
0
      "paddw       %%xmm2,%%xmm1                 \n"
863
0
      "punpckhbw   %%xmm0,%%xmm3                 \n"  // 45566778 (16)
864
0
      "paddw       %%xmm3,%%xmm3                 \n"  // 2*near
865
0
      "paddw       %%xmm3,%%xmm1                 \n"  // 3*near+far (1, hi)
866
867
      // below line
868
0
      "movq        (%0,%3),%%xmm6                \n"  // 01234567
869
0
      "movq        1(%0,%3),%%xmm2               \n"  // 12345678
870
0
      "movdqa      %%xmm6,%%xmm3                 \n"
871
0
      "punpcklbw   %%xmm2,%%xmm3                 \n"  // 0112233445566778
872
0
      "punpcklbw   %%xmm6,%%xmm6                 \n"  // 0011223344556677
873
0
      "punpcklbw   %%xmm2,%%xmm2                 \n"  // 1122334455667788
874
875
0
      "movdqa      %%xmm6,%%xmm5                 \n"
876
0
      "punpcklbw   %%xmm0,%%xmm5                 \n"  // 00112233 (16)
877
0
      "movdqa      %%xmm2,%%xmm7                 \n"
878
0
      "punpcklbw   %%xmm0,%%xmm7                 \n"  // 11223344 (16)
879
0
      "paddw       %%xmm7,%%xmm5                 \n"  // near+far
880
0
      "movdqa      %%xmm3,%%xmm7                 \n"
881
0
      "punpcklbw   %%xmm0,%%xmm7                 \n"  // 01122334 (16)
882
0
      "paddw       %%xmm7,%%xmm7                 \n"  // 2*near
883
0
      "paddw       %%xmm7,%%xmm5                 \n"  // 3*near+far (2, lo)
884
885
0
      "punpckhbw   %%xmm0,%%xmm6                 \n"  // 44556677 (16)
886
0
      "punpckhbw   %%xmm0,%%xmm2                 \n"  // 55667788 (16)
887
0
      "paddw       %%xmm6,%%xmm2                 \n"  // near+far
888
0
      "punpckhbw   %%xmm0,%%xmm3                 \n"  // 45566778 (16)
889
0
      "paddw       %%xmm3,%%xmm3                 \n"  // 2*near
890
0
      "paddw       %%xmm3,%%xmm2                 \n"  // 3*near+far (2, hi)
891
892
      // xmm4 xmm1
893
      // xmm5 xmm2
894
0
      "pcmpeqw     %%xmm0,%%xmm0                 \n"
895
0
      "psrlw       $15,%%xmm0                    \n"
896
0
      "psllw       $3,%%xmm0                     \n"  // all 8
897
898
0
      "movdqa      %%xmm4,%%xmm3                 \n"
899
0
      "movdqa      %%xmm5,%%xmm6                 \n"
900
0
      "paddw       %%xmm3,%%xmm3                 \n"  // 6*near+2*far (1, lo)
901
0
      "paddw       %%xmm0,%%xmm6                 \n"  // 3*near+far+8 (2, lo)
902
0
      "paddw       %%xmm4,%%xmm3                 \n"  // 9*near+3*far (1, lo)
903
0
      "paddw       %%xmm6,%%xmm3                 \n"  // 9 3 3 1 + 8 (1, lo)
904
0
      "psrlw       $4,%%xmm3                     \n"  // ^ div by 16
905
906
0
      "movdqa      %%xmm1,%%xmm7                 \n"
907
0
      "movdqa      %%xmm2,%%xmm6                 \n"
908
0
      "paddw       %%xmm7,%%xmm7                 \n"  // 6*near+2*far (1, hi)
909
0
      "paddw       %%xmm0,%%xmm6                 \n"  // 3*near+far+8 (2, hi)
910
0
      "paddw       %%xmm1,%%xmm7                 \n"  // 9*near+3*far (1, hi)
911
0
      "paddw       %%xmm6,%%xmm7                 \n"  // 9 3 3 1 + 8 (1, hi)
912
0
      "psrlw       $4,%%xmm7                     \n"  // ^ div by 16
913
914
0
      "packuswb    %%xmm7,%%xmm3                 \n"
915
0
      "movdqu      %%xmm3,(%1)                   \n"  // save above line
916
917
0
      "movdqa      %%xmm5,%%xmm3                 \n"
918
0
      "paddw       %%xmm0,%%xmm4                 \n"  // 3*near+far+8 (1, lo)
919
0
      "paddw       %%xmm3,%%xmm3                 \n"  // 6*near+2*far (2, lo)
920
0
      "paddw       %%xmm3,%%xmm5                 \n"  // 9*near+3*far (2, lo)
921
0
      "paddw       %%xmm4,%%xmm5                 \n"  // 9 3 3 1 + 8 (lo)
922
0
      "psrlw       $4,%%xmm5                     \n"  // ^ div by 16
923
924
0
      "movdqa      %%xmm2,%%xmm3                 \n"
925
0
      "paddw       %%xmm0,%%xmm1                 \n"  // 3*near+far+8 (1, hi)
926
0
      "paddw       %%xmm3,%%xmm3                 \n"  // 6*near+2*far (2, hi)
927
0
      "paddw       %%xmm3,%%xmm2                 \n"  // 9*near+3*far (2, hi)
928
0
      "paddw       %%xmm1,%%xmm2                 \n"  // 9 3 3 1 + 8 (hi)
929
0
      "psrlw       $4,%%xmm2                     \n"  // ^ div by 16
930
931
0
      "packuswb    %%xmm2,%%xmm5                 \n"
932
0
      "movdqu      %%xmm5,(%1,%4)                \n"  // save below line
933
934
0
      "lea         0x8(%0),%0                    \n"
935
0
      "lea         0x10(%1),%1                   \n"  // 8 sample to 16 sample
936
0
      "sub         $0x10,%2                      \n"
937
0
      "jg          1b                            \n"
938
0
      : "+r"(src_ptr),                // %0
939
0
        "+r"(dst_ptr),                // %1
940
0
        "+r"(dst_width)               // %2
941
0
      : "r"((intptr_t)(src_stride)),  // %3
942
0
        "r"((intptr_t)(dst_stride))   // %4
943
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
944
0
        "xmm7");
945
0
}
946
#endif
947
948
#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3
949
void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr,
950
                                 uint16_t* dst_ptr,
951
0
                                 int dst_width) {
952
0
  asm volatile(
953
0
      "movdqa      %3,%%xmm5                     \n"
954
0
      "pcmpeqw     %%xmm4,%%xmm4                 \n"
955
0
      "psrlw       $15,%%xmm4                    \n"
956
0
      "psllw       $1,%%xmm4                     \n"  // all 2
957
958
0
      LABELALIGN
959
0
      "1:          \n"
960
0
      "movdqu      (%0),%%xmm0                   \n"  // 01234567 (16)
961
0
      "movdqu      2(%0),%%xmm1                  \n"  // 12345678 (16)
962
963
0
      "movdqa      %%xmm0,%%xmm2                 \n"
964
0
      "punpckhwd   %%xmm1,%%xmm2                 \n"  // 45566778 (16)
965
0
      "punpcklwd   %%xmm1,%%xmm0                 \n"  // 01122334 (16)
966
967
0
      "movdqa      %%xmm2,%%xmm3                 \n"
968
0
      "movdqa      %%xmm0,%%xmm1                 \n"
969
0
      "pshufb      %%xmm5,%%xmm3                 \n"  // 54657687 (far)
970
0
      "pshufb      %%xmm5,%%xmm1                 \n"  // 10213243 (far)
971
972
0
      "paddw       %%xmm4,%%xmm1                 \n"  // far+2
973
0
      "paddw       %%xmm4,%%xmm3                 \n"  // far+2
974
0
      "paddw       %%xmm0,%%xmm1                 \n"  // near+far+2
975
0
      "paddw       %%xmm2,%%xmm3                 \n"  // near+far+2
976
0
      "paddw       %%xmm0,%%xmm0                 \n"  // 2*near
977
0
      "paddw       %%xmm2,%%xmm2                 \n"  // 2*near
978
0
      "paddw       %%xmm1,%%xmm0                 \n"  // 3*near+far+2 (lo)
979
0
      "paddw       %%xmm3,%%xmm2                 \n"  // 3*near+far+2 (hi)
980
981
0
      "psrlw       $2,%%xmm0                     \n"  // 3/4*near+1/4*far
982
0
      "psrlw       $2,%%xmm2                     \n"  // 3/4*near+1/4*far
983
0
      "movdqu      %%xmm0,(%1)                   \n"
984
0
      "movdqu      %%xmm2,16(%1)                 \n"
985
986
0
      "lea         0x10(%0),%0                   \n"
987
0
      "lea         0x20(%1),%1                   \n"  // 8 sample to 16 sample
988
0
      "sub         $0x10,%2                      \n"
989
0
      "jg          1b                            \n"
990
0
      : "+r"(src_ptr),          // %0
991
0
        "+r"(dst_ptr),          // %1
992
0
        "+r"(dst_width)         // %2
993
0
      : "m"(kLinearShuffleFar)  // %3
994
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
995
0
}
996
#endif
997
998
#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3
999
void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
1000
                                   ptrdiff_t src_stride,
1001
                                   uint16_t* dst_ptr,
1002
                                   ptrdiff_t dst_stride,
1003
0
                                   int dst_width) {
1004
0
  asm volatile(
1005
0
      "pcmpeqw     %%xmm7,%%xmm7                 \n"
1006
0
      "psrlw       $15,%%xmm7                    \n"
1007
0
      "psllw       $3,%%xmm7                     \n"  // all 8
1008
0
      "movdqa      %5,%%xmm6                     \n"
1009
1010
0
      LABELALIGN
1011
0
      "1:          \n"
1012
      // above line
1013
0
      "movdqu      (%0),%%xmm0                   \n"  // 01234567 (16)
1014
0
      "movdqu      2(%0),%%xmm1                  \n"  // 12345678 (16)
1015
0
      "movdqa      %%xmm0,%%xmm2                 \n"
1016
0
      "punpckhwd   %%xmm1,%%xmm2                 \n"  // 45566778 (16)
1017
0
      "punpcklwd   %%xmm1,%%xmm0                 \n"  // 01122334 (16)
1018
0
      "movdqa      %%xmm2,%%xmm3                 \n"
1019
0
      "movdqa      %%xmm0,%%xmm1                 \n"
1020
0
      "pshufb      %%xmm6,%%xmm3                 \n"  // 54657687 (far)
1021
0
      "pshufb      %%xmm6,%%xmm1                 \n"  // 10213243 (far)
1022
0
      "paddw       %%xmm0,%%xmm1                 \n"  // near+far
1023
0
      "paddw       %%xmm2,%%xmm3                 \n"  // near+far
1024
0
      "paddw       %%xmm0,%%xmm0                 \n"  // 2*near
1025
0
      "paddw       %%xmm2,%%xmm2                 \n"  // 2*near
1026
0
      "paddw       %%xmm1,%%xmm0                 \n"  // 3*near+far (1, lo)
1027
0
      "paddw       %%xmm3,%%xmm2                 \n"  // 3*near+far (1, hi)
1028
1029
      // below line
1030
0
      "movdqu      (%0,%3,2),%%xmm1              \n"  // 01234567 (16)
1031
0
      "movdqu      2(%0,%3,2),%%xmm4             \n"  // 12345678 (16)
1032
0
      "movdqa      %%xmm1,%%xmm3                 \n"
1033
0
      "punpckhwd   %%xmm4,%%xmm3                 \n"  // 45566778 (16)
1034
0
      "punpcklwd   %%xmm4,%%xmm1                 \n"  // 01122334 (16)
1035
0
      "movdqa      %%xmm3,%%xmm5                 \n"
1036
0
      "movdqa      %%xmm1,%%xmm4                 \n"
1037
0
      "pshufb      %%xmm6,%%xmm5                 \n"  // 54657687 (far)
1038
0
      "pshufb      %%xmm6,%%xmm4                 \n"  // 10213243 (far)
1039
0
      "paddw       %%xmm1,%%xmm4                 \n"  // near+far
1040
0
      "paddw       %%xmm3,%%xmm5                 \n"  // near+far
1041
0
      "paddw       %%xmm1,%%xmm1                 \n"  // 2*near
1042
0
      "paddw       %%xmm3,%%xmm3                 \n"  // 2*near
1043
0
      "paddw       %%xmm4,%%xmm1                 \n"  // 3*near+far (2, lo)
1044
0
      "paddw       %%xmm5,%%xmm3                 \n"  // 3*near+far (2, hi)
1045
1046
      // xmm0 xmm2
1047
      // xmm1 xmm3
1048
1049
0
      "movdqa      %%xmm0,%%xmm4                 \n"
1050
0
      "movdqa      %%xmm1,%%xmm5                 \n"
1051
0
      "paddw       %%xmm4,%%xmm4                 \n"  // 6*near+2*far (1, lo)
1052
0
      "paddw       %%xmm7,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
1053
0
      "paddw       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
1054
0
      "paddw       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, lo)
1055
0
      "psrlw       $4,%%xmm4                     \n"  // ^ div by 16
1056
0
      "movdqu      %%xmm4,(%1)                   \n"
1057
1058
0
      "movdqa      %%xmm2,%%xmm4                 \n"
1059
0
      "movdqa      %%xmm3,%%xmm5                 \n"
1060
0
      "paddw       %%xmm4,%%xmm4                 \n"  // 6*near+2*far (1, hi)
1061
0
      "paddw       %%xmm7,%%xmm5                 \n"  // 3*near+far+8 (2, hi)
1062
0
      "paddw       %%xmm2,%%xmm4                 \n"  // 9*near+3*far (1, hi)
1063
0
      "paddw       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, hi)
1064
0
      "psrlw       $4,%%xmm4                     \n"  // ^ div by 16
1065
0
      "movdqu      %%xmm4,0x10(%1)               \n"
1066
1067
0
      "movdqa      %%xmm1,%%xmm4                 \n"
1068
0
      "paddw       %%xmm7,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
1069
0
      "paddw       %%xmm4,%%xmm4                 \n"  // 6*near+2*far (2, lo)
1070
0
      "paddw       %%xmm4,%%xmm1                 \n"  // 9*near+3*far (2, lo)
1071
0
      "paddw       %%xmm0,%%xmm1                 \n"  // 9 3 3 1 + 8 (2, lo)
1072
0
      "psrlw       $4,%%xmm1                     \n"  // ^ div by 16
1073
0
      "movdqu      %%xmm1,(%1,%4,2)              \n"
1074
1075
0
      "movdqa      %%xmm3,%%xmm4                 \n"
1076
0
      "paddw       %%xmm7,%%xmm2                 \n"  // 3*near+far+8 (1, hi)
1077
0
      "paddw       %%xmm4,%%xmm4                 \n"  // 6*near+2*far (2, hi)
1078
0
      "paddw       %%xmm4,%%xmm3                 \n"  // 9*near+3*far (2, hi)
1079
0
      "paddw       %%xmm2,%%xmm3                 \n"  // 9 3 3 1 + 8 (2, hi)
1080
0
      "psrlw       $4,%%xmm3                     \n"  // ^ div by 16
1081
0
      "movdqu      %%xmm3,0x10(%1,%4,2)          \n"
1082
1083
0
      "lea         0x10(%0),%0                   \n"
1084
0
      "lea         0x20(%1),%1                   \n"  // 8 sample to 16 sample
1085
0
      "sub         $0x10,%2                      \n"
1086
0
      "jg          1b                            \n"
1087
0
      : "+r"(src_ptr),                // %0
1088
0
        "+r"(dst_ptr),                // %1
1089
0
        "+r"(dst_width)               // %2
1090
0
      : "r"((intptr_t)(src_stride)),  // %3
1091
0
        "r"((intptr_t)(dst_stride)),  // %4
1092
0
        "m"(kLinearShuffleFar)        // %5
1093
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1094
0
        "xmm7");
1095
0
}
1096
#endif
1097
1098
#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2
1099
void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
1100
                                uint16_t* dst_ptr,
1101
0
                                int dst_width) {
1102
0
  asm volatile(
1103
0
      "pxor        %%xmm5,%%xmm5                 \n"
1104
0
      "pcmpeqd     %%xmm4,%%xmm4                 \n"
1105
0
      "psrld       $31,%%xmm4                    \n"
1106
0
      "pslld       $1,%%xmm4                     \n"  // all 2
1107
1108
0
      LABELALIGN
1109
0
      "1:          \n"
1110
0
      "movq        (%0),%%xmm0                   \n"  // 0123 (16b)
1111
0
      "movq        2(%0),%%xmm1                  \n"  // 1234 (16b)
1112
1113
0
      "punpcklwd   %%xmm5,%%xmm0                 \n"  // 0123 (32b)
1114
0
      "punpcklwd   %%xmm5,%%xmm1                 \n"  // 1234 (32b)
1115
1116
0
      "movdqa      %%xmm0,%%xmm2                 \n"
1117
0
      "movdqa      %%xmm1,%%xmm3                 \n"
1118
1119
0
      "pshufd      $0b10110001,%%xmm2,%%xmm2     \n"  // 1032 (even, far)
1120
0
      "pshufd      $0b10110001,%%xmm3,%%xmm3     \n"  // 2143 (odd, far)
1121
1122
0
      "paddd       %%xmm4,%%xmm2                 \n"  // far+2 (lo)
1123
0
      "paddd       %%xmm4,%%xmm3                 \n"  // far+2 (hi)
1124
0
      "paddd       %%xmm0,%%xmm2                 \n"  // near+far+2 (lo)
1125
0
      "paddd       %%xmm1,%%xmm3                 \n"  // near+far+2 (hi)
1126
0
      "paddd       %%xmm0,%%xmm0                 \n"  // 2*near (lo)
1127
0
      "paddd       %%xmm1,%%xmm1                 \n"  // 2*near (hi)
1128
0
      "paddd       %%xmm2,%%xmm0                 \n"  // 3*near+far+2 (lo)
1129
0
      "paddd       %%xmm3,%%xmm1                 \n"  // 3*near+far+2 (hi)
1130
1131
0
      "psrld       $2,%%xmm0                     \n"  // 3/4*near+1/4*far (lo)
1132
0
      "psrld       $2,%%xmm1                     \n"  // 3/4*near+1/4*far (hi)
1133
0
      "packssdw    %%xmm1,%%xmm0                 \n"
1134
0
      "pshufd      $0b11011000,%%xmm0,%%xmm0     \n"
1135
0
      "movdqu      %%xmm0,(%1)                   \n"
1136
1137
0
      "lea         0x8(%0),%0                    \n"
1138
0
      "lea         0x10(%1),%1                   \n"  // 4 pixel to 8 pixel
1139
0
      "sub         $0x8,%2                       \n"
1140
0
      "jg          1b                            \n"
1141
0
      : "+r"(src_ptr),   // %0
1142
0
        "+r"(dst_ptr),   // %1
1143
0
        "+r"(dst_width)  // %2
1144
0
      :
1145
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1146
0
}
1147
#endif
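// Scalar sketch (illustrative only; the name is hypothetical) of the 1-D
// kernel in ScaleRowUp2_Linear_16_SSE2 above: each pair of source samples
// produces two outputs blended 3:1 with a +2 rounding bias, using 32-bit
// intermediates just as the punpcklwd/paddd widening in the assembly does.
static void ScaleRowUp2_Linear_16_Sketch(const uint16_t* src,
                                         uint16_t* dst,
                                         int dst_width) {
  for (int i = 0; i < dst_width / 2; ++i) {
    uint32_t a = src[i];      // near sample for dst[2*i]
    uint32_t b = src[i + 1];  // near sample for dst[2*i+1]
    dst[2 * i + 0] = (uint16_t)((3 * a + b + 2) >> 2);  // 3/4*near + 1/4*far
    dst[2 * i + 1] = (uint16_t)((a + 3 * b + 2) >> 2);
  }
}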
1148
1149
#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2
1150
void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
1151
                                  ptrdiff_t src_stride,
1152
                                  uint16_t* dst_ptr,
1153
                                  ptrdiff_t dst_stride,
1154
0
                                  int dst_width) {
1155
0
  asm volatile(
1156
0
      "pxor        %%xmm7,%%xmm7                 \n"
1157
0
      "pcmpeqd     %%xmm6,%%xmm6                 \n"
1158
0
      "psrld       $31,%%xmm6                    \n"
1159
0
      "pslld       $3,%%xmm6                     \n"  // all 8
1160
1161
0
      LABELALIGN
1162
0
      "1:          \n"
1163
0
      "movq        (%0),%%xmm0                   \n"  // 0011 (16b, 1u1v)
1164
0
      "movq        4(%0),%%xmm1                  \n"  // 1122 (16b, 1u1v)
1165
0
      "punpcklwd   %%xmm7,%%xmm0                 \n"  // 0011 (near) (32b, 1u1v)
1166
0
      "punpcklwd   %%xmm7,%%xmm1                 \n"  // 1122 (near) (32b, 1u1v)
1167
0
      "movdqa      %%xmm0,%%xmm2                 \n"
1168
0
      "movdqa      %%xmm1,%%xmm3                 \n"
1169
0
      "pshufd      $0b01001110,%%xmm2,%%xmm2     \n"  // 1100 (far) (1, lo)
1170
0
      "pshufd      $0b01001110,%%xmm3,%%xmm3     \n"  // 2211 (far) (1, hi)
1171
0
      "paddd       %%xmm0,%%xmm2                 \n"  // near+far (1, lo)
1172
0
      "paddd       %%xmm1,%%xmm3                 \n"  // near+far (1, hi)
1173
0
      "paddd       %%xmm0,%%xmm0                 \n"  // 2*near (1, lo)
1174
0
      "paddd       %%xmm1,%%xmm1                 \n"  // 2*near (1, hi)
1175
0
      "paddd       %%xmm2,%%xmm0                 \n"  // 3*near+far (1, lo)
1176
0
      "paddd       %%xmm3,%%xmm1                 \n"  // 3*near+far (1, hi)
1177
1178
0
      "movq        (%0),%%xmm0                   \n"  // 0123 (16b)
1179
0
      "movq        2(%0),%%xmm1                  \n"  // 1234 (16b)
1180
0
      "punpcklwd   %%xmm7,%%xmm0                 \n"  // 0123 (32b)
1181
0
      "punpcklwd   %%xmm7,%%xmm1                 \n"  // 1234 (32b)
1182
0
      "movdqa      %%xmm0,%%xmm2                 \n"
1183
0
      "movdqa      %%xmm1,%%xmm3                 \n"
1184
0
      "pshufd      $0b10110001,%%xmm2,%%xmm2     \n"  // 1032 (even, far)
1185
0
      "pshufd      $0b10110001,%%xmm3,%%xmm3     \n"  // 2143 (odd, far)
1186
0
      "paddd       %%xmm0,%%xmm2                 \n"  // near+far (lo)
1187
0
      "paddd       %%xmm1,%%xmm3                 \n"  // near+far (hi)
1188
0
      "paddd       %%xmm0,%%xmm0                 \n"  // 2*near (lo)
1189
0
      "paddd       %%xmm1,%%xmm1                 \n"  // 2*near (hi)
1190
0
      "paddd       %%xmm2,%%xmm0                 \n"  // 3*near+far (1, lo)
1191
0
      "paddd       %%xmm3,%%xmm1                 \n"  // 3*near+far (1, hi)
1192
1193
0
      "movq        (%0,%3,2),%%xmm2              \n"
1194
0
      "movq        2(%0,%3,2),%%xmm3             \n"
1195
0
      "punpcklwd   %%xmm7,%%xmm2                 \n"  // 0123 (32b)
1196
0
      "punpcklwd   %%xmm7,%%xmm3                 \n"  // 1234 (32b)
1197
0
      "movdqa      %%xmm2,%%xmm4                 \n"
1198
0
      "movdqa      %%xmm3,%%xmm5                 \n"
1199
0
      "pshufd      $0b10110001,%%xmm4,%%xmm4     \n"  // 1032 (even, far)
1200
0
      "pshufd      $0b10110001,%%xmm5,%%xmm5     \n"  // 2143 (odd, far)
1201
0
      "paddd       %%xmm2,%%xmm4                 \n"  // near+far (lo)
1202
0
      "paddd       %%xmm3,%%xmm5                 \n"  // near+far (hi)
1203
0
      "paddd       %%xmm2,%%xmm2                 \n"  // 2*near (lo)
1204
0
      "paddd       %%xmm3,%%xmm3                 \n"  // 2*near (hi)
1205
0
      "paddd       %%xmm4,%%xmm2                 \n"  // 3*near+far (2, lo)
1206
0
      "paddd       %%xmm5,%%xmm3                 \n"  // 3*near+far (2, hi)
1207
1208
0
      "movdqa      %%xmm0,%%xmm4                 \n"
1209
0
      "movdqa      %%xmm2,%%xmm5                 \n"
1210
0
      "paddd       %%xmm0,%%xmm4                 \n"  // 6*near+2*far (1, lo)
1211
0
      "paddd       %%xmm6,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
1212
0
      "paddd       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
1213
0
      "paddd       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, lo)
1214
0
      "psrld       $4,%%xmm4                     \n"  // ^ div by 16 (1, lo)
1215
1216
0
      "movdqa      %%xmm2,%%xmm5                 \n"
1217
0
      "paddd       %%xmm2,%%xmm5                 \n"  // 6*near+2*far (2, lo)
1218
0
      "paddd       %%xmm6,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
1219
0
      "paddd       %%xmm2,%%xmm5                 \n"  // 9*near+3*far (2, lo)
1220
0
      "paddd       %%xmm0,%%xmm5                 \n"  // 9 3 3 1 + 8 (2, lo)
1221
0
      "psrld       $4,%%xmm5                     \n"  // ^ div by 16 (2, lo)
1222
1223
0
      "movdqa      %%xmm1,%%xmm0                 \n"
1224
0
      "movdqa      %%xmm3,%%xmm2                 \n"
1225
0
      "paddd       %%xmm1,%%xmm0                 \n"  // 6*near+2*far (1, hi)
1226
0
      "paddd       %%xmm6,%%xmm2                 \n"  // 3*near+far+8 (2, hi)
1227
0
      "paddd       %%xmm1,%%xmm0                 \n"  // 9*near+3*far (1, hi)
1228
0
      "paddd       %%xmm2,%%xmm0                 \n"  // 9 3 3 1 + 8 (1, hi)
1229
0
      "psrld       $4,%%xmm0                     \n"  // ^ div by 16 (1, hi)
1230
1231
0
      "movdqa      %%xmm3,%%xmm2                 \n"
1232
0
      "paddd       %%xmm3,%%xmm2                 \n"  // 6*near+2*far (2, hi)
1233
0
      "paddd       %%xmm6,%%xmm1                 \n"  // 3*near+far+8 (1, hi)
1234
0
      "paddd       %%xmm3,%%xmm2                 \n"  // 9*near+3*far (2, hi)
1235
0
      "paddd       %%xmm1,%%xmm2                 \n"  // 9 3 3 1 + 8 (2, hi)
1236
0
      "psrld       $4,%%xmm2                     \n"  // ^ div by 16 (2, hi)
1237
1238
0
      "packssdw    %%xmm0,%%xmm4                 \n"
1239
0
      "pshufd      $0b11011000,%%xmm4,%%xmm4     \n"
1240
0
      "movdqu      %%xmm4,(%1)                   \n"  // store above
1241
0
      "packssdw    %%xmm2,%%xmm5                 \n"
1242
0
      "pshufd      $0b11011000,%%xmm5,%%xmm5     \n"
1243
0
      "movdqu      %%xmm5,(%1,%4,2)              \n"  // store below
1244
1245
0
      "lea         0x8(%0),%0                    \n"
1246
0
      "lea         0x10(%1),%1                   \n"  // 4 pixel to 8 pixel
1247
0
      "sub         $0x8,%2                       \n"
1248
0
      "jg          1b                            \n"
1249
0
      : "+r"(src_ptr),                // %0
1250
0
        "+r"(dst_ptr),                // %1
1251
0
        "+r"(dst_width)               // %2
1252
0
      : "r"((intptr_t)(src_stride)),  // %3
1253
0
        "r"((intptr_t)(dst_stride))   // %4
1254
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1255
0
        "xmm7");
1256
0
}
1257
#endif
1258
1259
#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3
1260
void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
1261
                              uint8_t* dst_ptr,
1262
0
                              int dst_width) {
1263
0
  asm volatile(
1264
0
      "pcmpeqw     %%xmm4,%%xmm4                 \n"
1265
0
      "psrlw       $15,%%xmm4                    \n"
1266
0
      "psllw       $1,%%xmm4                     \n"  // all 2
1267
0
      "movdqa      %3,%%xmm3                     \n"
1268
1269
0
      LABELALIGN
1270
0
      "1:          \n"
1271
0
      "movq        (%0),%%xmm0                   \n"  // 01234567
1272
0
      "movq        1(%0),%%xmm1                  \n"  // 12345678
1273
0
      "punpcklwd   %%xmm0,%%xmm0                 \n"  // 0101232345456767
1274
0
      "punpcklwd   %%xmm1,%%xmm1                 \n"  // 1212343456567878
1275
0
      "movdqa      %%xmm0,%%xmm2                 \n"
1276
0
      "punpckhdq   %%xmm1,%%xmm2                 \n"  // 4545565667677878
1277
0
      "punpckldq   %%xmm1,%%xmm0                 \n"  // 0101121223233434
1278
0
      "pmaddubsw   %%xmm3,%%xmm2                 \n"  // 3*near+far (hi)
1279
0
      "pmaddubsw   %%xmm3,%%xmm0                 \n"  // 3*near+far (lo)
1280
0
      "paddw       %%xmm4,%%xmm0                 \n"  // 3*near+far+2 (lo)
1281
0
      "paddw       %%xmm4,%%xmm2                 \n"  // 3*near+far+2 (hi)
1282
0
      "psrlw       $2,%%xmm0                     \n"  // 3/4*near+1/4*far (lo)
1283
0
      "psrlw       $2,%%xmm2                     \n"  // 3/4*near+1/4*far (hi)
1284
0
      "packuswb    %%xmm2,%%xmm0                 \n"
1285
0
      "movdqu      %%xmm0,(%1)                   \n"
1286
0
      "lea         0x8(%0),%0                    \n"
1287
0
      "lea         0x10(%1),%1                   \n"  // 8 sample to 16 sample
1288
0
      "sub         $0x10,%2                      \n"
1289
0
      "jg          1b                            \n"
1290
0
      : "+r"(src_ptr),      // %0
1291
0
        "+r"(dst_ptr),      // %1
1292
0
        "+r"(dst_width)     // %2
1293
0
      : "m"(kLinearMadd31)  // %3
1294
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
1295
0
}
1296
#endif
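// Scalar illustration (not part of libyuv; the name is hypothetical) of what
// the pmaddubsw step with kLinearMadd31 computes above: each interleaved byte
// pair is weighted 3:1 toward its nearer sample and summed into a 16-bit
// word, then rounded by +2 and shifted back down to 8 bits.
static void ScaleRowUp2_Linear_Sketch(const uint8_t* src,
                                      uint8_t* dst,
                                      int dst_width) {
  for (int i = 0; i < dst_width / 2; ++i) {
    uint16_t left = 3 * src[i] + src[i + 1];   // one pmaddubsw lane: 3*near+far
    uint16_t right = src[i] + 3 * src[i + 1];  // the adjacent lane
    dst[2 * i + 0] = (uint8_t)((left + 2) >> 2);
    dst[2 * i + 1] = (uint8_t)((right + 2) >> 2);
  }
}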
1297
1298
#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3
1299
void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
1300
                                ptrdiff_t src_stride,
1301
                                uint8_t* dst_ptr,
1302
                                ptrdiff_t dst_stride,
1303
0
                                int dst_width) {
1304
0
  asm volatile(
1305
0
      "pcmpeqw     %%xmm6,%%xmm6                 \n"
1306
0
      "psrlw       $15,%%xmm6                    \n"
1307
0
      "psllw       $3,%%xmm6                     \n"  // all 8
1308
0
      "movdqa      %5,%%xmm7                     \n"
1309
1310
0
      LABELALIGN
1311
0
      "1:          \n"
1312
0
      "movq        (%0),%%xmm0                   \n"  // 01234567
1313
0
      "movq        1(%0),%%xmm1                  \n"  // 12345678
1314
0
      "punpcklwd   %%xmm0,%%xmm0                 \n"  // 0101232345456767
1315
0
      "punpcklwd   %%xmm1,%%xmm1                 \n"  // 1212343456567878
1316
0
      "movdqa      %%xmm0,%%xmm2                 \n"
1317
0
      "punpckhdq   %%xmm1,%%xmm2                 \n"  // 4545565667677878
1318
0
      "punpckldq   %%xmm1,%%xmm0                 \n"  // 0101121223233434
1319
0
      "pmaddubsw   %%xmm7,%%xmm2                 \n"  // 3*near+far (1, hi)
1320
0
      "pmaddubsw   %%xmm7,%%xmm0                 \n"  // 3*near+far (1, lo)
1321
1322
0
      "movq        (%0,%3),%%xmm1                \n"
1323
0
      "movq        1(%0,%3),%%xmm4               \n"
1324
0
      "punpcklwd   %%xmm1,%%xmm1                 \n"
1325
0
      "punpcklwd   %%xmm4,%%xmm4                 \n"
1326
0
      "movdqa      %%xmm1,%%xmm3                 \n"
1327
0
      "punpckhdq   %%xmm4,%%xmm3                 \n"
1328
0
      "punpckldq   %%xmm4,%%xmm1                 \n"
1329
0
      "pmaddubsw   %%xmm7,%%xmm3                 \n"  // 3*near+far (2, hi)
1330
0
      "pmaddubsw   %%xmm7,%%xmm1                 \n"  // 3*near+far (2, lo)
1331
1332
      // xmm0 xmm2
1333
      // xmm1 xmm3
1334
1335
0
      "movdqa      %%xmm0,%%xmm4                 \n"
1336
0
      "movdqa      %%xmm1,%%xmm5                 \n"
1337
0
      "paddw       %%xmm0,%%xmm4                 \n"  // 6*near+2*far (1, lo)
1338
0
      "paddw       %%xmm6,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
1339
0
      "paddw       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
1340
0
      "paddw       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, lo)
1341
0
      "psrlw       $4,%%xmm4                     \n"  // ^ div by 16 (1, lo)
1342
1343
0
      "movdqa      %%xmm1,%%xmm5                 \n"
1344
0
      "paddw       %%xmm1,%%xmm5                 \n"  // 6*near+2*far (2, lo)
1345
0
      "paddw       %%xmm6,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
1346
0
      "paddw       %%xmm1,%%xmm5                 \n"  // 9*near+3*far (2, lo)
1347
0
      "paddw       %%xmm0,%%xmm5                 \n"  // 9 3 3 1 + 8 (2, lo)
1348
0
      "psrlw       $4,%%xmm5                     \n"  // ^ div by 16 (2, lo)
1349
1350
0
      "movdqa      %%xmm2,%%xmm0                 \n"
1351
0
      "movdqa      %%xmm3,%%xmm1                 \n"
1352
0
      "paddw       %%xmm2,%%xmm0                 \n"  // 6*near+2*far (1, hi)
1353
0
      "paddw       %%xmm6,%%xmm1                 \n"  // 3*near+far+8 (2, hi)
1354
0
      "paddw       %%xmm2,%%xmm0                 \n"  // 9*near+3*far (1, hi)
1355
0
      "paddw       %%xmm1,%%xmm0                 \n"  // 9 3 3 1 + 8 (1, hi)
1356
0
      "psrlw       $4,%%xmm0                     \n"  // ^ div by 16 (1, hi)
1357
1358
0
      "movdqa      %%xmm3,%%xmm1                 \n"
1359
0
      "paddw       %%xmm3,%%xmm1                 \n"  // 6*near+2*far (2, hi)
1360
0
      "paddw       %%xmm6,%%xmm2                 \n"  // 3*near+far+8 (1, hi)
1361
0
      "paddw       %%xmm3,%%xmm1                 \n"  // 9*near+3*far (2, hi)
1362
0
      "paddw       %%xmm2,%%xmm1                 \n"  // 9 3 3 1 + 8 (2, hi)
1363
0
      "psrlw       $4,%%xmm1                     \n"  // ^ div by 16 (2, hi)
1364
1365
0
      "packuswb    %%xmm0,%%xmm4                 \n"
1366
0
      "movdqu      %%xmm4,(%1)                   \n"  // store above
1367
0
      "packuswb    %%xmm1,%%xmm5                 \n"
1368
0
      "movdqu      %%xmm5,(%1,%4)                \n"  // store below
1369
1370
0
      "lea         0x8(%0),%0                    \n"
1371
0
      "lea         0x10(%1),%1                   \n"  // 8 sample to 16 sample
1372
0
      "sub         $0x10,%2                      \n"
1373
0
      "jg          1b                            \n"
1374
0
      : "+r"(src_ptr),                // %0
1375
0
        "+r"(dst_ptr),                // %1
1376
0
        "+r"(dst_width)               // %2
1377
0
      : "r"((intptr_t)(src_stride)),  // %3
1378
0
        "r"((intptr_t)(dst_stride)),  // %4
1379
0
        "m"(kLinearMadd31)            // %5
1380
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1381
0
        "xmm7");
1382
0
}
1383
#endif
1384
1385
#ifdef HAS_SCALEROWUP2_LINEAR_AVX2
1386
void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
1387
                             uint8_t* dst_ptr,
1388
13.9k
                             int dst_width) {
1389
13.9k
  asm volatile(
1390
13.9k
      "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
1391
13.9k
      "vpsrlw      $15,%%ymm4,%%ymm4             \n"
1392
13.9k
      "vpsllw      $1,%%ymm4,%%ymm4              \n"  // all 2
1393
13.9k
      "vbroadcastf128 %3,%%ymm3                  \n"
1394
1395
13.9k
      LABELALIGN
1396
13.9k
      "1:          \n"
1397
13.9k
      "vmovdqu     (%0),%%xmm0                   \n"  // 0123456789ABCDEF
1398
13.9k
      "vmovdqu     1(%0),%%xmm1                  \n"  // 123456789ABCDEF0
1399
13.9k
      "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"
1400
13.9k
      "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"
1401
13.9k
      "vpunpcklwd  %%ymm0,%%ymm0,%%ymm0          \n"
1402
13.9k
      "vpunpcklwd  %%ymm1,%%ymm1,%%ymm1          \n"
1403
13.9k
      "vpunpckhdq  %%ymm1,%%ymm0,%%ymm2          \n"
1404
13.9k
      "vpunpckldq  %%ymm1,%%ymm0,%%ymm0          \n"
1405
13.9k
      "vpmaddubsw  %%ymm3,%%ymm2,%%ymm1          \n"  // 3*near+far (hi)
1406
13.9k
      "vpmaddubsw  %%ymm3,%%ymm0,%%ymm0          \n"  // 3*near+far (lo)
1407
13.9k
      "vpaddw      %%ymm4,%%ymm0,%%ymm0          \n"  // 3*near+far+2 (lo)
1408
13.9k
      "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"  // 3*near+far+2 (hi)
1409
13.9k
      "vpsrlw      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far (lo)
1410
13.9k
      "vpsrlw      $2,%%ymm1,%%ymm1              \n"  // 3/4*near+1/4*far (hi)
1411
13.9k
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
1412
13.9k
      "vmovdqu     %%ymm0,(%1)                   \n"
1413
1414
13.9k
      "lea         0x10(%0),%0                   \n"
1415
13.9k
      "lea         0x20(%1),%1                   \n"  // 16 sample to 32 sample
1416
13.9k
      "sub         $0x20,%2                      \n"
1417
13.9k
      "jg          1b                            \n"
1418
13.9k
      "vzeroupper  \n"
1419
13.9k
      : "+r"(src_ptr),      // %0
1420
13.9k
        "+r"(dst_ptr),      // %1
1421
13.9k
        "+r"(dst_width)     // %2
1422
13.9k
      : "m"(kLinearMadd31)  // %3
1423
13.9k
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
1424
13.9k
}
1425
#endif
1426
1427
#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2
1428
void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
1429
                               ptrdiff_t src_stride,
1430
                               uint8_t* dst_ptr,
1431
                               ptrdiff_t dst_stride,
1432
16.4k
                               int dst_width) {
1433
16.4k
  asm volatile(
1434
16.4k
      "vpcmpeqw    %%ymm6,%%ymm6,%%ymm6          \n"
1435
16.4k
      "vpsrlw      $15,%%ymm6,%%ymm6             \n"
1436
16.4k
      "vpsllw      $3,%%ymm6,%%ymm6              \n"  // all 8
1437
16.4k
      "vbroadcastf128 %5,%%ymm7                  \n"
1438
1439
16.4k
      LABELALIGN
1440
16.4k
      "1:          \n"
1441
16.4k
      "vmovdqu     (%0),%%xmm0                   \n"  // 0123456789ABCDEF
1442
16.4k
      "vmovdqu     1(%0),%%xmm1                  \n"  // 123456789ABCDEF0
1443
16.4k
      "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"
1444
16.4k
      "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"
1445
16.4k
      "vpunpcklwd  %%ymm0,%%ymm0,%%ymm0          \n"
1446
16.4k
      "vpunpcklwd  %%ymm1,%%ymm1,%%ymm1          \n"
1447
16.4k
      "vpunpckhdq  %%ymm1,%%ymm0,%%ymm2          \n"
1448
16.4k
      "vpunpckldq  %%ymm1,%%ymm0,%%ymm0          \n"
1449
16.4k
      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm1          \n"  // 3*near+far (1, hi)
1450
16.4k
      "vpmaddubsw  %%ymm7,%%ymm0,%%ymm0          \n"  // 3*near+far (1, lo)
1451
1452
16.4k
      "vmovdqu     (%0,%3),%%xmm2                \n"  // 0123456789ABCDEF
1453
16.4k
      "vmovdqu     1(%0,%3),%%xmm3               \n"  // 123456789ABCDEF0
1454
16.4k
      "vpermq      $0b11011000,%%ymm2,%%ymm2     \n"
1455
16.4k
      "vpermq      $0b11011000,%%ymm3,%%ymm3     \n"
1456
16.4k
      "vpunpcklwd  %%ymm2,%%ymm2,%%ymm2          \n"
1457
16.4k
      "vpunpcklwd  %%ymm3,%%ymm3,%%ymm3          \n"
1458
16.4k
      "vpunpckhdq  %%ymm3,%%ymm2,%%ymm4          \n"
1459
16.4k
      "vpunpckldq  %%ymm3,%%ymm2,%%ymm2          \n"
1460
16.4k
      "vpmaddubsw  %%ymm7,%%ymm4,%%ymm3          \n"  // 3*near+far (2, hi)
1461
16.4k
      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm2          \n"  // 3*near+far (2, lo)
1462
1463
      // ymm0 ymm1
1464
      // ymm2 ymm3
1465
1466
16.4k
      "vpaddw      %%ymm0,%%ymm0,%%ymm4          \n"  // 6*near+2*far (1, lo)
1467
16.4k
      "vpaddw      %%ymm6,%%ymm2,%%ymm5          \n"  // 3*near+far+8 (2, lo)
1468
16.4k
      "vpaddw      %%ymm4,%%ymm0,%%ymm4          \n"  // 9*near+3*far (1, lo)
1469
16.4k
      "vpaddw      %%ymm4,%%ymm5,%%ymm4          \n"  // 9 3 3 1 + 8 (1, lo)
1470
16.4k
      "vpsrlw      $4,%%ymm4,%%ymm4              \n"  // ^ div by 16 (1, lo)
1471
1472
16.4k
      "vpaddw      %%ymm2,%%ymm2,%%ymm5          \n"  // 6*near+2*far (2, lo)
1473
16.4k
      "vpaddw      %%ymm6,%%ymm0,%%ymm0          \n"  // 3*near+far+8 (1, lo)
1474
16.4k
      "vpaddw      %%ymm5,%%ymm2,%%ymm5          \n"  // 9*near+3*far (2, lo)
1475
16.4k
      "vpaddw      %%ymm5,%%ymm0,%%ymm5          \n"  // 9 3 3 1 + 8 (2, lo)
1476
16.4k
      "vpsrlw      $4,%%ymm5,%%ymm5              \n"  // ^ div by 16 (2, lo)
1477
1478
16.4k
      "vpaddw      %%ymm1,%%ymm1,%%ymm0          \n"  // 6*near+2*far (1, hi)
1479
16.4k
      "vpaddw      %%ymm6,%%ymm3,%%ymm2          \n"  // 3*near+far+8 (2, hi)
1480
16.4k
      "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9*near+3*far (1, hi)
1481
16.4k
      "vpaddw      %%ymm0,%%ymm2,%%ymm0          \n"  // 9 3 3 1 + 8 (1, hi)
1482
16.4k
      "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16 (1, hi)
1483
1484
16.4k
      "vpaddw      %%ymm3,%%ymm3,%%ymm2          \n"  // 6*near+2*far (2, hi)
1485
16.4k
      "vpaddw      %%ymm6,%%ymm1,%%ymm1          \n"  // 3*near+far+8 (1, hi)
1486
16.4k
      "vpaddw      %%ymm2,%%ymm3,%%ymm2          \n"  // 9*near+3*far (2, hi)
1487
16.4k
      "vpaddw      %%ymm2,%%ymm1,%%ymm2          \n"  // 9 3 3 1 + 8 (2, hi)
1488
16.4k
      "vpsrlw      $4,%%ymm2,%%ymm2              \n"  // ^ div by 16 (2, hi)
1489
1490
16.4k
      "vpackuswb   %%ymm0,%%ymm4,%%ymm4          \n"
1491
16.4k
      "vmovdqu     %%ymm4,(%1)                   \n"  // store above
1492
16.4k
      "vpackuswb   %%ymm2,%%ymm5,%%ymm5          \n"
1493
16.4k
      "vmovdqu     %%ymm5,(%1,%4)                \n"  // store below
1494
1495
16.4k
      "lea         0x10(%0),%0                   \n"
1496
16.4k
      "lea         0x20(%1),%1                   \n"  // 16 sample to 32 sample
1497
16.4k
      "sub         $0x20,%2                      \n"
1498
16.4k
      "jg          1b                            \n"
1499
16.4k
      "vzeroupper  \n"
1500
16.4k
      : "+r"(src_ptr),                // %0
1501
16.4k
        "+r"(dst_ptr),                // %1
1502
16.4k
        "+r"(dst_width)               // %2
1503
16.4k
      : "r"((intptr_t)(src_stride)),  // %3
1504
16.4k
        "r"((intptr_t)(dst_stride)),  // %4
1505
16.4k
        "m"(kLinearMadd31)            // %5
1506
16.4k
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1507
16.4k
        "xmm7");
1508
16.4k
}
1509
#endif
1510
1511
#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2
1512
void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr,
1513
                                uint16_t* dst_ptr,
1514
76.0k
                                int dst_width) {
1515
76.0k
  asm volatile(
1516
76.0k
      "vbroadcastf128 %3,%%ymm5                  \n"
1517
76.0k
      "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
1518
76.0k
      "vpsrlw      $15,%%ymm4,%%ymm4             \n"
1519
76.0k
      "vpsllw      $1,%%ymm4,%%ymm4              \n"  // all 2
1520
1521
76.0k
      LABELALIGN
1522
76.0k
      "1:          \n"
1523
76.0k
      "vmovdqu     (%0),%%ymm0                   \n"  // 0123456789ABCDEF (16b)
1524
76.0k
      "vmovdqu     2(%0),%%ymm1                  \n"  // 123456789ABCDEF0 (16b)
1525
1526
76.0k
      "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"  // 012389AB4567CDEF
1527
76.0k
      "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"  // 12349ABC5678DEF0
1528
1529
76.0k
      "vpunpckhwd  %%ymm1,%%ymm0,%%ymm2          \n"  // 899AABBCCDDEEFF0 (near)
1530
76.0k
      "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"  // 0112233445566778 (near)
1531
76.0k
      "vpshufb     %%ymm5,%%ymm2,%%ymm3          \n"  // 98A9BACBDCEDFE0F (far)
1532
76.0k
      "vpshufb     %%ymm5,%%ymm0,%%ymm1          \n"  // 1021324354657687 (far)
1533
1534
76.0k
      "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"  // far+2
1535
76.0k
      "vpaddw      %%ymm4,%%ymm3,%%ymm3          \n"  // far+2
1536
76.0k
      "vpaddw      %%ymm0,%%ymm1,%%ymm1          \n"  // near+far+2
1537
76.0k
      "vpaddw      %%ymm2,%%ymm3,%%ymm3          \n"  // near+far+2
1538
76.0k
      "vpaddw      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near
1539
76.0k
      "vpaddw      %%ymm2,%%ymm2,%%ymm2          \n"  // 2*near
1540
76.0k
      "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 3*near+far+2
1541
76.0k
      "vpaddw      %%ymm2,%%ymm3,%%ymm2          \n"  // 3*near+far+2
1542
1543
76.0k
      "vpsrlw      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far
1544
76.0k
      "vpsrlw      $2,%%ymm2,%%ymm2              \n"  // 3/4*near+1/4*far
1545
76.0k
      "vmovdqu     %%ymm0,(%1)                   \n"
1546
76.0k
      "vmovdqu     %%ymm2,32(%1)                 \n"
1547
1548
76.0k
      "lea         0x20(%0),%0                   \n"
1549
76.0k
      "lea         0x40(%1),%1                   \n"  // 16 sample to 32 sample
1550
76.0k
      "sub         $0x20,%2                      \n"
1551
76.0k
      "jg          1b                            \n"
1552
76.0k
      "vzeroupper  \n"
1553
76.0k
      : "+r"(src_ptr),          // %0
1554
76.0k
        "+r"(dst_ptr),          // %1
1555
76.0k
        "+r"(dst_width)         // %2
1556
76.0k
      : "m"(kLinearShuffleFar)  // %3
1557
76.0k
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1558
76.0k
}
1559
#endif
1560
1561
#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2
1562
void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
1563
                                  ptrdiff_t src_stride,
1564
                                  uint16_t* dst_ptr,
1565
                                  ptrdiff_t dst_stride,
1566
17.1k
                                  int dst_width) {
1567
17.1k
  asm volatile(
1568
17.1k
      "vbroadcastf128 %5,%%ymm5                  \n"
1569
17.1k
      "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
1570
17.1k
      "vpsrlw      $15,%%ymm4,%%ymm4             \n"
1571
17.1k
      "vpsllw      $3,%%ymm4,%%ymm4              \n"  // all 8
1572
1573
17.1k
      LABELALIGN
1574
17.1k
      "1:          \n"
1575
1576
17.1k
      "vmovdqu     (%0),%%xmm0                   \n"  // 01234567 (16b)
1577
17.1k
      "vmovdqu     2(%0),%%xmm1                  \n"  // 12345678 (16b)
1578
17.1k
      "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"  // 0123000045670000
1579
17.1k
      "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"  // 1234000056780000
1580
17.1k
      "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"  // 0112233445566778 (near)
1581
17.1k
      "vpshufb     %%ymm5,%%ymm0,%%ymm1          \n"  // 1021324354657687 (far)
1582
17.1k
      "vpaddw      %%ymm0,%%ymm1,%%ymm1          \n"  // near+far
1583
17.1k
      "vpaddw      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near
1584
17.1k
      "vpaddw      %%ymm0,%%ymm1,%%ymm2          \n"  // 3*near+far (1)
1585
1586
17.1k
      "vmovdqu     (%0,%3,2),%%xmm0              \n"  // 01234567 (16b)
1587
17.1k
      "vmovdqu     2(%0,%3,2),%%xmm1             \n"  // 12345678 (16b)
1588
17.1k
      "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"  // 0123000045670000
1589
17.1k
      "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"  // 1234000056780000
1590
17.1k
      "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"  // 0112233445566778 (near)
1591
17.1k
      "vpshufb     %%ymm5,%%ymm0,%%ymm1          \n"  // 1021324354657687 (far)
1592
17.1k
      "vpaddw      %%ymm0,%%ymm1,%%ymm1          \n"  // near+far
1593
17.1k
      "vpaddw      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near
1594
17.1k
      "vpaddw      %%ymm0,%%ymm1,%%ymm3          \n"  // 3*near+far (2)
1595
1596
17.1k
      "vpaddw      %%ymm2,%%ymm2,%%ymm0          \n"  // 6*near+2*far (1)
1597
17.1k
      "vpaddw      %%ymm4,%%ymm3,%%ymm1          \n"  // 3*near+far+8 (2)
1598
17.1k
      "vpaddw      %%ymm0,%%ymm2,%%ymm0          \n"  // 9*near+3*far (1)
1599
17.1k
      "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9 3 3 1 + 8 (1)
1600
17.1k
      "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16
1601
17.1k
      "vmovdqu     %%ymm0,(%1)                   \n"  // store above
1602
1603
17.1k
      "vpaddw      %%ymm3,%%ymm3,%%ymm0          \n"  // 6*near+2*far (2)
1604
17.1k
      "vpaddw      %%ymm4,%%ymm2,%%ymm1          \n"  // 3*near+far+8 (1)
1605
17.1k
      "vpaddw      %%ymm0,%%ymm3,%%ymm0          \n"  // 9*near+3*far (2)
1606
17.1k
      "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9 3 3 1 + 8 (2)
1607
17.1k
      "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16
1608
17.1k
      "vmovdqu     %%ymm0,(%1,%4,2)              \n"  // store below
1609
1610
17.1k
      "lea         0x10(%0),%0                   \n"
1611
17.1k
      "lea         0x20(%1),%1                   \n"  // 8 sample to 16 sample
1612
17.1k
      "sub         $0x10,%2                      \n"
1613
17.1k
      "jg          1b                            \n"
1614
17.1k
      "vzeroupper  \n"
1615
17.1k
      : "+r"(src_ptr),                // %0
1616
17.1k
        "+r"(dst_ptr),                // %1
1617
17.1k
        "+r"(dst_width)               // %2
1618
17.1k
      : "r"((intptr_t)(src_stride)),  // %3
1619
17.1k
        "r"((intptr_t)(dst_stride)),  // %4
1620
17.1k
        "m"(kLinearShuffleFar)        // %5
1621
17.1k
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1622
17.1k
}
1623
#endif
1624
1625
#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2
1626
void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
1627
                                uint16_t* dst_ptr,
1628
0
                                int dst_width) {
1629
0
  asm volatile(
1630
0
      "vpcmpeqd    %%ymm4,%%ymm4,%%ymm4          \n"
1631
0
      "vpsrld      $31,%%ymm4,%%ymm4             \n"
1632
0
      "vpslld      $1,%%ymm4,%%ymm4              \n"  // all 2
1633
1634
0
      LABELALIGN
1635
0
      "1:          \n"
1636
0
      "vmovdqu     (%0),%%xmm0                   \n"  // 01234567 (16b, 1u1v)
1637
0
      "vmovdqu     2(%0),%%xmm1                  \n"  // 12345678 (16b, 1u1v)
1638
1639
0
      "vpmovzxwd   %%xmm0,%%ymm0                 \n"  // 01234567 (32b, 1u1v)
1640
0
      "vpmovzxwd   %%xmm1,%%ymm1                 \n"  // 12345678 (32b, 1u1v)
1641
1642
0
      "vpshufd     $0b10110001,%%ymm0,%%ymm2     \n"  // 10325476 (lo, far)
1643
0
      "vpshufd     $0b10110001,%%ymm1,%%ymm3     \n"  // 21436587 (hi, far)
1644
1645
0
      "vpaddd      %%ymm4,%%ymm2,%%ymm2          \n"  // far+2 (lo)
1646
0
      "vpaddd      %%ymm4,%%ymm3,%%ymm3          \n"  // far+2 (hi)
1647
0
      "vpaddd      %%ymm0,%%ymm2,%%ymm2          \n"  // near+far+2 (lo)
1648
0
      "vpaddd      %%ymm1,%%ymm3,%%ymm3          \n"  // near+far+2 (hi)
1649
0
      "vpaddd      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near (lo)
1650
0
      "vpaddd      %%ymm1,%%ymm1,%%ymm1          \n"  // 2*near (hi)
1651
0
      "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 3*near+far+2 (lo)
1652
0
      "vpaddd      %%ymm1,%%ymm3,%%ymm1          \n"  // 3*near+far+2 (hi)
1653
1654
0
      "vpsrld      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far (lo)
1655
0
      "vpsrld      $2,%%ymm1,%%ymm1              \n"  // 3/4*near+1/4*far (hi)
1656
0
      "vpackusdw   %%ymm1,%%ymm0,%%ymm0          \n"
1657
0
      "vpshufd     $0b11011000,%%ymm0,%%ymm0     \n"
1658
0
      "vmovdqu     %%ymm0,(%1)                   \n"
1659
1660
0
      "lea         0x10(%0),%0                   \n"
1661
0
      "lea         0x20(%1),%1                   \n"  // 8 pixel to 16 pixel
1662
0
      "sub         $0x10,%2                      \n"
1663
0
      "jg          1b                            \n"
1664
0
      "vzeroupper  \n"
1665
0
      : "+r"(src_ptr),   // %0
1666
0
        "+r"(dst_ptr),   // %1
1667
0
        "+r"(dst_width)  // %2
1668
0
      :
1669
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
1670
0
}
1671
#endif
1672
1673
#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2
1674
void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
1675
                                  ptrdiff_t src_stride,
1676
                                  uint16_t* dst_ptr,
1677
                                  ptrdiff_t dst_stride,
1678
0
                                  int dst_width) {
1679
0
  asm volatile(
1680
0
      "vpcmpeqd    %%ymm6,%%ymm6,%%ymm6          \n"
1681
0
      "vpsrld      $31,%%ymm6,%%ymm6             \n"
1682
0
      "vpslld      $3,%%ymm6,%%ymm6              \n"  // all 8
1683
1684
0
      LABELALIGN
1685
0
      "1:          \n"
1686
1687
0
      "vmovdqu     (%0),%%xmm0                   \n"  // 01234567 (16b, 1u1v)
1688
0
      "vmovdqu     2(%0),%%xmm1                  \n"  // 12345678 (16b, 1u1v)
1689
0
      "vpmovzxwd   %%xmm0,%%ymm0                 \n"  // 01234567 (32b, 1u1v)
1690
0
      "vpmovzxwd   %%xmm1,%%ymm1                 \n"  // 12345678 (32b, 1u1v)
1691
0
      "vpshufd     $0b10110001,%%ymm0,%%ymm2     \n"  // 10325476 (lo, far)
1692
0
      "vpshufd     $0b10110001,%%ymm1,%%ymm3     \n"  // 21436587 (hi, far)
1693
0
      "vpaddd      %%ymm0,%%ymm2,%%ymm2          \n"  // near+far (lo)
1694
0
      "vpaddd      %%ymm1,%%ymm3,%%ymm3          \n"  // near+far (hi)
1695
0
      "vpaddd      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near (lo)
1696
0
      "vpaddd      %%ymm1,%%ymm1,%%ymm1          \n"  // 2*near (hi)
1697
0
      "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 3*near+far (1, lo)
1698
0
      "vpaddd      %%ymm1,%%ymm3,%%ymm1          \n"  // 3*near+far (1, hi)
1699
1700
0
      "vmovdqu     (%0,%3,2),%%xmm2              \n"  // 01234567 (16b, 1u1v)
1701
0
      "vmovdqu     2(%0,%3,2),%%xmm3             \n"  // 12345678 (16b, 1u1v)
1702
0
      "vpmovzxwd   %%xmm2,%%ymm2                 \n"  // 01234567 (32b, 1u1v)
1703
0
      "vpmovzxwd   %%xmm3,%%ymm3                 \n"  // 12345678 (32b, 1u1v)
1704
0
      "vpshufd     $0b10110001,%%ymm2,%%ymm4     \n"  // 10325476 (lo, far)
1705
0
      "vpshufd     $0b10110001,%%ymm3,%%ymm5     \n"  // 21436587 (hi, far)
1706
0
      "vpaddd      %%ymm2,%%ymm4,%%ymm4          \n"  // near+far (lo)
1707
0
      "vpaddd      %%ymm3,%%ymm5,%%ymm5          \n"  // near+far (hi)
1708
0
      "vpaddd      %%ymm2,%%ymm2,%%ymm2          \n"  // 2*near (lo)
1709
0
      "vpaddd      %%ymm3,%%ymm3,%%ymm3          \n"  // 2*near (hi)
1710
0
      "vpaddd      %%ymm2,%%ymm4,%%ymm2          \n"  // 3*near+far (2, lo)
1711
0
      "vpaddd      %%ymm3,%%ymm5,%%ymm3          \n"  // 3*near+far (2, hi)
1712
1713
0
      "vpaddd      %%ymm0,%%ymm0,%%ymm4          \n"  // 6*near+2*far (1, lo)
1714
0
      "vpaddd      %%ymm6,%%ymm2,%%ymm5          \n"  // 3*near+far+8 (2, lo)
1715
0
      "vpaddd      %%ymm4,%%ymm0,%%ymm4          \n"  // 9*near+3*far (1, lo)
1716
0
      "vpaddd      %%ymm4,%%ymm5,%%ymm4          \n"  // 9 3 3 1 + 8 (1, lo)
1717
0
      "vpsrld      $4,%%ymm4,%%ymm4              \n"  // ^ div by 16 (1, lo)
1718
1719
0
      "vpaddd      %%ymm2,%%ymm2,%%ymm5          \n"  // 6*near+2*far (2, lo)
1720
0
      "vpaddd      %%ymm6,%%ymm0,%%ymm0          \n"  // 3*near+far+8 (1, lo)
1721
0
      "vpaddd      %%ymm5,%%ymm2,%%ymm5          \n"  // 9*near+3*far (2, lo)
1722
0
      "vpaddd      %%ymm5,%%ymm0,%%ymm5          \n"  // 9 3 3 1 + 8 (2, lo)
1723
0
      "vpsrld      $4,%%ymm5,%%ymm5              \n"  // ^ div by 16 (2, lo)
1724
1725
0
      "vpaddd      %%ymm1,%%ymm1,%%ymm0          \n"  // 6*near+2*far (1, hi)
1726
0
      "vpaddd      %%ymm6,%%ymm3,%%ymm2          \n"  // 3*near+far+8 (2, hi)
1727
0
      "vpaddd      %%ymm0,%%ymm1,%%ymm0          \n"  // 9*near+3*far (1, hi)
1728
0
      "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 9 3 3 1 + 8 (1, hi)
1729
0
      "vpsrld      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16 (1, hi)
1730
1731
0
      "vpaddd      %%ymm3,%%ymm3,%%ymm2          \n"  // 6*near+2*far (2, hi)
1732
0
      "vpaddd      %%ymm6,%%ymm1,%%ymm1          \n"  // 3*near+far+8 (1, hi)
1733
0
      "vpaddd      %%ymm2,%%ymm3,%%ymm2          \n"  // 9*near+3*far (2, hi)
1734
0
      "vpaddd      %%ymm2,%%ymm1,%%ymm2          \n"  // 9 3 3 1 + 8 (2, hi)
1735
0
      "vpsrld      $4,%%ymm2,%%ymm2              \n"  // ^ div by 16 (2, hi)
1736
1737
0
      "vpackusdw   %%ymm0,%%ymm4,%%ymm4          \n"
1738
0
      "vpshufd     $0b11011000,%%ymm4,%%ymm4     \n"
1739
0
      "vmovdqu     %%ymm4,(%1)                   \n"  // store above
1740
0
      "vpackusdw   %%ymm2,%%ymm5,%%ymm5          \n"
1741
0
      "vpshufd     $0b11011000,%%ymm5,%%ymm5     \n"
1742
0
      "vmovdqu     %%ymm5,(%1,%4,2)              \n"  // store below
1743
1744
0
      "lea         0x10(%0),%0                   \n"
1745
0
      "lea         0x20(%1),%1                   \n"  // 8 pixel to 16 pixel
1746
0
      "sub         $0x10,%2                      \n"
1747
0
      "jg          1b                            \n"
1748
0
      "vzeroupper  \n"
1749
0
      : "+r"(src_ptr),                // %0
1750
0
        "+r"(dst_ptr),                // %1
1751
0
        "+r"(dst_width)               // %2
1752
0
      : "r"((intptr_t)(src_stride)),  // %3
1753
0
        "r"((intptr_t)(dst_stride))   // %4
1754
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1755
0
}
1756
#endif
1757
1758
// Reads 16xN bytes and produces 16 shorts at a time.
1759
void ScaleAddRow_SSE2(const uint8_t* src_ptr,
1760
                      uint16_t* dst_ptr,
1761
0
                      int src_width) {
1762
0
  asm volatile("pxor        %%xmm5,%%xmm5                 \n"
1763
1764
               // 16 pixel loop.
1765
0
               LABELALIGN
1766
0
               "1:          \n"
1767
0
               "movdqu      (%0),%%xmm3                   \n"
1768
0
               "lea         0x10(%0),%0                   \n"  // src_ptr += 16
1769
0
               "movdqu      (%1),%%xmm0                   \n"
1770
0
               "movdqu      0x10(%1),%%xmm1               \n"
1771
0
               "movdqa      %%xmm3,%%xmm2                 \n"
1772
0
               "punpcklbw   %%xmm5,%%xmm2                 \n"
1773
0
               "punpckhbw   %%xmm5,%%xmm3                 \n"
1774
0
               "paddusw     %%xmm2,%%xmm0                 \n"
1775
0
               "paddusw     %%xmm3,%%xmm1                 \n"
1776
0
               "movdqu      %%xmm0,(%1)                   \n"
1777
0
               "movdqu      %%xmm1,0x10(%1)               \n"
1778
0
               "lea         0x20(%1),%1                   \n"
1779
0
               "sub         $0x10,%2                      \n"
1780
0
               "jg          1b                            \n"
1781
0
               : "+r"(src_ptr),   // %0
1782
0
                 "+r"(dst_ptr),   // %1
1783
0
                 "+r"(src_width)  // %2
1784
0
               :
1785
0
               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
1786
0
}
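// Scalar equivalent (illustrative only; the name is hypothetical) of the row
// accumulation above: widen each source byte and add it into the uint16_t
// accumulator row.  The SSE2/AVX2 versions use saturating adds (paddusw /
// vpaddusw), which only matters if the accumulated sums would exceed 16 bits.
static void ScaleAddRow_Sketch(const uint8_t* src_ptr,
                               uint16_t* dst_ptr,
                               int src_width) {
  for (int x = 0; x < src_width; ++x) {
    uint32_t sum = (uint32_t)dst_ptr[x] + src_ptr[x];
    dst_ptr[x] = (uint16_t)(sum > 65535 ? 65535 : sum);  // clamp like paddusw
  }
}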
1787
1788
#ifdef HAS_SCALEADDROW_AVX2
1789
// Reads 32 bytes and accumulates to 32 shorts at a time.
1790
void ScaleAddRow_AVX2(const uint8_t* src_ptr,
1791
                      uint16_t* dst_ptr,
1792
559k
                      int src_width) {
1793
559k
  asm volatile("vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
1794
1795
559k
               LABELALIGN
1796
559k
               "1:          \n"
1797
559k
               "vmovdqu     (%0),%%ymm3                   \n"
1798
559k
               "lea         0x20(%0),%0                   \n"  // src_ptr += 32
1799
559k
               "vpermq      $0xd8,%%ymm3,%%ymm3           \n"
1800
559k
               "vpunpcklbw  %%ymm5,%%ymm3,%%ymm2          \n"
1801
559k
               "vpunpckhbw  %%ymm5,%%ymm3,%%ymm3          \n"
1802
559k
               "vpaddusw    (%1),%%ymm2,%%ymm0            \n"
1803
559k
               "vpaddusw    0x20(%1),%%ymm3,%%ymm1        \n"
1804
559k
               "vmovdqu     %%ymm0,(%1)                   \n"
1805
559k
               "vmovdqu     %%ymm1,0x20(%1)               \n"
1806
559k
               "lea         0x40(%1),%1                   \n"
1807
559k
               "sub         $0x20,%2                      \n"
1808
559k
               "jg          1b                            \n"
1809
559k
               "vzeroupper  \n"
1810
559k
               : "+r"(src_ptr),   // %0
1811
559k
                 "+r"(dst_ptr),   // %1
1812
559k
                 "+r"(src_width)  // %2
1813
559k
               :
1814
559k
               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
1815
559k
}
1816
#endif  // HAS_SCALEADDROW_AVX2
1817
1818
// Constant for making pixels signed to avoid pmaddubsw
1819
// saturation.
1820
static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1821
                              0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
1822
1823
// Constant for making pixels unsigned and adding .5 for rounding.
1824
static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
1825
                               0x4040, 0x4040, 0x4040, 0x4040};
1826
1827
// Bilinear column filtering. SSSE3 version.
1828
void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
1829
                           const uint8_t* src_ptr,
1830
                           int dst_width,
1831
                           int x,
1832
570k
                           int dx) {
1833
570k
  intptr_t x0, x1, temp_pixel;
1834
570k
  asm volatile(
1835
570k
      "movd        %6,%%xmm2                     \n"
1836
570k
      "movd        %7,%%xmm3                     \n"
1837
570k
      "movl        $0x04040000,%k2               \n"
1838
570k
      "movd        %k2,%%xmm5                    \n"
1839
570k
      "pcmpeqb     %%xmm6,%%xmm6                 \n"
1840
570k
      "psrlw       $0x9,%%xmm6                   \n"  // 0x007f007f
1841
570k
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
1842
570k
      "psrlw       $15,%%xmm7                    \n"  // 0x00010001
1843
1844
570k
      "pextrw      $0x1,%%xmm2,%k3               \n"
1845
570k
      "subl        $0x2,%5                       \n"
1846
570k
      "jl          29f                           \n"
1847
570k
      "movdqa      %%xmm2,%%xmm0                 \n"
1848
570k
      "paddd       %%xmm3,%%xmm0                 \n"
1849
570k
      "punpckldq   %%xmm0,%%xmm2                 \n"
1850
570k
      "punpckldq   %%xmm3,%%xmm3                 \n"
1851
570k
      "paddd       %%xmm3,%%xmm3                 \n"
1852
570k
      "pextrw      $0x3,%%xmm2,%k4               \n"
1853
1854
570k
      LABELALIGN
1855
570k
      "2:          \n"
1856
570k
      "movdqa      %%xmm2,%%xmm1                 \n"
1857
570k
      "paddd       %%xmm3,%%xmm2                 \n"
1858
570k
      "movzwl      0x00(%1,%3,1),%k2             \n"
1859
570k
      "movd        %k2,%%xmm0                    \n"
1860
570k
      "psrlw       $0x9,%%xmm1                   \n"
1861
570k
      "movzwl      0x00(%1,%4,1),%k2             \n"
1862
570k
      "movd        %k2,%%xmm4                    \n"
1863
570k
      "pshufb      %%xmm5,%%xmm1                 \n"
1864
570k
      "punpcklwd   %%xmm4,%%xmm0                 \n"
1865
570k
      "psubb       %8,%%xmm0                     \n"  // make pixels signed.
1866
570k
      "pxor        %%xmm6,%%xmm1                 \n"  // 128 - f = (f ^ 127 ) +
1867
                                                      // 1
1868
570k
      "paddusb     %%xmm7,%%xmm1                 \n"
1869
570k
      "pmaddubsw   %%xmm0,%%xmm1                 \n"
1870
570k
      "pextrw      $0x1,%%xmm2,%k3               \n"
1871
570k
      "pextrw      $0x3,%%xmm2,%k4               \n"
1872
570k
      "paddw       %9,%%xmm1                     \n"  // make pixels unsigned.
1873
570k
      "psrlw       $0x7,%%xmm1                   \n"
1874
570k
      "packuswb    %%xmm1,%%xmm1                 \n"
1875
570k
      "movd        %%xmm1,%k2                    \n"
1876
570k
      "mov         %w2,(%0)                      \n"
1877
570k
      "lea         0x2(%0),%0                    \n"
1878
570k
      "subl        $0x2,%5                       \n"
1879
570k
      "jge         2b                            \n"
1880
1881
570k
      LABELALIGN
1882
570k
      "29:         \n"
1883
570k
      "addl        $0x1,%5                       \n"
1884
570k
      "jl          99f                           \n"
1885
570k
      "movzwl      0x00(%1,%3,1),%k2             \n"
1886
570k
      "movd        %k2,%%xmm0                    \n"
1887
570k
      "psrlw       $0x9,%%xmm2                   \n"
1888
570k
      "pshufb      %%xmm5,%%xmm2                 \n"
1889
570k
      "psubb       %8,%%xmm0                     \n"  // make pixels signed.
1890
570k
      "pxor        %%xmm6,%%xmm2                 \n"
1891
570k
      "paddusb     %%xmm7,%%xmm2                 \n"
1892
570k
      "pmaddubsw   %%xmm0,%%xmm2                 \n"
1893
570k
      "paddw       %9,%%xmm2                     \n"  // make pixels unsigned.
1894
570k
      "psrlw       $0x7,%%xmm2                   \n"
1895
570k
      "packuswb    %%xmm2,%%xmm2                 \n"
1896
570k
      "movd        %%xmm2,%k2                    \n"
1897
570k
      "mov         %b2,(%0)                      \n"
1898
570k
      "99:         \n"
1899
570k
      : "+r"(dst_ptr),      // %0
1900
570k
        "+r"(src_ptr),      // %1
1901
570k
        "=&a"(temp_pixel),  // %2
1902
570k
        "=&r"(x0),          // %3
1903
570k
        "=&r"(x1),          // %4
1904
570k
#if defined(__x86_64__)
1905
570k
        "+rm"(dst_width)  // %5
1906
#else
1907
        "+m"(dst_width)  // %5
1908
#endif
1909
570k
      : "rm"(x),   // %6
1910
570k
        "rm"(dx),  // %7
1911
570k
#if defined(__x86_64__)
1912
570k
        "x"(kFsub80),  // %8
1913
570k
        "x"(kFadd40)   // %9
1914
#else
1915
        "m"(kFsub80),    // %8
1916
        "m"(kFadd40)     // %9
1917
#endif
1918
570k
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1919
570k
        "xmm7");
1920
570k
}
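// Conceptual scalar form (illustrative only; the name is hypothetical) of the
// bilinear column filter above.  x and dx are 16.16 fixed-point positions;
// the SSSE3 code keeps the top 7 bits of the fraction, blends each pixel pair
// with weights (128-f, f) via pmaddubsw, and rounds with +64 before shifting
// right by 7.  The kFsub80/kFadd40 bias constants above only keep pmaddubsw
// from saturating, as their comments note.
static void ScaleFilterCols_Sketch(uint8_t* dst_ptr,
                                   const uint8_t* src_ptr,
                                   int dst_width,
                                   int x,
                                   int dx) {
  for (int j = 0; j < dst_width; ++j) {
    int xi = x >> 16;         // integer source column
    int f = (x >> 9) & 0x7f;  // top 7 bits of the 16-bit fraction
    dst_ptr[j] =
        (uint8_t)((src_ptr[xi] * (128 - f) + src_ptr[xi + 1] * f + 64) >> 7);
    x += dx;
  }
}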
1921
1922
// Reads 16 pixels, duplicates them and writes 32 pixels.
1923
// No alignment requirement: unaligned loads and stores are used.
1924
void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
1925
                       const uint8_t* src_ptr,
1926
                       int dst_width,
1927
                       int x,
1928
0
                       int dx) {
1929
0
  (void)x;
1930
0
  (void)dx;
1931
0
  asm volatile(
1932
0
      "1:          \n"
1933
0
      "movdqu      (%1),%%xmm0                   \n"
1934
0
      "lea         0x10(%1),%1                   \n"
1935
0
      "movdqa      %%xmm0,%%xmm1                 \n"
1936
0
      "punpcklbw   %%xmm0,%%xmm0                 \n"
1937
0
      "punpckhbw   %%xmm1,%%xmm1                 \n"
1938
0
      "movdqu      %%xmm0,(%0)                   \n"
1939
0
      "movdqu      %%xmm1,0x10(%0)               \n"
1940
0
      "lea         0x20(%0),%0                   \n"
1941
0
      "sub         $0x20,%2                      \n"
1942
0
      "jg          1b                            \n"
1943
1944
0
      : "+r"(dst_ptr),   // %0
1945
0
        "+r"(src_ptr),   // %1
1946
0
        "+r"(dst_width)  // %2
1947
0
      :
1948
0
      : "memory", "cc", "xmm0", "xmm1");
1949
0
}
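// Scalar form (illustrative only; the name is hypothetical) of the 2x column
// duplication above: every source byte is simply written twice, which is all
// the punpcklbw/punpckhbw pair does, 16 bytes per iteration.
static void ScaleColsUp2_Sketch(uint8_t* dst_ptr,
                                const uint8_t* src_ptr,
                                int dst_width) {
  for (int i = 0; i < dst_width / 2; ++i) {
    dst_ptr[2 * i + 0] = src_ptr[i];
    dst_ptr[2 * i + 1] = src_ptr[i];
  }
}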
1950
1951
void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
1952
                            ptrdiff_t src_stride,
1953
                            uint8_t* dst_argb,
1954
0
                            int dst_width) {
1955
0
  (void)src_stride;
1956
0
  asm volatile(
1957
0
      "1:          \n"
1958
0
      "movdqu      (%0),%%xmm0                   \n"
1959
0
      "movdqu      0x10(%0),%%xmm1               \n"
1960
0
      "lea         0x20(%0),%0                   \n"
1961
0
      "shufps      $0xdd,%%xmm1,%%xmm0           \n"
1962
0
      "movdqu      %%xmm0,(%1)                   \n"
1963
0
      "lea         0x10(%1),%1                   \n"
1964
0
      "sub         $0x4,%2                       \n"
1965
0
      "jg          1b                            \n"
1966
0
      : "+r"(src_argb),  // %0
1967
0
        "+r"(dst_argb),  // %1
1968
0
        "+r"(dst_width)  // %2
1969
0
      :
1970
0
      : "memory", "cc", "xmm0", "xmm1");
1971
0
}
1972
1973
void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
1974
                                  ptrdiff_t src_stride,
1975
                                  uint8_t* dst_argb,
1976
0
                                  int dst_width) {
1977
0
  (void)src_stride;
1978
0
  asm volatile(
1979
0
      "1:          \n"
1980
0
      "movdqu      (%0),%%xmm0                   \n"
1981
0
      "movdqu      0x10(%0),%%xmm1               \n"
1982
0
      "lea         0x20(%0),%0                   \n"
1983
0
      "movdqa      %%xmm0,%%xmm2                 \n"
1984
0
      "shufps      $0x88,%%xmm1,%%xmm0           \n"
1985
0
      "shufps      $0xdd,%%xmm1,%%xmm2           \n"
1986
0
      "pavgb       %%xmm2,%%xmm0                 \n"
1987
0
      "movdqu      %%xmm0,(%1)                   \n"
1988
0
      "lea         0x10(%1),%1                   \n"
1989
0
      "sub         $0x4,%2                       \n"
1990
0
      "jg          1b                            \n"
1991
0
      : "+r"(src_argb),  // %0
1992
0
        "+r"(dst_argb),  // %1
1993
0
        "+r"(dst_width)  // %2
1994
0
      :
1995
0
      : "memory", "cc", "xmm0", "xmm1");
1996
0
}
1997
1998
void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
1999
                               ptrdiff_t src_stride,
2000
                               uint8_t* dst_argb,
2001
0
                               int dst_width) {
2002
0
  asm volatile(
2003
0
      "1:          \n"
2004
0
      "movdqu      (%0),%%xmm0                   \n"
2005
0
      "movdqu      0x10(%0),%%xmm1               \n"
2006
0
      "movdqu      0x00(%0,%3,1),%%xmm2          \n"
2007
0
      "movdqu      0x10(%0,%3,1),%%xmm3          \n"
2008
0
      "lea         0x20(%0),%0                   \n"
2009
0
      "pavgb       %%xmm2,%%xmm0                 \n"
2010
0
      "pavgb       %%xmm3,%%xmm1                 \n"
2011
0
      "movdqa      %%xmm0,%%xmm2                 \n"
2012
0
      "shufps      $0x88,%%xmm1,%%xmm0           \n"
2013
0
      "shufps      $0xdd,%%xmm1,%%xmm2           \n"
2014
0
      "pavgb       %%xmm2,%%xmm0                 \n"
2015
0
      "movdqu      %%xmm0,(%1)                   \n"
2016
0
      "lea         0x10(%1),%1                   \n"
2017
0
      "sub         $0x4,%2                       \n"
2018
0
      "jg          1b                            \n"
2019
0
      : "+r"(src_argb),              // %0
2020
0
        "+r"(dst_argb),              // %1
2021
0
        "+r"(dst_width)              // %2
2022
0
      : "r"((intptr_t)(src_stride))  // %3
2023
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
2024
0
}
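// Scalar form (illustrative only; the name is hypothetical) of the 2x2 ARGB
// box filter above: each output pixel averages a 2x2 block of source pixels
// per channel.  The SSE2 code reaches the same result with three rounds of
// pavgb (vertical average, then horizontal), whose rounding can differ from
// the exact (sum + 2) / 4 shown here by at most one.
static void ScaleARGBRowDown2Box_Sketch(const uint8_t* src_argb,
                                        ptrdiff_t src_stride,
                                        uint8_t* dst_argb,
                                        int dst_width) {
  for (int i = 0; i < dst_width; ++i) {
    const uint8_t* p = src_argb + i * 8;  // two source pixels (8 bytes)
    const uint8_t* q = p + src_stride;    // the same two pixels, one row down
    for (int c = 0; c < 4; ++c) {         // B, G, R, A
      dst_argb[i * 4 + c] =
          (uint8_t)((p[c] + p[4 + c] + q[c] + q[4 + c] + 2) >> 2);
    }
  }
}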
2025
2026
// Reads 4 pixels at a time.
2027
// Alignment requirement: dst_argb 16 byte aligned.
2028
void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
2029
                               ptrdiff_t src_stride,
2030
                               int src_stepx,
2031
                               uint8_t* dst_argb,
2032
0
                               int dst_width) {
2033
0
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
2034
0
  intptr_t src_stepx_x12;
2035
0
  (void)src_stride;
2036
0
  asm volatile(
2037
0
      "lea         0x00(,%1,4),%1                \n"
2038
0
      "lea         0x00(%1,%1,2),%4              \n"
2039
2040
0
      LABELALIGN
2041
0
      "1:          \n"
2042
0
      "movd        (%0),%%xmm0                   \n"
2043
0
      "movd        0x00(%0,%1,1),%%xmm1          \n"
2044
0
      "punpckldq   %%xmm1,%%xmm0                 \n"
2045
0
      "movd        0x00(%0,%1,2),%%xmm2          \n"
2046
0
      "movd        0x00(%0,%4,1),%%xmm3          \n"
2047
0
      "lea         0x00(%0,%1,4),%0              \n"
2048
0
      "punpckldq   %%xmm3,%%xmm2                 \n"
2049
0
      "punpcklqdq  %%xmm2,%%xmm0                 \n"
2050
0
      "movdqu      %%xmm0,(%2)                   \n"
2051
0
      "lea         0x10(%2),%2                   \n"
2052
0
      "sub         $0x4,%3                       \n"
2053
0
      "jg          1b                            \n"
2054
0
      : "+r"(src_argb),       // %0
2055
0
        "+r"(src_stepx_x4),   // %1
2056
0
        "+r"(dst_argb),       // %2
2057
0
        "+r"(dst_width),      // %3
2058
0
        "=&r"(src_stepx_x12)  // %4
2059
0
      :
2060
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
2061
0
}
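// Scalar form (illustrative only; the name is hypothetical) of the even-step
// ARGB downsampler above: copy every src_stepx-th 4-byte pixel.  The SSE2
// version gathers four such pixels per iteration with movd/punpckldq.
static void ScaleARGBRowDownEven_Sketch(const uint8_t* src_argb,
                                        int src_stepx,
                                        uint8_t* dst_argb,
                                        int dst_width) {
  for (int i = 0; i < dst_width; ++i) {
    const uint8_t* p = src_argb + (ptrdiff_t)i * src_stepx * 4;
    dst_argb[i * 4 + 0] = p[0];  // B
    dst_argb[i * 4 + 1] = p[1];  // G
    dst_argb[i * 4 + 2] = p[2];  // R
    dst_argb[i * 4 + 3] = p[3];  // A
  }
}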
2062
2063
// Blends four 2x2 to 4x1.
2064
// Alignment requirement: dst_argb 16 byte aligned.
2065
void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
2066
                                  ptrdiff_t src_stride,
2067
                                  int src_stepx,
2068
                                  uint8_t* dst_argb,
2069
0
                                  int dst_width) {
2070
0
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
2071
0
  intptr_t src_stepx_x12;
2072
0
  intptr_t row1 = (intptr_t)(src_stride);
2073
0
  asm volatile(
2074
0
      "lea         0x00(,%1,4),%1                \n"
2075
0
      "lea         0x00(%1,%1,2),%4              \n"
2076
0
      "lea         0x00(%0,%5,1),%5              \n"
2077
2078
0
      LABELALIGN
2079
0
      "1:          \n"
2080
0
      "movq        (%0),%%xmm0                   \n"
2081
0
      "movhps      0x00(%0,%1,1),%%xmm0          \n"
2082
0
      "movq        0x00(%0,%1,2),%%xmm1          \n"
2083
0
      "movhps      0x00(%0,%4,1),%%xmm1          \n"
2084
0
      "lea         0x00(%0,%1,4),%0              \n"
2085
0
      "movq        (%5),%%xmm2                   \n"
2086
0
      "movhps      0x00(%5,%1,1),%%xmm2          \n"
2087
0
      "movq        0x00(%5,%1,2),%%xmm3          \n"
2088
0
      "movhps      0x00(%5,%4,1),%%xmm3          \n"
2089
0
      "lea         0x00(%5,%1,4),%5              \n"
2090
0
      "pavgb       %%xmm2,%%xmm0                 \n"
2091
0
      "pavgb       %%xmm3,%%xmm1                 \n"
2092
0
      "movdqa      %%xmm0,%%xmm2                 \n"
2093
0
      "shufps      $0x88,%%xmm1,%%xmm0           \n"
2094
0
      "shufps      $0xdd,%%xmm1,%%xmm2           \n"
2095
0
      "pavgb       %%xmm2,%%xmm0                 \n"
2096
0
      "movdqu      %%xmm0,(%2)                   \n"
2097
0
      "lea         0x10(%2),%2                   \n"
2098
0
      "sub         $0x4,%3                       \n"
2099
0
      "jg          1b                            \n"
2100
0
      : "+r"(src_argb),        // %0
2101
0
        "+r"(src_stepx_x4),    // %1
2102
0
        "+r"(dst_argb),        // %2
2103
0
        "+rm"(dst_width),      // %3
2104
0
        "=&r"(src_stepx_x12),  // %4
2105
0
        "+r"(row1)             // %5
2106
0
      :
2107
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
2108
0
}
2109
2110
void ScaleARGBCols_SSE2(uint8_t* dst_argb,
2111
                        const uint8_t* src_argb,
2112
                        int dst_width,
2113
                        int x,
2114
0
                        int dx) {
2115
0
  intptr_t x0, x1;
2116
0
  asm volatile(
2117
0
      "movd        %5,%%xmm2                     \n"
2118
0
      "movd        %6,%%xmm3                     \n"
2119
0
      "pshufd      $0x0,%%xmm2,%%xmm2            \n"
2120
0
      "pshufd      $0x11,%%xmm3,%%xmm0           \n"
2121
0
      "paddd       %%xmm0,%%xmm2                 \n"
2122
0
      "paddd       %%xmm3,%%xmm3                 \n"
2123
0
      "pshufd      $0x5,%%xmm3,%%xmm0            \n"
2124
0
      "paddd       %%xmm0,%%xmm2                 \n"
2125
0
      "paddd       %%xmm3,%%xmm3                 \n"
2126
0
      "pshufd      $0x0,%%xmm3,%%xmm3            \n"
2127
0
      "pextrw      $0x1,%%xmm2,%k0               \n"
2128
0
      "pextrw      $0x3,%%xmm2,%k1               \n"
2129
0
      "cmp         $0x0,%4                       \n"
2130
0
      "jl          99f                           \n"
2131
0
      "sub         $0x4,%4                       \n"
2132
0
      "jl          49f                           \n"
2133
2134
0
      LABELALIGN
2135
0
      "40:         \n"
2136
0
      "movd        0x00(%3,%0,4),%%xmm0          \n"
2137
0
      "movd        0x00(%3,%1,4),%%xmm1          \n"
2138
0
      "pextrw      $0x5,%%xmm2,%k0               \n"
2139
0
      "pextrw      $0x7,%%xmm2,%k1               \n"
2140
0
      "paddd       %%xmm3,%%xmm2                 \n"
2141
0
      "punpckldq   %%xmm1,%%xmm0                 \n"
2142
0
      "movd        0x00(%3,%0,4),%%xmm1          \n"
2143
0
      "movd        0x00(%3,%1,4),%%xmm4          \n"
2144
0
      "pextrw      $0x1,%%xmm2,%k0               \n"
2145
0
      "pextrw      $0x3,%%xmm2,%k1               \n"
2146
0
      "punpckldq   %%xmm4,%%xmm1                 \n"
2147
0
      "punpcklqdq  %%xmm1,%%xmm0                 \n"
2148
0
      "movdqu      %%xmm0,(%2)                   \n"
2149
0
      "lea         0x10(%2),%2                   \n"
2150
0
      "sub         $0x4,%4                       \n"
2151
0
      "jge         40b                           \n"
2152
2153
0
      "49:         \n"
2154
0
      "test        $0x2,%4                       \n"
2155
0
      "je          29f                           \n"
2156
0
      "movd        0x00(%3,%0,4),%%xmm0          \n"
2157
0
      "movd        0x00(%3,%1,4),%%xmm1          \n"
2158
0
      "pextrw      $0x5,%%xmm2,%k0               \n"
2159
0
      "punpckldq   %%xmm1,%%xmm0                 \n"
2160
0
      "movq        %%xmm0,(%2)                   \n"
2161
0
      "lea         0x8(%2),%2                    \n"
2162
0
      "29:         \n"
2163
0
      "test        $0x1,%4                       \n"
2164
0
      "je          99f                           \n"
2165
0
      "movd        0x00(%3,%0,4),%%xmm0          \n"
2166
0
      "movd        %%xmm0,(%2)                   \n"
2167
0
      "99:         \n"
2168
0
      : "=&a"(x0),       // %0
2169
0
        "=&d"(x1),       // %1
2170
0
        "+r"(dst_argb),  // %2
2171
0
        "+r"(src_argb),  // %3
2172
0
        "+r"(dst_width)  // %4
2173
0
      : "rm"(x),         // %5
2174
0
        "rm"(dx)         // %6
2175
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
2176
0
}
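
A scalar sketch of the nearest-neighbour column scaler above (illustrative only): x is a 16.16 fixed-point source position advanced by dx per output pixel, and the whole 4-byte ARGB pixel at its integer part is copied.

// Hypothetical scalar outline of ScaleARGBCols_SSE2.
static void ScaleARGBCols_Sketch(uint8_t* dst_argb,
                                 const uint8_t* src_argb,
                                 int dst_width,
                                 int x,
                                 int dx) {
  const uint32_t* src = (const uint32_t*)src_argb;  // one ARGB pixel per word
  uint32_t* dst = (uint32_t*)dst_argb;
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];  // integer part of the position picks the pixel
    x += dx;
  }
}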
2177
2178
// Reads 4 pixels, duplicates them and writes 8 pixels.
2179
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
2180
void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
2181
                           const uint8_t* src_argb,
2182
                           int dst_width,
2183
                           int x,
2184
0
                           int dx) {
2185
0
  (void)x;
2186
0
  (void)dx;
2187
0
  asm volatile(
2188
0
      "1:          \n"
2189
0
      "movdqu      (%1),%%xmm0                   \n"
2190
0
      "lea         0x10(%1),%1                   \n"
2191
0
      "movdqa      %%xmm0,%%xmm1                 \n"
2192
0
      "punpckldq   %%xmm0,%%xmm0                 \n"
2193
0
      "punpckhdq   %%xmm1,%%xmm1                 \n"
2194
0
      "movdqu      %%xmm0,(%0)                   \n"
2195
0
      "movdqu      %%xmm1,0x10(%0)               \n"
2196
0
      "lea         0x20(%0),%0                   \n"
2197
0
      "sub         $0x8,%2                       \n"
2198
0
      "jg          1b                            \n"
2199
2200
0
      : "+r"(dst_argb),  // %0
2201
0
        "+r"(src_argb),  // %1
2202
0
        "+r"(dst_width)  // %2
2203
0
      :
2204
0
      : "memory", "cc", "xmm0", "xmm1");
2205
0
}
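
Likewise, a scalar sketch of the 2x column duplication above (illustrative name only):

// Hypothetical scalar outline of ScaleARGBColsUp2_SSE2: every source ARGB
// pixel is written twice; x and dx are unused.
static void ScaleARGBColsUp2_Sketch(uint8_t* dst_argb,
                                    const uint8_t* src_argb,
                                    int dst_width,
                                    int x,
                                    int dx) {
  const uint32_t* src = (const uint32_t*)src_argb;
  uint32_t* dst = (uint32_t*)dst_argb;
  int j;
  (void)x;
  (void)dx;
  for (j = 0; j < dst_width; j += 2) {
    dst[j] = dst[j + 1] = *src++;
  }
}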
2206
2207
// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
2208
static const uvec8 kShuffleColARGB = {
2209
    0u, 4u,  1u, 5u,  2u,  6u,  3u,  7u,  // bbggrraa 1st pixel
2210
    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
2211
};
2212
2213
// Shuffle table for duplicating 2 fractions into 8 bytes each
2214
static const uvec8 kShuffleFractions = {
2215
    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
2216
};
2217
2218
// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
2219
void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
2220
                               const uint8_t* src_argb,
2221
                               int dst_width,
2222
                               int x,
2223
0
                               int dx) {
2224
0
  intptr_t x0, x1;
2225
0
  asm volatile(
2226
0
      "movdqa      %0,%%xmm4                     \n"
2227
0
      "movdqa      %1,%%xmm5                     \n"
2228
0
      :
2229
0
      : "m"(kShuffleColARGB),   // %0
2230
0
        "m"(kShuffleFractions)  // %1
2231
0
  );
2232
2233
0
  asm volatile(
2234
0
      "movd        %5,%%xmm2                     \n"
2235
0
      "movd        %6,%%xmm3                     \n"
2236
0
      "pcmpeqb     %%xmm6,%%xmm6                 \n"
2237
0
      "psrlw       $0x9,%%xmm6                   \n"
2238
0
      "pextrw      $0x1,%%xmm2,%k3               \n"
2239
0
      "sub         $0x2,%2                       \n"
2240
0
      "jl          29f                           \n"
2241
0
      "movdqa      %%xmm2,%%xmm0                 \n"
2242
0
      "paddd       %%xmm3,%%xmm0                 \n"
2243
0
      "punpckldq   %%xmm0,%%xmm2                 \n"
2244
0
      "punpckldq   %%xmm3,%%xmm3                 \n"
2245
0
      "paddd       %%xmm3,%%xmm3                 \n"
2246
0
      "pextrw      $0x3,%%xmm2,%k4               \n"
2247
2248
0
      LABELALIGN
2249
0
      "2:          \n"
2250
0
      "movdqa      %%xmm2,%%xmm1                 \n"
2251
0
      "paddd       %%xmm3,%%xmm2                 \n"
2252
0
      "movq        0x00(%1,%3,4),%%xmm0          \n"
2253
0
      "psrlw       $0x9,%%xmm1                   \n"
2254
0
      "movhps      0x00(%1,%4,4),%%xmm0          \n"
2255
0
      "pshufb      %%xmm5,%%xmm1                 \n"
2256
0
      "pshufb      %%xmm4,%%xmm0                 \n"
2257
0
      "pxor        %%xmm6,%%xmm1                 \n"
2258
0
      "pmaddubsw   %%xmm1,%%xmm0                 \n"
2259
0
      "psrlw       $0x7,%%xmm0                   \n"
2260
0
      "pextrw      $0x1,%%xmm2,%k3               \n"
2261
0
      "pextrw      $0x3,%%xmm2,%k4               \n"
2262
0
      "packuswb    %%xmm0,%%xmm0                 \n"
2263
0
      "movq        %%xmm0,(%0)                   \n"
2264
0
      "lea         0x8(%0),%0                    \n"
2265
0
      "sub         $0x2,%2                       \n"
2266
0
      "jge         2b                            \n"
2267
2268
0
      LABELALIGN
2269
0
      "29:         \n"
2270
0
      "add         $0x1,%2                       \n"
2271
0
      "jl          99f                           \n"
2272
0
      "psrlw       $0x9,%%xmm2                   \n"
2273
0
      "movq        0x00(%1,%3,4),%%xmm0          \n"
2274
0
      "pshufb      %%xmm5,%%xmm2                 \n"
2275
0
      "pshufb      %%xmm4,%%xmm0                 \n"
2276
0
      "pxor        %%xmm6,%%xmm2                 \n"
2277
0
      "pmaddubsw   %%xmm2,%%xmm0                 \n"
2278
0
      "psrlw       $0x7,%%xmm0                   \n"
2279
0
      "packuswb    %%xmm0,%%xmm0                 \n"
2280
0
      "movd        %%xmm0,(%0)                   \n"
2281
2282
0
      LABELALIGN "99:         \n"
2283
2284
0
      : "+r"(dst_argb),    // %0
2285
0
        "+r"(src_argb),    // %1
2286
0
        "+rm"(dst_width),  // %2
2287
0
        "=&r"(x0),         // %3
2288
0
        "=&r"(x1)          // %4
2289
0
      : "rm"(x),           // %5
2290
0
        "rm"(dx)           // %6
2291
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
2292
0
}
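
A per-channel scalar sketch of the bilinear column filter above (illustrative, not the library's reference code). The 7-bit blend fraction comes from bits 9..15 of the 16.16 position; since 0x7f ^ f equals 127 - f for a 7-bit f, the pxor with xmm6 above forms the complementary weight that pmaddubsw then applies.

// Hypothetical scalar outline of ScaleARGBFilterCols_SSSE3: blend each pair
// of neighbouring source pixels with weights (127 - f) : f and shift by 7.
static void ScaleARGBFilterCols_Sketch(uint8_t* dst_argb,
                                       const uint8_t* src_argb,
                                       int dst_width,
                                       int x,
                                       int dx) {
  int j, c;
  for (j = 0; j < dst_width; ++j) {
    const uint8_t* p = src_argb + (x >> 16) * 4;  // left pixel of the pair
    int f = (x >> 9) & 0x7f;                      // 7-bit fraction
    for (c = 0; c < 4; ++c) {
      dst_argb[c] = (uint8_t)((p[c] * (127 - f) + p[c + 4] * f) >> 7);
    }
    dst_argb += 4;
    x += dx;
  }
}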
2293
2294
// Divide num by div and return as 16.16 fixed point result.
2295
28.7k
int FixedDiv_X86(int num, int div) {
2296
28.7k
  asm volatile(
2297
28.7k
      "cdq         \n"
2298
28.7k
      "shld        $0x10,%%eax,%%edx             \n"
2299
28.7k
      "shl         $0x10,%%eax                   \n"
2300
28.7k
      "idiv        %1                            \n"
2301
28.7k
      "mov         %0, %%eax                     \n"
2302
28.7k
      : "+a"(num)  // %0
2303
28.7k
      : "c"(div)   // %1
2304
28.7k
      : "memory", "cc", "edx");
2305
28.7k
  return num;
2306
28.7k
}
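
A portable sketch of the same 16.16 fixed-point division (the asm widens num into edx:eax with cdq/shld before the idiv; the name below is illustrative):

// Hypothetical portable equivalent of FixedDiv_X86.
static int FixedDiv_Sketch(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}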
2307
2308
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
2309
24.5k
int FixedDiv1_X86(int num, int div) {
2310
24.5k
  asm volatile(
2311
24.5k
      "cdq         \n"
2312
24.5k
      "shld        $0x10,%%eax,%%edx             \n"
2313
24.5k
      "shl         $0x10,%%eax                   \n"
2314
24.5k
      "sub         $0x10001,%%eax                \n"
2315
24.5k
      "sbb         $0x0,%%edx                    \n"
2316
24.5k
      "sub         $0x1,%1                       \n"
2317
24.5k
      "idiv        %1                            \n"
2318
24.5k
      "mov         %0, %%eax                     \n"
2319
24.5k
      : "+a"(num)  // %0
2320
24.5k
      : "c"(div)   // %1
2321
24.5k
      : "memory", "cc", "edx");
2322
24.5k
  return num;
2323
24.5k
}
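
And for the off-by-one variant, which biases the 16.16 numerator down by 0x10001 and divides by div - 1, as the sub/sbb pair above shows (illustrative sketch):

// Hypothetical portable equivalent of FixedDiv1_X86.
static int FixedDiv1_Sketch(int num, int div) {
  return (int)((((int64_t)num << 16) - 0x00010001) / (div - 1));
}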
2324
2325
#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) || \
2326
    defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
2327
2328
// Shuffle table for splitting UV into upper and lower part of register.
2329
static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u,
2330
                                      1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u};
2331
static const uvec8 kShuffleMergeUV = {0u,   8u,   2u,   10u,  4u,   12u,
2332
                                      6u,   14u,  0x80, 0x80, 0x80, 0x80,
2333
                                      0x80, 0x80, 0x80, 0x80};
2334
#endif
2335
2336
#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
2337
2338
void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
2339
                              ptrdiff_t src_stride,
2340
                              uint8_t* dst_ptr,
2341
0
                              int dst_width) {
2342
0
  asm volatile(
2343
0
      "pcmpeqb     %%xmm4,%%xmm4                 \n"  // 01010101
2344
0
      "psrlw       $0xf,%%xmm4                   \n"
2345
0
      "packuswb    %%xmm4,%%xmm4                 \n"
2346
0
      "pxor        %%xmm5, %%xmm5                \n"  // zero
2347
0
      "movdqa      %4,%%xmm1                     \n"  // split shuffler
2348
0
      "movdqa      %5,%%xmm3                     \n"  // merge shuffler
2349
2350
0
      LABELALIGN
2351
0
      "1:          \n"
2352
0
      "movdqu      (%0),%%xmm0                   \n"  // 8 UV row 0
2353
0
      "movdqu      0x00(%0,%3,1),%%xmm2          \n"  // 8 UV row 1
2354
0
      "lea         0x10(%0),%0                   \n"
2355
0
      "pshufb      %%xmm1,%%xmm0                 \n"  // uuuuvvvv
2356
0
      "pshufb      %%xmm1,%%xmm2                 \n"
2357
0
      "pmaddubsw   %%xmm4,%%xmm0                 \n"  // horizontal add
2358
0
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
2359
0
      "paddw       %%xmm2,%%xmm0                 \n"  // vertical add
2360
0
      "psrlw       $0x1,%%xmm0                   \n"  // round
2361
0
      "pavgw       %%xmm5,%%xmm0                 \n"
2362
0
      "pshufb      %%xmm3,%%xmm0                 \n"  // merge uv
2363
0
      "movq        %%xmm0,(%1)                   \n"
2364
0
      "lea         0x8(%1),%1                    \n"  // 4 UV
2365
0
      "sub         $0x4,%2                       \n"
2366
0
      "jg          1b                            \n"
2367
0
      : "+r"(src_ptr),                // %0
2368
0
        "+r"(dst_ptr),                // %1
2369
0
        "+r"(dst_width)               // %2
2370
0
      : "r"((intptr_t)(src_stride)),  // %3
2371
0
        "m"(kShuffleSplitUV),         // %4
2372
0
        "m"(kShuffleMergeUV)          // %5
2373
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
2374
0
}
2375
#endif  // HAS_SCALEUVROWDOWN2BOX_SSSE3
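
A scalar sketch of the interleaved-UV 2x2 box filter implemented by the SSSE3 kernel above (the AVX2 variant that follows computes the same thing, 16 UV pairs per iteration). The pmaddubsw / paddw / psrlw / pavgw sequence yields the same (sum + 2) >> 2 rounding used here; the function name is illustrative.

// Hypothetical scalar outline: average each 2x2 block of interleaved UV
// samples from two rows into one output UV pair.
static void ScaleUVRowDown2Box_Sketch(const uint8_t* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8_t* dst_ptr,
                                      int dst_width) {
  const uint8_t* row1 = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[0] = (uint8_t)((src_ptr[0] + src_ptr[2] +      // U, row 0
                            row1[0] + row1[2] + 2) >> 2);  // U, row 1
    dst_ptr[1] = (uint8_t)((src_ptr[1] + src_ptr[3] +      // V, row 0
                            row1[1] + row1[3] + 2) >> 2);  // V, row 1
    src_ptr += 4;  // two source UV pairs consumed
    row1 += 4;
    dst_ptr += 2;  // one output UV pair produced
  }
}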
2376
2377
#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
2378
void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
2379
                             ptrdiff_t src_stride,
2380
                             uint8_t* dst_ptr,
2381
0
                             int dst_width) {
2382
0
  asm volatile(
2383
0
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"  // 01010101
2384
0
      "vpabsb      %%ymm4,%%ymm4                 \n"
2385
0
      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"  // zero
2386
0
      "vbroadcastf128 %4,%%ymm1                  \n"  // split shuffler
2387
0
      "vbroadcastf128 %5,%%ymm3                  \n"  // merge shuffler
2388
2389
0
      LABELALIGN
2390
0
      "1:          \n"
2391
0
      "vmovdqu     (%0),%%ymm0                   \n"  // 16 UV row 0
2392
0
      "vmovdqu     0x00(%0,%3,1),%%ymm2          \n"  // 16 UV row 1
2393
0
      "lea         0x20(%0),%0                   \n"
2394
0
      "vpshufb     %%ymm1,%%ymm0,%%ymm0          \n"  // uuuuvvvv
2395
0
      "vpshufb     %%ymm1,%%ymm2,%%ymm2          \n"
2396
0
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"  // horizontal add
2397
0
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
2398
0
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"  // vertical add
2399
0
      "vpsrlw      $0x1,%%ymm0,%%ymm0            \n"  // round
2400
0
      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
2401
0
      "vpshufb     %%ymm3,%%ymm0,%%ymm0          \n"  // merge uv
2402
0
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // combine qwords
2403
0
      "vmovdqu     %%xmm0,(%1)                   \n"
2404
0
      "lea         0x10(%1),%1                   \n"  // 8 UV
2405
0
      "sub         $0x8,%2                       \n"
2406
0
      "jg          1b                            \n"
2407
0
      "vzeroupper  \n"
2408
0
      : "+r"(src_ptr),                // %0
2409
0
        "+r"(dst_ptr),                // %1
2410
0
        "+r"(dst_width)               // %2
2411
0
      : "r"((intptr_t)(src_stride)),  // %3
2412
0
        "m"(kShuffleSplitUV),         // %4
2413
0
        "m"(kShuffleMergeUV)          // %5
2414
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
2415
0
}
2416
#endif  // HAS_SCALEUVROWDOWN2BOX_AVX2
2417
2418
static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3,
2419
                                      3, 1, 3, 1, 1, 3, 1, 3};
2420
2421
#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3
2422
void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
2423
                                uint8_t* dst_ptr,
2424
0
                                int dst_width) {
2425
0
  asm volatile(
2426
0
      "pcmpeqw     %%xmm4,%%xmm4                 \n"
2427
0
      "psrlw       $15,%%xmm4                    \n"
2428
0
      "psllw       $1,%%xmm4                     \n"  // all 2
2429
0
      "movdqa      %3,%%xmm3                     \n"
2430
2431
0
      LABELALIGN
2432
0
      "1:          \n"
2433
0
      "movq        (%0),%%xmm0                   \n"  // 00112233 (1u1v)
2434
0
      "movq        2(%0),%%xmm1                  \n"  // 11223344 (1u1v)
2435
0
      "punpcklbw   %%xmm1,%%xmm0                 \n"  // 0101121223233434 (2u2v)
2436
0
      "movdqa      %%xmm0,%%xmm2                 \n"
2437
0
      "punpckhdq   %%xmm0,%%xmm2                 \n"  // 2323232334343434 (2u2v)
2438
0
      "punpckldq   %%xmm0,%%xmm0                 \n"  // 0101010112121212 (2u2v)
2439
0
      "pmaddubsw   %%xmm3,%%xmm2                 \n"  // 3*near+far (1u1v16, hi)
2440
0
      "pmaddubsw   %%xmm3,%%xmm0                 \n"  // 3*near+far (1u1v16, lo)
2441
0
      "paddw       %%xmm4,%%xmm0                 \n"  // 3*near+far+2 (lo)
2442
0
      "paddw       %%xmm4,%%xmm2                 \n"  // 3*near+far+2 (hi)
2443
0
      "psrlw       $2,%%xmm0                     \n"  // 3/4*near+1/4*far (lo)
2444
0
      "psrlw       $2,%%xmm2                     \n"  // 3/4*near+1/4*far (hi)
2445
0
      "packuswb    %%xmm2,%%xmm0                 \n"
2446
0
      "movdqu      %%xmm0,(%1)                   \n"
2447
2448
0
      "lea         0x8(%0),%0                    \n"
2449
0
      "lea         0x10(%1),%1                   \n"  // 4 uv to 8 uv
2450
0
      "sub         $0x8,%2                       \n"
2451
0
      "jg          1b                            \n"
2452
0
      : "+r"(src_ptr),        // %0
2453
0
        "+r"(dst_ptr),        // %1
2454
0
        "+r"(dst_width)       // %2
2455
0
      : "m"(kUVLinearMadd31)  // %3
2456
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
2457
0
}
2458
#endif
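
The kUVLinearMadd31 weights implement a 3:1 near/far blend. A scalar sketch of what the linear kernels (SSSE3 above, AVX2 and 16-bit variants below) compute per output pixel; boundary pixels are not handled here and the name is illustrative.

// Hypothetical scalar outline: each source UV pair produces two output UV
// pairs, blended 3:1 with the pair to its right, with +2 rounding.
static void ScaleUVRowUp2_Linear_Sketch(const uint8_t* src_ptr,
                                        uint8_t* dst_ptr,
                                        int dst_width) {
  int src_width = dst_width >> 1;
  int x, c;
  for (x = 0; x < src_width; ++x) {
    for (c = 0; c < 2; ++c) {  // U then V
      int near_uv = src_ptr[2 * x + c];
      int far_uv = src_ptr[2 * x + 2 + c];  // neighbouring pair to the right
      dst_ptr[4 * x + c] = (uint8_t)((3 * near_uv + far_uv + 2) >> 2);
      dst_ptr[4 * x + 2 + c] = (uint8_t)((near_uv + 3 * far_uv + 2) >> 2);
    }
  }
}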
2459
2460
#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3
2461
void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
2462
                                  ptrdiff_t src_stride,
2463
                                  uint8_t* dst_ptr,
2464
                                  ptrdiff_t dst_stride,
2465
0
                                  int dst_width) {
2466
0
  asm volatile(
2467
0
      "pcmpeqw     %%xmm6,%%xmm6                 \n"
2468
0
      "psrlw       $15,%%xmm6                    \n"
2469
0
      "psllw       $3,%%xmm6                     \n"  // all 8
2470
0
      "movdqa      %5,%%xmm7                     \n"
2471
2472
0
      LABELALIGN
2473
0
      "1:          \n"
2474
0
      "movq        (%0),%%xmm0                   \n"  // 00112233 (1u1v)
2475
0
      "movq        2(%0),%%xmm1                  \n"  // 11223344 (1u1v)
2476
0
      "punpcklbw   %%xmm1,%%xmm0                 \n"  // 0101121223233434 (2u2v)
2477
0
      "movdqa      %%xmm0,%%xmm2                 \n"
2478
0
      "punpckhdq   %%xmm0,%%xmm2                 \n"  // 2323232334343434 (2u2v)
2479
0
      "punpckldq   %%xmm0,%%xmm0                 \n"  // 0101010112121212 (2u2v)
2480
0
      "pmaddubsw   %%xmm7,%%xmm2                 \n"  // 3*near+far (1u1v16, hi)
2481
0
      "pmaddubsw   %%xmm7,%%xmm0                 \n"  // 3*near+far (1u1v16, lo)
2482
2483
0
      "movq        (%0,%3),%%xmm1                \n"
2484
0
      "movq        2(%0,%3),%%xmm4               \n"
2485
0
      "punpcklbw   %%xmm4,%%xmm1                 \n"
2486
0
      "movdqa      %%xmm1,%%xmm3                 \n"
2487
0
      "punpckhdq   %%xmm1,%%xmm3                 \n"
2488
0
      "punpckldq   %%xmm1,%%xmm1                 \n"
2489
0
      "pmaddubsw   %%xmm7,%%xmm3                 \n"  // 3*near+far (2, hi)
2490
0
      "pmaddubsw   %%xmm7,%%xmm1                 \n"  // 3*near+far (2, lo)
2491
2492
      // xmm0 xmm2
2493
      // xmm1 xmm3
2494
2495
0
      "movdqa      %%xmm0,%%xmm4                 \n"
2496
0
      "movdqa      %%xmm1,%%xmm5                 \n"
2497
0
      "paddw       %%xmm0,%%xmm4                 \n"  // 6*near+2*far (1, lo)
2498
0
      "paddw       %%xmm6,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
2499
0
      "paddw       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
2500
0
      "paddw       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, lo)
2501
0
      "psrlw       $4,%%xmm4                     \n"  // ^ div by 16 (1, lo)
2502
2503
0
      "movdqa      %%xmm1,%%xmm5                 \n"
2504
0
      "paddw       %%xmm1,%%xmm5                 \n"  // 6*near+2*far (2, lo)
2505
0
      "paddw       %%xmm6,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
2506
0
      "paddw       %%xmm1,%%xmm5                 \n"  // 9*near+3*far (2, lo)
2507
0
      "paddw       %%xmm0,%%xmm5                 \n"  // 9 3 3 1 + 8 (2, lo)
2508
0
      "psrlw       $4,%%xmm5                     \n"  // ^ div by 16 (2, lo)
2509
2510
0
      "movdqa      %%xmm2,%%xmm0                 \n"
2511
0
      "movdqa      %%xmm3,%%xmm1                 \n"
2512
0
      "paddw       %%xmm2,%%xmm0                 \n"  // 6*near+2*far (1, hi)
2513
0
      "paddw       %%xmm6,%%xmm1                 \n"  // 3*near+far+8 (2, hi)
2514
0
      "paddw       %%xmm2,%%xmm0                 \n"  // 9*near+3*far (1, hi)
2515
0
      "paddw       %%xmm1,%%xmm0                 \n"  // 9 3 3 1 + 8 (1, hi)
2516
0
      "psrlw       $4,%%xmm0                     \n"  // ^ div by 16 (1, hi)
2517
2518
0
      "movdqa      %%xmm3,%%xmm1                 \n"
2519
0
      "paddw       %%xmm3,%%xmm1                 \n"  // 6*near+2*far (2, hi)
2520
0
      "paddw       %%xmm6,%%xmm2                 \n"  // 3*near+far+8 (1, hi)
2521
0
      "paddw       %%xmm3,%%xmm1                 \n"  // 9*near+3*far (2, hi)
2522
0
      "paddw       %%xmm2,%%xmm1                 \n"  // 9 3 3 1 + 8 (2, hi)
2523
0
      "psrlw       $4,%%xmm1                     \n"  // ^ div by 16 (2, hi)
2524
2525
0
      "packuswb    %%xmm0,%%xmm4                 \n"
2526
0
      "movdqu      %%xmm4,(%1)                   \n"  // store above
2527
0
      "packuswb    %%xmm1,%%xmm5                 \n"
2528
0
      "movdqu      %%xmm5,(%1,%4)                \n"  // store below
2529
2530
0
      "lea         0x8(%0),%0                    \n"
2531
0
      "lea         0x10(%1),%1                   \n"  // 4 uv to 8 uv
2532
0
      "sub         $0x8,%2                       \n"
2533
0
      "jg          1b                            \n"
2534
0
      : "+r"(src_ptr),                // %0
2535
0
        "+r"(dst_ptr),                // %1
2536
0
        "+r"(dst_width)               // %2
2537
0
      : "r"((intptr_t)(src_stride)),  // %3
2538
0
        "r"((intptr_t)(dst_stride)),  // %4
2539
0
        "m"(kUVLinearMadd31)          // %5
2540
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
2541
0
        "xmm7");
2542
0
}
2543
#endif
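
The bilinear kernel extends the same idea to two source rows, giving 9:3:3:1 weights with +8 rounding and a shift by 4. A scalar sketch (illustrative, names hypothetical):

// Hypothetical scalar outline: two source rows produce two destination rows;
// each output sample is a 9:3:3:1 blend of the surrounding 2x2 UV samples.
static void ScaleUVRowUp2_Bilinear_Sketch(const uint8_t* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8_t* dst_ptr,
                                          ptrdiff_t dst_stride,
                                          int dst_width) {
  const uint8_t* s = src_ptr;               // near source row
  const uint8_t* t = src_ptr + src_stride;  // far source row
  uint8_t* d = dst_ptr;                     // output row stored above
  uint8_t* e = dst_ptr + dst_stride;        // output row stored below
  int src_width = dst_width >> 1;
  int x, c;
  for (x = 0; x < src_width; ++x) {
    for (c = 0; c < 2; ++c) {  // U then V
      int s0 = s[2 * x + c], s1 = s[2 * x + 2 + c];
      int t0 = t[2 * x + c], t1 = t[2 * x + 2 + c];
      d[4 * x + c] = (uint8_t)((9 * s0 + 3 * s1 + 3 * t0 + t1 + 8) >> 4);
      d[4 * x + 2 + c] = (uint8_t)((3 * s0 + 9 * s1 + t0 + 3 * t1 + 8) >> 4);
      e[4 * x + c] = (uint8_t)((3 * s0 + s1 + 9 * t0 + 3 * t1 + 8) >> 4);
      e[4 * x + 2 + c] = (uint8_t)((s0 + 3 * s1 + 3 * t0 + 9 * t1 + 8) >> 4);
    }
  }
}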
2544
2545
#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2
2546
2547
void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
2548
                               uint8_t* dst_ptr,
2549
0
                               int dst_width) {
2550
0
  asm volatile(
2551
0
      "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
2552
0
      "vpsrlw      $15,%%ymm4,%%ymm4             \n"
2553
0
      "vpsllw      $1,%%ymm4,%%ymm4              \n"  // all 2
2554
0
      "vbroadcastf128 %3,%%ymm3                  \n"
2555
2556
0
      LABELALIGN
2557
0
      "1:          \n"
2558
0
      "vmovdqu     (%0),%%xmm0                   \n"
2559
0
      "vmovdqu     2(%0),%%xmm1                  \n"
2560
0
      "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"
2561
0
      "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"
2562
0
      "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"
2563
0
      "vpunpckhdq  %%ymm0,%%ymm0,%%ymm2          \n"
2564
0
      "vpunpckldq  %%ymm0,%%ymm0,%%ymm0          \n"
2565
0
      "vpmaddubsw  %%ymm3,%%ymm2,%%ymm1          \n"  // 3*near+far (hi)
2566
0
      "vpmaddubsw  %%ymm3,%%ymm0,%%ymm0          \n"  // 3*near+far (lo)
2567
0
      "vpaddw      %%ymm4,%%ymm0,%%ymm0          \n"  // 3*near+far+2 (lo)
2568
0
      "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"  // 3*near+far+2 (hi)
2569
0
      "vpsrlw      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far (lo)
2570
0
      "vpsrlw      $2,%%ymm1,%%ymm1              \n"  // 3/4*near+1/4*far (hi)
2571
0
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
2572
0
      "vmovdqu     %%ymm0,(%1)                   \n"
2573
2574
0
      "lea         0x10(%0),%0                   \n"
2575
0
      "lea         0x20(%1),%1                   \n"  // 8 uv to 16 uv
2576
0
      "sub         $0x10,%2                      \n"
2577
0
      "jg          1b                            \n"
2578
0
      "vzeroupper  \n"
2579
0
      : "+r"(src_ptr),        // %0
2580
0
        "+r"(dst_ptr),        // %1
2581
0
        "+r"(dst_width)       // %2
2582
0
      : "m"(kUVLinearMadd31)  // %3
2583
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
2584
0
}
2585
#endif
2586
2587
#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2
2588
void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
2589
                                 ptrdiff_t src_stride,
2590
                                 uint8_t* dst_ptr,
2591
                                 ptrdiff_t dst_stride,
2592
0
                                 int dst_width) {
2593
0
  asm volatile(
2594
0
      "vpcmpeqw    %%ymm6,%%ymm6,%%ymm6          \n"
2595
0
      "vpsrlw      $15,%%ymm6,%%ymm6             \n"
2596
0
      "vpsllw      $3,%%ymm6,%%ymm6              \n"  // all 8
2597
0
      "vbroadcastf128 %5,%%ymm7                  \n"
2598
2599
0
      LABELALIGN
2600
0
      "1:          \n"
2601
0
      "vmovdqu     (%0),%%xmm0                   \n"
2602
0
      "vmovdqu     2(%0),%%xmm1                  \n"
2603
0
      "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"
2604
0
      "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"
2605
0
      "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"
2606
0
      "vpunpckhdq  %%ymm0,%%ymm0,%%ymm2          \n"
2607
0
      "vpunpckldq  %%ymm0,%%ymm0,%%ymm0          \n"
2608
0
      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm1          \n"  // 3*near+far (1, hi)
2609
0
      "vpmaddubsw  %%ymm7,%%ymm0,%%ymm0          \n"  // 3*near+far (1, lo)
2610
2611
0
      "vmovdqu     (%0,%3),%%xmm2                \n"  // 0123456789ABCDEF
2612
0
      "vmovdqu     2(%0,%3),%%xmm3               \n"  // 123456789ABCDEF0
2613
0
      "vpermq      $0b11011000,%%ymm2,%%ymm2     \n"
2614
0
      "vpermq      $0b11011000,%%ymm3,%%ymm3     \n"
2615
0
      "vpunpcklbw  %%ymm3,%%ymm2,%%ymm2          \n"
2616
0
      "vpunpckhdq  %%ymm2,%%ymm2,%%ymm4          \n"
2617
0
      "vpunpckldq  %%ymm2,%%ymm2,%%ymm2          \n"
2618
0
      "vpmaddubsw  %%ymm7,%%ymm4,%%ymm3          \n"  // 3*near+far (2, hi)
2619
0
      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm2          \n"  // 3*near+far (2, lo)
2620
2621
      // ymm0 ymm1
2622
      // ymm2 ymm3
2623
2624
0
      "vpaddw      %%ymm0,%%ymm0,%%ymm4          \n"  // 6*near+2*far (1, lo)
2625
0
      "vpaddw      %%ymm6,%%ymm2,%%ymm5          \n"  // 3*near+far+8 (2, lo)
2626
0
      "vpaddw      %%ymm4,%%ymm0,%%ymm4          \n"  // 9*near+3*far (1, lo)
2627
0
      "vpaddw      %%ymm4,%%ymm5,%%ymm4          \n"  // 9 3 3 1 + 8 (1, lo)
2628
0
      "vpsrlw      $4,%%ymm4,%%ymm4              \n"  // ^ div by 16 (1, lo)
2629
2630
0
      "vpaddw      %%ymm2,%%ymm2,%%ymm5          \n"  // 6*near+2*far (2, lo)
2631
0
      "vpaddw      %%ymm6,%%ymm0,%%ymm0          \n"  // 3*near+far+8 (1, lo)
2632
0
      "vpaddw      %%ymm5,%%ymm2,%%ymm5          \n"  // 9*near+3*far (2, lo)
2633
0
      "vpaddw      %%ymm5,%%ymm0,%%ymm5          \n"  // 9 3 3 1 + 8 (2, lo)
2634
0
      "vpsrlw      $4,%%ymm5,%%ymm5              \n"  // ^ div by 16 (2, lo)
2635
2636
0
      "vpaddw      %%ymm1,%%ymm1,%%ymm0          \n"  // 6*near+2*far (1, hi)
2637
0
      "vpaddw      %%ymm6,%%ymm3,%%ymm2          \n"  // 3*near+far+8 (2, hi)
2638
0
      "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9*near+3*far (1, hi)
2639
0
      "vpaddw      %%ymm0,%%ymm2,%%ymm0          \n"  // 9 3 3 1 + 8 (1, hi)
2640
0
      "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16 (1, hi)
2641
2642
0
      "vpaddw      %%ymm3,%%ymm3,%%ymm2          \n"  // 6*near+2*far (2, hi)
2643
0
      "vpaddw      %%ymm6,%%ymm1,%%ymm1          \n"  // 3*near+far+8 (1, hi)
2644
0
      "vpaddw      %%ymm2,%%ymm3,%%ymm2          \n"  // 9*near+3*far (2, hi)
2645
0
      "vpaddw      %%ymm2,%%ymm1,%%ymm2          \n"  // 9 3 3 1 + 8 (2, hi)
2646
0
      "vpsrlw      $4,%%ymm2,%%ymm2              \n"  // ^ div by 16 (2, hi)
2647
2648
0
      "vpackuswb   %%ymm0,%%ymm4,%%ymm4          \n"
2649
0
      "vmovdqu     %%ymm4,(%1)                   \n"  // store above
2650
0
      "vpackuswb   %%ymm2,%%ymm5,%%ymm5          \n"
2651
0
      "vmovdqu     %%ymm5,(%1,%4)                \n"  // store below
2652
2653
0
      "lea         0x10(%0),%0                   \n"
2654
0
      "lea         0x20(%1),%1                   \n"  // 8 uv to 16 uv
2655
0
      "sub         $0x10,%2                      \n"
2656
0
      "jg          1b                            \n"
2657
0
      "vzeroupper  \n"
2658
0
      : "+r"(src_ptr),                // %0
2659
0
        "+r"(dst_ptr),                // %1
2660
0
        "+r"(dst_width)               // %2
2661
0
      : "r"((intptr_t)(src_stride)),  // %3
2662
0
        "r"((intptr_t)(dst_stride)),  // %4
2663
0
        "m"(kUVLinearMadd31)          // %5
2664
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
2665
0
        "xmm7");
2666
0
}
2667
#endif
2668
2669
#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41
2670
void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr,
2671
                                   uint16_t* dst_ptr,
2672
0
                                   int dst_width) {
2673
0
  asm volatile(
2674
0
      "pxor        %%xmm5,%%xmm5                 \n"
2675
0
      "pcmpeqd     %%xmm4,%%xmm4                 \n"
2676
0
      "psrld       $31,%%xmm4                    \n"
2677
0
      "pslld       $1,%%xmm4                     \n"  // all 2
2678
2679
0
      LABELALIGN
2680
0
      "1:          \n"
2681
0
      "movq        (%0),%%xmm0                   \n"  // 0011 (16b, 1u1v)
2682
0
      "movq        4(%0),%%xmm1                  \n"  // 1122 (16b, 1u1v)
2683
2684
0
      "punpcklwd   %%xmm5,%%xmm0                 \n"  // 0011 (32b, 1u1v)
2685
0
      "punpcklwd   %%xmm5,%%xmm1                 \n"  // 1122 (32b, 1u1v)
2686
2687
0
      "movdqa      %%xmm0,%%xmm2                 \n"
2688
0
      "movdqa      %%xmm1,%%xmm3                 \n"
2689
2690
0
      "pshufd      $0b01001110,%%xmm2,%%xmm2     \n"  // 1100 (lo, far)
2691
0
      "pshufd      $0b01001110,%%xmm3,%%xmm3     \n"  // 2211 (hi, far)
2692
2693
0
      "paddd       %%xmm4,%%xmm2                 \n"  // far+2 (lo)
2694
0
      "paddd       %%xmm4,%%xmm3                 \n"  // far+2 (hi)
2695
0
      "paddd       %%xmm0,%%xmm2                 \n"  // near+far+2 (lo)
2696
0
      "paddd       %%xmm1,%%xmm3                 \n"  // near+far+2 (hi)
2697
0
      "paddd       %%xmm0,%%xmm0                 \n"  // 2*near (lo)
2698
0
      "paddd       %%xmm1,%%xmm1                 \n"  // 2*near (hi)
2699
0
      "paddd       %%xmm2,%%xmm0                 \n"  // 3*near+far+2 (lo)
2700
0
      "paddd       %%xmm3,%%xmm1                 \n"  // 3*near+far+2 (hi)
2701
2702
0
      "psrld       $2,%%xmm0                     \n"  // 3/4*near+1/4*far (lo)
2703
0
      "psrld       $2,%%xmm1                     \n"  // 3/4*near+1/4*far (hi)
2704
0
      "packusdw    %%xmm1,%%xmm0                 \n"
2705
0
      "movdqu      %%xmm0,(%1)                   \n"
2706
2707
0
      "lea         0x8(%0),%0                    \n"
2708
0
      "lea         0x10(%1),%1                   \n"  // 2 uv to 4 uv
2709
0
      "sub         $0x4,%2                       \n"
2710
0
      "jg          1b                            \n"
2711
0
      : "+r"(src_ptr),   // %0
2712
0
        "+r"(dst_ptr),   // %1
2713
0
        "+r"(dst_width)  // %2
2714
0
      :
2715
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
2716
0
}
2717
#endif
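
The 16-bit kernels repeat the same weighting using 32-bit intermediates so high-bit-depth samples cannot overflow; a scalar sketch of the linear case (illustrative only; the 16-bit bilinear kernels that follow use the 9:3:3:1 pattern shown earlier).

// Hypothetical scalar outline of the 16-bit linear UV upsample.
static void ScaleUVRowUp2_Linear_16_Sketch(const uint16_t* src_ptr,
                                           uint16_t* dst_ptr,
                                           int dst_width) {
  int src_width = dst_width >> 1;
  int x, c;
  for (x = 0; x < src_width; ++x) {
    for (c = 0; c < 2; ++c) {  // U then V
      uint32_t near_uv = src_ptr[2 * x + c];
      uint32_t far_uv = src_ptr[2 * x + 2 + c];
      dst_ptr[4 * x + c] = (uint16_t)((3 * near_uv + far_uv + 2) >> 2);
      dst_ptr[4 * x + 2 + c] = (uint16_t)((near_uv + 3 * far_uv + 2) >> 2);
    }
  }
}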
2718
2719
#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
2720
void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
2721
                                     ptrdiff_t src_stride,
2722
                                     uint16_t* dst_ptr,
2723
                                     ptrdiff_t dst_stride,
2724
0
                                     int dst_width) {
2725
0
  asm volatile(
2726
0
      "pxor        %%xmm7,%%xmm7                 \n"
2727
0
      "pcmpeqd     %%xmm6,%%xmm6                 \n"
2728
0
      "psrld       $31,%%xmm6                    \n"
2729
0
      "pslld       $3,%%xmm6                     \n"  // all 8
2730
2731
0
      LABELALIGN
2732
0
      "1:          \n"
2733
0
      "movq        (%0),%%xmm0                   \n"  // 0011 (16b, 1u1v)
2734
0
      "movq        4(%0),%%xmm1                  \n"  // 1122 (16b, 1u1v)
2735
0
      "punpcklwd   %%xmm7,%%xmm0                 \n"  // 0011 (near) (32b, 1u1v)
2736
0
      "punpcklwd   %%xmm7,%%xmm1                 \n"  // 1122 (near) (32b, 1u1v)
2737
0
      "movdqa      %%xmm0,%%xmm2                 \n"
2738
0
      "movdqa      %%xmm1,%%xmm3                 \n"
2739
0
      "pshufd      $0b01001110,%%xmm2,%%xmm2     \n"  // 1100 (far) (1, lo)
2740
0
      "pshufd      $0b01001110,%%xmm3,%%xmm3     \n"  // 2211 (far) (1, hi)
2741
0
      "paddd       %%xmm0,%%xmm2                 \n"  // near+far (1, lo)
2742
0
      "paddd       %%xmm1,%%xmm3                 \n"  // near+far (1, hi)
2743
0
      "paddd       %%xmm0,%%xmm0                 \n"  // 2*near (1, lo)
2744
0
      "paddd       %%xmm1,%%xmm1                 \n"  // 2*near (1, hi)
2745
0
      "paddd       %%xmm2,%%xmm0                 \n"  // 3*near+far (1, lo)
2746
0
      "paddd       %%xmm3,%%xmm1                 \n"  // 3*near+far (1, hi)
2747
2748
0
      "movq        (%0,%3,2),%%xmm2              \n"
2749
0
      "movq        4(%0,%3,2),%%xmm3             \n"
2750
0
      "punpcklwd   %%xmm7,%%xmm2                 \n"
2751
0
      "punpcklwd   %%xmm7,%%xmm3                 \n"
2752
0
      "movdqa      %%xmm2,%%xmm4                 \n"
2753
0
      "movdqa      %%xmm3,%%xmm5                 \n"
2754
0
      "pshufd      $0b01001110,%%xmm4,%%xmm4     \n"  // 1100 (far) (2, lo)
2755
0
      "pshufd      $0b01001110,%%xmm5,%%xmm5     \n"  // 2211 (far) (2, hi)
2756
0
      "paddd       %%xmm2,%%xmm4                 \n"  // near+far (2, lo)
2757
0
      "paddd       %%xmm3,%%xmm5                 \n"  // near+far (2, hi)
2758
0
      "paddd       %%xmm2,%%xmm2                 \n"  // 2*near (2, lo)
2759
0
      "paddd       %%xmm3,%%xmm3                 \n"  // 2*near (2, hi)
2760
0
      "paddd       %%xmm4,%%xmm2                 \n"  // 3*near+far (2, lo)
2761
0
      "paddd       %%xmm5,%%xmm3                 \n"  // 3*near+far (2, hi)
2762
2763
0
      "movdqa      %%xmm0,%%xmm4                 \n"
2764
0
      "movdqa      %%xmm2,%%xmm5                 \n"
2765
0
      "paddd       %%xmm0,%%xmm4                 \n"  // 6*near+2*far (1, lo)
2766
0
      "paddd       %%xmm6,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
2767
0
      "paddd       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
2768
0
      "paddd       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, lo)
2769
0
      "psrld       $4,%%xmm4                     \n"  // ^ div by 16 (1, lo)
2770
2771
0
      "movdqa      %%xmm2,%%xmm5                 \n"
2772
0
      "paddd       %%xmm2,%%xmm5                 \n"  // 6*near+2*far (2, lo)
2773
0
      "paddd       %%xmm6,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
2774
0
      "paddd       %%xmm2,%%xmm5                 \n"  // 9*near+3*far (2, lo)
2775
0
      "paddd       %%xmm0,%%xmm5                 \n"  // 9 3 3 1 + 8 (2, lo)
2776
0
      "psrld       $4,%%xmm5                     \n"  // ^ div by 16 (2, lo)
2777
2778
0
      "movdqa      %%xmm1,%%xmm0                 \n"
2779
0
      "movdqa      %%xmm3,%%xmm2                 \n"
2780
0
      "paddd       %%xmm1,%%xmm0                 \n"  // 6*near+2*far (1, hi)
2781
0
      "paddd       %%xmm6,%%xmm2                 \n"  // 3*near+far+8 (2, hi)
2782
0
      "paddd       %%xmm1,%%xmm0                 \n"  // 9*near+3*far (1, hi)
2783
0
      "paddd       %%xmm2,%%xmm0                 \n"  // 9 3 3 1 + 8 (1, hi)
2784
0
      "psrld       $4,%%xmm0                     \n"  // ^ div by 16 (1, hi)
2785
2786
0
      "movdqa      %%xmm3,%%xmm2                 \n"
2787
0
      "paddd       %%xmm3,%%xmm2                 \n"  // 6*near+2*far (2, hi)
2788
0
      "paddd       %%xmm6,%%xmm1                 \n"  // 3*near+far+8 (1, hi)
2789
0
      "paddd       %%xmm3,%%xmm2                 \n"  // 9*near+3*far (2, hi)
2790
0
      "paddd       %%xmm1,%%xmm2                 \n"  // 9 3 3 1 + 8 (2, hi)
2791
0
      "psrld       $4,%%xmm2                     \n"  // ^ div by 16 (2, hi)
2792
2793
0
      "packusdw    %%xmm0,%%xmm4                 \n"
2794
0
      "movdqu      %%xmm4,(%1)                   \n"  // store above
2795
0
      "packusdw    %%xmm2,%%xmm5                 \n"
2796
0
      "movdqu      %%xmm5,(%1,%4,2)              \n"  // store below
2797
2798
0
      "lea         0x8(%0),%0                    \n"
2799
0
      "lea         0x10(%1),%1                   \n"  // 2 uv to 4 uv
2800
0
      "sub         $0x4,%2                       \n"
2801
0
      "jg          1b                            \n"
2802
0
      : "+r"(src_ptr),                // %0
2803
0
        "+r"(dst_ptr),                // %1
2804
0
        "+r"(dst_width)               // %2
2805
0
      : "r"((intptr_t)(src_stride)),  // %3
2806
0
        "r"((intptr_t)(dst_stride))   // %4
2807
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
2808
0
        "xmm7");
2809
0
}
2810
#endif
2811
2812
#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2
2813
void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
2814
                                  uint16_t* dst_ptr,
2815
0
                                  int dst_width) {
2816
0
  asm volatile(
2817
0
      "vpcmpeqd    %%ymm4,%%ymm4,%%ymm4          \n"
2818
0
      "vpsrld      $31,%%ymm4,%%ymm4             \n"
2819
0
      "vpslld      $1,%%ymm4,%%ymm4              \n"  // all 2
2820
2821
0
      LABELALIGN
2822
0
      "1:          \n"
2823
0
      "vmovdqu     (%0),%%xmm0                   \n"  // 00112233 (16b, 1u1v)
2824
0
      "vmovdqu     4(%0),%%xmm1                  \n"  // 11223344 (16b, 1u1v)
2825
2826
0
      "vpmovzxwd   %%xmm0,%%ymm0                 \n"  // 01234567 (32b, 1u1v)
2827
0
      "vpmovzxwd   %%xmm1,%%ymm1                 \n"  // 12345678 (32b, 1u1v)
2828
2829
0
      "vpshufd     $0b01001110,%%ymm0,%%ymm2     \n"  // 11003322 (lo, far)
2830
0
      "vpshufd     $0b01001110,%%ymm1,%%ymm3     \n"  // 22114433 (hi, far)
2831
2832
0
      "vpaddd      %%ymm4,%%ymm2,%%ymm2          \n"  // far+2 (lo)
2833
0
      "vpaddd      %%ymm4,%%ymm3,%%ymm3          \n"  // far+2 (hi)
2834
0
      "vpaddd      %%ymm0,%%ymm2,%%ymm2          \n"  // near+far+2 (lo)
2835
0
      "vpaddd      %%ymm1,%%ymm3,%%ymm3          \n"  // near+far+2 (hi)
2836
0
      "vpaddd      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near (lo)
2837
0
      "vpaddd      %%ymm1,%%ymm1,%%ymm1          \n"  // 2*near (hi)
2838
0
      "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 3*near+far+2 (lo)
2839
0
      "vpaddd      %%ymm1,%%ymm3,%%ymm1          \n"  // 3*near+far+2 (hi)
2840
2841
0
      "vpsrld      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far (lo)
2842
0
      "vpsrld      $2,%%ymm1,%%ymm1              \n"  // 3/4*near+1/4*far (hi)
2843
0
      "vpackusdw   %%ymm1,%%ymm0,%%ymm0          \n"
2844
0
      "vmovdqu     %%ymm0,(%1)                   \n"
2845
2846
0
      "lea         0x10(%0),%0                   \n"
2847
0
      "lea         0x20(%1),%1                   \n"  // 4 uv to 8 uv
2848
0
      "sub         $0x8,%2                       \n"
2849
0
      "jg          1b                            \n"
2850
0
      "vzeroupper  \n"
2851
0
      : "+r"(src_ptr),   // %0
2852
0
        "+r"(dst_ptr),   // %1
2853
0
        "+r"(dst_width)  // %2
2854
0
      :
2855
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
2856
0
}
2857
#endif
2858
2859
#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2
2860
void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
2861
                                    ptrdiff_t src_stride,
2862
                                    uint16_t* dst_ptr,
2863
                                    ptrdiff_t dst_stride,
2864
0
                                    int dst_width) {
2865
0
  asm volatile(
2866
0
      "vpcmpeqd    %%ymm6,%%ymm6,%%ymm6          \n"
2867
0
      "vpsrld      $31,%%ymm6,%%ymm6             \n"
2868
0
      "vpslld      $3,%%ymm6,%%ymm6              \n"  // all 8
2869
2870
0
      LABELALIGN
2871
0
      "1:          \n"
2872
2873
0
      "vmovdqu     (%0),%%xmm0                   \n"  // 00112233 (16b, 1u1v)
2874
0
      "vmovdqu     4(%0),%%xmm1                  \n"  // 11223344 (16b, 1u1v)
2875
0
      "vpmovzxwd   %%xmm0,%%ymm0                 \n"  // 01234567 (32b, 1u1v)
2876
0
      "vpmovzxwd   %%xmm1,%%ymm1                 \n"  // 12345678 (32b, 1u1v)
2877
0
      "vpshufd     $0b01001110,%%ymm0,%%ymm2     \n"  // 11003322 (lo, far)
2878
0
      "vpshufd     $0b01001110,%%ymm1,%%ymm3     \n"  // 22114433 (hi, far)
2879
0
      "vpaddd      %%ymm0,%%ymm2,%%ymm2          \n"  // near+far (lo)
2880
0
      "vpaddd      %%ymm1,%%ymm3,%%ymm3          \n"  // near+far (hi)
2881
0
      "vpaddd      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near (lo)
2882
0
      "vpaddd      %%ymm1,%%ymm1,%%ymm1          \n"  // 2*near (hi)
2883
0
      "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 3*near+far (lo)
2884
0
      "vpaddd      %%ymm1,%%ymm3,%%ymm1          \n"  // 3*near+far (hi)
2885
2886
0
      "vmovdqu     (%0,%3,2),%%xmm2              \n"  // 00112233 (16b, 1u1v)
2887
0
      "vmovdqu     4(%0,%3,2),%%xmm3             \n"  // 11223344 (16b, 1u1v)
2888
0
      "vpmovzxwd   %%xmm2,%%ymm2                 \n"  // 01234567 (32b, 1u1v)
2889
0
      "vpmovzxwd   %%xmm3,%%ymm3                 \n"  // 12345678 (32b, 1u1v)
2890
0
      "vpshufd     $0b01001110,%%ymm2,%%ymm4     \n"  // 11003322 (lo, far)
2891
0
      "vpshufd     $0b01001110,%%ymm3,%%ymm5     \n"  // 22114433 (hi, far)
2892
0
      "vpaddd      %%ymm2,%%ymm4,%%ymm4          \n"  // near+far (lo)
2893
0
      "vpaddd      %%ymm3,%%ymm5,%%ymm5          \n"  // near+far (hi)
2894
0
      "vpaddd      %%ymm2,%%ymm2,%%ymm2          \n"  // 2*near (lo)
2895
0
      "vpaddd      %%ymm3,%%ymm3,%%ymm3          \n"  // 2*near (hi)
2896
0
      "vpaddd      %%ymm2,%%ymm4,%%ymm2          \n"  // 3*near+far (lo)
2897
0
      "vpaddd      %%ymm3,%%ymm5,%%ymm3          \n"  // 3*near+far (hi)
2898
2899
0
      "vpaddd      %%ymm0,%%ymm0,%%ymm4          \n"  // 6*near+2*far (1, lo)
2900
0
      "vpaddd      %%ymm6,%%ymm2,%%ymm5          \n"  // 3*near+far+8 (2, lo)
2901
0
      "vpaddd      %%ymm4,%%ymm0,%%ymm4          \n"  // 9*near+3*far (1, lo)
2902
0
      "vpaddd      %%ymm4,%%ymm5,%%ymm4          \n"  // 9 3 3 1 + 8 (1, lo)
2903
0
      "vpsrld      $4,%%ymm4,%%ymm4              \n"  // ^ div by 16 (1, lo)
2904
2905
0
      "vpaddd      %%ymm2,%%ymm2,%%ymm5          \n"  // 6*near+2*far (2, lo)
2906
0
      "vpaddd      %%ymm6,%%ymm0,%%ymm0          \n"  // 3*near+far+8 (1, lo)
2907
0
      "vpaddd      %%ymm5,%%ymm2,%%ymm5          \n"  // 9*near+3*far (2, lo)
2908
0
      "vpaddd      %%ymm5,%%ymm0,%%ymm5          \n"  // 9 3 3 1 + 8 (2, lo)
2909
0
      "vpsrld      $4,%%ymm5,%%ymm5              \n"  // ^ div by 16 (2, lo)
2910
2911
0
      "vpaddd      %%ymm1,%%ymm1,%%ymm0          \n"  // 6*near+2*far (1, hi)
2912
0
      "vpaddd      %%ymm6,%%ymm3,%%ymm2          \n"  // 3*near+far+8 (2, hi)
2913
0
      "vpaddd      %%ymm0,%%ymm1,%%ymm0          \n"  // 9*near+3*far (1, hi)
2914
0
      "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 9 3 3 1 + 8 (1, hi)
2915
0
      "vpsrld      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16 (1, hi)
2916
2917
0
      "vpaddd      %%ymm3,%%ymm3,%%ymm2          \n"  // 6*near+2*far (2, hi)
2918
0
      "vpaddd      %%ymm6,%%ymm1,%%ymm1          \n"  // 3*near+far+8 (1, hi)
2919
0
      "vpaddd      %%ymm2,%%ymm3,%%ymm2          \n"  // 9*near+3*far (2, hi)
2920
0
      "vpaddd      %%ymm2,%%ymm1,%%ymm2          \n"  // 9 3 3 1 + 8 (2, hi)
2921
0
      "vpsrld      $4,%%ymm2,%%ymm2              \n"  // ^ div by 16 (2, hi)
2922
2923
0
      "vpackusdw   %%ymm0,%%ymm4,%%ymm4          \n"
2924
0
      "vmovdqu     %%ymm4,(%1)                   \n"  // store above
2925
0
      "vpackusdw   %%ymm2,%%ymm5,%%ymm5          \n"
2926
0
      "vmovdqu     %%ymm5,(%1,%4,2)              \n"  // store below
2927
2928
0
      "lea         0x10(%0),%0                   \n"
2929
0
      "lea         0x20(%1),%1                   \n"  // 4 uv to 8 uv
2930
0
      "sub         $0x8,%2                       \n"
2931
0
      "jg          1b                            \n"
2932
0
      "vzeroupper  \n"
2933
0
      : "+r"(src_ptr),                // %0
2934
0
        "+r"(dst_ptr),                // %1
2935
0
        "+r"(dst_width)               // %2
2936
0
      : "r"((intptr_t)(src_stride)),  // %3
2937
0
        "r"((intptr_t)(dst_stride))   // %4
2938
0
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
2939
0
}
2940
#endif
2941
2942
#endif  // defined(__x86_64__) || defined(__i386__)
2943
2944
#ifdef __cplusplus
2945
}  // extern "C"
2946
}  // namespace libyuv
2947
#endif