Coverage Report

Created: 2026-06-30 07:12

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libde265/libde265/x86/sse-motion.cc
Line
Count
Source
1
/*
2
 * H.265 video codec.
3
 * Copyright (c) 2013 openHEVC contributors
4
 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
5
 *
6
 * This file is part of libde265.
7
 *
8
 * libde265 is free software: you can redistribute it and/or modify
9
 * it under the terms of the GNU Lesser General Public License as
10
 * published by the Free Software Foundation, either version 3 of
11
 * the License, or (at your option) any later version.
12
 *
13
 * libde265 is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
 * GNU Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public License
19
 * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
20
 */
21
22
#ifdef HAVE_CONFIG_H
23
#include "config.h"
24
#endif
25
26
#include <stdio.h>
#include <string.h>
27
#include <emmintrin.h>
28
#include <tmmintrin.h> // SSSE3
29
#if HAVE_SSE4_1
30
#include <smmintrin.h>
31
#endif
32
33
#include "sse-motion.h"
34
#include "libde265/util.h"
35
36
37
/*
 * Table of seven 4-tap coefficient sets. Each row stores its four
 * coefficients four times in a row so that one row fills 16 bytes; the
 * coefficients of every set sum to 64.
 * NOTE(review): presumably the HEVC chroma ("EPEL") interpolation filters for
 * the seven non-zero fractional positions -- confirm against the code that
 * reads this table, which is not in this part of the file.
 */
ALIGNED_16(const int8_t) epel_filters[7][16] = {
38
  { -2,  58,  10,  -2,-2,  58,  10,  -2,-2,  58,  10,  -2,-2,  58,  10,  -2 },
39
  { -4,  54,  16,  -2,-4,  54,  16,  -2,-4,  54,  16,  -2,-4,  54,  16,  -2 },
40
  { -6,  46,  28,  -4,-6,  46,  28,  -4,-6,  46,  28,  -4,-6,  46,  28,  -4 },
41
  { -4,  36,  36,  -4,-4,  36,  36,  -4,-4,  36,  36,  -4,-4,  36,  36,  -4 },
42
  { -4,  28,  46,  -6,-4,  28,  46,  -6,-4,  28,  46,  -6,-4,  28,  46,  -6 },
43
  { -2,  16,  54,  -4,-2,  16,  54,  -4,-2,  16,  54,  -4,-2,  16,  54,  -4 },
44
  { -2,  10,  58,  -2,-2,  10,  58,  -2,-2,  10,  58,  -2,-2,  10,  58,  -2 },
45
};
46
47
/*
 * Per-index extra sample counts. Each qpel_extra[] entry equals the matching
 * qpel_extra_before[] entry plus the matching entry of the commented-out
 * qpel_extra_after[] table.
 * NOTE(review): presumably indexed by the quarter-sample fraction (0..3) and
 * counting the extra rows/columns the luma filter needs -- confirm at the
 * users, which are not in this part of the file.
 */
static const uint8_t qpel_extra_before[4] = { 0, 3, 3, 2 };
48
//static const uint8_t qpel_extra_after[4] = { 0, 3, 4, 4 };
49
static const uint8_t qpel_extra[4] = { 0, 6, 7, 6 };
50
51
/* Same scheme for the 4-tap case: epel_extra is epel_extra_before plus the
   commented-out epel_extra_after. */
static const int epel_extra_before = 1;
52
//static const int epel_extra_after = 2;
53
static const int epel_extra = 3;
54
55
47.1M
/* NOTE(review): presumably the largest prediction block edge in samples --
   not used in this part of the file, confirm at the users. */
#define MAX_PB_SIZE 64
56
57
/* Store strategy for the 4- and 2-sample paths of the kernels below:
   non-zero selects _mm_maskmoveu_si128, zero selects a plain 32/16-bit store. */
#define MASKMOVE 0
58
59
/*
 * Debug helper: prints `prefix`, a space, then the 16 bytes of `r` in memory
 * order as two-digit hex values separated by ':', followed by a newline.
 */
void print128(const char* prefix, __m128i r)
{
  unsigned char buf[16];

  /* buf is a plain byte array with no 16-byte alignment guarantee, so it must
     be written with the unaligned store rather than dereferenced as __m128i. */
  _mm_storeu_si128((__m128i*)buf, r);

  printf("%s ",prefix);
  for (int i=0;i<16;i++)
    {
      if (i>0) { printf(":"); }
      printf("%02x", buf[i]);
    }

  printf("\n");
}
74
75
76
/*
 * Debug helper: prints `prefix`, a space, then the four bytes p[0]..p[3] as
 * two-digit hex values separated by ':', followed by a newline.
 */
void printm32(const char* prefix, unsigned char* p)
{
  int idx = 0;

  printf("%s ",prefix);

  while (idx != 4) {
    /* separator goes in front of every byte except the first */
    if (idx != 0) {
      printf(":");
    }
    printf("%02x", p[idx]);
    ++idx;
  }

  printf("\n");
}
88
89
90
793k
/* Sample bit depth from which kernels in this file derive their shift
   amounts and rounding offsets (e.g. shift = 14 - BIT_DEPTH). */
#define BIT_DEPTH 8
91
92
/*
 * Converts a block of 16-bit intermediate prediction samples to 8-bit pixels:
 *
 *   dst[x] = clip_to_uint8((src[x] + 32) >> 6)
 *
 * where the addition saturates at the int16_t limits.
 *
 *   _dst, dststride  destination pixels and row distance in bytes
 *   src, srcstride   source samples and row distance in int16_t units
 *   width, height    block size in samples
 *
 * The widest path whose step divides `width` is used: 16, 8, 4, otherwise 2
 * samples per step. The last path writes two bytes per step, so `width` is
 * expected to be even. The 16- and 8-sample paths use aligned loads, i.e.
 * src + x must be 16-byte aligned there. The 4- and 2-sample paths always
 * load 8 bytes (four samples) per step, so the 2-sample path reads two
 * samples beyond what it stores.
 *
 * `dst` may have any alignment: the narrow results are written with memcpy
 * instead of through a uint32_t / uint16_t pointer cast.
 */
void ff_hevc_put_unweighted_pred_8_sse(uint8_t *_dst, ptrdiff_t dststride,
                                       const int16_t *src, ptrdiff_t srcstride,
                                       int width, int height) {
    int x, y;
    uint8_t *dst = (uint8_t*) _dst;
    __m128i r0, r1;
    const __m128i f0 = _mm_set1_epi16(32);   /* rounding term for the >> 6 */

    if (!(width & 15)) {
        /* 16 samples per step */
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 16) {
                r0 = _mm_load_si128((const __m128i *) (src + x));
                r1 = _mm_load_si128((const __m128i *) (src + x + 8));
                r0 = _mm_srai_epi16(_mm_adds_epi16(r0, f0), 6);
                r1 = _mm_srai_epi16(_mm_adds_epi16(r1, f0), 6);
                r0 = _mm_packus_epi16(r0, r1);   /* saturate to 0..255 */

                _mm_storeu_si128((__m128i *) (dst + x), r0);
            }
            dst += dststride;
            src += srcstride;
        }
    } else if (!(width & 7)) {
        /* 8 samples per step */
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 8) {
                r0 = _mm_load_si128((const __m128i *) (src + x));
                r0 = _mm_srai_epi16(_mm_adds_epi16(r0, f0), 6);
                r0 = _mm_packus_epi16(r0, r0);

                _mm_storel_epi64((__m128i *) (dst + x), r0);
            }
            dst += dststride;
            src += srcstride;
        }
    } else if (!(width & 3)) {
        /* 4 samples per step */
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 4) {
                r0 = _mm_loadl_epi64((const __m128i *) (src + x));
                r0 = _mm_srai_epi16(_mm_adds_epi16(r0, f0), 6);
                r0 = _mm_packus_epi16(r0, r0);
#if MASKMOVE
                _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
#else
                {
                    /* memcpy: dst + x is only byte-aligned */
                    const uint32_t px4 = (uint32_t) _mm_cvtsi128_si32(r0);
                    memcpy(dst + x, &px4, sizeof px4);
                }
#endif
            }
            dst += dststride;
            src += srcstride;
        }
    } else {
        /* 2 samples per step */
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 2) {
                r0 = _mm_loadl_epi64((const __m128i *) (src + x));
                r0 = _mm_srai_epi16(_mm_adds_epi16(r0, f0), 6);
                r0 = _mm_packus_epi16(r0, r0);
#if MASKMOVE
                _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1),(char *) (dst+x));
#else
                {
                    /* only the two lowest result bytes belong to this step */
                    const uint16_t px2 = (uint16_t) _mm_cvtsi128_si32(r0);
                    memcpy(dst + x, &px2, sizeof px2);
                }
#endif
            }
            dst += dststride;
            src += srcstride;
        }
    }
}
175
176
void ff_hevc_put_unweighted_pred_sse(uint8_t *_dst, ptrdiff_t _dststride,
177
                                     const int16_t *src, ptrdiff_t srcstride,
178
0
                                     int width, int height) {
179
0
    int x, y;
180
0
    uint8_t *dst = (uint8_t*) _dst;
181
0
    ptrdiff_t dststride = _dststride / sizeof(uint8_t);
182
0
    __m128i r0, r1, f0;
183
0
    int shift = 14 - BIT_DEPTH;
184
0
#if BIT_DEPTH < 14
185
0
    int16_t offset = 1 << (shift - 1);
186
#else
187
    int16_t offset = 0;
188
189
#endif
190
0
    f0 = _mm_set1_epi16(offset);
191
192
0
    for (y = 0; y < height; y++) {
193
0
        for (x = 0; x < width; x += 16) {
194
0
            r0 = _mm_load_si128((__m128i *) &src[x]);
195
196
0
            r1 = _mm_load_si128((__m128i *) &src[x + 8]);
197
0
            r0 = _mm_adds_epi16(r0, f0);
198
199
0
            r1 = _mm_adds_epi16(r1, f0);
200
0
            r0 = _mm_srai_epi16(r0, shift);
201
0
            r1 = _mm_srai_epi16(r1, shift);
202
0
            r0 = _mm_packus_epi16(r0, r1);
203
204
0
            _mm_storeu_si128((__m128i *) &dst[x], r0);
205
0
        }
206
0
        dst += dststride;
207
0
        src += srcstride;
208
0
    }
209
0
}
210
211
/*
 * Averages two blocks of 16-bit intermediate prediction samples into 8-bit
 * pixels:
 *
 *   dst[x] = clip_to_uint8((src1[x] + 64 + src2[x]) >> 7)
 *
 * Both additions saturate at the int16_t limits and are done in that order.
 *
 *   _dst, dststride   destination pixels and row distance in bytes
 *   src1, src2        the two source blocks
 *   srcstride         row distance of both sources in int16_t units
 *   width, height     block size in samples
 *
 * The widest path whose step divides `width` is used: 16, 8, 4, otherwise 2
 * samples per step. The last path writes two bytes per step, so `width` is
 * expected to be even. The 16- and 8-sample paths use aligned loads on both
 * sources; the 4- and 2-sample paths always load 8 bytes per source and step.
 *
 * `dst` may have any alignment: the narrow results are written with memcpy
 * instead of through a uint32_t / uint16_t pointer cast.
 */
void ff_hevc_put_weighted_pred_avg_8_sse(uint8_t *_dst, ptrdiff_t dststride,
                                         const int16_t *src1, const int16_t *src2,
                                         ptrdiff_t srcstride, int width,
                                         int height) {
    int x, y;
    uint8_t *dst = (uint8_t*) _dst;
    __m128i r0, r1, r2, r3;
    const __m128i f0 = _mm_set1_epi16(64);   /* rounding term for the >> 7 */

    if (!(width & 15)) {
        /* 16 samples per step */
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 16) {
                r0 = _mm_load_si128((const __m128i *) &src1[x]);
                r1 = _mm_load_si128((const __m128i *) &src1[x + 8]);
                r2 = _mm_load_si128((const __m128i *) &src2[x]);
                r3 = _mm_load_si128((const __m128i *) &src2[x + 8]);

                r0 = _mm_adds_epi16(_mm_adds_epi16(r0, f0), r2);
                r1 = _mm_adds_epi16(_mm_adds_epi16(r1, f0), r3);
                r0 = _mm_srai_epi16(r0, 7);
                r1 = _mm_srai_epi16(r1, 7);
                r0 = _mm_packus_epi16(r0, r1);   /* saturate to 0..255 */

                _mm_storeu_si128((__m128i *) (dst + x), r0);
            }
            dst += dststride;
            src1 += srcstride;
            src2 += srcstride;
        }
    } else if (!(width & 7)) {
        /* 8 samples per step */
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 8) {
                r0 = _mm_load_si128((const __m128i *) (src1 + x));
                r2 = _mm_load_si128((const __m128i *) (src2 + x));

                r0 = _mm_adds_epi16(_mm_adds_epi16(r0, f0), r2);
                r0 = _mm_srai_epi16(r0, 7);
                r0 = _mm_packus_epi16(r0, r0);

                _mm_storel_epi64((__m128i *) (dst + x), r0);
            }
            dst += dststride;
            src1 += srcstride;
            src2 += srcstride;
        }
    } else if (!(width & 3)) {
        /* 4 samples per step */
#if MASKMOVE
        r1 = _mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1);
#endif
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 4) {
                r0 = _mm_loadl_epi64((const __m128i *) (src1 + x));
                r2 = _mm_loadl_epi64((const __m128i *) (src2 + x));

                r0 = _mm_adds_epi16(_mm_adds_epi16(r0, f0), r2);
                r0 = _mm_srai_epi16(r0, 7);
                r0 = _mm_packus_epi16(r0, r0);

#if MASKMOVE
                _mm_maskmoveu_si128(r0,r1,(char *) (dst+x));
#else
                {
                    /* memcpy: dst + x is only byte-aligned */
                    const uint32_t px4 = (uint32_t) _mm_cvtsi128_si32(r0);
                    memcpy(dst + x, &px4, sizeof px4);
                }
#endif
            }
            dst += dststride;
            src1 += srcstride;
            src2 += srcstride;
        }
    } else {
        /* 2 samples per step */
#if MASKMOVE
        r1 = _mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1);
#endif
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 2) {
                r0 = _mm_loadl_epi64((const __m128i *) (src1 + x));
                r2 = _mm_loadl_epi64((const __m128i *) (src2 + x));

                r0 = _mm_adds_epi16(_mm_adds_epi16(r0, f0), r2);
                r0 = _mm_srai_epi16(r0, 7);
                r0 = _mm_packus_epi16(r0, r0);

#if MASKMOVE
                _mm_maskmoveu_si128(r0,r1,(char *) (dst+x));
#else
                {
                    /* only the two lowest result bytes belong to this step */
                    const uint16_t px2 = (uint16_t) _mm_cvtsi128_si32(r0);
                    memcpy(dst + x, &px2, sizeof px2);
                }
#endif
            }
            dst += dststride;
            src1 += srcstride;
            src2 += srcstride;
        }
    }
}
315
316
void ff_hevc_put_weighted_pred_avg_sse(uint8_t *_dst, ptrdiff_t _dststride,
317
                                       const int16_t *src1, const int16_t *src2,
318
                                       ptrdiff_t srcstride, int width,
319
0
                                       int height) {
320
0
    int x, y;
321
0
    uint8_t *dst = (uint8_t*) _dst;
322
0
    ptrdiff_t dststride = _dststride / sizeof(uint8_t);
323
0
    __m128i r0, r1, f0, r2, r3;
324
0
    int shift = 14 + 1 - BIT_DEPTH;
325
0
#if BIT_DEPTH < 14
326
0
    int offset = 1 << (shift - 1);
327
#else
328
    int offset = 0;
329
#endif
330
0
    f0 = _mm_set1_epi16(offset);
331
0
    for (y = 0; y < height; y++) {
332
333
0
        for (x = 0; x < width; x += 16) {
334
0
            r0 = _mm_load_si128((__m128i *) &src1[x]);
335
0
            r1 = _mm_load_si128((__m128i *) &src1[x + 8]);
336
0
            r2 = _mm_load_si128((__m128i *) &src2[x]);
337
0
            r3 = _mm_load_si128((__m128i *) &src2[x + 8]);
338
339
0
            r0 = _mm_adds_epi16(r0, f0);
340
0
            r1 = _mm_adds_epi16(r1, f0);
341
0
            r0 = _mm_adds_epi16(r0, r2);
342
0
            r1 = _mm_adds_epi16(r1, r3);
343
0
            r0 = _mm_srai_epi16(r0, shift);
344
0
            r1 = _mm_srai_epi16(r1, shift);
345
0
            r0 = _mm_packus_epi16(r0, r1);
346
347
0
            _mm_storeu_si128((__m128i *) (dst + x), r0);
348
0
        }
349
0
        dst += dststride;
350
0
        src1 += srcstride;
351
0
        src2 += srcstride;
352
0
    }
353
0
}
354
355
#if 0
/*
 * NOTE: everything up to the matching #endif is compiled out.
 *
 * Weights eight int16 samples held in `px`. Each lane becomes
 *   ((px * w + round) >> shift) + ofs
 * in 32-bit arithmetic; the eight results are returned saturated to uint16,
 * in source order. Uses _mm_packus_epi32, i.e. needs SSE4.1.
 */
static inline __m128i weighted_pred_8_scale8(__m128i px, __m128i w, __m128i round,
                                             int shift, __m128i ofs)
{
    const __m128i prod_lo = _mm_mullo_epi16(px, w);
    const __m128i prod_hi = _mm_mulhi_epi16(px, w);
    /* interleave the low/high product halves into full 32-bit products */
    __m128i p0123 = _mm_unpacklo_epi16(prod_lo, prod_hi);
    __m128i p4567 = _mm_unpackhi_epi16(prod_lo, prod_hi);

    p0123 = _mm_add_epi32(_mm_srai_epi32(_mm_add_epi32(p0123, round), shift), ofs);
    p4567 = _mm_add_epi32(_mm_srai_epi32(_mm_add_epi32(p4567, round), shift), ofs);

    return _mm_packus_epi32(p0123, p4567);
}

/*
 * Explicit weighted prediction from a single source, 8-bit output:
 *
 *   log2Wd >= 1:  dst[x] = clip_to_uint8(((src[x] * wlxFlag + (1 << (log2Wd - 1))) >> log2Wd) + o)
 *   log2Wd <  1:  dst[x] = clip_to_uint8(src[x] * wlxFlag + o)
 *
 * with log2Wd = denom + 14 - BIT_DEPTH and o = olxFlag * (1 << (BIT_DEPTH - 8)).
 *
 *   denom             log2 of the weight denominator
 *   wlxFlag, olxFlag  weight and offset
 *   _dst, _dststride  destination pixels and row distance in bytes
 *   src, srcstride    source samples and row distance in int16_t units
 *   width, height     block size in samples
 *
 * The widest path whose step divides `width` is used: 16, 8, 4, otherwise 2
 * samples per step. The 16- and 8-sample paths use aligned loads.
 *
 * The log2Wd < 1 case is expressed as "no rounding term, shift by zero", so it
 * no longer evaluates 1 << (log2Wd - 1) with a negative count, it adds the
 * offset as the formula above requires, and it shares the store widths of the
 * log2Wd >= 1 case (8 bytes, not 16, in the 8-sample path).
 */
void ff_hevc_weighted_pred_8_sse4(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
                                  uint8_t *_dst, ptrdiff_t _dststride,
                                  const int16_t *src, ptrdiff_t srcstride,
                                  int width, int height) {
    int x, y;
    uint8_t *dst = (uint8_t*) _dst;
    const ptrdiff_t dststride = _dststride / sizeof(uint8_t);
    const int log2Wd = denom + 14 - BIT_DEPTH;
    const int shift = (log2Wd >= 1) ? log2Wd : 0;
    const __m128i round = _mm_set1_epi32((log2Wd >= 1) ? (1 << (log2Wd - 1)) : 0);
    const __m128i ofs = _mm_set1_epi32(olxFlag * (1 << (BIT_DEPTH - 8)));
    const __m128i w = _mm_set1_epi16(wlxFlag);
    __m128i a, b;

    if (!(width & 15)) {
        /* 16 samples per step */
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 16) {
                a = _mm_load_si128((const __m128i *) &src[x]);
                b = _mm_load_si128((const __m128i *) &src[x + 8]);
                a = weighted_pred_8_scale8(a, w, round, shift, ofs);
                b = weighted_pred_8_scale8(b, w, round, shift, ofs);

                _mm_storeu_si128((__m128i *) (dst + x), _mm_packus_epi16(a, b));
            }
            dst += dststride;
            src += srcstride;
        }
    } else if (!(width & 7)) {
        /* 8 samples per step */
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 8) {
                a = _mm_load_si128((const __m128i *) (src + x));
                a = weighted_pred_8_scale8(a, w, round, shift, ofs);

                _mm_storel_epi64((__m128i *) (dst + x), _mm_packus_epi16(a, a));
            }
            dst += dststride;
            src += srcstride;
        }
    } else if (!(width & 3)) {
        /* 4 samples per step; only the four lowest result bytes are stored */
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 4) {
                a = _mm_loadl_epi64((const __m128i *) (src + x));
                a = weighted_pred_8_scale8(a, w, round, shift, ofs);
                a = _mm_packus_epi16(a, a);

                _mm_maskmoveu_si128(a,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
            }
            dst += dststride;
            src += srcstride;
        }
    } else {
        /* 2 samples per step; only the two lowest result bytes are stored */
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 2) {
                a = _mm_loadl_epi64((const __m128i *) (src + x));
                a = weighted_pred_8_scale8(a, w, round, shift, ofs);
                a = _mm_packus_epi16(a, a);

                _mm_maskmoveu_si128(a,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1),(char *) (dst+x));
            }
            dst += dststride;
            src += srcstride;
        }
    }
}
#endif
595
596
597
#if 0
/*
 * NOTE: everything up to the matching #endif is compiled out.
 *
 * Explicit weighted prediction from a single source, 8-bit output:
 *
 *   log2Wd >= 1:  dst[x] = clip_to_uint8(((src[x] * wlxFlag + (1 << (log2Wd - 1))) >> log2Wd) + o)
 *   log2Wd <  1:  dst[x] = clip_to_uint8(src[x] * wlxFlag + o)
 *
 * with log2Wd = denom + 14 - BIT_DEPTH and o = olxFlag * (1 << (BIT_DEPTH - 8)).
 *
 *   denom             log2 of the weight denominator
 *   wlxFlag, olxFlag  weight and offset
 *   _dst, _dststride  destination pixels and row distance in bytes
 *   src, srcstride    source samples and row distance in int16_t units
 *   width, height     block size in samples
 *
 * Every row is processed in steps of 16 samples with aligned loads, whatever
 * `width` is. Uses _mm_packus_epi32, i.e. needs SSE4.1.
 *
 * The log2Wd < 1 case is expressed as "no rounding term, shift by zero", so it
 * no longer evaluates 1 << (log2Wd - 1) with a negative count and it adds the
 * offset as the formula above requires.
 */
void ff_hevc_weighted_pred_sse(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
                               uint8_t *_dst, ptrdiff_t _dststride,
                               const int16_t *src, ptrdiff_t srcstride,
                               int width, int height) {
    int x, y, half;
    uint8_t *dst = (uint8_t*) _dst;
    const ptrdiff_t dststride = _dststride / sizeof(uint8_t);
    const int log2Wd = denom + 14 - BIT_DEPTH;
    const int shift = (log2Wd >= 1) ? log2Wd : 0;
    const __m128i round = _mm_set1_epi32((log2Wd >= 1) ? (1 << (log2Wd - 1)) : 0);
    const __m128i ofs = _mm_set1_epi32(olxFlag * (1 << (BIT_DEPTH - 8)));
    const __m128i w = _mm_set1_epi16(wlxFlag);

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x += 16) {
            __m128i packed[2];   /* eight uint16 results per half */

            for (half = 0; half < 2; half++) {
                const __m128i px = _mm_load_si128((const __m128i *) &src[x + 8 * half]);
                const __m128i prod_lo = _mm_mullo_epi16(px, w);
                const __m128i prod_hi = _mm_mulhi_epi16(px, w);
                /* interleave the low/high product halves into 32-bit products */
                __m128i p0123 = _mm_unpacklo_epi16(prod_lo, prod_hi);
                __m128i p4567 = _mm_unpackhi_epi16(prod_lo, prod_hi);

                p0123 = _mm_add_epi32(_mm_srai_epi32(_mm_add_epi32(p0123, round), shift), ofs);
                p4567 = _mm_add_epi32(_mm_srai_epi32(_mm_add_epi32(p4567, round), shift), ofs);

                packed[half] = _mm_packus_epi32(p0123, p4567);
            }

            _mm_storeu_si128((__m128i *) (dst + x), _mm_packus_epi16(packed[0], packed[1]));
        }
        dst += dststride;
        src += srcstride;
    }
}
#endif
682
683
#if HAVE_SSE4_1
/*
 * Weights and sums eight sample pairs. Each lane becomes
 *   (a * wa + b * wb + bias) >> shift
 * in 32-bit arithmetic; the eight results are returned saturated to uint16,
 * in source order.
 */
static inline __m128i weighted_pred_avg_8_mix8(__m128i a, __m128i b,
                                               __m128i wa, __m128i wb,
                                               __m128i bias, int shift)
{
    const __m128i a_lo = _mm_mullo_epi16(a, wa);
    const __m128i a_hi = _mm_mulhi_epi16(a, wa);
    const __m128i b_lo = _mm_mullo_epi16(b, wb);
    const __m128i b_hi = _mm_mulhi_epi16(b, wb);
    /* interleave the low/high product halves into 32-bit products, then sum */
    __m128i s0123 = _mm_add_epi32(_mm_unpacklo_epi16(a_lo, a_hi),
                                  _mm_unpacklo_epi16(b_lo, b_hi));
    __m128i s4567 = _mm_add_epi32(_mm_unpackhi_epi16(a_lo, a_hi),
                                  _mm_unpackhi_epi16(b_lo, b_hi));

    s0123 = _mm_srai_epi32(_mm_add_epi32(s0123, bias), shift);
    s4567 = _mm_srai_epi32(_mm_add_epi32(s4567, bias), shift);

    return _mm_packus_epi32(s0123, s4567);
}

/*
 * Explicit weighted prediction from two sources, 8-bit output:
 *
 *   dst[x] = clip_to_uint8((src1[x] * wl0Flag + src2[x] * wl1Flag
 *                           + (o0 + o1 + 1) * 2^log2Wd) >> (log2Wd + 1))
 *
 * with log2Wd = denom + 14 - BIT_DEPTH and oN = olNFlag * (1 << (BIT_DEPTH - 8)).
 *
 *   denom             log2 of the weight denominator
 *   wl0Flag, wl1Flag  weights for src1 and src2
 *   ol0Flag, ol1Flag  offsets for src1 and src2
 *   _dst, _dststride  destination pixels and row distance in bytes
 *   src1, src2        the two source blocks
 *   srcstride         row distance of both sources in int16_t units
 *   width, height     block size in samples
 *
 * The widest path whose step divides `width` is used: 16, 8, 4, otherwise 2
 * samples per step. The last path writes two bytes per step, so `width` is
 * expected to be even. The 16- and 8-sample paths use aligned loads on both
 * sources.
 *
 * `dst` may have any alignment: the narrow results are written with memcpy
 * instead of through a uint32_t / uint16_t pointer cast.
 *
 * NOTE(review): assumes denom is small enough that log2Wd + 1 < 32 and the
 * bias below fits in an int -- confirm the range the caller passes.
 */
void ff_hevc_weighted_pred_avg_8_sse4(uint8_t denom, int16_t wl0Flag,
                                      int16_t wl1Flag, int16_t ol0Flag, int16_t ol1Flag,
                                      uint8_t *_dst, ptrdiff_t _dststride,
                                      const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride,
                                      int width, int height) {
    int x, y;
    uint8_t *dst = (uint8_t*) _dst;
    const ptrdiff_t dststride = _dststride / sizeof(uint8_t);
    const int log2Wd = denom + 14 - BIT_DEPTH;
    const int shift2 = log2Wd + 1;
    const int o0 = (ol0Flag) * (1 << (BIT_DEPTH - 8));
    const int o1 = (ol1Flag) * (1 << (BIT_DEPTH - 8));
    const __m128i c0 = _mm_set1_epi16(wl0Flag);
    const __m128i c1 = _mm_set1_epi16(wl1Flag);
    /* multiply instead of '<<': o0 + o1 + 1 may be negative, and left-shifting
       a negative value is undefined */
    const __m128i c2 = _mm_set1_epi32((o0 + o1 + 1) * (1 << log2Wd));
    __m128i r0, r1;

    if (!(width & 15)) {
        /* 16 samples per step */
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 16) {
                r0 = weighted_pred_avg_8_mix8(_mm_load_si128((const __m128i *) &src1[x]),
                                              _mm_load_si128((const __m128i *) &src2[x]),
                                              c0, c1, c2, shift2);
                r1 = weighted_pred_avg_8_mix8(_mm_load_si128((const __m128i *) &src1[x + 8]),
                                              _mm_load_si128((const __m128i *) &src2[x + 8]),
                                              c0, c1, c2, shift2);

                _mm_storeu_si128((__m128i *) (dst + x), _mm_packus_epi16(r0, r1));
            }
            dst += dststride;
            src1 += srcstride;
            src2 += srcstride;
        }
    } else if (!(width & 7)) {
        /* 8 samples per step */
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 8) {
                r0 = weighted_pred_avg_8_mix8(_mm_load_si128((const __m128i *) (src1 + x)),
                                              _mm_load_si128((const __m128i *) (src2 + x)),
                                              c0, c1, c2, shift2);

                _mm_storel_epi64((__m128i *) (dst + x), _mm_packus_epi16(r0, r0));
            }
            dst += dststride;
            src1 += srcstride;
            src2 += srcstride;
        }
    } else if (!(width & 3)) {
        /* 4 samples per step */
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 4) {
                r0 = weighted_pred_avg_8_mix8(_mm_loadl_epi64((const __m128i *) (src1 + x)),
                                              _mm_loadl_epi64((const __m128i *) (src2 + x)),
                                              c0, c1, c2, shift2);
                r0 = _mm_packus_epi16(r0, r0);

#if MASKMOVE
                _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
#else
                {
                    /* memcpy: dst + x is only byte-aligned */
                    const uint32_t px4 = (uint32_t) _mm_cvtsi128_si32(r0);
                    memcpy(dst + x, &px4, sizeof px4);
                }
#endif
            }
            dst += dststride;
            src1 += srcstride;
            src2 += srcstride;
        }
    } else {
        /* 2 samples per step */
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 2) {
                r0 = weighted_pred_avg_8_mix8(_mm_loadl_epi64((const __m128i *) (src1 + x)),
                                              _mm_loadl_epi64((const __m128i *) (src2 + x)),
                                              c0, c1, c2, shift2);
                r0 = _mm_packus_epi16(r0, r0);

#if MASKMOVE
                _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1),(char *) (dst+x));
#else
                {
                    /* only the two lowest result bytes belong to this step */
                    const uint16_t px2 = (uint16_t) _mm_cvtsi128_si32(r0);
                    memcpy(dst + x, &px2, sizeof px2);
                }
#endif
            }
            dst += dststride;
            src1 += srcstride;
            src2 += srcstride;
        }
    }
}
#endif
878
879
880
#if 0
881
void ff_hevc_weighted_pred_avg_sse(uint8_t denom, int16_t wl0Flag,
882
        int16_t wl1Flag, int16_t ol0Flag, int16_t ol1Flag, uint8_t *_dst,
883
                                   ptrdiff_t _dststride, const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride,
884
        int width, int height) {
885
    int shift, shift2;
886
    int log2Wd;
887
    int o0;
888
    int o1;
889
    int x, y;
890
    uint8_t *dst = (uint8_t*) _dst;
891
    ptrdiff_t dststride = _dststride / sizeof(uint8_t);
892
    __m128i x0, x1, x2, x3, r0, r1, r2, r3, c0, c1, c2;
893
    shift = 14 - BIT_DEPTH;
894
    log2Wd = denom + shift;
895
896
    o0 = (ol0Flag) * (1 << (BIT_DEPTH - 8));
897
    o1 = (ol1Flag) * (1 << (BIT_DEPTH - 8));
898
    shift2 = (log2Wd + 1);
899
    c0 = _mm_set1_epi16(wl0Flag);
900
    c1 = _mm_set1_epi16(wl1Flag);
901
    c2 = _mm_set1_epi32((o0 + o1 + 1) << log2Wd);
902
903
    for (y = 0; y < height; y++) {
904
        for (x = 0; x < width; x += 16) {
905
            x0 = _mm_load_si128((__m128i *) &src1[x]);
906
            x1 = _mm_load_si128((__m128i *) &src1[x + 8]);
907
            x2 = _mm_load_si128((__m128i *) &src2[x]);
908
            x3 = _mm_load_si128((__m128i *) &src2[x + 8]);
909
910
            r0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
911
                    _mm_mulhi_epi16(x0, c0));
912
            r1 = _mm_unpacklo_epi16(_mm_mullo_epi16(x1, c0),
913
                    _mm_mulhi_epi16(x1, c0));
914
            r2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c1),
915
                    _mm_mulhi_epi16(x2, c1));
916
            r3 = _mm_unpacklo_epi16(_mm_mullo_epi16(x3, c1),
917
                    _mm_mulhi_epi16(x3, c1));
918
            x0 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
919
                    _mm_mulhi_epi16(x0, c0));
920
            x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x1, c0),
921
                    _mm_mulhi_epi16(x1, c0));
922
            x2 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c1),
923
                    _mm_mulhi_epi16(x2, c1));
924
            x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x3, c1),
925
                    _mm_mulhi_epi16(x3, c1));
926
            r0 = _mm_add_epi32(r0, r2);
927
            r1 = _mm_add_epi32(r1, r3);
928
            r2 = _mm_add_epi32(x0, x2);
929
            r3 = _mm_add_epi32(x1, x3);
930
931
            r0 = _mm_add_epi32(r0, c2);
932
            r1 = _mm_add_epi32(r1, c2);
933
            r2 = _mm_add_epi32(r2, c2);
934
            r3 = _mm_add_epi32(r3, c2);
935
936
            r0 = _mm_srai_epi32(r0, shift2);
937
            r1 = _mm_srai_epi32(r1, shift2);
938
            r2 = _mm_srai_epi32(r2, shift2);
939
            r3 = _mm_srai_epi32(r3, shift2);
940
941
            r0 = _mm_packus_epi32(r0, r2);
942
            r1 = _mm_packus_epi32(r1, r3);
943
            r0 = _mm_packus_epi16(r0, r1);
944
945
            _mm_storeu_si128((__m128i *) (dst + x), r0);
946
947
        }
948
        dst += dststride;
949
        src1 += srcstride;
950
        src2 += srcstride;
951
    }
952
}
953
#endif
954
955
956
/*
 * Full-sample (no interpolation) copy of an 8-bit block into the 16-bit
 * intermediate prediction buffer: dst[x] = src[x] << 6.
 *
 * dst        destination, advanced by dststride int16_t elements per row.
 *            The 16- and 8-pixel paths use aligned 16-byte stores, so dst
 *            rows must be 16-byte aligned for widths that are multiples of 8.
 * _src       source samples, advanced by srcstride bytes per row.
 * width      block width; dispatches to a 16-, 8-, 4- or 2-pixel loop
 *            depending on the largest of those that divides it.
 * height     number of rows.
 * mx, my, mcbuffer  unused here.
 *
 * The narrow paths load exactly the 4 or 2 source bytes they convert, so
 * nothing beyond the block is read from the source row.
 */
void ff_hevc_put_hevc_epel_pixels_8_sse(int16_t *dst, ptrdiff_t dststride,
                                        const uint8_t *_src, ptrdiff_t srcstride,
                                        int width, int height, int mx,
                                        int my, int16_t* mcbuffer) {
    int x, y;
    const uint8_t *src = _src;
    const __m128i zero = _mm_setzero_si128();
    __m128i px, lo, hi;

    if (!(width & 15)) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 16) {
                px = _mm_loadu_si128((const __m128i *) &src[x]);
                /* zero-extend bytes to 16 bit, then scale by 64 */
                lo = _mm_slli_epi16(_mm_unpacklo_epi8(px, zero), 6);
                hi = _mm_slli_epi16(_mm_unpackhi_epi8(px, zero), 6);
                _mm_store_si128((__m128i *) &dst[x], lo);
                _mm_store_si128((__m128i *) &dst[x + 8], hi);
            }
            src += srcstride;
            dst += dststride;
        }
    } else if (!(width & 7)) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 8) {
                px = _mm_loadl_epi64((const __m128i *) &src[x]);
                px = _mm_slli_epi16(_mm_unpacklo_epi8(px, zero), 6);
                _mm_store_si128((__m128i *) &dst[x], px);
            }
            src += srcstride;
            dst += dststride;
        }
    } else if (!(width & 3)) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 4) {
                /* gather exactly 4 bytes (little-endian) into the low lane */
                px = _mm_cvtsi32_si128((int) ((uint32_t) src[x] |
                                              ((uint32_t) src[x + 1] << 8) |
                                              ((uint32_t) src[x + 2] << 16) |
                                              ((uint32_t) src[x + 3] << 24)));
                px = _mm_slli_epi16(_mm_unpacklo_epi8(px, zero), 6);
                _mm_storel_epi64((__m128i *) &dst[x], px);
            }
            src += srcstride;
            dst += dststride;
        }
    } else {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 2) {
                /* gather exactly 2 bytes into the low lane */
                px = _mm_cvtsi32_si128(src[x] | (src[x + 1] << 8));
                px = _mm_slli_epi16(_mm_unpacklo_epi8(px, zero), 6);
#if MASKMOVE
                _mm_maskmoveu_si128(px,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
#else
                /* store the two results as int16_t (no type-punned write) */
                dst[x]     = (int16_t) _mm_extract_epi16(px, 0);
                dst[x + 1] = (int16_t) _mm_extract_epi16(px, 1);
#endif
            }
            src += srcstride;
            dst += dststride;
        }
    }
}
1032
1033
#ifndef __native_client__
1034
/*
 * Full-sample copy of a 16-bit-per-sample block into the 16-bit intermediate
 * prediction buffer: dst[x] = src[x] << 4 (i.e. 14 - 10 for 10-bit content).
 *
 * dst         destination, advanced by dststride int16_t elements per row.
 *             The 8-pixel path uses aligned 16-byte stores.
 * _src        source samples, reinterpreted as uint16_t.
 * _srcstride  source stride in bytes (halved to step the uint16_t pointer).
 * width       block width; dispatches to an 8-, 4- or 2-pixel loop.
 * height      number of rows.
 * mx, my, mcbuffer  unused here.
 *
 * The 2-pixel path loads only the two samples it converts and stores them
 * the same way as the 8-bit variant (masked store only when MASKMOVE is set).
 */
void ff_hevc_put_hevc_epel_pixels_10_sse(int16_t *dst, ptrdiff_t dststride,
                                         const uint8_t *_src, ptrdiff_t _srcstride,
                                         int width, int height, int mx,
                                         int my, int16_t* mcbuffer) {
    int x, y;
    __m128i px;
    const uint16_t *src = (const uint16_t*) _src;
    const ptrdiff_t srcstride = _srcstride>>1;

    if (!(width & 7)) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 8) {
                px = _mm_loadu_si128((const __m128i *) &src[x]);
                px = _mm_slli_epi16(px, 4);         /* shift by 14 - bit depth */
                _mm_store_si128((__m128i *) &dst[x], px);
            }
            src += srcstride;
            dst += dststride;
        }
    } else if (!(width & 3)) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 4) {
                px = _mm_loadl_epi64((const __m128i *) &src[x]);
                px = _mm_slli_epi16(px, 4);         /* shift by 14 - bit depth */
                _mm_storel_epi64((__m128i *) &dst[x], px);
            }
            src += srcstride;
            dst += dststride;
        }
    } else {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 2) {
                /* gather exactly two 16-bit samples into the low lane */
                px = _mm_cvtsi32_si128((int) ((uint32_t) src[x] |
                                              ((uint32_t) src[x + 1] << 16)));
                px = _mm_slli_epi16(px, 4);         /* shift by 14 - bit depth */
#if MASKMOVE
                _mm_maskmoveu_si128(px,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
#else
                dst[x]     = (int16_t) _mm_extract_epi16(px, 0);
                dst[x + 1] = (int16_t) _mm_extract_epi16(px, 1);
#endif
            }
            src += srcstride;
            dst += dststride;
        }
    }
}
1084
#endif
1085
1086
void ff_hevc_put_hevc_epel_h_8_sse(int16_t *dst, ptrdiff_t dststride,
1087
                                   const uint8_t *_src, ptrdiff_t _srcstride,
1088
                                   int width, int height, int mx,
1089
257k
                                   int my, int16_t* mcbuffer, int bit_depth) {
1090
257k
    int x, y;
1091
257k
    const uint8_t *src = (const uint8_t*) _src;
1092
257k
    ptrdiff_t srcstride = _srcstride;
1093
257k
    const int8_t *filter = epel_filters[mx - 1];
1094
257k
    __m128i r0, bshuffle1, bshuffle2, x1, x2, x3;
1095
257k
    int8_t filter_0 = filter[0];
1096
257k
    int8_t filter_1 = filter[1];
1097
257k
    int8_t filter_2 = filter[2];
1098
257k
    int8_t filter_3 = filter[3];
1099
257k
    r0 = _mm_set_epi8(filter_3, filter_2, filter_1, filter_0, filter_3,
1100
257k
            filter_2, filter_1, filter_0, filter_3, filter_2, filter_1,
1101
257k
            filter_0, filter_3, filter_2, filter_1, filter_0);
1102
257k
    bshuffle1 = _mm_set_epi8(6, 5, 4, 3, 5, 4, 3, 2, 4, 3, 2, 1, 3, 2, 1, 0);
1103
1104
1105
    /*
1106
  printf("---IN---SSE\n");
1107
1108
  int extra_top  = 1;
1109
  int extra_left = 1;
1110
  int extra_right  = 2;
1111
  int extra_bottom = 2;
1112
1113
  for (int y=-extra_top;y<height+extra_bottom;y++) {
1114
    uint8_t* p = &_src[y*_srcstride -extra_left];
1115
1116
    for (int x=-extra_left;x<width+extra_right;x++) {
1117
      printf("%05d ",*p << 6);
1118
      p++;
1119
    }
1120
    printf("\n");
1121
  }
1122
    */
1123
1124
257k
    if(!(width & 7)){
1125
55.2k
        bshuffle2 = _mm_set_epi8(10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5,
1126
55.2k
                        4);
1127
742k
                for (y = 0; y < height; y++) {
1128
1.64M
                    for (x = 0; x < width; x += 8) {
1129
1130
960k
                        x1 = _mm_loadu_si128((__m128i *) &src[x - 1]);
1131
960k
                        x2 = _mm_shuffle_epi8(x1, bshuffle1);
1132
960k
                        x3 = _mm_shuffle_epi8(x1, bshuffle2);
1133
1134
                        /*  PMADDUBSW then PMADDW     */
1135
960k
                        x2 = _mm_maddubs_epi16(x2, r0);
1136
960k
                        x3 = _mm_maddubs_epi16(x3, r0);
1137
960k
                        x2 = _mm_hadd_epi16(x2, x3);
1138
960k
                        _mm_store_si128((__m128i *) &dst[x], x2);
1139
960k
                    }
1140
687k
                    src += srcstride;
1141
687k
                    dst += dststride;
1142
687k
                }
1143
201k
    }else if(!(width & 3)){
1144
1145
1.12M
        for (y = 0; y < height; y++) {
1146
1.98M
            for (x = 0; x < width; x += 4) {
1147
            /* load data in register     */
1148
1.01M
            x1 = _mm_loadu_si128((__m128i *) &src[x-1]);
1149
1.01M
            x2 = _mm_shuffle_epi8(x1, bshuffle1);
1150
1151
            /*  PMADDUBSW then PMADDW     */
1152
1.01M
            x2 = _mm_maddubs_epi16(x2, r0);
1153
1.01M
            x2 = _mm_hadd_epi16(x2, _mm_setzero_si128());
1154
            /* give results back            */
1155
1.01M
            _mm_storel_epi64((__m128i *) &dst[x], x2);
1156
1.01M
            }
1157
969k
            src += srcstride;
1158
969k
            dst += dststride;
1159
969k
        }
1160
160k
    }else{
1161
339k
        for (y = 0; y < height; y++) {
1162
622k
            for (x = 0; x < width; x += 2) {
1163
            /* load data in register     */
1164
325k
            x1 = _mm_loadu_si128((__m128i *) &src[x-1]);
1165
325k
            x2 = _mm_shuffle_epi8(x1, bshuffle1);
1166
1167
            /*  PMADDUBSW then PMADDW     */
1168
325k
            x2 = _mm_maddubs_epi16(x2, r0);
1169
325k
            x2 = _mm_hadd_epi16(x2, _mm_setzero_si128());
1170
            /* give results back            */
1171
#if MASKMOVE
1172
            _mm_maskmoveu_si128(x2,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
1173
#else
1174
325k
            *((uint32_t*)(dst+x)) = _mm_cvtsi128_si32(x2);
1175
325k
#endif
1176
325k
            }
1177
297k
            src += srcstride;
1178
297k
            dst += dststride;
1179
297k
        }
1180
41.5k
    }
1181
257k
}
1182
1183
#ifndef __native_client__
1184
void ff_hevc_put_hevc_epel_h_10_sse(int16_t *dst, ptrdiff_t dststride,
1185
                                    const uint8_t *_src, ptrdiff_t _srcstride,
1186
                                    int width, int height, int mx,
1187
0
                                    int my, int16_t* mcbuffer) {
1188
0
    int x, y;
1189
0
    uint16_t *src = (uint16_t*) _src;
1190
0
    ptrdiff_t srcstride = _srcstride>>1;
1191
0
    const int8_t *filter = epel_filters[mx - 1];
1192
0
    __m128i r0, bshuffle1, bshuffle2, x1, x2, x3, r1;
1193
0
    int8_t filter_0 = filter[0];
1194
0
    int8_t filter_1 = filter[1];
1195
0
    int8_t filter_2 = filter[2];
1196
0
    int8_t filter_3 = filter[3];
1197
0
    r0 = _mm_set_epi16(filter_3, filter_2, filter_1,
1198
0
            filter_0, filter_3, filter_2, filter_1, filter_0);
1199
0
    bshuffle1 = _mm_set_epi8(9,8,7,6,5,4, 3, 2,7,6,5,4, 3, 2, 1, 0);
1200
1201
0
    if(!(width & 3)){
1202
0
        bshuffle2 = _mm_set_epi8(13,12,11,10,9,8,7,6,11,10, 9,8,7,6,5, 4);
1203
0
        for (y = 0; y < height; y++) {
1204
0
            for (x = 0; x < width; x += 4) {
1205
1206
0
                x1 = _mm_loadu_si128((__m128i *) &src[x-1]);
1207
0
                x2 = _mm_shuffle_epi8(x1, bshuffle1);
1208
0
                x3 = _mm_shuffle_epi8(x1, bshuffle2);
1209
1210
1211
0
                x2 = _mm_madd_epi16(x2, r0);
1212
0
                x3 = _mm_madd_epi16(x3, r0);
1213
0
                x2 = _mm_hadd_epi32(x2, x3);
1214
0
                x2= _mm_srai_epi32(x2,2);   //>> (BIT_DEPTH - 8)
1215
1216
0
                x2 = _mm_packs_epi32(x2,r0);
1217
                //give results back
1218
0
                _mm_storel_epi64((__m128i *) &dst[x], x2);
1219
0
            }
1220
0
            src += srcstride;
1221
0
            dst += dststride;
1222
0
        }
1223
0
    }else{
1224
0
        r1= _mm_setzero_si128();
1225
0
        for (y = 0; y < height; y++) {
1226
0
            for (x = 0; x < width; x += 2) {
1227
                /* load data in register     */
1228
0
                x1 = _mm_loadu_si128((__m128i *) &src[x-1]);
1229
0
                x2 = _mm_shuffle_epi8(x1, bshuffle1);
1230
1231
                /*  PMADDUBSW then PMADDW     */
1232
0
                x2 = _mm_madd_epi16(x2, r0);
1233
0
                x2 = _mm_hadd_epi32(x2, r1);
1234
0
                x2= _mm_srai_epi32(x2,2);   //>> (BIT_DEPTH - 8)
1235
0
                x2 = _mm_packs_epi32(x2, r1);
1236
                /* give results back            */
1237
0
                _mm_maskmoveu_si128(x2,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
1238
0
            }
1239
0
            src += srcstride;
1240
0
            dst += dststride;
1241
0
        }
1242
0
    }
1243
0
}
1244
#endif
1245
1246
1247
void ff_hevc_put_hevc_epel_v_8_sse(int16_t *dst, ptrdiff_t dststride,
1248
                                   const uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
1249
255k
                                   int my, int16_t* mcbuffer, int bit_depth) {
1250
255k
    int x, y;
1251
255k
    __m128i x0, x1, x2, x3, t0, t1, t2, t3, r0, f0, f1, f2, f3, r1;
1252
255k
    uint8_t *src = (uint8_t*) _src;
1253
255k
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
1254
255k
    const int8_t *filter = epel_filters[my - 1];
1255
255k
    int8_t filter_0 = filter[0];
1256
255k
    int8_t filter_1 = filter[1];
1257
255k
    int8_t filter_2 = filter[2];
1258
255k
    int8_t filter_3 = filter[3];
1259
255k
    f0 = _mm_set1_epi16(filter_0);
1260
255k
    f1 = _mm_set1_epi16(filter_1);
1261
255k
    f2 = _mm_set1_epi16(filter_2);
1262
255k
    f3 = _mm_set1_epi16(filter_3);
1263
1264
255k
    if(!(width & 15)){
1265
208k
        for (y = 0; y < height; y++) {
1266
405k
            for (x = 0; x < width; x += 16) {
1267
                /* check if memory needs to be reloaded */
1268
1269
208k
                x0 = _mm_loadu_si128((__m128i *) &src[x - srcstride]);
1270
208k
                x1 = _mm_loadu_si128((__m128i *) &src[x]);
1271
208k
                x2 = _mm_loadu_si128((__m128i *) &src[x + srcstride]);
1272
208k
                x3 = _mm_loadu_si128((__m128i *) &src[x + 2 * srcstride]);
1273
1274
208k
                t0 = _mm_unpacklo_epi8(x0, _mm_setzero_si128());
1275
208k
                t1 = _mm_unpacklo_epi8(x1, _mm_setzero_si128());
1276
208k
                t2 = _mm_unpacklo_epi8(x2, _mm_setzero_si128());
1277
208k
                t3 = _mm_unpacklo_epi8(x3, _mm_setzero_si128());
1278
1279
208k
                x0 = _mm_unpackhi_epi8(x0, _mm_setzero_si128());
1280
208k
                x1 = _mm_unpackhi_epi8(x1, _mm_setzero_si128());
1281
208k
                x2 = _mm_unpackhi_epi8(x2, _mm_setzero_si128());
1282
208k
                x3 = _mm_unpackhi_epi8(x3, _mm_setzero_si128());
1283
1284
                /* multiply by correct value : */
1285
208k
                r0 = _mm_mullo_epi16(t0, f0);
1286
208k
                r1 = _mm_mullo_epi16(x0, f0);
1287
208k
                r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t1, f1));
1288
208k
                r1 = _mm_adds_epi16(r1, _mm_mullo_epi16(x1, f1));
1289
208k
                r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t2, f2));
1290
208k
                r1 = _mm_adds_epi16(r1, _mm_mullo_epi16(x2, f2));
1291
208k
                r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t3, f3));
1292
208k
                r1 = _mm_adds_epi16(r1, _mm_mullo_epi16(x3, f3));
1293
                /* give results back            */
1294
208k
                _mm_store_si128((__m128i *) &dst[x], r0);
1295
208k
                _mm_storeu_si128((__m128i *) &dst[x + 8], r1);
1296
208k
            }
1297
197k
            src += srcstride;
1298
197k
            dst += dststride;
1299
197k
        }
1300
244k
    }else if(!(width & 7)){
1301
43.2k
        r1= _mm_setzero_si128();
1302
473k
        for (y = 0; y < height; y++) {
1303
866k
            for(x=0;x<width;x+=8){
1304
436k
                x0 = _mm_loadl_epi64((__m128i *) &src[x - srcstride]);
1305
436k
                x1 = _mm_loadl_epi64((__m128i *) &src[x]);
1306
436k
                x2 = _mm_loadl_epi64((__m128i *) &src[x + srcstride]);
1307
436k
                x3 = _mm_loadl_epi64((__m128i *) &src[x + 2 * srcstride]);
1308
1309
436k
                t0 = _mm_unpacklo_epi8(x0, r1);
1310
436k
                t1 = _mm_unpacklo_epi8(x1, r1);
1311
436k
                t2 = _mm_unpacklo_epi8(x2, r1);
1312
436k
                t3 = _mm_unpacklo_epi8(x3, r1);
1313
1314
1315
                /* multiply by correct value : */
1316
436k
                r0 = _mm_mullo_epi16(t0, f0);
1317
436k
                r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t1, f1));
1318
436k
                r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t2, f2));
1319
436k
                r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t3, f3));
1320
                /* give results back            */
1321
436k
                _mm_storeu_si128((__m128i *) &dst[x], r0);
1322
436k
            }
1323
430k
            src += srcstride;
1324
430k
            dst += dststride;
1325
430k
        }
1326
201k
    }else if(!(width & 3)){
1327
167k
        r1= _mm_setzero_si128();
1328
1.16M
        for (y = 0; y < height; y++) {
1329
2.05M
            for(x=0;x<width;x+=4){
1330
1.06M
                x0 = _mm_loadl_epi64((__m128i *) &src[x - srcstride]);
1331
1.06M
                x1 = _mm_loadl_epi64((__m128i *) &src[x]);
1332
1.06M
                x2 = _mm_loadl_epi64((__m128i *) &src[x + srcstride]);
1333
1.06M
                x3 = _mm_loadl_epi64((__m128i *) &src[x + 2 * srcstride]);
1334
1335
1.06M
                t0 = _mm_unpacklo_epi8(x0, r1);
1336
1.06M
                t1 = _mm_unpacklo_epi8(x1, r1);
1337
1.06M
                t2 = _mm_unpacklo_epi8(x2, r1);
1338
1.06M
                t3 = _mm_unpacklo_epi8(x3, r1);
1339
1340
1341
                /* multiply by correct value : */
1342
1.06M
                r0 = _mm_mullo_epi16(t0, f0);
1343
1.06M
                r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t1, f1));
1344
1.06M
                r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t2, f2));
1345
1.06M
                r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t3, f3));
1346
                /* give results back            */
1347
1.06M
                _mm_storel_epi64((__m128i *) &dst[x], r0);
1348
1.06M
            }
1349
995k
            src += srcstride;
1350
995k
            dst += dststride;
1351
995k
        }
1352
167k
    }else{
1353
34.0k
        r1= _mm_setzero_si128();
1354
235k
        for (y = 0; y < height; y++) {
1355
409k
            for(x=0;x<width;x+=2){
1356
207k
                x0 = _mm_loadl_epi64((__m128i *) &src[x - srcstride]);
1357
207k
                x1 = _mm_loadl_epi64((__m128i *) &src[x]);
1358
207k
                x2 = _mm_loadl_epi64((__m128i *) &src[x + srcstride]);
1359
207k
                x3 = _mm_loadl_epi64((__m128i *) &src[x + 2 * srcstride]);
1360
1361
207k
                t0 = _mm_unpacklo_epi8(x0, r1);
1362
207k
                t1 = _mm_unpacklo_epi8(x1, r1);
1363
207k
                t2 = _mm_unpacklo_epi8(x2, r1);
1364
207k
                t3 = _mm_unpacklo_epi8(x3, r1);
1365
1366
1367
                /* multiply by correct value : */
1368
207k
                r0 = _mm_mullo_epi16(t0, f0);
1369
207k
                r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t1, f1));
1370
207k
                r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t2, f2));
1371
207k
                r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t3, f3));
1372
                /* give results back            */
1373
#if MASKMOVE
1374
                _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
1375
#else
1376
207k
                *((uint32_t*)(dst+x)) = _mm_cvtsi128_si32(r0);
1377
207k
#endif
1378
207k
            }
1379
201k
            src += srcstride;
1380
201k
            dst += dststride;
1381
201k
        }
1382
34.0k
    }
1383
255k
}
1384
1385
#ifndef __native_client__
1386
void ff_hevc_put_hevc_epel_v_10_sse(int16_t *dst, ptrdiff_t dststride,
1387
                                    const uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
1388
0
        int my, int16_t* mcbuffer) {
1389
0
    int x, y;
1390
0
    __m128i x0, x1, x2, x3, t0, t1, t2, t3, r0, f0, f1, f2, f3, r1, r2, r3;
1391
0
    uint16_t *src = (uint16_t*) _src;
1392
0
    ptrdiff_t srcstride = _srcstride >>1;
1393
0
    const int8_t *filter = epel_filters[my - 1];
1394
0
    int8_t filter_0 = filter[0];
1395
0
    int8_t filter_1 = filter[1];
1396
0
    int8_t filter_2 = filter[2];
1397
0
    int8_t filter_3 = filter[3];
1398
0
    f0 = _mm_set1_epi16(filter_0);
1399
0
    f1 = _mm_set1_epi16(filter_1);
1400
0
    f2 = _mm_set1_epi16(filter_2);
1401
0
    f3 = _mm_set1_epi16(filter_3);
1402
1403
0
    if(!(width & 7)){
1404
0
        r1= _mm_setzero_si128();
1405
0
        for (y = 0; y < height; y++) {
1406
0
            for(x=0;x<width;x+=8){
1407
0
                x0 = _mm_loadu_si128((__m128i *) &src[x - srcstride]);
1408
0
                x1 = _mm_loadu_si128((__m128i *) &src[x]);
1409
0
                x2 = _mm_loadu_si128((__m128i *) &src[x + srcstride]);
1410
0
                x3 = _mm_loadu_si128((__m128i *) &src[x + 2 * srcstride]);
1411
1412
                // multiply by correct value :
1413
0
                r0 = _mm_mullo_epi16(x0, f0);
1414
0
                t0 = _mm_mulhi_epi16(x0, f0);
1415
1416
0
                x0= _mm_unpacklo_epi16(r0,t0);
1417
0
                t0= _mm_unpackhi_epi16(r0,t0);
1418
1419
0
                r1 = _mm_mullo_epi16(x1, f1);
1420
0
                t1 = _mm_mulhi_epi16(x1, f1);
1421
1422
0
                x1= _mm_unpacklo_epi16(r1,t1);
1423
0
                t1= _mm_unpackhi_epi16(r1,t1);
1424
1425
1426
0
                r2 = _mm_mullo_epi16(x2, f2);
1427
0
                t2 = _mm_mulhi_epi16(x2, f2);
1428
1429
0
                x2= _mm_unpacklo_epi16(r2,t2);
1430
0
                t2= _mm_unpackhi_epi16(r2,t2);
1431
1432
1433
0
                r3 = _mm_mullo_epi16(x3, f3);
1434
0
                t3 = _mm_mulhi_epi16(x3, f3);
1435
1436
0
                x3= _mm_unpacklo_epi16(r3,t3);
1437
0
                t3= _mm_unpackhi_epi16(r3,t3);
1438
1439
1440
0
                r0= _mm_add_epi32(x0,x1);
1441
0
                r1= _mm_add_epi32(x2,x3);
1442
1443
0
                t0= _mm_add_epi32(t0,t1);
1444
0
                t1= _mm_add_epi32(t2,t3);
1445
1446
0
                r0= _mm_add_epi32(r0,r1);
1447
0
                t0= _mm_add_epi32(t0,t1);
1448
1449
0
                r0= _mm_srai_epi32(r0,2);//>> (BIT_DEPTH - 8)
1450
0
                t0= _mm_srai_epi32(t0,2);//>> (BIT_DEPTH - 8)
1451
1452
0
                r0= _mm_packs_epi32(r0, t0);
1453
                // give results back
1454
0
                _mm_storeu_si128((__m128i *) &dst[x], r0);
1455
0
            }
1456
0
            src += srcstride;
1457
0
            dst += dststride;
1458
0
        }
1459
0
    }else if(!(width & 3)){
1460
0
        r1= _mm_setzero_si128();
1461
0
        for (y = 0; y < height; y++) {
1462
0
            for(x=0;x<width;x+=4){
1463
0
                x0 = _mm_loadl_epi64((__m128i *) &src[x - srcstride]);
1464
0
                x1 = _mm_loadl_epi64((__m128i *) &src[x]);
1465
0
                x2 = _mm_loadl_epi64((__m128i *) &src[x + srcstride]);
1466
0
                x3 = _mm_loadl_epi64((__m128i *) &src[x + 2 * srcstride]);
1467
1468
                /* multiply by correct value : */
1469
0
                r0 = _mm_mullo_epi16(x0, f0);
1470
0
                t0 = _mm_mulhi_epi16(x0, f0);
1471
1472
0
                x0= _mm_unpacklo_epi16(r0,t0);
1473
1474
0
                r1 = _mm_mullo_epi16(x1, f1);
1475
0
                t1 = _mm_mulhi_epi16(x1, f1);
1476
1477
0
                x1= _mm_unpacklo_epi16(r1,t1);
1478
1479
1480
0
                r2 = _mm_mullo_epi16(x2, f2);
1481
0
                t2 = _mm_mulhi_epi16(x2, f2);
1482
1483
0
                x2= _mm_unpacklo_epi16(r2,t2);
1484
1485
1486
0
                r3 = _mm_mullo_epi16(x3, f3);
1487
0
                t3 = _mm_mulhi_epi16(x3, f3);
1488
1489
0
                x3= _mm_unpacklo_epi16(r3,t3);
1490
1491
1492
0
                r0= _mm_add_epi32(x0,x1);
1493
0
                r1= _mm_add_epi32(x2,x3);
1494
0
                r0= _mm_add_epi32(r0,r1);
1495
0
                r0= _mm_srai_epi32(r0,2);//>> (BIT_DEPTH - 8)
1496
1497
0
                r0= _mm_packs_epi32(r0, r0);
1498
1499
                // give results back
1500
0
                _mm_storel_epi64((__m128i *) &dst[x], r0);
1501
0
            }
1502
0
            src += srcstride;
1503
0
            dst += dststride;
1504
0
        }
1505
0
    }else{
1506
0
        r1= _mm_setzero_si128();
1507
0
        for (y = 0; y < height; y++) {
1508
0
            for(x=0;x<width;x+=2){
1509
0
                x0 = _mm_loadl_epi64((__m128i *) &src[x - srcstride]);
1510
0
                x1 = _mm_loadl_epi64((__m128i *) &src[x]);
1511
0
                x2 = _mm_loadl_epi64((__m128i *) &src[x + srcstride]);
1512
0
                x3 = _mm_loadl_epi64((__m128i *) &src[x + 2 * srcstride]);
1513
1514
                /* multiply by correct value : */
1515
0
                r0 = _mm_mullo_epi16(x0, f0);
1516
0
                t0 = _mm_mulhi_epi16(x0, f0);
1517
1518
0
                x0= _mm_unpacklo_epi16(r0,t0);
1519
1520
0
                r1 = _mm_mullo_epi16(x1, f1);
1521
0
                t1 = _mm_mulhi_epi16(x1, f1);
1522
1523
0
                x1= _mm_unpacklo_epi16(r1,t1);
1524
1525
0
                r2 = _mm_mullo_epi16(x2, f2);
1526
0
                t2 = _mm_mulhi_epi16(x2, f2);
1527
1528
0
                x2= _mm_unpacklo_epi16(r2,t2);
1529
1530
0
                r3 = _mm_mullo_epi16(x3, f3);
1531
0
                t3 = _mm_mulhi_epi16(x3, f3);
1532
1533
0
                x3= _mm_unpacklo_epi16(r3,t3);
1534
1535
0
                r0= _mm_add_epi32(x0,x1);
1536
0
                r1= _mm_add_epi32(x2,x3);
1537
0
                r0= _mm_add_epi32(r0,r1);
1538
0
                r0= _mm_srai_epi32(r0,2);//>> (BIT_DEPTH - 8)
1539
1540
0
                r0= _mm_packs_epi32(r0, r0);
1541
1542
                /* give results back            */
1543
0
                _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
1544
1545
0
            }
1546
0
            src += srcstride;
1547
0
            dst += dststride;
1548
0
        }
1549
0
    }
1550
0
}
1551
#endif
1552
1553
void ff_hevc_put_hevc_epel_hv_8_sse(int16_t *dst, ptrdiff_t dststride,
1554
                                    const uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
1555
1.01M
                                    int my, int16_t* mcbuffer, int bit_depth) {
1556
1.01M
  int x, y;
1557
1.01M
  uint8_t *src = (uint8_t*) _src;
1558
1.01M
  ptrdiff_t srcstride = _srcstride;
1559
1.01M
  const int8_t *filter_h = epel_filters[mx - 1];
1560
1.01M
  const int8_t *filter_v = epel_filters[my - 1];
1561
1.01M
  __m128i r0, bshuffle1, bshuffle2, x0, x1, x2, x3, t0, t1, t2, t3, f0, f1,
1562
1.01M
  f2, f3, r1, r2;
1563
1.01M
  int8_t filter_0 = filter_h[0];
1564
1.01M
  int8_t filter_1 = filter_h[1];
1565
1.01M
  int8_t filter_2 = filter_h[2];
1566
1.01M
  int8_t filter_3 = filter_h[3];
1567
1.01M
  int16_t *tmp = mcbuffer;
1568
1.01M
  r0 = _mm_set_epi8(filter_3, filter_2, filter_1, filter_0, filter_3,
1569
1.01M
      filter_2, filter_1, filter_0, filter_3, filter_2, filter_1,
1570
1.01M
      filter_0, filter_3, filter_2, filter_1, filter_0);
1571
1.01M
  bshuffle1 = _mm_set_epi8(6, 5, 4, 3, 5, 4, 3, 2, 4, 3, 2, 1, 3, 2, 1, 0);
1572
1573
1.01M
  src -= epel_extra_before * srcstride;
1574
1575
1.01M
  f3 = _mm_set1_epi16(filter_v[3]);
1576
1.01M
  f1 = _mm_set1_epi16(filter_v[1]);
1577
1.01M
  f2 = _mm_set1_epi16(filter_v[2]);
1578
1.01M
  f0 = _mm_set1_epi16(filter_v[0]);
1579
1580
  /* horizontal treatment */
1581
1.01M
  if(!(width & 7)){
1582
154k
    bshuffle2 = _mm_set_epi8(10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5,
1583
154k
        4);
1584
2.48M
    for (y = 0; y < height + epel_extra; y++) {
1585
5.41M
      for (x = 0; x < width; x += 8) {
1586
1587
3.08M
        x1 = _mm_loadu_si128((__m128i *) &src[x - 1]);
1588
3.08M
        x2 = _mm_shuffle_epi8(x1, bshuffle1);
1589
3.08M
        x3 = _mm_shuffle_epi8(x1, bshuffle2);
1590
1591
        /*  PMADDUBSW then PMADDW     */
1592
3.08M
        x2 = _mm_maddubs_epi16(x2, r0);
1593
3.08M
        x3 = _mm_maddubs_epi16(x3, r0);
1594
3.08M
        x2 = _mm_hadd_epi16(x2, x3);
1595
3.08M
        _mm_store_si128((__m128i *) &tmp[x], x2);
1596
3.08M
      }
1597
2.33M
      src += srcstride;
1598
2.33M
      tmp += MAX_PB_SIZE;
1599
2.33M
    }
1600
154k
    tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE;
1601
1602
    /* vertical treatment */
1603
1604
2.02M
    for (y = 0; y < height; y++) {
1605
4.38M
      for (x = 0; x < width; x += 8) {
1606
        /* check if memory needs to be reloaded */
1607
2.51M
        x0 = _mm_load_si128((__m128i *) &tmp[x - MAX_PB_SIZE]);
1608
2.51M
        x1 = _mm_load_si128((__m128i *) &tmp[x]);
1609
2.51M
        x2 = _mm_load_si128((__m128i *) &tmp[x + MAX_PB_SIZE]);
1610
2.51M
        x3 = _mm_load_si128((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]);
1611
1612
2.51M
        r0 = _mm_mullo_epi16(x0, f0);
1613
2.51M
        r1 = _mm_mulhi_epi16(x0, f0);
1614
2.51M
        r2 = _mm_mullo_epi16(x1, f1);
1615
2.51M
        t0 = _mm_unpacklo_epi16(r0, r1);
1616
2.51M
        x0 = _mm_unpackhi_epi16(r0, r1);
1617
2.51M
        r0 = _mm_mulhi_epi16(x1, f1);
1618
2.51M
        r1 = _mm_mullo_epi16(x2, f2);
1619
2.51M
        t1 = _mm_unpacklo_epi16(r2, r0);
1620
2.51M
        x1 = _mm_unpackhi_epi16(r2, r0);
1621
2.51M
        r2 = _mm_mulhi_epi16(x2, f2);
1622
2.51M
        r0 = _mm_mullo_epi16(x3, f3);
1623
2.51M
        t2 = _mm_unpacklo_epi16(r1, r2);
1624
2.51M
        x2 = _mm_unpackhi_epi16(r1, r2);
1625
2.51M
        r1 = _mm_mulhi_epi16(x3, f3);
1626
2.51M
        t3 = _mm_unpacklo_epi16(r0, r1);
1627
2.51M
        x3 = _mm_unpackhi_epi16(r0, r1);
1628
1629
        /* multiply by correct value : */
1630
2.51M
        r0 = _mm_add_epi32(t0, t1);
1631
2.51M
        r1 = _mm_add_epi32(x0, x1);
1632
2.51M
        r0 = _mm_add_epi32(r0, t2);
1633
2.51M
        r1 = _mm_add_epi32(r1, x2);
1634
2.51M
        r0 = _mm_add_epi32(r0, t3);
1635
2.51M
        r1 = _mm_add_epi32(r1, x3);
1636
2.51M
        r0 = _mm_srai_epi32(r0, 6);
1637
2.51M
        r1 = _mm_srai_epi32(r1, 6);
1638
1639
        /* give results back            */
1640
2.51M
        r0 = _mm_packs_epi32(r0, r1);
1641
2.51M
        _mm_store_si128((__m128i *) &dst[x], r0);
1642
2.51M
      }
1643
1.87M
      tmp += MAX_PB_SIZE;
1644
1.87M
      dst += dststride;
1645
1.87M
    }
1646
856k
  }else if(!(width & 3)){
1647
6.27M
    for (y = 0; y < height + epel_extra; y ++) {
1648
11.4M
      for(x=0;x<width;x+=4){
1649
        /* load data in register     */
1650
5.78M
        x1 = _mm_loadl_epi64((__m128i *) &src[x-1]);
1651
1652
5.78M
        x1 = _mm_shuffle_epi8(x1, bshuffle1);
1653
1654
        /*  PMADDUBSW then PMADDW     */
1655
5.78M
        x1 = _mm_maddubs_epi16(x1, r0);
1656
5.78M
        x1 = _mm_hadd_epi16(x1, _mm_setzero_si128());
1657
1658
        /* give results back            */
1659
5.78M
        _mm_storel_epi64((__m128i *) &tmp[x], x1);
1660
1661
5.78M
      }
1662
5.62M
      src += srcstride;
1663
5.62M
      tmp += MAX_PB_SIZE;
1664
5.62M
    }
1665
652k
    tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE;
1666
1667
    /* vertical treatment */
1668
1669
1670
4.31M
    for (y = 0; y < height; y++) {
1671
7.47M
      for (x = 0; x < width; x += 4) {
1672
        /* check if memory needs to be reloaded */
1673
3.80M
        x0 = _mm_loadl_epi64((__m128i *) &tmp[x - MAX_PB_SIZE]);
1674
3.80M
        x1 = _mm_loadl_epi64((__m128i *) &tmp[x]);
1675
3.80M
        x2 = _mm_loadl_epi64((__m128i *) &tmp[x + MAX_PB_SIZE]);
1676
3.80M
        x3 = _mm_loadl_epi64((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]);
1677
1678
3.80M
        r0 = _mm_mullo_epi16(x0, f0);
1679
3.80M
        r1 = _mm_mulhi_epi16(x0, f0);
1680
3.80M
        r2 = _mm_mullo_epi16(x1, f1);
1681
3.80M
        t0 = _mm_unpacklo_epi16(r0, r1);
1682
1683
3.80M
        r0 = _mm_mulhi_epi16(x1, f1);
1684
3.80M
        r1 = _mm_mullo_epi16(x2, f2);
1685
3.80M
        t1 = _mm_unpacklo_epi16(r2, r0);
1686
1687
3.80M
        r2 = _mm_mulhi_epi16(x2, f2);
1688
3.80M
        r0 = _mm_mullo_epi16(x3, f3);
1689
3.80M
        t2 = _mm_unpacklo_epi16(r1, r2);
1690
1691
3.80M
        r1 = _mm_mulhi_epi16(x3, f3);
1692
3.80M
        t3 = _mm_unpacklo_epi16(r0, r1);
1693
1694
1695
        /* multiply by correct value : */
1696
3.80M
        r0 = _mm_add_epi32(t0, t1);
1697
3.80M
        r0 = _mm_add_epi32(r0, t2);
1698
3.80M
        r0 = _mm_add_epi32(r0, t3);
1699
3.80M
        r0 = _mm_srai_epi32(r0, 6);
1700
1701
        /* give results back            */
1702
3.80M
        r0 = _mm_packs_epi32(r0, r0);
1703
3.80M
        _mm_storel_epi64((__m128i *) &dst[x], r0);
1704
3.80M
      }
1705
3.66M
      tmp += MAX_PB_SIZE;
1706
3.66M
      dst += dststride;
1707
3.66M
    }
1708
652k
  }else{
1709
#if MASKMOVE
1710
    bshuffle2=_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1);
1711
#endif
1712
2.03M
    for (y = 0; y < height + epel_extra; y ++) {
1713
3.76M
      for(x=0;x<width;x+=2){
1714
        /* load data in register     */
1715
1.93M
        x1 = _mm_loadl_epi64((__m128i *) &src[x-1]);
1716
1.93M
        x1 = _mm_shuffle_epi8(x1, bshuffle1);
1717
1718
        /*  PMADDUBSW then PMADDW     */
1719
1.93M
        x1 = _mm_maddubs_epi16(x1, r0);
1720
1.93M
        x1 = _mm_hadd_epi16(x1, _mm_setzero_si128());
1721
1722
        /* give results back            */
1723
#if MASKMOVE
1724
        _mm_maskmoveu_si128(x1,bshuffle2,(char *) (tmp+x));
1725
#else
1726
1.93M
                                *((uint32_t*)(tmp+x)) = _mm_cvtsi128_si32(x1);
1727
1.93M
#endif
1728
1.93M
      }
1729
1.82M
      src += srcstride;
1730
1.82M
      tmp += MAX_PB_SIZE;
1731
1.82M
    }
1732
1733
203k
    tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE;
1734
1735
    /* vertical treatment */
1736
1737
1.41M
    for (y = 0; y < height; y++) {
1738
2.51M
      for (x = 0; x < width; x += 2) {
1739
        /* check if memory needs to be reloaded */
1740
1.30M
        x0 = _mm_loadl_epi64((__m128i *) &tmp[x - MAX_PB_SIZE]);
1741
1.30M
        x1 = _mm_loadl_epi64((__m128i *) &tmp[x]);
1742
1.30M
        x2 = _mm_loadl_epi64((__m128i *) &tmp[x + MAX_PB_SIZE]);
1743
1.30M
        x3 = _mm_loadl_epi64((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]);
1744
1745
1.30M
        r0 = _mm_mullo_epi16(x0, f0);
1746
1.30M
        r1 = _mm_mulhi_epi16(x0, f0);
1747
1.30M
        r2 = _mm_mullo_epi16(x1, f1);
1748
1.30M
        t0 = _mm_unpacklo_epi16(r0, r1);
1749
1.30M
        r0 = _mm_mulhi_epi16(x1, f1);
1750
1.30M
        r1 = _mm_mullo_epi16(x2, f2);
1751
1.30M
        t1 = _mm_unpacklo_epi16(r2, r0);
1752
1.30M
        r2 = _mm_mulhi_epi16(x2, f2);
1753
1.30M
        r0 = _mm_mullo_epi16(x3, f3);
1754
1.30M
        t2 = _mm_unpacklo_epi16(r1, r2);
1755
1.30M
        r1 = _mm_mulhi_epi16(x3, f3);
1756
1.30M
        t3 = _mm_unpacklo_epi16(r0, r1);
1757
1758
        /* multiply by correct value : */
1759
1.30M
        r0 = _mm_add_epi32(t0, t1);
1760
1.30M
        r0 = _mm_add_epi32(r0, t2);
1761
1.30M
        r0 = _mm_add_epi32(r0, t3);
1762
1.30M
        r0 = _mm_srai_epi32(r0, 6);
1763
        /* give results back            */
1764
1.30M
        r0 = _mm_packs_epi32(r0, r0);
1765
#if MASKMOVE
1766
        _mm_maskmoveu_si128(r0,bshuffle2,(char *) (dst+x));
1767
#else
1768
1.30M
                                *((uint32_t*)(dst+x)) = _mm_cvtsi128_si32(r0);
1769
1.30M
#endif
1770
1.30M
      }
1771
1.21M
      tmp += MAX_PB_SIZE;
1772
1.21M
      dst += dststride;
1773
1.21M
    }
1774
203k
  }
1775
1776
1.01M
}
1777
1778
1779
#ifndef __native_client__
1780
void ff_hevc_put_hevc_epel_hv_10_sse(int16_t *dst, ptrdiff_t dststride,
1781
                                     const uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
1782
0
        int my, int16_t* mcbuffer) {
1783
0
    int x, y;
1784
0
    uint16_t *src = (uint16_t*) _src;
1785
0
    ptrdiff_t srcstride = _srcstride>>1;
1786
0
    const int8_t *filter_h = epel_filters[mx - 1];
1787
0
    const int8_t *filter_v = epel_filters[my - 1];
1788
0
    __m128i r0, bshuffle1, bshuffle2, x0, x1, x2, x3, t0, t1, t2, t3, f0, f1,
1789
0
    f2, f3, r1, r2, r3;
1790
0
    int8_t filter_0 = filter_h[0];
1791
0
    int8_t filter_1 = filter_h[1];
1792
0
    int8_t filter_2 = filter_h[2];
1793
0
    int8_t filter_3 = filter_h[3];
1794
0
    int16_t *tmp = mcbuffer;
1795
1796
0
    r0 = _mm_set_epi16(filter_3, filter_2, filter_1,
1797
0
                filter_0, filter_3, filter_2, filter_1, filter_0);
1798
0
        bshuffle1 = _mm_set_epi8(9,8,7,6,5,4, 3, 2,7,6,5,4, 3, 2, 1, 0);
1799
1800
0
    src -= epel_extra_before * srcstride;
1801
1802
0
    f0 = _mm_set1_epi16(filter_v[0]);
1803
0
    f1 = _mm_set1_epi16(filter_v[1]);
1804
0
    f2 = _mm_set1_epi16(filter_v[2]);
1805
0
    f3 = _mm_set1_epi16(filter_v[3]);
1806
1807
1808
    /* horizontal treatment */
1809
0
    if(!(width & 3)){
1810
0
        bshuffle2 = _mm_set_epi8(13,12,11,10,9,8,7,6,11,10, 9,8,7,6,5, 4);
1811
0
        for (y = 0; y < height + epel_extra; y ++) {
1812
0
            for(x=0;x<width;x+=4){
1813
1814
0
                x1 = _mm_loadu_si128((__m128i *) &src[x-1]);
1815
0
                x2 = _mm_shuffle_epi8(x1, bshuffle1);
1816
0
                x3 = _mm_shuffle_epi8(x1, bshuffle2);
1817
1818
1819
0
                x2 = _mm_madd_epi16(x2, r0);
1820
0
                x3 = _mm_madd_epi16(x3, r0);
1821
0
                x2 = _mm_hadd_epi32(x2, x3);
1822
0
                x2= _mm_srai_epi32(x2,2);   //>> (BIT_DEPTH - 8)
1823
1824
0
                x2 = _mm_packs_epi32(x2,r0);
1825
                //give results back
1826
0
                _mm_storel_epi64((__m128i *) &tmp[x], x2);
1827
1828
0
            }
1829
0
            src += srcstride;
1830
0
            tmp += MAX_PB_SIZE;
1831
0
        }
1832
0
        tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE;
1833
1834
        // vertical treatment
1835
1836
1837
0
        for (y = 0; y < height; y++) {
1838
0
            for (x = 0; x < width; x += 4) {
1839
0
                x0 = _mm_loadl_epi64((__m128i *) &tmp[x - MAX_PB_SIZE]);
1840
0
                x1 = _mm_loadl_epi64((__m128i *) &tmp[x]);
1841
0
                x2 = _mm_loadl_epi64((__m128i *) &tmp[x + MAX_PB_SIZE]);
1842
0
                x3 = _mm_loadl_epi64((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]);
1843
1844
0
                r0 = _mm_mullo_epi16(x0, f0);
1845
0
                r1 = _mm_mulhi_epi16(x0, f0);
1846
0
                r2 = _mm_mullo_epi16(x1, f1);
1847
0
                t0 = _mm_unpacklo_epi16(r0, r1);
1848
1849
0
                r0 = _mm_mulhi_epi16(x1, f1);
1850
0
                r1 = _mm_mullo_epi16(x2, f2);
1851
0
                t1 = _mm_unpacklo_epi16(r2, r0);
1852
1853
0
                r2 = _mm_mulhi_epi16(x2, f2);
1854
0
                r0 = _mm_mullo_epi16(x3, f3);
1855
0
                t2 = _mm_unpacklo_epi16(r1, r2);
1856
1857
0
                r1 = _mm_mulhi_epi16(x3, f3);
1858
0
                t3 = _mm_unpacklo_epi16(r0, r1);
1859
1860
1861
1862
0
                r0 = _mm_add_epi32(t0, t1);
1863
0
                r0 = _mm_add_epi32(r0, t2);
1864
0
                r0 = _mm_add_epi32(r0, t3);
1865
0
                r0 = _mm_srai_epi32(r0, 6);
1866
1867
                // give results back
1868
0
                r0 = _mm_packs_epi32(r0, r0);
1869
0
                _mm_storel_epi64((__m128i *) &dst[x], r0);
1870
0
            }
1871
0
            tmp += MAX_PB_SIZE;
1872
0
            dst += dststride;
1873
0
        }
1874
0
    }else{
1875
0
        bshuffle2=_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1);
1876
0
        r1= _mm_setzero_si128();
1877
0
        for (y = 0; y < height + epel_extra; y ++) {
1878
0
            for(x=0;x<width;x+=2){
1879
                /* load data in register     */
1880
0
                x1 = _mm_loadu_si128((__m128i *) &src[x-1]);
1881
0
                x2 = _mm_shuffle_epi8(x1, bshuffle1);
1882
1883
                /*  PMADDUBSW then PMADDW     */
1884
0
                x2 = _mm_madd_epi16(x2, r0);
1885
0
                x2 = _mm_hadd_epi32(x2, r1);
1886
0
                x2= _mm_srai_epi32(x2,2);   //>> (BIT_DEPTH - 8)
1887
0
                x2 = _mm_packs_epi32(x2, r1);
1888
                /* give results back            */
1889
0
                _mm_maskmoveu_si128(x2,bshuffle2,(char *) (tmp+x));
1890
0
            }
1891
0
            src += srcstride;
1892
0
            tmp += MAX_PB_SIZE;
1893
0
        }
1894
1895
0
        tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE;
1896
1897
        /* vertical treatment */
1898
1899
0
        for (y = 0; y < height; y++) {
1900
0
            for (x = 0; x < width; x += 2) {
1901
                /* check if memory needs to be reloaded */
1902
0
                x0 = _mm_loadl_epi64((__m128i *) &tmp[x - MAX_PB_SIZE]);
1903
0
                x1 = _mm_loadl_epi64((__m128i *) &tmp[x]);
1904
0
                x2 = _mm_loadl_epi64((__m128i *) &tmp[x + MAX_PB_SIZE]);
1905
0
                x3 = _mm_loadl_epi64((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]);
1906
1907
0
                r0 = _mm_mullo_epi16(x0, f0);
1908
0
                t0 = _mm_mulhi_epi16(x0, f0);
1909
1910
0
                x0= _mm_unpacklo_epi16(r0,t0);
1911
1912
0
                r1 = _mm_mullo_epi16(x1, f1);
1913
0
                t1 = _mm_mulhi_epi16(x1, f1);
1914
1915
0
                x1= _mm_unpacklo_epi16(r1,t1);
1916
1917
0
                r2 = _mm_mullo_epi16(x2, f2);
1918
0
                t2 = _mm_mulhi_epi16(x2, f2);
1919
1920
0
                x2= _mm_unpacklo_epi16(r2,t2);
1921
1922
0
                r3 = _mm_mullo_epi16(x3, f3);
1923
0
                t3 = _mm_mulhi_epi16(x3, f3);
1924
1925
0
                x3= _mm_unpacklo_epi16(r3,t3);
1926
1927
0
                r0= _mm_add_epi32(x0,x1);
1928
0
                r1= _mm_add_epi32(x2,x3);
1929
0
                r0= _mm_add_epi32(r0,r1);
1930
0
                r0 = _mm_srai_epi32(r0, 6);
1931
                /* give results back            */
1932
0
                r0 = _mm_packs_epi32(r0, r0);
1933
0
                _mm_maskmoveu_si128(r0,bshuffle2,(char *) (dst+x));
1934
0
            }
1935
0
            tmp += MAX_PB_SIZE;
1936
0
            dst += dststride;
1937
0
        }
1938
0
    }
1939
0
}
1940
#endif
1941
1942
/*
 * Full-sample ("pixels") case of the 8-bit qpel prediction: every source
 * byte is zero-extended to 16 bit and shifted left by 6.
 *
 *   dst, dststride   output samples and row pitch (in int16_t units)
 *   _src, _srcstride input bytes and row pitch (in bytes)
 *   width, height    block size; width selects a 16-, 8-, 4- or 2-sample
 *                    wide code path (an odd width would write one sample
 *                    past it in the 2-wide path)
 *   mcbuffer         unused
 *
 * Changes against the previous version:
 *  - the 2-wide fallback stored only ONE int16 per two-sample step (a
 *    uint16_t write of the low lane), leaving dst[x + 1] unwritten; both
 *    samples are stored now, matching the 4-byte MASKMOVE store
 *  - the MASKMOVE variant referenced an undeclared mask variable
 *  - the 8- and 4-wide paths loaded 16 source bytes although at most 8 are
 *    consumed; they now load 8 bytes
 */
void ff_hevc_put_hevc_qpel_pixels_8_sse(int16_t *dst, ptrdiff_t dststride,
                                        const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
        int16_t* mcbuffer) {
    int x, y;
    const uint8_t *src = _src;
    const __m128i zero = _mm_setzero_si128();

    (void) mcbuffer;

    if (!(width & 15)) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 16) {
                const __m128i px = _mm_loadu_si128((const __m128i *) &src[x]);
                const __m128i lo = _mm_slli_epi16(_mm_unpacklo_epi8(px, zero), 6);
                const __m128i hi = _mm_slli_epi16(_mm_unpackhi_epi8(px, zero), 6);

                _mm_storeu_si128((__m128i *) &dst[x], lo);
                _mm_storeu_si128((__m128i *) &dst[x + 8], hi);
            }
            src += _srcstride;
            dst += dststride;
        }
    } else if (!(width & 7)) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 8) {
                const __m128i px = _mm_loadl_epi64((const __m128i *) &src[x]);

                _mm_storeu_si128((__m128i *) &dst[x],
                                 _mm_slli_epi16(_mm_unpacklo_epi8(px, zero), 6));
            }
            src += _srcstride;
            dst += dststride;
        }
    } else if (!(width & 3)) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 4) {
                const __m128i px = _mm_loadl_epi64((const __m128i *) &src[x]);

                _mm_storel_epi64((__m128i *) &dst[x],
                                 _mm_slli_epi16(_mm_unpacklo_epi8(px, zero), 6));
            }
            src += _srcstride;
            dst += dststride;
        }
    } else {
#if MASKMOVE
        const __m128i pair_mask = _mm_set_epi32(0, 0, 0, -1); /* low 4 bytes */
#endif
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 2) {
                __m128i px = _mm_loadl_epi64((const __m128i *) &src[x]);

                px = _mm_slli_epi16(_mm_unpacklo_epi8(px, zero), 6);
#if MASKMOVE
                _mm_maskmoveu_si128(px, pair_mask, (char *) (dst + x));
#else
                dst[x]     = (int16_t) _mm_extract_epi16(px, 0);
                dst[x + 1] = (int16_t) _mm_extract_epi16(px, 1);
#endif
            }
            src += _srcstride;
            dst += dststride;
        }
    }
}
2014
2015
#ifndef __native_client__
2016
/*
 * Full-sample ("pixels") case for 16-bit sample storage: every input sample
 * is shifted left by 4 and written to dst.
 *
 *   dst, dststride   output samples and row pitch (in int16_t units)
 *   _src, _srcstride input, read as uint16_t; the pitch is given in bytes
 *   width, height    block size; width % 8 == 0 and width % 4 == 0 select
 *                    the 8- and 4-wide paths, everything else is handled
 *                    two samples at a time
 *   mcbuffer         unused
 */
void ff_hevc_put_hevc_qpel_pixels_10_sse(int16_t *dst, ptrdiff_t dststride,
                                         const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
        int16_t* mcbuffer) {
    const uint16_t *in = (const uint16_t *) _src;
    const ptrdiff_t in_pitch = _srcstride >> 1;   /* bytes -> samples */
    int col, line;

    if ((width & 7) == 0) {
        for (line = 0; line < height; line++, in += in_pitch, dst += dststride) {
            for (col = 0; col < width; col += 8) {
                const __m128i v = _mm_loadu_si128((const __m128i *) (in + col));
                _mm_storeu_si128((__m128i *) (dst + col), _mm_slli_epi16(v, 4));
            }
        }
        return;
    }

    if ((width & 3) == 0) {
        for (line = 0; line < height; line++, in += in_pitch, dst += dststride) {
            for (col = 0; col < width; col += 4) {
                const __m128i v = _mm_loadl_epi64((const __m128i *) (in + col));
                _mm_storel_epi64((__m128i *) (dst + col), _mm_slli_epi16(v, 4));
            }
        }
        return;
    }

    {
        /* byte mask covering the first two int16 of the register */
        const __m128i pair_mask = _mm_set_epi32(0, 0, 0, -1);

        for (line = 0; line < height; line++, in += in_pitch, dst += dststride) {
            for (col = 0; col < width; col += 2) {
                const __m128i v = _mm_loadl_epi64((const __m128i *) (in + col));
                _mm_maskmoveu_si128(_mm_slli_epi16(v, 4), pair_mask,
                                    (char *) (dst + col));
            }
        }
    }
}
2060
#endif
2061
2062
2063
/*
 * Horizontal 8-tap interpolation of 8-bit samples with the taps
 * { -1, 4, -10, 58, 17, -5, 1, 0 } applied to src[x-3] .. src[x+4].
 * The unscaled 16-bit sums are written to dst.
 *
 *   dst, dststride   output samples and row pitch (in int16_t units)
 *   _src, _srcstride input bytes and row pitch (in bytes); 3 bytes to the
 *                    left of every row are read, and each step loads 16
 *                    bytes starting at src[x-3]
 *   width, height    block size; width selects an 8-, 4- or 2-sample wide
 *                    code path (the 2-wide path assumes an even width)
 *   mcbuffer         unused
 *
 * Changes against the previous version:
 *  - the narrow fallback computed two outputs per iteration but advanced x
 *    by 4 and (without MASKMOVE) stored a single sample, so most outputs of
 *    a width that is not a multiple of 4 were never written; it now steps
 *    by 2 and stores both samples
 *  - the 8-wide path no longer requires dst to be 16-byte aligned
 */
void ff_hevc_put_hevc_qpel_h_1_8_sse(int16_t *dst, ptrdiff_t dststride,
                                     const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
        int16_t* mcbuffer) {
    int x, y;
    const uint8_t *src = _src;
    /* the 8 taps, twice: one copy per 8-byte window fed to PMADDUBSW */
    const __m128i taps = _mm_set_epi8(0, 1, -5, 17, 58, -10, 4, -1,
                                      0, 1, -5, 17, 58, -10, 4, -1);

    (void) mcbuffer;

    if (!(width & 7)) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 8) {
                const __m128i px = _mm_loadu_si128((const __m128i *) &src[x - 3]);
                /* wNM holds the 8-byte windows of outputs N and M */
                __m128i w01 = _mm_unpacklo_epi64(px, _mm_srli_si128(px, 1));
                __m128i w23 = _mm_unpacklo_epi64(_mm_srli_si128(px, 2),
                                                 _mm_srli_si128(px, 3));
                __m128i w45 = _mm_unpacklo_epi64(_mm_srli_si128(px, 4),
                                                 _mm_srli_si128(px, 5));
                __m128i w67 = _mm_unpacklo_epi64(_mm_srli_si128(px, 6),
                                                 _mm_srli_si128(px, 7));

                w01 = _mm_maddubs_epi16(w01, taps);
                w23 = _mm_maddubs_epi16(w23, taps);
                w45 = _mm_maddubs_epi16(w45, taps);
                w67 = _mm_maddubs_epi16(w67, taps);
                /* two rounds of horizontal adds fold 4 partial sums into 1 */
                w01 = _mm_hadd_epi16(w01, w23);
                w45 = _mm_hadd_epi16(w45, w67);

                _mm_storeu_si128((__m128i *) &dst[x], _mm_hadd_epi16(w01, w45));
            }
            src += _srcstride;
            dst += dststride;
        }
    } else if (!(width & 3)) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 4) {
                const __m128i px = _mm_loadu_si128((const __m128i *) &src[x - 3]);
                __m128i w01 = _mm_unpacklo_epi64(px, _mm_srli_si128(px, 1));
                __m128i w23 = _mm_unpacklo_epi64(_mm_srli_si128(px, 2),
                                                 _mm_srli_si128(px, 3));

                w01 = _mm_maddubs_epi16(w01, taps);
                w23 = _mm_maddubs_epi16(w23, taps);
                w01 = _mm_hadd_epi16(w01, w23);
                w01 = _mm_hadd_epi16(w01, w01);

                _mm_storel_epi64((__m128i *) &dst[x], w01);
            }
            src += _srcstride;
            dst += dststride;
        }
    } else {
        const __m128i zero = _mm_setzero_si128();
#if MASKMOVE
        const __m128i pair_mask = _mm_set_epi32(0, 0, 0, -1); /* low 4 bytes */
#endif
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 2) {
                const __m128i px = _mm_loadu_si128((const __m128i *) &src[x - 3]);
                __m128i w01 = _mm_unpacklo_epi64(px, _mm_srli_si128(px, 1));

                w01 = _mm_maddubs_epi16(w01, taps);
                w01 = _mm_hadd_epi16(w01, zero);
                w01 = _mm_hadd_epi16(w01, zero);   /* lanes 0,1 = outputs x, x+1 */
#if MASKMOVE
                _mm_maskmoveu_si128(w01, pair_mask, (char *) (dst + x));
#else
                dst[x]     = (int16_t) _mm_extract_epi16(w01, 0);
                dst[x + 1] = (int16_t) _mm_extract_epi16(w01, 1);
#endif
            }
            src += _srcstride;
            dst += dststride;
        }
    }
}
2158
#ifndef __native_client__
2159
/*
2160
 * @TODO : Valgrind to see if it's useful to use SSE or wait for AVX2 implementation
2161
 */
2162
/*
 * Horizontal 8-tap interpolation for 16-bit sample storage with the taps
 * { -1, 4, -10, 58, 17, -5, 1, 0 } applied to src[x-3] .. src[x+4]; each
 * 32-bit sum is shifted right by 2 and packed to int16 with saturation.
 *
 *   dst, dststride   output samples and row pitch (in int16_t units)
 *   _src, _srcstride input, read as uint16_t; the pitch is given in bytes
 *   width, height    block size; two outputs are produced per iteration
 *   mcbuffer         unused
 */
void ff_hevc_put_hevc_qpel_h_1_10_sse(int16_t *dst, ptrdiff_t dststride,
                                      const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
        int16_t* mcbuffer) {
    const uint16_t *in = (const uint16_t *) _src;
    const ptrdiff_t in_pitch = _srcstride >> 1;   /* bytes -> samples */
    const __m128i taps = _mm_set_epi16(0, 1, -5, 17, 58, -10, 4, -1);
    const __m128i zero = _mm_setzero_si128();
    /* byte mask covering the first two int16 of the register */
    const __m128i pair_mask = _mm_set_epi32(0, 0, 0, -1);
    int col, line;

    for (line = 0; line < height; line++, in += in_pitch, dst += dststride) {
        for (col = 0; col < width; col += 2) {
            /* The last tap is 0, so the same 8-sample load, moved down by
             * one sample, also provides the window of the second output. */
            const __m128i first  = _mm_loadu_si128((const __m128i *) (in + col - 3));
            const __m128i second = _mm_srli_si128(first, 2);
            __m128i sum = _mm_hadd_epi32(_mm_madd_epi16(first, taps),
                                         _mm_madd_epi16(second, taps));

            sum = _mm_hadd_epi32(sum, zero);   /* lanes 0,1 = the two outputs */
            sum = _mm_srai_epi32(sum, 2);
            sum = _mm_packs_epi32(sum, zero);
            _mm_maskmoveu_si128(sum, pair_mask, (char *) (dst + col));
        }
    }
}
2193
#endif
2194
2195
2196
/*
 * Horizontal 8-tap interpolation of 8-bit samples with the taps
 * { -1, 4, -11, 40, 40, -11, 4, -1 } applied to src[x-3] .. src[x+4].
 * The unscaled 16-bit sums are written to dst.
 *
 *   dst, dststride   output samples and row pitch (in int16_t units)
 *   _src, _srcstride input bytes and row pitch (in bytes); 3 bytes to the
 *                    left of every row are read, and each step loads 16
 *                    bytes starting at src[x-3]
 *   width, height    block size
 *   mcbuffer         unused
 *
 * The 8-wide path used to be guarded by `!(width - 15)`, which is only true
 * for width == 15, so it never ran for real block sizes.  The guard is now
 * `!(width & 7)`, as in the other qpel_h kernels.  Both paths compute the
 * same values; the 8-wide path uses an unaligned store so that it does not
 * add an alignment requirement on dst.
 *
 * As before, the 4-wide path writes whole groups of four samples, i.e. up to
 * the next multiple of 4 when width is not one.
 */
void ff_hevc_put_hevc_qpel_h_2_8_sse(int16_t *dst, ptrdiff_t dststride,
                                     const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
        int16_t* mcbuffer) {
    int x, y;
    const uint8_t *src = _src;
    /* the 8 taps, twice: one copy per 8-byte window fed to PMADDUBSW */
    const __m128i taps = _mm_set_epi8(-1, 4, -11, 40, 40, -11, 4, -1,
                                      -1, 4, -11, 40, 40, -11, 4, -1);

    (void) mcbuffer;

    if (!(width & 7)) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 8) {
                const __m128i px = _mm_loadu_si128((const __m128i *) &src[x - 3]);
                /* wNM holds the 8-byte windows of outputs N and M */
                __m128i w01 = _mm_unpacklo_epi64(px, _mm_srli_si128(px, 1));
                __m128i w23 = _mm_unpacklo_epi64(_mm_srli_si128(px, 2),
                                                 _mm_srli_si128(px, 3));
                __m128i w45 = _mm_unpacklo_epi64(_mm_srli_si128(px, 4),
                                                 _mm_srli_si128(px, 5));
                __m128i w67 = _mm_unpacklo_epi64(_mm_srli_si128(px, 6),
                                                 _mm_srli_si128(px, 7));

                w01 = _mm_maddubs_epi16(w01, taps);
                w23 = _mm_maddubs_epi16(w23, taps);
                w45 = _mm_maddubs_epi16(w45, taps);
                w67 = _mm_maddubs_epi16(w67, taps);
                w01 = _mm_hadd_epi16(w01, w23);
                w45 = _mm_hadd_epi16(w45, w67);

                _mm_storeu_si128((__m128i *) &dst[x], _mm_hadd_epi16(w01, w45));
            }
            src += _srcstride;
            dst += dststride;
        }
    } else {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 4) {
                const __m128i px = _mm_loadu_si128((const __m128i *) &src[x - 3]);
                __m128i w01 = _mm_unpacklo_epi64(px, _mm_srli_si128(px, 1));
                __m128i w23 = _mm_unpacklo_epi64(_mm_srli_si128(px, 2),
                                                 _mm_srli_si128(px, 3));

                w01 = _mm_maddubs_epi16(w01, taps);
                w23 = _mm_maddubs_epi16(w23, taps);
                w01 = _mm_hadd_epi16(w01, w23);
                w01 = _mm_hadd_epi16(w01, _mm_setzero_si128());

                _mm_storel_epi64((__m128i *) &dst[x], w01);
            }
            src += _srcstride;
            dst += dststride;
        }
    }
}
2264
2265
#if 0
2266
/*
 * Older copy of the 8-bit horizontal qpel kernel with the taps
 * { -1, 4, -11, 40, 40, -11, 4, -1 } applied to src[x-3] .. src[x+4].
 *
 *   dst, dststride   output samples and row pitch (in int16_t units); the
 *                    8-wide path uses an aligned 16-byte store, so dst rows
 *                    must be 16-byte aligned when width % 8 == 0
 *   _src, _srcstride input bytes and row pitch (in bytes)
 *   width, height    block size
 *   mcbuffer         unused
 *
 * The source pointer is kept const-qualified; the previous
 * `uint8_t *src = _src;` dropped the qualifier, which does not compile as
 * C++.
 */
static void ff_hevc_put_hevc_qpel_h_2_sse(int16_t *dst, ptrdiff_t dststride,
                                          const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
        int16_t* mcbuffer) {
    int x, y;
    const uint8_t *src = _src;
    /* the 8 taps, twice: one copy per 8-byte window fed to PMADDUBSW */
    const __m128i taps = _mm_set_epi8(-1, 4, -11, 40, 40, -11, 4, -1,
                                      -1, 4, -11, 40, 40, -11, 4, -1);

    (void) mcbuffer;

    if (!(width & 7)) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 8) {
                const __m128i px = _mm_loadu_si128((const __m128i *) &src[x - 3]);
                /* wNM holds the 8-byte windows of outputs N and M */
                __m128i w01 = _mm_unpacklo_epi64(px, _mm_srli_si128(px, 1));
                __m128i w23 = _mm_unpacklo_epi64(_mm_srli_si128(px, 2),
                                                 _mm_srli_si128(px, 3));
                __m128i w45 = _mm_unpacklo_epi64(_mm_srli_si128(px, 4),
                                                 _mm_srli_si128(px, 5));
                __m128i w67 = _mm_unpacklo_epi64(_mm_srli_si128(px, 6),
                                                 _mm_srli_si128(px, 7));

                w01 = _mm_maddubs_epi16(w01, taps);
                w23 = _mm_maddubs_epi16(w23, taps);
                w45 = _mm_maddubs_epi16(w45, taps);
                w67 = _mm_maddubs_epi16(w67, taps);
                w01 = _mm_hadd_epi16(w01, w23);
                w45 = _mm_hadd_epi16(w45, w67);

                _mm_store_si128((__m128i *) &dst[x], _mm_hadd_epi16(w01, w45));
            }
            src += _srcstride;
            dst += dststride;
        }
    } else {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 4) {
                const __m128i px = _mm_loadu_si128((const __m128i *) &src[x - 3]);
                __m128i w01 = _mm_unpacklo_epi64(px, _mm_srli_si128(px, 1));
                __m128i w23 = _mm_unpacklo_epi64(_mm_srli_si128(px, 2),
                                                 _mm_srli_si128(px, 3));

                w01 = _mm_maddubs_epi16(w01, taps);
                w23 = _mm_maddubs_epi16(w23, taps);
                w01 = _mm_hadd_epi16(w01, w23);
                w01 = _mm_hadd_epi16(w01, _mm_setzero_si128());

                _mm_storel_epi64((__m128i *) &dst[x], w01);
            }
            src += _srcstride;
            dst += dststride;
        }
    }
}
2334
static void ff_hevc_put_hevc_qpel_h_3_sse(int16_t *dst, ptrdiff_t dststride,
2335
                                          const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2336
        int16_t* mcbuffer) {
2337
    int x, y;
2338
    uint8_t *src = _src;
2339
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
2340
    __m128i x1, r0, x2, x3, x4, x5;
2341
2342
    r0 = _mm_set_epi8(-1, 4, -10, 58, 17, -5, 1, 0, -1, 4, -10, 58, 17, -5, 1,
2343
            0);
2344
2345
    if(!(width & 7)){
2346
        for (y = 0; y < height; y++) {
2347
            for (x = 0; x < width; x += 8) {
2348
                /* load data in register     */
2349
                x1 = _mm_loadu_si128((__m128i *) &src[x - 2]);
2350
                x1 = _mm_slli_si128(x1, 1);
2351
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
2352
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
2353
                        _mm_srli_si128(x1, 3));
2354
                x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
2355
                        _mm_srli_si128(x1, 5));
2356
                x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
2357
                        _mm_srli_si128(x1, 7));
2358
2359
                /*  PMADDUBSW then PMADDW     */
2360
                x2 = _mm_maddubs_epi16(x2, r0);
2361
                x3 = _mm_maddubs_epi16(x3, r0);
2362
                x4 = _mm_maddubs_epi16(x4, r0);
2363
                x5 = _mm_maddubs_epi16(x5, r0);
2364
                x2 = _mm_hadd_epi16(x2, x3);
2365
                x4 = _mm_hadd_epi16(x4, x5);
2366
                x2 = _mm_hadd_epi16(x2, x4);
2367
                /* give results back            */
2368
                _mm_store_si128((__m128i *) &dst[x],
2369
                        _mm_srli_si128(x2, BIT_DEPTH - 8));
2370
            }
2371
            src += srcstride;
2372
            dst += dststride;
2373
        }
2374
    }else{
2375
        for (y = 0; y < height; y ++) {
2376
            for(x=0;x<width;x+=4){
2377
                /* load data in register     */
2378
                x1 = _mm_loadu_si128((__m128i *) &src[x-2]);
2379
                x1 = _mm_slli_si128(x1, 1);
2380
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
2381
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
2382
                        _mm_srli_si128(x1, 3));
2383
2384
                /*  PMADDUBSW then PMADDW     */
2385
                x2 = _mm_maddubs_epi16(x2, r0);
2386
                x3 = _mm_maddubs_epi16(x3, r0);
2387
                x2 = _mm_hadd_epi16(x2, x3);
2388
                x2 = _mm_hadd_epi16(x2, _mm_setzero_si128());
2389
                x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
2390
                /* give results back            */
2391
                _mm_storel_epi64((__m128i *) &dst[x], x2);
2392
2393
            }
2394
            src += srcstride;
2395
            dst += dststride;
2396
        }
2397
    }
2398
}
2399
#endif
2400
2401
/*
 * Horizontal interpolation of 8-bit samples with the byte taps
 * { 0, 1, -5, 17, 58, -10, 4, -1 } (they sum to 64).
 *
 * For every output column the taps cover src[col-3] .. src[col+4]; the first
 * tap is zero and its byte is shifted in as zero, so each step loads 16 bytes
 * starting at src[col-2].  The unnormalised 16-bit sums are written to dst.
 *
 * When width is a multiple of 8, eight columns are produced per step and
 * written with an aligned 16-byte store; otherwise four columns are produced
 * per step and written with an 8-byte store.
 *
 * dststride is counted in int16_t elements, _srcstride in bytes.
 * mcbuffer is unused.
 */
void ff_hevc_put_hevc_qpel_h_3_8_sse(int16_t *dst, ptrdiff_t dststride,
                                     const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
        int16_t* mcbuffer) {
    const ptrdiff_t stride = _srcstride / sizeof(uint8_t);
    const uint8_t *row = _src;
    const __m128i zero = _mm_setzero_si128();
    const __m128i taps = _mm_set_epi8(-1, 4, -10, 58, 17, -5, 1, 0,
                                      -1, 4, -10, 58, 17, -5, 1, 0);
    /* columns produced per inner iteration */
    const int step = (width & 7) ? 4 : 8;
    int col, line;

    (void) mcbuffer;

    for (line = 0; line < height; line++) {
        for (col = 0; col < width; col += step) {
            /* byte 0 becomes zero (matches the zero tap), byte 1 is src[col-2] */
            const __m128i win =
                _mm_slli_si128(_mm_loadu_si128((const __m128i *) &row[col - 2]), 1);

            /* each 64-bit half holds the 8-byte window of one output column */
            const __m128i cols01 = _mm_maddubs_epi16(
                _mm_unpacklo_epi64(win, _mm_srli_si128(win, 1)), taps);
            const __m128i cols23 = _mm_maddubs_epi16(
                _mm_unpacklo_epi64(_mm_srli_si128(win, 2), _mm_srli_si128(win, 3)), taps);
            const __m128i sums0123 = _mm_hadd_epi16(cols01, cols23);

            if (step == 8) {
                const __m128i cols45 = _mm_maddubs_epi16(
                    _mm_unpacklo_epi64(_mm_srli_si128(win, 4), _mm_srli_si128(win, 5)), taps);
                const __m128i cols67 = _mm_maddubs_epi16(
                    _mm_unpacklo_epi64(_mm_srli_si128(win, 6), _mm_srli_si128(win, 7)), taps);
                const __m128i sums4567 = _mm_hadd_epi16(cols45, cols67);

                _mm_store_si128((__m128i *) &dst[col],
                                _mm_hadd_epi16(sums0123, sums4567));
            } else {
                /* only four sums are valid: store the low 8 bytes */
                _mm_storel_epi64((__m128i *) &dst[col],
                                 _mm_hadd_epi16(sums0123, zero));
            }
        }
        row += stride;
        dst += dststride;
    }
}
2464
/**
 * Vertical interpolation of 8-bit samples with the 7-tap filter
 * { -1, 4, -10, 58, 17, -5, 1 } (the taps sum to 64), applied to the source
 * rows y-3 .. y+3 of every output row y.
 *
 * Column processing works on whole rows: each source byte is widened to
 * 16 bits, multiplied by the tap of its row and accumulated with saturating
 * 16-bit adds.  The unnormalised sums are written to dst.
 *
 * When width is a multiple of 16, sixteen columns are produced per step and
 * written with two aligned 16-byte stores.  Otherwise four columns are
 * produced per step; that path loads 8 source bytes per row and stores
 * 8 bytes of results.
 *
 * dststride is counted in int16_t elements, _srcstride in bytes.
 * mcbuffer is unused.
 */
void ff_hevc_put_hevc_qpel_v_1_8_sse(int16_t *dst, ptrdiff_t dststride,
                                     const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
        int16_t* mcbuffer) {
    enum { NUM_TAPS = 7, FIRST_ROW = -3 };
    static const int16_t tap_values[NUM_TAPS] = { -1, 4, -10, 58, 17, -5, 1 };

    const uint8_t *row = _src;
    const ptrdiff_t stride = _srcstride / sizeof(uint8_t);
    const __m128i zero = _mm_setzero_si128();
    __m128i coef[NUM_TAPS];
    int col, line, k;

    (void) mcbuffer;

    /* one broadcast register per tap, built once */
    for (k = 0; k < NUM_TAPS; k++) {
        coef[k] = _mm_set1_epi16(tap_values[k]);
    }

    if ((width & 15) == 0) {
        for (line = 0; line < height; line++) {
            for (col = 0; col < width; col += 16) {
                /* first tap initialises the two accumulators (columns 0-7 and 8-15) */
                __m128i px = _mm_loadu_si128(
                    (const __m128i *) &row[col + FIRST_ROW * stride]);
                __m128i sum_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(px, zero), coef[0]);
                __m128i sum_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(px, zero), coef[0]);

                /* remaining taps, accumulated top to bottom with saturation */
                for (k = 1; k < NUM_TAPS; k++) {
                    px = _mm_loadu_si128(
                        (const __m128i *) &row[col + (FIRST_ROW + k) * stride]);
                    sum_lo = _mm_adds_epi16(sum_lo,
                        _mm_mullo_epi16(_mm_unpacklo_epi8(px, zero), coef[k]));
                    sum_hi = _mm_adds_epi16(sum_hi,
                        _mm_mullo_epi16(_mm_unpackhi_epi8(px, zero), coef[k]));
                }

                _mm_store_si128((__m128i *) &dst[col], sum_lo);
                _mm_store_si128((__m128i *) &dst[col + 8], sum_hi);
            }
            row += stride;
            dst += dststride;
        }
    } else {
        for (line = 0; line < height; line++) {
            for (col = 0; col < width; col += 4) {
                __m128i px = _mm_loadl_epi64(
                    (const __m128i *) &row[col + FIRST_ROW * stride]);
                __m128i sum = _mm_mullo_epi16(_mm_unpacklo_epi8(px, zero), coef[0]);

                for (k = 1; k < NUM_TAPS; k++) {
                    px = _mm_loadl_epi64(
                        (const __m128i *) &row[col + (FIRST_ROW + k) * stride]);
                    sum = _mm_adds_epi16(sum,
                        _mm_mullo_epi16(_mm_unpacklo_epi8(px, zero), coef[k]));
                }

                /* store the low 8 bytes; the loop advances by four columns */
                _mm_storel_epi64((__m128i *) &dst[col], sum);
            }
            row += stride;
            dst += dststride;
        }
    }
}
2626
2627
#if 0
2628
/*
 * Vertical interpolation of 16-bit samples with the 7-tap filter
 * { -1, 4, -10, 58, 17, -5, 1 }, applied to the source rows y-3 .. y+3.
 *
 * _src is reinterpreted as an array of uint16_t and _srcstride (bytes) is
 * halved to obtain the stride in samples.  Four columns are produced per
 * step: the samples are widened to 32 bits, multiplied and summed in 32-bit
 * lanes, shifted right arithmetically by 2, packed to int16_t with signed
 * saturation and the four results stored.
 *
 * dststride is counted in int16_t elements.  mcbuffer is unused.
 */
void ff_hevc_put_hevc_qpel_v_1_10_sse4(int16_t *dst, ptrdiff_t dststride,
                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
        int16_t* mcbuffer) {
    enum { NUM_TAPS = 7, FIRST_ROW = -3 };
    static const int tap_values[NUM_TAPS] = { -1, 4, -10, 58, 17, -5, 1 };

    const uint16_t *row = (const uint16_t *) _src;
    const ptrdiff_t stride = _srcstride >> 1;
    const __m128i zero = _mm_setzero_si128();
    __m128i coef[NUM_TAPS];
    int col, line, k;

    (void) mcbuffer;

    for (k = 0; k < NUM_TAPS; k++) {
        coef[k] = _mm_set1_epi32(tap_values[k]);
    }

    for (line = 0; line < height; line++) {
        for (col = 0; col < width; col += 4) {
            /* four 16-bit samples per row, zero-extended to 32 bits */
            __m128i px = _mm_unpacklo_epi16(
                _mm_loadl_epi64((const __m128i *) &row[col + FIRST_ROW * stride]), zero);
            __m128i sum = _mm_mullo_epi32(px, coef[0]);

            for (k = 1; k < NUM_TAPS; k++) {
                px = _mm_unpacklo_epi16(
                    _mm_loadl_epi64((const __m128i *) &row[col + (FIRST_ROW + k) * stride]),
                    zero);
                sum = _mm_add_epi32(sum, _mm_mullo_epi32(px, coef[k]));
            }

            sum = _mm_srai_epi32(sum, 2); /* bit depth - 8 */
            sum = _mm_packs_epi32(sum, zero);

            _mm_storel_epi64((__m128i *) (dst + col), sum);
        }
        row += stride;
        dst += dststride;
    }
}
2701
#endif
2702
2703
2704
2705
/*
 * Vertical interpolation of 8-bit samples with the symmetric 8-tap filter
 * { -1, 4, -11, 40, 40, -11, 4, -1 } (the taps sum to 64), applied to the
 * source rows y-3 .. y+4 of every output row y.
 *
 * Each source byte is widened to 16 bits, multiplied by the tap of its row
 * and accumulated with saturating 16-bit adds.  The unnormalised sums are
 * written to dst.
 *
 * When width is a multiple of 16, sixteen columns are produced per step and
 * written with two aligned 16-byte stores.  Otherwise four columns are
 * produced per step; that path loads 8 source bytes per row and stores
 * 8 bytes of results.
 *
 * dststride is counted in int16_t elements, _srcstride in bytes.
 * mcbuffer is unused.
 */
void ff_hevc_put_hevc_qpel_v_2_8_sse(int16_t *dst, ptrdiff_t dststride,
                                     const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
        int16_t* mcbuffer) {
    enum { NUM_TAPS = 8, FIRST_ROW = -3 };
    static const int16_t tap_values[NUM_TAPS] = { -1, 4, -11, 40, 40, -11, 4, -1 };

    const uint8_t *row = _src;
    const ptrdiff_t stride = _srcstride / sizeof(uint8_t);
    const __m128i zero = _mm_setzero_si128();
    __m128i coef[NUM_TAPS];
    int col, line, k;

    (void) mcbuffer;

    /* one broadcast register per tap, built once */
    for (k = 0; k < NUM_TAPS; k++) {
        coef[k] = _mm_set1_epi16(tap_values[k]);
    }

    if ((width & 15) == 0) {
        for (line = 0; line < height; line++) {
            for (col = 0; col < width; col += 16) {
                /* first tap initialises the two accumulators (columns 0-7 and 8-15) */
                __m128i px = _mm_loadu_si128(
                    (const __m128i *) &row[col + FIRST_ROW * stride]);
                __m128i sum_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(px, zero), coef[0]);
                __m128i sum_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(px, zero), coef[0]);

                /* remaining taps, accumulated top to bottom with saturation */
                for (k = 1; k < NUM_TAPS; k++) {
                    px = _mm_loadu_si128(
                        (const __m128i *) &row[col + (FIRST_ROW + k) * stride]);
                    sum_lo = _mm_adds_epi16(sum_lo,
                        _mm_mullo_epi16(_mm_unpacklo_epi8(px, zero), coef[k]));
                    sum_hi = _mm_adds_epi16(sum_hi,
                        _mm_mullo_epi16(_mm_unpackhi_epi8(px, zero), coef[k]));
                }

                _mm_store_si128((__m128i *) &dst[col], sum_lo);
                _mm_store_si128((__m128i *) &dst[col + 8], sum_hi);
            }
            row += stride;
            dst += dststride;
        }
    } else {
        for (line = 0; line < height; line++) {
            for (col = 0; col < width; col += 4) {
                __m128i px = _mm_loadl_epi64(
                    (const __m128i *) &row[col + FIRST_ROW * stride]);
                __m128i sum = _mm_mullo_epi16(_mm_unpacklo_epi8(px, zero), coef[0]);

                for (k = 1; k < NUM_TAPS; k++) {
                    px = _mm_loadl_epi64(
                        (const __m128i *) &row[col + (FIRST_ROW + k) * stride]);
                    sum = _mm_adds_epi16(sum,
                        _mm_mullo_epi16(_mm_unpacklo_epi8(px, zero), coef[k]));
                }

                /* store the low 8 bytes; the loop advances by four columns */
                _mm_storel_epi64((__m128i *) &dst[col], sum);
            }
            row += stride;
            dst += dststride;
        }
    }
}
2878
2879
#if 0
2880
/*
 * Vertical interpolation of 16-bit samples with the symmetric 8-tap filter
 * { -1, 4, -11, 40, 40, -11, 4, -1 }, applied to the source rows y-3 .. y+4.
 *
 * _src is reinterpreted as an array of uint16_t and _srcstride (bytes) is
 * halved to obtain the stride in samples.  Four columns are produced per
 * step: the samples are widened to 32 bits, multiplied and summed in 32-bit
 * lanes, shifted right arithmetically by 2, packed to int16_t with signed
 * saturation and the four results stored.
 *
 * Changes against the previous version: the "cosnt" typo in the parameter
 * list is fixed, the pack uses a zero register instead of the -1 tap
 * register as filler for the unused upper half, and the unused r2 / dead
 * initial r1 assignment are removed.
 *
 * NOTE(review): despite the _sse suffix this uses _mm_mullo_epi32, which is
 * an SSE4.1 instruction.
 *
 * dststride is counted in int16_t elements.  mcbuffer is unused.
 */
void ff_hevc_put_hevc_qpel_v_2_10_sse(int16_t *dst, ptrdiff_t dststride,
                                      const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
        int16_t* mcbuffer) {
    int x, y;
    const uint16_t *src = (const uint16_t*) _src;
    ptrdiff_t srcstride = _srcstride >> 1;
    __m128i x1, x2, x3, x4, x5, x6, x7, x8, r1;
    __m128i t1, t2, t3, t4, t5, t6, t7, t8;
    const __m128i zero = _mm_setzero_si128();

    (void) mcbuffer;

    /* one broadcast 32-bit register per tap, rows y-3 .. y+4 */
    t1= _mm_set1_epi32(-1);
    t2= _mm_set1_epi32(4);
    t3= _mm_set1_epi32(-11);
    t4= _mm_set1_epi32(40);
    t5= _mm_set1_epi32(40);
    t6= _mm_set1_epi32(-11);
    t7= _mm_set1_epi32(4);
    t8= _mm_set1_epi32(-1);

    for (y = 0; y < height; y ++) {
        for(x=0;x<width;x+=4){

            /* load four 16-bit samples from each of the eight rows */
            x1 = _mm_loadl_epi64((const __m128i *) &src[x - 3 * srcstride]);
            x2 = _mm_loadl_epi64((const __m128i *) &src[x - 2 * srcstride]);
            x3 = _mm_loadl_epi64((const __m128i *) &src[x - srcstride]);
            x4 = _mm_loadl_epi64((const __m128i *) &src[x]);
            x5 = _mm_loadl_epi64((const __m128i *) &src[x + srcstride]);
            x6 = _mm_loadl_epi64((const __m128i *) &src[x + 2 * srcstride]);
            x7 = _mm_loadl_epi64((const __m128i *) &src[x + 3 * srcstride]);
            x8 = _mm_loadl_epi64((const __m128i *) &src[x + 4 * srcstride]);

            /* zero-extend to 32 bits */
            x1 = _mm_unpacklo_epi16(x1, zero);
            x2 = _mm_unpacklo_epi16(x2, zero);
            x3 = _mm_unpacklo_epi16(x3, zero);
            x4 = _mm_unpacklo_epi16(x4, zero);
            x5 = _mm_unpacklo_epi16(x5, zero);
            x6 = _mm_unpacklo_epi16(x6, zero);
            x7 = _mm_unpacklo_epi16(x7, zero);
            x8 = _mm_unpacklo_epi16(x8, zero);

            /* weighted sum over the eight rows */
            r1 = _mm_mullo_epi32(x1, t1);
            r1 = _mm_add_epi32(r1, _mm_mullo_epi32(x2, t2));
            r1 = _mm_add_epi32(r1, _mm_mullo_epi32(x3, t3));
            r1 = _mm_add_epi32(r1, _mm_mullo_epi32(x4, t4));
            r1 = _mm_add_epi32(r1, _mm_mullo_epi32(x5, t5));
            r1 = _mm_add_epi32(r1, _mm_mullo_epi32(x6, t6));
            r1 = _mm_add_epi32(r1, _mm_mullo_epi32(x7, t7));
            r1 = _mm_add_epi32(r1, _mm_mullo_epi32(x8, t8));

            r1 = _mm_srai_epi32(r1, 2); /* bit depth - 8 */

            /* narrow to int16_t; the upper half is filler and is not stored */
            r1 = _mm_packs_epi32(r1, zero);

            _mm_storel_epi64((__m128i *) (dst + x), r1);
        }
        src += srcstride;
        dst += dststride;
    }
}
2968
#endif
2969
2970
#if 0
2971
static  void ff_hevc_put_hevc_qpel_v_3_sse(int16_t *dst, ptrdiff_t dststride,
2972
                                           const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2973
        int16_t* mcbuffer) {
2974
    int x, y;
2975
    uint8_t *src = (uint8_t*) _src;
2976
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
2977
    __m128i x1, x2, x3, x4, x5, x6, x7, x8, r0, r1, r2;
2978
    __m128i t2, t3, t4, t5, t6, t7, t8;
2979
    r1 = _mm_set_epi16(-1, 4, -10, 58, 17, -5, 1, 0);
2980
2981
    if(!(width & 15)){
2982
        for (y = 0; y < height; y++) {
2983
                    for (x = 0; x < width; x += 16) {
2984
                        /* check if memory needs to be reloaded */
2985
                        x1 = _mm_setzero_si128();
2986
                        x2 = _mm_loadu_si128((__m128i *) &src[x - 2 * srcstride]);
2987
                        x3 = _mm_loadu_si128((__m128i *) &src[x - srcstride]);
2988
                        x4 = _mm_loadu_si128((__m128i *) &src[x]);
2989
                        x5 = _mm_loadu_si128((__m128i *) &src[x + srcstride]);
2990
                        x6 = _mm_loadu_si128((__m128i *) &src[x + 2 * srcstride]);
2991
                        x7 = _mm_loadu_si128((__m128i *) &src[x + 3 * srcstride]);
2992
                        x8 = _mm_loadu_si128((__m128i *) &src[x + 4 * srcstride]);
2993
2994
                        t2 = _mm_unpacklo_epi8(x2, x1);
2995
                        t3 = _mm_unpacklo_epi8(x3, x1);
2996
                        t4 = _mm_unpacklo_epi8(x4, x1);
2997
                        t5 = _mm_unpacklo_epi8(x5, x1);
2998
                        t6 = _mm_unpacklo_epi8(x6, x1);
2999
                        t7 = _mm_unpacklo_epi8(x7, x1);
3000
                        t8 = _mm_unpacklo_epi8(x8, x1);
3001
3002
                        x2 = _mm_unpackhi_epi8(x2, x1);
3003
                        x3 = _mm_unpackhi_epi8(x3, x1);
3004
                        x4 = _mm_unpackhi_epi8(x4, x1);
3005
                        x5 = _mm_unpackhi_epi8(x5, x1);
3006
                        x6 = _mm_unpackhi_epi8(x6, x1);
3007
                        x7 = _mm_unpackhi_epi8(x7, x1);
3008
                        x8 = _mm_unpackhi_epi8(x8, x1);
3009
3010
                        /* multiply by correct value : */
3011
                        r0 = _mm_mullo_epi16(t2,
3012
                                _mm_set1_epi16(_mm_extract_epi16(r1, 1)));
3013
                        r2 = _mm_mullo_epi16(x2,
3014
                                _mm_set1_epi16(_mm_extract_epi16(r1, 1)));
3015
3016
                        r0 = _mm_adds_epi16(r0,
3017
                                _mm_mullo_epi16(t3,
3018
                                        _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
3019
                        r2 = _mm_adds_epi16(r2,
3020
                                _mm_mullo_epi16(x3,
3021
                                        _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
3022
3023
                        r0 = _mm_adds_epi16(r0,
3024
                                _mm_mullo_epi16(t4,
3025
                                        _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
3026
                        r2 = _mm_adds_epi16(r2,
3027
                                _mm_mullo_epi16(x4,
3028
                                        _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
3029
3030
                        r0 = _mm_adds_epi16(r0,
3031
                                _mm_mullo_epi16(t5,
3032
                                        _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
3033
                        r2 = _mm_adds_epi16(r2,
3034
                                _mm_mullo_epi16(x5,
3035
                                        _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
3036
3037
                        r0 = _mm_adds_epi16(r0,
3038
                                _mm_mullo_epi16(t6,
3039
                                        _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
3040
                        r2 = _mm_adds_epi16(r2,
3041
                                _mm_mullo_epi16(x6,
3042
                                        _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
3043
3044
                        r0 = _mm_adds_epi16(r0,
3045
                                _mm_mullo_epi16(t7,
3046
                                        _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
3047
                        r2 = _mm_adds_epi16(r2,
3048
                                _mm_mullo_epi16(x7,
3049
                                        _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
3050
3051
                        r0 = _mm_adds_epi16(r0,
3052
                                _mm_mullo_epi16(t8,
3053
                                        _mm_set1_epi16(_mm_extract_epi16(r1, 7))));
3054
                        r2 = _mm_adds_epi16(r2,
3055
                                _mm_mullo_epi16(x8,
3056
                                        _mm_set1_epi16(_mm_extract_epi16(r1, 7))));
3057
3058
                        /* give results back            */
3059
                        _mm_store_si128((__m128i *) &dst[x],
3060
                                _mm_srli_epi16(r0, BIT_DEPTH - 8));
3061
                        _mm_store_si128((__m128i *) &dst[x + 8],
3062
                                _mm_srli_epi16(r2, BIT_DEPTH - 8));
3063
                    }
3064
                    src += srcstride;
3065
                    dst += dststride;
3066
                }
3067
    }else{
3068
        x = 0;
3069
                for (y = 0; y < height; y ++) {
3070
                    for(x=0;x<width;x+=4){
3071
                    r0 = _mm_set1_epi16(0);
3072
                    /* load data in register  */
3073
                    //x1 = _mm_setzero_si128();
3074
                    x2 = _mm_loadl_epi64((__m128i *) &src[x-2 * srcstride]);
3075
                    x3 = _mm_loadl_epi64((__m128i *) &src[x-srcstride]);
3076
                    x4 = _mm_loadl_epi64((__m128i *) &src[x]);
3077
                    x5 = _mm_loadl_epi64((__m128i *) &src[x+srcstride]);
3078
                    x6 = _mm_loadl_epi64((__m128i *) &src[x+2 * srcstride]);
3079
                    x7 = _mm_loadl_epi64((__m128i *) &src[x+3 * srcstride]);
3080
                    x8 = _mm_loadl_epi64((__m128i *) &src[x + 4 * srcstride]);
3081
3082
                    x1 = _mm_unpacklo_epi8(x1,r0);
3083
                    x2 = _mm_unpacklo_epi8(x2, r0);
3084
                    x3 = _mm_unpacklo_epi8(x3, r0);
3085
                    x4 = _mm_unpacklo_epi8(x4, r0);
3086
                    x5 = _mm_unpacklo_epi8(x5, r0);
3087
                    x6 = _mm_unpacklo_epi8(x6, r0);
3088
                    x7 = _mm_unpacklo_epi8(x7, r0);
3089
                    x8 = _mm_unpacklo_epi8(x8, r0);
3090
3091
3092
                    r0 = _mm_mullo_epi16(x2, _mm_set1_epi16(_mm_extract_epi16(r1, 1)));
3093
3094
3095
                    r0 = _mm_adds_epi16(r0,
3096
                            _mm_mullo_epi16(x3,
3097
                                    _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
3098
3099
3100
                    r0 = _mm_adds_epi16(r0,
3101
                            _mm_mullo_epi16(x4,
3102
                                    _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
3103
3104
3105
                    r0 = _mm_adds_epi16(r0,
3106
                            _mm_mullo_epi16(x5,
3107
                                    _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
3108
3109
3110
                    r0 = _mm_adds_epi16(r0,
3111
                            _mm_mullo_epi16(x6,
3112
                                    _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
3113
3114
3115
                    r0 = _mm_adds_epi16(r0,
3116
                            _mm_mullo_epi16(x7,
3117
                                    _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
3118
3119
3120
                    r0 = _mm_adds_epi16(r0,
3121
                            _mm_mullo_epi16(x8,
3122
                                    _mm_set1_epi16(_mm_extract_epi16(r1, 7))));
3123
3124
3125
                    r0 = _mm_srli_epi16(r0, BIT_DEPTH - 8);
3126
                    /* give results back            */
3127
                    _mm_storel_epi64((__m128i *) &dst[x], r0);
3128
3129
                    }
3130
                    src += srcstride;
3131
                    dst += dststride;
3132
                }
3133
    }
3134
3135
}
3136
#endif
3137
3138
/*
 * Vertical 7-tap interpolation of 8-bit samples into 16-bit intermediates.
 *
 * For every output sample the source rows at offsets -2 .. +4 are weighted
 * with { 1, -5, 17, 58, -10, 4, -1 } and summed in 16-bit lanes. The sum is
 * stored as is; no shift or rounding is applied here.
 *
 *   dst, dststride   output samples and row pitch in int16_t elements
 *   _src, _srcstride input samples and row pitch in bytes; rows -2 .. height+3
 *                    relative to _src are read
 *   width, height    size of the block to produce
 *   mcbuffer         not used by this function
 *
 * When width is a multiple of 16 the results are written with aligned
 * 16-byte stores, so &dst[x] has to be 16-byte aligned on that path.
 * Otherwise four columns are produced per step; each step still loads
 * eight source bytes per row, of which only the low four are used.
 */
void ff_hevc_put_hevc_qpel_v_3_8_sse(int16_t *dst, ptrdiff_t dststride,
                                     const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
        int16_t* mcbuffer) {
    /* weights for the rows at offsets -2, -1, 0, +1, +2, +3, +4 */
    static const short row_weight[7] = { 1, -5, 17, 58, -10, 4, -1 };
    const uint8_t *src = _src;
    const ptrdiff_t srcstride = _srcstride;
    const __m128i zero = _mm_setzero_si128();
    __m128i weight[7];
    int x, y, k;

    (void) mcbuffer;

    /* broadcast every weight once instead of once per sample group */
    for (k = 0; k < 7; k++)
        weight[k] = _mm_set1_epi16(row_weight[k]);

    if (!(width & 15)) {
        /* 16 columns per step: widen the low and high 8 bytes separately */
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 16) {
                __m128i sum_lo = zero;   /* columns x     .. x + 7  */
                __m128i sum_hi = zero;   /* columns x + 8 .. x + 15 */

                for (k = 0; k < 7; k++) {
                    const __m128i px = _mm_loadu_si128(
                            (const __m128i *) &src[x + (k - 2) * srcstride]);

                    sum_lo = _mm_adds_epi16(sum_lo,
                            _mm_mullo_epi16(_mm_unpacklo_epi8(px, zero), weight[k]));
                    sum_hi = _mm_adds_epi16(sum_hi,
                            _mm_mullo_epi16(_mm_unpackhi_epi8(px, zero), weight[k]));
                }

                _mm_store_si128((__m128i *) &dst[x], sum_lo);
                _mm_store_si128((__m128i *) &dst[x + 8], sum_hi);
            }
            src += srcstride;
            dst += dststride;
        }
    } else {
        /* 4 columns per step */
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 4) {
                __m128i sum = zero;

                for (k = 0; k < 7; k++) {
                    const __m128i px = _mm_loadl_epi64(
                            (const __m128i *) &src[x + (k - 2) * srcstride]);

                    sum = _mm_adds_epi16(sum,
                            _mm_mullo_epi16(_mm_unpacklo_epi8(px, zero), weight[k]));
                }

                /* only the low four 16-bit lanes are written */
                _mm_storel_epi64((__m128i *) &dst[x], sum);
            }
            src += srcstride;
            dst += dststride;
        }
    }
}
3290
3291
3292
#if 0
3293
void ff_hevc_put_hevc_qpel_v_3_10_sse(int16_t *dst, ptrdiff_t dststride,
3294
                                      const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
3295
        int16_t* mcbuffer) {
3296
    int x, y;
3297
    uint16_t *src = (uint16_t*) _src;
3298
    ptrdiff_t srcstride = _srcstride >> 1;
3299
    __m128i x1, x2, x3, x4, x5, x6, x7, r0;
3300
    __m128i t1, t2, t3, t4, t5, t6, t7, t8;
3301
3302
    t7 = _mm_set1_epi32(-1);
3303
    t6 = _mm_set1_epi32(4);
3304
    t5 = _mm_set1_epi32(-10);
3305
    t4 = _mm_set1_epi32(58);
3306
    t3 = _mm_set1_epi32(17);
3307
    t2 = _mm_set1_epi32(-5);
3308
    t1 = _mm_set1_epi32(1);
3309
    t8= _mm_setzero_si128();
3310
    {
3311
3312
        for (y = 0; y < height; y ++) {
3313
            for(x=0;x<width;x+=4){
3314
                /* load data in register  */
3315
                x1 = _mm_loadl_epi64((__m128i *) &src[x-2 * srcstride]);
3316
                x2 = _mm_loadl_epi64((__m128i *) &src[x-srcstride]);
3317
                x3 = _mm_loadl_epi64((__m128i *) &src[x]);
3318
                x4 = _mm_loadl_epi64((__m128i *) &src[x+srcstride]);
3319
                x5 = _mm_loadl_epi64((__m128i *) &src[x+2 * srcstride]);
3320
                x6 = _mm_loadl_epi64((__m128i *) &src[x+3 * srcstride]);
3321
                x7 = _mm_loadl_epi64((__m128i *) &src[x + 4 * srcstride]);
3322
3323
                x1 = _mm_unpacklo_epi16(x1, t8);
3324
                x2 = _mm_unpacklo_epi16(x2, t8);
3325
                x3 = _mm_unpacklo_epi16(x3, t8);
3326
                x4 = _mm_unpacklo_epi16(x4, t8);
3327
                x5 = _mm_unpacklo_epi16(x5, t8);
3328
                x6 = _mm_unpacklo_epi16(x6, t8);
3329
                x7 = _mm_unpacklo_epi16(x7, t8);
3330
3331
                r0 = _mm_mullo_epi32(x1, t1);
3332
3333
                r0 = _mm_add_epi32(r0,
3334
                        _mm_mullo_epi32(x2,t2));
3335
3336
                r0 = _mm_add_epi32(r0,
3337
                        _mm_mullo_epi32(x3,t3));
3338
3339
                r0 = _mm_add_epi32(r0,
3340
                        _mm_mullo_epi32(x4,t4));
3341
3342
                r0 = _mm_add_epi32(r0,
3343
                        _mm_mullo_epi32(x5,t5));
3344
3345
                r0 = _mm_add_epi32(r0,
3346
                        _mm_mullo_epi32(x6,t6));
3347
3348
                r0 = _mm_add_epi32(r0,
3349
                        _mm_mullo_epi32(x7,t7));
3350
3351
                r0= _mm_srai_epi32(r0,2);
3352
3353
                r0= _mm_packs_epi32(r0,t8);
3354
3355
                /* give results back            */
3356
                _mm_storel_epi64((__m128i *) &dst[x], r0);
3357
3358
            }
3359
            src += srcstride;
3360
            dst += dststride;
3361
        }
3362
    }
3363
3364
}
3365
#endif
3366
3367
3368
3369
void ff_hevc_put_hevc_qpel_h_1_v_1_sse(int16_t *dst, ptrdiff_t dststride,
3370
                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
3371
43.5k
        int16_t* mcbuffer) {
3372
43.5k
    int x, y;
3373
43.5k
    uint8_t* src = (uint8_t*) _src;
3374
43.5k
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
3375
43.5k
    int16_t *tmp = mcbuffer;
3376
43.5k
    __m128i x1, x2, x3, x4, x5, x6, x7, rBuffer, rTemp, r0, r1;
3377
43.5k
    __m128i t1, t2, t3, t4, t5, t6, t7, t8;
3378
3379
43.5k
    src -= qpel_extra_before[1] * srcstride;
3380
43.5k
    r0 = _mm_set_epi8(0, 1, -5, 17, 58, -10, 4, -1, 0, 1, -5, 17, 58, -10, 4,
3381
43.5k
            -1);
3382
3383
    /* LOAD src from memory to registers to limit memory bandwidth */
3384
43.5k
    if (width == 4) {
3385
3386
104k
        for (y = 0; y < height + qpel_extra[1]; y += 2) {
3387
            /* load data in register     */
3388
91.4k
            x1 = _mm_loadu_si128((__m128i *) &src[-3]);
3389
91.4k
            src += srcstride;
3390
91.4k
            t1 = _mm_loadu_si128((__m128i *) &src[-3]);
3391
91.4k
            x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3392
91.4k
            t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
3393
91.4k
            x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3394
91.4k
                    _mm_srli_si128(x1, 3));
3395
91.4k
            t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
3396
91.4k
                    _mm_srli_si128(t1, 3));
3397
3398
            /*  PMADDUBSW then PMADDW     */
3399
91.4k
            x2 = _mm_maddubs_epi16(x2, r0);
3400
91.4k
            t2 = _mm_maddubs_epi16(t2, r0);
3401
91.4k
            x3 = _mm_maddubs_epi16(x3, r0);
3402
91.4k
            t3 = _mm_maddubs_epi16(t3, r0);
3403
91.4k
            x2 = _mm_hadd_epi16(x2, x3);
3404
91.4k
            t2 = _mm_hadd_epi16(t2, t3);
3405
91.4k
            x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
3406
91.4k
            t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
3407
91.4k
            x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
3408
91.4k
            t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
3409
            /* give results back            */
3410
91.4k
            _mm_storel_epi64((__m128i *) &tmp[0], x2);
3411
3412
91.4k
            tmp += MAX_PB_SIZE;
3413
91.4k
            _mm_storel_epi64((__m128i *) &tmp[0], t2);
3414
3415
91.4k
            src += srcstride;
3416
91.4k
            tmp += MAX_PB_SIZE;
3417
91.4k
        }
3418
13.0k
    } else
3419
525k
        for (y = 0; y < height + qpel_extra[1]; y++) {
3420
1.29M
            for (x = 0; x < width; x += 8) {
3421
                /* load data in register     */
3422
799k
                x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
3423
799k
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3424
799k
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3425
799k
                        _mm_srli_si128(x1, 3));
3426
799k
                x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
3427
799k
                        _mm_srli_si128(x1, 5));
3428
799k
                x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
3429
799k
                        _mm_srli_si128(x1, 7));
3430
3431
                /*  PMADDUBSW then PMADDW     */
3432
799k
                x2 = _mm_maddubs_epi16(x2, r0);
3433
799k
                x3 = _mm_maddubs_epi16(x3, r0);
3434
799k
                x4 = _mm_maddubs_epi16(x4, r0);
3435
799k
                x5 = _mm_maddubs_epi16(x5, r0);
3436
799k
                x2 = _mm_hadd_epi16(x2, x3);
3437
799k
                x4 = _mm_hadd_epi16(x4, x5);
3438
799k
                x2 = _mm_hadd_epi16(x2, x4);
3439
799k
                x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
3440
3441
                /* give results back            */
3442
799k
                _mm_store_si128((__m128i *) &tmp[x], x2);
3443
3444
799k
            }
3445
494k
            src += srcstride;
3446
494k
            tmp += MAX_PB_SIZE;
3447
494k
        }
3448
3449
43.5k
    tmp = mcbuffer + qpel_extra_before[1] * MAX_PB_SIZE;
3450
43.5k
    srcstride = MAX_PB_SIZE;
3451
3452
    /* vertical treatment on temp table : tmp contains 16 bit values, so need to use 32 bit  integers
3453
     for register calculations */
3454
43.5k
    rTemp = _mm_set_epi16(0, 1, -5, 17, 58, -10, 4, -1);
3455
459k
    for (y = 0; y < height; y++) {
3456
1.07M
        for (x = 0; x < width; x += 8) {
3457
3458
657k
            x1 = _mm_load_si128((__m128i *) &tmp[x - 3 * srcstride]);
3459
657k
            x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
3460
657k
            x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
3461
657k
            x4 = _mm_load_si128((__m128i *) &tmp[x]);
3462
657k
            x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
3463
657k
            x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
3464
657k
            x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
3465
3466
657k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 0));
3467
657k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
3468
657k
            t8 = _mm_mullo_epi16(x1, r0);
3469
657k
            rBuffer = _mm_mulhi_epi16(x1, r0);
3470
657k
            t7 = _mm_mullo_epi16(x2, r1);
3471
657k
            t1 = _mm_unpacklo_epi16(t8, rBuffer);
3472
657k
            x1 = _mm_unpackhi_epi16(t8, rBuffer);
3473
3474
657k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
3475
657k
            rBuffer = _mm_mulhi_epi16(x2, r1);
3476
657k
            t8 = _mm_mullo_epi16(x3, r0);
3477
657k
            t2 = _mm_unpacklo_epi16(t7, rBuffer);
3478
657k
            x2 = _mm_unpackhi_epi16(t7, rBuffer);
3479
3480
657k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
3481
657k
            rBuffer = _mm_mulhi_epi16(x3, r0);
3482
657k
            t7 = _mm_mullo_epi16(x4, r1);
3483
657k
            t3 = _mm_unpacklo_epi16(t8, rBuffer);
3484
657k
            x3 = _mm_unpackhi_epi16(t8, rBuffer);
3485
3486
657k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
3487
657k
            rBuffer = _mm_mulhi_epi16(x4, r1);
3488
657k
            t8 = _mm_mullo_epi16(x5, r0);
3489
657k
            t4 = _mm_unpacklo_epi16(t7, rBuffer);
3490
657k
            x4 = _mm_unpackhi_epi16(t7, rBuffer);
3491
3492
657k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
3493
657k
            rBuffer = _mm_mulhi_epi16(x5, r0);
3494
657k
            t7 = _mm_mullo_epi16(x6, r1);
3495
657k
            t5 = _mm_unpacklo_epi16(t8, rBuffer);
3496
657k
            x5 = _mm_unpackhi_epi16(t8, rBuffer);
3497
3498
657k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
3499
657k
            rBuffer = _mm_mulhi_epi16(x6, r1);
3500
657k
            t8 = _mm_mullo_epi16(x7, r0);
3501
657k
            t6 = _mm_unpacklo_epi16(t7, rBuffer);
3502
657k
            x6 = _mm_unpackhi_epi16(t7, rBuffer);
3503
3504
657k
            rBuffer = _mm_mulhi_epi16(x7, r0);
3505
657k
            t7 = _mm_unpacklo_epi16(t8, rBuffer);
3506
657k
            x7 = _mm_unpackhi_epi16(t8, rBuffer);
3507
3508
3509
3510
            /* add calculus by correct value : */
3511
3512
657k
            r1 = _mm_add_epi32(x1, x2);
3513
657k
            x3 = _mm_add_epi32(x3, x4);
3514
657k
            x5 = _mm_add_epi32(x5, x6);
3515
657k
            r1 = _mm_add_epi32(r1, x3);
3516
3517
657k
            r1 = _mm_add_epi32(r1, x5);
3518
3519
657k
            r0 = _mm_add_epi32(t1, t2);
3520
657k
            t3 = _mm_add_epi32(t3, t4);
3521
657k
            t5 = _mm_add_epi32(t5, t6);
3522
657k
            r0 = _mm_add_epi32(r0, t3);
3523
657k
            r0 = _mm_add_epi32(r0, t5);
3524
657k
            r1 = _mm_add_epi32(r1, x7);
3525
657k
            r0 = _mm_add_epi32(r0, t7);
3526
657k
            r1 = _mm_srli_epi32(r1, 6);
3527
657k
            r0 = _mm_srli_epi32(r0, 6);
3528
3529
657k
            r1 = _mm_and_si128(r1,
3530
657k
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3531
657k
            r0 = _mm_and_si128(r0,
3532
657k
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3533
657k
            r0 = _mm_hadd_epi16(r0, r1);
3534
657k
            _mm_store_si128((__m128i *) &dst[x], r0);
3535
3536
657k
        }
3537
416k
        tmp += MAX_PB_SIZE;
3538
416k
        dst += dststride;
3539
416k
    }
3540
43.5k
}
3541
void ff_hevc_put_hevc_qpel_h_1_v_2_sse(int16_t *dst, ptrdiff_t dststride,
3542
                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
3543
14.2k
        int16_t* mcbuffer) {
3544
14.2k
    int x, y;
3545
14.2k
    uint8_t *src = (uint8_t*) _src;
3546
14.2k
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
3547
14.2k
    int16_t *tmp = mcbuffer;
3548
14.2k
    __m128i x1, x2, x3, x4, x5, x6, x7, x8, rBuffer, rTemp, r0, r1;
3549
14.2k
    __m128i t1, t2, t3, t4, t5, t6, t7, t8;
3550
3551
14.2k
    src -= qpel_extra_before[2] * srcstride;
3552
14.2k
    r0 = _mm_set_epi8(0, 1, -5, 17, 58, -10, 4, -1, 0, 1, -5, 17, 58, -10, 4,
3553
14.2k
            -1);
3554
3555
    /* LOAD src from memory to registers to limit memory bandwidth */
3556
14.2k
    if (width == 4) {
3557
3558
24.1k
        for (y = 0; y < height + qpel_extra[2]; y += 2) {
3559
            /* load data in register     */
3560
21.4k
            x1 = _mm_loadu_si128((__m128i *) &src[-3]);
3561
21.4k
            src += srcstride;
3562
21.4k
            t1 = _mm_loadu_si128((__m128i *) &src[-3]);
3563
21.4k
            x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3564
21.4k
            t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
3565
21.4k
            x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3566
21.4k
                    _mm_srli_si128(x1, 3));
3567
21.4k
            t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
3568
21.4k
                    _mm_srli_si128(t1, 3));
3569
3570
            /*  PMADDUBSW then PMADDW     */
3571
21.4k
            x2 = _mm_maddubs_epi16(x2, r0);
3572
21.4k
            t2 = _mm_maddubs_epi16(t2, r0);
3573
21.4k
            x3 = _mm_maddubs_epi16(x3, r0);
3574
21.4k
            t3 = _mm_maddubs_epi16(t3, r0);
3575
21.4k
            x2 = _mm_hadd_epi16(x2, x3);
3576
21.4k
            t2 = _mm_hadd_epi16(t2, t3);
3577
21.4k
            x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
3578
21.4k
            t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
3579
21.4k
            x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
3580
21.4k
            t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
3581
            /* give results back            */
3582
21.4k
            _mm_storel_epi64((__m128i *) &tmp[0], x2);
3583
3584
21.4k
            tmp += MAX_PB_SIZE;
3585
21.4k
            _mm_storel_epi64((__m128i *) &tmp[0], t2);
3586
3587
21.4k
            src += srcstride;
3588
21.4k
            tmp += MAX_PB_SIZE;
3589
21.4k
        }
3590
2.65k
    } else
3591
210k
        for (y = 0; y < height + qpel_extra[2]; y++) {
3592
503k
            for (x = 0; x < width; x += 8) {
3593
                /* load data in register     */
3594
304k
                x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
3595
304k
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3596
304k
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3597
304k
                        _mm_srli_si128(x1, 3));
3598
304k
                x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
3599
304k
                        _mm_srli_si128(x1, 5));
3600
304k
                x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
3601
304k
                        _mm_srli_si128(x1, 7));
3602
3603
                /*  PMADDUBSW then PMADDW     */
3604
304k
                x2 = _mm_maddubs_epi16(x2, r0);
3605
304k
                x3 = _mm_maddubs_epi16(x3, r0);
3606
304k
                x4 = _mm_maddubs_epi16(x4, r0);
3607
304k
                x5 = _mm_maddubs_epi16(x5, r0);
3608
304k
                x2 = _mm_hadd_epi16(x2, x3);
3609
304k
                x4 = _mm_hadd_epi16(x4, x5);
3610
304k
                x2 = _mm_hadd_epi16(x2, x4);
3611
304k
                x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
3612
3613
                /* give results back            */
3614
304k
                _mm_store_si128((__m128i *) &tmp[x], x2);
3615
3616
304k
            }
3617
198k
            src += srcstride;
3618
198k
            tmp += MAX_PB_SIZE;
3619
198k
        }
3620
3621
14.2k
    tmp = mcbuffer + qpel_extra_before[2] * MAX_PB_SIZE;
3622
14.2k
    srcstride = MAX_PB_SIZE;
3623
3624
    /* vertical treatment on temp table : tmp contains 16 bit values, so need to use 32 bit  integers
3625
     for register calculations */
3626
14.2k
    rTemp = _mm_set_epi16(-1, 4, -11, 40, 40, -11, 4, -1);
3627
153k
    for (y = 0; y < height; y++) {
3628
357k
        for (x = 0; x < width; x += 8) {
3629
3630
218k
            x1 = _mm_load_si128((__m128i *) &tmp[x - 3 * srcstride]);
3631
218k
            x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
3632
218k
            x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
3633
218k
            x4 = _mm_load_si128((__m128i *) &tmp[x]);
3634
218k
            x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
3635
218k
            x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
3636
218k
            x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
3637
218k
            x8 = _mm_loadu_si128((__m128i *) &tmp[x + 4 * srcstride]);
3638
3639
218k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 0));
3640
218k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
3641
218k
            t8 = _mm_mullo_epi16(x1, r0);
3642
218k
            rBuffer = _mm_mulhi_epi16(x1, r0);
3643
218k
            t7 = _mm_mullo_epi16(x2, r1);
3644
218k
            t1 = _mm_unpacklo_epi16(t8, rBuffer);
3645
218k
            x1 = _mm_unpackhi_epi16(t8, rBuffer);
3646
3647
218k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
3648
218k
            rBuffer = _mm_mulhi_epi16(x2, r1);
3649
218k
            t8 = _mm_mullo_epi16(x3, r0);
3650
218k
            t2 = _mm_unpacklo_epi16(t7, rBuffer);
3651
218k
            x2 = _mm_unpackhi_epi16(t7, rBuffer);
3652
3653
218k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
3654
218k
            rBuffer = _mm_mulhi_epi16(x3, r0);
3655
218k
            t7 = _mm_mullo_epi16(x4, r1);
3656
218k
            t3 = _mm_unpacklo_epi16(t8, rBuffer);
3657
218k
            x3 = _mm_unpackhi_epi16(t8, rBuffer);
3658
3659
218k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
3660
218k
            rBuffer = _mm_mulhi_epi16(x4, r1);
3661
218k
            t8 = _mm_mullo_epi16(x5, r0);
3662
218k
            t4 = _mm_unpacklo_epi16(t7, rBuffer);
3663
218k
            x4 = _mm_unpackhi_epi16(t7, rBuffer);
3664
3665
218k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
3666
218k
            rBuffer = _mm_mulhi_epi16(x5, r0);
3667
218k
            t7 = _mm_mullo_epi16(x6, r1);
3668
218k
            t5 = _mm_unpacklo_epi16(t8, rBuffer);
3669
218k
            x5 = _mm_unpackhi_epi16(t8, rBuffer);
3670
3671
218k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
3672
218k
            rBuffer = _mm_mulhi_epi16(x6, r1);
3673
218k
            t8 = _mm_mullo_epi16(x7, r0);
3674
218k
            t6 = _mm_unpacklo_epi16(t7, rBuffer);
3675
218k
            x6 = _mm_unpackhi_epi16(t7, rBuffer);
3676
3677
218k
            rBuffer = _mm_mulhi_epi16(x7, r0);
3678
218k
            t7 = _mm_unpacklo_epi16(t8, rBuffer);
3679
218k
            x7 = _mm_unpackhi_epi16(t8, rBuffer);
3680
3681
218k
            t8 = _mm_unpacklo_epi16(
3682
218k
                    _mm_mullo_epi16(x8,
3683
218k
                            _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
3684
218k
                            _mm_mulhi_epi16(x8,
3685
218k
                                    _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
3686
218k
            x8 = _mm_unpackhi_epi16(
3687
218k
                    _mm_mullo_epi16(x8,
3688
218k
                            _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
3689
218k
                            _mm_mulhi_epi16(x8,
3690
218k
                                    _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
3691
3692
            /* add calculus by correct value : */
3693
3694
218k
            r1 = _mm_add_epi32(x1, x2);
3695
218k
            x3 = _mm_add_epi32(x3, x4);
3696
218k
            x5 = _mm_add_epi32(x5, x6);
3697
218k
            r1 = _mm_add_epi32(r1, x3);
3698
218k
            x7 = _mm_add_epi32(x7, x8);
3699
218k
            r1 = _mm_add_epi32(r1, x5);
3700
3701
218k
            r0 = _mm_add_epi32(t1, t2);
3702
218k
            t3 = _mm_add_epi32(t3, t4);
3703
218k
            t5 = _mm_add_epi32(t5, t6);
3704
218k
            r0 = _mm_add_epi32(r0, t3);
3705
218k
            t7 = _mm_add_epi32(t7, t8);
3706
218k
            r0 = _mm_add_epi32(r0, t5);
3707
218k
            r1 = _mm_add_epi32(r1, x7);
3708
218k
            r0 = _mm_add_epi32(r0, t7);
3709
218k
            r1 = _mm_srli_epi32(r1, 6);
3710
218k
            r0 = _mm_srli_epi32(r0, 6);
3711
3712
218k
            r1 = _mm_and_si128(r1,
3713
218k
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3714
218k
            r0 = _mm_and_si128(r0,
3715
218k
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3716
218k
            r0 = _mm_hadd_epi16(r0, r1);
3717
218k
            _mm_store_si128((__m128i *) &dst[x], r0);
3718
3719
218k
        }
3720
139k
        tmp += MAX_PB_SIZE;
3721
139k
        dst += dststride;
3722
139k
    }
3723
14.2k
}
3724
void ff_hevc_put_hevc_qpel_h_1_v_3_sse(int16_t *dst, ptrdiff_t dststride,
3725
                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
3726
28.6k
        int16_t* mcbuffer) {
3727
28.6k
    int x, y;
3728
28.6k
    uint8_t *src = (uint8_t*) _src;
3729
28.6k
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
3730
28.6k
    int16_t *tmp = mcbuffer;
3731
28.6k
    __m128i x1, x2, x3, x4, x5, x6, x7, x8, rBuffer, rTemp, r0, r1;
3732
28.6k
    __m128i t1, t2, t3, t4, t5, t6, t7, t8;
3733
3734
28.6k
    src -= qpel_extra_before[3] * srcstride;
3735
28.6k
    r0 = _mm_set_epi8(0, 1, -5, 17, 58, -10, 4, -1, 0, 1, -5, 17, 58, -10, 4,
3736
28.6k
            -1);
3737
3738
    /* LOAD src from memory to registers to limit memory bandwidth */
3739
28.6k
    if (width == 4) {
3740
3741
31.7k
        for (y = 0; y < height + qpel_extra[3]; y += 2) {
3742
            /* load data in register     */
3743
27.8k
            x1 = _mm_loadu_si128((__m128i *) &src[-3]);
3744
27.8k
            src += srcstride;
3745
27.8k
            t1 = _mm_loadu_si128((__m128i *) &src[-3]);
3746
27.8k
            x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3747
27.8k
            t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
3748
27.8k
            x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3749
27.8k
                    _mm_srli_si128(x1, 3));
3750
27.8k
            t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
3751
27.8k
                    _mm_srli_si128(t1, 3));
3752
3753
            /*  PMADDUBSW then PMADDW     */
3754
27.8k
            x2 = _mm_maddubs_epi16(x2, r0);
3755
27.8k
            t2 = _mm_maddubs_epi16(t2, r0);
3756
27.8k
            x3 = _mm_maddubs_epi16(x3, r0);
3757
27.8k
            t3 = _mm_maddubs_epi16(t3, r0);
3758
27.8k
            x2 = _mm_hadd_epi16(x2, x3);
3759
27.8k
            t2 = _mm_hadd_epi16(t2, t3);
3760
27.8k
            x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
3761
27.8k
            t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
3762
27.8k
            x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
3763
27.8k
            t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
3764
            /* give results back            */
3765
27.8k
            _mm_storel_epi64((__m128i *) &tmp[0], x2);
3766
3767
27.8k
            tmp += MAX_PB_SIZE;
3768
27.8k
            _mm_storel_epi64((__m128i *) &tmp[0], t2);
3769
3770
27.8k
            src += srcstride;
3771
27.8k
            tmp += MAX_PB_SIZE;
3772
27.8k
        }
3773
3.90k
    } else
3774
394k
        for (y = 0; y < height + qpel_extra[3]; y++) {
3775
869k
            for (x = 0; x < width; x += 8) {
3776
                /* load data in register     */
3777
500k
                x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
3778
500k
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3779
500k
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3780
500k
                        _mm_srli_si128(x1, 3));
3781
500k
                x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
3782
500k
                        _mm_srli_si128(x1, 5));
3783
500k
                x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
3784
500k
                        _mm_srli_si128(x1, 7));
3785
3786
                /*  PMADDUBSW then PMADDW     */
3787
500k
                x2 = _mm_maddubs_epi16(x2, r0);
3788
500k
                x3 = _mm_maddubs_epi16(x3, r0);
3789
500k
                x4 = _mm_maddubs_epi16(x4, r0);
3790
500k
                x5 = _mm_maddubs_epi16(x5, r0);
3791
500k
                x2 = _mm_hadd_epi16(x2, x3);
3792
500k
                x4 = _mm_hadd_epi16(x4, x5);
3793
500k
                x2 = _mm_hadd_epi16(x2, x4);
3794
500k
                x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
3795
3796
                /* give results back            */
3797
500k
                _mm_store_si128((__m128i *) &tmp[x], x2);
3798
3799
500k
            }
3800
369k
            src += srcstride;
3801
369k
            tmp += MAX_PB_SIZE;
3802
369k
        }
3803
3804
28.6k
    tmp = mcbuffer + qpel_extra_before[3] * MAX_PB_SIZE;
3805
28.6k
    srcstride = MAX_PB_SIZE;
3806
3807
    /* vertical treatment on temp table : tmp contains 16 bit values, so need to use 32 bit  integers
3808
     for register calculations */
3809
28.6k
    rTemp = _mm_set_epi16(-1, 4, -10, 58, 17, -5, 1, 0);
3810
281k
    for (y = 0; y < height; y++) {
3811
607k
        for (x = 0; x < width; x += 8) {
3812
3813
354k
            x1 = _mm_setzero_si128();
3814
354k
            x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
3815
354k
            x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
3816
354k
            x4 = _mm_load_si128((__m128i *) &tmp[x]);
3817
354k
            x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
3818
354k
            x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
3819
354k
            x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
3820
354k
            x8 = _mm_load_si128((__m128i *) &tmp[x + 4 * srcstride]);
3821
3822
3823
354k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
3824
3825
354k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
3826
354k
            t7 = _mm_mullo_epi16(x2, r1);
3827
354k
            rBuffer = _mm_mulhi_epi16(x2, r1);
3828
354k
            t8 = _mm_mullo_epi16(x3, r0);
3829
354k
            t2 = _mm_unpacklo_epi16(t7, rBuffer);
3830
354k
            x2 = _mm_unpackhi_epi16(t7, rBuffer);
3831
3832
354k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
3833
354k
            rBuffer = _mm_mulhi_epi16(x3, r0);
3834
354k
            t7 = _mm_mullo_epi16(x4, r1);
3835
354k
            t3 = _mm_unpacklo_epi16(t8, rBuffer);
3836
354k
            x3 = _mm_unpackhi_epi16(t8, rBuffer);
3837
3838
354k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
3839
354k
            rBuffer = _mm_mulhi_epi16(x4, r1);
3840
354k
            t8 = _mm_mullo_epi16(x5, r0);
3841
354k
            t4 = _mm_unpacklo_epi16(t7, rBuffer);
3842
354k
            x4 = _mm_unpackhi_epi16(t7, rBuffer);
3843
3844
354k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
3845
354k
            rBuffer = _mm_mulhi_epi16(x5, r0);
3846
354k
            t7 = _mm_mullo_epi16(x6, r1);
3847
354k
            t5 = _mm_unpacklo_epi16(t8, rBuffer);
3848
354k
            x5 = _mm_unpackhi_epi16(t8, rBuffer);
3849
3850
354k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
3851
354k
            rBuffer = _mm_mulhi_epi16(x6, r1);
3852
354k
            t8 = _mm_mullo_epi16(x7, r0);
3853
354k
            t6 = _mm_unpacklo_epi16(t7, rBuffer);
3854
354k
            x6 = _mm_unpackhi_epi16(t7, rBuffer);
3855
3856
354k
            rBuffer = _mm_mulhi_epi16(x7, r0);
3857
354k
            t7 = _mm_unpacklo_epi16(t8, rBuffer);
3858
354k
            x7 = _mm_unpackhi_epi16(t8, rBuffer);
3859
3860
354k
            t8 = _mm_unpacklo_epi16(
3861
354k
                    _mm_mullo_epi16(x8,
3862
354k
                            _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
3863
354k
                            _mm_mulhi_epi16(x8,
3864
354k
                                    _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
3865
354k
            x8 = _mm_unpackhi_epi16(
3866
354k
                    _mm_mullo_epi16(x8,
3867
354k
                            _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
3868
354k
                            _mm_mulhi_epi16(x8,
3869
354k
                                    _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
3870
3871
            /* add calculus by correct value : */
3872
3873
354k
            x3 = _mm_add_epi32(x3, x4);
3874
354k
            x5 = _mm_add_epi32(x5, x6);
3875
354k
            r1 = _mm_add_epi32(x2, x3);
3876
354k
            x7 = _mm_add_epi32(x7, x8);
3877
354k
            r1 = _mm_add_epi32(r1, x5);
3878
3879
354k
            t3 = _mm_add_epi32(t3, t4);
3880
354k
            t5 = _mm_add_epi32(t5, t6);
3881
354k
            r0 = _mm_add_epi32(t2, t3);
3882
354k
            t7 = _mm_add_epi32(t7, t8);
3883
354k
            r0 = _mm_add_epi32(r0, t5);
3884
354k
            r1 = _mm_add_epi32(r1, x7);
3885
354k
            r0 = _mm_add_epi32(r0, t7);
3886
354k
            r1 = _mm_srli_epi32(r1, 6);
3887
354k
            r0 = _mm_srli_epi32(r0, 6);
3888
3889
354k
            r1 = _mm_and_si128(r1,
3890
354k
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3891
354k
            r0 = _mm_and_si128(r0,
3892
354k
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3893
354k
            r0 = _mm_hadd_epi16(r0, r1);
3894
354k
            _mm_store_si128((__m128i *) &dst[x], r0);
3895
3896
354k
        }
3897
253k
        tmp += MAX_PB_SIZE;
3898
253k
        dst += dststride;
3899
253k
    }
3900
28.6k
}
3901
void ff_hevc_put_hevc_qpel_h_2_v_1_sse(int16_t *dst, ptrdiff_t dststride,
3902
                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
3903
17.4k
        int16_t* mcbuffer) {
3904
17.4k
    int x, y;
3905
17.4k
    uint8_t *src = (uint8_t*) _src;
3906
17.4k
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
3907
17.4k
    int16_t *tmp = mcbuffer;
3908
17.4k
    __m128i x1, x2, x3, x4, x5, x6, x7, rBuffer, rTemp, r0, r1;
3909
17.4k
    __m128i t1, t2, t3, t4, t5, t6, t7, t8;
3910
3911
17.4k
    src -= qpel_extra_before[1] * srcstride;
3912
17.4k
    r0 = _mm_set_epi8(-1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11,
3913
17.4k
            4, -1);
3914
3915
    /* LOAD src from memory to registers to limit memory bandwidth */
3916
17.4k
    if (width == 4) {
3917
3918
24.5k
        for (y = 0; y < height + qpel_extra[1]; y += 2) {
3919
            /* load data in register     */
3920
21.5k
            x1 = _mm_loadu_si128((__m128i *) &src[-3]);
3921
21.5k
            src += srcstride;
3922
21.5k
            t1 = _mm_loadu_si128((__m128i *) &src[-3]);
3923
21.5k
            x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3924
21.5k
            t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
3925
21.5k
            x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3926
21.5k
                    _mm_srli_si128(x1, 3));
3927
21.5k
            t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
3928
21.5k
                    _mm_srli_si128(t1, 3));
3929
3930
            /*  PMADDUBSW then PMADDW     */
3931
21.5k
            x2 = _mm_maddubs_epi16(x2, r0);
3932
21.5k
            t2 = _mm_maddubs_epi16(t2, r0);
3933
21.5k
            x3 = _mm_maddubs_epi16(x3, r0);
3934
21.5k
            t3 = _mm_maddubs_epi16(t3, r0);
3935
21.5k
            x2 = _mm_hadd_epi16(x2, x3);
3936
21.5k
            t2 = _mm_hadd_epi16(t2, t3);
3937
21.5k
            x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
3938
21.5k
            t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
3939
21.5k
            x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
3940
21.5k
            t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
3941
            /* give results back            */
3942
21.5k
            _mm_storel_epi64((__m128i *) &tmp[0], x2);
3943
3944
21.5k
            tmp += MAX_PB_SIZE;
3945
21.5k
            _mm_storel_epi64((__m128i *) &tmp[0], t2);
3946
3947
21.5k
            src += srcstride;
3948
21.5k
            tmp += MAX_PB_SIZE;
3949
21.5k
        }
3950
3.00k
    } else
3951
243k
        for (y = 0; y < height + qpel_extra[1]; y++) {
3952
562k
            for (x = 0; x < width; x += 8) {
3953
                /* load data in register     */
3954
334k
                x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
3955
334k
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3956
334k
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3957
334k
                        _mm_srli_si128(x1, 3));
3958
334k
                x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
3959
334k
                        _mm_srli_si128(x1, 5));
3960
334k
                x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
3961
334k
                        _mm_srli_si128(x1, 7));
3962
3963
                /*  PMADDUBSW then PMADDW     */
3964
334k
                x2 = _mm_maddubs_epi16(x2, r0);
3965
334k
                x3 = _mm_maddubs_epi16(x3, r0);
3966
334k
                x4 = _mm_maddubs_epi16(x4, r0);
3967
334k
                x5 = _mm_maddubs_epi16(x5, r0);
3968
334k
                x2 = _mm_hadd_epi16(x2, x3);
3969
334k
                x4 = _mm_hadd_epi16(x4, x5);
3970
334k
                x2 = _mm_hadd_epi16(x2, x4);
3971
334k
                x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
3972
3973
                /* give results back            */
3974
334k
                _mm_store_si128((__m128i *) &tmp[x], x2);
3975
3976
334k
            }
3977
228k
            src += srcstride;
3978
228k
            tmp += MAX_PB_SIZE;
3979
228k
        }
3980
3981
17.4k
    tmp = mcbuffer + qpel_extra_before[1] * MAX_PB_SIZE;
3982
17.4k
    srcstride = MAX_PB_SIZE;
3983
3984
    /* vertical treatment on temp table : tmp contains 16 bit values, so need to use 32 bit  integers
3985
     for register calculations */
3986
17.4k
    rTemp = _mm_set_epi16(0, 1, -5, 17, 58, -10, 4, -1);
3987
184k
    for (y = 0; y < height; y++) {
3988
416k
        for (x = 0; x < width; x += 8) {
3989
3990
249k
            x1 = _mm_load_si128((__m128i *) &tmp[x - 3 * srcstride]);
3991
249k
            x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
3992
249k
            x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
3993
249k
            x4 = _mm_load_si128((__m128i *) &tmp[x]);
3994
249k
            x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
3995
249k
            x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
3996
249k
            x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
3997
3998
249k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 0));
3999
249k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
4000
249k
            t8 = _mm_mullo_epi16(x1, r0);
4001
249k
            rBuffer = _mm_mulhi_epi16(x1, r0);
4002
249k
            t7 = _mm_mullo_epi16(x2, r1);
4003
249k
            t1 = _mm_unpacklo_epi16(t8, rBuffer);
4004
249k
            x1 = _mm_unpackhi_epi16(t8, rBuffer);
4005
4006
249k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
4007
249k
            rBuffer = _mm_mulhi_epi16(x2, r1);
4008
249k
            t8 = _mm_mullo_epi16(x3, r0);
4009
249k
            t2 = _mm_unpacklo_epi16(t7, rBuffer);
4010
249k
            x2 = _mm_unpackhi_epi16(t7, rBuffer);
4011
4012
249k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
4013
249k
            rBuffer = _mm_mulhi_epi16(x3, r0);
4014
249k
            t7 = _mm_mullo_epi16(x4, r1);
4015
249k
            t3 = _mm_unpacklo_epi16(t8, rBuffer);
4016
249k
            x3 = _mm_unpackhi_epi16(t8, rBuffer);
4017
4018
249k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
4019
249k
            rBuffer = _mm_mulhi_epi16(x4, r1);
4020
249k
            t8 = _mm_mullo_epi16(x5, r0);
4021
249k
            t4 = _mm_unpacklo_epi16(t7, rBuffer);
4022
249k
            x4 = _mm_unpackhi_epi16(t7, rBuffer);
4023
4024
249k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
4025
249k
            rBuffer = _mm_mulhi_epi16(x5, r0);
4026
249k
            t7 = _mm_mullo_epi16(x6, r1);
4027
249k
            t5 = _mm_unpacklo_epi16(t8, rBuffer);
4028
249k
            x5 = _mm_unpackhi_epi16(t8, rBuffer);
4029
4030
249k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
4031
249k
            rBuffer = _mm_mulhi_epi16(x6, r1);
4032
249k
            t8 = _mm_mullo_epi16(x7, r0);
4033
249k
            t6 = _mm_unpacklo_epi16(t7, rBuffer);
4034
249k
            x6 = _mm_unpackhi_epi16(t7, rBuffer);
4035
4036
249k
            rBuffer = _mm_mulhi_epi16(x7, r0);
4037
249k
            t7 = _mm_unpacklo_epi16(t8, rBuffer);
4038
249k
            x7 = _mm_unpackhi_epi16(t8, rBuffer);
4039
4040
4041
4042
            /* add calculus by correct value : */
4043
4044
249k
            r1 = _mm_add_epi32(x1, x2);
4045
249k
            x3 = _mm_add_epi32(x3, x4);
4046
249k
            x5 = _mm_add_epi32(x5, x6);
4047
249k
            r1 = _mm_add_epi32(r1, x3);
4048
249k
            r1 = _mm_add_epi32(r1, x5);
4049
4050
249k
            r0 = _mm_add_epi32(t1, t2);
4051
249k
            t3 = _mm_add_epi32(t3, t4);
4052
249k
            t5 = _mm_add_epi32(t5, t6);
4053
249k
            r0 = _mm_add_epi32(r0, t3);
4054
249k
            r0 = _mm_add_epi32(r0, t5);
4055
249k
            r1 = _mm_add_epi32(r1, x7);
4056
249k
            r0 = _mm_add_epi32(r0, t7);
4057
249k
            r1 = _mm_srli_epi32(r1, 6);
4058
249k
            r0 = _mm_srli_epi32(r0, 6);
4059
4060
249k
            r1 = _mm_and_si128(r1,
4061
249k
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
4062
249k
            r0 = _mm_and_si128(r0,
4063
249k
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
4064
249k
            r0 = _mm_hadd_epi16(r0, r1);
4065
249k
            _mm_store_si128((__m128i *) &dst[x], r0);
4066
4067
249k
        }
4068
167k
        tmp += MAX_PB_SIZE;
4069
167k
        dst += dststride;
4070
167k
    }
4071
17.4k
}
4072
void ff_hevc_put_hevc_qpel_h_2_v_2_sse(int16_t *dst, ptrdiff_t dststride,
4073
                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
4074
49.2k
        int16_t* mcbuffer) {
4075
49.2k
    int x, y;
4076
49.2k
    uint8_t *src = (uint8_t*) _src;
4077
49.2k
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
4078
49.2k
    int16_t *tmp = mcbuffer;
4079
49.2k
    __m128i x1, x2, x3, x4, x5, x6, x7, x8, rBuffer, rTemp, r0, r1;
4080
49.2k
    __m128i t1, t2, t3, t4, t5, t6, t7, t8;
4081
4082
49.2k
    src -= qpel_extra_before[2] * srcstride;
4083
49.2k
    r0 = _mm_set_epi8(-1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11,
4084
49.2k
            4, -1);
4085
4086
    /* LOAD src from memory to registers to limit memory bandwidth */
4087
49.2k
    if (width == 4) {
4088
4089
98.5k
        for (y = 0; y < height + qpel_extra[2]; y += 2) {
4090
            /* load data in register     */
4091
87.6k
            x1 = _mm_loadu_si128((__m128i *) &src[-3]);
4092
87.6k
            src += srcstride;
4093
87.6k
            t1 = _mm_loadu_si128((__m128i *) &src[-3]);
4094
87.6k
            x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4095
87.6k
            t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
4096
87.6k
            x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4097
87.6k
                    _mm_srli_si128(x1, 3));
4098
87.6k
            t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
4099
87.6k
                    _mm_srli_si128(t1, 3));
4100
4101
            /*  PMADDUBSW then PMADDW     */
4102
87.6k
            x2 = _mm_maddubs_epi16(x2, r0);
4103
87.6k
            t2 = _mm_maddubs_epi16(t2, r0);
4104
87.6k
            x3 = _mm_maddubs_epi16(x3, r0);
4105
87.6k
            t3 = _mm_maddubs_epi16(t3, r0);
4106
87.6k
            x2 = _mm_hadd_epi16(x2, x3);
4107
87.6k
            t2 = _mm_hadd_epi16(t2, t3);
4108
87.6k
            x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
4109
87.6k
            t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
4110
87.6k
            x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
4111
87.6k
            t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
4112
            /* give results back            */
4113
87.6k
            _mm_storel_epi64((__m128i *) &tmp[0], x2);
4114
4115
87.6k
            tmp += MAX_PB_SIZE;
4116
87.6k
            _mm_storel_epi64((__m128i *) &tmp[0], t2);
4117
4118
87.6k
            src += srcstride;
4119
87.6k
            tmp += MAX_PB_SIZE;
4120
87.6k
        }
4121
10.9k
    } else
4122
703k
        for (y = 0; y < height + qpel_extra[2]; y++) {
4123
1.67M
            for (x = 0; x < width; x += 8) {
4124
                /* load data in register     */
4125
1.00M
                x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
4126
1.00M
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4127
1.00M
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4128
1.00M
                        _mm_srli_si128(x1, 3));
4129
1.00M
                x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
4130
1.00M
                        _mm_srli_si128(x1, 5));
4131
1.00M
                x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
4132
1.00M
                        _mm_srli_si128(x1, 7));
4133
4134
                /*  PMADDUBSW then PMADDW     */
4135
1.00M
                x2 = _mm_maddubs_epi16(x2, r0);
4136
1.00M
                x3 = _mm_maddubs_epi16(x3, r0);
4137
1.00M
                x4 = _mm_maddubs_epi16(x4, r0);
4138
1.00M
                x5 = _mm_maddubs_epi16(x5, r0);
4139
1.00M
                x2 = _mm_hadd_epi16(x2, x3);
4140
1.00M
                x4 = _mm_hadd_epi16(x4, x5);
4141
1.00M
                x2 = _mm_hadd_epi16(x2, x4);
4142
1.00M
                x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
4143
4144
                /* give results back            */
4145
1.00M
                _mm_store_si128((__m128i *) &tmp[x], x2);
4146
4147
1.00M
            }
4148
664k
            src += srcstride;
4149
664k
            tmp += MAX_PB_SIZE;
4150
664k
        }
4151
4152
49.2k
    tmp = mcbuffer + qpel_extra_before[2] * MAX_PB_SIZE;
4153
49.2k
    srcstride = MAX_PB_SIZE;
4154
4155
    /* vertical treatment on temp table : tmp contains 16 bit values, so need to use 32 bit  integers
4156
     for register calculations */
4157
49.2k
    rTemp = _mm_set_epi16(-1, 4, -11, 40, 40, -11, 4, -1);
4158
533k
    for (y = 0; y < height; y++) {
4159
1.23M
        for (x = 0; x < width; x += 8) {
4160
4161
753k
            x1 = _mm_load_si128((__m128i *) &tmp[x - 3 * srcstride]);
4162
753k
            x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
4163
753k
            x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
4164
753k
            x4 = _mm_load_si128((__m128i *) &tmp[x]);
4165
753k
            x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
4166
753k
            x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
4167
753k
            x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
4168
753k
            x8 = _mm_load_si128((__m128i *) &tmp[x + 4 * srcstride]);
4169
4170
753k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 0));
4171
753k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
4172
753k
            t8 = _mm_mullo_epi16(x1, r0);
4173
753k
            rBuffer = _mm_mulhi_epi16(x1, r0);
4174
753k
            t7 = _mm_mullo_epi16(x2, r1);
4175
753k
            t1 = _mm_unpacklo_epi16(t8, rBuffer);
4176
753k
            x1 = _mm_unpackhi_epi16(t8, rBuffer);
4177
4178
753k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
4179
753k
            rBuffer = _mm_mulhi_epi16(x2, r1);
4180
753k
            t8 = _mm_mullo_epi16(x3, r0);
4181
753k
            t2 = _mm_unpacklo_epi16(t7, rBuffer);
4182
753k
            x2 = _mm_unpackhi_epi16(t7, rBuffer);
4183
4184
753k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
4185
753k
            rBuffer = _mm_mulhi_epi16(x3, r0);
4186
753k
            t7 = _mm_mullo_epi16(x4, r1);
4187
753k
            t3 = _mm_unpacklo_epi16(t8, rBuffer);
4188
753k
            x3 = _mm_unpackhi_epi16(t8, rBuffer);
4189
4190
753k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
4191
753k
            rBuffer = _mm_mulhi_epi16(x4, r1);
4192
753k
            t8 = _mm_mullo_epi16(x5, r0);
4193
753k
            t4 = _mm_unpacklo_epi16(t7, rBuffer);
4194
753k
            x4 = _mm_unpackhi_epi16(t7, rBuffer);
4195
4196
753k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
4197
753k
            rBuffer = _mm_mulhi_epi16(x5, r0);
4198
753k
            t7 = _mm_mullo_epi16(x6, r1);
4199
753k
            t5 = _mm_unpacklo_epi16(t8, rBuffer);
4200
753k
            x5 = _mm_unpackhi_epi16(t8, rBuffer);
4201
4202
753k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
4203
753k
            rBuffer = _mm_mulhi_epi16(x6, r1);
4204
753k
            t8 = _mm_mullo_epi16(x7, r0);
4205
753k
            t6 = _mm_unpacklo_epi16(t7, rBuffer);
4206
753k
            x6 = _mm_unpackhi_epi16(t7, rBuffer);
4207
4208
753k
            rBuffer = _mm_mulhi_epi16(x7, r0);
4209
753k
            t7 = _mm_unpacklo_epi16(t8, rBuffer);
4210
753k
            x7 = _mm_unpackhi_epi16(t8, rBuffer);
4211
4212
753k
            t8 = _mm_unpacklo_epi16(
4213
753k
                    _mm_mullo_epi16(x8,
4214
753k
                            _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4215
753k
                            _mm_mulhi_epi16(x8,
4216
753k
                                    _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4217
753k
            x8 = _mm_unpackhi_epi16(
4218
753k
                    _mm_mullo_epi16(x8,
4219
753k
                            _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4220
753k
                            _mm_mulhi_epi16(x8,
4221
753k
                                    _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4222
4223
            /* add calculus by correct value : */
4224
4225
753k
            r1 = _mm_add_epi32(x1, x2);
4226
753k
            x3 = _mm_add_epi32(x3, x4);
4227
753k
            x5 = _mm_add_epi32(x5, x6);
4228
753k
            r1 = _mm_add_epi32(r1, x3);
4229
753k
            x7 = _mm_add_epi32(x7, x8);
4230
753k
            r1 = _mm_add_epi32(r1, x5);
4231
4232
753k
            r0 = _mm_add_epi32(t1, t2);
4233
753k
            t3 = _mm_add_epi32(t3, t4);
4234
753k
            t5 = _mm_add_epi32(t5, t6);
4235
753k
            r0 = _mm_add_epi32(r0, t3);
4236
753k
            t7 = _mm_add_epi32(t7, t8);
4237
753k
            r0 = _mm_add_epi32(r0, t5);
4238
753k
            r1 = _mm_add_epi32(r1, x7);
4239
753k
            r0 = _mm_add_epi32(r0, t7);
4240
753k
            r1 = _mm_srli_epi32(r1, 6);
4241
753k
            r0 = _mm_srli_epi32(r0, 6);
4242
4243
753k
            r1 = _mm_and_si128(r1,
4244
753k
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
4245
753k
            r0 = _mm_and_si128(r0,
4246
753k
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
4247
753k
            r0 = _mm_hadd_epi16(r0, r1);
4248
753k
            _mm_store_si128((__m128i *) &dst[x], r0);
4249
4250
753k
        }
4251
484k
        tmp += MAX_PB_SIZE;
4252
484k
        dst += dststride;
4253
484k
    }
4254
49.2k
}
4255
void ff_hevc_put_hevc_qpel_h_2_v_3_sse(int16_t *dst, ptrdiff_t dststride,
4256
                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
4257
18.2k
        int16_t* mcbuffer) {
4258
18.2k
    int x, y;
4259
18.2k
    uint8_t *src = (uint8_t*) _src;
4260
18.2k
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
4261
18.2k
    int16_t *tmp = mcbuffer;
4262
18.2k
    __m128i x1, x2, x3, x4, x5, x6, x7, x8, rBuffer, rTemp, r0, r1;
4263
18.2k
    __m128i t1, t2, t3, t4, t5, t6, t7, t8;
4264
4265
18.2k
    src -= qpel_extra_before[3] * srcstride;
4266
18.2k
    r0 = _mm_set_epi8(-1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11,
4267
18.2k
            4, -1);
4268
4269
    /* LOAD src from memory to registers to limit memory bandwidth */
4270
18.2k
    if (width == 4) {
4271
4272
31.9k
        for (y = 0; y < height + qpel_extra[3]; y += 2) {
4273
            /* load data in register     */
4274
27.9k
            x1 = _mm_loadu_si128((__m128i *) &src[-3]);
4275
27.9k
            src += srcstride;
4276
27.9k
            t1 = _mm_loadu_si128((__m128i *) &src[-3]);
4277
27.9k
            x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4278
27.9k
            t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
4279
27.9k
            x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4280
27.9k
                    _mm_srli_si128(x1, 3));
4281
27.9k
            t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
4282
27.9k
                    _mm_srli_si128(t1, 3));
4283
4284
            /*  PMADDUBSW then PMADDW     */
4285
27.9k
            x2 = _mm_maddubs_epi16(x2, r0);
4286
27.9k
            t2 = _mm_maddubs_epi16(t2, r0);
4287
27.9k
            x3 = _mm_maddubs_epi16(x3, r0);
4288
27.9k
            t3 = _mm_maddubs_epi16(t3, r0);
4289
27.9k
            x2 = _mm_hadd_epi16(x2, x3);
4290
27.9k
            t2 = _mm_hadd_epi16(t2, t3);
4291
27.9k
            x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
4292
27.9k
            t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
4293
27.9k
            x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
4294
27.9k
            t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
4295
            /* give results back            */
4296
27.9k
            _mm_storel_epi64((__m128i *) &tmp[0], x2);
4297
4298
27.9k
            tmp += MAX_PB_SIZE;
4299
27.9k
            _mm_storel_epi64((__m128i *) &tmp[0], t2);
4300
4301
27.9k
            src += srcstride;
4302
27.9k
            tmp += MAX_PB_SIZE;
4303
27.9k
        }
4304
3.95k
    } else
4305
243k
        for (y = 0; y < height + qpel_extra[3]; y++) {
4306
574k
            for (x = 0; x < width; x += 8) {
4307
                /* load data in register     */
4308
344k
                x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
4309
344k
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4310
344k
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4311
344k
                        _mm_srli_si128(x1, 3));
4312
344k
                x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
4313
344k
                        _mm_srli_si128(x1, 5));
4314
344k
                x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
4315
344k
                        _mm_srli_si128(x1, 7));
4316
4317
                /*  PMADDUBSW then PMADDW     */
4318
344k
                x2 = _mm_maddubs_epi16(x2, r0);
4319
344k
                x3 = _mm_maddubs_epi16(x3, r0);
4320
344k
                x4 = _mm_maddubs_epi16(x4, r0);
4321
344k
                x5 = _mm_maddubs_epi16(x5, r0);
4322
344k
                x2 = _mm_hadd_epi16(x2, x3);
4323
344k
                x4 = _mm_hadd_epi16(x4, x5);
4324
344k
                x2 = _mm_hadd_epi16(x2, x4);
4325
344k
                x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
4326
4327
                /* give results back            */
4328
344k
                _mm_store_si128((__m128i *) &tmp[x], x2);
4329
4330
344k
            }
4331
229k
            src += srcstride;
4332
229k
            tmp += MAX_PB_SIZE;
4333
229k
        }
4334
4335
18.2k
    tmp = mcbuffer + qpel_extra_before[3] * MAX_PB_SIZE;
4336
18.2k
    srcstride = MAX_PB_SIZE;
4337
4338
    /* vertical treatment on temp table : tmp contains 16 bit values, so need to use 32 bit  integers
4339
     for register calculations */
4340
18.2k
    rTemp = _mm_set_epi16(-1, 4, -10, 58, 17, -5, 1, 0);
4341
194k
    for (y = 0; y < height; y++) {
4342
442k
        for (x = 0; x < width; x += 8) {
4343
4344
266k
            x1 = _mm_setzero_si128();
4345
266k
            x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
4346
266k
            x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
4347
266k
            x4 = _mm_load_si128((__m128i *) &tmp[x]);
4348
266k
            x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
4349
266k
            x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
4350
266k
            x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
4351
266k
            x8 = _mm_load_si128((__m128i *) &tmp[x + 4 * srcstride]);
4352
4353
266k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
4354
4355
266k
            t7 = _mm_mullo_epi16(x2, r1);
4356
4357
4358
266k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
4359
266k
            rBuffer = _mm_mulhi_epi16(x2, r1);
4360
266k
            t8 = _mm_mullo_epi16(x3, r0);
4361
266k
            t2 = _mm_unpacklo_epi16(t7, rBuffer);
4362
266k
            x2 = _mm_unpackhi_epi16(t7, rBuffer);
4363
4364
266k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
4365
266k
            rBuffer = _mm_mulhi_epi16(x3, r0);
4366
266k
            t7 = _mm_mullo_epi16(x4, r1);
4367
266k
            t3 = _mm_unpacklo_epi16(t8, rBuffer);
4368
266k
            x3 = _mm_unpackhi_epi16(t8, rBuffer);
4369
4370
266k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
4371
266k
            rBuffer = _mm_mulhi_epi16(x4, r1);
4372
266k
            t8 = _mm_mullo_epi16(x5, r0);
4373
266k
            t4 = _mm_unpacklo_epi16(t7, rBuffer);
4374
266k
            x4 = _mm_unpackhi_epi16(t7, rBuffer);
4375
4376
266k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
4377
266k
            rBuffer = _mm_mulhi_epi16(x5, r0);
4378
266k
            t7 = _mm_mullo_epi16(x6, r1);
4379
266k
            t5 = _mm_unpacklo_epi16(t8, rBuffer);
4380
266k
            x5 = _mm_unpackhi_epi16(t8, rBuffer);
4381
4382
266k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
4383
266k
            rBuffer = _mm_mulhi_epi16(x6, r1);
4384
266k
            t8 = _mm_mullo_epi16(x7, r0);
4385
266k
            t6 = _mm_unpacklo_epi16(t7, rBuffer);
4386
266k
            x6 = _mm_unpackhi_epi16(t7, rBuffer);
4387
4388
266k
            rBuffer = _mm_mulhi_epi16(x7, r0);
4389
266k
            t7 = _mm_unpacklo_epi16(t8, rBuffer);
4390
266k
            x7 = _mm_unpackhi_epi16(t8, rBuffer);
4391
4392
266k
            t8 = _mm_unpacklo_epi16(
4393
266k
                    _mm_mullo_epi16(x8,
4394
266k
                            _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4395
266k
                            _mm_mulhi_epi16(x8,
4396
266k
                                    _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4397
266k
            x8 = _mm_unpackhi_epi16(
4398
266k
                    _mm_mullo_epi16(x8,
4399
266k
                            _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4400
266k
                            _mm_mulhi_epi16(x8,
4401
266k
                                    _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4402
4403
            /* add calculus by correct value : */
4404
4405
266k
            x3 = _mm_add_epi32(x3, x4);
4406
266k
            x5 = _mm_add_epi32(x5, x6);
4407
266k
            r1 = _mm_add_epi32(x2, x3);
4408
266k
            x7 = _mm_add_epi32(x7, x8);
4409
266k
            r1 = _mm_add_epi32(r1, x5);
4410
4411
266k
            t3 = _mm_add_epi32(t3, t4);
4412
266k
            t5 = _mm_add_epi32(t5, t6);
4413
266k
            r0 = _mm_add_epi32(t2, t3);
4414
266k
            t7 = _mm_add_epi32(t7, t8);
4415
266k
            r0 = _mm_add_epi32(r0, t5);
4416
266k
            r1 = _mm_add_epi32(r1, x7);
4417
266k
            r0 = _mm_add_epi32(r0, t7);
4418
266k
            r1 = _mm_srli_epi32(r1, 6);
4419
266k
            r0 = _mm_srli_epi32(r0, 6);
4420
4421
266k
            r1 = _mm_and_si128(r1,
4422
266k
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
4423
266k
            r0 = _mm_and_si128(r0,
4424
266k
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
4425
266k
            r0 = _mm_hadd_epi16(r0, r1);
4426
266k
            _mm_store_si128((__m128i *) &dst[x], r0);
4427
4428
266k
        }
4429
176k
        tmp += MAX_PB_SIZE;
4430
176k
        dst += dststride;
4431
176k
    }
4432
18.2k
}
4433
void ff_hevc_put_hevc_qpel_h_3_v_1_sse(int16_t *dst, ptrdiff_t dststride,
4434
                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
4435
24.5k
        int16_t* mcbuffer) {
4436
24.5k
    int x, y;
4437
24.5k
    uint8_t *src = (uint8_t*) _src;
4438
24.5k
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
4439
24.5k
    int16_t *tmp = mcbuffer;
4440
24.5k
    __m128i x1, x2, x3, x4, x5, x6, x7, rBuffer, rTemp, r0, r1;
4441
24.5k
    __m128i t1, t2, t3, t4, t5, t6, t7, t8;
4442
4443
24.5k
    src -= qpel_extra_before[1] * srcstride;
4444
24.5k
    r0 = _mm_set_epi8(-1, 4, -10, 58, 17, -5, 1, 0, -1, 4, -10, 58, 17, -5, 1,
4445
24.5k
            0);
4446
4447
    /* LOAD src from memory to registers to limit memory bandwidth */
4448
24.5k
    if (width == 4) {
4449
4450
22.7k
        for (y = 0; y < height + qpel_extra[1]; y += 2) {
4451
            /* load data in register     */
4452
19.9k
            x1 = _mm_loadu_si128((__m128i *) &src[-2]);
4453
19.9k
            x1 = _mm_slli_si128(x1, 1);
4454
19.9k
            src += srcstride;
4455
19.9k
            t1 = _mm_loadu_si128((__m128i *) &src[-2]);
4456
19.9k
            t1 = _mm_slli_si128(t1, 1);
4457
19.9k
            x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4458
19.9k
            t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
4459
19.9k
            x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4460
19.9k
                    _mm_srli_si128(x1, 3));
4461
19.9k
            t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
4462
19.9k
                    _mm_srli_si128(t1, 3));
4463
4464
            /*  PMADDUBSW then PMADDW     */
4465
19.9k
            x2 = _mm_maddubs_epi16(x2, r0);
4466
19.9k
            t2 = _mm_maddubs_epi16(t2, r0);
4467
19.9k
            x3 = _mm_maddubs_epi16(x3, r0);
4468
19.9k
            t3 = _mm_maddubs_epi16(t3, r0);
4469
19.9k
            x2 = _mm_hadd_epi16(x2, x3);
4470
19.9k
            t2 = _mm_hadd_epi16(t2, t3);
4471
19.9k
            x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
4472
19.9k
            t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
4473
19.9k
            x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
4474
19.9k
            t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
4475
            /* give results back            */
4476
19.9k
            _mm_storel_epi64((__m128i *) &tmp[0], x2);
4477
4478
19.9k
            tmp += MAX_PB_SIZE;
4479
19.9k
            _mm_storel_epi64((__m128i *) &tmp[0], t2);
4480
4481
19.9k
            src += srcstride;
4482
19.9k
            tmp += MAX_PB_SIZE;
4483
19.9k
        }
4484
2.82k
    } else
4485
358k
        for (y = 0; y < height + qpel_extra[1]; y++) {
4486
817k
            for (x = 0; x < width; x += 8) {
4487
                /* load data in register     */
4488
480k
                x1 = _mm_loadu_si128((__m128i *) &src[x - 2]);
4489
480k
                x1 = _mm_slli_si128(x1, 1);
4490
480k
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4491
480k
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4492
480k
                        _mm_srli_si128(x1, 3));
4493
480k
                x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
4494
480k
                        _mm_srli_si128(x1, 5));
4495
480k
                x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
4496
480k
                        _mm_srli_si128(x1, 7));
4497
4498
                /*  PMADDUBSW then PMADDW     */
4499
480k
                x2 = _mm_maddubs_epi16(x2, r0);
4500
480k
                x3 = _mm_maddubs_epi16(x3, r0);
4501
480k
                x4 = _mm_maddubs_epi16(x4, r0);
4502
480k
                x5 = _mm_maddubs_epi16(x5, r0);
4503
480k
                x2 = _mm_hadd_epi16(x2, x3);
4504
480k
                x4 = _mm_hadd_epi16(x4, x5);
4505
480k
                x2 = _mm_hadd_epi16(x2, x4);
4506
480k
                x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
4507
4508
                /* give results back            */
4509
480k
                _mm_store_si128((__m128i *) &tmp[x], x2);
4510
4511
480k
            }
4512
336k
            src += srcstride;
4513
336k
            tmp += MAX_PB_SIZE;
4514
336k
        }
4515
4516
24.5k
    tmp = mcbuffer + qpel_extra_before[1] * MAX_PB_SIZE;
4517
24.5k
    srcstride = MAX_PB_SIZE;
4518
4519
    /* vertical treatment on temp table : tmp contains 16 bit values, so need to use 32 bit  integers
4520
     for register calculations */
4521
24.5k
    rTemp = _mm_set_epi16(0, 1, -5, 17, 58, -10, 4, -1);
4522
254k
    for (y = 0; y < height; y++) {
4523
571k
        for (x = 0; x < width; x += 8) {
4524
4525
341k
            x1 = _mm_load_si128((__m128i *) &tmp[x - 3 * srcstride]);
4526
341k
            x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
4527
341k
            x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
4528
341k
            x4 = _mm_load_si128((__m128i *) &tmp[x]);
4529
341k
            x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
4530
341k
            x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
4531
341k
            x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
4532
4533
341k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 0));
4534
341k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
4535
341k
            t8 = _mm_mullo_epi16(x1, r0);
4536
341k
            rBuffer = _mm_mulhi_epi16(x1, r0);
4537
341k
            t7 = _mm_mullo_epi16(x2, r1);
4538
341k
            t1 = _mm_unpacklo_epi16(t8, rBuffer);
4539
341k
            x1 = _mm_unpackhi_epi16(t8, rBuffer);
4540
4541
341k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
4542
341k
            rBuffer = _mm_mulhi_epi16(x2, r1);
4543
341k
            t8 = _mm_mullo_epi16(x3, r0);
4544
341k
            t2 = _mm_unpacklo_epi16(t7, rBuffer);
4545
341k
            x2 = _mm_unpackhi_epi16(t7, rBuffer);
4546
4547
341k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
4548
341k
            rBuffer = _mm_mulhi_epi16(x3, r0);
4549
341k
            t7 = _mm_mullo_epi16(x4, r1);
4550
341k
            t3 = _mm_unpacklo_epi16(t8, rBuffer);
4551
341k
            x3 = _mm_unpackhi_epi16(t8, rBuffer);
4552
4553
341k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
4554
341k
            rBuffer = _mm_mulhi_epi16(x4, r1);
4555
341k
            t8 = _mm_mullo_epi16(x5, r0);
4556
341k
            t4 = _mm_unpacklo_epi16(t7, rBuffer);
4557
341k
            x4 = _mm_unpackhi_epi16(t7, rBuffer);
4558
4559
341k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
4560
341k
            rBuffer = _mm_mulhi_epi16(x5, r0);
4561
341k
            t7 = _mm_mullo_epi16(x6, r1);
4562
341k
            t5 = _mm_unpacklo_epi16(t8, rBuffer);
4563
341k
            x5 = _mm_unpackhi_epi16(t8, rBuffer);
4564
4565
341k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
4566
341k
            rBuffer = _mm_mulhi_epi16(x6, r1);
4567
341k
            t8 = _mm_mullo_epi16(x7, r0);
4568
341k
            t6 = _mm_unpacklo_epi16(t7, rBuffer);
4569
341k
            x6 = _mm_unpackhi_epi16(t7, rBuffer);
4570
4571
341k
            rBuffer = _mm_mulhi_epi16(x7, r0);
4572
341k
            t7 = _mm_unpacklo_epi16(t8, rBuffer);
4573
341k
            x7 = _mm_unpackhi_epi16(t8, rBuffer);
4574
4575
4576
            /* add calculus by correct value : */
4577
4578
341k
            r1 = _mm_add_epi32(x1, x2);
4579
341k
            x3 = _mm_add_epi32(x3, x4);
4580
341k
            x5 = _mm_add_epi32(x5, x6);
4581
341k
            r1 = _mm_add_epi32(r1, x3);
4582
341k
            r1 = _mm_add_epi32(r1, x5);
4583
4584
341k
            r0 = _mm_add_epi32(t1, t2);
4585
341k
            t3 = _mm_add_epi32(t3, t4);
4586
341k
            t5 = _mm_add_epi32(t5, t6);
4587
341k
            r0 = _mm_add_epi32(r0, t3);
4588
341k
            r0 = _mm_add_epi32(r0, t5);
4589
341k
            r1 = _mm_add_epi32(r1, x7);
4590
341k
            r0 = _mm_add_epi32(r0, t7);
4591
341k
            r1 = _mm_srli_epi32(r1, 6);
4592
341k
            r0 = _mm_srli_epi32(r0, 6);
4593
4594
341k
            r1 = _mm_and_si128(r1,
4595
341k
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
4596
341k
            r0 = _mm_and_si128(r0,
4597
341k
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
4598
341k
            r0 = _mm_hadd_epi16(r0, r1);
4599
341k
            _mm_store_si128((__m128i *) &dst[x], r0);
4600
4601
341k
        }
4602
229k
        tmp += MAX_PB_SIZE;
4603
229k
        dst += dststride;
4604
229k
    }
4605
24.5k
}
4606
void ff_hevc_put_hevc_qpel_h_3_v_2_sse(int16_t *dst, ptrdiff_t dststride,
4607
                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
4608
15.7k
        int16_t* mcbuffer) {
4609
15.7k
    int x, y;
4610
15.7k
    uint8_t *src = (uint8_t*) _src;
4611
15.7k
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
4612
15.7k
    int16_t *tmp = mcbuffer;
4613
15.7k
    __m128i x1, x2, x3, x4, x5, x6, x7, x8, rBuffer, rTemp, r0, r1;
4614
15.7k
    __m128i t1, t2, t3, t4, t5, t6, t7, t8;
4615
4616
15.7k
    src -= qpel_extra_before[2] * srcstride;
4617
15.7k
    r0 = _mm_set_epi8(-1, 4, -10, 58, 17, -5, 1, 0, -1, 4, -10, 58, 17, -5, 1,
4618
15.7k
            0);
4619
4620
    /* LOAD src from memory to registers to limit memory bandwidth */
4621
15.7k
    if (width == 4) {
4622
4623
26.6k
        for (y = 0; y < height + qpel_extra[2]; y += 2) {
4624
            /* load data in register     */
4625
23.6k
            x1 = _mm_loadu_si128((__m128i *) &src[-2]);
4626
23.6k
            x1 = _mm_slli_si128(x1, 1);
4627
23.6k
            src += srcstride;
4628
23.6k
            t1 = _mm_loadu_si128((__m128i *) &src[-2]);
4629
23.6k
            t1 = _mm_slli_si128(t1, 1);
4630
23.6k
            x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4631
23.6k
            t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
4632
23.6k
            x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4633
23.6k
                    _mm_srli_si128(x1, 3));
4634
23.6k
            t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
4635
23.6k
                    _mm_srli_si128(t1, 3));
4636
4637
            /*  PMADDUBSW then PMADDW     */
4638
23.6k
            x2 = _mm_maddubs_epi16(x2, r0);
4639
23.6k
            t2 = _mm_maddubs_epi16(t2, r0);
4640
23.6k
            x3 = _mm_maddubs_epi16(x3, r0);
4641
23.6k
            t3 = _mm_maddubs_epi16(t3, r0);
4642
23.6k
            x2 = _mm_hadd_epi16(x2, x3);
4643
23.6k
            t2 = _mm_hadd_epi16(t2, t3);
4644
23.6k
            x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
4645
23.6k
            t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
4646
23.6k
            x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
4647
23.6k
            t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
4648
            /* give results back            */
4649
23.6k
            _mm_storel_epi64((__m128i *) &tmp[0], x2);
4650
4651
23.6k
            tmp += MAX_PB_SIZE;
4652
23.6k
            _mm_storel_epi64((__m128i *) &tmp[0], t2);
4653
4654
23.6k
            src += srcstride;
4655
23.6k
            tmp += MAX_PB_SIZE;
4656
23.6k
        }
4657
2.94k
    } else
4658
235k
        for (y = 0; y < height + qpel_extra[2]; y++) {
4659
582k
            for (x = 0; x < width; x += 8) {
4660
                /* load data in register     */
4661
360k
                x1 = _mm_loadu_si128((__m128i *) &src[x - 2]);
4662
360k
                x1 = _mm_slli_si128(x1, 1);
4663
360k
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4664
360k
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4665
360k
                        _mm_srli_si128(x1, 3));
4666
360k
                x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
4667
360k
                        _mm_srli_si128(x1, 5));
4668
360k
                x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
4669
360k
                        _mm_srli_si128(x1, 7));
4670
4671
                /*  PMADDUBSW then PMADDW     */
4672
360k
                x2 = _mm_maddubs_epi16(x2, r0);
4673
360k
                x3 = _mm_maddubs_epi16(x3, r0);
4674
360k
                x4 = _mm_maddubs_epi16(x4, r0);
4675
360k
                x5 = _mm_maddubs_epi16(x5, r0);
4676
360k
                x2 = _mm_hadd_epi16(x2, x3);
4677
360k
                x4 = _mm_hadd_epi16(x4, x5);
4678
360k
                x2 = _mm_hadd_epi16(x2, x4);
4679
360k
                x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
4680
4681
                /* give results back            */
4682
360k
                _mm_store_si128((__m128i *) &tmp[x], x2);
4683
4684
360k
            }
4685
222k
            src += srcstride;
4686
222k
            tmp += MAX_PB_SIZE;
4687
222k
        }
4688
4689
15.7k
    tmp = mcbuffer + qpel_extra_before[2] * MAX_PB_SIZE;
4690
15.7k
    srcstride = MAX_PB_SIZE;
4691
4692
    /* vertical treatment on temp table : tmp contains 16 bit values, so need to use 32 bit  integers
4693
     for register calculations */
4694
15.7k
    rTemp = _mm_set_epi16(-1, 4, -11, 40, 40, -11, 4, -1);
4695
172k
    for (y = 0; y < height; y++) {
4696
416k
        for (x = 0; x < width; x += 8) {
4697
4698
260k
            x1 = _mm_load_si128((__m128i *) &tmp[x - 3 * srcstride]);
4699
260k
            x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
4700
260k
            x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
4701
260k
            x4 = _mm_load_si128((__m128i *) &tmp[x]);
4702
260k
            x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
4703
260k
            x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
4704
260k
            x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
4705
260k
            x8 = _mm_load_si128((__m128i *) &tmp[x + 4 * srcstride]);
4706
4707
260k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 0));
4708
260k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
4709
260k
            t8 = _mm_mullo_epi16(x1, r0);
4710
260k
            rBuffer = _mm_mulhi_epi16(x1, r0);
4711
260k
            t7 = _mm_mullo_epi16(x2, r1);
4712
260k
            t1 = _mm_unpacklo_epi16(t8, rBuffer);
4713
260k
            x1 = _mm_unpackhi_epi16(t8, rBuffer);
4714
4715
260k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
4716
260k
            rBuffer = _mm_mulhi_epi16(x2, r1);
4717
260k
            t8 = _mm_mullo_epi16(x3, r0);
4718
260k
            t2 = _mm_unpacklo_epi16(t7, rBuffer);
4719
260k
            x2 = _mm_unpackhi_epi16(t7, rBuffer);
4720
4721
260k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
4722
260k
            rBuffer = _mm_mulhi_epi16(x3, r0);
4723
260k
            t7 = _mm_mullo_epi16(x4, r1);
4724
260k
            t3 = _mm_unpacklo_epi16(t8, rBuffer);
4725
260k
            x3 = _mm_unpackhi_epi16(t8, rBuffer);
4726
4727
260k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
4728
260k
            rBuffer = _mm_mulhi_epi16(x4, r1);
4729
260k
            t8 = _mm_mullo_epi16(x5, r0);
4730
260k
            t4 = _mm_unpacklo_epi16(t7, rBuffer);
4731
260k
            x4 = _mm_unpackhi_epi16(t7, rBuffer);
4732
4733
260k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
4734
260k
            rBuffer = _mm_mulhi_epi16(x5, r0);
4735
260k
            t7 = _mm_mullo_epi16(x6, r1);
4736
260k
            t5 = _mm_unpacklo_epi16(t8, rBuffer);
4737
260k
            x5 = _mm_unpackhi_epi16(t8, rBuffer);
4738
4739
260k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
4740
260k
            rBuffer = _mm_mulhi_epi16(x6, r1);
4741
260k
            t8 = _mm_mullo_epi16(x7, r0);
4742
260k
            t6 = _mm_unpacklo_epi16(t7, rBuffer);
4743
260k
            x6 = _mm_unpackhi_epi16(t7, rBuffer);
4744
4745
260k
            rBuffer = _mm_mulhi_epi16(x7, r0);
4746
260k
            t7 = _mm_unpacklo_epi16(t8, rBuffer);
4747
260k
            x7 = _mm_unpackhi_epi16(t8, rBuffer);
4748
4749
260k
            t8 = _mm_unpacklo_epi16(
4750
260k
                    _mm_mullo_epi16(x8,
4751
260k
                            _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4752
260k
                            _mm_mulhi_epi16(x8,
4753
260k
                                    _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4754
260k
            x8 = _mm_unpackhi_epi16(
4755
260k
                    _mm_mullo_epi16(x8,
4756
260k
                            _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4757
260k
                            _mm_mulhi_epi16(x8,
4758
260k
                                    _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4759
4760
            /* add calculus by correct value : */
4761
4762
260k
            r1 = _mm_add_epi32(x1, x2);
4763
260k
            x3 = _mm_add_epi32(x3, x4);
4764
260k
            x5 = _mm_add_epi32(x5, x6);
4765
260k
            r1 = _mm_add_epi32(r1, x3);
4766
260k
            x7 = _mm_add_epi32(x7, x8);
4767
260k
            r1 = _mm_add_epi32(r1, x5);
4768
4769
260k
            r0 = _mm_add_epi32(t1, t2);
4770
260k
            t3 = _mm_add_epi32(t3, t4);
4771
260k
            t5 = _mm_add_epi32(t5, t6);
4772
260k
            r0 = _mm_add_epi32(r0, t3);
4773
260k
            t7 = _mm_add_epi32(t7, t8);
4774
260k
            r0 = _mm_add_epi32(r0, t5);
4775
260k
            r1 = _mm_add_epi32(r1, x7);
4776
260k
            r0 = _mm_add_epi32(r0, t7);
4777
260k
            r1 = _mm_srli_epi32(r1, 6);
4778
260k
            r0 = _mm_srli_epi32(r0, 6);
4779
4780
260k
            r1 = _mm_and_si128(r1,
4781
260k
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
4782
260k
            r0 = _mm_and_si128(r0,
4783
260k
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
4784
260k
            r0 = _mm_hadd_epi16(r0, r1);
4785
260k
            _mm_store_si128((__m128i *) &dst[x], r0);
4786
4787
260k
        }
4788
156k
        tmp += MAX_PB_SIZE;
4789
156k
        dst += dststride;
4790
156k
    }
4791
15.7k
}
4792
void ff_hevc_put_hevc_qpel_h_3_v_3_sse(int16_t *dst, ptrdiff_t dststride,
4793
                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
4794
33.0k
        int16_t* mcbuffer) {
4795
33.0k
    int x, y;
4796
33.0k
    uint8_t *src = (uint8_t*) _src;
4797
33.0k
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
4798
33.0k
    int16_t *tmp = mcbuffer;
4799
33.0k
    __m128i x1, x2, x3, x4, x5, x6, x7, x8, rBuffer, rTemp, r0, r1;
4800
33.0k
    __m128i t1, t2, t3, t4, t5, t6, t7, t8;
4801
4802
33.0k
    src -= qpel_extra_before[3] * srcstride;
4803
33.0k
    r0 = _mm_set_epi8(-1, 4, -10, 58, 17, -5, 1, 0, -1, 4, -10, 58, 17, -5, 1,
4804
33.0k
            0);
4805
4806
    /* LOAD src from memory to registers to limit memory bandwidth */
4807
33.0k
    if (width == 4) {
4808
4809
86.1k
        for (y = 0; y < height + qpel_extra[3]; y += 2) {
4810
            /* load data in register     */
4811
75.4k
            x1 = _mm_loadu_si128((__m128i *) &src[-2]);
4812
75.4k
            x1 = _mm_slli_si128(x1, 1);
4813
75.4k
            src += srcstride;
4814
75.4k
            t1 = _mm_loadu_si128((__m128i *) &src[-2]);
4815
75.4k
            t1 = _mm_slli_si128(t1, 1);
4816
75.4k
            x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4817
75.4k
            t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
4818
75.4k
            x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4819
75.4k
                    _mm_srli_si128(x1, 3));
4820
75.4k
            t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
4821
75.4k
                    _mm_srli_si128(t1, 3));
4822
4823
            /*  PMADDUBSW then PMADDW     */
4824
75.4k
            x2 = _mm_maddubs_epi16(x2, r0);
4825
75.4k
            t2 = _mm_maddubs_epi16(t2, r0);
4826
75.4k
            x3 = _mm_maddubs_epi16(x3, r0);
4827
75.4k
            t3 = _mm_maddubs_epi16(t3, r0);
4828
75.4k
            x2 = _mm_hadd_epi16(x2, x3);
4829
75.4k
            t2 = _mm_hadd_epi16(t2, t3);
4830
75.4k
            x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
4831
75.4k
            t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
4832
75.4k
            x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
4833
75.4k
            t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
4834
            /* give results back            */
4835
75.4k
            _mm_storel_epi64((__m128i *) &tmp[0], x2);
4836
4837
75.4k
            tmp += MAX_PB_SIZE;
4838
75.4k
            _mm_storel_epi64((__m128i *) &tmp[0], t2);
4839
4840
75.4k
            src += srcstride;
4841
75.4k
            tmp += MAX_PB_SIZE;
4842
75.4k
        }
4843
10.7k
    } else
4844
385k
        for (y = 0; y < height + qpel_extra[3]; y++) {
4845
953k
            for (x = 0; x < width; x += 8) {
4846
                /* load data in register     */
4847
590k
                x1 = _mm_loadu_si128((__m128i *) &src[x - 2]);
4848
590k
                x1 = _mm_slli_si128(x1, 1);
4849
590k
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4850
590k
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4851
590k
                        _mm_srli_si128(x1, 3));
4852
590k
                x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
4853
590k
                        _mm_srli_si128(x1, 5));
4854
590k
                x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
4855
590k
                        _mm_srli_si128(x1, 7));
4856
4857
                /*  PMADDUBSW then PMADDW     */
4858
590k
                x2 = _mm_maddubs_epi16(x2, r0);
4859
590k
                x3 = _mm_maddubs_epi16(x3, r0);
4860
590k
                x4 = _mm_maddubs_epi16(x4, r0);
4861
590k
                x5 = _mm_maddubs_epi16(x5, r0);
4862
590k
                x2 = _mm_hadd_epi16(x2, x3);
4863
590k
                x4 = _mm_hadd_epi16(x4, x5);
4864
590k
                x2 = _mm_hadd_epi16(x2, x4);
4865
590k
                x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
4866
4867
                /* give results back            */
4868
590k
                _mm_store_si128((__m128i *) &tmp[x], x2);
4869
4870
590k
            }
4871
363k
            src += srcstride;
4872
363k
            tmp += MAX_PB_SIZE;
4873
363k
        }
4874
4875
33.0k
    tmp = mcbuffer + qpel_extra_before[3] * MAX_PB_SIZE;
4876
33.0k
    srcstride = MAX_PB_SIZE;
4877
4878
    /* vertical treatment on temp table : tmp contains 16 bit values, so need to use 32 bit  integers
4879
     for register calculations */
4880
33.0k
    rTemp = _mm_set_epi16(-1, 4, -10, 58, 17, -5, 1, 0);
4881
348k
    for (y = 0; y < height; y++) {
4882
809k
        for (x = 0; x < width; x += 8) {
4883
4884
493k
            x1 = _mm_setzero_si128();
4885
493k
            x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
4886
493k
            x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
4887
493k
            x4 = _mm_load_si128((__m128i *) &tmp[x]);
4888
493k
            x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
4889
493k
            x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
4890
493k
            x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
4891
493k
            x8 = _mm_load_si128((__m128i *) &tmp[x + 4 * srcstride]);
4892
4893
493k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
4894
493k
            t7 = _mm_mullo_epi16(x2, r1);
4895
4896
4897
493k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
4898
493k
            rBuffer = _mm_mulhi_epi16(x2, r1);
4899
493k
            t8 = _mm_mullo_epi16(x3, r0);
4900
493k
            t2 = _mm_unpacklo_epi16(t7, rBuffer);
4901
493k
            x2 = _mm_unpackhi_epi16(t7, rBuffer);
4902
4903
493k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
4904
493k
            rBuffer = _mm_mulhi_epi16(x3, r0);
4905
493k
            t7 = _mm_mullo_epi16(x4, r1);
4906
493k
            t3 = _mm_unpacklo_epi16(t8, rBuffer);
4907
493k
            x3 = _mm_unpackhi_epi16(t8, rBuffer);
4908
4909
493k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
4910
493k
            rBuffer = _mm_mulhi_epi16(x4, r1);
4911
493k
            t8 = _mm_mullo_epi16(x5, r0);
4912
493k
            t4 = _mm_unpacklo_epi16(t7, rBuffer);
4913
493k
            x4 = _mm_unpackhi_epi16(t7, rBuffer);
4914
4915
493k
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
4916
493k
            rBuffer = _mm_mulhi_epi16(x5, r0);
4917
493k
            t7 = _mm_mullo_epi16(x6, r1);
4918
493k
            t5 = _mm_unpacklo_epi16(t8, rBuffer);
4919
493k
            x5 = _mm_unpackhi_epi16(t8, rBuffer);
4920
4921
493k
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
4922
493k
            rBuffer = _mm_mulhi_epi16(x6, r1);
4923
493k
            t8 = _mm_mullo_epi16(x7, r0);
4924
493k
            t6 = _mm_unpacklo_epi16(t7, rBuffer);
4925
493k
            x6 = _mm_unpackhi_epi16(t7, rBuffer);
4926
4927
493k
            rBuffer = _mm_mulhi_epi16(x7, r0);
4928
493k
            t7 = _mm_unpacklo_epi16(t8, rBuffer);
4929
493k
            x7 = _mm_unpackhi_epi16(t8, rBuffer);
4930
4931
493k
            t8 = _mm_unpacklo_epi16(
4932
493k
                    _mm_mullo_epi16(x8,
4933
493k
                            _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4934
493k
                            _mm_mulhi_epi16(x8,
4935
493k
                                    _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4936
493k
            x8 = _mm_unpackhi_epi16(
4937
493k
                    _mm_mullo_epi16(x8,
4938
493k
                            _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4939
493k
                            _mm_mulhi_epi16(x8,
4940
493k
                                    _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4941
4942
            /* add calculus by correct value : */
4943
4944
493k
            x3 = _mm_add_epi32(x3, x4);
4945
493k
            x5 = _mm_add_epi32(x5, x6);
4946
493k
            r1 = _mm_add_epi32(x2, x3);
4947
493k
            x7 = _mm_add_epi32(x7, x8);
4948
493k
            r1 = _mm_add_epi32(r1, x5);
4949
4950
493k
            t3 = _mm_add_epi32(t3, t4);
4951
493k
            t5 = _mm_add_epi32(t5, t6);
4952
493k
            r0 = _mm_add_epi32(t2, t3);
4953
493k
            t7 = _mm_add_epi32(t7, t8);
4954
493k
            r0 = _mm_add_epi32(r0, t5);
4955
493k
            r1 = _mm_add_epi32(r1, x7);
4956
493k
            r0 = _mm_add_epi32(r0, t7);
4957
493k
            r1 = _mm_srli_epi32(r1, 6);
4958
493k
            r0 = _mm_srli_epi32(r0, 6);
4959
4960
493k
            r1 = _mm_and_si128(r1,
4961
493k
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
4962
493k
            r0 = _mm_and_si128(r0,
4963
493k
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
4964
493k
            r0 = _mm_hadd_epi16(r0, r1);
4965
493k
            _mm_store_si128((__m128i *) &dst[x], r0);
4966
4967
493k
        }
4968
315k
        tmp += MAX_PB_SIZE;
4969
315k
        dst += dststride;
4970
315k
    }
4971
33.0k
}