Coverage Report

Created: 2026-05-16 06:28

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/libde265/libde265/x86/sse-motion.cc
Line
Count
Source
1
/*
2
 * H.265 video codec.
3
 * Copyright (c) 2013 openHEVC contributors
4
 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
5
 *
6
 * This file is part of libde265.
7
 *
8
 * libde265 is free software: you can redistribute it and/or modify
9
 * it under the terms of the GNU Lesser General Public License as
10
 * published by the Free Software Foundation, either version 3 of
11
 * the License, or (at your option) any later version.
12
 *
13
 * libde265 is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
 * GNU Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public License
19
 * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
20
 */
21
22
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <stdio.h>
#include <string.h>
#include <emmintrin.h>
#include <tmmintrin.h> // SSSE3
#if HAVE_SSE4_1
#include <smmintrin.h>
#endif

#include "sse-motion.h"
#include "libde265/util.h"
35
36
37
// Seven 4-tap filters. Each row stores its four signed taps four times in a
// row (16 bytes), so one row fills a whole 128-bit register.
#define EPEL_TAPS_X4(a, b, c, d) \
  { (a), (b), (c), (d),  (a), (b), (c), (d),  (a), (b), (c), (d),  (a), (b), (c), (d) }

ALIGNED_16(const int8_t) epel_filters[7][16] = {
  EPEL_TAPS_X4(-2, 58, 10, -2),
  EPEL_TAPS_X4(-4, 54, 16, -2),
  EPEL_TAPS_X4(-6, 46, 28, -4),
  EPEL_TAPS_X4(-4, 36, 36, -4),
  EPEL_TAPS_X4(-4, 28, 46, -6),
  EPEL_TAPS_X4(-2, 16, 54, -4),
  EPEL_TAPS_X4(-2, 10, 58, -2),
};

#undef EPEL_TAPS_X4
46
47
// Extra-sample counts for the qpel case, one entry per index 0..3.
// NOTE(review): presumably indexed by the quarter-pel fraction -- confirm at
// the use sites.
static const uint8_t qpel_extra_before[4] = { 0, 3, 3, 2 };
//static const uint8_t qpel_extra_after[4] = { 0, 3, 4, 4 };
// Each entry equals qpel_extra_before[i] plus the disabled qpel_extra_after[i].
static const uint8_t qpel_extra[4] = { 0, 6, 7, 6 };

// Epel counterparts: epel_extra equals epel_extra_before plus the disabled
// epel_extra_after.
static const int epel_extra_before = 1;
//static const int epel_extra_after = 2;
static const int epel_extra = 3;

// NOTE(review): presumably the largest prediction-block dimension handled by
// this file -- confirm at the use sites.
#define MAX_PB_SIZE 64

// When nonzero, the 4- and 2-pixel-wide paths store their result with
// _mm_maskmoveu_si128 instead of a scalar store.
#define MASKMOVE 0
58
59
/*
 * Debug helper: prints `prefix`, a space, and then the 16 bytes of `r` in
 * memory order as two-digit hex values separated by ':', ending with a
 * newline.
 */
void print128(const char* prefix, __m128i r)
{
  unsigned char buf[16];

  // buf carries no 16-byte alignment guarantee, so use the unaligned store
  // rather than assigning through an __m128i pointer (an aligned store).
  _mm_storeu_si128((__m128i*)buf, r);

  printf("%s ",prefix);
  for (int i=0;i<16;i++)
    {
      if (i>0) { printf(":"); }
      printf("%02x", buf[i]);
    }

  printf("\n");
}
74
75
76
/*
 * Debug helper: prints `prefix`, a space, and then the four bytes p[0..3] as
 * two-digit hex values separated by ':', ending with a newline.
 */
void printm32(const char* prefix, unsigned char* p)
{
  printf("%s ",prefix);

  // First byte without a separator, the remaining three each preceded by ':'.
  printf("%02x", p[0]);
  for (int k=1; k<4; k++) {
    printf(":");
    printf("%02x", p[k]);
  }

  printf("\n");
}
88
89
90
0
// Output sample bit depth. The generic (non-"_8_") kernels in this file derive
// their rounding shift and offset from it (e.g. shift = 14 - BIT_DEPTH).
#define BIT_DEPTH 8
91
92
/*
 * Converts a block of 16-bit intermediate samples to 8-bit pixels:
 *
 *     dst[x] = clip_to_u8( saturate16(src[x] + 32) >> 6 )
 *
 * _dst/dststride : output rows, stride counted in bytes.
 * src/srcstride  : input rows, stride counted in int16_t elements.
 * width/height   : block size in samples.
 *
 * The widest column step that divides `width` is used (16, 8, 4, otherwise 2).
 * The 16- and 8-column paths use aligned 128-bit loads, so every source row
 * must start on a 16-byte boundary when `width` is a multiple of 8.
 */
void ff_hevc_put_unweighted_pred_8_sse(uint8_t *_dst, ptrdiff_t dststride,
                                       const int16_t *src, ptrdiff_t srcstride,
                                       int width, int height) {
    int x, y;
    uint8_t *dst = (uint8_t*) _dst;
    __m128i r0, r1;
    const __m128i f0 = _mm_set1_epi16(32);   // rounding offset, 1 << (6 - 1)

    if (!(width & 15)) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 16) {
                r0 = _mm_load_si128((const __m128i *) (src + x));
                r1 = _mm_load_si128((const __m128i *) (src + x + 8));
                r0 = _mm_srai_epi16(_mm_adds_epi16(r0, f0), 6);
                r1 = _mm_srai_epi16(_mm_adds_epi16(r1, f0), 6);
                r0 = _mm_packus_epi16(r0, r1);

                _mm_storeu_si128((__m128i *) (dst + x), r0);
            }
            dst += dststride;
            src += srcstride;
        }
    } else if (!(width & 7)) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 8) {
                r0 = _mm_load_si128((const __m128i *) (src + x));
                r0 = _mm_srai_epi16(_mm_adds_epi16(r0, f0), 6);
                r0 = _mm_packus_epi16(r0, r0);

                _mm_storel_epi64((__m128i *) (dst + x), r0);
            }
            dst += dststride;
            src += srcstride;
        }
    } else if (!(width & 3)) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 4) {
                r0 = _mm_loadl_epi64((const __m128i *) (src + x));
                r0 = _mm_srai_epi16(_mm_adds_epi16(r0, f0), 6);
                r0 = _mm_packus_epi16(r0, r0);
#if MASKMOVE
                _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
#else
                {
                    // memcpy instead of a store through uint32_t*: dst + x has
                    // no 4-byte alignment guarantee and is uint8_t storage.
                    const int32_t px = _mm_cvtsi128_si32(r0);
                    memcpy(dst + x, &px, sizeof px);
                }
#endif
            }
            dst += dststride;
            src += srcstride;
        }
    } else {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 2) {
                // Load exactly two samples (4 bytes); a 64-bit load would read
                // two samples past the end of the row.
                int32_t pair;
                memcpy(&pair, src + x, sizeof pair);
                r0 = _mm_cvtsi32_si128(pair);
                r0 = _mm_srai_epi16(_mm_adds_epi16(r0, f0), 6);
                r0 = _mm_packus_epi16(r0, r0);
#if MASKMOVE
                _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1),(char *) (dst+x));
#else
                {
                    // Only the two low bytes are valid output.
                    const uint16_t px = (uint16_t) _mm_cvtsi128_si32(r0);
                    memcpy(dst + x, &px, sizeof px);
                }
#endif
            }
            dst += dststride;
            src += srcstride;
        }
    }
}
175
176
void ff_hevc_put_unweighted_pred_sse(uint8_t *_dst, ptrdiff_t _dststride,
177
                                     const int16_t *src, ptrdiff_t srcstride,
178
0
                                     int width, int height) {
179
0
    int x, y;
180
0
    uint8_t *dst = (uint8_t*) _dst;
181
0
    ptrdiff_t dststride = _dststride / sizeof(uint8_t);
182
0
    __m128i r0, r1, f0;
183
0
    int shift = 14 - BIT_DEPTH;
184
0
#if BIT_DEPTH < 14
185
0
    int16_t offset = 1 << (shift - 1);
186
#else
187
    int16_t offset = 0;
188
189
#endif
190
0
    f0 = _mm_set1_epi16(offset);
191
192
0
    for (y = 0; y < height; y++) {
193
0
        for (x = 0; x < width; x += 16) {
194
0
            r0 = _mm_load_si128((__m128i *) &src[x]);
195
196
0
            r1 = _mm_load_si128((__m128i *) &src[x + 8]);
197
0
            r0 = _mm_adds_epi16(r0, f0);
198
199
0
            r1 = _mm_adds_epi16(r1, f0);
200
0
            r0 = _mm_srai_epi16(r0, shift);
201
0
            r1 = _mm_srai_epi16(r1, shift);
202
0
            r0 = _mm_packus_epi16(r0, r1);
203
204
0
            _mm_storeu_si128((__m128i *) &dst[x], r0);
205
0
        }
206
0
        dst += dststride;
207
0
        src += srcstride;
208
0
    }
209
0
}
210
211
/*
 * Averages two blocks of 16-bit intermediate samples into 8-bit pixels:
 *
 *     dst[x] = clip_to_u8( saturate16(saturate16(src1[x] + 64) + src2[x]) >> 7 )
 *
 * _dst/dststride      : output rows, stride counted in bytes.
 * src1,src2/srcstride : input rows, shared stride counted in int16_t elements.
 * width/height        : block size in samples.
 *
 * The widest column step that divides `width` is used (16, 8, 4, otherwise 2).
 * The 16- and 8-column paths use aligned 128-bit loads, so every source row
 * must start on a 16-byte boundary when `width` is a multiple of 8.
 */
void ff_hevc_put_weighted_pred_avg_8_sse(uint8_t *_dst, ptrdiff_t dststride,
                                         const int16_t *src1, const int16_t *src2,
                                         ptrdiff_t srcstride, int width,
                                         int height) {
    int x, y;
    uint8_t *dst = (uint8_t*) _dst;
    __m128i r0, r1, f0, r2, r3;

    f0 = _mm_set1_epi16(64);   // rounding offset, 1 << (7 - 1)

    if (!(width & 15)) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 16) {
                r0 = _mm_load_si128((const __m128i *) &src1[x]);
                r1 = _mm_load_si128((const __m128i *) &src1[x + 8]);
                r2 = _mm_load_si128((const __m128i *) &src2[x]);
                r3 = _mm_load_si128((const __m128i *) &src2[x + 8]);

                r0 = _mm_adds_epi16(_mm_adds_epi16(r0, f0), r2);
                r1 = _mm_adds_epi16(_mm_adds_epi16(r1, f0), r3);
                r0 = _mm_srai_epi16(r0, 7);
                r1 = _mm_srai_epi16(r1, 7);
                r0 = _mm_packus_epi16(r0, r1);

                _mm_storeu_si128((__m128i *) (dst + x), r0);
            }
            dst += dststride;
            src1 += srcstride;
            src2 += srcstride;
        }
    } else if (!(width & 7)) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 8) {
                r0 = _mm_load_si128((const __m128i *) (src1 + x));
                r2 = _mm_load_si128((const __m128i *) (src2 + x));

                r0 = _mm_adds_epi16(_mm_adds_epi16(r0, f0), r2);
                r0 = _mm_srai_epi16(r0, 7);
                r0 = _mm_packus_epi16(r0, r0);

                _mm_storel_epi64((__m128i *) (dst + x), r0);
            }
            dst += dststride;
            src1 += srcstride;
            src2 += srcstride;
        }
    } else if (!(width & 3)) {
#if MASKMOVE
        r1 = _mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1);
#endif
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 4) {
                r0 = _mm_loadl_epi64((const __m128i *) (src1 + x));
                r2 = _mm_loadl_epi64((const __m128i *) (src2 + x));

                r0 = _mm_adds_epi16(_mm_adds_epi16(r0, f0), r2);
                r0 = _mm_srai_epi16(r0, 7);
                r0 = _mm_packus_epi16(r0, r0);

#if MASKMOVE
                _mm_maskmoveu_si128(r0,r1,(char *) (dst+x));
#else
                {
                    // memcpy instead of a store through uint32_t*: dst + x has
                    // no 4-byte alignment guarantee and is uint8_t storage.
                    const int32_t px = _mm_cvtsi128_si32(r0);
                    memcpy(dst + x, &px, sizeof px);
                }
#endif
            }
            dst += dststride;
            src1 += srcstride;
            src2 += srcstride;
        }
    } else {
#if MASKMOVE
        r1 = _mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1);
#endif
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 2) {
                // Load exactly two samples (4 bytes) per source; a 64-bit load
                // would read two samples past the end of the row.
                int32_t pair1, pair2;
                memcpy(&pair1, src1 + x, sizeof pair1);
                memcpy(&pair2, src2 + x, sizeof pair2);
                r0 = _mm_cvtsi32_si128(pair1);
                r2 = _mm_cvtsi32_si128(pair2);

                r0 = _mm_adds_epi16(_mm_adds_epi16(r0, f0), r2);
                r0 = _mm_srai_epi16(r0, 7);
                r0 = _mm_packus_epi16(r0, r0);

#if MASKMOVE
                _mm_maskmoveu_si128(r0,r1,(char *) (dst+x));
#else
                {
                    // Only the two low bytes are valid output.
                    const uint16_t px = (uint16_t) _mm_cvtsi128_si32(r0);
                    memcpy(dst + x, &px, sizeof px);
                }
#endif
            }
            dst += dststride;
            src1 += srcstride;
            src2 += srcstride;
        }
    }
}
315
316
void ff_hevc_put_weighted_pred_avg_sse(uint8_t *_dst, ptrdiff_t _dststride,
317
                                       const int16_t *src1, const int16_t *src2,
318
                                       ptrdiff_t srcstride, int width,
319
0
                                       int height) {
320
0
    int x, y;
321
0
    uint8_t *dst = (uint8_t*) _dst;
322
0
    ptrdiff_t dststride = _dststride / sizeof(uint8_t);
323
0
    __m128i r0, r1, f0, r2, r3;
324
0
    int shift = 14 + 1 - BIT_DEPTH;
325
0
#if BIT_DEPTH < 14
326
0
    int offset = 1 << (shift - 1);
327
#else
328
    int offset = 0;
329
#endif
330
0
    f0 = _mm_set1_epi16(offset);
331
0
    for (y = 0; y < height; y++) {
332
333
0
        for (x = 0; x < width; x += 16) {
334
0
            r0 = _mm_load_si128((__m128i *) &src1[x]);
335
0
            r1 = _mm_load_si128((__m128i *) &src1[x + 8]);
336
0
            r2 = _mm_load_si128((__m128i *) &src2[x]);
337
0
            r3 = _mm_load_si128((__m128i *) &src2[x + 8]);
338
339
0
            r0 = _mm_adds_epi16(r0, f0);
340
0
            r1 = _mm_adds_epi16(r1, f0);
341
0
            r0 = _mm_adds_epi16(r0, r2);
342
0
            r1 = _mm_adds_epi16(r1, r3);
343
0
            r0 = _mm_srai_epi16(r0, shift);
344
0
            r1 = _mm_srai_epi16(r1, shift);
345
0
            r0 = _mm_packus_epi16(r0, r1);
346
347
0
            _mm_storeu_si128((__m128i *) (dst + x), r0);
348
0
        }
349
0
        dst += dststride;
350
0
        src1 += srcstride;
351
0
        src2 += srcstride;
352
0
    }
353
0
}
354
355
#if 0
356
void ff_hevc_weighted_pred_8_sse4(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
357
                                  uint8_t *_dst, ptrdiff_t _dststride,
358
                                  const int16_t *src, ptrdiff_t srcstride,
359
                                  int width, int height) {
360
361
    int log2Wd;
362
    int x, y;
363
364
    uint8_t *dst = (uint8_t*) _dst;
365
    ptrdiff_t dststride = _dststride / sizeof(uint8_t);
366
    __m128i x0, x1, x2, x3, c0, add, add2;
367
368
    log2Wd = denom + 14 - BIT_DEPTH;
369
370
    add = _mm_set1_epi32(olxFlag * (1 << (BIT_DEPTH - 8)));
371
    add2 = _mm_set1_epi32(1 << (log2Wd - 1));
372
    c0 = _mm_set1_epi16(wlxFlag);
373
    if (log2Wd >= 1){
374
        if(!(width & 15)){
375
            for (y = 0; y < height; y++) {
376
                for (x = 0; x < width; x += 16) {
377
                    x0 = _mm_load_si128((__m128i *) &src[x]);
378
                    x2 = _mm_load_si128((__m128i *) &src[x + 8]);
379
                    x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
380
                            _mm_mulhi_epi16(x0, c0));
381
                    x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c0),
382
                            _mm_mulhi_epi16(x2, c0));
383
                    x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
384
                            _mm_mulhi_epi16(x0, c0));
385
                    x2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c0),
386
                            _mm_mulhi_epi16(x2, c0));
387
                    x0 = _mm_add_epi32(x0, add2);
388
                    x1 = _mm_add_epi32(x1, add2);
389
                    x2 = _mm_add_epi32(x2, add2);
390
                    x3 = _mm_add_epi32(x3, add2);
391
                    x0 = _mm_srai_epi32(x0, log2Wd);
392
                    x1 = _mm_srai_epi32(x1, log2Wd);
393
                    x2 = _mm_srai_epi32(x2, log2Wd);
394
                    x3 = _mm_srai_epi32(x3, log2Wd);
395
                    x0 = _mm_add_epi32(x0, add);
396
                    x1 = _mm_add_epi32(x1, add);
397
                    x2 = _mm_add_epi32(x2, add);
398
                    x3 = _mm_add_epi32(x3, add);
399
                    x0 = _mm_packus_epi32(x0, x1);
400
                    x2 = _mm_packus_epi32(x2, x3);
401
                    x0 = _mm_packus_epi16(x0, x2);
402
403
                    _mm_storeu_si128((__m128i *) (dst + x), x0);
404
405
                }
406
                dst += dststride;
407
                src += srcstride;
408
            }
409
        }else if(!(width & 7)){
410
            for (y = 0; y < height; y++) {
411
                for(x=0;x<width;x+=8){
412
                    x0 = _mm_load_si128((__m128i *) (src+x));
413
                    x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
414
                            _mm_mulhi_epi16(x0, c0));
415
416
                    x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
417
                            _mm_mulhi_epi16(x0, c0));
418
419
                    x0 = _mm_add_epi32(x0, add2);
420
                    x1 = _mm_add_epi32(x1, add2);
421
422
                    x0 = _mm_srai_epi32(x0, log2Wd);
423
                    x1 = _mm_srai_epi32(x1, log2Wd);
424
425
                    x0 = _mm_add_epi32(x0, add);
426
                    x1 = _mm_add_epi32(x1, add);
427
428
                    x0 = _mm_packus_epi32(x0, x1);
429
                    x0 = _mm_packus_epi16(x0, x0);
430
431
                    _mm_storel_epi64((__m128i *) (dst+x), x0);
432
433
                }
434
                dst += dststride;
435
                src += srcstride;
436
            }
437
        }else if(!(width & 3)){
438
            for (y = 0; y < height; y++) {
439
                for(x=0;x<width;x+=4){
440
                    x0 = _mm_loadl_epi64((__m128i *)(src+x));
441
                    x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
442
                            _mm_mulhi_epi16(x0, c0));
443
                    x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
444
                            _mm_mulhi_epi16(x0, c0));
445
446
                    x0 = _mm_add_epi32(x0, add2);
447
                    x1 = _mm_add_epi32(x1, add2);
448
                    x0 = _mm_srai_epi32(x0, log2Wd);
449
                    x1 = _mm_srai_epi32(x1, log2Wd);
450
                    x0 = _mm_add_epi32(x0, add);
451
                    x1 = _mm_add_epi32(x1, add);
452
                    x0 = _mm_packus_epi32(x0, x1);
453
                    x0 = _mm_packus_epi16(x0, x0);
454
455
                    _mm_maskmoveu_si128(x0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
456
                    // _mm_storeu_si128((__m128i *) (dst + x), x0);
457
                }
458
                dst += dststride;
459
                src += srcstride;
460
            }
461
        }else{
462
            for (y = 0; y < height; y++) {
463
                for(x=0;x<width;x+=2){
464
                    x0 = _mm_loadl_epi64((__m128i *)(src+x));
465
                    x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
466
                            _mm_mulhi_epi16(x0, c0));
467
                    x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
468
                            _mm_mulhi_epi16(x0, c0));
469
470
                    x0 = _mm_add_epi32(x0, add2);
471
                    x1 = _mm_add_epi32(x1, add2);
472
                    x0 = _mm_srai_epi32(x0, log2Wd);
473
                    x1 = _mm_srai_epi32(x1, log2Wd);
474
                    x0 = _mm_add_epi32(x0, add);
475
                    x1 = _mm_add_epi32(x1, add);
476
                    x0 = _mm_packus_epi32(x0, x1);
477
                    x0 = _mm_packus_epi16(x0, x0);
478
479
                    _mm_maskmoveu_si128(x0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1),(char *) (dst+x));
480
                    // _mm_storeu_si128((__m128i *) (dst + x), x0);
481
                }
482
                dst += dststride;
483
                src += srcstride;
484
            }
485
        }
486
    }else{
487
        if(!(width & 15)){
488
            for (y = 0; y < height; y++) {
489
                for (x = 0; x < width; x += 16) {
490
491
                    x0 = _mm_load_si128((__m128i *) &src[x]);
492
                    x2 = _mm_load_si128((__m128i *) &src[x + 8]);
493
                    x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
494
                            _mm_mulhi_epi16(x0, c0));
495
                    x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c0),
496
                            _mm_mulhi_epi16(x2, c0));
497
                    x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
498
                            _mm_mulhi_epi16(x0, c0));
499
                    x2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c0),
500
                            _mm_mulhi_epi16(x2, c0));
501
502
                    x0 = _mm_add_epi32(x0, add2);
503
                    x1 = _mm_add_epi32(x1, add2);
504
                    x2 = _mm_add_epi32(x2, add2);
505
                    x3 = _mm_add_epi32(x3, add2);
506
507
                    x0 = _mm_packus_epi32(x0, x1);
508
                    x2 = _mm_packus_epi32(x2, x3);
509
                    x0 = _mm_packus_epi16(x0, x2);
510
511
                    _mm_storeu_si128((__m128i *) (dst + x), x0);
512
513
                }
514
                dst += dststride;
515
                src += srcstride;
516
            }
517
        }else if(!(width & 7)){
518
            for (y = 0; y < height; y++) {
519
                for(x=0;x<width;x+=8){
520
                    x0 = _mm_load_si128((__m128i *) (src+x));
521
                    x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
522
                            _mm_mulhi_epi16(x0, c0));
523
524
                    x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
525
                            _mm_mulhi_epi16(x0, c0));
526
527
528
                    x0 = _mm_add_epi32(x0, add2);
529
                    x1 = _mm_add_epi32(x1, add2);
530
531
                    x0 = _mm_packus_epi32(x0, x1);
532
                    x0 = _mm_packus_epi16(x0, x0);
533
534
                    _mm_storeu_si128((__m128i *) (dst+x), x0);
535
                }
536
537
                dst += dststride;
538
                src += srcstride;
539
            }
540
        }else if(!(width & 3)){
541
            for (y = 0; y < height; y++) {
542
                for(x=0;x<width;x+=4){
543
                    x0 = _mm_loadl_epi64((__m128i *) (src+x));
544
                    x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
545
                            _mm_mulhi_epi16(x0, c0));
546
547
                    x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
548
                            _mm_mulhi_epi16(x0, c0));
549
550
551
                    x0 = _mm_add_epi32(x0, add2);
552
                    x1 = _mm_add_epi32(x1, add2);
553
554
555
                    x0 = _mm_packus_epi32(x0, x1);
556
                    x0 = _mm_packus_epi16(x0, x0);
557
558
559
                    _mm_maskmoveu_si128(x0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
560
                }
561
                dst += dststride;
562
                src += srcstride;
563
            }
564
        }else{
565
            for (y = 0; y < height; y++) {
566
                for(x=0;x<width;x+=2){
567
                    x0 = _mm_loadl_epi64((__m128i *) (src+x));
568
                    x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
569
                            _mm_mulhi_epi16(x0, c0));
570
571
                    x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
572
                            _mm_mulhi_epi16(x0, c0));
573
574
575
                    x0 = _mm_add_epi32(x0, add2);
576
                    x1 = _mm_add_epi32(x1, add2);
577
578
579
                    x0 = _mm_packus_epi32(x0, x1);
580
                    x0 = _mm_packus_epi16(x0, x0);
581
582
583
                    _mm_maskmoveu_si128(x0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1),(char *) (dst+x));
584
                }
585
                dst += dststride;
586
                src += srcstride;
587
            }
588
589
        }
590
591
    }
592
593
}
594
#endif
595
596
597
#if 0
598
void ff_hevc_weighted_pred_sse(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
599
                               uint8_t *_dst, ptrdiff_t _dststride,
600
                               const int16_t *src, ptrdiff_t srcstride,
601
                               int width, int height) {
602
603
    int log2Wd;
604
    int x, y;
605
606
    uint8_t *dst = (uint8_t*) _dst;
607
    ptrdiff_t dststride = _dststride / sizeof(uint8_t);
608
    __m128i x0, x1, x2, x3, c0, add, add2;
609
610
    log2Wd = denom + 14 - BIT_DEPTH;
611
612
    add = _mm_set1_epi32(olxFlag * (1 << (BIT_DEPTH - 8)));
613
    add2 = _mm_set1_epi32(1 << (log2Wd - 1));
614
    c0 = _mm_set1_epi16(wlxFlag);
615
    if (log2Wd >= 1)
616
        for (y = 0; y < height; y++) {
617
            for (x = 0; x < width; x += 16) {
618
                x0 = _mm_load_si128((__m128i *) &src[x]);
619
                x2 = _mm_load_si128((__m128i *) &src[x + 8]);
620
                x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
621
                        _mm_mulhi_epi16(x0, c0));
622
                x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c0),
623
                        _mm_mulhi_epi16(x2, c0));
624
                x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
625
                        _mm_mulhi_epi16(x0, c0));
626
                x2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c0),
627
                        _mm_mulhi_epi16(x2, c0));
628
                x0 = _mm_add_epi32(x0, add2);
629
                x1 = _mm_add_epi32(x1, add2);
630
                x2 = _mm_add_epi32(x2, add2);
631
                x3 = _mm_add_epi32(x3, add2);
632
                x0 = _mm_srai_epi32(x0, log2Wd);
633
                x1 = _mm_srai_epi32(x1, log2Wd);
634
                x2 = _mm_srai_epi32(x2, log2Wd);
635
                x3 = _mm_srai_epi32(x3, log2Wd);
636
                x0 = _mm_add_epi32(x0, add);
637
                x1 = _mm_add_epi32(x1, add);
638
                x2 = _mm_add_epi32(x2, add);
639
                x3 = _mm_add_epi32(x3, add);
640
                x0 = _mm_packus_epi32(x0, x1);
641
                x2 = _mm_packus_epi32(x2, x3);
642
                x0 = _mm_packus_epi16(x0, x2);
643
644
                _mm_storeu_si128((__m128i *) (dst + x), x0);
645
646
            }
647
            dst += dststride;
648
            src += srcstride;
649
        }
650
    else
651
        for (y = 0; y < height; y++) {
652
            for (x = 0; x < width; x += 16) {
653
654
                x0 = _mm_load_si128((__m128i *) &src[x]);
655
                x2 = _mm_load_si128((__m128i *) &src[x + 8]);
656
                x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0),
657
                        _mm_mulhi_epi16(x0, c0));
658
                x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c0),
659
                        _mm_mulhi_epi16(x2, c0));
660
                x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0),
661
                        _mm_mulhi_epi16(x0, c0));
662
                x2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c0),
663
                        _mm_mulhi_epi16(x2, c0));
664
665
                x0 = _mm_add_epi32(x0, add2);
666
                x1 = _mm_add_epi32(x1, add2);
667
                x2 = _mm_add_epi32(x2, add2);
668
                x3 = _mm_add_epi32(x3, add2);
669
670
                x0 = _mm_packus_epi32(x0, x1);
671
                x2 = _mm_packus_epi32(x2, x3);
672
                x0 = _mm_packus_epi16(x0, x2);
673
674
                _mm_storeu_si128((__m128i *) (dst + x), x0);
675
676
            }
677
            dst += dststride;
678
            src += srcstride;
679
        }
680
}
681
#endif
682
683
#if HAVE_SSE4_1
/*
 * Explicitly weighted combination of two 16-bit intermediate blocks into
 * 8-bit pixels:
 *
 *     dst[x] = clip_to_u8( (src1[x]*wl0Flag + src2[x]*wl1Flag
 *                           + (o0 + o1 + 1) * 2^log2Wd) >> (log2Wd + 1) )
 *
 * with log2Wd = denom + 14 - BIT_DEPTH and oN = olNFlag * 2^(BIT_DEPTH - 8).
 *
 * _dst/_dststride     : output rows, stride counted in bytes.
 * src1,src2/srcstride : input rows, shared stride counted in int16_t elements.
 * width/height        : block size in samples.
 *
 * The widest column step that divides `width` is used (16, 8, 4, otherwise 2).
 * The 16- and 8-column paths use aligned 128-bit loads.
 *
 * The 32-bit sums are narrowed with signed saturation (_mm_packs_epi32)
 * before the final unsigned byte pack. With unsigned 16-bit saturation any
 * result above 32767 would look negative to _mm_packus_epi16 and come out as
 * 0 instead of 255.
 */
void ff_hevc_weighted_pred_avg_8_sse4(uint8_t denom, int16_t wl0Flag,
                                      int16_t wl1Flag, int16_t ol0Flag, int16_t ol1Flag,
                                      uint8_t *_dst, ptrdiff_t _dststride,
                                      const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride,
                                      int width, int height) {
    int shift, shift2;
    int log2Wd;
    int o0;
    int o1;
    int x, y;
    uint8_t *dst = (uint8_t*) _dst;
    ptrdiff_t dststride = _dststride / sizeof(uint8_t);
    __m128i x0, x1, x2, x3, r0, r1, r2, r3, c0, c1, c2;
    __m128i l0, h0, l1, h1, l2, h2, l3, h3;   // low/high 16-bit halves of the products

    shift = 14 - BIT_DEPTH;
    log2Wd = denom + shift;

    o0 = (ol0Flag) * (1 << (BIT_DEPTH - 8));
    o1 = (ol1Flag) * (1 << (BIT_DEPTH - 8));
    shift2 = (log2Wd + 1);
    c0 = _mm_set1_epi16(wl0Flag);
    c1 = _mm_set1_epi16(wl1Flag);
    // Multiply instead of left-shifting: o0 + o1 + 1 can be negative, and
    // left-shifting a negative value is undefined behaviour.
    c2 = _mm_set1_epi32((o0 + o1 + 1) * (1 << log2Wd));

    if (!(width & 15)) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 16) {
                x0 = _mm_load_si128((const __m128i *) &src1[x]);
                x1 = _mm_load_si128((const __m128i *) &src1[x + 8]);
                x2 = _mm_load_si128((const __m128i *) &src2[x]);
                x3 = _mm_load_si128((const __m128i *) &src2[x + 8]);

                // 16x16 -> 32 bit products, kept as separate low/high halves.
                l0 = _mm_mullo_epi16(x0, c0);  h0 = _mm_mulhi_epi16(x0, c0);
                l1 = _mm_mullo_epi16(x1, c0);  h1 = _mm_mulhi_epi16(x1, c0);
                l2 = _mm_mullo_epi16(x2, c1);  h2 = _mm_mulhi_epi16(x2, c1);
                l3 = _mm_mullo_epi16(x3, c1);  h3 = _mm_mulhi_epi16(x3, c1);

                // Interleave the halves into 32-bit lanes and add both sources.
                r0 = _mm_add_epi32(_mm_unpacklo_epi16(l0, h0), _mm_unpacklo_epi16(l2, h2)); // samples 0..3
                r2 = _mm_add_epi32(_mm_unpackhi_epi16(l0, h0), _mm_unpackhi_epi16(l2, h2)); // samples 4..7
                r1 = _mm_add_epi32(_mm_unpacklo_epi16(l1, h1), _mm_unpacklo_epi16(l3, h3)); // samples 8..11
                r3 = _mm_add_epi32(_mm_unpackhi_epi16(l1, h1), _mm_unpackhi_epi16(l3, h3)); // samples 12..15

                r0 = _mm_srai_epi32(_mm_add_epi32(r0, c2), shift2);
                r1 = _mm_srai_epi32(_mm_add_epi32(r1, c2), shift2);
                r2 = _mm_srai_epi32(_mm_add_epi32(r2, c2), shift2);
                r3 = _mm_srai_epi32(_mm_add_epi32(r3, c2), shift2);

                r0 = _mm_packs_epi32(r0, r2);
                r1 = _mm_packs_epi32(r1, r3);
                r0 = _mm_packus_epi16(r0, r1);

                _mm_storeu_si128((__m128i *) (dst + x), r0);
            }
            dst += dststride;
            src1 += srcstride;
            src2 += srcstride;
        }
    } else if (!(width & 7)) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 8) {
                x0 = _mm_load_si128((const __m128i *) (src1 + x));
                x2 = _mm_load_si128((const __m128i *) (src2 + x));

                l0 = _mm_mullo_epi16(x0, c0);  h0 = _mm_mulhi_epi16(x0, c0);
                l2 = _mm_mullo_epi16(x2, c1);  h2 = _mm_mulhi_epi16(x2, c1);

                r0 = _mm_add_epi32(_mm_unpacklo_epi16(l0, h0), _mm_unpacklo_epi16(l2, h2));
                r2 = _mm_add_epi32(_mm_unpackhi_epi16(l0, h0), _mm_unpackhi_epi16(l2, h2));

                r0 = _mm_srai_epi32(_mm_add_epi32(r0, c2), shift2);
                r2 = _mm_srai_epi32(_mm_add_epi32(r2, c2), shift2);

                r0 = _mm_packs_epi32(r0, r2);
                r0 = _mm_packus_epi16(r0, r0);

                _mm_storel_epi64((__m128i *) (dst + x), r0);
            }
            dst += dststride;
            src1 += srcstride;
            src2 += srcstride;
        }
    } else if (!(width & 3)) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 4) {
                x0 = _mm_loadl_epi64((const __m128i *) (src1 + x));
                x2 = _mm_loadl_epi64((const __m128i *) (src2 + x));

                l0 = _mm_mullo_epi16(x0, c0);  h0 = _mm_mulhi_epi16(x0, c0);
                l2 = _mm_mullo_epi16(x2, c1);  h2 = _mm_mulhi_epi16(x2, c1);

                // Only four samples are live, so the low 32-bit lanes suffice.
                r0 = _mm_add_epi32(_mm_unpacklo_epi16(l0, h0), _mm_unpacklo_epi16(l2, h2));
                r0 = _mm_srai_epi32(_mm_add_epi32(r0, c2), shift2);

                r0 = _mm_packs_epi32(r0, r0);
                r0 = _mm_packus_epi16(r0, r0);

#if MASKMOVE
                _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
#else
                {
                    // memcpy instead of a store through uint32_t*: dst + x has
                    // no 4-byte alignment guarantee and is uint8_t storage.
                    const int32_t px = _mm_cvtsi128_si32(r0);
                    memcpy(dst + x, &px, sizeof px);
                }
#endif
            }
            dst += dststride;
            src1 += srcstride;
            src2 += srcstride;
        }
    } else {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 2) {
                // Load exactly two samples (4 bytes) per source; a 64-bit load
                // would read two samples past the end of the row.
                int32_t pair1, pair2;
                memcpy(&pair1, src1 + x, sizeof pair1);
                memcpy(&pair2, src2 + x, sizeof pair2);
                x0 = _mm_cvtsi32_si128(pair1);
                x2 = _mm_cvtsi32_si128(pair2);

                l0 = _mm_mullo_epi16(x0, c0);  h0 = _mm_mulhi_epi16(x0, c0);
                l2 = _mm_mullo_epi16(x2, c1);  h2 = _mm_mulhi_epi16(x2, c1);

                r0 = _mm_add_epi32(_mm_unpacklo_epi16(l0, h0), _mm_unpacklo_epi16(l2, h2));
                r0 = _mm_srai_epi32(_mm_add_epi32(r0, c2), shift2);

                r0 = _mm_packs_epi32(r0, r0);
                r0 = _mm_packus_epi16(r0, r0);

#if MASKMOVE
                _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1),(char *) (dst+x));
#else
                {
                    // Only the two low bytes are valid output.
                    const uint16_t px = (uint16_t) _mm_cvtsi128_si32(r0);
                    memcpy(dst + x, &px, sizeof px);
                }
#endif
            }
            dst += dststride;
            src1 += srcstride;
            src2 += srcstride;
        }
    }
}
#endif
878
879
880
#if 0
/*
 * Disabled (compiled out) bi-directional weighted prediction.
 *
 * For every sample computes
 *     (src1 * wl0Flag + src2 * wl1Flag + ((o0 + o1 + 1) << log2Wd)) >> (log2Wd + 1)
 * and stores the result saturated to 8 bits.
 *
 * denom            log2 weight denominator
 * wl0Flag/wl1Flag  weights applied to src1 / src2
 * ol0Flag/ol1Flag  offsets, scaled by 1 << (BIT_DEPTH - 8)
 * _dst/_dststride  output plane and its stride in bytes
 * src1/src2        16-bit intermediate predictions; read with aligned
 *                  128-bit loads, so each row must be 16-byte aligned
 * srcstride        stride of src1/src2 in int16_t elements
 * width/height     block size; columns are processed 16 at a time, so a
 *                  width that is not a multiple of 16 writes past `width`
 */
void ff_hevc_weighted_pred_avg_sse(uint8_t denom, int16_t wl0Flag,
        int16_t wl1Flag, int16_t ol0Flag, int16_t ol1Flag, uint8_t *_dst,
        ptrdiff_t _dststride, const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride,
        int width, int height) {
    const int shift  = 14 - BIT_DEPTH;
    const int log2Wd = denom + shift;
    const int shift2 = log2Wd + 1;
    const int o0 = ol0Flag * (1 << (BIT_DEPTH - 8));
    const int o1 = ol1Flag * (1 << (BIT_DEPTH - 8));
    uint8_t *dst = _dst;
    const ptrdiff_t dststride = _dststride;
    const __m128i c0 = _mm_set1_epi16(wl0Flag);
    const __m128i c1 = _mm_set1_epi16(wl1Flag);
    /* Rounding/offset term.  Scaled by multiplication rather than "<<":
       the offsets may be negative and left-shifting a negative value is
       undefined behaviour. */
    const __m128i c2 = _mm_set1_epi32((o0 + o1 + 1) * (1 << log2Wd));
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x += 16) {
            const __m128i a0 = _mm_load_si128((const __m128i *) &src1[x]);
            const __m128i a1 = _mm_load_si128((const __m128i *) &src1[x + 8]);
            const __m128i b0 = _mm_load_si128((const __m128i *) &src2[x]);
            const __m128i b1 = _mm_load_si128((const __m128i *) &src2[x + 8]);

            /* low and high halves of the signed 16x16 -> 32 bit products */
            const __m128i a0l = _mm_mullo_epi16(a0, c0), a0h = _mm_mulhi_epi16(a0, c0);
            const __m128i a1l = _mm_mullo_epi16(a1, c0), a1h = _mm_mulhi_epi16(a1, c0);
            const __m128i b0l = _mm_mullo_epi16(b0, c1), b0h = _mm_mulhi_epi16(b0, c1);
            const __m128i b1l = _mm_mullo_epi16(b1, c1), b1h = _mm_mulhi_epi16(b1, c1);

            /* weighted sums as 32-bit lanes: samples 0-3, 4-7, 8-11, 12-15 */
            __m128i s0 = _mm_add_epi32(_mm_unpacklo_epi16(a0l, a0h),
                                       _mm_unpacklo_epi16(b0l, b0h));
            __m128i s1 = _mm_add_epi32(_mm_unpackhi_epi16(a0l, a0h),
                                       _mm_unpackhi_epi16(b0l, b0h));
            __m128i s2 = _mm_add_epi32(_mm_unpacklo_epi16(a1l, a1h),
                                       _mm_unpacklo_epi16(b1l, b1h));
            __m128i s3 = _mm_add_epi32(_mm_unpackhi_epi16(a1l, a1h),
                                       _mm_unpackhi_epi16(b1l, b1h));

            s0 = _mm_srai_epi32(_mm_add_epi32(s0, c2), shift2);
            s1 = _mm_srai_epi32(_mm_add_epi32(s1, c2), shift2);
            s2 = _mm_srai_epi32(_mm_add_epi32(s2, c2), shift2);
            s3 = _mm_srai_epi32(_mm_add_epi32(s3, c2), shift2);

            /* 32 -> 16 -> 8 bit with unsigned saturation, then store 16 samples */
            _mm_storeu_si128((__m128i *) (dst + x),
                             _mm_packus_epi16(_mm_packus_epi32(s0, s1),
                                              _mm_packus_epi32(s2, s3)));
        }
        dst  += dststride;
        src1 += srcstride;
        src2 += srcstride;
    }
}
#endif
954
955
956
/*
 * EPEL full-sample case for 8-bit sources: no filtering, every source byte
 * is zero-extended to 16 bits and shifted left by 6.
 *
 * dst/dststride   output buffer and its stride in int16_t elements.  The
 *                 width%8==0 paths use aligned 128-bit stores, so every
 *                 output row must then be 16-byte aligned.
 * _src/srcstride  source samples and their stride in bytes.  The paths for
 *                 widths that are not a multiple of 16 always load 8 source
 *                 bytes per step even when only 4 or 2 are used, so those
 *                 bytes must be readable.
 * width/height    block size; the widest matching step (16/8/4/2) is used.
 * mx/my/mcbuffer  unused here.
 */
void ff_hevc_put_hevc_epel_pixels_8_sse(int16_t *dst, ptrdiff_t dststride,
                                        const uint8_t *_src, ptrdiff_t srcstride,
                                        int width, int height, int mx,
                                        int my, int16_t* mcbuffer) {
    const uint8_t *src = _src;
    const __m128i zero = _mm_setzero_si128();
    int x, y;

    (void) mx;
    (void) my;
    (void) mcbuffer;

    if (!(width & 15)) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 16) {
                const __m128i pix = _mm_loadu_si128((const __m128i *) &src[x]);
                const __m128i lo  = _mm_slli_epi16(_mm_unpacklo_epi8(pix, zero), 6);
                const __m128i hi  = _mm_slli_epi16(_mm_unpackhi_epi8(pix, zero), 6);

                _mm_store_si128((__m128i *) &dst[x], lo);
                _mm_store_si128((__m128i *) &dst[x + 8], hi);
            }
            src += srcstride;
            dst += dststride;
        }
    } else if (!(width & 7)) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 8) {
                const __m128i pix = _mm_loadl_epi64((const __m128i *) &src[x]);

                _mm_store_si128((__m128i *) &dst[x],
                                _mm_slli_epi16(_mm_unpacklo_epi8(pix, zero), 6));
            }
            src += srcstride;
            dst += dststride;
        }
    } else if (!(width & 3)) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 4) {
                const __m128i pix = _mm_loadl_epi64((const __m128i *) &src[x]);

                /* only the low four 16-bit results are written */
                _mm_storel_epi64((__m128i *) &dst[x],
                                 _mm_slli_epi16(_mm_unpacklo_epi8(pix, zero), 6));
            }
            src += srcstride;
            dst += dststride;
        }
    } else {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 2) {
                const __m128i pix = _mm_loadl_epi64((const __m128i *) &src[x]);
                const __m128i val = _mm_slli_epi16(_mm_unpacklo_epi8(pix, zero), 6);
#if MASKMOVE
                _mm_maskmoveu_si128(val,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
#else
                /* Write the two samples as int16_t elements.  Storing them
                   through a uint32_t pointer would break the effective-type
                   (strict aliasing) rules and could be misaligned. */
                dst[x]     = (int16_t) _mm_extract_epi16(val, 0);
                dst[x + 1] = (int16_t) _mm_extract_epi16(val, 1);
#endif
            }
            src += srcstride;
            dst += dststride;
        }
    }
}
1032
1033
#ifndef __native_client__
1034
/*
 * EPEL full-sample case for sources stored as 16-bit samples: every sample
 * is shifted left by 4 (the original comment gives this as
 * "14 - BIT LENGTH") and copied to dst.
 *
 * dst/dststride    output buffer and its stride in int16_t elements.  The
 *                  width%8==0 path uses aligned 128-bit stores, so every
 *                  output row must then be 16-byte aligned.
 * _src/_srcstride  source, reinterpreted as uint16_t samples; the stride is
 *                  given in bytes and halved to get a stride in samples.
 *                  The narrow paths always load 8 bytes per step even when
 *                  only 4 are used, so those bytes must be readable.
 * width/height     block size; the widest matching step (8/4/2) is used.
 * mx/my/mcbuffer   unused here.
 */
void ff_hevc_put_hevc_epel_pixels_10_sse(int16_t *dst, ptrdiff_t dststride,
                                         const uint8_t *_src, ptrdiff_t _srcstride,
                                         int width, int height, int mx,
                                         int my, int16_t* mcbuffer) {
    const uint16_t *src = (const uint16_t *) _src;
    const ptrdiff_t srcstride = _srcstride >> 1;   /* bytes -> samples */
    int x, y;

    (void) mx;
    (void) my;
    (void) mcbuffer;

    if (!(width & 7)) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 8) {
                const __m128i pix = _mm_loadu_si128((const __m128i *) &src[x]);

                _mm_store_si128((__m128i *) &dst[x], _mm_slli_epi16(pix, 4));
            }
            src += srcstride;
            dst += dststride;
        }
    } else if (!(width & 3)) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 4) {
                const __m128i pix = _mm_loadl_epi64((const __m128i *) &src[x]);

                _mm_storel_epi64((__m128i *) &dst[x], _mm_slli_epi16(pix, 4));
            }
            src += srcstride;
            dst += dststride;
        }
    } else {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 2) {
                const __m128i pix = _mm_loadl_epi64((const __m128i *) &src[x]);
                const __m128i val = _mm_slli_epi16(pix, 4);
#if MASKMOVE
                _mm_maskmoveu_si128(val,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
#else
                /* Same policy as the 8-bit variant: unless MASKMOVE is
                   enabled, write the two samples with ordinary stores
                   instead of the byte-masked non-temporal store. */
                dst[x]     = (int16_t) _mm_extract_epi16(val, 0);
                dst[x + 1] = (int16_t) _mm_extract_epi16(val, 1);
#endif
            }
            src += srcstride;
            dst += dststride;
        }
    }
}
1084
#endif
1085
1086
void ff_hevc_put_hevc_epel_h_8_sse(int16_t *dst, ptrdiff_t dststride,
1087
                                   const uint8_t *_src, ptrdiff_t _srcstride,
1088
                                   int width, int height, int mx,
1089
0
                                   int my, int16_t* mcbuffer, int bit_depth) {
1090
0
    int x, y;
1091
0
    const uint8_t *src = (const uint8_t*) _src;
1092
0
    ptrdiff_t srcstride = _srcstride;
1093
0
    const int8_t *filter = epel_filters[mx - 1];
1094
0
    __m128i r0, bshuffle1, bshuffle2, x1, x2, x3;
1095
0
    int8_t filter_0 = filter[0];
1096
0
    int8_t filter_1 = filter[1];
1097
0
    int8_t filter_2 = filter[2];
1098
0
    int8_t filter_3 = filter[3];
1099
0
    r0 = _mm_set_epi8(filter_3, filter_2, filter_1, filter_0, filter_3,
1100
0
            filter_2, filter_1, filter_0, filter_3, filter_2, filter_1,
1101
0
            filter_0, filter_3, filter_2, filter_1, filter_0);
1102
0
    bshuffle1 = _mm_set_epi8(6, 5, 4, 3, 5, 4, 3, 2, 4, 3, 2, 1, 3, 2, 1, 0);
1103
1104
1105
    /*
1106
  printf("---IN---SSE\n");
1107
1108
  int extra_top  = 1;
1109
  int extra_left = 1;
1110
  int extra_right  = 2;
1111
  int extra_bottom = 2;
1112
1113
  for (int y=-extra_top;y<height+extra_bottom;y++) {
1114
    uint8_t* p = &_src[y*_srcstride -extra_left];
1115
1116
    for (int x=-extra_left;x<width+extra_right;x++) {
1117
      printf("%05d ",*p << 6);
1118
      p++;
1119
    }
1120
    printf("\n");
1121
  }
1122
    */
1123
1124
0
    if(!(width & 7)){
1125
0
        bshuffle2 = _mm_set_epi8(10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5,
1126
0
                        4);
1127
0
                for (y = 0; y < height; y++) {
1128
0
                    for (x = 0; x < width; x += 8) {
1129
1130
0
                        x1 = _mm_loadu_si128((__m128i *) &src[x - 1]);
1131
0
                        x2 = _mm_shuffle_epi8(x1, bshuffle1);
1132
0
                        x3 = _mm_shuffle_epi8(x1, bshuffle2);
1133
1134
                        /*  PMADDUBSW then PMADDW     */
1135
0
                        x2 = _mm_maddubs_epi16(x2, r0);
1136
0
                        x3 = _mm_maddubs_epi16(x3, r0);
1137
0
                        x2 = _mm_hadd_epi16(x2, x3);
1138
0
                        _mm_store_si128((__m128i *) &dst[x], x2);
1139
0
                    }
1140
0
                    src += srcstride;
1141
0
                    dst += dststride;
1142
0
                }
1143
0
    }else if(!(width & 3)){
1144
1145
0
        for (y = 0; y < height; y++) {
1146
0
            for (x = 0; x < width; x += 4) {
1147
            /* load data in register     */
1148
0
            x1 = _mm_loadu_si128((__m128i *) &src[x-1]);
1149
0
            x2 = _mm_shuffle_epi8(x1, bshuffle1);
1150
1151
            /*  PMADDUBSW then PMADDW     */
1152
0
            x2 = _mm_maddubs_epi16(x2, r0);
1153
0
            x2 = _mm_hadd_epi16(x2, _mm_setzero_si128());
1154
            /* give results back            */
1155
0
            _mm_storel_epi64((__m128i *) &dst[x], x2);
1156
0
            }
1157
0
            src += srcstride;
1158
0
            dst += dststride;
1159
0
        }
1160
0
    }else{
1161
0
        for (y = 0; y < height; y++) {
1162
0
            for (x = 0; x < width; x += 2) {
1163
            /* load data in register     */
1164
0
            x1 = _mm_loadu_si128((__m128i *) &src[x-1]);
1165
0
            x2 = _mm_shuffle_epi8(x1, bshuffle1);
1166
1167
            /*  PMADDUBSW then PMADDW     */
1168
0
            x2 = _mm_maddubs_epi16(x2, r0);
1169
0
            x2 = _mm_hadd_epi16(x2, _mm_setzero_si128());
1170
            /* give results back            */
1171
#if MASKMOVE
1172
            _mm_maskmoveu_si128(x2,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
1173
#else
1174
0
            *((uint32_t*)(dst+x)) = _mm_cvtsi128_si32(x2);
1175
0
#endif
1176
0
            }
1177
0
            src += srcstride;
1178
0
            dst += dststride;
1179
0
        }
1180
0
    }
1181
0
}
1182
1183
#ifndef __native_client__
1184
void ff_hevc_put_hevc_epel_h_10_sse(int16_t *dst, ptrdiff_t dststride,
1185
                                    const uint8_t *_src, ptrdiff_t _srcstride,
1186
                                    int width, int height, int mx,
1187
0
                                    int my, int16_t* mcbuffer) {
1188
0
    int x, y;
1189
0
    uint16_t *src = (uint16_t*) _src;
1190
0
    ptrdiff_t srcstride = _srcstride>>1;
1191
0
    const int8_t *filter = epel_filters[mx - 1];
1192
0
    __m128i r0, bshuffle1, bshuffle2, x1, x2, x3, r1;
1193
0
    int8_t filter_0 = filter[0];
1194
0
    int8_t filter_1 = filter[1];
1195
0
    int8_t filter_2 = filter[2];
1196
0
    int8_t filter_3 = filter[3];
1197
0
    r0 = _mm_set_epi16(filter_3, filter_2, filter_1,
1198
0
            filter_0, filter_3, filter_2, filter_1, filter_0);
1199
0
    bshuffle1 = _mm_set_epi8(9,8,7,6,5,4, 3, 2,7,6,5,4, 3, 2, 1, 0);
1200
1201
0
    if(!(width & 3)){
1202
0
        bshuffle2 = _mm_set_epi8(13,12,11,10,9,8,7,6,11,10, 9,8,7,6,5, 4);
1203
0
        for (y = 0; y < height; y++) {
1204
0
            for (x = 0; x < width; x += 4) {
1205
1206
0
                x1 = _mm_loadu_si128((__m128i *) &src[x-1]);
1207
0
                x2 = _mm_shuffle_epi8(x1, bshuffle1);
1208
0
                x3 = _mm_shuffle_epi8(x1, bshuffle2);
1209
1210
1211
0
                x2 = _mm_madd_epi16(x2, r0);
1212
0
                x3 = _mm_madd_epi16(x3, r0);
1213
0
                x2 = _mm_hadd_epi32(x2, x3);
1214
0
                x2= _mm_srai_epi32(x2,2);   //>> (BIT_DEPTH - 8)
1215
1216
0
                x2 = _mm_packs_epi32(x2,r0);
1217
                //give results back
1218
0
                _mm_storel_epi64((__m128i *) &dst[x], x2);
1219
0
            }
1220
0
            src += srcstride;
1221
0
            dst += dststride;
1222
0
        }
1223
0
    }else{
1224
0
        r1= _mm_setzero_si128();
1225
0
        for (y = 0; y < height; y++) {
1226
0
            for (x = 0; x < width; x += 2) {
1227
                /* load data in register     */
1228
0
                x1 = _mm_loadu_si128((__m128i *) &src[x-1]);
1229
0
                x2 = _mm_shuffle_epi8(x1, bshuffle1);
1230
1231
                /*  PMADDUBSW then PMADDW     */
1232
0
                x2 = _mm_madd_epi16(x2, r0);
1233
0
                x2 = _mm_hadd_epi32(x2, r1);
1234
0
                x2= _mm_srai_epi32(x2,2);   //>> (BIT_DEPTH - 8)
1235
0
                x2 = _mm_packs_epi32(x2, r1);
1236
                /* give results back            */
1237
0
                _mm_maskmoveu_si128(x2,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
1238
0
            }
1239
0
            src += srcstride;
1240
0
            dst += dststride;
1241
0
        }
1242
0
    }
1243
0
}
1244
#endif
1245
1246
1247
void ff_hevc_put_hevc_epel_v_8_sse(int16_t *dst, ptrdiff_t dststride,
1248
                                   const uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
1249
0
                                   int my, int16_t* mcbuffer, int bit_depth) {
1250
0
    int x, y;
1251
0
    __m128i x0, x1, x2, x3, t0, t1, t2, t3, r0, f0, f1, f2, f3, r1;
1252
0
    uint8_t *src = (uint8_t*) _src;
1253
0
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
1254
0
    const int8_t *filter = epel_filters[my - 1];
1255
0
    int8_t filter_0 = filter[0];
1256
0
    int8_t filter_1 = filter[1];
1257
0
    int8_t filter_2 = filter[2];
1258
0
    int8_t filter_3 = filter[3];
1259
0
    f0 = _mm_set1_epi16(filter_0);
1260
0
    f1 = _mm_set1_epi16(filter_1);
1261
0
    f2 = _mm_set1_epi16(filter_2);
1262
0
    f3 = _mm_set1_epi16(filter_3);
1263
1264
0
    if(!(width & 15)){
1265
0
        for (y = 0; y < height; y++) {
1266
0
            for (x = 0; x < width; x += 16) {
1267
                /* check if memory needs to be reloaded */
1268
1269
0
                x0 = _mm_loadu_si128((__m128i *) &src[x - srcstride]);
1270
0
                x1 = _mm_loadu_si128((__m128i *) &src[x]);
1271
0
                x2 = _mm_loadu_si128((__m128i *) &src[x + srcstride]);
1272
0
                x3 = _mm_loadu_si128((__m128i *) &src[x + 2 * srcstride]);
1273
1274
0
                t0 = _mm_unpacklo_epi8(x0, _mm_setzero_si128());
1275
0
                t1 = _mm_unpacklo_epi8(x1, _mm_setzero_si128());
1276
0
                t2 = _mm_unpacklo_epi8(x2, _mm_setzero_si128());
1277
0
                t3 = _mm_unpacklo_epi8(x3, _mm_setzero_si128());
1278
1279
0
                x0 = _mm_unpackhi_epi8(x0, _mm_setzero_si128());
1280
0
                x1 = _mm_unpackhi_epi8(x1, _mm_setzero_si128());
1281
0
                x2 = _mm_unpackhi_epi8(x2, _mm_setzero_si128());
1282
0
                x3 = _mm_unpackhi_epi8(x3, _mm_setzero_si128());
1283
1284
                /* multiply by correct value : */
1285
0
                r0 = _mm_mullo_epi16(t0, f0);
1286
0
                r1 = _mm_mullo_epi16(x0, f0);
1287
0
                r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t1, f1));
1288
0
                r1 = _mm_adds_epi16(r1, _mm_mullo_epi16(x1, f1));
1289
0
                r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t2, f2));
1290
0
                r1 = _mm_adds_epi16(r1, _mm_mullo_epi16(x2, f2));
1291
0
                r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t3, f3));
1292
0
                r1 = _mm_adds_epi16(r1, _mm_mullo_epi16(x3, f3));
1293
                /* give results back            */
1294
0
                _mm_store_si128((__m128i *) &dst[x], r0);
1295
0
                _mm_storeu_si128((__m128i *) &dst[x + 8], r1);
1296
0
            }
1297
0
            src += srcstride;
1298
0
            dst += dststride;
1299
0
        }
1300
0
    }else if(!(width & 7)){
1301
0
        r1= _mm_setzero_si128();
1302
0
        for (y = 0; y < height; y++) {
1303
0
            for(x=0;x<width;x+=8){
1304
0
                x0 = _mm_loadl_epi64((__m128i *) &src[x - srcstride]);
1305
0
                x1 = _mm_loadl_epi64((__m128i *) &src[x]);
1306
0
                x2 = _mm_loadl_epi64((__m128i *) &src[x + srcstride]);
1307
0
                x3 = _mm_loadl_epi64((__m128i *) &src[x + 2 * srcstride]);
1308
1309
0
                t0 = _mm_unpacklo_epi8(x0, r1);
1310
0
                t1 = _mm_unpacklo_epi8(x1, r1);
1311
0
                t2 = _mm_unpacklo_epi8(x2, r1);
1312
0
                t3 = _mm_unpacklo_epi8(x3, r1);
1313
1314
1315
                /* multiply by correct value : */
1316
0
                r0 = _mm_mullo_epi16(t0, f0);
1317
0
                r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t1, f1));
1318
0
                r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t2, f2));
1319
0
                r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t3, f3));
1320
                /* give results back            */
1321
0
                _mm_storeu_si128((__m128i *) &dst[x], r0);
1322
0
            }
1323
0
            src += srcstride;
1324
0
            dst += dststride;
1325
0
        }
1326
0
    }else if(!(width & 3)){
1327
0
        r1= _mm_setzero_si128();
1328
0
        for (y = 0; y < height; y++) {
1329
0
            for(x=0;x<width;x+=4){
1330
0
                x0 = _mm_loadl_epi64((__m128i *) &src[x - srcstride]);
1331
0
                x1 = _mm_loadl_epi64((__m128i *) &src[x]);
1332
0
                x2 = _mm_loadl_epi64((__m128i *) &src[x + srcstride]);
1333
0
                x3 = _mm_loadl_epi64((__m128i *) &src[x + 2 * srcstride]);
1334
1335
0
                t0 = _mm_unpacklo_epi8(x0, r1);
1336
0
                t1 = _mm_unpacklo_epi8(x1, r1);
1337
0
                t2 = _mm_unpacklo_epi8(x2, r1);
1338
0
                t3 = _mm_unpacklo_epi8(x3, r1);
1339
1340
1341
                /* multiply by correct value : */
1342
0
                r0 = _mm_mullo_epi16(t0, f0);
1343
0
                r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t1, f1));
1344
0
                r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t2, f2));
1345
0
                r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t3, f3));
1346
                /* give results back            */
1347
0
                _mm_storel_epi64((__m128i *) &dst[x], r0);
1348
0
            }
1349
0
            src += srcstride;
1350
0
            dst += dststride;
1351
0
        }
1352
0
    }else{
1353
0
        r1= _mm_setzero_si128();
1354
0
        for (y = 0; y < height; y++) {
1355
0
            for(x=0;x<width;x+=2){
1356
0
                x0 = _mm_loadl_epi64((__m128i *) &src[x - srcstride]);
1357
0
                x1 = _mm_loadl_epi64((__m128i *) &src[x]);
1358
0
                x2 = _mm_loadl_epi64((__m128i *) &src[x + srcstride]);
1359
0
                x3 = _mm_loadl_epi64((__m128i *) &src[x + 2 * srcstride]);
1360
1361
0
                t0 = _mm_unpacklo_epi8(x0, r1);
1362
0
                t1 = _mm_unpacklo_epi8(x1, r1);
1363
0
                t2 = _mm_unpacklo_epi8(x2, r1);
1364
0
                t3 = _mm_unpacklo_epi8(x3, r1);
1365
1366
1367
                /* multiply by correct value : */
1368
0
                r0 = _mm_mullo_epi16(t0, f0);
1369
0
                r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t1, f1));
1370
0
                r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t2, f2));
1371
0
                r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t3, f3));
1372
                /* give results back            */
1373
#if MASKMOVE
1374
                _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
1375
#else
1376
0
                *((uint32_t*)(dst+x)) = _mm_cvtsi128_si32(r0);
1377
0
#endif
1378
0
            }
1379
0
            src += srcstride;
1380
0
            dst += dststride;
1381
0
        }
1382
0
    }
1383
0
}
1384
1385
#ifndef __native_client__
1386
void ff_hevc_put_hevc_epel_v_10_sse(int16_t *dst, ptrdiff_t dststride,
1387
                                    const uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
1388
0
        int my, int16_t* mcbuffer) {
1389
0
    int x, y;
1390
0
    __m128i x0, x1, x2, x3, t0, t1, t2, t3, r0, f0, f1, f2, f3, r1, r2, r3;
1391
0
    uint16_t *src = (uint16_t*) _src;
1392
0
    ptrdiff_t srcstride = _srcstride >>1;
1393
0
    const int8_t *filter = epel_filters[my - 1];
1394
0
    int8_t filter_0 = filter[0];
1395
0
    int8_t filter_1 = filter[1];
1396
0
    int8_t filter_2 = filter[2];
1397
0
    int8_t filter_3 = filter[3];
1398
0
    f0 = _mm_set1_epi16(filter_0);
1399
0
    f1 = _mm_set1_epi16(filter_1);
1400
0
    f2 = _mm_set1_epi16(filter_2);
1401
0
    f3 = _mm_set1_epi16(filter_3);
1402
1403
0
    if(!(width & 7)){
1404
0
        r1= _mm_setzero_si128();
1405
0
        for (y = 0; y < height; y++) {
1406
0
            for(x=0;x<width;x+=8){
1407
0
                x0 = _mm_loadu_si128((__m128i *) &src[x - srcstride]);
1408
0
                x1 = _mm_loadu_si128((__m128i *) &src[x]);
1409
0
                x2 = _mm_loadu_si128((__m128i *) &src[x + srcstride]);
1410
0
                x3 = _mm_loadu_si128((__m128i *) &src[x + 2 * srcstride]);
1411
1412
                // multiply by correct value :
1413
0
                r0 = _mm_mullo_epi16(x0, f0);
1414
0
                t0 = _mm_mulhi_epi16(x0, f0);
1415
1416
0
                x0= _mm_unpacklo_epi16(r0,t0);
1417
0
                t0= _mm_unpackhi_epi16(r0,t0);
1418
1419
0
                r1 = _mm_mullo_epi16(x1, f1);
1420
0
                t1 = _mm_mulhi_epi16(x1, f1);
1421
1422
0
                x1= _mm_unpacklo_epi16(r1,t1);
1423
0
                t1= _mm_unpackhi_epi16(r1,t1);
1424
1425
1426
0
                r2 = _mm_mullo_epi16(x2, f2);
1427
0
                t2 = _mm_mulhi_epi16(x2, f2);
1428
1429
0
                x2= _mm_unpacklo_epi16(r2,t2);
1430
0
                t2= _mm_unpackhi_epi16(r2,t2);
1431
1432
1433
0
                r3 = _mm_mullo_epi16(x3, f3);
1434
0
                t3 = _mm_mulhi_epi16(x3, f3);
1435
1436
0
                x3= _mm_unpacklo_epi16(r3,t3);
1437
0
                t3= _mm_unpackhi_epi16(r3,t3);
1438
1439
1440
0
                r0= _mm_add_epi32(x0,x1);
1441
0
                r1= _mm_add_epi32(x2,x3);
1442
1443
0
                t0= _mm_add_epi32(t0,t1);
1444
0
                t1= _mm_add_epi32(t2,t3);
1445
1446
0
                r0= _mm_add_epi32(r0,r1);
1447
0
                t0= _mm_add_epi32(t0,t1);
1448
1449
0
                r0= _mm_srai_epi32(r0,2);//>> (BIT_DEPTH - 8)
1450
0
                t0= _mm_srai_epi32(t0,2);//>> (BIT_DEPTH - 8)
1451
1452
0
                r0= _mm_packs_epi32(r0, t0);
1453
                // give results back
1454
0
                _mm_storeu_si128((__m128i *) &dst[x], r0);
1455
0
            }
1456
0
            src += srcstride;
1457
0
            dst += dststride;
1458
0
        }
1459
0
    }else if(!(width & 3)){
1460
0
        r1= _mm_setzero_si128();
1461
0
        for (y = 0; y < height; y++) {
1462
0
            for(x=0;x<width;x+=4){
1463
0
                x0 = _mm_loadl_epi64((__m128i *) &src[x - srcstride]);
1464
0
                x1 = _mm_loadl_epi64((__m128i *) &src[x]);
1465
0
                x2 = _mm_loadl_epi64((__m128i *) &src[x + srcstride]);
1466
0
                x3 = _mm_loadl_epi64((__m128i *) &src[x + 2 * srcstride]);
1467
1468
                /* multiply by correct value : */
1469
0
                r0 = _mm_mullo_epi16(x0, f0);
1470
0
                t0 = _mm_mulhi_epi16(x0, f0);
1471
1472
0
                x0= _mm_unpacklo_epi16(r0,t0);
1473
1474
0
                r1 = _mm_mullo_epi16(x1, f1);
1475
0
                t1 = _mm_mulhi_epi16(x1, f1);
1476
1477
0
                x1= _mm_unpacklo_epi16(r1,t1);
1478
1479
1480
0
                r2 = _mm_mullo_epi16(x2, f2);
1481
0
                t2 = _mm_mulhi_epi16(x2, f2);
1482
1483
0
                x2= _mm_unpacklo_epi16(r2,t2);
1484
1485
1486
0
                r3 = _mm_mullo_epi16(x3, f3);
1487
0
                t3 = _mm_mulhi_epi16(x3, f3);
1488
1489
0
                x3= _mm_unpacklo_epi16(r3,t3);
1490
1491
1492
0
                r0= _mm_add_epi32(x0,x1);
1493
0
                r1= _mm_add_epi32(x2,x3);
1494
0
                r0= _mm_add_epi32(r0,r1);
1495
0
                r0= _mm_srai_epi32(r0,2);//>> (BIT_DEPTH - 8)
1496
1497
0
                r0= _mm_packs_epi32(r0, r0);
1498
1499
                // give results back
1500
0
                _mm_storel_epi64((__m128i *) &dst[x], r0);
1501
0
            }
1502
0
            src += srcstride;
1503
0
            dst += dststride;
1504
0
        }
1505
0
    }else{
1506
0
        r1= _mm_setzero_si128();
1507
0
        for (y = 0; y < height; y++) {
1508
0
            for(x=0;x<width;x+=2){
1509
0
                x0 = _mm_loadl_epi64((__m128i *) &src[x - srcstride]);
1510
0
                x1 = _mm_loadl_epi64((__m128i *) &src[x]);
1511
0
                x2 = _mm_loadl_epi64((__m128i *) &src[x + srcstride]);
1512
0
                x3 = _mm_loadl_epi64((__m128i *) &src[x + 2 * srcstride]);
1513
1514
                /* multiply by correct value : */
1515
0
                r0 = _mm_mullo_epi16(x0, f0);
1516
0
                t0 = _mm_mulhi_epi16(x0, f0);
1517
1518
0
                x0= _mm_unpacklo_epi16(r0,t0);
1519
1520
0
                r1 = _mm_mullo_epi16(x1, f1);
1521
0
                t1 = _mm_mulhi_epi16(x1, f1);
1522
1523
0
                x1= _mm_unpacklo_epi16(r1,t1);
1524
1525
0
                r2 = _mm_mullo_epi16(x2, f2);
1526
0
                t2 = _mm_mulhi_epi16(x2, f2);
1527
1528
0
                x2= _mm_unpacklo_epi16(r2,t2);
1529
1530
0
                r3 = _mm_mullo_epi16(x3, f3);
1531
0
                t3 = _mm_mulhi_epi16(x3, f3);
1532
1533
0
                x3= _mm_unpacklo_epi16(r3,t3);
1534
1535
0
                r0= _mm_add_epi32(x0,x1);
1536
0
                r1= _mm_add_epi32(x2,x3);
1537
0
                r0= _mm_add_epi32(r0,r1);
1538
0
                r0= _mm_srai_epi32(r0,2);//>> (BIT_DEPTH - 8)
1539
1540
0
                r0= _mm_packs_epi32(r0, r0);
1541
1542
                /* give results back            */
1543
0
                _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x));
1544
1545
0
            }
1546
0
            src += srcstride;
1547
0
            dst += dststride;
1548
0
        }
1549
0
    }
1550
0
}
1551
#endif
1552
1553
void ff_hevc_put_hevc_epel_hv_8_sse(int16_t *dst, ptrdiff_t dststride,
1554
                                    const uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
1555
0
                                    int my, int16_t* mcbuffer, int bit_depth) {
1556
0
  int x, y;
1557
0
  uint8_t *src = (uint8_t*) _src;
1558
0
  ptrdiff_t srcstride = _srcstride;
1559
0
  const int8_t *filter_h = epel_filters[mx - 1];
1560
0
  const int8_t *filter_v = epel_filters[my - 1];
1561
0
  __m128i r0, bshuffle1, bshuffle2, x0, x1, x2, x3, t0, t1, t2, t3, f0, f1,
1562
0
  f2, f3, r1, r2;
1563
0
  int8_t filter_0 = filter_h[0];
1564
0
  int8_t filter_1 = filter_h[1];
1565
0
  int8_t filter_2 = filter_h[2];
1566
0
  int8_t filter_3 = filter_h[3];
1567
0
  int16_t *tmp = mcbuffer;
1568
0
  r0 = _mm_set_epi8(filter_3, filter_2, filter_1, filter_0, filter_3,
1569
0
      filter_2, filter_1, filter_0, filter_3, filter_2, filter_1,
1570
0
      filter_0, filter_3, filter_2, filter_1, filter_0);
1571
0
  bshuffle1 = _mm_set_epi8(6, 5, 4, 3, 5, 4, 3, 2, 4, 3, 2, 1, 3, 2, 1, 0);
1572
1573
0
  src -= epel_extra_before * srcstride;
1574
1575
0
  f3 = _mm_set1_epi16(filter_v[3]);
1576
0
  f1 = _mm_set1_epi16(filter_v[1]);
1577
0
  f2 = _mm_set1_epi16(filter_v[2]);
1578
0
  f0 = _mm_set1_epi16(filter_v[0]);
1579
1580
  /* horizontal treatment */
1581
0
  if(!(width & 7)){
1582
0
    bshuffle2 = _mm_set_epi8(10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5,
1583
0
        4);
1584
0
    for (y = 0; y < height + epel_extra; y++) {
1585
0
      for (x = 0; x < width; x += 8) {
1586
1587
0
        x1 = _mm_loadu_si128((__m128i *) &src[x - 1]);
1588
0
        x2 = _mm_shuffle_epi8(x1, bshuffle1);
1589
0
        x3 = _mm_shuffle_epi8(x1, bshuffle2);
1590
1591
        /*  PMADDUBSW then PMADDW     */
1592
0
        x2 = _mm_maddubs_epi16(x2, r0);
1593
0
        x3 = _mm_maddubs_epi16(x3, r0);
1594
0
        x2 = _mm_hadd_epi16(x2, x3);
1595
0
        _mm_store_si128((__m128i *) &tmp[x], x2);
1596
0
      }
1597
0
      src += srcstride;
1598
0
      tmp += MAX_PB_SIZE;
1599
0
    }
1600
0
    tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE;
1601
1602
    /* vertical treatment */
1603
1604
0
    for (y = 0; y < height; y++) {
1605
0
      for (x = 0; x < width; x += 8) {
1606
        /* check if memory needs to be reloaded */
1607
0
        x0 = _mm_load_si128((__m128i *) &tmp[x - MAX_PB_SIZE]);
1608
0
        x1 = _mm_load_si128((__m128i *) &tmp[x]);
1609
0
        x2 = _mm_load_si128((__m128i *) &tmp[x + MAX_PB_SIZE]);
1610
0
        x3 = _mm_load_si128((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]);
1611
1612
0
        r0 = _mm_mullo_epi16(x0, f0);
1613
0
        r1 = _mm_mulhi_epi16(x0, f0);
1614
0
        r2 = _mm_mullo_epi16(x1, f1);
1615
0
        t0 = _mm_unpacklo_epi16(r0, r1);
1616
0
        x0 = _mm_unpackhi_epi16(r0, r1);
1617
0
        r0 = _mm_mulhi_epi16(x1, f1);
1618
0
        r1 = _mm_mullo_epi16(x2, f2);
1619
0
        t1 = _mm_unpacklo_epi16(r2, r0);
1620
0
        x1 = _mm_unpackhi_epi16(r2, r0);
1621
0
        r2 = _mm_mulhi_epi16(x2, f2);
1622
0
        r0 = _mm_mullo_epi16(x3, f3);
1623
0
        t2 = _mm_unpacklo_epi16(r1, r2);
1624
0
        x2 = _mm_unpackhi_epi16(r1, r2);
1625
0
        r1 = _mm_mulhi_epi16(x3, f3);
1626
0
        t3 = _mm_unpacklo_epi16(r0, r1);
1627
0
        x3 = _mm_unpackhi_epi16(r0, r1);
1628
1629
        /* multiply by correct value : */
1630
0
        r0 = _mm_add_epi32(t0, t1);
1631
0
        r1 = _mm_add_epi32(x0, x1);
1632
0
        r0 = _mm_add_epi32(r0, t2);
1633
0
        r1 = _mm_add_epi32(r1, x2);
1634
0
        r0 = _mm_add_epi32(r0, t3);
1635
0
        r1 = _mm_add_epi32(r1, x3);
1636
0
        r0 = _mm_srai_epi32(r0, 6);
1637
0
        r1 = _mm_srai_epi32(r1, 6);
1638
1639
        /* give results back            */
1640
0
        r0 = _mm_packs_epi32(r0, r1);
1641
0
        _mm_store_si128((__m128i *) &dst[x], r0);
1642
0
      }
1643
0
      tmp += MAX_PB_SIZE;
1644
0
      dst += dststride;
1645
0
    }
1646
0
  }else if(!(width & 3)){
1647
0
    for (y = 0; y < height + epel_extra; y ++) {
1648
0
      for(x=0;x<width;x+=4){
1649
        /* load data in register     */
1650
0
        x1 = _mm_loadl_epi64((__m128i *) &src[x-1]);
1651
1652
0
        x1 = _mm_shuffle_epi8(x1, bshuffle1);
1653
1654
        /*  PMADDUBSW then PMADDW     */
1655
0
        x1 = _mm_maddubs_epi16(x1, r0);
1656
0
        x1 = _mm_hadd_epi16(x1, _mm_setzero_si128());
1657
1658
        /* give results back            */
1659
0
        _mm_storel_epi64((__m128i *) &tmp[x], x1);
1660
1661
0
      }
1662
0
      src += srcstride;
1663
0
      tmp += MAX_PB_SIZE;
1664
0
    }
1665
0
    tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE;
1666
1667
    /* vertical treatment */
1668
1669
1670
0
    for (y = 0; y < height; y++) {
1671
0
      for (x = 0; x < width; x += 4) {
1672
        /* check if memory needs to be reloaded */
1673
0
        x0 = _mm_loadl_epi64((__m128i *) &tmp[x - MAX_PB_SIZE]);
1674
0
        x1 = _mm_loadl_epi64((__m128i *) &tmp[x]);
1675
0
        x2 = _mm_loadl_epi64((__m128i *) &tmp[x + MAX_PB_SIZE]);
1676
0
        x3 = _mm_loadl_epi64((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]);
1677
1678
0
        r0 = _mm_mullo_epi16(x0, f0);
1679
0
        r1 = _mm_mulhi_epi16(x0, f0);
1680
0
        r2 = _mm_mullo_epi16(x1, f1);
1681
0
        t0 = _mm_unpacklo_epi16(r0, r1);
1682
1683
0
        r0 = _mm_mulhi_epi16(x1, f1);
1684
0
        r1 = _mm_mullo_epi16(x2, f2);
1685
0
        t1 = _mm_unpacklo_epi16(r2, r0);
1686
1687
0
        r2 = _mm_mulhi_epi16(x2, f2);
1688
0
        r0 = _mm_mullo_epi16(x3, f3);
1689
0
        t2 = _mm_unpacklo_epi16(r1, r2);
1690
1691
0
        r1 = _mm_mulhi_epi16(x3, f3);
1692
0
        t3 = _mm_unpacklo_epi16(r0, r1);
1693
1694
1695
        /* multiply by correct value : */
1696
0
        r0 = _mm_add_epi32(t0, t1);
1697
0
        r0 = _mm_add_epi32(r0, t2);
1698
0
        r0 = _mm_add_epi32(r0, t3);
1699
0
        r0 = _mm_srai_epi32(r0, 6);
1700
1701
        /* give results back            */
1702
0
        r0 = _mm_packs_epi32(r0, r0);
1703
0
        _mm_storel_epi64((__m128i *) &dst[x], r0);
1704
0
      }
1705
0
      tmp += MAX_PB_SIZE;
1706
0
      dst += dststride;
1707
0
    }
1708
0
  }else{
1709
#if MASKMOVE
1710
    bshuffle2=_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1);
1711
#endif
1712
0
    for (y = 0; y < height + epel_extra; y ++) {
1713
0
      for(x=0;x<width;x+=2){
1714
        /* load data in register     */
1715
0
        x1 = _mm_loadl_epi64((__m128i *) &src[x-1]);
1716
0
        x1 = _mm_shuffle_epi8(x1, bshuffle1);
1717
1718
        /*  PMADDUBSW then PMADDW     */
1719
0
        x1 = _mm_maddubs_epi16(x1, r0);
1720
0
        x1 = _mm_hadd_epi16(x1, _mm_setzero_si128());
1721
1722
        /* give results back            */
1723
#if MASKMOVE
1724
        _mm_maskmoveu_si128(x1,bshuffle2,(char *) (tmp+x));
1725
#else
1726
0
                                *((uint32_t*)(tmp+x)) = _mm_cvtsi128_si32(x1);
1727
0
#endif
1728
0
      }
1729
0
      src += srcstride;
1730
0
      tmp += MAX_PB_SIZE;
1731
0
    }
1732
1733
0
    tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE;
1734
1735
    /* vertical treatment */
1736
1737
0
    for (y = 0; y < height; y++) {
1738
0
      for (x = 0; x < width; x += 2) {
1739
        /* check if memory needs to be reloaded */
1740
0
        x0 = _mm_loadl_epi64((__m128i *) &tmp[x - MAX_PB_SIZE]);
1741
0
        x1 = _mm_loadl_epi64((__m128i *) &tmp[x]);
1742
0
        x2 = _mm_loadl_epi64((__m128i *) &tmp[x + MAX_PB_SIZE]);
1743
0
        x3 = _mm_loadl_epi64((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]);
1744
1745
0
        r0 = _mm_mullo_epi16(x0, f0);
1746
0
        r1 = _mm_mulhi_epi16(x0, f0);
1747
0
        r2 = _mm_mullo_epi16(x1, f1);
1748
0
        t0 = _mm_unpacklo_epi16(r0, r1);
1749
0
        r0 = _mm_mulhi_epi16(x1, f1);
1750
0
        r1 = _mm_mullo_epi16(x2, f2);
1751
0
        t1 = _mm_unpacklo_epi16(r2, r0);
1752
0
        r2 = _mm_mulhi_epi16(x2, f2);
1753
0
        r0 = _mm_mullo_epi16(x3, f3);
1754
0
        t2 = _mm_unpacklo_epi16(r1, r2);
1755
0
        r1 = _mm_mulhi_epi16(x3, f3);
1756
0
        t3 = _mm_unpacklo_epi16(r0, r1);
1757
1758
        /* multiply by correct value : */
1759
0
        r0 = _mm_add_epi32(t0, t1);
1760
0
        r0 = _mm_add_epi32(r0, t2);
1761
0
        r0 = _mm_add_epi32(r0, t3);
1762
0
        r0 = _mm_srai_epi32(r0, 6);
1763
        /* give results back            */
1764
0
        r0 = _mm_packs_epi32(r0, r0);
1765
#if MASKMOVE
1766
        _mm_maskmoveu_si128(r0,bshuffle2,(char *) (dst+x));
1767
#else
1768
0
                                *((uint32_t*)(dst+x)) = _mm_cvtsi128_si32(r0);
1769
0
#endif
1770
0
      }
1771
0
      tmp += MAX_PB_SIZE;
1772
0
      dst += dststride;
1773
0
    }
1774
0
  }
1775
1776
0
}
1777
1778
1779
#ifndef __native_client__
1780
void ff_hevc_put_hevc_epel_hv_10_sse(int16_t *dst, ptrdiff_t dststride,
1781
                                     const uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
1782
0
        int my, int16_t* mcbuffer) {
1783
0
    int x, y;
1784
0
    uint16_t *src = (uint16_t*) _src;
1785
0
    ptrdiff_t srcstride = _srcstride>>1;
1786
0
    const int8_t *filter_h = epel_filters[mx - 1];
1787
0
    const int8_t *filter_v = epel_filters[my - 1];
1788
0
    __m128i r0, bshuffle1, bshuffle2, x0, x1, x2, x3, t0, t1, t2, t3, f0, f1,
1789
0
    f2, f3, r1, r2, r3;
1790
0
    int8_t filter_0 = filter_h[0];
1791
0
    int8_t filter_1 = filter_h[1];
1792
0
    int8_t filter_2 = filter_h[2];
1793
0
    int8_t filter_3 = filter_h[3];
1794
0
    int16_t *tmp = mcbuffer;
1795
1796
0
    r0 = _mm_set_epi16(filter_3, filter_2, filter_1,
1797
0
                filter_0, filter_3, filter_2, filter_1, filter_0);
1798
0
        bshuffle1 = _mm_set_epi8(9,8,7,6,5,4, 3, 2,7,6,5,4, 3, 2, 1, 0);
1799
1800
0
    src -= epel_extra_before * srcstride;
1801
1802
0
    f0 = _mm_set1_epi16(filter_v[0]);
1803
0
    f1 = _mm_set1_epi16(filter_v[1]);
1804
0
    f2 = _mm_set1_epi16(filter_v[2]);
1805
0
    f3 = _mm_set1_epi16(filter_v[3]);
1806
1807
1808
    /* horizontal treatment */
1809
0
    if(!(width & 3)){
1810
0
        bshuffle2 = _mm_set_epi8(13,12,11,10,9,8,7,6,11,10, 9,8,7,6,5, 4);
1811
0
        for (y = 0; y < height + epel_extra; y ++) {
1812
0
            for(x=0;x<width;x+=4){
1813
1814
0
                x1 = _mm_loadu_si128((__m128i *) &src[x-1]);
1815
0
                x2 = _mm_shuffle_epi8(x1, bshuffle1);
1816
0
                x3 = _mm_shuffle_epi8(x1, bshuffle2);
1817
1818
1819
0
                x2 = _mm_madd_epi16(x2, r0);
1820
0
                x3 = _mm_madd_epi16(x3, r0);
1821
0
                x2 = _mm_hadd_epi32(x2, x3);
1822
0
                x2= _mm_srai_epi32(x2,2);   //>> (BIT_DEPTH - 8)
1823
1824
0
                x2 = _mm_packs_epi32(x2,r0);
1825
                //give results back
1826
0
                _mm_storel_epi64((__m128i *) &tmp[x], x2);
1827
1828
0
            }
1829
0
            src += srcstride;
1830
0
            tmp += MAX_PB_SIZE;
1831
0
        }
1832
0
        tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE;
1833
1834
        // vertical treatment
1835
1836
1837
0
        for (y = 0; y < height; y++) {
1838
0
            for (x = 0; x < width; x += 4) {
1839
0
                x0 = _mm_loadl_epi64((__m128i *) &tmp[x - MAX_PB_SIZE]);
1840
0
                x1 = _mm_loadl_epi64((__m128i *) &tmp[x]);
1841
0
                x2 = _mm_loadl_epi64((__m128i *) &tmp[x + MAX_PB_SIZE]);
1842
0
                x3 = _mm_loadl_epi64((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]);
1843
1844
0
                r0 = _mm_mullo_epi16(x0, f0);
1845
0
                r1 = _mm_mulhi_epi16(x0, f0);
1846
0
                r2 = _mm_mullo_epi16(x1, f1);
1847
0
                t0 = _mm_unpacklo_epi16(r0, r1);
1848
1849
0
                r0 = _mm_mulhi_epi16(x1, f1);
1850
0
                r1 = _mm_mullo_epi16(x2, f2);
1851
0
                t1 = _mm_unpacklo_epi16(r2, r0);
1852
1853
0
                r2 = _mm_mulhi_epi16(x2, f2);
1854
0
                r0 = _mm_mullo_epi16(x3, f3);
1855
0
                t2 = _mm_unpacklo_epi16(r1, r2);
1856
1857
0
                r1 = _mm_mulhi_epi16(x3, f3);
1858
0
                t3 = _mm_unpacklo_epi16(r0, r1);
1859
1860
1861
1862
0
                r0 = _mm_add_epi32(t0, t1);
1863
0
                r0 = _mm_add_epi32(r0, t2);
1864
0
                r0 = _mm_add_epi32(r0, t3);
1865
0
                r0 = _mm_srai_epi32(r0, 6);
1866
1867
                // give results back
1868
0
                r0 = _mm_packs_epi32(r0, r0);
1869
0
                _mm_storel_epi64((__m128i *) &dst[x], r0);
1870
0
            }
1871
0
            tmp += MAX_PB_SIZE;
1872
0
            dst += dststride;
1873
0
        }
1874
0
    }else{
1875
0
        bshuffle2=_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1);
1876
0
        r1= _mm_setzero_si128();
1877
0
        for (y = 0; y < height + epel_extra; y ++) {
1878
0
            for(x=0;x<width;x+=2){
1879
                /* load data in register     */
1880
0
                x1 = _mm_loadu_si128((__m128i *) &src[x-1]);
1881
0
                x2 = _mm_shuffle_epi8(x1, bshuffle1);
1882
1883
                /*  PMADDUBSW then PMADDW     */
1884
0
                x2 = _mm_madd_epi16(x2, r0);
1885
0
                x2 = _mm_hadd_epi32(x2, r1);
1886
0
                x2= _mm_srai_epi32(x2,2);   //>> (BIT_DEPTH - 8)
1887
0
                x2 = _mm_packs_epi32(x2, r1);
1888
                /* give results back            */
1889
0
                _mm_maskmoveu_si128(x2,bshuffle2,(char *) (tmp+x));
1890
0
            }
1891
0
            src += srcstride;
1892
0
            tmp += MAX_PB_SIZE;
1893
0
        }
1894
1895
0
        tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE;
1896
1897
        /* vertical treatment */
1898
1899
0
        for (y = 0; y < height; y++) {
1900
0
            for (x = 0; x < width; x += 2) {
1901
                /* check if memory needs to be reloaded */
1902
0
                x0 = _mm_loadl_epi64((__m128i *) &tmp[x - MAX_PB_SIZE]);
1903
0
                x1 = _mm_loadl_epi64((__m128i *) &tmp[x]);
1904
0
                x2 = _mm_loadl_epi64((__m128i *) &tmp[x + MAX_PB_SIZE]);
1905
0
                x3 = _mm_loadl_epi64((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]);
1906
1907
0
                r0 = _mm_mullo_epi16(x0, f0);
1908
0
                t0 = _mm_mulhi_epi16(x0, f0);
1909
1910
0
                x0= _mm_unpacklo_epi16(r0,t0);
1911
1912
0
                r1 = _mm_mullo_epi16(x1, f1);
1913
0
                t1 = _mm_mulhi_epi16(x1, f1);
1914
1915
0
                x1= _mm_unpacklo_epi16(r1,t1);
1916
1917
0
                r2 = _mm_mullo_epi16(x2, f2);
1918
0
                t2 = _mm_mulhi_epi16(x2, f2);
1919
1920
0
                x2= _mm_unpacklo_epi16(r2,t2);
1921
1922
0
                r3 = _mm_mullo_epi16(x3, f3);
1923
0
                t3 = _mm_mulhi_epi16(x3, f3);
1924
1925
0
                x3= _mm_unpacklo_epi16(r3,t3);
1926
1927
0
                r0= _mm_add_epi32(x0,x1);
1928
0
                r1= _mm_add_epi32(x2,x3);
1929
0
                r0= _mm_add_epi32(r0,r1);
1930
0
                r0 = _mm_srai_epi32(r0, 6);
1931
                /* give results back            */
1932
0
                r0 = _mm_packs_epi32(r0, r0);
1933
0
                _mm_maskmoveu_si128(r0,bshuffle2,(char *) (dst+x));
1934
0
            }
1935
0
            tmp += MAX_PB_SIZE;
1936
0
            dst += dststride;
1937
0
        }
1938
0
    }
1939
0
}
1940
#endif
1941
1942
/*
 * Full-sample "copy" for 8-bit input: every output is the source sample
 * zero-extended to 16 bits and shifted left by 6.
 *
 * dst / dststride   : output samples and their row pitch in int16_t units
 * _src / _srcstride : input samples and their row pitch in bytes
 * width / height    : block size; the widest path whose step divides width
 *                     (16, 8, 4, otherwise 2) is used
 * mcbuffer          : not used
 *
 * Fixes relative to the previous version:
 *  - the 2-wide fallback now stores both results of each step (without
 *    MASKMOVE only one 16-bit value was written, leaving odd columns
 *    untouched);
 *  - the MASKMOVE store mask is declared here (it used an undeclared x4);
 *  - the 8- and 4-wide paths load only the 8 bytes they consume instead of 16.
 */
void ff_hevc_put_hevc_qpel_pixels_8_sse(int16_t *dst, ptrdiff_t dststride,
                                        const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
        int16_t* mcbuffer) {
    int x, y;
    __m128i x1, x2, x3;
    const uint8_t *src = _src;
    const ptrdiff_t srcstride = _srcstride;
    const __m128i x0 = _mm_setzero_si128();
#if MASKMOVE
    /* enables the low four bytes (two 16-bit results) of a masked store */
    const __m128i x4 = _mm_set_epi32(0,0,0,-1);
#endif

    (void) mcbuffer;

    if(!(width & 15)){
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 16) {
                x1 = _mm_loadu_si128((const __m128i *) &src[x]);
                x2 = _mm_unpacklo_epi8(x1, x0);   /* samples 0..7  -> 16 bit */
                x3 = _mm_unpackhi_epi8(x1, x0);   /* samples 8..15 -> 16 bit */

                x2 = _mm_slli_epi16(x2, 6);
                x3 = _mm_slli_epi16(x3, 6);
                _mm_storeu_si128((__m128i *) &dst[x], x2);
                _mm_storeu_si128((__m128i *) &dst[x + 8], x3);
            }
            src += srcstride;
            dst += dststride;
        }
    }else if(!(width & 7)){
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 8) {
                /* only the low 8 bytes are unpacked, so load just those */
                x1 = _mm_loadl_epi64((const __m128i *) &src[x]);
                x2 = _mm_unpacklo_epi8(x1, x0);
                x2 = _mm_slli_epi16(x2, 6);
                _mm_storeu_si128((__m128i *) &dst[x], x2);
            }
            src += srcstride;
            dst += dststride;
        }
    }else if(!(width & 3)){
        for (y = 0; y < height; y++) {
            for(x=0;x<width;x+=4){
                x1 = _mm_loadl_epi64((const __m128i *) &src[x]);
                x2 = _mm_unpacklo_epi8(x1, x0);
                x2 = _mm_slli_epi16(x2, 6);
                _mm_storel_epi64((__m128i *) &dst[x], x2);
            }
            src += srcstride;
            dst += dststride;
        }
    }else{
        for (y = 0; y < height; y++) {
            for(x=0;x<width;x+=2){
                x1 = _mm_loadl_epi64((const __m128i *) &src[x]);
                x2 = _mm_unpacklo_epi8(x1, x0);
                x2 = _mm_slli_epi16(x2, 6);
#if MASKMOVE
                _mm_maskmoveu_si128(x2,x4,(char *) (dst+x));
#else
                /* the loop advances two samples per step: store both */
                dst[x]     = (int16_t) _mm_extract_epi16(x2, 0);
                dst[x + 1] = (int16_t) _mm_extract_epi16(x2, 1);
#endif
            }
            src += srcstride;
            dst += dststride;
        }
    }
}
2014
2015
#ifndef __native_client__
2016
/*
 * Full-sample "copy" for 16-bit input: every output is the source sample
 * shifted left by 4.
 *
 * dst / dststride   : output samples and their row pitch in int16_t units
 * _src / _srcstride : input, read as uint16_t; the stride is halved to get
 *                     the pitch in samples
 * width / height    : block size; steps of 8, 4 or 2 samples are used
 *                     depending on which of them divides width
 * mcbuffer          : not used
 */
void ff_hevc_put_hevc_qpel_pixels_10_sse(int16_t *dst, ptrdiff_t dststride,
                                         const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
        int16_t* mcbuffer) {
    const uint16_t *row = (const uint16_t *) _src;
    const ptrdiff_t pitch = _srcstride >> 1;
    int col, line;

    if ((width & 7) == 0) {
        /* eight samples per step */
        for (line = 0; line < height; ++line, row += pitch, dst += dststride) {
            for (col = 0; col < width; col += 8) {
                const __m128i in = _mm_loadu_si128((const __m128i *) (row + col));
                _mm_storeu_si128((__m128i *) (dst + col), _mm_slli_epi16(in, 4)); /* 14 - BIT_DEPTH */
            }
        }
    } else if ((width & 3) == 0) {
        /* four samples per step */
        for (line = 0; line < height; ++line, row += pitch, dst += dststride) {
            for (col = 0; col < width; col += 4) {
                const __m128i in = _mm_loadl_epi64((const __m128i *) (row + col));
                _mm_storel_epi64((__m128i *) (dst + col), _mm_slli_epi16(in, 4)); /* 14 - BIT_DEPTH */
            }
        }
    } else {
        /* two samples per step; the mask enables only the low four bytes */
        const __m128i low_dword = _mm_set_epi32(0,0,0,-1);

        for (line = 0; line < height; ++line, row += pitch, dst += dststride) {
            for (col = 0; col < width; col += 2) {
                const __m128i in = _mm_loadl_epi64((const __m128i *) (row + col));
                _mm_maskmoveu_si128(_mm_slli_epi16(in, 4), low_dword, (char *) (dst + col)); /* 14 - BIT_DEPTH */
            }
        }
    }
}
2060
#endif
2061
2062
2063
/*
 * Horizontal 8-tap interpolation of 8-bit samples with the taps
 * { -1, 4, -10, 58, 17, -5, 1, 0 } applied to src[x-3 .. x+4].
 * The unscaled 16-bit sums are written to dst.
 *
 * dst / dststride   : output samples and their row pitch in int16_t units;
 *                     the 8-wide path uses aligned 128-bit stores, so
 *                     &dst[x] must be 16-byte aligned there
 * _src / _srcstride : input samples and their row pitch in bytes
 * width / height    : block size; steps of 8, 4 or 2 samples are used
 *                     depending on which of them divides width
 * mcbuffer          : not used
 *
 * Fix relative to the previous version: the fallback path computes two
 * results per step but advanced x by 4 (and, without MASKMOVE, stored only
 * one of them), so columns were left unwritten. It now advances by 2 and
 * stores both results.
 */
void ff_hevc_put_hevc_qpel_h_1_8_sse(int16_t *dst, ptrdiff_t dststride,
                                     const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
        int16_t* mcbuffer) {
    int x, y;
    const uint8_t *src = _src;
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
    __m128i x1, r0, x2, x3, x4, x5;

    (void) mcbuffer;

    /* the eight taps twice: one 8-byte window per output pixel */
    r0 = _mm_set_epi8(0, 1, -5, 17, 58, -10, 4, -1, 0, 1, -5, 17, 58, -10, 4,
            -1);

    if(!(width & 7)){
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 8) {
                /* one load feeds all eight windows, each shifted by a byte */
                x1 = _mm_loadu_si128((const __m128i *) &src[x - 3]);
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
                        _mm_srli_si128(x1, 3));
                x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
                        _mm_srli_si128(x1, 5));
                x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
                        _mm_srli_si128(x1, 7));

                /* multiply by the taps, then reduce each window to one sum */
                x2 = _mm_maddubs_epi16(x2, r0);
                x3 = _mm_maddubs_epi16(x3, r0);
                x4 = _mm_maddubs_epi16(x4, r0);
                x5 = _mm_maddubs_epi16(x5, r0);
                x2 = _mm_hadd_epi16(x2, x3);
                x4 = _mm_hadd_epi16(x4, x5);
                x2 = _mm_hadd_epi16(x2, x4);

                _mm_store_si128((__m128i *) &dst[x],x2);
            }
            src += srcstride;
            dst += dststride;
        }
    }else if(!(width &3)){
        for (y = 0; y < height; y ++) {
            for(x=0;x<width;x+=4){
                x1 = _mm_loadu_si128((const __m128i *) &src[x-3]);
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
                        _mm_srli_si128(x1, 3));

                x2 = _mm_maddubs_epi16(x2, r0);
                x3 = _mm_maddubs_epi16(x3, r0);
                x2 = _mm_hadd_epi16(x2, x3);
                x2 = _mm_hadd_epi16(x2, x2);

                _mm_storel_epi64((__m128i *) &dst[x], x2);
            }
            src += srcstride;
            dst += dststride;
        }
    }else{
        x5= _mm_setzero_si128();
#if MASKMOVE
        /* enables the low four bytes (two 16-bit results) of a masked store */
        x3= _mm_set_epi32(0,0,0,-1);
#endif
        for (y = 0; y < height; y ++) {
            /* two windows, hence two results, per step */
            for(x=0;x<width;x+=2){
                x1 = _mm_loadu_si128((const __m128i *) &src[x-3]);
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));

                x2 = _mm_maddubs_epi16(x2, r0);
                x2 = _mm_hadd_epi16(x2,x5 );
                x2 = _mm_hadd_epi16(x2,x5 );

#if MASKMOVE
                _mm_maskmoveu_si128(x2,x3,(char *) (dst+x));
#else
                dst[x]     = (int16_t) _mm_extract_epi16(x2, 0);
                dst[x + 1] = (int16_t) _mm_extract_epi16(x2, 1);
#endif
            }
            src += srcstride;
            dst += dststride;
        }
    }
}
2158
#ifndef __native_client__
2159
/*
2160
 * @TODO : Valgrind to see if it's useful to use SSE or wait for AVX2 implementation
2161
 */
2162
/*
 * Horizontal 8-tap interpolation of 16-bit samples with the taps
 * { -1, 4, -10, 58, 17, -5, 1, 0 } applied to src[x-3 .. x+4].
 * Each 32-bit sum is shifted right by 2 and saturated to 16 bits.
 *
 * dst / dststride   : output samples and their row pitch in int16_t units
 * _src / _srcstride : input, read as uint16_t; the stride is halved to get
 *                     the pitch in samples
 * width / height    : block size; two outputs are produced per step
 * mcbuffer          : not used
 */
void ff_hevc_put_hevc_qpel_h_1_10_sse(int16_t *dst, ptrdiff_t dststride,
                                      const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
        int16_t* mcbuffer) {
    const uint16_t *row = (const uint16_t *) _src;
    const ptrdiff_t pitch = _srcstride >> 1;
    const __m128i taps = _mm_set_epi16(0, 1, -5, 17, 58, -10, 4, -1);
    const __m128i zero = _mm_setzero_si128();
    /* enables only the low four bytes (two 16-bit results) of the store */
    const __m128i low_dword = _mm_set_epi32(0,0,0,-1);
    int col, line;

    for (line = 0; line < height; ++line, row += pitch, dst += dststride) {
        for (col = 0; col < width; col += 2) {
            const __m128i win0 = _mm_loadu_si128((const __m128i *) (row + col - 3));
            /* the last tap is 0, so the same load shifted by one sample
               serves as the window of the next output */
            const __m128i win1 = _mm_srli_si128(win0, 2);
            __m128i sum = _mm_hadd_epi32(_mm_madd_epi16(win0, taps),
                                         _mm_madd_epi16(win1, taps));

            sum = _mm_hadd_epi32(sum, zero);
            sum = _mm_srai_epi32(sum, 2); /* >> (BIT_DEPTH - 8) */
            sum = _mm_packs_epi32(sum, zero);
            _mm_maskmoveu_si128(sum, low_dword, (char *) (dst + col));
        }
    }
}
2193
#endif
2194
2195
2196
/*
 * Horizontal 8-tap interpolation of 8-bit samples with the taps
 * { -1, 4, -11, 40, 40, -11, 4, -1 } applied to src[x-3 .. x+4].
 * The unscaled 16-bit sums are written to dst.
 *
 * dst / dststride   : output samples and their row pitch in int16_t units
 * _src / _srcstride : input samples and their row pitch in bytes
 * width / height    : block size; widths that are a multiple of 8 use the
 *                     8-wide path, all others the 4-wide path (which always
 *                     writes four results per step)
 * mcbuffer          : not used
 *
 * Fix relative to the previous version: the 8-wide path was guarded by
 * !(width - 15), which is only true for width == 15, so it was never taken
 * for the widths it is written for. The guard is now !(width & 7). The
 * 8-wide store is unaligned so enabling the path adds no alignment
 * requirement on dst.
 */
void ff_hevc_put_hevc_qpel_h_2_8_sse(int16_t *dst, ptrdiff_t dststride,
                                     const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
        int16_t* mcbuffer) {
    int x, y;
    const uint8_t *src = _src;
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
    __m128i x1, r0, x2, x3, x4, x5;

    (void) mcbuffer;

    /* the eight taps twice: one 8-byte window per output pixel */
    r0 = _mm_set_epi8(-1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11,
            4, -1);

    if(!(width & 7)){
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 8) {
                /* one load feeds all eight windows, each shifted by a byte */
                x1 = _mm_loadu_si128((const __m128i *) &src[x - 3]);
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
                        _mm_srli_si128(x1, 3));
                x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
                        _mm_srli_si128(x1, 5));
                x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
                        _mm_srli_si128(x1, 7));

                /* multiply by the taps, then reduce each window to one sum */
                x2 = _mm_maddubs_epi16(x2, r0);
                x3 = _mm_maddubs_epi16(x3, r0);
                x4 = _mm_maddubs_epi16(x4, r0);
                x5 = _mm_maddubs_epi16(x5, r0);
                x2 = _mm_hadd_epi16(x2, x3);
                x4 = _mm_hadd_epi16(x4, x5);
                x2 = _mm_hadd_epi16(x2, x4);

                _mm_storeu_si128((__m128i *) &dst[x],x2);
            }
            src += srcstride;
            dst += dststride;
        }
    }else{
        for (y = 0; y < height; y ++) {
            for(x=0;x<width;x+=4){
                x1 = _mm_loadu_si128((const __m128i *) &src[x-3]);

                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
                        _mm_srli_si128(x1, 3));

                x2 = _mm_maddubs_epi16(x2, r0);
                x3 = _mm_maddubs_epi16(x3, r0);
                x2 = _mm_hadd_epi16(x2, x3);
                x2 = _mm_hadd_epi16(x2, _mm_setzero_si128());

                _mm_storel_epi64((__m128i *) &dst[x], x2);
            }
            src += srcstride;
            dst += dststride;
        }
    }
}
2264
2265
#if 0
2266
/*
 * Horizontal 8-tap interpolation of 8-bit samples with the taps
 * { -1, 4, -11, 40, 40, -11, 4, -1 } applied to src[x-3 .. x+4].
 * The unscaled 16-bit sums are written to dst.
 *
 * dst / dststride   : output samples and their row pitch in int16_t units;
 *                     the 8-wide path uses aligned 128-bit stores, so
 *                     &dst[x] must be 16-byte aligned there
 * _src / _srcstride : input samples and their row pitch in bytes
 * width / height    : block size; widths that are a multiple of 8 use the
 *                     8-wide path, all others the 4-wide path
 * mcbuffer          : not used
 *
 * Fix relative to the previous version: src is now a pointer to const, as
 * initializing a uint8_t* from the const-qualified _src discards the
 * qualifier.
 */
static void ff_hevc_put_hevc_qpel_h_2_sse(int16_t *dst, ptrdiff_t dststride,
                                          const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
        int16_t* mcbuffer) {
    int x, y;
    const uint8_t *src = _src;
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
    __m128i x1, r0, x2, x3, x4, x5;

    (void) mcbuffer;

    /* the eight taps twice: one 8-byte window per output pixel */
    r0 = _mm_set_epi8(-1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11,
            4, -1);

    if(!(width & 7)){
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 8) {
                /* one load feeds all eight windows, each shifted by a byte */
                x1 = _mm_loadu_si128((const __m128i *) &src[x - 3]);
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
                        _mm_srli_si128(x1, 3));
                x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
                        _mm_srli_si128(x1, 5));
                x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
                        _mm_srli_si128(x1, 7));

                /* multiply by the taps, then reduce each window to one sum */
                x2 = _mm_maddubs_epi16(x2, r0);
                x3 = _mm_maddubs_epi16(x3, r0);
                x4 = _mm_maddubs_epi16(x4, r0);
                x5 = _mm_maddubs_epi16(x5, r0);
                x2 = _mm_hadd_epi16(x2, x3);
                x4 = _mm_hadd_epi16(x4, x5);
                x2 = _mm_hadd_epi16(x2, x4);

                _mm_store_si128((__m128i *) &dst[x],x2);
            }
            src += srcstride;
            dst += dststride;
        }
    }else{
        for (y = 0; y < height; y ++) {
            for(x=0;x<width;x+=4){
                x1 = _mm_loadu_si128((const __m128i *) &src[x-3]);

                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
                        _mm_srli_si128(x1, 3));

                x2 = _mm_maddubs_epi16(x2, r0);
                x3 = _mm_maddubs_epi16(x3, r0);
                x2 = _mm_hadd_epi16(x2, x3);
                x2 = _mm_hadd_epi16(x2, _mm_setzero_si128());

                _mm_storel_epi64((__m128i *) &dst[x], x2);
            }
            src += srcstride;
            dst += dststride;
        }
    }
}
2334
static void ff_hevc_put_hevc_qpel_h_3_sse(int16_t *dst, ptrdiff_t dststride,
2335
                                          const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2336
        int16_t* mcbuffer) {
2337
    int x, y;
2338
    uint8_t *src = _src;
2339
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
2340
    __m128i x1, r0, x2, x3, x4, x5;
2341
2342
    r0 = _mm_set_epi8(-1, 4, -10, 58, 17, -5, 1, 0, -1, 4, -10, 58, 17, -5, 1,
2343
            0);
2344
2345
    if(!(width & 7)){
2346
        for (y = 0; y < height; y++) {
2347
            for (x = 0; x < width; x += 8) {
2348
                /* load data in register     */
2349
                x1 = _mm_loadu_si128((__m128i *) &src[x - 2]);
2350
                x1 = _mm_slli_si128(x1, 1);
2351
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
2352
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
2353
                        _mm_srli_si128(x1, 3));
2354
                x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
2355
                        _mm_srli_si128(x1, 5));
2356
                x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
2357
                        _mm_srli_si128(x1, 7));
2358
2359
                /*  PMADDUBSW then PMADDW     */
2360
                x2 = _mm_maddubs_epi16(x2, r0);
2361
                x3 = _mm_maddubs_epi16(x3, r0);
2362
                x4 = _mm_maddubs_epi16(x4, r0);
2363
                x5 = _mm_maddubs_epi16(x5, r0);
2364
                x2 = _mm_hadd_epi16(x2, x3);
2365
                x4 = _mm_hadd_epi16(x4, x5);
2366
                x2 = _mm_hadd_epi16(x2, x4);
2367
                /* give results back            */
2368
                _mm_store_si128((__m128i *) &dst[x],
2369
                        _mm_srli_si128(x2, BIT_DEPTH - 8));
2370
            }
2371
            src += srcstride;
2372
            dst += dststride;
2373
        }
2374
    }else{
2375
        for (y = 0; y < height; y ++) {
2376
            for(x=0;x<width;x+=4){
2377
                /* load data in register     */
2378
                x1 = _mm_loadu_si128((__m128i *) &src[x-2]);
2379
                x1 = _mm_slli_si128(x1, 1);
2380
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
2381
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
2382
                        _mm_srli_si128(x1, 3));
2383
2384
                /*  PMADDUBSW then PMADDW     */
2385
                x2 = _mm_maddubs_epi16(x2, r0);
2386
                x3 = _mm_maddubs_epi16(x3, r0);
2387
                x2 = _mm_hadd_epi16(x2, x3);
2388
                x2 = _mm_hadd_epi16(x2, _mm_setzero_si128());
2389
                x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
2390
                /* give results back            */
2391
                _mm_storel_epi64((__m128i *) &dst[x], x2);
2392
2393
            }
2394
            src += srcstride;
2395
            dst += dststride;
2396
        }
2397
    }
2398
}
2399
#endif
2400
2401
/*
 * 8-bit horizontal luma interpolation for quarter-sample position 3.
 *
 * Every output sample is
 *   src[x-2] - 5*src[x-1] + 17*src[x] + 58*src[x+1]
 *            - 10*src[x+2] + 4*src[x+3] - src[x+4]
 * stored unscaled as int16_t.
 *
 * dst, dststride   : 16-bit output rows; dststride is counted in int16_t
 *                    elements.
 * _src, _srcstride : 8-bit input rows; each step loads 16 bytes starting at
 *                    src[x - 2].
 * width, height    : block size.  A width that is a multiple of 8 produces 8
 *                    samples per step and uses aligned 16-byte stores; any
 *                    other width produces 4 samples per step.
 * mcbuffer         : unused.
 */
void ff_hevc_put_hevc_qpel_h_3_8_sse(int16_t *dst, ptrdiff_t dststride,
                                     const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
        int16_t* mcbuffer) {
    const uint8_t *row = _src;
    const ptrdiff_t row_step = _srcstride / sizeof(uint8_t);
    /* taps {0, 1, -5, 17, 58, -10, 4, -1} (lowest byte first), in both halves */
    const __m128i taps = _mm_set_epi8(-1, 4, -10, 58, 17, -5, 1, 0,
                                      -1, 4, -10, 58, 17, -5, 1, 0);
    int col, line;

    if ((width & 7) == 0) {
        for (line = 0; line < height; line++, row += row_step, dst += dststride) {
            for (col = 0; col < width; col += 8) {
                /* shifting one byte up puts a zero under the zero tap */
                const __m128i px = _mm_slli_si128(
                        _mm_loadu_si128((const __m128i *) &row[col - 2]), 1);

                /* each register carries the 8-byte windows of two outputs */
                const __m128i win01 = _mm_unpacklo_epi64(px,
                        _mm_srli_si128(px, 1));
                const __m128i win23 = _mm_unpacklo_epi64(_mm_srli_si128(px, 2),
                        _mm_srli_si128(px, 3));
                const __m128i win45 = _mm_unpacklo_epi64(_mm_srli_si128(px, 4),
                        _mm_srli_si128(px, 5));
                const __m128i win67 = _mm_unpacklo_epi64(_mm_srli_si128(px, 6),
                        _mm_srli_si128(px, 7));

                /* pair-wise multiply-add, then fold horizontally twice */
                const __m128i sum0123 = _mm_hadd_epi16(
                        _mm_maddubs_epi16(win01, taps),
                        _mm_maddubs_epi16(win23, taps));
                const __m128i sum4567 = _mm_hadd_epi16(
                        _mm_maddubs_epi16(win45, taps),
                        _mm_maddubs_epi16(win67, taps));

                _mm_store_si128((__m128i *) &dst[col],
                        _mm_hadd_epi16(sum0123, sum4567));
            }
        }
    } else {
        for (line = 0; line < height; line++, row += row_step, dst += dststride) {
            for (col = 0; col < width; col += 4) {
                const __m128i px = _mm_slli_si128(
                        _mm_loadu_si128((const __m128i *) &row[col - 2]), 1);
                const __m128i win01 = _mm_unpacklo_epi64(px,
                        _mm_srli_si128(px, 1));
                const __m128i win23 = _mm_unpacklo_epi64(_mm_srli_si128(px, 2),
                        _mm_srli_si128(px, 3));
                const __m128i sum0123 = _mm_hadd_epi16(
                        _mm_maddubs_epi16(win01, taps),
                        _mm_maddubs_epi16(win23, taps));

                /* second fold against zero; the low four lanes are the result */
                _mm_storel_epi64((__m128i *) &dst[col],
                        _mm_hadd_epi16(sum0123, _mm_setzero_si128()));
            }
        }
    }
}
2464
/**
 * 8-bit vertical luma interpolation for quarter-sample position 1.
 *
 * Every output sample is the 7-tap column sum
 *   -src[-3*s] + 4*src[-2*s] - 10*src[-s] + 58*src[0]
 *              + 17*src[s] - 5*src[2*s] + src[3*s]        (s = source stride)
 * stored unscaled as int16_t.  Pixels are widened to 16 bits and a whole row
 * is multiplied by one broadcast tap at a time, so one pass yields 16 results
 * (width a multiple of 16, aligned 16-byte stores) or 4 results (any other
 * width; 8 source bytes are loaded per row but only the low four sums are
 * stored).
 *
 * dst, dststride   : 16-bit output rows; dststride is counted in int16_t
 *                    elements.
 * _src, _srcstride : 8-bit input rows; rows -3 .. +3 around each output row
 *                    are read.
 * width, height    : block size.
 * mcbuffer         : unused.
 */
void ff_hevc_put_hevc_qpel_v_1_8_sse(int16_t *dst, ptrdiff_t dststride,
                                     const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
        int16_t* mcbuffer) {
    static const int16_t taps[7] = { -1, 4, -10, 58, 17, -5, 1 };
    const uint8_t *src = _src;
    const ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
    const __m128i zero = _mm_setzero_si128();
    __m128i coeff[7];
    int x, y, k;

    /* broadcast every tap once instead of once per vector */
    for (k = 0; k < 7; k++)
        coeff[k] = _mm_set1_epi16(taps[k]);

    if(!(width & 15)){
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 16) {
                __m128i lo = zero;   /* sums for columns x   .. x+7  */
                __m128i hi = zero;   /* sums for columns x+8 .. x+15 */

                /* rows are accumulated top to bottom with saturating adds */
                for (k = 0; k < 7; k++) {
                    const __m128i px = _mm_loadu_si128(
                            (const __m128i *) &src[x + (k - 3) * srcstride]);
                    lo = _mm_adds_epi16(lo,
                            _mm_mullo_epi16(_mm_unpacklo_epi8(px, zero), coeff[k]));
                    hi = _mm_adds_epi16(hi,
                            _mm_mullo_epi16(_mm_unpackhi_epi8(px, zero), coeff[k]));
                }

                /* give results back            */
                _mm_store_si128((__m128i *) &dst[x], lo);
                _mm_store_si128((__m128i *) &dst[x + 8], hi);
            }
            src += srcstride;
            dst += dststride;
        }
    }else{
        for (y = 0; y < height; y ++) {
            for(x=0;x<width;x+=4){
                __m128i acc = zero;

                for (k = 0; k < 7; k++) {
                    const __m128i px = _mm_loadl_epi64(
                            (const __m128i *) &src[x + (k - 3) * srcstride]);
                    acc = _mm_adds_epi16(acc,
                            _mm_mullo_epi16(_mm_unpacklo_epi8(px, zero), coeff[k]));
                }

                /* only the low four sums are written */
                _mm_storel_epi64((__m128i *) &dst[x], acc);
            }
            src += srcstride;
            dst += dststride;
        }
    }
}
2626
2627
#if 0
2628
/*
 * 10-bit vertical luma interpolation for quarter-sample position 1.
 *
 * Four 16-bit samples per row are widened to 32 bits, combined with the taps
 * {-1, 4, -10, 58, 17, -5, 1} over rows -3 .. +3, shifted right arithmetically
 * by 2 (bit depth - 8) and packed back to int16_t with signed saturation.
 *
 * dst, dststride   : 16-bit output rows; dststride is counted in int16_t
 *                    elements.
 * _src, _srcstride : input rows of uint16_t samples; _srcstride is halved to
 *                    obtain the stride in samples.
 * width, height    : block size; four samples are produced per step.
 * mcbuffer         : unused.
 */
void ff_hevc_put_hevc_qpel_v_1_10_sse4(int16_t *dst, ptrdiff_t dststride,
                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
        int16_t* mcbuffer) {
    static const int taps[7] = { -1, 4, -10, 58, 17, -5, 1 };
    const uint16_t *row = (const uint16_t *) _src;
    const ptrdiff_t row_step = _srcstride >> 1;
    const __m128i zero = _mm_setzero_si128();
    __m128i coeff[7];
    int col, line, k;

    for (k = 0; k < 7; k++)
        coeff[k] = _mm_set1_epi32(taps[k]);

    for (line = 0; line < height; line++, row += row_step, dst += dststride) {
        for (col = 0; col < width; col += 4) {
            __m128i acc = zero;

            /* rows -3 .. +3, each widened to 32 bits and scaled by its tap */
            for (k = 0; k < 7; k++) {
                const __m128i px = _mm_loadl_epi64(
                        (const __m128i *) &row[col + (k - 3) * row_step]);
                acc = _mm_add_epi32(acc,
                        _mm_mullo_epi32(_mm_unpacklo_epi16(px, zero), coeff[k]));
            }

            acc = _mm_srai_epi32(acc, 2); /* bit depth - 8 */

            /* narrow to 16 bits; the four results land in the low half */
            _mm_storel_epi64((__m128i *) (dst + col),
                    _mm_packs_epi32(acc, zero));
        }
    }
}
2701
#endif
2702
2703
2704
2705
/*
 * 8-bit vertical luma interpolation for the half-sample position (2).
 *
 * Every output sample is the 8-tap column sum
 *   -src[-3*s] + 4*src[-2*s] - 11*src[-s] + 40*src[0]
 *              + 40*src[s] - 11*src[2*s] + 4*src[3*s] - src[4*s]
 * (s = source stride), stored unscaled as int16_t.  One pass yields 16
 * results (width a multiple of 16, aligned 16-byte stores) or 4 results (any
 * other width; 8 source bytes are loaded per row but only the low four sums
 * are stored).
 *
 * dst, dststride   : 16-bit output rows; dststride is counted in int16_t
 *                    elements.
 * _src, _srcstride : 8-bit input rows; rows -3 .. +4 around each output row
 *                    are read.
 * width, height    : block size.
 * mcbuffer         : unused.
 */
void ff_hevc_put_hevc_qpel_v_2_8_sse(int16_t *dst, ptrdiff_t dststride,
                                     const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
        int16_t* mcbuffer) {
    static const int16_t taps[8] = { -1, 4, -11, 40, 40, -11, 4, -1 };
    const uint8_t *src = _src;
    const ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
    const __m128i zero = _mm_setzero_si128();
    __m128i coeff[8];
    int x, y, k;

    /* broadcast every tap once instead of once per vector */
    for (k = 0; k < 8; k++)
        coeff[k] = _mm_set1_epi16(taps[k]);

    if(!(width & 15)){
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 16) {
                __m128i lo = zero;   /* sums for columns x   .. x+7  */
                __m128i hi = zero;   /* sums for columns x+8 .. x+15 */

                /* rows are accumulated top to bottom with saturating adds */
                for (k = 0; k < 8; k++) {
                    const __m128i px = _mm_loadu_si128(
                            (const __m128i *) &src[x + (k - 3) * srcstride]);
                    lo = _mm_adds_epi16(lo,
                            _mm_mullo_epi16(_mm_unpacklo_epi8(px, zero), coeff[k]));
                    hi = _mm_adds_epi16(hi,
                            _mm_mullo_epi16(_mm_unpackhi_epi8(px, zero), coeff[k]));
                }

                /* give results back            */
                _mm_store_si128((__m128i *) &dst[x], lo);
                _mm_store_si128((__m128i *) &dst[x + 8], hi);
            }
            src += srcstride;
            dst += dststride;
        }
    }else{
        for (y = 0; y < height; y ++) {
            for(x=0;x<width;x+=4){
                __m128i acc = zero;

                for (k = 0; k < 8; k++) {
                    const __m128i px = _mm_loadl_epi64(
                            (const __m128i *) &src[x + (k - 3) * srcstride]);
                    acc = _mm_adds_epi16(acc,
                            _mm_mullo_epi16(_mm_unpacklo_epi8(px, zero), coeff[k]));
                }

                /* only the low four sums are written */
                _mm_storel_epi64((__m128i *) &dst[x], acc);
            }
            src += srcstride;
            dst += dststride;
        }
    }
}
2878
2879
#if 0
2880
/*
 * 10-bit vertical luma interpolation for the half-sample position (2).
 *
 * Four 16-bit samples per row are widened to 32 bits, combined with the taps
 * {-1, 4, -11, 40, 40, -11, 4, -1} over rows -3 .. +4, shifted right
 * arithmetically by 2 (bit depth - 8) and packed back to int16_t with signed
 * saturation.
 *
 * NOTE: _mm_mullo_epi32 is an SSE4.1 instruction even though this function
 * carries the plain _sse suffix.
 *
 * dst, dststride   : 16-bit output rows; dststride is counted in int16_t
 *                    elements.
 * _src, _srcstride : input rows of uint16_t samples; _srcstride is halved to
 *                    obtain the stride in samples.
 * width, height    : block size; four samples are produced per step.
 * mcbuffer         : unused.
 */
void ff_hevc_put_hevc_qpel_v_2_10_sse(int16_t *dst, ptrdiff_t dststride,
                                      const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
        int16_t* mcbuffer) {
    static const int taps[8] = { -1, 4, -11, 40, 40, -11, 4, -1 };
    const uint16_t *src = (const uint16_t *) _src;
    const ptrdiff_t srcstride = _srcstride >> 1;
    const __m128i zero = _mm_setzero_si128();
    __m128i coeff[8];
    int x, y, k;

    for (k = 0; k < 8; k++)
        coeff[k] = _mm_set1_epi32(taps[k]);

    for (y = 0; y < height; y ++) {
        for(x=0;x<width;x+=4){
            __m128i acc = zero;

            /* rows -3 .. +4, each widened to 32 bits and scaled by its tap */
            for (k = 0; k < 8; k++) {
                const __m128i px = _mm_loadl_epi64(
                        (const __m128i *) &src[x + (k - 3) * srcstride]);
                acc = _mm_add_epi32(acc,
                        _mm_mullo_epi32(_mm_unpacklo_epi16(px, zero), coeff[k]));
            }

            acc = _mm_srai_epi32(acc, 2); /* bit depth - 8 */

            /* pad the pack with zero; only the low four results are stored */
            acc = _mm_packs_epi32(acc, zero);

            /* give results back            */
            _mm_storel_epi64((__m128i *) (dst+x), acc);
        }
        src += srcstride;
        dst += dststride;
    }
}
2968
#endif
2969
2970
#if 0
2971
static  void ff_hevc_put_hevc_qpel_v_3_sse(int16_t *dst, ptrdiff_t dststride,
2972
                                           const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
2973
        int16_t* mcbuffer) {
2974
    int x, y;
2975
    uint8_t *src = (uint8_t*) _src;
2976
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
2977
    __m128i x1, x2, x3, x4, x5, x6, x7, x8, r0, r1, r2;
2978
    __m128i t2, t3, t4, t5, t6, t7, t8;
2979
    r1 = _mm_set_epi16(-1, 4, -10, 58, 17, -5, 1, 0);
2980
2981
    if(!(width & 15)){
2982
        for (y = 0; y < height; y++) {
2983
                    for (x = 0; x < width; x += 16) {
2984
                        /* check if memory needs to be reloaded */
2985
                        x1 = _mm_setzero_si128();
2986
                        x2 = _mm_loadu_si128((__m128i *) &src[x - 2 * srcstride]);
2987
                        x3 = _mm_loadu_si128((__m128i *) &src[x - srcstride]);
2988
                        x4 = _mm_loadu_si128((__m128i *) &src[x]);
2989
                        x5 = _mm_loadu_si128((__m128i *) &src[x + srcstride]);
2990
                        x6 = _mm_loadu_si128((__m128i *) &src[x + 2 * srcstride]);
2991
                        x7 = _mm_loadu_si128((__m128i *) &src[x + 3 * srcstride]);
2992
                        x8 = _mm_loadu_si128((__m128i *) &src[x + 4 * srcstride]);
2993
2994
                        t2 = _mm_unpacklo_epi8(x2, x1);
2995
                        t3 = _mm_unpacklo_epi8(x3, x1);
2996
                        t4 = _mm_unpacklo_epi8(x4, x1);
2997
                        t5 = _mm_unpacklo_epi8(x5, x1);
2998
                        t6 = _mm_unpacklo_epi8(x6, x1);
2999
                        t7 = _mm_unpacklo_epi8(x7, x1);
3000
                        t8 = _mm_unpacklo_epi8(x8, x1);
3001
3002
                        x2 = _mm_unpackhi_epi8(x2, x1);
3003
                        x3 = _mm_unpackhi_epi8(x3, x1);
3004
                        x4 = _mm_unpackhi_epi8(x4, x1);
3005
                        x5 = _mm_unpackhi_epi8(x5, x1);
3006
                        x6 = _mm_unpackhi_epi8(x6, x1);
3007
                        x7 = _mm_unpackhi_epi8(x7, x1);
3008
                        x8 = _mm_unpackhi_epi8(x8, x1);
3009
3010
                        /* multiply by correct value : */
3011
                        r0 = _mm_mullo_epi16(t2,
3012
                                _mm_set1_epi16(_mm_extract_epi16(r1, 1)));
3013
                        r2 = _mm_mullo_epi16(x2,
3014
                                _mm_set1_epi16(_mm_extract_epi16(r1, 1)));
3015
3016
                        r0 = _mm_adds_epi16(r0,
3017
                                _mm_mullo_epi16(t3,
3018
                                        _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
3019
                        r2 = _mm_adds_epi16(r2,
3020
                                _mm_mullo_epi16(x3,
3021
                                        _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
3022
3023
                        r0 = _mm_adds_epi16(r0,
3024
                                _mm_mullo_epi16(t4,
3025
                                        _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
3026
                        r2 = _mm_adds_epi16(r2,
3027
                                _mm_mullo_epi16(x4,
3028
                                        _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
3029
3030
                        r0 = _mm_adds_epi16(r0,
3031
                                _mm_mullo_epi16(t5,
3032
                                        _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
3033
                        r2 = _mm_adds_epi16(r2,
3034
                                _mm_mullo_epi16(x5,
3035
                                        _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
3036
3037
                        r0 = _mm_adds_epi16(r0,
3038
                                _mm_mullo_epi16(t6,
3039
                                        _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
3040
                        r2 = _mm_adds_epi16(r2,
3041
                                _mm_mullo_epi16(x6,
3042
                                        _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
3043
3044
                        r0 = _mm_adds_epi16(r0,
3045
                                _mm_mullo_epi16(t7,
3046
                                        _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
3047
                        r2 = _mm_adds_epi16(r2,
3048
                                _mm_mullo_epi16(x7,
3049
                                        _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
3050
3051
                        r0 = _mm_adds_epi16(r0,
3052
                                _mm_mullo_epi16(t8,
3053
                                        _mm_set1_epi16(_mm_extract_epi16(r1, 7))));
3054
                        r2 = _mm_adds_epi16(r2,
3055
                                _mm_mullo_epi16(x8,
3056
                                        _mm_set1_epi16(_mm_extract_epi16(r1, 7))));
3057
3058
                        /* give results back            */
3059
                        _mm_store_si128((__m128i *) &dst[x],
3060
                                _mm_srli_epi16(r0, BIT_DEPTH - 8));
3061
                        _mm_store_si128((__m128i *) &dst[x + 8],
3062
                                _mm_srli_epi16(r2, BIT_DEPTH - 8));
3063
                    }
3064
                    src += srcstride;
3065
                    dst += dststride;
3066
                }
3067
    }else{
3068
        x = 0;
3069
                for (y = 0; y < height; y ++) {
3070
                    for(x=0;x<width;x+=4){
3071
                    r0 = _mm_set1_epi16(0);
3072
                    /* load data in register  */
3073
                    //x1 = _mm_setzero_si128();
3074
                    x2 = _mm_loadl_epi64((__m128i *) &src[x-2 * srcstride]);
3075
                    x3 = _mm_loadl_epi64((__m128i *) &src[x-srcstride]);
3076
                    x4 = _mm_loadl_epi64((__m128i *) &src[x]);
3077
                    x5 = _mm_loadl_epi64((__m128i *) &src[x+srcstride]);
3078
                    x6 = _mm_loadl_epi64((__m128i *) &src[x+2 * srcstride]);
3079
                    x7 = _mm_loadl_epi64((__m128i *) &src[x+3 * srcstride]);
3080
                    x8 = _mm_loadl_epi64((__m128i *) &src[x + 4 * srcstride]);
3081
3082
                    x1 = _mm_unpacklo_epi8(x1,r0);
3083
                    x2 = _mm_unpacklo_epi8(x2, r0);
3084
                    x3 = _mm_unpacklo_epi8(x3, r0);
3085
                    x4 = _mm_unpacklo_epi8(x4, r0);
3086
                    x5 = _mm_unpacklo_epi8(x5, r0);
3087
                    x6 = _mm_unpacklo_epi8(x6, r0);
3088
                    x7 = _mm_unpacklo_epi8(x7, r0);
3089
                    x8 = _mm_unpacklo_epi8(x8, r0);
3090
3091
3092
                    r0 = _mm_mullo_epi16(x2, _mm_set1_epi16(_mm_extract_epi16(r1, 1)));
3093
3094
3095
                    r0 = _mm_adds_epi16(r0,
3096
                            _mm_mullo_epi16(x3,
3097
                                    _mm_set1_epi16(_mm_extract_epi16(r1, 2))));
3098
3099
3100
                    r0 = _mm_adds_epi16(r0,
3101
                            _mm_mullo_epi16(x4,
3102
                                    _mm_set1_epi16(_mm_extract_epi16(r1, 3))));
3103
3104
3105
                    r0 = _mm_adds_epi16(r0,
3106
                            _mm_mullo_epi16(x5,
3107
                                    _mm_set1_epi16(_mm_extract_epi16(r1, 4))));
3108
3109
3110
                    r0 = _mm_adds_epi16(r0,
3111
                            _mm_mullo_epi16(x6,
3112
                                    _mm_set1_epi16(_mm_extract_epi16(r1, 5))));
3113
3114
3115
                    r0 = _mm_adds_epi16(r0,
3116
                            _mm_mullo_epi16(x7,
3117
                                    _mm_set1_epi16(_mm_extract_epi16(r1, 6))));
3118
3119
3120
                    r0 = _mm_adds_epi16(r0,
3121
                            _mm_mullo_epi16(x8,
3122
                                    _mm_set1_epi16(_mm_extract_epi16(r1, 7))));
3123
3124
3125
                    r0 = _mm_srli_epi16(r0, BIT_DEPTH - 8);
3126
                    /* give results back            */
3127
                    _mm_storel_epi64((__m128i *) &dst[x], r0);
3128
3129
                    }
3130
                    src += srcstride;
3131
                    dst += dststride;
3132
                }
3133
    }
3134
3135
}
3136
#endif
3137
3138
/*
 * Vertical-only luma quarter-pel interpolation, fractional position 3, for
 * 8-bit samples.
 *
 * Every output sample is the 7-tap sum
 *     1*s[-2] - 5*s[-1] + 17*s[0] + 58*s[1] - 10*s[2] + 4*s[3] - 1*s[4]
 * taken down one column of the source, where s[k] is the sample k rows
 * below the current row.  The 16-bit sum is stored to dst without any
 * shift or rounding.
 *
 * dst        output plane; the 16-column path uses aligned 128-bit stores,
 *            so &dst[x] has to be 16-byte aligned on that path
 * dststride  distance between output rows, in int16_t elements
 * _src       first source sample of the block; rows -2 .. height+3 relative
 *            to it are read
 * _srcstride distance between source rows, in bytes
 * width      block width; multiples of 16 take the 16-column path, every
 *            other width is processed four columns at a time (that path
 *            loads eight source bytes per row and uses the low four)
 * height     number of rows
 * mcbuffer   not used by this function
 */
void ff_hevc_put_hevc_qpel_v_3_8_sse(int16_t *dst, ptrdiff_t dststride,
                                     const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
        int16_t* mcbuffer) {
    int x, y;
    uint8_t *src = (uint8_t*) _src;
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
    __m128i s0, s1, s2, s3, s4, s5, s6, lo, hi;

    /* One broadcast per filter tap (source rows -2 .. +4).  The taps do not
     * depend on x or y, so they are built once instead of per iteration. */
    const __m128i zero = _mm_setzero_si128();
    const __m128i c0 = _mm_set1_epi16(1);
    const __m128i c1 = _mm_set1_epi16(-5);
    const __m128i c2 = _mm_set1_epi16(17);
    const __m128i c3 = _mm_set1_epi16(58);
    const __m128i c4 = _mm_set1_epi16(-10);
    const __m128i c5 = _mm_set1_epi16(4);
    const __m128i c6 = _mm_set1_epi16(-1);

    (void) mcbuffer;

    if (!(width & 15)) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 16) {
                s0 = _mm_loadu_si128((__m128i *) &src[x - 2 * srcstride]);
                s1 = _mm_loadu_si128((__m128i *) &src[x - srcstride]);
                s2 = _mm_loadu_si128((__m128i *) &src[x]);
                s3 = _mm_loadu_si128((__m128i *) &src[x + srcstride]);
                s4 = _mm_loadu_si128((__m128i *) &src[x + 2 * srcstride]);
                s5 = _mm_loadu_si128((__m128i *) &src[x + 3 * srcstride]);
                s6 = _mm_loadu_si128((__m128i *) &src[x + 4 * srcstride]);

                /* columns x .. x+7: widen bytes to 16 bit, multiply, and
                 * accumulate with saturating adds in tap order (the order
                 * matters because the adds saturate) */
                lo = _mm_mullo_epi16(_mm_unpacklo_epi8(s0, zero), c0);
                lo = _mm_adds_epi16(lo, _mm_mullo_epi16(_mm_unpacklo_epi8(s1, zero), c1));
                lo = _mm_adds_epi16(lo, _mm_mullo_epi16(_mm_unpacklo_epi8(s2, zero), c2));
                lo = _mm_adds_epi16(lo, _mm_mullo_epi16(_mm_unpacklo_epi8(s3, zero), c3));
                lo = _mm_adds_epi16(lo, _mm_mullo_epi16(_mm_unpacklo_epi8(s4, zero), c4));
                lo = _mm_adds_epi16(lo, _mm_mullo_epi16(_mm_unpacklo_epi8(s5, zero), c5));
                lo = _mm_adds_epi16(lo, _mm_mullo_epi16(_mm_unpacklo_epi8(s6, zero), c6));

                /* columns x+8 .. x+15 */
                hi = _mm_mullo_epi16(_mm_unpackhi_epi8(s0, zero), c0);
                hi = _mm_adds_epi16(hi, _mm_mullo_epi16(_mm_unpackhi_epi8(s1, zero), c1));
                hi = _mm_adds_epi16(hi, _mm_mullo_epi16(_mm_unpackhi_epi8(s2, zero), c2));
                hi = _mm_adds_epi16(hi, _mm_mullo_epi16(_mm_unpackhi_epi8(s3, zero), c3));
                hi = _mm_adds_epi16(hi, _mm_mullo_epi16(_mm_unpackhi_epi8(s4, zero), c4));
                hi = _mm_adds_epi16(hi, _mm_mullo_epi16(_mm_unpackhi_epi8(s5, zero), c5));
                hi = _mm_adds_epi16(hi, _mm_mullo_epi16(_mm_unpackhi_epi8(s6, zero), c6));

                /* give results back            */
                _mm_store_si128((__m128i *) &dst[x], lo);
                _mm_store_si128((__m128i *) &dst[x + 8], hi);
            }
            src += srcstride;
            dst += dststride;
        }
    } else {
        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x += 4) {
                /* load eight bytes per row and widen them to 16 bit; only
                 * the low four results are stored */
                s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *) &src[x - 2 * srcstride]), zero);
                s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *) &src[x - srcstride]), zero);
                s2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *) &src[x]), zero);
                s3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *) &src[x + srcstride]), zero);
                s4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *) &src[x + 2 * srcstride]), zero);
                s5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *) &src[x + 3 * srcstride]), zero);
                s6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *) &src[x + 4 * srcstride]), zero);

                lo = _mm_mullo_epi16(s0, c0);
                lo = _mm_adds_epi16(lo, _mm_mullo_epi16(s1, c1));
                lo = _mm_adds_epi16(lo, _mm_mullo_epi16(s2, c2));
                lo = _mm_adds_epi16(lo, _mm_mullo_epi16(s3, c3));
                lo = _mm_adds_epi16(lo, _mm_mullo_epi16(s4, c4));
                lo = _mm_adds_epi16(lo, _mm_mullo_epi16(s5, c5));
                lo = _mm_adds_epi16(lo, _mm_mullo_epi16(s6, c6));

                /* give results back            */
                _mm_storel_epi64((__m128i *) &dst[x], lo);
            }
            src += srcstride;
            dst += dststride;
        }
    }
}
3290
3291
3292
#if 0
3293
/*
 * Vertical-only luma quarter-pel interpolation, fractional position 3, for
 * samples stored as 16-bit words.
 *
 * Four columns are handled per step.  Each output is
 *     (1*s[-2] - 5*s[-1] + 17*s[0] + 58*s[1] - 10*s[2] + 4*s[3] - 1*s[4]) >> 2
 * computed in 32-bit lanes (s[k] is the sample k rows below the current
 * row), then narrowed to int16_t with signed saturation.
 *
 * dst / dststride   output plane and its row pitch in int16_t elements
 * _src / _srcstride source plane and its row pitch in bytes (halved here to
 *                   get a pitch in 16-bit samples)
 * width, height     block size; width is walked in steps of four
 * mcbuffer          not used by this function
 */
void ff_hevc_put_hevc_qpel_v_3_10_sse(int16_t *dst, ptrdiff_t dststride,
                                      const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
        int16_t* mcbuffer) {
    const uint16_t *line = (const uint16_t *) _src;
    const ptrdiff_t pitch = _srcstride >> 1;
    const __m128i nil = _mm_setzero_si128();

    /* filter taps for rows -2 .. +4, broadcast to 32-bit lanes */
    const __m128i tap_m2 = _mm_set1_epi32(1);
    const __m128i tap_m1 = _mm_set1_epi32(-5);
    const __m128i tap_0  = _mm_set1_epi32(17);
    const __m128i tap_p1 = _mm_set1_epi32(58);
    const __m128i tap_p2 = _mm_set1_epi32(-10);
    const __m128i tap_p3 = _mm_set1_epi32(4);
    const __m128i tap_p4 = _mm_set1_epi32(-1);

    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col += 4) {
            /* fetch four samples from each of the seven rows and zero-extend
             * them to 32 bit */
            __m128i a = _mm_unpacklo_epi16(_mm_loadl_epi64((const __m128i *) &line[col - 2 * pitch]), nil);
            __m128i b = _mm_unpacklo_epi16(_mm_loadl_epi64((const __m128i *) &line[col - pitch]), nil);
            __m128i c = _mm_unpacklo_epi16(_mm_loadl_epi64((const __m128i *) &line[col]), nil);
            __m128i d = _mm_unpacklo_epi16(_mm_loadl_epi64((const __m128i *) &line[col + pitch]), nil);
            __m128i e = _mm_unpacklo_epi16(_mm_loadl_epi64((const __m128i *) &line[col + 2 * pitch]), nil);
            __m128i f = _mm_unpacklo_epi16(_mm_loadl_epi64((const __m128i *) &line[col + 3 * pitch]), nil);
            __m128i g = _mm_unpacklo_epi16(_mm_loadl_epi64((const __m128i *) &line[col + 4 * pitch]), nil);

            /* weighted sum of the seven rows */
            __m128i sum = _mm_mullo_epi32(a, tap_m2);
            sum = _mm_add_epi32(sum, _mm_mullo_epi32(b, tap_m1));
            sum = _mm_add_epi32(sum, _mm_mullo_epi32(c, tap_0));
            sum = _mm_add_epi32(sum, _mm_mullo_epi32(d, tap_p1));
            sum = _mm_add_epi32(sum, _mm_mullo_epi32(e, tap_p2));
            sum = _mm_add_epi32(sum, _mm_mullo_epi32(f, tap_p3));
            sum = _mm_add_epi32(sum, _mm_mullo_epi32(g, tap_p4));

            /* arithmetic shift by two, then saturate down to 16 bit */
            sum = _mm_packs_epi32(_mm_srai_epi32(sum, 2), nil);

            /* write the four results */
            _mm_storel_epi64((__m128i *) &dst[col], sum);
        }
        line += pitch;
        dst += dststride;
    }
}
3365
#endif
3366
3367
3368
3369
void ff_hevc_put_hevc_qpel_h_1_v_1_sse(int16_t *dst, ptrdiff_t dststride,
3370
                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
3371
0
        int16_t* mcbuffer) {
3372
0
    int x, y;
3373
0
    uint8_t* src = (uint8_t*) _src;
3374
0
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
3375
0
    int16_t *tmp = mcbuffer;
3376
0
    __m128i x1, x2, x3, x4, x5, x6, x7, rBuffer, rTemp, r0, r1;
3377
0
    __m128i t1, t2, t3, t4, t5, t6, t7, t8;
3378
3379
0
    src -= qpel_extra_before[1] * srcstride;
3380
0
    r0 = _mm_set_epi8(0, 1, -5, 17, 58, -10, 4, -1, 0, 1, -5, 17, 58, -10, 4,
3381
0
            -1);
3382
3383
    /* LOAD src from memory to registers to limit memory bandwidth */
3384
0
    if (width == 4) {
3385
3386
0
        for (y = 0; y < height + qpel_extra[1]; y += 2) {
3387
            /* load data in register     */
3388
0
            x1 = _mm_loadu_si128((__m128i *) &src[-3]);
3389
0
            src += srcstride;
3390
0
            t1 = _mm_loadu_si128((__m128i *) &src[-3]);
3391
0
            x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3392
0
            t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
3393
0
            x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3394
0
                    _mm_srli_si128(x1, 3));
3395
0
            t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
3396
0
                    _mm_srli_si128(t1, 3));
3397
3398
            /*  PMADDUBSW then PMADDW     */
3399
0
            x2 = _mm_maddubs_epi16(x2, r0);
3400
0
            t2 = _mm_maddubs_epi16(t2, r0);
3401
0
            x3 = _mm_maddubs_epi16(x3, r0);
3402
0
            t3 = _mm_maddubs_epi16(t3, r0);
3403
0
            x2 = _mm_hadd_epi16(x2, x3);
3404
0
            t2 = _mm_hadd_epi16(t2, t3);
3405
0
            x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
3406
0
            t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
3407
0
            x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
3408
0
            t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
3409
            /* give results back            */
3410
0
            _mm_storel_epi64((__m128i *) &tmp[0], x2);
3411
3412
0
            tmp += MAX_PB_SIZE;
3413
0
            _mm_storel_epi64((__m128i *) &tmp[0], t2);
3414
3415
0
            src += srcstride;
3416
0
            tmp += MAX_PB_SIZE;
3417
0
        }
3418
0
    } else
3419
0
        for (y = 0; y < height + qpel_extra[1]; y++) {
3420
0
            for (x = 0; x < width; x += 8) {
3421
                /* load data in register     */
3422
0
                x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
3423
0
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3424
0
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3425
0
                        _mm_srli_si128(x1, 3));
3426
0
                x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
3427
0
                        _mm_srli_si128(x1, 5));
3428
0
                x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
3429
0
                        _mm_srli_si128(x1, 7));
3430
3431
                /*  PMADDUBSW then PMADDW     */
3432
0
                x2 = _mm_maddubs_epi16(x2, r0);
3433
0
                x3 = _mm_maddubs_epi16(x3, r0);
3434
0
                x4 = _mm_maddubs_epi16(x4, r0);
3435
0
                x5 = _mm_maddubs_epi16(x5, r0);
3436
0
                x2 = _mm_hadd_epi16(x2, x3);
3437
0
                x4 = _mm_hadd_epi16(x4, x5);
3438
0
                x2 = _mm_hadd_epi16(x2, x4);
3439
0
                x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
3440
3441
                /* give results back            */
3442
0
                _mm_store_si128((__m128i *) &tmp[x], x2);
3443
3444
0
            }
3445
0
            src += srcstride;
3446
0
            tmp += MAX_PB_SIZE;
3447
0
        }
3448
3449
0
    tmp = mcbuffer + qpel_extra_before[1] * MAX_PB_SIZE;
3450
0
    srcstride = MAX_PB_SIZE;
3451
3452
    /* vertical treatment on temp table : tmp contains 16 bit values, so need to use 32 bit  integers
3453
     for register calculations */
3454
0
    rTemp = _mm_set_epi16(0, 1, -5, 17, 58, -10, 4, -1);
3455
0
    for (y = 0; y < height; y++) {
3456
0
        for (x = 0; x < width; x += 8) {
3457
3458
0
            x1 = _mm_load_si128((__m128i *) &tmp[x - 3 * srcstride]);
3459
0
            x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
3460
0
            x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
3461
0
            x4 = _mm_load_si128((__m128i *) &tmp[x]);
3462
0
            x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
3463
0
            x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
3464
0
            x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
3465
3466
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 0));
3467
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
3468
0
            t8 = _mm_mullo_epi16(x1, r0);
3469
0
            rBuffer = _mm_mulhi_epi16(x1, r0);
3470
0
            t7 = _mm_mullo_epi16(x2, r1);
3471
0
            t1 = _mm_unpacklo_epi16(t8, rBuffer);
3472
0
            x1 = _mm_unpackhi_epi16(t8, rBuffer);
3473
3474
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
3475
0
            rBuffer = _mm_mulhi_epi16(x2, r1);
3476
0
            t8 = _mm_mullo_epi16(x3, r0);
3477
0
            t2 = _mm_unpacklo_epi16(t7, rBuffer);
3478
0
            x2 = _mm_unpackhi_epi16(t7, rBuffer);
3479
3480
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
3481
0
            rBuffer = _mm_mulhi_epi16(x3, r0);
3482
0
            t7 = _mm_mullo_epi16(x4, r1);
3483
0
            t3 = _mm_unpacklo_epi16(t8, rBuffer);
3484
0
            x3 = _mm_unpackhi_epi16(t8, rBuffer);
3485
3486
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
3487
0
            rBuffer = _mm_mulhi_epi16(x4, r1);
3488
0
            t8 = _mm_mullo_epi16(x5, r0);
3489
0
            t4 = _mm_unpacklo_epi16(t7, rBuffer);
3490
0
            x4 = _mm_unpackhi_epi16(t7, rBuffer);
3491
3492
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
3493
0
            rBuffer = _mm_mulhi_epi16(x5, r0);
3494
0
            t7 = _mm_mullo_epi16(x6, r1);
3495
0
            t5 = _mm_unpacklo_epi16(t8, rBuffer);
3496
0
            x5 = _mm_unpackhi_epi16(t8, rBuffer);
3497
3498
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
3499
0
            rBuffer = _mm_mulhi_epi16(x6, r1);
3500
0
            t8 = _mm_mullo_epi16(x7, r0);
3501
0
            t6 = _mm_unpacklo_epi16(t7, rBuffer);
3502
0
            x6 = _mm_unpackhi_epi16(t7, rBuffer);
3503
3504
0
            rBuffer = _mm_mulhi_epi16(x7, r0);
3505
0
            t7 = _mm_unpacklo_epi16(t8, rBuffer);
3506
0
            x7 = _mm_unpackhi_epi16(t8, rBuffer);
3507
3508
3509
3510
            /* add calculus by correct value : */
3511
3512
0
            r1 = _mm_add_epi32(x1, x2);
3513
0
            x3 = _mm_add_epi32(x3, x4);
3514
0
            x5 = _mm_add_epi32(x5, x6);
3515
0
            r1 = _mm_add_epi32(r1, x3);
3516
3517
0
            r1 = _mm_add_epi32(r1, x5);
3518
3519
0
            r0 = _mm_add_epi32(t1, t2);
3520
0
            t3 = _mm_add_epi32(t3, t4);
3521
0
            t5 = _mm_add_epi32(t5, t6);
3522
0
            r0 = _mm_add_epi32(r0, t3);
3523
0
            r0 = _mm_add_epi32(r0, t5);
3524
0
            r1 = _mm_add_epi32(r1, x7);
3525
0
            r0 = _mm_add_epi32(r0, t7);
3526
0
            r1 = _mm_srli_epi32(r1, 6);
3527
0
            r0 = _mm_srli_epi32(r0, 6);
3528
3529
0
            r1 = _mm_and_si128(r1,
3530
0
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3531
0
            r0 = _mm_and_si128(r0,
3532
0
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3533
0
            r0 = _mm_hadd_epi16(r0, r1);
3534
0
            _mm_store_si128((__m128i *) &dst[x], r0);
3535
3536
0
        }
3537
0
        tmp += MAX_PB_SIZE;
3538
0
        dst += dststride;
3539
0
    }
3540
0
}
3541
void ff_hevc_put_hevc_qpel_h_1_v_2_sse(int16_t *dst, ptrdiff_t dststride,
3542
                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
3543
0
        int16_t* mcbuffer) {
3544
0
    int x, y;
3545
0
    uint8_t *src = (uint8_t*) _src;
3546
0
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
3547
0
    int16_t *tmp = mcbuffer;
3548
0
    __m128i x1, x2, x3, x4, x5, x6, x7, x8, rBuffer, rTemp, r0, r1;
3549
0
    __m128i t1, t2, t3, t4, t5, t6, t7, t8;
3550
3551
0
    src -= qpel_extra_before[2] * srcstride;
3552
0
    r0 = _mm_set_epi8(0, 1, -5, 17, 58, -10, 4, -1, 0, 1, -5, 17, 58, -10, 4,
3553
0
            -1);
3554
3555
    /* LOAD src from memory to registers to limit memory bandwidth */
3556
0
    if (width == 4) {
3557
3558
0
        for (y = 0; y < height + qpel_extra[2]; y += 2) {
3559
            /* load data in register     */
3560
0
            x1 = _mm_loadu_si128((__m128i *) &src[-3]);
3561
0
            src += srcstride;
3562
0
            t1 = _mm_loadu_si128((__m128i *) &src[-3]);
3563
0
            x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3564
0
            t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
3565
0
            x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3566
0
                    _mm_srli_si128(x1, 3));
3567
0
            t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
3568
0
                    _mm_srli_si128(t1, 3));
3569
3570
            /*  PMADDUBSW then PMADDW     */
3571
0
            x2 = _mm_maddubs_epi16(x2, r0);
3572
0
            t2 = _mm_maddubs_epi16(t2, r0);
3573
0
            x3 = _mm_maddubs_epi16(x3, r0);
3574
0
            t3 = _mm_maddubs_epi16(t3, r0);
3575
0
            x2 = _mm_hadd_epi16(x2, x3);
3576
0
            t2 = _mm_hadd_epi16(t2, t3);
3577
0
            x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
3578
0
            t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
3579
0
            x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
3580
0
            t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
3581
            /* give results back            */
3582
0
            _mm_storel_epi64((__m128i *) &tmp[0], x2);
3583
3584
0
            tmp += MAX_PB_SIZE;
3585
0
            _mm_storel_epi64((__m128i *) &tmp[0], t2);
3586
3587
0
            src += srcstride;
3588
0
            tmp += MAX_PB_SIZE;
3589
0
        }
3590
0
    } else
3591
0
        for (y = 0; y < height + qpel_extra[2]; y++) {
3592
0
            for (x = 0; x < width; x += 8) {
3593
                /* load data in register     */
3594
0
                x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
3595
0
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3596
0
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3597
0
                        _mm_srli_si128(x1, 3));
3598
0
                x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
3599
0
                        _mm_srli_si128(x1, 5));
3600
0
                x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
3601
0
                        _mm_srli_si128(x1, 7));
3602
3603
                /*  PMADDUBSW then PMADDW     */
3604
0
                x2 = _mm_maddubs_epi16(x2, r0);
3605
0
                x3 = _mm_maddubs_epi16(x3, r0);
3606
0
                x4 = _mm_maddubs_epi16(x4, r0);
3607
0
                x5 = _mm_maddubs_epi16(x5, r0);
3608
0
                x2 = _mm_hadd_epi16(x2, x3);
3609
0
                x4 = _mm_hadd_epi16(x4, x5);
3610
0
                x2 = _mm_hadd_epi16(x2, x4);
3611
0
                x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
3612
3613
                /* give results back            */
3614
0
                _mm_store_si128((__m128i *) &tmp[x], x2);
3615
3616
0
            }
3617
0
            src += srcstride;
3618
0
            tmp += MAX_PB_SIZE;
3619
0
        }
3620
3621
0
    tmp = mcbuffer + qpel_extra_before[2] * MAX_PB_SIZE;
3622
0
    srcstride = MAX_PB_SIZE;
3623
3624
    /* vertical treatment on temp table : tmp contains 16 bit values, so need to use 32 bit  integers
3625
     for register calculations */
3626
0
    rTemp = _mm_set_epi16(-1, 4, -11, 40, 40, -11, 4, -1);
3627
0
    for (y = 0; y < height; y++) {
3628
0
        for (x = 0; x < width; x += 8) {
3629
3630
0
            x1 = _mm_load_si128((__m128i *) &tmp[x - 3 * srcstride]);
3631
0
            x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
3632
0
            x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
3633
0
            x4 = _mm_load_si128((__m128i *) &tmp[x]);
3634
0
            x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
3635
0
            x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
3636
0
            x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
3637
0
            x8 = _mm_loadu_si128((__m128i *) &tmp[x + 4 * srcstride]);
3638
3639
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 0));
3640
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
3641
0
            t8 = _mm_mullo_epi16(x1, r0);
3642
0
            rBuffer = _mm_mulhi_epi16(x1, r0);
3643
0
            t7 = _mm_mullo_epi16(x2, r1);
3644
0
            t1 = _mm_unpacklo_epi16(t8, rBuffer);
3645
0
            x1 = _mm_unpackhi_epi16(t8, rBuffer);
3646
3647
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
3648
0
            rBuffer = _mm_mulhi_epi16(x2, r1);
3649
0
            t8 = _mm_mullo_epi16(x3, r0);
3650
0
            t2 = _mm_unpacklo_epi16(t7, rBuffer);
3651
0
            x2 = _mm_unpackhi_epi16(t7, rBuffer);
3652
3653
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
3654
0
            rBuffer = _mm_mulhi_epi16(x3, r0);
3655
0
            t7 = _mm_mullo_epi16(x4, r1);
3656
0
            t3 = _mm_unpacklo_epi16(t8, rBuffer);
3657
0
            x3 = _mm_unpackhi_epi16(t8, rBuffer);
3658
3659
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
3660
0
            rBuffer = _mm_mulhi_epi16(x4, r1);
3661
0
            t8 = _mm_mullo_epi16(x5, r0);
3662
0
            t4 = _mm_unpacklo_epi16(t7, rBuffer);
3663
0
            x4 = _mm_unpackhi_epi16(t7, rBuffer);
3664
3665
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
3666
0
            rBuffer = _mm_mulhi_epi16(x5, r0);
3667
0
            t7 = _mm_mullo_epi16(x6, r1);
3668
0
            t5 = _mm_unpacklo_epi16(t8, rBuffer);
3669
0
            x5 = _mm_unpackhi_epi16(t8, rBuffer);
3670
3671
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
3672
0
            rBuffer = _mm_mulhi_epi16(x6, r1);
3673
0
            t8 = _mm_mullo_epi16(x7, r0);
3674
0
            t6 = _mm_unpacklo_epi16(t7, rBuffer);
3675
0
            x6 = _mm_unpackhi_epi16(t7, rBuffer);
3676
3677
0
            rBuffer = _mm_mulhi_epi16(x7, r0);
3678
0
            t7 = _mm_unpacklo_epi16(t8, rBuffer);
3679
0
            x7 = _mm_unpackhi_epi16(t8, rBuffer);
3680
3681
0
            t8 = _mm_unpacklo_epi16(
3682
0
                    _mm_mullo_epi16(x8,
3683
0
                            _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
3684
0
                            _mm_mulhi_epi16(x8,
3685
0
                                    _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
3686
0
            x8 = _mm_unpackhi_epi16(
3687
0
                    _mm_mullo_epi16(x8,
3688
0
                            _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
3689
0
                            _mm_mulhi_epi16(x8,
3690
0
                                    _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
3691
3692
            /* add calculus by correct value : */
3693
3694
0
            r1 = _mm_add_epi32(x1, x2);
3695
0
            x3 = _mm_add_epi32(x3, x4);
3696
0
            x5 = _mm_add_epi32(x5, x6);
3697
0
            r1 = _mm_add_epi32(r1, x3);
3698
0
            x7 = _mm_add_epi32(x7, x8);
3699
0
            r1 = _mm_add_epi32(r1, x5);
3700
3701
0
            r0 = _mm_add_epi32(t1, t2);
3702
0
            t3 = _mm_add_epi32(t3, t4);
3703
0
            t5 = _mm_add_epi32(t5, t6);
3704
0
            r0 = _mm_add_epi32(r0, t3);
3705
0
            t7 = _mm_add_epi32(t7, t8);
3706
0
            r0 = _mm_add_epi32(r0, t5);
3707
0
            r1 = _mm_add_epi32(r1, x7);
3708
0
            r0 = _mm_add_epi32(r0, t7);
3709
0
            r1 = _mm_srli_epi32(r1, 6);
3710
0
            r0 = _mm_srli_epi32(r0, 6);
3711
3712
0
            r1 = _mm_and_si128(r1,
3713
0
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3714
0
            r0 = _mm_and_si128(r0,
3715
0
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3716
0
            r0 = _mm_hadd_epi16(r0, r1);
3717
0
            _mm_store_si128((__m128i *) &dst[x], r0);
3718
3719
0
        }
3720
0
        tmp += MAX_PB_SIZE;
3721
0
        dst += dststride;
3722
0
    }
3723
0
}
3724
void ff_hevc_put_hevc_qpel_h_1_v_3_sse(int16_t *dst, ptrdiff_t dststride,
3725
                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
3726
0
        int16_t* mcbuffer) {
3727
0
    int x, y;
3728
0
    uint8_t *src = (uint8_t*) _src;
3729
0
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
3730
0
    int16_t *tmp = mcbuffer;
3731
0
    __m128i x1, x2, x3, x4, x5, x6, x7, x8, rBuffer, rTemp, r0, r1;
3732
0
    __m128i t1, t2, t3, t4, t5, t6, t7, t8;
3733
3734
0
    src -= qpel_extra_before[3] * srcstride;
3735
0
    r0 = _mm_set_epi8(0, 1, -5, 17, 58, -10, 4, -1, 0, 1, -5, 17, 58, -10, 4,
3736
0
            -1);
3737
3738
    /* LOAD src from memory to registers to limit memory bandwidth */
3739
0
    if (width == 4) {
3740
3741
0
        for (y = 0; y < height + qpel_extra[3]; y += 2) {
3742
            /* load data in register     */
3743
0
            x1 = _mm_loadu_si128((__m128i *) &src[-3]);
3744
0
            src += srcstride;
3745
0
            t1 = _mm_loadu_si128((__m128i *) &src[-3]);
3746
0
            x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3747
0
            t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
3748
0
            x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3749
0
                    _mm_srli_si128(x1, 3));
3750
0
            t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
3751
0
                    _mm_srli_si128(t1, 3));
3752
3753
            /*  PMADDUBSW then PMADDW     */
3754
0
            x2 = _mm_maddubs_epi16(x2, r0);
3755
0
            t2 = _mm_maddubs_epi16(t2, r0);
3756
0
            x3 = _mm_maddubs_epi16(x3, r0);
3757
0
            t3 = _mm_maddubs_epi16(t3, r0);
3758
0
            x2 = _mm_hadd_epi16(x2, x3);
3759
0
            t2 = _mm_hadd_epi16(t2, t3);
3760
0
            x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
3761
0
            t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
3762
0
            x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
3763
0
            t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
3764
            /* give results back            */
3765
0
            _mm_storel_epi64((__m128i *) &tmp[0], x2);
3766
3767
0
            tmp += MAX_PB_SIZE;
3768
0
            _mm_storel_epi64((__m128i *) &tmp[0], t2);
3769
3770
0
            src += srcstride;
3771
0
            tmp += MAX_PB_SIZE;
3772
0
        }
3773
0
    } else
3774
0
        for (y = 0; y < height + qpel_extra[3]; y++) {
3775
0
            for (x = 0; x < width; x += 8) {
3776
                /* load data in register     */
3777
0
                x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
3778
0
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3779
0
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3780
0
                        _mm_srli_si128(x1, 3));
3781
0
                x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
3782
0
                        _mm_srli_si128(x1, 5));
3783
0
                x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
3784
0
                        _mm_srli_si128(x1, 7));
3785
3786
                /*  PMADDUBSW then PMADDW     */
3787
0
                x2 = _mm_maddubs_epi16(x2, r0);
3788
0
                x3 = _mm_maddubs_epi16(x3, r0);
3789
0
                x4 = _mm_maddubs_epi16(x4, r0);
3790
0
                x5 = _mm_maddubs_epi16(x5, r0);
3791
0
                x2 = _mm_hadd_epi16(x2, x3);
3792
0
                x4 = _mm_hadd_epi16(x4, x5);
3793
0
                x2 = _mm_hadd_epi16(x2, x4);
3794
0
                x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
3795
3796
                /* give results back            */
3797
0
                _mm_store_si128((__m128i *) &tmp[x], x2);
3798
3799
0
            }
3800
0
            src += srcstride;
3801
0
            tmp += MAX_PB_SIZE;
3802
0
        }
3803
3804
0
    tmp = mcbuffer + qpel_extra_before[3] * MAX_PB_SIZE;
3805
0
    srcstride = MAX_PB_SIZE;
3806
3807
    /* vertical treatment on temp table : tmp contains 16 bit values, so need to use 32 bit  integers
3808
     for register calculations */
3809
0
    rTemp = _mm_set_epi16(-1, 4, -10, 58, 17, -5, 1, 0);
3810
0
    for (y = 0; y < height; y++) {
3811
0
        for (x = 0; x < width; x += 8) {
3812
3813
0
            x1 = _mm_setzero_si128();
3814
0
            x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
3815
0
            x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
3816
0
            x4 = _mm_load_si128((__m128i *) &tmp[x]);
3817
0
            x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
3818
0
            x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
3819
0
            x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
3820
0
            x8 = _mm_load_si128((__m128i *) &tmp[x + 4 * srcstride]);
3821
3822
3823
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
3824
3825
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
3826
0
            t7 = _mm_mullo_epi16(x2, r1);
3827
0
            rBuffer = _mm_mulhi_epi16(x2, r1);
3828
0
            t8 = _mm_mullo_epi16(x3, r0);
3829
0
            t2 = _mm_unpacklo_epi16(t7, rBuffer);
3830
0
            x2 = _mm_unpackhi_epi16(t7, rBuffer);
3831
3832
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
3833
0
            rBuffer = _mm_mulhi_epi16(x3, r0);
3834
0
            t7 = _mm_mullo_epi16(x4, r1);
3835
0
            t3 = _mm_unpacklo_epi16(t8, rBuffer);
3836
0
            x3 = _mm_unpackhi_epi16(t8, rBuffer);
3837
3838
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
3839
0
            rBuffer = _mm_mulhi_epi16(x4, r1);
3840
0
            t8 = _mm_mullo_epi16(x5, r0);
3841
0
            t4 = _mm_unpacklo_epi16(t7, rBuffer);
3842
0
            x4 = _mm_unpackhi_epi16(t7, rBuffer);
3843
3844
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
3845
0
            rBuffer = _mm_mulhi_epi16(x5, r0);
3846
0
            t7 = _mm_mullo_epi16(x6, r1);
3847
0
            t5 = _mm_unpacklo_epi16(t8, rBuffer);
3848
0
            x5 = _mm_unpackhi_epi16(t8, rBuffer);
3849
3850
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
3851
0
            rBuffer = _mm_mulhi_epi16(x6, r1);
3852
0
            t8 = _mm_mullo_epi16(x7, r0);
3853
0
            t6 = _mm_unpacklo_epi16(t7, rBuffer);
3854
0
            x6 = _mm_unpackhi_epi16(t7, rBuffer);
3855
3856
0
            rBuffer = _mm_mulhi_epi16(x7, r0);
3857
0
            t7 = _mm_unpacklo_epi16(t8, rBuffer);
3858
0
            x7 = _mm_unpackhi_epi16(t8, rBuffer);
3859
3860
0
            t8 = _mm_unpacklo_epi16(
3861
0
                    _mm_mullo_epi16(x8,
3862
0
                            _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
3863
0
                            _mm_mulhi_epi16(x8,
3864
0
                                    _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
3865
0
            x8 = _mm_unpackhi_epi16(
3866
0
                    _mm_mullo_epi16(x8,
3867
0
                            _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
3868
0
                            _mm_mulhi_epi16(x8,
3869
0
                                    _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
3870
3871
            /* add calculus by correct value : */
3872
3873
0
            x3 = _mm_add_epi32(x3, x4);
3874
0
            x5 = _mm_add_epi32(x5, x6);
3875
0
            r1 = _mm_add_epi32(x2, x3);
3876
0
            x7 = _mm_add_epi32(x7, x8);
3877
0
            r1 = _mm_add_epi32(r1, x5);
3878
3879
0
            t3 = _mm_add_epi32(t3, t4);
3880
0
            t5 = _mm_add_epi32(t5, t6);
3881
0
            r0 = _mm_add_epi32(t2, t3);
3882
0
            t7 = _mm_add_epi32(t7, t8);
3883
0
            r0 = _mm_add_epi32(r0, t5);
3884
0
            r1 = _mm_add_epi32(r1, x7);
3885
0
            r0 = _mm_add_epi32(r0, t7);
3886
0
            r1 = _mm_srli_epi32(r1, 6);
3887
0
            r0 = _mm_srli_epi32(r0, 6);
3888
3889
0
            r1 = _mm_and_si128(r1,
3890
0
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3891
0
            r0 = _mm_and_si128(r0,
3892
0
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3893
0
            r0 = _mm_hadd_epi16(r0, r1);
3894
0
            _mm_store_si128((__m128i *) &dst[x], r0);
3895
3896
0
        }
3897
0
        tmp += MAX_PB_SIZE;
3898
0
        dst += dststride;
3899
0
    }
3900
0
}
3901
void ff_hevc_put_hevc_qpel_h_2_v_1_sse(int16_t *dst, ptrdiff_t dststride,
3902
                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
3903
0
        int16_t* mcbuffer) {
3904
0
    int x, y;
3905
0
    uint8_t *src = (uint8_t*) _src;
3906
0
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
3907
0
    int16_t *tmp = mcbuffer;
3908
0
    __m128i x1, x2, x3, x4, x5, x6, x7, rBuffer, rTemp, r0, r1;
3909
0
    __m128i t1, t2, t3, t4, t5, t6, t7, t8;
3910
3911
0
    src -= qpel_extra_before[1] * srcstride;
3912
0
    r0 = _mm_set_epi8(-1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11,
3913
0
            4, -1);
3914
3915
    /* LOAD src from memory to registers to limit memory bandwidth */
3916
0
    if (width == 4) {
3917
3918
0
        for (y = 0; y < height + qpel_extra[1]; y += 2) {
3919
            /* load data in register     */
3920
0
            x1 = _mm_loadu_si128((__m128i *) &src[-3]);
3921
0
            src += srcstride;
3922
0
            t1 = _mm_loadu_si128((__m128i *) &src[-3]);
3923
0
            x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3924
0
            t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
3925
0
            x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3926
0
                    _mm_srli_si128(x1, 3));
3927
0
            t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
3928
0
                    _mm_srli_si128(t1, 3));
3929
3930
            /*  PMADDUBSW then PMADDW     */
3931
0
            x2 = _mm_maddubs_epi16(x2, r0);
3932
0
            t2 = _mm_maddubs_epi16(t2, r0);
3933
0
            x3 = _mm_maddubs_epi16(x3, r0);
3934
0
            t3 = _mm_maddubs_epi16(t3, r0);
3935
0
            x2 = _mm_hadd_epi16(x2, x3);
3936
0
            t2 = _mm_hadd_epi16(t2, t3);
3937
0
            x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
3938
0
            t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
3939
0
            x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
3940
0
            t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
3941
            /* give results back            */
3942
0
            _mm_storel_epi64((__m128i *) &tmp[0], x2);
3943
3944
0
            tmp += MAX_PB_SIZE;
3945
0
            _mm_storel_epi64((__m128i *) &tmp[0], t2);
3946
3947
0
            src += srcstride;
3948
0
            tmp += MAX_PB_SIZE;
3949
0
        }
3950
0
    } else
3951
0
        for (y = 0; y < height + qpel_extra[1]; y++) {
3952
0
            for (x = 0; x < width; x += 8) {
3953
                /* load data in register     */
3954
0
                x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
3955
0
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
3956
0
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
3957
0
                        _mm_srli_si128(x1, 3));
3958
0
                x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
3959
0
                        _mm_srli_si128(x1, 5));
3960
0
                x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
3961
0
                        _mm_srli_si128(x1, 7));
3962
3963
                /*  PMADDUBSW then PMADDW     */
3964
0
                x2 = _mm_maddubs_epi16(x2, r0);
3965
0
                x3 = _mm_maddubs_epi16(x3, r0);
3966
0
                x4 = _mm_maddubs_epi16(x4, r0);
3967
0
                x5 = _mm_maddubs_epi16(x5, r0);
3968
0
                x2 = _mm_hadd_epi16(x2, x3);
3969
0
                x4 = _mm_hadd_epi16(x4, x5);
3970
0
                x2 = _mm_hadd_epi16(x2, x4);
3971
0
                x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
3972
3973
                /* give results back            */
3974
0
                _mm_store_si128((__m128i *) &tmp[x], x2);
3975
3976
0
            }
3977
0
            src += srcstride;
3978
0
            tmp += MAX_PB_SIZE;
3979
0
        }
3980
3981
0
    tmp = mcbuffer + qpel_extra_before[1] * MAX_PB_SIZE;
3982
0
    srcstride = MAX_PB_SIZE;
3983
3984
    /* vertical treatment on temp table : tmp contains 16 bit values, so need to use 32 bit  integers
3985
     for register calculations */
3986
0
    rTemp = _mm_set_epi16(0, 1, -5, 17, 58, -10, 4, -1);
3987
0
    for (y = 0; y < height; y++) {
3988
0
        for (x = 0; x < width; x += 8) {
3989
3990
0
            x1 = _mm_load_si128((__m128i *) &tmp[x - 3 * srcstride]);
3991
0
            x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
3992
0
            x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
3993
0
            x4 = _mm_load_si128((__m128i *) &tmp[x]);
3994
0
            x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
3995
0
            x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
3996
0
            x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
3997
3998
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 0));
3999
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
4000
0
            t8 = _mm_mullo_epi16(x1, r0);
4001
0
            rBuffer = _mm_mulhi_epi16(x1, r0);
4002
0
            t7 = _mm_mullo_epi16(x2, r1);
4003
0
            t1 = _mm_unpacklo_epi16(t8, rBuffer);
4004
0
            x1 = _mm_unpackhi_epi16(t8, rBuffer);
4005
4006
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
4007
0
            rBuffer = _mm_mulhi_epi16(x2, r1);
4008
0
            t8 = _mm_mullo_epi16(x3, r0);
4009
0
            t2 = _mm_unpacklo_epi16(t7, rBuffer);
4010
0
            x2 = _mm_unpackhi_epi16(t7, rBuffer);
4011
4012
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
4013
0
            rBuffer = _mm_mulhi_epi16(x3, r0);
4014
0
            t7 = _mm_mullo_epi16(x4, r1);
4015
0
            t3 = _mm_unpacklo_epi16(t8, rBuffer);
4016
0
            x3 = _mm_unpackhi_epi16(t8, rBuffer);
4017
4018
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
4019
0
            rBuffer = _mm_mulhi_epi16(x4, r1);
4020
0
            t8 = _mm_mullo_epi16(x5, r0);
4021
0
            t4 = _mm_unpacklo_epi16(t7, rBuffer);
4022
0
            x4 = _mm_unpackhi_epi16(t7, rBuffer);
4023
4024
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
4025
0
            rBuffer = _mm_mulhi_epi16(x5, r0);
4026
0
            t7 = _mm_mullo_epi16(x6, r1);
4027
0
            t5 = _mm_unpacklo_epi16(t8, rBuffer);
4028
0
            x5 = _mm_unpackhi_epi16(t8, rBuffer);
4029
4030
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
4031
0
            rBuffer = _mm_mulhi_epi16(x6, r1);
4032
0
            t8 = _mm_mullo_epi16(x7, r0);
4033
0
            t6 = _mm_unpacklo_epi16(t7, rBuffer);
4034
0
            x6 = _mm_unpackhi_epi16(t7, rBuffer);
4035
4036
0
            rBuffer = _mm_mulhi_epi16(x7, r0);
4037
0
            t7 = _mm_unpacklo_epi16(t8, rBuffer);
4038
0
            x7 = _mm_unpackhi_epi16(t8, rBuffer);
4039
4040
4041
4042
            /* add calculus by correct value : */
4043
4044
0
            r1 = _mm_add_epi32(x1, x2);
4045
0
            x3 = _mm_add_epi32(x3, x4);
4046
0
            x5 = _mm_add_epi32(x5, x6);
4047
0
            r1 = _mm_add_epi32(r1, x3);
4048
0
            r1 = _mm_add_epi32(r1, x5);
4049
4050
0
            r0 = _mm_add_epi32(t1, t2);
4051
0
            t3 = _mm_add_epi32(t3, t4);
4052
0
            t5 = _mm_add_epi32(t5, t6);
4053
0
            r0 = _mm_add_epi32(r0, t3);
4054
0
            r0 = _mm_add_epi32(r0, t5);
4055
0
            r1 = _mm_add_epi32(r1, x7);
4056
0
            r0 = _mm_add_epi32(r0, t7);
4057
0
            r1 = _mm_srli_epi32(r1, 6);
4058
0
            r0 = _mm_srli_epi32(r0, 6);
4059
4060
0
            r1 = _mm_and_si128(r1,
4061
0
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
4062
0
            r0 = _mm_and_si128(r0,
4063
0
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
4064
0
            r0 = _mm_hadd_epi16(r0, r1);
4065
0
            _mm_store_si128((__m128i *) &dst[x], r0);
4066
4067
0
        }
4068
0
        tmp += MAX_PB_SIZE;
4069
0
        dst += dststride;
4070
0
    }
4071
0
}
4072
void ff_hevc_put_hevc_qpel_h_2_v_2_sse(int16_t *dst, ptrdiff_t dststride,
4073
                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
4074
0
        int16_t* mcbuffer) {
4075
0
    int x, y;
4076
0
    uint8_t *src = (uint8_t*) _src;
4077
0
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
4078
0
    int16_t *tmp = mcbuffer;
4079
0
    __m128i x1, x2, x3, x4, x5, x6, x7, x8, rBuffer, rTemp, r0, r1;
4080
0
    __m128i t1, t2, t3, t4, t5, t6, t7, t8;
4081
4082
0
    src -= qpel_extra_before[2] * srcstride;
4083
0
    r0 = _mm_set_epi8(-1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11,
4084
0
            4, -1);
4085
4086
    /* LOAD src from memory to registers to limit memory bandwidth */
4087
0
    if (width == 4) {
4088
4089
0
        for (y = 0; y < height + qpel_extra[2]; y += 2) {
4090
            /* load data in register     */
4091
0
            x1 = _mm_loadu_si128((__m128i *) &src[-3]);
4092
0
            src += srcstride;
4093
0
            t1 = _mm_loadu_si128((__m128i *) &src[-3]);
4094
0
            x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4095
0
            t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
4096
0
            x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4097
0
                    _mm_srli_si128(x1, 3));
4098
0
            t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
4099
0
                    _mm_srli_si128(t1, 3));
4100
4101
            /*  PMADDUBSW then PMADDW     */
4102
0
            x2 = _mm_maddubs_epi16(x2, r0);
4103
0
            t2 = _mm_maddubs_epi16(t2, r0);
4104
0
            x3 = _mm_maddubs_epi16(x3, r0);
4105
0
            t3 = _mm_maddubs_epi16(t3, r0);
4106
0
            x2 = _mm_hadd_epi16(x2, x3);
4107
0
            t2 = _mm_hadd_epi16(t2, t3);
4108
0
            x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
4109
0
            t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
4110
0
            x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
4111
0
            t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
4112
            /* give results back            */
4113
0
            _mm_storel_epi64((__m128i *) &tmp[0], x2);
4114
4115
0
            tmp += MAX_PB_SIZE;
4116
0
            _mm_storel_epi64((__m128i *) &tmp[0], t2);
4117
4118
0
            src += srcstride;
4119
0
            tmp += MAX_PB_SIZE;
4120
0
        }
4121
0
    } else
4122
0
        for (y = 0; y < height + qpel_extra[2]; y++) {
4123
0
            for (x = 0; x < width; x += 8) {
4124
                /* load data in register     */
4125
0
                x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
4126
0
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4127
0
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4128
0
                        _mm_srli_si128(x1, 3));
4129
0
                x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
4130
0
                        _mm_srli_si128(x1, 5));
4131
0
                x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
4132
0
                        _mm_srli_si128(x1, 7));
4133
4134
                /*  PMADDUBSW then PMADDW     */
4135
0
                x2 = _mm_maddubs_epi16(x2, r0);
4136
0
                x3 = _mm_maddubs_epi16(x3, r0);
4137
0
                x4 = _mm_maddubs_epi16(x4, r0);
4138
0
                x5 = _mm_maddubs_epi16(x5, r0);
4139
0
                x2 = _mm_hadd_epi16(x2, x3);
4140
0
                x4 = _mm_hadd_epi16(x4, x5);
4141
0
                x2 = _mm_hadd_epi16(x2, x4);
4142
0
                x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
4143
4144
                /* give results back            */
4145
0
                _mm_store_si128((__m128i *) &tmp[x], x2);
4146
4147
0
            }
4148
0
            src += srcstride;
4149
0
            tmp += MAX_PB_SIZE;
4150
0
        }
4151
4152
0
    tmp = mcbuffer + qpel_extra_before[2] * MAX_PB_SIZE;
4153
0
    srcstride = MAX_PB_SIZE;
4154
4155
    /* vertical treatment on temp table : tmp contains 16 bit values, so need to use 32 bit  integers
4156
     for register calculations */
4157
0
    rTemp = _mm_set_epi16(-1, 4, -11, 40, 40, -11, 4, -1);
4158
0
    for (y = 0; y < height; y++) {
4159
0
        for (x = 0; x < width; x += 8) {
4160
4161
0
            x1 = _mm_load_si128((__m128i *) &tmp[x - 3 * srcstride]);
4162
0
            x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
4163
0
            x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
4164
0
            x4 = _mm_load_si128((__m128i *) &tmp[x]);
4165
0
            x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
4166
0
            x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
4167
0
            x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
4168
0
            x8 = _mm_load_si128((__m128i *) &tmp[x + 4 * srcstride]);
4169
4170
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 0));
4171
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
4172
0
            t8 = _mm_mullo_epi16(x1, r0);
4173
0
            rBuffer = _mm_mulhi_epi16(x1, r0);
4174
0
            t7 = _mm_mullo_epi16(x2, r1);
4175
0
            t1 = _mm_unpacklo_epi16(t8, rBuffer);
4176
0
            x1 = _mm_unpackhi_epi16(t8, rBuffer);
4177
4178
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
4179
0
            rBuffer = _mm_mulhi_epi16(x2, r1);
4180
0
            t8 = _mm_mullo_epi16(x3, r0);
4181
0
            t2 = _mm_unpacklo_epi16(t7, rBuffer);
4182
0
            x2 = _mm_unpackhi_epi16(t7, rBuffer);
4183
4184
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
4185
0
            rBuffer = _mm_mulhi_epi16(x3, r0);
4186
0
            t7 = _mm_mullo_epi16(x4, r1);
4187
0
            t3 = _mm_unpacklo_epi16(t8, rBuffer);
4188
0
            x3 = _mm_unpackhi_epi16(t8, rBuffer);
4189
4190
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
4191
0
            rBuffer = _mm_mulhi_epi16(x4, r1);
4192
0
            t8 = _mm_mullo_epi16(x5, r0);
4193
0
            t4 = _mm_unpacklo_epi16(t7, rBuffer);
4194
0
            x4 = _mm_unpackhi_epi16(t7, rBuffer);
4195
4196
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
4197
0
            rBuffer = _mm_mulhi_epi16(x5, r0);
4198
0
            t7 = _mm_mullo_epi16(x6, r1);
4199
0
            t5 = _mm_unpacklo_epi16(t8, rBuffer);
4200
0
            x5 = _mm_unpackhi_epi16(t8, rBuffer);
4201
4202
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
4203
0
            rBuffer = _mm_mulhi_epi16(x6, r1);
4204
0
            t8 = _mm_mullo_epi16(x7, r0);
4205
0
            t6 = _mm_unpacklo_epi16(t7, rBuffer);
4206
0
            x6 = _mm_unpackhi_epi16(t7, rBuffer);
4207
4208
0
            rBuffer = _mm_mulhi_epi16(x7, r0);
4209
0
            t7 = _mm_unpacklo_epi16(t8, rBuffer);
4210
0
            x7 = _mm_unpackhi_epi16(t8, rBuffer);
4211
4212
0
            t8 = _mm_unpacklo_epi16(
4213
0
                    _mm_mullo_epi16(x8,
4214
0
                            _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4215
0
                            _mm_mulhi_epi16(x8,
4216
0
                                    _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4217
0
            x8 = _mm_unpackhi_epi16(
4218
0
                    _mm_mullo_epi16(x8,
4219
0
                            _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4220
0
                            _mm_mulhi_epi16(x8,
4221
0
                                    _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4222
4223
            /* add calculus by correct value : */
4224
4225
0
            r1 = _mm_add_epi32(x1, x2);
4226
0
            x3 = _mm_add_epi32(x3, x4);
4227
0
            x5 = _mm_add_epi32(x5, x6);
4228
0
            r1 = _mm_add_epi32(r1, x3);
4229
0
            x7 = _mm_add_epi32(x7, x8);
4230
0
            r1 = _mm_add_epi32(r1, x5);
4231
4232
0
            r0 = _mm_add_epi32(t1, t2);
4233
0
            t3 = _mm_add_epi32(t3, t4);
4234
0
            t5 = _mm_add_epi32(t5, t6);
4235
0
            r0 = _mm_add_epi32(r0, t3);
4236
0
            t7 = _mm_add_epi32(t7, t8);
4237
0
            r0 = _mm_add_epi32(r0, t5);
4238
0
            r1 = _mm_add_epi32(r1, x7);
4239
0
            r0 = _mm_add_epi32(r0, t7);
4240
0
            r1 = _mm_srli_epi32(r1, 6);
4241
0
            r0 = _mm_srli_epi32(r0, 6);
4242
4243
0
            r1 = _mm_and_si128(r1,
4244
0
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
4245
0
            r0 = _mm_and_si128(r0,
4246
0
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
4247
0
            r0 = _mm_hadd_epi16(r0, r1);
4248
0
            _mm_store_si128((__m128i *) &dst[x], r0);
4249
4250
0
        }
4251
0
        tmp += MAX_PB_SIZE;
4252
0
        dst += dststride;
4253
0
    }
4254
0
}
4255
void ff_hevc_put_hevc_qpel_h_2_v_3_sse(int16_t *dst, ptrdiff_t dststride,
4256
                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
4257
0
        int16_t* mcbuffer) {
4258
0
    int x, y;
4259
0
    uint8_t *src = (uint8_t*) _src;
4260
0
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
4261
0
    int16_t *tmp = mcbuffer;
4262
0
    __m128i x1, x2, x3, x4, x5, x6, x7, x8, rBuffer, rTemp, r0, r1;
4263
0
    __m128i t1, t2, t3, t4, t5, t6, t7, t8;
4264
4265
0
    src -= qpel_extra_before[3] * srcstride;
4266
0
    r0 = _mm_set_epi8(-1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11,
4267
0
            4, -1);
4268
4269
    /* LOAD src from memory to registers to limit memory bandwidth */
4270
0
    if (width == 4) {
4271
4272
0
        for (y = 0; y < height + qpel_extra[3]; y += 2) {
4273
            /* load data in register     */
4274
0
            x1 = _mm_loadu_si128((__m128i *) &src[-3]);
4275
0
            src += srcstride;
4276
0
            t1 = _mm_loadu_si128((__m128i *) &src[-3]);
4277
0
            x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4278
0
            t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
4279
0
            x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4280
0
                    _mm_srli_si128(x1, 3));
4281
0
            t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
4282
0
                    _mm_srli_si128(t1, 3));
4283
4284
            /*  PMADDUBSW then PMADDW     */
4285
0
            x2 = _mm_maddubs_epi16(x2, r0);
4286
0
            t2 = _mm_maddubs_epi16(t2, r0);
4287
0
            x3 = _mm_maddubs_epi16(x3, r0);
4288
0
            t3 = _mm_maddubs_epi16(t3, r0);
4289
0
            x2 = _mm_hadd_epi16(x2, x3);
4290
0
            t2 = _mm_hadd_epi16(t2, t3);
4291
0
            x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
4292
0
            t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
4293
0
            x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
4294
0
            t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
4295
            /* give results back            */
4296
0
            _mm_storel_epi64((__m128i *) &tmp[0], x2);
4297
4298
0
            tmp += MAX_PB_SIZE;
4299
0
            _mm_storel_epi64((__m128i *) &tmp[0], t2);
4300
4301
0
            src += srcstride;
4302
0
            tmp += MAX_PB_SIZE;
4303
0
        }
4304
0
    } else
4305
0
        for (y = 0; y < height + qpel_extra[3]; y++) {
4306
0
            for (x = 0; x < width; x += 8) {
4307
                /* load data in register     */
4308
0
                x1 = _mm_loadu_si128((__m128i *) &src[x - 3]);
4309
0
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4310
0
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4311
0
                        _mm_srli_si128(x1, 3));
4312
0
                x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
4313
0
                        _mm_srli_si128(x1, 5));
4314
0
                x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
4315
0
                        _mm_srli_si128(x1, 7));
4316
4317
                /*  PMADDUBSW then PMADDW     */
4318
0
                x2 = _mm_maddubs_epi16(x2, r0);
4319
0
                x3 = _mm_maddubs_epi16(x3, r0);
4320
0
                x4 = _mm_maddubs_epi16(x4, r0);
4321
0
                x5 = _mm_maddubs_epi16(x5, r0);
4322
0
                x2 = _mm_hadd_epi16(x2, x3);
4323
0
                x4 = _mm_hadd_epi16(x4, x5);
4324
0
                x2 = _mm_hadd_epi16(x2, x4);
4325
0
                x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
4326
4327
                /* give results back            */
4328
0
                _mm_store_si128((__m128i *) &tmp[x], x2);
4329
4330
0
            }
4331
0
            src += srcstride;
4332
0
            tmp += MAX_PB_SIZE;
4333
0
        }
4334
4335
0
    tmp = mcbuffer + qpel_extra_before[3] * MAX_PB_SIZE;
4336
0
    srcstride = MAX_PB_SIZE;
4337
4338
    /* vertical treatment on temp table : tmp contains 16 bit values, so need to use 32 bit  integers
4339
     for register calculations */
4340
0
    rTemp = _mm_set_epi16(-1, 4, -10, 58, 17, -5, 1, 0);
4341
0
    for (y = 0; y < height; y++) {
4342
0
        for (x = 0; x < width; x += 8) {
4343
4344
0
            x1 = _mm_setzero_si128();
4345
0
            x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
4346
0
            x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
4347
0
            x4 = _mm_load_si128((__m128i *) &tmp[x]);
4348
0
            x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
4349
0
            x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
4350
0
            x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
4351
0
            x8 = _mm_load_si128((__m128i *) &tmp[x + 4 * srcstride]);
4352
4353
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
4354
4355
0
            t7 = _mm_mullo_epi16(x2, r1);
4356
4357
4358
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
4359
0
            rBuffer = _mm_mulhi_epi16(x2, r1);
4360
0
            t8 = _mm_mullo_epi16(x3, r0);
4361
0
            t2 = _mm_unpacklo_epi16(t7, rBuffer);
4362
0
            x2 = _mm_unpackhi_epi16(t7, rBuffer);
4363
4364
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
4365
0
            rBuffer = _mm_mulhi_epi16(x3, r0);
4366
0
            t7 = _mm_mullo_epi16(x4, r1);
4367
0
            t3 = _mm_unpacklo_epi16(t8, rBuffer);
4368
0
            x3 = _mm_unpackhi_epi16(t8, rBuffer);
4369
4370
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
4371
0
            rBuffer = _mm_mulhi_epi16(x4, r1);
4372
0
            t8 = _mm_mullo_epi16(x5, r0);
4373
0
            t4 = _mm_unpacklo_epi16(t7, rBuffer);
4374
0
            x4 = _mm_unpackhi_epi16(t7, rBuffer);
4375
4376
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
4377
0
            rBuffer = _mm_mulhi_epi16(x5, r0);
4378
0
            t7 = _mm_mullo_epi16(x6, r1);
4379
0
            t5 = _mm_unpacklo_epi16(t8, rBuffer);
4380
0
            x5 = _mm_unpackhi_epi16(t8, rBuffer);
4381
4382
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
4383
0
            rBuffer = _mm_mulhi_epi16(x6, r1);
4384
0
            t8 = _mm_mullo_epi16(x7, r0);
4385
0
            t6 = _mm_unpacklo_epi16(t7, rBuffer);
4386
0
            x6 = _mm_unpackhi_epi16(t7, rBuffer);
4387
4388
0
            rBuffer = _mm_mulhi_epi16(x7, r0);
4389
0
            t7 = _mm_unpacklo_epi16(t8, rBuffer);
4390
0
            x7 = _mm_unpackhi_epi16(t8, rBuffer);
4391
4392
0
            t8 = _mm_unpacklo_epi16(
4393
0
                    _mm_mullo_epi16(x8,
4394
0
                            _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4395
0
                            _mm_mulhi_epi16(x8,
4396
0
                                    _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4397
0
            x8 = _mm_unpackhi_epi16(
4398
0
                    _mm_mullo_epi16(x8,
4399
0
                            _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4400
0
                            _mm_mulhi_epi16(x8,
4401
0
                                    _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4402
4403
            /* add calculus by correct value : */
4404
4405
0
            x3 = _mm_add_epi32(x3, x4);
4406
0
            x5 = _mm_add_epi32(x5, x6);
4407
0
            r1 = _mm_add_epi32(x2, x3);
4408
0
            x7 = _mm_add_epi32(x7, x8);
4409
0
            r1 = _mm_add_epi32(r1, x5);
4410
4411
0
            t3 = _mm_add_epi32(t3, t4);
4412
0
            t5 = _mm_add_epi32(t5, t6);
4413
0
            r0 = _mm_add_epi32(t2, t3);
4414
0
            t7 = _mm_add_epi32(t7, t8);
4415
0
            r0 = _mm_add_epi32(r0, t5);
4416
0
            r1 = _mm_add_epi32(r1, x7);
4417
0
            r0 = _mm_add_epi32(r0, t7);
4418
0
            r1 = _mm_srli_epi32(r1, 6);
4419
0
            r0 = _mm_srli_epi32(r0, 6);
4420
4421
0
            r1 = _mm_and_si128(r1,
4422
0
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
4423
0
            r0 = _mm_and_si128(r0,
4424
0
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
4425
0
            r0 = _mm_hadd_epi16(r0, r1);
4426
0
            _mm_store_si128((__m128i *) &dst[x], r0);
4427
4428
0
        }
4429
0
        tmp += MAX_PB_SIZE;
4430
0
        dst += dststride;
4431
0
    }
4432
0
}
4433
void ff_hevc_put_hevc_qpel_h_3_v_1_sse(int16_t *dst, ptrdiff_t dststride,
4434
                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
4435
0
        int16_t* mcbuffer) {
4436
0
    int x, y;
4437
0
    uint8_t *src = (uint8_t*) _src;
4438
0
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
4439
0
    int16_t *tmp = mcbuffer;
4440
0
    __m128i x1, x2, x3, x4, x5, x6, x7, rBuffer, rTemp, r0, r1;
4441
0
    __m128i t1, t2, t3, t4, t5, t6, t7, t8;
4442
4443
0
    src -= qpel_extra_before[1] * srcstride;
4444
0
    r0 = _mm_set_epi8(-1, 4, -10, 58, 17, -5, 1, 0, -1, 4, -10, 58, 17, -5, 1,
4445
0
            0);
4446
4447
    /* LOAD src from memory to registers to limit memory bandwidth */
4448
0
    if (width == 4) {
4449
4450
0
        for (y = 0; y < height + qpel_extra[1]; y += 2) {
4451
            /* load data in register     */
4452
0
            x1 = _mm_loadu_si128((__m128i *) &src[-2]);
4453
0
            x1 = _mm_slli_si128(x1, 1);
4454
0
            src += srcstride;
4455
0
            t1 = _mm_loadu_si128((__m128i *) &src[-2]);
4456
0
            t1 = _mm_slli_si128(t1, 1);
4457
0
            x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4458
0
            t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
4459
0
            x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4460
0
                    _mm_srli_si128(x1, 3));
4461
0
            t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
4462
0
                    _mm_srli_si128(t1, 3));
4463
4464
            /*  PMADDUBSW then PMADDW     */
4465
0
            x2 = _mm_maddubs_epi16(x2, r0);
4466
0
            t2 = _mm_maddubs_epi16(t2, r0);
4467
0
            x3 = _mm_maddubs_epi16(x3, r0);
4468
0
            t3 = _mm_maddubs_epi16(t3, r0);
4469
0
            x2 = _mm_hadd_epi16(x2, x3);
4470
0
            t2 = _mm_hadd_epi16(t2, t3);
4471
0
            x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
4472
0
            t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
4473
0
            x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
4474
0
            t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
4475
            /* give results back            */
4476
0
            _mm_storel_epi64((__m128i *) &tmp[0], x2);
4477
4478
0
            tmp += MAX_PB_SIZE;
4479
0
            _mm_storel_epi64((__m128i *) &tmp[0], t2);
4480
4481
0
            src += srcstride;
4482
0
            tmp += MAX_PB_SIZE;
4483
0
        }
4484
0
    } else
4485
0
        for (y = 0; y < height + qpel_extra[1]; y++) {
4486
0
            for (x = 0; x < width; x += 8) {
4487
                /* load data in register     */
4488
0
                x1 = _mm_loadu_si128((__m128i *) &src[x - 2]);
4489
0
                x1 = _mm_slli_si128(x1, 1);
4490
0
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4491
0
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4492
0
                        _mm_srli_si128(x1, 3));
4493
0
                x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
4494
0
                        _mm_srli_si128(x1, 5));
4495
0
                x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
4496
0
                        _mm_srli_si128(x1, 7));
4497
4498
                /*  PMADDUBSW then PMADDW     */
4499
0
                x2 = _mm_maddubs_epi16(x2, r0);
4500
0
                x3 = _mm_maddubs_epi16(x3, r0);
4501
0
                x4 = _mm_maddubs_epi16(x4, r0);
4502
0
                x5 = _mm_maddubs_epi16(x5, r0);
4503
0
                x2 = _mm_hadd_epi16(x2, x3);
4504
0
                x4 = _mm_hadd_epi16(x4, x5);
4505
0
                x2 = _mm_hadd_epi16(x2, x4);
4506
0
                x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
4507
4508
                /* give results back            */
4509
0
                _mm_store_si128((__m128i *) &tmp[x], x2);
4510
4511
0
            }
4512
0
            src += srcstride;
4513
0
            tmp += MAX_PB_SIZE;
4514
0
        }
4515
4516
0
    tmp = mcbuffer + qpel_extra_before[1] * MAX_PB_SIZE;
4517
0
    srcstride = MAX_PB_SIZE;
4518
4519
    /* vertical treatment on temp table : tmp contains 16 bit values, so need to use 32 bit  integers
4520
     for register calculations */
4521
0
    rTemp = _mm_set_epi16(0, 1, -5, 17, 58, -10, 4, -1);
4522
0
    for (y = 0; y < height; y++) {
4523
0
        for (x = 0; x < width; x += 8) {
4524
4525
0
            x1 = _mm_load_si128((__m128i *) &tmp[x - 3 * srcstride]);
4526
0
            x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
4527
0
            x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
4528
0
            x4 = _mm_load_si128((__m128i *) &tmp[x]);
4529
0
            x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
4530
0
            x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
4531
0
            x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
4532
4533
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 0));
4534
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
4535
0
            t8 = _mm_mullo_epi16(x1, r0);
4536
0
            rBuffer = _mm_mulhi_epi16(x1, r0);
4537
0
            t7 = _mm_mullo_epi16(x2, r1);
4538
0
            t1 = _mm_unpacklo_epi16(t8, rBuffer);
4539
0
            x1 = _mm_unpackhi_epi16(t8, rBuffer);
4540
4541
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
4542
0
            rBuffer = _mm_mulhi_epi16(x2, r1);
4543
0
            t8 = _mm_mullo_epi16(x3, r0);
4544
0
            t2 = _mm_unpacklo_epi16(t7, rBuffer);
4545
0
            x2 = _mm_unpackhi_epi16(t7, rBuffer);
4546
4547
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
4548
0
            rBuffer = _mm_mulhi_epi16(x3, r0);
4549
0
            t7 = _mm_mullo_epi16(x4, r1);
4550
0
            t3 = _mm_unpacklo_epi16(t8, rBuffer);
4551
0
            x3 = _mm_unpackhi_epi16(t8, rBuffer);
4552
4553
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
4554
0
            rBuffer = _mm_mulhi_epi16(x4, r1);
4555
0
            t8 = _mm_mullo_epi16(x5, r0);
4556
0
            t4 = _mm_unpacklo_epi16(t7, rBuffer);
4557
0
            x4 = _mm_unpackhi_epi16(t7, rBuffer);
4558
4559
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
4560
0
            rBuffer = _mm_mulhi_epi16(x5, r0);
4561
0
            t7 = _mm_mullo_epi16(x6, r1);
4562
0
            t5 = _mm_unpacklo_epi16(t8, rBuffer);
4563
0
            x5 = _mm_unpackhi_epi16(t8, rBuffer);
4564
4565
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
4566
0
            rBuffer = _mm_mulhi_epi16(x6, r1);
4567
0
            t8 = _mm_mullo_epi16(x7, r0);
4568
0
            t6 = _mm_unpacklo_epi16(t7, rBuffer);
4569
0
            x6 = _mm_unpackhi_epi16(t7, rBuffer);
4570
4571
0
            rBuffer = _mm_mulhi_epi16(x7, r0);
4572
0
            t7 = _mm_unpacklo_epi16(t8, rBuffer);
4573
0
            x7 = _mm_unpackhi_epi16(t8, rBuffer);
4574
4575
4576
            /* add calculus by correct value : */
4577
4578
0
            r1 = _mm_add_epi32(x1, x2);
4579
0
            x3 = _mm_add_epi32(x3, x4);
4580
0
            x5 = _mm_add_epi32(x5, x6);
4581
0
            r1 = _mm_add_epi32(r1, x3);
4582
0
            r1 = _mm_add_epi32(r1, x5);
4583
4584
0
            r0 = _mm_add_epi32(t1, t2);
4585
0
            t3 = _mm_add_epi32(t3, t4);
4586
0
            t5 = _mm_add_epi32(t5, t6);
4587
0
            r0 = _mm_add_epi32(r0, t3);
4588
0
            r0 = _mm_add_epi32(r0, t5);
4589
0
            r1 = _mm_add_epi32(r1, x7);
4590
0
            r0 = _mm_add_epi32(r0, t7);
4591
0
            r1 = _mm_srli_epi32(r1, 6);
4592
0
            r0 = _mm_srli_epi32(r0, 6);
4593
4594
0
            r1 = _mm_and_si128(r1,
4595
0
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
4596
0
            r0 = _mm_and_si128(r0,
4597
0
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
4598
0
            r0 = _mm_hadd_epi16(r0, r1);
4599
0
            _mm_store_si128((__m128i *) &dst[x], r0);
4600
4601
0
        }
4602
0
        tmp += MAX_PB_SIZE;
4603
0
        dst += dststride;
4604
0
    }
4605
0
}
4606
void ff_hevc_put_hevc_qpel_h_3_v_2_sse(int16_t *dst, ptrdiff_t dststride,
4607
                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
4608
0
        int16_t* mcbuffer) {
4609
0
    int x, y;
4610
0
    uint8_t *src = (uint8_t*) _src;
4611
0
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
4612
0
    int16_t *tmp = mcbuffer;
4613
0
    __m128i x1, x2, x3, x4, x5, x6, x7, x8, rBuffer, rTemp, r0, r1;
4614
0
    __m128i t1, t2, t3, t4, t5, t6, t7, t8;
4615
4616
0
    src -= qpel_extra_before[2] * srcstride;
4617
0
    r0 = _mm_set_epi8(-1, 4, -10, 58, 17, -5, 1, 0, -1, 4, -10, 58, 17, -5, 1,
4618
0
            0);
4619
4620
    /* LOAD src from memory to registers to limit memory bandwidth */
4621
0
    if (width == 4) {
4622
4623
0
        for (y = 0; y < height + qpel_extra[2]; y += 2) {
4624
            /* load data in register     */
4625
0
            x1 = _mm_loadu_si128((__m128i *) &src[-2]);
4626
0
            x1 = _mm_slli_si128(x1, 1);
4627
0
            src += srcstride;
4628
0
            t1 = _mm_loadu_si128((__m128i *) &src[-2]);
4629
0
            t1 = _mm_slli_si128(t1, 1);
4630
0
            x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4631
0
            t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
4632
0
            x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4633
0
                    _mm_srli_si128(x1, 3));
4634
0
            t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
4635
0
                    _mm_srli_si128(t1, 3));
4636
4637
            /*  PMADDUBSW then PMADDW     */
4638
0
            x2 = _mm_maddubs_epi16(x2, r0);
4639
0
            t2 = _mm_maddubs_epi16(t2, r0);
4640
0
            x3 = _mm_maddubs_epi16(x3, r0);
4641
0
            t3 = _mm_maddubs_epi16(t3, r0);
4642
0
            x2 = _mm_hadd_epi16(x2, x3);
4643
0
            t2 = _mm_hadd_epi16(t2, t3);
4644
0
            x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
4645
0
            t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
4646
0
            x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
4647
0
            t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
4648
            /* give results back            */
4649
0
            _mm_storel_epi64((__m128i *) &tmp[0], x2);
4650
4651
0
            tmp += MAX_PB_SIZE;
4652
0
            _mm_storel_epi64((__m128i *) &tmp[0], t2);
4653
4654
0
            src += srcstride;
4655
0
            tmp += MAX_PB_SIZE;
4656
0
        }
4657
0
    } else
4658
0
        for (y = 0; y < height + qpel_extra[2]; y++) {
4659
0
            for (x = 0; x < width; x += 8) {
4660
                /* load data in register     */
4661
0
                x1 = _mm_loadu_si128((__m128i *) &src[x - 2]);
4662
0
                x1 = _mm_slli_si128(x1, 1);
4663
0
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4664
0
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4665
0
                        _mm_srli_si128(x1, 3));
4666
0
                x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
4667
0
                        _mm_srli_si128(x1, 5));
4668
0
                x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
4669
0
                        _mm_srli_si128(x1, 7));
4670
4671
                /*  PMADDUBSW then PMADDW     */
4672
0
                x2 = _mm_maddubs_epi16(x2, r0);
4673
0
                x3 = _mm_maddubs_epi16(x3, r0);
4674
0
                x4 = _mm_maddubs_epi16(x4, r0);
4675
0
                x5 = _mm_maddubs_epi16(x5, r0);
4676
0
                x2 = _mm_hadd_epi16(x2, x3);
4677
0
                x4 = _mm_hadd_epi16(x4, x5);
4678
0
                x2 = _mm_hadd_epi16(x2, x4);
4679
0
                x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
4680
4681
                /* give results back            */
4682
0
                _mm_store_si128((__m128i *) &tmp[x], x2);
4683
4684
0
            }
4685
0
            src += srcstride;
4686
0
            tmp += MAX_PB_SIZE;
4687
0
        }
4688
4689
0
    tmp = mcbuffer + qpel_extra_before[2] * MAX_PB_SIZE;
4690
0
    srcstride = MAX_PB_SIZE;
4691
4692
    /* vertical treatment on temp table : tmp contains 16 bit values, so need to use 32 bit  integers
4693
     for register calculations */
4694
0
    rTemp = _mm_set_epi16(-1, 4, -11, 40, 40, -11, 4, -1);
4695
0
    for (y = 0; y < height; y++) {
4696
0
        for (x = 0; x < width; x += 8) {
4697
4698
0
            x1 = _mm_load_si128((__m128i *) &tmp[x - 3 * srcstride]);
4699
0
            x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
4700
0
            x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
4701
0
            x4 = _mm_load_si128((__m128i *) &tmp[x]);
4702
0
            x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
4703
0
            x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
4704
0
            x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
4705
0
            x8 = _mm_load_si128((__m128i *) &tmp[x + 4 * srcstride]);
4706
4707
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 0));
4708
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
4709
0
            t8 = _mm_mullo_epi16(x1, r0);
4710
0
            rBuffer = _mm_mulhi_epi16(x1, r0);
4711
0
            t7 = _mm_mullo_epi16(x2, r1);
4712
0
            t1 = _mm_unpacklo_epi16(t8, rBuffer);
4713
0
            x1 = _mm_unpackhi_epi16(t8, rBuffer);
4714
4715
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
4716
0
            rBuffer = _mm_mulhi_epi16(x2, r1);
4717
0
            t8 = _mm_mullo_epi16(x3, r0);
4718
0
            t2 = _mm_unpacklo_epi16(t7, rBuffer);
4719
0
            x2 = _mm_unpackhi_epi16(t7, rBuffer);
4720
4721
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
4722
0
            rBuffer = _mm_mulhi_epi16(x3, r0);
4723
0
            t7 = _mm_mullo_epi16(x4, r1);
4724
0
            t3 = _mm_unpacklo_epi16(t8, rBuffer);
4725
0
            x3 = _mm_unpackhi_epi16(t8, rBuffer);
4726
4727
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
4728
0
            rBuffer = _mm_mulhi_epi16(x4, r1);
4729
0
            t8 = _mm_mullo_epi16(x5, r0);
4730
0
            t4 = _mm_unpacklo_epi16(t7, rBuffer);
4731
0
            x4 = _mm_unpackhi_epi16(t7, rBuffer);
4732
4733
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
4734
0
            rBuffer = _mm_mulhi_epi16(x5, r0);
4735
0
            t7 = _mm_mullo_epi16(x6, r1);
4736
0
            t5 = _mm_unpacklo_epi16(t8, rBuffer);
4737
0
            x5 = _mm_unpackhi_epi16(t8, rBuffer);
4738
4739
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
4740
0
            rBuffer = _mm_mulhi_epi16(x6, r1);
4741
0
            t8 = _mm_mullo_epi16(x7, r0);
4742
0
            t6 = _mm_unpacklo_epi16(t7, rBuffer);
4743
0
            x6 = _mm_unpackhi_epi16(t7, rBuffer);
4744
4745
0
            rBuffer = _mm_mulhi_epi16(x7, r0);
4746
0
            t7 = _mm_unpacklo_epi16(t8, rBuffer);
4747
0
            x7 = _mm_unpackhi_epi16(t8, rBuffer);
4748
4749
0
            t8 = _mm_unpacklo_epi16(
4750
0
                    _mm_mullo_epi16(x8,
4751
0
                            _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4752
0
                            _mm_mulhi_epi16(x8,
4753
0
                                    _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4754
0
            x8 = _mm_unpackhi_epi16(
4755
0
                    _mm_mullo_epi16(x8,
4756
0
                            _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4757
0
                            _mm_mulhi_epi16(x8,
4758
0
                                    _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4759
4760
            /* add calculus by correct value : */
4761
4762
0
            r1 = _mm_add_epi32(x1, x2);
4763
0
            x3 = _mm_add_epi32(x3, x4);
4764
0
            x5 = _mm_add_epi32(x5, x6);
4765
0
            r1 = _mm_add_epi32(r1, x3);
4766
0
            x7 = _mm_add_epi32(x7, x8);
4767
0
            r1 = _mm_add_epi32(r1, x5);
4768
4769
0
            r0 = _mm_add_epi32(t1, t2);
4770
0
            t3 = _mm_add_epi32(t3, t4);
4771
0
            t5 = _mm_add_epi32(t5, t6);
4772
0
            r0 = _mm_add_epi32(r0, t3);
4773
0
            t7 = _mm_add_epi32(t7, t8);
4774
0
            r0 = _mm_add_epi32(r0, t5);
4775
0
            r1 = _mm_add_epi32(r1, x7);
4776
0
            r0 = _mm_add_epi32(r0, t7);
4777
0
            r1 = _mm_srli_epi32(r1, 6);
4778
0
            r0 = _mm_srli_epi32(r0, 6);
4779
4780
0
            r1 = _mm_and_si128(r1,
4781
0
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
4782
0
            r0 = _mm_and_si128(r0,
4783
0
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
4784
0
            r0 = _mm_hadd_epi16(r0, r1);
4785
0
            _mm_store_si128((__m128i *) &dst[x], r0);
4786
4787
0
        }
4788
0
        tmp += MAX_PB_SIZE;
4789
0
        dst += dststride;
4790
0
    }
4791
0
}
4792
void ff_hevc_put_hevc_qpel_h_3_v_3_sse(int16_t *dst, ptrdiff_t dststride,
4793
                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
4794
0
        int16_t* mcbuffer) {
4795
0
    int x, y;
4796
0
    uint8_t *src = (uint8_t*) _src;
4797
0
    ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
4798
0
    int16_t *tmp = mcbuffer;
4799
0
    __m128i x1, x2, x3, x4, x5, x6, x7, x8, rBuffer, rTemp, r0, r1;
4800
0
    __m128i t1, t2, t3, t4, t5, t6, t7, t8;
4801
4802
0
    src -= qpel_extra_before[3] * srcstride;
4803
0
    r0 = _mm_set_epi8(-1, 4, -10, 58, 17, -5, 1, 0, -1, 4, -10, 58, 17, -5, 1,
4804
0
            0);
4805
4806
    /* LOAD src from memory to registers to limit memory bandwidth */
4807
0
    if (width == 4) {
4808
4809
0
        for (y = 0; y < height + qpel_extra[3]; y += 2) {
4810
            /* load data in register     */
4811
0
            x1 = _mm_loadu_si128((__m128i *) &src[-2]);
4812
0
            x1 = _mm_slli_si128(x1, 1);
4813
0
            src += srcstride;
4814
0
            t1 = _mm_loadu_si128((__m128i *) &src[-2]);
4815
0
            t1 = _mm_slli_si128(t1, 1);
4816
0
            x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4817
0
            t2 = _mm_unpacklo_epi64(t1, _mm_srli_si128(t1, 1));
4818
0
            x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4819
0
                    _mm_srli_si128(x1, 3));
4820
0
            t3 = _mm_unpacklo_epi64(_mm_srli_si128(t1, 2),
4821
0
                    _mm_srli_si128(t1, 3));
4822
4823
            /*  PMADDUBSW then PMADDW     */
4824
0
            x2 = _mm_maddubs_epi16(x2, r0);
4825
0
            t2 = _mm_maddubs_epi16(t2, r0);
4826
0
            x3 = _mm_maddubs_epi16(x3, r0);
4827
0
            t3 = _mm_maddubs_epi16(t3, r0);
4828
0
            x2 = _mm_hadd_epi16(x2, x3);
4829
0
            t2 = _mm_hadd_epi16(t2, t3);
4830
0
            x2 = _mm_hadd_epi16(x2, _mm_set1_epi16(0));
4831
0
            t2 = _mm_hadd_epi16(t2, _mm_set1_epi16(0));
4832
0
            x2 = _mm_srli_epi16(x2, BIT_DEPTH - 8);
4833
0
            t2 = _mm_srli_epi16(t2, BIT_DEPTH - 8);
4834
            /* give results back            */
4835
0
            _mm_storel_epi64((__m128i *) &tmp[0], x2);
4836
4837
0
            tmp += MAX_PB_SIZE;
4838
0
            _mm_storel_epi64((__m128i *) &tmp[0], t2);
4839
4840
0
            src += srcstride;
4841
0
            tmp += MAX_PB_SIZE;
4842
0
        }
4843
0
    } else
4844
0
        for (y = 0; y < height + qpel_extra[3]; y++) {
4845
0
            for (x = 0; x < width; x += 8) {
4846
                /* load data in register     */
4847
0
                x1 = _mm_loadu_si128((__m128i *) &src[x - 2]);
4848
0
                x1 = _mm_slli_si128(x1, 1);
4849
0
                x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1));
4850
0
                x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2),
4851
0
                        _mm_srli_si128(x1, 3));
4852
0
                x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4),
4853
0
                        _mm_srli_si128(x1, 5));
4854
0
                x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6),
4855
0
                        _mm_srli_si128(x1, 7));
4856
4857
                /*  PMADDUBSW then PMADDW     */
4858
0
                x2 = _mm_maddubs_epi16(x2, r0);
4859
0
                x3 = _mm_maddubs_epi16(x3, r0);
4860
0
                x4 = _mm_maddubs_epi16(x4, r0);
4861
0
                x5 = _mm_maddubs_epi16(x5, r0);
4862
0
                x2 = _mm_hadd_epi16(x2, x3);
4863
0
                x4 = _mm_hadd_epi16(x4, x5);
4864
0
                x2 = _mm_hadd_epi16(x2, x4);
4865
0
                x2 = _mm_srli_si128(x2, BIT_DEPTH - 8);
4866
4867
                /* give results back            */
4868
0
                _mm_store_si128((__m128i *) &tmp[x], x2);
4869
4870
0
            }
4871
0
            src += srcstride;
4872
0
            tmp += MAX_PB_SIZE;
4873
0
        }
4874
4875
0
    tmp = mcbuffer + qpel_extra_before[3] * MAX_PB_SIZE;
4876
0
    srcstride = MAX_PB_SIZE;
4877
4878
    /* vertical treatment on temp table : tmp contains 16 bit values, so need to use 32 bit  integers
4879
     for register calculations */
4880
0
    rTemp = _mm_set_epi16(-1, 4, -10, 58, 17, -5, 1, 0);
4881
0
    for (y = 0; y < height; y++) {
4882
0
        for (x = 0; x < width; x += 8) {
4883
4884
0
            x1 = _mm_setzero_si128();
4885
0
            x2 = _mm_load_si128((__m128i *) &tmp[x - 2 * srcstride]);
4886
0
            x3 = _mm_load_si128((__m128i *) &tmp[x - srcstride]);
4887
0
            x4 = _mm_load_si128((__m128i *) &tmp[x]);
4888
0
            x5 = _mm_load_si128((__m128i *) &tmp[x + srcstride]);
4889
0
            x6 = _mm_load_si128((__m128i *) &tmp[x + 2 * srcstride]);
4890
0
            x7 = _mm_load_si128((__m128i *) &tmp[x + 3 * srcstride]);
4891
0
            x8 = _mm_load_si128((__m128i *) &tmp[x + 4 * srcstride]);
4892
4893
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 1));
4894
0
            t7 = _mm_mullo_epi16(x2, r1);
4895
4896
4897
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 2));
4898
0
            rBuffer = _mm_mulhi_epi16(x2, r1);
4899
0
            t8 = _mm_mullo_epi16(x3, r0);
4900
0
            t2 = _mm_unpacklo_epi16(t7, rBuffer);
4901
0
            x2 = _mm_unpackhi_epi16(t7, rBuffer);
4902
4903
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 3));
4904
0
            rBuffer = _mm_mulhi_epi16(x3, r0);
4905
0
            t7 = _mm_mullo_epi16(x4, r1);
4906
0
            t3 = _mm_unpacklo_epi16(t8, rBuffer);
4907
0
            x3 = _mm_unpackhi_epi16(t8, rBuffer);
4908
4909
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 4));
4910
0
            rBuffer = _mm_mulhi_epi16(x4, r1);
4911
0
            t8 = _mm_mullo_epi16(x5, r0);
4912
0
            t4 = _mm_unpacklo_epi16(t7, rBuffer);
4913
0
            x4 = _mm_unpackhi_epi16(t7, rBuffer);
4914
4915
0
            r1 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 5));
4916
0
            rBuffer = _mm_mulhi_epi16(x5, r0);
4917
0
            t7 = _mm_mullo_epi16(x6, r1);
4918
0
            t5 = _mm_unpacklo_epi16(t8, rBuffer);
4919
0
            x5 = _mm_unpackhi_epi16(t8, rBuffer);
4920
4921
0
            r0 = _mm_set1_epi16(_mm_extract_epi16(rTemp, 6));
4922
0
            rBuffer = _mm_mulhi_epi16(x6, r1);
4923
0
            t8 = _mm_mullo_epi16(x7, r0);
4924
0
            t6 = _mm_unpacklo_epi16(t7, rBuffer);
4925
0
            x6 = _mm_unpackhi_epi16(t7, rBuffer);
4926
4927
0
            rBuffer = _mm_mulhi_epi16(x7, r0);
4928
0
            t7 = _mm_unpacklo_epi16(t8, rBuffer);
4929
0
            x7 = _mm_unpackhi_epi16(t8, rBuffer);
4930
4931
0
            t8 = _mm_unpacklo_epi16(
4932
0
                    _mm_mullo_epi16(x8,
4933
0
                            _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4934
0
                            _mm_mulhi_epi16(x8,
4935
0
                                    _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4936
0
            x8 = _mm_unpackhi_epi16(
4937
0
                    _mm_mullo_epi16(x8,
4938
0
                            _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))),
4939
0
                            _mm_mulhi_epi16(x8,
4940
0
                                    _mm_set1_epi16(_mm_extract_epi16(rTemp, 7))));
4941
4942
            /* add calculus by correct value : */
4943
4944
0
            x3 = _mm_add_epi32(x3, x4);
4945
0
            x5 = _mm_add_epi32(x5, x6);
4946
0
            r1 = _mm_add_epi32(x2, x3);
4947
0
            x7 = _mm_add_epi32(x7, x8);
4948
0
            r1 = _mm_add_epi32(r1, x5);
4949
4950
0
            t3 = _mm_add_epi32(t3, t4);
4951
0
            t5 = _mm_add_epi32(t5, t6);
4952
0
            r0 = _mm_add_epi32(t2, t3);
4953
0
            t7 = _mm_add_epi32(t7, t8);
4954
0
            r0 = _mm_add_epi32(r0, t5);
4955
0
            r1 = _mm_add_epi32(r1, x7);
4956
0
            r0 = _mm_add_epi32(r0, t7);
4957
0
            r1 = _mm_srli_epi32(r1, 6);
4958
0
            r0 = _mm_srli_epi32(r0, 6);
4959
4960
0
            r1 = _mm_and_si128(r1,
4961
0
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
4962
0
            r0 = _mm_and_si128(r0,
4963
0
                    _mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
4964
0
            r0 = _mm_hadd_epi16(r0, r1);
4965
0
            _mm_store_si128((__m128i *) &dst[x], r0);
4966
4967
0
        }
4968
0
        tmp += MAX_PB_SIZE;
4969
0
        dst += dststride;
4970
0
    }
4971
0
}