Coverage Report

Created: 2026-02-14 07:09

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/openh264/codec/common/src/mc.cpp
Line
Count
Source
1
/*!
2
 * \copy
3
 *     Copyright (c)  2009-2013, Cisco Systems
4
 *     All rights reserved.
5
 *
6
 *     Redistribution and use in source and binary forms, with or without
7
 *     modification, are permitted provided that the following conditions
8
 *     are met:
9
 *
10
 *        * Redistributions of source code must retain the above copyright
11
 *          notice, this list of conditions and the following disclaimer.
12
 *
13
 *        * Redistributions in binary form must reproduce the above copyright
14
 *          notice, this list of conditions and the following disclaimer in
15
 *          the documentation and/or other materials provided with the
16
 *          distribution.
17
 *
18
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29
 *     POSSIBILITY OF SUCH DAMAGE.
30
 *
31
 *
32
 * \file    mc.cpp
33
 *
34
 * \brief   Interfaces implementation for motion compensation
35
 *
36
 * \date    03/17/2009 Created
37
 *
38
 *************************************************************************************
39
 */
40
41
#include "mc.h"
42
43
#include "cpu_core.h"
44
#include "ls_defines.h"
45
#include "macros.h"
46
#include "asmdefs_mmi.h"
47
48
namespace {
49
50
// Chroma MC kernel for one fixed block width. kpABCD points at interpolation
// weights (presumably one g_kuiABCD[dy][dx] entry -- confirm at the call sites).
typedef void (*PMcChromaWidthExtFunc) (const uint8_t* pSrc, int32_t iSrcStride,
                                       uint8_t* pDst, int32_t iDstStride,
                                       const uint8_t* kpABCD, int32_t iHeight);

// Fixed-width two-source kernel. NOTE(review): the parameter names mirror how the
// PixelAvgWidthEq* kernels are called in this file (dst, stride, srcA, stride,
// srcB, stride, height); confirm against the kernel prototypes.
typedef void (*PWelsSampleWidthAveragingFunc) (uint8_t* pDst, int32_t iDstStride,
                                               const uint8_t* pSrcA, int32_t iSrcAStride,
                                               const uint8_t* pSrcB, int32_t iSrcBStride,
                                               int32_t iHeight);

// MC kernel that receives both block width and block height explicitly; this is
// the element type of the dispatch table in McLuma_c.
typedef void (*PWelsMcWidthHeightFunc) (const uint8_t* pSrc, int32_t iSrcStride,
                                        uint8_t* pDst, int32_t iDstStride,
                                        int32_t iWidth, int32_t iHeight);
56
57
/*------------------weight for chroma fraction pixel interpolation------------------*/
58
//iA = (8 - dx) * (8 - dy);
59
//iB = dx * (8 - dy);
60
//iC = (8 - dx) * dy;
61
//iD = dx * dy
62
// Bilinear chroma weights indexed as g_kuiABCD[dy][dx] = {iA, iB, iC, iD}, with
// dx and dy the 1/8-sample fractions (0..7). One source row per dy value;
// columns run dx = 0..7. The four weights of every entry sum to 64.
static const uint8_t g_kuiABCD[8][8][4] = {
  // dy = 0
  { {64, 0, 0, 0}, {56, 8, 0, 0}, {48, 16, 0, 0}, {40, 24, 0, 0},
    {32, 32, 0, 0}, {24, 40, 0, 0}, {16, 48, 0, 0}, {8, 56, 0, 0} },
  // dy = 1
  { {56, 0, 8, 0}, {49, 7, 7, 1}, {42, 14, 6, 2}, {35, 21, 5, 3},
    {28, 28, 4, 4}, {21, 35, 3, 5}, {14, 42, 2, 6}, {7, 49, 1, 7} },
  // dy = 2
  { {48, 0, 16, 0}, {42, 6, 14, 2}, {36, 12, 12, 4}, {30, 18, 10, 6},
    {24, 24, 8, 8}, {18, 30, 6, 10}, {12, 36, 4, 12}, {6, 42, 2, 14} },
  // dy = 3
  { {40, 0, 24, 0}, {35, 5, 21, 3}, {30, 10, 18, 6}, {25, 15, 15, 9},
    {20, 20, 12, 12}, {15, 25, 9, 15}, {10, 30, 6, 18}, {5, 35, 3, 21} },
  // dy = 4
  { {32, 0, 32, 0}, {28, 4, 28, 4}, {24, 8, 24, 8}, {20, 12, 20, 12},
    {16, 16, 16, 16}, {12, 20, 12, 20}, {8, 24, 8, 24}, {4, 28, 4, 28} },
  // dy = 5
  { {24, 0, 40, 0}, {21, 3, 35, 5}, {18, 6, 30, 10}, {15, 9, 25, 15},
    {12, 12, 20, 20}, {9, 15, 15, 25}, {6, 18, 10, 30}, {3, 21, 5, 35} },
  // dy = 6
  { {16, 0, 48, 0}, {14, 2, 42, 6}, {12, 4, 36, 12}, {10, 6, 30, 18},
    {8, 8, 24, 24}, {6, 10, 18, 30}, {4, 12, 12, 36}, {2, 14, 6, 42} },
  // dy = 7
  { {8, 0, 56, 0}, {7, 1, 49, 7}, {6, 2, 42, 14}, {5, 3, 35, 21},
    {4, 4, 28, 28}, {3, 5, 21, 35}, {2, 6, 14, 42}, {1, 7, 7, 49} }
};
96
97
//***************************************************************************//
98
//                          C code implementation                            //
99
//***************************************************************************//
100
static inline void McCopyWidthEq2_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
101
0
                                     int32_t iHeight) {
102
0
  int32_t i;
103
0
  for (i = 0; i < iHeight; i++) { // iWidth == 2 only for chroma
104
0
    ST16A2 (pDst, LD16 (pSrc));
105
0
    pDst += iDstStride;
106
0
    pSrc += iSrcStride;
107
0
  }
108
0
}
109
110
static inline void McCopyWidthEq4_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
111
0
                                     int32_t iHeight) {
112
0
  int32_t i;
113
0
  for (i = 0; i < iHeight; i++) {
114
0
    ST32A4 (pDst, LD32 (pSrc));
115
0
    pDst += iDstStride;
116
0
    pSrc += iSrcStride;
117
0
  }
118
0
}
119
120
static inline void McCopyWidthEq8_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
121
0
                                     int32_t iHeight) {
122
0
  int32_t i;
123
0
  for (i = 0; i < iHeight; i++) {
124
0
    ST64A8 (pDst, LD64 (pSrc));
125
0
    pDst += iDstStride;
126
0
    pSrc += iSrcStride;
127
0
  }
128
0
}
129
130
static inline void McCopyWidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
131
0
                                      int32_t iHeight) {
132
0
  int32_t i;
133
0
  for (i = 0; i < iHeight; i++) {
134
0
    ST64A8 (pDst  , LD64 (pSrc));
135
0
    ST64A8 (pDst + 8, LD64 (pSrc + 8));
136
0
    pDst += iDstStride;
137
0
    pSrc += iSrcStride;
138
0
  }
139
0
}
140
141
//--------------------Luma sample MC------------------//
142
143
0
// Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to pSrc[0..5] and returns the
// raw sum; no rounding, shifting or clipping is done here.
static inline int32_t HorFilterInput16bit_c (const int16_t* pSrc) {
  const int32_t kiOuter  = pSrc[0] + pSrc[5]; // weight  1
  const int32_t kiMiddle = pSrc[1] + pSrc[4]; // weight -5
  const int32_t kiInner  = pSrc[2] + pSrc[3]; // weight 20

  return (kiOuter - (kiMiddle * 5) + (kiInner * 20));
}
150
// horizontal filtering: kiOffset = 1 / vertical filtering: kiOffset = iSrcStride
151
0
// Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to 8-bit samples that lie
// kiOffset bytes apart.
//   pSrc     - the third tap; the taps span pSrc - 2*kiOffset .. pSrc + 3*kiOffset
//   kiOffset - byte distance between neighbouring taps
// Returns the raw filter sum without rounding or clipping. The sum can be
// negative; its range is [-2550, 10710].
// Everything is computed in int32_t: a negative sum no longer depends on
// unsigned wrap-around followed by an implementation-defined unsigned->signed
// conversion, and tap offsets are built by multiplication so that a negative
// kiOffset is never left-shifted.
static inline int32_t FilterInput8bitWithStride_c (const uint8_t* pSrc, const int32_t kiOffset) {
  const int32_t kiOffset2 = kiOffset * 2;
  const int32_t kiOffset3 = kiOffset * 3;
  const int32_t kiPix05   = pSrc[-kiOffset2] + pSrc[kiOffset3]; // weight  1
  const int32_t kiPix14   = pSrc[-kiOffset] + pSrc[kiOffset2];  // weight -5
  const int32_t kiPix23   = pSrc[0] + pSrc[kiOffset];           // weight 20

  return kiPix05 - 5 * kiPix14 + 20 * kiPix23;
}
161
162
// Writes the rounded-up average (a + b + 1) >> 1 of two iWidth x iHeight sample
// blocks into pDst. Each of the three buffers advances by its own stride after
// every row.
static inline void PixelAvg_c (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
                               const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
  int32_t iRow, iCol;
  for (iRow = 0; iRow < iHeight; ++iRow) {
    for (iCol = 0; iCol < iWidth; ++iCol) {
      const int32_t kiSum = pSrcA[iCol] + pSrcB[iCol];
      pDst[iCol] = (uint8_t) ((kiSum + 1) >> 1);
    }
    pSrcA += iSrcAStride;
    pSrcB += iSrcBStride;
    pDst  += iDstStride;
  }
}
174
static inline void McCopy_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
175
0
                             int32_t iHeight) {
176
0
  if (iWidth == 16)
177
0
    McCopyWidthEq16_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
178
0
  else if (iWidth == 8)
179
0
    McCopyWidthEq8_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
180
0
  else if (iWidth == 4)
181
0
    McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
182
0
  else //here iWidth == 2
183
0
    McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
184
0
}
185
186
//horizontal filter to gain half sample, that is (2, 0) location in quarter sample
187
static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
188
                                 int32_t iWidth,
189
0
                                 int32_t iHeight) {
190
0
  int32_t i, j;
191
0
  for (i = 0; i < iHeight; i++) {
192
0
    for (j = 0; j < iWidth; j++) {
193
0
      pDst[j] = WelsClip1 ((FilterInput8bitWithStride_c (pSrc + j, 1) + 16) >> 5);
194
0
    }
195
0
    pDst += iDstStride;
196
0
    pSrc += iSrcStride;
197
0
  }
198
0
}
199
200
//vertical filter to gain half sample, that is (0, 2) location in quarter sample
201
static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
202
                                 int32_t iWidth,
203
0
                                 int32_t iHeight) {
204
0
  int32_t i, j;
205
0
  for (i = 0; i < iHeight; i++) {
206
0
    for (j = 0; j < iWidth; j++) {
207
0
      pDst[j] = WelsClip1 ((FilterInput8bitWithStride_c (pSrc + j, iSrcStride) + 16) >> 5);
208
0
    }
209
0
    pDst += iDstStride;
210
0
    pSrc += iSrcStride;
211
0
  }
212
0
}
213
214
//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
215
static inline void McHorVer22_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
216
                                 int32_t iWidth,
217
0
                                 int32_t iHeight) {
218
0
  int16_t iTmp[17 + 5];
219
0
  int32_t i, j, k;
220
221
0
  for (i = 0; i < iHeight; i++) {
222
0
    for (j = 0; j < iWidth + 5; j++) {
223
0
      iTmp[j] = FilterInput8bitWithStride_c (pSrc - 2 + j, iSrcStride);
224
0
    }
225
0
    for (k = 0; k < iWidth; k++) {
226
0
      pDst[k] = WelsClip1 ((HorFilterInput16bit_c (&iTmp[k]) + 512) >> 10);
227
0
    }
228
0
    pSrc += iSrcStride;
229
0
    pDst += iDstStride;
230
0
  }
231
0
}
232
233
/////////////////////luma MC//////////////////////////
234
static inline void McHorVer01_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
235
                                 int32_t iWidth,
236
0
                                 int32_t iHeight) {
237
0
  uint8_t uiTmp[256];
238
0
  McHorVer02_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
239
0
  PixelAvg_c (pDst, iDstStride, pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
240
0
}
241
static inline void McHorVer03_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
242
                                 int32_t iWidth,
243
0
                                 int32_t iHeight) {
244
0
  uint8_t uiTmp[256];
245
0
  McHorVer02_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
246
0
  PixelAvg_c (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, uiTmp, 16, iWidth, iHeight);
247
0
}
248
static inline void McHorVer10_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
249
                                 int32_t iWidth,
250
0
                                 int32_t iHeight) {
251
0
  uint8_t uiTmp[256];
252
0
  McHorVer20_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
253
0
  PixelAvg_c (pDst, iDstStride, pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
254
0
}
255
static inline void McHorVer11_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
256
                                 int32_t iWidth,
257
0
                                 int32_t iHeight) {
258
0
  uint8_t uiHorTmp[256];
259
0
  uint8_t uiVerTmp[256];
260
0
  McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
261
0
  McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
262
0
  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
263
0
}
264
static inline void McHorVer12_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
265
                                 int32_t iWidth,
266
0
                                 int32_t iHeight) {
267
0
  uint8_t uiVerTmp[256];
268
0
  uint8_t uiCtrTmp[256];
269
0
  McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
270
0
  McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
271
0
  PixelAvg_c (pDst, iDstStride, uiVerTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
272
0
}
273
static inline void McHorVer13_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
274
                                 int32_t iWidth,
275
0
                                 int32_t iHeight) {
276
0
  uint8_t uiHorTmp[256];
277
0
  uint8_t uiVerTmp[256];
278
0
  McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
279
0
  McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
280
0
  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
281
0
}
282
static inline void McHorVer21_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
283
                                 int32_t iWidth,
284
0
                                 int32_t iHeight) {
285
0
  uint8_t uiHorTmp[256];
286
0
  uint8_t uiCtrTmp[256];
287
0
  McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
288
0
  McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
289
0
  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
290
0
}
291
static inline void McHorVer23_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
292
                                 int32_t iWidth,
293
0
                                 int32_t iHeight) {
294
0
  uint8_t uiHorTmp[256];
295
0
  uint8_t uiCtrTmp[256];
296
0
  McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
297
0
  McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
298
0
  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
299
0
}
300
static inline void McHorVer30_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
301
                                 int32_t iWidth,
302
0
                                 int32_t iHeight) {
303
0
  uint8_t uiHorTmp[256];
304
0
  McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
305
0
  PixelAvg_c (pDst, iDstStride, pSrc + 1, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
306
0
}
307
static inline void McHorVer31_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
308
                                 int32_t iWidth,
309
0
                                 int32_t iHeight) {
310
0
  uint8_t uiHorTmp[256];
311
0
  uint8_t uiVerTmp[256];
312
0
  McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
313
0
  McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
314
0
  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
315
0
}
316
static inline void McHorVer32_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
317
                                 int32_t iWidth,
318
0
                                 int32_t iHeight) {
319
0
  uint8_t uiVerTmp[256];
320
0
  uint8_t uiCtrTmp[256];
321
0
  McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
322
0
  McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
323
0
  PixelAvg_c (pDst, iDstStride, uiVerTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
324
0
}
325
static inline void McHorVer33_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
326
                                 int32_t iWidth,
327
0
                                 int32_t iHeight) {
328
0
  uint8_t uiHorTmp[256];
329
0
  uint8_t uiVerTmp[256];
330
0
  McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
331
0
  McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
332
0
  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
333
0
}
334
335
void McLuma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
336
               int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
337
//pSrc has been added the offset of mv
338
0
{
339
0
  static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
340
0
    {McCopy_c,      McHorVer01_c, McHorVer02_c, McHorVer03_c},
341
0
    {McHorVer10_c,  McHorVer11_c, McHorVer12_c, McHorVer13_c},
342
0
    {McHorVer20_c,  McHorVer21_c, McHorVer22_c, McHorVer23_c},
343
0
    {McHorVer30_c,  McHorVer31_c, McHorVer32_c, McHorVer33_c},
344
0
  };
345
346
0
  pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
347
0
}
348
349
static inline void McChromaWithFragMv_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
350
0
    int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
351
0
  int32_t i, j;
352
0
  int32_t iA, iB, iC, iD;
353
0
  const uint8_t* pSrcNext = pSrc + iSrcStride;
354
0
  const uint8_t* pABCD = g_kuiABCD[iMvY & 0x07][iMvX & 0x07];
355
0
  iA = pABCD[0];
356
0
  iB = pABCD[1];
357
0
  iC = pABCD[2];
358
0
  iD = pABCD[3];
359
0
  for (i = 0; i < iHeight; i++) {
360
0
    for (j = 0; j < iWidth; j++) {
361
0
      pDst[j] = (iA * pSrc[j] + iB * pSrc[j + 1] + iC * pSrcNext[j] + iD * pSrcNext[j + 1] + 32) >> 6;
362
0
    }
363
0
    pDst     += iDstStride;
364
0
    pSrc      = pSrcNext;
365
0
    pSrcNext += iSrcStride;
366
0
  }
367
0
}
368
369
void McChroma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
370
                 int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
371
//pSrc has been added the offset of mv
372
0
{
373
0
  const int32_t kiD8x = iMvX & 0x07;
374
0
  const int32_t kiD8y = iMvY & 0x07;
375
0
  if (0 == kiD8x && 0 == kiD8y)
376
0
    McCopy_c (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
377
0
  else
378
0
    McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
379
0
}
380
381
#if defined(X86_ASM)
382
//***************************************************************************//
383
//                     SSE2 implementation                       //
384
//***************************************************************************//
385
static inline void McHorVer22WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
386
    int32_t iHeight) {
387
  ENFORCE_STACK_ALIGN_2D (int16_t, iTap, 21, 8, 16)
388
  McHorVer22Width8HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)iTap, 16, iHeight + 5);
389
  McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)iTap, 16, pDst, iDstStride, 8, iHeight);
390
}
391
392
static inline void McHorVer02WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
393
    int32_t iHeight) {
394
  McHorVer02WidthEq8_sse2 (pSrc,     iSrcStride, pDst,     iDstStride, iHeight);
395
  McHorVer02WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
396
}
397
398
static inline void McHorVer22WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
399
    int32_t iHeight) {
400
  McHorVer22WidthEq8_sse2 (pSrc,     iSrcStride, pDst,     iDstStride, iHeight);
401
  McHorVer22WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
402
}
403
404
void McHorVer20Width5Or9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
405
        int32_t iWidth, int32_t iHeight) {
406
    if (iWidth == 17 || iWidth == 9)
407
        McHorVer20Width9Or17_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
408
    else //if (iWidth == 5)
409
        McHorVer20Width5_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
410
}
411
412
void McHorVer02Height5Or9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
413
        int32_t iWidth, int32_t iHeight) {
414
    if (iWidth == 16 || iWidth == 8)
415
        McHorVer02Height9Or17_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
416
    else //if (iWidth == 4)
417
        McHorVer02Height5_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
418
}
419
420
void McHorVer22Width5Or9Or17Height5Or9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
421
        int32_t iWidth, int32_t iHeight) {
422
    ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 22, 24, 16)
423
    if (iWidth == 17 || iWidth == 9){
424
        int32_t tmp1 = 2 * (iWidth - 8);
425
        McHorVer22HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)pTap, 48, iWidth, iHeight + 5);
426
        McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)pTap,  48, pDst, iDstStride, iWidth - 1, iHeight);
427
        McHorVer22Width8VerLastUnAlign_sse2 ((uint8_t*)pTap + tmp1,  48, pDst + iWidth - 8, iDstStride, 8, iHeight);
428
    }
429
    else{ //if(iWidth == 5)
430
        int32_t tmp1 = 2 * (iWidth - 4);
431
        McHorVer22Width5HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)pTap, 48, iWidth, iHeight + 5);
432
        McHorVer22Width4VerLastAlign_sse2 ((uint8_t*)pTap,  48, pDst, iDstStride, iWidth - 1, iHeight);
433
        McHorVer22Width4VerLastUnAlign_sse2 ((uint8_t*)pTap + tmp1,  48, pDst + iWidth - 4, iDstStride, 4, iHeight);
434
    }
435
436
}
437
438
static inline void McCopy_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
439
                                int32_t iWidth,
440
                                int32_t iHeight) {
441
  if (iWidth == 16)
442
    McCopyWidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
443
  else if (iWidth == 8)
444
    McCopyWidthEq8_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
445
  else if (iWidth == 4)
446
    McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
447
  else
448
    McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
449
}
450
451
static inline void McHorVer20_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
452
                                    int32_t iWidth, int32_t iHeight) {
453
  if (iWidth == 16)
454
    McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
455
  else if (iWidth == 8)
456
    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
457
  else
458
    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
459
}
460
461
static inline void McHorVer02_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
462
                                    int32_t iWidth, int32_t iHeight) {
463
  if (iWidth == 16)
464
    McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
465
  else if (iWidth == 8)
466
    McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
467
  else
468
    McHorVer02_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
469
}
470
471
static inline void McHorVer22_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
472
                                    int32_t iWidth, int32_t iHeight) {
473
  if (iWidth == 16)
474
    McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
475
  else if (iWidth == 8)
476
    McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
477
  else
478
    McHorVer22_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
479
}
480
481
static inline void McHorVer01_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
482
                                    int32_t iWidth, int32_t iHeight) {
483
  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
484
  if (iWidth == 16) {
485
    McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
486
    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
487
  } else if (iWidth == 8) {
488
    McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
489
    PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
490
  } else {
491
    McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight);
492
    PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
493
  }
494
}
495
static inline void McHorVer03_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
496
                                    int32_t iWidth, int32_t iHeight) {
497
  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
498
  if (iWidth == 16) {
499
    McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
500
    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
501
  } else if (iWidth == 8) {
502
    McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
503
    PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
504
  } else {
505
    McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight);
506
    PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
507
  }
508
}
509
static inline void McHorVer10_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
510
                                    int32_t iWidth, int32_t iHeight) {
511
  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
512
  if (iWidth == 16) {
513
    McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
514
    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
515
  } else if (iWidth == 8) {
516
    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
517
    PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
518
  } else {
519
    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pTmp, 16, iHeight);
520
    PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
521
  }
522
}
523
static inline void McHorVer11_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
524
                                    int32_t iWidth, int32_t iHeight) {
525
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
526
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
527
  if (iWidth == 16) {
528
    McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
529
    McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
530
    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
531
  } else if (iWidth == 8) {
532
    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
533
    McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
534
    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
535
  } else {
536
    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
537
    McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
538
    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
539
  }
540
}
541
static inline void McHorVer12_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
542
                                    int32_t iWidth, int32_t iHeight) {
543
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
544
  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
545
  if (iWidth == 16) {
546
    McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
547
    McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
548
    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
549
  } else if (iWidth == 8) {
550
    McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
551
    McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
552
    PixelAvgWidthEq8_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
553
  } else {
554
    McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
555
    McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
556
    PixelAvgWidthEq4_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
557
  }
558
}
559
static inline void McHorVer13_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
560
                                    int32_t iWidth, int32_t iHeight) {
561
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
562
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
563
  if (iWidth == 16) {
564
    McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
565
    McHorVer02WidthEq16_sse2 (pSrc,            iSrcStride, pVerTmp, 16, iHeight);
566
    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
567
  } else if (iWidth == 8) {
568
    McHorVer20WidthEq8_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
569
    McHorVer02WidthEq8_sse2 (pSrc,            iSrcStride, pVerTmp, 16, iHeight);
570
    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
571
  } else {
572
    McHorVer20WidthEq4_mmx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
573
    McHorVer02_c (pSrc,            iSrcStride, pVerTmp, 16, 4 , iHeight);
574
    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
575
  }
576
}
577
static inline void McHorVer21_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
578
                                    int32_t iWidth, int32_t iHeight) {
579
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
580
  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
581
  if (iWidth == 16) {
582
    McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
583
    McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
584
    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
585
  } else if (iWidth == 8) {
586
    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
587
    McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
588
    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
589
  } else {
590
    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
591
    McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
592
    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
593
  }
594
}
595
static inline void McHorVer23_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
596
                                    int32_t iWidth, int32_t iHeight) {
597
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
598
  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
599
  if (iWidth == 16) {
600
    McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
601
    McHorVer22WidthEq16_sse2 (pSrc,            iSrcStride, pCtrTmp, 16, iHeight);
602
    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
603
  } else if (iWidth == 8) {
604
    McHorVer20WidthEq8_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
605
    McHorVer22WidthEq8_sse2 (pSrc,            iSrcStride, pCtrTmp, 16, iHeight);
606
    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
607
  } else {
608
    McHorVer20WidthEq4_mmx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
609
    McHorVer22_c (pSrc,            iSrcStride, pCtrTmp, 16, 4, iHeight);
610
    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
611
  }
612
}
613
static inline void McHorVer30_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
614
                                    int32_t iWidth, int32_t iHeight) {
615
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
616
  if (iWidth == 16) {
617
    McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
618
    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
619
  } else if (iWidth == 8) {
620
    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
621
    PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
622
  } else {
623
    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
624
    PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
625
  }
626
}
627
static inline void McHorVer31_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
628
                                    int32_t iWidth, int32_t iHeight) {
629
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
630
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
631
  if (iWidth == 16) {
632
    McHorVer20WidthEq16_sse2 (pSrc,   iSrcStride, pHorTmp, 16, iHeight);
633
    McHorVer02WidthEq16_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
634
    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
635
  } else if (iWidth == 8) {
636
    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
637
    McHorVer02WidthEq8_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
638
    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
639
  } else {
640
    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
641
    McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
642
    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
643
  }
644
}
645
static inline void McHorVer32_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
646
                                    int32_t iWidth, int32_t iHeight) {
647
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
648
  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
649
  if (iWidth == 16) {
650
    McHorVer02WidthEq16_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
651
    McHorVer22WidthEq16_sse2 (pSrc,   iSrcStride, pCtrTmp, 16, iHeight);
652
    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
653
  } else if (iWidth == 8) {
654
    McHorVer02WidthEq8_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
655
    McHorVer22WidthEq8_sse2 (pSrc,   iSrcStride, pCtrTmp, 16, iHeight);
656
    PixelAvgWidthEq8_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
657
  } else {
658
    McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
659
    McHorVer22_c (pSrc,   iSrcStride, pCtrTmp, 16, 4, iHeight);
660
    PixelAvgWidthEq4_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
661
  }
662
}
663
static inline void McHorVer33_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
664
                                    int32_t iWidth, int32_t iHeight) {
665
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
666
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
667
  if (iWidth == 16) {
668
    McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
669
    McHorVer02WidthEq16_sse2 (pSrc + 1,          iSrcStride, pVerTmp, 16, iHeight);
670
    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
671
  } else if (iWidth == 8) {
672
    McHorVer20WidthEq8_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
673
    McHorVer02WidthEq8_sse2 (pSrc + 1,          iSrcStride, pVerTmp, 16, iHeight);
674
    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
675
  } else {
676
    McHorVer20WidthEq4_mmx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
677
    McHorVer02_c (pSrc + 1,          iSrcStride, pVerTmp, 16, 4, iHeight);
678
    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
679
  }
680
}
681
682
void McLuma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
683
                  int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
684
//pSrc has been added the offset of mv
685
{
686
  static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
687
    {McCopy_sse2,     McHorVer01_sse2, McHorVer02_sse2, McHorVer03_sse2},
688
    {McHorVer10_sse2, McHorVer11_sse2, McHorVer12_sse2, McHorVer13_sse2},
689
    {McHorVer20_sse2, McHorVer21_sse2, McHorVer22_sse2, McHorVer23_sse2},
690
    {McHorVer30_sse2, McHorVer31_sse2, McHorVer32_sse2, McHorVer33_sse2},
691
  };
692
693
  pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
694
}
695
696
void McChroma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
697
                    int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
698
  static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = {
699
    McChromaWidthEq4_mmx,
700
    McChromaWidthEq8_sse2
701
  };
702
  const int32_t kiD8x = iMvX & 0x07;
703
  const int32_t kiD8y = iMvY & 0x07;
704
  if (kiD8x == 0 && kiD8y == 0) {
705
    McCopy_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
706
    return;
707
  }
708
  if (iWidth != 2) {
709
    kpMcChromaWidthFuncs[iWidth >> 3] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight);
710
  } else
711
    McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
712
}
713
714
//***************************************************************************//
715
//                          SSSE3 implementation                             //
716
//***************************************************************************//
717
718
void PixelAvgWidth4Or8Or16_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
719
                                 const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
720
  if (iWidth < 8) {
721
    PixelAvgWidthEq4_mmx   (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
722
  } else if (iWidth == 8) {
723
    PixelAvgWidthEq8_mmx   (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
724
  } else {
725
    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
726
  }
727
}
728
729
void McCopy_sse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
730
                  int32_t iWidth, int32_t iHeight) {
731
  switch (iWidth) {
732
  case 16: return McCopyWidthEq16_sse3 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
733
  case 8:  return McCopyWidthEq8_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
734
  case 4:  return McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
735
  }
736
  return McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
737
}
738
739
void McHorVer22_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
740
                       int32_t iWidth, int32_t iHeight) {
741
  ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 16 + 5, 8, 16);
742
  if (iWidth < 8) {
743
    McHorVer20Width4U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);
744
    McHorVer02Width4S16ToU8_ssse3 (&pTmp[0][0], pDst, iDstStride, iHeight);
745
  } else if (iWidth == 8) {
746
    McHorVer20Width8U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5);
747
    McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iWidth, iHeight);
748
  } else {
749
    McHorVer20Width8U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5);
750
    McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, 8, iHeight);
751
    McHorVer20Width8U8ToS16_ssse3 (pSrc + 8, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5);
752
    McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst + 8, iDstStride, 8, iHeight);
753
  }
754
}
755
756
void McHorVer01_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
757
                       int32_t iWidth, int32_t iHeight) {
758
  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
759
  McHorVer02_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
760
  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride,
761
                              &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
762
}
763
764
void McHorVer03_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
765
                       int32_t iWidth, int32_t iHeight) {
766
  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
767
  McHorVer02_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
768
  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride,
769
                              &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
770
}
771
772
void McHorVer10_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
773
                       int32_t iWidth, int32_t iHeight) {
774
  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
775
  McHorVer20_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
776
  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride,
777
                              &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
778
}
779
780
void McHorVer11_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
781
                       int32_t iWidth, int32_t iHeight) {
782
  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
783
  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
784
  McHorVer20_ssse3 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
785
  McHorVer02_ssse3 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
786
  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
787
                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
788
}
789
790
void McHorVer12_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
791
                       int32_t iWidth, int32_t iHeight) {
792
  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
793
  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
794
  McHorVer02_ssse3 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
795
  McHorVer22_ssse3 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
796
  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pVerTmp[0][0], sizeof *pVerTmp,
797
                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
798
}
799
800
void McHorVer13_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
801
                       int32_t iWidth, int32_t iHeight) {
802
  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
803
  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
804
  McHorVer20_ssse3 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
805
  McHorVer02_ssse3 (pSrc,              iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
806
  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
807
                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
808
}
809
810
void McHorVer21_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
811
                       int32_t iWidth, int32_t iHeight) {
812
  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
813
  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
814
  McHorVer20_ssse3 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
815
  McHorVer22_ssse3 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
816
  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
817
                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
818
}
819
820
void McHorVer23_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
821
                       int32_t iWidth, int32_t iHeight) {
822
  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
823
  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
824
  McHorVer20_ssse3 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
825
  McHorVer22_ssse3 (pSrc,              iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
826
  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
827
                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
828
}
829
830
void McHorVer30_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
831
                       int32_t iWidth, int32_t iHeight) {
832
  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
833
  McHorVer20_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
834
  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + 1, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
835
}
836
837
void McHorVer31_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
838
                       int32_t iWidth, int32_t iHeight) {
839
  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
840
  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
841
  McHorVer20_ssse3 (pSrc,     iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
842
  McHorVer02_ssse3 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
843
  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
844
                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
845
}
846
847
void McHorVer32_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
848
                       int32_t iWidth, int32_t iHeight) {
849
  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
850
  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
851
  McHorVer02_ssse3 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
852
  McHorVer22_ssse3 (pSrc,     iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
853
  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pVerTmp[0][0], sizeof *pVerTmp,
854
                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
855
}
856
857
void McHorVer33_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
858
                       int32_t iWidth, int32_t iHeight) {
859
  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
860
  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
861
  McHorVer20_ssse3 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
862
  McHorVer02_ssse3 (pSrc + 1,          iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
863
  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
864
                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
865
}
866
867
void McHorVer22Width5Or9Or17_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
868
                                    int32_t iWidth, int32_t iHeight) {
869
  ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 17 + 5, WELS_ALIGN(17, 16 / sizeof (int16_t)), 16)
870
  if (iWidth > 5) {
871
    McHorVer20Width9Or17U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight + 5);
872
    McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iWidth, iHeight);
873
  } else {
874
    McHorVer20Width8U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5);
875
    McHorVer02Width5S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iHeight);
876
  }
877
}
878
879
void McLuma_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
880
                   int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
881
  static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = {
882
    {McCopy_sse3,      McHorVer01_ssse3, McHorVer02_ssse3, McHorVer03_ssse3},
883
    {McHorVer10_ssse3, McHorVer11_ssse3, McHorVer12_ssse3, McHorVer13_ssse3},
884
    {McHorVer20_ssse3, McHorVer21_ssse3, McHorVer22_ssse3, McHorVer23_ssse3},
885
    {McHorVer30_ssse3, McHorVer31_ssse3, McHorVer32_ssse3, McHorVer33_ssse3},
886
  };
887
888
  pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
889
}
890
891
void McChroma_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
892
                     int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
893
  static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = {
894
    McChromaWidthEq4_mmx,
895
    McChromaWidthEq8_ssse3
896
  };
897
  const int32_t kiD8x = iMvX & 0x07;
898
  const int32_t kiD8y = iMvY & 0x07;
899
  if (kiD8x == 0 && kiD8y == 0) {
900
    McCopy_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
901
    return;
902
  }
903
  if (iWidth != 2) {
904
    kpMcChromaWidthFuncs[iWidth >> 3] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight);
905
  } else
906
    McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
907
}
908
909
//***************************************************************************//
910
//                          AVX2 implementation                              //
911
//***************************************************************************//
912
913
#ifdef HAVE_AVX2
914
915
void McHorVer22_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
916
                      int32_t iWidth, int32_t iHeight) {
917
  ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 16 + 5, 16, 32);
918
  if (iWidth < 8) {
919
    McHorVer20Width4U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);
920
    McHorVer02Width4S16ToU8_avx2 (&pTmp[0][0], pDst, iDstStride, iHeight);
921
  } else if (iWidth == 8) {
922
    McHorVer20Width8U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);
923
    McHorVer02Width8S16ToU8_avx2 (&pTmp[0][0], pDst, iDstStride, iHeight);
924
  } else {
925
    McHorVer20Width16U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);
926
    McHorVer02Width16Or17S16ToU8_avx2 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iWidth, iHeight);
927
  }
928
}
929
930
void McHorVer01_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
931
                      int32_t iWidth, int32_t iHeight) {
932
  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
933
  McHorVer02_avx2 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
934
  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride,
935
                              &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
936
}
937
938
void McHorVer03_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
939
                      int32_t iWidth, int32_t iHeight) {
940
  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
941
  McHorVer02_avx2 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
942
  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride,
943
                              &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
944
}
945
946
void McHorVer10_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
947
                      int32_t iWidth, int32_t iHeight) {
948
  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
949
  McHorVer20_avx2 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
950
  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride,
951
                              &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
952
}
953
954
void McHorVer11_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
955
                      int32_t iWidth, int32_t iHeight) {
956
  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
957
  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
958
  McHorVer20_avx2 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
959
  McHorVer02_avx2 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
960
  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
961
                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
962
}
963
964
void McHorVer12_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
965
                      int32_t iWidth, int32_t iHeight) {
966
  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
967
  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
968
  McHorVer02_avx2 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
969
  McHorVer22_avx2 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
970
  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pVerTmp[0][0], sizeof *pVerTmp,
971
                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
972
}
973
974
void McHorVer13_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
975
                      int32_t iWidth, int32_t iHeight) {
976
  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
977
  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
978
  McHorVer20_avx2 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
979
  McHorVer02_avx2 (pSrc,              iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
980
  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
981
                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
982
}
983
984
void McHorVer21_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
985
                      int32_t iWidth, int32_t iHeight) {
986
  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
987
  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
988
  McHorVer20_avx2 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
989
  McHorVer22_avx2 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
990
  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
991
                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
992
}
993
994
void McHorVer23_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
995
                      int32_t iWidth, int32_t iHeight) {
996
  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
997
  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
998
  McHorVer20_avx2 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
999
  McHorVer22_avx2 (pSrc,              iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
1000
  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
1001
                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
1002
}
1003
1004
void McHorVer30_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1005
                      int32_t iWidth, int32_t iHeight) {
1006
  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
1007
  McHorVer20_avx2 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
1008
  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + 1, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
1009
}
1010
1011
void McHorVer31_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1012
                      int32_t iWidth, int32_t iHeight) {
1013
  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
1014
  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
1015
  McHorVer20_avx2 (pSrc,     iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
1016
  McHorVer02_avx2 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
1017
  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
1018
                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
1019
}
1020
1021
void McHorVer32_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1022
                      int32_t iWidth, int32_t iHeight) {
1023
  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
1024
  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
1025
  McHorVer02_avx2 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
1026
  McHorVer22_avx2 (pSrc,     iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
1027
  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pVerTmp[0][0], sizeof *pVerTmp,
1028
                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
1029
}
1030
1031
void McHorVer33_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1032
                      int32_t iWidth, int32_t iHeight) {
1033
  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
1034
  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
1035
  McHorVer20_avx2 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
1036
  McHorVer02_avx2 (pSrc + 1,          iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
1037
  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
1038
                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
1039
}
1040
1041
void McHorVer22Width5Or9Or17_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1042
                                   int32_t iWidth, int32_t iHeight) {
1043
  if (iWidth < 9) {
1044
    ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 9 + 5, WELS_ALIGN(5, 16 / sizeof (int16_t)), 16)
1045
    McHorVer20Width8U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);
1046
    McHorVer02Width5S16ToU8_avx2 (&pTmp[0][0], pDst, iDstStride, iHeight);
1047
  } else if (iWidth == 9) {
1048
    ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 17 + 5, 16, 32)
1049
    McHorVer20Width16U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);
1050
    McHorVer02Width9S16ToU8_avx2 (&pTmp[0][0], pDst, iDstStride, iHeight);
1051
  } else {
1052
    ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 17 + 5, WELS_ALIGN(17, 32 / sizeof (int16_t)), 32)
1053
    McHorVer20Width17U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);
1054
    McHorVer02Width16Or17S16ToU8_avx2 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iWidth, iHeight);
1055
  }
1056
}
1057
1058
void McLuma_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1059
                  int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
1060
  static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = {
1061
    {McCopy_sse3,     McHorVer01_avx2, McHorVer02_avx2, McHorVer03_avx2},
1062
    {McHorVer10_avx2, McHorVer11_avx2, McHorVer12_avx2, McHorVer13_avx2},
1063
    {McHorVer20_avx2, McHorVer21_avx2, McHorVer22_avx2, McHorVer23_avx2},
1064
    {McHorVer30_avx2, McHorVer31_avx2, McHorVer32_avx2, McHorVer33_avx2},
1065
  };
1066
1067
  pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
1068
}
1069
1070
#endif //HAVE_AVX2
1071
1072
void PixelAvg_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
1073
                    const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
1074
  static const PWelsSampleWidthAveragingFunc kpfFuncs[2] = {
1075
    PixelAvgWidthEq8_mmx,
1076
    PixelAvgWidthEq16_sse2
1077
  };
1078
  kpfFuncs[iWidth >> 4] (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
1079
}
1080
1081
#endif //X86_ASM
1082
//***************************************************************************//
1083
//                       NEON implementation                      //
1084
//***************************************************************************//
1085
#if defined(HAVE_NEON)
1086
void McHorVer20Width5Or9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1087
                                int32_t iWidth, int32_t iHeight) {
1088
  if (iWidth == 17)
1089
    McHorVer20Width17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1090
  else if (iWidth == 9)
1091
    McHorVer20Width9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1092
  else //if (iWidth == 5)
1093
    McHorVer20Width5_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1094
}
1095
void McHorVer02Height5Or9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1096
                                 int32_t iWidth, int32_t iHeight) {
1097
  if (iWidth == 16)
1098
    McHorVer02Height17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1099
  else if (iWidth == 8)
1100
    McHorVer02Height9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1101
  else //if (iWidth == 4)
1102
    McHorVer02Height5_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1103
}
1104
void McHorVer22Width5Or9Or17Height5Or9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1105
    int32_t iWidth, int32_t iHeight) {
1106
  if (iWidth == 17)
1107
    McHorVer22Width17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1108
  else if (iWidth == 9)
1109
    McHorVer22Width9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1110
  else //if (iWidth == 5)
1111
    McHorVer22Width5_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1112
}
1113
void McCopy_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1114
                  int32_t iWidth, int32_t iHeight) {
1115
  if (16 == iWidth)
1116
    McCopyWidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1117
  else if (8 == iWidth)
1118
    McCopyWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1119
  else if (4 == iWidth)
1120
    McCopyWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1121
  else
1122
    McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1123
}
1124
void McHorVer20_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1125
                      int32_t iWidth, int32_t iHeight) {
1126
  if (iWidth == 16)
1127
    McHorVer20WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1128
  else if (iWidth == 8)
1129
    McHorVer20WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1130
  else if (iWidth == 4)
1131
    McHorVer20WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1132
}
1133
void McHorVer02_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1134
                      int32_t iWidth, int32_t iHeight) {
1135
  if (iWidth == 16)
1136
    McHorVer02WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1137
  else if (iWidth == 8)
1138
    McHorVer02WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1139
  else if (iWidth == 4)
1140
    McHorVer02WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1141
}
1142
void McHorVer22_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1143
                      int32_t iWidth, int32_t iHeight) {
1144
  if (iWidth == 16)
1145
    McHorVer22WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1146
  else if (iWidth == 8)
1147
    McHorVer22WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1148
  else if (iWidth == 4)
1149
    McHorVer22WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1150
}
1151
1152
void McHorVer01_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1153
                      int32_t iWidth, int32_t iHeight) {
1154
  if (iWidth == 16)
1155
    McHorVer01WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1156
  else if (iWidth == 8)
1157
    McHorVer01WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1158
  else if (iWidth == 4)
1159
    McHorVer01WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1160
}
1161
void McHorVer03_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1162
                      int32_t iWidth, int32_t iHeight) {
1163
  if (iWidth == 16)
1164
    McHorVer03WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1165
  else if (iWidth == 8)
1166
    McHorVer03WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1167
  else if (iWidth == 4)
1168
    McHorVer03WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1169
}
1170
void McHorVer10_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1171
                      int32_t iWidth, int32_t iHeight) {
1172
  if (iWidth == 16)
1173
    McHorVer10WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1174
  else if (iWidth == 8)
1175
    McHorVer10WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1176
  else if (iWidth == 4)
1177
    McHorVer10WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1178
}
1179
void McHorVer11_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1180
                      int32_t iWidth, int32_t iHeight) {
1181
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1182
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1183
  if (iWidth == 16) {
1184
    McHorVer20WidthEq16_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1185
    McHorVer02WidthEq16_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1186
    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1187
  } else if (iWidth == 8) {
1188
    McHorVer20WidthEq8_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1189
    McHorVer02WidthEq8_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1190
    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1191
  } else if (iWidth == 4) {
1192
    McHorVer20WidthEq4_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1193
    McHorVer02WidthEq4_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1194
    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1195
  }
1196
}
1197
void McHorVer12_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1198
                      int32_t iWidth, int32_t iHeight) {
1199
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1200
  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
1201
  if (iWidth == 16) {
1202
    McHorVer02WidthEq16_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1203
    McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1204
    PixelAvgWidthEq16_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
1205
  } else if (iWidth == 8) {
1206
    McHorVer02WidthEq8_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1207
    McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1208
    PixelAvgWidthEq8_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
1209
  } else if (iWidth == 4) {
1210
    McHorVer02WidthEq4_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1211
    McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1212
    PixelAvgWidthEq4_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
1213
  }
1214
}
1215
void McHorVer13_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1216
                      int32_t iWidth, int32_t iHeight) {
1217
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1218
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1219
  if (iWidth == 16) {
1220
    McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1221
    McHorVer02WidthEq16_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1222
    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1223
  } else if (iWidth == 8) {
1224
    McHorVer20WidthEq8_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1225
    McHorVer02WidthEq8_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1226
    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1227
  } else if (iWidth == 4) {
1228
    McHorVer20WidthEq4_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1229
    McHorVer02WidthEq4_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1230
    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1231
  }
1232
}
1233
void McHorVer21_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1234
                      int32_t iWidth, int32_t iHeight) {
1235
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1236
  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
1237
  if (iWidth == 16) {
1238
    McHorVer20WidthEq16_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1239
    McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1240
    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
1241
  } else if (iWidth == 8) {
1242
    McHorVer20WidthEq8_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1243
    McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1244
    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
1245
  } else if (iWidth == 4) {
1246
    McHorVer20WidthEq4_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1247
    McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1248
    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
1249
  }
1250
}
1251
void McHorVer23_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1252
                      int32_t iWidth, int32_t iHeight) {
1253
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1254
  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
1255
  if (iWidth == 16) {
1256
    McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1257
    McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1258
    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
1259
  } else if (iWidth == 8) {
1260
    McHorVer20WidthEq8_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1261
    McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1262
    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
1263
  } else if (iWidth == 4) {
1264
    McHorVer20WidthEq4_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1265
    McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1266
    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
1267
  }
1268
}
1269
void McHorVer30_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1270
                      int32_t iWidth, int32_t iHeight) {
1271
  if (iWidth == 16)
1272
    McHorVer30WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1273
  else if (iWidth == 8)
1274
    McHorVer30WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1275
  else if (iWidth == 4)
1276
    McHorVer30WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1277
}
1278
void McHorVer31_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1279
                      int32_t iWidth, int32_t iHeight) {
1280
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1281
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1282
  if (iWidth == 16) {
1283
    McHorVer20WidthEq16_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1284
    McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1285
    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1286
  } else if (iWidth == 8) {
1287
    McHorVer20WidthEq8_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1288
    McHorVer02WidthEq8_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1289
    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1290
  } else if (iWidth == 4) {
1291
    McHorVer20WidthEq4_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1292
    McHorVer02WidthEq4_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1293
    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1294
  }
1295
}
1296
void McHorVer32_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1297
                      int32_t iWidth, int32_t iHeight) {
1298
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1299
  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
1300
  if (iWidth == 16) {
1301
    McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1302
    McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1303
    PixelAvgWidthEq16_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
1304
  } else if (iWidth == 8) {
1305
    McHorVer02WidthEq8_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1306
    McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1307
    PixelAvgWidthEq8_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
1308
  } else if (iWidth == 4) {
1309
    McHorVer02WidthEq4_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1310
    McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1311
    PixelAvgWidthEq4_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
1312
  }
1313
}
1314
void McHorVer33_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1315
                      int32_t iWidth, int32_t iHeight) {
1316
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1317
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1318
  if (iWidth == 16) {
1319
    McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1320
    McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1321
    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1322
  } else if (iWidth == 8) {
1323
    McHorVer20WidthEq8_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1324
    McHorVer02WidthEq8_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1325
    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1326
  } else if (iWidth == 4) {
1327
    McHorVer20WidthEq4_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1328
    McHorVer02WidthEq4_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1329
    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1330
  }
1331
}
1332
1333
void McLuma_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1334
                  int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
1335
  static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
1336
    {McCopy_neon,  McHorVer01_neon, McHorVer02_neon,    McHorVer03_neon},
1337
    {McHorVer10_neon, McHorVer11_neon, McHorVer12_neon, McHorVer13_neon},
1338
    {McHorVer20_neon,    McHorVer21_neon, McHorVer22_neon,    McHorVer23_neon},
1339
    {McHorVer30_neon, McHorVer31_neon, McHorVer32_neon, McHorVer33_neon},
1340
  };
1341
  // pSrc += (iMvY >> 2) * iSrcStride + (iMvX >> 2);
1342
  pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
1343
}
1344
void McChroma_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1345
                    int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
1346
  if (0 == iMvX && 0 == iMvY) {
1347
    if (8 == iWidth)
1348
      McCopyWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1349
    else if (iWidth == 4)
1350
      McCopyWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1351
    else //here iWidth == 2
1352
      McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1353
  } else {
1354
    const int32_t kiD8x = iMvX & 0x07;
1355
    const int32_t kiD8y = iMvY & 0x07;
1356
    if (8 == iWidth)
1357
      McChromaWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
1358
    else if (4 == iWidth)
1359
      McChromaWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
1360
    else //here iWidth == 2
1361
      McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
1362
  }
1363
}
1364
void PixelAvg_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
1365
                    const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
1366
  static const PWelsSampleWidthAveragingFunc kpfFuncs[2] = {
1367
    PixStrideAvgWidthEq8_neon,
1368
    PixStrideAvgWidthEq16_neon
1369
  };
1370
  kpfFuncs[iWidth >> 4] (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
1371
}
1372
#endif
1373
#if defined(HAVE_NEON_AARCH64)
1374
void McHorVer20Width5Or9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1375
                                        int32_t iWidth, int32_t iHeight) {
1376
  if (iWidth == 17)
1377
    McHorVer20Width17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1378
  else if (iWidth == 9)
1379
    McHorVer20Width9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1380
  else //if (iWidth == 5)
1381
    McHorVer20Width5_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1382
}
1383
void McHorVer02Height5Or9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1384
    int32_t iWidth, int32_t iHeight) {
1385
  if (iWidth == 16)
1386
    McHorVer02Height17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1387
  else if (iWidth == 8)
1388
    McHorVer02Height9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1389
  else //if (iWidth == 4)
1390
    McHorVer02Height5_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1391
}
1392
void McHorVer22Width5Or9Or17Height5Or9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
1393
    int32_t iDstStride,
1394
    int32_t iWidth, int32_t iHeight) {
1395
  if (iWidth == 17)
1396
    McHorVer22Width17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1397
  else if (iWidth == 9)
1398
    McHorVer22Width9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1399
  else //if (iWidth == 5)
1400
    McHorVer22Width5_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1401
}
1402
void McCopy_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1403
                          int32_t iWidth, int32_t iHeight) {
1404
  if (16 == iWidth)
1405
    McCopyWidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1406
  else if (8 == iWidth)
1407
    McCopyWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1408
  else if (4 == iWidth)
1409
    McCopyWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1410
  else
1411
    McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1412
}
1413
void McHorVer20_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1414
                              int32_t iWidth, int32_t iHeight) {
1415
  if (iWidth == 16)
1416
    McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1417
  else if (iWidth == 8)
1418
    McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1419
  else if (iWidth == 4)
1420
    McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1421
}
1422
void McHorVer02_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1423
                              int32_t iWidth, int32_t iHeight) {
1424
  if (iWidth == 16)
1425
    McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1426
  else if (iWidth == 8)
1427
    McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1428
  else if (iWidth == 4)
1429
    McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1430
}
1431
void McHorVer22_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1432
                              int32_t iWidth, int32_t iHeight) {
1433
  if (iWidth == 16)
1434
    McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1435
  else if (iWidth == 8)
1436
    McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1437
  else if (iWidth == 4)
1438
    McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1439
}
1440
1441
void McHorVer01_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1442
                              int32_t iWidth, int32_t iHeight) {
1443
  if (iWidth == 16)
1444
    McHorVer01WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1445
  else if (iWidth == 8)
1446
    McHorVer01WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1447
  else if (iWidth == 4)
1448
    McHorVer01WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1449
}
1450
void McHorVer03_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1451
                              int32_t iWidth, int32_t iHeight) {
1452
  if (iWidth == 16)
1453
    McHorVer03WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1454
  else if (iWidth == 8)
1455
    McHorVer03WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1456
  else if (iWidth == 4)
1457
    McHorVer03WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1458
}
1459
void McHorVer10_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1460
                              int32_t iWidth, int32_t iHeight) {
1461
  if (iWidth == 16)
1462
    McHorVer10WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1463
  else if (iWidth == 8)
1464
    McHorVer10WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1465
  else if (iWidth == 4)
1466
    McHorVer10WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1467
}
1468
void McHorVer11_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1469
                              int32_t iWidth, int32_t iHeight) {
1470
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1471
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1472
  if (iWidth == 16) {
1473
    McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1474
    McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1475
    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1476
  } else if (iWidth == 8) {
1477
    McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1478
    McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1479
    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1480
  } else if (iWidth == 4) {
1481
    McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1482
    McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1483
    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1484
  }
1485
}
1486
void McHorVer12_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1487
                              int32_t iWidth, int32_t iHeight) {
1488
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1489
  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
1490
  if (iWidth == 16) {
1491
    McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1492
    McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1493
    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
1494
  } else if (iWidth == 8) {
1495
    McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1496
    McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1497
    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
1498
  } else if (iWidth == 4) {
1499
    McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1500
    McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1501
    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
1502
  }
1503
}
1504
void McHorVer13_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1505
                              int32_t iWidth, int32_t iHeight) {
1506
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1507
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1508
  if (iWidth == 16) {
1509
    McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1510
    McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1511
    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1512
  } else if (iWidth == 8) {
1513
    McHorVer20WidthEq8_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1514
    McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1515
    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1516
  } else if (iWidth == 4) {
1517
    McHorVer20WidthEq4_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1518
    McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1519
    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1520
  }
1521
}
1522
void McHorVer21_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1523
                              int32_t iWidth, int32_t iHeight) {
1524
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1525
  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
1526
  if (iWidth == 16) {
1527
    McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1528
    McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1529
    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
1530
  } else if (iWidth == 8) {
1531
    McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1532
    McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1533
    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
1534
  } else if (iWidth == 4) {
1535
    McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1536
    McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1537
    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
1538
  }
1539
}
1540
void McHorVer23_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1541
                              int32_t iWidth, int32_t iHeight) {
1542
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1543
  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
1544
  if (iWidth == 16) {
1545
    McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1546
    McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1547
    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
1548
  } else if (iWidth == 8) {
1549
    McHorVer20WidthEq8_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1550
    McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1551
    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
1552
  } else if (iWidth == 4) {
1553
    McHorVer20WidthEq4_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1554
    McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1555
    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
1556
  }
1557
}
1558
void McHorVer30_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1559
                              int32_t iWidth, int32_t iHeight) {
1560
  if (iWidth == 16)
1561
    McHorVer30WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1562
  else if (iWidth == 8)
1563
    McHorVer30WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1564
  else if (iWidth == 4)
1565
    McHorVer30WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1566
}
1567
void McHorVer31_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1568
                              int32_t iWidth, int32_t iHeight) {
1569
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1570
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1571
  if (iWidth == 16) {
1572
    McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1573
    McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1574
    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1575
  } else if (iWidth == 8) {
1576
    McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1577
    McHorVer02WidthEq8_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1578
    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1579
  } else if (iWidth == 4) {
1580
    McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1581
    McHorVer02WidthEq4_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1582
    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1583
  }
1584
}
1585
void McHorVer32_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1586
                              int32_t iWidth, int32_t iHeight) {
1587
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1588
  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
1589
  if (iWidth == 16) {
1590
    McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1591
    McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1592
    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
1593
  } else if (iWidth == 8) {
1594
    McHorVer02WidthEq8_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1595
    McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1596
    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
1597
  } else if (iWidth == 4) {
1598
    McHorVer02WidthEq4_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1599
    McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1600
    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
1601
  }
1602
}
1603
void McHorVer33_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1604
                              int32_t iWidth, int32_t iHeight) {
1605
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1606
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1607
  if (iWidth == 16) {
1608
    McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1609
    McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1610
    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1611
  } else if (iWidth == 8) {
1612
    McHorVer20WidthEq8_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1613
    McHorVer02WidthEq8_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1614
    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1615
  } else if (iWidth == 4) {
1616
    McHorVer20WidthEq4_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1617
    McHorVer02WidthEq4_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1618
    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1619
  }
1620
}
1621
1622
void McLuma_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1623
                          int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
1624
  static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
1625
    {McCopy_AArch64_neon,  McHorVer01_AArch64_neon, McHorVer02_AArch64_neon,    McHorVer03_AArch64_neon},
1626
    {McHorVer10_AArch64_neon, McHorVer11_AArch64_neon, McHorVer12_AArch64_neon, McHorVer13_AArch64_neon},
1627
    {McHorVer20_AArch64_neon,    McHorVer21_AArch64_neon, McHorVer22_AArch64_neon,    McHorVer23_AArch64_neon},
1628
    {McHorVer30_AArch64_neon, McHorVer31_AArch64_neon, McHorVer32_AArch64_neon, McHorVer33_AArch64_neon},
1629
  };
1630
  // pSrc += (iMvY >> 2) * iSrcStride + (iMvX >> 2);
1631
  pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
1632
}
1633
void McChroma_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1634
                            int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
1635
  if (0 == iMvX && 0 == iMvY) {
1636
    if (8 == iWidth)
1637
      McCopyWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1638
    else if (iWidth == 4)
1639
      McCopyWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1640
    else //here iWidth == 2
1641
      McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1642
  } else {
1643
    const int32_t kiD8x = iMvX & 0x07;
1644
    const int32_t kiD8y = iMvY & 0x07;
1645
    if (8 == iWidth)
1646
      McChromaWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
1647
    else if (4 == iWidth)
1648
      McChromaWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
1649
    else //here iWidth == 2
1650
      McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
1651
  }
1652
}
1653
void PixelAvg_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
1654
                            const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
1655
  static const PWelsSampleWidthAveragingFunc kpfFuncs[2] = {
1656
    PixStrideAvgWidthEq8_AArch64_neon,
1657
    PixStrideAvgWidthEq16_AArch64_neon
1658
  };
1659
  kpfFuncs[iWidth >> 4] (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
1660
}
1661
#endif
1662
1663
#if defined(HAVE_MMI)
1664
// Inline-assembly text fragment (string literals meant to be pasted into an
// asm statement). It emits four instructions, in this order:
//   gsldlc1   f0, 0x7(r0)
//   gsldrc1   f0, 0x0(r0)
//   punpckhbh f2, f0, f4
//   punpcklbh f0, f0, f4
// f0, f2, f4 and r0 are stringified operand names supplied by the caller.
// punpckhbh must stay ahead of punpcklbh: the latter overwrites f0, which the
// former still reads as a source.
// NOTE(review): presumably this is an unaligned 8-byte load from r0 followed
// by widening the bytes to halfwords with f4 holding zero -- confirm against
// the MMI instruction reference and the call sites.
#define MMI_LOAD_8P(f0, f2, f4, r0) \
1665
  "gsldlc1    "#f0", 0x7("#r0")               \n\t" \
1666
  "gsldrc1    "#f0", 0x0("#r0")               \n\t" \
1667
  "punpckhbh  "#f2", "#f0", "#f4"             \n\t" \
1668
  "punpcklbh  "#f0", "#f0", "#f4"             \n\t"
1669
1670
// FILTER_HV_W4: inline-assembly fragment applying a 6-tap filter to six inputs held as
// 16-bit lanes and storing 4 result bytes.
//   Inputs (each a low/high register pair): A=(f0,f2) B=(f4,f6) C=(f8,f10) D=(f12,f14)
//   E=(f16,f18) F=(f20,f22).
//   Temporaries: (f24,f26) and (f28,f30).
//   r0 - destination address; r1 - scratch general register; r2 - general register used to
//        park f8 while f8 serves as the constant / shift-count register.
// Computation per lane, as written below:
//   (A + F + 16 + 20*(C + D) - 5*(B + E)) >> 5   (arithmetic shift),
// built as t = 4*(C+D) - (B+E); sum = A + F + 16 + t + 4*t.
// The two halves are packed to bytes with packushb and the gsswlc1/gsswrc1 pair writes one
// 32-bit word at r0 (offsets 0x3 / 0x0, i.e. an unaligned 4-byte store).
// On exit: f0/f2, f24/f26 and f30 are clobbered, f28 is zeroed, r1 is clobbered and f8 is
// restored from r2. B..F inputs other than f8 are not written.
#define FILTER_HV_W4(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, \
1671
                     f20, f22, f24, f26, f28, f30, r0, r1, r2) \
1672
  "paddh      "#f0", "#f0", "#f20"            \n\t" \
1673
  "paddh      "#f2", "#f2", "#f22"            \n\t" \
1674
  "mov.d      "#f28", "#f8"                   \n\t" \
1675
  "mov.d      "#f30", "#f10"                  \n\t" \
1676
  "mov.d      "#f24", "#f4"                   \n\t" \
1677
  "mov.d      "#f26", "#f6"                   \n\t" \
1678
  "dmfc1      "#r2", "#f8"                    \n\t" \
1679
  "dli        "#r1", 0x0010001000100010       \n\t" \
1680
  "dmtc1      "#r1", "#f8"                    \n\t" \
1681
  "paddh      "#f0", "#f0", "#f8"             \n\t" \
1682
  "paddh      "#f2", "#f2", "#f8"             \n\t" \
1683
  "paddh      "#f28", "#f28", "#f12"          \n\t" \
1684
  "paddh      "#f30", "#f30", "#f14"          \n\t" \
1685
  "paddh      "#f24", "#f24", "#f16"          \n\t" \
1686
  "paddh      "#f26", "#f26", "#f18"          \n\t" \
1687
  "dli        "#r1", 0x2                      \n\t" \
1688
  "dmtc1      "#r1", "#f8"                    \n\t" \
1689
  "psllh      "#f28", "#f28", "#f8"           \n\t" \
1690
  "psllh      "#f30", "#f30", "#f8"           \n\t" \
1691
  "psubh      "#f28", "#f28", "#f24"          \n\t" \
1692
  "psubh      "#f30", "#f30", "#f26"          \n\t" \
1693
  "paddh      "#f0", "#f0", "#f28"            \n\t" \
1694
  "paddh      "#f2", "#f2", "#f30"            \n\t" \
1695
  "psllh      "#f28", "#f28", "#f8"           \n\t" \
1696
  "psllh      "#f30", "#f30", "#f8"           \n\t" \
1697
  "paddh      "#f0", "#f0", "#f28"            \n\t" \
1698
  "paddh      "#f2", "#f2", "#f30"            \n\t" \
1699
  "dli        "#r1", 0x5                      \n\t" \
1700
  "dmtc1      "#r1", "#f8"                    \n\t" \
1701
  "psrah      "#f0", "#f0", "#f8"             \n\t" \
1702
  "psrah      "#f2", "#f2", "#f8"             \n\t" \
1703
  "xor        "#f28", "#f28", "#f28"          \n\t" \
1704
  "packushb   "#f0", "#f0", "#f2"             \n\t" \
1705
  "gsswlc1    "#f0", 0x3("#r0")               \n\t" \
1706
  "gsswrc1    "#f0", 0x0("#r0")               \n\t" \
1707
  "dmtc1      "#r2", "#f8"                    \n\t"
1708
1709
// FILTER_HV_W8: same instruction sequence as FILTER_HV_W4 except for the final store, which
// here writes all 8 packed bytes.
//   Inputs (each a low/high register pair of 16-bit lanes): A=(f0,f2) B=(f4,f6) C=(f8,f10)
//   D=(f12,f14) E=(f16,f18) F=(f20,f22).
//   Temporaries: (f24,f26) and (f28,f30).
//   r0 - destination address; r1 - scratch general register; r2 - general register used to
//        park f8 while f8 serves as the constant / shift-count register.
// Computation per lane, as written below:
//   (A + F + 16 + 20*(C + D) - 5*(B + E)) >> 5   (arithmetic shift).
// The result is packed to bytes with packushb and the gssdlc1/gssdrc1 pair writes one
// 64-bit doubleword at r0 (offsets 0x7 / 0x0, i.e. an unaligned 8-byte store).
// On exit: f0/f2, f24/f26 and f30 are clobbered, f28 is zeroed, r1 is clobbered and f8 is
// restored from r2.
#define FILTER_HV_W8(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, \
1710
                     f20, f22, f24, f26, f28, f30, r0, r1, r2) \
1711
  "paddh      "#f0", "#f0", "#f20"            \n\t" \
1712
  "paddh      "#f2", "#f2", "#f22"            \n\t" \
1713
  "mov.d      "#f28", "#f8"                   \n\t" \
1714
  "mov.d      "#f30", "#f10"                  \n\t" \
1715
  "mov.d      "#f24", "#f4"                   \n\t" \
1716
  "mov.d      "#f26", "#f6"                   \n\t" \
1717
  "dmfc1      "#r2", "#f8"                    \n\t" \
1718
  "dli        "#r1", 0x0010001000100010       \n\t" \
1719
  "dmtc1      "#r1", "#f8"                    \n\t" \
1720
  "paddh      "#f0", "#f0", "#f8"             \n\t" \
1721
  "paddh      "#f2", "#f2", "#f8"             \n\t" \
1722
  "paddh      "#f28", "#f28", "#f12"          \n\t" \
1723
  "paddh      "#f30", "#f30", "#f14"          \n\t" \
1724
  "paddh      "#f24", "#f24", "#f16"          \n\t" \
1725
  "paddh      "#f26", "#f26", "#f18"          \n\t" \
1726
  "dli        "#r1", 0x2                      \n\t" \
1727
  "dmtc1      "#r1", "#f8"                    \n\t" \
1728
  "psllh      "#f28", "#f28", "#f8"           \n\t" \
1729
  "psllh      "#f30", "#f30", "#f8"           \n\t" \
1730
  "psubh      "#f28", "#f28", "#f24"          \n\t" \
1731
  "psubh      "#f30", "#f30", "#f26"          \n\t" \
1732
  "paddh      "#f0", "#f0", "#f28"            \n\t" \
1733
  "paddh      "#f2", "#f2", "#f30"            \n\t" \
1734
  "psllh      "#f28", "#f28", "#f8"           \n\t" \
1735
  "psllh      "#f30", "#f30", "#f8"           \n\t" \
1736
  "paddh      "#f0", "#f0", "#f28"            \n\t" \
1737
  "paddh      "#f2", "#f2", "#f30"            \n\t" \
1738
  "dli        "#r1", 0x5                      \n\t" \
1739
  "dmtc1      "#r1", "#f8"                    \n\t" \
1740
  "psrah      "#f0", "#f0", "#f8"             \n\t" \
1741
  "psrah      "#f2", "#f2", "#f8"             \n\t" \
1742
  "xor        "#f28", "#f28", "#f28"          \n\t" \
1743
  "packushb   "#f0", "#f0", "#f2"             \n\t" \
1744
  "gssdlc1    "#f0", 0x7("#r0")               \n\t" \
1745
  "gssdrc1    "#f0", 0x0("#r0")               \n\t" \
1746
  "dmtc1      "#r2", "#f8"                    \n\t"
1747
1748
// FILTER_VER_ALIGN: inline-assembly fragment that filters six inputs of 16-bit lanes with
// staged shifts and stores 8 result bytes through an indexed store.
//   Inputs (each a low/high register pair): A=(f0,f2) B=(f4,f6) C=(f8,f10) D=(f12,f14)
//   E=(f16,f18) F=(f20,f22).
//   Temporaries: (f24,f26) and (f28,f30).
//   r0, r1 - the two address registers of the final gssdxc1 store (offset 0x0).
//   r2     - scratch general register (loaded with the shift counts 2 and 6).
//   r3     - general register used to park f8 while f8 holds shift counts / the rounding term.
//   r4     - general register whose value is added to every result before the last shift;
//            it is set by the caller (value not visible here — presumably a packed
//            per-lane rounding constant, confirm at the call sites).
// Computation per lane, exactly as sequenced below:
//   t1  = ((A + F) - (B + E)) >> 2
//   t2  = (t1 + (C + D) - (B + E)) >> 2
//   out = ((C + D) + t2 + r4) >> 6
// followed by packushb of the two halves. All shifts are arithmetic (psrah).
// On exit: f0/f2, f24/f26, f28/f30 and r2 are clobbered; f8 is restored from r3.
// NOTE(review): the "ALIGN" in the name suggests the store target is expected to be
// 8-byte aligned — confirm against callers.
#define FILTER_VER_ALIGN(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, \
1749
                         f20, f22, f24, f26, f28, f30, r0, r1, r2, r3, r4) \
1750
  "paddh      "#f0", "#f0", "#f20"            \n\t" \
1751
  "paddh      "#f2", "#f2", "#f22"            \n\t" \
1752
  "mov.d      "#f24", "#f4"                   \n\t" \
1753
  "mov.d      "#f26", "#f6"                   \n\t" \
1754
  "mov.d      "#f28", "#f8"                   \n\t" \
1755
  "mov.d      "#f30", "#f10"                  \n\t" \
1756
  "dli        "#r2", 0x2                      \n\t" \
1757
  "paddh      "#f24", "#f24", "#f16"          \n\t" \
1758
  "paddh      "#f26", "#f26", "#f18"          \n\t" \
1759
  "dmfc1      "#r3", "#f8"                    \n\t" \
1760
  "paddh      "#f28", "#f28", "#f12"          \n\t" \
1761
  "paddh      "#f30", "#f30", "#f14"          \n\t" \
1762
  "dmtc1      "#r2", "#f8"                    \n\t" \
1763
  "psubh      "#f0", "#f0", "#f24"            \n\t" \
1764
  "psubh      "#f2", "#f2", "#f26"            \n\t" \
1765
  "psrah      "#f0", "#f0", "#f8"             \n\t" \
1766
  "psrah      "#f2", "#f2", "#f8"             \n\t" \
1767
  "paddh      "#f0", "#f0", "#f28"            \n\t" \
1768
  "paddh      "#f2", "#f2", "#f30"            \n\t" \
1769
  "psubh      "#f0", "#f0", "#f24"            \n\t" \
1770
  "psubh      "#f2", "#f2", "#f26"            \n\t" \
1771
  "psrah      "#f0", "#f0", "#f8"             \n\t" \
1772
  "psrah      "#f2", "#f2", "#f8"             \n\t" \
1773
  "dmtc1      "#r4", "#f8"                    \n\t" \
1774
  "paddh      "#f28", "#f28", "#f0"           \n\t" \
1775
  "paddh      "#f30", "#f30", "#f2"           \n\t" \
1776
  "dli        "#r2", 0x6                      \n\t" \
1777
  "paddh      "#f28", "#f28", "#f8"           \n\t" \
1778
  "paddh      "#f30", "#f30", "#f8"           \n\t" \
1779
  "dmtc1      "#r2", "#f8"                    \n\t" \
1780
  "psrah      "#f28", "#f28", "#f8"           \n\t" \
1781
  "psrah      "#f30", "#f30", "#f8"           \n\t" \
1782
  "packushb   "#f28", "#f28", "#f30"          \n\t" \
1783
  "gssdxc1    "#f28", 0x0("#r0", "#r1")       \n\t" \
1784
  "dmtc1      "#r3", "#f8"                    \n\t"
1785
1786
// FILTER_VER_UNALIGN: same arithmetic as FILTER_VER_ALIGN, but the 8 result bytes are
// written with the gssdlc1/gssdrc1 pair (offsets 0x7 / 0x0), i.e. an unaligned store at r0.
//   Inputs (each a low/high register pair of 16-bit lanes): A=(f0,f2) B=(f4,f6) C=(f8,f10)
//   D=(f12,f14) E=(f16,f18) F=(f20,f22).
//   Temporaries: (f24,f26) and (f28,f30).
//   r0 - destination address.
//   r1 - scratch general register (loaded with the shift counts 2 and 6).
//   r2 - general register used to park f8 while f8 holds shift counts / the rounding term.
//   r3 - general register whose value is added to every result before the last shift;
//        it is set by the caller (value not visible here — presumably a packed per-lane
//        rounding constant, confirm at the call sites).
// Computation per lane, exactly as sequenced below:
//   t1  = ((A + F) - (B + E)) >> 2
//   t2  = (t1 + (C + D) - (B + E)) >> 2
//   out = ((C + D) + t2 + r3) >> 6
// followed by packushb of the two halves. All shifts are arithmetic (psrah).
// On exit: f0/f2, f24/f26, f28/f30 and r1 are clobbered; f8 is restored from r2.
#define FILTER_VER_UNALIGN(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, \
1787
                           f20, f22, f24, f26, f28, f30, r0, r1, r2, r3) \
1788
  "paddh      "#f0", "#f0", "#f20"            \n\t" \
1789
  "paddh      "#f2", "#f2", "#f22"            \n\t" \
1790
  "mov.d      "#f24", "#f4"                   \n\t" \
1791
  "mov.d      "#f26", "#f6"                   \n\t" \
1792
  "mov.d      "#f28", "#f8"                   \n\t" \
1793
  "mov.d      "#f30", "#f10"                  \n\t" \
1794
  "dli        "#r1", 0x2                      \n\t" \
1795
  "paddh      "#f24", "#f24", "#f16"          \n\t" \
1796
  "paddh      "#f26", "#f26", "#f18"          \n\t" \
1797
  "dmfc1      "#r2", "#f8"                    \n\t" \
1798
  "paddh      "#f28", "#f28", "#f12"          \n\t" \
1799
  "paddh      "#f30", "#f30", "#f14"          \n\t" \
1800
  "dmtc1      "#r1", "#f8"                    \n\t" \
1801
  "psubh      "#f0", "#f0", "#f24"            \n\t" \
1802
  "psubh      "#f2", "#f2", "#f26"            \n\t" \
1803
  "psrah      "#f0", "#f0", "#f8"             \n\t" \
1804
  "psrah      "#f2", "#f2", "#f8"             \n\t" \
1805
  "paddh      "#f0", "#f0", "#f28"            \n\t" \
1806
  "paddh      "#f2", "#f2", "#f30"            \n\t" \
1807
  "psubh      "#f0", "#f0", "#f24"            \n\t" \
1808
  "psubh      "#f2", "#f2", "#f26"            \n\t" \
1809
  "psrah      "#f0", "#f0", "#f8"             \n\t" \
1810
  "psrah      "#f2", "#f2", "#f8"             \n\t" \
1811
  "dmtc1      "#r3", "#f8"                    \n\t" \
1812
  "paddh      "#f28", "#f28", "#f0"           \n\t" \
1813
  "paddh      "#f30", "#f30", "#f2"           \n\t" \
1814
  "dli        "#r1", 0x6                      \n\t" \
1815
  "paddh      "#f28", "#f28", "#f8"           \n\t" \
1816
  "paddh      "#f30", "#f30", "#f8"           \n\t" \
1817
  "dmtc1      "#r1", "#f8"                    \n\t" \
1818
  "psrah      "#f28", "#f28", "#f8"           \n\t" \
1819
  "psrah      "#f30", "#f30", "#f8"           \n\t" \
1820
  "packushb   "#f28", "#f28", "#f30"          \n\t" \
1821
  "gssdlc1    "#f28", 0x7("#r0")              \n\t" \
1822
  "gssdrc1    "#f28", 0x0("#r0")              \n\t" \
1823
  "dmtc1      "#r2", "#f8"                    \n\t"
1824
1825
// Horizontal 6-tap filter producing 5 output bytes per row (Loongson MMI inline assembly).
//
// For each output byte x of a row the code evaluates
//   (s[x-2] + s[x+3] + 20*(s[x] + s[x+1]) - 5*(s[x-1] + s[x+2]) + 16) >> 5
// on 16-bit lanes and packs the result to a byte with unsigned saturation (packushb).
// Two overlapping passes are used per row: the first writes pDst[0..3], the second repeats
// the filter one pixel further right and writes pDst[1..4].
//
//   pSrc       - first source pixel of the block; the assembly steps it back by 2 bytes.
//   iSrcStride - byte distance between source rows.
//   pDst       - destination; written with unaligned 4-byte stores.
//   iDstStride - byte distance between destination rows.
//   iWidth     - bound as an asm operand but never referenced by the assembly text.
//   iHeight    - row count. The loop is decrement-then-test (bnez), so a positive value is
//                expected.
//
// Every load is 8 bytes wide; per row the bytes at offsets 0x0..0xd from the adjusted
// (pSrc - 2) pointer are read.
// BACKUP_REG / RECOVER_REG are defined outside this view — presumably they save and restore
// the floating-point registers used here; confirm at their definition.
void McHorVer20Width5_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
1826
                          int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
1827
  BACKUP_REG;
1828
  __asm__ volatile (
1829
    ".set       arch=loongson3a                 \n\t"
1830
    "xor        $f28, $f28, $f28                \n\t"
1831
    PTR_ADDIU  "%[pSrc], %[pSrc], -0x2          \n\t"
1832
    // Loop-invariant constants: $8 = shift count 2, $10 = 16 in each 16-bit lane,
    // $11 = shift count 5.
    "dli        $8, 0x2                         \n\t"
1833
    "dli        $10, 0x0010001000100010         \n\t"
1834
    "dli        $11, 0x5                        \n\t"
1835
    // Per-row loop.
    "1:                                         \n\t"
1836
    "xor        $f28, $f28, $f28                \n\t"
1837
    // Six unaligned 8-byte loads at byte offsets 0, 5, 1, 4, 2 and 3 from the adjusted pSrc.
    "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
1838
    "gsldlc1    $f4, 0xc(%[pSrc])               \n\t"
1839
    "gsldlc1    $f8, 0x8(%[pSrc])               \n\t"
1840
    "gsldlc1    $f12, 0xb(%[pSrc])              \n\t"
1841
    "gsldlc1    $f16, 0x9(%[pSrc])              \n\t"
1842
    "gsldlc1    $f20, 0xa(%[pSrc])              \n\t"
1843
    "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
1844
    "gsldrc1    $f4, 0x5(%[pSrc])               \n\t"
1845
    "gsldrc1    $f8, 0x1(%[pSrc])               \n\t"
1846
    "gsldrc1    $f12, 0x4(%[pSrc])              \n\t"
1847
    "gsldrc1    $f16, 0x2(%[pSrc])              \n\t"
1848
    "gsldrc1    $f20, 0x3(%[pSrc])              \n\t"
1849
    // Widen every loaded register to 16-bit lanes against the zeroed $f28
    // (high half into the odd-pair register, low half in place).
    "punpckhbh  $f2, $f0, $f28                  \n\t"
1850
    "punpckhbh  $f6, $f4, $f28                  \n\t"
1851
    "punpckhbh  $f10, $f8, $f28                 \n\t"
1852
    "punpckhbh  $f14, $f12, $f28                \n\t"
1853
    "punpckhbh  $f18, $f16, $f28                \n\t"
1854
    "punpckhbh  $f22, $f20, $f28                \n\t"
1855
    "punpcklbh  $f0, $f0, $f28                  \n\t"
1856
    "punpcklbh  $f4, $f4, $f28                  \n\t"
1857
    "punpcklbh  $f8, $f8, $f28                  \n\t"
1858
    "punpcklbh  $f12, $f12, $f28                \n\t"
1859
    "punpcklbh  $f16, $f16, $f28                \n\t"
1860
    "punpcklbh  $f20, $f20, $f28                \n\t"
1861
1862
    // First pass. ($f28,$f30) = off1 + off4, ($f24,$f26) = off2 + off3; the loaded
    // registers themselves are left intact for the second pass.
    "mov.d      $f28, $f8                       \n\t"
1863
    "mov.d      $f30, $f10                      \n\t"
1864
    "paddh      $f28, $f28, $f12                \n\t"
1865
    "paddh      $f30, $f30, $f14                \n\t"
1866
    "mov.d      $f24, $f16                      \n\t"
1867
    "mov.d      $f26, $f18                      \n\t"
1868
    "paddh      $f24, $f24, $f20                \n\t"
1869
    "paddh      $f26, $f26, $f22                \n\t"
1870
    // $f12 is borrowed as the shift-count / constant register; its value is parked in $9.
    "dmfc1      $9, $f12                        \n\t"
1871
    "dmtc1      $8, $f12                        \n\t"
1872
    "psllh      $f24, $f24, $f12                \n\t"
1873
    "psllh      $f26, $f26, $f12                \n\t"
1874
    "psubh      $f24, $f24, $f28                \n\t"
1875
    "psubh      $f26, $f26, $f30                \n\t"
1876
    "paddh      $f0, $f0, $f4                   \n\t"
1877
    "paddh      $f2, $f2, $f6                   \n\t"
1878
    "paddh      $f0, $f0, $f24                  \n\t"
1879
    "paddh      $f2, $f2, $f26                  \n\t"
1880
    "psllh      $f24, $f24, $f12                \n\t"
1881
    "psllh      $f26, $f26, $f12                \n\t"
1882
    "paddh      $f0, $f0, $f24                  \n\t"
1883
    "paddh      $f2, $f2, $f26                  \n\t"
1884
1885
    // Add 16 per lane, shift right by 5, pack to bytes.
    "dmtc1      $10, $f12                       \n\t"
1886
    "paddh      $f0, $f0, $f12                  \n\t"
1887
    "paddh      $f2, $f2, $f12                  \n\t"
1888
    "dmtc1      $11, $f12                       \n\t"
1889
    "psrah      $f0, $f0, $f12                  \n\t"
1890
    "psrah      $f2, $f2, $f12                  \n\t"
1891
    "packushb   $f0, $f0, $f2                   \n\t"
1892
1893
    // Unaligned 4-byte store: pDst[0..3].
    "gsswlc1    $f0, 0x3(%[pDst])               \n\t"
1894
    "gsswrc1    $f0, 0x0(%[pDst])               \n\t"
1895
1896
    // Second pass: load the bytes at offset 6, restore $f12 from $9 and use $f24 as the
    // shift-count register; the filter window is shifted one pixel to the right.
    "gsldlc1    $f0, 0xd(%[pSrc])               \n\t"
1897
    "xor        $f28, $f28, $f28                \n\t"
1898
    "gsldrc1    $f0, 0x6(%[pSrc])               \n\t"
1899
    "punpckhbh  $f2, $f0, $f28                  \n\t"
1900
    "punpcklbh  $f0, $f0, $f28                  \n\t"
1901
    "dmtc1      $9, $f12                        \n\t"
1902
    "dmtc1      $8, $f24                        \n\t"
1903
1904
    "paddh      $f16, $f16, $f4                 \n\t"
1905
    "paddh      $f18, $f18, $f6                 \n\t"
1906
    "paddh      $f20, $f20, $f12                \n\t"
1907
    "paddh      $f22, $f22, $f14                \n\t"
1908
    "psllh      $f20, $f20, $f24                \n\t"
1909
    "psllh      $f22, $f22, $f24                \n\t"
1910
    "psubh      $f20, $f20, $f16                \n\t"
1911
    "psubh      $f22, $f22, $f18                \n\t"
1912
    "paddh      $f8, $f8, $f0                   \n\t"
1913
    "paddh      $f10, $f10, $f2                 \n\t"
1914
    "paddh      $f8, $f8, $f20                  \n\t"
1915
    "paddh      $f10, $f10, $f22                \n\t"
1916
    "psllh      $f20, $f20, $f24                \n\t"
1917
    "psllh      $f22, $f22, $f24                \n\t"
1918
    "paddh      $f8, $f8, $f20                  \n\t"
1919
    "paddh      $f10, $f10, $f22                \n\t"
1920
1921
    "dmtc1      $10, $f24                       \n\t"
1922
    "paddh      $f8, $f8, $f24                  \n\t"
1923
    "paddh      $f10, $f10, $f24                \n\t"
1924
    "dmtc1      $11, $f24                       \n\t"
1925
    "psrah      $f8, $f8, $f24                  \n\t"
1926
    "psrah      $f10, $f10, $f24                \n\t"
1927
    "packushb   $f8, $f8, $f10                  \n\t"
1928
    // Unaligned 4-byte store: pDst[1..4].
    "gsswlc1    $f8, 0x4(%[pDst])               \n\t"
1929
    "gsswrc1    $f8, 0x1(%[pDst])               \n\t"
1930
1931
    // Advance both pointers by one row and loop while iHeight is non-zero.
    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
1932
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
1933
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
1934
    "bnez       %[iHeight], 1b                  \n\t"
1935
    : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst),
1936
      [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight)
1937
    : [iSrcStride]"r"((int)iSrcStride),  [iDstStride]"r"((int)iDstStride)
1938
    : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8",
1939
      "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
1940
      "$f28", "$f30"
1941
  );
1942
  RECOVER_REG;
1943
}
1944
1945
// Horizontal 6-tap filter producing 9 or 17 output bytes per row (Loongson MMI inline
// assembly).
//
// For each output byte x of a row the code evaluates
//   (s[x-2] + s[x+3] + 20*(s[x] + s[x+1]) - 5*(s[x-1] + s[x+2]) + 16) >> 5
// on 16-bit lanes and packs the result to a byte with unsigned saturation (packushb).
//
// iWidth is compared against 9 once, before the loops:
//   - equal to 9  : loop "1" runs; per row it writes pDst[0..3] and then pDst[1..8].
//   - anything else (callers are expected to pass 17): loop "2" runs; per row it writes
//     pDst[0..7], pDst[8..11] and then pDst[9..16].
// The overlapping stores carry identical values for the shared bytes because the second
// store of each pair comes from the same filter evaluated one pixel further right.
//
//   pSrc       - first source pixel of the block; the assembly steps it back by 2 bytes.
//   iSrcStride - byte distance between source rows.
//   pDst       - destination; written with unaligned 4- and 8-byte stores.
//   iDstStride - byte distance between destination rows.
//   iWidth     - selects the 9-wide or the 17-wide loop (see above).
//   iHeight    - row count. Both loops are decrement-then-test (bnez), so a positive value
//                is expected.
//
// Every load is 8 bytes wide; per row the 9-wide loop reads offsets 0x0..0xd and the
// 17-wide loop reads offsets 0x0..0x15 from the adjusted (pSrc - 2) pointer.
// BACKUP_REG / RECOVER_REG are defined outside this view — presumably they save and restore
// the floating-point registers used here; confirm at their definition.
void McHorVer20Width9Or17_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
1946
                              int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
1947
  BACKUP_REG;
1948
  __asm__ volatile (
1949
    ".set       arch=loongson3a                 \n\t"
1950
    PTR_ADDIU  "%[pSrc], %[pSrc], -0x2          \n\t"
1951
    "xor        $f28, $f28, $f28                \n\t"
1952
    // Loop-invariant constants: $8 = shift count 2, $10 = 16 in each 16-bit lane,
    // $11 = shift count 5. $9 holds 9 only for the width test; it is reused as a save
    // slot for $f12 inside the loops.
    "dli        $8, 0x2                         \n\t"
1953
    "dli        $9, 0x9                         \n\t"
1954
    "dli        $10, 0x0010001000100010         \n\t"
1955
    "dli        $11, 0x5                        \n\t"
1956
    "bne        %[iWidth], $9, 2f               \n\t"
1957
    // ---- 9-wide loop ----
    "1:                                         \n\t"
1958
    "xor        $f28, $f28, $f28                \n\t"
1959
    // Six unaligned 8-byte loads at byte offsets 0, 5, 1, 4, 2 and 3 from the adjusted pSrc.
    "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
1960
    "gsldlc1    $f4, 0xc(%[pSrc])               \n\t"
1961
    "gsldlc1    $f8, 0x8(%[pSrc])               \n\t"
1962
    "gsldlc1    $f12, 0xb(%[pSrc])              \n\t"
1963
    "gsldlc1    $f16, 0x9(%[pSrc])              \n\t"
1964
    "gsldlc1    $f20, 0xa(%[pSrc])              \n\t"
1965
    "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
1966
    "gsldrc1    $f4, 0x5(%[pSrc])               \n\t"
1967
    "gsldrc1    $f8, 0x1(%[pSrc])               \n\t"
1968
    "gsldrc1    $f12, 0x4(%[pSrc])              \n\t"
1969
    "gsldrc1    $f16, 0x2(%[pSrc])              \n\t"
1970
    "gsldrc1    $f20, 0x3(%[pSrc])              \n\t"
1971
    // Widen to 16-bit lanes against the zeroed $f28.
    "punpckhbh  $f2, $f0, $f28                  \n\t"
1972
    "punpckhbh  $f6, $f4, $f28                  \n\t"
1973
    "punpckhbh  $f10, $f8, $f28                 \n\t"
1974
    "punpckhbh  $f14, $f12, $f28                \n\t"
1975
    "punpckhbh  $f18, $f16, $f28                \n\t"
1976
    "punpckhbh  $f22, $f20, $f28                \n\t"
1977
    "punpcklbh  $f0, $f0, $f28                  \n\t"
1978
    "punpcklbh  $f4, $f4, $f28                  \n\t"
1979
    "punpcklbh  $f8, $f8, $f28                  \n\t"
1980
    "punpcklbh  $f12, $f12, $f28                \n\t"
1981
    "punpcklbh  $f16, $f16, $f28                \n\t"
1982
    "punpcklbh  $f20, $f20, $f28                \n\t"
1983
1984
    // First pass on copies, so the loaded registers survive for the shifted second pass.
    "mov.d      $f28, $f8                       \n\t"
1985
    "mov.d      $f30, $f10                      \n\t"
1986
    "paddh      $f28, $f28, $f12                \n\t"
1987
    "paddh      $f30, $f30, $f14                \n\t"
1988
    "mov.d      $f24, $f16                      \n\t"
1989
    "mov.d      $f26, $f18                      \n\t"
1990
    "paddh      $f24, $f24, $f20                \n\t"
1991
    "paddh      $f26, $f26, $f22                \n\t"
1992
    // $f12 is borrowed as the shift-count / constant register; its value is parked in $9.
    "dmfc1      $9, $f12                        \n\t"
1993
    "dmtc1      $8, $f12                        \n\t"
1994
    "psllh      $f24, $f24, $f12                \n\t"
1995
    "psllh      $f26, $f26, $f12                \n\t"
1996
    "psubh      $f24, $f24, $f28                \n\t"
1997
    "psubh      $f26, $f26, $f30                \n\t"
1998
    "paddh      $f0, $f0, $f4                   \n\t"
1999
    "paddh      $f2, $f2, $f6                   \n\t"
2000
    "paddh      $f0, $f0, $f24                  \n\t"
2001
    "paddh      $f2, $f2, $f26                  \n\t"
2002
    "psllh      $f24, $f24, $f12                \n\t"
2003
    "psllh      $f26, $f26, $f12                \n\t"
2004
    "paddh      $f0, $f0, $f24                  \n\t"
2005
    "paddh      $f2, $f2, $f26                  \n\t"
2006
2007
    "dmtc1      $10, $f12                       \n\t"
2008
    "paddh      $f0, $f0, $f12                  \n\t"
2009
    "paddh      $f2, $f2, $f12                  \n\t"
2010
    "dmtc1      $11, $f12                       \n\t"
2011
    "psrah      $f0, $f0, $f12                  \n\t"
2012
    "psrah      $f2, $f2, $f12                  \n\t"
2013
    "packushb   $f0, $f0, $f2                   \n\t"
2014
2015
    // Unaligned 4-byte store: pDst[0..3].
    "gsswlc1    $f0, 0x3(%[pDst])               \n\t"
2016
    "gsswrc1    $f0, 0x0(%[pDst])               \n\t"
2017
2018
    // Second pass: load offset 6, restore $f12 from $9, filter shifted right by one pixel.
    "gsldlc1    $f0, 0xd(%[pSrc])               \n\t"
2019
    "xor        $f28, $f28, $f28                \n\t"
2020
    "gsldrc1    $f0, 0x6(%[pSrc])               \n\t"
2021
    "punpckhbh  $f2, $f0, $f28                  \n\t"
2022
    "punpcklbh  $f0, $f0, $f28                  \n\t"
2023
    "dmtc1      $9, $f12                        \n\t"
2024
    "dmtc1      $8, $f24                        \n\t"
2025
2026
    "paddh      $f16, $f16, $f4                 \n\t"
2027
    "paddh      $f18, $f18, $f6                 \n\t"
2028
    "paddh      $f20, $f20, $f12                \n\t"
2029
    "paddh      $f22, $f22, $f14                \n\t"
2030
    "psllh      $f20, $f20, $f24                \n\t"
2031
    "psllh      $f22, $f22, $f24                \n\t"
2032
    "psubh      $f20, $f20, $f16                \n\t"
2033
    "psubh      $f22, $f22, $f18                \n\t"
2034
    "paddh      $f8, $f8, $f0                   \n\t"
2035
    "paddh      $f10, $f10, $f2                 \n\t"
2036
    "paddh      $f8, $f8, $f20                  \n\t"
2037
    "paddh      $f10, $f10, $f22                \n\t"
2038
    "psllh      $f20, $f20, $f24                \n\t"
2039
    "psllh      $f22, $f22, $f24                \n\t"
2040
    "paddh      $f8, $f8, $f20                  \n\t"
2041
    "paddh      $f10, $f10, $f22                \n\t"
2042
2043
    "dmtc1      $10, $f24                       \n\t"
2044
    "paddh      $f8, $f8, $f24                  \n\t"
2045
    "paddh      $f10, $f10, $f24                \n\t"
2046
    "dmtc1      $11, $f24                       \n\t"
2047
    "psrah      $f8, $f8, $f24                  \n\t"
2048
    "psrah      $f10, $f10, $f24                \n\t"
2049
    "packushb   $f8, $f8, $f10                  \n\t"
2050
    // Unaligned 8-byte store: pDst[1..8].
    "gssdlc1    $f8, 0x8(%[pDst])               \n\t"
2051
    "gssdrc1    $f8, 0x1(%[pDst])               \n\t"
2052
2053
    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
2054
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2055
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2056
    "bnez       %[iHeight], 1b                  \n\t"
2057
    // 9-wide path finished: skip over the 17-wide loop.
    "j          3f                              \n\t"
2058
2059
    // ---- 17-wide loop ----
    "2:                                         \n\t"
2060
    "xor        $f28, $f28, $f28                \n\t"
2061
    // Six unaligned 8-byte loads at byte offsets 0, 5, 1, 4, 2 and 3 from the adjusted pSrc.
    "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
2062
    "gsldlc1    $f4, 0xc(%[pSrc])               \n\t"
2063
    "gsldlc1    $f8, 0x8(%[pSrc])               \n\t"
2064
    "gsldlc1    $f12, 0xb(%[pSrc])              \n\t"
2065
    "gsldlc1    $f16, 0x9(%[pSrc])              \n\t"
2066
    "gsldlc1    $f20, 0xa(%[pSrc])              \n\t"
2067
    "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
2068
    "gsldrc1    $f4, 0x5(%[pSrc])               \n\t"
2069
    "gsldrc1    $f8, 0x1(%[pSrc])               \n\t"
2070
    "gsldrc1    $f12, 0x4(%[pSrc])              \n\t"
2071
    "gsldrc1    $f16, 0x2(%[pSrc])              \n\t"
2072
    "gsldrc1    $f20, 0x3(%[pSrc])              \n\t"
2073
    "punpckhbh  $f2, $f0, $f28                  \n\t"
2074
    "punpckhbh  $f6, $f4, $f28                  \n\t"
2075
    "punpckhbh  $f10, $f8, $f28                 \n\t"
2076
    "punpckhbh  $f14, $f12, $f28                \n\t"
2077
    "punpckhbh  $f18, $f16, $f28                \n\t"
2078
    "punpckhbh  $f22, $f20, $f28                \n\t"
2079
    "punpcklbh  $f0, $f0, $f28                  \n\t"
2080
    "punpcklbh  $f4, $f4, $f28                  \n\t"
2081
    "punpcklbh  $f8, $f8, $f28                  \n\t"
2082
    "punpcklbh  $f12, $f12, $f28                \n\t"
2083
    "punpcklbh  $f16, $f16, $f28                \n\t"
2084
    "punpcklbh  $f20, $f20, $f28                \n\t"
2085
2086
    // Outputs 0..7: filtered in place (the loaded registers are not needed again), with
    // $f30 as the shift-count / constant register. $f28 stays zero for the next unpack.
    "dmtc1      $8, $f30                        \n\t"
2087
    "paddh      $f8, $f8, $f12                  \n\t"
2088
    "paddh      $f10, $f10, $f14                \n\t"
2089
    "paddh      $f16, $f16, $f20                \n\t"
2090
    "paddh      $f18, $f18, $f22                \n\t"
2091
    "psllh      $f16, $f16, $f30                \n\t"
2092
    "psllh      $f18, $f18, $f30                \n\t"
2093
    "psubh      $f16, $f16, $f8                 \n\t"
2094
    "psubh      $f18, $f18, $f10                \n\t"
2095
    "paddh      $f0, $f0, $f4                   \n\t"
2096
    "paddh      $f2, $f2, $f6                   \n\t"
2097
    "paddh      $f0, $f0, $f16                  \n\t"
2098
    "paddh      $f2, $f2, $f18                  \n\t"
2099
    "psllh      $f16, $f16, $f30                \n\t"
2100
    "psllh      $f18, $f18, $f30                \n\t"
2101
    "paddh      $f0, $f0, $f16                  \n\t"
2102
    "paddh      $f2, $f2, $f18                  \n\t"
2103
2104
    "dmtc1      $10, $f30                       \n\t"
2105
    "paddh      $f0, $f0, $f30                  \n\t"
2106
    "paddh      $f2, $f2, $f30                  \n\t"
2107
    "dmtc1      $11, $f30                       \n\t"
2108
    "psrah      $f0, $f0, $f30                  \n\t"
2109
    "psrah      $f2, $f2, $f30                  \n\t"
2110
    "packushb   $f0, $f0, $f2                   \n\t"
2111
    // Unaligned 8-byte store: pDst[0..7].
    "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
2112
    "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
2113
2114
    // Outputs 8..15: six unaligned 8-byte loads at byte offsets 8, 0xd, 9, 0xc, 0xa and 0xb.
    "gsldlc1    $f0, 15(%[pSrc])                \n\t"
2115
    "gsldlc1    $f4, 0x14(%[pSrc])              \n\t"
2116
    "gsldlc1    $f8, 0x10(%[pSrc])              \n\t"
2117
    "gsldlc1    $f12, 0x13(%[pSrc])             \n\t"
2118
    "gsldlc1    $f16, 0x11(%[pSrc])             \n\t"
2119
    "gsldlc1    $f20, 0x12(%[pSrc])             \n\t"
2120
    "gsldrc1    $f0, 8(%[pSrc])                 \n\t"
2121
    "gsldrc1    $f4, 0xd(%[pSrc])               \n\t"
2122
    "gsldrc1    $f8, 0x9(%[pSrc])               \n\t"
2123
    "gsldrc1    $f12, 0xc(%[pSrc])              \n\t"
2124
    "gsldrc1    $f16, 0xa(%[pSrc])              \n\t"
2125
    "gsldrc1    $f20, 0xb(%[pSrc])              \n\t"
2126
    "punpckhbh  $f2, $f0, $f28                  \n\t"
2127
    "punpckhbh  $f6, $f4, $f28                  \n\t"
2128
    "punpckhbh  $f10, $f8, $f28                 \n\t"
2129
    "punpckhbh  $f14, $f12, $f28                \n\t"
2130
    "punpckhbh  $f18, $f16, $f28                \n\t"
2131
    "punpckhbh  $f22, $f20, $f28                \n\t"
2132
    "punpcklbh  $f0, $f0, $f28                  \n\t"
2133
    "punpcklbh  $f4, $f4, $f28                  \n\t"
2134
    "punpcklbh  $f8, $f8, $f28                  \n\t"
2135
    "punpcklbh  $f12, $f12, $f28                \n\t"
2136
    "punpcklbh  $f16, $f16, $f28                \n\t"
2137
    "punpcklbh  $f20, $f20, $f28                \n\t"
2138
2139
    // Filter on copies so the loaded registers survive for the final shifted pass.
    "mov.d      $f28, $f8                       \n\t"
2140
    "mov.d      $f30, $f10                      \n\t"
2141
    "paddh      $f28, $f28, $f12                \n\t"
2142
    "paddh      $f30, $f30, $f14                \n\t"
2143
    "mov.d      $f24, $f16                      \n\t"
2144
    "mov.d      $f26, $f18                      \n\t"
2145
    "paddh      $f24, $f24, $f20                \n\t"
2146
    "paddh      $f26, $f26, $f22                \n\t"
2147
    // $f12 is borrowed as the shift-count register; its value is parked in $9.
    "dmfc1      $9, $f12                        \n\t"
2148
    "dmtc1      $8, $f12                        \n\t"
2149
    "psllh      $f24, $f24, $f12                \n\t"
2150
    "psllh      $f26, $f26, $f12                \n\t"
2151
    "psubh      $f24, $f24, $f28                \n\t"
2152
    "psubh      $f26, $f26, $f30                \n\t"
2153
    "paddh      $f0, $f0, $f4                   \n\t"
2154
    "paddh      $f2, $f2, $f6                   \n\t"
2155
    "paddh      $f0, $f0, $f24                  \n\t"
2156
    "paddh      $f2, $f2, $f26                  \n\t"
2157
    "psllh      $f24, $f24, $f12                \n\t"
2158
    "psllh      $f26, $f26, $f12                \n\t"
2159
    "paddh      $f0, $f0, $f24                  \n\t"
2160
    "paddh      $f2, $f2, $f26                  \n\t"
2161
2162
    "dmtc1      $10, $f30                       \n\t"
2163
    "paddh      $f0, $f0, $f30                  \n\t"
2164
    "paddh      $f2, $f2, $f30                  \n\t"
2165
    "dmtc1      $11, $f30                       \n\t"
2166
    "psrah      $f0, $f0, $f30                  \n\t"
2167
    "psrah      $f2, $f2, $f30                  \n\t"
2168
    "packushb   $f0, $f0, $f2                   \n\t"
2169
    // Unaligned 4-byte store: pDst[8..11].
    "gsswlc1    $f0, 0xb(%[pDst])               \n\t"
2170
    "gsswrc1    $f0, 0x8(%[pDst])               \n\t"
2171
2172
    // Outputs 9..16: restore $f12 from $9, load offset 0xE and repeat the filter shifted
    // right by one pixel.
    // NOTE(review): $f30 is loaded with 0x20 below, but no later instruction in this loop
    // reads $f30 before it is overwritten at the top of the next iteration — this looks
    // like dead code; confirm before removing.
    "dmtc1      $9, $f12                        \n\t"
2173
    "xor        $f28, $f28, $f28                \n\t"
2174
    "dli        $9, 0x20                        \n\t"
2175
    "gsldlc1    $f0, 0x15(%[pSrc])              \n\t"
2176
    "dmtc1      $9, $f30                        \n\t"
2177
    "gsldrc1    $f0, 0xE(%[pSrc])               \n\t"
2178
    "punpckhbh  $f2, $f0, $f28                  \n\t"
2179
    "punpcklbh  $f0, $f0, $f28                  \n\t"
2180
    "dmtc1      $8, $f24                        \n\t"
2181
2182
    "paddh      $f16, $f16, $f4                 \n\t"
2183
    "paddh      $f18, $f18, $f6                 \n\t"
2184
    "paddh      $f20, $f20, $f12                \n\t"
2185
    "paddh      $f22, $f22, $f14                \n\t"
2186
    "psllh      $f20, $f20, $f24                \n\t"
2187
    "psllh      $f22, $f22, $f24                \n\t"
2188
    "psubh      $f20, $f20, $f16                \n\t"
2189
    "psubh      $f22, $f22, $f18                \n\t"
2190
    "paddh      $f8, $f8, $f0                   \n\t"
2191
    "paddh      $f10, $f10, $f2                 \n\t"
2192
    "paddh      $f8, $f8, $f20                  \n\t"
2193
    "paddh      $f10, $f10, $f22                \n\t"
2194
    "psllh      $f20, $f20, $f24                \n\t"
2195
    "psllh      $f22, $f22, $f24                \n\t"
2196
    "paddh      $f8, $f8, $f20                  \n\t"
2197
    "paddh      $f10, $f10, $f22                \n\t"
2198
2199
    "dmtc1      $10, $f24                       \n\t"
2200
    "paddh      $f8, $f8, $f24                  \n\t"
2201
    "paddh      $f10, $f10, $f24                \n\t"
2202
    "dmtc1      $11, $f24                       \n\t"
2203
    "psrah      $f8, $f8, $f24                  \n\t"
2204
    "psrah      $f10, $f10, $f24                \n\t"
2205
    "packushb   $f8, $f8, $f10                  \n\t"
2206
    // Unaligned 8-byte store: pDst[9..16].
    "gssdlc1    $f8, 0x10(%[pDst])              \n\t"
2207
    "gssdrc1    $f8, 0x9(%[pDst])               \n\t"
2208
2209
    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
2210
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2211
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2212
    "bnez       %[iHeight], 2b                  \n\t"
2213
    // Common exit label for both loops.
    "3:                                         \n\t"
2214
    : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst),
2215
      [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight)
2216
    : [iSrcStride]"r"((int)iSrcStride),  [iDstStride]"r"((int)iDstStride)
2217
    : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8",
2218
      "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
2219
      "$f28", "$f30"
2220
  );
2221
  RECOVER_REG;
2222
}
2223
2224
//horizontal filter to gain half sample, that is (2, 0) location in quarter sample
2225
static inline void McHorVer20Width5Or9Or17_mmi(const uint8_t* pSrc, int32_t iSrcStride,
2226
                                               uint8_t* pDst, int32_t iDstStride,
2227
                                               int32_t iWidth, int32_t iHeight) {
2228
  if (iWidth == 17 || iWidth == 9)
2229
      McHorVer20Width9Or17_mmi(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
2230
  else //if (iWidth == 5)
2231
      McHorVer20Width5_mmi(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
2232
}
2233
2234
// Vertical filter pass (Loongson MMI inline asm) that walks the block in
// 4-byte-wide column strips.
//
//   pSrc, iSrcStride  source pointer and byte distance between source rows;
//                     reading starts two rows above pSrc.
//   pDst, iDstStride  destination pointer and byte distance between rows.
//   iWidth            shifted right by 2 to obtain the number of strips.
//   iHeight           row counter; it is decremented twice before the first
//                     zero test, so values below 2 never hit the exit check.
//
// Six consecutive source rows are loaded before the first FILTER_HV_W4.
// NOTE(review): MMI_LOAD_8P, FILTER_HV_W4, BACKUP_REG and RECOVER_REG are
// macros defined outside this function; presumably each FILTER_HV_W4 emits one
// output row at %[pDst] -- confirm against the macro definitions.
// NOTE(review): $12/$13 are written once and never advanced, so a third strip
// would restart at the same +4 offset as the second; callers presumably never
// pass iWidth >= 12 -- confirm.
void McHorVer02Height5_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
2235
                           int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
2236
  BACKUP_REG;
2237
  __asm__ volatile (
2238
    ".set       arch=loongson3a                 \n\t"
2239
    // Keep the entry values of pSrc, pDst and iHeight for the next strip.
    "move       $12, %[pSrc]                    \n\t"
2240
    "move       $13, %[pDst]                    \n\t"
2241
    "move       $14, %[iHeight]                 \n\t"
2242
2243
    // iWidth becomes the strip count; $10 = 2 * iSrcStride; start two rows up.
    "dsrl       %[iWidth], %[iWidth], 0x2       \n\t"
2244
    PTR_ADDU   "$10, %[iSrcStride], %[iSrcStride] \n\t"
2245
    PTR_SUBU   "%[pSrc], %[pSrc], $10           \n\t"
2246
2247
    // 1: start of one strip -- zero $f28 and load six consecutive rows.
    "1:                                         \n\t"
2248
    "xor        $f28, $f28, $f28                \n\t"
2249
    MMI_LOAD_8P($f0, $f2, $f28, %[pSrc])
2250
    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2251
    MMI_LOAD_8P($f4, $f6, $f28, $8)
2252
2253
    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2254
    MMI_LOAD_8P($f8, $f10, $f28, %[pSrc])
2255
    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2256
    MMI_LOAD_8P($f12, $f14, $f28, $8)
2257
    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2258
    MMI_LOAD_8P($f16, $f18, $f28, %[pSrc])
2259
    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2260
    MMI_LOAD_8P($f20, $f22, $f28, $8)
2261
    FILTER_HV_W4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
2262
                 $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9)
2263
    // First decrement of the row counter; no exit test here.
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2264
    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2265
    MMI_LOAD_8P($f24, $f26, $f28, %[pSrc])
2266
    // Slide the row registers down by one row ($f4/$f6 -> $f0/$f2, ...).
    "mov.d      $f0, $f4                        \n\t"
2267
    "mov.d      $f2, $f6                        \n\t"
2268
    "mov.d      $f4, $f8                        \n\t"
2269
    "mov.d      $f6, $f10                       \n\t"
2270
    "mov.d      $f8, $f12                       \n\t"
2271
    "mov.d      $f10, $f14                      \n\t"
2272
    "mov.d      $f12, $f16                      \n\t"
2273
    "mov.d      $f14, $f18                      \n\t"
2274
    "mov.d      $f16, $f20                      \n\t"
2275
    "mov.d      $f18, $f22                      \n\t"
2276
    "mov.d      $f20, $f24                      \n\t"
2277
    "mov.d      $f22, $f26                      \n\t"
2278
2279
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2280
    PTR_SUBU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
2281
2282
    // 2: loop body unrolled eight times. Each step passes the register list
    // shifted by one row pair instead of copying registers, loads one new row,
    // decrements iHeight and leaves for label 3 when it reaches zero.
    "2:                                         \n\t"
2283
    FILTER_HV_W4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
2284
                 $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9)
2285
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2286
    "beqz       %[iHeight], 3f                  \n\t"
2287
2288
    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2289
    MMI_LOAD_8P($f24, $f26, $f28, %[pSrc])
2290
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2291
    FILTER_HV_W4($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24,
2292
                 $f26, $f28, $f30, $f0, $f2, %[pDst], $8, $9)
2293
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2294
    "beqz       %[iHeight], 3f                  \n\t"
2295
2296
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2297
    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2298
    MMI_LOAD_8P($f28, $f30, $f0, $8)
2299
    FILTER_HV_W4($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28,
2300
                 $f30, $f0, $f2, $f4, $f6, %[pDst], $8, $9)
2301
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2302
    "beqz       %[iHeight], 3f                  \n\t"
2303
2304
    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2305
    MMI_LOAD_8P($f0, $f2, $f4, %[pSrc])
2306
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2307
    FILTER_HV_W4($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0,
2308
                 $f2, $f4, $f6, $f8, $f10, %[pDst], $8, $9)
2309
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2310
    "beqz       %[iHeight], 3f                  \n\t"
2311
2312
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2313
    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2314
    MMI_LOAD_8P($f4, $f6, $f8, $8)
2315
    FILTER_HV_W4($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6,
2316
                 $f8, $f10, $f12, $f14, %[pDst], $8, $9)
2317
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2318
    "beqz       %[iHeight], 3f                  \n\t"
2319
2320
    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2321
    MMI_LOAD_8P($f8, $f10, $f12, %[pSrc])
2322
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2323
    FILTER_HV_W4($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10,
2324
                 $f12, $f14, $f16, $f18, %[pDst], $8, $9)
2325
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2326
    "beqz       %[iHeight], 3f                  \n\t"
2327
2328
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2329
    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2330
    MMI_LOAD_8P($f12, $f14, $f16, $8)
2331
    FILTER_HV_W4($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14,
2332
                 $f16, $f18, $f20, $f22, %[pDst], $8, $9)
2333
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2334
    "beqz       %[iHeight], 3f                  \n\t"
2335
2336
    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2337
    MMI_LOAD_8P($f16, $f18, $f20, %[pSrc])
2338
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2339
    FILTER_HV_W4($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18,
2340
                 $f20, $f22, $f24, $f26, %[pDst], $8, $9)
2341
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2342
    "beqz       %[iHeight], 3f                  \n\t"
2343
2344
    // The register window is back at its starting position: load the next row
    // into $f20/$f22 and repeat from label 2.
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2345
    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2346
    MMI_LOAD_8P($f20, $f22, $f24, $8)
2347
    "j          2b                              \n\t"
2348
2349
    // 3: strip finished. Stop when the strip counter hits zero; otherwise
    // reload the saved entry values, back up two source rows again, step both
    // pointers 4 bytes to the right and restart at label 1.
    "3:                                         \n\t"
2350
    PTR_ADDIU  "%[iWidth], %[iWidth], -0x1      \n\t"
2351
    "beqz       %[iWidth], 4f                   \n\t"
2352
    "move       %[pSrc], $12                    \n\t"
2353
    "move       %[pDst], $13                    \n\t"
2354
    "move       %[iHeight], $14                 \n\t"
2355
    PTR_SUBU   "%[pSrc], %[pSrc], $10           \n\t"
2356
    PTR_ADDIU  "%[pSrc], %[pSrc], 0x4           \n\t"
2357
    PTR_ADDIU  "%[pDst], %[pDst], 0x4           \n\t"
2358
    "j          1b                              \n\t"
2359
    "4:                                         \n\t"
2360
    // pSrc, pDst, iWidth and iHeight are read-write operands (all modified
    // above); the strides are read-only inputs.
    : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst),
2361
      [iWidth]"+&r"(iWidth), [iHeight]"+&r"(iHeight)
2362
    : [iSrcStride]"r"(iSrcStride),  [iDstStride]"r"(iDstStride)
2363
    : "memory", "$8", "$9", "$10", "$12", "$13", "$14", "$f0", "$f2", "$f4",
2364
      "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
2365
      "$f24", "$f26", "$f28", "$f30"
2366
  );
2367
  RECOVER_REG;
2368
}
2369
2370
// Vertical filter pass (Loongson MMI inline asm) that walks the block in
// 8-byte-wide column strips.
//
//   pSrc, iSrcStride  source pointer and byte distance between source rows;
//                     reading starts two rows above pSrc.
//   pDst, iDstStride  destination pointer and byte distance between rows.
//   iWidth            shifted right by 3 to obtain the number of strips.
//   iHeight           row counter; it is decremented twice before the first
//                     zero test, so values below 2 never hit the exit check.
//
// Six consecutive source rows are loaded before the first FILTER_HV_W8.
// NOTE(review): MMI_LOAD_8P, FILTER_HV_W8, BACKUP_REG and RECOVER_REG are
// macros defined outside this function; presumably each FILTER_HV_W8 emits one
// output row at %[pDst] -- confirm against the macro definitions.
// NOTE(review): $12/$13 are written once and never advanced, so a third strip
// would restart at the same +8 offset as the second; callers presumably never
// pass iWidth >= 24 -- confirm.
void McHorVer02Height9Or17_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
2371
                               int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
2372
  BACKUP_REG;
2373
  __asm__ volatile (
2374
    ".set       arch=loongson3a                 \n\t"
2375
    // Keep the entry values of pSrc, pDst and iHeight for the next strip.
    "move       $12, %[pSrc]                    \n\t"
2376
    "move       $13, %[pDst]                    \n\t"
2377
    "move       $14, %[iHeight]                 \n\t"
2378
2379
    // iWidth becomes the strip count; $10 = 2 * iSrcStride; start two rows up.
    "dsrl       %[iWidth], %[iWidth], 0x3       \n\t"
2380
    PTR_ADDU   "$10, %[iSrcStride], %[iSrcStride] \n\t"
2381
    PTR_SUBU   "%[pSrc], %[pSrc], $10           \n\t"
2382
2383
    // 1: start of one strip -- zero $f28, put 0x20 in $f30, load six rows.
    // NOTE(review): the 0x20 in $f30 is presumably consumed by FILTER_HV_W8 --
    // confirm against the macro.
    "1:                                         \n\t"
2384
    "dli        $8, 0x20                        \n\t"
2385
    "xor        $f28, $f28, $f28                \n\t"
2386
    "dmtc1      $8, $f30                        \n\t"
2387
2388
    MMI_LOAD_8P($f0, $f2, $f28, %[pSrc])
2389
    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2390
    MMI_LOAD_8P($f4, $f6, $f28, $8)
2391
    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2392
    MMI_LOAD_8P($f8, $f10, $f28, %[pSrc])
2393
    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2394
    MMI_LOAD_8P($f12, $f14, $f28, $8)
2395
    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2396
    MMI_LOAD_8P($f16, $f18, $f28, %[pSrc])
2397
    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2398
    MMI_LOAD_8P($f20, $f22, $f28, $8)
2399
    FILTER_HV_W8($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
2400
                 $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9)
2401
    // First decrement of the row counter; no exit test here.
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2402
    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2403
    MMI_LOAD_8P($f24, $f26, $f28, %[pSrc])
2404
    // Slide the row registers down by one row ($f4/$f6 -> $f0/$f2, ...).
    "mov.d      $f0, $f4                        \n\t"
2405
    "mov.d      $f2, $f6                        \n\t"
2406
    "mov.d      $f4, $f8                        \n\t"
2407
    "mov.d      $f6, $f10                       \n\t"
2408
    "mov.d      $f8, $f12                       \n\t"
2409
    "mov.d      $f10, $f14                      \n\t"
2410
    "mov.d      $f12, $f16                      \n\t"
2411
    "mov.d      $f14, $f18                      \n\t"
2412
    "mov.d      $f16, $f20                      \n\t"
2413
    "mov.d      $f18, $f22                      \n\t"
2414
    "mov.d      $f20, $f24                      \n\t"
2415
    "mov.d      $f22, $f26                      \n\t"
2416
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2417
    PTR_SUBU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
2418
2419
    // 2: loop body unrolled eight times. Each step passes the register list
    // shifted by one row pair instead of copying registers, loads one new row,
    // decrements iHeight and leaves for label 3 when it reaches zero.
    "2:                                         \n\t"
2420
    FILTER_HV_W8($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
2421
                 $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9)
2422
    // Copies $9 into $f8 after the first filter step only.
    // NOTE(review): presumably restores a value the macro parked in $9 --
    // confirm against FILTER_HV_W8.
    "dmtc1      $9, $f8                         \n\t"
2423
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2424
    "beqz       %[iHeight], 3f                  \n\t"
2425
2426
    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2427
    MMI_LOAD_8P($f24, $f26, $f28, %[pSrc])
2428
    PTR_ADDU   "%[pDst],  %[pDst], %[iDstStride] \n\t"
2429
    FILTER_HV_W8($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24,
2430
                 $f26, $f28, $f30, $f0, $f2, %[pDst], $8, $9)
2431
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2432
    "beqz       %[iHeight], 3f                  \n\t"
2433
2434
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2435
    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2436
    MMI_LOAD_8P($f28, $f30, $f0, $8)
2437
    FILTER_HV_W8($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28,
2438
                 $f30, $f0, $f2, $f4, $f6, %[pDst], $8, $9)
2439
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2440
    "beqz       %[iHeight], 3f                  \n\t"
2441
2442
    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2443
    MMI_LOAD_8P($f0, $f2, $f4, %[pSrc])
2444
    PTR_ADDU   "%[pDst],  %[pDst], %[iDstStride] \n\t"
2445
    FILTER_HV_W8($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0,
2446
                 $f2, $f4, $f6, $f8, $f10, %[pDst], $8, $9)
2447
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2448
    "beqz       %[iHeight], 3f                  \n\t"
2449
2450
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2451
    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2452
    MMI_LOAD_8P($f4, $f6, $f8, $8)
2453
    FILTER_HV_W8($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4,
2454
                 $f6, $f8, $f10, $f12, $f14, %[pDst], $8, $9)
2455
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2456
    "beqz       %[iHeight], 3f                  \n\t"
2457
2458
    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2459
    MMI_LOAD_8P($f8, $f10, $f12, %[pSrc])
2460
    PTR_ADDU   "%[pDst],  %[pDst], %[iDstStride] \n\t"
2461
    FILTER_HV_W8($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8,
2462
                 $f10, $f12, $f14, $f16, $f18, %[pDst], $8, $9)
2463
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2464
    "beqz       %[iHeight], 3f                  \n\t"
2465
2466
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2467
    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2468
    MMI_LOAD_8P($f12, $f14, $f16, $8)
2469
    FILTER_HV_W8($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12,
2470
                 $f14, $f16, $f18, $f20, $f22, %[pDst], $8, $9)
2471
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2472
    "beqz       %[iHeight], 3f                  \n\t"
2473
2474
    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2475
    MMI_LOAD_8P($f16, $f18, $f20, %[pSrc])
2476
    PTR_ADDU   "%[pDst],  %[pDst], %[iDstStride] \n\t"
2477
    FILTER_HV_W8($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16,
2478
                 $f18, $f20, $f22, $f24, $f26, %[pDst], $8, $9)
2479
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2480
    "beqz       %[iHeight], 3f                  \n\t"
2481
2482
    // The register window is back at its starting position: load the next row
    // into $f20/$f22 and repeat from label 2.
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2483
    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2484
    MMI_LOAD_8P($f20, $f22, $f24, $8)
2485
    "j          2b                              \n\t"
2486
2487
    // 3: strip finished. Stop when the strip counter hits zero; otherwise
    // reload the saved entry values, back up two source rows again, step both
    // pointers 8 bytes to the right and restart at label 1.
    "3:                                         \n\t"
2488
    PTR_ADDIU  "%[iWidth], %[iWidth], -0x1      \n\t"
2489
    "beqz       %[iWidth], 4f                   \n\t"
2490
2491
    "move       %[pSrc], $12                    \n\t"
2492
    "move       %[pDst], $13                    \n\t"
2493
    "move       %[iHeight], $14                 \n\t"
2494
    PTR_SUBU   "%[pSrc], %[pSrc], $10           \n\t"
2495
    PTR_ADDIU  "%[pSrc], %[pSrc], 0x8           \n\t"
2496
    PTR_ADDIU  "%[pDst], %[pDst], 0x8           \n\t"
2497
    "j          1b                              \n\t"
2498
    "4:                                         \n\t"
2499
    // pSrc, pDst, iWidth and iHeight are read-write operands (all modified
    // above); the strides are read-only inputs.
    : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst),
2500
      [iWidth]"+&r"(iWidth), [iHeight]"+&r"(iHeight)
2501
    : [iSrcStride]"r"(iSrcStride),  [iDstStride]"r"(iDstStride)
2502
    : "memory", "$8", "$9", "$10", "$12", "$13", "$14", "$f0", "$f2", "$f4",
2503
      "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
2504
      "$f24", "$f26", "$f28", "$f30"
2505
  );
2506
  RECOVER_REG;
2507
}
2508
2509
//vertical filter to gain half sample, that is (0, 2) location in quarter sample
2510
static inline void McHorVer02Height5Or9Or17_mmi(const uint8_t* pSrc, int32_t iSrcStride,
2511
                                                uint8_t* pDst, int32_t iDstStride,
2512
                                                int32_t iWidth, int32_t iHeight) {
2513
  if (iWidth == 16 || iWidth == 8)
2514
    McHorVer02Height9Or17_mmi(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight );
2515
  else
2516
    McHorVer02Height5_mmi (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
2517
}
2518
2519
// Horizontal first pass (Loongson MMI inline asm): for every row, computes
// 16-bit sums with the weights 1, -5, 20, 20, -5, 1 over six neighbouring
// source bytes and stores them, unrounded and unclipped, to pTap.
//
//   pSrc, iSrcStride  source pointer and byte distance between source rows;
//                     processing starts two rows above pSrc.
//   pTap, iTapStride  output buffer and byte distance between its rows.
//   iWidth            9 selects the path at label 1 (9 halfwords = 18 bytes
//                     written per row); any other value takes label 2
//                     (17 halfwords = 34 bytes per row).
//   iHeight           number of rows; tested only after a row has been
//                     processed, so it must be at least 1.
//
// With wN denoting the 8-byte window loaded at byte offset N and widened to
// halfwords, each stage evaluates
//     (wA + wF) + 5 * (4 * (wC + wD) - (wB + wE))
// for consecutive offsets A..F.
// NOTE(review): BACKUP_REG / RECOVER_REG are defined outside this function.
static inline void McHorVer22HorFirst_mmi(const uint8_t *pSrc, int32_t iSrcStride,
2520
                                          uint8_t * pTap, int32_t iTapStride,
2521
                                          int32_t iWidth, int32_t iHeight) {
2522
  BACKUP_REG;
2523
  __asm__ volatile (
2524
    ".set       arch=loongson3a                 \n\t"
2525
    // $8 = 9 for the width test; step pSrc back two rows; pick the path.
    "dli        $8, 0x9                         \n\t"
2526
    PTR_SUBU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
2527
    PTR_SUBU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
2528
    "bne        %[iWidth], $8, 2f               \n\t"
2529
2530
    // 1: iWidth == 9 path. Load the windows at byte offsets 0, 5, 1, 4, 2, 3
    // and widen each against the zeroed $f28 (low four bytes stay in the
    // loaded register, high four bytes go to the following one).
    "1:                                         \n\t"
2531
    "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
2532
    "xor        $f28, $f28, $f28                \n\t"
2533
    "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
2534
    "punpckhbh  $f2, $f0, $f28                  \n\t"
2535
    "gsldlc1    $f4, 0xc(%[pSrc])               \n\t"
2536
    "punpcklbh  $f0, $f0, $f28                  \n\t"
2537
    "gsldrc1    $f4, 0x5(%[pSrc])               \n\t"
2538
    "punpckhbh  $f6, $f4, $f28                  \n\t"
2539
    "gsldlc1    $f8, 0x8(%[pSrc])               \n\t"
2540
    "punpcklbh  $f4, $f4, $f28                  \n\t"
2541
    "gsldrc1    $f8, 0x1(%[pSrc])               \n\t"
2542
    "punpckhbh  $f10, $f8, $f28                 \n\t"
2543
    "gsldlc1    $f12, 0xb(%[pSrc])              \n\t"
2544
    "punpcklbh  $f8, $f8, $f28                  \n\t"
2545
    "gsldrc1    $f12, 0x4(%[pSrc])              \n\t"
2546
    "punpckhbh  $f14, $f12, $f28                \n\t"
2547
    "gsldlc1    $f16, 0x9(%[pSrc])              \n\t"
2548
    "punpcklbh  $f12, $f12, $f28                \n\t"
2549
    "gsldrc1    $f16, 0x2(%[pSrc])              \n\t"
2550
    "punpckhbh  $f18, $f16, $f28                \n\t"
2551
    "gsldlc1    $f20, 0xa(%[pSrc])              \n\t"
2552
    "punpcklbh  $f16, $f16, $f28                \n\t"
2553
    "gsldrc1    $f20, 0x3(%[pSrc])              \n\t"
2554
    "punpckhbh  $f22, $f20, $f28                \n\t"
2555
    "punpcklbh  $f20, $f20, $f28                \n\t"
2556
2557
    // $f28/$f30 = w1 + w4 and $f24/$f26 = w2 + w3, computed on copies so the
    // widened windows stay available for the second stage.
    "mov.d      $f28, $f8                       \n\t"
2558
    "mov.d      $f30, $f10                      \n\t"
2559
    "paddh      $f28, $f28, $f12                \n\t"
2560
    "paddh      $f30, $f30, $f14                \n\t"
2561
    "mov.d      $f24, $f16                      \n\t"
2562
    "mov.d      $f26, $f18                      \n\t"
2563
    "paddh      $f24, $f24, $f20                \n\t"
2564
    "paddh      $f26, $f26, $f22                \n\t"
2565
    // Park $f12 in $9 while $f12 carries the shift count 2.
    "dli        $8, 0x2                         \n\t"
2566
    "dmfc1      $9, $f12                        \n\t"
2567
    "dmtc1      $8, $f12                        \n\t"
2568
    // $f24/$f26 = 4*(w2+w3) - (w1+w4); result = (w0+w5) + that + 4*that.
    "psllh      $f24, $f24, $f12                \n\t"
2569
    "psllh      $f26, $f26, $f12                \n\t"
2570
    "psubh      $f24, $f24, $f28                \n\t"
2571
    "psubh      $f26, $f26, $f30                \n\t"
2572
    "paddh      $f0, $f0, $f4                   \n\t"
2573
    "paddh      $f2, $f2, $f6                   \n\t"
2574
    "paddh      $f0, $f0, $f24                  \n\t"
2575
    "paddh      $f2, $f2, $f26                  \n\t"
2576
    "psllh      $f24, $f24, $f12                \n\t"
2577
    "psllh      $f26, $f26, $f12                \n\t"
2578
    "paddh      $f0, $f0, $f24                  \n\t"
2579
    "paddh      $f2, $f2, $f26                  \n\t"
2580
    // Only the low word of $f0 (two halfwords) is stored, at pTap + 0.
    "gsswlc1    $f0, 0x3(%[pTap])               \n\t"
2581
    "gsswrc1    $f0, 0x0(%[pTap])               \n\t"
2582
2583
    // Second stage: load the window at offset 6, restore $f12 from $9 and
    // evaluate the same filter one byte to the right (offsets 1..6).
    "gsldlc1    $f0, 0xd(%[pSrc])               \n\t"
2584
    "xor        $f28, $f28, $f28                \n\t"
2585
    "gsldrc1    $f0, 0x6(%[pSrc])               \n\t"
2586
    "punpckhbh  $f2, $f0, $f28                  \n\t"
2587
    "punpcklbh  $f0, $f0, $f28                  \n\t"
2588
    "dli        $8, 0x2                         \n\t"
2589
    "dmtc1      $9, $f12                        \n\t"
2590
    "dmtc1      $8, $f24                        \n\t"
2591
2592
    "paddh      $f16, $f16, $f4                 \n\t"
2593
    "paddh      $f18, $f18, $f6                 \n\t"
2594
    "paddh      $f20, $f20, $f12                \n\t"
2595
    "paddh      $f22, $f22, $f14                \n\t"
2596
    "psllh      $f20, $f20, $f24                \n\t"
2597
    "psllh      $f22, $f22, $f24                \n\t"
2598
    "psubh      $f20, $f20, $f16                \n\t"
2599
    "psubh      $f22, $f22, $f18                \n\t"
2600
    "paddh      $f8, $f8, $f0                   \n\t"
2601
    "paddh      $f10, $f10, $f2                 \n\t"
2602
    "paddh      $f8, $f8, $f20                  \n\t"
2603
    "paddh      $f10, $f10, $f22                \n\t"
2604
    "psllh      $f20, $f20, $f24                \n\t"
2605
    "psllh      $f22, $f22, $f24                \n\t"
2606
    "paddh      $f8, $f8, $f20                  \n\t"
2607
    "paddh      $f10, $f10, $f22                \n\t"
2608
    // Eight halfwords go to pTap + 2 .. pTap + 0x11, overwriting the second
    // halfword of the word stored above.
    "gssdlc1    $f8, 0x9(%[pTap])               \n\t"
2609
    "gssdlc1    $f10, 0x11(%[pTap])             \n\t"
2610
    "gssdrc1    $f8, 0x2(%[pTap])               \n\t"
2611
    "gssdrc1    $f10, 0xa(%[pTap])              \n\t"
2612
2613
    // Next row; loop until iHeight reaches zero, then skip the wide path.
    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
2614
    PTR_ADDU   "%[pTap], %[pTap], %[iTapStride] \n\t"
2615
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2616
    "bnez       %[iHeight], 1b                  \n\t"
2617
    "j          3f                              \n\t"
2618
2619
    // 2: path for iWidth != 9. First stage: windows at offsets 0, 5, 1, 4, 2, 3
    // filtered in place (no copies needed; they are not reused afterwards).
    "2:                                         \n\t"
2620
    "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
2621
    "xor        $f28, $f28, $f28                \n\t"
2622
    "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
2623
    "punpckhbh  $f2, $f0, $f28                  \n\t"
2624
    "gsldlc1    $f4, 0xc(%[pSrc])               \n\t"
2625
    "punpcklbh  $f0, $f0, $f28                  \n\t"
2626
    "gsldrc1    $f4, 0x5(%[pSrc])               \n\t"
2627
    "punpckhbh  $f6, $f4, $f28                  \n\t"
2628
    "gsldlc1    $f8, 0x8(%[pSrc])               \n\t"
2629
    "punpcklbh  $f4, $f4, $f28                  \n\t"
2630
    "gsldrc1    $f8, 0x1(%[pSrc])               \n\t"
2631
    "punpckhbh  $f10, $f8, $f28                 \n\t"
2632
    "gsldlc1    $f12, 0xb(%[pSrc])              \n\t"
2633
    "punpcklbh  $f8, $f8, $f28                  \n\t"
2634
    "gsldrc1    $f12, 0x4(%[pSrc])              \n\t"
2635
    "punpckhbh  $f14, $f12, $f28                \n\t"
2636
    "gsldlc1    $f16, 0x9(%[pSrc])              \n\t"
2637
    "punpcklbh  $f12, $f12, $f28                \n\t"
2638
    "gsldrc1    $f16, 0x2(%[pSrc])              \n\t"
2639
    "punpckhbh  $f18, $f16, $f28                \n\t"
2640
    "gsldlc1    $f20, 0xa(%[pSrc])              \n\t"
2641
    "punpcklbh  $f16, $f16, $f28                \n\t"
2642
    "gsldrc1    $f20, 0x3(%[pSrc])              \n\t"
2643
    "punpckhbh  $f22, $f20, $f28                \n\t"
2644
    "dli        $8, 0x2                         \n\t"
2645
    "punpcklbh  $f20, $f20, $f28                \n\t"
2646
2647
    // $f30 holds the shift count 2; $f28 stays zero for the next unpacks.
    "dmtc1      $8, $f30                        \n\t"
2648
    "paddh      $f8, $f8, $f12                  \n\t"
2649
    "paddh      $f10, $f10, $f14                \n\t"
2650
    "paddh      $f16, $f16, $f20                \n\t"
2651
    "paddh      $f18, $f18, $f22                \n\t"
2652
    "psllh      $f16, $f16, $f30                \n\t"
2653
    "psllh      $f18, $f18, $f30                \n\t"
2654
    "psubh      $f16, $f16, $f8                 \n\t"
2655
    "psubh      $f18, $f18, $f10                \n\t"
2656
    "paddh      $f0, $f0, $f4                   \n\t"
2657
    "paddh      $f2, $f2, $f6                   \n\t"
2658
    "paddh      $f0, $f0, $f16                  \n\t"
2659
    "paddh      $f2, $f2, $f18                  \n\t"
2660
    "psllh      $f16, $f16, $f30                \n\t"
2661
    "psllh      $f18, $f18, $f30                \n\t"
2662
    "paddh      $f0, $f0, $f16                  \n\t"
2663
    "paddh      $f2, $f2, $f18                  \n\t"
2664
    // Eight halfwords stored as one 16-byte quad-word at pTap + 0.
    // NOTE(review): gssqc1 presumably needs pTap 16-byte aligned -- confirm.
    "gssqc1     $f2, $f0, 0x0(%[pTap])          \n\t"
2665
2666
    // Second stage: windows at offsets 8, 13, 9, 12, 10, 11 (the first stage
    // shifted right by eight bytes).
    "gsldlc1    $f0, 15(%[pSrc])                \n\t"
2667
    "gsldrc1    $f0, 8(%[pSrc])                 \n\t"
2668
    "punpckhbh  $f2, $f0, $f28                  \n\t"
2669
    "gsldlc1    $f4, 0x14(%[pSrc])              \n\t"
2670
    "punpcklbh  $f0, $f0, $f28                  \n\t"
2671
    "gsldrc1    $f4, 0xd(%[pSrc])               \n\t"
2672
    "punpckhbh  $f6, $f4, $f28                  \n\t"
2673
    "gsldlc1    $f8, 0x10(%[pSrc])              \n\t"
2674
    "punpcklbh  $f4, $f4, $f28                  \n\t"
2675
    "gsldrc1    $f8, 0x9(%[pSrc])               \n\t"
2676
    "punpckhbh  $f10, $f8, $f28                 \n\t"
2677
    "gsldlc1    $f12, 0x13(%[pSrc])             \n\t"
2678
    "punpcklbh  $f8, $f8, $f28                  \n\t"
2679
    "gsldrc1    $f12, 0xc(%[pSrc])              \n\t"
2680
    "punpckhbh  $f14, $f12, $f28                \n\t"
2681
    "gsldlc1    $f16, 0x11(%[pSrc])             \n\t"
2682
    "punpcklbh  $f12, $f12, $f28                \n\t"
2683
    "gsldrc1    $f16, 0xa(%[pSrc])              \n\t"
2684
    "punpckhbh  $f18, $f16, $f28                \n\t"
2685
    "gsldlc1    $f20, 0x12(%[pSrc])             \n\t"
2686
    "punpcklbh  $f16, $f16, $f28                \n\t"
2687
    "gsldrc1    $f20, 0xb(%[pSrc])              \n\t"
2688
    "punpckhbh  $f22, $f20, $f28                \n\t"
2689
    "punpcklbh  $f20, $f20, $f28                \n\t"
2690
2691
    // Same copy-based arithmetic as the first stage of the iWidth == 9 path,
    // with $f12 parked in $9 while it carries the shift count.
    "mov.d      $f28, $f8                       \n\t"
2692
    "mov.d      $f30, $f10                      \n\t"
2693
    "paddh      $f28, $f28, $f12                \n\t"
2694
    "paddh      $f30, $f30, $f14                \n\t"
2695
    "mov.d      $f24, $f16                      \n\t"
2696
    "mov.d      $f26, $f18                      \n\t"
2697
    "dli        $8, 0x2                         \n\t"
2698
    "paddh      $f24, $f24, $f20                \n\t"
2699
    "paddh      $f26, $f26, $f22                \n\t"
2700
    "dmfc1      $9, $f12                        \n\t"
2701
    "dmtc1      $8, $f12                        \n\t"
2702
    "psllh      $f24, $f24, $f12                \n\t"
2703
    "psllh      $f26, $f26, $f12                \n\t"
2704
    "psubh      $f24, $f24, $f28                \n\t"
2705
    "psubh      $f26, $f26, $f30                \n\t"
2706
    "paddh      $f0, $f0, $f4                   \n\t"
2707
    "paddh      $f2, $f2, $f6                   \n\t"
2708
    "paddh      $f0, $f0, $f24                  \n\t"
2709
    "paddh      $f2, $f2, $f26                  \n\t"
2710
    "psllh      $f24, $f24, $f12                \n\t"
2711
    "psllh      $f26, $f26, $f12                \n\t"
2712
    "paddh      $f0, $f0, $f24                  \n\t"
2713
    "paddh      $f2, $f2, $f26                  \n\t"
2714
    // Only the low word of $f0 (two halfwords) is stored, at pTap + 0x10.
    "gsswlc1    $f0, 0x13(%[pTap])              \n\t"
2715
    "gsswrc1    $f0, 0x10(%[pTap])              \n\t"
2716
2717
    // Third stage: load the window at offset 14, restore $f12 from $9 and
    // evaluate the filter over offsets 9..14.
    "gsldlc1    $f0, 0x15(%[pSrc])              \n\t"
2718
    "xor        $f28, $f28, $f28                \n\t"
2719
    "gsldrc1    $f0, 0xE(%[pSrc])               \n\t"
2720
    "punpckhbh  $f2, $f0, $f28                  \n\t"
2721
    "punpcklbh  $f0, $f0, $f28                  \n\t"
2722
    "dli        $8, 0x2                         \n\t"
2723
    "dmtc1      $9, $f12                        \n\t"
2724
    "dmtc1      $8, $f24                        \n\t"
2725
2726
    "paddh      $f16, $f16, $f4                 \n\t"
2727
    "paddh      $f18, $f18, $f6                 \n\t"
2728
    "paddh      $f20, $f20, $f12                \n\t"
2729
    "paddh      $f22, $f22, $f14                \n\t"
2730
    "psllh      $f20, $f20, $f24                \n\t"
2731
    "psllh      $f22, $f22, $f24                \n\t"
2732
    "psubh      $f20, $f20, $f16                \n\t"
2733
    "psubh      $f22, $f22, $f18                \n\t"
2734
    "paddh      $f8, $f8, $f0                   \n\t"
2735
    "paddh      $f10, $f10, $f2                 \n\t"
2736
    "paddh      $f8, $f8, $f20                  \n\t"
2737
    "paddh      $f10, $f10, $f22                \n\t"
2738
    "psllh      $f20, $f20, $f24                \n\t"
2739
    "psllh      $f22, $f22, $f24                \n\t"
2740
    "paddh      $f8, $f8, $f20                  \n\t"
2741
    "paddh      $f10, $f10, $f22                \n\t"
2742
    // Eight halfwords go to pTap + 0x12 .. pTap + 0x21, overwriting the second
    // halfword of the word stored at pTap + 0x10.
    "gssdlc1    $f8, 0x19(%[pTap])              \n\t"
2743
    "gssdlc1    $f10, 0x21(%[pTap])             \n\t"
2744
    "gssdrc1    $f8, 0x12(%[pTap])              \n\t"
2745
    "gssdrc1    $f10, 0x1a(%[pTap])             \n\t"
2746
2747
    // Next row; loop until iHeight reaches zero.
    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
2748
    PTR_ADDU   "%[pTap], %[pTap], %[iTapStride] \n\t"
2749
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2750
    "bnez       %[iHeight], 2b                  \n\t"
2751
    "3:                                         \n\t"
2752
    // pSrc, pTap and iHeight are modified above; iWidth is only read but is
    // declared read-write as well. The strides are read-only inputs.
    : [pSrc]"+&r"(pSrc), [pTap]"+&r"(pTap), [iWidth]"+&r"(iWidth),
2753
      [iHeight]"+&r"(iHeight)
2754
    : [iSrcStride]"r"(iSrcStride),  [iTapStride]"r"(iTapStride)
2755
    : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
2756
      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
2757
  );
2758
  RECOVER_REG;
2759
}
2760
2761
// Vertical stage of the (2, 2) half-sample path, variant that loads tap rows
// with gslqc1 (16 bytes at a time).
//   pTap / iTapStride : intermediate tap rows and the byte distance between
//                       them; 0x10 bytes of taps are consumed per 8 output
//                       pixels.
//   pDst / iDstStride : destination pixels and the byte distance between rows.
//   iWidth            : shifted right by 3 and used as the number of
//                       8-pixel-wide columns to process.
//   iHeight           : number of output rows per column. The first row is
//                       produced without re-testing the counter, so iHeight
//                       must be at least 2.
// The arithmetic itself lives in FILTER_VER_ALIGN (defined elsewhere).
// NOTE(review): gslqc1 presumably requires 16-byte aligned addresses, hence the
// "Align" in the name -- confirm against the Loongson manual and the caller.
static inline void McHorVer22Width8VerLastAlign_mmi(const uint8_t *pTap,
2762
                   int32_t iTapStride, uint8_t * pDst, int32_t iDstStride,
2763
                   int32_t iWidth, int32_t iHeight) {
2764
  BACKUP_REG;
2765
  __asm__ volatile (
2766
    ".set       arch=loongson3a                 \n\t"
    // Keep the initial pTap / pDst / iHeight in $10 / $11 / $12 so that every
    // 8-pixel column can restart from the top (see label 6).
2767
    "move       $10, %[pTap]                    \n\t"
2768
    "move       $11, %[pDst]                    \n\t"
2769
    "move       $12, %[iHeight]                 \n\t"
    // iWidth becomes the count of 8-pixel columns.
2770
    "dsrl       %[iWidth], 0x3                  \n\t"
    // $13 = 2 * iTapStride, $14 = 2 * iDstStride.
2771
    PTR_ADDU   "$13, %[iTapStride], %[iTapStride] \n\t"
2772
    PTR_ADDU   "$14, %[iDstStride], %[iDstStride] \n\t"
    // 0x0020 in each 16-bit lane; passed as the last FILTER_VER_ALIGN argument
    // (presumably the rounding term -- confirm against the macro).
2773
    "dli        $15, 0x0020002000200020         \n\t"
2774
2775
    // Column start: load six consecutive tap rows into $f0..$f22
    // (one register pair per row).
    "4:                                         \n\t"
2776
    "gslqc1     $f2, $f0, 0x0(%[pTap])          \n\t"
2777
    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
2778
    "gslqc1     $f6, $f4, 0x0($8)               \n\t"
2779
    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
2780
    "gslqc1     $f10, $f8, 0x0(%[pTap])         \n\t"
2781
    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
2782
    "gslqc1     $f14, $f12, 0x0($8)             \n\t"
2783
    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
2784
    "gslqc1     $f18, $f16, 0x0(%[pTap])        \n\t"
2785
    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
2786
    "gslqc1     $f22, $f20, 0x0($8)             \n\t"
2787
2788
    // First output row of the column ($f24..$f30 are handed over as the
    // remaining register arguments).
    FILTER_VER_ALIGN($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
2789
                     $f22, $f24, $f26, $f28, $f30, %[pDst], $0, $8, $9, $15)
2790
2791
    // iHeight is decremented here but not tested. Then the seventh tap row is
    // loaded and the six-row window is slid down by one row with explicit moves.
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2792
    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
2793
    "gslqc1     $f26, $f24, 0x0(%[pTap])        \n\t"
2794
    "mov.d      $f0, $f4                        \n\t"
2795
    "mov.d      $f2, $f6                        \n\t"
2796
    "mov.d      $f4, $f8                        \n\t"
2797
    "mov.d      $f6, $f10                       \n\t"
2798
    "mov.d      $f8, $f12                       \n\t"
2799
    "mov.d      $f10, $f14                      \n\t"
2800
    "mov.d      $f12, $f16                      \n\t"
2801
    "mov.d      $f14, $f18                      \n\t"
2802
    "mov.d      $f16, $f20                      \n\t"
2803
    "mov.d      $f18, $f22                      \n\t"
2804
    "mov.d      $f20, $f24                      \n\t"
2805
    "mov.d      $f22, $f26                      \n\t"
2806
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2807
    PTR_SUBU   "%[pTap], %[pTap], %[iTapStride] \n\t"
2808
2809
    // Main loop, unrolled eight times. Each step filters one output row,
    // decrements iHeight (leaving for label 6 when it reaches zero) and loads
    // one new tap row. The register arguments rotate from step to step, so no
    // register moves are needed. Steps alternate between offset $0 and offset
    // iDstStride, and pDst advances by $14 after every second step.
    "5:                                         \n\t"
2810
    FILTER_VER_ALIGN($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
2811
                     $f22, $f24, $f26, $f28, $f30, %[pDst], $0, $8, $9, $15)
2812
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2813
    "beqz       %[iHeight], 6f                  \n\t"
2814
    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
2815
    "gslqc1     $f26, $f24, 0x0(%[pTap])        \n\t"
2816
2817
    FILTER_VER_ALIGN($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24,
2818
                     $f26, $f28, $f30, $f0, $f2, %[pDst], %[iDstStride], $8, $9, $15)
2819
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2820
    "beqz       %[iHeight], 6f                  \n\t"
2821
    PTR_ADDU   "%[pDst], %[pDst], $14           \n\t"
2822
    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
2823
    "gslqc1     $f30, $f28, 0x0($8)             \n\t"
2824
2825
    FILTER_VER_ALIGN($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28,
2826
                     $f30, $f0, $f2, $f4, $f6, %[pDst], $0, $8, $9, $15)
2827
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2828
    "beqz       %[iHeight], 6f                  \n\t"
2829
    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
2830
    "gslqc1     $f2, $f0, 0x0(%[pTap])          \n\t"
2831
2832
    FILTER_VER_ALIGN($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0,
2833
                     $f2, $f4, $f6, $f8, $f10, %[pDst], %[iDstStride], $8, $9, $15)
2834
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2835
    "beqz       %[iHeight], 6f                  \n\t"
2836
    PTR_ADDU   "%[pDst], %[pDst], $14           \n\t"
2837
    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
2838
    "gslqc1     $f6, $f4, 0x0($8)               \n\t"
2839
2840
    FILTER_VER_ALIGN($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4,
2841
                     $f6, $f8, $f10, $f12, $f14, %[pDst], $0, $8, $9, $15)
2842
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2843
    "beqz       %[iHeight], 6f                  \n\t"
2844
    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
2845
    "gslqc1     $f10, $f8, 0x0(%[pTap])         \n\t"
2846
2847
    FILTER_VER_ALIGN($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8,
2848
                     $f10, $f12, $f14, $f16, $f18, %[pDst], %[iDstStride], $8, $9, $15)
2849
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2850
    "beqz       %[iHeight], 6f                  \n\t"
2851
    PTR_ADDU   "%[pDst], %[pDst], $14           \n\t"
2852
    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
2853
    "gslqc1     $f14, $f12, 0x0($8)             \n\t"
2854
2855
    FILTER_VER_ALIGN($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12,
2856
                     $f14, $f16, $f18, $f20, $f22, %[pDst], $0, $8, $9, $15)
2857
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2858
    "beqz       %[iHeight], 6f                  \n\t"
2859
    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
2860
    "gslqc1     $f18, $f16, 0x0(%[pTap])        \n\t"
2861
2862
    FILTER_VER_ALIGN($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16,
2863
                     $f18, $f20, $f22, $f24, $f26, %[pDst], %[iDstStride], $8, $9, $15)
2864
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2865
    "beqz       %[iHeight], 6f                  \n\t"
2866
    PTR_ADDU   "%[pDst], %[pDst], $14           \n\t"
2867
    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
2868
    "gslqc1     $f22, $f20, 0x0($8)             \n\t"
2869
    "j          5b                              \n\t"
2870
2871
    // Column done: stop when no columns remain, otherwise restore the saved
    // start state and step 0x10 bytes in pTap and 8 bytes in pDst.
    "6:                                         \n\t"
2872
    PTR_ADDIU  "%[iWidth], %[iWidth], -0x1      \n\t"
2873
    "beqz       %[iWidth], 7f                   \n\t"
2874
    "move       %[pTap], $10                    \n\t"
2875
    "move       %[pDst], $11                    \n\t"
2876
    "move       %[iHeight], $12                 \n\t"
2877
    PTR_ADDIU  "%[pTap], %[pTap], 0x10          \n\t"
2878
    PTR_ADDIU  "%[pDst], %[pDst], 0x8           \n\t"
2879
    "j          4b                              \n\t"
2880
    "7:                                         \n\t"
2881
    : [pTap]"+&r"((unsigned char *)pTap), [pDst]"+&r"((unsigned char *)pDst),
2882
      [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight)
2883
    : [iTapStride]"r"((int)iTapStride), [iDstStride]"r"((int)iDstStride)
2884
    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0",
2885
      "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18",
2886
      "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
2887
  );
2888
  RECOVER_REG;
2889
}
2890
2891
// Vertical stage of the (2, 2) half-sample path, variant that loads every tap
// row with gsldlc1/gsldrc1 pairs (two 8-byte halves per row) instead of
// gslqc1.
//   pTap / iTapStride : intermediate tap rows and the byte distance between
//                       them; 0x10 bytes of taps are consumed per 8 output
//                       pixels.
//   pDst / iDstStride : destination pixels and the byte distance between rows.
//   iWidth            : shifted right by 3 and used as the number of
//                       8-pixel-wide columns to process.
//   iHeight           : number of output rows per column. The first row is
//                       produced without re-testing the counter, so iHeight
//                       must be at least 2.
// The arithmetic itself lives in FILTER_VER_UNALIGN (defined elsewhere).
static inline void McHorVer22Width8VerLastUnAlign_mmi(const uint8_t *pTap,
2892
                   int32_t iTapStride, uint8_t * pDst, int32_t iDstStride,
2893
                   int32_t iWidth, int32_t iHeight) {
2894
  BACKUP_REG;
2895
  __asm__ volatile (
2896
    ".set       arch=loongson3a                 \n\t"
    // Keep the initial pTap / pDst / iHeight in $10 / $11 / $12 so that every
    // 8-pixel column can restart from the top (see label 6).
2897
    "move       $10, %[pTap]                    \n\t"
2898
    "move       $11, %[pDst]                    \n\t"
2899
    "move       $12, %[iHeight]                 \n\t"
    // iWidth becomes the count of 8-pixel columns.
2900
    "dsrl       %[iWidth], 0x3                  \n\t"
    // $13 = 2 * iTapStride.
2901
    PTR_ADDU   "$13, %[iTapStride], %[iTapStride] \n\t"
    // 0x0020 in each 16-bit lane; passed as the last FILTER_VER_UNALIGN
    // argument (presumably the rounding term -- confirm against the macro).
2902
    "dli        $14, 0x0020002000200020         \n\t"
2903
2904
    // Column start: load six consecutive tap rows into $f0..$f22, two rows
    // per group (pTap and pTap + iTapStride).
    "4:                                         \n\t"
2905
    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
2906
    "gsldlc1    $f0, 0x7(%[pTap])               \n\t"
2907
    "gsldlc1    $f2, 0xF(%[pTap])               \n\t"
2908
    "gsldlc1    $f4, 0x7($8)                    \n\t"
2909
    "gsldlc1    $f6, 0xF($8)                    \n\t"
2910
    "gsldrc1    $f0, 0x0(%[pTap])               \n\t"
2911
    "gsldrc1    $f2, 0x8(%[pTap])               \n\t"
2912
    "gsldrc1    $f4, 0x0($8)                    \n\t"
2913
    "gsldrc1    $f6, 0x8($8)                    \n\t"
2914
    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
2915
    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
2916
    "gsldlc1    $f8, 0x7(%[pTap])               \n\t"
2917
    "gsldlc1    $f10, 0xF(%[pTap])              \n\t"
2918
    "gsldlc1    $f12, 0x7($8)                   \n\t"
2919
    "gsldlc1    $f14, 0xF($8)                   \n\t"
2920
    "gsldrc1    $f8, 0x0(%[pTap])               \n\t"
2921
    "gsldrc1    $f10, 0x8(%[pTap])              \n\t"
2922
    "gsldrc1    $f12, 0x0($8)                   \n\t"
2923
    "gsldrc1    $f14, 0x8($8)                   \n\t"
2924
    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
2925
    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
2926
    "gsldlc1    $f16, 0x7(%[pTap])              \n\t"
2927
    "gsldlc1    $f18, 0xF(%[pTap])              \n\t"
2928
    "gsldlc1    $f20, 0x7($8)                   \n\t"
2929
    "gsldlc1    $f22, 0xF($8)                   \n\t"
2930
    "gsldrc1    $f16, 0x0(%[pTap])              \n\t"
2931
    "gsldrc1    $f18, 0x8(%[pTap])              \n\t"
2932
    "gsldrc1    $f20, 0x0($8)                   \n\t"
2933
    "gsldrc1    $f22, 0x8($8)                   \n\t"
2934
2935
    // First output row of the column ($f24..$f30 are handed over as the
    // remaining register arguments).
    FILTER_VER_UNALIGN($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18,
2936
                       $f20, $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9, $14)
    // iHeight is decremented here but not tested. Then the seventh tap row is
    // loaded and the six-row window is slid down by one row with explicit moves.
2937
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2938
    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
2939
    "gsldlc1    $f24, 0x7(%[pTap])              \n\t"
2940
    "gsldlc1    $f26, 0xF(%[pTap])              \n\t"
2941
    "gsldrc1    $f24, 0x0(%[pTap])              \n\t"
2942
    "gsldrc1    $f26, 0x8(%[pTap])              \n\t"
2943
    "mov.d      $f0, $f4                        \n\t"
2944
    "mov.d      $f2, $f6                        \n\t"
2945
    "mov.d      $f4, $f8                        \n\t"
2946
    "mov.d      $f6, $f10                       \n\t"
2947
    "mov.d      $f8, $f12                       \n\t"
2948
    "mov.d      $f10, $f14                      \n\t"
2949
    "mov.d      $f12, $f16                      \n\t"
2950
    "mov.d      $f14, $f18                      \n\t"
2951
    "mov.d      $f16, $f20                      \n\t"
2952
    "mov.d      $f18, $f22                      \n\t"
2953
    "mov.d      $f20, $f24                      \n\t"
2954
    "mov.d      $f22, $f26                      \n\t"
2955
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2956
    PTR_SUBU   "%[pTap], %[pTap], %[iTapStride] \n\t"
2957
2958
    // Main loop, unrolled eight times. Each step filters one output row,
    // decrements iHeight (leaving for label 6 when it reaches zero), loads one
    // new tap row and advances pDst by iDstStride. The register arguments
    // rotate from step to step, so no register moves are needed.
    "5:                                         \n\t"
2959
    FILTER_VER_UNALIGN($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18,
2960
                       $f20, $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9, $14)
2961
2962
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2963
    "beqz       %[iHeight], 6f                  \n\t"
2964
    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
2965
    "gsldlc1    $f24, 0x7(%[pTap])              \n\t"
2966
    "gsldlc1    $f26, 0xF(%[pTap])              \n\t"
2967
    "gsldrc1    $f24, 0x0(%[pTap])              \n\t"
2968
    "gsldrc1    $f26, 0x8(%[pTap])              \n\t"
2969
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2970
2971
    FILTER_VER_UNALIGN($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22,
2972
                       $f24, $f26, $f28, $f30, $f0, $f2, %[pDst], $8, $9, $14)
2973
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2974
    "beqz       %[iHeight], 6f                  \n\t"
2975
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2976
    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
2977
    "gsldlc1    $f28, 0x7($8)                   \n\t"
2978
    "gsldlc1    $f30, 0xF($8)                   \n\t"
2979
    "gsldrc1    $f28, 0x0($8)                   \n\t"
2980
    "gsldrc1    $f30, 0x8($8)                   \n\t"
2981
2982
    FILTER_VER_UNALIGN($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26,
2983
                       $f28, $f30, $f0, $f2, $f4, $f6, %[pDst], $8, $9, $14)
2984
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2985
    "beqz       %[iHeight], 6f                  \n\t"
2986
    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
2987
    "gsldlc1    $f0, 0x7(%[pTap])               \n\t"
2988
    "gsldlc1    $f2, 0xF(%[pTap])               \n\t"
2989
    "gsldrc1    $f0, 0x0(%[pTap])               \n\t"
2990
    "gsldrc1    $f2, 0x8(%[pTap])               \n\t"
2991
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2992
2993
    FILTER_VER_UNALIGN($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28,
2994
                       $f30, $f0, $f2, $f4, $f6, $f8, $f10, %[pDst], $8, $9, $14)
2995
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2996
    "beqz       %[iHeight], 6f                  \n\t"
2997
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2998
    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
2999
    "gsldlc1    $f4, 0x7($8)                    \n\t"
3000
    "gsldlc1    $f6, 0xF($8)                    \n\t"
3001
    "gsldrc1    $f4, 0x0($8)                    \n\t"
3002
    "gsldrc1    $f6, 0x8($8)                    \n\t"
3003
3004
    FILTER_VER_UNALIGN($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2,
3005
                       $f4, $f6, $f8, $f10, $f12, $f14, %[pDst], $8, $9, $14)
3006
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3007
    "beqz       %[iHeight], 6f                  \n\t"
3008
    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
3009
    "gsldlc1    $f8, 0x7(%[pTap])               \n\t"
3010
    "gsldlc1    $f10, 0xF(%[pTap])              \n\t"
3011
    "gsldrc1    $f8, 0x0(%[pTap])               \n\t"
3012
    "gsldrc1    $f10, 0x8(%[pTap])              \n\t"
3013
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3014
3015
    FILTER_VER_UNALIGN($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6,
3016
                       $f8, $f10, $f12, $f14, $f16, $f18, %[pDst], $8, $9, $14)
3017
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3018
    "beqz       %[iHeight], 6f                  \n\t"
3019
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3020
    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
3021
    "gsldlc1    $f12, 0x7($8)                   \n\t"
3022
    "gsldlc1    $f14, 0xF($8)                   \n\t"
3023
    "gsldrc1    $f12, 0x0($8)                   \n\t"
3024
    "gsldrc1    $f14, 0x8($8)                   \n\t"
3025
3026
    FILTER_VER_UNALIGN($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10,
3027
                       $f12, $f14, $f16, $f18, $f20, $f22, %[pDst], $8, $9, $14)
3028
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3029
    "beqz       %[iHeight], 6f                  \n\t"
3030
    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
3031
    "gsldlc1    $f16, 0x7(%[pTap])              \n\t"
3032
    "gsldlc1    $f18, 0xF(%[pTap])              \n\t"
3033
    "gsldrc1    $f16, 0x0(%[pTap])              \n\t"
3034
    "gsldrc1    $f18, 0x8(%[pTap])              \n\t"
3035
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3036
3037
    FILTER_VER_UNALIGN($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14,
3038
                       $f16, $f18, $f20, $f22, $f24, $f26, %[pDst], $8, $9, $14)
3039
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3040
    "beqz       %[iHeight], 6f                  \n\t"
3041
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3042
    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
3043
    "gsldlc1    $f20, 0x7($8)                   \n\t"
3044
    "gsldlc1    $f22, 0xF($8)                   \n\t"
3045
    "gsldrc1    $f20, 0x0($8)                   \n\t"
3046
    "gsldrc1    $f22, 0x8($8)                   \n\t"
3047
    "j          5b                              \n\t"
3048
3049
    // Column done: stop when no columns remain, otherwise restore the saved
    // start state and step 0x10 bytes in pTap and 8 bytes in pDst.
    "6:                                         \n\t"
3050
    PTR_ADDIU  "%[iWidth], %[iWidth], -0x1      \n\t"
3051
    "beqz       %[iWidth], 7f                   \n\t"
3052
    "move       %[pTap], $10                    \n\t"
3053
    "move       %[pDst], $11                    \n\t"
3054
    "move       %[iHeight], $12                 \n\t"
3055
    PTR_ADDIU  "%[pTap], %[pTap], 0x10          \n\t"
3056
    PTR_ADDIU  "%[pDst], %[pDst], 0x8           \n\t"
3057
    "j          4b                              \n\t"
3058
3059
    "7:                                         \n\t"
3060
    : [pTap]"+&r"((unsigned char *)pTap), [pDst]"+&r"((unsigned char *)pDst),
3061
      [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight)
3062
    : [iTapStride]"r"((int)iTapStride), [iDstStride]"r"((int)iDstStride)
3063
    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2",
3064
      "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
3065
      "$f22", "$f24", "$f26", "$f28", "$f30"
3066
  );
3067
  RECOVER_REG;
3068
}
3069
3070
//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
3071
static inline void McHorVer22Width5Or9Or17Height5Or9Or17_mmi(const uint8_t* pSrc,
3072
                   int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
3073
                   int32_t iWidth, int32_t iHeight) {
3074
  ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 22, 24, 16)
3075
3076
  if (iWidth == 17 || iWidth == 9){
3077
    int32_t tmp1 = 2 * (iWidth - 8);
3078
    McHorVer22HorFirst_mmi(pSrc - 2, iSrcStride, (uint8_t*)pTap, 48, iWidth, iHeight + 5);
3079
3080
    McHorVer22Width8VerLastAlign_mmi((uint8_t*)pTap,  48, pDst, iDstStride, iWidth - 1, iHeight);
3081
3082
    McHorVer22Width8VerLastUnAlign_mmi((uint8_t*)pTap + tmp1,  48, pDst + iWidth - 8,
3083
                                        iDstStride, 8, iHeight);
3084
  } else {
3085
    int16_t iTmp[17 + 5];
3086
    int32_t i, j, k;
3087
3088
    for (i = 0; i < iHeight; i++) {
3089
      for (j = 0; j < iWidth + 5; j++) {
3090
        iTmp[j] = FilterInput8bitWithStride_c (pSrc - 2 + j, iSrcStride);
3091
      }
3092
      for (k = 0; k < iWidth; k++) {
3093
        pDst[k] = WelsClip1 ((HorFilterInput16bit_c (&iTmp[k]) + 512) >> 10);
3094
      }
3095
      pSrc += iSrcStride;
3096
      pDst += iDstStride;
3097
    }
3098
  }
3099
}
3100
3101
// Copies a block 4 bytes wide and iHeight rows tall from pSrc to pDst.
// Each row is moved through $8 with an lwl/lwr load pair and an swl/swr store
// pair, so neither pointer needs to be word aligned. The 0x3/0x0 offset
// pairing is the little-endian form.
// The row counter is tested only after the decrement, so the body always runs
// at least once: iHeight must be >= 1.
void McCopyWidthEq4_mmi(const uint8_t *pSrc, int iSrcStride,
3102
                        uint8_t *pDst, int iDstStride, int iHeight) {
3103
  __asm__ volatile (
3104
    ".set       arch=loongson3a                 \n\t"
3105
    "1:                                         \n\t"
3106
    "lwl        $8, 0x3(%[pSrc])                \n\t"
3107
    "lwr        $8, 0x0(%[pSrc])                \n\t"
3108
    "swl        $8, 0x3(%[pDst])                \n\t"
3109
    "swr        $8, 0x0(%[pDst])                \n\t"
    // Advance both pointers to the next row.
3110
    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3111
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3112
    PTR_ADDIU  "%[iHeight], %[iHeight], -1      \n\t"
3113
    "bnez       %[iHeight], 1b                  \n\t"
3114
    : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
3115
    : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
3116
    : "memory", "$8"
3117
  );
3118
}
3119
3120
// Copies a block 8 bytes wide and iHeight rows tall from pSrc to pDst.
// Each row is moved through $8 with an ldl/ldr load pair and an sdl/sdr store
// pair, so neither pointer needs to be doubleword aligned.
// The row counter is tested only after the decrement, so the body always runs
// at least once: iHeight must be >= 1.
void McCopyWidthEq8_mmi(const uint8_t *pSrc, int iSrcStride,
3121
                        uint8_t *pDst, int iDstStride, int iHeight) {
3122
  __asm__ volatile (
3123
    ".set       arch=loongson3a                 \n\t"
3124
    "1:                                         \n\t"
3125
    "ldl        $8, 0x7(%[pSrc])                \n\t"
3126
    "ldr        $8, 0x0(%[pSrc])                \n\t"
3127
    "sdl        $8, 0x7(%[pDst])                \n\t"
3128
    "sdr        $8, 0x0(%[pDst])                \n\t"
    // Advance both pointers to the next row.
3129
    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3130
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3131
    PTR_ADDIU  "%[iHeight], %[iHeight], -1      \n\t"
3132
    "bnez       %[iHeight], 1b                  \n\t"
3133
    : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
3134
    : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
3135
    : "memory", "$8"
3136
  );
3137
}
3138
3139
// Copies a block 16 bytes wide and iHeight rows tall from pSrc to pDst.
// Each row is moved as two 8-byte halves: bytes 0..7 through $8 and bytes
// 8..15 through $9, using ldl/ldr and sdl/sdr pairs so neither pointer needs
// to be aligned.
// The row counter is tested only after the decrement, so the body always runs
// at least once: iHeight must be >= 1.
void McCopyWidthEq16_mmi(const uint8_t *pSrc, int iSrcStride,
3140
                         uint8_t *pDst, int iDstStride, int iHeight) {
3141
  __asm__ volatile (
3142
    ".set       arch=loongson3a                 \n\t"
3143
    "1:                                         \n\t"
3144
    "ldl        $8, 0x7(%[pSrc])                \n\t"
3145
    "ldl        $9, 0xF(%[pSrc])                \n\t"
3146
    "ldr        $8, 0x0(%[pSrc])                \n\t"
3147
    "ldr        $9, 0x8(%[pSrc])                \n\t"
3148
    "sdl        $8, 0x7(%[pDst])                \n\t"
3149
    "sdl        $9, 0xF(%[pDst])                \n\t"
3150
    "sdr        $8, 0x0(%[pDst])                \n\t"
3151
    "sdr        $9, 0x8(%[pDst])                \n\t"
    // Advance both pointers to the next row.
3152
    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3153
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3154
    PTR_ADDIU  "%[iHeight], %[iHeight], -1      \n\t"
3155
    "bnez       %[iHeight], 1b                  \n\t"
3156
    : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
3157
    : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
3158
    : "memory", "$8", "$9"
3159
  );
3160
}
3161
3162
static inline void McCopy_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
3163
                              int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
3164
  if (iWidth == 16)
3165
    McCopyWidthEq16_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
3166
  else if (iWidth == 8)
3167
    McCopyWidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
3168
  else if (iWidth == 4)
3169
    McCopyWidthEq4_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
3170
  else
3171
    McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
3172
}
3173
3174
// Weighted 2x2 interpolation for a block 4 pixels wide and iHeight rows tall.
// With (A, B, C, D) = pABCD[0..3], every output pixel is
//   (A * s[y][x] + B * s[y][x+1] + C * s[y+1][x] + D * s[y+1][x+1] + 32) >> 6
// packed back to 8 bits with unsigned saturation.
// After the weights have been read, the pABCD register is reused as the
// pointer to the source row below the current one.
// The row counter is tested only after the decrement: iHeight must be >= 1.
// NOTE(review): the loads below fetch 8 bytes from pABCD and from each source
// row, although only the low 4 bytes are used after punpcklbh -- confirm the
// weight table and the source padding make the extra bytes readable.
void McChromaWidthEq4_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
3175
                          int32_t iDstStride, const uint8_t *pABCD, int32_t iHeight) {
3176
  __asm__ volatile (
3177
    ".set       arch=loongson3a                 \n\t"
    // Broadcast the four weights into 16-bit lanes:
    // $f6 = A, $f10 = B, $f8 = C, $f12 = D (four lanes each); $f14 is zero.
3178
    "gsldlc1    $f6, 0x7(%[pABCD])              \n\t"
3179
    "gsldrc1    $f6, 0x0(%[pABCD])              \n\t"
3180
    "xor        $f14, $f14, $f14                \n\t"
3181
    "punpcklbh  $f6, $f6, $f6                   \n\t"
3182
    "mov.d      $f8, $f6                        \n\t"
3183
    "punpcklhw  $f6, $f6, $f6                   \n\t"
3184
    "punpckhhw  $f8, $f8, $f8                   \n\t"
3185
    "mov.d      $f10, $f6                       \n\t"
3186
    "punpcklbh  $f6, $f6, $f14                  \n\t"
3187
    "punpckhbh  $f10, $f10, $f14                \n\t"
3188
3189
    "mov.d      $f12, $f8                       \n\t"
3190
    "punpcklbh  $f8, $f8, $f14                  \n\t"
3191
    "punpckhbh  $f12, $f12, $f14                \n\t"
    // pABCD now points at the row below pSrc. $f16 = shift count 6,
    // $f18 = rounding term 0x20 per lane. $f0 / $f2 hold the first source row
    // at x and x + 1, widened to 16 bits.
3192
    PTR_ADDU   "%[pABCD], %[pSrc], %[iSrcStride] \n\t"
3193
    "dli        $8, 0x6                         \n\t"
3194
    "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
3195
    "gsldlc1    $f2, 0x8(%[pSrc])               \n\t"
3196
    "dmtc1      $8, $f16                        \n\t"
3197
    "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
3198
    "gsldrc1    $f2, 0x1(%[pSrc])               \n\t"
3199
    "dli        $8, 0x0020002000200020          \n\t"
3200
    "punpcklbh  $f0, $f0, $f14                  \n\t"
3201
    "punpcklbh  $f2, $f2, $f14                  \n\t"
3202
3203
    "dmtc1      $8, $f18                        \n\t"
    // Per-row loop: $f0 / $f2 enter holding the current row at x and x + 1.
3204
    "1:                                         \n\t"
3205
    "pmullh     $f0, $f0, $f6                   \n\t"
3206
    "pmullh     $f2, $f2, $f10                  \n\t"
3207
    "paddh      $f0, $f0, $f2                   \n\t"
3208
3209
    // Next row at x: keep an unweighted copy in $f4 for the next iteration.
    "gsldlc1    $f2, 0x7(%[pABCD])              \n\t"
3210
    "gsldrc1    $f2, 0x0(%[pABCD])              \n\t"
3211
    "punpcklbh  $f2, $f2, $f14                  \n\t"
3212
    "mov.d      $f4, $f2                        \n\t"
3213
    "pmullh     $f2, $f2, $f8                   \n\t"
3214
    "paddh      $f0, $f0, $f2                   \n\t"
    // Next row at x + 1: $f14 temporarily holds the unweighted copy and is
    // cleared back to zero before packing.
3215
    "gsldlc1    $f2, 0x8(%[pABCD])              \n\t"
3216
    "gsldrc1    $f2, 0x1(%[pABCD])              \n\t"
3217
    "punpcklbh  $f2, $f2, $f14                  \n\t"
3218
    "mov.d      $f14, $f2                       \n\t"
3219
    "pmullh     $f2, $f2, $f12                  \n\t"
3220
    "paddh      $f0, $f0, $f2                   \n\t"
3221
    "mov.d      $f2, $f14                       \n\t"
    // Round, shift right by 6, pack to bytes and store 4 pixels.
3222
    "paddh      $f0, $f0, $f18                  \n\t"
3223
    "psrlh      $f0, $f0, $f16                  \n\t"
3224
    "xor        $f14, $f14, $f14                \n\t"
3225
    "packushb   $f0, $f0, $f14                  \n\t"
3226
    "gsswlc1    $f0, 0x3(%[pDst])               \n\t"
3227
    "gsswrc1    $f0, 0x0(%[pDst])               \n\t"
    // The row just loaded becomes the current row.
3228
    "mov.d      $f0, $f4                        \n\t"
3229
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3230
    PTR_ADDU   "%[pABCD], %[pABCD], %[iSrcStride] \n\t"
3231
    PTR_ADDIU  "%[iHeight], %[iHeight], -1      \n\t"
3232
    "bnez       %[iHeight], 1b                  \n\t"
3233
    : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst),
3234
      [pABCD]"+&r"((unsigned char *)pABCD), [iHeight]"+&r"((int)iHeight)
3235
    : [iSrcStride]"r"((int)iSrcStride), [iDstStride]"r"((int)iDstStride)
3236
    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
3237
      "$f14", "$f16", "$f18"
3238
  );
3239
}
3240
3241
// Weighted 2x2 interpolation for a block 8 pixels wide and iHeight rows tall.
// With (A, B, C, D) = pABCD[0..3], every output pixel is
//   (A * s[y][x] + B * s[y][x+1] + C * s[y+1][x] + D * s[y+1][x+1] + 32) >> 6
// packed back to 8 bits with unsigned saturation. Each row is handled as two
// register halves (low 4 pixels / high 4 pixels).
// After the weights have been read, the pABCD register is reused as the
// pointer to the source row below the current one.
// The row counter is tested only after the decrement: iHeight must be >= 1.
// NOTE(review): 8 bytes are fetched from pABCD although only the low 4 are
// used after punpcklbh -- confirm the weight table makes the extra bytes
// readable.
void McChromaWidthEq8_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
3242
                          int32_t iDstStride, const uint8_t *pABCD, int32_t iHeight) {
3243
  BACKUP_REG;
3244
  __asm__ volatile (
3245
    ".set       arch=loongson3a                 \n\t"
    // Broadcast the four weights into 16-bit lanes, one register pair each:
    // ($f12, $f14) = A, ($f20, $f22) = B, ($f16, $f18) = C, ($f24, $f26) = D.
    // $f28 is the zero register used for widening.
3246
    "gsldlc1    $f12, 0x7(%[pABCD])             \n\t"
3247
    "xor        $f28, $f28, $f28                \n\t"
3248
    "gsldrc1    $f12, 0x0(%[pABCD])             \n\t"
3249
    "punpcklbh  $f12, $f12, $f12                \n\t"
3250
    "punpckhhw  $f14, $f12, $f12                \n\t"
3251
    "punpcklhw  $f12, $f12, $f12                \n\t"
3252
3253
    "mov.d      $f16, $f14                      \n\t"
3254
    "punpckhwd  $f14, $f12, $f12                \n\t"
3255
    "punpcklwd  $f12, $f12, $f12                \n\t"
3256
    "punpckhwd  $f18, $f16, $f16                \n\t"
3257
    "punpcklwd  $f16, $f16, $f16                \n\t"
3258
    "mov.d      $f20, $f14                      \n\t"
3259
    "mov.d      $f24, $f18                      \n\t"
3260
3261
    "punpckhbh  $f14, $f12, $f28                \n\t"
3262
    "punpcklbh  $f12, $f12, $f28                \n\t"
3263
    "punpckhbh  $f22, $f20, $f28                \n\t"
3264
    "punpcklbh  $f20, $f20, $f28                \n\t"
3265
    "punpckhbh  $f18, $f16, $f28                \n\t"
3266
    "punpcklbh  $f16, $f16, $f28                \n\t"
3267
    "punpckhbh  $f26, $f24, $f28                \n\t"
3268
    "punpcklbh  $f24, $f24, $f28                \n\t"
3269
3270
    // pABCD now points at the row below pSrc. ($f0, $f2) / ($f4, $f6) hold the
    // first source row at x and x + 1, widened to 16 bits.
    PTR_ADDU   "%[pABCD], %[pSrc], %[iSrcStride] \n\t"
3271
    "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
3272
    "gsldlc1    $f4, 0x8(%[pSrc])               \n\t"
3273
    "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
3274
    "gsldrc1    $f4, 0x1(%[pSrc])               \n\t"
3275
    "punpckhbh  $f2, $f0, $f28                  \n\t"
3276
    "punpcklbh  $f0, $f0, $f28                  \n\t"
3277
    "punpckhbh  $f6, $f4, $f28                  \n\t"
3278
    "punpcklbh  $f4, $f4, $f28                  \n\t"
    // Per-row loop.
    // NOTE(review): the 0x20 written to $f30 here is overwritten by
    // "mov.d $f30, $f6" further down before anything reads it.
3279
    "1:                                         \n\t"
3280
    "dli        $8, 0x20                        \n\t"
3281
    "dmtc1      $8, $f30                        \n\t"
3282
3283
    "pmullh     $f0, $f0, $f12                  \n\t"
3284
    "pmullh     $f2, $f2, $f14                  \n\t"
3285
    "pmullh     $f4, $f4, $f20                  \n\t"
3286
    "pmullh     $f6, $f6, $f22                  \n\t"
3287
    "paddh      $f0, $f0, $f4                   \n\t"
3288
    "paddh      $f2, $f2, $f6                   \n\t"
3289
3290
    // Next row at x: keep an unweighted copy in ($f8, $f10) for the next
    // iteration.
    "gsldlc1    $f4, 0x7(%[pABCD])              \n\t"
3291
    "gsldrc1    $f4, 0x0(%[pABCD])              \n\t"
3292
    "punpckhbh  $f6, $f4, $f28                  \n\t"
3293
    "punpcklbh  $f4, $f4, $f28                  \n\t"
3294
    "mov.d      $f8, $f4                        \n\t"
3295
    "mov.d      $f10, $f6                       \n\t"
3296
    "pmullh     $f4, $f4, $f16                  \n\t"
3297
    "pmullh     $f6, $f6, $f18                  \n\t"
3298
    "paddh      $f0, $f0, $f4                   \n\t"
3299
    "paddh      $f2, $f2, $f6                   \n\t"
3300
3301
    // Next row at x + 1: the unweighted copy is parked in ($f28, $f30) and
    // moved back into ($f4, $f6) after the multiply. $f28 is re-zeroed below.
    "gsldlc1    $f4, 0x8(%[pABCD])              \n\t"
3302
    "gsldrc1    $f4, 0x1(%[pABCD])              \n\t"
3303
    "punpckhbh  $f6, $f4, $f28                  \n\t"
3304
    "punpcklbh  $f4, $f4, $f28                  \n\t"
3305
    "mov.d      $f28, $f4                       \n\t"
3306
    "mov.d      $f30, $f6                       \n\t"
3307
    "pmullh     $f4, $f4, $f24                  \n\t"
3308
    "pmullh     $f6, $f6, $f26                  \n\t"
3309
    "paddh      $f0, $f0, $f4                   \n\t"
3310
    "paddh      $f2, $f2, $f6                   \n\t"
3311
    "mov.d      $f4, $f28                       \n\t"
3312
    "mov.d      $f6, $f30                       \n\t"
3313
3314
    // $f20 (low half of weight B) is saved in $9 so the register can carry
    // the rounding term and then the shift count; it is restored before the
    // next iteration.
    "dli        $8, 0x0020002000200020          \n\t"
3315
    "dmfc1      $9, $f20                        \n\t"
3316
    "dmtc1      $8, $f20                        \n\t"
3317
    "dli        $8, 0x6                         \n\t"
3318
    "paddh      $f0, $f0, $f20                  \n\t"
3319
    "paddh      $f2, $f2, $f20                  \n\t"
3320
    "dmtc1      $8, $f20                        \n\t"
3321
    "psrlh      $f0, $f0, $f20                  \n\t"
3322
    "psrlh      $f2, $f2, $f20                  \n\t"
3323
3324
    // Restore the zero register, pack both halves to bytes and store 8 pixels.
    "xor        $f28, $f28, $f28                \n\t"
3325
    "packushb   $f0, $f0, $f2                   \n\t"
3326
    "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
3327
    "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
3328
3329
    // The row just loaded becomes the current row.
    "mov.d      $f0, $f8                        \n\t"
3330
    "mov.d      $f2, $f10                       \n\t"
3331
    "dmtc1      $9, $f20                        \n\t"
3332
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3333
    PTR_ADDU   "%[pABCD], %[pABCD], %[iSrcStride] \n\t"
3334
3335
    PTR_ADDIU  "%[iHeight], %[iHeight], -1      \n\t"
3336
    "bnez       %[iHeight], 1b                  \n\t"
3337
    : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [pABCD]"+&r"(pABCD),
3338
      [iHeight]"+&r"(iHeight)
3339
    : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
3340
    : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
3341
      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
3342
  );
3343
  RECOVER_REG;
3344
}
3345
3346
void McChroma_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
3347
                  int32_t iDstStride, int16_t iMvX, int16_t iMvY,
3348
                  int32_t iWidth, int32_t iHeight) {
3349
  static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = {
3350
    McChromaWidthEq4_mmi,
3351
    McChromaWidthEq8_mmi
3352
  };
3353
  const int32_t kiD8x = iMvX & 0x07;
3354
  const int32_t kiD8y = iMvY & 0x07;
3355
  if (kiD8x == 0 && kiD8y == 0) {
3356
    McCopy_mmi (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
3357
    return;
3358
  }
3359
  if (iWidth != 2) {
3360
    kpMcChromaWidthFuncs[iWidth >> 3] (pSrc, iSrcStride, pDst, iDstStride,
3361
                                      g_kuiABCD[kiD8y][kiD8x], iHeight);
3362
  } else
3363
    McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY,
3364
                          iWidth, iHeight);
3365
}
3366
3367
// Horizontal 6-tap filter for an 8-pixel-wide block (Loongson MMI inline asm).
//
// For each of iHeight rows the code reads six overlapping unaligned 8-byte
// groups that start at pSrc-2 .. pSrc+3 (call the widened pixels p0..p5),
// evaluates in 16-bit lanes
//     (p0 + p5) - 5 * (p1 + p4) + 20 * (p2 + p3) + 16
// shifts the result right arithmetically by 5 (psrah), packs it to bytes with
// packushb and stores 8 bytes at pDst.
// pSrc / pDst advance by iSrcStride / iDstStride bytes after every row.
// iHeight is tested only after the loop body has run, so it must be >= 1.
// BACKUP_REG / RECOVER_REG are defined outside this chunk; presumably they
// save and restore FP registers clobbered here -- confirm at the macro.
void McHorVer20WidthEq8_mmi(const uint8_t *pSrc, int iSrcStride, uint8_t *pDst,
3368
                            int iDstStride, int iHeight) {
3369
  BACKUP_REG;
3370
  __asm__ volatile (
3371
    ".set       arch=loongson3a                 \n\t"
    // Step back two pixels so that byte offset 0 is the leftmost tap (p0).
3372
    PTR_ADDIU  "%[pSrc], %[pSrc], -0x2          \n\t"
    // Loop-invariant constants: $f28 = 0 (unpack partner),
    // $f24 = 16 in every halfword (rounding term), $f26 = 2 and $f30 = 5
    // (shift counts).
3373
    "xor        $f28, $f28, $f28                \n\t"
3374
    "dli        $8, 0x0010001000100010          \n\t"
3375
    "dmtc1      $8, $f24                        \n\t"
3376
    "dli        $8, 0x2                         \n\t"
3377
    "dmtc1      $8, $f26                        \n\t"
3378
    "dli        $8, 0x5                         \n\t"
3379
    "dmtc1      $8, $f30                        \n\t"
3380
    "1:                                         \n\t"
    // Six unaligned 8-byte loads (gsldlc1/gsldrc1 pairs) at byte offsets
    // 0, 5, 1, 4, 2, 3 -> $f0, $f4, $f8, $f12, $f16, $f20.
3381
    "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
3382
    "gsldlc1    $f4, 0xc(%[pSrc])               \n\t"
3383
    "gsldlc1    $f8, 0x8(%[pSrc])               \n\t"
3384
    "gsldlc1    $f12, 0xb(%[pSrc])              \n\t"
3385
    "gsldlc1    $f16, 0x9(%[pSrc])              \n\t"
3386
    "gsldlc1    $f20, 0xa(%[pSrc])              \n\t"
3387
    "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
3388
    "gsldrc1    $f4, 0x5(%[pSrc])               \n\t"
3389
    "gsldrc1    $f8, 0x1(%[pSrc])               \n\t"
3390
    "gsldrc1    $f12, 0x4(%[pSrc])              \n\t"
3391
    "gsldrc1    $f16, 0x2(%[pSrc])              \n\t"
3392
    "gsldrc1    $f20, 0x3(%[pSrc])              \n\t"
    // Widen bytes to halfwords against zero: the high four bytes go to the
    // partner register ($f2, $f6, ...), the low four stay in place.
3393
    "punpckhbh  $f2, $f0, $f28                  \n\t"
3394
    "punpckhbh  $f6, $f4, $f28                  \n\t"
3395
    "punpckhbh  $f10, $f8, $f28                 \n\t"
3396
    "punpckhbh  $f14, $f12, $f28                \n\t"
3397
    "punpckhbh  $f18, $f16, $f28                \n\t"
3398
    "punpckhbh  $f22, $f20, $f28                \n\t"
3399
    "punpcklbh  $f0, $f0, $f28                  \n\t"
3400
    "punpcklbh  $f4, $f4, $f28                  \n\t"
3401
    "punpcklbh  $f8, $f8, $f28                  \n\t"
3402
    "punpcklbh  $f12, $f12, $f28                \n\t"
3403
    "punpcklbh  $f16, $f16, $f28                \n\t"
3404
    "punpcklbh  $f20, $f20, $f28                \n\t"
    // $f8/$f10 = p1 + p4;  $f16/$f18 = t = 4 * (p2 + p3) - (p1 + p4).
3405
    "paddh      $f8, $f8, $f12                  \n\t"
3406
    "paddh      $f10, $f10, $f14                \n\t"
3407
    "paddh      $f16, $f16, $f20                \n\t"
3408
    "paddh      $f18, $f18, $f22                \n\t"
3409
    "psllh      $f16, $f16, $f26                \n\t"
3410
    "psllh      $f18, $f18, $f26                \n\t"
3411
    "psubh      $f16, $f16, $f8                 \n\t"
3412
    "psubh      $f18, $f18, $f10                \n\t"
    // $f0/$f2 = (p0 + p5) + t + 4 * t.
3413
    "paddh      $f0, $f0, $f4                   \n\t"
3414
    "paddh      $f2, $f2, $f6                   \n\t"
3415
    "paddh      $f0, $f0, $f16                  \n\t"
3416
    "paddh      $f2, $f2, $f18                  \n\t"
3417
    "psllh      $f16, $f16, $f26                \n\t"
3418
    "psllh      $f18, $f18, $f26                \n\t"
3419
    "paddh      $f0, $f0, $f16                  \n\t"
3420
    "paddh      $f2, $f2, $f18                  \n\t"
    // Add 16, shift right by 5, pack to bytes and store the 8-pixel row.
3421
    "paddh      $f0, $f0, $f24                  \n\t"
3422
    "paddh      $f2, $f2, $f24                  \n\t"
3423
    "psrah      $f0, $f0, $f30                  \n\t"
3424
    "psrah      $f2, $f2, $f30                  \n\t"
3425
    "packushb   $f0, $f0, $f2                   \n\t"
3426
    "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
3427
    "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
    // Next row; loop until iHeight reaches zero.
3428
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3429
    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3430
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3431
    "bnez       %[iHeight], 1b                  \n\t"
3432
    : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
3433
    : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
3434
    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
3435
      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
3436
  );
3437
  RECOVER_REG;
3438
}
3439
3440
// Horizontal 6-tap filter for a 16-pixel-wide block (Loongson MMI inline asm).
//
// Each row is processed as two 8-pixel halves. For every half the code reads
// six overlapping unaligned 8-byte groups (widened pixels p0..p5, with p0 two
// bytes to the left of the first output pixel), evaluates in 16-bit lanes
//     (p0 + p5) - 5 * (p1 + p4) + 20 * (p2 + p3) + 16
// shifts right arithmetically by 5 (psrah), packs to bytes with packushb and
// stores 8 bytes: the first half at pDst+0, the second at pDst+8.
// pSrc / pDst advance by iSrcStride / iDstStride bytes after every row.
// iHeight is tested only after the loop body has run, so it must be >= 1.
// BACKUP_REG / RECOVER_REG are defined outside this chunk; presumably they
// save and restore FP registers clobbered here -- confirm at the macro.
void McHorVer20WidthEq16_mmi(const uint8_t *pSrc, int iSrcStride, uint8_t *pDst,
3441
                             int iDstStride, int iHeight) {
3442
  BACKUP_REG;
3443
  __asm__ volatile (
3444
    ".set       arch=loongson3a                 \n\t"
    // Step back two pixels so that byte offset 0 is the leftmost tap (p0).
3445
    PTR_ADDIU  "%[pSrc], %[pSrc], -0x2          \n\t"
    // Loop-invariant constants: $f24 = 16 in every halfword (rounding term),
    // $f26 = 2 and $f30 = 5 (shift counts).
3446
    "dli        $8, 0x0010001000100010          \n\t"
3447
    "dmtc1      $8, $f24                        \n\t"
3448
    "dli        $8, 0x2                         \n\t"
3449
    "dmtc1      $8, $f26                        \n\t"
3450
    "dli        $8, 0x5                         \n\t"
3451
    "dmtc1      $8, $f30                        \n\t"
3452
    "1:                                         \n\t"
    // $f28 = 0, the unpack partner (re-zeroed on every iteration).
3453
    "xor        $f28, $f28, $f28                \n\t"
    // ---- Left half: loads at byte offsets 0, 5, 1, 4, 2, 3. ----
3454
    "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
3455
    "gsldlc1    $f4, 0xc(%[pSrc])               \n\t"
3456
    "gsldlc1    $f8, 0x8(%[pSrc])               \n\t"
3457
    "gsldlc1    $f12, 0xb(%[pSrc])              \n\t"
3458
    "gsldlc1    $f16, 0x9(%[pSrc])              \n\t"
3459
    "gsldlc1    $f20, 0xa(%[pSrc])              \n\t"
3460
    "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
3461
    "gsldrc1    $f4, 0x5(%[pSrc])               \n\t"
3462
    "gsldrc1    $f8, 0x1(%[pSrc])               \n\t"
3463
    "gsldrc1    $f12, 0x4(%[pSrc])              \n\t"
3464
    "gsldrc1    $f16, 0x2(%[pSrc])              \n\t"
3465
    "gsldrc1    $f20, 0x3(%[pSrc])              \n\t"
    // Widen bytes to halfwords against zero (high four bytes -> partner reg).
3466
    "punpckhbh  $f2, $f0, $f28                  \n\t"
3467
    "punpckhbh  $f6, $f4, $f28                  \n\t"
3468
    "punpckhbh  $f10, $f8, $f28                 \n\t"
3469
    "punpckhbh  $f14, $f12, $f28                \n\t"
3470
    "punpckhbh  $f18, $f16, $f28                \n\t"
3471
    "punpckhbh  $f22, $f20, $f28                \n\t"
3472
    "punpcklbh  $f0, $f0, $f28                  \n\t"
3473
    "punpcklbh  $f4, $f4, $f28                  \n\t"
3474
    "punpcklbh  $f8, $f8, $f28                  \n\t"
3475
    "punpcklbh  $f12, $f12, $f28                \n\t"
3476
    "punpcklbh  $f16, $f16, $f28                \n\t"
3477
    "punpcklbh  $f20, $f20, $f28                \n\t"
    // $f8/$f10 = p1 + p4;  $f16/$f18 = t = 4 * (p2 + p3) - (p1 + p4).
3478
    "paddh      $f8, $f8, $f12                  \n\t"
3479
    "paddh      $f10, $f10, $f14                \n\t"
3480
    "paddh      $f16, $f16, $f20                \n\t"
3481
    "paddh      $f18, $f18, $f22                \n\t"
3482
    "psllh      $f16, $f16, $f26                \n\t"
3483
    "psllh      $f18, $f18, $f26                \n\t"
3484
    "psubh      $f16, $f16, $f8                 \n\t"
3485
    "psubh      $f18, $f18, $f10                \n\t"
    // $f0/$f2 = (p0 + p5) + t + 4 * t.
3486
    "paddh      $f0, $f0, $f4                   \n\t"
3487
    "paddh      $f2, $f2, $f6                   \n\t"
3488
    "paddh      $f0, $f0, $f16                  \n\t"
3489
    "paddh      $f2, $f2, $f18                  \n\t"
3490
    "psllh      $f16, $f16, $f26                \n\t"
3491
    "psllh      $f18, $f18, $f26                \n\t"
3492
    "paddh      $f0, $f0, $f16                  \n\t"
3493
    "paddh      $f2, $f2, $f18                  \n\t"
    // Add 16, shift right by 5, pack to bytes and store pixels 0..7.
3494
    "paddh      $f0, $f0, $f24                  \n\t"
3495
    "paddh      $f2, $f2, $f24                  \n\t"
3496
    "psrah      $f0, $f0, $f30                  \n\t"
3497
    "psrah      $f2, $f2, $f30                  \n\t"
3498
    "packushb   $f0, $f0, $f2                   \n\t"
3499
    "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
3500
    "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
    // ---- Right half: same sequence with every load offset moved by 8. ----
3501
    "gsldlc1    $f0, 0xF(%[pSrc])               \n\t"
3502
    "gsldlc1    $f4, 0x14(%[pSrc])              \n\t"
3503
    "gsldlc1    $f8, 0x10(%[pSrc])              \n\t"
3504
    "gsldlc1    $f12, 0x13(%[pSrc])             \n\t"
3505
    "gsldlc1    $f16, 0x11(%[pSrc])             \n\t"
3506
    "gsldlc1    $f20, 0x12(%[pSrc])             \n\t"
3507
    "gsldrc1    $f0, 0x8(%[pSrc])               \n\t"
3508
    "gsldrc1    $f4, 0xd(%[pSrc])               \n\t"
3509
    "gsldrc1    $f8, 0x9(%[pSrc])               \n\t"
3510
    "gsldrc1    $f12, 0xc(%[pSrc])              \n\t"
3511
    "gsldrc1    $f16, 0xa(%[pSrc])              \n\t"
3512
    "gsldrc1    $f20, 0xb(%[pSrc])              \n\t"
3513
    "punpckhbh  $f2, $f0, $f28                  \n\t"
3514
    "punpckhbh  $f6, $f4, $f28                  \n\t"
3515
    "punpckhbh  $f10, $f8, $f28                 \n\t"
3516
    "punpckhbh  $f14, $f12, $f28                \n\t"
3517
    "punpckhbh  $f18, $f16, $f28                \n\t"
3518
    "punpckhbh  $f22, $f20, $f28                \n\t"
3519
    "punpcklbh  $f0, $f0, $f28                  \n\t"
3520
    "punpcklbh  $f4, $f4, $f28                  \n\t"
3521
    "punpcklbh  $f8, $f8, $f28                  \n\t"
3522
    "punpcklbh  $f12, $f12, $f28                \n\t"
3523
    "punpcklbh  $f16, $f16, $f28                \n\t"
3524
    "punpcklbh  $f20, $f20, $f28                \n\t"
3525
    "paddh      $f8, $f8, $f12                  \n\t"
3526
    "paddh      $f10, $f10, $f14                \n\t"
3527
    "paddh      $f16, $f16, $f20                \n\t"
3528
    "paddh      $f18, $f18, $f22                \n\t"
3529
    "psllh      $f16, $f16, $f26                \n\t"
3530
    "psllh      $f18, $f18, $f26                \n\t"
3531
    "psubh      $f16, $f16, $f8                 \n\t"
3532
    "psubh      $f18, $f18, $f10                \n\t"
3533
    "paddh      $f0, $f0, $f4                   \n\t"
3534
    "paddh      $f2, $f2, $f6                   \n\t"
3535
    "paddh      $f0, $f0, $f16                  \n\t"
3536
    "paddh      $f2, $f2, $f18                  \n\t"
3537
    "psllh      $f16, $f16, $f26                \n\t"
3538
    "psllh      $f18, $f18, $f26                \n\t"
3539
    "paddh      $f0, $f0, $f16                  \n\t"
3540
    "paddh      $f2, $f2, $f18                  \n\t"
3541
    "paddh      $f0, $f0, $f24                  \n\t"
3542
    "paddh      $f2, $f2, $f24                  \n\t"
3543
    "psrah      $f0, $f0, $f30                  \n\t"
3544
    "psrah      $f2, $f2, $f30                  \n\t"
3545
    "packushb   $f0, $f0, $f2                   \n\t"
    // Store pixels 8..15.
3546
    "gssdlc1    $f0, 0xF(%[pDst])               \n\t"
3547
    "gssdrc1    $f0, 0x8(%[pDst])               \n\t"
    // Next row; loop until iHeight reaches zero.
3548
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3549
    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3550
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3551
    "bnez       %[iHeight], 1b                  \n\t"
3552
    : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
3553
    : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
3554
    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
3555
      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
3556
  );
3557
  RECOVER_REG;
3558
}
3559
3560
void McHorVer20WidthEq4_mmi(const uint8_t *pSrc, int iSrcStride, uint8_t *pDst,
3561
                            int iDstStride, int iHeight) {
3562
  __asm__ volatile (
3563
    ".set       arch=loongson3a                 \n\t"
3564
    "1:                                         \n\t"
3565
    PTR_ADDIU  "%[pSrc], %[pSrc], -0x2          \n\t"
3566
    "xor        $f14, $f14, $f14                \n\t"
3567
    "dli        $8, 0x0010001000100010          \n\t"
3568
    "dmtc1      $8, $f12                        \n\t"
3569
    "1:                                         \n\t"
3570
    "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
3571
    "gsldlc1    $f2, 0xc(%[pSrc])               \n\t"
3572
    "gsldlc1    $f4, 0x8(%[pSrc])               \n\t"
3573
    "gsldlc1    $f6, 0xb(%[pSrc])               \n\t"
3574
    "gsldlc1    $f8, 0x9(%[pSrc])               \n\t"
3575
    "gsldlc1    $f10, 0xa(%[pSrc])              \n\t"
3576
    "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
3577
    "gsldrc1    $f2, 0x5(%[pSrc])               \n\t"
3578
    "gsldrc1    $f4, 0x1(%[pSrc])               \n\t"
3579
    "gsldrc1    $f6, 0x4(%[pSrc])               \n\t"
3580
    "gsldrc1    $f8, 0x2(%[pSrc])               \n\t"
3581
    "gsldrc1    $f10, 0x3(%[pSrc])              \n\t"
3582
    "dli        $8, 0x2                         \n\t"
3583
    "punpcklbh  $f0, $f0, $f14                  \n\t"
3584
    "punpcklbh  $f2, $f2, $f14                  \n\t"
3585
    "punpcklbh  $f4, $f4, $f14                  \n\t"
3586
    "punpcklbh  $f6, $f6, $f14                  \n\t"
3587
    "punpcklbh  $f8, $f8, $f14                  \n\t"
3588
    "punpcklbh  $f10, $f10, $f14                \n\t"
3589
    "dmtc1      $8, $f16                        \n\t"
3590
    "paddh      $f4, $f4, $f6                   \n\t"
3591
    "paddh      $f8, $f8, $f10                  \n\t"
3592
    "psllh      $f8, $f8, $f16                  \n\t"
3593
    "psubh      $f8, $f8, $f4                   \n\t"
3594
    "paddh      $f0, $f0, $f2                   \n\t"
3595
    "paddh      $f0, $f0, $f8                   \n\t"
3596
    "dli        $8, 0x5                         \n\t"
3597
    "psllh      $f8, $f8, $f16                  \n\t"
3598
    "paddh      $f0, $f0, $f8                   \n\t"
3599
    "paddh      $f0, $f0, $f12                  \n\t"
3600
    "dmtc1      $8, $f16                        \n\t"
3601
    "psrah      $f0, $f0, $f16                  \n\t"
3602
    "packushb   $f0, $f0, $f14                  \n\t"
3603
    "gsswlc1    $f0, 0x3(%[pDst])               \n\t"
3604
    "gsswrc1    $f0, 0x0(%[pDst])               \n\t"
3605
    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3606
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3607
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3608
    "bnez       %[iHeight], 1b                  \n\t"
3609
    : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
3610
    : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
3611
    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
3612
      "$f14", "$f16"
3613
  );
3614
}
3615
3616
static inline void McHorVer20_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
3617
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
3618
  if (iWidth == 16)
3619
    McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
3620
  else if (iWidth == 8)
3621
    McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
3622
  else
3623
    McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
3624
}
3625
3626
// Vertical filter for an 8-pixel-wide block (Loongson MMI inline asm).
//
// Structure visible in this chunk:
//  - pSrc is moved up by two rows, then six consecutive rows are loaded with
//    MMI_LOAD_8P into the register pairs ($f0,$f2) .. ($f20,$f22).
//  - The loop body is unrolled eight times. Every step runs FILTER_HV_W8 on
//    the six live row pairs and writes through pDst, decrements iHeight and
//    leaves through label 2 when it reaches zero; otherwise it loads the next
//    source row into the pair that just fell out of the window and advances
//    pDst by iDstStride. The register roles rotate by one pair per step, so
//    after eight steps the layout matches the loop entry and `j 1b` repeats.
//  - Rows are fetched alternately from pSrc (after advancing it two rows) and
//    from $8 = pSrc + iSrcStride.
// iHeight is decremented before it is first tested, so it must be >= 1.
// MMI_LOAD_8P and FILTER_HV_W8 are defined outside this chunk; presumably
// MMI_LOAD_8P(lo, hi, zero, addr) loads 8 pixels widened to halfwords and
// FILTER_HV_W8 applies the 6-tap kernel with rounding and stores 8 bytes --
// confirm against the macro definitions. BACKUP_REG / RECOVER_REG are also
// external; presumably they save/restore FP registers clobbered here.
void McHorVer02WidthEq8_mmi(const uint8_t *pSrc, int iSrcStride, uint8_t *pDst,
3627
                            int iDstStride, int iHeight) {
3628
  BACKUP_REG;
3629
  __asm__ volatile (
3630
    ".set       arch=loongson3a                 \n\t"
    // Start two rows above the first output row.
3631
    PTR_SUBU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3632
    PTR_SUBU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3633
    "xor        $f28, $f28, $f28                \n\t"
    // Prime the window with six rows; pSrc ends up four rows below its
    // starting point here and $8 addresses the row after it.
3634
    MMI_LOAD_8P($f0, $f2, $f28, %[pSrc])
3635
    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
3636
    MMI_LOAD_8P($f4, $f6, $f28, $8)
3637
    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3638
    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3639
    MMI_LOAD_8P($f8, $f10, $f28, %[pSrc])
3640
    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
3641
    MMI_LOAD_8P($f12, $f14, $f28, $8)
3642
    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3643
    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3644
    MMI_LOAD_8P($f16, $f18, $f28, %[pSrc])
3645
    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
3646
    MMI_LOAD_8P($f20, $f22, $f28, $8)
3647
3648
    "1:                                         \n\t"
    // Step 1 of 8: window starts at ($f0,$f2).
3649
    FILTER_HV_W8($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
3650
                 $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9)
3651
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3652
    "beqz       %[iHeight], 2f                  \n\t"
3653
    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3654
    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3655
    MMI_LOAD_8P($f24, $f26, $f28, %[pSrc])
3656
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
    // Step 2 of 8: window starts at ($f4,$f6).
3657
    FILTER_HV_W8($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24,
3658
                 $f26, $f28, $f30, $f0, $f2, %[pDst], $8, $9)
3659
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3660
    "beqz       %[iHeight], 2f                  \n\t"
3661
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3662
    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
3663
    MMI_LOAD_8P($f28, $f30, $f0, $8)
    // Step 3 of 8: window starts at ($f8,$f10).
3664
    FILTER_HV_W8($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28,
3665
                 $f30, $f0, $f2, $f4, $f6, %[pDst], $8, $9)
3666
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3667
    "beqz       %[iHeight], 2f                  \n\t"
3668
    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3669
    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3670
    MMI_LOAD_8P($f0, $f2, $f4, %[pSrc])
3671
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
    // Step 4 of 8: window starts at ($f12,$f14).
3672
    FILTER_HV_W8($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0,
3673
                 $f2, $f4, $f6, $f8, $f10, %[pDst], $8, $9)
3674
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3675
    "beqz       %[iHeight], 2f                  \n\t"
3676
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3677
    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
3678
    MMI_LOAD_8P($f4, $f6, $f8, $8)
    // Step 5 of 8: window starts at ($f16,$f18).
3679
    FILTER_HV_W8($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4,
3680
                 $f6, $f8, $f10, $f12, $f14, %[pDst], $8, $9)
3681
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3682
    "beqz       %[iHeight], 2f                  \n\t"
3683
    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3684
    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3685
    MMI_LOAD_8P($f8, $f10, $f12, %[pSrc])
3686
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
    // Step 6 of 8: window starts at ($f20,$f22).
3687
    FILTER_HV_W8($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8,
3688
                 $f10, $f12, $f14, $f16, $f18, %[pDst], $8, $9)
3689
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3690
    "beqz       %[iHeight], 2f                  \n\t"
3691
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3692
    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
3693
    MMI_LOAD_8P($f12, $f14, $f16, $8)
    // Step 7 of 8: window starts at ($f24,$f26).
3694
    FILTER_HV_W8($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12,
3695
                 $f14, $f16, $f18, $f20, $f22, %[pDst], $8, $9)
3696
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3697
    "beqz       %[iHeight], 2f                  \n\t"
3698
    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3699
    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3700
    MMI_LOAD_8P($f16, $f18, $f20, %[pSrc])
3701
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
    // Step 8 of 8: window starts at ($f28,$f30); afterwards the register
    // layout is back to that of step 1.
3702
    FILTER_HV_W8($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16,
3703
                 $f18, $f20, $f22, $f24, $f26, %[pDst], $8, $9)
3704
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3705
    "beqz       %[iHeight], 2f                  \n\t"
3706
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3707
    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
3708
    MMI_LOAD_8P($f20, $f22, $f24, $8)
3709
    "j          1b                              \n\t"
    // Common exit, reached as soon as iHeight hits zero.
3710
    "2:                                         \n\t"
3711
    : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
3712
    : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
3713
    : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
3714
      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
3715
  );
3716
  RECOVER_REG;
3717
}
3718
3719
static inline void McHorVer02WidthEq16_mmi(const uint8_t* pSrc, int32_t iSrcStride,
3720
                   uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
3721
  McHorVer02WidthEq8_mmi (pSrc,     iSrcStride, pDst,     iDstStride, iHeight);
3722
  McHorVer02WidthEq8_mmi (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
3723
}
3724
3725
static inline void McHorVer02_mmi(const uint8_t* pSrc, int32_t iSrcStride,
3726
                   uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
3727
                   int32_t iHeight) {
3728
  if (iWidth == 16)
3729
    McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
3730
  else if (iWidth == 8)
3731
    McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
3732
  else
3733
    McHorVer02_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
3734
}
3735
3736
// Horizontal pass of the two-stage (horizontal then vertical) filter for an
// 8-pixel-wide block (Loongson MMI inline asm).
//
// pSrc is first moved up by two rows. Then, for each of iHeight rows, six
// overlapping unaligned 8-byte groups at byte offsets 0..5 from pSrc are
// widened to halfwords (p0..p5) and combined as
//     (p0 + p5) - 5 * (p1 + p4) + 20 * (p2 + p3)
// with no rounding term and no shift. The eight 16-bit results are written as
// 16 bytes at pDst. pSrc / pDst advance by iSrcStride / iDstStride bytes per
// row. Unlike the McHorVer20 kernels this routine does not step pSrc back by
// two pixels itself.
// iHeight is tested only after the loop body has run, so it must be >= 1.
// NOTE(review): iSrcStride is declared int16_t here while the caller visible
// in this file passes an int32_t stride, so the value is narrowed implicitly
// -- confirm that strides always fit in 16 bits or widen the declaration.
// BACKUP_REG / RECOVER_REG are defined outside this chunk; presumably they
// save and restore FP registers clobbered here -- confirm at the macro.
void McHorVer22Width8HorFirst_mmi(const uint8_t *pSrc, int16_t iSrcStride,
3737
     uint8_t *pDst, int32_t iDstStride, int32_t iHeight) {
3738
  BACKUP_REG;
3739
  __asm__ volatile (
3740
    ".set       arch=loongson3a                 \n\t"
3741
    "xor        $f28, $f28, $f28                \n\t"
    // Start two rows above the row addressed by the incoming pSrc.
3742
    PTR_SUBU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3743
    PTR_SUBU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
    // $f30 = 2, the shift count used for the multiplications by 4.
3744
    "dli        $8, 0x2                         \n\t"
3745
    "dmtc1      $8, $f30                        \n\t"
3746
    "1:                                         \n\t"
    // $f28 = 0, the unpack partner (also zeroed once before the loop).
3747
    "xor        $f28, $f28, $f28                \n\t"
    // Six unaligned 8-byte loads at byte offsets 0, 5, 1, 4, 2, 3.
3748
    "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
3749
    "gsldlc1    $f4, 0xc(%[pSrc])               \n\t"
3750
    "gsldlc1    $f8, 0x8(%[pSrc])               \n\t"
3751
    "gsldlc1    $f12, 0xb(%[pSrc])              \n\t"
3752
    "gsldlc1    $f16, 0x9(%[pSrc])              \n\t"
3753
    "gsldlc1    $f20, 0xa(%[pSrc])              \n\t"
3754
    "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
3755
    "gsldrc1    $f4, 0x5(%[pSrc])               \n\t"
3756
    "gsldrc1    $f8, 0x1(%[pSrc])               \n\t"
3757
    "gsldrc1    $f12, 0x4(%[pSrc])              \n\t"
3758
    "gsldrc1    $f16, 0x2(%[pSrc])              \n\t"
3759
    "gsldrc1    $f20, 0x3(%[pSrc])              \n\t"
    // Widen bytes to halfwords against zero (high four bytes -> partner reg).
3760
    "punpckhbh  $f2, $f0, $f28                  \n\t"
3761
    "punpckhbh  $f6, $f4, $f28                  \n\t"
3762
    "punpckhbh  $f10, $f8, $f28                 \n\t"
3763
    "punpckhbh  $f14, $f12, $f28                \n\t"
3764
    "punpckhbh  $f18, $f16, $f28                \n\t"
3765
    "punpckhbh  $f22, $f20, $f28                \n\t"
3766
    "punpcklbh  $f0, $f0, $f28                  \n\t"
3767
    "punpcklbh  $f4, $f4, $f28                  \n\t"
3768
    "punpcklbh  $f8, $f8, $f28                  \n\t"
3769
    "punpcklbh  $f12, $f12, $f28                \n\t"
3770
    "punpcklbh  $f16, $f16, $f28                \n\t"
3771
    "punpcklbh  $f20, $f20, $f28                \n\t"
    // $f8/$f10 = p1 + p4;  $f16/$f18 = t = 4 * (p2 + p3) - (p1 + p4).
3772
    "paddh      $f8, $f8, $f12                  \n\t"
3773
    "paddh      $f10, $f10, $f14                \n\t"
3774
    "paddh      $f16, $f16, $f20                \n\t"
3775
    "paddh      $f18, $f18, $f22                \n\t"
3776
    "psllh      $f16, $f16, $f30                \n\t"
3777
    "psllh      $f18, $f18, $f30                \n\t"
3778
    "psubh      $f16, $f16, $f8                 \n\t"
3779
    "psubh      $f18, $f18, $f10                \n\t"
    // $f0/$f2 = (p0 + p5) + t + 4 * t.
3780
    "paddh      $f0, $f0, $f4                   \n\t"
3781
    "paddh      $f2, $f2, $f6                   \n\t"
3782
    "paddh      $f0, $f0, $f16                  \n\t"
3783
    "paddh      $f2, $f2, $f18                  \n\t"
3784
    "psllh      $f16, $f16, $f30                \n\t"
3785
    "psllh      $f18, $f18, $f30                \n\t"
3786
    "paddh      $f0, $f0, $f16                  \n\t"
3787
    "paddh      $f2, $f2, $f18                  \n\t"
    // Store the eight 16-bit intermediate values (2 x 8 bytes).
3788
    "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
3789
    "gssdlc1    $f2, 0xF(%[pDst])               \n\t"
3790
    "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
3791
    "gssdrc1    $f2, 0x8(%[pDst])               \n\t"
    // Next row; loop until iHeight reaches zero.
3792
    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3793
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3794
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3795
    "bnez       %[iHeight], 1b                  \n\t"
3796
    : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
3797
    : [iSrcStride]"r"(iSrcStride),  [iDstStride]"r"(iDstStride)
3798
    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
3799
      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
3800
  );
3801
  RECOVER_REG;
3802
}
3803
3804
static inline void McHorVer22WidthEq8_mmi(const uint8_t* pSrc, int32_t iSrcStride,
3805
                   uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
3806
  ENFORCE_STACK_ALIGN_2D (int16_t, iTap, 21, 8, 16)
3807
  McHorVer22Width8HorFirst_mmi (pSrc - 2, iSrcStride, (uint8_t*)iTap, 16, iHeight + 5);
3808
  McHorVer22Width8VerLastAlign_mmi ((uint8_t*)iTap, 16, pDst, iDstStride, 8, iHeight);
3809
}
3810
3811
static inline void McHorVer22WidthEq16_mmi(const uint8_t* pSrc, int32_t iSrcStride,
3812
                   uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
3813
  McHorVer22WidthEq8_mmi (pSrc,     iSrcStride, pDst,     iDstStride, iHeight);
3814
  McHorVer22WidthEq8_mmi (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
3815
}
3816
3817
static inline void McHorVer22_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
3818
                   int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
3819
  if (iWidth == 16)
3820
    McHorVer22WidthEq16_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
3821
  else if (iWidth == 8)
3822
    McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
3823
  else
3824
    McHorVer22_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
3825
}
3826
3827
// Averages two 4-pixel-wide blocks byte by byte (Loongson MMI inline asm).
//
// For each of iHeight rows: loads 8 unaligned bytes from pSrcB and from pSrcA,
// combines them with pavgb and stores the low 4 bytes of the result at pDst.
// All three pointers then advance by their own stride.
// Note that each load covers 8 bytes although only 4 are stored, so 4 bytes
// beyond the block are read on every row of both sources.
// iHeight is tested only after the loop body has run, so it must be >= 1.
// $8, $9 and $10 appear in the clobber list but are not used by this asm.
void PixelAvgWidthEq4_mmi(uint8_t *pDst,  int iDstStride, const uint8_t *pSrcA,
3828
     int iSrcAStride, const uint8_t *pSrcB, int iSrcBStride, int iHeight ) {
3829
  __asm__ volatile (
3830
    ".set       arch=loongson3a                    \n\t"
3831
    "1:                                            \n\t"
    // Unaligned 8-byte loads: $f0 <- pSrcB row, $f2 <- pSrcA row.
3832
    "gsldlc1    $f0, 0x7(%[pSrcB])                 \n\t"
3833
    "gsldlc1    $f2, 0x7(%[pSrcA])                 \n\t"
3834
    "gsldrc1    $f0, 0x0(%[pSrcB])                 \n\t"
3835
    "gsldrc1    $f2, 0x0(%[pSrcA])                 \n\t"
    // Byte-wise average, then an unaligned 4-byte store.
3836
    "pavgb      $f0, $f0, $f2                      \n\t"
3837
    "gsswlc1    $f0, 0x3(%[pDst])                  \n\t"
3838
    "gsswrc1    $f0, 0x0(%[pDst])                  \n\t"
    // Next row; loop until iHeight reaches zero.
3839
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1       \n\t"
3840
    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride]    \n\t"
3841
    PTR_ADDU   "%[pSrcA], %[pSrcA], %[iSrcAStride] \n\t"
3842
    PTR_ADDU   "%[pSrcB], %[pSrcB], %[iSrcBStride] \n\t"
3843
    "bnez       %[iHeight], 1b                     \n\t"
3844
    : [pDst]"+&r"((unsigned char *)pDst), [pSrcA]"+&r"((unsigned char *)pSrcA),
3845
      [pSrcB]"+&r"((unsigned char *)pSrcB), [iHeight]"+&r"((int)iHeight)
3846
    : [iDstStride]"r"((int)iDstStride), [iSrcAStride]"r"((int)iSrcAStride),
3847
      [iSrcBStride]"r"((int)iSrcBStride)
3848
    : "memory", "$8", "$9", "$10", "$f0", "$f2"
3849
  );
3850
}
3851
3852
// Averages two 8-pixel-wide blocks byte by byte (Loongson MMI inline asm).
//
// Two rows are handled per loop iteration: the first at pSrcA / pSrcB / pDst,
// the second one stride further on through the temporaries $8 / $9 / $10.
// Each row is loaded with unaligned 8-byte loads, combined with pavgb and
// stored as 8 bytes. The pointers then advance by two strides and iHeight is
// decreased by 2.
// iHeight is tested only after the loop body has run and is compared against
// zero, so it must be a positive even number for the loop to terminate as
// intended.
void PixelAvgWidthEq8_mmi(uint8_t *pDst,  int iDstStride, const uint8_t *pSrcA,
3853
     int iSrcAStride, const uint8_t *pSrcB, int iSrcBStride, int iHeight ) {
3854
  __asm__ volatile (
3855
    ".set       arch=loongson3a                 \n\t"
3856
    "1:                                         \n\t"
    // First row: load A and B, average, store at pDst. The address of the
    // second row of A ($8) and of B ($9) is computed in between.
3857
    "gsldlc1    $f0, 0x7(%[pSrcA])              \n\t"
3858
    "gsldlc1    $f2, 0x7(%[pSrcB])              \n\t"
3859
    "gsldrc1    $f0, 0x0(%[pSrcA])              \n\t"
3860
    "gsldrc1    $f2, 0x0(%[pSrcB])              \n\t"
3861
    "pavgb      $f0, $f0, $f2                   \n\t"
3862
    PTR_ADDU   "$8, %[pSrcA], %[iSrcAStride]    \n\t"
3863
    "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
3864
    PTR_ADDU   "$9, %[pSrcB], %[iSrcBStride]    \n\t"
3865
    "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
    // Second row: load through $8 / $9, average, store through $10.
3866
    "gsldlc1    $f0, 0x7($8)                    \n\t"
3867
    "gsldlc1    $f2, 0x7($9)                    \n\t"
3868
    "gsldrc1    $f0, 0x0($8)                    \n\t"
3869
    "gsldrc1    $f2, 0x0($9)                    \n\t"
3870
    "pavgb      $f0, $f0, $f2                   \n\t"
3871
    PTR_ADDU   "$10, %[pDst], %[iDstStride]     \n\t"
3872
    "gssdlc1    $f0, 0x7($10)                   \n\t"
    // Advance all three pointers to the row after the second one.
3873
    PTR_ADDU   "%[pSrcA], $8, %[iSrcAStride]    \n\t"
3874
    "gssdrc1    $f0, 0x0($10)                   \n\t"
3875
    PTR_ADDU   "%[pSrcB], $9, %[iSrcBStride]    \n\t"
3876
    PTR_ADDU   "%[pDst], $10, %[iDstStride]     \n\t"
    // Two rows consumed; loop until iHeight reaches zero.
3877
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x2    \n\t"
3878
    "bnez       %[iHeight], 1b                  \n\t"
3879
    : [pDst]"+&r"((unsigned char *)pDst), [pSrcA]"+&r"((unsigned char *)pSrcA),
3880
      [pSrcB]"+&r"((unsigned char *)pSrcB), [iHeight]"+&r"((int)iHeight)
3881
    : [iDstStride]"r"((int)iDstStride), [iSrcAStride]"r"((int)iSrcAStride),
3882
      [iSrcBStride]"r"((int)iSrcBStride)
3883
    : "memory", "$8", "$9", "$10", "$f0", "$f2"
3884
  );
3885
}
3886
3887
// Writes the byte-wise average (pavgb) of two 16-byte-wide sources to pDst,
// one row at a time, using Loongson MMI inline assembly.
//
//   pDst / iDstStride    destination and its row stride in bytes
//   pSrcA / iSrcAStride  first source and its row stride in bytes
//   pSrcB / iSrcBStride  second source and its row stride in bytes
//   iHeight              number of rows
//
// Every row is handled as two 8-byte halves (offsets 0x0-0x7 and 0x8-0xF).
// Each loop pass handles four rows and then subtracts 4 from iHeight; the loop
// only exits when the counter is exactly zero, and the body runs before the
// first check, so iHeight has to be a positive multiple of 4.
void PixelAvgWidthEq16_mmi(uint8_t *pDst, int iDstStride, const uint8_t *pSrcA,
3888
     int iSrcAStride, const uint8_t *pSrcB, int iSrcBStride, int iHeight ) {
3889
  __asm__ volatile (
3890
    ".set       arch=loongson3a                 \n\t"
3891
    "1:                                         \n\t"
3892
    // Row 0: load both halves of each source, average, store to pDst.
    "gsldlc1    $f0, 0x7(%[pSrcA])              \n\t"
3893
    "gsldlc1    $f2, 0xF(%[pSrcA])              \n\t"
3894
    "gsldlc1    $f4, 0x7(%[pSrcB])              \n\t"
3895
    "gsldlc1    $f6, 0xF(%[pSrcB])              \n\t"
3896
    "gsldrc1    $f0, 0x0(%[pSrcA])              \n\t"
3897
    "gsldrc1    $f2, 0x8(%[pSrcA])              \n\t"
3898
    "gsldrc1    $f4, 0x0(%[pSrcB])              \n\t"
3899
    "gsldrc1    $f6, 0x8(%[pSrcB])              \n\t"
3900
    "pavgb      $f0, $f0, $f4                   \n\t"
3901
    "pavgb      $f2, $f2, $f6                   \n\t"
3902
    // $8 / $9 / $10 hold the next-row addresses of pSrcA / pSrcB / pDst.
    PTR_ADDU   "$8, %[pSrcA], %[iSrcAStride]    \n\t"
3903
    "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
3904
    "gssdlc1    $f2, 0xF(%[pDst])               \n\t"
3905
    "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
3906
    "gssdrc1    $f2, 0x8(%[pDst])               \n\t"
3907
    PTR_ADDU   "$9, %[pSrcB], %[iSrcBStride]    \n\t"
3908
    // Row 1: same sequence through the scratch pointers.
    "gsldlc1    $f0, 0x7($8)                    \n\t"
3909
    "gsldlc1    $f2, 0xF($8)                    \n\t"
3910
    "gsldrc1    $f0, 0x0($8)                    \n\t"
3911
    "gsldrc1    $f2, 0x8($8)                    \n\t"
3912
    PTR_ADDU   "$10, %[pDst], %[iDstStride]     \n\t"
3913
    "gsldlc1    $f4, 0x7($9)                    \n\t"
3914
    "gsldlc1    $f6, 0xF($9)                    \n\t"
3915
    "gsldrc1    $f4, 0x0($9)                    \n\t"
3916
    "gsldrc1    $f6, 0x8($9)                    \n\t"
3917
    "pavgb      $f0, $f0, $f4                   \n\t"
3918
    "pavgb      $f2, $f2, $f6                   \n\t"
3919
    "gssdlc1    $f0, 0x7($10)                   \n\t"
3920
    "gssdlc1    $f2, 0xF($10)                   \n\t"
3921
    "gssdrc1    $f0, 0x0($10)                   \n\t"
3922
    "gssdrc1    $f2, 0x8($10)                   \n\t"
3923
3924
    // Step the operand pointers past rows 0-1, then repeat for rows 2-3.
    PTR_ADDU   "%[pSrcA], $8, %[iSrcAStride]    \n\t"
3925
    PTR_ADDU   "%[pSrcB], $9, %[iSrcBStride]    \n\t"
3926
    PTR_ADDU   "%[pDst], $10, %[iDstStride]     \n\t"
3927
    // Row 2.
    "gsldlc1    $f0, 0x7(%[pSrcA])              \n\t"
3928
    "gsldlc1    $f2, 0xF(%[pSrcA])              \n\t"
3929
    "gsldlc1    $f4, 0x7(%[pSrcB])              \n\t"
3930
    "gsldlc1    $f6, 0xF(%[pSrcB])              \n\t"
3931
    "gsldrc1    $f0, 0x0(%[pSrcA])              \n\t"
3932
    "gsldrc1    $f2, 0x8(%[pSrcA])              \n\t"
3933
    "gsldrc1    $f4, 0x0(%[pSrcB])              \n\t"
3934
    "gsldrc1    $f6, 0x8(%[pSrcB])              \n\t"
3935
    "pavgb      $f0, $f0, $f4                   \n\t"
3936
    "pavgb      $f2, $f2, $f6                   \n\t"
3937
    PTR_ADDU   "$8, %[pSrcA], %[iSrcAStride]    \n\t"
3938
    PTR_ADDU   "$9, %[pSrcB], %[iSrcBStride]    \n\t"
3939
    "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
3940
    "gssdlc1    $f2, 0xF(%[pDst])               \n\t"
3941
    "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
3942
    "gssdrc1    $f2, 0x8(%[pDst])               \n\t"
3943
    // Row 3.
    "gsldlc1    $f0, 0x7($8)                    \n\t"
3944
    "gsldlc1    $f2, 0xF($8)                    \n\t"
3945
    "gsldlc1    $f4, 0x7($9)                    \n\t"
3946
    "gsldlc1    $f6, 0xF($9)                    \n\t"
3947
    "gsldrc1    $f0, 0x0($8)                    \n\t"
3948
    "gsldrc1    $f2, 0x8($8)                    \n\t"
3949
    "gsldrc1    $f4, 0x0($9)                    \n\t"
3950
    "gsldrc1    $f6, 0x8($9)                    \n\t"
3951
    PTR_ADDU   "$10, %[pDst], %[iDstStride]     \n\t"
3952
    "pavgb      $f0, $f0, $f4                   \n\t"
3953
    "pavgb      $f2, $f2, $f6                   \n\t"
3954
    "gssdlc1    $f0, 0x7($10)                   \n\t"
3955
    "gssdlc1    $f2, 0xF($10)                   \n\t"
3956
    "gssdrc1    $f0, 0x0($10)                   \n\t"
3957
    "gssdrc1    $f2, 0x8($10)                   \n\t"
3958
    // Advance past rows 2-3 and count the four rows handled in this pass.
    PTR_ADDU   "%[pSrcA], $8, %[iSrcAStride]    \n\t"
3959
    PTR_ADDU   "%[pSrcB], $9, %[iSrcBStride]    \n\t"
3960
    PTR_ADDU   "%[pDst], $10, %[iDstStride]     \n\t"
3961
    PTR_ADDIU  "%[iHeight], %[iHeight], -0x4    \n\t"
3962
    "bnez       %[iHeight], 1b                  \n\t"
3963
    // Pointers and the row counter are modified in place (read-write operands).
    : [pDst]"+&r"((unsigned char *)pDst), [pSrcA]"+&r"((unsigned char *)pSrcA),
3964
      [pSrcB]"+&r"((unsigned char *)pSrcB), [iHeight]"+&r"((int)iHeight)
3965
    : [iDstStride]"r"((int)iDstStride), [iSrcAStride]"r"((int)iSrcAStride),
3966
      [iSrcBStride]"r"((int)iSrcBStride)
3967
    // Scratch GPRs $8-$10 and FP registers $f0-$f6 (even) are used by the body.
    : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6"
3968
  );
3969
}
3970
3971
static inline void McHorVer01_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
3972
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
3973
  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
3974
  if (iWidth == 16) {
3975
    McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
3976
    PixelAvgWidthEq16_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
3977
  } else if (iWidth == 8) {
3978
    McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
3979
    PixelAvgWidthEq8_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
3980
  } else {
3981
    McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight);
3982
    PixelAvgWidthEq4_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
3983
  }
3984
}
3985
3986
static inline void McHorVer03_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
3987
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
3988
  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
3989
  if (iWidth == 16) {
3990
    McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
3991
    PixelAvgWidthEq16_mmi (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
3992
  } else if (iWidth == 8) {
3993
    McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
3994
    PixelAvgWidthEq8_mmi (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
3995
  } else {
3996
    McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight);
3997
    PixelAvgWidthEq4_mmi (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
3998
  }
3999
}
4000
4001
static inline void McHorVer10_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4002
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4003
  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
4004
  if (iWidth == 16) {
4005
    McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
4006
    PixelAvgWidthEq16_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
4007
  } else if (iWidth == 8) {
4008
    McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
4009
    PixelAvgWidthEq8_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
4010
  } else {
4011
    McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
4012
    PixelAvgWidthEq4_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
4013
  }
4014
}
4015
4016
static inline void McHorVer11_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4017
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4018
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4019
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
4020
  if (iWidth == 16) {
4021
    McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4022
    McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight);
4023
    PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4024
  } else if (iWidth == 8) {
4025
    McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4026
    McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight);
4027
    PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4028
  } else {
4029
    McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4030
    McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
4031
    PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4032
  }
4033
}
4034
4035
static inline void McHorVer12_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4036
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4037
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
4038
  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
4039
  if (iWidth == 16) {
4040
    McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight);
4041
    McHorVer22WidthEq16_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
4042
    PixelAvgWidthEq16_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
4043
  } else if (iWidth == 8) {
4044
    McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight);
4045
    McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
4046
    PixelAvgWidthEq8_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
4047
  } else {
4048
    McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
4049
    McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
4050
    PixelAvgWidthEq4_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
4051
  }
4052
}
4053
static inline void McHorVer13_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4054
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4055
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4056
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
4057
  if (iWidth == 16) {
4058
    McHorVer20WidthEq16_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4059
    McHorVer02WidthEq16_mmi (pSrc,            iSrcStride, pVerTmp, 16, iHeight);
4060
    PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4061
  } else if (iWidth == 8) {
4062
    McHorVer20WidthEq8_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4063
    McHorVer02WidthEq8_mmi (pSrc,            iSrcStride, pVerTmp, 16, iHeight);
4064
    PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4065
  } else {
4066
    McHorVer20WidthEq4_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4067
    McHorVer02_c (pSrc,            iSrcStride, pVerTmp, 16, 4 , iHeight);
4068
    PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4069
  }
4070
}
4071
static inline void McHorVer21_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4072
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4073
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4074
  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
4075
  if (iWidth == 16) {
4076
    McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4077
    McHorVer22WidthEq16_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
4078
    PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
4079
  } else if (iWidth == 8) {
4080
    McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4081
    McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
4082
    PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
4083
  } else {
4084
    McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4085
    McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
4086
    PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
4087
  }
4088
}
4089
4090
static inline void McHorVer23_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4091
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4092
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4093
  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
4094
  if (iWidth == 16) {
4095
    McHorVer20WidthEq16_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4096
    McHorVer22WidthEq16_mmi (pSrc,            iSrcStride, pCtrTmp, 16, iHeight);
4097
    PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
4098
  } else if (iWidth == 8) {
4099
    McHorVer20WidthEq8_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4100
    McHorVer22WidthEq8_mmi (pSrc,            iSrcStride, pCtrTmp, 16, iHeight);
4101
    PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
4102
  } else {
4103
    McHorVer20WidthEq4_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4104
    McHorVer22_c (pSrc,            iSrcStride, pCtrTmp, 16, 4, iHeight);
4105
    PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
4106
  }
4107
}
4108
static inline void McHorVer30_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4109
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4110
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4111
  if (iWidth == 16) {
4112
    McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4113
    PixelAvgWidthEq16_mmi (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
4114
  } else if (iWidth == 8) {
4115
    McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4116
    PixelAvgWidthEq8_mmi (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
4117
  } else {
4118
    McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4119
    PixelAvgWidthEq4_mmi (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
4120
  }
4121
}
4122
static inline void McHorVer31_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4123
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4124
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4125
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
4126
  if (iWidth == 16) {
4127
    McHorVer20WidthEq16_mmi (pSrc,   iSrcStride, pHorTmp, 16, iHeight);
4128
    McHorVer02WidthEq16_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
4129
    PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4130
  } else if (iWidth == 8) {
4131
    McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4132
    McHorVer02WidthEq8_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
4133
    PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4134
  } else {
4135
    McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4136
    McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
4137
    PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4138
  }
4139
}
4140
static inline void McHorVer32_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4141
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4142
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
4143
  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
4144
  if (iWidth == 16) {
4145
    McHorVer02WidthEq16_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
4146
    McHorVer22WidthEq16_mmi (pSrc,   iSrcStride, pCtrTmp, 16, iHeight);
4147
    PixelAvgWidthEq16_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
4148
  } else if (iWidth == 8) {
4149
    McHorVer02WidthEq8_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
4150
    McHorVer22WidthEq8_mmi (pSrc,   iSrcStride, pCtrTmp, 16, iHeight);
4151
    PixelAvgWidthEq8_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
4152
  } else {
4153
    McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
4154
    McHorVer22_c (pSrc,   iSrcStride, pCtrTmp, 16, 4, iHeight);
4155
    PixelAvgWidthEq4_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
4156
  }
4157
}
4158
static inline void McHorVer33_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4159
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4160
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4161
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
4162
  if (iWidth == 16) {
4163
    McHorVer20WidthEq16_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4164
    McHorVer02WidthEq16_mmi (pSrc + 1,          iSrcStride, pVerTmp, 16, iHeight);
4165
    PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4166
  } else if (iWidth == 8) {
4167
    McHorVer20WidthEq8_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4168
    McHorVer02WidthEq8_mmi (pSrc + 1,          iSrcStride, pVerTmp, 16, iHeight);
4169
    PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4170
  } else {
4171
    McHorVer20WidthEq4_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4172
    McHorVer02_c (pSrc + 1,          iSrcStride, pVerTmp, 16, 4, iHeight);
4173
    PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4174
  }
4175
}
4176
4177
void McLuma_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
4178
                int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
4179
  static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
4180
    {McCopy_mmi,     McHorVer01_mmi, McHorVer02_mmi, McHorVer03_mmi},
4181
    {McHorVer10_mmi, McHorVer11_mmi, McHorVer12_mmi, McHorVer13_mmi},
4182
    {McHorVer20_mmi, McHorVer21_mmi, McHorVer22_mmi, McHorVer23_mmi},
4183
    {McHorVer30_mmi, McHorVer31_mmi, McHorVer32_mmi, McHorVer33_mmi},
4184
  };
4185
4186
  pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
4187
}
4188
4189
void PixelAvg_mmi(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
4190
                  const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
4191
  static const PWelsSampleWidthAveragingFunc kpfFuncs[2] = {
4192
    PixelAvgWidthEq8_mmi,
4193
    PixelAvgWidthEq16_mmi
4194
  };
4195
  kpfFuncs[iWidth >> 4] (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
4196
}
4197
#endif//HAVE_MMI
4198
4199
#if defined(HAVE_LSX)
4200
static inline void McCopy_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4201
                              int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4202
  if (iWidth == 16)
4203
    McCopyWidthEq16_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
4204
  else if (iWidth == 8)
4205
    McCopyWidthEq8_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
4206
  else if (iWidth == 4)
4207
    McCopyWidthEq4_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
4208
  else
4209
    McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
4210
}
4211
4212
void McChroma_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4213
                  int32_t iDstStride, int16_t iMvX, int16_t iMvY,
4214
                  int32_t iWidth, int32_t iHeight) {
4215
  static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = {
4216
    McChromaWidthEq4_lsx,
4217
    McChromaWidthEq8_lsx
4218
  };
4219
  const int32_t kiD8x = iMvX & 0x07;
4220
  const int32_t kiD8y = iMvY & 0x07;
4221
  if (kiD8x == 0 && kiD8y == 0) {
4222
    McCopy_lsx (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
4223
    return;
4224
  }
4225
  if (iWidth != 2) {
4226
    kpMcChromaWidthFuncs[iWidth >> 3] (pSrc, iSrcStride, pDst, iDstStride,
4227
                                       g_kuiABCD[kiD8y][kiD8x], iHeight);
4228
  } else
4229
    McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY,
4230
                          iWidth, iHeight);
4231
}
4232
4233
void PixelAvg_lsx(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA,
4234
                  int32_t iSrcAStride, const uint8_t* pSrcB, int32_t iSrcBStride,
4235
                  int32_t iWidth, int32_t iHeight) {
4236
  static const PWelsSampleWidthAveragingFunc kpfFuncs[2] = {
4237
    PixelAvgWidthEq8_lsx,
4238
    PixelAvgWidthEq16_lsx
4239
  };
4240
  kpfFuncs[iWidth >> 4] (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
4241
}
4242
4243
static inline void McHorVer01_lsx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4244
                                   int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4245
  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
4246
  if (iWidth == 16) {
4247
    McHorVer02WidthEq16_lsx (pSrc, iSrcStride, pTmp, 16, iHeight);
4248
    PixelAvgWidthEq16_lsx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
4249
  } else if (iWidth == 8) {
4250
    McHorVer02WidthEq8_lsx (pSrc, iSrcStride, pTmp, 16, iHeight);
4251
    PixelAvgWidthEq8_lsx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
4252
  } else {
4253
    McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight);
4254
    PixelAvgWidthEq4_lsx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
4255
  }
4256
}
4257
4258
static inline void McHorVer02_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4259
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4260
  if (iWidth == 16)
4261
    McHorVer02WidthEq16_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
4262
  else if (iWidth == 8)
4263
    McHorVer02WidthEq8_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
4264
  else
4265
    McHorVer02_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
4266
}
4267
4268
static inline void McHorVer03_lsx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4269
                                   int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4270
  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
4271
  if (iWidth == 16) {
4272
    McHorVer02WidthEq16_lsx (pSrc, iSrcStride, pTmp, 16, iHeight);
4273
    PixelAvgWidthEq16_lsx (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
4274
  } else if (iWidth == 8) {
4275
    McHorVer02WidthEq8_lsx (pSrc, iSrcStride, pTmp, 16, iHeight);
4276
    PixelAvgWidthEq8_lsx (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
4277
  } else {
4278
    McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight);
4279
    PixelAvgWidthEq4_lsx (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
4280
  }
4281
}
4282
4283
static inline void McHorVer10_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4284
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4285
  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
4286
  if (iWidth == 16) {
4287
    McHorVer20WidthEq16_lsx (pSrc, iSrcStride, pTmp, 16, iHeight);
4288
    PixelAvgWidthEq16_lsx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
4289
  } else if (iWidth == 8) {
4290
    McHorVer20WidthEq8_lsx (pSrc, iSrcStride, pTmp, 16, iHeight);
4291
    PixelAvgWidthEq8_lsx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
4292
  } else {
4293
    McHorVer20WidthEq4_lsx (pSrc, iSrcStride, pTmp, 16, iHeight);
4294
    PixelAvgWidthEq4_lsx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
4295
  }
4296
}
4297
4298
static inline void McHorVer11_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4299
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4300
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4301
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
4302
  if (iWidth == 16) {
4303
    McHorVer20WidthEq16_lsx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4304
    McHorVer02WidthEq16_lsx (pSrc, iSrcStride, pVerTmp, 16, iHeight);
4305
    PixelAvgWidthEq16_lsx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4306
  } else if (iWidth == 8) {
4307
    McHorVer20WidthEq8_lsx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4308
    McHorVer02WidthEq8_lsx (pSrc, iSrcStride, pVerTmp, 16, iHeight);
4309
    PixelAvgWidthEq8_lsx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4310
  } else {
4311
    McHorVer20WidthEq4_lsx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4312
    McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
4313
    PixelAvgWidthEq4_lsx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4314
  }
4315
}
4316
4317
static inline void McHorVer22WidthEq16_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4318
                                           int32_t iDstStride, int32_t iHeight) {
4319
  McHorVer22WidthEq8_lsx (pSrc,     iSrcStride, pDst,     iDstStride, iHeight);
4320
  McHorVer22WidthEq8_lsx (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
4321
}
4322
4323
static inline void McHorVer12_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4324
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4325
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
4326
  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
4327
  if (iWidth == 16) {
4328
    McHorVer02WidthEq16_lsx (pSrc, iSrcStride, pVerTmp, 16, iHeight);
4329
    McHorVer22WidthEq16_lsx (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
4330
    PixelAvgWidthEq16_lsx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
4331
  } else if (iWidth == 8) {
4332
    McHorVer02WidthEq8_lsx (pSrc, iSrcStride, pVerTmp, 16, iHeight);
4333
    McHorVer22WidthEq8_lsx (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
4334
    PixelAvgWidthEq8_lsx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
4335
  } else {
4336
    McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
4337
    McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
4338
    PixelAvgWidthEq4_lsx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
4339
  }
4340
}
4341
4342
static inline void McHorVer13_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4343
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4344
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4345
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
4346
  if (iWidth == 16) {
4347
    McHorVer20WidthEq16_lsx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4348
    McHorVer02WidthEq16_lsx (pSrc, iSrcStride, pVerTmp, 16, iHeight);
4349
    PixelAvgWidthEq16_lsx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4350
  } else if (iWidth == 8) {
4351
    McHorVer20WidthEq8_lsx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4352
    McHorVer02WidthEq8_lsx (pSrc, iSrcStride, pVerTmp, 16, iHeight);
4353
    PixelAvgWidthEq8_lsx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4354
  } else {
4355
    McHorVer20WidthEq4_lsx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4356
    McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
4357
    PixelAvgWidthEq4_lsx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4358
  }
4359
}
4360
4361
static inline void McHorVer20_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4362
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4363
  if (iWidth == 16)
4364
    McHorVer20WidthEq16_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
4365
  else if (iWidth == 8)
4366
    McHorVer20WidthEq8_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
4367
  else
4368
    McHorVer20WidthEq4_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
4369
}
4370
4371
static inline void McHorVer21_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4372
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4373
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4374
  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
4375
  if (iWidth == 16) {
4376
    McHorVer20WidthEq16_lsx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4377
    McHorVer22WidthEq16_lsx (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
4378
    PixelAvgWidthEq16_lsx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
4379
  } else if (iWidth == 8) {
4380
    McHorVer20WidthEq8_lsx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4381
    McHorVer22WidthEq8_lsx (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
4382
    PixelAvgWidthEq8_lsx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
4383
  } else {
4384
    McHorVer20WidthEq4_lsx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4385
    McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
4386
    PixelAvgWidthEq4_lsx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
4387
  }
4388
}
4389
4390
static inline void McHorVer22_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4391
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4392
  if (iWidth == 16)
4393
    McHorVer22WidthEq16_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
4394
  else if (iWidth == 8)
4395
    McHorVer22WidthEq8_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
4396
  else
4397
    McHorVer22_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
4398
}
4399
4400
static inline void McHorVer23_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4401
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4402
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4403
  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
4404
  if (iWidth == 16) {
4405
    McHorVer20WidthEq16_lsx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4406
    McHorVer22WidthEq16_lsx (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
4407
    PixelAvgWidthEq16_lsx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
4408
  } else if (iWidth == 8) {
4409
    McHorVer20WidthEq8_lsx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4410
    McHorVer22WidthEq8_lsx (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
4411
    PixelAvgWidthEq8_lsx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
4412
  } else {
4413
    McHorVer20WidthEq4_lsx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4414
    McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
4415
    PixelAvgWidthEq4_lsx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
4416
  }
4417
}
4418
4419
static inline void McHorVer30_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4420
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4421
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4422
  if (iWidth == 16) {
4423
    McHorVer20WidthEq16_lsx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4424
    PixelAvgWidthEq16_lsx (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
4425
  } else if (iWidth == 8) {
4426
    McHorVer20WidthEq8_lsx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4427
    PixelAvgWidthEq8_lsx (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
4428
  } else {
4429
    McHorVer20WidthEq4_lsx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4430
    PixelAvgWidthEq4_lsx (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
4431
  }
4432
}
4433
4434
static inline void McHorVer31_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4435
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4436
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4437
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
4438
  if (iWidth == 16) {
4439
    McHorVer20WidthEq16_lsx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4440
    McHorVer02WidthEq16_lsx (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
4441
    PixelAvgWidthEq16_lsx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4442
  } else if (iWidth == 8) {
4443
    McHorVer20WidthEq8_lsx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4444
    McHorVer02WidthEq8_lsx (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
4445
    PixelAvgWidthEq8_lsx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4446
  } else {
4447
    McHorVer20WidthEq4_lsx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4448
    McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
4449
    PixelAvgWidthEq4_lsx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4450
  }
4451
}
4452
4453
static inline void McHorVer32_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4454
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4455
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
4456
  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
4457
  if (iWidth == 16) {
4458
    McHorVer02WidthEq16_lsx (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
4459
    McHorVer22WidthEq16_lsx (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
4460
    PixelAvgWidthEq16_lsx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
4461
  } else if (iWidth == 8) {
4462
    McHorVer02WidthEq8_lsx (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
4463
    McHorVer22WidthEq8_lsx (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
4464
    PixelAvgWidthEq8_lsx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
4465
  } else {
4466
    McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
4467
    McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
4468
    PixelAvgWidthEq4_lsx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
4469
  }
4470
}
4471
4472
static inline void McHorVer33_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4473
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4474
  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4475
  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
4476
  if (iWidth == 16) {
4477
    McHorVer20WidthEq16_lsx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4478
    McHorVer02WidthEq16_lsx (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
4479
    PixelAvgWidthEq16_lsx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4480
  } else if (iWidth == 8) {
4481
    McHorVer20WidthEq8_lsx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4482
    McHorVer02WidthEq8_lsx (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
4483
    PixelAvgWidthEq8_lsx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4484
  } else {
4485
    McHorVer20WidthEq4_lsx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4486
    McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
4487
    PixelAvgWidthEq4_lsx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4488
  }
4489
}
4490
4491
void McLuma_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4492
                int32_t iDstStride, int16_t iMvX, int16_t iMvY,
4493
                int32_t iWidth, int32_t iHeight) {
4494
  static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
4495
    {McCopy_lsx,     McHorVer01_lsx, McHorVer02_lsx, McHorVer03_lsx},
4496
    {McHorVer10_lsx, McHorVer11_lsx, McHorVer12_lsx, McHorVer13_lsx},
4497
    {McHorVer20_lsx, McHorVer21_lsx, McHorVer22_lsx, McHorVer23_lsx},
4498
    {McHorVer30_lsx, McHorVer31_lsx, McHorVer32_lsx, McHorVer33_lsx},
4499
  };
4500
  pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
4501
}
4502
4503
static inline void McHorVer20Width5Or9Or17_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4504
                                               int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4505
  if (iWidth == 17) {
4506
      McHorVer20WidthEq17_lsx(pSrc, iSrcStride, pDst, iDstStride, iHeight);
4507
  } else if (iWidth == 9) {
4508
      McHorVer20WidthEq9_lsx(pSrc, iSrcStride, pDst, iDstStride, iHeight);
4509
  } else {//if (iWidth == 5)
4510
      McHorVer20WidthEq5_lsx(pSrc, iSrcStride, pDst, iDstStride, iHeight);
4511
  }
4512
}
4513
4514
void McHorVer22Width5Or9Or17_lsx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4515
                                                int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4516
  if (iWidth == 17) {
4517
    McHorVer22WidthEq17_lsx(pSrc, iSrcStride, pDst, iDstStride, iHeight);
4518
  } else if (iWidth == 9) {
4519
    McHorVer22WidthEq9_lsx(pSrc, iSrcStride, pDst, iDstStride, iHeight);
4520
  } else {
4521
    McHorVer22WidthEq5_lsx(pSrc, iSrcStride, pDst, iDstStride, iHeight);
4522
  }
4523
}
4524
#endif//HAVE_LSX
4525
4526
} // anon ns.
4527
4528
0
// Fills the motion-compensation function table pMcFuncs.
//
// Every entry is first set to its plain C implementation. Each CPU-feature
// section below that is compiled in, and whose flag is set in uiCpuFlag, then
// overwrites the entries it provides. The sections run in source order, so
// when several flags are set, the last matching section that assigns a given
// entry determines its final value.
//
//   pMcFuncs  - table to populate; all six entries assigned here are written.
//   uiCpuFlag - bit mask tested against the WELS_CPU_* feature flags.
void WelsCommon::InitMcFunc (SMcFunc* pMcFuncs, uint32_t uiCpuFlag) {
4529
0
  // Portable C defaults for all six entries.
  pMcFuncs->pfLumaHalfpelHor  = McHorVer20_c;
4530
0
  pMcFuncs->pfLumaHalfpelVer  = McHorVer02_c;
4531
0
  pMcFuncs->pfLumaHalfpelCen  = McHorVer22_c;
4532
0
  pMcFuncs->pfSampleAveraging = PixelAvg_c;
4533
0
  pMcFuncs->pMcChromaFunc     = McChroma_c;
4534
0
  pMcFuncs->pMcLumaFunc       = McLuma_c;
4535
4536
#if defined (X86_ASM)
4537
  // SSE2 replaces all six entries.
  if (uiCpuFlag & WELS_CPU_SSE2) {
4538
    pMcFuncs->pfLumaHalfpelHor  = McHorVer20Width5Or9Or17_sse2;
4539
    pMcFuncs->pfLumaHalfpelVer  = McHorVer02Height5Or9Or17_sse2;
4540
    pMcFuncs->pfLumaHalfpelCen  = McHorVer22Width5Or9Or17Height5Or9Or17_sse2;
4541
    pMcFuncs->pfSampleAveraging = PixelAvg_sse2;
4542
    pMcFuncs->pMcChromaFunc     = McChroma_sse2;
4543
    pMcFuncs->pMcLumaFunc       = McLuma_sse2;
4544
  }
4545
4546
  // SSSE3 replaces five entries; pfSampleAveraging is not assigned here and
  // keeps whatever value it already has.
  if (uiCpuFlag & WELS_CPU_SSSE3) {
4547
    pMcFuncs->pfLumaHalfpelHor  = McHorVer20Width5Or9Or17_ssse3;
4548
    pMcFuncs->pfLumaHalfpelVer  = McHorVer02_ssse3;
4549
    pMcFuncs->pfLumaHalfpelCen  = McHorVer22Width5Or9Or17_ssse3;
4550
    pMcFuncs->pMcChromaFunc = McChroma_ssse3;
4551
    pMcFuncs->pMcLumaFunc   = McLuma_ssse3;
4552
  }
4553
#ifdef HAVE_AVX2
4554
  // AVX2 replaces only the three half-pel entries and pMcLumaFunc;
  // pMcChromaFunc and pfSampleAveraging keep their earlier values.
  if (uiCpuFlag & WELS_CPU_AVX2) {
4555
    pMcFuncs->pfLumaHalfpelHor  = McHorVer20Width5Or9Or17_avx2;
4556
    pMcFuncs->pfLumaHalfpelVer  = McHorVer02_avx2;
4557
    pMcFuncs->pfLumaHalfpelCen  = McHorVer22Width5Or9Or17_avx2;
4558
    pMcFuncs->pMcLumaFunc       = McLuma_avx2;
4559
  }
4560
#endif
4561
#endif //(X86_ASM)
4562
4563
#if defined(HAVE_NEON)
4564
  // NEON (HAVE_NEON build) replaces all six entries.
  if (uiCpuFlag & WELS_CPU_NEON) {
4565
    pMcFuncs->pMcLumaFunc       = McLuma_neon;
4566
    pMcFuncs->pMcChromaFunc     = McChroma_neon;
4567
    pMcFuncs->pfSampleAveraging = PixelAvg_neon;
4568
    pMcFuncs->pfLumaHalfpelHor  = McHorVer20Width5Or9Or17_neon;//iWidth+1:4/8/16
4569
    pMcFuncs->pfLumaHalfpelVer  = McHorVer02Height5Or9Or17_neon;//heigh+1:4/8/16
4570
    pMcFuncs->pfLumaHalfpelCen  = McHorVer22Width5Or9Or17Height5Or9Or17_neon;//iWidth+1/heigh+1
4571
  }
4572
#endif
4573
#if defined(HAVE_NEON_AARCH64)
4574
  // AArch64 NEON build: tested with the same WELS_CPU_NEON flag, replaces all
  // six entries with the AArch64 variants.
  if (uiCpuFlag & WELS_CPU_NEON) {
4575
    pMcFuncs->pMcLumaFunc       = McLuma_AArch64_neon;
4576
    pMcFuncs->pMcChromaFunc     = McChroma_AArch64_neon;
4577
    pMcFuncs->pfSampleAveraging = PixelAvg_AArch64_neon;
4578
    pMcFuncs->pfLumaHalfpelHor  = McHorVer20Width5Or9Or17_AArch64_neon;//iWidth+1:4/8/16
4579
    pMcFuncs->pfLumaHalfpelVer  = McHorVer02Height5Or9Or17_AArch64_neon;//heigh+1:4/8/16
4580
    pMcFuncs->pfLumaHalfpelCen  = McHorVer22Width5Or9Or17Height5Or9Or17_AArch64_neon;//iWidth+1/heigh+1
4581
  }
4582
#endif
4583
4584
#if defined(HAVE_MMI)
4585
  // MMI replaces all six entries.
  if (uiCpuFlag & WELS_CPU_MMI) {
4586
    pMcFuncs->pfLumaHalfpelHor  = McHorVer20Width5Or9Or17_mmi;
4587
    pMcFuncs->pfLumaHalfpelVer  = McHorVer02Height5Or9Or17_mmi;
4588
    pMcFuncs->pfLumaHalfpelCen  = McHorVer22Width5Or9Or17Height5Or9Or17_mmi;
4589
    pMcFuncs->pfSampleAveraging = PixelAvg_mmi;
4590
    pMcFuncs->pMcChromaFunc     = McChroma_mmi;
4591
    pMcFuncs->pMcLumaFunc       = McLuma_mmi;
4592
  }
4593
#endif//HAVE_MMI
4594
4595
#if defined(HAVE_LSX)
4596
  // LSX replaces all six entries.
  if (uiCpuFlag & WELS_CPU_LSX) {
4597
    pMcFuncs->pMcChromaFunc     = McChroma_lsx;
4598
    pMcFuncs->pfSampleAveraging = PixelAvg_lsx;
4599
    pMcFuncs->pMcLumaFunc       = McLuma_lsx;
4600
    pMcFuncs->pfLumaHalfpelVer  = McHorVer02_lsx;
4601
    pMcFuncs->pfLumaHalfpelHor  = McHorVer20Width5Or9Or17_lsx;
4602
    pMcFuncs->pfLumaHalfpelCen  = McHorVer22Width5Or9Or17_lsx;
4603
  }
4604
#endif//HAVE_LSX
4605
0
}