Coverage Report

Created: 2022-08-24 06:11

/src/x265/source/encoder/motion.cpp
Line | Count | Source
1
/*****************************************************************************
2
 * Copyright (C) 2013-2020 MulticoreWare, Inc
3
 *
4
 * Authors: Steve Borho <steve@borho.org>
5
 *          Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
 * the Free Software Foundation; either version 2 of the License, or
10
 * (at your option) any later version.
11
 *
12
 * This program is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 * GNU General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU General Public License
18
 * along with this program; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
 *
21
 * This program is also available under a commercial proprietary license.
22
 * For more information, contact us at license @ x265.com.
23
 *****************************************************************************/
24
25
#include "common.h"
26
#include "primitives.h"
27
#include "lowres.h"
28
#include "motion.h"
29
#include "x265.h"
30
31
#if _MSC_VER
32
#pragma warning(disable: 4127) // conditional expression is constant (macros use this construct)
33
#endif
34
35
using namespace X265_NS;
36
37
namespace {
38
39
struct SubpelWorkload
40
{
41
    int hpel_iters;
42
    int hpel_dirs;
43
    int qpel_iters;
44
    int qpel_dirs;
45
    bool hpel_satd;
46
};
47
48
const SubpelWorkload workload[X265_MAX_SUBPEL_LEVEL + 1] =
49
{
50
    { 1, 4, 0, 4, false }, // 4 SAD HPEL only
51
    { 1, 4, 1, 4, false }, // 4 SAD HPEL + 4 SATD QPEL
52
    { 1, 4, 1, 4, true },  // 4 SATD HPEL + 4 SATD QPEL
53
    { 2, 4, 1, 4, true },  // 2x4 SATD HPEL + 4 SATD QPEL
54
    { 2, 4, 2, 4, true },  // 2x4 SATD HPEL + 2x4 SATD QPEL
55
    { 1, 8, 1, 8, true },  // 8 SATD HPEL + 8 SATD QPEL (default)
56
    { 2, 8, 1, 8, true },  // 2x8 SATD HPEL + 8 SATD QPEL
57
    { 2, 8, 2, 8, true },  // 2x8 SATD HPEL + 2x8 SATD QPEL
58
};
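/* Editor-added illustration (not part of motion.cpp): each subme level in the
 * table above bounds the subpel refinement effort. A minimal, self-contained
 * sketch, assuming every iteration visits all of its directions (the real
 * loops below exit early once no direction improves the cost); the struct and
 * helper names here are hypothetical. */
#include <cstdio>

struct WorkloadSketch { int hpel_iters, hpel_dirs, qpel_iters, qpel_dirs; bool hpel_satd; };

static int maxSubpelCompares(const WorkloadSketch& wl)
{
    // one block comparison per direction, per iteration, for HPEL then QPEL
    return wl.hpel_iters * wl.hpel_dirs + wl.qpel_iters * wl.qpel_dirs;
}

int main()
{
    WorkloadSketch level5 = { 1, 8, 1, 8, true }; // default entry from the table above
    WorkloadSketch level7 = { 2, 8, 2, 8, true }; // most thorough entry
    std::printf("level 5: up to %d compares\n", maxSubpelCompares(level5)); // prints 16
    std::printf("level 7: up to %d compares\n", maxSubpelCompares(level7)); // prints 32
    return 0;
}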
59
60
static int sizeScale[NUM_PU_SIZES];
61
0
#define SAD_THRESH(v) (bcost < (((v >> 4) * sizeScale[partEnum])))
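/* Editor note (illustrative): SAD_THRESH scales a nominal threshold v with the
 * block size via sizeScale, which initScales() below fills as (H * H) >> 4.
 * For a 16x16 PU, sizeScale = (16 * 16) >> 4 = 16, so SAD_THRESH(2000) tests
 * bcost < (2000 >> 4) * 16 = 2000; for a 64x64 PU, sizeScale = 256 and the
 * same call tests bcost < 125 * 256 = 32000. */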
62
63
/* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
64
const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) };
65
const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 };  /* (x-1)%6 */
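/* Editor note (illustrative): in the hexagon search below, once the best point
 * lies in direction d, the next iteration probes only hex2[d], hex2[d + 1] and
 * hex2[d + 2] (the half of the hexagon not already covered), and mod6m1 folds
 * the running direction index back into 0..5 without an actual modulo. */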
66
const MV square1[9] = { MV(0, 0), MV(0, -1), MV(0, 1), MV(-1, 0), MV(1, 0), MV(-1, -1), MV(-1, 1), MV(1, -1), MV(1, 1) };
67
const MV hex4[16] =
68
{
69
    MV(0, -4), MV(0, 4), MV(-2, -3), MV(2, -3),
70
    MV(-4, -2), MV(4, -2), MV(-4, -1), MV(4, -1),
71
    MV(-4, 0), MV(4, 0), MV(-4, 1), MV(4, 1),
72
    MV(-4, 2), MV(4, 2), MV(-2, 3), MV(2, 3),
73
};
74
const MV offsets[] =
75
{
76
    MV(-1, 0), MV(0, -1),
77
    MV(-1, -1), MV(1, -1),
78
    MV(-1, 0), MV(1, 0),
79
    MV(-1, 1), MV(-1, -1),
80
    MV(1, -1), MV(1, 1),
81
    MV(-1, 0), MV(0, 1),
82
    MV(-1, 1), MV(1, 1),
83
    MV(1, 0), MV(0, 1),
84
}; // offsets for Two Point Search
85
86
/* sum of absolute differences between MV candidates, used for adaptive ME range */
87
inline int predictorDifference(const MV *mvc, intptr_t numCandidates)
88
0
{
89
0
    int sum = 0;
90
91
0
    for (int i = 0; i < numCandidates - 1; i++)
92
0
    {
93
0
        sum += abs(mvc[i].x - mvc[i + 1].x)
94
0
            +  abs(mvc[i].y - mvc[i + 1].y);
95
0
    }
96
97
0
    return sum;
98
0
}
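/* Editor note (illustrative): with candidates (2,1), (4,1) and (5,3) the sum is
 * |2-4| + |1-1| + |4-5| + |1-3| = 2 + 3 = 5; a larger value means the predictors
 * disagree, which the UMH search below uses to widen its adaptive range. */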
99
100
}
101
102
MotionEstimate::MotionEstimate()
103
0
{
104
0
    ctuAddr = -1;
105
0
    absPartIdx = -1;
106
0
    searchMethod = X265_HEX_SEARCH;
107
0
    searchMethodL0 = X265_HEX_SEARCH;
108
0
    searchMethodL1 = X265_HEX_SEARCH;
109
0
    subpelRefine = 2;
110
0
    blockwidth = blockheight = 0;
111
0
    blockOffset = 0;
112
0
    bChromaSATD = false;
113
0
    chromaSatd = NULL;
114
0
    for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
115
0
        integral[i] = NULL;
116
0
}
117
118
void MotionEstimate::init(int csp)
119
0
{
120
0
    fencPUYuv.create(FENC_STRIDE, csp);
121
0
}
122
123
void MotionEstimate::initScales(void)
124
0
{
125
0
#define SETUP_SCALE(W, H) \
126
0
    sizeScale[LUMA_ ## W ## x ## H] = (H * H) >> 4;
127
0
    SETUP_SCALE(4, 4);
128
0
    SETUP_SCALE(8, 8);
129
0
    SETUP_SCALE(8, 4);
130
0
    SETUP_SCALE(4, 8);
131
0
    SETUP_SCALE(16, 16);
132
0
    SETUP_SCALE(16, 8);
133
0
    SETUP_SCALE(8, 16);
134
0
    SETUP_SCALE(16, 12);
135
0
    SETUP_SCALE(12, 16);
136
0
    SETUP_SCALE(4, 16);
137
0
    SETUP_SCALE(16, 4);
138
0
    SETUP_SCALE(32, 32);
139
0
    SETUP_SCALE(32, 16);
140
0
    SETUP_SCALE(16, 32);
141
0
    SETUP_SCALE(32, 24);
142
0
    SETUP_SCALE(24, 32);
143
0
    SETUP_SCALE(32, 8);
144
0
    SETUP_SCALE(8, 32);
145
0
    SETUP_SCALE(64, 64);
146
0
    SETUP_SCALE(64, 32);
147
0
    SETUP_SCALE(32, 64);
148
0
    SETUP_SCALE(64, 48);
149
0
    SETUP_SCALE(48, 64);
150
0
    SETUP_SCALE(64, 16);
151
0
    SETUP_SCALE(16, 64);
152
0
#undef SETUP_SCALE
153
0
}
154
155
int MotionEstimate::hpelIterationCount(int subme)
156
0
{
157
0
    return workload[subme].hpel_iters +
158
0
           workload[subme].qpel_iters / 2;
159
0
}
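/* Editor note (illustrative): the division truncates, e.g. the default subme
 * level 5 yields 1 + 1/2 = 1 iteration, while level 7 yields 2 + 2/2 = 3. */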
160
161
MotionEstimate::~MotionEstimate()
162
0
{
163
0
    fencPUYuv.destroy();
164
0
}
165
166
/* Called by lookahead, luma only, no use of PicYuv */
167
void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int method, const int searchL0, const int searchL1, const int refine)
168
0
{
169
0
    partEnum = partitionFromSizes(pwidth, pheight);
170
0
    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
171
0
    sad = primitives.pu[partEnum].sad;
172
0
    ads = primitives.pu[partEnum].ads;
173
0
    satd = primitives.pu[partEnum].satd;
174
0
    sad_x3 = primitives.pu[partEnum].sad_x3;
175
0
    sad_x4 = primitives.pu[partEnum].sad_x4;
176
177
178
0
    blockwidth = pwidth;
179
0
    blockOffset = offset;
180
0
    absPartIdx = ctuAddr = -1;
181
182
    /* Search params */
183
0
    searchMethod = method;
184
0
    searchMethodL0 = searchL0;
185
0
    searchMethodL1 = searchL1;
186
0
    subpelRefine = refine;
187
188
    /* copy PU block into cache */
189
0
    primitives.pu[partEnum].copy_pp(fencPUYuv.m_buf[0], FENC_STRIDE, fencY + offset, stride);
190
0
    X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
191
0
}
192
193
/* Called by Search::predInterSearch() or --pme equivalent, chroma residual might be considered */
194
void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight, const int method, const int refine, bool bChroma)
195
0
{
196
0
    partEnum = partitionFromSizes(pwidth, pheight);
197
0
    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
198
0
    sad = primitives.pu[partEnum].sad;
199
0
    ads = primitives.pu[partEnum].ads;
200
0
    satd = primitives.pu[partEnum].satd;
201
0
    sad_x3 = primitives.pu[partEnum].sad_x3;
202
0
    sad_x4 = primitives.pu[partEnum].sad_x4;
203
204
0
    chromaSatd = primitives.chroma[fencPUYuv.m_csp].pu[partEnum].satd;
205
206
    /* Set search characteristics */
207
0
    searchMethod = method;
208
0
    subpelRefine = refine;
209
210
    /* Enable chroma residual cost if subpelRefine level is greater than 2 and chroma block size
211
     * is an even multiple of 4x4 pixels (indicated by non-null chromaSatd pointer) */
212
0
    bChromaSATD = subpelRefine > 2 && chromaSatd && (srcFencYuv.m_csp != X265_CSP_I400 && bChroma);
213
0
    X265_CHECK(!(bChromaSATD && !workload[subpelRefine].hpel_satd), "Chroma SATD cannot be used with SAD hpel\n");
214
215
0
    ctuAddr = _ctuAddr;
216
0
    absPartIdx = cuPartIdx + puPartIdx;
217
0
    blockwidth = pwidth;
218
0
    blockOffset = 0;
219
220
    /* copy PU from CU Yuv */
221
0
    fencPUYuv.copyPUFromYuv(srcFencYuv, puPartIdx, partEnum, bChromaSATD);
222
0
}
223
224
#define COST_MV_PT_DIST(mx, my, point, dist) \
225
0
    do \
226
0
    { \
227
0
        MV tmv(mx, my); \
228
0
        int cost = sad(fenc, FENC_STRIDE, fref + mx + my * stride, stride); \
229
0
        cost += mvcost(tmv << 2); \
230
0
        if (cost < bcost) { \
231
0
            bcost = cost; \
232
0
            bmv = tmv; \
233
0
            bPointNr = point; \
234
0
            bDistance = dist; \
235
0
        } \
236
0
    } while (0)
237
238
#define COST_MV(mx, my) \
239
0
    do \
240
0
    { \
241
0
        int cost = sad(fenc, FENC_STRIDE, fref + (mx) + (my) * stride, stride); \
242
0
        cost += mvcost(MV(mx, my) << 2); \
243
0
        COPY2_IF_LT(bcost, cost, bmv, MV(mx, my)); \
244
0
    } while (0)
245
246
#define COST_MV_X3_DIR(m0x, m0y, m1x, m1y, m2x, m2y, costs) \
247
0
    { \
248
0
        pixel *pix_base = fref + bmv.x + bmv.y * stride; \
249
0
        sad_x3(fenc, \
250
0
               pix_base + (m0x) + (m0y) * stride, \
251
0
               pix_base + (m1x) + (m1y) * stride, \
252
0
               pix_base + (m2x) + (m2y) * stride, \
253
0
               stride, costs); \
254
0
        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
255
0
        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
256
0
        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
257
0
    }
258
259
#define COST_MV_PT_DIST_X4(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \
260
0
    { \
261
0
        sad_x4(fenc, \
262
0
               fref + (m0x) + (m0y) * stride, \
263
0
               fref + (m1x) + (m1y) * stride, \
264
0
               fref + (m2x) + (m2y) * stride, \
265
0
               fref + (m3x) + (m3y) * stride, \
266
0
               stride, costs); \
267
0
        (costs)[0] += mvcost(MV(m0x, m0y) << 2); \
268
0
        (costs)[1] += mvcost(MV(m1x, m1y) << 2); \
269
0
        (costs)[2] += mvcost(MV(m2x, m2y) << 2); \
270
0
        (costs)[3] += mvcost(MV(m3x, m3y) << 2); \
271
0
        COPY4_IF_LT(bcost, costs[0], bmv, MV(m0x, m0y), bPointNr, p0, bDistance, d0); \
272
0
        COPY4_IF_LT(bcost, costs[1], bmv, MV(m1x, m1y), bPointNr, p1, bDistance, d1); \
273
0
        COPY4_IF_LT(bcost, costs[2], bmv, MV(m2x, m2y), bPointNr, p2, bDistance, d2); \
274
0
        COPY4_IF_LT(bcost, costs[3], bmv, MV(m3x, m3y), bPointNr, p3, bDistance, d3); \
275
0
    }
276
277
#define COST_MV_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y) \
278
0
    { \
279
0
        pixel *pix_base = fref + omv.x + omv.y * stride; \
280
0
        sad_x4(fenc, \
281
0
               pix_base + (m0x) + (m0y) * stride, \
282
0
               pix_base + (m1x) + (m1y) * stride, \
283
0
               pix_base + (m2x) + (m2y) * stride, \
284
0
               pix_base + (m3x) + (m3y) * stride, \
285
0
               stride, costs); \
286
0
        costs[0] += mvcost((omv + MV(m0x, m0y)) << 2); \
287
0
        costs[1] += mvcost((omv + MV(m1x, m1y)) << 2); \
288
0
        costs[2] += mvcost((omv + MV(m2x, m2y)) << 2); \
289
0
        costs[3] += mvcost((omv + MV(m3x, m3y)) << 2); \
290
0
        if ((omv.y + m0y >= mvmin.y) & (omv.y + m0y <= mvmax.y)) \
291
0
            COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \
292
0
        if ((omv.y + m1y >= mvmin.y) & (omv.y + m1y <= mvmax.y)) \
293
0
            COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \
294
0
        if ((omv.y + m2y >= mvmin.y) & (omv.y + m2y <= mvmax.y)) \
295
0
            COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \
296
0
        if ((omv.y + m3y >= mvmin.y) & (omv.y + m3y <= mvmax.y)) \
297
0
            COPY2_IF_LT(bcost, costs[3], bmv, omv + MV(m3x, m3y)); \
298
0
    }
299
300
0
#define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y )\
301
0
{\
302
0
    sad_x3(fenc, \
303
0
    fref + (m0x) + (m0y) * stride, \
304
0
    fref + (m1x) + (m1y) * stride, \
305
0
    fref + (m2x) + (m2y) * stride, \
306
0
    stride, costs); \
307
0
    costs[0] += p_cost_mvx[(m0x) << 2]; /* no cost_mvy */\
308
0
    costs[1] += p_cost_mvx[(m1x) << 2]; \
309
0
    costs[2] += p_cost_mvx[(m2x) << 2]; \
310
0
    COPY3_IF_LT(bcost, costs[0], bmv.x, m0x, bmv.y, m0y); \
311
0
    COPY3_IF_LT(bcost, costs[1], bmv.x, m1x, bmv.y, m1y); \
312
0
    COPY3_IF_LT(bcost, costs[2], bmv.x, m2x, bmv.y, m2y); \
313
0
}
314
315
#define COST_MV_X4_DIR(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs) \
316
0
    { \
317
0
        pixel *pix_base = fref + bmv.x + bmv.y * stride; \
318
0
        sad_x4(fenc, \
319
0
               pix_base + (m0x) + (m0y) * stride, \
320
0
               pix_base + (m1x) + (m1y) * stride, \
321
0
               pix_base + (m2x) + (m2y) * stride, \
322
0
               pix_base + (m3x) + (m3y) * stride, \
323
0
               stride, costs); \
324
0
        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
325
0
        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
326
0
        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
327
0
        (costs)[3] += mvcost((bmv + MV(m3x, m3y)) << 2); \
328
0
    }
329
330
#define DIA1_ITER(mx, my) \
331
0
    { \
332
0
        omv.x = mx; omv.y = my; \
333
0
        COST_MV_X4(0, -1, 0, 1, -1, 0, 1, 0); \
334
0
    }
335
336
#define CROSS(start, x_max, y_max) \
337
0
    { \
338
0
        int16_t i = start; \
339
0
        if ((x_max) <= X265_MIN(mvmax.x - omv.x, omv.x - mvmin.x)) \
340
0
            for (; i < (x_max) - 2; i += 4) { \
341
0
                COST_MV_X4(i, 0, -i, 0, i + 2, 0, -i - 2, 0); } \
342
0
        for (; i < (x_max); i += 2) \
343
0
        { \
344
0
            if (omv.x + i <= mvmax.x) \
345
0
                COST_MV(omv.x + i, omv.y); \
346
0
            if (omv.x - i >= mvmin.x) \
347
0
                COST_MV(omv.x - i, omv.y); \
348
0
        } \
349
0
        i = start; \
350
0
        if ((y_max) <= X265_MIN(mvmax.y - omv.y, omv.y - mvmin.y)) \
351
0
            for (; i < (y_max) - 2; i += 4) { \
352
0
                COST_MV_X4(0, i, 0, -i, 0, i + 2, 0, -i - 2); } \
353
0
        for (; i < (y_max); i += 2) \
354
0
        { \
355
0
            if (omv.y + i <= mvmax.y) \
356
0
                COST_MV(omv.x, omv.y + i); \
357
0
            if (omv.y - i >= mvmin.y) \
358
0
                COST_MV(omv.x, omv.y - i); \
359
0
        } \
360
0
    }
361
362
void MotionEstimate::StarPatternSearch(ReferencePlanes *ref,
363
                                       const MV &       mvmin,
364
                                       const MV &       mvmax,
365
                                       MV &             bmv,
366
                                       int &            bcost,
367
                                       int &            bPointNr,
368
                                       int &            bDistance,
369
                                       int              earlyExitIters,
370
                                       int              merange,
371
                                       int              hme)
372
0
{
373
0
    ALIGN_VAR_16(int, costs[16]);
374
0
    pixel* fenc = fencPUYuv.m_buf[0];
375
0
    pixel* fref = (hme? ref->fpelLowerResPlane[0] : ref->fpelPlane[0]) + blockOffset;
376
0
    intptr_t stride = hme? ref->lumaStride / 2 : ref->lumaStride;
377
378
0
    MV omv = bmv;
379
0
    int saved = bcost;
380
0
    int rounds = 0;
381
382
0
    {
383
0
        int16_t dist = 1;
384
385
        /* bPointNr
386
              2
387
            4 * 5
388
              7
389
         */
390
0
        const int32_t top    = omv.y - dist;
391
0
        const int32_t bottom = omv.y + dist;
392
0
        const int32_t left   = omv.x - dist;
393
0
        const int32_t right  = omv.x + dist;
394
395
0
        if (top >= mvmin.y && left >= mvmin.x && right <= mvmax.x && bottom <= mvmax.y)
396
0
        {
397
0
            COST_MV_PT_DIST_X4(omv.x,  top,    2, dist,
398
0
                               left,  omv.y,   4, dist,
399
0
                               right, omv.y,   5, dist,
400
0
                               omv.x,  bottom, 7, dist);
401
0
        }
402
0
        else
403
0
        {
404
0
            if (top >= mvmin.y) // check top
405
0
            {
406
0
                COST_MV_PT_DIST(omv.x, top, 2, dist);
407
0
            }
408
0
            if (left >= mvmin.x) // check middle left
409
0
            {
410
0
                COST_MV_PT_DIST(left, omv.y, 4, dist);
411
0
            }
412
0
            if (right <= mvmax.x) // check middle right
413
0
            {
414
0
                COST_MV_PT_DIST(right, omv.y, 5, dist);
415
0
            }
416
0
            if (bottom <= mvmax.y) // check bottom
417
0
            {
418
0
                COST_MV_PT_DIST(omv.x, bottom, 7, dist);
419
0
            }
420
0
        }
421
0
        if (bcost < saved)
422
0
            rounds = 0;
423
0
        else if (++rounds >= earlyExitIters)
424
0
            return;
425
0
    }
426
427
0
    for (int16_t dist = 2; dist <= 8; dist <<= 1)
428
0
    {
429
        /* bPointNr
430
              2
431
             1 3
432
            4 * 5
433
             6 8
434
              7
435
         Points 2, 4, 5, 7 are dist
436
         Points 1, 3, 6, 8 are dist>>1
437
         */
438
0
        const int32_t top     = omv.y - dist;
439
0
        const int32_t bottom  = omv.y + dist;
440
0
        const int32_t left    = omv.x - dist;
441
0
        const int32_t right   = omv.x + dist;
442
0
        const int32_t top2    = omv.y - (dist >> 1);
443
0
        const int32_t bottom2 = omv.y + (dist >> 1);
444
0
        const int32_t left2   = omv.x - (dist >> 1);
445
0
        const int32_t right2  = omv.x + (dist >> 1);
446
0
        saved = bcost;
447
448
0
        if (top >= mvmin.y && left >= mvmin.x &&
449
0
            right <= mvmax.x && bottom <= mvmax.y) // check border
450
0
        {
451
0
            COST_MV_PT_DIST_X4(omv.x,  top,   2, dist,
452
0
                               left2,  top2,  1, dist >> 1,
453
0
                               right2, top2,  3, dist >> 1,
454
0
                               left,   omv.y, 4, dist);
455
0
            COST_MV_PT_DIST_X4(right,  omv.y,   5, dist,
456
0
                               left2,  bottom2, 6, dist >> 1,
457
0
                               right2, bottom2, 8, dist >> 1,
458
0
                               omv.x,  bottom,  7, dist);
459
0
        }
460
0
        else // check border for each mv
461
0
        {
462
0
            if (top >= mvmin.y) // check top
463
0
            {
464
0
                COST_MV_PT_DIST(omv.x, top, 2, dist);
465
0
            }
466
0
            if (top2 >= mvmin.y) // check half top
467
0
            {
468
0
                if (left2 >= mvmin.x) // check half left
469
0
                {
470
0
                    COST_MV_PT_DIST(left2, top2, 1, (dist >> 1));
471
0
                }
472
0
                if (right2 <= mvmax.x) // check half right
473
0
                {
474
0
                    COST_MV_PT_DIST(right2, top2, 3, (dist >> 1));
475
0
                }
476
0
            }
477
0
            if (left >= mvmin.x) // check left
478
0
            {
479
0
                COST_MV_PT_DIST(left, omv.y, 4, dist);
480
0
            }
481
0
            if (right <= mvmax.x) // check right
482
0
            {
483
0
                COST_MV_PT_DIST(right, omv.y, 5, dist);
484
0
            }
485
0
            if (bottom2 <= mvmax.y) // check half bottom
486
0
            {
487
0
                if (left2 >= mvmin.x) // check half left
488
0
                {
489
0
                    COST_MV_PT_DIST(left2, bottom2, 6, (dist >> 1));
490
0
                }
491
0
                if (right2 <= mvmax.x) // check half right
492
0
                {
493
0
                    COST_MV_PT_DIST(right2, bottom2, 8, (dist >> 1));
494
0
                }
495
0
            }
496
0
            if (bottom <= mvmax.y) // check bottom
497
0
            {
498
0
                COST_MV_PT_DIST(omv.x, bottom, 7, dist);
499
0
            }
500
0
        }
501
502
0
        if (bcost < saved)
503
0
            rounds = 0;
504
0
        else if (++rounds >= earlyExitIters)
505
0
            return;
506
0
    }
507
508
0
    for (int16_t dist = 16; dist <= (int16_t)merange; dist <<= 1)
509
0
    {
510
0
        const int32_t top    = omv.y - dist;
511
0
        const int32_t bottom = omv.y + dist;
512
0
        const int32_t left   = omv.x - dist;
513
0
        const int32_t right  = omv.x + dist;
514
515
0
        saved = bcost;
516
0
        if (top >= mvmin.y && left >= mvmin.x &&
517
0
            right <= mvmax.x && bottom <= mvmax.y) // check border
518
0
        {
519
            /* index
520
                  0
521
                  3
522
                  2
523
                  1
524
          0 3 2 1 * 1 2 3 0
525
                  1
526
                  2
527
                  3
528
                  0
529
            */
530
531
0
            COST_MV_PT_DIST_X4(omv.x,  top,    0, dist,
532
0
                               left,   omv.y,  0, dist,
533
0
                               right,  omv.y,  0, dist,
534
0
                               omv.x,  bottom, 0, dist);
535
536
0
            for (int16_t index = 1; index < 4; index++)
537
0
            {
538
0
                int32_t posYT = top    + ((dist >> 2) * index);
539
0
                int32_t posYB = bottom - ((dist >> 2) * index);
540
0
                int32_t posXL = omv.x  - ((dist >> 2) * index);
541
0
                int32_t posXR = omv.x  + ((dist >> 2) * index);
542
543
0
                COST_MV_PT_DIST_X4(posXL, posYT, 0, dist,
544
0
                                   posXR, posYT, 0, dist,
545
0
                                   posXL, posYB, 0, dist,
546
0
                                   posXR, posYB, 0, dist);
547
0
            }
548
0
        }
549
0
        else // check border for each mv
550
0
        {
551
0
            if (top >= mvmin.y) // check top
552
0
            {
553
0
                COST_MV_PT_DIST(omv.x, top, 0, dist);
554
0
            }
555
0
            if (left >= mvmin.x) // check left
556
0
            {
557
0
                COST_MV_PT_DIST(left, omv.y, 0, dist);
558
0
            }
559
0
            if (right <= mvmax.x) // check right
560
0
            {
561
0
                COST_MV_PT_DIST(right, omv.y, 0, dist);
562
0
            }
563
0
            if (bottom <= mvmax.y) // check bottom
564
0
            {
565
0
                COST_MV_PT_DIST(omv.x, bottom, 0, dist);
566
0
            }
567
0
            for (int16_t index = 1; index < 4; index++)
568
0
            {
569
0
                int32_t posYT = top    + ((dist >> 2) * index);
570
0
                int32_t posYB = bottom - ((dist >> 2) * index);
571
0
                int32_t posXL = omv.x - ((dist >> 2) * index);
572
0
                int32_t posXR = omv.x + ((dist >> 2) * index);
573
574
0
                if (posYT >= mvmin.y) // check top
575
0
                {
576
0
                    if (posXL >= mvmin.x) // check left
577
0
                    {
578
0
                        COST_MV_PT_DIST(posXL, posYT, 0, dist);
579
0
                    }
580
0
                    if (posXR <= mvmax.x) // check right
581
0
                    {
582
0
                        COST_MV_PT_DIST(posXR, posYT, 0, dist);
583
0
                    }
584
0
                }
585
0
                if (posYB <= mvmax.y) // check bottom
586
0
                {
587
0
                    if (posXL >= mvmin.x) // check left
588
0
                    {
589
0
                        COST_MV_PT_DIST(posXL, posYB, 0, dist);
590
0
                    }
591
0
                    if (posXR <= mvmax.x) // check right
592
0
                    {
593
0
                        COST_MV_PT_DIST(posXR, posYB, 0, dist);
594
0
                    }
595
0
                }
596
0
            }
597
0
        }
598
599
0
        if (bcost < saved)
600
0
            rounds = 0;
601
0
        else if (++rounds >= earlyExitIters)
602
0
            return;
603
0
    }
604
0
}
605
606
void MotionEstimate::refineMV(ReferencePlanes* ref,
607
                              const MV&        mvmin,
608
                              const MV&        mvmax,
609
                              const MV&        qmvp,
610
                              MV&              outQMv)
611
0
{
612
0
    ALIGN_VAR_16(int, costs[16]);
613
0
    if (ctuAddr >= 0)
614
0
        blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0);
615
0
    intptr_t stride = ref->lumaStride;
616
0
    pixel* fenc = fencPUYuv.m_buf[0];
617
0
    pixel* fref = ref->fpelPlane[0] + blockOffset;
618
    
619
0
    setMVP(qmvp);
620
    
621
0
    MV qmvmin = mvmin.toQPel();
622
0
    MV qmvmax = mvmax.toQPel();
623
   
624
    /* The term cost used here means satd/sad values for that particular search.
625
     * The costs used in ME integer search only includes the SAD cost of motion
626
     * residual and sqrtLambda times MVD bits.  The subpel refine steps use SATD
627
     * cost of residual and sqrtLambda * MVD bits.
628
    */
629
             
630
    // measure SATD cost at clipped QPEL MVP
631
0
    MV pmv = qmvp.clipped(qmvmin, qmvmax);
632
0
    MV bestpre = pmv;
633
0
    int bprecost;
634
635
0
    bprecost = subpelCompare(ref, pmv, sad);
636
637
    /* re-measure full pel rounded MVP with SAD as search start point */
638
0
    MV bmv = pmv.roundToFPel();
639
0
    int bcost = bprecost;
640
0
    if (pmv.isSubpel())
641
0
        bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2);
642
643
    /* square refine */
644
0
    int dir = 0;
645
0
    COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs);
646
0
    if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
647
0
        COPY2_IF_LT(bcost, costs[0], dir, 1);
648
0
    if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
649
0
        COPY2_IF_LT(bcost, costs[1], dir, 2);
650
0
    COPY2_IF_LT(bcost, costs[2], dir, 3);
651
0
    COPY2_IF_LT(bcost, costs[3], dir, 4);
652
0
    COST_MV_X4_DIR(-1, -1, -1, 1, 1, -1, 1, 1, costs);
653
0
    if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
654
0
        COPY2_IF_LT(bcost, costs[0], dir, 5);
655
0
    if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
656
0
        COPY2_IF_LT(bcost, costs[1], dir, 6);
657
0
    if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
658
0
        COPY2_IF_LT(bcost, costs[2], dir, 7);
659
0
    if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
660
0
        COPY2_IF_LT(bcost, costs[3], dir, 8);
661
0
    bmv += square1[dir];
662
663
0
    if (bprecost < bcost)
664
0
    {
665
0
        bmv = bestpre;
666
0
        bcost = bprecost;
667
0
    }
668
0
    else
669
0
        bmv = bmv.toQPel(); // promote search bmv to qpel
670
671
    // TODO: Change SubpelWorkload to fine-tune the MV.
672
    // For now it is set to 5, experimentally.
673
    // const SubpelWorkload& wl = workload[this->subpelRefine];
674
0
    const SubpelWorkload& wl = workload[5];
675
676
0
    pixelcmp_t hpelcomp;
677
678
0
    if (wl.hpel_satd)
679
0
    {
680
0
        bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
681
0
        hpelcomp = satd;
682
0
    }
683
0
    else
684
0
        hpelcomp = sad;
685
686
0
    for (int iter = 0; iter < wl.hpel_iters; iter++)
687
0
    {
688
0
        int bdir = 0;
689
0
        for (int i = 1; i <= wl.hpel_dirs; i++)
690
0
        {
691
0
            MV qmv = bmv + square1[i] * 2;            
692
693
            // check mv range for slice bound
694
0
            if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
695
0
                continue;
696
697
0
            int cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);
698
0
            COPY2_IF_LT(bcost, cost, bdir, i);
699
0
        }
700
701
0
        if (bdir)
702
0
            bmv += square1[bdir] * 2;            
703
0
        else
704
0
            break;
705
0
    }
706
707
    /* if HPEL search used SAD, remeasure with SATD before QPEL */
708
0
    if (!wl.hpel_satd)
709
0
        bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
710
711
0
    for (int iter = 0; iter < wl.qpel_iters; iter++)
712
0
    {
713
0
        int bdir = 0;
714
0
        for (int i = 1; i <= wl.qpel_dirs; i++)
715
0
        {
716
0
            MV qmv = bmv + square1[i];
717
            
718
            // check mv range for slice bound
719
0
            if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
720
0
                continue;
721
722
0
            int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
723
0
            COPY2_IF_LT(bcost, cost, bdir, i);
724
0
        }
725
726
0
        if (bdir)
727
0
            bmv += square1[bdir];
728
0
        else
729
0
            break;
730
0
    }
731
732
    // check mv range for slice bound
733
0
    X265_CHECK(((pmv.y >= qmvmin.y) & (pmv.y <= qmvmax.y)), "mv beyond range!\n");
734
    
735
0
    x265_emms();
736
0
    outQMv = bmv;
737
0
}
738
739
int MotionEstimate::motionEstimate(ReferencePlanes *ref,
740
                                   const MV &       mvmin,
741
                                   const MV &       mvmax,
742
                                   const MV &       qmvp,
743
                                   int              numCandidates,
744
                                   const MV *       mvc,
745
                                   int              merange,
746
                                   MV &             outQMv,
747
                                   uint32_t         maxSlices,
748
                                   pixel *          srcReferencePlane)
749
0
{
750
0
    ALIGN_VAR_16(int, costs[16]);
751
0
    bool hme = srcReferencePlane && srcReferencePlane == ref->fpelLowerResPlane[0];
752
0
    if (ctuAddr >= 0)
753
0
        blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0);
754
0
    intptr_t stride = hme ? ref->lumaStride / 2 : ref->lumaStride;
755
0
    pixel* fenc = fencPUYuv.m_buf[0];
756
0
    pixel* fref = srcReferencePlane == 0 ? ref->fpelPlane[0] + blockOffset : srcReferencePlane + blockOffset;
757
758
0
    setMVP(qmvp);
759
760
0
    MV qmvmin = mvmin.toQPel();
761
0
    MV qmvmax = mvmax.toQPel();
762
763
    /* The term cost used here means satd/sad values for that particular search.
764
     * The costs used in ME integer search only includes the SAD cost of motion
765
     * residual and sqrtLambda times MVD bits.  The subpel refine steps use SATD
766
     * cost of residual and sqrtLambda * MVD bits.  Mode decision will be based
767
     * on video distortion cost (SSE/PSNR) plus lambda times all signaling bits
768
     * (mode + MVD bits). */
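    /* Editor note (illustrative): for a candidate MV the integer-pel stages below
     * evaluate  cost = SAD(fenc, fref[mv]) + mvcost(mv << 2), where mvcost() is
     * the pre-scaled sqrtLambda * MVD-bits term and << 2 converts a full-pel MV
     * into the quarter-pel units mvcost() expects; the subpel stages use the same
     * formula with SATD (plus chroma SATD when enabled) in place of SAD. */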
769
770
    // measure SAD cost at clipped QPEL MVP
771
0
    MV pmv = qmvp.clipped(qmvmin, qmvmax);
772
0
    MV bestpre = pmv;
773
0
    int bprecost;
774
775
0
    if (ref->isLowres)
776
0
        bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad, hme);
777
0
    else
778
0
        bprecost = subpelCompare(ref, pmv, sad);
779
780
    /* re-measure full pel rounded MVP with SAD as search start point */
781
0
    MV bmv = pmv.roundToFPel();
782
0
    int bcost = bprecost;
783
0
    if (pmv.isSubpel())
784
0
        bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2);
785
786
    // measure SAD cost at MV(0) if MVP is not zero
787
0
    if (pmv.notZero())
788
0
    {
789
0
        int cost = sad(fenc, FENC_STRIDE, fref, stride) + mvcost(MV(0, 0));
790
0
        if (cost < bcost)
791
0
        {
792
0
            bcost = cost;
793
0
            bmv = 0;
794
0
            bmv.y = X265_MAX(X265_MIN(0, mvmax.y), mvmin.y);
795
0
        }
796
0
    }
797
798
0
    X265_CHECK(!(ref->isLowres && numCandidates), "lowres motion candidates not allowed\n");
799
    // measure SAD cost at each QPEL motion vector candidate
800
0
    for (int i = 0; i < numCandidates; i++)
801
0
    {
802
0
        MV m = mvc[i].clipped(qmvmin, qmvmax);
803
0
        if (m.notZero() & (m != pmv ? 1 : 0) & (m != bestpre ? 1 : 0)) // check already measured
804
0
        {
805
0
            int cost = subpelCompare(ref, m, sad) + mvcost(m);
806
0
            if (cost < bprecost)
807
0
            {
808
0
                bprecost = cost;
809
0
                bestpre = m;
810
0
            }
811
0
        }
812
0
    }
813
814
0
    pmv = pmv.roundToFPel();
815
0
    MV omv = bmv;  // current search origin or starting point
816
817
0
    int search = ref->isHMELowres ? (hme ? searchMethodL0 : searchMethodL1) : searchMethod;
818
0
    switch (search)
819
0
    {
820
0
    case X265_DIA_SEARCH:
821
0
    {
822
        /* diamond search, radius 1 */
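        /* Editor note (illustrative): bcost is shifted left by 4 and the low four
         * bits record which diamond point won as two 2-bit fields: +1 selects
         * (0,-1), +3 selects (0,+1), +4 selects (-1,0) and +12 selects (+1,0). The
         * expressions (bcost << 28) >> 30 and (bcost << 30) >> 30 sign-extend those
         * fields, and the subtraction applies them as the step, so one integer
         * carries both the best cost and the direction of the next move. */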
823
0
        bcost <<= 4;
824
0
        int i = merange;
825
0
        do
826
0
        {
827
0
            COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs);
828
0
            if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
829
0
                COPY1_IF_LT(bcost, (costs[0] << 4) + 1);
830
0
            if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
831
0
                COPY1_IF_LT(bcost, (costs[1] << 4) + 3);
832
0
            COPY1_IF_LT(bcost, (costs[2] << 4) + 4);
833
0
            COPY1_IF_LT(bcost, (costs[3] << 4) + 12);
834
0
            if (!(bcost & 15))
835
0
                break;
836
0
            bmv.x -= (bcost << 28) >> 30;
837
0
            bmv.y -= (bcost << 30) >> 30;
838
0
            bcost &= ~15;
839
0
        }
840
0
        while (--i && bmv.checkRange(mvmin, mvmax));
841
0
        bcost >>= 4;
842
0
        break;
843
0
    }
844
845
0
    case X265_HEX_SEARCH:
846
0
    {
847
0
me_hex2:
848
        /* hexagon search, radius 2 */
849
#if 0
850
        for (int i = 0; i < merange / 2; i++)
851
        {
852
            omv = bmv;
853
            COST_MV(omv.x - 2, omv.y);
854
            COST_MV(omv.x - 1, omv.y + 2);
855
            COST_MV(omv.x + 1, omv.y + 2);
856
            COST_MV(omv.x + 2, omv.y);
857
            COST_MV(omv.x + 1, omv.y - 2);
858
            COST_MV(omv.x - 1, omv.y - 2);
859
            if (omv == bmv)
860
                break;
861
            if (!bmv.checkRange(mvmin, mvmax))
862
                break;
863
        }
864
865
#else // if 0
866
      /* equivalent to the above, but eliminates duplicate candidates */
867
0
        COST_MV_X3_DIR(-2, 0, -1, 2,  1, 2, costs);
868
0
        bcost <<= 3;
869
0
        if ((bmv.y >= mvmin.y) & (bmv.y <= mvmax.y))
870
0
            COPY1_IF_LT(bcost, (costs[0] << 3) + 2);
871
0
        if ((bmv.y + 2 >= mvmin.y) & (bmv.y + 2 <= mvmax.y))
872
0
        {
873
0
            COPY1_IF_LT(bcost, (costs[1] << 3) + 3);
874
0
            COPY1_IF_LT(bcost, (costs[2] << 3) + 4);
875
0
        }
876
877
0
        COST_MV_X3_DIR(2, 0,  1, -2, -1, -2, costs);
878
0
        if ((bmv.y >= mvmin.y) & (bmv.y <= mvmax.y))
879
0
            COPY1_IF_LT(bcost, (costs[0] << 3) + 5);
880
0
        if ((bmv.y - 2 >= mvmin.y) & (bmv.y - 2 <= mvmax.y))
881
0
        {
882
0
            COPY1_IF_LT(bcost, (costs[1] << 3) + 6);
883
0
            COPY1_IF_LT(bcost, (costs[2] << 3) + 7);
884
0
        }
885
886
0
        if (bcost & 7)
887
0
        {
888
0
            int dir = (bcost & 7) - 2;
889
890
0
            if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y))
891
0
            {
892
0
                bmv += hex2[dir + 1];
893
894
                /* half hexagon, not overlapping the previous iteration */
895
0
                for (int i = (merange >> 1) - 1; i > 0 && bmv.checkRange(mvmin, mvmax); i--)
896
0
                {
897
0
                    COST_MV_X3_DIR(hex2[dir + 0].x, hex2[dir + 0].y,
898
0
                        hex2[dir + 1].x, hex2[dir + 1].y,
899
0
                        hex2[dir + 2].x, hex2[dir + 2].y,
900
0
                        costs);
901
0
                    bcost &= ~7;
902
903
0
                    if ((bmv.y + hex2[dir + 0].y >= mvmin.y) & (bmv.y + hex2[dir + 0].y <= mvmax.y))
904
0
                        COPY1_IF_LT(bcost, (costs[0] << 3) + 1);
905
906
0
                    if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y))
907
0
                        COPY1_IF_LT(bcost, (costs[1] << 3) + 2);
908
909
0
                    if ((bmv.y + hex2[dir + 2].y >= mvmin.y) & (bmv.y + hex2[dir + 2].y <= mvmax.y))
910
0
                        COPY1_IF_LT(bcost, (costs[2] << 3) + 3);
911
912
0
                    if (!(bcost & 7))
913
0
                        break;
914
915
0
                    dir += (bcost & 7) - 2;
916
0
                    dir = mod6m1[dir + 1];
917
0
                    bmv += hex2[dir + 1];
918
0
                }
919
0
            } // if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y))
920
0
        }
921
0
        bcost >>= 3;
922
0
#endif // if 0
923
924
        /* square refine */
925
0
        int dir = 0;
926
0
        COST_MV_X4_DIR(0, -1,  0, 1, -1, 0, 1, 0, costs);
927
0
        if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
928
0
            COPY2_IF_LT(bcost, costs[0], dir, 1);
929
0
        if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
930
0
            COPY2_IF_LT(bcost, costs[1], dir, 2);
931
0
        COPY2_IF_LT(bcost, costs[2], dir, 3);
932
0
        COPY2_IF_LT(bcost, costs[3], dir, 4);
933
0
        COST_MV_X4_DIR(-1, -1, -1, 1, 1, -1, 1, 1, costs);
934
0
        if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
935
0
            COPY2_IF_LT(bcost, costs[0], dir, 5);
936
0
        if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
937
0
            COPY2_IF_LT(bcost, costs[1], dir, 6);
938
0
        if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
939
0
            COPY2_IF_LT(bcost, costs[2], dir, 7);
940
0
        if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
941
0
            COPY2_IF_LT(bcost, costs[3], dir, 8);
942
0
        bmv += square1[dir];
943
0
        break;
944
0
    }
945
946
0
    case X265_UMH_SEARCH:
947
0
    {
948
0
        int ucost1, ucost2;
949
0
        int16_t cross_start = 1;
950
951
        /* refine predictors */
952
0
        omv = bmv;
953
0
        ucost1 = bcost;
954
0
        X265_CHECK(((pmv.y >= mvmin.y) & (pmv.y <= mvmax.y)), "pmv outside of search range!");
955
0
        DIA1_ITER(pmv.x, pmv.y);
956
0
        if (pmv.notZero())
957
0
            DIA1_ITER(0, 0);
958
959
0
        ucost2 = bcost;
960
0
        if (bmv.notZero() && bmv != pmv)
961
0
            DIA1_ITER(bmv.x, bmv.y);
962
0
        if (bcost == ucost2)
963
0
            cross_start = 3;
964
965
        /* Early Termination */
966
0
        omv = bmv;
967
0
        if (bcost == ucost2 && SAD_THRESH(2000))
968
0
        {
969
0
            COST_MV_X4(0, -2, -1, -1, 1, -1, -2, 0);
970
0
            COST_MV_X4(2, 0, -1, 1, 1, 1,  0, 2);
971
0
            if (bcost == ucost1 && SAD_THRESH(500))
972
0
                break;
973
0
            if (bcost == ucost2)
974
0
            {
975
0
                int16_t range = (int16_t)(merange >> 1) | 1;
976
0
                CROSS(3, range, range);
977
0
                COST_MV_X4(-1, -2, 1, -2, -2, -1, 2, -1);
978
0
                COST_MV_X4(-2, 1, 2, 1, -1, 2, 1, 2);
979
0
                if (bcost == ucost2)
980
0
                    break;
981
0
                cross_start = range + 2;
982
0
            }
983
0
        }
984
985
        // TODO: Need to study x264's logic for building mvc list to understand why they
986
        //       have special cases here for 16x16, and whether they apply to HEVC CTU
987
988
        // adaptive search range based on mvc variability
989
0
        if (numCandidates)
990
0
        {
991
            /* range multipliers based on casual inspection of some statistics of
992
             * average distance between current predictor and final mv found by ESA.
993
             * these have not been tuned much by actual encoding. */
994
0
            static const uint8_t range_mul[4][4] =
995
0
            {
996
0
                { 3, 3, 4, 4 },
997
0
                { 3, 4, 4, 4 },
998
0
                { 4, 4, 4, 5 },
999
0
                { 4, 4, 5, 6 },
1000
0
            };
1001
1002
0
            int mvd;
1003
0
            int sad_ctx, mvd_ctx;
1004
0
            int denom = 1;
1005
1006
0
            if (numCandidates == 1)
1007
0
            {
1008
0
                if (LUMA_64x64 == partEnum)
1009
                    /* mvc is probably the same as mvp, so the difference isn't meaningful.
1010
                     * but prediction usually isn't too bad, so just use medium range */
1011
0
                    mvd = 25;
1012
0
                else
1013
0
                    mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);
1014
0
            }
1015
0
            else
1016
0
            {
1017
                /* calculate the degree of agreement between predictors. */
1018
1019
                /* in 64x64, mvc includes all the neighbors used to make mvp,
1020
                 * so don't count mvp separately. */
1021
1022
0
                denom = numCandidates - 1;
1023
0
                mvd = 0;
1024
0
                if (partEnum != LUMA_64x64)
1025
0
                {
1026
0
                    mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);
1027
0
                    denom++;
1028
0
                }
1029
0
                mvd += predictorDifference(mvc, numCandidates);
1030
0
            }
1031
1032
0
            sad_ctx = SAD_THRESH(1000) ? 0
1033
0
                : SAD_THRESH(2000) ? 1
1034
0
                : SAD_THRESH(4000) ? 2 : 3;
1035
0
            mvd_ctx = mvd < 10 * denom ? 0
1036
0
                : mvd < 20 * denom ? 1
1037
0
                : mvd < 40 * denom ? 2 : 3;
1038
1039
0
            merange = (merange * range_mul[mvd_ctx][sad_ctx]) >> 2;
1040
0
        }
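        /* Editor note (illustrative): with three candidates that agree closely
         * (say mvd = 12, denom = 3) and bcost already under SAD_THRESH(1000),
         * sad_ctx = mvd_ctx = 0 and merange is scaled by 3/4; widely scattered
         * predictors on a poorly predicted block (mvd >= 40 * denom, bcost above
         * SAD_THRESH(4000)) give mvd_ctx = sad_ctx = 3 and scale merange by 6/4. */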
1041
1042
        /* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omv.
1043
         * we are still centered on the same place as the DIA2. is this desirable? */
1044
0
        CROSS(cross_start, merange, merange >> 1);
1045
0
        COST_MV_X4(-2, -2, -2, 2, 2, -2, 2, 2);
1046
1047
        /* hexagon grid */
1048
0
        omv = bmv;
1049
0
        const uint16_t *p_cost_omvx = m_cost_mvx + omv.x * 4;
1050
0
        const uint16_t *p_cost_omvy = m_cost_mvy + omv.y * 4;
1051
0
        uint16_t i = 1;
1052
0
        do
1053
0
        {
1054
0
            if (4 * i > X265_MIN4(mvmax.x - omv.x, omv.x - mvmin.x,
1055
0
                                  mvmax.y - omv.y, omv.y - mvmin.y))
1056
0
            {
1057
0
                for (int j = 0; j < 16; j++)
1058
0
                {
1059
0
                    MV mv = omv + (hex4[j] * i);
1060
0
                    if (mv.checkRange(mvmin, mvmax))
1061
0
                        COST_MV(mv.x, mv.y);
1062
0
                }
1063
0
            }
1064
0
            else
1065
0
            {
1066
0
                int16_t dir = 0;
1067
0
                pixel *fref_base = fref + omv.x + (omv.y - 4 * i) * stride;
1068
0
                size_t dy = (size_t)i * stride;
1069
0
#define SADS(k, x0, y0, x1, y1, x2, y2, x3, y3) \
1070
0
    sad_x4(fenc, \
1071
0
           fref_base x0 * i + (y0 - 2 * k + 4) * dy, \
1072
0
           fref_base x1 * i + (y1 - 2 * k + 4) * dy, \
1073
0
           fref_base x2 * i + (y2 - 2 * k + 4) * dy, \
1074
0
           fref_base x3 * i + (y3 - 2 * k + 4) * dy, \
1075
0
           stride, costs + 4 * k); \
1076
0
    fref_base += 2 * dy;
1077
0
#define ADD_MVCOST(k, x, y) costs[k] += p_cost_omvx[x * 4 * i] + p_cost_omvy[y * 4 * i]
1078
0
#define MIN_MV(k, dx, dy)     if ((omv.y + (dy) >= mvmin.y) & (omv.y + (dy) <= mvmax.y)) { COPY2_IF_LT(bcost, costs[k], dir, dx * 16 + (dy & 15)) }
1079
1080
0
                SADS(0, +0, -4, +0, +4, -2, -3, +2, -3);
1081
0
                SADS(1, -4, -2, +4, -2, -4, -1, +4, -1);
1082
0
                SADS(2, -4, +0, +4, +0, -4, +1, +4, +1);
1083
0
                SADS(3, -4, +2, +4, +2, -2, +3, +2, +3);
1084
0
                ADD_MVCOST(0, 0, -4);
1085
0
                ADD_MVCOST(1, 0, 4);
1086
0
                ADD_MVCOST(2, -2, -3);
1087
0
                ADD_MVCOST(3, 2, -3);
1088
0
                ADD_MVCOST(4, -4, -2);
1089
0
                ADD_MVCOST(5, 4, -2);
1090
0
                ADD_MVCOST(6, -4, -1);
1091
0
                ADD_MVCOST(7, 4, -1);
1092
0
                ADD_MVCOST(8, -4, 0);
1093
0
                ADD_MVCOST(9, 4, 0);
1094
0
                ADD_MVCOST(10, -4, 1);
1095
0
                ADD_MVCOST(11, 4, 1);
1096
0
                ADD_MVCOST(12, -4, 2);
1097
0
                ADD_MVCOST(13, 4, 2);
1098
0
                ADD_MVCOST(14, -2, 3);
1099
0
                ADD_MVCOST(15, 2, 3);
1100
0
                MIN_MV(0, 0, -4);
1101
0
                MIN_MV(1, 0, 4);
1102
0
                MIN_MV(2, -2, -3);
1103
0
                MIN_MV(3, 2, -3);
1104
0
                MIN_MV(4, -4, -2);
1105
0
                MIN_MV(5, 4, -2);
1106
0
                MIN_MV(6, -4, -1);
1107
0
                MIN_MV(7, 4, -1);
1108
0
                MIN_MV(8, -4, 0);
1109
0
                MIN_MV(9, 4, 0);
1110
0
                MIN_MV(10, -4, 1);
1111
0
                MIN_MV(11, 4, 1);
1112
0
                MIN_MV(12, -4, 2);
1113
0
                MIN_MV(13, 4, 2);
1114
0
                MIN_MV(14, -2, 3);
1115
0
                MIN_MV(15, 2, 3);
1116
0
#undef SADS
1117
0
#undef ADD_MVCOST
1118
0
#undef MIN_MV
1119
0
                if (dir)
1120
0
                {
1121
0
                    bmv.x = omv.x + i * (dir >> 4);
1122
0
                    bmv.y = omv.y + i * ((dir << 28) >> 28);
1123
0
                }
1124
0
            }
1125
0
        }
1126
0
        while (++i <= merange >> 2);
1127
0
        if (bmv.checkRange(mvmin, mvmax))
1128
0
            goto me_hex2;
1129
0
        break;
1130
0
    }
1131
1132
0
    case X265_STAR_SEARCH: // Adapted from HM ME
1133
0
    {
1134
0
        int bPointNr = 0;
1135
0
        int bDistance = 0;
1136
1137
0
        const int EarlyExitIters = 3;
1138
0
        StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, EarlyExitIters, merange, hme);
1139
0
        if (bDistance == 1)
1140
0
        {
1141
            // if best distance was only 1, check two missing points.  If no new point is found, stop
1142
0
            if (bPointNr)
1143
0
            {
1144
                /* For a given direction 1 to 8, check nearest two outer X pixels
1145
                     X   X
1146
                   X 1 2 3 X
1147
                     4 * 5
1148
                   X 6 7 8 X
1149
                     X   X
1150
                */
1151
0
                int saved = bcost;
1152
0
                const MV mv1 = bmv + offsets[(bPointNr - 1) * 2];
1153
0
                const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1];
1154
0
                if (mv1.checkRange(mvmin, mvmax))
1155
0
                {
1156
0
                    COST_MV(mv1.x, mv1.y);
1157
0
                }
1158
0
                if (mv2.checkRange(mvmin, mvmax))
1159
0
                {
1160
0
                    COST_MV(mv2.x, mv2.y);
1161
0
                }
1162
0
                if (bcost == saved)
1163
0
                    break;
1164
0
            }
1165
0
            else
1166
0
                break;
1167
0
        }
1168
1169
0
        const int RasterDistance = 5;
1170
0
        if (bDistance > RasterDistance)
1171
0
        {
1172
            // raster search refinement if original search distance was too big
1173
0
            MV tmv;
1174
0
            for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y += RasterDistance)
1175
0
            {
1176
0
                for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x += RasterDistance)
1177
0
                {
1178
0
                    if (tmv.x + (RasterDistance * 3) <= mvmax.x)
1179
0
                    {
1180
0
                        pixel *pix_base = fref + tmv.y * stride + tmv.x;
1181
0
                        sad_x4(fenc,
1182
0
                               pix_base,
1183
0
                               pix_base + RasterDistance,
1184
0
                               pix_base + RasterDistance * 2,
1185
0
                               pix_base + RasterDistance * 3,
1186
0
                               stride, costs);
1187
0
                        costs[0] += mvcost(tmv << 2);
1188
0
                        COPY2_IF_LT(bcost, costs[0], bmv, tmv);
1189
0
                        tmv.x += RasterDistance;
1190
0
                        costs[1] += mvcost(tmv << 2);
1191
0
                        COPY2_IF_LT(bcost, costs[1], bmv, tmv);
1192
0
                        tmv.x += RasterDistance;
1193
0
                        costs[2] += mvcost(tmv << 2);
1194
0
                        COPY2_IF_LT(bcost, costs[2], bmv, tmv);
1195
0
                        tmv.x += RasterDistance;
1196
0
                        costs[3] += mvcost(tmv << 2);
1197
0
                        COPY2_IF_LT(bcost, costs[3], bmv, tmv);
1198
0
                    }
1199
0
                    else
1200
0
                        COST_MV(tmv.x, tmv.y);
1201
0
                }
1202
0
            }
1203
0
        }
1204
1205
0
        while (bDistance > 0)
1206
0
        {
1207
            // center a new search around current best
1208
0
            bDistance = 0;
1209
0
            bPointNr = 0;
1210
0
            const int MaxIters = 32;
1211
0
            StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, MaxIters, merange, hme);
1212
1213
0
            if (bDistance == 1)
1214
0
            {
1215
0
                if (!bPointNr)
1216
0
                    break;
1217
1218
                /* For a given direction 1 to 8, check nearest 2 outer X pixels
1219
                        X   X
1220
                    X 1 2 3 X
1221
                        4 * 5
1222
                    X 6 7 8 X
1223
                        X   X
1224
                */
1225
0
                const MV mv1 = bmv + offsets[(bPointNr - 1) * 2];
1226
0
                const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1];
1227
0
                if (mv1.checkRange(mvmin, mvmax))
1228
0
                {
1229
0
                    COST_MV(mv1.x, mv1.y);
1230
0
                }
1231
0
                if (mv2.checkRange(mvmin, mvmax))
1232
0
                {
1233
0
                    COST_MV(mv2.x, mv2.y);
1234
0
                }
1235
0
                break;
1236
0
            }
1237
0
        }
1238
1239
0
        break;
1240
0
    }
1241
1242
0
    case X265_SEA:
1243
0
    {
1244
        // Successive Elimination Algorithm
1245
0
        const int32_t minX = X265_MAX(omv.x - (int32_t)merange, mvmin.x);
1246
0
        const int32_t minY = X265_MAX(omv.y - (int32_t)merange, mvmin.y);
1247
0
        const int32_t maxX = X265_MIN(omv.x + (int32_t)merange, mvmax.x);
1248
0
        const int32_t maxY = X265_MIN(omv.y + (int32_t)merange, mvmax.y);
1249
0
        const uint16_t *p_cost_mvx = m_cost_mvx - qmvp.x;
1250
0
        const uint16_t *p_cost_mvy = m_cost_mvy - qmvp.y;
1251
0
        int16_t* meScratchBuffer = NULL;
1252
0
        int scratchSize = merange * 2 + 4;
1253
0
        if (scratchSize)
1254
0
        {
1255
0
            meScratchBuffer = X265_MALLOC(int16_t, scratchSize);
1256
0
            memset(meScratchBuffer, 0, sizeof(int16_t)* scratchSize);
1257
0
        }
1258
1259
        /* SEA is fastest in multiples of 4 */
1260
0
        int meRangeWidth = (maxX - minX + 3) & ~3;
1261
0
        int w = 0, h = 0;                    // Width and height of the PU
1262
0
        ALIGN_VAR_32(pixel, zero[64 * FENC_STRIDE]) = { 0 };
1263
0
        ALIGN_VAR_32(int, encDC[4]);
1264
0
        uint16_t *fpelCostMvX = m_fpelMvCosts[-qmvp.x & 3] + (-qmvp.x >> 2);
1265
0
        sizesFromPartition(partEnum, &w, &h);
1266
0
        int deltaX = (w <= 8) ? (w) : (w >> 1);
1267
0
        int deltaY = (h <= 8) ? (h) : (h >> 1);
1268
1269
        /* Check for very small rectangular blocks that cannot be subdivided any further */
1270
0
        bool smallRectPartition = partEnum == LUMA_4x4 || partEnum == LUMA_16x12 ||
1271
0
            partEnum == LUMA_12x16 || partEnum == LUMA_16x4 || partEnum == LUMA_4x16;
1272
        /* Check if vertical partition */
1273
0
        bool verticalRect = partEnum == LUMA_32x64 || partEnum == LUMA_16x32 || partEnum == LUMA_8x16 ||
1274
0
            partEnum == LUMA_4x8;
1275
        /* Check if horizontal partition */
1276
0
        bool horizontalRect = partEnum == LUMA_64x32 || partEnum == LUMA_32x16 || partEnum == LUMA_16x8 ||
1277
0
            partEnum == LUMA_8x4;
1278
        /* Check if asymmetric vertical partition */
1279
0
        bool assymetricVertical = partEnum == LUMA_12x16 || partEnum == LUMA_4x16 || partEnum == LUMA_24x32 ||
1280
0
            partEnum == LUMA_8x32 || partEnum == LUMA_48x64 || partEnum == LUMA_16x64;
1281
        /* Check if asymmetric horizontal partition */
1282
0
        bool assymetricHorizontal = partEnum == LUMA_16x12 || partEnum == LUMA_16x4 || partEnum == LUMA_32x24 ||
1283
0
            partEnum == LUMA_32x8 || partEnum == LUMA_64x48 || partEnum == LUMA_64x16;
1284
1285
0
        int tempPartEnum = 0;
1286
1287
        /* If a vertical rectangular partition, it is horizontally split into two, for ads_x2() */
1288
0
        if (verticalRect)
1289
0
            tempPartEnum = partitionFromSizes(w, h >> 1);
1290
        /* If a horizontal rectangular partition, it is vertically split into two, for ads_x2() */
1291
0
        else if (horizontalRect)
1292
0
            tempPartEnum = partitionFromSizes(w >> 1, h);
1293
        /* We have integral planes introduced to account for asymmetric partitions.
1294
         * Hence all asymmetric partitions, except those which cannot be split into legal sizes,
1295
         * are split into four for ads_x4() */
1296
0
        else if (assymetricVertical || assymetricHorizontal)
1297
0
            tempPartEnum = smallRectPartition ? partEnum : partitionFromSizes(w >> 1, h >> 1);
1298
        /* General case: Square partitions. All partitions with width > 8 are split into four
1299
         * for ads_x4(), for 4x4 and 8x8 we do ads_x1() */
1300
0
        else
1301
0
            tempPartEnum = (w <= 8) ? partEnum : partitionFromSizes(w >> 1, h >> 1);
1302
1303
        /* Successive elimination by comparing DC before a full SAD,
1304
         * because sum(abs(diff)) >= abs(diff(sum)). */
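        /* Editor note (illustrative): for the 1-D blocks a = {5, 1} and b = {2, 3},
         * sum(|a[i] - b[i]|) = 3 + 2 = 5 while |sum(a) - sum(b)| = |6 - 5| = 1, so
         * the DC difference is a cheap lower bound on the full SAD; ads() uses that
         * bound (plus the x-component MV cost) to discard candidates that can no
         * longer beat bcost before any full SAD is computed. */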
1305
0
        primitives.pu[tempPartEnum].sad_x4(zero,
1306
0
                         fenc,
1307
0
                         fenc + deltaX,
1308
0
                         fenc + deltaY * FENC_STRIDE,
1309
0
                         fenc + deltaX + deltaY * FENC_STRIDE,
1310
0
                         FENC_STRIDE,
1311
0
                         encDC);
1312
1313
        /* Assigning appropriate integral plane */
1314
0
        uint32_t *sumsBase = NULL;
1315
0
        switch (deltaX)
1316
0
        {
1317
0
            case 32: if (deltaY % 24 == 0)
1318
0
                         sumsBase = integral[1];
1319
0
                     else if (deltaY == 8)
1320
0
                         sumsBase = integral[2];
1321
0
                     else
1322
0
                         sumsBase = integral[0];
1323
0
               break;
1324
0
            case 24: sumsBase = integral[3];
1325
0
               break;
1326
0
            case 16: if (deltaY % 12 == 0)
1327
0
                         sumsBase = integral[5];
1328
0
                     else if (deltaY == 4)
1329
0
                         sumsBase = integral[6];
1330
0
                     else
1331
0
                         sumsBase = integral[4];
1332
0
               break;
1333
0
            case 12: sumsBase = integral[7];
1334
0
                break;
1335
0
            case 8: if (deltaY == 32)
1336
0
                        sumsBase = integral[8];
1337
0
                    else
1338
0
                        sumsBase = integral[9];
1339
0
                break;
1340
0
            case 4: if (deltaY == 16)
1341
0
                        sumsBase = integral[10];
1342
0
                    else
1343
0
                        sumsBase = integral[11];
1344
0
                break;
1345
0
            default: sumsBase = integral[11];
1346
0
                break;
1347
0
        }
1348
1349
0
        if (partEnum == LUMA_64x64 || partEnum == LUMA_32x32 || partEnum == LUMA_16x16 ||
1350
0
            partEnum == LUMA_32x64 || partEnum == LUMA_16x32 || partEnum == LUMA_8x16 ||
1351
0
            partEnum == LUMA_4x8 || partEnum == LUMA_12x16 || partEnum == LUMA_4x16 ||
1352
0
            partEnum == LUMA_24x32 || partEnum == LUMA_8x32 || partEnum == LUMA_48x64 ||
1353
0
            partEnum == LUMA_16x64)
1354
0
            deltaY *= (int)stride;
1355
1356
0
        if (verticalRect)
1357
0
            encDC[1] = encDC[2];
1358
1359
0
        if (horizontalRect)
1360
0
            deltaY = deltaX;
1361
1362
        /* ADS and SAD */
1363
0
        MV tmv;
1364
0
        for (tmv.y = minY; tmv.y <= maxY; tmv.y++)
1365
0
        {
1366
0
            int i, xn;
1367
0
            int ycost = p_cost_mvy[tmv.y] << 2;
1368
0
            if (bcost <= ycost)
1369
0
                continue;
1370
0
            bcost -= ycost;
1371
1372
            /* ADS_4 for 16x16, 32x32, 64x64, 24x32, 32x24, 48x64, 64x48, 32x8, 8x32, 64x16, 16x64 partitions
1373
             * ADS_1 for 4x4, 8x8, 16x4, 4x16, 16x12, 12x16 partitions
1374
             * ADS_2 for all other rectangular partitions */
1375
0
            xn = ads(encDC,
1376
0
                    sumsBase + minX + tmv.y * stride,
1377
0
                    deltaY,
1378
0
                    fpelCostMvX + minX,
1379
0
                    meScratchBuffer,
1380
0
                    meRangeWidth,
1381
0
                    bcost);
1382
1383
0
            for (i = 0; i < xn - 2; i += 3)
1384
0
                COST_MV_X3_ABS(minX + meScratchBuffer[i], tmv.y,
1385
0
                             minX + meScratchBuffer[i + 1], tmv.y,
1386
0
                             minX + meScratchBuffer[i + 2], tmv.y);
1387
1388
0
            bcost += ycost;
1389
0
            for (; i < xn; i++)
1390
0
                COST_MV(minX + meScratchBuffer[i], tmv.y);
1391
0
        }
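        /* Editor's note: schematic sketch, not the actual primitive.  An ads_x1-style kernel
         * scans one row of candidate x offsets, forms a cheap SAD lower bound from the DC gap
         * plus the x-component MV cost (the row's y cost was already deducted from bcost
         * above), and records only the offsets that can still beat the current best cost;
         * COST_MV_X3_ABS()/COST_MV() then measure just those survivors.  The real x265
         * primitives are SIMD-optimized and the x2/x4 variants sum the gaps of 2 or 4
         * sub-blocks; all names below are hypothetical. */
        auto adsRowSketch = [](const int* encDc, const uint32_t* sums, const uint16_t* costMvX,
                               int32_t* survivors, int width, int thresh) -> int
        {
            int n = 0;
            for (int x = 0; x < width; x++)
            {
                int refSum = (int)sums[x];
                int gap = encDc[0] > refSum ? encDc[0] - refSum : refSum - encDc[0]; // lower bound on SAD
                if (gap + costMvX[x] < thresh)
                    survivors[n++] = x;  // this offset may still improve on the best cost
            }
            return n;
        };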
1392
0
        if (meScratchBuffer)
1393
0
            x265_free(meScratchBuffer);
1394
0
        break;
1395
0
    }
1396
1397
0
    case X265_FULL_SEARCH:
1398
0
    {
1399
        // dead slow exhaustive search, but at least it uses sad_x4()
1400
0
        MV tmv;
1401
0
        int32_t mvmin_y = mvmin.y, mvmin_x = mvmin.x, mvmax_y = mvmax.y, mvmax_x = mvmax.x;
1402
0
        if (ref->isHMELowres)
1403
0
        {
1404
0
            merange = (merange < 0 ? -merange : merange);
1405
0
            mvmin_y = X265_MAX(mvmin.y, -merange);
1406
0
            mvmin_x = X265_MAX(mvmin.x, -merange);
1407
0
            mvmax_y = X265_MIN(mvmax.y, merange);
1408
0
            mvmax_x = X265_MIN(mvmax.x, merange);
1409
0
        }
1410
0
        for (tmv.y = mvmin_y; tmv.y <= mvmax_y; tmv.y++)
1411
0
        {
1412
0
            for (tmv.x = mvmin_x; tmv.x <= mvmax_x; tmv.x++)
1413
0
            {
1414
0
                if (tmv.x + 3 <= mvmax_x)
1415
0
                {
1416
0
                    pixel *pix_base = fref + tmv.y * stride + tmv.x;
1417
0
                    sad_x4(fenc,
1418
0
                           pix_base,
1419
0
                           pix_base + 1,
1420
0
                           pix_base + 2,
1421
0
                           pix_base + 3,
1422
0
                           stride, costs);
1423
0
                    costs[0] += mvcost(tmv << 2);
1424
0
                    COPY2_IF_LT(bcost, costs[0], bmv, tmv);
1425
0
                    tmv.x++;
1426
0
                    costs[1] += mvcost(tmv << 2);
1427
0
                    COPY2_IF_LT(bcost, costs[1], bmv, tmv);
1428
0
                    tmv.x++;
1429
0
                    costs[2] += mvcost(tmv << 2);
1430
0
                    COPY2_IF_LT(bcost, costs[2], bmv, tmv);
1431
0
                    tmv.x++;
1432
0
                    costs[3] += mvcost(tmv << 2);
1433
0
                    COPY2_IF_LT(bcost, costs[3], bmv, tmv);
1434
0
                }
1435
0
                else
1436
0
                    COST_MV(tmv.x, tmv.y);
1437
0
            }
1438
0
        }
1439
1440
0
        break;
1441
0
    }
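    /* Editor's note: illustrative aside, not part of motion.cpp.  The exhaustive loop above
     * walks full-pel positions, but mvcost() (like the rest of the encoder) expects MVs in
     * quarter-pel units, hence 'mvcost(tmv << 2)'; e.g. the full-pel MV (3, -1) is costed as
     * the quarter-pel MV (12, -4).  sad_x4() amortizes the encoder-block loads across four
     * horizontally adjacent reference positions per call, with the <4-wide remainder of each
     * row falling back to COST_MV(). */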
1442
1443
0
    default:
1444
0
        X265_CHECK(0, "invalid motion estimate mode\n");
1445
0
        break;
1446
0
    }
1447
1448
0
    if (bprecost < bcost)
1449
0
    {
1450
0
        bmv = bestpre;
1451
0
        bcost = bprecost;
1452
0
    }
1453
0
    else
1454
0
        bmv = bmv.toQPel(); // promote search bmv to qpel
1455
1456
0
    const SubpelWorkload& wl = workload[this->subpelRefine];
1457
1458
    // check mv range for slice bound
1459
0
    if ((maxSlices > 1) & ((bmv.y < qmvmin.y) | (bmv.y > qmvmax.y)))
1460
0
    {
1461
0
        bmv.y = x265_min(x265_max(bmv.y, qmvmin.y), qmvmax.y);
1462
0
        bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
1463
0
    }
1464
1465
0
    if (!bcost)
1466
0
    {
1467
        /* if there was zero residual at the clipped MVP, we can skip subpel
1468
         * refine, but we do need to include the mvcost in the returned cost */
1469
0
        bcost = mvcost(bmv);
1470
0
    }
1471
0
    else if (ref->isLowres)
1472
0
    {
1473
0
        int bdir = 0;
1474
0
        for (int i = 1; i <= wl.hpel_dirs; i++)
1475
0
        {
1476
0
            MV qmv = bmv + square1[i] * 2;
1477
1478
            /* skip invalid range */
1479
0
            if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
1480
0
                continue;
1481
1482
0
            int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad, hme) + mvcost(qmv);
1483
0
            COPY2_IF_LT(bcost, cost, bdir, i);
1484
0
        }
1485
1486
0
        bmv += square1[bdir] * 2;
1487
0
        bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd, hme) + mvcost(bmv);
1488
1489
0
        bdir = 0;
1490
0
        for (int i = 1; i <= wl.qpel_dirs; i++)
1491
0
        {
1492
0
            MV qmv = bmv + square1[i];
1493
1494
            /* skip invalid range */
1495
0
            if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
1496
0
                continue;
1497
1498
0
            int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd, hme) + mvcost(qmv);
1499
0
            COPY2_IF_LT(bcost, cost, bdir, i);
1500
0
        }
1501
1502
0
        bmv += square1[bdir];
1503
0
    }
1504
0
    else
1505
0
    {
1506
0
        pixelcmp_t hpelcomp;
1507
1508
0
        if (wl.hpel_satd)
1509
0
        {
1510
0
            bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
1511
0
            hpelcomp = satd;
1512
0
        }
1513
0
        else
1514
0
            hpelcomp = sad;
1515
1516
0
        for (int iter = 0; iter < wl.hpel_iters; iter++)
1517
0
        {
1518
0
            int bdir = 0;
1519
0
            for (int i = 1; i <= wl.hpel_dirs; i++)
1520
0
            {
1521
0
                MV qmv = bmv + square1[i] * 2;
1522
1523
                // check mv range for slice bound
1524
0
                if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
1525
0
                    continue;
1526
1527
0
                int cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);
1528
0
                COPY2_IF_LT(bcost, cost, bdir, i);
1529
0
            }
1530
1531
0
            if (bdir)
1532
0
                bmv += square1[bdir] * 2;
1533
0
            else
1534
0
                break;
1535
0
        }
1536
1537
        /* if HPEL search used SAD, remeasure with SATD before QPEL */
1538
0
        if (!wl.hpel_satd)
1539
0
            bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
1540
1541
0
        for (int iter = 0; iter < wl.qpel_iters; iter++)
1542
0
        {
1543
0
            int bdir = 0;
1544
0
            for (int i = 1; i <= wl.qpel_dirs; i++)
1545
0
            {
1546
0
                MV qmv = bmv + square1[i];
1547
1548
                // check mv range for slice bound
1549
0
                if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
1550
0
                    continue;
1551
1552
0
                int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
1553
0
                COPY2_IF_LT(bcost, cost, bdir, i);
1554
0
            }
1555
1556
0
            if (bdir)
1557
0
                bmv += square1[bdir];
1558
0
            else
1559
0
                break;
1560
0
        }
1561
0
    }
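    /* Editor's note: a minimal generic sketch of the refinement pattern above, not part of
     * motion.cpp.  Both the half-pel stage (step of 2 in quarter-pel units) and the
     * quarter-pel stage (step of 1) are the same greedy loop: probe the square1[] neighbours
     * of the current best MV, move to the best improving one, and stop early when none
     * improves.  The lowres branch follows the same pattern with lowresQPelCost().  SATD is
     * used below for brevity; per the workload table the half-pel pass may use SAD. */
    auto refineSquareSketch = [&](MV cur, int stepInQpel, int iters, int dirs) -> MV
    {
        for (int iter = 0; iter < iters; iter++)
        {
            int bestDir = 0;
            for (int i = 1; i <= dirs; i++)
            {
                MV cand = cur + square1[i] * stepInQpel;
                if ((cand.y < qmvmin.y) | (cand.y > qmvmax.y))
                    continue;  // respect the slice-boundary clamp on vertical MVs
                int cost = subpelCompare(ref, cand, satd) + mvcost(cand);
                COPY2_IF_LT(bcost, cost, bestDir, i);
            }
            if (!bestDir)
                break;         // converged: no neighbour beat the current best
            cur += square1[bestDir] * stepInQpel;
        }
        return cur;
    };
    // e.g. bmv = refineSquareSketch(bmv, 2, wl.hpel_iters, wl.hpel_dirs);  // half-pel pass
    //      bmv = refineSquareSketch(bmv, 1, wl.qpel_iters, wl.qpel_dirs);  // quarter-pel pass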
1562
1563
    // check mv range for slice bound
1564
0
    X265_CHECK(((bmv.y >= qmvmin.y) & (bmv.y <= qmvmax.y)), "mv beyond range!");
1565
1566
0
    x265_emms();
1567
0
    outQMv = bmv;
1568
0
    return bcost;
1569
0
}
1570
1571
int MotionEstimate::subpelCompare(ReferencePlanes *ref, const MV& qmv, pixelcmp_t cmp)
1572
0
{
1573
0
    intptr_t refStride = ref->lumaStride;
1574
0
    const pixel* fref = ref->fpelPlane[0] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * refStride;
1575
0
    int xFrac = qmv.x & 0x3;
1576
0
    int yFrac = qmv.y & 0x3;
1577
0
    int cost;
1578
0
    const intptr_t fencStride = FENC_STRIDE;
1579
0
    X265_CHECK(fencPUYuv.m_size == FENC_STRIDE, "fenc buffer is assumed to have FENC_STRIDE by sad_x3 and sad_x4\n");
1580
1581
0
    ALIGN_VAR_32(pixel, subpelbuf[MAX_CU_SIZE * MAX_CU_SIZE]);
1582
    
1583
0
    if (!(yFrac | xFrac))
1584
0
        cost = cmp(fencPUYuv.m_buf[0], fencStride, fref, refStride);
1585
0
    else
1586
0
    {
1587
        /* We are taking a shortcut here if the reference is weighted. To be
1588
         * accurate we should interpolate unweighted pixels and weight the
1589
         * final 16-bit values prior to rounding and down-shifting. Instead we
1590
         * simply interpolate the weighted full-pel pixels. Not 100%
1591
         * accurate, but good enough for fast qpel ME */
1592
0
        if (!yFrac)
1593
0
            primitives.pu[partEnum].luma_hpp(fref, refStride, subpelbuf, blockwidth, xFrac);
1594
0
        else if (!xFrac)
1595
0
            primitives.pu[partEnum].luma_vpp(fref, refStride, subpelbuf, blockwidth, yFrac);
1596
0
        else
1597
0
            primitives.pu[partEnum].luma_hvpp(fref, refStride, subpelbuf, blockwidth, xFrac, yFrac);
1598
0
        cost = cmp(fencPUYuv.m_buf[0], fencStride, subpelbuf, blockwidth);
1599
0
    }
1600
1601
0
    if (bChromaSATD)
1602
0
    {
1603
0
        int csp    = fencPUYuv.m_csp;
1604
0
        int hshift = fencPUYuv.m_hChromaShift;
1605
0
        int vshift = fencPUYuv.m_vChromaShift;
1606
0
        int mvx = qmv.x << (1 - hshift);
1607
0
        int mvy = qmv.y << (1 - vshift);
1608
0
        intptr_t fencStrideC = fencPUYuv.m_csize;
1609
1610
0
        intptr_t refStrideC = ref->reconPic->m_strideC;
1611
0
        intptr_t refOffset = (mvx >> 3) + (mvy >> 3) * refStrideC;
1612
1613
0
        const pixel* refCb = ref->getCbAddr(ctuAddr, absPartIdx) + refOffset;
1614
0
        const pixel* refCr = ref->getCrAddr(ctuAddr, absPartIdx) + refOffset;
1615
1616
0
        X265_CHECK((hshift == 0) || (hshift == 1), "hshift must be 0 or 1\n");
1617
0
        X265_CHECK((vshift == 0) || (vshift == 1), "vshift must be 0 or 1\n");
1618
1619
0
        xFrac = mvx & 7;
1620
0
        yFrac = mvy & 7;
1621
1622
0
        if (!(yFrac | xFrac))
1623
0
        {
1624
0
            cost += chromaSatd(fencPUYuv.m_buf[1], fencStrideC, refCb, refStrideC);
1625
0
            cost += chromaSatd(fencPUYuv.m_buf[2], fencStrideC, refCr, refStrideC);
1626
0
        }
1627
0
        else
1628
0
        {
1629
0
            int blockwidthC = blockwidth >> hshift;
1630
1631
0
            if (!yFrac)
1632
0
            {
1633
0
                primitives.chroma[csp].pu[partEnum].filter_hpp(refCb, refStrideC, subpelbuf, blockwidthC, xFrac);
1634
0
                cost += chromaSatd(fencPUYuv.m_buf[1], fencStrideC, subpelbuf, blockwidthC);
1635
1636
0
                primitives.chroma[csp].pu[partEnum].filter_hpp(refCr, refStrideC, subpelbuf, blockwidthC, xFrac);
1637
0
                cost += chromaSatd(fencPUYuv.m_buf[2], fencStrideC, subpelbuf, blockwidthC);
1638
0
            }
1639
0
            else if (!xFrac)
1640
0
            {
1641
0
                primitives.chroma[csp].pu[partEnum].filter_vpp(refCb, refStrideC, subpelbuf, blockwidthC, yFrac);
1642
0
                cost += chromaSatd(fencPUYuv.m_buf[1], fencStrideC, subpelbuf, blockwidthC);
1643
1644
0
                primitives.chroma[csp].pu[partEnum].filter_vpp(refCr, refStrideC, subpelbuf, blockwidthC, yFrac);
1645
0
                cost += chromaSatd(fencPUYuv.m_buf[2], fencStrideC, subpelbuf, blockwidthC);
1646
0
            }
1647
0
            else
1648
0
            {
1649
0
                ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA - 1)]);
1650
0
                const int halfFilterSize = (NTAPS_CHROMA >> 1);
1651
1652
0
                primitives.chroma[csp].pu[partEnum].filter_hps(refCb, refStrideC, immed, blockwidthC, xFrac, 1);
1653
0
                primitives.chroma[csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * blockwidthC, blockwidthC, subpelbuf, blockwidthC, yFrac);
1654
0
                cost += chromaSatd(fencPUYuv.m_buf[1], fencStrideC, subpelbuf, blockwidthC);
1655
1656
0
                primitives.chroma[csp].pu[partEnum].filter_hps(refCr, refStrideC, immed, blockwidthC, xFrac, 1);
1657
0
                primitives.chroma[csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * blockwidthC, blockwidthC, subpelbuf, blockwidthC, yFrac);
1658
0
                cost += chromaSatd(fencPUYuv.m_buf[2], fencStrideC, subpelbuf, blockwidthC);
1659
0
            }
1660
0
        }
1661
0
    }
1662
1663
0
    return cost;
1664
0
}
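/* Editor's note: standalone illustrative sketch, not part of motion.cpp.  subpelCompare()
 * splits the quarter-pel luma MV into a full-pel offset plus a 2-bit fractional phase that
 * selects the hpp/vpp/hvpp interpolation path, then (for 4:2:0, hshift == vshift == 1)
 * re-expresses the same displacement in eighth-pel chroma units, giving a 3-bit chroma phase.
 * For example, qmv = (9, -6) gives luma full-pel (2, -2), luma frac (1, 2), chroma frac (1, 2). */
static void decomposeQpelMvSketch(const MV& qmv, int hshift, int vshift,
                                  int& xFracLuma, int& yFracLuma, int& xFracChroma, int& yFracChroma)
{
    xFracLuma = qmv.x & 3;               // quarter-pel phase; the integer part is qmv.x >> 2
    yFracLuma = qmv.y & 3;
    int mvxC = qmv.x << (1 - hshift);    // same displacement in 1/8-pel chroma units
    int mvyC = qmv.y << (1 - vshift);
    xFracChroma = mvxC & 7;              // eighth-pel chroma phase; full-sample offset is mvxC >> 3
    yFracChroma = mvyC & 7;
}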