Coverage Report

Created: 2025-07-23 08:18

/src/x265/source/encoder/motion.cpp
Line
Count
Source (jump to first uncovered line)
1
/*****************************************************************************
2
 * Copyright (C) 2013-2020 MulticoreWare, Inc
3
 *
4
 * Authors: Steve Borho <steve@borho.org>
5
 *          Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
 * the Free Software Foundation; either version 2 of the License, or
10
 * (at your option) any later version.
11
 *
12
 * This program is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 * GNU General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU General Public License
18
 * along with this program; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
 *
21
 * This program is also available under a commercial proprietary license.
22
 * For more information, contact us at license @ x265.com.
23
 *****************************************************************************/
24
25
#include "common.h"
26
#include "primitives.h"
27
#include "lowres.h"
28
#include "motion.h"
29
#include "x265.h"
30
31
#if _MSC_VER
32
#pragma warning(disable: 4127) // conditional  expression is constant (macros use this construct)
33
#endif
34
35
using namespace X265_NS;
36
37
namespace {

/* Per-subme-level effort description for subpel refinement; indexed by
 * param.subpelRefine (0 .. X265_MAX_SUBPEL_LEVEL). */
struct SubpelWorkload
{
    int hpel_iters;   // half-pel refinement iterations
    int hpel_dirs;    // directions probed per half-pel iteration (4 or 8)
    int qpel_iters;   // quarter-pel refinement iterations
    int qpel_dirs;    // directions probed per quarter-pel iteration
    bool hpel_satd;   // use SATD (true) or SAD (false) for half-pel costs
};

const SubpelWorkload workload[X265_MAX_SUBPEL_LEVEL + 1] =
{
    { 1, 4, 0, 4, false }, // 4 SAD HPEL only
    { 1, 4, 1, 4, false }, // 4 SAD HPEL + 4 SATD QPEL
    { 1, 4, 1, 4, true },  // 4 SATD HPEL + 4 SATD QPEL
    { 2, 4, 1, 4, true },  // 2x4 SATD HPEL + 4 SATD QPEL
    { 2, 4, 2, 4, true },  // 2x4 SATD HPEL + 2x4 SATD QPEL
    { 1, 8, 1, 8, true },  // 8 SATD HPEL + 8 SATD QPEL (default)
    { 2, 8, 1, 8, true },  // 2x8 SATD HPEL + 8 SATD QPEL
    { 2, 8, 2, 8, true },  // 2x8 SATD HPEL + 2x8 SATD QPEL
};

/* per-partition scale factor used by SAD_THRESH; filled by initScales() */
static int sizeScale[NUM_PU_SIZES];

/* true when bcost is already below a partition-size-scaled threshold v/16 */
#define SAD_THRESH(v) (bcost < (((v >> 4) * sizeScale[partEnum])))

/* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) };
const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 };  /* (x-1)%6 */
/* index 0 is the center; 1..8 are the 8 neighbors of a square refine step */
const MV square1[9] = { MV(0, 0), MV(0, -1), MV(0, 1), MV(-1, 0), MV(1, 0), MV(-1, -1), MV(-1, 1), MV(1, -1), MV(1, 1) };
/* radius 4 hexagon ring used by UMH-style searches */
const MV hex4[16] =
{
    MV(0, -4), MV(0, 4), MV(-2, -3), MV(2, -3),
    MV(-4, -2), MV(4, -2), MV(-4, -1), MV(4, -1),
    MV(-4, 0), MV(4, 0), MV(-4, 1), MV(4, 1),
    MV(-4, 2), MV(4, 2), MV(-2, 3), MV(2, 3),
};
const MV offsets[] =
{
    MV(-1, 0), MV(0, -1),
    MV(-1, -1), MV(1, -1),
    MV(-1, 0), MV(1, 0),
    MV(-1, 1), MV(-1, -1),
    MV(1, -1), MV(1, 1),
    MV(-1, 0), MV(0, 1),
    MV(-1, 1), MV(1, 1),
    MV(1, 0), MV(0, 1),
}; // offsets for Two Point Search

/* sum of absolute differences between MV candidates, used for adaptive ME range */
inline int predictorDifference(const MV *mvc, intptr_t numCandidates)
{
    int sum = 0;

    /* accumulate |dx| + |dy| between each consecutive candidate pair */
    for (int i = 0; i < numCandidates - 1; i++)
    {
        sum += abs(mvc[i].x - mvc[i + 1].x)
            +  abs(mvc[i].y - mvc[i + 1].y);
    }

    return sum;
}

}
101
102
/* Default-construct with every field in a safe "unset" state; real
 * configuration happens later via init() and setSourcePU(). */
MotionEstimate::MotionEstimate()
{
    ctuAddr = absPartIdx = -1;           // no CTU/PU bound yet
    searchMethod   = X265_HEX_SEARCH;
    searchMethodL0 = X265_HEX_SEARCH;
    searchMethodL1 = X265_HEX_SEARCH;
    subpelRefine = 2;
    blockwidth = blockheight = 0;
    blockOffset = 0;
    bChromaSATD = false;                 // luma-only until a chroma-aware setup
    chromaSatd = NULL;
    for (int k = 0; k < INTEGRAL_PLANE_NUM; k++)
        integral[k] = NULL;
}
117
118
/* One-time setup: allocate the cached source-PU buffer at the fixed
 * FENC_STRIDE pitch, with chroma planes as dictated by csp. */
void MotionEstimate::init(int csp)
{
    fencPUYuv.create(FENC_STRIDE, csp);
}
122
123
/* Populate the per-partition threshold scale table used by SAD_THRESH.
 * Called once at startup for every supported luma PU size. */
void MotionEstimate::initScales(void)
{
    /* NOTE(review): the scale is (H * H) >> 4, not (W * H) >> 4, so
     * non-square partitions are scaled by height only — presumably
     * intentional upstream tuning; confirm before changing. */
#define SETUP_SCALE(W, H) \
    sizeScale[LUMA_ ## W ## x ## H] = (H * H) >> 4;
    SETUP_SCALE(4, 4);
    SETUP_SCALE(8, 8);
    SETUP_SCALE(8, 4);
    SETUP_SCALE(4, 8);
    SETUP_SCALE(16, 16);
    SETUP_SCALE(16, 8);
    SETUP_SCALE(8, 16);
    SETUP_SCALE(16, 12);
    SETUP_SCALE(12, 16);
    SETUP_SCALE(4, 16);
    SETUP_SCALE(16, 4);
    SETUP_SCALE(32, 32);
    SETUP_SCALE(32, 16);
    SETUP_SCALE(16, 32);
    SETUP_SCALE(32, 24);
    SETUP_SCALE(24, 32);
    SETUP_SCALE(32, 8);
    SETUP_SCALE(8, 32);
    SETUP_SCALE(64, 64);
    SETUP_SCALE(64, 32);
    SETUP_SCALE(32, 64);
    SETUP_SCALE(64, 48);
    SETUP_SCALE(48, 64);
    SETUP_SCALE(64, 16);
    SETUP_SCALE(16, 64);
#undef SETUP_SCALE
}
154
155
int MotionEstimate::hpelIterationCount(int subme)
156
0
{
157
0
    return workload[subme].hpel_iters +
158
0
           workload[subme].qpel_iters / 2;
159
0
}
160
161
/* Release the cached source-PU buffer allocated in init(). */
MotionEstimate::~MotionEstimate()
{
    fencPUYuv.destroy();
}
165
166
/* Called by lookahead, luma only, no use of PicYuv */
167
void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int method, const int searchL0, const int searchL1, const int refine)
168
0
{
169
0
    partEnum = partitionFromSizes(pwidth, pheight);
170
0
    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
171
0
    sad = primitives.pu[partEnum].sad;
172
0
    ads = primitives.pu[partEnum].ads;
173
0
    satd = primitives.pu[partEnum].satd;
174
0
    sad_x3 = primitives.pu[partEnum].sad_x3;
175
0
    sad_x4 = primitives.pu[partEnum].sad_x4;
176
177
178
0
    blockwidth = pwidth;
179
0
    blockOffset = offset;
180
0
    absPartIdx = ctuAddr = -1;
181
182
    /* Search params */
183
0
    searchMethod = method;
184
0
    searchMethodL0 = searchL0;
185
0
    searchMethodL1 = searchL1;
186
0
    subpelRefine = refine;
187
188
    /* copy PU block into cache */
189
0
    primitives.pu[partEnum].copy_pp(fencPUYuv.m_buf[0], FENC_STRIDE, fencY + offset, stride);
190
0
    X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
191
0
}
192
193
/* Called by lookahead, luma only, no use of PicYuv */
194
void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int method, const int refine)
195
0
{
196
0
    partEnum = partitionFromSizes(pwidth, pheight);
197
0
    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
198
0
    sad = primitives.pu[partEnum].sad;
199
0
    ads = primitives.pu[partEnum].ads;
200
0
    satd = primitives.pu[partEnum].satd;
201
0
    sad_x3 = primitives.pu[partEnum].sad_x3;
202
0
    sad_x4 = primitives.pu[partEnum].sad_x4;
203
204
205
0
    blockwidth = pwidth;
206
0
    blockOffset = offset;
207
0
    absPartIdx = ctuAddr = -1;
208
209
    /* Search params */
210
0
    searchMethod = method;
211
0
    subpelRefine = refine;
212
213
    /* copy PU block into cache */
214
0
    primitives.pu[partEnum].copy_pp(fencPUYuv.m_buf[0], FENC_STRIDE, fencY + offset, stride);
215
0
    X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
216
0
}
217
218
/* Called by Search::predInterSearch() or --pme equivalent, chroma residual might be considered */
219
void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight, const int method, const int refine, bool bChroma)
220
0
{
221
0
    partEnum = partitionFromSizes(pwidth, pheight);
222
0
    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
223
0
    sad = primitives.pu[partEnum].sad;
224
0
    ads = primitives.pu[partEnum].ads;
225
0
    satd = primitives.pu[partEnum].satd;
226
0
    sad_x3 = primitives.pu[partEnum].sad_x3;
227
0
    sad_x4 = primitives.pu[partEnum].sad_x4;
228
229
0
    chromaSatd = primitives.chroma[fencPUYuv.m_csp].pu[partEnum].satd;
230
231
    /* Set search characteristics */
232
0
    searchMethod = method;
233
0
    subpelRefine = refine;
234
235
    /* Enable chroma residual cost if subpelRefine level is greater than 2 and chroma block size
236
     * is an even multiple of 4x4 pixels (indicated by non-null chromaSatd pointer) */
237
0
    bChromaSATD = subpelRefine > 2 && chromaSatd && (srcFencYuv.m_csp != X265_CSP_I400 && bChroma);
238
0
    X265_CHECK(!(bChromaSATD && !workload[subpelRefine].hpel_satd), "Chroma SATD cannot be used with SAD hpel\n");
239
240
0
    ctuAddr = _ctuAddr;
241
0
    absPartIdx = cuPartIdx + puPartIdx;
242
0
    blockwidth = pwidth;
243
0
    blockOffset = 0;
244
245
    /* copy PU from CU Yuv */
246
0
    fencPUYuv.copyPUFromYuv(srcFencYuv, puPartIdx, partEnum, bChromaSATD);
247
0
}
248
249
/* Integer-pel SAD+mvcost check of one MV; on improvement also records which
 * star point (bPointNr) and ring distance (bDistance) produced it. */
#define COST_MV_PT_DIST(mx, my, point, dist) \
    do \
    { \
        MV tmv(mx, my); \
        int cost = sad(fenc, FENC_STRIDE, fref + mx + my * stride, stride); \
        cost += mvcost(tmv << 2); \
        if (cost < bcost) { \
            bcost = cost; \
            bmv = tmv; \
            bPointNr = point; \
            bDistance = dist; \
        } \
    } while (0)

/* Integer-pel SAD+mvcost check of one MV; keeps only best cost/MV. */
#define COST_MV(mx, my) \
    do \
    { \
        int cost = sad(fenc, FENC_STRIDE, fref + (mx) + (my) * stride, stride); \
        cost += mvcost(MV(mx, my) << 2); \
        COPY2_IF_LT(bcost, cost, bmv, MV(mx, my)); \
    } while (0)

/* Batched 3-way SAD relative to the current best MV (bmv); fills costs[]
 * with SAD+mvcost but does NOT update bcost/bmv — caller decides. */
#define COST_MV_X3_DIR(m0x, m0y, m1x, m1y, m2x, m2y, costs) \
    { \
        pixel *pix_base = fref + bmv.x + bmv.y * stride; \
        sad_x3(fenc, \
               pix_base + (m0x) + (m0y) * stride, \
               pix_base + (m1x) + (m1y) * stride, \
               pix_base + (m2x) + (m2y) * stride, \
               stride, costs); \
        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
    }

/* Batched 4-way check of absolute MVs with point/distance bookkeeping;
 * updates bcost/bmv/bPointNr/bDistance in-place. */
#define COST_MV_PT_DIST_X4(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \
    { \
        sad_x4(fenc, \
               fref + (m0x) + (m0y) * stride, \
               fref + (m1x) + (m1y) * stride, \
               fref + (m2x) + (m2y) * stride, \
               fref + (m3x) + (m3y) * stride, \
               stride, costs); \
        (costs)[0] += mvcost(MV(m0x, m0y) << 2); \
        (costs)[1] += mvcost(MV(m1x, m1y) << 2); \
        (costs)[2] += mvcost(MV(m2x, m2y) << 2); \
        (costs)[3] += mvcost(MV(m3x, m3y) << 2); \
        COPY4_IF_LT(bcost, costs[0], bmv, MV(m0x, m0y), bPointNr, p0, bDistance, d0); \
        COPY4_IF_LT(bcost, costs[1], bmv, MV(m1x, m1y), bPointNr, p1, bDistance, d1); \
        COPY4_IF_LT(bcost, costs[2], bmv, MV(m2x, m2y), bPointNr, p2, bDistance, d2); \
        COPY4_IF_LT(bcost, costs[3], bmv, MV(m3x, m3y), bPointNr, p3, bDistance, d3); \
    }

/* Batched 4-way check of offsets relative to omv; only candidates whose y
 * stays inside [mvmin.y, mvmax.y] may update bcost/bmv (slice-boundary
 * safety — x is assumed pre-clamped by the caller). */
#define COST_MV_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y) \
    { \
        pixel *pix_base = fref + omv.x + omv.y * stride; \
        sad_x4(fenc, \
               pix_base + (m0x) + (m0y) * stride, \
               pix_base + (m1x) + (m1y) * stride, \
               pix_base + (m2x) + (m2y) * stride, \
               pix_base + (m3x) + (m3y) * stride, \
               stride, costs); \
        costs[0] += mvcost((omv + MV(m0x, m0y)) << 2); \
        costs[1] += mvcost((omv + MV(m1x, m1y)) << 2); \
        costs[2] += mvcost((omv + MV(m2x, m2y)) << 2); \
        costs[3] += mvcost((omv + MV(m3x, m3y)) << 2); \
        if ((omv.y + m0y >= mvmin.y) & (omv.y + m0y <= mvmax.y)) \
            COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \
        if ((omv.y + m1y >= mvmin.y) & (omv.y + m1y <= mvmax.y)) \
            COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \
        if ((omv.y + m2y >= mvmin.y) & (omv.y + m2y <= mvmax.y)) \
            COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \
        if ((omv.y + m3y >= mvmin.y) & (omv.y + m3y <= mvmax.y)) \
            COPY2_IF_LT(bcost, costs[3], bmv, omv + MV(m3x, m3y)); \
    }

/* 3-way check of absolute MVs using only the x-component MV cost table
 * (used where the y cost is constant across the three candidates). */
#define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y )\
{\
    sad_x3(fenc, \
    fref + (m0x) + (m0y) * stride, \
    fref + (m1x) + (m1y) * stride, \
    fref + (m2x) + (m2y) * stride, \
    stride, costs); \
    costs[0] += p_cost_mvx[(m0x) << 2]; /* no cost_mvy */\
    costs[1] += p_cost_mvx[(m1x) << 2]; \
    costs[2] += p_cost_mvx[(m2x) << 2]; \
    COPY3_IF_LT(bcost, costs[0], bmv.x, m0x, bmv.y, m0y); \
    COPY3_IF_LT(bcost, costs[1], bmv.x, m1x, bmv.y, m1y); \
    COPY3_IF_LT(bcost, costs[2], bmv.x, m2x, bmv.y, m2y); \
}

/* Batched 4-way SAD relative to the current best MV (bmv); fills costs[]
 * only — the caller applies its own bound checks before accepting. */
#define COST_MV_X4_DIR(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs) \
    { \
        pixel *pix_base = fref + bmv.x + bmv.y * stride; \
        sad_x4(fenc, \
               pix_base + (m0x) + (m0y) * stride, \
               pix_base + (m1x) + (m1y) * stride, \
               pix_base + (m2x) + (m2y) * stride, \
               pix_base + (m3x) + (m3y) * stride, \
               stride, costs); \
        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
        (costs)[3] += mvcost((bmv + MV(m3x, m3y)) << 2); \
    }

/* One small-diamond iteration centered at (mx, my). */
#define DIA1_ITER(mx, my) \
    { \
        omv.x = mx; omv.y = my; \
        COST_MV_X4(0, -1, 0, 1, -1, 0, 1, 0); \
    }

/* Cross search: scan the horizontal then vertical axis out to x_max/y_max
 * in steps of 2, using the batched 4-way form when fully inside the MV
 * range and per-point bound checks otherwise. */
#define CROSS(start, x_max, y_max) \
    { \
        int16_t i = start; \
        if ((x_max) <= X265_MIN(mvmax.x - omv.x, omv.x - mvmin.x)) \
            for (; i < (x_max) - 2; i += 4) { \
                COST_MV_X4(i, 0, -i, 0, i + 2, 0, -i - 2, 0); } \
        for (; i < (x_max); i += 2) \
        { \
            if (omv.x + i <= mvmax.x) \
                COST_MV(omv.x + i, omv.y); \
            if (omv.x - i >= mvmin.x) \
                COST_MV(omv.x - i, omv.y); \
        } \
        i = start; \
        if ((y_max) <= X265_MIN(mvmax.y - omv.y, omv.y - mvmin.y)) \
            for (; i < (y_max) - 2; i += 4) { \
                COST_MV_X4(0, i, 0, -i, 0, i + 2, 0, -i - 2); } \
        for (; i < (y_max); i += 2) \
        { \
            if (omv.y + i <= mvmax.y) \
                COST_MV(omv.x, omv.y + i); \
            if (omv.y - i >= mvmin.y) \
                COST_MV(omv.x, omv.y - i); \
        } \
    }
386
387
/* Star-pattern integer search: probes expanding rings (dist 1, then 2/4/8,
 * then 16..merange) around the start MV.  Updates bmv/bcost in-place and
 * records the winning star point (bPointNr) and ring distance (bDistance)
 * so the caller can run a follow-up raster/refine step.  Exits early after
 * earlyExitIters consecutive rings without improvement.  hme selects the
 * half-resolution planes used by hierarchical motion estimation. */
void MotionEstimate::StarPatternSearch(ReferencePlanes *ref,
                                       const MV &       mvmin,
                                       const MV &       mvmax,
                                       MV &             bmv,
                                       int &            bcost,
                                       int &            bPointNr,
                                       int &            bDistance,
                                       int              earlyExitIters,
                                       int              merange,
                                       int              hme)
{
    ALIGN_VAR_16(int, costs[16]);
    pixel* fenc = fencPUYuv.m_buf[0];
    /* hme path reads the lower-resolution plane; stride halves accordingly */
    pixel* fref = (hme? ref->fpelLowerResPlane[0] : ref->fpelPlane[0]) + blockOffset;
    intptr_t stride = hme? ref->lumaStride / 2 : ref->lumaStride;

    MV omv = bmv;
    int saved = bcost;
    int rounds = 0;

    /* ring 1: plain diamond around the start point */
    {
        int16_t dist = 1;

        /* bPointNr
              2
            4 * 5
              7
         */
        const int32_t top    = omv.y - dist;
        const int32_t bottom = omv.y + dist;
        const int32_t left   = omv.x - dist;
        const int32_t right  = omv.x + dist;

        if (top >= mvmin.y && left >= mvmin.x && right <= mvmax.x && bottom <= mvmax.y)
        {
            /* entire diamond inside the MV range: one batched 4-way check */
            COST_MV_PT_DIST_X4(omv.x,  top,    2, dist,
                               left,  omv.y,   4, dist,
                               right, omv.y,   5, dist,
                               omv.x,  bottom, 7, dist);
        }
        else
        {
            if (top >= mvmin.y) // check top
            {
                COST_MV_PT_DIST(omv.x, top, 2, dist);
            }
            if (left >= mvmin.x) // check middle left
            {
                COST_MV_PT_DIST(left, omv.y, 4, dist);
            }
            if (right <= mvmax.x) // check middle right
            {
                COST_MV_PT_DIST(right, omv.y, 5, dist);
            }
            if (bottom <= mvmax.y) // check bottom
            {
                COST_MV_PT_DIST(omv.x, bottom, 7, dist);
            }
        }
        if (bcost < saved)
            rounds = 0;
        else if (++rounds >= earlyExitIters)
            return;
    }

    /* rings 2, 4, 8: 8-point star (4 at dist, 4 diagonals at dist/2) */
    for (int16_t dist = 2; dist <= 8; dist <<= 1)
    {
        /* bPointNr
              2
             1 3
            4 * 5
             6 8
              7
         Points 2, 4, 5, 7 are dist
         Points 1, 3, 6, 8 are dist>>1
         */
        const int32_t top     = omv.y - dist;
        const int32_t bottom  = omv.y + dist;
        const int32_t left    = omv.x - dist;
        const int32_t right   = omv.x + dist;
        const int32_t top2    = omv.y - (dist >> 1);
        const int32_t bottom2 = omv.y + (dist >> 1);
        const int32_t left2   = omv.x - (dist >> 1);
        const int32_t right2  = omv.x + (dist >> 1);
        saved = bcost;

        if (top >= mvmin.y && left >= mvmin.x &&
            right <= mvmax.x && bottom <= mvmax.y) // check border
        {
            COST_MV_PT_DIST_X4(omv.x,  top,   2, dist,
                               left2,  top2,  1, dist >> 1,
                               right2, top2,  3, dist >> 1,
                               left,   omv.y, 4, dist);
            COST_MV_PT_DIST_X4(right,  omv.y,   5, dist,
                               left2,  bottom2, 6, dist >> 1,
                               right2, bottom2, 8, dist >> 1,
                               omv.x,  bottom,  7, dist);
        }
        else // check border for each mv
        {
            if (top >= mvmin.y) // check top
            {
                COST_MV_PT_DIST(omv.x, top, 2, dist);
            }
            if (top2 >= mvmin.y) // check half top
            {
                if (left2 >= mvmin.x) // check half left
                {
                    COST_MV_PT_DIST(left2, top2, 1, (dist >> 1));
                }
                if (right2 <= mvmax.x) // check half right
                {
                    COST_MV_PT_DIST(right2, top2, 3, (dist >> 1));
                }
            }
            if (left >= mvmin.x) // check left
            {
                COST_MV_PT_DIST(left, omv.y, 4, dist);
            }
            if (right <= mvmax.x) // check right
            {
                COST_MV_PT_DIST(right, omv.y, 5, dist);
            }
            if (bottom2 <= mvmax.y) // check half bottom
            {
                if (left2 >= mvmin.x) // check half left
                {
                    COST_MV_PT_DIST(left2, bottom2, 6, (dist >> 1));
                }
                if (right2 <= mvmax.x) // check half right
                {
                    COST_MV_PT_DIST(right2, bottom2, 8, (dist >> 1));
                }
            }
            if (bottom <= mvmax.y) // check bottom
            {
                COST_MV_PT_DIST(omv.x, bottom, 7, dist);
            }
        }

        if (bcost < saved)
            rounds = 0;
        else if (++rounds >= earlyExitIters)
            return;
    }

    /* rings 16..merange: 16-point sampled ring; bPointNr is always 0 here
     * (no directional follow-up for the wide rings) */
    for (int16_t dist = 16; dist <= (int16_t)merange; dist <<= 1)
    {
        const int32_t top    = omv.y - dist;
        const int32_t bottom = omv.y + dist;
        const int32_t left   = omv.x - dist;
        const int32_t right  = omv.x + dist;

        saved = bcost;
        if (top >= mvmin.y && left >= mvmin.x &&
            right <= mvmax.x && bottom <= mvmax.y) // check border
        {
            /* index
                  0
                  3
                  2
                  1
          0 3 2 1 * 1 2 3 0
                  1
                  2
                  3
                  0
            */

            COST_MV_PT_DIST_X4(omv.x,  top,    0, dist,
                               left,   omv.y,  0, dist,
                               right,  omv.y,  0, dist,
                               omv.x,  bottom, 0, dist);

            /* interior points of the ring, stepped in dist/4 increments */
            for (int16_t index = 1; index < 4; index++)
            {
                int32_t posYT = top    + ((dist >> 2) * index);
                int32_t posYB = bottom - ((dist >> 2) * index);
                int32_t posXL = omv.x  - ((dist >> 2) * index);
                int32_t posXR = omv.x  + ((dist >> 2) * index);

                COST_MV_PT_DIST_X4(posXL, posYT, 0, dist,
                                   posXR, posYT, 0, dist,
                                   posXL, posYB, 0, dist,
                                   posXR, posYB, 0, dist);
            }
        }
        else // check border for each mv
        {
            if (top >= mvmin.y) // check top
            {
                COST_MV_PT_DIST(omv.x, top, 0, dist);
            }
            if (left >= mvmin.x) // check left
            {
                COST_MV_PT_DIST(left, omv.y, 0, dist);
            }
            if (right <= mvmax.x) // check right
            {
                COST_MV_PT_DIST(right, omv.y, 0, dist);
            }
            if (bottom <= mvmax.y) // check bottom
            {
                COST_MV_PT_DIST(omv.x, bottom, 0, dist);
            }
            for (int16_t index = 1; index < 4; index++)
            {
                int32_t posYT = top    + ((dist >> 2) * index);
                int32_t posYB = bottom - ((dist >> 2) * index);
                int32_t posXL = omv.x - ((dist >> 2) * index);
                int32_t posXR = omv.x + ((dist >> 2) * index);

                if (posYT >= mvmin.y) // check top
                {
                    if (posXL >= mvmin.x) // check left
                    {
                        COST_MV_PT_DIST(posXL, posYT, 0, dist);
                    }
                    if (posXR <= mvmax.x) // check right
                    {
                        COST_MV_PT_DIST(posXR, posYT, 0, dist);
                    }
                }
                if (posYB <= mvmax.y) // check bottom
                {
                    if (posXL >= mvmin.x) // check left
                    {
                        COST_MV_PT_DIST(posXL, posYB, 0, dist);
                    }
                    if (posXR <= mvmax.x) // check right
                    {
                        COST_MV_PT_DIST(posXR, posYB, 0, dist);
                    }
                }
            }
        }

        if (bcost < saved)
            rounds = 0;
        else if (++rounds >= earlyExitIters)
            return;
    }
}
630
631
/* Refine an MV prediction (qmvp, quarter-pel units) around its full-pel
 * rounding: one SAD square-refine step at full-pel, then half-pel and
 * quarter-pel SATD refinement per the fixed workload.  Writes the best
 * quarter-pel MV to outQMv.  Only the y component is bound-checked during
 * subpel steps (slice-boundary safety; x is assumed pre-clamped). */
void MotionEstimate::refineMV(ReferencePlanes* ref,
                              const MV&        mvmin,
                              const MV&        mvmax,
                              const MV&        qmvp,
                              MV&              outQMv)
{
    ALIGN_VAR_16(int, costs[16]);
    /* resolve this PU's offset into the reconstructed reference plane */
    if (ctuAddr >= 0)
        blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0);
    intptr_t stride = ref->lumaStride;
    pixel* fenc = fencPUYuv.m_buf[0];
    pixel* fref = ref->fpelPlane[0] + blockOffset;

    setMVP(qmvp);

    MV qmvmin = mvmin.toQPel();
    MV qmvmax = mvmax.toQPel();

    /* The term cost used here means satd/sad values for that particular search.
     * The costs used in ME integer search only includes the SAD cost of motion
     * residual and sqrtLambda times MVD bits.  The subpel refine steps use SATD
     * cost of residual and sqrtLambda * MVD bits.
    */

    // measure SATD cost at clipped QPEL MVP
    MV pmv = qmvp.clipped(qmvmin, qmvmax);
    MV bestpre = pmv;
    int bprecost;

    bprecost = subpelCompare(ref, pmv, sad);

    /* re-measure full pel rounded MVP with SAD as search start point */
    MV bmv = pmv.roundToFPel();
    int bcost = bprecost;
    if (pmv.isSubpel())
        bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2);

    /* square refine: probe all 8 full-pel neighbors, remembering which
     * direction (square1 index) wins; y-only bound checks as above */
    int dir = 0;
    COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs);
    if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
        COPY2_IF_LT(bcost, costs[0], dir, 1);
    if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
        COPY2_IF_LT(bcost, costs[1], dir, 2);
    COPY2_IF_LT(bcost, costs[2], dir, 3);
    COPY2_IF_LT(bcost, costs[3], dir, 4);
    COST_MV_X4_DIR(-1, -1, -1, 1, 1, -1, 1, 1, costs);
    if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
        COPY2_IF_LT(bcost, costs[0], dir, 5);
    if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
        COPY2_IF_LT(bcost, costs[1], dir, 6);
    if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
        COPY2_IF_LT(bcost, costs[2], dir, 7);
    if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
        COPY2_IF_LT(bcost, costs[3], dir, 8);
    bmv += square1[dir];

    /* keep the original subpel predictor if the square refine didn't beat it */
    if (bprecost < bcost)
    {
        bmv = bestpre;
        bcost = bprecost;
    }
    else
        bmv = bmv.toQPel(); // promote search bmv to qpel

    // TO DO: Change SubpelWorkload to fine tune MV
    // Now it is set to 5 for experiment.
    // const SubpelWorkload& wl = workload[this->subpelRefine];
    const SubpelWorkload& wl = workload[5];

    pixelcmp_t hpelcomp;

    if (wl.hpel_satd)
    {
        bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
        hpelcomp = satd;
    }
    else
        hpelcomp = sad;

    /* half-pel refinement: step two qpel units per direction */
    for (int iter = 0; iter < wl.hpel_iters; iter++)
    {
        int bdir = 0;
        for (int i = 1; i <= wl.hpel_dirs; i++)
        {
            MV qmv = bmv + square1[i] * 2;

            // check mv range for slice bound
            if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
                continue;

            int cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);
            COPY2_IF_LT(bcost, cost, bdir, i);
        }

        if (bdir)
            bmv += square1[bdir] * 2;
        else
            break;
    }

    /* if HPEL search used SAD, remeasure with SATD before QPEL */
    if (!wl.hpel_satd)
        bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);

    /* quarter-pel refinement: one qpel unit per direction */
    for (int iter = 0; iter < wl.qpel_iters; iter++)
    {
        int bdir = 0;
        for (int i = 1; i <= wl.qpel_dirs; i++)
        {
            MV qmv = bmv + square1[i];

            // check mv range for slice bound
            if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
                continue;

            int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
            COPY2_IF_LT(bcost, cost, bdir, i);
        }

        if (bdir)
            bmv += square1[bdir];
        else
            break;
    }

    // check mv range for slice bound
    // NOTE(review): pmv was clipped to [qmvmin, qmvmax] above, so this
    // assertion is trivially true; presumably it was meant to check bmv —
    // confirm against upstream before changing.
    X265_CHECK(((pmv.y >= qmvmin.y) & (pmv.y <= qmvmax.y)), "mv beyond range!");

    x265_emms(); // exit MMX state before returning to float-using code
    outQMv = bmv;
}
763
764
int MotionEstimate::motionEstimate(ReferencePlanes *ref,
765
                                   const MV &       mvmin,
766
                                   const MV &       mvmax,
767
                                   const MV &       qmvp,
768
                                   int              numCandidates,
769
                                   const MV *       mvc,
770
                                   int              merange,
771
                                   MV &             outQMv,
772
                                   uint32_t         maxSlices,
773
                                    bool            m_vertRestriction,
774
                                   pixel *          srcReferencePlane)
775
0
{
776
0
    ALIGN_VAR_16(int, costs[16]);
777
0
    bool hme = srcReferencePlane && srcReferencePlane == ref->fpelLowerResPlane[0];
778
0
    if (ctuAddr >= 0)
779
0
        blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0);
780
0
    intptr_t stride = hme ? ref->lumaStride / 2 : ref->lumaStride;
781
0
    pixel* fenc = fencPUYuv.m_buf[0];
782
0
    pixel* fref = srcReferencePlane == 0 ? ref->fpelPlane[0] + blockOffset : srcReferencePlane + blockOffset;
783
784
0
    setMVP(qmvp);
785
786
0
    MV qmvmin = mvmin.toQPel();
787
0
    MV qmvmax = mvmax.toQPel();
788
789
    /* The term cost used here means satd/sad values for that particular search.
790
     * The costs used in ME integer search only includes the SAD cost of motion
791
     * residual and sqrtLambda times MVD bits.  The subpel refine steps use SATD
792
     * cost of residual and sqrtLambda * MVD bits.  Mode decision will be based
793
     * on video distortion cost (SSE/PSNR) plus lambda times all signaling bits
794
     * (mode + MVD bits). */
795
796
    // measure SAD cost at clipped QPEL MVP
797
0
    MV pmv = qmvp.clipped(qmvmin, qmvmax);
798
0
    if (m_vertRestriction)
799
0
    {
800
0
        if (pmv.y > mvmax.y << 2)
801
0
        {
802
0
            pmv.y = (mvmax.y << 2);
803
0
        }
804
0
    }
805
0
    MV bestpre = pmv;
806
0
    int bprecost;
807
808
0
    if (ref->isLowres)
809
0
        bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad, hme);
810
0
    else
811
0
        bprecost = subpelCompare(ref, pmv, sad);
812
813
    /* re-measure full pel rounded MVP with SAD as search start point */
814
0
    MV bmv = pmv.roundToFPel();
815
0
    int bcost = bprecost;
816
0
    if (pmv.isSubpel())
817
0
        bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2);
818
819
    // measure SAD cost at MV(0) if MVP is not zero
820
0
    if (pmv.notZero())
821
0
    {
822
0
        int cost = sad(fenc, FENC_STRIDE, fref, stride) + mvcost(MV(0, 0));
823
0
        if (cost < bcost)
824
0
        {
825
0
            bcost = cost;
826
0
            bmv = 0;
827
0
            bmv.y = X265_MAX(X265_MIN(0, mvmax.y), mvmin.y);
828
0
        }
829
0
    }
830
831
0
    X265_CHECK(!(ref->isLowres && numCandidates), "lowres motion candidates not allowed\n")
832
    // measure SAD cost at each QPEL motion vector candidate
833
0
    for (int i = 0; i < numCandidates; i++)
834
0
    {
835
0
        MV m = mvc[i].clipped(qmvmin, qmvmax);
836
0
        if (m.notZero() & (m != pmv ? 1 : 0) & (m != bestpre ? 1 : 0)) // check already measured
837
0
        {
838
0
            int cost = subpelCompare(ref, m, sad) + mvcost(m);
839
0
            if (cost < bprecost)
840
0
            {
841
0
                bprecost = cost;
842
0
                bestpre = m;
843
0
            }
844
0
        }
845
0
    }
846
847
0
    pmv = pmv.roundToFPel();
848
0
    MV omv = bmv;  // current search origin or starting point
849
850
0
    int search = ref->isHMELowres ? (hme ? searchMethodL0 : searchMethodL1) : searchMethod;
851
0
    switch (search)
852
0
    {
853
0
    case X265_DIA_SEARCH:
854
0
    {
855
        /* diamond search, radius 1 */
856
0
        bcost <<= 4;
857
0
        int i = merange;
858
0
        do
859
0
        {
860
0
            COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs);
861
0
            if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
862
0
                COPY1_IF_LT(bcost, (costs[0] << 4) + 1);
863
0
            if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
864
0
                COPY1_IF_LT(bcost, (costs[1] << 4) + 3);
865
0
            COPY1_IF_LT(bcost, (costs[2] << 4) + 4);
866
0
            COPY1_IF_LT(bcost, (costs[3] << 4) + 12);
867
0
            if (!(bcost & 15))
868
0
                break;
869
0
            bmv.x -= (bcost << 28) >> 30;
870
0
            bmv.y -= (bcost << 30) >> 30;
871
0
            bcost &= ~15;
872
0
        }
873
0
        while (--i && bmv.checkRange(mvmin, mvmax));
874
0
        bcost >>= 4;
875
0
        break;
876
0
    }
877
878
0
    case X265_HEX_SEARCH:
879
0
    {
880
0
me_hex2:
881
        /* hexagon search, radius 2 */
882
#if 0
883
        for (int i = 0; i < merange / 2; i++)
884
        {
885
            omv = bmv;
886
            COST_MV(omv.x - 2, omv.y);
887
            COST_MV(omv.x - 1, omv.y + 2);
888
            COST_MV(omv.x + 1, omv.y + 2);
889
            COST_MV(omv.x + 2, omv.y);
890
            COST_MV(omv.x + 1, omv.y - 2);
891
            COST_MV(omv.x - 1, omv.y - 2);
892
            if (omv == bmv)
893
                break;
894
            if (!bmv.checkRange(mvmin, mvmax))
895
                break;
896
        }
897
898
#else // if 0
899
      /* equivalent to the above, but eliminates duplicate candidates */
900
0
        COST_MV_X3_DIR(-2, 0, -1, 2,  1, 2, costs);
901
0
        bcost <<= 3;
902
0
        if ((bmv.y >= mvmin.y) & (bmv.y <= mvmax.y))
903
0
            COPY1_IF_LT(bcost, (costs[0] << 3) + 2);
904
0
        if ((bmv.y + 2 >= mvmin.y) & (bmv.y + 2 <= mvmax.y))
905
0
        {
906
0
            COPY1_IF_LT(bcost, (costs[1] << 3) + 3);
907
0
            COPY1_IF_LT(bcost, (costs[2] << 3) + 4);
908
0
        }
909
910
0
        COST_MV_X3_DIR(2, 0,  1, -2, -1, -2, costs);
911
0
        if ((bmv.y >= mvmin.y) & (bmv.y <= mvmax.y))
912
0
            COPY1_IF_LT(bcost, (costs[0] << 3) + 5);
913
0
        if ((bmv.y - 2 >= mvmin.y) & (bmv.y - 2 <= mvmax.y))
914
0
        {
915
0
            COPY1_IF_LT(bcost, (costs[1] << 3) + 6);
916
0
            COPY1_IF_LT(bcost, (costs[2] << 3) + 7);
917
0
        }
918
919
0
        if (bcost & 7)
920
0
        {
921
0
            int dir = (bcost & 7) - 2;
922
923
0
            if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y))
924
0
            {
925
0
                bmv += hex2[dir + 1];
926
927
                /* half hexagon, not overlapping the previous iteration */
928
0
                for (int i = (merange >> 1) - 1; i > 0 && bmv.checkRange(mvmin, mvmax); i--)
929
0
                {
930
0
                    COST_MV_X3_DIR(hex2[dir + 0].x, hex2[dir + 0].y,
931
0
                        hex2[dir + 1].x, hex2[dir + 1].y,
932
0
                        hex2[dir + 2].x, hex2[dir + 2].y,
933
0
                        costs);
934
0
                    bcost &= ~7;
935
936
0
                    if ((bmv.y + hex2[dir + 0].y >= mvmin.y) & (bmv.y + hex2[dir + 0].y <= mvmax.y))
937
0
                        COPY1_IF_LT(bcost, (costs[0] << 3) + 1);
938
939
0
                    if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y))
940
0
                        COPY1_IF_LT(bcost, (costs[1] << 3) + 2);
941
942
0
                    if ((bmv.y + hex2[dir + 2].y >= mvmin.y) & (bmv.y + hex2[dir + 2].y <= mvmax.y))
943
0
                        COPY1_IF_LT(bcost, (costs[2] << 3) + 3);
944
945
0
                    if (!(bcost & 7))
946
0
                        break;
947
948
0
                    dir += (bcost & 7) - 2;
949
0
                    dir = mod6m1[dir + 1];
950
0
                    bmv += hex2[dir + 1];
951
0
                }
952
0
            } // if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y))
953
0
        }
954
0
        bcost >>= 3;
955
0
#endif // if 0
956
957
        /* square refine */
958
0
        int dir = 0;
959
0
        COST_MV_X4_DIR(0, -1,  0, 1, -1, 0, 1, 0, costs);
960
0
        if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
961
0
            COPY2_IF_LT(bcost, costs[0], dir, 1);
962
0
        if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
963
0
            COPY2_IF_LT(bcost, costs[1], dir, 2);
964
0
        COPY2_IF_LT(bcost, costs[2], dir, 3);
965
0
        COPY2_IF_LT(bcost, costs[3], dir, 4);
966
0
        COST_MV_X4_DIR(-1, -1, -1, 1, 1, -1, 1, 1, costs);
967
0
        if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
968
0
            COPY2_IF_LT(bcost, costs[0], dir, 5);
969
0
        if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
970
0
            COPY2_IF_LT(bcost, costs[1], dir, 6);
971
0
        if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
972
0
            COPY2_IF_LT(bcost, costs[2], dir, 7);
973
0
        if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
974
0
            COPY2_IF_LT(bcost, costs[3], dir, 8);
975
0
        bmv += square1[dir];
976
0
        break;
977
0
    }
978
979
0
    case X265_UMH_SEARCH:
980
0
    {
981
0
        int ucost1, ucost2;
982
0
        int16_t cross_start = 1;
983
984
        /* refine predictors */
985
0
        omv = bmv;
986
0
        ucost1 = bcost;
987
0
        X265_CHECK(((pmv.y >= mvmin.y) & (pmv.y <= mvmax.y)), "pmv outside of search range!");
988
0
        DIA1_ITER(pmv.x, pmv.y);
989
0
        if (pmv.notZero())
990
0
            DIA1_ITER(0, 0);
991
992
0
        ucost2 = bcost;
993
0
        if (bmv.notZero() && bmv != pmv)
994
0
            DIA1_ITER(bmv.x, bmv.y);
995
0
        if (bcost == ucost2)
996
0
            cross_start = 3;
997
998
        /* Early Termination */
999
0
        omv = bmv;
1000
0
        if (bcost == ucost2 && SAD_THRESH(2000))
1001
0
        {
1002
0
            COST_MV_X4(0, -2, -1, -1, 1, -1, -2, 0);
1003
0
            COST_MV_X4(2, 0, -1, 1, 1, 1,  0, 2);
1004
0
            if (bcost == ucost1 && SAD_THRESH(500))
1005
0
                break;
1006
0
            if (bcost == ucost2)
1007
0
            {
1008
0
                int16_t range = (int16_t)(merange >> 1) | 1;
1009
0
                CROSS(3, range, range);
1010
0
                COST_MV_X4(-1, -2, 1, -2, -2, -1, 2, -1);
1011
0
                COST_MV_X4(-2, 1, 2, 1, -1, 2, 1, 2);
1012
0
                if (bcost == ucost2)
1013
0
                    break;
1014
0
                cross_start = range + 2;
1015
0
            }
1016
0
        }
1017
1018
        // TODO: Need to study x264's logic for building mvc list to understand why they
1019
        //       have special cases here for 16x16, and whether they apply to HEVC CTU
1020
1021
        // adaptive search range based on mvc variability
1022
0
        if (numCandidates)
1023
0
        {
1024
            /* range multipliers based on casual inspection of some statistics of
1025
             * average distance between current predictor and final mv found by ESA.
1026
             * these have not been tuned much by actual encoding. */
1027
0
            static const uint8_t range_mul[4][4] =
1028
0
            {
1029
0
                { 3, 3, 4, 4 },
1030
0
                { 3, 4, 4, 4 },
1031
0
                { 4, 4, 4, 5 },
1032
0
                { 4, 4, 5, 6 },
1033
0
            };
1034
1035
0
            int mvd;
1036
0
            int sad_ctx, mvd_ctx;
1037
0
            int denom = 1;
1038
1039
0
            if (numCandidates == 1)
1040
0
            {
1041
0
                if (LUMA_64x64 == partEnum)
1042
                    /* mvc is probably the same as mvp, so the difference isn't meaningful.
1043
                     * but prediction usually isn't too bad, so just use medium range */
1044
0
                    mvd = 25;
1045
0
                else
1046
0
                    mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);
1047
0
            }
1048
0
            else
1049
0
            {
1050
                /* calculate the degree of agreement between predictors. */
1051
1052
                /* in 64x64, mvc includes all the neighbors used to make mvp,
1053
                 * so don't count mvp separately. */
1054
1055
0
                denom = numCandidates - 1;
1056
0
                mvd = 0;
1057
0
                if (partEnum != LUMA_64x64)
1058
0
                {
1059
0
                    mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);
1060
0
                    denom++;
1061
0
                }
1062
0
                mvd += predictorDifference(mvc, numCandidates);
1063
0
            }
1064
1065
0
            sad_ctx = SAD_THRESH(1000) ? 0
1066
0
                : SAD_THRESH(2000) ? 1
1067
0
                : SAD_THRESH(4000) ? 2 : 3;
1068
0
            mvd_ctx = mvd < 10 * denom ? 0
1069
0
                : mvd < 20 * denom ? 1
1070
0
                : mvd < 40 * denom ? 2 : 3;
1071
1072
0
            merange = (merange * range_mul[mvd_ctx][sad_ctx]) >> 2;
1073
0
        }
1074
1075
        /* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy.
1076
         * we are still centered on the same place as the DIA2. is this desirable? */
1077
0
        CROSS(cross_start, merange, merange >> 1);
1078
0
        COST_MV_X4(-2, -2, -2, 2, 2, -2, 2, 2);
1079
1080
        /* hexagon grid */
1081
0
        omv = bmv;
1082
0
        const uint16_t *p_cost_omvx = m_cost_mvx + omv.x * 4;
1083
0
        const uint16_t *p_cost_omvy = m_cost_mvy + omv.y * 4;
1084
0
        uint16_t i = 1;
1085
0
        do
1086
0
        {
1087
0
            if (4 * i > X265_MIN4(mvmax.x - omv.x, omv.x - mvmin.x,
1088
0
                                  mvmax.y - omv.y, omv.y - mvmin.y))
1089
0
            {
1090
0
                for (int j = 0; j < 16; j++)
1091
0
                {
1092
0
                    MV mv = omv + (hex4[j] * i);
1093
0
                    if (mv.checkRange(mvmin, mvmax))
1094
0
                        COST_MV(mv.x, mv.y);
1095
0
                }
1096
0
            }
1097
0
            else
1098
0
            {
1099
0
                int16_t dir = 0;
1100
0
                pixel *fref_base = fref + omv.x + (omv.y - 4 * i) * stride;
1101
0
                size_t dy = (size_t)i * stride;
1102
0
#define SADS(k, x0, y0, x1, y1, x2, y2, x3, y3) \
1103
0
    sad_x4(fenc, \
1104
0
           fref_base x0 * i + (y0 - 2 * k + 4) * dy, \
1105
0
           fref_base x1 * i + (y1 - 2 * k + 4) * dy, \
1106
0
           fref_base x2 * i + (y2 - 2 * k + 4) * dy, \
1107
0
           fref_base x3 * i + (y3 - 2 * k + 4) * dy, \
1108
0
           stride, costs + 4 * k); \
1109
0
    fref_base += 2 * dy;
1110
0
#define ADD_MVCOST(k, x, y) costs[k] += p_cost_omvx[x * 4 * i] + p_cost_omvy[y * 4 * i]
1111
0
#define MIN_MV(k, dx, dy)     if ((omv.y + (dy) >= mvmin.y) & (omv.y + (dy) <= mvmax.y)) { COPY2_IF_LT(bcost, costs[k], dir, dx * 16 + (dy & 15)) }
1112
1113
0
                SADS(0, +0, -4, +0, +4, -2, -3, +2, -3);
1114
0
                SADS(1, -4, -2, +4, -2, -4, -1, +4, -1);
1115
0
                SADS(2, -4, +0, +4, +0, -4, +1, +4, +1);
1116
0
                SADS(3, -4, +2, +4, +2, -2, +3, +2, +3);
1117
0
                ADD_MVCOST(0, 0, -4);
1118
0
                ADD_MVCOST(1, 0, 4);
1119
0
                ADD_MVCOST(2, -2, -3);
1120
0
                ADD_MVCOST(3, 2, -3);
1121
0
                ADD_MVCOST(4, -4, -2);
1122
0
                ADD_MVCOST(5, 4, -2);
1123
0
                ADD_MVCOST(6, -4, -1);
1124
0
                ADD_MVCOST(7, 4, -1);
1125
0
                ADD_MVCOST(8, -4, 0);
1126
0
                ADD_MVCOST(9, 4, 0);
1127
0
                ADD_MVCOST(10, -4, 1);
1128
0
                ADD_MVCOST(11, 4, 1);
1129
0
                ADD_MVCOST(12, -4, 2);
1130
0
                ADD_MVCOST(13, 4, 2);
1131
0
                ADD_MVCOST(14, -2, 3);
1132
0
                ADD_MVCOST(15, 2, 3);
1133
0
                MIN_MV(0, 0, -4);
1134
0
                MIN_MV(1, 0, 4);
1135
0
                MIN_MV(2, -2, -3);
1136
0
                MIN_MV(3, 2, -3);
1137
0
                MIN_MV(4, -4, -2);
1138
0
                MIN_MV(5, 4, -2);
1139
0
                MIN_MV(6, -4, -1);
1140
0
                MIN_MV(7, 4, -1);
1141
0
                MIN_MV(8, -4, 0);
1142
0
                MIN_MV(9, 4, 0);
1143
0
                MIN_MV(10, -4, 1);
1144
0
                MIN_MV(11, 4, 1);
1145
0
                MIN_MV(12, -4, 2);
1146
0
                MIN_MV(13, 4, 2);
1147
0
                MIN_MV(14, -2, 3);
1148
0
                MIN_MV(15, 2, 3);
1149
0
#undef SADS
1150
0
#undef ADD_MVCOST
1151
0
#undef MIN_MV
1152
0
                if (dir)
1153
0
                {
1154
0
                    bmv.x = omv.x + i * (dir >> 4);
1155
0
                    bmv.y = omv.y + i * ((dir << 28) >> 28);
1156
0
                }
1157
0
            }
1158
0
        }
1159
0
        while (++i <= merange >> 2);
1160
0
        if (bmv.checkRange(mvmin, mvmax))
1161
0
            goto me_hex2;
1162
0
        break;
1163
0
    }
1164
1165
0
    case X265_STAR_SEARCH: // Adapted from HM ME
1166
0
    {
1167
0
        int bPointNr = 0;
1168
0
        int bDistance = 0;
1169
1170
0
        const int EarlyExitIters = 3;
1171
0
        StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, EarlyExitIters, merange, hme);
1172
0
        if (bDistance == 1)
1173
0
        {
1174
            // if best distance was only 1, check two missing points.  If no new point is found, stop
1175
0
            if (bPointNr)
1176
0
            {
1177
                /* For a given direction 1 to 8, check nearest two outer X pixels
1178
                     X   X
1179
                   X 1 2 3 X
1180
                     4 * 5
1181
                   X 6 7 8 X
1182
                     X   X
1183
                */
1184
0
                int saved = bcost;
1185
0
                const MV mv1 = bmv + offsets[(bPointNr - 1) * 2];
1186
0
                const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1];
1187
0
                if (mv1.checkRange(mvmin, mvmax))
1188
0
                {
1189
0
                    COST_MV(mv1.x, mv1.y);
1190
0
                }
1191
0
                if (mv2.checkRange(mvmin, mvmax))
1192
0
                {
1193
0
                    COST_MV(mv2.x, mv2.y);
1194
0
                }
1195
0
                if (bcost == saved)
1196
0
                    break;
1197
0
            }
1198
0
            else
1199
0
                break;
1200
0
        }
1201
1202
0
        const int RasterDistance = 5;
1203
0
        if (bDistance > RasterDistance)
1204
0
        {
1205
            // raster search refinement if original search distance was too big
1206
0
            MV tmv;
1207
0
            for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y += RasterDistance)
1208
0
            {
1209
0
                for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x += RasterDistance)
1210
0
                {
1211
0
                    if (tmv.x + (RasterDistance * 3) <= mvmax.x)
1212
0
                    {
1213
0
                        pixel *pix_base = fref + tmv.y * stride + tmv.x;
1214
0
                        sad_x4(fenc,
1215
0
                               pix_base,
1216
0
                               pix_base + RasterDistance,
1217
0
                               pix_base + RasterDistance * 2,
1218
0
                               pix_base + RasterDistance * 3,
1219
0
                               stride, costs);
1220
0
                        costs[0] += mvcost(tmv << 2);
1221
0
                        COPY2_IF_LT(bcost, costs[0], bmv, tmv);
1222
0
                        tmv.x += RasterDistance;
1223
0
                        costs[1] += mvcost(tmv << 2);
1224
0
                        COPY2_IF_LT(bcost, costs[1], bmv, tmv);
1225
0
                        tmv.x += RasterDistance;
1226
0
                        costs[2] += mvcost(tmv << 2);
1227
0
                        COPY2_IF_LT(bcost, costs[2], bmv, tmv);
1228
0
                        tmv.x += RasterDistance;
1229
0
                        costs[3] += mvcost(tmv << 3);
1230
0
                        COPY2_IF_LT(bcost, costs[3], bmv, tmv);
1231
0
                    }
1232
0
                    else
1233
0
                        COST_MV(tmv.x, tmv.y);
1234
0
                }
1235
0
            }
1236
0
        }
1237
1238
0
        while (bDistance > 0)
1239
0
        {
1240
            // center a new search around current best
1241
0
            bDistance = 0;
1242
0
            bPointNr = 0;
1243
0
            const int MaxIters = 32;
1244
0
            StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, MaxIters, merange, hme);
1245
1246
0
            if (bDistance == 1)
1247
0
            {
1248
0
                if (!bPointNr)
1249
0
                    break;
1250
1251
                /* For a given direction 1 to 8, check nearest 2 outer X pixels
1252
                        X   X
1253
                    X 1 2 3 X
1254
                        4 * 5
1255
                    X 6 7 8 X
1256
                        X   X
1257
                */
1258
0
                const MV mv1 = bmv + offsets[(bPointNr - 1) * 2];
1259
0
                const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1];
1260
0
                if (mv1.checkRange(mvmin, mvmax))
1261
0
                {
1262
0
                    COST_MV(mv1.x, mv1.y);
1263
0
                }
1264
0
                if (mv2.checkRange(mvmin, mvmax))
1265
0
                {
1266
0
                    COST_MV(mv2.x, mv2.y);
1267
0
                }
1268
0
                break;
1269
0
            }
1270
0
        }
1271
1272
0
        break;
1273
0
    }
1274
1275
0
    case X265_SEA:
1276
0
    {
1277
        // Successive Elimination Algorithm
1278
0
        const int32_t minX = X265_MAX(omv.x - (int32_t)merange, mvmin.x);
1279
0
        const int32_t minY = X265_MAX(omv.y - (int32_t)merange, mvmin.y);
1280
0
        const int32_t maxX = X265_MIN(omv.x + (int32_t)merange, mvmax.x);
1281
0
        const int32_t maxY = X265_MIN(omv.y + (int32_t)merange, mvmax.y);
1282
0
        const uint16_t *p_cost_mvx = m_cost_mvx - qmvp.x;
1283
0
        const uint16_t *p_cost_mvy = m_cost_mvy - qmvp.y;
1284
0
        int16_t* meScratchBuffer = NULL;
1285
0
        int scratchSize = merange * 2 + 4;
1286
0
        if (scratchSize)
1287
0
        {
1288
0
            meScratchBuffer = X265_MALLOC(int16_t, scratchSize);
1289
0
            memset(meScratchBuffer, 0, sizeof(int16_t)* scratchSize);
1290
0
        }
1291
1292
        /* SEA is fastest in multiples of 4 */
1293
0
        int meRangeWidth = (maxX - minX + 3) & ~3;
1294
0
        int w = 0, h = 0;                    // Width and height of the PU
1295
0
        ALIGN_VAR_32(pixel, zero[64 * FENC_STRIDE]) = { 0 };
1296
0
        ALIGN_VAR_32(int, encDC[4]);
1297
0
        uint16_t *fpelCostMvX = m_fpelMvCosts[-qmvp.x & 3] + (-qmvp.x >> 2);
1298
0
        sizesFromPartition(partEnum, &w, &h);
1299
0
        int deltaX = (w <= 8) ? (w) : (w >> 1);
1300
0
        int deltaY = (h <= 8) ? (h) : (h >> 1);
1301
1302
        /* Check if very small rectangular blocks which cannot be sub-divided anymore */
1303
0
        bool smallRectPartition = partEnum == LUMA_4x4 || partEnum == LUMA_16x12 ||
1304
0
            partEnum == LUMA_12x16 || partEnum == LUMA_16x4 || partEnum == LUMA_4x16;
1305
        /* Check if vertical partition */
1306
0
        bool verticalRect = partEnum == LUMA_32x64 || partEnum == LUMA_16x32 || partEnum == LUMA_8x16 ||
1307
0
            partEnum == LUMA_4x8;
1308
        /* Check if horizontal partition */
1309
0
        bool horizontalRect = partEnum == LUMA_64x32 || partEnum == LUMA_32x16 || partEnum == LUMA_16x8 ||
1310
0
            partEnum == LUMA_8x4;
1311
        /* Check if assymetric vertical partition */
1312
0
        bool assymetricVertical = partEnum == LUMA_12x16 || partEnum == LUMA_4x16 || partEnum == LUMA_24x32 ||
1313
0
            partEnum == LUMA_8x32 || partEnum == LUMA_48x64 || partEnum == LUMA_16x64;
1314
        /* Check if assymetric horizontal partition */
1315
0
        bool assymetricHorizontal = partEnum == LUMA_16x12 || partEnum == LUMA_16x4 || partEnum == LUMA_32x24 ||
1316
0
            partEnum == LUMA_32x8 || partEnum == LUMA_64x48 || partEnum == LUMA_64x16;
1317
1318
0
        int tempPartEnum = 0;
1319
1320
        /* If a vertical rectangular partition, it is horizontally split into two, for ads_x2() */
1321
0
        if (verticalRect)
1322
0
            tempPartEnum = partitionFromSizes(w, h >> 1);
1323
        /* If a horizontal rectangular partition, it is vertically split into two, for ads_x2() */
1324
0
        else if (horizontalRect)
1325
0
            tempPartEnum = partitionFromSizes(w >> 1, h);
1326
        /* We have integral planes introduced to account for assymetric partitions.
1327
         * Hence all assymetric partitions except those which cannot be split into legal sizes,
1328
         * are split into four for ads_x4() */
1329
0
        else if (assymetricVertical || assymetricHorizontal)
1330
0
            tempPartEnum = smallRectPartition ? partEnum : partitionFromSizes(w >> 1, h >> 1);
1331
        /* General case: Square partitions. All partitions with width > 8 are split into four
1332
         * for ads_x4(), for 4x4 and 8x8 we do ads_x1() */
1333
0
        else
1334
0
            tempPartEnum = (w <= 8) ? partEnum : partitionFromSizes(w >> 1, h >> 1);
1335
1336
        /* Successive elimination by comparing DC before a full SAD,
1337
         * because sum(abs(diff)) >= abs(diff(sum)). */
1338
0
        primitives.pu[tempPartEnum].sad_x4(zero,
1339
0
                         fenc,
1340
0
                         fenc + deltaX,
1341
0
                         fenc + deltaY * FENC_STRIDE,
1342
0
                         fenc + deltaX + deltaY * FENC_STRIDE,
1343
0
                         FENC_STRIDE,
1344
0
                         encDC);
1345
1346
        /* Assigning appropriate integral plane */
1347
0
        uint32_t *sumsBase = NULL;
1348
0
        switch (deltaX)
1349
0
        {
1350
0
            case 32: if (deltaY % 24 == 0)
1351
0
                         sumsBase = integral[1];
1352
0
                     else if (deltaY == 8)
1353
0
                         sumsBase = integral[2];
1354
0
                     else
1355
0
                         sumsBase = integral[0];
1356
0
               break;
1357
0
            case 24: sumsBase = integral[3];
1358
0
               break;
1359
0
            case 16: if (deltaY % 12 == 0)
1360
0
                         sumsBase = integral[5];
1361
0
                     else if (deltaY == 4)
1362
0
                         sumsBase = integral[6];
1363
0
                     else
1364
0
                         sumsBase = integral[4];
1365
0
               break;
1366
0
            case 12: sumsBase = integral[7];
1367
0
                break;
1368
0
            case 8: if (deltaY == 32)
1369
0
                        sumsBase = integral[8];
1370
0
                    else
1371
0
                        sumsBase = integral[9];
1372
0
                break;
1373
0
            case 4: if (deltaY == 16)
1374
0
                        sumsBase = integral[10];
1375
0
                    else
1376
0
                        sumsBase = integral[11];
1377
0
                break;
1378
0
            default: sumsBase = integral[11];
1379
0
                break;
1380
0
        }
1381
1382
0
        if (partEnum == LUMA_64x64 || partEnum == LUMA_32x32 || partEnum == LUMA_16x16 ||
1383
0
            partEnum == LUMA_32x64 || partEnum == LUMA_16x32 || partEnum == LUMA_8x16 ||
1384
0
            partEnum == LUMA_4x8 || partEnum == LUMA_12x16 || partEnum == LUMA_4x16 ||
1385
0
            partEnum == LUMA_24x32 || partEnum == LUMA_8x32 || partEnum == LUMA_48x64 ||
1386
0
            partEnum == LUMA_16x64)
1387
0
            deltaY *= (int)stride;
1388
1389
0
        if (verticalRect)
1390
0
            encDC[1] = encDC[2];
1391
1392
0
        if (horizontalRect)
1393
0
            deltaY = deltaX;
1394
1395
        /* ADS and SAD */
1396
0
        MV tmv;
1397
0
        for (tmv.y = minY; tmv.y <= maxY; tmv.y++)
1398
0
        {
1399
0
            int i, xn;
1400
0
            int ycost = p_cost_mvy[tmv.y] << 2;
1401
0
            if (bcost <= ycost)
1402
0
                continue;
1403
0
            bcost -= ycost;
1404
1405
            /* ADS_4 for 16x16, 32x32, 64x64, 24x32, 32x24, 48x64, 64x48, 32x8, 8x32, 64x16, 16x64 partitions
1406
             * ADS_1 for 4x4, 8x8, 16x4, 4x16, 16x12, 12x16 partitions
1407
             * ADS_2 for all other rectangular partitions */
1408
0
            xn = ads(encDC,
1409
0
                    sumsBase + minX + tmv.y * stride,
1410
0
                    deltaY,
1411
0
                    fpelCostMvX + minX,
1412
0
                    meScratchBuffer,
1413
0
                    meRangeWidth,
1414
0
                    bcost);
1415
1416
0
            for (i = 0; i < xn - 2; i += 3)
1417
0
                COST_MV_X3_ABS(minX + meScratchBuffer[i], tmv.y,
1418
0
                             minX + meScratchBuffer[i + 1], tmv.y,
1419
0
                             minX + meScratchBuffer[i + 2], tmv.y);
1420
1421
0
            bcost += ycost;
1422
0
            for (; i < xn; i++)
1423
0
                COST_MV(minX + meScratchBuffer[i], tmv.y);
1424
0
        }
1425
0
        if (meScratchBuffer)
1426
0
            x265_free(meScratchBuffer);
1427
0
        break;
1428
0
    }
1429
1430
0
    case X265_FULL_SEARCH:
1431
0
    {
1432
        // dead slow exhaustive search, but at least it uses sad_x4()
1433
0
        MV tmv;
1434
0
        int32_t mvmin_y = mvmin.y, mvmin_x = mvmin.x, mvmax_y = mvmax.y, mvmax_x = mvmax.x;
1435
0
        if (ref->isHMELowres)
1436
0
        {
1437
0
            merange = (merange < 0 ? -merange : merange);
1438
0
            mvmin_y = X265_MAX(mvmin.y, -merange);
1439
0
            mvmin_x = X265_MAX(mvmin.x, -merange);
1440
0
            mvmax_y = X265_MIN(mvmax.y, merange);
1441
0
            mvmax_x = X265_MIN(mvmax.x, merange);
1442
0
        }
1443
0
        for (tmv.y = mvmin_y; tmv.y <= mvmax_y; tmv.y++)
1444
0
        {
1445
0
            for (tmv.x = mvmin_x; tmv.x <= mvmax_x; tmv.x++)
1446
0
            {
1447
0
                if (tmv.x + 3 <= mvmax_x)
1448
0
                {
1449
0
                    pixel *pix_base = fref + tmv.y * stride + tmv.x;
1450
0
                    sad_x4(fenc,
1451
0
                           pix_base,
1452
0
                           pix_base + 1,
1453
0
                           pix_base + 2,
1454
0
                           pix_base + 3,
1455
0
                           stride, costs);
1456
0
                    costs[0] += mvcost(tmv << 2);
1457
0
                    COPY2_IF_LT(bcost, costs[0], bmv, tmv);
1458
0
                    tmv.x++;
1459
0
                    costs[1] += mvcost(tmv << 2);
1460
0
                    COPY2_IF_LT(bcost, costs[1], bmv, tmv);
1461
0
                    tmv.x++;
1462
0
                    costs[2] += mvcost(tmv << 2);
1463
0
                    COPY2_IF_LT(bcost, costs[2], bmv, tmv);
1464
0
                    tmv.x++;
1465
0
                    costs[3] += mvcost(tmv << 2);
1466
0
                    COPY2_IF_LT(bcost, costs[3], bmv, tmv);
1467
0
                }
1468
0
                else
1469
0
                    COST_MV(tmv.x, tmv.y);
1470
0
            }
1471
0
        }
1472
1473
0
        break;
1474
0
    }
1475
1476
0
    default:
1477
0
        X265_CHECK(0, "invalid motion estimate mode\n");
1478
0
        break;
1479
0
    }
1480
1481
0
    if (bprecost < bcost)
1482
0
    {
1483
0
        bmv = bestpre;
1484
0
        bcost = bprecost;
1485
0
    }
1486
0
    else
1487
0
        bmv = bmv.toQPel(); // promote search bmv to qpel
1488
1489
0
    const SubpelWorkload& wl = workload[this->subpelRefine];
1490
1491
    // check mv range for slice bound
1492
0
    if ((maxSlices > 1) & ((bmv.y < qmvmin.y) | (bmv.y > qmvmax.y)))
1493
0
    {
1494
0
        bmv.y = x265_min(x265_max(bmv.y, qmvmin.y), qmvmax.y);
1495
0
        bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
1496
0
    }
1497
1498
0
    if (!bcost)
1499
0
    {
1500
        /* if there was zero residual at the clipped MVP, we can skip subpel
1501
         * refine, but we do need to include the mvcost in the returned cost */
1502
0
        bcost = mvcost(bmv);
1503
0
    }
1504
0
    else if (ref->isLowres)
1505
0
    {
1506
0
        int bdir = 0;
1507
0
        for (int i = 1; i <= wl.hpel_dirs; i++)
1508
0
        {
1509
0
            MV qmv = bmv + square1[i] * 2;
1510
1511
            /* skip invalid range */
1512
0
            if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
1513
0
                continue;
1514
1515
0
            int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad, hme) + mvcost(qmv);
1516
0
            COPY2_IF_LT(bcost, cost, bdir, i);
1517
0
        }
1518
1519
0
        bmv += square1[bdir] * 2;
1520
0
        bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd, hme) + mvcost(bmv);
1521
1522
0
        bdir = 0;
1523
0
        for (int i = 1; i <= wl.qpel_dirs; i++)
1524
0
        {
1525
0
            MV qmv = bmv + square1[i];
1526
1527
            /* skip invalid range */
1528
0
            if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
1529
0
                continue;
1530
1531
0
            int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd, hme) + mvcost(qmv);
1532
0
            COPY2_IF_LT(bcost, cost, bdir, i);
1533
0
        }
1534
1535
0
        bmv += square1[bdir];
1536
0
    }
1537
0
    else
1538
0
    {
1539
0
        pixelcmp_t hpelcomp;
1540
1541
0
        if (wl.hpel_satd)
1542
0
        {
1543
0
            bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
1544
0
            hpelcomp = satd;
1545
0
        }
1546
0
        else
1547
0
            hpelcomp = sad;
1548
1549
0
        for (int iter = 0; iter < wl.hpel_iters; iter++)
1550
0
        {
1551
0
            int bdir = 0;
1552
0
            for (int i = 1; i <= wl.hpel_dirs; i++)
1553
0
            {
1554
0
                MV qmv = bmv + square1[i] * 2;
1555
1556
                // check mv range for slice bound
1557
0
                if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
1558
0
                    continue;
1559
1560
0
                int cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);
1561
0
                COPY2_IF_LT(bcost, cost, bdir, i);
1562
0
            }
1563
1564
0
            if (bdir)
1565
0
                bmv += square1[bdir] * 2;
1566
0
            else
1567
0
                break;
1568
0
        }
1569
1570
        /* if HPEL search used SAD, remeasure with SATD before QPEL */
1571
0
        if (!wl.hpel_satd)
1572
0
            bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
1573
1574
0
        for (int iter = 0; iter < wl.qpel_iters; iter++)
1575
0
        {
1576
0
            int bdir = 0;
1577
0
            for (int i = 1; i <= wl.qpel_dirs; i++)
1578
0
            {
1579
0
                MV qmv = bmv + square1[i];
1580
1581
                // check mv range for slice bound
1582
0
                if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
1583
0
                    continue;
1584
1585
0
                int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
1586
0
                COPY2_IF_LT(bcost, cost, bdir, i);
1587
0
            }
1588
1589
0
            if (bdir)
1590
0
                bmv += square1[bdir];
1591
0
            else
1592
0
                break;
1593
0
        }
1594
0
    }
1595
1596
    // check mv range for slice bound
1597
0
    X265_CHECK(((bmv.y >= qmvmin.y) & (bmv.y <= qmvmax.y)), "mv beyond range!");
1598
1599
    // Get a chance to ZeroMv
1600
0
    if (bmv.notZero())
1601
0
    {
1602
0
      int cost = subpelCompare(ref, MV(0, 0), satd) + mvcost(MV(0, 0));
1603
0
      if (cost <= bcost)
1604
0
        bmv = MV(0, 0);
1605
0
    }
1606
1607
0
    x265_emms();
1608
0
    outQMv = bmv;
1609
0
    return bcost;
1610
0
}
1611
1612
/* Measure the distortion cost of one candidate quarter-pel motion vector.
 *
 * Luma cost is computed by the caller-supplied comparator `cmp` (SAD or SATD)
 * between the fenc PU (stride FENC_STRIDE) and the reference block at `qmv`;
 * fractional positions are interpolated into a local buffer first.  When
 * bChromaSATD is set, chroma SATD for both Cb and Cr is added on top.
 *
 * \param ref  reference planes (full-pel luma plane plus recon chroma planes)
 * \param qmv  candidate motion vector in quarter-pel units
 * \param cmp  luma comparison primitive to apply
 * \return     luma cost, plus chroma SATD cost when bChromaSATD is enabled
 */
int MotionEstimate::subpelCompare(ReferencePlanes *ref, const MV& qmv, pixelcmp_t cmp)
{
    intptr_t refStride = ref->lumaStride;
    /* integer part of the quarter-pel MV addresses the full-pel luma plane */
    const pixel* fref = ref->fpelPlane[0] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * refStride;
    /* low two bits are the quarter-pel fractional offsets */
    int xFrac = qmv.x & 0x3;
    int yFrac = qmv.y & 0x3;
    int cost;
    const intptr_t fencStride = FENC_STRIDE;
    X265_CHECK(fencPUYuv.m_size == FENC_STRIDE, "fenc buffer is assumed to have FENC_STRIDE by sad_x3 and sad_x4\n");

    /* scratch buffer for interpolated reference pixels (luma, then reused for
     * each chroma plane); stride equals the block width when used */
    ALIGN_VAR_32(pixel, subpelbuf[MAX_CU_SIZE * MAX_CU_SIZE]);

    if (!(yFrac | xFrac))
        /* full-pel position: compare directly against the reference plane */
        cost = cmp(fencPUYuv.m_buf[0], fencStride, fref, refStride);
    else
    {
        /* we are taking a short-cut here if the reference is weighted. To be
         * accurate we should be interpolating unweighted pixels and weighting
         * the final 16bit values prior to rounding and down shifting. Instead we
         * are simply interpolating the weighted full-pel pixels. Not 100%
         * accurate but good enough for fast qpel ME */
        if (!yFrac)
            primitives.pu[partEnum].luma_hpp(fref, refStride, subpelbuf, blockwidth, xFrac);
        else if (!xFrac)
            primitives.pu[partEnum].luma_vpp(fref, refStride, subpelbuf, blockwidth, yFrac);
        else
            /* both fractions non-zero: combined horizontal+vertical filter */
            primitives.pu[partEnum].luma_hvpp(fref, refStride, subpelbuf, blockwidth, xFrac, yFrac);
        cost = cmp(fencPUYuv.m_buf[0], fencStride, subpelbuf, blockwidth);
    }

    if (bChromaSATD)
    {
        int csp    = fencPUYuv.m_csp;
        int hshift = fencPUYuv.m_hChromaShift;
        int vshift = fencPUYuv.m_vChromaShift;
        /* convert the luma quarter-pel MV into chroma units: each subsampled
         * dimension doubles the fractional resolution (eighth-pel for 4:2:0) */
        int mvx = qmv.x << (1 - hshift);
        int mvy = qmv.y << (1 - vshift);
        intptr_t fencStrideC = fencPUYuv.m_csize;

        intptr_t refStrideC = ref->reconPic->m_strideC;
        /* integer part (>> 3) of the chroma MV addresses the recon chroma planes */
        intptr_t refOffset = (mvx >> 3) + (mvy >> 3) * refStrideC;

        const pixel* refCb = ref->getCbAddr(ctuAddr, absPartIdx) + refOffset;
        const pixel* refCr = ref->getCrAddr(ctuAddr, absPartIdx) + refOffset;

        X265_CHECK((hshift == 0) || (hshift == 1), "hshift must be 0 or 1\n");
        X265_CHECK((vshift == 0) || (vshift == 1), "vshift must be 0 or 1\n");

        /* low three bits are the eighth-pel chroma fractional offsets */
        xFrac = mvx & 7;
        yFrac = mvy & 7;

        if (!(yFrac | xFrac))
        {
            /* full-pel chroma position: compare directly against recon planes */
            cost += chromaSatd(fencPUYuv.m_buf[1], fencStrideC, refCb, refStrideC);
            cost += chromaSatd(fencPUYuv.m_buf[2], fencStrideC, refCr, refStrideC);
        }
        else
        {
            int blockwidthC = blockwidth >> hshift;

            if (!yFrac)
            {
                primitives.chroma[csp].pu[partEnum].filter_hpp(refCb, refStrideC, subpelbuf, blockwidthC, xFrac);
                cost += chromaSatd(fencPUYuv.m_buf[1], fencStrideC, subpelbuf, blockwidthC);

                primitives.chroma[csp].pu[partEnum].filter_hpp(refCr, refStrideC, subpelbuf, blockwidthC, xFrac);
                cost += chromaSatd(fencPUYuv.m_buf[2], fencStrideC, subpelbuf, blockwidthC);
            }
            else if (!xFrac)
            {
                primitives.chroma[csp].pu[partEnum].filter_vpp(refCb, refStrideC, subpelbuf, blockwidthC, yFrac);
                cost += chromaSatd(fencPUYuv.m_buf[1], fencStrideC, subpelbuf, blockwidthC);

                primitives.chroma[csp].pu[partEnum].filter_vpp(refCr, refStrideC, subpelbuf, blockwidthC, yFrac);
                cost += chromaSatd(fencPUYuv.m_buf[2], fencStrideC, subpelbuf, blockwidthC);
            }
            else
            {
                /* both fractions non-zero: two-stage filter through a 16-bit
                 * intermediate (horizontal pass keeps full precision, vertical
                 * pass rounds back down to pixels) */
                ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA - 1)]);
                const int halfFilterSize = (NTAPS_CHROMA >> 1);

                /* the hps call (isRowExt = 1) produces extra rows above the
                 * block for the vertical filter taps; skip down to the first
                 * row the vsp pass should start from */
                primitives.chroma[csp].pu[partEnum].filter_hps(refCb, refStrideC, immed, blockwidthC, xFrac, 1);
                primitives.chroma[csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * blockwidthC, blockwidthC, subpelbuf, blockwidthC, yFrac);
                cost += chromaSatd(fencPUYuv.m_buf[1], fencStrideC, subpelbuf, blockwidthC);

                primitives.chroma[csp].pu[partEnum].filter_hps(refCr, refStrideC, immed, blockwidthC, xFrac, 1);
                primitives.chroma[csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * blockwidthC, blockwidthC, subpelbuf, blockwidthC, yFrac);
                cost += chromaSatd(fencPUYuv.m_buf[2], fencStrideC, subpelbuf, blockwidthC);
            }
        }
    }

    return cost;
}