Coverage Report

Created: 2026-05-30 06:10

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/x265/source/encoder/motion.cpp
Line
Count
Source
1
/*****************************************************************************
2
 * Copyright (C) 2013-2020 MulticoreWare, Inc
3
 *
4
 * Authors: Steve Borho <steve@borho.org>
5
 *          Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
 * the Free Software Foundation; either version 2 of the License, or
10
 * (at your option) any later version.
11
 *
12
 * This program is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 * GNU General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU General Public License
18
 * along with this program; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
 *
21
 * This program is also available under a commercial proprietary license.
22
 * For more information, contact us at license @ x265.com.
23
 *****************************************************************************/
24
25
#include "common.h"
26
#include "primitives.h"
27
#include "lowres.h"
28
#include "motion.h"
29
#include "x265.h"
30
31
#if _MSC_VER
32
#pragma warning(disable: 4127) // conditional  expression is constant (macros use this construct)
33
#endif
34
35
using namespace X265_NS;
36
37
namespace {
38
39
/* Amount of sub-pixel refinement work performed at one subpel refine level.
 * One entry per level is stored in the workload[] table. */
struct SubpelWorkload
{
    int  hpel_iters; // number of half-pel refinement passes
    int  hpel_dirs;  // candidate directions tested in each half-pel pass
    int  qpel_iters; // number of quarter-pel refinement passes
    int  qpel_dirs;  // candidate directions tested in each quarter-pel pass
    bool hpel_satd;  // true: half-pel candidates are scored with SATD, false: with SAD
};
47
48
/* Refinement schedule for every subpel refine level, indexed by level
 * (0 .. X265_MAX_SUBPEL_LEVEL).
 * Columns: hpel_iters, hpel_dirs, qpel_iters, qpel_dirs, hpel_satd */
const SubpelWorkload workload[X265_MAX_SUBPEL_LEVEL + 1] =
{
    /* 0 */ { 1, 4, 0, 4, false }, // 4 SAD HPEL only
    /* 1 */ { 1, 4, 1, 4, false }, // 4 SAD HPEL + 4 SATD QPEL
    /* 2 */ { 1, 4, 1, 4, true },  // 4 SATD HPEL + 4 SATD QPEL
    /* 3 */ { 2, 4, 1, 4, true },  // 2x4 SATD HPEL + 4 SATD QPEL
    /* 4 */ { 2, 4, 2, 4, true },  // 2x4 SATD HPEL + 2x4 SATD QPEL
    /* 5 */ { 1, 8, 1, 8, true },  // 8 SATD HPEL + 8 SATD QPEL (default)
    /* 6 */ { 2, 8, 1, 8, true },  // 2x8 SATD HPEL + 8 SATD QPEL
    /* 7 */ { 2, 8, 2, 8, true },  // 2x8 SATD HPEL + 2x8 SATD QPEL
};
59
60
/* Per-partition scale factor applied to SAD thresholds; populated by
 * MotionEstimate::initScales(). */
static int sizeScale[NUM_PU_SIZES];

/* True when the current best cost is below threshold v, scaled for the current
 * partition size.  Expects bcost and partEnum to be visible where it is used.
 * The argument is parenthesized so that any expression can be passed safely. */
#define SAD_THRESH(v) (bcost < ((((v) >> 4) * sizeScale[partEnum])))
62
63
/* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
64
const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) };
65
const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 };  /* (x-1)%6 */
66
const MV square1[9] = { MV(0, 0), MV(0, -1), MV(0, 1), MV(-1, 0), MV(1, 0), MV(-1, -1), MV(-1, 1), MV(1, -1), MV(1, 1) };
67
const MV hex4[16] =
68
{
69
    MV(0, -4), MV(0, 4), MV(-2, -3), MV(2, -3),
70
    MV(-4, -2), MV(4, -2), MV(-4, -1), MV(4, -1),
71
    MV(-4, 0), MV(4, 0), MV(-4, 1), MV(4, 1),
72
    MV(-4, 2), MV(4, 2), MV(-2, 3), MV(2, 3),
73
};
74
const MV offsets[] =
75
{
76
    MV(-1, 0), MV(0, -1),
77
    MV(-1, -1), MV(1, -1),
78
    MV(-1, 0), MV(1, 0),
79
    MV(-1, 1), MV(-1, -1),
80
    MV(1, -1), MV(1, 1),
81
    MV(-1, 0), MV(0, 1),
82
    MV(-1, 1), MV(1, 1),
83
    MV(1, 0), MV(0, 1),
84
}; // offsets for Two Point Search
85
86
/* sum of absolute differences between MV candidates, used for adaptive ME range */
87
inline int predictorDifference(const MV *mvc, intptr_t numCandidates)
88
0
{
89
0
    int sum = 0;
90
91
0
    for (int i = 0; i < numCandidates - 1; i++)
92
0
    {
93
0
        sum += abs(mvc[i].x - mvc[i + 1].x)
94
0
            +  abs(mvc[i].y - mvc[i + 1].y);
95
0
    }
96
97
0
    return sum;
98
0
}
99
100
}
101
102
/* Puts the estimator into a neutral state: not bound to any CTU or partition,
 * hex search selected for every search-method slot, subpel refine level 2,
 * a zero-sized block, chroma SATD disabled and no integral planes attached. */
MotionEstimate::MotionEstimate()
{
    /* not attached to a CTU / partition yet */
    ctuAddr = -1;
    absPartIdx = -1;

    /* default search characteristics */
    searchMethod = X265_HEX_SEARCH;
    searchMethodL0 = X265_HEX_SEARCH;
    searchMethodL1 = X265_HEX_SEARCH;
    subpelRefine = 2;

    /* no source PU configured */
    blockheight = 0;
    blockwidth = 0;
    blockOffset = 0;

    /* chroma distortion is off until setSourcePU() enables it */
    bChromaSATD = false;
    chromaSatd = NULL;

    for (int plane = 0; plane < INTEGRAL_PLANE_NUM; plane++)
        integral[plane] = NULL;
}
117
118
void MotionEstimate::init(int csp)
119
0
{
120
0
    fencPUYuv.create(FENC_STRIDE, csp);
121
0
}
122
123
void MotionEstimate::initScales(void)
124
0
{
125
0
#define SETUP_SCALE(W, H) \
126
0
    sizeScale[LUMA_ ## W ## x ## H] = (H * H) >> 4;
127
0
    SETUP_SCALE(4, 4);
128
0
    SETUP_SCALE(8, 8);
129
0
    SETUP_SCALE(8, 4);
130
0
    SETUP_SCALE(4, 8);
131
0
    SETUP_SCALE(16, 16);
132
0
    SETUP_SCALE(16, 8);
133
0
    SETUP_SCALE(8, 16);
134
0
    SETUP_SCALE(16, 12);
135
0
    SETUP_SCALE(12, 16);
136
0
    SETUP_SCALE(4, 16);
137
0
    SETUP_SCALE(16, 4);
138
0
    SETUP_SCALE(32, 32);
139
0
    SETUP_SCALE(32, 16);
140
0
    SETUP_SCALE(16, 32);
141
0
    SETUP_SCALE(32, 24);
142
0
    SETUP_SCALE(24, 32);
143
0
    SETUP_SCALE(32, 8);
144
0
    SETUP_SCALE(8, 32);
145
0
    SETUP_SCALE(64, 64);
146
0
    SETUP_SCALE(64, 32);
147
0
    SETUP_SCALE(32, 64);
148
0
    SETUP_SCALE(64, 48);
149
0
    SETUP_SCALE(48, 64);
150
0
    SETUP_SCALE(64, 16);
151
0
    SETUP_SCALE(16, 64);
152
0
#undef SETUP_SCALE
153
0
}
154
155
int MotionEstimate::hpelIterationCount(int subme)
156
0
{
157
0
    return workload[subme].hpel_iters +
158
0
           workload[subme].qpel_iters / 2;
159
0
}
160
161
/* Tears down the cached source-PU buffer that init() created. */
MotionEstimate::~MotionEstimate()
{
    this->fencPUYuv.destroy();
}
165
166
/* Called by lookahead, luma only, no use of PicYuv */
167
void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int method, const int searchL0, const int searchL1, const int refine)
168
0
{
169
0
    partEnum = partitionFromSizes(pwidth, pheight);
170
0
    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
171
0
    sad = primitives.pu[partEnum].sad;
172
0
    ads = primitives.pu[partEnum].ads;
173
0
    satd = primitives.pu[partEnum].satd;
174
0
    sad_x3 = primitives.pu[partEnum].sad_x3;
175
0
    sad_x4 = primitives.pu[partEnum].sad_x4;
176
177
178
0
    blockwidth = pwidth;
179
0
    blockOffset = offset;
180
0
    absPartIdx = ctuAddr = -1;
181
182
    /* Search params */
183
0
    searchMethod = method;
184
0
    searchMethodL0 = searchL0;
185
0
    searchMethodL1 = searchL1;
186
0
    subpelRefine = refine;
187
188
    /* copy PU block into cache */
189
0
    primitives.pu[partEnum].copy_pp(fencPUYuv.m_buf[0], FENC_STRIDE, fencY + offset, stride);
190
0
    X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
191
0
}
192
193
/* Called by lookahead, luma only, no use of PicYuv */
194
void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int method, const int refine)
195
0
{
196
0
    partEnum = partitionFromSizes(pwidth, pheight);
197
0
    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
198
0
    sad = primitives.pu[partEnum].sad;
199
0
    ads = primitives.pu[partEnum].ads;
200
0
    satd = primitives.pu[partEnum].satd;
201
0
    sad_x3 = primitives.pu[partEnum].sad_x3;
202
0
    sad_x4 = primitives.pu[partEnum].sad_x4;
203
204
205
0
    blockwidth = pwidth;
206
0
    blockOffset = offset;
207
0
    absPartIdx = ctuAddr = -1;
208
209
    /* Search params */
210
0
    searchMethod = method;
211
0
    subpelRefine = refine;
212
213
    /* copy PU block into cache */
214
0
    primitives.pu[partEnum].copy_pp(fencPUYuv.m_buf[0], FENC_STRIDE, fencY + offset, stride);
215
0
    X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
216
0
}
217
218
/* Called by Search::predInterSearch() or --pme equivalent, chroma residual might be considered */
219
void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight, const int method, const int refine, bool bChroma)
220
0
{
221
0
    partEnum = partitionFromSizes(pwidth, pheight);
222
0
    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
223
0
    sad = primitives.pu[partEnum].sad;
224
0
    ads = primitives.pu[partEnum].ads;
225
0
    satd = primitives.pu[partEnum].satd;
226
0
    sad_x3 = primitives.pu[partEnum].sad_x3;
227
0
    sad_x4 = primitives.pu[partEnum].sad_x4;
228
229
0
    chromaSatd = primitives.chroma[fencPUYuv.m_csp].pu[partEnum].satd;
230
231
    /* Set search characteristics */
232
0
    searchMethod = method;
233
0
    subpelRefine = refine;
234
235
    /* Enable chroma residual cost if subpelRefine level is greater than 2 and chroma block size
236
     * is an even multiple of 4x4 pixels (indicated by non-null chromaSatd pointer) */
237
0
    bChromaSATD = subpelRefine > 2 && chromaSatd && (srcFencYuv.m_csp != X265_CSP_I400 && bChroma);
238
0
    X265_CHECK(!(bChromaSATD && !workload[subpelRefine].hpel_satd), "Chroma SATD cannot be used with SAD hpel\n");
239
240
0
    ctuAddr = _ctuAddr;
241
0
    absPartIdx = cuPartIdx + puPartIdx;
242
0
    blockwidth = pwidth;
243
0
    blockOffset = 0;
244
245
    /* copy PU from CU Yuv */
246
0
    fencPUYuv.copyPUFromYuv(srcFencYuv, puPartIdx, partEnum, bChromaSATD);
247
0
}
248
249
/* Scores the candidate at absolute position (mx, my) with sad() plus the MV
 * cost and, when it beats bcost, records it as the new best together with its
 * star point number and distance.
 * Expects sad, fenc, fref, stride, mvcost, bcost, bmv, bPointNr and bDistance
 * to be in scope.  mx and my are evaluated more than once.  All arguments are
 * parenthesized so that arbitrary expressions can be passed. */
#define COST_MV_PT_DIST(mx, my, point, dist) \
    do \
    { \
        MV tmv((mx), (my)); \
        int cost = sad(fenc, FENC_STRIDE, fref + (mx) + (my) * stride, stride); \
        cost += mvcost(tmv << 2); \
        if (cost < bcost) { \
            bcost = cost; \
            bmv = tmv; \
            bPointNr = (point); \
            bDistance = (dist); \
        } \
    } while (0)
262
263
/* Scores the candidate at absolute position (mx, my) with sad() plus the MV
 * cost and keeps it in bcost/bmv when it is strictly cheaper.
 * Expects sad, fenc, fref, stride, mvcost, bcost and bmv to be in scope.
 * mx and my are evaluated more than once. */
#define COST_MV(mx, my) \
    do \
    { \
        int cost = sad(fenc, FENC_STRIDE, fref + (mx) + (my) * stride, stride) \
                   + mvcost(MV(mx, my) << 2); \
        COPY2_IF_LT(bcost, cost, bmv, MV(mx, my)); \
    } while (0)
270
271
/* Scores three candidates given as offsets from the current best MV (bmv) with
 * one sad_x3() call and adds the MV cost of each to costs[0..2].  The best
 * candidate is NOT updated; the caller inspects costs.
 * Expects sad_x3, fenc, fref, stride, mvcost and bmv to be in scope. */
#define COST_MV_X3_DIR(m0x, m0y, m1x, m1y, m2x, m2y, costs) \
    { \
        pixel *pix_base = fref + bmv.x + bmv.y * stride; \
        sad_x3(fenc, \
               pix_base + (m0x) + (m0y) * stride, \
               pix_base + (m1x) + (m1y) * stride, \
               pix_base + (m2x) + (m2y) * stride, \
               stride, costs); \
        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
    }
283
284
/* Four-candidate variant of COST_MV_PT_DIST: scores four absolute positions
 * with one sad_x4() call, adds their MV costs and, in argument order, promotes
 * any strictly cheaper candidate to bcost/bmv/bPointNr/bDistance.
 * Expects sad_x4, fenc, fref, stride, mvcost, a costs[] array, bcost, bmv,
 * bPointNr and bDistance to be in scope. */
#define COST_MV_PT_DIST_X4(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \
    { \
        sad_x4(fenc, \
               fref + (m0x) + (m0y) * stride, \
               fref + (m1x) + (m1y) * stride, \
               fref + (m2x) + (m2y) * stride, \
               fref + (m3x) + (m3y) * stride, \
               stride, costs); \
        costs[0] += mvcost(MV(m0x, m0y) << 2); \
        costs[1] += mvcost(MV(m1x, m1y) << 2); \
        costs[2] += mvcost(MV(m2x, m2y) << 2); \
        costs[3] += mvcost(MV(m3x, m3y) << 2); \
        COPY4_IF_LT(bcost, costs[0], bmv, MV(m0x, m0y), bPointNr, p0, bDistance, d0); \
        COPY4_IF_LT(bcost, costs[1], bmv, MV(m1x, m1y), bPointNr, p1, bDistance, d1); \
        COPY4_IF_LT(bcost, costs[2], bmv, MV(m2x, m2y), bPointNr, p2, bDistance, d2); \
        COPY4_IF_LT(bcost, costs[3], bmv, MV(m3x, m3y), bPointNr, p3, bDistance, d3); \
    }
301
302
/* Scores four candidates given as offsets RELATIVE to omv with one sad_x4()
 * call, adds their MV costs and, in argument order, promotes any strictly
 * cheaper candidate to bcost/bmv.  A candidate is only eligible when its
 * vertical position lies within [mvmin.y, mvmax.y]; the horizontal position
 * is not checked here.
 * Expects sad_x4, fenc, fref, stride, mvcost, omv, a costs[] array, mvmin,
 * mvmax, bcost and bmv to be in scope. */
#define COST_MV_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y) \
    { \
        pixel *pix_base = fref + omv.x + omv.y * stride; \
        sad_x4(fenc, \
               pix_base + (m0x) + (m0y) * stride, \
               pix_base + (m1x) + (m1y) * stride, \
               pix_base + (m2x) + (m2y) * stride, \
               pix_base + (m3x) + (m3y) * stride, \
               stride, costs); \
        costs[0] += mvcost((omv + MV(m0x, m0y)) << 2); \
        costs[1] += mvcost((omv + MV(m1x, m1y)) << 2); \
        costs[2] += mvcost((omv + MV(m2x, m2y)) << 2); \
        costs[3] += mvcost((omv + MV(m3x, m3y)) << 2); \
        if ((omv.y + m0y >= mvmin.y) & (omv.y + m0y <= mvmax.y)) \
        { \
            COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \
        } \
        if ((omv.y + m1y >= mvmin.y) & (omv.y + m1y <= mvmax.y)) \
        { \
            COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \
        } \
        if ((omv.y + m2y >= mvmin.y) & (omv.y + m2y <= mvmax.y)) \
        { \
            COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \
        } \
        if ((omv.y + m3y >= mvmin.y) & (omv.y + m3y <= mvmax.y)) \
        { \
            COPY2_IF_LT(bcost, costs[3], bmv, omv + MV(m3x, m3y)); \
        } \
    }
324
325
0
/* Scores three candidates at absolute positions with one sad_x3() call, adds
 * only the horizontal MV cost looked up in p_cost_mvx, and promotes any
 * strictly cheaper candidate to bcost/bmv in argument order.
 * Expects sad_x3, fenc, fref, stride, a costs[] array, p_cost_mvx, bcost and
 * bmv to be in scope.  NOTE(review): p_cost_mvx is not defined in the visible
 * part of this file - confirm it exists before using this macro. */
#define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y ) \
    { \
        sad_x3(fenc, \
               fref + (m0x) + (m0y) * stride, \
               fref + (m1x) + (m1y) * stride, \
               fref + (m2x) + (m2y) * stride, \
               stride, costs); \
        costs[0] += p_cost_mvx[(m0x) << 2]; /* no cost_mvy */ \
        costs[1] += p_cost_mvx[(m1x) << 2]; \
        costs[2] += p_cost_mvx[(m2x) << 2]; \
        COPY3_IF_LT(bcost, costs[0], bmv.x, m0x, bmv.y, m0y); \
        COPY3_IF_LT(bcost, costs[1], bmv.x, m1x, bmv.y, m1y); \
        COPY3_IF_LT(bcost, costs[2], bmv.x, m2x, bmv.y, m2y); \
    }
339
340
/* Scores four candidates given as offsets from the current best MV (bmv) with
 * one sad_x4() call and adds the MV cost of each to costs[0..3].  The best
 * candidate is NOT updated; the caller inspects costs.
 * Expects sad_x4, fenc, fref, stride, mvcost and bmv to be in scope. */
#define COST_MV_X4_DIR(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs) \
    { \
        pixel *pix_base = fref + bmv.x + bmv.y * stride; \
        sad_x4(fenc, \
               pix_base + (m0x) + (m0y) * stride, \
               pix_base + (m1x) + (m1y) * stride, \
               pix_base + (m2x) + (m2y) * stride, \
               pix_base + (m3x) + (m3y) * stride, \
               stride, costs); \
        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
        (costs)[3] += mvcost((bmv + MV(m3x, m3y)) << 2); \
    }
354
355
/* One radius-1 diamond step: re-centres omv on (mx, my) and scores the four
 * neighbours above, below, left and right of it through COST_MV_X4.
 * Expects everything COST_MV_X4 needs to be in scope. */
#define DIA1_ITER(mx, my) \
    { \
        omv.x = mx; \
        omv.y = my; \
        COST_MV_X4(0, -1, 0, 1, -1, 0, 1, 0); \
    }
360
361
/* Cross-shaped scan around omv: walks outwards along the horizontal axis up to
 * x_max and then along the vertical axis up to y_max, starting at distance
 * start.  When the whole arm fits inside [mvmin, mvmax] the points are scored
 * four at a time with COST_MV_X4; the remaining (or all, near a border) points
 * are scored one by one with COST_MV after an individual range check.
 * Expects omv, mvmin, mvmax and everything COST_MV / COST_MV_X4 need to be in
 * scope. */
#define CROSS(start, x_max, y_max) \
    { \
        int16_t i = start; \
        if ((x_max) <= X265_MIN(mvmax.x - omv.x, omv.x - mvmin.x)) \
        { \
            for (; i < (x_max) - 2; i += 4) \
            { \
                COST_MV_X4(i, 0, -i, 0, i + 2, 0, -i - 2, 0); \
            } \
        } \
        for (; i < (x_max); i += 2) \
        { \
            if (omv.x + i <= mvmax.x) \
                COST_MV(omv.x + i, omv.y); \
            if (omv.x - i >= mvmin.x) \
                COST_MV(omv.x - i, omv.y); \
        } \
        i = start; \
        if ((y_max) <= X265_MIN(mvmax.y - omv.y, omv.y - mvmin.y)) \
        { \
            for (; i < (y_max) - 2; i += 4) \
            { \
                COST_MV_X4(0, i, 0, -i, 0, i + 2, 0, -i - 2); \
            } \
        } \
        for (; i < (y_max); i += 2) \
        { \
            if (omv.y + i <= mvmax.y) \
                COST_MV(omv.x, omv.y + i); \
            if (omv.y - i >= mvmin.y) \
                COST_MV(omv.x, omv.y - i); \
        } \
    }
386
387
void MotionEstimate::StarPatternSearch(ReferencePlanes *ref,
388
                                       const MV &       mvmin,
389
                                       const MV &       mvmax,
390
                                       MV &             bmv,
391
                                       int &            bcost,
392
                                       int &            bPointNr,
393
                                       int &            bDistance,
394
                                       int              earlyExitIters,
395
                                       int              merange,
396
                                       int              hme)
397
0
{
398
0
    ALIGN_VAR_16(int, costs[16]);
399
0
    pixel* fenc = fencPUYuv.m_buf[0];
400
0
    pixel* fref = (hme? ref->fpelLowerResPlane[0] : ref->fpelPlane[0]) + blockOffset;
401
0
    intptr_t stride = hme? ref->lumaStride / 2 : ref->lumaStride;
402
403
0
    MV omv = bmv;
404
0
    int saved = bcost;
405
0
    int rounds = 0;
406
407
0
    {
408
0
        int16_t dist = 1;
409
410
        /* bPointNr
411
              2
412
            4 * 5
413
              7
414
         */
415
0
        const int32_t top    = omv.y - dist;
416
0
        const int32_t bottom = omv.y + dist;
417
0
        const int32_t left   = omv.x - dist;
418
0
        const int32_t right  = omv.x + dist;
419
420
0
        if (top >= mvmin.y && left >= mvmin.x && right <= mvmax.x && bottom <= mvmax.y)
421
0
        {
422
0
            COST_MV_PT_DIST_X4(omv.x,  top,    2, dist,
423
0
                               left,  omv.y,   4, dist,
424
0
                               right, omv.y,   5, dist,
425
0
                               omv.x,  bottom, 7, dist);
426
0
        }
427
0
        else
428
0
        {
429
0
            if (top >= mvmin.y) // check top
430
0
            {
431
0
                COST_MV_PT_DIST(omv.x, top, 2, dist);
432
0
            }
433
0
            if (left >= mvmin.x) // check middle left
434
0
            {
435
0
                COST_MV_PT_DIST(left, omv.y, 4, dist);
436
0
            }
437
0
            if (right <= mvmax.x) // check middle right
438
0
            {
439
0
                COST_MV_PT_DIST(right, omv.y, 5, dist);
440
0
            }
441
0
            if (bottom <= mvmax.y) // check bottom
442
0
            {
443
0
                COST_MV_PT_DIST(omv.x, bottom, 7, dist);
444
0
            }
445
0
        }
446
0
        if (bcost < saved)
447
0
            rounds = 0;
448
0
        else if (++rounds >= earlyExitIters)
449
0
            return;
450
0
    }
451
452
0
    for (int16_t dist = 2; dist <= 8; dist <<= 1)
453
0
    {
454
        /* bPointNr
455
              2
456
             1 3
457
            4 * 5
458
             6 8
459
              7
460
         Points 2, 4, 5, 7 are dist
461
         Points 1, 3, 6, 8 are dist>>1
462
         */
463
0
        const int32_t top     = omv.y - dist;
464
0
        const int32_t bottom  = omv.y + dist;
465
0
        const int32_t left    = omv.x - dist;
466
0
        const int32_t right   = omv.x + dist;
467
0
        const int32_t top2    = omv.y - (dist >> 1);
468
0
        const int32_t bottom2 = omv.y + (dist >> 1);
469
0
        const int32_t left2   = omv.x - (dist >> 1);
470
0
        const int32_t right2  = omv.x + (dist >> 1);
471
0
        saved = bcost;
472
473
0
        if (top >= mvmin.y && left >= mvmin.x &&
474
0
            right <= mvmax.x && bottom <= mvmax.y) // check border
475
0
        {
476
0
            COST_MV_PT_DIST_X4(omv.x,  top,   2, dist,
477
0
                               left2,  top2,  1, dist >> 1,
478
0
                               right2, top2,  3, dist >> 1,
479
0
                               left,   omv.y, 4, dist);
480
0
            COST_MV_PT_DIST_X4(right,  omv.y,   5, dist,
481
0
                               left2,  bottom2, 6, dist >> 1,
482
0
                               right2, bottom2, 8, dist >> 1,
483
0
                               omv.x,  bottom,  7, dist);
484
0
        }
485
0
        else // check border for each mv
486
0
        {
487
0
            if (top >= mvmin.y) // check top
488
0
            {
489
0
                COST_MV_PT_DIST(omv.x, top, 2, dist);
490
0
            }
491
0
            if (top2 >= mvmin.y) // check half top
492
0
            {
493
0
                if (left2 >= mvmin.x) // check half left
494
0
                {
495
0
                    COST_MV_PT_DIST(left2, top2, 1, (dist >> 1));
496
0
                }
497
0
                if (right2 <= mvmax.x) // check half right
498
0
                {
499
0
                    COST_MV_PT_DIST(right2, top2, 3, (dist >> 1));
500
0
                }
501
0
            }
502
0
            if (left >= mvmin.x) // check left
503
0
            {
504
0
                COST_MV_PT_DIST(left, omv.y, 4, dist);
505
0
            }
506
0
            if (right <= mvmax.x) // check right
507
0
            {
508
0
                COST_MV_PT_DIST(right, omv.y, 5, dist);
509
0
            }
510
0
            if (bottom2 <= mvmax.y) // check half bottom
511
0
            {
512
0
                if (left2 >= mvmin.x) // check half left
513
0
                {
514
0
                    COST_MV_PT_DIST(left2, bottom2, 6, (dist >> 1));
515
0
                }
516
0
                if (right2 <= mvmax.x) // check half right
517
0
                {
518
0
                    COST_MV_PT_DIST(right2, bottom2, 8, (dist >> 1));
519
0
                }
520
0
            }
521
0
            if (bottom <= mvmax.y) // check bottom
522
0
            {
523
0
                COST_MV_PT_DIST(omv.x, bottom, 7, dist);
524
0
            }
525
0
        }
526
527
0
        if (bcost < saved)
528
0
            rounds = 0;
529
0
        else if (++rounds >= earlyExitIters)
530
0
            return;
531
0
    }
532
533
0
    for (int16_t dist = 16; dist <= (int16_t)merange; dist <<= 1)
534
0
    {
535
0
        const int32_t top    = omv.y - dist;
536
0
        const int32_t bottom = omv.y + dist;
537
0
        const int32_t left   = omv.x - dist;
538
0
        const int32_t right  = omv.x + dist;
539
540
0
        saved = bcost;
541
0
        if (top >= mvmin.y && left >= mvmin.x &&
542
0
            right <= mvmax.x && bottom <= mvmax.y) // check border
543
0
        {
544
            /* index
545
                  0
546
                  3
547
                  2
548
                  1
549
          0 3 2 1 * 1 2 3 0
550
                  1
551
                  2
552
                  3
553
                  0
554
            */
555
556
0
            COST_MV_PT_DIST_X4(omv.x,  top,    0, dist,
557
0
                               left,   omv.y,  0, dist,
558
0
                               right,  omv.y,  0, dist,
559
0
                               omv.x,  bottom, 0, dist);
560
561
0
            for (int16_t index = 1; index < 4; index++)
562
0
            {
563
0
                int32_t posYT = top    + ((dist >> 2) * index);
564
0
                int32_t posYB = bottom - ((dist >> 2) * index);
565
0
                int32_t posXL = omv.x  - ((dist >> 2) * index);
566
0
                int32_t posXR = omv.x  + ((dist >> 2) * index);
567
568
0
                COST_MV_PT_DIST_X4(posXL, posYT, 0, dist,
569
0
                                   posXR, posYT, 0, dist,
570
0
                                   posXL, posYB, 0, dist,
571
0
                                   posXR, posYB, 0, dist);
572
0
            }
573
0
        }
574
0
        else // check border for each mv
575
0
        {
576
0
            if (top >= mvmin.y) // check top
577
0
            {
578
0
                COST_MV_PT_DIST(omv.x, top, 0, dist);
579
0
            }
580
0
            if (left >= mvmin.x) // check left
581
0
            {
582
0
                COST_MV_PT_DIST(left, omv.y, 0, dist);
583
0
            }
584
0
            if (right <= mvmax.x) // check right
585
0
            {
586
0
                COST_MV_PT_DIST(right, omv.y, 0, dist);
587
0
            }
588
0
            if (bottom <= mvmax.y) // check bottom
589
0
            {
590
0
                COST_MV_PT_DIST(omv.x, bottom, 0, dist);
591
0
            }
592
0
            for (int16_t index = 1; index < 4; index++)
593
0
            {
594
0
                int32_t posYT = top    + ((dist >> 2) * index);
595
0
                int32_t posYB = bottom - ((dist >> 2) * index);
596
0
                int32_t posXL = omv.x - ((dist >> 2) * index);
597
0
                int32_t posXR = omv.x + ((dist >> 2) * index);
598
599
0
                if (posYT >= mvmin.y) // check top
600
0
                {
601
0
                    if (posXL >= mvmin.x) // check left
602
0
                    {
603
0
                        COST_MV_PT_DIST(posXL, posYT, 0, dist);
604
0
                    }
605
0
                    if (posXR <= mvmax.x) // check right
606
0
                    {
607
0
                        COST_MV_PT_DIST(posXR, posYT, 0, dist);
608
0
                    }
609
0
                }
610
0
                if (posYB <= mvmax.y) // check bottom
611
0
                {
612
0
                    if (posXL >= mvmin.x) // check left
613
0
                    {
614
0
                        COST_MV_PT_DIST(posXL, posYB, 0, dist);
615
0
                    }
616
0
                    if (posXR <= mvmax.x) // check right
617
0
                    {
618
0
                        COST_MV_PT_DIST(posXR, posYB, 0, dist);
619
0
                    }
620
0
                }
621
0
            }
622
0
        }
623
624
0
        if (bcost < saved)
625
0
            rounds = 0;
626
0
        else if (++rounds >= earlyExitIters)
627
0
            return;
628
0
    }
629
0
}
630
631
/* Diamond search starting from the zero vector.
 *
 * Phase 1 scores diamonds of radius 1, 2 and 4 (plus the diagonal points at
 * half the radius) around the origin and stops as soon as a radius brings no
 * improvement.  Phase 2 re-centres on the best position and scores diamonds of
 * radius 8, 16, ... 64 with twelve extra points along the diamond edges,
 * re-centring after every improving round and stopping on the first round
 * without improvement.  Candidates outside [mvmin, mvmax] are skipped.
 *
 * ref           reference planes; ref->fpelPlane[0] + blockOffset is searched
 * mvmin, mvmax  inclusive bounds for candidate positions
 * outMV         receives the best position found
 * returns       the cost (sad + MV cost) of outMV
 *
 * COST_MV_X4 takes offsets RELATIVE to omv, whereas COST_MV takes absolute
 * positions.  The fast paths therefore pass relative offsets; passing absolute
 * coordinates would add omv twice whenever the centre is not (0, 0), scoring
 * the wrong positions and possibly reading outside the search window. */
int MotionEstimate::diamondSearch(ReferencePlanes* ref, const MV& mvmin, const MV& mvmax, MV& outMV)
{
    int bcost = INT_MAX;
    MV bmv(0, 0);
    MV omv = bmv;

    ALIGN_VAR_16(int, costs[16]);

    intptr_t stride = ref->lumaStride;
    pixel* fenc = fencPUYuv.m_buf[0];
    pixel* fref = ref->fpelPlane[0] + blockOffset;

    /* Phase 1: small diamonds around the (fixed) origin */
    for (int16_t dist = 1; dist <= 4; dist <<= 1)
    {
        const MV bmv0 = bmv;
        const int32_t half = dist >> 1;
        const int32_t top = omv.y - dist;
        const int32_t bottom = omv.y + dist;
        const int32_t left = omv.x - dist;
        const int32_t right = omv.x + dist;
        const int32_t top2 = omv.y - half;
        const int32_t bottom2 = omv.y + half;
        const int32_t left2 = omv.x - half;
        const int32_t right2 = omv.x + half;

        if (top >= mvmin.y && left >= mvmin.x && right <= mvmax.x && bottom <= mvmax.y)
        {
            /* offsets relative to omv: top, bottom, left, right */
            COST_MV_X4(0, -dist, 0, dist, -dist, 0, dist, 0);
            /* half-distance diagonals (all equal to the centre when dist == 1) */
            COST_MV_X4(-half, -half, half, -half, -half, half, half, half);
        }
        else // check border for each mv
        {
            if (top >= mvmin.y) // check top
            {
                COST_MV(omv.x, top);
            }
            if (top2 >= mvmin.y) // check half top
            {
                if (left2 >= mvmin.x)  // check half left
                {
                    COST_MV(left2, top2);
                }
                if (right2 <= mvmax.x) // check half right
                {
                    COST_MV(right2, top2);
                }
            }
            if (left >= mvmin.x) // check left
            {
                COST_MV(left, omv.y);
            }
            if (right <= mvmax.x) // check right
            {
                COST_MV(right, omv.y);
            }
            if (bottom2 <= mvmax.y) // check half bottom
            {
                if (left2 >= mvmin.x) // check half left
                {
                    COST_MV(left2, bottom2);
                }
                if (right2 <= mvmax.x) // check half right
                {
                    COST_MV(right2, bottom2);
                }
            }
            if (bottom <= mvmax.y) // check bottom
            {
                COST_MV(omv.x, bottom);
            }
        }

        if (bmv == bmv0)
            break;
    }

    /* Phase 2: larger diamonds, re-centred on the best position so far */
    omv = bmv;
    for (int16_t dist = 8; dist <= 64; dist += 8)
    {
        const MV bmv0 = bmv;
        const int32_t top = omv.y - dist;
        const int32_t bottom = omv.y + dist;
        const int32_t left = omv.x - dist;
        const int32_t right = omv.x + dist;

        if (top >= mvmin.y && left >= mvmin.x && right <= mvmax.x && bottom <= mvmax.y)
        {
            /* offsets relative to omv: top, left, right, bottom */
            COST_MV_X4(0, -dist, -dist, 0, dist, 0, 0, dist);

            for (int16_t index = 1; index < 4; index++)
            {
                /* step along the diamond edges, expressed relative to omv */
                const int32_t off = (dist >> 2) * index;

                COST_MV_X4(-off, off - dist,
                    off, off - dist,
                    -off, dist - off,
                    off, dist - off);
            }
        }
        else // check border for each mv
        {
            if (top >= mvmin.y) // check top
            {
                COST_MV(omv.x, top);
            }
            if (left >= mvmin.x) // check left
            {
                COST_MV(left, omv.y);
            }
            if (right <= mvmax.x) // check right
            {
                COST_MV(right, omv.y);
            }
            if (bottom <= mvmax.y) // check bottom
            {
                COST_MV(omv.x, bottom);
            }
            for (int16_t index = 1; index < 4; index++)
            {
                int32_t posYT = top + ((dist >> 2) * index);
                int32_t posYB = bottom - ((dist >> 2) * index);
                int32_t posXL = omv.x - ((dist >> 2) * index);
                int32_t posXR = omv.x + ((dist >> 2) * index);

                if (posYT >= mvmin.y) // check top
                {
                    if (posXL >= mvmin.x) // check left
                    {
                        COST_MV(posXL, posYT);
                    }
                    if (posXR <= mvmax.x) // check right
                    {
                        COST_MV(posXR, posYT);
                    }
                }
                if (posYB <= mvmax.y) // check bottom
                {
                    if (posXL >= mvmin.x) // check left
                    {
                        COST_MV(posXL, posYB);
                    }
                    if (posXR <= mvmax.x) // check right
                    {
                        COST_MV(posXR, posYB);
                    }
                }
            }
        }

        if (bmv == bmv0)
            break;
        omv = bmv;
    }
    outMV = bmv;
    return bcost;
}
789
790
void MotionEstimate::refineMV(ReferencePlanes* ref,
791
                              const MV&        mvmin,
792
                              const MV&        mvmax,
793
                              const MV&        qmvp,
794
                              MV&              outQMv)
795
0
{
796
0
    ALIGN_VAR_16(int, costs[16]);
797
0
    if (ctuAddr >= 0)
798
0
        blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0);
799
0
    intptr_t stride = ref->lumaStride;
800
0
    pixel* fenc = fencPUYuv.m_buf[0];
801
0
    pixel* fref = ref->fpelPlane[0] + blockOffset;
802
    
803
0
    setMVP(qmvp);
804
    
805
0
    MV qmvmin = mvmin.toQPel();
806
0
    MV qmvmax = mvmax.toQPel();
807
   
808
    /* The term cost used here means satd/sad values for that particular search.
809
     * The costs used in ME integer search only includes the SAD cost of motion
810
     * residual and sqrtLambda times MVD bits.  The subpel refine steps use SATD
811
     * cost of residual and sqrtLambda * MVD bits.
812
    */
813
             
814
    // measure SATD cost at clipped QPEL MVP
815
0
    MV pmv = qmvp.clipped(qmvmin, qmvmax);
816
0
    MV bestpre = pmv;
817
0
    int bprecost;
818
819
0
    bprecost = subpelCompare(ref, pmv, sad);
820
821
    /* re-measure full pel rounded MVP with SAD as search start point */
822
0
    MV bmv = pmv.roundToFPel();
823
0
    int bcost = bprecost;
824
0
    if (pmv.isSubpel())
825
0
        bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2);
826
827
    /* square refine */
828
0
    int dir = 0;
829
0
    COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs);
830
0
    if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
831
0
        COPY2_IF_LT(bcost, costs[0], dir, 1);
832
0
    if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
833
0
        COPY2_IF_LT(bcost, costs[1], dir, 2);
834
0
    COPY2_IF_LT(bcost, costs[2], dir, 3);
835
0
    COPY2_IF_LT(bcost, costs[3], dir, 4);
836
0
    COST_MV_X4_DIR(-1, -1, -1, 1, 1, -1, 1, 1, costs);
837
0
    if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
838
0
        COPY2_IF_LT(bcost, costs[0], dir, 5);
839
0
    if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
840
0
        COPY2_IF_LT(bcost, costs[1], dir, 6);
841
0
    if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
842
0
        COPY2_IF_LT(bcost, costs[2], dir, 7);
843
0
    if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
844
0
        COPY2_IF_LT(bcost, costs[3], dir, 8);
845
0
    bmv += square1[dir];
846
847
0
    if (bprecost < bcost)
848
0
    {
849
0
        bmv = bestpre;
850
0
        bcost = bprecost;
851
0
    }
852
0
    else
853
0
        bmv = bmv.toQPel(); // promote search bmv to qpel
854
855
    // TO DO: Change SubpelWorkload to fine tune MV
856
    // Now it is set to 5 for experiment.
857
    // const SubpelWorkload& wl = workload[this->subpelRefine];
858
0
    const SubpelWorkload& wl = workload[5];
859
860
0
    pixelcmp_t hpelcomp;
861
862
0
    if (wl.hpel_satd)
863
0
    {
864
0
        bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
865
0
        hpelcomp = satd;
866
0
    }
867
0
    else
868
0
        hpelcomp = sad;
869
870
0
    for (int iter = 0; iter < wl.hpel_iters; iter++)
871
0
    {
872
0
        int bdir = 0;
873
0
        for (int i = 1; i <= wl.hpel_dirs; i++)
874
0
        {
875
0
            MV qmv = bmv + square1[i] * 2;            
876
877
            // check mv range for slice bound
878
0
            if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
879
0
                continue;
880
881
0
            int cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);
882
0
            COPY2_IF_LT(bcost, cost, bdir, i);
883
0
        }
884
885
0
        if (bdir)
886
0
            bmv += square1[bdir] * 2;            
887
0
        else
888
0
            break;
889
0
    }
890
891
    /* if HPEL search used SAD, remeasure with SATD before QPEL */
892
0
    if (!wl.hpel_satd)
893
0
        bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
894
895
0
    for (int iter = 0; iter < wl.qpel_iters; iter++)
896
0
    {
897
0
        int bdir = 0;
898
0
        for (int i = 1; i <= wl.qpel_dirs; i++)
899
0
        {
900
0
            MV qmv = bmv + square1[i];
901
            
902
            // check mv range for slice bound
903
0
            if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
904
0
                continue;
905
906
0
            int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
907
0
            COPY2_IF_LT(bcost, cost, bdir, i);
908
0
        }
909
910
0
        if (bdir)
911
0
            bmv += square1[bdir];
912
0
        else
913
0
            break;
914
0
    }
915
916
    // check mv range for slice bound
917
0
    X265_CHECK(((pmv.y >= qmvmin.y) & (pmv.y <= qmvmax.y)), "mv beyond range!");
918
    
919
0
    x265_emms();
920
0
    outQMv = bmv;
921
0
}
922
923
int MotionEstimate::motionEstimate(ReferencePlanes *ref,
924
                                   const MV &       mvmin,
925
                                   const MV &       mvmax,
926
                                   const MV &       qmvp,
927
                                   int              numCandidates,
928
                                   const MV *       mvc,
929
                                   int              merange,
930
                                   MV &             outQMv,
931
                                   uint32_t         maxSlices,
932
                                    bool            m_vertRestriction,
933
                                   pixel *          srcReferencePlane)
934
0
{
935
0
    ALIGN_VAR_16(int, costs[16]);
936
0
    bool hme = srcReferencePlane && srcReferencePlane == ref->fpelLowerResPlane[0];
937
0
    if (ctuAddr >= 0)
938
0
        blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0);
939
0
    intptr_t stride = hme ? ref->lumaStride / 2 : ref->lumaStride;
940
0
    pixel* fenc = fencPUYuv.m_buf[0];
941
0
    pixel* fref = srcReferencePlane == 0 ? ref->fpelPlane[0] + blockOffset : srcReferencePlane + blockOffset;
942
943
0
    setMVP(qmvp);
944
945
0
    MV qmvmin = mvmin.toQPel();
946
0
    MV qmvmax = mvmax.toQPel();
947
948
    /* The term cost used here means satd/sad values for that particular search.
949
     * The costs used in ME integer search only includes the SAD cost of motion
950
     * residual and sqrtLambda times MVD bits.  The subpel refine steps use SATD
951
     * cost of residual and sqrtLambda * MVD bits.  Mode decision will be based
952
     * on video distortion cost (SSE/PSNR) plus lambda times all signaling bits
953
     * (mode + MVD bits). */
954
955
    // measure SAD cost at clipped QPEL MVP
956
0
    MV pmv = qmvp.clipped(qmvmin, qmvmax);
957
0
    if (m_vertRestriction)
958
0
    {
959
0
        if (pmv.y > mvmax.y << 2)
960
0
        {
961
0
            pmv.y = (mvmax.y << 2);
962
0
        }
963
0
    }
964
0
    MV bestpre = pmv;
965
0
    int bprecost;
966
967
0
    if (ref->isLowres)
968
0
        bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad, hme);
969
0
    else
970
0
        bprecost = subpelCompare(ref, pmv, sad);
971
972
    /* re-measure full pel rounded MVP with SAD as search start point */
973
0
    MV bmv = pmv.roundToFPel();
974
0
    int bcost = bprecost;
975
0
    if (pmv.isSubpel())
976
0
        bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2);
977
978
    // measure SAD cost at MV(0) if MVP is not zero
979
0
    if (pmv.notZero())
980
0
    {
981
0
        int cost = sad(fenc, FENC_STRIDE, fref, stride) + mvcost(MV(0, 0));
982
0
        if (cost < bcost)
983
0
        {
984
0
            bcost = cost;
985
0
            bmv = 0;
986
0
            bmv.y = X265_MAX(X265_MIN(0, mvmax.y), mvmin.y);
987
0
        }
988
0
    }
989
990
0
    X265_CHECK(!(ref->isLowres && numCandidates), "lowres motion candidates not allowed\n")
991
    // measure SAD cost at each QPEL motion vector candidate
992
0
    for (int i = 0; i < numCandidates; i++)
993
0
    {
994
0
        MV m = mvc[i].clipped(qmvmin, qmvmax);
995
0
        if (m.notZero() & (m != pmv ? 1 : 0) & (m != bestpre ? 1 : 0)) // check already measured
996
0
        {
997
0
            int cost = subpelCompare(ref, m, sad) + mvcost(m);
998
0
            if (cost < bprecost)
999
0
            {
1000
0
                bprecost = cost;
1001
0
                bestpre = m;
1002
0
            }
1003
0
        }
1004
0
    }
1005
1006
0
    pmv = pmv.roundToFPel();
1007
0
    MV omv = bmv;  // current search origin or starting point
1008
0
    if (bcost == 0)
1009
0
    {
1010
0
        outQMv = bmv.toQPel();
1011
0
        return mvcost(bmv << 2); // return just the MV cost (no residual)
1012
0
    }
1013
0
    int search = ref->isHMELowres ? (hme ? searchMethodL0 : searchMethodL1) : searchMethod;
1014
0
    switch (search)
1015
0
    {
1016
0
    case X265_DIA_SEARCH:
1017
0
    {
1018
        /* diamond search, radius 1 */
1019
0
        bcost <<= 4;
1020
0
        int i = merange;
1021
0
        do
1022
0
        {
1023
0
            COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs);
1024
0
            if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
1025
0
                COPY1_IF_LT(bcost, (costs[0] << 4) + 1);
1026
0
            if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
1027
0
                COPY1_IF_LT(bcost, (costs[1] << 4) + 3);
1028
0
            COPY1_IF_LT(bcost, (costs[2] << 4) + 4);
1029
0
            COPY1_IF_LT(bcost, (costs[3] << 4) + 12);
1030
0
            if (!(bcost & 15))
1031
0
                break;
1032
0
            bmv.x -= (bcost << 28) >> 30;
1033
0
            bmv.y -= (bcost << 30) >> 30;
1034
0
            bcost &= ~15;
1035
0
        }
1036
0
        while (--i && bmv.checkRange(mvmin, mvmax));
1037
0
        bcost >>= 4;
1038
0
        break;
1039
0
    }
1040
1041
0
    case X265_HEX_SEARCH:
1042
0
    {
1043
0
me_hex2:
1044
        /* hexagon search, radius 2 */
1045
#if 0
1046
        for (int i = 0; i < merange / 2; i++)
1047
        {
1048
            omv = bmv;
1049
            COST_MV(omv.x - 2, omv.y);
1050
            COST_MV(omv.x - 1, omv.y + 2);
1051
            COST_MV(omv.x + 1, omv.y + 2);
1052
            COST_MV(omv.x + 2, omv.y);
1053
            COST_MV(omv.x + 1, omv.y - 2);
1054
            COST_MV(omv.x - 1, omv.y - 2);
1055
            if (omv == bmv)
1056
                break;
1057
            if (!bmv.checkRange(mvmin, mvmax))
1058
                break;
1059
        }
1060
1061
#else // if 0
1062
      /* equivalent to the above, but eliminates duplicate candidates */
1063
0
        COST_MV_X3_DIR(-2, 0, -1, 2,  1, 2, costs);
1064
0
        bcost <<= 3;
1065
0
        if ((bmv.y >= mvmin.y) & (bmv.y <= mvmax.y))
1066
0
            COPY1_IF_LT(bcost, (costs[0] << 3) + 2);
1067
0
        if ((bmv.y + 2 >= mvmin.y) & (bmv.y + 2 <= mvmax.y))
1068
0
        {
1069
0
            COPY1_IF_LT(bcost, (costs[1] << 3) + 3);
1070
0
            COPY1_IF_LT(bcost, (costs[2] << 3) + 4);
1071
0
        }
1072
1073
0
        COST_MV_X3_DIR(2, 0,  1, -2, -1, -2, costs);
1074
0
        if ((bmv.y >= mvmin.y) & (bmv.y <= mvmax.y))
1075
0
            COPY1_IF_LT(bcost, (costs[0] << 3) + 5);
1076
0
        if ((bmv.y - 2 >= mvmin.y) & (bmv.y - 2 <= mvmax.y))
1077
0
        {
1078
0
            COPY1_IF_LT(bcost, (costs[1] << 3) + 6);
1079
0
            COPY1_IF_LT(bcost, (costs[2] << 3) + 7);
1080
0
        }
1081
1082
0
        if (bcost & 7)
1083
0
        {
1084
0
            int dir = (bcost & 7) - 2;
1085
1086
0
            if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y))
1087
0
            {
1088
0
                bmv += hex2[dir + 1];
1089
1090
                /* half hexagon, not overlapping the previous iteration */
1091
0
                for (int i = (merange >> 1) - 1; i > 0 && bmv.checkRange(mvmin, mvmax); i--)
1092
0
                {
1093
0
                    COST_MV_X3_DIR(hex2[dir + 0].x, hex2[dir + 0].y,
1094
0
                        hex2[dir + 1].x, hex2[dir + 1].y,
1095
0
                        hex2[dir + 2].x, hex2[dir + 2].y,
1096
0
                        costs);
1097
0
                    bcost &= ~7;
1098
1099
0
                    if ((bmv.y + hex2[dir + 0].y >= mvmin.y) & (bmv.y + hex2[dir + 0].y <= mvmax.y))
1100
0
                        COPY1_IF_LT(bcost, (costs[0] << 3) + 1);
1101
1102
0
                    if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y))
1103
0
                        COPY1_IF_LT(bcost, (costs[1] << 3) + 2);
1104
1105
0
                    if ((bmv.y + hex2[dir + 2].y >= mvmin.y) & (bmv.y + hex2[dir + 2].y <= mvmax.y))
1106
0
                        COPY1_IF_LT(bcost, (costs[2] << 3) + 3);
1107
1108
0
                    if (!(bcost & 7))
1109
0
                        break;
1110
1111
0
                    dir += (bcost & 7) - 2;
1112
0
                    dir = mod6m1[dir + 1];
1113
0
                    bmv += hex2[dir + 1];
1114
0
                }
1115
0
            } // if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y))
1116
0
        }
1117
0
        bcost >>= 3;
1118
0
#endif // if 0
1119
1120
        /* square refine */
1121
0
        int dir = 0;
1122
0
        COST_MV_X4_DIR(0, -1,  0, 1, -1, 0, 1, 0, costs);
1123
0
        if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
1124
0
            COPY2_IF_LT(bcost, costs[0], dir, 1);
1125
0
        if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
1126
0
            COPY2_IF_LT(bcost, costs[1], dir, 2);
1127
0
        COPY2_IF_LT(bcost, costs[2], dir, 3);
1128
0
        COPY2_IF_LT(bcost, costs[3], dir, 4);
1129
0
        COST_MV_X4_DIR(-1, -1, -1, 1, 1, -1, 1, 1, costs);
1130
0
        if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
1131
0
            COPY2_IF_LT(bcost, costs[0], dir, 5);
1132
0
        if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
1133
0
            COPY2_IF_LT(bcost, costs[1], dir, 6);
1134
0
        if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
1135
0
            COPY2_IF_LT(bcost, costs[2], dir, 7);
1136
0
        if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
1137
0
            COPY2_IF_LT(bcost, costs[3], dir, 8);
1138
0
        bmv += square1[dir];
1139
0
        break;
1140
0
    }
1141
1142
0
    case X265_UMH_SEARCH:
1143
0
    {
1144
0
        int ucost1, ucost2;
1145
0
        int16_t cross_start = 1;
1146
1147
        /* refine predictors */
1148
0
        omv = bmv;
1149
0
        ucost1 = bcost;
1150
0
        X265_CHECK(((pmv.y >= mvmin.y) & (pmv.y <= mvmax.y)), "pmv outside of search range!");
1151
0
        DIA1_ITER(pmv.x, pmv.y);
1152
0
        if (pmv.notZero())
1153
0
            DIA1_ITER(0, 0);
1154
1155
0
        ucost2 = bcost;
1156
0
        if (bmv.notZero() && bmv != pmv)
1157
0
            DIA1_ITER(bmv.x, bmv.y);
1158
0
        if (bcost == ucost2)
1159
0
            cross_start = 3;
1160
1161
        /* Early Termination */
1162
0
        omv = bmv;
1163
0
        if (bcost == ucost2 && SAD_THRESH(2000))
1164
0
        {
1165
0
            COST_MV_X4(0, -2, -1, -1, 1, -1, -2, 0);
1166
0
            COST_MV_X4(2, 0, -1, 1, 1, 1,  0, 2);
1167
0
            if (bcost == ucost1 && SAD_THRESH(500))
1168
0
                break;
1169
0
            if (bcost == ucost2)
1170
0
            {
1171
0
                int16_t range = (int16_t)(merange >> 1) | 1;
1172
0
                CROSS(3, range, range);
1173
0
                COST_MV_X4(-1, -2, 1, -2, -2, -1, 2, -1);
1174
0
                COST_MV_X4(-2, 1, 2, 1, -1, 2, 1, 2);
1175
0
                if (bcost == ucost2)
1176
0
                    break;
1177
0
                cross_start = range + 2;
1178
0
            }
1179
0
        }
1180
1181
        // TODO: Need to study x264's logic for building mvc list to understand why they
1182
        //       have special cases here for 16x16, and whether they apply to HEVC CTU
1183
1184
        // adaptive search range based on mvc variability
1185
0
        if (numCandidates)
1186
0
        {
1187
            /* range multipliers based on casual inspection of some statistics of
1188
             * average distance between current predictor and final mv found by ESA.
1189
             * these have not been tuned much by actual encoding. */
1190
0
            static const uint8_t range_mul[4][4] =
1191
0
            {
1192
0
                { 3, 3, 4, 4 },
1193
0
                { 3, 4, 4, 4 },
1194
0
                { 4, 4, 4, 5 },
1195
0
                { 4, 4, 5, 6 },
1196
0
            };
1197
1198
0
            int mvd;
1199
0
            int sad_ctx, mvd_ctx;
1200
0
            int denom = 1;
1201
1202
0
            if (numCandidates == 1)
1203
0
            {
1204
0
                if (LUMA_64x64 == partEnum)
1205
                    /* mvc is probably the same as mvp, so the difference isn't meaningful.
1206
                     * but prediction usually isn't too bad, so just use medium range */
1207
0
                    mvd = 25;
1208
0
                else
1209
0
                    mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);
1210
0
            }
1211
0
            else
1212
0
            {
1213
                /* calculate the degree of agreement between predictors. */
1214
1215
                /* in 64x64, mvc includes all the neighbors used to make mvp,
1216
                 * so don't count mvp separately. */
1217
1218
0
                denom = numCandidates - 1;
1219
0
                mvd = 0;
1220
0
                if (partEnum != LUMA_64x64)
1221
0
                {
1222
0
                    mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);
1223
0
                    denom++;
1224
0
                }
1225
0
                mvd += predictorDifference(mvc, numCandidates);
1226
0
            }
1227
1228
0
            sad_ctx = SAD_THRESH(1000) ? 0
1229
0
                : SAD_THRESH(2000) ? 1
1230
0
                : SAD_THRESH(4000) ? 2 : 3;
1231
0
            mvd_ctx = mvd < 10 * denom ? 0
1232
0
                : mvd < 20 * denom ? 1
1233
0
                : mvd < 40 * denom ? 2 : 3;
1234
1235
0
            merange = (merange * range_mul[mvd_ctx][sad_ctx]) >> 2;
1236
0
        }
1237
1238
        /* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy.
1239
         * we are still centered on the same place as the DIA2. is this desirable? */
1240
0
        CROSS(cross_start, merange, merange >> 1);
1241
0
        COST_MV_X4(-2, -2, -2, 2, 2, -2, 2, 2);
1242
1243
        /* hexagon grid */
1244
0
        omv = bmv;
1245
0
        const uint16_t *p_cost_omvx = m_cost_mvx + omv.x * 4;
1246
0
        const uint16_t *p_cost_omvy = m_cost_mvy + omv.y * 4;
1247
0
        uint16_t i = 1;
1248
0
        do
1249
0
        {
1250
0
            if (4 * i > X265_MIN4(mvmax.x - omv.x, omv.x - mvmin.x,
1251
0
                                  mvmax.y - omv.y, omv.y - mvmin.y))
1252
0
            {
1253
0
                for (int j = 0; j < 16; j++)
1254
0
                {
1255
0
                    MV mv = omv + (hex4[j] * i);
1256
0
                    if (mv.checkRange(mvmin, mvmax))
1257
0
                        COST_MV(mv.x, mv.y);
1258
0
                }
1259
0
            }
1260
0
            else
1261
0
            {
1262
0
                int16_t dir = 0;
1263
0
                pixel *fref_base = fref + omv.x + (omv.y - 4 * i) * stride;
1264
0
                size_t dy = (size_t)i * stride;
1265
0
#define SADS(k, x0, y0, x1, y1, x2, y2, x3, y3) \
1266
0
    sad_x4(fenc, \
1267
0
           fref_base x0 * i + (y0 - 2 * k + 4) * dy, \
1268
0
           fref_base x1 * i + (y1 - 2 * k + 4) * dy, \
1269
0
           fref_base x2 * i + (y2 - 2 * k + 4) * dy, \
1270
0
           fref_base x3 * i + (y3 - 2 * k + 4) * dy, \
1271
0
           stride, costs + 4 * k); \
1272
0
    fref_base += 2 * dy;
1273
0
#define ADD_MVCOST(k, x, y) costs[k] += p_cost_omvx[x * 4 * i] + p_cost_omvy[y * 4 * i]
1274
0
#define MIN_MV(k, dx, dy)     if ((omv.y + (dy) >= mvmin.y) & (omv.y + (dy) <= mvmax.y)) { COPY2_IF_LT(bcost, costs[k], dir, dx * 16 + (dy & 15)) }
1275
1276
0
                SADS(0, +0, -4, +0, +4, -2, -3, +2, -3);
1277
0
                SADS(1, -4, -2, +4, -2, -4, -1, +4, -1);
1278
0
                SADS(2, -4, +0, +4, +0, -4, +1, +4, +1);
1279
0
                SADS(3, -4, +2, +4, +2, -2, +3, +2, +3);
1280
0
                ADD_MVCOST(0, 0, -4);
1281
0
                ADD_MVCOST(1, 0, 4);
1282
0
                ADD_MVCOST(2, -2, -3);
1283
0
                ADD_MVCOST(3, 2, -3);
1284
0
                ADD_MVCOST(4, -4, -2);
1285
0
                ADD_MVCOST(5, 4, -2);
1286
0
                ADD_MVCOST(6, -4, -1);
1287
0
                ADD_MVCOST(7, 4, -1);
1288
0
                ADD_MVCOST(8, -4, 0);
1289
0
                ADD_MVCOST(9, 4, 0);
1290
0
                ADD_MVCOST(10, -4, 1);
1291
0
                ADD_MVCOST(11, 4, 1);
1292
0
                ADD_MVCOST(12, -4, 2);
1293
0
                ADD_MVCOST(13, 4, 2);
1294
0
                ADD_MVCOST(14, -2, 3);
1295
0
                ADD_MVCOST(15, 2, 3);
1296
0
                MIN_MV(0, 0, -4);
1297
0
                MIN_MV(1, 0, 4);
1298
0
                MIN_MV(2, -2, -3);
1299
0
                MIN_MV(3, 2, -3);
1300
0
                MIN_MV(4, -4, -2);
1301
0
                MIN_MV(5, 4, -2);
1302
0
                MIN_MV(6, -4, -1);
1303
0
                MIN_MV(7, 4, -1);
1304
0
                MIN_MV(8, -4, 0);
1305
0
                MIN_MV(9, 4, 0);
1306
0
                MIN_MV(10, -4, 1);
1307
0
                MIN_MV(11, 4, 1);
1308
0
                MIN_MV(12, -4, 2);
1309
0
                MIN_MV(13, 4, 2);
1310
0
                MIN_MV(14, -2, 3);
1311
0
                MIN_MV(15, 2, 3);
1312
0
#undef SADS
1313
0
#undef ADD_MVCOST
1314
0
#undef MIN_MV
1315
0
                if (dir)
1316
0
                {
1317
0
                    bmv.x = omv.x + i * (dir >> 4);
1318
0
                    bmv.y = omv.y + i * ((dir << 28) >> 28);
1319
0
                }
1320
0
            }
1321
0
        }
1322
0
        while (++i <= merange >> 2);
1323
0
        if (bmv.checkRange(mvmin, mvmax))
1324
0
            goto me_hex2;
1325
0
        break;
1326
0
    }
1327
1328
0
    case X265_STAR_SEARCH: // Adapted from HM ME
1329
0
    {
1330
0
        int bPointNr = 0;
1331
0
        int bDistance = 0;
1332
1333
0
        const int EarlyExitIters = 3;
1334
0
        StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, EarlyExitIters, merange, hme);
1335
0
        if (bDistance == 1)
1336
0
        {
1337
            // if best distance was only 1, check two missing points.  If no new point is found, stop
1338
0
            if (bPointNr)
1339
0
            {
1340
                /* For a given direction 1 to 8, check nearest two outer X pixels
1341
                     X   X
1342
                   X 1 2 3 X
1343
                     4 * 5
1344
                   X 6 7 8 X
1345
                     X   X
1346
                */
1347
0
                int saved = bcost;
1348
0
                const MV mv1 = bmv + offsets[(bPointNr - 1) * 2];
1349
0
                const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1];
1350
0
                if (mv1.checkRange(mvmin, mvmax))
1351
0
                {
1352
0
                    COST_MV(mv1.x, mv1.y);
1353
0
                }
1354
0
                if (mv2.checkRange(mvmin, mvmax))
1355
0
                {
1356
0
                    COST_MV(mv2.x, mv2.y);
1357
0
                }
1358
0
                if (bcost == saved)
1359
0
                    break;
1360
0
            }
1361
0
            else
1362
0
                break;
1363
0
        }
1364
1365
0
        const int RasterDistance = 5;
1366
0
        if (bDistance > RasterDistance)
1367
0
        {
1368
            // raster search refinement if original search distance was too big
1369
0
            MV tmv;
1370
0
            for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y += RasterDistance)
1371
0
            {
1372
0
                for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x += RasterDistance)
1373
0
                {
1374
0
                    if (tmv.x + (RasterDistance * 3) <= mvmax.x)
1375
0
                    {
1376
0
                        pixel *pix_base = fref + tmv.y * stride + tmv.x;
1377
0
                        sad_x4(fenc,
1378
0
                               pix_base,
1379
0
                               pix_base + RasterDistance,
1380
0
                               pix_base + RasterDistance * 2,
1381
0
                               pix_base + RasterDistance * 3,
1382
0
                               stride, costs);
1383
0
                        costs[0] += mvcost(tmv << 2);
1384
0
                        COPY2_IF_LT(bcost, costs[0], bmv, tmv);
1385
0
                        tmv.x += RasterDistance;
1386
0
                        costs[1] += mvcost(tmv << 2);
1387
0
                        COPY2_IF_LT(bcost, costs[1], bmv, tmv);
1388
0
                        tmv.x += RasterDistance;
1389
0
                        costs[2] += mvcost(tmv << 2);
1390
0
                        COPY2_IF_LT(bcost, costs[2], bmv, tmv);
1391
0
                        tmv.x += RasterDistance;
1392
0
                        costs[3] += mvcost(tmv << 3);
1393
0
                        COPY2_IF_LT(bcost, costs[3], bmv, tmv);
1394
0
                    }
1395
0
                    else
1396
0
                        COST_MV(tmv.x, tmv.y);
1397
0
                }
1398
0
            }
1399
0
        }
1400
1401
0
        while (bDistance > 0)
1402
0
        {
1403
            // center a new search around current best
1404
0
            bDistance = 0;
1405
0
            bPointNr = 0;
1406
0
            const int MaxIters = 32;
1407
0
            StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, MaxIters, merange, hme);
1408
1409
0
            if (bDistance == 1)
1410
0
            {
1411
0
                if (!bPointNr)
1412
0
                    break;
1413
1414
                /* For a given direction 1 to 8, check nearest 2 outer X pixels
1415
                        X   X
1416
                    X 1 2 3 X
1417
                        4 * 5
1418
                    X 6 7 8 X
1419
                        X   X
1420
                */
1421
0
                const MV mv1 = bmv + offsets[(bPointNr - 1) * 2];
1422
0
                const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1];
1423
0
                if (mv1.checkRange(mvmin, mvmax))
1424
0
                {
1425
0
                    COST_MV(mv1.x, mv1.y);
1426
0
                }
1427
0
                if (mv2.checkRange(mvmin, mvmax))
1428
0
                {
1429
0
                    COST_MV(mv2.x, mv2.y);
1430
0
                }
1431
0
                break;
1432
0
            }
1433
0
        }
1434
1435
0
        break;
1436
0
    }
1437
1438
0
    case X265_SEA:
1439
0
    {
1440
        // Successive Elimination Algorithm
1441
0
        const int32_t minX = X265_MAX(omv.x - (int32_t)merange, mvmin.x);
1442
0
        const int32_t minY = X265_MAX(omv.y - (int32_t)merange, mvmin.y);
1443
0
        const int32_t maxX = X265_MIN(omv.x + (int32_t)merange, mvmax.x);
1444
0
        const int32_t maxY = X265_MIN(omv.y + (int32_t)merange, mvmax.y);
1445
0
        const uint16_t *p_cost_mvx = m_cost_mvx - qmvp.x;
1446
0
        const uint16_t *p_cost_mvy = m_cost_mvy - qmvp.y;
1447
0
        int16_t* meScratchBuffer = NULL;
1448
0
        int scratchSize = merange * 2 + 4;
1449
0
        if (scratchSize)
1450
0
        {
1451
0
            meScratchBuffer = X265_MALLOC(int16_t, scratchSize);
1452
0
            memset(meScratchBuffer, 0, sizeof(int16_t)* scratchSize);
1453
0
        }
1454
1455
        /* SEA is fastest in multiples of 4 */
1456
0
        int meRangeWidth = (maxX - minX + 3) & ~3;
1457
0
        int w = 0, h = 0;                    // Width and height of the PU
1458
0
        ALIGN_VAR_32(pixel, zero[64 * FENC_STRIDE]) = { 0 };
1459
0
        ALIGN_VAR_32(int, encDC[4]);
1460
0
        uint16_t *fpelCostMvX = m_fpelMvCosts[-qmvp.x & 3] + (-qmvp.x >> 2);
1461
0
        sizesFromPartition(partEnum, &w, &h);
1462
0
        int deltaX = (w <= 8) ? (w) : (w >> 1);
1463
0
        int deltaY = (h <= 8) ? (h) : (h >> 1);
1464
1465
        /* Check if very small rectangular blocks which cannot be sub-divided anymore */
1466
0
        bool smallRectPartition = partEnum == LUMA_4x4 || partEnum == LUMA_16x12 ||
1467
0
            partEnum == LUMA_12x16 || partEnum == LUMA_16x4 || partEnum == LUMA_4x16;
1468
        /* Check if vertical partition */
1469
0
        bool verticalRect = partEnum == LUMA_32x64 || partEnum == LUMA_16x32 || partEnum == LUMA_8x16 ||
1470
0
            partEnum == LUMA_4x8;
1471
        /* Check if horizontal partition */
1472
0
        bool horizontalRect = partEnum == LUMA_64x32 || partEnum == LUMA_32x16 || partEnum == LUMA_16x8 ||
1473
0
            partEnum == LUMA_8x4;
1474
        /* Check if asymmetric vertical partition */
1475
0
        bool assymetricVertical = partEnum == LUMA_12x16 || partEnum == LUMA_4x16 || partEnum == LUMA_24x32 ||
1476
0
            partEnum == LUMA_8x32 || partEnum == LUMA_48x64 || partEnum == LUMA_16x64;
1477
        /* Check if asymmetric horizontal partition */
1478
0
        bool assymetricHorizontal = partEnum == LUMA_16x12 || partEnum == LUMA_16x4 || partEnum == LUMA_32x24 ||
1479
0
            partEnum == LUMA_32x8 || partEnum == LUMA_64x48 || partEnum == LUMA_64x16;
1480
1481
0
        int tempPartEnum = 0;
1482
1483
        /* If a vertical rectangular partition, it is horizontally split into two, for ads_x2() */
1484
0
        if (verticalRect)
1485
0
            tempPartEnum = partitionFromSizes(w, h >> 1);
1486
        /* If a horizontal rectangular partition, it is vertically split into two, for ads_x2() */
1487
0
        else if (horizontalRect)
1488
0
            tempPartEnum = partitionFromSizes(w >> 1, h);
1489
        /* We have integral planes introduced to account for asymmetric partitions.
1490
         * Hence all asymmetric partitions except those which cannot be split into legal sizes,
1491
         * are split into four for ads_x4() */
1492
0
        else if (assymetricVertical || assymetricHorizontal)
1493
0
            tempPartEnum = smallRectPartition ? partEnum : partitionFromSizes(w >> 1, h >> 1);
1494
        /* General case: Square partitions. All partitions with width > 8 are split into four
1495
         * for ads_x4(), for 4x4 and 8x8 we do ads_x1() */
1496
0
        else
1497
0
            tempPartEnum = (w <= 8) ? partEnum : partitionFromSizes(w >> 1, h >> 1);
1498
1499
        /* Successive elimination by comparing DC before a full SAD,
1500
         * because sum(abs(diff)) >= abs(diff(sum)). */
1501
0
        primitives.pu[tempPartEnum].sad_x4(zero,
1502
0
                         fenc,
1503
0
                         fenc + deltaX,
1504
0
                         fenc + deltaY * FENC_STRIDE,
1505
0
                         fenc + deltaX + deltaY * FENC_STRIDE,
1506
0
                         FENC_STRIDE,
1507
0
                         encDC);
1508
1509
        /* Assigning appropriate integral plane */
1510
0
        uint32_t *sumsBase = NULL;
1511
0
        switch (deltaX)
1512
0
        {
1513
0
            case 32: if (deltaY % 24 == 0)
1514
0
                         sumsBase = integral[1];
1515
0
                     else if (deltaY == 8)
1516
0
                         sumsBase = integral[2];
1517
0
                     else
1518
0
                         sumsBase = integral[0];
1519
0
               break;
1520
0
            case 24: sumsBase = integral[3];
1521
0
               break;
1522
0
            case 16: if (deltaY % 12 == 0)
1523
0
                         sumsBase = integral[5];
1524
0
                     else if (deltaY == 4)
1525
0
                         sumsBase = integral[6];
1526
0
                     else
1527
0
                         sumsBase = integral[4];
1528
0
               break;
1529
0
            case 12: sumsBase = integral[7];
1530
0
                break;
1531
0
            case 8: if (deltaY == 32)
1532
0
                        sumsBase = integral[8];
1533
0
                    else
1534
0
                        sumsBase = integral[9];
1535
0
                break;
1536
0
            case 4: if (deltaY == 16)
1537
0
                        sumsBase = integral[10];
1538
0
                    else
1539
0
                        sumsBase = integral[11];
1540
0
                break;
1541
0
            default: sumsBase = integral[11];
1542
0
                break;
1543
0
        }
1544
1545
0
        if (partEnum == LUMA_64x64 || partEnum == LUMA_32x32 || partEnum == LUMA_16x16 ||
1546
0
            partEnum == LUMA_32x64 || partEnum == LUMA_16x32 || partEnum == LUMA_8x16 ||
1547
0
            partEnum == LUMA_4x8 || partEnum == LUMA_12x16 || partEnum == LUMA_4x16 ||
1548
0
            partEnum == LUMA_24x32 || partEnum == LUMA_8x32 || partEnum == LUMA_48x64 ||
1549
0
            partEnum == LUMA_16x64)
1550
0
            deltaY *= (int)stride;
1551
1552
0
        if (verticalRect)
1553
0
            encDC[1] = encDC[2];
1554
1555
0
        if (horizontalRect)
1556
0
            deltaY = deltaX;
1557
1558
        /* ADS and SAD */
1559
0
        MV tmv;
1560
0
        for (tmv.y = minY; tmv.y <= maxY; tmv.y++)
1561
0
        {
1562
0
            int i, xn;
1563
0
            int ycost = p_cost_mvy[tmv.y] << 2;
1564
0
            if (bcost <= ycost)
1565
0
                continue;
1566
0
            bcost -= ycost;
1567
1568
            /* ADS_4 for 16x16, 32x32, 64x64, 24x32, 32x24, 48x64, 64x48, 32x8, 8x32, 64x16, 16x64 partitions
1569
             * ADS_1 for 4x4, 8x8, 16x4, 4x16, 16x12, 12x16 partitions
1570
             * ADS_2 for all other rectangular partitions */
1571
0
            xn = ads(encDC,
1572
0
                    sumsBase + minX + tmv.y * stride,
1573
0
                    deltaY,
1574
0
                    fpelCostMvX + minX,
1575
0
                    meScratchBuffer,
1576
0
                    meRangeWidth,
1577
0
                    bcost);
1578
1579
0
            for (i = 0; i < xn - 2; i += 3)
1580
0
                COST_MV_X3_ABS(minX + meScratchBuffer[i], tmv.y,
1581
0
                             minX + meScratchBuffer[i + 1], tmv.y,
1582
0
                             minX + meScratchBuffer[i + 2], tmv.y);
1583
1584
0
            bcost += ycost;
1585
0
            for (; i < xn; i++)
1586
0
                COST_MV(minX + meScratchBuffer[i], tmv.y);
1587
0
        }
1588
0
        if (meScratchBuffer)
1589
0
            x265_free(meScratchBuffer);
1590
0
        break;
1591
0
    }
1592
1593
0
    case X265_FULL_SEARCH:
1594
0
    {
1595
        // dead slow exhaustive search, but at least it uses sad_x4()
1596
0
        MV tmv;
1597
0
        int32_t mvmin_y = mvmin.y, mvmin_x = mvmin.x, mvmax_y = mvmax.y, mvmax_x = mvmax.x;
1598
0
        if (ref->isHMELowres)
1599
0
        {
1600
0
            merange = (merange < 0 ? -merange : merange);
1601
0
            mvmin_y = X265_MAX(mvmin.y, -merange);
1602
0
            mvmin_x = X265_MAX(mvmin.x, -merange);
1603
0
            mvmax_y = X265_MIN(mvmax.y, merange);
1604
0
            mvmax_x = X265_MIN(mvmax.x, merange);
1605
0
        }
1606
0
        for (tmv.y = mvmin_y; tmv.y <= mvmax_y; tmv.y++)
1607
0
        {
1608
0
            for (tmv.x = mvmin_x; tmv.x <= mvmax_x; tmv.x++)
1609
0
            {
1610
0
                if (tmv.x + 3 <= mvmax_x)
1611
0
                {
1612
0
                    pixel *pix_base = fref + tmv.y * stride + tmv.x;
1613
0
                    sad_x4(fenc,
1614
0
                           pix_base,
1615
0
                           pix_base + 1,
1616
0
                           pix_base + 2,
1617
0
                           pix_base + 3,
1618
0
                           stride, costs);
1619
0
                    costs[0] += mvcost(tmv << 2);
1620
0
                    COPY2_IF_LT(bcost, costs[0], bmv, tmv);
1621
0
                    tmv.x++;
1622
0
                    costs[1] += mvcost(tmv << 2);
1623
0
                    COPY2_IF_LT(bcost, costs[1], bmv, tmv);
1624
0
                    tmv.x++;
1625
0
                    costs[2] += mvcost(tmv << 2);
1626
0
                    COPY2_IF_LT(bcost, costs[2], bmv, tmv);
1627
0
                    tmv.x++;
1628
0
                    costs[3] += mvcost(tmv << 2);
1629
0
                    COPY2_IF_LT(bcost, costs[3], bmv, tmv);
1630
0
                }
1631
0
                else
1632
0
                    COST_MV(tmv.x, tmv.y);
1633
0
            }
1634
0
        }
1635
1636
0
        break;
1637
0
    }
1638
1639
0
    default:
1640
0
        X265_CHECK(0, "invalid motion estimate mode\n");
1641
0
        break;
1642
0
    }
1643
1644
0
    if (bprecost < bcost)
1645
0
    {
1646
0
        bmv = bestpre;
1647
0
        bcost = bprecost;
1648
0
    }
1649
0
    else
1650
0
        bmv = bmv.toQPel(); // promote search bmv to qpel
1651
1652
0
    const SubpelWorkload& wl = workload[this->subpelRefine];
1653
1654
    // check mv range for slice bound
1655
0
    if ((maxSlices > 1) & ((bmv.y < qmvmin.y) | (bmv.y > qmvmax.y)))
1656
0
    {
1657
0
        bmv.y = x265_min(x265_max(bmv.y, qmvmin.y), qmvmax.y);
1658
0
        bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
1659
0
    }
1660
1661
0
    if (!bcost)
1662
0
    {
1663
        /* if there was zero residual at the clipped MVP, we can skip subpel
1664
         * refine, but we do need to include the mvcost in the returned cost */
1665
0
        bcost = mvcost(bmv);
1666
0
    }
1667
0
    else if (ref->isLowres)
1668
0
    {
1669
0
        int bdir = 0;
1670
0
        for (int i = 1; i <= wl.hpel_dirs; i++)
1671
0
        {
1672
0
            MV qmv = bmv + square1[i] * 2;
1673
1674
            /* skip invalid range */
1675
0
            if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
1676
0
                continue;
1677
1678
0
            int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad, hme) + mvcost(qmv);
1679
0
            COPY2_IF_LT(bcost, cost, bdir, i);
1680
0
        }
1681
1682
0
        bmv += square1[bdir] * 2;
1683
0
        bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd, hme) + mvcost(bmv);
1684
1685
0
        bdir = 0;
1686
0
        for (int i = 1; i <= wl.qpel_dirs; i++)
1687
0
        {
1688
0
            MV qmv = bmv + square1[i];
1689
1690
            /* skip invalid range */
1691
0
            if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
1692
0
                continue;
1693
1694
0
            int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd, hme) + mvcost(qmv);
1695
0
            COPY2_IF_LT(bcost, cost, bdir, i);
1696
0
        }
1697
1698
0
        bmv += square1[bdir];
1699
0
    }
1700
0
    else
1701
0
    {
1702
0
        pixelcmp_t hpelcomp;
1703
1704
0
        if (wl.hpel_satd)
1705
0
        {
1706
0
            bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
1707
0
            hpelcomp = satd;
1708
0
        }
1709
0
        else
1710
0
            hpelcomp = sad;
1711
1712
0
        for (int iter = 0; iter < wl.hpel_iters; iter++)
1713
0
        {
1714
0
            int bdir = 0;
1715
0
            for (int i = 1; i <= wl.hpel_dirs; i++)
1716
0
            {
1717
0
                MV qmv = bmv + square1[i] * 2;
1718
1719
                // check mv range for slice bound
1720
0
                if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
1721
0
                    continue;
1722
1723
0
                int cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);
1724
0
                COPY2_IF_LT(bcost, cost, bdir, i);
1725
0
            }
1726
1727
0
            if (bdir)
1728
0
                bmv += square1[bdir] * 2;
1729
0
            else
1730
0
                break;
1731
0
        }
1732
1733
        /* if HPEL search used SAD, remeasure with SATD before QPEL */
1734
0
        if (!wl.hpel_satd)
1735
0
            bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
1736
1737
0
        for (int iter = 0; iter < wl.qpel_iters; iter++)
1738
0
        {
1739
0
            int bdir = 0;
1740
0
            for (int i = 1; i <= wl.qpel_dirs; i++)
1741
0
            {
1742
0
                MV qmv = bmv + square1[i];
1743
1744
                // check mv range for slice bound
1745
0
                if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
1746
0
                    continue;
1747
1748
0
                int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
1749
0
                COPY2_IF_LT(bcost, cost, bdir, i);
1750
0
            }
1751
1752
0
            if (bdir)
1753
0
                bmv += square1[bdir];
1754
0
            else
1755
0
                break;
1756
0
        }
1757
0
    }
1758
1759
    // check mv range for slice bound
1760
0
    X265_CHECK(((bmv.y >= qmvmin.y) & (bmv.y <= qmvmax.y)), "mv beyond range!");
1761
1762
    // Get a chance to ZeroMv
1763
0
    if (bmv.notZero())
1764
0
    {
1765
0
      int cost = subpelCompare(ref, MV(0, 0), satd) + mvcost(MV(0, 0));
1766
0
      if (cost <= bcost)
1767
0
        bmv = MV(0, 0);
1768
0
    }
1769
1770
0
    x265_emms();
1771
0
    outQMv = bmv;
1772
0
    return bcost;
1773
0
}
1774
1775
/* Measure the distortion of the current PU against the reference block that
 * the quarter-pel motion vector qmv points at.
 *
 * ref - reference planes to read luma (and optionally chroma) pixels from
 * qmv - candidate motion vector; the low two bits of each component are the
 *       fractional phase, the remaining bits the full-pel displacement
 * cmp - pixel comparison primitive used for the luma plane
 *
 * Returns the luma cost from cmp, plus the chromaSatd() cost of both chroma
 * planes when bChromaSATD is set. No MV cost is added here. */
int MotionEstimate::subpelCompare(ReferencePlanes *ref, const MV& qmv, pixelcmp_t cmp)
{
    const intptr_t encStrideY = FENC_STRIDE;
    X265_CHECK(fencPUYuv.m_size == FENC_STRIDE, "fenc buffer is assumed to have FENC_STRIDE by sad_x3 and sad_x4\n");

    intptr_t lumaStride = ref->lumaStride;
    const pixel* lumaRef = ref->fpelPlane[0] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride;
    const int lumaFracX = qmv.x & 0x3;
    const int lumaFracY = qmv.y & 0x3;

    /* scratch area for interpolated predictions, shared by luma and chroma */
    ALIGN_VAR_32(pixel, subpelbuf[MAX_CU_SIZE * MAX_CU_SIZE]);

    /* By default compare straight against the full-pel reference pixels; a
     * fractional MV redirects the comparison to the interpolated block */
    const pixel* lumaPred = lumaRef;
    intptr_t lumaPredStride = lumaStride;

    if (lumaFracX | lumaFracY)
    {
        /* we are taking a short-cut here if the reference is weighted. To be
         * accurate we should be interpolating unweighted pixels and weighting
         * the final 16bit values prior to rounding and down shifting. Instead we
         * are simply interpolating the weighted full-pel pixels. Not 100%
         * accurate but good enough for fast qpel ME */
        if (!lumaFracY)
            primitives.pu[partEnum].luma_hpp(lumaRef, lumaStride, subpelbuf, blockwidth, lumaFracX);
        else if (!lumaFracX)
            primitives.pu[partEnum].luma_vpp(lumaRef, lumaStride, subpelbuf, blockwidth, lumaFracY);
        else
            primitives.pu[partEnum].luma_hvpp(lumaRef, lumaStride, subpelbuf, blockwidth, lumaFracX, lumaFracY);

        lumaPred = subpelbuf;
        lumaPredStride = blockwidth;
    }

    int cost = cmp(fencPUYuv.m_buf[0], encStrideY, lumaPred, lumaPredStride);

    /* luma-only measurement requested, nothing more to add */
    if (!bChromaSATD)
        return cost;

    int csp    = fencPUYuv.m_csp;
    int hshift = fencPUYuv.m_hChromaShift;
    int vshift = fencPUYuv.m_vChromaShift;

    /* rescale the MV for the chroma planes: after this shift the low three
     * bits are the fractional phase and the rest the full-pel displacement */
    int chromaMvX = qmv.x << (1 - hshift);
    int chromaMvY = qmv.y << (1 - vshift);
    intptr_t encStrideC = fencPUYuv.m_csize;

    intptr_t chromaStride = ref->reconPic->m_strideC;
    intptr_t chromaOffset = (chromaMvX >> 3) + (chromaMvY >> 3) * chromaStride;

    const pixel* cbRef = ref->getCbAddr(ctuAddr, absPartIdx) + chromaOffset;
    const pixel* crRef = ref->getCrAddr(ctuAddr, absPartIdx) + chromaOffset;

    X265_CHECK((hshift == 0) || (hshift == 1), "hshift must be 0 or 1\n");
    X265_CHECK((vshift == 0) || (vshift == 1), "vshift must be 0 or 1\n");

    const int chromaFracX = chromaMvX & 7;
    const int chromaFracY = chromaMvY & 7;

    /* Cb is always processed before Cr; index p selects fenc plane p + 1 */
    const pixel* const chromaRef[2] = { cbRef, crRef };

    if (!(chromaFracY | chromaFracX))
    {
        /* full-pel chroma position: compare directly against the reference */
        for (int p = 0; p < 2; p++)
            cost += chromaSatd(fencPUYuv.m_buf[p + 1], encStrideC, chromaRef[p], chromaStride);

        return cost;
    }

    int blockwidthC = blockwidth >> hshift;

    if (!chromaFracY)
    {
        /* horizontal-only fractional phase */
        for (int p = 0; p < 2; p++)
        {
            primitives.chroma[csp].pu[partEnum].filter_hpp(chromaRef[p], chromaStride, subpelbuf, blockwidthC, chromaFracX);
            cost += chromaSatd(fencPUYuv.m_buf[p + 1], encStrideC, subpelbuf, blockwidthC);
        }
    }
    else if (!chromaFracX)
    {
        /* vertical-only fractional phase */
        for (int p = 0; p < 2; p++)
        {
            primitives.chroma[csp].pu[partEnum].filter_vpp(chromaRef[p], chromaStride, subpelbuf, blockwidthC, chromaFracY);
            cost += chromaSatd(fencPUYuv.m_buf[p + 1], encStrideC, subpelbuf, blockwidthC);
        }
    }
    else
    {
        /* both phases fractional: horizontal pass into a 16bit intermediate
         * buffer, then a vertical pass from it into subpelbuf */
        ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA - 1)]);
        const int halfFilterSize = (NTAPS_CHROMA >> 1);

        for (int p = 0; p < 2; p++)
        {
            primitives.chroma[csp].pu[partEnum].filter_hps(chromaRef[p], chromaStride, immed, blockwidthC, chromaFracX, 1);
            primitives.chroma[csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * blockwidthC, blockwidthC, subpelbuf, blockwidthC, chromaFracY);
            cost += chromaSatd(fencPUYuv.m_buf[p + 1], encStrideC, subpelbuf, blockwidthC);
        }
    }

    return cost;
}