Coverage Report

Created: 2026-03-08 06:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/x265/source/encoder/motion.cpp
Line
Count
Source
1
/*****************************************************************************
2
 * Copyright (C) 2013-2020 MulticoreWare, Inc
3
 *
4
 * Authors: Steve Borho <steve@borho.org>
5
 *          Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
 * the Free Software Foundation; either version 2 of the License, or
10
 * (at your option) any later version.
11
 *
12
 * This program is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 * GNU General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU General Public License
18
 * along with this program; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
 *
21
 * This program is also available under a commercial proprietary license.
22
 * For more information, contact us at license @ x265.com.
23
 *****************************************************************************/
24
25
#include "common.h"
26
#include "primitives.h"
27
#include "lowres.h"
28
#include "motion.h"
29
#include "x265.h"
30
31
#if _MSC_VER
32
#pragma warning(disable: 4127) // conditional  expression is constant (macros use this construct)
33
#endif
34
35
using namespace X265_NS;
36
37
namespace {
38
39
struct SubpelWorkload
40
{
41
    int hpel_iters;
42
    int hpel_dirs;
43
    int qpel_iters;
44
    int qpel_dirs;
45
    bool hpel_satd;
46
};
47
48
const SubpelWorkload workload[X265_MAX_SUBPEL_LEVEL + 1] =
49
{
50
    { 1, 4, 0, 4, false }, // 4 SAD HPEL only
51
    { 1, 4, 1, 4, false }, // 4 SAD HPEL + 4 SATD QPEL
52
    { 1, 4, 1, 4, true },  // 4 SATD HPEL + 4 SATD QPEL
53
    { 2, 4, 1, 4, true },  // 2x4 SATD HPEL + 4 SATD QPEL
54
    { 2, 4, 2, 4, true },  // 2x4 SATD HPEL + 2x4 SATD QPEL
55
    { 1, 8, 1, 8, true },  // 8 SATD HPEL + 8 SATD QPEL (default)
56
    { 2, 8, 1, 8, true },  // 2x8 SATD HPEL + 8 SATD QPEL
57
    { 2, 8, 2, 8, true },  // 2x8 SATD HPEL + 2x8 SATD QPEL
58
};
59
60
static int sizeScale[NUM_PU_SIZES];
61
0
#define SAD_THRESH(v) (bcost < (((v >> 4) * sizeScale[partEnum])))
62
63
/* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
64
const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) };
65
const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 };  /* (x-1)%6 */
66
const MV square1[9] = { MV(0, 0), MV(0, -1), MV(0, 1), MV(-1, 0), MV(1, 0), MV(-1, -1), MV(-1, 1), MV(1, -1), MV(1, 1) };
67
const MV hex4[16] =
68
{
69
    MV(0, -4), MV(0, 4), MV(-2, -3), MV(2, -3),
70
    MV(-4, -2), MV(4, -2), MV(-4, -1), MV(4, -1),
71
    MV(-4, 0), MV(4, 0), MV(-4, 1), MV(4, 1),
72
    MV(-4, 2), MV(4, 2), MV(-2, 3), MV(2, 3),
73
};
74
const MV offsets[] =
75
{
76
    MV(-1, 0), MV(0, -1),
77
    MV(-1, -1), MV(1, -1),
78
    MV(-1, 0), MV(1, 0),
79
    MV(-1, 1), MV(-1, -1),
80
    MV(1, -1), MV(1, 1),
81
    MV(-1, 0), MV(0, 1),
82
    MV(-1, 1), MV(1, 1),
83
    MV(1, 0), MV(0, 1),
84
}; // offsets for Two Point Search
85
86
/* sum of absolute differences between MV candidates, used for adaptive ME range */
87
inline int predictorDifference(const MV *mvc, intptr_t numCandidates)
88
0
{
89
0
    int sum = 0;
90
91
0
    for (int i = 0; i < numCandidates - 1; i++)
92
0
    {
93
0
        sum += abs(mvc[i].x - mvc[i + 1].x)
94
0
            +  abs(mvc[i].y - mvc[i + 1].y);
95
0
    }
96
97
0
    return sum;
98
0
}
99
100
}
101
102
MotionEstimate::MotionEstimate()
103
42.8k
{
104
42.8k
    ctuAddr = -1;
105
42.8k
    absPartIdx = -1;
106
42.8k
    searchMethod = X265_HEX_SEARCH;
107
42.8k
    searchMethodL0 = X265_HEX_SEARCH;
108
42.8k
    searchMethodL1 = X265_HEX_SEARCH;
109
42.8k
    subpelRefine = 2;
110
42.8k
    blockwidth = blockheight = 0;
111
42.8k
    blockOffset = 0;
112
42.8k
    bChromaSATD = false;
113
42.8k
    chromaSatd = NULL;
114
557k
    for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
115
514k
        integral[i] = NULL;
116
42.8k
}
117
118
void MotionEstimate::init(int csp)
119
42.8k
{
120
42.8k
    fencPUYuv.create(FENC_STRIDE, csp);
121
42.8k
}
122
123
void MotionEstimate::initScales(void)
124
654
{
125
654
#define SETUP_SCALE(W, H) \
126
16.3k
    sizeScale[LUMA_ ## W ## x ## H] = (H * H) >> 4;
127
654
    SETUP_SCALE(4, 4);
128
654
    SETUP_SCALE(8, 8);
129
654
    SETUP_SCALE(8, 4);
130
654
    SETUP_SCALE(4, 8);
131
654
    SETUP_SCALE(16, 16);
132
654
    SETUP_SCALE(16, 8);
133
654
    SETUP_SCALE(8, 16);
134
654
    SETUP_SCALE(16, 12);
135
654
    SETUP_SCALE(12, 16);
136
654
    SETUP_SCALE(4, 16);
137
654
    SETUP_SCALE(16, 4);
138
654
    SETUP_SCALE(32, 32);
139
654
    SETUP_SCALE(32, 16);
140
654
    SETUP_SCALE(16, 32);
141
654
    SETUP_SCALE(32, 24);
142
654
    SETUP_SCALE(24, 32);
143
654
    SETUP_SCALE(32, 8);
144
654
    SETUP_SCALE(8, 32);
145
654
    SETUP_SCALE(64, 64);
146
654
    SETUP_SCALE(64, 32);
147
654
    SETUP_SCALE(32, 64);
148
654
    SETUP_SCALE(64, 48);
149
654
    SETUP_SCALE(48, 64);
150
654
    SETUP_SCALE(64, 16);
151
654
    SETUP_SCALE(16, 64);
152
654
#undef SETUP_SCALE
153
654
}
154
155
int MotionEstimate::hpelIterationCount(int subme)
156
2.89k
{
157
2.89k
    return workload[subme].hpel_iters +
158
2.89k
           workload[subme].qpel_iters / 2;
159
2.89k
}
160
161
MotionEstimate::~MotionEstimate()
162
42.8k
{
163
42.8k
    fencPUYuv.destroy();
164
42.8k
}
165
166
/* Called by lookahead, luma only, no use of PicYuv */
167
void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int method, const int searchL0, const int searchL1, const int refine)
168
0
{
169
0
    partEnum = partitionFromSizes(pwidth, pheight);
170
0
    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
171
0
    sad = primitives.pu[partEnum].sad;
172
0
    ads = primitives.pu[partEnum].ads;
173
0
    satd = primitives.pu[partEnum].satd;
174
0
    sad_x3 = primitives.pu[partEnum].sad_x3;
175
0
    sad_x4 = primitives.pu[partEnum].sad_x4;
176
177
178
0
    blockwidth = pwidth;
179
0
    blockOffset = offset;
180
0
    absPartIdx = ctuAddr = -1;
181
182
    /* Search params */
183
0
    searchMethod = method;
184
0
    searchMethodL0 = searchL0;
185
0
    searchMethodL1 = searchL1;
186
0
    subpelRefine = refine;
187
188
    /* copy PU block into cache */
189
0
    primitives.pu[partEnum].copy_pp(fencPUYuv.m_buf[0], FENC_STRIDE, fencY + offset, stride);
190
0
    X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
191
0
}
192
193
/* Called by lookahead, luma only, no use of PicYuv */
194
void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int method, const int refine)
195
0
{
196
0
    partEnum = partitionFromSizes(pwidth, pheight);
197
0
    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
198
0
    sad = primitives.pu[partEnum].sad;
199
0
    ads = primitives.pu[partEnum].ads;
200
0
    satd = primitives.pu[partEnum].satd;
201
0
    sad_x3 = primitives.pu[partEnum].sad_x3;
202
0
    sad_x4 = primitives.pu[partEnum].sad_x4;
203
204
205
0
    blockwidth = pwidth;
206
0
    blockOffset = offset;
207
0
    absPartIdx = ctuAddr = -1;
208
209
    /* Search params */
210
0
    searchMethod = method;
211
0
    subpelRefine = refine;
212
213
    /* copy PU block into cache */
214
0
    primitives.pu[partEnum].copy_pp(fencPUYuv.m_buf[0], FENC_STRIDE, fencY + offset, stride);
215
0
    X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
216
0
}
217
218
/* Called by Search::predInterSearch() or --pme equivalent, chroma residual might be considered */
219
void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight, const int method, const int refine, bool bChroma)
220
0
{
221
0
    partEnum = partitionFromSizes(pwidth, pheight);
222
0
    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
223
0
    sad = primitives.pu[partEnum].sad;
224
0
    ads = primitives.pu[partEnum].ads;
225
0
    satd = primitives.pu[partEnum].satd;
226
0
    sad_x3 = primitives.pu[partEnum].sad_x3;
227
0
    sad_x4 = primitives.pu[partEnum].sad_x4;
228
229
0
    chromaSatd = primitives.chroma[fencPUYuv.m_csp].pu[partEnum].satd;
230
231
    /* Set search characteristics */
232
0
    searchMethod = method;
233
0
    subpelRefine = refine;
234
235
    /* Enable chroma residual cost if subpelRefine level is greater than 2 and chroma block size
236
     * is an even multiple of 4x4 pixels (indicated by non-null chromaSatd pointer) */
237
0
    bChromaSATD = subpelRefine > 2 && chromaSatd && (srcFencYuv.m_csp != X265_CSP_I400 && bChroma);
238
0
    X265_CHECK(!(bChromaSATD && !workload[subpelRefine].hpel_satd), "Chroma SATD cannot be used with SAD hpel\n");
239
240
0
    ctuAddr = _ctuAddr;
241
0
    absPartIdx = cuPartIdx + puPartIdx;
242
0
    blockwidth = pwidth;
243
0
    blockOffset = 0;
244
245
    /* copy PU from CU Yuv */
246
0
    fencPUYuv.copyPUFromYuv(srcFencYuv, puPartIdx, partEnum, bChromaSATD);
247
0
}
248
249
#define COST_MV_PT_DIST(mx, my, point, dist) \
250
0
    do \
251
0
    { \
252
0
        MV tmv(mx, my); \
253
0
        int cost = sad(fenc, FENC_STRIDE, fref + mx + my * stride, stride); \
254
0
        cost += mvcost(tmv << 2); \
255
0
        if (cost < bcost) { \
256
0
            bcost = cost; \
257
0
            bmv = tmv; \
258
0
            bPointNr = point; \
259
0
            bDistance = dist; \
260
0
        } \
261
0
    } while (0)
262
263
#define COST_MV(mx, my) \
264
0
    do \
265
0
    { \
266
0
        int cost = sad(fenc, FENC_STRIDE, fref + (mx) + (my) * stride, stride); \
267
0
        cost += mvcost(MV(mx, my) << 2); \
268
0
        COPY2_IF_LT(bcost, cost, bmv, MV(mx, my)); \
269
0
    } while (0)
270
271
#define COST_MV_X3_DIR(m0x, m0y, m1x, m1y, m2x, m2y, costs) \
272
0
    { \
273
0
        pixel *pix_base = fref + bmv.x + bmv.y * stride; \
274
0
        sad_x3(fenc, \
275
0
               pix_base + (m0x) + (m0y) * stride, \
276
0
               pix_base + (m1x) + (m1y) * stride, \
277
0
               pix_base + (m2x) + (m2y) * stride, \
278
0
               stride, costs); \
279
0
        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
280
0
        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
281
0
        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
282
0
    }
283
284
#define COST_MV_PT_DIST_X4(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \
285
0
    { \
286
0
        sad_x4(fenc, \
287
0
               fref + (m0x) + (m0y) * stride, \
288
0
               fref + (m1x) + (m1y) * stride, \
289
0
               fref + (m2x) + (m2y) * stride, \
290
0
               fref + (m3x) + (m3y) * stride, \
291
0
               stride, costs); \
292
0
        (costs)[0] += mvcost(MV(m0x, m0y) << 2); \
293
0
        (costs)[1] += mvcost(MV(m1x, m1y) << 2); \
294
0
        (costs)[2] += mvcost(MV(m2x, m2y) << 2); \
295
0
        (costs)[3] += mvcost(MV(m3x, m3y) << 2); \
296
0
        COPY4_IF_LT(bcost, costs[0], bmv, MV(m0x, m0y), bPointNr, p0, bDistance, d0); \
297
0
        COPY4_IF_LT(bcost, costs[1], bmv, MV(m1x, m1y), bPointNr, p1, bDistance, d1); \
298
0
        COPY4_IF_LT(bcost, costs[2], bmv, MV(m2x, m2y), bPointNr, p2, bDistance, d2); \
299
0
        COPY4_IF_LT(bcost, costs[3], bmv, MV(m3x, m3y), bPointNr, p3, bDistance, d3); \
300
0
    }
301
302
#define COST_MV_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y) \
303
0
    { \
304
0
        pixel *pix_base = fref + omv.x + omv.y * stride; \
305
0
        sad_x4(fenc, \
306
0
               pix_base + (m0x) + (m0y) * stride, \
307
0
               pix_base + (m1x) + (m1y) * stride, \
308
0
               pix_base + (m2x) + (m2y) * stride, \
309
0
               pix_base + (m3x) + (m3y) * stride, \
310
0
               stride, costs); \
311
0
        costs[0] += mvcost((omv + MV(m0x, m0y)) << 2); \
312
0
        costs[1] += mvcost((omv + MV(m1x, m1y)) << 2); \
313
0
        costs[2] += mvcost((omv + MV(m2x, m2y)) << 2); \
314
0
        costs[3] += mvcost((omv + MV(m3x, m3y)) << 2); \
315
0
        if ((omv.y + m0y >= mvmin.y) & (omv.y + m0y <= mvmax.y)) \
316
0
            COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \
317
0
        if ((omv.y + m1y >= mvmin.y) & (omv.y + m1y <= mvmax.y)) \
318
0
            COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \
319
0
        if ((omv.y + m2y >= mvmin.y) & (omv.y + m2y <= mvmax.y)) \
320
0
            COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \
321
0
        if ((omv.y + m3y >= mvmin.y) & (omv.y + m3y <= mvmax.y)) \
322
0
            COPY2_IF_LT(bcost, costs[3], bmv, omv + MV(m3x, m3y)); \
323
0
    }
324
325
0
#define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y )\
326
0
{\
327
0
    sad_x3(fenc, \
328
0
    fref + (m0x) + (m0y) * stride, \
329
0
    fref + (m1x) + (m1y) * stride, \
330
0
    fref + (m2x) + (m2y) * stride, \
331
0
    stride, costs); \
332
0
    costs[0] += p_cost_mvx[(m0x) << 2]; /* no cost_mvy */\
333
0
    costs[1] += p_cost_mvx[(m1x) << 2]; \
334
0
    costs[2] += p_cost_mvx[(m2x) << 2]; \
335
0
    COPY3_IF_LT(bcost, costs[0], bmv.x, m0x, bmv.y, m0y); \
336
0
    COPY3_IF_LT(bcost, costs[1], bmv.x, m1x, bmv.y, m1y); \
337
0
    COPY3_IF_LT(bcost, costs[2], bmv.x, m2x, bmv.y, m2y); \
338
0
}
339
340
#define COST_MV_X4_DIR(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs) \
341
0
    { \
342
0
        pixel *pix_base = fref + bmv.x + bmv.y * stride; \
343
0
        sad_x4(fenc, \
344
0
               pix_base + (m0x) + (m0y) * stride, \
345
0
               pix_base + (m1x) + (m1y) * stride, \
346
0
               pix_base + (m2x) + (m2y) * stride, \
347
0
               pix_base + (m3x) + (m3y) * stride, \
348
0
               stride, costs); \
349
0
        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
350
0
        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
351
0
        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
352
0
        (costs)[3] += mvcost((bmv + MV(m3x, m3y)) << 2); \
353
0
    }
354
355
#define DIA1_ITER(mx, my) \
356
0
    { \
357
0
        omv.x = mx; omv.y = my; \
358
0
        COST_MV_X4(0, -1, 0, 1, -1, 0, 1, 0); \
359
0
    }
360
361
#define CROSS(start, x_max, y_max) \
362
0
    { \
363
0
        int16_t i = start; \
364
0
        if ((x_max) <= X265_MIN(mvmax.x - omv.x, omv.x - mvmin.x)) \
365
0
            for (; i < (x_max) - 2; i += 4) { \
366
0
                COST_MV_X4(i, 0, -i, 0, i + 2, 0, -i - 2, 0); } \
367
0
        for (; i < (x_max); i += 2) \
368
0
        { \
369
0
            if (omv.x + i <= mvmax.x) \
370
0
                COST_MV(omv.x + i, omv.y); \
371
0
            if (omv.x - i >= mvmin.x) \
372
0
                COST_MV(omv.x - i, omv.y); \
373
0
        } \
374
0
        i = start; \
375
0
        if ((y_max) <= X265_MIN(mvmax.y - omv.y, omv.y - mvmin.y)) \
376
0
            for (; i < (y_max) - 2; i += 4) { \
377
0
                COST_MV_X4(0, i, 0, -i, 0, i + 2, 0, -i - 2); } \
378
0
        for (; i < (y_max); i += 2) \
379
0
        { \
380
0
            if (omv.y + i <= mvmax.y) \
381
0
                COST_MV(omv.x, omv.y + i); \
382
0
            if (omv.y - i >= mvmin.y) \
383
0
                COST_MV(omv.x, omv.y - i); \
384
0
        } \
385
0
    }
386
387
void MotionEstimate::StarPatternSearch(ReferencePlanes *ref,
388
                                       const MV &       mvmin,
389
                                       const MV &       mvmax,
390
                                       MV &             bmv,
391
                                       int &            bcost,
392
                                       int &            bPointNr,
393
                                       int &            bDistance,
394
                                       int              earlyExitIters,
395
                                       int              merange,
396
                                       int              hme)
397
0
{
398
0
    ALIGN_VAR_16(int, costs[16]);
399
0
    pixel* fenc = fencPUYuv.m_buf[0];
400
0
    pixel* fref = (hme? ref->fpelLowerResPlane[0] : ref->fpelPlane[0]) + blockOffset;
401
0
    intptr_t stride = hme? ref->lumaStride / 2 : ref->lumaStride;
402
403
0
    MV omv = bmv;
404
0
    int saved = bcost;
405
0
    int rounds = 0;
406
407
0
    {
408
0
        int16_t dist = 1;
409
410
        /* bPointNr
411
              2
412
            4 * 5
413
              7
414
         */
415
0
        const int32_t top    = omv.y - dist;
416
0
        const int32_t bottom = omv.y + dist;
417
0
        const int32_t left   = omv.x - dist;
418
0
        const int32_t right  = omv.x + dist;
419
420
0
        if (top >= mvmin.y && left >= mvmin.x && right <= mvmax.x && bottom <= mvmax.y)
421
0
        {
422
0
            COST_MV_PT_DIST_X4(omv.x,  top,    2, dist,
423
0
                               left,  omv.y,   4, dist,
424
0
                               right, omv.y,   5, dist,
425
0
                               omv.x,  bottom, 7, dist);
426
0
        }
427
0
        else
428
0
        {
429
0
            if (top >= mvmin.y) // check top
430
0
            {
431
0
                COST_MV_PT_DIST(omv.x, top, 2, dist);
432
0
            }
433
0
            if (left >= mvmin.x) // check middle left
434
0
            {
435
0
                COST_MV_PT_DIST(left, omv.y, 4, dist);
436
0
            }
437
0
            if (right <= mvmax.x) // check middle right
438
0
            {
439
0
                COST_MV_PT_DIST(right, omv.y, 5, dist);
440
0
            }
441
0
            if (bottom <= mvmax.y) // check bottom
442
0
            {
443
0
                COST_MV_PT_DIST(omv.x, bottom, 7, dist);
444
0
            }
445
0
        }
446
0
        if (bcost < saved)
447
0
            rounds = 0;
448
0
        else if (++rounds >= earlyExitIters)
449
0
            return;
450
0
    }
451
452
0
    for (int16_t dist = 2; dist <= 8; dist <<= 1)
453
0
    {
454
        /* bPointNr
455
              2
456
             1 3
457
            4 * 5
458
             6 8
459
              7
460
         Points 2, 4, 5, 7 are dist
461
         Points 1, 3, 6, 8 are dist>>1
462
         */
463
0
        const int32_t top     = omv.y - dist;
464
0
        const int32_t bottom  = omv.y + dist;
465
0
        const int32_t left    = omv.x - dist;
466
0
        const int32_t right   = omv.x + dist;
467
0
        const int32_t top2    = omv.y - (dist >> 1);
468
0
        const int32_t bottom2 = omv.y + (dist >> 1);
469
0
        const int32_t left2   = omv.x - (dist >> 1);
470
0
        const int32_t right2  = omv.x + (dist >> 1);
471
0
        saved = bcost;
472
473
0
        if (top >= mvmin.y && left >= mvmin.x &&
474
0
            right <= mvmax.x && bottom <= mvmax.y) // check border
475
0
        {
476
0
            COST_MV_PT_DIST_X4(omv.x,  top,   2, dist,
477
0
                               left2,  top2,  1, dist >> 1,
478
0
                               right2, top2,  3, dist >> 1,
479
0
                               left,   omv.y, 4, dist);
480
0
            COST_MV_PT_DIST_X4(right,  omv.y,   5, dist,
481
0
                               left2,  bottom2, 6, dist >> 1,
482
0
                               right2, bottom2, 8, dist >> 1,
483
0
                               omv.x,  bottom,  7, dist);
484
0
        }
485
0
        else // check border for each mv
486
0
        {
487
0
            if (top >= mvmin.y) // check top
488
0
            {
489
0
                COST_MV_PT_DIST(omv.x, top, 2, dist);
490
0
            }
491
0
            if (top2 >= mvmin.y) // check half top
492
0
            {
493
0
                if (left2 >= mvmin.x) // check half left
494
0
                {
495
0
                    COST_MV_PT_DIST(left2, top2, 1, (dist >> 1));
496
0
                }
497
0
                if (right2 <= mvmax.x) // check half right
498
0
                {
499
0
                    COST_MV_PT_DIST(right2, top2, 3, (dist >> 1));
500
0
                }
501
0
            }
502
0
            if (left >= mvmin.x) // check left
503
0
            {
504
0
                COST_MV_PT_DIST(left, omv.y, 4, dist);
505
0
            }
506
0
            if (right <= mvmax.x) // check right
507
0
            {
508
0
                COST_MV_PT_DIST(right, omv.y, 5, dist);
509
0
            }
510
0
            if (bottom2 <= mvmax.y) // check half bottom
511
0
            {
512
0
                if (left2 >= mvmin.x) // check half left
513
0
                {
514
0
                    COST_MV_PT_DIST(left2, bottom2, 6, (dist >> 1));
515
0
                }
516
0
                if (right2 <= mvmax.x) // check half right
517
0
                {
518
0
                    COST_MV_PT_DIST(right2, bottom2, 8, (dist >> 1));
519
0
                }
520
0
            }
521
0
            if (bottom <= mvmax.y) // check bottom
522
0
            {
523
0
                COST_MV_PT_DIST(omv.x, bottom, 7, dist);
524
0
            }
525
0
        }
526
527
0
        if (bcost < saved)
528
0
            rounds = 0;
529
0
        else if (++rounds >= earlyExitIters)
530
0
            return;
531
0
    }
532
533
0
    for (int16_t dist = 16; dist <= (int16_t)merange; dist <<= 1)
534
0
    {
535
0
        const int32_t top    = omv.y - dist;
536
0
        const int32_t bottom = omv.y + dist;
537
0
        const int32_t left   = omv.x - dist;
538
0
        const int32_t right  = omv.x + dist;
539
540
0
        saved = bcost;
541
0
        if (top >= mvmin.y && left >= mvmin.x &&
542
0
            right <= mvmax.x && bottom <= mvmax.y) // check border
543
0
        {
544
            /* index
545
                  0
546
                  3
547
                  2
548
                  1
549
          0 3 2 1 * 1 2 3 0
550
                  1
551
                  2
552
                  3
553
                  0
554
            */
555
556
0
            COST_MV_PT_DIST_X4(omv.x,  top,    0, dist,
557
0
                               left,   omv.y,  0, dist,
558
0
                               right,  omv.y,  0, dist,
559
0
                               omv.x,  bottom, 0, dist);
560
561
0
            for (int16_t index = 1; index < 4; index++)
562
0
            {
563
0
                int32_t posYT = top    + ((dist >> 2) * index);
564
0
                int32_t posYB = bottom - ((dist >> 2) * index);
565
0
                int32_t posXL = omv.x  - ((dist >> 2) * index);
566
0
                int32_t posXR = omv.x  + ((dist >> 2) * index);
567
568
0
                COST_MV_PT_DIST_X4(posXL, posYT, 0, dist,
569
0
                                   posXR, posYT, 0, dist,
570
0
                                   posXL, posYB, 0, dist,
571
0
                                   posXR, posYB, 0, dist);
572
0
            }
573
0
        }
574
0
        else // check border for each mv
575
0
        {
576
0
            if (top >= mvmin.y) // check top
577
0
            {
578
0
                COST_MV_PT_DIST(omv.x, top, 0, dist);
579
0
            }
580
0
            if (left >= mvmin.x) // check left
581
0
            {
582
0
                COST_MV_PT_DIST(left, omv.y, 0, dist);
583
0
            }
584
0
            if (right <= mvmax.x) // check right
585
0
            {
586
0
                COST_MV_PT_DIST(right, omv.y, 0, dist);
587
0
            }
588
0
            if (bottom <= mvmax.y) // check bottom
589
0
            {
590
0
                COST_MV_PT_DIST(omv.x, bottom, 0, dist);
591
0
            }
592
0
            for (int16_t index = 1; index < 4; index++)
593
0
            {
594
0
                int32_t posYT = top    + ((dist >> 2) * index);
595
0
                int32_t posYB = bottom - ((dist >> 2) * index);
596
0
                int32_t posXL = omv.x - ((dist >> 2) * index);
597
0
                int32_t posXR = omv.x + ((dist >> 2) * index);
598
599
0
                if (posYT >= mvmin.y) // check top
600
0
                {
601
0
                    if (posXL >= mvmin.x) // check left
602
0
                    {
603
0
                        COST_MV_PT_DIST(posXL, posYT, 0, dist);
604
0
                    }
605
0
                    if (posXR <= mvmax.x) // check right
606
0
                    {
607
0
                        COST_MV_PT_DIST(posXR, posYT, 0, dist);
608
0
                    }
609
0
                }
610
0
                if (posYB <= mvmax.y) // check bottom
611
0
                {
612
0
                    if (posXL >= mvmin.x) // check left
613
0
                    {
614
0
                        COST_MV_PT_DIST(posXL, posYB, 0, dist);
615
0
                    }
616
0
                    if (posXR <= mvmax.x) // check right
617
0
                    {
618
0
                        COST_MV_PT_DIST(posXR, posYB, 0, dist);
619
0
                    }
620
0
                }
621
0
            }
622
0
        }
623
624
0
        if (bcost < saved)
625
0
            rounds = 0;
626
0
        else if (++rounds >= earlyExitIters)
627
0
            return;
628
0
    }
629
0
}
630
631
int MotionEstimate::diamondSearch(ReferencePlanes* ref, const MV& mvmin, const MV& mvmax, MV& outMV)
632
0
{
633
0
    int bcost = INT_MAX;
634
0
    MV bmv(0, 0);
635
0
    MV omv = bmv;
636
637
0
    ALIGN_VAR_16(int, costs[16]);
638
639
0
    intptr_t stride = ref->lumaStride;
640
0
    pixel* fenc = fencPUYuv.m_buf[0];
641
0
    pixel* fref = ref->fpelPlane[0] + blockOffset;
642
643
0
    for (int16_t dist = 1; dist <= 4; dist <<= 1)
644
0
    {
645
0
        const int32_t top = omv.y - dist;
646
0
        const int32_t bottom = omv.y + dist;
647
0
        const int32_t left = omv.x - dist;
648
0
        const int32_t right = omv.x + dist;
649
0
        const int32_t top2 = omv.y - (dist >> 1);
650
0
        const int32_t bottom2 = omv.y + (dist >> 1);
651
0
        const int32_t left2 = omv.x - (dist >> 1);
652
0
        const int32_t right2 = omv.x + (dist >> 1);
653
654
0
        if (top >= mvmin.y && left >= mvmin.x && right <= mvmax.x && bottom <= mvmax.y)
655
0
        {
656
0
            COST_MV_X4(omv.x, top, omv.x, bottom, left, omv.y, right, omv.y);
657
0
            COST_MV_X4(left2, top2, right2, top2, left2, bottom2, right2, bottom2);
658
0
        }
659
0
        else // check border for each mv
660
0
        {
661
0
            if (top >= mvmin.y) // check top
662
0
            {
663
0
                COST_MV(omv.x, top);
664
0
            }
665
0
            if (top2 >= mvmin.y) // check half top
666
0
            {
667
0
                if (left2 >= mvmin.x)  // check half left
668
0
                {
669
0
                    COST_MV(left2, top2);
670
0
                }
671
0
                if (right2 <= mvmax.x) // check half right
672
0
                {
673
0
                    COST_MV(right2, top2);
674
0
                }
675
0
            }
676
0
            if (left >= mvmin.x) // check left
677
0
            {
678
0
                COST_MV(left, omv.y);
679
0
            }
680
0
            if (right <= mvmax.x) // check right
681
0
            {
682
0
                COST_MV(right, omv.y);
683
0
            }
684
0
            if (bottom2 <= mvmax.y) // check half bottom
685
0
            {
686
0
                if (left2 >= mvmin.x) // check half left
687
0
                {
688
0
                    COST_MV(left2, bottom2);
689
0
                }
690
0
                if (right2 <= mvmax.x) // check half right
691
0
                {
692
0
                    COST_MV(right2, bottom2);
693
0
                }
694
0
            }
695
0
            if (bottom <= mvmax.y) // check bottom
696
0
            {
697
0
                COST_MV(omv.x, bottom);
698
0
            }
699
0
        }
700
0
    }
701
702
0
    for (int16_t dist = 8; dist <= 64; dist += 8)
703
0
    {
704
0
        const int32_t top = omv.y - dist;
705
0
        const int32_t bottom = omv.y + dist;
706
0
        const int32_t left = omv.x - dist;
707
0
        const int32_t right = omv.x + dist;
708
709
0
        if (top >= mvmin.y && left >= mvmin.x && right <= mvmax.x && bottom <= mvmax.y)
710
0
        {
711
0
            COST_MV_X4(omv.x, top, left, omv.y, right, omv.y, omv.x, bottom);
712
713
0
            for (int16_t index = 1; index < 4; index++)
714
0
            {
715
0
                int32_t posYT = top + ((dist >> 2) * index);
716
0
                int32_t posYB = bottom - ((dist >> 2) * index);
717
0
                int32_t posXL = omv.x - ((dist >> 2) * index);
718
0
                int32_t posXR = omv.x + ((dist >> 2) * index);
719
720
0
                COST_MV_X4(posXL, posYT,
721
0
                    posXR, posYT,
722
0
                    posXL, posYB,
723
0
                    posXR, posYB);
724
0
            }
725
0
        }
726
0
        else // check border for each mv
727
0
        {
728
0
            if (top >= mvmin.y) // check top
729
0
            {
730
0
                COST_MV(omv.x, top);
731
0
            }
732
0
            if (left >= mvmin.x) // check left
733
0
            {
734
0
                COST_MV(left, omv.y);
735
0
            }
736
0
            if (right <= mvmax.x) // check right
737
0
            {
738
0
                COST_MV(right, omv.y);
739
0
            }
740
0
            if (bottom <= mvmax.y) // check bottom
741
0
            {
742
0
                COST_MV(omv.x, bottom);
743
0
            }
744
0
            for (int16_t index = 1; index < 4; index++)
745
0
            {
746
0
                int32_t posYT = top + ((dist >> 2) * index);
747
0
                int32_t posYB = bottom - ((dist >> 2) * index);
748
0
                int32_t posXL = omv.x - ((dist >> 2) * index);
749
0
                int32_t posXR = omv.x + ((dist >> 2) * index);
750
751
0
                if (posYT >= mvmin.y) // check top
752
0
                {
753
0
                    if (posXL >= mvmin.x) // check left
754
0
                    {
755
0
                        COST_MV(posXL, posYT);
756
0
                    }
757
0
                    if (posXR <= mvmax.x) // check right
758
0
                    {
759
0
                        COST_MV(posXR, posYT);
760
0
                    }
761
0
                }
762
0
                if (posYB <= mvmax.y) // check bottom
763
0
                {
764
0
                    if (posXL >= mvmin.x) // check left
765
0
                    {
766
0
                        COST_MV(posXL, posYB);
767
0
                    }
768
0
                    if (posXR <= mvmax.x) // check right
769
0
                    {
770
0
                        COST_MV(posXR, posYB);
771
0
                    }
772
0
                }
773
0
            }
774
0
        }
775
0
    }
776
0
    outMV = bmv;
777
0
    return bcost;
778
0
}
779
780
void MotionEstimate::refineMV(ReferencePlanes* ref,
781
                              const MV&        mvmin,
782
                              const MV&        mvmax,
783
                              const MV&        qmvp,
784
                              MV&              outQMv)
785
0
{
786
0
    ALIGN_VAR_16(int, costs[16]);
787
0
    if (ctuAddr >= 0)
788
0
        blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0);
789
0
    intptr_t stride = ref->lumaStride;
790
0
    pixel* fenc = fencPUYuv.m_buf[0];
791
0
    pixel* fref = ref->fpelPlane[0] + blockOffset;
792
    
793
0
    setMVP(qmvp);
794
    
795
0
    MV qmvmin = mvmin.toQPel();
796
0
    MV qmvmax = mvmax.toQPel();
797
   
798
    /* The term cost used here means satd/sad values for that particular search.
799
     * The costs used in ME integer search only includes the SAD cost of motion
800
     * residual and sqrtLambda times MVD bits.  The subpel refine steps use SATD
801
     * cost of residual and sqrtLambda * MVD bits.
802
    */
803
             
804
    // measure SATD cost at clipped QPEL MVP
805
0
    MV pmv = qmvp.clipped(qmvmin, qmvmax);
806
0
    MV bestpre = pmv;
807
0
    int bprecost;
808
809
0
    bprecost = subpelCompare(ref, pmv, sad);
810
811
    /* re-measure full pel rounded MVP with SAD as search start point */
812
0
    MV bmv = pmv.roundToFPel();
813
0
    int bcost = bprecost;
814
0
    if (pmv.isSubpel())
815
0
        bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2);
816
817
    /* square refine */
818
0
    int dir = 0;
819
0
    COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs);
820
0
    if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
821
0
        COPY2_IF_LT(bcost, costs[0], dir, 1);
822
0
    if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
823
0
        COPY2_IF_LT(bcost, costs[1], dir, 2);
824
0
    COPY2_IF_LT(bcost, costs[2], dir, 3);
825
0
    COPY2_IF_LT(bcost, costs[3], dir, 4);
826
0
    COST_MV_X4_DIR(-1, -1, -1, 1, 1, -1, 1, 1, costs);
827
0
    if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
828
0
        COPY2_IF_LT(bcost, costs[0], dir, 5);
829
0
    if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
830
0
        COPY2_IF_LT(bcost, costs[1], dir, 6);
831
0
    if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
832
0
        COPY2_IF_LT(bcost, costs[2], dir, 7);
833
0
    if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
834
0
        COPY2_IF_LT(bcost, costs[3], dir, 8);
835
0
    bmv += square1[dir];
836
837
0
    if (bprecost < bcost)
838
0
    {
839
0
        bmv = bestpre;
840
0
        bcost = bprecost;
841
0
    }
842
0
    else
843
0
        bmv = bmv.toQPel(); // promote search bmv to qpel
844
845
    // TO DO: Change SubpelWorkload to fine tune MV
846
    // Now it is set to 5 for experiment.
847
    // const SubpelWorkload& wl = workload[this->subpelRefine];
848
0
    const SubpelWorkload& wl = workload[5];
849
850
0
    pixelcmp_t hpelcomp;
851
852
0
    if (wl.hpel_satd)
853
0
    {
854
0
        bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
855
0
        hpelcomp = satd;
856
0
    }
857
0
    else
858
0
        hpelcomp = sad;
859
860
0
    for (int iter = 0; iter < wl.hpel_iters; iter++)
861
0
    {
862
0
        int bdir = 0;
863
0
        for (int i = 1; i <= wl.hpel_dirs; i++)
864
0
        {
865
0
            MV qmv = bmv + square1[i] * 2;            
866
867
            // check mv range for slice bound
868
0
            if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
869
0
                continue;
870
871
0
            int cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);
872
0
            COPY2_IF_LT(bcost, cost, bdir, i);
873
0
        }
874
875
0
        if (bdir)
876
0
            bmv += square1[bdir] * 2;            
877
0
        else
878
0
            break;
879
0
    }
880
881
    /* if HPEL search used SAD, remeasure with SATD before QPEL */
882
0
    if (!wl.hpel_satd)
883
0
        bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
884
885
0
    for (int iter = 0; iter < wl.qpel_iters; iter++)
886
0
    {
887
0
        int bdir = 0;
888
0
        for (int i = 1; i <= wl.qpel_dirs; i++)
889
0
        {
890
0
            MV qmv = bmv + square1[i];
891
            
892
            // check mv range for slice bound
893
0
            if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
894
0
                continue;
895
896
0
            int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
897
0
            COPY2_IF_LT(bcost, cost, bdir, i);
898
0
        }
899
900
0
        if (bdir)
901
0
            bmv += square1[bdir];
902
0
        else
903
0
            break;
904
0
    }
905
906
    // check mv range for slice bound
907
0
    X265_CHECK(((pmv.y >= qmvmin.y) & (pmv.y <= qmvmax.y)), "mv beyond range!");
908
    
909
0
    x265_emms();
910
0
    outQMv = bmv;
911
0
}
912
913
int MotionEstimate::motionEstimate(ReferencePlanes *ref,
914
                                   const MV &       mvmin,
915
                                   const MV &       mvmax,
916
                                   const MV &       qmvp,
917
                                   int              numCandidates,
918
                                   const MV *       mvc,
919
                                   int              merange,
920
                                   MV &             outQMv,
921
                                   uint32_t         maxSlices,
922
                                    bool            m_vertRestriction,
923
                                   pixel *          srcReferencePlane)
924
0
{
925
0
    ALIGN_VAR_16(int, costs[16]);
926
0
    bool hme = srcReferencePlane && srcReferencePlane == ref->fpelLowerResPlane[0];
927
0
    if (ctuAddr >= 0)
928
0
        blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0);
929
0
    intptr_t stride = hme ? ref->lumaStride / 2 : ref->lumaStride;
930
0
    pixel* fenc = fencPUYuv.m_buf[0];
931
0
    pixel* fref = srcReferencePlane == 0 ? ref->fpelPlane[0] + blockOffset : srcReferencePlane + blockOffset;
932
933
0
    setMVP(qmvp);
934
935
0
    MV qmvmin = mvmin.toQPel();
936
0
    MV qmvmax = mvmax.toQPel();
937
938
    /* The term cost used here means satd/sad values for that particular search.
939
     * The costs used in ME integer search only includes the SAD cost of motion
940
     * residual and sqrtLambda times MVD bits.  The subpel refine steps use SATD
941
     * cost of residual and sqrtLambda * MVD bits.  Mode decision will be based
942
     * on video distortion cost (SSE/PSNR) plus lambda times all signaling bits
943
     * (mode + MVD bits). */
944
945
    // measure SAD cost at clipped QPEL MVP
946
0
    MV pmv = qmvp.clipped(qmvmin, qmvmax);
947
0
    if (m_vertRestriction)
948
0
    {
949
0
        if (pmv.y > mvmax.y << 2)
950
0
        {
951
0
            pmv.y = (mvmax.y << 2);
952
0
        }
953
0
    }
954
0
    MV bestpre = pmv;
955
0
    int bprecost;
956
957
0
    if (ref->isLowres)
958
0
        bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad, hme);
959
0
    else
960
0
        bprecost = subpelCompare(ref, pmv, sad);
961
962
    /* re-measure full pel rounded MVP with SAD as search start point */
963
0
    MV bmv = pmv.roundToFPel();
964
0
    int bcost = bprecost;
965
0
    if (pmv.isSubpel())
966
0
        bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2);
967
968
    // measure SAD cost at MV(0) if MVP is not zero
969
0
    if (pmv.notZero())
970
0
    {
971
0
        int cost = sad(fenc, FENC_STRIDE, fref, stride) + mvcost(MV(0, 0));
972
0
        if (cost < bcost)
973
0
        {
974
0
            bcost = cost;
975
0
            bmv = 0;
976
0
            bmv.y = X265_MAX(X265_MIN(0, mvmax.y), mvmin.y);
977
0
        }
978
0
    }
979
980
0
    X265_CHECK(!(ref->isLowres && numCandidates), "lowres motion candidates not allowed\n")
981
    // measure SAD cost at each QPEL motion vector candidate
982
0
    for (int i = 0; i < numCandidates; i++)
983
0
    {
984
0
        MV m = mvc[i].clipped(qmvmin, qmvmax);
985
0
        if (m.notZero() & (m != pmv ? 1 : 0) & (m != bestpre ? 1 : 0)) // check already measured
986
0
        {
987
0
            int cost = subpelCompare(ref, m, sad) + mvcost(m);
988
0
            if (cost < bprecost)
989
0
            {
990
0
                bprecost = cost;
991
0
                bestpre = m;
992
0
            }
993
0
        }
994
0
    }
995
996
0
    pmv = pmv.roundToFPel();
997
0
    MV omv = bmv;  // current search origin or starting point
998
999
0
    int search = ref->isHMELowres ? (hme ? searchMethodL0 : searchMethodL1) : searchMethod;
1000
0
    switch (search)
1001
0
    {
1002
0
    case X265_DIA_SEARCH:
1003
0
    {
1004
        /* diamond search, radius 1 */
1005
0
        bcost <<= 4;
1006
0
        int i = merange;
1007
0
        do
1008
0
        {
1009
0
            COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs);
1010
0
            if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
1011
0
                COPY1_IF_LT(bcost, (costs[0] << 4) + 1);
1012
0
            if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
1013
0
                COPY1_IF_LT(bcost, (costs[1] << 4) + 3);
1014
0
            COPY1_IF_LT(bcost, (costs[2] << 4) + 4);
1015
0
            COPY1_IF_LT(bcost, (costs[3] << 4) + 12);
1016
0
            if (!(bcost & 15))
1017
0
                break;
1018
0
            bmv.x -= (bcost << 28) >> 30;
1019
0
            bmv.y -= (bcost << 30) >> 30;
1020
0
            bcost &= ~15;
1021
0
        }
1022
0
        while (--i && bmv.checkRange(mvmin, mvmax));
1023
0
        bcost >>= 4;
1024
0
        break;
1025
0
    }
1026
1027
0
    case X265_HEX_SEARCH:
1028
0
    {
1029
0
me_hex2:
1030
        /* hexagon search, radius 2 */
1031
#if 0
1032
        for (int i = 0; i < merange / 2; i++)
1033
        {
1034
            omv = bmv;
1035
            COST_MV(omv.x - 2, omv.y);
1036
            COST_MV(omv.x - 1, omv.y + 2);
1037
            COST_MV(omv.x + 1, omv.y + 2);
1038
            COST_MV(omv.x + 2, omv.y);
1039
            COST_MV(omv.x + 1, omv.y - 2);
1040
            COST_MV(omv.x - 1, omv.y - 2);
1041
            if (omv == bmv)
1042
                break;
1043
            if (!bmv.checkRange(mvmin, mvmax))
1044
                break;
1045
        }
1046
1047
#else // if 0
1048
      /* equivalent to the above, but eliminates duplicate candidates */
1049
0
        COST_MV_X3_DIR(-2, 0, -1, 2,  1, 2, costs);
1050
0
        bcost <<= 3;
1051
0
        if ((bmv.y >= mvmin.y) & (bmv.y <= mvmax.y))
1052
0
            COPY1_IF_LT(bcost, (costs[0] << 3) + 2);
1053
0
        if ((bmv.y + 2 >= mvmin.y) & (bmv.y + 2 <= mvmax.y))
1054
0
        {
1055
0
            COPY1_IF_LT(bcost, (costs[1] << 3) + 3);
1056
0
            COPY1_IF_LT(bcost, (costs[2] << 3) + 4);
1057
0
        }
1058
1059
0
        COST_MV_X3_DIR(2, 0,  1, -2, -1, -2, costs);
1060
0
        if ((bmv.y >= mvmin.y) & (bmv.y <= mvmax.y))
1061
0
            COPY1_IF_LT(bcost, (costs[0] << 3) + 5);
1062
0
        if ((bmv.y - 2 >= mvmin.y) & (bmv.y - 2 <= mvmax.y))
1063
0
        {
1064
0
            COPY1_IF_LT(bcost, (costs[1] << 3) + 6);
1065
0
            COPY1_IF_LT(bcost, (costs[2] << 3) + 7);
1066
0
        }
1067
1068
0
        if (bcost & 7)
1069
0
        {
1070
0
            int dir = (bcost & 7) - 2;
1071
1072
0
            if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y))
1073
0
            {
1074
0
                bmv += hex2[dir + 1];
1075
1076
                /* half hexagon, not overlapping the previous iteration */
1077
0
                for (int i = (merange >> 1) - 1; i > 0 && bmv.checkRange(mvmin, mvmax); i--)
1078
0
                {
1079
0
                    COST_MV_X3_DIR(hex2[dir + 0].x, hex2[dir + 0].y,
1080
0
                        hex2[dir + 1].x, hex2[dir + 1].y,
1081
0
                        hex2[dir + 2].x, hex2[dir + 2].y,
1082
0
                        costs);
1083
0
                    bcost &= ~7;
1084
1085
0
                    if ((bmv.y + hex2[dir + 0].y >= mvmin.y) & (bmv.y + hex2[dir + 0].y <= mvmax.y))
1086
0
                        COPY1_IF_LT(bcost, (costs[0] << 3) + 1);
1087
1088
0
                    if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y))
1089
0
                        COPY1_IF_LT(bcost, (costs[1] << 3) + 2);
1090
1091
0
                    if ((bmv.y + hex2[dir + 2].y >= mvmin.y) & (bmv.y + hex2[dir + 2].y <= mvmax.y))
1092
0
                        COPY1_IF_LT(bcost, (costs[2] << 3) + 3);
1093
1094
0
                    if (!(bcost & 7))
1095
0
                        break;
1096
1097
0
                    dir += (bcost & 7) - 2;
1098
0
                    dir = mod6m1[dir + 1];
1099
0
                    bmv += hex2[dir + 1];
1100
0
                }
1101
0
            } // if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y))
1102
0
        }
1103
0
        bcost >>= 3;
1104
0
#endif // if 0
1105
1106
        /* square refine */
1107
0
        int dir = 0;
1108
0
        COST_MV_X4_DIR(0, -1,  0, 1, -1, 0, 1, 0, costs);
1109
0
        if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
1110
0
            COPY2_IF_LT(bcost, costs[0], dir, 1);
1111
0
        if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
1112
0
            COPY2_IF_LT(bcost, costs[1], dir, 2);
1113
0
        COPY2_IF_LT(bcost, costs[2], dir, 3);
1114
0
        COPY2_IF_LT(bcost, costs[3], dir, 4);
1115
0
        COST_MV_X4_DIR(-1, -1, -1, 1, 1, -1, 1, 1, costs);
1116
0
        if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
1117
0
            COPY2_IF_LT(bcost, costs[0], dir, 5);
1118
0
        if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
1119
0
            COPY2_IF_LT(bcost, costs[1], dir, 6);
1120
0
        if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
1121
0
            COPY2_IF_LT(bcost, costs[2], dir, 7);
1122
0
        if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
1123
0
            COPY2_IF_LT(bcost, costs[3], dir, 8);
1124
0
        bmv += square1[dir];
1125
0
        break;
1126
0
    }
1127
1128
0
    case X265_UMH_SEARCH:
1129
0
    {
1130
0
        int ucost1, ucost2;
1131
0
        int16_t cross_start = 1;
1132
1133
        /* refine predictors */
1134
0
        omv = bmv;
1135
0
        ucost1 = bcost;
1136
0
        X265_CHECK(((pmv.y >= mvmin.y) & (pmv.y <= mvmax.y)), "pmv outside of search range!");
1137
0
        DIA1_ITER(pmv.x, pmv.y);
1138
0
        if (pmv.notZero())
1139
0
            DIA1_ITER(0, 0);
1140
1141
0
        ucost2 = bcost;
1142
0
        if (bmv.notZero() && bmv != pmv)
1143
0
            DIA1_ITER(bmv.x, bmv.y);
1144
0
        if (bcost == ucost2)
1145
0
            cross_start = 3;
1146
1147
        /* Early Termination */
1148
0
        omv = bmv;
1149
0
        if (bcost == ucost2 && SAD_THRESH(2000))
1150
0
        {
1151
0
            COST_MV_X4(0, -2, -1, -1, 1, -1, -2, 0);
1152
0
            COST_MV_X4(2, 0, -1, 1, 1, 1,  0, 2);
1153
0
            if (bcost == ucost1 && SAD_THRESH(500))
1154
0
                break;
1155
0
            if (bcost == ucost2)
1156
0
            {
1157
0
                int16_t range = (int16_t)(merange >> 1) | 1;
1158
0
                CROSS(3, range, range);
1159
0
                COST_MV_X4(-1, -2, 1, -2, -2, -1, 2, -1);
1160
0
                COST_MV_X4(-2, 1, 2, 1, -1, 2, 1, 2);
1161
0
                if (bcost == ucost2)
1162
0
                    break;
1163
0
                cross_start = range + 2;
1164
0
            }
1165
0
        }
1166
1167
        // TODO: Need to study x264's logic for building mvc list to understand why they
1168
        //       have special cases here for 16x16, and whether they apply to HEVC CTU
1169
1170
        // adaptive search range based on mvc variability
1171
0
        if (numCandidates)
1172
0
        {
1173
            /* range multipliers based on casual inspection of some statistics of
1174
             * average distance between current predictor and final mv found by ESA.
1175
             * these have not been tuned much by actual encoding. */
1176
0
            static const uint8_t range_mul[4][4] =
1177
0
            {
1178
0
                { 3, 3, 4, 4 },
1179
0
                { 3, 4, 4, 4 },
1180
0
                { 4, 4, 4, 5 },
1181
0
                { 4, 4, 5, 6 },
1182
0
            };
1183
1184
0
            int mvd;
1185
0
            int sad_ctx, mvd_ctx;
1186
0
            int denom = 1;
1187
1188
0
            if (numCandidates == 1)
1189
0
            {
1190
0
                if (LUMA_64x64 == partEnum)
1191
                    /* mvc is probably the same as mvp, so the difference isn't meaningful.
1192
                     * but prediction usually isn't too bad, so just use medium range */
1193
0
                    mvd = 25;
1194
0
                else
1195
0
                    mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);
1196
0
            }
1197
0
            else
1198
0
            {
1199
                /* calculate the degree of agreement between predictors. */
1200
1201
                /* in 64x64, mvc includes all the neighbors used to make mvp,
1202
                 * so don't count mvp separately. */
1203
1204
0
                denom = numCandidates - 1;
1205
0
                mvd = 0;
1206
0
                if (partEnum != LUMA_64x64)
1207
0
                {
1208
0
                    mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);
1209
0
                    denom++;
1210
0
                }
1211
0
                mvd += predictorDifference(mvc, numCandidates);
1212
0
            }
1213
1214
0
            sad_ctx = SAD_THRESH(1000) ? 0
1215
0
                : SAD_THRESH(2000) ? 1
1216
0
                : SAD_THRESH(4000) ? 2 : 3;
1217
0
            mvd_ctx = mvd < 10 * denom ? 0
1218
0
                : mvd < 20 * denom ? 1
1219
0
                : mvd < 40 * denom ? 2 : 3;
1220
1221
0
            merange = (merange * range_mul[mvd_ctx][sad_ctx]) >> 2;
1222
0
        }
1223
1224
        /* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy.
1225
         * we are still centered on the same place as the DIA2. is this desirable? */
1226
0
        CROSS(cross_start, merange, merange >> 1);
1227
0
        COST_MV_X4(-2, -2, -2, 2, 2, -2, 2, 2);
1228
1229
        /* hexagon grid */
1230
0
        omv = bmv;
1231
0
        const uint16_t *p_cost_omvx = m_cost_mvx + omv.x * 4;
1232
0
        const uint16_t *p_cost_omvy = m_cost_mvy + omv.y * 4;
1233
0
        uint16_t i = 1;
1234
0
        do
1235
0
        {
1236
0
            if (4 * i > X265_MIN4(mvmax.x - omv.x, omv.x - mvmin.x,
1237
0
                                  mvmax.y - omv.y, omv.y - mvmin.y))
1238
0
            {
1239
0
                for (int j = 0; j < 16; j++)
1240
0
                {
1241
0
                    MV mv = omv + (hex4[j] * i);
1242
0
                    if (mv.checkRange(mvmin, mvmax))
1243
0
                        COST_MV(mv.x, mv.y);
1244
0
                }
1245
0
            }
1246
0
            else
1247
0
            {
1248
0
                int16_t dir = 0;
1249
0
                pixel *fref_base = fref + omv.x + (omv.y - 4 * i) * stride;
1250
0
                size_t dy = (size_t)i * stride;
1251
0
#define SADS(k, x0, y0, x1, y1, x2, y2, x3, y3) \
1252
0
    sad_x4(fenc, \
1253
0
           fref_base x0 * i + (y0 - 2 * k + 4) * dy, \
1254
0
           fref_base x1 * i + (y1 - 2 * k + 4) * dy, \
1255
0
           fref_base x2 * i + (y2 - 2 * k + 4) * dy, \
1256
0
           fref_base x3 * i + (y3 - 2 * k + 4) * dy, \
1257
0
           stride, costs + 4 * k); \
1258
0
    fref_base += 2 * dy;
1259
0
#define ADD_MVCOST(k, x, y) costs[k] += p_cost_omvx[x * 4 * i] + p_cost_omvy[y * 4 * i]
1260
0
#define MIN_MV(k, dx, dy)     if ((omv.y + (dy) >= mvmin.y) & (omv.y + (dy) <= mvmax.y)) { COPY2_IF_LT(bcost, costs[k], dir, dx * 16 + (dy & 15)) }
1261
1262
0
                SADS(0, +0, -4, +0, +4, -2, -3, +2, -3);
1263
0
                SADS(1, -4, -2, +4, -2, -4, -1, +4, -1);
1264
0
                SADS(2, -4, +0, +4, +0, -4, +1, +4, +1);
1265
0
                SADS(3, -4, +2, +4, +2, -2, +3, +2, +3);
1266
0
                ADD_MVCOST(0, 0, -4);
1267
0
                ADD_MVCOST(1, 0, 4);
1268
0
                ADD_MVCOST(2, -2, -3);
1269
0
                ADD_MVCOST(3, 2, -3);
1270
0
                ADD_MVCOST(4, -4, -2);
1271
0
                ADD_MVCOST(5, 4, -2);
1272
0
                ADD_MVCOST(6, -4, -1);
1273
0
                ADD_MVCOST(7, 4, -1);
1274
0
                ADD_MVCOST(8, -4, 0);
1275
0
                ADD_MVCOST(9, 4, 0);
1276
0
                ADD_MVCOST(10, -4, 1);
1277
0
                ADD_MVCOST(11, 4, 1);
1278
0
                ADD_MVCOST(12, -4, 2);
1279
0
                ADD_MVCOST(13, 4, 2);
1280
0
                ADD_MVCOST(14, -2, 3);
1281
0
                ADD_MVCOST(15, 2, 3);
1282
0
                MIN_MV(0, 0, -4);
1283
0
                MIN_MV(1, 0, 4);
1284
0
                MIN_MV(2, -2, -3);
1285
0
                MIN_MV(3, 2, -3);
1286
0
                MIN_MV(4, -4, -2);
1287
0
                MIN_MV(5, 4, -2);
1288
0
                MIN_MV(6, -4, -1);
1289
0
                MIN_MV(7, 4, -1);
1290
0
                MIN_MV(8, -4, 0);
1291
0
                MIN_MV(9, 4, 0);
1292
0
                MIN_MV(10, -4, 1);
1293
0
                MIN_MV(11, 4, 1);
1294
0
                MIN_MV(12, -4, 2);
1295
0
                MIN_MV(13, 4, 2);
1296
0
                MIN_MV(14, -2, 3);
1297
0
                MIN_MV(15, 2, 3);
1298
0
#undef SADS
1299
0
#undef ADD_MVCOST
1300
0
#undef MIN_MV
1301
0
                if (dir)
1302
0
                {
1303
0
                    bmv.x = omv.x + i * (dir >> 4);
1304
0
                    bmv.y = omv.y + i * ((dir << 28) >> 28);
1305
0
                }
1306
0
            }
1307
0
        }
1308
0
        while (++i <= merange >> 2);
1309
0
        if (bmv.checkRange(mvmin, mvmax))
1310
0
            goto me_hex2;
1311
0
        break;
1312
0
    }
1313
1314
0
    case X265_STAR_SEARCH: // Adapted from HM ME
1315
0
    {
1316
0
        int bPointNr = 0;
1317
0
        int bDistance = 0;
1318
1319
0
        const int EarlyExitIters = 3;
1320
0
        StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, EarlyExitIters, merange, hme);
1321
0
        if (bDistance == 1)
1322
0
        {
1323
            // if best distance was only 1, check two missing points.  If no new point is found, stop
1324
0
            if (bPointNr)
1325
0
            {
1326
                /* For a given direction 1 to 8, check nearest two outer X pixels
1327
                     X   X
1328
                   X 1 2 3 X
1329
                     4 * 5
1330
                   X 6 7 8 X
1331
                     X   X
1332
                */
1333
0
                int saved = bcost;
1334
0
                const MV mv1 = bmv + offsets[(bPointNr - 1) * 2];
1335
0
                const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1];
1336
0
                if (mv1.checkRange(mvmin, mvmax))
1337
0
                {
1338
0
                    COST_MV(mv1.x, mv1.y);
1339
0
                }
1340
0
                if (mv2.checkRange(mvmin, mvmax))
1341
0
                {
1342
0
                    COST_MV(mv2.x, mv2.y);
1343
0
                }
1344
0
                if (bcost == saved)
1345
0
                    break;
1346
0
            }
1347
0
            else
1348
0
                break;
1349
0
        }
1350
1351
0
        const int RasterDistance = 5;
1352
0
        if (bDistance > RasterDistance)
1353
0
        {
1354
            // raster search refinement if original search distance was too big
1355
0
            MV tmv;
1356
0
            for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y += RasterDistance)
1357
0
            {
1358
0
                for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x += RasterDistance)
1359
0
                {
1360
0
                    if (tmv.x + (RasterDistance * 3) <= mvmax.x)
1361
0
                    {
1362
0
                        pixel *pix_base = fref + tmv.y * stride + tmv.x;
1363
0
                        sad_x4(fenc,
1364
0
                               pix_base,
1365
0
                               pix_base + RasterDistance,
1366
0
                               pix_base + RasterDistance * 2,
1367
0
                               pix_base + RasterDistance * 3,
1368
0
                               stride, costs);
1369
0
                        costs[0] += mvcost(tmv << 2);
1370
0
                        COPY2_IF_LT(bcost, costs[0], bmv, tmv);
1371
0
                        tmv.x += RasterDistance;
1372
0
                        costs[1] += mvcost(tmv << 2);
1373
0
                        COPY2_IF_LT(bcost, costs[1], bmv, tmv);
1374
0
                        tmv.x += RasterDistance;
1375
0
                        costs[2] += mvcost(tmv << 2);
1376
0
                        COPY2_IF_LT(bcost, costs[2], bmv, tmv);
1377
0
                        tmv.x += RasterDistance;
1378
0
                        costs[3] += mvcost(tmv << 2);
1379
0
                        COPY2_IF_LT(bcost, costs[3], bmv, tmv);
1380
0
                    }
1381
0
                    else
1382
0
                        COST_MV(tmv.x, tmv.y);
1383
0
                }
1384
0
            }
1385
0
        }
1386
1387
0
        while (bDistance > 0)
1388
0
        {
1389
            // center a new search around current best
1390
0
            bDistance = 0;
1391
0
            bPointNr = 0;
1392
0
            const int MaxIters = 32;
1393
0
            StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, MaxIters, merange, hme);
1394
1395
0
            if (bDistance == 1)
1396
0
            {
1397
0
                if (!bPointNr)
1398
0
                    break;
1399
1400
                /* For a given direction 1 to 8, check nearest 2 outer X pixels
1401
                        X   X
1402
                    X 1 2 3 X
1403
                        4 * 5
1404
                    X 6 7 8 X
1405
                        X   X
1406
                */
1407
0
                const MV mv1 = bmv + offsets[(bPointNr - 1) * 2];
1408
0
                const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1];
1409
0
                if (mv1.checkRange(mvmin, mvmax))
1410
0
                {
1411
0
                    COST_MV(mv1.x, mv1.y);
1412
0
                }
1413
0
                if (mv2.checkRange(mvmin, mvmax))
1414
0
                {
1415
0
                    COST_MV(mv2.x, mv2.y);
1416
0
                }
1417
0
                break;
1418
0
            }
1419
0
        }
1420
1421
0
        break;
1422
0
    }
1423
1424
0
    case X265_SEA:
1425
0
    {
1426
        // Successive Elimination Algorithm
1427
0
        const int32_t minX = X265_MAX(omv.x - (int32_t)merange, mvmin.x);
1428
0
        const int32_t minY = X265_MAX(omv.y - (int32_t)merange, mvmin.y);
1429
0
        const int32_t maxX = X265_MIN(omv.x + (int32_t)merange, mvmax.x);
1430
0
        const int32_t maxY = X265_MIN(omv.y + (int32_t)merange, mvmax.y);
1431
0
        const uint16_t *p_cost_mvx = m_cost_mvx - qmvp.x;
1432
0
        const uint16_t *p_cost_mvy = m_cost_mvy - qmvp.y;
1433
0
        int16_t* meScratchBuffer = NULL;
1434
0
        int scratchSize = merange * 2 + 4;
1435
0
        if (scratchSize)
1436
0
        {
1437
0
            meScratchBuffer = X265_MALLOC(int16_t, scratchSize);
1438
0
            memset(meScratchBuffer, 0, sizeof(int16_t)* scratchSize);
1439
0
        }
1440
1441
        /* SEA is fastest in multiples of 4 */
1442
0
        int meRangeWidth = (maxX - minX + 3) & ~3;
1443
0
        int w = 0, h = 0;                    // Width and height of the PU
1444
0
        ALIGN_VAR_32(pixel, zero[64 * FENC_STRIDE]) = { 0 };
1445
0
        ALIGN_VAR_32(int, encDC[4]);
1446
0
        uint16_t *fpelCostMvX = m_fpelMvCosts[-qmvp.x & 3] + (-qmvp.x >> 2);
1447
0
        sizesFromPartition(partEnum, &w, &h);
1448
0
        int deltaX = (w <= 8) ? (w) : (w >> 1);
1449
0
        int deltaY = (h <= 8) ? (h) : (h >> 1);
1450
1451
        /* Check if very small rectangular blocks which cannot be sub-divided anymore */
1452
0
        bool smallRectPartition = partEnum == LUMA_4x4 || partEnum == LUMA_16x12 ||
1453
0
            partEnum == LUMA_12x16 || partEnum == LUMA_16x4 || partEnum == LUMA_4x16;
1454
        /* Check if vertical partition */
1455
0
        bool verticalRect = partEnum == LUMA_32x64 || partEnum == LUMA_16x32 || partEnum == LUMA_8x16 ||
1456
0
            partEnum == LUMA_4x8;
1457
        /* Check if horizontal partition */
1458
0
        bool horizontalRect = partEnum == LUMA_64x32 || partEnum == LUMA_32x16 || partEnum == LUMA_16x8 ||
1459
0
            partEnum == LUMA_8x4;
1460
        /* Check if assymetric vertical partition */
1461
0
        bool assymetricVertical = partEnum == LUMA_12x16 || partEnum == LUMA_4x16 || partEnum == LUMA_24x32 ||
1462
0
            partEnum == LUMA_8x32 || partEnum == LUMA_48x64 || partEnum == LUMA_16x64;
1463
        /* Check if assymetric horizontal partition */
1464
0
        bool assymetricHorizontal = partEnum == LUMA_16x12 || partEnum == LUMA_16x4 || partEnum == LUMA_32x24 ||
1465
0
            partEnum == LUMA_32x8 || partEnum == LUMA_64x48 || partEnum == LUMA_64x16;
1466
1467
0
        int tempPartEnum = 0;
1468
1469
        /* If a vertical rectangular partition, it is horizontally split into two, for ads_x2() */
1470
0
        if (verticalRect)
1471
0
            tempPartEnum = partitionFromSizes(w, h >> 1);
1472
        /* If a horizontal rectangular partition, it is vertically split into two, for ads_x2() */
1473
0
        else if (horizontalRect)
1474
0
            tempPartEnum = partitionFromSizes(w >> 1, h);
1475
        /* We have integral planes introduced to account for assymetric partitions.
1476
         * Hence all asymmetric partitions except those which cannot be split into legal sizes,
1477
         * are split into four for ads_x4() */
1478
0
        else if (assymetricVertical || assymetricHorizontal)
1479
0
            tempPartEnum = smallRectPartition ? partEnum : partitionFromSizes(w >> 1, h >> 1);
1480
        /* General case: Square partitions. All partitions with width > 8 are split into four
1481
         * for ads_x4(), for 4x4 and 8x8 we do ads_x1() */
1482
0
        else
1483
0
            tempPartEnum = (w <= 8) ? partEnum : partitionFromSizes(w >> 1, h >> 1);
1484
1485
        /* Successive elimination by comparing DC before a full SAD,
1486
         * because sum(abs(diff)) >= abs(diff(sum)). */
1487
0
        primitives.pu[tempPartEnum].sad_x4(zero,
1488
0
                         fenc,
1489
0
                         fenc + deltaX,
1490
0
                         fenc + deltaY * FENC_STRIDE,
1491
0
                         fenc + deltaX + deltaY * FENC_STRIDE,
1492
0
                         FENC_STRIDE,
1493
0
                         encDC);
1494
1495
        /* Assigning appropriate integral plane */
1496
0
        uint32_t *sumsBase = NULL;
1497
0
        switch (deltaX)
1498
0
        {
1499
0
            case 32: if (deltaY % 24 == 0)
1500
0
                         sumsBase = integral[1];
1501
0
                     else if (deltaY == 8)
1502
0
                         sumsBase = integral[2];
1503
0
                     else
1504
0
                         sumsBase = integral[0];
1505
0
               break;
1506
0
            case 24: sumsBase = integral[3];
1507
0
               break;
1508
0
            case 16: if (deltaY % 12 == 0)
1509
0
                         sumsBase = integral[5];
1510
0
                     else if (deltaY == 4)
1511
0
                         sumsBase = integral[6];
1512
0
                     else
1513
0
                         sumsBase = integral[4];
1514
0
               break;
1515
0
            case 12: sumsBase = integral[7];
1516
0
                break;
1517
0
            case 8: if (deltaY == 32)
1518
0
                        sumsBase = integral[8];
1519
0
                    else
1520
0
                        sumsBase = integral[9];
1521
0
                break;
1522
0
            case 4: if (deltaY == 16)
1523
0
                        sumsBase = integral[10];
1524
0
                    else
1525
0
                        sumsBase = integral[11];
1526
0
                break;
1527
0
            default: sumsBase = integral[11];
1528
0
                break;
1529
0
        }
1530
1531
0
        if (partEnum == LUMA_64x64 || partEnum == LUMA_32x32 || partEnum == LUMA_16x16 ||
1532
0
            partEnum == LUMA_32x64 || partEnum == LUMA_16x32 || partEnum == LUMA_8x16 ||
1533
0
            partEnum == LUMA_4x8 || partEnum == LUMA_12x16 || partEnum == LUMA_4x16 ||
1534
0
            partEnum == LUMA_24x32 || partEnum == LUMA_8x32 || partEnum == LUMA_48x64 ||
1535
0
            partEnum == LUMA_16x64)
1536
0
            deltaY *= (int)stride;
1537
1538
0
        if (verticalRect)
1539
0
            encDC[1] = encDC[2];
1540
1541
0
        if (horizontalRect)
1542
0
            deltaY = deltaX;
1543
1544
        /* ADS and SAD */
1545
0
        MV tmv;
1546
0
        for (tmv.y = minY; tmv.y <= maxY; tmv.y++)
1547
0
        {
1548
0
            int i, xn;
1549
0
            int ycost = p_cost_mvy[tmv.y] << 2;
1550
0
            if (bcost <= ycost)
1551
0
                continue;
1552
0
            bcost -= ycost;
1553
1554
            /* ADS_4 for 16x16, 32x32, 64x64, 24x32, 32x24, 48x64, 64x48, 32x8, 8x32, 64x16, 16x64 partitions
1555
             * ADS_1 for 4x4, 8x8, 16x4, 4x16, 16x12, 12x16 partitions
1556
             * ADS_2 for all other rectangular partitions */
1557
0
            xn = ads(encDC,
1558
0
                    sumsBase + minX + tmv.y * stride,
1559
0
                    deltaY,
1560
0
                    fpelCostMvX + minX,
1561
0
                    meScratchBuffer,
1562
0
                    meRangeWidth,
1563
0
                    bcost);
1564
1565
0
            for (i = 0; i < xn - 2; i += 3)
1566
0
                COST_MV_X3_ABS(minX + meScratchBuffer[i], tmv.y,
1567
0
                             minX + meScratchBuffer[i + 1], tmv.y,
1568
0
                             minX + meScratchBuffer[i + 2], tmv.y);
1569
1570
0
            bcost += ycost;
1571
0
            for (; i < xn; i++)
1572
0
                COST_MV(minX + meScratchBuffer[i], tmv.y);
1573
0
        }
1574
0
        if (meScratchBuffer)
1575
0
            x265_free(meScratchBuffer);
1576
0
        break;
1577
0
    }
1578
1579
0
    case X265_FULL_SEARCH:
1580
0
    {
1581
        // dead slow exhaustive search, but at least it uses sad_x4()
1582
0
        MV tmv;
1583
0
        int32_t mvmin_y = mvmin.y, mvmin_x = mvmin.x, mvmax_y = mvmax.y, mvmax_x = mvmax.x;
1584
0
        if (ref->isHMELowres)
1585
0
        {
1586
0
            merange = (merange < 0 ? -merange : merange);
1587
0
            mvmin_y = X265_MAX(mvmin.y, -merange);
1588
0
            mvmin_x = X265_MAX(mvmin.x, -merange);
1589
0
            mvmax_y = X265_MIN(mvmax.y, merange);
1590
0
            mvmax_x = X265_MIN(mvmax.x, merange);
1591
0
        }
1592
0
        for (tmv.y = mvmin_y; tmv.y <= mvmax_y; tmv.y++)
1593
0
        {
1594
0
            for (tmv.x = mvmin_x; tmv.x <= mvmax_x; tmv.x++)
1595
0
            {
1596
0
                if (tmv.x + 3 <= mvmax_x)
1597
0
                {
1598
0
                    pixel *pix_base = fref + tmv.y * stride + tmv.x;
1599
0
                    sad_x4(fenc,
1600
0
                           pix_base,
1601
0
                           pix_base + 1,
1602
0
                           pix_base + 2,
1603
0
                           pix_base + 3,
1604
0
                           stride, costs);
1605
0
                    costs[0] += mvcost(tmv << 2);
1606
0
                    COPY2_IF_LT(bcost, costs[0], bmv, tmv);
1607
0
                    tmv.x++;
1608
0
                    costs[1] += mvcost(tmv << 2);
1609
0
                    COPY2_IF_LT(bcost, costs[1], bmv, tmv);
1610
0
                    tmv.x++;
1611
0
                    costs[2] += mvcost(tmv << 2);
1612
0
                    COPY2_IF_LT(bcost, costs[2], bmv, tmv);
1613
0
                    tmv.x++;
1614
0
                    costs[3] += mvcost(tmv << 2);
1615
0
                    COPY2_IF_LT(bcost, costs[3], bmv, tmv);
1616
0
                }
1617
0
                else
1618
0
                    COST_MV(tmv.x, tmv.y);
1619
0
            }
1620
0
        }
1621
1622
0
        break;
1623
0
    }
1624
1625
0
    default:
1626
0
        X265_CHECK(0, "invalid motion estimate mode\n");
1627
0
        break;
1628
0
    }
1629
1630
0
    if (bprecost < bcost)
1631
0
    {
1632
0
        bmv = bestpre;
1633
0
        bcost = bprecost;
1634
0
    }
1635
0
    else
1636
0
        bmv = bmv.toQPel(); // promote search bmv to qpel
1637
1638
0
    const SubpelWorkload& wl = workload[this->subpelRefine];
1639
1640
    // check mv range for slice bound
1641
0
    if ((maxSlices > 1) & ((bmv.y < qmvmin.y) | (bmv.y > qmvmax.y)))
1642
0
    {
1643
0
        bmv.y = x265_min(x265_max(bmv.y, qmvmin.y), qmvmax.y);
1644
0
        bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
1645
0
    }
1646
1647
0
    if (!bcost)
1648
0
    {
1649
        /* if there was zero residual at the clipped MVP, we can skip subpel
1650
         * refine, but we do need to include the mvcost in the returned cost */
1651
0
        bcost = mvcost(bmv);
1652
0
    }
1653
0
    else if (ref->isLowres)
1654
0
    {
1655
0
        int bdir = 0;
1656
0
        for (int i = 1; i <= wl.hpel_dirs; i++)
1657
0
        {
1658
0
            MV qmv = bmv + square1[i] * 2;
1659
1660
            /* skip invalid range */
1661
0
            if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
1662
0
                continue;
1663
1664
0
            int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad, hme) + mvcost(qmv);
1665
0
            COPY2_IF_LT(bcost, cost, bdir, i);
1666
0
        }
1667
1668
0
        bmv += square1[bdir] * 2;
1669
0
        bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd, hme) + mvcost(bmv);
1670
1671
0
        bdir = 0;
1672
0
        for (int i = 1; i <= wl.qpel_dirs; i++)
1673
0
        {
1674
0
            MV qmv = bmv + square1[i];
1675
1676
            /* skip invalid range */
1677
0
            if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
1678
0
                continue;
1679
1680
0
            int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd, hme) + mvcost(qmv);
1681
0
            COPY2_IF_LT(bcost, cost, bdir, i);
1682
0
        }
1683
1684
0
        bmv += square1[bdir];
1685
0
    }
1686
0
    else
1687
0
    {
1688
0
        pixelcmp_t hpelcomp;
1689
1690
0
        if (wl.hpel_satd)
1691
0
        {
1692
0
            bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
1693
0
            hpelcomp = satd;
1694
0
        }
1695
0
        else
1696
0
            hpelcomp = sad;
1697
1698
0
        for (int iter = 0; iter < wl.hpel_iters; iter++)
1699
0
        {
1700
0
            int bdir = 0;
1701
0
            for (int i = 1; i <= wl.hpel_dirs; i++)
1702
0
            {
1703
0
                MV qmv = bmv + square1[i] * 2;
1704
1705
                // check mv range for slice bound
1706
0
                if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
1707
0
                    continue;
1708
1709
0
                int cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);
1710
0
                COPY2_IF_LT(bcost, cost, bdir, i);
1711
0
            }
1712
1713
0
            if (bdir)
1714
0
                bmv += square1[bdir] * 2;
1715
0
            else
1716
0
                break;
1717
0
        }
1718
1719
        /* if HPEL search used SAD, remeasure with SATD before QPEL */
1720
0
        if (!wl.hpel_satd)
1721
0
            bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
1722
1723
0
        for (int iter = 0; iter < wl.qpel_iters; iter++)
1724
0
        {
1725
0
            int bdir = 0;
1726
0
            for (int i = 1; i <= wl.qpel_dirs; i++)
1727
0
            {
1728
0
                MV qmv = bmv + square1[i];
1729
1730
                // check mv range for slice bound
1731
0
                if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
1732
0
                    continue;
1733
1734
0
                int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
1735
0
                COPY2_IF_LT(bcost, cost, bdir, i);
1736
0
            }
1737
1738
0
            if (bdir)
1739
0
                bmv += square1[bdir];
1740
0
            else
1741
0
                break;
1742
0
        }
1743
0
    }
1744
1745
    // check mv range for slice bound
1746
0
    X265_CHECK(((bmv.y >= qmvmin.y) & (bmv.y <= qmvmax.y)), "mv beyond range!");
1747
1748
    // Get a chance to ZeroMv
1749
0
    if (bmv.notZero())
1750
0
    {
1751
0
      int cost = subpelCompare(ref, MV(0, 0), satd) + mvcost(MV(0, 0));
1752
0
      if (cost <= bcost)
1753
0
        bmv = MV(0, 0);
1754
0
    }
1755
1756
0
    x265_emms();
1757
0
    outQMv = bmv;
1758
0
    return bcost;
1759
0
}
1760
1761
/* Measure the distortion cost of the encode PU against the reference at
 * sub-pel motion vector 'qmv' (in quarter-pel units), using comparison
 * primitive 'cmp' (callers pass sad or satd).  When bChromaSATD is set,
 * chroma distortion is accumulated into the returned cost as well.
 * Returns the total distortion; motion-vector rate cost is NOT included. */
int MotionEstimate::subpelCompare(ReferencePlanes *ref, const MV& qmv, pixelcmp_t cmp)
{
    intptr_t refStride = ref->lumaStride;
    /* integer part of the QPEL mv (>> 2) locates the full-pel reference block */
    const pixel* fref = ref->fpelPlane[0] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * refStride;
    int xFrac = qmv.x & 0x3; /* quarter-pel horizontal phase (0..3) */
    int yFrac = qmv.y & 0x3; /* quarter-pel vertical phase (0..3) */
    int cost;
    const intptr_t fencStride = FENC_STRIDE;
    X265_CHECK(fencPUYuv.m_size == FENC_STRIDE, "fenc buffer is assumed to have FENC_STRIDE by sad_x3 and sad_x4\n");

    /* scratch buffer for interpolated sub-pel reference pixels; reused in turn
     * for the luma plane and both chroma planes below */
    ALIGN_VAR_32(pixel, subpelbuf[MAX_CU_SIZE * MAX_CU_SIZE]);
    
    if (!(yFrac | xFrac))
        /* integer-pel position: compare directly against the full-pel plane */
        cost = cmp(fencPUYuv.m_buf[0], fencStride, fref, refStride);
    else
    {
        /* we are taking a short-cut here if the reference is weighted. To be
         * accurate we should be interpolating unweighted pixels and weighting
         * the final 16bit values prior to rounding and down shifting. Instead we
         * are simply interpolating the weighted full-pel pixels. Not 100%
         * accurate but good enough for fast qpel ME */
        if (!yFrac)
            /* horizontal-only interpolation */
            primitives.pu[partEnum].luma_hpp(fref, refStride, subpelbuf, blockwidth, xFrac);
        else if (!xFrac)
            /* vertical-only interpolation */
            primitives.pu[partEnum].luma_vpp(fref, refStride, subpelbuf, blockwidth, yFrac);
        else
            /* 2-D (horizontal + vertical) interpolation */
            primitives.pu[partEnum].luma_hvpp(fref, refStride, subpelbuf, blockwidth, xFrac, yFrac);
        cost = cmp(fencPUYuv.m_buf[0], fencStride, subpelbuf, blockwidth);
    }

    if (bChromaSATD)
    {
        int csp    = fencPUYuv.m_csp;
        int hshift = fencPUYuv.m_hChromaShift;
        int vshift = fencPUYuv.m_vChromaShift;
        /* chroma mv in eighth-pel units: luma QPEL scaled by the chroma
         * subsampling shift (shift is 0 or 1, checked below) */
        int mvx = qmv.x << (1 - hshift);
        int mvy = qmv.y << (1 - vshift);
        intptr_t fencStrideC = fencPUYuv.m_csize;

        intptr_t refStrideC = ref->reconPic->m_strideC;
        /* integer part of the eighth-pel chroma mv (>> 3) */
        intptr_t refOffset = (mvx >> 3) + (mvy >> 3) * refStrideC;

        const pixel* refCb = ref->getCbAddr(ctuAddr, absPartIdx) + refOffset;
        const pixel* refCr = ref->getCrAddr(ctuAddr, absPartIdx) + refOffset;

        X265_CHECK((hshift == 0) || (hshift == 1), "hshift must be 0 or 1\n");
        X265_CHECK((vshift == 0) || (vshift == 1), "vshift must be 0 or 1\n");

        xFrac = mvx & 7; /* eighth-pel horizontal phase (0..7) */
        yFrac = mvy & 7; /* eighth-pel vertical phase (0..7) */

        if (!(yFrac | xFrac))
        {
            /* integer chroma position: compare against recon planes directly */
            cost += chromaSatd(fencPUYuv.m_buf[1], fencStrideC, refCb, refStrideC);
            cost += chromaSatd(fencPUYuv.m_buf[2], fencStrideC, refCr, refStrideC);
        }
        else
        {
            int blockwidthC = blockwidth >> hshift;

            if (!yFrac)
            {
                /* horizontal-only chroma interpolation, Cb then Cr (subpelbuf reused) */
                primitives.chroma[csp].pu[partEnum].filter_hpp(refCb, refStrideC, subpelbuf, blockwidthC, xFrac);
                cost += chromaSatd(fencPUYuv.m_buf[1], fencStrideC, subpelbuf, blockwidthC);

                primitives.chroma[csp].pu[partEnum].filter_hpp(refCr, refStrideC, subpelbuf, blockwidthC, xFrac);
                cost += chromaSatd(fencPUYuv.m_buf[2], fencStrideC, subpelbuf, blockwidthC);
            }
            else if (!xFrac)
            {
                /* vertical-only chroma interpolation, Cb then Cr */
                primitives.chroma[csp].pu[partEnum].filter_vpp(refCb, refStrideC, subpelbuf, blockwidthC, yFrac);
                cost += chromaSatd(fencPUYuv.m_buf[1], fencStrideC, subpelbuf, blockwidthC);

                primitives.chroma[csp].pu[partEnum].filter_vpp(refCr, refStrideC, subpelbuf, blockwidthC, yFrac);
                cost += chromaSatd(fencPUYuv.m_buf[2], fencStrideC, subpelbuf, blockwidthC);
            }
            else
            {
                /* separable 2-D chroma interpolation: horizontal pass into a
                 * 16-bit intermediate buffer, then vertical pass back to pixels.
                 * The intermediate is sized with NTAPS_LUMA, which over-allocates
                 * relative to the chroma filter's NTAPS_CHROMA rows — safe */
                ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA - 1)]);
                const int halfFilterSize = (NTAPS_CHROMA >> 1);

                primitives.chroma[csp].pu[partEnum].filter_hps(refCb, refStrideC, immed, blockwidthC, xFrac, 1);
                primitives.chroma[csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * blockwidthC, blockwidthC, subpelbuf, blockwidthC, yFrac);
                cost += chromaSatd(fencPUYuv.m_buf[1], fencStrideC, subpelbuf, blockwidthC);

                primitives.chroma[csp].pu[partEnum].filter_hps(refCr, refStrideC, immed, blockwidthC, xFrac, 1);
                primitives.chroma[csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * blockwidthC, blockwidthC, subpelbuf, blockwidthC, yFrac);
                cost += chromaSatd(fencPUYuv.m_buf[2], fencStrideC, subpelbuf, blockwidthC);
            }
        }
    }

    return cost;
}