Coverage Report

Created: 2025-08-11 08:01

/src/libde265/libde265/fallback-motion.cc
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * H.265 video codec.
3
 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
4
 *
5
 * This file is part of libde265.
6
 *
7
 * libde265 is free software: you can redistribute it and/or modify
8
 * it under the terms of the GNU Lesser General Public License as
9
 * published by the Free Software Foundation, either version 3 of
10
 * the License, or (at your option) any later version.
11
 *
12
 * libde265 is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 * GNU Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public License
18
 * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
19
 */
20
21
#include "fallback-motion.h"
22
#include "util.h"
23
24
#if defined(_MSC_VER) || defined(__MINGW32__)
25
# include <malloc.h>
26
#elif defined(HAVE_ALLOCA_H)
27
# include <alloca.h>
28
#endif
29
30
#include <assert.h>
31
32
33
void put_unweighted_pred_8_fallback(uint8_t *dst, ptrdiff_t dststride,
34
                                    const int16_t *src, ptrdiff_t srcstride,
35
                                    int width, int height)
36
0
{
37
0
  int offset8bit = 32;
38
0
  int shift8bit = 6;
39
40
0
  assert((width&1)==0);
41
42
0
  for (int y=0;y<height;y++) {
43
0
    const int16_t* in  = &src[y*srcstride];
44
0
    uint8_t* out = &dst[y*dststride];
45
46
0
    for (int x=0;x<width;x+=2) {
47
0
      out[0] = Clip1_8bit((in[0] + offset8bit)>>shift8bit);
48
0
      out[1] = Clip1_8bit((in[1] + offset8bit)>>shift8bit);
49
0
      out+=2; in+=2;
50
0
    }
51
0
  }
52
0
}
53
54
55
void put_weighted_pred_8_fallback(uint8_t *dst, ptrdiff_t dststride,
56
                                  const int16_t *src, ptrdiff_t srcstride,
57
                                  int width, int height,
58
                                  int w,int o,int log2WD)
59
1.33M
{
60
1.33M
  assert(log2WD>=1); // TODO
61
62
1.33M
  const int rnd = (1<<(log2WD-1));
63
64
8.82M
  for (int y=0;y<height;y++) {
65
7.48M
    const int16_t* in  = &src[y*srcstride];
66
7.48M
    uint8_t* out = &dst[y*dststride];
67
68
63.2M
    for (int x=0;x<width;x++) {
69
55.7M
      out[0] = Clip1_8bit(((in[0]*w + rnd)>>log2WD) + o);
70
55.7M
      out++; in++;
71
55.7M
    }
72
7.48M
  }
73
1.33M
}
74
75
void put_weighted_bipred_8_fallback(uint8_t *dst, ptrdiff_t dststride,
76
                                    const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride,
77
                                    int width, int height,
78
                                    int w1,int o1, int w2,int o2, int log2WD)
79
613k
{
80
613k
  assert(log2WD>=1); // TODO
81
82
613k
  const int rnd = ((o1+o2+1) << log2WD);
83
84
5.65M
  for (int y=0;y<height;y++) {
85
5.04M
    const int16_t* in1 = &src1[y*srcstride];
86
5.04M
    const int16_t* in2 = &src2[y*srcstride];
87
5.04M
    uint8_t* out = &dst[y*dststride];
88
89
67.6M
    for (int x=0;x<width;x++) {
90
62.6M
      out[0] = Clip1_8bit((in1[0]*w1 + in2[0]*w2 + rnd)>>(log2WD+1));
91
62.6M
      out++; in1++; in2++;
92
62.6M
    }
93
5.04M
  }
94
613k
}
95
96
97
void put_weighted_pred_avg_8_fallback(uint8_t *dst, ptrdiff_t dststride,
98
                                      const int16_t *src1, const int16_t *src2,
99
                                      ptrdiff_t srcstride, int width,
100
                                      int height)
101
0
{
102
0
  int offset8bit = 64;
103
0
  int shift8bit = 7;
104
105
0
  assert((width&1)==0);
106
107
  // I had a special case for 8-pixel parallel, unrolled code,
108
  // but I did not see any speedup.
109
110
#if 0
111
  for (int y=0;y<height;y++) {
112
    int16_t* in1 = &src1[y*srcstride];
113
    int16_t* in2 = &src2[y*srcstride];
114
    uint8_t* out = &dst[y*dststride];
115
116
    for (int x=0;x<width;x++) {
117
      out[0] = Clip1_8bit((in1[0] + in2[0] + offset8bit)>>shift8bit);
118
      out++; in1++; in2++;
119
    }
120
  }
121
#endif
122
123
#if 0
124
  if ((width&7)==0) {
125
    for (int y=0;y<height;y++) {
126
      int16_t* in1 = &src1[y*srcstride];
127
      int16_t* in2 = &src2[y*srcstride];
128
      uint8_t* out = &dst[y*dststride];
129
130
      for (int x=0;x<width;x+=8) {
131
        out[0] = Clip1_8bit((in1[0] + in2[0] + offset8bit)>>shift8bit);
132
        out[1] = Clip1_8bit((in1[1] + in2[1] + offset8bit)>>shift8bit);
133
        out[2] = Clip1_8bit((in1[2] + in2[2] + offset8bit)>>shift8bit);
134
        out[3] = Clip1_8bit((in1[3] + in2[3] + offset8bit)>>shift8bit);
135
        out[4] = Clip1_8bit((in1[4] + in2[4] + offset8bit)>>shift8bit);
136
        out[5] = Clip1_8bit((in1[5] + in2[5] + offset8bit)>>shift8bit);
137
        out[6] = Clip1_8bit((in1[6] + in2[6] + offset8bit)>>shift8bit);
138
        out[7] = Clip1_8bit((in1[7] + in2[7] + offset8bit)>>shift8bit);
139
        out+=8; in1+=8; in2+=8;
140
      }
141
    }
142
  }
143
  else
144
#endif
145
0
    {
146
0
      for (int y=0;y<height;y++) {
147
0
        const int16_t* in1 = &src1[y*srcstride];
148
0
        const int16_t* in2 = &src2[y*srcstride];
149
0
        uint8_t* out = &dst[y*dststride];
150
151
0
        for (int x=0;x<width;x+=2) {
152
0
          out[0] = Clip1_8bit((in1[0] + in2[0] + offset8bit)>>shift8bit);
153
0
          out[1] = Clip1_8bit((in1[1] + in2[1] + offset8bit)>>shift8bit);
154
0
          out+=2; in1+=2; in2+=2;
155
0
        }
156
0
      }
157
0
    }
158
0
}
159
160
161
162
163
164
void put_unweighted_pred_16_fallback(uint16_t *dst, ptrdiff_t dststride,
165
                                     const int16_t *src, ptrdiff_t srcstride,
166
                                     int width, int height, int bit_depth)
167
1.40M
{
168
1.40M
  int shift1 = 14-bit_depth;
169
1.40M
  int offset1 = 0;
170
1.40M
  if (shift1>0) { offset1 = 1<<(shift1-1); }
171
172
1.40M
  assert((width&1)==0);
173
174
10.5M
  for (int y=0;y<height;y++) {
175
9.12M
    const int16_t* in  = &src[y*srcstride];
176
9.12M
    uint16_t* out = &dst[y*dststride];
177
178
50.1M
    for (int x=0;x<width;x+=2) {
179
41.0M
      out[0] = Clip_BitDepth((in[0] + offset1)>>shift1, bit_depth);
180
41.0M
      out[1] = Clip_BitDepth((in[1] + offset1)>>shift1, bit_depth);
181
41.0M
      out+=2; in+=2;
182
41.0M
    }
183
9.12M
  }
184
1.40M
}
185
186
#include <stdlib.h>
187
188
void put_weighted_pred_16_fallback(uint16_t *dst, ptrdiff_t dststride,
189
                                   const int16_t *src, ptrdiff_t srcstride,
190
                                   int width, int height,
191
                                   int w,int o,int log2WD, int bit_depth)
192
935k
{
193
935k
  assert(log2WD>=1); // TODO
194
195
935k
  const int rnd = (1<<(log2WD-1));
196
197
5.99M
  for (int y=0;y<height;y++) {
198
5.06M
    const int16_t* in  = &src[y*srcstride];
199
5.06M
    uint16_t* out = &dst[y*dststride];
200
201
41.5M
    for (int x=0;x<width;x++) {
202
36.4M
      out[0] = Clip_BitDepth(((in[0]*w + rnd)>>log2WD) + o, bit_depth);
203
36.4M
      out++; in++;
204
36.4M
    }
205
5.06M
  }
206
935k
}
207
208
void put_weighted_bipred_16_fallback(uint16_t *dst, ptrdiff_t dststride,
209
                                     const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride,
210
                                     int width, int height,
211
                                     int w1,int o1, int w2,int o2, int log2WD, int bit_depth)
212
292k
{
213
292k
  assert(log2WD>=1); // TODO
214
215
292k
  const int rnd = ((o1+o2+1) << log2WD);
216
217
2.53M
  for (int y=0;y<height;y++) {
218
2.24M
    const int16_t* in1 = &src1[y*srcstride];
219
2.24M
    const int16_t* in2 = &src2[y*srcstride];
220
2.24M
    uint16_t* out = &dst[y*dststride];
221
222
28.3M
    for (int x=0;x<width;x++) {
223
26.0M
      out[0] = Clip_BitDepth((in1[0]*w1 + in2[0]*w2 + rnd)>>(log2WD+1), bit_depth);
224
26.0M
      out++; in1++; in2++;
225
26.0M
    }
226
2.24M
  }
227
292k
}
228
229
230
void put_weighted_pred_avg_16_fallback(uint16_t *dst, ptrdiff_t dststride,
231
                                       const int16_t *src1, const int16_t *src2,
232
                                       ptrdiff_t srcstride, int width,
233
                                       int height, int bit_depth)
234
575k
{
235
575k
  int shift2 = 15-bit_depth;
236
575k
  int offset2 = 1<<(shift2-1);
237
238
575k
  assert((width&1)==0);
239
240
4.99M
  for (int y=0;y<height;y++) {
241
4.42M
    const int16_t* in1 = &src1[y*srcstride];
242
4.42M
    const int16_t* in2 = &src2[y*srcstride];
243
4.42M
    uint16_t* out = &dst[y*dststride];
244
245
29.6M
    for (int x=0;x<width;x+=2) {
246
25.2M
      out[0] = Clip_BitDepth((in1[0] + in2[0] + offset2)>>shift2, bit_depth);
247
25.2M
      out[1] = Clip_BitDepth((in1[1] + in2[1] + offset2)>>shift2, bit_depth);
248
25.2M
      out+=2; in1+=2; in2+=2;
249
25.2M
    }
250
4.42M
  }
251
575k
}
252
253
254
255
256
257
// Full-sample copy of an 8-bit block into the int16_t intermediate buffer:
// out[x,y] = src[x,y] << 6.
// 'mx', 'my' and 'mcbuffer' are not used by this implementation.
void put_epel_8_fallback(int16_t *out, ptrdiff_t out_stride,
                         const uint8_t *src, ptrdiff_t src_stride,
                         int width, int height,
                         int mx, int my, int16_t* mcbuffer)
{
  const int shift3 = 6;

  for (int row=0; row<height; row++) {
    int16_t* dstRow = &out[row*out_stride];
    const uint8_t* srcRow = &src[row*src_stride];

    for (int col=0; col<width; col++) {
      dstRow[col] = srcRow[col] << shift3;
    }
  }
}
275
276
277
// Full-sample copy of a uint16_t block into the int16_t intermediate buffer:
// out[x,y] = src[x,y] << (14 - bit_depth).
// 'mx', 'my' and 'mcbuffer' are not used by this implementation.
void put_epel_16_fallback(int16_t *out, ptrdiff_t out_stride,
                          const uint16_t *src, ptrdiff_t src_stride,
                          int width, int height,
                          int mx, int my, int16_t* mcbuffer, int bit_depth)
{
  const int shift3 = 14 - bit_depth;

  for (int row=0; row<height; row++) {
    int16_t* dstRow = &out[row*out_stride];
    const uint16_t* srcRow = &src[row*src_stride];

    for (int col=0; col<width; col++) {
      dstRow[col] = srcRow[col] << shift3;
    }
  }
}
295
296
297
template <class pixel_t>
298
void put_epel_hv_fallback(int16_t *dst, ptrdiff_t dst_stride,
299
                          const pixel_t *src, ptrdiff_t src_stride,
300
                          int nPbWC, int nPbHC,
301
                          int xFracC, int yFracC, int16_t* mcbuffer, int bit_depth)
302
1.17M
{
303
1.17M
  const int shift1 = bit_depth-8;
304
1.17M
  const int shift2 = 6;
305
  //const int shift3 = 6;
306
307
1.17M
  int extra_left = 1;
308
1.17M
  int extra_top  = 1;
309
  //  int extra_right = 2;
310
1.17M
  int extra_bottom= 2;
311
312
313
1.17M
  int nPbH_extra = extra_top  + nPbHC + extra_bottom;
314
315
1.17M
  int16_t* tmp2buf = (int16_t*)alloca( nPbWC      * nPbH_extra * sizeof(int16_t) );
316
317
  /*
318
  int nPbW_extra = extra_left + nPbWC + extra_right;
319
320
321
  printf("x,y FracC: %d/%d\n",xFracC,yFracC);
322
323
  printf("---IN---\n");
324
325
  for (int y=-extra_top;y<nPbHC+extra_bottom;y++) {
326
    uint8_t* p = &src[y*src_stride -extra_left];
327
328
    for (int x=-extra_left;x<nPbWC+extra_right;x++) {
329
      printf("%05d ",*p << 6);
330
      p++;
331
    }
332
    printf("\n");
333
  }
334
  */
335
336
337
  // H-filters
338
339
1.17M
  logtrace(LogMotion,"---H---\n");
340
  //printf("---H---(%d)\n",xFracC);
341
342
10.8M
  for (int y=-extra_top;y<nPbHC+extra_bottom;y++) {
343
9.62M
    const pixel_t* p = &src[y*src_stride - extra_left];
344
345
67.6M
    for (int x=0;x<nPbWC;x++) {
346
58.0M
      int16_t v;
347
58.0M
      switch (xFracC) {
348
11.3M
      case 0: v = p[1]; break;
349
8.60M
      case 1: v = (-2*p[0]+58*p[1]+10*p[2]-2*p[3])>>shift1; break;
350
7.95M
      case 2: v = (-4*p[0]+54*p[1]+16*p[2]-2*p[3])>>shift1; break;
351
3.82M
      case 3: v = (-6*p[0]+46*p[1]+28*p[2]-4*p[3])>>shift1; break;
352
5.52M
      case 4: v = (-4*p[0]+36*p[1]+36*p[2]-4*p[3])>>shift1; break;
353
3.57M
      case 5: v = (-4*p[0]+28*p[1]+46*p[2]-6*p[3])>>shift1; break;
354
7.26M
      case 6: v = (-2*p[0]+16*p[1]+54*p[2]-4*p[3])>>shift1; break;
355
0
      default:
356
9.92M
      case 7: v = (-2*p[0]+10*p[1]+58*p[2]-2*p[3])>>shift1; break;
357
58.0M
      }
358
359
      //printf("%d %d %d %d -> %d\n",p[0],p[1],p[2],p[3],v);
360
361
58.0M
      tmp2buf[y+extra_top + x*nPbH_extra] = v;
362
58.0M
      p++;
363
364
      //printf("%05d ",tmp2buf[y+extra_top + x*nPbH_extra]);
365
58.0M
    }
366
    //printf("\n");
367
9.62M
  }
368
369
  // V-filters
370
371
1.17M
  int vshift = (xFracC==0 ? shift1 : shift2);
372
373
6.81M
  for (int x=0;x<nPbWC;x++) {
374
5.64M
    int16_t* p = &tmp2buf[x*nPbH_extra];
375
376
46.7M
    for (int y=0;y<nPbHC;y++) {
377
41.0M
      int16_t v;
378
      //logtrace(LogMotion,"%x %x %x  %x  %x %x %x\n",p[0],p[1],p[2],p[3],p[4],p[5],p[6]);
379
380
41.0M
      switch (yFracC) {
381
7.30M
      case 0: v = p[1]; break;
382
5.37M
      case 1: v = (-2*p[0]+58*p[1]+10*p[2]-2*p[3])>>vshift; break;
383
6.93M
      case 2: v = (-4*p[0]+54*p[1]+16*p[2]-2*p[3])>>vshift; break;
384
3.10M
      case 3: v = (-6*p[0]+46*p[1]+28*p[2]-4*p[3])>>vshift; break;
385
5.34M
      case 4: v = (-4*p[0]+36*p[1]+36*p[2]-4*p[3])>>vshift; break;
386
2.49M
      case 5: v = (-4*p[0]+28*p[1]+46*p[2]-6*p[3])>>vshift; break;
387
6.32M
      case 6: v = (-2*p[0]+16*p[1]+54*p[2]-4*p[3])>>vshift; break;
388
0
      default:
389
4.20M
      case 7: v = (-2*p[0]+10*p[1]+58*p[2]-2*p[3])>>vshift; break;
390
41.0M
      }
391
392
41.0M
      dst[x + y*dst_stride] = v;
393
41.0M
      p++;
394
41.0M
    }
395
396
5.64M
  }
397
398
  /*
399
  printf("---V---\n");
400
  for (int y=0;y<nPbHC;y++) {
401
    for (int x=0;x<nPbWC;x++) {
402
      printf("%05d ",dst[x+y*dst_stride]);
403
    }
404
    printf("\n");
405
  }
406
  */
407
1.17M
}
Unexecuted instantiation: void put_epel_hv_fallback<unsigned char>(short*, long, unsigned char const*, long, int, int, int, int, short*, int)
void put_epel_hv_fallback<unsigned short>(short*, long, unsigned short const*, long, int, int, int, int, short*, int)
Line
Count
Source
302
1.17M
{
303
1.17M
  const int shift1 = bit_depth-8;
304
1.17M
  const int shift2 = 6;
305
  //const int shift3 = 6;
306
307
1.17M
  int extra_left = 1;
308
1.17M
  int extra_top  = 1;
309
  //  int extra_right = 2;
310
1.17M
  int extra_bottom= 2;
311
312
313
1.17M
  int nPbH_extra = extra_top  + nPbHC + extra_bottom;
314
315
1.17M
  int16_t* tmp2buf = (int16_t*)alloca( nPbWC      * nPbH_extra * sizeof(int16_t) );
316
317
  /*
318
  int nPbW_extra = extra_left + nPbWC + extra_right;
319
320
321
  printf("x,y FracC: %d/%d\n",xFracC,yFracC);
322
323
  printf("---IN---\n");
324
325
  for (int y=-extra_top;y<nPbHC+extra_bottom;y++) {
326
    uint8_t* p = &src[y*src_stride -extra_left];
327
328
    for (int x=-extra_left;x<nPbWC+extra_right;x++) {
329
      printf("%05d ",*p << 6);
330
      p++;
331
    }
332
    printf("\n");
333
  }
334
  */
335
336
337
  // H-filters
338
339
1.17M
  logtrace(LogMotion,"---H---\n");
340
  //printf("---H---(%d)\n",xFracC);
341
342
10.8M
  for (int y=-extra_top;y<nPbHC+extra_bottom;y++) {
343
9.62M
    const pixel_t* p = &src[y*src_stride - extra_left];
344
345
67.6M
    for (int x=0;x<nPbWC;x++) {
346
58.0M
      int16_t v;
347
58.0M
      switch (xFracC) {
348
11.3M
      case 0: v = p[1]; break;
349
8.60M
      case 1: v = (-2*p[0]+58*p[1]+10*p[2]-2*p[3])>>shift1; break;
350
7.95M
      case 2: v = (-4*p[0]+54*p[1]+16*p[2]-2*p[3])>>shift1; break;
351
3.82M
      case 3: v = (-6*p[0]+46*p[1]+28*p[2]-4*p[3])>>shift1; break;
352
5.52M
      case 4: v = (-4*p[0]+36*p[1]+36*p[2]-4*p[3])>>shift1; break;
353
3.57M
      case 5: v = (-4*p[0]+28*p[1]+46*p[2]-6*p[3])>>shift1; break;
354
7.26M
      case 6: v = (-2*p[0]+16*p[1]+54*p[2]-4*p[3])>>shift1; break;
355
0
      default:
356
9.92M
      case 7: v = (-2*p[0]+10*p[1]+58*p[2]-2*p[3])>>shift1; break;
357
58.0M
      }
358
359
      //printf("%d %d %d %d -> %d\n",p[0],p[1],p[2],p[3],v);
360
361
58.0M
      tmp2buf[y+extra_top + x*nPbH_extra] = v;
362
58.0M
      p++;
363
364
      //printf("%05d ",tmp2buf[y+extra_top + x*nPbH_extra]);
365
58.0M
    }
366
    //printf("\n");
367
9.62M
  }
368
369
  // V-filters
370
371
1.17M
  int vshift = (xFracC==0 ? shift1 : shift2);
372
373
6.81M
  for (int x=0;x<nPbWC;x++) {
374
5.64M
    int16_t* p = &tmp2buf[x*nPbH_extra];
375
376
46.7M
    for (int y=0;y<nPbHC;y++) {
377
41.0M
      int16_t v;
378
      //logtrace(LogMotion,"%x %x %x  %x  %x %x %x\n",p[0],p[1],p[2],p[3],p[4],p[5],p[6]);
379
380
41.0M
      switch (yFracC) {
381
7.30M
      case 0: v = p[1]; break;
382
5.37M
      case 1: v = (-2*p[0]+58*p[1]+10*p[2]-2*p[3])>>vshift; break;
383
6.93M
      case 2: v = (-4*p[0]+54*p[1]+16*p[2]-2*p[3])>>vshift; break;
384
3.10M
      case 3: v = (-6*p[0]+46*p[1]+28*p[2]-4*p[3])>>vshift; break;
385
5.34M
      case 4: v = (-4*p[0]+36*p[1]+36*p[2]-4*p[3])>>vshift; break;
386
2.49M
      case 5: v = (-4*p[0]+28*p[1]+46*p[2]-6*p[3])>>vshift; break;
387
6.32M
      case 6: v = (-2*p[0]+16*p[1]+54*p[2]-4*p[3])>>vshift; break;
388
0
      default:
389
4.20M
      case 7: v = (-2*p[0]+10*p[1]+58*p[2]-2*p[3])>>vshift; break;
390
41.0M
      }
391
392
41.0M
      dst[x + y*dst_stride] = v;
393
41.0M
      p++;
394
41.0M
    }
395
396
5.64M
  }
397
398
  /*
399
  printf("---V---\n");
400
  for (int y=0;y<nPbHC;y++) {
401
    for (int x=0;x<nPbWC;x++) {
402
      printf("%05d ",dst[x+y*dst_stride]);
403
    }
404
    printf("\n");
405
  }
406
  */
407
1.17M
}
408
409
410
template
411
void put_epel_hv_fallback<uint8_t>(int16_t *dst, ptrdiff_t dst_stride,
412
                                   const uint8_t *src, ptrdiff_t src_stride,
413
                                   int nPbWC, int nPbHC,
414
                                   int xFracC, int yFracC, int16_t* mcbuffer, int bit_depth);
415
template
416
void put_epel_hv_fallback<uint16_t>(int16_t *dst, ptrdiff_t dst_stride,
417
                                    const uint16_t *src, ptrdiff_t src_stride,
418
                                    int nPbWC, int nPbHC,
419
                                    int xFracC, int yFracC, int16_t* mcbuffer, int bit_depth);
420
421
422
423
// Full-sample copy of an 8-bit block into the int16_t intermediate buffer:
// out[x,y] = src[x,y] << 6.
// Samples are handled in groups of four per row, so a width that is not a
// multiple of 4 is effectively rounded up to the next multiple of 4.
// 'mcbuffer' is not used by this implementation.
void put_qpel_0_0_fallback(int16_t *out, ptrdiff_t out_stride,
                           const uint8_t *src, ptrdiff_t srcstride,
                           int nPbW, int nPbH, int16_t* mcbuffer)
{
  const int shift2 = 6;

  for (int row=0; row<nPbH; row++) {
    const uint8_t* srcRow = src + srcstride*row;
    int16_t* dstRow = out + out_stride*row;

    for (int col=0; col<nPbW; col+=4) {
      for (int k=0; k<4; k++) {
        dstRow[col+k] = srcRow[col+k] << shift2;
      }
    }
  }
}
454
455
456
// Full-sample copy of a uint16_t block into the int16_t intermediate buffer:
// out[x,y] = src[x,y] << (14 - bit_depth).
// 'mcbuffer' is not used by this implementation.
void put_qpel_0_0_fallback_16(int16_t *out, ptrdiff_t out_stride,
                              const uint16_t *src, ptrdiff_t srcstride,
                              int nPbW, int nPbH, int16_t* mcbuffer, int bit_depth)
{
  const int shift3 = 14-bit_depth;

  for (int row=0; row<nPbH; row++) {
    const uint16_t* srcRow = src + srcstride*row;
    int16_t* dstRow = out + out_stride*row;

    for (int col=0; col<nPbW; col++) {
      dstRow[col] = srcRow[col] << shift3;
    }
  }
}
475
476
477
478
// Number of extra source samples that put_qpel_fallback() reads before
// (extra_before) and after (extra_after) the block, indexed by the
// quarter-sample fraction 0..3. Read-only lookup tables, hence const.
static const int extra_before[4] = { 0,3,3,2 };
static const int extra_after [4] = { 0,3,4,4 };
480
481
template <class pixel_t>
482
void put_qpel_fallback(int16_t *out, ptrdiff_t out_stride,
483
                       const pixel_t *src, ptrdiff_t srcstride,
484
                       int nPbW, int nPbH, int16_t* mcbuffer,
485
                       int xFracL, int yFracL, int bit_depth)
486
625k
{
487
625k
  int extra_left   = extra_before[xFracL];
488
  //int extra_right  = extra_after [xFracL];
489
625k
  int extra_top    = extra_before[yFracL];
490
625k
  int extra_bottom = extra_after [yFracL];
491
492
  //int nPbW_extra = extra_left + nPbW + extra_right;
493
625k
  int nPbH_extra = extra_top  + nPbH + extra_bottom;
494
495
625k
  const int shift1 = bit_depth-8;
496
625k
  const int shift2 = 6;
497
498
499
  // H-filters
500
501
625k
  switch (xFracL) {
502
146k
  case 0:
503
2.40M
    for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
504
2.26M
      const pixel_t* p = src + srcstride*y - extra_left;
505
2.26M
      int16_t* o = &mcbuffer[y+extra_top];
506
507
29.3M
      for (int x=0;x<nPbW;x++) {
508
27.0M
        *o = *p;
509
27.0M
        o += nPbH_extra;
510
27.0M
        p++;
511
27.0M
      }
512
2.26M
    }
513
146k
    break;
514
163k
  case 1:
515
2.39M
    for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
516
2.22M
      const pixel_t* p = src + srcstride*y - extra_left;
517
2.22M
      int16_t* o = &mcbuffer[y+extra_top];
518
519
28.4M
      for (int x=0;x<nPbW;x++) {
520
26.1M
        *o = (-p[0]+4*p[1]-10*p[2]+58*p[3]+17*p[4] -5*p[5]  +p[6])>>shift1;
521
26.1M
        o += nPbH_extra;
522
26.1M
        p++;
523
26.1M
      }
524
2.22M
    }
525
163k
    break;
526
135k
  case 2:
527
2.05M
    for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
528
1.91M
      const pixel_t* p = src + srcstride*y - extra_left;
529
1.91M
      int16_t* o = &mcbuffer[y+extra_top];
530
531
23.6M
      for (int x=0;x<nPbW;x++) {
532
21.6M
        *o = (-p[0]+4*p[1]-11*p[2]+40*p[3]+40*p[4]-11*p[5]+4*p[6]-p[7])>>shift1;
533
21.6M
        o += nPbH_extra;
534
21.6M
        p++;
535
21.6M
      }
536
1.91M
    }
537
135k
    break;
538
179k
  case 3:
539
2.66M
    for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
540
2.48M
      const pixel_t* p = src + srcstride*y - extra_left;
541
2.48M
      int16_t* o = &mcbuffer[y+extra_top];
542
543
32.5M
      for (int x=0;x<nPbW;x++) {
544
30.1M
        *o = ( p[0]-5*p[1]+17*p[2]+58*p[3]-10*p[4] +4*p[5]  -p[6])>>shift1;
545
30.1M
        o += nPbH_extra;
546
30.1M
        p++;
547
30.1M
      }
548
2.48M
    }
549
179k
    break;
550
625k
  }
551
552
553
625k
  logtrace(LogMotion,"---H---\n");
554
555
9.51M
  for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
556
113M
    for (int x=0;x<nPbW;x++) {
557
105M
      logtrace(LogMotion,"%04x ",mcbuffer[y+extra_top + x*nPbH_extra]);
558
105M
    }
559
8.89M
    logtrace(LogMotion,"\n");
560
8.89M
  }
561
562
  // V-filters
563
564
625k
  int vshift = (xFracL==0 ? shift1 : shift2);
565
566
625k
  switch (yFracL) {
567
127k
  case 0:
568
1.31M
    for (int x=0;x<nPbW;x++) {
569
1.18M
      const int16_t* p = &mcbuffer[x*nPbH_extra];
570
1.18M
      int16_t* o = &out[x];
571
572
15.2M
      for (int y=0;y<nPbH;y++) {
573
14.0M
        *o = *p;
574
14.0M
        o+=out_stride;
575
14.0M
        p++;
576
14.0M
      }
577
1.18M
    }
578
127k
    break;
579
198k
  case 1:
580
2.14M
    for (int x=0;x<nPbW;x++) {
581
1.94M
      const int16_t* p = &mcbuffer[x*nPbH_extra];
582
1.94M
      int16_t* o = &out[x];
583
584
24.9M
      for (int y=0;y<nPbH;y++) {
585
22.9M
        *o = (-p[0]+4*p[1]-10*p[2]+58*p[3]+17*p[4] -5*p[5]  +p[6])>>vshift;
586
22.9M
        o+=out_stride;
587
22.9M
        p++;
588
22.9M
      }
589
1.94M
    }
590
198k
    break;
591
125k
  case 2:
592
1.39M
    for (int x=0;x<nPbW;x++) {
593
1.27M
      const int16_t* p = &mcbuffer[x*nPbH_extra];
594
1.27M
      int16_t* o = &out[x];
595
596
17.8M
      for (int y=0;y<nPbH;y++) {
597
16.6M
        *o = (-p[0]+4*p[1]-11*p[2]+40*p[3]+40*p[4]-11*p[5]+4*p[6]-p[7])>>vshift;
598
16.6M
        o+=out_stride;
599
16.6M
        p++;
600
16.6M
      }
601
1.27M
    }
602
125k
    break;
603
173k
  case 3:
604
1.86M
    for (int x=0;x<nPbW;x++) {
605
1.68M
      const int16_t* p = &mcbuffer[x*nPbH_extra];
606
1.68M
      int16_t* o = &out[x];
607
608
22.3M
      for (int y=0;y<nPbH;y++) {
609
20.6M
        *o = ( p[0]-5*p[1]+17*p[2]+58*p[3]-10*p[4] +4*p[5]  -p[6])>>vshift;
610
20.6M
        o+=out_stride;
611
20.6M
        p++;
612
20.6M
      }
613
1.68M
    }
614
173k
    break;
615
625k
  }
616
617
618
625k
  logtrace(LogMotion,"---V---\n");
619
6.40M
  for (int y=0;y<nPbH;y++) {
620
80.0M
    for (int x=0;x<nPbW;x++) {
621
74.3M
      logtrace(LogMotion,"%04x ",out[x+y*out_stride]);
622
74.3M
    }
623
5.77M
    logtrace(LogMotion,"\n");
624
5.77M
  }
625
625k
}
Unexecuted instantiation: void put_qpel_fallback<unsigned char>(short*, long, unsigned char const*, long, int, int, short*, int, int, int)
void put_qpel_fallback<unsigned short>(short*, long, unsigned short const*, long, int, int, short*, int, int, int)
Line
Count
Source
486
625k
{
487
625k
  int extra_left   = extra_before[xFracL];
488
  //int extra_right  = extra_after [xFracL];
489
625k
  int extra_top    = extra_before[yFracL];
490
625k
  int extra_bottom = extra_after [yFracL];
491
492
  //int nPbW_extra = extra_left + nPbW + extra_right;
493
625k
  int nPbH_extra = extra_top  + nPbH + extra_bottom;
494
495
625k
  const int shift1 = bit_depth-8;
496
625k
  const int shift2 = 6;
497
498
499
  // H-filters
500
501
625k
  switch (xFracL) {
502
146k
  case 0:
503
2.40M
    for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
504
2.26M
      const pixel_t* p = src + srcstride*y - extra_left;
505
2.26M
      int16_t* o = &mcbuffer[y+extra_top];
506
507
29.3M
      for (int x=0;x<nPbW;x++) {
508
27.0M
        *o = *p;
509
27.0M
        o += nPbH_extra;
510
27.0M
        p++;
511
27.0M
      }
512
2.26M
    }
513
146k
    break;
514
163k
  case 1:
515
2.39M
    for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
516
2.22M
      const pixel_t* p = src + srcstride*y - extra_left;
517
2.22M
      int16_t* o = &mcbuffer[y+extra_top];
518
519
28.4M
      for (int x=0;x<nPbW;x++) {
520
26.1M
        *o = (-p[0]+4*p[1]-10*p[2]+58*p[3]+17*p[4] -5*p[5]  +p[6])>>shift1;
521
26.1M
        o += nPbH_extra;
522
26.1M
        p++;
523
26.1M
      }
524
2.22M
    }
525
163k
    break;
526
135k
  case 2:
527
2.05M
    for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
528
1.91M
      const pixel_t* p = src + srcstride*y - extra_left;
529
1.91M
      int16_t* o = &mcbuffer[y+extra_top];
530
531
23.6M
      for (int x=0;x<nPbW;x++) {
532
21.6M
        *o = (-p[0]+4*p[1]-11*p[2]+40*p[3]+40*p[4]-11*p[5]+4*p[6]-p[7])>>shift1;
533
21.6M
        o += nPbH_extra;
534
21.6M
        p++;
535
21.6M
      }
536
1.91M
    }
537
135k
    break;
538
179k
  case 3:
539
2.66M
    for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
540
2.48M
      const pixel_t* p = src + srcstride*y - extra_left;
541
2.48M
      int16_t* o = &mcbuffer[y+extra_top];
542
543
32.5M
      for (int x=0;x<nPbW;x++) {
544
30.1M
        *o = ( p[0]-5*p[1]+17*p[2]+58*p[3]-10*p[4] +4*p[5]  -p[6])>>shift1;
545
30.1M
        o += nPbH_extra;
546
30.1M
        p++;
547
30.1M
      }
548
2.48M
    }
549
179k
    break;
550
625k
  }
551
552
553
625k
  logtrace(LogMotion,"---H---\n");
554
555
9.51M
  for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
556
113M
    for (int x=0;x<nPbW;x++) {
557
105M
      logtrace(LogMotion,"%04x ",mcbuffer[y+extra_top + x*nPbH_extra]);
558
105M
    }
559
8.89M
    logtrace(LogMotion,"\n");
560
8.89M
  }
561
562
  // V-filters
563
564
625k
  int vshift = (xFracL==0 ? shift1 : shift2);
565
566
625k
  switch (yFracL) {
567
127k
  case 0:
568
1.31M
    for (int x=0;x<nPbW;x++) {
569
1.18M
      const int16_t* p = &mcbuffer[x*nPbH_extra];
570
1.18M
      int16_t* o = &out[x];
571
572
15.2M
      for (int y=0;y<nPbH;y++) {
573
14.0M
        *o = *p;
574
14.0M
        o+=out_stride;
575
14.0M
        p++;
576
14.0M
      }
577
1.18M
    }
578
127k
    break;
579
198k
  case 1:
580
2.14M
    for (int x=0;x<nPbW;x++) {
581
1.94M
      const int16_t* p = &mcbuffer[x*nPbH_extra];
582
1.94M
      int16_t* o = &out[x];
583
584
24.9M
      for (int y=0;y<nPbH;y++) {
585
22.9M
        *o = (-p[0]+4*p[1]-10*p[2]+58*p[3]+17*p[4] -5*p[5]  +p[6])>>vshift;
586
22.9M
        o+=out_stride;
587
22.9M
        p++;
588
22.9M
      }
589
1.94M
    }
590
198k
    break;
591
125k
  case 2:
592
1.39M
    for (int x=0;x<nPbW;x++) {
593
1.27M
      const int16_t* p = &mcbuffer[x*nPbH_extra];
594
1.27M
      int16_t* o = &out[x];
595
596
17.8M
      for (int y=0;y<nPbH;y++) {
597
16.6M
        *o = (-p[0]+4*p[1]-11*p[2]+40*p[3]+40*p[4]-11*p[5]+4*p[6]-p[7])>>vshift;
598
16.6M
        o+=out_stride;
599
16.6M
        p++;
600
16.6M
      }
601
1.27M
    }
602
125k
    break;
603
173k
  case 3:
604
1.86M
    for (int x=0;x<nPbW;x++) {
605
1.68M
      const int16_t* p = &mcbuffer[x*nPbH_extra];
606
1.68M
      int16_t* o = &out[x];
607
608
22.3M
      for (int y=0;y<nPbH;y++) {
609
20.6M
        *o = ( p[0]-5*p[1]+17*p[2]+58*p[3]-10*p[4] +4*p[5]  -p[6])>>vshift;
610
20.6M
        o+=out_stride;
611
20.6M
        p++;
612
20.6M
      }
613
1.68M
    }
614
173k
    break;
615
625k
  }
616
617
618
625k
  logtrace(LogMotion,"---V---\n");
619
6.40M
  for (int y=0;y<nPbH;y++) {
620
80.0M
    for (int x=0;x<nPbW;x++) {
621
74.3M
      logtrace(LogMotion,"%04x ",out[x+y*out_stride]);
622
74.3M
    }
623
5.77M
    logtrace(LogMotion,"\n");
624
5.77M
  }
625
625k
}
626
627
628
629
// QPEL(x,y) defines put_qpel_<x>_<y>_fallback(): a wrapper for 8-bit sources
// that forwards to put_qpel_fallback() with fractions (x,y) and bit depth 8.
#define QPEL(x,y)                                                                     \
  void put_qpel_ ## x ## _ ## y ## _fallback(int16_t *out, ptrdiff_t out_stride,      \
                                             const uint8_t *src, ptrdiff_t srcstride, \
                                             int nPbW, int nPbH, int16_t* mcbuffer)   \
  { put_qpel_fallback(out,out_stride, src,srcstride, nPbW,nPbH,mcbuffer,x,y, 8 ); }
Unexecuted instantiation: put_qpel_0_1_fallback(short*, long, unsigned char const*, long, int, int, short*)
Unexecuted instantiation: put_qpel_0_2_fallback(short*, long, unsigned char const*, long, int, int, short*)
Unexecuted instantiation: put_qpel_0_3_fallback(short*, long, unsigned char const*, long, int, int, short*)
Unexecuted instantiation: put_qpel_1_0_fallback(short*, long, unsigned char const*, long, int, int, short*)
Unexecuted instantiation: put_qpel_1_1_fallback(short*, long, unsigned char const*, long, int, int, short*)
Unexecuted instantiation: put_qpel_1_2_fallback(short*, long, unsigned char const*, long, int, int, short*)
Unexecuted instantiation: put_qpel_1_3_fallback(short*, long, unsigned char const*, long, int, int, short*)
Unexecuted instantiation: put_qpel_2_0_fallback(short*, long, unsigned char const*, long, int, int, short*)
Unexecuted instantiation: put_qpel_2_1_fallback(short*, long, unsigned char const*, long, int, int, short*)
Unexecuted instantiation: put_qpel_2_2_fallback(short*, long, unsigned char const*, long, int, int, short*)
Unexecuted instantiation: put_qpel_2_3_fallback(short*, long, unsigned char const*, long, int, int, short*)
Unexecuted instantiation: put_qpel_3_0_fallback(short*, long, unsigned char const*, long, int, int, short*)
Unexecuted instantiation: put_qpel_3_1_fallback(short*, long, unsigned char const*, long, int, int, short*)
Unexecuted instantiation: put_qpel_3_2_fallback(short*, long, unsigned char const*, long, int, int, short*)
Unexecuted instantiation: put_qpel_3_3_fallback(short*, long, unsigned char const*, long, int, int, short*)
633
634
635
/*
 * QPEL16(x,y) expands to the definition of put_qpel_<x>_<y>_fallback_16(), a
 * thin wrapper taking a 16-bit source (const uint16_t *src) and an explicit
 * bit_depth parameter.  The wrapper forwards out, out_stride, src, srcstride,
 * nPbW, nPbH and mcbuffer unchanged to put_qpel_fallback() and appends the
 * macro arguments x and y followed by the caller's bit_depth.
 * NOTE(review): x and y are presumably the quarter-sample fractional offsets
 * -- confirm against put_qpel_fallback().
 */
#define QPEL16(x,y) void put_qpel_ ## x ## _ ## y ## _fallback_16(int16_t *out, ptrdiff_t out_stride,    \
636
                                                                  const uint16_t *src, ptrdiff_t srcstride, \
637
625k
                                                                  int nPbW, int nPbH, int16_t* mcbuffer, int bit_depth) \
638
625k
{ put_qpel_fallback(out,out_stride, src,srcstride, nPbW,nPbH,mcbuffer,x,y, bit_depth ); }
put_qpel_0_1_fallback_16(short*, long, unsigned short const*, long, int, int, short*, int)
Line
Count
Source
637
65.2k
                                                                  int nPbW, int nPbH, int16_t* mcbuffer, int bit_depth) \
638
65.2k
{ put_qpel_fallback(out,out_stride, src,srcstride, nPbW,nPbH,mcbuffer,x,y, bit_depth ); }
put_qpel_0_2_fallback_16(short*, long, unsigned short const*, long, int, int, short*, int)
Line
Count
Source
637
24.1k
                                                                  int nPbW, int nPbH, int16_t* mcbuffer, int bit_depth) \
638
24.1k
{ put_qpel_fallback(out,out_stride, src,srcstride, nPbW,nPbH,mcbuffer,x,y, bit_depth ); }
put_qpel_0_3_fallback_16(short*, long, unsigned short const*, long, int, int, short*, int)
Line
Count
Source
637
57.4k
                                                                  int nPbW, int nPbH, int16_t* mcbuffer, int bit_depth) \
638
57.4k
{ put_qpel_fallback(out,out_stride, src,srcstride, nPbW,nPbH,mcbuffer,x,y, bit_depth ); }
put_qpel_1_0_fallback_16(short*, long, unsigned short const*, long, int, int, short*, int)
Line
Count
Source
637
50.5k
                                                                  int nPbW, int nPbH, int16_t* mcbuffer, int bit_depth) \
638
50.5k
{ put_qpel_fallback(out,out_stride, src,srcstride, nPbW,nPbH,mcbuffer,x,y, bit_depth ); }
put_qpel_1_1_fallback_16(short*, long, unsigned short const*, long, int, int, short*, int)
Line
Count
Source
637
62.6k
                                                                  int nPbW, int nPbH, int16_t* mcbuffer, int bit_depth) \
638
62.6k
{ put_qpel_fallback(out,out_stride, src,srcstride, nPbW,nPbH,mcbuffer,x,y, bit_depth ); }
put_qpel_1_2_fallback_16(short*, long, unsigned short const*, long, int, int, short*, int)
Line
Count
Source
637
20.3k
                                                                  int nPbW, int nPbH, int16_t* mcbuffer, int bit_depth) \
638
20.3k
{ put_qpel_fallback(out,out_stride, src,srcstride, nPbW,nPbH,mcbuffer,x,y, bit_depth ); }
put_qpel_1_3_fallback_16(short*, long, unsigned short const*, long, int, int, short*, int)
Line
Count
Source
637
29.8k
                                                                  int nPbW, int nPbH, int16_t* mcbuffer, int bit_depth) \
638
29.8k
{ put_qpel_fallback(out,out_stride, src,srcstride, nPbW,nPbH,mcbuffer,x,y, bit_depth ); }
put_qpel_2_0_fallback_16(short*, long, unsigned short const*, long, int, int, short*, int)
Line
Count
Source
637
27.2k
                                                                  int nPbW, int nPbH, int16_t* mcbuffer, int bit_depth) \
638
27.2k
{ put_qpel_fallback(out,out_stride, src,srcstride, nPbW,nPbH,mcbuffer,x,y, bit_depth ); }
put_qpel_2_1_fallback_16(short*, long, unsigned short const*, long, int, int, short*, int)
Line
Count
Source
637
28.8k
                                                                  int nPbW, int nPbH, int16_t* mcbuffer, int bit_depth) \
638
28.8k
{ put_qpel_fallback(out,out_stride, src,srcstride, nPbW,nPbH,mcbuffer,x,y, bit_depth ); }
put_qpel_2_2_fallback_16(short*, long, unsigned short const*, long, int, int, short*, int)
Line
Count
Source
637
52.8k
                                                                  int nPbW, int nPbH, int16_t* mcbuffer, int bit_depth) \
638
52.8k
{ put_qpel_fallback(out,out_stride, src,srcstride, nPbW,nPbH,mcbuffer,x,y, bit_depth ); }
put_qpel_2_3_fallback_16(short*, long, unsigned short const*, long, int, int, short*, int)
Line
Count
Source
637
26.9k
                                                                  int nPbW, int nPbH, int16_t* mcbuffer, int bit_depth) \
638
26.9k
{ put_qpel_fallback(out,out_stride, src,srcstride, nPbW,nPbH,mcbuffer,x,y, bit_depth ); }
put_qpel_3_0_fallback_16(short*, long, unsigned short const*, long, int, int, short*, int)
Line
Count
Source
637
49.4k
                                                                  int nPbW, int nPbH, int16_t* mcbuffer, int bit_depth) \
638
49.4k
{ put_qpel_fallback(out,out_stride, src,srcstride, nPbW,nPbH,mcbuffer,x,y, bit_depth ); }
put_qpel_3_1_fallback_16(short*, long, unsigned short const*, long, int, int, short*, int)
Line
Count
Source
637
41.6k
                                                                  int nPbW, int nPbH, int16_t* mcbuffer, int bit_depth) \
638
41.6k
{ put_qpel_fallback(out,out_stride, src,srcstride, nPbW,nPbH,mcbuffer,x,y, bit_depth ); }
put_qpel_3_2_fallback_16(short*, long, unsigned short const*, long, int, int, short*, int)
Line
Count
Source
637
28.4k
                                                                  int nPbW, int nPbH, int16_t* mcbuffer, int bit_depth) \
638
28.4k
{ put_qpel_fallback(out,out_stride, src,srcstride, nPbW,nPbH,mcbuffer,x,y, bit_depth ); }
put_qpel_3_3_fallback_16(short*, long, unsigned short const*, long, int, int, short*, int)
Line
Count
Source
637
59.6k
                                                                  int nPbW, int nPbH, int16_t* mcbuffer, int bit_depth) \
638
59.6k
{ put_qpel_fallback(out,out_stride, src,srcstride, nPbW,nPbH,mcbuffer,x,y, bit_depth ); }
639
640
/*
 * Define the 8-bit wrappers put_qpel_<x>_<y>_fallback() for every (x,y) pair
 * with x,y in 0..3 except (0,0); the empty comment in the first row stands in
 * for the omitted (0,0) entry so the table columns stay aligned.
 * NOTE(review): (0,0) is presumably handled by a separate code path -- confirm.
 */
/*     */ QPEL(0,1) QPEL(0,2) QPEL(0,3)
641
QPEL(1,0) QPEL(1,1) QPEL(1,2) QPEL(1,3)
642
QPEL(2,0) QPEL(2,1) QPEL(2,2) QPEL(2,3)
643
QPEL(3,0) QPEL(3,1) QPEL(3,2) QPEL(3,3)
644
645
/*
 * Define the 16-bit wrappers put_qpel_<x>_<y>_fallback_16() for every (x,y)
 * pair with x,y in 0..3 except (0,0); the empty comment in the first row
 * stands in for the omitted (0,0) entry so the table columns stay aligned.
 * NOTE(review): (0,0) is presumably handled by a separate code path -- confirm.
 */
/*       */ QPEL16(0,1) QPEL16(0,2) QPEL16(0,3)
646
QPEL16(1,0) QPEL16(1,1) QPEL16(1,2) QPEL16(1,3)
647
QPEL16(2,0) QPEL16(2,1) QPEL16(2,2) QPEL16(2,3)
648
QPEL16(3,0) QPEL16(3,1) QPEL16(3,2) QPEL16(3,3)