Coverage Report

Created: 2025-06-13 06:48

/src/libwebp/src/enc/quant_enc.c
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2011 Google Inc. All Rights Reserved.
2
//
3
// Use of this source code is governed by a BSD-style license
4
// that can be found in the COPYING file in the root of the source
5
// tree. An additional intellectual property rights grant can be found
6
// in the file PATENTS. All contributing project authors may
7
// be found in the AUTHORS file in the root of the source tree.
8
// -----------------------------------------------------------------------------
9
//
10
//   Quantization
11
//
12
// Author: Skal (pascal.massimino@gmail.com)
13
14
#include <assert.h>
15
#include <math.h>
16
#include <stdlib.h>  // for abs()
17
#include <string.h>
18
19
#include "src/dec/common_dec.h"
20
#include "src/dsp/dsp.h"
21
#include "src/dsp/quant.h"
22
#include "src/enc/cost_enc.h"
23
#include "src/enc/vp8i_enc.h"
24
#include "src/webp/types.h"
25
26
0
#define DO_TRELLIS_I4  1
27
0
#define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
28
0
#define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth.
29
#define USE_TDISTO 1
30
31
0
#define MID_ALPHA 64      // neutral value for susceptibility
32
0
#define MIN_ALPHA 30      // lowest usable value for susceptibility
33
0
#define MAX_ALPHA 100     // higher meaningful value for susceptibility
34
35
0
#define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
36
                          // power-law modulation. Must be strictly less than 1.
37
38
// number of non-zero coeffs below which we consider the block very flat
39
// (and apply a penalty to complex predictions)
40
0
#define FLATNESS_LIMIT_I16 0       // I16 mode (special case)
41
0
#define FLATNESS_LIMIT_I4  3       // I4 mode
42
0
#define FLATNESS_LIMIT_UV  2       // UV mode
43
0
#define FLATNESS_PENALTY   140     // roughly ~1bit per block
44
45
0
#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
46
47
0
#define RD_DISTO_MULT      256  // distortion multiplier (equivalent of lambda)
48
49
// #define DEBUG_BLOCK
50
51
//------------------------------------------------------------------------------
52
53
#if defined(DEBUG_BLOCK)
54
55
#include <stdio.h>
56
#include <stdlib.h>
57
58
// Debug-only helper: dumps the source samples, reconstructed samples,
// per-pixel absolute deltas, the rate/distortion summary and all quantized
// levels for the macroblock currently pointed to by 'it'.
// Only compiled when DEBUG_BLOCK is defined.
static void PrintBlockInfo(const VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
  int i, j;
  const int is_i16 = (it->mb->type == 1);   // 1 = Intra16 macroblock
  const uint8_t* const y_in = it->yuv_in + Y_OFF_ENC;
  const uint8_t* const y_out = it->yuv_out + Y_OFF_ENC;
  const uint8_t* const uv_in = it->yuv_in + U_OFF_ENC;
  const uint8_t* const uv_out = it->yuv_out + U_OFF_ENC;
  printf("SOURCE / OUTPUT / ABS DELTA\n");
  // 16x16 luma block: input, reconstruction, then |delta| per pixel.
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
    printf("     ");
    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
    printf("     ");
    for (i = 0; i < 16; ++i) {
      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\n");   // newline before the U/V block
  // 8x8 U and V blocks (U at columns 0..7, V at columns 8..15 of the cache).
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf("    ");
    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf("   ");
    for (i = 0; i < 8; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf(" ");
    for (i = 8; i < 16; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf("\n");
  }
  // Rate-distortion summary for the block.
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
    (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
    (int)rd->score);
  if (is_i16) {
    printf("Mode: %d\n", rd->mode_i16);
    printf("y_dc_levels:");
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    printf("\n");
  } else {
    printf("Modes[16]: ");
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    printf("\n");
  }
  printf("y_ac_levels:\n");
  // For Intra16, coeff #0 is carried by y_dc_levels, so start at 1.
  for (j = 0; j < 16; ++j) {
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
      printf("%4d ", rd->y_ac_levels[j][i]);
    }
    printf("\n");
  }
  printf("\n");
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 16; ++i) {
      printf("%4d ", rd->uv_levels[j][i]);
    }
    printf("\n");
  }
}
125
126
#endif   // DEBUG_BLOCK
127
128
//------------------------------------------------------------------------------
129
130
0
// Clamps 'v' into the inclusive range [m, M].
static WEBP_INLINE int clip(int v, int m, int M) {
  if (v < m) return m;
  if (v > M) return M;
  return v;
}
133
134
// Zigzag scan order: maps scan position -> raster coefficient index.
static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

// DC quantizer step, indexed by the 7-bit quantizer index [0..127].
static const uint8_t kDcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  10,
  11,   12,  13,  14,  15,  16,  17,  17,
  18,   19,  20,  20,  21,  21,  22,  22,
  23,   23,  24,  25,  25,  26,  27,  28,
  29,   30,  31,  32,  33,  34,  35,  36,
  37,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  46,  47,  48,  49,  50,
  51,   52,  53,  54,  55,  56,  57,  58,
  59,   60,  61,  62,  63,  64,  65,  66,
  67,   68,  69,  70,  71,  72,  73,  74,
  75,   76,  76,  77,  78,  79,  80,  81,
  82,   83,  84,  85,  86,  87,  88,  89,
  91,   93,  95,  96,  98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

// AC quantizer step, indexed by the 7-bit quantizer index [0..127].
static const uint16_t kAcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  11,
  12,   13,  14,  15,  16,  17,  18,  19,
  20,   21,  22,  23,  24,  25,  26,  27,
  28,   29,  30,  31,  32,  33,  34,  35,
  36,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  47,  48,  49,  50,  51,
  52,   53,  54,  55,  56,  57,  58,  60,
  62,   64,  66,  68,  70,  72,  74,  76,
  78,   80,  82,  84,  86,  88,  90,  92,
  94,   96,  98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};

// Second AC quantizer table, used for the Y2 (luma-DC / WHT) plane.
static const uint16_t kAcTable2[128] = {
  8,     8,   9,  10,  12,  13,  15,  17,
  18,   20,  21,  23,  24,  26,  27,  29,
  31,   32,  34,  35,  37,  38,  40,  41,
  43,   44,  46,  48,  49,  51,  52,  54,
  55,   57,  58,  60,  62,  63,  65,  66,
  68,   69,  71,  72,  74,  75,  77,  79,
  80,   82,  83,  85,  86,  88,  89,  93,
  96,   99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};

// Quantizer rounding bias per coefficient plane.
static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};

// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
// Per-coefficient sharpening strength, in raster order (stronger toward
// high frequencies).
static const uint8_t kFreqSharpening[16] = {
  0,  30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};
208
209
//------------------------------------------------------------------------------
210
// Initialize quantization parameters in VP8Matrix
211
212
// Returns the average quantizer
213
0
// Derives the reciprocal quantizer, rounding bias, zero-threshold and
// sharpening values of matrix 'm' from its q[0]/q[1] entries, replicating
// the AC settings over coefficients 2..15.
// 'type' selects the bias row (0 = luma-ac, 1 = luma-dc/Y2, 2 = chroma).
// Returns the (rounded) average quantizer over the 16 coefficients.
static int ExpandMatrix(VP8Matrix* const m, int type) {
  int i;
  int sum = 0;
  // Entries #0 (DC) and #1 (first AC) each get their own bias.
  for (i = 0; i < 2; ++i) {
    const int is_ac_coeff = (i > 0);
    const int bias = kBiasMatrices[type][is_ac_coeff];
    m->iq[i] = (1 << QFIX) / m->q[i];
    m->bias[i] = BIAS(bias);
    // zthresh is the exact value such that QUANTDIV(coeff, iQ, B) is:
    //   * zero if coeff <= zthresh
    //   * non-zero if coeff > zthresh
    m->zthresh[i] = ((1 << QFIX) - 1 - m->bias[i]) / m->iq[i];
  }
  // The remaining AC coefficients reuse the settings of entry #1.
  for (i = 2; i < 16; ++i) {
    m->q[i] = m->q[1];
    m->iq[i] = m->iq[1];
    m->bias[i] = m->bias[1];
    m->zthresh[i] = m->zthresh[1];
  }
  for (i = 0; i < 16; ++i) {
    // sharpening is only used for AC luma coeffs (type == 0)
    m->sharpen[i] =
        (type == 0) ? (kFreqSharpening[i] * m->q[i]) >> SHARPEN_BITS : 0;
    sum += m->q[i];
  }
  return (sum + 8) >> 4;   // rounded average quantizer
}
241
242
0
// Clamps '*v' from below so that lambda constants are never smaller than 1.
static void CheckLambdaValue(int* const v) {
  if (*v < 1) {
    *v = 1;
  }
}
243
244
0
// Builds the per-segment quantization matrices (y1/y2/uv) from the segment
// quantizer and the encoder's dq deltas, then derives the various lambda
// constants used by the rate-distortion decisions.
static void SetupMatrices(VP8Encoder* enc) {
  int i;
  // 'tlambda' (texture distortion) is only used for methods >= 4.
  const int tlambda_scale =
    (enc->method >= 4) ? enc->config->sns_strength
                        : 0;
  const int num_segments = enc->segment_hdr.num_segments;
  for (i = 0; i < num_segments; ++i) {
    VP8SegmentInfo* const m = &enc->dqm[i];
    const int q = m->quant;
    int q_i4, q_i16, q_uv;
    m->y1.q[0] = kDcTable[clip(q + enc->dq_y1_dc, 0, 127)];
    m->y1.q[1] = kAcTable[clip(q,                  0, 127)];

    m->y2.q[0] = kDcTable[ clip(q + enc->dq_y2_dc, 0, 127)] * 2;
    m->y2.q[1] = kAcTable2[clip(q + enc->dq_y2_ac, 0, 127)];

    // note: the UV-DC index is clipped to 117 (not 127).
    m->uv.q[0] = kDcTable[clip(q + enc->dq_uv_dc, 0, 117)];
    m->uv.q[1] = kAcTable[clip(q + enc->dq_uv_ac, 0, 127)];

    // Expand each matrix and retrieve its average quantizer.
    q_i4  = ExpandMatrix(&m->y1, 0);
    q_i16 = ExpandMatrix(&m->y2, 1);
    q_uv  = ExpandMatrix(&m->uv, 2);

    // Lambdas scale roughly as q^2, with mode-specific shifts.
    m->lambda_i4          = (3 * q_i4 * q_i4) >> 7;
    m->lambda_i16         = (3 * q_i16 * q_i16);
    m->lambda_uv          = (3 * q_uv * q_uv) >> 6;
    m->lambda_mode        = (1 * q_i4 * q_i4) >> 7;
    m->lambda_trellis_i4  = (7 * q_i4 * q_i4) >> 3;
    m->lambda_trellis_i16 = (q_i16 * q_i16) >> 2;
    m->lambda_trellis_uv  = (q_uv * q_uv) << 1;
    m->tlambda            = (tlambda_scale * q_i4) >> 5;

    // none of these constants should be < 1
    CheckLambdaValue(&m->lambda_i4);
    CheckLambdaValue(&m->lambda_i16);
    CheckLambdaValue(&m->lambda_uv);
    CheckLambdaValue(&m->lambda_mode);
    CheckLambdaValue(&m->lambda_trellis_i4);
    CheckLambdaValue(&m->lambda_trellis_i16);
    CheckLambdaValue(&m->lambda_trellis_uv);
    CheckLambdaValue(&m->tlambda);

    m->min_disto = 20 * m->y1.q[0];   // quantization-aware min disto
    m->max_edge  = 0;

    m->i4_penalty = 1000 * q_i4 * q_i4;
  }
}
292
293
//------------------------------------------------------------------------------
294
// Initialize filtering parameters
295
296
// Very small filter-strength values have close to no visual effect. So we can
297
// save a little decoding-CPU by turning filtering off for these.
298
0
#define FSTRENGTH_CUTOFF 2
299
300
0
// Derives the per-segment loop-filter strength from the user's
// 'filter_strength' setting, the segment quantizer step and the segment
// complexity, then records the header-level filter parameters.
static void SetupFilterStrength(VP8Encoder* const enc) {
  int i;
  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
  const int level0 = 5 * enc->config->filter_strength;
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
    VP8SegmentInfo* const m = &enc->dqm[i];
    // We focus on the quantization of AC coeffs.
    const int qstep = kAcTable[clip(m->quant, 0, 127)] >> 2;
    const int base_strength =
        VP8FilterStrengthFromDelta(enc->filter_hdr.sharpness, qstep);
    // Segments with lower complexity ('beta') will be less filtered.
    const int f = base_strength * level0 / (256 + m->beta);
    // Clamp to [0..63]; strengths below FSTRENGTH_CUTOFF are dropped to 0
    // since they have close to no visual effect.
    m->fstrength = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
  }
  // We record the initial strength (mainly for the case of 1-segment only).
  enc->filter_hdr.level = enc->dqm[0].fstrength;
  enc->filter_hdr.simple = (enc->config->filter_type == 0);
  enc->filter_hdr.sharpness = enc->config->filter_sharpness;
}
319
320
//------------------------------------------------------------------------------
321
322
// Note: if you change the values below, remember that the max range
323
// allowed by the syntax for DQ_UV is [-16,16].
324
0
#define MAX_DQ_UV (6)
325
0
#define MIN_DQ_UV (-4)
326
327
// We want to emulate jpeg-like behaviour where the expected "good" quality
328
// is around q=75. Internally, our "good" middle is around c=50. So we
329
// map accordingly using linear piece-wise function
330
0
static double QualityToCompression(double c) {
331
0
  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
332
  // The file size roughly scales as pow(quantizer, 3.). Actually, the
333
  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
334
  // in the mid-quant range. So we scale the compressibility inversely to
335
  // this power-law: quant ~= compression ^ 1/3. This law holds well for
336
  // low quant. Finer modeling for high-quant would make use of kAcTable[]
337
  // more explicitly.
338
0
  const double v = pow(linear_c, 1 / 3.);
339
0
  return v;
340
0
}
341
342
0
static double QualityToJPEGCompression(double c, double alpha) {
343
  // We map the complexity 'alpha' and quality setting 'c' to a compression
344
  // exponent empirically matched to the compression curve of libjpeg6b.
345
  // On average, the WebP output size will be roughly similar to that of a
346
  // JPEG file compressed with same quality factor.
347
0
  const double amin = 0.30;
348
0
  const double amax = 0.85;
349
0
  const double exp_min = 0.4;
350
0
  const double exp_max = 0.9;
351
0
  const double slope = (exp_min - exp_max) / (amax - amin);
352
  // Linearly interpolate 'expn' from exp_min to exp_max
353
  // in the [amin, amax] range.
354
0
  const double expn = (alpha > amax) ? exp_min
355
0
                    : (alpha < amin) ? exp_max
356
0
                    : exp_max + slope * (alpha - amin);
357
0
  const double v = pow(c, expn);
358
0
  return v;
359
0
}
360
361
static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
362
0
                                 const VP8SegmentInfo* const S2) {
363
0
  return (S1->quant == S2->quant) && (S1->fstrength == S2->fstrength);
364
0
}
365
366
0
// Merges segments that ended up with equivalent parameters, remapping the
// per-macroblock segment ids accordingly and compacting enc->dqm[] in place.
static void SimplifySegments(VP8Encoder* const enc) {
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
  // 'num_segments' is previously validated and <= NUM_MB_SEGMENTS, but an
  // explicit check is needed to avoid a spurious warning about 'i' exceeding
  // array bounds of 'dqm' with some compilers (noticed with gcc-4.9).
  const int num_segments = (enc->segment_hdr.num_segments < NUM_MB_SEGMENTS)
                               ? enc->segment_hdr.num_segments
                               : NUM_MB_SEGMENTS;
  int num_final_segments = 1;
  int s1, s2;
  for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
    const VP8SegmentInfo* const S1 = &enc->dqm[s1];
    int found = 0;
    // check if we already have similar segment
    for (s2 = 0; s2 < num_final_segments; ++s2) {
      const VP8SegmentInfo* const S2 = &enc->dqm[s2];
      if (SegmentsAreEquivalent(S1, S2)) {
        found = 1;
        break;
      }
    }
    // note: on a match, 's2' holds the index of the equivalent segment;
    // otherwise it equals num_final_segments, the slot S1 is moved into.
    map[s1] = s2;
    if (!found) {
      if (num_final_segments != s1) {
        enc->dqm[num_final_segments] = enc->dqm[s1];
      }
      ++num_final_segments;
    }
  }
  if (num_final_segments < num_segments) {  // Remap
    int i = enc->mb_w * enc->mb_h;
    while (i-- > 0) enc->mb_info[i].segment = map[enc->mb_info[i].segment];
    enc->segment_hdr.num_segments = num_final_segments;
    // Replicate the trailing segment infos (it's mostly cosmetics)
    for (i = num_final_segments; i < num_segments; ++i) {
      enc->dqm[i] = enc->dqm[num_final_segments - 1];
    }
  }
}
405
406
0
// Translates the user-facing 'quality' setting into per-segment quantizers
// (modulated by each segment's susceptibility), derives the U/V quantizer
// deltas, then finalizes filter strengths and quantization matrices.
void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr.num_segments;
  const double amp = SNS_TO_DQ * enc->config->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  const double c_base = enc->config->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to accommodate for the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm[i].alpha;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);
    enc->dqm[i].quant = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant = enc->dqm[0].quant;

  // fill-in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm[i].quant = enc->base_quant;
  }

  // uv_alpha is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
                                         / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are quite more reactive to high quants (flat DC-blocks
  // tend to appear, and are unpleasant).
  dq_uv_dc = -4 * enc->config->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed

  enc->dq_y1_dc = 0;       // TODO(skal): dq-lum
  enc->dq_y2_dc = 0;
  enc->dq_y2_ac = 0;
  enc->dq_uv_dc = dq_uv_dc;
  enc->dq_uv_ac = dq_uv_ac;

  SetupFilterStrength(enc);   // initialize segments' filtering, eventually

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc);         // finalize quantization matrices
}
460
461
//------------------------------------------------------------------------------
462
// Form the predictions in cache
463
464
// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
465
// Offsets of the prediction samples within the 'yuv_p' cache, per mode.
// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index.
const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
static const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};
472
473
0
void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
474
0
  const uint8_t* const left = it->x ? it->y_left : NULL;
475
0
  const uint8_t* const top = it->y ? it->y_top : NULL;
476
0
  VP8EncPredLuma16(it->yuv_p, left, top);
477
0
}
478
479
0
void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
480
0
  const uint8_t* const left = it->x ? it->u_left : NULL;
481
0
  const uint8_t* const top = it->y ? it->uv_top : NULL;
482
0
  VP8EncPredChroma8(it->yuv_p, left, top);
483
0
}
484
485
// Form all the ten Intra4x4 predictions in the 'yuv_p' cache
486
// for the 4x4 block it->i4
487
0
// Form all the ten Intra4x4 predictions in the 'yuv_p' cache
// for the 4x4 block it->i4, using the top/top-right samples in it->i4_top.
static void MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p, it->i4_top);
}
490
491
//------------------------------------------------------------------------------
492
// Quantize
493
494
// Layout:
495
// +----+----+
496
// |YYYY|UUVV| 0
497
// |YYYY|UUVV| 4
498
// |YYYY|....| 8
499
// |YYYY|....| 12
500
// +----+----+
501
502
// Offsets of each 4x4 sub-block within the macroblock sample cache
// (see the layout diagram above), in raster order.
const uint16_t VP8Scan[16] = {  // Luma
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
};

// Same, for the four U then four V sub-blocks.
static const uint16_t VP8ScanUV[4 + 4] = {
  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
};
513
514
//------------------------------------------------------------------------------
515
// Distortion measurement
516
517
// Per-coefficient weights used for the texture-distortion measure
// (lower weights for higher frequencies), in raster order.
static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};

// Per-coefficient weights used by the trellis distortion term.
// When USE_TDISTO is disabled, all coefficients weigh equally.
static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12,  8,
  11, 10,  8,  6
#endif
};
531
532
// Init/Copy the common fields in score.
533
0
// Resets all rate/distortion accumulators of 'rd' and marks its score
// as worst-possible (MAX_COST) so any real candidate will beat it.
static void InitScore(VP8ModeScore* const rd) {
  rd->score = MAX_COST;
  rd->nz = 0;
  rd->H  = 0;
  rd->R  = 0;
  rd->SD = 0;
  rd->D  = 0;
}
541
542
static void CopyScore(VP8ModeScore* WEBP_RESTRICT const dst,
543
0
                      const VP8ModeScore* WEBP_RESTRICT const src) {
544
0
  dst->D  = src->D;
545
0
  dst->SD = src->SD;
546
0
  dst->R  = src->R;
547
0
  dst->H  = src->H;
548
0
  dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
549
0
  dst->score = src->score;
550
0
}
551
552
static void AddScore(VP8ModeScore* WEBP_RESTRICT const dst,
553
0
                     const VP8ModeScore* WEBP_RESTRICT const src) {
554
0
  dst->D  += src->D;
555
0
  dst->SD += src->SD;
556
0
  dst->R  += src->R;
557
0
  dst->H  += src->H;
558
0
  dst->nz |= src->nz;     // here, new nz bits are accumulated.
559
0
  dst->score += src->score;
560
0
}
561
562
//------------------------------------------------------------------------------
563
// Performs trellis-optimized quantization.
564
565
// Trellis node
566
// Trellis node: one candidate quantization level at a given position.
typedef struct {
  int8_t prev;            // best previous node
  int8_t sign;            // sign of coeff_i
  int16_t level;          // level
} Node;

// Score state: running best score for each candidate node at the current
// trellis position.
typedef struct {
  score_t score;          // partial RD score
  const uint16_t* costs;  // shortcut to cost tables
} ScoreState;
577
578
// If a coefficient was quantized to a value Q (using a neutral bias),
579
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
580
// We don't test negative values though.
581
0
#define MIN_DELTA 0   // how much lower level to try
582
0
#define MAX_DELTA 1   // how much higher
583
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
584
0
#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
585
0
#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
586
587
0
// Combines the rate (R + H) and distortion (D + SD) fields of 'rd' into a
// single rate-distortion score, weighted by 'lambda'.
static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
  const score_t rate = rd->R + rd->H;
  const score_t distortion = rd->D + rd->SD;
  rd->score = rate * lambda + RD_DISTO_MULT * distortion;
}
590
591
// Returns the trellis rate-distortion score: rate weighted by 'lambda'
// plus distortion scaled by RD_DISTO_MULT.
static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
                                          score_t distortion) {
  return rate * lambda + RD_DISTO_MULT * distortion;
}
595
596
// Coefficient type.
597
enum { TYPE_I16_AC = 0, TYPE_I16_DC = 1, TYPE_CHROMA_A = 2, TYPE_I4_AC = 3 };
598
599
// Trellis-optimized quantization of one 16-coefficient block.
// For each coefficient (in zigzag order) a small set of candidate levels
// around the neutrally-quantized value is evaluated; dynamic programming
// over these candidates picks the level sequence with the best
// rate-distortion score. On output, 'out' holds the chosen levels (zigzag
// order) and 'in' is overwritten with the corresponding dequantized
// coefficients. Returns non-zero if any level is non-zero.
// Note: for coeff_type == TYPE_I16_AC, coefficient #0 (DC) is skipped and
// its in[0]/out[0] values are preserved.
static int TrellisQuantizeBlock(const VP8Encoder* WEBP_RESTRICT const enc,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* WEBP_RESTRICT const mtx,
                                int lambda) {
  const ProbaArray* const probas = enc->proba.coeffs[coeff_type];
  CostArrayPtr const costs =
      (CostArrayPtr)enc->proba.remapped_costs[coeff_type];
  const int first = (coeff_type == TYPE_I16_AC) ? 1 : 0;
  Node nodes[16][NUM_NODES];
  ScoreState score_states[2][NUM_NODES];
  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
  score_t best_score;
  int n, m, p, last;

  {
    score_t cost;
    const int thresh = mtx->q[1] * mtx->q[1] / 4;
    const int last_proba = probas[VP8EncBands[first]][ctx0][0];

    // compute the position of the last interesting coefficient
    last = first - 1;
    for (n = 15; n >= first; --n) {
      const int j = kZigzag[n];
      const int err = in[j] * in[j];
      if (err > thresh) {
        last = n;
        break;
      }
    }
    // we don't need to go inspect up to n = 16 coeffs. We can just go up
    // to last + 1 (inclusive) without losing much.
    if (last < 15) ++last;

    // compute 'skip' score. This is the max score one can do.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, 0);

    // initialize source node.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
      ss_cur[m].costs = costs[first][ctx0];
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j = kZigzag[n];
    const uint32_t Q  = mtx->q[j];
    const uint32_t iQ = mtx->iq[j];
    const uint32_t B = BIAS(0x00);     // neutral bias
    // note: it's important to take sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen[j];
    int level0 = QUANTDIV(coeff0, iQ, B);
    // thresh_level caps the candidate levels to what a max-bias
    // quantization would produce.
    int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
    if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;

    {   // Swap current and previous score states
      ScoreState* const tmp = ss_cur;
      ss_cur = ss_prev;
      ss_prev = tmp;
    }

    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      const int level = level0 + m;
      const int ctx = (level > 2) ? 2 : level;
      const int band = VP8EncBands[n + 1];
      score_t base_score;
      score_t best_cur_score;
      int best_prev;
      score_t cost, score;

      ss_cur[m].costs = costs[n + 1][ctx];
      if (level < 0 || level > thresh_level) {
        ss_cur[m].score = MAX_COST;
        // Node is dead.
        continue;
      }

      {
        // Compute delta_error = how much coding this level will
        // subtract to max_error as distortion.
        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
        const int new_error = coeff0 - level * Q;
        const int delta_error =
            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
        base_score = RDScoreTrellis(lambda, 0, delta_error);
      }

      // Inspect all possible non-dead predecessors. Retain only the best one.
      // The base_score is added to all scores so it is only added for the final
      // value after the loop.
      cost = VP8LevelCost(ss_prev[-MIN_DELTA].costs, level);
      best_cur_score =
          ss_prev[-MIN_DELTA].score + RDScoreTrellis(lambda, cost, 0);
      best_prev = -MIN_DELTA;
      for (p = -MIN_DELTA + 1; p <= MAX_DELTA; ++p) {
        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
        // eliminated since their score can't be better than the current best.
        cost = VP8LevelCost(ss_prev[p].costs, level);
        // Examine node assuming it's a non-terminal one.
        score = ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
        if (score < best_cur_score) {
          best_cur_score = score;
          best_prev = p;
        }
      }
      best_cur_score += base_score;
      // Store best finding in current node.
      cur->sign = sign;
      cur->level = level;
      cur->prev = best_prev;
      ss_cur[m].score = best_cur_score;

      // Now, record best terminal node (and thus best entry in the graph).
      if (level != 0 && best_cur_score < best_score) {
        const score_t last_pos_cost =
            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
        const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
        score = best_cur_score + last_pos_score;
        if (score < best_score) {
          best_score = score;
          best_path[0] = n;                     // best eob position
          best_path[1] = m;                     // best node index
          best_path[2] = best_prev;             // best predecessor
        }
      }
    }
  }

  // Fresh start
  // Beware! We must preserve in[0]/out[0] value for TYPE_I16_AC case.
  if (coeff_type == TYPE_I16_AC) {
    memset(in + 1, 0, 15 * sizeof(*in));
    memset(out + 1, 0, 15 * sizeof(*out));
  } else {
    memset(in, 0, 16 * sizeof(*in));
    memset(out, 0, 16 * sizeof(*out));
  }
  if (best_path[0] == -1) {
    return 0;  // skip!
  }

  {
    // Unwind the best path.
    // Note: best-prev on terminal node is not necessarily equal to the
    // best_prev for non-terminal. So we patch best_path[2] in.
    int nz = 0;
    int best_node = best_path[1];
    n = best_path[0];
    NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal

    for (; n >= first; --n) {
      const Node* const node = &NODE(n, best_node);
      const int j = kZigzag[n];
      out[n] = node->sign ? -node->level : node->level;
      nz |= node->level;
      in[j] = out[n] * mtx->q[j];   // dequantized value, back in raster order
      best_node = node->prev;
    }
    return (nz != 0);
  }
}
770
771
#undef NODE
772
773
//------------------------------------------------------------------------------
774
// Performs: difference, transform, quantize, back-transform, add
775
// all at once. Output is the reconstructed block in *yuv_out, and the
776
// quantized levels in *levels.
777
778
// Reconstructs the whole 16x16 luma macroblock for intra16 prediction 'mode':
// computes the residual (src - ref), forward-transforms it, quantizes the
// AC coefficients (optionally with trellis) and the DC coefficients through
// the WHT, then inverse-transforms and adds back onto 'yuv_out'.
// Returns a non-zero-coefficient bitmask: bits 0..15 for the 4x4 AC blocks,
// bit 24 for the DC (WHT) block.
static int ReconstructIntra16(VP8EncIterator* WEBP_RESTRICT const it,
                              VP8ModeScore* WEBP_RESTRICT const rd,
                              uint8_t* WEBP_RESTRICT const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc;
  const uint8_t* const ref = it->yuv_p + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in + Y_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm[it->mb->segment];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  // Forward-transform all 16 4x4 sub-blocks (two at a time).
  for (n = 0; n < 16; n += 2) {
    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  // DC coefficients go through a separate Walsh-Hadamard transform pass.
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis) {
    // Trellis path: quantize each 4x4 block with a context derived from the
    // top/left non-zero flags, updating those flags as we go.
    int x, y;
    VP8IteratorNzToBytes(it);
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        const int ctx = it->top_nz[x] + it->left_nz[y];
        const int non_zero = TrellisQuantizeBlock(
            enc, tmp[n], rd->y_ac_levels[n], ctx, TYPE_I16_AC, &dqm->y1,
            dqm->lambda_trellis_i16);
        it->top_nz[x] = it->left_nz[y] = non_zero;
        rd->y_ac_levels[n][0] = 0;  // DC is coded separately (WHT above).
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; n += 2) {
      // Zero-out the first coeff, so that: a) nz is correct below, and
      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
      tmp[n][0] = tmp[n + 1][0] = 0;
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1) << n;
      assert(rd->y_ac_levels[n + 0][0] == 0);
      assert(rd->y_ac_levels[n + 1][0] == 0);
    }
  }

  // Transform back
  VP8TransformWHT(dc_tmp, tmp[0]);
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}
829
830
// Reconstructs one 4x4 luma sub-block for intra4 prediction 'mode':
// forward-transforms the residual, quantizes (trellis or plain), then
// inverse-transforms and adds onto 'yuv_out'. The quantized coefficients
// are stored in 'levels'. Returns non-zero if any coefficient survived.
static int ReconstructIntra4(VP8EncIterator* WEBP_RESTRICT const it,
                             int16_t levels[16],
                             const uint8_t* WEBP_RESTRICT const src,
                             uint8_t* WEBP_RESTRICT const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc;
  const uint8_t* const ref = it->yuv_p + VP8I4ModeOffsets[mode];
  const VP8SegmentInfo* const dqm = &enc->dqm[it->mb->segment];
  int nz = 0;
  int16_t tmp[16];

  VP8FTransform(src, ref, tmp);
  if (DO_TRELLIS_I4 && it->do_trellis) {
    // Context for trellis: non-zero flags of the top and left neighbors
    // of the current sub-block (it->i4 is the raster index 0..15).
    const int x = it->i4 & 3, y = it->i4 >> 2;
    const int ctx = it->top_nz[x] + it->left_nz[y];
    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, TYPE_I4_AC, &dqm->y1,
                              dqm->lambda_trellis_i4);
  } else {
    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1);
  }
  VP8ITransform(ref, tmp, yuv_out, 0);
  return nz;
}
853
854
//------------------------------------------------------------------------------
855
// DC-error diffusion
856
857
// Diffusion weights. We under-correct a bit (15/16th of the error is actually
858
// diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
859
0
#define C1 7    // fraction of error sent to the 4x4 block below
860
0
#define C2 8    // fraction of error sent to the 4x4 block on the right
861
0
#define DSHIFT 4
862
0
#define DSCALE 1   // storage descaling, needed to make the error fit int8_t
863
864
// Quantize as usual, but also compute and return the quantization error.
865
// Error is already divided by DSHIFT.
866
// Quantize the single coefficient '*v' in place using the DC entry of 'mtx',
// and return the (signed) quantization error, pre-scaled by DSCALE.
static int QuantizeSingle(int16_t* WEBP_RESTRICT const v,
                          const VP8Matrix* WEBP_RESTRICT const mtx) {
  int magnitude = *v;
  const int is_neg = (magnitude < 0);
  if (is_neg) magnitude = -magnitude;
  if (magnitude > (int)mtx->zthresh[0]) {
    // Above the dead-zone threshold: quantize and report the rounding error.
    const int quantized = QUANTDIV(magnitude, mtx->iq[0], mtx->bias[0]) * mtx->q[0];
    const int q_err = magnitude - quantized;
    *v = is_neg ? -quantized : quantized;
    return (is_neg ? -q_err : q_err) >> DSCALE;
  }
  // Below threshold: the value is dropped entirely and becomes pure error.
  *v = 0;
  return (is_neg ? -magnitude : magnitude) >> DSCALE;
}
880
881
// Applies DC error diffusion to the four 4x4 DC values of each chroma plane
// (U then V) before quantization: each block's DC absorbs a weighted share
// (C1/C2, scaled by DSHIFT/DSCALE) of its top/left neighbors' quantization
// errors, and the fresh errors err1..err3 are saved into rd->derr for the
// next macroblock. The diffusion order (err0 -> err1/err2 -> err3) matters:
// later blocks consume errors produced earlier in the same pass.
static void CorrectDCValues(const VP8EncIterator* WEBP_RESTRICT const it,
                            const VP8Matrix* WEBP_RESTRICT const mtx,
                            int16_t tmp[][16],
                            VP8ModeScore* WEBP_RESTRICT const rd) {
  //         | top[0] | top[1]
  // --------+--------+---------
  // left[0] | tmp[0]   tmp[1]  <->   err0 err1
  // left[1] | tmp[2]   tmp[3]        err2 err3
  //
  // Final errors {err1,err2,err3} are preserved and later restored
  // as top[]/left[] on the next block.
  int ch;
  for (ch = 0; ch <= 1; ++ch) {  // ch = 0: U plane, ch = 1: V plane
    const int8_t* const top = it->top_derr[it->x][ch];
    const int8_t* const left = it->left_derr[ch];
    int16_t (* const c)[16] = &tmp[ch * 4];  // 4 blocks of this chroma plane
    int err0, err1, err2, err3;
    c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
    err0 = QuantizeSingle(&c[0][0], mtx);
    c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
    err1 = QuantizeSingle(&c[1][0], mtx);
    c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
    err2 = QuantizeSingle(&c[2][0], mtx);
    c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
    err3 = QuantizeSingle(&c[3][0], mtx);
    // error 'err' is bounded by mtx->q[0] which is 132 at max. Hence
    // err >> DSCALE will fit in an int8_t type if DSCALE>=1.
    assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
    rd->derr[ch][0] = (int8_t)err1;
    rd->derr[ch][1] = (int8_t)err2;
    rd->derr[ch][2] = (int8_t)err3;
  }
}
914
915
static void StoreDiffusionErrors(VP8EncIterator* WEBP_RESTRICT const it,
916
0
                                 const VP8ModeScore* WEBP_RESTRICT const rd) {
917
0
  int ch;
918
0
  for (ch = 0; ch <= 1; ++ch) {
919
0
    int8_t* const top = it->top_derr[it->x][ch];
920
0
    int8_t* const left = it->left_derr[ch];
921
0
    left[0] = rd->derr[ch][0];            // restore err1
922
0
    left[1] = 3 * rd->derr[ch][2] >> 2;   //     ... 3/4th of err3
923
0
    top[0]  = rd->derr[ch][1];            //     ... err2
924
0
    top[1]  = rd->derr[ch][2] - left[1];  //     ... 1/4th of err3.
925
0
  }
926
0
}
927
928
#undef C1
929
#undef C2
930
#undef DSHIFT
931
#undef DSCALE
932
933
//------------------------------------------------------------------------------
934
935
// Reconstructs the 8x8 U and 8x8 V chroma blocks for prediction 'mode':
// forward-transforms the residuals, optionally applies DC error diffusion,
// quantizes (trellis is compiled out via DO_TRELLIS_UV=0), then
// inverse-transforms onto 'yuv_out'. Returns the non-zero bitmask shifted
// into the chroma bit range (bits 16..23).
static int ReconstructUV(VP8EncIterator* WEBP_RESTRICT const it,
                         VP8ModeScore* WEBP_RESTRICT const rd,
                         uint8_t* WEBP_RESTRICT const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc;
  const uint8_t* const ref = it->yuv_p + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in + U_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm[it->mb->segment];
  int nz = 0;
  int n;
  int16_t tmp[8][16];   // 4 U sub-blocks followed by 4 V sub-blocks

  for (n = 0; n < 8; n += 2) {
    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
  // top_derr != NULL signals that DC error diffusion is enabled.
  if (it->top_derr != NULL) CorrectDCValues(it, &dqm->uv, tmp, rd);

  if (DO_TRELLIS_UV && it->do_trellis) {
    int ch, x, y;
    for (ch = 0, n = 0; ch <= 2; ch += 2) {  // ch = 0: U, ch = 2: V
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz[4 + ch + x] + it->left_nz[4 + ch + y];
          const int non_zero = TrellisQuantizeBlock(
              enc, tmp[n], rd->uv_levels[n], ctx, TYPE_CHROMA_A, &dqm->uv,
              dqm->lambda_trellis_uv);
          it->top_nz[4 + ch + x] = it->left_nz[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; n += 2) {
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv) << n;
    }
  }

  for (n = 0; n < 8; n += 2) {
    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
  }
  return (nz << 16);   // place chroma nz flags in bits 16..23
}
976
977
//------------------------------------------------------------------------------
978
// RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost.
979
// Pick the mode is lower RD-cost = Rate + lambda * Distortion.
980
981
0
// Records in dqm->max_edge the largest magnitude among the first three AC
// coefficients (positions 1, 2 and 4); this approximates the average delta
// between neighboring sub-4x4 blocks and is later used for filter strength.
static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
  int largest = abs(DCs[1]);
  if (abs(DCs[2]) > largest) largest = abs(DCs[2]);
  if (abs(DCs[4]) > largest) largest = abs(DCs[4]);
  if (largest > dqm->max_edge) dqm->max_edge = largest;
}
991
992
0
static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
993
0
  VP8ModeScore* const tmp = *a;
994
0
  *a = *b;
995
0
  *b = tmp;
996
0
}
997
998
0
static void SwapPtr(uint8_t** a, uint8_t** b) {
999
0
  uint8_t* const tmp = *a;
1000
0
  *a = *b;
1001
0
  *b = tmp;
1002
0
}
1003
1004
0
static void SwapOut(VP8EncIterator* const it) {
1005
0
  SwapPtr(&it->yuv_out, &it->yuv_out2);
1006
0
}
1007
1008
// Tries every intra16 luma prediction mode, reconstructing into the scratch
// buffer and keeping the one with the lowest RD score in *rd. The candidate
// score struct and the output buffer are ping-ponged together (SwapModeScore
// + SwapOut), so the best reconstruction always ends up in it->yuv_out.
static void PickBestIntra16(VP8EncIterator* WEBP_RESTRICT const it,
                            VP8ModeScore* WEBP_RESTRICT rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc->dqm[it->mb->segment];
  const int lambda = dqm->lambda_i16;
  const int tlambda = dqm->tlambda;
  const uint8_t* const src = it->yuv_in + Y_OFF_ENC;
  VP8ModeScore rd_tmp;
  VP8ModeScore* rd_cur = &rd_tmp;
  VP8ModeScore* rd_best = rd;
  int mode;
  int is_flat = IsFlatSource16(it->yuv_in + Y_OFF_ENC);

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2 + Y_OFF_ENC;  // scratch buffer
    rd_cur->mode_i16 = mode;

    // Reconstruct
    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);

    // Measure RD-score
    rd_cur->D = VP8SSE16x16(src, tmp_dst);
    rd_cur->SD =
        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
    rd_cur->H = VP8FixedCostsI16[mode];
    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
    if (is_flat) {
      // refine the first impression (which was in pixel space)
      is_flat = IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16);
      if (is_flat) {
        // Block is very flat. We put emphasis on the distortion being very low!
        rd_cur->D *= 2;
        rd_cur->SD *= 2;
      }
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, rd_cur);
    if (mode == 0 || rd_cur->score < rd_best->score) {
      // Keep this candidate: swap score structs and output buffers together.
      SwapModeScore(&rd_cur, &rd_best);
      SwapOut(it);
    }
  }
  if (rd_best != rd) {  // the winner may still live in rd_tmp; copy it out
    memcpy(rd, rd_best, sizeof(*rd));
  }
  SetRDScore(dqm->lambda_mode, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record max delta so we can later adjust the minimal filtering
  // strength needed to smooth these blocks out.
  if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}
1065
1066
//------------------------------------------------------------------------------
1067
1068
// return the cost array corresponding to the surrounding prediction modes.
1069
static const uint16_t* GetCostModeI4(VP8EncIterator* WEBP_RESTRICT const it,
                                     const uint8_t modes[16]) {
  const int preds_w = it->enc->preds_w;
  // (x, y) position of the current 4x4 sub-block within the macroblock.
  const int x = (it->i4 & 3), y = it->i4 >> 2;
  // Left/top contexts come from modes[] already decided inside this
  // macroblock, or — on the macroblock border — from the neighboring
  // macroblock's predictors stored in it->preds (hence the negative offsets).
  const int left = (x == 0) ? it->preds[y * preds_w - 1] : modes[it->i4 - 1];
  const int top = (y == 0) ? it->preds[-preds_w + x] : modes[it->i4 - 4];
  return VP8FixedCostsI4[top][left];
}
1077
1078
// Evaluates intra4x4 prediction for the whole macroblock, picking the best
// mode per 4x4 sub-block by RD score. Returns 1 when intra4 beats the
// intra16 result already stored in *rd (and commits it), 0 to keep intra16.
// Bails out early when the running intra4 score exceeds intra16's, or when
// the accumulated header bits exceed enc->max_i4_header_bits.
static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
                          VP8ModeScore* WEBP_RESTRICT const rd) {
  const VP8Encoder* const enc = it->enc;
  const VP8SegmentInfo* const dqm = &enc->dqm[it->mb->segment];
  const int lambda = dqm->lambda_i4;
  const int tlambda = dqm->tlambda;
  const uint8_t* const src0 = it->yuv_in + Y_OFF_ENC;
  uint8_t* const best_blocks = it->yuv_out2 + Y_OFF_ENC;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits == 0) {
    return 0;   // intra4 disabled altogether
  }

  InitScore(&rd_best);
  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4];
    uint8_t* tmp_dst = it->yuv_p + I4TMP;    // scratch buffer.

    InitScore(&rd_i4);
    MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];

      // Add flatness penalty, to avoid flat area to be mispredicted
      // by a complex mode.
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
      } else {
        rd_tmp.R = 0;
      }

      // early-out check
      SetRDScore(lambda, &rd_tmp);
      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;

      // finish computing score
      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
      SetRDScore(lambda, &rd_tmp);

      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        // Keep the reconstruction: swap the scratch and best-block pointers.
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4], tmp_levels,
               sizeof(rd_best.y_ac_levels[it->i4]));
      }
    }
    SetRDScore(dqm->lambda_mode, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    if (rd_best.score >= rd->score) {
      return 0;   // intra4 can no longer beat intra16
    }
    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
    if (total_header_bits > enc->max_i4_header_bits) {
      return 0;   // header budget exceeded; fall back to intra16
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4]);
    }
    rd->modes_i4[it->i4] = best_mode;
    it->top_nz[it->i4 & 3] = it->left_nz[it->i4 >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);   // the intra4 reconstruction was built in yuv_out2
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}
1172
1173
//------------------------------------------------------------------------------
1174
1175
// Tries every chroma prediction mode, keeps the best by RD score and commits
// its reconstruction, levels and (when enabled) diffusion errors into *rd.
// The winning pixels are copied back to it->yuv_out if they ended up in the
// scratch buffer.
static void PickBestUV(VP8EncIterator* WEBP_RESTRICT const it,
                       VP8ModeScore* WEBP_RESTRICT const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc->dqm[it->mb->segment];
  const int lambda = dqm->lambda_uv;
  const uint8_t* const src = it->yuv_in + U_OFF_ENC;
  uint8_t* tmp_dst = it->yuv_out2 + U_OFF_ENC;  // scratch buffer
  uint8_t* dst0 = it->yuv_out + U_OFF_ENC;
  uint8_t* dst = dst0;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score
    rd_uv.D  = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;    // not calling TDisto here: it tends to flatten areas.
    rd_uv.H  = VP8FixedCostsUV[mode];
    rd_uv.R  = VP8GetCostUV(it, &rd_uv);
    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
    }

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      if (it->top_derr != NULL) {   // error diffusion is enabled
        memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
      }
      // Keep the winning pixels: swap destination and scratch pointers.
      SwapPtr(&dst, &tmp_dst);
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
  if (dst != dst0) {   // copy 16x8 block if needed
    VP8Copy16x8(dst, dst0);
  }
  if (it->top_derr != NULL) {  // store diffusion errors for next block
    StoreDiffusionErrors(it, rd);
  }
}
1224
1225
//------------------------------------------------------------------------------
1226
// Final reconstruction and quantization.
1227
1228
// Final reconstruction pass with the already-decided modes: quantizes and
// reconstructs luma (intra16 or intra4 depending on the macroblock type)
// and chroma directly into it->yuv_out, recording the non-zero bitmask in
// rd->nz. Used notably to re-run quantization with trellis enabled.
static void SimpleQuantize(VP8EncIterator* WEBP_RESTRICT const it,
                           VP8ModeScore* WEBP_RESTRICT const rd) {
  const VP8Encoder* const enc = it->enc;
  const int is_i16 = (it->mb->type == 1);   // type 1 == intra16x16
  int nz = 0;

  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out + Y_OFF_ENC, it->preds[0]);
  } else {
    VP8IteratorStartI4(it);
    do {
      // Fetch the mode already stored in the predictors map for this 4x4.
      const int mode =
          it->preds[(it->i4 & 3) + (it->i4 >> 2) * enc->preds_w];
      const uint8_t* const src = it->yuv_in + Y_OFF_ENC + VP8Scan[it->i4];
      uint8_t* const dst = it->yuv_out + Y_OFF_ENC + VP8Scan[it->i4];
      MakeIntra4Preds(it);
      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4],
                              src, dst, mode) << it->i4;
    } while (VP8IteratorRotateI4(it, it->yuv_out + Y_OFF_ENC));
  }

  nz |= ReconstructUV(it, rd, it->yuv_out + U_OFF_ENC, it->mb->uv_mode);
  rd->nz = nz;
}
1252
1253
// Refine intra16/intra4 sub-modes based on distortion only (not rate).
1254
// Refine intra16/intra4 sub-modes based on distortion only (not rate).
// Fast mode decision used when full RD-opt is off: picks intra16 and/or
// intra4 modes by SSE distortion plus a fixed per-mode bit-cost weighted by
// empiric lambda constants, then reconstructs the chosen configuration.
// 'try_both_modes' allows intra4 to challenge intra16; 'refine_uv_mode'
// re-picks the chroma mode the same way.
static void RefineUsingDistortion(VP8EncIterator* WEBP_RESTRICT const it,
                                  int try_both_modes, int refine_uv_mode,
                                  VP8ModeScore* WEBP_RESTRICT const rd) {
  score_t best_score = MAX_COST;
  int nz = 0;
  int mode;
  int is_i16 = try_both_modes || (it->mb->type == 1);

  const VP8SegmentInfo* const dqm = &it->enc->dqm[it->mb->segment];
  // Some empiric constants, of approximate order of magnitude.
  const int lambda_d_i16 = 106;
  const int lambda_d_i4 = 11;
  const int lambda_d_uv = 120;
  score_t score_i4 = dqm->i4_penalty;   // constant handicap for intra4's rate
  score_t i4_bit_sum = 0;
  const score_t bit_limit = try_both_modes ? it->enc->mb_header_limit
                                           : MAX_COST;  // no early-out allowed

  if (is_i16) {   // First, evaluate Intra16 distortion
    int best_mode = -1;
    const uint8_t* const src = it->yuv_in + Y_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p + VP8I16ModeOffsets[mode];
      const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsI16[mode] * lambda_d_i16;
      // mode 0 is never skipped, so best_mode is always set below.
      if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
        continue;
      }

      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    if (it->x == 0 || it->y == 0) {
      // avoid starting a checkerboard resonance from the border. See bug #432.
      if (IsFlatSource16(src)) {
        best_mode = (it->x == 0) ? 0 : 2;
        try_both_modes = 0;  // stick to i16
      }
    }
    VP8SetIntra16Mode(it, best_mode);
    // we'll reconstruct later, if i16 mode actually gets selected
  }

  // Next, evaluate Intra4
  if (try_both_modes || !is_i16) {
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    is_i16 = 0;
    VP8IteratorStartI4(it);
    do {
      int best_i4_mode = -1;
      score_t best_i4_score = MAX_COST;
      const uint8_t* const src = it->yuv_in + Y_OFF_ENC + VP8Scan[it->i4];
      const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);

      MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
                            + mode_costs[mode] * lambda_d_i4;
        if (score < best_i4_score) {
          best_i4_mode = mode;
          best_i4_score = score;
        }
      }
      i4_bit_sum += mode_costs[best_i4_mode];
      rd->modes_i4[it->i4] = best_i4_mode;
      score_i4 += best_i4_score;
      if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
        is_i16 = 1;
        break;
      } else {  // reconstruct partial block inside yuv_out2 buffer
        uint8_t* const tmp_dst = it->yuv_out2 + Y_OFF_ENC + VP8Scan[it->i4];
        nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4],
                                src, tmp_dst, best_i4_mode) << it->i4;
      }
    } while (VP8IteratorRotateI4(it, it->yuv_out2 + Y_OFF_ENC));
  }

  // Final reconstruction, depending on which mode is selected.
  if (!is_i16) {
    VP8SetIntra4Mode(it, rd->modes_i4);
    SwapOut(it);   // intra4 pixels were built in yuv_out2
    best_score = score_i4;
  } else {
    nz = ReconstructIntra16(it, rd, it->yuv_out + Y_OFF_ENC, it->preds[0]);
  }

  // ... and UV!
  if (refine_uv_mode) {
    int best_mode = -1;
    score_t best_uv_score = MAX_COST;
    const uint8_t* const src = it->yuv_in + U_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p + VP8UVModeOffsets[mode];
      const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsUV[mode] * lambda_d_uv;
      if (score < best_uv_score) {
        best_mode = mode;
        best_uv_score = score;
      }
    }
    VP8SetIntraUVMode(it, best_mode);
  }
  nz |= ReconstructUV(it, rd, it->yuv_out + U_OFF_ENC, it->mb->uv_mode);

  rd->nz = nz;
  rd->score = best_score;
}
1366
1367
//------------------------------------------------------------------------------
1368
// Entry point
1369
1370
// Entry point for per-macroblock mode decision and quantization.
// Depending on 'rd_opt', performs full RD search (PickBestIntra16/4 + UV,
// optionally finishing with a trellis pass) or the faster distortion-based
// refinement. Returns 1 when the macroblock is fully skippable (no non-zero
// coefficients), and records the skip flag on the iterator.
int VP8Decimate(VP8EncIterator* WEBP_RESTRICT const it,
                VP8ModeScore* WEBP_RESTRICT const rd,
                VP8RDLevel rd_opt) {
  int is_skipped;
  const int method = it->enc->method;

  InitScore(rd);

  // We can perform predictions for Luma16x16 and Chroma8x8 already.
  // Luma4x4 predictions needs to be done as-we-go.
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

  if (rd_opt > RD_OPT_NONE) {
    it->do_trellis = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
      it->do_trellis = 1;
      SimpleQuantize(it, rd);
    }
  } else {
    // At this point we have heuristically decided intra16 / intra4.
    // For method >= 2, pick the best intra4/intra16 based on SSE (~tad slower).
    // For method <= 1, we don't re-examine the decision but just go ahead with
    // quantization/reconstruction.
    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
  }
  is_skipped = (rd->nz == 0);
  VP8SetSkip(it, is_skipped);
  return is_skipped;
}