Coverage Report

Created: 2024-07-27 06:27

/src/libwebp/src/enc/quant_enc.c
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2011 Google Inc. All Rights Reserved.
2
//
3
// Use of this source code is governed by a BSD-style license
4
// that can be found in the COPYING file in the root of the source
5
// tree. An additional intellectual property rights grant can be found
6
// in the file PATENTS. All contributing project authors may
7
// be found in the AUTHORS file in the root of the source tree.
8
// -----------------------------------------------------------------------------
9
//
10
//   Quantization
11
//
12
// Author: Skal (pascal.massimino@gmail.com)
13
14
#include <assert.h>
15
#include <math.h>
16
#include <stdlib.h>  // for abs()
17
18
#include "src/dsp/quant.h"
19
#include "src/enc/vp8i_enc.h"
20
#include "src/enc/cost_enc.h"
21
22
0
#define DO_TRELLIS_I4  1
23
0
#define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
24
0
#define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth.
25
#define USE_TDISTO 1
26
27
0
#define MID_ALPHA 64      // neutral value for susceptibility
28
0
#define MIN_ALPHA 30      // lowest usable value for susceptibility
29
0
#define MAX_ALPHA 100     // higher meaningful value for susceptibility
30
31
0
#define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
32
                          // power-law modulation. Must be strictly less than 1.
33
34
// number of non-zero coeffs below which we consider the block very flat
35
// (and apply a penalty to complex predictions)
36
0
#define FLATNESS_LIMIT_I16 0       // I16 mode (special case)
37
0
#define FLATNESS_LIMIT_I4  3       // I4 mode
38
0
#define FLATNESS_LIMIT_UV  2       // UV mode
39
0
#define FLATNESS_PENALTY   140     // roughly ~1bit per block
40
41
0
#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
42
43
0
#define RD_DISTO_MULT      256  // distortion multiplier (equivalent of lambda)
44
45
// #define DEBUG_BLOCK
46
47
//------------------------------------------------------------------------------
48
49
#if defined(DEBUG_BLOCK)
50
51
#include <stdio.h>
52
#include <stdlib.h>
53
54
static void PrintBlockInfo(const VP8EncIterator* const it,
55
                           const VP8ModeScore* const rd) {
56
  int i, j;
57
  const int is_i16 = (it->mb_->type_ == 1);
58
  const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
59
  const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
60
  const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
61
  const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
62
  printf("SOURCE / OUTPUT / ABS DELTA\n");
63
  for (j = 0; j < 16; ++j) {
64
    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
65
    printf("     ");
66
    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
67
    printf("     ");
68
    for (i = 0; i < 16; ++i) {
69
      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
70
    }
71
    printf("\n");
72
  }
73
  printf("\n");   // newline before the U/V block
74
  for (j = 0; j < 8; ++j) {
75
    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
76
    printf(" ");
77
    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
78
    printf("    ");
79
    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
80
    printf(" ");
81
    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
82
    printf("   ");
83
    for (i = 0; i < 8; ++i) {
84
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
85
    }
86
    printf(" ");
87
    for (i = 8; i < 16; ++i) {
88
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
89
    }
90
    printf("\n");
91
  }
92
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
93
    (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
94
    (int)rd->score);
95
  if (is_i16) {
96
    printf("Mode: %d\n", rd->mode_i16);
97
    printf("y_dc_levels:");
98
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
99
    printf("\n");
100
  } else {
101
    printf("Modes[16]: ");
102
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
103
    printf("\n");
104
  }
105
  printf("y_ac_levels:\n");
106
  for (j = 0; j < 16; ++j) {
107
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
108
      printf("%4d ", rd->y_ac_levels[j][i]);
109
    }
110
    printf("\n");
111
  }
112
  printf("\n");
113
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
114
  for (j = 0; j < 8; ++j) {
115
    for (i = 0; i < 16; ++i) {
116
      printf("%4d ", rd->uv_levels[j][i]);
117
    }
118
    printf("\n");
119
  }
120
}
121
122
#endif   // DEBUG_BLOCK
123
124
//------------------------------------------------------------------------------
125
126
0
static WEBP_INLINE int clip(int v, int m, int M) {
127
0
  return v < m ? m : v > M ? M : v;
128
0
}
129
130
static const uint8_t kZigzag[16] = {
131
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
132
};
133
134
static const uint8_t kDcTable[128] = {
135
  4,     5,   6,   7,   8,   9,  10,  10,
136
  11,   12,  13,  14,  15,  16,  17,  17,
137
  18,   19,  20,  20,  21,  21,  22,  22,
138
  23,   23,  24,  25,  25,  26,  27,  28,
139
  29,   30,  31,  32,  33,  34,  35,  36,
140
  37,   37,  38,  39,  40,  41,  42,  43,
141
  44,   45,  46,  46,  47,  48,  49,  50,
142
  51,   52,  53,  54,  55,  56,  57,  58,
143
  59,   60,  61,  62,  63,  64,  65,  66,
144
  67,   68,  69,  70,  71,  72,  73,  74,
145
  75,   76,  76,  77,  78,  79,  80,  81,
146
  82,   83,  84,  85,  86,  87,  88,  89,
147
  91,   93,  95,  96,  98, 100, 101, 102,
148
  104, 106, 108, 110, 112, 114, 116, 118,
149
  122, 124, 126, 128, 130, 132, 134, 136,
150
  138, 140, 143, 145, 148, 151, 154, 157
151
};
152
153
static const uint16_t kAcTable[128] = {
154
  4,     5,   6,   7,   8,   9,  10,  11,
155
  12,   13,  14,  15,  16,  17,  18,  19,
156
  20,   21,  22,  23,  24,  25,  26,  27,
157
  28,   29,  30,  31,  32,  33,  34,  35,
158
  36,   37,  38,  39,  40,  41,  42,  43,
159
  44,   45,  46,  47,  48,  49,  50,  51,
160
  52,   53,  54,  55,  56,  57,  58,  60,
161
  62,   64,  66,  68,  70,  72,  74,  76,
162
  78,   80,  82,  84,  86,  88,  90,  92,
163
  94,   96,  98, 100, 102, 104, 106, 108,
164
  110, 112, 114, 116, 119, 122, 125, 128,
165
  131, 134, 137, 140, 143, 146, 149, 152,
166
  155, 158, 161, 164, 167, 170, 173, 177,
167
  181, 185, 189, 193, 197, 201, 205, 209,
168
  213, 217, 221, 225, 229, 234, 239, 245,
169
  249, 254, 259, 264, 269, 274, 279, 284
170
};
171
172
static const uint16_t kAcTable2[128] = {
173
  8,     8,   9,  10,  12,  13,  15,  17,
174
  18,   20,  21,  23,  24,  26,  27,  29,
175
  31,   32,  34,  35,  37,  38,  40,  41,
176
  43,   44,  46,  48,  49,  51,  52,  54,
177
  55,   57,  58,  60,  62,  63,  65,  66,
178
  68,   69,  71,  72,  74,  75,  77,  79,
179
  80,   82,  83,  85,  86,  88,  89,  93,
180
  96,   99, 102, 105, 108, 111, 114, 117,
181
  120, 124, 127, 130, 133, 136, 139, 142,
182
  145, 148, 151, 155, 158, 161, 164, 167,
183
  170, 173, 176, 179, 184, 189, 193, 198,
184
  203, 207, 212, 217, 221, 226, 230, 235,
185
  240, 244, 249, 254, 258, 263, 268, 274,
186
  280, 286, 292, 299, 305, 311, 317, 323,
187
  330, 336, 342, 348, 354, 362, 370, 379,
188
  385, 393, 401, 409, 416, 424, 432, 440
189
};
190
191
static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
192
  { 96, 110 }, { 96, 108 }, { 110, 115 }
193
};
194
195
// Sharpening by (slightly) raising the hi-frequency coeffs.
196
// Hack-ish but helpful for mid-bitrate range. Use with care.
197
0
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
198
static const uint8_t kFreqSharpening[16] = {
199
  0,  30, 60, 90,
200
  30, 60, 90, 90,
201
  60, 90, 90, 90,
202
  90, 90, 90, 90
203
};
204
205
//------------------------------------------------------------------------------
206
// Initialize quantization parameters in VP8Matrix
207
208
// Returns the average quantizer
209
0
static int ExpandMatrix(VP8Matrix* const m, int type) {
210
0
  int i, sum;
211
0
  for (i = 0; i < 2; ++i) {
212
0
    const int is_ac_coeff = (i > 0);
213
0
    const int bias = kBiasMatrices[type][is_ac_coeff];
214
0
    m->iq_[i] = (1 << QFIX) / m->q_[i];
215
0
    m->bias_[i] = BIAS(bias);
216
    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
217
    //   * zero if coeff <= zthresh
218
    //   * non-zero if coeff > zthresh
219
0
    m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
220
0
  }
221
0
  for (i = 2; i < 16; ++i) {
222
0
    m->q_[i] = m->q_[1];
223
0
    m->iq_[i] = m->iq_[1];
224
0
    m->bias_[i] = m->bias_[1];
225
0
    m->zthresh_[i] = m->zthresh_[1];
226
0
  }
227
0
  for (sum = 0, i = 0; i < 16; ++i) {
228
0
    if (type == 0) {  // we only use sharpening for AC luma coeffs
229
0
      m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
230
0
    } else {
231
0
      m->sharpen_[i] = 0;
232
0
    }
233
0
    sum += m->q_[i];
234
0
  }
235
0
  return (sum + 8) >> 4;
236
0
}
237
238
0
static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; }
239
240
0
static void SetupMatrices(VP8Encoder* enc) {
241
0
  int i;
242
0
  const int tlambda_scale =
243
0
    (enc->method_ >= 4) ? enc->config_->sns_strength
244
0
                        : 0;
245
0
  const int num_segments = enc->segment_hdr_.num_segments_;
246
0
  for (i = 0; i < num_segments; ++i) {
247
0
    VP8SegmentInfo* const m = &enc->dqm_[i];
248
0
    const int q = m->quant_;
249
0
    int q_i4, q_i16, q_uv;
250
0
    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
251
0
    m->y1_.q_[1] = kAcTable[clip(q,                  0, 127)];
252
253
0
    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
254
0
    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];
255
256
0
    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
257
0
    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];
258
259
0
    q_i4  = ExpandMatrix(&m->y1_, 0);
260
0
    q_i16 = ExpandMatrix(&m->y2_, 1);
261
0
    q_uv  = ExpandMatrix(&m->uv_, 2);
262
263
0
    m->lambda_i4_          = (3 * q_i4 * q_i4) >> 7;
264
0
    m->lambda_i16_         = (3 * q_i16 * q_i16);
265
0
    m->lambda_uv_          = (3 * q_uv * q_uv) >> 6;
266
0
    m->lambda_mode_        = (1 * q_i4 * q_i4) >> 7;
267
0
    m->lambda_trellis_i4_  = (7 * q_i4 * q_i4) >> 3;
268
0
    m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
269
0
    m->lambda_trellis_uv_  = (q_uv * q_uv) << 1;
270
0
    m->tlambda_            = (tlambda_scale * q_i4) >> 5;
271
272
    // none of these constants should be < 1
273
0
    CheckLambdaValue(&m->lambda_i4_);
274
0
    CheckLambdaValue(&m->lambda_i16_);
275
0
    CheckLambdaValue(&m->lambda_uv_);
276
0
    CheckLambdaValue(&m->lambda_mode_);
277
0
    CheckLambdaValue(&m->lambda_trellis_i4_);
278
0
    CheckLambdaValue(&m->lambda_trellis_i16_);
279
0
    CheckLambdaValue(&m->lambda_trellis_uv_);
280
0
    CheckLambdaValue(&m->tlambda_);
281
282
0
    m->min_disto_ = 20 * m->y1_.q_[0];   // quantization-aware min disto
283
0
    m->max_edge_  = 0;
284
285
0
    m->i4_penalty_ = 1000 * q_i4 * q_i4;
286
0
  }
287
0
}
288
289
//------------------------------------------------------------------------------
290
// Initialize filtering parameters
291
292
// Very small filter-strength values have close to no visual effect. So we can
293
// save a little decoding-CPU by turning filtering off for these.
294
0
#define FSTRENGTH_CUTOFF 2
295
296
0
static void SetupFilterStrength(VP8Encoder* const enc) {
297
0
  int i;
298
  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
299
0
  const int level0 = 5 * enc->config_->filter_strength;
300
0
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
301
0
    VP8SegmentInfo* const m = &enc->dqm_[i];
302
    // We focus on the quantization of AC coeffs.
303
0
    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
304
0
    const int base_strength =
305
0
        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
306
    // Segments with lower complexity ('beta') will be less filtered.
307
0
    const int f = base_strength * level0 / (256 + m->beta_);
308
0
    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
309
0
  }
310
  // We record the initial strength (mainly for the case of 1-segment only).
311
0
  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
312
0
  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
313
0
  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
314
0
}
315
316
//------------------------------------------------------------------------------
317
318
// Note: if you change the values below, remember that the max range
319
// allowed by the syntax for DQ_UV is [-16,16].
320
0
#define MAX_DQ_UV (6)
321
0
#define MIN_DQ_UV (-4)
322
323
// We want to emulate jpeg-like behaviour where the expected "good" quality
324
// is around q=75. Internally, our "good" middle is around c=50. So we
325
// map accordingly using linear piece-wise function
326
0
static double QualityToCompression(double c) {
327
0
  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
328
  // The file size roughly scales as pow(quantizer, 3.). Actually, the
329
  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
330
  // in the mid-quant range. So we scale the compressibility inversely to
331
  // this power-law: quant ~= compression ^ 1/3. This law holds well for
332
  // low quant. Finer modeling for high-quant would make use of kAcTable[]
333
  // more explicitly.
334
0
  const double v = pow(linear_c, 1 / 3.);
335
0
  return v;
336
0
}
337
338
0
static double QualityToJPEGCompression(double c, double alpha) {
339
  // We map the complexity 'alpha' and quality setting 'c' to a compression
340
  // exponent empirically matched to the compression curve of libjpeg6b.
341
  // On average, the WebP output size will be roughly similar to that of a
342
  // JPEG file compressed with same quality factor.
343
0
  const double amin = 0.30;
344
0
  const double amax = 0.85;
345
0
  const double exp_min = 0.4;
346
0
  const double exp_max = 0.9;
347
0
  const double slope = (exp_min - exp_max) / (amax - amin);
348
  // Linearly interpolate 'expn' from exp_min to exp_max
349
  // in the [amin, amax] range.
350
0
  const double expn = (alpha > amax) ? exp_min
351
0
                    : (alpha < amin) ? exp_max
352
0
                    : exp_max + slope * (alpha - amin);
353
0
  const double v = pow(c, expn);
354
0
  return v;
355
0
}
356
357
static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
358
0
                                 const VP8SegmentInfo* const S2) {
359
0
  return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
360
0
}
361
362
0
static void SimplifySegments(VP8Encoder* const enc) {
363
0
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
364
  // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
365
  // explicit check is needed to avoid a spurious warning about 'i' exceeding
366
  // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
367
0
  const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
368
0
                               ? enc->segment_hdr_.num_segments_
369
0
                               : NUM_MB_SEGMENTS;
370
0
  int num_final_segments = 1;
371
0
  int s1, s2;
372
0
  for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
373
0
    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
374
0
    int found = 0;
375
    // check if we already have similar segment
376
0
    for (s2 = 0; s2 < num_final_segments; ++s2) {
377
0
      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
378
0
      if (SegmentsAreEquivalent(S1, S2)) {
379
0
        found = 1;
380
0
        break;
381
0
      }
382
0
    }
383
0
    map[s1] = s2;
384
0
    if (!found) {
385
0
      if (num_final_segments != s1) {
386
0
        enc->dqm_[num_final_segments] = enc->dqm_[s1];
387
0
      }
388
0
      ++num_final_segments;
389
0
    }
390
0
  }
391
0
  if (num_final_segments < num_segments) {  // Remap
392
0
    int i = enc->mb_w_ * enc->mb_h_;
393
0
    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
394
0
    enc->segment_hdr_.num_segments_ = num_final_segments;
395
    // Replicate the trailing segment infos (it's mostly cosmetics)
396
0
    for (i = num_final_segments; i < num_segments; ++i) {
397
0
      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
398
0
    }
399
0
  }
400
0
}
401
402
0
void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
403
0
  int i;
404
0
  int dq_uv_ac, dq_uv_dc;
405
0
  const int num_segments = enc->segment_hdr_.num_segments_;
406
0
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
407
0
  const double Q = quality / 100.;
408
0
  const double c_base = enc->config_->emulate_jpeg_size ?
409
0
      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
410
0
      QualityToCompression(Q);
411
0
  for (i = 0; i < num_segments; ++i) {
412
    // We modulate the base coefficient to accommodate for the quantization
413
    // susceptibility and allow denser segments to be quantized more.
414
0
    const double expn = 1. - amp * enc->dqm_[i].alpha_;
415
0
    const double c = pow(c_base, expn);
416
0
    const int q = (int)(127. * (1. - c));
417
0
    assert(expn > 0.);
418
0
    enc->dqm_[i].quant_ = clip(q, 0, 127);
419
0
  }
420
421
  // purely indicative in the bitstream (except for the 1-segment case)
422
0
  enc->base_quant_ = enc->dqm_[0].quant_;
423
424
  // fill-in values for the unused segments (required by the syntax)
425
0
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
426
0
    enc->dqm_[i].quant_ = enc->base_quant_;
427
0
  }
428
429
  // uv_alpha_ is normally spread around ~60. The useful range is
430
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
431
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
432
0
  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
433
0
                                          / (MAX_ALPHA - MIN_ALPHA);
434
  // we rescale by the user-defined strength of adaptation
435
0
  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
436
  // and make it safe.
437
0
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
438
  // We also boost the dc-uv-quant a little, based on sns-strength, since
439
  // U/V channels are quite more reactive to high quants (flat DC-blocks
440
  // tend to appear, and are unpleasant).
441
0
  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
442
0
  dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed
443
444
0
  enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
445
0
  enc->dq_y2_dc_ = 0;
446
0
  enc->dq_y2_ac_ = 0;
447
0
  enc->dq_uv_dc_ = dq_uv_dc;
448
0
  enc->dq_uv_ac_ = dq_uv_ac;
449
450
0
  SetupFilterStrength(enc);   // initialize segments' filtering, eventually
451
452
0
  if (num_segments > 1) SimplifySegments(enc);
453
454
0
  SetupMatrices(enc);         // finalize quantization matrices
455
0
}
456
457
//------------------------------------------------------------------------------
458
// Form the predictions in cache
459
460
// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
461
const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
462
const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
463
464
// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
465
static const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
466
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
467
};
468
469
0
void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
470
0
  const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
471
0
  const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
472
0
  VP8EncPredLuma16(it->yuv_p_, left, top);
473
0
}
474
475
0
void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
476
0
  const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
477
0
  const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
478
0
  VP8EncPredChroma8(it->yuv_p_, left, top);
479
0
}
480
481
// Form all the ten Intra4x4 predictions in the yuv_p_ cache
482
// for the 4x4 block it->i4_
483
0
static void MakeIntra4Preds(const VP8EncIterator* const it) {
484
0
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
485
0
}
486
487
//------------------------------------------------------------------------------
488
// Quantize
489
490
// Layout:
491
// +----+----+
492
// |YYYY|UUVV| 0
493
// |YYYY|UUVV| 4
494
// |YYYY|....| 8
495
// |YYYY|....| 12
496
// +----+----+
497
498
const uint16_t VP8Scan[16] = {  // Luma
499
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
500
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
501
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
502
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
503
};
504
505
static const uint16_t VP8ScanUV[4 + 4] = {
506
  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
507
  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
508
};
509
510
//------------------------------------------------------------------------------
511
// Distortion measurement
512
513
static const uint16_t kWeightY[16] = {
514
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
515
};
516
517
static const uint16_t kWeightTrellis[16] = {
518
#if USE_TDISTO == 0
519
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
520
#else
521
  30, 27, 19, 11,
522
  27, 24, 17, 10,
523
  19, 17, 12,  8,
524
  11, 10,  8,  6
525
#endif
526
};
527
528
// Init/Copy the common fields in score.
529
0
static void InitScore(VP8ModeScore* const rd) {
530
0
  rd->D  = 0;
531
0
  rd->SD = 0;
532
0
  rd->R  = 0;
533
0
  rd->H  = 0;
534
0
  rd->nz = 0;
535
0
  rd->score = MAX_COST;
536
0
}
537
538
static void CopyScore(VP8ModeScore* WEBP_RESTRICT const dst,
539
0
                      const VP8ModeScore* WEBP_RESTRICT const src) {
540
0
  dst->D  = src->D;
541
0
  dst->SD = src->SD;
542
0
  dst->R  = src->R;
543
0
  dst->H  = src->H;
544
0
  dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
545
0
  dst->score = src->score;
546
0
}
547
548
static void AddScore(VP8ModeScore* WEBP_RESTRICT const dst,
549
0
                     const VP8ModeScore* WEBP_RESTRICT const src) {
550
0
  dst->D  += src->D;
551
0
  dst->SD += src->SD;
552
0
  dst->R  += src->R;
553
0
  dst->H  += src->H;
554
0
  dst->nz |= src->nz;     // here, new nz bits are accumulated.
555
0
  dst->score += src->score;
556
0
}
557
558
//------------------------------------------------------------------------------
559
// Performs trellis-optimized quantization.
560
561
// Trellis node
562
typedef struct {
563
  int8_t prev;            // best previous node
564
  int8_t sign;            // sign of coeff_i
565
  int16_t level;          // level
566
} Node;
567
568
// Score state
569
typedef struct {
570
  score_t score;          // partial RD score
571
  const uint16_t* costs;  // shortcut to cost tables
572
} ScoreState;
573
574
// If a coefficient was quantized to a value Q (using a neutral bias),
575
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
576
// We don't test negative values though.
577
0
#define MIN_DELTA 0   // how much lower level to try
578
0
#define MAX_DELTA 1   // how much higher
579
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
580
0
#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
581
0
#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
582
583
0
static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
584
0
  rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
585
0
}
586
587
static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
588
0
                                          score_t distortion) {
589
0
  return rate * lambda + RD_DISTO_MULT * distortion;
590
0
}
591
592
// Coefficient type.
593
enum { TYPE_I16_AC = 0, TYPE_I16_DC = 1, TYPE_CHROMA_A = 2, TYPE_I4_AC = 3 };
594
595
static int TrellisQuantizeBlock(const VP8Encoder* WEBP_RESTRICT const enc,
596
                                int16_t in[16], int16_t out[16],
597
                                int ctx0, int coeff_type,
598
                                const VP8Matrix* WEBP_RESTRICT const mtx,
599
0
                                int lambda) {
600
0
  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
601
0
  CostArrayPtr const costs =
602
0
      (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
603
0
  const int first = (coeff_type == TYPE_I16_AC) ? 1 : 0;
604
0
  Node nodes[16][NUM_NODES];
605
0
  ScoreState score_states[2][NUM_NODES];
606
0
  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
607
0
  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
608
0
  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
609
0
  score_t best_score;
610
0
  int n, m, p, last;
611
612
0
  {
613
0
    score_t cost;
614
0
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
615
0
    const int last_proba = probas[VP8EncBands[first]][ctx0][0];
616
617
    // compute the position of the last interesting coefficient
618
0
    last = first - 1;
619
0
    for (n = 15; n >= first; --n) {
620
0
      const int j = kZigzag[n];
621
0
      const int err = in[j] * in[j];
622
0
      if (err > thresh) {
623
0
        last = n;
624
0
        break;
625
0
      }
626
0
    }
627
    // we don't need to go inspect up to n = 16 coeffs. We can just go up
628
    // to last + 1 (inclusive) without losing much.
629
0
    if (last < 15) ++last;
630
631
    // compute 'skip' score. This is the max score one can do.
632
0
    cost = VP8BitCost(0, last_proba);
633
0
    best_score = RDScoreTrellis(lambda, cost, 0);
634
635
    // initialize source node.
636
0
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
637
0
      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
638
0
      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
639
0
      ss_cur[m].costs = costs[first][ctx0];
640
0
    }
641
0
  }
642
643
  // traverse trellis.
644
0
  for (n = first; n <= last; ++n) {
645
0
    const int j = kZigzag[n];
646
0
    const uint32_t Q  = mtx->q_[j];
647
0
    const uint32_t iQ = mtx->iq_[j];
648
0
    const uint32_t B = BIAS(0x00);     // neutral bias
649
    // note: it's important to take sign of the _original_ coeff,
650
    // so we don't have to consider level < 0 afterward.
651
0
    const int sign = (in[j] < 0);
652
0
    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
653
0
    int level0 = QUANTDIV(coeff0, iQ, B);
654
0
    int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
655
0
    if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
656
0
    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
657
658
0
    {   // Swap current and previous score states
659
0
      ScoreState* const tmp = ss_cur;
660
0
      ss_cur = ss_prev;
661
0
      ss_prev = tmp;
662
0
    }
663
664
    // test all alternate level values around level0.
665
0
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
666
0
      Node* const cur = &NODE(n, m);
667
0
      const int level = level0 + m;
668
0
      const int ctx = (level > 2) ? 2 : level;
669
0
      const int band = VP8EncBands[n + 1];
670
0
      score_t base_score;
671
0
      score_t best_cur_score;
672
0
      int best_prev;
673
0
      score_t cost, score;
674
675
0
      ss_cur[m].costs = costs[n + 1][ctx];
676
0
      if (level < 0 || level > thresh_level) {
677
0
        ss_cur[m].score = MAX_COST;
678
        // Node is dead.
679
0
        continue;
680
0
      }
681
682
0
      {
683
        // Compute delta_error = how much coding this level will
684
        // subtract to max_error as distortion.
685
        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
686
0
        const int new_error = coeff0 - level * Q;
687
0
        const int delta_error =
688
0
            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
689
0
        base_score = RDScoreTrellis(lambda, 0, delta_error);
690
0
      }
691
692
      // Inspect all possible non-dead predecessors. Retain only the best one.
693
      // The base_score is added to all scores so it is only added for the final
694
      // value after the loop.
695
0
      cost = VP8LevelCost(ss_prev[-MIN_DELTA].costs, level);
696
0
      best_cur_score =
697
0
          ss_prev[-MIN_DELTA].score + RDScoreTrellis(lambda, cost, 0);
698
0
      best_prev = -MIN_DELTA;
699
0
      for (p = -MIN_DELTA + 1; p <= MAX_DELTA; ++p) {
700
        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
701
        // eliminated since their score can't be better than the current best.
702
0
        cost = VP8LevelCost(ss_prev[p].costs, level);
703
        // Examine node assuming it's a non-terminal one.
704
0
        score = ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
705
0
        if (score < best_cur_score) {
706
0
          best_cur_score = score;
707
0
          best_prev = p;
708
0
        }
709
0
      }
710
0
      best_cur_score += base_score;
711
      // Store best finding in current node.
712
0
      cur->sign = sign;
713
0
      cur->level = level;
714
0
      cur->prev = best_prev;
715
0
      ss_cur[m].score = best_cur_score;
716
717
      // Now, record best terminal node (and thus best entry in the graph).
718
0
      if (level != 0 && best_cur_score < best_score) {
719
0
        const score_t last_pos_cost =
720
0
            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
721
0
        const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
722
0
        score = best_cur_score + last_pos_score;
723
0
        if (score < best_score) {
724
0
          best_score = score;
725
0
          best_path[0] = n;                     // best eob position
726
0
          best_path[1] = m;                     // best node index
727
0
          best_path[2] = best_prev;             // best predecessor
728
0
        }
729
0
      }
730
0
    }
731
0
  }
732
733
  // Fresh start
734
  // Beware! We must preserve in[0]/out[0] value for TYPE_I16_AC case.
735
0
  if (coeff_type == TYPE_I16_AC) {
736
0
    memset(in + 1, 0, 15 * sizeof(*in));
737
0
    memset(out + 1, 0, 15 * sizeof(*out));
738
0
  } else {
739
0
    memset(in, 0, 16 * sizeof(*in));
740
0
    memset(out, 0, 16 * sizeof(*out));
741
0
  }
742
0
  if (best_path[0] == -1) {
743
0
    return 0;  // skip!
744
0
  }
745
746
0
  {
747
    // Unwind the best path.
748
    // Note: best-prev on terminal node is not necessarily equal to the
749
    // best_prev for non-terminal. So we patch best_path[2] in.
750
0
    int nz = 0;
751
0
    int best_node = best_path[1];
752
0
    n = best_path[0];
753
0
    NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal
754
755
0
    for (; n >= first; --n) {
756
0
      const Node* const node = &NODE(n, best_node);
757
0
      const int j = kZigzag[n];
758
0
      out[n] = node->sign ? -node->level : node->level;
759
0
      nz |= node->level;
760
0
      in[j] = out[n] * mtx->q_[j];
761
0
      best_node = node->prev;
762
0
    }
763
0
    return (nz != 0);
764
0
  }
765
0
}
766
767
#undef NODE
768
769
//------------------------------------------------------------------------------
770
// Performs: difference, transform, quantize, back-transform, add
771
// all at once. Output is the reconstructed block in *yuv_out, and the
772
// quantized levels in *levels.
773
774
// Reconstructs a whole 16x16 luma macroblock for intra16 prediction 'mode':
// difference against the prediction, forward transform, quantization
// (trellis-based when enabled), inverse transform, and write of the
// reconstructed samples into 'yuv_out'. Quantized levels go to 'rd'.
// Returns a non-zero-coefficient bitmap: bits 0..15 for the 16 AC 4x4
// sub-blocks, bit 24 for the DC (WHT) block.
static int ReconstructIntra16(VP8EncIterator* WEBP_RESTRICT const it,
                              VP8ModeScore* WEBP_RESTRICT const rd,
                              uint8_t* WEBP_RESTRICT const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  // Forward-transform the 16 4x4 sub-blocks, two at a time.
  for (n = 0; n < 16; n += 2) {
    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  // The 16 DC coefficients are coded separately through a Walsh-Hadamard
  // transform, quantized with the dedicated y2 matrix.
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    // Trellis quantization needs the left/top non-zero context per sub-block.
    int x, y;
    VP8IteratorNzToBytes(it);
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
        const int non_zero = TrellisQuantizeBlock(
            enc, tmp[n], rd->y_ac_levels[n], ctx, TYPE_I16_AC, &dqm->y1_,
            dqm->lambda_trellis_i16_);
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
        rd->y_ac_levels[n][0] = 0;   // DC is carried by the WHT block above
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; n += 2) {
      // Zero-out the first coeff, so that: a) nz is correct below, and
      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
      tmp[n][0] = tmp[n + 1][0] = 0;
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
      assert(rd->y_ac_levels[n + 0][0] == 0);
      assert(rd->y_ac_levels[n + 1][0] == 0);
    }
  }

  // Transform back
  VP8TransformWHT(dc_tmp, tmp[0]);
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}
825
826
// Reconstructs one 4x4 luma sub-block for intra4 prediction 'mode':
// difference against the prediction, forward transform, quantization
// (trellis-based when enabled), inverse transform back into 'yuv_out'.
// Quantized levels are stored in 'levels'. Returns non-zero if any
// quantized coefficient is non-zero.
static int ReconstructIntra4(VP8EncIterator* WEBP_RESTRICT const it,
                             int16_t levels[16],
                             const uint8_t* WEBP_RESTRICT const src,
                             uint8_t* WEBP_RESTRICT const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int16_t tmp[16];

  VP8FTransform(src, ref, tmp);
  if (DO_TRELLIS_I4 && it->do_trellis_) {
    // Trellis quantization needs this block's left/top non-zero context.
    const int x = it->i4_ & 3, y = it->i4_ >> 2;
    const int ctx = it->top_nz_[x] + it->left_nz_[y];
    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, TYPE_I4_AC, &dqm->y1_,
                              dqm->lambda_trellis_i4_);
  } else {
    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
  }
  VP8ITransform(ref, tmp, yuv_out, 0);
  return nz;
}
849
850
//------------------------------------------------------------------------------
851
// DC-error diffusion
852
853
// Diffusion weights. We under-correct a bit (15/16th of the error is actually
854
// diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
855
0
#define C1 7    // fraction of error sent to the 4x4 block below
856
0
#define C2 8    // fraction of error sent to the 4x4 block on the right
857
0
#define DSHIFT 4
858
0
#define DSCALE 1   // storage descaling, needed to make the error fit int8_t
859
860
// Quantize as usual, but also compute and return the quantization error.
861
// Error is already divided by DSHIFT.
862
// Quantize as usual, but also compute and return the quantization error.
// Error is already divided by DSHIFT.
static int QuantizeSingle(int16_t* WEBP_RESTRICT const v,
                          const VP8Matrix* WEBP_RESTRICT const mtx) {
  int magnitude = *v;
  const int is_neg = (magnitude < 0);
  if (is_neg) magnitude = -magnitude;
  if (magnitude <= (int)mtx->zthresh_[0]) {
    // Below the zero-threshold: coefficient is dropped and the whole value
    // becomes the error.
    *v = 0;
    return (is_neg ? -magnitude : magnitude) >> DSCALE;
  }
  {
    const int quantized =
        QUANTDIV(magnitude, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
    const int residual = magnitude - quantized;
    *v = is_neg ? -quantized : quantized;
    return (is_neg ? -residual : residual) >> DSCALE;
  }
}
876
877
// Applies DC error diffusion to the chroma DC coefficients prior to
// quantization: the stored errors of the top/left neighbors are partially
// folded into each 4x4 DC, the DC is quantized, and its own error is
// propagated to the following blocks. Final errors are saved in rd->derr
// so StoreDiffusionErrors() can hand them to the next macroblock.
static void CorrectDCValues(const VP8EncIterator* WEBP_RESTRICT const it,
                            const VP8Matrix* WEBP_RESTRICT const mtx,
                            int16_t tmp[][16],
                            VP8ModeScore* WEBP_RESTRICT const rd) {
  //         | top[0] | top[1]
  // --------+--------+---------
  // left[0] | tmp[0]   tmp[1]  <->   err0 err1
  // left[1] | tmp[2]   tmp[3]        err2 err3
  //
  // Final errors {err1,err2,err3} are preserved and later restored
  // as top[]/left[] on the next block.
  int ch;
  for (ch = 0; ch <= 1; ++ch) {   // the two chroma channels
    const int8_t* const top = it->top_derr_[it->x_][ch];
    const int8_t* const left = it->left_derr_[ch];
    int16_t (* const c)[16] = &tmp[ch * 4];   // this channel's four 4x4 blocks
    int err0, err1, err2, err3;
    // Diffuse neighbor errors in, quantize, and chain the fresh errors
    // through the 2x2 block layout pictured above (order matters).
    c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
    err0 = QuantizeSingle(&c[0][0], mtx);
    c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
    err1 = QuantizeSingle(&c[1][0], mtx);
    c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
    err2 = QuantizeSingle(&c[2][0], mtx);
    c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
    err3 = QuantizeSingle(&c[3][0], mtx);
    // error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
    // err >> DSCALE will fit in an int8_t type if DSCALE>=1.
    assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
    rd->derr[ch][0] = (int8_t)err1;
    rd->derr[ch][1] = (int8_t)err2;
    rd->derr[ch][2] = (int8_t)err3;
  }
}
910
911
static void StoreDiffusionErrors(VP8EncIterator* WEBP_RESTRICT const it,
912
0
                                 const VP8ModeScore* WEBP_RESTRICT const rd) {
913
0
  int ch;
914
0
  for (ch = 0; ch <= 1; ++ch) {
915
0
    int8_t* const top = it->top_derr_[it->x_][ch];
916
0
    int8_t* const left = it->left_derr_[ch];
917
0
    left[0] = rd->derr[ch][0];            // restore err1
918
0
    left[1] = 3 * rd->derr[ch][2] >> 2;   //     ... 3/4th of err3
919
0
    top[0]  = rd->derr[ch][1];            //     ... err2
920
0
    top[1]  = rd->derr[ch][2] - left[1];  //     ... 1/4th of err3.
921
0
  }
922
0
}
923
924
#undef C1
925
#undef C2
926
#undef DSHIFT
927
#undef DSCALE
928
929
//------------------------------------------------------------------------------
930
931
// Reconstructs the chroma (U and V) 8x8 blocks for prediction 'mode':
// difference against the prediction, forward transform, optional DC error
// diffusion, quantization (trellis disabled by default via DO_TRELLIS_UV),
// inverse transform into 'yuv_out'. Quantized levels go to rd->uv_levels.
// Returns the 8-bit non-zero bitmap shifted into bits 16..23.
static int ReconstructUV(VP8EncIterator* WEBP_RESTRICT const it,
                         VP8ModeScore* WEBP_RESTRICT const rd,
                         uint8_t* WEBP_RESTRICT const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[8][16];

  // Forward-transform the 8 chroma 4x4 sub-blocks, two at a time.
  for (n = 0; n < 8; n += 2) {
    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
  // top_derr_ != NULL means DC error diffusion is enabled.
  if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);

  if (DO_TRELLIS_UV && it->do_trellis_) {
    int ch, x, y;
    for (ch = 0, n = 0; ch <= 2; ch += 2) {   // ch = 0 (U) then 2 (V)
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
          const int non_zero = TrellisQuantizeBlock(
              enc, tmp[n], rd->uv_levels[n], ctx, TYPE_CHROMA_A, &dqm->uv_,
              dqm->lambda_trellis_uv_);
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; n += 2) {
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
    }
  }

  // Transform back.
  for (n = 0; n < 8; n += 2) {
    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
  }
  return (nz << 16);   // chroma nz bits live above the 16 luma bits
}
972
973
//------------------------------------------------------------------------------
974
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lower RD-cost = Rate + lambda * Distortion.
976
977
0
// Records in dqm->max_edge_ the largest of the first three AC coefficients,
// used as an estimate of the average delta between sub-4x4 blocks.
static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
  // We look at the first three AC coefficients to determine what is the
  // average delta between each sub-4x4 block.
  int peak = abs(DCs[1]);
  const int v1 = abs(DCs[2]);
  const int v2 = abs(DCs[4]);
  if (v1 > peak) peak = v1;
  if (v2 > peak) peak = v2;
  if (peak > dqm->max_edge_) dqm->max_edge_ = peak;
}
987
988
0
// Exchanges the two VP8ModeScore pointers.
static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
  VP8ModeScore* const saved = *b;
  *b = *a;
  *a = saved;
}
993
994
0
// Exchanges the two byte-buffer pointers.
static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const saved = *b;
  *b = *a;
  *a = saved;
}
999
1000
0
// Flips the roles of the primary (yuv_out_) and scratch (yuv_out2_)
// reconstruction buffers of the iterator.
static void SwapOut(VP8EncIterator* const it) {
  uint8_t* const previous_out = it->yuv_out_;
  it->yuv_out_ = it->yuv_out2_;
  it->yuv_out2_ = previous_out;
}
1003
1004
// Tries every intra16 prediction mode, reconstructing each into the scratch
// buffer and keeping the one with the lowest RD score. On return, *rd holds
// the winning mode's score/levels and it->yuv_out_ holds its reconstruction.
static void PickBestIntra16(VP8EncIterator* WEBP_RESTRICT const it,
                            VP8ModeScore* WEBP_RESTRICT rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  VP8ModeScore rd_tmp;
  VP8ModeScore* rd_cur = &rd_tmp;
  VP8ModeScore* rd_best = rd;
  int mode;
  int is_flat = IsFlatSource16(it->yuv_in_ + Y_OFF_ENC);

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;  // scratch buffer
    rd_cur->mode_i16 = mode;

    // Reconstruct
    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);

    // Measure RD-score
    rd_cur->D = VP8SSE16x16(src, tmp_dst);
    rd_cur->SD =
        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
    rd_cur->H = VP8FixedCostsI16[mode];
    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
    if (is_flat) {
      // refine the first impression (which was in pixel space)
      is_flat = IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16);
      if (is_flat) {
        // Block is very flat. We put emphasis on the distortion being very low!
        rd_cur->D *= 2;
        rd_cur->SD *= 2;
      }
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, rd_cur);
    if (mode == 0 || rd_cur->score < rd_best->score) {
      // New best: swap score holders and output buffers so the winner's
      // reconstruction is kept in it->yuv_out_.
      SwapModeScore(&rd_cur, &rd_best);
      SwapOut(it);
    }
  }
  if (rd_best != rd) {
    memcpy(rd, rd_best, sizeof(*rd));
  }
  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record max delta so we can later adjust the minimal filtering
  // strength needed to smooth these blocks out.
  if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}
1061
1062
//------------------------------------------------------------------------------
1063
1064
// return the cost array corresponding to the surrounding prediction modes.
1065
// Returns the cost array corresponding to the surrounding (top/left)
// prediction modes of the current i4 sub-block.
static const uint16_t* GetCostModeI4(VP8EncIterator* WEBP_RESTRICT const it,
                                     const uint8_t modes[16]) {
  const int preds_w = it->enc_->preds_w_;
  const int x = (it->i4_ & 3);
  const int y = (it->i4_ >> 2);
  int left_mode, top_mode;
  // At block column 0 / row 0, the neighbor lives in the preds_[] map of the
  // surrounding macroblocks; otherwise it is the mode already chosen in
  // modes[] for this macroblock.
  if (x == 0) {
    left_mode = it->preds_[y * preds_w - 1];
  } else {
    left_mode = modes[it->i4_ - 1];
  }
  if (y == 0) {
    top_mode = it->preds_[-preds_w + x];
  } else {
    top_mode = modes[it->i4_ - 4];
  }
  return VP8FixedCostsI4[top_mode][left_mode];
}
1073
1074
// Tries the intra4 coding path: for each of the 16 4x4 sub-blocks, picks the
// best of the NUM_BMODES prediction modes by RD score. Bails out early (and
// returns 0, meaning "keep intra16") as soon as the accumulated score exceeds
// the intra16 score in *rd, or the mode-header bit budget is blown.
// Returns 1 when intra4 wins; *rd then holds the accumulated intra4 result.
static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
                          VP8ModeScore* WEBP_RESTRICT const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits_ == 0) {
    return 0;   // intra4 disabled by configuration
  }

  InitScore(&rd_best);
  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode_, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.

    InitScore(&rd_i4);
    MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];

      // Add flatness penalty, to avoid flat area to be mispredicted
      // by a complex mode.
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
      } else {
        rd_tmp.R = 0;
      }

      // early-out check
      SetRDScore(lambda, &rd_tmp);
      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;

      // finish computing score
      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
      SetRDScore(lambda, &rd_tmp);

      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        // Ping-pong the scratch/best buffers instead of copying pixels.
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
               sizeof(rd_best.y_ac_levels[it->i4_]));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    if (rd_best.score >= rd->score) {
      return 0;   // intra4 can no longer beat intra16
    }
    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
    if (total_header_bits > enc->max_i4_header_bits_) {
      return 0;   // header bit budget exceeded
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    }
    rd->modes_i4[it->i4_] = best_mode;
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}
1168
1169
//------------------------------------------------------------------------------
1170
1171
// Tries every chroma prediction mode and keeps the one with the lowest RD
// score. The winner's reconstruction ends up in it->yuv_out_, its levels and
// (when error diffusion is on) its diffusion errors are stored in *rd, and
// its score is added to rd's running total.
static void PickBestUV(VP8EncIterator* WEBP_RESTRICT const it,
                       VP8ModeScore* WEBP_RESTRICT const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;  // scratch buffer
  uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
  uint8_t* dst = dst0;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score
    rd_uv.D  = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;    // not calling TDisto here: it tends to flatten areas.
    rd_uv.H  = VP8FixedCostsUV[mode];
    rd_uv.R  = VP8GetCostUV(it, &rd_uv);
    // Penalize complex modes on flat blocks.
    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
    }

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      if (it->top_derr_ != NULL) {
        memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
      }
      // Keep the winner's pixels by swapping the destination with the
      // scratch buffer (copied back below only if needed).
      SwapPtr(&dst, &tmp_dst);
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
  if (dst != dst0) {   // copy 16x8 block if needed
    VP8Copy16x8(dst, dst0);
  }
  if (it->top_derr_ != NULL) {  // store diffusion errors for next block
    StoreDiffusionErrors(it, rd);
  }
}
1220
1221
//------------------------------------------------------------------------------
1222
// Final reconstruction and quantization.
1223
1224
// Final quantization/reconstruction pass using the already-decided intra
// modes (no mode search). Used e.g. to redo the quantization with trellis
// enabled after the RD_OPT_TRELLIS mode decision. Updates rd->nz.
static void SimpleQuantize(VP8EncIterator* WEBP_RESTRICT const it,
                           VP8ModeScore* WEBP_RESTRICT const rd) {
  const VP8Encoder* const enc = it->enc_;
  const int is_i16 = (it->mb_->type_ == 1);   // macroblock type 1 == intra16
  int nz = 0;

  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  } else {
    VP8IteratorStartI4(it);
    do {
      // Fetch the previously-chosen i4 mode from the prediction-mode map.
      const int mode =
          it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
      MakeIntra4Preds(it);
      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                              src, dst, mode) << it->i4_;
    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
  }

  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
  rd->nz = nz;
}
1248
1249
// Refine intra16/intra4 sub-modes based on distortion only (not rate).
1250
// Refine intra16/intra4 sub-modes based on distortion only (not rate): the
// rate is approximated with fixed per-mode header costs scaled by empirical
// lambda constants. Used for the fast (non-RD-opt) methods. Reconstructs the
// chosen modes into it->yuv_out_ and fills rd->nz / rd->score.
static void RefineUsingDistortion(VP8EncIterator* WEBP_RESTRICT const it,
                                  int try_both_modes, int refine_uv_mode,
                                  VP8ModeScore* WEBP_RESTRICT const rd) {
  score_t best_score = MAX_COST;
  int nz = 0;
  int mode;
  int is_i16 = try_both_modes || (it->mb_->type_ == 1);

  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  // Some empiric constants, of approximate order of magnitude.
  const int lambda_d_i16 = 106;
  const int lambda_d_i4 = 11;
  const int lambda_d_uv = 120;
  score_t score_i4 = dqm->i4_penalty_;   // constant penalty for i4 signaling
  score_t i4_bit_sum = 0;
  const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
                                           : MAX_COST;  // no early-out allowed

  if (is_i16) {   // First, evaluate Intra16 distortion
    int best_mode = -1;
    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
      const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsI16[mode] * lambda_d_i16;
      if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
        continue;   // mode header too expensive
      }

      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    if (it->x_ == 0 || it->y_ == 0) {
      // avoid starting a checkerboard resonance from the border. See bug #432.
      if (IsFlatSource16(src)) {
        best_mode = (it->x_ == 0) ? 0 : 2;
        try_both_modes = 0;  // stick to i16
      }
    }
    VP8SetIntra16Mode(it, best_mode);
    // we'll reconstruct later, if i16 mode actually gets selected
  }

  // Next, evaluate Intra4
  if (try_both_modes || !is_i16) {
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    is_i16 = 0;
    VP8IteratorStartI4(it);
    do {
      int best_i4_mode = -1;
      score_t best_i4_score = MAX_COST;
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);

      MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
                            + mode_costs[mode] * lambda_d_i4;
        if (score < best_i4_score) {
          best_i4_mode = mode;
          best_i4_score = score;
        }
      }
      i4_bit_sum += mode_costs[best_i4_mode];
      rd->modes_i4[it->i4_] = best_i4_mode;
      score_i4 += best_i4_score;
      if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
        is_i16 = 1;
        break;
      } else {  // reconstruct partial block inside yuv_out2_ buffer
        uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
        nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                                src, tmp_dst, best_i4_mode) << it->i4_;
      }
    } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
  }

  // Final reconstruction, depending on which mode is selected.
  if (!is_i16) {
    VP8SetIntra4Mode(it, rd->modes_i4);
    SwapOut(it);   // intra4 was reconstructed in yuv_out2_; make it current
    best_score = score_i4;
  } else {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  }

  // ... and UV!
  if (refine_uv_mode) {
    int best_mode = -1;
    score_t best_uv_score = MAX_COST;
    const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
      const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsUV[mode] * lambda_d_uv;
      if (score < best_uv_score) {
        best_mode = mode;
        best_uv_score = score;
      }
    }
    VP8SetIntraUVMode(it, best_mode);
  }
  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);

  rd->nz = nz;
  rd->score = best_score;
}
1362
1363
//------------------------------------------------------------------------------
1364
// Entry point
1365
1366
// Entry point of the per-macroblock mode decision: picks the intra modes
// (full RD optimization or the fast distortion-based refinement, depending
// on 'rd_opt' and the encoder method), reconstructs the macroblock, and
// records whether it can be skipped. Returns 1 if the macroblock is skipped
// (all quantized coefficients are zero), 0 otherwise.
int VP8Decimate(VP8EncIterator* WEBP_RESTRICT const it,
                VP8ModeScore* WEBP_RESTRICT const rd,
                VP8RDLevel rd_opt) {
  int is_skipped;
  const int method = it->enc_->method_;

  InitScore(rd);

  // We can perform predictions for Luma16x16 and Chroma8x8 already.
  // Luma4x4 predictions needs to be done as-we-go.
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

  if (rd_opt > RD_OPT_NONE) {
    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
      it->do_trellis_ = 1;
      SimpleQuantize(it, rd);
    }
  } else {
    // At this point we have heuristically decided intra16 / intra4.
    // For method >= 2, pick the best intra4/intra16 based on SSE (~tad slower).
    // For method <= 1, we don't re-examine the decision but just go ahead with
    // quantization/reconstruction.
    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
  }
  is_skipped = (rd->nz == 0);
  VP8SetSkip(it, is_skipped);
  return is_skipped;
}