Coverage Report

Created: 2025-06-22 08:04

/src/libjxl/lib/jxl/enc_ac_strategy.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/enc_ac_strategy.h"
7
8
#include <jxl/memory_manager.h>
9
10
#include <algorithm>
11
#include <cmath>
12
#include <cstdint>
13
#include <cstdio>
14
#include <cstring>
15
16
#include "lib/jxl/base/common.h"
17
#include "lib/jxl/memory_manager_internal.h"
18
19
#undef HWY_TARGET_INCLUDE
20
#define HWY_TARGET_INCLUDE "lib/jxl/enc_ac_strategy.cc"
21
#include <hwy/foreach_target.h>
22
#include <hwy/highway.h>
23
24
#include "lib/jxl/ac_strategy.h"
25
#include "lib/jxl/base/bits.h"
26
#include "lib/jxl/base/compiler_specific.h"
27
#include "lib/jxl/base/fast_math-inl.h"
28
#include "lib/jxl/base/rect.h"
29
#include "lib/jxl/base/status.h"
30
#include "lib/jxl/dec_transforms-inl.h"
31
#include "lib/jxl/enc_aux_out.h"
32
#include "lib/jxl/enc_debug_image.h"
33
#include "lib/jxl/enc_params.h"
34
#include "lib/jxl/enc_transforms-inl.h"
35
#include "lib/jxl/simd_util.h"
36
37
// Some of the floating point constants in this file and in other
38
// files in the libjxl project have been obtained using the
39
// tools/optimizer/simplex_fork.py tool. It is a variation of
40
// Nelder-Mead optimization, and we generally try to minimize
41
// BPP * pnorm aggregate as reported by the benchmark_xl tool,
42
// but occasionally the values are optimized by using additional
43
// constraints such as maintaining a certain density, or ratio of
44
// popularity of integral transforms. Jyrki visually reviews all
45
// such changes and often makes manual changes to maintain good
46
// visual quality to changes where butteraugli was not sufficiently
47
// sensitive to some kind of degradation. Unfortunately image quality
48
// is still more of an art than science.
49
50
// Set JXL_DEBUG_AC_STRATEGY to 1 to enable debugging.
51
#ifndef JXL_DEBUG_AC_STRATEGY
52
0
#define JXL_DEBUG_AC_STRATEGY 0
53
#endif
54
55
// This must come before the begin/end_target, but HWY_ONCE is only true
56
// after that, so use an "include guard".
57
#ifndef LIB_JXL_ENC_AC_STRATEGY_
58
#define LIB_JXL_ENC_AC_STRATEGY_
59
// Parameters of the heuristic are marked with a OPTIMIZE comment.
60
namespace jxl {
61
namespace {
62
63
// Debugging utilities.
64
65
// Returns a linear sRGB color (as bytes) for each AC strategy.
66
0
const uint8_t* TypeColor(uint8_t raw_strategy) {
67
0
  JXL_DASSERT(AcStrategy::IsRawStrategyValid(raw_strategy));
68
0
  static_assert(AcStrategy::kNumValidStrategies == 27, "Update colors");
69
0
  static constexpr uint8_t kColors[AcStrategy::kNumValidStrategies + 1][3] = {
70
0
      {0xFF, 0xFF, 0x00},  // DCT8       | yellow
71
0
      {0xFF, 0x80, 0x80},  // HORNUSS    | vivid tangerine
72
0
      {0xFF, 0x80, 0x80},  // DCT2x2     | vivid tangerine
73
0
      {0xFF, 0x80, 0x80},  // DCT4x4     | vivid tangerine
74
0
      {0x80, 0xFF, 0x00},  // DCT16x16   | chartreuse
75
0
      {0x00, 0xC0, 0x00},  // DCT32x32   | waystone green
76
0
      {0xC0, 0xFF, 0x00},  // DCT16x8    | lime
77
0
      {0xC0, 0xFF, 0x00},  // DCT8x16    | lime
78
0
      {0x00, 0xFF, 0x00},  // DCT32x8    | green
79
0
      {0x00, 0xFF, 0x00},  // DCT8x32    | green
80
0
      {0x00, 0xFF, 0x00},  // DCT32x16   | green
81
0
      {0x00, 0xFF, 0x00},  // DCT16x32   | green
82
0
      {0xFF, 0x80, 0x00},  // DCT4x8     | orange juice
83
0
      {0xFF, 0x80, 0x00},  // DCT8x4     | orange juice
84
0
      {0xFF, 0xFF, 0x80},  // AFV0       | butter
85
0
      {0xFF, 0xFF, 0x80},  // AFV1       | butter
86
0
      {0xFF, 0xFF, 0x80},  // AFV2       | butter
87
0
      {0xFF, 0xFF, 0x80},  // AFV3       | butter
88
0
      {0x00, 0xC0, 0xFF},  // DCT64x64   | capri
89
0
      {0x00, 0xFF, 0xFF},  // DCT64x32   | aqua
90
0
      {0x00, 0xFF, 0xFF},  // DCT32x64   | aqua
91
0
      {0x00, 0x40, 0xFF},  // DCT128x128 | rare blue
92
0
      {0x00, 0x80, 0xFF},  // DCT128x64  | magic ink
93
0
      {0x00, 0x80, 0xFF},  // DCT64x128  | magic ink
94
0
      {0x00, 0x00, 0xC0},  // DCT256x256 | keese blue
95
0
      {0x00, 0x00, 0xFF},  // DCT256x128 | blue
96
0
      {0x00, 0x00, 0xFF},  // DCT128x256 | blue
97
0
      {0x00, 0x00, 0x00}   // invalid    | black
98
0
  };
99
0
  raw_strategy =
100
0
      Clamp1<uint8_t>(raw_strategy, 0, AcStrategy::kNumValidStrategies);
101
0
  return kColors[raw_strategy];
102
0
}
103
104
0
const uint8_t* TypeMask(uint8_t raw_strategy) {
105
0
  JXL_DASSERT(AcStrategy::IsRawStrategyValid(raw_strategy));
106
0
  static_assert(AcStrategy::kNumValidStrategies == 27, "Update masks");
107
0
  // implicitly, first row and column is made dark
108
0
  static constexpr uint8_t kMask[AcStrategy::kNumValidStrategies + 1][64] = {
109
0
      {
110
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
111
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
112
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
113
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
114
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
115
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
116
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
117
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
118
0
      },                           // DCT8
119
0
      {
120
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
121
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
122
0
          0, 0, 1, 0, 0, 1, 0, 0,  //
123
0
          0, 0, 1, 0, 0, 1, 0, 0,  //
124
0
          0, 0, 1, 1, 1, 1, 0, 0,  //
125
0
          0, 0, 1, 0, 0, 1, 0, 0,  //
126
0
          0, 0, 1, 0, 0, 1, 0, 0,  //
127
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
128
0
      },                           // HORNUSS
129
0
      {
130
0
          1, 1, 1, 1, 1, 1, 1, 1,  //
131
0
          1, 0, 1, 0, 1, 0, 1, 0,  //
132
0
          1, 1, 1, 1, 1, 1, 1, 1,  //
133
0
          1, 0, 1, 0, 1, 0, 1, 0,  //
134
0
          1, 1, 1, 1, 1, 1, 1, 1,  //
135
0
          1, 0, 1, 0, 1, 0, 1, 0,  //
136
0
          1, 1, 1, 1, 1, 1, 1, 1,  //
137
0
          1, 0, 1, 0, 1, 0, 1, 0,  //
138
0
      },                           // 2x2
139
0
      {
140
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
141
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
142
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
143
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
144
0
          1, 1, 1, 1, 1, 1, 1, 1,  //
145
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
146
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
147
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
148
0
      },                           // 4x4
149
0
      {},                          // DCT16x16 (unused)
150
0
      {},                          // DCT32x32 (unused)
151
0
      {},                          // DCT16x8 (unused)
152
0
      {},                          // DCT8x16 (unused)
153
0
      {},                          // DCT32x8 (unused)
154
0
      {},                          // DCT8x32 (unused)
155
0
      {},                          // DCT32x16 (unused)
156
0
      {},                          // DCT16x32 (unused)
157
0
      {
158
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
159
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
160
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
161
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
162
0
          1, 1, 1, 1, 1, 1, 1, 1,  //
163
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
164
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
165
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
166
0
      },                           // DCT4x8
167
0
      {
168
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
169
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
170
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
171
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
172
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
173
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
174
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
175
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
176
0
      },                           // DCT8x4
177
0
      {
178
0
          1, 1, 1, 1, 1, 0, 0, 0,  //
179
0
          1, 1, 1, 1, 0, 0, 0, 0,  //
180
0
          1, 1, 1, 0, 0, 0, 0, 0,  //
181
0
          1, 1, 0, 0, 0, 0, 0, 0,  //
182
0
          1, 0, 0, 0, 0, 0, 0, 0,  //
183
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
184
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
185
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
186
0
      },                           // AFV0
187
0
      {
188
0
          0, 0, 0, 0, 1, 1, 1, 1,  //
189
0
          0, 0, 0, 0, 0, 1, 1, 1,  //
190
0
          0, 0, 0, 0, 0, 0, 1, 1,  //
191
0
          0, 0, 0, 0, 0, 0, 0, 1,  //
192
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
193
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
194
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
195
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
196
0
      },                           // AFV1
197
0
      {
198
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
199
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
200
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
201
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
202
0
          1, 0, 0, 0, 0, 0, 0, 0,  //
203
0
          1, 1, 0, 0, 0, 0, 0, 0,  //
204
0
          1, 1, 1, 0, 0, 0, 0, 0,  //
205
0
          1, 1, 1, 1, 0, 0, 0, 0,  //
206
0
      },                           // AFV2
207
0
      {
208
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
209
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
210
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
211
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
212
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
213
0
          0, 0, 0, 0, 0, 0, 0, 1,  //
214
0
          0, 0, 0, 0, 0, 0, 1, 1,  //
215
0
          0, 0, 0, 0, 0, 1, 1, 1,  //
216
0
      },                           // AFV3
217
0
      {}                           // invalid
218
0
  };
219
0
  raw_strategy =
220
0
      Clamp1<uint8_t>(raw_strategy, 0, AcStrategy::kNumValidStrategies);
221
0
  return kMask[raw_strategy];
222
0
}
223
224
Status DumpAcStrategy(const AcStrategyImage& ac_strategy, size_t xsize,
225
                      size_t ysize, const char* tag, AuxOut* aux_out,
226
0
                      const CompressParams& cparams) {
227
0
  JxlMemoryManager* memory_manager = ac_strategy.memory_manager();
228
0
  JXL_ASSIGN_OR_RETURN(Image3F color_acs,
229
0
                       Image3F::Create(memory_manager, xsize, ysize));
230
0
  for (size_t y = 0; y < ysize; y++) {
231
0
    float* JXL_RESTRICT rows[3] = {
232
0
        color_acs.PlaneRow(0, y),
233
0
        color_acs.PlaneRow(1, y),
234
0
        color_acs.PlaneRow(2, y),
235
0
    };
236
0
    const AcStrategyRow acs_row = ac_strategy.ConstRow(y / kBlockDim);
237
0
    for (size_t x = 0; x < xsize; x++) {
238
0
      AcStrategy acs = acs_row[x / kBlockDim];
239
0
      const uint8_t* JXL_RESTRICT color = TypeColor(acs.RawStrategy());
240
0
      for (size_t c = 0; c < 3; c++) {
241
0
        rows[c][x] = color[c] / 255.f;
242
0
      }
243
0
    }
244
0
  }
245
0
  size_t stride = color_acs.PixelsPerRow();
246
0
  for (size_t c = 0; c < 3; c++) {
247
0
    for (size_t by = 0; by < DivCeil(ysize, kBlockDim); by++) {
248
0
      float* JXL_RESTRICT row = color_acs.PlaneRow(c, by * kBlockDim);
249
0
      const AcStrategyRow acs_row = ac_strategy.ConstRow(by);
250
0
      for (size_t bx = 0; bx < DivCeil(xsize, kBlockDim); bx++) {
251
0
        AcStrategy acs = acs_row[bx];
252
0
        if (!acs.IsFirstBlock()) continue;
253
0
        const uint8_t* JXL_RESTRICT color = TypeColor(acs.RawStrategy());
254
0
        const uint8_t* JXL_RESTRICT mask = TypeMask(acs.RawStrategy());
255
0
        if (acs.covered_blocks_x() == 1 && acs.covered_blocks_y() == 1) {
256
0
          for (size_t iy = 0; iy < kBlockDim && by * kBlockDim + iy < ysize;
257
0
               iy++) {
258
0
            for (size_t ix = 0; ix < kBlockDim && bx * kBlockDim + ix < xsize;
259
0
                 ix++) {
260
0
              if (mask[iy * kBlockDim + ix]) {
261
0
                row[iy * stride + bx * kBlockDim + ix] = color[c] / 800.f;
262
0
              }
263
0
            }
264
0
          }
265
0
        }
266
0
        // draw block edges
267
0
        for (size_t ix = 0; ix < kBlockDim * acs.covered_blocks_x() &&
268
0
                            bx * kBlockDim + ix < xsize;
269
0
             ix++) {
270
0
          row[0 * stride + bx * kBlockDim + ix] = color[c] / 350.f;
271
0
        }
272
0
        for (size_t iy = 0; iy < kBlockDim * acs.covered_blocks_y() &&
273
0
                            by * kBlockDim + iy < ysize;
274
0
             iy++) {
275
0
          row[iy * stride + bx * kBlockDim + 0] = color[c] / 350.f;
276
0
        }
277
0
      }
278
0
    }
279
0
  }
280
0
  return DumpImage(cparams, tag, color_acs);
281
0
}
282
283
}  // namespace
284
}  // namespace jxl
285
#endif  // LIB_JXL_ENC_AC_STRATEGY_
286
287
HWY_BEFORE_NAMESPACE();
288
namespace jxl {
289
namespace HWY_NAMESPACE {
290
291
// These templates are not found via ADL.
292
using hwy::HWY_NAMESPACE::AbsDiff;
293
using hwy::HWY_NAMESPACE::Eq;
294
using hwy::HWY_NAMESPACE::IfThenElseZero;
295
using hwy::HWY_NAMESPACE::IfThenZeroElse;
296
using hwy::HWY_NAMESPACE::Round;
297
using hwy::HWY_NAMESPACE::Sqrt;
298
299
bool MultiBlockTransformCrossesHorizontalBoundary(
300
    const AcStrategyImage& ac_strategy, size_t start_x, size_t y,
301
0
    size_t end_x) {
302
0
  if (start_x >= ac_strategy.xsize() || y >= ac_strategy.ysize()) {
303
0
    return false;
304
0
  }
305
0
  if (y % 8 == 0) {
306
    // Nothing crosses 64x64 boundaries, and the memory on the other side
307
    // of the 64x64 block may still uninitialized.
308
0
    return false;
309
0
  }
310
0
  end_x = std::min(end_x, ac_strategy.xsize());
311
  // The first multiblock might be before the start_x, let's adjust it
312
  // to point to the first IsFirstBlock() == true block we find by backward
313
  // tracing.
314
0
  AcStrategyRow row = ac_strategy.ConstRow(y);
315
0
  const size_t start_x_limit = start_x & ~7;
316
0
  while (start_x != start_x_limit && !row[start_x].IsFirstBlock()) {
317
0
    --start_x;
318
0
  }
319
0
  for (size_t x = start_x; x < end_x;) {
320
0
    if (row[x].IsFirstBlock()) {
321
0
      x += row[x].covered_blocks_x();
322
0
    } else {
323
0
      return true;
324
0
    }
325
0
  }
326
0
  return false;
327
0
}
328
329
bool MultiBlockTransformCrossesVerticalBoundary(
330
    const AcStrategyImage& ac_strategy, size_t x, size_t start_y,
331
0
    size_t end_y) {
332
0
  if (x >= ac_strategy.xsize() || start_y >= ac_strategy.ysize()) {
333
0
    return false;
334
0
  }
335
0
  if (x % 8 == 0) {
336
    // Nothing crosses 64x64 boundaries, and the memory on the other side
337
    // of the 64x64 block may still uninitialized.
338
0
    return false;
339
0
  }
340
0
  end_y = std::min(end_y, ac_strategy.ysize());
341
  // The first multiblock might be before the start_y, let's adjust it
342
  // to point to the first IsFirstBlock() == true block we find by backward
343
  // tracing.
344
0
  const size_t start_y_limit = start_y & ~7;
345
0
  while (start_y != start_y_limit &&
346
0
         !ac_strategy.ConstRow(start_y)[x].IsFirstBlock()) {
347
0
    --start_y;
348
0
  }
349
350
0
  for (size_t y = start_y; y < end_y;) {
351
0
    AcStrategyRow row = ac_strategy.ConstRow(y);
352
0
    if (row[x].IsFirstBlock()) {
353
0
      y += row[x].covered_blocks_y();
354
0
    } else {
355
0
      return true;
356
0
    }
357
0
  }
358
0
  return false;
359
0
}
360
361
Status EstimateEntropy(const AcStrategy& acs, float entropy_mul, size_t x,
362
                       size_t y, const ACSConfig& config,
363
                       const float* JXL_RESTRICT cmap_factors, float* block,
364
                       float* full_scratch_space, uint32_t* quantized,
365
0
                       float& entropy) {
366
0
  entropy = 0.0f;
367
0
  float* mem = full_scratch_space;
368
0
  float* scratch_space = full_scratch_space + AcStrategy::kMaxCoeffArea;
369
0
  const size_t size = (1 << acs.log2_covered_blocks()) * kDCTBlockSize;
370
371
  // Apply transform.
372
0
  for (size_t c = 0; c < 3; c++) {
373
0
    float* JXL_RESTRICT block_c = block + size * c;
374
0
    TransformFromPixels(acs.Strategy(), &config.Pixel(c, x, y),
375
0
                        config.src_stride, block_c, scratch_space);
376
0
  }
377
0
  HWY_FULL(float) df;
378
379
0
  const size_t num_blocks = acs.covered_blocks_x() * acs.covered_blocks_y();
380
  // avoid large blocks when there is a lot going on in red-green.
381
0
  float quant_norm16 = 0;
382
0
  if (num_blocks == 1) {
383
    // When it is only one 8x8, we don't need aggregation of values.
384
0
    quant_norm16 = config.Quant(x / 8, y / 8);
385
0
  } else if (num_blocks == 2) {
386
    // Taking max instead of 8th norm seems to work
387
    // better for smallest blocks up to 16x8. Jyrki couldn't get
388
    // improvements in trying the same for 16x16 blocks.
389
0
    if (acs.covered_blocks_y() == 2) {
390
0
      quant_norm16 =
391
0
          std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8, y / 8 + 1));
392
0
    } else {
393
0
      quant_norm16 =
394
0
          std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8 + 1, y / 8));
395
0
    }
396
0
  } else {
397
    // Load QF value, calculate empirical heuristic on masking field
398
    // for weighting the information loss. Information loss manifests
399
    // itself as ringing, and masking could hide it.
400
0
    for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
401
0
      for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
402
0
        float qval = config.Quant(x / 8 + ix, y / 8 + iy);
403
0
        qval *= qval;
404
0
        qval *= qval;
405
0
        qval *= qval;
406
0
        quant_norm16 += qval * qval;
407
0
      }
408
0
    }
409
0
    quant_norm16 /= num_blocks;
410
0
    quant_norm16 = FastPowf(quant_norm16, 1.0f / 16.0f);
411
0
  }
412
0
  const auto quant = Set(df, quant_norm16);
413
414
  // Compute entropy.
415
0
  const HWY_CAPPED(float, 8) df8;
416
417
0
  auto loss = Zero(df8);
418
0
  for (size_t c = 0; c < 3; c++) {
419
0
    const float* inv_matrix = config.dequant->InvMatrix(acs.Strategy(), c);
420
0
    const float* matrix = config.dequant->Matrix(acs.Strategy(), c);
421
0
    const auto cmap_factor = Set(df, cmap_factors[c]);
422
423
0
    auto entropy_v = Zero(df);
424
0
    auto nzeros_v = Zero(df);
425
0
    for (size_t i = 0; i < num_blocks * kDCTBlockSize; i += Lanes(df)) {
426
0
      const auto in = Load(df, block + c * size + i);
427
0
      const auto in_y = Mul(Load(df, block + size + i), cmap_factor);
428
0
      const auto im = Load(df, inv_matrix + i);
429
0
      const auto val = Mul(Sub(in, in_y), Mul(im, quant));
430
0
      const auto rval = Round(val);
431
0
      const auto diff = Sub(val, rval);
432
0
      const auto m = Load(df, matrix + i);
433
0
      Store(Mul(m, diff), df, &mem[i]);
434
0
      const auto q = Abs(rval);
435
0
      const auto q_is_zero = Eq(q, Zero(df));
436
      // We used to have q * C here, but that cost model seems to
437
      // be punishing large values more than necessary. Sqrt tries
438
      // to avoid large values less aggressively.
439
0
      entropy_v = Add(Sqrt(q), entropy_v);
440
0
      nzeros_v = Add(nzeros_v, IfThenZeroElse(q_is_zero, Set(df, 1.0f)));
441
0
    }
442
443
0
    {
444
0
      auto lossc = Zero(df8);
445
0
      TransformToPixels(acs.Strategy(), &mem[0], block,
446
0
                        acs.covered_blocks_x() * 8, scratch_space);
447
448
0
      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
449
0
        for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
450
0
          for (size_t dy = 0; dy < kBlockDim; ++dy) {
451
0
            for (size_t dx = 0; dx < kBlockDim; dx += Lanes(df8)) {
452
0
              auto in = Load(df8, block +
453
0
                                      (iy * kBlockDim + dy) *
454
0
                                          (acs.covered_blocks_x() * kBlockDim) +
455
0
                                      ix * kBlockDim + dx);
456
0
              if (x + ix * 8 + dx + Lanes(df8) <= config.mask1x1_xsize) {
457
0
                auto masku =
458
0
                    Abs(Load(df8, config.MaskingPtr1x1(x + ix * 8 + dx,
459
0
                                                       y + iy * 8 + dy)));
460
0
                in = Mul(masku, in);
461
0
                in = Mul(in, in);
462
0
                in = Mul(in, in);
463
0
                in = Mul(in, in);
464
0
                lossc = Add(lossc, in);
465
0
              }
466
0
            }
467
0
          }
468
0
        }
469
0
      }
470
0
      static const double kChannelMul[3] = {
471
0
          pow(10.2, 8.0),
472
0
          pow(1.0, 8.0),
473
0
          pow(1.03, 8.0),
474
0
      };
475
0
      lossc = Mul(Set(df8, kChannelMul[c]), lossc);
476
0
      loss = Add(loss, lossc);
477
0
    }
478
0
    entropy += config.cost_delta * GetLane(SumOfLanes(df, entropy_v));
479
0
    size_t num_nzeros = GetLane(SumOfLanes(df, nzeros_v));
480
    // Add #bit of num_nonzeros, as an estimate of the cost for encoding the
481
    // number of non-zeros of the block.
482
0
    size_t nbits = CeilLog2Nonzero(num_nzeros + 1) + 1;
483
    // Also add #bit of #bit of num_nonzeros, to estimate the ANS cost, with a
484
    // bias.
485
0
    entropy += config.zeros_mul * (CeilLog2Nonzero(nbits + 17) + nbits);
486
0
  }
487
0
  float loss_scalar =
488
0
      pow(GetLane(SumOfLanes(df8, loss)) / (num_blocks * kDCTBlockSize),
489
0
          1.0 / 8.0) *
490
0
      (num_blocks * kDCTBlockSize) / quant_norm16;
491
0
  entropy *= entropy_mul;
492
0
  entropy += config.info_loss_multiplier * loss_scalar;
493
0
  return true;
494
0
}
495
496
Status FindBest8x8Transform(size_t x, size_t y, int encoding_speed_tier,
497
                            float butteraugli_target, const ACSConfig& config,
498
                            const float* JXL_RESTRICT cmap_factors,
499
                            AcStrategyImage* JXL_RESTRICT ac_strategy,
500
                            float* block, float* scratch_space,
501
                            uint32_t* quantized, float* entropy_out,
502
0
                            AcStrategyType& best_tx) {
503
0
  struct TransformTry8x8 {
504
0
    AcStrategyType type;
505
0
    int encoding_speed_tier_max_limit;
506
0
    double entropy_mul;
507
0
  };
508
0
  static const TransformTry8x8 kTransforms8x8[] = {
509
0
      {
510
0
          AcStrategyType::DCT,
511
0
          9,
512
0
          0.8,
513
0
      },
514
0
      {
515
0
          AcStrategyType::DCT4X4,
516
0
          5,
517
0
          1.08,
518
0
      },
519
0
      {
520
0
          AcStrategyType::DCT2X2,
521
0
          5,
522
0
          0.95,
523
0
      },
524
0
      {
525
0
          AcStrategyType::DCT4X8,
526
0
          4,
527
0
          0.85931637428340035,
528
0
      },
529
0
      {
530
0
          AcStrategyType::DCT8X4,
531
0
          4,
532
0
          0.85931637428340035,
533
0
      },
534
0
      {
535
0
          AcStrategyType::IDENTITY,
536
0
          5,
537
0
          1.0427542510634957,
538
0
      },
539
0
      {
540
0
          AcStrategyType::AFV0,
541
0
          4,
542
0
          0.81779489591359944,
543
0
      },
544
0
      {
545
0
          AcStrategyType::AFV1,
546
0
          4,
547
0
          0.81779489591359944,
548
0
      },
549
0
      {
550
0
          AcStrategyType::AFV2,
551
0
          4,
552
0
          0.81779489591359944,
553
0
      },
554
0
      {
555
0
          AcStrategyType::AFV3,
556
0
          4,
557
0
          0.81779489591359944,
558
0
      },
559
0
  };
560
0
  double best = 1e30;
561
0
  best_tx = kTransforms8x8[0].type;
562
0
  for (auto tx : kTransforms8x8) {
563
0
    if (tx.encoding_speed_tier_max_limit < encoding_speed_tier) {
564
0
      continue;
565
0
    }
566
0
    AcStrategy acs = AcStrategy::FromRawStrategy(tx.type);
567
0
    float entropy_mul = tx.entropy_mul / kTransforms8x8[0].entropy_mul;
568
0
    if ((tx.type == AcStrategyType::DCT2X2 ||
569
0
         tx.type == AcStrategyType::IDENTITY) &&
570
0
        butteraugli_target < 5.0) {
571
0
      static const float kFavor2X2AtHighQuality = 0.4;
572
0
      float weight = pow((5.0f - butteraugli_target) / 5.0f, 2.0);
573
0
      entropy_mul -= kFavor2X2AtHighQuality * weight;
574
0
    }
575
0
    if ((tx.type != AcStrategyType::DCT && tx.type != AcStrategyType::DCT2X2 &&
576
0
         tx.type != AcStrategyType::IDENTITY) &&
577
0
        butteraugli_target > 4.0) {
578
0
      static const float kAvoidEntropyOfTransforms = 0.5;
579
0
      float mul = 1.0;
580
0
      if (butteraugli_target < 12.0) {
581
0
        mul *= (12.0 - 4.0) / (butteraugli_target - 4.0);
582
0
      }
583
0
      entropy_mul += kAvoidEntropyOfTransforms * mul;
584
0
    }
585
0
    float entropy;
586
0
    JXL_RETURN_IF_ERROR(EstimateEntropy(acs, entropy_mul, x, y, config,
587
0
                                        cmap_factors, block, scratch_space,
588
0
                                        quantized, entropy));
589
0
    if (entropy < best) {
590
0
      best_tx = tx.type;
591
0
      best = entropy;
592
0
    }
593
0
  }
594
0
  *entropy_out = best;
595
0
  return true;
596
0
}
597
598
// bx, by addresses the 64x64 block at 8x8 subresolution
599
// cx, cy addresses the left, upper 8x8 block position of the candidate
600
// transform.
601
Status TryMergeAcs(AcStrategyType acs_raw, size_t bx, size_t by, size_t cx,
602
                   size_t cy, const ACSConfig& config,
603
                   const float* JXL_RESTRICT cmap_factors,
604
                   AcStrategyImage* JXL_RESTRICT ac_strategy,
605
                   const float entropy_mul, const uint8_t candidate_priority,
606
                   uint8_t* priority, float* JXL_RESTRICT entropy_estimate,
607
0
                   float* block, float* scratch_space, uint32_t* quantized) {
608
0
  AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw);
609
0
  float entropy_current = 0;
610
0
  for (size_t iy = 0; iy < acs.covered_blocks_y(); ++iy) {
611
0
    for (size_t ix = 0; ix < acs.covered_blocks_x(); ++ix) {
612
0
      if (priority[(cy + iy) * 8 + (cx + ix)] >= candidate_priority) {
613
        // Transform would reuse already allocated blocks and
614
        // lead to invalid overlaps, for example DCT64X32 vs.
615
        // DCT32X64.
616
0
        return true;
617
0
      }
618
0
      entropy_current += entropy_estimate[(cy + iy) * 8 + (cx + ix)];
619
0
    }
620
0
  }
621
0
  float entropy_candidate;
622
0
  JXL_RETURN_IF_ERROR(EstimateEntropy(
623
0
      acs, entropy_mul, (bx + cx) * 8, (by + cy) * 8, config, cmap_factors,
624
0
      block, scratch_space, quantized, entropy_candidate));
625
0
  if (entropy_candidate >= entropy_current) return true;
626
  // Accept the candidate.
627
0
  for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
628
0
    for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
629
0
      entropy_estimate[(cy + iy) * 8 + cx + ix] = 0;
630
0
      priority[(cy + iy) * 8 + cx + ix] = candidate_priority;
631
0
    }
632
0
  }
633
0
  JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_raw));
634
0
  entropy_estimate[cy * 8 + cx] = entropy_candidate;
635
0
  return true;
636
0
}
637
638
static void SetEntropyForTransform(size_t cx, size_t cy,
639
                                   const AcStrategyType acs_raw, float entropy,
640
0
                                   float* JXL_RESTRICT entropy_estimate) {
641
0
  const AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw);
642
0
  for (size_t dy = 0; dy < acs.covered_blocks_y(); ++dy) {
643
0
    for (size_t dx = 0; dx < acs.covered_blocks_x(); ++dx) {
644
0
      entropy_estimate[(cy + dy) * 8 + cx + dx] = 0.0;
645
0
    }
646
0
  }
647
0
  entropy_estimate[cy * 8 + cx] = entropy;
648
0
}
649
650
0
AcStrategyType AcsSquare(size_t blocks) {
651
0
  if (blocks == 2) {
652
0
    return AcStrategyType::DCT16X16;
653
0
  } else if (blocks == 4) {
654
0
    return AcStrategyType::DCT32X32;
655
0
  } else {
656
0
    return AcStrategyType::DCT64X64;
657
0
  }
658
0
}
659
660
0
AcStrategyType AcsVerticalSplit(size_t blocks) {
661
0
  if (blocks == 2) {
662
0
    return AcStrategyType::DCT16X8;
663
0
  } else if (blocks == 4) {
664
0
    return AcStrategyType::DCT32X16;
665
0
  } else {
666
0
    return AcStrategyType::DCT64X32;
667
0
  }
668
0
}
669
670
0
AcStrategyType AcsHorizontalSplit(size_t blocks) {
671
0
  if (blocks == 2) {
672
0
    return AcStrategyType::DCT8X16;
673
0
  } else if (blocks == 4) {
674
0
    return AcStrategyType::DCT16X32;
675
0
  } else {
676
0
    return AcStrategyType::DCT32X64;
677
0
  }
678
0
}
679
680
// The following function tries to merge smaller transforms into
681
// squares and the rectangles originating from a single middle division
682
// (horizontal or vertical) fairly.
683
//
684
// This is now generalized to concern about squares
685
// of blocks X blocks size, where a block is 8x8 pixels.
686
Status FindBestFirstLevelDivisionForSquare(
687
    size_t blocks, bool allow_square_transform, size_t bx, size_t by, size_t cx,
688
    size_t cy, const ACSConfig& config, const float* JXL_RESTRICT cmap_factors,
689
    AcStrategyImage* JXL_RESTRICT ac_strategy, const float entropy_mul_JXK,
690
    const float entropy_mul_JXJ, float* JXL_RESTRICT entropy_estimate,
691
0
    float* block, float* scratch_space, uint32_t* quantized) {
692
  // We denote J for the larger dimension here, and K for the smaller.
693
  // For example, for 32x32 block splitting, J would be 32, K 16.
694
0
  const size_t blocks_half = blocks / 2;
695
0
  const AcStrategyType acs_rawJXK = AcsVerticalSplit(blocks);
696
0
  const AcStrategyType acs_rawKXJ = AcsHorizontalSplit(blocks);
697
0
  const AcStrategyType acs_rawJXJ = AcsSquare(blocks);
698
0
  const AcStrategy acsJXK = AcStrategy::FromRawStrategy(acs_rawJXK);
699
0
  const AcStrategy acsKXJ = AcStrategy::FromRawStrategy(acs_rawKXJ);
700
0
  const AcStrategy acsJXJ = AcStrategy::FromRawStrategy(acs_rawJXJ);
701
0
  AcStrategyRow row0 = ac_strategy->ConstRow(by + cy + 0);
702
0
  AcStrategyRow row1 = ac_strategy->ConstRow(by + cy + blocks_half);
703
  // Let's check if we can consider a JXJ block here at all.
704
  // This is not necessary in the basic use of hierarchically merging
705
  // blocks in the simplest possible way, but is needed when we try other
706
  // 'floating' options of merging, possibly after a simple hierarchical
707
  // merge has been explored.
708
0
  if (MultiBlockTransformCrossesHorizontalBoundary(*ac_strategy, bx + cx,
709
0
                                                   by + cy, bx + cx + blocks) ||
710
0
      MultiBlockTransformCrossesHorizontalBoundary(
711
0
          *ac_strategy, bx + cx, by + cy + blocks, bx + cx + blocks) ||
712
0
      MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx, by + cy,
713
0
                                                 by + cy + blocks) ||
714
0
      MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx + blocks,
715
0
                                                 by + cy, by + cy + blocks)) {
716
0
    return true;  // not suitable for JxJ analysis, some transforms leak out.
717
0
  }
718
  // For floating transforms there may be
719
  // already blocks selected that make either or both JXK and
720
  // KXJ not feasible for this location.
721
0
  const bool allow_JXK = !MultiBlockTransformCrossesVerticalBoundary(
722
0
      *ac_strategy, bx + cx + blocks_half, by + cy, by + cy + blocks);
723
0
  const bool allow_KXJ = !MultiBlockTransformCrossesHorizontalBoundary(
724
0
      *ac_strategy, bx + cx, by + cy + blocks_half, bx + cx + blocks);
725
  // Current entropies aggregated on NxN resolution.
726
0
  float entropy[2][2] = {};
727
0
  for (size_t dy = 0; dy < blocks; ++dy) {
728
0
    for (size_t dx = 0; dx < blocks; ++dx) {
729
0
      entropy[dy / blocks_half][dx / blocks_half] +=
730
0
          entropy_estimate[(cy + dy) * 8 + (cx + dx)];
731
0
    }
732
0
  }
733
0
  float entropy_JXK_left = std::numeric_limits<float>::max();
734
0
  float entropy_JXK_right = std::numeric_limits<float>::max();
735
0
  float entropy_KXJ_top = std::numeric_limits<float>::max();
736
0
  float entropy_KXJ_bottom = std::numeric_limits<float>::max();
737
0
  float entropy_JXJ = std::numeric_limits<float>::max();
738
0
  if (allow_JXK) {
739
0
    if (row0[bx + cx + 0].Strategy() != acs_rawJXK) {
740
0
      JXL_RETURN_IF_ERROR(EstimateEntropy(
741
0
          acsJXK, entropy_mul_JXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config,
742
0
          cmap_factors, block, scratch_space, quantized, entropy_JXK_left));
743
0
    }
744
0
    if (row0[bx + cx + blocks_half].Strategy() != acs_rawJXK) {
745
0
      JXL_RETURN_IF_ERROR(
746
0
          EstimateEntropy(acsJXK, entropy_mul_JXK, (bx + cx + blocks_half) * 8,
747
0
                          (by + cy + 0) * 8, config, cmap_factors, block,
748
0
                          scratch_space, quantized, entropy_JXK_right));
749
0
    }
750
0
  }
751
0
  if (allow_KXJ) {
752
0
    if (row0[bx + cx].Strategy() != acs_rawKXJ) {
753
0
      JXL_RETURN_IF_ERROR(EstimateEntropy(
754
0
          acsKXJ, entropy_mul_JXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config,
755
0
          cmap_factors, block, scratch_space, quantized, entropy_KXJ_top));
756
0
    }
757
0
    if (row1[bx + cx].Strategy() != acs_rawKXJ) {
758
0
      JXL_RETURN_IF_ERROR(
759
0
          EstimateEntropy(acsKXJ, entropy_mul_JXK, (bx + cx + 0) * 8,
760
0
                          (by + cy + blocks_half) * 8, config, cmap_factors,
761
0
                          block, scratch_space, quantized, entropy_KXJ_bottom));
762
0
    }
763
0
  }
764
0
  if (allow_square_transform) {
765
    // We control the exploration of the square transform separately so that
766
    // we can turn it off at high decoding speeds for 32x32, but still allow
767
    // exploring 16x32 and 32x16.
768
0
    JXL_RETURN_IF_ERROR(EstimateEntropy(
769
0
        acsJXJ, entropy_mul_JXJ, (bx + cx + 0) * 8, (by + cy + 0) * 8, config,
770
0
        cmap_factors, block, scratch_space, quantized, entropy_JXJ));
771
0
  }
772
773
  // Test if this block should have JXK or KXJ transforms,
774
  // because it can have only one or the other.
775
0
  float costJxN = std::min(entropy_JXK_left, entropy[0][0] + entropy[1][0]) +
776
0
                  std::min(entropy_JXK_right, entropy[0][1] + entropy[1][1]);
777
0
  float costNxJ = std::min(entropy_KXJ_top, entropy[0][0] + entropy[0][1]) +
778
0
                  std::min(entropy_KXJ_bottom, entropy[1][0] + entropy[1][1]);
779
0
  if (entropy_JXJ < costJxN && entropy_JXJ < costNxJ) {
780
0
    JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_rawJXJ));
781
0
    SetEntropyForTransform(cx, cy, acs_rawJXJ, entropy_JXJ, entropy_estimate);
782
0
  } else if (costJxN < costNxJ) {
783
0
    if (entropy_JXK_left < entropy[0][0] + entropy[1][0]) {
784
0
      JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_rawJXK));
785
0
      SetEntropyForTransform(cx, cy, acs_rawJXK, entropy_JXK_left,
786
0
                             entropy_estimate);
787
0
    }
788
0
    if (entropy_JXK_right < entropy[0][1] + entropy[1][1]) {
789
0
      JXL_RETURN_IF_ERROR(
790
0
          ac_strategy->Set(bx + cx + blocks_half, by + cy, acs_rawJXK));
791
0
      SetEntropyForTransform(cx + blocks_half, cy, acs_rawJXK,
792
0
                             entropy_JXK_right, entropy_estimate);
793
0
    }
794
0
  } else {
795
0
    if (entropy_KXJ_top < entropy[0][0] + entropy[0][1]) {
796
0
      JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_rawKXJ));
797
0
      SetEntropyForTransform(cx, cy, acs_rawKXJ, entropy_KXJ_top,
798
0
                             entropy_estimate);
799
0
    }
800
0
    if (entropy_KXJ_bottom < entropy[1][0] + entropy[1][1]) {
801
0
      JXL_RETURN_IF_ERROR(
802
0
          ac_strategy->Set(bx + cx, by + cy + blocks_half, acs_rawKXJ));
803
0
      SetEntropyForTransform(cx, cy + blocks_half, acs_rawKXJ,
804
0
                             entropy_KXJ_bottom, entropy_estimate);
805
0
    }
806
0
  }
807
0
  return true;
808
0
}
809
810
Status ProcessRectACS(const CompressParams& cparams, const ACSConfig& config,
811
                      const Rect& rect, const ColorCorrelationMap& cmap,
812
                      float* JXL_RESTRICT block,
813
                      uint32_t* JXL_RESTRICT quantized,
814
0
                      AcStrategyImage* ac_strategy) {
815
  // Main philosophy here:
816
  // 1. First find best 8x8 transform for each area.
817
  // 2. Merging them into larger transforms where possibly, but
818
  // starting from the smallest transforms (16x8 and 8x16).
819
  // Additional complication: 16x8 and 8x16 are considered
820
  // simultaneously and fairly against each other.
821
  // We are looking at 64x64 squares since the Y-to-X and Y-to-B
822
  // maps happen to be at that resolution, and having
823
  // integral transforms cross these boundaries leads to
824
  // additional complications.
825
0
  const float butteraugli_target = cparams.butteraugli_distance;
826
0
  float* JXL_RESTRICT scratch_space = block + 3 * AcStrategy::kMaxCoeffArea;
827
0
  size_t bx = rect.x0();
828
0
  size_t by = rect.y0();
829
0
  JXL_ENSURE(rect.xsize() <= 8);
830
0
  JXL_ENSURE(rect.ysize() <= 8);
831
0
  size_t tx = bx / kColorTileDimInBlocks;
832
0
  size_t ty = by / kColorTileDimInBlocks;
833
0
  const float cmap_factors[3] = {
834
0
      cmap.base().YtoXRatio(cmap.ytox_map.ConstRow(ty)[tx]),
835
0
      0.0f,
836
0
      cmap.base().YtoBRatio(cmap.ytob_map.ConstRow(ty)[tx]),
837
0
  };
838
0
  if (cparams.speed_tier > SpeedTier::kHare) return true;
839
  // First compute the best 8x8 transform for each square. Later, we do not
840
  // experiment with different combinations, but only use the best of the 8x8s
841
  // when DCT8X8 is specified in the tree search.
842
  // 8x8 transforms have 10 variants, but every larger transform is just a DCT.
843
0
  float entropy_estimate[64] = {};
844
  // Favor all 8x8 transforms (against 16x8 and larger transforms)) at
845
  // low butteraugli_target distances.
846
0
  static const float k8x8mul1 = -0.4;
847
0
  static const float k8x8mul2 = 1.0;
848
0
  static const float k8x8base = 1.4;
849
0
  const float mul8x8 = k8x8mul2 + k8x8mul1 / (butteraugli_target + k8x8base);
850
0
  for (size_t iy = 0; iy < rect.ysize(); iy++) {
851
0
    for (size_t ix = 0; ix < rect.xsize(); ix++) {
852
0
      float entropy = 0.0;
853
0
      AcStrategyType best_of_8x8s;
854
0
      JXL_RETURN_IF_ERROR(FindBest8x8Transform(
855
0
          8 * (bx + ix), 8 * (by + iy), static_cast<int>(cparams.speed_tier),
856
0
          butteraugli_target, config, cmap_factors, ac_strategy, block,
857
0
          scratch_space, quantized, &entropy, best_of_8x8s));
858
0
      JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + ix, by + iy, best_of_8x8s));
859
0
      entropy_estimate[iy * 8 + ix] = entropy * mul8x8;
860
0
    }
861
0
  }
862
  // Merge when a larger transform is better than the previously
863
  // searched best combination of 8x8 transforms.
864
0
  struct MergeTry {
865
0
    AcStrategyType type;
866
0
    uint8_t priority;
867
0
    uint8_t decoding_speed_tier_max_limit;
868
0
    uint8_t encoding_speed_tier_max_limit;
869
0
    float entropy_mul;
870
0
  };
871
  // These numbers need to be figured out manually and looking at
872
  // ringing next to sky etc. Optimization will find smaller numbers
873
  // and produce more ringing than is ideal. Larger numbers will
874
  // help stop ringing.
875
0
  const float entropy_mul16X8 = 1.25;
876
0
  const float entropy_mul16X16 = 1.35;
877
0
  const float entropy_mul16X32 = 1.5;
878
0
  const float entropy_mul32X32 = 1.5;
879
0
  const float entropy_mul64X32 = 2.26;
880
0
  const float entropy_mul64X64 = 2.26;
881
  // TODO(jyrki): Consider this feedback in further changes:
882
  // Also effectively when the multipliers for smaller blocks are
883
  // below 1, this raises the bar for the bigger blocks even higher
884
  // in that sense these constants are not independent (e.g. changing
885
  // the constant for DCT16x32 by -5% (making it more likely) also
886
  // means that DCT32x32 becomes harder to do when starting from
887
  // two DCT16x32s). It might be better to make them more independent,
888
  // e.g. by not applying the multiplier when storing the new entropy
889
  // estimates in TryMergeToACSCandidate().
890
0
  const MergeTry kTransformsForMerge[9] = {
891
0
      {AcStrategyType::DCT16X8, 2, 4, 5, entropy_mul16X8},
892
0
      {AcStrategyType::DCT8X16, 2, 4, 5, entropy_mul16X8},
893
      // FindBestFirstLevelDivisionForSquare looks for DCT16X16 and its
894
      // subdivisions. {AcStrategyType::DCT16X16, 3, entropy_mul16X16},
895
0
      {AcStrategyType::DCT16X32, 4, 4, 4, entropy_mul16X32},
896
0
      {AcStrategyType::DCT32X16, 4, 4, 4, entropy_mul16X32},
897
      // FindBestFirstLevelDivisionForSquare looks for DCT32X32 and its
898
      // subdivisions. {AcStrategyType::DCT32X32, 5, 1, 5,
899
      // 0.9822994906548809f},
900
0
      {AcStrategyType::DCT64X32, 6, 1, 3, entropy_mul64X32},
901
0
      {AcStrategyType::DCT32X64, 6, 1, 3, entropy_mul64X32},
902
      // {AcStrategyType::DCT64X64, 8, 1, 3, 2.0846542128012948f},
903
0
  };
904
  /*
905
  These sizes not yet included in merge heuristic:
906
  set(AcStrategyType::DCT32X8, 0.0f, 2.261390410971102f);
907
  set(AcStrategyType::DCT8X32, 0.0f, 2.261390410971102f);
908
  set(AcStrategyType::DCT128X128, 0.0f, 1.0f);
909
  set(AcStrategyType::DCT128X64, 0.0f, 0.73f);
910
  set(AcStrategyType::DCT64X128, 0.0f, 0.73f);
911
  set(AcStrategyType::DCT256X256, 0.0f, 1.0f);
912
  set(AcStrategyType::DCT256X128, 0.0f, 0.73f);
913
  set(AcStrategyType::DCT128X256, 0.0f, 0.73f);
914
  */
915
916
  // Priority is a tricky kludge to avoid collisions so that transforms
917
  // don't overlap.
918
0
  uint8_t priority[64] = {};
919
0
  bool enable_32x32 = cparams.decoding_speed_tier < 4;
920
0
  for (auto tx : kTransformsForMerge) {
921
0
    if (tx.decoding_speed_tier_max_limit < cparams.decoding_speed_tier) {
922
0
      continue;
923
0
    }
924
0
    AcStrategy acs = AcStrategy::FromRawStrategy(tx.type);
925
926
0
    for (size_t cy = 0; cy + acs.covered_blocks_y() - 1 < rect.ysize();
927
0
         cy += acs.covered_blocks_y()) {
928
0
      for (size_t cx = 0; cx + acs.covered_blocks_x() - 1 < rect.xsize();
929
0
           cx += acs.covered_blocks_x()) {
930
0
        if (cy + 7 < rect.ysize() && cx + 7 < rect.xsize()) {
931
0
          if (cparams.decoding_speed_tier < 4 &&
932
0
              tx.type == AcStrategyType::DCT32X64) {
933
            // We handle both DCT8X16 and DCT16X8 at the same time.
934
0
            if ((cy | cx) % 8 == 0) {
935
0
              JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare(
936
0
                  8, true, bx, by, cx, cy, config, cmap_factors, ac_strategy,
937
0
                  tx.entropy_mul, entropy_mul64X64, entropy_estimate, block,
938
0
                  scratch_space, quantized));
939
0
            }
940
0
            continue;
941
0
          } else if (tx.type == AcStrategyType::DCT32X16) {
942
            // We handled both DCT8X16 and DCT16X8 at the same time,
943
            // and that is above. The last column and last row,
944
            // when the last column or last row is odd numbered,
945
            // are still handled by TryMergeAcs.
946
0
            continue;
947
0
          }
948
0
        }
949
0
        if ((tx.type == AcStrategyType::DCT16X32 && cy % 4 != 0) ||
950
0
            (tx.type == AcStrategyType::DCT32X16 && cx % 4 != 0)) {
951
          // already covered by FindBest32X32
952
0
          continue;
953
0
        }
954
955
0
        if (cy + 3 < rect.ysize() && cx + 3 < rect.xsize()) {
956
0
          if (tx.type == AcStrategyType::DCT16X32) {
957
            // We handle both DCT8X16 and DCT16X8 at the same time.
958
0
            if ((cy | cx) % 4 == 0) {
959
0
              JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare(
960
0
                  4, enable_32x32, bx, by, cx, cy, config, cmap_factors,
961
0
                  ac_strategy, tx.entropy_mul, entropy_mul32X32,
962
0
                  entropy_estimate, block, scratch_space, quantized));
963
0
            }
964
0
            continue;
965
0
          } else if (tx.type == AcStrategyType::DCT32X16) {
966
            // We handled both DCT8X16 and DCT16X8 at the same time,
967
            // and that is above. The last column and last row,
968
            // when the last column or last row is odd numbered,
969
            // are still handled by TryMergeAcs.
970
0
            continue;
971
0
          }
972
0
        }
973
0
        if ((tx.type == AcStrategyType::DCT16X32 && cy % 4 != 0) ||
974
0
            (tx.type == AcStrategyType::DCT32X16 && cx % 4 != 0)) {
975
          // already covered by FindBest32X32
976
0
          continue;
977
0
        }
978
0
        if (cy + 1 < rect.ysize() && cx + 1 < rect.xsize()) {
979
0
          if (tx.type == AcStrategyType::DCT8X16) {
980
            // We handle both DCT8X16 and DCT16X8 at the same time.
981
0
            if ((cy | cx) % 2 == 0) {
982
0
              JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare(
983
0
                  2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy,
984
0
                  tx.entropy_mul, entropy_mul16X16, entropy_estimate, block,
985
0
                  scratch_space, quantized));
986
0
            }
987
0
            continue;
988
0
          } else if (tx.type == AcStrategyType::DCT16X8) {
989
            // We handled both DCT8X16 and DCT16X8 at the same time,
990
            // and that is above. The last column and last row,
991
            // when the last column or last row is odd numbered,
992
            // are still handled by TryMergeAcs.
993
0
            continue;
994
0
          }
995
0
        }
996
0
        if ((tx.type == AcStrategyType::DCT8X16 && cy % 2 == 1) ||
997
0
            (tx.type == AcStrategyType::DCT16X8 && cx % 2 == 1)) {
998
          // already covered by FindBestFirstLevelDivisionForSquare
999
0
          continue;
1000
0
        }
1001
        // All other merge sizes are handled here.
1002
        // Some of the DCT16X8s and DCT8X16s will still leak through here
1003
        // when there is an odd number of 8x8 blocks, then the last row
1004
        // and column will get their DCT16X8s and DCT8X16s through the
1005
        // normal integral transform merging process.
1006
0
        JXL_RETURN_IF_ERROR(
1007
0
            TryMergeAcs(tx.type, bx, by, cx, cy, config, cmap_factors,
1008
0
                        ac_strategy, tx.entropy_mul, tx.priority, &priority[0],
1009
0
                        entropy_estimate, block, scratch_space, quantized));
1010
0
      }
1011
0
    }
1012
0
  }
1013
0
  if (cparams.speed_tier >= SpeedTier::kHare) {
1014
0
    return true;
1015
0
  }
1016
  // Here we still try to do some non-aligned matching, find a few more
1017
  // 16X8, 8X16 and 16X16s between the non-2-aligned blocks.
1018
0
  for (size_t cy = 0; cy + 1 < rect.ysize(); ++cy) {
1019
0
    for (size_t cx = 0; cx + 1 < rect.xsize(); ++cx) {
1020
0
      if ((cy | cx) % 2 != 0) {
1021
0
        JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare(
1022
0
            2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy,
1023
0
            entropy_mul16X8, entropy_mul16X16, entropy_estimate, block,
1024
0
            scratch_space, quantized));
1025
0
      }
1026
0
    }
1027
0
  }
1028
  // Non-aligned matching for 32X32, 16X32 and 32X16.
1029
0
  size_t step = cparams.speed_tier >= SpeedTier::kTortoise ? 2 : 1;
1030
0
  for (size_t cy = 0; cy + 3 < rect.ysize(); cy += step) {
1031
0
    for (size_t cx = 0; cx + 3 < rect.xsize(); cx += step) {
1032
0
      if ((cy | cx) % 4 == 0) {
1033
0
        continue;  // Already tried with loop above (DCT16X32 case).
1034
0
      }
1035
0
      JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare(
1036
0
          4, enable_32x32, bx, by, cx, cy, config, cmap_factors, ac_strategy,
1037
0
          entropy_mul16X32, entropy_mul32X32, entropy_estimate, block,
1038
0
          scratch_space, quantized));
1039
0
    }
1040
0
  }
1041
0
  return true;
1042
0
}
1043
1044
// NOLINTNEXTLINE(google-readability-namespace-comments)
1045
}  // namespace HWY_NAMESPACE
1046
}  // namespace jxl
1047
HWY_AFTER_NAMESPACE();
1048
1049
#if HWY_ONCE
1050
namespace jxl {
1051
HWY_EXPORT(ProcessRectACS);
1052
1053
Status AcStrategyHeuristics::Init(const Image3F& src, const Rect& rect_in,
1054
                                  const ImageF& quant_field, const ImageF& mask,
1055
                                  const ImageF& mask1x1,
1056
0
                                  DequantMatrices* matrices) {
1057
0
  config.dequant = matrices;
1058
1059
0
  if (cparams.speed_tier >= SpeedTier::kCheetah) {
1060
0
    JXL_RETURN_IF_ERROR(
1061
0
        matrices->EnsureComputed(memory_manager, 1));  // DCT8 only
1062
0
  } else {
1063
0
    uint32_t acs_mask = 0;
1064
    // All transforms up to 64x64.
1065
0
    for (size_t i = 0; i < static_cast<size_t>(AcStrategyType::DCT128X128);
1066
0
         i++) {
1067
0
      acs_mask |= (1 << i);
1068
0
    }
1069
0
    JXL_RETURN_IF_ERROR(matrices->EnsureComputed(memory_manager, acs_mask));
1070
0
  }
1071
1072
  // Image row pointers and strides.
1073
0
  config.quant_field_row = quant_field.Row(0);
1074
0
  config.quant_field_stride = quant_field.PixelsPerRow();
1075
0
  if (mask.xsize() > 0 && mask.ysize() > 0) {
1076
0
    config.masking_field_row = mask.Row(0);
1077
0
    config.masking_field_stride = mask.PixelsPerRow();
1078
0
  }
1079
0
  config.mask1x1_xsize = mask1x1.xsize();
1080
0
  if (mask1x1.xsize() > 0 && mask1x1.ysize() > 0) {
1081
0
    config.masking1x1_field_row = mask1x1.Row(0);
1082
0
    config.masking1x1_field_stride = mask1x1.PixelsPerRow();
1083
0
  }
1084
1085
0
  config.src_rows[0] = rect_in.ConstPlaneRow(src, 0, 0);
1086
0
  config.src_rows[1] = rect_in.ConstPlaneRow(src, 1, 0);
1087
0
  config.src_rows[2] = rect_in.ConstPlaneRow(src, 2, 0);
1088
0
  config.src_stride = src.PixelsPerRow();
1089
1090
  // Entropy estimate is composed of two factors:
1091
  //  - estimate of the number of bits that will be used by the block
1092
  //  - information loss due to quantization
1093
  // The following constant controls the relative weights of these components.
1094
0
  config.info_loss_multiplier = 1.2;
1095
0
  config.zeros_mul = 9.3089059022677905;
1096
0
  config.cost_delta = 10.833273317067883;
1097
1098
0
  static const float kBias = 0.13731742964354549;
1099
0
  const float ratio = (cparams.butteraugli_distance + kBias) / (1.0f + kBias);
1100
1101
0
  static const float kPow1 = 0.33677806662454718;
1102
0
  static const float kPow2 = 0.50990926717963703;
1103
0
  static const float kPow3 = 0.36702940662370243;
1104
0
  config.info_loss_multiplier *= std::pow(ratio, kPow1);
1105
0
  config.zeros_mul *= std::pow(ratio, kPow2);
1106
0
  config.cost_delta *= std::pow(ratio, kPow3);
1107
0
  return true;
1108
0
}
1109
1110
0
Status AcStrategyHeuristics::PrepareForThreads(std::size_t num_threads) {
1111
0
  const size_t dct_scratch_size =
1112
0
      3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim;
1113
0
  mem_per_thread = 6 * AcStrategy::kMaxCoeffArea + dct_scratch_size;
1114
0
  size_t mem_bytes = num_threads * mem_per_thread * sizeof(float);
1115
0
  JXL_ASSIGN_OR_RETURN(mem, AlignedMemory::Create(memory_manager, mem_bytes));
1116
0
  qmem_per_thread = AcStrategy::kMaxCoeffArea;
1117
0
  size_t qmem_bytes = num_threads * qmem_per_thread * sizeof(uint32_t);
1118
0
  JXL_ASSIGN_OR_RETURN(qmem, AlignedMemory::Create(memory_manager, qmem_bytes));
1119
0
  return true;
1120
0
}
1121
1122
Status AcStrategyHeuristics::ProcessRect(const Rect& rect,
1123
                                         const ColorCorrelationMap& cmap,
1124
                                         AcStrategyImage* ac_strategy,
1125
0
                                         size_t thread) {
1126
  // In Falcon mode, use DCT8 everywhere and uniform quantization.
1127
0
  if (cparams.speed_tier >= SpeedTier::kCheetah) {
1128
0
    ac_strategy->FillDCT8(rect);
1129
0
    return true;
1130
0
  }
1131
0
  return HWY_DYNAMIC_DISPATCH(ProcessRectACS)(
1132
0
      cparams, config, rect, cmap,
1133
0
      mem.address<float>() + thread * mem_per_thread,
1134
0
      qmem.address<uint32_t>() + thread * qmem_per_thread, ac_strategy);
1135
0
}
1136
1137
Status AcStrategyHeuristics::Finalize(const FrameDimensions& frame_dim,
1138
                                      const AcStrategyImage& ac_strategy,
1139
0
                                      AuxOut* aux_out) {
1140
  // Accounting and debug output.
1141
0
  if (aux_out != nullptr) {
1142
0
    aux_out->num_small_blocks =
1143
0
        ac_strategy.CountBlocks(AcStrategyType::IDENTITY) +
1144
0
        ac_strategy.CountBlocks(AcStrategyType::DCT2X2) +
1145
0
        ac_strategy.CountBlocks(AcStrategyType::DCT4X4);
1146
0
    aux_out->num_dct4x8_blocks =
1147
0
        ac_strategy.CountBlocks(AcStrategyType::DCT4X8) +
1148
0
        ac_strategy.CountBlocks(AcStrategyType::DCT8X4);
1149
0
    aux_out->num_afv_blocks = ac_strategy.CountBlocks(AcStrategyType::AFV0) +
1150
0
                              ac_strategy.CountBlocks(AcStrategyType::AFV1) +
1151
0
                              ac_strategy.CountBlocks(AcStrategyType::AFV2) +
1152
0
                              ac_strategy.CountBlocks(AcStrategyType::AFV3);
1153
0
    aux_out->num_dct8_blocks = ac_strategy.CountBlocks(AcStrategyType::DCT);
1154
0
    aux_out->num_dct8x16_blocks =
1155
0
        ac_strategy.CountBlocks(AcStrategyType::DCT8X16) +
1156
0
        ac_strategy.CountBlocks(AcStrategyType::DCT16X8);
1157
0
    aux_out->num_dct8x32_blocks =
1158
0
        ac_strategy.CountBlocks(AcStrategyType::DCT8X32) +
1159
0
        ac_strategy.CountBlocks(AcStrategyType::DCT32X8);
1160
0
    aux_out->num_dct16_blocks =
1161
0
        ac_strategy.CountBlocks(AcStrategyType::DCT16X16);
1162
0
    aux_out->num_dct16x32_blocks =
1163
0
        ac_strategy.CountBlocks(AcStrategyType::DCT16X32) +
1164
0
        ac_strategy.CountBlocks(AcStrategyType::DCT32X16);
1165
0
    aux_out->num_dct32_blocks =
1166
0
        ac_strategy.CountBlocks(AcStrategyType::DCT32X32);
1167
0
    aux_out->num_dct32x64_blocks =
1168
0
        ac_strategy.CountBlocks(AcStrategyType::DCT32X64) +
1169
0
        ac_strategy.CountBlocks(AcStrategyType::DCT64X32);
1170
0
    aux_out->num_dct64_blocks =
1171
0
        ac_strategy.CountBlocks(AcStrategyType::DCT64X64);
1172
0
  }
1173
1174
0
  if (JXL_DEBUG_AC_STRATEGY && WantDebugOutput(cparams)) {
1175
0
    JXL_RETURN_IF_ERROR(DumpAcStrategy(ac_strategy, frame_dim.xsize,
1176
0
                                       frame_dim.ysize, "ac_strategy", aux_out,
1177
0
                                       cparams));
1178
0
  }
1179
0
  return true;
1180
0
}
1181
1182
}  // namespace jxl
1183
#endif  // HWY_ONCE