Coverage Report

Created: 2024-05-21 06:41

/src/libjxl/lib/jxl/enc_ac_strategy.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/enc_ac_strategy.h"
7
8
#include <jxl/memory_manager.h>
9
10
#include <algorithm>
11
#include <cmath>
12
#include <cstdint>
13
#include <cstdio>
14
#include <cstring>
15
16
#undef HWY_TARGET_INCLUDE
17
#define HWY_TARGET_INCLUDE "lib/jxl/enc_ac_strategy.cc"
18
#include <hwy/foreach_target.h>
19
#include <hwy/highway.h>
20
21
#include "lib/jxl/ac_strategy.h"
22
#include "lib/jxl/base/bits.h"
23
#include "lib/jxl/base/compiler_specific.h"
24
#include "lib/jxl/base/fast_math-inl.h"
25
#include "lib/jxl/base/rect.h"
26
#include "lib/jxl/base/status.h"
27
#include "lib/jxl/dec_transforms-inl.h"
28
#include "lib/jxl/enc_aux_out.h"
29
#include "lib/jxl/enc_debug_image.h"
30
#include "lib/jxl/enc_params.h"
31
#include "lib/jxl/enc_transforms-inl.h"
32
#include "lib/jxl/simd_util.h"
33
34
// Some of the floating point constants in this file and in other
35
// files in the libjxl project have been obtained using the
36
// tools/optimizer/simplex_fork.py tool. It is a variation of
37
// Nelder-Mead optimization, and we generally try to minimize
38
// BPP * pnorm aggregate as reported by the benchmark_xl tool,
39
// but occasionally the values are optimized by using additional
40
// constraints such as maintaining a certain density, or ratio of
41
// popularity of integral transforms. Jyrki visually reviews all
42
// such changes and often makes manual changes to maintain good
43
// visual quality to changes where butteraugli was not sufficiently
44
// sensitive to some kind of degradation. Unfortunately image quality
45
// is still more of an art than science.
46
47
// Set JXL_DEBUG_AC_STRATEGY to 1 to enable debugging.
48
#ifndef JXL_DEBUG_AC_STRATEGY
49
104
#define JXL_DEBUG_AC_STRATEGY 0
50
#endif
51
52
// This must come before the begin/end_target, but HWY_ONCE is only true
53
// after that, so use an "include guard".
54
#ifndef LIB_JXL_ENC_AC_STRATEGY_
55
#define LIB_JXL_ENC_AC_STRATEGY_
56
// Parameters of the heuristic are marked with a OPTIMIZE comment.
57
namespace jxl {
58
namespace {
59
60
// Debugging utilities.
61
62
// Returns a linear sRGB color (as bytes) for each AC strategy.
63
0
const uint8_t* TypeColor(const uint8_t& raw_strategy) {
64
0
  JXL_ASSERT(AcStrategy::IsRawStrategyValid(raw_strategy));
65
0
  static_assert(AcStrategy::kNumValidStrategies == 27, "Change colors");
66
0
  static constexpr uint8_t kColors[][3] = {
67
0
      {0xFF, 0xFF, 0x00},  // DCT8
68
0
      {0xFF, 0x80, 0x80},  // HORNUSS
69
0
      {0xFF, 0x80, 0x80},  // DCT2x2
70
0
      {0xFF, 0x80, 0x80},  // DCT4x4
71
0
      {0x80, 0xFF, 0x00},  // DCT16x16
72
0
      {0x00, 0xC0, 0x00},  // DCT32x32
73
0
      {0xC0, 0xFF, 0x00},  // DCT16x8
74
0
      {0xC0, 0xFF, 0x00},  // DCT8x16
75
0
      {0x00, 0xFF, 0x00},  // DCT32x8
76
0
      {0x00, 0xFF, 0x00},  // DCT8x32
77
0
      {0x00, 0xFF, 0x00},  // DCT32x16
78
0
      {0x00, 0xFF, 0x00},  // DCT16x32
79
0
      {0xFF, 0x80, 0x00},  // DCT4x8
80
0
      {0xFF, 0x80, 0x00},  // DCT8x4
81
0
      {0xFF, 0xFF, 0x80},  // AFV0
82
0
      {0xFF, 0xFF, 0x80},  // AFV1
83
0
      {0xFF, 0xFF, 0x80},  // AFV2
84
0
      {0xFF, 0xFF, 0x80},  // AFV3
85
0
      {0x00, 0xC0, 0xFF},  // DCT64x64
86
0
      {0x00, 0xFF, 0xFF},  // DCT64x32
87
0
      {0x00, 0xFF, 0xFF},  // DCT32x64
88
0
      {0x00, 0x40, 0xFF},  // DCT128x128
89
0
      {0x00, 0x80, 0xFF},  // DCT128x64
90
0
      {0x00, 0x80, 0xFF},  // DCT64x128
91
0
      {0x00, 0x00, 0xC0},  // DCT256x256
92
0
      {0x00, 0x00, 0xFF},  // DCT256x128
93
0
      {0x00, 0x00, 0xFF},  // DCT128x256
94
0
  };
95
0
  return kColors[raw_strategy];
96
0
}
97
98
0
const uint8_t* TypeMask(const uint8_t& raw_strategy) {
99
0
  JXL_ASSERT(AcStrategy::IsRawStrategyValid(raw_strategy));
100
0
  static_assert(AcStrategy::kNumValidStrategies == 27, "Add masks");
101
0
  // implicitly, first row and column is made dark
102
0
  static constexpr uint8_t kMask[][64] = {
103
0
      {
104
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
105
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
106
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
107
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
108
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
109
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
110
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
111
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
112
0
      },                           // DCT8
113
0
      {
114
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
115
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
116
0
          0, 0, 1, 0, 0, 1, 0, 0,  //
117
0
          0, 0, 1, 0, 0, 1, 0, 0,  //
118
0
          0, 0, 1, 1, 1, 1, 0, 0,  //
119
0
          0, 0, 1, 0, 0, 1, 0, 0,  //
120
0
          0, 0, 1, 0, 0, 1, 0, 0,  //
121
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
122
0
      },                           // HORNUSS
123
0
      {
124
0
          1, 1, 1, 1, 1, 1, 1, 1,  //
125
0
          1, 0, 1, 0, 1, 0, 1, 0,  //
126
0
          1, 1, 1, 1, 1, 1, 1, 1,  //
127
0
          1, 0, 1, 0, 1, 0, 1, 0,  //
128
0
          1, 1, 1, 1, 1, 1, 1, 1,  //
129
0
          1, 0, 1, 0, 1, 0, 1, 0,  //
130
0
          1, 1, 1, 1, 1, 1, 1, 1,  //
131
0
          1, 0, 1, 0, 1, 0, 1, 0,  //
132
0
      },                           // 2x2
133
0
      {
134
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
135
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
136
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
137
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
138
0
          1, 1, 1, 1, 1, 1, 1, 1,  //
139
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
140
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
141
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
142
0
      },                           // 4x4
143
0
      {},                          // DCT16x16 (unused)
144
0
      {},                          // DCT32x32 (unused)
145
0
      {},                          // DCT16x8 (unused)
146
0
      {},                          // DCT8x16 (unused)
147
0
      {},                          // DCT32x8 (unused)
148
0
      {},                          // DCT8x32 (unused)
149
0
      {},                          // DCT32x16 (unused)
150
0
      {},                          // DCT16x32 (unused)
151
0
      {
152
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
153
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
154
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
155
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
156
0
          1, 1, 1, 1, 1, 1, 1, 1,  //
157
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
158
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
159
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
160
0
      },                           // DCT4x8
161
0
      {
162
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
163
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
164
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
165
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
166
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
167
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
168
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
169
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
170
0
      },                           // DCT8x4
171
0
      {
172
0
          1, 1, 1, 1, 1, 0, 0, 0,  //
173
0
          1, 1, 1, 1, 0, 0, 0, 0,  //
174
0
          1, 1, 1, 0, 0, 0, 0, 0,  //
175
0
          1, 1, 0, 0, 0, 0, 0, 0,  //
176
0
          1, 0, 0, 0, 0, 0, 0, 0,  //
177
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
178
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
179
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
180
0
      },                           // AFV0
181
0
      {
182
0
          0, 0, 0, 0, 1, 1, 1, 1,  //
183
0
          0, 0, 0, 0, 0, 1, 1, 1,  //
184
0
          0, 0, 0, 0, 0, 0, 1, 1,  //
185
0
          0, 0, 0, 0, 0, 0, 0, 1,  //
186
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
187
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
188
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
189
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
190
0
      },                           // AFV1
191
0
      {
192
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
193
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
194
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
195
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
196
0
          1, 0, 0, 0, 0, 0, 0, 0,  //
197
0
          1, 1, 0, 0, 0, 0, 0, 0,  //
198
0
          1, 1, 1, 0, 0, 0, 0, 0,  //
199
0
          1, 1, 1, 1, 0, 0, 0, 0,  //
200
0
      },                           // AFV2
201
0
      {
202
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
203
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
204
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
205
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
206
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
207
0
          0, 0, 0, 0, 0, 0, 0, 1,  //
208
0
          0, 0, 0, 0, 0, 0, 1, 1,  //
209
0
          0, 0, 0, 0, 0, 1, 1, 1,  //
210
0
      },                           // AFV3
211
0
  };
212
0
  return kMask[raw_strategy];
213
0
}
214
215
Status DumpAcStrategy(const AcStrategyImage& ac_strategy, size_t xsize,
216
                      size_t ysize, const char* tag, AuxOut* aux_out,
217
0
                      const CompressParams& cparams) {
218
0
  JxlMemoryManager* memory_manager = ac_strategy.memory_manager();
219
0
  JXL_ASSIGN_OR_RETURN(Image3F color_acs,
220
0
                       Image3F::Create(memory_manager, xsize, ysize));
221
0
  for (size_t y = 0; y < ysize; y++) {
222
0
    float* JXL_RESTRICT rows[3] = {
223
0
        color_acs.PlaneRow(0, y),
224
0
        color_acs.PlaneRow(1, y),
225
0
        color_acs.PlaneRow(2, y),
226
0
    };
227
0
    const AcStrategyRow acs_row = ac_strategy.ConstRow(y / kBlockDim);
228
0
    for (size_t x = 0; x < xsize; x++) {
229
0
      AcStrategy acs = acs_row[x / kBlockDim];
230
0
      const uint8_t* JXL_RESTRICT color = TypeColor(acs.RawStrategy());
231
0
      for (size_t c = 0; c < 3; c++) {
232
0
        rows[c][x] = color[c] / 255.f;
233
0
      }
234
0
    }
235
0
  }
236
0
  size_t stride = color_acs.PixelsPerRow();
237
0
  for (size_t c = 0; c < 3; c++) {
238
0
    for (size_t by = 0; by < DivCeil(ysize, kBlockDim); by++) {
239
0
      float* JXL_RESTRICT row = color_acs.PlaneRow(c, by * kBlockDim);
240
0
      const AcStrategyRow acs_row = ac_strategy.ConstRow(by);
241
0
      for (size_t bx = 0; bx < DivCeil(xsize, kBlockDim); bx++) {
242
0
        AcStrategy acs = acs_row[bx];
243
0
        if (!acs.IsFirstBlock()) continue;
244
0
        const uint8_t* JXL_RESTRICT color = TypeColor(acs.RawStrategy());
245
0
        const uint8_t* JXL_RESTRICT mask = TypeMask(acs.RawStrategy());
246
0
        if (acs.covered_blocks_x() == 1 && acs.covered_blocks_y() == 1) {
247
0
          for (size_t iy = 0; iy < kBlockDim && by * kBlockDim + iy < ysize;
248
0
               iy++) {
249
0
            for (size_t ix = 0; ix < kBlockDim && bx * kBlockDim + ix < xsize;
250
0
                 ix++) {
251
0
              if (mask[iy * kBlockDim + ix]) {
252
0
                row[iy * stride + bx * kBlockDim + ix] = color[c] / 800.f;
253
0
              }
254
0
            }
255
0
          }
256
0
        }
257
0
        // draw block edges
258
0
        for (size_t ix = 0; ix < kBlockDim * acs.covered_blocks_x() &&
259
0
                            bx * kBlockDim + ix < xsize;
260
0
             ix++) {
261
0
          row[0 * stride + bx * kBlockDim + ix] = color[c] / 350.f;
262
0
        }
263
0
        for (size_t iy = 0; iy < kBlockDim * acs.covered_blocks_y() &&
264
0
                            by * kBlockDim + iy < ysize;
265
0
             iy++) {
266
0
          row[iy * stride + bx * kBlockDim + 0] = color[c] / 350.f;
267
0
        }
268
0
      }
269
0
    }
270
0
  }
271
0
  return DumpImage(cparams, tag, color_acs);
272
0
}
273
274
}  // namespace
275
}  // namespace jxl
276
#endif  // LIB_JXL_ENC_AC_STRATEGY_
277
278
HWY_BEFORE_NAMESPACE();
279
namespace jxl {
280
namespace HWY_NAMESPACE {
281
282
// These templates are not found via ADL.
283
using hwy::HWY_NAMESPACE::AbsDiff;
284
using hwy::HWY_NAMESPACE::Eq;
285
using hwy::HWY_NAMESPACE::IfThenElseZero;
286
using hwy::HWY_NAMESPACE::IfThenZeroElse;
287
using hwy::HWY_NAMESPACE::Round;
288
using hwy::HWY_NAMESPACE::Sqrt;
289
290
bool MultiBlockTransformCrossesHorizontalBoundary(
291
    const AcStrategyImage& ac_strategy, size_t start_x, size_t y,
292
0
    size_t end_x) {
293
0
  if (start_x >= ac_strategy.xsize() || y >= ac_strategy.ysize()) {
294
0
    return false;
295
0
  }
296
0
  if (y % 8 == 0) {
297
    // Nothing crosses 64x64 boundaries, and the memory on the other side
298
    // of the 64x64 block may still uninitialized.
299
0
    return false;
300
0
  }
301
0
  end_x = std::min(end_x, ac_strategy.xsize());
302
  // The first multiblock might be before the start_x, let's adjust it
303
  // to point to the first IsFirstBlock() == true block we find by backward
304
  // tracing.
305
0
  AcStrategyRow row = ac_strategy.ConstRow(y);
306
0
  const size_t start_x_limit = start_x & ~7;
307
0
  while (start_x != start_x_limit && !row[start_x].IsFirstBlock()) {
308
0
    --start_x;
309
0
  }
310
0
  for (size_t x = start_x; x < end_x;) {
311
0
    if (row[x].IsFirstBlock()) {
312
0
      x += row[x].covered_blocks_x();
313
0
    } else {
314
0
      return true;
315
0
    }
316
0
  }
317
0
  return false;
318
0
}
Unexecuted instantiation: jxl::N_SSE4::MultiBlockTransformCrossesHorizontalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long)
Unexecuted instantiation: jxl::N_AVX2::MultiBlockTransformCrossesHorizontalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long)
Unexecuted instantiation: jxl::N_SSE2::MultiBlockTransformCrossesHorizontalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long)
319
320
bool MultiBlockTransformCrossesVerticalBoundary(
321
    const AcStrategyImage& ac_strategy, size_t x, size_t start_y,
322
0
    size_t end_y) {
323
0
  if (x >= ac_strategy.xsize() || start_y >= ac_strategy.ysize()) {
324
0
    return false;
325
0
  }
326
0
  if (x % 8 == 0) {
327
    // Nothing crosses 64x64 boundaries, and the memory on the other side
328
    // of the 64x64 block may still uninitialized.
329
0
    return false;
330
0
  }
331
0
  end_y = std::min(end_y, ac_strategy.ysize());
332
  // The first multiblock might be before the start_y, let's adjust it
333
  // to point to the first IsFirstBlock() == true block we find by backward
334
  // tracing.
335
0
  const size_t start_y_limit = start_y & ~7;
336
0
  while (start_y != start_y_limit &&
337
0
         !ac_strategy.ConstRow(start_y)[x].IsFirstBlock()) {
338
0
    --start_y;
339
0
  }
340
341
0
  for (size_t y = start_y; y < end_y;) {
342
0
    AcStrategyRow row = ac_strategy.ConstRow(y);
343
0
    if (row[x].IsFirstBlock()) {
344
0
      y += row[x].covered_blocks_y();
345
0
    } else {
346
0
      return true;
347
0
    }
348
0
  }
349
0
  return false;
350
0
}
Unexecuted instantiation: jxl::N_SSE4::MultiBlockTransformCrossesVerticalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long)
Unexecuted instantiation: jxl::N_AVX2::MultiBlockTransformCrossesVerticalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long)
Unexecuted instantiation: jxl::N_SSE2::MultiBlockTransformCrossesVerticalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long)
351
352
float EstimateEntropy(const AcStrategy& acs, float entropy_mul, size_t x,
353
                      size_t y, const ACSConfig& config,
354
                      const float* JXL_RESTRICT cmap_factors, float* block,
355
520
                      float* full_scratch_space, uint32_t* quantized) {
356
520
  float* mem = full_scratch_space;
357
520
  float* scratch_space = full_scratch_space + AcStrategy::kMaxCoeffArea;
358
520
  const size_t size = (1 << acs.log2_covered_blocks()) * kDCTBlockSize;
359
360
  // Apply transform.
361
2.08k
  for (size_t c = 0; c < 3; c++) {
362
1.56k
    float* JXL_RESTRICT block_c = block + size * c;
363
1.56k
    TransformFromPixels(acs.Strategy(), &config.Pixel(c, x, y),
364
1.56k
                        config.src_stride, block_c, scratch_space);
365
1.56k
  }
366
520
  HWY_FULL(float) df;
367
368
520
  const size_t num_blocks = acs.covered_blocks_x() * acs.covered_blocks_y();
369
  // avoid large blocks when there is a lot going on in red-green.
370
520
  float quant_norm16 = 0;
371
520
  if (num_blocks == 1) {
372
    // When it is only one 8x8, we don't need aggregation of values.
373
520
    quant_norm16 = config.Quant(x / 8, y / 8);
374
520
  } else if (num_blocks == 2) {
375
    // Taking max instead of 8th norm seems to work
376
    // better for smallest blocks up to 16x8. Jyrki couldn't get
377
    // improvements in trying the same for 16x16 blocks.
378
0
    if (acs.covered_blocks_y() == 2) {
379
0
      quant_norm16 =
380
0
          std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8, y / 8 + 1));
381
0
    } else {
382
0
      quant_norm16 =
383
0
          std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8 + 1, y / 8));
384
0
    }
385
0
  } else {
386
    // Load QF value, calculate empirical heuristic on masking field
387
    // for weighting the information loss. Information loss manifests
388
    // itself as ringing, and masking could hide it.
389
0
    for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
390
0
      for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
391
0
        float qval = config.Quant(x / 8 + ix, y / 8 + iy);
392
0
        qval *= qval;
393
0
        qval *= qval;
394
0
        qval *= qval;
395
0
        quant_norm16 += qval * qval;
396
0
      }
397
0
    }
398
0
    quant_norm16 /= num_blocks;
399
0
    quant_norm16 = FastPowf(quant_norm16, 1.0f / 16.0f);
400
0
  }
401
520
  const auto quant = Set(df, quant_norm16);
402
403
  // Compute entropy.
404
520
  float entropy = 0.0f;
405
520
  const HWY_CAPPED(float, 8) df8;
406
407
520
  auto loss = Zero(df8);
408
2.08k
  for (size_t c = 0; c < 3; c++) {
409
1.56k
    const float* inv_matrix = config.dequant->InvMatrix(acs.RawStrategy(), c);
410
1.56k
    const float* matrix = config.dequant->Matrix(acs.RawStrategy(), c);
411
1.56k
    const auto cmap_factor = Set(df, cmap_factors[c]);
412
413
1.56k
    auto entropy_v = Zero(df);
414
1.56k
    auto nzeros_v = Zero(df);
415
14.0k
    for (size_t i = 0; i < num_blocks * kDCTBlockSize; i += Lanes(df)) {
416
12.4k
      const auto in = Load(df, block + c * size + i);
417
12.4k
      const auto in_y = Mul(Load(df, block + size + i), cmap_factor);
418
12.4k
      const auto im = Load(df, inv_matrix + i);
419
12.4k
      const auto val = Mul(Sub(in, in_y), Mul(im, quant));
420
12.4k
      const auto rval = Round(val);
421
12.4k
      const auto diff = Sub(val, rval);
422
12.4k
      const auto m = Load(df, matrix + i);
423
12.4k
      Store(Mul(m, diff), df, &mem[i]);
424
12.4k
      const auto q = Abs(rval);
425
12.4k
      const auto q_is_zero = Eq(q, Zero(df));
426
      // We used to have q * C here, but that cost model seems to
427
      // be punishing large values more than necessary. Sqrt tries
428
      // to avoid large values less aggressively.
429
12.4k
      entropy_v = Add(Sqrt(q), entropy_v);
430
12.4k
      nzeros_v = Add(nzeros_v, IfThenZeroElse(q_is_zero, Set(df, 1.0f)));
431
12.4k
    }
432
433
1.56k
    {
434
1.56k
      auto lossc = Zero(df8);
435
1.56k
      TransformToPixels(acs.Strategy(), &mem[0], block,
436
1.56k
                        acs.covered_blocks_x() * 8, scratch_space);
437
438
3.12k
      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
439
3.12k
        for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
440
14.0k
          for (size_t dy = 0; dy < kBlockDim; ++dy) {
441
24.9k
            for (size_t dx = 0; dx < kBlockDim; dx += Lanes(df8)) {
442
12.4k
              auto in = Load(df8, block +
443
12.4k
                                      (iy * kBlockDim + dy) *
444
12.4k
                                          (acs.covered_blocks_x() * kBlockDim) +
445
12.4k
                                      ix * kBlockDim + dx);
446
12.4k
              auto masku = Abs(Load(
447
12.4k
                  df8, config.MaskingPtr1x1(x + ix * 8 + dx, y + iy * 8 + dy)));
448
12.4k
              in = Mul(masku, in);
449
12.4k
              in = Mul(in, in);
450
12.4k
              in = Mul(in, in);
451
12.4k
              in = Mul(in, in);
452
12.4k
              lossc = Add(lossc, in);
453
12.4k
            }
454
12.4k
          }
455
1.56k
        }
456
1.56k
      }
457
1.56k
      static const double kChannelMul[3] = {
458
1.56k
          pow(10.2, 8.0),
459
1.56k
          pow(1.0, 8.0),
460
1.56k
          pow(1.03, 8.0),
461
1.56k
      };
462
1.56k
      lossc = Mul(Set(df8, kChannelMul[c]), lossc);
463
1.56k
      loss = Add(loss, lossc);
464
1.56k
    }
465
1.56k
    entropy += config.cost_delta * GetLane(SumOfLanes(df, entropy_v));
466
1.56k
    size_t num_nzeros = GetLane(SumOfLanes(df, nzeros_v));
467
    // Add #bit of num_nonzeros, as an estimate of the cost for encoding the
468
    // number of non-zeros of the block.
469
1.56k
    size_t nbits = CeilLog2Nonzero(num_nzeros + 1) + 1;
470
    // Also add #bit of #bit of num_nonzeros, to estimate the ANS cost, with a
471
    // bias.
472
1.56k
    entropy += config.zeros_mul * (CeilLog2Nonzero(nbits + 17) + nbits);
473
1.56k
  }
474
520
  float loss_scalar =
475
520
      pow(GetLane(SumOfLanes(df8, loss)) / (num_blocks * kDCTBlockSize),
476
520
          1.0 / 8.0) *
477
520
      (num_blocks * kDCTBlockSize) / quant_norm16;
478
520
  float ret = entropy * entropy_mul;
479
520
  ret += config.info_loss_multiplier * loss_scalar;
480
520
  return ret;
481
520
}
Unexecuted instantiation: jxl::N_SSE4::EstimateEntropy(jxl::AcStrategy const&, float, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, float*, float*, unsigned int*)
jxl::N_AVX2::EstimateEntropy(jxl::AcStrategy const&, float, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, float*, float*, unsigned int*)
Line
Count
Source
355
520
                      float* full_scratch_space, uint32_t* quantized) {
356
520
  float* mem = full_scratch_space;
357
520
  float* scratch_space = full_scratch_space + AcStrategy::kMaxCoeffArea;
358
520
  const size_t size = (1 << acs.log2_covered_blocks()) * kDCTBlockSize;
359
360
  // Apply transform.
361
2.08k
  for (size_t c = 0; c < 3; c++) {
362
1.56k
    float* JXL_RESTRICT block_c = block + size * c;
363
1.56k
    TransformFromPixels(acs.Strategy(), &config.Pixel(c, x, y),
364
1.56k
                        config.src_stride, block_c, scratch_space);
365
1.56k
  }
366
520
  HWY_FULL(float) df;
367
368
520
  const size_t num_blocks = acs.covered_blocks_x() * acs.covered_blocks_y();
369
  // avoid large blocks when there is a lot going on in red-green.
370
520
  float quant_norm16 = 0;
371
520
  if (num_blocks == 1) {
372
    // When it is only one 8x8, we don't need aggregation of values.
373
520
    quant_norm16 = config.Quant(x / 8, y / 8);
374
520
  } else if (num_blocks == 2) {
375
    // Taking max instead of 8th norm seems to work
376
    // better for smallest blocks up to 16x8. Jyrki couldn't get
377
    // improvements in trying the same for 16x16 blocks.
378
0
    if (acs.covered_blocks_y() == 2) {
379
0
      quant_norm16 =
380
0
          std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8, y / 8 + 1));
381
0
    } else {
382
0
      quant_norm16 =
383
0
          std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8 + 1, y / 8));
384
0
    }
385
0
  } else {
386
    // Load QF value, calculate empirical heuristic on masking field
387
    // for weighting the information loss. Information loss manifests
388
    // itself as ringing, and masking could hide it.
389
0
    for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
390
0
      for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
391
0
        float qval = config.Quant(x / 8 + ix, y / 8 + iy);
392
0
        qval *= qval;
393
0
        qval *= qval;
394
0
        qval *= qval;
395
0
        quant_norm16 += qval * qval;
396
0
      }
397
0
    }
398
0
    quant_norm16 /= num_blocks;
399
0
    quant_norm16 = FastPowf(quant_norm16, 1.0f / 16.0f);
400
0
  }
401
520
  const auto quant = Set(df, quant_norm16);
402
403
  // Compute entropy.
404
520
  float entropy = 0.0f;
405
520
  const HWY_CAPPED(float, 8) df8;
406
407
520
  auto loss = Zero(df8);
408
2.08k
  for (size_t c = 0; c < 3; c++) {
409
1.56k
    const float* inv_matrix = config.dequant->InvMatrix(acs.RawStrategy(), c);
410
1.56k
    const float* matrix = config.dequant->Matrix(acs.RawStrategy(), c);
411
1.56k
    const auto cmap_factor = Set(df, cmap_factors[c]);
412
413
1.56k
    auto entropy_v = Zero(df);
414
1.56k
    auto nzeros_v = Zero(df);
415
14.0k
    for (size_t i = 0; i < num_blocks * kDCTBlockSize; i += Lanes(df)) {
416
12.4k
      const auto in = Load(df, block + c * size + i);
417
12.4k
      const auto in_y = Mul(Load(df, block + size + i), cmap_factor);
418
12.4k
      const auto im = Load(df, inv_matrix + i);
419
12.4k
      const auto val = Mul(Sub(in, in_y), Mul(im, quant));
420
12.4k
      const auto rval = Round(val);
421
12.4k
      const auto diff = Sub(val, rval);
422
12.4k
      const auto m = Load(df, matrix + i);
423
12.4k
      Store(Mul(m, diff), df, &mem[i]);
424
12.4k
      const auto q = Abs(rval);
425
12.4k
      const auto q_is_zero = Eq(q, Zero(df));
426
      // We used to have q * C here, but that cost model seems to
427
      // be punishing large values more than necessary. Sqrt tries
428
      // to avoid large values less aggressively.
429
12.4k
      entropy_v = Add(Sqrt(q), entropy_v);
430
12.4k
      nzeros_v = Add(nzeros_v, IfThenZeroElse(q_is_zero, Set(df, 1.0f)));
431
12.4k
    }
432
433
1.56k
    {
434
1.56k
      auto lossc = Zero(df8);
435
1.56k
      TransformToPixels(acs.Strategy(), &mem[0], block,
436
1.56k
                        acs.covered_blocks_x() * 8, scratch_space);
437
438
3.12k
      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
439
3.12k
        for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
440
14.0k
          for (size_t dy = 0; dy < kBlockDim; ++dy) {
441
24.9k
            for (size_t dx = 0; dx < kBlockDim; dx += Lanes(df8)) {
442
12.4k
              auto in = Load(df8, block +
443
12.4k
                                      (iy * kBlockDim + dy) *
444
12.4k
                                          (acs.covered_blocks_x() * kBlockDim) +
445
12.4k
                                      ix * kBlockDim + dx);
446
12.4k
              auto masku = Abs(Load(
447
12.4k
                  df8, config.MaskingPtr1x1(x + ix * 8 + dx, y + iy * 8 + dy)));
448
12.4k
              in = Mul(masku, in);
449
12.4k
              in = Mul(in, in);
450
12.4k
              in = Mul(in, in);
451
12.4k
              in = Mul(in, in);
452
12.4k
              lossc = Add(lossc, in);
453
12.4k
            }
454
12.4k
          }
455
1.56k
        }
456
1.56k
      }
457
1.56k
      static const double kChannelMul[3] = {
458
1.56k
          pow(10.2, 8.0),
459
1.56k
          pow(1.0, 8.0),
460
1.56k
          pow(1.03, 8.0),
461
1.56k
      };
462
1.56k
      lossc = Mul(Set(df8, kChannelMul[c]), lossc);
463
1.56k
      loss = Add(loss, lossc);
464
1.56k
    }
465
1.56k
    entropy += config.cost_delta * GetLane(SumOfLanes(df, entropy_v));
466
1.56k
    size_t num_nzeros = GetLane(SumOfLanes(df, nzeros_v));
467
    // Add #bit of num_nonzeros, as an estimate of the cost for encoding the
468
    // number of non-zeros of the block.
469
1.56k
    size_t nbits = CeilLog2Nonzero(num_nzeros + 1) + 1;
470
    // Also add #bit of #bit of num_nonzeros, to estimate the ANS cost, with a
471
    // bias.
472
1.56k
    entropy += config.zeros_mul * (CeilLog2Nonzero(nbits + 17) + nbits);
473
1.56k
  }
474
520
  float loss_scalar =
475
520
      pow(GetLane(SumOfLanes(df8, loss)) / (num_blocks * kDCTBlockSize),
476
520
          1.0 / 8.0) *
477
520
      (num_blocks * kDCTBlockSize) / quant_norm16;
478
520
  float ret = entropy * entropy_mul;
479
520
  ret += config.info_loss_multiplier * loss_scalar;
480
520
  return ret;
481
520
}
Unexecuted instantiation: jxl::N_SSE2::EstimateEntropy(jxl::AcStrategy const&, float, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, float*, float*, unsigned int*)
482
483
uint8_t FindBest8x8Transform(size_t x, size_t y, int encoding_speed_tier,
484
                             float butteraugli_target, const ACSConfig& config,
485
                             const float* JXL_RESTRICT cmap_factors,
486
                             AcStrategyImage* JXL_RESTRICT ac_strategy,
487
                             float* block, float* scratch_space,
488
52
                             uint32_t* quantized, float* entropy_out) {
489
52
  struct TransformTry8x8 {
490
52
    AcStrategy::Type type;
491
52
    int encoding_speed_tier_max_limit;
492
52
    double entropy_mul;
493
52
  };
494
52
  static const TransformTry8x8 kTransforms8x8[] = {
495
52
      {
496
52
          AcStrategy::Type::DCT,
497
52
          9,
498
52
          0.8,
499
52
      },
500
52
      {
501
52
          AcStrategy::Type::DCT4X4,
502
52
          5,
503
52
          1.08,
504
52
      },
505
52
      {
506
52
          AcStrategy::Type::DCT2X2,
507
52
          5,
508
52
          0.95,
509
52
      },
510
52
      {
511
52
          AcStrategy::Type::DCT4X8,
512
52
          4,
513
52
          0.85931637428340035,
514
52
      },
515
52
      {
516
52
          AcStrategy::Type::DCT8X4,
517
52
          4,
518
52
          0.85931637428340035,
519
52
      },
520
52
      {
521
52
          AcStrategy::Type::IDENTITY,
522
52
          5,
523
52
          1.0427542510634957,
524
52
      },
525
52
      {
526
52
          AcStrategy::Type::AFV0,
527
52
          4,
528
52
          0.81779489591359944,
529
52
      },
530
52
      {
531
52
          AcStrategy::Type::AFV1,
532
52
          4,
533
52
          0.81779489591359944,
534
52
      },
535
52
      {
536
52
          AcStrategy::Type::AFV2,
537
52
          4,
538
52
          0.81779489591359944,
539
52
      },
540
52
      {
541
52
          AcStrategy::Type::AFV3,
542
52
          4,
543
52
          0.81779489591359944,
544
52
      },
545
52
  };
546
52
  double best = 1e30;
547
52
  uint8_t best_tx = kTransforms8x8[0].type;
548
520
  for (auto tx : kTransforms8x8) {
549
520
    if (tx.encoding_speed_tier_max_limit < encoding_speed_tier) {
550
0
      continue;
551
0
    }
552
520
    AcStrategy acs = AcStrategy::FromRawStrategy(tx.type);
553
520
    float entropy_mul = tx.entropy_mul / kTransforms8x8[0].entropy_mul;
554
520
    if ((tx.type == AcStrategy::Type::DCT2X2 ||
555
520
         tx.type == AcStrategy::Type::IDENTITY) &&
556
520
        butteraugli_target < 5.0) {
557
104
      static const float kFavor2X2AtHighQuality = 0.4;
558
104
      float weight = pow((5.0f - butteraugli_target) / 5.0f, 2.0);
559
104
      entropy_mul -= kFavor2X2AtHighQuality * weight;
560
104
    }
561
520
    if ((tx.type != AcStrategy::Type::DCT &&
562
520
         tx.type != AcStrategy::Type::DCT2X2 &&
563
520
         tx.type != AcStrategy::Type::IDENTITY) &&
564
520
        butteraugli_target > 4.0) {
565
0
      static const float kAvoidEntropyOfTransforms = 0.5;
566
0
      float mul = 1.0;
567
0
      if (butteraugli_target < 12.0) {
568
0
        mul *= (12.0 - 4.0) / (butteraugli_target - 4.0);
569
0
      }
570
0
      entropy_mul += kAvoidEntropyOfTransforms * mul;
571
0
    }
572
520
    float entropy =
573
520
        EstimateEntropy(acs, entropy_mul, x, y, config, cmap_factors, block,
574
520
                        scratch_space, quantized);
575
520
    if (entropy < best) {
576
104
      best_tx = tx.type;
577
104
      best = entropy;
578
104
    }
579
520
  }
580
52
  *entropy_out = best;
581
52
  return best_tx;
582
52
}
Unexecuted instantiation: jxl::N_SSE4::FindBest8x8Transform(unsigned long, unsigned long, int, float, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float*, float*, unsigned int*, float*)
jxl::N_AVX2::FindBest8x8Transform(unsigned long, unsigned long, int, float, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float*, float*, unsigned int*, float*)
Line
Count
Source
488
52
                             uint32_t* quantized, float* entropy_out) {
489
52
  struct TransformTry8x8 {
490
52
    AcStrategy::Type type;
491
52
    int encoding_speed_tier_max_limit;
492
52
    double entropy_mul;
493
52
  };
494
52
  static const TransformTry8x8 kTransforms8x8[] = {
495
52
      {
496
52
          AcStrategy::Type::DCT,
497
52
          9,
498
52
          0.8,
499
52
      },
500
52
      {
501
52
          AcStrategy::Type::DCT4X4,
502
52
          5,
503
52
          1.08,
504
52
      },
505
52
      {
506
52
          AcStrategy::Type::DCT2X2,
507
52
          5,
508
52
          0.95,
509
52
      },
510
52
      {
511
52
          AcStrategy::Type::DCT4X8,
512
52
          4,
513
52
          0.85931637428340035,
514
52
      },
515
52
      {
516
52
          AcStrategy::Type::DCT8X4,
517
52
          4,
518
52
          0.85931637428340035,
519
52
      },
520
52
      {
521
52
          AcStrategy::Type::IDENTITY,
522
52
          5,
523
52
          1.0427542510634957,
524
52
      },
525
52
      {
526
52
          AcStrategy::Type::AFV0,
527
52
          4,
528
52
          0.81779489591359944,
529
52
      },
530
52
      {
531
52
          AcStrategy::Type::AFV1,
532
52
          4,
533
52
          0.81779489591359944,
534
52
      },
535
52
      {
536
52
          AcStrategy::Type::AFV2,
537
52
          4,
538
52
          0.81779489591359944,
539
52
      },
540
52
      {
541
52
          AcStrategy::Type::AFV3,
542
52
          4,
543
52
          0.81779489591359944,
544
52
      },
545
52
  };
546
52
  double best = 1e30;
547
52
  uint8_t best_tx = kTransforms8x8[0].type;
548
520
  for (auto tx : kTransforms8x8) {
549
520
    if (tx.encoding_speed_tier_max_limit < encoding_speed_tier) {
550
0
      continue;
551
0
    }
552
520
    AcStrategy acs = AcStrategy::FromRawStrategy(tx.type);
553
520
    float entropy_mul = tx.entropy_mul / kTransforms8x8[0].entropy_mul;
554
520
    if ((tx.type == AcStrategy::Type::DCT2X2 ||
555
520
         tx.type == AcStrategy::Type::IDENTITY) &&
556
520
        butteraugli_target < 5.0) {
557
104
      static const float kFavor2X2AtHighQuality = 0.4;
558
104
      float weight = pow((5.0f - butteraugli_target) / 5.0f, 2.0);
559
104
      entropy_mul -= kFavor2X2AtHighQuality * weight;
560
104
    }
561
520
    if ((tx.type != AcStrategy::Type::DCT &&
562
520
         tx.type != AcStrategy::Type::DCT2X2 &&
563
520
         tx.type != AcStrategy::Type::IDENTITY) &&
564
520
        butteraugli_target > 4.0) {
565
0
      static const float kAvoidEntropyOfTransforms = 0.5;
566
0
      float mul = 1.0;
567
0
      if (butteraugli_target < 12.0) {
568
0
        mul *= (12.0 - 4.0) / (butteraugli_target - 4.0);
569
0
      }
570
0
      entropy_mul += kAvoidEntropyOfTransforms * mul;
571
0
    }
572
520
    float entropy =
573
520
        EstimateEntropy(acs, entropy_mul, x, y, config, cmap_factors, block,
574
520
                        scratch_space, quantized);
575
520
    if (entropy < best) {
576
104
      best_tx = tx.type;
577
104
      best = entropy;
578
104
    }
579
520
  }
580
52
  *entropy_out = best;
581
52
  return best_tx;
582
52
}
Unexecuted instantiation: jxl::N_SSE2::FindBest8x8Transform(unsigned long, unsigned long, int, float, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float*, float*, unsigned int*, float*)
583
584
// bx, by addresses the 64x64 block at 8x8 subresolution
585
// cx, cy addresses the left, upper 8x8 block position of the candidate
586
// transform.
587
void TryMergeAcs(AcStrategy::Type acs_raw, size_t bx, size_t by, size_t cx,
588
                 size_t cy, const ACSConfig& config,
589
                 const float* JXL_RESTRICT cmap_factors,
590
                 AcStrategyImage* JXL_RESTRICT ac_strategy,
591
                 const float entropy_mul, const uint8_t candidate_priority,
592
                 uint8_t* priority, float* JXL_RESTRICT entropy_estimate,
593
156
                 float* block, float* scratch_space, uint32_t* quantized) {
594
156
  AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw);
595
156
  float entropy_current = 0;
596
156
  for (size_t iy = 0; iy < acs.covered_blocks_y(); ++iy) {
597
156
    for (size_t ix = 0; ix < acs.covered_blocks_x(); ++ix) {
598
156
      if (priority[(cy + iy) * 8 + (cx + ix)] >= candidate_priority) {
599
        // Transform would reuse already allocated blocks and
600
        // lead to invalid overlaps, for example DCT64X32 vs.
601
        // DCT32X64.
602
156
        return;
603
156
      }
604
0
      entropy_current += entropy_estimate[(cy + iy) * 8 + (cx + ix)];
605
0
    }
606
156
  }
607
0
  float entropy_candidate =
608
0
      EstimateEntropy(acs, entropy_mul, (bx + cx) * 8, (by + cy) * 8, config,
609
0
                      cmap_factors, block, scratch_space, quantized);
610
0
  if (entropy_candidate >= entropy_current) return;
611
  // Accept the candidate.
612
0
  for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
613
0
    for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
614
0
      entropy_estimate[(cy + iy) * 8 + cx + ix] = 0;
615
0
      priority[(cy + iy) * 8 + cx + ix] = candidate_priority;
616
0
    }
617
0
  }
618
0
  ac_strategy->Set(bx + cx, by + cy, acs_raw);
619
0
  entropy_estimate[cy * 8 + cx] = entropy_candidate;
620
0
}
Unexecuted instantiation: jxl::N_SSE4::TryMergeAcs(jxl::AcStrategy::Type, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, unsigned char, unsigned char*, float*, float*, float*, unsigned int*)
jxl::N_AVX2::TryMergeAcs(jxl::AcStrategy::Type, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, unsigned char, unsigned char*, float*, float*, float*, unsigned int*)
Line
Count
Source
593
156
                 float* block, float* scratch_space, uint32_t* quantized) {
594
156
  AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw);
595
156
  float entropy_current = 0;
596
156
  for (size_t iy = 0; iy < acs.covered_blocks_y(); ++iy) {
597
156
    for (size_t ix = 0; ix < acs.covered_blocks_x(); ++ix) {
598
156
      if (priority[(cy + iy) * 8 + (cx + ix)] >= candidate_priority) {
599
        // Transform would reuse already allocated blocks and
600
        // lead to invalid overlaps, for example DCT64X32 vs.
601
        // DCT32X64.
602
156
        return;
603
156
      }
604
0
      entropy_current += entropy_estimate[(cy + iy) * 8 + (cx + ix)];
605
0
    }
606
156
  }
607
0
  float entropy_candidate =
608
0
      EstimateEntropy(acs, entropy_mul, (bx + cx) * 8, (by + cy) * 8, config,
609
0
                      cmap_factors, block, scratch_space, quantized);
610
0
  if (entropy_candidate >= entropy_current) return;
611
  // Accept the candidate.
612
0
  for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
613
0
    for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
614
0
      entropy_estimate[(cy + iy) * 8 + cx + ix] = 0;
615
0
      priority[(cy + iy) * 8 + cx + ix] = candidate_priority;
616
0
    }
617
0
  }
618
0
  ac_strategy->Set(bx + cx, by + cy, acs_raw);
619
0
  entropy_estimate[cy * 8 + cx] = entropy_candidate;
620
0
}
Unexecuted instantiation: jxl::N_SSE2::TryMergeAcs(jxl::AcStrategy::Type, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, unsigned char, unsigned char*, float*, float*, float*, unsigned int*)
621
622
static void SetEntropyForTransform(size_t cx, size_t cy,
623
                                   const AcStrategy::Type acs_raw,
624
                                   float entropy,
625
0
                                   float* JXL_RESTRICT entropy_estimate) {
626
0
  const AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw);
627
0
  for (size_t dy = 0; dy < acs.covered_blocks_y(); ++dy) {
628
0
    for (size_t dx = 0; dx < acs.covered_blocks_x(); ++dx) {
629
0
      entropy_estimate[(cy + dy) * 8 + cx + dx] = 0.0;
630
0
    }
631
0
  }
632
0
  entropy_estimate[cy * 8 + cx] = entropy;
633
0
}
Unexecuted instantiation: enc_ac_strategy.cc:jxl::N_SSE4::SetEntropyForTransform(unsigned long, unsigned long, jxl::AcStrategy::Type, float, float*)
Unexecuted instantiation: enc_ac_strategy.cc:jxl::N_AVX2::SetEntropyForTransform(unsigned long, unsigned long, jxl::AcStrategy::Type, float, float*)
Unexecuted instantiation: enc_ac_strategy.cc:jxl::N_SSE2::SetEntropyForTransform(unsigned long, unsigned long, jxl::AcStrategy::Type, float, float*)
634
635
0
AcStrategy::Type AcsSquare(size_t blocks) {
636
0
  if (blocks == 2) {
637
0
    return AcStrategy::Type::DCT16X16;
638
0
  } else if (blocks == 4) {
639
0
    return AcStrategy::Type::DCT32X32;
640
0
  } else {
641
0
    return AcStrategy::Type::DCT64X64;
642
0
  }
643
0
}
Unexecuted instantiation: jxl::N_SSE4::AcsSquare(unsigned long)
Unexecuted instantiation: jxl::N_AVX2::AcsSquare(unsigned long)
Unexecuted instantiation: jxl::N_SSE2::AcsSquare(unsigned long)
644
645
0
AcStrategy::Type AcsVerticalSplit(size_t blocks) {
646
0
  if (blocks == 2) {
647
0
    return AcStrategy::Type::DCT16X8;
648
0
  } else if (blocks == 4) {
649
0
    return AcStrategy::Type::DCT32X16;
650
0
  } else {
651
0
    return AcStrategy::Type::DCT64X32;
652
0
  }
653
0
}
Unexecuted instantiation: jxl::N_SSE4::AcsVerticalSplit(unsigned long)
Unexecuted instantiation: jxl::N_AVX2::AcsVerticalSplit(unsigned long)
Unexecuted instantiation: jxl::N_SSE2::AcsVerticalSplit(unsigned long)
654
655
0
AcStrategy::Type AcsHorizontalSplit(size_t blocks) {
656
0
  if (blocks == 2) {
657
0
    return AcStrategy::Type::DCT8X16;
658
0
  } else if (blocks == 4) {
659
0
    return AcStrategy::Type::DCT16X32;
660
0
  } else {
661
0
    return AcStrategy::Type::DCT32X64;
662
0
  }
663
0
}
Unexecuted instantiation: jxl::N_SSE4::AcsHorizontalSplit(unsigned long)
Unexecuted instantiation: jxl::N_AVX2::AcsHorizontalSplit(unsigned long)
Unexecuted instantiation: jxl::N_SSE2::AcsHorizontalSplit(unsigned long)
664
665
// The following function tries to merge smaller transforms into
666
// squares and the rectangles originating from a single middle division
667
// (horizontal or vertical) fairly.
668
//
669
// This is now generalized to concern about squares
670
// of blocks X blocks size, where a block is 8x8 pixels.
671
void FindBestFirstLevelDivisionForSquare(
672
    size_t blocks, bool allow_square_transform, size_t bx, size_t by, size_t cx,
673
    size_t cy, const ACSConfig& config, const float* JXL_RESTRICT cmap_factors,
674
    AcStrategyImage* JXL_RESTRICT ac_strategy, const float entropy_mul_JXK,
675
    const float entropy_mul_JXJ, float* JXL_RESTRICT entropy_estimate,
676
0
    float* block, float* scratch_space, uint32_t* quantized) {
677
  // We denote J for the larger dimension here, and K for the smaller.
678
  // For example, for 32x32 block splitting, J would be 32, K 16.
679
0
  const size_t blocks_half = blocks / 2;
680
0
  const AcStrategy::Type acs_rawJXK = AcsVerticalSplit(blocks);
681
0
  const AcStrategy::Type acs_rawKXJ = AcsHorizontalSplit(blocks);
682
0
  const AcStrategy::Type acs_rawJXJ = AcsSquare(blocks);
683
0
  const AcStrategy acsJXK = AcStrategy::FromRawStrategy(acs_rawJXK);
684
0
  const AcStrategy acsKXJ = AcStrategy::FromRawStrategy(acs_rawKXJ);
685
0
  const AcStrategy acsJXJ = AcStrategy::FromRawStrategy(acs_rawJXJ);
686
0
  AcStrategyRow row0 = ac_strategy->ConstRow(by + cy + 0);
687
0
  AcStrategyRow row1 = ac_strategy->ConstRow(by + cy + blocks_half);
688
  // Let's check if we can consider a JXJ block here at all.
689
  // This is not necessary in the basic use of hierarchically merging
690
  // blocks in the simplest possible way, but is needed when we try other
691
  // 'floating' options of merging, possibly after a simple hierarchical
692
  // merge has been explored.
693
0
  if (MultiBlockTransformCrossesHorizontalBoundary(*ac_strategy, bx + cx,
694
0
                                                   by + cy, bx + cx + blocks) ||
695
0
      MultiBlockTransformCrossesHorizontalBoundary(
696
0
          *ac_strategy, bx + cx, by + cy + blocks, bx + cx + blocks) ||
697
0
      MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx, by + cy,
698
0
                                                 by + cy + blocks) ||
699
0
      MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx + blocks,
700
0
                                                 by + cy, by + cy + blocks)) {
701
0
    return;  // not suitable for JxJ analysis, some transforms leak out.
702
0
  }
703
  // For floating transforms there may be
704
  // already blocks selected that make either or both JXK and
705
  // KXJ not feasible for this location.
706
0
  const bool allow_JXK = !MultiBlockTransformCrossesVerticalBoundary(
707
0
      *ac_strategy, bx + cx + blocks_half, by + cy, by + cy + blocks);
708
0
  const bool allow_KXJ = !MultiBlockTransformCrossesHorizontalBoundary(
709
0
      *ac_strategy, bx + cx, by + cy + blocks_half, bx + cx + blocks);
710
  // Current entropies aggregated on NxN resolution.
711
0
  float entropy[2][2] = {};
712
0
  for (size_t dy = 0; dy < blocks; ++dy) {
713
0
    for (size_t dx = 0; dx < blocks; ++dx) {
714
0
      entropy[dy / blocks_half][dx / blocks_half] +=
715
0
          entropy_estimate[(cy + dy) * 8 + (cx + dx)];
716
0
    }
717
0
  }
718
0
  float entropy_JXK_left = std::numeric_limits<float>::max();
719
0
  float entropy_JXK_right = std::numeric_limits<float>::max();
720
0
  float entropy_KXJ_top = std::numeric_limits<float>::max();
721
0
  float entropy_KXJ_bottom = std::numeric_limits<float>::max();
722
0
  float entropy_JXJ = std::numeric_limits<float>::max();
723
0
  if (allow_JXK) {
724
0
    if (row0[bx + cx + 0].RawStrategy() != acs_rawJXK) {
725
0
      entropy_JXK_left = EstimateEntropy(
726
0
          acsJXK, entropy_mul_JXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config,
727
0
          cmap_factors, block, scratch_space, quantized);
728
0
    }
729
0
    if (row0[bx + cx + blocks_half].RawStrategy() != acs_rawJXK) {
730
0
      entropy_JXK_right =
731
0
          EstimateEntropy(acsJXK, entropy_mul_JXK, (bx + cx + blocks_half) * 8,
732
0
                          (by + cy + 0) * 8, config, cmap_factors, block,
733
0
                          scratch_space, quantized);
734
0
    }
735
0
  }
736
0
  if (allow_KXJ) {
737
0
    if (row0[bx + cx].RawStrategy() != acs_rawKXJ) {
738
0
      entropy_KXJ_top = EstimateEntropy(
739
0
          acsKXJ, entropy_mul_JXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config,
740
0
          cmap_factors, block, scratch_space, quantized);
741
0
    }
742
0
    if (row1[bx + cx].RawStrategy() != acs_rawKXJ) {
743
0
      entropy_KXJ_bottom =
744
0
          EstimateEntropy(acsKXJ, entropy_mul_JXK, (bx + cx + 0) * 8,
745
0
                          (by + cy + blocks_half) * 8, config, cmap_factors,
746
0
                          block, scratch_space, quantized);
747
0
    }
748
0
  }
749
0
  if (allow_square_transform) {
750
    // We control the exploration of the square transform separately so that
751
    // we can turn it off at high decoding speeds for 32x32, but still allow
752
    // exploring 16x32 and 32x16.
753
0
    entropy_JXJ = EstimateEntropy(acsJXJ, entropy_mul_JXJ, (bx + cx + 0) * 8,
754
0
                                  (by + cy + 0) * 8, config, cmap_factors,
755
0
                                  block, scratch_space, quantized);
756
0
  }
757
758
  // Test if this block should have JXK or KXJ transforms,
759
  // because it can have only one or the other.
760
0
  float costJxN = std::min(entropy_JXK_left, entropy[0][0] + entropy[1][0]) +
761
0
                  std::min(entropy_JXK_right, entropy[0][1] + entropy[1][1]);
762
0
  float costNxJ = std::min(entropy_KXJ_top, entropy[0][0] + entropy[0][1]) +
763
0
                  std::min(entropy_KXJ_bottom, entropy[1][0] + entropy[1][1]);
764
0
  if (entropy_JXJ < costJxN && entropy_JXJ < costNxJ) {
765
0
    ac_strategy->Set(bx + cx, by + cy, acs_rawJXJ);
766
0
    SetEntropyForTransform(cx, cy, acs_rawJXJ, entropy_JXJ, entropy_estimate);
767
0
  } else if (costJxN < costNxJ) {
768
0
    if (entropy_JXK_left < entropy[0][0] + entropy[1][0]) {
769
0
      ac_strategy->Set(bx + cx, by + cy, acs_rawJXK);
770
0
      SetEntropyForTransform(cx, cy, acs_rawJXK, entropy_JXK_left,
771
0
                             entropy_estimate);
772
0
    }
773
0
    if (entropy_JXK_right < entropy[0][1] + entropy[1][1]) {
774
0
      ac_strategy->Set(bx + cx + blocks_half, by + cy, acs_rawJXK);
775
0
      SetEntropyForTransform(cx + blocks_half, cy, acs_rawJXK,
776
0
                             entropy_JXK_right, entropy_estimate);
777
0
    }
778
0
  } else {
779
0
    if (entropy_KXJ_top < entropy[0][0] + entropy[0][1]) {
780
0
      ac_strategy->Set(bx + cx, by + cy, acs_rawKXJ);
781
0
      SetEntropyForTransform(cx, cy, acs_rawKXJ, entropy_KXJ_top,
782
0
                             entropy_estimate);
783
0
    }
784
0
    if (entropy_KXJ_bottom < entropy[1][0] + entropy[1][1]) {
785
0
      ac_strategy->Set(bx + cx, by + cy + blocks_half, acs_rawKXJ);
786
0
      SetEntropyForTransform(cx, cy + blocks_half, acs_rawKXJ,
787
0
                             entropy_KXJ_bottom, entropy_estimate);
788
0
    }
789
0
  }
790
0
}
Unexecuted instantiation: jxl::N_SSE4::FindBestFirstLevelDivisionForSquare(unsigned long, bool, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, float, float*, float*, float*, unsigned int*)
Unexecuted instantiation: jxl::N_AVX2::FindBestFirstLevelDivisionForSquare(unsigned long, bool, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, float, float*, float*, float*, unsigned int*)
Unexecuted instantiation: jxl::N_SSE2::FindBestFirstLevelDivisionForSquare(unsigned long, bool, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, float, float*, float*, float*, unsigned int*)
791
792
void ProcessRectACS(const CompressParams& cparams, const ACSConfig& config,
793
                    const Rect& rect, const ColorCorrelationMap& cmap,
794
                    float* JXL_RESTRICT block, uint32_t* JXL_RESTRICT quantized,
795
52
                    AcStrategyImage* ac_strategy) {
796
  // Main philosophy here:
797
  // 1. First find best 8x8 transform for each area.
798
  // 2. Merging them into larger transforms where possibly, but
799
  // starting from the smallest transforms (16x8 and 8x16).
800
  // Additional complication: 16x8 and 8x16 are considered
801
  // simultaneously and fairly against each other.
802
  // We are looking at 64x64 squares since the YtoX and YtoB
803
  // maps happen to be at that resolution, and having
804
  // integral transforms cross these boundaries leads to
805
  // additional complications.
806
52
  const float butteraugli_target = cparams.butteraugli_distance;
807
52
  float* JXL_RESTRICT scratch_space = block + 3 * AcStrategy::kMaxCoeffArea;
808
52
  size_t bx = rect.x0();
809
52
  size_t by = rect.y0();
810
52
  JXL_ASSERT(rect.xsize() <= 8);
811
52
  JXL_ASSERT(rect.ysize() <= 8);
812
52
  size_t tx = bx / kColorTileDimInBlocks;
813
52
  size_t ty = by / kColorTileDimInBlocks;
814
52
  const float cmap_factors[3] = {
815
52
      cmap.base().YtoXRatio(cmap.ytox_map.ConstRow(ty)[tx]),
816
52
      0.0f,
817
52
      cmap.base().YtoBRatio(cmap.ytob_map.ConstRow(ty)[tx]),
818
52
  };
819
52
  if (cparams.speed_tier > SpeedTier::kHare) return;
820
  // First compute the best 8x8 transform for each square. Later, we do not
821
  // experiment with different combinations, but only use the best of the 8x8s
822
  // when DCT8X8 is specified in the tree search.
823
  // 8x8 transforms have 10 variants, but every larger transform is just a DCT.
824
52
  float entropy_estimate[64] = {};
825
  // Favor all 8x8 transforms (against 16x8 and larger transforms)) at
826
  // low butteraugli_target distances.
827
52
  static const float k8x8mul1 = -0.4;
828
52
  static const float k8x8mul2 = 1.0;
829
52
  static const float k8x8base = 1.4;
830
52
  const float mul8x8 = k8x8mul2 + k8x8mul1 / (butteraugli_target + k8x8base);
831
104
  for (size_t iy = 0; iy < rect.ysize(); iy++) {
832
104
    for (size_t ix = 0; ix < rect.xsize(); ix++) {
833
52
      float entropy = 0.0;
834
52
      const uint8_t best_of_8x8s = FindBest8x8Transform(
835
52
          8 * (bx + ix), 8 * (by + iy), static_cast<int>(cparams.speed_tier),
836
52
          butteraugli_target, config, cmap_factors, ac_strategy, block,
837
52
          scratch_space, quantized, &entropy);
838
52
      ac_strategy->Set(bx + ix, by + iy,
839
52
                       static_cast<AcStrategy::Type>(best_of_8x8s));
840
52
      entropy_estimate[iy * 8 + ix] = entropy * mul8x8;
841
52
    }
842
52
  }
843
  // Merge when a larger transform is better than the previously
844
  // searched best combination of 8x8 transforms.
845
52
  struct MergeTry {
846
52
    AcStrategy::Type type;
847
52
    uint8_t priority;
848
52
    uint8_t decoding_speed_tier_max_limit;
849
52
    uint8_t encoding_speed_tier_max_limit;
850
52
    float entropy_mul;
851
52
  };
852
  // These numbers need to be figured out manually and looking at
853
  // ringing next to sky etc. Optimization will find smaller numbers
854
  // and produce more ringing than is ideal. Larger numbers will
855
  // help stop ringing.
856
52
  const float entropy_mul16X8 = 1.25;
857
52
  const float entropy_mul16X16 = 1.35;
858
52
  const float entropy_mul16X32 = 1.5;
859
52
  const float entropy_mul32X32 = 1.5;
860
52
  const float entropy_mul64X32 = 2.26;
861
52
  const float entropy_mul64X64 = 2.26;
862
  // TODO(jyrki): Consider this feedback in further changes:
863
  // Also effectively when the multipliers for smaller blocks are
864
  // below 1, this raises the bar for the bigger blocks even higher
865
  // in that sense these constants are not independent (e.g. changing
866
  // the constant for DCT16x32 by -5% (making it more likely) also
867
  // means that DCT32x32 becomes harder to do when starting from
868
  // two DCT16x32s). It might be better to make them more independent,
869
  // e.g. by not applying the multiplier when storing the new entropy
870
  // estimates in TryMergeToACSCandidate().
871
52
  const MergeTry kTransformsForMerge[9] = {
872
52
      {AcStrategy::Type::DCT16X8, 2, 4, 5, entropy_mul16X8},
873
52
      {AcStrategy::Type::DCT8X16, 2, 4, 5, entropy_mul16X8},
874
      // FindBestFirstLevelDivisionForSquare looks for DCT16X16 and its
875
      // subdivisions. {AcStrategy::Type::DCT16X16, 3, entropy_mul16X16},
876
52
      {AcStrategy::Type::DCT16X32, 4, 4, 4, entropy_mul16X32},
877
52
      {AcStrategy::Type::DCT32X16, 4, 4, 4, entropy_mul16X32},
878
      // FindBestFirstLevelDivisionForSquare looks for DCT32X32 and its
879
      // subdivisions. {AcStrategy::Type::DCT32X32, 5, 1, 5,
880
      // 0.9822994906548809f},
881
52
      {AcStrategy::Type::DCT64X32, 6, 1, 3, entropy_mul64X32},
882
52
      {AcStrategy::Type::DCT32X64, 6, 1, 3, entropy_mul64X32},
883
      // {AcStrategy::Type::DCT64X64, 8, 1, 3, 2.0846542128012948f},
884
52
  };
885
  /*
886
  These sizes not yet included in merge heuristic:
887
  set(AcStrategy::Type::DCT32X8, 0.0f, 2.261390410971102f);
888
  set(AcStrategy::Type::DCT8X32, 0.0f, 2.261390410971102f);
889
  set(AcStrategy::Type::DCT128X128, 0.0f, 1.0f);
890
  set(AcStrategy::Type::DCT128X64, 0.0f, 0.73f);
891
  set(AcStrategy::Type::DCT64X128, 0.0f, 0.73f);
892
  set(AcStrategy::Type::DCT256X256, 0.0f, 1.0f);
893
  set(AcStrategy::Type::DCT256X128, 0.0f, 0.73f);
894
  set(AcStrategy::Type::DCT128X256, 0.0f, 0.73f);
895
  */
896
897
  // Priority is a tricky kludge to avoid collisions so that transforms
898
  // don't overlap.
899
52
  uint8_t priority[64] = {};
900
52
  bool enable_32x32 = cparams.decoding_speed_tier < 4;
901
468
  for (auto tx : kTransformsForMerge) {
902
468
    if (tx.decoding_speed_tier_max_limit < cparams.decoding_speed_tier) {
903
0
      continue;
904
0
    }
905
468
    AcStrategy acs = AcStrategy::FromRawStrategy(tx.type);
906
907
676
    for (size_t cy = 0; cy + acs.covered_blocks_y() - 1 < rect.ysize();
908
468
         cy += acs.covered_blocks_y()) {
909
364
      for (size_t cx = 0; cx + acs.covered_blocks_x() - 1 < rect.xsize();
910
208
           cx += acs.covered_blocks_x()) {
911
156
        if (cy + 7 < rect.ysize() && cx + 7 < rect.xsize()) {
912
0
          if (cparams.decoding_speed_tier < 4 &&
913
0
              tx.type == AcStrategy::Type::DCT32X64) {
914
            // We handle both DCT8X16 and DCT16X8 at the same time.
915
0
            if ((cy | cx) % 8 == 0) {
916
0
              FindBestFirstLevelDivisionForSquare(
917
0
                  8, true, bx, by, cx, cy, config, cmap_factors, ac_strategy,
918
0
                  tx.entropy_mul, entropy_mul64X64, entropy_estimate, block,
919
0
                  scratch_space, quantized);
920
0
            }
921
0
            continue;
922
0
          } else if (tx.type == AcStrategy::Type::DCT32X16) {
923
            // We handled both DCT8X16 and DCT16X8 at the same time,
924
            // and that is above. The last column and last row,
925
            // when the last column or last row is odd numbered,
926
            // are still handled by TryMergeAcs.
927
0
            continue;
928
0
          }
929
0
        }
930
156
        if ((tx.type == AcStrategy::Type::DCT16X32 && cy % 4 != 0) ||
931
156
            (tx.type == AcStrategy::Type::DCT32X16 && cx % 4 != 0)) {
932
          // already covered by FindBest32X32
933
0
          continue;
934
0
        }
935
936
156
        if (cy + 3 < rect.ysize() && cx + 3 < rect.xsize()) {
937
0
          if (tx.type == AcStrategy::Type::DCT16X32) {
938
            // We handle both DCT8X16 and DCT16X8 at the same time.
939
0
            if ((cy | cx) % 4 == 0) {
940
0
              FindBestFirstLevelDivisionForSquare(
941
0
                  4, enable_32x32, bx, by, cx, cy, config, cmap_factors,
942
0
                  ac_strategy, tx.entropy_mul, entropy_mul32X32,
943
0
                  entropy_estimate, block, scratch_space, quantized);
944
0
            }
945
0
            continue;
946
0
          } else if (tx.type == AcStrategy::Type::DCT32X16) {
947
            // We handled both DCT8X16 and DCT16X8 at the same time,
948
            // and that is above. The last column and last row,
949
            // when the last column or last row is odd numbered,
950
            // are still handled by TryMergeAcs.
951
0
            continue;
952
0
          }
953
0
        }
954
156
        if ((tx.type == AcStrategy::Type::DCT16X32 && cy % 4 != 0) ||
955
156
            (tx.type == AcStrategy::Type::DCT32X16 && cx % 4 != 0)) {
956
          // already covered by FindBest32X32
957
0
          continue;
958
0
        }
959
156
        if (cy + 1 < rect.ysize() && cx + 1 < rect.xsize()) {
960
0
          if (tx.type == AcStrategy::Type::DCT8X16) {
961
            // We handle both DCT8X16 and DCT16X8 at the same time.
962
0
            if ((cy | cx) % 2 == 0) {
963
0
              FindBestFirstLevelDivisionForSquare(
964
0
                  2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy,
965
0
                  tx.entropy_mul, entropy_mul16X16, entropy_estimate, block,
966
0
                  scratch_space, quantized);
967
0
            }
968
0
            continue;
969
0
          } else if (tx.type == AcStrategy::Type::DCT16X8) {
970
            // We handled both DCT8X16 and DCT16X8 at the same time,
971
            // and that is above. The last column and last row,
972
            // when the last column or last row is odd numbered,
973
            // are still handled by TryMergeAcs.
974
0
            continue;
975
0
          }
976
0
        }
977
156
        if ((tx.type == AcStrategy::Type::DCT8X16 && cy % 2 == 1) ||
978
156
            (tx.type == AcStrategy::Type::DCT16X8 && cx % 2 == 1)) {
979
          // already covered by FindBestFirstLevelDivisionForSquare
980
0
          continue;
981
0
        }
982
        // All other merge sizes are handled here.
983
        // Some of the DCT16X8s and DCT8X16s will still leak through here
984
        // when there is an odd number of 8x8 blocks, then the last row
985
        // and column will get their DCT16X8s and DCT8X16s through the
986
        // normal integral transform merging process.
987
156
        TryMergeAcs(tx.type, bx, by, cx, cy, config, cmap_factors, ac_strategy,
988
156
                    tx.entropy_mul, tx.priority, &priority[0], entropy_estimate,
989
156
                    block, scratch_space, quantized);
990
156
      }
991
208
    }
992
468
  }
993
52
  if (cparams.speed_tier >= SpeedTier::kHare) {
994
0
    return;
995
0
  }
996
  // Here we still try to do some non-aligned matching, find a few more
997
  // 16X8, 8X16 and 16X16s between the non-2-aligned blocks.
998
52
  for (size_t cy = 0; cy + 1 < rect.ysize(); ++cy) {
999
0
    for (size_t cx = 0; cx + 1 < rect.xsize(); ++cx) {
1000
0
      if ((cy | cx) % 2 != 0) {
1001
0
        FindBestFirstLevelDivisionForSquare(
1002
0
            2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy,
1003
0
            entropy_mul16X8, entropy_mul16X16, entropy_estimate, block,
1004
0
            scratch_space, quantized);
1005
0
      }
1006
0
    }
1007
0
  }
1008
  // Non-aligned matching for 32X32, 16X32 and 32X16.
1009
52
  size_t step = cparams.speed_tier >= SpeedTier::kTortoise ? 2 : 1;
1010
52
  for (size_t cy = 0; cy + 3 < rect.ysize(); cy += step) {
1011
0
    for (size_t cx = 0; cx + 3 < rect.xsize(); cx += step) {
1012
0
      if ((cy | cx) % 4 == 0) {
1013
0
        continue;  // Already tried with loop above (DCT16X32 case).
1014
0
      }
1015
0
      FindBestFirstLevelDivisionForSquare(
1016
0
          4, enable_32x32, bx, by, cx, cy, config, cmap_factors, ac_strategy,
1017
0
          entropy_mul16X32, entropy_mul32X32, entropy_estimate, block,
1018
0
          scratch_space, quantized);
1019
0
    }
1020
0
  }
1021
52
}
Unexecuted instantiation: jxl::N_SSE4::ProcessRectACS(jxl::CompressParams const&, jxl::ACSConfig const&, jxl::RectT<unsigned long> const&, jxl::ColorCorrelationMap const&, float*, unsigned int*, jxl::AcStrategyImage*)
jxl::N_AVX2::ProcessRectACS(jxl::CompressParams const&, jxl::ACSConfig const&, jxl::RectT<unsigned long> const&, jxl::ColorCorrelationMap const&, float*, unsigned int*, jxl::AcStrategyImage*)
Line
Count
Source
795
52
                    AcStrategyImage* ac_strategy) {
796
  // Main philosophy here:
797
  // 1. First find best 8x8 transform for each area.
798
  // 2. Merging them into larger transforms where possibly, but
799
  // starting from the smallest transforms (16x8 and 8x16).
800
  // Additional complication: 16x8 and 8x16 are considered
801
  // simultaneously and fairly against each other.
802
  // We are looking at 64x64 squares since the YtoX and YtoB
803
  // maps happen to be at that resolution, and having
804
  // integral transforms cross these boundaries leads to
805
  // additional complications.
806
52
  const float butteraugli_target = cparams.butteraugli_distance;
807
52
  float* JXL_RESTRICT scratch_space = block + 3 * AcStrategy::kMaxCoeffArea;
808
52
  size_t bx = rect.x0();
809
52
  size_t by = rect.y0();
810
52
  JXL_ASSERT(rect.xsize() <= 8);
811
52
  JXL_ASSERT(rect.ysize() <= 8);
812
52
  size_t tx = bx / kColorTileDimInBlocks;
813
52
  size_t ty = by / kColorTileDimInBlocks;
814
52
  const float cmap_factors[3] = {
815
52
      cmap.base().YtoXRatio(cmap.ytox_map.ConstRow(ty)[tx]),
816
52
      0.0f,
817
52
      cmap.base().YtoBRatio(cmap.ytob_map.ConstRow(ty)[tx]),
818
52
  };
819
52
  if (cparams.speed_tier > SpeedTier::kHare) return;
820
  // First compute the best 8x8 transform for each square. Later, we do not
821
  // experiment with different combinations, but only use the best of the 8x8s
822
  // when DCT8X8 is specified in the tree search.
823
  // 8x8 transforms have 10 variants, but every larger transform is just a DCT.
824
52
  float entropy_estimate[64] = {};
825
  // Favor all 8x8 transforms (against 16x8 and larger transforms)) at
826
  // low butteraugli_target distances.
827
52
  static const float k8x8mul1 = -0.4;
828
52
  static const float k8x8mul2 = 1.0;
829
52
  static const float k8x8base = 1.4;
830
52
  const float mul8x8 = k8x8mul2 + k8x8mul1 / (butteraugli_target + k8x8base);
831
104
  for (size_t iy = 0; iy < rect.ysize(); iy++) {
832
104
    for (size_t ix = 0; ix < rect.xsize(); ix++) {
833
52
      float entropy = 0.0;
834
52
      const uint8_t best_of_8x8s = FindBest8x8Transform(
835
52
          8 * (bx + ix), 8 * (by + iy), static_cast<int>(cparams.speed_tier),
836
52
          butteraugli_target, config, cmap_factors, ac_strategy, block,
837
52
          scratch_space, quantized, &entropy);
838
52
      ac_strategy->Set(bx + ix, by + iy,
839
52
                       static_cast<AcStrategy::Type>(best_of_8x8s));
840
52
      entropy_estimate[iy * 8 + ix] = entropy * mul8x8;
841
52
    }
842
52
  }
843
  // Merge when a larger transform is better than the previously
844
  // searched best combination of 8x8 transforms.
845
52
  struct MergeTry {
846
52
    AcStrategy::Type type;
847
52
    uint8_t priority;
848
52
    uint8_t decoding_speed_tier_max_limit;
849
52
    uint8_t encoding_speed_tier_max_limit;
850
52
    float entropy_mul;
851
52
  };
852
  // These numbers need to be figured out manually and looking at
853
  // ringing next to sky etc. Optimization will find smaller numbers
854
  // and produce more ringing than is ideal. Larger numbers will
855
  // help stop ringing.
856
52
  const float entropy_mul16X8 = 1.25;
857
52
  const float entropy_mul16X16 = 1.35;
858
52
  const float entropy_mul16X32 = 1.5;
859
52
  const float entropy_mul32X32 = 1.5;
860
52
  const float entropy_mul64X32 = 2.26;
861
52
  const float entropy_mul64X64 = 2.26;
862
  // TODO(jyrki): Consider this feedback in further changes:
863
  // Also effectively when the multipliers for smaller blocks are
864
  // below 1, this raises the bar for the bigger blocks even higher
865
  // in that sense these constants are not independent (e.g. changing
866
  // the constant for DCT16x32 by -5% (making it more likely) also
867
  // means that DCT32x32 becomes harder to do when starting from
868
  // two DCT16x32s). It might be better to make them more independent,
869
  // e.g. by not applying the multiplier when storing the new entropy
870
  // estimates in TryMergeToACSCandidate().
871
52
  const MergeTry kTransformsForMerge[9] = {
872
52
      {AcStrategy::Type::DCT16X8, 2, 4, 5, entropy_mul16X8},
873
52
      {AcStrategy::Type::DCT8X16, 2, 4, 5, entropy_mul16X8},
874
      // FindBestFirstLevelDivisionForSquare looks for DCT16X16 and its
875
      // subdivisions. {AcStrategy::Type::DCT16X16, 3, entropy_mul16X16},
876
52
      {AcStrategy::Type::DCT16X32, 4, 4, 4, entropy_mul16X32},
877
52
      {AcStrategy::Type::DCT32X16, 4, 4, 4, entropy_mul16X32},
878
      // FindBestFirstLevelDivisionForSquare looks for DCT32X32 and its
879
      // subdivisions. {AcStrategy::Type::DCT32X32, 5, 1, 5,
880
      // 0.9822994906548809f},
881
52
      {AcStrategy::Type::DCT64X32, 6, 1, 3, entropy_mul64X32},
882
52
      {AcStrategy::Type::DCT32X64, 6, 1, 3, entropy_mul64X32},
883
      // {AcStrategy::Type::DCT64X64, 8, 1, 3, 2.0846542128012948f},
884
52
  };
885
  /*
886
  These sizes not yet included in merge heuristic:
887
  set(AcStrategy::Type::DCT32X8, 0.0f, 2.261390410971102f);
888
  set(AcStrategy::Type::DCT8X32, 0.0f, 2.261390410971102f);
889
  set(AcStrategy::Type::DCT128X128, 0.0f, 1.0f);
890
  set(AcStrategy::Type::DCT128X64, 0.0f, 0.73f);
891
  set(AcStrategy::Type::DCT64X128, 0.0f, 0.73f);
892
  set(AcStrategy::Type::DCT256X256, 0.0f, 1.0f);
893
  set(AcStrategy::Type::DCT256X128, 0.0f, 0.73f);
894
  set(AcStrategy::Type::DCT128X256, 0.0f, 0.73f);
895
  */
896
897
  // Priority is a tricky kludge to avoid collisions so that transforms
898
  // don't overlap.
899
52
  uint8_t priority[64] = {};
900
52
  bool enable_32x32 = cparams.decoding_speed_tier < 4;
901
468
  for (auto tx : kTransformsForMerge) {
902
468
    if (tx.decoding_speed_tier_max_limit < cparams.decoding_speed_tier) {
903
0
      continue;
904
0
    }
905
468
    AcStrategy acs = AcStrategy::FromRawStrategy(tx.type);
906
907
676
    for (size_t cy = 0; cy + acs.covered_blocks_y() - 1 < rect.ysize();
908
468
         cy += acs.covered_blocks_y()) {
909
364
      for (size_t cx = 0; cx + acs.covered_blocks_x() - 1 < rect.xsize();
910
208
           cx += acs.covered_blocks_x()) {
911
156
        if (cy + 7 < rect.ysize() && cx + 7 < rect.xsize()) {
912
0
          if (cparams.decoding_speed_tier < 4 &&
913
0
              tx.type == AcStrategy::Type::DCT32X64) {
914
            // We handle both DCT8X16 and DCT16X8 at the same time.
915
0
            if ((cy | cx) % 8 == 0) {
916
0
              FindBestFirstLevelDivisionForSquare(
917
0
                  8, true, bx, by, cx, cy, config, cmap_factors, ac_strategy,
918
0
                  tx.entropy_mul, entropy_mul64X64, entropy_estimate, block,
919
0
                  scratch_space, quantized);
920
0
            }
921
0
            continue;
922
0
          } else if (tx.type == AcStrategy::Type::DCT32X16) {
923
            // We handled both DCT8X16 and DCT16X8 at the same time,
924
            // and that is above. The last column and last row,
925
            // when the last column or last row is odd numbered,
926
            // are still handled by TryMergeAcs.
927
0
            continue;
928
0
          }
929
0
        }
930
156
        if ((tx.type == AcStrategy::Type::DCT16X32 && cy % 4 != 0) ||
931
156
            (tx.type == AcStrategy::Type::DCT32X16 && cx % 4 != 0)) {
932
          // already covered by FindBest32X32
933
0
          continue;
934
0
        }
935
936
156
        if (cy + 3 < rect.ysize() && cx + 3 < rect.xsize()) {
937
0
          if (tx.type == AcStrategy::Type::DCT16X32) {
938
            // We handle both DCT8X16 and DCT16X8 at the same time.
939
0
            if ((cy | cx) % 4 == 0) {
940
0
              FindBestFirstLevelDivisionForSquare(
941
0
                  4, enable_32x32, bx, by, cx, cy, config, cmap_factors,
942
0
                  ac_strategy, tx.entropy_mul, entropy_mul32X32,
943
0
                  entropy_estimate, block, scratch_space, quantized);
944
0
            }
945
0
            continue;
946
0
          } else if (tx.type == AcStrategy::Type::DCT32X16) {
947
            // We handled both DCT8X16 and DCT16X8 at the same time,
948
            // and that is above. The last column and last row,
949
            // when the last column or last row is odd numbered,
950
            // are still handled by TryMergeAcs.
951
0
            continue;
952
0
          }
953
0
        }
954
156
        if ((tx.type == AcStrategy::Type::DCT16X32 && cy % 4 != 0) ||
955
156
            (tx.type == AcStrategy::Type::DCT32X16 && cx % 4 != 0)) {
956
          // already covered by FindBest32X32
957
0
          continue;
958
0
        }
959
156
        if (cy + 1 < rect.ysize() && cx + 1 < rect.xsize()) {
960
0
          if (tx.type == AcStrategy::Type::DCT8X16) {
961
            // We handle both DCT8X16 and DCT16X8 at the same time.
962
0
            if ((cy | cx) % 2 == 0) {
963
0
              FindBestFirstLevelDivisionForSquare(
964
0
                  2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy,
965
0
                  tx.entropy_mul, entropy_mul16X16, entropy_estimate, block,
966
0
                  scratch_space, quantized);
967
0
            }
968
0
            continue;
969
0
          } else if (tx.type == AcStrategy::Type::DCT16X8) {
970
            // We handled both DCT8X16 and DCT16X8 at the same time,
971
            // and that is above. The last column and last row,
972
            // when the last column or last row is odd numbered,
973
            // are still handled by TryMergeAcs.
974
0
            continue;
975
0
          }
976
0
        }
977
156
        if ((tx.type == AcStrategy::Type::DCT8X16 && cy % 2 == 1) ||
978
156
            (tx.type == AcStrategy::Type::DCT16X8 && cx % 2 == 1)) {
979
          // already covered by FindBestFirstLevelDivisionForSquare
980
0
          continue;
981
0
        }
982
        // All other merge sizes are handled here.
983
        // Some of the DCT16X8s and DCT8X16s will still leak through here
984
        // when there is an odd number of 8x8 blocks, then the last row
985
        // and column will get their DCT16X8s and DCT8X16s through the
986
        // normal integral transform merging process.
987
156
        TryMergeAcs(tx.type, bx, by, cx, cy, config, cmap_factors, ac_strategy,
988
156
                    tx.entropy_mul, tx.priority, &priority[0], entropy_estimate,
989
156
                    block, scratch_space, quantized);
990
156
      }
991
208
    }
992
468
  }
993
52
  if (cparams.speed_tier >= SpeedTier::kHare) {
994
0
    return;
995
0
  }
996
  // Here we still try to do some non-aligned matching, find a few more
997
  // 16X8, 8X16 and 16X16s between the non-2-aligned blocks.
998
52
  for (size_t cy = 0; cy + 1 < rect.ysize(); ++cy) {
999
0
    for (size_t cx = 0; cx + 1 < rect.xsize(); ++cx) {
1000
0
      if ((cy | cx) % 2 != 0) {
1001
0
        FindBestFirstLevelDivisionForSquare(
1002
0
            2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy,
1003
0
            entropy_mul16X8, entropy_mul16X16, entropy_estimate, block,
1004
0
            scratch_space, quantized);
1005
0
      }
1006
0
    }
1007
0
  }
1008
  // Non-aligned matching for 32X32, 16X32 and 32X16.
1009
52
  size_t step = cparams.speed_tier >= SpeedTier::kTortoise ? 2 : 1;
1010
52
  for (size_t cy = 0; cy + 3 < rect.ysize(); cy += step) {
1011
0
    for (size_t cx = 0; cx + 3 < rect.xsize(); cx += step) {
1012
0
      if ((cy | cx) % 4 == 0) {
1013
0
        continue;  // Already tried with loop above (DCT16X32 case).
1014
0
      }
1015
0
      FindBestFirstLevelDivisionForSquare(
1016
0
          4, enable_32x32, bx, by, cx, cy, config, cmap_factors, ac_strategy,
1017
0
          entropy_mul16X32, entropy_mul32X32, entropy_estimate, block,
1018
0
          scratch_space, quantized);
1019
0
    }
1020
0
  }
1021
52
}
Unexecuted instantiation: jxl::N_SSE2::ProcessRectACS(jxl::CompressParams const&, jxl::ACSConfig const&, jxl::RectT<unsigned long> const&, jxl::ColorCorrelationMap const&, float*, unsigned int*, jxl::AcStrategyImage*)
1022
1023
// NOLINTNEXTLINE(google-readability-namespace-comments)
1024
}  // namespace HWY_NAMESPACE
1025
}  // namespace jxl
1026
HWY_AFTER_NAMESPACE();
1027
1028
#if HWY_ONCE
1029
namespace jxl {
1030
HWY_EXPORT(ProcessRectACS);
1031
1032
void AcStrategyHeuristics::Init(const Image3F& src, const Rect& rect_in,
1033
                                const ImageF& quant_field, const ImageF& mask,
1034
                                const ImageF& mask1x1,
1035
52
                                DequantMatrices* matrices) {
1036
52
  config.dequant = matrices;
1037
1038
52
  if (cparams.speed_tier >= SpeedTier::kCheetah) {
1039
0
    JXL_CHECK(matrices->EnsureComputed(1));  // DCT8 only
1040
52
  } else {
1041
52
    uint32_t acs_mask = 0;
1042
    // All transforms up to 64x64.
1043
1.14k
    for (size_t i = 0; i < AcStrategy::DCT128X128; i++) {
1044
1.09k
      acs_mask |= (1 << i);
1045
1.09k
    }
1046
52
    JXL_CHECK(matrices->EnsureComputed(acs_mask));
1047
52
  }
1048
1049
  // Image row pointers and strides.
1050
52
  config.quant_field_row = quant_field.Row(0);
1051
52
  config.quant_field_stride = quant_field.PixelsPerRow();
1052
52
  if (mask.xsize() > 0 && mask.ysize() > 0) {
1053
52
    config.masking_field_row = mask.Row(0);
1054
52
    config.masking_field_stride = mask.PixelsPerRow();
1055
52
  }
1056
52
  if (mask1x1.xsize() > 0 && mask1x1.ysize() > 0) {
1057
52
    config.masking1x1_field_row = mask1x1.Row(0);
1058
52
    config.masking1x1_field_stride = mask1x1.PixelsPerRow();
1059
52
  }
1060
1061
52
  config.src_rows[0] = rect_in.ConstPlaneRow(src, 0, 0);
1062
52
  config.src_rows[1] = rect_in.ConstPlaneRow(src, 1, 0);
1063
52
  config.src_rows[2] = rect_in.ConstPlaneRow(src, 2, 0);
1064
52
  config.src_stride = src.PixelsPerRow();
1065
1066
  // Entropy estimate is composed of two factors:
1067
  //  - estimate of the number of bits that will be used by the block
1068
  //  - information loss due to quantization
1069
  // The following constant controls the relative weights of these components.
1070
52
  config.info_loss_multiplier = 1.2;
1071
52
  config.zeros_mul = 9.3089059022677905;
1072
52
  config.cost_delta = 10.833273317067883;
1073
1074
52
  static const float kBias = 0.13731742964354549;
1075
52
  const float ratio = (cparams.butteraugli_distance + kBias) / (1.0f + kBias);
1076
1077
52
  static const float kPow1 = 0.33677806662454718;
1078
52
  static const float kPow2 = 0.50990926717963703;
1079
52
  static const float kPow3 = 0.36702940662370243;
1080
52
  config.info_loss_multiplier *= std::pow(ratio, kPow1);
1081
52
  config.zeros_mul *= std::pow(ratio, kPow2);
1082
52
  config.cost_delta *= std::pow(ratio, kPow3);
1083
52
}
1084
1085
52
void AcStrategyHeuristics::PrepareForThreads(std::size_t num_threads) {
1086
52
  const size_t dct_scratch_size =
1087
52
      3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim;
1088
52
  mem_per_thread = 6 * AcStrategy::kMaxCoeffArea + dct_scratch_size;
1089
52
  mem = hwy::AllocateAligned<float>(num_threads * mem_per_thread);
1090
52
  qmem_per_thread = AcStrategy::kMaxCoeffArea;
1091
52
  qmem = hwy::AllocateAligned<uint32_t>(num_threads * qmem_per_thread);
1092
52
}
1093
1094
void AcStrategyHeuristics::ProcessRect(const Rect& rect,
1095
                                       const ColorCorrelationMap& cmap,
1096
                                       AcStrategyImage* ac_strategy,
1097
52
                                       size_t thread) {
1098
  // In Falcon mode, use DCT8 everywhere and uniform quantization.
1099
52
  if (cparams.speed_tier >= SpeedTier::kCheetah) {
1100
0
    ac_strategy->FillDCT8(rect);
1101
0
    return;
1102
0
  }
1103
52
  HWY_DYNAMIC_DISPATCH(ProcessRectACS)
1104
52
  (cparams, config, rect, cmap, mem.get() + thread * mem_per_thread,
1105
52
   qmem.get() + thread * qmem_per_thread, ac_strategy);
1106
52
}
1107
1108
Status AcStrategyHeuristics::Finalize(const FrameDimensions& frame_dim,
1109
                                      const AcStrategyImage& ac_strategy,
1110
52
                                      AuxOut* aux_out) {
1111
  // Accounting and debug output.
1112
52
  if (aux_out != nullptr) {
1113
0
    aux_out->num_small_blocks =
1114
0
        ac_strategy.CountBlocks(AcStrategy::Type::IDENTITY) +
1115
0
        ac_strategy.CountBlocks(AcStrategy::Type::DCT2X2) +
1116
0
        ac_strategy.CountBlocks(AcStrategy::Type::DCT4X4);
1117
0
    aux_out->num_dct4x8_blocks =
1118
0
        ac_strategy.CountBlocks(AcStrategy::Type::DCT4X8) +
1119
0
        ac_strategy.CountBlocks(AcStrategy::Type::DCT8X4);
1120
0
    aux_out->num_afv_blocks = ac_strategy.CountBlocks(AcStrategy::Type::AFV0) +
1121
0
                              ac_strategy.CountBlocks(AcStrategy::Type::AFV1) +
1122
0
                              ac_strategy.CountBlocks(AcStrategy::Type::AFV2) +
1123
0
                              ac_strategy.CountBlocks(AcStrategy::Type::AFV3);
1124
0
    aux_out->num_dct8_blocks = ac_strategy.CountBlocks(AcStrategy::Type::DCT);
1125
0
    aux_out->num_dct8x16_blocks =
1126
0
        ac_strategy.CountBlocks(AcStrategy::Type::DCT8X16) +
1127
0
        ac_strategy.CountBlocks(AcStrategy::Type::DCT16X8);
1128
0
    aux_out->num_dct8x32_blocks =
1129
0
        ac_strategy.CountBlocks(AcStrategy::Type::DCT8X32) +
1130
0
        ac_strategy.CountBlocks(AcStrategy::Type::DCT32X8);
1131
0
    aux_out->num_dct16_blocks =
1132
0
        ac_strategy.CountBlocks(AcStrategy::Type::DCT16X16);
1133
0
    aux_out->num_dct16x32_blocks =
1134
0
        ac_strategy.CountBlocks(AcStrategy::Type::DCT16X32) +
1135
0
        ac_strategy.CountBlocks(AcStrategy::Type::DCT32X16);
1136
0
    aux_out->num_dct32_blocks =
1137
0
        ac_strategy.CountBlocks(AcStrategy::Type::DCT32X32);
1138
0
    aux_out->num_dct32x64_blocks =
1139
0
        ac_strategy.CountBlocks(AcStrategy::Type::DCT32X64) +
1140
0
        ac_strategy.CountBlocks(AcStrategy::Type::DCT64X32);
1141
0
    aux_out->num_dct64_blocks =
1142
0
        ac_strategy.CountBlocks(AcStrategy::Type::DCT64X64);
1143
0
  }
1144
1145
  // if (JXL_DEBUG_AC_STRATEGY && WantDebugOutput(aux_out)) {
1146
52
  if (JXL_DEBUG_AC_STRATEGY && WantDebugOutput(cparams)) {
1147
0
    JXL_RETURN_IF_ERROR(DumpAcStrategy(ac_strategy, frame_dim.xsize,
1148
0
                                       frame_dim.ysize, "ac_strategy", aux_out,
1149
0
                                       cparams));
1150
0
  }
1151
52
  return true;
1152
52
}
1153
1154
}  // namespace jxl
1155
#endif  // HWY_ONCE