Coverage Report

Created: 2025-11-16 07:22

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libjxl/lib/jxl/enc_ac_strategy.cc
Line
Count
Source
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/enc_ac_strategy.h"
7
8
#include <algorithm>
9
#include <cmath>
10
#include <cstdint>
11
#include <cstdio>
12
#include <cstring>
13
#include <limits>
14
15
#include "lib/jxl/chroma_from_luma.h"
16
#include "lib/jxl/common.h"
17
#include "lib/jxl/frame_dimensions.h"
18
#include "lib/jxl/image.h"
19
#include "lib/jxl/memory_manager_internal.h"
20
#include "lib/jxl/quant_weights.h"
21
22
#undef HWY_TARGET_INCLUDE
23
#define HWY_TARGET_INCLUDE "lib/jxl/enc_ac_strategy.cc"
24
#include <hwy/foreach_target.h>
25
#include <hwy/highway.h>
26
27
#include "lib/jxl/ac_strategy.h"
28
#include "lib/jxl/base/bits.h"
29
#include "lib/jxl/base/compiler_specific.h"
30
#include "lib/jxl/base/fast_math-inl.h"
31
#include "lib/jxl/base/rect.h"
32
#include "lib/jxl/base/status.h"
33
#include "lib/jxl/dec_transforms-inl.h"
34
#include "lib/jxl/enc_aux_out.h"
35
#include "lib/jxl/enc_debug_image.h"
36
#include "lib/jxl/enc_params.h"
37
#include "lib/jxl/enc_transforms-inl.h"
38
#include "lib/jxl/simd_util.h"
39
40
// Some of the floating point constants in this file and in other
41
// files in the libjxl project have been obtained using the
42
// tools/optimizer/simplex_fork.py tool. It is a variation of
43
// Nelder-Mead optimization, and we generally try to minimize
44
// BPP * pnorm aggregate as reported by the benchmark_xl tool,
45
// but occasionally the values are optimized by using additional
46
// constraints such as maintaining a certain density, or ratio of
47
// popularity of integral transforms. Jyrki visually reviews all
48
// such changes and often makes manual changes to maintain good
49
// visual quality to changes where butteraugli was not sufficiently
50
// sensitive to some kind of degradation. Unfortunately image quality
51
// is still more of an art than science.
52
53
// Set JXL_DEBUG_AC_STRATEGY to 1 to enable debugging.
54
#ifndef JXL_DEBUG_AC_STRATEGY
55
6.97k
#define JXL_DEBUG_AC_STRATEGY 0
56
#endif
57
58
// This must come before the begin/end_target, but HWY_ONCE is only true
59
// after that, so use an "include guard".
60
#ifndef LIB_JXL_ENC_AC_STRATEGY_
61
#define LIB_JXL_ENC_AC_STRATEGY_
62
// Parameters of the heuristic are marked with an OPTIMIZE comment.
63
namespace jxl {
64
namespace {
65
66
// Debugging utilities.
67
68
// Returns a linear sRGB color (as bytes) for each AC strategy.
69
0
const uint8_t* TypeColor(uint8_t raw_strategy) {
70
0
  JXL_DASSERT(AcStrategy::IsRawStrategyValid(raw_strategy));
71
0
  static_assert(AcStrategy::kNumValidStrategies == 27, "Update colors");
72
0
  static constexpr uint8_t kColors[AcStrategy::kNumValidStrategies + 1][3] = {
73
0
      {0xFF, 0xFF, 0x00},  // DCT8       | yellow
74
0
      {0xFF, 0x80, 0x80},  // HORNUSS    | vivid tangerine
75
0
      {0xFF, 0x80, 0x80},  // DCT2x2     | vivid tangerine
76
0
      {0xFF, 0x80, 0x80},  // DCT4x4     | vivid tangerine
77
0
      {0x80, 0xFF, 0x00},  // DCT16x16   | chartreuse
78
0
      {0x00, 0xC0, 0x00},  // DCT32x32   | waystone green
79
0
      {0xC0, 0xFF, 0x00},  // DCT16x8    | lime
80
0
      {0xC0, 0xFF, 0x00},  // DCT8x16    | lime
81
0
      {0x00, 0xFF, 0x00},  // DCT32x8    | green
82
0
      {0x00, 0xFF, 0x00},  // DCT8x32    | green
83
0
      {0x00, 0xFF, 0x00},  // DCT32x16   | green
84
0
      {0x00, 0xFF, 0x00},  // DCT16x32   | green
85
0
      {0xFF, 0x80, 0x00},  // DCT4x8     | orange juice
86
0
      {0xFF, 0x80, 0x00},  // DCT8x4     | orange juice
87
0
      {0xFF, 0xFF, 0x80},  // AFV0       | butter
88
0
      {0xFF, 0xFF, 0x80},  // AFV1       | butter
89
0
      {0xFF, 0xFF, 0x80},  // AFV2       | butter
90
0
      {0xFF, 0xFF, 0x80},  // AFV3       | butter
91
0
      {0x00, 0xC0, 0xFF},  // DCT64x64   | capri
92
0
      {0x00, 0xFF, 0xFF},  // DCT64x32   | aqua
93
0
      {0x00, 0xFF, 0xFF},  // DCT32x64   | aqua
94
0
      {0x00, 0x40, 0xFF},  // DCT128x128 | rare blue
95
0
      {0x00, 0x80, 0xFF},  // DCT128x64  | magic ink
96
0
      {0x00, 0x80, 0xFF},  // DCT64x128  | magic ink
97
0
      {0x00, 0x00, 0xC0},  // DCT256x256 | keese blue
98
0
      {0x00, 0x00, 0xFF},  // DCT256x128 | blue
99
0
      {0x00, 0x00, 0xFF},  // DCT128x256 | blue
100
0
      {0x00, 0x00, 0x00}   // invalid    | black
101
0
  };
102
0
  raw_strategy =
103
0
      Clamp1<uint8_t>(raw_strategy, 0, AcStrategy::kNumValidStrategies);
104
0
  return kColors[raw_strategy];
105
0
}
106
107
0
const uint8_t* TypeMask(uint8_t raw_strategy) {
108
0
  JXL_DASSERT(AcStrategy::IsRawStrategyValid(raw_strategy));
109
0
  static_assert(AcStrategy::kNumValidStrategies == 27, "Update masks");
110
0
  // implicitly, first row and column is made dark
111
0
  static constexpr uint8_t kMask[AcStrategy::kNumValidStrategies + 1][64] = {
112
0
      {
113
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
114
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
115
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
116
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
117
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
118
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
119
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
120
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
121
0
      },                           // DCT8
122
0
      {
123
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
124
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
125
0
          0, 0, 1, 0, 0, 1, 0, 0,  //
126
0
          0, 0, 1, 0, 0, 1, 0, 0,  //
127
0
          0, 0, 1, 1, 1, 1, 0, 0,  //
128
0
          0, 0, 1, 0, 0, 1, 0, 0,  //
129
0
          0, 0, 1, 0, 0, 1, 0, 0,  //
130
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
131
0
      },                           // HORNUSS
132
0
      {
133
0
          1, 1, 1, 1, 1, 1, 1, 1,  //
134
0
          1, 0, 1, 0, 1, 0, 1, 0,  //
135
0
          1, 1, 1, 1, 1, 1, 1, 1,  //
136
0
          1, 0, 1, 0, 1, 0, 1, 0,  //
137
0
          1, 1, 1, 1, 1, 1, 1, 1,  //
138
0
          1, 0, 1, 0, 1, 0, 1, 0,  //
139
0
          1, 1, 1, 1, 1, 1, 1, 1,  //
140
0
          1, 0, 1, 0, 1, 0, 1, 0,  //
141
0
      },                           // 2x2
142
0
      {
143
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
144
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
145
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
146
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
147
0
          1, 1, 1, 1, 1, 1, 1, 1,  //
148
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
149
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
150
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
151
0
      },                           // 4x4
152
0
      {},                          // DCT16x16 (unused)
153
0
      {},                          // DCT32x32 (unused)
154
0
      {},                          // DCT16x8 (unused)
155
0
      {},                          // DCT8x16 (unused)
156
0
      {},                          // DCT32x8 (unused)
157
0
      {},                          // DCT8x32 (unused)
158
0
      {},                          // DCT32x16 (unused)
159
0
      {},                          // DCT16x32 (unused)
160
0
      {
161
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
162
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
163
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
164
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
165
0
          1, 1, 1, 1, 1, 1, 1, 1,  //
166
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
167
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
168
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
169
0
      },                           // DCT4x8
170
0
      {
171
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
172
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
173
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
174
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
175
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
176
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
177
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
178
0
          0, 0, 0, 0, 1, 0, 0, 0,  //
179
0
      },                           // DCT8x4
180
0
      {
181
0
          1, 1, 1, 1, 1, 0, 0, 0,  //
182
0
          1, 1, 1, 1, 0, 0, 0, 0,  //
183
0
          1, 1, 1, 0, 0, 0, 0, 0,  //
184
0
          1, 1, 0, 0, 0, 0, 0, 0,  //
185
0
          1, 0, 0, 0, 0, 0, 0, 0,  //
186
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
187
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
188
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
189
0
      },                           // AFV0
190
0
      {
191
0
          0, 0, 0, 0, 1, 1, 1, 1,  //
192
0
          0, 0, 0, 0, 0, 1, 1, 1,  //
193
0
          0, 0, 0, 0, 0, 0, 1, 1,  //
194
0
          0, 0, 0, 0, 0, 0, 0, 1,  //
195
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
196
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
197
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
198
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
199
0
      },                           // AFV1
200
0
      {
201
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
202
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
203
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
204
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
205
0
          1, 0, 0, 0, 0, 0, 0, 0,  //
206
0
          1, 1, 0, 0, 0, 0, 0, 0,  //
207
0
          1, 1, 1, 0, 0, 0, 0, 0,  //
208
0
          1, 1, 1, 1, 0, 0, 0, 0,  //
209
0
      },                           // AFV2
210
0
      {
211
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
212
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
213
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
214
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
215
0
          0, 0, 0, 0, 0, 0, 0, 0,  //
216
0
          0, 0, 0, 0, 0, 0, 0, 1,  //
217
0
          0, 0, 0, 0, 0, 0, 1, 1,  //
218
0
          0, 0, 0, 0, 0, 1, 1, 1,  //
219
0
      },                           // AFV3
220
0
      {}                           // invalid
221
0
  };
222
0
  raw_strategy =
223
0
      Clamp1<uint8_t>(raw_strategy, 0, AcStrategy::kNumValidStrategies);
224
0
  return kMask[raw_strategy];
225
0
}
226
227
// Debug helper: renders the AC strategy map as an xsize x ysize float RGB
// image and passes it to DumpImage() under `tag`.
//
// Pass 1 fills every pixel with the TypeColor() of the strategy stored for
// its 8x8 block, scaled by 1/255. Pass 2 visits each block for which
// IsFirstBlock() holds and overdraws, with dimmed versions of the same color,
// the TypeMask() glyph (only for transforms covering a single block) and then
// the top row and left column of the transform.
//
// `aux_out` is not used here. Returns an error if the image cannot be
// allocated, otherwise whatever DumpImage() returns.
Status DumpAcStrategy(const AcStrategyImage& ac_strategy, size_t xsize,
                      size_t ysize, const char* tag, AuxOut* aux_out,
                      const CompressParams& cparams) {
  JxlMemoryManager* memory_manager = ac_strategy.memory_manager();
  JXL_ASSIGN_OR_RETURN(Image3F color_acs,
                       Image3F::Create(memory_manager, xsize, ysize));

  // Pass 1: flat fill with the per-strategy color.
  for (size_t y = 0; y < ysize; ++y) {
    float* JXL_RESTRICT plane_rows[3] = {
        color_acs.PlaneRow(0, y),
        color_acs.PlaneRow(1, y),
        color_acs.PlaneRow(2, y),
    };
    const AcStrategyRow strategies = ac_strategy.ConstRow(y / kBlockDim);
    for (size_t x = 0; x < xsize; ++x) {
      AcStrategy acs = strategies[x / kBlockDim];
      const uint8_t* JXL_RESTRICT rgb = TypeColor(acs.RawStrategy());
      plane_rows[0][x] = rgb[0] / 255.f;
      plane_rows[1][x] = rgb[1] / 255.f;
      plane_rows[2][x] = rgb[2] / 255.f;
    }
  }

  // Pass 2: glyphs and transform outlines, one color plane at a time.
  const size_t stride = color_acs.PixelsPerRow();
  const size_t ysize_blocks = DivCeil(ysize, kBlockDim);
  const size_t xsize_blocks = DivCeil(xsize, kBlockDim);
  for (size_t c = 0; c < 3; ++c) {
    for (size_t by = 0; by < ysize_blocks; ++by) {
      const size_t y0 = by * kBlockDim;
      // `row` addresses the top pixel row of this block row; pixel rows
      // further down are reached by adding multiples of `stride`.
      float* JXL_RESTRICT row = color_acs.PlaneRow(c, y0);
      const AcStrategyRow strategies = ac_strategy.ConstRow(by);
      for (size_t bx = 0; bx < xsize_blocks; ++bx) {
        AcStrategy acs = strategies[bx];
        if (!acs.IsFirstBlock()) continue;

        const size_t x0 = bx * kBlockDim;
        const uint8_t* JXL_RESTRICT color = TypeColor(acs.RawStrategy());
        const uint8_t* JXL_RESTRICT mask = TypeMask(acs.RawStrategy());

        const bool covers_single_block =
            acs.covered_blocks_x() == 1 && acs.covered_blocks_y() == 1;
        if (covers_single_block) {
          // Stamp the glyph, clipped to the image bounds.
          for (size_t iy = 0; iy < kBlockDim && y0 + iy < ysize; ++iy) {
            for (size_t ix = 0; ix < kBlockDim && x0 + ix < xsize; ++ix) {
              if (mask[iy * kBlockDim + ix]) {
                row[iy * stride + x0 + ix] = color[c] / 800.f;
              }
            }
          }
        }

        // Top edge of the transform.
        const size_t width = kBlockDim * acs.covered_blocks_x();
        for (size_t ix = 0; ix < width && x0 + ix < xsize; ++ix) {
          row[0 * stride + x0 + ix] = color[c] / 350.f;
        }
        // Left edge of the transform.
        const size_t height = kBlockDim * acs.covered_blocks_y();
        for (size_t iy = 0; iy < height && y0 + iy < ysize; ++iy) {
          row[iy * stride + x0 + 0] = color[c] / 350.f;
        }
      }
    }
  }
  return DumpImage(cparams, tag, color_acs);
}
285
286
}  // namespace
287
}  // namespace jxl
288
#endif  // LIB_JXL_ENC_AC_STRATEGY_
289
290
HWY_BEFORE_NAMESPACE();
291
namespace jxl {
292
namespace HWY_NAMESPACE {
293
294
// These templates are not found via ADL.
295
using hwy::HWY_NAMESPACE::AbsDiff;
296
using hwy::HWY_NAMESPACE::Eq;
297
using hwy::HWY_NAMESPACE::IfThenElseZero;
298
using hwy::HWY_NAMESPACE::IfThenZeroElse;
299
using hwy::HWY_NAMESPACE::Round;
300
using hwy::HWY_NAMESPACE::Sqrt;
301
302
bool MultiBlockTransformCrossesHorizontalBoundary(
303
    const AcStrategyImage& ac_strategy, size_t start_x, size_t y,
304
10.0M
    size_t end_x) {
305
10.0M
  if (start_x >= ac_strategy.xsize() || y >= ac_strategy.ysize()) {
306
95.1k
    return false;
307
95.1k
  }
308
10.0M
  if (y % 8 == 0) {
309
    // Nothing crosses 64x64 boundaries, and the memory on the other side
310
    // of the 64x64 block may still uninitialized.
311
1.47M
    return false;
312
1.47M
  }
313
8.52M
  end_x = std::min(end_x, ac_strategy.xsize());
314
  // The first multiblock might be before the start_x, let's adjust it
315
  // to point to the first IsFirstBlock() == true block we find by backward
316
  // tracing.
317
8.52M
  AcStrategyRow row = ac_strategy.ConstRow(y);
318
8.52M
  const size_t start_x_limit = start_x & ~7;
319
12.6M
  while (start_x != start_x_limit && !row[start_x].IsFirstBlock()) {
320
4.08M
    --start_x;
321
4.08M
  }
322
24.0M
  for (size_t x = start_x; x < end_x;) {
323
17.1M
    if (row[x].IsFirstBlock()) {
324
15.5M
      x += row[x].covered_blocks_x();
325
15.5M
    } else {
326
1.63M
      return true;
327
1.63M
    }
328
17.1M
  }
329
6.89M
  return false;
330
8.52M
}
Unexecuted instantiation: jxl::N_SSE4::MultiBlockTransformCrossesHorizontalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long)
jxl::N_AVX2::MultiBlockTransformCrossesHorizontalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long)
Line
Count
Source
304
10.0M
    size_t end_x) {
305
10.0M
  if (start_x >= ac_strategy.xsize() || y >= ac_strategy.ysize()) {
306
95.1k
    return false;
307
95.1k
  }
308
10.0M
  if (y % 8 == 0) {
309
    // Nothing crosses 64x64 boundaries, and the memory on the other side
310
    // of the 64x64 block may still be uninitialized.
311
1.47M
    return false;
312
1.47M
  }
313
8.52M
  end_x = std::min(end_x, ac_strategy.xsize());
314
  // The first multiblock might be before the start_x, let's adjust it
315
  // to point to the first IsFirstBlock() == true block we find by backward
316
  // tracing.
317
8.52M
  AcStrategyRow row = ac_strategy.ConstRow(y);
318
8.52M
  const size_t start_x_limit = start_x & ~7;
319
12.6M
  while (start_x != start_x_limit && !row[start_x].IsFirstBlock()) {
320
4.08M
    --start_x;
321
4.08M
  }
322
24.0M
  for (size_t x = start_x; x < end_x;) {
323
17.1M
    if (row[x].IsFirstBlock()) {
324
15.5M
      x += row[x].covered_blocks_x();
325
15.5M
    } else {
326
1.63M
      return true;
327
1.63M
    }
328
17.1M
  }
329
6.89M
  return false;
330
8.52M
}
Unexecuted instantiation: jxl::N_AVX3::MultiBlockTransformCrossesHorizontalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::MultiBlockTransformCrossesHorizontalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long)
Unexecuted instantiation: jxl::N_AVX3_SPR::MultiBlockTransformCrossesHorizontalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long)
Unexecuted instantiation: jxl::N_SSE2::MultiBlockTransformCrossesHorizontalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long)
331
332
bool MultiBlockTransformCrossesVerticalBoundary(
333
    const AcStrategyImage& ac_strategy, size_t x, size_t start_y,
334
7.97M
    size_t end_y) {
335
7.97M
  if (x >= ac_strategy.xsize() || start_y >= ac_strategy.ysize()) {
336
69.3k
    return false;
337
69.3k
  }
338
7.90M
  if (x % 8 == 0) {
339
    // Nothing crosses 64x64 boundaries, and the memory on the other side
340
    // of the 64x64 block may still uninitialized.
341
1.22M
    return false;
342
1.22M
  }
343
6.68M
  end_y = std::min(end_y, ac_strategy.ysize());
344
  // The first multiblock might be before the start_y, let's adjust it
345
  // to point to the first IsFirstBlock() == true block we find by backward
346
  // tracing.
347
6.68M
  const size_t start_y_limit = start_y & ~7;
348
7.13M
  while (start_y != start_y_limit &&
349
5.41M
         !ac_strategy.ConstRow(start_y)[x].IsFirstBlock()) {
350
450k
    --start_y;
351
450k
  }
352
353
20.9M
  for (size_t y = start_y; y < end_y;) {
354
14.5M
    AcStrategyRow row = ac_strategy.ConstRow(y);
355
14.5M
    if (row[x].IsFirstBlock()) {
356
14.2M
      y += row[x].covered_blocks_y();
357
14.2M
    } else {
358
366k
      return true;
359
366k
    }
360
14.5M
  }
361
6.31M
  return false;
362
6.68M
}
Unexecuted instantiation: jxl::N_SSE4::MultiBlockTransformCrossesVerticalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long)
jxl::N_AVX2::MultiBlockTransformCrossesVerticalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long)
Line
Count
Source
334
7.97M
    size_t end_y) {
335
7.97M
  if (x >= ac_strategy.xsize() || start_y >= ac_strategy.ysize()) {
336
69.3k
    return false;
337
69.3k
  }
338
7.90M
  if (x % 8 == 0) {
339
    // Nothing crosses 64x64 boundaries, and the memory on the other side
340
    // of the 64x64 block may still be uninitialized.
341
1.22M
    return false;
342
1.22M
  }
343
6.68M
  end_y = std::min(end_y, ac_strategy.ysize());
344
  // The first multiblock might be before the start_y, let's adjust it
345
  // to point to the first IsFirstBlock() == true block we find by backward
346
  // tracing.
347
6.68M
  const size_t start_y_limit = start_y & ~7;
348
7.13M
  while (start_y != start_y_limit &&
349
5.41M
         !ac_strategy.ConstRow(start_y)[x].IsFirstBlock()) {
350
450k
    --start_y;
351
450k
  }
352
353
20.9M
  for (size_t y = start_y; y < end_y;) {
354
14.5M
    AcStrategyRow row = ac_strategy.ConstRow(y);
355
14.5M
    if (row[x].IsFirstBlock()) {
356
14.2M
      y += row[x].covered_blocks_y();
357
14.2M
    } else {
358
366k
      return true;
359
366k
    }
360
14.5M
  }
361
6.31M
  return false;
362
6.68M
}
Unexecuted instantiation: jxl::N_AVX3::MultiBlockTransformCrossesVerticalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::MultiBlockTransformCrossesVerticalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long)
Unexecuted instantiation: jxl::N_AVX3_SPR::MultiBlockTransformCrossesVerticalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long)
Unexecuted instantiation: jxl::N_SSE2::MultiBlockTransformCrossesVerticalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long)
363
364
// Computes a cost estimate for coding the area at pixel position (x, y) with
// the transform `acs`, and writes it to `entropy`.
//
// Steps, as performed below:
//   1. Forward-transform the three channels into `block` (channel c starts
//      at block + c * size).
//   2. Collapse the per-block quantization field values under the transform
//      into one scalar: the value itself for one block, the max for two
//      blocks, and a 16th-power mean for more.
//   3. Per channel: subtract the channel-1 coefficients scaled by
//      cmap_factors[c], quantize, and accumulate sqrt(|q|) and the number of
//      non-zero coefficients as the bit-cost part. The weighted rounding
//      residual is transformed back to pixels and accumulated, weighted by
//      the 1x1 masking field, as an 8th-power sum (the loss part).
//   4. entropy = entropy_mul * bit-cost + config.info_loss_multiplier * loss.
//
// `full_scratch_space` supplies AcStrategy::kMaxCoeffArea floats for the
// residuals, followed by the scratch area handed to the transforms.
// `quantized` is not used. Always returns true.
Status EstimateEntropy(const AcStrategy& acs, float entropy_mul, size_t x,
                       size_t y, const ACSConfig& config,
                       const float* JXL_RESTRICT cmap_factors, float* block,
                       float* full_scratch_space, uint32_t* quantized,
                       float& entropy) {
  entropy = 0.0f;
  float* const residuals = full_scratch_space;
  float* const scratch_space = full_scratch_space + AcStrategy::kMaxCoeffArea;
  const size_t size = (1 << acs.log2_covered_blocks()) * kDCTBlockSize;

  const size_t blocks_x = acs.covered_blocks_x();
  const size_t blocks_y = acs.covered_blocks_y();
  const size_t num_blocks = blocks_x * blocks_y;
  const size_t num_coeffs = num_blocks * kDCTBlockSize;

  // Apply the forward transform to each channel.
  for (size_t c = 0; c < 3; ++c) {
    TransformFromPixels(acs.Strategy(), &config.Pixel(c, x, y),
                        config.src_stride, block + size * c, scratch_space);
  }
  HWY_FULL(float) df;

  // Aggregate the quantization field over the covered blocks. This steers
  // away from large blocks when there is a lot going on in red-green.
  const size_t qx = x / 8;
  const size_t qy = y / 8;
  float quant_norm16 = 0;
  if (num_blocks == 1) {
    // A single 8x8 block needs no aggregation.
    quant_norm16 = config.Quant(qx, qy);
  } else if (num_blocks == 2) {
    // Taking the max instead of a norm seems to work better for the smallest
    // blocks up to 16x8. Jyrki couldn't get improvements from trying the
    // same for 16x16 blocks.
    const auto q_here = config.Quant(qx, qy);
    const auto q_neighbor =
        (blocks_y == 2) ? config.Quant(qx, qy + 1) : config.Quant(qx + 1, qy);
    quant_norm16 = std::max(q_here, q_neighbor);
  } else {
    // Load QF values and compute an empirical heuristic on the masking field
    // for weighting the information loss. Information loss manifests itself
    // as ringing, and masking could hide it.
    for (size_t iy = 0; iy < blocks_y; ++iy) {
      for (size_t ix = 0; ix < blocks_x; ++ix) {
        const float q1 = config.Quant(qx + ix, qy + iy);
        const float q2 = q1 * q1;
        const float q4 = q2 * q2;
        const float q8 = q4 * q4;
        quant_norm16 += q8 * q8;
      }
    }
    quant_norm16 /= num_blocks;
    quant_norm16 = FastPowf(quant_norm16, 1.0f / 16.0f);
  }
  const auto quant = Set(df, quant_norm16);

  // Compute entropy.
  const HWY_CAPPED(float, 8) df8;

  auto loss = Zero(df8);
  for (size_t c = 0; c < 3; ++c) {
    const float* inv_matrix = config.dequant->InvMatrix(acs.Strategy(), c);
    const float* matrix = config.dequant->Matrix(acs.Strategy(), c);
    const auto cmap_factor = Set(df, cmap_factors[c]);

    auto sqrt_sum_v = Zero(df);
    auto nonzero_v = Zero(df);
    for (size_t i = 0; i < num_coeffs; i += Lanes(df)) {
      const auto coeff = Load(df, block + c * size + i);
      const auto predicted = Mul(Load(df, block + size + i), cmap_factor);
      const auto inv_weight = Load(df, inv_matrix + i);
      const auto scaled = Mul(Sub(coeff, predicted), Mul(inv_weight, quant));
      const auto rounded = Round(scaled);
      const auto rounding_error = Sub(scaled, rounded);
      const auto weight = Load(df, matrix + i);
      Store(Mul(weight, rounding_error), df, &residuals[i]);
      const auto magnitude = Abs(rounded);
      const auto is_zero = Eq(magnitude, Zero(df));
      // We used to have q * C here, but that cost model seems to be
      // punishing large values more than necessary. Sqrt tries to avoid
      // large values less aggressively.
      sqrt_sum_v = Add(Sqrt(magnitude), sqrt_sum_v);
      nonzero_v = Add(nonzero_v, IfThenZeroElse(is_zero, Set(df, 1.0f)));
    }

    {
      // Per-channel offset added to the masking value.
      const float masku_lut[3] = {
          12.0,
          0.0,
          4.0,
      };
      const auto masku_off = Set(df8, masku_lut[c]);
      auto lossc = Zero(df8);
      // Bring the weighted residuals back to pixel space. The result is
      // written to the start of `block`, with a row stride of the transform
      // width in pixels.
      TransformToPixels(acs.Strategy(), &residuals[0], block, blocks_x * 8,
                        scratch_space);

      const size_t pixel_stride = blocks_x * kBlockDim;
      for (size_t iy = 0; iy < blocks_y; ++iy) {
        for (size_t ix = 0; ix < blocks_x; ++ix) {
          for (size_t dy = 0; dy < kBlockDim; ++dy) {
            const float* pixel_row =
                block + (iy * kBlockDim + dy) * pixel_stride + ix * kBlockDim;
            for (size_t dx = 0; dx < kBlockDim; dx += Lanes(df8)) {
              auto in = Load(df8, pixel_row + dx);
              const size_t mask_x = x + ix * 8 + dx;
              // Only vectors that lie fully inside the masking image
              // horizontally contribute to the loss.
              if (mask_x + Lanes(df8) <= config.mask1x1_xsize) {
                const auto masku = Add(
                    Load(df8, config.MaskingPtr1x1(mask_x, y + iy * 8 + dy)),
                    masku_off);
                in = Mul(masku, in);
                // Raise to the 8th power by squaring three times.
                in = Mul(in, in);
                in = Mul(in, in);
                in = Mul(in, in);
                lossc = Add(lossc, in);
              }
            }
          }
        }
      }
      // Channel weights, expressed as 8th powers to match the 8th-power sum.
      static const double kChannelMul[3] = {
          pow(8.2, 8.0),
          pow(1.0, 8.0),
          pow(1.03, 8.0),
      };
      lossc = Mul(Set(df8, kChannelMul[c]), lossc);
      loss = Add(loss, lossc);
    }
    entropy += config.cost_delta * GetLane(SumOfLanes(df, sqrt_sum_v));
    const size_t num_nzeros = GetLane(SumOfLanes(df, nonzero_v));
    // Add #bit of num_nonzeros, as an estimate of the cost for encoding the
    // number of non-zeros of the block.
    const size_t nbits = CeilLog2Nonzero(num_nzeros + 1) + 1;
    // Also add #bit of #bit of num_nonzeros, to estimate the ANS cost, with
    // a bias.
    entropy += config.zeros_mul * (CeilLog2Nonzero(nbits + 17) + nbits);
    if (c == 0 && num_blocks >= 2) {
      // This is the X channel (red-green), where ringing is often seen in
      // large blocks. Punish that more here.
      const float w = 1.0 + std::min(3.0, num_blocks / 8.0);
      entropy *= w;
      loss = Mul(loss, Set(df8, w));
    }
  }
  // 8th root of the mean 8th-power loss, scaled by the coefficient count and
  // divided by the aggregated quantization value.
  const float loss_scalar =
      pow(GetLane(SumOfLanes(df8, loss)) / num_coeffs, 1.0f / 8.0f) *
      num_coeffs / quant_norm16;
  entropy *= entropy_mul;
  entropy += config.info_loss_multiplier * loss_scalar;
  return true;
}
Unexecuted instantiation: jxl::N_SSE4::EstimateEntropy(jxl::AcStrategy const&, float, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, float*, float*, unsigned int*, float&)
jxl::N_AVX2::EstimateEntropy(jxl::AcStrategy const&, float, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, float*, float*, unsigned int*, float&)
Line
Count
Source
368
62.0M
                       float& entropy) {
369
62.0M
  entropy = 0.0f;
370
62.0M
  float* mem = full_scratch_space;
371
62.0M
  float* scratch_space = full_scratch_space + AcStrategy::kMaxCoeffArea;
372
62.0M
  const size_t size = (1 << acs.log2_covered_blocks()) * kDCTBlockSize;
373
374
  // Apply transform.
375
248M
  for (size_t c = 0; c < 3; c++) {
376
186M
    float* JXL_RESTRICT block_c = block + size * c;
377
186M
    TransformFromPixels(acs.Strategy(), &config.Pixel(c, x, y),
378
186M
                        config.src_stride, block_c, scratch_space);
379
186M
  }
380
62.0M
  HWY_FULL(float) df;
381
382
62.0M
  const size_t num_blocks = acs.covered_blocks_x() * acs.covered_blocks_y();
383
  // avoid large blocks when there is a lot going on in red-green.
384
62.0M
  float quant_norm16 = 0;
385
62.0M
  if (num_blocks == 1) {
386
    // When it is only one 8x8, we don't need aggregation of values.
387
49.4M
    quant_norm16 = config.Quant(x / 8, y / 8);
388
49.4M
  } else if (num_blocks == 2) {
389
    // Taking max instead of 8th norm seems to work
390
    // better for smallest blocks up to 16x8. Jyrki couldn't get
391
    // improvements in trying the same for 16x16 blocks.
392
8.04M
    if (acs.covered_blocks_y() == 2) {
393
4.00M
      quant_norm16 =
394
4.00M
          std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8, y / 8 + 1));
395
4.03M
    } else {
396
4.03M
      quant_norm16 =
397
4.03M
          std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8 + 1, y / 8));
398
4.03M
    }
399
8.04M
  } else {
400
    // Load QF value, calculate empirical heuristic on masking field
401
    // for weighting the information loss. Information loss manifests
402
    // itself as ringing, and masking could hide it.
403
18.3M
    for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
404
59.3M
      for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
405
45.5M
        float qval = config.Quant(x / 8 + ix, y / 8 + iy);
406
45.5M
        qval *= qval;
407
45.5M
        qval *= qval;
408
45.5M
        qval *= qval;
409
45.5M
        quant_norm16 += qval * qval;
410
45.5M
      }
411
13.8M
    }
412
4.56M
    quant_norm16 /= num_blocks;
413
4.56M
    quant_norm16 = FastPowf(quant_norm16, 1.0f / 16.0f);
414
4.56M
  }
415
62.0M
  const auto quant = Set(df, quant_norm16);
416
417
  // Compute entropy.
418
62.0M
  const HWY_CAPPED(float, 8) df8;
419
420
62.0M
  auto loss = Zero(df8);
421
248M
  for (size_t c = 0; c < 3; c++) {
422
186M
    const float* inv_matrix = config.dequant->InvMatrix(acs.Strategy(), c);
423
186M
    const float* matrix = config.dequant->Matrix(acs.Strategy(), c);
424
186M
    const auto cmap_factor = Set(df, cmap_factors[c]);
425
426
186M
    auto entropy_v = Zero(df);
427
186M
    auto nzeros_v = Zero(df);
428
2.85G
    for (size_t i = 0; i < num_blocks * kDCTBlockSize; i += Lanes(df)) {
429
2.66G
      const auto in = Load(df, block + c * size + i);
430
2.66G
      const auto in_y = Mul(Load(df, block + size + i), cmap_factor);
431
2.66G
      const auto im = Load(df, inv_matrix + i);
432
2.66G
      const auto val = Mul(Sub(in, in_y), Mul(im, quant));
433
2.66G
      const auto rval = Round(val);
434
2.66G
      const auto diff = Sub(val, rval);
435
2.66G
      const auto m = Load(df, matrix + i);
436
2.66G
      Store(Mul(m, diff), df, &mem[i]);
437
2.66G
      const auto q = Abs(rval);
438
2.66G
      const auto q_is_zero = Eq(q, Zero(df));
439
      // We used to have q * C here, but that cost model seems to
440
      // be punishing large values more than necessary. Sqrt tries
441
      // to avoid large values less aggressively.
442
2.66G
      entropy_v = Add(Sqrt(q), entropy_v);
443
2.66G
      nzeros_v = Add(nzeros_v, IfThenZeroElse(q_is_zero, Set(df, 1.0f)));
444
2.66G
    }
445
446
186M
    {
447
186M
      float masku_lut[3] = {
448
186M
          12.0,
449
186M
          0.0,
450
186M
          4.0,
451
186M
      };
452
186M
      auto masku_off = Set(df8, masku_lut[c]);
453
186M
      auto lossc = Zero(df8);
454
186M
      TransformToPixels(acs.Strategy(), &mem[0], block,
455
186M
                        acs.covered_blocks_x() * 8, scratch_space);
456
457
412M
      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
458
559M
        for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
459
2.99G
          for (size_t dy = 0; dy < kBlockDim; ++dy) {
460
5.33G
            for (size_t dx = 0; dx < kBlockDim; dx += Lanes(df8)) {
461
2.66G
              auto in = Load(df8, block +
462
2.66G
                                      (iy * kBlockDim + dy) *
463
2.66G
                                          (acs.covered_blocks_x() * kBlockDim) +
464
2.66G
                                      ix * kBlockDim + dx);
465
2.66G
              if (x + ix * 8 + dx + Lanes(df8) <= config.mask1x1_xsize) {
466
2.66G
                auto masku =
467
2.66G
                    Add(Load(df8, config.MaskingPtr1x1(x + ix * 8 + dx,
468
2.66G
                                                       y + iy * 8 + dy)),
469
2.66G
                        masku_off);
470
2.66G
                in = Mul(masku, in);
471
2.66G
                in = Mul(in, in);
472
2.66G
                in = Mul(in, in);
473
2.66G
                in = Mul(in, in);
474
2.66G
                lossc = Add(lossc, in);
475
2.66G
              }
476
2.66G
            }
477
2.66G
          }
478
333M
        }
479
225M
      }
480
186M
      static const double kChannelMul[3] = {
481
186M
          pow(8.2, 8.0),
482
186M
          pow(1.0, 8.0),
483
186M
          pow(1.03, 8.0),
484
186M
      };
485
186M
      lossc = Mul(Set(df8, kChannelMul[c]), lossc);
486
186M
      loss = Add(loss, lossc);
487
186M
    }
488
186M
    entropy += config.cost_delta * GetLane(SumOfLanes(df, entropy_v));
489
186M
    size_t num_nzeros = GetLane(SumOfLanes(df, nzeros_v));
490
    // Add #bit of num_nonzeros, as an estimate of the cost for encoding the
491
    // number of non-zeros of the block.
492
186M
    size_t nbits = CeilLog2Nonzero(num_nzeros + 1) + 1;
493
    // Also add #bit of #bit of num_nonzeros, to estimate the ANS cost, with a
494
    // bias.
495
186M
    entropy += config.zeros_mul * (CeilLog2Nonzero(nbits + 17) + nbits);
496
186M
    if (c == 0 && num_blocks >= 2) {
497
      // It is X channel (red-green) and we often see ringing
498
      // in the large blocks. Let's punish that more here.
499
12.6M
      float w = 1.0 + std::min(3.0, num_blocks / 8.0);
500
12.6M
      entropy *= w;
501
12.6M
      loss = Mul(loss, Set(df8, w));
502
12.6M
    }
503
186M
  }
504
62.0M
  float loss_scalar =
505
62.0M
      pow(GetLane(SumOfLanes(df8, loss)) / (num_blocks * kDCTBlockSize),
506
62.0M
          1.0f / 8.0f) *
507
62.0M
      (num_blocks * kDCTBlockSize) / quant_norm16;
508
62.0M
  entropy *= entropy_mul;
509
62.0M
  entropy += config.info_loss_multiplier * loss_scalar;
510
62.0M
  return true;
511
62.0M
}
Unexecuted instantiation: jxl::N_AVX3::EstimateEntropy(jxl::AcStrategy const&, float, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, float*, float*, unsigned int*, float&)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::EstimateEntropy(jxl::AcStrategy const&, float, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, float*, float*, unsigned int*, float&)
Unexecuted instantiation: jxl::N_AVX3_SPR::EstimateEntropy(jxl::AcStrategy const&, float, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, float*, float*, unsigned int*, float&)
Unexecuted instantiation: jxl::N_SSE2::EstimateEntropy(jxl::AcStrategy const&, float, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, float*, float*, unsigned int*, float&)
512
513
Status FindBest8x8Transform(size_t x, size_t y, int encoding_speed_tier,
514
                            float butteraugli_target, const ACSConfig& config,
515
                            const float* JXL_RESTRICT cmap_factors,
516
                            AcStrategyImage* JXL_RESTRICT ac_strategy,
517
                            float* block, float* scratch_space,
518
                            uint32_t* quantized, float* entropy_out,
519
4.94M
                            AcStrategyType& best_tx) {
520
4.94M
  struct TransformTry8x8 {
521
4.94M
    AcStrategyType type;
522
4.94M
    int encoding_speed_tier_max_limit;
523
4.94M
    double entropy_mul;
524
4.94M
  };
525
4.94M
  static const TransformTry8x8 kTransforms8x8[] = {
526
4.94M
      {
527
4.94M
          AcStrategyType::DCT,
528
4.94M
          9,
529
4.94M
          0.8,
530
4.94M
      },
531
4.94M
      {
532
4.94M
          AcStrategyType::DCT4X4,
533
4.94M
          5,
534
4.94M
          1.08,
535
4.94M
      },
536
4.94M
      {
537
4.94M
          AcStrategyType::DCT2X2,
538
4.94M
          5,
539
4.94M
          0.95,
540
4.94M
      },
541
4.94M
      {
542
4.94M
          AcStrategyType::DCT4X8,
543
4.94M
          4,
544
4.94M
          0.85931637428340035,
545
4.94M
      },
546
4.94M
      {
547
4.94M
          AcStrategyType::DCT8X4,
548
4.94M
          4,
549
4.94M
          0.85931637428340035,
550
4.94M
      },
551
4.94M
      {
552
4.94M
          AcStrategyType::IDENTITY,
553
4.94M
          5,
554
4.94M
          1.0427542510634957,
555
4.94M
      },
556
4.94M
      {
557
4.94M
          AcStrategyType::AFV0,
558
4.94M
          4,
559
4.94M
          0.81779489591359944,
560
4.94M
      },
561
4.94M
      {
562
4.94M
          AcStrategyType::AFV1,
563
4.94M
          4,
564
4.94M
          0.81779489591359944,
565
4.94M
      },
566
4.94M
      {
567
4.94M
          AcStrategyType::AFV2,
568
4.94M
          4,
569
4.94M
          0.81779489591359944,
570
4.94M
      },
571
4.94M
      {
572
4.94M
          AcStrategyType::AFV3,
573
4.94M
          4,
574
4.94M
          0.81779489591359944,
575
4.94M
      },
576
4.94M
  };
577
4.94M
  double best = 1e30;
578
4.94M
  best_tx = kTransforms8x8[0].type;
579
49.4M
  for (auto tx : kTransforms8x8) {
580
49.4M
    if (tx.encoding_speed_tier_max_limit < encoding_speed_tier) {
581
0
      continue;
582
0
    }
583
49.4M
    AcStrategy acs = AcStrategy::FromRawStrategy(tx.type);
584
49.4M
    float entropy_mul = tx.entropy_mul / kTransforms8x8[0].entropy_mul;
585
49.4M
    if ((tx.type == AcStrategyType::DCT2X2 ||
586
44.4M
         tx.type == AcStrategyType::IDENTITY) &&
587
9.88M
        butteraugli_target < 5.0) {
588
9.88M
      static const float kFavor2X2AtHighQuality = 0.4;
589
9.88M
      float weight = pow((5.0f - butteraugli_target) / 5.0f, 2.0f);
590
9.88M
      entropy_mul -= kFavor2X2AtHighQuality * weight;
591
9.88M
    }
592
49.4M
    if ((tx.type != AcStrategyType::DCT && tx.type != AcStrategyType::DCT2X2 &&
593
39.5M
         tx.type != AcStrategyType::IDENTITY) &&
594
34.6M
        butteraugli_target > 4.0) {
595
0
      static const float kAvoidEntropyOfTransforms = 0.5;
596
0
      float mul = 1.0;
597
0
      if (butteraugli_target < 12.0) {
598
0
        mul *= (12.0 - 4.0) / (butteraugli_target - 4.0);
599
0
      }
600
0
      entropy_mul += kAvoidEntropyOfTransforms * mul;
601
0
    }
602
49.4M
    float entropy;
603
49.4M
    JXL_RETURN_IF_ERROR(EstimateEntropy(acs, entropy_mul, x, y, config,
604
49.4M
                                        cmap_factors, block, scratch_space,
605
49.4M
                                        quantized, entropy));
606
49.4M
    if (entropy < best) {
607
7.67M
      best_tx = tx.type;
608
7.67M
      best = entropy;
609
7.67M
    }
610
49.4M
  }
611
4.94M
  *entropy_out = best;
612
4.94M
  return true;
613
4.94M
}
Unexecuted instantiation: jxl::N_SSE4::FindBest8x8Transform(unsigned long, unsigned long, int, float, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float*, float*, unsigned int*, float*, jxl::AcStrategyType&)
jxl::N_AVX2::FindBest8x8Transform(unsigned long, unsigned long, int, float, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float*, float*, unsigned int*, float*, jxl::AcStrategyType&)
Line
Count
Source
519
4.94M
                            AcStrategyType& best_tx) {
520
4.94M
  struct TransformTry8x8 {
521
4.94M
    AcStrategyType type;
522
4.94M
    int encoding_speed_tier_max_limit;
523
4.94M
    double entropy_mul;
524
4.94M
  };
525
4.94M
  static const TransformTry8x8 kTransforms8x8[] = {
526
4.94M
      {
527
4.94M
          AcStrategyType::DCT,
528
4.94M
          9,
529
4.94M
          0.8,
530
4.94M
      },
531
4.94M
      {
532
4.94M
          AcStrategyType::DCT4X4,
533
4.94M
          5,
534
4.94M
          1.08,
535
4.94M
      },
536
4.94M
      {
537
4.94M
          AcStrategyType::DCT2X2,
538
4.94M
          5,
539
4.94M
          0.95,
540
4.94M
      },
541
4.94M
      {
542
4.94M
          AcStrategyType::DCT4X8,
543
4.94M
          4,
544
4.94M
          0.85931637428340035,
545
4.94M
      },
546
4.94M
      {
547
4.94M
          AcStrategyType::DCT8X4,
548
4.94M
          4,
549
4.94M
          0.85931637428340035,
550
4.94M
      },
551
4.94M
      {
552
4.94M
          AcStrategyType::IDENTITY,
553
4.94M
          5,
554
4.94M
          1.0427542510634957,
555
4.94M
      },
556
4.94M
      {
557
4.94M
          AcStrategyType::AFV0,
558
4.94M
          4,
559
4.94M
          0.81779489591359944,
560
4.94M
      },
561
4.94M
      {
562
4.94M
          AcStrategyType::AFV1,
563
4.94M
          4,
564
4.94M
          0.81779489591359944,
565
4.94M
      },
566
4.94M
      {
567
4.94M
          AcStrategyType::AFV2,
568
4.94M
          4,
569
4.94M
          0.81779489591359944,
570
4.94M
      },
571
4.94M
      {
572
4.94M
          AcStrategyType::AFV3,
573
4.94M
          4,
574
4.94M
          0.81779489591359944,
575
4.94M
      },
576
4.94M
  };
577
4.94M
  double best = 1e30;
578
4.94M
  best_tx = kTransforms8x8[0].type;
579
49.4M
  for (auto tx : kTransforms8x8) {
580
49.4M
    if (tx.encoding_speed_tier_max_limit < encoding_speed_tier) {
581
0
      continue;
582
0
    }
583
49.4M
    AcStrategy acs = AcStrategy::FromRawStrategy(tx.type);
584
49.4M
    float entropy_mul = tx.entropy_mul / kTransforms8x8[0].entropy_mul;
585
49.4M
    if ((tx.type == AcStrategyType::DCT2X2 ||
586
44.4M
         tx.type == AcStrategyType::IDENTITY) &&
587
9.88M
        butteraugli_target < 5.0) {
588
9.88M
      static const float kFavor2X2AtHighQuality = 0.4;
589
9.88M
      float weight = pow((5.0f - butteraugli_target) / 5.0f, 2.0f);
590
9.88M
      entropy_mul -= kFavor2X2AtHighQuality * weight;
591
9.88M
    }
592
49.4M
    if ((tx.type != AcStrategyType::DCT && tx.type != AcStrategyType::DCT2X2 &&
593
39.5M
         tx.type != AcStrategyType::IDENTITY) &&
594
34.6M
        butteraugli_target > 4.0) {
595
0
      static const float kAvoidEntropyOfTransforms = 0.5;
596
0
      float mul = 1.0;
597
0
      if (butteraugli_target < 12.0) {
598
0
        mul *= (12.0 - 4.0) / (butteraugli_target - 4.0);
599
0
      }
600
0
      entropy_mul += kAvoidEntropyOfTransforms * mul;
601
0
    }
602
49.4M
    float entropy;
603
49.4M
    JXL_RETURN_IF_ERROR(EstimateEntropy(acs, entropy_mul, x, y, config,
604
49.4M
                                        cmap_factors, block, scratch_space,
605
49.4M
                                        quantized, entropy));
606
49.4M
    if (entropy < best) {
607
7.67M
      best_tx = tx.type;
608
7.67M
      best = entropy;
609
7.67M
    }
610
49.4M
  }
611
4.94M
  *entropy_out = best;
612
4.94M
  return true;
613
4.94M
}
Unexecuted instantiation: jxl::N_AVX3::FindBest8x8Transform(unsigned long, unsigned long, int, float, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float*, float*, unsigned int*, float*, jxl::AcStrategyType&)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::FindBest8x8Transform(unsigned long, unsigned long, int, float, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float*, float*, unsigned int*, float*, jxl::AcStrategyType&)
Unexecuted instantiation: jxl::N_AVX3_SPR::FindBest8x8Transform(unsigned long, unsigned long, int, float, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float*, float*, unsigned int*, float*, jxl::AcStrategyType&)
Unexecuted instantiation: jxl::N_SSE2::FindBest8x8Transform(unsigned long, unsigned long, int, float, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float*, float*, unsigned int*, float*, jxl::AcStrategyType&)
614
615
// bx, by addresses the 64x64 block at 8x8 subresolution
616
// cx, cy addresses the left, upper 8x8 block position of the candidate
617
// transform.
618
Status TryMergeAcs(AcStrategyType acs_raw, size_t bx, size_t by, size_t cx,
619
                   size_t cy, const ACSConfig& config,
620
                   const float* JXL_RESTRICT cmap_factors,
621
                   AcStrategyImage* JXL_RESTRICT ac_strategy,
622
                   const float entropy_mul, const uint8_t candidate_priority,
623
                   uint8_t* priority, float* JXL_RESTRICT entropy_estimate,
624
15.1M
                   float* block, float* scratch_space, uint32_t* quantized) {
625
15.1M
  AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw);
626
15.1M
  float entropy_current = 0;
627
16.6M
  for (size_t iy = 0; iy < acs.covered_blocks_y(); ++iy) {
628
23.0M
    for (size_t ix = 0; ix < acs.covered_blocks_x(); ++ix) {
629
21.5M
      if (priority[(cy + iy) * 8 + (cx + ix)] >= candidate_priority) {
630
        // Transform would reuse already allocated blocks and
631
        // lead to invalid overlaps, for example DCT64X32 vs.
632
        // DCT32X64.
633
14.8M
        return true;
634
14.8M
      }
635
6.70M
      entropy_current += entropy_estimate[(cy + iy) * 8 + (cx + ix)];
636
6.70M
    }
637
16.3M
  }
638
276k
  float entropy_candidate;
639
276k
  JXL_RETURN_IF_ERROR(EstimateEntropy(
640
276k
      acs, entropy_mul, (bx + cx) * 8, (by + cy) * 8, config, cmap_factors,
641
276k
      block, scratch_space, quantized, entropy_candidate));
642
276k
  if (entropy_candidate >= entropy_current) return true;
643
  // Accept the candidate.
644
297k
  for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
645
1.16M
    for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
646
915k
      entropy_estimate[(cy + iy) * 8 + cx + ix] = 0;
647
915k
      priority[(cy + iy) * 8 + cx + ix] = candidate_priority;
648
915k
    }
649
245k
  }
650
52.7k
  JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_raw));
651
52.7k
  entropy_estimate[cy * 8 + cx] = entropy_candidate;
652
52.7k
  return true;
653
52.7k
}
Unexecuted instantiation: jxl::N_SSE4::TryMergeAcs(jxl::AcStrategyType, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, unsigned char, unsigned char*, float*, float*, float*, unsigned int*)
jxl::N_AVX2::TryMergeAcs(jxl::AcStrategyType, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, unsigned char, unsigned char*, float*, float*, float*, unsigned int*)
Line
Count
Source
624
15.1M
                   float* block, float* scratch_space, uint32_t* quantized) {
625
15.1M
  AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw);
626
15.1M
  float entropy_current = 0;
627
16.6M
  for (size_t iy = 0; iy < acs.covered_blocks_y(); ++iy) {
628
23.0M
    for (size_t ix = 0; ix < acs.covered_blocks_x(); ++ix) {
629
21.5M
      if (priority[(cy + iy) * 8 + (cx + ix)] >= candidate_priority) {
630
        // Transform would reuse already allocated blocks and
631
        // lead to invalid overlaps, for example DCT64X32 vs.
632
        // DCT32X64.
633
14.8M
        return true;
634
14.8M
      }
635
6.70M
      entropy_current += entropy_estimate[(cy + iy) * 8 + (cx + ix)];
636
6.70M
    }
637
16.3M
  }
638
276k
  float entropy_candidate;
639
276k
  JXL_RETURN_IF_ERROR(EstimateEntropy(
640
276k
      acs, entropy_mul, (bx + cx) * 8, (by + cy) * 8, config, cmap_factors,
641
276k
      block, scratch_space, quantized, entropy_candidate));
642
276k
  if (entropy_candidate >= entropy_current) return true;
643
  // Accept the candidate.
644
297k
  for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
645
1.16M
    for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
646
915k
      entropy_estimate[(cy + iy) * 8 + cx + ix] = 0;
647
915k
      priority[(cy + iy) * 8 + cx + ix] = candidate_priority;
648
915k
    }
649
245k
  }
650
52.7k
  JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_raw));
651
52.7k
  entropy_estimate[cy * 8 + cx] = entropy_candidate;
652
52.7k
  return true;
653
52.7k
}
Unexecuted instantiation: jxl::N_AVX3::TryMergeAcs(jxl::AcStrategyType, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, unsigned char, unsigned char*, float*, float*, float*, unsigned int*)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::TryMergeAcs(jxl::AcStrategyType, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, unsigned char, unsigned char*, float*, float*, float*, unsigned int*)
Unexecuted instantiation: jxl::N_AVX3_SPR::TryMergeAcs(jxl::AcStrategyType, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, unsigned char, unsigned char*, float*, float*, float*, unsigned int*)
Unexecuted instantiation: jxl::N_SSE2::TryMergeAcs(jxl::AcStrategyType, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, unsigned char, unsigned char*, float*, float*, float*, unsigned int*)
654
655
static void SetEntropyForTransform(size_t cx, size_t cy,
656
                                   const AcStrategyType acs_raw, float entropy,
657
974k
                                   float* JXL_RESTRICT entropy_estimate) {
658
974k
  const AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw);
659
3.04M
  for (size_t dy = 0; dy < acs.covered_blocks_y(); ++dy) {
660
7.46M
    for (size_t dx = 0; dx < acs.covered_blocks_x(); ++dx) {
661
5.39M
      entropy_estimate[(cy + dy) * 8 + cx + dx] = 0.0;
662
5.39M
    }
663
2.07M
  }
664
974k
  entropy_estimate[cy * 8 + cx] = entropy;
665
974k
}
Unexecuted instantiation: enc_ac_strategy.cc:jxl::N_SSE4::SetEntropyForTransform(unsigned long, unsigned long, jxl::AcStrategyType, float, float*)
enc_ac_strategy.cc:jxl::N_AVX2::SetEntropyForTransform(unsigned long, unsigned long, jxl::AcStrategyType, float, float*)
Line
Count
Source
657
974k
                                   float* JXL_RESTRICT entropy_estimate) {
658
974k
  const AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw);
659
3.04M
  for (size_t dy = 0; dy < acs.covered_blocks_y(); ++dy) {
660
7.46M
    for (size_t dx = 0; dx < acs.covered_blocks_x(); ++dx) {
661
5.39M
      entropy_estimate[(cy + dy) * 8 + cx + dx] = 0.0;
662
5.39M
    }
663
2.07M
  }
664
974k
  entropy_estimate[cy * 8 + cx] = entropy;
665
974k
}
Unexecuted instantiation: enc_ac_strategy.cc:jxl::N_AVX3::SetEntropyForTransform(unsigned long, unsigned long, jxl::AcStrategyType, float, float*)
Unexecuted instantiation: enc_ac_strategy.cc:jxl::N_AVX3_ZEN4::SetEntropyForTransform(unsigned long, unsigned long, jxl::AcStrategyType, float, float*)
Unexecuted instantiation: enc_ac_strategy.cc:jxl::N_AVX3_SPR::SetEntropyForTransform(unsigned long, unsigned long, jxl::AcStrategyType, float, float*)
Unexecuted instantiation: enc_ac_strategy.cc:jxl::N_SSE2::SetEntropyForTransform(unsigned long, unsigned long, jxl::AcStrategyType, float, float*)
666
667
4.42M
AcStrategyType AcsSquare(size_t blocks) {
668
4.42M
  if (blocks == 2) {
669
3.70M
    return AcStrategyType::DCT16X16;
670
3.70M
  } else if (blocks == 4) {
671
651k
    return AcStrategyType::DCT32X32;
672
651k
  } else {
673
67.7k
    return AcStrategyType::DCT64X64;
674
67.7k
  }
675
4.42M
}
Unexecuted instantiation: jxl::N_SSE4::AcsSquare(unsigned long)
jxl::N_AVX2::AcsSquare(unsigned long)
Line
Count
Source
667
4.42M
AcStrategyType AcsSquare(size_t blocks) {
668
4.42M
  if (blocks == 2) {
669
3.70M
    return AcStrategyType::DCT16X16;
670
3.70M
  } else if (blocks == 4) {
671
651k
    return AcStrategyType::DCT32X32;
672
651k
  } else {
673
67.7k
    return AcStrategyType::DCT64X64;
674
67.7k
  }
675
4.42M
}
Unexecuted instantiation: jxl::N_AVX3::AcsSquare(unsigned long)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::AcsSquare(unsigned long)
Unexecuted instantiation: jxl::N_AVX3_SPR::AcsSquare(unsigned long)
Unexecuted instantiation: jxl::N_SSE2::AcsSquare(unsigned long)
676
677
4.42M
AcStrategyType AcsVerticalSplit(size_t blocks) {
678
4.42M
  if (blocks == 2) {
679
3.70M
    return AcStrategyType::DCT16X8;
680
3.70M
  } else if (blocks == 4) {
681
651k
    return AcStrategyType::DCT32X16;
682
651k
  } else {
683
67.7k
    return AcStrategyType::DCT64X32;
684
67.7k
  }
685
4.42M
}
Unexecuted instantiation: jxl::N_SSE4::AcsVerticalSplit(unsigned long)
jxl::N_AVX2::AcsVerticalSplit(unsigned long)
Line
Count
Source
677
4.42M
AcStrategyType AcsVerticalSplit(size_t blocks) {
678
4.42M
  if (blocks == 2) {
679
3.70M
    return AcStrategyType::DCT16X8;
680
3.70M
  } else if (blocks == 4) {
681
651k
    return AcStrategyType::DCT32X16;
682
651k
  } else {
683
67.7k
    return AcStrategyType::DCT64X32;
684
67.7k
  }
685
4.42M
}
Unexecuted instantiation: jxl::N_AVX3::AcsVerticalSplit(unsigned long)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::AcsVerticalSplit(unsigned long)
Unexecuted instantiation: jxl::N_AVX3_SPR::AcsVerticalSplit(unsigned long)
Unexecuted instantiation: jxl::N_SSE2::AcsVerticalSplit(unsigned long)
686
687
4.42M
AcStrategyType AcsHorizontalSplit(size_t blocks) {
688
4.42M
  if (blocks == 2) {
689
3.70M
    return AcStrategyType::DCT8X16;
690
3.70M
  } else if (blocks == 4) {
691
651k
    return AcStrategyType::DCT16X32;
692
651k
  } else {
693
67.7k
    return AcStrategyType::DCT32X64;
694
67.7k
  }
695
4.42M
}
Unexecuted instantiation: jxl::N_SSE4::AcsHorizontalSplit(unsigned long)
jxl::N_AVX2::AcsHorizontalSplit(unsigned long)
Line
Count
Source
687
4.42M
AcStrategyType AcsHorizontalSplit(size_t blocks) {
688
4.42M
  if (blocks == 2) {
689
3.70M
    return AcStrategyType::DCT8X16;
690
3.70M
  } else if (blocks == 4) {
691
651k
    return AcStrategyType::DCT16X32;
692
651k
  } else {
693
67.7k
    return AcStrategyType::DCT32X64;
694
67.7k
  }
695
4.42M
}
Unexecuted instantiation: jxl::N_AVX3::AcsHorizontalSplit(unsigned long)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::AcsHorizontalSplit(unsigned long)
Unexecuted instantiation: jxl::N_AVX3_SPR::AcsHorizontalSplit(unsigned long)
Unexecuted instantiation: jxl::N_SSE2::AcsHorizontalSplit(unsigned long)
696
697
// The following function tries to merge smaller transforms into
698
// squares and the rectangles originating from a single middle division
699
// (horizontal or vertical) fairly.
700
//
701
// This is now generalized to concern about squares
702
// of blocks X blocks size, where a block is 8x8 pixels.
703
Status FindBestFirstLevelDivisionForSquare(
704
    size_t blocks, bool allow_square_transform, size_t bx, size_t by, size_t cx,
705
    size_t cy, const ACSConfig& config, const float* JXL_RESTRICT cmap_factors,
706
    AcStrategyImage* JXL_RESTRICT ac_strategy, const float entropy_mul_JXK,
707
    const float entropy_mul_JXJ, float* JXL_RESTRICT entropy_estimate,
708
4.42M
    float* block, float* scratch_space, uint32_t* quantized) {
709
  // We denote J for the larger dimension here, and K for the smaller.
710
  // For example, for 32x32 block splitting, J would be 32, K 16.
711
4.42M
  const size_t blocks_half = blocks / 2;
712
4.42M
  const AcStrategyType acs_rawJXK = AcsVerticalSplit(blocks);
713
4.42M
  const AcStrategyType acs_rawKXJ = AcsHorizontalSplit(blocks);
714
4.42M
  const AcStrategyType acs_rawJXJ = AcsSquare(blocks);
715
4.42M
  const AcStrategy acsJXK = AcStrategy::FromRawStrategy(acs_rawJXK);
716
4.42M
  const AcStrategy acsKXJ = AcStrategy::FromRawStrategy(acs_rawKXJ);
717
4.42M
  const AcStrategy acsJXJ = AcStrategy::FromRawStrategy(acs_rawJXJ);
718
4.42M
  AcStrategyRow row0 = ac_strategy->ConstRow(by + cy + 0);
719
4.42M
  AcStrategyRow row1 = ac_strategy->ConstRow(by + cy + blocks_half);
720
  // Let's check if we can consider a JXJ block here at all.
721
  // This is not necessary in the basic use of hierarchically merging
722
  // blocks in the simplest possible way, but is needed when we try other
723
  // 'floating' options of merging, possibly after a simple hierarchical
724
  // merge has been explored.
725
4.42M
  if (MultiBlockTransformCrossesHorizontalBoundary(*ac_strategy, bx + cx,
726
4.42M
                                                   by + cy, bx + cx + blocks) ||
727
3.14M
      MultiBlockTransformCrossesHorizontalBoundary(
728
3.14M
          *ac_strategy, bx + cx, by + cy + blocks, bx + cx + blocks) ||
729
2.84M
      MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx, by + cy,
730
2.84M
                                                 by + cy + blocks) ||
731
2.59M
      MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx + blocks,
732
2.59M
                                                 by + cy, by + cy + blocks)) {
733
1.89M
    return true;  // not suitable for JxJ analysis, some transforms leak out.
734
1.89M
  }
735
  // For floating transforms there may be
736
  // already blocks selected that make either or both JXK and
737
  // KXJ not feasible for this location.
738
2.53M
  const bool allow_JXK = !MultiBlockTransformCrossesVerticalBoundary(
739
2.53M
      *ac_strategy, bx + cx + blocks_half, by + cy, by + cy + blocks);
740
2.53M
  const bool allow_KXJ = !MultiBlockTransformCrossesHorizontalBoundary(
741
2.53M
      *ac_strategy, bx + cx, by + cy + blocks_half, bx + cx + blocks);
742
  // Current entropies aggregated on NxN resolution.
743
2.53M
  float entropy[2][2] = {};
744
8.82M
  for (size_t dy = 0; dy < blocks; ++dy) {
745
25.4M
    for (size_t dx = 0; dx < blocks; ++dx) {
746
19.1M
      entropy[dy / blocks_half][dx / blocks_half] +=
747
19.1M
          entropy_estimate[(cy + dy) * 8 + (cx + dx)];
748
19.1M
    }
749
6.29M
  }
750
2.53M
  float entropy_JXK_left = std::numeric_limits<float>::max();
751
2.53M
  float entropy_JXK_right = std::numeric_limits<float>::max();
752
2.53M
  float entropy_KXJ_top = std::numeric_limits<float>::max();
753
2.53M
  float entropy_KXJ_bottom = std::numeric_limits<float>::max();
754
2.53M
  float entropy_JXJ = std::numeric_limits<float>::max();
755
2.53M
  if (allow_JXK) {
756
2.48M
    if (row0[bx + cx + 0].Strategy() != acs_rawJXK) {
757
2.44M
      JXL_RETURN_IF_ERROR(EstimateEntropy(
758
2.44M
          acsJXK, entropy_mul_JXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config,
759
2.44M
          cmap_factors, block, scratch_space, quantized, entropy_JXK_left));
760
2.44M
    }
761
2.48M
    if (row0[bx + cx + blocks_half].Strategy() != acs_rawJXK) {
762
2.45M
      JXL_RETURN_IF_ERROR(
763
2.45M
          EstimateEntropy(acsJXK, entropy_mul_JXK, (bx + cx + blocks_half) * 8,
764
2.45M
                          (by + cy + 0) * 8, config, cmap_factors, block,
765
2.45M
                          scratch_space, quantized, entropy_JXK_right));
766
2.45M
    }
767
2.48M
  }
768
2.53M
  if (allow_KXJ) {
769
2.47M
    if (row0[bx + cx].Strategy() != acs_rawKXJ) {
770
2.44M
      JXL_RETURN_IF_ERROR(EstimateEntropy(
771
2.44M
          acsKXJ, entropy_mul_JXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config,
772
2.44M
          cmap_factors, block, scratch_space, quantized, entropy_KXJ_top));
773
2.44M
    }
774
2.47M
    if (row1[bx + cx].Strategy() != acs_rawKXJ) {
775
2.45M
      JXL_RETURN_IF_ERROR(
776
2.45M
          EstimateEntropy(acsKXJ, entropy_mul_JXK, (bx + cx + 0) * 8,
777
2.45M
                          (by + cy + blocks_half) * 8, config, cmap_factors,
778
2.45M
                          block, scratch_space, quantized, entropy_KXJ_bottom));
779
2.45M
    }
780
2.47M
  }
781
2.53M
  if (allow_square_transform) {
782
    // We control the exploration of the square transform separately so that
783
    // we can turn it off at high decoding speeds for 32x32, but still allow
784
    // exploring 16x32 and 32x16.
785
2.53M
    JXL_RETURN_IF_ERROR(EstimateEntropy(
786
2.53M
        acsJXJ, entropy_mul_JXJ, (bx + cx + 0) * 8, (by + cy + 0) * 8, config,
787
2.53M
        cmap_factors, block, scratch_space, quantized, entropy_JXJ));
788
2.53M
  }
789
790
  // Test if this block should have JXK or KXJ transforms,
791
  // because it can have only one or the other.
792
2.53M
  float costJxN = std::min(entropy_JXK_left, entropy[0][0] + entropy[1][0]) +
793
2.53M
                  std::min(entropy_JXK_right, entropy[0][1] + entropy[1][1]);
794
2.53M
  float costNxJ = std::min(entropy_KXJ_top, entropy[0][0] + entropy[0][1]) +
795
2.53M
                  std::min(entropy_KXJ_bottom, entropy[1][0] + entropy[1][1]);
796
2.53M
  if (entropy_JXJ < costJxN && entropy_JXJ < costNxJ) {
797
549k
    JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_rawJXJ));
798
549k
    SetEntropyForTransform(cx, cy, acs_rawJXJ, entropy_JXJ, entropy_estimate);
799
1.98M
  } else if (costJxN < costNxJ) {
800
346k
    if (entropy_JXK_left < entropy[0][0] + entropy[1][0]) {
801
91.4k
      JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_rawJXK));
802
91.4k
      SetEntropyForTransform(cx, cy, acs_rawJXK, entropy_JXK_left,
803
91.4k
                             entropy_estimate);
804
91.4k
    }
805
346k
    if (entropy_JXK_right < entropy[0][1] + entropy[1][1]) {
806
91.8k
      JXL_RETURN_IF_ERROR(
807
91.8k
          ac_strategy->Set(bx + cx + blocks_half, by + cy, acs_rawJXK));
808
91.8k
      SetEntropyForTransform(cx + blocks_half, cy, acs_rawJXK,
809
91.8k
                             entropy_JXK_right, entropy_estimate);
810
91.8k
    }
811
1.63M
  } else {
812
1.63M
    if (entropy_KXJ_top < entropy[0][0] + entropy[0][1]) {
813
117k
      JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_rawKXJ));
814
117k
      SetEntropyForTransform(cx, cy, acs_rawKXJ, entropy_KXJ_top,
815
117k
                             entropy_estimate);
816
117k
    }
817
1.63M
    if (entropy_KXJ_bottom < entropy[1][0] + entropy[1][1]) {
818
123k
      JXL_RETURN_IF_ERROR(
819
123k
          ac_strategy->Set(bx + cx, by + cy + blocks_half, acs_rawKXJ));
820
123k
      SetEntropyForTransform(cx, cy + blocks_half, acs_rawKXJ,
821
123k
                             entropy_KXJ_bottom, entropy_estimate);
822
123k
    }
823
1.63M
  }
824
2.53M
  return true;
825
2.53M
}
Unexecuted instantiation: jxl::N_SSE4::FindBestFirstLevelDivisionForSquare(unsigned long, bool, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, float, float*, float*, float*, unsigned int*)
jxl::N_AVX2::FindBestFirstLevelDivisionForSquare(unsigned long, bool, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, float, float*, float*, float*, unsigned int*)
Line
Count
Source
708
4.42M
    float* block, float* scratch_space, uint32_t* quantized) {
709
  // We denote J for the larger dimension here, and K for the smaller.
710
  // For example, for 32x32 block splitting, J would be 32, K 16.
711
4.42M
  const size_t blocks_half = blocks / 2;
712
4.42M
  const AcStrategyType acs_rawJXK = AcsVerticalSplit(blocks);
713
4.42M
  const AcStrategyType acs_rawKXJ = AcsHorizontalSplit(blocks);
714
4.42M
  const AcStrategyType acs_rawJXJ = AcsSquare(blocks);
715
4.42M
  const AcStrategy acsJXK = AcStrategy::FromRawStrategy(acs_rawJXK);
716
4.42M
  const AcStrategy acsKXJ = AcStrategy::FromRawStrategy(acs_rawKXJ);
717
4.42M
  const AcStrategy acsJXJ = AcStrategy::FromRawStrategy(acs_rawJXJ);
718
4.42M
  AcStrategyRow row0 = ac_strategy->ConstRow(by + cy + 0);
719
4.42M
  AcStrategyRow row1 = ac_strategy->ConstRow(by + cy + blocks_half);
720
  // Let's check if we can consider a JXJ block here at all.
721
  // This is not necessary in the basic use of hierarchically merging
722
  // blocks in the simplest possible way, but is needed when we try other
723
  // 'floating' options of merging, possibly after a simple hierarchical
724
  // merge has been explored.
725
4.42M
  if (MultiBlockTransformCrossesHorizontalBoundary(*ac_strategy, bx + cx,
726
4.42M
                                                   by + cy, bx + cx + blocks) ||
727
3.14M
      MultiBlockTransformCrossesHorizontalBoundary(
728
3.14M
          *ac_strategy, bx + cx, by + cy + blocks, bx + cx + blocks) ||
729
2.84M
      MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx, by + cy,
730
2.84M
                                                 by + cy + blocks) ||
731
2.59M
      MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx + blocks,
732
2.59M
                                                 by + cy, by + cy + blocks)) {
733
1.89M
    return true;  // not suitable for JxJ analysis, some transforms leak out.
734
1.89M
  }
735
  // For floating transforms there may be
736
  // already blocks selected that make either or both JXK and
737
  // KXJ not feasible for this location.
738
2.53M
  const bool allow_JXK = !MultiBlockTransformCrossesVerticalBoundary(
739
2.53M
      *ac_strategy, bx + cx + blocks_half, by + cy, by + cy + blocks);
740
2.53M
  const bool allow_KXJ = !MultiBlockTransformCrossesHorizontalBoundary(
741
2.53M
      *ac_strategy, bx + cx, by + cy + blocks_half, bx + cx + blocks);
742
  // Current entropies aggregated on NxN resolution.
743
2.53M
  float entropy[2][2] = {};
744
8.82M
  for (size_t dy = 0; dy < blocks; ++dy) {
745
25.4M
    for (size_t dx = 0; dx < blocks; ++dx) {
746
19.1M
      entropy[dy / blocks_half][dx / blocks_half] +=
747
19.1M
          entropy_estimate[(cy + dy) * 8 + (cx + dx)];
748
19.1M
    }
749
6.29M
  }
750
2.53M
  float entropy_JXK_left = std::numeric_limits<float>::max();
751
2.53M
  float entropy_JXK_right = std::numeric_limits<float>::max();
752
2.53M
  float entropy_KXJ_top = std::numeric_limits<float>::max();
753
2.53M
  float entropy_KXJ_bottom = std::numeric_limits<float>::max();
754
2.53M
  float entropy_JXJ = std::numeric_limits<float>::max();
755
2.53M
  if (allow_JXK) {
756
2.48M
    if (row0[bx + cx + 0].Strategy() != acs_rawJXK) {
757
2.44M
      JXL_RETURN_IF_ERROR(EstimateEntropy(
758
2.44M
          acsJXK, entropy_mul_JXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config,
759
2.44M
          cmap_factors, block, scratch_space, quantized, entropy_JXK_left));
760
2.44M
    }
761
2.48M
    if (row0[bx + cx + blocks_half].Strategy() != acs_rawJXK) {
762
2.45M
      JXL_RETURN_IF_ERROR(
763
2.45M
          EstimateEntropy(acsJXK, entropy_mul_JXK, (bx + cx + blocks_half) * 8,
764
2.45M
                          (by + cy + 0) * 8, config, cmap_factors, block,
765
2.45M
                          scratch_space, quantized, entropy_JXK_right));
766
2.45M
    }
767
2.48M
  }
768
2.53M
  if (allow_KXJ) {
769
2.47M
    if (row0[bx + cx].Strategy() != acs_rawKXJ) {
770
2.44M
      JXL_RETURN_IF_ERROR(EstimateEntropy(
771
2.44M
          acsKXJ, entropy_mul_JXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config,
772
2.44M
          cmap_factors, block, scratch_space, quantized, entropy_KXJ_top));
773
2.44M
    }
774
2.47M
    if (row1[bx + cx].Strategy() != acs_rawKXJ) {
775
2.45M
      JXL_RETURN_IF_ERROR(
776
2.45M
          EstimateEntropy(acsKXJ, entropy_mul_JXK, (bx + cx + 0) * 8,
777
2.45M
                          (by + cy + blocks_half) * 8, config, cmap_factors,
778
2.45M
                          block, scratch_space, quantized, entropy_KXJ_bottom));
779
2.45M
    }
780
2.47M
  }
781
2.53M
  if (allow_square_transform) {
782
    // We control the exploration of the square transform separately so that
783
    // we can turn it off at high decoding speeds for 32x32, but still allow
784
    // exploring 16x32 and 32x16.
785
2.53M
    JXL_RETURN_IF_ERROR(EstimateEntropy(
786
2.53M
        acsJXJ, entropy_mul_JXJ, (bx + cx + 0) * 8, (by + cy + 0) * 8, config,
787
2.53M
        cmap_factors, block, scratch_space, quantized, entropy_JXJ));
788
2.53M
  }
789
790
  // Test if this block should have JXK or KXJ transforms,
791
  // because it can have only one or the other.
792
2.53M
  float costJxN = std::min(entropy_JXK_left, entropy[0][0] + entropy[1][0]) +
793
2.53M
                  std::min(entropy_JXK_right, entropy[0][1] + entropy[1][1]);
794
2.53M
  float costNxJ = std::min(entropy_KXJ_top, entropy[0][0] + entropy[0][1]) +
795
2.53M
                  std::min(entropy_KXJ_bottom, entropy[1][0] + entropy[1][1]);
796
2.53M
  if (entropy_JXJ < costJxN && entropy_JXJ < costNxJ) {
797
549k
    JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_rawJXJ));
798
549k
    SetEntropyForTransform(cx, cy, acs_rawJXJ, entropy_JXJ, entropy_estimate);
799
1.98M
  } else if (costJxN < costNxJ) {
800
346k
    if (entropy_JXK_left < entropy[0][0] + entropy[1][0]) {
801
91.4k
      JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_rawJXK));
802
91.4k
      SetEntropyForTransform(cx, cy, acs_rawJXK, entropy_JXK_left,
803
91.4k
                             entropy_estimate);
804
91.4k
    }
805
346k
    if (entropy_JXK_right < entropy[0][1] + entropy[1][1]) {
806
91.8k
      JXL_RETURN_IF_ERROR(
807
91.8k
          ac_strategy->Set(bx + cx + blocks_half, by + cy, acs_rawJXK));
808
91.8k
      SetEntropyForTransform(cx + blocks_half, cy, acs_rawJXK,
809
91.8k
                             entropy_JXK_right, entropy_estimate);
810
91.8k
    }
811
1.63M
  } else {
812
1.63M
    if (entropy_KXJ_top < entropy[0][0] + entropy[0][1]) {
813
117k
      JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_rawKXJ));
814
117k
      SetEntropyForTransform(cx, cy, acs_rawKXJ, entropy_KXJ_top,
815
117k
                             entropy_estimate);
816
117k
    }
817
1.63M
    if (entropy_KXJ_bottom < entropy[1][0] + entropy[1][1]) {
818
123k
      JXL_RETURN_IF_ERROR(
819
123k
          ac_strategy->Set(bx + cx, by + cy + blocks_half, acs_rawKXJ));
820
123k
      SetEntropyForTransform(cx, cy + blocks_half, acs_rawKXJ,
821
123k
                             entropy_KXJ_bottom, entropy_estimate);
822
123k
    }
823
1.63M
  }
824
2.53M
  return true;
825
2.53M
}
Unexecuted instantiation: jxl::N_AVX3::FindBestFirstLevelDivisionForSquare(unsigned long, bool, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, float, float*, float*, float*, unsigned int*)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::FindBestFirstLevelDivisionForSquare(unsigned long, bool, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, float, float*, float*, float*, unsigned int*)
Unexecuted instantiation: jxl::N_AVX3_SPR::FindBestFirstLevelDivisionForSquare(unsigned long, bool, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, float, float*, float*, float*, unsigned int*)
Unexecuted instantiation: jxl::N_SSE2::FindBestFirstLevelDivisionForSquare(unsigned long, bool, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, float, float*, float*, float*, unsigned int*)
826
827
// Selects the integral transforms (AC strategy) for one rectangle of at most
// 8x8 blocks and stores the result in `ac_strategy`.
//
// Outline:
// 1. Find the best 8x8 transform for every block.
// 2. Merge blocks into larger transforms where that lowers the estimated
//    entropy, starting from the smallest transforms (16x8 and 8x16). Both
//    split orientations are evaluated together with the enclosing square, so
//    that they compete fairly against each other.
// We are looking at 64x64 squares since the Y-to-X and Y-to-B maps happen to
// be at that resolution, and having integral transforms cross these
// boundaries leads to additional complications.
//
// cparams:     speed tiers and butteraugli distance that steer the search.
// config:      forwarded unchanged to the entropy estimation helpers.
// rect:        rectangle in block units; must be at most 8x8 blocks.
// cmap:        colour correlation map; the factors of the tile that contains
//              the origin of `rect` are used for the whole rectangle.
// block:       scratch buffer; the part starting at
//              block + 3 * AcStrategy::kMaxCoeffArea serves as scratch_space.
// quantized:   scratch buffer forwarded to the helpers.
// ac_strategy: strategy image, updated in place.
//
// Returns true on success, otherwise the first error reported by a helper
// or by the JXL_ENSURE size checks.
Status ProcessRectACS(const CompressParams& cparams, const ACSConfig& config,
                      const Rect& rect, const ColorCorrelationMap& cmap,
                      float* JXL_RESTRICT block,
                      uint32_t* JXL_RESTRICT quantized,
                      AcStrategyImage* ac_strategy) {
  const float butteraugli_target = cparams.butteraugli_distance;
  float* JXL_RESTRICT scratch_space = block + 3 * AcStrategy::kMaxCoeffArea;
  const size_t bx = rect.x0();
  const size_t by = rect.y0();
  const size_t xsize = rect.xsize();
  const size_t ysize = rect.ysize();
  JXL_ENSURE(xsize <= 8);
  JXL_ENSURE(ysize <= 8);
  const size_t tx = bx / kColorTileDimInBlocks;
  const size_t ty = by / kColorTileDimInBlocks;
  const float cmap_factors[3] = {
      cmap.base().YtoXRatio(cmap.ytox_map.ConstRow(ty)[tx]),
      0.0f,
      cmap.base().YtoBRatio(cmap.ytob_map.ConstRow(ty)[tx]),
  };
  if (cparams.speed_tier > SpeedTier::kHare) return true;

  // First compute the best 8x8 transform for each square. Later, we do not
  // experiment with different combinations, but only use the best of the 8x8s
  // when DCT8X8 is specified in the tree search.
  // 8x8 transforms have 10 variants, but every larger transform is just a DCT.
  float entropy_estimate[64] = {};
  // Favor all 8x8 transforms (against 16x8 and larger transforms) at
  // low butteraugli_target distances.
  static const float k8x8mul1 = -0.4;
  static const float k8x8mul2 = 1.0;
  static const float k8x8base = 1.4;
  const float mul8x8 = k8x8mul2 + k8x8mul1 / (butteraugli_target + k8x8base);
  for (size_t iy = 0; iy < ysize; iy++) {
    for (size_t ix = 0; ix < xsize; ix++) {
      float entropy = 0.0;
      AcStrategyType best_of_8x8s;
      JXL_RETURN_IF_ERROR(FindBest8x8Transform(
          8 * (bx + ix), 8 * (by + iy), static_cast<int>(cparams.speed_tier),
          butteraugli_target, config, cmap_factors, ac_strategy, block,
          scratch_space, quantized, &entropy, best_of_8x8s));
      JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + ix, by + iy, best_of_8x8s));
      // The estimates are stored with a fixed stride of 8 blocks per row.
      entropy_estimate[iy * 8 + ix] = entropy * mul8x8;
    }
  }

  // Merge when a larger transform is better than the previously
  // searched best combination of 8x8 transforms.
  struct MergeTry {
    AcStrategyType type;
    uint8_t priority;
    uint8_t decoding_speed_tier_max_limit;
    uint8_t encoding_speed_tier_max_limit;
    float entropy_mul;
  };
  // These numbers need to be figured out manually and looking at
  // ringing next to sky etc. Optimization will find smaller numbers
  // and produce more ringing than is ideal. Larger numbers will
  // help stop ringing.
  const float entropy_mul16X8 = 1.21;
  const float entropy_mul16X16 = 1.34;
  const float entropy_mul16X32 = 1.49;
  const float entropy_mul32X32 = 1.48;
  const float entropy_mul64X32 = 2.25;
  const float entropy_mul64X64 = 2.25;
  // TODO(jyrki): Consider this feedback in further changes:
  // Also effectively when the multipliers for smaller blocks are
  // below 1, this raises the bar for the bigger blocks even higher
  // in that sense these constants are not independent (e.g. changing
  // the constant for DCT16x32 by -5% (making it more likely) also
  // means that DCT32x32 becomes harder to do when starting from
  // two DCT16x32s). It might be better to make them more independent,
  // e.g. by not applying the multiplier when storing the new entropy
  // estimates in TryMergeToACSCandidate().
  //
  // The array is sized by its initializers, so the loop below visits exactly
  // the candidates listed here and no value-initialized padding entries.
  // The square transforms (DCT16X16, DCT32X32, DCT64X64) are not listed:
  // FindBestFirstLevelDivisionForSquare evaluates each square together with
  // its two half-splits.
  const MergeTry kTransformsForMerge[] = {
      {AcStrategyType::DCT16X8, 2, 4, 5, entropy_mul16X8},
      {AcStrategyType::DCT8X16, 2, 4, 5, entropy_mul16X8},
      {AcStrategyType::DCT16X32, 4, 4, 4, entropy_mul16X32},
      {AcStrategyType::DCT32X16, 4, 4, 4, entropy_mul16X32},
      {AcStrategyType::DCT64X32, 6, 1, 3, entropy_mul64X32},
      {AcStrategyType::DCT32X64, 6, 1, 3, entropy_mul64X32},
  };
  /*
  These sizes not yet included in merge heuristic:
  set(AcStrategyType::DCT32X8, 0.0f, 2.261390410971102f);
  set(AcStrategyType::DCT8X32, 0.0f, 2.261390410971102f);
  set(AcStrategyType::DCT128X128, 0.0f, 1.0f);
  set(AcStrategyType::DCT128X64, 0.0f, 0.73f);
  set(AcStrategyType::DCT64X128, 0.0f, 0.73f);
  set(AcStrategyType::DCT256X256, 0.0f, 1.0f);
  set(AcStrategyType::DCT256X128, 0.0f, 0.73f);
  set(AcStrategyType::DCT128X256, 0.0f, 0.73f);
  */

  // Priority is a tricky kludge to avoid collisions so that transforms
  // don't overlap.
  uint8_t priority[64] = {};
  const bool enable_32x32 = cparams.decoding_speed_tier < 4;
  for (const MergeTry& mt : kTransformsForMerge) {
    if (mt.decoding_speed_tier_max_limit < cparams.decoding_speed_tier) {
      continue;
    }
    AcStrategy acs = AcStrategy::FromRawStrategy(mt.type);
    const size_t step_y = acs.covered_blocks_y();
    const size_t step_x = acs.covered_blocks_x();

    for (size_t cy = 0; cy + step_y - 1 < ysize; cy += step_y) {
      for (size_t cx = 0; cx + step_x - 1 < xsize; cx += step_x) {
        // Stage 1: positions where a full 8x8-block (64x64 pixel) square fits.
        if (cy + 7 < ysize && cx + 7 < xsize) {
          if (cparams.decoding_speed_tier < 4 &&
              mt.type == AcStrategyType::DCT32X64) {
            // The 64x64 square and both of its half-splits are evaluated
            // in one call.
            if ((cy | cx) % 8 == 0) {
              JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare(
                  8, true, bx, by, cx, cy, config, cmap_factors, ac_strategy,
                  mt.entropy_mul, entropy_mul64X64, entropy_estimate, block,
                  scratch_space, quantized));
            }
            continue;
          } else if (mt.type == AcStrategyType::DCT32X16) {
            // NOTE(review): this skips DCT32X16, not DCT64X32 (the
            // counterpart of DCT32X64 handled above); confirm that this is
            // intended.
            continue;
          }
        }
        if ((mt.type == AcStrategyType::DCT16X32 && cy % 4 != 0) ||
            (mt.type == AcStrategyType::DCT32X16 && cx % 4 != 0)) {
          // Not aligned to the 4-block grid; left to the square-based
          // search.
          continue;
        }

        // Stage 2: positions where a full 4x4-block (32x32 pixel) square fits.
        if (cy + 3 < ysize && cx + 3 < xsize) {
          if (mt.type == AcStrategyType::DCT16X32) {
            // DCT16X32, DCT32X16 and (if enabled) DCT32X32 are evaluated
            // in one call.
            if ((cy | cx) % 4 == 0) {
              JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare(
                  4, enable_32x32, bx, by, cx, cy, config, cmap_factors,
                  ac_strategy, mt.entropy_mul, entropy_mul32X32,
                  entropy_estimate, block, scratch_space, quantized));
            }
            continue;
          } else if (mt.type == AcStrategyType::DCT32X16) {
            // Already evaluated together with DCT16X32 above. Positions in
            // the last rows/columns where no full 4x4 square fits still
            // reach TryMergeAcs below.
            continue;
          }
        }

        // Stage 3: positions where a full 2x2-block (16x16 pixel) square fits.
        if (cy + 1 < ysize && cx + 1 < xsize) {
          if (mt.type == AcStrategyType::DCT8X16) {
            // DCT8X16, DCT16X8 and DCT16X16 are evaluated in one call.
            if ((cy | cx) % 2 == 0) {
              JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare(
                  2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy,
                  mt.entropy_mul, entropy_mul16X16, entropy_estimate, block,
                  scratch_space, quantized));
            }
            continue;
          } else if (mt.type == AcStrategyType::DCT16X8) {
            // Already evaluated together with DCT8X16 above. The last
            // column and last row, when odd numbered, still reach
            // TryMergeAcs below.
            continue;
          }
        }
        if ((mt.type == AcStrategyType::DCT8X16 && cy % 2 == 1) ||
            (mt.type == AcStrategyType::DCT16X8 && cx % 2 == 1)) {
          // already covered by FindBestFirstLevelDivisionForSquare
          continue;
        }
        // All other merge sizes are handled here.
        // Some of the DCT16X8s and DCT8X16s will still leak through here
        // when there is an odd number of 8x8 blocks, then the last row
        // and column will get their DCT16X8s and DCT8X16s through the
        // normal integral transform merging process.
        JXL_RETURN_IF_ERROR(
            TryMergeAcs(mt.type, bx, by, cx, cy, config, cmap_factors,
                        ac_strategy, mt.entropy_mul, mt.priority, &priority[0],
                        entropy_estimate, block, scratch_space, quantized));
      }
    }
  }
  if (cparams.speed_tier >= SpeedTier::kHare) {
    return true;
  }
  // Here we still try to do some non-aligned matching, find a few more
  // 16X8, 8X16 and 16X16s between the non-2-aligned blocks.
  for (size_t cy = 0; cy + 1 < ysize; ++cy) {
    for (size_t cx = 0; cx + 1 < xsize; ++cx) {
      if ((cy | cx) % 2 != 0) {
        JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare(
            2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy,
            entropy_mul16X8, entropy_mul16X16, entropy_estimate, block,
            scratch_space, quantized));
      }
    }
  }
  // Non-aligned matching for 32X32, 16X32 and 32X16.
  const size_t step = cparams.speed_tier >= SpeedTier::kTortoise ? 2 : 1;
  for (size_t cy = 0; cy + 3 < ysize; cy += step) {
    for (size_t cx = 0; cx + 3 < xsize; cx += step) {
      if ((cy | cx) % 4 == 0) {
        continue;  // Already tried with loop above (DCT16X32 case).
      }
      JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare(
          4, enable_32x32, bx, by, cx, cy, config, cmap_factors, ac_strategy,
          entropy_mul16X32, entropy_mul32X32, entropy_estimate, block,
          scratch_space, quantized));
    }
  }
  return true;
}
Unexecuted instantiation: jxl::N_SSE4::ProcessRectACS(jxl::CompressParams const&, jxl::ACSConfig const&, jxl::RectT<unsigned long> const&, jxl::ColorCorrelationMap const&, float*, unsigned int*, jxl::AcStrategyImage*)
jxl::N_AVX2::ProcessRectACS(jxl::CompressParams const&, jxl::ACSConfig const&, jxl::RectT<unsigned long> const&, jxl::ColorCorrelationMap const&, float*, unsigned int*, jxl::AcStrategyImage*)
Line
Count
Source
831
89.5k
                      AcStrategyImage* ac_strategy) {
832
  // Main philosophy here:
833
  // 1. First find best 8x8 transform for each area.
834
  // 2. Merging them into larger transforms where possibly, but
835
  // starting from the smallest transforms (16x8 and 8x16).
836
  // Additional complication: 16x8 and 8x16 are considered
837
  // simultaneously and fairly against each other.
838
  // We are looking at 64x64 squares since the Y-to-X and Y-to-B
839
  // maps happen to be at that resolution, and having
840
  // integral transforms cross these boundaries leads to
841
  // additional complications.
842
89.5k
  const float butteraugli_target = cparams.butteraugli_distance;
843
89.5k
  float* JXL_RESTRICT scratch_space = block + 3 * AcStrategy::kMaxCoeffArea;
844
89.5k
  size_t bx = rect.x0();
845
89.5k
  size_t by = rect.y0();
846
89.5k
  JXL_ENSURE(rect.xsize() <= 8);
847
89.5k
  JXL_ENSURE(rect.ysize() <= 8);
848
89.5k
  size_t tx = bx / kColorTileDimInBlocks;
849
89.5k
  size_t ty = by / kColorTileDimInBlocks;
850
89.5k
  const float cmap_factors[3] = {
851
89.5k
      cmap.base().YtoXRatio(cmap.ytox_map.ConstRow(ty)[tx]),
852
89.5k
      0.0f,
853
89.5k
      cmap.base().YtoBRatio(cmap.ytob_map.ConstRow(ty)[tx]),
854
89.5k
  };
855
89.5k
  if (cparams.speed_tier > SpeedTier::kHare) return true;
856
  // First compute the best 8x8 transform for each square. Later, we do not
857
  // experiment with different combinations, but only use the best of the 8x8s
858
  // when DCT8X8 is specified in the tree search.
859
  // 8x8 transforms have 10 variants, but every larger transform is just a DCT.
860
89.5k
  float entropy_estimate[64] = {};
861
  // Favor all 8x8 transforms (against 16x8 and larger transforms)) at
862
  // low butteraugli_target distances.
863
89.5k
  static const float k8x8mul1 = -0.4;
864
89.5k
  static const float k8x8mul2 = 1.0;
865
89.5k
  static const float k8x8base = 1.4;
866
89.5k
  const float mul8x8 = k8x8mul2 + k8x8mul1 / (butteraugli_target + k8x8base);
867
754k
  for (size_t iy = 0; iy < rect.ysize(); iy++) {
868
5.60M
    for (size_t ix = 0; ix < rect.xsize(); ix++) {
869
4.94M
      float entropy = 0.0;
870
4.94M
      AcStrategyType best_of_8x8s;
871
4.94M
      JXL_RETURN_IF_ERROR(FindBest8x8Transform(
872
4.94M
          8 * (bx + ix), 8 * (by + iy), static_cast<int>(cparams.speed_tier),
873
4.94M
          butteraugli_target, config, cmap_factors, ac_strategy, block,
874
4.94M
          scratch_space, quantized, &entropy, best_of_8x8s));
875
4.94M
      JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + ix, by + iy, best_of_8x8s));
876
4.94M
      entropy_estimate[iy * 8 + ix] = entropy * mul8x8;
877
4.94M
    }
878
665k
  }
879
  // Merge when a larger transform is better than the previously
880
  // searched best combination of 8x8 transforms.
881
89.5k
  struct MergeTry {
882
89.5k
    AcStrategyType type;
883
89.5k
    uint8_t priority;
884
89.5k
    uint8_t decoding_speed_tier_max_limit;
885
89.5k
    uint8_t encoding_speed_tier_max_limit;
886
89.5k
    float entropy_mul;
887
89.5k
  };
888
  // These numbers need to be figured out manually and looking at
889
  // ringing next to sky etc. Optimization will find smaller numbers
890
  // and produce more ringing than is ideal. Larger numbers will
891
  // help stop ringing.
892
89.5k
  const float entropy_mul16X8 = 1.21;
893
89.5k
  const float entropy_mul16X16 = 1.34;
894
89.5k
  const float entropy_mul16X32 = 1.49;
895
89.5k
  const float entropy_mul32X32 = 1.48;
896
89.5k
  const float entropy_mul64X32 = 2.25;
897
89.5k
  const float entropy_mul64X64 = 2.25;
898
  // TODO(jyrki): Consider this feedback in further changes:
899
  // Also effectively when the multipliers for smaller blocks are
900
  // below 1, this raises the bar for the bigger blocks even higher
901
  // in that sense these constants are not independent (e.g. changing
902
  // the constant for DCT16x32 by -5% (making it more likely) also
903
  // means that DCT32x32 becomes harder to do when starting from
904
  // two DCT16x32s). It might be better to make them more independent,
905
  // e.g. by not applying the multiplier when storing the new entropy
906
  // estimates in TryMergeToACSCandidate().
907
89.5k
  const MergeTry kTransformsForMerge[9] = {
908
89.5k
      {AcStrategyType::DCT16X8, 2, 4, 5, entropy_mul16X8},
909
89.5k
      {AcStrategyType::DCT8X16, 2, 4, 5, entropy_mul16X8},
910
      // FindBestFirstLevelDivisionForSquare looks for DCT16X16 and its
911
      // subdivisions. {AcStrategyType::DCT16X16, 3, entropy_mul16X16},
912
89.5k
      {AcStrategyType::DCT16X32, 4, 4, 4, entropy_mul16X32},
913
89.5k
      {AcStrategyType::DCT32X16, 4, 4, 4, entropy_mul16X32},
914
      // FindBestFirstLevelDivisionForSquare looks for DCT32X32 and its
915
      // subdivisions. {AcStrategyType::DCT32X32, 5, 1, 5,
916
      // 0.9822994906548809f},
917
89.5k
      {AcStrategyType::DCT64X32, 6, 1, 3, entropy_mul64X32},
918
89.5k
      {AcStrategyType::DCT32X64, 6, 1, 3, entropy_mul64X32},
919
      // {AcStrategyType::DCT64X64, 8, 1, 3, 2.0846542128012948f},
920
89.5k
  };
921
  /*
922
  These sizes not yet included in merge heuristic:
923
  set(AcStrategyType::DCT32X8, 0.0f, 2.261390410971102f);
924
  set(AcStrategyType::DCT8X32, 0.0f, 2.261390410971102f);
925
  set(AcStrategyType::DCT128X128, 0.0f, 1.0f);
926
  set(AcStrategyType::DCT128X64, 0.0f, 0.73f);
927
  set(AcStrategyType::DCT64X128, 0.0f, 0.73f);
928
  set(AcStrategyType::DCT256X256, 0.0f, 1.0f);
929
  set(AcStrategyType::DCT256X128, 0.0f, 0.73f);
930
  set(AcStrategyType::DCT128X256, 0.0f, 0.73f);
931
  */
932
933
  // Priority is a tricky kludge to avoid collisions so that transforms
934
  // don't overlap.
935
89.5k
  uint8_t priority[64] = {};
936
89.5k
  bool enable_32x32 = cparams.decoding_speed_tier < 4;
937
806k
  for (auto mt : kTransformsForMerge) {
938
806k
    if (mt.decoding_speed_tier_max_limit < cparams.decoding_speed_tier) {
939
0
      continue;
940
0
    }
941
806k
    AcStrategy acs = AcStrategy::FromRawStrategy(mt.type);
942
943
4.51M
    for (size_t cy = 0; cy + acs.covered_blocks_y() - 1 < rect.ysize();
944
3.71M
         cy += acs.covered_blocks_y()) {
945
24.8M
      for (size_t cx = 0; cx + acs.covered_blocks_x() - 1 < rect.xsize();
946
21.1M
           cx += acs.covered_blocks_x()) {
947
21.1M
        if (cy + 7 < rect.ysize() && cx + 7 < rect.xsize()) {
948
609k
          if (cparams.decoding_speed_tier < 4 &&
949
609k
              mt.type == AcStrategyType::DCT32X64) {
950
            // We handle both DCT8X16 and DCT16X8 at the same time.
951
67.7k
            if ((cy | cx) % 8 == 0) {
952
67.7k
              JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare(
953
67.7k
                  8, true, bx, by, cx, cy, config, cmap_factors, ac_strategy,
954
67.7k
                  mt.entropy_mul, entropy_mul64X64, entropy_estimate, block,
955
67.7k
                  scratch_space, quantized));
956
67.7k
            }
957
67.7k
            continue;
958
541k
          } else if (mt.type == AcStrategyType::DCT32X16) {
959
            // We handled both DCT8X16 and DCT16X8 at the same time,
960
            // and that is above. The last column and last row,
961
            // when the last column or last row is odd numbered,
962
            // are still handled by TryMergeAcs.
963
67.7k
            continue;
964
67.7k
          }
965
609k
        }
966
21.0M
        if ((mt.type == AcStrategyType::DCT16X32 && cy % 4 != 0) ||
967
20.7M
            (mt.type == AcStrategyType::DCT32X16 && cx % 4 != 0)) {
968
          // already covered by FindBest32X32
969
576k
          continue;
970
576k
        }
971
972
20.4M
        if (cy + 3 < rect.ysize() && cx + 3 < rect.xsize()) {
973
8.32M
          if (mt.type == AcStrategyType::DCT16X32) {
974
            // We handle both DCT8X16 and DCT16X8 at the same time.
975
288k
            if ((cy | cx) % 4 == 0) {
976
288k
              JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare(
977
288k
                  4, enable_32x32, bx, by, cx, cy, config, cmap_factors,
978
288k
                  ac_strategy, mt.entropy_mul, entropy_mul32X32,
979
288k
                  entropy_estimate, block, scratch_space, quantized));
980
288k
            }
981
288k
            continue;
982
8.04M
          } else if (mt.type == AcStrategyType::DCT32X16) {
983
            // We handled both DCT8X16 and DCT16X8 at the same time,
984
            // and that is above. The last column and last row,
985
            // when the last column or last row is odd numbered,
986
            // are still handled by TryMergeAcs.
987
220k
            continue;
988
220k
          }
989
8.32M
        }
990
19.9M
        if ((mt.type == AcStrategyType::DCT16X32 && cy % 4 != 0) ||
991
19.9M
            (mt.type == AcStrategyType::DCT32X16 && cx % 4 != 0)) {
992
          // already covered by FindBest32X32
993
0
          continue;
994
0
        }
995
19.9M
        if (cy + 1 < rect.ysize() && cx + 1 < rect.xsize()) {
996
15.5M
          if (mt.type == AcStrategyType::DCT8X16) {
997
            // We handle both DCT8X16 and DCT16X8 at the same time.
998
2.12M
            if ((cy | cx) % 2 == 0) {
999
1.21M
              JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare(
1000
1.21M
                  2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy,
1001
1.21M
                  mt.entropy_mul, entropy_mul16X16, entropy_estimate, block,
1002
1.21M
                  scratch_space, quantized));
1003
1.21M
            }
1004
2.12M
            continue;
1005
13.4M
          } else if (mt.type == AcStrategyType::DCT16X8) {
1006
            // We handled both DCT8X16 and DCT16X8 at the same time,
1007
            // and that is above. The last column and last row,
1008
            // when the last column or last row is odd numbered,
1009
            // are still handled by TryMergeAcs.
1010
2.10M
            continue;
1011
2.10M
          }
1012
15.5M
        }
1013
15.7M
        if ((mt.type == AcStrategyType::DCT8X16 && cy % 2 == 1) ||
1014
15.4M
            (mt.type == AcStrategyType::DCT16X8 && cx % 2 == 1)) {
1015
          // already covered by FindBestFirstLevelDivisionForSquare
1016
608k
          continue;
1017
608k
        }
1018
        // All other merge sizes are handled here.
1019
        // Some of the DCT16X8s and DCT8X16s will still leak through here
1020
        // when there is an odd number of 8x8 blocks, then the last row
1021
        // and column will get their DCT16X8s and DCT8X16s through the
1022
        // normal integral transform merging process.
1023
15.1M
        JXL_RETURN_IF_ERROR(
1024
15.1M
            TryMergeAcs(mt.type, bx, by, cx, cy, config, cmap_factors,
1025
15.1M
                        ac_strategy, mt.entropy_mul, mt.priority, &priority[0],
1026
15.1M
                        entropy_estimate, block, scratch_space, quantized));
1027
15.1M
      }
1028
3.71M
    }
1029
806k
  }
1030
89.5k
  if (cparams.speed_tier >= SpeedTier::kHare) {
1031
0
    return true;
1032
0
  }
1033
  // Here we still try to do some non-aligned matching, find a few more
1034
  // 16X8, 8X16 and 16X16s between the non-2-aligned blocks.
1035
665k
  for (size_t cy = 0; cy + 1 < rect.ysize(); ++cy) {
1036
4.27M
    for (size_t cx = 0; cx + 1 < rect.xsize(); ++cx) {
1037
3.70M
      if ((cy | cx) % 2 != 0) {
1038
2.49M
        JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare(
1039
2.49M
            2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy,
1040
2.49M
            entropy_mul16X8, entropy_mul16X16, entropy_estimate, block,
1041
2.49M
            scratch_space, quantized));
1042
2.49M
      }
1043
3.70M
    }
1044
575k
  }
1045
  // Non-aligned matching for 32X32, 16X32 and 32X16.
1046
89.5k
  size_t step = cparams.speed_tier >= SpeedTier::kTortoise ? 2 : 1;
1047
328k
  for (size_t cy = 0; cy + 3 < rect.ysize(); cy += step) {
1048
891k
    for (size_t cx = 0; cx + 3 < rect.xsize(); cx += step) {
1049
651k
      if ((cy | cx) % 4 == 0) {
1050
288k
        continue;  // Already tried with loop above (DCT16X32 case).
1051
288k
      }
1052
363k
      JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare(
1053
363k
          4, enable_32x32, bx, by, cx, cy, config, cmap_factors, ac_strategy,
1054
363k
          entropy_mul16X32, entropy_mul32X32, entropy_estimate, block,
1055
363k
          scratch_space, quantized));
1056
363k
    }
1057
239k
  }
1058
89.5k
  return true;
1059
89.5k
}
Unexecuted instantiation: jxl::N_AVX3::ProcessRectACS(jxl::CompressParams const&, jxl::ACSConfig const&, jxl::RectT<unsigned long> const&, jxl::ColorCorrelationMap const&, float*, unsigned int*, jxl::AcStrategyImage*)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::ProcessRectACS(jxl::CompressParams const&, jxl::ACSConfig const&, jxl::RectT<unsigned long> const&, jxl::ColorCorrelationMap const&, float*, unsigned int*, jxl::AcStrategyImage*)
Unexecuted instantiation: jxl::N_AVX3_SPR::ProcessRectACS(jxl::CompressParams const&, jxl::ACSConfig const&, jxl::RectT<unsigned long> const&, jxl::ColorCorrelationMap const&, float*, unsigned int*, jxl::AcStrategyImage*)
Unexecuted instantiation: jxl::N_SSE2::ProcessRectACS(jxl::CompressParams const&, jxl::ACSConfig const&, jxl::RectT<unsigned long> const&, jxl::ColorCorrelationMap const&, float*, unsigned int*, jxl::AcStrategyImage*)
1060
1061
// NOLINTNEXTLINE(google-readability-namespace-comments)
1062
}  // namespace HWY_NAMESPACE
1063
}  // namespace jxl
1064
HWY_AFTER_NAMESPACE();
1065
1066
#if HWY_ONCE
1067
namespace jxl {
1068
// Exports the per-target builds of ProcessRectACS so that
// HWY_DYNAMIC_DISPATCH(ProcessRectACS) in AcStrategyHeuristics::ProcessRect
// can select one of them at runtime.
HWY_EXPORT(ProcessRectACS);
1069
1070
// Fills in `config` for a subsequent AC strategy search.
//
//  - Stores `matrices` in `config.dequant` and asks it to compute the
//    dequantization tables the current speed tier needs: only bit 0 (DCT8)
//    when `cparams.speed_tier >= SpeedTier::kCheetah`, otherwise one bit for
//    every strategy whose index is below AcStrategyType::DCT128X128.
//  - Caches the first-row pointer and row stride of `quant_field`, `mask`,
//    `mask1x1` and of the three planes of `src` (taken at `rect_in`). The mask
//    pointers are only written when the corresponding image is non-empty.
//  - Sets the entropy-estimate tuning constants and scales them by a power of
//    a ratio derived from `cparams.butteraugli_distance`.
//
// Propagates the failure of EnsureComputed via JXL_RETURN_IF_ERROR; otherwise
// returns true.
Status AcStrategyHeuristics::Init(const Image3F& src, const Rect& rect_in,
                                  const ImageF& quant_field, const ImageF& mask,
                                  const ImageF& mask1x1,
                                  DequantMatrices* matrices) {
  config.dequant = matrices;

  if (cparams.speed_tier >= SpeedTier::kCheetah) {
    JXL_RETURN_IF_ERROR(
        matrices->EnsureComputed(memory_manager, 1));  // DCT8 only
  } else {
    // All transforms up to 64x64.
    constexpr size_t kNumStrategies =
        static_cast<size_t>(AcStrategyType::DCT128X128);
    // One bit per strategy is stored in a 32-bit mask; shifting past bit 31
    // would be undefined behaviour, so fail the build if the enum outgrows it.
    static_assert(kNumStrategies <= 32,
                  "AC strategy mask does not fit into uint32_t");
    uint32_t acs_mask = 0;
    for (size_t i = 0; i < kNumStrategies; i++) {
      // Unsigned shift: a plain `1 << i` is a signed int shift.
      acs_mask |= (uint32_t{1} << i);
    }
    JXL_RETURN_IF_ERROR(matrices->EnsureComputed(memory_manager, acs_mask));
  }

  // Image row pointers and strides.
  config.quant_field_row = quant_field.Row(0);
  config.quant_field_stride = quant_field.PixelsPerRow();
  if (mask.xsize() > 0 && mask.ysize() > 0) {
    config.masking_field_row = mask.Row(0);
    config.masking_field_stride = mask.PixelsPerRow();
  }
  // The width is recorded even when the 1x1 mask is empty.
  config.mask1x1_xsize = mask1x1.xsize();
  if (mask1x1.xsize() > 0 && mask1x1.ysize() > 0) {
    config.masking1x1_field_row = mask1x1.Row(0);
    config.masking1x1_field_stride = mask1x1.PixelsPerRow();
  }

  config.src_rows[0] = rect_in.ConstPlaneRow(src, 0, 0);
  config.src_rows[1] = rect_in.ConstPlaneRow(src, 1, 0);
  config.src_rows[2] = rect_in.ConstPlaneRow(src, 2, 0);
  config.src_stride = src.PixelsPerRow();

  // Entropy estimate is composed of two factors:
  //  - estimate of the number of bits that will be used by the block
  //  - information loss due to quantization
  // The following constant controls the relative weights of these components.
  config.info_loss_multiplier = 1.2;
  config.zeros_mul = 9.3089059022677905;
  config.cost_delta = 10.833273317067883;

  // `ratio` equals 1 when the butteraugli distance is 1, so the constants
  // above are used unscaled at that distance.
  static const float kBias = 0.13731742964354549;
  const float ratio = (cparams.butteraugli_distance + kBias) / (1.0f + kBias);

  static const float kPow1 = 0.33677806662454718;
  static const float kPow2 = 0.50990926717963703;
  static const float kPow3 = 0.36702940662370243;
  config.info_loss_multiplier *= std::pow(ratio, kPow1);
  config.zeros_mul *= std::pow(ratio, kPow2);
  config.cost_delta *= std::pow(ratio, kPow3);
  return true;
}
1126
1127
3.48k
// Allocates the two scratch arenas (`mem` for floats, `qmem` for uint32_t)
// with room for `num_threads` equally sized slices, and records the slice
// sizes (in elements) in `mem_per_thread` / `qmem_per_thread`.
// ProcessRect later offsets into the arenas by `thread * *_per_thread`.
//
// Propagates an allocation failure via JXL_ASSIGN_OR_RETURN; otherwise
// returns true.
Status AcStrategyHeuristics::PrepareForThreads(std::size_t num_threads) {
  // Float slice: six coefficient-area buffers plus scratch for the transforms.
  const size_t floats_per_vector = MaxVectorSize() / sizeof(float);
  const size_t transform_scratch =
      3 * floats_per_vector * AcStrategy::kMaxBlockDim;
  mem_per_thread = 6 * AcStrategy::kMaxCoeffArea + transform_scratch;
  JXL_ASSIGN_OR_RETURN(
      mem, AlignedMemory::Create(
               memory_manager, num_threads * mem_per_thread * sizeof(float)));

  // Integer slice: one coefficient-area buffer per thread.
  qmem_per_thread = AcStrategy::kMaxCoeffArea;
  JXL_ASSIGN_OR_RETURN(
      qmem,
      AlignedMemory::Create(memory_manager,
                            num_threads * qmem_per_thread * sizeof(uint32_t)));
  return true;
}
1138
1139
// Chooses AC strategies for the blocks in `rect` and writes them to
// `ac_strategy`. `thread` selects this caller's slice of the scratch arenas
// set up by PrepareForThreads.
//
// Returns true on the DCT8-only path, otherwise whatever the dispatched
// ProcessRectACS returns.
Status AcStrategyHeuristics::ProcessRect(const Rect& rect,
                                         const ColorCorrelationMap& cmap,
                                         AcStrategyImage* ac_strategy,
                                         size_t thread) {
  const bool dct8_everywhere = cparams.speed_tier >= SpeedTier::kCheetah;
  if (!dct8_everywhere) {
    // Per-thread scratch: slice `thread` of the float and uint32_t arenas.
    const auto float_scratch = mem.address<float>() + thread * mem_per_thread;
    const auto quantized_scratch =
        qmem.address<uint32_t>() + thread * qmem_per_thread;
    return HWY_DYNAMIC_DISPATCH(ProcessRectACS)(cparams, config, rect, cmap,
                                                float_scratch,
                                                quantized_scratch, ac_strategy);
  }
  // In Cheetah mode, use DCT8 everywhere and uniform quantization.
  ac_strategy->FillDCT8(rect);
  return true;
}
1153
1154
// Post-processing after the strategy search: fills the per-transform block
// counters of `aux_out` (when it is non-null) from `ac_strategy`, and, when
// JXL_DEBUG_AC_STRATEGY is enabled and debug output is wanted for `cparams`,
// dumps the strategy map via DumpAcStrategy using the frame's xsize/ysize.
//
// Propagates a DumpAcStrategy failure via JXL_RETURN_IF_ERROR; otherwise
// returns true.
Status AcStrategyHeuristics::Finalize(const FrameDimensions& frame_dim,
                                      const AcStrategyImage& ac_strategy,
                                      AuxOut* aux_out) {
  // Accounting.
  if (aux_out != nullptr) {
    using Type = AcStrategyType;
    const auto count = [&ac_strategy](Type type) {
      return ac_strategy.CountBlocks(type);
    };

    aux_out->num_small_blocks =
        count(Type::IDENTITY) + count(Type::DCT2X2) + count(Type::DCT4X4);
    aux_out->num_dct4x8_blocks = count(Type::DCT4X8) + count(Type::DCT8X4);
    aux_out->num_afv_blocks = count(Type::AFV0) + count(Type::AFV1) +
                              count(Type::AFV2) + count(Type::AFV3);
    aux_out->num_dct8_blocks = count(Type::DCT);
    aux_out->num_dct8x16_blocks = count(Type::DCT8X16) + count(Type::DCT16X8);
    aux_out->num_dct8x32_blocks = count(Type::DCT8X32) + count(Type::DCT32X8);
    aux_out->num_dct16_blocks = count(Type::DCT16X16);
    aux_out->num_dct16x32_blocks =
        count(Type::DCT16X32) + count(Type::DCT32X16);
    aux_out->num_dct32_blocks = count(Type::DCT32X32);
    aux_out->num_dct32x64_blocks =
        count(Type::DCT32X64) + count(Type::DCT64X32);
    aux_out->num_dct64_blocks = count(Type::DCT64X64);
  }

  // Debug output.
  if (!(JXL_DEBUG_AC_STRATEGY && WantDebugOutput(cparams))) {
    return true;
  }
  JXL_RETURN_IF_ERROR(DumpAcStrategy(ac_strategy, frame_dim.xsize,
                                     frame_dim.ysize, "ac_strategy", aux_out,
                                     cparams));
  return true;
}
1198
1199
}  // namespace jxl
1200
#endif  // HWY_ONCE