/src/libjxl/lib/jxl/enc_ac_strategy.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jxl/enc_ac_strategy.h" |
7 | | |
8 | | #include <algorithm> |
9 | | #include <cmath> |
10 | | #include <cstdint> |
11 | | #include <cstdio> |
12 | | #include <cstring> |
13 | | #include <limits> |
14 | | |
15 | | #include "lib/jxl/chroma_from_luma.h" |
16 | | #include "lib/jxl/common.h" |
17 | | #include "lib/jxl/frame_dimensions.h" |
18 | | #include "lib/jxl/image.h" |
19 | | #include "lib/jxl/memory_manager_internal.h" |
20 | | #include "lib/jxl/quant_weights.h" |
21 | | |
22 | | #undef HWY_TARGET_INCLUDE |
23 | | #define HWY_TARGET_INCLUDE "lib/jxl/enc_ac_strategy.cc" |
24 | | #include <hwy/foreach_target.h> |
25 | | #include <hwy/highway.h> |
26 | | |
27 | | #include "lib/jxl/ac_strategy.h" |
28 | | #include "lib/jxl/base/bits.h" |
29 | | #include "lib/jxl/base/compiler_specific.h" |
30 | | #include "lib/jxl/base/fast_math-inl.h" |
31 | | #include "lib/jxl/base/rect.h" |
32 | | #include "lib/jxl/base/status.h" |
33 | | #include "lib/jxl/dec_transforms-inl.h" |
34 | | #include "lib/jxl/enc_aux_out.h" |
35 | | #include "lib/jxl/enc_debug_image.h" |
36 | | #include "lib/jxl/enc_params.h" |
37 | | #include "lib/jxl/enc_transforms-inl.h" |
38 | | #include "lib/jxl/simd_util.h" |
39 | | |
40 | | // Some of the floating point constants in this file and in other |
41 | | // files in the libjxl project have been obtained using the |
42 | | // tools/optimizer/simplex_fork.py tool. It is a variation of |
43 | | // Nelder-Mead optimization, and we generally try to minimize |
44 | | // BPP * pnorm aggregate as reported by the benchmark_xl tool, |
45 | | // but occasionally the values are optimized by using additional |
46 | | // constraints such as maintaining a certain density, or ratio of |
47 | | // popularity of integral transforms. Jyrki visually reviews all |
48 | | // such changes and often makes manual changes to maintain good |
49 | | // visual quality to changes where butteraugli was not sufficiently |
50 | | // sensitive to some kind of degradation. Unfortunately image quality |
51 | | // is still more of an art than science. |
52 | | |
53 | | // Set JXL_DEBUG_AC_STRATEGY to 1 to enable debugging. |
54 | | #ifndef JXL_DEBUG_AC_STRATEGY |
55 | 4.26k | #define JXL_DEBUG_AC_STRATEGY 0 |
56 | | #endif |
57 | | |
58 | | // This must come before the begin/end_target, but HWY_ONCE is only true |
59 | | // after that, so use an "include guard". |
60 | | #ifndef LIB_JXL_ENC_AC_STRATEGY_ |
61 | | #define LIB_JXL_ENC_AC_STRATEGY_ |
62 | | // Parameters of the heuristic are marked with an OPTIMIZE comment. |
63 | | namespace jxl { |
64 | | namespace { |
65 | | |
66 | | // Debugging utilities. |
67 | | |
68 | | // Returns a linear sRGB color (as bytes) for each AC strategy. |
69 | 0 | const uint8_t* TypeColor(uint8_t raw_strategy) { |
70 | 0 | JXL_DASSERT(AcStrategy::IsRawStrategyValid(raw_strategy)); |
71 | 0 | static_assert(AcStrategy::kNumValidStrategies == 27, "Update colors"); |
72 | 0 | static constexpr uint8_t kColors[AcStrategy::kNumValidStrategies + 1][3] = { |
73 | 0 | {0xFF, 0xFF, 0x00}, // DCT8 | yellow |
74 | 0 | {0xFF, 0x80, 0x80}, // HORNUSS | vivid tangerine |
75 | 0 | {0xFF, 0x80, 0x80}, // DCT2x2 | vivid tangerine |
76 | 0 | {0xFF, 0x80, 0x80}, // DCT4x4 | vivid tangerine |
77 | 0 | {0x80, 0xFF, 0x00}, // DCT16x16 | chartreuse |
78 | 0 | {0x00, 0xC0, 0x00}, // DCT32x32 | waystone green |
79 | 0 | {0xC0, 0xFF, 0x00}, // DCT16x8 | lime |
80 | 0 | {0xC0, 0xFF, 0x00}, // DCT8x16 | lime |
81 | 0 | {0x00, 0xFF, 0x00}, // DCT32x8 | green |
82 | 0 | {0x00, 0xFF, 0x00}, // DCT8x32 | green |
83 | 0 | {0x00, 0xFF, 0x00}, // DCT32x16 | green |
84 | 0 | {0x00, 0xFF, 0x00}, // DCT16x32 | green |
85 | 0 | {0xFF, 0x80, 0x00}, // DCT4x8 | orange juice |
86 | 0 | {0xFF, 0x80, 0x00}, // DCT8x4 | orange juice |
87 | 0 | {0xFF, 0xFF, 0x80}, // AFV0 | butter |
88 | 0 | {0xFF, 0xFF, 0x80}, // AFV1 | butter |
89 | 0 | {0xFF, 0xFF, 0x80}, // AFV2 | butter |
90 | 0 | {0xFF, 0xFF, 0x80}, // AFV3 | butter |
91 | 0 | {0x00, 0xC0, 0xFF}, // DCT64x64 | capri |
92 | 0 | {0x00, 0xFF, 0xFF}, // DCT64x32 | aqua |
93 | 0 | {0x00, 0xFF, 0xFF}, // DCT32x64 | aqua |
94 | 0 | {0x00, 0x40, 0xFF}, // DCT128x128 | rare blue |
95 | 0 | {0x00, 0x80, 0xFF}, // DCT128x64 | magic ink |
96 | 0 | {0x00, 0x80, 0xFF}, // DCT64x128 | magic ink |
97 | 0 | {0x00, 0x00, 0xC0}, // DCT256x256 | keese blue |
98 | 0 | {0x00, 0x00, 0xFF}, // DCT256x128 | blue |
99 | 0 | {0x00, 0x00, 0xFF}, // DCT128x256 | blue |
100 | 0 | {0x00, 0x00, 0x00} // invalid | black |
101 | 0 | }; |
102 | 0 | raw_strategy = |
103 | 0 | Clamp1<uint8_t>(raw_strategy, 0, AcStrategy::kNumValidStrategies); |
104 | 0 | return kColors[raw_strategy]; |
105 | 0 | } |
106 | | |
107 | 0 | const uint8_t* TypeMask(uint8_t raw_strategy) { |
108 | 0 | JXL_DASSERT(AcStrategy::IsRawStrategyValid(raw_strategy)); |
109 | 0 | static_assert(AcStrategy::kNumValidStrategies == 27, "Update masks"); |
110 | 0 | // implicitly, first row and column is made dark |
111 | 0 | static constexpr uint8_t kMask[AcStrategy::kNumValidStrategies + 1][64] = { |
112 | 0 | { |
113 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
114 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
115 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
116 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
117 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
118 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
119 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
120 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
121 | 0 | }, // DCT8 |
122 | 0 | { |
123 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
124 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
125 | 0 | 0, 0, 1, 0, 0, 1, 0, 0, // |
126 | 0 | 0, 0, 1, 0, 0, 1, 0, 0, // |
127 | 0 | 0, 0, 1, 1, 1, 1, 0, 0, // |
128 | 0 | 0, 0, 1, 0, 0, 1, 0, 0, // |
129 | 0 | 0, 0, 1, 0, 0, 1, 0, 0, // |
130 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
131 | 0 | }, // HORNUSS |
132 | 0 | { |
133 | 0 | 1, 1, 1, 1, 1, 1, 1, 1, // |
134 | 0 | 1, 0, 1, 0, 1, 0, 1, 0, // |
135 | 0 | 1, 1, 1, 1, 1, 1, 1, 1, // |
136 | 0 | 1, 0, 1, 0, 1, 0, 1, 0, // |
137 | 0 | 1, 1, 1, 1, 1, 1, 1, 1, // |
138 | 0 | 1, 0, 1, 0, 1, 0, 1, 0, // |
139 | 0 | 1, 1, 1, 1, 1, 1, 1, 1, // |
140 | 0 | 1, 0, 1, 0, 1, 0, 1, 0, // |
141 | 0 | }, // 2x2 |
142 | 0 | { |
143 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
144 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
145 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
146 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
147 | 0 | 1, 1, 1, 1, 1, 1, 1, 1, // |
148 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
149 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
150 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
151 | 0 | }, // 4x4 |
152 | 0 | {}, // DCT16x16 (unused) |
153 | 0 | {}, // DCT32x32 (unused) |
154 | 0 | {}, // DCT16x8 (unused) |
155 | 0 | {}, // DCT8x16 (unused) |
156 | 0 | {}, // DCT32x8 (unused) |
157 | 0 | {}, // DCT8x32 (unused) |
158 | 0 | {}, // DCT32x16 (unused) |
159 | 0 | {}, // DCT16x32 (unused) |
160 | 0 | { |
161 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
162 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
163 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
164 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
165 | 0 | 1, 1, 1, 1, 1, 1, 1, 1, // |
166 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
167 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
168 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
169 | 0 | }, // DCT4x8 |
170 | 0 | { |
171 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
172 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
173 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
174 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
175 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
176 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
177 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
178 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
179 | 0 | }, // DCT8x4 |
180 | 0 | { |
181 | 0 | 1, 1, 1, 1, 1, 0, 0, 0, // |
182 | 0 | 1, 1, 1, 1, 0, 0, 0, 0, // |
183 | 0 | 1, 1, 1, 0, 0, 0, 0, 0, // |
184 | 0 | 1, 1, 0, 0, 0, 0, 0, 0, // |
185 | 0 | 1, 0, 0, 0, 0, 0, 0, 0, // |
186 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
187 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
188 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
189 | 0 | }, // AFV0 |
190 | 0 | { |
191 | 0 | 0, 0, 0, 0, 1, 1, 1, 1, // |
192 | 0 | 0, 0, 0, 0, 0, 1, 1, 1, // |
193 | 0 | 0, 0, 0, 0, 0, 0, 1, 1, // |
194 | 0 | 0, 0, 0, 0, 0, 0, 0, 1, // |
195 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
196 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
197 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
198 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
199 | 0 | }, // AFV1 |
200 | 0 | { |
201 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
202 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
203 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
204 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
205 | 0 | 1, 0, 0, 0, 0, 0, 0, 0, // |
206 | 0 | 1, 1, 0, 0, 0, 0, 0, 0, // |
207 | 0 | 1, 1, 1, 0, 0, 0, 0, 0, // |
208 | 0 | 1, 1, 1, 1, 0, 0, 0, 0, // |
209 | 0 | }, // AFV2 |
210 | 0 | { |
211 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
212 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
213 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
214 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
215 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
216 | 0 | 0, 0, 0, 0, 0, 0, 0, 1, // |
217 | 0 | 0, 0, 0, 0, 0, 0, 1, 1, // |
218 | 0 | 0, 0, 0, 0, 0, 1, 1, 1, // |
219 | 0 | }, // AFV3 |
220 | 0 | {} // invalid |
221 | 0 | }; |
222 | 0 | raw_strategy = |
223 | 0 | Clamp1<uint8_t>(raw_strategy, 0, AcStrategy::kNumValidStrategies); |
224 | 0 | return kMask[raw_strategy]; |
225 | 0 | } |
226 | | |
227 | | Status DumpAcStrategy(const AcStrategyImage& ac_strategy, size_t xsize, |
228 | | size_t ysize, const char* tag, AuxOut* aux_out, |
229 | 0 | const CompressParams& cparams) { |
230 | 0 | JxlMemoryManager* memory_manager = ac_strategy.memory_manager(); |
231 | 0 | JXL_ASSIGN_OR_RETURN(Image3F color_acs, |
232 | 0 | Image3F::Create(memory_manager, xsize, ysize)); |
233 | 0 | for (size_t y = 0; y < ysize; y++) { |
234 | 0 | float* JXL_RESTRICT rows[3] = { |
235 | 0 | color_acs.PlaneRow(0, y), |
236 | 0 | color_acs.PlaneRow(1, y), |
237 | 0 | color_acs.PlaneRow(2, y), |
238 | 0 | }; |
239 | 0 | const AcStrategyRow acs_row = ac_strategy.ConstRow(y / kBlockDim); |
240 | 0 | for (size_t x = 0; x < xsize; x++) { |
241 | 0 | AcStrategy acs = acs_row[x / kBlockDim]; |
242 | 0 | const uint8_t* JXL_RESTRICT color = TypeColor(acs.RawStrategy()); |
243 | 0 | for (size_t c = 0; c < 3; c++) { |
244 | 0 | rows[c][x] = color[c] / 255.f; |
245 | 0 | } |
246 | 0 | } |
247 | 0 | } |
248 | 0 | size_t stride = color_acs.PixelsPerRow(); |
249 | 0 | for (size_t c = 0; c < 3; c++) { |
250 | 0 | for (size_t by = 0; by < DivCeil(ysize, kBlockDim); by++) { |
251 | 0 | float* JXL_RESTRICT row = color_acs.PlaneRow(c, by * kBlockDim); |
252 | 0 | const AcStrategyRow acs_row = ac_strategy.ConstRow(by); |
253 | 0 | for (size_t bx = 0; bx < DivCeil(xsize, kBlockDim); bx++) { |
254 | 0 | AcStrategy acs = acs_row[bx]; |
255 | 0 | if (!acs.IsFirstBlock()) continue; |
256 | 0 | const uint8_t* JXL_RESTRICT color = TypeColor(acs.RawStrategy()); |
257 | 0 | const uint8_t* JXL_RESTRICT mask = TypeMask(acs.RawStrategy()); |
258 | 0 | if (acs.covered_blocks_x() == 1 && acs.covered_blocks_y() == 1) { |
259 | 0 | for (size_t iy = 0; iy < kBlockDim && by * kBlockDim + iy < ysize; |
260 | 0 | iy++) { |
261 | 0 | for (size_t ix = 0; ix < kBlockDim && bx * kBlockDim + ix < xsize; |
262 | 0 | ix++) { |
263 | 0 | if (mask[iy * kBlockDim + ix]) { |
264 | 0 | row[iy * stride + bx * kBlockDim + ix] = color[c] / 800.f; |
265 | 0 | } |
266 | 0 | } |
267 | 0 | } |
268 | 0 | } |
269 | 0 | // draw block edges |
270 | 0 | for (size_t ix = 0; ix < kBlockDim * acs.covered_blocks_x() && |
271 | 0 | bx * kBlockDim + ix < xsize; |
272 | 0 | ix++) { |
273 | 0 | row[0 * stride + bx * kBlockDim + ix] = color[c] / 350.f; |
274 | 0 | } |
275 | 0 | for (size_t iy = 0; iy < kBlockDim * acs.covered_blocks_y() && |
276 | 0 | by * kBlockDim + iy < ysize; |
277 | 0 | iy++) { |
278 | 0 | row[iy * stride + bx * kBlockDim + 0] = color[c] / 350.f; |
279 | 0 | } |
280 | 0 | } |
281 | 0 | } |
282 | 0 | } |
283 | 0 | return DumpImage(cparams, tag, color_acs); |
284 | 0 | } |
285 | | |
286 | | } // namespace |
287 | | } // namespace jxl |
288 | | #endif // LIB_JXL_ENC_AC_STRATEGY_ |
289 | | |
290 | | HWY_BEFORE_NAMESPACE(); |
291 | | namespace jxl { |
292 | | namespace HWY_NAMESPACE { |
293 | | |
294 | | // These templates are not found via ADL. |
295 | | using hwy::HWY_NAMESPACE::AbsDiff; |
296 | | using hwy::HWY_NAMESPACE::Eq; |
297 | | using hwy::HWY_NAMESPACE::IfThenElseZero; |
298 | | using hwy::HWY_NAMESPACE::IfThenZeroElse; |
299 | | using hwy::HWY_NAMESPACE::Round; |
300 | | using hwy::HWY_NAMESPACE::Sqrt; |
301 | | |
302 | | bool MultiBlockTransformCrossesHorizontalBoundary( |
303 | | const AcStrategyImage& ac_strategy, size_t start_x, size_t y, |
304 | 6.02M | size_t end_x) { |
305 | 6.02M | if (start_x >= ac_strategy.xsize() || y >= ac_strategy.ysize()) { |
306 | 57.3k | return false; |
307 | 57.3k | } |
308 | 5.96M | if (y % 8 == 0) { |
309 | | // Nothing crosses 64x64 boundaries, and the memory on the other side |
310 | | // of the 64x64 block may still uninitialized. |
311 | 868k | return false; |
312 | 868k | } |
313 | 5.09M | end_x = std::min(end_x, ac_strategy.xsize()); |
314 | | // The first multiblock might be before the start_x, let's adjust it |
315 | | // to point to the first IsFirstBlock() == true block we find by backward |
316 | | // tracing. |
317 | 5.09M | AcStrategyRow row = ac_strategy.ConstRow(y); |
318 | 5.09M | const size_t start_x_limit = start_x & ~7; |
319 | 7.25M | while (start_x != start_x_limit && !row[start_x].IsFirstBlock()) { |
320 | 2.15M | --start_x; |
321 | 2.15M | } |
322 | 14.5M | for (size_t x = start_x; x < end_x;) { |
323 | 10.3M | if (row[x].IsFirstBlock()) { |
324 | 9.47M | x += row[x].covered_blocks_x(); |
325 | 9.47M | } else { |
326 | 905k | return true; |
327 | 905k | } |
328 | 10.3M | } |
329 | 4.19M | return false; |
330 | 5.09M | } Unexecuted instantiation: jxl::N_SSE4::MultiBlockTransformCrossesHorizontalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long) jxl::N_AVX2::MultiBlockTransformCrossesHorizontalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long) Line | Count | Source | 304 | 6.02M | size_t end_x) { | 305 | 6.02M | if (start_x >= ac_strategy.xsize() || y >= ac_strategy.ysize()) { | 306 | 57.3k | return false; | 307 | 57.3k | } | 308 | 5.96M | if (y % 8 == 0) { | 309 | | // Nothing crosses 64x64 boundaries, and the memory on the other side | 310 | | // of the 64x64 block may still uninitialized. | 311 | 868k | return false; | 312 | 868k | } | 313 | 5.09M | end_x = std::min(end_x, ac_strategy.xsize()); | 314 | | // The first multiblock might be before the start_x, let's adjust it | 315 | | // to point to the first IsFirstBlock() == true block we find by backward | 316 | | // tracing. | 317 | 5.09M | AcStrategyRow row = ac_strategy.ConstRow(y); | 318 | 5.09M | const size_t start_x_limit = start_x & ~7; | 319 | 7.25M | while (start_x != start_x_limit && !row[start_x].IsFirstBlock()) { | 320 | 2.15M | --start_x; | 321 | 2.15M | } | 322 | 14.5M | for (size_t x = start_x; x < end_x;) { | 323 | 10.3M | if (row[x].IsFirstBlock()) { | 324 | 9.47M | x += row[x].covered_blocks_x(); | 325 | 9.47M | } else { | 326 | 905k | return true; | 327 | 905k | } | 328 | 10.3M | } | 329 | 4.19M | return false; | 330 | 5.09M | } |
Unexecuted instantiation: jxl::N_AVX3::MultiBlockTransformCrossesHorizontalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long) Unexecuted instantiation: jxl::N_AVX3_ZEN4::MultiBlockTransformCrossesHorizontalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long) Unexecuted instantiation: jxl::N_AVX3_SPR::MultiBlockTransformCrossesHorizontalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long) Unexecuted instantiation: jxl::N_SSE2::MultiBlockTransformCrossesHorizontalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long) |
331 | | |
332 | | bool MultiBlockTransformCrossesVerticalBoundary( |
333 | | const AcStrategyImage& ac_strategy, size_t x, size_t start_y, |
334 | 4.82M | size_t end_y) { |
335 | 4.82M | if (x >= ac_strategy.xsize() || start_y >= ac_strategy.ysize()) { |
336 | 41.7k | return false; |
337 | 41.7k | } |
338 | 4.78M | if (x % 8 == 0) { |
339 | | // Nothing crosses 64x64 boundaries, and the memory on the other side |
340 | | // of the 64x64 block may still uninitialized. |
341 | 734k | return false; |
342 | 734k | } |
343 | 4.04M | end_y = std::min(end_y, ac_strategy.ysize()); |
344 | | // The first multiblock might be before the start_y, let's adjust it |
345 | | // to point to the first IsFirstBlock() == true block we find by backward |
346 | | // tracing. |
347 | 4.04M | const size_t start_y_limit = start_y & ~7; |
348 | 4.35M | while (start_y != start_y_limit && |
349 | 4.35M | !ac_strategy.ConstRow(start_y)[x].IsFirstBlock()) { |
350 | 308k | --start_y; |
351 | 308k | } |
352 | | |
353 | 12.6M | for (size_t y = start_y; y < end_y;) { |
354 | 8.82M | AcStrategyRow row = ac_strategy.ConstRow(y); |
355 | 8.82M | if (row[x].IsFirstBlock()) { |
356 | 8.57M | y += row[x].covered_blocks_y(); |
357 | 8.57M | } else { |
358 | 255k | return true; |
359 | 255k | } |
360 | 8.82M | } |
361 | 3.79M | return false; |
362 | 4.04M | } Unexecuted instantiation: jxl::N_SSE4::MultiBlockTransformCrossesVerticalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long) jxl::N_AVX2::MultiBlockTransformCrossesVerticalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long) Line | Count | Source | 334 | 4.82M | size_t end_y) { | 335 | 4.82M | if (x >= ac_strategy.xsize() || start_y >= ac_strategy.ysize()) { | 336 | 41.7k | return false; | 337 | 41.7k | } | 338 | 4.78M | if (x % 8 == 0) { | 339 | | // Nothing crosses 64x64 boundaries, and the memory on the other side | 340 | | // of the 64x64 block may still uninitialized. | 341 | 734k | return false; | 342 | 734k | } | 343 | 4.04M | end_y = std::min(end_y, ac_strategy.ysize()); | 344 | | // The first multiblock might be before the start_y, let's adjust it | 345 | | // to point to the first IsFirstBlock() == true block we find by backward | 346 | | // tracing. | 347 | 4.04M | const size_t start_y_limit = start_y & ~7; | 348 | 4.35M | while (start_y != start_y_limit && | 349 | 4.35M | !ac_strategy.ConstRow(start_y)[x].IsFirstBlock()) { | 350 | 308k | --start_y; | 351 | 308k | } | 352 | | | 353 | 12.6M | for (size_t y = start_y; y < end_y;) { | 354 | 8.82M | AcStrategyRow row = ac_strategy.ConstRow(y); | 355 | 8.82M | if (row[x].IsFirstBlock()) { | 356 | 8.57M | y += row[x].covered_blocks_y(); | 357 | 8.57M | } else { | 358 | 255k | return true; | 359 | 255k | } | 360 | 8.82M | } | 361 | 3.79M | return false; | 362 | 4.04M | } |
Unexecuted instantiation: jxl::N_AVX3::MultiBlockTransformCrossesVerticalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long) Unexecuted instantiation: jxl::N_AVX3_ZEN4::MultiBlockTransformCrossesVerticalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long) Unexecuted instantiation: jxl::N_AVX3_SPR::MultiBlockTransformCrossesVerticalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long) Unexecuted instantiation: jxl::N_SSE2::MultiBlockTransformCrossesVerticalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long) |
363 | | |
364 | | Status EstimateEntropy(const AcStrategy& acs, float entropy_mul, size_t x, |
365 | | size_t y, const ACSConfig& config, |
366 | | const float* JXL_RESTRICT cmap_factors, float* block, |
367 | | float* full_scratch_space, uint32_t* quantized, |
368 | 36.5M | float& entropy) { |
369 | 36.5M | entropy = 0.0f; |
370 | 36.5M | float* mem = full_scratch_space; |
371 | 36.5M | float* scratch_space = full_scratch_space + AcStrategy::kMaxCoeffArea; |
372 | 36.5M | const size_t size = (1 << acs.log2_covered_blocks()) * kDCTBlockSize; |
373 | | |
374 | | // Apply transform. |
375 | 146M | for (size_t c = 0; c < 3; c++) { |
376 | 109M | float* JXL_RESTRICT block_c = block + size * c; |
377 | 109M | TransformFromPixels(acs.Strategy(), &config.Pixel(c, x, y), |
378 | 109M | config.src_stride, block_c, scratch_space); |
379 | 109M | } |
380 | 36.5M | HWY_FULL(float) df; |
381 | | |
382 | 36.5M | const size_t num_blocks = acs.covered_blocks_x() * acs.covered_blocks_y(); |
383 | | // avoid large blocks when there is a lot going on in red-green. |
384 | 36.5M | float quant_norm16 = 0; |
385 | 36.5M | if (num_blocks == 1) { |
386 | | // When it is only one 8x8, we don't need aggregation of values. |
387 | 28.9M | quant_norm16 = config.Quant(x / 8, y / 8); |
388 | 28.9M | } else if (num_blocks == 2) { |
389 | | // Taking max instead of 8th norm seems to work |
390 | | // better for smallest blocks up to 16x8. Jyrki couldn't get |
391 | | // improvements in trying the same for 16x16 blocks. |
392 | 4.77M | if (acs.covered_blocks_y() == 2) { |
393 | 2.38M | quant_norm16 = |
394 | 2.38M | std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8, y / 8 + 1)); |
395 | 2.39M | } else { |
396 | 2.39M | quant_norm16 = |
397 | 2.39M | std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8 + 1, y / 8)); |
398 | 2.39M | } |
399 | 4.77M | } else { |
400 | | // Load QF value, calculate empirical heuristic on masking field |
401 | | // for weighting the information loss. Information loss manifests |
402 | | // itself as ringing, and masking could hide it. |
403 | 11.0M | for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { |
404 | 35.7M | for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { |
405 | 27.3M | float qval = config.Quant(x / 8 + ix, y / 8 + iy); |
406 | 27.3M | qval *= qval; |
407 | 27.3M | qval *= qval; |
408 | 27.3M | qval *= qval; |
409 | 27.3M | quant_norm16 += qval * qval; |
410 | 27.3M | } |
411 | 8.31M | } |
412 | 2.74M | quant_norm16 /= num_blocks; |
413 | 2.74M | quant_norm16 = FastPowf(quant_norm16, 1.0f / 16.0f); |
414 | 2.74M | } |
415 | 36.5M | const auto quant = Set(df, quant_norm16); |
416 | | |
417 | | // Compute entropy. |
418 | 36.5M | const HWY_CAPPED(float, 8) df8; |
419 | | |
420 | 36.5M | auto loss = Zero(df8); |
421 | 146M | for (size_t c = 0; c < 3; c++) { |
422 | 109M | const float* inv_matrix = config.dequant->InvMatrix(acs.Strategy(), c); |
423 | 109M | const float* matrix = config.dequant->Matrix(acs.Strategy(), c); |
424 | 109M | const auto cmap_factor = Set(df, cmap_factors[c]); |
425 | | |
426 | 109M | auto entropy_v = Zero(df); |
427 | 109M | auto nzeros_v = Zero(df); |
428 | 1.69G | for (size_t i = 0; i < num_blocks * kDCTBlockSize; i += Lanes(df)) { |
429 | 1.58G | const auto in = Load(df, block + c * size + i); |
430 | 1.58G | const auto in_y = Mul(Load(df, block + size + i), cmap_factor); |
431 | 1.58G | const auto im = Load(df, inv_matrix + i); |
432 | 1.58G | const auto val = Mul(Sub(in, in_y), Mul(im, quant)); |
433 | 1.58G | const auto rval = Round(val); |
434 | 1.58G | const auto diff = Sub(val, rval); |
435 | 1.58G | const auto m = Load(df, matrix + i); |
436 | 1.58G | Store(Mul(m, diff), df, &mem[i]); |
437 | 1.58G | const auto q = Abs(rval); |
438 | 1.58G | const auto q_is_zero = Eq(q, Zero(df)); |
439 | | // We used to have q * C here, but that cost model seems to |
440 | | // be punishing large values more than necessary. Sqrt tries |
441 | | // to avoid large values less aggressively. |
442 | 1.58G | entropy_v = Add(Sqrt(q), entropy_v); |
443 | 1.58G | nzeros_v = Add(nzeros_v, IfThenZeroElse(q_is_zero, Set(df, 1.0f))); |
444 | 1.58G | } |
445 | | |
446 | 109M | { |
447 | 109M | float masku_lut[3] = { |
448 | 109M | 12.0, |
449 | 109M | 0.0, |
450 | 109M | 4.0, |
451 | 109M | }; |
452 | 109M | auto masku_off = Set(df8, masku_lut[c]); |
453 | 109M | auto lossc = Zero(df8); |
454 | 109M | TransformToPixels(acs.Strategy(), &mem[0], block, |
455 | 109M | acs.covered_blocks_x() * 8, scratch_space); |
456 | | |
457 | 242M | for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { |
458 | 331M | for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { |
459 | 1.78G | for (size_t dy = 0; dy < kBlockDim; ++dy) { |
460 | 3.16G | for (size_t dx = 0; dx < kBlockDim; dx += Lanes(df8)) { |
461 | 1.58G | auto in = Load(df8, block + |
462 | 1.58G | (iy * kBlockDim + dy) * |
463 | 1.58G | (acs.covered_blocks_x() * kBlockDim) + |
464 | 1.58G | ix * kBlockDim + dx); |
465 | 1.58G | if (x + ix * 8 + dx + Lanes(df8) <= config.mask1x1_xsize) { |
466 | 1.58G | auto masku = |
467 | 1.58G | Add(Load(df8, config.MaskingPtr1x1(x + ix * 8 + dx, |
468 | 1.58G | y + iy * 8 + dy)), |
469 | 1.58G | masku_off); |
470 | 1.58G | in = Mul(masku, in); |
471 | 1.58G | in = Mul(in, in); |
472 | 1.58G | in = Mul(in, in); |
473 | 1.58G | in = Mul(in, in); |
474 | 1.58G | lossc = Add(lossc, in); |
475 | 1.58G | } |
476 | 1.58G | } |
477 | 1.58G | } |
478 | 197M | } |
479 | 133M | } |
480 | 109M | static const double kChannelMul[3] = { |
481 | 109M | pow(8.2, 8.0), |
482 | 109M | pow(1.0, 8.0), |
483 | 109M | pow(1.03, 8.0), |
484 | 109M | }; |
485 | 109M | lossc = Mul(Set(df8, kChannelMul[c]), lossc); |
486 | 109M | loss = Add(loss, lossc); |
487 | 109M | } |
488 | 109M | entropy += config.cost_delta * GetLane(SumOfLanes(df, entropy_v)); |
489 | 109M | size_t num_nzeros = GetLane(SumOfLanes(df, nzeros_v)); |
490 | | // Add #bit of num_nonzeros, as an estimate of the cost for encoding the |
491 | | // number of non-zeros of the block. |
492 | 109M | size_t nbits = CeilLog2Nonzero(num_nzeros + 1) + 1; |
493 | | // Also add #bit of #bit of num_nonzeros, to estimate the ANS cost, with a |
494 | | // bias. |
495 | 109M | entropy += config.zeros_mul * (CeilLog2Nonzero(nbits + 17) + nbits); |
496 | 109M | if (c == 0 && num_blocks >= 2) { |
497 | | // It is X channel (red-green) and we often see ringing |
498 | | // in the large blocks. Let's punish that more here. |
499 | 7.52M | float w = 1.0 + std::min(3.0, num_blocks / 8.0); |
500 | 7.52M | entropy *= w; |
501 | 7.52M | loss = Mul(loss, Set(df8, w)); |
502 | 7.52M | } |
503 | 109M | } |
504 | 36.5M | float loss_scalar = |
505 | 36.5M | pow(GetLane(SumOfLanes(df8, loss)) / (num_blocks * kDCTBlockSize), |
506 | 36.5M | 1.0f / 8.0f) * |
507 | 36.5M | (num_blocks * kDCTBlockSize) / quant_norm16; |
508 | 36.5M | entropy *= entropy_mul; |
509 | 36.5M | entropy += config.info_loss_multiplier * loss_scalar; |
510 | 36.5M | return true; |
511 | 36.5M | } Unexecuted instantiation: jxl::N_SSE4::EstimateEntropy(jxl::AcStrategy const&, float, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, float*, float*, unsigned int*, float&) jxl::N_AVX2::EstimateEntropy(jxl::AcStrategy const&, float, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, float*, float*, unsigned int*, float&) Line | Count | Source | 368 | 36.5M | float& entropy) { | 369 | 36.5M | entropy = 0.0f; | 370 | 36.5M | float* mem = full_scratch_space; | 371 | 36.5M | float* scratch_space = full_scratch_space + AcStrategy::kMaxCoeffArea; | 372 | 36.5M | const size_t size = (1 << acs.log2_covered_blocks()) * kDCTBlockSize; | 373 | | | 374 | | // Apply transform. | 375 | 146M | for (size_t c = 0; c < 3; c++) { | 376 | 109M | float* JXL_RESTRICT block_c = block + size * c; | 377 | 109M | TransformFromPixels(acs.Strategy(), &config.Pixel(c, x, y), | 378 | 109M | config.src_stride, block_c, scratch_space); | 379 | 109M | } | 380 | 36.5M | HWY_FULL(float) df; | 381 | | | 382 | 36.5M | const size_t num_blocks = acs.covered_blocks_x() * acs.covered_blocks_y(); | 383 | | // avoid large blocks when there is a lot going on in red-green. | 384 | 36.5M | float quant_norm16 = 0; | 385 | 36.5M | if (num_blocks == 1) { | 386 | | // When it is only one 8x8, we don't need aggregation of values. | 387 | 28.9M | quant_norm16 = config.Quant(x / 8, y / 8); | 388 | 28.9M | } else if (num_blocks == 2) { | 389 | | // Taking max instead of 8th norm seems to work | 390 | | // better for smallest blocks up to 16x8. Jyrki couldn't get | 391 | | // improvements in trying the same for 16x16 blocks. 
| 392 | 4.77M | if (acs.covered_blocks_y() == 2) { | 393 | 2.38M | quant_norm16 = | 394 | 2.38M | std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8, y / 8 + 1)); | 395 | 2.39M | } else { | 396 | 2.39M | quant_norm16 = | 397 | 2.39M | std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8 + 1, y / 8)); | 398 | 2.39M | } | 399 | 4.77M | } else { | 400 | | // Load QF value, calculate empirical heuristic on masking field | 401 | | // for weighting the information loss. Information loss manifests | 402 | | // itself as ringing, and masking could hide it. | 403 | 11.0M | for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { | 404 | 35.7M | for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { | 405 | 27.3M | float qval = config.Quant(x / 8 + ix, y / 8 + iy); | 406 | 27.3M | qval *= qval; | 407 | 27.3M | qval *= qval; | 408 | 27.3M | qval *= qval; | 409 | 27.3M | quant_norm16 += qval * qval; | 410 | 27.3M | } | 411 | 8.31M | } | 412 | 2.74M | quant_norm16 /= num_blocks; | 413 | 2.74M | quant_norm16 = FastPowf(quant_norm16, 1.0f / 16.0f); | 414 | 2.74M | } | 415 | 36.5M | const auto quant = Set(df, quant_norm16); | 416 | | | 417 | | // Compute entropy. 
| 418 | 36.5M | const HWY_CAPPED(float, 8) df8; | 419 | | | 420 | 36.5M | auto loss = Zero(df8); | 421 | 146M | for (size_t c = 0; c < 3; c++) { | 422 | 109M | const float* inv_matrix = config.dequant->InvMatrix(acs.Strategy(), c); | 423 | 109M | const float* matrix = config.dequant->Matrix(acs.Strategy(), c); | 424 | 109M | const auto cmap_factor = Set(df, cmap_factors[c]); | 425 | | | 426 | 109M | auto entropy_v = Zero(df); | 427 | 109M | auto nzeros_v = Zero(df); | 428 | 1.69G | for (size_t i = 0; i < num_blocks * kDCTBlockSize; i += Lanes(df)) { | 429 | 1.58G | const auto in = Load(df, block + c * size + i); | 430 | 1.58G | const auto in_y = Mul(Load(df, block + size + i), cmap_factor); | 431 | 1.58G | const auto im = Load(df, inv_matrix + i); | 432 | 1.58G | const auto val = Mul(Sub(in, in_y), Mul(im, quant)); | 433 | 1.58G | const auto rval = Round(val); | 434 | 1.58G | const auto diff = Sub(val, rval); | 435 | 1.58G | const auto m = Load(df, matrix + i); | 436 | 1.58G | Store(Mul(m, diff), df, &mem[i]); | 437 | 1.58G | const auto q = Abs(rval); | 438 | 1.58G | const auto q_is_zero = Eq(q, Zero(df)); | 439 | | // We used to have q * C here, but that cost model seems to | 440 | | // be punishing large values more than necessary. Sqrt tries | 441 | | // to avoid large values less aggressively. 
| 442 | 1.58G | entropy_v = Add(Sqrt(q), entropy_v); | 443 | 1.58G | nzeros_v = Add(nzeros_v, IfThenZeroElse(q_is_zero, Set(df, 1.0f))); | 444 | 1.58G | } | 445 | | | 446 | 109M | { | 447 | 109M | float masku_lut[3] = { | 448 | 109M | 12.0, | 449 | 109M | 0.0, | 450 | 109M | 4.0, | 451 | 109M | }; | 452 | 109M | auto masku_off = Set(df8, masku_lut[c]); | 453 | 109M | auto lossc = Zero(df8); | 454 | 109M | TransformToPixels(acs.Strategy(), &mem[0], block, | 455 | 109M | acs.covered_blocks_x() * 8, scratch_space); | 456 | | | 457 | 242M | for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { | 458 | 331M | for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { | 459 | 1.78G | for (size_t dy = 0; dy < kBlockDim; ++dy) { | 460 | 3.16G | for (size_t dx = 0; dx < kBlockDim; dx += Lanes(df8)) { | 461 | 1.58G | auto in = Load(df8, block + | 462 | 1.58G | (iy * kBlockDim + dy) * | 463 | 1.58G | (acs.covered_blocks_x() * kBlockDim) + | 464 | 1.58G | ix * kBlockDim + dx); | 465 | 1.58G | if (x + ix * 8 + dx + Lanes(df8) <= config.mask1x1_xsize) { | 466 | 1.58G | auto masku = | 467 | 1.58G | Add(Load(df8, config.MaskingPtr1x1(x + ix * 8 + dx, | 468 | 1.58G | y + iy * 8 + dy)), | 469 | 1.58G | masku_off); | 470 | 1.58G | in = Mul(masku, in); | 471 | 1.58G | in = Mul(in, in); | 472 | 1.58G | in = Mul(in, in); | 473 | 1.58G | in = Mul(in, in); | 474 | 1.58G | lossc = Add(lossc, in); | 475 | 1.58G | } | 476 | 1.58G | } | 477 | 1.58G | } | 478 | 197M | } | 479 | 133M | } | 480 | 109M | static const double kChannelMul[3] = { | 481 | 109M | pow(8.2, 8.0), | 482 | 109M | pow(1.0, 8.0), | 483 | 109M | pow(1.03, 8.0), | 484 | 109M | }; | 485 | 109M | lossc = Mul(Set(df8, kChannelMul[c]), lossc); | 486 | 109M | loss = Add(loss, lossc); | 487 | 109M | } | 488 | 109M | entropy += config.cost_delta * GetLane(SumOfLanes(df, entropy_v)); | 489 | 109M | size_t num_nzeros = GetLane(SumOfLanes(df, nzeros_v)); | 490 | | // Add #bit of num_nonzeros, as an estimate of the cost for encoding the 
| 491 | | // number of non-zeros of the block. | 492 | 109M | size_t nbits = CeilLog2Nonzero(num_nzeros + 1) + 1; | 493 | | // Also add #bit of #bit of num_nonzeros, to estimate the ANS cost, with a | 494 | | // bias. | 495 | 109M | entropy += config.zeros_mul * (CeilLog2Nonzero(nbits + 17) + nbits); | 496 | 109M | if (c == 0 && num_blocks >= 2) { | 497 | | // It is X channel (red-green) and we often see ringing | 498 | | // in the large blocks. Let's punish that more here. | 499 | 7.52M | float w = 1.0 + std::min(3.0, num_blocks / 8.0); | 500 | 7.52M | entropy *= w; | 501 | 7.52M | loss = Mul(loss, Set(df8, w)); | 502 | 7.52M | } | 503 | 109M | } | 504 | 36.5M | float loss_scalar = | 505 | 36.5M | pow(GetLane(SumOfLanes(df8, loss)) / (num_blocks * kDCTBlockSize), | 506 | 36.5M | 1.0f / 8.0f) * | 507 | 36.5M | (num_blocks * kDCTBlockSize) / quant_norm16; | 508 | 36.5M | entropy *= entropy_mul; | 509 | 36.5M | entropy += config.info_loss_multiplier * loss_scalar; | 510 | 36.5M | return true; | 511 | 36.5M | } |
Unexecuted instantiation: jxl::N_AVX3::EstimateEntropy(jxl::AcStrategy const&, float, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, float*, float*, unsigned int*, float&) Unexecuted instantiation: jxl::N_AVX3_ZEN4::EstimateEntropy(jxl::AcStrategy const&, float, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, float*, float*, unsigned int*, float&) Unexecuted instantiation: jxl::N_AVX3_SPR::EstimateEntropy(jxl::AcStrategy const&, float, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, float*, float*, unsigned int*, float&) Unexecuted instantiation: jxl::N_SSE2::EstimateEntropy(jxl::AcStrategy const&, float, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, float*, float*, unsigned int*, float&) |
512 | | |
513 | | Status FindBest8x8Transform(size_t x, size_t y, int encoding_speed_tier, |
514 | | float butteraugli_target, const ACSConfig& config, |
515 | | const float* JXL_RESTRICT cmap_factors, |
516 | | AcStrategyImage* JXL_RESTRICT ac_strategy, |
517 | | float* block, float* scratch_space, |
518 | | uint32_t* quantized, float* entropy_out, |
519 | 2.89M | AcStrategyType& best_tx) { |
520 | 2.89M | struct TransformTry8x8 { |
521 | 2.89M | AcStrategyType type; |
522 | 2.89M | int encoding_speed_tier_max_limit; |
523 | 2.89M | double entropy_mul; |
524 | 2.89M | }; |
525 | 2.89M | static const TransformTry8x8 kTransforms8x8[] = { |
526 | 2.89M | { |
527 | 2.89M | AcStrategyType::DCT, |
528 | 2.89M | 9, |
529 | 2.89M | 0.8, |
530 | 2.89M | }, |
531 | 2.89M | { |
532 | 2.89M | AcStrategyType::DCT4X4, |
533 | 2.89M | 5, |
534 | 2.89M | 1.08, |
535 | 2.89M | }, |
536 | 2.89M | { |
537 | 2.89M | AcStrategyType::DCT2X2, |
538 | 2.89M | 5, |
539 | 2.89M | 0.95, |
540 | 2.89M | }, |
541 | 2.89M | { |
542 | 2.89M | AcStrategyType::DCT4X8, |
543 | 2.89M | 4, |
544 | 2.89M | 0.85931637428340035, |
545 | 2.89M | }, |
546 | 2.89M | { |
547 | 2.89M | AcStrategyType::DCT8X4, |
548 | 2.89M | 4, |
549 | 2.89M | 0.85931637428340035, |
550 | 2.89M | }, |
551 | 2.89M | { |
552 | 2.89M | AcStrategyType::IDENTITY, |
553 | 2.89M | 5, |
554 | 2.89M | 1.0427542510634957, |
555 | 2.89M | }, |
556 | 2.89M | { |
557 | 2.89M | AcStrategyType::AFV0, |
558 | 2.89M | 4, |
559 | 2.89M | 0.81779489591359944, |
560 | 2.89M | }, |
561 | 2.89M | { |
562 | 2.89M | AcStrategyType::AFV1, |
563 | 2.89M | 4, |
564 | 2.89M | 0.81779489591359944, |
565 | 2.89M | }, |
566 | 2.89M | { |
567 | 2.89M | AcStrategyType::AFV2, |
568 | 2.89M | 4, |
569 | 2.89M | 0.81779489591359944, |
570 | 2.89M | }, |
571 | 2.89M | { |
572 | 2.89M | AcStrategyType::AFV3, |
573 | 2.89M | 4, |
574 | 2.89M | 0.81779489591359944, |
575 | 2.89M | }, |
576 | 2.89M | }; |
577 | 2.89M | double best = 1e30; |
578 | 2.89M | best_tx = kTransforms8x8[0].type; |
579 | 28.9M | for (auto tx : kTransforms8x8) { |
580 | 28.9M | if (tx.encoding_speed_tier_max_limit < encoding_speed_tier) { |
581 | 0 | continue; |
582 | 0 | } |
583 | 28.9M | AcStrategy acs = AcStrategy::FromRawStrategy(tx.type); |
584 | 28.9M | float entropy_mul = tx.entropy_mul / kTransforms8x8[0].entropy_mul; |
585 | 28.9M | if ((tx.type == AcStrategyType::DCT2X2 || |
586 | 28.9M | tx.type == AcStrategyType::IDENTITY) && |
587 | 28.9M | butteraugli_target < 5.0) { |
588 | 5.79M | static const float kFavor2X2AtHighQuality = 0.4; |
589 | 5.79M | float weight = pow((5.0f - butteraugli_target) / 5.0f, 2.0f); |
590 | 5.79M | entropy_mul -= kFavor2X2AtHighQuality * weight; |
591 | 5.79M | } |
592 | 28.9M | if ((tx.type != AcStrategyType::DCT && tx.type != AcStrategyType::DCT2X2 && |
593 | 28.9M | tx.type != AcStrategyType::IDENTITY) && |
594 | 28.9M | butteraugli_target > 4.0) { |
595 | 0 | static const float kAvoidEntropyOfTransforms = 0.5; |
596 | 0 | float mul = 1.0; |
597 | 0 | if (butteraugli_target < 12.0) { |
598 | 0 | mul *= (12.0 - 4.0) / (butteraugli_target - 4.0); |
599 | 0 | } |
600 | 0 | entropy_mul += kAvoidEntropyOfTransforms * mul; |
601 | 0 | } |
602 | 28.9M | float entropy; |
603 | 28.9M | JXL_RETURN_IF_ERROR(EstimateEntropy(acs, entropy_mul, x, y, config, |
604 | 28.9M | cmap_factors, block, scratch_space, |
605 | 28.9M | quantized, entropy)); |
606 | 28.9M | if (entropy < best) { |
607 | 4.73M | best_tx = tx.type; |
608 | 4.73M | best = entropy; |
609 | 4.73M | } |
610 | 28.9M | } |
611 | 2.89M | *entropy_out = best; |
612 | 2.89M | return true; |
613 | 2.89M | } Unexecuted instantiation: jxl::N_SSE4::FindBest8x8Transform(unsigned long, unsigned long, int, float, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float*, float*, unsigned int*, float*, jxl::AcStrategyType&) jxl::N_AVX2::FindBest8x8Transform(unsigned long, unsigned long, int, float, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float*, float*, unsigned int*, float*, jxl::AcStrategyType&) Line | Count | Source | 519 | 2.89M | AcStrategyType& best_tx) { | 520 | 2.89M | struct TransformTry8x8 { | 521 | 2.89M | AcStrategyType type; | 522 | 2.89M | int encoding_speed_tier_max_limit; | 523 | 2.89M | double entropy_mul; | 524 | 2.89M | }; | 525 | 2.89M | static const TransformTry8x8 kTransforms8x8[] = { | 526 | 2.89M | { | 527 | 2.89M | AcStrategyType::DCT, | 528 | 2.89M | 9, | 529 | 2.89M | 0.8, | 530 | 2.89M | }, | 531 | 2.89M | { | 532 | 2.89M | AcStrategyType::DCT4X4, | 533 | 2.89M | 5, | 534 | 2.89M | 1.08, | 535 | 2.89M | }, | 536 | 2.89M | { | 537 | 2.89M | AcStrategyType::DCT2X2, | 538 | 2.89M | 5, | 539 | 2.89M | 0.95, | 540 | 2.89M | }, | 541 | 2.89M | { | 542 | 2.89M | AcStrategyType::DCT4X8, | 543 | 2.89M | 4, | 544 | 2.89M | 0.85931637428340035, | 545 | 2.89M | }, | 546 | 2.89M | { | 547 | 2.89M | AcStrategyType::DCT8X4, | 548 | 2.89M | 4, | 549 | 2.89M | 0.85931637428340035, | 550 | 2.89M | }, | 551 | 2.89M | { | 552 | 2.89M | AcStrategyType::IDENTITY, | 553 | 2.89M | 5, | 554 | 2.89M | 1.0427542510634957, | 555 | 2.89M | }, | 556 | 2.89M | { | 557 | 2.89M | AcStrategyType::AFV0, | 558 | 2.89M | 4, | 559 | 2.89M | 0.81779489591359944, | 560 | 2.89M | }, | 561 | 2.89M | { | 562 | 2.89M | AcStrategyType::AFV1, | 563 | 2.89M | 4, | 564 | 2.89M | 0.81779489591359944, | 565 | 2.89M | }, | 566 | 2.89M | { | 567 | 2.89M | AcStrategyType::AFV2, | 568 | 2.89M | 4, | 569 | 2.89M | 0.81779489591359944, | 570 | 2.89M | }, | 571 | 2.89M | { | 572 | 2.89M | AcStrategyType::AFV3, | 573 | 2.89M | 4, | 574 | 2.89M | 
0.81779489591359944, | 575 | 2.89M | }, | 576 | 2.89M | }; | 577 | 2.89M | double best = 1e30; | 578 | 2.89M | best_tx = kTransforms8x8[0].type; | 579 | 28.9M | for (auto tx : kTransforms8x8) { | 580 | 28.9M | if (tx.encoding_speed_tier_max_limit < encoding_speed_tier) { | 581 | 0 | continue; | 582 | 0 | } | 583 | 28.9M | AcStrategy acs = AcStrategy::FromRawStrategy(tx.type); | 584 | 28.9M | float entropy_mul = tx.entropy_mul / kTransforms8x8[0].entropy_mul; | 585 | 28.9M | if ((tx.type == AcStrategyType::DCT2X2 || | 586 | 28.9M | tx.type == AcStrategyType::IDENTITY) && | 587 | 28.9M | butteraugli_target < 5.0) { | 588 | 5.79M | static const float kFavor2X2AtHighQuality = 0.4; | 589 | 5.79M | float weight = pow((5.0f - butteraugli_target) / 5.0f, 2.0f); | 590 | 5.79M | entropy_mul -= kFavor2X2AtHighQuality * weight; | 591 | 5.79M | } | 592 | 28.9M | if ((tx.type != AcStrategyType::DCT && tx.type != AcStrategyType::DCT2X2 && | 593 | 28.9M | tx.type != AcStrategyType::IDENTITY) && | 594 | 28.9M | butteraugli_target > 4.0) { | 595 | 0 | static const float kAvoidEntropyOfTransforms = 0.5; | 596 | 0 | float mul = 1.0; | 597 | 0 | if (butteraugli_target < 12.0) { | 598 | 0 | mul *= (12.0 - 4.0) / (butteraugli_target - 4.0); | 599 | 0 | } | 600 | 0 | entropy_mul += kAvoidEntropyOfTransforms * mul; | 601 | 0 | } | 602 | 28.9M | float entropy; | 603 | 28.9M | JXL_RETURN_IF_ERROR(EstimateEntropy(acs, entropy_mul, x, y, config, | 604 | 28.9M | cmap_factors, block, scratch_space, | 605 | 28.9M | quantized, entropy)); | 606 | 28.9M | if (entropy < best) { | 607 | 4.73M | best_tx = tx.type; | 608 | 4.73M | best = entropy; | 609 | 4.73M | } | 610 | 28.9M | } | 611 | 2.89M | *entropy_out = best; | 612 | 2.89M | return true; | 613 | 2.89M | } |
Unexecuted instantiation: jxl::N_AVX3::FindBest8x8Transform(unsigned long, unsigned long, int, float, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float*, float*, unsigned int*, float*, jxl::AcStrategyType&) Unexecuted instantiation: jxl::N_AVX3_ZEN4::FindBest8x8Transform(unsigned long, unsigned long, int, float, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float*, float*, unsigned int*, float*, jxl::AcStrategyType&) Unexecuted instantiation: jxl::N_AVX3_SPR::FindBest8x8Transform(unsigned long, unsigned long, int, float, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float*, float*, unsigned int*, float*, jxl::AcStrategyType&) Unexecuted instantiation: jxl::N_SSE2::FindBest8x8Transform(unsigned long, unsigned long, int, float, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float*, float*, unsigned int*, float*, jxl::AcStrategyType&) |
614 | | |
615 | | // bx, by addresses the 64x64 block at 8x8 subresolution |
616 | | // cx, cy addresses the left, upper 8x8 block position of the candidate |
617 | | // transform. |
618 | | Status TryMergeAcs(AcStrategyType acs_raw, size_t bx, size_t by, size_t cx, |
619 | | size_t cy, const ACSConfig& config, |
620 | | const float* JXL_RESTRICT cmap_factors, |
621 | | AcStrategyImage* JXL_RESTRICT ac_strategy, |
622 | | const float entropy_mul, const uint8_t candidate_priority, |
623 | | uint8_t* priority, float* JXL_RESTRICT entropy_estimate, |
624 | 8.86M | float* block, float* scratch_space, uint32_t* quantized) { |
625 | 8.86M | AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw); |
626 | 8.86M | float entropy_current = 0; |
627 | 9.74M | for (size_t iy = 0; iy < acs.covered_blocks_y(); ++iy) { |
628 | 13.5M | for (size_t ix = 0; ix < acs.covered_blocks_x(); ++ix) { |
629 | 12.6M | if (priority[(cy + iy) * 8 + (cx + ix)] >= candidate_priority) { |
630 | | // Transform would reuse already allocated blocks and |
631 | | // lead to invalid overlaps, for example DCT64X32 vs. |
632 | | // DCT32X64. |
633 | 8.70M | return true; |
634 | 8.70M | } |
635 | 3.96M | entropy_current += entropy_estimate[(cy + iy) * 8 + (cx + ix)]; |
636 | 3.96M | } |
637 | 9.58M | } |
638 | 156k | float entropy_candidate; |
639 | 156k | JXL_RETURN_IF_ERROR(EstimateEntropy( |
640 | 156k | acs, entropy_mul, (bx + cx) * 8, (by + cy) * 8, config, cmap_factors, |
641 | 156k | block, scratch_space, quantized, entropy_candidate)); |
642 | 156k | if (entropy_candidate >= entropy_current) return true; |
643 | | // Accept the candidate. |
644 | 159k | for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { |
645 | 632k | for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { |
646 | 498k | entropy_estimate[(cy + iy) * 8 + cx + ix] = 0; |
647 | 498k | priority[(cy + iy) * 8 + cx + ix] = candidate_priority; |
648 | 498k | } |
649 | 133k | } |
650 | 26.1k | JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_raw)); |
651 | 26.1k | entropy_estimate[cy * 8 + cx] = entropy_candidate; |
652 | 26.1k | return true; |
653 | 26.1k | } Unexecuted instantiation: jxl::N_SSE4::TryMergeAcs(jxl::AcStrategyType, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, unsigned char, unsigned char*, float*, float*, float*, unsigned int*) jxl::N_AVX2::TryMergeAcs(jxl::AcStrategyType, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, unsigned char, unsigned char*, float*, float*, float*, unsigned int*) Line | Count | Source | 624 | 8.86M | float* block, float* scratch_space, uint32_t* quantized) { | 625 | 8.86M | AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw); | 626 | 8.86M | float entropy_current = 0; | 627 | 9.74M | for (size_t iy = 0; iy < acs.covered_blocks_y(); ++iy) { | 628 | 13.5M | for (size_t ix = 0; ix < acs.covered_blocks_x(); ++ix) { | 629 | 12.6M | if (priority[(cy + iy) * 8 + (cx + ix)] >= candidate_priority) { | 630 | | // Transform would reuse already allocated blocks and | 631 | | // lead to invalid overlaps, for example DCT64X32 vs. | 632 | | // DCT32X64. | 633 | 8.70M | return true; | 634 | 8.70M | } | 635 | 3.96M | entropy_current += entropy_estimate[(cy + iy) * 8 + (cx + ix)]; | 636 | 3.96M | } | 637 | 9.58M | } | 638 | 156k | float entropy_candidate; | 639 | 156k | JXL_RETURN_IF_ERROR(EstimateEntropy( | 640 | 156k | acs, entropy_mul, (bx + cx) * 8, (by + cy) * 8, config, cmap_factors, | 641 | 156k | block, scratch_space, quantized, entropy_candidate)); | 642 | 156k | if (entropy_candidate >= entropy_current) return true; | 643 | | // Accept the candidate. 
| 644 | 159k | for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { | 645 | 632k | for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { | 646 | 498k | entropy_estimate[(cy + iy) * 8 + cx + ix] = 0; | 647 | 498k | priority[(cy + iy) * 8 + cx + ix] = candidate_priority; | 648 | 498k | } | 649 | 133k | } | 650 | 26.1k | JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_raw)); | 651 | 26.1k | entropy_estimate[cy * 8 + cx] = entropy_candidate; | 652 | 26.1k | return true; | 653 | 26.1k | } |
Unexecuted instantiation: jxl::N_AVX3::TryMergeAcs(jxl::AcStrategyType, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, unsigned char, unsigned char*, float*, float*, float*, unsigned int*) Unexecuted instantiation: jxl::N_AVX3_ZEN4::TryMergeAcs(jxl::AcStrategyType, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, unsigned char, unsigned char*, float*, float*, float*, unsigned int*) Unexecuted instantiation: jxl::N_AVX3_SPR::TryMergeAcs(jxl::AcStrategyType, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, unsigned char, unsigned char*, float*, float*, float*, unsigned int*) Unexecuted instantiation: jxl::N_SSE2::TryMergeAcs(jxl::AcStrategyType, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, unsigned char, unsigned char*, float*, float*, float*, unsigned int*) |
654 | | |
655 | | static void SetEntropyForTransform(size_t cx, size_t cy, |
656 | | const AcStrategyType acs_raw, float entropy, |
657 | 574k | float* JXL_RESTRICT entropy_estimate) { |
658 | 574k | const AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw); |
659 | 1.77M | for (size_t dy = 0; dy < acs.covered_blocks_y(); ++dy) { |
660 | 4.23M | for (size_t dx = 0; dx < acs.covered_blocks_x(); ++dx) { |
661 | 3.03M | entropy_estimate[(cy + dy) * 8 + cx + dx] = 0.0; |
662 | 3.03M | } |
663 | 1.19M | } |
664 | 574k | entropy_estimate[cy * 8 + cx] = entropy; |
665 | 574k | } Unexecuted instantiation: enc_ac_strategy.cc:jxl::N_SSE4::SetEntropyForTransform(unsigned long, unsigned long, jxl::AcStrategyType, float, float*) enc_ac_strategy.cc:jxl::N_AVX2::SetEntropyForTransform(unsigned long, unsigned long, jxl::AcStrategyType, float, float*) Line | Count | Source | 657 | 574k | float* JXL_RESTRICT entropy_estimate) { | 658 | 574k | const AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw); | 659 | 1.77M | for (size_t dy = 0; dy < acs.covered_blocks_y(); ++dy) { | 660 | 4.23M | for (size_t dx = 0; dx < acs.covered_blocks_x(); ++dx) { | 661 | 3.03M | entropy_estimate[(cy + dy) * 8 + cx + dx] = 0.0; | 662 | 3.03M | } | 663 | 1.19M | } | 664 | 574k | entropy_estimate[cy * 8 + cx] = entropy; | 665 | 574k | } |
Unexecuted instantiation: enc_ac_strategy.cc:jxl::N_AVX3::SetEntropyForTransform(unsigned long, unsigned long, jxl::AcStrategyType, float, float*) Unexecuted instantiation: enc_ac_strategy.cc:jxl::N_AVX3_ZEN4::SetEntropyForTransform(unsigned long, unsigned long, jxl::AcStrategyType, float, float*) Unexecuted instantiation: enc_ac_strategy.cc:jxl::N_AVX3_SPR::SetEntropyForTransform(unsigned long, unsigned long, jxl::AcStrategyType, float, float*) Unexecuted instantiation: enc_ac_strategy.cc:jxl::N_SSE2::SetEntropyForTransform(unsigned long, unsigned long, jxl::AcStrategyType, float, float*) |
666 | | |
667 | 2.60M | AcStrategyType AcsSquare(size_t blocks) { |
668 | 2.60M | if (blocks == 2) { |
669 | 2.17M | return AcStrategyType::DCT16X16; |
670 | 2.17M | } else if (blocks == 4) { |
671 | 386k | return AcStrategyType::DCT32X32; |
672 | 386k | } else { |
673 | 40.6k | return AcStrategyType::DCT64X64; |
674 | 40.6k | } |
675 | 2.60M | } Unexecuted instantiation: jxl::N_SSE4::AcsSquare(unsigned long) jxl::N_AVX2::AcsSquare(unsigned long) Line | Count | Source | 667 | 2.60M | AcStrategyType AcsSquare(size_t blocks) { | 668 | 2.60M | if (blocks == 2) { | 669 | 2.17M | return AcStrategyType::DCT16X16; | 670 | 2.17M | } else if (blocks == 4) { | 671 | 386k | return AcStrategyType::DCT32X32; | 672 | 386k | } else { | 673 | 40.6k | return AcStrategyType::DCT64X64; | 674 | 40.6k | } | 675 | 2.60M | } |
Unexecuted instantiation: jxl::N_AVX3::AcsSquare(unsigned long) Unexecuted instantiation: jxl::N_AVX3_ZEN4::AcsSquare(unsigned long) Unexecuted instantiation: jxl::N_AVX3_SPR::AcsSquare(unsigned long) Unexecuted instantiation: jxl::N_SSE2::AcsSquare(unsigned long) |
676 | | |
677 | 2.60M | AcStrategyType AcsVerticalSplit(size_t blocks) { |
678 | 2.60M | if (blocks == 2) { |
679 | 2.17M | return AcStrategyType::DCT16X8; |
680 | 2.17M | } else if (blocks == 4) { |
681 | 386k | return AcStrategyType::DCT32X16; |
682 | 386k | } else { |
683 | 40.6k | return AcStrategyType::DCT64X32; |
684 | 40.6k | } |
685 | 2.60M | } Unexecuted instantiation: jxl::N_SSE4::AcsVerticalSplit(unsigned long) jxl::N_AVX2::AcsVerticalSplit(unsigned long) Line | Count | Source | 677 | 2.60M | AcStrategyType AcsVerticalSplit(size_t blocks) { | 678 | 2.60M | if (blocks == 2) { | 679 | 2.17M | return AcStrategyType::DCT16X8; | 680 | 2.17M | } else if (blocks == 4) { | 681 | 386k | return AcStrategyType::DCT32X16; | 682 | 386k | } else { | 683 | 40.6k | return AcStrategyType::DCT64X32; | 684 | 40.6k | } | 685 | 2.60M | } |
Unexecuted instantiation: jxl::N_AVX3::AcsVerticalSplit(unsigned long) Unexecuted instantiation: jxl::N_AVX3_ZEN4::AcsVerticalSplit(unsigned long) Unexecuted instantiation: jxl::N_AVX3_SPR::AcsVerticalSplit(unsigned long) Unexecuted instantiation: jxl::N_SSE2::AcsVerticalSplit(unsigned long) |
686 | | |
687 | 2.60M | AcStrategyType AcsHorizontalSplit(size_t blocks) { |
688 | 2.60M | if (blocks == 2) { |
689 | 2.17M | return AcStrategyType::DCT8X16; |
690 | 2.17M | } else if (blocks == 4) { |
691 | 386k | return AcStrategyType::DCT16X32; |
692 | 386k | } else { |
693 | 40.6k | return AcStrategyType::DCT32X64; |
694 | 40.6k | } |
695 | 2.60M | } Unexecuted instantiation: jxl::N_SSE4::AcsHorizontalSplit(unsigned long) jxl::N_AVX2::AcsHorizontalSplit(unsigned long) Line | Count | Source | 687 | 2.60M | AcStrategyType AcsHorizontalSplit(size_t blocks) { | 688 | 2.60M | if (blocks == 2) { | 689 | 2.17M | return AcStrategyType::DCT8X16; | 690 | 2.17M | } else if (blocks == 4) { | 691 | 386k | return AcStrategyType::DCT16X32; | 692 | 386k | } else { | 693 | 40.6k | return AcStrategyType::DCT32X64; | 694 | 40.6k | } | 695 | 2.60M | } |
Unexecuted instantiation: jxl::N_AVX3::AcsHorizontalSplit(unsigned long) Unexecuted instantiation: jxl::N_AVX3_ZEN4::AcsHorizontalSplit(unsigned long) Unexecuted instantiation: jxl::N_AVX3_SPR::AcsHorizontalSplit(unsigned long) Unexecuted instantiation: jxl::N_SSE2::AcsHorizontalSplit(unsigned long) |
696 | | |
697 | | // The following function tries to merge smaller transforms into |
698 | | // squares and the rectangles originating from a single middle division |
699 | | // (horizontal or vertical) fairly. |
700 | | // |
701 | | // This is now generalized to concern about squares |
702 | | // of blocks X blocks size, where a block is 8x8 pixels. |
703 | | Status FindBestFirstLevelDivisionForSquare( |
704 | | size_t blocks, bool allow_square_transform, size_t bx, size_t by, size_t cx, |
705 | | size_t cy, const ACSConfig& config, const float* JXL_RESTRICT cmap_factors, |
706 | | AcStrategyImage* JXL_RESTRICT ac_strategy, const float entropy_mul_JXK, |
707 | | const float entropy_mul_JXJ, float* JXL_RESTRICT entropy_estimate, |
708 | 2.60M | float* block, float* scratch_space, uint32_t* quantized) { |
709 | | // We denote J for the larger dimension here, and K for the smaller. |
710 | | // For example, for 32x32 block splitting, J would be 32, K 16. |
711 | 2.60M | const size_t blocks_half = blocks / 2; |
712 | 2.60M | const AcStrategyType acs_rawJXK = AcsVerticalSplit(blocks); |
713 | 2.60M | const AcStrategyType acs_rawKXJ = AcsHorizontalSplit(blocks); |
714 | 2.60M | const AcStrategyType acs_rawJXJ = AcsSquare(blocks); |
715 | 2.60M | const AcStrategy acsJXK = AcStrategy::FromRawStrategy(acs_rawJXK); |
716 | 2.60M | const AcStrategy acsKXJ = AcStrategy::FromRawStrategy(acs_rawKXJ); |
717 | 2.60M | const AcStrategy acsJXJ = AcStrategy::FromRawStrategy(acs_rawJXJ); |
718 | 2.60M | AcStrategyRow row0 = ac_strategy->ConstRow(by + cy + 0); |
719 | 2.60M | AcStrategyRow row1 = ac_strategy->ConstRow(by + cy + blocks_half); |
720 | | // Let's check if we can consider a JXJ block here at all. |
721 | | // This is not necessary in the basic use of hierarchically merging |
722 | | // blocks in the simplest possible way, but is needed when we try other |
723 | | // 'floating' options of merging, possibly after a simple hierarchical |
724 | | // merge has been explored. |
725 | 2.60M | if (MultiBlockTransformCrossesHorizontalBoundary(*ac_strategy, bx + cx, |
726 | 2.60M | by + cy, bx + cx + blocks) || |
727 | 2.60M | MultiBlockTransformCrossesHorizontalBoundary( |
728 | 1.90M | *ac_strategy, bx + cx, by + cy + blocks, bx + cx + blocks) || |
729 | 2.60M | MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx, by + cy, |
730 | 1.73M | by + cy + blocks) || |
731 | 2.60M | MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx + blocks, |
732 | 1.56M | by + cy, by + cy + blocks)) { |
733 | 1.08M | return true; // not suitable for JxJ analysis, some transforms leak out. |
734 | 1.08M | } |
735 | | // For floating transforms there may be |
736 | | // already blocks selected that make either or both JXK and |
737 | | // KXJ not feasible for this location. |
738 | 1.51M | const bool allow_JXK = !MultiBlockTransformCrossesVerticalBoundary( |
739 | 1.51M | *ac_strategy, bx + cx + blocks_half, by + cy, by + cy + blocks); |
740 | 1.51M | const bool allow_KXJ = !MultiBlockTransformCrossesHorizontalBoundary( |
741 | 1.51M | *ac_strategy, bx + cx, by + cy + blocks_half, bx + cx + blocks); |
742 | | // Current entropies aggregated on NxN resolution. |
743 | 1.51M | float entropy[2][2] = {}; |
744 | 5.30M | for (size_t dy = 0; dy < blocks; ++dy) { |
745 | 15.3M | for (size_t dx = 0; dx < blocks; ++dx) { |
746 | 11.5M | entropy[dy / blocks_half][dx / blocks_half] += |
747 | 11.5M | entropy_estimate[(cy + dy) * 8 + (cx + dx)]; |
748 | 11.5M | } |
749 | 3.78M | } |
750 | 1.51M | float entropy_JXK_left = std::numeric_limits<float>::max(); |
751 | 1.51M | float entropy_JXK_right = std::numeric_limits<float>::max(); |
752 | 1.51M | float entropy_KXJ_top = std::numeric_limits<float>::max(); |
753 | 1.51M | float entropy_KXJ_bottom = std::numeric_limits<float>::max(); |
754 | 1.51M | float entropy_JXJ = std::numeric_limits<float>::max(); |
755 | 1.51M | if (allow_JXK) { |
756 | 1.48M | if (row0[bx + cx + 0].Strategy() != acs_rawJXK) { |
757 | 1.45M | JXL_RETURN_IF_ERROR(EstimateEntropy( |
758 | 1.45M | acsJXK, entropy_mul_JXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config, |
759 | 1.45M | cmap_factors, block, scratch_space, quantized, entropy_JXK_left)); |
760 | 1.45M | } |
761 | 1.48M | if (row0[bx + cx + blocks_half].Strategy() != acs_rawJXK) { |
762 | 1.46M | JXL_RETURN_IF_ERROR( |
763 | 1.46M | EstimateEntropy(acsJXK, entropy_mul_JXK, (bx + cx + blocks_half) * 8, |
764 | 1.46M | (by + cy + 0) * 8, config, cmap_factors, block, |
765 | 1.46M | scratch_space, quantized, entropy_JXK_right)); |
766 | 1.46M | } |
767 | 1.48M | } |
768 | 1.51M | if (allow_KXJ) { |
769 | 1.48M | if (row0[bx + cx].Strategy() != acs_rawKXJ) { |
770 | 1.45M | JXL_RETURN_IF_ERROR(EstimateEntropy( |
771 | 1.45M | acsKXJ, entropy_mul_JXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config, |
772 | 1.45M | cmap_factors, block, scratch_space, quantized, entropy_KXJ_top)); |
773 | 1.45M | } |
774 | 1.48M | if (row1[bx + cx].Strategy() != acs_rawKXJ) { |
775 | 1.46M | JXL_RETURN_IF_ERROR( |
776 | 1.46M | EstimateEntropy(acsKXJ, entropy_mul_JXK, (bx + cx + 0) * 8, |
777 | 1.46M | (by + cy + blocks_half) * 8, config, cmap_factors, |
778 | 1.46M | block, scratch_space, quantized, entropy_KXJ_bottom)); |
779 | 1.46M | } |
780 | 1.48M | } |
781 | 1.51M | if (allow_square_transform) { |
782 | | // We control the exploration of the square transform separately so that |
783 | | // we can turn it off at high decoding speeds for 32x32, but still allow |
784 | | // exploring 16x32 and 32x16. |
785 | 1.51M | JXL_RETURN_IF_ERROR(EstimateEntropy( |
786 | 1.51M | acsJXJ, entropy_mul_JXJ, (bx + cx + 0) * 8, (by + cy + 0) * 8, config, |
787 | 1.51M | cmap_factors, block, scratch_space, quantized, entropy_JXJ)); |
788 | 1.51M | } |
789 | | |
790 | | // Test if this block should have JXK or KXJ transforms, |
791 | | // because it can have only one or the other. |
792 | 1.51M | float costJxN = std::min(entropy_JXK_left, entropy[0][0] + entropy[1][0]) + |
793 | 1.51M | std::min(entropy_JXK_right, entropy[0][1] + entropy[1][1]); |
794 | 1.51M | float costNxJ = std::min(entropy_KXJ_top, entropy[0][0] + entropy[0][1]) + |
795 | 1.51M | std::min(entropy_KXJ_bottom, entropy[1][0] + entropy[1][1]); |
796 | 1.51M | if (entropy_JXJ < costJxN && entropy_JXJ < costNxJ) { |
797 | 331k | JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_rawJXJ)); |
798 | 331k | SetEntropyForTransform(cx, cy, acs_rawJXJ, entropy_JXJ, entropy_estimate); |
799 | 1.18M | } else if (costJxN < costNxJ) { |
800 | 207k | if (entropy_JXK_left < entropy[0][0] + entropy[1][0]) { |
801 | 53.6k | JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_rawJXK)); |
802 | 53.6k | SetEntropyForTransform(cx, cy, acs_rawJXK, entropy_JXK_left, |
803 | 53.6k | entropy_estimate); |
804 | 53.6k | } |
805 | 207k | if (entropy_JXK_right < entropy[0][1] + entropy[1][1]) { |
806 | 47.9k | JXL_RETURN_IF_ERROR( |
807 | 47.9k | ac_strategy->Set(bx + cx + blocks_half, by + cy, acs_rawJXK)); |
808 | 47.9k | SetEntropyForTransform(cx + blocks_half, cy, acs_rawJXK, |
809 | 47.9k | entropy_JXK_right, entropy_estimate); |
810 | 47.9k | } |
811 | 980k | } else { |
812 | 980k | if (entropy_KXJ_top < entropy[0][0] + entropy[0][1]) { |
813 | 73.2k | JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_rawKXJ)); |
814 | 73.2k | SetEntropyForTransform(cx, cy, acs_rawKXJ, entropy_KXJ_top, |
815 | 73.2k | entropy_estimate); |
816 | 73.2k | } |
817 | 980k | if (entropy_KXJ_bottom < entropy[1][0] + entropy[1][1]) { |
818 | 68.0k | JXL_RETURN_IF_ERROR( |
819 | 68.0k | ac_strategy->Set(bx + cx, by + cy + blocks_half, acs_rawKXJ)); |
820 | 68.0k | SetEntropyForTransform(cx, cy + blocks_half, acs_rawKXJ, |
821 | 68.0k | entropy_KXJ_bottom, entropy_estimate); |
822 | 68.0k | } |
823 | 980k | } |
824 | 1.51M | return true; |
825 | 1.51M | } Unexecuted instantiation: jxl::N_SSE4::FindBestFirstLevelDivisionForSquare(unsigned long, bool, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, float, float*, float*, float*, unsigned int*) jxl::N_AVX2::FindBestFirstLevelDivisionForSquare(unsigned long, bool, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, float, float*, float*, float*, unsigned int*) Line | Count | Source | 708 | 2.60M | float* block, float* scratch_space, uint32_t* quantized) { | 709 | | // We denote J for the larger dimension here, and K for the smaller. | 710 | | // For example, for 32x32 block splitting, J would be 32, K 16. | 711 | 2.60M | const size_t blocks_half = blocks / 2; | 712 | 2.60M | const AcStrategyType acs_rawJXK = AcsVerticalSplit(blocks); | 713 | 2.60M | const AcStrategyType acs_rawKXJ = AcsHorizontalSplit(blocks); | 714 | 2.60M | const AcStrategyType acs_rawJXJ = AcsSquare(blocks); | 715 | 2.60M | const AcStrategy acsJXK = AcStrategy::FromRawStrategy(acs_rawJXK); | 716 | 2.60M | const AcStrategy acsKXJ = AcStrategy::FromRawStrategy(acs_rawKXJ); | 717 | 2.60M | const AcStrategy acsJXJ = AcStrategy::FromRawStrategy(acs_rawJXJ); | 718 | 2.60M | AcStrategyRow row0 = ac_strategy->ConstRow(by + cy + 0); | 719 | 2.60M | AcStrategyRow row1 = ac_strategy->ConstRow(by + cy + blocks_half); | 720 | | // Let's check if we can consider a JXJ block here at all. | 721 | | // This is not necessary in the basic use of hierarchically merging | 722 | | // blocks in the simplest possible way, but is needed when we try other | 723 | | // 'floating' options of merging, possibly after a simple hierarchical | 724 | | // merge has been explored. 
| 725 | 2.60M | if (MultiBlockTransformCrossesHorizontalBoundary(*ac_strategy, bx + cx, | 726 | 2.60M | by + cy, bx + cx + blocks) || | 727 | 2.60M | MultiBlockTransformCrossesHorizontalBoundary( | 728 | 1.90M | *ac_strategy, bx + cx, by + cy + blocks, bx + cx + blocks) || | 729 | 2.60M | MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx, by + cy, | 730 | 1.73M | by + cy + blocks) || | 731 | 2.60M | MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx + blocks, | 732 | 1.56M | by + cy, by + cy + blocks)) { | 733 | 1.08M | return true; // not suitable for JxJ analysis, some transforms leak out. | 734 | 1.08M | } | 735 | | // For floating transforms there may be | 736 | | // already blocks selected that make either or both JXK and | 737 | | // KXJ not feasible for this location. | 738 | 1.51M | const bool allow_JXK = !MultiBlockTransformCrossesVerticalBoundary( | 739 | 1.51M | *ac_strategy, bx + cx + blocks_half, by + cy, by + cy + blocks); | 740 | 1.51M | const bool allow_KXJ = !MultiBlockTransformCrossesHorizontalBoundary( | 741 | 1.51M | *ac_strategy, bx + cx, by + cy + blocks_half, bx + cx + blocks); | 742 | | // Current entropies aggregated on NxN resolution. 
| 743 | 1.51M | float entropy[2][2] = {}; | 744 | 5.30M | for (size_t dy = 0; dy < blocks; ++dy) { | 745 | 15.3M | for (size_t dx = 0; dx < blocks; ++dx) { | 746 | 11.5M | entropy[dy / blocks_half][dx / blocks_half] += | 747 | 11.5M | entropy_estimate[(cy + dy) * 8 + (cx + dx)]; | 748 | 11.5M | } | 749 | 3.78M | } | 750 | 1.51M | float entropy_JXK_left = std::numeric_limits<float>::max(); | 751 | 1.51M | float entropy_JXK_right = std::numeric_limits<float>::max(); | 752 | 1.51M | float entropy_KXJ_top = std::numeric_limits<float>::max(); | 753 | 1.51M | float entropy_KXJ_bottom = std::numeric_limits<float>::max(); | 754 | 1.51M | float entropy_JXJ = std::numeric_limits<float>::max(); | 755 | 1.51M | if (allow_JXK) { | 756 | 1.48M | if (row0[bx + cx + 0].Strategy() != acs_rawJXK) { | 757 | 1.45M | JXL_RETURN_IF_ERROR(EstimateEntropy( | 758 | 1.45M | acsJXK, entropy_mul_JXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config, | 759 | 1.45M | cmap_factors, block, scratch_space, quantized, entropy_JXK_left)); | 760 | 1.45M | } | 761 | 1.48M | if (row0[bx + cx + blocks_half].Strategy() != acs_rawJXK) { | 762 | 1.46M | JXL_RETURN_IF_ERROR( | 763 | 1.46M | EstimateEntropy(acsJXK, entropy_mul_JXK, (bx + cx + blocks_half) * 8, | 764 | 1.46M | (by + cy + 0) * 8, config, cmap_factors, block, | 765 | 1.46M | scratch_space, quantized, entropy_JXK_right)); | 766 | 1.46M | } | 767 | 1.48M | } | 768 | 1.51M | if (allow_KXJ) { | 769 | 1.48M | if (row0[bx + cx].Strategy() != acs_rawKXJ) { | 770 | 1.45M | JXL_RETURN_IF_ERROR(EstimateEntropy( | 771 | 1.45M | acsKXJ, entropy_mul_JXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config, | 772 | 1.45M | cmap_factors, block, scratch_space, quantized, entropy_KXJ_top)); | 773 | 1.45M | } | 774 | 1.48M | if (row1[bx + cx].Strategy() != acs_rawKXJ) { | 775 | 1.46M | JXL_RETURN_IF_ERROR( | 776 | 1.46M | EstimateEntropy(acsKXJ, entropy_mul_JXK, (bx + cx + 0) * 8, | 777 | 1.46M | (by + cy + blocks_half) * 8, config, cmap_factors, | 778 | 1.46M | block, 
scratch_space, quantized, entropy_KXJ_bottom)); | 779 | 1.46M | } | 780 | 1.48M | } | 781 | 1.51M | if (allow_square_transform) { | 782 | | // We control the exploration of the square transform separately so that | 783 | | // we can turn it off at high decoding speeds for 32x32, but still allow | 784 | | // exploring 16x32 and 32x16. | 785 | 1.51M | JXL_RETURN_IF_ERROR(EstimateEntropy( | 786 | 1.51M | acsJXJ, entropy_mul_JXJ, (bx + cx + 0) * 8, (by + cy + 0) * 8, config, | 787 | 1.51M | cmap_factors, block, scratch_space, quantized, entropy_JXJ)); | 788 | 1.51M | } | 789 | | | 790 | | // Test if this block should have JXK or KXJ transforms, | 791 | | // because it can have only one or the other. | 792 | 1.51M | float costJxN = std::min(entropy_JXK_left, entropy[0][0] + entropy[1][0]) + | 793 | 1.51M | std::min(entropy_JXK_right, entropy[0][1] + entropy[1][1]); | 794 | 1.51M | float costNxJ = std::min(entropy_KXJ_top, entropy[0][0] + entropy[0][1]) + | 795 | 1.51M | std::min(entropy_KXJ_bottom, entropy[1][0] + entropy[1][1]); | 796 | 1.51M | if (entropy_JXJ < costJxN && entropy_JXJ < costNxJ) { | 797 | 331k | JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_rawJXJ)); | 798 | 331k | SetEntropyForTransform(cx, cy, acs_rawJXJ, entropy_JXJ, entropy_estimate); | 799 | 1.18M | } else if (costJxN < costNxJ) { | 800 | 207k | if (entropy_JXK_left < entropy[0][0] + entropy[1][0]) { | 801 | 53.6k | JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_rawJXK)); | 802 | 53.6k | SetEntropyForTransform(cx, cy, acs_rawJXK, entropy_JXK_left, | 803 | 53.6k | entropy_estimate); | 804 | 53.6k | } | 805 | 207k | if (entropy_JXK_right < entropy[0][1] + entropy[1][1]) { | 806 | 47.9k | JXL_RETURN_IF_ERROR( | 807 | 47.9k | ac_strategy->Set(bx + cx + blocks_half, by + cy, acs_rawJXK)); | 808 | 47.9k | SetEntropyForTransform(cx + blocks_half, cy, acs_rawJXK, | 809 | 47.9k | entropy_JXK_right, entropy_estimate); | 810 | 47.9k | } | 811 | 980k | } else { | 812 | 980k | if 
(entropy_KXJ_top < entropy[0][0] + entropy[0][1]) { | 813 | 73.2k | JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_rawKXJ)); | 814 | 73.2k | SetEntropyForTransform(cx, cy, acs_rawKXJ, entropy_KXJ_top, | 815 | 73.2k | entropy_estimate); | 816 | 73.2k | } | 817 | 980k | if (entropy_KXJ_bottom < entropy[1][0] + entropy[1][1]) { | 818 | 68.0k | JXL_RETURN_IF_ERROR( | 819 | 68.0k | ac_strategy->Set(bx + cx, by + cy + blocks_half, acs_rawKXJ)); | 820 | 68.0k | SetEntropyForTransform(cx, cy + blocks_half, acs_rawKXJ, | 821 | 68.0k | entropy_KXJ_bottom, entropy_estimate); | 822 | 68.0k | } | 823 | 980k | } | 824 | 1.51M | return true; | 825 | 1.51M | } |
Unexecuted instantiation: jxl::N_AVX3::FindBestFirstLevelDivisionForSquare(unsigned long, bool, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, float, float*, float*, float*, unsigned int*) Unexecuted instantiation: jxl::N_AVX3_ZEN4::FindBestFirstLevelDivisionForSquare(unsigned long, bool, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, float, float*, float*, float*, unsigned int*) Unexecuted instantiation: jxl::N_AVX3_SPR::FindBestFirstLevelDivisionForSquare(unsigned long, bool, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, float, float*, float*, float*, unsigned int*) Unexecuted instantiation: jxl::N_SSE2::FindBestFirstLevelDivisionForSquare(unsigned long, bool, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, float, float*, float*, float*, unsigned int*) |
826 | | |
827 | | Status ProcessRectACS(const CompressParams& cparams, const ACSConfig& config, |
828 | | const Rect& rect, const ColorCorrelationMap& cmap, |
829 | | float* JXL_RESTRICT block, |
830 | | uint32_t* JXL_RESTRICT quantized, |
831 | 51.9k | AcStrategyImage* ac_strategy) { |
832 | | // Main philosophy here: |
833 | | // 1. First find best 8x8 transform for each area. |
834 | | // 2. Merging them into larger transforms where possibly, but |
835 | | // starting from the smallest transforms (16x8 and 8x16). |
836 | | // Additional complication: 16x8 and 8x16 are considered |
837 | | // simultaneously and fairly against each other. |
838 | | // We are looking at 64x64 squares since the Y-to-X and Y-to-B |
839 | | // maps happen to be at that resolution, and having |
840 | | // integral transforms cross these boundaries leads to |
841 | | // additional complications. |
842 | 51.9k | const float butteraugli_target = cparams.butteraugli_distance; |
843 | 51.9k | float* JXL_RESTRICT scratch_space = block + 3 * AcStrategy::kMaxCoeffArea; |
844 | 51.9k | size_t bx = rect.x0(); |
845 | 51.9k | size_t by = rect.y0(); |
846 | 51.9k | JXL_ENSURE(rect.xsize() <= 8); |
847 | 51.9k | JXL_ENSURE(rect.ysize() <= 8); |
848 | 51.9k | size_t tx = bx / kColorTileDimInBlocks; |
849 | 51.9k | size_t ty = by / kColorTileDimInBlocks; |
850 | 51.9k | const float cmap_factors[3] = { |
851 | 51.9k | cmap.base().YtoXRatio(cmap.ytox_map.ConstRow(ty)[tx]), |
852 | 51.9k | 0.0f, |
853 | 51.9k | cmap.base().YtoBRatio(cmap.ytob_map.ConstRow(ty)[tx]), |
854 | 51.9k | }; |
855 | 51.9k | if (cparams.speed_tier > SpeedTier::kHare) return true; |
856 | | // First compute the best 8x8 transform for each square. Later, we do not |
857 | | // experiment with different combinations, but only use the best of the 8x8s |
858 | | // when DCT8X8 is specified in the tree search. |
859 | | // 8x8 transforms have 10 variants, but every larger transform is just a DCT. |
860 | 51.9k | float entropy_estimate[64] = {}; |
861 | | // Favor all 8x8 transforms (against 16x8 and larger transforms)) at |
862 | | // low butteraugli_target distances. |
863 | 51.9k | static const float k8x8mul1 = -0.4; |
864 | 51.9k | static const float k8x8mul2 = 1.0; |
865 | 51.9k | static const float k8x8base = 1.4; |
866 | 51.9k | const float mul8x8 = k8x8mul2 + k8x8mul1 / (butteraugli_target + k8x8base); |
867 | 445k | for (size_t iy = 0; iy < rect.ysize(); iy++) { |
868 | 3.29M | for (size_t ix = 0; ix < rect.xsize(); ix++) { |
869 | 2.89M | float entropy = 0.0; |
870 | 2.89M | AcStrategyType best_of_8x8s; |
871 | 2.89M | JXL_RETURN_IF_ERROR(FindBest8x8Transform( |
872 | 2.89M | 8 * (bx + ix), 8 * (by + iy), static_cast<int>(cparams.speed_tier), |
873 | 2.89M | butteraugli_target, config, cmap_factors, ac_strategy, block, |
874 | 2.89M | scratch_space, quantized, &entropy, best_of_8x8s)); |
875 | 2.89M | JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + ix, by + iy, best_of_8x8s)); |
876 | 2.89M | entropy_estimate[iy * 8 + ix] = entropy * mul8x8; |
877 | 2.89M | } |
878 | 393k | } |
879 | | // Merge when a larger transform is better than the previously |
880 | | // searched best combination of 8x8 transforms. |
881 | 51.9k | struct MergeTry { |
882 | 51.9k | AcStrategyType type; |
883 | 51.9k | uint8_t priority; |
884 | 51.9k | uint8_t decoding_speed_tier_max_limit; |
885 | 51.9k | uint8_t encoding_speed_tier_max_limit; |
886 | 51.9k | float entropy_mul; |
887 | 51.9k | }; |
888 | | // These numbers need to be figured out manually and looking at |
889 | | // ringing next to sky etc. Optimization will find smaller numbers |
890 | | // and produce more ringing than is ideal. Larger numbers will |
891 | | // help stop ringing. |
892 | 51.9k | const float entropy_mul16X8 = 1.21; |
893 | 51.9k | const float entropy_mul16X16 = 1.34; |
894 | 51.9k | const float entropy_mul16X32 = 1.49; |
895 | 51.9k | const float entropy_mul32X32 = 1.48; |
896 | 51.9k | const float entropy_mul64X32 = 2.25; |
897 | 51.9k | const float entropy_mul64X64 = 2.25; |
898 | | // TODO(jyrki): Consider this feedback in further changes: |
899 | | // Also effectively when the multipliers for smaller blocks are |
900 | | // below 1, this raises the bar for the bigger blocks even higher |
901 | | // in that sense these constants are not independent (e.g. changing |
902 | | // the constant for DCT16x32 by -5% (making it more likely) also |
903 | | // means that DCT32x32 becomes harder to do when starting from |
904 | | // two DCT16x32s). It might be better to make them more independent, |
905 | | // e.g. by not applying the multiplier when storing the new entropy |
906 | | // estimates in TryMergeToACSCandidate(). |
907 | 51.9k | const MergeTry kTransformsForMerge[9] = { |
908 | 51.9k | {AcStrategyType::DCT16X8, 2, 4, 5, entropy_mul16X8}, |
909 | 51.9k | {AcStrategyType::DCT8X16, 2, 4, 5, entropy_mul16X8}, |
910 | | // FindBestFirstLevelDivisionForSquare looks for DCT16X16 and its |
911 | | // subdivisions. {AcStrategyType::DCT16X16, 3, entropy_mul16X16}, |
912 | 51.9k | {AcStrategyType::DCT16X32, 4, 4, 4, entropy_mul16X32}, |
913 | 51.9k | {AcStrategyType::DCT32X16, 4, 4, 4, entropy_mul16X32}, |
914 | | // FindBestFirstLevelDivisionForSquare looks for DCT32X32 and its |
915 | | // subdivisions. {AcStrategyType::DCT32X32, 5, 1, 5, |
916 | | // 0.9822994906548809f}, |
917 | 51.9k | {AcStrategyType::DCT64X32, 6, 1, 3, entropy_mul64X32}, |
918 | 51.9k | {AcStrategyType::DCT32X64, 6, 1, 3, entropy_mul64X32}, |
919 | | // {AcStrategyType::DCT64X64, 8, 1, 3, 2.0846542128012948f}, |
920 | 51.9k | }; |
921 | | /* |
922 | | These sizes not yet included in merge heuristic: |
923 | | set(AcStrategyType::DCT32X8, 0.0f, 2.261390410971102f); |
924 | | set(AcStrategyType::DCT8X32, 0.0f, 2.261390410971102f); |
925 | | set(AcStrategyType::DCT128X128, 0.0f, 1.0f); |
926 | | set(AcStrategyType::DCT128X64, 0.0f, 0.73f); |
927 | | set(AcStrategyType::DCT64X128, 0.0f, 0.73f); |
928 | | set(AcStrategyType::DCT256X256, 0.0f, 1.0f); |
929 | | set(AcStrategyType::DCT256X128, 0.0f, 0.73f); |
930 | | set(AcStrategyType::DCT128X256, 0.0f, 0.73f); |
931 | | */ |
932 | | |
933 | | // Priority is a tricky kludge to avoid collisions so that transforms |
934 | | // don't overlap. |
935 | 51.9k | uint8_t priority[64] = {}; |
936 | 51.9k | bool enable_32x32 = cparams.decoding_speed_tier < 4; |
937 | 467k | for (auto mt : kTransformsForMerge) { |
938 | 467k | if (mt.decoding_speed_tier_max_limit < cparams.decoding_speed_tier) { |
939 | 0 | continue; |
940 | 0 | } |
941 | 467k | AcStrategy acs = AcStrategy::FromRawStrategy(mt.type); |
942 | | |
943 | 2.66M | for (size_t cy = 0; cy + acs.covered_blocks_y() - 1 < rect.ysize(); |
944 | 2.19M | cy += acs.covered_blocks_y()) { |
945 | 14.6M | for (size_t cx = 0; cx + acs.covered_blocks_x() - 1 < rect.xsize(); |
946 | 12.4M | cx += acs.covered_blocks_x()) { |
947 | 12.4M | if (cy + 7 < rect.ysize() && cx + 7 < rect.xsize()) { |
948 | 365k | if (cparams.decoding_speed_tier < 4 && |
949 | 365k | mt.type == AcStrategyType::DCT32X64) { |
950 | | // We handle both DCT8X16 and DCT16X8 at the same time. |
951 | 40.6k | if ((cy | cx) % 8 == 0) { |
952 | 40.6k | JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare( |
953 | 40.6k | 8, true, bx, by, cx, cy, config, cmap_factors, ac_strategy, |
954 | 40.6k | mt.entropy_mul, entropy_mul64X64, entropy_estimate, block, |
955 | 40.6k | scratch_space, quantized)); |
956 | 40.6k | } |
957 | 40.6k | continue; |
958 | 325k | } else if (mt.type == AcStrategyType::DCT32X16) { |
959 | | // We handled both DCT8X16 and DCT16X8 at the same time, |
960 | | // and that is above. The last column and last row, |
961 | | // when the last column or last row is odd numbered, |
962 | | // are still handled by TryMergeAcs. |
963 | 40.6k | continue; |
964 | 40.6k | } |
965 | 365k | } |
966 | 12.3M | if ((mt.type == AcStrategyType::DCT16X32 && cy % 4 != 0) || |
967 | 12.3M | (mt.type == AcStrategyType::DCT32X16 && cx % 4 != 0)) { |
968 | | // already covered by FindBest32X32 |
969 | 342k | continue; |
970 | 342k | } |
971 | | |
972 | 12.0M | if (cy + 3 < rect.ysize() && cx + 3 < rect.xsize()) { |
973 | 4.94M | if (mt.type == AcStrategyType::DCT16X32) { |
974 | | // We handle both DCT8X16 and DCT16X8 at the same time. |
975 | 171k | if ((cy | cx) % 4 == 0) { |
976 | 171k | JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare( |
977 | 171k | 4, enable_32x32, bx, by, cx, cy, config, cmap_factors, |
978 | 171k | ac_strategy, mt.entropy_mul, entropy_mul32X32, |
979 | 171k | entropy_estimate, block, scratch_space, quantized)); |
980 | 171k | } |
981 | 171k | continue; |
982 | 4.76M | } else if (mt.type == AcStrategyType::DCT32X16) { |
983 | | // We handled both DCT8X16 and DCT16X8 at the same time, |
984 | | // and that is above. The last column and last row, |
985 | | // when the last column or last row is odd numbered, |
986 | | // are still handled by TryMergeAcs. |
987 | 130k | continue; |
988 | 130k | } |
989 | 4.94M | } |
990 | 11.7M | if ((mt.type == AcStrategyType::DCT16X32 && cy % 4 != 0) || |
991 | 11.7M | (mt.type == AcStrategyType::DCT32X16 && cx % 4 != 0)) { |
992 | | // already covered by FindBest32X32 |
993 | 0 | continue; |
994 | 0 | } |
995 | 11.7M | if (cy + 1 < rect.ysize() && cx + 1 < rect.xsize()) { |
996 | 9.16M | if (mt.type == AcStrategyType::DCT8X16) { |
997 | | // We handle both DCT8X16 and DCT16X8 at the same time. |
998 | 1.25M | if ((cy | cx) % 2 == 0) { |
999 | 712k | JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare( |
1000 | 712k | 2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy, |
1001 | 712k | mt.entropy_mul, entropy_mul16X16, entropy_estimate, block, |
1002 | 712k | scratch_space, quantized)); |
1003 | 712k | } |
1004 | 1.25M | continue; |
1005 | 7.91M | } else if (mt.type == AcStrategyType::DCT16X8) { |
1006 | | // We handled both DCT8X16 and DCT16X8 at the same time, |
1007 | | // and that is above. The last column and last row, |
1008 | | // when the last column or last row is odd numbered, |
1009 | | // are still handled by TryMergeAcs. |
1010 | 1.24M | continue; |
1011 | 1.24M | } |
1012 | 9.16M | } |
1013 | 9.22M | if ((mt.type == AcStrategyType::DCT8X16 && cy % 2 == 1) || |
1014 | 9.22M | (mt.type == AcStrategyType::DCT16X8 && cx % 2 == 1)) { |
1015 | | // already covered by FindBestFirstLevelDivisionForSquare |
1016 | 359k | continue; |
1017 | 359k | } |
1018 | | // All other merge sizes are handled here. |
1019 | | // Some of the DCT16X8s and DCT8X16s will still leak through here |
1020 | | // when there is an odd number of 8x8 blocks, then the last row |
1021 | | // and column will get their DCT16X8s and DCT8X16s through the |
1022 | | // normal integral transform merging process. |
1023 | 8.86M | JXL_RETURN_IF_ERROR( |
1024 | 8.86M | TryMergeAcs(mt.type, bx, by, cx, cy, config, cmap_factors, |
1025 | 8.86M | ac_strategy, mt.entropy_mul, mt.priority, &priority[0], |
1026 | 8.86M | entropy_estimate, block, scratch_space, quantized)); |
1027 | 8.86M | } |
1028 | 2.19M | } |
1029 | 467k | } |
1030 | 51.9k | if (cparams.speed_tier >= SpeedTier::kHare) { |
1031 | 0 | return true; |
1032 | 0 | } |
1033 | | // Here we still try to do some non-aligned matching, find a few more |
1034 | | // 16X8, 8X16 and 16X16s between the non-2-aligned blocks. |
1035 | 393k | for (size_t cy = 0; cy + 1 < rect.ysize(); ++cy) { |
1036 | 2.51M | for (size_t cx = 0; cx + 1 < rect.xsize(); ++cx) { |
1037 | 2.17M | if ((cy | cx) % 2 != 0) { |
1038 | 1.46M | JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare( |
1039 | 1.46M | 2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy, |
1040 | 1.46M | entropy_mul16X8, entropy_mul16X16, entropy_estimate, block, |
1041 | 1.46M | scratch_space, quantized)); |
1042 | 1.46M | } |
1043 | 2.17M | } |
1044 | 341k | } |
1045 | | // Non-aligned matching for 32X32, 16X32 and 32X16. |
1046 | 51.9k | size_t step = cparams.speed_tier >= SpeedTier::kTortoise ? 2 : 1; |
1047 | 194k | for (size_t cy = 0; cy + 3 < rect.ysize(); cy += step) { |
1048 | 529k | for (size_t cx = 0; cx + 3 < rect.xsize(); cx += step) { |
1049 | 386k | if ((cy | cx) % 4 == 0) { |
1050 | 171k | continue; // Already tried with loop above (DCT16X32 case). |
1051 | 171k | } |
1052 | 214k | JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare( |
1053 | 214k | 4, enable_32x32, bx, by, cx, cy, config, cmap_factors, ac_strategy, |
1054 | 214k | entropy_mul16X32, entropy_mul32X32, entropy_estimate, block, |
1055 | 214k | scratch_space, quantized)); |
1056 | 214k | } |
1057 | 143k | } |
1058 | 51.9k | return true; |
1059 | 51.9k | } Unexecuted instantiation: jxl::N_SSE4::ProcessRectACS(jxl::CompressParams const&, jxl::ACSConfig const&, jxl::RectT<unsigned long> const&, jxl::ColorCorrelationMap const&, float*, unsigned int*, jxl::AcStrategyImage*) jxl::N_AVX2::ProcessRectACS(jxl::CompressParams const&, jxl::ACSConfig const&, jxl::RectT<unsigned long> const&, jxl::ColorCorrelationMap const&, float*, unsigned int*, jxl::AcStrategyImage*) Line | Count | Source | 831 | 51.9k | AcStrategyImage* ac_strategy) { | 832 | | // Main philosophy here: | 833 | | // 1. First find best 8x8 transform for each area. | 834 | | // 2. Merging them into larger transforms where possibly, but | 835 | | // starting from the smallest transforms (16x8 and 8x16). | 836 | | // Additional complication: 16x8 and 8x16 are considered | 837 | | // simultaneously and fairly against each other. | 838 | | // We are looking at 64x64 squares since the Y-to-X and Y-to-B | 839 | | // maps happen to be at that resolution, and having | 840 | | // integral transforms cross these boundaries leads to | 841 | | // additional complications. | 842 | 51.9k | const float butteraugli_target = cparams.butteraugli_distance; | 843 | 51.9k | float* JXL_RESTRICT scratch_space = block + 3 * AcStrategy::kMaxCoeffArea; | 844 | 51.9k | size_t bx = rect.x0(); | 845 | 51.9k | size_t by = rect.y0(); | 846 | 51.9k | JXL_ENSURE(rect.xsize() <= 8); | 847 | 51.9k | JXL_ENSURE(rect.ysize() <= 8); | 848 | 51.9k | size_t tx = bx / kColorTileDimInBlocks; | 849 | 51.9k | size_t ty = by / kColorTileDimInBlocks; | 850 | 51.9k | const float cmap_factors[3] = { | 851 | 51.9k | cmap.base().YtoXRatio(cmap.ytox_map.ConstRow(ty)[tx]), | 852 | 51.9k | 0.0f, | 853 | 51.9k | cmap.base().YtoBRatio(cmap.ytob_map.ConstRow(ty)[tx]), | 854 | 51.9k | }; | 855 | 51.9k | if (cparams.speed_tier > SpeedTier::kHare) return true; | 856 | | // First compute the best 8x8 transform for each square. 
Later, we do not | 857 | | // experiment with different combinations, but only use the best of the 8x8s | 858 | | // when DCT8X8 is specified in the tree search. | 859 | | // 8x8 transforms have 10 variants, but every larger transform is just a DCT. | 860 | 51.9k | float entropy_estimate[64] = {}; | 861 | | // Favor all 8x8 transforms (against 16x8 and larger transforms)) at | 862 | | // low butteraugli_target distances. | 863 | 51.9k | static const float k8x8mul1 = -0.4; | 864 | 51.9k | static const float k8x8mul2 = 1.0; | 865 | 51.9k | static const float k8x8base = 1.4; | 866 | 51.9k | const float mul8x8 = k8x8mul2 + k8x8mul1 / (butteraugli_target + k8x8base); | 867 | 445k | for (size_t iy = 0; iy < rect.ysize(); iy++) { | 868 | 3.29M | for (size_t ix = 0; ix < rect.xsize(); ix++) { | 869 | 2.89M | float entropy = 0.0; | 870 | 2.89M | AcStrategyType best_of_8x8s; | 871 | 2.89M | JXL_RETURN_IF_ERROR(FindBest8x8Transform( | 872 | 2.89M | 8 * (bx + ix), 8 * (by + iy), static_cast<int>(cparams.speed_tier), | 873 | 2.89M | butteraugli_target, config, cmap_factors, ac_strategy, block, | 874 | 2.89M | scratch_space, quantized, &entropy, best_of_8x8s)); | 875 | 2.89M | JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + ix, by + iy, best_of_8x8s)); | 876 | 2.89M | entropy_estimate[iy * 8 + ix] = entropy * mul8x8; | 877 | 2.89M | } | 878 | 393k | } | 879 | | // Merge when a larger transform is better than the previously | 880 | | // searched best combination of 8x8 transforms. | 881 | 51.9k | struct MergeTry { | 882 | 51.9k | AcStrategyType type; | 883 | 51.9k | uint8_t priority; | 884 | 51.9k | uint8_t decoding_speed_tier_max_limit; | 885 | 51.9k | uint8_t encoding_speed_tier_max_limit; | 886 | 51.9k | float entropy_mul; | 887 | 51.9k | }; | 888 | | // These numbers need to be figured out manually and looking at | 889 | | // ringing next to sky etc. Optimization will find smaller numbers | 890 | | // and produce more ringing than is ideal. 
Larger numbers will | 891 | | // help stop ringing. | 892 | 51.9k | const float entropy_mul16X8 = 1.21; | 893 | 51.9k | const float entropy_mul16X16 = 1.34; | 894 | 51.9k | const float entropy_mul16X32 = 1.49; | 895 | 51.9k | const float entropy_mul32X32 = 1.48; | 896 | 51.9k | const float entropy_mul64X32 = 2.25; | 897 | 51.9k | const float entropy_mul64X64 = 2.25; | 898 | | // TODO(jyrki): Consider this feedback in further changes: | 899 | | // Also effectively when the multipliers for smaller blocks are | 900 | | // below 1, this raises the bar for the bigger blocks even higher | 901 | | // in that sense these constants are not independent (e.g. changing | 902 | | // the constant for DCT16x32 by -5% (making it more likely) also | 903 | | // means that DCT32x32 becomes harder to do when starting from | 904 | | // two DCT16x32s). It might be better to make them more independent, | 905 | | // e.g. by not applying the multiplier when storing the new entropy | 906 | | // estimates in TryMergeToACSCandidate(). | 907 | 51.9k | const MergeTry kTransformsForMerge[9] = { | 908 | 51.9k | {AcStrategyType::DCT16X8, 2, 4, 5, entropy_mul16X8}, | 909 | 51.9k | {AcStrategyType::DCT8X16, 2, 4, 5, entropy_mul16X8}, | 910 | | // FindBestFirstLevelDivisionForSquare looks for DCT16X16 and its | 911 | | // subdivisions. {AcStrategyType::DCT16X16, 3, entropy_mul16X16}, | 912 | 51.9k | {AcStrategyType::DCT16X32, 4, 4, 4, entropy_mul16X32}, | 913 | 51.9k | {AcStrategyType::DCT32X16, 4, 4, 4, entropy_mul16X32}, | 914 | | // FindBestFirstLevelDivisionForSquare looks for DCT32X32 and its | 915 | | // subdivisions. 
{AcStrategyType::DCT32X32, 5, 1, 5, | 916 | | // 0.9822994906548809f}, | 917 | 51.9k | {AcStrategyType::DCT64X32, 6, 1, 3, entropy_mul64X32}, | 918 | 51.9k | {AcStrategyType::DCT32X64, 6, 1, 3, entropy_mul64X32}, | 919 | | // {AcStrategyType::DCT64X64, 8, 1, 3, 2.0846542128012948f}, | 920 | 51.9k | }; | 921 | | /* | 922 | | These sizes not yet included in merge heuristic: | 923 | | set(AcStrategyType::DCT32X8, 0.0f, 2.261390410971102f); | 924 | | set(AcStrategyType::DCT8X32, 0.0f, 2.261390410971102f); | 925 | | set(AcStrategyType::DCT128X128, 0.0f, 1.0f); | 926 | | set(AcStrategyType::DCT128X64, 0.0f, 0.73f); | 927 | | set(AcStrategyType::DCT64X128, 0.0f, 0.73f); | 928 | | set(AcStrategyType::DCT256X256, 0.0f, 1.0f); | 929 | | set(AcStrategyType::DCT256X128, 0.0f, 0.73f); | 930 | | set(AcStrategyType::DCT128X256, 0.0f, 0.73f); | 931 | | */ | 932 | | | 933 | | // Priority is a tricky kludge to avoid collisions so that transforms | 934 | | // don't overlap. | 935 | 51.9k | uint8_t priority[64] = {}; | 936 | 51.9k | bool enable_32x32 = cparams.decoding_speed_tier < 4; | 937 | 467k | for (auto mt : kTransformsForMerge) { | 938 | 467k | if (mt.decoding_speed_tier_max_limit < cparams.decoding_speed_tier) { | 939 | 0 | continue; | 940 | 0 | } | 941 | 467k | AcStrategy acs = AcStrategy::FromRawStrategy(mt.type); | 942 | | | 943 | 2.66M | for (size_t cy = 0; cy + acs.covered_blocks_y() - 1 < rect.ysize(); | 944 | 2.19M | cy += acs.covered_blocks_y()) { | 945 | 14.6M | for (size_t cx = 0; cx + acs.covered_blocks_x() - 1 < rect.xsize(); | 946 | 12.4M | cx += acs.covered_blocks_x()) { | 947 | 12.4M | if (cy + 7 < rect.ysize() && cx + 7 < rect.xsize()) { | 948 | 365k | if (cparams.decoding_speed_tier < 4 && | 949 | 365k | mt.type == AcStrategyType::DCT32X64) { | 950 | | // We handle both DCT8X16 and DCT16X8 at the same time. 
| 951 | 40.6k | if ((cy | cx) % 8 == 0) { | 952 | 40.6k | JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare( | 953 | 40.6k | 8, true, bx, by, cx, cy, config, cmap_factors, ac_strategy, | 954 | 40.6k | mt.entropy_mul, entropy_mul64X64, entropy_estimate, block, | 955 | 40.6k | scratch_space, quantized)); | 956 | 40.6k | } | 957 | 40.6k | continue; | 958 | 325k | } else if (mt.type == AcStrategyType::DCT32X16) { | 959 | | // We handled both DCT8X16 and DCT16X8 at the same time, | 960 | | // and that is above. The last column and last row, | 961 | | // when the last column or last row is odd numbered, | 962 | | // are still handled by TryMergeAcs. | 963 | 40.6k | continue; | 964 | 40.6k | } | 965 | 365k | } | 966 | 12.3M | if ((mt.type == AcStrategyType::DCT16X32 && cy % 4 != 0) || | 967 | 12.3M | (mt.type == AcStrategyType::DCT32X16 && cx % 4 != 0)) { | 968 | | // already covered by FindBest32X32 | 969 | 342k | continue; | 970 | 342k | } | 971 | | | 972 | 12.0M | if (cy + 3 < rect.ysize() && cx + 3 < rect.xsize()) { | 973 | 4.94M | if (mt.type == AcStrategyType::DCT16X32) { | 974 | | // We handle both DCT8X16 and DCT16X8 at the same time. | 975 | 171k | if ((cy | cx) % 4 == 0) { | 976 | 171k | JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare( | 977 | 171k | 4, enable_32x32, bx, by, cx, cy, config, cmap_factors, | 978 | 171k | ac_strategy, mt.entropy_mul, entropy_mul32X32, | 979 | 171k | entropy_estimate, block, scratch_space, quantized)); | 980 | 171k | } | 981 | 171k | continue; | 982 | 4.76M | } else if (mt.type == AcStrategyType::DCT32X16) { | 983 | | // We handled both DCT8X16 and DCT16X8 at the same time, | 984 | | // and that is above. The last column and last row, | 985 | | // when the last column or last row is odd numbered, | 986 | | // are still handled by TryMergeAcs. 
| 987 | 130k | continue; | 988 | 130k | } | 989 | 4.94M | } | 990 | 11.7M | if ((mt.type == AcStrategyType::DCT16X32 && cy % 4 != 0) || | 991 | 11.7M | (mt.type == AcStrategyType::DCT32X16 && cx % 4 != 0)) { | 992 | | // already covered by FindBest32X32 | 993 | 0 | continue; | 994 | 0 | } | 995 | 11.7M | if (cy + 1 < rect.ysize() && cx + 1 < rect.xsize()) { | 996 | 9.16M | if (mt.type == AcStrategyType::DCT8X16) { | 997 | | // We handle both DCT8X16 and DCT16X8 at the same time. | 998 | 1.25M | if ((cy | cx) % 2 == 0) { | 999 | 712k | JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare( | 1000 | 712k | 2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy, | 1001 | 712k | mt.entropy_mul, entropy_mul16X16, entropy_estimate, block, | 1002 | 712k | scratch_space, quantized)); | 1003 | 712k | } | 1004 | 1.25M | continue; | 1005 | 7.91M | } else if (mt.type == AcStrategyType::DCT16X8) { | 1006 | | // We handled both DCT8X16 and DCT16X8 at the same time, | 1007 | | // and that is above. The last column and last row, | 1008 | | // when the last column or last row is odd numbered, | 1009 | | // are still handled by TryMergeAcs. | 1010 | 1.24M | continue; | 1011 | 1.24M | } | 1012 | 9.16M | } | 1013 | 9.22M | if ((mt.type == AcStrategyType::DCT8X16 && cy % 2 == 1) || | 1014 | 9.22M | (mt.type == AcStrategyType::DCT16X8 && cx % 2 == 1)) { | 1015 | | // already covered by FindBestFirstLevelDivisionForSquare | 1016 | 359k | continue; | 1017 | 359k | } | 1018 | | // All other merge sizes are handled here. | 1019 | | // Some of the DCT16X8s and DCT8X16s will still leak through here | 1020 | | // when there is an odd number of 8x8 blocks, then the last row | 1021 | | // and column will get their DCT16X8s and DCT8X16s through the | 1022 | | // normal integral transform merging process. 
| 1023 | 8.86M | JXL_RETURN_IF_ERROR( | 1024 | 8.86M | TryMergeAcs(mt.type, bx, by, cx, cy, config, cmap_factors, | 1025 | 8.86M | ac_strategy, mt.entropy_mul, mt.priority, &priority[0], | 1026 | 8.86M | entropy_estimate, block, scratch_space, quantized)); | 1027 | 8.86M | } | 1028 | 2.19M | } | 1029 | 467k | } | 1030 | 51.9k | if (cparams.speed_tier >= SpeedTier::kHare) { | 1031 | 0 | return true; | 1032 | 0 | } | 1033 | | // Here we still try to do some non-aligned matching, find a few more | 1034 | | // 16X8, 8X16 and 16X16s between the non-2-aligned blocks. | 1035 | 393k | for (size_t cy = 0; cy + 1 < rect.ysize(); ++cy) { | 1036 | 2.51M | for (size_t cx = 0; cx + 1 < rect.xsize(); ++cx) { | 1037 | 2.17M | if ((cy | cx) % 2 != 0) { | 1038 | 1.46M | JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare( | 1039 | 1.46M | 2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy, | 1040 | 1.46M | entropy_mul16X8, entropy_mul16X16, entropy_estimate, block, | 1041 | 1.46M | scratch_space, quantized)); | 1042 | 1.46M | } | 1043 | 2.17M | } | 1044 | 341k | } | 1045 | | // Non-aligned matching for 32X32, 16X32 and 32X16. | 1046 | 51.9k | size_t step = cparams.speed_tier >= SpeedTier::kTortoise ? 2 : 1; | 1047 | 194k | for (size_t cy = 0; cy + 3 < rect.ysize(); cy += step) { | 1048 | 529k | for (size_t cx = 0; cx + 3 < rect.xsize(); cx += step) { | 1049 | 386k | if ((cy | cx) % 4 == 0) { | 1050 | 171k | continue; // Already tried with loop above (DCT16X32 case). | 1051 | 171k | } | 1052 | 214k | JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare( | 1053 | 214k | 4, enable_32x32, bx, by, cx, cy, config, cmap_factors, ac_strategy, | 1054 | 214k | entropy_mul16X32, entropy_mul32X32, entropy_estimate, block, | 1055 | 214k | scratch_space, quantized)); | 1056 | 214k | } | 1057 | 143k | } | 1058 | 51.9k | return true; | 1059 | 51.9k | } |
Unexecuted instantiation: jxl::N_AVX3::ProcessRectACS(jxl::CompressParams const&, jxl::ACSConfig const&, jxl::RectT<unsigned long> const&, jxl::ColorCorrelationMap const&, float*, unsigned int*, jxl::AcStrategyImage*) Unexecuted instantiation: jxl::N_AVX3_ZEN4::ProcessRectACS(jxl::CompressParams const&, jxl::ACSConfig const&, jxl::RectT<unsigned long> const&, jxl::ColorCorrelationMap const&, float*, unsigned int*, jxl::AcStrategyImage*) Unexecuted instantiation: jxl::N_AVX3_SPR::ProcessRectACS(jxl::CompressParams const&, jxl::ACSConfig const&, jxl::RectT<unsigned long> const&, jxl::ColorCorrelationMap const&, float*, unsigned int*, jxl::AcStrategyImage*) Unexecuted instantiation: jxl::N_SSE2::ProcessRectACS(jxl::CompressParams const&, jxl::ACSConfig const&, jxl::RectT<unsigned long> const&, jxl::ColorCorrelationMap const&, float*, unsigned int*, jxl::AcStrategyImage*) |
1060 | | |
1061 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
1062 | | } // namespace HWY_NAMESPACE |
1063 | | } // namespace jxl |
1064 | | HWY_AFTER_NAMESPACE(); |
1065 | | |
1066 | | #if HWY_ONCE |
1067 | | namespace jxl { |
1068 | | HWY_EXPORT(ProcessRectACS); |
1069 | | |
1070 | | Status AcStrategyHeuristics::Init(const Image3F& src, const Rect& rect_in, |
1071 | | const ImageF& quant_field, const ImageF& mask, |
1072 | | const ImageF& mask1x1, |
1073 | 2.13k | DequantMatrices* matrices) { |
1074 | 2.13k | config.dequant = matrices; |
1075 | | |
1076 | 2.13k | if (cparams.speed_tier >= SpeedTier::kCheetah) { |
1077 | 0 | JXL_RETURN_IF_ERROR( |
1078 | 0 | matrices->EnsureComputed(memory_manager, 1)); // DCT8 only |
1079 | 2.13k | } else { |
1080 | 2.13k | uint32_t acs_mask = 0; |
1081 | | // All transforms up to 64x64. |
1082 | 46.8k | for (size_t i = 0; i < static_cast<size_t>(AcStrategyType::DCT128X128); |
1083 | 44.7k | i++) { |
1084 | 44.7k | acs_mask |= (1 << i); |
1085 | 44.7k | } |
1086 | 2.13k | JXL_RETURN_IF_ERROR(matrices->EnsureComputed(memory_manager, acs_mask)); |
1087 | 2.13k | } |
1088 | | |
1089 | | // Image row pointers and strides. |
1090 | 2.13k | config.quant_field_row = quant_field.Row(0); |
1091 | 2.13k | config.quant_field_stride = quant_field.PixelsPerRow(); |
1092 | 2.13k | if (mask.xsize() > 0 && mask.ysize() > 0) { |
1093 | 2.13k | config.masking_field_row = mask.Row(0); |
1094 | 2.13k | config.masking_field_stride = mask.PixelsPerRow(); |
1095 | 2.13k | } |
1096 | 2.13k | config.mask1x1_xsize = mask1x1.xsize(); |
1097 | 2.13k | if (mask1x1.xsize() > 0 && mask1x1.ysize() > 0) { |
1098 | 2.13k | config.masking1x1_field_row = mask1x1.Row(0); |
1099 | 2.13k | config.masking1x1_field_stride = mask1x1.PixelsPerRow(); |
1100 | 2.13k | } |
1101 | | |
1102 | 2.13k | config.src_rows[0] = rect_in.ConstPlaneRow(src, 0, 0); |
1103 | 2.13k | config.src_rows[1] = rect_in.ConstPlaneRow(src, 1, 0); |
1104 | 2.13k | config.src_rows[2] = rect_in.ConstPlaneRow(src, 2, 0); |
1105 | 2.13k | config.src_stride = src.PixelsPerRow(); |
1106 | | |
1107 | | // Entropy estimate is composed of two factors: |
1108 | | // - estimate of the number of bits that will be used by the block |
1109 | | // - information loss due to quantization |
1110 | | // The following constant controls the relative weights of these components. |
1111 | 2.13k | config.info_loss_multiplier = 1.2; |
1112 | 2.13k | config.zeros_mul = 9.3089059022677905; |
1113 | 2.13k | config.cost_delta = 10.833273317067883; |
1114 | | |
1115 | 2.13k | static const float kBias = 0.13731742964354549; |
1116 | 2.13k | const float ratio = (cparams.butteraugli_distance + kBias) / (1.0f + kBias); |
1117 | | |
1118 | 2.13k | static const float kPow1 = 0.33677806662454718; |
1119 | 2.13k | static const float kPow2 = 0.50990926717963703; |
1120 | 2.13k | static const float kPow3 = 0.36702940662370243; |
1121 | 2.13k | config.info_loss_multiplier *= std::pow(ratio, kPow1); |
1122 | 2.13k | config.zeros_mul *= std::pow(ratio, kPow2); |
1123 | 2.13k | config.cost_delta *= std::pow(ratio, kPow3); |
1124 | 2.13k | return true; |
1125 | 2.13k | } |
1126 | | |
1127 | 2.13k | Status AcStrategyHeuristics::PrepareForThreads(std::size_t num_threads) { |
1128 | 2.13k | const size_t dct_scratch_size = |
1129 | 2.13k | 3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim; |
1130 | 2.13k | mem_per_thread = 6 * AcStrategy::kMaxCoeffArea + dct_scratch_size; |
1131 | 2.13k | size_t mem_bytes = num_threads * mem_per_thread * sizeof(float); |
1132 | 2.13k | JXL_ASSIGN_OR_RETURN(mem, AlignedMemory::Create(memory_manager, mem_bytes)); |
1133 | 2.13k | qmem_per_thread = AcStrategy::kMaxCoeffArea; |
1134 | 2.13k | size_t qmem_bytes = num_threads * qmem_per_thread * sizeof(uint32_t); |
1135 | 2.13k | JXL_ASSIGN_OR_RETURN(qmem, AlignedMemory::Create(memory_manager, qmem_bytes)); |
1136 | 2.13k | return true; |
1137 | 2.13k | } |
1138 | | |
1139 | | Status AcStrategyHeuristics::ProcessRect(const Rect& rect, |
1140 | | const ColorCorrelationMap& cmap, |
1141 | | AcStrategyImage* ac_strategy, |
1142 | 51.9k | size_t thread) { |
1143 | | // In Cheetah mode, use DCT8 everywhere and uniform quantization. |
1144 | 51.9k | if (cparams.speed_tier >= SpeedTier::kCheetah) { |
1145 | 0 | ac_strategy->FillDCT8(rect); |
1146 | 0 | return true; |
1147 | 0 | } |
1148 | 51.9k | return HWY_DYNAMIC_DISPATCH(ProcessRectACS)( |
1149 | 51.9k | cparams, config, rect, cmap, |
1150 | 51.9k | mem.address<float>() + thread * mem_per_thread, |
1151 | 51.9k | qmem.address<uint32_t>() + thread * qmem_per_thread, ac_strategy); |
1152 | 51.9k | } |
1153 | | |
1154 | | Status AcStrategyHeuristics::Finalize(const FrameDimensions& frame_dim, |
1155 | | const AcStrategyImage& ac_strategy, |
1156 | 2.13k | AuxOut* aux_out) { |
1157 | | // Accounting and debug output. |
1158 | 2.13k | if (aux_out != nullptr) { |
1159 | 0 | aux_out->num_small_blocks = |
1160 | 0 | ac_strategy.CountBlocks(AcStrategyType::IDENTITY) + |
1161 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT2X2) + |
1162 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT4X4); |
1163 | 0 | aux_out->num_dct4x8_blocks = |
1164 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT4X8) + |
1165 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT8X4); |
1166 | 0 | aux_out->num_afv_blocks = ac_strategy.CountBlocks(AcStrategyType::AFV0) + |
1167 | 0 | ac_strategy.CountBlocks(AcStrategyType::AFV1) + |
1168 | 0 | ac_strategy.CountBlocks(AcStrategyType::AFV2) + |
1169 | 0 | ac_strategy.CountBlocks(AcStrategyType::AFV3); |
1170 | 0 | aux_out->num_dct8_blocks = ac_strategy.CountBlocks(AcStrategyType::DCT); |
1171 | 0 | aux_out->num_dct8x16_blocks = |
1172 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT8X16) + |
1173 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT16X8); |
1174 | 0 | aux_out->num_dct8x32_blocks = |
1175 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT8X32) + |
1176 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT32X8); |
1177 | 0 | aux_out->num_dct16_blocks = |
1178 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT16X16); |
1179 | 0 | aux_out->num_dct16x32_blocks = |
1180 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT16X32) + |
1181 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT32X16); |
1182 | 0 | aux_out->num_dct32_blocks = |
1183 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT32X32); |
1184 | 0 | aux_out->num_dct32x64_blocks = |
1185 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT32X64) + |
1186 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT64X32); |
1187 | 0 | aux_out->num_dct64_blocks = |
1188 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT64X64); |
1189 | 0 | } |
1190 | | |
1191 | 2.13k | if (JXL_DEBUG_AC_STRATEGY && WantDebugOutput(cparams)) { |
1192 | 0 | JXL_RETURN_IF_ERROR(DumpAcStrategy(ac_strategy, frame_dim.xsize, |
1193 | 0 | frame_dim.ysize, "ac_strategy", aux_out, |
1194 | 0 | cparams)); |
1195 | 0 | } |
1196 | 2.13k | return true; |
1197 | 2.13k | } |
1198 | | |
1199 | | } // namespace jxl |
1200 | | #endif // HWY_ONCE |