/src/libjxl/lib/jxl/enc_ac_strategy.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jxl/enc_ac_strategy.h" |
7 | | |
8 | | #include <algorithm> |
9 | | #include <cmath> |
10 | | #include <cstdint> |
11 | | #include <cstdio> |
12 | | #include <cstring> |
13 | | #include <limits> |
14 | | |
15 | | #include "lib/jxl/chroma_from_luma.h" |
16 | | #include "lib/jxl/common.h" |
17 | | #include "lib/jxl/frame_dimensions.h" |
18 | | #include "lib/jxl/image.h" |
19 | | #include "lib/jxl/memory_manager_internal.h" |
20 | | #include "lib/jxl/quant_weights.h" |
21 | | |
22 | | #undef HWY_TARGET_INCLUDE |
23 | | #define HWY_TARGET_INCLUDE "lib/jxl/enc_ac_strategy.cc" |
24 | | #include <hwy/foreach_target.h> |
25 | | #include <hwy/highway.h> |
26 | | |
27 | | #include "lib/jxl/ac_strategy.h" |
28 | | #include "lib/jxl/base/bits.h" |
29 | | #include "lib/jxl/base/compiler_specific.h" |
30 | | #include "lib/jxl/base/fast_math-inl.h" |
31 | | #include "lib/jxl/base/rect.h" |
32 | | #include "lib/jxl/base/status.h" |
33 | | #include "lib/jxl/dec_transforms-inl.h" |
34 | | #include "lib/jxl/enc_aux_out.h" |
35 | | #include "lib/jxl/enc_debug_image.h" |
36 | | #include "lib/jxl/enc_params.h" |
37 | | #include "lib/jxl/enc_transforms-inl.h" |
38 | | #include "lib/jxl/simd_util.h" |
39 | | |
40 | | // Some of the floating point constants in this file and in other |
41 | | // files in the libjxl project have been obtained using the |
42 | | // tools/optimizer/simplex_fork.py tool. It is a variation of |
43 | | // Nelder-Mead optimization, and we generally try to minimize |
44 | | // BPP * pnorm aggregate as reported by the benchmark_xl tool, |
45 | | // but occasionally the values are optimized by using additional |
46 | | // constraints such as maintaining a certain density, or ratio of |
47 | | // popularity of integral transforms. Jyrki visually reviews all |
48 | | // such changes and often makes manual changes to maintain good |
49 | | // visual quality in cases where butteraugli was not sufficiently |
50 | | // sensitive to some kind of degradation. Unfortunately image quality |
51 | | // is still more of an art than science. |
52 | | |
53 | | // Set JXL_DEBUG_AC_STRATEGY to 1 to enable debugging. |
54 | | #ifndef JXL_DEBUG_AC_STRATEGY |
55 | 372 | #define JXL_DEBUG_AC_STRATEGY 0 |
56 | | #endif |
57 | | |
58 | | // This must come before the begin/end_target, but HWY_ONCE is only true |
59 | | // after that, so use an "include guard". |
60 | | #ifndef LIB_JXL_ENC_AC_STRATEGY_ |
61 | | #define LIB_JXL_ENC_AC_STRATEGY_ |
62 | | // Parameters of the heuristic are marked with an OPTIMIZE comment. |
63 | | namespace jxl { |
64 | | namespace { |
65 | | |
66 | | // Debugging utilities. |
67 | | |
68 | | // Returns a linear sRGB color (as bytes) for each AC strategy. |
69 | 0 | const uint8_t* TypeColor(uint8_t raw_strategy) { |
70 | 0 | JXL_DASSERT(AcStrategy::IsRawStrategyValid(raw_strategy)); |
71 | 0 | static_assert(AcStrategy::kNumValidStrategies == 27, "Update colors"); |
72 | 0 | static constexpr uint8_t kColors[AcStrategy::kNumValidStrategies + 1][3] = { |
73 | 0 | {0xFF, 0xFF, 0x00}, // DCT8 | yellow |
74 | 0 | {0xFF, 0x80, 0x80}, // HORNUSS | vivid tangerine |
75 | 0 | {0xFF, 0x80, 0x80}, // DCT2x2 | vivid tangerine |
76 | 0 | {0xFF, 0x80, 0x80}, // DCT4x4 | vivid tangerine |
77 | 0 | {0x80, 0xFF, 0x00}, // DCT16x16 | chartreuse |
78 | 0 | {0x00, 0xC0, 0x00}, // DCT32x32 | waystone green |
79 | 0 | {0xC0, 0xFF, 0x00}, // DCT16x8 | lime |
80 | 0 | {0xC0, 0xFF, 0x00}, // DCT8x16 | lime |
81 | 0 | {0x00, 0xFF, 0x00}, // DCT32x8 | green |
82 | 0 | {0x00, 0xFF, 0x00}, // DCT8x32 | green |
83 | 0 | {0x00, 0xFF, 0x00}, // DCT32x16 | green |
84 | 0 | {0x00, 0xFF, 0x00}, // DCT16x32 | green |
85 | 0 | {0xFF, 0x80, 0x00}, // DCT4x8 | orange juice |
86 | 0 | {0xFF, 0x80, 0x00}, // DCT8x4 | orange juice |
87 | 0 | {0xFF, 0xFF, 0x80}, // AFV0 | butter |
88 | 0 | {0xFF, 0xFF, 0x80}, // AFV1 | butter |
89 | 0 | {0xFF, 0xFF, 0x80}, // AFV2 | butter |
90 | 0 | {0xFF, 0xFF, 0x80}, // AFV3 | butter |
91 | 0 | {0x00, 0xC0, 0xFF}, // DCT64x64 | capri |
92 | 0 | {0x00, 0xFF, 0xFF}, // DCT64x32 | aqua |
93 | 0 | {0x00, 0xFF, 0xFF}, // DCT32x64 | aqua |
94 | 0 | {0x00, 0x40, 0xFF}, // DCT128x128 | rare blue |
95 | 0 | {0x00, 0x80, 0xFF}, // DCT128x64 | magic ink |
96 | 0 | {0x00, 0x80, 0xFF}, // DCT64x128 | magic ink |
97 | 0 | {0x00, 0x00, 0xC0}, // DCT256x256 | keese blue |
98 | 0 | {0x00, 0x00, 0xFF}, // DCT256x128 | blue |
99 | 0 | {0x00, 0x00, 0xFF}, // DCT128x256 | blue |
100 | 0 | {0x00, 0x00, 0x00} // invalid | black |
101 | 0 | }; |
102 | 0 | raw_strategy = |
103 | 0 | Clamp1<uint8_t>(raw_strategy, 0, AcStrategy::kNumValidStrategies); |
104 | 0 | return kColors[raw_strategy]; |
105 | 0 | } |
106 | | |
107 | 0 | const uint8_t* TypeMask(uint8_t raw_strategy) { |
108 | 0 | JXL_DASSERT(AcStrategy::IsRawStrategyValid(raw_strategy)); |
109 | 0 | static_assert(AcStrategy::kNumValidStrategies == 27, "Update masks"); |
110 | 0 | // implicitly, first row and column are made dark |
111 | 0 | static constexpr uint8_t kMask[AcStrategy::kNumValidStrategies + 1][64] = { |
112 | 0 | { |
113 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
114 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
115 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
116 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
117 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
118 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
119 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
120 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
121 | 0 | }, // DCT8 |
122 | 0 | { |
123 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
124 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
125 | 0 | 0, 0, 1, 0, 0, 1, 0, 0, // |
126 | 0 | 0, 0, 1, 0, 0, 1, 0, 0, // |
127 | 0 | 0, 0, 1, 1, 1, 1, 0, 0, // |
128 | 0 | 0, 0, 1, 0, 0, 1, 0, 0, // |
129 | 0 | 0, 0, 1, 0, 0, 1, 0, 0, // |
130 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
131 | 0 | }, // HORNUSS |
132 | 0 | { |
133 | 0 | 1, 1, 1, 1, 1, 1, 1, 1, // |
134 | 0 | 1, 0, 1, 0, 1, 0, 1, 0, // |
135 | 0 | 1, 1, 1, 1, 1, 1, 1, 1, // |
136 | 0 | 1, 0, 1, 0, 1, 0, 1, 0, // |
137 | 0 | 1, 1, 1, 1, 1, 1, 1, 1, // |
138 | 0 | 1, 0, 1, 0, 1, 0, 1, 0, // |
139 | 0 | 1, 1, 1, 1, 1, 1, 1, 1, // |
140 | 0 | 1, 0, 1, 0, 1, 0, 1, 0, // |
141 | 0 | }, // 2x2 |
142 | 0 | { |
143 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
144 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
145 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
146 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
147 | 0 | 1, 1, 1, 1, 1, 1, 1, 1, // |
148 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
149 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
150 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
151 | 0 | }, // 4x4 |
152 | 0 | {}, // DCT16x16 (unused) |
153 | 0 | {}, // DCT32x32 (unused) |
154 | 0 | {}, // DCT16x8 (unused) |
155 | 0 | {}, // DCT8x16 (unused) |
156 | 0 | {}, // DCT32x8 (unused) |
157 | 0 | {}, // DCT8x32 (unused) |
158 | 0 | {}, // DCT32x16 (unused) |
159 | 0 | {}, // DCT16x32 (unused) |
160 | 0 | { |
161 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
162 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
163 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
164 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
165 | 0 | 1, 1, 1, 1, 1, 1, 1, 1, // |
166 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
167 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
168 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
169 | 0 | }, // DCT4x8 |
170 | 0 | { |
171 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
172 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
173 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
174 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
175 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
176 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
177 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
178 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
179 | 0 | }, // DCT8x4 |
180 | 0 | { |
181 | 0 | 1, 1, 1, 1, 1, 0, 0, 0, // |
182 | 0 | 1, 1, 1, 1, 0, 0, 0, 0, // |
183 | 0 | 1, 1, 1, 0, 0, 0, 0, 0, // |
184 | 0 | 1, 1, 0, 0, 0, 0, 0, 0, // |
185 | 0 | 1, 0, 0, 0, 0, 0, 0, 0, // |
186 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
187 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
188 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
189 | 0 | }, // AFV0 |
190 | 0 | { |
191 | 0 | 0, 0, 0, 0, 1, 1, 1, 1, // |
192 | 0 | 0, 0, 0, 0, 0, 1, 1, 1, // |
193 | 0 | 0, 0, 0, 0, 0, 0, 1, 1, // |
194 | 0 | 0, 0, 0, 0, 0, 0, 0, 1, // |
195 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
196 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
197 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
198 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
199 | 0 | }, // AFV1 |
200 | 0 | { |
201 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
202 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
203 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
204 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
205 | 0 | 1, 0, 0, 0, 0, 0, 0, 0, // |
206 | 0 | 1, 1, 0, 0, 0, 0, 0, 0, // |
207 | 0 | 1, 1, 1, 0, 0, 0, 0, 0, // |
208 | 0 | 1, 1, 1, 1, 0, 0, 0, 0, // |
209 | 0 | }, // AFV2 |
210 | 0 | { |
211 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
212 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
213 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
214 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
215 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
216 | 0 | 0, 0, 0, 0, 0, 0, 0, 1, // |
217 | 0 | 0, 0, 0, 0, 0, 0, 1, 1, // |
218 | 0 | 0, 0, 0, 0, 0, 1, 1, 1, // |
219 | 0 | }, // AFV3 |
220 | 0 | {} // invalid |
221 | 0 | }; |
222 | 0 | raw_strategy = |
223 | 0 | Clamp1<uint8_t>(raw_strategy, 0, AcStrategy::kNumValidStrategies); |
224 | 0 | return kMask[raw_strategy]; |
225 | 0 | } |
226 | | |
227 | | Status DumpAcStrategy(const AcStrategyImage& ac_strategy, size_t xsize, |
228 | | size_t ysize, const char* tag, AuxOut* aux_out, |
229 | 0 | const CompressParams& cparams) { |
230 | 0 | JxlMemoryManager* memory_manager = ac_strategy.memory_manager(); |
231 | 0 | JXL_ASSIGN_OR_RETURN(Image3F color_acs, |
232 | 0 | Image3F::Create(memory_manager, xsize, ysize)); |
233 | 0 | for (size_t y = 0; y < ysize; y++) { |
234 | 0 | float* JXL_RESTRICT rows[3] = { |
235 | 0 | color_acs.PlaneRow(0, y), |
236 | 0 | color_acs.PlaneRow(1, y), |
237 | 0 | color_acs.PlaneRow(2, y), |
238 | 0 | }; |
239 | 0 | const AcStrategyRow acs_row = ac_strategy.ConstRow(y / kBlockDim); |
240 | 0 | for (size_t x = 0; x < xsize; x++) { |
241 | 0 | AcStrategy acs = acs_row[x / kBlockDim]; |
242 | 0 | const uint8_t* JXL_RESTRICT color = TypeColor(acs.RawStrategy()); |
243 | 0 | for (size_t c = 0; c < 3; c++) { |
244 | 0 | rows[c][x] = color[c] / 255.f; |
245 | 0 | } |
246 | 0 | } |
247 | 0 | } |
248 | 0 | size_t stride = color_acs.PixelsPerRow(); |
249 | 0 | for (size_t c = 0; c < 3; c++) { |
250 | 0 | for (size_t by = 0; by < DivCeil(ysize, kBlockDim); by++) { |
251 | 0 | float* JXL_RESTRICT row = color_acs.PlaneRow(c, by * kBlockDim); |
252 | 0 | const AcStrategyRow acs_row = ac_strategy.ConstRow(by); |
253 | 0 | for (size_t bx = 0; bx < DivCeil(xsize, kBlockDim); bx++) { |
254 | 0 | AcStrategy acs = acs_row[bx]; |
255 | 0 | if (!acs.IsFirstBlock()) continue; |
256 | 0 | const uint8_t* JXL_RESTRICT color = TypeColor(acs.RawStrategy()); |
257 | 0 | const uint8_t* JXL_RESTRICT mask = TypeMask(acs.RawStrategy()); |
258 | 0 | if (acs.covered_blocks_x() == 1 && acs.covered_blocks_y() == 1) { |
259 | 0 | for (size_t iy = 0; iy < kBlockDim && by * kBlockDim + iy < ysize; |
260 | 0 | iy++) { |
261 | 0 | for (size_t ix = 0; ix < kBlockDim && bx * kBlockDim + ix < xsize; |
262 | 0 | ix++) { |
263 | 0 | if (mask[iy * kBlockDim + ix]) { |
264 | 0 | row[iy * stride + bx * kBlockDim + ix] = color[c] / 800.f; |
265 | 0 | } |
266 | 0 | } |
267 | 0 | } |
268 | 0 | } |
269 | 0 | // draw block edges |
270 | 0 | for (size_t ix = 0; ix < kBlockDim * acs.covered_blocks_x() && |
271 | 0 | bx * kBlockDim + ix < xsize; |
272 | 0 | ix++) { |
273 | 0 | row[0 * stride + bx * kBlockDim + ix] = color[c] / 350.f; |
274 | 0 | } |
275 | 0 | for (size_t iy = 0; iy < kBlockDim * acs.covered_blocks_y() && |
276 | 0 | by * kBlockDim + iy < ysize; |
277 | 0 | iy++) { |
278 | 0 | row[iy * stride + bx * kBlockDim + 0] = color[c] / 350.f; |
279 | 0 | } |
280 | 0 | } |
281 | 0 | } |
282 | 0 | } |
283 | 0 | return DumpImage(cparams, tag, color_acs); |
284 | 0 | } |
285 | | |
286 | | } // namespace |
287 | | } // namespace jxl |
288 | | #endif // LIB_JXL_ENC_AC_STRATEGY_ |
289 | | |
290 | | HWY_BEFORE_NAMESPACE(); |
291 | | namespace jxl { |
292 | | namespace HWY_NAMESPACE { |
293 | | |
294 | | // These templates are not found via ADL. |
295 | | using hwy::HWY_NAMESPACE::AbsDiff; |
296 | | using hwy::HWY_NAMESPACE::Eq; |
297 | | using hwy::HWY_NAMESPACE::IfThenElseZero; |
298 | | using hwy::HWY_NAMESPACE::IfThenZeroElse; |
299 | | using hwy::HWY_NAMESPACE::Round; |
300 | | using hwy::HWY_NAMESPACE::Sqrt; |
301 | | |
302 | | bool MultiBlockTransformCrossesHorizontalBoundary( |
303 | | const AcStrategyImage& ac_strategy, size_t start_x, size_t y, |
304 | 838k | size_t end_x) { |
305 | 838k | if (start_x >= ac_strategy.xsize() || y >= ac_strategy.ysize()) { |
306 | 5.30k | return false; |
307 | 5.30k | } |
308 | 833k | if (y % 8 == 0) { |
309 | | // Nothing crosses 64x64 boundaries, and the memory on the other side |
310 | | // of the 64x64 block may still be uninitialized. |
311 | 125k | return false; |
312 | 125k | } |
313 | 707k | end_x = std::min(end_x, ac_strategy.xsize()); |
314 | | // The first multiblock might be before the start_x, let's adjust it |
315 | | // to point to the first IsFirstBlock() == true block we find by backward |
316 | | // tracing. |
317 | 707k | AcStrategyRow row = ac_strategy.ConstRow(y); |
318 | 707k | const size_t start_x_limit = start_x & ~7; |
319 | 982k | while (start_x != start_x_limit && !row[start_x].IsFirstBlock()) { |
320 | 275k | --start_x; |
321 | 275k | } |
322 | 2.06M | for (size_t x = start_x; x < end_x;) { |
323 | 1.46M | if (row[x].IsFirstBlock()) { |
324 | 1.35M | x += row[x].covered_blocks_x(); |
325 | 1.35M | } else { |
326 | 116k | return true; |
327 | 116k | } |
328 | 1.46M | } |
329 | 591k | return false; |
330 | 707k | } Unexecuted instantiation: jxl::N_SSE4::MultiBlockTransformCrossesHorizontalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long) jxl::N_AVX2::MultiBlockTransformCrossesHorizontalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long) Line | Count | Source | 304 | 838k | size_t end_x) { | 305 | 838k | if (start_x >= ac_strategy.xsize() || y >= ac_strategy.ysize()) { | 306 | 5.30k | return false; | 307 | 5.30k | } | 308 | 833k | if (y % 8 == 0) { | 309 | | // Nothing crosses 64x64 boundaries, and the memory on the other side | 310 | | // of the 64x64 block may still uninitialized. | 311 | 125k | return false; | 312 | 125k | } | 313 | 707k | end_x = std::min(end_x, ac_strategy.xsize()); | 314 | | // The first multiblock might be before the start_x, let's adjust it | 315 | | // to point to the first IsFirstBlock() == true block we find by backward | 316 | | // tracing. | 317 | 707k | AcStrategyRow row = ac_strategy.ConstRow(y); | 318 | 707k | const size_t start_x_limit = start_x & ~7; | 319 | 982k | while (start_x != start_x_limit && !row[start_x].IsFirstBlock()) { | 320 | 275k | --start_x; | 321 | 275k | } | 322 | 2.06M | for (size_t x = start_x; x < end_x;) { | 323 | 1.46M | if (row[x].IsFirstBlock()) { | 324 | 1.35M | x += row[x].covered_blocks_x(); | 325 | 1.35M | } else { | 326 | 116k | return true; | 327 | 116k | } | 328 | 1.46M | } | 329 | 591k | return false; | 330 | 707k | } |
Unexecuted instantiation: jxl::N_SSE2::MultiBlockTransformCrossesHorizontalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long) |
331 | | |
332 | | bool MultiBlockTransformCrossesVerticalBoundary( |
333 | | const AcStrategyImage& ac_strategy, size_t x, size_t start_y, |
334 | 682k | size_t end_y) { |
335 | 682k | if (x >= ac_strategy.xsize() || start_y >= ac_strategy.ysize()) { |
336 | 3.78k | return false; |
337 | 3.78k | } |
338 | 678k | if (x % 8 == 0) { |
339 | | // Nothing crosses 64x64 boundaries, and the memory on the other side |
340 | | // of the 64x64 block may still uninitialized. |
341 | 101k | return false; |
342 | 101k | } |
343 | 576k | end_y = std::min(end_y, ac_strategy.ysize()); |
344 | | // The first multiblock might be before the start_y, let's adjust it |
345 | | // to point to the first IsFirstBlock() == true block we find by backward |
346 | | // tracing. |
347 | 576k | const size_t start_y_limit = start_y & ~7; |
348 | 619k | while (start_y != start_y_limit && |
349 | 619k | !ac_strategy.ConstRow(start_y)[x].IsFirstBlock()) { |
350 | 42.7k | --start_y; |
351 | 42.7k | } |
352 | | |
353 | 1.81M | for (size_t y = start_y; y < end_y;) { |
354 | 1.27M | AcStrategyRow row = ac_strategy.ConstRow(y); |
355 | 1.27M | if (row[x].IsFirstBlock()) { |
356 | 1.24M | y += row[x].covered_blocks_y(); |
357 | 1.24M | } else { |
358 | 31.9k | return true; |
359 | 31.9k | } |
360 | 1.27M | } |
361 | 544k | return false; |
362 | 576k | } Unexecuted instantiation: jxl::N_SSE4::MultiBlockTransformCrossesVerticalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long) jxl::N_AVX2::MultiBlockTransformCrossesVerticalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long) Line | Count | Source | 334 | 682k | size_t end_y) { | 335 | 682k | if (x >= ac_strategy.xsize() || start_y >= ac_strategy.ysize()) { | 336 | 3.78k | return false; | 337 | 3.78k | } | 338 | 678k | if (x % 8 == 0) { | 339 | | // Nothing crosses 64x64 boundaries, and the memory on the other side | 340 | | // of the 64x64 block may still uninitialized. | 341 | 101k | return false; | 342 | 101k | } | 343 | 576k | end_y = std::min(end_y, ac_strategy.ysize()); | 344 | | // The first multiblock might be before the start_y, let's adjust it | 345 | | // to point to the first IsFirstBlock() == true block we find by backward | 346 | | // tracing. | 347 | 576k | const size_t start_y_limit = start_y & ~7; | 348 | 619k | while (start_y != start_y_limit && | 349 | 619k | !ac_strategy.ConstRow(start_y)[x].IsFirstBlock()) { | 350 | 42.7k | --start_y; | 351 | 42.7k | } | 352 | | | 353 | 1.81M | for (size_t y = start_y; y < end_y;) { | 354 | 1.27M | AcStrategyRow row = ac_strategy.ConstRow(y); | 355 | 1.27M | if (row[x].IsFirstBlock()) { | 356 | 1.24M | y += row[x].covered_blocks_y(); | 357 | 1.24M | } else { | 358 | 31.9k | return true; | 359 | 31.9k | } | 360 | 1.27M | } | 361 | 544k | return false; | 362 | 576k | } |
Unexecuted instantiation: jxl::N_SSE2::MultiBlockTransformCrossesVerticalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long) |
363 | | |
364 | | Status EstimateEntropy(const AcStrategy& acs, float entropy_mul, size_t x, |
365 | | size_t y, const ACSConfig& config, |
366 | | const float* JXL_RESTRICT cmap_factors, float* block, |
367 | | float* full_scratch_space, uint32_t* quantized, |
368 | 5.06M | float& entropy) { |
369 | 5.06M | entropy = 0.0f; |
370 | 5.06M | float* mem = full_scratch_space; |
371 | 5.06M | float* scratch_space = full_scratch_space + AcStrategy::kMaxCoeffArea; |
372 | 5.06M | const size_t size = (1 << acs.log2_covered_blocks()) * kDCTBlockSize; |
373 | | |
374 | | // Apply transform. |
375 | 20.2M | for (size_t c = 0; c < 3; c++) { |
376 | 15.1M | float* JXL_RESTRICT block_c = block + size * c; |
377 | 15.1M | TransformFromPixels(acs.Strategy(), &config.Pixel(c, x, y), |
378 | 15.1M | config.src_stride, block_c, scratch_space); |
379 | 15.1M | } |
380 | 5.06M | HWY_FULL(float) df; |
381 | | |
382 | 5.06M | const size_t num_blocks = acs.covered_blocks_x() * acs.covered_blocks_y(); |
383 | | // avoid large blocks when there is a lot going on in red-green. |
384 | 5.06M | float quant_norm16 = 0; |
385 | 5.06M | if (num_blocks == 1) { |
386 | | // When it is only one 8x8, we don't need aggregation of values. |
387 | 3.98M | quant_norm16 = config.Quant(x / 8, y / 8); |
388 | 3.98M | } else if (num_blocks == 2) { |
389 | | // Taking max instead of 16th norm seems to work |
390 | | // better for smallest blocks up to 16x8. Jyrki couldn't get |
391 | | // improvements in trying the same for 16x16 blocks. |
392 | 690k | if (acs.covered_blocks_y() == 2) { |
393 | 344k | quant_norm16 = |
394 | 344k | std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8, y / 8 + 1)); |
395 | 345k | } else { |
396 | 345k | quant_norm16 = |
397 | 345k | std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8 + 1, y / 8)); |
398 | 345k | } |
399 | 690k | } else { |
400 | | // Load QF value, calculate empirical heuristic on masking field |
401 | | // for weighting the information loss. Information loss manifests |
402 | | // itself as ringing, and masking could hide it. |
403 | 1.57M | for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { |
404 | 5.12M | for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { |
405 | 3.93M | float qval = config.Quant(x / 8 + ix, y / 8 + iy); |
406 | 3.93M | qval *= qval; |
407 | 3.93M | qval *= qval; |
408 | 3.93M | qval *= qval; |
409 | 3.93M | quant_norm16 += qval * qval; |
410 | 3.93M | } |
411 | 1.18M | } |
412 | 391k | quant_norm16 /= num_blocks; |
413 | 391k | quant_norm16 = FastPowf(quant_norm16, 1.0f / 16.0f); |
414 | 391k | } |
415 | 5.06M | const auto quant = Set(df, quant_norm16); |
416 | | |
417 | | // Compute entropy. |
418 | 5.06M | const HWY_CAPPED(float, 8) df8; |
419 | | |
420 | 5.06M | auto loss = Zero(df8); |
421 | 20.2M | for (size_t c = 0; c < 3; c++) { |
422 | 15.1M | const float* inv_matrix = config.dequant->InvMatrix(acs.Strategy(), c); |
423 | 15.1M | const float* matrix = config.dequant->Matrix(acs.Strategy(), c); |
424 | 15.1M | const auto cmap_factor = Set(df, cmap_factors[c]); |
425 | | |
426 | 15.1M | auto entropy_v = Zero(df); |
427 | 15.1M | auto nzeros_v = Zero(df); |
428 | 238M | for (size_t i = 0; i < num_blocks * kDCTBlockSize; i += Lanes(df)) { |
429 | 223M | const auto in = Load(df, block + c * size + i); |
430 | 223M | const auto in_y = Mul(Load(df, block + size + i), cmap_factor); |
431 | 223M | const auto im = Load(df, inv_matrix + i); |
432 | 223M | const auto val = Mul(Sub(in, in_y), Mul(im, quant)); |
433 | 223M | const auto rval = Round(val); |
434 | 223M | const auto diff = Sub(val, rval); |
435 | 223M | const auto m = Load(df, matrix + i); |
436 | 223M | Store(Mul(m, diff), df, &mem[i]); |
437 | 223M | const auto q = Abs(rval); |
438 | 223M | const auto q_is_zero = Eq(q, Zero(df)); |
439 | | // We used to have q * C here, but that cost model seems to |
440 | | // be punishing large values more than necessary. Sqrt tries |
441 | | // to avoid large values less aggressively. |
442 | 223M | entropy_v = Add(Sqrt(q), entropy_v); |
443 | 223M | nzeros_v = Add(nzeros_v, IfThenZeroElse(q_is_zero, Set(df, 1.0f))); |
444 | 223M | } |
445 | | |
446 | 15.1M | { |
447 | 15.1M | float masku_lut[3] = { |
448 | 15.1M | 12.0, |
449 | 15.1M | 0.0, |
450 | 15.1M | 4.0, |
451 | 15.1M | }; |
452 | 15.1M | auto masku_off = Set(df8, masku_lut[c]); |
453 | 15.1M | auto lossc = Zero(df8); |
454 | 15.1M | TransformToPixels(acs.Strategy(), &mem[0], block, |
455 | 15.1M | acs.covered_blocks_x() * 8, scratch_space); |
456 | | |
457 | 33.7M | for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { |
458 | 46.5M | for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { |
459 | 251M | for (size_t dy = 0; dy < kBlockDim; ++dy) { |
460 | 446M | for (size_t dx = 0; dx < kBlockDim; dx += Lanes(df8)) { |
461 | 223M | auto in = Load(df8, block + |
462 | 223M | (iy * kBlockDim + dy) * |
463 | 223M | (acs.covered_blocks_x() * kBlockDim) + |
464 | 223M | ix * kBlockDim + dx); |
465 | 223M | if (x + ix * 8 + dx + Lanes(df8) <= config.mask1x1_xsize) { |
466 | 223M | auto masku = |
467 | 223M | Add(Load(df8, config.MaskingPtr1x1(x + ix * 8 + dx, |
468 | 223M | y + iy * 8 + dy)), |
469 | 223M | masku_off); |
470 | 223M | in = Mul(masku, in); |
471 | 223M | in = Mul(in, in); |
472 | 223M | in = Mul(in, in); |
473 | 223M | in = Mul(in, in); |
474 | 223M | lossc = Add(lossc, in); |
475 | 223M | } |
476 | 223M | } |
477 | 223M | } |
478 | 27.9M | } |
479 | 18.6M | } |
480 | 15.1M | static const double kChannelMul[3] = { |
481 | 15.1M | pow(8.2, 8.0), |
482 | 15.1M | pow(1.0, 8.0), |
483 | 15.1M | pow(1.03, 8.0), |
484 | 15.1M | }; |
485 | 15.1M | lossc = Mul(Set(df8, kChannelMul[c]), lossc); |
486 | 15.1M | loss = Add(loss, lossc); |
487 | 15.1M | } |
488 | 15.1M | entropy += config.cost_delta * GetLane(SumOfLanes(df, entropy_v)); |
489 | 15.1M | size_t num_nzeros = GetLane(SumOfLanes(df, nzeros_v)); |
490 | | // Add #bit of num_nonzeros, as an estimate of the cost for encoding the |
491 | | // number of non-zeros of the block. |
492 | 15.1M | size_t nbits = CeilLog2Nonzero(num_nzeros + 1) + 1; |
493 | | // Also add #bit of #bit of num_nonzeros, to estimate the ANS cost, with a |
494 | | // bias. |
495 | 15.1M | entropy += config.zeros_mul * (CeilLog2Nonzero(nbits + 17) + nbits); |
496 | 15.1M | if (c == 0 && num_blocks >= 2) { |
497 | | // It is X channel (red-green) and we often see ringing |
498 | | // in the large blocks. Let's punish that more here. |
499 | 1.08M | float w = 1.0 + std::min(3.0, num_blocks / 8.0); |
500 | 1.08M | entropy *= w; |
501 | 1.08M | loss = Mul(loss, Set(df8, w)); |
502 | 1.08M | } |
503 | 15.1M | } |
504 | 5.06M | float loss_scalar = |
505 | 5.06M | pow(GetLane(SumOfLanes(df8, loss)) / (num_blocks * kDCTBlockSize), |
506 | 5.06M | 1.0f / 8.0f) * |
507 | 5.06M | (num_blocks * kDCTBlockSize) / quant_norm16; |
508 | 5.06M | entropy *= entropy_mul; |
509 | 5.06M | entropy += config.info_loss_multiplier * loss_scalar; |
510 | 5.06M | return true; |
511 | 5.06M | } Unexecuted instantiation: jxl::N_SSE4::EstimateEntropy(jxl::AcStrategy const&, float, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, float*, float*, unsigned int*, float&) jxl::N_AVX2::EstimateEntropy(jxl::AcStrategy const&, float, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, float*, float*, unsigned int*, float&) Line | Count | Source | 368 | 5.06M | float& entropy) { | 369 | 5.06M | entropy = 0.0f; | 370 | 5.06M | float* mem = full_scratch_space; | 371 | 5.06M | float* scratch_space = full_scratch_space + AcStrategy::kMaxCoeffArea; | 372 | 5.06M | const size_t size = (1 << acs.log2_covered_blocks()) * kDCTBlockSize; | 373 | | | 374 | | // Apply transform. | 375 | 20.2M | for (size_t c = 0; c < 3; c++) { | 376 | 15.1M | float* JXL_RESTRICT block_c = block + size * c; | 377 | 15.1M | TransformFromPixels(acs.Strategy(), &config.Pixel(c, x, y), | 378 | 15.1M | config.src_stride, block_c, scratch_space); | 379 | 15.1M | } | 380 | 5.06M | HWY_FULL(float) df; | 381 | | | 382 | 5.06M | const size_t num_blocks = acs.covered_blocks_x() * acs.covered_blocks_y(); | 383 | | // avoid large blocks when there is a lot going on in red-green. | 384 | 5.06M | float quant_norm16 = 0; | 385 | 5.06M | if (num_blocks == 1) { | 386 | | // When it is only one 8x8, we don't need aggregation of values. | 387 | 3.98M | quant_norm16 = config.Quant(x / 8, y / 8); | 388 | 3.98M | } else if (num_blocks == 2) { | 389 | | // Taking max instead of 8th norm seems to work | 390 | | // better for smallest blocks up to 16x8. Jyrki couldn't get | 391 | | // improvements in trying the same for 16x16 blocks. 
| 392 | 690k | if (acs.covered_blocks_y() == 2) { | 393 | 344k | quant_norm16 = | 394 | 344k | std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8, y / 8 + 1)); | 395 | 345k | } else { | 396 | 345k | quant_norm16 = | 397 | 345k | std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8 + 1, y / 8)); | 398 | 345k | } | 399 | 690k | } else { | 400 | | // Load QF value, calculate empirical heuristic on masking field | 401 | | // for weighting the information loss. Information loss manifests | 402 | | // itself as ringing, and masking could hide it. | 403 | 1.57M | for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { | 404 | 5.12M | for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { | 405 | 3.93M | float qval = config.Quant(x / 8 + ix, y / 8 + iy); | 406 | 3.93M | qval *= qval; | 407 | 3.93M | qval *= qval; | 408 | 3.93M | qval *= qval; | 409 | 3.93M | quant_norm16 += qval * qval; | 410 | 3.93M | } | 411 | 1.18M | } | 412 | 391k | quant_norm16 /= num_blocks; | 413 | 391k | quant_norm16 = FastPowf(quant_norm16, 1.0f / 16.0f); | 414 | 391k | } | 415 | 5.06M | const auto quant = Set(df, quant_norm16); | 416 | | | 417 | | // Compute entropy. 
| 418 | 5.06M | const HWY_CAPPED(float, 8) df8; | 419 | | | 420 | 5.06M | auto loss = Zero(df8); | 421 | 20.2M | for (size_t c = 0; c < 3; c++) { | 422 | 15.1M | const float* inv_matrix = config.dequant->InvMatrix(acs.Strategy(), c); | 423 | 15.1M | const float* matrix = config.dequant->Matrix(acs.Strategy(), c); | 424 | 15.1M | const auto cmap_factor = Set(df, cmap_factors[c]); | 425 | | | 426 | 15.1M | auto entropy_v = Zero(df); | 427 | 15.1M | auto nzeros_v = Zero(df); | 428 | 238M | for (size_t i = 0; i < num_blocks * kDCTBlockSize; i += Lanes(df)) { | 429 | 223M | const auto in = Load(df, block + c * size + i); | 430 | 223M | const auto in_y = Mul(Load(df, block + size + i), cmap_factor); | 431 | 223M | const auto im = Load(df, inv_matrix + i); | 432 | 223M | const auto val = Mul(Sub(in, in_y), Mul(im, quant)); | 433 | 223M | const auto rval = Round(val); | 434 | 223M | const auto diff = Sub(val, rval); | 435 | 223M | const auto m = Load(df, matrix + i); | 436 | 223M | Store(Mul(m, diff), df, &mem[i]); | 437 | 223M | const auto q = Abs(rval); | 438 | 223M | const auto q_is_zero = Eq(q, Zero(df)); | 439 | | // We used to have q * C here, but that cost model seems to | 440 | | // be punishing large values more than necessary. Sqrt tries | 441 | | // to avoid large values less aggressively. 
| 442 | 223M | entropy_v = Add(Sqrt(q), entropy_v); | 443 | 223M | nzeros_v = Add(nzeros_v, IfThenZeroElse(q_is_zero, Set(df, 1.0f))); | 444 | 223M | } | 445 | | | 446 | 15.1M | { | 447 | 15.1M | float masku_lut[3] = { | 448 | 15.1M | 12.0, | 449 | 15.1M | 0.0, | 450 | 15.1M | 4.0, | 451 | 15.1M | }; | 452 | 15.1M | auto masku_off = Set(df8, masku_lut[c]); | 453 | 15.1M | auto lossc = Zero(df8); | 454 | 15.1M | TransformToPixels(acs.Strategy(), &mem[0], block, | 455 | 15.1M | acs.covered_blocks_x() * 8, scratch_space); | 456 | | | 457 | 33.7M | for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { | 458 | 46.5M | for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { | 459 | 251M | for (size_t dy = 0; dy < kBlockDim; ++dy) { | 460 | 446M | for (size_t dx = 0; dx < kBlockDim; dx += Lanes(df8)) { | 461 | 223M | auto in = Load(df8, block + | 462 | 223M | (iy * kBlockDim + dy) * | 463 | 223M | (acs.covered_blocks_x() * kBlockDim) + | 464 | 223M | ix * kBlockDim + dx); | 465 | 223M | if (x + ix * 8 + dx + Lanes(df8) <= config.mask1x1_xsize) { | 466 | 223M | auto masku = | 467 | 223M | Add(Load(df8, config.MaskingPtr1x1(x + ix * 8 + dx, | 468 | 223M | y + iy * 8 + dy)), | 469 | 223M | masku_off); | 470 | 223M | in = Mul(masku, in); | 471 | 223M | in = Mul(in, in); | 472 | 223M | in = Mul(in, in); | 473 | 223M | in = Mul(in, in); | 474 | 223M | lossc = Add(lossc, in); | 475 | 223M | } | 476 | 223M | } | 477 | 223M | } | 478 | 27.9M | } | 479 | 18.6M | } | 480 | 15.1M | static const double kChannelMul[3] = { | 481 | 15.1M | pow(8.2, 8.0), | 482 | 15.1M | pow(1.0, 8.0), | 483 | 15.1M | pow(1.03, 8.0), | 484 | 15.1M | }; | 485 | 15.1M | lossc = Mul(Set(df8, kChannelMul[c]), lossc); | 486 | 15.1M | loss = Add(loss, lossc); | 487 | 15.1M | } | 488 | 15.1M | entropy += config.cost_delta * GetLane(SumOfLanes(df, entropy_v)); | 489 | 15.1M | size_t num_nzeros = GetLane(SumOfLanes(df, nzeros_v)); | 490 | | // Add #bit of num_nonzeros, as an estimate of the cost for encoding 
the | 491 | | // number of non-zeros of the block. | 492 | 15.1M | size_t nbits = CeilLog2Nonzero(num_nzeros + 1) + 1; | 493 | | // Also add #bit of #bit of num_nonzeros, to estimate the ANS cost, with a | 494 | | // bias. | 495 | 15.1M | entropy += config.zeros_mul * (CeilLog2Nonzero(nbits + 17) + nbits); | 496 | 15.1M | if (c == 0 && num_blocks >= 2) { | 497 | | // It is X channel (red-green) and we often see ringing | 498 | | // in the large blocks. Let's punish that more here. | 499 | 1.08M | float w = 1.0 + std::min(3.0, num_blocks / 8.0); | 500 | 1.08M | entropy *= w; | 501 | 1.08M | loss = Mul(loss, Set(df8, w)); | 502 | 1.08M | } | 503 | 15.1M | } | 504 | 5.06M | float loss_scalar = | 505 | 5.06M | pow(GetLane(SumOfLanes(df8, loss)) / (num_blocks * kDCTBlockSize), | 506 | 5.06M | 1.0f / 8.0f) * | 507 | 5.06M | (num_blocks * kDCTBlockSize) / quant_norm16; | 508 | 5.06M | entropy *= entropy_mul; | 509 | 5.06M | entropy += config.info_loss_multiplier * loss_scalar; | 510 | 5.06M | return true; | 511 | 5.06M | } |
Unexecuted instantiation: jxl::N_SSE2::EstimateEntropy(jxl::AcStrategy const&, float, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, float*, float*, unsigned int*, float&) |
512 | | |
513 | | Status FindBest8x8Transform(size_t x, size_t y, int encoding_speed_tier, |
514 | | float butteraugli_target, const ACSConfig& config, |
515 | | const float* JXL_RESTRICT cmap_factors, |
516 | | AcStrategyImage* JXL_RESTRICT ac_strategy, |
517 | | float* block, float* scratch_space, |
518 | | uint32_t* quantized, float* entropy_out, |
519 | 398k | AcStrategyType& best_tx) { |
520 | 398k | struct TransformTry8x8 { |
521 | 398k | AcStrategyType type; |
522 | 398k | int encoding_speed_tier_max_limit; |
523 | 398k | double entropy_mul; |
524 | 398k | }; |
525 | 398k | static const TransformTry8x8 kTransforms8x8[] = { |
526 | 398k | { |
527 | 398k | AcStrategyType::DCT, |
528 | 398k | 9, |
529 | 398k | 0.8, |
530 | 398k | }, |
531 | 398k | { |
532 | 398k | AcStrategyType::DCT4X4, |
533 | 398k | 5, |
534 | 398k | 1.08, |
535 | 398k | }, |
536 | 398k | { |
537 | 398k | AcStrategyType::DCT2X2, |
538 | 398k | 5, |
539 | 398k | 0.95, |
540 | 398k | }, |
541 | 398k | { |
542 | 398k | AcStrategyType::DCT4X8, |
543 | 398k | 4, |
544 | 398k | 0.85931637428340035, |
545 | 398k | }, |
546 | 398k | { |
547 | 398k | AcStrategyType::DCT8X4, |
548 | 398k | 4, |
549 | 398k | 0.85931637428340035, |
550 | 398k | }, |
551 | 398k | { |
552 | 398k | AcStrategyType::IDENTITY, |
553 | 398k | 5, |
554 | 398k | 1.0427542510634957, |
555 | 398k | }, |
556 | 398k | { |
557 | 398k | AcStrategyType::AFV0, |
558 | 398k | 4, |
559 | 398k | 0.81779489591359944, |
560 | 398k | }, |
561 | 398k | { |
562 | 398k | AcStrategyType::AFV1, |
563 | 398k | 4, |
564 | 398k | 0.81779489591359944, |
565 | 398k | }, |
566 | 398k | { |
567 | 398k | AcStrategyType::AFV2, |
568 | 398k | 4, |
569 | 398k | 0.81779489591359944, |
570 | 398k | }, |
571 | 398k | { |
572 | 398k | AcStrategyType::AFV3, |
573 | 398k | 4, |
574 | 398k | 0.81779489591359944, |
575 | 398k | }, |
576 | 398k | }; |
577 | 398k | double best = 1e30; |
578 | 398k | best_tx = kTransforms8x8[0].type; |
579 | 3.98M | for (auto tx : kTransforms8x8) { |
580 | 3.98M | if (tx.encoding_speed_tier_max_limit < encoding_speed_tier) { |
581 | 0 | continue; |
582 | 0 | } |
583 | 3.98M | AcStrategy acs = AcStrategy::FromRawStrategy(tx.type); |
584 | 3.98M | float entropy_mul = tx.entropy_mul / kTransforms8x8[0].entropy_mul; |
585 | 3.98M | if ((tx.type == AcStrategyType::DCT2X2 || |
586 | 3.98M | tx.type == AcStrategyType::IDENTITY) && |
587 | 3.98M | butteraugli_target < 5.0) { |
588 | 796k | static const float kFavor2X2AtHighQuality = 0.4; |
589 | 796k | float weight = pow((5.0f - butteraugli_target) / 5.0f, 2.0f); |
590 | 796k | entropy_mul -= kFavor2X2AtHighQuality * weight; |
591 | 796k | } |
592 | 3.98M | if ((tx.type != AcStrategyType::DCT && tx.type != AcStrategyType::DCT2X2 && |
593 | 3.98M | tx.type != AcStrategyType::IDENTITY) && |
594 | 3.98M | butteraugli_target > 4.0) { |
595 | 0 | static const float kAvoidEntropyOfTransforms = 0.5; |
596 | 0 | float mul = 1.0; |
597 | 0 | if (butteraugli_target < 12.0) { |
598 | 0 | mul *= (12.0 - 4.0) / (butteraugli_target - 4.0); |
599 | 0 | } |
600 | 0 | entropy_mul += kAvoidEntropyOfTransforms * mul; |
601 | 0 | } |
602 | 3.98M | float entropy; |
603 | 3.98M | JXL_RETURN_IF_ERROR(EstimateEntropy(acs, entropy_mul, x, y, config, |
604 | 3.98M | cmap_factors, block, scratch_space, |
605 | 3.98M | quantized, entropy)); |
606 | 3.98M | if (entropy < best) { |
607 | 755k | best_tx = tx.type; |
608 | 755k | best = entropy; |
609 | 755k | } |
610 | 3.98M | } |
611 | 398k | *entropy_out = best; |
612 | 398k | return true; |
613 | 398k | } Unexecuted instantiation: jxl::N_SSE4::FindBest8x8Transform(unsigned long, unsigned long, int, float, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float*, float*, unsigned int*, float*, jxl::AcStrategyType&) jxl::N_AVX2::FindBest8x8Transform(unsigned long, unsigned long, int, float, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float*, float*, unsigned int*, float*, jxl::AcStrategyType&) Line | Count | Source | 519 | 398k | AcStrategyType& best_tx) { | 520 | 398k | struct TransformTry8x8 { | 521 | 398k | AcStrategyType type; | 522 | 398k | int encoding_speed_tier_max_limit; | 523 | 398k | double entropy_mul; | 524 | 398k | }; | 525 | 398k | static const TransformTry8x8 kTransforms8x8[] = { | 526 | 398k | { | 527 | 398k | AcStrategyType::DCT, | 528 | 398k | 9, | 529 | 398k | 0.8, | 530 | 398k | }, | 531 | 398k | { | 532 | 398k | AcStrategyType::DCT4X4, | 533 | 398k | 5, | 534 | 398k | 1.08, | 535 | 398k | }, | 536 | 398k | { | 537 | 398k | AcStrategyType::DCT2X2, | 538 | 398k | 5, | 539 | 398k | 0.95, | 540 | 398k | }, | 541 | 398k | { | 542 | 398k | AcStrategyType::DCT4X8, | 543 | 398k | 4, | 544 | 398k | 0.85931637428340035, | 545 | 398k | }, | 546 | 398k | { | 547 | 398k | AcStrategyType::DCT8X4, | 548 | 398k | 4, | 549 | 398k | 0.85931637428340035, | 550 | 398k | }, | 551 | 398k | { | 552 | 398k | AcStrategyType::IDENTITY, | 553 | 398k | 5, | 554 | 398k | 1.0427542510634957, | 555 | 398k | }, | 556 | 398k | { | 557 | 398k | AcStrategyType::AFV0, | 558 | 398k | 4, | 559 | 398k | 0.81779489591359944, | 560 | 398k | }, | 561 | 398k | { | 562 | 398k | AcStrategyType::AFV1, | 563 | 398k | 4, | 564 | 398k | 0.81779489591359944, | 565 | 398k | }, | 566 | 398k | { | 567 | 398k | AcStrategyType::AFV2, | 568 | 398k | 4, | 569 | 398k | 0.81779489591359944, | 570 | 398k | }, | 571 | 398k | { | 572 | 398k | AcStrategyType::AFV3, | 573 | 398k | 4, | 574 | 398k | 0.81779489591359944, | 575 | 398k | }, | 576 | 398k | }; | 577 | 398k | 
double best = 1e30; | 578 | 398k | best_tx = kTransforms8x8[0].type; | 579 | 3.98M | for (auto tx : kTransforms8x8) { | 580 | 3.98M | if (tx.encoding_speed_tier_max_limit < encoding_speed_tier) { | 581 | 0 | continue; | 582 | 0 | } | 583 | 3.98M | AcStrategy acs = AcStrategy::FromRawStrategy(tx.type); | 584 | 3.98M | float entropy_mul = tx.entropy_mul / kTransforms8x8[0].entropy_mul; | 585 | 3.98M | if ((tx.type == AcStrategyType::DCT2X2 || | 586 | 3.98M | tx.type == AcStrategyType::IDENTITY) && | 587 | 3.98M | butteraugli_target < 5.0) { | 588 | 796k | static const float kFavor2X2AtHighQuality = 0.4; | 589 | 796k | float weight = pow((5.0f - butteraugli_target) / 5.0f, 2.0f); | 590 | 796k | entropy_mul -= kFavor2X2AtHighQuality * weight; | 591 | 796k | } | 592 | 3.98M | if ((tx.type != AcStrategyType::DCT && tx.type != AcStrategyType::DCT2X2 && | 593 | 3.98M | tx.type != AcStrategyType::IDENTITY) && | 594 | 3.98M | butteraugli_target > 4.0) { | 595 | 0 | static const float kAvoidEntropyOfTransforms = 0.5; | 596 | 0 | float mul = 1.0; | 597 | 0 | if (butteraugli_target < 12.0) { | 598 | 0 | mul *= (12.0 - 4.0) / (butteraugli_target - 4.0); | 599 | 0 | } | 600 | 0 | entropy_mul += kAvoidEntropyOfTransforms * mul; | 601 | 0 | } | 602 | 3.98M | float entropy; | 603 | 3.98M | JXL_RETURN_IF_ERROR(EstimateEntropy(acs, entropy_mul, x, y, config, | 604 | 3.98M | cmap_factors, block, scratch_space, | 605 | 3.98M | quantized, entropy)); | 606 | 3.98M | if (entropy < best) { | 607 | 755k | best_tx = tx.type; | 608 | 755k | best = entropy; | 609 | 755k | } | 610 | 3.98M | } | 611 | 398k | *entropy_out = best; | 612 | 398k | return true; | 613 | 398k | } |
Unexecuted instantiation: jxl::N_SSE2::FindBest8x8Transform(unsigned long, unsigned long, int, float, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float*, float*, unsigned int*, float*, jxl::AcStrategyType&) |
614 | | |
615 | | // bx, by addresses the 64x64 block at 8x8 subresolution |
616 | | // cx, cy addresses the left, upper 8x8 block position of the candidate |
617 | | // transform. |
618 | | Status TryMergeAcs(AcStrategyType acs_raw, size_t bx, size_t by, size_t cx, |
619 | | size_t cy, const ACSConfig& config, |
620 | | const float* JXL_RESTRICT cmap_factors, |
621 | | AcStrategyImage* JXL_RESTRICT ac_strategy, |
622 | | const float entropy_mul, const uint8_t candidate_priority, |
623 | | uint8_t* priority, float* JXL_RESTRICT entropy_estimate, |
624 | 1.21M | float* block, float* scratch_space, uint32_t* quantized) { |
625 | 1.21M | AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw); |
626 | 1.21M | float entropy_current = 0; |
627 | 1.34M | for (size_t iy = 0; iy < acs.covered_blocks_y(); ++iy) { |
628 | 1.88M | for (size_t ix = 0; ix < acs.covered_blocks_x(); ++ix) { |
629 | 1.76M | if (priority[(cy + iy) * 8 + (cx + ix)] >= candidate_priority) { |
630 | | // Transform would reuse already allocated blocks and |
631 | | // lead to invalid overlaps, for example DCT64X32 vs. |
632 | | // DCT32X64. |
633 | 1.19M | return true; |
634 | 1.19M | } |
635 | 565k | entropy_current += entropy_estimate[(cy + iy) * 8 + (cx + ix)]; |
636 | 565k | } |
637 | 1.31M | } |
638 | 24.0k | float entropy_candidate; |
639 | 24.0k | JXL_RETURN_IF_ERROR(EstimateEntropy( |
640 | 24.0k | acs, entropy_mul, (bx + cx) * 8, (by + cy) * 8, config, cmap_factors, |
641 | 24.0k | block, scratch_space, quantized, entropy_candidate)); |
642 | 24.0k | if (entropy_candidate >= entropy_current) return true; |
643 | | // Accept the candidate. |
644 | 9.72k | for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { |
645 | 37.4k | for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { |
646 | 29.3k | entropy_estimate[(cy + iy) * 8 + cx + ix] = 0; |
647 | 29.3k | priority[(cy + iy) * 8 + cx + ix] = candidate_priority; |
648 | 29.3k | } |
649 | 8.09k | } |
650 | 1.63k | JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_raw)); |
651 | 1.63k | entropy_estimate[cy * 8 + cx] = entropy_candidate; |
652 | 1.63k | return true; |
653 | 1.63k | } Unexecuted instantiation: jxl::N_SSE4::TryMergeAcs(jxl::AcStrategyType, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, unsigned char, unsigned char*, float*, float*, float*, unsigned int*) jxl::N_AVX2::TryMergeAcs(jxl::AcStrategyType, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, unsigned char, unsigned char*, float*, float*, float*, unsigned int*) Line | Count | Source | 624 | 1.21M | float* block, float* scratch_space, uint32_t* quantized) { | 625 | 1.21M | AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw); | 626 | 1.21M | float entropy_current = 0; | 627 | 1.34M | for (size_t iy = 0; iy < acs.covered_blocks_y(); ++iy) { | 628 | 1.88M | for (size_t ix = 0; ix < acs.covered_blocks_x(); ++ix) { | 629 | 1.76M | if (priority[(cy + iy) * 8 + (cx + ix)] >= candidate_priority) { | 630 | | // Transform would reuse already allocated blocks and | 631 | | // lead to invalid overlaps, for example DCT64X32 vs. | 632 | | // DCT32X64. | 633 | 1.19M | return true; | 634 | 1.19M | } | 635 | 565k | entropy_current += entropy_estimate[(cy + iy) * 8 + (cx + ix)]; | 636 | 565k | } | 637 | 1.31M | } | 638 | 24.0k | float entropy_candidate; | 639 | 24.0k | JXL_RETURN_IF_ERROR(EstimateEntropy( | 640 | 24.0k | acs, entropy_mul, (bx + cx) * 8, (by + cy) * 8, config, cmap_factors, | 641 | 24.0k | block, scratch_space, quantized, entropy_candidate)); | 642 | 24.0k | if (entropy_candidate >= entropy_current) return true; | 643 | | // Accept the candidate. 
| 644 | 9.72k | for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { | 645 | 37.4k | for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { | 646 | 29.3k | entropy_estimate[(cy + iy) * 8 + cx + ix] = 0; | 647 | 29.3k | priority[(cy + iy) * 8 + cx + ix] = candidate_priority; | 648 | 29.3k | } | 649 | 8.09k | } | 650 | 1.63k | JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_raw)); | 651 | 1.63k | entropy_estimate[cy * 8 + cx] = entropy_candidate; | 652 | 1.63k | return true; | 653 | 1.63k | } |
Unexecuted instantiation: jxl::N_SSE2::TryMergeAcs(jxl::AcStrategyType, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, unsigned char, unsigned char*, float*, float*, float*, unsigned int*) |
654 | | |
655 | | static void SetEntropyForTransform(size_t cx, size_t cy, |
656 | | const AcStrategyType acs_raw, float entropy, |
657 | 61.0k | float* JXL_RESTRICT entropy_estimate) { |
658 | 61.0k | const AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw); |
659 | 189k | for (size_t dy = 0; dy < acs.covered_blocks_y(); ++dy) { |
660 | 443k | for (size_t dx = 0; dx < acs.covered_blocks_x(); ++dx) { |
661 | 314k | entropy_estimate[(cy + dy) * 8 + cx + dx] = 0.0; |
662 | 314k | } |
663 | 128k | } |
664 | 61.0k | entropy_estimate[cy * 8 + cx] = entropy; |
665 | 61.0k | } Unexecuted instantiation: enc_ac_strategy.cc:jxl::N_SSE4::SetEntropyForTransform(unsigned long, unsigned long, jxl::AcStrategyType, float, float*) enc_ac_strategy.cc:jxl::N_AVX2::SetEntropyForTransform(unsigned long, unsigned long, jxl::AcStrategyType, float, float*) Line | Count | Source | 657 | 61.0k | float* JXL_RESTRICT entropy_estimate) { | 658 | 61.0k | const AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw); | 659 | 189k | for (size_t dy = 0; dy < acs.covered_blocks_y(); ++dy) { | 660 | 443k | for (size_t dx = 0; dx < acs.covered_blocks_x(); ++dx) { | 661 | 314k | entropy_estimate[(cy + dy) * 8 + cx + dx] = 0.0; | 662 | 314k | } | 663 | 128k | } | 664 | 61.0k | entropy_estimate[cy * 8 + cx] = entropy; | 665 | 61.0k | } |
Unexecuted instantiation: enc_ac_strategy.cc:jxl::N_SSE2::SetEntropyForTransform(unsigned long, unsigned long, jxl::AcStrategyType, float, float*) |
666 | | |
667 | 356k | AcStrategyType AcsSquare(size_t blocks) { |
668 | 356k | if (blocks == 2) { |
669 | 298k | return AcStrategyType::DCT16X16; |
670 | 298k | } else if (blocks == 4) { |
671 | 52.4k | return AcStrategyType::DCT32X32; |
672 | 52.4k | } else { |
673 | 5.66k | return AcStrategyType::DCT64X64; |
674 | 5.66k | } |
675 | 356k | } Unexecuted instantiation: jxl::N_SSE4::AcsSquare(unsigned long) jxl::N_AVX2::AcsSquare(unsigned long) Line | Count | Source | 667 | 356k | AcStrategyType AcsSquare(size_t blocks) { | 668 | 356k | if (blocks == 2) { | 669 | 298k | return AcStrategyType::DCT16X16; | 670 | 298k | } else if (blocks == 4) { | 671 | 52.4k | return AcStrategyType::DCT32X32; | 672 | 52.4k | } else { | 673 | 5.66k | return AcStrategyType::DCT64X64; | 674 | 5.66k | } | 675 | 356k | } |
Unexecuted instantiation: jxl::N_SSE2::AcsSquare(unsigned long) |
676 | | |
677 | 356k | AcStrategyType AcsVerticalSplit(size_t blocks) { |
678 | 356k | if (blocks == 2) { |
679 | 298k | return AcStrategyType::DCT16X8; |
680 | 298k | } else if (blocks == 4) { |
681 | 52.4k | return AcStrategyType::DCT32X16; |
682 | 52.4k | } else { |
683 | 5.66k | return AcStrategyType::DCT64X32; |
684 | 5.66k | } |
685 | 356k | } Unexecuted instantiation: jxl::N_SSE4::AcsVerticalSplit(unsigned long) jxl::N_AVX2::AcsVerticalSplit(unsigned long) Line | Count | Source | 677 | 356k | AcStrategyType AcsVerticalSplit(size_t blocks) { | 678 | 356k | if (blocks == 2) { | 679 | 298k | return AcStrategyType::DCT16X8; | 680 | 298k | } else if (blocks == 4) { | 681 | 52.4k | return AcStrategyType::DCT32X16; | 682 | 52.4k | } else { | 683 | 5.66k | return AcStrategyType::DCT64X32; | 684 | 5.66k | } | 685 | 356k | } |
Unexecuted instantiation: jxl::N_SSE2::AcsVerticalSplit(unsigned long) |
686 | | |
687 | 356k | AcStrategyType AcsHorizontalSplit(size_t blocks) { |
688 | 356k | if (blocks == 2) { |
689 | 298k | return AcStrategyType::DCT8X16; |
690 | 298k | } else if (blocks == 4) { |
691 | 52.4k | return AcStrategyType::DCT16X32; |
692 | 52.4k | } else { |
693 | 5.66k | return AcStrategyType::DCT32X64; |
694 | 5.66k | } |
695 | 356k | } Unexecuted instantiation: jxl::N_SSE4::AcsHorizontalSplit(unsigned long) jxl::N_AVX2::AcsHorizontalSplit(unsigned long) Line | Count | Source | 687 | 356k | AcStrategyType AcsHorizontalSplit(size_t blocks) { | 688 | 356k | if (blocks == 2) { | 689 | 298k | return AcStrategyType::DCT8X16; | 690 | 298k | } else if (blocks == 4) { | 691 | 52.4k | return AcStrategyType::DCT16X32; | 692 | 52.4k | } else { | 693 | 5.66k | return AcStrategyType::DCT32X64; | 694 | 5.66k | } | 695 | 356k | } |
Unexecuted instantiation: jxl::N_SSE2::AcsHorizontalSplit(unsigned long) |
696 | | |
697 | | // The following function tries to merge smaller transforms into |
698 | | // squares and the rectangles originating from a single middle division |
699 | | // (horizontal or vertical) fairly. |
700 | | // |
701 | | // This is now generalized to concern about squares |
702 | | // of blocks X blocks size, where a block is 8x8 pixels. |
703 | | Status FindBestFirstLevelDivisionForSquare( |
704 | | size_t blocks, bool allow_square_transform, size_t bx, size_t by, size_t cx, |
705 | | size_t cy, const ACSConfig& config, const float* JXL_RESTRICT cmap_factors, |
706 | | AcStrategyImage* JXL_RESTRICT ac_strategy, const float entropy_mul_JXK, |
707 | | const float entropy_mul_JXJ, float* JXL_RESTRICT entropy_estimate, |
708 | 356k | float* block, float* scratch_space, uint32_t* quantized) { |
709 | | // We denote J for the larger dimension here, and K for the smaller. |
710 | | // For example, for 32x32 block splitting, J would be 32, K 16. |
711 | 356k | const size_t blocks_half = blocks / 2; |
712 | 356k | const AcStrategyType acs_rawJXK = AcsVerticalSplit(blocks); |
713 | 356k | const AcStrategyType acs_rawKXJ = AcsHorizontalSplit(blocks); |
714 | 356k | const AcStrategyType acs_rawJXJ = AcsSquare(blocks); |
715 | 356k | const AcStrategy acsJXK = AcStrategy::FromRawStrategy(acs_rawJXK); |
716 | 356k | const AcStrategy acsKXJ = AcStrategy::FromRawStrategy(acs_rawKXJ); |
717 | 356k | const AcStrategy acsJXJ = AcStrategy::FromRawStrategy(acs_rawJXJ); |
718 | 356k | AcStrategyRow row0 = ac_strategy->ConstRow(by + cy + 0); |
719 | 356k | AcStrategyRow row1 = ac_strategy->ConstRow(by + cy + blocks_half); |
720 | | // Let's check if we can consider a JXJ block here at all. |
721 | | // This is not necessary in the basic use of hierarchically merging |
722 | | // blocks in the simplest possible way, but is needed when we try other |
723 | | // 'floating' options of merging, possibly after a simple hierarchical |
724 | | // merge has been explored. |
725 | 356k | if (MultiBlockTransformCrossesHorizontalBoundary(*ac_strategy, bx + cx, |
726 | 356k | by + cy, bx + cx + blocks) || |
727 | 356k | MultiBlockTransformCrossesHorizontalBoundary( |
728 | 265k | *ac_strategy, bx + cx, by + cy + blocks, bx + cx + blocks) || |
729 | 356k | MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx, by + cy, |
730 | 244k | by + cy + blocks) || |
731 | 356k | MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx + blocks, |
732 | 221k | by + cy, by + cy + blocks)) { |
733 | 139k | return true; // not suitable for JxJ analysis, some transforms leak out. |
734 | 139k | } |
735 | | // For floating transforms there may be |
736 | | // already blocks selected that make either or both JXK and |
737 | | // KXJ not feasible for this location. |
738 | 216k | const bool allow_JXK = !MultiBlockTransformCrossesVerticalBoundary( |
739 | 216k | *ac_strategy, bx + cx + blocks_half, by + cy, by + cy + blocks); |
740 | 216k | const bool allow_KXJ = !MultiBlockTransformCrossesHorizontalBoundary( |
741 | 216k | *ac_strategy, bx + cx, by + cy + blocks_half, bx + cx + blocks); |
742 | | // Current entropies aggregated on NxN resolution. |
743 | 216k | float entropy[2][2] = {}; |
744 | 753k | for (size_t dy = 0; dy < blocks; ++dy) { |
745 | 2.16M | for (size_t dx = 0; dx < blocks; ++dx) { |
746 | 1.63M | entropy[dy / blocks_half][dx / blocks_half] += |
747 | 1.63M | entropy_estimate[(cy + dy) * 8 + (cx + dx)]; |
748 | 1.63M | } |
749 | 537k | } |
750 | 216k | float entropy_JXK_left = std::numeric_limits<float>::max(); |
751 | 216k | float entropy_JXK_right = std::numeric_limits<float>::max(); |
752 | 216k | float entropy_KXJ_top = std::numeric_limits<float>::max(); |
753 | 216k | float entropy_KXJ_bottom = std::numeric_limits<float>::max(); |
754 | 216k | float entropy_JXJ = std::numeric_limits<float>::max(); |
755 | 216k | if (allow_JXK) { |
756 | 212k | if (row0[bx + cx + 0].Strategy() != acs_rawJXK) { |
757 | 209k | JXL_RETURN_IF_ERROR(EstimateEntropy( |
758 | 209k | acsJXK, entropy_mul_JXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config, |
759 | 209k | cmap_factors, block, scratch_space, quantized, entropy_JXK_left)); |
760 | 209k | } |
761 | 212k | if (row0[bx + cx + blocks_half].Strategy() != acs_rawJXK) { |
762 | 210k | JXL_RETURN_IF_ERROR( |
763 | 210k | EstimateEntropy(acsJXK, entropy_mul_JXK, (bx + cx + blocks_half) * 8, |
764 | 210k | (by + cy + 0) * 8, config, cmap_factors, block, |
765 | 210k | scratch_space, quantized, entropy_JXK_right)); |
766 | 210k | } |
767 | 212k | } |
768 | 216k | if (allow_KXJ) { |
769 | 212k | if (row0[bx + cx].Strategy() != acs_rawKXJ) { |
770 | 209k | JXL_RETURN_IF_ERROR(EstimateEntropy( |
771 | 209k | acsKXJ, entropy_mul_JXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config, |
772 | 209k | cmap_factors, block, scratch_space, quantized, entropy_KXJ_top)); |
773 | 209k | } |
774 | 212k | if (row1[bx + cx].Strategy() != acs_rawKXJ) { |
775 | 211k | JXL_RETURN_IF_ERROR( |
776 | 211k | EstimateEntropy(acsKXJ, entropy_mul_JXK, (bx + cx + 0) * 8, |
777 | 211k | (by + cy + blocks_half) * 8, config, cmap_factors, |
778 | 211k | block, scratch_space, quantized, entropy_KXJ_bottom)); |
779 | 211k | } |
780 | 212k | } |
781 | 216k | if (allow_square_transform) { |
782 | | // We control the exploration of the square transform separately so that |
783 | | // we can turn it off at high decoding speeds for 32x32, but still allow |
784 | | // exploring 16x32 and 32x16. |
785 | 216k | JXL_RETURN_IF_ERROR(EstimateEntropy( |
786 | 216k | acsJXJ, entropy_mul_JXJ, (bx + cx + 0) * 8, (by + cy + 0) * 8, config, |
787 | 216k | cmap_factors, block, scratch_space, quantized, entropy_JXJ)); |
788 | 216k | } |
789 | | |
790 | | // Test if this block should have JXK or KXJ transforms, |
791 | | // because it can have only one or the other. |
792 | 216k | float costJxN = std::min(entropy_JXK_left, entropy[0][0] + entropy[1][0]) + |
793 | 216k | std::min(entropy_JXK_right, entropy[0][1] + entropy[1][1]); |
794 | 216k | float costNxJ = std::min(entropy_KXJ_top, entropy[0][0] + entropy[0][1]) + |
795 | 216k | std::min(entropy_KXJ_bottom, entropy[1][0] + entropy[1][1]); |
796 | 216k | if (entropy_JXJ < costJxN && entropy_JXJ < costNxJ) { |
797 | 34.7k | JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_rawJXJ)); |
798 | 34.7k | SetEntropyForTransform(cx, cy, acs_rawJXJ, entropy_JXJ, entropy_estimate); |
799 | 181k | } else if (costJxN < costNxJ) { |
800 | 29.7k | if (entropy_JXK_left < entropy[0][0] + entropy[1][0]) { |
801 | 6.27k | JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_rawJXK)); |
802 | 6.27k | SetEntropyForTransform(cx, cy, acs_rawJXK, entropy_JXK_left, |
803 | 6.27k | entropy_estimate); |
804 | 6.27k | } |
805 | 29.7k | if (entropy_JXK_right < entropy[0][1] + entropy[1][1]) { |
806 | 5.95k | JXL_RETURN_IF_ERROR( |
807 | 5.95k | ac_strategy->Set(bx + cx + blocks_half, by + cy, acs_rawJXK)); |
808 | 5.95k | SetEntropyForTransform(cx + blocks_half, cy, acs_rawJXK, |
809 | 5.95k | entropy_JXK_right, entropy_estimate); |
810 | 5.95k | } |
811 | 151k | } else { |
812 | 151k | if (entropy_KXJ_top < entropy[0][0] + entropy[0][1]) { |
813 | 7.13k | JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_rawKXJ)); |
814 | 7.13k | SetEntropyForTransform(cx, cy, acs_rawKXJ, entropy_KXJ_top, |
815 | 7.13k | entropy_estimate); |
816 | 7.13k | } |
817 | 151k | if (entropy_KXJ_bottom < entropy[1][0] + entropy[1][1]) { |
818 | 7.01k | JXL_RETURN_IF_ERROR( |
819 | 7.01k | ac_strategy->Set(bx + cx, by + cy + blocks_half, acs_rawKXJ)); |
820 | 7.01k | SetEntropyForTransform(cx, cy + blocks_half, acs_rawKXJ, |
821 | 7.01k | entropy_KXJ_bottom, entropy_estimate); |
822 | 7.01k | } |
823 | 151k | } |
824 | 216k | return true; |
825 | 216k | } Unexecuted instantiation: jxl::N_SSE4::FindBestFirstLevelDivisionForSquare(unsigned long, bool, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, float, float*, float*, float*, unsigned int*) jxl::N_AVX2::FindBestFirstLevelDivisionForSquare(unsigned long, bool, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, float, float*, float*, float*, unsigned int*) Line | Count | Source | 708 | 356k | float* block, float* scratch_space, uint32_t* quantized) { | 709 | | // We denote J for the larger dimension here, and K for the smaller. | 710 | | // For example, for 32x32 block splitting, J would be 32, K 16. | 711 | 356k | const size_t blocks_half = blocks / 2; | 712 | 356k | const AcStrategyType acs_rawJXK = AcsVerticalSplit(blocks); | 713 | 356k | const AcStrategyType acs_rawKXJ = AcsHorizontalSplit(blocks); | 714 | 356k | const AcStrategyType acs_rawJXJ = AcsSquare(blocks); | 715 | 356k | const AcStrategy acsJXK = AcStrategy::FromRawStrategy(acs_rawJXK); | 716 | 356k | const AcStrategy acsKXJ = AcStrategy::FromRawStrategy(acs_rawKXJ); | 717 | 356k | const AcStrategy acsJXJ = AcStrategy::FromRawStrategy(acs_rawJXJ); | 718 | 356k | AcStrategyRow row0 = ac_strategy->ConstRow(by + cy + 0); | 719 | 356k | AcStrategyRow row1 = ac_strategy->ConstRow(by + cy + blocks_half); | 720 | | // Let's check if we can consider a JXJ block here at all. | 721 | | // This is not necessary in the basic use of hierarchically merging | 722 | | // blocks in the simplest possible way, but is needed when we try other | 723 | | // 'floating' options of merging, possibly after a simple hierarchical | 724 | | // merge has been explored. 
| 725 | 356k | if (MultiBlockTransformCrossesHorizontalBoundary(*ac_strategy, bx + cx, | 726 | 356k | by + cy, bx + cx + blocks) || | 727 | 356k | MultiBlockTransformCrossesHorizontalBoundary( | 728 | 265k | *ac_strategy, bx + cx, by + cy + blocks, bx + cx + blocks) || | 729 | 356k | MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx, by + cy, | 730 | 244k | by + cy + blocks) || | 731 | 356k | MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx + blocks, | 732 | 221k | by + cy, by + cy + blocks)) { | 733 | 139k | return true; // not suitable for JxJ analysis, some transforms leak out. | 734 | 139k | } | 735 | | // For floating transforms there may be | 736 | | // already blocks selected that make either or both JXK and | 737 | | // KXJ not feasible for this location. | 738 | 216k | const bool allow_JXK = !MultiBlockTransformCrossesVerticalBoundary( | 739 | 216k | *ac_strategy, bx + cx + blocks_half, by + cy, by + cy + blocks); | 740 | 216k | const bool allow_KXJ = !MultiBlockTransformCrossesHorizontalBoundary( | 741 | 216k | *ac_strategy, bx + cx, by + cy + blocks_half, bx + cx + blocks); | 742 | | // Current entropies aggregated on NxN resolution. 
| 743 | 216k | float entropy[2][2] = {}; | 744 | 753k | for (size_t dy = 0; dy < blocks; ++dy) { | 745 | 2.16M | for (size_t dx = 0; dx < blocks; ++dx) { | 746 | 1.63M | entropy[dy / blocks_half][dx / blocks_half] += | 747 | 1.63M | entropy_estimate[(cy + dy) * 8 + (cx + dx)]; | 748 | 1.63M | } | 749 | 537k | } | 750 | 216k | float entropy_JXK_left = std::numeric_limits<float>::max(); | 751 | 216k | float entropy_JXK_right = std::numeric_limits<float>::max(); | 752 | 216k | float entropy_KXJ_top = std::numeric_limits<float>::max(); | 753 | 216k | float entropy_KXJ_bottom = std::numeric_limits<float>::max(); | 754 | 216k | float entropy_JXJ = std::numeric_limits<float>::max(); | 755 | 216k | if (allow_JXK) { | 756 | 212k | if (row0[bx + cx + 0].Strategy() != acs_rawJXK) { | 757 | 209k | JXL_RETURN_IF_ERROR(EstimateEntropy( | 758 | 209k | acsJXK, entropy_mul_JXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config, | 759 | 209k | cmap_factors, block, scratch_space, quantized, entropy_JXK_left)); | 760 | 209k | } | 761 | 212k | if (row0[bx + cx + blocks_half].Strategy() != acs_rawJXK) { | 762 | 210k | JXL_RETURN_IF_ERROR( | 763 | 210k | EstimateEntropy(acsJXK, entropy_mul_JXK, (bx + cx + blocks_half) * 8, | 764 | 210k | (by + cy + 0) * 8, config, cmap_factors, block, | 765 | 210k | scratch_space, quantized, entropy_JXK_right)); | 766 | 210k | } | 767 | 212k | } | 768 | 216k | if (allow_KXJ) { | 769 | 212k | if (row0[bx + cx].Strategy() != acs_rawKXJ) { | 770 | 209k | JXL_RETURN_IF_ERROR(EstimateEntropy( | 771 | 209k | acsKXJ, entropy_mul_JXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config, | 772 | 209k | cmap_factors, block, scratch_space, quantized, entropy_KXJ_top)); | 773 | 209k | } | 774 | 212k | if (row1[bx + cx].Strategy() != acs_rawKXJ) { | 775 | 211k | JXL_RETURN_IF_ERROR( | 776 | 211k | EstimateEntropy(acsKXJ, entropy_mul_JXK, (bx + cx + 0) * 8, | 777 | 211k | (by + cy + blocks_half) * 8, config, cmap_factors, | 778 | 211k | block, scratch_space, quantized, 
entropy_KXJ_bottom)); | 779 | 211k | } | 780 | 212k | } | 781 | 216k | if (allow_square_transform) { | 782 | | // We control the exploration of the square transform separately so that | 783 | | // we can turn it off at high decoding speeds for 32x32, but still allow | 784 | | // exploring 16x32 and 32x16. | 785 | 216k | JXL_RETURN_IF_ERROR(EstimateEntropy( | 786 | 216k | acsJXJ, entropy_mul_JXJ, (bx + cx + 0) * 8, (by + cy + 0) * 8, config, | 787 | 216k | cmap_factors, block, scratch_space, quantized, entropy_JXJ)); | 788 | 216k | } | 789 | | | 790 | | // Test if this block should have JXK or KXJ transforms, | 791 | | // because it can have only one or the other. | 792 | 216k | float costJxN = std::min(entropy_JXK_left, entropy[0][0] + entropy[1][0]) + | 793 | 216k | std::min(entropy_JXK_right, entropy[0][1] + entropy[1][1]); | 794 | 216k | float costNxJ = std::min(entropy_KXJ_top, entropy[0][0] + entropy[0][1]) + | 795 | 216k | std::min(entropy_KXJ_bottom, entropy[1][0] + entropy[1][1]); | 796 | 216k | if (entropy_JXJ < costJxN && entropy_JXJ < costNxJ) { | 797 | 34.7k | JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_rawJXJ)); | 798 | 34.7k | SetEntropyForTransform(cx, cy, acs_rawJXJ, entropy_JXJ, entropy_estimate); | 799 | 181k | } else if (costJxN < costNxJ) { | 800 | 29.7k | if (entropy_JXK_left < entropy[0][0] + entropy[1][0]) { | 801 | 6.27k | JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_rawJXK)); | 802 | 6.27k | SetEntropyForTransform(cx, cy, acs_rawJXK, entropy_JXK_left, | 803 | 6.27k | entropy_estimate); | 804 | 6.27k | } | 805 | 29.7k | if (entropy_JXK_right < entropy[0][1] + entropy[1][1]) { | 806 | 5.95k | JXL_RETURN_IF_ERROR( | 807 | 5.95k | ac_strategy->Set(bx + cx + blocks_half, by + cy, acs_rawJXK)); | 808 | 5.95k | SetEntropyForTransform(cx + blocks_half, cy, acs_rawJXK, | 809 | 5.95k | entropy_JXK_right, entropy_estimate); | 810 | 5.95k | } | 811 | 151k | } else { | 812 | 151k | if (entropy_KXJ_top < entropy[0][0] + 
entropy[0][1]) { | 813 | 7.13k | JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + cx, by + cy, acs_rawKXJ)); | 814 | 7.13k | SetEntropyForTransform(cx, cy, acs_rawKXJ, entropy_KXJ_top, | 815 | 7.13k | entropy_estimate); | 816 | 7.13k | } | 817 | 151k | if (entropy_KXJ_bottom < entropy[1][0] + entropy[1][1]) { | 818 | 7.01k | JXL_RETURN_IF_ERROR( | 819 | 7.01k | ac_strategy->Set(bx + cx, by + cy + blocks_half, acs_rawKXJ)); | 820 | 7.01k | SetEntropyForTransform(cx, cy + blocks_half, acs_rawKXJ, | 821 | 7.01k | entropy_KXJ_bottom, entropy_estimate); | 822 | 7.01k | } | 823 | 151k | } | 824 | 216k | return true; | 825 | 216k | } |
Unexecuted instantiation: jxl::N_SSE2::FindBestFirstLevelDivisionForSquare(unsigned long, bool, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, float, float*, float*, float*, unsigned int*) |
826 | | |
// Chooses AC strategies for one rect of at most 8x8 blocks: first picks the
// best 8x8 transform for every block, then tries to merge blocks into larger
// transforms when the estimated entropy is lower.
//   cparams     - butteraugli_distance, speed_tier and decoding_speed_tier are read.
//   config      - forwarded unchanged to the entropy-estimation helpers.
//   rect        - area in block units; xsize()/ysize() are checked to be <= 8.
//   cmap        - supplies the Y-to-X / Y-to-B ratios of the tile containing rect.
//   block       - float scratch; scratch_space starts 3 * kMaxCoeffArea floats in.
//   quantized   - scratch buffer forwarded to the helpers.
//   ac_strategy - output; updated through AcStrategyImage::Set().
// Returns true on success; helper errors propagate via JXL_RETURN_IF_ERROR.
827 | | Status ProcessRectACS(const CompressParams& cparams, const ACSConfig& config, |
828 | | const Rect& rect, const ColorCorrelationMap& cmap, |
829 | | float* JXL_RESTRICT block, |
830 | | uint32_t* JXL_RESTRICT quantized, |
831 | 7.22k | AcStrategyImage* ac_strategy) { |
832 | | // Main philosophy here: |
833 | | // 1. First find best 8x8 transform for each area. |
834 | | // 2. Merging them into larger transforms where possible, but |
835 | | // starting from the smallest transforms (16x8 and 8x16). |
836 | | // Additional complication: 16x8 and 8x16 are considered |
837 | | // simultaneously and fairly against each other. |
838 | | // We are looking at 64x64 squares since the Y-to-X and Y-to-B |
839 | | // maps happen to be at that resolution, and having |
840 | | // integral transforms cross these boundaries leads to |
841 | | // additional complications. |
842 | 7.22k | const float butteraugli_target = cparams.butteraugli_distance; |
843 | 7.22k | float* JXL_RESTRICT scratch_space = block + 3 * AcStrategy::kMaxCoeffArea; |
844 | 7.22k | size_t bx = rect.x0(); |
845 | 7.22k | size_t by = rect.y0(); |
846 | 7.22k | JXL_ENSURE(rect.xsize() <= 8); |
847 | 7.22k | JXL_ENSURE(rect.ysize() <= 8); |
848 | 7.22k | size_t tx = bx / kColorTileDimInBlocks; |
849 | 7.22k | size_t ty = by / kColorTileDimInBlocks; |
850 | 7.22k | const float cmap_factors[3] = { |
851 | 7.22k | cmap.base().YtoXRatio(cmap.ytox_map.ConstRow(ty)[tx]), |
852 | 7.22k | 0.0f, |
853 | 7.22k | cmap.base().YtoBRatio(cmap.ytob_map.ConstRow(ty)[tx]), |
854 | 7.22k | }; |
// For speed_tier values greater than kHare nothing is written to ac_strategy.
855 | 7.22k | if (cparams.speed_tier > SpeedTier::kHare) return true; |
856 | | // First compute the best 8x8 transform for each square. Later, we do not |
857 | | // experiment with different combinations, but only use the best of the 8x8s |
858 | | // when DCT8X8 is specified in the tree search. |
859 | | // 8x8 transforms have 10 variants, but every larger transform is just a DCT. |
// entropy_estimate is indexed as [iy * 8 + ix], i.e. with a fixed row stride of 8.
860 | 7.22k | float entropy_estimate[64] = {}; |
861 | | // Favor all 8x8 transforms (against 16x8 and larger transforms) at |
862 | | // low butteraugli_target distances. |
863 | 7.22k | static const float k8x8mul1 = -0.4; |
864 | 7.22k | static const float k8x8mul2 = 1.0; |
865 | 7.22k | static const float k8x8base = 1.4; |
866 | 7.22k | const float mul8x8 = k8x8mul2 + k8x8mul1 / (butteraugli_target + k8x8base); |
867 | 60.6k | for (size_t iy = 0; iy < rect.ysize(); iy++) { |
868 | 451k | for (size_t ix = 0; ix < rect.xsize(); ix++) { |
869 | 398k | float entropy = 0.0; |
870 | 398k | AcStrategyType best_of_8x8s; |
871 | 398k | JXL_RETURN_IF_ERROR(FindBest8x8Transform( |
872 | 398k | 8 * (bx + ix), 8 * (by + iy), static_cast<int>(cparams.speed_tier), |
873 | 398k | butteraugli_target, config, cmap_factors, ac_strategy, block, |
874 | 398k | scratch_space, quantized, &entropy, best_of_8x8s)); |
875 | 398k | JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + ix, by + iy, best_of_8x8s)); |
876 | 398k | entropy_estimate[iy * 8 + ix] = entropy * mul8x8; |
877 | 398k | } |
878 | 53.4k | } |
879 | | // Merge when a larger transform is better than the previously |
880 | | // searched best combination of 8x8 transforms. |
// NOTE(review): encoding_speed_tier_max_limit is never read in this function.
881 | 7.22k | struct MergeTry { |
882 | 7.22k | AcStrategyType type; |
883 | 7.22k | uint8_t priority; |
884 | 7.22k | uint8_t decoding_speed_tier_max_limit; |
885 | 7.22k | uint8_t encoding_speed_tier_max_limit; |
886 | 7.22k | float entropy_mul; |
887 | 7.22k | }; |
888 | | // These numbers need to be figured out manually and looking at |
889 | | // ringing next to sky etc. Optimization will find smaller numbers |
890 | | // and produce more ringing than is ideal. Larger numbers will |
891 | | // help stop ringing. |
892 | 7.22k | const float entropy_mul16X8 = 1.21; |
893 | 7.22k | const float entropy_mul16X16 = 1.34; |
894 | 7.22k | const float entropy_mul16X32 = 1.49; |
895 | 7.22k | const float entropy_mul32X32 = 1.48; |
896 | 7.22k | const float entropy_mul64X32 = 2.25; |
897 | 7.22k | const float entropy_mul64X64 = 2.25; |
898 | | // TODO(jyrki): Consider this feedback in further changes: |
899 | | // Also effectively when the multipliers for smaller blocks are |
900 | | // below 1, this raises the bar for the bigger blocks even higher |
901 | | // in that sense these constants are not independent (e.g. changing |
902 | | // the constant for DCT16x32 by -5% (making it more likely) also |
903 | | // means that DCT32x32 becomes harder to do when starting from |
904 | | // two DCT16x32s). It might be better to make them more independent, |
905 | | // e.g. by not applying the multiplier when storing the new entropy |
906 | | // estimates in TryMergeToACSCandidate(). |
// NOTE(review): the array is sized 9 but only 6 initializers follow, so the 3
// trailing entries are value-initialized (zero priority, limits and
// entropy_mul) and the loop below still visits them (65.0k hits = 9 x 7.22k).
// They are filtered out only when cparams.decoding_speed_tier is positive;
// otherwise they presumably fall through to TryMergeAcs. Confirm that this is
// a no-op there, or drop the explicit array size.
907 | 7.22k | const MergeTry kTransformsForMerge[9] = { |
908 | 7.22k | {AcStrategyType::DCT16X8, 2, 4, 5, entropy_mul16X8}, |
909 | 7.22k | {AcStrategyType::DCT8X16, 2, 4, 5, entropy_mul16X8}, |
910 | | // FindBestFirstLevelDivisionForSquare looks for DCT16X16 and its |
911 | | // subdivisions. {AcStrategyType::DCT16X16, 3, entropy_mul16X16}, |
912 | 7.22k | {AcStrategyType::DCT16X32, 4, 4, 4, entropy_mul16X32}, |
913 | 7.22k | {AcStrategyType::DCT32X16, 4, 4, 4, entropy_mul16X32}, |
914 | | // FindBestFirstLevelDivisionForSquare looks for DCT32X32 and its |
915 | | // subdivisions. {AcStrategyType::DCT32X32, 5, 1, 5, |
916 | | // 0.9822994906548809f}, |
917 | 7.22k | {AcStrategyType::DCT64X32, 6, 1, 3, entropy_mul64X32}, |
918 | 7.22k | {AcStrategyType::DCT32X64, 6, 1, 3, entropy_mul64X32}, |
919 | | // {AcStrategyType::DCT64X64, 8, 1, 3, 2.0846542128012948f}, |
920 | 7.22k | }; |
921 | | /* |
922 | | These sizes not yet included in merge heuristic: |
923 | | set(AcStrategyType::DCT32X8, 0.0f, 2.261390410971102f); |
924 | | set(AcStrategyType::DCT8X32, 0.0f, 2.261390410971102f); |
925 | | set(AcStrategyType::DCT128X128, 0.0f, 1.0f); |
926 | | set(AcStrategyType::DCT128X64, 0.0f, 0.73f); |
927 | | set(AcStrategyType::DCT64X128, 0.0f, 0.73f); |
928 | | set(AcStrategyType::DCT256X256, 0.0f, 1.0f); |
929 | | set(AcStrategyType::DCT256X128, 0.0f, 0.73f); |
930 | | set(AcStrategyType::DCT128X256, 0.0f, 0.73f); |
931 | | */ |
932 | | |
933 | | // Priority is a tricky kludge to avoid collisions so that transforms |
934 | | // don't overlap. |
935 | 7.22k | uint8_t priority[64] = {}; |
936 | 7.22k | bool enable_32x32 = cparams.decoding_speed_tier < 4; |
937 | 65.0k | for (auto mt : kTransformsForMerge) { |
938 | 65.0k | if (mt.decoding_speed_tier_max_limit < cparams.decoding_speed_tier) { |
939 | 0 | continue; |
940 | 0 | } |
941 | 65.0k | AcStrategy acs = AcStrategy::FromRawStrategy(mt.type); |
942 | | |
// Candidate positions advance by the transform's own size in blocks.
943 | 363k | for (size_t cy = 0; cy + acs.covered_blocks_y() - 1 < rect.ysize(); |
944 | 298k | cy += acs.covered_blocks_y()) { |
945 | 2.00M | for (size_t cx = 0; cx + acs.covered_blocks_x() - 1 < rect.xsize(); |
946 | 1.70M | cx += acs.covered_blocks_x()) { |
947 | 1.70M | if (cy + 7 < rect.ysize() && cx + 7 < rect.xsize()) { |
948 | 51.0k | if (cparams.decoding_speed_tier < 4 && |
949 | 51.0k | mt.type == AcStrategyType::DCT32X64) { |
950 | | // Aligned 8x8-block square: one call using the DCT32X64 and 64X64 multipliers. |
951 | 5.66k | if ((cy | cx) % 8 == 0) { |
952 | 5.66k | JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare( |
953 | 5.66k | 8, true, bx, by, cx, cy, config, cmap_factors, ac_strategy, |
954 | 5.66k | mt.entropy_mul, entropy_mul64X64, entropy_estimate, block, |
955 | 5.66k | scratch_space, quantized)); |
956 | 5.66k | } |
957 | 5.66k | continue; |
958 | 45.3k | } else if (mt.type == AcStrategyType::DCT32X16) { |
959 | | // NOTE(review): this skips DCT32X16, which the 4x4-block branch |
960 | | // below would skip anyway; DCT64X32 is not skipped and still |
961 | | // reaches TryMergeAcs. Confirm whether DCT64X32 was intended |
962 | | // here (the original comment was copied from the 8X16 case). |
963 | 5.66k | continue; |
964 | 5.66k | } |
965 | 51.0k | } |
966 | 1.69M | if ((mt.type == AcStrategyType::DCT16X32 && cy % 4 != 0) || |
967 | 1.69M | (mt.type == AcStrategyType::DCT32X16 && cx % 4 != 0)) { |
968 | | // already covered by FindBest32X32 (no such function is visible here; presumably the size-4 FindBestFirstLevelDivisionForSquare call - TODO confirm) |
969 | 46.8k | continue; |
970 | 46.8k | } |
971 | | |
972 | 1.64M | if (cy + 3 < rect.ysize() && cx + 3 < rect.xsize()) { |
973 | 672k | if (mt.type == AcStrategyType::DCT16X32) { |
974 | | // Aligned 4x4-block square: one call using the DCT16X32 and 32X32 multipliers. |
975 | 23.4k | if ((cy | cx) % 4 == 0) { |
976 | 23.4k | JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare( |
977 | 23.4k | 4, enable_32x32, bx, by, cx, cy, config, cmap_factors, |
978 | 23.4k | ac_strategy, mt.entropy_mul, entropy_mul32X32, |
979 | 23.4k | entropy_estimate, block, scratch_space, quantized)); |
980 | 23.4k | } |
981 | 23.4k | continue; |
982 | 648k | } else if (mt.type == AcStrategyType::DCT32X16) { |
983 | | // DCT32X16 is skipped here: the DCT16X32 entry precedes it in |
984 | | // the table and presumably already ran the call above for |
985 | | // this square. Where a 4x4-block square does not fit in the |
986 | | // rect, DCT32X16 still reaches TryMergeAcs. |
987 | 17.7k | continue; |
988 | 17.7k | } |
989 | 672k | } |
990 | 1.60M | if ((mt.type == AcStrategyType::DCT16X32 && cy % 4 != 0) || |
991 | 1.60M | (mt.type == AcStrategyType::DCT32X16 && cx % 4 != 0)) { |
992 | | // Same test as before the 4x4-block branch, so never taken (count 0). |
993 | 0 | continue; |
994 | 0 | } |
995 | 1.60M | if (cy + 1 < rect.ysize() && cx + 1 < rect.xsize()) { |
996 | 1.25M | if (mt.type == AcStrategyType::DCT8X16) { |
997 | | // We handle both DCT8X16 and DCT16X8 at the same time. |
998 | 170k | if ((cy | cx) % 2 == 0) { |
999 | 96.7k | JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare( |
1000 | 96.7k | 2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy, |
1001 | 96.7k | mt.entropy_mul, entropy_mul16X16, entropy_estimate, block, |
1002 | 96.7k | scratch_space, quantized)); |
1003 | 96.7k | } |
1004 | 170k | continue; |
1005 | 1.08M | } else if (mt.type == AcStrategyType::DCT16X8) { |
1006 | | // We handled both DCT8X16 and DCT16X8 at the same time, |
1007 | | // and that is above. The last column and last row, |
1008 | | // when the last column or last row is odd numbered, |
1009 | | // are still handled by TryMergeAcs. |
1010 | 169k | continue; |
1011 | 169k | } |
1012 | 1.25M | } |
1013 | 1.26M | if ((mt.type == AcStrategyType::DCT8X16 && cy % 2 == 1) || |
1014 | 1.26M | (mt.type == AcStrategyType::DCT16X8 && cx % 2 == 1)) { |
1015 | | // already covered by FindBestFirstLevelDivisionForSquare |
1016 | 47.2k | continue; |
1017 | 47.2k | } |
1018 | | // All other merge sizes are handled here. |
1019 | | // Some of the DCT16X8s and DCT8X16s will still leak through here |
1020 | | // when there is an odd number of 8x8 blocks, then the last row |
1021 | | // and column will get their DCT16X8s and DCT8X16s through the |
1022 | | // normal integral transform merging process. |
1023 | 1.21M | JXL_RETURN_IF_ERROR( |
1024 | 1.21M | TryMergeAcs(mt.type, bx, by, cx, cy, config, cmap_factors, |
1025 | 1.21M | ac_strategy, mt.entropy_mul, mt.priority, &priority[0], |
1026 | 1.21M | entropy_estimate, block, scratch_space, quantized)); |
1027 | 1.21M | } |
1028 | 298k | } |
1029 | 65.0k | } |
// At kHare the search stops after the aligned merges; the non-aligned
// searches below only run for speed_tier values less than kHare.
1030 | 7.22k | if (cparams.speed_tier >= SpeedTier::kHare) { |
1031 | 0 | return true; |
1032 | 0 | } |
1033 | | // Here we still try to do some non-aligned matching, find a few more |
1034 | | // 16X8, 8X16 and 16X16s between the non-2-aligned blocks. |
1035 | 53.4k | for (size_t cy = 0; cy + 1 < rect.ysize(); ++cy) { |
1036 | 344k | for (size_t cx = 0; cx + 1 < rect.xsize(); ++cx) { |
1037 | 298k | if ((cy | cx) % 2 != 0) { |
1038 | 201k | JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare( |
1039 | 201k | 2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy, |
1040 | 201k | entropy_mul16X8, entropy_mul16X16, entropy_estimate, block, |
1041 | 201k | scratch_space, quantized)); |
1042 | 201k | } |
1043 | 298k | } |
1044 | 46.2k | } |
1045 | | // Non-aligned matching for 32X32, 16X32 and 32X16. |
// Positions are visited with stride 2 when speed_tier >= kTortoise, else stride 1.
1046 | 7.22k | size_t step = cparams.speed_tier >= SpeedTier::kTortoise ? 2 : 1; |
1047 | 26.2k | for (size_t cy = 0; cy + 3 < rect.ysize(); cy += step) { |
1048 | 71.5k | for (size_t cx = 0; cx + 3 < rect.xsize(); cx += step) { |
1049 | 52.4k | if ((cy | cx) % 4 == 0) { |
1050 | 23.4k | continue; // Already tried with loop above (DCT16X32 case). |
1051 | 23.4k | } |
1052 | 29.0k | JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare( |
1053 | 29.0k | 4, enable_32x32, bx, by, cx, cy, config, cmap_factors, ac_strategy, |
1054 | 29.0k | entropy_mul16X32, entropy_mul32X32, entropy_estimate, block, |
1055 | 29.0k | scratch_space, quantized)); |
1056 | 29.0k | } |
1057 | 19.0k | } |
1058 | 7.22k | return true; |
1059 | 7.22k | } Unexecuted instantiation: jxl::N_SSE4::ProcessRectACS(jxl::CompressParams const&, jxl::ACSConfig const&, jxl::RectT<unsigned long> const&, jxl::ColorCorrelationMap const&, float*, unsigned int*, jxl::AcStrategyImage*) jxl::N_AVX2::ProcessRectACS(jxl::CompressParams const&, jxl::ACSConfig const&, jxl::RectT<unsigned long> const&, jxl::ColorCorrelationMap const&, float*, unsigned int*, jxl::AcStrategyImage*) Line | Count | Source | 831 | 7.22k | AcStrategyImage* ac_strategy) { | 832 | | // Main philosophy here: | 833 | | // 1. First find best 8x8 transform for each area. | 834 | | // 2. Merging them into larger transforms where possibly, but | 835 | | // starting from the smallest transforms (16x8 and 8x16). | 836 | | // Additional complication: 16x8 and 8x16 are considered | 837 | | // simultaneously and fairly against each other. | 838 | | // We are looking at 64x64 squares since the Y-to-X and Y-to-B | 839 | | // maps happen to be at that resolution, and having | 840 | | // integral transforms cross these boundaries leads to | 841 | | // additional complications. | 842 | 7.22k | const float butteraugli_target = cparams.butteraugli_distance; | 843 | 7.22k | float* JXL_RESTRICT scratch_space = block + 3 * AcStrategy::kMaxCoeffArea; | 844 | 7.22k | size_t bx = rect.x0(); | 845 | 7.22k | size_t by = rect.y0(); | 846 | 7.22k | JXL_ENSURE(rect.xsize() <= 8); | 847 | 7.22k | JXL_ENSURE(rect.ysize() <= 8); | 848 | 7.22k | size_t tx = bx / kColorTileDimInBlocks; | 849 | 7.22k | size_t ty = by / kColorTileDimInBlocks; | 850 | 7.22k | const float cmap_factors[3] = { | 851 | 7.22k | cmap.base().YtoXRatio(cmap.ytox_map.ConstRow(ty)[tx]), | 852 | 7.22k | 0.0f, | 853 | 7.22k | cmap.base().YtoBRatio(cmap.ytob_map.ConstRow(ty)[tx]), | 854 | 7.22k | }; | 855 | 7.22k | if (cparams.speed_tier > SpeedTier::kHare) return true; | 856 | | // First compute the best 8x8 transform for each square. 
Later, we do not | 857 | | // experiment with different combinations, but only use the best of the 8x8s | 858 | | // when DCT8X8 is specified in the tree search. | 859 | | // 8x8 transforms have 10 variants, but every larger transform is just a DCT. | 860 | 7.22k | float entropy_estimate[64] = {}; | 861 | | // Favor all 8x8 transforms (against 16x8 and larger transforms)) at | 862 | | // low butteraugli_target distances. | 863 | 7.22k | static const float k8x8mul1 = -0.4; | 864 | 7.22k | static const float k8x8mul2 = 1.0; | 865 | 7.22k | static const float k8x8base = 1.4; | 866 | 7.22k | const float mul8x8 = k8x8mul2 + k8x8mul1 / (butteraugli_target + k8x8base); | 867 | 60.6k | for (size_t iy = 0; iy < rect.ysize(); iy++) { | 868 | 451k | for (size_t ix = 0; ix < rect.xsize(); ix++) { | 869 | 398k | float entropy = 0.0; | 870 | 398k | AcStrategyType best_of_8x8s; | 871 | 398k | JXL_RETURN_IF_ERROR(FindBest8x8Transform( | 872 | 398k | 8 * (bx + ix), 8 * (by + iy), static_cast<int>(cparams.speed_tier), | 873 | 398k | butteraugli_target, config, cmap_factors, ac_strategy, block, | 874 | 398k | scratch_space, quantized, &entropy, best_of_8x8s)); | 875 | 398k | JXL_RETURN_IF_ERROR(ac_strategy->Set(bx + ix, by + iy, best_of_8x8s)); | 876 | 398k | entropy_estimate[iy * 8 + ix] = entropy * mul8x8; | 877 | 398k | } | 878 | 53.4k | } | 879 | | // Merge when a larger transform is better than the previously | 880 | | // searched best combination of 8x8 transforms. | 881 | 7.22k | struct MergeTry { | 882 | 7.22k | AcStrategyType type; | 883 | 7.22k | uint8_t priority; | 884 | 7.22k | uint8_t decoding_speed_tier_max_limit; | 885 | 7.22k | uint8_t encoding_speed_tier_max_limit; | 886 | 7.22k | float entropy_mul; | 887 | 7.22k | }; | 888 | | // These numbers need to be figured out manually and looking at | 889 | | // ringing next to sky etc. Optimization will find smaller numbers | 890 | | // and produce more ringing than is ideal. 
Larger numbers will | 891 | | // help stop ringing. | 892 | 7.22k | const float entropy_mul16X8 = 1.21; | 893 | 7.22k | const float entropy_mul16X16 = 1.34; | 894 | 7.22k | const float entropy_mul16X32 = 1.49; | 895 | 7.22k | const float entropy_mul32X32 = 1.48; | 896 | 7.22k | const float entropy_mul64X32 = 2.25; | 897 | 7.22k | const float entropy_mul64X64 = 2.25; | 898 | | // TODO(jyrki): Consider this feedback in further changes: | 899 | | // Also effectively when the multipliers for smaller blocks are | 900 | | // below 1, this raises the bar for the bigger blocks even higher | 901 | | // in that sense these constants are not independent (e.g. changing | 902 | | // the constant for DCT16x32 by -5% (making it more likely) also | 903 | | // means that DCT32x32 becomes harder to do when starting from | 904 | | // two DCT16x32s). It might be better to make them more independent, | 905 | | // e.g. by not applying the multiplier when storing the new entropy | 906 | | // estimates in TryMergeToACSCandidate(). | 907 | 7.22k | const MergeTry kTransformsForMerge[9] = { | 908 | 7.22k | {AcStrategyType::DCT16X8, 2, 4, 5, entropy_mul16X8}, | 909 | 7.22k | {AcStrategyType::DCT8X16, 2, 4, 5, entropy_mul16X8}, | 910 | | // FindBestFirstLevelDivisionForSquare looks for DCT16X16 and its | 911 | | // subdivisions. {AcStrategyType::DCT16X16, 3, entropy_mul16X16}, | 912 | 7.22k | {AcStrategyType::DCT16X32, 4, 4, 4, entropy_mul16X32}, | 913 | 7.22k | {AcStrategyType::DCT32X16, 4, 4, 4, entropy_mul16X32}, | 914 | | // FindBestFirstLevelDivisionForSquare looks for DCT32X32 and its | 915 | | // subdivisions. 
{AcStrategyType::DCT32X32, 5, 1, 5, | 916 | | // 0.9822994906548809f}, | 917 | 7.22k | {AcStrategyType::DCT64X32, 6, 1, 3, entropy_mul64X32}, | 918 | 7.22k | {AcStrategyType::DCT32X64, 6, 1, 3, entropy_mul64X32}, | 919 | | // {AcStrategyType::DCT64X64, 8, 1, 3, 2.0846542128012948f}, | 920 | 7.22k | }; | 921 | | /* | 922 | | These sizes not yet included in merge heuristic: | 923 | | set(AcStrategyType::DCT32X8, 0.0f, 2.261390410971102f); | 924 | | set(AcStrategyType::DCT8X32, 0.0f, 2.261390410971102f); | 925 | | set(AcStrategyType::DCT128X128, 0.0f, 1.0f); | 926 | | set(AcStrategyType::DCT128X64, 0.0f, 0.73f); | 927 | | set(AcStrategyType::DCT64X128, 0.0f, 0.73f); | 928 | | set(AcStrategyType::DCT256X256, 0.0f, 1.0f); | 929 | | set(AcStrategyType::DCT256X128, 0.0f, 0.73f); | 930 | | set(AcStrategyType::DCT128X256, 0.0f, 0.73f); | 931 | | */ | 932 | | | 933 | | // Priority is a tricky kludge to avoid collisions so that transforms | 934 | | // don't overlap. | 935 | 7.22k | uint8_t priority[64] = {}; | 936 | 7.22k | bool enable_32x32 = cparams.decoding_speed_tier < 4; | 937 | 65.0k | for (auto mt : kTransformsForMerge) { | 938 | 65.0k | if (mt.decoding_speed_tier_max_limit < cparams.decoding_speed_tier) { | 939 | 0 | continue; | 940 | 0 | } | 941 | 65.0k | AcStrategy acs = AcStrategy::FromRawStrategy(mt.type); | 942 | | | 943 | 363k | for (size_t cy = 0; cy + acs.covered_blocks_y() - 1 < rect.ysize(); | 944 | 298k | cy += acs.covered_blocks_y()) { | 945 | 2.00M | for (size_t cx = 0; cx + acs.covered_blocks_x() - 1 < rect.xsize(); | 946 | 1.70M | cx += acs.covered_blocks_x()) { | 947 | 1.70M | if (cy + 7 < rect.ysize() && cx + 7 < rect.xsize()) { | 948 | 51.0k | if (cparams.decoding_speed_tier < 4 && | 949 | 51.0k | mt.type == AcStrategyType::DCT32X64) { | 950 | | // We handle both DCT8X16 and DCT16X8 at the same time. 
| 951 | 5.66k | if ((cy | cx) % 8 == 0) { | 952 | 5.66k | JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare( | 953 | 5.66k | 8, true, bx, by, cx, cy, config, cmap_factors, ac_strategy, | 954 | 5.66k | mt.entropy_mul, entropy_mul64X64, entropy_estimate, block, | 955 | 5.66k | scratch_space, quantized)); | 956 | 5.66k | } | 957 | 5.66k | continue; | 958 | 45.3k | } else if (mt.type == AcStrategyType::DCT32X16) { | 959 | | // We handled both DCT8X16 and DCT16X8 at the same time, | 960 | | // and that is above. The last column and last row, | 961 | | // when the last column or last row is odd numbered, | 962 | | // are still handled by TryMergeAcs. | 963 | 5.66k | continue; | 964 | 5.66k | } | 965 | 51.0k | } | 966 | 1.69M | if ((mt.type == AcStrategyType::DCT16X32 && cy % 4 != 0) || | 967 | 1.69M | (mt.type == AcStrategyType::DCT32X16 && cx % 4 != 0)) { | 968 | | // already covered by FindBest32X32 | 969 | 46.8k | continue; | 970 | 46.8k | } | 971 | | | 972 | 1.64M | if (cy + 3 < rect.ysize() && cx + 3 < rect.xsize()) { | 973 | 672k | if (mt.type == AcStrategyType::DCT16X32) { | 974 | | // We handle both DCT8X16 and DCT16X8 at the same time. | 975 | 23.4k | if ((cy | cx) % 4 == 0) { | 976 | 23.4k | JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare( | 977 | 23.4k | 4, enable_32x32, bx, by, cx, cy, config, cmap_factors, | 978 | 23.4k | ac_strategy, mt.entropy_mul, entropy_mul32X32, | 979 | 23.4k | entropy_estimate, block, scratch_space, quantized)); | 980 | 23.4k | } | 981 | 23.4k | continue; | 982 | 648k | } else if (mt.type == AcStrategyType::DCT32X16) { | 983 | | // We handled both DCT8X16 and DCT16X8 at the same time, | 984 | | // and that is above. The last column and last row, | 985 | | // when the last column or last row is odd numbered, | 986 | | // are still handled by TryMergeAcs. 
| 987 | 17.7k | continue; | 988 | 17.7k | } | 989 | 672k | } | 990 | 1.60M | if ((mt.type == AcStrategyType::DCT16X32 && cy % 4 != 0) || | 991 | 1.60M | (mt.type == AcStrategyType::DCT32X16 && cx % 4 != 0)) { | 992 | | // already covered by FindBest32X32 | 993 | 0 | continue; | 994 | 0 | } | 995 | 1.60M | if (cy + 1 < rect.ysize() && cx + 1 < rect.xsize()) { | 996 | 1.25M | if (mt.type == AcStrategyType::DCT8X16) { | 997 | | // We handle both DCT8X16 and DCT16X8 at the same time. | 998 | 170k | if ((cy | cx) % 2 == 0) { | 999 | 96.7k | JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare( | 1000 | 96.7k | 2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy, | 1001 | 96.7k | mt.entropy_mul, entropy_mul16X16, entropy_estimate, block, | 1002 | 96.7k | scratch_space, quantized)); | 1003 | 96.7k | } | 1004 | 170k | continue; | 1005 | 1.08M | } else if (mt.type == AcStrategyType::DCT16X8) { | 1006 | | // We handled both DCT8X16 and DCT16X8 at the same time, | 1007 | | // and that is above. The last column and last row, | 1008 | | // when the last column or last row is odd numbered, | 1009 | | // are still handled by TryMergeAcs. | 1010 | 169k | continue; | 1011 | 169k | } | 1012 | 1.25M | } | 1013 | 1.26M | if ((mt.type == AcStrategyType::DCT8X16 && cy % 2 == 1) || | 1014 | 1.26M | (mt.type == AcStrategyType::DCT16X8 && cx % 2 == 1)) { | 1015 | | // already covered by FindBestFirstLevelDivisionForSquare | 1016 | 47.2k | continue; | 1017 | 47.2k | } | 1018 | | // All other merge sizes are handled here. | 1019 | | // Some of the DCT16X8s and DCT8X16s will still leak through here | 1020 | | // when there is an odd number of 8x8 blocks, then the last row | 1021 | | // and column will get their DCT16X8s and DCT8X16s through the | 1022 | | // normal integral transform merging process. 
| 1023 | 1.21M | JXL_RETURN_IF_ERROR( | 1024 | 1.21M | TryMergeAcs(mt.type, bx, by, cx, cy, config, cmap_factors, | 1025 | 1.21M | ac_strategy, mt.entropy_mul, mt.priority, &priority[0], | 1026 | 1.21M | entropy_estimate, block, scratch_space, quantized)); | 1027 | 1.21M | } | 1028 | 298k | } | 1029 | 65.0k | } | 1030 | 7.22k | if (cparams.speed_tier >= SpeedTier::kHare) { | 1031 | 0 | return true; | 1032 | 0 | } | 1033 | | // Here we still try to do some non-aligned matching, find a few more | 1034 | | // 16X8, 8X16 and 16X16s between the non-2-aligned blocks. | 1035 | 53.4k | for (size_t cy = 0; cy + 1 < rect.ysize(); ++cy) { | 1036 | 344k | for (size_t cx = 0; cx + 1 < rect.xsize(); ++cx) { | 1037 | 298k | if ((cy | cx) % 2 != 0) { | 1038 | 201k | JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare( | 1039 | 201k | 2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy, | 1040 | 201k | entropy_mul16X8, entropy_mul16X16, entropy_estimate, block, | 1041 | 201k | scratch_space, quantized)); | 1042 | 201k | } | 1043 | 298k | } | 1044 | 46.2k | } | 1045 | | // Non-aligned matching for 32X32, 16X32 and 32X16. | 1046 | 7.22k | size_t step = cparams.speed_tier >= SpeedTier::kTortoise ? 2 : 1; | 1047 | 26.2k | for (size_t cy = 0; cy + 3 < rect.ysize(); cy += step) { | 1048 | 71.5k | for (size_t cx = 0; cx + 3 < rect.xsize(); cx += step) { | 1049 | 52.4k | if ((cy | cx) % 4 == 0) { | 1050 | 23.4k | continue; // Already tried with loop above (DCT16X32 case). | 1051 | 23.4k | } | 1052 | 29.0k | JXL_RETURN_IF_ERROR(FindBestFirstLevelDivisionForSquare( | 1053 | 29.0k | 4, enable_32x32, bx, by, cx, cy, config, cmap_factors, ac_strategy, | 1054 | 29.0k | entropy_mul16X32, entropy_mul32X32, entropy_estimate, block, | 1055 | 29.0k | scratch_space, quantized)); | 1056 | 29.0k | } | 1057 | 19.0k | } | 1058 | 7.22k | return true; | 1059 | 7.22k | } |
Unexecuted instantiation: jxl::N_SSE2::ProcessRectACS(jxl::CompressParams const&, jxl::ACSConfig const&, jxl::RectT<unsigned long> const&, jxl::ColorCorrelationMap const&, float*, unsigned int*, jxl::AcStrategyImage*) |
1060 | | |
1061 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
1062 | | } // namespace HWY_NAMESPACE |
1063 | | } // namespace jxl |
1064 | | HWY_AFTER_NAMESPACE(); |
1065 | | |
1066 | | #if HWY_ONCE |
1067 | | namespace jxl { |
// Makes the per-target builds of ProcessRectACS available to the
// HWY_DYNAMIC_DISPATCH call in AcStrategyHeuristics::ProcessRect.
1068 | | HWY_EXPORT(ProcessRectACS); |
1069 | | |
// Fills `config` for later ProcessRect calls: makes sure the required dequant
// matrices are computed, records row pointers and strides of the input
// images, and scales the entropy-model constants by the target distance.
//   src, rect_in - config.src_rows[c] is row 0 of plane c of src within rect_in.
//   quant_field  - its row 0 pointer and PixelsPerRow() are stored.
//   mask         - stored the same way, but only when it is non-empty.
//   mask1x1      - xsize is always stored; row/stride only when non-empty.
//   matrices     - stored as config.dequant; EnsureComputed() is called on it.
// Returns true on success, otherwise the error from EnsureComputed().
// NOTE(review): config keeps raw row pointers, so the images presumably have
// to stay alive while ProcessRect runs - confirm against the callers.
1070 | | Status AcStrategyHeuristics::Init(const Image3F& src, const Rect& rect_in, |
1071 | | const ImageF& quant_field, const ImageF& mask, |
1072 | | const ImageF& mask1x1, |
1073 | 186 | DequantMatrices* matrices) { |
1074 | 186 | config.dequant = matrices; |
1075 | | |
1076 | 186 | if (cparams.speed_tier >= SpeedTier::kCheetah) { |
1077 | 0 | JXL_RETURN_IF_ERROR( |
1078 | 0 | matrices->EnsureComputed(memory_manager, 1)); // DCT8 only |
1079 | 186 | } else { |
// Build a bit mask with one bit per strategy value below DCT128X128.
1080 | 186 | uint32_t acs_mask = 0; |
1081 | | // All transforms up to 64x64. |
1082 | 4.09k | for (size_t i = 0; i < static_cast<size_t>(AcStrategyType::DCT128X128); |
1083 | 3.90k | i++) { |
1084 | 3.90k | acs_mask |= (1 << i); |
1085 | 3.90k | } |
1086 | 186 | JXL_RETURN_IF_ERROR(matrices->EnsureComputed(memory_manager, acs_mask)); |
1087 | 186 | } |
1088 | | |
1089 | | // Image row pointers and strides. |
1090 | 186 | config.quant_field_row = quant_field.Row(0); |
1091 | 186 | config.quant_field_stride = quant_field.PixelsPerRow(); |
1092 | 186 | if (mask.xsize() > 0 && mask.ysize() > 0) { |
1093 | 186 | config.masking_field_row = mask.Row(0); |
1094 | 186 | config.masking_field_stride = mask.PixelsPerRow(); |
1095 | 186 | } |
1096 | 186 | config.mask1x1_xsize = mask1x1.xsize(); |
1097 | 186 | if (mask1x1.xsize() > 0 && mask1x1.ysize() > 0) { |
1098 | 186 | config.masking1x1_field_row = mask1x1.Row(0); |
1099 | 186 | config.masking1x1_field_stride = mask1x1.PixelsPerRow(); |
1100 | 186 | } |
1101 | | |
1102 | 186 | config.src_rows[0] = rect_in.ConstPlaneRow(src, 0, 0); |
1103 | 186 | config.src_rows[1] = rect_in.ConstPlaneRow(src, 1, 0); |
1104 | 186 | config.src_rows[2] = rect_in.ConstPlaneRow(src, 2, 0); |
1105 | 186 | config.src_stride = src.PixelsPerRow(); |
1106 | | |
1107 | | // Entropy estimate is composed of two factors: |
1108 | | // - estimate of the number of bits that will be used by the block |
1109 | | // - information loss due to quantization |
1110 | | // The following constant controls the relative weights of these components. |
1111 | 186 | config.info_loss_multiplier = 1.2; |
1112 | 186 | config.zeros_mul = 9.3089059022677905; |
1113 | 186 | config.cost_delta = 10.833273317067883; |
1114 | | |
// ratio is exactly 1 at butteraugli_distance == 1, so the three base values
// above apply unchanged there; other distances rescale each of them by
// ratio raised to its own exponent.
1115 | 186 | static const float kBias = 0.13731742964354549; |
1116 | 186 | const float ratio = (cparams.butteraugli_distance + kBias) / (1.0f + kBias); |
1117 | | |
1118 | 186 | static const float kPow1 = 0.33677806662454718; |
1119 | 186 | static const float kPow2 = 0.50990926717963703; |
1120 | 186 | static const float kPow3 = 0.36702940662370243; |
1121 | 186 | config.info_loss_multiplier *= std::pow(ratio, kPow1); |
1122 | 186 | config.zeros_mul *= std::pow(ratio, kPow2); |
1123 | 186 | config.cost_delta *= std::pow(ratio, kPow3); |
1124 | 186 | return true; |
1125 | 186 | } |
1126 | | |
// Allocates the scratch buffers that ProcessRect slices per thread.
//   num_threads - number of per-thread slices to allocate; presumably every
//                 `thread` index later passed to ProcessRect must be below
//                 this value (TODO confirm against the callers).
// `mem` gets num_threads * mem_per_thread floats and `qmem` gets
// num_threads * qmem_per_thread uint32_t values.
// Returns true on success; a failed AlignedMemory::Create propagates through
// JXL_ASSIGN_OR_RETURN.
1127 | 186 | Status AcStrategyHeuristics::PrepareForThreads(std::size_t num_threads) { |
1128 | 186 | const size_t dct_scratch_size = |
1129 | 186 | 3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim; |
// Sizes are counted in elements here and converted to bytes just below.
1130 | 186 | mem_per_thread = 6 * AcStrategy::kMaxCoeffArea + dct_scratch_size; |
1131 | 186 | size_t mem_bytes = num_threads * mem_per_thread * sizeof(float); |
1132 | 186 | JXL_ASSIGN_OR_RETURN(mem, AlignedMemory::Create(memory_manager, mem_bytes)); |
1133 | 186 | qmem_per_thread = AcStrategy::kMaxCoeffArea; |
1134 | 186 | size_t qmem_bytes = num_threads * qmem_per_thread * sizeof(uint32_t); |
1135 | 186 | JXL_ASSIGN_OR_RETURN(qmem, AlignedMemory::Create(memory_manager, qmem_bytes)); |
1136 | 186 | return true; |
1137 | 186 | } |
1138 | | |
// Chooses the AC strategies for `rect` and writes them into `ac_strategy`.
//   rect, cmap, ac_strategy - forwarded to ProcessRectACS.
//   thread - selects this call's slice of the `mem` and `qmem` buffers
//            (offsets thread * mem_per_thread and thread * qmem_per_thread).
// Returns true after the DCT8 fill, otherwise the status of ProcessRectACS.
1139 | | Status AcStrategyHeuristics::ProcessRect(const Rect& rect, |
1140 | | const ColorCorrelationMap& cmap, |
1141 | | AcStrategyImage* ac_strategy, |
1142 | 7.22k | size_t thread) { |
1143 | | // In Cheetah mode, use DCT8 everywhere and uniform quantization. |
1144 | 7.22k | if (cparams.speed_tier >= SpeedTier::kCheetah) { |
1145 | 0 | ac_strategy->FillDCT8(rect); |
1146 | 0 | return true; |
1147 | 0 | } |
// Runs the ProcessRectACS build selected by Highway's dynamic dispatch.
1148 | 7.22k | return HWY_DYNAMIC_DISPATCH(ProcessRectACS)( |
1149 | 7.22k | cparams, config, rect, cmap, |
1150 | 7.22k | mem.address<float>() + thread * mem_per_thread, |
1151 | 7.22k | qmem.address<uint32_t>() + thread * qmem_per_thread, ac_strategy); |
1152 | 7.22k | } |
1153 | | |
// Records per-transform block counts and optionally calls DumpAcStrategy.
//   frame_dim   - its xsize/ysize are passed to DumpAcStrategy.
//   ac_strategy - only read here (CountBlocks and the dump).
//   aux_out     - when non-null, receives the block counts; also forwarded
//                 to DumpAcStrategy.
// Returns true unless DumpAcStrategy reports an error.
1154 | | Status AcStrategyHeuristics::Finalize(const FrameDimensions& frame_dim, |
1155 | | const AcStrategyImage& ac_strategy, |
1156 | 186 | AuxOut* aux_out) { |
1157 | | // Accounting and debug output. |
1158 | 186 | if (aux_out != nullptr) { |
// Each counter sums the strategies that share a transform size, e.g. both
// orientations of a rectangular DCT.
1159 | 0 | aux_out->num_small_blocks = |
1160 | 0 | ac_strategy.CountBlocks(AcStrategyType::IDENTITY) + |
1161 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT2X2) + |
1162 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT4X4); |
1163 | 0 | aux_out->num_dct4x8_blocks = |
1164 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT4X8) + |
1165 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT8X4); |
1166 | 0 | aux_out->num_afv_blocks = ac_strategy.CountBlocks(AcStrategyType::AFV0) + |
1167 | 0 | ac_strategy.CountBlocks(AcStrategyType::AFV1) + |
1168 | 0 | ac_strategy.CountBlocks(AcStrategyType::AFV2) + |
1169 | 0 | ac_strategy.CountBlocks(AcStrategyType::AFV3); |
1170 | 0 | aux_out->num_dct8_blocks = ac_strategy.CountBlocks(AcStrategyType::DCT); |
1171 | 0 | aux_out->num_dct8x16_blocks = |
1172 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT8X16) + |
1173 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT16X8); |
1174 | 0 | aux_out->num_dct8x32_blocks = |
1175 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT8X32) + |
1176 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT32X8); |
1177 | 0 | aux_out->num_dct16_blocks = |
1178 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT16X16); |
1179 | 0 | aux_out->num_dct16x32_blocks = |
1180 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT16X32) + |
1181 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT32X16); |
1182 | 0 | aux_out->num_dct32_blocks = |
1183 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT32X32); |
1184 | 0 | aux_out->num_dct32x64_blocks = |
1185 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT32X64) + |
1186 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT64X32); |
1187 | 0 | aux_out->num_dct64_blocks = |
1188 | 0 | ac_strategy.CountBlocks(AcStrategyType::DCT64X64); |
1189 | 0 | } |
1190 | | |
// The dump needs both the JXL_DEBUG_AC_STRATEGY switch and WantDebugOutput(cparams).
1191 | 186 | if (JXL_DEBUG_AC_STRATEGY && WantDebugOutput(cparams)) { |
1192 | 0 | JXL_RETURN_IF_ERROR(DumpAcStrategy(ac_strategy, frame_dim.xsize, |
1193 | 0 | frame_dim.ysize, "ac_strategy", aux_out, |
1194 | 0 | cparams)); |
1195 | 0 | } |
1196 | 186 | return true; |
1197 | 186 | } |
1198 | | |
1199 | | } // namespace jxl |
1200 | | #endif // HWY_ONCE |