/src/libjxl/lib/jxl/enc_ac_strategy.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jxl/enc_ac_strategy.h" |
7 | | |
8 | | #include <jxl/memory_manager.h> |
9 | | |
10 | | #include <algorithm> |
11 | | #include <cmath> |
12 | | #include <cstdint> |
13 | | #include <cstdio> |
14 | | #include <cstring> |
15 | | |
16 | | #undef HWY_TARGET_INCLUDE |
17 | | #define HWY_TARGET_INCLUDE "lib/jxl/enc_ac_strategy.cc" |
18 | | #include <hwy/foreach_target.h> |
19 | | #include <hwy/highway.h> |
20 | | |
21 | | #include "lib/jxl/ac_strategy.h" |
22 | | #include "lib/jxl/base/bits.h" |
23 | | #include "lib/jxl/base/compiler_specific.h" |
24 | | #include "lib/jxl/base/fast_math-inl.h" |
25 | | #include "lib/jxl/base/rect.h" |
26 | | #include "lib/jxl/base/status.h" |
27 | | #include "lib/jxl/dec_transforms-inl.h" |
28 | | #include "lib/jxl/enc_aux_out.h" |
29 | | #include "lib/jxl/enc_debug_image.h" |
30 | | #include "lib/jxl/enc_params.h" |
31 | | #include "lib/jxl/enc_transforms-inl.h" |
32 | | #include "lib/jxl/simd_util.h" |
33 | | |
34 | | // Some of the floating point constants in this file and in other |
35 | | // files in the libjxl project have been obtained using the |
36 | | // tools/optimizer/simplex_fork.py tool. It is a variation of |
37 | | // Nelder-Mead optimization, and we generally try to minimize |
38 | | // BPP * pnorm aggregate as reported by the benchmark_xl tool, |
39 | | // but occasionally the values are optimized by using additional |
40 | | // constraints such as maintaining a certain density, or ratio of |
41 | | // popularity of integral transforms. Jyrki visually reviews all |
42 | | // such changes and often makes manual changes to maintain good |
43 | | // visual quality to changes where butteraugli was not sufficiently |
44 | | // sensitive to some kind of degradation. Unfortunately image quality |
45 | | // is still more of an art than science. |
46 | | |
47 | | // Set JXL_DEBUG_AC_STRATEGY to 1 to enable debugging. |
48 | | #ifndef JXL_DEBUG_AC_STRATEGY |
49 | 104 | #define JXL_DEBUG_AC_STRATEGY 0 |
50 | | #endif |
51 | | |
52 | | // This must come before the begin/end_target, but HWY_ONCE is only true |
53 | | // after that, so use an "include guard". |
54 | | #ifndef LIB_JXL_ENC_AC_STRATEGY_ |
55 | | #define LIB_JXL_ENC_AC_STRATEGY_ |
56 | | // Parameters of the heuristic are marked with a OPTIMIZE comment. |
57 | | namespace jxl { |
58 | | namespace { |
59 | | |
60 | | // Debugging utilities. |
61 | | |
62 | | // Returns a linear sRGB color (as bytes) for each AC strategy. |
63 | 0 | const uint8_t* TypeColor(const uint8_t& raw_strategy) { |
64 | 0 | JXL_ASSERT(AcStrategy::IsRawStrategyValid(raw_strategy)); |
65 | 0 | static_assert(AcStrategy::kNumValidStrategies == 27, "Change colors"); |
66 | 0 | static constexpr uint8_t kColors[][3] = { |
67 | 0 | {0xFF, 0xFF, 0x00}, // DCT8 |
68 | 0 | {0xFF, 0x80, 0x80}, // HORNUSS |
69 | 0 | {0xFF, 0x80, 0x80}, // DCT2x2 |
70 | 0 | {0xFF, 0x80, 0x80}, // DCT4x4 |
71 | 0 | {0x80, 0xFF, 0x00}, // DCT16x16 |
72 | 0 | {0x00, 0xC0, 0x00}, // DCT32x32 |
73 | 0 | {0xC0, 0xFF, 0x00}, // DCT16x8 |
74 | 0 | {0xC0, 0xFF, 0x00}, // DCT8x16 |
75 | 0 | {0x00, 0xFF, 0x00}, // DCT32x8 |
76 | 0 | {0x00, 0xFF, 0x00}, // DCT8x32 |
77 | 0 | {0x00, 0xFF, 0x00}, // DCT32x16 |
78 | 0 | {0x00, 0xFF, 0x00}, // DCT16x32 |
79 | 0 | {0xFF, 0x80, 0x00}, // DCT4x8 |
80 | 0 | {0xFF, 0x80, 0x00}, // DCT8x4 |
81 | 0 | {0xFF, 0xFF, 0x80}, // AFV0 |
82 | 0 | {0xFF, 0xFF, 0x80}, // AFV1 |
83 | 0 | {0xFF, 0xFF, 0x80}, // AFV2 |
84 | 0 | {0xFF, 0xFF, 0x80}, // AFV3 |
85 | 0 | {0x00, 0xC0, 0xFF}, // DCT64x64 |
86 | 0 | {0x00, 0xFF, 0xFF}, // DCT64x32 |
87 | 0 | {0x00, 0xFF, 0xFF}, // DCT32x64 |
88 | 0 | {0x00, 0x40, 0xFF}, // DCT128x128 |
89 | 0 | {0x00, 0x80, 0xFF}, // DCT128x64 |
90 | 0 | {0x00, 0x80, 0xFF}, // DCT64x128 |
91 | 0 | {0x00, 0x00, 0xC0}, // DCT256x256 |
92 | 0 | {0x00, 0x00, 0xFF}, // DCT256x128 |
93 | 0 | {0x00, 0x00, 0xFF}, // DCT128x256 |
94 | 0 | }; |
95 | 0 | return kColors[raw_strategy]; |
96 | 0 | } |
97 | | |
98 | 0 | const uint8_t* TypeMask(const uint8_t& raw_strategy) { |
99 | 0 | JXL_ASSERT(AcStrategy::IsRawStrategyValid(raw_strategy)); |
100 | 0 | static_assert(AcStrategy::kNumValidStrategies == 27, "Add masks"); |
101 | 0 | // implicitly, first row and column is made dark |
102 | 0 | static constexpr uint8_t kMask[][64] = { |
103 | 0 | { |
104 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
105 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
106 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
107 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
108 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
109 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
110 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
111 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
112 | 0 | }, // DCT8 |
113 | 0 | { |
114 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
115 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
116 | 0 | 0, 0, 1, 0, 0, 1, 0, 0, // |
117 | 0 | 0, 0, 1, 0, 0, 1, 0, 0, // |
118 | 0 | 0, 0, 1, 1, 1, 1, 0, 0, // |
119 | 0 | 0, 0, 1, 0, 0, 1, 0, 0, // |
120 | 0 | 0, 0, 1, 0, 0, 1, 0, 0, // |
121 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
122 | 0 | }, // HORNUSS |
123 | 0 | { |
124 | 0 | 1, 1, 1, 1, 1, 1, 1, 1, // |
125 | 0 | 1, 0, 1, 0, 1, 0, 1, 0, // |
126 | 0 | 1, 1, 1, 1, 1, 1, 1, 1, // |
127 | 0 | 1, 0, 1, 0, 1, 0, 1, 0, // |
128 | 0 | 1, 1, 1, 1, 1, 1, 1, 1, // |
129 | 0 | 1, 0, 1, 0, 1, 0, 1, 0, // |
130 | 0 | 1, 1, 1, 1, 1, 1, 1, 1, // |
131 | 0 | 1, 0, 1, 0, 1, 0, 1, 0, // |
132 | 0 | }, // 2x2 |
133 | 0 | { |
134 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
135 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
136 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
137 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
138 | 0 | 1, 1, 1, 1, 1, 1, 1, 1, // |
139 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
140 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
141 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
142 | 0 | }, // 4x4 |
143 | 0 | {}, // DCT16x16 (unused) |
144 | 0 | {}, // DCT32x32 (unused) |
145 | 0 | {}, // DCT16x8 (unused) |
146 | 0 | {}, // DCT8x16 (unused) |
147 | 0 | {}, // DCT32x8 (unused) |
148 | 0 | {}, // DCT8x32 (unused) |
149 | 0 | {}, // DCT32x16 (unused) |
150 | 0 | {}, // DCT16x32 (unused) |
151 | 0 | { |
152 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
153 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
154 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
155 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
156 | 0 | 1, 1, 1, 1, 1, 1, 1, 1, // |
157 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
158 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
159 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
160 | 0 | }, // DCT4x8 |
161 | 0 | { |
162 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
163 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
164 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
165 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
166 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
167 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
168 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
169 | 0 | 0, 0, 0, 0, 1, 0, 0, 0, // |
170 | 0 | }, // DCT8x4 |
171 | 0 | { |
172 | 0 | 1, 1, 1, 1, 1, 0, 0, 0, // |
173 | 0 | 1, 1, 1, 1, 0, 0, 0, 0, // |
174 | 0 | 1, 1, 1, 0, 0, 0, 0, 0, // |
175 | 0 | 1, 1, 0, 0, 0, 0, 0, 0, // |
176 | 0 | 1, 0, 0, 0, 0, 0, 0, 0, // |
177 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
178 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
179 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
180 | 0 | }, // AFV0 |
181 | 0 | { |
182 | 0 | 0, 0, 0, 0, 1, 1, 1, 1, // |
183 | 0 | 0, 0, 0, 0, 0, 1, 1, 1, // |
184 | 0 | 0, 0, 0, 0, 0, 0, 1, 1, // |
185 | 0 | 0, 0, 0, 0, 0, 0, 0, 1, // |
186 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
187 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
188 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
189 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
190 | 0 | }, // AFV1 |
191 | 0 | { |
192 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
193 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
194 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
195 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
196 | 0 | 1, 0, 0, 0, 0, 0, 0, 0, // |
197 | 0 | 1, 1, 0, 0, 0, 0, 0, 0, // |
198 | 0 | 1, 1, 1, 0, 0, 0, 0, 0, // |
199 | 0 | 1, 1, 1, 1, 0, 0, 0, 0, // |
200 | 0 | }, // AFV2 |
201 | 0 | { |
202 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
203 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
204 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
205 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
206 | 0 | 0, 0, 0, 0, 0, 0, 0, 0, // |
207 | 0 | 0, 0, 0, 0, 0, 0, 0, 1, // |
208 | 0 | 0, 0, 0, 0, 0, 0, 1, 1, // |
209 | 0 | 0, 0, 0, 0, 0, 1, 1, 1, // |
210 | 0 | }, // AFV3 |
211 | 0 | }; |
212 | 0 | return kMask[raw_strategy]; |
213 | 0 | } |
214 | | |
215 | | Status DumpAcStrategy(const AcStrategyImage& ac_strategy, size_t xsize, |
216 | | size_t ysize, const char* tag, AuxOut* aux_out, |
217 | 0 | const CompressParams& cparams) { |
218 | 0 | JxlMemoryManager* memory_manager = ac_strategy.memory_manager(); |
219 | 0 | JXL_ASSIGN_OR_RETURN(Image3F color_acs, |
220 | 0 | Image3F::Create(memory_manager, xsize, ysize)); |
221 | 0 | for (size_t y = 0; y < ysize; y++) { |
222 | 0 | float* JXL_RESTRICT rows[3] = { |
223 | 0 | color_acs.PlaneRow(0, y), |
224 | 0 | color_acs.PlaneRow(1, y), |
225 | 0 | color_acs.PlaneRow(2, y), |
226 | 0 | }; |
227 | 0 | const AcStrategyRow acs_row = ac_strategy.ConstRow(y / kBlockDim); |
228 | 0 | for (size_t x = 0; x < xsize; x++) { |
229 | 0 | AcStrategy acs = acs_row[x / kBlockDim]; |
230 | 0 | const uint8_t* JXL_RESTRICT color = TypeColor(acs.RawStrategy()); |
231 | 0 | for (size_t c = 0; c < 3; c++) { |
232 | 0 | rows[c][x] = color[c] / 255.f; |
233 | 0 | } |
234 | 0 | } |
235 | 0 | } |
236 | 0 | size_t stride = color_acs.PixelsPerRow(); |
237 | 0 | for (size_t c = 0; c < 3; c++) { |
238 | 0 | for (size_t by = 0; by < DivCeil(ysize, kBlockDim); by++) { |
239 | 0 | float* JXL_RESTRICT row = color_acs.PlaneRow(c, by * kBlockDim); |
240 | 0 | const AcStrategyRow acs_row = ac_strategy.ConstRow(by); |
241 | 0 | for (size_t bx = 0; bx < DivCeil(xsize, kBlockDim); bx++) { |
242 | 0 | AcStrategy acs = acs_row[bx]; |
243 | 0 | if (!acs.IsFirstBlock()) continue; |
244 | 0 | const uint8_t* JXL_RESTRICT color = TypeColor(acs.RawStrategy()); |
245 | 0 | const uint8_t* JXL_RESTRICT mask = TypeMask(acs.RawStrategy()); |
246 | 0 | if (acs.covered_blocks_x() == 1 && acs.covered_blocks_y() == 1) { |
247 | 0 | for (size_t iy = 0; iy < kBlockDim && by * kBlockDim + iy < ysize; |
248 | 0 | iy++) { |
249 | 0 | for (size_t ix = 0; ix < kBlockDim && bx * kBlockDim + ix < xsize; |
250 | 0 | ix++) { |
251 | 0 | if (mask[iy * kBlockDim + ix]) { |
252 | 0 | row[iy * stride + bx * kBlockDim + ix] = color[c] / 800.f; |
253 | 0 | } |
254 | 0 | } |
255 | 0 | } |
256 | 0 | } |
257 | 0 | // draw block edges |
258 | 0 | for (size_t ix = 0; ix < kBlockDim * acs.covered_blocks_x() && |
259 | 0 | bx * kBlockDim + ix < xsize; |
260 | 0 | ix++) { |
261 | 0 | row[0 * stride + bx * kBlockDim + ix] = color[c] / 350.f; |
262 | 0 | } |
263 | 0 | for (size_t iy = 0; iy < kBlockDim * acs.covered_blocks_y() && |
264 | 0 | by * kBlockDim + iy < ysize; |
265 | 0 | iy++) { |
266 | 0 | row[iy * stride + bx * kBlockDim + 0] = color[c] / 350.f; |
267 | 0 | } |
268 | 0 | } |
269 | 0 | } |
270 | 0 | } |
271 | 0 | return DumpImage(cparams, tag, color_acs); |
272 | 0 | } |
273 | | |
274 | | } // namespace |
275 | | } // namespace jxl |
276 | | #endif // LIB_JXL_ENC_AC_STRATEGY_ |
277 | | |
278 | | HWY_BEFORE_NAMESPACE(); |
279 | | namespace jxl { |
280 | | namespace HWY_NAMESPACE { |
281 | | |
282 | | // These templates are not found via ADL. |
283 | | using hwy::HWY_NAMESPACE::AbsDiff; |
284 | | using hwy::HWY_NAMESPACE::Eq; |
285 | | using hwy::HWY_NAMESPACE::IfThenElseZero; |
286 | | using hwy::HWY_NAMESPACE::IfThenZeroElse; |
287 | | using hwy::HWY_NAMESPACE::Round; |
288 | | using hwy::HWY_NAMESPACE::Sqrt; |
289 | | |
290 | | bool MultiBlockTransformCrossesHorizontalBoundary( |
291 | | const AcStrategyImage& ac_strategy, size_t start_x, size_t y, |
292 | 0 | size_t end_x) { |
293 | 0 | if (start_x >= ac_strategy.xsize() || y >= ac_strategy.ysize()) { |
294 | 0 | return false; |
295 | 0 | } |
296 | 0 | if (y % 8 == 0) { |
297 | | // Nothing crosses 64x64 boundaries, and the memory on the other side |
298 | | // of the 64x64 block may still uninitialized. |
299 | 0 | return false; |
300 | 0 | } |
301 | 0 | end_x = std::min(end_x, ac_strategy.xsize()); |
302 | | // The first multiblock might be before the start_x, let's adjust it |
303 | | // to point to the first IsFirstBlock() == true block we find by backward |
304 | | // tracing. |
305 | 0 | AcStrategyRow row = ac_strategy.ConstRow(y); |
306 | 0 | const size_t start_x_limit = start_x & ~7; |
307 | 0 | while (start_x != start_x_limit && !row[start_x].IsFirstBlock()) { |
308 | 0 | --start_x; |
309 | 0 | } |
310 | 0 | for (size_t x = start_x; x < end_x;) { |
311 | 0 | if (row[x].IsFirstBlock()) { |
312 | 0 | x += row[x].covered_blocks_x(); |
313 | 0 | } else { |
314 | 0 | return true; |
315 | 0 | } |
316 | 0 | } |
317 | 0 | return false; |
318 | 0 | } Unexecuted instantiation: jxl::N_SSE4::MultiBlockTransformCrossesHorizontalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long) Unexecuted instantiation: jxl::N_AVX2::MultiBlockTransformCrossesHorizontalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long) Unexecuted instantiation: jxl::N_SSE2::MultiBlockTransformCrossesHorizontalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long) |
319 | | |
320 | | bool MultiBlockTransformCrossesVerticalBoundary( |
321 | | const AcStrategyImage& ac_strategy, size_t x, size_t start_y, |
322 | 0 | size_t end_y) { |
323 | 0 | if (x >= ac_strategy.xsize() || start_y >= ac_strategy.ysize()) { |
324 | 0 | return false; |
325 | 0 | } |
326 | 0 | if (x % 8 == 0) { |
327 | | // Nothing crosses 64x64 boundaries, and the memory on the other side |
328 | | // of the 64x64 block may still uninitialized. |
329 | 0 | return false; |
330 | 0 | } |
331 | 0 | end_y = std::min(end_y, ac_strategy.ysize()); |
332 | | // The first multiblock might be before the start_y, let's adjust it |
333 | | // to point to the first IsFirstBlock() == true block we find by backward |
334 | | // tracing. |
335 | 0 | const size_t start_y_limit = start_y & ~7; |
336 | 0 | while (start_y != start_y_limit && |
337 | 0 | !ac_strategy.ConstRow(start_y)[x].IsFirstBlock()) { |
338 | 0 | --start_y; |
339 | 0 | } |
340 | |
|
341 | 0 | for (size_t y = start_y; y < end_y;) { |
342 | 0 | AcStrategyRow row = ac_strategy.ConstRow(y); |
343 | 0 | if (row[x].IsFirstBlock()) { |
344 | 0 | y += row[x].covered_blocks_y(); |
345 | 0 | } else { |
346 | 0 | return true; |
347 | 0 | } |
348 | 0 | } |
349 | 0 | return false; |
350 | 0 | } Unexecuted instantiation: jxl::N_SSE4::MultiBlockTransformCrossesVerticalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long) Unexecuted instantiation: jxl::N_AVX2::MultiBlockTransformCrossesVerticalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long) Unexecuted instantiation: jxl::N_SSE2::MultiBlockTransformCrossesVerticalBoundary(jxl::AcStrategyImage const&, unsigned long, unsigned long, unsigned long) |
351 | | |
352 | | float EstimateEntropy(const AcStrategy& acs, float entropy_mul, size_t x, |
353 | | size_t y, const ACSConfig& config, |
354 | | const float* JXL_RESTRICT cmap_factors, float* block, |
355 | 520 | float* full_scratch_space, uint32_t* quantized) { |
356 | 520 | float* mem = full_scratch_space; |
357 | 520 | float* scratch_space = full_scratch_space + AcStrategy::kMaxCoeffArea; |
358 | 520 | const size_t size = (1 << acs.log2_covered_blocks()) * kDCTBlockSize; |
359 | | |
360 | | // Apply transform. |
361 | 2.08k | for (size_t c = 0; c < 3; c++) { |
362 | 1.56k | float* JXL_RESTRICT block_c = block + size * c; |
363 | 1.56k | TransformFromPixels(acs.Strategy(), &config.Pixel(c, x, y), |
364 | 1.56k | config.src_stride, block_c, scratch_space); |
365 | 1.56k | } |
366 | 520 | HWY_FULL(float) df; |
367 | | |
368 | 520 | const size_t num_blocks = acs.covered_blocks_x() * acs.covered_blocks_y(); |
369 | | // avoid large blocks when there is a lot going on in red-green. |
370 | 520 | float quant_norm16 = 0; |
371 | 520 | if (num_blocks == 1) { |
372 | | // When it is only one 8x8, we don't need aggregation of values. |
373 | 520 | quant_norm16 = config.Quant(x / 8, y / 8); |
374 | 520 | } else if (num_blocks == 2) { |
375 | | // Taking max instead of 8th norm seems to work |
376 | | // better for smallest blocks up to 16x8. Jyrki couldn't get |
377 | | // improvements in trying the same for 16x16 blocks. |
378 | 0 | if (acs.covered_blocks_y() == 2) { |
379 | 0 | quant_norm16 = |
380 | 0 | std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8, y / 8 + 1)); |
381 | 0 | } else { |
382 | 0 | quant_norm16 = |
383 | 0 | std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8 + 1, y / 8)); |
384 | 0 | } |
385 | 0 | } else { |
386 | | // Load QF value, calculate empirical heuristic on masking field |
387 | | // for weighting the information loss. Information loss manifests |
388 | | // itself as ringing, and masking could hide it. |
389 | 0 | for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { |
390 | 0 | for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { |
391 | 0 | float qval = config.Quant(x / 8 + ix, y / 8 + iy); |
392 | 0 | qval *= qval; |
393 | 0 | qval *= qval; |
394 | 0 | qval *= qval; |
395 | 0 | quant_norm16 += qval * qval; |
396 | 0 | } |
397 | 0 | } |
398 | 0 | quant_norm16 /= num_blocks; |
399 | 0 | quant_norm16 = FastPowf(quant_norm16, 1.0f / 16.0f); |
400 | 0 | } |
401 | 520 | const auto quant = Set(df, quant_norm16); |
402 | | |
403 | | // Compute entropy. |
404 | 520 | float entropy = 0.0f; |
405 | 520 | const HWY_CAPPED(float, 8) df8; |
406 | | |
407 | 520 | auto loss = Zero(df8); |
408 | 2.08k | for (size_t c = 0; c < 3; c++) { |
409 | 1.56k | const float* inv_matrix = config.dequant->InvMatrix(acs.RawStrategy(), c); |
410 | 1.56k | const float* matrix = config.dequant->Matrix(acs.RawStrategy(), c); |
411 | 1.56k | const auto cmap_factor = Set(df, cmap_factors[c]); |
412 | | |
413 | 1.56k | auto entropy_v = Zero(df); |
414 | 1.56k | auto nzeros_v = Zero(df); |
415 | 14.0k | for (size_t i = 0; i < num_blocks * kDCTBlockSize; i += Lanes(df)) { |
416 | 12.4k | const auto in = Load(df, block + c * size + i); |
417 | 12.4k | const auto in_y = Mul(Load(df, block + size + i), cmap_factor); |
418 | 12.4k | const auto im = Load(df, inv_matrix + i); |
419 | 12.4k | const auto val = Mul(Sub(in, in_y), Mul(im, quant)); |
420 | 12.4k | const auto rval = Round(val); |
421 | 12.4k | const auto diff = Sub(val, rval); |
422 | 12.4k | const auto m = Load(df, matrix + i); |
423 | 12.4k | Store(Mul(m, diff), df, &mem[i]); |
424 | 12.4k | const auto q = Abs(rval); |
425 | 12.4k | const auto q_is_zero = Eq(q, Zero(df)); |
426 | | // We used to have q * C here, but that cost model seems to |
427 | | // be punishing large values more than necessary. Sqrt tries |
428 | | // to avoid large values less aggressively. |
429 | 12.4k | entropy_v = Add(Sqrt(q), entropy_v); |
430 | 12.4k | nzeros_v = Add(nzeros_v, IfThenZeroElse(q_is_zero, Set(df, 1.0f))); |
431 | 12.4k | } |
432 | | |
433 | 1.56k | { |
434 | 1.56k | auto lossc = Zero(df8); |
435 | 1.56k | TransformToPixels(acs.Strategy(), &mem[0], block, |
436 | 1.56k | acs.covered_blocks_x() * 8, scratch_space); |
437 | | |
438 | 3.12k | for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { |
439 | 3.12k | for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { |
440 | 14.0k | for (size_t dy = 0; dy < kBlockDim; ++dy) { |
441 | 24.9k | for (size_t dx = 0; dx < kBlockDim; dx += Lanes(df8)) { |
442 | 12.4k | auto in = Load(df8, block + |
443 | 12.4k | (iy * kBlockDim + dy) * |
444 | 12.4k | (acs.covered_blocks_x() * kBlockDim) + |
445 | 12.4k | ix * kBlockDim + dx); |
446 | 12.4k | auto masku = Abs(Load( |
447 | 12.4k | df8, config.MaskingPtr1x1(x + ix * 8 + dx, y + iy * 8 + dy))); |
448 | 12.4k | in = Mul(masku, in); |
449 | 12.4k | in = Mul(in, in); |
450 | 12.4k | in = Mul(in, in); |
451 | 12.4k | in = Mul(in, in); |
452 | 12.4k | lossc = Add(lossc, in); |
453 | 12.4k | } |
454 | 12.4k | } |
455 | 1.56k | } |
456 | 1.56k | } |
457 | 1.56k | static const double kChannelMul[3] = { |
458 | 1.56k | pow(10.2, 8.0), |
459 | 1.56k | pow(1.0, 8.0), |
460 | 1.56k | pow(1.03, 8.0), |
461 | 1.56k | }; |
462 | 1.56k | lossc = Mul(Set(df8, kChannelMul[c]), lossc); |
463 | 1.56k | loss = Add(loss, lossc); |
464 | 1.56k | } |
465 | 1.56k | entropy += config.cost_delta * GetLane(SumOfLanes(df, entropy_v)); |
466 | 1.56k | size_t num_nzeros = GetLane(SumOfLanes(df, nzeros_v)); |
467 | | // Add #bit of num_nonzeros, as an estimate of the cost for encoding the |
468 | | // number of non-zeros of the block. |
469 | 1.56k | size_t nbits = CeilLog2Nonzero(num_nzeros + 1) + 1; |
470 | | // Also add #bit of #bit of num_nonzeros, to estimate the ANS cost, with a |
471 | | // bias. |
472 | 1.56k | entropy += config.zeros_mul * (CeilLog2Nonzero(nbits + 17) + nbits); |
473 | 1.56k | } |
474 | 520 | float loss_scalar = |
475 | 520 | pow(GetLane(SumOfLanes(df8, loss)) / (num_blocks * kDCTBlockSize), |
476 | 520 | 1.0 / 8.0) * |
477 | 520 | (num_blocks * kDCTBlockSize) / quant_norm16; |
478 | 520 | float ret = entropy * entropy_mul; |
479 | 520 | ret += config.info_loss_multiplier * loss_scalar; |
480 | 520 | return ret; |
481 | 520 | } Unexecuted instantiation: jxl::N_SSE4::EstimateEntropy(jxl::AcStrategy const&, float, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, float*, float*, unsigned int*) jxl::N_AVX2::EstimateEntropy(jxl::AcStrategy const&, float, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, float*, float*, unsigned int*) Line | Count | Source | 355 | 520 | float* full_scratch_space, uint32_t* quantized) { | 356 | 520 | float* mem = full_scratch_space; | 357 | 520 | float* scratch_space = full_scratch_space + AcStrategy::kMaxCoeffArea; | 358 | 520 | const size_t size = (1 << acs.log2_covered_blocks()) * kDCTBlockSize; | 359 | | | 360 | | // Apply transform. | 361 | 2.08k | for (size_t c = 0; c < 3; c++) { | 362 | 1.56k | float* JXL_RESTRICT block_c = block + size * c; | 363 | 1.56k | TransformFromPixels(acs.Strategy(), &config.Pixel(c, x, y), | 364 | 1.56k | config.src_stride, block_c, scratch_space); | 365 | 1.56k | } | 366 | 520 | HWY_FULL(float) df; | 367 | | | 368 | 520 | const size_t num_blocks = acs.covered_blocks_x() * acs.covered_blocks_y(); | 369 | | // avoid large blocks when there is a lot going on in red-green. | 370 | 520 | float quant_norm16 = 0; | 371 | 520 | if (num_blocks == 1) { | 372 | | // When it is only one 8x8, we don't need aggregation of values. | 373 | 520 | quant_norm16 = config.Quant(x / 8, y / 8); | 374 | 520 | } else if (num_blocks == 2) { | 375 | | // Taking max instead of 8th norm seems to work | 376 | | // better for smallest blocks up to 16x8. Jyrki couldn't get | 377 | | // improvements in trying the same for 16x16 blocks. | 378 | 0 | if (acs.covered_blocks_y() == 2) { | 379 | 0 | quant_norm16 = | 380 | 0 | std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8, y / 8 + 1)); | 381 | 0 | } else { | 382 | 0 | quant_norm16 = | 383 | 0 | std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8 + 1, y / 8)); | 384 | 0 | } | 385 | 0 | } else { | 386 | | // Load QF value, calculate empirical heuristic on masking field | 387 | | // for weighting the information loss. Information loss manifests | 388 | | // itself as ringing, and masking could hide it. | 389 | 0 | for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { | 390 | 0 | for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { | 391 | 0 | float qval = config.Quant(x / 8 + ix, y / 8 + iy); | 392 | 0 | qval *= qval; | 393 | 0 | qval *= qval; | 394 | 0 | qval *= qval; | 395 | 0 | quant_norm16 += qval * qval; | 396 | 0 | } | 397 | 0 | } | 398 | 0 | quant_norm16 /= num_blocks; | 399 | 0 | quant_norm16 = FastPowf(quant_norm16, 1.0f / 16.0f); | 400 | 0 | } | 401 | 520 | const auto quant = Set(df, quant_norm16); | 402 | | | 403 | | // Compute entropy. | 404 | 520 | float entropy = 0.0f; | 405 | 520 | const HWY_CAPPED(float, 8) df8; | 406 | | | 407 | 520 | auto loss = Zero(df8); | 408 | 2.08k | for (size_t c = 0; c < 3; c++) { | 409 | 1.56k | const float* inv_matrix = config.dequant->InvMatrix(acs.RawStrategy(), c); | 410 | 1.56k | const float* matrix = config.dequant->Matrix(acs.RawStrategy(), c); | 411 | 1.56k | const auto cmap_factor = Set(df, cmap_factors[c]); | 412 | | | 413 | 1.56k | auto entropy_v = Zero(df); | 414 | 1.56k | auto nzeros_v = Zero(df); | 415 | 14.0k | for (size_t i = 0; i < num_blocks * kDCTBlockSize; i += Lanes(df)) { | 416 | 12.4k | const auto in = Load(df, block + c * size + i); | 417 | 12.4k | const auto in_y = Mul(Load(df, block + size + i), cmap_factor); | 418 | 12.4k | const auto im = Load(df, inv_matrix + i); | 419 | 12.4k | const auto val = Mul(Sub(in, in_y), Mul(im, quant)); | 420 | 12.4k | const auto rval = Round(val); | 421 | 12.4k | const auto diff = Sub(val, rval); | 422 | 12.4k | const auto m = Load(df, matrix + i); | 423 | 12.4k | Store(Mul(m, diff), df, &mem[i]); | 424 | 12.4k | const auto q = Abs(rval); | 425 | 12.4k | const auto q_is_zero = Eq(q, Zero(df)); | 426 | | // We used to have q * C here, but that cost model seems to | 427 | | // be punishing large values more than necessary. Sqrt tries | 428 | | // to avoid large values less aggressively. | 429 | 12.4k | entropy_v = Add(Sqrt(q), entropy_v); | 430 | 12.4k | nzeros_v = Add(nzeros_v, IfThenZeroElse(q_is_zero, Set(df, 1.0f))); | 431 | 12.4k | } | 432 | | | 433 | 1.56k | { | 434 | 1.56k | auto lossc = Zero(df8); | 435 | 1.56k | TransformToPixels(acs.Strategy(), &mem[0], block, | 436 | 1.56k | acs.covered_blocks_x() * 8, scratch_space); | 437 | | | 438 | 3.12k | for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { | 439 | 3.12k | for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { | 440 | 14.0k | for (size_t dy = 0; dy < kBlockDim; ++dy) { | 441 | 24.9k | for (size_t dx = 0; dx < kBlockDim; dx += Lanes(df8)) { | 442 | 12.4k | auto in = Load(df8, block + | 443 | 12.4k | (iy * kBlockDim + dy) * | 444 | 12.4k | (acs.covered_blocks_x() * kBlockDim) + | 445 | 12.4k | ix * kBlockDim + dx); | 446 | 12.4k | auto masku = Abs(Load( | 447 | 12.4k | df8, config.MaskingPtr1x1(x + ix * 8 + dx, y + iy * 8 + dy))); | 448 | 12.4k | in = Mul(masku, in); | 449 | 12.4k | in = Mul(in, in); | 450 | 12.4k | in = Mul(in, in); | 451 | 12.4k | in = Mul(in, in); | 452 | 12.4k | lossc = Add(lossc, in); | 453 | 12.4k | } | 454 | 12.4k | } | 455 | 1.56k | } | 456 | 1.56k | } | 457 | 1.56k | static const double kChannelMul[3] = { | 458 | 1.56k | pow(10.2, 8.0), | 459 | 1.56k | pow(1.0, 8.0), | 460 | 1.56k | pow(1.03, 8.0), | 461 | 1.56k | }; | 462 | 1.56k | lossc = Mul(Set(df8, kChannelMul[c]), lossc); | 463 | 1.56k | loss = Add(loss, lossc); | 464 | 1.56k | } | 465 | 1.56k | entropy += config.cost_delta * GetLane(SumOfLanes(df, entropy_v)); | 466 | 1.56k | size_t num_nzeros = GetLane(SumOfLanes(df, nzeros_v)); | 467 | | // Add #bit of num_nonzeros, as an estimate of the cost for encoding the | 468 | | // number of non-zeros of the block. | 469 | 1.56k | size_t nbits = CeilLog2Nonzero(num_nzeros + 1) + 1; | 470 | | // Also add #bit of #bit of num_nonzeros, to estimate the ANS cost, with a | 471 | | // bias. | 472 | 1.56k | entropy += config.zeros_mul * (CeilLog2Nonzero(nbits + 17) + nbits); | 473 | 1.56k | } | 474 | 520 | float loss_scalar = | 475 | 520 | pow(GetLane(SumOfLanes(df8, loss)) / (num_blocks * kDCTBlockSize), | 476 | 520 | 1.0 / 8.0) * | 477 | 520 | (num_blocks * kDCTBlockSize) / quant_norm16; | 478 | 520 | float ret = entropy * entropy_mul; | 479 | 520 | ret += config.info_loss_multiplier * loss_scalar; | 480 | 520 | return ret; | 481 | 520 | } |
Unexecuted instantiation: jxl::N_SSE2::EstimateEntropy(jxl::AcStrategy const&, float, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, float*, float*, unsigned int*) |
482 | | |
483 | | uint8_t FindBest8x8Transform(size_t x, size_t y, int encoding_speed_tier, |
484 | | float butteraugli_target, const ACSConfig& config, |
485 | | const float* JXL_RESTRICT cmap_factors, |
486 | | AcStrategyImage* JXL_RESTRICT ac_strategy, |
487 | | float* block, float* scratch_space, |
488 | 52 | uint32_t* quantized, float* entropy_out) { |
489 | 52 | struct TransformTry8x8 { |
490 | 52 | AcStrategy::Type type; |
491 | 52 | int encoding_speed_tier_max_limit; |
492 | 52 | double entropy_mul; |
493 | 52 | }; |
494 | 52 | static const TransformTry8x8 kTransforms8x8[] = { |
495 | 52 | { |
496 | 52 | AcStrategy::Type::DCT, |
497 | 52 | 9, |
498 | 52 | 0.8, |
499 | 52 | }, |
500 | 52 | { |
501 | 52 | AcStrategy::Type::DCT4X4, |
502 | 52 | 5, |
503 | 52 | 1.08, |
504 | 52 | }, |
505 | 52 | { |
506 | 52 | AcStrategy::Type::DCT2X2, |
507 | 52 | 5, |
508 | 52 | 0.95, |
509 | 52 | }, |
510 | 52 | { |
511 | 52 | AcStrategy::Type::DCT4X8, |
512 | 52 | 4, |
513 | 52 | 0.85931637428340035, |
514 | 52 | }, |
515 | 52 | { |
516 | 52 | AcStrategy::Type::DCT8X4, |
517 | 52 | 4, |
518 | 52 | 0.85931637428340035, |
519 | 52 | }, |
520 | 52 | { |
521 | 52 | AcStrategy::Type::IDENTITY, |
522 | 52 | 5, |
523 | 52 | 1.0427542510634957, |
524 | 52 | }, |
525 | 52 | { |
526 | 52 | AcStrategy::Type::AFV0, |
527 | 52 | 4, |
528 | 52 | 0.81779489591359944, |
529 | 52 | }, |
530 | 52 | { |
531 | 52 | AcStrategy::Type::AFV1, |
532 | 52 | 4, |
533 | 52 | 0.81779489591359944, |
534 | 52 | }, |
535 | 52 | { |
536 | 52 | AcStrategy::Type::AFV2, |
537 | 52 | 4, |
538 | 52 | 0.81779489591359944, |
539 | 52 | }, |
540 | 52 | { |
541 | 52 | AcStrategy::Type::AFV3, |
542 | 52 | 4, |
543 | 52 | 0.81779489591359944, |
544 | 52 | }, |
545 | 52 | }; |
546 | 52 | double best = 1e30; |
547 | 52 | uint8_t best_tx = kTransforms8x8[0].type; |
548 | 520 | for (auto tx : kTransforms8x8) { |
549 | 520 | if (tx.encoding_speed_tier_max_limit < encoding_speed_tier) { |
550 | 0 | continue; |
551 | 0 | } |
552 | 520 | AcStrategy acs = AcStrategy::FromRawStrategy(tx.type); |
553 | 520 | float entropy_mul = tx.entropy_mul / kTransforms8x8[0].entropy_mul; |
554 | 520 | if ((tx.type == AcStrategy::Type::DCT2X2 || |
555 | 520 | tx.type == AcStrategy::Type::IDENTITY) && |
556 | 520 | butteraugli_target < 5.0) { |
557 | 104 | static const float kFavor2X2AtHighQuality = 0.4; |
558 | 104 | float weight = pow((5.0f - butteraugli_target) / 5.0f, 2.0); |
559 | 104 | entropy_mul -= kFavor2X2AtHighQuality * weight; |
560 | 104 | } |
561 | 520 | if ((tx.type != AcStrategy::Type::DCT && |
562 | 520 | tx.type != AcStrategy::Type::DCT2X2 && |
563 | 520 | tx.type != AcStrategy::Type::IDENTITY) && |
564 | 520 | butteraugli_target > 4.0) { |
565 | 0 | static const float kAvoidEntropyOfTransforms = 0.5; |
566 | 0 | float mul = 1.0; |
567 | 0 | if (butteraugli_target < 12.0) { |
568 | 0 | mul *= (12.0 - 4.0) / (butteraugli_target - 4.0); |
569 | 0 | } |
570 | 0 | entropy_mul += kAvoidEntropyOfTransforms * mul; |
571 | 0 | } |
572 | 520 | float entropy = |
573 | 520 | EstimateEntropy(acs, entropy_mul, x, y, config, cmap_factors, block, |
574 | 520 | scratch_space, quantized); |
575 | 520 | if (entropy < best) { |
576 | 104 | best_tx = tx.type; |
577 | 104 | best = entropy; |
578 | 104 | } |
579 | 520 | } |
580 | 52 | *entropy_out = best; |
581 | 52 | return best_tx; |
582 | 52 | } Unexecuted instantiation: jxl::N_SSE4::FindBest8x8Transform(unsigned long, unsigned long, int, float, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float*, float*, unsigned int*, float*) jxl::N_AVX2::FindBest8x8Transform(unsigned long, unsigned long, int, float, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float*, float*, unsigned int*, float*) Line | Count | Source | 488 | 52 | uint32_t* quantized, float* entropy_out) { | 489 | 52 | struct TransformTry8x8 { | 490 | 52 | AcStrategy::Type type; | 491 | 52 | int encoding_speed_tier_max_limit; | 492 | 52 | double entropy_mul; | 493 | 52 | }; | 494 | 52 | static const TransformTry8x8 kTransforms8x8[] = { | 495 | 52 | { | 496 | 52 | AcStrategy::Type::DCT, | 497 | 52 | 9, | 498 | 52 | 0.8, | 499 | 52 | }, | 500 | 52 | { | 501 | 52 | AcStrategy::Type::DCT4X4, | 502 | 52 | 5, | 503 | 52 | 1.08, | 504 | 52 | }, | 505 | 52 | { | 506 | 52 | AcStrategy::Type::DCT2X2, | 507 | 52 | 5, | 508 | 52 | 0.95, | 509 | 52 | }, | 510 | 52 | { | 511 | 52 | AcStrategy::Type::DCT4X8, | 512 | 52 | 4, | 513 | 52 | 0.85931637428340035, | 514 | 52 | }, | 515 | 52 | { | 516 | 52 | AcStrategy::Type::DCT8X4, | 517 | 52 | 4, | 518 | 52 | 0.85931637428340035, | 519 | 52 | }, | 520 | 52 | { | 521 | 52 | AcStrategy::Type::IDENTITY, | 522 | 52 | 5, | 523 | 52 | 1.0427542510634957, | 524 | 52 | }, | 525 | 52 | { | 526 | 52 | AcStrategy::Type::AFV0, | 527 | 52 | 4, | 528 | 52 | 0.81779489591359944, | 529 | 52 | }, | 530 | 52 | { | 531 | 52 | AcStrategy::Type::AFV1, | 532 | 52 | 4, | 533 | 52 | 0.81779489591359944, | 534 | 52 | }, | 535 | 52 | { | 536 | 52 | AcStrategy::Type::AFV2, | 537 | 52 | 4, | 538 | 52 | 0.81779489591359944, | 539 | 52 | }, | 540 | 52 | { | 541 | 52 | AcStrategy::Type::AFV3, | 542 | 52 | 4, | 543 | 52 | 0.81779489591359944, | 544 | 52 | }, | 545 | 52 | }; | 546 | 52 | double best = 1e30; | 547 | 52 | uint8_t best_tx = kTransforms8x8[0].type; | 548 | 520 | for (auto tx : kTransforms8x8) { | 549 | 520 | if (tx.encoding_speed_tier_max_limit < encoding_speed_tier) { | 550 | 0 | continue; | 551 | 0 | } | 552 | 520 | AcStrategy acs = AcStrategy::FromRawStrategy(tx.type); | 553 | 520 | float entropy_mul = tx.entropy_mul / kTransforms8x8[0].entropy_mul; | 554 | 520 | if ((tx.type == AcStrategy::Type::DCT2X2 || | 555 | 520 | tx.type == AcStrategy::Type::IDENTITY) && | 556 | 520 | butteraugli_target < 5.0) { | 557 | 104 | static const float kFavor2X2AtHighQuality = 0.4; | 558 | 104 | float weight = pow((5.0f - butteraugli_target) / 5.0f, 2.0); | 559 | 104 | entropy_mul -= kFavor2X2AtHighQuality * weight; | 560 | 104 | } | 561 | 520 | if ((tx.type != AcStrategy::Type::DCT && | 562 | 520 | tx.type != AcStrategy::Type::DCT2X2 && | 563 | 520 | tx.type != AcStrategy::Type::IDENTITY) && | 564 | 520 | butteraugli_target > 4.0) { | 565 | 0 | static const float kAvoidEntropyOfTransforms = 0.5; | 566 | 0 | float mul = 1.0; | 567 | 0 | if (butteraugli_target < 12.0) { | 568 | 0 | mul *= (12.0 - 4.0) / (butteraugli_target - 4.0); | 569 | 0 | } | 570 | 0 | entropy_mul += kAvoidEntropyOfTransforms * mul; | 571 | 0 | } | 572 | 520 | float entropy = | 573 | 520 | EstimateEntropy(acs, entropy_mul, x, y, config, cmap_factors, block, | 574 | 520 | scratch_space, quantized); | 575 | 520 | if (entropy < best) { | 576 | 104 | best_tx = tx.type; | 577 | 104 | best = entropy; | 578 | 104 | } | 579 | 520 | } | 580 | 52 | *entropy_out = best; | 581 | 52 | return best_tx; | 582 | 52 | } |
Unexecuted instantiation: jxl::N_SSE2::FindBest8x8Transform(unsigned long, unsigned long, int, float, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float*, float*, unsigned int*, float*) |
583 | | |
584 | | // bx, by addresses the 64x64 block at 8x8 subresolution |
585 | | // cx, cy addresses the left, upper 8x8 block position of the candidate |
586 | | // transform. |
587 | | void TryMergeAcs(AcStrategy::Type acs_raw, size_t bx, size_t by, size_t cx, |
588 | | size_t cy, const ACSConfig& config, |
589 | | const float* JXL_RESTRICT cmap_factors, |
590 | | AcStrategyImage* JXL_RESTRICT ac_strategy, |
591 | | const float entropy_mul, const uint8_t candidate_priority, |
592 | | uint8_t* priority, float* JXL_RESTRICT entropy_estimate, |
593 | 156 | float* block, float* scratch_space, uint32_t* quantized) { |
594 | 156 | AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw); |
595 | 156 | float entropy_current = 0; |
596 | 156 | for (size_t iy = 0; iy < acs.covered_blocks_y(); ++iy) { |
597 | 156 | for (size_t ix = 0; ix < acs.covered_blocks_x(); ++ix) { |
598 | 156 | if (priority[(cy + iy) * 8 + (cx + ix)] >= candidate_priority) { |
599 | | // Transform would reuse already allocated blocks and |
600 | | // lead to invalid overlaps, for example DCT64X32 vs. |
601 | | // DCT32X64. |
602 | 156 | return; |
603 | 156 | } |
604 | 0 | entropy_current += entropy_estimate[(cy + iy) * 8 + (cx + ix)]; |
605 | 0 | } |
606 | 156 | } |
607 | 0 | float entropy_candidate = |
608 | 0 | EstimateEntropy(acs, entropy_mul, (bx + cx) * 8, (by + cy) * 8, config, |
609 | 0 | cmap_factors, block, scratch_space, quantized); |
610 | 0 | if (entropy_candidate >= entropy_current) return; |
611 | | // Accept the candidate. |
612 | 0 | for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { |
613 | 0 | for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { |
614 | 0 | entropy_estimate[(cy + iy) * 8 + cx + ix] = 0; |
615 | 0 | priority[(cy + iy) * 8 + cx + ix] = candidate_priority; |
616 | 0 | } |
617 | 0 | } |
618 | 0 | ac_strategy->Set(bx + cx, by + cy, acs_raw); |
619 | 0 | entropy_estimate[cy * 8 + cx] = entropy_candidate; |
620 | 0 | } Unexecuted instantiation: jxl::N_SSE4::TryMergeAcs(jxl::AcStrategy::Type, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, unsigned char, unsigned char*, float*, float*, float*, unsigned int*) jxl::N_AVX2::TryMergeAcs(jxl::AcStrategy::Type, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, unsigned char, unsigned char*, float*, float*, float*, unsigned int*) Line | Count | Source | 593 | 156 | float* block, float* scratch_space, uint32_t* quantized) { | 594 | 156 | AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw); | 595 | 156 | float entropy_current = 0; | 596 | 156 | for (size_t iy = 0; iy < acs.covered_blocks_y(); ++iy) { | 597 | 156 | for (size_t ix = 0; ix < acs.covered_blocks_x(); ++ix) { | 598 | 156 | if (priority[(cy + iy) * 8 + (cx + ix)] >= candidate_priority) { | 599 | | // Transform would reuse already allocated blocks and | 600 | | // lead to invalid overlaps, for example DCT64X32 vs. | 601 | | // DCT32X64. | 602 | 156 | return; | 603 | 156 | } | 604 | 0 | entropy_current += entropy_estimate[(cy + iy) * 8 + (cx + ix)]; | 605 | 0 | } | 606 | 156 | } | 607 | 0 | float entropy_candidate = | 608 | 0 | EstimateEntropy(acs, entropy_mul, (bx + cx) * 8, (by + cy) * 8, config, | 609 | 0 | cmap_factors, block, scratch_space, quantized); | 610 | 0 | if (entropy_candidate >= entropy_current) return; | 611 | | // Accept the candidate. | 612 | 0 | for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { | 613 | 0 | for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { | 614 | 0 | entropy_estimate[(cy + iy) * 8 + cx + ix] = 0; | 615 | 0 | priority[(cy + iy) * 8 + cx + ix] = candidate_priority; | 616 | 0 | } | 617 | 0 | } | 618 | 0 | ac_strategy->Set(bx + cx, by + cy, acs_raw); | 619 | 0 | entropy_estimate[cy * 8 + cx] = entropy_candidate; | 620 | 0 | } |
Unexecuted instantiation: jxl::N_SSE2::TryMergeAcs(jxl::AcStrategy::Type, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, unsigned char, unsigned char*, float*, float*, float*, unsigned int*) |
621 | | |
622 | | static void SetEntropyForTransform(size_t cx, size_t cy, |
623 | | const AcStrategy::Type acs_raw, |
624 | | float entropy, |
625 | 0 | float* JXL_RESTRICT entropy_estimate) { |
626 | 0 | const AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw); |
627 | 0 | for (size_t dy = 0; dy < acs.covered_blocks_y(); ++dy) { |
628 | 0 | for (size_t dx = 0; dx < acs.covered_blocks_x(); ++dx) { |
629 | 0 | entropy_estimate[(cy + dy) * 8 + cx + dx] = 0.0; |
630 | 0 | } |
631 | 0 | } |
632 | 0 | entropy_estimate[cy * 8 + cx] = entropy; |
633 | 0 | } Unexecuted instantiation: enc_ac_strategy.cc:jxl::N_SSE4::SetEntropyForTransform(unsigned long, unsigned long, jxl::AcStrategy::Type, float, float*) Unexecuted instantiation: enc_ac_strategy.cc:jxl::N_AVX2::SetEntropyForTransform(unsigned long, unsigned long, jxl::AcStrategy::Type, float, float*) Unexecuted instantiation: enc_ac_strategy.cc:jxl::N_SSE2::SetEntropyForTransform(unsigned long, unsigned long, jxl::AcStrategy::Type, float, float*) |
634 | | |
635 | 0 | AcStrategy::Type AcsSquare(size_t blocks) { |
636 | 0 | if (blocks == 2) { |
637 | 0 | return AcStrategy::Type::DCT16X16; |
638 | 0 | } else if (blocks == 4) { |
639 | 0 | return AcStrategy::Type::DCT32X32; |
640 | 0 | } else { |
641 | 0 | return AcStrategy::Type::DCT64X64; |
642 | 0 | } |
643 | 0 | } Unexecuted instantiation: jxl::N_SSE4::AcsSquare(unsigned long) Unexecuted instantiation: jxl::N_AVX2::AcsSquare(unsigned long) Unexecuted instantiation: jxl::N_SSE2::AcsSquare(unsigned long) |
644 | | |
645 | 0 | AcStrategy::Type AcsVerticalSplit(size_t blocks) { |
646 | 0 | if (blocks == 2) { |
647 | 0 | return AcStrategy::Type::DCT16X8; |
648 | 0 | } else if (blocks == 4) { |
649 | 0 | return AcStrategy::Type::DCT32X16; |
650 | 0 | } else { |
651 | 0 | return AcStrategy::Type::DCT64X32; |
652 | 0 | } |
653 | 0 | } Unexecuted instantiation: jxl::N_SSE4::AcsVerticalSplit(unsigned long) Unexecuted instantiation: jxl::N_AVX2::AcsVerticalSplit(unsigned long) Unexecuted instantiation: jxl::N_SSE2::AcsVerticalSplit(unsigned long) |
654 | | |
655 | 0 | AcStrategy::Type AcsHorizontalSplit(size_t blocks) { |
656 | 0 | if (blocks == 2) { |
657 | 0 | return AcStrategy::Type::DCT8X16; |
658 | 0 | } else if (blocks == 4) { |
659 | 0 | return AcStrategy::Type::DCT16X32; |
660 | 0 | } else { |
661 | 0 | return AcStrategy::Type::DCT32X64; |
662 | 0 | } |
663 | 0 | } Unexecuted instantiation: jxl::N_SSE4::AcsHorizontalSplit(unsigned long) Unexecuted instantiation: jxl::N_AVX2::AcsHorizontalSplit(unsigned long) Unexecuted instantiation: jxl::N_SSE2::AcsHorizontalSplit(unsigned long) |
664 | | |
665 | | // The following function tries to merge smaller transforms into |
666 | | // squares and the rectangles originating from a single middle division |
667 | | // (horizontal or vertical) fairly. |
668 | | // |
669 | | // This is now generalized to concern about squares |
670 | | // of blocks X blocks size, where a block is 8x8 pixels. |
671 | | void FindBestFirstLevelDivisionForSquare( |
672 | | size_t blocks, bool allow_square_transform, size_t bx, size_t by, size_t cx, |
673 | | size_t cy, const ACSConfig& config, const float* JXL_RESTRICT cmap_factors, |
674 | | AcStrategyImage* JXL_RESTRICT ac_strategy, const float entropy_mul_JXK, |
675 | | const float entropy_mul_JXJ, float* JXL_RESTRICT entropy_estimate, |
676 | 0 | float* block, float* scratch_space, uint32_t* quantized) { |
677 | | // We denote J for the larger dimension here, and K for the smaller. |
678 | | // For example, for 32x32 block splitting, J would be 32, K 16. |
679 | 0 | const size_t blocks_half = blocks / 2; |
680 | 0 | const AcStrategy::Type acs_rawJXK = AcsVerticalSplit(blocks); |
681 | 0 | const AcStrategy::Type acs_rawKXJ = AcsHorizontalSplit(blocks); |
682 | 0 | const AcStrategy::Type acs_rawJXJ = AcsSquare(blocks); |
683 | 0 | const AcStrategy acsJXK = AcStrategy::FromRawStrategy(acs_rawJXK); |
684 | 0 | const AcStrategy acsKXJ = AcStrategy::FromRawStrategy(acs_rawKXJ); |
685 | 0 | const AcStrategy acsJXJ = AcStrategy::FromRawStrategy(acs_rawJXJ); |
686 | 0 | AcStrategyRow row0 = ac_strategy->ConstRow(by + cy + 0); |
687 | 0 | AcStrategyRow row1 = ac_strategy->ConstRow(by + cy + blocks_half); |
688 | | // Let's check if we can consider a JXJ block here at all. |
689 | | // This is not necessary in the basic use of hierarchically merging |
690 | | // blocks in the simplest possible way, but is needed when we try other |
691 | | // 'floating' options of merging, possibly after a simple hierarchical |
692 | | // merge has been explored. |
693 | 0 | if (MultiBlockTransformCrossesHorizontalBoundary(*ac_strategy, bx + cx, |
694 | 0 | by + cy, bx + cx + blocks) || |
695 | 0 | MultiBlockTransformCrossesHorizontalBoundary( |
696 | 0 | *ac_strategy, bx + cx, by + cy + blocks, bx + cx + blocks) || |
697 | 0 | MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx, by + cy, |
698 | 0 | by + cy + blocks) || |
699 | 0 | MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx + blocks, |
700 | 0 | by + cy, by + cy + blocks)) { |
701 | 0 | return; // not suitable for JxJ analysis, some transforms leak out. |
702 | 0 | } |
703 | | // For floating transforms there may be |
704 | | // already blocks selected that make either or both JXK and |
705 | | // KXJ not feasible for this location. |
706 | 0 | const bool allow_JXK = !MultiBlockTransformCrossesVerticalBoundary( |
707 | 0 | *ac_strategy, bx + cx + blocks_half, by + cy, by + cy + blocks); |
708 | 0 | const bool allow_KXJ = !MultiBlockTransformCrossesHorizontalBoundary( |
709 | 0 | *ac_strategy, bx + cx, by + cy + blocks_half, bx + cx + blocks); |
710 | | // Current entropies aggregated on NxN resolution. |
711 | 0 | float entropy[2][2] = {}; |
712 | 0 | for (size_t dy = 0; dy < blocks; ++dy) { |
713 | 0 | for (size_t dx = 0; dx < blocks; ++dx) { |
714 | 0 | entropy[dy / blocks_half][dx / blocks_half] += |
715 | 0 | entropy_estimate[(cy + dy) * 8 + (cx + dx)]; |
716 | 0 | } |
717 | 0 | } |
718 | 0 | float entropy_JXK_left = std::numeric_limits<float>::max(); |
719 | 0 | float entropy_JXK_right = std::numeric_limits<float>::max(); |
720 | 0 | float entropy_KXJ_top = std::numeric_limits<float>::max(); |
721 | 0 | float entropy_KXJ_bottom = std::numeric_limits<float>::max(); |
722 | 0 | float entropy_JXJ = std::numeric_limits<float>::max(); |
723 | 0 | if (allow_JXK) { |
724 | 0 | if (row0[bx + cx + 0].RawStrategy() != acs_rawJXK) { |
725 | 0 | entropy_JXK_left = EstimateEntropy( |
726 | 0 | acsJXK, entropy_mul_JXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config, |
727 | 0 | cmap_factors, block, scratch_space, quantized); |
728 | 0 | } |
729 | 0 | if (row0[bx + cx + blocks_half].RawStrategy() != acs_rawJXK) { |
730 | 0 | entropy_JXK_right = |
731 | 0 | EstimateEntropy(acsJXK, entropy_mul_JXK, (bx + cx + blocks_half) * 8, |
732 | 0 | (by + cy + 0) * 8, config, cmap_factors, block, |
733 | 0 | scratch_space, quantized); |
734 | 0 | } |
735 | 0 | } |
736 | 0 | if (allow_KXJ) { |
737 | 0 | if (row0[bx + cx].RawStrategy() != acs_rawKXJ) { |
738 | 0 | entropy_KXJ_top = EstimateEntropy( |
739 | 0 | acsKXJ, entropy_mul_JXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config, |
740 | 0 | cmap_factors, block, scratch_space, quantized); |
741 | 0 | } |
742 | 0 | if (row1[bx + cx].RawStrategy() != acs_rawKXJ) { |
743 | 0 | entropy_KXJ_bottom = |
744 | 0 | EstimateEntropy(acsKXJ, entropy_mul_JXK, (bx + cx + 0) * 8, |
745 | 0 | (by + cy + blocks_half) * 8, config, cmap_factors, |
746 | 0 | block, scratch_space, quantized); |
747 | 0 | } |
748 | 0 | } |
749 | 0 | if (allow_square_transform) { |
750 | | // We control the exploration of the square transform separately so that |
751 | | // we can turn it off at high decoding speeds for 32x32, but still allow |
752 | | // exploring 16x32 and 32x16. |
753 | 0 | entropy_JXJ = EstimateEntropy(acsJXJ, entropy_mul_JXJ, (bx + cx + 0) * 8, |
754 | 0 | (by + cy + 0) * 8, config, cmap_factors, |
755 | 0 | block, scratch_space, quantized); |
756 | 0 | } |
757 | | |
758 | | // Test if this block should have JXK or KXJ transforms, |
759 | | // because it can have only one or the other. |
760 | 0 | float costJxN = std::min(entropy_JXK_left, entropy[0][0] + entropy[1][0]) + |
761 | 0 | std::min(entropy_JXK_right, entropy[0][1] + entropy[1][1]); |
762 | 0 | float costNxJ = std::min(entropy_KXJ_top, entropy[0][0] + entropy[0][1]) + |
763 | 0 | std::min(entropy_KXJ_bottom, entropy[1][0] + entropy[1][1]); |
764 | 0 | if (entropy_JXJ < costJxN && entropy_JXJ < costNxJ) { |
765 | 0 | ac_strategy->Set(bx + cx, by + cy, acs_rawJXJ); |
766 | 0 | SetEntropyForTransform(cx, cy, acs_rawJXJ, entropy_JXJ, entropy_estimate); |
767 | 0 | } else if (costJxN < costNxJ) { |
768 | 0 | if (entropy_JXK_left < entropy[0][0] + entropy[1][0]) { |
769 | 0 | ac_strategy->Set(bx + cx, by + cy, acs_rawJXK); |
770 | 0 | SetEntropyForTransform(cx, cy, acs_rawJXK, entropy_JXK_left, |
771 | 0 | entropy_estimate); |
772 | 0 | } |
773 | 0 | if (entropy_JXK_right < entropy[0][1] + entropy[1][1]) { |
774 | 0 | ac_strategy->Set(bx + cx + blocks_half, by + cy, acs_rawJXK); |
775 | 0 | SetEntropyForTransform(cx + blocks_half, cy, acs_rawJXK, |
776 | 0 | entropy_JXK_right, entropy_estimate); |
777 | 0 | } |
778 | 0 | } else { |
779 | 0 | if (entropy_KXJ_top < entropy[0][0] + entropy[0][1]) { |
780 | 0 | ac_strategy->Set(bx + cx, by + cy, acs_rawKXJ); |
781 | 0 | SetEntropyForTransform(cx, cy, acs_rawKXJ, entropy_KXJ_top, |
782 | 0 | entropy_estimate); |
783 | 0 | } |
784 | 0 | if (entropy_KXJ_bottom < entropy[1][0] + entropy[1][1]) { |
785 | 0 | ac_strategy->Set(bx + cx, by + cy + blocks_half, acs_rawKXJ); |
786 | 0 | SetEntropyForTransform(cx, cy + blocks_half, acs_rawKXJ, |
787 | 0 | entropy_KXJ_bottom, entropy_estimate); |
788 | 0 | } |
789 | 0 | } |
790 | 0 | } Unexecuted instantiation: jxl::N_SSE4::FindBestFirstLevelDivisionForSquare(unsigned long, bool, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, float, float*, float*, float*, unsigned int*) Unexecuted instantiation: jxl::N_AVX2::FindBestFirstLevelDivisionForSquare(unsigned long, bool, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, float, float*, float*, float*, unsigned int*) Unexecuted instantiation: jxl::N_SSE2::FindBestFirstLevelDivisionForSquare(unsigned long, bool, unsigned long, unsigned long, unsigned long, unsigned long, jxl::ACSConfig const&, float const*, jxl::AcStrategyImage*, float, float, float*, float*, float*, unsigned int*) |
791 | | |
792 | | void ProcessRectACS(const CompressParams& cparams, const ACSConfig& config, |
793 | | const Rect& rect, const ColorCorrelationMap& cmap, |
794 | | float* JXL_RESTRICT block, uint32_t* JXL_RESTRICT quantized, |
795 | 52 | AcStrategyImage* ac_strategy) { |
796 | | // Main philosophy here: |
797 | | // 1. First find best 8x8 transform for each area. |
798 | | // 2. Merging them into larger transforms where possibly, but |
799 | | // starting from the smallest transforms (16x8 and 8x16). |
800 | | // Additional complication: 16x8 and 8x16 are considered |
801 | | // simultaneously and fairly against each other. |
802 | | // We are looking at 64x64 squares since the YtoX and YtoB |
803 | | // maps happen to be at that resolution, and having |
804 | | // integral transforms cross these boundaries leads to |
805 | | // additional complications. |
806 | 52 | const float butteraugli_target = cparams.butteraugli_distance; |
807 | 52 | float* JXL_RESTRICT scratch_space = block + 3 * AcStrategy::kMaxCoeffArea; |
808 | 52 | size_t bx = rect.x0(); |
809 | 52 | size_t by = rect.y0(); |
810 | 52 | JXL_ASSERT(rect.xsize() <= 8); |
811 | 52 | JXL_ASSERT(rect.ysize() <= 8); |
812 | 52 | size_t tx = bx / kColorTileDimInBlocks; |
813 | 52 | size_t ty = by / kColorTileDimInBlocks; |
814 | 52 | const float cmap_factors[3] = { |
815 | 52 | cmap.base().YtoXRatio(cmap.ytox_map.ConstRow(ty)[tx]), |
816 | 52 | 0.0f, |
817 | 52 | cmap.base().YtoBRatio(cmap.ytob_map.ConstRow(ty)[tx]), |
818 | 52 | }; |
819 | 52 | if (cparams.speed_tier > SpeedTier::kHare) return; |
820 | | // First compute the best 8x8 transform for each square. Later, we do not |
821 | | // experiment with different combinations, but only use the best of the 8x8s |
822 | | // when DCT8X8 is specified in the tree search. |
823 | | // 8x8 transforms have 10 variants, but every larger transform is just a DCT. |
824 | 52 | float entropy_estimate[64] = {}; |
825 | | // Favor all 8x8 transforms (against 16x8 and larger transforms)) at |
826 | | // low butteraugli_target distances. |
827 | 52 | static const float k8x8mul1 = -0.4; |
828 | 52 | static const float k8x8mul2 = 1.0; |
829 | 52 | static const float k8x8base = 1.4; |
830 | 52 | const float mul8x8 = k8x8mul2 + k8x8mul1 / (butteraugli_target + k8x8base); |
831 | 104 | for (size_t iy = 0; iy < rect.ysize(); iy++) { |
832 | 104 | for (size_t ix = 0; ix < rect.xsize(); ix++) { |
833 | 52 | float entropy = 0.0; |
834 | 52 | const uint8_t best_of_8x8s = FindBest8x8Transform( |
835 | 52 | 8 * (bx + ix), 8 * (by + iy), static_cast<int>(cparams.speed_tier), |
836 | 52 | butteraugli_target, config, cmap_factors, ac_strategy, block, |
837 | 52 | scratch_space, quantized, &entropy); |
838 | 52 | ac_strategy->Set(bx + ix, by + iy, |
839 | 52 | static_cast<AcStrategy::Type>(best_of_8x8s)); |
840 | 52 | entropy_estimate[iy * 8 + ix] = entropy * mul8x8; |
841 | 52 | } |
842 | 52 | } |
843 | | // Merge when a larger transform is better than the previously |
844 | | // searched best combination of 8x8 transforms. |
845 | 52 | struct MergeTry { |
846 | 52 | AcStrategy::Type type; |
847 | 52 | uint8_t priority; |
848 | 52 | uint8_t decoding_speed_tier_max_limit; |
849 | 52 | uint8_t encoding_speed_tier_max_limit; |
850 | 52 | float entropy_mul; |
851 | 52 | }; |
852 | | // These numbers need to be figured out manually and looking at |
853 | | // ringing next to sky etc. Optimization will find smaller numbers |
854 | | // and produce more ringing than is ideal. Larger numbers will |
855 | | // help stop ringing. |
856 | 52 | const float entropy_mul16X8 = 1.25; |
857 | 52 | const float entropy_mul16X16 = 1.35; |
858 | 52 | const float entropy_mul16X32 = 1.5; |
859 | 52 | const float entropy_mul32X32 = 1.5; |
860 | 52 | const float entropy_mul64X32 = 2.26; |
861 | 52 | const float entropy_mul64X64 = 2.26; |
862 | | // TODO(jyrki): Consider this feedback in further changes: |
863 | | // Also effectively when the multipliers for smaller blocks are |
864 | | // below 1, this raises the bar for the bigger blocks even higher |
865 | | // in that sense these constants are not independent (e.g. changing |
866 | | // the constant for DCT16x32 by -5% (making it more likely) also |
867 | | // means that DCT32x32 becomes harder to do when starting from |
868 | | // two DCT16x32s). It might be better to make them more independent, |
869 | | // e.g. by not applying the multiplier when storing the new entropy |
870 | | // estimates in TryMergeToACSCandidate(). |
871 | 52 | const MergeTry kTransformsForMerge[9] = { |
872 | 52 | {AcStrategy::Type::DCT16X8, 2, 4, 5, entropy_mul16X8}, |
873 | 52 | {AcStrategy::Type::DCT8X16, 2, 4, 5, entropy_mul16X8}, |
874 | | // FindBestFirstLevelDivisionForSquare looks for DCT16X16 and its |
875 | | // subdivisions. {AcStrategy::Type::DCT16X16, 3, entropy_mul16X16}, |
876 | 52 | {AcStrategy::Type::DCT16X32, 4, 4, 4, entropy_mul16X32}, |
877 | 52 | {AcStrategy::Type::DCT32X16, 4, 4, 4, entropy_mul16X32}, |
878 | | // FindBestFirstLevelDivisionForSquare looks for DCT32X32 and its |
879 | | // subdivisions. {AcStrategy::Type::DCT32X32, 5, 1, 5, |
880 | | // 0.9822994906548809f}, |
881 | 52 | {AcStrategy::Type::DCT64X32, 6, 1, 3, entropy_mul64X32}, |
882 | 52 | {AcStrategy::Type::DCT32X64, 6, 1, 3, entropy_mul64X32}, |
883 | | // {AcStrategy::Type::DCT64X64, 8, 1, 3, 2.0846542128012948f}, |
884 | 52 | }; |
885 | | /* |
886 | | These sizes not yet included in merge heuristic: |
887 | | set(AcStrategy::Type::DCT32X8, 0.0f, 2.261390410971102f); |
888 | | set(AcStrategy::Type::DCT8X32, 0.0f, 2.261390410971102f); |
889 | | set(AcStrategy::Type::DCT128X128, 0.0f, 1.0f); |
890 | | set(AcStrategy::Type::DCT128X64, 0.0f, 0.73f); |
891 | | set(AcStrategy::Type::DCT64X128, 0.0f, 0.73f); |
892 | | set(AcStrategy::Type::DCT256X256, 0.0f, 1.0f); |
893 | | set(AcStrategy::Type::DCT256X128, 0.0f, 0.73f); |
894 | | set(AcStrategy::Type::DCT128X256, 0.0f, 0.73f); |
895 | | */ |
896 | | |
897 | | // Priority is a tricky kludge to avoid collisions so that transforms |
898 | | // don't overlap. |
899 | 52 | uint8_t priority[64] = {}; |
900 | 52 | bool enable_32x32 = cparams.decoding_speed_tier < 4; |
901 | 468 | for (auto tx : kTransformsForMerge) { |
902 | 468 | if (tx.decoding_speed_tier_max_limit < cparams.decoding_speed_tier) { |
903 | 0 | continue; |
904 | 0 | } |
905 | 468 | AcStrategy acs = AcStrategy::FromRawStrategy(tx.type); |
906 | | |
907 | 676 | for (size_t cy = 0; cy + acs.covered_blocks_y() - 1 < rect.ysize(); |
908 | 468 | cy += acs.covered_blocks_y()) { |
909 | 364 | for (size_t cx = 0; cx + acs.covered_blocks_x() - 1 < rect.xsize(); |
910 | 208 | cx += acs.covered_blocks_x()) { |
911 | 156 | if (cy + 7 < rect.ysize() && cx + 7 < rect.xsize()) { |
912 | 0 | if (cparams.decoding_speed_tier < 4 && |
913 | 0 | tx.type == AcStrategy::Type::DCT32X64) { |
914 | | // We handle both DCT8X16 and DCT16X8 at the same time. |
915 | 0 | if ((cy | cx) % 8 == 0) { |
916 | 0 | FindBestFirstLevelDivisionForSquare( |
917 | 0 | 8, true, bx, by, cx, cy, config, cmap_factors, ac_strategy, |
918 | 0 | tx.entropy_mul, entropy_mul64X64, entropy_estimate, block, |
919 | 0 | scratch_space, quantized); |
920 | 0 | } |
921 | 0 | continue; |
922 | 0 | } else if (tx.type == AcStrategy::Type::DCT32X16) { |
923 | | // We handled both DCT8X16 and DCT16X8 at the same time, |
924 | | // and that is above. The last column and last row, |
925 | | // when the last column or last row is odd numbered, |
926 | | // are still handled by TryMergeAcs. |
927 | 0 | continue; |
928 | 0 | } |
929 | 0 | } |
930 | 156 | if ((tx.type == AcStrategy::Type::DCT16X32 && cy % 4 != 0) || |
931 | 156 | (tx.type == AcStrategy::Type::DCT32X16 && cx % 4 != 0)) { |
932 | | // already covered by FindBest32X32 |
933 | 0 | continue; |
934 | 0 | } |
935 | | |
936 | 156 | if (cy + 3 < rect.ysize() && cx + 3 < rect.xsize()) { |
937 | 0 | if (tx.type == AcStrategy::Type::DCT16X32) { |
938 | | // We handle both DCT8X16 and DCT16X8 at the same time. |
939 | 0 | if ((cy | cx) % 4 == 0) { |
940 | 0 | FindBestFirstLevelDivisionForSquare( |
941 | 0 | 4, enable_32x32, bx, by, cx, cy, config, cmap_factors, |
942 | 0 | ac_strategy, tx.entropy_mul, entropy_mul32X32, |
943 | 0 | entropy_estimate, block, scratch_space, quantized); |
944 | 0 | } |
945 | 0 | continue; |
946 | 0 | } else if (tx.type == AcStrategy::Type::DCT32X16) { |
947 | | // We handled both DCT8X16 and DCT16X8 at the same time, |
948 | | // and that is above. The last column and last row, |
949 | | // when the last column or last row is odd numbered, |
950 | | // are still handled by TryMergeAcs. |
951 | 0 | continue; |
952 | 0 | } |
953 | 0 | } |
954 | 156 | if ((tx.type == AcStrategy::Type::DCT16X32 && cy % 4 != 0) || |
955 | 156 | (tx.type == AcStrategy::Type::DCT32X16 && cx % 4 != 0)) { |
956 | | // already covered by FindBest32X32 |
957 | 0 | continue; |
958 | 0 | } |
959 | 156 | if (cy + 1 < rect.ysize() && cx + 1 < rect.xsize()) { |
960 | 0 | if (tx.type == AcStrategy::Type::DCT8X16) { |
961 | | // We handle both DCT8X16 and DCT16X8 at the same time. |
962 | 0 | if ((cy | cx) % 2 == 0) { |
963 | 0 | FindBestFirstLevelDivisionForSquare( |
964 | 0 | 2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy, |
965 | 0 | tx.entropy_mul, entropy_mul16X16, entropy_estimate, block, |
966 | 0 | scratch_space, quantized); |
967 | 0 | } |
968 | 0 | continue; |
969 | 0 | } else if (tx.type == AcStrategy::Type::DCT16X8) { |
970 | | // We handled both DCT8X16 and DCT16X8 at the same time, |
971 | | // and that is above. The last column and last row, |
972 | | // when the last column or last row is odd numbered, |
973 | | // are still handled by TryMergeAcs. |
974 | 0 | continue; |
975 | 0 | } |
976 | 0 | } |
977 | 156 | if ((tx.type == AcStrategy::Type::DCT8X16 && cy % 2 == 1) || |
978 | 156 | (tx.type == AcStrategy::Type::DCT16X8 && cx % 2 == 1)) { |
979 | | // already covered by FindBestFirstLevelDivisionForSquare |
980 | 0 | continue; |
981 | 0 | } |
982 | | // All other merge sizes are handled here. |
983 | | // Some of the DCT16X8s and DCT8X16s will still leak through here |
984 | | // when there is an odd number of 8x8 blocks, then the last row |
985 | | // and column will get their DCT16X8s and DCT8X16s through the |
986 | | // normal integral transform merging process. |
987 | 156 | TryMergeAcs(tx.type, bx, by, cx, cy, config, cmap_factors, ac_strategy, |
988 | 156 | tx.entropy_mul, tx.priority, &priority[0], entropy_estimate, |
989 | 156 | block, scratch_space, quantized); |
990 | 156 | } |
991 | 208 | } |
992 | 468 | } |
993 | 52 | if (cparams.speed_tier >= SpeedTier::kHare) { |
994 | 0 | return; |
995 | 0 | } |
996 | | // Here we still try to do some non-aligned matching, find a few more |
997 | | // 16X8, 8X16 and 16X16s between the non-2-aligned blocks. |
998 | 52 | for (size_t cy = 0; cy + 1 < rect.ysize(); ++cy) { |
999 | 0 | for (size_t cx = 0; cx + 1 < rect.xsize(); ++cx) { |
1000 | 0 | if ((cy | cx) % 2 != 0) { |
1001 | 0 | FindBestFirstLevelDivisionForSquare( |
1002 | 0 | 2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy, |
1003 | 0 | entropy_mul16X8, entropy_mul16X16, entropy_estimate, block, |
1004 | 0 | scratch_space, quantized); |
1005 | 0 | } |
1006 | 0 | } |
1007 | 0 | } |
1008 | | // Non-aligned matching for 32X32, 16X32 and 32X16. |
1009 | 52 | size_t step = cparams.speed_tier >= SpeedTier::kTortoise ? 2 : 1; |
1010 | 52 | for (size_t cy = 0; cy + 3 < rect.ysize(); cy += step) { |
1011 | 0 | for (size_t cx = 0; cx + 3 < rect.xsize(); cx += step) { |
1012 | 0 | if ((cy | cx) % 4 == 0) { |
1013 | 0 | continue; // Already tried with loop above (DCT16X32 case). |
1014 | 0 | } |
1015 | 0 | FindBestFirstLevelDivisionForSquare( |
1016 | 0 | 4, enable_32x32, bx, by, cx, cy, config, cmap_factors, ac_strategy, |
1017 | 0 | entropy_mul16X32, entropy_mul32X32, entropy_estimate, block, |
1018 | 0 | scratch_space, quantized); |
1019 | 0 | } |
1020 | 0 | } |
1021 | 52 | } Unexecuted instantiation: jxl::N_SSE4::ProcessRectACS(jxl::CompressParams const&, jxl::ACSConfig const&, jxl::RectT<unsigned long> const&, jxl::ColorCorrelationMap const&, float*, unsigned int*, jxl::AcStrategyImage*) jxl::N_AVX2::ProcessRectACS(jxl::CompressParams const&, jxl::ACSConfig const&, jxl::RectT<unsigned long> const&, jxl::ColorCorrelationMap const&, float*, unsigned int*, jxl::AcStrategyImage*) Line | Count | Source | 795 | 52 | AcStrategyImage* ac_strategy) { | 796 | | // Main philosophy here: | 797 | | // 1. First find best 8x8 transform for each area. | 798 | | // 2. Merging them into larger transforms where possibly, but | 799 | | // starting from the smallest transforms (16x8 and 8x16). | 800 | | // Additional complication: 16x8 and 8x16 are considered | 801 | | // simultaneously and fairly against each other. | 802 | | // We are looking at 64x64 squares since the YtoX and YtoB | 803 | | // maps happen to be at that resolution, and having | 804 | | // integral transforms cross these boundaries leads to | 805 | | // additional complications. | 806 | 52 | const float butteraugli_target = cparams.butteraugli_distance; | 807 | 52 | float* JXL_RESTRICT scratch_space = block + 3 * AcStrategy::kMaxCoeffArea; | 808 | 52 | size_t bx = rect.x0(); | 809 | 52 | size_t by = rect.y0(); | 810 | 52 | JXL_ASSERT(rect.xsize() <= 8); | 811 | 52 | JXL_ASSERT(rect.ysize() <= 8); | 812 | 52 | size_t tx = bx / kColorTileDimInBlocks; | 813 | 52 | size_t ty = by / kColorTileDimInBlocks; | 814 | 52 | const float cmap_factors[3] = { | 815 | 52 | cmap.base().YtoXRatio(cmap.ytox_map.ConstRow(ty)[tx]), | 816 | 52 | 0.0f, | 817 | 52 | cmap.base().YtoBRatio(cmap.ytob_map.ConstRow(ty)[tx]), | 818 | 52 | }; | 819 | 52 | if (cparams.speed_tier > SpeedTier::kHare) return; | 820 | | // First compute the best 8x8 transform for each square. Later, we do not | 821 | | // experiment with different combinations, but only use the best of the 8x8s | 822 | | // when DCT8X8 is specified in the tree search. | 823 | | // 8x8 transforms have 10 variants, but every larger transform is just a DCT. | 824 | 52 | float entropy_estimate[64] = {}; | 825 | | // Favor all 8x8 transforms (against 16x8 and larger transforms)) at | 826 | | // low butteraugli_target distances. | 827 | 52 | static const float k8x8mul1 = -0.4; | 828 | 52 | static const float k8x8mul2 = 1.0; | 829 | 52 | static const float k8x8base = 1.4; | 830 | 52 | const float mul8x8 = k8x8mul2 + k8x8mul1 / (butteraugli_target + k8x8base); | 831 | 104 | for (size_t iy = 0; iy < rect.ysize(); iy++) { | 832 | 104 | for (size_t ix = 0; ix < rect.xsize(); ix++) { | 833 | 52 | float entropy = 0.0; | 834 | 52 | const uint8_t best_of_8x8s = FindBest8x8Transform( | 835 | 52 | 8 * (bx + ix), 8 * (by + iy), static_cast<int>(cparams.speed_tier), | 836 | 52 | butteraugli_target, config, cmap_factors, ac_strategy, block, | 837 | 52 | scratch_space, quantized, &entropy); | 838 | 52 | ac_strategy->Set(bx + ix, by + iy, | 839 | 52 | static_cast<AcStrategy::Type>(best_of_8x8s)); | 840 | 52 | entropy_estimate[iy * 8 + ix] = entropy * mul8x8; | 841 | 52 | } | 842 | 52 | } | 843 | | // Merge when a larger transform is better than the previously | 844 | | // searched best combination of 8x8 transforms. | 845 | 52 | struct MergeTry { | 846 | 52 | AcStrategy::Type type; | 847 | 52 | uint8_t priority; | 848 | 52 | uint8_t decoding_speed_tier_max_limit; | 849 | 52 | uint8_t encoding_speed_tier_max_limit; | 850 | 52 | float entropy_mul; | 851 | 52 | }; | 852 | | // These numbers need to be figured out manually and looking at | 853 | | // ringing next to sky etc. Optimization will find smaller numbers | 854 | | // and produce more ringing than is ideal. Larger numbers will | 855 | | // help stop ringing. | 856 | 52 | const float entropy_mul16X8 = 1.25; | 857 | 52 | const float entropy_mul16X16 = 1.35; | 858 | 52 | const float entropy_mul16X32 = 1.5; | 859 | 52 | const float entropy_mul32X32 = 1.5; | 860 | 52 | const float entropy_mul64X32 = 2.26; | 861 | 52 | const float entropy_mul64X64 = 2.26; | 862 | | // TODO(jyrki): Consider this feedback in further changes: | 863 | | // Also effectively when the multipliers for smaller blocks are | 864 | | // below 1, this raises the bar for the bigger blocks even higher | 865 | | // in that sense these constants are not independent (e.g. changing | 866 | | // the constant for DCT16x32 by -5% (making it more likely) also | 867 | | // means that DCT32x32 becomes harder to do when starting from | 868 | | // two DCT16x32s). It might be better to make them more independent, | 869 | | // e.g. by not applying the multiplier when storing the new entropy | 870 | | // estimates in TryMergeToACSCandidate(). | 871 | 52 | const MergeTry kTransformsForMerge[9] = { | 872 | 52 | {AcStrategy::Type::DCT16X8, 2, 4, 5, entropy_mul16X8}, | 873 | 52 | {AcStrategy::Type::DCT8X16, 2, 4, 5, entropy_mul16X8}, | 874 | | // FindBestFirstLevelDivisionForSquare looks for DCT16X16 and its | 875 | | // subdivisions. {AcStrategy::Type::DCT16X16, 3, entropy_mul16X16}, | 876 | 52 | {AcStrategy::Type::DCT16X32, 4, 4, 4, entropy_mul16X32}, | 877 | 52 | {AcStrategy::Type::DCT32X16, 4, 4, 4, entropy_mul16X32}, | 878 | | // FindBestFirstLevelDivisionForSquare looks for DCT32X32 and its | 879 | | // subdivisions. {AcStrategy::Type::DCT32X32, 5, 1, 5, | 880 | | // 0.9822994906548809f}, | 881 | 52 | {AcStrategy::Type::DCT64X32, 6, 1, 3, entropy_mul64X32}, | 882 | 52 | {AcStrategy::Type::DCT32X64, 6, 1, 3, entropy_mul64X32}, | 883 | | // {AcStrategy::Type::DCT64X64, 8, 1, 3, 2.0846542128012948f}, | 884 | 52 | }; | 885 | | /* | 886 | | These sizes not yet included in merge heuristic: | 887 | | set(AcStrategy::Type::DCT32X8, 0.0f, 2.261390410971102f); | 888 | | set(AcStrategy::Type::DCT8X32, 0.0f, 2.261390410971102f); | 889 | | set(AcStrategy::Type::DCT128X128, 0.0f, 1.0f); | 890 | | set(AcStrategy::Type::DCT128X64, 0.0f, 0.73f); | 891 | | set(AcStrategy::Type::DCT64X128, 0.0f, 0.73f); | 892 | | set(AcStrategy::Type::DCT256X256, 0.0f, 1.0f); | 893 | | set(AcStrategy::Type::DCT256X128, 0.0f, 0.73f); | 894 | | set(AcStrategy::Type::DCT128X256, 0.0f, 0.73f); | 895 | | */ | 896 | | | 897 | | // Priority is a tricky kludge to avoid collisions so that transforms | 898 | | // don't overlap. | 899 | 52 | uint8_t priority[64] = {}; | 900 | 52 | bool enable_32x32 = cparams.decoding_speed_tier < 4; | 901 | 468 | for (auto tx : kTransformsForMerge) { | 902 | 468 | if (tx.decoding_speed_tier_max_limit < cparams.decoding_speed_tier) { | 903 | 0 | continue; | 904 | 0 | } | 905 | 468 | AcStrategy acs = AcStrategy::FromRawStrategy(tx.type); | 906 | | | 907 | 676 | for (size_t cy = 0; cy + acs.covered_blocks_y() - 1 < rect.ysize(); | 908 | 468 | cy += acs.covered_blocks_y()) { | 909 | 364 | for (size_t cx = 0; cx + acs.covered_blocks_x() - 1 < rect.xsize(); | 910 | 208 | cx += acs.covered_blocks_x()) { | 911 | 156 | if (cy + 7 < rect.ysize() && cx + 7 < rect.xsize()) { | 912 | 0 | if (cparams.decoding_speed_tier < 4 && | 913 | 0 | tx.type == AcStrategy::Type::DCT32X64) { | 914 | | // We handle both DCT8X16 and DCT16X8 at the same time. | 915 | 0 | if ((cy | cx) % 8 == 0) { | 916 | 0 | FindBestFirstLevelDivisionForSquare( | 917 | 0 | 8, true, bx, by, cx, cy, config, cmap_factors, ac_strategy, | 918 | 0 | tx.entropy_mul, entropy_mul64X64, entropy_estimate, block, | 919 | 0 | scratch_space, quantized); | 920 | 0 | } | 921 | 0 | continue; | 922 | 0 | } else if (tx.type == AcStrategy::Type::DCT32X16) { | 923 | | // We handled both DCT8X16 and DCT16X8 at the same time, | 924 | | // and that is above. The last column and last row, | 925 | | // when the last column or last row is odd numbered, | 926 | | // are still handled by TryMergeAcs. | 927 | 0 | continue; | 928 | 0 | } | 929 | 0 | } | 930 | 156 | if ((tx.type == AcStrategy::Type::DCT16X32 && cy % 4 != 0) || | 931 | 156 | (tx.type == AcStrategy::Type::DCT32X16 && cx % 4 != 0)) { | 932 | | // already covered by FindBest32X32 | 933 | 0 | continue; | 934 | 0 | } | 935 | | | 936 | 156 | if (cy + 3 < rect.ysize() && cx + 3 < rect.xsize()) { | 937 | 0 | if (tx.type == AcStrategy::Type::DCT16X32) { | 938 | | // We handle both DCT8X16 and DCT16X8 at the same time. | 939 | 0 | if ((cy | cx) % 4 == 0) { | 940 | 0 | FindBestFirstLevelDivisionForSquare( | 941 | 0 | 4, enable_32x32, bx, by, cx, cy, config, cmap_factors, | 942 | 0 | ac_strategy, tx.entropy_mul, entropy_mul32X32, | 943 | 0 | entropy_estimate, block, scratch_space, quantized); | 944 | 0 | } | 945 | 0 | continue; | 946 | 0 | } else if (tx.type == AcStrategy::Type::DCT32X16) { | 947 | | // We handled both DCT8X16 and DCT16X8 at the same time, | 948 | | // and that is above. The last column and last row, | 949 | | // when the last column or last row is odd numbered, | 950 | | // are still handled by TryMergeAcs. | 951 | 0 | continue; | 952 | 0 | } | 953 | 0 | } | 954 | 156 | if ((tx.type == AcStrategy::Type::DCT16X32 && cy % 4 != 0) || | 955 | 156 | (tx.type == AcStrategy::Type::DCT32X16 && cx % 4 != 0)) { | 956 | | // already covered by FindBest32X32 | 957 | 0 | continue; | 958 | 0 | } | 959 | 156 | if (cy + 1 < rect.ysize() && cx + 1 < rect.xsize()) { | 960 | 0 | if (tx.type == AcStrategy::Type::DCT8X16) { | 961 | | // We handle both DCT8X16 and DCT16X8 at the same time. | 962 | 0 | if ((cy | cx) % 2 == 0) { | 963 | 0 | FindBestFirstLevelDivisionForSquare( | 964 | 0 | 2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy, | 965 | 0 | tx.entropy_mul, entropy_mul16X16, entropy_estimate, block, | 966 | 0 | scratch_space, quantized); | 967 | 0 | } | 968 | 0 | continue; | 969 | 0 | } else if (tx.type == AcStrategy::Type::DCT16X8) { | 970 | | // We handled both DCT8X16 and DCT16X8 at the same time, | 971 | | // and that is above. The last column and last row, | 972 | | // when the last column or last row is odd numbered, | 973 | | // are still handled by TryMergeAcs. | 974 | 0 | continue; | 975 | 0 | } | 976 | 0 | } | 977 | 156 | if ((tx.type == AcStrategy::Type::DCT8X16 && cy % 2 == 1) || | 978 | 156 | (tx.type == AcStrategy::Type::DCT16X8 && cx % 2 == 1)) { | 979 | | // already covered by FindBestFirstLevelDivisionForSquare | 980 | 0 | continue; | 981 | 0 | } | 982 | | // All other merge sizes are handled here. | 983 | | // Some of the DCT16X8s and DCT8X16s will still leak through here | 984 | | // when there is an odd number of 8x8 blocks, then the last row | 985 | | // and column will get their DCT16X8s and DCT8X16s through the | 986 | | // normal integral transform merging process. | 987 | 156 | TryMergeAcs(tx.type, bx, by, cx, cy, config, cmap_factors, ac_strategy, | 988 | 156 | tx.entropy_mul, tx.priority, &priority[0], entropy_estimate, | 989 | 156 | block, scratch_space, quantized); | 990 | 156 | } | 991 | 208 | } | 992 | 468 | } | 993 | 52 | if (cparams.speed_tier >= SpeedTier::kHare) { | 994 | 0 | return; | 995 | 0 | } | 996 | | // Here we still try to do some non-aligned matching, find a few more | 997 | | // 16X8, 8X16 and 16X16s between the non-2-aligned blocks. | 998 | 52 | for (size_t cy = 0; cy + 1 < rect.ysize(); ++cy) { | 999 | 0 | for (size_t cx = 0; cx + 1 < rect.xsize(); ++cx) { | 1000 | 0 | if ((cy | cx) % 2 != 0) { | 1001 | 0 | FindBestFirstLevelDivisionForSquare( | 1002 | 0 | 2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy, | 1003 | 0 | entropy_mul16X8, entropy_mul16X16, entropy_estimate, block, | 1004 | 0 | scratch_space, quantized); | 1005 | 0 | } | 1006 | 0 | } | 1007 | 0 | } | 1008 | | // Non-aligned matching for 32X32, 16X32 and 32X16. | 1009 | 52 | size_t step = cparams.speed_tier >= SpeedTier::kTortoise ? 2 : 1; | 1010 | 52 | for (size_t cy = 0; cy + 3 < rect.ysize(); cy += step) { | 1011 | 0 | for (size_t cx = 0; cx + 3 < rect.xsize(); cx += step) { | 1012 | 0 | if ((cy | cx) % 4 == 0) { | 1013 | 0 | continue; // Already tried with loop above (DCT16X32 case). | 1014 | 0 | } | 1015 | 0 | FindBestFirstLevelDivisionForSquare( | 1016 | 0 | 4, enable_32x32, bx, by, cx, cy, config, cmap_factors, ac_strategy, | 1017 | 0 | entropy_mul16X32, entropy_mul32X32, entropy_estimate, block, | 1018 | 0 | scratch_space, quantized); | 1019 | 0 | } | 1020 | 0 | } | 1021 | 52 | } |
Unexecuted instantiation: jxl::N_SSE2::ProcessRectACS(jxl::CompressParams const&, jxl::ACSConfig const&, jxl::RectT<unsigned long> const&, jxl::ColorCorrelationMap const&, float*, unsigned int*, jxl::AcStrategyImage*) |
1022 | | |
1023 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
1024 | | } // namespace HWY_NAMESPACE |
1025 | | } // namespace jxl |
1026 | | HWY_AFTER_NAMESPACE(); |
1027 | | |
1028 | | #if HWY_ONCE |
1029 | | namespace jxl { |
1030 | | HWY_EXPORT(ProcessRectACS); |
1031 | | |
1032 | | void AcStrategyHeuristics::Init(const Image3F& src, const Rect& rect_in, |
1033 | | const ImageF& quant_field, const ImageF& mask, |
1034 | | const ImageF& mask1x1, |
1035 | 52 | DequantMatrices* matrices) { |
1036 | 52 | config.dequant = matrices; |
1037 | | |
1038 | 52 | if (cparams.speed_tier >= SpeedTier::kCheetah) { |
1039 | 0 | JXL_CHECK(matrices->EnsureComputed(1)); // DCT8 only |
1040 | 52 | } else { |
1041 | 52 | uint32_t acs_mask = 0; |
1042 | | // All transforms up to 64x64. |
1043 | 1.14k | for (size_t i = 0; i < AcStrategy::DCT128X128; i++) { |
1044 | 1.09k | acs_mask |= (1 << i); |
1045 | 1.09k | } |
1046 | 52 | JXL_CHECK(matrices->EnsureComputed(acs_mask)); |
1047 | 52 | } |
1048 | | |
1049 | | // Image row pointers and strides. |
1050 | 52 | config.quant_field_row = quant_field.Row(0); |
1051 | 52 | config.quant_field_stride = quant_field.PixelsPerRow(); |
1052 | 52 | if (mask.xsize() > 0 && mask.ysize() > 0) { |
1053 | 52 | config.masking_field_row = mask.Row(0); |
1054 | 52 | config.masking_field_stride = mask.PixelsPerRow(); |
1055 | 52 | } |
1056 | 52 | if (mask1x1.xsize() > 0 && mask1x1.ysize() > 0) { |
1057 | 52 | config.masking1x1_field_row = mask1x1.Row(0); |
1058 | 52 | config.masking1x1_field_stride = mask1x1.PixelsPerRow(); |
1059 | 52 | } |
1060 | | |
1061 | 52 | config.src_rows[0] = rect_in.ConstPlaneRow(src, 0, 0); |
1062 | 52 | config.src_rows[1] = rect_in.ConstPlaneRow(src, 1, 0); |
1063 | 52 | config.src_rows[2] = rect_in.ConstPlaneRow(src, 2, 0); |
1064 | 52 | config.src_stride = src.PixelsPerRow(); |
1065 | | |
1066 | | // Entropy estimate is composed of two factors: |
1067 | | // - estimate of the number of bits that will be used by the block |
1068 | | // - information loss due to quantization |
1069 | | // The following constant controls the relative weights of these components. |
1070 | 52 | config.info_loss_multiplier = 1.2; |
1071 | 52 | config.zeros_mul = 9.3089059022677905; |
1072 | 52 | config.cost_delta = 10.833273317067883; |
1073 | | |
1074 | 52 | static const float kBias = 0.13731742964354549; |
1075 | 52 | const float ratio = (cparams.butteraugli_distance + kBias) / (1.0f + kBias); |
1076 | | |
1077 | 52 | static const float kPow1 = 0.33677806662454718; |
1078 | 52 | static const float kPow2 = 0.50990926717963703; |
1079 | 52 | static const float kPow3 = 0.36702940662370243; |
1080 | 52 | config.info_loss_multiplier *= std::pow(ratio, kPow1); |
1081 | 52 | config.zeros_mul *= std::pow(ratio, kPow2); |
1082 | 52 | config.cost_delta *= std::pow(ratio, kPow3); |
1083 | 52 | } |
1084 | | |
1085 | 52 | void AcStrategyHeuristics::PrepareForThreads(std::size_t num_threads) { |
1086 | 52 | const size_t dct_scratch_size = |
1087 | 52 | 3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim; |
1088 | 52 | mem_per_thread = 6 * AcStrategy::kMaxCoeffArea + dct_scratch_size; |
1089 | 52 | mem = hwy::AllocateAligned<float>(num_threads * mem_per_thread); |
1090 | 52 | qmem_per_thread = AcStrategy::kMaxCoeffArea; |
1091 | 52 | qmem = hwy::AllocateAligned<uint32_t>(num_threads * qmem_per_thread); |
1092 | 52 | } |
1093 | | |
1094 | | void AcStrategyHeuristics::ProcessRect(const Rect& rect, |
1095 | | const ColorCorrelationMap& cmap, |
1096 | | AcStrategyImage* ac_strategy, |
1097 | 52 | size_t thread) { |
1098 | | // In Falcon mode, use DCT8 everywhere and uniform quantization. |
1099 | 52 | if (cparams.speed_tier >= SpeedTier::kCheetah) { |
1100 | 0 | ac_strategy->FillDCT8(rect); |
1101 | 0 | return; |
1102 | 0 | } |
1103 | 52 | HWY_DYNAMIC_DISPATCH(ProcessRectACS) |
1104 | 52 | (cparams, config, rect, cmap, mem.get() + thread * mem_per_thread, |
1105 | 52 | qmem.get() + thread * qmem_per_thread, ac_strategy); |
1106 | 52 | } |
1107 | | |
1108 | | Status AcStrategyHeuristics::Finalize(const FrameDimensions& frame_dim, |
1109 | | const AcStrategyImage& ac_strategy, |
1110 | 52 | AuxOut* aux_out) { |
1111 | | // Accounting and debug output. |
1112 | 52 | if (aux_out != nullptr) { |
1113 | 0 | aux_out->num_small_blocks = |
1114 | 0 | ac_strategy.CountBlocks(AcStrategy::Type::IDENTITY) + |
1115 | 0 | ac_strategy.CountBlocks(AcStrategy::Type::DCT2X2) + |
1116 | 0 | ac_strategy.CountBlocks(AcStrategy::Type::DCT4X4); |
1117 | 0 | aux_out->num_dct4x8_blocks = |
1118 | 0 | ac_strategy.CountBlocks(AcStrategy::Type::DCT4X8) + |
1119 | 0 | ac_strategy.CountBlocks(AcStrategy::Type::DCT8X4); |
1120 | 0 | aux_out->num_afv_blocks = ac_strategy.CountBlocks(AcStrategy::Type::AFV0) + |
1121 | 0 | ac_strategy.CountBlocks(AcStrategy::Type::AFV1) + |
1122 | 0 | ac_strategy.CountBlocks(AcStrategy::Type::AFV2) + |
1123 | 0 | ac_strategy.CountBlocks(AcStrategy::Type::AFV3); |
1124 | 0 | aux_out->num_dct8_blocks = ac_strategy.CountBlocks(AcStrategy::Type::DCT); |
1125 | 0 | aux_out->num_dct8x16_blocks = |
1126 | 0 | ac_strategy.CountBlocks(AcStrategy::Type::DCT8X16) + |
1127 | 0 | ac_strategy.CountBlocks(AcStrategy::Type::DCT16X8); |
1128 | 0 | aux_out->num_dct8x32_blocks = |
1129 | 0 | ac_strategy.CountBlocks(AcStrategy::Type::DCT8X32) + |
1130 | 0 | ac_strategy.CountBlocks(AcStrategy::Type::DCT32X8); |
1131 | 0 | aux_out->num_dct16_blocks = |
1132 | 0 | ac_strategy.CountBlocks(AcStrategy::Type::DCT16X16); |
1133 | 0 | aux_out->num_dct16x32_blocks = |
1134 | 0 | ac_strategy.CountBlocks(AcStrategy::Type::DCT16X32) + |
1135 | 0 | ac_strategy.CountBlocks(AcStrategy::Type::DCT32X16); |
1136 | 0 | aux_out->num_dct32_blocks = |
1137 | 0 | ac_strategy.CountBlocks(AcStrategy::Type::DCT32X32); |
1138 | 0 | aux_out->num_dct32x64_blocks = |
1139 | 0 | ac_strategy.CountBlocks(AcStrategy::Type::DCT32X64) + |
1140 | 0 | ac_strategy.CountBlocks(AcStrategy::Type::DCT64X32); |
1141 | 0 | aux_out->num_dct64_blocks = |
1142 | 0 | ac_strategy.CountBlocks(AcStrategy::Type::DCT64X64); |
1143 | 0 | } |
1144 | | |
1145 | | // if (JXL_DEBUG_AC_STRATEGY && WantDebugOutput(aux_out)) { |
1146 | 52 | if (JXL_DEBUG_AC_STRATEGY && WantDebugOutput(cparams)) { |
1147 | 0 | JXL_RETURN_IF_ERROR(DumpAcStrategy(ac_strategy, frame_dim.xsize, |
1148 | 0 | frame_dim.ysize, "ac_strategy", aux_out, |
1149 | 0 | cparams)); |
1150 | 0 | } |
1151 | 52 | return true; |
1152 | 52 | } |
1153 | | |
1154 | | } // namespace jxl |
1155 | | #endif // HWY_ONCE |