/src/libjxl/lib/jxl/enc_coeff_order.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include <jxl/memory_manager.h> |
7 | | |
8 | | #include <algorithm> |
9 | | #include <cmath> |
10 | | #include <cstddef> |
11 | | #include <cstdint> |
12 | | #include <cstring> |
13 | | #include <limits> |
14 | | #include <utility> |
15 | | #include <vector> |
16 | | |
17 | | #include "lib/jxl/ac_strategy.h" |
18 | | #include "lib/jxl/base/compiler_specific.h" |
19 | | #include "lib/jxl/base/rect.h" |
20 | | #include "lib/jxl/base/status.h" |
21 | | #include "lib/jxl/coeff_order.h" |
22 | | #include "lib/jxl/coeff_order_fwd.h" |
23 | | #include "lib/jxl/common.h" |
24 | | #include "lib/jxl/dct_util.h" |
25 | | #include "lib/jxl/enc_ans.h" |
26 | | #include "lib/jxl/enc_ans_params.h" |
27 | | #include "lib/jxl/enc_bit_writer.h" |
28 | | #include "lib/jxl/frame_dimensions.h" |
29 | | #include "lib/jxl/lehmer_code.h" |
30 | | #include "lib/jxl/memory_manager_internal.h" |
31 | | |
32 | | namespace jxl { |
33 | | |
34 | | struct AuxOut; |
35 | | enum class LayerType : uint8_t; |
36 | | |
37 | | std::pair<uint32_t, uint32_t> ComputeUsedOrders( |
38 | | const SpeedTier speed, const AcStrategyImage& ac_strategy, |
39 | 2.13k | const Rect& rect) { |
40 | | // No coefficient reordering in Falcon or faster. |
41 | | // Only uses DCT8 = 0, so bitfield = 1. |
42 | 2.13k | if (speed >= SpeedTier::kFalcon) return {1, 1}; |
43 | | |
44 | 2.13k | uint32_t ret = 0; |
45 | 2.13k | uint32_t ret_customize = 0; |
46 | 2.13k | size_t xsize_blocks = rect.xsize(); |
47 | 2.13k | size_t ysize_blocks = rect.ysize(); |
48 | | // TODO(veluca): precompute when doing DCT. |
49 | 65.7k | for (size_t by = 0; by < ysize_blocks; ++by) { |
50 | 63.6k | AcStrategyRow acs_row = ac_strategy.ConstRow(rect, by); |
51 | 2.96M | for (size_t bx = 0; bx < xsize_blocks; ++bx) { |
52 | 2.89M | int ord = kStrategyOrder[acs_row[bx].RawStrategy()]; |
53 | | // Do not customize coefficient orders for blocks bigger than 32x32. |
54 | 2.89M | ret |= 1u << ord; |
55 | 2.89M | if (ord > 6) { |
56 | 546k | continue; |
57 | 546k | } |
58 | 2.35M | ret_customize |= 1u << ord; |
59 | 2.35M | } |
60 | 63.6k | } |
61 | | // Use default orders for small images. |
62 | 2.13k | if (ac_strategy.xsize() < 5 && ac_strategy.ysize() < 5) return {ret, 0}; |
63 | 1.96k | return {ret, ret_customize}; |
64 | 2.13k | } |
65 | | |
// Computes, for each coefficient-order class newly used by this frame, a
// custom coefficient order that sorts DCT bands by (quantized) zero count so
// that mostly-zero bands come last.
//   all_used_orders  (in/out) bitfield of classes with a committed custom
//                    order; extended with the classes this call keeps.
//   prev_used_acs    classes already used by previous frames (skipped here).
//   current_used_acs classes used by the current frame.
//   current_used_orders bitfield of classes allowed to be customized; bits
//                    are cleared below when the computed order turns out to
//                    equal the natural one.
//   order            output table, indexed via CoeffOrderOffset(ord, c).
Status ComputeCoeffOrder(SpeedTier speed, const ACImage& ac_image,
                         const AcStrategyImage& ac_strategy,
                         const FrameDimensions& frame_dim,
                         uint32_t& all_used_orders, uint32_t prev_used_acs,
                         uint32_t current_used_acs,
                         uint32_t current_used_orders,
                         coeff_order_t* JXL_RESTRICT order) {
  JxlMemoryManager* memory_manager = ac_strategy.memory_manager();
  // Per-band zero-coefficient counters; -1 entries are forced first in the
  // sorted order (see the LLF loop below).
  std::vector<int64_t> num_zeros(kCoeffOrderMaxSize);
  // If compressing at high speed and only using 8x8 DCTs, only consider a
  // subset of blocks.
  double block_fraction = 1.0f;
  // TODO(veluca): figure out why sampling blocks if non-8x8s are used makes
  // encoding significantly less dense.
  if (speed >= SpeedTier::kSquirrel && current_used_orders == 1) {
    block_fraction = 0.5f;
  }
  // No need to compute number of zero coefficients if all orders are the
  // default.
  if (current_used_orders != 0) {
    // Acceptance threshold on the PRNG's top 32 bits; block_fraction == 1.0
    // accepts every block.
    uint64_t threshold =
        (std::numeric_limits<uint64_t>::max() >> 32) * block_fraction;
    // Fixed seeds -> deterministic sampling, independent of thread order.
    uint64_t s[2] = {static_cast<uint64_t>(0x94D049BB133111EBull),
                     static_cast<uint64_t>(0xBF58476D1CE4E5B9ull)};
    // Xorshift128+ adapted from xorshift128+-inl.h
    auto use_sample = [&]() {
      auto s1 = s[0];
      const auto s0 = s[1];
      const auto bits = s1 + s0;  // b, c
      s[0] = s0;
      s1 ^= s1 << 23;
      s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5);
      s[1] = s1;
      return (bits >> 32) <= threshold;
    };

    // Count number of zero coefficients, separately for each DCT band.
    // TODO(veluca): precompute when doing DCT.
    for (size_t group_index = 0; group_index < frame_dim.num_groups;
         group_index++) {
      const size_t gx = group_index % frame_dim.xsize_groups;
      const size_t gy = group_index / frame_dim.xsize_groups;
      // Group rect in block units, clamped to the frame.
      const Rect rect(gx * kGroupDimInBlocks, gy * kGroupDimInBlocks,
                      kGroupDimInBlocks, kGroupDimInBlocks,
                      frame_dim.xsize_blocks, frame_dim.ysize_blocks);
      ConstACPtr rows[3];
      ACType type = ac_image.Type();
      for (size_t c = 0; c < 3; c++) {
        rows[c] = ac_image.PlaneRow(c, group_index, 0);
      }
      // Running offset into the packed per-group coefficient stream; must be
      // advanced for every first-block, even unsampled ones (see below).
      size_t ac_offset = 0;

      // TODO(veluca): SIMDfy.
      for (size_t by = 0; by < rect.ysize(); ++by) {
        AcStrategyRow acs_row = ac_strategy.ConstRow(rect, by);
        for (size_t bx = 0; bx < rect.xsize(); ++bx) {
          AcStrategy acs = acs_row[bx];
          if (!acs.IsFirstBlock()) continue;
          if (!use_sample()) continue;
          size_t size = kDCTBlockSize << acs.log2_covered_blocks();
          for (size_t c = 0; c < 3; ++c) {
            const size_t order_offset =
                CoeffOrderOffset(kStrategyOrder[acs.RawStrategy()], c);
            if (type == ACType::k16) {
              for (size_t k = 0; k < size; k++) {
                bool is_zero = rows[c].ptr16[ac_offset + k] == 0;
                num_zeros[order_offset + k] += is_zero ? 1 : 0;
              }
            } else {
              for (size_t k = 0; k < size; k++) {
                bool is_zero = rows[c].ptr32[ac_offset + k] == 0;
                num_zeros[order_offset + k] += is_zero ? 1 : 0;
              }
            }
            // Ensure LLFs are first in the order.
            size_t cx = acs.covered_blocks_x();
            size_t cy = acs.covered_blocks_y();
            CoefficientLayout(&cy, &cx);
            for (size_t iy = 0; iy < cy; iy++) {
              for (size_t ix = 0; ix < cx; ix++) {
                num_zeros[order_offset + iy * kBlockDim * cx + ix] = -1;
              }
            }
          }
          // NOTE(review): ac_offset advances only for sampled blocks here;
          // presumably consistent because unsampled blocks are skipped in
          // both the read and the offset bump -- confirm against PlaneRow's
          // packing if touching this.
          ac_offset += size;
        }
      }
    }
  }
  struct PosAndCount {
    uint32_t pos;
    // Saving index breaks the ties for non-stable sort
    uint64_t count_and_idx;
  };
  size_t mem_bytes = AcStrategy::kMaxCoeffArea * sizeof(PosAndCount);
  JXL_ASSIGN_OR_RETURN(auto mem,
                       AlignedMemory::Create(memory_manager, mem_bytes));

  std::vector<coeff_order_t> natural_order_buffer;

  // One pass per distinct order class; `computed` avoids revisiting a class
  // shared by several raw strategies.
  uint16_t computed = 0;
  for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) {
    uint8_t ord = kStrategyOrder[o];
    if (computed & (1 << ord)) continue;
    computed |= 1 << ord;
    AcStrategy acs = AcStrategy::FromRawStrategy(o);
    size_t sz = kDCTBlockSize * acs.covered_blocks_x() * acs.covered_blocks_y();
    // Expected maximal size is 256 x 256.
    JXL_DASSERT(sz <= (1 << 16));

    // Do nothing for transforms that don't appear.
    if ((1 << ord) & ~current_used_acs) continue;

    // Do nothing if we already committed to this custom order previously.
    if ((1 << ord) & prev_used_acs) continue;
    if ((1 << ord) & all_used_orders) continue;

    if (natural_order_buffer.size() < sz) natural_order_buffer.resize(sz);
    acs.ComputeNaturalCoeffOrder(natural_order_buffer.data());

    // Ensure natural coefficient order is not permuted if the order is
    // not transmitted.
    if ((1 << ord) & ~current_used_orders) {
      for (size_t c = 0; c < 3; c++) {
        size_t offset = CoeffOrderOffset(ord, c);
        JXL_ENSURE(CoeffOrderOffset(ord, c + 1) - offset == sz);
        memcpy(&order[offset], natural_order_buffer.data(),
               sz * sizeof(*order));
      }
      continue;
    }

    bool is_nondefault = false;
    for (uint8_t c = 0; c < 3; c++) {
      // Apply zig-zag order.
      PosAndCount* pos_and_val = mem.address<PosAndCount>();
      size_t offset = CoeffOrderOffset(ord, c);
      JXL_ENSURE(CoeffOrderOffset(ord, c + 1) - offset == sz);
      float inv_sqrt_sz = 1.0f / std::sqrt(sz);
      for (size_t i = 0; i < sz; ++i) {
        size_t pos = natural_order_buffer[i];
        pos_and_val[i].pos = pos;
        // We don't care for the exact number -> quantize number of zeros,
        // to get less permuted order.
        uint64_t count = num_zeros[offset + pos] * inv_sqrt_sz + 0.1f;
        // Worst case: all dct8x8, all zeroes: count <= nb_pixels/64/8
        // nb_pixels is limited to 2^40 (Level 10 limit)
        // so count is limited to 2^31
        JXL_DASSERT(count < (uint64_t{1} << 48));
        // Pack (count, natural-order index) into one key: sorting by the key
        // sorts by count, with index as tie-breaker.
        pos_and_val[i].count_and_idx = (count << 16) | i;
      }

      // Stable-sort -> elements with same number of zeros will preserve their
      // order.
      auto comparator = [](const PosAndCount& a, const PosAndCount& b) -> bool {
        return a.count_and_idx < b.count_and_idx;
      };
      std::sort(pos_and_val, pos_and_val + sz, comparator);

      // Grab indices.
      for (size_t i = 0; i < sz; ++i) {
        order[offset + i] = pos_and_val[i].pos;
        is_nondefault |= natural_order_buffer[i] != pos_and_val[i].pos;
      }
    }
    // Sorting left the natural order intact for all 3 channels: no need to
    // transmit this class after all.
    if (!is_nondefault) {
      current_used_orders &= ~(1 << ord);
    }
  }
  all_used_orders |= current_used_orders;
  return true;
}
238 | | |
239 | | namespace { |
240 | | |
241 | | Status TokenizePermutation(const coeff_order_t* JXL_RESTRICT order, size_t skip, |
242 | 12.7k | size_t size, std::vector<Token>* tokens) { |
243 | 12.7k | std::vector<LehmerT> lehmer(size); |
244 | 12.7k | std::vector<uint32_t> temp(size + 1); |
245 | 12.7k | JXL_RETURN_IF_ERROR( |
246 | 12.7k | ComputeLehmerCode(order, temp.data(), size, lehmer.data())); |
247 | 12.7k | size_t end = size; |
248 | 1.37M | while (end > skip && lehmer[end - 1] == 0) { |
249 | 1.36M | --end; |
250 | 1.36M | } |
251 | 12.7k | tokens->emplace_back(CoeffOrderContext(size), end - skip); |
252 | 12.7k | uint32_t last = 0; |
253 | 469k | for (size_t i = skip; i < end; ++i) { |
254 | 456k | tokens->emplace_back(CoeffOrderContext(last), lehmer[i]); |
255 | 456k | last = lehmer[i]; |
256 | 456k | } |
257 | 12.7k | return true; |
258 | 12.7k | } |
259 | | |
260 | | } // namespace |
261 | | |
262 | | Status EncodePermutation(const coeff_order_t* JXL_RESTRICT order, size_t skip, |
263 | | size_t size, BitWriter* writer, LayerType layer, |
264 | 0 | AuxOut* aux_out) { |
265 | 0 | JxlMemoryManager* memory_manager = writer->memory_manager(); |
266 | 0 | std::vector<std::vector<Token>> tokens(1); |
267 | 0 | JXL_RETURN_IF_ERROR(TokenizePermutation(order, skip, size, tokens.data())); |
268 | 0 | EntropyEncodingData codes; |
269 | 0 | JXL_ASSIGN_OR_RETURN( |
270 | 0 | size_t cost, BuildAndEncodeHistograms(memory_manager, HistogramParams(), |
271 | 0 | kPermutationContexts, tokens, |
272 | 0 | &codes, writer, layer, aux_out)); |
273 | 0 | (void)cost; |
274 | 0 | JXL_RETURN_IF_ERROR(WriteTokens(tokens[0], codes, 0, writer, layer, aux_out)); |
275 | 0 | return true; |
276 | 0 | } |
277 | | |
278 | | namespace { |
279 | | Status EncodeCoeffOrder(const coeff_order_t* JXL_RESTRICT order, AcStrategy acs, |
280 | | std::vector<Token>* tokens, coeff_order_t* order_zigzag, |
281 | 12.7k | std::vector<coeff_order_t>& natural_order_lut) { |
282 | 12.7k | const size_t llf = acs.covered_blocks_x() * acs.covered_blocks_y(); |
283 | 12.7k | const size_t size = kDCTBlockSize * llf; |
284 | 1.86M | for (size_t i = 0; i < size; ++i) { |
285 | 1.84M | order_zigzag[i] = natural_order_lut[order[i]]; |
286 | 1.84M | } |
287 | 12.7k | JXL_RETURN_IF_ERROR(TokenizePermutation(order_zigzag, llf, size, tokens)); |
288 | 12.7k | return true; |
289 | 12.7k | } |
290 | | } // namespace |
291 | | |
292 | | Status EncodeCoeffOrders(uint16_t used_orders, |
293 | | const coeff_order_t* JXL_RESTRICT order, |
294 | | BitWriter* writer, LayerType layer, |
295 | 2.13k | AuxOut* JXL_RESTRICT aux_out) { |
296 | 2.13k | JxlMemoryManager* memory_manager = writer->memory_manager(); |
297 | 2.13k | size_t mem_bytes = AcStrategy::kMaxCoeffArea * sizeof(coeff_order_t); |
298 | 2.13k | JXL_ASSIGN_OR_RETURN(auto mem, |
299 | 2.13k | AlignedMemory::Create(memory_manager, mem_bytes)); |
300 | 2.13k | uint16_t computed = 0; |
301 | 2.13k | std::vector<std::vector<Token>> tokens(1); |
302 | 2.13k | std::vector<coeff_order_t> natural_order_lut; |
303 | 59.6k | for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) { |
304 | 57.5k | uint8_t ord = kStrategyOrder[o]; |
305 | 57.5k | if (computed & (1 << ord)) continue; |
306 | 27.7k | computed |= 1 << ord; |
307 | 27.7k | if ((used_orders & (1 << ord)) == 0) continue; |
308 | 4.24k | AcStrategy acs = AcStrategy::FromRawStrategy(o); |
309 | 4.24k | const size_t llf = acs.covered_blocks_x() * acs.covered_blocks_y(); |
310 | 4.24k | const size_t size = kDCTBlockSize * llf; |
311 | 4.24k | if (natural_order_lut.size() < size) natural_order_lut.resize(size); |
312 | 4.24k | acs.ComputeNaturalCoeffOrderLut(natural_order_lut.data()); |
313 | 16.9k | for (size_t c = 0; c < 3; c++) { |
314 | 12.7k | JXL_RETURN_IF_ERROR( |
315 | 12.7k | EncodeCoeffOrder(&order[CoeffOrderOffset(ord, c)], acs, tokens.data(), |
316 | 12.7k | mem.address<coeff_order_t>(), natural_order_lut)); |
317 | 12.7k | } |
318 | 4.24k | } |
319 | | // Do not write anything if no order is used. |
320 | 2.13k | if (used_orders != 0) { |
321 | 1.49k | EntropyEncodingData codes; |
322 | 1.49k | JXL_ASSIGN_OR_RETURN( |
323 | 1.49k | size_t cost, BuildAndEncodeHistograms(memory_manager, HistogramParams(), |
324 | 1.49k | kPermutationContexts, tokens, |
325 | 1.49k | &codes, writer, layer, aux_out)); |
326 | 1.49k | (void)cost; |
327 | 1.49k | JXL_RETURN_IF_ERROR( |
328 | 1.49k | WriteTokens(tokens[0], codes, 0, writer, layer, aux_out)); |
329 | 1.49k | } |
330 | 2.13k | return true; |
331 | 2.13k | } |
332 | | |
333 | | } // namespace jxl |