/src/libjxl/lib/jxl/enc_coeff_order.cc
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include <jxl/memory_manager.h>

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <limits>
#include <utility>
#include <vector>

#include "lib/jxl/ac_strategy.h"
#include "lib/jxl/base/compiler_specific.h"
#include "lib/jxl/base/rect.h"
#include "lib/jxl/base/status.h"
#include "lib/jxl/coeff_order.h"
#include "lib/jxl/coeff_order_fwd.h"
#include "lib/jxl/common.h"
#include "lib/jxl/dct_util.h"
#include "lib/jxl/enc_ans.h"
#include "lib/jxl/enc_ans_params.h"
#include "lib/jxl/enc_bit_writer.h"
#include "lib/jxl/frame_dimensions.h"
#include "lib/jxl/lehmer_code.h"
#include "lib/jxl/memory_manager_internal.h"

namespace jxl {

struct AuxOut;
enum class LayerType : uint8_t;

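// Returns two bitfields over the kStrategyOrder buckets occurring in `rect`:
// the first marks every coefficient order that is used, the second the
// subset for which a custom (non-default) order may be signalled.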
std::pair<uint32_t, uint32_t> ComputeUsedOrders(
    const SpeedTier speed, const AcStrategyImage& ac_strategy,
    const Rect& rect) {
  // No coefficient reordering in Falcon or faster.
  // Only DCT8 = 0 is used, so the bitfield is 1.
  if (speed >= SpeedTier::kFalcon) return {1, 1};

  uint32_t ret = 0;
  uint32_t ret_customize = 0;
  size_t xsize_blocks = rect.xsize();
  size_t ysize_blocks = rect.ysize();
  // TODO(veluca): precompute when doing DCT.
  for (size_t by = 0; by < ysize_blocks; ++by) {
    AcStrategyRow acs_row = ac_strategy.ConstRow(rect, by);
    for (size_t bx = 0; bx < xsize_blocks; ++bx) {
      int ord = kStrategyOrder[acs_row[bx].RawStrategy()];
      ret |= 1u << ord;
      // Do not customize coefficient orders for blocks bigger than 32x32.
      if (ord > 6) {
        continue;
      }
      ret_customize |= 1u << ord;
    }
  }
  // Use default orders for small images.
  if (ac_strategy.xsize() < 5 && ac_strategy.ysize() < 5) return {ret, 0};
  return {ret, ret_customize};
}

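// Derives custom coefficient orders from zero-coefficient statistics of the
// AC image: within each order bucket and channel, positions with fewer zeros
// are moved toward the front, while LLF positions always stay first.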
Status ComputeCoeffOrder(SpeedTier speed, const ACImage& ac_image,
                         const AcStrategyImage& ac_strategy,
                         const FrameDimensions& frame_dim,
                         uint32_t& all_used_orders, uint32_t prev_used_acs,
                         uint32_t current_used_acs,
                         uint32_t current_used_orders,
                         coeff_order_t* JXL_RESTRICT order) {
  JxlMemoryManager* memory_manager = ac_strategy.memory_manager();
  std::vector<int64_t> num_zeros(kCoeffOrderMaxSize);
  // If compressing at high speed and only using 8x8 DCTs, only consider a
  // subset of blocks.
  double block_fraction = 1.0f;
  // TODO(veluca): figure out why sampling blocks if non-8x8s are used makes
  // encoding significantly less dense.
  if (speed >= SpeedTier::kSquirrel && current_used_orders == 1) {
    block_fraction = 0.5f;
  }
  // No need to compute the number of zero coefficients if all orders are the
  // default.
  if (current_used_orders != 0) {
    uint64_t threshold =
        (std::numeric_limits<uint64_t>::max() >> 32) * block_fraction;
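    // `threshold` maps block_fraction onto the 32-bit range of the PRNG
    // output below, so use_sample() accepts roughly a block_fraction share
    // of the blocks.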
    uint64_t s[2] = {static_cast<uint64_t>(0x94D049BB133111EBull),
                     static_cast<uint64_t>(0xBF58476D1CE4E5B9ull)};
    // Xorshift128+ adapted from xorshift128+-inl.h
    auto use_sample = [&]() {
      auto s1 = s[0];
      const auto s0 = s[1];
      const auto bits = s1 + s0;  // b, c
      s[0] = s0;
      s1 ^= s1 << 23;
      s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5);
      s[1] = s1;
      return (bits >> 32) <= threshold;
    };

    // Count the number of zero coefficients, separately for each DCT band.
    // TODO(veluca): precompute when doing DCT.
    for (size_t group_index = 0; group_index < frame_dim.num_groups;
         group_index++) {
      const size_t gx = group_index % frame_dim.xsize_groups;
      const size_t gy = group_index / frame_dim.xsize_groups;
      const Rect rect(gx * kGroupDimInBlocks, gy * kGroupDimInBlocks,
                      kGroupDimInBlocks, kGroupDimInBlocks,
                      frame_dim.xsize_blocks, frame_dim.ysize_blocks);
      ConstACPtr rows[3];
      ACType type = ac_image.Type();
      for (size_t c = 0; c < 3; c++) {
        rows[c] = ac_image.PlaneRow(c, group_index, 0);
      }
      size_t ac_offset = 0;

      // TODO(veluca): SIMDfy.
      for (size_t by = 0; by < rect.ysize(); ++by) {
        AcStrategyRow acs_row = ac_strategy.ConstRow(rect, by);
        for (size_t bx = 0; bx < rect.xsize(); ++bx) {
          AcStrategy acs = acs_row[bx];
          if (!acs.IsFirstBlock()) continue;
          if (!use_sample()) continue;
          size_t size = kDCTBlockSize << acs.log2_covered_blocks();
          for (size_t c = 0; c < 3; ++c) {
            const size_t order_offset =
                CoeffOrderOffset(kStrategyOrder[acs.RawStrategy()], c);
            if (type == ACType::k16) {
              for (size_t k = 0; k < size; k++) {
                bool is_zero = rows[c].ptr16[ac_offset + k] == 0;
                num_zeros[order_offset + k] += is_zero ? 1 : 0;
              }
            } else {
              for (size_t k = 0; k < size; k++) {
                bool is_zero = rows[c].ptr32[ac_offset + k] == 0;
                num_zeros[order_offset + k] += is_zero ? 1 : 0;
              }
            }
            // Ensure LLFs are first in the order.
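            // (A count of -1 sorts below every real zero count, so the
            // low-frequency positions end up first after sorting.)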
            size_t cx = acs.covered_blocks_x();
            size_t cy = acs.covered_blocks_y();
            CoefficientLayout(&cy, &cx);
            for (size_t iy = 0; iy < cy; iy++) {
              for (size_t ix = 0; ix < cx; ix++) {
                num_zeros[order_offset + iy * kBlockDim * cx + ix] = -1;
              }
            }
          }
          ac_offset += size;
        }
      }
    }
  }
  struct PosAndCount {
    uint32_t pos;
    // Saving the index breaks ties for the non-stable sort below.
    uint64_t count_and_idx;
  };
  size_t mem_bytes = AcStrategy::kMaxCoeffArea * sizeof(PosAndCount);
  JXL_ASSIGN_OR_RETURN(auto mem,
                       AlignedMemory::Create(memory_manager, mem_bytes));

  std::vector<coeff_order_t> natural_order_buffer;

  uint16_t computed = 0;
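  // Several raw strategies share a kStrategyOrder bucket; `computed` ensures
  // each bucket is processed only once.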
  for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) {
    uint8_t ord = kStrategyOrder[o];
    if (computed & (1 << ord)) continue;
    computed |= 1 << ord;
    AcStrategy acs = AcStrategy::FromRawStrategy(o);
    size_t sz = kDCTBlockSize * acs.covered_blocks_x() * acs.covered_blocks_y();
    // Expected maximal size is 256 x 256.
    JXL_DASSERT(sz <= (1 << 16));

    // Do nothing for transforms that don't appear.
    if ((1 << ord) & ~current_used_acs) continue;

    // Do nothing if we already committed to this custom order previously.
    if ((1 << ord) & prev_used_acs) continue;
    if ((1 << ord) & all_used_orders) continue;

    if (natural_order_buffer.size() < sz) natural_order_buffer.resize(sz);
    acs.ComputeNaturalCoeffOrder(natural_order_buffer.data());

    // Ensure the natural coefficient order is not permuted if the order is
    // not transmitted.
    if ((1 << ord) & ~current_used_orders) {
      for (size_t c = 0; c < 3; c++) {
        size_t offset = CoeffOrderOffset(ord, c);
        JXL_ENSURE(CoeffOrderOffset(ord, c + 1) - offset == sz);
        memcpy(&order[offset], natural_order_buffer.data(),
               sz * sizeof(*order));
      }
      continue;
    }

    bool is_nondefault = false;
    for (uint8_t c = 0; c < 3; c++) {
      // Apply zig-zag order.
      PosAndCount* pos_and_val = mem.address<PosAndCount>();
      size_t offset = CoeffOrderOffset(ord, c);
      JXL_ENSURE(CoeffOrderOffset(ord, c + 1) - offset == sz);
      float inv_sqrt_sz = 1.0f / std::sqrt(sz);
      for (size_t i = 0; i < sz; ++i) {
        size_t pos = natural_order_buffer[i];
        pos_and_val[i].pos = pos;
        // The exact count does not matter; quantize the number of zeros to
        // obtain a less permuted order.
        uint64_t count = num_zeros[offset + pos] * inv_sqrt_sz + 0.1f;
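        // E.g. for an 8x8 block, sz = 64 and inv_sqrt_sz = 1/8, so a
        // position that was zero in 40 sampled blocks quantizes to count 5.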
        // Worst case: all dct8x8, all zeroes: count <= nb_pixels/64/8.
        // nb_pixels is limited to 2^40 (Level 10 limit),
        // so count is limited to 2^31.
        JXL_DASSERT(count < (uint64_t{1} << 48));
        pos_and_val[i].count_and_idx = (count << 16) | i;
      }

      // Sorting by (count, index) preserves the relative order of elements
      // with the same number of zeros, i.e. it emulates a stable sort.
      auto comparator = [](const PosAndCount& a, const PosAndCount& b) -> bool {
        return a.count_and_idx < b.count_and_idx;
      };
      std::sort(pos_and_val, pos_and_val + sz, comparator);

      // Grab indices.
      for (size_t i = 0; i < sz; ++i) {
        order[offset + i] = pos_and_val[i].pos;
        is_nondefault |= natural_order_buffer[i] != pos_and_val[i].pos;
      }
    }
    if (!is_nondefault) {
      current_used_orders &= ~(1 << ord);
    }
  }
  all_used_orders |= current_used_orders;
  return true;
}

namespace {

Status TokenizePermutation(const coeff_order_t* JXL_RESTRICT order, size_t skip,
                           size_t size, std::vector<Token>* tokens) {
  std::vector<LehmerT> lehmer(size);
  std::vector<uint32_t> temp(size + 1);
  JXL_RETURN_IF_ERROR(
      ComputeLehmerCode(order, temp.data(), size, lehmer.data()));
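  // Trailing zeros of the Lehmer code correspond to elements already in
  // natural position; they are trimmed and only `end - skip` values are
  // tokenized. E.g. the permutation (2, 0, 1) has Lehmer code (2, 0, 0), so
  // with skip = 0 only the leading 2 is emitted.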
  size_t end = size;
  while (end > skip && lehmer[end - 1] == 0) {
    --end;
  }
  tokens->emplace_back(CoeffOrderContext(size), end - skip);
  uint32_t last = 0;
  for (size_t i = skip; i < end; ++i) {
    tokens->emplace_back(CoeffOrderContext(last), lehmer[i]);
    last = lehmer[i];
  }
  return true;
}

}  // namespace

Status EncodePermutation(const coeff_order_t* JXL_RESTRICT order, size_t skip,
                         size_t size, BitWriter* writer, LayerType layer,
                         AuxOut* aux_out) {
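  // Tokenize the permutation, then build and encode the histograms and write
  // the single token stream.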
  JxlMemoryManager* memory_manager = writer->memory_manager();
  std::vector<std::vector<Token>> tokens(1);
  JXL_RETURN_IF_ERROR(TokenizePermutation(order, skip, size, tokens.data()));
  EntropyEncodingData codes;
  JXL_ASSIGN_OR_RETURN(
      size_t cost, BuildAndEncodeHistograms(memory_manager, HistogramParams(),
                                            kPermutationContexts, tokens,
                                            &codes, writer, layer, aux_out));
  (void)cost;
  JXL_RETURN_IF_ERROR(WriteTokens(tokens[0], codes, 0, writer, layer, aux_out));
  return true;
}

namespace {
Status EncodeCoeffOrder(const coeff_order_t* JXL_RESTRICT order, AcStrategy acs,
                        std::vector<Token>* tokens, coeff_order_t* order_zigzag,
                        std::vector<coeff_order_t>& natural_order_lut) {
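  // `order` lists raw coefficient positions; remap each entry through
  // natural_order_lut so the permutation is expressed relative to the
  // natural (zig-zag) order before tokenizing.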
  const size_t llf = acs.covered_blocks_x() * acs.covered_blocks_y();
  const size_t size = kDCTBlockSize * llf;
  for (size_t i = 0; i < size; ++i) {
    order_zigzag[i] = natural_order_lut[order[i]];
  }
  JXL_RETURN_IF_ERROR(TokenizePermutation(order_zigzag, llf, size, tokens));
  return true;
}
}  // namespace

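// Writes the custom orders of all used buckets: the permutations of all
// buckets and channels are tokenized into one stream that shares a single
// set of histograms.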
Status EncodeCoeffOrders(uint16_t used_orders,
                         const coeff_order_t* JXL_RESTRICT order,
                         BitWriter* writer, LayerType layer,
                         AuxOut* JXL_RESTRICT aux_out) {
  JxlMemoryManager* memory_manager = writer->memory_manager();
  size_t mem_bytes = AcStrategy::kMaxCoeffArea * sizeof(coeff_order_t);
  JXL_ASSIGN_OR_RETURN(auto mem,
                       AlignedMemory::Create(memory_manager, mem_bytes));
  uint16_t computed = 0;
  std::vector<std::vector<Token>> tokens(1);
  std::vector<coeff_order_t> natural_order_lut;
  for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) {
    uint8_t ord = kStrategyOrder[o];
    if (computed & (1 << ord)) continue;
    computed |= 1 << ord;
    if ((used_orders & (1 << ord)) == 0) continue;
    AcStrategy acs = AcStrategy::FromRawStrategy(o);
    const size_t llf = acs.covered_blocks_x() * acs.covered_blocks_y();
    const size_t size = kDCTBlockSize * llf;
    if (natural_order_lut.size() < size) natural_order_lut.resize(size);
    acs.ComputeNaturalCoeffOrderLut(natural_order_lut.data());
    for (size_t c = 0; c < 3; c++) {
      JXL_RETURN_IF_ERROR(
          EncodeCoeffOrder(&order[CoeffOrderOffset(ord, c)], acs, tokens.data(),
                           mem.address<coeff_order_t>(), natural_order_lut));
    }
  }
  // Do not write anything if no order is used.
  if (used_orders != 0) {
    EntropyEncodingData codes;
    JXL_ASSIGN_OR_RETURN(
        size_t cost, BuildAndEncodeHistograms(memory_manager, HistogramParams(),
                                              kPermutationContexts, tokens,
                                              &codes, writer, layer, aux_out));
    (void)cost;
    JXL_RETURN_IF_ERROR(
        WriteTokens(tokens[0], codes, 0, writer, layer, aux_out));
  }
  return true;
}

}  // namespace jxl