/src/libjxl/lib/jxl/enc_coeff_order.cc

Source
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include <jxl/memory_manager.h>

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <limits>
#include <utility>
#include <vector>

#include "lib/jxl/ac_strategy.h"
#include "lib/jxl/base/compiler_specific.h"
#include "lib/jxl/base/rect.h"
#include "lib/jxl/base/status.h"
#include "lib/jxl/coeff_order.h"
#include "lib/jxl/coeff_order_fwd.h"
#include "lib/jxl/common.h"
#include "lib/jxl/dct_util.h"
#include "lib/jxl/enc_ans.h"
#include "lib/jxl/enc_ans_params.h"
#include "lib/jxl/enc_bit_writer.h"
#include "lib/jxl/frame_dimensions.h"
#include "lib/jxl/lehmer_code.h"
#include "lib/jxl/memory_manager_internal.h"

namespace jxl {

struct AuxOut;
enum class LayerType : uint8_t;

std::pair<uint32_t, uint32_t> ComputeUsedOrders(
    const SpeedTier speed, const AcStrategyImage& ac_strategy,
    const Rect& rect) {
  // No coefficient reordering in Falcon or faster.
  // Only uses DCT8 = 0, so bitfield = 1.
  if (speed >= SpeedTier::kFalcon) return {1, 1};

  uint32_t ret = 0;
  uint32_t ret_customize = 0;
  size_t xsize_blocks = rect.xsize();
  size_t ysize_blocks = rect.ysize();
  // TODO(veluca): precompute when doing DCT.
  for (size_t by = 0; by < ysize_blocks; ++by) {
    AcStrategyRow acs_row = ac_strategy.ConstRow(rect, by);
    for (size_t bx = 0; bx < xsize_blocks; ++bx) {
      int ord = kStrategyOrder[acs_row[bx].RawStrategy()];
      // Do not customize coefficient orders for blocks bigger than 32x32.
      ret |= 1u << ord;
      if (ord > 6) {
        continue;
      }
      ret_customize |= 1u << ord;
    }
  }
  // Use default orders for small images.
  if (ac_strategy.xsize() < 5 && ac_strategy.ysize() < 5) return {ret, 0};
  return {ret, ret_customize};
}

Status ComputeCoeffOrder(SpeedTier speed, const ACImage& ac_image,
                         const AcStrategyImage& ac_strategy,
                         const FrameDimensions& frame_dim,
                         uint32_t& all_used_orders, uint32_t prev_used_acs,
                         uint32_t current_used_acs,
                         uint32_t current_used_orders,
                         coeff_order_t* JXL_RESTRICT order) {
  JxlMemoryManager* memory_manager = ac_strategy.memory_manager();
  std::vector<int64_t> num_zeros(kCoeffOrderMaxSize);
  // If compressing at high speed and only using 8x8 DCTs, only consider a
  // subset of blocks.
  double block_fraction = 1.0f;
  // TODO(veluca): figure out why sampling blocks if non-8x8s are used makes
  // encoding significantly less dense.
  if (speed >= SpeedTier::kSquirrel && current_used_orders == 1) {
    block_fraction = 0.5f;
  }
  // No need to compute number of zero coefficients if all orders are the
  // default.
  if (current_used_orders != 0) {
    uint64_t threshold =
        (std::numeric_limits<uint64_t>::max() >> 32) * block_fraction;
    uint64_t s[2] = {static_cast<uint64_t>(0x94D049BB133111EBull),
                     static_cast<uint64_t>(0xBF58476D1CE4E5B9ull)};
    // Xorshift128+ adapted from xorshift128+-inl.h
    auto use_sample = [&]() {
      auto s1 = s[0];
      const auto s0 = s[1];
      const auto bits = s1 + s0;  // b, c
      s[0] = s0;
      s1 ^= s1 << 23;
      s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5);
      s[1] = s1;
      return (bits >> 32) <= threshold;
    };

    // Count number of zero coefficients, separately for each DCT band.
    // TODO(veluca): precompute when doing DCT.
    for (size_t group_index = 0; group_index < frame_dim.num_groups;
         group_index++) {
      const size_t gx = group_index % frame_dim.xsize_groups;
      const size_t gy = group_index / frame_dim.xsize_groups;
      const Rect rect(gx * kGroupDimInBlocks, gy * kGroupDimInBlocks,
                      kGroupDimInBlocks, kGroupDimInBlocks,
                      frame_dim.xsize_blocks, frame_dim.ysize_blocks);
      ConstACPtr rows[3];
      ACType type = ac_image.Type();
      for (size_t c = 0; c < 3; c++) {
        rows[c] = ac_image.PlaneRow(c, group_index, 0);
      }
      size_t ac_offset = 0;

      // TODO(veluca): SIMDfy.
      for (size_t by = 0; by < rect.ysize(); ++by) {
        AcStrategyRow acs_row = ac_strategy.ConstRow(rect, by);
        for (size_t bx = 0; bx < rect.xsize(); ++bx) {
          AcStrategy acs = acs_row[bx];
          if (!acs.IsFirstBlock()) continue;
          if (!use_sample()) continue;
          size_t size = kDCTBlockSize << acs.log2_covered_blocks();
          for (size_t c = 0; c < 3; ++c) {
            const size_t order_offset =
                CoeffOrderOffset(kStrategyOrder[acs.RawStrategy()], c);
            if (type == ACType::k16) {
              for (size_t k = 0; k < size; k++) {
                bool is_zero = rows[c].ptr16[ac_offset + k] == 0;
                num_zeros[order_offset + k] += is_zero ? 1 : 0;
              }
            } else {
              for (size_t k = 0; k < size; k++) {
                bool is_zero = rows[c].ptr32[ac_offset + k] == 0;
                num_zeros[order_offset + k] += is_zero ? 1 : 0;
              }
            }
            // Ensure LLFs are first in the order.
            size_t cx = acs.covered_blocks_x();
            size_t cy = acs.covered_blocks_y();
            CoefficientLayout(&cy, &cx);
            for (size_t iy = 0; iy < cy; iy++) {
              for (size_t ix = 0; ix < cx; ix++) {
                num_zeros[order_offset + iy * kBlockDim * cx + ix] = -1;
              }
            }
          }
          ac_offset += size;
        }
      }
    }
  }
  struct PosAndCount {
    uint32_t pos;
    // Saving index breaks the ties for non-stable sort
    uint64_t count_and_idx;
  };
  size_t mem_bytes = AcStrategy::kMaxCoeffArea * sizeof(PosAndCount);
  JXL_ASSIGN_OR_RETURN(auto mem,
                       AlignedMemory::Create(memory_manager, mem_bytes));

  std::vector<coeff_order_t> natural_order_buffer;

  uint16_t computed = 0;
  for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) {
    uint8_t ord = kStrategyOrder[o];
    if (computed & (1 << ord)) continue;
    computed |= 1 << ord;
    AcStrategy acs = AcStrategy::FromRawStrategy(o);
    size_t sz = kDCTBlockSize * acs.covered_blocks_x() * acs.covered_blocks_y();
    // Expected maximal size is 256 x 256.
    JXL_DASSERT(sz <= (1 << 16));

    // Do nothing for transforms that don't appear.
    if ((1 << ord) & ~current_used_acs) continue;

    // Do nothing if we already committed to this custom order previously.
    if ((1 << ord) & prev_used_acs) continue;
    if ((1 << ord) & all_used_orders) continue;

    if (natural_order_buffer.size() < sz) natural_order_buffer.resize(sz);
    acs.ComputeNaturalCoeffOrder(natural_order_buffer.data());

    // Ensure natural coefficient order is not permuted if the order is
    // not transmitted.
    if ((1 << ord) & ~current_used_orders) {
      for (size_t c = 0; c < 3; c++) {
        size_t offset = CoeffOrderOffset(ord, c);
        JXL_ENSURE(CoeffOrderOffset(ord, c + 1) - offset == sz);
        memcpy(&order[offset], natural_order_buffer.data(),
               sz * sizeof(*order));
      }
      continue;
    }

    bool is_nondefault = false;
    for (uint8_t c = 0; c < 3; c++) {
      // Apply zig-zag order.
      PosAndCount* pos_and_val = mem.address<PosAndCount>();
      size_t offset = CoeffOrderOffset(ord, c);
      JXL_ENSURE(CoeffOrderOffset(ord, c + 1) - offset == sz);
      float inv_sqrt_sz = 1.0f / std::sqrt(sz);
      for (size_t i = 0; i < sz; ++i) {
        size_t pos = natural_order_buffer[i];
        pos_and_val[i].pos = pos;
        // We don't care for the exact number -> quantize number of zeros,
        // to get less permuted order.
        uint64_t count = num_zeros[offset + pos] * inv_sqrt_sz + 0.1f;
        // Worst case: all dct8x8, all zeroes: count <= nb_pixels/64/8
        // nb_pixels is limited to 2^40 (Level 10 limit)
        // so count is limited to 2^31
        JXL_DASSERT(count < (uint64_t{1} << 48));
        pos_and_val[i].count_and_idx = (count << 16) | i;
      }

      // Stable-sort -> elements with same number of zeros will preserve their
      // order.
      auto comparator = [](const PosAndCount& a, const PosAndCount& b) -> bool {
        return a.count_and_idx < b.count_and_idx;
      };
      std::sort(pos_and_val, pos_and_val + sz, comparator);

      // Grab indices.
      for (size_t i = 0; i < sz; ++i) {
        order[offset + i] = pos_and_val[i].pos;
        is_nondefault |= natural_order_buffer[i] != pos_and_val[i].pos;
      }
    }
    if (!is_nondefault) {
      current_used_orders &= ~(1 << ord);
    }
  }
  all_used_orders |= current_used_orders;
  return true;
}

namespace {

Status TokenizePermutation(const coeff_order_t* JXL_RESTRICT order, size_t skip,
                           size_t size, std::vector<Token>* tokens) {
  std::vector<LehmerT> lehmer(size);
  std::vector<uint32_t> temp(size + 1);
  JXL_RETURN_IF_ERROR(
      ComputeLehmerCode(order, temp.data(), size, lehmer.data()));
  size_t end = size;
  while (end > skip && lehmer[end - 1] == 0) {
    --end;
  }
  tokens->emplace_back(CoeffOrderContext(size),
                       static_cast<uint32_t>(end - skip));
  uint32_t last = 0;
  for (size_t i = skip; i < end; ++i) {
    tokens->emplace_back(CoeffOrderContext(last), lehmer[i]);
    last = lehmer[i];
  }
  return true;
}

}  // namespace

Status EncodePermutation(const coeff_order_t* JXL_RESTRICT order, size_t skip,
                         size_t size, BitWriter* writer, LayerType layer,
                         AuxOut* aux_out) {
  JxlMemoryManager* memory_manager = writer->memory_manager();
  std::vector<std::vector<Token>> tokens(1);
  JXL_RETURN_IF_ERROR(TokenizePermutation(order, skip, size, tokens.data()));
  EntropyEncodingData codes;
  JXL_ASSIGN_OR_RETURN(
      size_t cost, BuildAndEncodeHistograms(memory_manager, HistogramParams(),
                                            kPermutationContexts, tokens,
                                            &codes, writer, layer, aux_out));
  (void)cost;
  JXL_RETURN_IF_ERROR(WriteTokens(tokens[0], codes, 0, writer, layer, aux_out));
  return true;
}

namespace {
Status EncodeCoeffOrder(const coeff_order_t* JXL_RESTRICT order, AcStrategy acs,
                        std::vector<Token>* tokens, coeff_order_t* order_zigzag,
                        std::vector<coeff_order_t>& natural_order_lut) {
  const size_t llf = acs.covered_blocks_x() * acs.covered_blocks_y();
  const size_t size = kDCTBlockSize * llf;
  for (size_t i = 0; i < size; ++i) {
    order_zigzag[i] = natural_order_lut[order[i]];
  }
  JXL_RETURN_IF_ERROR(TokenizePermutation(order_zigzag, llf, size, tokens));
  return true;
}
}  // namespace

Status EncodeCoeffOrders(uint16_t used_orders,
                         const coeff_order_t* JXL_RESTRICT order,
                         BitWriter* writer, LayerType layer,
                         AuxOut* JXL_RESTRICT aux_out) {
  JxlMemoryManager* memory_manager = writer->memory_manager();
  size_t mem_bytes = AcStrategy::kMaxCoeffArea * sizeof(coeff_order_t);
  JXL_ASSIGN_OR_RETURN(auto mem,
                       AlignedMemory::Create(memory_manager, mem_bytes));
  uint16_t computed = 0;
  std::vector<std::vector<Token>> tokens(1);
  std::vector<coeff_order_t> natural_order_lut;
  for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) {
    uint8_t ord = kStrategyOrder[o];
    if (computed & (1 << ord)) continue;
    computed |= 1 << ord;
    if ((used_orders & (1 << ord)) == 0) continue;
    AcStrategy acs = AcStrategy::FromRawStrategy(o);
    const size_t llf = acs.covered_blocks_x() * acs.covered_blocks_y();
    const size_t size = kDCTBlockSize * llf;
    if (natural_order_lut.size() < size) natural_order_lut.resize(size);
    acs.ComputeNaturalCoeffOrderLut(natural_order_lut.data());
    for (size_t c = 0; c < 3; c++) {
      JXL_RETURN_IF_ERROR(
          EncodeCoeffOrder(&order[CoeffOrderOffset(ord, c)], acs, tokens.data(),
                           mem.address<coeff_order_t>(), natural_order_lut));
    }
  }
  // Do not write anything if no order is used.
  if (used_orders != 0) {
    EntropyEncodingData codes;
    JXL_ASSIGN_OR_RETURN(
        size_t cost, BuildAndEncodeHistograms(memory_manager, HistogramParams(),
                                              kPermutationContexts, tokens,
                                              &codes, writer, layer, aux_out));
    (void)cost;
    JXL_RETURN_IF_ERROR(
        WriteTokens(tokens[0], codes, 0, writer, layer, aux_out));
  }
  return true;
}

}  // namespace jxl

Coverage Report

Created: 2025-11-16 07:22

Line	Count	Source
1		// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2		//
3		// Use of this source code is governed by a BSD-style
4		// license that can be found in the LICENSE file.
5
6		#include <jxl/memory_manager.h>
7
8		#include <algorithm>
9		#include <cmath>
10		#include <cstddef>
11		#include <cstdint>
12		#include <cstring>
13		#include <limits>
14		#include <utility>
15		#include <vector>
16
17		#include "lib/jxl/ac_strategy.h"
18		#include "lib/jxl/base/compiler_specific.h"
19		#include "lib/jxl/base/rect.h"
20		#include "lib/jxl/base/status.h"
21		#include "lib/jxl/coeff_order.h"
22		#include "lib/jxl/coeff_order_fwd.h"
23		#include "lib/jxl/common.h"
24		#include "lib/jxl/dct_util.h"
25		#include "lib/jxl/enc_ans.h"
26		#include "lib/jxl/enc_ans_params.h"
27		#include "lib/jxl/enc_bit_writer.h"
28		#include "lib/jxl/frame_dimensions.h"
29		#include "lib/jxl/lehmer_code.h"
30		#include "lib/jxl/memory_manager_internal.h"
31
32		namespace jxl {
33
34		struct AuxOut;
35		enum class LayerType : uint8_t;
36
37		std::pair<uint32_t, uint32_t> ComputeUsedOrders(
38		const SpeedTier speed, const AcStrategyImage& ac_strategy,
39	3.48k	const Rect& rect) {
40		// No coefficient reordering in Falcon or faster.
41		// Only uses DCT8 = 0, so bitfield = 1.
42	3.48k	if (speed >= SpeedTier::kFalcon) return {1, 1};
43
44	3.48k	uint32_t ret = 0;
45	3.48k	uint32_t ret_customize = 0;
46	3.48k	size_t xsize_blocks = rect.xsize();
47	3.48k	size_t ysize_blocks = rect.ysize();
48		// TODO(veluca): precompute when doing DCT.
49	108k	for (size_t by = 0; by < ysize_blocks; ++by) {
50	105k	AcStrategyRow acs_row = ac_strategy.ConstRow(rect, by);
51	5.04M	for (size_t bx = 0; bx < xsize_blocks; ++bx) {
52	4.94M	int ord = kStrategyOrder[acs_row[bx].RawStrategy()];
53		// Do not customize coefficient orders for blocks bigger than 32x32.
54	4.94M	ret \|= 1u << ord;
55	4.94M	if (ord > 6) {
56	927k	continue;
57	927k	}
58	4.01M	ret_customize \|= 1u << ord;
59	4.01M	}
60	105k	}
61		// Use default orders for small images.
62	3.48k	if (ac_strategy.xsize() < 5 && ac_strategy.ysize() < 5) return {ret, 0};
63	3.27k	return {ret, ret_customize};
64	3.48k	}
65
66		Status ComputeCoeffOrder(SpeedTier speed, const ACImage& ac_image,
67		const AcStrategyImage& ac_strategy,
68		const FrameDimensions& frame_dim,
69		uint32_t& all_used_orders, uint32_t prev_used_acs,
70		uint32_t current_used_acs,
71		uint32_t current_used_orders,
72	3.48k	coeff_order_t* JXL_RESTRICT order) {
73	3.48k	JxlMemoryManager* memory_manager = ac_strategy.memory_manager();
74	3.48k	std::vector<int64_t> num_zeros(kCoeffOrderMaxSize);
75		// If compressing at high speed and only using 8x8 DCTs, only consider a
76		// subset of blocks.
77	3.48k	double block_fraction = 1.0f;
78		// TODO(veluca): figure out why sampling blocks if non-8x8s are used makes
79		// encoding significantly less dense.
80	3.48k	if (speed >= SpeedTier::kSquirrel && current_used_orders == 1) {
81	10	block_fraction = 0.5f;
82	10	}
83		// No need to compute number of zero coefficients if all orders are the
84		// default.
85	3.48k	if (current_used_orders != 0) {
86	3.24k	uint64_t threshold =
87	3.24k	(std::numeric_limits<uint64_t>::max() >> 32) * block_fraction;
88	3.24k	uint64_t s[2] = {static_cast<uint64_t>(0x94D049BB133111EBull),
89	3.24k	static_cast<uint64_t>(0xBF58476D1CE4E5B9ull)};
90		// Xorshift128+ adapted from xorshift128+-inl.h
91	2.49M	auto use_sample = [&]() {
92	2.49M	auto s1 = s[0];
93	2.49M	const auto s0 = s[1];
94	2.49M	const auto bits = s1 + s0; // b, c
95	2.49M	s[0] = s0;
96	2.49M	s1 ^= s1 << 23;
97	2.49M	s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5);
98	2.49M	s[1] = s1;
99	2.49M	return (bits >> 32) <= threshold;
100	2.49M	};
101
102		// Count number of zero coefficients, separately for each DCT band.
103		// TODO(veluca): precompute when doing DCT.
104	11.8k	for (size_t group_index = 0; group_index < frame_dim.num_groups;
105	8.59k	group_index++) {
106	8.59k	const size_t gx = group_index % frame_dim.xsize_groups;
107	8.59k	const size_t gy = group_index / frame_dim.xsize_groups;
108	8.59k	const Rect rect(gx * kGroupDimInBlocks, gy * kGroupDimInBlocks,
109	8.59k	kGroupDimInBlocks, kGroupDimInBlocks,
110	8.59k	frame_dim.xsize_blocks, frame_dim.ysize_blocks);
111	8.59k	ConstACPtr rows[3];
112	8.59k	ACType type = ac_image.Type();
113	34.3k	for (size_t c = 0; c < 3; c++) {
114	25.7k	rows[c] = ac_image.PlaneRow(c, group_index, 0);
115	25.7k	}
116	8.59k	size_t ac_offset = 0;
117
118		// TODO(veluca): SIMDfy.
119	198k	for (size_t by = 0; by < rect.ysize(); ++by) {
120	190k	AcStrategyRow acs_row = ac_strategy.ConstRow(rect, by);
121	5.08M	for (size_t bx = 0; bx < rect.xsize(); ++bx) {
122	4.89M	AcStrategy acs = acs_row[bx];
123	4.89M	if (!acs.IsFirstBlock()) continue;
124	2.49M	if (!use_sample()) continue;
125	2.49M	size_t size = kDCTBlockSize << acs.log2_covered_blocks();
126	9.99M	for (size_t c = 0; c < 3; ++c) {
127	7.49M	const size_t order_offset =
128	7.49M	CoeffOrderOffset(kStrategyOrder[acs.RawStrategy()], c);
129	7.49M	if (type == ACType::k16) {
130	0	for (size_t k = 0; k < size; k++) {
131	0	bool is_zero = rows[c].ptr16[ac_offset + k] == 0;
132	0	num_zeros[order_offset + k] += is_zero ? 1 : 0;
133	0	}
134	7.49M	} else {
135	948M	for (size_t k = 0; k < size; k++) {
136	940M	bool is_zero = rows[c].ptr32[ac_offset + k] == 0;
137	940M	num_zeros[order_offset + k] += is_zero ? 1 : 0;
138	940M	}
139	7.49M	}
140		// Ensure LLFs are first in the order.
141	7.49M	size_t cx = acs.covered_blocks_x();
142	7.49M	size_t cy = acs.covered_blocks_y();
143	7.49M	CoefficientLayout(&cy, &cx);
144	16.0M	for (size_t iy = 0; iy < cy; iy++) {
145	23.2M	for (size_t ix = 0; ix < cx; ix++) {
146	14.6M	num_zeros[order_offset + iy * kBlockDim * cx + ix] = -1;
147	14.6M	}
148	8.55M	}
149	7.49M	}
150	2.49M	ac_offset += size;
151	2.49M	}
152	190k	}
153	8.59k	}
154	3.24k	}
155	3.48k	struct PosAndCount {
156	3.48k	uint32_t pos;
157		// Saving index breaks the ties for non-stable sort
158	3.48k	uint64_t count_and_idx;
159	3.48k	};
160	3.48k	size_t mem_bytes = AcStrategy::kMaxCoeffArea * sizeof(PosAndCount);
161	3.48k	JXL_ASSIGN_OR_RETURN(auto mem,
162	3.48k	AlignedMemory::Create(memory_manager, mem_bytes));
163
164	3.48k	std::vector<coeff_order_t> natural_order_buffer;
165
166	3.48k	uint16_t computed = 0;
167	97.6k	for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) {
168	94.1k	uint8_t ord = kStrategyOrder[o];
169	94.1k	if (computed & (1 << ord)) continue;
170	45.3k	computed \|= 1 << ord;
171	45.3k	AcStrategy acs = AcStrategy::FromRawStrategy(o);
172	45.3k	size_t sz = kDCTBlockSize * acs.covered_blocks_x() * acs.covered_blocks_y();
173		// Expected maximal size is 256 x 256.
174	45.3k	JXL_DASSERT(sz <= (1 << 16));
175
176		// Do nothing for transforms that don't appear.
177	45.3k	if ((1 << ord) & ~current_used_acs) continue;
178
179		// Do nothing if we already committed to this custom order previously.
180	16.5k	if ((1 << ord) & prev_used_acs) continue;
181	16.5k	if ((1 << ord) & all_used_orders) continue;
182
183	16.5k	if (natural_order_buffer.size() < sz) natural_order_buffer.resize(sz);
184	16.5k	acs.ComputeNaturalCoeffOrder(natural_order_buffer.data());
185
186		// Ensure natural coefficient order is not permuted if the order is
187		// not transmitted.
188	16.5k	if ((1 << ord) & ~current_used_orders) {
189	7.30k	for (size_t c = 0; c < 3; c++) {
190	5.47k	size_t offset = CoeffOrderOffset(ord, c);
191	5.47k	JXL_ENSURE(CoeffOrderOffset(ord, c + 1) - offset == sz);
192	5.47k	memcpy(&order[offset], natural_order_buffer.data(),
193	5.47k	sz * sizeof(*order));
194	5.47k	}
195	1.82k	continue;
196	1.82k	}
197
198	14.7k	bool is_nondefault = false;
199	58.8k	for (uint8_t c = 0; c < 3; c++) {
200		// Apply zig-zag order.
201	44.1k	PosAndCount* pos_and_val = mem.address<PosAndCount>();
202	44.1k	size_t offset = CoeffOrderOffset(ord, c);
203	44.1k	JXL_ENSURE(CoeffOrderOffset(ord, c + 1) - offset == sz);
204	44.1k	float inv_sqrt_sz = 1.0f / std::sqrt(sz);
205	13.2M	for (size_t i = 0; i < sz; ++i) {
206	13.2M	size_t pos = natural_order_buffer[i];
207	13.2M	pos_and_val[i].pos = pos;
208		// We don't care for the exact number -> quantize number of zeros,
209		// to get less permuted order.
210	13.2M	uint64_t count = num_zeros[offset + pos] * inv_sqrt_sz + 0.1f;
211		// Worst case: all dct8x8, all zeroes: count <= nb_pixels/64/8
212		// nb_pixels is limited to 2^40 (Level 10 limit)
213		// so count is limited to 2^31
214	13.2M	JXL_DASSERT(count < (uint64_t{1} << 48));
215	13.2M	pos_and_val[i].count_and_idx = (count << 16) \| i;
216	13.2M	}
217
218		// Stable-sort -> elements with same number of zeros will preserve their
219		// order.
220	38.5M	auto comparator = [](const PosAndCount& a, const PosAndCount& b) -> bool {
221	38.5M	return a.count_and_idx < b.count_and_idx;
222	38.5M	};
223	44.1k	std::sort(pos_and_val, pos_and_val + sz, comparator);
224
225		// Grab indices.
226	13.2M	for (size_t i = 0; i < sz; ++i) {
227	13.2M	order[offset + i] = pos_and_val[i].pos;
228	13.2M	is_nondefault \|= natural_order_buffer[i] != pos_and_val[i].pos;
229	13.2M	}
230	44.1k	}
231	14.7k	if (!is_nondefault) {
232	6.66k	current_used_orders &= ~(1 << ord);
233	6.66k	}
234	14.7k	}
235	3.48k	all_used_orders \|= current_used_orders;
236	3.48k	return true;
237	3.48k	}
238
239		namespace {
240
241		Status TokenizePermutation(const coeff_order_t* JXL_RESTRICT order, size_t skip,
242	24.1k	size_t size, std::vector<Token>* tokens) {
243	24.1k	std::vector<LehmerT> lehmer(size);
244	24.1k	std::vector<uint32_t> temp(size + 1);
245	24.1k	JXL_RETURN_IF_ERROR(
246	24.1k	ComputeLehmerCode(order, temp.data(), size, lehmer.data()));
247	24.1k	size_t end = size;
248	3.31M	while (end > skip && lehmer[end - 1] == 0) {
249	3.28M	--end;
250	3.28M	}
251	24.1k	tokens->emplace_back(CoeffOrderContext(size),
252	24.1k	static_cast<uint32_t>(end - skip));
253	24.1k	uint32_t last = 0;
254	1.03M	for (size_t i = skip; i < end; ++i) {
255	1.01M	tokens->emplace_back(CoeffOrderContext(last), lehmer[i]);
256	1.01M	last = lehmer[i];
257	1.01M	}
258	24.1k	return true;
259	24.1k	}
260
261		} // namespace
262
263		Status EncodePermutation(const coeff_order_t* JXL_RESTRICT order, size_t skip,
264		size_t size, BitWriter* writer, LayerType layer,
265	0	AuxOut* aux_out) {
266	0	JxlMemoryManager* memory_manager = writer->memory_manager();
267	0	std::vector<std::vector<Token>> tokens(1);
268	0	JXL_RETURN_IF_ERROR(TokenizePermutation(order, skip, size, tokens.data()));
269	0	EntropyEncodingData codes;
270	0	JXL_ASSIGN_OR_RETURN(
271	0	size_t cost, BuildAndEncodeHistograms(memory_manager, HistogramParams(),
272	0	kPermutationContexts, tokens,
273	0	&codes, writer, layer, aux_out));
274	0	(void)cost;
275	0	JXL_RETURN_IF_ERROR(WriteTokens(tokens[0], codes, 0, writer, layer, aux_out));
276	0	return true;
277	0	}
278
279		namespace {
280		Status EncodeCoeffOrder(const coeff_order_t* JXL_RESTRICT order, AcStrategy acs,
281		std::vector<Token>* tokens, coeff_order_t* order_zigzag,
282	24.1k	std::vector<coeff_order_t>& natural_order_lut) {
283	24.1k	const size_t llf = acs.covered_blocks_x() * acs.covered_blocks_y();
284	24.1k	const size_t size = kDCTBlockSize * llf;
285	4.39M	for (size_t i = 0; i < size; ++i) {
286	4.37M	order_zigzag[i] = natural_order_lut[order[i]];
287	4.37M	}
288	24.1k	JXL_RETURN_IF_ERROR(TokenizePermutation(order_zigzag, llf, size, tokens));
289	24.1k	return true;
290	24.1k	}
291		} // namespace
292
293		Status EncodeCoeffOrders(uint16_t used_orders,
294		const coeff_order_t* JXL_RESTRICT order,
295		BitWriter* writer, LayerType layer,
296	3.48k	AuxOut* JXL_RESTRICT aux_out) {
297	3.48k	JxlMemoryManager* memory_manager = writer->memory_manager();
298	3.48k	size_t mem_bytes = AcStrategy::kMaxCoeffArea * sizeof(coeff_order_t);
299	3.48k	JXL_ASSIGN_OR_RETURN(auto mem,
300	3.48k	AlignedMemory::Create(memory_manager, mem_bytes));
301	3.48k	uint16_t computed = 0;
302	3.48k	std::vector<std::vector<Token>> tokens(1);
303	3.48k	std::vector<coeff_order_t> natural_order_lut;
304	97.6k	for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) {
305	94.1k	uint8_t ord = kStrategyOrder[o];
306	94.1k	if (computed & (1 << ord)) continue;
307	45.3k	computed \|= 1 << ord;
308	45.3k	if ((used_orders & (1 << ord)) == 0) continue;
309	8.04k	AcStrategy acs = AcStrategy::FromRawStrategy(o);
310	8.04k	const size_t llf = acs.covered_blocks_x() * acs.covered_blocks_y();
311	8.04k	const size_t size = kDCTBlockSize * llf;
312	8.04k	if (natural_order_lut.size() < size) natural_order_lut.resize(size);
313	8.04k	acs.ComputeNaturalCoeffOrderLut(natural_order_lut.data());
314	32.1k	for (size_t c = 0; c < 3; c++) {
315	24.1k	JXL_RETURN_IF_ERROR(
316	24.1k	EncodeCoeffOrder(&order[CoeffOrderOffset(ord, c)], acs, tokens.data(),
317	24.1k	mem.address<coeff_order_t>(), natural_order_lut));
318	24.1k	}
319	8.04k	}
320		// Do not write anything if no order is used.
321	3.48k	if (used_orders != 0) {
322	2.67k	EntropyEncodingData codes;
323	2.67k	JXL_ASSIGN_OR_RETURN(
324	2.67k	size_t cost, BuildAndEncodeHistograms(memory_manager, HistogramParams(),
325	2.67k	kPermutationContexts, tokens,
326	2.67k	&codes, writer, layer, aux_out));
327	2.67k	(void)cost;
328	2.67k	JXL_RETURN_IF_ERROR(
329	2.67k	WriteTokens(tokens[0], codes, 0, writer, layer, aux_out));
330	2.67k	}
331	3.48k	return true;
332	3.48k	}
333
334		} // namespace jxl