Coverage Report

Created: 2026-06-16 07:20

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libjxl/lib/jxl/dec_group.cc
Line
Count
Source
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/dec_group.h"
7
8
#include <jxl/memory_manager.h>
9
10
#include <algorithm>
11
#include <array>
12
#include <cstddef>
13
#include <cstdint>
14
#include <cstdio>
15
#include <cstdlib>
16
#include <cstring>
17
#include <memory>
18
#include <utility>
19
#include <vector>
20
21
#include "lib/jxl/base/compiler_specific.h"
22
#include "lib/jxl/chroma_from_luma.h"
23
#include "lib/jxl/coeff_order_fwd.h"
24
#include "lib/jxl/dct_util.h"
25
#include "lib/jxl/dec_ans.h"
26
#include "lib/jxl/frame_dimensions.h"
27
#include "lib/jxl/frame_header.h"
28
#include "lib/jxl/image.h"
29
#include "lib/jxl/image_ops.h"
30
#include "lib/jxl/jpeg/jpeg_data.h"
31
#include "lib/jxl/render_pipeline/render_pipeline.h"
32
#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
33
34
#undef HWY_TARGET_INCLUDE
35
#define HWY_TARGET_INCLUDE "lib/jxl/dec_group.cc"
36
#include <hwy/foreach_target.h>
37
#include <hwy/highway.h>
38
39
#include "lib/jxl/ac_context.h"
40
#include "lib/jxl/ac_strategy.h"
41
#include "lib/jxl/base/bits.h"
42
#include "lib/jxl/base/common.h"
43
#include "lib/jxl/base/printf_macros.h"
44
#include "lib/jxl/base/rect.h"
45
#include "lib/jxl/base/status.h"
46
#include "lib/jxl/coeff_order.h"
47
#include "lib/jxl/common.h"  // kMaxNumPasses
48
#include "lib/jxl/dec_cache.h"
49
#include "lib/jxl/dec_transforms-inl.h"
50
#include "lib/jxl/dec_xyb.h"
51
#include "lib/jxl/entropy_coder.h"
52
#include "lib/jxl/quant_weights.h"
53
#include "lib/jxl/quantizer-inl.h"
54
#include "lib/jxl/quantizer.h"
55
56
#ifndef LIB_JXL_DEC_GROUP_CC
57
#define LIB_JXL_DEC_GROUP_CC
58
namespace jxl {
59
60
struct AuxOut;
61
62
// Interface for reading groups for DecodeGroupImpl.
63
class GetBlock {
64
 public:
65
  virtual void StartRow(size_t by) = 0;
66
  virtual Status LoadBlock(size_t bx, size_t by, const AcStrategy& acs,
67
                           size_t size, size_t log2_covered_blocks,
68
                           ACPtr block[3], ACType ac_type) = 0;
69
115k
  virtual ~GetBlock() {}
70
};
71
72
// Tells DecodeGroupImpl whether decoded blocks are turned into pixels.
// With kDontDraw the coefficients are still loaded, but each block is left
// without dequantization or inverse transform.
enum DrawMode {
  kDraw = 0,      // Render to pixels.
  kDontDraw = 1,  // Don't render to pixels.
};
79
80
}  // namespace jxl
81
#endif  // LIB_JXL_DEC_GROUP_CC
82
83
HWY_BEFORE_NAMESPACE();
84
namespace jxl {
85
namespace HWY_NAMESPACE {
86
87
// These templates are not found via ADL.
88
using hwy::HWY_NAMESPACE::AllFalse;
89
using hwy::HWY_NAMESPACE::Gt;
90
using hwy::HWY_NAMESPACE::Le;
91
using hwy::HWY_NAMESPACE::MaskFromVec;
92
using hwy::HWY_NAMESPACE::Or;
93
using hwy::HWY_NAMESPACE::Rebind;
94
using hwy::HWY_NAMESPACE::ShiftRight;
95
96
using D = HWY_FULL(float);  // Highway descriptor (tag) type: float lanes.
97
using DU = HWY_FULL(uint32_t);  // Descriptor type: uint32_t lanes.
98
using DI = HWY_FULL(int32_t);  // Descriptor type: int32_t lanes.
99
using DI16 = Rebind<int16_t, DI>;  // DI rebound to int16_t lanes (see Highway's Rebind).
100
using DI16_FULL = HWY_CAPPED(int16_t, kDCTBlockSize);  // int16_t lanes, lane count capped via HWY_CAPPED.
101
constexpr D d;  // Descriptor instances handed to Highway ops (Load, Set, Lanes, ...).
102
constexpr DI di;
103
constexpr DI16 di16;  // Used for 16-bit coefficient loads and demoted stores.
104
constexpr DI16_FULL di16_full;  // Used for the JPEG coefficient range check.
105
106
// TODO(veluca): consider SIMDfying.
107
0
void Transpose8x8InPlace(int32_t* JXL_RESTRICT block) {
108
0
  for (size_t x = 0; x < 8; x++) {
109
0
    for (size_t y = x + 1; y < 8; y++) {
110
0
      std::swap(block[y * 8 + x], block[x * 8 + y]);
111
0
    }
112
0
  }
113
0
}
Unexecuted instantiation: jxl::N_SSE4::Transpose8x8InPlace(int*)
Unexecuted instantiation: jxl::N_AVX2::Transpose8x8InPlace(int*)
Unexecuted instantiation: jxl::N_AVX3::Transpose8x8InPlace(int*)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::Transpose8x8InPlace(int*)
Unexecuted instantiation: jxl::N_AVX3_SPR::Transpose8x8InPlace(int*)
Unexecuted instantiation: jxl::N_SSE2::Transpose8x8InPlace(int*)
114
115
template <ACType ac_type>
116
void DequantLane(Vec<D> scaled_dequant_x, Vec<D> scaled_dequant_y,
117
                 Vec<D> scaled_dequant_b,
118
                 const float* JXL_RESTRICT dequant_matrices, size_t size,
119
                 size_t k, Vec<D> x_cc_mul, Vec<D> b_cc_mul,
120
                 const float* JXL_RESTRICT biases, ACPtr qblock[3],
121
181M
                 float* JXL_RESTRICT block) {
122
181M
  const auto x_mul = Mul(Load(d, dequant_matrices + k), scaled_dequant_x);
123
181M
  const auto y_mul =
124
181M
      Mul(Load(d, dequant_matrices + size + k), scaled_dequant_y);
125
181M
  const auto b_mul =
126
181M
      Mul(Load(d, dequant_matrices + 2 * size + k), scaled_dequant_b);
127
128
181M
  Vec<DI> quantized_x_int;
129
181M
  Vec<DI> quantized_y_int;
130
181M
  Vec<DI> quantized_b_int;
131
181M
  if (ac_type == ACType::k16) {
132
15.0M
    quantized_x_int = PromoteTo(di, Load(di16, qblock[0].ptr16 + k));
133
15.0M
    quantized_y_int = PromoteTo(di, Load(di16, qblock[1].ptr16 + k));
134
15.0M
    quantized_b_int = PromoteTo(di, Load(di16, qblock[2].ptr16 + k));
135
166M
  } else {
136
166M
    quantized_x_int = Load(di, qblock[0].ptr32 + k);
137
166M
    quantized_y_int = Load(di, qblock[1].ptr32 + k);
138
166M
    quantized_b_int = Load(di, qblock[2].ptr32 + k);
139
166M
  }
140
141
181M
  const auto dequant_x_cc =
142
181M
      Mul(AdjustQuantBias(di, 0, quantized_x_int, biases), x_mul);
143
181M
  const auto dequant_y =
144
181M
      Mul(AdjustQuantBias(di, 1, quantized_y_int, biases), y_mul);
145
181M
  const auto dequant_b_cc =
146
181M
      Mul(AdjustQuantBias(di, 2, quantized_b_int, biases), b_mul);
147
148
181M
  const auto dequant_x = MulAdd(x_cc_mul, dequant_y, dequant_x_cc);
149
181M
  const auto dequant_b = MulAdd(b_cc_mul, dequant_y, dequant_b_cc);
150
181M
  Store(dequant_x, d, block + k);
151
181M
  Store(dequant_y, d, block + size + k);
152
181M
  Store(dequant_b, d, block + 2 * size + k);
153
181M
}
Unexecuted instantiation: void jxl::N_SSE4::DequantLane<(jxl::ACType)0>(hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, float const*, unsigned long, unsigned long, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, float const*, jxl::ACPtr*, float*)
Unexecuted instantiation: void jxl::N_SSE4::DequantLane<(jxl::ACType)1>(hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, float const*, unsigned long, unsigned long, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, float const*, jxl::ACPtr*, float*)
void jxl::N_AVX2::DequantLane<(jxl::ACType)0>(hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, float const*, unsigned long, unsigned long, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, float const*, jxl::ACPtr*, float*)
Line
Count
Source
121
15.0M
                 float* JXL_RESTRICT block) {
122
15.0M
  const auto x_mul = Mul(Load(d, dequant_matrices + k), scaled_dequant_x);
123
15.0M
  const auto y_mul =
124
15.0M
      Mul(Load(d, dequant_matrices + size + k), scaled_dequant_y);
125
15.0M
  const auto b_mul =
126
15.0M
      Mul(Load(d, dequant_matrices + 2 * size + k), scaled_dequant_b);
127
128
15.0M
  Vec<DI> quantized_x_int;
129
15.0M
  Vec<DI> quantized_y_int;
130
15.0M
  Vec<DI> quantized_b_int;
131
15.0M
  if (ac_type == ACType::k16) {
132
15.0M
    quantized_x_int = PromoteTo(di, Load(di16, qblock[0].ptr16 + k));
133
15.0M
    quantized_y_int = PromoteTo(di, Load(di16, qblock[1].ptr16 + k));
134
15.0M
    quantized_b_int = PromoteTo(di, Load(di16, qblock[2].ptr16 + k));
135
15.0M
  } else {
136
0
    quantized_x_int = Load(di, qblock[0].ptr32 + k);
137
0
    quantized_y_int = Load(di, qblock[1].ptr32 + k);
138
0
    quantized_b_int = Load(di, qblock[2].ptr32 + k);
139
0
  }
140
141
15.0M
  const auto dequant_x_cc =
142
15.0M
      Mul(AdjustQuantBias(di, 0, quantized_x_int, biases), x_mul);
143
15.0M
  const auto dequant_y =
144
15.0M
      Mul(AdjustQuantBias(di, 1, quantized_y_int, biases), y_mul);
145
15.0M
  const auto dequant_b_cc =
146
15.0M
      Mul(AdjustQuantBias(di, 2, quantized_b_int, biases), b_mul);
147
148
15.0M
  const auto dequant_x = MulAdd(x_cc_mul, dequant_y, dequant_x_cc);
149
15.0M
  const auto dequant_b = MulAdd(b_cc_mul, dequant_y, dequant_b_cc);
150
15.0M
  Store(dequant_x, d, block + k);
151
15.0M
  Store(dequant_y, d, block + size + k);
152
15.0M
  Store(dequant_b, d, block + 2 * size + k);
153
15.0M
}
void jxl::N_AVX2::DequantLane<(jxl::ACType)1>(hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, float const*, unsigned long, unsigned long, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, float const*, jxl::ACPtr*, float*)
Line
Count
Source
121
166M
                 float* JXL_RESTRICT block) {
122
166M
  const auto x_mul = Mul(Load(d, dequant_matrices + k), scaled_dequant_x);
123
166M
  const auto y_mul =
124
166M
      Mul(Load(d, dequant_matrices + size + k), scaled_dequant_y);
125
166M
  const auto b_mul =
126
166M
      Mul(Load(d, dequant_matrices + 2 * size + k), scaled_dequant_b);
127
128
166M
  Vec<DI> quantized_x_int;
129
166M
  Vec<DI> quantized_y_int;
130
166M
  Vec<DI> quantized_b_int;
131
166M
  if (ac_type == ACType::k16) {
132
0
    quantized_x_int = PromoteTo(di, Load(di16, qblock[0].ptr16 + k));
133
0
    quantized_y_int = PromoteTo(di, Load(di16, qblock[1].ptr16 + k));
134
0
    quantized_b_int = PromoteTo(di, Load(di16, qblock[2].ptr16 + k));
135
166M
  } else {
136
166M
    quantized_x_int = Load(di, qblock[0].ptr32 + k);
137
166M
    quantized_y_int = Load(di, qblock[1].ptr32 + k);
138
166M
    quantized_b_int = Load(di, qblock[2].ptr32 + k);
139
166M
  }
140
141
166M
  const auto dequant_x_cc =
142
166M
      Mul(AdjustQuantBias(di, 0, quantized_x_int, biases), x_mul);
143
166M
  const auto dequant_y =
144
166M
      Mul(AdjustQuantBias(di, 1, quantized_y_int, biases), y_mul);
145
166M
  const auto dequant_b_cc =
146
166M
      Mul(AdjustQuantBias(di, 2, quantized_b_int, biases), b_mul);
147
148
166M
  const auto dequant_x = MulAdd(x_cc_mul, dequant_y, dequant_x_cc);
149
166M
  const auto dequant_b = MulAdd(b_cc_mul, dequant_y, dequant_b_cc);
150
166M
  Store(dequant_x, d, block + k);
151
166M
  Store(dequant_y, d, block + size + k);
152
166M
  Store(dequant_b, d, block + 2 * size + k);
153
166M
}
Unexecuted instantiation: void jxl::N_AVX3::DequantLane<(jxl::ACType)0>(hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, float const*, unsigned long, unsigned long, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, float const*, jxl::ACPtr*, float*)
Unexecuted instantiation: void jxl::N_AVX3::DequantLane<(jxl::ACType)1>(hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, float const*, unsigned long, unsigned long, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, float const*, jxl::ACPtr*, float*)
Unexecuted instantiation: void jxl::N_AVX3_ZEN4::DequantLane<(jxl::ACType)0>(hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, float const*, unsigned long, unsigned long, hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, float const*, jxl::ACPtr*, float*)
Unexecuted instantiation: void jxl::N_AVX3_ZEN4::DequantLane<(jxl::ACType)1>(hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, float const*, unsigned long, unsigned long, hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, float const*, jxl::ACPtr*, float*)
Unexecuted instantiation: void jxl::N_AVX3_SPR::DequantLane<(jxl::ACType)0>(hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, float const*, unsigned long, unsigned long, hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, float const*, jxl::ACPtr*, float*)
Unexecuted instantiation: void jxl::N_AVX3_SPR::DequantLane<(jxl::ACType)1>(hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, float const*, unsigned long, unsigned long, hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, float const*, jxl::ACPtr*, float*)
Unexecuted instantiation: void jxl::N_SSE2::DequantLane<(jxl::ACType)0>(hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, float const*, unsigned long, unsigned long, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, float const*, jxl::ACPtr*, float*)
Unexecuted instantiation: void jxl::N_SSE2::DequantLane<(jxl::ACType)1>(hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, float const*, unsigned long, unsigned long, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, float const*, jxl::ACPtr*, float*)
154
155
template <ACType ac_type>
156
void DequantBlock(float inv_global_scale, int quant, float x_dm_multiplier,
157
                  float b_dm_multiplier, Vec<D> x_cc_mul, Vec<D> b_cc_mul,
158
                  AcStrategyType kind, size_t size, const Quantizer& quantizer,
159
                  size_t covered_blocks, const size_t* sbx,
160
                  const float* JXL_RESTRICT* JXL_RESTRICT dc_row,
161
                  size_t dc_stride, const float* JXL_RESTRICT biases,
162
                  ACPtr qblock[3], float* JXL_RESTRICT block,
163
10.7M
                  float* JXL_RESTRICT scratch) {
164
10.7M
  const auto scaled_dequant_s = inv_global_scale / quant;
165
166
10.7M
  const auto scaled_dequant_x = Set(d, scaled_dequant_s * x_dm_multiplier);
167
10.7M
  const auto scaled_dequant_y = Set(d, scaled_dequant_s);
168
10.7M
  const auto scaled_dequant_b = Set(d, scaled_dequant_s * b_dm_multiplier);
169
170
10.7M
  const float* dequant_matrices = quantizer.DequantMatrix(kind, 0);
171
172
191M
  for (size_t k = 0; k < covered_blocks * kDCTBlockSize; k += Lanes(d)) {
173
181M
    DequantLane<ac_type>(scaled_dequant_x, scaled_dequant_y, scaled_dequant_b,
174
181M
                         dequant_matrices, size, k, x_cc_mul, b_cc_mul, biases,
175
181M
                         qblock, block);
176
181M
  }
177
43.0M
  for (size_t c = 0; c < 3; c++) {
178
32.2M
    LowestFrequenciesFromDC(kind, dc_row[c] + sbx[c], dc_stride,
179
32.2M
                            block + c * size, scratch);
180
32.2M
  }
181
10.7M
}
Unexecuted instantiation: void jxl::N_SSE4::DequantBlock<(jxl::ACType)0>(float, int, float, float, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, jxl::AcStrategyType, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*)
Unexecuted instantiation: void jxl::N_SSE4::DequantBlock<(jxl::ACType)1>(float, int, float, float, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, jxl::AcStrategyType, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*)
void jxl::N_AVX2::DequantBlock<(jxl::ACType)0>(float, int, float, float, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, jxl::AcStrategyType, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*)
Line
Count
Source
163
1.00M
                  float* JXL_RESTRICT scratch) {
164
1.00M
  const auto scaled_dequant_s = inv_global_scale / quant;
165
166
1.00M
  const auto scaled_dequant_x = Set(d, scaled_dequant_s * x_dm_multiplier);
167
1.00M
  const auto scaled_dequant_y = Set(d, scaled_dequant_s);
168
1.00M
  const auto scaled_dequant_b = Set(d, scaled_dequant_s * b_dm_multiplier);
169
170
1.00M
  const float* dequant_matrices = quantizer.DequantMatrix(kind, 0);
171
172
16.0M
  for (size_t k = 0; k < covered_blocks * kDCTBlockSize; k += Lanes(d)) {
173
15.0M
    DequantLane<ac_type>(scaled_dequant_x, scaled_dequant_y, scaled_dequant_b,
174
15.0M
                         dequant_matrices, size, k, x_cc_mul, b_cc_mul, biases,
175
15.0M
                         qblock, block);
176
15.0M
  }
177
4.01M
  for (size_t c = 0; c < 3; c++) {
178
3.01M
    LowestFrequenciesFromDC(kind, dc_row[c] + sbx[c], dc_stride,
179
3.01M
                            block + c * size, scratch);
180
3.01M
  }
181
1.00M
}
void jxl::N_AVX2::DequantBlock<(jxl::ACType)1>(float, int, float, float, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, jxl::AcStrategyType, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*)
Line
Count
Source
163
9.74M
                  float* JXL_RESTRICT scratch) {
164
9.74M
  const auto scaled_dequant_s = inv_global_scale / quant;
165
166
9.74M
  const auto scaled_dequant_x = Set(d, scaled_dequant_s * x_dm_multiplier);
167
9.74M
  const auto scaled_dequant_y = Set(d, scaled_dequant_s);
168
9.74M
  const auto scaled_dequant_b = Set(d, scaled_dequant_s * b_dm_multiplier);
169
170
9.74M
  const float* dequant_matrices = quantizer.DequantMatrix(kind, 0);
171
172
175M
  for (size_t k = 0; k < covered_blocks * kDCTBlockSize; k += Lanes(d)) {
173
166M
    DequantLane<ac_type>(scaled_dequant_x, scaled_dequant_y, scaled_dequant_b,
174
166M
                         dequant_matrices, size, k, x_cc_mul, b_cc_mul, biases,
175
166M
                         qblock, block);
176
166M
  }
177
38.9M
  for (size_t c = 0; c < 3; c++) {
178
29.2M
    LowestFrequenciesFromDC(kind, dc_row[c] + sbx[c], dc_stride,
179
29.2M
                            block + c * size, scratch);
180
29.2M
  }
181
9.74M
}
Unexecuted instantiation: void jxl::N_AVX3::DequantBlock<(jxl::ACType)0>(float, int, float, float, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, jxl::AcStrategyType, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*)
Unexecuted instantiation: void jxl::N_AVX3::DequantBlock<(jxl::ACType)1>(float, int, float, float, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, jxl::AcStrategyType, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*)
Unexecuted instantiation: void jxl::N_AVX3_ZEN4::DequantBlock<(jxl::ACType)0>(float, int, float, float, hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, jxl::AcStrategyType, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*)
Unexecuted instantiation: void jxl::N_AVX3_ZEN4::DequantBlock<(jxl::ACType)1>(float, int, float, float, hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, jxl::AcStrategyType, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*)
Unexecuted instantiation: void jxl::N_AVX3_SPR::DequantBlock<(jxl::ACType)0>(float, int, float, float, hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, jxl::AcStrategyType, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*)
Unexecuted instantiation: void jxl::N_AVX3_SPR::DequantBlock<(jxl::ACType)1>(float, int, float, float, hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, jxl::AcStrategyType, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*)
Unexecuted instantiation: void jxl::N_SSE2::DequantBlock<(jxl::ACType)0>(float, int, float, float, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, jxl::AcStrategyType, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*)
Unexecuted instantiation: void jxl::N_SSE2::DequantBlock<(jxl::ACType)1>(float, int, float, float, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, jxl::AcStrategyType, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*)
182
183
Status DecodeGroupImpl(const FrameHeader& frame_header,
184
                       GetBlock* JXL_RESTRICT get_block,
185
                       GroupDecCache* JXL_RESTRICT group_dec_cache,
186
                       PassesDecoderState* JXL_RESTRICT dec_state,
187
                       size_t thread, size_t group_idx,
188
                       RenderPipelineInput& render_pipeline_input,
189
50.2k
                       jpeg::JPEGData* jpeg_data, DrawMode draw) {
190
  // TODO(veluca): investigate cache usage in this function.
191
50.2k
  const Rect block_rect =
192
50.2k
      dec_state->shared->frame_dim.BlockGroupRect(group_idx);
193
50.2k
  const AcStrategyImage& ac_strategy = dec_state->shared->ac_strategy;
194
195
50.2k
  const size_t xsize_blocks = block_rect.xsize();
196
50.2k
  const size_t ysize_blocks = block_rect.ysize();
197
198
50.2k
  const size_t dc_stride = dec_state->shared->dc->PixelsPerRow();
199
200
50.2k
  const float inv_global_scale = dec_state->shared->quantizer.InvGlobalScale();
201
202
50.2k
  const YCbCrChromaSubsampling& cs = frame_header.chroma_subsampling;
203
204
50.2k
  const auto kJpegDctMin = Set(di16_full, -4095);
205
50.2k
  const auto kJpegDctMax = Set(di16_full, 4095);
206
207
50.2k
  size_t idct_stride[3];
208
201k
  for (size_t c = 0; c < 3; c++) {
209
150k
    idct_stride[c] = render_pipeline_input.GetBuffer(c).first->PixelsPerRow();
210
150k
  }
211
212
50.2k
  HWY_ALIGN int32_t scaled_qtable[64 * 3];
213
214
50.2k
  ACType ac_type = dec_state->coefficients->Type();
215
50.2k
  auto dequant_block = ac_type == ACType::k16 ? DequantBlock<ACType::k16>
216
50.2k
                                              : DequantBlock<ACType::k32>;
217
  // Whether or not coefficients should be stored for future usage, and/or read
218
  // from past usage.
219
50.2k
  bool accumulate = !dec_state->coefficients->IsEmpty();
220
  // Offset of the current block in the group.
221
50.2k
  size_t offset = 0;
222
223
50.2k
  std::array<int, 3> jpeg_c_map;
224
50.2k
  bool jpeg_is_gray = false;
225
50.2k
  std::array<int, 3> dcoff = {};
226
227
  // TODO(veluca): all of this should be done only once per image.
228
50.2k
  const ColorCorrelation& color_correlation = dec_state->shared->cmap.base();
229
50.2k
  if (jpeg_data) {
230
0
    if (!color_correlation.IsJPEGCompatible()) {
231
0
      return JXL_FAILURE("The CfL map is not JPEG-compatible");
232
0
    }
233
0
    jpeg_is_gray = (jpeg_data->components.size() == 1);
234
0
    JXL_ENSURE(frame_header.color_transform != ColorTransform::kXYB);
235
0
    jpeg_c_map = JpegOrder(frame_header.color_transform, jpeg_is_gray);
236
0
    const std::vector<QuantEncoding>& qe =
237
0
        dec_state->shared->matrices.encodings();
238
0
    if (qe.empty() || qe[0].mode != QuantEncoding::Mode::kQuantModeRAW ||
239
0
        std::abs(qe[0].qraw.qtable_den - 1.f / (8 * 255)) > 1e-8f) {
240
0
      return JXL_FAILURE(
241
0
          "Quantization table is not a JPEG quantization table.");
242
0
    }
243
0
    JXL_ENSURE(qe[0].qraw.qtable->size() == 3 * 8 * 8);
244
0
    int* qtable = qe[0].qraw.qtable->data();
245
0
    for (size_t c = 0; c < 3; c++) {
246
0
      if (frame_header.color_transform == ColorTransform::kNone) {
247
0
        dcoff[c] = 1024 / qtable[64 * c];
248
0
      }
249
0
      for (size_t i = 0; i < 64; i++) {
250
        // Transpose the matrix, as it will be used on the transposed block.
251
0
        int num = qtable[64 + i];
252
0
        int den = qtable[64 * c + i];
253
0
        if (num <= 0 || den <= 0 || num >= 65536 || den >= 65536) {
254
0
          return JXL_FAILURE("Invalid JPEG quantization table");
255
0
        }
256
0
        scaled_qtable[64 * c + (i % 8) * 8 + (i / 8)] =
257
0
            (1 << kCFLFixedPointPrecision) * num / den;
258
0
      }
259
0
    }
260
0
  }
261
262
50.2k
  size_t hshift[3] = {cs.HShift(0), cs.HShift(1), cs.HShift(2)};
263
50.2k
  size_t vshift[3] = {cs.VShift(0), cs.VShift(1), cs.VShift(2)};
264
50.2k
  Rect r[3];
265
201k
  for (size_t i = 0; i < 3; i++) {
266
150k
    r[i] =
267
150k
        Rect(block_rect.x0() >> hshift[i], block_rect.y0() >> vshift[i],
268
150k
             block_rect.xsize() >> hshift[i], block_rect.ysize() >> vshift[i]);
269
150k
    if (!r[i].IsInside({0, 0, dec_state->shared->dc->Plane(i).xsize(),
270
150k
                        dec_state->shared->dc->Plane(i).ysize()})) {
271
0
      return JXL_FAILURE("Frame dimensions are too big for the image.");
272
0
    }
273
150k
  }
274
275
1.07M
  for (size_t by = 0; by < ysize_blocks; ++by) {
276
1.02M
    get_block->StartRow(by);
277
1.02M
    size_t sby[3] = {by >> vshift[0], by >> vshift[1], by >> vshift[2]};
278
279
1.02M
    const int32_t* JXL_RESTRICT row_quant =
280
1.02M
        block_rect.ConstRow(dec_state->shared->raw_quant_field, by);
281
282
1.02M
    const float* JXL_RESTRICT dc_rows[3] = {
283
1.02M
        r[0].ConstPlaneRow(*dec_state->shared->dc, 0, sby[0]),
284
1.02M
        r[1].ConstPlaneRow(*dec_state->shared->dc, 1, sby[1]),
285
1.02M
        r[2].ConstPlaneRow(*dec_state->shared->dc, 2, sby[2]),
286
1.02M
    };
287
288
1.02M
    const size_t ty = (block_rect.y0() + by) / kColorTileDimInBlocks;
289
1.02M
    AcStrategyRow acs_row = ac_strategy.ConstRow(block_rect, by);
290
291
1.02M
    const int8_t* JXL_RESTRICT row_cmap[3] = {
292
1.02M
        dec_state->shared->cmap.ytox_map.ConstRow(ty),
293
1.02M
        nullptr,
294
1.02M
        dec_state->shared->cmap.ytob_map.ConstRow(ty),
295
1.02M
    };
296
297
1.02M
    float* JXL_RESTRICT idct_row[3];
298
1.02M
    int16_t* JXL_RESTRICT jpeg_row[3];
299
4.11M
    for (size_t c = 0; c < 3; c++) {
300
3.08M
      const auto& buffer = render_pipeline_input.GetBuffer(c);
301
3.08M
      idct_row[c] = buffer.second.Row(buffer.first, sby[c] * kBlockDim);
302
3.08M
      if (jpeg_data) {
303
0
        auto& component = jpeg_data->components[jpeg_c_map[c]];
304
0
        jpeg_row[c] =
305
0
            component.coeffs.data() +
306
0
            (component.width_in_blocks * (r[c].y0() + sby[c]) + r[c].x0()) *
307
0
                kDCTBlockSize;
308
0
      }
309
3.08M
    }
310
311
1.02M
    size_t bx = 0;
312
4.18M
    for (size_t tx = 0; tx < DivCeil(xsize_blocks, kColorTileDimInBlocks);
313
3.15M
         tx++) {
314
3.15M
      size_t abs_tx = tx + block_rect.x0() / kColorTileDimInBlocks;
315
3.15M
      auto x_cc_mul = Set(d, color_correlation.YtoXRatio(row_cmap[0][abs_tx]));
316
3.15M
      auto b_cc_mul = Set(d, color_correlation.YtoBRatio(row_cmap[2][abs_tx]));
317
      // Increment bx by llf_x because those iterations would otherwise
318
      // immediately continue (!IsFirstBlock). Reduces mispredictions.
319
16.1M
      for (; bx < xsize_blocks && bx < (tx + 1) * kColorTileDimInBlocks;) {
320
13.0M
        size_t sbx[3] = {bx >> hshift[0], bx >> hshift[1], bx >> hshift[2]};
321
13.0M
        AcStrategy acs = acs_row[bx];
322
13.0M
        const size_t llf_x = acs.covered_blocks_x();
323
324
        // Can only happen in the second or lower rows of a varblock.
325
13.0M
        if (JXL_UNLIKELY(!acs.IsFirstBlock())) {
326
2.25M
          bx += llf_x;
327
2.25M
          continue;
328
2.25M
        }
329
10.7M
        const size_t log2_covered_blocks = acs.log2_covered_blocks();
330
331
10.7M
        const size_t covered_blocks = 1 << log2_covered_blocks;
332
10.7M
        const size_t size = covered_blocks * kDCTBlockSize;
333
334
10.7M
        ACPtr qblock[3];
335
10.7M
        if (accumulate) {
336
280
          for (size_t c = 0; c < 3; c++) {
337
210
            qblock[c] = dec_state->coefficients->PlaneRow(c, group_idx, offset);
338
210
          }
339
10.7M
        } else {
340
          // No point in reading from bitstream without accumulating and not
341
          // drawing.
342
10.7M
          JXL_ENSURE(draw == kDraw);
343
10.7M
          if (ac_type == ACType::k16) {
344
1.00M
            memset(group_dec_cache->dec_group_qblock16, 0,
345
1.00M
                   size * 3 * sizeof(int16_t));
346
4.01M
            for (size_t c = 0; c < 3; c++) {
347
3.01M
              qblock[c].ptr16 = group_dec_cache->dec_group_qblock16 + c * size;
348
3.01M
            }
349
9.74M
          } else {
350
9.74M
            memset(group_dec_cache->dec_group_qblock, 0,
351
9.74M
                   size * 3 * sizeof(int32_t));
352
38.9M
            for (size_t c = 0; c < 3; c++) {
353
29.2M
              qblock[c].ptr32 = group_dec_cache->dec_group_qblock + c * size;
354
29.2M
            }
355
9.74M
          }
356
10.7M
        }
357
10.7M
        JXL_RETURN_IF_ERROR(get_block->LoadBlock(
358
10.7M
            bx, by, acs, size, log2_covered_blocks, qblock, ac_type));
359
10.7M
        offset += size;
360
10.7M
        if (draw == kDontDraw) {
361
61
          bx += llf_x;
362
61
          continue;
363
61
        }
364
365
10.7M
        if (JXL_UNLIKELY(jpeg_data)) {
366
0
          if (acs.Strategy() != AcStrategyType::DCT) {
367
0
            return JXL_FAILURE(
368
0
                "Can only decode to JPEG if only DCT-8 is used.");
369
0
          }
370
371
0
          HWY_ALIGN int32_t transposed_dct_y[64];
372
0
          for (size_t c : {1, 0, 2}) {
373
            // Propagate only Y for grayscale.
374
0
            if (jpeg_is_gray && c != 1) {
375
0
              continue;
376
0
            }
377
0
            if ((sbx[c] << hshift[c] != bx) || (sby[c] << vshift[c] != by)) {
378
0
              continue;
379
0
            }
380
0
            int16_t* JXL_RESTRICT jpeg_pos =
381
0
                jpeg_row[c] + sbx[c] * kDCTBlockSize;
382
            // JPEG XL is transposed, JPEG is not.
383
0
            auto* transposed_dct = qblock[c].ptr32;
384
0
            Transpose8x8InPlace(transposed_dct);
385
            // No CfL - no need to store the y block converted to integers.
386
0
            if (!cs.Is444() ||
387
0
                (row_cmap[0][abs_tx] == 0 && row_cmap[2][abs_tx] == 0)) {
388
0
              for (size_t i = 0; i < 64; i += Lanes(d)) {
389
0
                const auto ini = Load(di, transposed_dct + i);
390
0
                const auto ini16 = DemoteTo(di16, ini);
391
0
                StoreU(ini16, di16, jpeg_pos + i);
392
0
              }
393
0
            } else if (c == 1) {
394
              // Y channel: save for restoring X/B, but nothing else to do.
395
0
              for (size_t i = 0; i < 64; i += Lanes(d)) {
396
0
                const auto ini = Load(di, transposed_dct + i);
397
0
                Store(ini, di, transposed_dct_y + i);
398
0
                const auto ini16 = DemoteTo(di16, ini);
399
0
                StoreU(ini16, di16, jpeg_pos + i);
400
0
              }
401
0
            } else {
402
              // transposed_dct_y contains the y channel block, transposed.
403
0
              const auto scale =
404
0
                  Set(di, ColorCorrelation::RatioJPEG(row_cmap[c][abs_tx]));
405
0
              const auto round = Set(di, 1 << (kCFLFixedPointPrecision - 1));
406
0
              for (int i = 0; i < 64; i += Lanes(d)) {
407
0
                auto in = Load(di, transposed_dct + i);
408
0
                auto in_y = Load(di, transposed_dct_y + i);
409
0
                auto qt = Load(di, scaled_qtable + c * size + i);
410
0
                auto coeff_scale = ShiftRight<kCFLFixedPointPrecision>(
411
0
                    Add(Mul(qt, scale), round));
412
0
                auto cfl_factor = ShiftRight<kCFLFixedPointPrecision>(
413
0
                    Add(Mul(in_y, coeff_scale), round));
414
0
                StoreU(DemoteTo(di16, Add(in, cfl_factor)), di16, jpeg_pos + i);
415
0
              }
416
0
            }
417
0
            jpeg_pos[0] =
418
0
                Clamp1<float>(dc_rows[c][sbx[c]] - dcoff[c], -2047, 2047);
419
0
            auto overflow = MaskFromVec(Set(di16_full, 0));
420
0
            auto underflow = MaskFromVec(Set(di16_full, 0));
421
0
            for (int i = 0; i < 64; i += Lanes(di16_full)) {
422
0
              auto in = LoadU(di16_full, jpeg_pos + i);
423
0
              overflow = Or(overflow, Gt(in, kJpegDctMax));
424
0
              underflow = Or(underflow, Lt(in, kJpegDctMin));
425
0
            }
426
0
            if (!AllFalse(di16_full, Or(overflow, underflow))) {
427
0
              return JXL_FAILURE("JPEG DCT coefficients out of range");
428
0
            }
429
0
          }
430
10.7M
        } else {
431
10.7M
          HWY_ALIGN float* const block = group_dec_cache->dec_group_block;
432
          // Dequantize and add predictions.
433
10.7M
          dequant_block(
434
10.7M
              inv_global_scale, row_quant[bx], dec_state->x_dm_multiplier,
435
10.7M
              dec_state->b_dm_multiplier, x_cc_mul, b_cc_mul, acs.Strategy(),
436
10.7M
              size, dec_state->shared->quantizer,
437
10.7M
              acs.covered_blocks_y() * acs.covered_blocks_x(), sbx, dc_rows,
438
10.7M
              dc_stride,
439
10.7M
              dec_state->output_encoding_info.opsin_params.quant_biases, qblock,
440
10.7M
              block, group_dec_cache->scratch_space);
441
442
32.2M
          for (size_t c : {1, 0, 2}) {
443
32.2M
            if ((sbx[c] << hshift[c] != bx) || (sby[c] << vshift[c] != by)) {
444
4.99k
              continue;
445
4.99k
            }
446
            // IDCT
447
32.2M
            float* JXL_RESTRICT idct_pos = idct_row[c] + sbx[c] * kBlockDim;
448
32.2M
            TransformToPixels(acs.Strategy(), block + c * size, idct_pos,
449
32.2M
                              idct_stride[c], group_dec_cache->scratch_space);
450
32.2M
          }
451
10.7M
        }
452
10.7M
        bx += llf_x;
453
10.7M
      }
454
3.15M
    }
455
1.02M
  }
456
49.6k
  return true;
457
50.2k
}
Unexecuted instantiation: jxl::N_SSE4::DecodeGroupImpl(jxl::FrameHeader const&, jxl::GetBlock*, jxl::GroupDecCache*, jxl::PassesDecoderState*, unsigned long, unsigned long, jxl::RenderPipelineInput&, jxl::jpeg::JPEGData*, jxl::DrawMode)
jxl::N_AVX2::DecodeGroupImpl(jxl::FrameHeader const&, jxl::GetBlock*, jxl::GroupDecCache*, jxl::PassesDecoderState*, unsigned long, unsigned long, jxl::RenderPipelineInput&, jxl::jpeg::JPEGData*, jxl::DrawMode)
Line
Count
Source
189
50.2k
                       jpeg::JPEGData* jpeg_data, DrawMode draw) {
190
  // TODO(veluca): investigate cache usage in this function.
191
50.2k
  const Rect block_rect =
192
50.2k
      dec_state->shared->frame_dim.BlockGroupRect(group_idx);
193
50.2k
  const AcStrategyImage& ac_strategy = dec_state->shared->ac_strategy;
194
195
50.2k
  const size_t xsize_blocks = block_rect.xsize();
196
50.2k
  const size_t ysize_blocks = block_rect.ysize();
197
198
50.2k
  const size_t dc_stride = dec_state->shared->dc->PixelsPerRow();
199
200
50.2k
  const float inv_global_scale = dec_state->shared->quantizer.InvGlobalScale();
201
202
50.2k
  const YCbCrChromaSubsampling& cs = frame_header.chroma_subsampling;
203
204
50.2k
  const auto kJpegDctMin = Set(di16_full, -4095);
205
50.2k
  const auto kJpegDctMax = Set(di16_full, 4095);
206
207
50.2k
  size_t idct_stride[3];
208
201k
  for (size_t c = 0; c < 3; c++) {
209
150k
    idct_stride[c] = render_pipeline_input.GetBuffer(c).first->PixelsPerRow();
210
150k
  }
211
212
50.2k
  HWY_ALIGN int32_t scaled_qtable[64 * 3];
213
214
50.2k
  ACType ac_type = dec_state->coefficients->Type();
215
50.2k
  auto dequant_block = ac_type == ACType::k16 ? DequantBlock<ACType::k16>
216
50.2k
                                              : DequantBlock<ACType::k32>;
217
  // Whether or not coefficients should be stored for future usage, and/or read
218
  // from past usage.
219
50.2k
  bool accumulate = !dec_state->coefficients->IsEmpty();
220
  // Offset of the current block in the group.
221
50.2k
  size_t offset = 0;
222
223
50.2k
  std::array<int, 3> jpeg_c_map;
224
50.2k
  bool jpeg_is_gray = false;
225
50.2k
  std::array<int, 3> dcoff = {};
226
227
  // TODO(veluca): all of this should be done only once per image.
228
50.2k
  const ColorCorrelation& color_correlation = dec_state->shared->cmap.base();
229
50.2k
  if (jpeg_data) {
230
0
    if (!color_correlation.IsJPEGCompatible()) {
231
0
      return JXL_FAILURE("The CfL map is not JPEG-compatible");
232
0
    }
233
0
    jpeg_is_gray = (jpeg_data->components.size() == 1);
234
0
    JXL_ENSURE(frame_header.color_transform != ColorTransform::kXYB);
235
0
    jpeg_c_map = JpegOrder(frame_header.color_transform, jpeg_is_gray);
236
0
    const std::vector<QuantEncoding>& qe =
237
0
        dec_state->shared->matrices.encodings();
238
0
    if (qe.empty() || qe[0].mode != QuantEncoding::Mode::kQuantModeRAW ||
239
0
        std::abs(qe[0].qraw.qtable_den - 1.f / (8 * 255)) > 1e-8f) {
240
0
      return JXL_FAILURE(
241
0
          "Quantization table is not a JPEG quantization table.");
242
0
    }
243
0
    JXL_ENSURE(qe[0].qraw.qtable->size() == 3 * 8 * 8);
244
0
    int* qtable = qe[0].qraw.qtable->data();
245
0
    for (size_t c = 0; c < 3; c++) {
246
0
      if (frame_header.color_transform == ColorTransform::kNone) {
247
0
        dcoff[c] = 1024 / qtable[64 * c];
248
0
      }
249
0
      for (size_t i = 0; i < 64; i++) {
250
        // Transpose the matrix, as it will be used on the transposed block.
251
0
        int num = qtable[64 + i];
252
0
        int den = qtable[64 * c + i];
253
0
        if (num <= 0 || den <= 0 || num >= 65536 || den >= 65536) {
254
0
          return JXL_FAILURE("Invalid JPEG quantization table");
255
0
        }
256
0
        scaled_qtable[64 * c + (i % 8) * 8 + (i / 8)] =
257
0
            (1 << kCFLFixedPointPrecision) * num / den;
258
0
      }
259
0
    }
260
0
  }
261
262
50.2k
  size_t hshift[3] = {cs.HShift(0), cs.HShift(1), cs.HShift(2)};
263
50.2k
  size_t vshift[3] = {cs.VShift(0), cs.VShift(1), cs.VShift(2)};
264
50.2k
  Rect r[3];
265
201k
  for (size_t i = 0; i < 3; i++) {
266
150k
    r[i] =
267
150k
        Rect(block_rect.x0() >> hshift[i], block_rect.y0() >> vshift[i],
268
150k
             block_rect.xsize() >> hshift[i], block_rect.ysize() >> vshift[i]);
269
150k
    if (!r[i].IsInside({0, 0, dec_state->shared->dc->Plane(i).xsize(),
270
150k
                        dec_state->shared->dc->Plane(i).ysize()})) {
271
0
      return JXL_FAILURE("Frame dimensions are too big for the image.");
272
0
    }
273
150k
  }
274
275
1.07M
  for (size_t by = 0; by < ysize_blocks; ++by) {
276
1.02M
    get_block->StartRow(by);
277
1.02M
    size_t sby[3] = {by >> vshift[0], by >> vshift[1], by >> vshift[2]};
278
279
1.02M
    const int32_t* JXL_RESTRICT row_quant =
280
1.02M
        block_rect.ConstRow(dec_state->shared->raw_quant_field, by);
281
282
1.02M
    const float* JXL_RESTRICT dc_rows[3] = {
283
1.02M
        r[0].ConstPlaneRow(*dec_state->shared->dc, 0, sby[0]),
284
1.02M
        r[1].ConstPlaneRow(*dec_state->shared->dc, 1, sby[1]),
285
1.02M
        r[2].ConstPlaneRow(*dec_state->shared->dc, 2, sby[2]),
286
1.02M
    };
287
288
1.02M
    const size_t ty = (block_rect.y0() + by) / kColorTileDimInBlocks;
289
1.02M
    AcStrategyRow acs_row = ac_strategy.ConstRow(block_rect, by);
290
291
1.02M
    const int8_t* JXL_RESTRICT row_cmap[3] = {
292
1.02M
        dec_state->shared->cmap.ytox_map.ConstRow(ty),
293
1.02M
        nullptr,
294
1.02M
        dec_state->shared->cmap.ytob_map.ConstRow(ty),
295
1.02M
    };
296
297
1.02M
    float* JXL_RESTRICT idct_row[3];
298
1.02M
    int16_t* JXL_RESTRICT jpeg_row[3];
299
4.11M
    for (size_t c = 0; c < 3; c++) {
300
3.08M
      const auto& buffer = render_pipeline_input.GetBuffer(c);
301
3.08M
      idct_row[c] = buffer.second.Row(buffer.first, sby[c] * kBlockDim);
302
3.08M
      if (jpeg_data) {
303
0
        auto& component = jpeg_data->components[jpeg_c_map[c]];
304
0
        jpeg_row[c] =
305
0
            component.coeffs.data() +
306
0
            (component.width_in_blocks * (r[c].y0() + sby[c]) + r[c].x0()) *
307
0
                kDCTBlockSize;
308
0
      }
309
3.08M
    }
310
311
1.02M
    size_t bx = 0;
312
4.18M
    for (size_t tx = 0; tx < DivCeil(xsize_blocks, kColorTileDimInBlocks);
313
3.15M
         tx++) {
314
3.15M
      size_t abs_tx = tx + block_rect.x0() / kColorTileDimInBlocks;
315
3.15M
      auto x_cc_mul = Set(d, color_correlation.YtoXRatio(row_cmap[0][abs_tx]));
316
3.15M
      auto b_cc_mul = Set(d, color_correlation.YtoBRatio(row_cmap[2][abs_tx]));
317
      // Increment bx by llf_x because those iterations would otherwise
318
      // immediately continue (!IsFirstBlock). Reduces mispredictions.
319
16.1M
      for (; bx < xsize_blocks && bx < (tx + 1) * kColorTileDimInBlocks;) {
320
13.0M
        size_t sbx[3] = {bx >> hshift[0], bx >> hshift[1], bx >> hshift[2]};
321
13.0M
        AcStrategy acs = acs_row[bx];
322
13.0M
        const size_t llf_x = acs.covered_blocks_x();
323
324
        // Can only happen in the second or lower rows of a varblock.
325
13.0M
        if (JXL_UNLIKELY(!acs.IsFirstBlock())) {
326
2.25M
          bx += llf_x;
327
2.25M
          continue;
328
2.25M
        }
329
10.7M
        const size_t log2_covered_blocks = acs.log2_covered_blocks();
330
331
10.7M
        const size_t covered_blocks = 1 << log2_covered_blocks;
332
10.7M
        const size_t size = covered_blocks * kDCTBlockSize;
333
334
10.7M
        ACPtr qblock[3];
335
10.7M
        if (accumulate) {
336
280
          for (size_t c = 0; c < 3; c++) {
337
210
            qblock[c] = dec_state->coefficients->PlaneRow(c, group_idx, offset);
338
210
          }
339
10.7M
        } else {
340
          // No point in reading from bitstream without accumulating and not
341
          // drawing.
342
10.7M
          JXL_ENSURE(draw == kDraw);
343
10.7M
          if (ac_type == ACType::k16) {
344
1.00M
            memset(group_dec_cache->dec_group_qblock16, 0,
345
1.00M
                   size * 3 * sizeof(int16_t));
346
4.01M
            for (size_t c = 0; c < 3; c++) {
347
3.01M
              qblock[c].ptr16 = group_dec_cache->dec_group_qblock16 + c * size;
348
3.01M
            }
349
9.74M
          } else {
350
9.74M
            memset(group_dec_cache->dec_group_qblock, 0,
351
9.74M
                   size * 3 * sizeof(int32_t));
352
38.9M
            for (size_t c = 0; c < 3; c++) {
353
29.2M
              qblock[c].ptr32 = group_dec_cache->dec_group_qblock + c * size;
354
29.2M
            }
355
9.74M
          }
356
10.7M
        }
357
10.7M
        JXL_RETURN_IF_ERROR(get_block->LoadBlock(
358
10.7M
            bx, by, acs, size, log2_covered_blocks, qblock, ac_type));
359
10.7M
        offset += size;
360
10.7M
        if (draw == kDontDraw) {
361
61
          bx += llf_x;
362
61
          continue;
363
61
        }
364
365
10.7M
        if (JXL_UNLIKELY(jpeg_data)) {
366
0
          if (acs.Strategy() != AcStrategyType::DCT) {
367
0
            return JXL_FAILURE(
368
0
                "Can only decode to JPEG if only DCT-8 is used.");
369
0
          }
370
371
0
          HWY_ALIGN int32_t transposed_dct_y[64];
372
0
          for (size_t c : {1, 0, 2}) {
373
            // Propagate only Y for grayscale.
374
0
            if (jpeg_is_gray && c != 1) {
375
0
              continue;
376
0
            }
377
0
            if ((sbx[c] << hshift[c] != bx) || (sby[c] << vshift[c] != by)) {
378
0
              continue;
379
0
            }
380
0
            int16_t* JXL_RESTRICT jpeg_pos =
381
0
                jpeg_row[c] + sbx[c] * kDCTBlockSize;
382
            // JPEG XL is transposed, JPEG is not.
383
0
            auto* transposed_dct = qblock[c].ptr32;
384
0
            Transpose8x8InPlace(transposed_dct);
385
            // No CfL - no need to store the y block converted to integers.
386
0
            if (!cs.Is444() ||
387
0
                (row_cmap[0][abs_tx] == 0 && row_cmap[2][abs_tx] == 0)) {
388
0
              for (size_t i = 0; i < 64; i += Lanes(d)) {
389
0
                const auto ini = Load(di, transposed_dct + i);
390
0
                const auto ini16 = DemoteTo(di16, ini);
391
0
                StoreU(ini16, di16, jpeg_pos + i);
392
0
              }
393
0
            } else if (c == 1) {
394
              // Y channel: save for restoring X/B, but nothing else to do.
395
0
              for (size_t i = 0; i < 64; i += Lanes(d)) {
396
0
                const auto ini = Load(di, transposed_dct + i);
397
0
                Store(ini, di, transposed_dct_y + i);
398
0
                const auto ini16 = DemoteTo(di16, ini);
399
0
                StoreU(ini16, di16, jpeg_pos + i);
400
0
              }
401
0
            } else {
402
              // transposed_dct_y contains the y channel block, transposed.
403
0
              const auto scale =
404
0
                  Set(di, ColorCorrelation::RatioJPEG(row_cmap[c][abs_tx]));
405
0
              const auto round = Set(di, 1 << (kCFLFixedPointPrecision - 1));
406
0
              for (int i = 0; i < 64; i += Lanes(d)) {
407
0
                auto in = Load(di, transposed_dct + i);
408
0
                auto in_y = Load(di, transposed_dct_y + i);
409
0
                auto qt = Load(di, scaled_qtable + c * size + i);
410
0
                auto coeff_scale = ShiftRight<kCFLFixedPointPrecision>(
411
0
                    Add(Mul(qt, scale), round));
412
0
                auto cfl_factor = ShiftRight<kCFLFixedPointPrecision>(
413
0
                    Add(Mul(in_y, coeff_scale), round));
414
0
                StoreU(DemoteTo(di16, Add(in, cfl_factor)), di16, jpeg_pos + i);
415
0
              }
416
0
            }
417
0
            jpeg_pos[0] =
418
0
                Clamp1<float>(dc_rows[c][sbx[c]] - dcoff[c], -2047, 2047);
419
0
            auto overflow = MaskFromVec(Set(di16_full, 0));
420
0
            auto underflow = MaskFromVec(Set(di16_full, 0));
421
0
            for (int i = 0; i < 64; i += Lanes(di16_full)) {
422
0
              auto in = LoadU(di16_full, jpeg_pos + i);
423
0
              overflow = Or(overflow, Gt(in, kJpegDctMax));
424
0
              underflow = Or(underflow, Lt(in, kJpegDctMin));
425
0
            }
426
0
            if (!AllFalse(di16_full, Or(overflow, underflow))) {
427
0
              return JXL_FAILURE("JPEG DCT coefficients out of range");
428
0
            }
429
0
          }
430
10.7M
        } else {
431
10.7M
          HWY_ALIGN float* const block = group_dec_cache->dec_group_block;
432
          // Dequantize and add predictions.
433
10.7M
          dequant_block(
434
10.7M
              inv_global_scale, row_quant[bx], dec_state->x_dm_multiplier,
435
10.7M
              dec_state->b_dm_multiplier, x_cc_mul, b_cc_mul, acs.Strategy(),
436
10.7M
              size, dec_state->shared->quantizer,
437
10.7M
              acs.covered_blocks_y() * acs.covered_blocks_x(), sbx, dc_rows,
438
10.7M
              dc_stride,
439
10.7M
              dec_state->output_encoding_info.opsin_params.quant_biases, qblock,
440
10.7M
              block, group_dec_cache->scratch_space);
441
442
32.2M
          for (size_t c : {1, 0, 2}) {
443
32.2M
            if ((sbx[c] << hshift[c] != bx) || (sby[c] << vshift[c] != by)) {
444
4.99k
              continue;
445
4.99k
            }
446
            // IDCT
447
32.2M
            float* JXL_RESTRICT idct_pos = idct_row[c] + sbx[c] * kBlockDim;
448
32.2M
            TransformToPixels(acs.Strategy(), block + c * size, idct_pos,
449
32.2M
                              idct_stride[c], group_dec_cache->scratch_space);
450
32.2M
          }
451
10.7M
        }
452
10.7M
        bx += llf_x;
453
10.7M
      }
454
3.15M
    }
455
1.02M
  }
456
49.6k
  return true;
457
50.2k
}
Unexecuted instantiation: jxl::N_AVX3::DecodeGroupImpl(jxl::FrameHeader const&, jxl::GetBlock*, jxl::GroupDecCache*, jxl::PassesDecoderState*, unsigned long, unsigned long, jxl::RenderPipelineInput&, jxl::jpeg::JPEGData*, jxl::DrawMode)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::DecodeGroupImpl(jxl::FrameHeader const&, jxl::GetBlock*, jxl::GroupDecCache*, jxl::PassesDecoderState*, unsigned long, unsigned long, jxl::RenderPipelineInput&, jxl::jpeg::JPEGData*, jxl::DrawMode)
Unexecuted instantiation: jxl::N_AVX3_SPR::DecodeGroupImpl(jxl::FrameHeader const&, jxl::GetBlock*, jxl::GroupDecCache*, jxl::PassesDecoderState*, unsigned long, unsigned long, jxl::RenderPipelineInput&, jxl::jpeg::JPEGData*, jxl::DrawMode)
Unexecuted instantiation: jxl::N_SSE2::DecodeGroupImpl(jxl::FrameHeader const&, jxl::GetBlock*, jxl::GroupDecCache*, jxl::PassesDecoderState*, unsigned long, unsigned long, jxl::RenderPipelineInput&, jxl::jpeg::JPEGData*, jxl::DrawMode)
458
459
// NOLINTNEXTLINE(google-readability-namespace-comments)
460
}  // namespace HWY_NAMESPACE
461
}  // namespace jxl
462
HWY_AFTER_NAMESPACE();
463
464
#if HWY_ONCE
465
namespace jxl {
466
namespace {
467
// Decode quantized AC coefficients of DCT blocks.
468
// LLF components in the output block will not be modified.
//
// Template parameters:
//   ac_type   - selects whether decoded values are accumulated through
//               block.ptr16 (ACType::k16) or block.ptr32 (otherwise).
//   uses_lz77 - forwarded to ANSSymbolReader::ReadHybridUintInlined.
//
// Reads one nonzero count from the stream, then reads coefficients for scan
// positions [covered_blocks, size) until that count is used up. Every decoded
// value is shifted left by `shift` and *added* to whatever `block` already
// holds at order[k]. As a side effect, a per-8x8 nonzero count is written into
// row_nzeros for each cell covered by this varblock.
//
// Returns a failure Status if the nonzero count read from the stream is larger
// than size - covered_blocks, or if nonzeros are still outstanding after the
// last scan position; returns true otherwise.
469
template <ACType ac_type, bool uses_lz77>
470
Status DecodeACVarBlock(size_t ctx_offset, size_t log2_covered_blocks,
471
                        int32_t* JXL_RESTRICT row_nzeros,
472
                        const int32_t* JXL_RESTRICT row_nzeros_top,
473
                        size_t nzeros_stride, size_t c, size_t bx, size_t by,
474
                        size_t lbx, AcStrategy acs,
475
                        const coeff_order_t* JXL_RESTRICT coeff_order,
476
                        BitReader* JXL_RESTRICT br,
477
                        ANSSymbolReader* JXL_RESTRICT decoder,
478
                        const std::vector<uint8_t>& context_map,
479
                        const uint8_t* qdc_row, const int32_t* qf_row,
480
                        const BlockCtxMap& block_ctx_map, ACPtr block,
481
3.40M
                        size_t shift = 0) {
482
  // Equal to number of LLF coefficients.
483
3.40M
  const size_t covered_blocks = 1 << log2_covered_blocks;
484
3.40M
  // Total number of coefficients in this varblock (LLF included).
  const size_t size = covered_blocks * kDCTBlockSize;
485
3.40M
  // Prediction built from row_nzeros_top and row_nzeros at column bx; it is
  // only used below to pick the context for the nonzero count.
  // NOTE(review): the meaning of the literal 32 is defined by
  // PredictFromTopAndLeft, which is not visible here - confirm.
  int32_t predicted_nzeros =
486
3.40M
      PredictFromTopAndLeft(row_nzeros_top, row_nzeros, bx, 32);
487
488
3.40M
  // Select the coefficient scan order for this strategy and channel.
  size_t ord = kStrategyOrder[acs.RawStrategy()];
489
3.40M
  const coeff_order_t* JXL_RESTRICT order =
490
3.40M
      &coeff_order[CoeffOrderOffset(ord, c)];
491
492
3.40M
  // Note the two different column indices: qdc_row is indexed with lbx while
  // qf_row is indexed with bx.
  size_t block_ctx = block_ctx_map.Context(qdc_row[lbx], qf_row[bx], ord, c);
493
3.40M
  const int32_t nzero_ctx =
494
3.40M
      block_ctx_map.NonZeroContext(predicted_nzeros, block_ctx) + ctx_offset;
495
496
3.40M
  // Number of nonzero coefficients announced by the stream for this block.
  size_t nzeros =
497
3.40M
      decoder->ReadHybridUintInlined<uses_lz77>(nzero_ctx, br, context_map);
498
3.40M
  // The decode loop below only visits positions covered_blocks..size-1, so a
  // larger count can never be satisfied; reject it before touching row_nzeros.
  if (nzeros > size - covered_blocks) {
499
382
    return JXL_FAILURE("Invalid AC: nzeros %" PRIuS " too large for %" PRIuS
500
382
                       " 8x8 blocks",
501
382
                       nzeros, covered_blocks);
502
382
  }
503
7.38M
  // Record nzeros / covered_blocks (rounded up) in every 8x8 cell covered by
  // this varblock; rows below the first are reached through nzeros_stride.
  // Presumably these entries are what PredictFromTopAndLeft reads for
  // neighbouring blocks - confirm against its definition.
  for (size_t y = 0; y < acs.covered_blocks_y(); y++) {
504
10.3M
    for (size_t x = 0; x < acs.covered_blocks_x(); x++) {
505
6.32M
      row_nzeros[bx + x + y * nzeros_stride] =
506
6.32M
          (nzeros + covered_blocks - 1) >> log2_covered_blocks;
507
6.32M
    }
508
3.97M
  }
509
510
3.40M
  // Base index of the contexts used for the coefficient values themselves.
  const size_t histo_offset =
511
3.40M
      ctx_offset + block_ctx_map.ZeroDensityContextsOffset(block_ctx);
512
513
3.40M
  // `prev` is the "previous coefficient was nonzero" flag fed to
  // ZeroDensityContext. Before the first coefficient it is seeded from the
  // announced count: 0 when nzeros > size / 16, 1 otherwise.
  size_t prev = (nzeros > size / 16 ? 0 : 1);
514
86.6M
  // Walk the scan order starting after the LLF positions and stop as soon as
  // all announced nonzeros have been consumed.
  for (size_t k = covered_blocks; k < size && nzeros != 0; ++k) {
515
83.2M
    const size_t ctx =
516
83.2M
        histo_offset + ZeroDensityContext(nzeros, k, covered_blocks,
517
83.2M
                                          log2_covered_blocks, prev);
518
83.2M
    const size_t u_coeff =
519
83.2M
        decoder->ReadHybridUintInlined<uses_lz77>(ctx, br, context_map);
520
    // Hand-rolled version of UnpackSigned, shifting before the conversion to
521
    // signed integer to avoid undefined behavior of shifting negative numbers.
    // Even u_coeff yields +magnitude; odd u_coeff yields ~magnitude, i.e.
    // -(magnitude + 1). The XOR mask (neg_sign - 1) is 0 or all-ones.
522
83.2M
    const size_t magnitude = u_coeff >> 1;
523
83.2M
    const size_t neg_sign = (~u_coeff) & 1;
524
83.2M
    const ptrdiff_t coeff =
525
83.2M
        static_cast<ptrdiff_t>((magnitude ^ (neg_sign - 1)) << shift);
526
83.2M
    // Accumulate (+=) rather than overwrite, so existing block contents are
    // kept and the new value is added on top.
    if (ac_type == ACType::k16) {
527
77.1M
      block.ptr16[order[k]] += coeff;
528
77.1M
    } else {
529
6.12M
      block.ptr32[order[k]] += coeff;
530
6.12M
    }
531
83.2M
    // Only a nonzero symbol counts against the remaining total.
    prev = static_cast<size_t>(u_coeff != 0);
532
83.2M
    nzeros -= prev;
533
83.2M
  }
534
3.40M
  // All positions were visited but some announced nonzeros never showed up.
  if (JXL_UNLIKELY(nzeros != 0)) {
535
256
    return JXL_FAILURE("Invalid AC: nzeros at end of block is %" PRIuS
536
256
                       ", should be 0. Block (%" PRIuS ", %" PRIuS
537
256
                       "), channel %" PRIuS,
538
256
                       nzeros, bx, by, c);
539
256
  }
540
541
3.40M
  return true;
542
3.40M
}
dec_group.cc:jxl::Status jxl::(anonymous namespace)::DecodeACVarBlock<(jxl::ACType)0, true>(unsigned long, unsigned long, int*, int const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, jxl::AcStrategy, unsigned int const*, jxl::BitReader*, jxl::ANSSymbolReader*, std::__1::vector<unsigned char, std::__1::allocator<unsigned char> > const&, unsigned char const*, int const*, jxl::BlockCtxMap const&, jxl::ACPtr, unsigned long)
Line
Count
Source
481
228k
                        size_t shift = 0) {
482
  // Equal to number of LLF coefficients.
483
228k
  const size_t covered_blocks = 1 << log2_covered_blocks;
484
228k
  const size_t size = covered_blocks * kDCTBlockSize;
485
228k
  int32_t predicted_nzeros =
486
228k
      PredictFromTopAndLeft(row_nzeros_top, row_nzeros, bx, 32);
487
488
228k
  size_t ord = kStrategyOrder[acs.RawStrategy()];
489
228k
  const coeff_order_t* JXL_RESTRICT order =
490
228k
      &coeff_order[CoeffOrderOffset(ord, c)];
491
492
228k
  size_t block_ctx = block_ctx_map.Context(qdc_row[lbx], qf_row[bx], ord, c);
493
228k
  const int32_t nzero_ctx =
494
228k
      block_ctx_map.NonZeroContext(predicted_nzeros, block_ctx) + ctx_offset;
495
496
228k
  size_t nzeros =
497
228k
      decoder->ReadHybridUintInlined<uses_lz77>(nzero_ctx, br, context_map);
498
228k
  if (nzeros > size - covered_blocks) {
499
48
    return JXL_FAILURE("Invalid AC: nzeros %" PRIuS " too large for %" PRIuS
500
48
                       " 8x8 blocks",
501
48
                       nzeros, covered_blocks);
502
48
  }
503
462k
  for (size_t y = 0; y < acs.covered_blocks_y(); y++) {
504
549k
    for (size_t x = 0; x < acs.covered_blocks_x(); x++) {
505
315k
      row_nzeros[bx + x + y * nzeros_stride] =
506
315k
          (nzeros + covered_blocks - 1) >> log2_covered_blocks;
507
315k
    }
508
234k
  }
509
510
228k
  const size_t histo_offset =
511
228k
      ctx_offset + block_ctx_map.ZeroDensityContextsOffset(block_ctx);
512
513
228k
  size_t prev = (nzeros > size / 16 ? 0 : 1);
514
680k
  for (size_t k = covered_blocks; k < size && nzeros != 0; ++k) {
515
452k
    const size_t ctx =
516
452k
        histo_offset + ZeroDensityContext(nzeros, k, covered_blocks,
517
452k
                                          log2_covered_blocks, prev);
518
452k
    const size_t u_coeff =
519
452k
        decoder->ReadHybridUintInlined<uses_lz77>(ctx, br, context_map);
520
    // Hand-rolled version of UnpackSigned, shifting before the conversion to
521
    // signed integer to avoid undefined behavior of shifting negative numbers.
522
452k
    const size_t magnitude = u_coeff >> 1;
523
452k
    const size_t neg_sign = (~u_coeff) & 1;
524
452k
    const ptrdiff_t coeff =
525
452k
        static_cast<ptrdiff_t>((magnitude ^ (neg_sign - 1)) << shift);
526
452k
    if (ac_type == ACType::k16) {
527
452k
      block.ptr16[order[k]] += coeff;
528
452k
    } else {
529
0
      block.ptr32[order[k]] += coeff;
530
0
    }
531
452k
    prev = static_cast<size_t>(u_coeff != 0);
532
452k
    nzeros -= prev;
533
452k
  }
534
228k
  if (JXL_UNLIKELY(nzeros != 0)) {
535
48
    return JXL_FAILURE("Invalid AC: nzeros at end of block is %" PRIuS
536
48
                       ", should be 0. Block (%" PRIuS ", %" PRIuS
537
48
                       "), channel %" PRIuS,
538
48
                       nzeros, bx, by, c);
539
48
  }
540
541
228k
  return true;
542
228k
}
dec_group.cc:jxl::Status jxl::(anonymous namespace)::DecodeACVarBlock<(jxl::ACType)1, true>(unsigned long, unsigned long, int*, int const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, jxl::AcStrategy, unsigned int const*, jxl::BitReader*, jxl::ANSSymbolReader*, std::__1::vector<unsigned char, std::__1::allocator<unsigned char> > const&, unsigned char const*, int const*, jxl::BlockCtxMap const&, jxl::ACPtr, unsigned long)
Line
Count
Source
481
49.4k
                        size_t shift = 0) {
482
  // Equal to number of LLF coefficients.
483
49.4k
  const size_t covered_blocks = 1 << log2_covered_blocks;
484
49.4k
  const size_t size = covered_blocks * kDCTBlockSize;
485
49.4k
  int32_t predicted_nzeros =
486
49.4k
      PredictFromTopAndLeft(row_nzeros_top, row_nzeros, bx, 32);
487
488
49.4k
  size_t ord = kStrategyOrder[acs.RawStrategy()];
489
49.4k
  const coeff_order_t* JXL_RESTRICT order =
490
49.4k
      &coeff_order[CoeffOrderOffset(ord, c)];
491
492
49.4k
  size_t block_ctx = block_ctx_map.Context(qdc_row[lbx], qf_row[bx], ord, c);
493
49.4k
  const int32_t nzero_ctx =
494
49.4k
      block_ctx_map.NonZeroContext(predicted_nzeros, block_ctx) + ctx_offset;
495
496
49.4k
  size_t nzeros =
497
49.4k
      decoder->ReadHybridUintInlined<uses_lz77>(nzero_ctx, br, context_map);
498
49.4k
  if (nzeros > size - covered_blocks) {
499
196
    return JXL_FAILURE("Invalid AC: nzeros %" PRIuS " too large for %" PRIuS
500
196
                       " 8x8 blocks",
501
196
                       nzeros, covered_blocks);
502
196
  }
503
104k
  for (size_t y = 0; y < acs.covered_blocks_y(); y++) {
504
160k
    for (size_t x = 0; x < acs.covered_blocks_x(); x++) {
505
104k
      row_nzeros[bx + x + y * nzeros_stride] =
506
104k
          (nzeros + covered_blocks - 1) >> log2_covered_blocks;
507
104k
    }
508
55.7k
  }
509
510
49.2k
  const size_t histo_offset =
511
49.2k
      ctx_offset + block_ctx_map.ZeroDensityContextsOffset(block_ctx);
512
513
49.2k
  size_t prev = (nzeros > size / 16 ? 0 : 1);
514
307k
  for (size_t k = covered_blocks; k < size && nzeros != 0; ++k) {
515
258k
    const size_t ctx =
516
258k
        histo_offset + ZeroDensityContext(nzeros, k, covered_blocks,
517
258k
                                          log2_covered_blocks, prev);
518
258k
    const size_t u_coeff =
519
258k
        decoder->ReadHybridUintInlined<uses_lz77>(ctx, br, context_map);
520
    // Hand-rolled version of UnpackSigned, shifting before the conversion to
521
    // signed integer to avoid undefined behavior of shifting negative numbers.
522
258k
    const size_t magnitude = u_coeff >> 1;
523
258k
    const size_t neg_sign = (~u_coeff) & 1;
524
258k
    const ptrdiff_t coeff =
525
258k
        static_cast<ptrdiff_t>((magnitude ^ (neg_sign - 1)) << shift);
526
258k
    if (ac_type == ACType::k16) {
527
0
      block.ptr16[order[k]] += coeff;
528
258k
    } else {
529
258k
      block.ptr32[order[k]] += coeff;
530
258k
    }
531
258k
    prev = static_cast<size_t>(u_coeff != 0);
532
258k
    nzeros -= prev;
533
258k
  }
534
49.2k
  if (JXL_UNLIKELY(nzeros != 0)) {
535
24
    return JXL_FAILURE("Invalid AC: nzeros at end of block is %" PRIuS
536
24
                       ", should be 0. Block (%" PRIuS ", %" PRIuS
537
24
                       "), channel %" PRIuS,
538
24
                       nzeros, bx, by, c);
539
24
  }
540
541
49.1k
  return true;
542
49.2k
}
dec_group.cc:jxl::Status jxl::(anonymous namespace)::DecodeACVarBlock<(jxl::ACType)0, false>(unsigned long, unsigned long, int*, int const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, jxl::AcStrategy, unsigned int const*, jxl::BitReader*, jxl::ANSSymbolReader*, std::__1::vector<unsigned char, std::__1::allocator<unsigned char> > const&, unsigned char const*, int const*, jxl::BlockCtxMap const&, jxl::ACPtr, unsigned long)
Line
Count
Source
481
2.77M
                        size_t shift = 0) {
482
  // Equal to number of LLF coefficients.
483
2.77M
  const size_t covered_blocks = 1 << log2_covered_blocks;
484
2.77M
  const size_t size = covered_blocks * kDCTBlockSize;
485
2.77M
  int32_t predicted_nzeros =
486
2.77M
      PredictFromTopAndLeft(row_nzeros_top, row_nzeros, bx, 32);
487
488
2.77M
  size_t ord = kStrategyOrder[acs.RawStrategy()];
489
2.77M
  const coeff_order_t* JXL_RESTRICT order =
490
2.77M
      &coeff_order[CoeffOrderOffset(ord, c)];
491
492
2.77M
  size_t block_ctx = block_ctx_map.Context(qdc_row[lbx], qf_row[bx], ord, c);
493
2.77M
  const int32_t nzero_ctx =
494
2.77M
      block_ctx_map.NonZeroContext(predicted_nzeros, block_ctx) + ctx_offset;
495
496
2.77M
  size_t nzeros =
497
2.77M
      decoder->ReadHybridUintInlined<uses_lz77>(nzero_ctx, br, context_map);
498
2.77M
  if (nzeros > size - covered_blocks) {
499
32
    return JXL_FAILURE("Invalid AC: nzeros %" PRIuS " too large for %" PRIuS
500
32
                       " 8x8 blocks",
501
32
                       nzeros, covered_blocks);
502
32
  }
503
6.02M
  for (size_t y = 0; y < acs.covered_blocks_y(); y++) {
504
8.58M
    for (size_t x = 0; x < acs.covered_blocks_x(); x++) {
505
5.33M
      row_nzeros[bx + x + y * nzeros_stride] =
506
5.33M
          (nzeros + covered_blocks - 1) >> log2_covered_blocks;
507
5.33M
    }
508
3.25M
  }
509
510
2.77M
  const size_t histo_offset =
511
2.77M
      ctx_offset + block_ctx_map.ZeroDensityContextsOffset(block_ctx);
512
513
2.77M
  size_t prev = (nzeros > size / 16 ? 0 : 1);
514
79.4M
  for (size_t k = covered_blocks; k < size && nzeros != 0; ++k) {
515
76.7M
    const size_t ctx =
516
76.7M
        histo_offset + ZeroDensityContext(nzeros, k, covered_blocks,
517
76.7M
                                          log2_covered_blocks, prev);
518
76.7M
    const size_t u_coeff =
519
76.7M
        decoder->ReadHybridUintInlined<uses_lz77>(ctx, br, context_map);
520
    // Hand-rolled version of UnpackSigned, shifting before the conversion to
521
    // signed integer to avoid undefined behavior of shifting negative numbers.
522
76.7M
    const size_t magnitude = u_coeff >> 1;
523
76.7M
    const size_t neg_sign = (~u_coeff) & 1;
524
76.7M
    const ptrdiff_t coeff =
525
76.7M
        static_cast<ptrdiff_t>((magnitude ^ (neg_sign - 1)) << shift);
526
76.7M
    if (ac_type == ACType::k16) {
527
76.7M
      block.ptr16[order[k]] += coeff;
528
76.7M
    } else {
529
0
      block.ptr32[order[k]] += coeff;
530
0
    }
531
76.7M
    prev = static_cast<size_t>(u_coeff != 0);
532
76.7M
    nzeros -= prev;
533
76.7M
  }
534
2.77M
  if (JXL_UNLIKELY(nzeros != 0)) {
535
132
    return JXL_FAILURE("Invalid AC: nzeros at end of block is %" PRIuS
536
132
                       ", should be 0. Block (%" PRIuS ", %" PRIuS
537
132
                       "), channel %" PRIuS,
538
132
                       nzeros, bx, by, c);
539
132
  }
540
541
2.77M
  return true;
542
2.77M
}
dec_group.cc:jxl::Status jxl::(anonymous namespace)::DecodeACVarBlock<(jxl::ACType)1, false>(unsigned long, unsigned long, int*, int const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, jxl::AcStrategy, unsigned int const*, jxl::BitReader*, jxl::ANSSymbolReader*, std::__1::vector<unsigned char, std::__1::allocator<unsigned char> > const&, unsigned char const*, int const*, jxl::BlockCtxMap const&, jxl::ACPtr, unsigned long)
Line
Count
Source
481
349k
                        size_t shift = 0) {
482
  // Equal to number of LLF coefficients.
483
349k
  const size_t covered_blocks = 1 << log2_covered_blocks;
484
349k
  const size_t size = covered_blocks * kDCTBlockSize;
485
349k
  int32_t predicted_nzeros =
486
349k
      PredictFromTopAndLeft(row_nzeros_top, row_nzeros, bx, 32);
487
488
349k
  size_t ord = kStrategyOrder[acs.RawStrategy()];
489
349k
  const coeff_order_t* JXL_RESTRICT order =
490
349k
      &coeff_order[CoeffOrderOffset(ord, c)];
491
492
349k
  size_t block_ctx = block_ctx_map.Context(qdc_row[lbx], qf_row[bx], ord, c);
493
349k
  const int32_t nzero_ctx =
494
349k
      block_ctx_map.NonZeroContext(predicted_nzeros, block_ctx) + ctx_offset;
495
496
349k
  size_t nzeros =
497
349k
      decoder->ReadHybridUintInlined<uses_lz77>(nzero_ctx, br, context_map);
498
349k
  if (nzeros > size - covered_blocks) {
499
106
    return JXL_FAILURE("Invalid AC: nzeros %" PRIuS " too large for %" PRIuS
500
106
                       " 8x8 blocks",
501
106
                       nzeros, covered_blocks);
502
106
  }
503
785k
  for (size_t y = 0; y < acs.covered_blocks_y(); y++) {
504
1.01M
    for (size_t x = 0; x < acs.covered_blocks_x(); x++) {
505
576k
      row_nzeros[bx + x + y * nzeros_stride] =
506
576k
          (nzeros + covered_blocks - 1) >> log2_covered_blocks;
507
576k
    }
508
436k
  }
509
510
349k
  const size_t histo_offset =
511
349k
      ctx_offset + block_ctx_map.ZeroDensityContextsOffset(block_ctx);
512
513
349k
  size_t prev = (nzeros > size / 16 ? 0 : 1);
514
6.21M
  for (size_t k = covered_blocks; k < size && nzeros != 0; ++k) {
515
5.86M
    const size_t ctx =
516
5.86M
        histo_offset + ZeroDensityContext(nzeros, k, covered_blocks,
517
5.86M
                                          log2_covered_blocks, prev);
518
5.86M
    const size_t u_coeff =
519
5.86M
        decoder->ReadHybridUintInlined<uses_lz77>(ctx, br, context_map);
520
    // Hand-rolled version of UnpackSigned, shifting before the conversion to
521
    // signed integer to avoid undefined behavior of shifting negative numbers.
522
5.86M
    const size_t magnitude = u_coeff >> 1;
523
5.86M
    const size_t neg_sign = (~u_coeff) & 1;
524
5.86M
    const ptrdiff_t coeff =
525
5.86M
        static_cast<ptrdiff_t>((magnitude ^ (neg_sign - 1)) << shift);
526
5.86M
    if (ac_type == ACType::k16) {
527
0
      block.ptr16[order[k]] += coeff;
528
5.86M
    } else {
529
5.86M
      block.ptr32[order[k]] += coeff;
530
5.86M
    }
531
5.86M
    prev = static_cast<size_t>(u_coeff != 0);
532
5.86M
    nzeros -= prev;
533
5.86M
  }
534
349k
  if (JXL_UNLIKELY(nzeros != 0)) {
535
52
    return JXL_FAILURE("Invalid AC: nzeros at end of block is %" PRIuS
536
52
                       ", should be 0. Block (%" PRIuS ", %" PRIuS
537
52
                       "), channel %" PRIuS,
538
52
                       nzeros, bx, by, c);
539
52
  }
540
541
349k
  return true;
542
349k
}
543
544
// Structs used by DecodeGroupImpl to get a quantized block.
545
// GetBlockFromBitstream uses ANS decoding (and thus keeps track of row
546
// pointers in row_nzeros), GetBlockFromEncoder simply reads the coefficient
547
// image provided by the encoder.
548
549
struct GetBlockFromBitstream : public GetBlock {
550
195k
  void StartRow(size_t by) override {
551
195k
    qf_row = rect.ConstRow(*qf, by);
552
781k
    for (size_t c = 0; c < 3; c++) {
553
585k
      size_t sby = by >> vshift[c];
554
585k
      quant_dc_row = quant_dc->ConstRow(rect.y0() + by) + rect.x0();
555
1.17M
      for (size_t i = 0; i < num_passes; i++) {
556
586k
        row_nzeros[i][c] = group_dec_cache->num_nzeroes[i].PlaneRow(c, sby);
557
586k
        row_nzeros_top[i][c] =
558
586k
            sby == 0
559
586k
                ? nullptr
560
586k
                : group_dec_cache->num_nzeroes[i].ConstPlaneRow(c, sby - 1);
561
586k
      }
562
585k
    }
563
195k
  }
564
565
  Status LoadBlock(size_t bx, size_t by, const AcStrategy& acs, size_t size,
566
                   size_t log2_covered_blocks, ACPtr block[3],
567
1.13M
                   ACType ac_type) override {
568
1.13M
    ;
569
3.41M
    for (size_t c : {1, 0, 2}) {
570
3.41M
      size_t sbx = bx >> hshift[c];
571
3.41M
      size_t sby = by >> vshift[c];
572
3.41M
      if (JXL_UNLIKELY((sbx << hshift[c] != bx) || (sby << vshift[c] != by))) {
573
4.99k
        continue;
574
4.99k
      }
575
576
6.81M
      for (size_t pass = 0; JXL_UNLIKELY(pass < num_passes); pass++) {
577
3.40M
        auto decode_ac_varblock =
578
3.40M
            decoders[pass].UsesLZ77()
579
3.40M
                ? (ac_type == ACType::k16 ? DecodeACVarBlock<ACType::k16, 1>
580
277k
                                          : DecodeACVarBlock<ACType::k32, 1>)
581
3.40M
                : (ac_type == ACType::k16 ? DecodeACVarBlock<ACType::k16, 0>
582
3.12M
                                          : DecodeACVarBlock<ACType::k32, 0>);
583
3.40M
        JXL_RETURN_IF_ERROR(decode_ac_varblock(
584
3.40M
            ctx_offset[pass], log2_covered_blocks, row_nzeros[pass][c],
585
3.40M
            row_nzeros_top[pass][c], nzeros_stride, c, sbx, sby, bx, acs,
586
3.40M
            &coeff_orders[pass * coeff_order_size], readers[pass],
587
3.40M
            &decoders[pass], context_map[pass], quant_dc_row, qf_row,
588
3.40M
            *block_ctx_map, block[c], shift_for_pass[pass]));
589
3.40M
      }
590
3.40M
    }
591
1.13M
    return true;
592
1.13M
  }
593
594
  Status Init(const FrameHeader& frame_header,
595
              BitReader* JXL_RESTRICT* JXL_RESTRICT readers_,
596
              size_t num_passes_, size_t group_idx, size_t histo_selector_bits,
597
              const Rect& rect_, GroupDecCache* JXL_RESTRICT group_dec_cache_,
598
17.5k
              PassesDecoderState* dec_state, size_t first_pass) {
599
70.0k
    for (size_t i = 0; i < 3; i++) {
600
52.5k
      hshift[i] = frame_header.chroma_subsampling.HShift(i);
601
52.5k
      vshift[i] = frame_header.chroma_subsampling.VShift(i);
602
52.5k
    }
603
17.5k
    coeff_order_size = dec_state->shared->coeff_order_size;
604
17.5k
    coeff_orders =
605
17.5k
        dec_state->shared->coeff_orders.data() + first_pass * coeff_order_size;
606
17.5k
    context_map = dec_state->context_map.data() + first_pass;
607
17.5k
    readers = readers_;
608
17.5k
    num_passes = num_passes_;
609
17.5k
    shift_for_pass = frame_header.passes.shift + first_pass;
610
17.5k
    group_dec_cache = group_dec_cache_;
611
17.5k
    rect = rect_;
612
17.5k
    block_ctx_map = &dec_state->shared->block_ctx_map;
613
17.5k
    qf = &dec_state->shared->raw_quant_field;
614
17.5k
    quant_dc = &dec_state->shared->quant_dc;
615
616
35.0k
    for (size_t pass = 0; pass < num_passes; pass++) {
617
      // Select which histogram set to use among those of the current pass.
618
17.6k
      size_t cur_histogram = 0;
619
17.6k
      if (histo_selector_bits != 0) {
620
6.53k
        cur_histogram = readers[pass]->ReadBits(histo_selector_bits);
621
6.53k
      }
622
17.6k
      if (cur_histogram >= dec_state->shared->num_histograms) {
623
20
        return JXL_FAILURE("Invalid histogram selector");
624
20
      }
625
17.5k
      ctx_offset[pass] = cur_histogram * block_ctx_map->NumACContexts();
626
627
17.5k
      JXL_ASSIGN_OR_RETURN(
628
17.5k
          decoders[pass],
629
17.5k
          ANSSymbolReader::Create(&dec_state->code[pass + first_pass],
630
17.5k
                                  readers[pass]));
631
17.5k
    }
632
17.4k
    nzeros_stride = group_dec_cache->num_nzeroes[0].PixelsPerRow();
633
35.0k
    for (size_t i = 0; i < num_passes; i++) {
634
17.5k
      JXL_ENSURE(
635
17.5k
          nzeros_stride ==
636
17.5k
          static_cast<size_t>(group_dec_cache->num_nzeroes[i].PixelsPerRow()));
637
17.5k
    }
638
17.4k
    return true;
639
17.4k
  }
640
641
  const uint32_t* shift_for_pass = nullptr;  // not owned
642
  const coeff_order_t* JXL_RESTRICT coeff_orders;
643
  size_t coeff_order_size;
644
  const std::vector<uint8_t>* JXL_RESTRICT context_map;
645
  ANSSymbolReader decoders[kMaxNumPasses];
646
  BitReader* JXL_RESTRICT* JXL_RESTRICT readers;
647
  size_t num_passes;
648
  size_t ctx_offset[kMaxNumPasses];
649
  size_t nzeros_stride;
650
  int32_t* JXL_RESTRICT row_nzeros[kMaxNumPasses][3];
651
  const int32_t* JXL_RESTRICT row_nzeros_top[kMaxNumPasses][3];
652
  GroupDecCache* JXL_RESTRICT group_dec_cache;
653
  const BlockCtxMap* block_ctx_map;
654
  const ImageI* qf;
655
  const ImageB* quant_dc;
656
  const int32_t* qf_row;
657
  const uint8_t* quant_dc_row;
658
  Rect rect;
659
  size_t hshift[3], vshift[3];
660
};
661
662
struct GetBlockFromEncoder : public GetBlock {
663
834k
  void StartRow(size_t by) override {}
664
665
  Status LoadBlock(size_t bx, size_t by, const AcStrategy& acs, size_t size,
666
                   size_t log2_covered_blocks, ACPtr block[3],
667
9.61M
                   ACType ac_type) override {
668
9.61M
    JXL_ENSURE(ac_type == ACType::k32);
669
38.4M
    for (size_t c = 0; c < 3; c++) {
670
      // for each pass
671
57.6M
      for (size_t i = 0; i < quantized_ac->size(); i++) {
672
3.97G
        for (size_t k = 0; k < size; k++) {
673
          // TODO(veluca): SIMD.
674
3.94G
          block[c].ptr32[k] +=
675
3.94G
              rows[i][c][offset + k] * (1 << shift_for_pass[i]);
676
3.94G
        }
677
28.8M
      }
678
28.8M
    }
679
9.61M
    offset += size;
680
9.61M
    return true;
681
9.61M
  }
682
683
  static StatusOr<GetBlockFromEncoder> Create(
684
      const std::vector<std::unique_ptr<ACImage>>& ac, size_t group_idx,
685
32.7k
      const uint32_t* shift_for_pass) {
686
32.7k
    GetBlockFromEncoder result(ac, group_idx, shift_for_pass);
687
    // TODO(veluca): not supported with chroma subsampling.
688
65.5k
    for (size_t i = 0; i < ac.size(); i++) {
689
32.7k
      JXL_ENSURE(ac[i]->Type() == ACType::k32);
690
131k
      for (size_t c = 0; c < 3; c++) {
691
98.3k
        result.rows[i][c] = ac[i]->PlaneRow(c, group_idx, 0).ptr32;
692
98.3k
      }
693
32.7k
    }
694
32.7k
    return result;
695
32.7k
  }
696
697
  const std::vector<std::unique_ptr<ACImage>>* JXL_RESTRICT quantized_ac;
698
  size_t offset = 0;
699
  const int32_t* JXL_RESTRICT rows[kMaxNumPasses][3];
700
  const uint32_t* shift_for_pass = nullptr;  // not owned
701
702
 private:
703
  GetBlockFromEncoder(const std::vector<std::unique_ptr<ACImage>>& ac,
704
                      size_t group_idx, const uint32_t* shift_for_pass)
705
32.7k
      : quantized_ac(&ac), shift_for_pass(shift_for_pass) {}
706
};
707
708
// Exports DecodeGroupImpl so that it can be called through
// HWY_DYNAMIC_DISPATCH(DecodeGroupImpl) by the functions below.
HWY_EXPORT(DecodeGroupImpl);
709
710
}  // namespace
711
712
namespace {

// Renders group `group_idx` from its DC image alone: for each channel, the
// group's DC samples are copied with padding into
// `group_dec_cache->dc_buffer`, mirror-padded horizontally at the image
// borders, and upsampled 8x into the render pipeline input buffer of that
// channel using `dec_state->upsampler8x`.
Status UpsampleDcOnlyGroup(const FrameHeader& frame_header, size_t group_idx,
                           PassesDecoderState* JXL_RESTRICT dec_state,
                           GroupDecCache* JXL_RESTRICT group_dec_cache,
                           size_t thread,
                           RenderPipelineInput& render_pipeline_input) {
  JxlMemoryManager* memory_manager = dec_state->memory_manager();
  JXL_RETURN_IF_ERROR(group_dec_cache->InitDCBufferOnce(memory_manager));
  const YCbCrChromaSubsampling& cs = frame_header.chroma_subsampling;
  for (size_t c : {0, 1, 2}) {
    const size_t hs = cs.HShift(c);
    const size_t vs = cs.VShift(c);
    // Group rectangle before subsampling, and the same rectangle in the
    // (possibly subsampled) coordinates of channel `c`.
    const Rect src_rect_precs =
        dec_state->shared->frame_dim.BlockGroupRect(group_idx);
    const Rect src_rect =
        Rect(src_rect_precs.x0() >> hs, src_rect_precs.y0() >> vs,
             src_rect_precs.xsize() >> hs, src_rect_precs.ysize() >> vs);
    // The padded DC samples are staged in group_dec_cache->dc_buffer.
    const Rect copy_rect(kRenderPipelineXOffset, 2, src_rect.xsize(),
                         src_rect.ysize());
    JXL_RETURN_IF_ERROR(
        CopyImageToWithPadding(src_rect, dec_state->shared->dc->Plane(c), 2,
                               copy_rect, &group_dec_cache->dc_buffer));
    // Mirrorpad. Interleaving left and right padding ensures that padding
    // works out correctly even for images with DC size of 1.
    const size_t xend = kRenderPipelineXOffset +
                        (dec_state->shared->dc->Plane(c).xsize() >> hs) -
                        src_rect.x0();
    for (size_t y = 0; y < src_rect.ysize() + 4; y++) {
      float* row = group_dec_cache->dc_buffer.Row(y);
      for (size_t ix = 0; ix < 2; ix++) {
        if (src_rect.x0() == 0) {
          row[kRenderPipelineXOffset - ix - 1] =
              row[kRenderPipelineXOffset + ix];
        }
        if (src_rect.x0() + src_rect.xsize() + 2 >=
            (dec_state->shared->dc->xsize() >> hs)) {
          row[xend + ix] = row[xend - ix - 1];
        }
      }
    }
    const auto& buffer = render_pipeline_input.GetBuffer(c);
    Rect dst_rect = buffer.second;
    ImageF* upsampling_dst = buffer.first;
    JXL_ENSURE(dst_rect.IsInside(*upsampling_dst));

    // Each DC row is upsampled from a window of 5 input rows into 8 output
    // rows.
    RenderPipelineStage::RowInfo input_rows(1, std::vector<float*>(5));
    RenderPipelineStage::RowInfo output_rows(1, std::vector<float*>(8));
    for (size_t y = src_rect.y0(); y < src_rect.y0() + src_rect.ysize();
         y++) {
      for (ptrdiff_t iy = 0; iy < 5; iy++) {
        // Rows outside the DC image are mirrored; the +2 accounts for the
        // two rows of top padding in dc_buffer.
        input_rows[0][iy] = group_dec_cache->dc_buffer.Row(
            Mirror(static_cast<ptrdiff_t>(y) + iy - 2,
                   dec_state->shared->dc->Plane(c).ysize() >> vs) +
            2 - src_rect.y0());
      }
      for (size_t iy = 0; iy < 8; iy++) {
        output_rows[0][iy] =
            dst_rect.Row(upsampling_dst, ((y - src_rect.y0()) << 3) + iy) -
            kRenderPipelineXOffset;
      }
      // Arguments set to 0/nullptr are not used.
      JXL_RETURN_IF_ERROR(dec_state->upsampler8x->ProcessRow(
          input_rows, output_rows, /*xextra_left=*/0, /*xextra_right=*/0,
          src_rect.xsize(), 0, 0, thread));
    }
  }
  return true;
}

}  // namespace

// Decodes passes [first_pass, first_pass + num_passes) of group `group_idx`
// from `readers` (one bit reader per pass).
//
// The group is drawn when this call reaches the last pass of the frame or when
// `force_draw` is set; if `should_run_pipeline` is non-null it receives
// whether drawing was requested. When drawing is requested with no passes to
// decode and `first_pass == 0`, the group is rendered from DC alone and the
// function returns without touching `readers`.
//
// `dc_only` requires `num_passes == 0`; otherwise the decoder state must hold
// at least one histogram set. `jpeg_data` and `thread` are forwarded to the
// group decoding implementation. Returns an error if initialization or
// decoding fails, or if any pass' ANS stream does not end in its expected
// final state.
Status DecodeGroup(const FrameHeader& frame_header,
                   BitReader* JXL_RESTRICT* JXL_RESTRICT readers,
                   size_t num_passes, size_t group_idx,
                   PassesDecoderState* JXL_RESTRICT dec_state,
                   GroupDecCache* JXL_RESTRICT group_dec_cache, size_t thread,
                   RenderPipelineInput& render_pipeline_input,
                   jpeg::JPEGData* JXL_RESTRICT jpeg_data, size_t first_pass,
                   bool force_draw, bool dc_only, bool* should_run_pipeline) {
  const bool is_last_pass =
      (num_passes + first_pass == frame_header.passes.num_passes);
  const DrawMode draw = (is_last_pass || force_draw) ? kDraw : kDontDraw;

  if (should_run_pipeline) {
    *should_run_pipeline = draw != kDontDraw;
  }

  if (draw == kDraw && num_passes == 0 && first_pass == 0) {
    return UpsampleDcOnlyGroup(frame_header, group_idx, dec_state,
                               group_dec_cache, thread, render_pipeline_input);
  }

  size_t histo_selector_bits = 0;
  if (dc_only) {
    JXL_ENSURE(num_passes == 0);
  } else {
    JXL_ENSURE(dec_state->shared->num_histograms > 0);
    histo_selector_bits = CeilLog2Nonzero(dec_state->shared->num_histograms);
  }

  auto get_block = jxl::make_unique<GetBlockFromBitstream>();
  JXL_RETURN_IF_ERROR(get_block->Init(
      frame_header, readers, num_passes, group_idx, histo_selector_bits,
      dec_state->shared->frame_dim.BlockGroupRect(group_idx), group_dec_cache,
      dec_state, first_pass));

  JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(DecodeGroupImpl)(
      frame_header, get_block.get(), group_dec_cache, dec_state, thread,
      group_idx, render_pipeline_input, jpeg_data, draw));

  // Every pass' ANS stream must end in its expected final state.
  for (size_t pass = 0; pass < num_passes; pass++) {
    if (!get_block->decoders[pass].CheckANSFinalState()) {
      return JXL_FAILURE("ANS checksum failure.");
    }
  }
  return true;
}
819
820
// Decodes group `group_idx` directly from the encoder's quantized coefficient
// images `ac` (no bitstream involved) and always draws the result. The group
// cache is initialized for zero passes with every valid AC strategy enabled.
// `aux_out` is not used.
Status DecodeGroupForRoundtrip(const FrameHeader& frame_header,
                               const std::vector<std::unique_ptr<ACImage>>& ac,
                               size_t group_idx,
                               PassesDecoderState* JXL_RESTRICT dec_state,
                               GroupDecCache* JXL_RESTRICT group_dec_cache,
                               size_t thread,
                               RenderPipelineInput& render_pipeline_input,
                               jpeg::JPEGData* JXL_RESTRICT jpeg_data,
                               AuxOut* aux_out) {
  JxlMemoryManager* const mem = dec_state->memory_manager();

  // Coefficient source backed by the encoder's images.
  JXL_ASSIGN_OR_RETURN(
      GetBlockFromEncoder encoder_blocks,
      GetBlockFromEncoder::Create(ac, group_idx, frame_header.passes.shift));

  // One bit set for each valid AC strategy.
  const auto all_strategies = (1u << AcStrategy::kNumValidStrategies) - 1;
  JXL_RETURN_IF_ERROR(group_dec_cache->InitOnce(mem, /*num_passes=*/0,
                                                /*used_acs=*/all_strategies));

  return HWY_DYNAMIC_DISPATCH(DecodeGroupImpl)(
      frame_header, &encoder_blocks, group_dec_cache, dec_state, thread,
      group_idx, render_pipeline_input, jpeg_data, kDraw);
}
842
843
}  // namespace jxl
844
#endif  // HWY_ONCE