Coverage Report

Created: 2025-06-22 08:04

/src/libjxl/lib/jxl/dec_group.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/dec_group.h"
7
8
#include <algorithm>
9
#include <cstdint>
10
#include <cstring>
11
#include <memory>
12
#include <utility>
13
14
#include "lib/jxl/chroma_from_luma.h"
15
#include "lib/jxl/frame_header.h"
16
17
#undef HWY_TARGET_INCLUDE
18
#define HWY_TARGET_INCLUDE "lib/jxl/dec_group.cc"
19
#include <hwy/foreach_target.h>
20
#include <hwy/highway.h>
21
22
#include "lib/jxl/ac_context.h"
23
#include "lib/jxl/ac_strategy.h"
24
#include "lib/jxl/base/bits.h"
25
#include "lib/jxl/base/common.h"
26
#include "lib/jxl/base/printf_macros.h"
27
#include "lib/jxl/base/rect.h"
28
#include "lib/jxl/base/status.h"
29
#include "lib/jxl/coeff_order.h"
30
#include "lib/jxl/common.h"  // kMaxNumPasses
31
#include "lib/jxl/dec_cache.h"
32
#include "lib/jxl/dec_transforms-inl.h"
33
#include "lib/jxl/dec_xyb.h"
34
#include "lib/jxl/entropy_coder.h"
35
#include "lib/jxl/quant_weights.h"
36
#include "lib/jxl/quantizer-inl.h"
37
#include "lib/jxl/quantizer.h"
38
39
#ifndef LIB_JXL_DEC_GROUP_CC
40
#define LIB_JXL_DEC_GROUP_CC
41
namespace jxl {
42
43
struct AuxOut;
44
45
// Interface for reading groups for DecodeGroupImpl.
46
class GetBlock {
47
 public:
48
  virtual void StartRow(size_t by) = 0;
49
  virtual Status LoadBlock(size_t bx, size_t by, const AcStrategy& acs,
50
                           size_t size, size_t log2_covered_blocks,
51
                           ACPtr block[3], ACType ac_type) = 0;
52
10.2k
  virtual ~GetBlock() {}
53
};
54
55
// Controls whether DecodeGroupImpl renders to pixels or not.
56
enum DrawMode {
57
  // Render to pixels.
58
  kDraw = 0,
59
  // Don't render to pixels.
60
  kDontDraw = 1,
61
};
62
63
}  // namespace jxl
64
#endif  // LIB_JXL_DEC_GROUP_CC
65
66
HWY_BEFORE_NAMESPACE();
67
namespace jxl {
68
namespace HWY_NAMESPACE {
69
70
// These templates are not found via ADL.
71
using hwy::HWY_NAMESPACE::AllFalse;
72
using hwy::HWY_NAMESPACE::Gt;
73
using hwy::HWY_NAMESPACE::Le;
74
using hwy::HWY_NAMESPACE::MaskFromVec;
75
using hwy::HWY_NAMESPACE::Or;
76
using hwy::HWY_NAMESPACE::Rebind;
77
using hwy::HWY_NAMESPACE::ShiftRight;
78
79
using D = HWY_FULL(float);
80
using DU = HWY_FULL(uint32_t);
81
using DI = HWY_FULL(int32_t);
82
using DI16 = Rebind<int16_t, DI>;
83
using DI16_FULL = HWY_CAPPED(int16_t, kDCTBlockSize);
84
constexpr D d;
85
constexpr DI di;
86
constexpr DI16 di16;
87
constexpr DI16_FULL di16_full;
88
89
// TODO(veluca): consider SIMDfying.
90
0
void Transpose8x8InPlace(int32_t* JXL_RESTRICT block) {
91
0
  for (size_t x = 0; x < 8; x++) {
92
0
    for (size_t y = x + 1; y < 8; y++) {
93
0
      std::swap(block[y * 8 + x], block[x * 8 + y]);
94
0
    }
95
0
  }
96
0
}
97
98
template <ACType ac_type>
99
void DequantLane(Vec<D> scaled_dequant_x, Vec<D> scaled_dequant_y,
100
                 Vec<D> scaled_dequant_b,
101
                 const float* JXL_RESTRICT dequant_matrices, size_t size,
102
                 size_t k, Vec<D> x_cc_mul, Vec<D> b_cc_mul,
103
                 const float* JXL_RESTRICT biases, ACPtr qblock[3],
104
15.9M
                 float* JXL_RESTRICT block) {
105
15.9M
  const auto x_mul = Mul(Load(d, dequant_matrices + k), scaled_dequant_x);
106
15.9M
  const auto y_mul =
107
15.9M
      Mul(Load(d, dequant_matrices + size + k), scaled_dequant_y);
108
15.9M
  const auto b_mul =
109
15.9M
      Mul(Load(d, dequant_matrices + 2 * size + k), scaled_dequant_b);
110
111
15.9M
  Vec<DI> quantized_x_int;
112
15.9M
  Vec<DI> quantized_y_int;
113
15.9M
  Vec<DI> quantized_b_int;
114
15.9M
  if (ac_type == ACType::k16) {
115
11.2M
    Rebind<int16_t, DI> di16;
116
11.2M
    quantized_x_int = PromoteTo(di, Load(di16, qblock[0].ptr16 + k));
117
11.2M
    quantized_y_int = PromoteTo(di, Load(di16, qblock[1].ptr16 + k));
118
11.2M
    quantized_b_int = PromoteTo(di, Load(di16, qblock[2].ptr16 + k));
119
11.2M
  } else {
120
4.67M
    quantized_x_int = Load(di, qblock[0].ptr32 + k);
121
4.67M
    quantized_y_int = Load(di, qblock[1].ptr32 + k);
122
4.67M
    quantized_b_int = Load(di, qblock[2].ptr32 + k);
123
4.67M
  }
124
125
15.9M
  const auto dequant_x_cc =
126
15.9M
      Mul(AdjustQuantBias(di, 0, quantized_x_int, biases), x_mul);
127
15.9M
  const auto dequant_y =
128
15.9M
      Mul(AdjustQuantBias(di, 1, quantized_y_int, biases), y_mul);
129
15.9M
  const auto dequant_b_cc =
130
15.9M
      Mul(AdjustQuantBias(di, 2, quantized_b_int, biases), b_mul);
131
132
15.9M
  const auto dequant_x = MulAdd(x_cc_mul, dequant_y, dequant_x_cc);
133
15.9M
  const auto dequant_b = MulAdd(b_cc_mul, dequant_y, dequant_b_cc);
134
15.9M
  Store(dequant_x, d, block + k);
135
15.9M
  Store(dequant_y, d, block + size + k);
136
15.9M
  Store(dequant_b, d, block + 2 * size + k);
137
15.9M
}
void jxl::N_SCALAR::DequantLane<(jxl::ACType)0>(hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>, float const*, unsigned long, unsigned long, hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>, float const*, jxl::ACPtr*, float*)
Line
Count
Source
104
11.2M
                 float* JXL_RESTRICT block) {
105
11.2M
  const auto x_mul = Mul(Load(d, dequant_matrices + k), scaled_dequant_x);
106
11.2M
  const auto y_mul =
107
11.2M
      Mul(Load(d, dequant_matrices + size + k), scaled_dequant_y);
108
11.2M
  const auto b_mul =
109
11.2M
      Mul(Load(d, dequant_matrices + 2 * size + k), scaled_dequant_b);
110
111
11.2M
  Vec<DI> quantized_x_int;
112
11.2M
  Vec<DI> quantized_y_int;
113
11.2M
  Vec<DI> quantized_b_int;
114
11.2M
  if (ac_type == ACType::k16) {
115
11.2M
    Rebind<int16_t, DI> di16;
116
11.2M
    quantized_x_int = PromoteTo(di, Load(di16, qblock[0].ptr16 + k));
117
11.2M
    quantized_y_int = PromoteTo(di, Load(di16, qblock[1].ptr16 + k));
118
11.2M
    quantized_b_int = PromoteTo(di, Load(di16, qblock[2].ptr16 + k));
119
18.4E
  } else {
120
18.4E
    quantized_x_int = Load(di, qblock[0].ptr32 + k);
121
18.4E
    quantized_y_int = Load(di, qblock[1].ptr32 + k);
122
18.4E
    quantized_b_int = Load(di, qblock[2].ptr32 + k);
123
18.4E
  }
124
125
11.2M
  const auto dequant_x_cc =
126
11.2M
      Mul(AdjustQuantBias(di, 0, quantized_x_int, biases), x_mul);
127
11.2M
  const auto dequant_y =
128
11.2M
      Mul(AdjustQuantBias(di, 1, quantized_y_int, biases), y_mul);
129
11.2M
  const auto dequant_b_cc =
130
11.2M
      Mul(AdjustQuantBias(di, 2, quantized_b_int, biases), b_mul);
131
132
11.2M
  const auto dequant_x = MulAdd(x_cc_mul, dequant_y, dequant_x_cc);
133
11.2M
  const auto dequant_b = MulAdd(b_cc_mul, dequant_y, dequant_b_cc);
134
11.2M
  Store(dequant_x, d, block + k);
135
11.2M
  Store(dequant_y, d, block + size + k);
136
11.2M
  Store(dequant_b, d, block + 2 * size + k);
137
11.2M
}
void jxl::N_SCALAR::DequantLane<(jxl::ACType)1>(hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>, float const*, unsigned long, unsigned long, hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>, float const*, jxl::ACPtr*, float*)
Line
Count
Source
104
4.73M
                 float* JXL_RESTRICT block) {
105
4.73M
  const auto x_mul = Mul(Load(d, dequant_matrices + k), scaled_dequant_x);
106
4.73M
  const auto y_mul =
107
4.73M
      Mul(Load(d, dequant_matrices + size + k), scaled_dequant_y);
108
4.73M
  const auto b_mul =
109
4.73M
      Mul(Load(d, dequant_matrices + 2 * size + k), scaled_dequant_b);
110
111
4.73M
  Vec<DI> quantized_x_int;
112
4.73M
  Vec<DI> quantized_y_int;
113
4.73M
  Vec<DI> quantized_b_int;
114
4.73M
  if (ac_type == ACType::k16) {
115
0
    Rebind<int16_t, DI> di16;
116
0
    quantized_x_int = PromoteTo(di, Load(di16, qblock[0].ptr16 + k));
117
0
    quantized_y_int = PromoteTo(di, Load(di16, qblock[1].ptr16 + k));
118
0
    quantized_b_int = PromoteTo(di, Load(di16, qblock[2].ptr16 + k));
119
4.73M
  } else {
120
4.73M
    quantized_x_int = Load(di, qblock[0].ptr32 + k);
121
4.73M
    quantized_y_int = Load(di, qblock[1].ptr32 + k);
122
4.73M
    quantized_b_int = Load(di, qblock[2].ptr32 + k);
123
4.73M
  }
124
125
4.73M
  const auto dequant_x_cc =
126
4.73M
      Mul(AdjustQuantBias(di, 0, quantized_x_int, biases), x_mul);
127
4.73M
  const auto dequant_y =
128
4.73M
      Mul(AdjustQuantBias(di, 1, quantized_y_int, biases), y_mul);
129
4.73M
  const auto dequant_b_cc =
130
4.73M
      Mul(AdjustQuantBias(di, 2, quantized_b_int, biases), b_mul);
131
132
4.73M
  const auto dequant_x = MulAdd(x_cc_mul, dequant_y, dequant_x_cc);
133
4.73M
  const auto dequant_b = MulAdd(b_cc_mul, dequant_y, dequant_b_cc);
134
4.73M
  Store(dequant_x, d, block + k);
135
4.73M
  Store(dequant_y, d, block + size + k);
136
4.73M
  Store(dequant_b, d, block + 2 * size + k);
137
4.73M
}
138
139
template <ACType ac_type>
140
void DequantBlock(const AcStrategy& acs, float inv_global_scale, int quant,
141
                  float x_dm_multiplier, float b_dm_multiplier, Vec<D> x_cc_mul,
142
                  Vec<D> b_cc_mul, AcStrategyType kind, size_t size,
143
                  const Quantizer& quantizer, size_t covered_blocks,
144
                  const size_t* sbx,
145
                  const float* JXL_RESTRICT* JXL_RESTRICT dc_row,
146
                  size_t dc_stride, const float* JXL_RESTRICT biases,
147
                  ACPtr qblock[3], float* JXL_RESTRICT block,
148
203k
                  float* JXL_RESTRICT scratch) {
149
203k
  const auto scaled_dequant_s = inv_global_scale / quant;
150
151
203k
  const auto scaled_dequant_x = Set(d, scaled_dequant_s * x_dm_multiplier);
152
203k
  const auto scaled_dequant_y = Set(d, scaled_dequant_s);
153
203k
  const auto scaled_dequant_b = Set(d, scaled_dequant_s * b_dm_multiplier);
154
155
203k
  const float* dequant_matrices = quantizer.DequantMatrix(kind, 0);
156
157
16.0M
  for (size_t k = 0; k < covered_blocks * kDCTBlockSize; k += Lanes(d)) {
158
15.8M
    DequantLane<ac_type>(scaled_dequant_x, scaled_dequant_y, scaled_dequant_b,
159
15.8M
                         dequant_matrices, size, k, x_cc_mul, b_cc_mul, biases,
160
15.8M
                         qblock, block);
161
15.8M
  }
162
811k
  for (size_t c = 0; c < 3; c++) {
163
608k
    LowestFrequenciesFromDC(acs.Strategy(), dc_row[c] + sbx[c], dc_stride,
164
608k
                            block + c * size, scratch);
165
608k
  }
166
203k
}
void jxl::N_SCALAR::DequantBlock<(jxl::ACType)0>(jxl::AcStrategy const&, float, int, float, float, hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>, jxl::AcStrategyType, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*)
Line
Count
Source
148
136k
                  float* JXL_RESTRICT scratch) {
149
136k
  const auto scaled_dequant_s = inv_global_scale / quant;
150
151
136k
  const auto scaled_dequant_x = Set(d, scaled_dequant_s * x_dm_multiplier);
152
136k
  const auto scaled_dequant_y = Set(d, scaled_dequant_s);
153
136k
  const auto scaled_dequant_b = Set(d, scaled_dequant_s * b_dm_multiplier);
154
155
136k
  const float* dequant_matrices = quantizer.DequantMatrix(kind, 0);
156
157
11.2M
  for (size_t k = 0; k < covered_blocks * kDCTBlockSize; k += Lanes(d)) {
158
11.1M
    DequantLane<ac_type>(scaled_dequant_x, scaled_dequant_y, scaled_dequant_b,
159
11.1M
                         dequant_matrices, size, k, x_cc_mul, b_cc_mul, biases,
160
11.1M
                         qblock, block);
161
11.1M
  }
162
546k
  for (size_t c = 0; c < 3; c++) {
163
409k
    LowestFrequenciesFromDC(acs.Strategy(), dc_row[c] + sbx[c], dc_stride,
164
409k
                            block + c * size, scratch);
165
409k
  }
166
136k
}
void jxl::N_SCALAR::DequantBlock<(jxl::ACType)1>(jxl::AcStrategy const&, float, int, float, float, hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>, jxl::AcStrategyType, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*)
Line
Count
Source
148
66.2k
                  float* JXL_RESTRICT scratch) {
149
66.2k
  const auto scaled_dequant_s = inv_global_scale / quant;
150
151
66.2k
  const auto scaled_dequant_x = Set(d, scaled_dequant_s * x_dm_multiplier);
152
66.2k
  const auto scaled_dequant_y = Set(d, scaled_dequant_s);
153
66.2k
  const auto scaled_dequant_b = Set(d, scaled_dequant_s * b_dm_multiplier);
154
155
66.2k
  const float* dequant_matrices = quantizer.DequantMatrix(kind, 0);
156
157
4.79M
  for (size_t k = 0; k < covered_blocks * kDCTBlockSize; k += Lanes(d)) {
158
4.72M
    DequantLane<ac_type>(scaled_dequant_x, scaled_dequant_y, scaled_dequant_b,
159
4.72M
                         dequant_matrices, size, k, x_cc_mul, b_cc_mul, biases,
160
4.72M
                         qblock, block);
161
4.72M
  }
162
265k
  for (size_t c = 0; c < 3; c++) {
163
198k
    LowestFrequenciesFromDC(acs.Strategy(), dc_row[c] + sbx[c], dc_stride,
164
198k
                            block + c * size, scratch);
165
198k
  }
166
66.2k
}
167
168
Status DecodeGroupImpl(const FrameHeader& frame_header,
169
                       GetBlock* JXL_RESTRICT get_block,
170
                       GroupDecCache* JXL_RESTRICT group_dec_cache,
171
                       PassesDecoderState* JXL_RESTRICT dec_state,
172
                       size_t thread, size_t group_idx,
173
                       RenderPipelineInput& render_pipeline_input,
174
10.2k
                       jpeg::JPEGData* jpeg_data, DrawMode draw) {
175
  // TODO(veluca): investigate cache usage in this function.
176
10.2k
  const Rect block_rect =
177
10.2k
      dec_state->shared->frame_dim.BlockGroupRect(group_idx);
178
10.2k
  const AcStrategyImage& ac_strategy = dec_state->shared->ac_strategy;
179
180
10.2k
  const size_t xsize_blocks = block_rect.xsize();
181
10.2k
  const size_t ysize_blocks = block_rect.ysize();
182
183
10.2k
  const size_t dc_stride = dec_state->shared->dc->PixelsPerRow();
184
185
10.2k
  const float inv_global_scale = dec_state->shared->quantizer.InvGlobalScale();
186
187
10.2k
  const YCbCrChromaSubsampling& cs = frame_header.chroma_subsampling;
188
189
10.2k
  const auto kJpegDctMin = Set(di16_full, -4095);
190
10.2k
  const auto kJpegDctMax = Set(di16_full, 4095);
191
192
10.2k
  size_t idct_stride[3];
193
41.1k
  for (size_t c = 0; c < 3; c++) {
194
30.8k
    idct_stride[c] = render_pipeline_input.GetBuffer(c).first->PixelsPerRow();
195
30.8k
  }
196
197
10.2k
  HWY_ALIGN int32_t scaled_qtable[64 * 3];
198
199
10.2k
  ACType ac_type = dec_state->coefficients->Type();
200
10.2k
  auto dequant_block = ac_type == ACType::k16 ? DequantBlock<ACType::k16>
201
10.2k
                                              : DequantBlock<ACType::k32>;
202
  // Whether or not coefficients should be stored for future usage, and/or read
203
  // from past usage.
204
10.2k
  bool accumulate = !dec_state->coefficients->IsEmpty();
205
  // Offset of the current block in the group.
206
10.2k
  size_t offset = 0;
207
208
10.2k
  std::array<int, 3> jpeg_c_map;
209
10.2k
  bool jpeg_is_gray = false;
210
10.2k
  std::array<int, 3> dcoff = {};
211
212
  // TODO(veluca): all of this should be done only once per image.
213
10.2k
  const ColorCorrelation& color_correlation = dec_state->shared->cmap.base();
214
10.2k
  if (jpeg_data) {
215
0
    if (!color_correlation.IsJPEGCompatible()) {
216
0
      return JXL_FAILURE("The CfL map is not JPEG-compatible");
217
0
    }
218
0
    jpeg_is_gray = (jpeg_data->components.size() == 1);
219
0
    JXL_ENSURE(frame_header.color_transform != ColorTransform::kXYB);
220
0
    jpeg_c_map = JpegOrder(frame_header.color_transform, jpeg_is_gray);
221
0
    const std::vector<QuantEncoding>& qe =
222
0
        dec_state->shared->matrices.encodings();
223
0
    if (qe.empty() || qe[0].mode != QuantEncoding::Mode::kQuantModeRAW ||
224
0
        std::abs(qe[0].qraw.qtable_den - 1.f / (8 * 255)) > 1e-8f) {
225
0
      return JXL_FAILURE(
226
0
          "Quantization table is not a JPEG quantization table.");
227
0
    }
228
0
    JXL_ENSURE(qe[0].qraw.qtable->size() == 3 * 8 * 8);
229
0
    int* qtable = qe[0].qraw.qtable->data();
230
0
    for (size_t c = 0; c < 3; c++) {
231
0
      if (frame_header.color_transform == ColorTransform::kNone) {
232
0
        dcoff[c] = 1024 / qtable[64 * c];
233
0
      }
234
0
      for (size_t i = 0; i < 64; i++) {
235
        // Transpose the matrix, as it will be used on the transposed block.
236
0
        int n = qtable[64 + i];
237
0
        int d = qtable[64 * c + i];
238
0
        if (n <= 0 || d <= 0 || n >= 65536 || d >= 65536) {
239
0
          return JXL_FAILURE("Invalid JPEG quantization table");
240
0
        }
241
0
        scaled_qtable[64 * c + (i % 8) * 8 + (i / 8)] =
242
0
            (1 << kCFLFixedPointPrecision) * n / d;
243
0
      }
244
0
    }
245
0
  }
246
247
10.2k
  size_t hshift[3] = {cs.HShift(0), cs.HShift(1), cs.HShift(2)};
248
10.2k
  size_t vshift[3] = {cs.VShift(0), cs.VShift(1), cs.VShift(2)};
249
10.2k
  Rect r[3];
250
41.1k
  for (size_t i = 0; i < 3; i++) {
251
30.8k
    r[i] =
252
30.8k
        Rect(block_rect.x0() >> hshift[i], block_rect.y0() >> vshift[i],
253
30.8k
             block_rect.xsize() >> hshift[i], block_rect.ysize() >> vshift[i]);
254
30.8k
    if (!r[i].IsInside({0, 0, dec_state->shared->dc->Plane(i).xsize(),
255
30.8k
                        dec_state->shared->dc->Plane(i).ysize()})) {
256
0
      return JXL_FAILURE("Frame dimensions are too big for the image.");
257
0
    }
258
30.8k
  }
259
260
36.7k
  for (size_t by = 0; by < ysize_blocks; ++by) {
261
26.8k
    get_block->StartRow(by);
262
26.8k
    size_t sby[3] = {by >> vshift[0], by >> vshift[1], by >> vshift[2]};
263
264
26.8k
    const int32_t* JXL_RESTRICT row_quant =
265
26.8k
        block_rect.ConstRow(dec_state->shared->raw_quant_field, by);
266
267
26.8k
    const float* JXL_RESTRICT dc_rows[3] = {
268
26.8k
        r[0].ConstPlaneRow(*dec_state->shared->dc, 0, sby[0]),
269
26.8k
        r[1].ConstPlaneRow(*dec_state->shared->dc, 1, sby[1]),
270
26.8k
        r[2].ConstPlaneRow(*dec_state->shared->dc, 2, sby[2]),
271
26.8k
    };
272
273
26.8k
    const size_t ty = (block_rect.y0() + by) / kColorTileDimInBlocks;
274
26.8k
    AcStrategyRow acs_row = ac_strategy.ConstRow(block_rect, by);
275
276
26.8k
    const int8_t* JXL_RESTRICT row_cmap[3] = {
277
26.8k
        dec_state->shared->cmap.ytox_map.ConstRow(ty),
278
26.8k
        nullptr,
279
26.8k
        dec_state->shared->cmap.ytob_map.ConstRow(ty),
280
26.8k
    };
281
282
26.8k
    float* JXL_RESTRICT idct_row[3];
283
26.8k
    int16_t* JXL_RESTRICT jpeg_row[3];
284
107k
    for (size_t c = 0; c < 3; c++) {
285
80.6k
      const auto& buffer = render_pipeline_input.GetBuffer(c);
286
80.6k
      idct_row[c] = buffer.second.Row(buffer.first, sby[c] * kBlockDim);
287
80.6k
      if (jpeg_data) {
288
0
        auto& component = jpeg_data->components[jpeg_c_map[c]];
289
0
        jpeg_row[c] =
290
0
            component.coeffs.data() +
291
0
            (component.width_in_blocks * (r[c].y0() + sby[c]) + r[c].x0()) *
292
0
                kDCTBlockSize;
293
0
      }
294
80.6k
    }
295
296
26.8k
    size_t bx = 0;
297
72.7k
    for (size_t tx = 0; tx < DivCeil(xsize_blocks, kColorTileDimInBlocks);
298
46.2k
         tx++) {
299
46.2k
      size_t abs_tx = tx + block_rect.x0() / kColorTileDimInBlocks;
300
46.2k
      auto x_cc_mul = Set(d, color_correlation.YtoXRatio(row_cmap[0][abs_tx]));
301
46.2k
      auto b_cc_mul = Set(d, color_correlation.YtoBRatio(row_cmap[2][abs_tx]));
302
      // Increment bx by llf_x because those iterations would otherwise
303
      // immediately continue (!IsFirstBlock). Reduces mispredictions.
304
260k
      for (; bx < xsize_blocks && bx < (tx + 1) * kColorTileDimInBlocks;) {
305
214k
        size_t sbx[3] = {bx >> hshift[0], bx >> hshift[1], bx >> hshift[2]};
306
214k
        AcStrategy acs = acs_row[bx];
307
214k
        const size_t llf_x = acs.covered_blocks_x();
308
309
        // Can only happen in the second or lower rows of a varblock.
310
214k
        if (JXL_UNLIKELY(!acs.IsFirstBlock())) {
311
11.2k
          bx += llf_x;
312
11.2k
          continue;
313
11.2k
        }
314
203k
        const size_t log2_covered_blocks = acs.log2_covered_blocks();
315
316
203k
        const size_t covered_blocks = 1 << log2_covered_blocks;
317
203k
        const size_t size = covered_blocks * kDCTBlockSize;
318
319
203k
        ACPtr qblock[3];
320
203k
        if (accumulate) {
321
448
          for (size_t c = 0; c < 3; c++) {
322
336
            qblock[c] = dec_state->coefficients->PlaneRow(c, group_idx, offset);
323
336
          }
324
203k
        } else {
325
          // No point in reading from bitstream without accumulating and not
326
          // drawing.
327
203k
          JXL_ENSURE(draw == kDraw);
328
203k
          if (ac_type == ACType::k16) {
329
136k
            memset(group_dec_cache->dec_group_qblock16, 0,
330
136k
                   size * 3 * sizeof(int16_t));
331
547k
            for (size_t c = 0; c < 3; c++) {
332
410k
              qblock[c].ptr16 = group_dec_cache->dec_group_qblock16 + c * size;
333
410k
            }
334
136k
          } else {
335
66.4k
            memset(group_dec_cache->dec_group_qblock, 0,
336
66.4k
                   size * 3 * sizeof(int32_t));
337
265k
            for (size_t c = 0; c < 3; c++) {
338
199k
              qblock[c].ptr32 = group_dec_cache->dec_group_qblock + c * size;
339
199k
            }
340
66.4k
          }
341
203k
        }
342
203k
        JXL_RETURN_IF_ERROR(get_block->LoadBlock(
343
203k
            bx, by, acs, size, log2_covered_blocks, qblock, ac_type));
344
203k
        offset += size;
345
203k
        if (draw == kDontDraw) {
346
107
          bx += llf_x;
347
107
          continue;
348
107
        }
349
350
202k
        if (JXL_UNLIKELY(jpeg_data)) {
351
0
          if (acs.Strategy() != AcStrategyType::DCT) {
352
0
            return JXL_FAILURE(
353
0
                "Can only decode to JPEG if only DCT-8 is used.");
354
0
          }
355
356
0
          HWY_ALIGN int32_t transposed_dct_y[64];
357
0
          for (size_t c : {1, 0, 2}) {
358
            // Propagate only Y for grayscale.
359
0
            if (jpeg_is_gray && c != 1) {
360
0
              continue;
361
0
            }
362
0
            if ((sbx[c] << hshift[c] != bx) || (sby[c] << vshift[c] != by)) {
363
0
              continue;
364
0
            }
365
0
            int16_t* JXL_RESTRICT jpeg_pos =
366
0
                jpeg_row[c] + sbx[c] * kDCTBlockSize;
367
            // JPEG XL is transposed, JPEG is not.
368
0
            auto* transposed_dct = qblock[c].ptr32;
369
0
            Transpose8x8InPlace(transposed_dct);
370
            // No CfL - no need to store the y block converted to integers.
371
0
            if (!cs.Is444() ||
372
0
                (row_cmap[0][abs_tx] == 0 && row_cmap[2][abs_tx] == 0)) {
373
0
              for (size_t i = 0; i < 64; i += Lanes(d)) {
374
0
                const auto ini = Load(di, transposed_dct + i);
375
0
                const auto ini16 = DemoteTo(di16, ini);
376
0
                StoreU(ini16, di16, jpeg_pos + i);
377
0
              }
378
0
            } else if (c == 1) {
379
              // Y channel: save for restoring X/B, but nothing else to do.
380
0
              for (size_t i = 0; i < 64; i += Lanes(d)) {
381
0
                const auto ini = Load(di, transposed_dct + i);
382
0
                Store(ini, di, transposed_dct_y + i);
383
0
                const auto ini16 = DemoteTo(di16, ini);
384
0
                StoreU(ini16, di16, jpeg_pos + i);
385
0
              }
386
0
            } else {
387
              // transposed_dct_y contains the y channel block, transposed.
388
0
              const auto scale =
389
0
                  Set(di, ColorCorrelation::RatioJPEG(row_cmap[c][abs_tx]));
390
0
              const auto round = Set(di, 1 << (kCFLFixedPointPrecision - 1));
391
0
              for (int i = 0; i < 64; i += Lanes(d)) {
392
0
                auto in = Load(di, transposed_dct + i);
393
0
                auto in_y = Load(di, transposed_dct_y + i);
394
0
                auto qt = Load(di, scaled_qtable + c * size + i);
395
0
                auto coeff_scale = ShiftRight<kCFLFixedPointPrecision>(
396
0
                    Add(Mul(qt, scale), round));
397
0
                auto cfl_factor = ShiftRight<kCFLFixedPointPrecision>(
398
0
                    Add(Mul(in_y, coeff_scale), round));
399
0
                StoreU(DemoteTo(di16, Add(in, cfl_factor)), di16, jpeg_pos + i);
400
0
              }
401
0
            }
402
0
            jpeg_pos[0] =
403
0
                Clamp1<float>(dc_rows[c][sbx[c]] - dcoff[c], -2047, 2047);
404
0
            auto overflow = MaskFromVec(Set(di16_full, 0));
405
0
            auto underflow = MaskFromVec(Set(di16_full, 0));
406
0
            for (int i = 0; i < 64; i += Lanes(di16_full)) {
407
0
              auto in = LoadU(di16_full, jpeg_pos + i);
408
0
              overflow = Or(overflow, Gt(in, kJpegDctMax));
409
0
              underflow = Or(underflow, Lt(in, kJpegDctMin));
410
0
            }
411
0
            if (!AllFalse(di16_full, Or(overflow, underflow))) {
412
0
              return JXL_FAILURE("JPEG DCT coefficients out of range");
413
0
            }
414
0
          }
415
202k
        } else {
416
202k
          HWY_ALIGN float* const block = group_dec_cache->dec_group_block;
417
          // Dequantize and add predictions.
418
202k
          dequant_block(
419
202k
              acs, inv_global_scale, row_quant[bx], dec_state->x_dm_multiplier,
420
202k
              dec_state->b_dm_multiplier, x_cc_mul, b_cc_mul, acs.Strategy(),
421
202k
              size, dec_state->shared->quantizer,
422
202k
              acs.covered_blocks_y() * acs.covered_blocks_x(), sbx, dc_rows,
423
202k
              dc_stride,
424
202k
              dec_state->output_encoding_info.opsin_params.quant_biases, qblock,
425
202k
              block, group_dec_cache->scratch_space);
426
427
608k
          for (size_t c : {1, 0, 2}) {
428
608k
            if ((sbx[c] << hshift[c] != bx) || (sby[c] << vshift[c] != by)) {
429
6.38k
              continue;
430
6.38k
            }
431
            // IDCT
432
601k
            float* JXL_RESTRICT idct_pos = idct_row[c] + sbx[c] * kBlockDim;
433
601k
            TransformToPixels(acs.Strategy(), block + c * size, idct_pos,
434
601k
                              idct_stride[c], group_dec_cache->scratch_space);
435
601k
          }
436
202k
        }
437
202k
        bx += llf_x;
438
202k
      }
439
46.2k
    }
440
26.8k
  }
441
9.91k
  return true;
442
10.2k
}
443
444
// NOLINTNEXTLINE(google-readability-namespace-comments)
445
}  // namespace HWY_NAMESPACE
446
}  // namespace jxl
447
HWY_AFTER_NAMESPACE();
448
449
#if HWY_ONCE
450
namespace jxl {
451
namespace {
452
// Decode quantized AC coefficients of DCT blocks.
453
// LLF components in the output block will not be modified.
454
template <ACType ac_type, bool uses_lz77>
455
Status DecodeACVarBlock(size_t ctx_offset, size_t log2_covered_blocks,
456
                        int32_t* JXL_RESTRICT row_nzeros,
457
                        const int32_t* JXL_RESTRICT row_nzeros_top,
458
                        size_t nzeros_stride, size_t c, size_t bx, size_t by,
459
                        size_t lbx, AcStrategy acs,
460
                        const coeff_order_t* JXL_RESTRICT coeff_order,
461
                        BitReader* JXL_RESTRICT br,
462
                        ANSSymbolReader* JXL_RESTRICT decoder,
463
                        const std::vector<uint8_t>& context_map,
464
                        const uint8_t* qdc_row, const int32_t* qf_row,
465
                        const BlockCtxMap& block_ctx_map, ACPtr block,
466
603k
                        size_t shift = 0) {
467
  // Equal to number of LLF coefficients.
468
603k
  const size_t covered_blocks = 1 << log2_covered_blocks;
469
603k
  const size_t size = covered_blocks * kDCTBlockSize;
470
603k
  int32_t predicted_nzeros =
471
603k
      PredictFromTopAndLeft(row_nzeros_top, row_nzeros, bx, 32);
472
473
603k
  size_t ord = kStrategyOrder[acs.RawStrategy()];
474
603k
  const coeff_order_t* JXL_RESTRICT order =
475
603k
      &coeff_order[CoeffOrderOffset(ord, c)];
476
477
603k
  size_t block_ctx = block_ctx_map.Context(qdc_row[lbx], qf_row[bx], ord, c);
478
603k
  const int32_t nzero_ctx =
479
603k
      block_ctx_map.NonZeroContext(predicted_nzeros, block_ctx) + ctx_offset;
480
481
603k
  size_t nzeros =
482
603k
      decoder->ReadHybridUintInlined<uses_lz77>(nzero_ctx, br, context_map);
483
603k
  if (nzeros > size - covered_blocks) {
484
202
    return JXL_FAILURE("Invalid AC: nzeros %" PRIuS " too large for %" PRIuS
485
202
                       " 8x8 blocks",
486
202
                       nzeros, covered_blocks);
487
202
  }
488
1.24M
  for (size_t y = 0; y < acs.covered_blocks_y(); y++) {
489
1.42M
    for (size_t x = 0; x < acs.covered_blocks_x(); x++) {
490
789k
      row_nzeros[bx + x + y * nzeros_stride] =
491
789k
          (nzeros + covered_blocks - 1) >> log2_covered_blocks;
492
789k
    }
493
637k
  }
494
495
603k
  const size_t histo_offset =
496
603k
      ctx_offset + block_ctx_map.ZeroDensityContextsOffset(block_ctx);
497
498
603k
  size_t prev = (nzeros > size / 16 ? 0 : 1);
499
2.84M
  for (size_t k = covered_blocks; k < size && nzeros != 0; ++k) {
500
2.24M
    const size_t ctx =
501
2.24M
        histo_offset + ZeroDensityContext(nzeros, k, covered_blocks,
502
2.24M
                                          log2_covered_blocks, prev);
503
2.24M
    const size_t u_coeff =
504
2.24M
        decoder->ReadHybridUintInlined<uses_lz77>(ctx, br, context_map);
505
    // Hand-rolled version of UnpackSigned, shifting before the conversion to
506
    // signed integer to avoid undefined behavior of shifting negative numbers.
507
2.24M
    const size_t magnitude = u_coeff >> 1;
508
2.24M
    const size_t neg_sign = (~u_coeff) & 1;
509
2.24M
    const intptr_t coeff =
510
2.24M
        static_cast<intptr_t>((magnitude ^ (neg_sign - 1)) << shift);
511
2.24M
    if (ac_type == ACType::k16) {
512
1.21M
      block.ptr16[order[k]] += coeff;
513
1.21M
    } else {
514
1.02M
      block.ptr32[order[k]] += coeff;
515
1.02M
    }
516
2.24M
    prev = static_cast<size_t>(u_coeff != 0);
517
2.24M
    nzeros -= prev;
518
2.24M
  }
519
603k
  if (JXL_UNLIKELY(nzeros != 0)) {
520
164
    return JXL_FAILURE("Invalid AC: nzeros at end of block is %" PRIuS
521
164
                       ", should be 0. Block (%" PRIuS ", %" PRIuS
522
164
                       "), channel %" PRIuS,
523
164
                       nzeros, bx, by, c);
524
164
  }
525
526
603k
  return true;
527
603k
}
dec_group.cc:jxl::Status jxl::(anonymous namespace)::DecodeACVarBlock<(jxl::ACType)0, true>(unsigned long, unsigned long, int*, int const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, jxl::AcStrategy, unsigned int const*, jxl::BitReader*, jxl::ANSSymbolReader*, std::__1::vector<unsigned char, std::__1::allocator<unsigned char> > const&, unsigned char const*, int const*, jxl::BlockCtxMap const&, jxl::ACPtr, unsigned long)
Line
Count
Source
466
197k
                        size_t shift = 0) {
467
  // Equal to number of LLF coefficients.
468
197k
  const size_t covered_blocks = 1 << log2_covered_blocks;
469
197k
  const size_t size = covered_blocks * kDCTBlockSize;
470
197k
  int32_t predicted_nzeros =
471
197k
      PredictFromTopAndLeft(row_nzeros_top, row_nzeros, bx, 32);
472
473
197k
  size_t ord = kStrategyOrder[acs.RawStrategy()];
474
197k
  const coeff_order_t* JXL_RESTRICT order =
475
197k
      &coeff_order[CoeffOrderOffset(ord, c)];
476
477
197k
  size_t block_ctx = block_ctx_map.Context(qdc_row[lbx], qf_row[bx], ord, c);
478
197k
  const int32_t nzero_ctx =
479
197k
      block_ctx_map.NonZeroContext(predicted_nzeros, block_ctx) + ctx_offset;
480
481
197k
  size_t nzeros =
482
197k
      decoder->ReadHybridUintInlined<uses_lz77>(nzero_ctx, br, context_map);
483
197k
  if (nzeros > size - covered_blocks) {
484
12
    return JXL_FAILURE("Invalid AC: nzeros %" PRIuS " too large for %" PRIuS
485
12
                       " 8x8 blocks",
486
12
                       nzeros, covered_blocks);
487
12
  }
488
403k
  for (size_t y = 0; y < acs.covered_blocks_y(); y++) {
489
486k
    for (size_t x = 0; x < acs.covered_blocks_x(); x++) {
490
280k
      row_nzeros[bx + x + y * nzeros_stride] =
491
280k
          (nzeros + covered_blocks - 1) >> log2_covered_blocks;
492
280k
    }
493
205k
  }
494
495
197k
  const size_t histo_offset =
496
197k
      ctx_offset + block_ctx_map.ZeroDensityContextsOffset(block_ctx);
497
498
197k
  size_t prev = (nzeros > size / 16 ? 0 : 1);
499
573k
  for (size_t k = covered_blocks; k < size && nzeros != 0; ++k) {
500
376k
    const size_t ctx =
501
376k
        histo_offset + ZeroDensityContext(nzeros, k, covered_blocks,
502
376k
                                          log2_covered_blocks, prev);
503
376k
    const size_t u_coeff =
504
376k
        decoder->ReadHybridUintInlined<uses_lz77>(ctx, br, context_map);
505
    // Hand-rolled version of UnpackSigned, shifting before the conversion to
506
    // signed integer to avoid undefined behavior of shifting negative numbers.
507
376k
    const size_t magnitude = u_coeff >> 1;
508
376k
    const size_t neg_sign = (~u_coeff) & 1;
509
376k
    const intptr_t coeff =
510
376k
        static_cast<intptr_t>((magnitude ^ (neg_sign - 1)) << shift);
511
376k
    if (ac_type == ACType::k16) {
512
376k
      block.ptr16[order[k]] += coeff;
513
376k
    } else {
514
1
      block.ptr32[order[k]] += coeff;
515
1
    }
516
376k
    prev = static_cast<size_t>(u_coeff != 0);
517
376k
    nzeros -= prev;
518
376k
  }
519
197k
  if (JXL_UNLIKELY(nzeros != 0)) {
520
67
    return JXL_FAILURE("Invalid AC: nzeros at end of block is %" PRIuS
521
67
                       ", should be 0. Block (%" PRIuS ", %" PRIuS
522
67
                       "), channel %" PRIuS,
523
67
                       nzeros, bx, by, c);
524
67
  }
525
526
197k
  return true;
527
197k
}
dec_group.cc:jxl::Status jxl::(anonymous namespace)::DecodeACVarBlock<(jxl::ACType)1, true>(unsigned long, unsigned long, int*, int const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, jxl::AcStrategy, unsigned int const*, jxl::BitReader*, jxl::ANSSymbolReader*, std::__1::vector<unsigned char, std::__1::allocator<unsigned char> > const&, unsigned char const*, int const*, jxl::BlockCtxMap const&, jxl::ACPtr, unsigned long)
Line
Count
Source
466
40.0k
                        size_t shift = 0) {
467
  // Equal to number of LLF coefficients.
468
40.0k
  const size_t covered_blocks = 1 << log2_covered_blocks;
469
40.0k
  const size_t size = covered_blocks * kDCTBlockSize;
470
40.0k
  int32_t predicted_nzeros =
471
40.0k
      PredictFromTopAndLeft(row_nzeros_top, row_nzeros, bx, 32);
472
473
40.0k
  size_t ord = kStrategyOrder[acs.RawStrategy()];
474
40.0k
  const coeff_order_t* JXL_RESTRICT order =
475
40.0k
      &coeff_order[CoeffOrderOffset(ord, c)];
476
477
40.0k
  size_t block_ctx = block_ctx_map.Context(qdc_row[lbx], qf_row[bx], ord, c);
478
40.0k
  const int32_t nzero_ctx =
479
40.0k
      block_ctx_map.NonZeroContext(predicted_nzeros, block_ctx) + ctx_offset;
480
481
40.0k
  size_t nzeros =
482
40.0k
      decoder->ReadHybridUintInlined<uses_lz77>(nzero_ctx, br, context_map);
483
40.0k
  if (nzeros > size - covered_blocks) {
484
88
    return JXL_FAILURE("Invalid AC: nzeros %" PRIuS " too large for %" PRIuS
485
88
                       " 8x8 blocks",
486
88
                       nzeros, covered_blocks);
487
88
  }
488
82.2k
  for (size_t y = 0; y < acs.covered_blocks_y(); y++) {
489
101k
    for (size_t x = 0; x < acs.covered_blocks_x(); x++) {
490
59.5k
      row_nzeros[bx + x + y * nzeros_stride] =
491
59.5k
          (nzeros + covered_blocks - 1) >> log2_covered_blocks;
492
59.5k
    }
493
42.2k
  }
494
495
40.0k
  const size_t histo_offset =
496
40.0k
      ctx_offset + block_ctx_map.ZeroDensityContextsOffset(block_ctx);
497
498
40.0k
  size_t prev = (nzeros > size / 16 ? 0 : 1);
499
163k
  for (size_t k = covered_blocks; k < size && nzeros != 0; ++k) {
500
123k
    const size_t ctx =
501
123k
        histo_offset + ZeroDensityContext(nzeros, k, covered_blocks,
502
123k
                                          log2_covered_blocks, prev);
503
123k
    const size_t u_coeff =
504
123k
        decoder->ReadHybridUintInlined<uses_lz77>(ctx, br, context_map);
505
    // Hand-rolled version of UnpackSigned, shifting before the conversion to
506
    // signed integer to avoid undefined behavior of shifting negative numbers.
507
123k
    const size_t magnitude = u_coeff >> 1;
508
123k
    const size_t neg_sign = (~u_coeff) & 1;
509
123k
    const intptr_t coeff =
510
123k
        static_cast<intptr_t>((magnitude ^ (neg_sign - 1)) << shift);
511
123k
    if (ac_type == ACType::k16) {
512
0
      block.ptr16[order[k]] += coeff;
513
123k
    } else {
514
123k
      block.ptr32[order[k]] += coeff;
515
123k
    }
516
123k
    prev = static_cast<size_t>(u_coeff != 0);
517
123k
    nzeros -= prev;
518
123k
  }
519
40.0k
  if (JXL_UNLIKELY(nzeros != 0)) {
520
14
    return JXL_FAILURE("Invalid AC: nzeros at end of block is %" PRIuS
521
14
                       ", should be 0. Block (%" PRIuS ", %" PRIuS
522
14
                       "), channel %" PRIuS,
523
14
                       nzeros, bx, by, c);
524
14
  }
525
526
39.9k
  return true;
527
40.0k
}
dec_group.cc:jxl::Status jxl::(anonymous namespace)::DecodeACVarBlock<(jxl::ACType)0, false>(unsigned long, unsigned long, int*, int const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, jxl::AcStrategy, unsigned int const*, jxl::BitReader*, jxl::ANSSymbolReader*, std::__1::vector<unsigned char, std::__1::allocator<unsigned char> > const&, unsigned char const*, int const*, jxl::BlockCtxMap const&, jxl::ACPtr, unsigned long)
Line
Count
Source
466
206k
                        size_t shift = 0) {
467
  // Equal to number of LLF coefficients.
468
206k
  const size_t covered_blocks = 1 << log2_covered_blocks;
469
206k
  const size_t size = covered_blocks * kDCTBlockSize;
470
206k
  int32_t predicted_nzeros =
471
206k
      PredictFromTopAndLeft(row_nzeros_top, row_nzeros, bx, 32);
472
473
206k
  size_t ord = kStrategyOrder[acs.RawStrategy()];
474
206k
  const coeff_order_t* JXL_RESTRICT order =
475
206k
      &coeff_order[CoeffOrderOffset(ord, c)];
476
477
206k
  size_t block_ctx = block_ctx_map.Context(qdc_row[lbx], qf_row[bx], ord, c);
478
206k
  const int32_t nzero_ctx =
479
206k
      block_ctx_map.NonZeroContext(predicted_nzeros, block_ctx) + ctx_offset;
480
481
206k
  size_t nzeros =
482
206k
      decoder->ReadHybridUintInlined<uses_lz77>(nzero_ctx, br, context_map);
483
206k
  if (nzeros > size - covered_blocks) {
484
14
    return JXL_FAILURE("Invalid AC: nzeros %" PRIuS " too large for %" PRIuS
485
14
                       " 8x8 blocks",
486
14
                       nzeros, covered_blocks);
487
14
  }
488
434k
  for (size_t y = 0; y < acs.covered_blocks_y(); y++) {
489
509k
    for (size_t x = 0; x < acs.covered_blocks_x(); x++) {
490
282k
      row_nzeros[bx + x + y * nzeros_stride] =
491
282k
          (nzeros + covered_blocks - 1) >> log2_covered_blocks;
492
282k
    }
493
227k
  }
494
495
206k
  const size_t histo_offset =
496
206k
      ctx_offset + block_ctx_map.ZeroDensityContextsOffset(block_ctx);
497
498
206k
  size_t prev = (nzeros > size / 16 ? 0 : 1);
499
1.04M
  for (size_t k = covered_blocks; k < size && nzeros != 0; ++k) {
500
842k
    const size_t ctx =
501
842k
        histo_offset + ZeroDensityContext(nzeros, k, covered_blocks,
502
842k
                                          log2_covered_blocks, prev);
503
842k
    const size_t u_coeff =
504
842k
        decoder->ReadHybridUintInlined<uses_lz77>(ctx, br, context_map);
505
    // Hand-rolled version of UnpackSigned, shifting before the conversion to
506
    // signed integer to avoid undefined behavior of shifting negative numbers.
507
842k
    const size_t magnitude = u_coeff >> 1;
508
842k
    const size_t neg_sign = (~u_coeff) & 1;
509
842k
    const intptr_t coeff =
510
842k
        static_cast<intptr_t>((magnitude ^ (neg_sign - 1)) << shift);
511
842k
    if (ac_type == ACType::k16) {
512
842k
      block.ptr16[order[k]] += coeff;
513
842k
    } else {
514
5
      block.ptr32[order[k]] += coeff;
515
5
    }
516
842k
    prev = static_cast<size_t>(u_coeff != 0);
517
842k
    nzeros -= prev;
518
842k
  }
519
206k
  if (JXL_UNLIKELY(nzeros != 0)) {
520
67
    return JXL_FAILURE("Invalid AC: nzeros at end of block is %" PRIuS
521
67
                       ", should be 0. Block (%" PRIuS ", %" PRIuS
522
67
                       "), channel %" PRIuS,
523
67
                       nzeros, bx, by, c);
524
67
  }
525
526
206k
  return true;
527
206k
}
dec_group.cc:jxl::Status jxl::(anonymous namespace)::DecodeACVarBlock<(jxl::ACType)1, false>(unsigned long, unsigned long, int*, int const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, jxl::AcStrategy, unsigned int const*, jxl::BitReader*, jxl::ANSSymbolReader*, std::__1::vector<unsigned char, std::__1::allocator<unsigned char> > const&, unsigned char const*, int const*, jxl::BlockCtxMap const&, jxl::ACPtr, unsigned long)
Line
Count
Source
466
158k
                        size_t shift = 0) {
467
  // Equal to number of LLF coefficients.
468
158k
  const size_t covered_blocks = 1 << log2_covered_blocks;
469
158k
  const size_t size = covered_blocks * kDCTBlockSize;
470
158k
  int32_t predicted_nzeros =
471
158k
      PredictFromTopAndLeft(row_nzeros_top, row_nzeros, bx, 32);
472
473
158k
  size_t ord = kStrategyOrder[acs.RawStrategy()];
474
158k
  const coeff_order_t* JXL_RESTRICT order =
475
158k
      &coeff_order[CoeffOrderOffset(ord, c)];
476
477
158k
  size_t block_ctx = block_ctx_map.Context(qdc_row[lbx], qf_row[bx], ord, c);
478
158k
  const int32_t nzero_ctx =
479
158k
      block_ctx_map.NonZeroContext(predicted_nzeros, block_ctx) + ctx_offset;
480
481
158k
  size_t nzeros =
482
158k
      decoder->ReadHybridUintInlined<uses_lz77>(nzero_ctx, br, context_map);
483
158k
  if (nzeros > size - covered_blocks) {
484
88
    return JXL_FAILURE("Invalid AC: nzeros %" PRIuS " too large for %" PRIuS
485
88
                       " 8x8 blocks",
486
88
                       nzeros, covered_blocks);
487
88
  }
488
320k
  for (size_t y = 0; y < acs.covered_blocks_y(); y++) {
489
329k
    for (size_t x = 0; x < acs.covered_blocks_x(); x++) {
490
167k
      row_nzeros[bx + x + y * nzeros_stride] =
491
167k
          (nzeros + covered_blocks - 1) >> log2_covered_blocks;
492
167k
    }
493
161k
  }
494
495
158k
  const size_t histo_offset =
496
158k
      ctx_offset + block_ctx_map.ZeroDensityContextsOffset(block_ctx);
497
498
158k
  size_t prev = (nzeros > size / 16 ? 0 : 1);
499
1.05M
  for (size_t k = covered_blocks; k < size && nzeros != 0; ++k) {
500
901k
    const size_t ctx =
501
901k
        histo_offset + ZeroDensityContext(nzeros, k, covered_blocks,
502
901k
                                          log2_covered_blocks, prev);
503
901k
    const size_t u_coeff =
504
901k
        decoder->ReadHybridUintInlined<uses_lz77>(ctx, br, context_map);
505
    // Hand-rolled version of UnpackSigned, shifting before the conversion to
506
    // signed integer to avoid undefined behavior of shifting negative numbers.
507
901k
    const size_t magnitude = u_coeff >> 1;
508
901k
    const size_t neg_sign = (~u_coeff) & 1;
509
901k
    const intptr_t coeff =
510
901k
        static_cast<intptr_t>((magnitude ^ (neg_sign - 1)) << shift);
511
901k
    if (ac_type == ACType::k16) {
512
0
      block.ptr16[order[k]] += coeff;
513
901k
    } else {
514
901k
      block.ptr32[order[k]] += coeff;
515
901k
    }
516
901k
    prev = static_cast<size_t>(u_coeff != 0);
517
901k
    nzeros -= prev;
518
901k
  }
519
158k
  if (JXL_UNLIKELY(nzeros != 0)) {
520
16
    return JXL_FAILURE("Invalid AC: nzeros at end of block is %" PRIuS
521
16
                       ", should be 0. Block (%" PRIuS ", %" PRIuS
522
16
                       "), channel %" PRIuS,
523
16
                       nzeros, bx, by, c);
524
16
  }
525
526
158k
  return true;
527
158k
}
528
529
// Structs used by DecodeGroupImpl to get a quantized block.
530
// GetBlockFromBitstream uses ANS decoding (and thus keeps track of row
531
// pointers in row_nzeros), GetBlockFromEncoder simply reads the coefficient
532
// image provided by the encoder.
533
534
struct GetBlockFromBitstream : public GetBlock {
535
26.8k
  void StartRow(size_t by) override {
536
26.8k
    qf_row = rect.ConstRow(*qf, by);
537
107k
    for (size_t c = 0; c < 3; c++) {
538
80.6k
      size_t sby = by >> vshift[c];
539
80.6k
      quant_dc_row = quant_dc->ConstRow(rect.y0() + by) + rect.x0();
540
161k
      for (size_t i = 0; i < num_passes; i++) {
541
80.6k
        row_nzeros[i][c] = group_dec_cache->num_nzeroes[i].PlaneRow(c, sby);
542
80.6k
        row_nzeros_top[i][c] =
543
80.6k
            sby == 0
544
80.6k
                ? nullptr
545
80.6k
                : group_dec_cache->num_nzeroes[i].ConstPlaneRow(c, sby - 1);
546
80.6k
      }
547
80.6k
    }
548
26.8k
  }
549
550
  Status LoadBlock(size_t bx, size_t by, const AcStrategy& acs, size_t size,
551
                   size_t log2_covered_blocks, ACPtr block[3],
552
203k
                   ACType ac_type) override {
553
203k
    ;
554
609k
    for (size_t c : {1, 0, 2}) {
555
609k
      size_t sbx = bx >> hshift[c];
556
609k
      size_t sby = by >> vshift[c];
557
609k
      if (JXL_UNLIKELY((sbx << hshift[c] != bx) || (sby << vshift[c] != by))) {
558
6.39k
        continue;
559
6.39k
      }
560
561
1.20M
      for (size_t pass = 0; JXL_UNLIKELY(pass < num_passes); pass++) {
562
603k
        auto decode_ac_varblock =
563
603k
            decoders[pass].UsesLZ77()
564
603k
                ? (ac_type == ACType::k16 ? DecodeACVarBlock<ACType::k16, 1>
565
237k
                                          : DecodeACVarBlock<ACType::k32, 1>)
566
603k
                : (ac_type == ACType::k16 ? DecodeACVarBlock<ACType::k16, 0>
567
365k
                                          : DecodeACVarBlock<ACType::k32, 0>);
568
603k
        JXL_RETURN_IF_ERROR(decode_ac_varblock(
569
603k
            ctx_offset[pass], log2_covered_blocks, row_nzeros[pass][c],
570
603k
            row_nzeros_top[pass][c], nzeros_stride, c, sbx, sby, bx, acs,
571
603k
            &coeff_orders[pass * coeff_order_size], readers[pass],
572
603k
            &decoders[pass], context_map[pass], quant_dc_row, qf_row,
573
603k
            *block_ctx_map, block[c], shift_for_pass[pass]));
574
603k
      }
575
603k
    }
576
203k
    return true;
577
203k
  }
578
579
  Status Init(const FrameHeader& frame_header,
580
              BitReader* JXL_RESTRICT* JXL_RESTRICT readers, size_t num_passes,
581
              size_t group_idx, size_t histo_selector_bits, const Rect& rect,
582
              GroupDecCache* JXL_RESTRICT group_dec_cache,
583
10.2k
              PassesDecoderState* dec_state, size_t first_pass) {
584
41.1k
    for (size_t i = 0; i < 3; i++) {
585
30.8k
      hshift[i] = frame_header.chroma_subsampling.HShift(i);
586
30.8k
      vshift[i] = frame_header.chroma_subsampling.VShift(i);
587
30.8k
    }
588
10.2k
    this->coeff_order_size = dec_state->shared->coeff_order_size;
589
10.2k
    this->coeff_orders =
590
10.2k
        dec_state->shared->coeff_orders.data() + first_pass * coeff_order_size;
591
10.2k
    this->context_map = dec_state->context_map.data() + first_pass;
592
10.2k
    this->readers = readers;
593
10.2k
    this->num_passes = num_passes;
594
10.2k
    this->shift_for_pass = frame_header.passes.shift + first_pass;
595
10.2k
    this->group_dec_cache = group_dec_cache;
596
10.2k
    this->rect = rect;
597
10.2k
    block_ctx_map = &dec_state->shared->block_ctx_map;
598
10.2k
    qf = &dec_state->shared->raw_quant_field;
599
10.2k
    quant_dc = &dec_state->shared->quant_dc;
600
601
20.5k
    for (size_t pass = 0; pass < num_passes; pass++) {
602
      // Select which histogram set to use among those of the current pass.
603
10.2k
      size_t cur_histogram = 0;
604
10.2k
      if (histo_selector_bits != 0) {
605
97
        cur_histogram = readers[pass]->ReadBits(histo_selector_bits);
606
97
      }
607
10.2k
      if (cur_histogram >= dec_state->shared->num_histograms) {
608
2
        return JXL_FAILURE("Invalid histogram selector");
609
2
      }
610
10.2k
      ctx_offset[pass] = cur_histogram * block_ctx_map->NumACContexts();
611
612
10.2k
      JXL_ASSIGN_OR_RETURN(
613
10.2k
          decoders[pass],
614
10.2k
          ANSSymbolReader::Create(&dec_state->code[pass + first_pass],
615
10.2k
                                  readers[pass]));
616
10.2k
    }
617
10.2k
    nzeros_stride = group_dec_cache->num_nzeroes[0].PixelsPerRow();
618
20.5k
    for (size_t i = 0; i < num_passes; i++) {
619
10.2k
      JXL_ENSURE(
620
10.2k
          nzeros_stride ==
621
10.2k
          static_cast<size_t>(group_dec_cache->num_nzeroes[i].PixelsPerRow()));
622
10.2k
    }
623
10.2k
    return true;
624
10.2k
  }
625
626
  const uint32_t* shift_for_pass = nullptr;  // not owned
627
  const coeff_order_t* JXL_RESTRICT coeff_orders;
628
  size_t coeff_order_size;
629
  const std::vector<uint8_t>* JXL_RESTRICT context_map;
630
  ANSSymbolReader decoders[kMaxNumPasses];
631
  BitReader* JXL_RESTRICT* JXL_RESTRICT readers;
632
  size_t num_passes;
633
  size_t ctx_offset[kMaxNumPasses];
634
  size_t nzeros_stride;
635
  int32_t* JXL_RESTRICT row_nzeros[kMaxNumPasses][3];
636
  const int32_t* JXL_RESTRICT row_nzeros_top[kMaxNumPasses][3];
637
  GroupDecCache* JXL_RESTRICT group_dec_cache;
638
  const BlockCtxMap* block_ctx_map;
639
  const ImageI* qf;
640
  const ImageB* quant_dc;
641
  const int32_t* qf_row;
642
  const uint8_t* quant_dc_row;
643
  Rect rect;
644
  size_t hshift[3], vshift[3];
645
};
646
647
struct GetBlockFromEncoder : public GetBlock {
648
0
  void StartRow(size_t by) override {}
649
650
  Status LoadBlock(size_t bx, size_t by, const AcStrategy& acs, size_t size,
651
                   size_t log2_covered_blocks, ACPtr block[3],
652
0
                   ACType ac_type) override {
653
0
    JXL_ENSURE(ac_type == ACType::k32);
654
0
    for (size_t c = 0; c < 3; c++) {
655
      // for each pass
656
0
      for (size_t i = 0; i < quantized_ac->size(); i++) {
657
0
        for (size_t k = 0; k < size; k++) {
658
          // TODO(veluca): SIMD.
659
0
          block[c].ptr32[k] +=
660
0
              rows[i][c][offset + k] * (1 << shift_for_pass[i]);
661
0
        }
662
0
      }
663
0
    }
664
0
    offset += size;
665
0
    return true;
666
0
  }
667
668
  static StatusOr<GetBlockFromEncoder> Create(
669
      const std::vector<std::unique_ptr<ACImage>>& ac, size_t group_idx,
670
0
      const uint32_t* shift_for_pass) {
671
0
    GetBlockFromEncoder result(ac, group_idx, shift_for_pass);
672
    // TODO(veluca): not supported with chroma subsampling.
673
0
    for (size_t i = 0; i < ac.size(); i++) {
674
0
      JXL_ENSURE(ac[i]->Type() == ACType::k32);
675
0
      for (size_t c = 0; c < 3; c++) {
676
0
        result.rows[i][c] = ac[i]->PlaneRow(c, group_idx, 0).ptr32;
677
0
      }
678
0
    }
679
0
    return result;
680
0
  }
681
682
  const std::vector<std::unique_ptr<ACImage>>* JXL_RESTRICT quantized_ac;
683
  size_t offset = 0;
684
  const int32_t* JXL_RESTRICT rows[kMaxNumPasses][3];
685
  const uint32_t* shift_for_pass = nullptr;  // not owned
686
687
 private:
688
  GetBlockFromEncoder(const std::vector<std::unique_ptr<ACImage>>& ac,
689
                      size_t group_idx, const uint32_t* shift_for_pass)
690
0
      : quantized_ac(&ac), shift_for_pass(shift_for_pass) {}
691
};
692
693
HWY_EXPORT(DecodeGroupImpl);
694
695
}  // namespace
696
697
namespace {

// Renders group `group_idx` from the DC image only: every channel's DC
// samples are copied (with 2 pixels of padding) into
// `group_dec_cache->dc_buffer`, mirror-padded at the image borders, and
// upsampled 8x into the render pipeline input buffer of that channel.
Status RenderDcOnlyGroup(const FrameHeader& frame_header, size_t group_idx,
                         PassesDecoderState* JXL_RESTRICT dec_state,
                         GroupDecCache* JXL_RESTRICT group_dec_cache,
                         size_t thread,
                         RenderPipelineInput& render_pipeline_input) {
  JxlMemoryManager* memory_manager = dec_state->memory_manager();
  JXL_RETURN_IF_ERROR(group_dec_cache->InitDCBufferOnce(memory_manager));
  const YCbCrChromaSubsampling& cs = frame_header.chroma_subsampling;
  // Group rectangle in block units, before applying chroma subsampling.
  const Rect src_rect_precs =
      dec_state->shared->frame_dim.BlockGroupRect(group_idx);
  for (size_t c : {0, 1, 2}) {
    size_t hs = cs.HShift(c);
    size_t vs = cs.VShift(c);
    const Rect src_rect =
        Rect(src_rect_precs.x0() >> hs, src_rect_precs.y0() >> vs,
             src_rect_precs.xsize() >> hs, src_rect_precs.ysize() >> vs);
    const Rect copy_rect(kRenderPipelineXOffset, 2, src_rect.xsize(),
                         src_rect.ysize());
    JXL_RETURN_IF_ERROR(
        CopyImageToWithPadding(src_rect, dec_state->shared->dc->Plane(c), 2,
                               copy_rect, &group_dec_cache->dc_buffer));
    // Mirrorpad. Interleaving left and right padding ensures that padding
    // works out correctly even for images with DC size of 1.
    const size_t xend = kRenderPipelineXOffset +
                        (dec_state->shared->dc->Plane(c).xsize() >> hs) -
                        src_rect.x0();
    const bool pad_left = (src_rect.x0() == 0);
    const bool pad_right = (src_rect.x0() + src_rect.xsize() + 2 >=
                            (dec_state->shared->dc->xsize() >> hs));
    for (size_t y = 0; y < src_rect.ysize() + 4; y++) {
      float* row = group_dec_cache->dc_buffer.Row(y);
      for (size_t ix = 0; ix < 2; ix++) {
        if (pad_left) {
          row[kRenderPipelineXOffset - ix - 1] =
              row[kRenderPipelineXOffset + ix];
        }
        if (pad_right) {
          row[xend + ix] = row[xend - ix - 1];
        }
      }
    }
    const auto& buffer = render_pipeline_input.GetBuffer(c);
    Rect dst_rect = buffer.second;
    ImageF* upsampling_dst = buffer.first;
    JXL_ENSURE(dst_rect.IsInside(*upsampling_dst));

    // The upsampler consumes 5 input rows (2 rows of context on each side)
    // and produces 8 output rows per DC row.
    RenderPipelineStage::RowInfo input_rows(1, std::vector<float*>(5));
    RenderPipelineStage::RowInfo output_rows(1, std::vector<float*>(8));
    for (size_t y = src_rect.y0(); y < src_rect.y0() + src_rect.ysize();
         y++) {
      for (ssize_t iy = 0; iy < 5; iy++) {
        input_rows[0][iy] = group_dec_cache->dc_buffer.Row(
            Mirror(static_cast<ssize_t>(y) + iy - 2,
                   dec_state->shared->dc->Plane(c).ysize() >> vs) +
            2 - src_rect.y0());
      }
      for (size_t iy = 0; iy < 8; iy++) {
        output_rows[0][iy] =
            dst_rect.Row(upsampling_dst, ((y - src_rect.y0()) << 3) + iy) -
            kRenderPipelineXOffset;
      }
      // Arguments set to 0/nullptr are not used.
      JXL_RETURN_IF_ERROR(dec_state->upsampler8x->ProcessRow(
          input_rows, output_rows,
          /*xextra=*/0, src_rect.xsize(), 0, 0, thread));
    }
  }
  return true;
}

}  // namespace

// Decodes `num_passes` passes (starting at pass `first_pass`) of AC group
// `group_idx`, reading pass i from `readers[i]`.
//
// The group is rendered to `render_pipeline_input` only if this call covers
// the last pass of the frame or `force_draw` is set; otherwise coefficients
// are decoded without rendering. If `should_run_pipeline` is non-null it
// receives whether rendering happened. When rendering is requested with
// `num_passes == 0` and `first_pass == 0`, the group is rendered from the
// upsampled DC image and no AC data is read.
//
// `dc_only` requires `num_passes == 0`; otherwise the frame must have at
// least one histogram set. `jpeg_data` is forwarded unchanged to
// DecodeGroupImpl. Fails on invalid bitstream data, including a wrong final
// ANS state of any pass.
Status DecodeGroup(const FrameHeader& frame_header,
                   BitReader* JXL_RESTRICT* JXL_RESTRICT readers,
                   size_t num_passes, size_t group_idx,
                   PassesDecoderState* JXL_RESTRICT dec_state,
                   GroupDecCache* JXL_RESTRICT group_dec_cache, size_t thread,
                   RenderPipelineInput& render_pipeline_input,
                   jpeg::JPEGData* JXL_RESTRICT jpeg_data, size_t first_pass,
                   bool force_draw, bool dc_only, bool* should_run_pipeline) {
  DrawMode draw =
      (num_passes + first_pass == frame_header.passes.num_passes) || force_draw
          ? kDraw
          : kDontDraw;

  if (should_run_pipeline) {
    *should_run_pipeline = draw != kDontDraw;
  }

  if (draw == kDraw && num_passes == 0 && first_pass == 0) {
    return RenderDcOnlyGroup(frame_header, group_idx, dec_state,
                             group_dec_cache, thread, render_pipeline_input);
  }

  size_t histo_selector_bits = 0;
  if (dc_only) {
    JXL_ENSURE(num_passes == 0);
  } else {
    JXL_ENSURE(dec_state->shared->num_histograms > 0);
    histo_selector_bits = CeilLog2Nonzero(dec_state->shared->num_histograms);
  }

  auto get_block = jxl::make_unique<GetBlockFromBitstream>();
  JXL_RETURN_IF_ERROR(get_block->Init(
      frame_header, readers, num_passes, group_idx, histo_selector_bits,
      dec_state->shared->frame_dim.BlockGroupRect(group_idx), group_dec_cache,
      dec_state, first_pass));

  JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(DecodeGroupImpl)(
      frame_header, get_block.get(), group_dec_cache, dec_state, thread,
      group_idx, render_pipeline_input, jpeg_data, draw));

  // Every pass must have consumed its ANS stream exactly.
  for (size_t pass = 0; pass < num_passes; pass++) {
    if (!get_block->decoders[pass].CheckANSFinalState()) {
      return JXL_FAILURE("ANS checksum failure.");
    }
  }
  return true;
}
804
805
// Decodes and renders group `group_idx` directly from the encoder's quantized
// coefficient images `ac` (one per pass), without going through a bitstream.
// The group cache is initialized for zero passes with every valid AC strategy
// marked as used, and the group is always drawn. `aux_out` is not used.
Status DecodeGroupForRoundtrip(const FrameHeader& frame_header,
                               const std::vector<std::unique_ptr<ACImage>>& ac,
                               size_t group_idx,
                               PassesDecoderState* JXL_RESTRICT dec_state,
                               GroupDecCache* JXL_RESTRICT group_dec_cache,
                               size_t thread,
                               RenderPipelineInput& render_pipeline_input,
                               jpeg::JPEGData* JXL_RESTRICT jpeg_data,
                               AuxOut* aux_out) {
  JxlMemoryManager* memory_manager = dec_state->memory_manager();
  JXL_ASSIGN_OR_RETURN(
      GetBlockFromEncoder get_block,
      GetBlockFromEncoder::Create(ac, group_idx, frame_header.passes.shift));

  // One bit per valid AC strategy, all set.
  const auto all_strategies_mask =
      (1u << AcStrategy::kNumValidStrategies) - 1;
  JXL_RETURN_IF_ERROR(group_dec_cache->InitOnce(memory_manager,
                                                /*num_passes=*/0,
                                                /*used_acs=*/
                                                all_strategies_mask));

  return HWY_DYNAMIC_DISPATCH(DecodeGroupImpl)(
      frame_header, &get_block, group_dec_cache, dec_state, thread, group_idx,
      render_pipeline_input, jpeg_data, kDraw);
}
827
828
}  // namespace jxl
829
#endif  // HWY_ONCE