Coverage Report

Created: 2023-08-28 07:24

/src/libjxl/lib/jxl/dec_group.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/dec_group.h"
7
8
#include <stdint.h>
9
#include <string.h>
10
11
#include <algorithm>
12
#include <memory>
13
#include <utility>
14
15
#include "lib/jxl/frame_header.h"
16
17
#undef HWY_TARGET_INCLUDE
18
#define HWY_TARGET_INCLUDE "lib/jxl/dec_group.cc"
19
#include <hwy/foreach_target.h>
20
#include <hwy/highway.h>
21
22
#include "lib/jxl/ac_context.h"
23
#include "lib/jxl/ac_strategy.h"
24
#include "lib/jxl/base/bits.h"
25
#include "lib/jxl/base/printf_macros.h"
26
#include "lib/jxl/base/status.h"
27
#include "lib/jxl/coeff_order.h"
28
#include "lib/jxl/common.h"
29
#include "lib/jxl/convolve.h"
30
#include "lib/jxl/dct_scales.h"
31
#include "lib/jxl/dec_cache.h"
32
#include "lib/jxl/dec_transforms-inl.h"
33
#include "lib/jxl/dec_xyb.h"
34
#include "lib/jxl/entropy_coder.h"
35
#include "lib/jxl/epf.h"
36
#include "lib/jxl/opsin_params.h"
37
#include "lib/jxl/quant_weights.h"
38
#include "lib/jxl/quantizer-inl.h"
39
#include "lib/jxl/quantizer.h"
40
41
#ifndef LIB_JXL_DEC_GROUP_CC
42
#define LIB_JXL_DEC_GROUP_CC
43
namespace jxl {
44
45
struct AuxOut;
46
47
// Interface for reading groups for DecodeGroupImpl.
48
class GetBlock {
49
 public:
50
  virtual void StartRow(size_t by) = 0;
51
  virtual Status LoadBlock(size_t bx, size_t by, const AcStrategy& acs,
52
                           size_t size, size_t log2_covered_blocks,
53
                           ACPtr block[3], ACType ac_type) = 0;
54
0
  virtual ~GetBlock() {}
55
};
56
57
// Selects whether DecodeGroupImpl renders the blocks it reads to pixels.
enum DrawMode {
  kDraw = 0,      // Render to pixels.
  kDontDraw = 1,  // Don't render to pixels.
};
64
65
}  // namespace jxl
66
#endif  // LIB_JXL_DEC_GROUP_CC
67
68
HWY_BEFORE_NAMESPACE();
69
namespace jxl {
70
namespace HWY_NAMESPACE {
71
72
// These templates are not found via ADL.
73
using hwy::HWY_NAMESPACE::Rebind;
74
using hwy::HWY_NAMESPACE::ShiftRight;
75
76
using D = HWY_FULL(float);
77
using DU = HWY_FULL(uint32_t);
78
using DI = HWY_FULL(int32_t);
79
using DI16 = Rebind<int16_t, DI>;
80
constexpr D d;
81
constexpr DI di;
82
constexpr DI16 di16;
83
84
// TODO(veluca): consider SIMDfying.
85
0
void Transpose8x8InPlace(int32_t* JXL_RESTRICT block) {
86
0
  for (size_t x = 0; x < 8; x++) {
87
0
    for (size_t y = x + 1; y < 8; y++) {
88
0
      std::swap(block[y * 8 + x], block[x * 8 + y]);
89
0
    }
90
0
  }
91
0
}
Unexecuted instantiation: jxl::N_SSE4::Transpose8x8InPlace(int*)
Unexecuted instantiation: jxl::N_AVX2::Transpose8x8InPlace(int*)
Unexecuted instantiation: jxl::N_AVX3::Transpose8x8InPlace(int*)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::Transpose8x8InPlace(int*)
Unexecuted instantiation: jxl::N_AVX3_SPR::Transpose8x8InPlace(int*)
Unexecuted instantiation: jxl::N_SSE2::Transpose8x8InPlace(int*)
92
93
template <ACType ac_type>
94
void DequantLane(Vec<D> scaled_dequant_x, Vec<D> scaled_dequant_y,
95
                 Vec<D> scaled_dequant_b,
96
                 const float* JXL_RESTRICT dequant_matrices, size_t size,
97
                 size_t k, Vec<D> x_cc_mul, Vec<D> b_cc_mul,
98
                 const float* JXL_RESTRICT biases, ACPtr qblock[3],
99
0
                 float* JXL_RESTRICT block) {
100
0
  const auto x_mul = Mul(Load(d, dequant_matrices + k), scaled_dequant_x);
101
0
  const auto y_mul =
102
0
      Mul(Load(d, dequant_matrices + size + k), scaled_dequant_y);
103
0
  const auto b_mul =
104
0
      Mul(Load(d, dequant_matrices + 2 * size + k), scaled_dequant_b);
105
106
0
  Vec<DI> quantized_x_int;
107
0
  Vec<DI> quantized_y_int;
108
0
  Vec<DI> quantized_b_int;
109
0
  if (ac_type == ACType::k16) {
110
0
    Rebind<int16_t, DI> di16;
111
0
    quantized_x_int = PromoteTo(di, Load(di16, qblock[0].ptr16 + k));
112
0
    quantized_y_int = PromoteTo(di, Load(di16, qblock[1].ptr16 + k));
113
0
    quantized_b_int = PromoteTo(di, Load(di16, qblock[2].ptr16 + k));
114
0
  } else {
115
0
    quantized_x_int = Load(di, qblock[0].ptr32 + k);
116
0
    quantized_y_int = Load(di, qblock[1].ptr32 + k);
117
0
    quantized_b_int = Load(di, qblock[2].ptr32 + k);
118
0
  }
119
120
0
  const auto dequant_x_cc =
121
0
      Mul(AdjustQuantBias(di, 0, quantized_x_int, biases), x_mul);
122
0
  const auto dequant_y =
123
0
      Mul(AdjustQuantBias(di, 1, quantized_y_int, biases), y_mul);
124
0
  const auto dequant_b_cc =
125
0
      Mul(AdjustQuantBias(di, 2, quantized_b_int, biases), b_mul);
126
127
0
  const auto dequant_x = MulAdd(x_cc_mul, dequant_y, dequant_x_cc);
128
0
  const auto dequant_b = MulAdd(b_cc_mul, dequant_y, dequant_b_cc);
129
0
  Store(dequant_x, d, block + k);
130
0
  Store(dequant_y, d, block + size + k);
131
0
  Store(dequant_b, d, block + 2 * size + k);
132
0
}
Unexecuted instantiation: void jxl::N_SSE4::DequantLane<(jxl::ACType)0>(hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, float const*, unsigned long, unsigned long, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, float const*, jxl::ACPtr*, float*)
Unexecuted instantiation: void jxl::N_SSE4::DequantLane<(jxl::ACType)1>(hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, float const*, unsigned long, unsigned long, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, float const*, jxl::ACPtr*, float*)
Unexecuted instantiation: void jxl::N_AVX2::DequantLane<(jxl::ACType)0>(hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, float const*, unsigned long, unsigned long, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, float const*, jxl::ACPtr*, float*)
Unexecuted instantiation: void jxl::N_AVX2::DequantLane<(jxl::ACType)1>(hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, float const*, unsigned long, unsigned long, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, float const*, jxl::ACPtr*, float*)
Unexecuted instantiation: void jxl::N_AVX3::DequantLane<(jxl::ACType)0>(hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, float const*, unsigned long, unsigned long, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, float const*, jxl::ACPtr*, float*)
Unexecuted instantiation: void jxl::N_AVX3::DequantLane<(jxl::ACType)1>(hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, float const*, unsigned long, unsigned long, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, float const*, jxl::ACPtr*, float*)
Unexecuted instantiation: void jxl::N_AVX3_ZEN4::DequantLane<(jxl::ACType)0>(hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, float const*, unsigned long, unsigned long, hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, float const*, jxl::ACPtr*, float*)
Unexecuted instantiation: void jxl::N_AVX3_ZEN4::DequantLane<(jxl::ACType)1>(hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, float const*, unsigned long, unsigned long, hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, float const*, jxl::ACPtr*, float*)
Unexecuted instantiation: void jxl::N_AVX3_SPR::DequantLane<(jxl::ACType)0>(hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, float const*, unsigned long, unsigned long, hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, float const*, jxl::ACPtr*, float*)
Unexecuted instantiation: void jxl::N_AVX3_SPR::DequantLane<(jxl::ACType)1>(hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, float const*, unsigned long, unsigned long, hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, float const*, jxl::ACPtr*, float*)
Unexecuted instantiation: void jxl::N_SSE2::DequantLane<(jxl::ACType)0>(hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, float const*, unsigned long, unsigned long, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, float const*, jxl::ACPtr*, float*)
Unexecuted instantiation: void jxl::N_SSE2::DequantLane<(jxl::ACType)1>(hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, float const*, unsigned long, unsigned long, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, float const*, jxl::ACPtr*, float*)
133
134
template <ACType ac_type>
135
void DequantBlock(const AcStrategy& acs, float inv_global_scale, int quant,
136
                  float x_dm_multiplier, float b_dm_multiplier, Vec<D> x_cc_mul,
137
                  Vec<D> b_cc_mul, size_t kind, size_t size,
138
                  const Quantizer& quantizer, size_t covered_blocks,
139
                  const size_t* sbx,
140
                  const float* JXL_RESTRICT* JXL_RESTRICT dc_row,
141
                  size_t dc_stride, const float* JXL_RESTRICT biases,
142
                  ACPtr qblock[3], float* JXL_RESTRICT block,
143
0
                  float* JXL_RESTRICT scratch) {
144
0
  const auto scaled_dequant_s = inv_global_scale / quant;
145
146
0
  const auto scaled_dequant_x = Set(d, scaled_dequant_s * x_dm_multiplier);
147
0
  const auto scaled_dequant_y = Set(d, scaled_dequant_s);
148
0
  const auto scaled_dequant_b = Set(d, scaled_dequant_s * b_dm_multiplier);
149
150
0
  const float* dequant_matrices = quantizer.DequantMatrix(kind, 0);
151
152
0
  for (size_t k = 0; k < covered_blocks * kDCTBlockSize; k += Lanes(d)) {
153
0
    DequantLane<ac_type>(scaled_dequant_x, scaled_dequant_y, scaled_dequant_b,
154
0
                         dequant_matrices, size, k, x_cc_mul, b_cc_mul, biases,
155
0
                         qblock, block);
156
0
  }
157
0
  for (size_t c = 0; c < 3; c++) {
158
0
    LowestFrequenciesFromDC(acs.Strategy(), dc_row[c] + sbx[c], dc_stride,
159
0
                            block + c * size, scratch);
160
0
  }
161
0
}
Unexecuted instantiation: void jxl::N_SSE4::DequantBlock<(jxl::ACType)0>(jxl::AcStrategy const&, float, int, float, float, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, unsigned long, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*)
Unexecuted instantiation: void jxl::N_SSE4::DequantBlock<(jxl::ACType)1>(jxl::AcStrategy const&, float, int, float, float, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, unsigned long, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*)
Unexecuted instantiation: void jxl::N_AVX2::DequantBlock<(jxl::ACType)0>(jxl::AcStrategy const&, float, int, float, float, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, unsigned long, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*)
Unexecuted instantiation: void jxl::N_AVX2::DequantBlock<(jxl::ACType)1>(jxl::AcStrategy const&, float, int, float, float, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, unsigned long, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*)
Unexecuted instantiation: void jxl::N_AVX3::DequantBlock<(jxl::ACType)0>(jxl::AcStrategy const&, float, int, float, float, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, unsigned long, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*)
Unexecuted instantiation: void jxl::N_AVX3::DequantBlock<(jxl::ACType)1>(jxl::AcStrategy const&, float, int, float, float, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, unsigned long, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*)
Unexecuted instantiation: void jxl::N_AVX3_ZEN4::DequantBlock<(jxl::ACType)0>(jxl::AcStrategy const&, float, int, float, float, hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, unsigned long, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*)
Unexecuted instantiation: void jxl::N_AVX3_ZEN4::DequantBlock<(jxl::ACType)1>(jxl::AcStrategy const&, float, int, float, float, hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, unsigned long, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*)
Unexecuted instantiation: void jxl::N_AVX3_SPR::DequantBlock<(jxl::ACType)0>(jxl::AcStrategy const&, float, int, float, float, hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, unsigned long, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*)
Unexecuted instantiation: void jxl::N_AVX3_SPR::DequantBlock<(jxl::ACType)1>(jxl::AcStrategy const&, float, int, float, float, hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, unsigned long, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*)
Unexecuted instantiation: void jxl::N_SSE2::DequantBlock<(jxl::ACType)0>(jxl::AcStrategy const&, float, int, float, float, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, unsigned long, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*)
Unexecuted instantiation: void jxl::N_SSE2::DequantBlock<(jxl::ACType)1>(jxl::AcStrategy const&, float, int, float, float, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, unsigned long, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*)
162
163
// Decodes every varblock of group `group_idx`.
//
// For each block row, and within it each varblock whose first block lies in
// the group, the quantized coefficients are obtained from `get_block` and then
// either
//  - written as JPEG coefficients into decoded->jpeg_data (when
//    decoded->IsJPEG()), or
//  - dequantized and inverse-transformed into the buffers of
//    `render_pipeline_input`.
//
// Parameters:
//   get_block              source of quantized coefficients.
//   group_dec_cache        per-thread scratch buffers (quantized block,
//                          dequantized block, transform scratch space).
//   dec_state              decoder state; supplies the shared frame data
//                          (AC strategy, quant field, DC image, CfL maps,
//                          quantizer) and the coefficient storage.
//   thread                 not used by this function.
//   group_idx              index of the group to decode.
//   render_pipeline_input  destination pixel buffers, one per channel.
//   decoded                only consulted for JPEG reconstruction.
//   draw                   with kDontDraw, blocks are loaded (and thereby
//                          accumulated) but nothing is rendered.
//
// Returns true on success, or a failure Status if the stream cannot be
// reconstructed as JPEG (incompatible CfL map, non-JPEG or invalid
// quantization table, non-DCT8 block), if the group rectangle does not fit in
// the DC image, or if `get_block` fails.
Status DecodeGroupImpl(GetBlock* JXL_RESTRICT get_block,
                       GroupDecCache* JXL_RESTRICT group_dec_cache,
                       PassesDecoderState* JXL_RESTRICT dec_state,
                       size_t thread, size_t group_idx,
                       RenderPipelineInput& render_pipeline_input,
                       ImageBundle* decoded, DrawMode draw) {
  // TODO(veluca): investigate cache usage in this function.
  const Rect block_rect = dec_state->shared->BlockGroupRect(group_idx);
  const AcStrategyImage& ac_strategy = dec_state->shared->ac_strategy;

  const size_t xsize_blocks = block_rect.xsize();
  const size_t ysize_blocks = block_rect.ysize();

  const size_t dc_stride = dec_state->shared->dc->PixelsPerRow();

  const float inv_global_scale = dec_state->shared->quantizer.InvGlobalScale();

  const YCbCrChromaSubsampling& cs =
      dec_state->shared->frame_header.chroma_subsampling;

  size_t idct_stride[3];
  for (size_t c = 0; c < 3; c++) {
    idct_stride[c] = render_pipeline_input.GetBuffer(c).first->PixelsPerRow();
  }

  // Per channel: (channel-1 entry / channel-c entry) of the JPEG quantization
  // table in fixed point, stored transposed. Only filled for JPEG output.
  HWY_ALIGN int32_t scaled_qtable[64 * 3];

  ACType ac_type = dec_state->coefficients->Type();
  auto dequant_block = ac_type == ACType::k16 ? DequantBlock<ACType::k16>
                                              : DequantBlock<ACType::k32>;
  // Whether or not coefficients should be stored for future usage, and/or read
  // from past usage.
  bool accumulate = !dec_state->coefficients->IsEmpty();
  // Offset of the current block in the group.
  size_t offset = 0;

  std::array<int, 3> jpeg_c_map;
  bool jpeg_is_gray = false;
  std::array<int, 3> dcoff = {};

  // TODO(veluca): all of this should be done only once per image.
  if (decoded->IsJPEG()) {
    if (!dec_state->shared->cmap.IsJPEGCompatible()) {
      return JXL_FAILURE("The CfL map is not JPEG-compatible");
    }
    jpeg_is_gray = (decoded->jpeg_data->components.size() == 1);
    jpeg_c_map = JpegOrder(dec_state->shared->frame_header.color_transform,
                           jpeg_is_gray);
    const std::vector<QuantEncoding>& qe =
        dec_state->shared->matrices.encodings();
    if (qe.empty() || qe[0].mode != QuantEncoding::Mode::kQuantModeRAW ||
        std::abs(qe[0].qraw.qtable_den - 1.f / (8 * 255)) > 1e-8f) {
      return JXL_FAILURE(
          "Quantization table is not a JPEG quantization table.");
    }
    for (size_t c = 0; c < 3; c++) {
      for (size_t i = 0; i < 64; i++) {
        // Transpose the matrix, as it will be used on the transposed block.
        const int num = qe[0].qraw.qtable->at(64 + i);
        const int den = qe[0].qraw.qtable->at(64 * c + i);
        if (num <= 0 || den <= 0 || num >= 65536 || den >= 65536) {
          return JXL_FAILURE("Invalid JPEG quantization table");
        }
        scaled_qtable[64 * c + (i % 8) * 8 + (i / 8)] =
            (1 << kCFLFixedPointPrecision) * num / den;
      }
      // The DC offset divides by the channel's DC quantization entry. This is
      // done only after the loop above has verified that entry to be positive,
      // so a zero entry is rejected instead of causing a division by zero.
      if (dec_state->shared->frame_header.color_transform ==
          ColorTransform::kNone) {
        dcoff[c] = 1024 / (*qe[0].qraw.qtable)[64 * c];
      }
    }
  }

  // Per-channel subsampling shifts and the group rectangle in subsampled
  // block coordinates.
  size_t hshift[3] = {cs.HShift(0), cs.HShift(1), cs.HShift(2)};
  size_t vshift[3] = {cs.VShift(0), cs.VShift(1), cs.VShift(2)};
  Rect r[3];
  for (size_t i = 0; i < 3; i++) {
    r[i] =
        Rect(block_rect.x0() >> hshift[i], block_rect.y0() >> vshift[i],
             block_rect.xsize() >> hshift[i], block_rect.ysize() >> vshift[i]);
    if (!r[i].IsInside({0, 0, dec_state->shared->dc->Plane(i).xsize(),
                        dec_state->shared->dc->Plane(i).ysize()})) {
      return JXL_FAILURE("Frame dimensions are too big for the image.");
    }
  }

  for (size_t by = 0; by < ysize_blocks; ++by) {
    get_block->StartRow(by);
    size_t sby[3] = {by >> vshift[0], by >> vshift[1], by >> vshift[2]};

    const int32_t* JXL_RESTRICT row_quant =
        block_rect.ConstRow(dec_state->shared->raw_quant_field, by);

    const float* JXL_RESTRICT dc_rows[3] = {
        r[0].ConstPlaneRow(*dec_state->shared->dc, 0, sby[0]),
        r[1].ConstPlaneRow(*dec_state->shared->dc, 1, sby[1]),
        r[2].ConstPlaneRow(*dec_state->shared->dc, 2, sby[2]),
    };

    const size_t ty = (block_rect.y0() + by) / kColorTileDimInBlocks;
    AcStrategyRow acs_row = ac_strategy.ConstRow(block_rect, by);

    // Chroma-from-luma map rows; there is no map for channel 1.
    const int8_t* JXL_RESTRICT row_cmap[3] = {
        dec_state->shared->cmap.ytox_map.ConstRow(ty),
        nullptr,
        dec_state->shared->cmap.ytob_map.ConstRow(ty),
    };

    float* JXL_RESTRICT idct_row[3];
    int16_t* JXL_RESTRICT jpeg_row[3];
    for (size_t c = 0; c < 3; c++) {
      idct_row[c] = render_pipeline_input.GetBuffer(c).second.Row(
          render_pipeline_input.GetBuffer(c).first, sby[c] * kBlockDim);
      if (decoded->IsJPEG()) {
        auto& component = decoded->jpeg_data->components[jpeg_c_map[c]];
        jpeg_row[c] =
            component.coeffs.data() +
            (component.width_in_blocks * (r[c].y0() + sby[c]) + r[c].x0()) *
                kDCTBlockSize;
      }
    }

    size_t bx = 0;
    for (size_t tx = 0; tx < DivCeil(xsize_blocks, kColorTileDimInBlocks);
         tx++) {
      size_t abs_tx = tx + block_rect.x0() / kColorTileDimInBlocks;
      auto x_cc_mul =
          Set(d, dec_state->shared->cmap.YtoXRatio(row_cmap[0][abs_tx]));
      auto b_cc_mul =
          Set(d, dec_state->shared->cmap.YtoBRatio(row_cmap[2][abs_tx]));
      // Increment bx by llf_x because those iterations would otherwise
      // immediately continue (!IsFirstBlock). Reduces mispredictions.
      for (; bx < xsize_blocks && bx < (tx + 1) * kColorTileDimInBlocks;) {
        size_t sbx[3] = {bx >> hshift[0], bx >> hshift[1], bx >> hshift[2]};
        AcStrategy acs = acs_row[bx];
        const size_t llf_x = acs.covered_blocks_x();

        // Can only happen in the second or lower rows of a varblock.
        if (JXL_UNLIKELY(!acs.IsFirstBlock())) {
          bx += llf_x;
          continue;
        }
        const size_t log2_covered_blocks = acs.log2_covered_blocks();

        const size_t covered_blocks = 1 << log2_covered_blocks;
        const size_t size = covered_blocks * kDCTBlockSize;

        ACPtr qblock[3];
        if (accumulate) {
          // Decode directly into the persistent coefficient storage.
          for (size_t c = 0; c < 3; c++) {
            qblock[c] = dec_state->coefficients->PlaneRow(c, group_idx, offset);
          }
        } else {
          // No point in reading from bitstream without accumulating and not
          // drawing.
          JXL_ASSERT(draw == kDraw);
          // Decode into a zeroed temporary block from the group cache.
          if (ac_type == ACType::k16) {
            memset(group_dec_cache->dec_group_qblock16, 0,
                   size * 3 * sizeof(int16_t));
            for (size_t c = 0; c < 3; c++) {
              qblock[c].ptr16 = group_dec_cache->dec_group_qblock16 + c * size;
            }
          } else {
            memset(group_dec_cache->dec_group_qblock, 0,
                   size * 3 * sizeof(int32_t));
            for (size_t c = 0; c < 3; c++) {
              qblock[c].ptr32 = group_dec_cache->dec_group_qblock + c * size;
            }
          }
        }
        JXL_RETURN_IF_ERROR(get_block->LoadBlock(
            bx, by, acs, size, log2_covered_blocks, qblock, ac_type));
        offset += size;
        if (draw == kDontDraw) {
          bx += llf_x;
          continue;
        }

        if (JXL_UNLIKELY(decoded->IsJPEG())) {
          if (acs.Strategy() != AcStrategy::Type::DCT) {
            return JXL_FAILURE(
                "Can only decode to JPEG if only DCT-8 is used.");
          }

          HWY_ALIGN int32_t transposed_dct_y[64];
          // Channel 1 is handled first because the other two channels need
          // its coefficients (transposed_dct_y) to restore chroma-from-luma.
          for (size_t c : {1, 0, 2}) {
            // Propagate only Y for grayscale.
            if (jpeg_is_gray && c != 1) {
              continue;
            }
            // Skip channels that have no block at this position because of
            // subsampling.
            if ((sbx[c] << hshift[c] != bx) || (sby[c] << vshift[c] != by)) {
              continue;
            }
            int16_t* JXL_RESTRICT jpeg_pos =
                jpeg_row[c] + sbx[c] * kDCTBlockSize;
            // JPEG XL is transposed, JPEG is not.
            auto transposed_dct = qblock[c].ptr32;
            Transpose8x8InPlace(transposed_dct);
            // No CfL - no need to store the y block converted to integers.
            if (!cs.Is444() ||
                (row_cmap[0][abs_tx] == 0 && row_cmap[2][abs_tx] == 0)) {
              for (size_t i = 0; i < 64; i += Lanes(d)) {
                const auto ini = Load(di, transposed_dct + i);
                const auto ini16 = DemoteTo(di16, ini);
                StoreU(ini16, di16, jpeg_pos + i);
              }
            } else if (c == 1) {
              // Y channel: save for restoring X/B, but nothing else to do.
              for (size_t i = 0; i < 64; i += Lanes(d)) {
                const auto ini = Load(di, transposed_dct + i);
                Store(ini, di, transposed_dct_y + i);
                const auto ini16 = DemoteTo(di16, ini);
                StoreU(ini16, di16, jpeg_pos + i);
              }
            } else {
              // transposed_dct_y contains the y channel block, transposed.
              const auto scale = Set(
                  di, dec_state->shared->cmap.RatioJPEG(row_cmap[c][abs_tx]));
              const auto round = Set(di, 1 << (kCFLFixedPointPrecision - 1));
              // NOTE(review): indexing scaled_qtable with c * size presumably
              // relies on size == 64 for DCT-8 blocks - confirm.
              for (size_t i = 0; i < 64; i += Lanes(d)) {
                auto in = Load(di, transposed_dct + i);
                auto in_y = Load(di, transposed_dct_y + i);
                auto qt = Load(di, scaled_qtable + c * size + i);
                auto coeff_scale = ShiftRight<kCFLFixedPointPrecision>(
                    Add(Mul(qt, scale), round));
                auto cfl_factor = ShiftRight<kCFLFixedPointPrecision>(
                    Add(Mul(in_y, coeff_scale), round));
                StoreU(DemoteTo(di16, Add(in, cfl_factor)), di16, jpeg_pos + i);
              }
            }
            // The DC coefficient comes from the DC image, not from qblock.
            jpeg_pos[0] =
                Clamp1<float>(dc_rows[c][sbx[c]] - dcoff[c], -2047, 2047);
          }
        } else {
          HWY_ALIGN float* const block = group_dec_cache->dec_group_block;
          // Dequantize and add predictions.
          dequant_block(
              acs, inv_global_scale, row_quant[bx], dec_state->x_dm_multiplier,
              dec_state->b_dm_multiplier, x_cc_mul, b_cc_mul, acs.RawStrategy(),
              size, dec_state->shared->quantizer,
              acs.covered_blocks_y() * acs.covered_blocks_x(), sbx, dc_rows,
              dc_stride,
              dec_state->output_encoding_info.opsin_params.quant_biases, qblock,
              block, group_dec_cache->scratch_space);

          for (size_t c : {1, 0, 2}) {
            // Skip channels that have no block at this position because of
            // subsampling.
            if ((sbx[c] << hshift[c] != bx) || (sby[c] << vshift[c] != by)) {
              continue;
            }
            // IDCT
            float* JXL_RESTRICT idct_pos = idct_row[c] + sbx[c] * kBlockDim;
            TransformToPixels(acs.Strategy(), block + c * size, idct_pos,
                              idct_stride[c], group_dec_cache->scratch_space);
          }
        }
        bx += llf_x;
      }
    }
  }
  return true;
}
Unexecuted instantiation: jxl::N_SSE4::DecodeGroupImpl(jxl::GetBlock*, jxl::GroupDecCache*, jxl::PassesDecoderState*, unsigned long, unsigned long, jxl::RenderPipelineInput&, jxl::ImageBundle*, jxl::DrawMode)
Unexecuted instantiation: jxl::N_AVX2::DecodeGroupImpl(jxl::GetBlock*, jxl::GroupDecCache*, jxl::PassesDecoderState*, unsigned long, unsigned long, jxl::RenderPipelineInput&, jxl::ImageBundle*, jxl::DrawMode)
Unexecuted instantiation: jxl::N_AVX3::DecodeGroupImpl(jxl::GetBlock*, jxl::GroupDecCache*, jxl::PassesDecoderState*, unsigned long, unsigned long, jxl::RenderPipelineInput&, jxl::ImageBundle*, jxl::DrawMode)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::DecodeGroupImpl(jxl::GetBlock*, jxl::GroupDecCache*, jxl::PassesDecoderState*, unsigned long, unsigned long, jxl::RenderPipelineInput&, jxl::ImageBundle*, jxl::DrawMode)
Unexecuted instantiation: jxl::N_AVX3_SPR::DecodeGroupImpl(jxl::GetBlock*, jxl::GroupDecCache*, jxl::PassesDecoderState*, unsigned long, unsigned long, jxl::RenderPipelineInput&, jxl::ImageBundle*, jxl::DrawMode)
Unexecuted instantiation: jxl::N_SSE2::DecodeGroupImpl(jxl::GetBlock*, jxl::GroupDecCache*, jxl::PassesDecoderState*, unsigned long, unsigned long, jxl::RenderPipelineInput&, jxl::ImageBundle*, jxl::DrawMode)
424
425
// NOLINTNEXTLINE(google-readability-namespace-comments)
426
}  // namespace HWY_NAMESPACE
427
}  // namespace jxl
428
HWY_AFTER_NAMESPACE();
429
430
#if HWY_ONCE
431
namespace jxl {
432
namespace {
433
// Decode quantized AC coefficients of DCT blocks.
434
// LLF components in the output block will not be modified.
435
template <ACType ac_type, bool uses_lz77>
436
Status DecodeACVarBlock(size_t ctx_offset, size_t log2_covered_blocks,
437
                        int32_t* JXL_RESTRICT row_nzeros,
438
                        const int32_t* JXL_RESTRICT row_nzeros_top,
439
                        size_t nzeros_stride, size_t c, size_t bx, size_t by,
440
                        size_t lbx, AcStrategy acs,
441
                        const coeff_order_t* JXL_RESTRICT coeff_order,
442
                        BitReader* JXL_RESTRICT br,
443
                        ANSSymbolReader* JXL_RESTRICT decoder,
444
                        const std::vector<uint8_t>& context_map,
445
                        const uint8_t* qdc_row, const int32_t* qf_row,
446
                        const BlockCtxMap& block_ctx_map, ACPtr block,
447
0
                        size_t shift = 0) {
448
  // Equal to number of LLF coefficients.
449
0
  const size_t covered_blocks = 1 << log2_covered_blocks;
450
0
  const size_t size = covered_blocks * kDCTBlockSize;
451
0
  int32_t predicted_nzeros =
452
0
      PredictFromTopAndLeft(row_nzeros_top, row_nzeros, bx, 32);
453
454
0
  size_t ord = kStrategyOrder[acs.RawStrategy()];
455
0
  const coeff_order_t* JXL_RESTRICT order =
456
0
      &coeff_order[CoeffOrderOffset(ord, c)];
457
458
0
  size_t block_ctx = block_ctx_map.Context(qdc_row[lbx], qf_row[bx], ord, c);
459
0
  const int32_t nzero_ctx =
460
0
      block_ctx_map.NonZeroContext(predicted_nzeros, block_ctx) + ctx_offset;
461
462
0
  size_t nzeros =
463
0
      decoder->ReadHybridUintInlined<uses_lz77>(nzero_ctx, br, context_map);
464
0
  if (nzeros + covered_blocks > size) {
465
0
    return JXL_FAILURE("Invalid AC: nzeros too large");
466
0
  }
467
0
  for (size_t y = 0; y < acs.covered_blocks_y(); y++) {
468
0
    for (size_t x = 0; x < acs.covered_blocks_x(); x++) {
469
0
      row_nzeros[bx + x + y * nzeros_stride] =
470
0
          (nzeros + covered_blocks - 1) >> log2_covered_blocks;
471
0
    }
472
0
  }
473
474
0
  const size_t histo_offset =
475
0
      ctx_offset + block_ctx_map.ZeroDensityContextsOffset(block_ctx);
476
477
0
  size_t prev = (nzeros > size / 16 ? 0 : 1);
478
0
  for (size_t k = covered_blocks; k < size && nzeros != 0; ++k) {
479
0
    const size_t ctx =
480
0
        histo_offset + ZeroDensityContext(nzeros, k, covered_blocks,
481
0
                                          log2_covered_blocks, prev);
482
0
    const size_t u_coeff =
483
0
        decoder->ReadHybridUintInlined<uses_lz77>(ctx, br, context_map);
484
    // Hand-rolled version of UnpackSigned, shifting before the conversion to
485
    // signed integer to avoid undefined behavior of shifting negative numbers.
486
0
    const size_t magnitude = u_coeff >> 1;
487
0
    const size_t neg_sign = (~u_coeff) & 1;
488
0
    const intptr_t coeff =
489
0
        static_cast<intptr_t>((magnitude ^ (neg_sign - 1)) << shift);
490
0
    if (ac_type == ACType::k16) {
491
0
      block.ptr16[order[k]] += coeff;
492
0
    } else {
493
0
      block.ptr32[order[k]] += coeff;
494
0
    }
495
0
    prev = static_cast<size_t>(u_coeff != 0);
496
0
    nzeros -= prev;
497
0
  }
498
0
  if (JXL_UNLIKELY(nzeros != 0)) {
499
0
    return JXL_FAILURE("Invalid AC: nzeros not 0. Block (%" PRIuS ", %" PRIuS
500
0
                       "), channel %" PRIuS,
501
0
                       bx, by, c);
502
0
  }
503
504
0
  return true;
505
0
}
Unexecuted instantiation: dec_group.cc:jxl::Status jxl::(anonymous namespace)::DecodeACVarBlock<(jxl::ACType)0, true>(unsigned long, unsigned long, int*, int const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, jxl::AcStrategy, unsigned int const*, jxl::BitReader*, jxl::ANSSymbolReader*, std::__1::vector<unsigned char, std::__1::allocator<unsigned char> > const&, unsigned char const*, int const*, jxl::BlockCtxMap const&, jxl::ACPtr, unsigned long)
Unexecuted instantiation: dec_group.cc:jxl::Status jxl::(anonymous namespace)::DecodeACVarBlock<(jxl::ACType)1, true>(unsigned long, unsigned long, int*, int const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, jxl::AcStrategy, unsigned int const*, jxl::BitReader*, jxl::ANSSymbolReader*, std::__1::vector<unsigned char, std::__1::allocator<unsigned char> > const&, unsigned char const*, int const*, jxl::BlockCtxMap const&, jxl::ACPtr, unsigned long)
Unexecuted instantiation: dec_group.cc:jxl::Status jxl::(anonymous namespace)::DecodeACVarBlock<(jxl::ACType)0, false>(unsigned long, unsigned long, int*, int const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, jxl::AcStrategy, unsigned int const*, jxl::BitReader*, jxl::ANSSymbolReader*, std::__1::vector<unsigned char, std::__1::allocator<unsigned char> > const&, unsigned char const*, int const*, jxl::BlockCtxMap const&, jxl::ACPtr, unsigned long)
Unexecuted instantiation: dec_group.cc:jxl::Status jxl::(anonymous namespace)::DecodeACVarBlock<(jxl::ACType)1, false>(unsigned long, unsigned long, int*, int const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, jxl::AcStrategy, unsigned int const*, jxl::BitReader*, jxl::ANSSymbolReader*, std::__1::vector<unsigned char, std::__1::allocator<unsigned char> > const&, unsigned char const*, int const*, jxl::BlockCtxMap const&, jxl::ACPtr, unsigned long)
506
507
// Structs used by DecodeGroupImpl to get a quantized block.
508
// GetBlockFromBitstream uses ANS decoding (and thus keeps track of row
509
// pointers in row_nzeros), GetBlockFromEncoder simply reads the coefficient
510
// image provided by the encoder.
511
512
struct GetBlockFromBitstream : public GetBlock {
513
0
  void StartRow(size_t by) override {
514
0
    qf_row = rect.ConstRow(*qf, by);
515
0
    for (size_t c = 0; c < 3; c++) {
516
0
      size_t sby = by >> vshift[c];
517
0
      quant_dc_row = quant_dc->ConstRow(rect.y0() + by) + rect.x0();
518
0
      for (size_t i = 0; i < num_passes; i++) {
519
0
        row_nzeros[i][c] = group_dec_cache->num_nzeroes[i].PlaneRow(c, sby);
520
0
        row_nzeros_top[i][c] =
521
0
            sby == 0
522
0
                ? nullptr
523
0
                : group_dec_cache->num_nzeroes[i].ConstPlaneRow(c, sby - 1);
524
0
      }
525
0
    }
526
0
  }
527
528
  Status LoadBlock(size_t bx, size_t by, const AcStrategy& acs, size_t size,
529
                   size_t log2_covered_blocks, ACPtr block[3],
530
0
                   ACType ac_type) override {
531
0
    ;
532
0
    for (size_t c : {1, 0, 2}) {
533
0
      size_t sbx = bx >> hshift[c];
534
0
      size_t sby = by >> vshift[c];
535
0
      if (JXL_UNLIKELY((sbx << hshift[c] != bx) || (sby << vshift[c] != by))) {
536
0
        continue;
537
0
      }
538
539
0
      for (size_t pass = 0; JXL_UNLIKELY(pass < num_passes); pass++) {
540
0
        auto decode_ac_varblock =
541
0
            decoders[pass].UsesLZ77()
542
0
                ? (ac_type == ACType::k16 ? DecodeACVarBlock<ACType::k16, 1>
543
0
                                          : DecodeACVarBlock<ACType::k32, 1>)
544
0
                : (ac_type == ACType::k16 ? DecodeACVarBlock<ACType::k16, 0>
545
0
                                          : DecodeACVarBlock<ACType::k32, 0>);
546
0
        JXL_RETURN_IF_ERROR(decode_ac_varblock(
547
0
            ctx_offset[pass], log2_covered_blocks, row_nzeros[pass][c],
548
0
            row_nzeros_top[pass][c], nzeros_stride, c, sbx, sby, bx, acs,
549
0
            &coeff_orders[pass * coeff_order_size], readers[pass],
550
0
            &decoders[pass], context_map[pass], quant_dc_row, qf_row,
551
0
            *block_ctx_map, block[c], shift_for_pass[pass]));
552
0
      }
553
0
    }
554
0
    return true;
555
0
  }
556
557
  Status Init(BitReader* JXL_RESTRICT* JXL_RESTRICT readers, size_t num_passes,
558
              size_t group_idx, size_t histo_selector_bits, const Rect& rect,
559
              GroupDecCache* JXL_RESTRICT group_dec_cache,
560
0
              PassesDecoderState* dec_state, size_t first_pass) {
561
0
    for (size_t i = 0; i < 3; i++) {
562
0
      hshift[i] = dec_state->shared->frame_header.chroma_subsampling.HShift(i);
563
0
      vshift[i] = dec_state->shared->frame_header.chroma_subsampling.VShift(i);
564
0
    }
565
0
    this->coeff_order_size = dec_state->shared->coeff_order_size;
566
0
    this->coeff_orders =
567
0
        dec_state->shared->coeff_orders.data() + first_pass * coeff_order_size;
568
0
    this->context_map = dec_state->context_map.data() + first_pass;
569
0
    this->readers = readers;
570
0
    this->num_passes = num_passes;
571
0
    this->shift_for_pass =
572
0
        dec_state->shared->frame_header.passes.shift + first_pass;
573
0
    this->group_dec_cache = group_dec_cache;
574
0
    this->rect = rect;
575
0
    block_ctx_map = &dec_state->shared->block_ctx_map;
576
0
    qf = &dec_state->shared->raw_quant_field;
577
0
    quant_dc = &dec_state->shared->quant_dc;
578
579
0
    for (size_t pass = 0; pass < num_passes; pass++) {
580
      // Select which histogram set to use among those of the current pass.
581
0
      size_t cur_histogram = 0;
582
0
      if (histo_selector_bits != 0) {
583
0
        cur_histogram = readers[pass]->ReadBits(histo_selector_bits);
584
0
      }
585
0
      if (cur_histogram >= dec_state->shared->num_histograms) {
586
0
        return JXL_FAILURE("Invalid histogram selector");
587
0
      }
588
0
      ctx_offset[pass] = cur_histogram * block_ctx_map->NumACContexts();
589
590
0
      decoders[pass] =
591
0
          ANSSymbolReader(&dec_state->code[pass + first_pass], readers[pass]);
592
0
    }
593
0
    nzeros_stride = group_dec_cache->num_nzeroes[0].PixelsPerRow();
594
0
    for (size_t i = 0; i < num_passes; i++) {
595
0
      JXL_ASSERT(
596
0
          nzeros_stride ==
597
0
          static_cast<size_t>(group_dec_cache->num_nzeroes[i].PixelsPerRow()));
598
0
    }
599
0
    return true;
600
0
  }
601
602
  const uint32_t* shift_for_pass = nullptr;  // not owned
603
  const coeff_order_t* JXL_RESTRICT coeff_orders;
604
  size_t coeff_order_size;
605
  const std::vector<uint8_t>* JXL_RESTRICT context_map;
606
  ANSSymbolReader decoders[kMaxNumPasses];
607
  BitReader* JXL_RESTRICT* JXL_RESTRICT readers;
608
  size_t num_passes;
609
  size_t ctx_offset[kMaxNumPasses];
610
  size_t nzeros_stride;
611
  int32_t* JXL_RESTRICT row_nzeros[kMaxNumPasses][3];
612
  const int32_t* JXL_RESTRICT row_nzeros_top[kMaxNumPasses][3];
613
  GroupDecCache* JXL_RESTRICT group_dec_cache;
614
  const BlockCtxMap* block_ctx_map;
615
  const ImageI* qf;
616
  const ImageB* quant_dc;
617
  const int32_t* qf_row;
618
  const uint8_t* quant_dc_row;
619
  Rect rect;
620
  size_t hshift[3], vshift[3];
621
};
622
623
struct GetBlockFromEncoder : public GetBlock {
624
0
  void StartRow(size_t by) override {}
625
626
  Status LoadBlock(size_t bx, size_t by, const AcStrategy& acs, size_t size,
627
                   size_t log2_covered_blocks, ACPtr block[3],
628
0
                   ACType ac_type) override {
629
0
    JXL_DASSERT(ac_type == ACType::k32);
630
0
    for (size_t c = 0; c < 3; c++) {
631
      // for each pass
632
0
      for (size_t i = 0; i < quantized_ac->size(); i++) {
633
0
        for (size_t k = 0; k < size; k++) {
634
          // TODO(veluca): SIMD.
635
0
          block[c].ptr32[k] +=
636
0
              rows[i][c][offset + k] * (1 << shift_for_pass[i]);
637
0
        }
638
0
      }
639
0
    }
640
0
    offset += size;
641
0
    return true;
642
0
  }
643
644
  GetBlockFromEncoder(const std::vector<std::unique_ptr<ACImage>>& ac,
645
                      size_t group_idx, const uint32_t* shift_for_pass)
646
0
      : quantized_ac(&ac), shift_for_pass(shift_for_pass) {
647
    // TODO(veluca): not supported with chroma subsampling.
648
0
    for (size_t i = 0; i < quantized_ac->size(); i++) {
649
0
      JXL_CHECK((*quantized_ac)[i]->Type() == ACType::k32);
650
0
      for (size_t c = 0; c < 3; c++) {
651
0
        rows[i][c] = (*quantized_ac)[i]->PlaneRow(c, group_idx, 0).ptr32;
652
0
      }
653
0
    }
654
0
  }
655
656
  const std::vector<std::unique_ptr<ACImage>>* JXL_RESTRICT quantized_ac;
657
  size_t offset = 0;
658
  const int32_t* JXL_RESTRICT rows[kMaxNumPasses][3];
659
  const uint32_t* shift_for_pass = nullptr;  // not owned
660
};
661
662
// Makes the per-target builds of DecodeGroupImpl available to the
// HWY_DYNAMIC_DISPATCH(DecodeGroupImpl) calls further down in this file.
HWY_EXPORT(DecodeGroupImpl);
663
664
}  // namespace
665
666
// Decodes `num_passes` passes (starting at `first_pass`) of group `group_idx`
// from `readers`.
//
// Pixels are rendered only when these are the last passes of the frame or
// `force_draw` is set; `*should_run_pipeline`, if non-null, receives that
// decision. When rendering with no passes at all (num_passes == 0 and
// first_pass == 0) the group is produced by upsampling its DC 8x instead of
// decoding AC coefficients.
//
// Returns an error if group initialization or decoding fails, or if the ANS
// final state check fails for any pass.
Status DecodeGroup(BitReader* JXL_RESTRICT* JXL_RESTRICT readers,
                   size_t num_passes, size_t group_idx,
                   PassesDecoderState* JXL_RESTRICT dec_state,
                   GroupDecCache* JXL_RESTRICT group_dec_cache, size_t thread,
                   RenderPipelineInput& render_pipeline_input,
                   ImageBundle* JXL_RESTRICT decoded, size_t first_pass,
                   bool force_draw, bool dc_only, bool* should_run_pipeline) {
  const bool reaches_last_pass =
      num_passes + first_pass ==
      dec_state->shared->frame_header.passes.num_passes;
  const DrawMode draw = (reaches_last_pass || force_draw) ? kDraw : kDontDraw;

  if (should_run_pipeline != nullptr) {
    *should_run_pipeline = (draw == kDraw);
  }

  if (draw == kDraw && num_passes == 0 && first_pass == 0) {
    group_dec_cache->InitDCBufferOnce();
    const YCbCrChromaSubsampling& cs =
        dec_state->shared->frame_header.chroma_subsampling;
    for (size_t c = 0; c < 3; c++) {
      const size_t hs = cs.HShift(c);
      const size_t vs = cs.VShift(c);
      // Group rectangle in block units, before and after applying this
      // channel's subsampling shifts.
      const Rect full_rect = dec_state->shared->BlockGroupRect(group_idx);
      const Rect src_rect(full_rect.x0() >> hs, full_rect.y0() >> vs,
                          full_rect.xsize() >> hs, full_rect.ysize() >> vs);
      // group_dec_cache->dc_buffer receives this group's DC with two rows of
      // padding above and kRenderPipelineXOffset columns to the left.
      const Rect copy_rect(kRenderPipelineXOffset, 2, src_rect.xsize(),
                           src_rect.ysize());
      CopyImageToWithPadding(src_rect, dec_state->shared->dc->Plane(c), 2,
                             copy_rect, &group_dec_cache->dc_buffer);

      // Mirrorpad. Interleaving left and right padding ensures that padding
      // works out correctly even for images with DC size of 1.
      const bool pad_left = src_rect.x0() == 0;
      const bool pad_right = src_rect.x0() + src_rect.xsize() + 2 >=
                             (dec_state->shared->dc->xsize() >> hs);
      const size_t xend = kRenderPipelineXOffset +
                          (dec_state->shared->dc->Plane(c).xsize() >> hs) -
                          src_rect.x0();
      const size_t padded_ysize = src_rect.ysize() + 4;
      for (size_t y = 0; y < padded_ysize; y++) {
        auto* row = group_dec_cache->dc_buffer.Row(y);
        for (size_t ix = 0; ix < 2; ix++) {
          if (pad_left) {
            row[kRenderPipelineXOffset - ix - 1] =
                row[kRenderPipelineXOffset + ix];
          }
          if (pad_right) {
            row[xend + ix] = row[xend - ix - 1];
          }
        }
      }

      Rect dst_rect = render_pipeline_input.GetBuffer(c).second;
      ImageF* upsampling_dst = render_pipeline_input.GetBuffer(c).first;
      JXL_ASSERT(dst_rect.IsInside(*upsampling_dst));

      // Each DC row is upsampled from 5 input rows (itself and two on each
      // side) into 8 output rows.
      RenderPipelineStage::RowInfo input_rows(1, std::vector<float*>(5));
      RenderPipelineStage::RowInfo output_rows(1, std::vector<float*>(8));
      const auto dc_ysize = dec_state->shared->dc->Plane(c).ysize() >> vs;
      const size_t y_begin = src_rect.y0();
      const size_t y_end = y_begin + src_rect.ysize();
      for (size_t y = y_begin; y < y_end; y++) {
        for (ssize_t iy = 0; iy < 5; iy++) {
          // Rows outside of the DC image are mirrored back inside; the result
          // is then translated into dc_buffer coordinates.
          const auto mirrored_y = Mirror(ssize_t(y) + iy - 2, dc_ysize);
          input_rows[0][iy] =
              group_dec_cache->dc_buffer.Row(mirrored_y + 2 - src_rect.y0());
        }
        const size_t out_y0 = (y - y_begin) << 3;
        for (size_t iy = 0; iy < 8; iy++) {
          output_rows[0][iy] = dst_rect.Row(upsampling_dst, out_y0 + iy) -
                               kRenderPipelineXOffset;
        }
        // Arguments set to 0/nullptr are not used.
        dec_state->upsampler8x->ProcessRow(input_rows, output_rows,
                                           /*xextra=*/0, src_rect.xsize(), 0, 0,
                                           thread);
      }
    }
    return true;
  }

  // A histogram selector is only read when there is AC data to decode.
  size_t histo_selector_bits = 0;
  if (!dc_only) {
    JXL_ASSERT(dec_state->shared->num_histograms > 0);
    histo_selector_bits = CeilLog2Nonzero(dec_state->shared->num_histograms);
  } else {
    JXL_ASSERT(num_passes == 0);
  }

  auto get_block = jxl::make_unique<GetBlockFromBitstream>();
  JXL_RETURN_IF_ERROR(
      get_block->Init(readers, num_passes, group_idx, histo_selector_bits,
                      dec_state->shared->BlockGroupRect(group_idx),
                      group_dec_cache, dec_state, first_pass));

  JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(DecodeGroupImpl)(
      get_block.get(), group_dec_cache, dec_state, thread, group_idx,
      render_pipeline_input, decoded, draw));

  // Every pass must leave its ANS decoder in the expected final state.
  for (size_t p = 0; p < num_passes; p++) {
    const bool ans_ok = get_block->decoders[p].CheckANSFinalState();
    if (!ans_ok) {
      return JXL_FAILURE("ANS checksum failure.");
    }
  }
  return true;
}
770
771
// Renders group `group_idx` from the quantized coefficient images `ac`
// produced by the encoder, without going through a bitstream. The group is
// always drawn (kDraw). `aux_out` is not used.
Status DecodeGroupForRoundtrip(const std::vector<std::unique_ptr<ACImage>>& ac,
                               size_t group_idx,
                               PassesDecoderState* JXL_RESTRICT dec_state,
                               GroupDecCache* JXL_RESTRICT group_dec_cache,
                               size_t thread,
                               RenderPipelineInput& render_pipeline_input,
                               ImageBundle* JXL_RESTRICT decoded,
                               AuxOut* aux_out) {
  const uint32_t* pass_shifts = dec_state->shared->frame_header.passes.shift;
  GetBlockFromEncoder get_block(ac, group_idx, pass_shifts);

  // Bit mask with one bit set for every valid AC strategy.
  const auto all_strategies = (1u << AcStrategy::kNumValidStrategies) - 1;
  group_dec_cache->InitOnce(/*num_passes=*/0, /*used_acs=*/all_strategies);

  return HWY_DYNAMIC_DISPATCH(DecodeGroupImpl)(
      &get_block, group_dec_cache, dec_state, thread, group_idx,
      render_pipeline_input, decoded, kDraw);
}
789
790
}  // namespace jxl
791
#endif  // HWY_ONCE