/src/libjxl/lib/jxl/dec_group.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jxl/dec_group.h" |
7 | | |
8 | | #include <stdint.h> |
9 | | #include <string.h> |
10 | | |
11 | | #include <algorithm> |
12 | | #include <memory> |
13 | | #include <utility> |
14 | | |
15 | | #include "lib/jxl/frame_header.h" |
16 | | |
17 | | #undef HWY_TARGET_INCLUDE |
18 | | #define HWY_TARGET_INCLUDE "lib/jxl/dec_group.cc" |
19 | | #include <hwy/foreach_target.h> |
20 | | #include <hwy/highway.h> |
21 | | |
22 | | #include "lib/jxl/ac_context.h" |
23 | | #include "lib/jxl/ac_strategy.h" |
24 | | #include "lib/jxl/base/bits.h" |
25 | | #include "lib/jxl/base/printf_macros.h" |
26 | | #include "lib/jxl/base/status.h" |
27 | | #include "lib/jxl/coeff_order.h" |
28 | | #include "lib/jxl/common.h" |
29 | | #include "lib/jxl/convolve.h" |
30 | | #include "lib/jxl/dct_scales.h" |
31 | | #include "lib/jxl/dec_cache.h" |
32 | | #include "lib/jxl/dec_transforms-inl.h" |
33 | | #include "lib/jxl/dec_xyb.h" |
34 | | #include "lib/jxl/entropy_coder.h" |
35 | | #include "lib/jxl/epf.h" |
36 | | #include "lib/jxl/opsin_params.h" |
37 | | #include "lib/jxl/quant_weights.h" |
38 | | #include "lib/jxl/quantizer-inl.h" |
39 | | #include "lib/jxl/quantizer.h" |
40 | | |
41 | | #ifndef LIB_JXL_DEC_GROUP_CC |
42 | | #define LIB_JXL_DEC_GROUP_CC |
43 | | namespace jxl { |
44 | | |
45 | | struct AuxOut; |
46 | | |
47 | | // Interface for reading groups for DecodeGroupImpl. |
48 | | class GetBlock { |
49 | | public: |
50 | | virtual void StartRow(size_t by) = 0; |
51 | | virtual Status LoadBlock(size_t bx, size_t by, const AcStrategy& acs, |
52 | | size_t size, size_t log2_covered_blocks, |
53 | | ACPtr block[3], ACType ac_type) = 0; |
54 | 0 | virtual ~GetBlock() {} |
55 | | }; |
56 | | |
// Tells DecodeGroupImpl whether the blocks it reads should also be rendered
// to pixels.
enum DrawMode {
  kDraw = 0,      // Render to pixels.
  kDontDraw = 1,  // Don't render to pixels.
};
64 | | |
65 | | } // namespace jxl |
66 | | #endif // LIB_JXL_DEC_GROUP_CC |
67 | | |
68 | | HWY_BEFORE_NAMESPACE(); |
69 | | namespace jxl { |
70 | | namespace HWY_NAMESPACE { |
71 | | |
72 | | // These templates are not found via ADL. |
73 | | using hwy::HWY_NAMESPACE::Rebind; |
74 | | using hwy::HWY_NAMESPACE::ShiftRight; |
75 | | |
76 | | using D = HWY_FULL(float); |
77 | | using DU = HWY_FULL(uint32_t); |
78 | | using DI = HWY_FULL(int32_t); |
79 | | using DI16 = Rebind<int16_t, DI>; |
80 | | constexpr D d; |
81 | | constexpr DI di; |
82 | | constexpr DI16 di16; |
83 | | |
84 | | // TODO(veluca): consider SIMDfying. |
85 | 0 | void Transpose8x8InPlace(int32_t* JXL_RESTRICT block) { |
86 | 0 | for (size_t x = 0; x < 8; x++) { |
87 | 0 | for (size_t y = x + 1; y < 8; y++) { |
88 | 0 | std::swap(block[y * 8 + x], block[x * 8 + y]); |
89 | 0 | } |
90 | 0 | } |
91 | 0 | } Unexecuted instantiation: jxl::N_SSE4::Transpose8x8InPlace(int*) Unexecuted instantiation: jxl::N_AVX2::Transpose8x8InPlace(int*) Unexecuted instantiation: jxl::N_AVX3::Transpose8x8InPlace(int*) Unexecuted instantiation: jxl::N_AVX3_ZEN4::Transpose8x8InPlace(int*) Unexecuted instantiation: jxl::N_AVX3_SPR::Transpose8x8InPlace(int*) Unexecuted instantiation: jxl::N_SSE2::Transpose8x8InPlace(int*) |
92 | | |
93 | | template <ACType ac_type> |
94 | | void DequantLane(Vec<D> scaled_dequant_x, Vec<D> scaled_dequant_y, |
95 | | Vec<D> scaled_dequant_b, |
96 | | const float* JXL_RESTRICT dequant_matrices, size_t size, |
97 | | size_t k, Vec<D> x_cc_mul, Vec<D> b_cc_mul, |
98 | | const float* JXL_RESTRICT biases, ACPtr qblock[3], |
99 | 0 | float* JXL_RESTRICT block) { |
100 | 0 | const auto x_mul = Mul(Load(d, dequant_matrices + k), scaled_dequant_x); |
101 | 0 | const auto y_mul = |
102 | 0 | Mul(Load(d, dequant_matrices + size + k), scaled_dequant_y); |
103 | 0 | const auto b_mul = |
104 | 0 | Mul(Load(d, dequant_matrices + 2 * size + k), scaled_dequant_b); |
105 | |
|
106 | 0 | Vec<DI> quantized_x_int; |
107 | 0 | Vec<DI> quantized_y_int; |
108 | 0 | Vec<DI> quantized_b_int; |
109 | 0 | if (ac_type == ACType::k16) { |
110 | 0 | Rebind<int16_t, DI> di16; |
111 | 0 | quantized_x_int = PromoteTo(di, Load(di16, qblock[0].ptr16 + k)); |
112 | 0 | quantized_y_int = PromoteTo(di, Load(di16, qblock[1].ptr16 + k)); |
113 | 0 | quantized_b_int = PromoteTo(di, Load(di16, qblock[2].ptr16 + k)); |
114 | 0 | } else { |
115 | 0 | quantized_x_int = Load(di, qblock[0].ptr32 + k); |
116 | 0 | quantized_y_int = Load(di, qblock[1].ptr32 + k); |
117 | 0 | quantized_b_int = Load(di, qblock[2].ptr32 + k); |
118 | 0 | } |
119 | |
|
120 | 0 | const auto dequant_x_cc = |
121 | 0 | Mul(AdjustQuantBias(di, 0, quantized_x_int, biases), x_mul); |
122 | 0 | const auto dequant_y = |
123 | 0 | Mul(AdjustQuantBias(di, 1, quantized_y_int, biases), y_mul); |
124 | 0 | const auto dequant_b_cc = |
125 | 0 | Mul(AdjustQuantBias(di, 2, quantized_b_int, biases), b_mul); |
126 | |
|
127 | 0 | const auto dequant_x = MulAdd(x_cc_mul, dequant_y, dequant_x_cc); |
128 | 0 | const auto dequant_b = MulAdd(b_cc_mul, dequant_y, dequant_b_cc); |
129 | 0 | Store(dequant_x, d, block + k); |
130 | 0 | Store(dequant_y, d, block + size + k); |
131 | 0 | Store(dequant_b, d, block + 2 * size + k); |
132 | 0 | } Unexecuted instantiation: void jxl::N_SSE4::DequantLane<(jxl::ACType)0>(hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, float const*, unsigned long, unsigned long, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, float const*, jxl::ACPtr*, float*) Unexecuted instantiation: void jxl::N_SSE4::DequantLane<(jxl::ACType)1>(hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, float const*, unsigned long, unsigned long, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, float const*, jxl::ACPtr*, float*) Unexecuted instantiation: void jxl::N_AVX2::DequantLane<(jxl::ACType)0>(hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, float const*, unsigned long, unsigned long, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, float const*, jxl::ACPtr*, float*) Unexecuted instantiation: void jxl::N_AVX2::DequantLane<(jxl::ACType)1>(hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, float const*, unsigned long, unsigned long, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, float const*, jxl::ACPtr*, float*) Unexecuted instantiation: void jxl::N_AVX3::DequantLane<(jxl::ACType)0>(hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, float const*, unsigned long, unsigned long, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, float const*, jxl::ACPtr*, float*) Unexecuted instantiation: void jxl::N_AVX3::DequantLane<(jxl::ACType)1>(hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, float const*, unsigned long, unsigned long, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, float const*, jxl::ACPtr*, float*) Unexecuted instantiation: void jxl::N_AVX3_ZEN4::DequantLane<(jxl::ACType)0>(hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, float const*, 
unsigned long, unsigned long, hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, float const*, jxl::ACPtr*, float*) Unexecuted instantiation: void jxl::N_AVX3_ZEN4::DequantLane<(jxl::ACType)1>(hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, float const*, unsigned long, unsigned long, hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, float const*, jxl::ACPtr*, float*) Unexecuted instantiation: void jxl::N_AVX3_SPR::DequantLane<(jxl::ACType)0>(hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, float const*, unsigned long, unsigned long, hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, float const*, jxl::ACPtr*, float*) Unexecuted instantiation: void jxl::N_AVX3_SPR::DequantLane<(jxl::ACType)1>(hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, float const*, unsigned long, unsigned long, hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, float const*, jxl::ACPtr*, float*) Unexecuted instantiation: void jxl::N_SSE2::DequantLane<(jxl::ACType)0>(hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, float const*, unsigned long, unsigned long, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, float const*, jxl::ACPtr*, float*) Unexecuted instantiation: void jxl::N_SSE2::DequantLane<(jxl::ACType)1>(hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, float const*, unsigned long, unsigned long, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, float const*, jxl::ACPtr*, float*) |
133 | | |
134 | | template <ACType ac_type> |
135 | | void DequantBlock(const AcStrategy& acs, float inv_global_scale, int quant, |
136 | | float x_dm_multiplier, float b_dm_multiplier, Vec<D> x_cc_mul, |
137 | | Vec<D> b_cc_mul, size_t kind, size_t size, |
138 | | const Quantizer& quantizer, size_t covered_blocks, |
139 | | const size_t* sbx, |
140 | | const float* JXL_RESTRICT* JXL_RESTRICT dc_row, |
141 | | size_t dc_stride, const float* JXL_RESTRICT biases, |
142 | | ACPtr qblock[3], float* JXL_RESTRICT block, |
143 | 0 | float* JXL_RESTRICT scratch) { |
144 | 0 | const auto scaled_dequant_s = inv_global_scale / quant; |
145 | |
|
146 | 0 | const auto scaled_dequant_x = Set(d, scaled_dequant_s * x_dm_multiplier); |
147 | 0 | const auto scaled_dequant_y = Set(d, scaled_dequant_s); |
148 | 0 | const auto scaled_dequant_b = Set(d, scaled_dequant_s * b_dm_multiplier); |
149 | |
|
150 | 0 | const float* dequant_matrices = quantizer.DequantMatrix(kind, 0); |
151 | |
|
152 | 0 | for (size_t k = 0; k < covered_blocks * kDCTBlockSize; k += Lanes(d)) { |
153 | 0 | DequantLane<ac_type>(scaled_dequant_x, scaled_dequant_y, scaled_dequant_b, |
154 | 0 | dequant_matrices, size, k, x_cc_mul, b_cc_mul, biases, |
155 | 0 | qblock, block); |
156 | 0 | } |
157 | 0 | for (size_t c = 0; c < 3; c++) { |
158 | 0 | LowestFrequenciesFromDC(acs.Strategy(), dc_row[c] + sbx[c], dc_stride, |
159 | 0 | block + c * size, scratch); |
160 | 0 | } |
161 | 0 | } Unexecuted instantiation: void jxl::N_SSE4::DequantBlock<(jxl::ACType)0>(jxl::AcStrategy const&, float, int, float, float, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, unsigned long, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*) Unexecuted instantiation: void jxl::N_SSE4::DequantBlock<(jxl::ACType)1>(jxl::AcStrategy const&, float, int, float, float, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, unsigned long, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*) Unexecuted instantiation: void jxl::N_AVX2::DequantBlock<(jxl::ACType)0>(jxl::AcStrategy const&, float, int, float, float, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, unsigned long, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*) Unexecuted instantiation: void jxl::N_AVX2::DequantBlock<(jxl::ACType)1>(jxl::AcStrategy const&, float, int, float, float, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, unsigned long, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*) Unexecuted instantiation: void jxl::N_AVX3::DequantBlock<(jxl::ACType)0>(jxl::AcStrategy const&, float, int, float, float, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, unsigned long, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*) Unexecuted instantiation: void jxl::N_AVX3::DequantBlock<(jxl::ACType)1>(jxl::AcStrategy const&, float, int, float, float, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, unsigned long, unsigned long, 
jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*) Unexecuted instantiation: void jxl::N_AVX3_ZEN4::DequantBlock<(jxl::ACType)0>(jxl::AcStrategy const&, float, int, float, float, hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, unsigned long, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*) Unexecuted instantiation: void jxl::N_AVX3_ZEN4::DequantBlock<(jxl::ACType)1>(jxl::AcStrategy const&, float, int, float, float, hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, unsigned long, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*) Unexecuted instantiation: void jxl::N_AVX3_SPR::DequantBlock<(jxl::ACType)0>(jxl::AcStrategy const&, float, int, float, float, hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, unsigned long, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*) Unexecuted instantiation: void jxl::N_AVX3_SPR::DequantBlock<(jxl::ACType)1>(jxl::AcStrategy const&, float, int, float, float, hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, unsigned long, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*) Unexecuted instantiation: void jxl::N_SSE2::DequantBlock<(jxl::ACType)0>(jxl::AcStrategy const&, float, int, float, float, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, unsigned long, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*) Unexecuted instantiation: void 
jxl::N_SSE2::DequantBlock<(jxl::ACType)1>(jxl::AcStrategy const&, float, int, float, float, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, unsigned long, unsigned long, jxl::Quantizer const&, unsigned long, unsigned long const*, float const* restrict*, unsigned long, float const*, jxl::ACPtr*, float*, float*) |
162 | | |
163 | | Status DecodeGroupImpl(GetBlock* JXL_RESTRICT get_block, |
164 | | GroupDecCache* JXL_RESTRICT group_dec_cache, |
165 | | PassesDecoderState* JXL_RESTRICT dec_state, |
166 | | size_t thread, size_t group_idx, |
167 | | RenderPipelineInput& render_pipeline_input, |
168 | 0 | ImageBundle* decoded, DrawMode draw) { |
169 | | // TODO(veluca): investigate cache usage in this function. |
170 | 0 | const Rect block_rect = dec_state->shared->BlockGroupRect(group_idx); |
171 | 0 | const AcStrategyImage& ac_strategy = dec_state->shared->ac_strategy; |
172 | |
|
173 | 0 | const size_t xsize_blocks = block_rect.xsize(); |
174 | 0 | const size_t ysize_blocks = block_rect.ysize(); |
175 | |
|
176 | 0 | const size_t dc_stride = dec_state->shared->dc->PixelsPerRow(); |
177 | |
|
178 | 0 | const float inv_global_scale = dec_state->shared->quantizer.InvGlobalScale(); |
179 | |
|
180 | 0 | const YCbCrChromaSubsampling& cs = |
181 | 0 | dec_state->shared->frame_header.chroma_subsampling; |
182 | |
|
183 | 0 | size_t idct_stride[3]; |
184 | 0 | for (size_t c = 0; c < 3; c++) { |
185 | 0 | idct_stride[c] = render_pipeline_input.GetBuffer(c).first->PixelsPerRow(); |
186 | 0 | } |
187 | |
|
188 | 0 | HWY_ALIGN int32_t scaled_qtable[64 * 3]; |
189 | |
|
190 | 0 | ACType ac_type = dec_state->coefficients->Type(); |
191 | 0 | auto dequant_block = ac_type == ACType::k16 ? DequantBlock<ACType::k16> |
192 | 0 | : DequantBlock<ACType::k32>; |
193 | | // Whether or not coefficients should be stored for future usage, and/or read |
194 | | // from past usage. |
195 | 0 | bool accumulate = !dec_state->coefficients->IsEmpty(); |
196 | | // Offset of the current block in the group. |
197 | 0 | size_t offset = 0; |
198 | |
|
199 | 0 | std::array<int, 3> jpeg_c_map; |
200 | 0 | bool jpeg_is_gray = false; |
201 | 0 | std::array<int, 3> dcoff = {}; |
202 | | |
203 | | // TODO(veluca): all of this should be done only once per image. |
204 | 0 | if (decoded->IsJPEG()) { |
205 | 0 | if (!dec_state->shared->cmap.IsJPEGCompatible()) { |
206 | 0 | return JXL_FAILURE("The CfL map is not JPEG-compatible"); |
207 | 0 | } |
208 | 0 | jpeg_is_gray = (decoded->jpeg_data->components.size() == 1); |
209 | 0 | jpeg_c_map = JpegOrder(dec_state->shared->frame_header.color_transform, |
210 | 0 | jpeg_is_gray); |
211 | 0 | const std::vector<QuantEncoding>& qe = |
212 | 0 | dec_state->shared->matrices.encodings(); |
213 | 0 | if (qe.empty() || qe[0].mode != QuantEncoding::Mode::kQuantModeRAW || |
214 | 0 | std::abs(qe[0].qraw.qtable_den - 1.f / (8 * 255)) > 1e-8f) { |
215 | 0 | return JXL_FAILURE( |
216 | 0 | "Quantization table is not a JPEG quantization table."); |
217 | 0 | } |
218 | 0 | for (size_t c = 0; c < 3; c++) { |
219 | 0 | if (dec_state->shared->frame_header.color_transform == |
220 | 0 | ColorTransform::kNone) { |
221 | 0 | dcoff[c] = 1024 / (*qe[0].qraw.qtable)[64 * c]; |
222 | 0 | } |
223 | 0 | for (size_t i = 0; i < 64; i++) { |
224 | | // Transpose the matrix, as it will be used on the transposed block. |
225 | 0 | int n = qe[0].qraw.qtable->at(64 + i); |
226 | 0 | int d = qe[0].qraw.qtable->at(64 * c + i); |
227 | 0 | if (n <= 0 || d <= 0 || n >= 65536 || d >= 65536) { |
228 | 0 | return JXL_FAILURE("Invalid JPEG quantization table"); |
229 | 0 | } |
230 | 0 | scaled_qtable[64 * c + (i % 8) * 8 + (i / 8)] = |
231 | 0 | (1 << kCFLFixedPointPrecision) * n / d; |
232 | 0 | } |
233 | 0 | } |
234 | 0 | } |
235 | | |
236 | 0 | size_t hshift[3] = {cs.HShift(0), cs.HShift(1), cs.HShift(2)}; |
237 | 0 | size_t vshift[3] = {cs.VShift(0), cs.VShift(1), cs.VShift(2)}; |
238 | 0 | Rect r[3]; |
239 | 0 | for (size_t i = 0; i < 3; i++) { |
240 | 0 | r[i] = |
241 | 0 | Rect(block_rect.x0() >> hshift[i], block_rect.y0() >> vshift[i], |
242 | 0 | block_rect.xsize() >> hshift[i], block_rect.ysize() >> vshift[i]); |
243 | 0 | if (!r[i].IsInside({0, 0, dec_state->shared->dc->Plane(i).xsize(), |
244 | 0 | dec_state->shared->dc->Plane(i).ysize()})) { |
245 | 0 | return JXL_FAILURE("Frame dimensions are too big for the image."); |
246 | 0 | } |
247 | 0 | } |
248 | | |
249 | 0 | for (size_t by = 0; by < ysize_blocks; ++by) { |
250 | 0 | get_block->StartRow(by); |
251 | 0 | size_t sby[3] = {by >> vshift[0], by >> vshift[1], by >> vshift[2]}; |
252 | |
|
253 | 0 | const int32_t* JXL_RESTRICT row_quant = |
254 | 0 | block_rect.ConstRow(dec_state->shared->raw_quant_field, by); |
255 | |
|
256 | 0 | const float* JXL_RESTRICT dc_rows[3] = { |
257 | 0 | r[0].ConstPlaneRow(*dec_state->shared->dc, 0, sby[0]), |
258 | 0 | r[1].ConstPlaneRow(*dec_state->shared->dc, 1, sby[1]), |
259 | 0 | r[2].ConstPlaneRow(*dec_state->shared->dc, 2, sby[2]), |
260 | 0 | }; |
261 | |
|
262 | 0 | const size_t ty = (block_rect.y0() + by) / kColorTileDimInBlocks; |
263 | 0 | AcStrategyRow acs_row = ac_strategy.ConstRow(block_rect, by); |
264 | |
|
265 | 0 | const int8_t* JXL_RESTRICT row_cmap[3] = { |
266 | 0 | dec_state->shared->cmap.ytox_map.ConstRow(ty), |
267 | 0 | nullptr, |
268 | 0 | dec_state->shared->cmap.ytob_map.ConstRow(ty), |
269 | 0 | }; |
270 | |
|
271 | 0 | float* JXL_RESTRICT idct_row[3]; |
272 | 0 | int16_t* JXL_RESTRICT jpeg_row[3]; |
273 | 0 | for (size_t c = 0; c < 3; c++) { |
274 | 0 | idct_row[c] = render_pipeline_input.GetBuffer(c).second.Row( |
275 | 0 | render_pipeline_input.GetBuffer(c).first, sby[c] * kBlockDim); |
276 | 0 | if (decoded->IsJPEG()) { |
277 | 0 | auto& component = decoded->jpeg_data->components[jpeg_c_map[c]]; |
278 | 0 | jpeg_row[c] = |
279 | 0 | component.coeffs.data() + |
280 | 0 | (component.width_in_blocks * (r[c].y0() + sby[c]) + r[c].x0()) * |
281 | 0 | kDCTBlockSize; |
282 | 0 | } |
283 | 0 | } |
284 | |
|
285 | 0 | size_t bx = 0; |
286 | 0 | for (size_t tx = 0; tx < DivCeil(xsize_blocks, kColorTileDimInBlocks); |
287 | 0 | tx++) { |
288 | 0 | size_t abs_tx = tx + block_rect.x0() / kColorTileDimInBlocks; |
289 | 0 | auto x_cc_mul = |
290 | 0 | Set(d, dec_state->shared->cmap.YtoXRatio(row_cmap[0][abs_tx])); |
291 | 0 | auto b_cc_mul = |
292 | 0 | Set(d, dec_state->shared->cmap.YtoBRatio(row_cmap[2][abs_tx])); |
293 | | // Increment bx by llf_x because those iterations would otherwise |
294 | | // immediately continue (!IsFirstBlock). Reduces mispredictions. |
295 | 0 | for (; bx < xsize_blocks && bx < (tx + 1) * kColorTileDimInBlocks;) { |
296 | 0 | size_t sbx[3] = {bx >> hshift[0], bx >> hshift[1], bx >> hshift[2]}; |
297 | 0 | AcStrategy acs = acs_row[bx]; |
298 | 0 | const size_t llf_x = acs.covered_blocks_x(); |
299 | | |
300 | | // Can only happen in the second or lower rows of a varblock. |
301 | 0 | if (JXL_UNLIKELY(!acs.IsFirstBlock())) { |
302 | 0 | bx += llf_x; |
303 | 0 | continue; |
304 | 0 | } |
305 | 0 | const size_t log2_covered_blocks = acs.log2_covered_blocks(); |
306 | |
|
307 | 0 | const size_t covered_blocks = 1 << log2_covered_blocks; |
308 | 0 | const size_t size = covered_blocks * kDCTBlockSize; |
309 | |
|
310 | 0 | ACPtr qblock[3]; |
311 | 0 | if (accumulate) { |
312 | 0 | for (size_t c = 0; c < 3; c++) { |
313 | 0 | qblock[c] = dec_state->coefficients->PlaneRow(c, group_idx, offset); |
314 | 0 | } |
315 | 0 | } else { |
316 | | // No point in reading from bitstream without accumulating and not |
317 | | // drawing. |
318 | 0 | JXL_ASSERT(draw == kDraw); |
319 | 0 | if (ac_type == ACType::k16) { |
320 | 0 | memset(group_dec_cache->dec_group_qblock16, 0, |
321 | 0 | size * 3 * sizeof(int16_t)); |
322 | 0 | for (size_t c = 0; c < 3; c++) { |
323 | 0 | qblock[c].ptr16 = group_dec_cache->dec_group_qblock16 + c * size; |
324 | 0 | } |
325 | 0 | } else { |
326 | 0 | memset(group_dec_cache->dec_group_qblock, 0, |
327 | 0 | size * 3 * sizeof(int32_t)); |
328 | 0 | for (size_t c = 0; c < 3; c++) { |
329 | 0 | qblock[c].ptr32 = group_dec_cache->dec_group_qblock + c * size; |
330 | 0 | } |
331 | 0 | } |
332 | 0 | } |
333 | 0 | JXL_RETURN_IF_ERROR(get_block->LoadBlock( |
334 | 0 | bx, by, acs, size, log2_covered_blocks, qblock, ac_type)); |
335 | 0 | offset += size; |
336 | 0 | if (draw == kDontDraw) { |
337 | 0 | bx += llf_x; |
338 | 0 | continue; |
339 | 0 | } |
340 | | |
341 | 0 | if (JXL_UNLIKELY(decoded->IsJPEG())) { |
342 | 0 | if (acs.Strategy() != AcStrategy::Type::DCT) { |
343 | 0 | return JXL_FAILURE( |
344 | 0 | "Can only decode to JPEG if only DCT-8 is used."); |
345 | 0 | } |
346 | | |
347 | 0 | HWY_ALIGN int32_t transposed_dct_y[64]; |
348 | 0 | for (size_t c : {1, 0, 2}) { |
349 | | // Propagate only Y for grayscale. |
350 | 0 | if (jpeg_is_gray && c != 1) { |
351 | 0 | continue; |
352 | 0 | } |
353 | 0 | if ((sbx[c] << hshift[c] != bx) || (sby[c] << vshift[c] != by)) { |
354 | 0 | continue; |
355 | 0 | } |
356 | 0 | int16_t* JXL_RESTRICT jpeg_pos = |
357 | 0 | jpeg_row[c] + sbx[c] * kDCTBlockSize; |
358 | | // JPEG XL is transposed, JPEG is not. |
359 | 0 | auto transposed_dct = qblock[c].ptr32; |
360 | 0 | Transpose8x8InPlace(transposed_dct); |
361 | | // No CfL - no need to store the y block converted to integers. |
362 | 0 | if (!cs.Is444() || |
363 | 0 | (row_cmap[0][abs_tx] == 0 && row_cmap[2][abs_tx] == 0)) { |
364 | 0 | for (size_t i = 0; i < 64; i += Lanes(d)) { |
365 | 0 | const auto ini = Load(di, transposed_dct + i); |
366 | 0 | const auto ini16 = DemoteTo(di16, ini); |
367 | 0 | StoreU(ini16, di16, jpeg_pos + i); |
368 | 0 | } |
369 | 0 | } else if (c == 1) { |
370 | | // Y channel: save for restoring X/B, but nothing else to do. |
371 | 0 | for (size_t i = 0; i < 64; i += Lanes(d)) { |
372 | 0 | const auto ini = Load(di, transposed_dct + i); |
373 | 0 | Store(ini, di, transposed_dct_y + i); |
374 | 0 | const auto ini16 = DemoteTo(di16, ini); |
375 | 0 | StoreU(ini16, di16, jpeg_pos + i); |
376 | 0 | } |
377 | 0 | } else { |
378 | | // transposed_dct_y contains the y channel block, transposed. |
379 | 0 | const auto scale = Set( |
380 | 0 | di, dec_state->shared->cmap.RatioJPEG(row_cmap[c][abs_tx])); |
381 | 0 | const auto round = Set(di, 1 << (kCFLFixedPointPrecision - 1)); |
382 | 0 | for (int i = 0; i < 64; i += Lanes(d)) { |
383 | 0 | auto in = Load(di, transposed_dct + i); |
384 | 0 | auto in_y = Load(di, transposed_dct_y + i); |
385 | 0 | auto qt = Load(di, scaled_qtable + c * size + i); |
386 | 0 | auto coeff_scale = ShiftRight<kCFLFixedPointPrecision>( |
387 | 0 | Add(Mul(qt, scale), round)); |
388 | 0 | auto cfl_factor = ShiftRight<kCFLFixedPointPrecision>( |
389 | 0 | Add(Mul(in_y, coeff_scale), round)); |
390 | 0 | StoreU(DemoteTo(di16, Add(in, cfl_factor)), di16, jpeg_pos + i); |
391 | 0 | } |
392 | 0 | } |
393 | 0 | jpeg_pos[0] = |
394 | 0 | Clamp1<float>(dc_rows[c][sbx[c]] - dcoff[c], -2047, 2047); |
395 | 0 | } |
396 | 0 | } else { |
397 | 0 | HWY_ALIGN float* const block = group_dec_cache->dec_group_block; |
398 | | // Dequantize and add predictions. |
399 | 0 | dequant_block( |
400 | 0 | acs, inv_global_scale, row_quant[bx], dec_state->x_dm_multiplier, |
401 | 0 | dec_state->b_dm_multiplier, x_cc_mul, b_cc_mul, acs.RawStrategy(), |
402 | 0 | size, dec_state->shared->quantizer, |
403 | 0 | acs.covered_blocks_y() * acs.covered_blocks_x(), sbx, dc_rows, |
404 | 0 | dc_stride, |
405 | 0 | dec_state->output_encoding_info.opsin_params.quant_biases, qblock, |
406 | 0 | block, group_dec_cache->scratch_space); |
407 | |
|
408 | 0 | for (size_t c : {1, 0, 2}) { |
409 | 0 | if ((sbx[c] << hshift[c] != bx) || (sby[c] << vshift[c] != by)) { |
410 | 0 | continue; |
411 | 0 | } |
412 | | // IDCT |
413 | 0 | float* JXL_RESTRICT idct_pos = idct_row[c] + sbx[c] * kBlockDim; |
414 | 0 | TransformToPixels(acs.Strategy(), block + c * size, idct_pos, |
415 | 0 | idct_stride[c], group_dec_cache->scratch_space); |
416 | 0 | } |
417 | 0 | } |
418 | 0 | bx += llf_x; |
419 | 0 | } |
420 | 0 | } |
421 | 0 | } |
422 | 0 | return true; |
423 | 0 | } Unexecuted instantiation: jxl::N_SSE4::DecodeGroupImpl(jxl::GetBlock*, jxl::GroupDecCache*, jxl::PassesDecoderState*, unsigned long, unsigned long, jxl::RenderPipelineInput&, jxl::ImageBundle*, jxl::DrawMode) Unexecuted instantiation: jxl::N_AVX2::DecodeGroupImpl(jxl::GetBlock*, jxl::GroupDecCache*, jxl::PassesDecoderState*, unsigned long, unsigned long, jxl::RenderPipelineInput&, jxl::ImageBundle*, jxl::DrawMode) Unexecuted instantiation: jxl::N_AVX3::DecodeGroupImpl(jxl::GetBlock*, jxl::GroupDecCache*, jxl::PassesDecoderState*, unsigned long, unsigned long, jxl::RenderPipelineInput&, jxl::ImageBundle*, jxl::DrawMode) Unexecuted instantiation: jxl::N_AVX3_ZEN4::DecodeGroupImpl(jxl::GetBlock*, jxl::GroupDecCache*, jxl::PassesDecoderState*, unsigned long, unsigned long, jxl::RenderPipelineInput&, jxl::ImageBundle*, jxl::DrawMode) Unexecuted instantiation: jxl::N_AVX3_SPR::DecodeGroupImpl(jxl::GetBlock*, jxl::GroupDecCache*, jxl::PassesDecoderState*, unsigned long, unsigned long, jxl::RenderPipelineInput&, jxl::ImageBundle*, jxl::DrawMode) Unexecuted instantiation: jxl::N_SSE2::DecodeGroupImpl(jxl::GetBlock*, jxl::GroupDecCache*, jxl::PassesDecoderState*, unsigned long, unsigned long, jxl::RenderPipelineInput&, jxl::ImageBundle*, jxl::DrawMode) |
424 | | |
425 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
426 | | } // namespace HWY_NAMESPACE |
427 | | } // namespace jxl |
428 | | HWY_AFTER_NAMESPACE(); |
429 | | |
430 | | #if HWY_ONCE |
431 | | namespace jxl { |
432 | | namespace { |
433 | | // Decode quantized AC coefficients of DCT blocks. |
434 | | // LLF components in the output block will not be modified. |
435 | | template <ACType ac_type, bool uses_lz77> |
436 | | Status DecodeACVarBlock(size_t ctx_offset, size_t log2_covered_blocks, |
437 | | int32_t* JXL_RESTRICT row_nzeros, |
438 | | const int32_t* JXL_RESTRICT row_nzeros_top, |
439 | | size_t nzeros_stride, size_t c, size_t bx, size_t by, |
440 | | size_t lbx, AcStrategy acs, |
441 | | const coeff_order_t* JXL_RESTRICT coeff_order, |
442 | | BitReader* JXL_RESTRICT br, |
443 | | ANSSymbolReader* JXL_RESTRICT decoder, |
444 | | const std::vector<uint8_t>& context_map, |
445 | | const uint8_t* qdc_row, const int32_t* qf_row, |
446 | | const BlockCtxMap& block_ctx_map, ACPtr block, |
447 | 0 | size_t shift = 0) { |
448 | | // Equal to number of LLF coefficients. |
449 | 0 | const size_t covered_blocks = 1 << log2_covered_blocks; |
450 | 0 | const size_t size = covered_blocks * kDCTBlockSize; |
451 | 0 | int32_t predicted_nzeros = |
452 | 0 | PredictFromTopAndLeft(row_nzeros_top, row_nzeros, bx, 32); |
453 | |
|
454 | 0 | size_t ord = kStrategyOrder[acs.RawStrategy()]; |
455 | 0 | const coeff_order_t* JXL_RESTRICT order = |
456 | 0 | &coeff_order[CoeffOrderOffset(ord, c)]; |
457 | |
|
458 | 0 | size_t block_ctx = block_ctx_map.Context(qdc_row[lbx], qf_row[bx], ord, c); |
459 | 0 | const int32_t nzero_ctx = |
460 | 0 | block_ctx_map.NonZeroContext(predicted_nzeros, block_ctx) + ctx_offset; |
461 | |
|
462 | 0 | size_t nzeros = |
463 | 0 | decoder->ReadHybridUintInlined<uses_lz77>(nzero_ctx, br, context_map); |
464 | 0 | if (nzeros + covered_blocks > size) { |
465 | 0 | return JXL_FAILURE("Invalid AC: nzeros too large"); |
466 | 0 | } |
467 | 0 | for (size_t y = 0; y < acs.covered_blocks_y(); y++) { |
468 | 0 | for (size_t x = 0; x < acs.covered_blocks_x(); x++) { |
469 | 0 | row_nzeros[bx + x + y * nzeros_stride] = |
470 | 0 | (nzeros + covered_blocks - 1) >> log2_covered_blocks; |
471 | 0 | } |
472 | 0 | } |
473 | |
|
474 | 0 | const size_t histo_offset = |
475 | 0 | ctx_offset + block_ctx_map.ZeroDensityContextsOffset(block_ctx); |
476 | |
|
477 | 0 | size_t prev = (nzeros > size / 16 ? 0 : 1); |
478 | 0 | for (size_t k = covered_blocks; k < size && nzeros != 0; ++k) { |
479 | 0 | const size_t ctx = |
480 | 0 | histo_offset + ZeroDensityContext(nzeros, k, covered_blocks, |
481 | 0 | log2_covered_blocks, prev); |
482 | 0 | const size_t u_coeff = |
483 | 0 | decoder->ReadHybridUintInlined<uses_lz77>(ctx, br, context_map); |
484 | | // Hand-rolled version of UnpackSigned, shifting before the conversion to |
485 | | // signed integer to avoid undefined behavior of shifting negative numbers. |
486 | 0 | const size_t magnitude = u_coeff >> 1; |
487 | 0 | const size_t neg_sign = (~u_coeff) & 1; |
488 | 0 | const intptr_t coeff = |
489 | 0 | static_cast<intptr_t>((magnitude ^ (neg_sign - 1)) << shift); |
490 | 0 | if (ac_type == ACType::k16) { |
491 | 0 | block.ptr16[order[k]] += coeff; |
492 | 0 | } else { |
493 | 0 | block.ptr32[order[k]] += coeff; |
494 | 0 | } |
495 | 0 | prev = static_cast<size_t>(u_coeff != 0); |
496 | 0 | nzeros -= prev; |
497 | 0 | } |
498 | 0 | if (JXL_UNLIKELY(nzeros != 0)) { |
499 | 0 | return JXL_FAILURE("Invalid AC: nzeros not 0. Block (%" PRIuS ", %" PRIuS |
500 | 0 | "), channel %" PRIuS, |
501 | 0 | bx, by, c); |
502 | 0 | } |
503 | | |
504 | 0 | return true; |
505 | 0 | } Unexecuted instantiation: dec_group.cc:jxl::Status jxl::(anonymous namespace)::DecodeACVarBlock<(jxl::ACType)0, true>(unsigned long, unsigned long, int*, int const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, jxl::AcStrategy, unsigned int const*, jxl::BitReader*, jxl::ANSSymbolReader*, std::__1::vector<unsigned char, std::__1::allocator<unsigned char> > const&, unsigned char const*, int const*, jxl::BlockCtxMap const&, jxl::ACPtr, unsigned long) Unexecuted instantiation: dec_group.cc:jxl::Status jxl::(anonymous namespace)::DecodeACVarBlock<(jxl::ACType)1, true>(unsigned long, unsigned long, int*, int const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, jxl::AcStrategy, unsigned int const*, jxl::BitReader*, jxl::ANSSymbolReader*, std::__1::vector<unsigned char, std::__1::allocator<unsigned char> > const&, unsigned char const*, int const*, jxl::BlockCtxMap const&, jxl::ACPtr, unsigned long) Unexecuted instantiation: dec_group.cc:jxl::Status jxl::(anonymous namespace)::DecodeACVarBlock<(jxl::ACType)0, false>(unsigned long, unsigned long, int*, int const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, jxl::AcStrategy, unsigned int const*, jxl::BitReader*, jxl::ANSSymbolReader*, std::__1::vector<unsigned char, std::__1::allocator<unsigned char> > const&, unsigned char const*, int const*, jxl::BlockCtxMap const&, jxl::ACPtr, unsigned long) Unexecuted instantiation: dec_group.cc:jxl::Status jxl::(anonymous namespace)::DecodeACVarBlock<(jxl::ACType)1, false>(unsigned long, unsigned long, int*, int const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, jxl::AcStrategy, unsigned int const*, jxl::BitReader*, jxl::ANSSymbolReader*, std::__1::vector<unsigned char, std::__1::allocator<unsigned char> > const&, unsigned char const*, int const*, jxl::BlockCtxMap const&, jxl::ACPtr, unsigned long) |
506 | | |
507 | | // Structs used by DecodeGroupImpl to get a quantized block. |
508 | | // GetBlockFromBitstream uses ANS decoding (and thus keeps track of row |
509 | | // pointers in row_nzeros), GetBlockFromEncoder simply reads the coefficient |
510 | | // image provided by the encoder. |
511 | | |
512 | | struct GetBlockFromBitstream : public GetBlock { |
513 | 0 | void StartRow(size_t by) override { |
514 | 0 | qf_row = rect.ConstRow(*qf, by); |
515 | 0 | for (size_t c = 0; c < 3; c++) { |
516 | 0 | size_t sby = by >> vshift[c]; |
517 | 0 | quant_dc_row = quant_dc->ConstRow(rect.y0() + by) + rect.x0(); |
518 | 0 | for (size_t i = 0; i < num_passes; i++) { |
519 | 0 | row_nzeros[i][c] = group_dec_cache->num_nzeroes[i].PlaneRow(c, sby); |
520 | 0 | row_nzeros_top[i][c] = |
521 | 0 | sby == 0 |
522 | 0 | ? nullptr |
523 | 0 | : group_dec_cache->num_nzeroes[i].ConstPlaneRow(c, sby - 1); |
524 | 0 | } |
525 | 0 | } |
526 | 0 | } |
527 | | |
528 | | Status LoadBlock(size_t bx, size_t by, const AcStrategy& acs, size_t size, |
529 | | size_t log2_covered_blocks, ACPtr block[3], |
530 | 0 | ACType ac_type) override { |
531 | 0 | ; |
532 | 0 | for (size_t c : {1, 0, 2}) { |
533 | 0 | size_t sbx = bx >> hshift[c]; |
534 | 0 | size_t sby = by >> vshift[c]; |
535 | 0 | if (JXL_UNLIKELY((sbx << hshift[c] != bx) || (sby << vshift[c] != by))) { |
536 | 0 | continue; |
537 | 0 | } |
538 | | |
539 | 0 | for (size_t pass = 0; JXL_UNLIKELY(pass < num_passes); pass++) { |
540 | 0 | auto decode_ac_varblock = |
541 | 0 | decoders[pass].UsesLZ77() |
542 | 0 | ? (ac_type == ACType::k16 ? DecodeACVarBlock<ACType::k16, 1> |
543 | 0 | : DecodeACVarBlock<ACType::k32, 1>) |
544 | 0 | : (ac_type == ACType::k16 ? DecodeACVarBlock<ACType::k16, 0> |
545 | 0 | : DecodeACVarBlock<ACType::k32, 0>); |
546 | 0 | JXL_RETURN_IF_ERROR(decode_ac_varblock( |
547 | 0 | ctx_offset[pass], log2_covered_blocks, row_nzeros[pass][c], |
548 | 0 | row_nzeros_top[pass][c], nzeros_stride, c, sbx, sby, bx, acs, |
549 | 0 | &coeff_orders[pass * coeff_order_size], readers[pass], |
550 | 0 | &decoders[pass], context_map[pass], quant_dc_row, qf_row, |
551 | 0 | *block_ctx_map, block[c], shift_for_pass[pass])); |
552 | 0 | } |
553 | 0 | } |
554 | 0 | return true; |
555 | 0 | } |
556 | | |
557 | | Status Init(BitReader* JXL_RESTRICT* JXL_RESTRICT readers, size_t num_passes, |
558 | | size_t group_idx, size_t histo_selector_bits, const Rect& rect, |
559 | | GroupDecCache* JXL_RESTRICT group_dec_cache, |
560 | 0 | PassesDecoderState* dec_state, size_t first_pass) { |
561 | 0 | for (size_t i = 0; i < 3; i++) { |
562 | 0 | hshift[i] = dec_state->shared->frame_header.chroma_subsampling.HShift(i); |
563 | 0 | vshift[i] = dec_state->shared->frame_header.chroma_subsampling.VShift(i); |
564 | 0 | } |
565 | 0 | this->coeff_order_size = dec_state->shared->coeff_order_size; |
566 | 0 | this->coeff_orders = |
567 | 0 | dec_state->shared->coeff_orders.data() + first_pass * coeff_order_size; |
568 | 0 | this->context_map = dec_state->context_map.data() + first_pass; |
569 | 0 | this->readers = readers; |
570 | 0 | this->num_passes = num_passes; |
571 | 0 | this->shift_for_pass = |
572 | 0 | dec_state->shared->frame_header.passes.shift + first_pass; |
573 | 0 | this->group_dec_cache = group_dec_cache; |
574 | 0 | this->rect = rect; |
575 | 0 | block_ctx_map = &dec_state->shared->block_ctx_map; |
576 | 0 | qf = &dec_state->shared->raw_quant_field; |
577 | 0 | quant_dc = &dec_state->shared->quant_dc; |
578 | |
|
579 | 0 | for (size_t pass = 0; pass < num_passes; pass++) { |
580 | | // Select which histogram set to use among those of the current pass. |
581 | 0 | size_t cur_histogram = 0; |
582 | 0 | if (histo_selector_bits != 0) { |
583 | 0 | cur_histogram = readers[pass]->ReadBits(histo_selector_bits); |
584 | 0 | } |
585 | 0 | if (cur_histogram >= dec_state->shared->num_histograms) { |
586 | 0 | return JXL_FAILURE("Invalid histogram selector"); |
587 | 0 | } |
588 | 0 | ctx_offset[pass] = cur_histogram * block_ctx_map->NumACContexts(); |
589 | |
|
590 | 0 | decoders[pass] = |
591 | 0 | ANSSymbolReader(&dec_state->code[pass + first_pass], readers[pass]); |
592 | 0 | } |
593 | 0 | nzeros_stride = group_dec_cache->num_nzeroes[0].PixelsPerRow(); |
594 | 0 | for (size_t i = 0; i < num_passes; i++) { |
595 | 0 | JXL_ASSERT( |
596 | 0 | nzeros_stride == |
597 | 0 | static_cast<size_t>(group_dec_cache->num_nzeroes[i].PixelsPerRow())); |
598 | 0 | } |
599 | 0 | return true; |
600 | 0 | } |
601 | | |
602 | | const uint32_t* shift_for_pass = nullptr; // not owned |
603 | | const coeff_order_t* JXL_RESTRICT coeff_orders; |
604 | | size_t coeff_order_size; |
605 | | const std::vector<uint8_t>* JXL_RESTRICT context_map; |
606 | | ANSSymbolReader decoders[kMaxNumPasses]; |
607 | | BitReader* JXL_RESTRICT* JXL_RESTRICT readers; |
608 | | size_t num_passes; |
609 | | size_t ctx_offset[kMaxNumPasses]; |
610 | | size_t nzeros_stride; |
611 | | int32_t* JXL_RESTRICT row_nzeros[kMaxNumPasses][3]; |
612 | | const int32_t* JXL_RESTRICT row_nzeros_top[kMaxNumPasses][3]; |
613 | | GroupDecCache* JXL_RESTRICT group_dec_cache; |
614 | | const BlockCtxMap* block_ctx_map; |
615 | | const ImageI* qf; |
616 | | const ImageB* quant_dc; |
617 | | const int32_t* qf_row; |
618 | | const uint8_t* quant_dc_row; |
619 | | Rect rect; |
620 | | size_t hshift[3], vshift[3]; |
621 | | }; |
622 | | |
623 | | struct GetBlockFromEncoder : public GetBlock { |
624 | 0 | void StartRow(size_t by) override {} |
625 | | |
626 | | Status LoadBlock(size_t bx, size_t by, const AcStrategy& acs, size_t size, |
627 | | size_t log2_covered_blocks, ACPtr block[3], |
628 | 0 | ACType ac_type) override { |
629 | 0 | JXL_DASSERT(ac_type == ACType::k32); |
630 | 0 | for (size_t c = 0; c < 3; c++) { |
631 | | // for each pass |
632 | 0 | for (size_t i = 0; i < quantized_ac->size(); i++) { |
633 | 0 | for (size_t k = 0; k < size; k++) { |
634 | | // TODO(veluca): SIMD. |
635 | 0 | block[c].ptr32[k] += |
636 | 0 | rows[i][c][offset + k] * (1 << shift_for_pass[i]); |
637 | 0 | } |
638 | 0 | } |
639 | 0 | } |
640 | 0 | offset += size; |
641 | 0 | return true; |
642 | 0 | } |
643 | | |
644 | | GetBlockFromEncoder(const std::vector<std::unique_ptr<ACImage>>& ac, |
645 | | size_t group_idx, const uint32_t* shift_for_pass) |
646 | 0 | : quantized_ac(&ac), shift_for_pass(shift_for_pass) { |
647 | | // TODO(veluca): not supported with chroma subsampling. |
648 | 0 | for (size_t i = 0; i < quantized_ac->size(); i++) { |
649 | 0 | JXL_CHECK((*quantized_ac)[i]->Type() == ACType::k32); |
650 | 0 | for (size_t c = 0; c < 3; c++) { |
651 | 0 | rows[i][c] = (*quantized_ac)[i]->PlaneRow(c, group_idx, 0).ptr32; |
652 | 0 | } |
653 | 0 | } |
654 | 0 | } |
655 | | |
656 | | const std::vector<std::unique_ptr<ACImage>>* JXL_RESTRICT quantized_ac; |
657 | | size_t offset = 0; |
658 | | const int32_t* JXL_RESTRICT rows[kMaxNumPasses][3]; |
659 | | const uint32_t* shift_for_pass = nullptr; // not owned |
660 | | }; |
661 | | |
662 | | HWY_EXPORT(DecodeGroupImpl); |
663 | | |
664 | | } // namespace |
665 | | |
666 | | Status DecodeGroup(BitReader* JXL_RESTRICT* JXL_RESTRICT readers, |
667 | | size_t num_passes, size_t group_idx, |
668 | | PassesDecoderState* JXL_RESTRICT dec_state, |
669 | | GroupDecCache* JXL_RESTRICT group_dec_cache, size_t thread, |
670 | | RenderPipelineInput& render_pipeline_input, |
671 | | ImageBundle* JXL_RESTRICT decoded, size_t first_pass, |
672 | 0 | bool force_draw, bool dc_only, bool* should_run_pipeline) { |
673 | 0 | DrawMode draw = (num_passes + first_pass == |
674 | 0 | dec_state->shared->frame_header.passes.num_passes) || |
675 | 0 | force_draw |
676 | 0 | ? kDraw |
677 | 0 | : kDontDraw; |
678 | |
|
679 | 0 | if (should_run_pipeline) { |
680 | 0 | *should_run_pipeline = draw != kDontDraw; |
681 | 0 | } |
682 | |
|
683 | 0 | if (draw == kDraw && num_passes == 0 && first_pass == 0) { |
684 | 0 | group_dec_cache->InitDCBufferOnce(); |
685 | 0 | const YCbCrChromaSubsampling& cs = |
686 | 0 | dec_state->shared->frame_header.chroma_subsampling; |
687 | 0 | for (size_t c : {0, 1, 2}) { |
688 | 0 | size_t hs = cs.HShift(c); |
689 | 0 | size_t vs = cs.VShift(c); |
690 | | // We reuse filter_input_storage here as it is not currently in use. |
691 | 0 | const Rect src_rect_precs = dec_state->shared->BlockGroupRect(group_idx); |
692 | 0 | const Rect src_rect = |
693 | 0 | Rect(src_rect_precs.x0() >> hs, src_rect_precs.y0() >> vs, |
694 | 0 | src_rect_precs.xsize() >> hs, src_rect_precs.ysize() >> vs); |
695 | 0 | const Rect copy_rect(kRenderPipelineXOffset, 2, src_rect.xsize(), |
696 | 0 | src_rect.ysize()); |
697 | 0 | CopyImageToWithPadding(src_rect, dec_state->shared->dc->Plane(c), 2, |
698 | 0 | copy_rect, &group_dec_cache->dc_buffer); |
699 | | // Mirrorpad. Interleaving left and right padding ensures that padding |
700 | | // works out correctly even for images with DC size of 1. |
701 | 0 | for (size_t y = 0; y < src_rect.ysize() + 4; y++) { |
702 | 0 | size_t xend = kRenderPipelineXOffset + |
703 | 0 | (dec_state->shared->dc->Plane(c).xsize() >> hs) - |
704 | 0 | src_rect.x0(); |
705 | 0 | for (size_t ix = 0; ix < 2; ix++) { |
706 | 0 | if (src_rect.x0() == 0) { |
707 | 0 | group_dec_cache->dc_buffer.Row(y)[kRenderPipelineXOffset - ix - 1] = |
708 | 0 | group_dec_cache->dc_buffer.Row(y)[kRenderPipelineXOffset + ix]; |
709 | 0 | } |
710 | 0 | if (src_rect.x0() + src_rect.xsize() + 2 >= |
711 | 0 | (dec_state->shared->dc->xsize() >> hs)) { |
712 | 0 | group_dec_cache->dc_buffer.Row(y)[xend + ix] = |
713 | 0 | group_dec_cache->dc_buffer.Row(y)[xend - ix - 1]; |
714 | 0 | } |
715 | 0 | } |
716 | 0 | } |
717 | 0 | Rect dst_rect = render_pipeline_input.GetBuffer(c).second; |
718 | 0 | ImageF* upsampling_dst = render_pipeline_input.GetBuffer(c).first; |
719 | 0 | JXL_ASSERT(dst_rect.IsInside(*upsampling_dst)); |
720 | | |
721 | 0 | RenderPipelineStage::RowInfo input_rows(1, std::vector<float*>(5)); |
722 | 0 | RenderPipelineStage::RowInfo output_rows(1, std::vector<float*>(8)); |
723 | 0 | for (size_t y = src_rect.y0(); y < src_rect.y0() + src_rect.ysize(); |
724 | 0 | y++) { |
725 | 0 | for (ssize_t iy = 0; iy < 5; iy++) { |
726 | 0 | input_rows[0][iy] = group_dec_cache->dc_buffer.Row( |
727 | 0 | Mirror(ssize_t(y) + iy - 2, |
728 | 0 | dec_state->shared->dc->Plane(c).ysize() >> vs) + |
729 | 0 | 2 - src_rect.y0()); |
730 | 0 | } |
731 | 0 | for (size_t iy = 0; iy < 8; iy++) { |
732 | 0 | output_rows[0][iy] = |
733 | 0 | dst_rect.Row(upsampling_dst, ((y - src_rect.y0()) << 3) + iy) - |
734 | 0 | kRenderPipelineXOffset; |
735 | 0 | } |
736 | | // Arguments set to 0/nullptr are not used. |
737 | 0 | dec_state->upsampler8x->ProcessRow(input_rows, output_rows, |
738 | 0 | /*xextra=*/0, src_rect.xsize(), 0, 0, |
739 | 0 | thread); |
740 | 0 | } |
741 | 0 | } |
742 | 0 | return true; |
743 | 0 | } |
744 | | |
745 | 0 | size_t histo_selector_bits = 0; |
746 | 0 | if (dc_only) { |
747 | 0 | JXL_ASSERT(num_passes == 0); |
748 | 0 | } else { |
749 | 0 | JXL_ASSERT(dec_state->shared->num_histograms > 0); |
750 | 0 | histo_selector_bits = CeilLog2Nonzero(dec_state->shared->num_histograms); |
751 | 0 | } |
752 | | |
753 | 0 | auto get_block = jxl::make_unique<GetBlockFromBitstream>(); |
754 | 0 | JXL_RETURN_IF_ERROR( |
755 | 0 | get_block->Init(readers, num_passes, group_idx, histo_selector_bits, |
756 | 0 | dec_state->shared->BlockGroupRect(group_idx), |
757 | 0 | group_dec_cache, dec_state, first_pass)); |
758 | | |
759 | 0 | JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(DecodeGroupImpl)( |
760 | 0 | get_block.get(), group_dec_cache, dec_state, thread, group_idx, |
761 | 0 | render_pipeline_input, decoded, draw)); |
762 | | |
763 | 0 | for (size_t pass = 0; pass < num_passes; pass++) { |
764 | 0 | if (!get_block->decoders[pass].CheckANSFinalState()) { |
765 | 0 | return JXL_FAILURE("ANS checksum failure."); |
766 | 0 | } |
767 | 0 | } |
768 | 0 | return true; |
769 | 0 | } |
770 | | |
771 | | Status DecodeGroupForRoundtrip(const std::vector<std::unique_ptr<ACImage>>& ac, |
772 | | size_t group_idx, |
773 | | PassesDecoderState* JXL_RESTRICT dec_state, |
774 | | GroupDecCache* JXL_RESTRICT group_dec_cache, |
775 | | size_t thread, |
776 | | RenderPipelineInput& render_pipeline_input, |
777 | | ImageBundle* JXL_RESTRICT decoded, |
778 | 0 | AuxOut* aux_out) { |
779 | 0 | GetBlockFromEncoder get_block(ac, group_idx, |
780 | 0 | dec_state->shared->frame_header.passes.shift); |
781 | 0 | group_dec_cache->InitOnce( |
782 | 0 | /*num_passes=*/0, |
783 | 0 | /*used_acs=*/(1u << AcStrategy::kNumValidStrategies) - 1); |
784 | |
|
785 | 0 | return HWY_DYNAMIC_DISPATCH(DecodeGroupImpl)( |
786 | 0 | &get_block, group_dec_cache, dec_state, thread, group_idx, |
787 | 0 | render_pipeline_input, decoded, kDraw); |
788 | 0 | } |
789 | | |
790 | | } // namespace jxl |
791 | | #endif // HWY_ONCE |