/src/libjxl/lib/jxl/enc_group.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jxl/enc_group.h" |
7 | | |
8 | | #include <jxl/memory_manager.h> |
9 | | |
10 | | #include <algorithm> |
11 | | #include <cmath> |
12 | | #include <cstddef> |
13 | | #include <cstdint> |
14 | | #include <cstdlib> |
15 | | |
16 | | #include "lib/jxl/base/common.h" |
17 | | #include "lib/jxl/base/status.h" |
18 | | #include "lib/jxl/chroma_from_luma.h" |
19 | | #include "lib/jxl/coeff_order_fwd.h" |
20 | | #include "lib/jxl/enc_ans.h" |
21 | | #include "lib/jxl/enc_bit_writer.h" |
22 | | #include "lib/jxl/frame_dimensions.h" |
23 | | #include "lib/jxl/memory_manager_internal.h" |
24 | | |
25 | | #undef HWY_TARGET_INCLUDE |
26 | | #define HWY_TARGET_INCLUDE "lib/jxl/enc_group.cc" |
27 | | #include <hwy/foreach_target.h> |
28 | | #include <hwy/highway.h> |
29 | | |
30 | | #include "lib/jxl/ac_strategy.h" |
31 | | #include "lib/jxl/base/bits.h" |
32 | | #include "lib/jxl/base/compiler_specific.h" |
33 | | #include "lib/jxl/base/rect.h" |
34 | | #include "lib/jxl/common.h" // kMaxNumPasses |
35 | | #include "lib/jxl/dct_util.h" |
36 | | #include "lib/jxl/dec_transforms-inl.h" |
37 | | #include "lib/jxl/enc_aux_out.h" |
38 | | #include "lib/jxl/enc_cache.h" |
39 | | #include "lib/jxl/enc_params.h" |
40 | | #include "lib/jxl/enc_transforms-inl.h" |
41 | | #include "lib/jxl/image.h" |
42 | | #include "lib/jxl/quantizer-inl.h" |
43 | | #include "lib/jxl/quantizer.h" |
44 | | #include "lib/jxl/simd_util.h" |
45 | | HWY_BEFORE_NAMESPACE(); |
46 | | namespace jxl { |
47 | | namespace HWY_NAMESPACE { |
48 | | |
49 | | // These templates are not found via ADL. |
50 | | using hwy::HWY_NAMESPACE::Abs; |
51 | | using hwy::HWY_NAMESPACE::Ge; |
52 | | using hwy::HWY_NAMESPACE::IfThenElse; |
53 | | using hwy::HWY_NAMESPACE::IfThenElseZero; |
54 | | using hwy::HWY_NAMESPACE::MaskFromVec; |
55 | | using hwy::HWY_NAMESPACE::Round; |
56 | | |
57 | | // NOTE: caller takes care of extracting quant from rect of RawQuantField. |
58 | | void QuantizeBlockAC(const Quantizer& quantizer, const bool error_diffusion, |
59 | | size_t c, float qm_multiplier, AcStrategyType quant_kind, |
60 | | size_t xsize, size_t ysize, float* thresholds, |
61 | | const float* JXL_RESTRICT block_in, const int32_t* quant, |
62 | 725k | int32_t* JXL_RESTRICT block_out) { |
63 | 725k | const float* JXL_RESTRICT qm = quantizer.InvDequantMatrix(quant_kind, c); |
64 | 725k | float qac = quantizer.Scale() * (*quant); |
65 | | // Not SIMD-ified for now. |
66 | 725k | if (c != 1 && xsize * ysize >= 4) { |
67 | 151k | for (int i = 0; i < 4; ++i) { |
68 | 121k | thresholds[i] -= 0.00744f * xsize * ysize; |
69 | 121k | if (thresholds[i] < 0.5) { |
70 | 12.2k | thresholds[i] = 0.5; |
71 | 12.2k | } |
72 | 121k | } |
73 | 30.3k | } |
74 | 725k | HWY_CAPPED(float, kBlockDim) df; |
75 | 725k | HWY_CAPPED(int32_t, kBlockDim) di; |
76 | 725k | HWY_CAPPED(uint32_t, kBlockDim) du; |
77 | 725k | const auto quantv = Set(df, qac * qm_multiplier); |
78 | 7.15M | for (size_t y = 0; y < ysize * kBlockDim; y++) { |
79 | 6.42M | size_t yfix = static_cast<size_t>(y >= ysize * kBlockDim / 2) * 2; |
80 | 6.42M | const size_t off = y * kBlockDim * xsize; |
81 | 15.9M | for (size_t x = 0; x < xsize * kBlockDim; x += Lanes(df)) { |
82 | 9.55M | auto threshold = Zero(df); |
83 | 9.55M | if (xsize == 1) { |
84 | 5.01M | HWY_ALIGN uint32_t kMask[kBlockDim] = {0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u}; |
85 | 5.01M | const auto mask = MaskFromVec(BitCast(df, Load(du, kMask + x))); |
86 | 5.01M | threshold = IfThenElse(mask, Set(df, thresholds[yfix + 1]), |
87 | 5.01M | Set(df, thresholds[yfix])); |
88 | 5.01M | } else { |
89 | | // Same for all lanes in the vector. |
90 | 4.53M | threshold = Set( |
91 | 4.53M | df, |
92 | 4.53M | thresholds[yfix + static_cast<size_t>(x >= xsize * kBlockDim / 2)]); |
93 | 4.53M | } |
94 | 9.55M | const auto q = Mul(Load(df, qm + off + x), quantv); |
95 | 9.55M | const auto in = Load(df, block_in + off + x); |
96 | 9.55M | const auto val = Mul(q, in); |
97 | 9.55M | const auto nzero_mask = Ge(Abs(val), threshold); |
98 | 9.55M | const auto v = ConvertTo(di, IfThenElseZero(nzero_mask, Round(val))); |
99 | 9.55M | Store(v, di, block_out + off + x); |
100 | 9.55M | } |
101 | 6.42M | } |
102 | 725k | } Unexecuted instantiation: jxl::N_SSE4::QuantizeBlockAC(jxl::Quantizer const&, bool, unsigned long, float, jxl::AcStrategyType, unsigned long, unsigned long, float*, float const*, int const*, int*) jxl::N_AVX2::QuantizeBlockAC(jxl::Quantizer const&, bool, unsigned long, float, jxl::AcStrategyType, unsigned long, unsigned long, float*, float const*, int const*, int*) Line | Count | Source | 62 | 725k | int32_t* JXL_RESTRICT block_out) { | 63 | 725k | const float* JXL_RESTRICT qm = quantizer.InvDequantMatrix(quant_kind, c); | 64 | 725k | float qac = quantizer.Scale() * (*quant); | 65 | | // Not SIMD-ified for now. | 66 | 725k | if (c != 1 && xsize * ysize >= 4) { | 67 | 151k | for (int i = 0; i < 4; ++i) { | 68 | 121k | thresholds[i] -= 0.00744f * xsize * ysize; | 69 | 121k | if (thresholds[i] < 0.5) { | 70 | 12.2k | thresholds[i] = 0.5; | 71 | 12.2k | } | 72 | 121k | } | 73 | 30.3k | } | 74 | 725k | HWY_CAPPED(float, kBlockDim) df; | 75 | 725k | HWY_CAPPED(int32_t, kBlockDim) di; | 76 | 725k | HWY_CAPPED(uint32_t, kBlockDim) du; | 77 | 725k | const auto quantv = Set(df, qac * qm_multiplier); | 78 | 7.15M | for (size_t y = 0; y < ysize * kBlockDim; y++) { | 79 | 6.42M | size_t yfix = static_cast<size_t>(y >= ysize * kBlockDim / 2) * 2; | 80 | 6.42M | const size_t off = y * kBlockDim * xsize; | 81 | 15.9M | for (size_t x = 0; x < xsize * kBlockDim; x += Lanes(df)) { | 82 | 9.55M | auto threshold = Zero(df); | 83 | 9.55M | if (xsize == 1) { | 84 | 5.01M | HWY_ALIGN uint32_t kMask[kBlockDim] = {0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u}; | 85 | 5.01M | const auto mask = MaskFromVec(BitCast(df, Load(du, kMask + x))); | 86 | 5.01M | threshold = IfThenElse(mask, Set(df, thresholds[yfix + 1]), | 87 | 5.01M | Set(df, thresholds[yfix])); | 88 | 5.01M | } else { | 89 | | // Same for all lanes in the vector. | 90 | 4.53M | threshold = Set( | 91 | 4.53M | df, | 92 | 4.53M | thresholds[yfix + static_cast<size_t>(x >= xsize * kBlockDim / 2)]); | 93 | 4.53M | } | 94 | 9.55M | const auto q = Mul(Load(df, qm + off + x), quantv); | 95 | 9.55M | const auto in = Load(df, block_in + off + x); | 96 | 9.55M | const auto val = Mul(q, in); | 97 | 9.55M | const auto nzero_mask = Ge(Abs(val), threshold); | 98 | 9.55M | const auto v = ConvertTo(di, IfThenElseZero(nzero_mask, Round(val))); | 99 | 9.55M | Store(v, di, block_out + off + x); | 100 | 9.55M | } | 101 | 6.42M | } | 102 | 725k | } |
Unexecuted instantiation: jxl::N_SSE2::QuantizeBlockAC(jxl::Quantizer const&, bool, unsigned long, float, jxl::AcStrategyType, unsigned long, unsigned long, float*, float const*, int const*, int*) |
103 | | |
104 | | void AdjustQuantBlockAC(const Quantizer& quantizer, size_t c, |
105 | | float qm_multiplier, AcStrategyType quant_kind, |
106 | | size_t xsize, size_t ysize, float* thresholds, |
107 | 725k | const float* JXL_RESTRICT block_in, int32_t* quant) { |
108 | | // No quantization adjusting for these small blocks. |
109 | | // Quantization adjusting attempts to fix some known issues |
110 | | // with larger blocks and on the 8x8 dct's emerging 8x8 blockiness |
111 | | // when there are not many non-zeros. |
112 | 725k | constexpr size_t kPartialBlockKinds = |
113 | 725k | (1 << static_cast<size_t>(AcStrategyType::IDENTITY)) | |
114 | 725k | (1 << static_cast<size_t>(AcStrategyType::DCT2X2)) | |
115 | 725k | (1 << static_cast<size_t>(AcStrategyType::DCT4X4)) | |
116 | 725k | (1 << static_cast<size_t>(AcStrategyType::DCT4X8)) | |
117 | 725k | (1 << static_cast<size_t>(AcStrategyType::DCT8X4)) | |
118 | 725k | (1 << static_cast<size_t>(AcStrategyType::AFV0)) | |
119 | 725k | (1 << static_cast<size_t>(AcStrategyType::AFV1)) | |
120 | 725k | (1 << static_cast<size_t>(AcStrategyType::AFV2)) | |
121 | 725k | (1 << static_cast<size_t>(AcStrategyType::AFV3)); |
122 | 725k | if ((1 << static_cast<size_t>(quant_kind)) & kPartialBlockKinds) { |
123 | 439k | return; |
124 | 439k | } |
125 | | |
126 | 285k | const float* JXL_RESTRICT qm = quantizer.InvDequantMatrix(quant_kind, c); |
127 | 285k | float qac = quantizer.Scale() * (*quant); |
128 | 285k | if (xsize > 1 || ysize > 1) { |
129 | 489k | for (int i = 0; i < 4; ++i) { |
130 | 391k | thresholds[i] -= Clamp1(0.003f * xsize * ysize, 0.f, 0.08f); |
131 | 391k | if (thresholds[i] < 0.54) { |
132 | 13.8k | thresholds[i] = 0.54; |
133 | 13.8k | } |
134 | 391k | } |
135 | 97.9k | } |
136 | 285k | float sum_of_highest_freq_row_and_column = 0; |
137 | 285k | float sum_of_error = 0; |
138 | 285k | float sum_of_vals = 0; |
139 | 285k | float hfNonZeros[4] = {}; |
140 | 285k | float hfMaxError[4] = {}; |
141 | | |
142 | 3.19M | for (size_t y = 0; y < ysize * kBlockDim; y++) { |
143 | 51.1M | for (size_t x = 0; x < xsize * kBlockDim; x++) { |
144 | 48.2M | const size_t pos = y * kBlockDim * xsize + x; |
145 | 48.2M | if (x < xsize && y < ysize) { |
146 | 754k | continue; |
147 | 754k | } |
148 | 47.5M | const size_t hfix = (static_cast<size_t>(y >= ysize * kBlockDim / 2) * 2 + |
149 | 47.5M | static_cast<size_t>(x >= xsize * kBlockDim / 2)); |
150 | 47.5M | const float val = block_in[pos] * (qm[pos] * qac * qm_multiplier); |
151 | 47.5M | const float v = (std::abs(val) < thresholds[hfix]) ? 0 : rintf(val); |
152 | 47.5M | const float error = std::abs(val - v); |
153 | 47.5M | sum_of_error += error; |
154 | 47.5M | sum_of_vals += std::abs(v); |
155 | 47.5M | if (c == 1 && v == 0) { |
156 | 12.1M | if (hfMaxError[hfix] < error) { |
157 | 791k | hfMaxError[hfix] = error; |
158 | 791k | } |
159 | 12.1M | } |
160 | 47.5M | if (v != 0.0f) { |
161 | 7.47M | hfNonZeros[hfix] += std::abs(v); |
162 | 7.47M | bool in_corner = y >= 7 * ysize && x >= 7 * xsize; |
163 | 7.47M | bool on_border = |
164 | 7.47M | y == ysize * kBlockDim - 1 || x == xsize * kBlockDim - 1; |
165 | 7.47M | bool in_larger_corner = x >= 4 * xsize && y >= 4 * ysize; |
166 | 7.47M | if (in_corner || (on_border && in_larger_corner)) { |
167 | 445k | sum_of_highest_freq_row_and_column += std::abs(val); |
168 | 445k | } |
169 | 7.47M | } |
170 | 47.5M | } |
171 | 2.90M | } |
172 | 285k | if (c == 1 && sum_of_vals * 8 < xsize * ysize) { |
173 | 26.1k | static const double kLimit[4] = { |
174 | 26.1k | 0.46, |
175 | 26.1k | 0.46, |
176 | 26.1k | 0.46, |
177 | 26.1k | 0.46, |
178 | 26.1k | }; |
179 | 26.1k | static const double kMul[4] = { |
180 | 26.1k | 0.9999, |
181 | 26.1k | 0.9999, |
182 | 26.1k | 0.9999, |
183 | 26.1k | 0.9999, |
184 | 26.1k | }; |
185 | 26.1k | const int32_t orig_quant = *quant; |
186 | 26.1k | int32_t new_quant = *quant; |
187 | 102k | for (int i = 1; i < 4; ++i) { |
188 | 77.1k | if (hfNonZeros[i] == 0.0 && hfMaxError[i] > kLimit[i]) { |
189 | 980 | new_quant = orig_quant + 1; |
190 | 980 | break; |
191 | 980 | } |
192 | 77.1k | } |
193 | 26.1k | *quant = new_quant; |
194 | 26.1k | if (hfNonZeros[3] == 0.0 && hfMaxError[3] > kLimit[3]) { |
195 | 290 | thresholds[3] = kMul[3] * hfMaxError[3] * new_quant / orig_quant; |
196 | 25.9k | } else if ((hfNonZeros[1] == 0.0 && hfMaxError[1] > kLimit[1]) || |
197 | 25.9k | (hfNonZeros[2] == 0.0 && hfMaxError[2] > kLimit[2])) { |
198 | 690 | thresholds[1] = kMul[1] * std::max(hfMaxError[1], hfMaxError[2]) * |
199 | 690 | new_quant / orig_quant; |
200 | 690 | thresholds[2] = thresholds[1]; |
201 | 25.2k | } else if (hfNonZeros[0] == 0.0 && hfMaxError[0] > kLimit[0]) { |
202 | 1.26k | thresholds[0] = kMul[0] * hfMaxError[0] * new_quant / orig_quant; |
203 | 1.26k | } |
204 | 26.1k | } |
205 | | // Heuristic for improving accuracy of high-frequency patterns |
206 | | // occurring in an environment with no medium-frequency masking |
207 | | // patterns. |
208 | 285k | { |
209 | 285k | float all = |
210 | 285k | hfNonZeros[0] + hfNonZeros[1] + hfNonZeros[2] + hfNonZeros[3] + 1; |
211 | 285k | float mul[3] = {70, 30, 60}; |
212 | 285k | if (mul[c] * sum_of_highest_freq_row_and_column >= all) { |
213 | 93.4k | *quant += mul[c] * sum_of_highest_freq_row_and_column / all; |
214 | 93.4k | if (*quant >= Quantizer::kQuantMax) { |
215 | 0 | *quant = Quantizer::kQuantMax - 1; |
216 | 0 | } |
217 | 93.4k | } |
218 | 285k | } |
219 | 285k | if (quant_kind == AcStrategyType::DCT) { |
220 | | // If this 8x8 block is too flat, increase the adaptive quantization level |
221 | | // a bit to reduce visible block boundaries and requantize the block. |
222 | 187k | if (hfNonZeros[0] + hfNonZeros[1] + hfNonZeros[2] + hfNonZeros[3] < 11) { |
223 | 54.7k | *quant += 1; |
224 | 54.7k | if (*quant >= Quantizer::kQuantMax) { |
225 | 0 | *quant = Quantizer::kQuantMax - 1; |
226 | 0 | } |
227 | 54.7k | } |
228 | 187k | } |
229 | 285k | { |
230 | 285k | static const double kMul1[4][3] = { |
231 | 285k | { |
232 | 285k | 0.22080615753848404, |
233 | 285k | 0.45797479824262011, |
234 | 285k | 0.29859235095977965, |
235 | 285k | }, |
236 | 285k | { |
237 | 285k | 0.70109486510286834, |
238 | 285k | 0.16185281305512639, |
239 | 285k | 0.14387691730035473, |
240 | 285k | }, |
241 | 285k | { |
242 | 285k | 0.114985964456218638, |
243 | 285k | 0.44656840441027695, |
244 | 285k | 0.10587658215149048, |
245 | 285k | }, |
246 | 285k | { |
247 | 285k | 0.46849665264409396, |
248 | 285k | 0.41239077937781954, |
249 | 285k | 0.088667407767185444, |
250 | 285k | }, |
251 | 285k | }; |
252 | 285k | static const double kMul2[4][3] = { |
253 | 285k | { |
254 | 285k | 0.27450281941822197, |
255 | 285k | 1.1255766549984996, |
256 | 285k | 0.98950459134128388, |
257 | 285k | }, |
258 | 285k | { |
259 | 285k | 0.4652168675598285, |
260 | 285k | 0.40945807983455818, |
261 | 285k | 0.36581899811751367, |
262 | 285k | }, |
263 | 285k | { |
264 | 285k | 0.28034972424715715, |
265 | 285k | 0.9182653201929738, |
266 | 285k | 1.5581531543057416, |
267 | 285k | }, |
268 | 285k | { |
269 | 285k | 0.26873118114033728, |
270 | 285k | 0.68863712390392484, |
271 | 285k | 1.2082185408666786, |
272 | 285k | }, |
273 | 285k | }; |
274 | 285k | static const double kQuantNormalizer = 2.2942708343284721; |
275 | 285k | sum_of_error *= kQuantNormalizer; |
276 | 285k | sum_of_vals *= kQuantNormalizer; |
277 | 285k | if (quant_kind >= AcStrategyType::DCT16X16) { |
278 | 97.9k | int ix = 3; |
279 | 97.9k | if (quant_kind == AcStrategyType::DCT32X16 || |
280 | 97.9k | quant_kind == AcStrategyType::DCT16X32) { |
281 | 12.5k | ix = 1; |
282 | 85.4k | } else if (quant_kind == AcStrategyType::DCT16X16) { |
283 | 19.1k | ix = 0; |
284 | 66.3k | } else if (quant_kind == AcStrategyType::DCT32X32) { |
285 | 12.3k | ix = 2; |
286 | 12.3k | } |
287 | 97.9k | int step = |
288 | 97.9k | sum_of_error / (kMul1[ix][c] * xsize * ysize * kBlockDim * kBlockDim + |
289 | 97.9k | kMul2[ix][c] * sum_of_vals); |
290 | 97.9k | if (step >= 2) { |
291 | 355 | step = 2; |
292 | 355 | } |
293 | 97.9k | if (step < 0) { |
294 | 48 | step = 0; |
295 | 48 | } |
296 | 97.9k | if (sum_of_error > kMul1[ix][c] * xsize * ysize * kBlockDim * kBlockDim + |
297 | 97.9k | kMul2[ix][c] * sum_of_vals) { |
298 | 5.49k | *quant += step; |
299 | 5.49k | if (*quant >= Quantizer::kQuantMax) { |
300 | 0 | *quant = Quantizer::kQuantMax - 1; |
301 | 0 | } |
302 | 5.49k | } |
303 | 97.9k | } |
304 | 285k | } |
305 | 285k | { |
306 | | // Reduce quant in highly active areas. |
307 | 285k | int32_t div = (xsize * ysize); |
308 | 285k | int32_t activity = (static_cast<int32_t>(hfNonZeros[0]) + div / 2) / div; |
309 | 285k | int32_t orig_qp_limit = std::max(4, *quant / 2); |
310 | 1.14M | for (int i = 1; i < 4; ++i) { |
311 | 856k | activity = std::min( |
312 | 856k | activity, (static_cast<int32_t>(hfNonZeros[i]) + div / 2) / div); |
313 | 856k | } |
314 | 285k | if (activity >= 15) { |
315 | 92.9k | activity = 15; |
316 | 92.9k | } |
317 | 285k | int32_t qp = *quant - activity; |
318 | 285k | if (c == 1) { |
319 | 380k | for (int i = 1; i < 4; ++i) { |
320 | 285k | thresholds[i] += 0.01 * activity; |
321 | 285k | } |
322 | 95.1k | } |
323 | 285k | if (qp < orig_qp_limit) { |
324 | 132k | qp = orig_qp_limit; |
325 | 132k | } |
326 | 285k | *quant = qp; |
327 | 285k | } |
328 | 285k | } Unexecuted instantiation: jxl::N_SSE4::AdjustQuantBlockAC(jxl::Quantizer const&, unsigned long, float, jxl::AcStrategyType, unsigned long, unsigned long, float*, float const*, int*) jxl::N_AVX2::AdjustQuantBlockAC(jxl::Quantizer const&, unsigned long, float, jxl::AcStrategyType, unsigned long, unsigned long, float*, float const*, int*) Line | Count | Source | 107 | 725k | const float* JXL_RESTRICT block_in, int32_t* quant) { | 108 | | // No quantization adjusting for these small blocks. | 109 | | // Quantization adjusting attempts to fix some known issues | 110 | | // with larger blocks and on the 8x8 dct's emerging 8x8 blockiness | 111 | | // when there are not many non-zeros. | 112 | 725k | constexpr size_t kPartialBlockKinds = | 113 | 725k | (1 << static_cast<size_t>(AcStrategyType::IDENTITY)) | | 114 | 725k | (1 << static_cast<size_t>(AcStrategyType::DCT2X2)) | | 115 | 725k | (1 << static_cast<size_t>(AcStrategyType::DCT4X4)) | | 116 | 725k | (1 << static_cast<size_t>(AcStrategyType::DCT4X8)) | | 117 | 725k | (1 << static_cast<size_t>(AcStrategyType::DCT8X4)) | | 118 | 725k | (1 << static_cast<size_t>(AcStrategyType::AFV0)) | | 119 | 725k | (1 << static_cast<size_t>(AcStrategyType::AFV1)) | | 120 | 725k | (1 << static_cast<size_t>(AcStrategyType::AFV2)) | | 121 | 725k | (1 << static_cast<size_t>(AcStrategyType::AFV3)); | 122 | 725k | if ((1 << static_cast<size_t>(quant_kind)) & kPartialBlockKinds) { | 123 | 439k | return; | 124 | 439k | } | 125 | | | 126 | 285k | const float* JXL_RESTRICT qm = quantizer.InvDequantMatrix(quant_kind, c); | 127 | 285k | float qac = quantizer.Scale() * (*quant); | 128 | 285k | if (xsize > 1 || ysize > 1) { | 129 | 489k | for (int i = 0; i < 4; ++i) { | 130 | 391k | thresholds[i] -= Clamp1(0.003f * xsize * ysize, 0.f, 0.08f); | 131 | 391k | if (thresholds[i] < 0.54) { | 132 | 13.8k | thresholds[i] = 0.54; | 133 | 13.8k | } | 134 | 391k | } | 135 | 97.9k | } | 136 | 285k | float sum_of_highest_freq_row_and_column = 0; | 137 | 285k | float sum_of_error = 0; | 138 | 285k | float sum_of_vals = 0; | 139 | 285k | float hfNonZeros[4] = {}; | 140 | 285k | float hfMaxError[4] = {}; | 141 | | | 142 | 3.19M | for (size_t y = 0; y < ysize * kBlockDim; y++) { | 143 | 51.1M | for (size_t x = 0; x < xsize * kBlockDim; x++) { | 144 | 48.2M | const size_t pos = y * kBlockDim * xsize + x; | 145 | 48.2M | if (x < xsize && y < ysize) { | 146 | 754k | continue; | 147 | 754k | } | 148 | 47.5M | const size_t hfix = (static_cast<size_t>(y >= ysize * kBlockDim / 2) * 2 + | 149 | 47.5M | static_cast<size_t>(x >= xsize * kBlockDim / 2)); | 150 | 47.5M | const float val = block_in[pos] * (qm[pos] * qac * qm_multiplier); | 151 | 47.5M | const float v = (std::abs(val) < thresholds[hfix]) ? 0 : rintf(val); | 152 | 47.5M | const float error = std::abs(val - v); | 153 | 47.5M | sum_of_error += error; | 154 | 47.5M | sum_of_vals += std::abs(v); | 155 | 47.5M | if (c == 1 && v == 0) { | 156 | 12.1M | if (hfMaxError[hfix] < error) { | 157 | 791k | hfMaxError[hfix] = error; | 158 | 791k | } | 159 | 12.1M | } | 160 | 47.5M | if (v != 0.0f) { | 161 | 7.47M | hfNonZeros[hfix] += std::abs(v); | 162 | 7.47M | bool in_corner = y >= 7 * ysize && x >= 7 * xsize; | 163 | 7.47M | bool on_border = | 164 | 7.47M | y == ysize * kBlockDim - 1 || x == xsize * kBlockDim - 1; | 165 | 7.47M | bool in_larger_corner = x >= 4 * xsize && y >= 4 * ysize; | 166 | 7.47M | if (in_corner || (on_border && in_larger_corner)) { | 167 | 445k | sum_of_highest_freq_row_and_column += std::abs(val); | 168 | 445k | } | 169 | 7.47M | } | 170 | 47.5M | } | 171 | 2.90M | } | 172 | 285k | if (c == 1 && sum_of_vals * 8 < xsize * ysize) { | 173 | 26.1k | static const double kLimit[4] = { | 174 | 26.1k | 0.46, | 175 | 26.1k | 0.46, | 176 | 26.1k | 0.46, | 177 | 26.1k | 0.46, | 178 | 26.1k | }; | 179 | 26.1k | static const double kMul[4] = { | 180 | 26.1k | 0.9999, | 181 | 26.1k | 0.9999, | 182 | 26.1k | 0.9999, | 183 | 26.1k | 0.9999, | 184 | 26.1k | }; | 185 | 26.1k | const int32_t orig_quant = *quant; | 186 | 26.1k | int32_t new_quant = *quant; | 187 | 102k | for (int i = 1; i < 4; ++i) { | 188 | 77.1k | if (hfNonZeros[i] == 0.0 && hfMaxError[i] > kLimit[i]) { | 189 | 980 | new_quant = orig_quant + 1; | 190 | 980 | break; | 191 | 980 | } | 192 | 77.1k | } | 193 | 26.1k | *quant = new_quant; | 194 | 26.1k | if (hfNonZeros[3] == 0.0 && hfMaxError[3] > kLimit[3]) { | 195 | 290 | thresholds[3] = kMul[3] * hfMaxError[3] * new_quant / orig_quant; | 196 | 25.9k | } else if ((hfNonZeros[1] == 0.0 && hfMaxError[1] > kLimit[1]) || | 197 | 25.9k | (hfNonZeros[2] == 0.0 && hfMaxError[2] > kLimit[2])) { | 198 | 690 | thresholds[1] = kMul[1] * std::max(hfMaxError[1], hfMaxError[2]) * | 199 | 690 | new_quant / orig_quant; | 200 | 690 | thresholds[2] = thresholds[1]; | 201 | 25.2k | } else if (hfNonZeros[0] == 0.0 && hfMaxError[0] > kLimit[0]) { | 202 | 1.26k | thresholds[0] = kMul[0] * hfMaxError[0] * new_quant / orig_quant; | 203 | 1.26k | } | 204 | 26.1k | } | 205 | | // Heuristic for improving accuracy of high-frequency patterns | 206 | | // occurring in an environment with no medium-frequency masking | 207 | | // patterns. | 208 | 285k | { | 209 | 285k | float all = | 210 | 285k | hfNonZeros[0] + hfNonZeros[1] + hfNonZeros[2] + hfNonZeros[3] + 1; | 211 | 285k | float mul[3] = {70, 30, 60}; | 212 | 285k | if (mul[c] * sum_of_highest_freq_row_and_column >= all) { | 213 | 93.4k | *quant += mul[c] * sum_of_highest_freq_row_and_column / all; | 214 | 93.4k | if (*quant >= Quantizer::kQuantMax) { | 215 | 0 | *quant = Quantizer::kQuantMax - 1; | 216 | 0 | } | 217 | 93.4k | } | 218 | 285k | } | 219 | 285k | if (quant_kind == AcStrategyType::DCT) { | 220 | | // If this 8x8 block is too flat, increase the adaptive quantization level | 221 | | // a bit to reduce visible block boundaries and requantize the block. | 222 | 187k | if (hfNonZeros[0] + hfNonZeros[1] + hfNonZeros[2] + hfNonZeros[3] < 11) { | 223 | 54.7k | *quant += 1; | 224 | 54.7k | if (*quant >= Quantizer::kQuantMax) { | 225 | 0 | *quant = Quantizer::kQuantMax - 1; | 226 | 0 | } | 227 | 54.7k | } | 228 | 187k | } | 229 | 285k | { | 230 | 285k | static const double kMul1[4][3] = { | 231 | 285k | { | 232 | 285k | 0.22080615753848404, | 233 | 285k | 0.45797479824262011, | 234 | 285k | 0.29859235095977965, | 235 | 285k | }, | 236 | 285k | { | 237 | 285k | 0.70109486510286834, | 238 | 285k | 0.16185281305512639, | 239 | 285k | 0.14387691730035473, | 240 | 285k | }, | 241 | 285k | { | 242 | 285k | 0.114985964456218638, | 243 | 285k | 0.44656840441027695, | 244 | 285k | 0.10587658215149048, | 245 | 285k | }, | 246 | 285k | { | 247 | 285k | 0.46849665264409396, | 248 | 285k | 0.41239077937781954, | 249 | 285k | 0.088667407767185444, | 250 | 285k | }, | 251 | 285k | }; | 252 | 285k | static const double kMul2[4][3] = { | 253 | 285k | { | 254 | 285k | 0.27450281941822197, | 255 | 285k | 1.1255766549984996, | 256 | 285k | 0.98950459134128388, | 257 | 285k | }, | 258 | 285k | { | 259 | 285k | 0.4652168675598285, | 260 | 285k | 0.40945807983455818, | 261 | 285k | 0.36581899811751367, | 262 | 285k | }, | 263 | 285k | { | 264 | 285k | 0.28034972424715715, | 265 | 285k | 0.9182653201929738, | 266 | 285k | 1.5581531543057416, | 267 | 285k | }, | 268 | 285k | { | 269 | 285k | 0.26873118114033728, | 270 | 285k | 0.68863712390392484, | 271 | 285k | 1.2082185408666786, | 272 | 285k | }, | 273 | 285k | }; | 274 | 285k | static const double kQuantNormalizer = 2.2942708343284721; | 275 | 285k | sum_of_error *= kQuantNormalizer; | 276 | 285k | sum_of_vals *= kQuantNormalizer; | 277 | 285k | if (quant_kind >= AcStrategyType::DCT16X16) { | 278 | 97.9k | int ix = 3; | 279 | 97.9k | if (quant_kind == AcStrategyType::DCT32X16 || | 280 | 97.9k | quant_kind == AcStrategyType::DCT16X32) { | 281 | 12.5k | ix = 1; | 282 | 85.4k | } else if (quant_kind == AcStrategyType::DCT16X16) { | 283 | 19.1k | ix = 0; | 284 | 66.3k | } else if (quant_kind == AcStrategyType::DCT32X32) { | 285 | 12.3k | ix = 2; | 286 | 12.3k | } | 287 | 97.9k | int step = | 288 | 97.9k | sum_of_error / (kMul1[ix][c] * xsize * ysize * kBlockDim * kBlockDim + | 289 | 97.9k | kMul2[ix][c] * sum_of_vals); | 290 | 97.9k | if (step >= 2) { | 291 | 355 | step = 2; | 292 | 355 | } | 293 | 97.9k | if (step < 0) { | 294 | 48 | step = 0; | 295 | 48 | } | 296 | 97.9k | if (sum_of_error > kMul1[ix][c] * xsize * ysize * kBlockDim * kBlockDim + | 297 | 97.9k | kMul2[ix][c] * sum_of_vals) { | 298 | 5.49k | *quant += step; | 299 | 5.49k | if (*quant >= Quantizer::kQuantMax) { | 300 | 0 | *quant = Quantizer::kQuantMax - 1; | 301 | 0 | } | 302 | 5.49k | } | 303 | 97.9k | } | 304 | 285k | } | 305 | 285k | { | 306 | | // Reduce quant in highly active areas. | 307 | 285k | int32_t div = (xsize * ysize); | 308 | 285k | int32_t activity = (static_cast<int32_t>(hfNonZeros[0]) + div / 2) / div; | 309 | 285k | int32_t orig_qp_limit = std::max(4, *quant / 2); | 310 | 1.14M | for (int i = 1; i < 4; ++i) { | 311 | 856k | activity = std::min( | 312 | 856k | activity, (static_cast<int32_t>(hfNonZeros[i]) + div / 2) / div); | 313 | 856k | } | 314 | 285k | if (activity >= 15) { | 315 | 92.9k | activity = 15; | 316 | 92.9k | } | 317 | 285k | int32_t qp = *quant - activity; | 318 | 285k | if (c == 1) { | 319 | 380k | for (int i = 1; i < 4; ++i) { | 320 | 285k | thresholds[i] += 0.01 * activity; | 321 | 285k | } | 322 | 95.1k | } | 323 | 285k | if (qp < orig_qp_limit) { | 324 | 132k | qp = orig_qp_limit; | 325 | 132k | } | 326 | 285k | *quant = qp; | 327 | 285k | } | 328 | 285k | } |
Unexecuted instantiation: jxl::N_SSE2::AdjustQuantBlockAC(jxl::Quantizer const&, unsigned long, float, jxl::AcStrategyType, unsigned long, unsigned long, float*, float const*, int*) |
329 | | |
330 | | // NOTE: caller takes care of extracting quant from rect of RawQuantField. |
331 | | void QuantizeRoundtripYBlockAC(PassesEncoderState* enc_state, const size_t size, |
332 | | const Quantizer& quantizer, |
333 | | const bool error_diffusion, |
334 | | AcStrategyType quant_kind, size_t xsize, |
335 | | size_t ysize, const float* JXL_RESTRICT biases, |
336 | | int32_t* quant, float* JXL_RESTRICT inout, |
337 | 241k | int32_t* JXL_RESTRICT quantized) { |
338 | 241k | float thres_y[4] = {0.58f, 0.64f, 0.64f, 0.64f}; |
339 | 241k | if (enc_state->cparams.speed_tier <= SpeedTier::kHare) { |
340 | 241k | int32_t max_quant = 0; |
341 | 241k | int quant_orig = *quant; |
342 | 241k | float val[3] = {enc_state->x_qm_multiplier, 1.0f, |
343 | 241k | enc_state->b_qm_multiplier}; |
344 | 725k | for (int c : {1, 0, 2}) { |
345 | 725k | float thres[4] = {0.58f, 0.64f, 0.64f, 0.64f}; |
346 | 725k | *quant = quant_orig; |
347 | 725k | AdjustQuantBlockAC(quantizer, c, val[c], quant_kind, xsize, ysize, |
348 | 725k | &thres[0], inout + c * size, quant); |
349 | | // Dead zone adjustment |
350 | 725k | if (c == 1) { |
351 | 1.20M | for (int k = 0; k < 4; ++k) { |
352 | 967k | thres_y[k] = thres[k]; |
353 | 967k | } |
354 | 241k | } |
355 | 725k | max_quant = std::max(*quant, max_quant); |
356 | 725k | } |
357 | 241k | *quant = max_quant; |
358 | 241k | } else { |
359 | 0 | thres_y[0] = 0.56; |
360 | 0 | thres_y[1] = 0.62; |
361 | 0 | thres_y[2] = 0.62; |
362 | 0 | thres_y[3] = 0.62; |
363 | 0 | } |
364 | | |
365 | 241k | QuantizeBlockAC(quantizer, error_diffusion, 1, 1.0f, quant_kind, xsize, ysize, |
366 | 241k | &thres_y[0], inout + size, quant, quantized + size); |
367 | | |
368 | 241k | const float* JXL_RESTRICT dequant_matrix = |
369 | 241k | quantizer.DequantMatrix(quant_kind, 1); |
370 | | |
371 | 241k | HWY_CAPPED(float, kDCTBlockSize) df; |
372 | 241k | HWY_CAPPED(int32_t, kDCTBlockSize) di; |
373 | 241k | const auto inv_qac = Set(df, quantizer.inv_quant_ac(*quant)); |
374 | 3.42M | for (size_t k = 0; k < kDCTBlockSize * xsize * ysize; k += Lanes(df)) { |
375 | 3.18M | const auto oquant = Load(di, quantized + size + k); |
376 | 3.18M | const auto adj_quant = AdjustQuantBias(di, 1, oquant, biases); |
377 | 3.18M | const auto dequantm = Load(df, dequant_matrix + k); |
378 | 3.18M | Store(Mul(Mul(adj_quant, dequantm), inv_qac), df, inout + size + k); |
379 | 3.18M | } |
380 | 241k | } Unexecuted instantiation: jxl::N_SSE4::QuantizeRoundtripYBlockAC(jxl::PassesEncoderState*, unsigned long, jxl::Quantizer const&, bool, jxl::AcStrategyType, unsigned long, unsigned long, float const*, int*, float*, int*) jxl::N_AVX2::QuantizeRoundtripYBlockAC(jxl::PassesEncoderState*, unsigned long, jxl::Quantizer const&, bool, jxl::AcStrategyType, unsigned long, unsigned long, float const*, int*, float*, int*) Line | Count | Source | 337 | 241k | int32_t* JXL_RESTRICT quantized) { | 338 | 241k | float thres_y[4] = {0.58f, 0.64f, 0.64f, 0.64f}; | 339 | 241k | if (enc_state->cparams.speed_tier <= SpeedTier::kHare) { | 340 | 241k | int32_t max_quant = 0; | 341 | 241k | int quant_orig = *quant; | 342 | 241k | float val[3] = {enc_state->x_qm_multiplier, 1.0f, | 343 | 241k | enc_state->b_qm_multiplier}; | 344 | 725k | for (int c : {1, 0, 2}) { | 345 | 725k | float thres[4] = {0.58f, 0.64f, 0.64f, 0.64f}; | 346 | 725k | *quant = quant_orig; | 347 | 725k | AdjustQuantBlockAC(quantizer, c, val[c], quant_kind, xsize, ysize, | 348 | 725k | &thres[0], inout + c * size, quant); | 349 | | // Dead zone adjustment | 350 | 725k | if (c == 1) { | 351 | 1.20M | for (int k = 0; k < 4; ++k) { | 352 | 967k | thres_y[k] = thres[k]; | 353 | 967k | } | 354 | 241k | } | 355 | 725k | max_quant = std::max(*quant, max_quant); | 356 | 725k | } | 357 | 241k | *quant = max_quant; | 358 | 241k | } else { | 359 | 0 | thres_y[0] = 0.56; | 360 | 0 | thres_y[1] = 0.62; | 361 | 0 | thres_y[2] = 0.62; | 362 | 0 | thres_y[3] = 0.62; | 363 | 0 | } | 364 | | | 365 | 241k | QuantizeBlockAC(quantizer, error_diffusion, 1, 1.0f, quant_kind, xsize, ysize, | 366 | 241k | &thres_y[0], inout + size, quant, quantized + size); | 367 | | | 368 | 241k | const float* JXL_RESTRICT dequant_matrix = | 369 | 241k | quantizer.DequantMatrix(quant_kind, 1); | 370 | | | 371 | 241k | HWY_CAPPED(float, kDCTBlockSize) df; | 372 | 241k | HWY_CAPPED(int32_t, kDCTBlockSize) di; | 373 | 241k | const auto inv_qac = Set(df, quantizer.inv_quant_ac(*quant)); | 374 | 3.42M | for (size_t k = 0; k < kDCTBlockSize * xsize * ysize; k += Lanes(df)) { | 375 | 3.18M | const auto oquant = Load(di, quantized + size + k); | 376 | 3.18M | const auto adj_quant = AdjustQuantBias(di, 1, oquant, biases); | 377 | 3.18M | const auto dequantm = Load(df, dequant_matrix + k); | 378 | 3.18M | Store(Mul(Mul(adj_quant, dequantm), inv_qac), df, inout + size + k); | 379 | 3.18M | } | 380 | 241k | } |
Unexecuted instantiation: jxl::N_SSE2::QuantizeRoundtripYBlockAC(jxl::PassesEncoderState*, unsigned long, jxl::Quantizer const&, bool, jxl::AcStrategyType, unsigned long, unsigned long, float const*, int*, float*, int*) |
381 | | |
382 | | Status ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state, |
383 | | const Image3F& opsin, const Rect& rect, |
384 | 595 | Image3F* dc) { |
385 | 595 | JxlMemoryManager* memory_manager = opsin.memory_manager(); |
386 | 595 | const Rect block_group_rect = |
387 | 595 | enc_state->shared.frame_dim.BlockGroupRect(group_idx); |
388 | 595 | const Rect cmap_rect( |
389 | 595 | block_group_rect.x0() / kColorTileDimInBlocks, |
390 | 595 | block_group_rect.y0() / kColorTileDimInBlocks, |
391 | 595 | DivCeil(block_group_rect.xsize(), kColorTileDimInBlocks), |
392 | 595 | DivCeil(block_group_rect.ysize(), kColorTileDimInBlocks)); |
393 | 595 | const Rect group_rect = |
394 | 595 | enc_state->shared.frame_dim.GroupRect(group_idx).Translate(rect.x0(), |
395 | 595 | rect.y0()); |
396 | | |
397 | 595 | const size_t xsize_blocks = block_group_rect.xsize(); |
398 | 595 | const size_t ysize_blocks = block_group_rect.ysize(); |
399 | | |
400 | 595 | const size_t dc_stride = static_cast<size_t>(dc->PixelsPerRow()); |
401 | 595 | const size_t opsin_stride = static_cast<size_t>(opsin.PixelsPerRow()); |
402 | | |
403 | 595 | ImageI& full_quant_field = enc_state->shared.raw_quant_field; |
404 | 595 | const CompressParams& cparams = enc_state->cparams; |
405 | | |
406 | 595 | const size_t dct_scratch_size = |
407 | 595 | 3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim; |
408 | | |
409 | | // TODO(veluca): consider strategies to reduce this memory. |
410 | 595 | size_t mem_bytes = 3 * AcStrategy::kMaxCoeffArea * sizeof(int32_t); |
411 | 595 | JXL_ASSIGN_OR_RETURN(auto mem, |
412 | 595 | AlignedMemory::Create(memory_manager, mem_bytes)); |
413 | 595 | size_t fmem_bytes = |
414 | 595 | (5 * AcStrategy::kMaxCoeffArea + dct_scratch_size) * sizeof(float); |
415 | 595 | JXL_ASSIGN_OR_RETURN(auto fmem, |
416 | 595 | AlignedMemory::Create(memory_manager, fmem_bytes)); |
417 | 595 | float* JXL_RESTRICT scratch_space = |
418 | 595 | fmem.address<float>() + 3 * AcStrategy::kMaxCoeffArea; |
419 | 595 | { |
420 | | // Only use error diffusion in Squirrel mode or slower. |
421 | 595 | const bool error_diffusion = cparams.speed_tier <= SpeedTier::kSquirrel; |
422 | 595 | constexpr HWY_CAPPED(float, kDCTBlockSize) d; |
423 | | |
424 | 595 | int32_t* JXL_RESTRICT coeffs[3][kMaxNumPasses] = {}; |
425 | 595 | size_t num_passes = enc_state->progressive_splitter.GetNumPasses(); |
426 | 595 | JXL_ENSURE(num_passes > 0); |
427 | 1.19k | for (size_t i = 0; i < num_passes; i++) { |
428 | | // TODO(veluca): 16-bit quantized coeffs are not implemented yet. |
429 | 595 | JXL_ENSURE(enc_state->coeffs[i]->Type() == ACType::k32); |
430 | 2.38k | for (size_t c = 0; c < 3; c++) { |
431 | 1.78k | coeffs[c][i] = enc_state->coeffs[i]->PlaneRow(c, group_idx, 0).ptr32; |
432 | 1.78k | } |
433 | 595 | } |
434 | | |
435 | 595 | HWY_ALIGN float* coeffs_in = fmem.address<float>(); |
436 | 595 | HWY_ALIGN int32_t* quantized = mem.address<int32_t>(); |
437 | | |
438 | 15.0k | for (size_t by = 0; by < ysize_blocks; ++by) { |
439 | 14.4k | int32_t* JXL_RESTRICT row_quant_ac = |
440 | 14.4k | block_group_rect.Row(&full_quant_field, by); |
441 | 14.4k | size_t ty = by / kColorTileDimInBlocks; |
442 | 14.4k | const int8_t* JXL_RESTRICT row_cmap[3] = { |
443 | 14.4k | cmap_rect.ConstRow(enc_state->shared.cmap.ytox_map, ty), |
444 | 14.4k | nullptr, |
445 | 14.4k | cmap_rect.ConstRow(enc_state->shared.cmap.ytob_map, ty), |
446 | 14.4k | }; |
447 | 14.4k | const float* JXL_RESTRICT opsin_rows[3] = { |
448 | 14.4k | group_rect.ConstPlaneRow(opsin, 0, by * kBlockDim), |
449 | 14.4k | group_rect.ConstPlaneRow(opsin, 1, by * kBlockDim), |
450 | 14.4k | group_rect.ConstPlaneRow(opsin, 2, by * kBlockDim), |
451 | 14.4k | }; |
452 | 14.4k | float* JXL_RESTRICT dc_rows[3] = { |
453 | 14.4k | block_group_rect.PlaneRow(dc, 0, by), |
454 | 14.4k | block_group_rect.PlaneRow(dc, 1, by), |
455 | 14.4k | block_group_rect.PlaneRow(dc, 2, by), |
456 | 14.4k | }; |
457 | 14.4k | AcStrategyRow ac_strategy_row = |
458 | 14.4k | enc_state->shared.ac_strategy.ConstRow(block_group_rect, by); |
459 | 67.9k | for (size_t tx = 0; tx < DivCeil(xsize_blocks, kColorTileDimInBlocks); |
460 | 53.4k | tx++) { |
461 | 53.4k | const auto x_factor = |
462 | 53.4k | Set(d, enc_state->shared.cmap.base().YtoXRatio(row_cmap[0][tx])); |
463 | 53.4k | const auto b_factor = |
464 | 53.4k | Set(d, enc_state->shared.cmap.base().YtoBRatio(row_cmap[2][tx])); |
465 | 53.4k | for (size_t bx = tx * kColorTileDimInBlocks; |
466 | 451k | bx < xsize_blocks && bx < (tx + 1) * kColorTileDimInBlocks; ++bx) { |
467 | 398k | const AcStrategy acs = ac_strategy_row[bx]; |
468 | 398k | if (!acs.IsFirstBlock()) continue; |
469 | | |
470 | 241k | size_t xblocks = acs.covered_blocks_x(); |
471 | 241k | size_t yblocks = acs.covered_blocks_y(); |
472 | | |
473 | 241k | CoefficientLayout(&yblocks, &xblocks); |
474 | | |
475 | 241k | size_t size = kDCTBlockSize * xblocks * yblocks; |
476 | | |
477 | | // DCT Y channel, roundtrip-quantize it and set DC. |
478 | 241k | int32_t quant_ac = row_quant_ac[bx]; |
479 | 725k | for (size_t c : {0, 1, 2}) { |
480 | 725k | TransformFromPixels(acs.Strategy(), opsin_rows[c] + bx * kBlockDim, |
481 | 725k | opsin_stride, coeffs_in + c * size, |
482 | 725k | scratch_space); |
483 | 725k | } |
484 | 241k | DCFromLowestFrequencies(acs.Strategy(), coeffs_in + size, |
485 | 241k | dc_rows[1] + bx, dc_stride, scratch_space); |
486 | | |
487 | 241k | QuantizeRoundtripYBlockAC( |
488 | 241k | enc_state, size, enc_state->shared.quantizer, error_diffusion, |
489 | 241k | acs.Strategy(), xblocks, yblocks, kDefaultQuantBias, &quant_ac, |
490 | 241k | coeffs_in, quantized); |
491 | | |
492 | | // Unapply color correlation |
493 | 3.42M | for (size_t k = 0; k < size; k += Lanes(d)) { |
494 | 3.18M | const auto in_x = Load(d, coeffs_in + k); |
495 | 3.18M | const auto in_y = Load(d, coeffs_in + size + k); |
496 | 3.18M | const auto in_b = Load(d, coeffs_in + 2 * size + k); |
497 | 3.18M | const auto out_x = NegMulAdd(x_factor, in_y, in_x); |
498 | 3.18M | const auto out_b = NegMulAdd(b_factor, in_y, in_b); |
499 | 3.18M | Store(out_x, d, coeffs_in + k); |
500 | 3.18M | Store(out_b, d, coeffs_in + 2 * size + k); |
501 | 3.18M | } |
502 | | |
503 | | // Quantize X and B channels and set DC. |
504 | 483k | for (size_t c : {0, 2}) { |
505 | 483k | float thres[4] = {0.58f, 0.62f, 0.62f, 0.62f}; |
506 | 483k | QuantizeBlockAC(enc_state->shared.quantizer, error_diffusion, c, |
507 | 483k | c == 0 ? enc_state->x_qm_multiplier |
508 | 483k | : enc_state->b_qm_multiplier, |
509 | 483k | acs.Strategy(), xblocks, yblocks, &thres[0], |
510 | 483k | coeffs_in + c * size, &quant_ac, |
511 | 483k | quantized + c * size); |
512 | 483k | DCFromLowestFrequencies(acs.Strategy(), coeffs_in + c * size, |
513 | 483k | dc_rows[c] + bx, dc_stride, scratch_space); |
514 | 483k | } |
515 | 241k | row_quant_ac[bx] = quant_ac; |
516 | 967k | for (size_t c = 0; c < 3; c++) { |
517 | 725k | enc_state->progressive_splitter.SplitACCoefficients( |
518 | 725k | quantized + c * size, acs, bx, by, coeffs[c]); |
519 | 1.45M | for (size_t p = 0; p < num_passes; p++) { |
520 | 725k | coeffs[c][p] += size; |
521 | 725k | } |
522 | 725k | } |
523 | 241k | } |
524 | 53.4k | } |
525 | 14.4k | } |
526 | 595 | } |
527 | 0 | return true; |
528 | 595 | } Unexecuted instantiation: jxl::N_SSE4::ComputeCoefficients(unsigned long, jxl::PassesEncoderState*, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::Image3<float>*) jxl::N_AVX2::ComputeCoefficients(unsigned long, jxl::PassesEncoderState*, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::Image3<float>*) Line | Count | Source | 384 | 595 | Image3F* dc) { | 385 | 595 | JxlMemoryManager* memory_manager = opsin.memory_manager(); | 386 | 595 | const Rect block_group_rect = | 387 | 595 | enc_state->shared.frame_dim.BlockGroupRect(group_idx); | 388 | 595 | const Rect cmap_rect( | 389 | 595 | block_group_rect.x0() / kColorTileDimInBlocks, | 390 | 595 | block_group_rect.y0() / kColorTileDimInBlocks, | 391 | 595 | DivCeil(block_group_rect.xsize(), kColorTileDimInBlocks), | 392 | 595 | DivCeil(block_group_rect.ysize(), kColorTileDimInBlocks)); | 393 | 595 | const Rect group_rect = | 394 | 595 | enc_state->shared.frame_dim.GroupRect(group_idx).Translate(rect.x0(), | 395 | 595 | rect.y0()); | 396 | | | 397 | 595 | const size_t xsize_blocks = block_group_rect.xsize(); | 398 | 595 | const size_t ysize_blocks = block_group_rect.ysize(); | 399 | | | 400 | 595 | const size_t dc_stride = static_cast<size_t>(dc->PixelsPerRow()); | 401 | 595 | const size_t opsin_stride = static_cast<size_t>(opsin.PixelsPerRow()); | 402 | | | 403 | 595 | ImageI& full_quant_field = enc_state->shared.raw_quant_field; | 404 | 595 | const CompressParams& cparams = enc_state->cparams; | 405 | | | 406 | 595 | const size_t dct_scratch_size = | 407 | 595 | 3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim; | 408 | | | 409 | | // TODO(veluca): consider strategies to reduce this memory. | 410 | 595 | size_t mem_bytes = 3 * AcStrategy::kMaxCoeffArea * sizeof(int32_t); | 411 | 595 | JXL_ASSIGN_OR_RETURN(auto mem, | 412 | 595 | AlignedMemory::Create(memory_manager, mem_bytes)); | 413 | 595 | size_t fmem_bytes = | 414 | 595 | (5 * AcStrategy::kMaxCoeffArea + dct_scratch_size) * sizeof(float); | 415 | 595 | JXL_ASSIGN_OR_RETURN(auto fmem, | 416 | 595 | AlignedMemory::Create(memory_manager, fmem_bytes)); | 417 | 595 | float* JXL_RESTRICT scratch_space = | 418 | 595 | fmem.address<float>() + 3 * AcStrategy::kMaxCoeffArea; | 419 | 595 | { | 420 | | // Only use error diffusion in Squirrel mode or slower. | 421 | 595 | const bool error_diffusion = cparams.speed_tier <= SpeedTier::kSquirrel; | 422 | 595 | constexpr HWY_CAPPED(float, kDCTBlockSize) d; | 423 | | | 424 | 595 | int32_t* JXL_RESTRICT coeffs[3][kMaxNumPasses] = {}; | 425 | 595 | size_t num_passes = enc_state->progressive_splitter.GetNumPasses(); | 426 | 595 | JXL_ENSURE(num_passes > 0); | 427 | 1.19k | for (size_t i = 0; i < num_passes; i++) { | 428 | | // TODO(veluca): 16-bit quantized coeffs are not implemented yet. | 429 | 595 | JXL_ENSURE(enc_state->coeffs[i]->Type() == ACType::k32); | 430 | 2.38k | for (size_t c = 0; c < 3; c++) { | 431 | 1.78k | coeffs[c][i] = enc_state->coeffs[i]->PlaneRow(c, group_idx, 0).ptr32; | 432 | 1.78k | } | 433 | 595 | } | 434 | | | 435 | 595 | HWY_ALIGN float* coeffs_in = fmem.address<float>(); | 436 | 595 | HWY_ALIGN int32_t* quantized = mem.address<int32_t>(); | 437 | | | 438 | 15.0k | for (size_t by = 0; by < ysize_blocks; ++by) { | 439 | 14.4k | int32_t* JXL_RESTRICT row_quant_ac = | 440 | 14.4k | block_group_rect.Row(&full_quant_field, by); | 441 | 14.4k | size_t ty = by / kColorTileDimInBlocks; | 442 | 14.4k | const int8_t* JXL_RESTRICT row_cmap[3] = { | 443 | 14.4k | cmap_rect.ConstRow(enc_state->shared.cmap.ytox_map, ty), | 444 | 14.4k | nullptr, | 445 | 14.4k | cmap_rect.ConstRow(enc_state->shared.cmap.ytob_map, ty), | 446 | 14.4k | }; | 447 | 14.4k | const float* JXL_RESTRICT opsin_rows[3] = { | 448 | 14.4k | group_rect.ConstPlaneRow(opsin, 0, by * kBlockDim), | 449 | 14.4k | group_rect.ConstPlaneRow(opsin, 1, by * kBlockDim), | 450 | 14.4k | group_rect.ConstPlaneRow(opsin, 2, by * kBlockDim), | 451 | 14.4k | }; | 452 | 14.4k | float* JXL_RESTRICT dc_rows[3] = { | 453 | 14.4k | block_group_rect.PlaneRow(dc, 0, by), | 454 | 14.4k | block_group_rect.PlaneRow(dc, 1, by), | 455 | 14.4k | block_group_rect.PlaneRow(dc, 2, by), | 456 | 14.4k | }; | 457 | 14.4k | AcStrategyRow ac_strategy_row = | 458 | 14.4k | enc_state->shared.ac_strategy.ConstRow(block_group_rect, by); | 459 | 67.9k | for (size_t tx = 0; tx < DivCeil(xsize_blocks, kColorTileDimInBlocks); | 460 | 53.4k | tx++) { | 461 | 53.4k | const auto x_factor = | 462 | 53.4k | Set(d, enc_state->shared.cmap.base().YtoXRatio(row_cmap[0][tx])); | 463 | 53.4k | const auto b_factor = | 464 | 53.4k | Set(d, enc_state->shared.cmap.base().YtoBRatio(row_cmap[2][tx])); | 465 | 53.4k | for (size_t bx = tx * kColorTileDimInBlocks; | 466 | 451k | bx < xsize_blocks && bx < (tx + 1) * kColorTileDimInBlocks; ++bx) { | 467 | 398k | const AcStrategy acs = ac_strategy_row[bx]; | 468 | 398k | if (!acs.IsFirstBlock()) continue; | 469 | | | 470 | 241k | size_t xblocks = acs.covered_blocks_x(); | 471 | 241k | size_t yblocks = acs.covered_blocks_y(); | 472 | | | 473 | 241k | CoefficientLayout(&yblocks, &xblocks); | 474 | | | 475 | 241k | size_t size = kDCTBlockSize * xblocks * yblocks; | 476 | | | 477 | | // DCT Y channel, roundtrip-quantize it and set DC. | 478 | 241k | int32_t quant_ac = row_quant_ac[bx]; | 479 | 725k | for (size_t c : {0, 1, 2}) { | 480 | 725k | TransformFromPixels(acs.Strategy(), opsin_rows[c] + bx * kBlockDim, | 481 | 725k | opsin_stride, coeffs_in + c * size, | 482 | 725k | scratch_space); | 483 | 725k | } | 484 | 241k | DCFromLowestFrequencies(acs.Strategy(), coeffs_in + size, | 485 | 241k | dc_rows[1] + bx, dc_stride, scratch_space); | 486 | | | 487 | 241k | QuantizeRoundtripYBlockAC( | 488 | 241k | enc_state, size, enc_state->shared.quantizer, error_diffusion, | 489 | 241k | acs.Strategy(), xblocks, yblocks, kDefaultQuantBias, &quant_ac, | 490 | 241k | coeffs_in, quantized); | 491 | | | 492 | | // Unapply color correlation | 493 | 3.42M | for (size_t k = 0; k < size; k += Lanes(d)) { | 494 | 3.18M | const auto in_x = Load(d, coeffs_in + k); | 495 | 3.18M | const auto in_y = Load(d, coeffs_in + size + k); | 496 | 3.18M | const auto in_b = Load(d, coeffs_in + 2 * size + k); | 497 | 3.18M | const auto out_x = NegMulAdd(x_factor, in_y, in_x); | 498 | 3.18M | const auto out_b = NegMulAdd(b_factor, in_y, in_b); | 499 | 3.18M | Store(out_x, d, coeffs_in + k); | 500 | 3.18M | Store(out_b, d, coeffs_in + 2 * size + k); | 501 | 3.18M | } | 502 | | | 503 | | // Quantize X and B channels and set DC. | 504 | 483k | for (size_t c : {0, 2}) { | 505 | 483k | float thres[4] = {0.58f, 0.62f, 0.62f, 0.62f}; | 506 | 483k | QuantizeBlockAC(enc_state->shared.quantizer, error_diffusion, c, | 507 | 483k | c == 0 ? enc_state->x_qm_multiplier | 508 | 483k | : enc_state->b_qm_multiplier, | 509 | 483k | acs.Strategy(), xblocks, yblocks, &thres[0], | 510 | 483k | coeffs_in + c * size, &quant_ac, | 511 | 483k | quantized + c * size); | 512 | 483k | DCFromLowestFrequencies(acs.Strategy(), coeffs_in + c * size, | 513 | 483k | dc_rows[c] + bx, dc_stride, scratch_space); | 514 | 483k | } | 515 | 241k | row_quant_ac[bx] = quant_ac; | 516 | 967k | for (size_t c = 0; c < 3; c++) { | 517 | 725k | enc_state->progressive_splitter.SplitACCoefficients( | 518 | 725k | quantized + c * size, acs, bx, by, coeffs[c]); | 519 | 1.45M | for (size_t p = 0; p < num_passes; p++) { | 520 | 725k | coeffs[c][p] += size; | 521 | 725k | } | 522 | 725k | } | 523 | 241k | } | 524 | 53.4k | } | 525 | 14.4k | } | 526 | 595 | } | 527 | 0 | return true; | 528 | 595 | } |
Unexecuted instantiation: jxl::N_SSE2::ComputeCoefficients(unsigned long, jxl::PassesEncoderState*, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::Image3<float>*) |
529 | | |
530 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
531 | | } // namespace HWY_NAMESPACE |
532 | | } // namespace jxl |
533 | | HWY_AFTER_NAMESPACE(); |
534 | | |
535 | | #if HWY_ONCE |
536 | | namespace jxl { |
537 | | HWY_EXPORT(ComputeCoefficients); |
538 | | Status ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state, |
539 | | const Image3F& opsin, const Rect& rect, |
540 | 595 | Image3F* dc) { |
541 | 595 | return HWY_DYNAMIC_DISPATCH(ComputeCoefficients)(group_idx, enc_state, opsin, |
542 | 595 | rect, dc); |
543 | 595 | } |
544 | | |
545 | | Status EncodeGroupTokenizedCoefficients(size_t group_idx, size_t pass_idx, |
546 | | size_t histogram_idx, |
547 | | const PassesEncoderState& enc_state, |
548 | 595 | BitWriter* writer, AuxOut* aux_out) { |
549 | | // Select which histogram to use among those of the current pass. |
550 | 595 | const size_t num_histograms = enc_state.shared.num_histograms; |
551 | | // num_histograms is 0 only for lossless. |
552 | 595 | JXL_ENSURE(num_histograms == 0 || histogram_idx < num_histograms); |
553 | 595 | size_t histo_selector_bits = CeilLog2Nonzero(num_histograms); |
554 | | |
555 | 595 | if (histo_selector_bits != 0) { |
556 | 0 | JXL_RETURN_IF_ERROR( |
557 | 0 | writer->WithMaxBits(histo_selector_bits, LayerType::Ac, aux_out, [&] { |
558 | 0 | writer->Write(histo_selector_bits, histogram_idx); |
559 | 0 | return true; |
560 | 0 | })); |
561 | 0 | } |
562 | 595 | size_t context_offset = |
563 | 595 | histogram_idx * enc_state.shared.block_ctx_map.NumACContexts(); |
564 | 595 | JXL_RETURN_IF_ERROR( |
565 | 595 | WriteTokens(enc_state.passes[pass_idx].ac_tokens[group_idx], |
566 | 595 | enc_state.passes[pass_idx].codes, context_offset, writer, |
567 | 595 | LayerType::AcTokens, aux_out)); |
568 | | |
569 | 595 | return true; |
570 | 595 | } |
571 | | |
572 | | } // namespace jxl |
573 | | #endif // HWY_ONCE |