/src/libjxl/lib/jxl/enc_group.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jxl/enc_group.h" |
7 | | |
8 | | #include <jxl/memory_manager.h> |
9 | | |
10 | | #include "lib/jxl/base/status.h" |
11 | | #include "lib/jxl/memory_manager_internal.h" |
12 | | |
13 | | #undef HWY_TARGET_INCLUDE |
14 | | #define HWY_TARGET_INCLUDE "lib/jxl/enc_group.cc" |
15 | | #include <hwy/foreach_target.h> |
16 | | #include <hwy/highway.h> |
17 | | |
18 | | #include "lib/jxl/ac_strategy.h" |
19 | | #include "lib/jxl/base/bits.h" |
20 | | #include "lib/jxl/base/compiler_specific.h" |
21 | | #include "lib/jxl/base/rect.h" |
22 | | #include "lib/jxl/common.h" // kMaxNumPasses |
23 | | #include "lib/jxl/dct_util.h" |
24 | | #include "lib/jxl/dec_transforms-inl.h" |
25 | | #include "lib/jxl/enc_aux_out.h" |
26 | | #include "lib/jxl/enc_cache.h" |
27 | | #include "lib/jxl/enc_params.h" |
28 | | #include "lib/jxl/enc_transforms-inl.h" |
29 | | #include "lib/jxl/image.h" |
30 | | #include "lib/jxl/quantizer-inl.h" |
31 | | #include "lib/jxl/quantizer.h" |
32 | | #include "lib/jxl/simd_util.h" |
33 | | HWY_BEFORE_NAMESPACE(); |
34 | | namespace jxl { |
35 | | namespace HWY_NAMESPACE { |
36 | | |
37 | | // These templates are not found via ADL. |
38 | | using hwy::HWY_NAMESPACE::Abs; |
39 | | using hwy::HWY_NAMESPACE::Ge; |
40 | | using hwy::HWY_NAMESPACE::IfThenElse; |
41 | | using hwy::HWY_NAMESPACE::IfThenElseZero; |
42 | | using hwy::HWY_NAMESPACE::MaskFromVec; |
43 | | using hwy::HWY_NAMESPACE::Round; |
44 | | |
45 | | // NOTE: caller takes care of extracting quant from rect of RawQuantField. |
46 | | void QuantizeBlockAC(const Quantizer& quantizer, const bool error_diffusion, |
47 | | size_t c, float qm_multiplier, AcStrategyType quant_kind, |
48 | | size_t xsize, size_t ysize, float* thresholds, |
49 | | const float* JXL_RESTRICT block_in, const int32_t* quant, |
50 | 714 | int32_t* JXL_RESTRICT block_out) { |
51 | 714 | const float* JXL_RESTRICT qm = quantizer.InvDequantMatrix(quant_kind, c); |
52 | 714 | float qac = quantizer.Scale() * (*quant); |
53 | | // Not SIMD-ified for now. |
54 | 714 | if (c != 1 && xsize * ysize >= 4) { |
55 | 0 | for (int i = 0; i < 4; ++i) { |
56 | 0 | thresholds[i] -= 0.00744f * xsize * ysize; |
57 | 0 | if (thresholds[i] < 0.5) { |
58 | 0 | thresholds[i] = 0.5; |
59 | 0 | } |
60 | 0 | } |
61 | 0 | } |
62 | 714 | HWY_CAPPED(float, kBlockDim) df; |
63 | 714 | HWY_CAPPED(int32_t, kBlockDim) di; |
64 | 714 | HWY_CAPPED(uint32_t, kBlockDim) du; |
65 | 714 | const auto quantv = Set(df, qac * qm_multiplier); |
66 | 6.42k | for (size_t y = 0; y < ysize * kBlockDim; y++) { |
67 | 5.71k | size_t yfix = static_cast<size_t>(y >= ysize * kBlockDim / 2) * 2; |
68 | 5.71k | const size_t off = y * kBlockDim * xsize; |
69 | 11.4k | for (size_t x = 0; x < xsize * kBlockDim; x += Lanes(df)) { |
70 | 5.71k | auto threshold = Zero(df); |
71 | 5.71k | if (xsize == 1) { |
72 | 5.71k | HWY_ALIGN uint32_t kMask[kBlockDim] = {0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u}; |
73 | 5.71k | const auto mask = MaskFromVec(BitCast(df, Load(du, kMask + x))); |
74 | 5.71k | threshold = IfThenElse(mask, Set(df, thresholds[yfix + 1]), |
75 | 5.71k | Set(df, thresholds[yfix])); |
76 | 5.71k | } else { |
77 | | // Same for all lanes in the vector. |
78 | 0 | threshold = Set( |
79 | 0 | df, |
80 | 0 | thresholds[yfix + static_cast<size_t>(x >= xsize * kBlockDim / 2)]); |
81 | 0 | } |
82 | 5.71k | const auto q = Mul(Load(df, qm + off + x), quantv); |
83 | 5.71k | const auto in = Load(df, block_in + off + x); |
84 | 5.71k | const auto val = Mul(q, in); |
85 | 5.71k | const auto nzero_mask = Ge(Abs(val), threshold); |
86 | 5.71k | const auto v = ConvertTo(di, IfThenElseZero(nzero_mask, Round(val))); |
87 | 5.71k | Store(v, di, block_out + off + x); |
88 | 5.71k | } |
89 | 5.71k | } |
90 | 714 | } Unexecuted instantiation: jxl::N_SSE4::QuantizeBlockAC(jxl::Quantizer const&, bool, unsigned long, float, jxl::AcStrategyType, unsigned long, unsigned long, float*, float const*, int const*, int*) jxl::N_AVX2::QuantizeBlockAC(jxl::Quantizer const&, bool, unsigned long, float, jxl::AcStrategyType, unsigned long, unsigned long, float*, float const*, int const*, int*) Line | Count | Source | 50 | 714 | int32_t* JXL_RESTRICT block_out) { | 51 | 714 | const float* JXL_RESTRICT qm = quantizer.InvDequantMatrix(quant_kind, c); | 52 | 714 | float qac = quantizer.Scale() * (*quant); | 53 | | // Not SIMD-ified for now. | 54 | 714 | if (c != 1 && xsize * ysize >= 4) { | 55 | 0 | for (int i = 0; i < 4; ++i) { | 56 | 0 | thresholds[i] -= 0.00744f * xsize * ysize; | 57 | 0 | if (thresholds[i] < 0.5) { | 58 | 0 | thresholds[i] = 0.5; | 59 | 0 | } | 60 | 0 | } | 61 | 0 | } | 62 | 714 | HWY_CAPPED(float, kBlockDim) df; | 63 | 714 | HWY_CAPPED(int32_t, kBlockDim) di; | 64 | 714 | HWY_CAPPED(uint32_t, kBlockDim) du; | 65 | 714 | const auto quantv = Set(df, qac * qm_multiplier); | 66 | 6.42k | for (size_t y = 0; y < ysize * kBlockDim; y++) { | 67 | 5.71k | size_t yfix = static_cast<size_t>(y >= ysize * kBlockDim / 2) * 2; | 68 | 5.71k | const size_t off = y * kBlockDim * xsize; | 69 | 11.4k | for (size_t x = 0; x < xsize * kBlockDim; x += Lanes(df)) { | 70 | 5.71k | auto threshold = Zero(df); | 71 | 5.71k | if (xsize == 1) { | 72 | 5.71k | HWY_ALIGN uint32_t kMask[kBlockDim] = {0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u}; | 73 | 5.71k | const auto mask = MaskFromVec(BitCast(df, Load(du, kMask + x))); | 74 | 5.71k | threshold = IfThenElse(mask, Set(df, thresholds[yfix + 1]), | 75 | 5.71k | Set(df, thresholds[yfix])); | 76 | 5.71k | } else { | 77 | | // Same for all lanes in the vector. 
| 78 | 0 | threshold = Set( | 79 | 0 | df, | 80 | 0 | thresholds[yfix + static_cast<size_t>(x >= xsize * kBlockDim / 2)]); | 81 | 0 | } | 82 | 5.71k | const auto q = Mul(Load(df, qm + off + x), quantv); | 83 | 5.71k | const auto in = Load(df, block_in + off + x); | 84 | 5.71k | const auto val = Mul(q, in); | 85 | 5.71k | const auto nzero_mask = Ge(Abs(val), threshold); | 86 | 5.71k | const auto v = ConvertTo(di, IfThenElseZero(nzero_mask, Round(val))); | 87 | 5.71k | Store(v, di, block_out + off + x); | 88 | 5.71k | } | 89 | 5.71k | } | 90 | 714 | } |
Unexecuted instantiation: jxl::N_SSE2::QuantizeBlockAC(jxl::Quantizer const&, bool, unsigned long, float, jxl::AcStrategyType, unsigned long, unsigned long, float*, float const*, int const*, int*) |
91 | | |
92 | | void AdjustQuantBlockAC(const Quantizer& quantizer, size_t c, |
93 | | float qm_multiplier, AcStrategyType quant_kind, |
94 | | size_t xsize, size_t ysize, float* thresholds, |
95 | 714 | const float* JXL_RESTRICT block_in, int32_t* quant) { |
96 | | // No quantization adjusting for these small blocks. |
97 | | // Quantization adjusting attempts to fix some known issues |
98 | | // with larger blocks and on the 8x8 dct's emerging 8x8 blockiness |
99 | | // when there are not many non-zeros. |
100 | 714 | constexpr size_t kPartialBlockKinds = |
101 | 714 | (1 << static_cast<size_t>(AcStrategyType::IDENTITY)) | |
102 | 714 | (1 << static_cast<size_t>(AcStrategyType::DCT2X2)) | |
103 | 714 | (1 << static_cast<size_t>(AcStrategyType::DCT4X4)) | |
104 | 714 | (1 << static_cast<size_t>(AcStrategyType::DCT4X8)) | |
105 | 714 | (1 << static_cast<size_t>(AcStrategyType::DCT8X4)) | |
106 | 714 | (1 << static_cast<size_t>(AcStrategyType::AFV0)) | |
107 | 714 | (1 << static_cast<size_t>(AcStrategyType::AFV1)) | |
108 | 714 | (1 << static_cast<size_t>(AcStrategyType::AFV2)) | |
109 | 714 | (1 << static_cast<size_t>(AcStrategyType::AFV3)); |
110 | 714 | if ((1 << static_cast<size_t>(quant_kind)) & kPartialBlockKinds) { |
111 | 714 | return; |
112 | 714 | } |
113 | | |
114 | 0 | const float* JXL_RESTRICT qm = quantizer.InvDequantMatrix(quant_kind, c); |
115 | 0 | float qac = quantizer.Scale() * (*quant); |
116 | 0 | if (xsize > 1 || ysize > 1) { |
117 | 0 | for (int i = 0; i < 4; ++i) { |
118 | 0 | thresholds[i] -= Clamp1(0.003f * xsize * ysize, 0.f, 0.08f); |
119 | 0 | if (thresholds[i] < 0.54) { |
120 | 0 | thresholds[i] = 0.54; |
121 | 0 | } |
122 | 0 | } |
123 | 0 | } |
124 | 0 | float sum_of_highest_freq_row_and_column = 0; |
125 | 0 | float sum_of_error = 0; |
126 | 0 | float sum_of_vals = 0; |
127 | 0 | float hfNonZeros[4] = {}; |
128 | 0 | float hfMaxError[4] = {}; |
129 | |
|
130 | 0 | for (size_t y = 0; y < ysize * kBlockDim; y++) { |
131 | 0 | for (size_t x = 0; x < xsize * kBlockDim; x++) { |
132 | 0 | const size_t pos = y * kBlockDim * xsize + x; |
133 | 0 | if (x < xsize && y < ysize) { |
134 | 0 | continue; |
135 | 0 | } |
136 | 0 | const size_t hfix = (static_cast<size_t>(y >= ysize * kBlockDim / 2) * 2 + |
137 | 0 | static_cast<size_t>(x >= xsize * kBlockDim / 2)); |
138 | 0 | const float val = block_in[pos] * (qm[pos] * qac * qm_multiplier); |
139 | 0 | const float v = (std::abs(val) < thresholds[hfix]) ? 0 : rintf(val); |
140 | 0 | const float error = std::abs(val - v); |
141 | 0 | sum_of_error += error; |
142 | 0 | sum_of_vals += std::abs(v); |
143 | 0 | if (c == 1 && v == 0) { |
144 | 0 | if (hfMaxError[hfix] < error) { |
145 | 0 | hfMaxError[hfix] = error; |
146 | 0 | } |
147 | 0 | } |
148 | 0 | if (v != 0.0f) { |
149 | 0 | hfNonZeros[hfix] += std::abs(v); |
150 | 0 | bool in_corner = y >= 7 * ysize && x >= 7 * xsize; |
151 | 0 | bool on_border = |
152 | 0 | y == ysize * kBlockDim - 1 || x == xsize * kBlockDim - 1; |
153 | 0 | bool in_larger_corner = x >= 4 * xsize && y >= 4 * ysize; |
154 | 0 | if (in_corner || (on_border && in_larger_corner)) { |
155 | 0 | sum_of_highest_freq_row_and_column += std::abs(val); |
156 | 0 | } |
157 | 0 | } |
158 | 0 | } |
159 | 0 | } |
160 | 0 | if (c == 1 && sum_of_vals * 8 < xsize * ysize) { |
161 | 0 | static const double kLimit[4] = { |
162 | 0 | 0.46, |
163 | 0 | 0.46, |
164 | 0 | 0.46, |
165 | 0 | 0.46, |
166 | 0 | }; |
167 | 0 | static const double kMul[4] = { |
168 | 0 | 0.9999, |
169 | 0 | 0.9999, |
170 | 0 | 0.9999, |
171 | 0 | 0.9999, |
172 | 0 | }; |
173 | 0 | const int32_t orig_quant = *quant; |
174 | 0 | int32_t new_quant = *quant; |
175 | 0 | for (int i = 1; i < 4; ++i) { |
176 | 0 | if (hfNonZeros[i] == 0.0 && hfMaxError[i] > kLimit[i]) { |
177 | 0 | new_quant = orig_quant + 1; |
178 | 0 | break; |
179 | 0 | } |
180 | 0 | } |
181 | 0 | *quant = new_quant; |
182 | 0 | if (hfNonZeros[3] == 0.0 && hfMaxError[3] > kLimit[3]) { |
183 | 0 | thresholds[3] = kMul[3] * hfMaxError[3] * new_quant / orig_quant; |
184 | 0 | } else if ((hfNonZeros[1] == 0.0 && hfMaxError[1] > kLimit[1]) || |
185 | 0 | (hfNonZeros[2] == 0.0 && hfMaxError[2] > kLimit[2])) { |
186 | 0 | thresholds[1] = kMul[1] * std::max(hfMaxError[1], hfMaxError[2]) * |
187 | 0 | new_quant / orig_quant; |
188 | 0 | thresholds[2] = thresholds[1]; |
189 | 0 | } else if (hfNonZeros[0] == 0.0 && hfMaxError[0] > kLimit[0]) { |
190 | 0 | thresholds[0] = kMul[0] * hfMaxError[0] * new_quant / orig_quant; |
191 | 0 | } |
192 | 0 | } |
193 | | // Heuristic for improving accuracy of high-frequency patterns |
194 | | // occurring in an environment with no medium-frequency masking |
195 | | // patterns. |
196 | 0 | { |
197 | 0 | float all = |
198 | 0 | hfNonZeros[0] + hfNonZeros[1] + hfNonZeros[2] + hfNonZeros[3] + 1; |
199 | 0 | float mul[3] = {70, 30, 60}; |
200 | 0 | if (mul[c] * sum_of_highest_freq_row_and_column >= all) { |
201 | 0 | *quant += mul[c] * sum_of_highest_freq_row_and_column / all; |
202 | 0 | if (*quant >= Quantizer::kQuantMax) { |
203 | 0 | *quant = Quantizer::kQuantMax - 1; |
204 | 0 | } |
205 | 0 | } |
206 | 0 | } |
207 | 0 | if (quant_kind == AcStrategyType::DCT) { |
208 | | // If this 8x8 block is too flat, increase the adaptive quantization level |
209 | | // a bit to reduce visible block boundaries and requantize the block. |
210 | 0 | if (hfNonZeros[0] + hfNonZeros[1] + hfNonZeros[2] + hfNonZeros[3] < 11) { |
211 | 0 | *quant += 1; |
212 | 0 | if (*quant >= Quantizer::kQuantMax) { |
213 | 0 | *quant = Quantizer::kQuantMax - 1; |
214 | 0 | } |
215 | 0 | } |
216 | 0 | } |
217 | 0 | { |
218 | 0 | static const double kMul1[4][3] = { |
219 | 0 | { |
220 | 0 | 0.22080615753848404, |
221 | 0 | 0.45797479824262011, |
222 | 0 | 0.29859235095977965, |
223 | 0 | }, |
224 | 0 | { |
225 | 0 | 0.70109486510286834, |
226 | 0 | 0.16185281305512639, |
227 | 0 | 0.14387691730035473, |
228 | 0 | }, |
229 | 0 | { |
230 | 0 | 0.114985964456218638, |
231 | 0 | 0.44656840441027695, |
232 | 0 | 0.10587658215149048, |
233 | 0 | }, |
234 | 0 | { |
235 | 0 | 0.46849665264409396, |
236 | 0 | 0.41239077937781954, |
237 | 0 | 0.088667407767185444, |
238 | 0 | }, |
239 | 0 | }; |
240 | 0 | static const double kMul2[4][3] = { |
241 | 0 | { |
242 | 0 | 0.27450281941822197, |
243 | 0 | 1.1255766549984996, |
244 | 0 | 0.98950459134128388, |
245 | 0 | }, |
246 | 0 | { |
247 | 0 | 0.4652168675598285, |
248 | 0 | 0.40945807983455818, |
249 | 0 | 0.36581899811751367, |
250 | 0 | }, |
251 | 0 | { |
252 | 0 | 0.28034972424715715, |
253 | 0 | 0.9182653201929738, |
254 | 0 | 1.5581531543057416, |
255 | 0 | }, |
256 | 0 | { |
257 | 0 | 0.26873118114033728, |
258 | 0 | 0.68863712390392484, |
259 | 0 | 1.2082185408666786, |
260 | 0 | }, |
261 | 0 | }; |
262 | 0 | static const double kQuantNormalizer = 2.2942708343284721; |
263 | 0 | sum_of_error *= kQuantNormalizer; |
264 | 0 | sum_of_vals *= kQuantNormalizer; |
265 | 0 | if (quant_kind >= AcStrategyType::DCT16X16) { |
266 | 0 | int ix = 3; |
267 | 0 | if (quant_kind == AcStrategyType::DCT32X16 || |
268 | 0 | quant_kind == AcStrategyType::DCT16X32) { |
269 | 0 | ix = 1; |
270 | 0 | } else if (quant_kind == AcStrategyType::DCT16X16) { |
271 | 0 | ix = 0; |
272 | 0 | } else if (quant_kind == AcStrategyType::DCT32X32) { |
273 | 0 | ix = 2; |
274 | 0 | } |
275 | 0 | int step = |
276 | 0 | sum_of_error / (kMul1[ix][c] * xsize * ysize * kBlockDim * kBlockDim + |
277 | 0 | kMul2[ix][c] * sum_of_vals); |
278 | 0 | if (step >= 2) { |
279 | 0 | step = 2; |
280 | 0 | } |
281 | 0 | if (step < 0) { |
282 | 0 | step = 0; |
283 | 0 | } |
284 | 0 | if (sum_of_error > kMul1[ix][c] * xsize * ysize * kBlockDim * kBlockDim + |
285 | 0 | kMul2[ix][c] * sum_of_vals) { |
286 | 0 | *quant += step; |
287 | 0 | if (*quant >= Quantizer::kQuantMax) { |
288 | 0 | *quant = Quantizer::kQuantMax - 1; |
289 | 0 | } |
290 | 0 | } |
291 | 0 | } |
292 | 0 | } |
293 | 0 | { |
294 | | // Reduce quant in highly active areas. |
295 | 0 | int32_t div = (xsize * ysize); |
296 | 0 | int32_t activity = (static_cast<int32_t>(hfNonZeros[0]) + div / 2) / div; |
297 | 0 | int32_t orig_qp_limit = std::max(4, *quant / 2); |
298 | 0 | for (int i = 1; i < 4; ++i) { |
299 | 0 | activity = std::min( |
300 | 0 | activity, (static_cast<int32_t>(hfNonZeros[i]) + div / 2) / div); |
301 | 0 | } |
302 | 0 | if (activity >= 15) { |
303 | 0 | activity = 15; |
304 | 0 | } |
305 | 0 | int32_t qp = *quant - activity; |
306 | 0 | if (c == 1) { |
307 | 0 | for (int i = 1; i < 4; ++i) { |
308 | 0 | thresholds[i] += 0.01 * activity; |
309 | 0 | } |
310 | 0 | } |
311 | 0 | if (qp < orig_qp_limit) { |
312 | 0 | qp = orig_qp_limit; |
313 | 0 | } |
314 | 0 | *quant = qp; |
315 | 0 | } |
316 | 0 | } Unexecuted instantiation: jxl::N_SSE4::AdjustQuantBlockAC(jxl::Quantizer const&, unsigned long, float, jxl::AcStrategyType, unsigned long, unsigned long, float*, float const*, int*) jxl::N_AVX2::AdjustQuantBlockAC(jxl::Quantizer const&, unsigned long, float, jxl::AcStrategyType, unsigned long, unsigned long, float*, float const*, int*) Line | Count | Source | 95 | 714 | const float* JXL_RESTRICT block_in, int32_t* quant) { | 96 | | // No quantization adjusting for these small blocks. | 97 | | // Quantization adjusting attempts to fix some known issues | 98 | | // with larger blocks and on the 8x8 dct's emerging 8x8 blockiness | 99 | | // when there are not many non-zeros. | 100 | 714 | constexpr size_t kPartialBlockKinds = | 101 | 714 | (1 << static_cast<size_t>(AcStrategyType::IDENTITY)) | | 102 | 714 | (1 << static_cast<size_t>(AcStrategyType::DCT2X2)) | | 103 | 714 | (1 << static_cast<size_t>(AcStrategyType::DCT4X4)) | | 104 | 714 | (1 << static_cast<size_t>(AcStrategyType::DCT4X8)) | | 105 | 714 | (1 << static_cast<size_t>(AcStrategyType::DCT8X4)) | | 106 | 714 | (1 << static_cast<size_t>(AcStrategyType::AFV0)) | | 107 | 714 | (1 << static_cast<size_t>(AcStrategyType::AFV1)) | | 108 | 714 | (1 << static_cast<size_t>(AcStrategyType::AFV2)) | | 109 | 714 | (1 << static_cast<size_t>(AcStrategyType::AFV3)); | 110 | 714 | if ((1 << static_cast<size_t>(quant_kind)) & kPartialBlockKinds) { | 111 | 714 | return; | 112 | 714 | } | 113 | | | 114 | 0 | const float* JXL_RESTRICT qm = quantizer.InvDequantMatrix(quant_kind, c); | 115 | 0 | float qac = quantizer.Scale() * (*quant); | 116 | 0 | if (xsize > 1 || ysize > 1) { | 117 | 0 | for (int i = 0; i < 4; ++i) { | 118 | 0 | thresholds[i] -= Clamp1(0.003f * xsize * ysize, 0.f, 0.08f); | 119 | 0 | if (thresholds[i] < 0.54) { | 120 | 0 | thresholds[i] = 0.54; | 121 | 0 | } | 122 | 0 | } | 123 | 0 | } | 124 | 0 | float sum_of_highest_freq_row_and_column = 0; | 125 | 0 | float sum_of_error = 0; | 126 | 0 | float 
sum_of_vals = 0; | 127 | 0 | float hfNonZeros[4] = {}; | 128 | 0 | float hfMaxError[4] = {}; | 129 | |
| 130 | 0 | for (size_t y = 0; y < ysize * kBlockDim; y++) { | 131 | 0 | for (size_t x = 0; x < xsize * kBlockDim; x++) { | 132 | 0 | const size_t pos = y * kBlockDim * xsize + x; | 133 | 0 | if (x < xsize && y < ysize) { | 134 | 0 | continue; | 135 | 0 | } | 136 | 0 | const size_t hfix = (static_cast<size_t>(y >= ysize * kBlockDim / 2) * 2 + | 137 | 0 | static_cast<size_t>(x >= xsize * kBlockDim / 2)); | 138 | 0 | const float val = block_in[pos] * (qm[pos] * qac * qm_multiplier); | 139 | 0 | const float v = (std::abs(val) < thresholds[hfix]) ? 0 : rintf(val); | 140 | 0 | const float error = std::abs(val - v); | 141 | 0 | sum_of_error += error; | 142 | 0 | sum_of_vals += std::abs(v); | 143 | 0 | if (c == 1 && v == 0) { | 144 | 0 | if (hfMaxError[hfix] < error) { | 145 | 0 | hfMaxError[hfix] = error; | 146 | 0 | } | 147 | 0 | } | 148 | 0 | if (v != 0.0f) { | 149 | 0 | hfNonZeros[hfix] += std::abs(v); | 150 | 0 | bool in_corner = y >= 7 * ysize && x >= 7 * xsize; | 151 | 0 | bool on_border = | 152 | 0 | y == ysize * kBlockDim - 1 || x == xsize * kBlockDim - 1; | 153 | 0 | bool in_larger_corner = x >= 4 * xsize && y >= 4 * ysize; | 154 | 0 | if (in_corner || (on_border && in_larger_corner)) { | 155 | 0 | sum_of_highest_freq_row_and_column += std::abs(val); | 156 | 0 | } | 157 | 0 | } | 158 | 0 | } | 159 | 0 | } | 160 | 0 | if (c == 1 && sum_of_vals * 8 < xsize * ysize) { | 161 | 0 | static const double kLimit[4] = { | 162 | 0 | 0.46, | 163 | 0 | 0.46, | 164 | 0 | 0.46, | 165 | 0 | 0.46, | 166 | 0 | }; | 167 | 0 | static const double kMul[4] = { | 168 | 0 | 0.9999, | 169 | 0 | 0.9999, | 170 | 0 | 0.9999, | 171 | 0 | 0.9999, | 172 | 0 | }; | 173 | 0 | const int32_t orig_quant = *quant; | 174 | 0 | int32_t new_quant = *quant; | 175 | 0 | for (int i = 1; i < 4; ++i) { | 176 | 0 | if (hfNonZeros[i] == 0.0 && hfMaxError[i] > kLimit[i]) { | 177 | 0 | new_quant = orig_quant + 1; | 178 | 0 | break; | 179 | 0 | } | 180 | 0 | } | 181 | 0 | *quant = new_quant; | 182 | 0 | if 
(hfNonZeros[3] == 0.0 && hfMaxError[3] > kLimit[3]) { | 183 | 0 | thresholds[3] = kMul[3] * hfMaxError[3] * new_quant / orig_quant; | 184 | 0 | } else if ((hfNonZeros[1] == 0.0 && hfMaxError[1] > kLimit[1]) || | 185 | 0 | (hfNonZeros[2] == 0.0 && hfMaxError[2] > kLimit[2])) { | 186 | 0 | thresholds[1] = kMul[1] * std::max(hfMaxError[1], hfMaxError[2]) * | 187 | 0 | new_quant / orig_quant; | 188 | 0 | thresholds[2] = thresholds[1]; | 189 | 0 | } else if (hfNonZeros[0] == 0.0 && hfMaxError[0] > kLimit[0]) { | 190 | 0 | thresholds[0] = kMul[0] * hfMaxError[0] * new_quant / orig_quant; | 191 | 0 | } | 192 | 0 | } | 193 | | // Heuristic for improving accuracy of high-frequency patterns | 194 | | // occurring in an environment with no medium-frequency masking | 195 | | // patterns. | 196 | 0 | { | 197 | 0 | float all = | 198 | 0 | hfNonZeros[0] + hfNonZeros[1] + hfNonZeros[2] + hfNonZeros[3] + 1; | 199 | 0 | float mul[3] = {70, 30, 60}; | 200 | 0 | if (mul[c] * sum_of_highest_freq_row_and_column >= all) { | 201 | 0 | *quant += mul[c] * sum_of_highest_freq_row_and_column / all; | 202 | 0 | if (*quant >= Quantizer::kQuantMax) { | 203 | 0 | *quant = Quantizer::kQuantMax - 1; | 204 | 0 | } | 205 | 0 | } | 206 | 0 | } | 207 | 0 | if (quant_kind == AcStrategyType::DCT) { | 208 | | // If this 8x8 block is too flat, increase the adaptive quantization level | 209 | | // a bit to reduce visible block boundaries and requantize the block. 
| 210 | 0 | if (hfNonZeros[0] + hfNonZeros[1] + hfNonZeros[2] + hfNonZeros[3] < 11) { | 211 | 0 | *quant += 1; | 212 | 0 | if (*quant >= Quantizer::kQuantMax) { | 213 | 0 | *quant = Quantizer::kQuantMax - 1; | 214 | 0 | } | 215 | 0 | } | 216 | 0 | } | 217 | 0 | { | 218 | 0 | static const double kMul1[4][3] = { | 219 | 0 | { | 220 | 0 | 0.22080615753848404, | 221 | 0 | 0.45797479824262011, | 222 | 0 | 0.29859235095977965, | 223 | 0 | }, | 224 | 0 | { | 225 | 0 | 0.70109486510286834, | 226 | 0 | 0.16185281305512639, | 227 | 0 | 0.14387691730035473, | 228 | 0 | }, | 229 | 0 | { | 230 | 0 | 0.114985964456218638, | 231 | 0 | 0.44656840441027695, | 232 | 0 | 0.10587658215149048, | 233 | 0 | }, | 234 | 0 | { | 235 | 0 | 0.46849665264409396, | 236 | 0 | 0.41239077937781954, | 237 | 0 | 0.088667407767185444, | 238 | 0 | }, | 239 | 0 | }; | 240 | 0 | static const double kMul2[4][3] = { | 241 | 0 | { | 242 | 0 | 0.27450281941822197, | 243 | 0 | 1.1255766549984996, | 244 | 0 | 0.98950459134128388, | 245 | 0 | }, | 246 | 0 | { | 247 | 0 | 0.4652168675598285, | 248 | 0 | 0.40945807983455818, | 249 | 0 | 0.36581899811751367, | 250 | 0 | }, | 251 | 0 | { | 252 | 0 | 0.28034972424715715, | 253 | 0 | 0.9182653201929738, | 254 | 0 | 1.5581531543057416, | 255 | 0 | }, | 256 | 0 | { | 257 | 0 | 0.26873118114033728, | 258 | 0 | 0.68863712390392484, | 259 | 0 | 1.2082185408666786, | 260 | 0 | }, | 261 | 0 | }; | 262 | 0 | static const double kQuantNormalizer = 2.2942708343284721; | 263 | 0 | sum_of_error *= kQuantNormalizer; | 264 | 0 | sum_of_vals *= kQuantNormalizer; | 265 | 0 | if (quant_kind >= AcStrategyType::DCT16X16) { | 266 | 0 | int ix = 3; | 267 | 0 | if (quant_kind == AcStrategyType::DCT32X16 || | 268 | 0 | quant_kind == AcStrategyType::DCT16X32) { | 269 | 0 | ix = 1; | 270 | 0 | } else if (quant_kind == AcStrategyType::DCT16X16) { | 271 | 0 | ix = 0; | 272 | 0 | } else if (quant_kind == AcStrategyType::DCT32X32) { | 273 | 0 | ix = 2; | 274 | 0 | } | 275 | 0 | int step = | 276 
| 0 | sum_of_error / (kMul1[ix][c] * xsize * ysize * kBlockDim * kBlockDim + | 277 | 0 | kMul2[ix][c] * sum_of_vals); | 278 | 0 | if (step >= 2) { | 279 | 0 | step = 2; | 280 | 0 | } | 281 | 0 | if (step < 0) { | 282 | 0 | step = 0; | 283 | 0 | } | 284 | 0 | if (sum_of_error > kMul1[ix][c] * xsize * ysize * kBlockDim * kBlockDim + | 285 | 0 | kMul2[ix][c] * sum_of_vals) { | 286 | 0 | *quant += step; | 287 | 0 | if (*quant >= Quantizer::kQuantMax) { | 288 | 0 | *quant = Quantizer::kQuantMax - 1; | 289 | 0 | } | 290 | 0 | } | 291 | 0 | } | 292 | 0 | } | 293 | 0 | { | 294 | | // Reduce quant in highly active areas. | 295 | 0 | int32_t div = (xsize * ysize); | 296 | 0 | int32_t activity = (static_cast<int32_t>(hfNonZeros[0]) + div / 2) / div; | 297 | 0 | int32_t orig_qp_limit = std::max(4, *quant / 2); | 298 | 0 | for (int i = 1; i < 4; ++i) { | 299 | 0 | activity = std::min( | 300 | 0 | activity, (static_cast<int32_t>(hfNonZeros[i]) + div / 2) / div); | 301 | 0 | } | 302 | 0 | if (activity >= 15) { | 303 | 0 | activity = 15; | 304 | 0 | } | 305 | 0 | int32_t qp = *quant - activity; | 306 | 0 | if (c == 1) { | 307 | 0 | for (int i = 1; i < 4; ++i) { | 308 | 0 | thresholds[i] += 0.01 * activity; | 309 | 0 | } | 310 | 0 | } | 311 | 0 | if (qp < orig_qp_limit) { | 312 | 0 | qp = orig_qp_limit; | 313 | 0 | } | 314 | 0 | *quant = qp; | 315 | 0 | } | 316 | 0 | } |
Unexecuted instantiation: jxl::N_SSE2::AdjustQuantBlockAC(jxl::Quantizer const&, unsigned long, float, jxl::AcStrategyType, unsigned long, unsigned long, float*, float const*, int*) |
317 | | |
318 | | // NOTE: caller takes care of extracting quant from rect of RawQuantField. |
319 | | void QuantizeRoundtripYBlockAC(PassesEncoderState* enc_state, const size_t size, |
320 | | const Quantizer& quantizer, |
321 | | const bool error_diffusion, |
322 | | AcStrategyType quant_kind, size_t xsize, |
323 | | size_t ysize, const float* JXL_RESTRICT biases, |
324 | | int32_t* quant, float* JXL_RESTRICT inout, |
325 | 238 | int32_t* JXL_RESTRICT quantized) { |
326 | 238 | float thres_y[4] = {0.58f, 0.64f, 0.64f, 0.64f}; |
327 | 238 | if (enc_state->cparams.speed_tier <= SpeedTier::kHare) { |
328 | 238 | int32_t max_quant = 0; |
329 | 238 | int quant_orig = *quant; |
330 | 238 | float val[3] = {enc_state->x_qm_multiplier, 1.0f, |
331 | 238 | enc_state->b_qm_multiplier}; |
332 | 714 | for (int c : {1, 0, 2}) { |
333 | 714 | float thres[4] = {0.58f, 0.64f, 0.64f, 0.64f}; |
334 | 714 | *quant = quant_orig; |
335 | 714 | AdjustQuantBlockAC(quantizer, c, val[c], quant_kind, xsize, ysize, |
336 | 714 | &thres[0], inout + c * size, quant); |
337 | | // Dead zone adjustment |
338 | 714 | if (c == 1) { |
339 | 1.19k | for (int k = 0; k < 4; ++k) { |
340 | 952 | thres_y[k] = thres[k]; |
341 | 952 | } |
342 | 238 | } |
343 | 714 | max_quant = std::max(*quant, max_quant); |
344 | 714 | } |
345 | 238 | *quant = max_quant; |
346 | 238 | } else { |
347 | 0 | thres_y[0] = 0.56; |
348 | 0 | thres_y[1] = 0.62; |
349 | 0 | thres_y[2] = 0.62; |
350 | 0 | thres_y[3] = 0.62; |
351 | 0 | } |
352 | | |
353 | 238 | QuantizeBlockAC(quantizer, error_diffusion, 1, 1.0f, quant_kind, xsize, ysize, |
354 | 238 | &thres_y[0], inout + size, quant, quantized + size); |
355 | | |
356 | 238 | const float* JXL_RESTRICT dequant_matrix = |
357 | 238 | quantizer.DequantMatrix(quant_kind, 1); |
358 | | |
359 | 238 | HWY_CAPPED(float, kDCTBlockSize) df; |
360 | 238 | HWY_CAPPED(int32_t, kDCTBlockSize) di; |
361 | 238 | const auto inv_qac = Set(df, quantizer.inv_quant_ac(*quant)); |
362 | 2.14k | for (size_t k = 0; k < kDCTBlockSize * xsize * ysize; k += Lanes(df)) { |
363 | 1.90k | const auto quant = Load(di, quantized + size + k); |
364 | 1.90k | const auto adj_quant = AdjustQuantBias(di, 1, quant, biases); |
365 | 1.90k | const auto dequantm = Load(df, dequant_matrix + k); |
366 | 1.90k | Store(Mul(Mul(adj_quant, dequantm), inv_qac), df, inout + size + k); |
367 | 1.90k | } |
368 | 238 | } Unexecuted instantiation: jxl::N_SSE4::QuantizeRoundtripYBlockAC(jxl::PassesEncoderState*, unsigned long, jxl::Quantizer const&, bool, jxl::AcStrategyType, unsigned long, unsigned long, float const*, int*, float*, int*) jxl::N_AVX2::QuantizeRoundtripYBlockAC(jxl::PassesEncoderState*, unsigned long, jxl::Quantizer const&, bool, jxl::AcStrategyType, unsigned long, unsigned long, float const*, int*, float*, int*) Line | Count | Source | 325 | 238 | int32_t* JXL_RESTRICT quantized) { | 326 | 238 | float thres_y[4] = {0.58f, 0.64f, 0.64f, 0.64f}; | 327 | 238 | if (enc_state->cparams.speed_tier <= SpeedTier::kHare) { | 328 | 238 | int32_t max_quant = 0; | 329 | 238 | int quant_orig = *quant; | 330 | 238 | float val[3] = {enc_state->x_qm_multiplier, 1.0f, | 331 | 238 | enc_state->b_qm_multiplier}; | 332 | 714 | for (int c : {1, 0, 2}) { | 333 | 714 | float thres[4] = {0.58f, 0.64f, 0.64f, 0.64f}; | 334 | 714 | *quant = quant_orig; | 335 | 714 | AdjustQuantBlockAC(quantizer, c, val[c], quant_kind, xsize, ysize, | 336 | 714 | &thres[0], inout + c * size, quant); | 337 | | // Dead zone adjustment | 338 | 714 | if (c == 1) { | 339 | 1.19k | for (int k = 0; k < 4; ++k) { | 340 | 952 | thres_y[k] = thres[k]; | 341 | 952 | } | 342 | 238 | } | 343 | 714 | max_quant = std::max(*quant, max_quant); | 344 | 714 | } | 345 | 238 | *quant = max_quant; | 346 | 238 | } else { | 347 | 0 | thres_y[0] = 0.56; | 348 | 0 | thres_y[1] = 0.62; | 349 | 0 | thres_y[2] = 0.62; | 350 | 0 | thres_y[3] = 0.62; | 351 | 0 | } | 352 | | | 353 | 238 | QuantizeBlockAC(quantizer, error_diffusion, 1, 1.0f, quant_kind, xsize, ysize, | 354 | 238 | &thres_y[0], inout + size, quant, quantized + size); | 355 | | | 356 | 238 | const float* JXL_RESTRICT dequant_matrix = | 357 | 238 | quantizer.DequantMatrix(quant_kind, 1); | 358 | | | 359 | 238 | HWY_CAPPED(float, kDCTBlockSize) df; | 360 | 238 | HWY_CAPPED(int32_t, kDCTBlockSize) di; | 361 | 238 | const auto inv_qac = Set(df, 
quantizer.inv_quant_ac(*quant)); | 362 | 2.14k | for (size_t k = 0; k < kDCTBlockSize * xsize * ysize; k += Lanes(df)) { | 363 | 1.90k | const auto quant = Load(di, quantized + size + k); | 364 | 1.90k | const auto adj_quant = AdjustQuantBias(di, 1, quant, biases); | 365 | 1.90k | const auto dequantm = Load(df, dequant_matrix + k); | 366 | 1.90k | Store(Mul(Mul(adj_quant, dequantm), inv_qac), df, inout + size + k); | 367 | 1.90k | } | 368 | 238 | } |
Unexecuted instantiation: jxl::N_SSE2::QuantizeRoundtripYBlockAC(jxl::PassesEncoderState*, unsigned long, jxl::Quantizer const&, bool, jxl::AcStrategyType, unsigned long, unsigned long, float const*, int*, float*, int*) |
369 | | |
370 | | Status ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state, |
371 | | const Image3F& opsin, const Rect& rect, |
372 | 238 | Image3F* dc) { |
373 | 238 | JxlMemoryManager* memory_manager = opsin.memory_manager(); |
374 | 238 | const Rect block_group_rect = |
375 | 238 | enc_state->shared.frame_dim.BlockGroupRect(group_idx); |
376 | 238 | const Rect cmap_rect( |
377 | 238 | block_group_rect.x0() / kColorTileDimInBlocks, |
378 | 238 | block_group_rect.y0() / kColorTileDimInBlocks, |
379 | 238 | DivCeil(block_group_rect.xsize(), kColorTileDimInBlocks), |
380 | 238 | DivCeil(block_group_rect.ysize(), kColorTileDimInBlocks)); |
381 | 238 | const Rect group_rect = |
382 | 238 | enc_state->shared.frame_dim.GroupRect(group_idx).Translate(rect.x0(), |
383 | 238 | rect.y0()); |
384 | | |
385 | 238 | const size_t xsize_blocks = block_group_rect.xsize(); |
386 | 238 | const size_t ysize_blocks = block_group_rect.ysize(); |
387 | | |
388 | 238 | const size_t dc_stride = static_cast<size_t>(dc->PixelsPerRow()); |
389 | 238 | const size_t opsin_stride = static_cast<size_t>(opsin.PixelsPerRow()); |
390 | | |
391 | 238 | ImageI& full_quant_field = enc_state->shared.raw_quant_field; |
392 | 238 | const CompressParams& cparams = enc_state->cparams; |
393 | | |
394 | 238 | const size_t dct_scratch_size = |
395 | 238 | 3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim; |
396 | | |
397 | | // TODO(veluca): consider strategies to reduce this memory. |
398 | 238 | size_t mem_bytes = 3 * AcStrategy::kMaxCoeffArea * sizeof(int32_t); |
399 | 238 | JXL_ASSIGN_OR_RETURN(auto mem, |
400 | 238 | AlignedMemory::Create(memory_manager, mem_bytes)); |
401 | 238 | size_t fmem_bytes = |
402 | 238 | (5 * AcStrategy::kMaxCoeffArea + dct_scratch_size) * sizeof(float); |
403 | 238 | JXL_ASSIGN_OR_RETURN(auto fmem, |
404 | 238 | AlignedMemory::Create(memory_manager, fmem_bytes)); |
405 | 238 | float* JXL_RESTRICT scratch_space = |
406 | 238 | fmem.address<float>() + 3 * AcStrategy::kMaxCoeffArea; |
407 | 238 | { |
408 | | // Only use error diffusion in Squirrel mode or slower. |
409 | 238 | const bool error_diffusion = cparams.speed_tier <= SpeedTier::kSquirrel; |
410 | 238 | constexpr HWY_CAPPED(float, kDCTBlockSize) d; |
411 | | |
412 | 238 | int32_t* JXL_RESTRICT coeffs[3][kMaxNumPasses] = {}; |
413 | 238 | size_t num_passes = enc_state->progressive_splitter.GetNumPasses(); |
414 | 238 | JXL_ENSURE(num_passes > 0); |
415 | 476 | for (size_t i = 0; i < num_passes; i++) { |
416 | | // TODO(veluca): 16-bit quantized coeffs are not implemented yet. |
417 | 238 | JXL_ENSURE(enc_state->coeffs[i]->Type() == ACType::k32); |
418 | 952 | for (size_t c = 0; c < 3; c++) { |
419 | 714 | coeffs[c][i] = enc_state->coeffs[i]->PlaneRow(c, group_idx, 0).ptr32; |
420 | 714 | } |
421 | 238 | } |
422 | | |
423 | 238 | HWY_ALIGN float* coeffs_in = fmem.address<float>(); |
424 | 238 | HWY_ALIGN int32_t* quantized = mem.address<int32_t>(); |
425 | | |
426 | 476 | for (size_t by = 0; by < ysize_blocks; ++by) { |
427 | 238 | int32_t* JXL_RESTRICT row_quant_ac = |
428 | 238 | block_group_rect.Row(&full_quant_field, by); |
429 | 238 | size_t ty = by / kColorTileDimInBlocks; |
430 | 238 | const int8_t* JXL_RESTRICT row_cmap[3] = { |
431 | 238 | cmap_rect.ConstRow(enc_state->shared.cmap.ytox_map, ty), |
432 | 238 | nullptr, |
433 | 238 | cmap_rect.ConstRow(enc_state->shared.cmap.ytob_map, ty), |
434 | 238 | }; |
435 | 238 | const float* JXL_RESTRICT opsin_rows[3] = { |
436 | 238 | group_rect.ConstPlaneRow(opsin, 0, by * kBlockDim), |
437 | 238 | group_rect.ConstPlaneRow(opsin, 1, by * kBlockDim), |
438 | 238 | group_rect.ConstPlaneRow(opsin, 2, by * kBlockDim), |
439 | 238 | }; |
440 | 238 | float* JXL_RESTRICT dc_rows[3] = { |
441 | 238 | block_group_rect.PlaneRow(dc, 0, by), |
442 | 238 | block_group_rect.PlaneRow(dc, 1, by), |
443 | 238 | block_group_rect.PlaneRow(dc, 2, by), |
444 | 238 | }; |
445 | 238 | AcStrategyRow ac_strategy_row = |
446 | 238 | enc_state->shared.ac_strategy.ConstRow(block_group_rect, by); |
447 | 476 | for (size_t tx = 0; tx < DivCeil(xsize_blocks, kColorTileDimInBlocks); |
448 | 238 | tx++) { |
449 | 238 | const auto x_factor = |
450 | 238 | Set(d, enc_state->shared.cmap.base().YtoXRatio(row_cmap[0][tx])); |
451 | 238 | const auto b_factor = |
452 | 238 | Set(d, enc_state->shared.cmap.base().YtoBRatio(row_cmap[2][tx])); |
453 | 238 | for (size_t bx = tx * kColorTileDimInBlocks; |
454 | 476 | bx < xsize_blocks && bx < (tx + 1) * kColorTileDimInBlocks; ++bx) { |
455 | 238 | const AcStrategy acs = ac_strategy_row[bx]; |
456 | 238 | if (!acs.IsFirstBlock()) continue; |
457 | | |
458 | 238 | size_t xblocks = acs.covered_blocks_x(); |
459 | 238 | size_t yblocks = acs.covered_blocks_y(); |
460 | | |
461 | 238 | CoefficientLayout(&yblocks, &xblocks); |
462 | | |
463 | 238 | size_t size = kDCTBlockSize * xblocks * yblocks; |
464 | | |
465 | | // DCT Y channel, roundtrip-quantize it and set DC. |
466 | 238 | int32_t quant_ac = row_quant_ac[bx]; |
467 | 714 | for (size_t c : {0, 1, 2}) { |
468 | 714 | TransformFromPixels(acs.Strategy(), opsin_rows[c] + bx * kBlockDim, |
469 | 714 | opsin_stride, coeffs_in + c * size, |
470 | 714 | scratch_space); |
471 | 714 | } |
472 | 238 | DCFromLowestFrequencies(acs.Strategy(), coeffs_in + size, |
473 | 238 | dc_rows[1] + bx, dc_stride); |
474 | | |
475 | 238 | QuantizeRoundtripYBlockAC( |
476 | 238 | enc_state, size, enc_state->shared.quantizer, error_diffusion, |
477 | 238 | acs.Strategy(), xblocks, yblocks, kDefaultQuantBias, &quant_ac, |
478 | 238 | coeffs_in, quantized); |
479 | | |
480 | | // Unapply color correlation |
481 | 2.14k | for (size_t k = 0; k < size; k += Lanes(d)) { |
482 | 1.90k | const auto in_x = Load(d, coeffs_in + k); |
483 | 1.90k | const auto in_y = Load(d, coeffs_in + size + k); |
484 | 1.90k | const auto in_b = Load(d, coeffs_in + 2 * size + k); |
485 | 1.90k | const auto out_x = NegMulAdd(x_factor, in_y, in_x); |
486 | 1.90k | const auto out_b = NegMulAdd(b_factor, in_y, in_b); |
487 | 1.90k | Store(out_x, d, coeffs_in + k); |
488 | 1.90k | Store(out_b, d, coeffs_in + 2 * size + k); |
489 | 1.90k | } |
490 | | |
491 | | // Quantize X and B channels and set DC. |
492 | 476 | for (size_t c : {0, 2}) { |
493 | 476 | float thres[4] = {0.58f, 0.62f, 0.62f, 0.62f}; |
494 | 476 | QuantizeBlockAC(enc_state->shared.quantizer, error_diffusion, c, |
495 | 476 | c == 0 ? enc_state->x_qm_multiplier |
496 | 476 | : enc_state->b_qm_multiplier, |
497 | 476 | acs.Strategy(), xblocks, yblocks, &thres[0], |
498 | 476 | coeffs_in + c * size, &quant_ac, |
499 | 476 | quantized + c * size); |
500 | 476 | DCFromLowestFrequencies(acs.Strategy(), coeffs_in + c * size, |
501 | 476 | dc_rows[c] + bx, dc_stride); |
502 | 476 | } |
503 | 238 | row_quant_ac[bx] = quant_ac; |
504 | 952 | for (size_t c = 0; c < 3; c++) { |
505 | 714 | enc_state->progressive_splitter.SplitACCoefficients( |
506 | 714 | quantized + c * size, acs, bx, by, coeffs[c]); |
507 | 1.42k | for (size_t p = 0; p < num_passes; p++) { |
508 | 714 | coeffs[c][p] += size; |
509 | 714 | } |
510 | 714 | } |
511 | 238 | } |
512 | 238 | } |
513 | 238 | } |
514 | 238 | } |
515 | 0 | return true; |
516 | 238 | } Unexecuted instantiation: jxl::N_SSE4::ComputeCoefficients(unsigned long, jxl::PassesEncoderState*, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::Image3<float>*) jxl::N_AVX2::ComputeCoefficients(unsigned long, jxl::PassesEncoderState*, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::Image3<float>*) Line | Count | Source | 372 | 238 | Image3F* dc) { | 373 | 238 | JxlMemoryManager* memory_manager = opsin.memory_manager(); | 374 | 238 | const Rect block_group_rect = | 375 | 238 | enc_state->shared.frame_dim.BlockGroupRect(group_idx); | 376 | 238 | const Rect cmap_rect( | 377 | 238 | block_group_rect.x0() / kColorTileDimInBlocks, | 378 | 238 | block_group_rect.y0() / kColorTileDimInBlocks, | 379 | 238 | DivCeil(block_group_rect.xsize(), kColorTileDimInBlocks), | 380 | 238 | DivCeil(block_group_rect.ysize(), kColorTileDimInBlocks)); | 381 | 238 | const Rect group_rect = | 382 | 238 | enc_state->shared.frame_dim.GroupRect(group_idx).Translate(rect.x0(), | 383 | 238 | rect.y0()); | 384 | | | 385 | 238 | const size_t xsize_blocks = block_group_rect.xsize(); | 386 | 238 | const size_t ysize_blocks = block_group_rect.ysize(); | 387 | | | 388 | 238 | const size_t dc_stride = static_cast<size_t>(dc->PixelsPerRow()); | 389 | 238 | const size_t opsin_stride = static_cast<size_t>(opsin.PixelsPerRow()); | 390 | | | 391 | 238 | ImageI& full_quant_field = enc_state->shared.raw_quant_field; | 392 | 238 | const CompressParams& cparams = enc_state->cparams; | 393 | | | 394 | 238 | const size_t dct_scratch_size = | 395 | 238 | 3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim; | 396 | | | 397 | | // TODO(veluca): consider strategies to reduce this memory. 
| 398 | 238 | size_t mem_bytes = 3 * AcStrategy::kMaxCoeffArea * sizeof(int32_t); | 399 | 238 | JXL_ASSIGN_OR_RETURN(auto mem, | 400 | 238 | AlignedMemory::Create(memory_manager, mem_bytes)); | 401 | 238 | size_t fmem_bytes = | 402 | 238 | (5 * AcStrategy::kMaxCoeffArea + dct_scratch_size) * sizeof(float); | 403 | 238 | JXL_ASSIGN_OR_RETURN(auto fmem, | 404 | 238 | AlignedMemory::Create(memory_manager, fmem_bytes)); | 405 | 238 | float* JXL_RESTRICT scratch_space = | 406 | 238 | fmem.address<float>() + 3 * AcStrategy::kMaxCoeffArea; | 407 | 238 | { | 408 | | // Only use error diffusion in Squirrel mode or slower. | 409 | 238 | const bool error_diffusion = cparams.speed_tier <= SpeedTier::kSquirrel; | 410 | 238 | constexpr HWY_CAPPED(float, kDCTBlockSize) d; | 411 | | | 412 | 238 | int32_t* JXL_RESTRICT coeffs[3][kMaxNumPasses] = {}; | 413 | 238 | size_t num_passes = enc_state->progressive_splitter.GetNumPasses(); | 414 | 238 | JXL_ENSURE(num_passes > 0); | 415 | 476 | for (size_t i = 0; i < num_passes; i++) { | 416 | | // TODO(veluca): 16-bit quantized coeffs are not implemented yet. 
| 417 | 238 | JXL_ENSURE(enc_state->coeffs[i]->Type() == ACType::k32); | 418 | 952 | for (size_t c = 0; c < 3; c++) { | 419 | 714 | coeffs[c][i] = enc_state->coeffs[i]->PlaneRow(c, group_idx, 0).ptr32; | 420 | 714 | } | 421 | 238 | } | 422 | | | 423 | 238 | HWY_ALIGN float* coeffs_in = fmem.address<float>(); | 424 | 238 | HWY_ALIGN int32_t* quantized = mem.address<int32_t>(); | 425 | | | 426 | 476 | for (size_t by = 0; by < ysize_blocks; ++by) { | 427 | 238 | int32_t* JXL_RESTRICT row_quant_ac = | 428 | 238 | block_group_rect.Row(&full_quant_field, by); | 429 | 238 | size_t ty = by / kColorTileDimInBlocks; | 430 | 238 | const int8_t* JXL_RESTRICT row_cmap[3] = { | 431 | 238 | cmap_rect.ConstRow(enc_state->shared.cmap.ytox_map, ty), | 432 | 238 | nullptr, | 433 | 238 | cmap_rect.ConstRow(enc_state->shared.cmap.ytob_map, ty), | 434 | 238 | }; | 435 | 238 | const float* JXL_RESTRICT opsin_rows[3] = { | 436 | 238 | group_rect.ConstPlaneRow(opsin, 0, by * kBlockDim), | 437 | 238 | group_rect.ConstPlaneRow(opsin, 1, by * kBlockDim), | 438 | 238 | group_rect.ConstPlaneRow(opsin, 2, by * kBlockDim), | 439 | 238 | }; | 440 | 238 | float* JXL_RESTRICT dc_rows[3] = { | 441 | 238 | block_group_rect.PlaneRow(dc, 0, by), | 442 | 238 | block_group_rect.PlaneRow(dc, 1, by), | 443 | 238 | block_group_rect.PlaneRow(dc, 2, by), | 444 | 238 | }; | 445 | 238 | AcStrategyRow ac_strategy_row = | 446 | 238 | enc_state->shared.ac_strategy.ConstRow(block_group_rect, by); | 447 | 476 | for (size_t tx = 0; tx < DivCeil(xsize_blocks, kColorTileDimInBlocks); | 448 | 238 | tx++) { | 449 | 238 | const auto x_factor = | 450 | 238 | Set(d, enc_state->shared.cmap.base().YtoXRatio(row_cmap[0][tx])); | 451 | 238 | const auto b_factor = | 452 | 238 | Set(d, enc_state->shared.cmap.base().YtoBRatio(row_cmap[2][tx])); | 453 | 238 | for (size_t bx = tx * kColorTileDimInBlocks; | 454 | 476 | bx < xsize_blocks && bx < (tx + 1) * kColorTileDimInBlocks; ++bx) { | 455 | 238 | const AcStrategy acs = 
ac_strategy_row[bx]; | 456 | 238 | if (!acs.IsFirstBlock()) continue; | 457 | | | 458 | 238 | size_t xblocks = acs.covered_blocks_x(); | 459 | 238 | size_t yblocks = acs.covered_blocks_y(); | 460 | | | 461 | 238 | CoefficientLayout(&yblocks, &xblocks); | 462 | | | 463 | 238 | size_t size = kDCTBlockSize * xblocks * yblocks; | 464 | | | 465 | | // DCT Y channel, roundtrip-quantize it and set DC. | 466 | 238 | int32_t quant_ac = row_quant_ac[bx]; | 467 | 714 | for (size_t c : {0, 1, 2}) { | 468 | 714 | TransformFromPixels(acs.Strategy(), opsin_rows[c] + bx * kBlockDim, | 469 | 714 | opsin_stride, coeffs_in + c * size, | 470 | 714 | scratch_space); | 471 | 714 | } | 472 | 238 | DCFromLowestFrequencies(acs.Strategy(), coeffs_in + size, | 473 | 238 | dc_rows[1] + bx, dc_stride); | 474 | | | 475 | 238 | QuantizeRoundtripYBlockAC( | 476 | 238 | enc_state, size, enc_state->shared.quantizer, error_diffusion, | 477 | 238 | acs.Strategy(), xblocks, yblocks, kDefaultQuantBias, &quant_ac, | 478 | 238 | coeffs_in, quantized); | 479 | | | 480 | | // Unapply color correlation | 481 | 2.14k | for (size_t k = 0; k < size; k += Lanes(d)) { | 482 | 1.90k | const auto in_x = Load(d, coeffs_in + k); | 483 | 1.90k | const auto in_y = Load(d, coeffs_in + size + k); | 484 | 1.90k | const auto in_b = Load(d, coeffs_in + 2 * size + k); | 485 | 1.90k | const auto out_x = NegMulAdd(x_factor, in_y, in_x); | 486 | 1.90k | const auto out_b = NegMulAdd(b_factor, in_y, in_b); | 487 | 1.90k | Store(out_x, d, coeffs_in + k); | 488 | 1.90k | Store(out_b, d, coeffs_in + 2 * size + k); | 489 | 1.90k | } | 490 | | | 491 | | // Quantize X and B channels and set DC. | 492 | 476 | for (size_t c : {0, 2}) { | 493 | 476 | float thres[4] = {0.58f, 0.62f, 0.62f, 0.62f}; | 494 | 476 | QuantizeBlockAC(enc_state->shared.quantizer, error_diffusion, c, | 495 | 476 | c == 0 ? 
enc_state->x_qm_multiplier | 496 | 476 | : enc_state->b_qm_multiplier, | 497 | 476 | acs.Strategy(), xblocks, yblocks, &thres[0], | 498 | 476 | coeffs_in + c * size, &quant_ac, | 499 | 476 | quantized + c * size); | 500 | 476 | DCFromLowestFrequencies(acs.Strategy(), coeffs_in + c * size, | 501 | 476 | dc_rows[c] + bx, dc_stride); | 502 | 476 | } | 503 | 238 | row_quant_ac[bx] = quant_ac; | 504 | 952 | for (size_t c = 0; c < 3; c++) { | 505 | 714 | enc_state->progressive_splitter.SplitACCoefficients( | 506 | 714 | quantized + c * size, acs, bx, by, coeffs[c]); | 507 | 1.42k | for (size_t p = 0; p < num_passes; p++) { | 508 | 714 | coeffs[c][p] += size; | 509 | 714 | } | 510 | 714 | } | 511 | 238 | } | 512 | 238 | } | 513 | 238 | } | 514 | 238 | } | 515 | 0 | return true; | 516 | 238 | } |
Unexecuted instantiation: jxl::N_SSE2::ComputeCoefficients(unsigned long, jxl::PassesEncoderState*, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::Image3<float>*) |
517 | | |
518 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
519 | | } // namespace HWY_NAMESPACE |
520 | | } // namespace jxl |
521 | | HWY_AFTER_NAMESPACE(); |
522 | | |
523 | | #if HWY_ONCE |
524 | | namespace jxl { |
525 | | HWY_EXPORT(ComputeCoefficients); |
// Public entry point: runtime dispatcher that forwards to the best
// per-target SIMD instantiation (SSE2/SSE4/AVX2/...) of ComputeCoefficients
// selected by Highway for the host CPU. All real work happens in the
// HWY_NAMESPACE implementation above; this wrapper adds no logic of its own.
526 | | Status ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state, |
527 | | const Image3F& opsin, const Rect& rect, |
528 | 238 | Image3F* dc) { |
529 | 238 | return HWY_DYNAMIC_DISPATCH(ComputeCoefficients)(group_idx, enc_state, opsin, |
530 | 238 | rect, dc); |
531 | 238 | } |
532 | | |
533 | | Status EncodeGroupTokenizedCoefficients(size_t group_idx, size_t pass_idx, |
534 | | size_t histogram_idx, |
535 | | const PassesEncoderState& enc_state, |
536 | 238 | BitWriter* writer, AuxOut* aux_out) { |
537 | | // Select which histogram to use among those of the current pass. |
538 | 238 | const size_t num_histograms = enc_state.shared.num_histograms; |
539 | | // num_histograms is 0 only for lossless. |
540 | 238 | JXL_ENSURE(num_histograms == 0 || histogram_idx < num_histograms); |
541 | 238 | size_t histo_selector_bits = CeilLog2Nonzero(num_histograms); |
542 | | |
543 | 238 | if (histo_selector_bits != 0) { |
544 | 0 | JXL_RETURN_IF_ERROR( |
545 | 0 | writer->WithMaxBits(histo_selector_bits, LayerType::Ac, aux_out, [&] { |
546 | 0 | writer->Write(histo_selector_bits, histogram_idx); |
547 | 0 | return true; |
548 | 0 | })); |
549 | 0 | } |
550 | 238 | size_t context_offset = |
551 | 238 | histogram_idx * enc_state.shared.block_ctx_map.NumACContexts(); |
552 | 238 | JXL_RETURN_IF_ERROR(WriteTokens( |
553 | 238 | enc_state.passes[pass_idx].ac_tokens[group_idx], |
554 | 238 | enc_state.passes[pass_idx].codes, enc_state.passes[pass_idx].context_map, |
555 | 238 | context_offset, writer, LayerType::AcTokens, aux_out)); |
556 | | |
557 | 238 | return true; |
558 | 238 | } |
559 | | |
560 | | } // namespace jxl |
561 | | #endif // HWY_ONCE |