/src/libjxl/lib/jxl/enc_adaptive_quantization.cc
Line | Count | Source
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jxl/enc_adaptive_quantization.h" |
7 | | |
8 | | #include <jxl/cms_interface.h> |
9 | | #include <jxl/memory_manager.h> |
10 | | |
11 | | #include <algorithm> |
12 | | #include <cmath> |
13 | | #include <cstddef> |
14 | | #include <cstdint> |
15 | | #include <cstdio> |
16 | | #include <cstdlib> |
17 | | #include <memory> |
18 | | #include <string> |
19 | | #include <utility> |
20 | | #include <vector> |
21 | | |
22 | | #include "lib/jxl/cms/opsin_params.h" |
23 | | #include "lib/jxl/common.h" |
24 | | #include "lib/jxl/frame_header.h" |
25 | | #include "lib/jxl/image_metadata.h" |
26 | | #include "lib/jxl/memory_manager_internal.h" |
27 | | #include "lib/jxl/quantizer.h" |
28 | | #include "lib/jxl/render_pipeline/render_pipeline.h" |
29 | | |
30 | | #undef HWY_TARGET_INCLUDE |
31 | | #define HWY_TARGET_INCLUDE "lib/jxl/enc_adaptive_quantization.cc" |
32 | | #include <hwy/foreach_target.h> |
33 | | #include <hwy/highway.h> |
34 | | |
35 | | #include "lib/jxl/ac_strategy.h" |
36 | | #include "lib/jxl/base/common.h" |
37 | | #include "lib/jxl/base/compiler_specific.h" |
38 | | #include "lib/jxl/base/data_parallel.h" |
39 | | #include "lib/jxl/base/fast_math-inl.h" |
40 | | #include "lib/jxl/base/rect.h" |
41 | | #include "lib/jxl/base/status.h" |
42 | | #include "lib/jxl/butteraugli/butteraugli.h" |
43 | | #include "lib/jxl/convolve.h" |
44 | | #include "lib/jxl/dec_cache.h" |
45 | | #include "lib/jxl/dec_group.h" |
46 | | #include "lib/jxl/enc_aux_out.h" |
47 | | #include "lib/jxl/enc_butteraugli_comparator.h" |
48 | | #include "lib/jxl/enc_cache.h" |
49 | | #include "lib/jxl/enc_debug_image.h" |
50 | | #include "lib/jxl/enc_group.h" |
51 | | #include "lib/jxl/enc_modular.h" |
52 | | #include "lib/jxl/enc_params.h" |
53 | | #include "lib/jxl/enc_transforms-inl.h" |
54 | | #include "lib/jxl/epf.h" |
55 | | #include "lib/jxl/frame_dimensions.h" |
56 | | #include "lib/jxl/image.h" |
57 | | #include "lib/jxl/image_bundle.h" |
58 | | #include "lib/jxl/image_ops.h" |
59 | | #include "lib/jxl/quant_weights.h" |
60 | | |
61 | | // Set JXL_DEBUG_ADAPTIVE_QUANTIZATION to 1 to enable debugging. |
62 | | #ifndef JXL_DEBUG_ADAPTIVE_QUANTIZATION |
63 | 0 | #define JXL_DEBUG_ADAPTIVE_QUANTIZATION 0 |
64 | | #endif |
65 | | |
66 | | HWY_BEFORE_NAMESPACE(); |
67 | | namespace jxl { |
68 | | namespace HWY_NAMESPACE { |
69 | | namespace { |
70 | | |
71 | | // These templates are not found via ADL. |
72 | | using hwy::HWY_NAMESPACE::AbsDiff; |
73 | | using hwy::HWY_NAMESPACE::Add; |
74 | | using hwy::HWY_NAMESPACE::And; |
75 | | using hwy::HWY_NAMESPACE::Gt; |
76 | | using hwy::HWY_NAMESPACE::IfThenElseZero; |
77 | | using hwy::HWY_NAMESPACE::Max; |
78 | | using hwy::HWY_NAMESPACE::Min; |
79 | | using hwy::HWY_NAMESPACE::Rebind; |
80 | | using hwy::HWY_NAMESPACE::Sqrt; |
81 | | using hwy::HWY_NAMESPACE::ZeroIfNegative; |
82 | | |
83 | | // The following functions modulate an exponent (out_val) and return the updated |
84 | | // value. Their descriptor is limited to 8 lanes for 8x8 blocks. |
85 | | |
86 | | // Hack for mask estimation. Eventually replace this code with butteraugli's |
87 | | // masking. |
88 | 398k | float ComputeMaskForAcStrategyUse(const float out_val) { |
89 | 398k | const float kMul = 1.0f; |
90 | 398k | const float kOffset = 0.001f; |
91 | 398k | return kMul / (out_val + kOffset); |
92 | 398k | }
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::ComputeMaskForAcStrategyUse(float) (Count: 398k)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::ComputeMaskForAcStrategyUse(float)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::ComputeMaskForAcStrategyUse(float)
93 | | |
94 | | template <class D, class V> |
95 | 398k | V ComputeMask(const D d, const V out_val) { |
96 | 398k | const auto kBase = Set(d, -0.7647f); |
97 | 398k | const auto kMul4 = Set(d, 9.4708735624378946f); |
98 | 398k | const auto kMul2 = Set(d, 17.35036561631863f); |
99 | 398k | const auto kOffset2 = Set(d, 302.59587815579727f); |
100 | 398k | const auto kMul3 = Set(d, 6.7943250517376494f); |
101 | 398k | const auto kOffset3 = Set(d, 3.7179635626140772f); |
102 | 398k | const auto kOffset4 = Mul(Set(d, 0.25f), kOffset3); |
103 | 398k | const auto kMul0 = Set(d, 0.80061762862741759f); |
104 | 398k | const auto k1 = Set(d, 1.0f); |
105 | | |
106 | | // Avoid division by zero. |
107 | 398k | const auto v1 = Max(Mul(out_val, kMul0), Set(d, 1e-3f)); |
108 | 398k | const auto v2 = Div(k1, Add(v1, kOffset2)); |
109 | 398k | const auto v3 = Div(k1, MulAdd(v1, v1, kOffset3)); |
110 | 398k | const auto v4 = Div(k1, MulAdd(v1, v1, kOffset4)); |
111 | | // TODO(jyrki): |
112 | | // A log or two here could make sense. In butteraugli we have effectively |
113 | | // log(log(x + C)) for this kind of use, as a single log is used in |
114 | | // saturating visual masking and here the modulation values are exponential, |
115 | | // another log would counter that. |
116 | 398k | return Add(kBase, MulAdd(kMul4, v4, MulAdd(kMul2, v2, Mul(kMul3, v3)))); |
117 | 398k | }
enc_adaptive_quantization.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::ComputeMask<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>) (Count: 398k)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::ComputeMask<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::(anonymous namespace)::ComputeMask<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>)
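For readability, here is a scalar sketch of the curve that ComputeMask evaluates per block. It is illustrative only (the encoder runs the SIMD version above); ComputeMaskScalar is a hypothetical name and the constants are copied from the code.

#include <algorithm>

// Scalar restatement of ComputeMask: a sum of three rational terms in
// v1 = max(kMul0 * out_val, 1e-3), added to a negative base value.
float ComputeMaskScalar(const float out_val) {
  const float v1 = std::max(out_val * 0.80061762862741759f, 1e-3f);
  const float v2 = 1.0f / (v1 + 302.59587815579727f);
  const float v3 = 1.0f / (v1 * v1 + 3.7179635626140772f);
  const float v4 = 1.0f / (v1 * v1 + 0.25f * 3.7179635626140772f);
  return -0.7647f + 9.4708735624378946f * v4 + 17.35036561631863f * v2 +
         6.7943250517376494f * v3;
}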
118 | | |
119 | | // mul and mul2 represent a scaling difference between jxl and butteraugli. |
120 | | const float kSGmul = 226.77216153508914f; |
121 | | const float kSGmul2 = 1.0f / 73.377132366608819f; |
122 | | // Includes correction factor for std::log -> log2. |
123 | | const float kSGRetMul = kSGmul2 * 18.6580932135f * kInvLog2e; |
124 | | const float kSGVOffset = 7.7825991679894591f; |
125 | | |
126 | | template <bool invert, typename D, typename V> |
127 | 38.6M | V RatioOfDerivativesOfCubicRootToSimpleGamma(const D d, V v) { |
128 | | // The opsin space in jxl is the cubic root of photons, i.e., v * v * v |
129 | | // is related to the number of photons. |
130 | | // |
131 | | // SimpleGamma(v * v * v) is the psychovisual space in butteraugli. |
132 | | // This ratio allows quantization to move from jxl's opsin space to |
133 | | // butteraugli's log-gamma space. |
134 | 38.6M | float kEpsilon = 1e-2; |
135 | 38.6M | v = ZeroIfNegative(v); |
136 | 38.6M | const auto kNumMul = Set(d, kSGRetMul * 3 * kSGmul); |
137 | 38.6M | const auto kVOffset = Set(d, kSGVOffset * kInvLog2e + kEpsilon); |
138 | 38.6M | const auto kDenMul = Set(d, kInvLog2e * kSGmul); |
139 | | |
140 | 38.6M | const auto v2 = Mul(v, v); |
141 | | |
142 | 38.6M | const auto num = MulAdd(kNumMul, v2, Set(d, kEpsilon)); |
143 | 38.6M | const auto den = MulAdd(Mul(kDenMul, v), v2, kVOffset); |
144 | 38.6M | return invert ? Div(num, den) : Div(den, num); |
145 | 38.6M | }
enc_adaptive_quantization.cc:hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false, hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul>) (Count: 28.7M)
enc_adaptive_quantization.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false, hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>) (Count: 3.54M)
enc_adaptive_quantization.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<true, hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>) (Count: 6.36M)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false, hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false, hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<true, hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 1ul> jxl::N_SSE2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false, hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul> >(hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false, hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<true, hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>)
146 | | |
147 | | template <bool invert = false> |
148 | 28.7M | float RatioOfDerivativesOfCubicRootToSimpleGamma(float v) { |
149 | 28.7M | using DScalar = HWY_CAPPED(float, 1); |
150 | 28.7M | auto vscalar = Load(DScalar(), &v); |
151 | 28.7M | return GetLane( |
152 | 28.7M | RatioOfDerivativesOfCubicRootToSimpleGamma<invert>(DScalar(), vscalar)); |
153 | 28.7M | }
enc_adaptive_quantization.cc:float jxl::N_AVX2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false>(float) (Count: 28.7M)
Unexecuted instantiation: enc_adaptive_quantization.cc:float jxl::N_SSE4::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false>(float)
Unexecuted instantiation: enc_adaptive_quantization.cc:float jxl::N_SSE2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false>(float)
154 | | |
155 | | // TODO(veluca): this function computes an approximation of the derivative of |
156 | | // SimpleGamma with (f(x+eps)-f(x))/eps. Consider two-sided approximation or |
157 | | // exact derivatives. For reference, SimpleGamma was: |
158 | | /* |
159 | | template <typename D, typename V> |
160 | | V SimpleGamma(const D d, V v) { |
161 | | // A simple HDR compatible gamma function. |
162 | | const auto mul = Set(d, kSGmul); |
163 | | const auto kRetMul = Set(d, kSGRetMul); |
164 | | const auto kRetAdd = Set(d, kSGmul2 * -20.2789020414f); |
165 | | const auto kVOffset = Set(d, kSGVOffset); |
166 | | |
167 | | v *= mul; |
168 | | |
169 | | // This should happen rarely, but may lead to a NaN, which is rather |
170 | | // undesirable. Since negative photons don't exist we solve the NaNs by |
171 | | // clamping here. |
172 | | // TODO(veluca): with FastLog2f, this no longer leads to NaNs. |
173 | | v = ZeroIfNegative(v); |
174 | | return kRetMul * FastLog2f(d, v + kVOffset) + kRetAdd; |
175 | | } |
176 | | */ |
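The closed form evaluated by RatioOfDerivativesOfCubicRootToSimpleGamma follows from differentiating the SimpleGamma quoted above at u = v^3. This is a sketch of the algebra behind the num/den expressions in the code, up to the kEpsilon terms added for numerical stability:

\[
\frac{d}{dv}\,\mathrm{SimpleGamma}(v^3)
  = \frac{3\,k_{\mathrm{SGRetMul}}\,k_{\mathrm{SGmul}}\,v^2}{\ln 2\,\bigl(k_{\mathrm{SGmul}}\,v^3 + k_{\mathrm{SGVOffset}}\bigr)}
  = \frac{k_{\mathrm{SGRetMul}}\cdot 3\,k_{\mathrm{SGmul}}\cdot v^2}{k_{\mathrm{InvLog2e}}\,k_{\mathrm{SGmul}}\,v^3 + k_{\mathrm{SGVOffset}}\,k_{\mathrm{InvLog2e}}}
\]

since kInvLog2e = 1/log2(e) = ln 2 and the base-2 log in SimpleGamma contributes the 1/ln 2 factor. With invert = true the function returns this derivative; with invert = false it returns the reciprocal.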
177 | | |
178 | | template <class D, class V> |
179 | | V GammaModulation(const D d, const size_t x, const size_t y, |
180 | | const ImageF& xyb_x, const ImageF& xyb_y, const Rect& rect, |
181 | 398k | const V out_val) { |
182 | 398k | const float kBias = 0.16f; |
183 | 398k | JXL_DASSERT(kBias > jxl::cms::kOpsinAbsorbanceBias[0]); |
184 | 398k | JXL_DASSERT(kBias > jxl::cms::kOpsinAbsorbanceBias[1]); |
185 | 398k | JXL_DASSERT(kBias > jxl::cms::kOpsinAbsorbanceBias[2]); |
186 | 398k | auto overall_ratio = Zero(d); |
187 | 398k | auto bias = Set(d, kBias); |
188 | 3.58M | for (size_t dy = 0; dy < 8; ++dy) { |
189 | 3.18M | const float* const JXL_RESTRICT row_in_x = rect.ConstRow(xyb_x, y + dy); |
190 | 3.18M | const float* const JXL_RESTRICT row_in_y = rect.ConstRow(xyb_y, y + dy); |
191 | 6.36M | for (size_t dx = 0; dx < 8; dx += Lanes(d)) { |
192 | 3.18M | const auto iny = Add(Load(d, row_in_y + x + dx), bias); |
193 | 3.18M | const auto inx = Load(d, row_in_x + x + dx); |
194 | | |
195 | 3.18M | const auto r = Sub(iny, inx); |
196 | 3.18M | const auto ratio_r = |
197 | 3.18M | RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/true>(d, r); |
198 | 3.18M | overall_ratio = Add(overall_ratio, ratio_r); |
199 | | |
200 | 3.18M | const auto g = Add(iny, inx); |
201 | 3.18M | const auto ratio_g = |
202 | 3.18M | RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/true>(d, g); |
203 | 3.18M | overall_ratio = Add(overall_ratio, ratio_g); |
204 | 3.18M | } |
205 | 3.18M | } |
206 | 398k | overall_ratio = Mul(SumOfLanes(d, overall_ratio), Set(d, 0.5f / 64)); |
207 | | // ideally -1.0, but likely optimal correction adds some entropy, so slightly |
208 | | // less than that. |
209 | 398k | const auto kGamma = Set(d, 0.1005613337192697f); |
210 | 398k | return MulAdd(kGamma, FastLog2f(d, overall_ratio), out_val); |
211 | 398k | }
enc_adaptive_quantization.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::GammaModulation<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_AVX2::Vec256<float>) (Count: 398k)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::GammaModulation<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::(anonymous namespace)::GammaModulation<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_SSE2::Vec128<float, 4ul>)
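Restating the accumulation above (a paraphrase of the code, not new behavior): each 8x8 block contributes 64 pixels, and each pixel yields one ratio sample for r = iny - inx and one for g = iny + inx, so the factor 0.5f / 64 = 1/128 turns the lane-summed total into a mean before the log:

\[
\mathrm{out\_val} \mathrel{+}= k_{\mathrm{Gamma}} \cdot \log_2\!\Bigl(\tfrac{1}{128} \sum_{p \in 8\times 8} \bigl(\mathrm{ratio}_r(p) + \mathrm{ratio}_g(p)\bigr)\Bigr)
\]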
212 | | |
213 | | // Change precision in 8x8 blocks that have significant amounts of blue |
214 | | // content (but are not close to solid blue). |
215 | | // This is based on the idea that M and L cone activations saturate the |
216 | | // S (blue) receptors, and the S reception becomes more important when |
217 | | // both M and L levels are low. In that case M and L receptors may be |
218 | | // observing S-spectra instead and viewing them with higher spatial |
219 | | // accuracy, justifying spending more bits here. |
220 | | template <class D, class V> |
221 | | V BlueModulation(const D d, const size_t x, const size_t y, |
222 | | const ImageF& planex, const ImageF& planey, |
223 | 398k | const ImageF& planeb, const Rect& rect, const V out_val) { |
224 | 398k | auto sum = Zero(d); |
225 | 398k | static const float kLimit = 0.010474084867598155; |
226 | 398k | static const float kOffset = 0.0031994768654636393; |
227 | 3.58M | for (size_t dy = 0; dy < 8; ++dy) { |
228 | 3.18M | const float* JXL_RESTRICT row_in_x = rect.ConstRow(planex, y + dy) + x; |
229 | 3.18M | const float* JXL_RESTRICT row_in_y = rect.ConstRow(planey, y + dy) + x; |
230 | 3.18M | const float* JXL_RESTRICT row_in_b = rect.ConstRow(planeb, y + dy) + x; |
231 | 6.36M | for (size_t dx = 0; dx < 8; dx += Lanes(d)) { |
232 | 3.18M | const auto p_x = Load(d, row_in_x + dx); |
233 | 3.18M | const auto p_b = Load(d, row_in_b + dx); |
234 | 3.18M | const auto p_y_raw = Add(Load(d, row_in_y + dx), Set(d, kOffset)); |
235 | 3.18M | const auto p_y_effective = Add(p_y_raw, Abs(p_x)); |
236 | 3.18M | sum = Add(sum, |
237 | 3.18M | IfThenElseZero(Gt(p_b, p_y_effective), |
238 | 3.18M | Min(Sub(p_b, p_y_effective), Set(d, kLimit)))); |
239 | 3.18M | } |
240 | 3.18M | } |
241 | 398k | static const float kMul = 0.90590804735610064; |
242 | 398k | sum = SumOfLanes(d, sum); |
243 | 398k | float scalar_sum = GetLane(sum); |
244 | | // If it is all blue, don't boost the quantization. |
245 | | // All blue likely means low frequency blue. Let's not make the most |
246 | | // perfect sky ever. |
247 | 398k | if (scalar_sum >= 32 * kLimit) { |
248 | 82.2k | scalar_sum = 64 * kLimit - scalar_sum; |
249 | 82.2k | } |
250 | 398k | static const float kMaxLimit = 15.463398341612438; |
251 | 398k | if (scalar_sum >= kMaxLimit * kLimit) { |
252 | 60.3k | scalar_sum = kMaxLimit * kLimit; |
253 | 60.3k | } |
254 | 398k | scalar_sum *= kMul; |
255 | 398k | return Add(Set(d, scalar_sum), out_val); |
256 | 398k | }
enc_adaptive_quantization.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::BlueModulation<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_AVX2::Vec256<float>) (Count: 398k)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::BlueModulation<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::(anonymous namespace)::BlueModulation<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_SSE2::Vec128<float, 4ul>)
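Restating the per-pixel accumulation above in closed form (a paraphrase, with y_eff denoting the effective luma used in the comparison):

\[
\mathrm{sum} = \sum_{p \in 8\times 8} \min\!\bigl(\max(b_p - y^{\mathrm{eff}}_p,\ 0),\ k_{\mathrm{Limit}}\bigr),
\qquad y^{\mathrm{eff}}_p = y_p + k_{\mathrm{Offset}} + \lvert x_p \rvert
\]

The scalar post-processing then reflects the sum as 64*kLimit - sum once it reaches half of its maximum (32*kLimit), caps it at kMaxLimit*kLimit, scales it by kMul, and adds the result to the exponent.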
257 | | |
258 | | // Change precision in 8x8 blocks that have high frequency content. |
259 | | template <class D, class V> |
260 | | V HfModulation(const D d, const size_t x, const size_t y, const ImageF& xyb_y, |
261 | 398k | const Rect& rect, const V out_val) { |
262 | | // Zero out the invalid differences for the rightmost value per row. |
263 | 398k | const Rebind<uint32_t, D> du; |
264 | 398k | HWY_ALIGN constexpr uint32_t kMaskRight[kBlockDim] = {~0u, ~0u, ~0u, ~0u, |
265 | 398k | ~0u, ~0u, ~0u, 0}; |
266 | | |
267 | | // Sums of deltas of y and x components between (approximate) |
268 | | // 4-connected pixels. |
269 | 398k | auto sum_y = Zero(d); |
270 | 398k | static const float valmin_y = 0.0206; |
271 | 398k | auto valminv_y = Set(d, valmin_y); |
272 | 3.58M | for (size_t dy = 0; dy < 8; ++dy) { |
273 | 3.18M | const float* JXL_RESTRICT row_in_y = rect.ConstRow(xyb_y, y + dy) + x; |
274 | 3.18M | const float* JXL_RESTRICT row_in_y_next = |
275 | 3.18M | dy == 7 ? row_in_y : rect.ConstRow(xyb_y, y + dy + 1) + x; |
276 | | |
277 | | // In SCALAR, there is no guarantee of having extra row padding. |
278 | | // Hence, we need to ensure we don't access pixels outside the row itself. |
279 | | // In SIMD modes, however, rows are padded, so it's safe to access one |
280 | | // garbage value after the row. The vector then gets masked with kMaskRight |
281 | | // to remove the influence of that value. |
282 | 3.18M | #if HWY_TARGET != HWY_SCALAR |
283 | 6.36M | for (size_t dx = 0; dx < 8; dx += Lanes(d)) { |
284 | | #else |
285 | | for (size_t dx = 0; dx < 7; dx += Lanes(d)) { |
286 | | #endif |
287 | 3.18M | const auto mask = BitCast(d, Load(du, kMaskRight + dx)); |
288 | 3.18M | { |
289 | 3.18M | const auto p_y = Load(d, row_in_y + dx); |
290 | 3.18M | const auto pr_y = LoadU(d, row_in_y + dx + 1); |
291 | 3.18M | sum_y = Add(sum_y, And(mask, Min(valminv_y, AbsDiff(p_y, pr_y)))); |
292 | 3.18M | const auto pd_y = Load(d, row_in_y_next + dx); |
293 | 3.18M | sum_y = Add(sum_y, Min(valminv_y, AbsDiff(p_y, pd_y))); |
294 | 3.18M | } |
295 | 3.18M | } |
296 | | #if HWY_TARGET == HWY_SCALAR |
297 | | const auto p_y = Load(d, row_in_y + 7); |
298 | | const auto pd_y = Load(d, row_in_y_next + 7); |
299 | | sum_y = Add(sum_y, Min(valminv_y, AbsDiff(p_y, pd_y))); |
300 | | #endif |
301 | 3.18M | } |
302 | 398k | static const float kMul_y = -0.38; |
303 | 398k | sum_y = SumOfLanes(d, sum_y); |
304 | | |
305 | 398k | float scalar_sum_y = GetLane(sum_y); |
306 | 398k | scalar_sum_y *= kMul_y; |
307 | | |
308 | | // higher value -> more bpp |
309 | 398k | float kOffset = 0.42; |
310 | 398k | scalar_sum_y += kOffset; |
311 | | |
312 | 398k | return Add(Set(d, scalar_sum_y), out_val); |
313 | 398k | }
enc_adaptive_quantization.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::HfModulation<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_AVX2::Vec256<float>) (Count: 398k)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::HfModulation<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::(anonymous namespace)::HfModulation<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_SSE2::Vec128<float, 4ul>)
314 | | |
315 | | void PerBlockModulations(const float butteraugli_target, const ImageF& xyb_x, |
316 | | const ImageF& xyb_y, const ImageF& xyb_b, |
317 | | const Rect& rect_in, const float scale, |
318 | 7.22k | const Rect& rect_out, ImageF* out) { |
319 | 7.22k | float base_level = 0.48f * scale; |
320 | 7.22k | float kDampenRampStart = 2.0f; |
321 | 7.22k | float kDampenRampEnd = 14.0f; |
322 | 7.22k | float dampen = 1.0f; |
323 | 7.22k | if (butteraugli_target >= kDampenRampStart) { |
324 | 0 | dampen = 1.0f - ((butteraugli_target - kDampenRampStart) / |
325 | 0 | (kDampenRampEnd - kDampenRampStart)); |
326 | 0 | if (dampen < 0) { |
327 | 0 | dampen = 0; |
328 | 0 | } |
329 | 0 | } |
330 | 7.22k | const float mul = scale * dampen; |
331 | 7.22k | const float add = (1.0f - dampen) * base_level; |
332 | 60.6k | for (size_t iy = rect_out.y0(); iy < rect_out.y1(); iy++) { |
333 | 53.4k | const size_t y = iy * 8; |
334 | 53.4k | float* const JXL_RESTRICT row_out = out->Row(iy); |
335 | 53.4k | const HWY_CAPPED(float, kBlockDim) df; |
336 | 451k | for (size_t ix = rect_out.x0(); ix < rect_out.x1(); ix++) { |
337 | 398k | size_t x = ix * 8; |
338 | 398k | auto mask_val = ComputeMask(df, Set(df, row_out[ix])); |
339 | 398k | mask_val = GammaModulation(df, x, y, xyb_x, xyb_y, rect_in, mask_val); |
340 | 398k | auto out_val = HfModulation(df, x, y, xyb_y, rect_in, mask_val); |
341 | 398k | out_val = Min(out_val, BlueModulation(df, x, y, xyb_x, xyb_y, xyb_b, |
342 | 398k | rect_in, mask_val)); |
343 | | // We want multiplicative quantization field, so everything |
344 | | // until this point has been modulating the exponent. |
345 | 398k | row_out[ix] = FastPow2f(GetLane(out_val) * 1.442695041f) * mul + add; |
346 | 398k | } |
347 | 53.4k | } |
348 | 7.22k | }
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::PerBlockModulations(float, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, float, jxl::RectT<unsigned long> const&, jxl::Plane<float>*) (Count: 7.22k)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::PerBlockModulations(float, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, float, jxl::RectT<unsigned long> const&, jxl::Plane<float>*)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::PerBlockModulations(float, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, float, jxl::RectT<unsigned long> const&, jxl::Plane<float>*)
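On the final line of the inner loop above, 1.442695041 is approximately log2(e), so (assuming FastPow2f approximates 2^x) the accumulated exponent is mapped through a natural exponential before the dampening affine transform:

\[
\mathrm{row\_out}[ix] = 2^{\log_2(e)\,\cdot\,\mathrm{out\_val}} \cdot \mathrm{mul} + \mathrm{add}
  = e^{\mathrm{out\_val}} \cdot \mathrm{mul} + \mathrm{add}
\]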
349 | | |
350 | | template <typename D, typename V> |
351 | 6.84M | V MaskingSqrt(const D d, V v) { |
352 | 6.84M | static const float kLogOffset = 27.505837037000106f; |
353 | 6.84M | static const float kMul = 211.66567973503678f; |
354 | 6.84M | const auto mul_v = Set(d, kMul * 1e8); |
355 | 6.84M | const auto offset_v = Set(d, kLogOffset); |
356 | 6.84M | return Mul(Set(d, 0.25f), Sqrt(MulAdd(v, Sqrt(mul_v), offset_v))); |
357 | 6.84M | }
enc_adaptive_quantization.cc:hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::(anonymous namespace)::MaskingSqrt<hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul>) (Count: 3.29M)
enc_adaptive_quantization.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::MaskingSqrt<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>) (Count: 3.54M)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::(anonymous namespace)::MaskingSqrt<hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::MaskingSqrt<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 1ul> jxl::N_SSE2::(anonymous namespace)::MaskingSqrt<hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul> >(hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::(anonymous namespace)::MaskingSqrt<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>)
358 | | |
359 | 3.29M | float MaskingSqrt(const float v) { |
360 | 3.29M | using DScalar = HWY_CAPPED(float, 1); |
361 | 3.29M | auto vscalar = Load(DScalar(), &v); |
362 | 3.29M | return GetLane(MaskingSqrt(DScalar(), vscalar)); |
363 | 3.29M | }
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::MaskingSqrt(float) (Count: 3.29M)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::MaskingSqrt(float)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::MaskingSqrt(float)
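Written out, both MaskingSqrt overloads evaluate the same closed form (with kMul scaled by 1e8 as in the code):

\[
\mathrm{MaskingSqrt}(v) = 0.25 \cdot \sqrt{\,v \cdot \sqrt{k_{\mathrm{Mul}} \cdot 10^{8}} + k_{\mathrm{LogOffset}}\,}
\]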
364 | | |
365 | | inline void StoreMin4(const float v, float& min0, float& min1, float& min2, |
366 | 7.96M | float& min3) { |
367 | 7.96M | if (v < min3) { |
368 | 4.07M | if (v < min0) { |
369 | 923k | min3 = min2; |
370 | 923k | min2 = min1; |
371 | 923k | min1 = min0; |
372 | 923k | min0 = v; |
373 | 3.15M | } else if (v < min1) { |
374 | 1.02M | min3 = min2; |
375 | 1.02M | min2 = min1; |
376 | 1.02M | min1 = v; |
377 | 2.12M | } else if (v < min2) { |
378 | 1.06M | min3 = min2; |
379 | 1.06M | min2 = v; |
380 | 1.06M | } else { |
381 | 1.06M | min3 = v; |
382 | 1.06M | } |
383 | 4.07M | } |
384 | 7.96M | }
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::StoreMin4(float, float&, float&, float&, float&) (Count: 7.96M)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::StoreMin4(float, float&, float&, float&, float&)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::StoreMin4(float, float&, float&, float&, float&)
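To make the helper's contract concrete, a small illustrative driver; FourSmallestOf9 is a hypothetical name, mirroring how FuzzyErosion below seeds min[] with a sorted quadruple and then streams in the remaining neighbours:

#include <utility>  // std::swap

// Collects the four smallest of nine values, in ascending order, using
// StoreMin4 for the last five insertions. min0..min3 must start sorted.
void FourSmallestOf9(const float v[9], float out[4]) {
  float min0 = v[0], min1 = v[1], min2 = v[2], min3 = v[3];
  if (min0 > min1) std::swap(min0, min1);
  if (min0 > min2) std::swap(min0, min2);
  if (min0 > min3) std::swap(min0, min3);
  if (min1 > min2) std::swap(min1, min2);
  if (min1 > min3) std::swap(min1, min3);
  if (min2 > min3) std::swap(min2, min3);
  for (int i = 4; i < 9; ++i) StoreMin4(v[i], min0, min1, min2, min3);
  out[0] = min0; out[1] = min1; out[2] = min2; out[3] = min3;
}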
385 | | |
386 | | // Look for smooth areas near the area of degradation. |
387 | | // If the areas are generally smooth, don't do masking. |
388 | | // Output is downsampled 2x. |
389 | | Status FuzzyErosion(const float butteraugli_target, const Rect& from_rect, |
390 | 7.22k | const ImageF& from, const Rect& to_rect, ImageF* to) { |
391 | 7.22k | const size_t xsize = from.xsize(); |
392 | 7.22k | const size_t ysize = from.ysize(); |
393 | 7.22k | constexpr int kStep = 1; |
394 | 7.22k | static_assert(kStep == 1, "Step must be 1"); |
395 | 7.22k | JXL_ENSURE(to_rect.xsize() * 2 == from_rect.xsize()); |
396 | 7.22k | JXL_ENSURE(to_rect.ysize() * 2 == from_rect.ysize()); |
397 | 7.22k | static const float kMulBase[4] = { 0.125, 0.1, 0.09, 0.06 }; |
398 | 7.22k | static const float kMulAdd[4] = { 0.0, -0.1, -0.09, -0.06 }; |
399 | 7.22k | float mul = 0.0; |
400 | 7.22k | if (butteraugli_target < 2.0f) { |
401 | 7.22k | mul = (2.0f - butteraugli_target) * (1.0f / 2.0f); |
402 | 7.22k | } |
403 | 7.22k | float kMul[4] = { 0 }; |
404 | 7.22k | float norm_sum = 0.0; |
405 | 36.1k | for (size_t ii = 0; ii < 4; ++ii) { |
406 | 28.8k | kMul[ii] = kMulBase[ii] + mul * kMulAdd[ii]; |
407 | 28.8k | norm_sum += kMul[ii]; |
408 | 28.8k | } |
409 | 7.22k | static const float kTotal = 0.29959705784054957; |
410 | 36.1k | for (size_t ii = 0; ii < 4; ++ii) { |
411 | 28.8k | kMul[ii] *= kTotal / norm_sum; |
412 | 28.8k | } |
413 | 114k | for (size_t fy = 0; fy < from_rect.ysize(); ++fy) { |
414 | 106k | size_t y = fy + from_rect.y0(); |
415 | 106k | size_t ym1 = y >= kStep ? y - kStep : y; |
416 | 106k | size_t yp1 = y + kStep < ysize ? y + kStep : y; |
417 | 106k | const float* rowt = from.Row(ym1); |
418 | 106k | const float* row = from.Row(y); |
419 | 106k | const float* rowb = from.Row(yp1); |
420 | 106k | float* row_out = to_rect.Row(to, fy / 2); |
421 | 1.69M | for (size_t fx = 0; fx < from_rect.xsize(); ++fx) { |
422 | 1.59M | size_t x = fx + from_rect.x0(); |
423 | 1.59M | size_t xm1 = x >= kStep ? x - kStep : x; |
424 | 1.59M | size_t xp1 = x + kStep < xsize ? x + kStep : x; |
425 | 1.59M | float min[4] = { row[x], row[xm1], row[xp1], rowt[xm1] }; |
426 | | // Sort the first four values. |
427 | 1.59M | if (min[0] > min[1]) std::swap(min[0], min[1]); |
428 | 1.59M | if (min[0] > min[2]) std::swap(min[0], min[2]); |
429 | 1.59M | if (min[0] > min[3]) std::swap(min[0], min[3]); |
430 | 1.59M | if (min[1] > min[2]) std::swap(min[1], min[2]); |
431 | 1.59M | if (min[1] > min[3]) std::swap(min[1], min[3]); |
432 | 1.59M | if (min[2] > min[3]) std::swap(min[2], min[3]); |
433 | | // The remaining five values of a 3x3 neighbourhood. |
434 | 1.59M | StoreMin4(rowt[x], min[0], min[1], min[2], min[3]); |
435 | 1.59M | StoreMin4(rowt[xp1], min[0], min[1], min[2], min[3]); |
436 | 1.59M | StoreMin4(rowb[xm1], min[0], min[1], min[2], min[3]); |
437 | 1.59M | StoreMin4(rowb[x], min[0], min[1], min[2], min[3]); |
438 | 1.59M | StoreMin4(rowb[xp1], min[0], min[1], min[2], min[3]); |
439 | | |
440 | 1.59M | float v = kMul[0] * min[0] + kMul[1] * min[1] + kMul[2] * min[2] + kMul[3] * min[3]; |
441 | 1.59M | if (fx % 2 == 0 && fy % 2 == 0) { |
442 | 398k | row_out[fx / 2] = v; |
443 | 1.19M | } else { |
444 | 1.19M | row_out[fx / 2] += v; |
445 | 1.19M | } |
446 | 1.59M | } |
447 | 106k | } |
448 | 7.22k | return true; |
449 | 7.22k | }
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::FuzzyErosion(float, jxl::RectT<unsigned long> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::Plane<float>*) (Count: 7.22k)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::FuzzyErosion(float, jxl::RectT<unsigned long> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::Plane<float>*)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::FuzzyErosion(float, jxl::RectT<unsigned long> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::Plane<float>*)
450 | | |
451 | | struct AdaptiveQuantizationImpl { |
452 | 186 | Status PrepareBuffers(JxlMemoryManager* memory_manager, size_t num_threads) { |
453 | 186 | JXL_ASSIGN_OR_RETURN( |
454 | 186 | diff_buffer, |
455 | 186 | ImageF::Create(memory_manager, kEncTileDim + 8, num_threads)); |
456 | 372 | for (size_t i = pre_erosion.size(); i < num_threads; i++) { |
457 | 186 | JXL_ASSIGN_OR_RETURN( |
458 | 186 | ImageF tmp, |
459 | 186 | ImageF::Create(memory_manager, kEncTileDimInBlocks * 2 + 2, |
460 | 186 | kEncTileDimInBlocks * 2 + 2)); |
461 | 186 | pre_erosion.emplace_back(std::move(tmp)); |
462 | 186 | } |
463 | 186 | return true; |
464 | 186 | }
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::AdaptiveQuantizationImpl::PrepareBuffers(JxlMemoryManagerStruct*, unsigned long) (Count: 186)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::AdaptiveQuantizationImpl::PrepareBuffers(JxlMemoryManagerStruct*, unsigned long)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::AdaptiveQuantizationImpl::PrepareBuffers(JxlMemoryManagerStruct*, unsigned long)
465 | | |
466 | | Status ComputeTile(float butteraugli_target, float scale, const Image3F& xyb, |
467 | | const Rect& rect_in, const Rect& rect_out, |
468 | 7.22k | const int thread, ImageF* mask, ImageF* mask1x1) { |
469 | 7.22k | JXL_ENSURE(rect_in.x0() % kBlockDim == 0); |
470 | 7.22k | JXL_ENSURE(rect_in.y0() % kBlockDim == 0); |
471 | 7.22k | const size_t xsize = xyb.xsize(); |
472 | 7.22k | const size_t ysize = xyb.ysize(); |
473 | | |
474 | | // The XYB gamma is 3.0 to be able to decode faster with two muls. |
475 | | // Butteraugli's gamma is matching the gamma of human eye, around 2.6. |
476 | | // We approximate the gamma difference by adding one cubic root into |
477 | | // the adaptive quantization. This gives us a total gamma of 2.6666 |
478 | | // for quantization uses. |
479 | 7.22k | const float match_gamma_offset = 0.019; |
480 | | |
481 | 7.22k | const HWY_FULL(float) df; |
482 | | |
483 | 7.22k | size_t y_start_1x1 = rect_in.y0() + rect_out.y0() * 8; |
484 | 7.22k | size_t y_end_1x1 = y_start_1x1 + rect_out.ysize() * 8; |
485 | | |
486 | 7.22k | size_t x_start_1x1 = rect_in.x0() + rect_out.x0() * 8; |
487 | 7.22k | size_t x_end_1x1 = x_start_1x1 + rect_out.xsize() * 8; |
488 | | |
489 | 7.22k | if (rect_in.x0() != 0 && rect_out.x0() == 0) x_start_1x1 -= 2; |
490 | 7.22k | if (rect_in.x1() < xsize && rect_out.x1() * 8 == rect_in.xsize()) { |
491 | 0 | x_end_1x1 += 2; |
492 | 0 | } |
493 | 7.22k | if (rect_in.y0() != 0 && rect_out.y0() == 0) y_start_1x1 -= 2; |
494 | 7.22k | if (rect_in.y1() < ysize && rect_out.y1() * 8 == rect_in.ysize()) { |
495 | 0 | y_end_1x1 += 2; |
496 | 0 | } |
497 | | |
498 | | // Computes image (padded to multiple of 8x8) of local pixel differences. |
499 | | // Subsample both directions by 4. |
500 | | // 1x1 Laplacian of intensity. |
501 | 434k | for (size_t y = y_start_1x1; y < y_end_1x1; ++y) { |
502 | 427k | const size_t y2 = y + 1 < ysize ? y + 1 : y; |
503 | 427k | const size_t y1 = y > 0 ? y - 1 : y; |
504 | 427k | const float* row_in = xyb.ConstPlaneRow(1, y); |
505 | 427k | const float* row_in1 = xyb.ConstPlaneRow(1, y1); |
506 | 427k | const float* row_in2 = xyb.ConstPlaneRow(1, y2); |
507 | 427k | float* mask1x1_out = mask1x1->Row(y); |
508 | 25.4M | auto scalar_pixel1x1 = [&](size_t x) { |
509 | 25.4M | const size_t x2 = x + 1 < xsize ? x + 1 : x; |
510 | 25.4M | const size_t x1 = x > 0 ? x - 1 : x; |
511 | 25.4M | const float base = |
512 | 25.4M | 0.25f * (row_in2[x] + row_in1[x] + row_in[x1] + row_in[x2]); |
513 | 25.4M | const float gammac = RatioOfDerivativesOfCubicRootToSimpleGamma( |
514 | 25.4M | row_in[x] + match_gamma_offset); |
515 | 25.4M | float diff = std::abs(gammac * (row_in[x] - base)); |
516 | 25.4M | static const double kScaler = 1.0; |
517 | 25.4M | diff *= kScaler; |
518 | 25.4M | diff = std::log1p(diff); |
519 | 25.4M | static const float kMul = 1.0; |
520 | 25.4M | static const float kOffset = 0.01; |
521 | 25.4M | mask1x1_out[x] = kMul / (diff + kOffset); |
522 | 25.4M | };
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::RectT<unsigned long> const&, int, jxl::Plane<float>*, jxl::Plane<float>*)::{lambda(unsigned long)#1}::operator()(unsigned long) const (Count: 25.4M)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::RectT<unsigned long> const&, int, jxl::Plane<float>*, jxl::Plane<float>*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::RectT<unsigned long> const&, int, jxl::Plane<float>*, jxl::Plane<float>*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
523 | 25.9M | for (size_t x = x_start_1x1; x < x_end_1x1; ++x) { |
524 | 25.4M | scalar_pixel1x1(x); |
525 | 25.4M | } |
526 | 427k | } |
527 | | |
528 | 7.22k | size_t y_start = rect_in.y0() + rect_out.y0() * 8; |
529 | 7.22k | size_t y_end = y_start + rect_out.ysize() * 8; |
530 | | |
531 | 7.22k | size_t x_start = rect_in.x0() + rect_out.x0() * 8; |
532 | 7.22k | size_t x_end = x_start + rect_out.xsize() * 8; |
533 | | |
534 | 7.22k | if (x_start != 0) x_start -= 4; |
535 | 7.22k | if (x_end != xsize) x_end += 4; |
536 | 7.22k | if (y_start != 0) y_start -= 4; |
537 | 7.22k | if (y_end != ysize) y_end += 4; |
538 | 7.22k | JXL_RETURN_IF_ERROR(pre_erosion[thread].ShrinkTo((x_end - x_start) / 4, |
539 | 7.22k | (y_end - y_start) / 4)); |
540 | | |
541 | 7.22k | static const float limit = 0.2f; |
542 | 483k | for (size_t y = y_start; y < y_end; ++y) { |
543 | 476k | size_t y2 = y + 1 < ysize ? y + 1 : y; |
544 | 476k | size_t y1 = y > 0 ? y - 1 : y; |
545 | | |
546 | 476k | const float* row_in = xyb.ConstPlaneRow(1, y); |
547 | 476k | const float* row_in1 = xyb.ConstPlaneRow(1, y1); |
548 | 476k | const float* row_in2 = xyb.ConstPlaneRow(1, y2); |
549 | 476k | float* JXL_RESTRICT row_out = diff_buffer.Row(thread); |
550 | | |
551 | 3.29M | auto scalar_pixel = [&](size_t x) { |
552 | 3.29M | const size_t x2 = x + 1 < xsize ? x + 1 : x; |
553 | 3.29M | const size_t x1 = x > 0 ? x - 1 : x; |
554 | 3.29M | const float base = |
555 | 3.29M | 0.25f * (row_in2[x] + row_in1[x] + row_in[x1] + row_in[x2]); |
556 | 3.29M | const float gammac = RatioOfDerivativesOfCubicRootToSimpleGamma( |
557 | 3.29M | row_in[x] + match_gamma_offset); |
558 | 3.29M | float diff = gammac * (row_in[x] - base); |
559 | 3.29M | diff *= diff; |
560 | 3.29M | if (diff >= limit) { |
561 | 609k | diff = limit; |
562 | 609k | } |
563 | 3.29M | diff = MaskingSqrt(diff); |
564 | 3.29M | if ((y % 4) != 0) { |
565 | 2.47M | row_out[x - x_start] += diff; |
566 | 2.47M | } else { |
567 | 824k | row_out[x - x_start] = diff; |
568 | 824k | } |
569 | 3.29M | };
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::RectT<unsigned long> const&, int, jxl::Plane<float>*, jxl::Plane<float>*)::{lambda(unsigned long)#2}::operator()(unsigned long) const
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::RectT<unsigned long> const&, int, jxl::Plane<float>*, jxl::Plane<float>*)::{lambda(unsigned long)#2}::operator()(unsigned long) const
570 | | |
571 | 476k | size_t x = x_start; |
572 | | // First pixel of the row. |
573 | 476k | if (x_start == 0) { |
574 | 65.2k | scalar_pixel(x_start); |
575 | 65.2k | ++x; |
576 | 65.2k | } |
577 | | // SIMD |
578 | 476k | const auto match_gamma_offset_v = Set(df, match_gamma_offset); |
579 | 476k | const auto quarter = Set(df, 0.25f); |
580 | 4.02M | for (; x + 1 + Lanes(df) < x_end; x += Lanes(df)) { |
581 | 3.54M | const auto in = LoadU(df, row_in + x); |
582 | 3.54M | const auto in_r = LoadU(df, row_in + x + 1); |
583 | 3.54M | const auto in_l = LoadU(df, row_in + x - 1); |
584 | 3.54M | const auto in_t = LoadU(df, row_in2 + x); |
585 | 3.54M | const auto in_b = LoadU(df, row_in1 + x); |
586 | 3.54M | auto base = Mul(quarter, Add(Add(in_r, in_l), Add(in_t, in_b))); |
587 | 3.54M | auto gammacv = |
588 | 3.54M | RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/false>( |
589 | 3.54M | df, Add(in, match_gamma_offset_v)); |
590 | 3.54M | auto diff = Mul(gammacv, Sub(in, base)); |
591 | 3.54M | diff = Mul(diff, diff); |
592 | 3.54M | diff = Min(diff, Set(df, limit)); |
593 | 3.54M | diff = MaskingSqrt(df, diff); |
594 | 3.54M | if ((y & 3) != 0) { |
595 | 2.66M | diff = Add(diff, LoadU(df, row_out + x - x_start)); |
596 | 2.66M | } |
597 | 3.54M | StoreU(diff, df, row_out + x - x_start); |
598 | 3.54M | } |
599 | | // Scalar |
600 | 3.70M | for (; x < x_end; ++x) { |
601 | 3.23M | scalar_pixel(x); |
602 | 3.23M | } |
603 | 476k | if (y % 4 == 3) { |
604 | 119k | float* row_d_out = pre_erosion[thread].Row((y - y_start) / 4); |
605 | 2.09M | for (size_t qx = 0; qx < (x_end - x_start) / 4; qx++) { |
606 | 1.98M | row_d_out[qx] = (row_out[qx * 4] + row_out[qx * 4 + 1] + |
607 | 1.98M | row_out[qx * 4 + 2] + row_out[qx * 4 + 3]) * |
608 | 1.98M | 0.25f; |
609 | 1.98M | } |
610 | 119k | } |
611 | 476k | } |
612 | 7.22k | JXL_ENSURE(x_start % (kBlockDim / 2) == 0); |
613 | 7.22k | JXL_ENSURE(y_start % (kBlockDim / 2) == 0); |
614 | 7.22k | Rect from_rect(x_start % 8 == 0 ? 0 : 1, y_start % 8 == 0 ? 0 : 1, |
615 | 7.22k | rect_out.xsize() * 2, rect_out.ysize() * 2); |
616 | 7.22k | JXL_RETURN_IF_ERROR(FuzzyErosion(butteraugli_target, from_rect, |
617 | 7.22k | pre_erosion[thread], rect_out, &aq_map)); |
618 | 60.6k | for (size_t y = 0; y < rect_out.ysize(); ++y) { |
619 | 53.4k | const float* aq_map_row = rect_out.ConstRow(aq_map, y); |
620 | 53.4k | float* mask_row = rect_out.Row(mask, y); |
621 | 451k | for (size_t x = 0; x < rect_out.xsize(); ++x) { |
622 | 398k | mask_row[x] = ComputeMaskForAcStrategyUse(aq_map_row[x]); |
623 | 398k | } |
624 | 53.4k | } |
625 | 7.22k | PerBlockModulations(butteraugli_target, xyb.Plane(0), xyb.Plane(1), |
626 | 7.22k | xyb.Plane(2), rect_in, scale, rect_out, &aq_map); |
627 | 7.22k | return true; |
628 | 7.22k | }
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::RectT<unsigned long> const&, int, jxl::Plane<float>*, jxl::Plane<float>*)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::RectT<unsigned long> const&, int, jxl::Plane<float>*, jxl::Plane<float>*)
629 | | std::vector<ImageF> pre_erosion; |
630 | | ImageF aq_map; |
631 | | ImageF diff_buffer; |
632 | | }; |
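
Editor's note: a minimal scalar sketch of the two per-pixel terms computed in ComputeTile above; it is not part of the libjxl source. GammaRatioStandIn and MaskingSqrtStandIn are hypothetical stand-ins for the file's RatioOfDerivativesOfCubicRootToSimpleGamma and MaskingSqrt helpers, and the real scalar_pixel loop is additionally vectorized with Highway and accumulated over 4x4 pixel cells into pre_erosion.

#include <algorithm>
#include <cmath>

// Hypothetical stand-ins for helpers defined earlier in this file.
static float GammaRatioStandIn(float v) {
  return 1.0f / (3.0f * std::cbrt(v * v) + 1e-6f);
}
static float MaskingSqrtStandIn(float v) { return std::sqrt(v); }

// mask1x1 term (scalar_pixel1x1 above): reciprocal of the log-compressed
// absolute Laplacian response; Blur1x1Masking later blurs this image.
float Mask1x1Sketch(float center, float neighbor_avg) {
  const float kGammaOffset = 0.019f;  // match_gamma_offset above
  const float kOffset = 0.01f;
  const float diff = std::abs(GammaRatioStandIn(center + kGammaOffset) *
                              (center - neighbor_avg));
  return 1.0f / (std::log1p(diff) + kOffset);
}

// pre_erosion term (scalar_pixel above): clamped, square-rooted local
// contrast, later averaged over 4x4 pixel cells and fed to FuzzyErosion.
float MaskingDiffSketch(float center, float neighbor_avg) {
  const float kLimit = 0.2f;  // same clamp as the `limit` constant above
  const float kGammaOffset = 0.019f;
  float diff = GammaRatioStandIn(center + kGammaOffset) *
               (center - neighbor_avg);
  diff = std::min(diff * diff, kLimit);
  return MaskingSqrtStandIn(diff);
}
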
633 | | |
634 | | Status Blur1x1Masking(JxlMemoryManager* memory_manager, ThreadPool* pool, |
635 | 186 | ImageF* mask1x1, const Rect& rect) { |
636 | | // Blur the mask1x1 to obtain the masking image. |
637 | | // Before blurring it contains an image of the absolute value of the
638 | | // Laplacian of the intensity channel. |
639 | 186 | static const float kFilterMask1x1[5] = { |
640 | 186 | 0.364911248, 0.05, 0.1688888021, 0.221069183, 0.306563504, |
641 | 186 | }; |
642 | 186 | double sum = |
643 | 186 | 1.0 + 4 * (kFilterMask1x1[0] + kFilterMask1x1[1] + kFilterMask1x1[2] + |
644 | 186 | kFilterMask1x1[4] + 2 * kFilterMask1x1[3]); |
645 | 186 | if (sum < 1e-5) { |
646 | 0 | sum = 1e-5; |
647 | 0 | } |
648 | 186 | const float normalize = static_cast<float>(1.0 / sum); |
649 | 186 | WeightsSymmetric5 weights = |
650 | 186 | WeightsSymmetric5{{HWY_REP4(normalize)}, |
651 | 186 | {HWY_REP4(normalize * kFilterMask1x1[0])}, |
652 | 186 | {HWY_REP4(normalize * kFilterMask1x1[2])}, |
653 | 186 | {HWY_REP4(normalize * kFilterMask1x1[1])}, |
654 | 186 | {HWY_REP4(normalize * kFilterMask1x1[4])}, |
655 | 186 | {HWY_REP4(normalize * kFilterMask1x1[3])}}; |
656 | 186 | JXL_ASSIGN_OR_RETURN( |
657 | 186 | ImageF temp, ImageF::Create(memory_manager, rect.xsize(), rect.ysize())); |
658 | 186 | JXL_RETURN_IF_ERROR( |
659 | 186 | Symmetric5(*mask1x1, rect, weights, pool, &temp, Rect(temp))); |
660 | 186 | *mask1x1 = std::move(temp); |
661 | 186 | return true; |
662 | 186 | }
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::Blur1x1Masking(JxlMemoryManagerStruct*, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::Blur1x1Masking(JxlMemoryManagerStruct*, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&)
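
Editor's note: a small self-contained check (not part of the libjxl source) of the kernel normalization in Blur1x1Masking above. The sum formula implies a center tap of 1, four copies of each of kFilterMask1x1[0], [1], [2] and [4], and eight copies of kFilterMask1x1[3], consistent with a symmetric 5x5 (25-tap) kernel; dividing by `sum` makes the blur preserve the overall mask energy.

#include <cstdio>

int main() {
  const float k[5] = {0.364911248f, 0.05f, 0.1688888021f, 0.221069183f,
                      0.306563504f};
  double sum = 1.0 + 4 * (k[0] + k[1] + k[2] + k[4] + 2 * k[3]);
  const double normalize = 1.0 / sum;
  // Total kernel mass after normalization; prints 1.000000.
  const double total =
      normalize * (1.0 + 4 * (k[0] + k[1] + k[2] + k[4]) + 8 * k[3]);
  std::printf("normalize = %f, kernel sum = %f\n", normalize, total);
  return 0;
}
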
663 | | |
664 | | StatusOr<ImageF> AdaptiveQuantizationMap(const float butteraugli_target, |
665 | | const Image3F& xyb, const Rect& rect, |
666 | | float scale, ThreadPool* pool, |
667 | 186 | ImageF* mask, ImageF* mask1x1) { |
668 | 186 | JXL_ENSURE(rect.xsize() % kBlockDim == 0); |
669 | 186 | JXL_ENSURE(rect.ysize() % kBlockDim == 0); |
670 | 186 | AdaptiveQuantizationImpl impl; |
671 | 186 | const size_t xsize_blocks = rect.xsize() / kBlockDim; |
672 | 186 | const size_t ysize_blocks = rect.ysize() / kBlockDim; |
673 | 186 | JxlMemoryManager* memory_manager = xyb.memory_manager(); |
674 | 186 | JXL_ASSIGN_OR_RETURN( |
675 | 186 | impl.aq_map, ImageF::Create(memory_manager, xsize_blocks, ysize_blocks)); |
676 | 186 | JXL_ASSIGN_OR_RETURN( |
677 | 186 | *mask, ImageF::Create(memory_manager, xsize_blocks, ysize_blocks)); |
678 | 186 | JXL_ASSIGN_OR_RETURN( |
679 | 186 | *mask1x1, ImageF::Create(memory_manager, xyb.xsize(), xyb.ysize())); |
680 | 186 | const auto prepare = [&](const size_t num_threads) -> Status { |
681 | 186 | JXL_RETURN_IF_ERROR(impl.PrepareBuffers(memory_manager, num_threads)); |
682 | 186 | return true; |
683 | 186 | };
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, float, jxl::ThreadPool*, jxl::Plane<float>*, jxl::Plane<float>*)::$_0::operator()(unsigned long) const
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, float, jxl::ThreadPool*, jxl::Plane<float>*, jxl::Plane<float>*)::$_0::operator()(unsigned long) const
684 | 186 | const auto process_tile = [&](const uint32_t tid, |
685 | 7.22k | const size_t thread) -> Status { |
686 | 7.22k | size_t n_enc_tiles = DivCeil(xsize_blocks, kEncTileDimInBlocks); |
687 | 7.22k | size_t tx = tid % n_enc_tiles; |
688 | 7.22k | size_t ty = tid / n_enc_tiles; |
689 | 7.22k | size_t by0 = ty * kEncTileDimInBlocks; |
690 | 7.22k | size_t by1 = std::min((ty + 1) * kEncTileDimInBlocks, ysize_blocks); |
691 | 7.22k | size_t bx0 = tx * kEncTileDimInBlocks; |
692 | 7.22k | size_t bx1 = std::min((tx + 1) * kEncTileDimInBlocks, xsize_blocks); |
693 | 7.22k | Rect rect_out(bx0, by0, bx1 - bx0, by1 - by0); |
694 | 7.22k | JXL_RETURN_IF_ERROR(impl.ComputeTile(butteraugli_target, scale, xyb, rect, |
695 | 7.22k | rect_out, thread, mask, mask1x1)); |
696 | 7.22k | return true; |
697 | 7.22k | };
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, float, jxl::ThreadPool*, jxl::Plane<float>*, jxl::Plane<float>*)::$_1::operator()(unsigned int, unsigned long) const
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, float, jxl::ThreadPool*, jxl::Plane<float>*, jxl::Plane<float>*)::$_1::operator()(unsigned int, unsigned long) const
698 | 186 | size_t num_tiles = DivCeil(xsize_blocks, kEncTileDimInBlocks) * |
699 | 186 | DivCeil(ysize_blocks, kEncTileDimInBlocks); |
700 | 186 | JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, num_tiles, prepare, process_tile, |
701 | 186 | "AQ DiffPrecompute")); |
702 | | |
703 | 186 | JXL_RETURN_IF_ERROR(Blur1x1Masking(memory_manager, pool, mask1x1, rect)); |
704 | 186 | return std::move(impl).aq_map; |
705 | 186 | }
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, float, jxl::ThreadPool*, jxl::Plane<float>*, jxl::Plane<float>*)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, float, jxl::ThreadPool*, jxl::Plane<float>*, jxl::Plane<float>*)
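
Editor's note: a sketch (not part of the libjxl source) of how the process_tile lambda above turns a linear tile id into the block rectangle it covers; kTileDimInBlocks is a hypothetical stand-in for the real kEncTileDimInBlocks constant defined elsewhere in libjxl.

#include <algorithm>
#include <cstddef>

struct BlockRectSketch { size_t x0, y0, xsize, ysize; };

BlockRectSketch TileToRectSketch(size_t tid, size_t xsize_blocks,
                                 size_t ysize_blocks) {
  const size_t kTileDimInBlocks = 64;  // stand-in for kEncTileDimInBlocks
  // Tiles are laid out row-major; ceil(xsize_blocks / tile_dim) tiles per row.
  const size_t n_tiles_x =
      (xsize_blocks + kTileDimInBlocks - 1) / kTileDimInBlocks;
  const size_t tx = tid % n_tiles_x;
  const size_t ty = tid / n_tiles_x;
  const size_t bx0 = tx * kTileDimInBlocks;
  const size_t by0 = ty * kTileDimInBlocks;
  const size_t bx1 = std::min((tx + 1) * kTileDimInBlocks, xsize_blocks);
  const size_t by1 = std::min((ty + 1) * kTileDimInBlocks, ysize_blocks);
  return {bx0, by0, bx1 - bx0, by1 - by0};  // clipped at the image border
}
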
706 | | |
707 | | } // namespace |
708 | | |
709 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
710 | | } // namespace HWY_NAMESPACE |
711 | | } // namespace jxl |
712 | | HWY_AFTER_NAMESPACE(); |
713 | | |
714 | | #if HWY_ONCE |
715 | | namespace jxl { |
716 | | HWY_EXPORT(AdaptiveQuantizationMap); |
717 | | |
718 | | namespace { |
719 | | |
720 | | // If true, prints the quantization maps at each iteration. |
721 | | constexpr bool FLAGS_dump_quant_state = false; |
722 | | |
723 | | Status DumpHeatmap(const CompressParams& cparams, const AuxOut* aux_out, |
724 | | const std::string& label, const ImageF& image, |
725 | 0 | float good_threshold, float bad_threshold) { |
726 | 0 | if (JXL_DEBUG_ADAPTIVE_QUANTIZATION) { |
727 | 0 | JXL_ASSIGN_OR_RETURN( |
728 | 0 | Image3F heatmap, |
729 | 0 | CreateHeatMapImage(image, good_threshold, bad_threshold)); |
730 | 0 | char filename[200]; |
731 | 0 | snprintf(filename, sizeof(filename), "%s%05d", label.c_str(), |
732 | 0 | aux_out->num_butteraugli_iters); |
733 | 0 | JXL_RETURN_IF_ERROR(DumpImage(cparams, filename, heatmap)); |
734 | 0 | } |
735 | 0 | return true; |
736 | 0 | } |
737 | | |
738 | | Status DumpHeatmaps(const CompressParams& cparams, const AuxOut* aux_out, |
739 | | float butteraugli_target, const ImageF& quant_field, |
740 | 0 | const ImageF& tile_heatmap, const ImageF& bt_diffmap) { |
741 | 0 | if (JXL_DEBUG_ADAPTIVE_QUANTIZATION) { |
742 | 0 | JxlMemoryManager* memory_manager = quant_field.memory_manager(); |
743 | 0 | if (!WantDebugOutput(cparams)) return true; |
744 | 0 | JXL_ASSIGN_OR_RETURN(ImageF inv_qmap, |
745 | 0 | ImageF::Create(memory_manager, quant_field.xsize(), |
746 | 0 | quant_field.ysize())); |
747 | 0 | for (size_t y = 0; y < quant_field.ysize(); ++y) { |
748 | 0 | const float* JXL_RESTRICT row_q = quant_field.ConstRow(y); |
749 | 0 | float* JXL_RESTRICT row_inv_q = inv_qmap.Row(y); |
750 | 0 | for (size_t x = 0; x < quant_field.xsize(); ++x) { |
751 | 0 | row_inv_q[x] = 1.0f / row_q[x]; // never zero |
752 | 0 | } |
753 | 0 | } |
754 | 0 | JXL_RETURN_IF_ERROR(DumpHeatmap(cparams, aux_out, "quant_heatmap", inv_qmap, |
755 | 0 | 4.0f * butteraugli_target, |
756 | 0 | 6.0f * butteraugli_target)); |
757 | 0 | JXL_RETURN_IF_ERROR(DumpHeatmap(cparams, aux_out, "tile_heatmap", |
758 | 0 | tile_heatmap, butteraugli_target, |
759 | 0 | 1.5f * butteraugli_target)); |
760 | 0 | // matches heat maps produced by the command line tool. |
761 | 0 | JXL_RETURN_IF_ERROR(DumpHeatmap(cparams, aux_out, "bt_diffmap", bt_diffmap, |
762 | 0 | ButteraugliFuzzyInverse(1.5), |
763 | 0 | ButteraugliFuzzyInverse(0.5))); |
764 | 0 | } |
765 | 0 | return true; |
766 | 0 | } |
767 | | |
768 | | StatusOr<ImageF> TileDistMap(const ImageF& distmap, int tile_size, int margin, |
769 | 0 | const AcStrategyImage& ac_strategy) { |
770 | 0 | const int tile_xsize = (distmap.xsize() + tile_size - 1) / tile_size; |
771 | 0 | const int tile_ysize = (distmap.ysize() + tile_size - 1) / tile_size; |
772 | 0 | JxlMemoryManager* memory_manager = distmap.memory_manager(); |
773 | 0 | JXL_ASSIGN_OR_RETURN(ImageF tile_distmap, |
774 | 0 | ImageF::Create(memory_manager, tile_xsize, tile_ysize)); |
775 | 0 | size_t distmap_stride = tile_distmap.PixelsPerRow(); |
776 | 0 | for (int tile_y = 0; tile_y < tile_ysize; ++tile_y) { |
777 | 0 | AcStrategyRow ac_strategy_row = ac_strategy.ConstRow(tile_y); |
778 | 0 | float* JXL_RESTRICT dist_row = tile_distmap.Row(tile_y); |
779 | 0 | for (int tile_x = 0; tile_x < tile_xsize; ++tile_x) { |
780 | 0 | AcStrategy acs = ac_strategy_row[tile_x]; |
781 | 0 | if (!acs.IsFirstBlock()) continue; |
782 | 0 | int this_tile_xsize = acs.covered_blocks_x() * tile_size; |
783 | 0 | int this_tile_ysize = acs.covered_blocks_y() * tile_size; |
784 | 0 | int y_begin = std::max<int>(0, tile_size * tile_y - margin); |
785 | 0 | int y_end = std::min<int>(distmap.ysize(), |
786 | 0 | tile_size * tile_y + this_tile_ysize + margin); |
787 | 0 | int x_begin = std::max<int>(0, tile_size * tile_x - margin); |
788 | 0 | int x_end = std::min<int>(distmap.xsize(), |
789 | 0 | tile_size * tile_x + this_tile_xsize + margin); |
790 | 0 | float dist_norm = 0.0; |
791 | 0 | double pixels = 0; |
792 | 0 | for (int y = y_begin; y < y_end; ++y) { |
793 | 0 | float ymul = 1.0; |
794 | 0 | constexpr float kBorderMul = 0.98f; |
795 | 0 | constexpr float kCornerMul = 0.7f; |
796 | 0 | if (margin != 0 && (y == y_begin || y == y_end - 1)) { |
797 | 0 | ymul = kBorderMul; |
798 | 0 | } |
799 | 0 | const float* const JXL_RESTRICT row = distmap.Row(y); |
800 | 0 | for (int x = x_begin; x < x_end; ++x) { |
801 | 0 | float xmul = ymul; |
802 | 0 | if (margin != 0 && (x == x_begin || x == x_end - 1)) { |
803 | 0 | if (xmul == 1.0) { |
804 | 0 | xmul = kBorderMul; |
805 | 0 | } else { |
806 | 0 | xmul = kCornerMul; |
807 | 0 | } |
808 | 0 | } |
809 | 0 | float v = row[x]; |
810 | 0 | v *= v; |
811 | 0 | v *= v; |
812 | 0 | v *= v; |
813 | 0 | v *= v; |
814 | 0 | dist_norm += xmul * v; |
815 | 0 | pixels += xmul; |
816 | 0 | } |
817 | 0 | } |
818 | 0 | if (pixels == 0) pixels = 1; |
819 | | // The 16th norm is less than the max norm; we reduce the difference
820 | | // with this normalization factor.
821 | 0 | constexpr float kTileNorm = 1.2f; |
822 | 0 | const float tile_dist = |
823 | 0 | kTileNorm * std::pow(dist_norm / pixels, 1.0f / 16.0f); |
824 | 0 | dist_row[tile_x] = tile_dist; |
825 | 0 | for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { |
826 | 0 | for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { |
827 | 0 | dist_row[tile_x + distmap_stride * iy + ix] = tile_dist; |
828 | 0 | } |
829 | 0 | } |
830 | 0 | } |
831 | 0 | } |
832 | 0 | return tile_distmap; |
833 | 0 | } |
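
Editor's note: the 16th-norm pooling at the heart of TileDistMap above, restated as a standalone sketch (not part of the libjxl source). Four successive squarings raise each butteraugli distance to the 16th power, so the pooled value lies between the mean and the maximum of the tile; the real code additionally down-weights border and corner pixels via xmul/ymul.

#include <cmath>
#include <vector>

float TileScoreSketch(const std::vector<float>& distances) {
  const float kTileNorm = 1.2f;  // same constant as in TileDistMap above
  double dist_norm = 0.0;
  double pixels = 0.0;
  for (float v : distances) {
    v *= v;  // ^2
    v *= v;  // ^4
    v *= v;  // ^8
    v *= v;  // ^16
    dist_norm += v;
    pixels += 1.0;
  }
  if (pixels == 0) pixels = 1;
  return kTileNorm * std::pow(dist_norm / pixels, 1.0 / 16.0);
}
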
834 | | |
835 | | const float kDcQuantPow = 0.83f; |
836 | | const float kDcQuant = 1.095924047623553f; |
837 | | const float kAcQuant = 0.765f; |
838 | | |
839 | | // Computes the decoded image for a given set of compression parameters. |
840 | | StatusOr<ImageBundle> RoundtripImage(const FrameHeader& frame_header, |
841 | | const Image3F& opsin, |
842 | | PassesEncoderState* enc_state, |
843 | | const JxlCmsInterface& cms, |
844 | 0 | ThreadPool* pool) { |
845 | 0 | JxlMemoryManager* memory_manager = enc_state->memory_manager(); |
846 | 0 | std::unique_ptr<PassesDecoderState> dec_state = |
847 | 0 | jxl::make_unique<PassesDecoderState>(memory_manager); |
848 | 0 | JXL_RETURN_IF_ERROR(dec_state->output_encoding_info.SetFromMetadata( |
849 | 0 | *enc_state->shared.metadata)); |
850 | 0 | dec_state->shared = &enc_state->shared; |
851 | 0 | JXL_ENSURE(opsin.ysize() % kBlockDim == 0); |
852 | | |
853 | 0 | const size_t xsize_groups = DivCeil(opsin.xsize(), kGroupDim); |
854 | 0 | const size_t ysize_groups = DivCeil(opsin.ysize(), kGroupDim); |
855 | 0 | const size_t num_groups = xsize_groups * ysize_groups; |
856 | |
857 | 0 | size_t num_special_frames = enc_state->special_frames.size(); |
858 | 0 | size_t num_passes = enc_state->progressive_splitter.GetNumPasses(); |
859 | 0 | JXL_ASSIGN_OR_RETURN(auto modular_frame_encoder, |
860 | 0 | ModularFrameEncoder::Create(memory_manager, frame_header, |
861 | 0 | enc_state->cparams, false)); |
862 | 0 | JXL_RETURN_IF_ERROR( |
863 | 0 | InitializePassesEncoder(frame_header, opsin, Rect(opsin), cms, pool, |
864 | 0 | enc_state, modular_frame_encoder.get(), nullptr)); |
865 | 0 | JXL_RETURN_IF_ERROR(dec_state->Init(frame_header)); |
866 | 0 | JXL_RETURN_IF_ERROR(dec_state->InitForAC(num_passes, pool)); |
867 | | |
868 | 0 | ImageBundle decoded(memory_manager, &enc_state->shared.metadata->m); |
869 | 0 | decoded.origin = frame_header.frame_origin; |
870 | 0 | JXL_ASSIGN_OR_RETURN( |
871 | 0 | Image3F tmp, |
872 | 0 | Image3F::Create(memory_manager, opsin.xsize(), opsin.ysize())); |
873 | 0 | JXL_RETURN_IF_ERROR(decoded.SetFromImage( |
874 | 0 | std::move(tmp), dec_state->output_encoding_info.color_encoding)); |
875 | | |
876 | 0 | PassesDecoderState::PipelineOptions options; |
877 | 0 | options.use_slow_render_pipeline = false; |
878 | 0 | options.coalescing = false; |
879 | 0 | options.render_spotcolors = false; |
880 | 0 | options.render_noise = false; |
881 | | |
882 | | // Same as frame_header.nonserialized_metadata->m |
883 | 0 | const ImageMetadata& metadata = *decoded.metadata(); |
884 | |
885 | 0 | JXL_RETURN_IF_ERROR(dec_state->PreparePipeline( |
886 | 0 | frame_header, &enc_state->shared.metadata->m, &decoded, options)); |
887 | | |
888 | 0 | AlignedArray<GroupDecCache> group_dec_caches; |
889 | 0 | const auto allocate_storage = [&](const size_t num_threads) -> Status { |
890 | 0 | JXL_RETURN_IF_ERROR( |
891 | 0 | dec_state->render_pipeline->PrepareForThreads(num_threads, |
892 | 0 | /*use_group_ids=*/false)); |
893 | 0 | JXL_ASSIGN_OR_RETURN(group_dec_caches, AlignedArray<GroupDecCache>::Create( |
894 | 0 | memory_manager, num_threads)); |
895 | 0 | return true; |
896 | 0 | }; |
897 | 0 | const auto process_group = [&](const uint32_t group_index, |
898 | 0 | const size_t thread) -> Status { |
899 | 0 | if (frame_header.loop_filter.epf_iters > 0) { |
900 | 0 | JXL_RETURN_IF_ERROR( |
901 | 0 | ComputeSigma(frame_header.loop_filter, |
902 | 0 | dec_state->shared->frame_dim.BlockGroupRect(group_index), |
903 | 0 | dec_state.get())); |
904 | 0 | } |
905 | 0 | RenderPipelineInput input = |
906 | 0 | dec_state->render_pipeline->GetInputBuffers(group_index, thread); |
907 | 0 | JXL_RETURN_IF_ERROR(DecodeGroupForRoundtrip( |
908 | 0 | frame_header, enc_state->coeffs, group_index, dec_state.get(), |
909 | 0 | &group_dec_caches[thread], thread, input, nullptr, nullptr)); |
910 | 0 | for (size_t c = 0; c < metadata.num_extra_channels; c++) { |
911 | 0 | std::pair<ImageF*, Rect> ri = input.GetBuffer(3 + c); |
912 | 0 | FillPlane(0.0f, ri.first, ri.second); |
913 | 0 | } |
914 | 0 | JXL_RETURN_IF_ERROR(input.Done()); |
915 | 0 | return true; |
916 | 0 | }; |
917 | 0 | JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, num_groups, allocate_storage, |
918 | 0 | process_group, "AQ loop")); |
919 | | |
920 | | // Ensure we don't create any new special frames. |
921 | 0 | enc_state->special_frames.resize(num_special_frames); |
922 | |
923 | 0 | return decoded; |
924 | 0 | } |
925 | | |
926 | | constexpr int kDefaultButteraugliIters = 2; |
927 | | constexpr int kMaxButteraugliIters = 4; |
928 | | |
929 | | Status FindBestQuantization(const FrameHeader& frame_header, |
930 | | const Image3F& linear, const Image3F& opsin, |
931 | | ImageF& quant_field, PassesEncoderState* enc_state, |
932 | | const JxlCmsInterface& cms, ThreadPool* pool, |
933 | 0 | AuxOut* aux_out) { |
934 | 0 | const CompressParams& cparams = enc_state->cparams; |
935 | 0 | if (cparams.resampling > 1 && |
936 | 0 | cparams.original_butteraugli_distance <= 4.0 * cparams.resampling) { |
937 | 0 | // For a downsampled opsin image, the butteraugli-based adaptive quantization
938 | 0 | // loop would only increase the size without improving the distance much,
939 | 0 | // so in this case we enable it only for very high butteraugli targets.
940 | 0 | return true; |
941 | 0 | } |
942 | 0 | JxlMemoryManager* memory_manager = enc_state->memory_manager(); |
943 | 0 | Quantizer& quantizer = enc_state->shared.quantizer; |
944 | 0 | ImageI& raw_quant_field = enc_state->shared.raw_quant_field; |
945 | |
946 | 0 | const float butteraugli_target = cparams.butteraugli_distance; |
947 | 0 | const float original_butteraugli = cparams.original_butteraugli_distance; |
948 | 0 | ButteraugliParams params; |
949 | 0 | const auto& tf = frame_header.nonserialized_metadata->m.color_encoding.Tf(); |
950 | 0 | params.intensity_target = |
951 | 0 | tf.IsPQ() || tf.IsHLG() |
952 | 0 | ? frame_header.nonserialized_metadata->m.IntensityTarget() |
953 | 0 | : 80.f; |
954 | 0 | JxlButteraugliComparator comparator(params, cms); |
955 | 0 | JXL_RETURN_IF_ERROR(comparator.SetLinearReferenceImage(linear)); |
956 | 0 | bool lower_is_better = |
957 | 0 | (comparator.GoodQualityScore() < comparator.BadQualityScore()); |
958 | 0 | const float initial_quant_dc = InitialQuantDC(butteraugli_target); |
959 | 0 | JXL_RETURN_IF_ERROR(AdjustQuantField(enc_state->shared.ac_strategy, |
960 | 0 | Rect(quant_field), original_butteraugli, |
961 | 0 | &quant_field)); |
962 | 0 | ImageF tile_distmap; |
963 | 0 | JXL_ASSIGN_OR_RETURN( |
964 | 0 | ImageF initial_quant_field, |
965 | 0 | ImageF::Create(memory_manager, quant_field.xsize(), quant_field.ysize())); |
966 | 0 | JXL_RETURN_IF_ERROR(CopyImageTo(quant_field, &initial_quant_field)); |
967 | | |
968 | 0 | float initial_qf_min; |
969 | 0 | float initial_qf_max; |
970 | 0 | ImageMinMax(initial_quant_field, &initial_qf_min, &initial_qf_max); |
971 | 0 | float initial_qf_ratio = initial_qf_max / initial_qf_min; |
972 | 0 | float qf_max_deviation_low = std::sqrt(250 / initial_qf_ratio); |
973 | 0 | float asymmetry = 2; |
974 | 0 | if (qf_max_deviation_low < asymmetry) asymmetry = qf_max_deviation_low; |
975 | 0 | float qf_lower = initial_qf_min / (asymmetry * qf_max_deviation_low); |
976 | 0 | float qf_higher = initial_qf_max * (qf_max_deviation_low / asymmetry); |
977 | |
978 | 0 | JXL_ENSURE(qf_higher / qf_lower < 253); |
979 | | |
980 | 0 | constexpr int kOriginalComparisonRound = 1; |
981 | 0 | int iters = kDefaultButteraugliIters; |
982 | 0 | if (cparams.speed_tier <= SpeedTier::kTortoise) { |
983 | 0 | iters = kMaxButteraugliIters; |
984 | 0 | } |
985 | 0 | for (int i = 0; i < iters + 1; ++i) { |
986 | 0 | if (JXL_DEBUG_ADAPTIVE_QUANTIZATION) { |
987 | 0 | printf("\nQuantization field:\n"); |
988 | 0 | for (size_t y = 0; y < quant_field.ysize(); ++y) { |
989 | 0 | for (size_t x = 0; x < quant_field.xsize(); ++x) { |
990 | 0 | printf(" %.5f", quant_field.Row(y)[x]); |
991 | 0 | } |
992 | 0 | printf("\n"); |
993 | 0 | } |
994 | 0 | } |
995 | 0 | JXL_RETURN_IF_ERROR(quantizer.SetQuantField(initial_quant_dc, quant_field, |
996 | 0 | &raw_quant_field)); |
997 | 0 | JXL_ASSIGN_OR_RETURN( |
998 | 0 | ImageBundle dec_linear, |
999 | 0 | RoundtripImage(frame_header, opsin, enc_state, cms, pool)); |
1000 | 0 | float score; |
1001 | 0 | ImageF diffmap; |
1002 | 0 | JXL_RETURN_IF_ERROR(comparator.CompareWith(dec_linear, &diffmap, &score)); |
1003 | 0 | if (!lower_is_better) { |
1004 | 0 | score = -score; |
1005 | 0 | ScaleImage(-1.0f, &diffmap); |
1006 | 0 | } |
1007 | 0 | JXL_ASSIGN_OR_RETURN(tile_distmap, |
1008 | 0 | TileDistMap(diffmap, 8 * cparams.resampling, 0, |
1009 | 0 | enc_state->shared.ac_strategy)); |
1010 | 0 | if (JXL_DEBUG_ADAPTIVE_QUANTIZATION && WantDebugOutput(cparams)) { |
1011 | 0 | JXL_RETURN_IF_ERROR(DumpImage(cparams, ("dec" + ToString(i)).c_str(), |
1012 | 0 | *dec_linear.color())); |
1013 | 0 | JXL_RETURN_IF_ERROR(DumpHeatmaps(cparams, aux_out, butteraugli_target, |
1014 | 0 | quant_field, tile_distmap, diffmap)); |
1015 | 0 | } |
1016 | 0 | if (aux_out != nullptr) ++aux_out->num_butteraugli_iters; |
1017 | 0 | if (JXL_DEBUG_ADAPTIVE_QUANTIZATION) { |
1018 | 0 | float minval; |
1019 | 0 | float maxval; |
1020 | 0 | ImageMinMax(quant_field, &minval, &maxval); |
1021 | 0 | printf("\nButteraugli iter: %d/%d\n", i, iters); |
1022 | 0 | printf("Butteraugli distance: %f (target = %f)\n", score, |
1023 | 0 | original_butteraugli); |
1024 | 0 | printf("quant range: %f ... %f DC quant: %f\n", minval, maxval, |
1025 | 0 | initial_quant_dc); |
1026 | 0 | if (FLAGS_dump_quant_state) { |
1027 | 0 | quantizer.DumpQuantizationMap(raw_quant_field); |
1028 | 0 | } |
1029 | 0 | } |
1030 | |
1031 | 0 | if (i == iters) break; |
1032 | | |
1033 | 0 | double kPow[8] = { |
1034 | 0 | 0.2, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, |
1035 | 0 | }; |
1036 | 0 | double kPowMod[8] = { |
1037 | 0 | 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, |
1038 | 0 | }; |
1039 | 0 | if (i == kOriginalComparisonRound) { |
1040 | | // Don't allow optimization to make the quant field a lot worse than |
1041 | | // what the initial guess was. This allows the AC field to have enough |
1042 | | // precision to reduce the oscillations due to the dc reconstruction. |
1043 | 0 | double kInitMul = 0.6; |
1044 | 0 | const double kOneMinusInitMul = 1.0 - kInitMul; |
1045 | 0 | for (size_t y = 0; y < quant_field.ysize(); ++y) { |
1046 | 0 | float* const JXL_RESTRICT row_q = quant_field.Row(y); |
1047 | 0 | const float* const JXL_RESTRICT row_init = initial_quant_field.Row(y); |
1048 | 0 | for (size_t x = 0; x < quant_field.xsize(); ++x) { |
1049 | 0 | double clamp = kOneMinusInitMul * row_q[x] + kInitMul * row_init[x]; |
1050 | 0 | if (row_q[x] < clamp) { |
1051 | 0 | row_q[x] = clamp; |
1052 | 0 | if (row_q[x] > qf_higher) row_q[x] = qf_higher; |
1053 | 0 | if (row_q[x] < qf_lower) row_q[x] = qf_lower; |
1054 | 0 | } |
1055 | 0 | } |
1056 | 0 | } |
1057 | 0 | } |
1058 | |
1059 | 0 | double cur_pow = 0.0; |
1060 | 0 | if (i < 7) { |
1061 | 0 | cur_pow = kPow[i] + (original_butteraugli - 1.0) * kPowMod[i]; |
1062 | 0 | if (cur_pow < 0) { |
1063 | 0 | cur_pow = 0; |
1064 | 0 | } |
1065 | 0 | } |
1066 | 0 | if (cur_pow == 0.0) { |
1067 | 0 | for (size_t y = 0; y < quant_field.ysize(); ++y) { |
1068 | 0 | const float* const JXL_RESTRICT row_dist = tile_distmap.Row(y); |
1069 | 0 | float* const JXL_RESTRICT row_q = quant_field.Row(y); |
1070 | 0 | for (size_t x = 0; x < quant_field.xsize(); ++x) { |
1071 | 0 | const float diff = row_dist[x] / original_butteraugli; |
1072 | 0 | if (diff > 1.0f) { |
1073 | 0 | float old = row_q[x]; |
1074 | 0 | row_q[x] *= diff; |
1075 | 0 | int qf_old = |
1076 | 0 | static_cast<int>(std::lround(old * quantizer.InvGlobalScale())); |
1077 | 0 | int qf_new = static_cast<int>( |
1078 | 0 | std::lround(row_q[x] * quantizer.InvGlobalScale())); |
1079 | 0 | if (qf_old == qf_new) { |
1080 | 0 | row_q[x] = old + quantizer.Scale(); |
1081 | 0 | } |
1082 | 0 | } |
1083 | 0 | if (row_q[x] > qf_higher) row_q[x] = qf_higher; |
1084 | 0 | if (row_q[x] < qf_lower) row_q[x] = qf_lower; |
1085 | 0 | } |
1086 | 0 | } |
1087 | 0 | } else { |
1088 | 0 | for (size_t y = 0; y < quant_field.ysize(); ++y) { |
1089 | 0 | const float* const JXL_RESTRICT row_dist = tile_distmap.Row(y); |
1090 | 0 | float* const JXL_RESTRICT row_q = quant_field.Row(y); |
1091 | 0 | for (size_t x = 0; x < quant_field.xsize(); ++x) { |
1092 | 0 | const float diff = row_dist[x] / original_butteraugli; |
1093 | 0 | if (diff <= 1.0f) { |
1094 | 0 | row_q[x] *= std::pow(diff, cur_pow); |
1095 | 0 | } else { |
1096 | 0 | float old = row_q[x]; |
1097 | 0 | row_q[x] *= diff; |
1098 | 0 | int qf_old = |
1099 | 0 | static_cast<int>(std::lround(old * quantizer.InvGlobalScale())); |
1100 | 0 | int qf_new = static_cast<int>( |
1101 | 0 | std::lround(row_q[x] * quantizer.InvGlobalScale())); |
1102 | 0 | if (qf_old == qf_new) { |
1103 | 0 | row_q[x] = old + quantizer.Scale(); |
1104 | 0 | } |
1105 | 0 | } |
1106 | 0 | if (row_q[x] > qf_higher) row_q[x] = qf_higher; |
1107 | 0 | if (row_q[x] < qf_lower) row_q[x] = qf_lower; |
1108 | 0 | } |
1109 | 0 | } |
1110 | 0 | } |
1111 | 0 | } |
1112 | 0 | JXL_RETURN_IF_ERROR( |
1113 | 0 | quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field)); |
1114 | 0 | return true; |
1115 | 0 | } |
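
Editor's note: a standalone sketch (not part of the libjxl source) of the update rule FindBestQuantization applies in its cur_pow == 0 branch above. inv_global_scale and scale_step are stand-ins for quantizer.InvGlobalScale() and quantizer.Scale(); the bump by scale_step guarantees that a block flagged as too distorted moves by at least one step of the integer raw quant field.

#include <algorithm>
#include <cmath>

float UpdateQuantSketch(float qf, float tile_dist, float target_dist,
                        float inv_global_scale, float scale_step,
                        float qf_lower, float qf_higher) {
  const float diff = tile_dist / target_dist;
  if (diff > 1.0f) {
    const float old = qf;
    qf *= diff;  // quantize more finely where the error exceeds the target
    const int qf_old = static_cast<int>(std::lround(old * inv_global_scale));
    const int qf_new = static_cast<int>(std::lround(qf * inv_global_scale));
    if (qf_old == qf_new) qf = old + scale_step;  // force at least one step
  }
  return std::min(std::max(qf, qf_lower), qf_higher);
}
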
1116 | | |
1117 | | Status FindBestQuantizationMaxError(const FrameHeader& frame_header, |
1118 | | const Image3F& opsin, ImageF& quant_field, |
1119 | | PassesEncoderState* enc_state, |
1120 | | const JxlCmsInterface& cms, |
1121 | 0 | ThreadPool* pool, AuxOut* aux_out) { |
1122 | | // TODO(szabadka): Make this work for non-opsin color spaces. |
1123 | 0 | const CompressParams& cparams = enc_state->cparams; |
1124 | 0 | Quantizer& quantizer = enc_state->shared.quantizer; |
1125 | 0 | ImageI& raw_quant_field = enc_state->shared.raw_quant_field; |
1126 | | |
1127 | | // TODO(veluca): better choice of this value. |
1128 | 0 | const float initial_quant_dc = |
1129 | 0 | 16 * std::sqrt(0.1f / cparams.butteraugli_distance); |
1130 | 0 | JXL_RETURN_IF_ERROR( |
1131 | 0 | AdjustQuantField(enc_state->shared.ac_strategy, Rect(quant_field), |
1132 | 0 | cparams.original_butteraugli_distance, &quant_field)); |
1133 | | |
1134 | 0 | const float inv_max_err[3] = {1.0f / enc_state->cparams.max_error[0], |
1135 | 0 | 1.0f / enc_state->cparams.max_error[1], |
1136 | 0 | 1.0f / enc_state->cparams.max_error[2]}; |
1137 | |
1138 | 0 | for (int i = 0; i < kMaxButteraugliIters + 1; ++i) { |
1139 | 0 | JXL_RETURN_IF_ERROR(quantizer.SetQuantField(initial_quant_dc, quant_field, |
1140 | 0 | &raw_quant_field)); |
1141 | 0 | if (JXL_DEBUG_ADAPTIVE_QUANTIZATION && aux_out) { |
1142 | 0 | JXL_RETURN_IF_ERROR( |
1143 | 0 | DumpXybImage(cparams, ("ops" + ToString(i)).c_str(), opsin)); |
1144 | 0 | } |
1145 | 0 | JXL_ASSIGN_OR_RETURN( |
1146 | 0 | ImageBundle decoded, |
1147 | 0 | RoundtripImage(frame_header, opsin, enc_state, cms, pool)); |
1148 | 0 | if (JXL_DEBUG_ADAPTIVE_QUANTIZATION && aux_out) { |
1149 | 0 | JXL_RETURN_IF_ERROR(DumpXybImage(cparams, ("dec" + ToString(i)).c_str(), |
1150 | 0 | *decoded.color())); |
1151 | 0 | } |
1152 | 0 | for (size_t by = 0; by < enc_state->shared.frame_dim.ysize_blocks; by++) { |
1153 | 0 | AcStrategyRow ac_strategy_row = |
1154 | 0 | enc_state->shared.ac_strategy.ConstRow(by); |
1155 | 0 | for (size_t bx = 0; bx < enc_state->shared.frame_dim.xsize_blocks; bx++) { |
1156 | 0 | AcStrategy acs = ac_strategy_row[bx]; |
1157 | 0 | if (!acs.IsFirstBlock()) continue; |
1158 | 0 | float max_error = 0; |
1159 | 0 | for (size_t c = 0; c < 3; c++) { |
1160 | 0 | for (size_t y = by * kBlockDim; |
1161 | 0 | y < (by + acs.covered_blocks_y()) * kBlockDim; y++) { |
1162 | 0 | if (y >= decoded.ysize()) continue; |
1163 | 0 | const float* JXL_RESTRICT in_row = opsin.ConstPlaneRow(c, y); |
1164 | 0 | const float* JXL_RESTRICT dec_row = |
1165 | 0 | decoded.color()->ConstPlaneRow(c, y); |
1166 | 0 | for (size_t x = bx * kBlockDim; |
1167 | 0 | x < (bx + acs.covered_blocks_x()) * kBlockDim; x++) { |
1168 | 0 | if (x >= decoded.xsize()) continue; |
1169 | 0 | max_error = std::max( |
1170 | 0 | std::abs(in_row[x] - dec_row[x]) * inv_max_err[c], max_error); |
1171 | 0 | } |
1172 | 0 | } |
1173 | 0 | } |
1174 | | // Target an error between max_error/2 and max_error. |
1175 | | // If the error in the varblock is above the target, increase the qf to |
1176 | | // compensate. If the error is below the target, decrease the qf. |
1177 | | // However, to avoid an excessive increase of the qf, only do so if the |
1178 | | // error is less than half the maximum allowed error. |
1179 | 0 | const float qf_mul = (max_error < 0.5f) ? max_error * 2.0f |
1180 | 0 | : (max_error > 1.0f) ? max_error |
1181 | 0 | : 1.0f; |
1182 | 0 | for (size_t qy = by; qy < by + acs.covered_blocks_y(); qy++) { |
1183 | 0 | float* JXL_RESTRICT quant_field_row = quant_field.Row(qy); |
1184 | 0 | for (size_t qx = bx; qx < bx + acs.covered_blocks_x(); qx++) { |
1185 | 0 | quant_field_row[qx] *= qf_mul; |
1186 | 0 | } |
1187 | 0 | } |
1188 | 0 | } |
1189 | 0 | } |
1190 | 0 | } |
1191 | 0 | JXL_RETURN_IF_ERROR( |
1192 | 0 | quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field)); |
1193 | 0 | return true; |
1194 | 0 | } |
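
Editor's note: a sketch (not part of the libjxl source) of the per-varblock multiplier FindBestQuantizationMaxError derives above; max_error is already normalized by cparams.max_error, so 1.0 means the block sits exactly at the allowed maximum error.

float MaxErrorQfMulSketch(float max_error) {
  if (max_error < 0.5f) return max_error * 2.0f;  // well under target: lower qf
  if (max_error > 1.0f) return max_error;         // over target: raise qf
  return 1.0f;  // within [0.5, 1.0]: leave the quant field unchanged
}
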
1195 | | |
1196 | | } // namespace |
1197 | | |
1198 | | Status AdjustQuantField(const AcStrategyImage& ac_strategy, const Rect& rect, |
1199 | 7.22k | float butteraugli_target, ImageF* quant_field) { |
1200 | | // Replace the whole quant_field in non-8x8 blocks with the maximum of each |
1201 | | // 8x8 block. |
1202 | 7.22k | size_t stride = quant_field->PixelsPerRow(); |
1203 | | |
1204 | | // At low distances it is great to use max, but mean works better |
1205 | | // at high distances. We interpolate between them for a distance |
1206 | | // range. |
1207 | 7.22k | float mean_max_mixer = 1.0f; |
1208 | 7.22k | { |
1209 | 7.22k | static const float kLimit = 1.54138f; |
1210 | 7.22k | static const float kMul = 0.56391f; |
1211 | 7.22k | static const float kMin = 0.0f; |
1212 | 7.22k | if (butteraugli_target > kLimit) { |
1213 | 0 | mean_max_mixer -= (butteraugli_target - kLimit) * kMul; |
1214 | 0 | if (mean_max_mixer < kMin) { |
1215 | 0 | mean_max_mixer = kMin; |
1216 | 0 | } |
1217 | 0 | } |
1218 | 7.22k | } |
1219 | 60.6k | for (size_t y = 0; y < rect.ysize(); ++y) { |
1220 | 53.4k | AcStrategyRow ac_strategy_row = ac_strategy.ConstRow(rect, y); |
1221 | 53.4k | float* JXL_RESTRICT quant_row = rect.Row(quant_field, y); |
1222 | 451k | for (size_t x = 0; x < rect.xsize(); ++x) { |
1223 | 398k | AcStrategy acs = ac_strategy_row[x]; |
1224 | 398k | if (!acs.IsFirstBlock()) continue; |
1225 | 241k | JXL_ENSURE(x + acs.covered_blocks_x() <= quant_field->xsize()); |
1226 | 241k | JXL_ENSURE(y + acs.covered_blocks_y() <= quant_field->ysize()); |
1227 | 241k | float max = quant_row[x]; |
1228 | 241k | float mean = 0.0; |
1229 | 521k | for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { |
1230 | 678k | for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { |
1231 | 398k | mean += quant_row[x + ix + iy * stride]; |
1232 | 398k | max = std::max(quant_row[x + ix + iy * stride], max); |
1233 | 398k | } |
1234 | 279k | } |
1235 | 241k | mean /= acs.covered_blocks_y() * acs.covered_blocks_x(); |
1236 | 241k | if (acs.covered_blocks_y() * acs.covered_blocks_x() >= 4) { |
1237 | 15.1k | max *= mean_max_mixer; |
1238 | 15.1k | max += (1.0f - mean_max_mixer) * mean; |
1239 | 15.1k | } |
1240 | 521k | for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { |
1241 | 678k | for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { |
1242 | 398k | quant_row[x + ix + iy * stride] = max; |
1243 | 398k | } |
1244 | 279k | } |
1245 | 241k | } |
1246 | 53.4k | } |
1247 | 7.22k | return true; |
1248 | 7.22k | } |
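
Editor's note: a standalone sketch (not part of the libjxl source) of the max/mean interpolation AdjustQuantField uses above for variable-size blocks covering at least four 8x8 blocks. At low butteraugli targets the block keeps the max of its 8x8 quant values; as the target grows past kLimit the result slides toward the mean.

#include <algorithm>

float MixedBlockQuantSketch(float block_max, float block_mean,
                            float butteraugli_target) {
  const float kLimit = 1.54138f;  // same constants as above
  const float kMul = 0.56391f;
  float mixer = 1.0f;  // 1.0 selects the pure max
  if (butteraugli_target > kLimit) {
    mixer = std::max(0.0f, 1.0f - (butteraugli_target - kLimit) * kMul);
  }
  return mixer * block_max + (1.0f - mixer) * block_mean;
}
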
1249 | | |
1250 | 186 | float InitialQuantDC(float butteraugli_target) { |
1251 | 186 | const float kDcMul = 0.3; // Butteraugli target where non-linearity kicks in. |
1252 | 186 | const float butteraugli_target_dc = std::max<float>( |
1253 | 186 | 0.5f * butteraugli_target, |
1254 | 186 | std::min<float>(butteraugli_target, |
1255 | 186 | kDcMul * std::pow((1.0f / kDcMul) * butteraugli_target, |
1256 | 186 | kDcQuantPow))); |
1257 | | // We want the maximum DC value to be at most 2**15 * kInvDCQuant / quant_dc. |
1258 | | // The maximum DC value might not be in the kXybRange because of inverse |
1259 | | // gaborish, so we add some slack to the maximum theoretical quant obtained |
1260 | | // this way (64). |
1261 | 186 | return std::min(kDcQuant / butteraugli_target_dc, 50.f); |
1262 | 186 | } |
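
Editor's note: InitialQuantDC above restated as a standalone sketch (not part of the libjxl source), using the same kDcMul, kDcQuantPow and kDcQuant constants; the butteraugli target is clamped into [0.5*t, t] around the power-law curve before the DC quant is derived and capped at 50.

#include <algorithm>
#include <cmath>

float InitialQuantDCSketch(float t) {  // t = butteraugli target
  const float kDcMul = 0.3f;
  const float kDcQuantPow = 0.83f;
  const float kDcQuant = 1.095924047623553f;
  const float curve = kDcMul * std::pow(t / kDcMul, kDcQuantPow);
  const float t_dc = std::max(0.5f * t, std::min(t, curve));
  return std::min(kDcQuant / t_dc, 50.0f);
}
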
1263 | | |
1264 | | StatusOr<ImageF> InitialQuantField(const float butteraugli_target, |
1265 | | const Image3F& opsin, const Rect& rect, |
1266 | | ThreadPool* pool, float rescale, |
1267 | 186 | ImageF* mask, ImageF* mask1x1) { |
1268 | 186 | const float quant_ac = kAcQuant / butteraugli_target; |
1269 | 186 | return HWY_DYNAMIC_DISPATCH(AdaptiveQuantizationMap)( |
1270 | 186 | butteraugli_target, opsin, rect, quant_ac * rescale, pool, mask, mask1x1); |
1271 | 186 | } |
1272 | | |
1273 | | Status FindBestQuantizer(const FrameHeader& frame_header, const Image3F* linear, |
1274 | | const Image3F& opsin, ImageF& quant_field, |
1275 | | PassesEncoderState* enc_state, |
1276 | | const JxlCmsInterface& cms, ThreadPool* pool, |
1277 | 186 | AuxOut* aux_out, double rescale) { |
1278 | 186 | const CompressParams& cparams = enc_state->cparams; |
1279 | 186 | if (cparams.max_error_mode) { |
1280 | 0 | JXL_RETURN_IF_ERROR(FindBestQuantizationMaxError( |
1281 | 0 | frame_header, opsin, quant_field, enc_state, cms, pool, aux_out)); |
1282 | 186 | } else if (linear && cparams.speed_tier <= SpeedTier::kKitten) { |
1283 | | // Normal encoding to a butteraugli score. |
1284 | 0 | JXL_RETURN_IF_ERROR(FindBestQuantization(frame_header, *linear, opsin, |
1285 | 0 | quant_field, enc_state, cms, pool, |
1286 | 0 | aux_out)); |
1287 | 0 | } |
1288 | 186 | return true; |
1289 | 186 | } |
1290 | | |
1291 | | } // namespace jxl |
1292 | | #endif // HWY_ONCE |