/src/libjxl/lib/jxl/enc_adaptive_quantization.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jxl/enc_adaptive_quantization.h" |
7 | | |
8 | | #include <jxl/memory_manager.h> |
9 | | |
10 | | #include <algorithm> |
11 | | #include <atomic> |
12 | | #include <cmath> |
13 | | #include <cstddef> |
14 | | #include <cstdlib> |
15 | | #include <string> |
16 | | #include <vector> |
17 | | |
18 | | #undef HWY_TARGET_INCLUDE |
19 | | #define HWY_TARGET_INCLUDE "lib/jxl/enc_adaptive_quantization.cc" |
20 | | #include <hwy/foreach_target.h> |
21 | | #include <hwy/highway.h> |
22 | | |
23 | | #include "lib/jxl/ac_strategy.h" |
24 | | #include "lib/jxl/base/common.h" |
25 | | #include "lib/jxl/base/compiler_specific.h" |
26 | | #include "lib/jxl/base/data_parallel.h" |
27 | | #include "lib/jxl/base/fast_math-inl.h" |
28 | | #include "lib/jxl/base/rect.h" |
29 | | #include "lib/jxl/base/status.h" |
30 | | #include "lib/jxl/butteraugli/butteraugli.h" |
31 | | #include "lib/jxl/cms/opsin_params.h" |
32 | | #include "lib/jxl/convolve.h" |
33 | | #include "lib/jxl/dec_cache.h" |
34 | | #include "lib/jxl/dec_group.h" |
35 | | #include "lib/jxl/enc_aux_out.h" |
36 | | #include "lib/jxl/enc_butteraugli_comparator.h" |
37 | | #include "lib/jxl/enc_cache.h" |
38 | | #include "lib/jxl/enc_debug_image.h" |
39 | | #include "lib/jxl/enc_group.h" |
40 | | #include "lib/jxl/enc_modular.h" |
41 | | #include "lib/jxl/enc_params.h" |
42 | | #include "lib/jxl/enc_transforms-inl.h" |
43 | | #include "lib/jxl/epf.h" |
44 | | #include "lib/jxl/frame_dimensions.h" |
45 | | #include "lib/jxl/image.h" |
46 | | #include "lib/jxl/image_bundle.h" |
47 | | #include "lib/jxl/image_ops.h" |
48 | | #include "lib/jxl/quant_weights.h" |
49 | | |
50 | | // Set JXL_DEBUG_ADAPTIVE_QUANTIZATION to 1 to enable debugging. |
51 | | #ifndef JXL_DEBUG_ADAPTIVE_QUANTIZATION |
52 | 0 | #define JXL_DEBUG_ADAPTIVE_QUANTIZATION 0 |
53 | | #endif |
54 | | |
55 | | HWY_BEFORE_NAMESPACE(); |
56 | | namespace jxl { |
57 | | namespace HWY_NAMESPACE { |
58 | | namespace { |
59 | | |
60 | | // These templates are not found via ADL. |
61 | | using hwy::HWY_NAMESPACE::AbsDiff; |
62 | | using hwy::HWY_NAMESPACE::Add; |
63 | | using hwy::HWY_NAMESPACE::And; |
64 | | using hwy::HWY_NAMESPACE::Max; |
65 | | using hwy::HWY_NAMESPACE::Rebind; |
66 | | using hwy::HWY_NAMESPACE::Sqrt; |
67 | | using hwy::HWY_NAMESPACE::ZeroIfNegative; |
68 | | |
69 | | // The following functions modulate an exponent (out_val) and return the updated |
70 | | // value. Their descriptor is limited to 8 lanes for 8x8 blocks. |
71 | | |
72 | | // Hack for mask estimation. Eventually replace this code with butteraugli's |
73 | | // masking. |
// Hack for mask estimation. Eventually replace this code with butteraugli's
// masking. Returns the reciprocal of the (offset) exponent so that strong
// masking yields a small value.
float ComputeMaskForAcStrategyUse(const float out_val) {
  constexpr float kScale = 1.0f;
  // Keeps the denominator strictly positive for out_val == 0.
  constexpr float kDenomOffset = 0.001f;
  return kScale / (out_val + kDenomOffset);
}
79 | | |
// Maps the accumulated exponent `out_val` through a sum of rational terms,
// returning the modulated exponent. The constants were tuned externally.
template <class D, class V>
V ComputeMask(const D d, const V out_val) {
  const auto base = Set(d, -0.7647f);
  const auto weight4 = Set(d, 9.4708735624378946f);
  const auto weight2 = Set(d, 17.35036561631863f);
  const auto offset2 = Set(d, 302.59587815579727f);
  const auto weight3 = Set(d, 6.7943250517376494f);
  const auto offset3 = Set(d, 3.7179635626140772f);
  const auto offset4 = Mul(Set(d, 0.25f), offset3);
  const auto input_scale = Set(d, 0.80061762862741759f);
  const auto one = Set(d, 1.0f);

  // Clamp away from zero so the reciprocal terms below stay finite.
  const auto s = Max(Mul(out_val, input_scale), Set(d, 1e-3f));
  const auto term2 = Div(one, Add(s, offset2));
  const auto term3 = Div(one, MulAdd(s, s, offset3));
  const auto term4 = Div(one, MulAdd(s, s, offset4));
  // TODO(jyrki):
  // A log or two here could make sense. In butteraugli we have effectively
  // log(log(x + C)) for this kind of use, as a single log is used in
  // saturating visual masking and here the modulation values are exponential,
  // another log would counter that.
  return Add(base,
             MulAdd(weight4, term4, MulAdd(weight2, term2, Mul(weight3, term3))));
}
104 | | |
// mul and mul2 represent a scaling difference between jxl and butteraugli.
const float kSGmul = 226.77216153508914f;
const float kSGmul2 = 1.0f / 73.377132366608819f;
// ln(2); used below to convert between natural-log and log2 based factors.
const float kLog2 = 0.693147181f;
// Includes correction factor for std::log -> log2.
const float kSGRetMul = kSGmul2 * 18.6580932135f * kLog2;
const float kSGVOffset = 7.7825991679894591f;
112 | | |
113 | | template <bool invert, typename D, typename V> |
114 | 7.48k | V RatioOfDerivativesOfCubicRootToSimpleGamma(const D d, V v) { |
115 | | // The opsin space in jxl is the cubic root of photons, i.e., v * v * v |
116 | | // is related to the number of photons. |
117 | | // |
118 | | // SimpleGamma(v * v * v) is the psychovisual space in butteraugli. |
119 | | // This ratio allows quantization to move from jxl's opsin space to |
120 | | // butteraugli's log-gamma space. |
121 | 7.48k | float kEpsilon = 1e-2; |
122 | 7.48k | v = ZeroIfNegative(v); |
123 | 7.48k | const auto kNumMul = Set(d, kSGRetMul * 3 * kSGmul); |
124 | 7.48k | const auto kVOffset = Set(d, kSGVOffset * kLog2 + kEpsilon); |
125 | 7.48k | const auto kDenMul = Set(d, kLog2 * kSGmul); |
126 | | |
127 | 7.48k | const auto v2 = Mul(v, v); |
128 | | |
129 | 7.48k | const auto num = MulAdd(kNumMul, v2, Set(d, kEpsilon)); |
130 | 7.48k | const auto den = MulAdd(Mul(kDenMul, v), v2, kVOffset); |
131 | 7.48k | return invert ? Div(num, den) : Div(den, num); |
132 | 7.48k | } enc_adaptive_quantization.cc:hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false, hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul>) Line | Count | Source | 114 | 6.65k | V RatioOfDerivativesOfCubicRootToSimpleGamma(const D d, V v) { | 115 | | // The opsin space in jxl is the cubic root of photons, i.e., v * v * v | 116 | | // is related to the number of photons. | 117 | | // | 118 | | // SimpleGamma(v * v * v) is the psychovisual space in butteraugli. | 119 | | // This ratio allows quantization to move from jxl's opsin space to | 120 | | // butteraugli's log-gamma space. | 121 | 6.65k | float kEpsilon = 1e-2; | 122 | 6.65k | v = ZeroIfNegative(v); | 123 | 6.65k | const auto kNumMul = Set(d, kSGRetMul * 3 * kSGmul); | 124 | 6.65k | const auto kVOffset = Set(d, kSGVOffset * kLog2 + kEpsilon); | 125 | 6.65k | const auto kDenMul = Set(d, kLog2 * kSGmul); | 126 | | | 127 | 6.65k | const auto v2 = Mul(v, v); | 128 | | | 129 | 6.65k | const auto num = MulAdd(kNumMul, v2, Set(d, kEpsilon)); | 130 | 6.65k | const auto den = MulAdd(Mul(kDenMul, v), v2, kVOffset); | 131 | 6.65k | return invert ? Div(num, den) : Div(den, num); | 132 | 6.65k | } |
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false, hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>) enc_adaptive_quantization.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<true, hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>) Line | Count | Source | 114 | 832 | V RatioOfDerivativesOfCubicRootToSimpleGamma(const D d, V v) { | 115 | | // The opsin space in jxl is the cubic root of photons, i.e., v * v * v | 116 | | // is related to the number of photons. | 117 | | // | 118 | | // SimpleGamma(v * v * v) is the psychovisual space in butteraugli. | 119 | | // This ratio allows quantization to move from jxl's opsin space to | 120 | | // butteraugli's log-gamma space. | 121 | 832 | float kEpsilon = 1e-2; | 122 | 832 | v = ZeroIfNegative(v); | 123 | 832 | const auto kNumMul = Set(d, kSGRetMul * 3 * kSGmul); | 124 | 832 | const auto kVOffset = Set(d, kSGVOffset * kLog2 + kEpsilon); | 125 | 832 | const auto kDenMul = Set(d, kLog2 * kSGmul); | 126 | | | 127 | 832 | const auto v2 = Mul(v, v); | 128 | | | 129 | 832 | const auto num = MulAdd(kNumMul, v2, Set(d, kEpsilon)); | 130 | 832 | const auto den = MulAdd(Mul(kDenMul, v), v2, kVOffset); | 131 | 832 | return invert ? Div(num, den) : Div(den, num); | 132 | 832 | } |
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false, hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul>) Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false, hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>) Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<true, hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>) Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 1ul> jxl::N_SSE2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false, hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul> >(hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul>) Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false, hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>) Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<true, hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>) |
133 | | |
134 | | template <bool invert = false> |
135 | 6.65k | float RatioOfDerivativesOfCubicRootToSimpleGamma(float v) { |
136 | 6.65k | using DScalar = HWY_CAPPED(float, 1); |
137 | 6.65k | auto vscalar = Load(DScalar(), &v); |
138 | 6.65k | return GetLane( |
139 | 6.65k | RatioOfDerivativesOfCubicRootToSimpleGamma<invert>(DScalar(), vscalar)); |
140 | 6.65k | } enc_adaptive_quantization.cc:float jxl::N_AVX2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false>(float) Line | Count | Source | 135 | 6.65k | float RatioOfDerivativesOfCubicRootToSimpleGamma(float v) { | 136 | 6.65k | using DScalar = HWY_CAPPED(float, 1); | 137 | 6.65k | auto vscalar = Load(DScalar(), &v); | 138 | 6.65k | return GetLane( | 139 | 6.65k | RatioOfDerivativesOfCubicRootToSimpleGamma<invert>(DScalar(), vscalar)); | 140 | 6.65k | } |
Unexecuted instantiation: enc_adaptive_quantization.cc:float jxl::N_SSE4::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false>(float) Unexecuted instantiation: enc_adaptive_quantization.cc:float jxl::N_SSE2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false>(float) |
141 | | |
142 | | // TODO(veluca): this function computes an approximation of the derivative of |
143 | | // SimpleGamma with (f(x+eps)-f(x))/eps. Consider two-sided approximation or |
144 | | // exact derivatives. For reference, SimpleGamma was: |
145 | | /* |
146 | | template <typename D, typename V> |
147 | | V SimpleGamma(const D d, V v) { |
148 | | // A simple HDR compatible gamma function. |
149 | | const auto mul = Set(d, kSGmul); |
150 | | const auto kRetMul = Set(d, kSGRetMul); |
151 | | const auto kRetAdd = Set(d, kSGmul2 * -20.2789020414f); |
152 | | const auto kVOffset = Set(d, kSGVOffset); |
153 | | |
154 | | v *= mul; |
155 | | |
156 | | // This should happen rarely, but may lead to a NaN, which is rather |
157 | | // undesirable. Since negative photons don't exist we solve the NaNs by |
158 | | // clamping here. |
159 | | // TODO(veluca): with FastLog2f, this no longer leads to NaNs. |
160 | | v = ZeroIfNegative(v); |
161 | | return kRetMul * FastLog2f(d, v + kVOffset) + kRetAdd; |
162 | | } |
163 | | */ |
164 | | |
165 | | template <class D, class V> |
166 | | V GammaModulation(const D d, const size_t x, const size_t y, |
167 | | const ImageF& xyb_x, const ImageF& xyb_y, const Rect& rect, |
168 | 52 | const V out_val) { |
169 | 52 | const float kBias = 0.16f; |
170 | 52 | JXL_DASSERT(kBias > jxl::cms::kOpsinAbsorbanceBias[0]); |
171 | 52 | JXL_DASSERT(kBias > jxl::cms::kOpsinAbsorbanceBias[1]); |
172 | 52 | JXL_DASSERT(kBias > jxl::cms::kOpsinAbsorbanceBias[2]); |
173 | 52 | auto overall_ratio = Zero(d); |
174 | 52 | auto bias = Set(d, kBias); |
175 | 468 | for (size_t dy = 0; dy < 8; ++dy) { |
176 | 416 | const float* const JXL_RESTRICT row_in_x = rect.ConstRow(xyb_x, y + dy); |
177 | 416 | const float* const JXL_RESTRICT row_in_y = rect.ConstRow(xyb_y, y + dy); |
178 | 832 | for (size_t dx = 0; dx < 8; dx += Lanes(d)) { |
179 | 416 | const auto iny = Add(Load(d, row_in_y + x + dx), bias); |
180 | 416 | const auto inx = Load(d, row_in_x + x + dx); |
181 | | |
182 | 416 | const auto r = Sub(iny, inx); |
183 | 416 | const auto ratio_r = |
184 | 416 | RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/true>(d, r); |
185 | 416 | overall_ratio = Add(overall_ratio, ratio_r); |
186 | | |
187 | 416 | const auto g = Add(iny, inx); |
188 | 416 | const auto ratio_g = |
189 | 416 | RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/true>(d, g); |
190 | 416 | overall_ratio = Add(overall_ratio, ratio_g); |
191 | 416 | } |
192 | 416 | } |
193 | 52 | overall_ratio = Mul(SumOfLanes(d, overall_ratio), Set(d, 0.5f / 64)); |
194 | | // ideally -1.0, but likely optimal correction adds some entropy, so slightly |
195 | | // less than that. |
196 | 52 | const auto kGam = Set(d, 0.1005613337192697f); |
197 | 52 | return MulAdd(kGam, FastLog2f(d, overall_ratio), out_val); |
198 | 52 | } enc_adaptive_quantization.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::GammaModulation<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_AVX2::Vec256<float>) Line | Count | Source | 168 | 52 | const V out_val) { | 169 | 52 | const float kBias = 0.16f; | 170 | 52 | JXL_DASSERT(kBias > jxl::cms::kOpsinAbsorbanceBias[0]); | 171 | 52 | JXL_DASSERT(kBias > jxl::cms::kOpsinAbsorbanceBias[1]); | 172 | 52 | JXL_DASSERT(kBias > jxl::cms::kOpsinAbsorbanceBias[2]); | 173 | 52 | auto overall_ratio = Zero(d); | 174 | 52 | auto bias = Set(d, kBias); | 175 | 468 | for (size_t dy = 0; dy < 8; ++dy) { | 176 | 416 | const float* const JXL_RESTRICT row_in_x = rect.ConstRow(xyb_x, y + dy); | 177 | 416 | const float* const JXL_RESTRICT row_in_y = rect.ConstRow(xyb_y, y + dy); | 178 | 832 | for (size_t dx = 0; dx < 8; dx += Lanes(d)) { | 179 | 416 | const auto iny = Add(Load(d, row_in_y + x + dx), bias); | 180 | 416 | const auto inx = Load(d, row_in_x + x + dx); | 181 | | | 182 | 416 | const auto r = Sub(iny, inx); | 183 | 416 | const auto ratio_r = | 184 | 416 | RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/true>(d, r); | 185 | 416 | overall_ratio = Add(overall_ratio, ratio_r); | 186 | | | 187 | 416 | const auto g = Add(iny, inx); | 188 | 416 | const auto ratio_g = | 189 | 416 | RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/true>(d, g); | 190 | 416 | overall_ratio = Add(overall_ratio, ratio_g); | 191 | 416 | } | 192 | 416 | } | 193 | 52 | overall_ratio = Mul(SumOfLanes(d, overall_ratio), Set(d, 0.5f / 64)); | 194 | | // ideally -1.0, but likely optimal correction adds some entropy, so slightly | 195 | | // less than that. | 196 | 52 | const auto kGam = Set(d, 0.1005613337192697f); | 197 | 52 | return MulAdd(kGam, FastLog2f(d, overall_ratio), out_val); | 198 | 52 | } |
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::GammaModulation<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_SSE4::Vec128<float, 4ul>) Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::(anonymous namespace)::GammaModulation<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_SSE2::Vec128<float, 4ul>) |
199 | | |
// Change precision in 8x8 blocks that have high frequency content.
// Accumulates clamped absolute differences with the right and below
// neighbors over the 8x8 block at (x, y), then folds the scalar sum
// (affinely transformed) back into out_val.
template <class D, class V>
V HfModulation(const D d, const size_t x, const size_t y, const ImageF& xyb,
               const Rect& rect, const V out_val) {
  // Zero out the invalid differences for the rightmost value per row.
  const Rebind<uint32_t, D> du;
  HWY_ALIGN constexpr uint32_t kMaskRight[kBlockDim] = {~0u, ~0u, ~0u, ~0u,
                                                        ~0u, ~0u, ~0u, 0};

  auto sum = Zero(d);  // sum of absolute differences with right and below

  // Each difference is clamped to valmin before accumulation, so a single
  // strong edge cannot dominate the whole block's estimate.
  static const float valmin = 0.020602694503245016f;
  auto valminv = Set(d, valmin);
  for (size_t dy = 0; dy < 8; ++dy) {
    const float* JXL_RESTRICT row_in = rect.ConstRow(xyb, y + dy) + x;
    // Last row compares against itself: the vertical diff contributes zero.
    const float* JXL_RESTRICT row_in_next =
        dy == 7 ? row_in : rect.ConstRow(xyb, y + dy + 1) + x;

    // In SCALAR, there is no guarantee of having extra row padding.
    // Hence, we need to ensure we don't access pixels outside the row itself.
    // In SIMD modes, however, rows are padded, so it's safe to access one
    // garbage value after the row. The vector then gets masked with kMaskRight
    // to remove the influence of that value.
#if HWY_TARGET != HWY_SCALAR
    for (size_t dx = 0; dx < 8; dx += Lanes(d)) {
#else
    for (size_t dx = 0; dx < 7; dx += Lanes(d)) {
#endif
      const auto p = Load(d, row_in + dx);
      const auto pr = LoadU(d, row_in + dx + 1);  // unaligned right neighbor
      const auto mask = BitCast(d, Load(du, kMaskRight + dx));
      sum = Add(sum, And(mask, Min(valminv, AbsDiff(p, pr))));

      // Vertical diff is valid in every column, so it is not masked.
      const auto pd = Load(d, row_in_next + dx);
      sum = Add(sum, Min(valminv, AbsDiff(p, pd)));
    }
#if HWY_TARGET == HWY_SCALAR
    // Scalar tail: the last column has no right neighbor, only a below one.
    const auto p = Load(d, row_in + 7);
    const auto pd = Load(d, row_in_next + 7);
    sum = Add(sum, Min(valminv, AbsDiff(p, pd)));
#endif
  }
  // more negative value gives more bpp
  static const float kOffset = -1.110929106987477;
  static const float kMul = -0.38078920620238305;
  sum = SumOfLanes(d, sum);
  float scalar_sum = GetLane(sum);
  scalar_sum += kOffset;
  scalar_sum *= kMul;
  // Broadcast the scalar adjustment across all lanes of the output.
  return Add(Set(d, scalar_sum), out_val);
}
251 | | |
252 | | void PerBlockModulations(const float butteraugli_target, const ImageF& xyb_x, |
253 | | const ImageF& xyb_y, const ImageF& xyb_b, |
254 | | const Rect& rect_in, const float scale, |
255 | 52 | const Rect& rect_out, ImageF* out) { |
256 | 52 | float base_level = 0.48f * scale; |
257 | 52 | float kDampenRampStart = 2.0f; |
258 | 52 | float kDampenRampEnd = 14.0f; |
259 | 52 | float dampen = 1.0f; |
260 | 52 | if (butteraugli_target >= kDampenRampStart) { |
261 | 0 | dampen = 1.0f - ((butteraugli_target - kDampenRampStart) / |
262 | 0 | (kDampenRampEnd - kDampenRampStart)); |
263 | 0 | if (dampen < 0) { |
264 | 0 | dampen = 0; |
265 | 0 | } |
266 | 0 | } |
267 | 52 | const float mul = scale * dampen; |
268 | 52 | const float add = (1.0f - dampen) * base_level; |
269 | 104 | for (size_t iy = rect_out.y0(); iy < rect_out.y1(); iy++) { |
270 | 52 | const size_t y = iy * 8; |
271 | 52 | float* const JXL_RESTRICT row_out = out->Row(iy); |
272 | 52 | const HWY_CAPPED(float, kBlockDim) df; |
273 | 104 | for (size_t ix = rect_out.x0(); ix < rect_out.x1(); ix++) { |
274 | 52 | size_t x = ix * 8; |
275 | 52 | auto out_val = Set(df, row_out[ix]); |
276 | 52 | out_val = ComputeMask(df, out_val); |
277 | 52 | out_val = HfModulation(df, x, y, xyb_y, rect_in, out_val); |
278 | 52 | out_val = GammaModulation(df, x, y, xyb_x, xyb_y, rect_in, out_val); |
279 | | // We want multiplicative quantization field, so everything |
280 | | // until this point has been modulating the exponent. |
281 | 52 | row_out[ix] = FastPow2f(GetLane(out_val) * 1.442695041f) * mul + add; |
282 | 52 | } |
283 | 52 | } |
284 | 52 | } enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::PerBlockModulations(float, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, float, jxl::RectT<unsigned long> const&, jxl::Plane<float>*) Line | Count | Source | 255 | 52 | const Rect& rect_out, ImageF* out) { | 256 | 52 | float base_level = 0.48f * scale; | 257 | 52 | float kDampenRampStart = 2.0f; | 258 | 52 | float kDampenRampEnd = 14.0f; | 259 | 52 | float dampen = 1.0f; | 260 | 52 | if (butteraugli_target >= kDampenRampStart) { | 261 | 0 | dampen = 1.0f - ((butteraugli_target - kDampenRampStart) / | 262 | 0 | (kDampenRampEnd - kDampenRampStart)); | 263 | 0 | if (dampen < 0) { | 264 | 0 | dampen = 0; | 265 | 0 | } | 266 | 0 | } | 267 | 52 | const float mul = scale * dampen; | 268 | 52 | const float add = (1.0f - dampen) * base_level; | 269 | 104 | for (size_t iy = rect_out.y0(); iy < rect_out.y1(); iy++) { | 270 | 52 | const size_t y = iy * 8; | 271 | 52 | float* const JXL_RESTRICT row_out = out->Row(iy); | 272 | 52 | const HWY_CAPPED(float, kBlockDim) df; | 273 | 104 | for (size_t ix = rect_out.x0(); ix < rect_out.x1(); ix++) { | 274 | 52 | size_t x = ix * 8; | 275 | 52 | auto out_val = Set(df, row_out[ix]); | 276 | 52 | out_val = ComputeMask(df, out_val); | 277 | 52 | out_val = HfModulation(df, x, y, xyb_y, rect_in, out_val); | 278 | 52 | out_val = GammaModulation(df, x, y, xyb_x, xyb_y, rect_in, out_val); | 279 | | // We want multiplicative quantization field, so everything | 280 | | // until this point has been modulating the exponent. | 281 | 52 | row_out[ix] = FastPow2f(GetLane(out_val) * 1.442695041f) * mul + add; | 282 | 52 | } | 283 | 52 | } | 284 | 52 | } |
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::PerBlockModulations(float, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, float, jxl::RectT<unsigned long> const&, jxl::Plane<float>*) Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::PerBlockModulations(float, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, float, jxl::RectT<unsigned long> const&, jxl::Plane<float>*) |
285 | | |
// Compressive sqrt-shaped response used for masking strength.
template <typename D, typename V>
V MaskingSqrt(const D d, V v) {
  static const float kLogOffset = 27.97044946785558f;
  static const float kMul = 211.53333281566171f;
  const auto scale_v = Sqrt(Set(d, kMul * 1e8));
  const auto offset_v = Set(d, kLogOffset);
  return Mul(Set(d, 0.25f), Sqrt(MulAdd(v, scale_v, offset_v)));
}
294 | | |
295 | 3.32k | float MaskingSqrt(const float v) { |
296 | 3.32k | using DScalar = HWY_CAPPED(float, 1); |
297 | 3.32k | auto vscalar = Load(DScalar(), &v); |
298 | 3.32k | return GetLane(MaskingSqrt(DScalar(), vscalar)); |
299 | 3.32k | } enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::MaskingSqrt(float) Line | Count | Source | 295 | 3.32k | float MaskingSqrt(const float v) { | 296 | 3.32k | using DScalar = HWY_CAPPED(float, 1); | 297 | 3.32k | auto vscalar = Load(DScalar(), &v); | 298 | 3.32k | return GetLane(MaskingSqrt(DScalar(), vscalar)); | 299 | 3.32k | } |
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::MaskingSqrt(float) Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::MaskingSqrt(float) |
300 | | |
// Inserts `v` into the sorted quadruple (min0 <= min1 <= min2 <= min3) of
// smallest values seen so far, dropping the previous maximum when `v` is
// smaller than it. No-op when `v` is not below min3.
void StoreMin4(const float v, float& min0, float& min1, float& min2,
               float& min3) {
  if (v >= min3) return;
  // Replace the current maximum, then bubble the new value down to its
  // sorted position.
  min3 = v;
  if (min3 < min2) std::swap(min3, min2);
  if (min2 < min1) std::swap(min2, min1);
  if (min1 < min0) std::swap(min1, min0);
}
321 | | |
322 | | // Look for smooth areas near the area of degradation. |
323 | | // If the areas are generally smooth, don't do masking. |
324 | | // Output is downsampled 2x. |
// Erodes `from` (within from_rect) with a "fuzzy" minimum: each output is a
// weighted blend of the four smallest values of the 3x3 neighbourhood, so
// smooth areas near degradation suppress masking. Output is the 2x-downsampled
// sum of the per-pixel results, written into `to` at to_rect.
void FuzzyErosion(const float butteraugli_target, const Rect& from_rect,
                  const ImageF& from, const Rect& to_rect, ImageF* to) {
  const size_t xsize = from.xsize();
  const size_t ysize = from.ysize();
  constexpr int kStep = 1;
  static_assert(kStep == 1, "Step must be 1");
  JXL_ASSERT(to_rect.xsize() * 2 == from_rect.xsize());
  JXL_ASSERT(to_rect.ysize() * 2 == from_rect.ysize());
  // Base weights for the four smallest neighbourhood values, plus the
  // adjustment blended in as quality (butteraugli target) improves.
  static const float kMulBase0 = 0.125;
  static const float kMulBase1 = 0.10;
  static const float kMulBase2 = 0.09;
  static const float kMulBase3 = 0.06;
  static const float kMulAdd0 = 0.0;
  static const float kMulAdd1 = -0.10;
  static const float kMulAdd2 = -0.09;
  static const float kMulAdd3 = -0.06;

  // Interpolation factor: 0 for butteraugli_target >= 2.0, rising linearly
  // to 1 at target 0 (higher quality concentrates weight on the minimum).
  float mul = 0.0;
  if (butteraugli_target < 2.0f) {
    mul = (2.0f - butteraugli_target) * (1.0f / 2.0f);
  }
  float kMul0 = kMulBase0 + mul * kMulAdd0;
  float kMul1 = kMulBase1 + mul * kMulAdd1;
  float kMul2 = kMulBase2 + mul * kMulAdd2;
  float kMul3 = kMulBase3 + mul * kMulAdd3;
  // Renormalize so the four weights always sum to kTotal regardless of mul.
  static const float kTotal = 0.29959705784054957;
  float norm = kTotal / (kMul0 + kMul1 + kMul2 + kMul3);
  kMul0 *= norm;
  kMul1 *= norm;
  kMul2 *= norm;
  kMul3 *= norm;

  for (size_t fy = 0; fy < from_rect.ysize(); ++fy) {
    size_t y = fy + from_rect.y0();
    // Clamp neighbour rows/columns at the image border (replicate edge).
    size_t ym1 = y >= kStep ? y - kStep : y;
    size_t yp1 = y + kStep < ysize ? y + kStep : y;
    const float* rowt = from.Row(ym1);
    const float* row = from.Row(y);
    const float* rowb = from.Row(yp1);
    float* row_out = to_rect.Row(to, fy / 2);
    for (size_t fx = 0; fx < from_rect.xsize(); ++fx) {
      size_t x = fx + from_rect.x0();
      size_t xm1 = x >= kStep ? x - kStep : x;
      size_t xp1 = x + kStep < xsize ? x + kStep : x;
      float min0 = row[x];
      float min1 = row[xm1];
      float min2 = row[xp1];
      float min3 = rowt[xm1];
      // Sort the first four values.
      if (min0 > min1) std::swap(min0, min1);
      if (min0 > min2) std::swap(min0, min2);
      if (min0 > min3) std::swap(min0, min3);
      if (min1 > min2) std::swap(min1, min2);
      if (min1 > min3) std::swap(min1, min3);
      if (min2 > min3) std::swap(min2, min3);
      // The remaining five values of a 3x3 neighbourhood.
      StoreMin4(rowt[x], min0, min1, min2, min3);
      StoreMin4(rowt[xp1], min0, min1, min2, min3);
      StoreMin4(rowb[xm1], min0, min1, min2, min3);
      StoreMin4(rowb[x], min0, min1, min2, min3);
      StoreMin4(rowb[xp1], min0, min1, min2, min3);

      // Weighted blend of the four smallest values; each 2x2 input group is
      // accumulated into a single output pixel (the 2x downsample).
      float v = kMul0 * min0 + kMul1 * min1 + kMul2 * min2 + kMul3 * min3;
      if (fx % 2 == 0 && fy % 2 == 0) {
        row_out[fx / 2] = v;
      } else {
        row_out[fx / 2] += v;
      }
    }
  }
}
396 | | |
// Scratch state and output for the adaptive quantization pass: per-thread
// pre-erosion tiles and diff rows, plus the per-block quantization map.
struct AdaptiveQuantizationImpl {
  // Allocates the per-thread scratch images: one diff_buffer row per thread
  // and one pre_erosion tile per thread (grown idempotently up to
  // num_threads). Returns an error status if allocation fails.
  Status PrepareBuffers(JxlMemoryManager* memory_manager, size_t num_threads) {
    JXL_ASSIGN_OR_RETURN(
        diff_buffer,
        ImageF::Create(memory_manager, kEncTileDim + 8, num_threads));
    for (size_t i = pre_erosion.size(); i < num_threads; i++) {
      JXL_ASSIGN_OR_RETURN(
          ImageF tmp,
          ImageF::Create(memory_manager, kEncTileDimInBlocks * 2 + 2,
                         kEncTileDimInBlocks * 2 + 2));
      pre_erosion.emplace_back(std::move(tmp));
    }
    return true;
  }

  // Computes, for one tile, the per-pixel mask1x1 image, the per-block
  // ac-strategy mask, and the aq_map entries. rect_in is the tile's pixel
  // rect (8-pixel aligned), rect_out the corresponding block rect; `thread`
  // selects the per-thread scratch buffers prepared by PrepareBuffers.
  void ComputeTile(float butteraugli_target, float scale, const Image3F& xyb,
                   const Rect& rect_in, const Rect& rect_out, const int thread,
                   ImageF* mask, ImageF* mask1x1) {
    JXL_ASSERT(rect_in.x0() % 8 == 0);
    JXL_ASSERT(rect_in.y0() % 8 == 0);
    const size_t xsize = xyb.xsize();
    const size_t ysize = xyb.ysize();

    // The XYB gamma is 3.0 to be able to decode faster with two muls.
    // Butteraugli's gamma is matching the gamma of human eye, around 2.6.
    // We approximate the gamma difference by adding one cubic root into
    // the adaptive quantization. This gives us a total gamma of 2.6666
    // for quantization uses.
    const float match_gamma_offset = 0.019;

    const HWY_FULL(float) df;

    // Absolute pixel bounds of the 1x1-mask pass; widened by 2 pixels on
    // sides that border neighbouring tiles (interior edges only).
    size_t y_start_1x1 = rect_in.y0() + rect_out.y0() * 8;
    size_t y_end_1x1 = y_start_1x1 + rect_out.ysize() * 8;

    size_t x_start_1x1 = rect_in.x0() + rect_out.x0() * 8;
    size_t x_end_1x1 = x_start_1x1 + rect_out.xsize() * 8;

    if (rect_in.x0() != 0 && rect_out.x0() == 0) x_start_1x1 -= 2;
    if (rect_in.x1() < xsize && rect_out.x1() * 8 == rect_in.xsize()) {
      x_end_1x1 += 2;
    }
    if (rect_in.y0() != 0 && rect_out.y0() == 0) y_start_1x1 -= 2;
    if (rect_in.y1() < ysize && rect_out.y1() * 8 == rect_in.ysize()) {
      y_end_1x1 += 2;
    }

    // Computes image (padded to multiple of 8x8) of local pixel differences.
    // Subsample both directions by 4.
    // 1x1 Laplacian of intensity.
    for (size_t y = y_start_1x1; y < y_end_1x1; ++y) {
      const size_t y2 = y + 1 < ysize ? y + 1 : y;
      const size_t y1 = y > 0 ? y - 1 : y;
      // Plane 1 of XYB is the intensity (Y) channel.
      const float* row_in = xyb.ConstPlaneRow(1, y);
      const float* row_in1 = xyb.ConstPlaneRow(1, y1);
      const float* row_in2 = xyb.ConstPlaneRow(1, y2);
      float* mask1x1_out = mask1x1->Row(y);
      // Writes 1 / (log1p(|Laplacian|) + kOffset): large where the
      // neighbourhood is flat, small where it is busy.
      auto scalar_pixel1x1 = [&](size_t x) {
        const size_t x2 = x + 1 < xsize ? x + 1 : x;
        const size_t x1 = x > 0 ? x - 1 : x;
        const float base =
            0.25f * (row_in2[x] + row_in1[x] + row_in[x1] + row_in[x2]);
        const float gammac = RatioOfDerivativesOfCubicRootToSimpleGamma(
            row_in[x] + match_gamma_offset);
        float diff = fabs(gammac * (row_in[x] - base));
        static const double kScaler = 1.0;
        diff *= kScaler;
        diff = log1p(diff);
        static const float kMul = 1.0;
        static const float kOffset = 0.01;
        mask1x1_out[x] = kMul / (diff + kOffset);
      };
      for (size_t x = x_start_1x1; x < x_end_1x1; ++x) {
        scalar_pixel1x1(x);
      }
    }

    // Bounds for the pre-erosion pass; padded by 4 pixels on interior edges
    // so the 4x subsampling below has full support.
    size_t y_start = rect_in.y0() + rect_out.y0() * 8;
    size_t y_end = y_start + rect_out.ysize() * 8;

    size_t x_start = rect_in.x0() + rect_out.x0() * 8;
    size_t x_end = x_start + rect_out.xsize() * 8;

    if (x_start != 0) x_start -= 4;
    if (x_end != xsize) x_end += 4;
    if (y_start != 0) y_start -= 4;
    if (y_end != ysize) y_end += 4;
    pre_erosion[thread].ShrinkTo((x_end - x_start) / 4, (y_end - y_start) / 4);

    static const float limit = 0.2f;
    for (size_t y = y_start; y < y_end; ++y) {
      size_t y2 = y + 1 < ysize ? y + 1 : y;
      size_t y1 = y > 0 ? y - 1 : y;

      const float* row_in = xyb.ConstPlaneRow(1, y);
      const float* row_in1 = xyb.ConstPlaneRow(1, y1);
      const float* row_in2 = xyb.ConstPlaneRow(1, y2);
      float* JXL_RESTRICT row_out = diff_buffer.Row(thread);

      // Accumulates clamped, MaskingSqrt-compressed squared Laplacian into
      // row_out; rows are summed in groups of 4 (reset when y % 4 == 0).
      auto scalar_pixel = [&](size_t x) {
        const size_t x2 = x + 1 < xsize ? x + 1 : x;
        const size_t x1 = x > 0 ? x - 1 : x;
        const float base =
            0.25f * (row_in2[x] + row_in1[x] + row_in[x1] + row_in[x2]);
        const float gammac = RatioOfDerivativesOfCubicRootToSimpleGamma(
            row_in[x] + match_gamma_offset);
        float diff = gammac * (row_in[x] - base);
        diff *= diff;
        if (diff >= limit) {
          diff = limit;
        }
        diff = MaskingSqrt(diff);
        if ((y % 4) != 0) {
          row_out[x - x_start] += diff;
        } else {
          row_out[x - x_start] = diff;
        }
      };

      size_t x = x_start;
      // First pixel of the row.
      if (x_start == 0) {
        scalar_pixel(x_start);
        ++x;
      }
      // SIMD: same computation as scalar_pixel, a full vector at a time.
      // The "+ 1" in the bound keeps the x+1 neighbour load in range.
      const auto match_gamma_offset_v = Set(df, match_gamma_offset);
      const auto quarter = Set(df, 0.25f);
      for (; x + 1 + Lanes(df) < x_end; x += Lanes(df)) {
        const auto in = LoadU(df, row_in + x);
        const auto in_r = LoadU(df, row_in + x + 1);
        const auto in_l = LoadU(df, row_in + x - 1);
        const auto in_t = LoadU(df, row_in2 + x);
        const auto in_b = LoadU(df, row_in1 + x);
        auto base = Mul(quarter, Add(Add(in_r, in_l), Add(in_t, in_b)));
        auto gammacv =
            RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/false>(
                df, Add(in, match_gamma_offset_v));
        auto diff = Mul(gammacv, Sub(in, base));
        diff = Mul(diff, diff);
        diff = Min(diff, Set(df, limit));
        diff = MaskingSqrt(df, diff);
        if ((y & 3) != 0) {
          diff = Add(diff, LoadU(df, row_out + x - x_start));
        }
        StoreU(diff, df, row_out + x - x_start);
      }
      // Scalar tail.
      for (; x < x_end; ++x) {
        scalar_pixel(x);
      }
      // Every 4th row, fold the accumulated 4-row sums into pre_erosion,
      // averaging horizontally over groups of 4 (net 4x4 subsampling).
      if (y % 4 == 3) {
        float* row_dout = pre_erosion[thread].Row((y - y_start) / 4);
        for (size_t x = 0; x < (x_end - x_start) / 4; x++) {
          row_dout[x] = (row_out[x * 4] + row_out[x * 4 + 1] +
                         row_out[x * 4 + 2] + row_out[x * 4 + 3]) *
                        0.25f;
        }
      }
    }
    // Skip the padding column/row of pre_erosion when the tile was padded
    // (x_start/y_start not 8-aligned after the -4 adjustment above).
    Rect from_rect(x_start % 8 == 0 ? 0 : 1, y_start % 8 == 0 ? 0 : 1,
                   rect_out.xsize() * 2, rect_out.ysize() * 2);
    FuzzyErosion(butteraugli_target, from_rect, pre_erosion[thread], rect_out,
                 &aq_map);
    for (size_t y = 0; y < rect_out.ysize(); ++y) {
      const float* aq_map_row = rect_out.ConstRow(aq_map, y);
      float* mask_row = rect_out.Row(mask, y);
      for (size_t x = 0; x < rect_out.xsize(); ++x) {
        mask_row[x] = ComputeMaskForAcStrategyUse(aq_map_row[x]);
      }
    }
    PerBlockModulations(butteraugli_target, xyb.Plane(0), xyb.Plane(1),
                        xyb.Plane(2), rect_in, scale, rect_out, &aq_map);
  }
  std::vector<ImageF> pre_erosion;  // per-thread 4x4-subsampled diff tiles
  ImageF aq_map;                    // per-block adaptive quantization field
  ImageF diff_buffer;               // one accumulation row per thread
};
575 | | |
576 | | Status Blur1x1Masking(JxlMemoryManager* memory_manager, ThreadPool* pool, |
577 | 52 | ImageF* mask1x1, const Rect& rect) { |
578 | | // Blur the mask1x1 to obtain the masking image. |
579 | | // Before blurring it contains an image of absolute value of the |
580 | | // Laplacian of the intensity channel. |
581 | 52 | static const float kFilterMask1x1[5] = { |
582 | 52 | static_cast<float>(0.25647067633737227), |
583 | 52 | static_cast<float>(0.2050056912354399075), |
584 | 52 | static_cast<float>(0.154082048668497307), |
585 | 52 | static_cast<float>(0.08149576591362004441), |
586 | 52 | static_cast<float>(0.0512750104812308467), |
587 | 52 | }; |
588 | 52 | double sum = |
589 | 52 | 1.0 + 4 * (kFilterMask1x1[0] + kFilterMask1x1[1] + kFilterMask1x1[2] + |
590 | 52 | kFilterMask1x1[4] + 2 * kFilterMask1x1[3]); |
591 | 52 | if (sum < 1e-5) { |
592 | 0 | sum = 1e-5; |
593 | 0 | } |
594 | 52 | const float normalize = static_cast<float>(1.0 / sum); |
595 | 52 | const float normalize_mul = normalize; |
596 | 52 | WeightsSymmetric5 weights = |
597 | 52 | WeightsSymmetric5{{HWY_REP4(normalize)}, |
598 | 52 | {HWY_REP4(normalize_mul * kFilterMask1x1[0])}, |
599 | 52 | {HWY_REP4(normalize_mul * kFilterMask1x1[2])}, |
600 | 52 | {HWY_REP4(normalize_mul * kFilterMask1x1[1])}, |
601 | 52 | {HWY_REP4(normalize_mul * kFilterMask1x1[4])}, |
602 | 52 | {HWY_REP4(normalize_mul * kFilterMask1x1[3])}}; |
603 | 52 | JXL_ASSIGN_OR_RETURN( |
604 | 52 | ImageF temp, ImageF::Create(memory_manager, rect.xsize(), rect.ysize())); |
605 | 52 | Symmetric5(*mask1x1, rect, weights, pool, &temp); |
606 | 52 | *mask1x1 = std::move(temp); |
607 | 52 | return true; |
608 | 52 | } enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::Blur1x1Masking(JxlMemoryManagerStruct*, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&) Line | Count | Source | 577 | 52 | ImageF* mask1x1, const Rect& rect) { | 578 | | // Blur the mask1x1 to obtain the masking image. | 579 | | // Before blurring it contains an image of absolute value of the | 580 | | // Laplacian of the intensity channel. | 581 | 52 | static const float kFilterMask1x1[5] = { | 582 | 52 | static_cast<float>(0.25647067633737227), | 583 | 52 | static_cast<float>(0.2050056912354399075), | 584 | 52 | static_cast<float>(0.154082048668497307), | 585 | 52 | static_cast<float>(0.08149576591362004441), | 586 | 52 | static_cast<float>(0.0512750104812308467), | 587 | 52 | }; | 588 | 52 | double sum = | 589 | 52 | 1.0 + 4 * (kFilterMask1x1[0] + kFilterMask1x1[1] + kFilterMask1x1[2] + | 590 | 52 | kFilterMask1x1[4] + 2 * kFilterMask1x1[3]); | 591 | 52 | if (sum < 1e-5) { | 592 | 0 | sum = 1e-5; | 593 | 0 | } | 594 | 52 | const float normalize = static_cast<float>(1.0 / sum); | 595 | 52 | const float normalize_mul = normalize; | 596 | 52 | WeightsSymmetric5 weights = | 597 | 52 | WeightsSymmetric5{{HWY_REP4(normalize)}, | 598 | 52 | {HWY_REP4(normalize_mul * kFilterMask1x1[0])}, | 599 | 52 | {HWY_REP4(normalize_mul * kFilterMask1x1[2])}, | 600 | 52 | {HWY_REP4(normalize_mul * kFilterMask1x1[1])}, | 601 | 52 | {HWY_REP4(normalize_mul * kFilterMask1x1[4])}, | 602 | 52 | {HWY_REP4(normalize_mul * kFilterMask1x1[3])}}; | 603 | 52 | JXL_ASSIGN_OR_RETURN( | 604 | 52 | ImageF temp, ImageF::Create(memory_manager, rect.xsize(), rect.ysize())); | 605 | 52 | Symmetric5(*mask1x1, rect, weights, pool, &temp); | 606 | 52 | *mask1x1 = std::move(temp); | 607 | 52 | return true; | 608 | 52 | } |
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::Blur1x1Masking(JxlMemoryManagerStruct*, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&) Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::Blur1x1Masking(JxlMemoryManagerStruct*, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&) |
609 | | |
// Computes the per-8x8-block adaptive quantization field for `rect` of the
// XYB image, processing tiles of kEncTileDimInBlocks x kEncTileDimInBlocks
// blocks in parallel. Also fills `mask` (per-block masking) and `mask1x1`
// (per-pixel masking, blurred as a final step).
StatusOr<ImageF> AdaptiveQuantizationMap(const float butteraugli_target,
                                         const Image3F& xyb, const Rect& rect,
                                         float scale, ThreadPool* pool,
                                         ImageF* mask, ImageF* mask1x1) {
  JXL_DASSERT(rect.xsize() % kBlockDim == 0);
  JXL_DASSERT(rect.ysize() % kBlockDim == 0);
  AdaptiveQuantizationImpl impl;
  const size_t xsize_blocks = rect.xsize() / kBlockDim;
  const size_t ysize_blocks = rect.ysize() / kBlockDim;
  JxlMemoryManager* memory_manager = xyb.memory_manager();
  // aq_map and mask are at block resolution; mask1x1 is at pixel resolution.
  JXL_ASSIGN_OR_RETURN(
      impl.aq_map, ImageF::Create(memory_manager, xsize_blocks, ysize_blocks));
  JXL_ASSIGN_OR_RETURN(
      *mask, ImageF::Create(memory_manager, xsize_blocks, ysize_blocks));
  JXL_ASSIGN_OR_RETURN(
      *mask1x1, ImageF::Create(memory_manager, xyb.xsize(), xyb.ysize()));
  JXL_CHECK(RunOnPool(
      pool, 0,
      DivCeil(xsize_blocks, kEncTileDimInBlocks) *
          DivCeil(ysize_blocks, kEncTileDimInBlocks),
      [&](const size_t num_threads) {
        // Allocate per-thread scratch buffers used by ComputeTile.
        return !!impl.PrepareBuffers(memory_manager, num_threads);
      },
      [&](const uint32_t tid, const size_t thread) {
        // Map the linear tile id to tile coordinates (tx, ty) and derive the
        // block range [bx0, bx1) x [by0, by1) this tile covers, clamped to
        // the image extent.
        size_t n_enc_tiles = DivCeil(xsize_blocks, kEncTileDimInBlocks);
        size_t tx = tid % n_enc_tiles;
        size_t ty = tid / n_enc_tiles;
        size_t by0 = ty * kEncTileDimInBlocks;
        size_t by1 = std::min((ty + 1) * kEncTileDimInBlocks, ysize_blocks);
        size_t bx0 = tx * kEncTileDimInBlocks;
        size_t bx1 = std::min((tx + 1) * kEncTileDimInBlocks, xsize_blocks);
        Rect rect_out(bx0, by0, bx1 - bx0, by1 - by0);
        impl.ComputeTile(butteraugli_target, scale, xyb, rect, rect_out, thread,
                         mask, mask1x1);
      },
      "AQ DiffPrecompute"));

  // Post-process the per-pixel mask with a normalized 5x5 blur.
  JXL_RETURN_IF_ERROR(Blur1x1Masking(memory_manager, pool, mask1x1, rect));
  return std::move(impl).aq_map;
}
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, float, jxl::ThreadPool*, jxl::Plane<float>*, jxl::Plane<float>*) Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, float, jxl::ThreadPool*, jxl::Plane<float>*, jxl::Plane<float>*) |
650 | | |
651 | | } // namespace |
652 | | |
653 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
654 | | } // namespace HWY_NAMESPACE |
655 | | } // namespace jxl |
656 | | HWY_AFTER_NAMESPACE(); |
657 | | |
658 | | #if HWY_ONCE |
659 | | namespace jxl { |
660 | | HWY_EXPORT(AdaptiveQuantizationMap); |
661 | | |
662 | | namespace { |
663 | | |
664 | | // If true, prints the quantization maps at each iteration. |
665 | | constexpr bool FLAGS_dump_quant_state = false; |
666 | | |
// Renders `image` as a color heat map (values below `good_threshold` good,
// above `bad_threshold` bad) and dumps it under `label` suffixed with the
// current butteraugli iteration number. No-op unless
// JXL_DEBUG_ADAPTIVE_QUANTIZATION is enabled.
Status DumpHeatmap(const CompressParams& cparams, const AuxOut* aux_out,
                   const std::string& label, const ImageF& image,
                   float good_threshold, float bad_threshold) {
  if (JXL_DEBUG_ADAPTIVE_QUANTIZATION) {
    JXL_ASSIGN_OR_RETURN(
        Image3F heatmap,
        CreateHeatMapImage(image, good_threshold, bad_threshold));
    // Filename is "<label><iteration>" with the iteration zero-padded.
    char filename[200];
    snprintf(filename, sizeof(filename), "%s%05d", label.c_str(),
             aux_out->num_butteraugli_iters);
    JXL_RETURN_IF_ERROR(DumpImage(cparams, filename, heatmap));
  }
  return true;
}
681 | | |
// Dumps debug heat maps for the inverse quant field, the per-tile butteraugli
// distances and the raw butteraugli diffmap. No-op unless
// JXL_DEBUG_ADAPTIVE_QUANTIZATION is enabled and debug output is requested.
Status DumpHeatmaps(const CompressParams& cparams, const AuxOut* aux_out,
                    float ba_target, const ImageF& quant_field,
                    const ImageF& tile_heatmap, const ImageF& bt_diffmap) {
  JxlMemoryManager* memory_manager = quant_field.memory_manager();
  if (JXL_DEBUG_ADAPTIVE_QUANTIZATION) {
    if (!WantDebugOutput(cparams)) return true;
    // Invert the quant field so larger values mean coarser quantization.
    JXL_ASSIGN_OR_RETURN(ImageF inv_qmap,
                         ImageF::Create(memory_manager, quant_field.xsize(),
                                        quant_field.ysize()));
    for (size_t y = 0; y < quant_field.ysize(); ++y) {
      const float* JXL_RESTRICT row_q = quant_field.ConstRow(y);
      float* JXL_RESTRICT row_inv_q = inv_qmap.Row(y);
      for (size_t x = 0; x < quant_field.xsize(); ++x) {
        row_inv_q[x] = 1.0f / row_q[x];  // never zero
      }
    }
    JXL_RETURN_IF_ERROR(DumpHeatmap(cparams, aux_out, "quant_heatmap", inv_qmap,
                                    4.0f * ba_target, 6.0f * ba_target));
    JXL_RETURN_IF_ERROR(DumpHeatmap(cparams, aux_out, "tile_heatmap",
                                    tile_heatmap, ba_target, 1.5f * ba_target));
    // matches heat maps produced by the command line tool.
    JXL_RETURN_IF_ERROR(DumpHeatmap(cparams, aux_out, "bt_diffmap", bt_diffmap,
                                    ButteraugliFuzzyInverse(1.5),
                                    ButteraugliFuzzyInverse(0.5)));
  }
  return true;
}
709 | | |
// Reduces the per-pixel butteraugli `distmap` to one value per tile
// (`tile_size` pixels square), taking the AC strategy into account: all
// blocks covered by one variable-size block share the same distance value.
// `margin` extra pixels around each tile are included with reduced weight.
StatusOr<ImageF> TileDistMap(const ImageF& distmap, int tile_size, int margin,
                             const AcStrategyImage& ac_strategy) {
  const int tile_xsize = (distmap.xsize() + tile_size - 1) / tile_size;
  const int tile_ysize = (distmap.ysize() + tile_size - 1) / tile_size;
  JxlMemoryManager* memory_manager = distmap.memory_manager();
  JXL_ASSIGN_OR_RETURN(ImageF tile_distmap,
                       ImageF::Create(memory_manager, tile_xsize, tile_ysize));
  size_t distmap_stride = tile_distmap.PixelsPerRow();
  for (int tile_y = 0; tile_y < tile_ysize; ++tile_y) {
    AcStrategyRow ac_strategy_row = ac_strategy.ConstRow(tile_y);
    float* JXL_RESTRICT dist_row = tile_distmap.Row(tile_y);
    for (int tile_x = 0; tile_x < tile_xsize; ++tile_x) {
      AcStrategy acs = ac_strategy_row[tile_x];
      // Only process each variable-size block once, at its top-left corner.
      if (!acs.IsFirstBlock()) continue;
      int this_tile_xsize = acs.covered_blocks_x() * tile_size;
      int this_tile_ysize = acs.covered_blocks_y() * tile_size;
      // Pixel range of the block plus margin, clamped to the image.
      int y_begin = std::max<int>(0, tile_size * tile_y - margin);
      int y_end = std::min<int>(distmap.ysize(),
                                tile_size * tile_y + this_tile_ysize + margin);
      int x_begin = std::max<int>(0, tile_size * tile_x - margin);
      int x_end = std::min<int>(distmap.xsize(),
                                tile_size * tile_x + this_tile_xsize + margin);
      float dist_norm = 0.0;
      double pixels = 0;
      for (int y = y_begin; y < y_end; ++y) {
        float ymul = 1.0;
        // Margin pixels get reduced weight; pixels on both a horizontal and
        // a vertical border (corners) are reduced further.
        constexpr float kBorderMul = 0.98f;
        constexpr float kCornerMul = 0.7f;
        if (margin != 0 && (y == y_begin || y == y_end - 1)) {
          ymul = kBorderMul;
        }
        const float* const JXL_RESTRICT row = distmap.Row(y);
        for (int x = x_begin; x < x_end; ++x) {
          float xmul = ymul;
          if (margin != 0 && (x == x_begin || x == x_end - 1)) {
            if (xmul == 1.0) {
              xmul = kBorderMul;
            } else {
              xmul = kCornerMul;
            }
          }
          // Accumulate v^16 (four repeated squarings) for a 16th-power norm.
          float v = row[x];
          v *= v;
          v *= v;
          v *= v;
          v *= v;
          dist_norm += xmul * v;
          pixels += xmul;
        }
      }
      if (pixels == 0) pixels = 1;
      // 16th norm is less than the max norm, we reduce the difference
      // with this normalization factor.
      constexpr float kTileNorm = 1.2f;
      const float tile_dist =
          kTileNorm * std::pow(dist_norm / pixels, 1.0f / 16.0f);
      dist_row[tile_x] = tile_dist;
      // Replicate the value to every tile covered by this strategy block.
      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
        for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
          dist_row[tile_x + distmap_stride * iy + ix] = tile_dist;
        }
      }
    }
  }
  return tile_distmap;
}
776 | | |
// Exponent of the non-linear butteraugli-target-to-DC-target mapping used in
// InitialQuantDC.
const float kDcQuantPow = 0.83f;
// Numerator of the DC quant value computed in InitialQuantDC.
const float kDcQuant = 1.095924047623553f;
// Base AC quantization multiplier; divided by the butteraugli target in
// InitialQuantField.
const float kAcQuant = 0.725f;
780 | | |
781 | | // Computes the decoded image for a given set of compression parameters. |
// Computes the decoded image for a given set of compression parameters.
// Encodes `opsin` with the current state of `enc_state` and immediately
// decodes it group by group through the render pipeline, returning the
// reconstructed image for butteraugli comparison.
StatusOr<ImageBundle> RoundtripImage(const FrameHeader& frame_header,
                                     const Image3F& opsin,
                                     PassesEncoderState* enc_state,
                                     const JxlCmsInterface& cms,
                                     ThreadPool* pool) {
  JxlMemoryManager* memory_manager = enc_state->memory_manager();
  std::unique_ptr<PassesDecoderState> dec_state =
      jxl::make_unique<PassesDecoderState>(memory_manager);
  JXL_CHECK(dec_state->output_encoding_info.SetFromMetadata(
      *enc_state->shared.metadata));
  // The decoder state borrows the encoder's shared state (quant tables,
  // AC strategy, etc.), so decoding reflects the current encoder settings.
  dec_state->shared = &enc_state->shared;
  JXL_ASSERT(opsin.ysize() % kBlockDim == 0);

  const size_t xsize_groups = DivCeil(opsin.xsize(), kGroupDim);
  const size_t ysize_groups = DivCeil(opsin.ysize(), kGroupDim);
  const size_t num_groups = xsize_groups * ysize_groups;

  // Remember how many special frames exist so any created during this
  // roundtrip can be discarded at the end.
  size_t num_special_frames = enc_state->special_frames.size();
  size_t num_passes = enc_state->progressive_splitter.GetNumPasses();
  ModularFrameEncoder modular_frame_encoder(memory_manager, frame_header,
                                            enc_state->cparams, false);
  JXL_CHECK(InitializePassesEncoder(frame_header, opsin, Rect(opsin), cms, pool,
                                    enc_state, &modular_frame_encoder,
                                    nullptr));
  JXL_CHECK(dec_state->Init(frame_header));
  JXL_CHECK(dec_state->InitForAC(num_passes, pool));

  ImageBundle decoded(memory_manager, &enc_state->shared.metadata->m);
  decoded.origin = frame_header.frame_origin;
  JXL_ASSIGN_OR_RETURN(
      Image3F tmp,
      Image3F::Create(memory_manager, opsin.xsize(), opsin.ysize()));
  decoded.SetFromImage(std::move(tmp),
                       dec_state->output_encoding_info.color_encoding);

  // Use the fast render pipeline and skip cosmetic stages that do not
  // matter for the quality comparison.
  PassesDecoderState::PipelineOptions options;
  options.use_slow_render_pipeline = false;
  options.coalescing = false;
  options.render_spotcolors = false;
  options.render_noise = false;

  // Same as frame_header.nonserialized_metadata->m
  const ImageMetadata& metadata = *decoded.metadata();

  JXL_CHECK(dec_state->PreparePipeline(
      frame_header, &enc_state->shared.metadata->m, &decoded, options));

  hwy::AlignedUniquePtr<GroupDecCache[]> group_dec_caches;
  const auto allocate_storage = [&](const size_t num_threads) -> Status {
    JXL_RETURN_IF_ERROR(
        dec_state->render_pipeline->PrepareForThreads(num_threads,
                                                      /*use_group_ids=*/false));
    group_dec_caches = hwy::MakeUniqueAlignedArray<GroupDecCache>(num_threads);
    return true;
  };
  std::atomic<bool> has_error{false};
  const auto process_group = [&](const uint32_t group_index,
                                 const size_t thread) {
    if (has_error) return;
    if (frame_header.loop_filter.epf_iters > 0) {
      // Edge-preserving filter needs per-block sigma before decoding.
      ComputeSigma(frame_header.loop_filter,
                   dec_state->shared->frame_dim.BlockGroupRect(group_index),
                   dec_state.get());
    }
    RenderPipelineInput input =
        dec_state->render_pipeline->GetInputBuffers(group_index, thread);
    JXL_CHECK(DecodeGroupForRoundtrip(
        frame_header, enc_state->coeffs, group_index, dec_state.get(),
        &group_dec_caches[thread], thread, input, nullptr, nullptr));
    // Extra channels are irrelevant for the comparison; fill with zeros.
    for (size_t c = 0; c < metadata.num_extra_channels; c++) {
      std::pair<ImageF*, Rect> ri = input.GetBuffer(3 + c);
      FillPlane(0.0f, ri.first, ri.second);
    }
    if (!input.Done()) {
      has_error = true;
      return;
    }
  };
  JXL_CHECK(RunOnPool(pool, 0, num_groups, allocate_storage, process_group,
                      "AQ loop"));
  if (has_error) return JXL_FAILURE("AQ loop failure");

  // Ensure we don't create any new special frames.
  enc_state->special_frames.resize(num_special_frames);

  return decoded;
}
869 | | |
// Maximum number of quant-field adjustment iterations in the butteraugli
// feedback loops; one extra final evaluation round runs after the last
// adjustment (the loops iterate to `iters + 1`).
constexpr int kMaxButteraugliIters = 4;
871 | | |
// Iteratively refines `quant_field` so that the butteraugli distance of the
// roundtripped image approaches the requested target: each iteration encodes
// and decodes the image, compares it against `linear` with butteraugli, and
// scales the per-block quant values by the local distance ratio.
Status FindBestQuantization(const FrameHeader& frame_header,
                            const Image3F& linear, const Image3F& opsin,
                            ImageF& quant_field, PassesEncoderState* enc_state,
                            const JxlCmsInterface& cms, ThreadPool* pool,
                            AuxOut* aux_out) {
  const CompressParams& cparams = enc_state->cparams;
  if (cparams.resampling > 1 &&
      cparams.original_butteraugli_distance <= 4.0 * cparams.resampling) {
    // For downsampled opsin image, the butteraugli based adaptive quantization
    // loop would only make the size bigger without improving the distance much,
    // so in this case we enable it only for very high butteraugli targets.
    return true;
  }
  JxlMemoryManager* memory_manager = enc_state->memory_manager();
  Quantizer& quantizer = enc_state->shared.quantizer;
  ImageI& raw_quant_field = enc_state->shared.raw_quant_field;

  const float butteraugli_target = cparams.butteraugli_distance;
  const float original_butteraugli = cparams.original_butteraugli_distance;
  ButteraugliParams params;
  params.intensity_target = 80.f;
  JxlButteraugliComparator comparator(params, cms);
  JXL_CHECK(comparator.SetLinearReferenceImage(linear));
  // Scores are normalized below so that lower always means better.
  bool lower_is_better =
      (comparator.GoodQualityScore() < comparator.BadQualityScore());
  const float initial_quant_dc = InitialQuantDC(butteraugli_target);
  AdjustQuantField(enc_state->shared.ac_strategy, Rect(quant_field),
                   original_butteraugli, &quant_field);
  ImageF tile_distmap;
  // Keep a copy of the initial field; it is used later to bound how far the
  // optimization may drift from the initial guess.
  JXL_ASSIGN_OR_RETURN(
      ImageF initial_quant_field,
      ImageF::Create(memory_manager, quant_field.xsize(), quant_field.ysize()));
  CopyImageTo(quant_field, &initial_quant_field);

  // Derive lower/upper clamps [qf_lower, qf_higher] for quant values from the
  // dynamic range of the initial field, with an asymmetry factor favoring the
  // upper side.
  float initial_qf_min;
  float initial_qf_max;
  ImageMinMax(initial_quant_field, &initial_qf_min, &initial_qf_max);
  float initial_qf_ratio = initial_qf_max / initial_qf_min;
  float qf_max_deviation_low = std::sqrt(250 / initial_qf_ratio);
  float asymmetry = 2;
  if (qf_max_deviation_low < asymmetry) asymmetry = qf_max_deviation_low;
  float qf_lower = initial_qf_min / (asymmetry * qf_max_deviation_low);
  float qf_higher = initial_qf_max * (qf_max_deviation_low / asymmetry);

  JXL_ASSERT(qf_higher / qf_lower < 253);

  constexpr int kOriginalComparisonRound = 1;
  int iters = kMaxButteraugliIters;
  if (cparams.speed_tier != SpeedTier::kTortoise) {
    iters = 2;
  }
  for (int i = 0; i < iters + 1; ++i) {
    if (JXL_DEBUG_ADAPTIVE_QUANTIZATION) {
      printf("\nQuantization field:\n");
      for (size_t y = 0; y < quant_field.ysize(); ++y) {
        for (size_t x = 0; x < quant_field.xsize(); ++x) {
          printf(" %.5f", quant_field.Row(y)[x]);
        }
        printf("\n");
      }
    }
    // Apply the current field, roundtrip, and measure the distance map.
    quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field);
    JXL_ASSIGN_OR_RETURN(
        ImageBundle dec_linear,
        RoundtripImage(frame_header, opsin, enc_state, cms, pool));
    float score;
    ImageF diffmap;
    JXL_CHECK(comparator.CompareWith(dec_linear, &diffmap, &score));
    if (!lower_is_better) {
      // Flip sign so that the rest of the loop can assume lower-is-better.
      score = -score;
      ScaleImage(-1.0f, &diffmap);
    }
    JXL_ASSIGN_OR_RETURN(tile_distmap,
                         TileDistMap(diffmap, 8 * cparams.resampling, 0,
                                     enc_state->shared.ac_strategy));
    if (JXL_DEBUG_ADAPTIVE_QUANTIZATION && WantDebugOutput(cparams)) {
      JXL_RETURN_IF_ERROR(DumpImage(cparams, ("dec" + ToString(i)).c_str(),
                                    *dec_linear.color()));
      JXL_RETURN_IF_ERROR(DumpHeatmaps(cparams, aux_out, butteraugli_target,
                                       quant_field, tile_distmap, diffmap));
    }
    if (aux_out != nullptr) ++aux_out->num_butteraugli_iters;
    if (JXL_DEBUG_ADAPTIVE_QUANTIZATION) {
      float minval;
      float maxval;
      ImageMinMax(quant_field, &minval, &maxval);
      printf("\nButteraugli iter: %d/%d\n", i, kMaxButteraugliIters);
      printf("Butteraugli distance: %f (target = %f)\n", score,
             original_butteraugli);
      printf("quant range: %f ... %f DC quant: %f\n", minval, maxval,
             initial_quant_dc);
      if (FLAGS_dump_quant_state) {
        quantizer.DumpQuantizationMap(raw_quant_field);
      }
    }

    // The final round only evaluates; no further adjustment.
    if (i == iters) break;

    // Per-iteration exponents for softening quant decreases; entries beyond
    // the first two are zero, selecting the simpler update path below.
    double kPow[8] = {
        0.2, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
    };
    double kPowMod[8] = {
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
    };
    if (i == kOriginalComparisonRound) {
      // Don't allow optimization to make the quant field a lot worse than
      // what the initial guess was. This allows the AC field to have enough
      // precision to reduce the oscillations due to the dc reconstruction.
      double kInitMul = 0.6;
      const double kOneMinusInitMul = 1.0 - kInitMul;
      for (size_t y = 0; y < quant_field.ysize(); ++y) {
        float* const JXL_RESTRICT row_q = quant_field.Row(y);
        const float* const JXL_RESTRICT row_init = initial_quant_field.Row(y);
        for (size_t x = 0; x < quant_field.xsize(); ++x) {
          double clamp = kOneMinusInitMul * row_q[x] + kInitMul * row_init[x];
          if (row_q[x] < clamp) {
            row_q[x] = clamp;
            if (row_q[x] > qf_higher) row_q[x] = qf_higher;
            if (row_q[x] < qf_lower) row_q[x] = qf_lower;
          }
        }
      }
    }

    double cur_pow = 0.0;
    if (i < 7) {
      cur_pow = kPow[i] + (original_butteraugli - 1.0) * kPowMod[i];
      if (cur_pow < 0) {
        cur_pow = 0;
      }
    }
    if (cur_pow == 0.0) {
      // Only increase quant where the local distance exceeds the target.
      for (size_t y = 0; y < quant_field.ysize(); ++y) {
        const float* const JXL_RESTRICT row_dist = tile_distmap.Row(y);
        float* const JXL_RESTRICT row_q = quant_field.Row(y);
        for (size_t x = 0; x < quant_field.xsize(); ++x) {
          const float diff = row_dist[x] / original_butteraugli;
          if (diff > 1.0f) {
            float old = row_q[x];
            row_q[x] *= diff;
            // If the scaled value quantizes to the same raw level, bump it by
            // one quantizer step so the change is actually effective.
            int qf_old =
                static_cast<int>(std::lround(old * quantizer.InvGlobalScale()));
            int qf_new = static_cast<int>(
                std::lround(row_q[x] * quantizer.InvGlobalScale()));
            if (qf_old == qf_new) {
              row_q[x] = old + quantizer.Scale();
            }
          }
          if (row_q[x] > qf_higher) row_q[x] = qf_higher;
          if (row_q[x] < qf_lower) row_q[x] = qf_lower;
        }
      }
    } else {
      // Same as above, but additionally soften quant (by diff^cur_pow) where
      // the local distance is below the target.
      for (size_t y = 0; y < quant_field.ysize(); ++y) {
        const float* const JXL_RESTRICT row_dist = tile_distmap.Row(y);
        float* const JXL_RESTRICT row_q = quant_field.Row(y);
        for (size_t x = 0; x < quant_field.xsize(); ++x) {
          const float diff = row_dist[x] / original_butteraugli;
          if (diff <= 1.0f) {
            row_q[x] *= std::pow(diff, cur_pow);
          } else {
            float old = row_q[x];
            row_q[x] *= diff;
            int qf_old =
                static_cast<int>(std::lround(old * quantizer.InvGlobalScale()));
            int qf_new = static_cast<int>(
                std::lround(row_q[x] * quantizer.InvGlobalScale()));
            if (qf_old == qf_new) {
              row_q[x] = old + quantizer.Scale();
            }
          }
          if (row_q[x] > qf_higher) row_q[x] = qf_higher;
          if (row_q[x] < qf_lower) row_q[x] = qf_lower;
        }
      }
    }
  }
  quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field);
  return true;
}
1052 | | |
// Variant of FindBestQuantization driven by per-channel maximum absolute
// error bounds (cparams.max_error) instead of a butteraugli target: each
// iteration roundtrips the image and rescales the quant field of every
// variable-size block by how far its worst-case error is from the bound.
Status FindBestQuantizationMaxError(const FrameHeader& frame_header,
                                    const Image3F& opsin, ImageF& quant_field,
                                    PassesEncoderState* enc_state,
                                    const JxlCmsInterface& cms,
                                    ThreadPool* pool, AuxOut* aux_out) {
  // TODO(szabadka): Make this work for non-opsin color spaces.
  const CompressParams& cparams = enc_state->cparams;
  Quantizer& quantizer = enc_state->shared.quantizer;
  ImageI& raw_quant_field = enc_state->shared.raw_quant_field;

  // TODO(veluca): better choice of this value.
  const float initial_quant_dc =
      16 * std::sqrt(0.1f / cparams.butteraugli_distance);
  AdjustQuantField(enc_state->shared.ac_strategy, Rect(quant_field),
                   cparams.original_butteraugli_distance, &quant_field);

  // Per-channel reciprocals so errors can be compared on a common scale.
  const float inv_max_err[3] = {1.0f / enc_state->cparams.max_error[0],
                                1.0f / enc_state->cparams.max_error[1],
                                1.0f / enc_state->cparams.max_error[2]};

  for (int i = 0; i < kMaxButteraugliIters + 1; ++i) {
    quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field);
    if (JXL_DEBUG_ADAPTIVE_QUANTIZATION && aux_out) {
      JXL_RETURN_IF_ERROR(
          DumpXybImage(cparams, ("ops" + ToString(i)).c_str(), opsin));
    }
    JXL_ASSIGN_OR_RETURN(
        ImageBundle decoded,
        RoundtripImage(frame_header, opsin, enc_state, cms, pool));
    if (JXL_DEBUG_ADAPTIVE_QUANTIZATION && aux_out) {
      JXL_RETURN_IF_ERROR(DumpXybImage(cparams, ("dec" + ToString(i)).c_str(),
                                       *decoded.color()));
    }
    for (size_t by = 0; by < enc_state->shared.frame_dim.ysize_blocks; by++) {
      AcStrategyRow ac_strategy_row =
          enc_state->shared.ac_strategy.ConstRow(by);
      for (size_t bx = 0; bx < enc_state->shared.frame_dim.xsize_blocks; bx++) {
        AcStrategy acs = ac_strategy_row[bx];
        // Each variable-size block is handled once, at its top-left corner.
        if (!acs.IsFirstBlock()) continue;
        // Worst normalized error over all pixels and channels of the block,
        // skipping pixels that fall outside the decoded image.
        float max_error = 0;
        for (size_t c = 0; c < 3; c++) {
          for (size_t y = by * kBlockDim;
               y < (by + acs.covered_blocks_y()) * kBlockDim; y++) {
            if (y >= decoded.ysize()) continue;
            const float* JXL_RESTRICT in_row = opsin.ConstPlaneRow(c, y);
            const float* JXL_RESTRICT dec_row =
                decoded.color()->ConstPlaneRow(c, y);
            for (size_t x = bx * kBlockDim;
                 x < (bx + acs.covered_blocks_x()) * kBlockDim; x++) {
              if (x >= decoded.xsize()) continue;
              max_error = std::max(
                  std::abs(in_row[x] - dec_row[x]) * inv_max_err[c], max_error);
            }
          }
        }
        // Target an error between max_error/2 and max_error.
        // If the error in the varblock is above the target, increase the qf to
        // compensate. If the error is below the target, decrease the qf.
        // However, to avoid an excessive increase of the qf, only do so if the
        // error is less than half the maximum allowed error.
        const float qf_mul = (max_error < 0.5f) ? max_error * 2.0f
                             : (max_error > 1.0f) ? max_error
                                                  : 1.0f;
        for (size_t qy = by; qy < by + acs.covered_blocks_y(); qy++) {
          float* JXL_RESTRICT quant_field_row = quant_field.Row(qy);
          for (size_t qx = bx; qx < bx + acs.covered_blocks_x(); qx++) {
            quant_field_row[qx] *= qf_mul;
          }
        }
      }
    }
  }
  quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field);
  return true;
}
1128 | | |
1129 | | } // namespace |
1130 | | |
// Makes the quant field consistent with the AC strategy: every pixel of a
// variable-size block receives a single quant value derived from the 8x8
// values it covers (the maximum, blended toward the mean at high distances).
void AdjustQuantField(const AcStrategyImage& ac_strategy, const Rect& rect,
                      float butteraugli_target, ImageF* quant_field) {
  // Replace the whole quant_field in non-8x8 blocks with the maximum of each
  // 8x8 block.
  size_t stride = quant_field->PixelsPerRow();

  // At low distances it is great to use max, but mean works better
  // at high distances. We interpolate between them for a distance
  // range. mean_max_mixer == 1 selects pure max; 0 selects pure mean.
  float mean_max_mixer = 1.0f;
  {
    static const float kLimit = 1.54138f;
    static const float kMul = 0.56391f;
    static const float kMin = 0.0f;
    if (butteraugli_target > kLimit) {
      mean_max_mixer -= (butteraugli_target - kLimit) * kMul;
      if (mean_max_mixer < kMin) {
        mean_max_mixer = kMin;
      }
    }
  }
  for (size_t y = 0; y < rect.ysize(); ++y) {
    AcStrategyRow ac_strategy_row = ac_strategy.ConstRow(rect, y);
    float* JXL_RESTRICT quant_row = rect.Row(quant_field, y);
    for (size_t x = 0; x < rect.xsize(); ++x) {
      AcStrategy acs = ac_strategy_row[x];
      // Only process each variable-size block once, from its first 8x8 block.
      if (!acs.IsFirstBlock()) continue;
      JXL_ASSERT(x + acs.covered_blocks_x() <= quant_field->xsize());
      JXL_ASSERT(y + acs.covered_blocks_y() <= quant_field->ysize());
      // Gather max and mean over all 8x8 blocks the strategy block covers.
      float max = quant_row[x];
      float mean = 0.0;
      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
        for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
          mean += quant_row[x + ix + iy * stride];
          max = std::max(quant_row[x + ix + iy * stride], max);
        }
      }
      mean /= acs.covered_blocks_y() * acs.covered_blocks_x();
      // Blend max toward mean only for blocks covering 4+ 8x8 blocks.
      if (acs.covered_blocks_y() * acs.covered_blocks_x() >= 4) {
        max *= mean_max_mixer;
        max += (1.0f - mean_max_mixer) * mean;
      }
      // Write the single resulting value to every covered 8x8 block.
      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
        for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
          quant_row[x + ix + iy * stride] = max;
        }
      }
    }
  }
}
1181 | | |
1182 | 52 | float InitialQuantDC(float butteraugli_target) { |
1183 | 52 | const float kDcMul = 0.3; // Butteraugli target where non-linearity kicks in. |
1184 | 52 | const float butteraugli_target_dc = std::max<float>( |
1185 | 52 | 0.5f * butteraugli_target, |
1186 | 52 | std::min<float>(butteraugli_target, |
1187 | 52 | kDcMul * std::pow((1.0f / kDcMul) * butteraugli_target, |
1188 | 52 | kDcQuantPow))); |
1189 | | // We want the maximum DC value to be at most 2**15 * kInvDCQuant / quant_dc. |
1190 | | // The maximum DC value might not be in the kXybRange because of inverse |
1191 | | // gaborish, so we add some slack to the maximum theoretical quant obtained |
1192 | | // this way (64). |
1193 | 52 | return std::min(kDcQuant / butteraugli_target_dc, 50.f); |
1194 | 52 | } |
1195 | | |
1196 | | StatusOr<ImageF> InitialQuantField(const float butteraugli_target, |
1197 | | const Image3F& opsin, const Rect& rect, |
1198 | | ThreadPool* pool, float rescale, |
1199 | 52 | ImageF* mask, ImageF* mask1x1) { |
1200 | 52 | const float quant_ac = kAcQuant / butteraugli_target; |
1201 | 52 | return HWY_DYNAMIC_DISPATCH(AdaptiveQuantizationMap)( |
1202 | 52 | butteraugli_target, opsin, rect, quant_ac * rescale, pool, mask, mask1x1); |
1203 | 52 | } |
1204 | | |
1205 | | Status FindBestQuantizer(const FrameHeader& frame_header, const Image3F* linear, |
1206 | | const Image3F& opsin, ImageF& quant_field, |
1207 | | PassesEncoderState* enc_state, |
1208 | | const JxlCmsInterface& cms, ThreadPool* pool, |
1209 | 52 | AuxOut* aux_out, double rescale) { |
1210 | 52 | const CompressParams& cparams = enc_state->cparams; |
1211 | 52 | if (cparams.max_error_mode) { |
1212 | 0 | JXL_RETURN_IF_ERROR(FindBestQuantizationMaxError( |
1213 | 0 | frame_header, opsin, quant_field, enc_state, cms, pool, aux_out)); |
1214 | 52 | } else if (linear && cparams.speed_tier <= SpeedTier::kKitten) { |
1215 | | // Normal encoding to a butteraugli score. |
1216 | 0 | JXL_RETURN_IF_ERROR(FindBestQuantization(frame_header, *linear, opsin, |
1217 | 0 | quant_field, enc_state, cms, pool, |
1218 | 0 | aux_out)); |
1219 | 0 | } |
1220 | 52 | return true; |
1221 | 52 | } |
1222 | | |
1223 | | } // namespace jxl |
1224 | | #endif // HWY_ONCE |