Coverage Report

Created: 2025-06-16 07:00

/src/libjxl/lib/jxl/enc_adaptive_quantization.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/enc_adaptive_quantization.h"
7
8
#include <jxl/cms_interface.h>
9
#include <jxl/memory_manager.h>
10
11
#include <algorithm>
12
#include <cmath>
13
#include <cstddef>
14
#include <cstdint>
15
#include <cstdio>
16
#include <cstdlib>
17
#include <memory>
18
#include <string>
19
#include <utility>
20
#include <vector>
21
22
#include "lib/jxl/cms/opsin_params.h"
23
#include "lib/jxl/common.h"
24
#include "lib/jxl/frame_header.h"
25
#include "lib/jxl/image_metadata.h"
26
#include "lib/jxl/memory_manager_internal.h"
27
#include "lib/jxl/quantizer.h"
28
#include "lib/jxl/render_pipeline/render_pipeline.h"
29
30
#undef HWY_TARGET_INCLUDE
31
#define HWY_TARGET_INCLUDE "lib/jxl/enc_adaptive_quantization.cc"
32
#include <hwy/foreach_target.h>
33
#include <hwy/highway.h>
34
35
#include "lib/jxl/ac_strategy.h"
36
#include "lib/jxl/base/common.h"
37
#include "lib/jxl/base/compiler_specific.h"
38
#include "lib/jxl/base/data_parallel.h"
39
#include "lib/jxl/base/fast_math-inl.h"
40
#include "lib/jxl/base/rect.h"
41
#include "lib/jxl/base/status.h"
42
#include "lib/jxl/butteraugli/butteraugli.h"
43
#include "lib/jxl/convolve.h"
44
#include "lib/jxl/dec_cache.h"
45
#include "lib/jxl/dec_group.h"
46
#include "lib/jxl/enc_aux_out.h"
47
#include "lib/jxl/enc_butteraugli_comparator.h"
48
#include "lib/jxl/enc_cache.h"
49
#include "lib/jxl/enc_debug_image.h"
50
#include "lib/jxl/enc_group.h"
51
#include "lib/jxl/enc_modular.h"
52
#include "lib/jxl/enc_params.h"
53
#include "lib/jxl/enc_transforms-inl.h"
54
#include "lib/jxl/epf.h"
55
#include "lib/jxl/frame_dimensions.h"
56
#include "lib/jxl/image.h"
57
#include "lib/jxl/image_bundle.h"
58
#include "lib/jxl/image_ops.h"
59
#include "lib/jxl/quant_weights.h"
60
61
// Set JXL_DEBUG_ADAPTIVE_QUANTIZATION to 1 to enable debugging.
62
#ifndef JXL_DEBUG_ADAPTIVE_QUANTIZATION
63
0
#define JXL_DEBUG_ADAPTIVE_QUANTIZATION 0
64
#endif
65
66
HWY_BEFORE_NAMESPACE();
67
namespace jxl {
68
namespace HWY_NAMESPACE {
69
namespace {
70
71
// These templates are not found via ADL.
72
using hwy::HWY_NAMESPACE::AbsDiff;
73
using hwy::HWY_NAMESPACE::Add;
74
using hwy::HWY_NAMESPACE::And;
75
using hwy::HWY_NAMESPACE::Gt;
76
using hwy::HWY_NAMESPACE::IfThenElseZero;
77
using hwy::HWY_NAMESPACE::Max;
78
using hwy::HWY_NAMESPACE::Min;
79
using hwy::HWY_NAMESPACE::Rebind;
80
using hwy::HWY_NAMESPACE::Sqrt;
81
using hwy::HWY_NAMESPACE::ZeroIfNegative;
82
83
// The following functions modulate an exponent (out_val) and return the updated
84
// value. Their descriptor is limited to 8 lanes for 8x8 blocks.
85
86
// Hack for mask estimation. Eventually replace this code with butteraugli's
87
// masking.
88
398k
float ComputeMaskForAcStrategyUse(const float out_val) {
89
398k
  const float kMul = 1.0f;
90
398k
  const float kOffset = 0.001f;
91
398k
  return kMul / (out_val + kOffset);
92
398k
}
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::ComputeMaskForAcStrategyUse(float)
Line
Count
Source
88
398k
float ComputeMaskForAcStrategyUse(const float out_val) {
89
398k
  const float kMul = 1.0f;
90
398k
  const float kOffset = 0.001f;
91
398k
  return kMul / (out_val + kOffset);
92
398k
}
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::ComputeMaskForAcStrategyUse(float)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::ComputeMaskForAcStrategyUse(float)
93
94
template <class D, class V>
95
398k
V ComputeMask(const D d, const V out_val) {
96
398k
  const auto kBase = Set(d, -0.7647f);
97
398k
  const auto kMul4 = Set(d, 9.4708735624378946f);
98
398k
  const auto kMul2 = Set(d, 17.35036561631863f);
99
398k
  const auto kOffset2 = Set(d, 302.59587815579727f);
100
398k
  const auto kMul3 = Set(d, 6.7943250517376494f);
101
398k
  const auto kOffset3 = Set(d, 3.7179635626140772f);
102
398k
  const auto kOffset4 = Mul(Set(d, 0.25f), kOffset3);
103
398k
  const auto kMul0 = Set(d, 0.80061762862741759f);
104
398k
  const auto k1 = Set(d, 1.0f);
105
106
  // Avoid division by zero.
107
398k
  const auto v1 = Max(Mul(out_val, kMul0), Set(d, 1e-3f));
108
398k
  const auto v2 = Div(k1, Add(v1, kOffset2));
109
398k
  const auto v3 = Div(k1, MulAdd(v1, v1, kOffset3));
110
398k
  const auto v4 = Div(k1, MulAdd(v1, v1, kOffset4));
111
  // TODO(jyrki):
112
  // A log or two here could make sense. In butteraugli we have effectively
113
  // log(log(x + C)) for this kind of use, as a single log is used in
114
  // saturating visual masking and here the modulation values are exponential,
115
  // another log would counter that.
116
398k
  return Add(kBase, MulAdd(kMul4, v4, MulAdd(kMul2, v2, Mul(kMul3, v3))));
117
398k
}
enc_adaptive_quantization.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::ComputeMask<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>)
Line
Count
Source
95
398k
V ComputeMask(const D d, const V out_val) {
96
398k
  const auto kBase = Set(d, -0.7647f);
97
398k
  const auto kMul4 = Set(d, 9.4708735624378946f);
98
398k
  const auto kMul2 = Set(d, 17.35036561631863f);
99
398k
  const auto kOffset2 = Set(d, 302.59587815579727f);
100
398k
  const auto kMul3 = Set(d, 6.7943250517376494f);
101
398k
  const auto kOffset3 = Set(d, 3.7179635626140772f);
102
398k
  const auto kOffset4 = Mul(Set(d, 0.25f), kOffset3);
103
398k
  const auto kMul0 = Set(d, 0.80061762862741759f);
104
398k
  const auto k1 = Set(d, 1.0f);
105
106
  // Avoid division by zero.
107
398k
  const auto v1 = Max(Mul(out_val, kMul0), Set(d, 1e-3f));
108
398k
  const auto v2 = Div(k1, Add(v1, kOffset2));
109
398k
  const auto v3 = Div(k1, MulAdd(v1, v1, kOffset3));
110
398k
  const auto v4 = Div(k1, MulAdd(v1, v1, kOffset4));
111
  // TODO(jyrki):
112
  // A log or two here could make sense. In butteraugli we have effectively
113
  // log(log(x + C)) for this kind of use, as a single log is used in
114
  // saturating visual masking and here the modulation values are exponential,
115
  // another log would counter that.
116
398k
  return Add(kBase, MulAdd(kMul4, v4, MulAdd(kMul2, v2, Mul(kMul3, v3))));
117
398k
}
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::ComputeMask<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::(anonymous namespace)::ComputeMask<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>)
118
119
// mul and mul2 represent a scaling difference between jxl and butteraugli.
120
const float kSGmul = 226.77216153508914f;
121
const float kSGmul2 = 1.0f / 73.377132366608819f;
122
// Includes correction factor for std::log -> log2.
123
const float kSGRetMul = kSGmul2 * 18.6580932135f * kInvLog2e;
124
const float kSGVOffset = 7.7825991679894591f;
125
126
template <bool invert, typename D, typename V>
127
38.6M
V RatioOfDerivativesOfCubicRootToSimpleGamma(const D d, V v) {
128
  // The opsin space in jxl is the cubic root of photons, i.e., v * v * v
129
  // is related to the number of photons.
130
  //
131
  // SimpleGamma(v * v * v) is the psychovisual space in butteraugli.
132
  // This ratio allows quantization to move from jxl's opsin space to
133
  // butteraugli's log-gamma space.
134
38.6M
  float kEpsilon = 1e-2;
135
38.6M
  v = ZeroIfNegative(v);
136
38.6M
  const auto kNumMul = Set(d, kSGRetMul * 3 * kSGmul);
137
38.6M
  const auto kVOffset = Set(d, kSGVOffset * kInvLog2e + kEpsilon);
138
38.6M
  const auto kDenMul = Set(d, kInvLog2e * kSGmul);
139
140
38.6M
  const auto v2 = Mul(v, v);
141
142
38.6M
  const auto num = MulAdd(kNumMul, v2, Set(d, kEpsilon));
143
38.6M
  const auto den = MulAdd(Mul(kDenMul, v), v2, kVOffset);
144
38.6M
  return invert ? Div(num, den) : Div(den, num);
145
38.6M
}
enc_adaptive_quantization.cc:hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false, hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul>)
Line
Count
Source
127
28.7M
V RatioOfDerivativesOfCubicRootToSimpleGamma(const D d, V v) {
128
  // The opsin space in jxl is the cubic root of photons, i.e., v * v * v
129
  // is related to the number of photons.
130
  //
131
  // SimpleGamma(v * v * v) is the psychovisual space in butteraugli.
132
  // This ratio allows quantization to move from jxl's opsin space to
133
  // butteraugli's log-gamma space.
134
28.7M
  float kEpsilon = 1e-2;
135
28.7M
  v = ZeroIfNegative(v);
136
28.7M
  const auto kNumMul = Set(d, kSGRetMul * 3 * kSGmul);
137
28.7M
  const auto kVOffset = Set(d, kSGVOffset * kInvLog2e + kEpsilon);
138
28.7M
  const auto kDenMul = Set(d, kInvLog2e * kSGmul);
139
140
28.7M
  const auto v2 = Mul(v, v);
141
142
28.7M
  const auto num = MulAdd(kNumMul, v2, Set(d, kEpsilon));
143
28.7M
  const auto den = MulAdd(Mul(kDenMul, v), v2, kVOffset);
144
28.7M
  return invert ? Div(num, den) : Div(den, num);
145
28.7M
}
enc_adaptive_quantization.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false, hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>)
Line
Count
Source
127
3.54M
V RatioOfDerivativesOfCubicRootToSimpleGamma(const D d, V v) {
128
  // The opsin space in jxl is the cubic root of photons, i.e., v * v * v
129
  // is related to the number of photons.
130
  //
131
  // SimpleGamma(v * v * v) is the psychovisual space in butteraugli.
132
  // This ratio allows quantization to move from jxl's opsin space to
133
  // butteraugli's log-gamma space.
134
3.54M
  float kEpsilon = 1e-2;
135
3.54M
  v = ZeroIfNegative(v);
136
3.54M
  const auto kNumMul = Set(d, kSGRetMul * 3 * kSGmul);
137
3.54M
  const auto kVOffset = Set(d, kSGVOffset * kInvLog2e + kEpsilon);
138
3.54M
  const auto kDenMul = Set(d, kInvLog2e * kSGmul);
139
140
3.54M
  const auto v2 = Mul(v, v);
141
142
3.54M
  const auto num = MulAdd(kNumMul, v2, Set(d, kEpsilon));
143
3.54M
  const auto den = MulAdd(Mul(kDenMul, v), v2, kVOffset);
144
3.54M
  return invert ? Div(num, den) : Div(den, num);
145
3.54M
}
enc_adaptive_quantization.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<true, hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>)
Line
Count
Source
127
6.36M
V RatioOfDerivativesOfCubicRootToSimpleGamma(const D d, V v) {
128
  // The opsin space in jxl is the cubic root of photons, i.e., v * v * v
129
  // is related to the number of photons.
130
  //
131
  // SimpleGamma(v * v * v) is the psychovisual space in butteraugli.
132
  // This ratio allows quantization to move from jxl's opsin space to
133
  // butteraugli's log-gamma space.
134
6.36M
  float kEpsilon = 1e-2;
135
6.36M
  v = ZeroIfNegative(v);
136
6.36M
  const auto kNumMul = Set(d, kSGRetMul * 3 * kSGmul);
137
6.36M
  const auto kVOffset = Set(d, kSGVOffset * kInvLog2e + kEpsilon);
138
6.36M
  const auto kDenMul = Set(d, kInvLog2e * kSGmul);
139
140
6.36M
  const auto v2 = Mul(v, v);
141
142
6.36M
  const auto num = MulAdd(kNumMul, v2, Set(d, kEpsilon));
143
6.36M
  const auto den = MulAdd(Mul(kDenMul, v), v2, kVOffset);
144
6.36M
  return invert ? Div(num, den) : Div(den, num);
145
6.36M
}
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false, hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false, hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<true, hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 1ul> jxl::N_SSE2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false, hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul> >(hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false, hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<true, hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>)
146
147
template <bool invert = false>
148
28.7M
float RatioOfDerivativesOfCubicRootToSimpleGamma(float v) {
149
28.7M
  using DScalar = HWY_CAPPED(float, 1);
150
28.7M
  auto vscalar = Load(DScalar(), &v);
151
28.7M
  return GetLane(
152
28.7M
      RatioOfDerivativesOfCubicRootToSimpleGamma<invert>(DScalar(), vscalar));
153
28.7M
}
enc_adaptive_quantization.cc:float jxl::N_AVX2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false>(float)
Line
Count
Source
148
28.7M
float RatioOfDerivativesOfCubicRootToSimpleGamma(float v) {
149
28.7M
  using DScalar = HWY_CAPPED(float, 1);
150
28.7M
  auto vscalar = Load(DScalar(), &v);
151
28.7M
  return GetLane(
152
28.7M
      RatioOfDerivativesOfCubicRootToSimpleGamma<invert>(DScalar(), vscalar));
153
28.7M
}
Unexecuted instantiation: enc_adaptive_quantization.cc:float jxl::N_SSE4::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false>(float)
Unexecuted instantiation: enc_adaptive_quantization.cc:float jxl::N_SSE2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false>(float)
154
155
// TODO(veluca): this function computes an approximation of the derivative of
156
// SimpleGamma with (f(x+eps)-f(x))/eps. Consider two-sided approximation or
157
// exact derivatives. For reference, SimpleGamma was:
158
/*
159
template <typename D, typename V>
160
V SimpleGamma(const D d, V v) {
161
  // A simple HDR compatible gamma function.
162
  const auto mul = Set(d, kSGmul);
163
  const auto kRetMul = Set(d, kSGRetMul);
164
  const auto kRetAdd = Set(d, kSGmul2 * -20.2789020414f);
165
  const auto kVOffset = Set(d, kSGVOffset);
166
167
  v *= mul;
168
169
  // This should happen rarely, but may lead to a NaN, which is rather
170
  // undesirable. Since negative photons don't exist we solve the NaNs by
171
  // clamping here.
172
  // TODO(veluca): with FastLog2f, this no longer leads to NaNs.
173
  v = ZeroIfNegative(v);
174
  return kRetMul * FastLog2f(d, v + kVOffset) + kRetAdd;
175
}
176
*/
177
178
template <class D, class V>
179
V GammaModulation(const D d, const size_t x, const size_t y,
180
                  const ImageF& xyb_x, const ImageF& xyb_y, const Rect& rect,
181
398k
                  const V out_val) {
182
398k
  const float kBias = 0.16f;
183
398k
  JXL_DASSERT(kBias > jxl::cms::kOpsinAbsorbanceBias[0]);
184
398k
  JXL_DASSERT(kBias > jxl::cms::kOpsinAbsorbanceBias[1]);
185
398k
  JXL_DASSERT(kBias > jxl::cms::kOpsinAbsorbanceBias[2]);
186
398k
  auto overall_ratio = Zero(d);
187
398k
  auto bias = Set(d, kBias);
188
3.58M
  for (size_t dy = 0; dy < 8; ++dy) {
189
3.18M
    const float* const JXL_RESTRICT row_in_x = rect.ConstRow(xyb_x, y + dy);
190
3.18M
    const float* const JXL_RESTRICT row_in_y = rect.ConstRow(xyb_y, y + dy);
191
6.36M
    for (size_t dx = 0; dx < 8; dx += Lanes(d)) {
192
3.18M
      const auto iny = Add(Load(d, row_in_y + x + dx), bias);
193
3.18M
      const auto inx = Load(d, row_in_x + x + dx);
194
195
3.18M
      const auto r = Sub(iny, inx);
196
3.18M
      const auto ratio_r =
197
3.18M
          RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/true>(d, r);
198
3.18M
      overall_ratio = Add(overall_ratio, ratio_r);
199
200
3.18M
      const auto g = Add(iny, inx);
201
3.18M
      const auto ratio_g =
202
3.18M
          RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/true>(d, g);
203
3.18M
      overall_ratio = Add(overall_ratio, ratio_g);
204
3.18M
    }
205
3.18M
  }
206
398k
  overall_ratio = Mul(SumOfLanes(d, overall_ratio), Set(d, 0.5f / 64));
207
  // ideally -1.0, but likely optimal correction adds some entropy, so slightly
208
  // less than that.
209
398k
  const auto kGamma = Set(d, 0.1005613337192697f);
210
398k
  return MulAdd(kGamma, FastLog2f(d, overall_ratio), out_val);
211
398k
}
enc_adaptive_quantization.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::GammaModulation<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_AVX2::Vec256<float>)
Line
Count
Source
181
398k
                  const V out_val) {
182
398k
  const float kBias = 0.16f;
183
398k
  JXL_DASSERT(kBias > jxl::cms::kOpsinAbsorbanceBias[0]);
184
398k
  JXL_DASSERT(kBias > jxl::cms::kOpsinAbsorbanceBias[1]);
185
398k
  JXL_DASSERT(kBias > jxl::cms::kOpsinAbsorbanceBias[2]);
186
398k
  auto overall_ratio = Zero(d);
187
398k
  auto bias = Set(d, kBias);
188
3.58M
  for (size_t dy = 0; dy < 8; ++dy) {
189
3.18M
    const float* const JXL_RESTRICT row_in_x = rect.ConstRow(xyb_x, y + dy);
190
3.18M
    const float* const JXL_RESTRICT row_in_y = rect.ConstRow(xyb_y, y + dy);
191
6.36M
    for (size_t dx = 0; dx < 8; dx += Lanes(d)) {
192
3.18M
      const auto iny = Add(Load(d, row_in_y + x + dx), bias);
193
3.18M
      const auto inx = Load(d, row_in_x + x + dx);
194
195
3.18M
      const auto r = Sub(iny, inx);
196
3.18M
      const auto ratio_r =
197
3.18M
          RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/true>(d, r);
198
3.18M
      overall_ratio = Add(overall_ratio, ratio_r);
199
200
3.18M
      const auto g = Add(iny, inx);
201
3.18M
      const auto ratio_g =
202
3.18M
          RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/true>(d, g);
203
3.18M
      overall_ratio = Add(overall_ratio, ratio_g);
204
3.18M
    }
205
3.18M
  }
206
398k
  overall_ratio = Mul(SumOfLanes(d, overall_ratio), Set(d, 0.5f / 64));
207
  // ideally -1.0, but likely optimal correction adds some entropy, so slightly
208
  // less than that.
209
398k
  const auto kGamma = Set(d, 0.1005613337192697f);
210
398k
  return MulAdd(kGamma, FastLog2f(d, overall_ratio), out_val);
211
398k
}
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::GammaModulation<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::(anonymous namespace)::GammaModulation<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_SSE2::Vec128<float, 4ul>)
212
213
// Change precision in 8x8 blocks that have significant amounts of blue
214
// content (but are not close to solid blue).
215
// This is based on the idea that M and L cone activations saturate the
216
// S (blue) receptors, and the S reception becomes more important when
217
// both M and L levels are low. In that case M and L receptors may be
218
// observing S-spectra instead and viewing them with higher spatial
219
// accuracy, justifying spending more bits here.
220
template <class D, class V>
221
V BlueModulation(const D d, const size_t x, const size_t y,
222
                 const ImageF& planex, const ImageF& planey,
223
398k
                 const ImageF& planeb, const Rect& rect, const V out_val) {
224
398k
  auto sum = Zero(d);
225
398k
  static const float kLimit = 0.010474084867598155;
226
398k
  static const float kOffset = 0.0031994768654636393;
227
3.58M
  for (size_t dy = 0; dy < 8; ++dy) {
228
3.18M
    const float* JXL_RESTRICT row_in_x = rect.ConstRow(planex, y + dy) + x;
229
3.18M
    const float* JXL_RESTRICT row_in_y = rect.ConstRow(planey, y + dy) + x;
230
3.18M
    const float* JXL_RESTRICT row_in_b = rect.ConstRow(planeb, y + dy) + x;
231
6.36M
    for (size_t dx = 0; dx < 8; dx += Lanes(d)) {
232
3.18M
      const auto p_x = Load(d, row_in_x + dx);
233
3.18M
      const auto p_b = Load(d, row_in_b + dx);
234
3.18M
      const auto p_y_raw = Add(Load(d, row_in_y + dx), Set(d, kOffset));
235
3.18M
      const auto p_y_effective = Add(p_y_raw, Abs(p_x));
236
3.18M
      sum = Add(sum,
237
3.18M
                IfThenElseZero(Gt(p_b, p_y_effective),
238
3.18M
                               Min(Sub(p_b, p_y_effective), Set(d, kLimit))));
239
3.18M
    }
240
3.18M
  }
241
398k
  static const float kMul = 0.90590804735610064;
242
398k
  sum = SumOfLanes(d, sum);
243
398k
  float scalar_sum = GetLane(sum);
244
  // If it is all blue, don't boost the quantization.
245
  // All blue likely means low frequency blue. Let's not make the most
246
  // perfect sky ever.
247
398k
  if (scalar_sum >= 32 * kLimit) {
248
82.2k
    scalar_sum = 64 * kLimit - scalar_sum;
249
82.2k
  }
250
398k
  static const float kMaxLimit = 15.463398341612438;
251
398k
  if (scalar_sum >= kMaxLimit * kLimit) {
252
60.3k
    scalar_sum = kMaxLimit * kLimit;
253
60.3k
  }
254
398k
  scalar_sum *= kMul;
255
398k
  return Add(Set(d, scalar_sum), out_val);
256
398k
}
enc_adaptive_quantization.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::BlueModulation<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_AVX2::Vec256<float>)
Line
Count
Source
223
398k
                 const ImageF& planeb, const Rect& rect, const V out_val) {
224
398k
  auto sum = Zero(d);
225
398k
  static const float kLimit = 0.010474084867598155;
226
398k
  static const float kOffset = 0.0031994768654636393;
227
3.58M
  for (size_t dy = 0; dy < 8; ++dy) {
228
3.18M
    const float* JXL_RESTRICT row_in_x = rect.ConstRow(planex, y + dy) + x;
229
3.18M
    const float* JXL_RESTRICT row_in_y = rect.ConstRow(planey, y + dy) + x;
230
3.18M
    const float* JXL_RESTRICT row_in_b = rect.ConstRow(planeb, y + dy) + x;
231
6.36M
    for (size_t dx = 0; dx < 8; dx += Lanes(d)) {
232
3.18M
      const auto p_x = Load(d, row_in_x + dx);
233
3.18M
      const auto p_b = Load(d, row_in_b + dx);
234
3.18M
      const auto p_y_raw = Add(Load(d, row_in_y + dx), Set(d, kOffset));
235
3.18M
      const auto p_y_effective = Add(p_y_raw, Abs(p_x));
236
3.18M
      sum = Add(sum,
237
3.18M
                IfThenElseZero(Gt(p_b, p_y_effective),
238
3.18M
                               Min(Sub(p_b, p_y_effective), Set(d, kLimit))));
239
3.18M
    }
240
3.18M
  }
241
398k
  static const float kMul = 0.90590804735610064;
242
398k
  sum = SumOfLanes(d, sum);
243
398k
  float scalar_sum = GetLane(sum);
244
  // If it is all blue, don't boost the quantization.
245
  // All blue likely means low frequency blue. Let's not make the most
246
  // perfect sky ever.
247
398k
  if (scalar_sum >= 32 * kLimit) {
248
82.2k
    scalar_sum = 64 * kLimit - scalar_sum;
249
82.2k
  }
250
398k
  static const float kMaxLimit = 15.463398341612438;
251
398k
  if (scalar_sum >= kMaxLimit * kLimit) {
252
60.3k
    scalar_sum = kMaxLimit * kLimit;
253
60.3k
  }
254
398k
  scalar_sum *= kMul;
255
398k
  return Add(Set(d, scalar_sum), out_val);
256
398k
}
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::BlueModulation<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::(anonymous namespace)::BlueModulation<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_SSE2::Vec128<float, 4ul>)
257
258
// Change precision in 8x8 blocks that have high frequency content.
259
template <class D, class V>
260
V HfModulation(const D d, const size_t x, const size_t y, const ImageF& xyb_y,
261
398k
               const Rect& rect, const V out_val) {
262
  // Zero out the invalid differences for the rightmost value per row.
263
398k
  const Rebind<uint32_t, D> du;
264
398k
  HWY_ALIGN constexpr uint32_t kMaskRight[kBlockDim] = {~0u, ~0u, ~0u, ~0u,
265
398k
                                                        ~0u, ~0u, ~0u, 0};
266
267
  // Sums of deltas of y and x components between (approximate)
268
  // 4-connected pixels.
269
398k
  auto sum_y = Zero(d);
270
398k
  static const float valmin_y = 0.0206;
271
398k
  auto valminv_y = Set(d, valmin_y);
272
3.58M
  for (size_t dy = 0; dy < 8; ++dy) {
273
3.18M
    const float* JXL_RESTRICT row_in_y = rect.ConstRow(xyb_y, y + dy) + x;
274
3.18M
    const float* JXL_RESTRICT row_in_y_next =
275
3.18M
        dy == 7 ? row_in_y : rect.ConstRow(xyb_y, y + dy + 1) + x;
276
277
    // In SCALAR, there is no guarantee of having extra row padding.
278
    // Hence, we need to ensure we don't access pixels outside the row itself.
279
    // In SIMD modes, however, rows are padded, so it's safe to access one
280
    // garbage value after the row. The vector then gets masked with kMaskRight
281
    // to remove the influence of that value.
282
3.18M
#if HWY_TARGET != HWY_SCALAR
283
6.36M
    for (size_t dx = 0; dx < 8; dx += Lanes(d)) {
284
#else
285
    for (size_t dx = 0; dx < 7; dx += Lanes(d)) {
286
#endif
287
3.18M
      const auto mask = BitCast(d, Load(du, kMaskRight + dx));
288
3.18M
      {
289
3.18M
        const auto p_y = Load(d, row_in_y + dx);
290
3.18M
        const auto pr_y = LoadU(d, row_in_y + dx + 1);
291
3.18M
        sum_y = Add(sum_y, And(mask, Min(valminv_y, AbsDiff(p_y, pr_y))));
292
3.18M
        const auto pd_y = Load(d, row_in_y_next + dx);
293
3.18M
        sum_y = Add(sum_y, Min(valminv_y, AbsDiff(p_y, pd_y)));
294
3.18M
      }
295
3.18M
    }
296
#if HWY_TARGET == HWY_SCALAR
297
    const auto p_y = Load(d, row_in_y + 7);
298
    const auto pd_y = Load(d, row_in_y_next + 7);
299
    sum_y = Add(sum_y, Min(valminv_y, AbsDiff(p_y, pd_y)));
300
#endif
301
3.18M
  }
302
398k
  static const float kMul_y = -0.38;
303
398k
  sum_y = SumOfLanes(d, sum_y);
304
305
398k
  float scalar_sum_y = GetLane(sum_y);
306
398k
  scalar_sum_y *= kMul_y;
307
308
  // higher value -> more bpp
309
398k
  float kOffset = 0.42;
310
398k
  scalar_sum_y += kOffset;
311
312
398k
  return Add(Set(d, scalar_sum_y), out_val);
313
398k
}
enc_adaptive_quantization.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::HfModulation<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_AVX2::Vec256<float>)
Line
Count
Source
261
398k
               const Rect& rect, const V out_val) {
262
  // Zero out the invalid differences for the rightmost value per row.
263
398k
  const Rebind<uint32_t, D> du;
264
398k
  HWY_ALIGN constexpr uint32_t kMaskRight[kBlockDim] = {~0u, ~0u, ~0u, ~0u,
265
398k
                                                        ~0u, ~0u, ~0u, 0};
266
267
  // Sums of deltas of y and x components between (approximate)
268
  // 4-connected pixels.
269
398k
  auto sum_y = Zero(d);
270
398k
  static const float valmin_y = 0.0206;
271
398k
  auto valminv_y = Set(d, valmin_y);
272
3.58M
  for (size_t dy = 0; dy < 8; ++dy) {
273
3.18M
    const float* JXL_RESTRICT row_in_y = rect.ConstRow(xyb_y, y + dy) + x;
274
3.18M
    const float* JXL_RESTRICT row_in_y_next =
275
3.18M
        dy == 7 ? row_in_y : rect.ConstRow(xyb_y, y + dy + 1) + x;
276
277
    // In SCALAR, there is no guarantee of having extra row padding.
278
    // Hence, we need to ensure we don't access pixels outside the row itself.
279
    // In SIMD modes, however, rows are padded, so it's safe to access one
280
    // garbage value after the row. The vector then gets masked with kMaskRight
281
    // to remove the influence of that value.
282
3.18M
#if HWY_TARGET != HWY_SCALAR
283
6.36M
    for (size_t dx = 0; dx < 8; dx += Lanes(d)) {
284
#else
285
    for (size_t dx = 0; dx < 7; dx += Lanes(d)) {
286
#endif
287
3.18M
      const auto mask = BitCast(d, Load(du, kMaskRight + dx));
288
3.18M
      {
289
3.18M
        const auto p_y = Load(d, row_in_y + dx);
290
3.18M
        const auto pr_y = LoadU(d, row_in_y + dx + 1);
291
3.18M
        sum_y = Add(sum_y, And(mask, Min(valminv_y, AbsDiff(p_y, pr_y))));
292
3.18M
        const auto pd_y = Load(d, row_in_y_next + dx);
293
3.18M
        sum_y = Add(sum_y, Min(valminv_y, AbsDiff(p_y, pd_y)));
294
3.18M
      }
295
3.18M
    }
296
#if HWY_TARGET == HWY_SCALAR
297
    const auto p_y = Load(d, row_in_y + 7);
298
    const auto pd_y = Load(d, row_in_y_next + 7);
299
    sum_y = Add(sum_y, Min(valminv_y, AbsDiff(p_y, pd_y)));
300
#endif
301
3.18M
  }
302
398k
  static const float kMul_y = -0.38;
303
398k
  sum_y = SumOfLanes(d, sum_y);
304
305
398k
  float scalar_sum_y = GetLane(sum_y);
306
398k
  scalar_sum_y *= kMul_y;
307
308
  // higher value -> more bpp
309
398k
  float kOffset = 0.42;
310
398k
  scalar_sum_y += kOffset;
311
312
398k
  return Add(Set(d, scalar_sum_y), out_val);
313
398k
}
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::HfModulation<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::(anonymous namespace)::HfModulation<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_SSE2::Vec128<float, 4ul>)
314
315
void PerBlockModulations(const float butteraugli_target, const ImageF& xyb_x,
316
                         const ImageF& xyb_y, const ImageF& xyb_b,
317
                         const Rect& rect_in, const float scale,
318
7.22k
                         const Rect& rect_out, ImageF* out) {
319
7.22k
  float base_level = 0.48f * scale;
320
7.22k
  float kDampenRampStart = 2.0f;
321
7.22k
  float kDampenRampEnd = 14.0f;
322
7.22k
  float dampen = 1.0f;
323
7.22k
  if (butteraugli_target >= kDampenRampStart) {
324
0
    dampen = 1.0f - ((butteraugli_target - kDampenRampStart) /
325
0
                     (kDampenRampEnd - kDampenRampStart));
326
0
    if (dampen < 0) {
327
0
      dampen = 0;
328
0
    }
329
0
  }
330
7.22k
  const float mul = scale * dampen;
331
7.22k
  const float add = (1.0f - dampen) * base_level;
332
60.6k
  for (size_t iy = rect_out.y0(); iy < rect_out.y1(); iy++) {
333
53.4k
    const size_t y = iy * 8;
334
53.4k
    float* const JXL_RESTRICT row_out = out->Row(iy);
335
53.4k
    const HWY_CAPPED(float, kBlockDim) df;
336
451k
    for (size_t ix = rect_out.x0(); ix < rect_out.x1(); ix++) {
337
398k
      size_t x = ix * 8;
338
398k
      auto mask_val = ComputeMask(df, Set(df, row_out[ix]));
339
398k
      mask_val = GammaModulation(df, x, y, xyb_x, xyb_y, rect_in, mask_val);
340
398k
      auto out_val = HfModulation(df, x, y, xyb_y, rect_in, mask_val);
341
398k
      out_val = Min(out_val, BlueModulation(df, x, y, xyb_x, xyb_y, xyb_b,
342
398k
                                            rect_in, mask_val));
343
      // We want multiplicative quantization field, so everything
344
      // until this point has been modulating the exponent.
345
398k
      row_out[ix] = FastPow2f(GetLane(out_val) * 1.442695041f) * mul + add;
346
398k
    }
347
53.4k
  }
348
7.22k
}
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::PerBlockModulations(float, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, float, jxl::RectT<unsigned long> const&, jxl::Plane<float>*)
Line
Count
Source
318
7.22k
                         const Rect& rect_out, ImageF* out) {
319
7.22k
  float base_level = 0.48f * scale;
320
7.22k
  float kDampenRampStart = 2.0f;
321
7.22k
  float kDampenRampEnd = 14.0f;
322
7.22k
  float dampen = 1.0f;
323
7.22k
  if (butteraugli_target >= kDampenRampStart) {
324
0
    dampen = 1.0f - ((butteraugli_target - kDampenRampStart) /
325
0
                     (kDampenRampEnd - kDampenRampStart));
326
0
    if (dampen < 0) {
327
0
      dampen = 0;
328
0
    }
329
0
  }
330
7.22k
  const float mul = scale * dampen;
331
7.22k
  const float add = (1.0f - dampen) * base_level;
332
60.6k
  for (size_t iy = rect_out.y0(); iy < rect_out.y1(); iy++) {
333
53.4k
    const size_t y = iy * 8;
334
53.4k
    float* const JXL_RESTRICT row_out = out->Row(iy);
335
53.4k
    const HWY_CAPPED(float, kBlockDim) df;
336
451k
    for (size_t ix = rect_out.x0(); ix < rect_out.x1(); ix++) {
337
398k
      size_t x = ix * 8;
338
398k
      auto mask_val = ComputeMask(df, Set(df, row_out[ix]));
339
398k
      mask_val = GammaModulation(df, x, y, xyb_x, xyb_y, rect_in, mask_val);
340
398k
      auto out_val = HfModulation(df, x, y, xyb_y, rect_in, mask_val);
341
398k
      out_val = Min(out_val, BlueModulation(df, x, y, xyb_x, xyb_y, xyb_b,
342
398k
                                            rect_in, mask_val));
343
      // We want multiplicative quantization field, so everything
344
      // until this point has been modulating the exponent.
345
398k
      row_out[ix] = FastPow2f(GetLane(out_val) * 1.442695041f) * mul + add;
346
398k
    }
347
53.4k
  }
348
7.22k
}
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::PerBlockModulations(float, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, float, jxl::RectT<unsigned long> const&, jxl::Plane<float>*)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::PerBlockModulations(float, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, float, jxl::RectT<unsigned long> const&, jxl::Plane<float>*)
349
350
template <typename D, typename V>
351
6.84M
V MaskingSqrt(const D d, V v) {
352
6.84M
  static const float kLogOffset = 27.505837037000106f;
353
6.84M
  static const float kMul = 211.66567973503678f;
354
6.84M
  const auto mul_v = Set(d, kMul * 1e8);
355
6.84M
  const auto offset_v = Set(d, kLogOffset);
356
6.84M
  return Mul(Set(d, 0.25f), Sqrt(MulAdd(v, Sqrt(mul_v), offset_v)));
357
6.84M
}
enc_adaptive_quantization.cc:hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::(anonymous namespace)::MaskingSqrt<hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul>)
Line
Count
Source
351
3.29M
V MaskingSqrt(const D d, V v) {
352
3.29M
  static const float kLogOffset = 27.505837037000106f;
353
3.29M
  static const float kMul = 211.66567973503678f;
354
3.29M
  const auto mul_v = Set(d, kMul * 1e8);
355
3.29M
  const auto offset_v = Set(d, kLogOffset);
356
3.29M
  return Mul(Set(d, 0.25f), Sqrt(MulAdd(v, Sqrt(mul_v), offset_v)));
357
3.29M
}
enc_adaptive_quantization.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::MaskingSqrt<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>)
Line
Count
Source
351
3.54M
V MaskingSqrt(const D d, V v) {
352
3.54M
  static const float kLogOffset = 27.505837037000106f;
353
3.54M
  static const float kMul = 211.66567973503678f;
354
3.54M
  const auto mul_v = Set(d, kMul * 1e8);
355
3.54M
  const auto offset_v = Set(d, kLogOffset);
356
3.54M
  return Mul(Set(d, 0.25f), Sqrt(MulAdd(v, Sqrt(mul_v), offset_v)));
357
3.54M
}
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::(anonymous namespace)::MaskingSqrt<hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::MaskingSqrt<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 1ul> jxl::N_SSE2::(anonymous namespace)::MaskingSqrt<hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul> >(hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::(anonymous namespace)::MaskingSqrt<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>)
358
359
3.29M
float MaskingSqrt(const float v) {
360
3.29M
  using DScalar = HWY_CAPPED(float, 1);
361
3.29M
  auto vscalar = Load(DScalar(), &v);
362
3.29M
  return GetLane(MaskingSqrt(DScalar(), vscalar));
363
3.29M
}
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::MaskingSqrt(float)
Line
Count
Source
359
3.29M
float MaskingSqrt(const float v) {
360
3.29M
  using DScalar = HWY_CAPPED(float, 1);
361
3.29M
  auto vscalar = Load(DScalar(), &v);
362
3.29M
  return GetLane(MaskingSqrt(DScalar(), vscalar));
363
3.29M
}
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::MaskingSqrt(float)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::MaskingSqrt(float)
364
365
inline void StoreMin4(const float v, float& min0, float& min1, float& min2,
366
7.96M
               float& min3) {
367
7.96M
  if (v < min3) {
368
4.07M
    if (v < min0) {
369
923k
      min3 = min2;
370
923k
      min2 = min1;
371
923k
      min1 = min0;
372
923k
      min0 = v;
373
3.15M
    } else if (v < min1) {
374
1.02M
      min3 = min2;
375
1.02M
      min2 = min1;
376
1.02M
      min1 = v;
377
2.12M
    } else if (v < min2) {
378
1.06M
      min3 = min2;
379
1.06M
      min2 = v;
380
1.06M
    } else {
381
1.06M
      min3 = v;
382
1.06M
    }
383
4.07M
  }
384
7.96M
}
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::StoreMin4(float, float&, float&, float&, float&)
Line
Count
Source
366
7.96M
               float& min3) {
367
7.96M
  if (v < min3) {
368
4.07M
    if (v < min0) {
369
923k
      min3 = min2;
370
923k
      min2 = min1;
371
923k
      min1 = min0;
372
923k
      min0 = v;
373
3.15M
    } else if (v < min1) {
374
1.02M
      min3 = min2;
375
1.02M
      min2 = min1;
376
1.02M
      min1 = v;
377
2.12M
    } else if (v < min2) {
378
1.06M
      min3 = min2;
379
1.06M
      min2 = v;
380
1.06M
    } else {
381
1.06M
      min3 = v;
382
1.06M
    }
383
4.07M
  }
384
7.96M
}
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::StoreMin4(float, float&, float&, float&, float&)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::StoreMin4(float, float&, float&, float&, float&)
385
386
// Look for smooth areas near the area of degradation.
387
// If the areas are generally smooth, don't do masking.
388
// Output is downsampled 2x.
389
Status FuzzyErosion(const float butteraugli_target, const Rect& from_rect,
390
7.22k
                    const ImageF& from, const Rect& to_rect, ImageF* to) {
391
7.22k
  const size_t xsize = from.xsize();
392
7.22k
  const size_t ysize = from.ysize();
393
7.22k
  constexpr int kStep = 1;
394
7.22k
  static_assert(kStep == 1, "Step must be 1");
395
7.22k
  JXL_ENSURE(to_rect.xsize() * 2 == from_rect.xsize());
396
7.22k
  JXL_ENSURE(to_rect.ysize() * 2 == from_rect.ysize());
397
7.22k
  static const float kMulBase[4] = { 0.125, 0.1, 0.09, 0.06 };
398
7.22k
  static const float kMulAdd[4] = { 0.0, -0.1, -0.09, -0.06 };
399
7.22k
  float mul = 0.0;
400
7.22k
  if (butteraugli_target < 2.0f) {
401
7.22k
    mul = (2.0f - butteraugli_target) * (1.0f / 2.0f);
402
7.22k
  }
403
7.22k
  float kMul[4] = { 0 };
404
7.22k
  float norm_sum = 0.0;
405
36.1k
  for (size_t ii = 0; ii < 4; ++ii) {
406
28.8k
    kMul[ii] = kMulBase[ii] + mul * kMulAdd[ii];
407
28.8k
    norm_sum += kMul[ii];
408
28.8k
  }
409
7.22k
  static const float kTotal = 0.29959705784054957;
410
36.1k
  for (size_t ii = 0; ii < 4; ++ii) {
411
28.8k
    kMul[ii] *= kTotal / norm_sum;
412
28.8k
  }
413
114k
  for (size_t fy = 0; fy < from_rect.ysize(); ++fy) {
414
106k
    size_t y = fy + from_rect.y0();
415
106k
    size_t ym1 = y >= kStep ? y - kStep : y;
416
106k
    size_t yp1 = y + kStep < ysize ? y + kStep : y;
417
106k
    const float* rowt = from.Row(ym1);
418
106k
    const float* row = from.Row(y);
419
106k
    const float* rowb = from.Row(yp1);
420
106k
    float* row_out = to_rect.Row(to, fy / 2);
421
1.69M
    for (size_t fx = 0; fx < from_rect.xsize(); ++fx) {
422
1.59M
      size_t x = fx + from_rect.x0();
423
1.59M
      size_t xm1 = x >= kStep ? x - kStep : x;
424
1.59M
      size_t xp1 = x + kStep < xsize ? x + kStep : x;
425
1.59M
      float min[4] = { row[x], row[xm1], row[xp1], rowt[xm1] };
426
      // Sort the first four values.
427
1.59M
      if (min[0] > min[1]) std::swap(min[0], min[1]);
428
1.59M
      if (min[0] > min[2]) std::swap(min[0], min[2]);
429
1.59M
      if (min[0] > min[3]) std::swap(min[0], min[3]);
430
1.59M
      if (min[1] > min[2]) std::swap(min[1], min[2]);
431
1.59M
      if (min[1] > min[3]) std::swap(min[1], min[3]);
432
1.59M
      if (min[2] > min[3]) std::swap(min[2], min[3]);
433
      // The remaining five values of a 3x3 neighbourhood.
434
1.59M
      StoreMin4(rowt[x], min[0], min[1], min[2], min[3]);
435
1.59M
      StoreMin4(rowt[xp1], min[0], min[1], min[2], min[3]);
436
1.59M
      StoreMin4(rowb[xm1], min[0], min[1], min[2], min[3]);
437
1.59M
      StoreMin4(rowb[x], min[0], min[1], min[2], min[3]);
438
1.59M
      StoreMin4(rowb[xp1], min[0], min[1], min[2], min[3]);
439
440
1.59M
      float v = kMul[0] * min[0] + kMul[1] * min[1] + kMul[2] * min[2] + kMul[3] * min[3];
441
1.59M
      if (fx % 2 == 0 && fy % 2 == 0) {
442
398k
        row_out[fx / 2] = v;
443
1.19M
      } else {
444
1.19M
        row_out[fx / 2] += v;
445
1.19M
      }
446
1.59M
    }
447
106k
  }
448
7.22k
  return true;
449
7.22k
}
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::FuzzyErosion(float, jxl::RectT<unsigned long> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::Plane<float>*)
Line
Count
Source
390
7.22k
                    const ImageF& from, const Rect& to_rect, ImageF* to) {
391
7.22k
  const size_t xsize = from.xsize();
392
7.22k
  const size_t ysize = from.ysize();
393
7.22k
  constexpr int kStep = 1;
394
7.22k
  static_assert(kStep == 1, "Step must be 1");
395
7.22k
  JXL_ENSURE(to_rect.xsize() * 2 == from_rect.xsize());
396
7.22k
  JXL_ENSURE(to_rect.ysize() * 2 == from_rect.ysize());
397
7.22k
  static const float kMulBase[4] = { 0.125, 0.1, 0.09, 0.06 };
398
7.22k
  static const float kMulAdd[4] = { 0.0, -0.1, -0.09, -0.06 };
399
7.22k
  float mul = 0.0;
400
7.22k
  if (butteraugli_target < 2.0f) {
401
7.22k
    mul = (2.0f - butteraugli_target) * (1.0f / 2.0f);
402
7.22k
  }
403
7.22k
  float kMul[4] = { 0 };
404
7.22k
  float norm_sum = 0.0;
405
36.1k
  for (size_t ii = 0; ii < 4; ++ii) {
406
28.8k
    kMul[ii] = kMulBase[ii] + mul * kMulAdd[ii];
407
28.8k
    norm_sum += kMul[ii];
408
28.8k
  }
409
7.22k
  static const float kTotal = 0.29959705784054957;
410
36.1k
  for (size_t ii = 0; ii < 4; ++ii) {
411
28.8k
    kMul[ii] *= kTotal / norm_sum;
412
28.8k
  }
413
114k
  for (size_t fy = 0; fy < from_rect.ysize(); ++fy) {
414
106k
    size_t y = fy + from_rect.y0();
415
106k
    size_t ym1 = y >= kStep ? y - kStep : y;
416
106k
    size_t yp1 = y + kStep < ysize ? y + kStep : y;
417
106k
    const float* rowt = from.Row(ym1);
418
106k
    const float* row = from.Row(y);
419
106k
    const float* rowb = from.Row(yp1);
420
106k
    float* row_out = to_rect.Row(to, fy / 2);
421
1.69M
    for (size_t fx = 0; fx < from_rect.xsize(); ++fx) {
422
1.59M
      size_t x = fx + from_rect.x0();
423
1.59M
      size_t xm1 = x >= kStep ? x - kStep : x;
424
1.59M
      size_t xp1 = x + kStep < xsize ? x + kStep : x;
425
1.59M
      float min[4] = { row[x], row[xm1], row[xp1], rowt[xm1] };
426
      // Sort the first four values.
427
1.59M
      if (min[0] > min[1]) std::swap(min[0], min[1]);
428
1.59M
      if (min[0] > min[2]) std::swap(min[0], min[2]);
429
1.59M
      if (min[0] > min[3]) std::swap(min[0], min[3]);
430
1.59M
      if (min[1] > min[2]) std::swap(min[1], min[2]);
431
1.59M
      if (min[1] > min[3]) std::swap(min[1], min[3]);
432
1.59M
      if (min[2] > min[3]) std::swap(min[2], min[3]);
433
      // The remaining five values of a 3x3 neighbourhood.
434
1.59M
      StoreMin4(rowt[x], min[0], min[1], min[2], min[3]);
435
1.59M
      StoreMin4(rowt[xp1], min[0], min[1], min[2], min[3]);
436
1.59M
      StoreMin4(rowb[xm1], min[0], min[1], min[2], min[3]);
437
1.59M
      StoreMin4(rowb[x], min[0], min[1], min[2], min[3]);
438
1.59M
      StoreMin4(rowb[xp1], min[0], min[1], min[2], min[3]);
439
440
1.59M
      float v = kMul[0] * min[0] + kMul[1] * min[1] + kMul[2] * min[2] + kMul[3] * min[3];
441
1.59M
      if (fx % 2 == 0 && fy % 2 == 0) {
442
398k
        row_out[fx / 2] = v;
443
1.19M
      } else {
444
1.19M
        row_out[fx / 2] += v;
445
1.19M
      }
446
1.59M
    }
447
106k
  }
448
7.22k
  return true;
449
7.22k
}
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::FuzzyErosion(float, jxl::RectT<unsigned long> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::Plane<float>*)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::FuzzyErosion(float, jxl::RectT<unsigned long> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::Plane<float>*)
450
451
struct AdaptiveQuantizationImpl {
452
186
  Status PrepareBuffers(JxlMemoryManager* memory_manager, size_t num_threads) {
453
186
    JXL_ASSIGN_OR_RETURN(
454
186
        diff_buffer,
455
186
        ImageF::Create(memory_manager, kEncTileDim + 8, num_threads));
456
372
    for (size_t i = pre_erosion.size(); i < num_threads; i++) {
457
186
      JXL_ASSIGN_OR_RETURN(
458
186
          ImageF tmp,
459
186
          ImageF::Create(memory_manager, kEncTileDimInBlocks * 2 + 2,
460
186
                         kEncTileDimInBlocks * 2 + 2));
461
186
      pre_erosion.emplace_back(std::move(tmp));
462
186
    }
463
186
    return true;
464
186
  }
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::AdaptiveQuantizationImpl::PrepareBuffers(JxlMemoryManagerStruct*, unsigned long)
Line
Count
Source
452
186
  Status PrepareBuffers(JxlMemoryManager* memory_manager, size_t num_threads) {
453
186
    JXL_ASSIGN_OR_RETURN(
454
186
        diff_buffer,
455
186
        ImageF::Create(memory_manager, kEncTileDim + 8, num_threads));
456
372
    for (size_t i = pre_erosion.size(); i < num_threads; i++) {
457
186
      JXL_ASSIGN_OR_RETURN(
458
186
          ImageF tmp,
459
186
          ImageF::Create(memory_manager, kEncTileDimInBlocks * 2 + 2,
460
186
                         kEncTileDimInBlocks * 2 + 2));
461
186
      pre_erosion.emplace_back(std::move(tmp));
462
186
    }
463
186
    return true;
464
186
  }
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::AdaptiveQuantizationImpl::PrepareBuffers(JxlMemoryManagerStruct*, unsigned long)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::AdaptiveQuantizationImpl::PrepareBuffers(JxlMemoryManagerStruct*, unsigned long)
465
466
  Status ComputeTile(float butteraugli_target, float scale, const Image3F& xyb,
467
                     const Rect& rect_in, const Rect& rect_out,
468
7.22k
                     const int thread, ImageF* mask, ImageF* mask1x1) {
469
7.22k
    JXL_ENSURE(rect_in.x0() % kBlockDim == 0);
470
7.22k
    JXL_ENSURE(rect_in.y0() % kBlockDim == 0);
471
7.22k
    const size_t xsize = xyb.xsize();
472
7.22k
    const size_t ysize = xyb.ysize();
473
474
    // The XYB gamma is 3.0 to be able to decode faster with two muls.
475
    // Butteraugli's gamma is matching the gamma of human eye, around 2.6.
476
    // We approximate the gamma difference by adding one cubic root into
477
    // the adaptive quantization. This gives us a total gamma of 2.6666
478
    // for quantization uses.
479
7.22k
    const float match_gamma_offset = 0.019;
480
481
7.22k
    const HWY_FULL(float) df;
482
483
7.22k
    size_t y_start_1x1 = rect_in.y0() + rect_out.y0() * 8;
484
7.22k
    size_t y_end_1x1 = y_start_1x1 + rect_out.ysize() * 8;
485
486
7.22k
    size_t x_start_1x1 = rect_in.x0() + rect_out.x0() * 8;
487
7.22k
    size_t x_end_1x1 = x_start_1x1 + rect_out.xsize() * 8;
488
489
7.22k
    if (rect_in.x0() != 0 && rect_out.x0() == 0) x_start_1x1 -= 2;
490
7.22k
    if (rect_in.x1() < xsize && rect_out.x1() * 8 == rect_in.xsize()) {
491
0
      x_end_1x1 += 2;
492
0
    }
493
7.22k
    if (rect_in.y0() != 0 && rect_out.y0() == 0) y_start_1x1 -= 2;
494
7.22k
    if (rect_in.y1() < ysize && rect_out.y1() * 8 == rect_in.ysize()) {
495
0
      y_end_1x1 += 2;
496
0
    }
497
498
    // Computes image (padded to multiple of 8x8) of local pixel differences.
499
    // Subsample both directions by 4.
500
    // 1x1 Laplacian of intensity.
501
434k
    for (size_t y = y_start_1x1; y < y_end_1x1; ++y) {
502
427k
      const size_t y2 = y + 1 < ysize ? y + 1 : y;
503
427k
      const size_t y1 = y > 0 ? y - 1 : y;
504
427k
      const float* row_in = xyb.ConstPlaneRow(1, y);
505
427k
      const float* row_in1 = xyb.ConstPlaneRow(1, y1);
506
427k
      const float* row_in2 = xyb.ConstPlaneRow(1, y2);
507
427k
      float* mask1x1_out = mask1x1->Row(y);
508
25.4M
      auto scalar_pixel1x1 = [&](size_t x) {
509
25.4M
        const size_t x2 = x + 1 < xsize ? x + 1 : x;
510
25.4M
        const size_t x1 = x > 0 ? x - 1 : x;
511
25.4M
        const float base =
512
25.4M
            0.25f * (row_in2[x] + row_in1[x] + row_in[x1] + row_in[x2]);
513
25.4M
        const float gammac = RatioOfDerivativesOfCubicRootToSimpleGamma(
514
25.4M
            row_in[x] + match_gamma_offset);
515
25.4M
        float diff = std::abs(gammac * (row_in[x] - base));
516
25.4M
        static const double kScaler = 1.0;
517
25.4M
        diff *= kScaler;
518
25.4M
        diff = std::log1p(diff);
519
25.4M
        static const float kMul = 1.0;
520
25.4M
        static const float kOffset = 0.01;
521
25.4M
        mask1x1_out[x] = kMul / (diff + kOffset);
522
25.4M
      };
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::RectT<unsigned long> const&, int, jxl::Plane<float>*, jxl::Plane<float>*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Line
Count
Source
508
25.4M
      auto scalar_pixel1x1 = [&](size_t x) {
509
25.4M
        const size_t x2 = x + 1 < xsize ? x + 1 : x;
510
25.4M
        const size_t x1 = x > 0 ? x - 1 : x;
511
25.4M
        const float base =
512
25.4M
            0.25f * (row_in2[x] + row_in1[x] + row_in[x1] + row_in[x2]);
513
25.4M
        const float gammac = RatioOfDerivativesOfCubicRootToSimpleGamma(
514
25.4M
            row_in[x] + match_gamma_offset);
515
25.4M
        float diff = std::abs(gammac * (row_in[x] - base));
516
25.4M
        static const double kScaler = 1.0;
517
25.4M
        diff *= kScaler;
518
25.4M
        diff = std::log1p(diff);
519
25.4M
        static const float kMul = 1.0;
520
25.4M
        static const float kOffset = 0.01;
521
25.4M
        mask1x1_out[x] = kMul / (diff + kOffset);
522
25.4M
      };
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::RectT<unsigned long> const&, int, jxl::Plane<float>*, jxl::Plane<float>*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::RectT<unsigned long> const&, int, jxl::Plane<float>*, jxl::Plane<float>*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
523
25.9M
      for (size_t x = x_start_1x1; x < x_end_1x1; ++x) {
524
25.4M
        scalar_pixel1x1(x);
525
25.4M
      }
526
427k
    }
527
528
7.22k
    size_t y_start = rect_in.y0() + rect_out.y0() * 8;
529
7.22k
    size_t y_end = y_start + rect_out.ysize() * 8;
530
531
7.22k
    size_t x_start = rect_in.x0() + rect_out.x0() * 8;
532
7.22k
    size_t x_end = x_start + rect_out.xsize() * 8;
533
534
7.22k
    if (x_start != 0) x_start -= 4;
535
7.22k
    if (x_end != xsize) x_end += 4;
536
7.22k
    if (y_start != 0) y_start -= 4;
537
7.22k
    if (y_end != ysize) y_end += 4;
538
7.22k
    JXL_RETURN_IF_ERROR(pre_erosion[thread].ShrinkTo((x_end - x_start) / 4,
539
7.22k
                                                     (y_end - y_start) / 4));
540
541
7.22k
    static const float limit = 0.2f;
542
483k
    for (size_t y = y_start; y < y_end; ++y) {
543
476k
      size_t y2 = y + 1 < ysize ? y + 1 : y;
544
476k
      size_t y1 = y > 0 ? y - 1 : y;
545
546
476k
      const float* row_in = xyb.ConstPlaneRow(1, y);
547
476k
      const float* row_in1 = xyb.ConstPlaneRow(1, y1);
548
476k
      const float* row_in2 = xyb.ConstPlaneRow(1, y2);
549
476k
      float* JXL_RESTRICT row_out = diff_buffer.Row(thread);
550
551
3.29M
      auto scalar_pixel = [&](size_t x) {
552
3.29M
        const size_t x2 = x + 1 < xsize ? x + 1 : x;
553
3.29M
        const size_t x1 = x > 0 ? x - 1 : x;
554
3.29M
        const float base =
555
3.29M
            0.25f * (row_in2[x] + row_in1[x] + row_in[x1] + row_in[x2]);
556
3.29M
        const float gammac = RatioOfDerivativesOfCubicRootToSimpleGamma(
557
3.29M
            row_in[x] + match_gamma_offset);
558
3.29M
        float diff = gammac * (row_in[x] - base);
559
3.29M
        diff *= diff;
560
3.29M
        if (diff >= limit) {
561
609k
          diff = limit;
562
609k
        }
563
3.29M
        diff = MaskingSqrt(diff);
564
3.29M
        if ((y % 4) != 0) {
565
2.47M
          row_out[x - x_start] += diff;
566
2.47M
        } else {
567
824k
          row_out[x - x_start] = diff;
568
824k
        }
569
3.29M
      };
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::RectT<unsigned long> const&, int, jxl::Plane<float>*, jxl::Plane<float>*)::{lambda(unsigned long)#2}::operator()(unsigned long) const
Line
Count
Source
551
3.29M
      auto scalar_pixel = [&](size_t x) {
552
3.29M
        const size_t x2 = x + 1 < xsize ? x + 1 : x;
553
3.29M
        const size_t x1 = x > 0 ? x - 1 : x;
554
3.29M
        const float base =
555
3.29M
            0.25f * (row_in2[x] + row_in1[x] + row_in[x1] + row_in[x2]);
556
3.29M
        const float gammac = RatioOfDerivativesOfCubicRootToSimpleGamma(
557
3.29M
            row_in[x] + match_gamma_offset);
558
3.29M
        float diff = gammac * (row_in[x] - base);
559
3.29M
        diff *= diff;
560
3.29M
        if (diff >= limit) {
561
609k
          diff = limit;
562
609k
        }
563
3.29M
        diff = MaskingSqrt(diff);
564
3.29M
        if ((y % 4) != 0) {
565
2.47M
          row_out[x - x_start] += diff;
566
2.47M
        } else {
567
824k
          row_out[x - x_start] = diff;
568
824k
        }
569
3.29M
      };
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::RectT<unsigned long> const&, int, jxl::Plane<float>*, jxl::Plane<float>*)::{lambda(unsigned long)#2}::operator()(unsigned long) const
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::RectT<unsigned long> const&, int, jxl::Plane<float>*, jxl::Plane<float>*)::{lambda(unsigned long)#2}::operator()(unsigned long) const
570
571
476k
      size_t x = x_start;
572
      // First pixel of the row.
573
476k
      if (x_start == 0) {
574
65.2k
        scalar_pixel(x_start);
575
65.2k
        ++x;
576
65.2k
      }
577
      // SIMD
578
476k
      const auto match_gamma_offset_v = Set(df, match_gamma_offset);
579
476k
      const auto quarter = Set(df, 0.25f);
580
4.02M
      for (; x + 1 + Lanes(df) < x_end; x += Lanes(df)) {
581
3.54M
        const auto in = LoadU(df, row_in + x);
582
3.54M
        const auto in_r = LoadU(df, row_in + x + 1);
583
3.54M
        const auto in_l = LoadU(df, row_in + x - 1);
584
3.54M
        const auto in_t = LoadU(df, row_in2 + x);
585
3.54M
        const auto in_b = LoadU(df, row_in1 + x);
586
3.54M
        auto base = Mul(quarter, Add(Add(in_r, in_l), Add(in_t, in_b)));
587
3.54M
        auto gammacv =
588
3.54M
            RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/false>(
589
3.54M
                df, Add(in, match_gamma_offset_v));
590
3.54M
        auto diff = Mul(gammacv, Sub(in, base));
591
3.54M
        diff = Mul(diff, diff);
592
3.54M
        diff = Min(diff, Set(df, limit));
593
3.54M
        diff = MaskingSqrt(df, diff);
594
3.54M
        if ((y & 3) != 0) {
595
2.66M
          diff = Add(diff, LoadU(df, row_out + x - x_start));
596
2.66M
        }
597
3.54M
        StoreU(diff, df, row_out + x - x_start);
598
3.54M
      }
599
      // Scalar
600
3.70M
      for (; x < x_end; ++x) {
601
3.23M
        scalar_pixel(x);
602
3.23M
      }
603
476k
      if (y % 4 == 3) {
604
119k
        float* row_d_out = pre_erosion[thread].Row((y - y_start) / 4);
605
2.09M
        for (size_t qx = 0; qx < (x_end - x_start) / 4; qx++) {
606
1.98M
          row_d_out[qx] = (row_out[qx * 4] + row_out[qx * 4 + 1] +
607
1.98M
                           row_out[qx * 4 + 2] + row_out[qx * 4 + 3]) *
608
1.98M
                          0.25f;
609
1.98M
        }
610
119k
      }
611
476k
    }
612
7.22k
    JXL_ENSURE(x_start % (kBlockDim / 2) == 0);
613
7.22k
    JXL_ENSURE(y_start % (kBlockDim / 2) == 0);
614
7.22k
    Rect from_rect(x_start % 8 == 0 ? 0 : 1, y_start % 8 == 0 ? 0 : 1,
615
7.22k
                   rect_out.xsize() * 2, rect_out.ysize() * 2);
616
7.22k
    JXL_RETURN_IF_ERROR(FuzzyErosion(butteraugli_target, from_rect,
617
7.22k
                                     pre_erosion[thread], rect_out, &aq_map));
618
60.6k
    for (size_t y = 0; y < rect_out.ysize(); ++y) {
619
53.4k
      const float* aq_map_row = rect_out.ConstRow(aq_map, y);
620
53.4k
      float* mask_row = rect_out.Row(mask, y);
621
451k
      for (size_t x = 0; x < rect_out.xsize(); ++x) {
622
398k
        mask_row[x] = ComputeMaskForAcStrategyUse(aq_map_row[x]);
623
398k
      }
624
53.4k
    }
625
7.22k
    PerBlockModulations(butteraugli_target, xyb.Plane(0), xyb.Plane(1),
626
7.22k
                        xyb.Plane(2), rect_in, scale, rect_out, &aq_map);
627
7.22k
    return true;
628
7.22k
  }
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::RectT<unsigned long> const&, int, jxl::Plane<float>*, jxl::Plane<float>*)
Line
Count
Source
468
7.22k
                     const int thread, ImageF* mask, ImageF* mask1x1) {
469
7.22k
    JXL_ENSURE(rect_in.x0() % kBlockDim == 0);
470
7.22k
    JXL_ENSURE(rect_in.y0() % kBlockDim == 0);
471
7.22k
    const size_t xsize = xyb.xsize();
472
7.22k
    const size_t ysize = xyb.ysize();
473
474
    // The XYB gamma is 3.0 to be able to decode faster with two muls.
475
    // Butteraugli's gamma is matching the gamma of human eye, around 2.6.
476
    // We approximate the gamma difference by adding one cubic root into
477
    // the adaptive quantization. This gives us a total gamma of 2.6666
478
    // for quantization uses.
479
7.22k
    const float match_gamma_offset = 0.019;
480
481
7.22k
    const HWY_FULL(float) df;
482
483
7.22k
    size_t y_start_1x1 = rect_in.y0() + rect_out.y0() * 8;
484
7.22k
    size_t y_end_1x1 = y_start_1x1 + rect_out.ysize() * 8;
485
486
7.22k
    size_t x_start_1x1 = rect_in.x0() + rect_out.x0() * 8;
487
7.22k
    size_t x_end_1x1 = x_start_1x1 + rect_out.xsize() * 8;
488
489
7.22k
    if (rect_in.x0() != 0 && rect_out.x0() == 0) x_start_1x1 -= 2;
490
7.22k
    if (rect_in.x1() < xsize && rect_out.x1() * 8 == rect_in.xsize()) {
491
0
      x_end_1x1 += 2;
492
0
    }
493
7.22k
    if (rect_in.y0() != 0 && rect_out.y0() == 0) y_start_1x1 -= 2;
494
7.22k
    if (rect_in.y1() < ysize && rect_out.y1() * 8 == rect_in.ysize()) {
495
0
      y_end_1x1 += 2;
496
0
    }
497
498
    // Computes image (padded to multiple of 8x8) of local pixel differences.
499
    // Subsample both directions by 4.
500
    // 1x1 Laplacian of intensity.
501
434k
    for (size_t y = y_start_1x1; y < y_end_1x1; ++y) {
502
427k
      const size_t y2 = y + 1 < ysize ? y + 1 : y;
503
427k
      const size_t y1 = y > 0 ? y - 1 : y;
504
427k
      const float* row_in = xyb.ConstPlaneRow(1, y);
505
427k
      const float* row_in1 = xyb.ConstPlaneRow(1, y1);
506
427k
      const float* row_in2 = xyb.ConstPlaneRow(1, y2);
507
427k
      float* mask1x1_out = mask1x1->Row(y);
508
427k
      auto scalar_pixel1x1 = [&](size_t x) {
509
427k
        const size_t x2 = x + 1 < xsize ? x + 1 : x;
510
427k
        const size_t x1 = x > 0 ? x - 1 : x;
511
427k
        const float base =
512
427k
            0.25f * (row_in2[x] + row_in1[x] + row_in[x1] + row_in[x2]);
513
427k
        const float gammac = RatioOfDerivativesOfCubicRootToSimpleGamma(
514
427k
            row_in[x] + match_gamma_offset);
515
427k
        float diff = std::abs(gammac * (row_in[x] - base));
516
427k
        static const double kScaler = 1.0;
517
427k
        diff *= kScaler;
518
427k
        diff = std::log1p(diff);
519
427k
        static const float kMul = 1.0;
520
427k
        static const float kOffset = 0.01;
521
427k
        mask1x1_out[x] = kMul / (diff + kOffset);
522
427k
      };
523
25.9M
      for (size_t x = x_start_1x1; x < x_end_1x1; ++x) {
524
25.4M
        scalar_pixel1x1(x);
525
25.4M
      }
526
427k
    }
527
528
7.22k
    size_t y_start = rect_in.y0() + rect_out.y0() * 8;
529
7.22k
    size_t y_end = y_start + rect_out.ysize() * 8;
530
531
7.22k
    size_t x_start = rect_in.x0() + rect_out.x0() * 8;
532
7.22k
    size_t x_end = x_start + rect_out.xsize() * 8;
533
534
7.22k
    if (x_start != 0) x_start -= 4;
535
7.22k
    if (x_end != xsize) x_end += 4;
536
7.22k
    if (y_start != 0) y_start -= 4;
537
7.22k
    if (y_end != ysize) y_end += 4;
538
7.22k
    JXL_RETURN_IF_ERROR(pre_erosion[thread].ShrinkTo((x_end - x_start) / 4,
539
7.22k
                                                     (y_end - y_start) / 4));
540
541
7.22k
    static const float limit = 0.2f;
542
483k
    for (size_t y = y_start; y < y_end; ++y) {
543
476k
      size_t y2 = y + 1 < ysize ? y + 1 : y;
544
476k
      size_t y1 = y > 0 ? y - 1 : y;
545
546
476k
      const float* row_in = xyb.ConstPlaneRow(1, y);
547
476k
      const float* row_in1 = xyb.ConstPlaneRow(1, y1);
548
476k
      const float* row_in2 = xyb.ConstPlaneRow(1, y2);
549
476k
      float* JXL_RESTRICT row_out = diff_buffer.Row(thread);
550
551
476k
      auto scalar_pixel = [&](size_t x) {
552
476k
        const size_t x2 = x + 1 < xsize ? x + 1 : x;
553
476k
        const size_t x1 = x > 0 ? x - 1 : x;
554
476k
        const float base =
555
476k
            0.25f * (row_in2[x] + row_in1[x] + row_in[x1] + row_in[x2]);
556
476k
        const float gammac = RatioOfDerivativesOfCubicRootToSimpleGamma(
557
476k
            row_in[x] + match_gamma_offset);
558
476k
        float diff = gammac * (row_in[x] - base);
559
476k
        diff *= diff;
560
476k
        if (diff >= limit) {
561
476k
          diff = limit;
562
476k
        }
563
476k
        diff = MaskingSqrt(diff);
564
476k
        if ((y % 4) != 0) {
565
476k
          row_out[x - x_start] += diff;
566
476k
        } else {
567
476k
          row_out[x - x_start] = diff;
568
476k
        }
569
476k
      };
570
571
476k
      size_t x = x_start;
572
      // First pixel of the row.
573
476k
      if (x_start == 0) {
574
65.2k
        scalar_pixel(x_start);
575
65.2k
        ++x;
576
65.2k
      }
577
      // SIMD
578
476k
      const auto match_gamma_offset_v = Set(df, match_gamma_offset);
579
476k
      const auto quarter = Set(df, 0.25f);
580
4.02M
      for (; x + 1 + Lanes(df) < x_end; x += Lanes(df)) {
581
3.54M
        const auto in = LoadU(df, row_in + x);
582
3.54M
        const auto in_r = LoadU(df, row_in + x + 1);
583
3.54M
        const auto in_l = LoadU(df, row_in + x - 1);
584
3.54M
        const auto in_t = LoadU(df, row_in2 + x);
585
3.54M
        const auto in_b = LoadU(df, row_in1 + x);
586
3.54M
        auto base = Mul(quarter, Add(Add(in_r, in_l), Add(in_t, in_b)));
587
3.54M
        auto gammacv =
588
3.54M
            RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/false>(
589
3.54M
                df, Add(in, match_gamma_offset_v));
590
3.54M
        auto diff = Mul(gammacv, Sub(in, base));
591
3.54M
        diff = Mul(diff, diff);
592
3.54M
        diff = Min(diff, Set(df, limit));
593
3.54M
        diff = MaskingSqrt(df, diff);
594
3.54M
        if ((y & 3) != 0) {
595
2.66M
          diff = Add(diff, LoadU(df, row_out + x - x_start));
596
2.66M
        }
597
3.54M
        StoreU(diff, df, row_out + x - x_start);
598
3.54M
      }
599
      // Scalar
600
3.70M
      for (; x < x_end; ++x) {
601
3.23M
        scalar_pixel(x);
602
3.23M
      }
603
476k
      if (y % 4 == 3) {
604
119k
        float* row_d_out = pre_erosion[thread].Row((y - y_start) / 4);
605
2.09M
        for (size_t qx = 0; qx < (x_end - x_start) / 4; qx++) {
606
1.98M
          row_d_out[qx] = (row_out[qx * 4] + row_out[qx * 4 + 1] +
607
1.98M
                           row_out[qx * 4 + 2] + row_out[qx * 4 + 3]) *
608
1.98M
                          0.25f;
609
1.98M
        }
610
119k
      }
611
476k
    }
612
7.22k
    JXL_ENSURE(x_start % (kBlockDim / 2) == 0);
613
7.22k
    JXL_ENSURE(y_start % (kBlockDim / 2) == 0);
614
7.22k
    Rect from_rect(x_start % 8 == 0 ? 0 : 1, y_start % 8 == 0 ? 0 : 1,
615
7.22k
                   rect_out.xsize() * 2, rect_out.ysize() * 2);
616
7.22k
    JXL_RETURN_IF_ERROR(FuzzyErosion(butteraugli_target, from_rect,
617
7.22k
                                     pre_erosion[thread], rect_out, &aq_map));
618
60.6k
    for (size_t y = 0; y < rect_out.ysize(); ++y) {
619
53.4k
      const float* aq_map_row = rect_out.ConstRow(aq_map, y);
620
53.4k
      float* mask_row = rect_out.Row(mask, y);
621
451k
      for (size_t x = 0; x < rect_out.xsize(); ++x) {
622
398k
        mask_row[x] = ComputeMaskForAcStrategyUse(aq_map_row[x]);
623
398k
      }
624
53.4k
    }
625
7.22k
    PerBlockModulations(butteraugli_target, xyb.Plane(0), xyb.Plane(1),
626
7.22k
                        xyb.Plane(2), rect_in, scale, rect_out, &aq_map);
627
7.22k
    return true;
628
7.22k
  }
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::RectT<unsigned long> const&, int, jxl::Plane<float>*, jxl::Plane<float>*)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::RectT<unsigned long> const&, int, jxl::Plane<float>*, jxl::Plane<float>*)
629
  std::vector<ImageF> pre_erosion;
630
  ImageF aq_map;
631
  ImageF diff_buffer;
632
};
633
634
Status Blur1x1Masking(JxlMemoryManager* memory_manager, ThreadPool* pool,
635
186
                      ImageF* mask1x1, const Rect& rect) {
636
  // Blur the mask1x1 to obtain the masking image.
637
  // Before blurring it contains an image of absolute value of the
638
  // Laplacian of the intensity channel.
639
186
  static const float kFilterMask1x1[5] = {
640
186
    0.364911248, 0.05, 0.1688888021, 0.221069183, 0.306563504,
641
186
  };
642
186
  double sum =
643
186
      1.0 + 4 * (kFilterMask1x1[0] + kFilterMask1x1[1] + kFilterMask1x1[2] +
644
186
                 kFilterMask1x1[4] + 2 * kFilterMask1x1[3]);
645
186
  if (sum < 1e-5) {
646
0
    sum = 1e-5;
647
0
  }
648
186
  const float normalize = static_cast<float>(1.0 / sum);
649
186
  WeightsSymmetric5 weights =
650
186
      WeightsSymmetric5{{HWY_REP4(normalize)},
651
186
                        {HWY_REP4(normalize * kFilterMask1x1[0])},
652
186
                        {HWY_REP4(normalize * kFilterMask1x1[2])},
653
186
                        {HWY_REP4(normalize * kFilterMask1x1[1])},
654
186
                        {HWY_REP4(normalize * kFilterMask1x1[4])},
655
186
                        {HWY_REP4(normalize * kFilterMask1x1[3])}};
656
186
  JXL_ASSIGN_OR_RETURN(
657
186
      ImageF temp, ImageF::Create(memory_manager, rect.xsize(), rect.ysize()));
658
186
  JXL_RETURN_IF_ERROR(
659
186
      Symmetric5(*mask1x1, rect, weights, pool, &temp, Rect(temp)));
660
186
  *mask1x1 = std::move(temp);
661
186
  return true;
662
186
}
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::Blur1x1Masking(JxlMemoryManagerStruct*, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&)
Line
Count
Source
635
186
                      ImageF* mask1x1, const Rect& rect) {
636
  // Blur the mask1x1 to obtain the masking image.
637
  // Before blurring it contains an image of absolute value of the
638
  // Laplacian of the intensity channel.
639
186
  static const float kFilterMask1x1[5] = {
640
186
    0.364911248, 0.05, 0.1688888021, 0.221069183, 0.306563504,
641
186
  };
642
186
  double sum =
643
186
      1.0 + 4 * (kFilterMask1x1[0] + kFilterMask1x1[1] + kFilterMask1x1[2] +
644
186
                 kFilterMask1x1[4] + 2 * kFilterMask1x1[3]);
645
186
  if (sum < 1e-5) {
646
0
    sum = 1e-5;
647
0
  }
648
186
  const float normalize = static_cast<float>(1.0 / sum);
649
186
  WeightsSymmetric5 weights =
650
186
      WeightsSymmetric5{{HWY_REP4(normalize)},
651
186
                        {HWY_REP4(normalize * kFilterMask1x1[0])},
652
186
                        {HWY_REP4(normalize * kFilterMask1x1[2])},
653
186
                        {HWY_REP4(normalize * kFilterMask1x1[1])},
654
186
                        {HWY_REP4(normalize * kFilterMask1x1[4])},
655
186
                        {HWY_REP4(normalize * kFilterMask1x1[3])}};
656
186
  JXL_ASSIGN_OR_RETURN(
657
186
      ImageF temp, ImageF::Create(memory_manager, rect.xsize(), rect.ysize()));
658
186
  JXL_RETURN_IF_ERROR(
659
186
      Symmetric5(*mask1x1, rect, weights, pool, &temp, Rect(temp)));
660
186
  *mask1x1 = std::move(temp);
661
186
  return true;
662
186
}
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::Blur1x1Masking(JxlMemoryManagerStruct*, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::Blur1x1Masking(JxlMemoryManagerStruct*, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&)
663
664
StatusOr<ImageF> AdaptiveQuantizationMap(const float butteraugli_target,
665
                                         const Image3F& xyb, const Rect& rect,
666
                                         float scale, ThreadPool* pool,
667
186
                                         ImageF* mask, ImageF* mask1x1) {
668
186
  JXL_ENSURE(rect.xsize() % kBlockDim == 0);
669
186
  JXL_ENSURE(rect.ysize() % kBlockDim == 0);
670
186
  AdaptiveQuantizationImpl impl;
671
186
  const size_t xsize_blocks = rect.xsize() / kBlockDim;
672
186
  const size_t ysize_blocks = rect.ysize() / kBlockDim;
673
186
  JxlMemoryManager* memory_manager = xyb.memory_manager();
674
186
  JXL_ASSIGN_OR_RETURN(
675
186
      impl.aq_map, ImageF::Create(memory_manager, xsize_blocks, ysize_blocks));
676
186
  JXL_ASSIGN_OR_RETURN(
677
186
      *mask, ImageF::Create(memory_manager, xsize_blocks, ysize_blocks));
678
186
  JXL_ASSIGN_OR_RETURN(
679
186
      *mask1x1, ImageF::Create(memory_manager, xyb.xsize(), xyb.ysize()));
680
186
  const auto prepare = [&](const size_t num_threads) -> Status {
681
186
    JXL_RETURN_IF_ERROR(impl.PrepareBuffers(memory_manager, num_threads));
682
186
    return true;
683
186
  };
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, float, jxl::ThreadPool*, jxl::Plane<float>*, jxl::Plane<float>*)::$_0::operator()(unsigned long) const
Line
Count
Source
680
186
  const auto prepare = [&](const size_t num_threads) -> Status {
681
186
    JXL_RETURN_IF_ERROR(impl.PrepareBuffers(memory_manager, num_threads));
682
186
    return true;
683
186
  };
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, float, jxl::ThreadPool*, jxl::Plane<float>*, jxl::Plane<float>*)::$_0::operator()(unsigned long) const
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, float, jxl::ThreadPool*, jxl::Plane<float>*, jxl::Plane<float>*)::$_0::operator()(unsigned long) const
684
186
  const auto process_tile = [&](const uint32_t tid,
685
7.22k
                                const size_t thread) -> Status {
686
7.22k
    size_t n_enc_tiles = DivCeil(xsize_blocks, kEncTileDimInBlocks);
687
7.22k
    size_t tx = tid % n_enc_tiles;
688
7.22k
    size_t ty = tid / n_enc_tiles;
689
7.22k
    size_t by0 = ty * kEncTileDimInBlocks;
690
7.22k
    size_t by1 = std::min((ty + 1) * kEncTileDimInBlocks, ysize_blocks);
691
7.22k
    size_t bx0 = tx * kEncTileDimInBlocks;
692
7.22k
    size_t bx1 = std::min((tx + 1) * kEncTileDimInBlocks, xsize_blocks);
693
7.22k
    Rect rect_out(bx0, by0, bx1 - bx0, by1 - by0);
694
7.22k
    JXL_RETURN_IF_ERROR(impl.ComputeTile(butteraugli_target, scale, xyb, rect,
695
7.22k
                                         rect_out, thread, mask, mask1x1));
696
7.22k
    return true;
697
7.22k
  };
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, float, jxl::ThreadPool*, jxl::Plane<float>*, jxl::Plane<float>*)::$_1::operator()(unsigned int, unsigned long) const
Line
Count
Source
685
7.22k
                                const size_t thread) -> Status {
686
7.22k
    size_t n_enc_tiles = DivCeil(xsize_blocks, kEncTileDimInBlocks);
687
7.22k
    size_t tx = tid % n_enc_tiles;
688
7.22k
    size_t ty = tid / n_enc_tiles;
689
7.22k
    size_t by0 = ty * kEncTileDimInBlocks;
690
7.22k
    size_t by1 = std::min((ty + 1) * kEncTileDimInBlocks, ysize_blocks);
691
7.22k
    size_t bx0 = tx * kEncTileDimInBlocks;
692
7.22k
    size_t bx1 = std::min((tx + 1) * kEncTileDimInBlocks, xsize_blocks);
693
7.22k
    Rect rect_out(bx0, by0, bx1 - bx0, by1 - by0);
694
7.22k
    JXL_RETURN_IF_ERROR(impl.ComputeTile(butteraugli_target, scale, xyb, rect,
695
7.22k
                                         rect_out, thread, mask, mask1x1));
696
7.22k
    return true;
697
7.22k
  };
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, float, jxl::ThreadPool*, jxl::Plane<float>*, jxl::Plane<float>*)::$_1::operator()(unsigned int, unsigned long) const
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, float, jxl::ThreadPool*, jxl::Plane<float>*, jxl::Plane<float>*)::$_1::operator()(unsigned int, unsigned long) const
698
186
  size_t num_tiles = DivCeil(xsize_blocks, kEncTileDimInBlocks) *
699
186
                     DivCeil(ysize_blocks, kEncTileDimInBlocks);
700
186
  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, num_tiles, prepare, process_tile,
701
186
                                "AQ DiffPrecompute"));
702
703
186
  JXL_RETURN_IF_ERROR(Blur1x1Masking(memory_manager, pool, mask1x1, rect));
704
186
  return std::move(impl).aq_map;
705
186
}
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, float, jxl::ThreadPool*, jxl::Plane<float>*, jxl::Plane<float>*)
Line
Count
Source
667
186
                                         ImageF* mask, ImageF* mask1x1) {
668
186
  JXL_ENSURE(rect.xsize() % kBlockDim == 0);
669
186
  JXL_ENSURE(rect.ysize() % kBlockDim == 0);
670
186
  AdaptiveQuantizationImpl impl;
671
186
  const size_t xsize_blocks = rect.xsize() / kBlockDim;
672
186
  const size_t ysize_blocks = rect.ysize() / kBlockDim;
673
186
  JxlMemoryManager* memory_manager = xyb.memory_manager();
674
186
  JXL_ASSIGN_OR_RETURN(
675
186
      impl.aq_map, ImageF::Create(memory_manager, xsize_blocks, ysize_blocks));
676
186
  JXL_ASSIGN_OR_RETURN(
677
186
      *mask, ImageF::Create(memory_manager, xsize_blocks, ysize_blocks));
678
186
  JXL_ASSIGN_OR_RETURN(
679
186
      *mask1x1, ImageF::Create(memory_manager, xyb.xsize(), xyb.ysize()));
680
186
  const auto prepare = [&](const size_t num_threads) -> Status {
681
186
    JXL_RETURN_IF_ERROR(impl.PrepareBuffers(memory_manager, num_threads));
682
186
    return true;
683
186
  };
684
186
  const auto process_tile = [&](const uint32_t tid,
685
186
                                const size_t thread) -> Status {
686
186
    size_t n_enc_tiles = DivCeil(xsize_blocks, kEncTileDimInBlocks);
687
186
    size_t tx = tid % n_enc_tiles;
688
186
    size_t ty = tid / n_enc_tiles;
689
186
    size_t by0 = ty * kEncTileDimInBlocks;
690
186
    size_t by1 = std::min((ty + 1) * kEncTileDimInBlocks, ysize_blocks);
691
186
    size_t bx0 = tx * kEncTileDimInBlocks;
692
186
    size_t bx1 = std::min((tx + 1) * kEncTileDimInBlocks, xsize_blocks);
693
186
    Rect rect_out(bx0, by0, bx1 - bx0, by1 - by0);
694
186
    JXL_RETURN_IF_ERROR(impl.ComputeTile(butteraugli_target, scale, xyb, rect,
695
186
                                         rect_out, thread, mask, mask1x1));
696
186
    return true;
697
186
  };
698
186
  size_t num_tiles = DivCeil(xsize_blocks, kEncTileDimInBlocks) *
699
186
                     DivCeil(ysize_blocks, kEncTileDimInBlocks);
700
186
  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, num_tiles, prepare, process_tile,
701
186
                                "AQ DiffPrecompute"));
702
703
186
  JXL_RETURN_IF_ERROR(Blur1x1Masking(memory_manager, pool, mask1x1, rect));
704
186
  return std::move(impl).aq_map;
705
186
}
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, float, jxl::ThreadPool*, jxl::Plane<float>*, jxl::Plane<float>*)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, float, jxl::ThreadPool*, jxl::Plane<float>*, jxl::Plane<float>*)
706
707
}  // namespace
708
709
// NOLINTNEXTLINE(google-readability-namespace-comments)
710
}  // namespace HWY_NAMESPACE
711
}  // namespace jxl
712
HWY_AFTER_NAMESPACE();
713
714
#if HWY_ONCE
715
namespace jxl {
716
HWY_EXPORT(AdaptiveQuantizationMap);
717
718
namespace {
719
720
// If true, prints the quantization maps at each iteration.
721
constexpr bool FLAGS_dump_quant_state = false;
722
723
Status DumpHeatmap(const CompressParams& cparams, const AuxOut* aux_out,
724
                   const std::string& label, const ImageF& image,
725
0
                   float good_threshold, float bad_threshold) {
726
0
  if (JXL_DEBUG_ADAPTIVE_QUANTIZATION) {
727
0
    JXL_ASSIGN_OR_RETURN(
728
0
        Image3F heatmap,
729
0
        CreateHeatMapImage(image, good_threshold, bad_threshold));
730
0
    char filename[200];
731
0
    snprintf(filename, sizeof(filename), "%s%05d", label.c_str(),
732
0
             aux_out->num_butteraugli_iters);
733
0
    JXL_RETURN_IF_ERROR(DumpImage(cparams, filename, heatmap));
734
0
  }
735
0
  return true;
736
0
}
737
738
Status DumpHeatmaps(const CompressParams& cparams, const AuxOut* aux_out,
739
                    float butteraugli_target, const ImageF& quant_field,
740
0
                    const ImageF& tile_heatmap, const ImageF& bt_diffmap) {
741
0
  if (JXL_DEBUG_ADAPTIVE_QUANTIZATION) {
742
0
    JxlMemoryManager* memory_manager = quant_field.memory_manager();
743
0
    if (!WantDebugOutput(cparams)) return true;
744
0
    JXL_ASSIGN_OR_RETURN(ImageF inv_qmap,
745
0
                         ImageF::Create(memory_manager, quant_field.xsize(),
746
0
                                        quant_field.ysize()));
747
0
    for (size_t y = 0; y < quant_field.ysize(); ++y) {
748
0
      const float* JXL_RESTRICT row_q = quant_field.ConstRow(y);
749
0
      float* JXL_RESTRICT row_inv_q = inv_qmap.Row(y);
750
0
      for (size_t x = 0; x < quant_field.xsize(); ++x) {
751
0
        row_inv_q[x] = 1.0f / row_q[x];  // never zero
752
0
      }
753
0
    }
754
0
    JXL_RETURN_IF_ERROR(DumpHeatmap(cparams, aux_out, "quant_heatmap", inv_qmap,
755
0
                                    4.0f * butteraugli_target,
756
0
                                    6.0f * butteraugli_target));
757
0
    JXL_RETURN_IF_ERROR(DumpHeatmap(cparams, aux_out, "tile_heatmap",
758
0
                                    tile_heatmap, butteraugli_target,
759
0
                                    1.5f * butteraugli_target));
760
0
    // matches heat maps produced by the command line tool.
761
0
    JXL_RETURN_IF_ERROR(DumpHeatmap(cparams, aux_out, "bt_diffmap", bt_diffmap,
762
0
                                    ButteraugliFuzzyInverse(1.5),
763
0
                                    ButteraugliFuzzyInverse(0.5)));
764
0
  }
765
0
  return true;
766
0
}
767
768
StatusOr<ImageF> TileDistMap(const ImageF& distmap, int tile_size, int margin,
769
0
                             const AcStrategyImage& ac_strategy) {
770
0
  const int tile_xsize = (distmap.xsize() + tile_size - 1) / tile_size;
771
0
  const int tile_ysize = (distmap.ysize() + tile_size - 1) / tile_size;
772
0
  JxlMemoryManager* memory_manager = distmap.memory_manager();
773
0
  JXL_ASSIGN_OR_RETURN(ImageF tile_distmap,
774
0
                       ImageF::Create(memory_manager, tile_xsize, tile_ysize));
775
0
  size_t distmap_stride = tile_distmap.PixelsPerRow();
776
0
  for (int tile_y = 0; tile_y < tile_ysize; ++tile_y) {
777
0
    AcStrategyRow ac_strategy_row = ac_strategy.ConstRow(tile_y);
778
0
    float* JXL_RESTRICT dist_row = tile_distmap.Row(tile_y);
779
0
    for (int tile_x = 0; tile_x < tile_xsize; ++tile_x) {
780
0
      AcStrategy acs = ac_strategy_row[tile_x];
781
0
      if (!acs.IsFirstBlock()) continue;
782
0
      int this_tile_xsize = acs.covered_blocks_x() * tile_size;
783
0
      int this_tile_ysize = acs.covered_blocks_y() * tile_size;
784
0
      int y_begin = std::max<int>(0, tile_size * tile_y - margin);
785
0
      int y_end = std::min<int>(distmap.ysize(),
786
0
                                tile_size * tile_y + this_tile_ysize + margin);
787
0
      int x_begin = std::max<int>(0, tile_size * tile_x - margin);
788
0
      int x_end = std::min<int>(distmap.xsize(),
789
0
                                tile_size * tile_x + this_tile_xsize + margin);
790
0
      float dist_norm = 0.0;
791
0
      double pixels = 0;
792
0
      for (int y = y_begin; y < y_end; ++y) {
793
0
        float ymul = 1.0;
794
0
        constexpr float kBorderMul = 0.98f;
795
0
        constexpr float kCornerMul = 0.7f;
796
0
        if (margin != 0 && (y == y_begin || y == y_end - 1)) {
797
0
          ymul = kBorderMul;
798
0
        }
799
0
        const float* const JXL_RESTRICT row = distmap.Row(y);
800
0
        for (int x = x_begin; x < x_end; ++x) {
801
0
          float xmul = ymul;
802
0
          if (margin != 0 && (x == x_begin || x == x_end - 1)) {
803
0
            if (xmul == 1.0) {
804
0
              xmul = kBorderMul;
805
0
            } else {
806
0
              xmul = kCornerMul;
807
0
            }
808
0
          }
809
0
          float v = row[x];
810
0
          v *= v;
811
0
          v *= v;
812
0
          v *= v;
813
0
          v *= v;
814
0
          dist_norm += xmul * v;
815
0
          pixels += xmul;
816
0
        }
817
0
      }
818
0
      if (pixels == 0) pixels = 1;
819
      // 16th norm is less than the max norm, we reduce the difference
820
      // with this normalization factor.
821
0
      constexpr float kTileNorm = 1.2f;
822
0
      const float tile_dist =
823
0
          kTileNorm * std::pow(dist_norm / pixels, 1.0f / 16.0f);
824
0
      dist_row[tile_x] = tile_dist;
825
0
      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
826
0
        for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
827
0
          dist_row[tile_x + distmap_stride * iy + ix] = tile_dist;
828
0
        }
829
0
      }
830
0
    }
831
0
  }
832
0
  return tile_distmap;
833
0
}
834
835
const float kDcQuantPow = 0.83f;
836
const float kDcQuant = 1.095924047623553f;
837
const float kAcQuant = 0.765f;
838
839
// Computes the decoded image for a given set of compression parameters.
840
StatusOr<ImageBundle> RoundtripImage(const FrameHeader& frame_header,
841
                                     const Image3F& opsin,
842
                                     PassesEncoderState* enc_state,
843
                                     const JxlCmsInterface& cms,
844
0
                                     ThreadPool* pool) {
845
0
  JxlMemoryManager* memory_manager = enc_state->memory_manager();
846
0
  std::unique_ptr<PassesDecoderState> dec_state =
847
0
      jxl::make_unique<PassesDecoderState>(memory_manager);
848
0
  JXL_RETURN_IF_ERROR(dec_state->output_encoding_info.SetFromMetadata(
849
0
      *enc_state->shared.metadata));
850
0
  dec_state->shared = &enc_state->shared;
851
0
  JXL_ENSURE(opsin.ysize() % kBlockDim == 0);
852
853
0
  const size_t xsize_groups = DivCeil(opsin.xsize(), kGroupDim);
854
0
  const size_t ysize_groups = DivCeil(opsin.ysize(), kGroupDim);
855
0
  const size_t num_groups = xsize_groups * ysize_groups;
856
857
0
  size_t num_special_frames = enc_state->special_frames.size();
858
0
  size_t num_passes = enc_state->progressive_splitter.GetNumPasses();
859
0
  JXL_ASSIGN_OR_RETURN(auto modular_frame_encoder,
860
0
                       ModularFrameEncoder::Create(memory_manager, frame_header,
861
0
                                                   enc_state->cparams, false));
862
0
  JXL_RETURN_IF_ERROR(
863
0
      InitializePassesEncoder(frame_header, opsin, Rect(opsin), cms, pool,
864
0
                              enc_state, modular_frame_encoder.get(), nullptr));
865
0
  JXL_RETURN_IF_ERROR(dec_state->Init(frame_header));
866
0
  JXL_RETURN_IF_ERROR(dec_state->InitForAC(num_passes, pool));
867
868
0
  ImageBundle decoded(memory_manager, &enc_state->shared.metadata->m);
869
0
  decoded.origin = frame_header.frame_origin;
870
0
  JXL_ASSIGN_OR_RETURN(
871
0
      Image3F tmp,
872
0
      Image3F::Create(memory_manager, opsin.xsize(), opsin.ysize()));
873
0
  JXL_RETURN_IF_ERROR(decoded.SetFromImage(
874
0
      std::move(tmp), dec_state->output_encoding_info.color_encoding));
875
876
0
  PassesDecoderState::PipelineOptions options;
877
0
  options.use_slow_render_pipeline = false;
878
0
  options.coalescing = false;
879
0
  options.render_spotcolors = false;
880
0
  options.render_noise = false;
881
882
  // Same as frame_header.nonserialized_metadata->m
883
0
  const ImageMetadata& metadata = *decoded.metadata();
884
885
0
  JXL_RETURN_IF_ERROR(dec_state->PreparePipeline(
886
0
      frame_header, &enc_state->shared.metadata->m, &decoded, options));
887
888
0
  AlignedArray<GroupDecCache> group_dec_caches;
889
0
  const auto allocate_storage = [&](const size_t num_threads) -> Status {
890
0
    JXL_RETURN_IF_ERROR(
891
0
        dec_state->render_pipeline->PrepareForThreads(num_threads,
892
0
                                                      /*use_group_ids=*/false));
893
0
    JXL_ASSIGN_OR_RETURN(group_dec_caches, AlignedArray<GroupDecCache>::Create(
894
0
                                               memory_manager, num_threads));
895
0
    return true;
896
0
  };
897
0
  const auto process_group = [&](const uint32_t group_index,
898
0
                                 const size_t thread) -> Status {
899
0
    if (frame_header.loop_filter.epf_iters > 0) {
900
0
      JXL_RETURN_IF_ERROR(
901
0
          ComputeSigma(frame_header.loop_filter,
902
0
                       dec_state->shared->frame_dim.BlockGroupRect(group_index),
903
0
                       dec_state.get()));
904
0
    }
905
0
    RenderPipelineInput input =
906
0
        dec_state->render_pipeline->GetInputBuffers(group_index, thread);
907
0
    JXL_RETURN_IF_ERROR(DecodeGroupForRoundtrip(
908
0
        frame_header, enc_state->coeffs, group_index, dec_state.get(),
909
0
        &group_dec_caches[thread], thread, input, nullptr, nullptr));
910
0
    for (size_t c = 0; c < metadata.num_extra_channels; c++) {
911
0
      std::pair<ImageF*, Rect> ri = input.GetBuffer(3 + c);
912
0
      FillPlane(0.0f, ri.first, ri.second);
913
0
    }
914
0
    JXL_RETURN_IF_ERROR(input.Done());
915
0
    return true;
916
0
  };
917
0
  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, num_groups, allocate_storage,
918
0
                                process_group, "AQ loop"));
919
920
  // Ensure we don't create any new special frames.
921
0
  enc_state->special_frames.resize(num_special_frames);
922
923
0
  return decoded;
924
0
}
925
926
constexpr int kDefaultButteraugliIters = 2;
927
constexpr int kMaxButteraugliIters = 4;
928
929
Status FindBestQuantization(const FrameHeader& frame_header,
930
                            const Image3F& linear, const Image3F& opsin,
931
                            ImageF& quant_field, PassesEncoderState* enc_state,
932
                            const JxlCmsInterface& cms, ThreadPool* pool,
933
0
                            AuxOut* aux_out) {
934
0
  const CompressParams& cparams = enc_state->cparams;
935
0
  if (cparams.resampling > 1 &&
936
0
      cparams.original_butteraugli_distance <= 4.0 * cparams.resampling) {
937
    // For downsampled opsin image, the butteraugli based adaptive quantization
938
    // loop would only make the size bigger without improving the distance much,
939
    // so in this case we enable it only for very high butteraugli targets.
940
0
    return true;
941
0
  }
942
0
  JxlMemoryManager* memory_manager = enc_state->memory_manager();
943
0
  Quantizer& quantizer = enc_state->shared.quantizer;
944
0
  ImageI& raw_quant_field = enc_state->shared.raw_quant_field;
945
946
0
  const float butteraugli_target = cparams.butteraugli_distance;
947
0
  const float original_butteraugli = cparams.original_butteraugli_distance;
948
0
  ButteraugliParams params;
949
0
  const auto& tf = frame_header.nonserialized_metadata->m.color_encoding.Tf();
950
0
  params.intensity_target =
951
0
      tf.IsPQ() || tf.IsHLG()
952
0
          ? frame_header.nonserialized_metadata->m.IntensityTarget()
953
0
          : 80.f;
954
0
  JxlButteraugliComparator comparator(params, cms);
955
0
  JXL_RETURN_IF_ERROR(comparator.SetLinearReferenceImage(linear));
956
0
  bool lower_is_better =
957
0
      (comparator.GoodQualityScore() < comparator.BadQualityScore());
958
0
  const float initial_quant_dc = InitialQuantDC(butteraugli_target);
959
0
  JXL_RETURN_IF_ERROR(AdjustQuantField(enc_state->shared.ac_strategy,
960
0
                                       Rect(quant_field), original_butteraugli,
961
0
                                       &quant_field));
962
0
  ImageF tile_distmap;
963
0
  JXL_ASSIGN_OR_RETURN(
964
0
      ImageF initial_quant_field,
965
0
      ImageF::Create(memory_manager, quant_field.xsize(), quant_field.ysize()));
966
0
  JXL_RETURN_IF_ERROR(CopyImageTo(quant_field, &initial_quant_field));
967
968
0
  float initial_qf_min;
969
0
  float initial_qf_max;
970
0
  ImageMinMax(initial_quant_field, &initial_qf_min, &initial_qf_max);
971
0
  float initial_qf_ratio = initial_qf_max / initial_qf_min;
972
0
  float qf_max_deviation_low = std::sqrt(250 / initial_qf_ratio);
973
0
  float asymmetry = 2;
974
0
  if (qf_max_deviation_low < asymmetry) asymmetry = qf_max_deviation_low;
975
0
  float qf_lower = initial_qf_min / (asymmetry * qf_max_deviation_low);
976
0
  float qf_higher = initial_qf_max * (qf_max_deviation_low / asymmetry);
977
978
0
  JXL_ENSURE(qf_higher / qf_lower < 253);
979
980
0
  constexpr int kOriginalComparisonRound = 1;
981
0
  int iters = kDefaultButteraugliIters;
982
0
  if (cparams.speed_tier <= SpeedTier::kTortoise) {
983
0
    iters = kMaxButteraugliIters;
984
0
  }
985
0
  for (int i = 0; i < iters + 1; ++i) {
986
0
    if (JXL_DEBUG_ADAPTIVE_QUANTIZATION) {
987
0
      printf("\nQuantization field:\n");
988
0
      for (size_t y = 0; y < quant_field.ysize(); ++y) {
989
0
        for (size_t x = 0; x < quant_field.xsize(); ++x) {
990
0
          printf(" %.5f", quant_field.Row(y)[x]);
991
0
        }
992
0
        printf("\n");
993
0
      }
994
0
    }
995
0
    JXL_RETURN_IF_ERROR(quantizer.SetQuantField(initial_quant_dc, quant_field,
996
0
                                                &raw_quant_field));
997
0
    JXL_ASSIGN_OR_RETURN(
998
0
        ImageBundle dec_linear,
999
0
        RoundtripImage(frame_header, opsin, enc_state, cms, pool));
1000
0
    float score;
1001
0
    ImageF diffmap;
1002
0
    JXL_RETURN_IF_ERROR(comparator.CompareWith(dec_linear, &diffmap, &score));
1003
0
    if (!lower_is_better) {
1004
0
      score = -score;
1005
0
      ScaleImage(-1.0f, &diffmap);
1006
0
    }
1007
0
    JXL_ASSIGN_OR_RETURN(tile_distmap,
1008
0
                         TileDistMap(diffmap, 8 * cparams.resampling, 0,
1009
0
                                     enc_state->shared.ac_strategy));
1010
0
    if (JXL_DEBUG_ADAPTIVE_QUANTIZATION && WantDebugOutput(cparams)) {
1011
0
      JXL_RETURN_IF_ERROR(DumpImage(cparams, ("dec" + ToString(i)).c_str(),
1012
0
                                    *dec_linear.color()));
1013
0
      JXL_RETURN_IF_ERROR(DumpHeatmaps(cparams, aux_out, butteraugli_target,
1014
0
                                       quant_field, tile_distmap, diffmap));
1015
0
    }
1016
0
    if (aux_out != nullptr) ++aux_out->num_butteraugli_iters;
1017
0
    if (JXL_DEBUG_ADAPTIVE_QUANTIZATION) {
1018
0
      float minval;
1019
0
      float maxval;
1020
0
      ImageMinMax(quant_field, &minval, &maxval);
1021
0
      printf("\nButteraugli iter: %d/%d\n", i, iters);
1022
0
      printf("Butteraugli distance: %f  (target = %f)\n", score,
1023
0
             original_butteraugli);
1024
0
      printf("quant range: %f ... %f  DC quant: %f\n", minval, maxval,
1025
0
             initial_quant_dc);
1026
0
      if (FLAGS_dump_quant_state) {
1027
0
        quantizer.DumpQuantizationMap(raw_quant_field);
1028
0
      }
1029
0
    }
1030
1031
0
    if (i == iters) break;
1032
1033
0
    double kPow[8] = {
1034
0
        0.2, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
1035
0
    };
1036
0
    double kPowMod[8] = {
1037
0
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
1038
0
    };
1039
0
    if (i == kOriginalComparisonRound) {
1040
      // Don't allow optimization to make the quant field a lot worse than
1041
      // what the initial guess was. This allows the AC field to have enough
1042
      // precision to reduce the oscillations due to the dc reconstruction.
1043
0
      double kInitMul = 0.6;
1044
0
      const double kOneMinusInitMul = 1.0 - kInitMul;
1045
0
      for (size_t y = 0; y < quant_field.ysize(); ++y) {
1046
0
        float* const JXL_RESTRICT row_q = quant_field.Row(y);
1047
0
        const float* const JXL_RESTRICT row_init = initial_quant_field.Row(y);
1048
0
        for (size_t x = 0; x < quant_field.xsize(); ++x) {
1049
0
          double clamp = kOneMinusInitMul * row_q[x] + kInitMul * row_init[x];
1050
0
          if (row_q[x] < clamp) {
1051
0
            row_q[x] = clamp;
1052
0
            if (row_q[x] > qf_higher) row_q[x] = qf_higher;
1053
0
            if (row_q[x] < qf_lower) row_q[x] = qf_lower;
1054
0
          }
1055
0
        }
1056
0
      }
1057
0
    }
1058
1059
0
    double cur_pow = 0.0;
1060
0
    if (i < 7) {
1061
0
      cur_pow = kPow[i] + (original_butteraugli - 1.0) * kPowMod[i];
1062
0
      if (cur_pow < 0) {
1063
0
        cur_pow = 0;
1064
0
      }
1065
0
    }
1066
0
    if (cur_pow == 0.0) {
1067
0
      for (size_t y = 0; y < quant_field.ysize(); ++y) {
1068
0
        const float* const JXL_RESTRICT row_dist = tile_distmap.Row(y);
1069
0
        float* const JXL_RESTRICT row_q = quant_field.Row(y);
1070
0
        for (size_t x = 0; x < quant_field.xsize(); ++x) {
1071
0
          const float diff = row_dist[x] / original_butteraugli;
1072
0
          if (diff > 1.0f) {
1073
0
            float old = row_q[x];
1074
0
            row_q[x] *= diff;
1075
0
            int qf_old =
1076
0
                static_cast<int>(std::lround(old * quantizer.InvGlobalScale()));
1077
0
            int qf_new = static_cast<int>(
1078
0
                std::lround(row_q[x] * quantizer.InvGlobalScale()));
1079
0
            if (qf_old == qf_new) {
1080
0
              row_q[x] = old + quantizer.Scale();
1081
0
            }
1082
0
          }
1083
0
          if (row_q[x] > qf_higher) row_q[x] = qf_higher;
1084
0
          if (row_q[x] < qf_lower) row_q[x] = qf_lower;
1085
0
        }
1086
0
      }
1087
0
    } else {
1088
0
      for (size_t y = 0; y < quant_field.ysize(); ++y) {
1089
0
        const float* const JXL_RESTRICT row_dist = tile_distmap.Row(y);
1090
0
        float* const JXL_RESTRICT row_q = quant_field.Row(y);
1091
0
        for (size_t x = 0; x < quant_field.xsize(); ++x) {
1092
0
          const float diff = row_dist[x] / original_butteraugli;
1093
0
          if (diff <= 1.0f) {
1094
0
            row_q[x] *= std::pow(diff, cur_pow);
1095
0
          } else {
1096
0
            float old = row_q[x];
1097
0
            row_q[x] *= diff;
1098
0
            int qf_old =
1099
0
                static_cast<int>(std::lround(old * quantizer.InvGlobalScale()));
1100
0
            int qf_new = static_cast<int>(
1101
0
                std::lround(row_q[x] * quantizer.InvGlobalScale()));
1102
0
            if (qf_old == qf_new) {
1103
0
              row_q[x] = old + quantizer.Scale();
1104
0
            }
1105
0
          }
1106
0
          if (row_q[x] > qf_higher) row_q[x] = qf_higher;
1107
0
          if (row_q[x] < qf_lower) row_q[x] = qf_lower;
1108
0
        }
1109
0
      }
1110
0
    }
1111
0
  }
1112
0
  JXL_RETURN_IF_ERROR(
1113
0
      quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field));
1114
0
  return true;
1115
0
}
1116
1117
Status FindBestQuantizationMaxError(const FrameHeader& frame_header,
1118
                                    const Image3F& opsin, ImageF& quant_field,
1119
                                    PassesEncoderState* enc_state,
1120
                                    const JxlCmsInterface& cms,
1121
0
                                    ThreadPool* pool, AuxOut* aux_out) {
1122
  // TODO(szabadka): Make this work for non-opsin color spaces.
1123
0
  const CompressParams& cparams = enc_state->cparams;
1124
0
  Quantizer& quantizer = enc_state->shared.quantizer;
1125
0
  ImageI& raw_quant_field = enc_state->shared.raw_quant_field;
1126
1127
  // TODO(veluca): better choice of this value.
1128
0
  const float initial_quant_dc =
1129
0
      16 * std::sqrt(0.1f / cparams.butteraugli_distance);
1130
0
  JXL_RETURN_IF_ERROR(
1131
0
      AdjustQuantField(enc_state->shared.ac_strategy, Rect(quant_field),
1132
0
                       cparams.original_butteraugli_distance, &quant_field));
1133
1134
0
  const float inv_max_err[3] = {1.0f / enc_state->cparams.max_error[0],
1135
0
                                1.0f / enc_state->cparams.max_error[1],
1136
0
                                1.0f / enc_state->cparams.max_error[2]};
1137
1138
0
  for (int i = 0; i < kMaxButteraugliIters + 1; ++i) {
1139
0
    JXL_RETURN_IF_ERROR(quantizer.SetQuantField(initial_quant_dc, quant_field,
1140
0
                                                &raw_quant_field));
1141
0
    if (JXL_DEBUG_ADAPTIVE_QUANTIZATION && aux_out) {
1142
0
      JXL_RETURN_IF_ERROR(
1143
0
          DumpXybImage(cparams, ("ops" + ToString(i)).c_str(), opsin));
1144
0
    }
1145
0
    JXL_ASSIGN_OR_RETURN(
1146
0
        ImageBundle decoded,
1147
0
        RoundtripImage(frame_header, opsin, enc_state, cms, pool));
1148
0
    if (JXL_DEBUG_ADAPTIVE_QUANTIZATION && aux_out) {
1149
0
      JXL_RETURN_IF_ERROR(DumpXybImage(cparams, ("dec" + ToString(i)).c_str(),
1150
0
                                       *decoded.color()));
1151
0
    }
1152
0
    for (size_t by = 0; by < enc_state->shared.frame_dim.ysize_blocks; by++) {
1153
0
      AcStrategyRow ac_strategy_row =
1154
0
          enc_state->shared.ac_strategy.ConstRow(by);
1155
0
      for (size_t bx = 0; bx < enc_state->shared.frame_dim.xsize_blocks; bx++) {
1156
0
        AcStrategy acs = ac_strategy_row[bx];
1157
0
        if (!acs.IsFirstBlock()) continue;
1158
0
        float max_error = 0;
1159
0
        for (size_t c = 0; c < 3; c++) {
1160
0
          for (size_t y = by * kBlockDim;
1161
0
               y < (by + acs.covered_blocks_y()) * kBlockDim; y++) {
1162
0
            if (y >= decoded.ysize()) continue;
1163
0
            const float* JXL_RESTRICT in_row = opsin.ConstPlaneRow(c, y);
1164
0
            const float* JXL_RESTRICT dec_row =
1165
0
                decoded.color()->ConstPlaneRow(c, y);
1166
0
            for (size_t x = bx * kBlockDim;
1167
0
                 x < (bx + acs.covered_blocks_x()) * kBlockDim; x++) {
1168
0
              if (x >= decoded.xsize()) continue;
1169
0
              max_error = std::max(
1170
0
                  std::abs(in_row[x] - dec_row[x]) * inv_max_err[c], max_error);
1171
0
            }
1172
0
          }
1173
0
        }
1174
        // Target an error between max_error/2 and max_error.
1175
        // If the error in the varblock is above the target, increase the qf to
1176
        // compensate. If the error is below the target, decrease the qf.
1177
        // However, to avoid an excessive increase of the qf, only do so if the
1178
        // error is less than half the maximum allowed error.
1179
0
        const float qf_mul = (max_error < 0.5f)   ? max_error * 2.0f
1180
0
                             : (max_error > 1.0f) ? max_error
1181
0
                                                  : 1.0f;
1182
0
        for (size_t qy = by; qy < by + acs.covered_blocks_y(); qy++) {
1183
0
          float* JXL_RESTRICT quant_field_row = quant_field.Row(qy);
1184
0
          for (size_t qx = bx; qx < bx + acs.covered_blocks_x(); qx++) {
1185
0
            quant_field_row[qx] *= qf_mul;
1186
0
          }
1187
0
        }
1188
0
      }
1189
0
    }
1190
0
  }
1191
0
  JXL_RETURN_IF_ERROR(
1192
0
      quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field));
1193
0
  return true;
1194
0
}
1195
1196
}  // namespace
1197
1198
Status AdjustQuantField(const AcStrategyImage& ac_strategy, const Rect& rect,
1199
7.22k
                        float butteraugli_target, ImageF* quant_field) {
1200
  // Replace the whole quant_field in non-8x8 blocks with the maximum of each
1201
  // 8x8 block.
1202
7.22k
  size_t stride = quant_field->PixelsPerRow();
1203
1204
  // At low distances it is great to use max, but mean works better
1205
  // at high distances. We interpolate between them for a distance
1206
  // range.
1207
7.22k
  float mean_max_mixer = 1.0f;
1208
7.22k
  {
1209
7.22k
    static const float kLimit = 1.54138f;
1210
7.22k
    static const float kMul = 0.56391f;
1211
7.22k
    static const float kMin = 0.0f;
1212
7.22k
    if (butteraugli_target > kLimit) {
1213
0
      mean_max_mixer -= (butteraugli_target - kLimit) * kMul;
1214
0
      if (mean_max_mixer < kMin) {
1215
0
        mean_max_mixer = kMin;
1216
0
      }
1217
0
    }
1218
7.22k
  }
1219
60.6k
  for (size_t y = 0; y < rect.ysize(); ++y) {
1220
53.4k
    AcStrategyRow ac_strategy_row = ac_strategy.ConstRow(rect, y);
1221
53.4k
    float* JXL_RESTRICT quant_row = rect.Row(quant_field, y);
1222
451k
    for (size_t x = 0; x < rect.xsize(); ++x) {
1223
398k
      AcStrategy acs = ac_strategy_row[x];
1224
398k
      if (!acs.IsFirstBlock()) continue;
1225
241k
      JXL_ENSURE(x + acs.covered_blocks_x() <= quant_field->xsize());
1226
241k
      JXL_ENSURE(y + acs.covered_blocks_y() <= quant_field->ysize());
1227
241k
      float max = quant_row[x];
1228
241k
      float mean = 0.0;
1229
521k
      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
1230
678k
        for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
1231
398k
          mean += quant_row[x + ix + iy * stride];
1232
398k
          max = std::max(quant_row[x + ix + iy * stride], max);
1233
398k
        }
1234
279k
      }
1235
241k
      mean /= acs.covered_blocks_y() * acs.covered_blocks_x();
1236
241k
      if (acs.covered_blocks_y() * acs.covered_blocks_x() >= 4) {
1237
15.1k
        max *= mean_max_mixer;
1238
15.1k
        max += (1.0f - mean_max_mixer) * mean;
1239
15.1k
      }
1240
521k
      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
1241
678k
        for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
1242
398k
          quant_row[x + ix + iy * stride] = max;
1243
398k
        }
1244
279k
      }
1245
241k
    }
1246
53.4k
  }
1247
7.22k
  return true;
1248
7.22k
}
1249
1250
186
float InitialQuantDC(float butteraugli_target) {
1251
186
  const float kDcMul = 0.3;  // Butteraugli target where non-linearity kicks in.
1252
186
  const float butteraugli_target_dc = std::max<float>(
1253
186
      0.5f * butteraugli_target,
1254
186
      std::min<float>(butteraugli_target,
1255
186
                      kDcMul * std::pow((1.0f / kDcMul) * butteraugli_target,
1256
186
                                        kDcQuantPow)));
1257
  // We want the maximum DC value to be at most 2**15 * kInvDCQuant / quant_dc.
1258
  // The maximum DC value might not be in the kXybRange because of inverse
1259
  // gaborish, so we add some slack to the maximum theoretical quant obtained
1260
  // this way (64).
1261
186
  return std::min(kDcQuant / butteraugli_target_dc, 50.f);
1262
186
}
1263
1264
StatusOr<ImageF> InitialQuantField(const float butteraugli_target,
1265
                                   const Image3F& opsin, const Rect& rect,
1266
                                   ThreadPool* pool, float rescale,
1267
186
                                   ImageF* mask, ImageF* mask1x1) {
1268
186
  const float quant_ac = kAcQuant / butteraugli_target;
1269
186
  return HWY_DYNAMIC_DISPATCH(AdaptiveQuantizationMap)(
1270
186
      butteraugli_target, opsin, rect, quant_ac * rescale, pool, mask, mask1x1);
1271
186
}
1272
1273
Status FindBestQuantizer(const FrameHeader& frame_header, const Image3F* linear,
1274
                         const Image3F& opsin, ImageF& quant_field,
1275
                         PassesEncoderState* enc_state,
1276
                         const JxlCmsInterface& cms, ThreadPool* pool,
1277
186
                         AuxOut* aux_out, double rescale) {
1278
186
  const CompressParams& cparams = enc_state->cparams;
1279
186
  if (cparams.max_error_mode) {
1280
0
    JXL_RETURN_IF_ERROR(FindBestQuantizationMaxError(
1281
0
        frame_header, opsin, quant_field, enc_state, cms, pool, aux_out));
1282
186
  } else if (linear && cparams.speed_tier <= SpeedTier::kKitten) {
1283
    // Normal encoding to a butteraugli score.
1284
0
    JXL_RETURN_IF_ERROR(FindBestQuantization(frame_header, *linear, opsin,
1285
0
                                             quant_field, enc_state, cms, pool,
1286
0
                                             aux_out));
1287
0
  }
1288
186
  return true;
1289
186
}
1290
1291
}  // namespace jxl
1292
#endif  // HWY_ONCE