Coverage Report

Created: 2024-05-21 06:24

/src/libjxl/lib/jxl/enc_adaptive_quantization.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/enc_adaptive_quantization.h"
7
8
#include <jxl/memory_manager.h>
9
10
#include <algorithm>
11
#include <atomic>
12
#include <cmath>
13
#include <cstddef>
14
#include <cstdlib>
15
#include <string>
16
#include <vector>
17
18
#undef HWY_TARGET_INCLUDE
19
#define HWY_TARGET_INCLUDE "lib/jxl/enc_adaptive_quantization.cc"
20
#include <hwy/foreach_target.h>
21
#include <hwy/highway.h>
22
23
#include "lib/jxl/ac_strategy.h"
24
#include "lib/jxl/base/common.h"
25
#include "lib/jxl/base/compiler_specific.h"
26
#include "lib/jxl/base/data_parallel.h"
27
#include "lib/jxl/base/fast_math-inl.h"
28
#include "lib/jxl/base/rect.h"
29
#include "lib/jxl/base/status.h"
30
#include "lib/jxl/butteraugli/butteraugli.h"
31
#include "lib/jxl/cms/opsin_params.h"
32
#include "lib/jxl/convolve.h"
33
#include "lib/jxl/dec_cache.h"
34
#include "lib/jxl/dec_group.h"
35
#include "lib/jxl/enc_aux_out.h"
36
#include "lib/jxl/enc_butteraugli_comparator.h"
37
#include "lib/jxl/enc_cache.h"
38
#include "lib/jxl/enc_debug_image.h"
39
#include "lib/jxl/enc_group.h"
40
#include "lib/jxl/enc_modular.h"
41
#include "lib/jxl/enc_params.h"
42
#include "lib/jxl/enc_transforms-inl.h"
43
#include "lib/jxl/epf.h"
44
#include "lib/jxl/frame_dimensions.h"
45
#include "lib/jxl/image.h"
46
#include "lib/jxl/image_bundle.h"
47
#include "lib/jxl/image_ops.h"
48
#include "lib/jxl/quant_weights.h"
49
50
// Set JXL_DEBUG_ADAPTIVE_QUANTIZATION to 1 to enable debugging.
51
#ifndef JXL_DEBUG_ADAPTIVE_QUANTIZATION
52
0
#define JXL_DEBUG_ADAPTIVE_QUANTIZATION 0
53
#endif
54
55
HWY_BEFORE_NAMESPACE();
56
namespace jxl {
57
namespace HWY_NAMESPACE {
58
namespace {
59
60
// These templates are not found via ADL.
61
using hwy::HWY_NAMESPACE::AbsDiff;
62
using hwy::HWY_NAMESPACE::Add;
63
using hwy::HWY_NAMESPACE::And;
64
using hwy::HWY_NAMESPACE::Max;
65
using hwy::HWY_NAMESPACE::Rebind;
66
using hwy::HWY_NAMESPACE::Sqrt;
67
using hwy::HWY_NAMESPACE::ZeroIfNegative;
68
69
// The following functions modulate an exponent (out_val) and return the updated
70
// value. Their descriptor is limited to 8 lanes for 8x8 blocks.
71
72
// Hack for mask estimation. Eventually replace this code with butteraugli's
73
// masking.
74
52
float ComputeMaskForAcStrategyUse(const float out_val) {
75
52
  const float kMul = 1.0f;
76
52
  const float kOffset = 0.001f;
77
52
  return kMul / (out_val + kOffset);
78
52
}
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::ComputeMaskForAcStrategyUse(float)
Line
Count
Source
74
52
float ComputeMaskForAcStrategyUse(const float out_val) {
75
52
  const float kMul = 1.0f;
76
52
  const float kOffset = 0.001f;
77
52
  return kMul / (out_val + kOffset);
78
52
}
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::ComputeMaskForAcStrategyUse(float)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::ComputeMaskForAcStrategyUse(float)
79
80
template <class D, class V>
81
52
V ComputeMask(const D d, const V out_val) {
82
52
  const auto kBase = Set(d, -0.7647f);
83
52
  const auto kMul4 = Set(d, 9.4708735624378946f);
84
52
  const auto kMul2 = Set(d, 17.35036561631863f);
85
52
  const auto kOffset2 = Set(d, 302.59587815579727f);
86
52
  const auto kMul3 = Set(d, 6.7943250517376494f);
87
52
  const auto kOffset3 = Set(d, 3.7179635626140772f);
88
52
  const auto kOffset4 = Mul(Set(d, 0.25f), kOffset3);
89
52
  const auto kMul0 = Set(d, 0.80061762862741759f);
90
52
  const auto k1 = Set(d, 1.0f);
91
92
  // Avoid division by zero.
93
52
  const auto v1 = Max(Mul(out_val, kMul0), Set(d, 1e-3f));
94
52
  const auto v2 = Div(k1, Add(v1, kOffset2));
95
52
  const auto v3 = Div(k1, MulAdd(v1, v1, kOffset3));
96
52
  const auto v4 = Div(k1, MulAdd(v1, v1, kOffset4));
97
  // TODO(jyrki):
98
  // A log or two here could make sense. In butteraugli we have effectively
99
  // log(log(x + C)) for this kind of use, as a single log is used in
100
  // saturating visual masking and here the modulation values are exponential,
101
  // another log would counter that.
102
52
  return Add(kBase, MulAdd(kMul4, v4, MulAdd(kMul2, v2, Mul(kMul3, v3))));
103
52
}
enc_adaptive_quantization.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::ComputeMask<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>)
Line
Count
Source
81
52
V ComputeMask(const D d, const V out_val) {
82
52
  const auto kBase = Set(d, -0.7647f);
83
52
  const auto kMul4 = Set(d, 9.4708735624378946f);
84
52
  const auto kMul2 = Set(d, 17.35036561631863f);
85
52
  const auto kOffset2 = Set(d, 302.59587815579727f);
86
52
  const auto kMul3 = Set(d, 6.7943250517376494f);
87
52
  const auto kOffset3 = Set(d, 3.7179635626140772f);
88
52
  const auto kOffset4 = Mul(Set(d, 0.25f), kOffset3);
89
52
  const auto kMul0 = Set(d, 0.80061762862741759f);
90
52
  const auto k1 = Set(d, 1.0f);
91
92
  // Avoid division by zero.
93
52
  const auto v1 = Max(Mul(out_val, kMul0), Set(d, 1e-3f));
94
52
  const auto v2 = Div(k1, Add(v1, kOffset2));
95
52
  const auto v3 = Div(k1, MulAdd(v1, v1, kOffset3));
96
52
  const auto v4 = Div(k1, MulAdd(v1, v1, kOffset4));
97
  // TODO(jyrki):
98
  // A log or two here could make sense. In butteraugli we have effectively
99
  // log(log(x + C)) for this kind of use, as a single log is used in
100
  // saturating visual masking and here the modulation values are exponential,
101
  // another log would counter that.
102
52
  return Add(kBase, MulAdd(kMul4, v4, MulAdd(kMul2, v2, Mul(kMul3, v3))));
103
52
}
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::ComputeMask<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::(anonymous namespace)::ComputeMask<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>)
104
105
// mul and mul2 represent a scaling difference between jxl and butteraugli.
106
const float kSGmul = 226.77216153508914f;
107
const float kSGmul2 = 1.0f / 73.377132366608819f;
108
const float kLog2 = 0.693147181f;
109
// Includes correction factor for std::log -> log2.
110
const float kSGRetMul = kSGmul2 * 18.6580932135f * kLog2;
111
const float kSGVOffset = 7.7825991679894591f;
112
113
template <bool invert, typename D, typename V>
114
7.48k
V RatioOfDerivativesOfCubicRootToSimpleGamma(const D d, V v) {
115
  // The opsin space in jxl is the cubic root of photons, i.e., v * v * v
116
  // is related to the number of photons.
117
  //
118
  // SimpleGamma(v * v * v) is the psychovisual space in butteraugli.
119
  // This ratio allows quantization to move from jxl's opsin space to
120
  // butteraugli's log-gamma space.
121
7.48k
  float kEpsilon = 1e-2;
122
7.48k
  v = ZeroIfNegative(v);
123
7.48k
  const auto kNumMul = Set(d, kSGRetMul * 3 * kSGmul);
124
7.48k
  const auto kVOffset = Set(d, kSGVOffset * kLog2 + kEpsilon);
125
7.48k
  const auto kDenMul = Set(d, kLog2 * kSGmul);
126
127
7.48k
  const auto v2 = Mul(v, v);
128
129
7.48k
  const auto num = MulAdd(kNumMul, v2, Set(d, kEpsilon));
130
7.48k
  const auto den = MulAdd(Mul(kDenMul, v), v2, kVOffset);
131
7.48k
  return invert ? Div(num, den) : Div(den, num);
132
7.48k
}
enc_adaptive_quantization.cc:hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false, hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul>)
Line
Count
Source
114
6.65k
V RatioOfDerivativesOfCubicRootToSimpleGamma(const D d, V v) {
115
  // The opsin space in jxl is the cubic root of photons, i.e., v * v * v
116
  // is related to the number of photons.
117
  //
118
  // SimpleGamma(v * v * v) is the psychovisual space in butteraugli.
119
  // This ratio allows quantization to move from jxl's opsin space to
120
  // butteraugli's log-gamma space.
121
6.65k
  float kEpsilon = 1e-2;
122
6.65k
  v = ZeroIfNegative(v);
123
6.65k
  const auto kNumMul = Set(d, kSGRetMul * 3 * kSGmul);
124
6.65k
  const auto kVOffset = Set(d, kSGVOffset * kLog2 + kEpsilon);
125
6.65k
  const auto kDenMul = Set(d, kLog2 * kSGmul);
126
127
6.65k
  const auto v2 = Mul(v, v);
128
129
6.65k
  const auto num = MulAdd(kNumMul, v2, Set(d, kEpsilon));
130
6.65k
  const auto den = MulAdd(Mul(kDenMul, v), v2, kVOffset);
131
6.65k
  return invert ? Div(num, den) : Div(den, num);
132
6.65k
}
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false, hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>)
enc_adaptive_quantization.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<true, hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>)
Line
Count
Source
114
832
V RatioOfDerivativesOfCubicRootToSimpleGamma(const D d, V v) {
115
  // The opsin space in jxl is the cubic root of photons, i.e., v * v * v
116
  // is related to the number of photons.
117
  //
118
  // SimpleGamma(v * v * v) is the psychovisual space in butteraugli.
119
  // This ratio allows quantization to move from jxl's opsin space to
120
  // butteraugli's log-gamma space.
121
832
  float kEpsilon = 1e-2;
122
832
  v = ZeroIfNegative(v);
123
832
  const auto kNumMul = Set(d, kSGRetMul * 3 * kSGmul);
124
832
  const auto kVOffset = Set(d, kSGVOffset * kLog2 + kEpsilon);
125
832
  const auto kDenMul = Set(d, kLog2 * kSGmul);
126
127
832
  const auto v2 = Mul(v, v);
128
129
832
  const auto num = MulAdd(kNumMul, v2, Set(d, kEpsilon));
130
832
  const auto den = MulAdd(Mul(kDenMul, v), v2, kVOffset);
131
832
  return invert ? Div(num, den) : Div(den, num);
132
832
}
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false, hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false, hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<true, hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 1ul> jxl::N_SSE2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false, hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul> >(hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false, hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<true, hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>)
133
134
template <bool invert = false>
135
6.65k
float RatioOfDerivativesOfCubicRootToSimpleGamma(float v) {
136
6.65k
  using DScalar = HWY_CAPPED(float, 1);
137
6.65k
  auto vscalar = Load(DScalar(), &v);
138
6.65k
  return GetLane(
139
6.65k
      RatioOfDerivativesOfCubicRootToSimpleGamma<invert>(DScalar(), vscalar));
140
6.65k
}
enc_adaptive_quantization.cc:float jxl::N_AVX2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false>(float)
Line
Count
Source
135
6.65k
float RatioOfDerivativesOfCubicRootToSimpleGamma(float v) {
136
6.65k
  using DScalar = HWY_CAPPED(float, 1);
137
6.65k
  auto vscalar = Load(DScalar(), &v);
138
6.65k
  return GetLane(
139
6.65k
      RatioOfDerivativesOfCubicRootToSimpleGamma<invert>(DScalar(), vscalar));
140
6.65k
}
Unexecuted instantiation: enc_adaptive_quantization.cc:float jxl::N_SSE4::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false>(float)
Unexecuted instantiation: enc_adaptive_quantization.cc:float jxl::N_SSE2::(anonymous namespace)::RatioOfDerivativesOfCubicRootToSimpleGamma<false>(float)
141
142
// TODO(veluca): this function computes an approximation of the derivative of
143
// SimpleGamma with (f(x+eps)-f(x))/eps. Consider two-sided approximation or
144
// exact derivatives. For reference, SimpleGamma was:
145
/*
146
template <typename D, typename V>
147
V SimpleGamma(const D d, V v) {
148
  // A simple HDR compatible gamma function.
149
  const auto mul = Set(d, kSGmul);
150
  const auto kRetMul = Set(d, kSGRetMul);
151
  const auto kRetAdd = Set(d, kSGmul2 * -20.2789020414f);
152
  const auto kVOffset = Set(d, kSGVOffset);
153
154
  v *= mul;
155
156
  // This should happen rarely, but may lead to a NaN, which is rather
157
  // undesirable. Since negative photons don't exist we solve the NaNs by
158
  // clamping here.
159
  // TODO(veluca): with FastLog2f, this no longer leads to NaNs.
160
  v = ZeroIfNegative(v);
161
  return kRetMul * FastLog2f(d, v + kVOffset) + kRetAdd;
162
}
163
*/
164
165
template <class D, class V>
166
V GammaModulation(const D d, const size_t x, const size_t y,
167
                  const ImageF& xyb_x, const ImageF& xyb_y, const Rect& rect,
168
52
                  const V out_val) {
169
52
  const float kBias = 0.16f;
170
52
  JXL_DASSERT(kBias > jxl::cms::kOpsinAbsorbanceBias[0]);
171
52
  JXL_DASSERT(kBias > jxl::cms::kOpsinAbsorbanceBias[1]);
172
52
  JXL_DASSERT(kBias > jxl::cms::kOpsinAbsorbanceBias[2]);
173
52
  auto overall_ratio = Zero(d);
174
52
  auto bias = Set(d, kBias);
175
468
  for (size_t dy = 0; dy < 8; ++dy) {
176
416
    const float* const JXL_RESTRICT row_in_x = rect.ConstRow(xyb_x, y + dy);
177
416
    const float* const JXL_RESTRICT row_in_y = rect.ConstRow(xyb_y, y + dy);
178
832
    for (size_t dx = 0; dx < 8; dx += Lanes(d)) {
179
416
      const auto iny = Add(Load(d, row_in_y + x + dx), bias);
180
416
      const auto inx = Load(d, row_in_x + x + dx);
181
182
416
      const auto r = Sub(iny, inx);
183
416
      const auto ratio_r =
184
416
          RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/true>(d, r);
185
416
      overall_ratio = Add(overall_ratio, ratio_r);
186
187
416
      const auto g = Add(iny, inx);
188
416
      const auto ratio_g =
189
416
          RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/true>(d, g);
190
416
      overall_ratio = Add(overall_ratio, ratio_g);
191
416
    }
192
416
  }
193
52
  overall_ratio = Mul(SumOfLanes(d, overall_ratio), Set(d, 0.5f / 64));
194
  // ideally -1.0, but likely optimal correction adds some entropy, so slightly
195
  // less than that.
196
52
  const auto kGam = Set(d, 0.1005613337192697f);
197
52
  return MulAdd(kGam, FastLog2f(d, overall_ratio), out_val);
198
52
}
enc_adaptive_quantization.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::GammaModulation<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_AVX2::Vec256<float>)
Line
Count
Source
168
52
                  const V out_val) {
169
52
  const float kBias = 0.16f;
170
52
  JXL_DASSERT(kBias > jxl::cms::kOpsinAbsorbanceBias[0]);
171
52
  JXL_DASSERT(kBias > jxl::cms::kOpsinAbsorbanceBias[1]);
172
52
  JXL_DASSERT(kBias > jxl::cms::kOpsinAbsorbanceBias[2]);
173
52
  auto overall_ratio = Zero(d);
174
52
  auto bias = Set(d, kBias);
175
468
  for (size_t dy = 0; dy < 8; ++dy) {
176
416
    const float* const JXL_RESTRICT row_in_x = rect.ConstRow(xyb_x, y + dy);
177
416
    const float* const JXL_RESTRICT row_in_y = rect.ConstRow(xyb_y, y + dy);
178
832
    for (size_t dx = 0; dx < 8; dx += Lanes(d)) {
179
416
      const auto iny = Add(Load(d, row_in_y + x + dx), bias);
180
416
      const auto inx = Load(d, row_in_x + x + dx);
181
182
416
      const auto r = Sub(iny, inx);
183
416
      const auto ratio_r =
184
416
          RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/true>(d, r);
185
416
      overall_ratio = Add(overall_ratio, ratio_r);
186
187
416
      const auto g = Add(iny, inx);
188
416
      const auto ratio_g =
189
416
          RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/true>(d, g);
190
416
      overall_ratio = Add(overall_ratio, ratio_g);
191
416
    }
192
416
  }
193
52
  overall_ratio = Mul(SumOfLanes(d, overall_ratio), Set(d, 0.5f / 64));
194
  // ideally -1.0, but likely optimal correction adds some entropy, so slightly
195
  // less than that.
196
52
  const auto kGam = Set(d, 0.1005613337192697f);
197
52
  return MulAdd(kGam, FastLog2f(d, overall_ratio), out_val);
198
52
}
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::GammaModulation<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::(anonymous namespace)::GammaModulation<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_SSE2::Vec128<float, 4ul>)
199
200
// Change precision in 8x8 blocks that have high frequency content.
201
template <class D, class V>
202
V HfModulation(const D d, const size_t x, const size_t y, const ImageF& xyb,
203
52
               const Rect& rect, const V out_val) {
204
  // Zero out the invalid differences for the rightmost value per row.
205
52
  const Rebind<uint32_t, D> du;
206
52
  HWY_ALIGN constexpr uint32_t kMaskRight[kBlockDim] = {~0u, ~0u, ~0u, ~0u,
207
52
                                                        ~0u, ~0u, ~0u, 0};
208
209
52
  auto sum = Zero(d);  // sum of absolute differences with right and below
210
211
52
  static const float valmin = 0.020602694503245016f;
212
52
  auto valminv = Set(d, valmin);
213
468
  for (size_t dy = 0; dy < 8; ++dy) {
214
416
    const float* JXL_RESTRICT row_in = rect.ConstRow(xyb, y + dy) + x;
215
416
    const float* JXL_RESTRICT row_in_next =
216
416
        dy == 7 ? row_in : rect.ConstRow(xyb, y + dy + 1) + x;
217
218
    // In SCALAR, there is no guarantee of having extra row padding.
219
    // Hence, we need to ensure we don't access pixels outside the row itself.
220
    // In SIMD modes, however, rows are padded, so it's safe to access one
221
    // garbage value after the row. The vector then gets masked with kMaskRight
222
    // to remove the influence of that value.
223
416
#if HWY_TARGET != HWY_SCALAR
224
832
    for (size_t dx = 0; dx < 8; dx += Lanes(d)) {
225
#else
226
    for (size_t dx = 0; dx < 7; dx += Lanes(d)) {
227
#endif
228
416
      const auto p = Load(d, row_in + dx);
229
416
      const auto pr = LoadU(d, row_in + dx + 1);
230
416
      const auto mask = BitCast(d, Load(du, kMaskRight + dx));
231
416
      sum = Add(sum, And(mask, Min(valminv, AbsDiff(p, pr))));
232
233
416
      const auto pd = Load(d, row_in_next + dx);
234
416
      sum = Add(sum, Min(valminv, AbsDiff(p, pd)));
235
416
    }
236
#if HWY_TARGET == HWY_SCALAR
237
    const auto p = Load(d, row_in + 7);
238
    const auto pd = Load(d, row_in_next + 7);
239
    sum = Add(sum, Min(valminv, AbsDiff(p, pd)));
240
#endif
241
416
  }
242
  // more negative value gives more bpp
243
52
  static const float kOffset = -1.110929106987477;
244
52
  static const float kMul = -0.38078920620238305;
245
52
  sum = SumOfLanes(d, sum);
246
52
  float scalar_sum = GetLane(sum);
247
52
  scalar_sum += kOffset;
248
52
  scalar_sum *= kMul;
249
52
  return Add(Set(d, scalar_sum), out_val);
250
52
}
enc_adaptive_quantization.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::HfModulation<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_AVX2::Vec256<float>)
Line
Count
Source
203
52
               const Rect& rect, const V out_val) {
204
  // Zero out the invalid differences for the rightmost value per row.
205
52
  const Rebind<uint32_t, D> du;
206
52
  HWY_ALIGN constexpr uint32_t kMaskRight[kBlockDim] = {~0u, ~0u, ~0u, ~0u,
207
52
                                                        ~0u, ~0u, ~0u, 0};
208
209
52
  auto sum = Zero(d);  // sum of absolute differences with right and below
210
211
52
  static const float valmin = 0.020602694503245016f;
212
52
  auto valminv = Set(d, valmin);
213
468
  for (size_t dy = 0; dy < 8; ++dy) {
214
416
    const float* JXL_RESTRICT row_in = rect.ConstRow(xyb, y + dy) + x;
215
416
    const float* JXL_RESTRICT row_in_next =
216
416
        dy == 7 ? row_in : rect.ConstRow(xyb, y + dy + 1) + x;
217
218
    // In SCALAR, there is no guarantee of having extra row padding.
219
    // Hence, we need to ensure we don't access pixels outside the row itself.
220
    // In SIMD modes, however, rows are padded, so it's safe to access one
221
    // garbage value after the row. The vector then gets masked with kMaskRight
222
    // to remove the influence of that value.
223
416
#if HWY_TARGET != HWY_SCALAR
224
832
    for (size_t dx = 0; dx < 8; dx += Lanes(d)) {
225
#else
226
    for (size_t dx = 0; dx < 7; dx += Lanes(d)) {
227
#endif
228
416
      const auto p = Load(d, row_in + dx);
229
416
      const auto pr = LoadU(d, row_in + dx + 1);
230
416
      const auto mask = BitCast(d, Load(du, kMaskRight + dx));
231
416
      sum = Add(sum, And(mask, Min(valminv, AbsDiff(p, pr))));
232
233
416
      const auto pd = Load(d, row_in_next + dx);
234
416
      sum = Add(sum, Min(valminv, AbsDiff(p, pd)));
235
416
    }
236
#if HWY_TARGET == HWY_SCALAR
237
    const auto p = Load(d, row_in + 7);
238
    const auto pd = Load(d, row_in_next + 7);
239
    sum = Add(sum, Min(valminv, AbsDiff(p, pd)));
240
#endif
241
416
  }
242
  // more negative value gives more bpp
243
52
  static const float kOffset = -1.110929106987477;
244
52
  static const float kMul = -0.38078920620238305;
245
52
  sum = SumOfLanes(d, sum);
246
52
  float scalar_sum = GetLane(sum);
247
52
  scalar_sum += kOffset;
248
52
  scalar_sum *= kMul;
249
52
  return Add(Set(d, scalar_sum), out_val);
250
52
}
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::HfModulation<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::(anonymous namespace)::HfModulation<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, unsigned long, unsigned long, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, hwy::N_SSE2::Vec128<float, 4ul>)
251
252
void PerBlockModulations(const float butteraugli_target, const ImageF& xyb_x,
253
                         const ImageF& xyb_y, const ImageF& xyb_b,
254
                         const Rect& rect_in, const float scale,
255
52
                         const Rect& rect_out, ImageF* out) {
256
52
  float base_level = 0.48f * scale;
257
52
  float kDampenRampStart = 2.0f;
258
52
  float kDampenRampEnd = 14.0f;
259
52
  float dampen = 1.0f;
260
52
  if (butteraugli_target >= kDampenRampStart) {
261
0
    dampen = 1.0f - ((butteraugli_target - kDampenRampStart) /
262
0
                     (kDampenRampEnd - kDampenRampStart));
263
0
    if (dampen < 0) {
264
0
      dampen = 0;
265
0
    }
266
0
  }
267
52
  const float mul = scale * dampen;
268
52
  const float add = (1.0f - dampen) * base_level;
269
104
  for (size_t iy = rect_out.y0(); iy < rect_out.y1(); iy++) {
270
52
    const size_t y = iy * 8;
271
52
    float* const JXL_RESTRICT row_out = out->Row(iy);
272
52
    const HWY_CAPPED(float, kBlockDim) df;
273
104
    for (size_t ix = rect_out.x0(); ix < rect_out.x1(); ix++) {
274
52
      size_t x = ix * 8;
275
52
      auto out_val = Set(df, row_out[ix]);
276
52
      out_val = ComputeMask(df, out_val);
277
52
      out_val = HfModulation(df, x, y, xyb_y, rect_in, out_val);
278
52
      out_val = GammaModulation(df, x, y, xyb_x, xyb_y, rect_in, out_val);
279
      // We want multiplicative quantization field, so everything
280
      // until this point has been modulating the exponent.
281
52
      row_out[ix] = FastPow2f(GetLane(out_val) * 1.442695041f) * mul + add;
282
52
    }
283
52
  }
284
52
}
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::PerBlockModulations(float, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, float, jxl::RectT<unsigned long> const&, jxl::Plane<float>*)
Line
Count
Source
255
52
                         const Rect& rect_out, ImageF* out) {
256
52
  float base_level = 0.48f * scale;
257
52
  float kDampenRampStart = 2.0f;
258
52
  float kDampenRampEnd = 14.0f;
259
52
  float dampen = 1.0f;
260
52
  if (butteraugli_target >= kDampenRampStart) {
261
0
    dampen = 1.0f - ((butteraugli_target - kDampenRampStart) /
262
0
                     (kDampenRampEnd - kDampenRampStart));
263
0
    if (dampen < 0) {
264
0
      dampen = 0;
265
0
    }
266
0
  }
267
52
  const float mul = scale * dampen;
268
52
  const float add = (1.0f - dampen) * base_level;
269
104
  for (size_t iy = rect_out.y0(); iy < rect_out.y1(); iy++) {
270
52
    const size_t y = iy * 8;
271
52
    float* const JXL_RESTRICT row_out = out->Row(iy);
272
52
    const HWY_CAPPED(float, kBlockDim) df;
273
104
    for (size_t ix = rect_out.x0(); ix < rect_out.x1(); ix++) {
274
52
      size_t x = ix * 8;
275
52
      auto out_val = Set(df, row_out[ix]);
276
52
      out_val = ComputeMask(df, out_val);
277
52
      out_val = HfModulation(df, x, y, xyb_y, rect_in, out_val);
278
52
      out_val = GammaModulation(df, x, y, xyb_x, xyb_y, rect_in, out_val);
279
      // We want multiplicative quantization field, so everything
280
      // until this point has been modulating the exponent.
281
52
      row_out[ix] = FastPow2f(GetLane(out_val) * 1.442695041f) * mul + add;
282
52
    }
283
52
  }
284
52
}
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::PerBlockModulations(float, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, float, jxl::RectT<unsigned long> const&, jxl::Plane<float>*)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::PerBlockModulations(float, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, float, jxl::RectT<unsigned long> const&, jxl::Plane<float>*)
285
286
template <typename D, typename V>
287
3.32k
V MaskingSqrt(const D d, V v) {
288
3.32k
  static const float kLogOffset = 27.97044946785558f;
289
3.32k
  static const float kMul = 211.53333281566171f;
290
3.32k
  const auto mul_v = Set(d, kMul * 1e8);
291
3.32k
  const auto offset_v = Set(d, kLogOffset);
292
3.32k
  return Mul(Set(d, 0.25f), Sqrt(MulAdd(v, Sqrt(mul_v), offset_v)));
293
3.32k
}
enc_adaptive_quantization.cc:hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::(anonymous namespace)::MaskingSqrt<hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul>)
Line
Count
Source
287
3.32k
V MaskingSqrt(const D d, V v) {
288
3.32k
  static const float kLogOffset = 27.97044946785558f;
289
3.32k
  static const float kMul = 211.53333281566171f;
290
3.32k
  const auto mul_v = Set(d, kMul * 1e8);
291
3.32k
  const auto offset_v = Set(d, kLogOffset);
292
3.32k
  return Mul(Set(d, 0.25f), Sqrt(MulAdd(v, Sqrt(mul_v), offset_v)));
293
3.32k
}
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::MaskingSqrt<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::(anonymous namespace)::MaskingSqrt<hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::MaskingSqrt<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 1ul> jxl::N_SSE2::(anonymous namespace)::MaskingSqrt<hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul> >(hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul>)
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::(anonymous namespace)::MaskingSqrt<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>)
294
295
3.32k
float MaskingSqrt(const float v) {
296
3.32k
  using DScalar = HWY_CAPPED(float, 1);
297
3.32k
  auto vscalar = Load(DScalar(), &v);
298
3.32k
  return GetLane(MaskingSqrt(DScalar(), vscalar));
299
3.32k
}
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::MaskingSqrt(float)
Line
Count
Source
295
3.32k
float MaskingSqrt(const float v) {
296
3.32k
  using DScalar = HWY_CAPPED(float, 1);
297
3.32k
  auto vscalar = Load(DScalar(), &v);
298
3.32k
  return GetLane(MaskingSqrt(DScalar(), vscalar));
299
3.32k
}
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::MaskingSqrt(float)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::MaskingSqrt(float)
300
301
void StoreMin4(const float v, float& min0, float& min1, float& min2,
302
1.04k
               float& min3) {
303
1.04k
  if (v < min3) {
304
0
    if (v < min0) {
305
0
      min3 = min2;
306
0
      min2 = min1;
307
0
      min1 = min0;
308
0
      min0 = v;
309
0
    } else if (v < min1) {
310
0
      min3 = min2;
311
0
      min2 = min1;
312
0
      min1 = v;
313
0
    } else if (v < min2) {
314
0
      min3 = min2;
315
0
      min2 = v;
316
0
    } else {
317
0
      min3 = v;
318
0
    }
319
0
  }
320
1.04k
}
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::StoreMin4(float, float&, float&, float&, float&)
Line
Count
Source
302
1.04k
               float& min3) {
303
1.04k
  if (v < min3) {
304
0
    if (v < min0) {
305
0
      min3 = min2;
306
0
      min2 = min1;
307
0
      min1 = min0;
308
0
      min0 = v;
309
0
    } else if (v < min1) {
310
0
      min3 = min2;
311
0
      min2 = min1;
312
0
      min1 = v;
313
0
    } else if (v < min2) {
314
0
      min3 = min2;
315
0
      min2 = v;
316
0
    } else {
317
0
      min3 = v;
318
0
    }
319
0
  }
320
1.04k
}
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::StoreMin4(float, float&, float&, float&, float&)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::StoreMin4(float, float&, float&, float&, float&)
321
322
// Look for smooth areas near the area of degradation.
323
// If the areas are generally smooth, don't do masking.
324
// Output is downsampled 2x.
325
void FuzzyErosion(const float butteraugli_target, const Rect& from_rect,
326
52
                  const ImageF& from, const Rect& to_rect, ImageF* to) {
327
52
  const size_t xsize = from.xsize();
328
52
  const size_t ysize = from.ysize();
329
52
  constexpr int kStep = 1;
330
52
  static_assert(kStep == 1, "Step must be 1");
331
52
  JXL_ASSERT(to_rect.xsize() * 2 == from_rect.xsize());
332
52
  JXL_ASSERT(to_rect.ysize() * 2 == from_rect.ysize());
333
52
  static const float kMulBase0 = 0.125;
334
52
  static const float kMulBase1 = 0.10;
335
52
  static const float kMulBase2 = 0.09;
336
52
  static const float kMulBase3 = 0.06;
337
52
  static const float kMulAdd0 = 0.0;
338
52
  static const float kMulAdd1 = -0.10;
339
52
  static const float kMulAdd2 = -0.09;
340
52
  static const float kMulAdd3 = -0.06;
341
342
52
  float mul = 0.0;
343
52
  if (butteraugli_target < 2.0f) {
344
52
    mul = (2.0f - butteraugli_target) * (1.0f / 2.0f);
345
52
  }
346
52
  float kMul0 = kMulBase0 + mul * kMulAdd0;
347
52
  float kMul1 = kMulBase1 + mul * kMulAdd1;
348
52
  float kMul2 = kMulBase2 + mul * kMulAdd2;
349
52
  float kMul3 = kMulBase3 + mul * kMulAdd3;
350
52
  static const float kTotal = 0.29959705784054957;
351
52
  float norm = kTotal / (kMul0 + kMul1 + kMul2 + kMul3);
352
52
  kMul0 *= norm;
353
52
  kMul1 *= norm;
354
52
  kMul2 *= norm;
355
52
  kMul3 *= norm;
356
357
156
  for (size_t fy = 0; fy < from_rect.ysize(); ++fy) {
358
104
    size_t y = fy + from_rect.y0();
359
104
    size_t ym1 = y >= kStep ? y - kStep : y;
360
104
    size_t yp1 = y + kStep < ysize ? y + kStep : y;
361
104
    const float* rowt = from.Row(ym1);
362
104
    const float* row = from.Row(y);
363
104
    const float* rowb = from.Row(yp1);
364
104
    float* row_out = to_rect.Row(to, fy / 2);
365
312
    for (size_t fx = 0; fx < from_rect.xsize(); ++fx) {
366
208
      size_t x = fx + from_rect.x0();
367
208
      size_t xm1 = x >= kStep ? x - kStep : x;
368
208
      size_t xp1 = x + kStep < xsize ? x + kStep : x;
369
208
      float min0 = row[x];
370
208
      float min1 = row[xm1];
371
208
      float min2 = row[xp1];
372
208
      float min3 = rowt[xm1];
373
      // Sort the first four values.
374
208
      if (min0 > min1) std::swap(min0, min1);
375
208
      if (min0 > min2) std::swap(min0, min2);
376
208
      if (min0 > min3) std::swap(min0, min3);
377
208
      if (min1 > min2) std::swap(min1, min2);
378
208
      if (min1 > min3) std::swap(min1, min3);
379
208
      if (min2 > min3) std::swap(min2, min3);
380
      // The remaining five values of a 3x3 neighbourhood.
381
208
      StoreMin4(rowt[x], min0, min1, min2, min3);
382
208
      StoreMin4(rowt[xp1], min0, min1, min2, min3);
383
208
      StoreMin4(rowb[xm1], min0, min1, min2, min3);
384
208
      StoreMin4(rowb[x], min0, min1, min2, min3);
385
208
      StoreMin4(rowb[xp1], min0, min1, min2, min3);
386
387
208
      float v = kMul0 * min0 + kMul1 * min1 + kMul2 * min2 + kMul3 * min3;
388
208
      if (fx % 2 == 0 && fy % 2 == 0) {
389
52
        row_out[fx / 2] = v;
390
156
      } else {
391
156
        row_out[fx / 2] += v;
392
156
      }
393
208
    }
394
104
  }
395
52
}
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::FuzzyErosion(float, jxl::RectT<unsigned long> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::Plane<float>*)
Line
Count
Source
326
52
                  const ImageF& from, const Rect& to_rect, ImageF* to) {
327
52
  const size_t xsize = from.xsize();
328
52
  const size_t ysize = from.ysize();
329
52
  constexpr int kStep = 1;
330
52
  static_assert(kStep == 1, "Step must be 1");
331
52
  JXL_ASSERT(to_rect.xsize() * 2 == from_rect.xsize());
332
52
  JXL_ASSERT(to_rect.ysize() * 2 == from_rect.ysize());
333
52
  static const float kMulBase0 = 0.125;
334
52
  static const float kMulBase1 = 0.10;
335
52
  static const float kMulBase2 = 0.09;
336
52
  static const float kMulBase3 = 0.06;
337
52
  static const float kMulAdd0 = 0.0;
338
52
  static const float kMulAdd1 = -0.10;
339
52
  static const float kMulAdd2 = -0.09;
340
52
  static const float kMulAdd3 = -0.06;
341
342
52
  float mul = 0.0;
343
52
  if (butteraugli_target < 2.0f) {
344
52
    mul = (2.0f - butteraugli_target) * (1.0f / 2.0f);
345
52
  }
346
52
  float kMul0 = kMulBase0 + mul * kMulAdd0;
347
52
  float kMul1 = kMulBase1 + mul * kMulAdd1;
348
52
  float kMul2 = kMulBase2 + mul * kMulAdd2;
349
52
  float kMul3 = kMulBase3 + mul * kMulAdd3;
350
52
  static const float kTotal = 0.29959705784054957;
351
52
  float norm = kTotal / (kMul0 + kMul1 + kMul2 + kMul3);
352
52
  kMul0 *= norm;
353
52
  kMul1 *= norm;
354
52
  kMul2 *= norm;
355
52
  kMul3 *= norm;
356
357
156
  for (size_t fy = 0; fy < from_rect.ysize(); ++fy) {
358
104
    size_t y = fy + from_rect.y0();
359
104
    size_t ym1 = y >= kStep ? y - kStep : y;
360
104
    size_t yp1 = y + kStep < ysize ? y + kStep : y;
361
104
    const float* rowt = from.Row(ym1);
362
104
    const float* row = from.Row(y);
363
104
    const float* rowb = from.Row(yp1);
364
104
    float* row_out = to_rect.Row(to, fy / 2);
365
312
    for (size_t fx = 0; fx < from_rect.xsize(); ++fx) {
366
208
      size_t x = fx + from_rect.x0();
367
208
      size_t xm1 = x >= kStep ? x - kStep : x;
368
208
      size_t xp1 = x + kStep < xsize ? x + kStep : x;
369
208
      float min0 = row[x];
370
208
      float min1 = row[xm1];
371
208
      float min2 = row[xp1];
372
208
      float min3 = rowt[xm1];
373
      // Sort the first four values.
374
208
      if (min0 > min1) std::swap(min0, min1);
375
208
      if (min0 > min2) std::swap(min0, min2);
376
208
      if (min0 > min3) std::swap(min0, min3);
377
208
      if (min1 > min2) std::swap(min1, min2);
378
208
      if (min1 > min3) std::swap(min1, min3);
379
208
      if (min2 > min3) std::swap(min2, min3);
380
      // The remaining five values of a 3x3 neighbourhood.
381
208
      StoreMin4(rowt[x], min0, min1, min2, min3);
382
208
      StoreMin4(rowt[xp1], min0, min1, min2, min3);
383
208
      StoreMin4(rowb[xm1], min0, min1, min2, min3);
384
208
      StoreMin4(rowb[x], min0, min1, min2, min3);
385
208
      StoreMin4(rowb[xp1], min0, min1, min2, min3);
386
387
208
      float v = kMul0 * min0 + kMul1 * min1 + kMul2 * min2 + kMul3 * min3;
388
208
      if (fx % 2 == 0 && fy % 2 == 0) {
389
52
        row_out[fx / 2] = v;
390
156
      } else {
391
156
        row_out[fx / 2] += v;
392
156
      }
393
208
    }
394
104
  }
395
52
}
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::FuzzyErosion(float, jxl::RectT<unsigned long> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::Plane<float>*)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::FuzzyErosion(float, jxl::RectT<unsigned long> const&, jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::Plane<float>*)
396
397
struct AdaptiveQuantizationImpl {
398
52
  Status PrepareBuffers(JxlMemoryManager* memory_manager, size_t num_threads) {
399
52
    JXL_ASSIGN_OR_RETURN(
400
52
        diff_buffer,
401
52
        ImageF::Create(memory_manager, kEncTileDim + 8, num_threads));
402
156
    for (size_t i = pre_erosion.size(); i < num_threads; i++) {
403
104
      JXL_ASSIGN_OR_RETURN(
404
104
          ImageF tmp,
405
104
          ImageF::Create(memory_manager, kEncTileDimInBlocks * 2 + 2,
406
104
                         kEncTileDimInBlocks * 2 + 2));
407
104
      pre_erosion.emplace_back(std::move(tmp));
408
104
    }
409
52
    return true;
410
52
  }
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::AdaptiveQuantizationImpl::PrepareBuffers(JxlMemoryManagerStruct*, unsigned long)
Line
Count
Source
398
52
  Status PrepareBuffers(JxlMemoryManager* memory_manager, size_t num_threads) {
399
52
    JXL_ASSIGN_OR_RETURN(
400
52
        diff_buffer,
401
52
        ImageF::Create(memory_manager, kEncTileDim + 8, num_threads));
402
156
    for (size_t i = pre_erosion.size(); i < num_threads; i++) {
403
104
      JXL_ASSIGN_OR_RETURN(
404
104
          ImageF tmp,
405
104
          ImageF::Create(memory_manager, kEncTileDimInBlocks * 2 + 2,
406
104
                         kEncTileDimInBlocks * 2 + 2));
407
104
      pre_erosion.emplace_back(std::move(tmp));
408
104
    }
409
52
    return true;
410
52
  }
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::AdaptiveQuantizationImpl::PrepareBuffers(JxlMemoryManagerStruct*, unsigned long)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::AdaptiveQuantizationImpl::PrepareBuffers(JxlMemoryManagerStruct*, unsigned long)
411
412
  void ComputeTile(float butteraugli_target, float scale, const Image3F& xyb,
413
                   const Rect& rect_in, const Rect& rect_out, const int thread,
414
52
                   ImageF* mask, ImageF* mask1x1) {
415
52
    JXL_ASSERT(rect_in.x0() % 8 == 0);
416
52
    JXL_ASSERT(rect_in.y0() % 8 == 0);
417
52
    const size_t xsize = xyb.xsize();
418
52
    const size_t ysize = xyb.ysize();
419
420
    // The XYB gamma is 3.0 to be able to decode faster with two muls.
421
    // Butteraugli's gamma is matching the gamma of human eye, around 2.6.
422
    // We approximate the gamma difference by adding one cubic root into
423
    // the adaptive quantization. This gives us a total gamma of 2.6666
424
    // for quantization uses.
425
52
    const float match_gamma_offset = 0.019;
426
427
52
    const HWY_FULL(float) df;
428
429
52
    size_t y_start_1x1 = rect_in.y0() + rect_out.y0() * 8;
430
52
    size_t y_end_1x1 = y_start_1x1 + rect_out.ysize() * 8;
431
432
52
    size_t x_start_1x1 = rect_in.x0() + rect_out.x0() * 8;
433
52
    size_t x_end_1x1 = x_start_1x1 + rect_out.xsize() * 8;
434
435
52
    if (rect_in.x0() != 0 && rect_out.x0() == 0) x_start_1x1 -= 2;
436
52
    if (rect_in.x1() < xsize && rect_out.x1() * 8 == rect_in.xsize()) {
437
0
      x_end_1x1 += 2;
438
0
    }
439
52
    if (rect_in.y0() != 0 && rect_out.y0() == 0) y_start_1x1 -= 2;
440
52
    if (rect_in.y1() < ysize && rect_out.y1() * 8 == rect_in.ysize()) {
441
0
      y_end_1x1 += 2;
442
0
    }
443
444
    // Computes image (padded to multiple of 8x8) of local pixel differences.
445
    // Subsample both directions by 4.
446
    // 1x1 Laplacian of intensity.
447
468
    for (size_t y = y_start_1x1; y < y_end_1x1; ++y) {
448
416
      const size_t y2 = y + 1 < ysize ? y + 1 : y;
449
416
      const size_t y1 = y > 0 ? y - 1 : y;
450
416
      const float* row_in = xyb.ConstPlaneRow(1, y);
451
416
      const float* row_in1 = xyb.ConstPlaneRow(1, y1);
452
416
      const float* row_in2 = xyb.ConstPlaneRow(1, y2);
453
416
      float* mask1x1_out = mask1x1->Row(y);
454
3.32k
      auto scalar_pixel1x1 = [&](size_t x) {
455
3.32k
        const size_t x2 = x + 1 < xsize ? x + 1 : x;
456
3.32k
        const size_t x1 = x > 0 ? x - 1 : x;
457
3.32k
        const float base =
458
3.32k
            0.25f * (row_in2[x] + row_in1[x] + row_in[x1] + row_in[x2]);
459
3.32k
        const float gammac = RatioOfDerivativesOfCubicRootToSimpleGamma(
460
3.32k
            row_in[x] + match_gamma_offset);
461
3.32k
        float diff = fabs(gammac * (row_in[x] - base));
462
3.32k
        static const double kScaler = 1.0;
463
3.32k
        diff *= kScaler;
464
3.32k
        diff = log1p(diff);
465
3.32k
        static const float kMul = 1.0;
466
3.32k
        static const float kOffset = 0.01;
467
3.32k
        mask1x1_out[x] = kMul / (diff + kOffset);
468
3.32k
      };
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::RectT<unsigned long> const&, int, jxl::Plane<float>*, jxl::Plane<float>*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Line
Count
Source
454
3.32k
      auto scalar_pixel1x1 = [&](size_t x) {
455
3.32k
        const size_t x2 = x + 1 < xsize ? x + 1 : x;
456
3.32k
        const size_t x1 = x > 0 ? x - 1 : x;
457
3.32k
        const float base =
458
3.32k
            0.25f * (row_in2[x] + row_in1[x] + row_in[x1] + row_in[x2]);
459
3.32k
        const float gammac = RatioOfDerivativesOfCubicRootToSimpleGamma(
460
3.32k
            row_in[x] + match_gamma_offset);
461
3.32k
        float diff = fabs(gammac * (row_in[x] - base));
462
3.32k
        static const double kScaler = 1.0;
463
3.32k
        diff *= kScaler;
464
3.32k
        diff = log1p(diff);
465
3.32k
        static const float kMul = 1.0;
466
3.32k
        static const float kOffset = 0.01;
467
3.32k
        mask1x1_out[x] = kMul / (diff + kOffset);
468
3.32k
      };
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::RectT<unsigned long> const&, int, jxl::Plane<float>*, jxl::Plane<float>*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::RectT<unsigned long> const&, int, jxl::Plane<float>*, jxl::Plane<float>*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
469
3.74k
      for (size_t x = x_start_1x1; x < x_end_1x1; ++x) {
470
3.32k
        scalar_pixel1x1(x);
471
3.32k
      }
472
416
    }
473
474
52
    size_t y_start = rect_in.y0() + rect_out.y0() * 8;
475
52
    size_t y_end = y_start + rect_out.ysize() * 8;
476
477
52
    size_t x_start = rect_in.x0() + rect_out.x0() * 8;
478
52
    size_t x_end = x_start + rect_out.xsize() * 8;
479
480
52
    if (x_start != 0) x_start -= 4;
481
52
    if (x_end != xsize) x_end += 4;
482
52
    if (y_start != 0) y_start -= 4;
483
52
    if (y_end != ysize) y_end += 4;
484
52
    pre_erosion[thread].ShrinkTo((x_end - x_start) / 4, (y_end - y_start) / 4);
485
486
52
    static const float limit = 0.2f;
487
468
    for (size_t y = y_start; y < y_end; ++y) {
488
416
      size_t y2 = y + 1 < ysize ? y + 1 : y;
489
416
      size_t y1 = y > 0 ? y - 1 : y;
490
491
416
      const float* row_in = xyb.ConstPlaneRow(1, y);
492
416
      const float* row_in1 = xyb.ConstPlaneRow(1, y1);
493
416
      const float* row_in2 = xyb.ConstPlaneRow(1, y2);
494
416
      float* JXL_RESTRICT row_out = diff_buffer.Row(thread);
495
496
3.32k
      auto scalar_pixel = [&](size_t x) {
497
3.32k
        const size_t x2 = x + 1 < xsize ? x + 1 : x;
498
3.32k
        const size_t x1 = x > 0 ? x - 1 : x;
499
3.32k
        const float base =
500
3.32k
            0.25f * (row_in2[x] + row_in1[x] + row_in[x1] + row_in[x2]);
501
3.32k
        const float gammac = RatioOfDerivativesOfCubicRootToSimpleGamma(
502
3.32k
            row_in[x] + match_gamma_offset);
503
3.32k
        float diff = gammac * (row_in[x] - base);
504
3.32k
        diff *= diff;
505
3.32k
        if (diff >= limit) {
506
0
          diff = limit;
507
0
        }
508
3.32k
        diff = MaskingSqrt(diff);
509
3.32k
        if ((y % 4) != 0) {
510
2.49k
          row_out[x - x_start] += diff;
511
2.49k
        } else {
512
832
          row_out[x - x_start] = diff;
513
832
        }
514
3.32k
      };
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::RectT<unsigned long> const&, int, jxl::Plane<float>*, jxl::Plane<float>*)::{lambda(unsigned long)#2}::operator()(unsigned long) const
Line
Count
Source
496
3.32k
      auto scalar_pixel = [&](size_t x) {
497
3.32k
        const size_t x2 = x + 1 < xsize ? x + 1 : x;
498
3.32k
        const size_t x1 = x > 0 ? x - 1 : x;
499
3.32k
        const float base =
500
3.32k
            0.25f * (row_in2[x] + row_in1[x] + row_in[x1] + row_in[x2]);
501
3.32k
        const float gammac = RatioOfDerivativesOfCubicRootToSimpleGamma(
502
3.32k
            row_in[x] + match_gamma_offset);
503
3.32k
        float diff = gammac * (row_in[x] - base);
504
3.32k
        diff *= diff;
505
3.32k
        if (diff >= limit) {
506
0
          diff = limit;
507
0
        }
508
3.32k
        diff = MaskingSqrt(diff);
509
3.32k
        if ((y % 4) != 0) {
510
2.49k
          row_out[x - x_start] += diff;
511
2.49k
        } else {
512
832
          row_out[x - x_start] = diff;
513
832
        }
514
3.32k
      };
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::RectT<unsigned long> const&, int, jxl::Plane<float>*, jxl::Plane<float>*)::{lambda(unsigned long)#2}::operator()(unsigned long) const
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::RectT<unsigned long> const&, int, jxl::Plane<float>*, jxl::Plane<float>*)::{lambda(unsigned long)#2}::operator()(unsigned long) const
515
516
416
      size_t x = x_start;
517
      // First pixel of the row.
518
416
      if (x_start == 0) {
519
416
        scalar_pixel(x_start);
520
416
        ++x;
521
416
      }
522
      // SIMD
523
416
      const auto match_gamma_offset_v = Set(df, match_gamma_offset);
524
416
      const auto quarter = Set(df, 0.25f);
525
416
      for (; x + 1 + Lanes(df) < x_end; x += Lanes(df)) {
526
0
        const auto in = LoadU(df, row_in + x);
527
0
        const auto in_r = LoadU(df, row_in + x + 1);
528
0
        const auto in_l = LoadU(df, row_in + x - 1);
529
0
        const auto in_t = LoadU(df, row_in2 + x);
530
0
        const auto in_b = LoadU(df, row_in1 + x);
531
0
        auto base = Mul(quarter, Add(Add(in_r, in_l), Add(in_t, in_b)));
532
0
        auto gammacv =
533
0
            RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/false>(
534
0
                df, Add(in, match_gamma_offset_v));
535
0
        auto diff = Mul(gammacv, Sub(in, base));
536
0
        diff = Mul(diff, diff);
537
0
        diff = Min(diff, Set(df, limit));
538
0
        diff = MaskingSqrt(df, diff);
539
0
        if ((y & 3) != 0) {
540
0
          diff = Add(diff, LoadU(df, row_out + x - x_start));
541
0
        }
542
0
        StoreU(diff, df, row_out + x - x_start);
543
0
      }
544
      // Scalar
545
3.32k
      for (; x < x_end; ++x) {
546
2.91k
        scalar_pixel(x);
547
2.91k
      }
548
416
      if (y % 4 == 3) {
549
104
        float* row_dout = pre_erosion[thread].Row((y - y_start) / 4);
550
312
        for (size_t x = 0; x < (x_end - x_start) / 4; x++) {
551
208
          row_dout[x] = (row_out[x * 4] + row_out[x * 4 + 1] +
552
208
                         row_out[x * 4 + 2] + row_out[x * 4 + 3]) *
553
208
                        0.25f;
554
208
        }
555
104
      }
556
416
    }
557
52
    Rect from_rect(x_start % 8 == 0 ? 0 : 1, y_start % 8 == 0 ? 0 : 1,
558
52
                   rect_out.xsize() * 2, rect_out.ysize() * 2);
559
52
    FuzzyErosion(butteraugli_target, from_rect, pre_erosion[thread], rect_out,
560
52
                 &aq_map);
561
104
    for (size_t y = 0; y < rect_out.ysize(); ++y) {
562
52
      const float* aq_map_row = rect_out.ConstRow(aq_map, y);
563
52
      float* mask_row = rect_out.Row(mask, y);
564
104
      for (size_t x = 0; x < rect_out.xsize(); ++x) {
565
52
        mask_row[x] = ComputeMaskForAcStrategyUse(aq_map_row[x]);
566
52
      }
567
52
    }
568
52
    PerBlockModulations(butteraugli_target, xyb.Plane(0), xyb.Plane(1),
569
52
                        xyb.Plane(2), rect_in, scale, rect_out, &aq_map);
570
52
  }
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::RectT<unsigned long> const&, int, jxl::Plane<float>*, jxl::Plane<float>*)
Line
Count
Source
414
52
                   ImageF* mask, ImageF* mask1x1) {
415
52
    JXL_ASSERT(rect_in.x0() % 8 == 0);
416
52
    JXL_ASSERT(rect_in.y0() % 8 == 0);
417
52
    const size_t xsize = xyb.xsize();
418
52
    const size_t ysize = xyb.ysize();
419
420
    // The XYB gamma is 3.0 to be able to decode faster with two muls.
421
    // Butteraugli's gamma is matching the gamma of human eye, around 2.6.
422
    // We approximate the gamma difference by adding one cubic root into
423
    // the adaptive quantization. This gives us a total gamma of 2.6666
424
    // for quantization uses.
425
52
    const float match_gamma_offset = 0.019;
426
427
52
    const HWY_FULL(float) df;
428
429
52
    size_t y_start_1x1 = rect_in.y0() + rect_out.y0() * 8;
430
52
    size_t y_end_1x1 = y_start_1x1 + rect_out.ysize() * 8;
431
432
52
    size_t x_start_1x1 = rect_in.x0() + rect_out.x0() * 8;
433
52
    size_t x_end_1x1 = x_start_1x1 + rect_out.xsize() * 8;
434
435
52
    if (rect_in.x0() != 0 && rect_out.x0() == 0) x_start_1x1 -= 2;
436
52
    if (rect_in.x1() < xsize && rect_out.x1() * 8 == rect_in.xsize()) {
437
0
      x_end_1x1 += 2;
438
0
    }
439
52
    if (rect_in.y0() != 0 && rect_out.y0() == 0) y_start_1x1 -= 2;
440
52
    if (rect_in.y1() < ysize && rect_out.y1() * 8 == rect_in.ysize()) {
441
0
      y_end_1x1 += 2;
442
0
    }
443
444
    // Computes image (padded to multiple of 8x8) of local pixel differences.
445
    // Subsample both directions by 4.
446
    // 1x1 Laplacian of intensity.
447
468
    for (size_t y = y_start_1x1; y < y_end_1x1; ++y) {
448
416
      const size_t y2 = y + 1 < ysize ? y + 1 : y;
449
416
      const size_t y1 = y > 0 ? y - 1 : y;
450
416
      const float* row_in = xyb.ConstPlaneRow(1, y);
451
416
      const float* row_in1 = xyb.ConstPlaneRow(1, y1);
452
416
      const float* row_in2 = xyb.ConstPlaneRow(1, y2);
453
416
      float* mask1x1_out = mask1x1->Row(y);
454
416
      auto scalar_pixel1x1 = [&](size_t x) {
455
416
        const size_t x2 = x + 1 < xsize ? x + 1 : x;
456
416
        const size_t x1 = x > 0 ? x - 1 : x;
457
416
        const float base =
458
416
            0.25f * (row_in2[x] + row_in1[x] + row_in[x1] + row_in[x2]);
459
416
        const float gammac = RatioOfDerivativesOfCubicRootToSimpleGamma(
460
416
            row_in[x] + match_gamma_offset);
461
416
        float diff = fabs(gammac * (row_in[x] - base));
462
416
        static const double kScaler = 1.0;
463
416
        diff *= kScaler;
464
416
        diff = log1p(diff);
465
416
        static const float kMul = 1.0;
466
416
        static const float kOffset = 0.01;
467
416
        mask1x1_out[x] = kMul / (diff + kOffset);
468
416
      };
469
3.74k
      for (size_t x = x_start_1x1; x < x_end_1x1; ++x) {
470
3.32k
        scalar_pixel1x1(x);
471
3.32k
      }
472
416
    }
473
474
52
    size_t y_start = rect_in.y0() + rect_out.y0() * 8;
475
52
    size_t y_end = y_start + rect_out.ysize() * 8;
476
477
52
    size_t x_start = rect_in.x0() + rect_out.x0() * 8;
478
52
    size_t x_end = x_start + rect_out.xsize() * 8;
479
480
52
    if (x_start != 0) x_start -= 4;
481
52
    if (x_end != xsize) x_end += 4;
482
52
    if (y_start != 0) y_start -= 4;
483
52
    if (y_end != ysize) y_end += 4;
484
52
    pre_erosion[thread].ShrinkTo((x_end - x_start) / 4, (y_end - y_start) / 4);
485
486
52
    static const float limit = 0.2f;
487
468
    for (size_t y = y_start; y < y_end; ++y) {
488
416
      size_t y2 = y + 1 < ysize ? y + 1 : y;
489
416
      size_t y1 = y > 0 ? y - 1 : y;
490
491
416
      const float* row_in = xyb.ConstPlaneRow(1, y);
492
416
      const float* row_in1 = xyb.ConstPlaneRow(1, y1);
493
416
      const float* row_in2 = xyb.ConstPlaneRow(1, y2);
494
416
      float* JXL_RESTRICT row_out = diff_buffer.Row(thread);
495
496
416
      auto scalar_pixel = [&](size_t x) {
497
416
        const size_t x2 = x + 1 < xsize ? x + 1 : x;
498
416
        const size_t x1 = x > 0 ? x - 1 : x;
499
416
        const float base =
500
416
            0.25f * (row_in2[x] + row_in1[x] + row_in[x1] + row_in[x2]);
501
416
        const float gammac = RatioOfDerivativesOfCubicRootToSimpleGamma(
502
416
            row_in[x] + match_gamma_offset);
503
416
        float diff = gammac * (row_in[x] - base);
504
416
        diff *= diff;
505
416
        if (diff >= limit) {
506
416
          diff = limit;
507
416
        }
508
416
        diff = MaskingSqrt(diff);
509
416
        if ((y % 4) != 0) {
510
416
          row_out[x - x_start] += diff;
511
416
        } else {
512
416
          row_out[x - x_start] = diff;
513
416
        }
514
416
      };
515
516
416
      size_t x = x_start;
517
      // First pixel of the row.
518
416
      if (x_start == 0) {
519
416
        scalar_pixel(x_start);
520
416
        ++x;
521
416
      }
522
      // SIMD
523
416
      const auto match_gamma_offset_v = Set(df, match_gamma_offset);
524
416
      const auto quarter = Set(df, 0.25f);
525
416
      for (; x + 1 + Lanes(df) < x_end; x += Lanes(df)) {
526
0
        const auto in = LoadU(df, row_in + x);
527
0
        const auto in_r = LoadU(df, row_in + x + 1);
528
0
        const auto in_l = LoadU(df, row_in + x - 1);
529
0
        const auto in_t = LoadU(df, row_in2 + x);
530
0
        const auto in_b = LoadU(df, row_in1 + x);
531
0
        auto base = Mul(quarter, Add(Add(in_r, in_l), Add(in_t, in_b)));
532
0
        auto gammacv =
533
0
            RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/false>(
534
0
                df, Add(in, match_gamma_offset_v));
535
0
        auto diff = Mul(gammacv, Sub(in, base));
536
0
        diff = Mul(diff, diff);
537
0
        diff = Min(diff, Set(df, limit));
538
0
        diff = MaskingSqrt(df, diff);
539
0
        if ((y & 3) != 0) {
540
0
          diff = Add(diff, LoadU(df, row_out + x - x_start));
541
0
        }
542
0
        StoreU(diff, df, row_out + x - x_start);
543
0
      }
544
      // Scalar
545
3.32k
      for (; x < x_end; ++x) {
546
2.91k
        scalar_pixel(x);
547
2.91k
      }
548
416
      if (y % 4 == 3) {
549
104
        float* row_dout = pre_erosion[thread].Row((y - y_start) / 4);
550
312
        for (size_t x = 0; x < (x_end - x_start) / 4; x++) {
551
208
          row_dout[x] = (row_out[x * 4] + row_out[x * 4 + 1] +
552
208
                         row_out[x * 4 + 2] + row_out[x * 4 + 3]) *
553
208
                        0.25f;
554
208
        }
555
104
      }
556
416
    }
557
52
    Rect from_rect(x_start % 8 == 0 ? 0 : 1, y_start % 8 == 0 ? 0 : 1,
558
52
                   rect_out.xsize() * 2, rect_out.ysize() * 2);
559
52
    FuzzyErosion(butteraugli_target, from_rect, pre_erosion[thread], rect_out,
560
52
                 &aq_map);
561
104
    for (size_t y = 0; y < rect_out.ysize(); ++y) {
562
52
      const float* aq_map_row = rect_out.ConstRow(aq_map, y);
563
52
      float* mask_row = rect_out.Row(mask, y);
564
104
      for (size_t x = 0; x < rect_out.xsize(); ++x) {
565
52
        mask_row[x] = ComputeMaskForAcStrategyUse(aq_map_row[x]);
566
52
      }
567
52
    }
568
52
    PerBlockModulations(butteraugli_target, xyb.Plane(0), xyb.Plane(1),
569
52
                        xyb.Plane(2), rect_in, scale, rect_out, &aq_map);
570
52
  }
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::RectT<unsigned long> const&, int, jxl::Plane<float>*, jxl::Plane<float>*)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::RectT<unsigned long> const&, int, jxl::Plane<float>*, jxl::Plane<float>*)
571
  std::vector<ImageF> pre_erosion;
572
  ImageF aq_map;
573
  ImageF diff_buffer;
574
};
575
576
Status Blur1x1Masking(JxlMemoryManager* memory_manager, ThreadPool* pool,
577
52
                      ImageF* mask1x1, const Rect& rect) {
578
  // Blur the mask1x1 to obtain the masking image.
579
  // Before blurring it contains an image of absolute value of the
580
  // Laplacian of the intensity channel.
581
52
  static const float kFilterMask1x1[5] = {
582
52
      static_cast<float>(0.25647067633737227),
583
52
      static_cast<float>(0.2050056912354399075),
584
52
      static_cast<float>(0.154082048668497307),
585
52
      static_cast<float>(0.08149576591362004441),
586
52
      static_cast<float>(0.0512750104812308467),
587
52
  };
588
52
  double sum =
589
52
      1.0 + 4 * (kFilterMask1x1[0] + kFilterMask1x1[1] + kFilterMask1x1[2] +
590
52
                 kFilterMask1x1[4] + 2 * kFilterMask1x1[3]);
591
52
  if (sum < 1e-5) {
592
0
    sum = 1e-5;
593
0
  }
594
52
  const float normalize = static_cast<float>(1.0 / sum);
595
52
  const float normalize_mul = normalize;
596
52
  WeightsSymmetric5 weights =
597
52
      WeightsSymmetric5{{HWY_REP4(normalize)},
598
52
                        {HWY_REP4(normalize_mul * kFilterMask1x1[0])},
599
52
                        {HWY_REP4(normalize_mul * kFilterMask1x1[2])},
600
52
                        {HWY_REP4(normalize_mul * kFilterMask1x1[1])},
601
52
                        {HWY_REP4(normalize_mul * kFilterMask1x1[4])},
602
52
                        {HWY_REP4(normalize_mul * kFilterMask1x1[3])}};
603
52
  JXL_ASSIGN_OR_RETURN(
604
52
      ImageF temp, ImageF::Create(memory_manager, rect.xsize(), rect.ysize()));
605
52
  Symmetric5(*mask1x1, rect, weights, pool, &temp);
606
52
  *mask1x1 = std::move(temp);
607
52
  return true;
608
52
}
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::Blur1x1Masking(JxlMemoryManagerStruct*, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&)
Line
Count
Source
577
52
                      ImageF* mask1x1, const Rect& rect) {
578
  // Blur the mask1x1 to obtain the masking image.
579
  // Before blurring it contains an image of absolute value of the
580
  // Laplacian of the intensity channel.
581
52
  static const float kFilterMask1x1[5] = {
582
52
      static_cast<float>(0.25647067633737227),
583
52
      static_cast<float>(0.2050056912354399075),
584
52
      static_cast<float>(0.154082048668497307),
585
52
      static_cast<float>(0.08149576591362004441),
586
52
      static_cast<float>(0.0512750104812308467),
587
52
  };
588
52
  double sum =
589
52
      1.0 + 4 * (kFilterMask1x1[0] + kFilterMask1x1[1] + kFilterMask1x1[2] +
590
52
                 kFilterMask1x1[4] + 2 * kFilterMask1x1[3]);
591
52
  if (sum < 1e-5) {
592
0
    sum = 1e-5;
593
0
  }
594
52
  const float normalize = static_cast<float>(1.0 / sum);
595
52
  const float normalize_mul = normalize;
596
52
  WeightsSymmetric5 weights =
597
52
      WeightsSymmetric5{{HWY_REP4(normalize)},
598
52
                        {HWY_REP4(normalize_mul * kFilterMask1x1[0])},
599
52
                        {HWY_REP4(normalize_mul * kFilterMask1x1[2])},
600
52
                        {HWY_REP4(normalize_mul * kFilterMask1x1[1])},
601
52
                        {HWY_REP4(normalize_mul * kFilterMask1x1[4])},
602
52
                        {HWY_REP4(normalize_mul * kFilterMask1x1[3])}};
603
52
  JXL_ASSIGN_OR_RETURN(
604
52
      ImageF temp, ImageF::Create(memory_manager, rect.xsize(), rect.ysize()));
605
52
  Symmetric5(*mask1x1, rect, weights, pool, &temp);
606
52
  *mask1x1 = std::move(temp);
607
52
  return true;
608
52
}
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::Blur1x1Masking(JxlMemoryManagerStruct*, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::Blur1x1Masking(JxlMemoryManagerStruct*, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&)
609
610
StatusOr<ImageF> AdaptiveQuantizationMap(const float butteraugli_target,
611
                                         const Image3F& xyb, const Rect& rect,
612
                                         float scale, ThreadPool* pool,
613
52
                                         ImageF* mask, ImageF* mask1x1) {
614
52
  JXL_DASSERT(rect.xsize() % kBlockDim == 0);
615
52
  JXL_DASSERT(rect.ysize() % kBlockDim == 0);
616
52
  AdaptiveQuantizationImpl impl;
617
52
  const size_t xsize_blocks = rect.xsize() / kBlockDim;
618
52
  const size_t ysize_blocks = rect.ysize() / kBlockDim;
619
52
  JxlMemoryManager* memory_manager = xyb.memory_manager();
620
52
  JXL_ASSIGN_OR_RETURN(
621
52
      impl.aq_map, ImageF::Create(memory_manager, xsize_blocks, ysize_blocks));
622
52
  JXL_ASSIGN_OR_RETURN(
623
52
      *mask, ImageF::Create(memory_manager, xsize_blocks, ysize_blocks));
624
52
  JXL_ASSIGN_OR_RETURN(
625
52
      *mask1x1, ImageF::Create(memory_manager, xyb.xsize(), xyb.ysize()));
626
52
  JXL_CHECK(RunOnPool(
627
52
      pool, 0,
628
52
      DivCeil(xsize_blocks, kEncTileDimInBlocks) *
629
52
          DivCeil(ysize_blocks, kEncTileDimInBlocks),
630
52
      [&](const size_t num_threads) {
631
52
        return !!impl.PrepareBuffers(memory_manager, num_threads);
632
52
      },
633
52
      [&](const uint32_t tid, const size_t thread) {
634
52
        size_t n_enc_tiles = DivCeil(xsize_blocks, kEncTileDimInBlocks);
635
52
        size_t tx = tid % n_enc_tiles;
636
52
        size_t ty = tid / n_enc_tiles;
637
52
        size_t by0 = ty * kEncTileDimInBlocks;
638
52
        size_t by1 = std::min((ty + 1) * kEncTileDimInBlocks, ysize_blocks);
639
52
        size_t bx0 = tx * kEncTileDimInBlocks;
640
52
        size_t bx1 = std::min((tx + 1) * kEncTileDimInBlocks, xsize_blocks);
641
52
        Rect rect_out(bx0, by0, bx1 - bx0, by1 - by0);
642
52
        impl.ComputeTile(butteraugli_target, scale, xyb, rect, rect_out, thread,
643
52
                         mask, mask1x1);
644
52
      },
645
52
      "AQ DiffPrecompute"));
646
647
52
  JXL_RETURN_IF_ERROR(Blur1x1Masking(memory_manager, pool, mask1x1, rect));
648
52
  return std::move(impl).aq_map;
649
52
}
enc_adaptive_quantization.cc:jxl::N_AVX2::(anonymous namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, float, jxl::ThreadPool*, jxl::Plane<float>*, jxl::Plane<float>*)
Line
Count
Source
613
52
                                         ImageF* mask, ImageF* mask1x1) {
614
52
  JXL_DASSERT(rect.xsize() % kBlockDim == 0);
615
52
  JXL_DASSERT(rect.ysize() % kBlockDim == 0);
616
52
  AdaptiveQuantizationImpl impl;
617
52
  const size_t xsize_blocks = rect.xsize() / kBlockDim;
618
52
  const size_t ysize_blocks = rect.ysize() / kBlockDim;
619
52
  JxlMemoryManager* memory_manager = xyb.memory_manager();
620
52
  JXL_ASSIGN_OR_RETURN(
621
52
      impl.aq_map, ImageF::Create(memory_manager, xsize_blocks, ysize_blocks));
622
52
  JXL_ASSIGN_OR_RETURN(
623
52
      *mask, ImageF::Create(memory_manager, xsize_blocks, ysize_blocks));
624
52
  JXL_ASSIGN_OR_RETURN(
625
52
      *mask1x1, ImageF::Create(memory_manager, xyb.xsize(), xyb.ysize()));
626
52
  JXL_CHECK(RunOnPool(
627
52
      pool, 0,
628
52
      DivCeil(xsize_blocks, kEncTileDimInBlocks) *
629
52
          DivCeil(ysize_blocks, kEncTileDimInBlocks),
630
52
      [&](const size_t num_threads) {
631
52
        return !!impl.PrepareBuffers(memory_manager, num_threads);
632
52
      },
633
52
      [&](const uint32_t tid, const size_t thread) {
634
52
        size_t n_enc_tiles = DivCeil(xsize_blocks, kEncTileDimInBlocks);
635
52
        size_t tx = tid % n_enc_tiles;
636
52
        size_t ty = tid / n_enc_tiles;
637
52
        size_t by0 = ty * kEncTileDimInBlocks;
638
52
        size_t by1 = std::min((ty + 1) * kEncTileDimInBlocks, ysize_blocks);
639
52
        size_t bx0 = tx * kEncTileDimInBlocks;
640
52
        size_t bx1 = std::min((tx + 1) * kEncTileDimInBlocks, xsize_blocks);
641
52
        Rect rect_out(bx0, by0, bx1 - bx0, by1 - by0);
642
52
        impl.ComputeTile(butteraugli_target, scale, xyb, rect, rect_out, thread,
643
52
                         mask, mask1x1);
644
52
      },
645
52
      "AQ DiffPrecompute"));
646
647
52
  JXL_RETURN_IF_ERROR(Blur1x1Masking(memory_manager, pool, mask1x1, rect));
648
52
  return std::move(impl).aq_map;
649
52
}
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE4::(anonymous namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, float, jxl::ThreadPool*, jxl::Plane<float>*, jxl::Plane<float>*)
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SSE2::(anonymous namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, float, jxl::ThreadPool*, jxl::Plane<float>*, jxl::Plane<float>*)
650
651
}  // namespace
652
653
// NOLINTNEXTLINE(google-readability-namespace-comments)
654
}  // namespace HWY_NAMESPACE
655
}  // namespace jxl
656
HWY_AFTER_NAMESPACE();
657
658
#if HWY_ONCE
659
namespace jxl {
660
HWY_EXPORT(AdaptiveQuantizationMap);
661
662
namespace {
663
664
// If true, prints the quantization maps at each iteration.
665
constexpr bool FLAGS_dump_quant_state = false;
666
667
Status DumpHeatmap(const CompressParams& cparams, const AuxOut* aux_out,
668
                   const std::string& label, const ImageF& image,
669
0
                   float good_threshold, float bad_threshold) {
670
0
  if (JXL_DEBUG_ADAPTIVE_QUANTIZATION) {
671
0
    JXL_ASSIGN_OR_RETURN(
672
0
        Image3F heatmap,
673
0
        CreateHeatMapImage(image, good_threshold, bad_threshold));
674
0
    char filename[200];
675
0
    snprintf(filename, sizeof(filename), "%s%05d", label.c_str(),
676
0
             aux_out->num_butteraugli_iters);
677
0
    JXL_RETURN_IF_ERROR(DumpImage(cparams, filename, heatmap));
678
0
  }
679
0
  return true;
680
0
}
681
682
Status DumpHeatmaps(const CompressParams& cparams, const AuxOut* aux_out,
683
                    float ba_target, const ImageF& quant_field,
684
0
                    const ImageF& tile_heatmap, const ImageF& bt_diffmap) {
685
0
  JxlMemoryManager* memory_manager = quant_field.memory_manager();
686
0
  if (JXL_DEBUG_ADAPTIVE_QUANTIZATION) {
687
0
    if (!WantDebugOutput(cparams)) return true;
688
0
    JXL_ASSIGN_OR_RETURN(ImageF inv_qmap,
689
0
                         ImageF::Create(memory_manager, quant_field.xsize(),
690
0
                                        quant_field.ysize()));
691
0
    for (size_t y = 0; y < quant_field.ysize(); ++y) {
692
0
      const float* JXL_RESTRICT row_q = quant_field.ConstRow(y);
693
0
      float* JXL_RESTRICT row_inv_q = inv_qmap.Row(y);
694
0
      for (size_t x = 0; x < quant_field.xsize(); ++x) {
695
0
        row_inv_q[x] = 1.0f / row_q[x];  // never zero
696
0
      }
697
0
    }
698
0
    JXL_RETURN_IF_ERROR(DumpHeatmap(cparams, aux_out, "quant_heatmap", inv_qmap,
699
0
                                    4.0f * ba_target, 6.0f * ba_target));
700
0
    JXL_RETURN_IF_ERROR(DumpHeatmap(cparams, aux_out, "tile_heatmap",
701
0
                                    tile_heatmap, ba_target, 1.5f * ba_target));
702
0
    // matches heat maps produced by the command line tool.
703
0
    JXL_RETURN_IF_ERROR(DumpHeatmap(cparams, aux_out, "bt_diffmap", bt_diffmap,
704
0
                                    ButteraugliFuzzyInverse(1.5),
705
0
                                    ButteraugliFuzzyInverse(0.5)));
706
0
  }
707
0
  return true;
708
0
}
709
710
StatusOr<ImageF> TileDistMap(const ImageF& distmap, int tile_size, int margin,
711
0
                             const AcStrategyImage& ac_strategy) {
712
0
  const int tile_xsize = (distmap.xsize() + tile_size - 1) / tile_size;
713
0
  const int tile_ysize = (distmap.ysize() + tile_size - 1) / tile_size;
714
0
  JxlMemoryManager* memory_manager = distmap.memory_manager();
715
0
  JXL_ASSIGN_OR_RETURN(ImageF tile_distmap,
716
0
                       ImageF::Create(memory_manager, tile_xsize, tile_ysize));
717
0
  size_t distmap_stride = tile_distmap.PixelsPerRow();
718
0
  for (int tile_y = 0; tile_y < tile_ysize; ++tile_y) {
719
0
    AcStrategyRow ac_strategy_row = ac_strategy.ConstRow(tile_y);
720
0
    float* JXL_RESTRICT dist_row = tile_distmap.Row(tile_y);
721
0
    for (int tile_x = 0; tile_x < tile_xsize; ++tile_x) {
722
0
      AcStrategy acs = ac_strategy_row[tile_x];
723
0
      if (!acs.IsFirstBlock()) continue;
724
0
      int this_tile_xsize = acs.covered_blocks_x() * tile_size;
725
0
      int this_tile_ysize = acs.covered_blocks_y() * tile_size;
726
0
      int y_begin = std::max<int>(0, tile_size * tile_y - margin);
727
0
      int y_end = std::min<int>(distmap.ysize(),
728
0
                                tile_size * tile_y + this_tile_ysize + margin);
729
0
      int x_begin = std::max<int>(0, tile_size * tile_x - margin);
730
0
      int x_end = std::min<int>(distmap.xsize(),
731
0
                                tile_size * tile_x + this_tile_xsize + margin);
732
0
      float dist_norm = 0.0;
733
0
      double pixels = 0;
734
0
      for (int y = y_begin; y < y_end; ++y) {
735
0
        float ymul = 1.0;
736
0
        constexpr float kBorderMul = 0.98f;
737
0
        constexpr float kCornerMul = 0.7f;
738
0
        if (margin != 0 && (y == y_begin || y == y_end - 1)) {
739
0
          ymul = kBorderMul;
740
0
        }
741
0
        const float* const JXL_RESTRICT row = distmap.Row(y);
742
0
        for (int x = x_begin; x < x_end; ++x) {
743
0
          float xmul = ymul;
744
0
          if (margin != 0 && (x == x_begin || x == x_end - 1)) {
745
0
            if (xmul == 1.0) {
746
0
              xmul = kBorderMul;
747
0
            } else {
748
0
              xmul = kCornerMul;
749
0
            }
750
0
          }
751
0
          float v = row[x];
752
0
          v *= v;
753
0
          v *= v;
754
0
          v *= v;
755
0
          v *= v;
756
0
          dist_norm += xmul * v;
757
0
          pixels += xmul;
758
0
        }
759
0
      }
760
0
      if (pixels == 0) pixels = 1;
761
      // 16th norm is less than the max norm, we reduce the difference
762
      // with this normalization factor.
763
0
      constexpr float kTileNorm = 1.2f;
764
0
      const float tile_dist =
765
0
          kTileNorm * std::pow(dist_norm / pixels, 1.0f / 16.0f);
766
0
      dist_row[tile_x] = tile_dist;
767
0
      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
768
0
        for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
769
0
          dist_row[tile_x + distmap_stride * iy + ix] = tile_dist;
770
0
        }
771
0
      }
772
0
    }
773
0
  }
774
0
  return tile_distmap;
775
0
}
776
777
const float kDcQuantPow = 0.83f;
778
const float kDcQuant = 1.095924047623553f;
779
const float kAcQuant = 0.725f;
780
781
// Computes the decoded image for a given set of compression parameters.
782
StatusOr<ImageBundle> RoundtripImage(const FrameHeader& frame_header,
783
                                     const Image3F& opsin,
784
                                     PassesEncoderState* enc_state,
785
                                     const JxlCmsInterface& cms,
786
0
                                     ThreadPool* pool) {
787
0
  JxlMemoryManager* memory_manager = enc_state->memory_manager();
788
0
  std::unique_ptr<PassesDecoderState> dec_state =
789
0
      jxl::make_unique<PassesDecoderState>(memory_manager);
790
0
  JXL_CHECK(dec_state->output_encoding_info.SetFromMetadata(
791
0
      *enc_state->shared.metadata));
792
0
  dec_state->shared = &enc_state->shared;
793
0
  JXL_ASSERT(opsin.ysize() % kBlockDim == 0);
794
795
0
  const size_t xsize_groups = DivCeil(opsin.xsize(), kGroupDim);
796
0
  const size_t ysize_groups = DivCeil(opsin.ysize(), kGroupDim);
797
0
  const size_t num_groups = xsize_groups * ysize_groups;
798
799
0
  size_t num_special_frames = enc_state->special_frames.size();
800
0
  size_t num_passes = enc_state->progressive_splitter.GetNumPasses();
801
0
  ModularFrameEncoder modular_frame_encoder(memory_manager, frame_header,
802
0
                                            enc_state->cparams, false);
803
0
  JXL_CHECK(InitializePassesEncoder(frame_header, opsin, Rect(opsin), cms, pool,
804
0
                                    enc_state, &modular_frame_encoder,
805
0
                                    nullptr));
806
0
  JXL_CHECK(dec_state->Init(frame_header));
807
0
  JXL_CHECK(dec_state->InitForAC(num_passes, pool));
808
809
0
  ImageBundle decoded(memory_manager, &enc_state->shared.metadata->m);
810
0
  decoded.origin = frame_header.frame_origin;
811
0
  JXL_ASSIGN_OR_RETURN(
812
0
      Image3F tmp,
813
0
      Image3F::Create(memory_manager, opsin.xsize(), opsin.ysize()));
814
0
  decoded.SetFromImage(std::move(tmp),
815
0
                       dec_state->output_encoding_info.color_encoding);
816
817
0
  PassesDecoderState::PipelineOptions options;
818
0
  options.use_slow_render_pipeline = false;
819
0
  options.coalescing = false;
820
0
  options.render_spotcolors = false;
821
0
  options.render_noise = false;
822
823
  // Same as frame_header.nonserialized_metadata->m
824
0
  const ImageMetadata& metadata = *decoded.metadata();
825
826
0
  JXL_CHECK(dec_state->PreparePipeline(
827
0
      frame_header, &enc_state->shared.metadata->m, &decoded, options));
828
829
0
  hwy::AlignedUniquePtr<GroupDecCache[]> group_dec_caches;
830
0
  const auto allocate_storage = [&](const size_t num_threads) -> Status {
831
0
    JXL_RETURN_IF_ERROR(
832
0
        dec_state->render_pipeline->PrepareForThreads(num_threads,
833
0
                                                      /*use_group_ids=*/false));
834
0
    group_dec_caches = hwy::MakeUniqueAlignedArray<GroupDecCache>(num_threads);
835
0
    return true;
836
0
  };
837
0
  std::atomic<bool> has_error{false};
838
0
  const auto process_group = [&](const uint32_t group_index,
839
0
                                 const size_t thread) {
840
0
    if (has_error) return;
841
0
    if (frame_header.loop_filter.epf_iters > 0) {
842
0
      ComputeSigma(frame_header.loop_filter,
843
0
                   dec_state->shared->frame_dim.BlockGroupRect(group_index),
844
0
                   dec_state.get());
845
0
    }
846
0
    RenderPipelineInput input =
847
0
        dec_state->render_pipeline->GetInputBuffers(group_index, thread);
848
0
    JXL_CHECK(DecodeGroupForRoundtrip(
849
0
        frame_header, enc_state->coeffs, group_index, dec_state.get(),
850
0
        &group_dec_caches[thread], thread, input, nullptr, nullptr));
851
0
    for (size_t c = 0; c < metadata.num_extra_channels; c++) {
852
0
      std::pair<ImageF*, Rect> ri = input.GetBuffer(3 + c);
853
0
      FillPlane(0.0f, ri.first, ri.second);
854
0
    }
855
0
    if (!input.Done()) {
856
0
      has_error = true;
857
0
      return;
858
0
    }
859
0
  };
860
0
  JXL_CHECK(RunOnPool(pool, 0, num_groups, allocate_storage, process_group,
861
0
                      "AQ loop"));
862
0
  if (has_error) return JXL_FAILURE("AQ loop failure");
863
864
  // Ensure we don't create any new special frames.
865
0
  enc_state->special_frames.resize(num_special_frames);
866
867
0
  return decoded;
868
0
}
869
870
constexpr int kMaxButteraugliIters = 4;
871
872
Status FindBestQuantization(const FrameHeader& frame_header,
873
                            const Image3F& linear, const Image3F& opsin,
874
                            ImageF& quant_field, PassesEncoderState* enc_state,
875
                            const JxlCmsInterface& cms, ThreadPool* pool,
876
0
                            AuxOut* aux_out) {
877
0
  const CompressParams& cparams = enc_state->cparams;
878
0
  if (cparams.resampling > 1 &&
879
0
      cparams.original_butteraugli_distance <= 4.0 * cparams.resampling) {
880
    // For downsampled opsin image, the butteraugli based adaptive quantization
881
    // loop would only make the size bigger without improving the distance much,
882
    // so in this case we enable it only for very high butteraugli targets.
883
0
    return true;
884
0
  }
885
0
  JxlMemoryManager* memory_manager = enc_state->memory_manager();
886
0
  Quantizer& quantizer = enc_state->shared.quantizer;
887
0
  ImageI& raw_quant_field = enc_state->shared.raw_quant_field;
888
889
0
  const float butteraugli_target = cparams.butteraugli_distance;
890
0
  const float original_butteraugli = cparams.original_butteraugli_distance;
891
0
  ButteraugliParams params;
892
0
  params.intensity_target = 80.f;
893
0
  JxlButteraugliComparator comparator(params, cms);
894
0
  JXL_CHECK(comparator.SetLinearReferenceImage(linear));
895
0
  bool lower_is_better =
896
0
      (comparator.GoodQualityScore() < comparator.BadQualityScore());
897
0
  const float initial_quant_dc = InitialQuantDC(butteraugli_target);
898
0
  AdjustQuantField(enc_state->shared.ac_strategy, Rect(quant_field),
899
0
                   original_butteraugli, &quant_field);
900
0
  ImageF tile_distmap;
901
0
  JXL_ASSIGN_OR_RETURN(
902
0
      ImageF initial_quant_field,
903
0
      ImageF::Create(memory_manager, quant_field.xsize(), quant_field.ysize()));
904
0
  CopyImageTo(quant_field, &initial_quant_field);
905
906
0
  float initial_qf_min;
907
0
  float initial_qf_max;
908
0
  ImageMinMax(initial_quant_field, &initial_qf_min, &initial_qf_max);
909
0
  float initial_qf_ratio = initial_qf_max / initial_qf_min;
910
0
  float qf_max_deviation_low = std::sqrt(250 / initial_qf_ratio);
911
0
  float asymmetry = 2;
912
0
  if (qf_max_deviation_low < asymmetry) asymmetry = qf_max_deviation_low;
913
0
  float qf_lower = initial_qf_min / (asymmetry * qf_max_deviation_low);
914
0
  float qf_higher = initial_qf_max * (qf_max_deviation_low / asymmetry);
915
916
0
  JXL_ASSERT(qf_higher / qf_lower < 253);
917
918
0
  constexpr int kOriginalComparisonRound = 1;
919
0
  int iters = kMaxButteraugliIters;
920
0
  if (cparams.speed_tier != SpeedTier::kTortoise) {
921
0
    iters = 2;
922
0
  }
923
0
  for (int i = 0; i < iters + 1; ++i) {
924
0
    if (JXL_DEBUG_ADAPTIVE_QUANTIZATION) {
925
0
      printf("\nQuantization field:\n");
926
0
      for (size_t y = 0; y < quant_field.ysize(); ++y) {
927
0
        for (size_t x = 0; x < quant_field.xsize(); ++x) {
928
0
          printf(" %.5f", quant_field.Row(y)[x]);
929
0
        }
930
0
        printf("\n");
931
0
      }
932
0
    }
933
0
    quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field);
934
0
    JXL_ASSIGN_OR_RETURN(
935
0
        ImageBundle dec_linear,
936
0
        RoundtripImage(frame_header, opsin, enc_state, cms, pool));
937
0
    float score;
938
0
    ImageF diffmap;
939
0
    JXL_CHECK(comparator.CompareWith(dec_linear, &diffmap, &score));
940
0
    if (!lower_is_better) {
941
0
      score = -score;
942
0
      ScaleImage(-1.0f, &diffmap);
943
0
    }
944
0
    JXL_ASSIGN_OR_RETURN(tile_distmap,
945
0
                         TileDistMap(diffmap, 8 * cparams.resampling, 0,
946
0
                                     enc_state->shared.ac_strategy));
947
0
    if (JXL_DEBUG_ADAPTIVE_QUANTIZATION && WantDebugOutput(cparams)) {
948
0
      JXL_RETURN_IF_ERROR(DumpImage(cparams, ("dec" + ToString(i)).c_str(),
949
0
                                    *dec_linear.color()));
950
0
      JXL_RETURN_IF_ERROR(DumpHeatmaps(cparams, aux_out, butteraugli_target,
951
0
                                       quant_field, tile_distmap, diffmap));
952
0
    }
953
0
    if (aux_out != nullptr) ++aux_out->num_butteraugli_iters;
954
0
    if (JXL_DEBUG_ADAPTIVE_QUANTIZATION) {
955
0
      float minval;
956
0
      float maxval;
957
0
      ImageMinMax(quant_field, &minval, &maxval);
958
0
      printf("\nButteraugli iter: %d/%d\n", i, kMaxButteraugliIters);
959
0
      printf("Butteraugli distance: %f  (target = %f)\n", score,
960
0
             original_butteraugli);
961
0
      printf("quant range: %f ... %f  DC quant: %f\n", minval, maxval,
962
0
             initial_quant_dc);
963
0
      if (FLAGS_dump_quant_state) {
964
0
        quantizer.DumpQuantizationMap(raw_quant_field);
965
0
      }
966
0
    }
967
968
0
    if (i == iters) break;
969
970
0
    double kPow[8] = {
971
0
        0.2, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
972
0
    };
973
0
    double kPowMod[8] = {
974
0
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
975
0
    };
976
0
    if (i == kOriginalComparisonRound) {
977
      // Don't allow optimization to make the quant field a lot worse than
978
      // what the initial guess was. This allows the AC field to have enough
979
      // precision to reduce the oscillations due to the dc reconstruction.
980
0
      double kInitMul = 0.6;
981
0
      const double kOneMinusInitMul = 1.0 - kInitMul;
982
0
      for (size_t y = 0; y < quant_field.ysize(); ++y) {
983
0
        float* const JXL_RESTRICT row_q = quant_field.Row(y);
984
0
        const float* const JXL_RESTRICT row_init = initial_quant_field.Row(y);
985
0
        for (size_t x = 0; x < quant_field.xsize(); ++x) {
986
0
          double clamp = kOneMinusInitMul * row_q[x] + kInitMul * row_init[x];
987
0
          if (row_q[x] < clamp) {
988
0
            row_q[x] = clamp;
989
0
            if (row_q[x] > qf_higher) row_q[x] = qf_higher;
990
0
            if (row_q[x] < qf_lower) row_q[x] = qf_lower;
991
0
          }
992
0
        }
993
0
      }
994
0
    }
995
996
0
    double cur_pow = 0.0;
997
0
    if (i < 7) {
998
0
      cur_pow = kPow[i] + (original_butteraugli - 1.0) * kPowMod[i];
999
0
      if (cur_pow < 0) {
1000
0
        cur_pow = 0;
1001
0
      }
1002
0
    }
1003
0
    if (cur_pow == 0.0) {
1004
0
      for (size_t y = 0; y < quant_field.ysize(); ++y) {
1005
0
        const float* const JXL_RESTRICT row_dist = tile_distmap.Row(y);
1006
0
        float* const JXL_RESTRICT row_q = quant_field.Row(y);
1007
0
        for (size_t x = 0; x < quant_field.xsize(); ++x) {
1008
0
          const float diff = row_dist[x] / original_butteraugli;
1009
0
          if (diff > 1.0f) {
1010
0
            float old = row_q[x];
1011
0
            row_q[x] *= diff;
1012
0
            int qf_old =
1013
0
                static_cast<int>(std::lround(old * quantizer.InvGlobalScale()));
1014
0
            int qf_new = static_cast<int>(
1015
0
                std::lround(row_q[x] * quantizer.InvGlobalScale()));
1016
0
            if (qf_old == qf_new) {
1017
0
              row_q[x] = old + quantizer.Scale();
1018
0
            }
1019
0
          }
1020
0
          if (row_q[x] > qf_higher) row_q[x] = qf_higher;
1021
0
          if (row_q[x] < qf_lower) row_q[x] = qf_lower;
1022
0
        }
1023
0
      }
1024
0
    } else {
1025
0
      for (size_t y = 0; y < quant_field.ysize(); ++y) {
1026
0
        const float* const JXL_RESTRICT row_dist = tile_distmap.Row(y);
1027
0
        float* const JXL_RESTRICT row_q = quant_field.Row(y);
1028
0
        for (size_t x = 0; x < quant_field.xsize(); ++x) {
1029
0
          const float diff = row_dist[x] / original_butteraugli;
1030
0
          if (diff <= 1.0f) {
1031
0
            row_q[x] *= std::pow(diff, cur_pow);
1032
0
          } else {
1033
0
            float old = row_q[x];
1034
0
            row_q[x] *= diff;
1035
0
            int qf_old =
1036
0
                static_cast<int>(std::lround(old * quantizer.InvGlobalScale()));
1037
0
            int qf_new = static_cast<int>(
1038
0
                std::lround(row_q[x] * quantizer.InvGlobalScale()));
1039
0
            if (qf_old == qf_new) {
1040
0
              row_q[x] = old + quantizer.Scale();
1041
0
            }
1042
0
          }
1043
0
          if (row_q[x] > qf_higher) row_q[x] = qf_higher;
1044
0
          if (row_q[x] < qf_lower) row_q[x] = qf_lower;
1045
0
        }
1046
0
      }
1047
0
    }
1048
0
  }
1049
0
  quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field);
1050
0
  return true;
1051
0
}
1052
1053
Status FindBestQuantizationMaxError(const FrameHeader& frame_header,
1054
                                    const Image3F& opsin, ImageF& quant_field,
1055
                                    PassesEncoderState* enc_state,
1056
                                    const JxlCmsInterface& cms,
1057
0
                                    ThreadPool* pool, AuxOut* aux_out) {
1058
  // TODO(szabadka): Make this work for non-opsin color spaces.
1059
0
  const CompressParams& cparams = enc_state->cparams;
1060
0
  Quantizer& quantizer = enc_state->shared.quantizer;
1061
0
  ImageI& raw_quant_field = enc_state->shared.raw_quant_field;
1062
1063
  // TODO(veluca): better choice of this value.
1064
0
  const float initial_quant_dc =
1065
0
      16 * std::sqrt(0.1f / cparams.butteraugli_distance);
1066
0
  AdjustQuantField(enc_state->shared.ac_strategy, Rect(quant_field),
1067
0
                   cparams.original_butteraugli_distance, &quant_field);
1068
1069
0
  const float inv_max_err[3] = {1.0f / enc_state->cparams.max_error[0],
1070
0
                                1.0f / enc_state->cparams.max_error[1],
1071
0
                                1.0f / enc_state->cparams.max_error[2]};
1072
1073
0
  for (int i = 0; i < kMaxButteraugliIters + 1; ++i) {
1074
0
    quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field);
1075
0
    if (JXL_DEBUG_ADAPTIVE_QUANTIZATION && aux_out) {
1076
0
      JXL_RETURN_IF_ERROR(
1077
0
          DumpXybImage(cparams, ("ops" + ToString(i)).c_str(), opsin));
1078
0
    }
1079
0
    JXL_ASSIGN_OR_RETURN(
1080
0
        ImageBundle decoded,
1081
0
        RoundtripImage(frame_header, opsin, enc_state, cms, pool));
1082
0
    if (JXL_DEBUG_ADAPTIVE_QUANTIZATION && aux_out) {
1083
0
      JXL_RETURN_IF_ERROR(DumpXybImage(cparams, ("dec" + ToString(i)).c_str(),
1084
0
                                       *decoded.color()));
1085
0
    }
1086
0
    for (size_t by = 0; by < enc_state->shared.frame_dim.ysize_blocks; by++) {
1087
0
      AcStrategyRow ac_strategy_row =
1088
0
          enc_state->shared.ac_strategy.ConstRow(by);
1089
0
      for (size_t bx = 0; bx < enc_state->shared.frame_dim.xsize_blocks; bx++) {
1090
0
        AcStrategy acs = ac_strategy_row[bx];
1091
0
        if (!acs.IsFirstBlock()) continue;
1092
0
        float max_error = 0;
1093
0
        for (size_t c = 0; c < 3; c++) {
1094
0
          for (size_t y = by * kBlockDim;
1095
0
               y < (by + acs.covered_blocks_y()) * kBlockDim; y++) {
1096
0
            if (y >= decoded.ysize()) continue;
1097
0
            const float* JXL_RESTRICT in_row = opsin.ConstPlaneRow(c, y);
1098
0
            const float* JXL_RESTRICT dec_row =
1099
0
                decoded.color()->ConstPlaneRow(c, y);
1100
0
            for (size_t x = bx * kBlockDim;
1101
0
                 x < (bx + acs.covered_blocks_x()) * kBlockDim; x++) {
1102
0
              if (x >= decoded.xsize()) continue;
1103
0
              max_error = std::max(
1104
0
                  std::abs(in_row[x] - dec_row[x]) * inv_max_err[c], max_error);
1105
0
            }
1106
0
          }
1107
0
        }
1108
        // Target an error between max_error/2 and max_error.
1109
        // If the error in the varblock is above the target, increase the qf to
1110
        // compensate. If the error is below the target, decrease the qf.
1111
        // However, to avoid an excessive increase of the qf, only do so if the
1112
        // error is less than half the maximum allowed error.
1113
0
        const float qf_mul = (max_error < 0.5f)   ? max_error * 2.0f
1114
0
                             : (max_error > 1.0f) ? max_error
1115
0
                                                  : 1.0f;
1116
0
        for (size_t qy = by; qy < by + acs.covered_blocks_y(); qy++) {
1117
0
          float* JXL_RESTRICT quant_field_row = quant_field.Row(qy);
1118
0
          for (size_t qx = bx; qx < bx + acs.covered_blocks_x(); qx++) {
1119
0
            quant_field_row[qx] *= qf_mul;
1120
0
          }
1121
0
        }
1122
0
      }
1123
0
    }
1124
0
  }
1125
0
  quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field);
1126
0
  return true;
1127
0
}
1128
1129
}  // namespace
1130
1131
void AdjustQuantField(const AcStrategyImage& ac_strategy, const Rect& rect,
1132
52
                      float butteraugli_target, ImageF* quant_field) {
1133
  // Replace the whole quant_field in non-8x8 blocks with the maximum of each
1134
  // 8x8 block.
1135
52
  size_t stride = quant_field->PixelsPerRow();
1136
1137
  // At low distances it is great to use max, but mean works better
1138
  // at high distances. We interpolate between them for a distance
1139
  // range.
1140
52
  float mean_max_mixer = 1.0f;
1141
52
  {
1142
52
    static const float kLimit = 1.54138f;
1143
52
    static const float kMul = 0.56391f;
1144
52
    static const float kMin = 0.0f;
1145
52
    if (butteraugli_target > kLimit) {
1146
0
      mean_max_mixer -= (butteraugli_target - kLimit) * kMul;
1147
0
      if (mean_max_mixer < kMin) {
1148
0
        mean_max_mixer = kMin;
1149
0
      }
1150
0
    }
1151
52
  }
1152
104
  for (size_t y = 0; y < rect.ysize(); ++y) {
1153
52
    AcStrategyRow ac_strategy_row = ac_strategy.ConstRow(rect, y);
1154
52
    float* JXL_RESTRICT quant_row = rect.Row(quant_field, y);
1155
104
    for (size_t x = 0; x < rect.xsize(); ++x) {
1156
52
      AcStrategy acs = ac_strategy_row[x];
1157
52
      if (!acs.IsFirstBlock()) continue;
1158
52
      JXL_ASSERT(x + acs.covered_blocks_x() <= quant_field->xsize());
1159
52
      JXL_ASSERT(y + acs.covered_blocks_y() <= quant_field->ysize());
1160
52
      float max = quant_row[x];
1161
52
      float mean = 0.0;
1162
104
      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
1163
104
        for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
1164
52
          mean += quant_row[x + ix + iy * stride];
1165
52
          max = std::max(quant_row[x + ix + iy * stride], max);
1166
52
        }
1167
52
      }
1168
52
      mean /= acs.covered_blocks_y() * acs.covered_blocks_x();
1169
52
      if (acs.covered_blocks_y() * acs.covered_blocks_x() >= 4) {
1170
0
        max *= mean_max_mixer;
1171
0
        max += (1.0f - mean_max_mixer) * mean;
1172
0
      }
1173
104
      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
1174
104
        for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
1175
52
          quant_row[x + ix + iy * stride] = max;
1176
52
        }
1177
52
      }
1178
52
    }
1179
52
  }
1180
52
}
1181
1182
52
float InitialQuantDC(float butteraugli_target) {
1183
52
  const float kDcMul = 0.3;  // Butteraugli target where non-linearity kicks in.
1184
52
  const float butteraugli_target_dc = std::max<float>(
1185
52
      0.5f * butteraugli_target,
1186
52
      std::min<float>(butteraugli_target,
1187
52
                      kDcMul * std::pow((1.0f / kDcMul) * butteraugli_target,
1188
52
                                        kDcQuantPow)));
1189
  // We want the maximum DC value to be at most 2**15 * kInvDCQuant / quant_dc.
1190
  // The maximum DC value might not be in the kXybRange because of inverse
1191
  // gaborish, so we add some slack to the maximum theoretical quant obtained
1192
  // this way (64).
1193
52
  return std::min(kDcQuant / butteraugli_target_dc, 50.f);
1194
52
}
1195
1196
// Builds the initial per-block AC quantization field for `opsin` (restricted
// to `rect`) by dispatching to the SIMD-specialized AdaptiveQuantizationMap.
// `rescale` scales the base AC quant derived from the Butteraugli target;
// `mask` and `mask1x1` receive auxiliary masking images.
StatusOr<ImageF> InitialQuantField(const float butteraugli_target,
                                   const Image3F& opsin, const Rect& rect,
                                   ThreadPool* pool, float rescale,
                                   ImageF* mask, ImageF* mask1x1) {
  // Base AC quantization scale, already rescaled for the caller.
  const float scaled_quant_ac = (kAcQuant / butteraugli_target) * rescale;
  return HWY_DYNAMIC_DISPATCH(AdaptiveQuantizationMap)(
      butteraugli_target, opsin, rect, scaled_quant_ac, pool, mask, mask1x1);
}
1204
1205
// Refines `quant_field` for the frame, choosing a strategy from the
// compression parameters: max-error mode uses the error-bounded search,
// otherwise (at kKitten speed or slower, when a linear image is available)
// a Butteraugli-score-driven search is run. Faster speed tiers leave the
// field untouched.
// NOTE(review): `rescale` is not referenced in this body — confirm whether
// it is still needed by callers.
Status FindBestQuantizer(const FrameHeader& frame_header, const Image3F* linear,
                         const Image3F& opsin, ImageF& quant_field,
                         PassesEncoderState* enc_state,
                         const JxlCmsInterface& cms, ThreadPool* pool,
                         AuxOut* aux_out, double rescale) {
  const CompressParams& cparams = enc_state->cparams;
  if (cparams.max_error_mode) {
    // Error-bounded quantization search.
    JXL_RETURN_IF_ERROR(FindBestQuantizationMaxError(
        frame_header, opsin, quant_field, enc_state, cms, pool, aux_out));
    return true;
  }
  const bool slow_enough = cparams.speed_tier <= SpeedTier::kKitten;
  if (linear != nullptr && slow_enough) {
    // Normal encoding to a butteraugli score.
    JXL_RETURN_IF_ERROR(FindBestQuantization(frame_header, *linear, opsin,
                                             quant_field, enc_state, cms, pool,
                                             aux_out));
  }
  return true;
}
1222
1223
}  // namespace jxl
1224
#endif  // HWY_ONCE