Coverage Report

Created: 2025-06-16 07:00

/src/libjxl/lib/jxl/enc_chroma_from_luma.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/enc_chroma_from_luma.h"
7
8
#include <jxl/memory_manager.h>
9
10
#include <algorithm>
11
#include <cmath>
12
#include <cstdlib>
13
#include <hwy/base.h>  // HWY_ALIGN_MAX
14
#include <limits>
15
16
#include "lib/jxl/ac_strategy.h"
17
#include "lib/jxl/base/compiler_specific.h"
18
#include "lib/jxl/base/span.h"
19
#include "lib/jxl/chroma_from_luma.h"
20
#include "lib/jxl/coeff_order_fwd.h"
21
#include "lib/jxl/enc_bit_writer.h"
22
#include "lib/jxl/fields.h"
23
#include "lib/jxl/frame_dimensions.h"
24
#include "lib/jxl/image.h"
25
#include "lib/jxl/quant_weights.h"
26
27
#undef HWY_TARGET_INCLUDE
28
#define HWY_TARGET_INCLUDE "lib/jxl/enc_chroma_from_luma.cc"
29
#include <hwy/foreach_target.h>
30
#include <hwy/highway.h>
31
32
#include "lib/jxl/base/common.h"
33
#include "lib/jxl/base/rect.h"
34
#include "lib/jxl/base/status.h"
35
#include "lib/jxl/cms/opsin_params.h"
36
#include "lib/jxl/dec_transforms-inl.h"
37
#include "lib/jxl/enc_aux_out.h"
38
#include "lib/jxl/enc_params.h"
39
#include "lib/jxl/enc_transforms-inl.h"
40
#include "lib/jxl/quantizer.h"
41
#include "lib/jxl/simd_util.h"
42
HWY_BEFORE_NAMESPACE();
43
namespace jxl {
44
namespace HWY_NAMESPACE {
45
46
// These templates are not found via ADL.
47
using hwy::HWY_NAMESPACE::Abs;
48
using hwy::HWY_NAMESPACE::Ge;
49
using hwy::HWY_NAMESPACE::GetLane;
50
using hwy::HWY_NAMESPACE::IfThenElse;
51
using hwy::HWY_NAMESPACE::Lt;
52
53
static HWY_FULL(float) df;
54
55
struct CFLFunction {
56
  static constexpr float kCoeff = 1.f / 3;
57
  static constexpr float kThres = 100.0f;
58
  static constexpr float kInvColorFactor = 1.0f / kDefaultColorFactor;
59
  CFLFunction(const float* values_m, const float* values_s, size_t num,
60
              float base, float distance_mul)
61
28.8k
      : values_m(values_m),
62
28.8k
        values_s(values_s),
63
28.8k
        num(num),
64
28.8k
        base(base),
65
28.8k
        distance_mul(distance_mul) {
66
28.8k
    JXL_DASSERT(num % Lanes(df) == 0);
67
28.8k
  }
Unexecuted instantiation: jxl::N_SSE4::CFLFunction::CFLFunction(float const*, float const*, unsigned long, float, float)
jxl::N_AVX2::CFLFunction::CFLFunction(float const*, float const*, unsigned long, float, float)
Line
Count
Source
61
28.8k
      : values_m(values_m),
62
28.8k
        values_s(values_s),
63
28.8k
        num(num),
64
28.8k
        base(base),
65
28.8k
        distance_mul(distance_mul) {
66
28.8k
    JXL_DASSERT(num % Lanes(df) == 0);
67
28.8k
  }
Unexecuted instantiation: jxl::N_SSE2::CFLFunction::CFLFunction(float const*, float const*, unsigned long, float, float)
68
69
  // Returns f'(x), where f is 1/3 * sum ((|color residual| + 1)^2-1) +
70
  // distance_mul * x^2 * num.
71
313k
  float Compute(float x, float eps, float* fpeps, float* fmeps) const {
72
313k
    float first_derivative = 2 * distance_mul * num * x;
73
313k
    float first_derivative_peps = 2 * distance_mul * num * (x + eps);
74
313k
    float first_derivative_meps = 2 * distance_mul * num * (x - eps);
75
76
313k
    const auto inv_color_factor = Set(df, kInvColorFactor);
77
313k
    const auto thres = Set(df, kThres);
78
313k
    const auto coeffx2 = Set(df, kCoeff * 2.0f);
79
313k
    const auto one = Set(df, 1.0f);
80
313k
    const auto zero = Set(df, 0.0f);
81
313k
    const auto base_v = Set(df, base);
82
313k
    const auto x_v = Set(df, x);
83
313k
    const auto xpe_v = Set(df, x + eps);
84
313k
    const auto xme_v = Set(df, x - eps);
85
313k
    auto fd_v = Zero(df);
86
313k
    auto fdpe_v = Zero(df);
87
313k
    auto fdme_v = Zero(df);
88
89
139M
    for (size_t i = 0; i < num; i += Lanes(df)) {
90
      // color residual = ax + b
91
138M
      const auto a = Mul(inv_color_factor, Load(df, values_m + i));
92
138M
      const auto b =
93
138M
          Sub(Mul(base_v, Load(df, values_m + i)), Load(df, values_s + i));
94
138M
      const auto v = MulAdd(a, x_v, b);
95
138M
      const auto vpe = MulAdd(a, xpe_v, b);
96
138M
      const auto vme = MulAdd(a, xme_v, b);
97
138M
      const auto av = Abs(v);
98
138M
      const auto avpe = Abs(vpe);
99
138M
      const auto avme = Abs(vme);
100
138M
      const auto acoeffx2 = Mul(coeffx2, a);
101
138M
      auto d = Mul(acoeffx2, Add(av, one));
102
138M
      auto dpe = Mul(acoeffx2, Add(avpe, one));
103
138M
      auto dme = Mul(acoeffx2, Add(avme, one));
104
138M
      d = IfThenElse(Lt(v, zero), Sub(zero, d), d);
105
138M
      dpe = IfThenElse(Lt(vpe, zero), Sub(zero, dpe), dpe);
106
138M
      dme = IfThenElse(Lt(vme, zero), Sub(zero, dme), dme);
107
138M
      const auto above = Ge(av, thres);
108
      // TODO(eustas): use IfThenElseZero
109
138M
      fd_v = Add(fd_v, IfThenElse(above, zero, d));
110
138M
      fdpe_v = Add(fdpe_v, IfThenElse(above, zero, dpe));
111
138M
      fdme_v = Add(fdme_v, IfThenElse(above, zero, dme));
112
138M
    }
113
114
313k
    *fpeps = first_derivative_peps + GetLane(SumOfLanes(df, fdpe_v));
115
313k
    *fmeps = first_derivative_meps + GetLane(SumOfLanes(df, fdme_v));
116
313k
    return first_derivative + GetLane(SumOfLanes(df, fd_v));
117
313k
  }
Unexecuted instantiation: jxl::N_SSE4::CFLFunction::Compute(float, float, float*, float*) const
jxl::N_AVX2::CFLFunction::Compute(float, float, float*, float*) const
Line
Count
Source
71
313k
  float Compute(float x, float eps, float* fpeps, float* fmeps) const {
72
313k
    float first_derivative = 2 * distance_mul * num * x;
73
313k
    float first_derivative_peps = 2 * distance_mul * num * (x + eps);
74
313k
    float first_derivative_meps = 2 * distance_mul * num * (x - eps);
75
76
313k
    const auto inv_color_factor = Set(df, kInvColorFactor);
77
313k
    const auto thres = Set(df, kThres);
78
313k
    const auto coeffx2 = Set(df, kCoeff * 2.0f);
79
313k
    const auto one = Set(df, 1.0f);
80
313k
    const auto zero = Set(df, 0.0f);
81
313k
    const auto base_v = Set(df, base);
82
313k
    const auto x_v = Set(df, x);
83
313k
    const auto xpe_v = Set(df, x + eps);
84
313k
    const auto xme_v = Set(df, x - eps);
85
313k
    auto fd_v = Zero(df);
86
313k
    auto fdpe_v = Zero(df);
87
313k
    auto fdme_v = Zero(df);
88
89
139M
    for (size_t i = 0; i < num; i += Lanes(df)) {
90
      // color residual = ax + b
91
138M
      const auto a = Mul(inv_color_factor, Load(df, values_m + i));
92
138M
      const auto b =
93
138M
          Sub(Mul(base_v, Load(df, values_m + i)), Load(df, values_s + i));
94
138M
      const auto v = MulAdd(a, x_v, b);
95
138M
      const auto vpe = MulAdd(a, xpe_v, b);
96
138M
      const auto vme = MulAdd(a, xme_v, b);
97
138M
      const auto av = Abs(v);
98
138M
      const auto avpe = Abs(vpe);
99
138M
      const auto avme = Abs(vme);
100
138M
      const auto acoeffx2 = Mul(coeffx2, a);
101
138M
      auto d = Mul(acoeffx2, Add(av, one));
102
138M
      auto dpe = Mul(acoeffx2, Add(avpe, one));
103
138M
      auto dme = Mul(acoeffx2, Add(avme, one));
104
138M
      d = IfThenElse(Lt(v, zero), Sub(zero, d), d);
105
138M
      dpe = IfThenElse(Lt(vpe, zero), Sub(zero, dpe), dpe);
106
138M
      dme = IfThenElse(Lt(vme, zero), Sub(zero, dme), dme);
107
138M
      const auto above = Ge(av, thres);
108
      // TODO(eustas): use IfThenElseZero
109
138M
      fd_v = Add(fd_v, IfThenElse(above, zero, d));
110
138M
      fdpe_v = Add(fdpe_v, IfThenElse(above, zero, dpe));
111
138M
      fdme_v = Add(fdme_v, IfThenElse(above, zero, dme));
112
138M
    }
113
114
313k
    *fpeps = first_derivative_peps + GetLane(SumOfLanes(df, fdpe_v));
115
313k
    *fmeps = first_derivative_meps + GetLane(SumOfLanes(df, fdme_v));
116
313k
    return first_derivative + GetLane(SumOfLanes(df, fd_v));
117
313k
  }
Unexecuted instantiation: jxl::N_SSE2::CFLFunction::Compute(float, float, float*, float*) const
118
119
  const float* JXL_RESTRICT values_m;
120
  const float* JXL_RESTRICT values_s;
121
  size_t num;
122
  float base;
123
  float distance_mul;
124
};
125
126
// Chroma-from-luma search, values_m will have luma -- and values_s chroma.
127
int32_t FindBestMultiplier(const float* values_m, const float* values_s,
128
                           size_t num, float base, float distance_mul,
129
28.8k
                           bool fast) {
130
28.8k
  if (num == 0) {
131
0
    return 0;
132
0
  }
133
28.8k
  float x;
134
28.8k
  if (fast) {
135
0
    static constexpr float kInvColorFactor = 1.0f / kDefaultColorFactor;
136
0
    auto ca = Zero(df);
137
0
    auto cb = Zero(df);
138
0
    const auto inv_color_factor = Set(df, kInvColorFactor);
139
0
    const auto base_v = Set(df, base);
140
0
    for (size_t i = 0; i < num; i += Lanes(df)) {
141
      // color residual = ax + b
142
0
      const auto a = Mul(inv_color_factor, Load(df, values_m + i));
143
0
      const auto b =
144
0
          Sub(Mul(base_v, Load(df, values_m + i)), Load(df, values_s + i));
145
0
      ca = MulAdd(a, a, ca);
146
0
      cb = MulAdd(a, b, cb);
147
0
    }
148
    // + distance_mul * x^2 * num
149
0
    x = -GetLane(SumOfLanes(df, cb)) /
150
0
        (GetLane(SumOfLanes(df, ca)) + num * distance_mul * 0.5f);
151
28.8k
  } else {
152
28.8k
    constexpr float eps = 100;
153
28.8k
    constexpr float kClamp = 20.0f;
154
28.8k
    CFLFunction fn(values_m, values_s, num, base, distance_mul);
155
28.8k
    x = 0;
156
    // Up to 20 Newton iterations, with approximate derivatives.
157
    // Derivatives are approximate due to the high amount of noise in the exact
158
    // derivatives.
159
324k
    for (size_t i = 0; i < 20; i++) {
160
313k
      float dfpeps;
161
313k
      float dfmeps;
162
313k
      float d_f = fn.Compute(x, eps, &dfpeps, &dfmeps);
163
313k
      float ddf = (dfpeps - dfmeps) / (2 * eps);
164
313k
      float kExperimentalInsignificantStabilizer = 0.85;
165
313k
      float step = d_f / (ddf + kExperimentalInsignificantStabilizer);
166
313k
      x -= std::min(kClamp, std::max(-kClamp, step));
167
313k
      if (std::abs(step) < 3e-3) break;
168
313k
    }
169
28.8k
  }
170
  // CFL seems to be tricky for larger transforms for HF components
171
  // close to zero. This heuristic brings the solutions closer to zero
172
  // and reduces red-green oscillations. A better approach would
173
  // look into variance of the multiplier within separate (e.g. 8x8)
174
  // areas and only apply this heuristic where there is a high variance.
175
  // This would give about 1 % more compression density.
176
28.8k
  float towards_zero = 2.6;
177
28.8k
  if (x >= towards_zero) {
178
4.26k
    x -= towards_zero;
179
24.6k
  } else if (x <= -towards_zero) {
180
8.15k
    x += towards_zero;
181
16.4k
  } else {
182
16.4k
    x = 0;
183
16.4k
  }
184
28.8k
  return jxl::Clamp1(std::round(x), -128.0f, 127.0f);
185
28.8k
}
Unexecuted instantiation: jxl::N_SSE4::FindBestMultiplier(float const*, float const*, unsigned long, float, float, bool)
jxl::N_AVX2::FindBestMultiplier(float const*, float const*, unsigned long, float, float, bool)
Line
Count
Source
129
28.8k
                           bool fast) {
130
28.8k
  if (num == 0) {
131
0
    return 0;
132
0
  }
133
28.8k
  float x;
134
28.8k
  if (fast) {
135
0
    static constexpr float kInvColorFactor = 1.0f / kDefaultColorFactor;
136
0
    auto ca = Zero(df);
137
0
    auto cb = Zero(df);
138
0
    const auto inv_color_factor = Set(df, kInvColorFactor);
139
0
    const auto base_v = Set(df, base);
140
0
    for (size_t i = 0; i < num; i += Lanes(df)) {
141
      // color residual = ax + b
142
0
      const auto a = Mul(inv_color_factor, Load(df, values_m + i));
143
0
      const auto b =
144
0
          Sub(Mul(base_v, Load(df, values_m + i)), Load(df, values_s + i));
145
0
      ca = MulAdd(a, a, ca);
146
0
      cb = MulAdd(a, b, cb);
147
0
    }
148
    // + distance_mul * x^2 * num
149
0
    x = -GetLane(SumOfLanes(df, cb)) /
150
0
        (GetLane(SumOfLanes(df, ca)) + num * distance_mul * 0.5f);
151
28.8k
  } else {
152
28.8k
    constexpr float eps = 100;
153
28.8k
    constexpr float kClamp = 20.0f;
154
28.8k
    CFLFunction fn(values_m, values_s, num, base, distance_mul);
155
28.8k
    x = 0;
156
    // Up to 20 Newton iterations, with approximate derivatives.
157
    // Derivatives are approximate due to the high amount of noise in the exact
158
    // derivatives.
159
324k
    for (size_t i = 0; i < 20; i++) {
160
313k
      float dfpeps;
161
313k
      float dfmeps;
162
313k
      float d_f = fn.Compute(x, eps, &dfpeps, &dfmeps);
163
313k
      float ddf = (dfpeps - dfmeps) / (2 * eps);
164
313k
      float kExperimentalInsignificantStabilizer = 0.85;
165
313k
      float step = d_f / (ddf + kExperimentalInsignificantStabilizer);
166
313k
      x -= std::min(kClamp, std::max(-kClamp, step));
167
313k
      if (std::abs(step) < 3e-3) break;
168
313k
    }
169
28.8k
  }
170
  // CFL seems to be tricky for larger transforms for HF components
171
  // close to zero. This heuristic brings the solutions closer to zero
172
  // and reduces red-green oscillations. A better approach would
173
  // look into variance of the multiplier within separate (e.g. 8x8)
174
  // areas and only apply this heuristic where there is a high variance.
175
  // This would give about 1 % more compression density.
176
28.8k
  float towards_zero = 2.6;
177
28.8k
  if (x >= towards_zero) {
178
4.26k
    x -= towards_zero;
179
24.6k
  } else if (x <= -towards_zero) {
180
8.15k
    x += towards_zero;
181
16.4k
  } else {
182
16.4k
    x = 0;
183
16.4k
  }
184
28.8k
  return jxl::Clamp1(std::round(x), -128.0f, 127.0f);
185
28.8k
}
Unexecuted instantiation: jxl::N_SSE2::FindBestMultiplier(float const*, float const*, unsigned long, float, float, bool)
186
187
Status InitDCStorage(JxlMemoryManager* memory_manager, size_t num_blocks,
188
186
                     ImageF* dc_values) {
189
  // First row: Y channel
190
  // Second row: X channel
191
  // Third row: Y channel
192
  // Fourth row: B channel
193
186
  JXL_ASSIGN_OR_RETURN(
194
186
      *dc_values,
195
186
      ImageF::Create(memory_manager, RoundUpTo(num_blocks, Lanes(df)), 4));
196
197
186
  JXL_ENSURE(dc_values->xsize() != 0);
198
  // Zero-fill the last lanes
199
930
  for (size_t y = 0; y < 4; y++) {
200
6.69k
    for (size_t x = dc_values->xsize() - Lanes(df); x < dc_values->xsize();
201
5.95k
         x++) {
202
5.95k
      dc_values->Row(y)[x] = 0;
203
5.95k
    }
204
744
  }
205
186
  return true;
206
186
}
Unexecuted instantiation: jxl::N_SSE4::InitDCStorage(JxlMemoryManagerStruct*, unsigned long, jxl::Plane<float>*)
jxl::N_AVX2::InitDCStorage(JxlMemoryManagerStruct*, unsigned long, jxl::Plane<float>*)
Line
Count
Source
188
186
                     ImageF* dc_values) {
189
  // First row: Y channel
190
  // Second row: X channel
191
  // Third row: Y channel
192
  // Fourth row: B channel
193
186
  JXL_ASSIGN_OR_RETURN(
194
186
      *dc_values,
195
186
      ImageF::Create(memory_manager, RoundUpTo(num_blocks, Lanes(df)), 4));
196
197
186
  JXL_ENSURE(dc_values->xsize() != 0);
198
  // Zero-fill the last lanes
199
930
  for (size_t y = 0; y < 4; y++) {
200
6.69k
    for (size_t x = dc_values->xsize() - Lanes(df); x < dc_values->xsize();
201
5.95k
         x++) {
202
5.95k
      dc_values->Row(y)[x] = 0;
203
5.95k
    }
204
744
  }
205
186
  return true;
206
186
}
Unexecuted instantiation: jxl::N_SSE2::InitDCStorage(JxlMemoryManagerStruct*, unsigned long, jxl::Plane<float>*)
207
208
Status ComputeTile(const Image3F& opsin, const Rect& opsin_rect,
209
                   const DequantMatrices& dequant,
210
                   const AcStrategyImage* ac_strategy,
211
                   const ImageI* raw_quant_field, const Quantizer* quantizer,
212
                   const Rect& rect, bool fast, bool use_dct8, ImageSB* map_x,
213
14.4k
                   ImageSB* map_b, ImageF* dc_values, Span<float> mem) {
214
14.4k
  static_assert(kEncTileDimInBlocks == kColorTileDimInBlocks,
215
14.4k
                "Invalid color tile dim");
216
14.4k
  size_t xsize_blocks = opsin_rect.xsize() / kBlockDim;
217
14.4k
  constexpr float kDistanceMultiplierAC = 1e-9f;
218
14.4k
  const size_t dct_scratch_size =
219
14.4k
      3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim;
220
221
14.4k
  const size_t y0 = rect.y0();
222
14.4k
  const size_t x0 = rect.x0();
223
14.4k
  const size_t x1 = rect.x0() + rect.xsize();
224
14.4k
  const size_t y1 = rect.y0() + rect.ysize();
225
226
14.4k
  int ty = y0 / kColorTileDimInBlocks;
227
14.4k
  int tx = x0 / kColorTileDimInBlocks;
228
229
14.4k
  int8_t* JXL_RESTRICT row_out_x = map_x->Row(ty);
230
14.4k
  int8_t* JXL_RESTRICT row_out_b = map_b->Row(ty);
231
232
14.4k
  float* JXL_RESTRICT dc_values_yx = dc_values->Row(0);
233
14.4k
  float* JXL_RESTRICT dc_values_x = dc_values->Row(1);
234
14.4k
  float* JXL_RESTRICT dc_values_yb = dc_values->Row(2);
235
14.4k
  float* JXL_RESTRICT dc_values_b = dc_values->Row(3);
236
237
  // All are aligned.
238
14.4k
  float* HWY_RESTRICT block_y = mem.begin();
239
14.4k
  float* HWY_RESTRICT block_x = block_y + AcStrategy::kMaxCoeffArea;
240
14.4k
  float* HWY_RESTRICT block_b = block_x + AcStrategy::kMaxCoeffArea;
241
14.4k
  JXL_ENSURE(mem.remove_prefix(3 * AcStrategy::kMaxCoeffArea));
242
14.4k
  float* HWY_RESTRICT coeffs_yx = mem.begin();
243
14.4k
  float* HWY_RESTRICT coeffs_x = coeffs_yx + kColorTileDim * kColorTileDim;
244
14.4k
  float* HWY_RESTRICT coeffs_yb = coeffs_x + kColorTileDim * kColorTileDim;
245
14.4k
  float* HWY_RESTRICT coeffs_b = coeffs_yb + kColorTileDim * kColorTileDim;
246
14.4k
  JXL_ENSURE(mem.remove_prefix(4 * kColorTileDim * kColorTileDim));
247
14.4k
  constexpr size_t dc_size =
248
14.4k
      AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks;
249
14.4k
  float* HWY_RESTRICT dc_y = mem.begin();
250
14.4k
  float* HWY_RESTRICT dc_x = dc_y + dc_size;
251
14.4k
  float* HWY_RESTRICT dc_b = dc_x + dc_size;
252
14.4k
  JXL_ENSURE(mem.remove_prefix(3 * dc_size));
253
14.4k
  float* HWY_RESTRICT scratch_space = mem.begin();
254
14.4k
  JXL_ENSURE(mem.size() == 2 * AcStrategy::kMaxCoeffArea + dct_scratch_size);
255
256
14.4k
  size_t num_ac = 0;
257
258
121k
  for (size_t y = y0; y < y1; ++y) {
259
106k
    const float* JXL_RESTRICT row_y =
260
106k
        opsin_rect.ConstPlaneRow(opsin, 1, y * kBlockDim);
261
106k
    const float* JXL_RESTRICT row_x =
262
106k
        opsin_rect.ConstPlaneRow(opsin, 0, y * kBlockDim);
263
106k
    const float* JXL_RESTRICT row_b =
264
106k
        opsin_rect.ConstPlaneRow(opsin, 2, y * kBlockDim);
265
106k
    size_t stride = opsin.PixelsPerRow();
266
267
903k
    for (size_t x = x0; x < x1; x++) {
268
796k
      AcStrategy acs = use_dct8
269
796k
                           ? AcStrategy::FromRawStrategy(AcStrategyType::DCT)
270
796k
                           : ac_strategy->ConstRow(y)[x];
271
796k
      if (!acs.IsFirstBlock()) continue;
272
639k
      size_t xs = acs.covered_blocks_x();
273
639k
      TransformFromPixels(acs.Strategy(), row_y + x * kBlockDim, stride,
274
639k
                          block_y, scratch_space);
275
639k
      DCFromLowestFrequencies(acs.Strategy(), block_y, dc_y, xs, scratch_space);
276
639k
      TransformFromPixels(acs.Strategy(), row_x + x * kBlockDim, stride,
277
639k
                          block_x, scratch_space);
278
639k
      DCFromLowestFrequencies(acs.Strategy(), block_x, dc_x, xs, scratch_space);
279
639k
      TransformFromPixels(acs.Strategy(), row_b + x * kBlockDim, stride,
280
639k
                          block_b, scratch_space);
281
639k
      DCFromLowestFrequencies(acs.Strategy(), block_b, dc_b, xs, scratch_space);
282
639k
      const float* const JXL_RESTRICT qm_x =
283
639k
          dequant.InvMatrix(acs.Strategy(), 0);
284
639k
      const float* const JXL_RESTRICT qm_b =
285
639k
          dequant.InvMatrix(acs.Strategy(), 2);
286
639k
      float q_dc_x = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(0);
287
639k
      float q_dc_b = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(2);
288
289
      // Copy DCs in dc_values.
290
1.31M
      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
291
1.47M
        for (size_t ix = 0; ix < xs; ix++) {
292
796k
          dc_values_yx[(iy + y) * xsize_blocks + ix + x] =
293
796k
              dc_y[iy * xs + ix] * q_dc_x;
294
796k
          dc_values_x[(iy + y) * xsize_blocks + ix + x] =
295
796k
              dc_x[iy * xs + ix] * q_dc_x;
296
796k
          dc_values_yb[(iy + y) * xsize_blocks + ix + x] =
297
796k
              dc_y[iy * xs + ix] * q_dc_b;
298
796k
          dc_values_b[(iy + y) * xsize_blocks + ix + x] =
299
796k
              dc_b[iy * xs + ix] * q_dc_b;
300
796k
        }
301
678k
      }
302
303
      // Do not use this block for computing AC CfL.
304
639k
      if (acs.covered_blocks_x() + x0 > x1 ||
305
639k
          acs.covered_blocks_y() + y0 > y1) {
306
0
        continue;
307
0
      }
308
309
      // Copy AC coefficients in the local block. The order in which
310
      // coefficients get stored does not matter.
311
639k
      size_t cx = acs.covered_blocks_x();
312
639k
      size_t cy = acs.covered_blocks_y();
313
639k
      CoefficientLayout(&cy, &cx);
314
      // Zero out LFs. This introduces terms in the optimization loop that
315
      // don't affect the result, as they are all 0, but allow for simpler
316
      // SIMDfication.
317
1.30M
      for (size_t iy = 0; iy < cy; iy++) {
318
1.46M
        for (size_t ix = 0; ix < cx; ix++) {
319
796k
          block_y[cx * kBlockDim * iy + ix] = 0;
320
796k
          block_x[cx * kBlockDim * iy + ix] = 0;
321
796k
          block_b[cx * kBlockDim * iy + ix] = 0;
322
796k
        }
323
665k
      }
324
      // Unclear why this is like it is. (This works slightly better
325
      // than the previous approach which was also a hack.)
326
639k
      const float qq =
327
639k
          (raw_quant_field == nullptr) ? 1.0f : raw_quant_field->Row(y)[x];
328
      // Experimentally values 128-130 seem best -- I don't know why we
329
      // need this multiplier.
330
639k
      const float kStrangeMultiplier = 128;
331
639k
      float q = use_dct8 ? 1 : quantizer->Scale() * kStrangeMultiplier * qq;
332
639k
      const auto qv = Set(df, q);
333
7.00M
      for (size_t i = 0; i < cx * cy * 64; i += Lanes(df)) {
334
6.36M
        const auto b_y = Load(df, block_y + i);
335
6.36M
        const auto b_x = Load(df, block_x + i);
336
6.36M
        const auto b_b = Load(df, block_b + i);
337
6.36M
        const auto qqm_x = Mul(qv, Load(df, qm_x + i));
338
6.36M
        const auto qqm_b = Mul(qv, Load(df, qm_b + i));
339
6.36M
        Store(Mul(b_y, qqm_x), df, coeffs_yx + num_ac);
340
6.36M
        Store(Mul(b_x, qqm_x), df, coeffs_x + num_ac);
341
6.36M
        Store(Mul(b_y, qqm_b), df, coeffs_yb + num_ac);
342
6.36M
        Store(Mul(b_b, qqm_b), df, coeffs_b + num_ac);
343
6.36M
        num_ac += Lanes(df);
344
6.36M
      }
345
639k
    }
346
106k
  }
347
14.4k
  JXL_ENSURE(num_ac % Lanes(df) == 0);
348
14.4k
  row_out_x[tx] = FindBestMultiplier(coeffs_yx, coeffs_x, num_ac, 0.0f,
349
14.4k
                                     kDistanceMultiplierAC, fast);
350
14.4k
  row_out_b[tx] =
351
14.4k
      FindBestMultiplier(coeffs_yb, coeffs_b, num_ac, jxl::cms::kYToBRatio,
352
14.4k
                         kDistanceMultiplierAC, fast);
353
14.4k
  return true;
354
14.4k
}
Unexecuted instantiation: jxl::N_SSE4::ComputeTile(jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::DequantMatrices const&, jxl::AcStrategyImage const*, jxl::Plane<int> const*, jxl::Quantizer const*, jxl::RectT<unsigned long> const&, bool, bool, jxl::Plane<signed char>*, jxl::Plane<signed char>*, jxl::Plane<float>*, jxl::Span<float>)
jxl::N_AVX2::ComputeTile(jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::DequantMatrices const&, jxl::AcStrategyImage const*, jxl::Plane<int> const*, jxl::Quantizer const*, jxl::RectT<unsigned long> const&, bool, bool, jxl::Plane<signed char>*, jxl::Plane<signed char>*, jxl::Plane<float>*, jxl::Span<float>)
Line
Count
Source
213
14.4k
                   ImageSB* map_b, ImageF* dc_values, Span<float> mem) {
214
14.4k
  static_assert(kEncTileDimInBlocks == kColorTileDimInBlocks,
215
14.4k
                "Invalid color tile dim");
216
14.4k
  size_t xsize_blocks = opsin_rect.xsize() / kBlockDim;
217
14.4k
  constexpr float kDistanceMultiplierAC = 1e-9f;
218
14.4k
  const size_t dct_scratch_size =
219
14.4k
      3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim;
220
221
14.4k
  const size_t y0 = rect.y0();
222
14.4k
  const size_t x0 = rect.x0();
223
14.4k
  const size_t x1 = rect.x0() + rect.xsize();
224
14.4k
  const size_t y1 = rect.y0() + rect.ysize();
225
226
14.4k
  int ty = y0 / kColorTileDimInBlocks;
227
14.4k
  int tx = x0 / kColorTileDimInBlocks;
228
229
14.4k
  int8_t* JXL_RESTRICT row_out_x = map_x->Row(ty);
230
14.4k
  int8_t* JXL_RESTRICT row_out_b = map_b->Row(ty);
231
232
14.4k
  float* JXL_RESTRICT dc_values_yx = dc_values->Row(0);
233
14.4k
  float* JXL_RESTRICT dc_values_x = dc_values->Row(1);
234
14.4k
  float* JXL_RESTRICT dc_values_yb = dc_values->Row(2);
235
14.4k
  float* JXL_RESTRICT dc_values_b = dc_values->Row(3);
236
237
  // All are aligned.
238
14.4k
  float* HWY_RESTRICT block_y = mem.begin();
239
14.4k
  float* HWY_RESTRICT block_x = block_y + AcStrategy::kMaxCoeffArea;
240
14.4k
  float* HWY_RESTRICT block_b = block_x + AcStrategy::kMaxCoeffArea;
241
14.4k
  JXL_ENSURE(mem.remove_prefix(3 * AcStrategy::kMaxCoeffArea));
242
14.4k
  float* HWY_RESTRICT coeffs_yx = mem.begin();
243
14.4k
  float* HWY_RESTRICT coeffs_x = coeffs_yx + kColorTileDim * kColorTileDim;
244
14.4k
  float* HWY_RESTRICT coeffs_yb = coeffs_x + kColorTileDim * kColorTileDim;
245
14.4k
  float* HWY_RESTRICT coeffs_b = coeffs_yb + kColorTileDim * kColorTileDim;
246
14.4k
  JXL_ENSURE(mem.remove_prefix(4 * kColorTileDim * kColorTileDim));
247
14.4k
  constexpr size_t dc_size =
248
14.4k
      AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks;
249
14.4k
  float* HWY_RESTRICT dc_y = mem.begin();
250
14.4k
  float* HWY_RESTRICT dc_x = dc_y + dc_size;
251
14.4k
  float* HWY_RESTRICT dc_b = dc_x + dc_size;
252
14.4k
  JXL_ENSURE(mem.remove_prefix(3 * dc_size));
253
14.4k
  float* HWY_RESTRICT scratch_space = mem.begin();
254
14.4k
  JXL_ENSURE(mem.size() == 2 * AcStrategy::kMaxCoeffArea + dct_scratch_size);
255
256
14.4k
  size_t num_ac = 0;
257
258
121k
  for (size_t y = y0; y < y1; ++y) {
259
106k
    const float* JXL_RESTRICT row_y =
260
106k
        opsin_rect.ConstPlaneRow(opsin, 1, y * kBlockDim);
261
106k
    const float* JXL_RESTRICT row_x =
262
106k
        opsin_rect.ConstPlaneRow(opsin, 0, y * kBlockDim);
263
106k
    const float* JXL_RESTRICT row_b =
264
106k
        opsin_rect.ConstPlaneRow(opsin, 2, y * kBlockDim);
265
106k
    size_t stride = opsin.PixelsPerRow();
266
267
903k
    for (size_t x = x0; x < x1; x++) {
268
796k
      AcStrategy acs = use_dct8
269
796k
                           ? AcStrategy::FromRawStrategy(AcStrategyType::DCT)
270
796k
                           : ac_strategy->ConstRow(y)[x];
271
796k
      if (!acs.IsFirstBlock()) continue;
272
639k
      size_t xs = acs.covered_blocks_x();
273
639k
      TransformFromPixels(acs.Strategy(), row_y + x * kBlockDim, stride,
274
639k
                          block_y, scratch_space);
275
639k
      DCFromLowestFrequencies(acs.Strategy(), block_y, dc_y, xs, scratch_space);
276
639k
      TransformFromPixels(acs.Strategy(), row_x + x * kBlockDim, stride,
277
639k
                          block_x, scratch_space);
278
639k
      DCFromLowestFrequencies(acs.Strategy(), block_x, dc_x, xs, scratch_space);
279
639k
      TransformFromPixels(acs.Strategy(), row_b + x * kBlockDim, stride,
280
639k
                          block_b, scratch_space);
281
639k
      DCFromLowestFrequencies(acs.Strategy(), block_b, dc_b, xs, scratch_space);
282
639k
      const float* const JXL_RESTRICT qm_x =
283
639k
          dequant.InvMatrix(acs.Strategy(), 0);
284
639k
      const float* const JXL_RESTRICT qm_b =
285
639k
          dequant.InvMatrix(acs.Strategy(), 2);
286
639k
      float q_dc_x = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(0);
287
639k
      float q_dc_b = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(2);
288
289
      // Copy DCs in dc_values.
290
1.31M
      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
291
1.47M
        for (size_t ix = 0; ix < xs; ix++) {
292
796k
          dc_values_yx[(iy + y) * xsize_blocks + ix + x] =
293
796k
              dc_y[iy * xs + ix] * q_dc_x;
294
796k
          dc_values_x[(iy + y) * xsize_blocks + ix + x] =
295
796k
              dc_x[iy * xs + ix] * q_dc_x;
296
796k
          dc_values_yb[(iy + y) * xsize_blocks + ix + x] =
297
796k
              dc_y[iy * xs + ix] * q_dc_b;
298
796k
          dc_values_b[(iy + y) * xsize_blocks + ix + x] =
299
796k
              dc_b[iy * xs + ix] * q_dc_b;
300
796k
        }
301
678k
      }
302
303
      // Do not use this block for computing AC CfL.
304
639k
      if (acs.covered_blocks_x() + x0 > x1 ||
305
639k
          acs.covered_blocks_y() + y0 > y1) {
306
0
        continue;
307
0
      }
308
309
      // Copy AC coefficients in the local block. The order in which
310
      // coefficients get stored does not matter.
311
639k
      size_t cx = acs.covered_blocks_x();
312
639k
      size_t cy = acs.covered_blocks_y();
313
639k
      CoefficientLayout(&cy, &cx);
314
      // Zero out LFs. This introduces terms in the optimization loop that
315
      // don't affect the result, as they are all 0, but allow for simpler
316
      // SIMDfication.
317
1.30M
      for (size_t iy = 0; iy < cy; iy++) {
318
1.46M
        for (size_t ix = 0; ix < cx; ix++) {
319
796k
          block_y[cx * kBlockDim * iy + ix] = 0;
320
796k
          block_x[cx * kBlockDim * iy + ix] = 0;
321
796k
          block_b[cx * kBlockDim * iy + ix] = 0;
322
796k
        }
323
665k
      }
324
      // Unclear why this is like it is. (This works slightly better
325
      // than the previous approach which was also a hack.)
326
639k
      const float qq =
327
639k
          (raw_quant_field == nullptr) ? 1.0f : raw_quant_field->Row(y)[x];
328
      // Experimentally values 128-130 seem best -- I don't know why we
329
      // need this multiplier.
330
639k
      const float kStrangeMultiplier = 128;
331
639k
      float q = use_dct8 ? 1 : quantizer->Scale() * kStrangeMultiplier * qq;
332
639k
      const auto qv = Set(df, q);
333
7.00M
      for (size_t i = 0; i < cx * cy * 64; i += Lanes(df)) {
334
6.36M
        const auto b_y = Load(df, block_y + i);
335
6.36M
        const auto b_x = Load(df, block_x + i);
336
6.36M
        const auto b_b = Load(df, block_b + i);
337
6.36M
        const auto qqm_x = Mul(qv, Load(df, qm_x + i));
338
6.36M
        const auto qqm_b = Mul(qv, Load(df, qm_b + i));
339
6.36M
        Store(Mul(b_y, qqm_x), df, coeffs_yx + num_ac);
340
6.36M
        Store(Mul(b_x, qqm_x), df, coeffs_x + num_ac);
341
6.36M
        Store(Mul(b_y, qqm_b), df, coeffs_yb + num_ac);
342
6.36M
        Store(Mul(b_b, qqm_b), df, coeffs_b + num_ac);
343
6.36M
        num_ac += Lanes(df);
344
6.36M
      }
345
639k
    }
346
106k
  }
347
14.4k
  JXL_ENSURE(num_ac % Lanes(df) == 0);
348
14.4k
  row_out_x[tx] = FindBestMultiplier(coeffs_yx, coeffs_x, num_ac, 0.0f,
349
14.4k
                                     kDistanceMultiplierAC, fast);
350
14.4k
  row_out_b[tx] =
351
14.4k
      FindBestMultiplier(coeffs_yb, coeffs_b, num_ac, jxl::cms::kYToBRatio,
352
14.4k
                         kDistanceMultiplierAC, fast);
353
14.4k
  return true;
354
14.4k
}
Unexecuted instantiation: jxl::N_SSE2::ComputeTile(jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::DequantMatrices const&, jxl::AcStrategyImage const*, jxl::Plane<int> const*, jxl::Quantizer const*, jxl::RectT<unsigned long> const&, bool, bool, jxl::Plane<signed char>*, jxl::Plane<signed char>*, jxl::Plane<float>*, jxl::Span<float>)
355
356
// NOLINTNEXTLINE(google-readability-namespace-comments)
357
}  // namespace HWY_NAMESPACE
358
}  // namespace jxl
359
HWY_AFTER_NAMESPACE();
360
361
#if HWY_ONCE
362
namespace jxl {
363
364
HWY_EXPORT(InitDCStorage);
365
HWY_EXPORT(ComputeTile);
366
367
186
Status CfLHeuristics::Init(const Rect& rect) {
368
186
  size_t xsize_blocks = rect.xsize() / kBlockDim;
369
186
  size_t ysize_blocks = rect.ysize() / kBlockDim;
370
186
  return HWY_DYNAMIC_DISPATCH(InitDCStorage)(
371
186
      memory_manager, xsize_blocks * ysize_blocks, &dc_values);
372
186
}
373
374
Status CfLHeuristics::ComputeTile(const Rect& r, const Image3F& opsin,
375
                                  const Rect& opsin_rect,
376
                                  const DequantMatrices& dequant,
377
                                  const AcStrategyImage* ac_strategy,
378
                                  const ImageI* raw_quant_field,
379
                                  const Quantizer* quantizer, bool fast,
380
14.4k
                                  size_t thread, ColorCorrelationMap* cmap) {
381
14.4k
  bool use_dct8 = ac_strategy == nullptr;
382
14.4k
  Span<float> scratch(mem.address<float>() + thread * ItemsPerThread(),
383
14.4k
                      ItemsPerThread());
384
14.4k
  return HWY_DYNAMIC_DISPATCH(ComputeTile)(
385
14.4k
      opsin, opsin_rect, dequant, ac_strategy, raw_quant_field, quantizer, r,
386
14.4k
      fast, use_dct8, &cmap->ytox_map, &cmap->ytob_map, &dc_values, scratch);
387
14.4k
}
388
389
Status ColorCorrelationEncodeDC(const ColorCorrelation& color_correlation,
390
                                BitWriter* writer, LayerType layer,
391
186
                                AuxOut* aux_out) {
392
186
  float color_factor = color_correlation.GetColorFactor();
393
186
  float base_correlation_x = color_correlation.GetBaseCorrelationX();
394
186
  float base_correlation_b = color_correlation.GetBaseCorrelationB();
395
186
  int32_t ytox_dc = color_correlation.GetYToXDC();
396
186
  int32_t ytob_dc = color_correlation.GetYToBDC();
397
398
186
  return writer->WithMaxBits(
399
186
      1 + 2 * kBitsPerByte + 12 + 32, layer, aux_out, [&]() -> Status {
400
186
        if (ytox_dc == 0 && ytob_dc == 0 &&
401
186
            color_factor == kDefaultColorFactor && base_correlation_x == 0.0f &&
402
186
            base_correlation_b == jxl::cms::kYToBRatio) {
403
186
          writer->Write(1, 1);
404
186
          return true;
405
186
        }
406
0
        writer->Write(1, 0);
407
0
        JXL_RETURN_IF_ERROR(
408
0
            U32Coder::Write(kColorFactorDist, color_factor, writer));
409
0
        JXL_RETURN_IF_ERROR(F16Coder::Write(base_correlation_x, writer));
410
0
        JXL_RETURN_IF_ERROR(F16Coder::Write(base_correlation_b, writer));
411
0
        writer->Write(kBitsPerByte,
412
0
                      ytox_dc - std::numeric_limits<int8_t>::min());
413
0
        writer->Write(kBitsPerByte,
414
0
                      ytob_dc - std::numeric_limits<int8_t>::min());
415
0
        return true;
416
0
      });
417
186
}
418
419
}  // namespace jxl
420
#endif  // HWY_ONCE