Coverage Report

Created: 2025-08-12 07:37

/src/libjxl/lib/jxl/enc_chroma_from_luma.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/enc_chroma_from_luma.h"
7
8
#include <jxl/memory_manager.h>
9
10
#include <algorithm>
11
#include <cmath>
12
#include <cstdlib>
13
#include <hwy/base.h>  // HWY_ALIGN_MAX
14
#include <limits>
15
16
#include "lib/jxl/ac_strategy.h"
17
#include "lib/jxl/base/compiler_specific.h"
18
#include "lib/jxl/base/span.h"
19
#include "lib/jxl/chroma_from_luma.h"
20
#include "lib/jxl/coeff_order_fwd.h"
21
#include "lib/jxl/enc_bit_writer.h"
22
#include "lib/jxl/fields.h"
23
#include "lib/jxl/frame_dimensions.h"
24
#include "lib/jxl/image.h"
25
#include "lib/jxl/quant_weights.h"
26
27
#undef HWY_TARGET_INCLUDE
28
#define HWY_TARGET_INCLUDE "lib/jxl/enc_chroma_from_luma.cc"
29
#include <hwy/foreach_target.h>
30
#include <hwy/highway.h>
31
32
#include "lib/jxl/base/common.h"
33
#include "lib/jxl/base/rect.h"
34
#include "lib/jxl/base/status.h"
35
#include "lib/jxl/cms/opsin_params.h"
36
#include "lib/jxl/dec_transforms-inl.h"
37
#include "lib/jxl/enc_aux_out.h"
38
#include "lib/jxl/enc_params.h"
39
#include "lib/jxl/enc_transforms-inl.h"
40
#include "lib/jxl/quantizer.h"
41
#include "lib/jxl/simd_util.h"
42
HWY_BEFORE_NAMESPACE();
43
namespace jxl {
44
namespace HWY_NAMESPACE {
45
46
// These templates are not found via ADL.
47
using hwy::HWY_NAMESPACE::Abs;
48
using hwy::HWY_NAMESPACE::Ge;
49
using hwy::HWY_NAMESPACE::GetLane;
50
using hwy::HWY_NAMESPACE::IfThenElse;
51
using hwy::HWY_NAMESPACE::Lt;
52
53
static HWY_FULL(float) df;
54
55
struct CFLFunction {
56
  static constexpr float kCoeff = 1.f / 3;
57
  static constexpr float kThres = 100.0f;
58
  static constexpr float kInvColorFactor = 1.0f / kDefaultColorFactor;
59
  CFLFunction(const float* values_m, const float* values_s, size_t num,
60
              float base, float distance_mul)
61
25.9k
      : values_m(values_m),
62
25.9k
        values_s(values_s),
63
25.9k
        num(num),
64
25.9k
        base(base),
65
25.9k
        distance_mul(distance_mul) {
66
25.9k
    JXL_DASSERT(num % Lanes(df) == 0);
67
25.9k
  }
Unexecuted instantiation: jxl::N_SSE4::CFLFunction::CFLFunction(float const*, float const*, unsigned long, float, float)
jxl::N_AVX2::CFLFunction::CFLFunction(float const*, float const*, unsigned long, float, float)
Line
Count
Source
61
25.9k
      : values_m(values_m),
62
25.9k
        values_s(values_s),
63
25.9k
        num(num),
64
25.9k
        base(base),
65
25.9k
        distance_mul(distance_mul) {
66
25.9k
    JXL_DASSERT(num % Lanes(df) == 0);
67
25.9k
  }
Unexecuted instantiation: jxl::N_SSE2::CFLFunction::CFLFunction(float const*, float const*, unsigned long, float, float)
68
69
  // Returns f'(x), where f is 1/3 * sum ((|color residual| + 1)^2-1) +
70
  // distance_mul * x^2 * num.
71
279k
  float Compute(float x, float eps, float* fpeps, float* fmeps) const {
72
279k
    float first_derivative = 2 * distance_mul * num * x;
73
279k
    float first_derivative_peps = 2 * distance_mul * num * (x + eps);
74
279k
    float first_derivative_meps = 2 * distance_mul * num * (x - eps);
75
76
279k
    const auto inv_color_factor = Set(df, kInvColorFactor);
77
279k
    const auto thres = Set(df, kThres);
78
279k
    const auto coeffx2 = Set(df, kCoeff * 2.0f);
79
279k
    const auto one = Set(df, 1.0f);
80
279k
    const auto zero = Set(df, 0.0f);
81
279k
    const auto base_v = Set(df, base);
82
279k
    const auto x_v = Set(df, x);
83
279k
    const auto xpe_v = Set(df, x + eps);
84
279k
    const auto xme_v = Set(df, x - eps);
85
279k
    auto fd_v = Zero(df);
86
279k
    auto fdpe_v = Zero(df);
87
279k
    auto fdme_v = Zero(df);
88
89
123M
    for (size_t i = 0; i < num; i += Lanes(df)) {
90
      // color residual = ax + b
91
122M
      const auto a = Mul(inv_color_factor, Load(df, values_m + i));
92
122M
      const auto b =
93
122M
          Sub(Mul(base_v, Load(df, values_m + i)), Load(df, values_s + i));
94
122M
      const auto v = MulAdd(a, x_v, b);
95
122M
      const auto vpe = MulAdd(a, xpe_v, b);
96
122M
      const auto vme = MulAdd(a, xme_v, b);
97
122M
      const auto av = Abs(v);
98
122M
      const auto avpe = Abs(vpe);
99
122M
      const auto avme = Abs(vme);
100
122M
      const auto acoeffx2 = Mul(coeffx2, a);
101
122M
      auto d = Mul(acoeffx2, Add(av, one));
102
122M
      auto dpe = Mul(acoeffx2, Add(avpe, one));
103
122M
      auto dme = Mul(acoeffx2, Add(avme, one));
104
122M
      d = IfThenElse(Lt(v, zero), Sub(zero, d), d);
105
122M
      dpe = IfThenElse(Lt(vpe, zero), Sub(zero, dpe), dpe);
106
122M
      dme = IfThenElse(Lt(vme, zero), Sub(zero, dme), dme);
107
122M
      const auto above = Ge(av, thres);
108
      // TODO(eustas): use IfThenElseZero
109
122M
      fd_v = Add(fd_v, IfThenElse(above, zero, d));
110
122M
      fdpe_v = Add(fdpe_v, IfThenElse(above, zero, dpe));
111
122M
      fdme_v = Add(fdme_v, IfThenElse(above, zero, dme));
112
122M
    }
113
114
279k
    *fpeps = first_derivative_peps + GetLane(SumOfLanes(df, fdpe_v));
115
279k
    *fmeps = first_derivative_meps + GetLane(SumOfLanes(df, fdme_v));
116
279k
    return first_derivative + GetLane(SumOfLanes(df, fd_v));
117
279k
  }
Unexecuted instantiation: jxl::N_SSE4::CFLFunction::Compute(float, float, float*, float*) const
jxl::N_AVX2::CFLFunction::Compute(float, float, float*, float*) const
Line
Count
Source
71
279k
  float Compute(float x, float eps, float* fpeps, float* fmeps) const {
72
279k
    float first_derivative = 2 * distance_mul * num * x;
73
279k
    float first_derivative_peps = 2 * distance_mul * num * (x + eps);
74
279k
    float first_derivative_meps = 2 * distance_mul * num * (x - eps);
75
76
279k
    const auto inv_color_factor = Set(df, kInvColorFactor);
77
279k
    const auto thres = Set(df, kThres);
78
279k
    const auto coeffx2 = Set(df, kCoeff * 2.0f);
79
279k
    const auto one = Set(df, 1.0f);
80
279k
    const auto zero = Set(df, 0.0f);
81
279k
    const auto base_v = Set(df, base);
82
279k
    const auto x_v = Set(df, x);
83
279k
    const auto xpe_v = Set(df, x + eps);
84
279k
    const auto xme_v = Set(df, x - eps);
85
279k
    auto fd_v = Zero(df);
86
279k
    auto fdpe_v = Zero(df);
87
279k
    auto fdme_v = Zero(df);
88
89
123M
    for (size_t i = 0; i < num; i += Lanes(df)) {
90
      // color residual = ax + b
91
122M
      const auto a = Mul(inv_color_factor, Load(df, values_m + i));
92
122M
      const auto b =
93
122M
          Sub(Mul(base_v, Load(df, values_m + i)), Load(df, values_s + i));
94
122M
      const auto v = MulAdd(a, x_v, b);
95
122M
      const auto vpe = MulAdd(a, xpe_v, b);
96
122M
      const auto vme = MulAdd(a, xme_v, b);
97
122M
      const auto av = Abs(v);
98
122M
      const auto avpe = Abs(vpe);
99
122M
      const auto avme = Abs(vme);
100
122M
      const auto acoeffx2 = Mul(coeffx2, a);
101
122M
      auto d = Mul(acoeffx2, Add(av, one));
102
122M
      auto dpe = Mul(acoeffx2, Add(avpe, one));
103
122M
      auto dme = Mul(acoeffx2, Add(avme, one));
104
122M
      d = IfThenElse(Lt(v, zero), Sub(zero, d), d);
105
122M
      dpe = IfThenElse(Lt(vpe, zero), Sub(zero, dpe), dpe);
106
122M
      dme = IfThenElse(Lt(vme, zero), Sub(zero, dme), dme);
107
122M
      const auto above = Ge(av, thres);
108
      // TODO(eustas): use IfThenElseZero
109
122M
      fd_v = Add(fd_v, IfThenElse(above, zero, d));
110
122M
      fdpe_v = Add(fdpe_v, IfThenElse(above, zero, dpe));
111
122M
      fdme_v = Add(fdme_v, IfThenElse(above, zero, dme));
112
122M
    }
113
114
279k
    *fpeps = first_derivative_peps + GetLane(SumOfLanes(df, fdpe_v));
115
279k
    *fmeps = first_derivative_meps + GetLane(SumOfLanes(df, fdme_v));
116
279k
    return first_derivative + GetLane(SumOfLanes(df, fd_v));
117
279k
  }
Unexecuted instantiation: jxl::N_SSE2::CFLFunction::Compute(float, float, float*, float*) const
118
119
  const float* JXL_RESTRICT values_m;
120
  const float* JXL_RESTRICT values_s;
121
  size_t num;
122
  float base;
123
  float distance_mul;
124
};
125
126
// Chroma-from-luma search, values_m will have luma -- and values_s chroma.
127
int32_t FindBestMultiplier(const float* values_m, const float* values_s,
128
                           size_t num, float base, float distance_mul,
129
25.9k
                           bool fast) {
130
25.9k
  if (num == 0) {
131
0
    return 0;
132
0
  }
133
25.9k
  float x;
134
25.9k
  if (fast) {
135
0
    static constexpr float kInvColorFactor = 1.0f / kDefaultColorFactor;
136
0
    auto ca = Zero(df);
137
0
    auto cb = Zero(df);
138
0
    const auto inv_color_factor = Set(df, kInvColorFactor);
139
0
    const auto base_v = Set(df, base);
140
0
    for (size_t i = 0; i < num; i += Lanes(df)) {
141
      // color residual = ax + b
142
0
      const auto a = Mul(inv_color_factor, Load(df, values_m + i));
143
0
      const auto b =
144
0
          Sub(Mul(base_v, Load(df, values_m + i)), Load(df, values_s + i));
145
0
      ca = MulAdd(a, a, ca);
146
0
      cb = MulAdd(a, b, cb);
147
0
    }
148
    // + distance_mul * x^2 * num
149
0
    x = -GetLane(SumOfLanes(df, cb)) /
150
0
        (GetLane(SumOfLanes(df, ca)) + num * distance_mul * 0.5f);
151
25.9k
  } else {
152
25.9k
    constexpr float eps = 100;
153
25.9k
    constexpr float kClamp = 20.0f;
154
25.9k
    CFLFunction fn(values_m, values_s, num, base, distance_mul);
155
25.9k
    x = 0;
156
    // Up to 20 Newton iterations, with approximate derivatives.
157
    // Derivatives are approximate due to the high amount of noise in the exact
158
    // derivatives.
159
289k
    for (size_t i = 0; i < 20; i++) {
160
279k
      float dfpeps;
161
279k
      float dfmeps;
162
279k
      float d_f = fn.Compute(x, eps, &dfpeps, &dfmeps);
163
279k
      float ddf = (dfpeps - dfmeps) / (2 * eps);
164
279k
      float kExperimentalInsignificantStabilizer = 0.85;
165
279k
      float step = d_f / (ddf + kExperimentalInsignificantStabilizer);
166
279k
      x -= std::min(kClamp, std::max(-kClamp, step));
167
279k
      if (std::abs(step) < 3e-3) break;
168
279k
    }
169
25.9k
  }
170
  // CFL seems to be tricky for larger transforms for HF components
171
  // close to zero. This heuristic brings the solutions closer to zero
172
  // and reduces red-green oscillations. A better approach would
173
  // look into variance of the multiplier within separate (e.g. 8x8)
174
  // areas and only apply this heuristic where there is a high variance.
175
  // This would give about 1 % more compression density.
176
25.9k
  float towards_zero = 2.6;
177
25.9k
  if (x >= towards_zero) {
178
6.28k
    x -= towards_zero;
179
19.6k
  } else if (x <= -towards_zero) {
180
6.19k
    x += towards_zero;
181
13.4k
  } else {
182
13.4k
    x = 0;
183
13.4k
  }
184
25.9k
  return jxl::Clamp1(std::round(x), -128.0f, 127.0f);
185
25.9k
}
Unexecuted instantiation: jxl::N_SSE4::FindBestMultiplier(float const*, float const*, unsigned long, float, float, bool)
jxl::N_AVX2::FindBestMultiplier(float const*, float const*, unsigned long, float, float, bool)
Line
Count
Source
129
25.9k
                           bool fast) {
130
25.9k
  if (num == 0) {
131
0
    return 0;
132
0
  }
133
25.9k
  float x;
134
25.9k
  if (fast) {
135
0
    static constexpr float kInvColorFactor = 1.0f / kDefaultColorFactor;
136
0
    auto ca = Zero(df);
137
0
    auto cb = Zero(df);
138
0
    const auto inv_color_factor = Set(df, kInvColorFactor);
139
0
    const auto base_v = Set(df, base);
140
0
    for (size_t i = 0; i < num; i += Lanes(df)) {
141
      // color residual = ax + b
142
0
      const auto a = Mul(inv_color_factor, Load(df, values_m + i));
143
0
      const auto b =
144
0
          Sub(Mul(base_v, Load(df, values_m + i)), Load(df, values_s + i));
145
0
      ca = MulAdd(a, a, ca);
146
0
      cb = MulAdd(a, b, cb);
147
0
    }
148
    // + distance_mul * x^2 * num
149
0
    x = -GetLane(SumOfLanes(df, cb)) /
150
0
        (GetLane(SumOfLanes(df, ca)) + num * distance_mul * 0.5f);
151
25.9k
  } else {
152
25.9k
    constexpr float eps = 100;
153
25.9k
    constexpr float kClamp = 20.0f;
154
25.9k
    CFLFunction fn(values_m, values_s, num, base, distance_mul);
155
25.9k
    x = 0;
156
    // Up to 20 Newton iterations, with approximate derivatives.
157
    // Derivatives are approximate due to the high amount of noise in the exact
158
    // derivatives.
159
289k
    for (size_t i = 0; i < 20; i++) {
160
279k
      float dfpeps;
161
279k
      float dfmeps;
162
279k
      float d_f = fn.Compute(x, eps, &dfpeps, &dfmeps);
163
279k
      float ddf = (dfpeps - dfmeps) / (2 * eps);
164
279k
      float kExperimentalInsignificantStabilizer = 0.85;
165
279k
      float step = d_f / (ddf + kExperimentalInsignificantStabilizer);
166
279k
      x -= std::min(kClamp, std::max(-kClamp, step));
167
279k
      if (std::abs(step) < 3e-3) break;
168
279k
    }
169
25.9k
  }
170
  // CFL seems to be tricky for larger transforms for HF components
171
  // close to zero. This heuristic brings the solutions closer to zero
172
  // and reduces red-green oscillations. A better approach would
173
  // look into variance of the multiplier within separate (e.g. 8x8)
174
  // areas and only apply this heuristic where there is a high variance.
175
  // This would give about 1 % more compression density.
176
25.9k
  float towards_zero = 2.6;
177
25.9k
  if (x >= towards_zero) {
178
6.28k
    x -= towards_zero;
179
19.6k
  } else if (x <= -towards_zero) {
180
6.19k
    x += towards_zero;
181
13.4k
  } else {
182
13.4k
    x = 0;
183
13.4k
  }
184
25.9k
  return jxl::Clamp1(std::round(x), -128.0f, 127.0f);
185
25.9k
}
Unexecuted instantiation: jxl::N_SSE2::FindBestMultiplier(float const*, float const*, unsigned long, float, float, bool)
186
187
Status InitDCStorage(JxlMemoryManager* memory_manager, size_t num_blocks,
188
162
                     ImageF* dc_values) {
189
  // First row: Y channel
190
  // Second row: X channel
191
  // Third row: Y channel
192
  // Fourth row: B channel
193
162
  JXL_ASSIGN_OR_RETURN(
194
162
      *dc_values,
195
162
      ImageF::Create(memory_manager, RoundUpTo(num_blocks, Lanes(df)), 4));
196
197
162
  JXL_ENSURE(dc_values->xsize() != 0);
198
  // Zero-fill the last lanes
199
810
  for (size_t y = 0; y < 4; y++) {
200
5.83k
    for (size_t x = dc_values->xsize() - Lanes(df); x < dc_values->xsize();
201
5.18k
         x++) {
202
5.18k
      dc_values->Row(y)[x] = 0;
203
5.18k
    }
204
648
  }
205
162
  return true;
206
162
}
Unexecuted instantiation: jxl::N_SSE4::InitDCStorage(JxlMemoryManagerStruct*, unsigned long, jxl::Plane<float>*)
jxl::N_AVX2::InitDCStorage(JxlMemoryManagerStruct*, unsigned long, jxl::Plane<float>*)
Line
Count
Source
188
162
                     ImageF* dc_values) {
189
  // First row: Y channel
190
  // Second row: X channel
191
  // Third row: Y channel
192
  // Fourth row: B channel
193
162
  JXL_ASSIGN_OR_RETURN(
194
162
      *dc_values,
195
162
      ImageF::Create(memory_manager, RoundUpTo(num_blocks, Lanes(df)), 4));
196
197
162
  JXL_ENSURE(dc_values->xsize() != 0);
198
  // Zero-fill the last lanes
199
810
  for (size_t y = 0; y < 4; y++) {
200
5.83k
    for (size_t x = dc_values->xsize() - Lanes(df); x < dc_values->xsize();
201
5.18k
         x++) {
202
5.18k
      dc_values->Row(y)[x] = 0;
203
5.18k
    }
204
648
  }
205
162
  return true;
206
162
}
Unexecuted instantiation: jxl::N_SSE2::InitDCStorage(JxlMemoryManagerStruct*, unsigned long, jxl::Plane<float>*)
207
208
Status ComputeTile(const Image3F& opsin, const Rect& opsin_rect,
209
                   const DequantMatrices& dequant,
210
                   const AcStrategyImage* ac_strategy,
211
                   const ImageI* raw_quant_field, const Quantizer* quantizer,
212
                   const Rect& rect, bool fast, bool use_dct8, ImageSB* map_x,
213
12.9k
                   ImageSB* map_b, ImageF* dc_values, Span<float> mem) {
214
12.9k
  static_assert(kEncTileDimInBlocks == kColorTileDimInBlocks,
215
12.9k
                "Invalid color tile dim");
216
12.9k
  size_t xsize_blocks = opsin_rect.xsize() / kBlockDim;
217
12.9k
  constexpr float kDistanceMultiplierAC = 1e-9f;
218
12.9k
  const size_t dct_scratch_size =
219
12.9k
      3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim;
220
221
12.9k
  const size_t y0 = rect.y0();
222
12.9k
  const size_t x0 = rect.x0();
223
12.9k
  const size_t x1 = rect.x0() + rect.xsize();
224
12.9k
  const size_t y1 = rect.y0() + rect.ysize();
225
226
12.9k
  int ty = y0 / kColorTileDimInBlocks;
227
12.9k
  int tx = x0 / kColorTileDimInBlocks;
228
229
12.9k
  int8_t* JXL_RESTRICT row_out_x = map_x->Row(ty);
230
12.9k
  int8_t* JXL_RESTRICT row_out_b = map_b->Row(ty);
231
232
12.9k
  float* JXL_RESTRICT dc_values_yx = dc_values->Row(0);
233
12.9k
  float* JXL_RESTRICT dc_values_x = dc_values->Row(1);
234
12.9k
  float* JXL_RESTRICT dc_values_yb = dc_values->Row(2);
235
12.9k
  float* JXL_RESTRICT dc_values_b = dc_values->Row(3);
236
237
  // All are aligned.
238
12.9k
  float* HWY_RESTRICT block_y = mem.begin();
239
12.9k
  float* HWY_RESTRICT block_x = block_y + AcStrategy::kMaxCoeffArea;
240
12.9k
  float* HWY_RESTRICT block_b = block_x + AcStrategy::kMaxCoeffArea;
241
12.9k
  JXL_ENSURE(mem.remove_prefix(3 * AcStrategy::kMaxCoeffArea));
242
12.9k
  float* HWY_RESTRICT coeffs_yx = mem.begin();
243
12.9k
  float* HWY_RESTRICT coeffs_x = coeffs_yx + kColorTileDim * kColorTileDim;
244
12.9k
  float* HWY_RESTRICT coeffs_yb = coeffs_x + kColorTileDim * kColorTileDim;
245
12.9k
  float* HWY_RESTRICT coeffs_b = coeffs_yb + kColorTileDim * kColorTileDim;
246
12.9k
  JXL_ENSURE(mem.remove_prefix(4 * kColorTileDim * kColorTileDim));
247
12.9k
  constexpr size_t dc_size =
248
12.9k
      AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks;
249
12.9k
  float* HWY_RESTRICT dc_y = mem.begin();
250
12.9k
  float* HWY_RESTRICT dc_x = dc_y + dc_size;
251
12.9k
  float* HWY_RESTRICT dc_b = dc_x + dc_size;
252
12.9k
  JXL_ENSURE(mem.remove_prefix(3 * dc_size));
253
12.9k
  float* HWY_RESTRICT scratch_space = mem.begin();
254
12.9k
  JXL_ENSURE(mem.size() == 2 * AcStrategy::kMaxCoeffArea + dct_scratch_size);
255
256
12.9k
  size_t num_ac = 0;
257
258
109k
  for (size_t y = y0; y < y1; ++y) {
259
96.6k
    const float* JXL_RESTRICT row_y =
260
96.6k
        opsin_rect.ConstPlaneRow(opsin, 1, y * kBlockDim);
261
96.6k
    const float* JXL_RESTRICT row_x =
262
96.6k
        opsin_rect.ConstPlaneRow(opsin, 0, y * kBlockDim);
263
96.6k
    const float* JXL_RESTRICT row_b =
264
96.6k
        opsin_rect.ConstPlaneRow(opsin, 2, y * kBlockDim);
265
96.6k
    size_t stride = opsin.PixelsPerRow();
266
267
802k
    for (size_t x = x0; x < x1; x++) {
268
706k
      AcStrategy acs = use_dct8
269
706k
                           ? AcStrategy::FromRawStrategy(AcStrategyType::DCT)
270
706k
                           : ac_strategy->ConstRow(y)[x];
271
706k
      if (!acs.IsFirstBlock()) continue;
272
534k
      size_t xs = acs.covered_blocks_x();
273
534k
      TransformFromPixels(acs.Strategy(), row_y + x * kBlockDim, stride,
274
534k
                          block_y, scratch_space);
275
534k
      DCFromLowestFrequencies(acs.Strategy(), block_y, dc_y, xs, scratch_space);
276
534k
      TransformFromPixels(acs.Strategy(), row_x + x * kBlockDim, stride,
277
534k
                          block_x, scratch_space);
278
534k
      DCFromLowestFrequencies(acs.Strategy(), block_x, dc_x, xs, scratch_space);
279
534k
      TransformFromPixels(acs.Strategy(), row_b + x * kBlockDim, stride,
280
534k
                          block_b, scratch_space);
281
534k
      DCFromLowestFrequencies(acs.Strategy(), block_b, dc_b, xs, scratch_space);
282
534k
      const float* const JXL_RESTRICT qm_x =
283
534k
          dequant.InvMatrix(acs.Strategy(), 0);
284
534k
      const float* const JXL_RESTRICT qm_b =
285
534k
          dequant.InvMatrix(acs.Strategy(), 2);
286
534k
      float q_dc_x = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(0);
287
534k
      float q_dc_b = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(2);
288
289
      // Copy DCs in dc_values.
290
1.11M
      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
291
1.28M
        for (size_t ix = 0; ix < xs; ix++) {
292
706k
          dc_values_yx[(iy + y) * xsize_blocks + ix + x] =
293
706k
              dc_y[iy * xs + ix] * q_dc_x;
294
706k
          dc_values_x[(iy + y) * xsize_blocks + ix + x] =
295
706k
              dc_x[iy * xs + ix] * q_dc_x;
296
706k
          dc_values_yb[(iy + y) * xsize_blocks + ix + x] =
297
706k
              dc_y[iy * xs + ix] * q_dc_b;
298
706k
          dc_values_b[(iy + y) * xsize_blocks + ix + x] =
299
706k
              dc_b[iy * xs + ix] * q_dc_b;
300
706k
        }
301
575k
      }
302
303
      // Do not use this block for computing AC CfL.
304
534k
      if (acs.covered_blocks_x() + x0 > x1 ||
305
534k
          acs.covered_blocks_y() + y0 > y1) {
306
0
        continue;
307
0
      }
308
309
      // Copy AC coefficients in the local block. The order in which
310
      // coefficients get stored does not matter.
311
534k
      size_t cx = acs.covered_blocks_x();
312
534k
      size_t cy = acs.covered_blocks_y();
313
534k
      CoefficientLayout(&cy, &cx);
314
      // Zero out LFs. This introduces terms in the optimization loop that
315
      // don't affect the result, as they are all 0, but allow for simpler
316
      // SIMDfication.
317
1.09M
      for (size_t iy = 0; iy < cy; iy++) {
318
1.27M
        for (size_t ix = 0; ix < cx; ix++) {
319
706k
          block_y[cx * kBlockDim * iy + ix] = 0;
320
706k
          block_x[cx * kBlockDim * iy + ix] = 0;
321
706k
          block_b[cx * kBlockDim * iy + ix] = 0;
322
706k
        }
323
564k
      }
324
      // Unclear why this is like it is. (This works slightly better
325
      // than the previous approach which was also a hack.)
326
534k
      const float qq =
327
534k
          (raw_quant_field == nullptr) ? 1.0f : raw_quant_field->Row(y)[x];
328
      // Experimentally values 128-130 seem best -- I don't know why we
329
      // need this multiplier.
330
534k
      const float kStrangeMultiplier = 128;
331
534k
      float q = use_dct8 ? 1 : quantizer->Scale() * kStrangeMultiplier * qq;
332
534k
      const auto qv = Set(df, q);
333
6.18M
      for (size_t i = 0; i < cx * cy * 64; i += Lanes(df)) {
334
5.65M
        const auto b_y = Load(df, block_y + i);
335
5.65M
        const auto b_x = Load(df, block_x + i);
336
5.65M
        const auto b_b = Load(df, block_b + i);
337
5.65M
        const auto qqm_x = Mul(qv, Load(df, qm_x + i));
338
5.65M
        const auto qqm_b = Mul(qv, Load(df, qm_b + i));
339
5.65M
        Store(Mul(b_y, qqm_x), df, coeffs_yx + num_ac);
340
5.65M
        Store(Mul(b_x, qqm_x), df, coeffs_x + num_ac);
341
5.65M
        Store(Mul(b_y, qqm_b), df, coeffs_yb + num_ac);
342
5.65M
        Store(Mul(b_b, qqm_b), df, coeffs_b + num_ac);
343
5.65M
        num_ac += Lanes(df);
344
5.65M
      }
345
534k
    }
346
96.6k
  }
347
12.9k
  JXL_ENSURE(num_ac % Lanes(df) == 0);
348
12.9k
  row_out_x[tx] = FindBestMultiplier(coeffs_yx, coeffs_x, num_ac, 0.0f,
349
12.9k
                                     kDistanceMultiplierAC, fast);
350
12.9k
  row_out_b[tx] =
351
12.9k
      FindBestMultiplier(coeffs_yb, coeffs_b, num_ac, jxl::cms::kYToBRatio,
352
12.9k
                         kDistanceMultiplierAC, fast);
353
12.9k
  return true;
354
12.9k
}
Unexecuted instantiation: jxl::N_SSE4::ComputeTile(jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::DequantMatrices const&, jxl::AcStrategyImage const*, jxl::Plane<int> const*, jxl::Quantizer const*, jxl::RectT<unsigned long> const&, bool, bool, jxl::Plane<signed char>*, jxl::Plane<signed char>*, jxl::Plane<float>*, jxl::Span<float>)
jxl::N_AVX2::ComputeTile(jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::DequantMatrices const&, jxl::AcStrategyImage const*, jxl::Plane<int> const*, jxl::Quantizer const*, jxl::RectT<unsigned long> const&, bool, bool, jxl::Plane<signed char>*, jxl::Plane<signed char>*, jxl::Plane<float>*, jxl::Span<float>)
Line
Count
Source
213
12.9k
                   ImageSB* map_b, ImageF* dc_values, Span<float> mem) {
214
12.9k
  static_assert(kEncTileDimInBlocks == kColorTileDimInBlocks,
215
12.9k
                "Invalid color tile dim");
216
12.9k
  size_t xsize_blocks = opsin_rect.xsize() / kBlockDim;
217
12.9k
  constexpr float kDistanceMultiplierAC = 1e-9f;
218
12.9k
  const size_t dct_scratch_size =
219
12.9k
      3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim;
220
221
12.9k
  const size_t y0 = rect.y0();
222
12.9k
  const size_t x0 = rect.x0();
223
12.9k
  const size_t x1 = rect.x0() + rect.xsize();
224
12.9k
  const size_t y1 = rect.y0() + rect.ysize();
225
226
12.9k
  int ty = y0 / kColorTileDimInBlocks;
227
12.9k
  int tx = x0 / kColorTileDimInBlocks;
228
229
12.9k
  int8_t* JXL_RESTRICT row_out_x = map_x->Row(ty);
230
12.9k
  int8_t* JXL_RESTRICT row_out_b = map_b->Row(ty);
231
232
12.9k
  float* JXL_RESTRICT dc_values_yx = dc_values->Row(0);
233
12.9k
  float* JXL_RESTRICT dc_values_x = dc_values->Row(1);
234
12.9k
  float* JXL_RESTRICT dc_values_yb = dc_values->Row(2);
235
12.9k
  float* JXL_RESTRICT dc_values_b = dc_values->Row(3);
236
237
  // All are aligned.
238
12.9k
  float* HWY_RESTRICT block_y = mem.begin();
239
12.9k
  float* HWY_RESTRICT block_x = block_y + AcStrategy::kMaxCoeffArea;
240
12.9k
  float* HWY_RESTRICT block_b = block_x + AcStrategy::kMaxCoeffArea;
241
12.9k
  JXL_ENSURE(mem.remove_prefix(3 * AcStrategy::kMaxCoeffArea));
242
12.9k
  float* HWY_RESTRICT coeffs_yx = mem.begin();
243
12.9k
  float* HWY_RESTRICT coeffs_x = coeffs_yx + kColorTileDim * kColorTileDim;
244
12.9k
  float* HWY_RESTRICT coeffs_yb = coeffs_x + kColorTileDim * kColorTileDim;
245
12.9k
  float* HWY_RESTRICT coeffs_b = coeffs_yb + kColorTileDim * kColorTileDim;
246
12.9k
  JXL_ENSURE(mem.remove_prefix(4 * kColorTileDim * kColorTileDim));
247
12.9k
  constexpr size_t dc_size =
248
12.9k
      AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks;
249
12.9k
  float* HWY_RESTRICT dc_y = mem.begin();
250
12.9k
  float* HWY_RESTRICT dc_x = dc_y + dc_size;
251
12.9k
  float* HWY_RESTRICT dc_b = dc_x + dc_size;
252
12.9k
  JXL_ENSURE(mem.remove_prefix(3 * dc_size));
253
12.9k
  float* HWY_RESTRICT scratch_space = mem.begin();
254
12.9k
  JXL_ENSURE(mem.size() == 2 * AcStrategy::kMaxCoeffArea + dct_scratch_size);
255
256
12.9k
  size_t num_ac = 0;
257
258
109k
  for (size_t y = y0; y < y1; ++y) {
259
96.6k
    const float* JXL_RESTRICT row_y =
260
96.6k
        opsin_rect.ConstPlaneRow(opsin, 1, y * kBlockDim);
261
96.6k
    const float* JXL_RESTRICT row_x =
262
96.6k
        opsin_rect.ConstPlaneRow(opsin, 0, y * kBlockDim);
263
96.6k
    const float* JXL_RESTRICT row_b =
264
96.6k
        opsin_rect.ConstPlaneRow(opsin, 2, y * kBlockDim);
265
96.6k
    size_t stride = opsin.PixelsPerRow();
266
267
802k
    for (size_t x = x0; x < x1; x++) {
268
706k
      AcStrategy acs = use_dct8
269
706k
                           ? AcStrategy::FromRawStrategy(AcStrategyType::DCT)
270
706k
                           : ac_strategy->ConstRow(y)[x];
271
706k
      if (!acs.IsFirstBlock()) continue;
272
534k
      size_t xs = acs.covered_blocks_x();
273
534k
      TransformFromPixels(acs.Strategy(), row_y + x * kBlockDim, stride,
274
534k
                          block_y, scratch_space);
275
534k
      DCFromLowestFrequencies(acs.Strategy(), block_y, dc_y, xs, scratch_space);
276
534k
      TransformFromPixels(acs.Strategy(), row_x + x * kBlockDim, stride,
277
534k
                          block_x, scratch_space);
278
534k
      DCFromLowestFrequencies(acs.Strategy(), block_x, dc_x, xs, scratch_space);
279
534k
      TransformFromPixels(acs.Strategy(), row_b + x * kBlockDim, stride,
280
534k
                          block_b, scratch_space);
281
534k
      DCFromLowestFrequencies(acs.Strategy(), block_b, dc_b, xs, scratch_space);
282
534k
      const float* const JXL_RESTRICT qm_x =
283
534k
          dequant.InvMatrix(acs.Strategy(), 0);
284
534k
      const float* const JXL_RESTRICT qm_b =
285
534k
          dequant.InvMatrix(acs.Strategy(), 2);
286
534k
      float q_dc_x = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(0);
287
534k
      float q_dc_b = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(2);
288
289
      // Copy DCs in dc_values.
290
1.11M
      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
291
1.28M
        for (size_t ix = 0; ix < xs; ix++) {
292
706k
          dc_values_yx[(iy + y) * xsize_blocks + ix + x] =
293
706k
              dc_y[iy * xs + ix] * q_dc_x;
294
706k
          dc_values_x[(iy + y) * xsize_blocks + ix + x] =
295
706k
              dc_x[iy * xs + ix] * q_dc_x;
296
706k
          dc_values_yb[(iy + y) * xsize_blocks + ix + x] =
297
706k
              dc_y[iy * xs + ix] * q_dc_b;
298
706k
          dc_values_b[(iy + y) * xsize_blocks + ix + x] =
299
706k
              dc_b[iy * xs + ix] * q_dc_b;
300
706k
        }
301
575k
      }
302
303
      // Do not use this block for computing AC CfL.
304
534k
      if (acs.covered_blocks_x() + x0 > x1 ||
305
534k
          acs.covered_blocks_y() + y0 > y1) {
306
0
        continue;
307
0
      }
308
309
      // Copy AC coefficients in the local block. The order in which
310
      // coefficients get stored does not matter.
311
534k
      size_t cx = acs.covered_blocks_x();
312
534k
      size_t cy = acs.covered_blocks_y();
313
534k
      CoefficientLayout(&cy, &cx);
314
      // Zero out LFs. This introduces terms in the optimization loop that
315
      // don't affect the result, as they are all 0, but allow for simpler
316
      // SIMDfication.
317
1.09M
      for (size_t iy = 0; iy < cy; iy++) {
318
1.27M
        for (size_t ix = 0; ix < cx; ix++) {
319
706k
          block_y[cx * kBlockDim * iy + ix] = 0;
320
706k
          block_x[cx * kBlockDim * iy + ix] = 0;
321
706k
          block_b[cx * kBlockDim * iy + ix] = 0;
322
706k
        }
323
564k
      }
324
      // Unclear why this is like it is. (This works slightly better
325
      // than the previous approach which was also a hack.)
326
534k
      const float qq =
327
534k
          (raw_quant_field == nullptr) ? 1.0f : raw_quant_field->Row(y)[x];
328
      // Experimentally values 128-130 seem best -- I don't know why we
329
      // need this multiplier.
330
534k
      const float kStrangeMultiplier = 128;
331
534k
      float q = use_dct8 ? 1 : quantizer->Scale() * kStrangeMultiplier * qq;
332
534k
      const auto qv = Set(df, q);
333
6.18M
      for (size_t i = 0; i < cx * cy * 64; i += Lanes(df)) {
334
5.65M
        const auto b_y = Load(df, block_y + i);
335
5.65M
        const auto b_x = Load(df, block_x + i);
336
5.65M
        const auto b_b = Load(df, block_b + i);
337
5.65M
        const auto qqm_x = Mul(qv, Load(df, qm_x + i));
338
5.65M
        const auto qqm_b = Mul(qv, Load(df, qm_b + i));
339
5.65M
        Store(Mul(b_y, qqm_x), df, coeffs_yx + num_ac);
340
5.65M
        Store(Mul(b_x, qqm_x), df, coeffs_x + num_ac);
341
5.65M
        Store(Mul(b_y, qqm_b), df, coeffs_yb + num_ac);
342
5.65M
        Store(Mul(b_b, qqm_b), df, coeffs_b + num_ac);
343
5.65M
        num_ac += Lanes(df);
344
5.65M
      }
345
534k
    }
346
96.6k
  }
347
12.9k
  JXL_ENSURE(num_ac % Lanes(df) == 0);
348
12.9k
  row_out_x[tx] = FindBestMultiplier(coeffs_yx, coeffs_x, num_ac, 0.0f,
349
12.9k
                                     kDistanceMultiplierAC, fast);
350
12.9k
  row_out_b[tx] =
351
12.9k
      FindBestMultiplier(coeffs_yb, coeffs_b, num_ac, jxl::cms::kYToBRatio,
352
12.9k
                         kDistanceMultiplierAC, fast);
353
12.9k
  return true;
354
12.9k
}
Unexecuted instantiation: jxl::N_SSE2::ComputeTile(jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::DequantMatrices const&, jxl::AcStrategyImage const*, jxl::Plane<int> const*, jxl::Quantizer const*, jxl::RectT<unsigned long> const&, bool, bool, jxl::Plane<signed char>*, jxl::Plane<signed char>*, jxl::Plane<float>*, jxl::Span<float>)
355
356
// NOLINTNEXTLINE(google-readability-namespace-comments)
357
}  // namespace HWY_NAMESPACE
358
}  // namespace jxl
359
HWY_AFTER_NAMESPACE();
360
361
#if HWY_ONCE
362
namespace jxl {
363
364
HWY_EXPORT(InitDCStorage);
365
HWY_EXPORT(ComputeTile);
366
367
162
Status CfLHeuristics::Init(const Rect& rect) {
368
162
  size_t xsize_blocks = rect.xsize() / kBlockDim;
369
162
  size_t ysize_blocks = rect.ysize() / kBlockDim;
370
162
  return HWY_DYNAMIC_DISPATCH(InitDCStorage)(
371
162
      memory_manager, xsize_blocks * ysize_blocks, &dc_values);
372
162
}
373
374
Status CfLHeuristics::ComputeTile(const Rect& r, const Image3F& opsin,
375
                                  const Rect& opsin_rect,
376
                                  const DequantMatrices& dequant,
377
                                  const AcStrategyImage* ac_strategy,
378
                                  const ImageI* raw_quant_field,
379
                                  const Quantizer* quantizer, bool fast,
380
12.9k
                                  size_t thread, ColorCorrelationMap* cmap) {
381
12.9k
  bool use_dct8 = ac_strategy == nullptr;
382
12.9k
  Span<float> scratch(mem.address<float>() + thread * ItemsPerThread(),
383
12.9k
                      ItemsPerThread());
384
12.9k
  return HWY_DYNAMIC_DISPATCH(ComputeTile)(
385
12.9k
      opsin, opsin_rect, dequant, ac_strategy, raw_quant_field, quantizer, r,
386
12.9k
      fast, use_dct8, &cmap->ytox_map, &cmap->ytob_map, &dc_values, scratch);
387
12.9k
}
388
389
Status ColorCorrelationEncodeDC(const ColorCorrelation& color_correlation,
390
                                BitWriter* writer, LayerType layer,
391
162
                                AuxOut* aux_out) {
392
162
  float color_factor = color_correlation.GetColorFactor();
393
162
  float base_correlation_x = color_correlation.GetBaseCorrelationX();
394
162
  float base_correlation_b = color_correlation.GetBaseCorrelationB();
395
162
  int32_t ytox_dc = color_correlation.GetYToXDC();
396
162
  int32_t ytob_dc = color_correlation.GetYToBDC();
397
398
162
  return writer->WithMaxBits(
399
162
      1 + 2 * kBitsPerByte + 12 + 32, layer, aux_out, [&]() -> Status {
400
162
        if (ytox_dc == 0 && ytob_dc == 0 &&
401
162
            color_factor == kDefaultColorFactor && base_correlation_x == 0.0f &&
402
162
            base_correlation_b == jxl::cms::kYToBRatio) {
403
162
          writer->Write(1, 1);
404
162
          return true;
405
162
        }
406
0
        writer->Write(1, 0);
407
0
        JXL_RETURN_IF_ERROR(
408
0
            U32Coder::Write(kColorFactorDist, color_factor, writer));
409
0
        JXL_RETURN_IF_ERROR(F16Coder::Write(base_correlation_x, writer));
410
0
        JXL_RETURN_IF_ERROR(F16Coder::Write(base_correlation_b, writer));
411
0
        writer->Write(kBitsPerByte,
412
0
                      ytox_dc - std::numeric_limits<int8_t>::min());
413
0
        writer->Write(kBitsPerByte,
414
0
                      ytob_dc - std::numeric_limits<int8_t>::min());
415
0
        return true;
416
0
      });
417
162
}
418
419
}  // namespace jxl
420
#endif  // HWY_ONCE