Coverage Report

Created: 2026-01-20 07:37

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libjxl/lib/jxl/enc_group.cc
Line
Count
Source
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/enc_group.h"
7
8
#include <jxl/memory_manager.h>
9
10
#include <algorithm>
11
#include <cmath>
12
#include <cstddef>
13
#include <cstdint>
14
#include <cstdlib>
15
16
#include "lib/jxl/base/common.h"
17
#include "lib/jxl/base/status.h"
18
#include "lib/jxl/chroma_from_luma.h"
19
#include "lib/jxl/coeff_order_fwd.h"
20
#include "lib/jxl/enc_ans.h"
21
#include "lib/jxl/enc_bit_writer.h"
22
#include "lib/jxl/frame_dimensions.h"
23
#include "lib/jxl/memory_manager_internal.h"
24
25
#undef HWY_TARGET_INCLUDE
26
#define HWY_TARGET_INCLUDE "lib/jxl/enc_group.cc"
27
#include <hwy/foreach_target.h>
28
#include <hwy/highway.h>
29
30
#include "lib/jxl/ac_strategy.h"
31
#include "lib/jxl/base/bits.h"
32
#include "lib/jxl/base/compiler_specific.h"
33
#include "lib/jxl/base/rect.h"
34
#include "lib/jxl/common.h"  // kMaxNumPasses
35
#include "lib/jxl/dct_util.h"
36
#include "lib/jxl/dec_transforms-inl.h"
37
#include "lib/jxl/enc_aux_out.h"
38
#include "lib/jxl/enc_cache.h"
39
#include "lib/jxl/enc_params.h"
40
#include "lib/jxl/enc_transforms-inl.h"
41
#include "lib/jxl/image.h"
42
#include "lib/jxl/quantizer-inl.h"
43
#include "lib/jxl/quantizer.h"
44
#include "lib/jxl/simd_util.h"
45
HWY_BEFORE_NAMESPACE();
46
namespace jxl {
47
namespace HWY_NAMESPACE {
48
49
// These templates are not found via ADL.
50
using hwy::HWY_NAMESPACE::Abs;
51
using hwy::HWY_NAMESPACE::Ge;
52
using hwy::HWY_NAMESPACE::IfThenElse;
53
using hwy::HWY_NAMESPACE::IfThenElseZero;
54
using hwy::HWY_NAMESPACE::MaskFromVec;
55
using hwy::HWY_NAMESPACE::Round;
56
57
// NOTE: caller takes care of extracting quant from rect of RawQuantField.
58
void QuantizeBlockAC(const Quantizer& quantizer, const bool error_diffusion,
59
                     size_t c, float qm_multiplier, AcStrategyType quant_kind,
60
                     size_t xsize, size_t ysize, float* thresholds,
61
                     const float* JXL_RESTRICT block_in, const int32_t* quant,
62
9.46M
                     int32_t* JXL_RESTRICT block_out) {
63
9.46M
  const float* JXL_RESTRICT qm = quantizer.InvDequantMatrix(quant_kind, c);
64
9.46M
  float qac = quantizer.Scale() * (*quant);
65
  // Not SIMD-ified for now.
66
9.46M
  if (c != 1 && xsize * ysize >= 4) {
67
2.36M
    for (int i = 0; i < 4; ++i) {
68
1.89M
      thresholds[i] -= 0.00744f * xsize * ysize;
69
1.89M
      if (thresholds[i] < 0.5) {
70
296k
        thresholds[i] = 0.5;
71
296k
      }
72
1.89M
    }
73
473k
  }
74
9.46M
  HWY_CAPPED(float, kBlockDim) df;
75
9.46M
  HWY_CAPPED(int32_t, kBlockDim) di;
76
9.46M
  HWY_CAPPED(uint32_t, kBlockDim) du;
77
9.46M
  const auto quantv = Set(df, qac * qm_multiplier);
78
96.4M
  for (size_t y = 0; y < ysize * kBlockDim; y++) {
79
87.0M
    size_t yfix = static_cast<size_t>(y >= ysize * kBlockDim / 2) * 2;
80
87.0M
    const size_t off = y * kBlockDim * xsize;
81
237M
    for (size_t x = 0; x < xsize * kBlockDim; x += Lanes(df)) {
82
150M
      auto threshold = Zero(df);
83
150M
      if (xsize == 1) {
84
63.5M
        HWY_ALIGN uint32_t kMask[kBlockDim] = {0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u};
85
63.5M
        const auto mask = MaskFromVec(BitCast(df, Load(du, kMask + x)));
86
63.5M
        threshold = IfThenElse(mask, Set(df, thresholds[yfix + 1]),
87
63.5M
                               Set(df, thresholds[yfix]));
88
86.9M
      } else {
89
        // Same for all lanes in the vector.
90
86.9M
        threshold = Set(
91
86.9M
            df,
92
86.9M
            thresholds[yfix + static_cast<size_t>(x >= xsize * kBlockDim / 2)]);
93
86.9M
      }
94
150M
      const auto q = Mul(Load(df, qm + off + x), quantv);
95
150M
      const auto in = Load(df, block_in + off + x);
96
150M
      const auto val = Mul(q, in);
97
150M
      const auto nzero_mask = Ge(Abs(val), threshold);
98
150M
      const auto v = ConvertTo(di, IfThenElseZero(nzero_mask, Round(val)));
99
150M
      Store(v, di, block_out + off + x);
100
150M
    }
101
87.0M
  }
102
9.46M
}
Unexecuted instantiation: jxl::N_SSE4::QuantizeBlockAC(jxl::Quantizer const&, bool, unsigned long, float, jxl::AcStrategyType, unsigned long, unsigned long, float*, float const*, int const*, int*)
jxl::N_AVX2::QuantizeBlockAC(jxl::Quantizer const&, bool, unsigned long, float, jxl::AcStrategyType, unsigned long, unsigned long, float*, float const*, int const*, int*)
Line
Count
Source
62
9.46M
                     int32_t* JXL_RESTRICT block_out) {
63
9.46M
  const float* JXL_RESTRICT qm = quantizer.InvDequantMatrix(quant_kind, c);
64
9.46M
  float qac = quantizer.Scale() * (*quant);
65
  // Not SIMD-ified for now.
66
9.46M
  if (c != 1 && xsize * ysize >= 4) {
67
2.36M
    for (int i = 0; i < 4; ++i) {
68
1.89M
      thresholds[i] -= 0.00744f * xsize * ysize;
69
1.89M
      if (thresholds[i] < 0.5) {
70
296k
        thresholds[i] = 0.5;
71
296k
      }
72
1.89M
    }
73
473k
  }
74
9.46M
  HWY_CAPPED(float, kBlockDim) df;
75
9.46M
  HWY_CAPPED(int32_t, kBlockDim) di;
76
9.46M
  HWY_CAPPED(uint32_t, kBlockDim) du;
77
9.46M
  const auto quantv = Set(df, qac * qm_multiplier);
78
96.4M
  for (size_t y = 0; y < ysize * kBlockDim; y++) {
79
87.0M
    size_t yfix = static_cast<size_t>(y >= ysize * kBlockDim / 2) * 2;
80
87.0M
    const size_t off = y * kBlockDim * xsize;
81
237M
    for (size_t x = 0; x < xsize * kBlockDim; x += Lanes(df)) {
82
150M
      auto threshold = Zero(df);
83
150M
      if (xsize == 1) {
84
63.5M
        HWY_ALIGN uint32_t kMask[kBlockDim] = {0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u};
85
63.5M
        const auto mask = MaskFromVec(BitCast(df, Load(du, kMask + x)));
86
63.5M
        threshold = IfThenElse(mask, Set(df, thresholds[yfix + 1]),
87
63.5M
                               Set(df, thresholds[yfix]));
88
86.9M
      } else {
89
        // Same for all lanes in the vector.
90
86.9M
        threshold = Set(
91
86.9M
            df,
92
86.9M
            thresholds[yfix + static_cast<size_t>(x >= xsize * kBlockDim / 2)]);
93
86.9M
      }
94
150M
      const auto q = Mul(Load(df, qm + off + x), quantv);
95
150M
      const auto in = Load(df, block_in + off + x);
96
150M
      const auto val = Mul(q, in);
97
150M
      const auto nzero_mask = Ge(Abs(val), threshold);
98
150M
      const auto v = ConvertTo(di, IfThenElseZero(nzero_mask, Round(val)));
99
150M
      Store(v, di, block_out + off + x);
100
150M
    }
101
87.0M
  }
102
9.46M
}
Unexecuted instantiation: jxl::N_AVX3::QuantizeBlockAC(jxl::Quantizer const&, bool, unsigned long, float, jxl::AcStrategyType, unsigned long, unsigned long, float*, float const*, int const*, int*)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::QuantizeBlockAC(jxl::Quantizer const&, bool, unsigned long, float, jxl::AcStrategyType, unsigned long, unsigned long, float*, float const*, int const*, int*)
Unexecuted instantiation: jxl::N_AVX3_SPR::QuantizeBlockAC(jxl::Quantizer const&, bool, unsigned long, float, jxl::AcStrategyType, unsigned long, unsigned long, float*, float const*, int const*, int*)
Unexecuted instantiation: jxl::N_SSE2::QuantizeBlockAC(jxl::Quantizer const&, bool, unsigned long, float, jxl::AcStrategyType, unsigned long, unsigned long, float*, float const*, int const*, int*)
103
104
void AdjustQuantBlockAC(const Quantizer& quantizer, size_t c,
105
                        float qm_multiplier, AcStrategyType quant_kind,
106
                        size_t xsize, size_t ysize, float* thresholds,
107
9.46M
                        const float* JXL_RESTRICT block_in, int32_t* quant) {
108
  // No quantization adjusting for these small blocks.
109
  // Quantization adjusting attempts to fix some known issues
110
  // with larger blocks and on the 8x8 dct's emerging 8x8 blockiness
111
  // when there are not many non-zeros.
112
9.46M
  constexpr size_t kPartialBlockKinds =
113
9.46M
      (1 << static_cast<size_t>(AcStrategyType::IDENTITY)) |
114
9.46M
      (1 << static_cast<size_t>(AcStrategyType::DCT2X2)) |
115
9.46M
      (1 << static_cast<size_t>(AcStrategyType::DCT4X4)) |
116
9.46M
      (1 << static_cast<size_t>(AcStrategyType::DCT4X8)) |
117
9.46M
      (1 << static_cast<size_t>(AcStrategyType::DCT8X4)) |
118
9.46M
      (1 << static_cast<size_t>(AcStrategyType::AFV0)) |
119
9.46M
      (1 << static_cast<size_t>(AcStrategyType::AFV1)) |
120
9.46M
      (1 << static_cast<size_t>(AcStrategyType::AFV2)) |
121
9.46M
      (1 << static_cast<size_t>(AcStrategyType::AFV3));
122
9.46M
  if ((1 << static_cast<size_t>(quant_kind)) & kPartialBlockKinds) {
123
4.72M
    return;
124
4.72M
  }
125
126
4.73M
  const float* JXL_RESTRICT qm = quantizer.InvDequantMatrix(quant_kind, c);
127
4.73M
  float qac = quantizer.Scale() * (*quant);
128
4.73M
  if (xsize > 1 || ysize > 1) {
129
7.56M
    for (int i = 0; i < 4; ++i) {
130
6.04M
      thresholds[i] -= Clamp1(0.003f * xsize * ysize, 0.f, 0.08f);
131
6.04M
      if (thresholds[i] < 0.54) {
132
272k
        thresholds[i] = 0.54;
133
272k
      }
134
6.04M
    }
135
1.51M
  }
136
4.73M
  float sum_of_highest_freq_row_and_column = 0;
137
4.73M
  float sum_of_error = 0;
138
4.73M
  float sum_of_vals = 0;
139
4.73M
  float hfNonZeros[4] = {};
140
4.73M
  float hfMaxError[4] = {};
141
142
53.9M
  for (size_t y = 0; y < ysize * kBlockDim; y++) {
143
951M
    for (size_t x = 0; x < xsize * kBlockDim; x++) {
144
901M
      const size_t pos = y * kBlockDim * xsize + x;
145
901M
      if (x < xsize && y < ysize) {
146
14.0M
        continue;
147
14.0M
      }
148
887M
      const size_t hfix = (static_cast<size_t>(y >= ysize * kBlockDim / 2) * 2 +
149
887M
                           static_cast<size_t>(x >= xsize * kBlockDim / 2));
150
887M
      const float val = block_in[pos] * (qm[pos] * qac * qm_multiplier);
151
887M
      const float v = (std::abs(val) < thresholds[hfix]) ? 0 : rintf(val);
152
887M
      const float error = std::abs(val - v);
153
887M
      sum_of_error += error;
154
887M
      sum_of_vals += std::abs(v);
155
887M
      if (c == 1 && v == 0) {
156
250M
        if (hfMaxError[hfix] < error) {
157
16.4M
          hfMaxError[hfix] = error;
158
16.4M
        }
159
250M
      }
160
887M
      if (v != 0.0f) {
161
70.6M
        hfNonZeros[hfix] += std::abs(v);
162
70.6M
        bool in_corner = y >= 7 * ysize && x >= 7 * xsize;
163
70.6M
        bool on_border =
164
70.6M
            y == ysize * kBlockDim - 1 || x == xsize * kBlockDim - 1;
165
70.6M
        bool in_larger_corner = x >= 4 * xsize && y >= 4 * ysize;
166
70.6M
        if (in_corner || (on_border && in_larger_corner)) {
167
2.29M
          sum_of_highest_freq_row_and_column += std::abs(val);
168
2.29M
        }
169
70.6M
      }
170
887M
    }
171
49.1M
  }
172
4.73M
  if (c == 1 && sum_of_vals * 8 < xsize * ysize) {
173
446k
    static const double kLimit[4] = {
174
446k
        0.46,
175
446k
        0.46,
176
446k
        0.46,
177
446k
        0.46,
178
446k
    };
179
446k
    static const double kMul[4] = {
180
446k
        0.9999,
181
446k
        0.9999,
182
446k
        0.9999,
183
446k
        0.9999,
184
446k
    };
185
446k
    const int32_t orig_quant = *quant;
186
446k
    int32_t new_quant = *quant;
187
1.77M
    for (int i = 1; i < 4; ++i) {
188
1.33M
      if (hfNonZeros[i] == 0.0 && hfMaxError[i] > kLimit[i]) {
189
4.71k
        new_quant = orig_quant + 1;
190
4.71k
        break;
191
4.71k
      }
192
1.33M
    }
193
446k
    *quant = new_quant;
194
446k
    if (hfNonZeros[3] == 0.0 && hfMaxError[3] > kLimit[3]) {
195
755
      thresholds[3] = kMul[3] * hfMaxError[3] * new_quant / orig_quant;
196
446k
    } else if ((hfNonZeros[1] == 0.0 && hfMaxError[1] > kLimit[1]) ||
197
443k
               (hfNonZeros[2] == 0.0 && hfMaxError[2] > kLimit[2])) {
198
3.96k
      thresholds[1] = kMul[1] * std::max(hfMaxError[1], hfMaxError[2]) *
199
3.96k
                      new_quant / orig_quant;
200
3.96k
      thresholds[2] = thresholds[1];
201
442k
    } else if (hfNonZeros[0] == 0.0 && hfMaxError[0] > kLimit[0]) {
202
18.4k
      thresholds[0] = kMul[0] * hfMaxError[0] * new_quant / orig_quant;
203
18.4k
    }
204
446k
  }
205
  // Heuristic for improving accuracy of high-frequency patterns
206
  // occurring in an environment with no medium-frequency masking
207
  // patterns.
208
4.73M
  {
209
4.73M
    float all =
210
4.73M
        hfNonZeros[0] + hfNonZeros[1] + hfNonZeros[2] + hfNonZeros[3] + 1;
211
4.73M
    float mul[3] = {70, 30, 60};
212
4.73M
    if (mul[c] * sum_of_highest_freq_row_and_column >= all) {
213
583k
      *quant += mul[c] * sum_of_highest_freq_row_and_column / all;
214
583k
      if (*quant >= Quantizer::kQuantMax) {
215
0
        *quant = Quantizer::kQuantMax - 1;
216
0
      }
217
583k
    }
218
4.73M
  }
219
4.73M
  if (quant_kind == AcStrategyType::DCT) {
220
    // If this 8x8 block is too flat, increase the adaptive quantization level
221
    // a bit to reduce visible block boundaries and requantize the block.
222
3.22M
    if (hfNonZeros[0] + hfNonZeros[1] + hfNonZeros[2] + hfNonZeros[3] < 11) {
223
1.55M
      *quant += 1;
224
1.55M
      if (*quant >= Quantizer::kQuantMax) {
225
0
        *quant = Quantizer::kQuantMax - 1;
226
0
      }
227
1.55M
    }
228
3.22M
  }
229
4.73M
  {
230
4.73M
    static const double kMul1[4][3] = {
231
4.73M
        {
232
4.73M
            0.22080615753848404,
233
4.73M
            0.45797479824262011,
234
4.73M
            0.29859235095977965,
235
4.73M
        },
236
4.73M
        {
237
4.73M
            0.70109486510286834,
238
4.73M
            0.16185281305512639,
239
4.73M
            0.14387691730035473,
240
4.73M
        },
241
4.73M
        {
242
4.73M
            0.114985964456218638,
243
4.73M
            0.44656840441027695,
244
4.73M
            0.10587658215149048,
245
4.73M
        },
246
4.73M
        {
247
4.73M
            0.46849665264409396,
248
4.73M
            0.41239077937781954,
249
4.73M
            0.088667407767185444,
250
4.73M
        },
251
4.73M
    };
252
4.73M
    static const double kMul2[4][3] = {
253
4.73M
        {
254
4.73M
            0.27450281941822197,
255
4.73M
            1.1255766549984996,
256
4.73M
            0.98950459134128388,
257
4.73M
        },
258
4.73M
        {
259
4.73M
            0.4652168675598285,
260
4.73M
            0.40945807983455818,
261
4.73M
            0.36581899811751367,
262
4.73M
        },
263
4.73M
        {
264
4.73M
            0.28034972424715715,
265
4.73M
            0.9182653201929738,
266
4.73M
            1.5581531543057416,
267
4.73M
        },
268
4.73M
        {
269
4.73M
            0.26873118114033728,
270
4.73M
            0.68863712390392484,
271
4.73M
            1.2082185408666786,
272
4.73M
        },
273
4.73M
    };
274
4.73M
    static const double kQuantNormalizer = 2.2942708343284721;
275
4.73M
    sum_of_error *= kQuantNormalizer;
276
4.73M
    sum_of_vals *= kQuantNormalizer;
277
4.73M
    if (quant_kind >= AcStrategyType::DCT16X16) {
278
1.51M
      int ix = 3;
279
1.51M
      if (quant_kind == AcStrategyType::DCT32X16 ||
280
1.41M
          quant_kind == AcStrategyType::DCT16X32) {
281
242k
        ix = 1;
282
1.27M
      } else if (quant_kind == AcStrategyType::DCT16X16) {
283
195k
        ix = 0;
284
1.07M
      } else if (quant_kind == AcStrategyType::DCT32X32) {
285
215k
        ix = 2;
286
215k
      }
287
1.51M
      int step =
288
1.51M
          sum_of_error / (kMul1[ix][c] * xsize * ysize * kBlockDim * kBlockDim +
289
1.51M
                          kMul2[ix][c] * sum_of_vals);
290
1.51M
      if (step >= 2) {
291
9.82k
        step = 2;
292
9.82k
      }
293
1.51M
      if (step < 0) {
294
0
        step = 0;
295
0
      }
296
1.51M
      if (sum_of_error > kMul1[ix][c] * xsize * ysize * kBlockDim * kBlockDim +
297
1.51M
                             kMul2[ix][c] * sum_of_vals) {
298
79.3k
        *quant += step;
299
79.3k
        if (*quant >= Quantizer::kQuantMax) {
300
0
          *quant = Quantizer::kQuantMax - 1;
301
0
        }
302
79.3k
      }
303
1.51M
    }
304
4.73M
  }
305
4.73M
  {
306
    // Reduce quant in highly active areas.
307
4.73M
    int32_t div = (xsize * ysize);
308
4.73M
    int32_t activity = (static_cast<int32_t>(hfNonZeros[0]) + div / 2) / div;
309
4.73M
    int32_t orig_qp_limit = std::max(4, *quant / 2);
310
18.9M
    for (int i = 1; i < 4; ++i) {
311
14.2M
      activity = std::min(
312
14.2M
          activity, (static_cast<int32_t>(hfNonZeros[i]) + div / 2) / div);
313
14.2M
    }
314
4.73M
    if (activity >= 15) {
315
206k
      activity = 15;
316
206k
    }
317
4.73M
    int32_t qp = *quant - activity;
318
4.73M
    if (c == 1) {
319
6.31M
      for (int i = 1; i < 4; ++i) {
320
4.73M
        thresholds[i] += 0.01 * activity;
321
4.73M
      }
322
1.57M
    }
323
4.73M
    if (qp < orig_qp_limit) {
324
1.14M
      qp = orig_qp_limit;
325
1.14M
    }
326
4.73M
    *quant = qp;
327
4.73M
  }
328
4.73M
}
Unexecuted instantiation: jxl::N_SSE4::AdjustQuantBlockAC(jxl::Quantizer const&, unsigned long, float, jxl::AcStrategyType, unsigned long, unsigned long, float*, float const*, int*)
jxl::N_AVX2::AdjustQuantBlockAC(jxl::Quantizer const&, unsigned long, float, jxl::AcStrategyType, unsigned long, unsigned long, float*, float const*, int*)
Line
Count
Source
107
9.46M
                        const float* JXL_RESTRICT block_in, int32_t* quant) {
108
  // No quantization adjusting for these small blocks.
109
  // Quantization adjusting attempts to fix some known issues
110
  // with larger blocks and on the 8x8 dct's emerging 8x8 blockiness
111
  // when there are not many non-zeros.
112
9.46M
  constexpr size_t kPartialBlockKinds =
113
9.46M
      (1 << static_cast<size_t>(AcStrategyType::IDENTITY)) |
114
9.46M
      (1 << static_cast<size_t>(AcStrategyType::DCT2X2)) |
115
9.46M
      (1 << static_cast<size_t>(AcStrategyType::DCT4X4)) |
116
9.46M
      (1 << static_cast<size_t>(AcStrategyType::DCT4X8)) |
117
9.46M
      (1 << static_cast<size_t>(AcStrategyType::DCT8X4)) |
118
9.46M
      (1 << static_cast<size_t>(AcStrategyType::AFV0)) |
119
9.46M
      (1 << static_cast<size_t>(AcStrategyType::AFV1)) |
120
9.46M
      (1 << static_cast<size_t>(AcStrategyType::AFV2)) |
121
9.46M
      (1 << static_cast<size_t>(AcStrategyType::AFV3));
122
9.46M
  if ((1 << static_cast<size_t>(quant_kind)) & kPartialBlockKinds) {
123
4.72M
    return;
124
4.72M
  }
125
126
4.73M
  const float* JXL_RESTRICT qm = quantizer.InvDequantMatrix(quant_kind, c);
127
4.73M
  float qac = quantizer.Scale() * (*quant);
128
4.73M
  if (xsize > 1 || ysize > 1) {
129
7.56M
    for (int i = 0; i < 4; ++i) {
130
6.04M
      thresholds[i] -= Clamp1(0.003f * xsize * ysize, 0.f, 0.08f);
131
6.04M
      if (thresholds[i] < 0.54) {
132
272k
        thresholds[i] = 0.54;
133
272k
      }
134
6.04M
    }
135
1.51M
  }
136
4.73M
  float sum_of_highest_freq_row_and_column = 0;
137
4.73M
  float sum_of_error = 0;
138
4.73M
  float sum_of_vals = 0;
139
4.73M
  float hfNonZeros[4] = {};
140
4.73M
  float hfMaxError[4] = {};
141
142
53.9M
  for (size_t y = 0; y < ysize * kBlockDim; y++) {
143
951M
    for (size_t x = 0; x < xsize * kBlockDim; x++) {
144
901M
      const size_t pos = y * kBlockDim * xsize + x;
145
901M
      if (x < xsize && y < ysize) {
146
14.0M
        continue;
147
14.0M
      }
148
887M
      const size_t hfix = (static_cast<size_t>(y >= ysize * kBlockDim / 2) * 2 +
149
887M
                           static_cast<size_t>(x >= xsize * kBlockDim / 2));
150
887M
      const float val = block_in[pos] * (qm[pos] * qac * qm_multiplier);
151
887M
      const float v = (std::abs(val) < thresholds[hfix]) ? 0 : rintf(val);
152
887M
      const float error = std::abs(val - v);
153
887M
      sum_of_error += error;
154
887M
      sum_of_vals += std::abs(v);
155
887M
      if (c == 1 && v == 0) {
156
250M
        if (hfMaxError[hfix] < error) {
157
16.4M
          hfMaxError[hfix] = error;
158
16.4M
        }
159
250M
      }
160
887M
      if (v != 0.0f) {
161
70.6M
        hfNonZeros[hfix] += std::abs(v);
162
70.6M
        bool in_corner = y >= 7 * ysize && x >= 7 * xsize;
163
70.6M
        bool on_border =
164
70.6M
            y == ysize * kBlockDim - 1 || x == xsize * kBlockDim - 1;
165
70.6M
        bool in_larger_corner = x >= 4 * xsize && y >= 4 * ysize;
166
70.6M
        if (in_corner || (on_border && in_larger_corner)) {
167
2.29M
          sum_of_highest_freq_row_and_column += std::abs(val);
168
2.29M
        }
169
70.6M
      }
170
887M
    }
171
49.1M
  }
172
4.73M
  if (c == 1 && sum_of_vals * 8 < xsize * ysize) {
173
446k
    static const double kLimit[4] = {
174
446k
        0.46,
175
446k
        0.46,
176
446k
        0.46,
177
446k
        0.46,
178
446k
    };
179
446k
    static const double kMul[4] = {
180
446k
        0.9999,
181
446k
        0.9999,
182
446k
        0.9999,
183
446k
        0.9999,
184
446k
    };
185
446k
    const int32_t orig_quant = *quant;
186
446k
    int32_t new_quant = *quant;
187
1.77M
    for (int i = 1; i < 4; ++i) {
188
1.33M
      if (hfNonZeros[i] == 0.0 && hfMaxError[i] > kLimit[i]) {
189
4.71k
        new_quant = orig_quant + 1;
190
4.71k
        break;
191
4.71k
      }
192
1.33M
    }
193
446k
    *quant = new_quant;
194
446k
    if (hfNonZeros[3] == 0.0 && hfMaxError[3] > kLimit[3]) {
195
755
      thresholds[3] = kMul[3] * hfMaxError[3] * new_quant / orig_quant;
196
446k
    } else if ((hfNonZeros[1] == 0.0 && hfMaxError[1] > kLimit[1]) ||
197
443k
               (hfNonZeros[2] == 0.0 && hfMaxError[2] > kLimit[2])) {
198
3.96k
      thresholds[1] = kMul[1] * std::max(hfMaxError[1], hfMaxError[2]) *
199
3.96k
                      new_quant / orig_quant;
200
3.96k
      thresholds[2] = thresholds[1];
201
442k
    } else if (hfNonZeros[0] == 0.0 && hfMaxError[0] > kLimit[0]) {
202
18.4k
      thresholds[0] = kMul[0] * hfMaxError[0] * new_quant / orig_quant;
203
18.4k
    }
204
446k
  }
205
  // Heuristic for improving accuracy of high-frequency patterns
206
  // occurring in an environment with no medium-frequency masking
207
  // patterns.
208
4.73M
  {
209
4.73M
    float all =
210
4.73M
        hfNonZeros[0] + hfNonZeros[1] + hfNonZeros[2] + hfNonZeros[3] + 1;
211
4.73M
    float mul[3] = {70, 30, 60};
212
4.73M
    if (mul[c] * sum_of_highest_freq_row_and_column >= all) {
213
583k
      *quant += mul[c] * sum_of_highest_freq_row_and_column / all;
214
583k
      if (*quant >= Quantizer::kQuantMax) {
215
0
        *quant = Quantizer::kQuantMax - 1;
216
0
      }
217
583k
    }
218
4.73M
  }
219
4.73M
  if (quant_kind == AcStrategyType::DCT) {
220
    // If this 8x8 block is too flat, increase the adaptive quantization level
221
    // a bit to reduce visible block boundaries and requantize the block.
222
3.22M
    if (hfNonZeros[0] + hfNonZeros[1] + hfNonZeros[2] + hfNonZeros[3] < 11) {
223
1.55M
      *quant += 1;
224
1.55M
      if (*quant >= Quantizer::kQuantMax) {
225
0
        *quant = Quantizer::kQuantMax - 1;
226
0
      }
227
1.55M
    }
228
3.22M
  }
229
4.73M
  {
230
4.73M
    static const double kMul1[4][3] = {
231
4.73M
        {
232
4.73M
            0.22080615753848404,
233
4.73M
            0.45797479824262011,
234
4.73M
            0.29859235095977965,
235
4.73M
        },
236
4.73M
        {
237
4.73M
            0.70109486510286834,
238
4.73M
            0.16185281305512639,
239
4.73M
            0.14387691730035473,
240
4.73M
        },
241
4.73M
        {
242
4.73M
            0.114985964456218638,
243
4.73M
            0.44656840441027695,
244
4.73M
            0.10587658215149048,
245
4.73M
        },
246
4.73M
        {
247
4.73M
            0.46849665264409396,
248
4.73M
            0.41239077937781954,
249
4.73M
            0.088667407767185444,
250
4.73M
        },
251
4.73M
    };
252
4.73M
    static const double kMul2[4][3] = {
253
4.73M
        {
254
4.73M
            0.27450281941822197,
255
4.73M
            1.1255766549984996,
256
4.73M
            0.98950459134128388,
257
4.73M
        },
258
4.73M
        {
259
4.73M
            0.4652168675598285,
260
4.73M
            0.40945807983455818,
261
4.73M
            0.36581899811751367,
262
4.73M
        },
263
4.73M
        {
264
4.73M
            0.28034972424715715,
265
4.73M
            0.9182653201929738,
266
4.73M
            1.5581531543057416,
267
4.73M
        },
268
4.73M
        {
269
4.73M
            0.26873118114033728,
270
4.73M
            0.68863712390392484,
271
4.73M
            1.2082185408666786,
272
4.73M
        },
273
4.73M
    };
274
4.73M
    static const double kQuantNormalizer = 2.2942708343284721;
275
4.73M
    sum_of_error *= kQuantNormalizer;
276
4.73M
    sum_of_vals *= kQuantNormalizer;
277
4.73M
    if (quant_kind >= AcStrategyType::DCT16X16) {
278
1.51M
      int ix = 3;
279
1.51M
      if (quant_kind == AcStrategyType::DCT32X16 ||
280
1.41M
          quant_kind == AcStrategyType::DCT16X32) {
281
242k
        ix = 1;
282
1.27M
      } else if (quant_kind == AcStrategyType::DCT16X16) {
283
195k
        ix = 0;
284
1.07M
      } else if (quant_kind == AcStrategyType::DCT32X32) {
285
215k
        ix = 2;
286
215k
      }
287
1.51M
      int step =
288
1.51M
          sum_of_error / (kMul1[ix][c] * xsize * ysize * kBlockDim * kBlockDim +
289
1.51M
                          kMul2[ix][c] * sum_of_vals);
290
1.51M
      if (step >= 2) {
291
9.82k
        step = 2;
292
9.82k
      }
293
1.51M
      if (step < 0) {
294
0
        step = 0;
295
0
      }
296
1.51M
      if (sum_of_error > kMul1[ix][c] * xsize * ysize * kBlockDim * kBlockDim +
297
1.51M
                             kMul2[ix][c] * sum_of_vals) {
298
79.3k
        *quant += step;
299
79.3k
        if (*quant >= Quantizer::kQuantMax) {
300
0
          *quant = Quantizer::kQuantMax - 1;
301
0
        }
302
79.3k
      }
303
1.51M
    }
304
4.73M
  }
305
4.73M
  {
306
    // Reduce quant in highly active areas.
307
4.73M
    int32_t div = (xsize * ysize);
308
4.73M
    int32_t activity = (static_cast<int32_t>(hfNonZeros[0]) + div / 2) / div;
309
4.73M
    int32_t orig_qp_limit = std::max(4, *quant / 2);
310
18.9M
    for (int i = 1; i < 4; ++i) {
311
14.2M
      activity = std::min(
312
14.2M
          activity, (static_cast<int32_t>(hfNonZeros[i]) + div / 2) / div);
313
14.2M
    }
314
4.73M
    if (activity >= 15) {
315
206k
      activity = 15;
316
206k
    }
317
4.73M
    int32_t qp = *quant - activity;
318
4.73M
    if (c == 1) {
319
6.31M
      for (int i = 1; i < 4; ++i) {
320
4.73M
        thresholds[i] += 0.01 * activity;
321
4.73M
      }
322
1.57M
    }
323
4.73M
    if (qp < orig_qp_limit) {
324
1.14M
      qp = orig_qp_limit;
325
1.14M
    }
326
4.73M
    *quant = qp;
327
4.73M
  }
328
4.73M
}
Unexecuted instantiation: jxl::N_AVX3::AdjustQuantBlockAC(jxl::Quantizer const&, unsigned long, float, jxl::AcStrategyType, unsigned long, unsigned long, float*, float const*, int*)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::AdjustQuantBlockAC(jxl::Quantizer const&, unsigned long, float, jxl::AcStrategyType, unsigned long, unsigned long, float*, float const*, int*)
Unexecuted instantiation: jxl::N_AVX3_SPR::AdjustQuantBlockAC(jxl::Quantizer const&, unsigned long, float, jxl::AcStrategyType, unsigned long, unsigned long, float*, float const*, int*)
Unexecuted instantiation: jxl::N_SSE2::AdjustQuantBlockAC(jxl::Quantizer const&, unsigned long, float, jxl::AcStrategyType, unsigned long, unsigned long, float*, float const*, int*)
329
330
// NOTE: caller takes care of extracting quant from rect of RawQuantField.
331
void QuantizeRoundtripYBlockAC(PassesEncoderState* enc_state, const size_t size,
332
                               const Quantizer& quantizer,
333
                               const bool error_diffusion,
334
                               AcStrategyType quant_kind, size_t xsize,
335
                               size_t ysize, const float* JXL_RESTRICT biases,
336
                               int32_t* quant, float* JXL_RESTRICT inout,
337
3.15M
                               int32_t* JXL_RESTRICT quantized) {
338
3.15M
  float thres_y[4] = {0.58f, 0.64f, 0.64f, 0.64f};
339
3.15M
  if (enc_state->cparams.speed_tier <= SpeedTier::kHare) {
340
3.15M
    int32_t max_quant = 0;
341
3.15M
    int quant_orig = *quant;
342
3.15M
    float val[3] = {enc_state->x_qm_multiplier, 1.0f,
343
3.15M
                    enc_state->b_qm_multiplier};
344
9.46M
    for (int c : {1, 0, 2}) {
345
9.46M
      float thres[4] = {0.58f, 0.64f, 0.64f, 0.64f};
346
9.46M
      *quant = quant_orig;
347
9.46M
      AdjustQuantBlockAC(quantizer, c, val[c], quant_kind, xsize, ysize,
348
9.46M
                         &thres[0], inout + c * size, quant);
349
      // Dead zone adjustment
350
9.46M
      if (c == 1) {
351
15.7M
        for (int k = 0; k < 4; ++k) {
352
12.6M
          thres_y[k] = thres[k];
353
12.6M
        }
354
3.15M
      }
355
9.46M
      max_quant = std::max(*quant, max_quant);
356
9.46M
    }
357
3.15M
    *quant = max_quant;
358
3.15M
  } else {
359
0
    thres_y[0] = 0.56;
360
0
    thres_y[1] = 0.62;
361
0
    thres_y[2] = 0.62;
362
0
    thres_y[3] = 0.62;
363
0
  }
364
365
3.15M
  QuantizeBlockAC(quantizer, error_diffusion, 1, 1.0f, quant_kind, xsize, ysize,
366
3.15M
                  &thres_y[0], inout + size, quant, quantized + size);
367
368
3.15M
  const float* JXL_RESTRICT dequant_matrix =
369
3.15M
      quantizer.DequantMatrix(quant_kind, 1);
370
371
3.15M
  HWY_CAPPED(float, kDCTBlockSize) df;
372
3.15M
  HWY_CAPPED(int32_t, kDCTBlockSize) di;
373
3.15M
  const auto inv_qac = Set(df, quantizer.inv_quant_ac(*quant));
374
53.3M
  for (size_t k = 0; k < kDCTBlockSize * xsize * ysize; k += Lanes(df)) {
375
50.1M
    const auto oquant = Load(di, quantized + size + k);
376
50.1M
    const auto adj_quant = AdjustQuantBias(di, 1, oquant, biases);
377
50.1M
    const auto dequantm = Load(df, dequant_matrix + k);
378
50.1M
    Store(Mul(Mul(adj_quant, dequantm), inv_qac), df, inout + size + k);
379
50.1M
  }
380
3.15M
}
Unexecuted instantiation: jxl::N_SSE4::QuantizeRoundtripYBlockAC(jxl::PassesEncoderState*, unsigned long, jxl::Quantizer const&, bool, jxl::AcStrategyType, unsigned long, unsigned long, float const*, int*, float*, int*)
jxl::N_AVX2::QuantizeRoundtripYBlockAC(jxl::PassesEncoderState*, unsigned long, jxl::Quantizer const&, bool, jxl::AcStrategyType, unsigned long, unsigned long, float const*, int*, float*, int*)
Line
Count
Source
337
3.15M
                               int32_t* JXL_RESTRICT quantized) {
338
3.15M
  float thres_y[4] = {0.58f, 0.64f, 0.64f, 0.64f};
339
3.15M
  if (enc_state->cparams.speed_tier <= SpeedTier::kHare) {
340
3.15M
    int32_t max_quant = 0;
341
3.15M
    int quant_orig = *quant;
342
3.15M
    float val[3] = {enc_state->x_qm_multiplier, 1.0f,
343
3.15M
                    enc_state->b_qm_multiplier};
344
9.46M
    for (int c : {1, 0, 2}) {
345
9.46M
      float thres[4] = {0.58f, 0.64f, 0.64f, 0.64f};
346
9.46M
      *quant = quant_orig;
347
9.46M
      AdjustQuantBlockAC(quantizer, c, val[c], quant_kind, xsize, ysize,
348
9.46M
                         &thres[0], inout + c * size, quant);
349
      // Dead zone adjustment
350
9.46M
      if (c == 1) {
351
15.7M
        for (int k = 0; k < 4; ++k) {
352
12.6M
          thres_y[k] = thres[k];
353
12.6M
        }
354
3.15M
      }
355
9.46M
      max_quant = std::max(*quant, max_quant);
356
9.46M
    }
357
3.15M
    *quant = max_quant;
358
3.15M
  } else {
359
0
    thres_y[0] = 0.56;
360
0
    thres_y[1] = 0.62;
361
0
    thres_y[2] = 0.62;
362
0
    thres_y[3] = 0.62;
363
0
  }
364
365
3.15M
  QuantizeBlockAC(quantizer, error_diffusion, 1, 1.0f, quant_kind, xsize, ysize,
366
3.15M
                  &thres_y[0], inout + size, quant, quantized + size);
367
368
3.15M
  const float* JXL_RESTRICT dequant_matrix =
369
3.15M
      quantizer.DequantMatrix(quant_kind, 1);
370
371
3.15M
  HWY_CAPPED(float, kDCTBlockSize) df;
372
3.15M
  HWY_CAPPED(int32_t, kDCTBlockSize) di;
373
3.15M
  const auto inv_qac = Set(df, quantizer.inv_quant_ac(*quant));
374
53.3M
  for (size_t k = 0; k < kDCTBlockSize * xsize * ysize; k += Lanes(df)) {
375
50.1M
    const auto oquant = Load(di, quantized + size + k);
376
50.1M
    const auto adj_quant = AdjustQuantBias(di, 1, oquant, biases);
377
50.1M
    const auto dequantm = Load(df, dequant_matrix + k);
378
50.1M
    Store(Mul(Mul(adj_quant, dequantm), inv_qac), df, inout + size + k);
379
50.1M
  }
380
3.15M
}
Unexecuted instantiation: jxl::N_AVX3::QuantizeRoundtripYBlockAC(jxl::PassesEncoderState*, unsigned long, jxl::Quantizer const&, bool, jxl::AcStrategyType, unsigned long, unsigned long, float const*, int*, float*, int*)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::QuantizeRoundtripYBlockAC(jxl::PassesEncoderState*, unsigned long, jxl::Quantizer const&, bool, jxl::AcStrategyType, unsigned long, unsigned long, float const*, int*, float*, int*)
Unexecuted instantiation: jxl::N_AVX3_SPR::QuantizeRoundtripYBlockAC(jxl::PassesEncoderState*, unsigned long, jxl::Quantizer const&, bool, jxl::AcStrategyType, unsigned long, unsigned long, float const*, int*, float*, int*)
Unexecuted instantiation: jxl::N_SSE2::QuantizeRoundtripYBlockAC(jxl::PassesEncoderState*, unsigned long, jxl::Quantizer const&, bool, jxl::AcStrategyType, unsigned long, unsigned long, float const*, int*, float*, int*)
381
382
// Computes the quantized AC coefficients and the DC values of one group.
//
// For each block that starts a transform (IsFirstBlock()), this:
//   1. transforms all three opsin channels into coefficients,
//   2. extracts the Y DC and roundtrip-quantizes the Y channel (which may
//      change the block's quant value),
//   3. subtracts the Y-correlated part from X and B using the factors of the
//      color tile the block belongs to,
//   4. quantizes X and B and extracts their DC,
//   5. writes the quant value back to the raw quant field and passes the
//      quantized coefficients to the progressive splitter.
//
// Errors from the scratch allocations and failed JXL_ENSURE checks are
// returned to the caller; otherwise the function returns true.
Status ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state,
                           const Image3F& opsin, const Rect& rect,
                           Image3F* dc) {
  JxlMemoryManager* memory_manager = opsin.memory_manager();
  auto& shared = enc_state->shared;
  const CompressParams& cparams = enc_state->cparams;
  ImageI& full_quant_field = shared.raw_quant_field;

  // Group geometry: in blocks, in color tiles, and in pixels of `opsin`.
  const Rect block_group_rect = shared.frame_dim.BlockGroupRect(group_idx);
  const size_t xsize_blocks = block_group_rect.xsize();
  const size_t ysize_blocks = block_group_rect.ysize();
  const size_t num_tiles_x = DivCeil(xsize_blocks, kColorTileDimInBlocks);
  const Rect cmap_rect(block_group_rect.x0() / kColorTileDimInBlocks,
                       block_group_rect.y0() / kColorTileDimInBlocks,
                       num_tiles_x,
                       DivCeil(ysize_blocks, kColorTileDimInBlocks));
  const Rect group_rect =
      shared.frame_dim.GroupRect(group_idx).Translate(rect.x0(), rect.y0());

  const size_t dc_stride = static_cast<size_t>(dc->PixelsPerRow());
  const size_t opsin_stride = static_cast<size_t>(opsin.PixelsPerRow());

  const size_t dct_scratch_size =
      3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim;

  // TODO(veluca): consider strategies to reduce this memory.
  // Integer buffer: three planes of quantized coefficients.
  const size_t quantized_bytes =
      3 * AcStrategy::kMaxCoeffArea * sizeof(int32_t);
  JXL_ASSIGN_OR_RETURN(auto quantized_mem,
                       AlignedMemory::Create(memory_manager, quantized_bytes));
  // Float buffer: coefficient planes at the front, transform scratch space
  // starting after the first three maximum-size planes.
  const size_t float_bytes =
      (5 * AcStrategy::kMaxCoeffArea + dct_scratch_size) * sizeof(float);
  JXL_ASSIGN_OR_RETURN(auto float_mem,
                       AlignedMemory::Create(memory_manager, float_bytes));
  float* JXL_RESTRICT scratch_space =
      float_mem.address<float>() + 3 * AcStrategy::kMaxCoeffArea;

  // Only use error diffusion in Squirrel mode or slower.
  const bool error_diffusion = cparams.speed_tier <= SpeedTier::kSquirrel;
  constexpr HWY_CAPPED(float, kDCTBlockSize) d;

  // Per-channel, per-pass output cursors into this group's coefficients.
  int32_t* JXL_RESTRICT coeffs[3][kMaxNumPasses] = {};
  const size_t num_passes = enc_state->progressive_splitter.GetNumPasses();
  JXL_ENSURE(num_passes > 0);
  for (size_t pass = 0; pass < num_passes; pass++) {
    // TODO(veluca): 16-bit quantized coeffs are not implemented yet.
    JXL_ENSURE(enc_state->coeffs[pass]->Type() == ACType::k32);
    for (size_t c = 0; c < 3; c++) {
      coeffs[c][pass] =
          enc_state->coeffs[pass]->PlaneRow(c, group_idx, 0).ptr32;
    }
  }

  HWY_ALIGN float* coeffs_in = float_mem.address<float>();
  HWY_ALIGN int32_t* quantized = quantized_mem.address<int32_t>();

  for (size_t by = 0; by < ysize_blocks; ++by) {
    const size_t ty = by / kColorTileDimInBlocks;
    const size_t pixel_y = by * kBlockDim;

    int32_t* JXL_RESTRICT row_quant_ac =
        block_group_rect.Row(&full_quant_field, by);
    const int8_t* JXL_RESTRICT row_ytox =
        cmap_rect.ConstRow(shared.cmap.ytox_map, ty);
    const int8_t* JXL_RESTRICT row_ytob =
        cmap_rect.ConstRow(shared.cmap.ytob_map, ty);
    const float* JXL_RESTRICT opsin_rows[3] = {
        group_rect.ConstPlaneRow(opsin, 0, pixel_y),
        group_rect.ConstPlaneRow(opsin, 1, pixel_y),
        group_rect.ConstPlaneRow(opsin, 2, pixel_y),
    };
    float* JXL_RESTRICT dc_rows[3] = {
        block_group_rect.PlaneRow(dc, 0, by),
        block_group_rect.PlaneRow(dc, 1, by),
        block_group_rect.PlaneRow(dc, 2, by),
    };
    AcStrategyRow ac_strategy_row =
        shared.ac_strategy.ConstRow(block_group_rect, by);

    for (size_t tx = 0; tx < num_tiles_x; tx++) {
      // The correlation factors are constant within one color tile.
      const auto x_factor =
          Set(d, shared.cmap.base().YtoXRatio(row_ytox[tx]));
      const auto b_factor =
          Set(d, shared.cmap.base().YtoBRatio(row_ytob[tx]));

      const size_t bx_begin = tx * kColorTileDimInBlocks;
      const size_t bx_end = std::min<size_t>(
          xsize_blocks, bx_begin + kColorTileDimInBlocks);
      for (size_t bx = bx_begin; bx < bx_end; ++bx) {
        const AcStrategy acs = ac_strategy_row[bx];
        if (!acs.IsFirstBlock()) continue;
        const auto strategy = acs.Strategy();

        size_t xblocks = acs.covered_blocks_x();
        size_t yblocks = acs.covered_blocks_y();
        CoefficientLayout(&yblocks, &xblocks);
        const size_t size = kDCTBlockSize * xblocks * yblocks;

        float* const x_coeffs = coeffs_in;
        float* const y_coeffs = coeffs_in + size;
        float* const b_coeffs = coeffs_in + 2 * size;

        int32_t quant_ac = row_quant_ac[bx];

        // Transform all three channels, then set the Y DC and
        // roundtrip-quantize the Y channel.
        for (size_t c = 0; c < 3; ++c) {
          TransformFromPixels(strategy, opsin_rows[c] + bx * kBlockDim,
                              opsin_stride, coeffs_in + c * size,
                              scratch_space);
        }
        DCFromLowestFrequencies(strategy, y_coeffs, dc_rows[1] + bx,
                                dc_stride, scratch_space);
        QuantizeRoundtripYBlockAC(enc_state, size, shared.quantizer,
                                  error_diffusion, strategy, xblocks, yblocks,
                                  kDefaultQuantBias, &quant_ac, coeffs_in,
                                  quantized);

        // Unapply color correlation: X -= x_factor * Y, B -= b_factor * Y.
        for (size_t pos = 0; pos < size; pos += Lanes(d)) {
          const auto x_val = Load(d, x_coeffs + pos);
          const auto y_val = Load(d, y_coeffs + pos);
          const auto b_val = Load(d, b_coeffs + pos);
          Store(NegMulAdd(x_factor, y_val, x_val), d, x_coeffs + pos);
          Store(NegMulAdd(b_factor, y_val, b_val), d, b_coeffs + pos);
        }

        // Quantize X and B channels and set DC.
        for (size_t c : {0, 2}) {
          float thres[4] = {0.58f, 0.62f, 0.62f, 0.62f};
          const float qm_multiplier = (c == 0) ? enc_state->x_qm_multiplier
                                               : enc_state->b_qm_multiplier;
          float* const channel_coeffs = coeffs_in + c * size;
          QuantizeBlockAC(shared.quantizer, error_diffusion, c, qm_multiplier,
                          strategy, xblocks, yblocks, &thres[0],
                          channel_coeffs, &quant_ac, quantized + c * size);
          DCFromLowestFrequencies(strategy, channel_coeffs, dc_rows[c] + bx,
                                  dc_stride, scratch_space);
        }

        // Publish the quant value used for this block.
        row_quant_ac[bx] = quant_ac;

        // Distribute the coefficients over the passes and advance the
        // output cursors past this block.
        for (size_t c = 0; c < 3; c++) {
          enc_state->progressive_splitter.SplitACCoefficients(
              quantized + c * size, acs, bx, by, coeffs[c]);
          for (size_t pass = 0; pass < num_passes; pass++) {
            coeffs[c][pass] += size;
          }
        }
      }
    }
  }
  return true;
}
Unexecuted instantiation: jxl::N_SSE4::ComputeCoefficients(unsigned long, jxl::PassesEncoderState*, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::Image3<float>*)
jxl::N_AVX2::ComputeCoefficients(unsigned long, jxl::PassesEncoderState*, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::Image3<float>*)
Line
Count
Source
384
10.7k
                           Image3F* dc) {
385
10.7k
  JxlMemoryManager* memory_manager = opsin.memory_manager();
386
10.7k
  const Rect block_group_rect =
387
10.7k
      enc_state->shared.frame_dim.BlockGroupRect(group_idx);
388
10.7k
  const Rect cmap_rect(
389
10.7k
      block_group_rect.x0() / kColorTileDimInBlocks,
390
10.7k
      block_group_rect.y0() / kColorTileDimInBlocks,
391
10.7k
      DivCeil(block_group_rect.xsize(), kColorTileDimInBlocks),
392
10.7k
      DivCeil(block_group_rect.ysize(), kColorTileDimInBlocks));
393
10.7k
  const Rect group_rect =
394
10.7k
      enc_state->shared.frame_dim.GroupRect(group_idx).Translate(rect.x0(),
395
10.7k
                                                                 rect.y0());
396
397
10.7k
  const size_t xsize_blocks = block_group_rect.xsize();
398
10.7k
  const size_t ysize_blocks = block_group_rect.ysize();
399
400
10.7k
  const size_t dc_stride = static_cast<size_t>(dc->PixelsPerRow());
401
10.7k
  const size_t opsin_stride = static_cast<size_t>(opsin.PixelsPerRow());
402
403
10.7k
  ImageI& full_quant_field = enc_state->shared.raw_quant_field;
404
10.7k
  const CompressParams& cparams = enc_state->cparams;
405
406
10.7k
  const size_t dct_scratch_size =
407
10.7k
      3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim;
408
409
  // TODO(veluca): consider strategies to reduce this memory.
410
10.7k
  size_t mem_bytes = 3 * AcStrategy::kMaxCoeffArea * sizeof(int32_t);
411
10.7k
  JXL_ASSIGN_OR_RETURN(auto mem,
412
10.7k
                       AlignedMemory::Create(memory_manager, mem_bytes));
413
10.7k
  size_t fmem_bytes =
414
10.7k
      (5 * AcStrategy::kMaxCoeffArea + dct_scratch_size) * sizeof(float);
415
10.7k
  JXL_ASSIGN_OR_RETURN(auto fmem,
416
10.7k
                       AlignedMemory::Create(memory_manager, fmem_bytes));
417
10.7k
  float* JXL_RESTRICT scratch_space =
418
10.7k
      fmem.address<float>() + 3 * AcStrategy::kMaxCoeffArea;
419
10.7k
  {
420
    // Only use error diffusion in Squirrel mode or slower.
421
10.7k
    const bool error_diffusion = cparams.speed_tier <= SpeedTier::kSquirrel;
422
10.7k
    constexpr HWY_CAPPED(float, kDCTBlockSize) d;
423
424
10.7k
    int32_t* JXL_RESTRICT coeffs[3][kMaxNumPasses] = {};
425
10.7k
    size_t num_passes = enc_state->progressive_splitter.GetNumPasses();
426
10.7k
    JXL_ENSURE(num_passes > 0);
427
21.4k
    for (size_t i = 0; i < num_passes; i++) {
428
      // TODO(veluca): 16-bit quantized coeffs are not implemented yet.
429
10.7k
      JXL_ENSURE(enc_state->coeffs[i]->Type() == ACType::k32);
430
42.8k
      for (size_t c = 0; c < 3; c++) {
431
32.1k
        coeffs[c][i] = enc_state->coeffs[i]->PlaneRow(c, group_idx, 0).ptr32;
432
32.1k
      }
433
10.7k
    }
434
435
10.7k
    HWY_ALIGN float* coeffs_in = fmem.address<float>();
436
10.7k
    HWY_ALIGN int32_t* quantized = mem.address<int32_t>();
437
438
260k
    for (size_t by = 0; by < ysize_blocks; ++by) {
439
249k
      int32_t* JXL_RESTRICT row_quant_ac =
440
249k
          block_group_rect.Row(&full_quant_field, by);
441
249k
      size_t ty = by / kColorTileDimInBlocks;
442
249k
      const int8_t* JXL_RESTRICT row_cmap[3] = {
443
249k
          cmap_rect.ConstRow(enc_state->shared.cmap.ytox_map, ty),
444
249k
          nullptr,
445
249k
          cmap_rect.ConstRow(enc_state->shared.cmap.ytob_map, ty),
446
249k
      };
447
249k
      const float* JXL_RESTRICT opsin_rows[3] = {
448
249k
          group_rect.ConstPlaneRow(opsin, 0, by * kBlockDim),
449
249k
          group_rect.ConstPlaneRow(opsin, 1, by * kBlockDim),
450
249k
          group_rect.ConstPlaneRow(opsin, 2, by * kBlockDim),
451
249k
      };
452
249k
      float* JXL_RESTRICT dc_rows[3] = {
453
249k
          block_group_rect.PlaneRow(dc, 0, by),
454
249k
          block_group_rect.PlaneRow(dc, 1, by),
455
249k
          block_group_rect.PlaneRow(dc, 2, by),
456
249k
      };
457
249k
      AcStrategyRow ac_strategy_row =
458
249k
          enc_state->shared.ac_strategy.ConstRow(block_group_rect, by);
459
1.09M
      for (size_t tx = 0; tx < DivCeil(xsize_blocks, kColorTileDimInBlocks);
460
846k
           tx++) {
461
846k
        const auto x_factor =
462
846k
            Set(d, enc_state->shared.cmap.base().YtoXRatio(row_cmap[0][tx]));
463
846k
        const auto b_factor =
464
846k
            Set(d, enc_state->shared.cmap.base().YtoBRatio(row_cmap[2][tx]));
465
846k
        for (size_t bx = tx * kColorTileDimInBlocks;
466
7.11M
             bx < xsize_blocks && bx < (tx + 1) * kColorTileDimInBlocks; ++bx) {
467
6.27M
          const AcStrategy acs = ac_strategy_row[bx];
468
6.27M
          if (!acs.IsFirstBlock()) continue;
469
470
3.15M
          size_t xblocks = acs.covered_blocks_x();
471
3.15M
          size_t yblocks = acs.covered_blocks_y();
472
473
3.15M
          CoefficientLayout(&yblocks, &xblocks);
474
475
3.15M
          size_t size = kDCTBlockSize * xblocks * yblocks;
476
477
          // DCT Y channel, roundtrip-quantize it and set DC.
478
3.15M
          int32_t quant_ac = row_quant_ac[bx];
479
9.46M
          for (size_t c : {0, 1, 2}) {
480
9.46M
            TransformFromPixels(acs.Strategy(), opsin_rows[c] + bx * kBlockDim,
481
9.46M
                                opsin_stride, coeffs_in + c * size,
482
9.46M
                                scratch_space);
483
9.46M
          }
484
3.15M
          DCFromLowestFrequencies(acs.Strategy(), coeffs_in + size,
485
3.15M
                                  dc_rows[1] + bx, dc_stride, scratch_space);
486
487
3.15M
          QuantizeRoundtripYBlockAC(
488
3.15M
              enc_state, size, enc_state->shared.quantizer, error_diffusion,
489
3.15M
              acs.Strategy(), xblocks, yblocks, kDefaultQuantBias, &quant_ac,
490
3.15M
              coeffs_in, quantized);
491
492
          // Unapply color correlation
493
53.3M
          for (size_t k = 0; k < size; k += Lanes(d)) {
494
50.1M
            const auto in_x = Load(d, coeffs_in + k);
495
50.1M
            const auto in_y = Load(d, coeffs_in + size + k);
496
50.1M
            const auto in_b = Load(d, coeffs_in + 2 * size + k);
497
50.1M
            const auto out_x = NegMulAdd(x_factor, in_y, in_x);
498
50.1M
            const auto out_b = NegMulAdd(b_factor, in_y, in_b);
499
50.1M
            Store(out_x, d, coeffs_in + k);
500
50.1M
            Store(out_b, d, coeffs_in + 2 * size + k);
501
50.1M
          }
502
503
          // Quantize X and B channels and set DC.
504
6.30M
          for (size_t c : {0, 2}) {
505
6.30M
            float thres[4] = {0.58f, 0.62f, 0.62f, 0.62f};
506
6.30M
            QuantizeBlockAC(enc_state->shared.quantizer, error_diffusion, c,
507
6.30M
                            c == 0 ? enc_state->x_qm_multiplier
508
6.30M
                                   : enc_state->b_qm_multiplier,
509
6.30M
                            acs.Strategy(), xblocks, yblocks, &thres[0],
510
6.30M
                            coeffs_in + c * size, &quant_ac,
511
6.30M
                            quantized + c * size);
512
6.30M
            DCFromLowestFrequencies(acs.Strategy(), coeffs_in + c * size,
513
6.30M
                                    dc_rows[c] + bx, dc_stride, scratch_space);
514
6.30M
          }
515
3.15M
          row_quant_ac[bx] = quant_ac;
516
12.6M
          for (size_t c = 0; c < 3; c++) {
517
9.46M
            enc_state->progressive_splitter.SplitACCoefficients(
518
9.46M
                quantized + c * size, acs, bx, by, coeffs[c]);
519
18.9M
            for (size_t p = 0; p < num_passes; p++) {
520
9.46M
              coeffs[c][p] += size;
521
9.46M
            }
522
9.46M
          }
523
3.15M
        }
524
846k
      }
525
249k
    }
526
10.7k
  }
527
0
  return true;
528
10.7k
}
Unexecuted instantiation: jxl::N_AVX3::ComputeCoefficients(unsigned long, jxl::PassesEncoderState*, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::Image3<float>*)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::ComputeCoefficients(unsigned long, jxl::PassesEncoderState*, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::Image3<float>*)
Unexecuted instantiation: jxl::N_AVX3_SPR::ComputeCoefficients(unsigned long, jxl::PassesEncoderState*, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::Image3<float>*)
Unexecuted instantiation: jxl::N_SSE2::ComputeCoefficients(unsigned long, jxl::PassesEncoderState*, jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::Image3<float>*)
529
530
// NOLINTNEXTLINE(google-readability-namespace-comments)
531
}  // namespace HWY_NAMESPACE
532
}  // namespace jxl
533
HWY_AFTER_NAMESPACE();
534
535
#if HWY_ONCE
536
namespace jxl {
537
HWY_EXPORT(ComputeCoefficients);
538
// Target-independent entry point: forwards all arguments unchanged to the
// ComputeCoefficients implementation selected by HWY_DYNAMIC_DISPATCH and
// returns its Status.
Status ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state,
                           const Image3F& opsin, const Rect& rect,
                           Image3F* dc) {
  return HWY_DYNAMIC_DISPATCH(ComputeCoefficients)(  //
      group_idx, enc_state, opsin, rect, dc);
}
544
545
// Writes the AC tokens of group `group_idx` for pass `pass_idx` to `writer`.
//
// If more than one histogram is in use, the index of the selected histogram
// is written first, using just enough bits to address `num_histograms`
// values. The tokens are then written with a context offset of
// `histogram_idx * NumACContexts()`.
//
// Returns an error if `histogram_idx` is out of range (for a nonzero
// histogram count) or if either write fails; returns true otherwise.
Status EncodeGroupTokenizedCoefficients(size_t group_idx, size_t pass_idx,
                                        size_t histogram_idx,
                                        const PassesEncoderState& enc_state,
                                        BitWriter* writer, AuxOut* aux_out) {
  // Select which histogram to use among those of the current pass.
  const size_t num_histograms = enc_state.shared.num_histograms;
  // num_histograms is 0 only for lossless.
  JXL_ENSURE(num_histograms == 0 || histogram_idx < num_histograms);
  // The check above admits num_histograms == 0, but CeilLog2Nonzero must not
  // be called with 0. With zero or one histogram there is nothing to select,
  // so no selector bits are needed.
  const size_t histo_selector_bits =
      num_histograms <= 1 ? 0 : CeilLog2Nonzero(num_histograms);

  if (histo_selector_bits != 0) {
    JXL_RETURN_IF_ERROR(
        writer->WithMaxBits(histo_selector_bits, LayerType::Ac, aux_out, [&] {
          writer->Write(histo_selector_bits, histogram_idx);
          return true;
        }));
  }
  // Each histogram owns a contiguous range of NumACContexts() contexts.
  const size_t context_offset =
      histogram_idx * enc_state.shared.block_ctx_map.NumACContexts();
  JXL_RETURN_IF_ERROR(
      WriteTokens(enc_state.passes[pass_idx].ac_tokens[group_idx],
                  enc_state.passes[pass_idx].codes, context_offset, writer,
                  LayerType::AcTokens, aux_out));

  return true;
}
571
572
}  // namespace jxl
573
#endif  // HWY_ONCE